src/libstd/sys/common/wtf8.rs

   1 // Copyright 2015 The Rust Project Developers. See the COPYRIGHT
   2 // file at the top-level directory of this distribution and at
   3 // http://rust-lang.org/COPYRIGHT.
   4 //
   5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8 // option. This file may not be copied, modified, or distributed
   9 // except according to those terms.
  10
  11 //! Implementation of [the WTF-8 encoding](https://simonsapin.github.io/wtf-8/).
  12 //!
  13 //! This library uses Rust’s type system to maintain
  14 //! [well-formedness](https://simonsapin.github.io/wtf-8/#well-formed),
  15 //! like the `String` and `&str` types do for UTF-8.
  16 //!
  17 //! Since [WTF-8 must not be used
  18 //! for interchange](https://simonsapin.github.io/wtf-8/#intended-audience),
  19 //! this library deliberately does not provide access to the underlying bytes
  20 //! of WTF-8 strings,
  21 //! nor can it decode WTF-8 from arbitrary bytes.
  22 //! WTF-8 strings can be obtained from UTF-8, UTF-16, or code points.
  23
  24 // this module is imported from @SimonSapin's repo and has tons of dead code on
  25 // unix (it's mostly used on windows), so don't worry about dead code here.
  26 #![allow(dead_code)]
  27
  28 use core::prelude::*;
  29
  30 use core::char::{encode_utf8_raw, encode_utf16_raw};
  31 use core::str::next_code_point;
  32
  33 use ascii::*;
  34 use borrow::Cow;
  35 use char;
  36 use cmp;
  37 use fmt;
  38 use hash::{Hash, Hasher};
  39 use iter::FromIterator;
  40 use mem;
  41 use ops;
  42 use rustc_unicode::str::{Utf16Item, utf16_items};
  43 use slice;
  44 use str;
  45 use string::String;
  46 use sys_common::AsInner;
  47 use vec::Vec;
  48
  49 const UTF8_REPLACEMENT_CHARACTER: &'static [u8] = b"\xEF\xBF\xBD";
  50
  51 /// A Unicode code point: from U+0000 to U+10FFFF.
  52 ///
  53 /// Compare with the `char` type,
  54 /// which represents a Unicode scalar value:
  55 /// a code point that is not a surrogate (U+D800 to U+DFFF).
  56 #[derive(Eq, PartialEq, Ord, PartialOrd, Clone, Copy)]
  57 pub struct CodePoint {
  58     value: u32
  59 }
  60
  61 /// Format the code point as `U+` followed by four to six hexadecimal digits.
  62 /// Example: `U+1F4A9`
  63 impl fmt::Debug for CodePoint {
  64     #[inline]
  65     fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
  66         write!(formatter, "U+{:04X}", self.value)
  67     }
  68 }
  69
  70 impl CodePoint {
  71     /// Unsafely creates a new `CodePoint` without checking the value.
  72     ///
  73     /// Only use when `value` is known to be less than or equal to 0x10FFFF.
  74     #[inline]
  75     pub unsafe fn from_u32_unchecked(value: u32) -> CodePoint {
  76         CodePoint { value: value }
  77     }
  78
  79     /// Creates a new `CodePoint` if the value is a valid code point.
  80     ///
  81     /// Returns `None` if `value` is above 0x10FFFF.
  82     #[inline]
  83     pub fn from_u32(value: u32) -> Option<CodePoint> {
  84         match value {
  85             0 ... 0x10FFFF => Some(CodePoint { value: value }),
  86             _ => None
  87         }
  88     }
  89
  90     /// Creates a new `CodePoint` from a `char`.
  91     ///
  92     /// Since all Unicode scalar values are code points, this always succeeds.
  93     #[inline]
  94     pub fn from_char(value: char) -> CodePoint {
  95         CodePoint { value: value as u32 }
  96     }
  97
  98     /// Returns the numeric value of the code point.
  99     #[inline]
 100     pub fn to_u32(&self) -> u32 {
 101         self.value
 102     }
 103
 104     /// Optionally returns a Unicode scalar value for the code point.
 105     ///
 106     /// Returns `None` if the code point is a surrogate (from U+D800 to U+DFFF).
 107     #[inline]
 108     pub fn to_char(&self) -> Option<char> {
 109         match self.value {
 110             0xD800 ... 0xDFFF => None,
 111             _ => Some(unsafe { char::from_u32_unchecked(self.value) })
 112         }
 113     }
 114
 115     /// Returns a Unicode scalar value for the code point.
 116     ///
 117     /// Returns `'\u{FFFD}'` (the replacement character “�”)
 118     /// if the code point is a surrogate (from U+D800 to U+DFFF).
 119     #[inline]
 120     pub fn to_char_lossy(&self) -> char {
 121         self.to_char().unwrap_or('\u{FFFD}')
 122     }
 123 }
 124
 125 /// An owned, growable string of well-formed WTF-8 data.
 126 ///
 127 /// Similar to `String`, but can additionally contain surrogate code points
 128 /// if they’re not in a surrogate pair.
 129 #[derive(Eq, PartialEq, Ord, PartialOrd, Clone)]
 130 pub struct Wtf8Buf {
 131     bytes: Vec<u8>
 132 }
 133
 134 impl ops::Deref for Wtf8Buf {
 135     type Target = Wtf8;
 136
 137     fn deref(&self) -> &Wtf8 {
 138         self.as_slice()
 139     }
 140 }
 141
 142 /// Format the string with double quotes,
 143 /// and surrogates as `\u` followed by four hexadecimal digits.
 144 /// Example: `"a\u{D800}"` for a string with code points [U+0061, U+D800]
 145 impl fmt::Debug for Wtf8Buf {
 146     #[inline]
 147     fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
 148         fmt::Debug::fmt(&**self, formatter)
 149     }
 150 }
 151
 152 impl Wtf8Buf {
 153     /// Creates an new, empty WTF-8 string.
 154     #[inline]
 155     pub fn new() -> Wtf8Buf {
 156         Wtf8Buf { bytes: Vec::new() }
 157     }
 158
 159     /// Creates an new, empty WTF-8 string with pre-allocated capacity for `n` bytes.
 160     #[inline]
 161     pub fn with_capacity(n: usize) -> Wtf8Buf {
 162         Wtf8Buf { bytes: Vec::with_capacity(n) }
 163     }
 164
 165     /// Creates a WTF-8 string from a UTF-8 `String`.
 166     ///
 167     /// This takes ownership of the `String` and does not copy.
 168     ///
 169     /// Since WTF-8 is a superset of UTF-8, this always succeeds.
 170     #[inline]
 171     pub fn from_string(string: String) -> Wtf8Buf {
 172         Wtf8Buf { bytes: string.into_bytes() }
 173     }
 174
 175     /// Creates a WTF-8 string from a UTF-8 `&str` slice.
 176     ///
 177     /// This copies the content of the slice.
 178     ///
 179     /// Since WTF-8 is a superset of UTF-8, this always succeeds.
 180     #[inline]
 181     pub fn from_str(str: &str) -> Wtf8Buf {
 182         Wtf8Buf { bytes: <[_]>::to_vec(str.as_bytes()) }
 183     }
 184
 185     /// Creates a WTF-8 string from a potentially ill-formed UTF-16 slice of 16-bit code units.
 186     ///
 187     /// This is lossless: calling `.encode_wide()` on the resulting string
 188     /// will always return the original code units.
 189     pub fn from_wide(v: &[u16]) -> Wtf8Buf {
 190         let mut string = Wtf8Buf::with_capacity(v.len());
 191         for item in utf16_items(v) {
 192             match item {
 193                 Utf16Item::ScalarValue(c) => string.push_char(c),
 194                 Utf16Item::LoneSurrogate(s) => {
 195                     // Surrogates are known to be in the code point range.
 196                     let code_point = unsafe { CodePoint::from_u32_unchecked(s as u32) };
 197                     // Skip the WTF-8 concatenation check,
 198                     // surrogate pairs are already decoded by utf16_items
 199                     string.push_code_point_unchecked(code_point)
 200                 }
 201             }
 202         }
 203         string
 204     }
 205
 206     /// Copied from String::push
 207     /// This does **not** include the WTF-8 concatenation check.
 208     fn push_code_point_unchecked(&mut self, code_point: CodePoint) {
 209         let cur_len = self.len();
 210         // This may use up to 4 bytes.
 211         self.reserve(4);
 212
 213         unsafe {
 214             // Attempt to not use an intermediate buffer by just pushing bytes
 215             // directly onto this string.
 216             let slice = slice::from_raw_parts_mut(
 217                 self.bytes.as_mut_ptr().offset(cur_len as isize), 4
 218             );
 219             let used = encode_utf8_raw(code_point.value, slice).unwrap();
 220             self.bytes.set_len(cur_len + used);
 221         }
 222     }
 223
 224     #[inline]
 225     pub fn as_slice(&self) -> &Wtf8 {
 226         unsafe { Wtf8::from_bytes_unchecked(&self.bytes) }
 227     }
 228
 229     /// Reserves capacity for at least `additional` more bytes to be inserted
 230     /// in the given `Wtf8Buf`.
 231     /// The collection may reserve more space to avoid frequent reallocations.
 232     ///
 233     /// # Panics
 234     ///
 235     /// Panics if the new capacity overflows `usize`.
 236     #[inline]
 237     pub fn reserve(&mut self, additional: usize) {
 238         self.bytes.reserve(additional)
 239     }
 240
 241     /// Returns the number of bytes that this string buffer can hold without reallocating.
 242     #[inline]
 243     pub fn capacity(&self) -> usize {
 244         self.bytes.capacity()
 245     }
 246
 247     /// Append a UTF-8 slice at the end of the string.
 248     #[inline]
 249     pub fn push_str(&mut self, other: &str) {
 250         self.bytes.push_all(other.as_bytes())
 251     }
 252
 253     /// Append a WTF-8 slice at the end of the string.
 254     ///
 255     /// This replaces newly paired surrogates at the boundary
 256     /// with a supplementary code point,
 257     /// like concatenating ill-formed UTF-16 strings effectively would.
 258     #[inline]
 259     pub fn push_wtf8(&mut self, other: &Wtf8) {
 260         match ((&*self).final_lead_surrogate(), other.initial_trail_surrogate()) {
 261             // Replace newly paired surrogates by a supplementary code point.
 262             (Some(lead), Some(trail)) => {
 263                 let len_without_lead_surrogate = self.len() - 3;
 264                 self.bytes.truncate(len_without_lead_surrogate);
 265                 let other_without_trail_surrogate = &other.bytes[3..];
 266                 // 4 bytes for the supplementary code point
 267                 self.bytes.reserve(4 + other_without_trail_surrogate.len());
 268                 self.push_char(decode_surrogate_pair(lead, trail));
 269                 self.bytes.push_all(other_without_trail_surrogate);
 270             }
 271             _ => self.bytes.push_all(&other.bytes)
 272         }
 273     }
 274
 275     /// Append a Unicode scalar value at the end of the string.
 276     #[inline]
 277     pub fn push_char(&mut self, c: char) {
 278         self.push_code_point_unchecked(CodePoint::from_char(c))
 279     }
 280
 281     /// Append a code point at the end of the string.
 282     ///
 283     /// This replaces newly paired surrogates at the boundary
 284     /// with a supplementary code point,
 285     /// like concatenating ill-formed UTF-16 strings effectively would.
 286     #[inline]
 287     pub fn push(&mut self, code_point: CodePoint) {
 288         match code_point.to_u32() {
 289             trail @ 0xDC00...0xDFFF => {
 290                 match (&*self).final_lead_surrogate() {
 291                     Some(lead) => {
 292                         let len_without_lead_surrogate = self.len() - 3;
 293                         self.bytes.truncate(len_without_lead_surrogate);
 294                         self.push_char(decode_surrogate_pair(lead, trail as u16));
 295                         return
 296                     }
 297                     _ => {}
 298                 }
 299             }
 300             _ => {}
 301         }
 302
 303         // No newly paired surrogates at the boundary.
 304         self.push_code_point_unchecked(code_point)
 305     }
 306
 307     /// Shortens a string to the specified length.
 308     ///
 309     /// # Panics
 310     ///
 311     /// Panics if `new_len` > current length,
 312     /// or if `new_len` is not a code point boundary.
 313     #[inline]
 314     pub fn truncate(&mut self, new_len: usize) {
 315         assert!(is_code_point_boundary(self, new_len));
 316         self.bytes.truncate(new_len)
 317     }
 318
 319     /// Consumes the WTF-8 string and tries to convert it to UTF-8.
 320     ///
 321     /// This does not copy the data.
 322     ///
 323     /// If the contents are not well-formed UTF-8
 324     /// (that is, if the string contains surrogates),
 325     /// the original WTF-8 string is returned instead.
 326     pub fn into_string(self) -> Result<String, Wtf8Buf> {
 327         match self.next_surrogate(0) {
 328             None => Ok(unsafe { String::from_utf8_unchecked(self.bytes) }),
 329             Some(_) => Err(self),
 330         }
 331     }
 332
 333     /// Consumes the WTF-8 string and converts it lossily to UTF-8.
 334     ///
 335     /// This does not copy the data (but may overwrite parts of it in place).
 336     ///
 337     /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”)
 338     pub fn into_string_lossy(mut self) -> String {
 339         let mut pos = 0;
 340         loop {
 341             match self.next_surrogate(pos) {
 342                 Some((surrogate_pos, _)) => {
 343                     pos = surrogate_pos + 3;
 344                     slice::bytes::copy_memory(
 345                         UTF8_REPLACEMENT_CHARACTER,
 346                         &mut self.bytes[surrogate_pos .. pos],
 347                     );
 348                 },
 349                 None => return unsafe { String::from_utf8_unchecked(self.bytes) }
 350             }
 351         }
 352     }
 353 }
 354
 355 /// Create a new WTF-8 string from an iterator of code points.
 356 ///
 357 /// This replaces surrogate code point pairs with supplementary code points,
 358 /// like concatenating ill-formed UTF-16 strings effectively would.
 359 impl FromIterator<CodePoint> for Wtf8Buf {
 360     fn from_iter<T: IntoIterator<Item=CodePoint>>(iter: T) -> Wtf8Buf {
 361         let mut string = Wtf8Buf::new();
 362         string.extend(iter);
 363         string
 364     }
 365 }
 366
 367 /// Append code points from an iterator to the string.
 368 ///
 369 /// This replaces surrogate code point pairs with supplementary code points,
 370 /// like concatenating ill-formed UTF-16 strings effectively would.
 371 impl Extend<CodePoint> for Wtf8Buf {
 372     fn extend<T: IntoIterator<Item=CodePoint>>(&mut self, iterable: T) {
 373         let iterator = iterable.into_iter();
 374         let (low, _high) = iterator.size_hint();
 375         // Lower bound of one byte per code point (ASCII only)
 376         self.bytes.reserve(low);
 377         for code_point in iterator {
 378             self.push(code_point);
 379         }
 380     }
 381 }
 382
 383 /// A borrowed slice of well-formed WTF-8 data.
 384 ///
 385 /// Similar to `&str`, but can additionally contain surrogate code points
 386 /// if they’re not in a surrogate pair.
 387 pub struct Wtf8 {
 388     bytes: [u8]
 389 }
 390
 391 impl AsInner<[u8]> for Wtf8 {
 392     fn as_inner(&self) -> &[u8] { &self.bytes }
 393 }
 394
 395 // FIXME: https://github.com/rust-lang/rust/issues/18805
 396 impl PartialEq for Wtf8 {
 397     fn eq(&self, other: &Wtf8) -> bool { self.bytes.eq(&other.bytes) }
 398 }
 399
 400 // FIXME: https://github.com/rust-lang/rust/issues/18805
 401 impl Eq for Wtf8 {}
 402
 403 // FIXME: https://github.com/rust-lang/rust/issues/18738
 404 impl PartialOrd for Wtf8 {
 405     #[inline]
 406     fn partial_cmp(&self, other: &Wtf8) -> Option<cmp::Ordering> {
 407         self.bytes.partial_cmp(&other.bytes)
 408     }
 409     #[inline]
 410     fn lt(&self, other: &Wtf8) -> bool { self.bytes.lt(&other.bytes) }
 411     #[inline]
 412     fn le(&self, other: &Wtf8) -> bool { self.bytes.le(&other.bytes) }
 413     #[inline]
 414     fn gt(&self, other: &Wtf8) -> bool { self.bytes.gt(&other.bytes) }
 415     #[inline]
 416     fn ge(&self, other: &Wtf8) -> bool { self.bytes.ge(&other.bytes) }
 417 }
 418
 419 // FIXME: https://github.com/rust-lang/rust/issues/18738
 420 impl Ord for Wtf8 {
 421     #[inline]
 422     fn cmp(&self, other: &Wtf8) -> cmp::Ordering { self.bytes.cmp(&other.bytes) }
 423 }
 424
 425 /// Format the slice with double quotes,
 426 /// and surrogates as `\u` followed by four hexadecimal digits.
 427 /// Example: `"a\u{D800}"` for a slice with code points [U+0061, U+D800]
 428 impl fmt::Debug for Wtf8 {
 429     fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
 430         fn write_str_escaped(f: &mut fmt::Formatter, s: &str) -> fmt::Result {
 431             use fmt::Write;
 432             for c in s.chars().flat_map(|c| c.escape_default()) {
 433                 try!(f.write_char(c))
 434             }
 435             Ok(())
 436         }
 437
 438         try!(formatter.write_str("\""));
 439         let mut pos = 0;
 440         loop {
 441             match self.next_surrogate(pos) {
 442                 None => break,
 443                 Some((surrogate_pos, surrogate)) => {
 444                     try!(write_str_escaped(
 445                         formatter,
 446                         unsafe { str::from_utf8_unchecked(
 447                             &self.bytes[pos .. surrogate_pos]
 448                         )},
 449                     ));
 450                     try!(write!(formatter, "\\u{{{:X}}}", surrogate));
 451                     pos = surrogate_pos + 3;
 452                 }
 453             }
 454         }
 455         try!(write_str_escaped(
 456             formatter,
 457             unsafe { str::from_utf8_unchecked(&self.bytes[pos..]) },
 458         ));
 459         formatter.write_str("\"")
 460     }
 461 }
 462
 463 impl Wtf8 {
 464     /// Creates a WTF-8 slice from a UTF-8 `&str` slice.
 465     ///
 466     /// Since WTF-8 is a superset of UTF-8, this always succeeds.
 467     #[inline]
 468     pub fn from_str(value: &str) -> &Wtf8 {
 469         unsafe { Wtf8::from_bytes_unchecked(value.as_bytes()) }
 470     }
 471
 472     /// Creates a WTF-8 slice from a WTF-8 byte slice.
 473     ///
 474     /// Since the byte slice is not checked for valid WTF-8, this functions is
 475     /// marked unsafe.
 476     #[inline]
 477     unsafe fn from_bytes_unchecked(value: &[u8]) -> &Wtf8 {
 478         mem::transmute(value)
 479     }
 480
 481     /// Returns the length, in WTF-8 bytes.
 482     #[inline]
 483     pub fn len(&self) -> usize {
 484         self.bytes.len()
 485     }
 486
 487     /// Returns the code point at `position` if it is in the ASCII range,
 488     /// or `b'\xFF' otherwise.
 489     ///
 490     /// # Panics
 491     ///
 492     /// Panics if `position` is beyond the end of the string.
 493     #[inline]
 494     pub fn ascii_byte_at(&self, position: usize) -> u8 {
 495         match self.bytes[position] {
 496             ascii_byte @ 0x00 ... 0x7F => ascii_byte,
 497             _ => 0xFF
 498         }
 499     }
 500
 501     /// Returns an iterator for the string’s code points.
 502     #[inline]
 503     pub fn code_points(&self) -> Wtf8CodePoints {
 504         Wtf8CodePoints { bytes: self.bytes.iter() }
 505     }
 506
 507     /// Tries to convert the string to UTF-8 and return a `&str` slice.
 508     ///
 509     /// Returns `None` if the string contains surrogates.
 510     ///
 511     /// This does not copy the data.
 512     #[inline]
 513     pub fn as_str(&self) -> Option<&str> {
 514         // Well-formed WTF-8 is also well-formed UTF-8
 515         // if and only if it contains no surrogate.
 516         match self.next_surrogate(0) {
 517             None => Some(unsafe { str::from_utf8_unchecked(&self.bytes) }),
 518             Some(_) => None,
 519         }
 520     }
 521
 522     /// Lossily converts the string to UTF-8.
 523     /// Returns a UTF-8 `&str` slice if the contents are well-formed in UTF-8.
 524     ///
 525     /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”).
 526     ///
 527     /// This only copies the data if necessary (if it contains any surrogate).
 528     pub fn to_string_lossy(&self) -> Cow<str> {
 529         let surrogate_pos = match self.next_surrogate(0) {
 530             None => return Cow::Borrowed(unsafe { str::from_utf8_unchecked(&self.bytes) }),
 531             Some((pos, _)) => pos,
 532         };
 533         let wtf8_bytes = &self.bytes;
 534         let mut utf8_bytes = Vec::with_capacity(self.len());
 535         utf8_bytes.push_all(&wtf8_bytes[..surrogate_pos]);
 536         utf8_bytes.push_all(UTF8_REPLACEMENT_CHARACTER);
 537         let mut pos = surrogate_pos + 3;
 538         loop {
 539             match self.next_surrogate(pos) {
 540                 Some((surrogate_pos, _)) => {
 541                     utf8_bytes.push_all(&wtf8_bytes[pos .. surrogate_pos]);
 542                     utf8_bytes.push_all(UTF8_REPLACEMENT_CHARACTER);
 543                     pos = surrogate_pos + 3;
 544                 },
 545                 None => {
 546                     utf8_bytes.push_all(&wtf8_bytes[pos..]);
 547                     return Cow::Owned(unsafe { String::from_utf8_unchecked(utf8_bytes) })
 548                 }
 549             }
 550         }
 551     }
 552
 553     /// Converts the WTF-8 string to potentially ill-formed UTF-16
 554     /// and return an iterator of 16-bit code units.
 555     ///
 556     /// This is lossless:
 557     /// calling `Wtf8Buf::from_ill_formed_utf16` on the resulting code units
 558     /// would always return the original WTF-8 string.
 559     #[inline]
 560     pub fn encode_wide(&self) -> EncodeWide {
 561         EncodeWide { code_points: self.code_points(), extra: 0 }
 562     }
 563
 564     #[inline]
 565     fn next_surrogate(&self, mut pos: usize) -> Option<(usize, u16)> {
 566         let mut iter = self.bytes[pos..].iter();
 567         loop {
 568             let b = match iter.next() {
 569                 None => return None,
 570                 Some(&b) => b,
 571             };
 572             if b < 0x80 {
 573                 pos += 1;
 574             } else if b < 0xE0 {
 575                 iter.next();
 576                 pos += 2;
 577             } else if b == 0xED {
 578                 match (iter.next(), iter.next()) {
 579                     (Some(&b2), Some(&b3)) if b2 >= 0xA0 => {
 580                         return Some((pos, decode_surrogate(b2, b3)))
 581                     }
 582                     _ => pos += 3
 583                 }
 584             } else if b < 0xF0 {
 585                 iter.next();
 586                 iter.next();
 587                 pos += 3;
 588             } else {
 589                 iter.next();
 590                 iter.next();
 591                 iter.next();
 592                 pos += 4;
 593             }
 594         }
 595     }
 596
 597     #[inline]
 598     fn final_lead_surrogate(&self) -> Option<u16> {
 599         let len = self.len();
 600         if len < 3 {
 601             return None
 602         }
 603         match &self.bytes[(len - 3)..] {
 604             [0xED, b2 @ 0xA0...0xAF, b3] => Some(decode_surrogate(b2, b3)),
 605             _ => None
 606         }
 607     }
 608
 609     #[inline]
 610     fn initial_trail_surrogate(&self) -> Option<u16> {
 611         let len = self.len();
 612         if len < 3 {
 613             return None
 614         }
 615         match &self.bytes[..3] {
 616             [0xED, b2 @ 0xB0...0xBF, b3] => Some(decode_surrogate(b2, b3)),
 617             _ => None
 618         }
 619     }
 620 }
 621
 622
 623 /// Return a slice of the given string for the byte range [`begin`..`end`).
 624 ///
 625 /// # Panics
 626 ///
 627 /// Panics when `begin` and `end` do not point to code point boundaries,
 628 /// or point beyond the end of the string.
 629 impl ops::Index<ops::Range<usize>> for Wtf8 {
 630     type Output = Wtf8;
 631
 632     #[inline]
 633     fn index(&self, range: ops::Range<usize>) -> &Wtf8 {
 634         // is_code_point_boundary checks that the index is in [0, .len()]
 635         if range.start <= range.end &&
 636            is_code_point_boundary(self, range.start) &&
 637            is_code_point_boundary(self, range.end) {
 638             unsafe { slice_unchecked(self, range.start, range.end) }
 639         } else {
 640             slice_error_fail(self, range.start, range.end)
 641         }
 642     }
 643 }
 644
 645 /// Return a slice of the given string from byte `begin` to its end.
 646 ///
 647 /// # Panics
 648 ///
 649 /// Panics when `begin` is not at a code point boundary,
 650 /// or is beyond the end of the string.
 651 impl ops::Index<ops::RangeFrom<usize>> for Wtf8 {
 652     type Output = Wtf8;
 653
 654     #[inline]
 655     fn index(&self, range: ops::RangeFrom<usize>) -> &Wtf8 {
 656         // is_code_point_boundary checks that the index is in [0, .len()]
 657         if is_code_point_boundary(self, range.start) {
 658             unsafe { slice_unchecked(self, range.start, self.len()) }
 659         } else {
 660             slice_error_fail(self, range.start, self.len())
 661         }
 662     }
 663 }
 664
 665 /// Return a slice of the given string from its beginning to byte `end`.
 666 ///
 667 /// # Panics
 668 ///
 669 /// Panics when `end` is not at a code point boundary,
 670 /// or is beyond the end of the string.
 671 impl ops::Index<ops::RangeTo<usize>> for Wtf8 {
 672     type Output = Wtf8;
 673
 674     #[inline]
 675     fn index(&self, range: ops::RangeTo<usize>) -> &Wtf8 {
 676         // is_code_point_boundary checks that the index is in [0, .len()]
 677         if is_code_point_boundary(self, range.end) {
 678             unsafe { slice_unchecked(self, 0, range.end) }
 679         } else {
 680             slice_error_fail(self, 0, range.end)
 681         }
 682     }
 683 }
 684
 685 impl ops::Index<ops::RangeFull> for Wtf8 {
 686     type Output = Wtf8;
 687
 688     #[inline]
 689     fn index(&self, _range: ops::RangeFull) -> &Wtf8 {
 690         self
 691     }
 692 }
 693
 694 #[inline]
 695 fn decode_surrogate(second_byte: u8, third_byte: u8) -> u16 {
 696     // The first byte is assumed to be 0xED
 697     0xD800 | (second_byte as u16 & 0x3F) << 6 | third_byte as u16 & 0x3F
 698 }
 699
 700 #[inline]
 701 fn decode_surrogate_pair(lead: u16, trail: u16) -> char {
 702     let code_point = 0x10000 + ((((lead - 0xD800) as u32) << 10) | (trail - 0xDC00) as u32);
 703     unsafe { char::from_u32_unchecked(code_point) }
 704 }
 705
 706 /// Copied from core::str::StrPrelude::is_char_boundary
 707 #[inline]
 708 pub fn is_code_point_boundary(slice: &Wtf8, index: usize) -> bool {
 709     if index == slice.len() { return true; }
 710     match slice.bytes.get(index) {
 711         None => false,
 712         Some(&b) => b < 128 || b >= 192,
 713     }
 714 }
 715
 716 /// Copied from core::str::raw::slice_unchecked
 717 #[inline]
 718 pub unsafe fn slice_unchecked(s: &Wtf8, begin: usize, end: usize) -> &Wtf8 {
 719     // memory layout of an &[u8] and &Wtf8 are the same
 720     Wtf8::from_bytes_unchecked(slice::from_raw_parts(
 721         s.bytes.as_ptr().offset(begin as isize),
 722         end - begin
 723     ))
 724 }
 725
 726 /// Copied from core::str::raw::slice_error_fail
 727 #[inline(never)]
 728 pub fn slice_error_fail(s: &Wtf8, begin: usize, end: usize) -> ! {
 729     assert!(begin <= end);
 730     panic!("index {} and/or {} in `{:?}` do not lie on character boundary",
 731           begin, end, s);
 732 }
 733
 734 /// Iterator for the code points of a WTF-8 string.
 735 ///
 736 /// Created with the method `.code_points()`.
 737 #[derive(Clone)]
 738 pub struct Wtf8CodePoints<'a> {
 739     bytes: slice::Iter<'a, u8>
 740 }
 741
 742 impl<'a> Iterator for Wtf8CodePoints<'a> {
 743     type Item = CodePoint;
 744
 745     #[inline]
 746     fn next(&mut self) -> Option<CodePoint> {
 747         next_code_point(&mut self.bytes).map(|c| CodePoint { value: c })
 748     }
 749
 750     #[inline]
 751     fn size_hint(&self) -> (usize, Option<usize>) {
 752         let (len, _) = self.bytes.size_hint();
 753         (len.saturating_add(3) / 4, Some(len))
 754     }
 755 }
 756
 757 #[derive(Clone)]
 758 pub struct EncodeWide<'a> {
 759     code_points: Wtf8CodePoints<'a>,
 760     extra: u16
 761 }
 762
 763 // Copied from libunicode/u_str.rs
 764 impl<'a> Iterator for EncodeWide<'a> {
 765     type Item = u16;
 766
 767     #[inline]
 768     fn next(&mut self) -> Option<u16> {
 769         if self.extra != 0 {
 770             let tmp = self.extra;
 771             self.extra = 0;
 772             return Some(tmp);
 773         }
 774
 775         let mut buf = [0; 2];
 776         self.code_points.next().map(|code_point| {
 777             let n = encode_utf16_raw(code_point.value, &mut buf)
 778                 .unwrap_or(0);
 779             if n == 2 { self.extra = buf[1]; }
 780             buf[0]
 781         })
 782     }
 783
 784     #[inline]
 785     fn size_hint(&self) -> (usize, Option<usize>) {
 786         let (low, high) = self.code_points.size_hint();
 787         // every code point gets either one u16 or two u16,
 788         // so this iterator is between 1 or 2 times as
 789         // long as the underlying iterator.
 790         (low, high.and_then(|n| n.checked_mul(2)))
 791     }
 792 }
 793
 794 impl Hash for CodePoint {
 795     #[inline]
 796     fn hash<H: Hasher>(&self, state: &mut H) {
 797         self.value.hash(state)
 798     }
 799 }
 800
 801 impl Hash for Wtf8Buf {
 802     #[inline]
 803     fn hash<H: Hasher>(&self, state: &mut H) {
 804         state.write(&self.bytes);
 805         0xfeu8.hash(state)
 806     }
 807 }
 808
 809 impl Hash for Wtf8 {
 810     #[inline]
 811     fn hash<H: Hasher>(&self, state: &mut H) {
 812         state.write(&self.bytes);
 813         0xfeu8.hash(state)
 814     }
 815 }
 816
 817 impl AsciiExt for Wtf8 {
 818     type Owned = Wtf8Buf;
 819
 820     fn is_ascii(&self) -> bool {
 821         self.bytes.is_ascii()
 822     }
 823     fn to_ascii_uppercase(&self) -> Wtf8Buf {
 824         Wtf8Buf { bytes: self.bytes.to_ascii_uppercase() }
 825     }
 826     fn to_ascii_lowercase(&self) -> Wtf8Buf {
 827         Wtf8Buf { bytes: self.bytes.to_ascii_lowercase() }
 828     }
 829     fn eq_ignore_ascii_case(&self, other: &Wtf8) -> bool {
 830         self.bytes.eq_ignore_ascii_case(&other.bytes)
 831     }
 832
 833     fn make_ascii_uppercase(&mut self) { self.bytes.make_ascii_uppercase() }
 834     fn make_ascii_lowercase(&mut self) { self.bytes.make_ascii_lowercase() }
 835 }
 836
 837 #[cfg(test)]
 838 mod tests {
 839     use prelude::v1::*;
 840     use borrow::Cow;
 841     use super::*;
 842
 843     #[test]
 844     fn code_point_from_u32() {
 845         assert!(CodePoint::from_u32(0).is_some());
 846         assert!(CodePoint::from_u32(0xD800).is_some());
 847         assert!(CodePoint::from_u32(0x10FFFF).is_some());
 848         assert!(CodePoint::from_u32(0x110000).is_none());
 849     }
 850
 851     #[test]
 852     fn code_point_to_u32() {
 853         fn c(value: u32) -> CodePoint { CodePoint::from_u32(value).unwrap() }
 854         assert_eq!(c(0).to_u32(), 0);
 855         assert_eq!(c(0xD800).to_u32(), 0xD800);
 856         assert_eq!(c(0x10FFFF).to_u32(), 0x10FFFF);
 857     }
 858
 859     #[test]
 860     fn code_point_from_char() {
 861         assert_eq!(CodePoint::from_char('a').to_u32(), 0x61);
 862         assert_eq!(CodePoint::from_char('💩').to_u32(), 0x1F4A9);
 863     }
 864
 865     #[test]
 866     fn code_point_to_string() {
 867         assert_eq!(format!("{:?}", CodePoint::from_char('a')), "U+0061");
 868         assert_eq!(format!("{:?}", CodePoint::from_char('💩')), "U+1F4A9");
 869     }
 870
 871     #[test]
 872     fn code_point_to_char() {
 873         fn c(value: u32) -> CodePoint { CodePoint::from_u32(value).unwrap() }
 874         assert_eq!(c(0x61).to_char(), Some('a'));
 875         assert_eq!(c(0x1F4A9).to_char(), Some('💩'));
 876         assert_eq!(c(0xD800).to_char(), None);
 877     }
 878
 879     #[test]
 880     fn code_point_to_char_lossy() {
 881         fn c(value: u32) -> CodePoint { CodePoint::from_u32(value).unwrap() }
 882         assert_eq!(c(0x61).to_char_lossy(), 'a');
 883         assert_eq!(c(0x1F4A9).to_char_lossy(), '💩');
 884         assert_eq!(c(0xD800).to_char_lossy(), '\u{FFFD}');
 885     }
 886
 887     #[test]
 888     fn wtf8buf_new() {
 889         assert_eq!(Wtf8Buf::new().bytes, b"");
 890     }
 891
 892     #[test]
 893     fn wtf8buf_from_str() {
 894         assert_eq!(Wtf8Buf::from_str("").bytes, b"");
 895         assert_eq!(Wtf8Buf::from_str("aé 💩").bytes,
 896                    b"a\xC3\xA9 \xF0\x9F\x92\xA9");
 897     }
 898
 899     #[test]
 900     fn wtf8buf_from_string() {
 901         assert_eq!(Wtf8Buf::from_string(String::from("")).bytes, b"");
 902         assert_eq!(Wtf8Buf::from_string(String::from("aé 💩")).bytes,
 903                    b"a\xC3\xA9 \xF0\x9F\x92\xA9");
 904     }
 905
 906     #[test]
 907     fn wtf8buf_from_wide() {
 908         assert_eq!(Wtf8Buf::from_wide(&[]).bytes, b"");
 909         assert_eq!(Wtf8Buf::from_wide(
 910                       &[0x61, 0xE9, 0x20, 0xD83D, 0xD83D, 0xDCA9]).bytes,
 911                    b"a\xC3\xA9 \xED\xA0\xBD\xF0\x9F\x92\xA9");
 912     }
 913
 914     #[test]
 915     fn wtf8buf_push_str() {
 916         let mut string = Wtf8Buf::new();
 917         assert_eq!(string.bytes, b"");
 918         string.push_str("aé 💩");
 919         assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
 920     }
 921
 922     #[test]
 923     fn wtf8buf_push_char() {
 924         let mut string = Wtf8Buf::from_str("aé ");
 925         assert_eq!(string.bytes, b"a\xC3\xA9 ");
 926         string.push_char('💩');
 927         assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
 928     }
 929
 930     #[test]
 931     fn wtf8buf_push() {
 932         let mut string = Wtf8Buf::from_str("aé ");
 933         assert_eq!(string.bytes, b"a\xC3\xA9 ");
 934         string.push(CodePoint::from_char('💩'));
 935         assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
 936
 937         fn c(value: u32) -> CodePoint { CodePoint::from_u32(value).unwrap() }
 938
 939         let mut string = Wtf8Buf::new();
 940         string.push(c(0xD83D));  // lead
 941         string.push(c(0xDCA9));  // trail
 942         assert_eq!(string.bytes, b"\xF0\x9F\x92\xA9");  // Magic!
 943
 944         let mut string = Wtf8Buf::new();
 945         string.push(c(0xD83D));  // lead
 946         string.push(c(0x20));  // not surrogate
 947         string.push(c(0xDCA9));  // trail
 948         assert_eq!(string.bytes, b"\xED\xA0\xBD \xED\xB2\xA9");
 949
 950         let mut string = Wtf8Buf::new();
 951         string.push(c(0xD800));  // lead
 952         string.push(c(0xDBFF));  // lead
 953         assert_eq!(string.bytes, b"\xED\xA0\x80\xED\xAF\xBF");
 954
 955         let mut string = Wtf8Buf::new();
 956         string.push(c(0xD800));  // lead
 957         string.push(c(0xE000));  // not surrogate
 958         assert_eq!(string.bytes, b"\xED\xA0\x80\xEE\x80\x80");
 959
 960         let mut string = Wtf8Buf::new();
 961         string.push(c(0xD7FF));  // not surrogate
 962         string.push(c(0xDC00));  // trail
 963         assert_eq!(string.bytes, b"\xED\x9F\xBF\xED\xB0\x80");
 964
 965         let mut string = Wtf8Buf::new();
 966         string.push(c(0x61));  // not surrogate, < 3 bytes
 967         string.push(c(0xDC00));  // trail
 968         assert_eq!(string.bytes, b"\x61\xED\xB0\x80");
 969
 970         let mut string = Wtf8Buf::new();
 971         string.push(c(0xDC00));  // trail
 972         assert_eq!(string.bytes, b"\xED\xB0\x80");
 973     }
 974
 975     #[test]
 976     fn wtf8buf_push_wtf8() {
 977         let mut string = Wtf8Buf::from_str("aé");
 978         assert_eq!(string.bytes, b"a\xC3\xA9");
 979         string.push_wtf8(Wtf8::from_str(" 💩"));
 980         assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
 981
 982         fn w(v: &[u8]) -> &Wtf8 { unsafe { Wtf8::from_bytes_unchecked(v) } }
 983
 984         let mut string = Wtf8Buf::new();
 985         string.push_wtf8(w(b"\xED\xA0\xBD"));  // lead
 986         string.push_wtf8(w(b"\xED\xB2\xA9"));  // trail
 987         assert_eq!(string.bytes, b"\xF0\x9F\x92\xA9");  // Magic!
 988
 989         let mut string = Wtf8Buf::new();
 990         string.push_wtf8(w(b"\xED\xA0\xBD"));  // lead
 991         string.push_wtf8(w(b" "));  // not surrogate
 992         string.push_wtf8(w(b"\xED\xB2\xA9"));  // trail
 993         assert_eq!(string.bytes, b"\xED\xA0\xBD \xED\xB2\xA9");
 994
 995         let mut string = Wtf8Buf::new();
 996         string.push_wtf8(w(b"\xED\xA0\x80"));  // lead
 997         string.push_wtf8(w(b"\xED\xAF\xBF"));  // lead
 998         assert_eq!(string.bytes, b"\xED\xA0\x80\xED\xAF\xBF");
 999
1000         let mut string = Wtf8Buf::new();
1001         string.push_wtf8(w(b"\xED\xA0\x80"));  // lead
1002         string.push_wtf8(w(b"\xEE\x80\x80"));  // not surrogate
1003         assert_eq!(string.bytes, b"\xED\xA0\x80\xEE\x80\x80");
1004
1005         let mut string = Wtf8Buf::new();
1006         string.push_wtf8(w(b"\xED\x9F\xBF"));  // not surrogate
1007         string.push_wtf8(w(b"\xED\xB0\x80"));  // trail
1008         assert_eq!(string.bytes, b"\xED\x9F\xBF\xED\xB0\x80");
1009
1010         let mut string = Wtf8Buf::new();
1011         string.push_wtf8(w(b"a"));  // not surrogate, < 3 bytes
1012         string.push_wtf8(w(b"\xED\xB0\x80"));  // trail
1013         assert_eq!(string.bytes, b"\x61\xED\xB0\x80");
1014
1015         let mut string = Wtf8Buf::new();
1016         string.push_wtf8(w(b"\xED\xB0\x80"));  // trail
1017         assert_eq!(string.bytes, b"\xED\xB0\x80");
1018     }
1019
1020     #[test]
1021     fn wtf8buf_truncate() {
1022         let mut string = Wtf8Buf::from_str("aé");
1023         string.truncate(1);
1024         assert_eq!(string.bytes, b"a");
1025     }
1026
1027     #[test]
1028     #[should_panic]
1029     fn wtf8buf_truncate_fail_code_point_boundary() {
1030         let mut string = Wtf8Buf::from_str("aé");
1031         string.truncate(2);
1032     }
1033
1034     #[test]
1035     #[should_panic]
1036     fn wtf8buf_truncate_fail_longer() {
1037         let mut string = Wtf8Buf::from_str("aé");
1038         string.truncate(4);
1039     }
1040
1041     #[test]
1042     fn wtf8buf_into_string() {
1043         let mut string = Wtf8Buf::from_str("aé 💩");
1044         assert_eq!(string.clone().into_string(), Ok(String::from("aé 💩")));
1045         string.push(CodePoint::from_u32(0xD800).unwrap());
1046         assert_eq!(string.clone().into_string(), Err(string));
1047     }
1048
1049     #[test]
1050     fn wtf8buf_into_string_lossy() {
1051         let mut string = Wtf8Buf::from_str("aé 💩");
1052         assert_eq!(string.clone().into_string_lossy(), String::from("aé 💩"));
1053         string.push(CodePoint::from_u32(0xD800).unwrap());
1054         assert_eq!(string.clone().into_string_lossy(), String::from("aé 💩�"));
1055     }
1056
1057     #[test]
1058     fn wtf8buf_from_iterator() {
1059         fn f(values: &[u32]) -> Wtf8Buf {
1060             values.iter().map(|&c| CodePoint::from_u32(c).unwrap()).collect::<Wtf8Buf>()
1061         };
1062         assert_eq!(f(&[0x61, 0xE9, 0x20, 0x1F4A9]).bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
1063
1064         assert_eq!(f(&[0xD83D, 0xDCA9]).bytes, b"\xF0\x9F\x92\xA9");  // Magic!
1065         assert_eq!(f(&[0xD83D, 0x20, 0xDCA9]).bytes, b"\xED\xA0\xBD \xED\xB2\xA9");
1066         assert_eq!(f(&[0xD800, 0xDBFF]).bytes, b"\xED\xA0\x80\xED\xAF\xBF");
1067         assert_eq!(f(&[0xD800, 0xE000]).bytes, b"\xED\xA0\x80\xEE\x80\x80");
1068         assert_eq!(f(&[0xD7FF, 0xDC00]).bytes, b"\xED\x9F\xBF\xED\xB0\x80");
1069         assert_eq!(f(&[0x61, 0xDC00]).bytes, b"\x61\xED\xB0\x80");
1070         assert_eq!(f(&[0xDC00]).bytes, b"\xED\xB0\x80");
1071     }
1072
1073     #[test]
1074     fn wtf8buf_extend() {
1075         fn e(initial: &[u32], extended: &[u32]) -> Wtf8Buf {
1076             fn c(value: &u32) -> CodePoint { CodePoint::from_u32(*value).unwrap() }
1077             let mut string = initial.iter().map(c).collect::<Wtf8Buf>();
1078             string.extend(extended.iter().map(c));
1079             string
1080         };
1081
1082         assert_eq!(e(&[0x61, 0xE9], &[0x20, 0x1F4A9]).bytes,
1083                    b"a\xC3\xA9 \xF0\x9F\x92\xA9");
1084
1085         assert_eq!(e(&[0xD83D], &[0xDCA9]).bytes, b"\xF0\x9F\x92\xA9");  // Magic!
1086         assert_eq!(e(&[0xD83D, 0x20], &[0xDCA9]).bytes, b"\xED\xA0\xBD \xED\xB2\xA9");
1087         assert_eq!(e(&[0xD800], &[0xDBFF]).bytes, b"\xED\xA0\x80\xED\xAF\xBF");
1088         assert_eq!(e(&[0xD800], &[0xE000]).bytes, b"\xED\xA0\x80\xEE\x80\x80");
1089         assert_eq!(e(&[0xD7FF], &[0xDC00]).bytes, b"\xED\x9F\xBF\xED\xB0\x80");
1090         assert_eq!(e(&[0x61], &[0xDC00]).bytes, b"\x61\xED\xB0\x80");
1091         assert_eq!(e(&[], &[0xDC00]).bytes, b"\xED\xB0\x80");
1092     }
1093
1094     #[test]
1095     fn wtf8buf_show() {
1096         let mut string = Wtf8Buf::from_str("a\té 💩\r");
1097         string.push(CodePoint::from_u32(0xD800).unwrap());
1098         assert_eq!(format!("{:?}", string), r#""a\t\u{e9} \u{1f4a9}\r\u{D800}""#);
1099     }
1100
1101     #[test]
1102     fn wtf8buf_as_slice() {
1103         assert_eq!(Wtf8Buf::from_str("aé").as_slice(), Wtf8::from_str("aé"));
1104     }
1105
1106     #[test]
1107     fn wtf8buf_show_str() {
1108         let text = "a\té 💩\r";
1109         let mut string = Wtf8Buf::from_str(text);
1110         assert_eq!(format!("{:?}", text), format!("{:?}", string));
1111     }
1112
1113     #[test]
1114     fn wtf8_from_str() {
1115         assert_eq!(&Wtf8::from_str("").bytes, b"");
1116         assert_eq!(&Wtf8::from_str("aé 💩").bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
1117     }
1118
1119     #[test]
1120     fn wtf8_len() {
1121         assert_eq!(Wtf8::from_str("").len(), 0);
1122         assert_eq!(Wtf8::from_str("aé 💩").len(), 8);
1123     }
1124
1125     #[test]
1126     fn wtf8_slice() {
1127         assert_eq!(&Wtf8::from_str("aé 💩")[1.. 4].bytes, b"\xC3\xA9 ");
1128     }
1129
1130     #[test]
1131     #[should_panic]
1132     fn wtf8_slice_not_code_point_boundary() {
1133         &Wtf8::from_str("aé 💩")[2.. 4];
1134     }
1135
1136     #[test]
1137     fn wtf8_slice_from() {
1138         assert_eq!(&Wtf8::from_str("aé 💩")[1..].bytes, b"\xC3\xA9 \xF0\x9F\x92\xA9");
1139     }
1140
1141     #[test]
1142     #[should_panic]
1143     fn wtf8_slice_from_not_code_point_boundary() {
1144         &Wtf8::from_str("aé 💩")[2..];
1145     }
1146
1147     #[test]
1148     fn wtf8_slice_to() {
1149         assert_eq!(&Wtf8::from_str("aé 💩")[..4].bytes, b"a\xC3\xA9 ");
1150     }
1151
1152     #[test]
1153     #[should_panic]
1154     fn wtf8_slice_to_not_code_point_boundary() {
1155         &Wtf8::from_str("aé 💩")[5..];
1156     }
1157
1158     #[test]
1159     fn wtf8_ascii_byte_at() {
1160         let slice = Wtf8::from_str("aé 💩");
1161         assert_eq!(slice.ascii_byte_at(0), b'a');
1162         assert_eq!(slice.ascii_byte_at(1), b'\xFF');
1163         assert_eq!(slice.ascii_byte_at(2), b'\xFF');
1164         assert_eq!(slice.ascii_byte_at(3), b' ');
1165         assert_eq!(slice.ascii_byte_at(4), b'\xFF');
1166     }
1167
1168     #[test]
1169     fn wtf8_code_points() {
1170         fn c(value: u32) -> CodePoint { CodePoint::from_u32(value).unwrap() }
1171         fn cp(string: &Wtf8Buf) -> Vec<Option<char>> {
1172             string.code_points().map(|c| c.to_char()).collect::<Vec<_>>()
1173         }
1174         let mut string = Wtf8Buf::from_str("é ");
1175         assert_eq!(cp(&string), [Some('é'), Some(' ')]);
1176         string.push(c(0xD83D));
1177         assert_eq!(cp(&string), [Some('é'), Some(' '), None]);
1178         string.push(c(0xDCA9));
1179         assert_eq!(cp(&string), [Some('é'), Some(' '), Some('💩')]);
1180     }
1181
1182     #[test]
1183     fn wtf8_as_str() {
1184         assert_eq!(Wtf8::from_str("").as_str(), Some(""));
1185         assert_eq!(Wtf8::from_str("aé 💩").as_str(), Some("aé 💩"));
1186         let mut string = Wtf8Buf::new();
1187         string.push(CodePoint::from_u32(0xD800).unwrap());
1188         assert_eq!(string.as_str(), None);
1189     }
1190
1191     #[test]
1192     fn wtf8_to_string_lossy() {
1193         assert_eq!(Wtf8::from_str("").to_string_lossy(), Cow::Borrowed(""));
1194         assert_eq!(Wtf8::from_str("aé 💩").to_string_lossy(), Cow::Borrowed("aé 💩"));
1195         let mut string = Wtf8Buf::from_str("aé 💩");
1196         string.push(CodePoint::from_u32(0xD800).unwrap());
1197         let expected: Cow<str> = Cow::Owned(String::from("aé 💩�"));
1198         assert_eq!(string.to_string_lossy(), expected);
1199     }
1200
1201     #[test]
1202     fn wtf8_encode_wide() {
1203         let mut string = Wtf8Buf::from_str("aé ");
1204         string.push(CodePoint::from_u32(0xD83D).unwrap());
1205         string.push_char('💩');
1206         assert_eq!(string.encode_wide().collect::<Vec<_>>(),
1207                    vec![0x61, 0xE9, 0x20, 0xD83D, 0xD83D, 0xDCA9]);
1208     }
1209 }