library/core/src/str/lossy.rs

   1 use crate::char;
   2 use crate::fmt::{self, Write};
   3 use crate::mem;
   4
   5 use super::from_utf8_unchecked;
   6 use super::validations::utf8_char_width;
   7
/// Lossy UTF-8 string.
///
/// An unsized wrapper around a byte slice that is *not* required to hold
/// valid UTF-8. Formatting it with `Display` prints
/// `char::REPLACEMENT_CHARACTER` in place of every broken sequence, while
/// `Debug` prints each broken byte as a `\xNN` hex escape.
#[unstable(feature = "str_internals", issue = "none")]
pub struct Utf8Lossy {
    // The wrapped bytes; may contain sequences that are not valid UTF-8.
    bytes: [u8],
}
  13
  14 impl Utf8Lossy {
  15     #[must_use]
  16     pub fn from_str(s: &str) -> &Utf8Lossy {
  17         Utf8Lossy::from_bytes(s.as_bytes())
  18     }
  19
  20     #[must_use]
  21     pub fn from_bytes(bytes: &[u8]) -> &Utf8Lossy {
  22         // SAFETY: Both use the same memory layout, and UTF-8 correctness isn't required.
  23         unsafe { mem::transmute(bytes) }
  24     }
  25
  26     pub fn chunks(&self) -> Utf8LossyChunksIter<'_> {
  27         Utf8LossyChunksIter { source: &self.bytes }
  28     }
  29 }
  30
/// Iterator over lossy UTF-8 string
///
/// Yields [`Utf8LossyChunk`] items: each one is a run of valid UTF-8 followed
/// by the broken sequence (if any) that terminated the run. Created by
/// [`Utf8Lossy::chunks`].
#[must_use = "iterators are lazy and do nothing unless consumed"]
#[unstable(feature = "str_internals", issue = "none")]
#[allow(missing_debug_implementations)]
pub struct Utf8LossyChunksIter<'a> {
    // Bytes that have not been yielded yet; every call to `next` removes the
    // chunk it returns from the front of this slice.
    source: &'a [u8],
}
  38
/// One item produced by [`Utf8LossyChunksIter`]: a run of valid UTF-8 plus the
/// broken bytes (if any) that immediately follow it.
#[unstable(feature = "str_internals", issue = "none")]
#[derive(PartialEq, Eq, Debug)]
pub struct Utf8LossyChunk<'a> {
    /// Sequence of valid chars.
    /// Can be empty between broken UTF-8 chars.
    pub valid: &'a str,
    /// Bytes of a single broken char (one to three bytes), empty if none.
    /// An empty `broken` means this item is the last one. The converse does
    /// not hold: when the input ends with a broken char, the last item still
    /// carries those bytes here.
    pub broken: &'a [u8],
}
  49
  50 impl<'a> Iterator for Utf8LossyChunksIter<'a> {
  51     type Item = Utf8LossyChunk<'a>;
  52
  53     fn next(&mut self) -> Option<Utf8LossyChunk<'a>> {
  54         if self.source.is_empty() {
  55             return None;
  56         }
  57
  58         const TAG_CONT_U8: u8 = 128;
  59         fn safe_get(xs: &[u8], i: usize) -> u8 {
  60             *xs.get(i).unwrap_or(&0)
  61         }
  62
  63         let mut i = 0;
  64         while i < self.source.len() {
  65             let i_ = i;
  66
  67             // SAFETY: `i` starts at `0`, is less than `self.source.len()`, and
  68             // only increases, so `0 <= i < self.source.len()`.
  69             let byte = unsafe { *self.source.get_unchecked(i) };
  70             i += 1;
  71
  72             if byte < 128 {
  73             } else {
  74                 let w = utf8_char_width(byte);
  75
  76                 macro_rules! error {
  77                     () => {{
  78                         // SAFETY: We have checked up to `i` that source is valid UTF-8.
  79                         unsafe {
  80                             let r = Utf8LossyChunk {
  81                                 valid: from_utf8_unchecked(&self.source[0..i_]),
  82                                 broken: &self.source[i_..i],
  83                             };
  84                             self.source = &self.source[i..];
  85                             return Some(r);
  86                         }
  87                     }};
  88                 }
  89
  90                 match w {
  91                     2 => {
  92                         if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
  93                             error!();
  94                         }
  95                         i += 1;
  96                     }
  97                     3 => {
  98                         match (byte, safe_get(self.source, i)) {
  99                             (0xE0, 0xA0..=0xBF) => (),
 100                             (0xE1..=0xEC, 0x80..=0xBF) => (),
 101                             (0xED, 0x80..=0x9F) => (),
 102                             (0xEE..=0xEF, 0x80..=0xBF) => (),
 103                             _ => {
 104                                 error!();
 105                             }
 106                         }
 107                         i += 1;
 108                         if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
 109                             error!();
 110                         }
 111                         i += 1;
 112                     }
 113                     4 => {
 114                         match (byte, safe_get(self.source, i)) {
 115                             (0xF0, 0x90..=0xBF) => (),
 116                             (0xF1..=0xF3, 0x80..=0xBF) => (),
 117                             (0xF4, 0x80..=0x8F) => (),
 118                             _ => {
 119                                 error!();
 120                             }
 121                         }
 122                         i += 1;
 123                         if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
 124                             error!();
 125                         }
 126                         i += 1;
 127                         if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
 128                             error!();
 129                         }
 130                         i += 1;
 131                     }
 132                     _ => {
 133                         error!();
 134                     }
 135                 }
 136             }
 137         }
 138
 139         let r = Utf8LossyChunk {
 140             // SAFETY: We have checked that the entire source is valid UTF-8.
 141             valid: unsafe { from_utf8_unchecked(self.source) },
 142             broken: &[],
 143         };
 144         self.source = &[];
 145         Some(r)
 146     }
 147 }
 148
 149 impl fmt::Display for Utf8Lossy {
 150     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 151         // If we're the empty string then our iterator won't actually yield
 152         // anything, so perform the formatting manually
 153         if self.bytes.is_empty() {
 154             return "".fmt(f);
 155         }
 156
 157         for Utf8LossyChunk { valid, broken } in self.chunks() {
 158             // If we successfully decoded the whole chunk as a valid string then
 159             // we can return a direct formatting of the string which will also
 160             // respect various formatting flags if possible.
 161             if valid.len() == self.bytes.len() {
 162                 assert!(broken.is_empty());
 163                 return valid.fmt(f);
 164             }
 165
 166             f.write_str(valid)?;
 167             if !broken.is_empty() {
 168                 f.write_char(char::REPLACEMENT_CHARACTER)?;
 169             }
 170         }
 171         Ok(())
 172     }
 173 }
 174
 175 impl fmt::Debug for Utf8Lossy {
 176     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 177         f.write_char('"')?;
 178
 179         for Utf8LossyChunk { valid, broken } in self.chunks() {
 180             // Valid part.
 181             // Here we partially parse UTF-8 again which is suboptimal.
 182             {
 183                 let mut from = 0;
 184                 for (i, c) in valid.char_indices() {
 185                     let esc = c.escape_debug();
 186                     // If char needs escaping, flush backlog so far and write, else skip
 187                     if esc.len() != 1 {
 188                         f.write_str(&valid[from..i])?;
 189                         for c in esc {
 190                             f.write_char(c)?;
 191                         }
 192                         from = i + c.len_utf8();
 193                     }
 194                 }
 195                 f.write_str(&valid[from..])?;
 196             }
 197
 198             // Broken parts of string as hex escape.
 199             for &b in broken {
 200                 write!(f, "\\x{:02x}", b)?;
 201             }
 202         }
 203
 204         f.write_char('"')
 205     }
 206 }