2 use crate::fmt
::{self, Write}
;
5 use super::from_utf8_unchecked
;
6 use super::validations
::utf8_char_width
;
8 /// Lossy UTF-8 string.
9 #[unstable(feature = "str_internals", issue = "none")]
10 pub struct Utf8Lossy
{
16 pub fn from_str(s
: &str) -> &Utf8Lossy
{
17 Utf8Lossy
::from_bytes(s
.as_bytes())
21 pub fn from_bytes(bytes
: &[u8]) -> &Utf8Lossy
{
22 // SAFETY: Both use the same memory layout, and UTF-8 correctness isn't required.
23 unsafe { mem::transmute(bytes) }
26 pub fn chunks(&self) -> Utf8LossyChunksIter
<'_
> {
27 Utf8LossyChunksIter { source: &self.bytes }
31 /// Iterator over lossy UTF-8 string.
32 #[must_use = "iterators are lazy and do nothing unless consumed"]
33 #[unstable(feature = "str_internals", issue = "none")]
34 #[allow(missing_debug_implementations)]
35 pub struct Utf8LossyChunksIter
<'a
> {
39 #[unstable(feature = "str_internals", issue = "none")]
40 #[derive(PartialEq, Eq, Debug)]
41 pub struct Utf8LossyChunk
<'a
> {
42 /// Sequence of valid chars.
43 /// Can be empty between broken UTF-8 chars.
45 /// Single broken char, empty if none.
46 /// Empty if and only if this is the last item yielded by the iterator.
50 impl<'a
> Iterator
for Utf8LossyChunksIter
<'a
> {
51 type Item
= Utf8LossyChunk
<'a
>;
53 fn next(&mut self) -> Option
<Utf8LossyChunk
<'a
>> {
54 if self.source
.is_empty() {
58 const TAG_CONT_U8
: u8 = 128;
59 fn safe_get(xs
: &[u8], i
: usize) -> u8 {
60 *xs
.get(i
).unwrap_or(&0)
64 while i
< self.source
.len() {
67 // SAFETY: `i` starts at `0`, is less than `self.source.len()`, and
68 // only increases, so `0 <= i < self.source.len()`.
69 let byte
= unsafe { *self.source.get_unchecked(i) }
;
74 let w
= utf8_char_width(byte
);
78 // SAFETY: We have checked up to `i` that source is valid UTF-8.
80 let r
= Utf8LossyChunk
{
81 valid
: from_utf8_unchecked(&self.source
[0..i_
]),
82 broken
: &self.source
[i_
..i
],
84 self.source
= &self.source
[i
..];
92 if safe_get(self.source
, i
) & 192 != TAG_CONT_U8
{
98 match (byte
, safe_get(self.source
, i
)) {
99 (0xE0, 0xA0..=0xBF) => (),
100 (0xE1..=0xEC, 0x80..=0xBF) => (),
101 (0xED, 0x80..=0x9F) => (),
102 (0xEE..=0xEF, 0x80..=0xBF) => (),
108 if safe_get(self.source
, i
) & 192 != TAG_CONT_U8
{
114 match (byte
, safe_get(self.source
, i
)) {
115 (0xF0, 0x90..=0xBF) => (),
116 (0xF1..=0xF3, 0x80..=0xBF) => (),
117 (0xF4, 0x80..=0x8F) => (),
123 if safe_get(self.source
, i
) & 192 != TAG_CONT_U8
{
127 if safe_get(self.source
, i
) & 192 != TAG_CONT_U8
{
139 let r
= Utf8LossyChunk
{
140 // SAFETY: We have checked that the entire source is valid UTF-8.
141 valid
: unsafe { from_utf8_unchecked(self.source) }
,
149 impl fmt
::Display
for Utf8Lossy
{
150 fn fmt(&self, f
: &mut fmt
::Formatter
<'_
>) -> fmt
::Result
{
151 // If we're the empty string then our iterator won't actually yield
152 // anything, so perform the formatting manually
153 if self.bytes
.is_empty() {
157 for Utf8LossyChunk { valid, broken }
in self.chunks() {
158 // If we successfully decoded the whole chunk as a valid string then
159 // we can return a direct formatting of the string which will also
160 // respect various formatting flags if possible.
161 if valid
.len() == self.bytes
.len() {
162 assert
!(broken
.is_empty());
167 if !broken
.is_empty() {
168 f
.write_char(char::REPLACEMENT_CHARACTER
)?
;
175 impl fmt
::Debug
for Utf8Lossy
{
176 fn fmt(&self, f
: &mut fmt
::Formatter
<'_
>) -> fmt
::Result
{
179 for Utf8LossyChunk { valid, broken } in self.chunks() {
181 // Here we partially parse UTF-8 again, which is suboptimal.
184 for (i, c) in valid.char_indices() {
185 let esc = c.escape_debug();
186 // If char needs escaping, flush backlog so far and write, else skip
188 f.write_str(&valid[from..i])?;
192 from = i + c.len_utf8();
195 f.write_str(&valid[from..])?;
198 // Broken parts of string as hex escape.
200 write!(f, "\\x{:02x}
", b)?;