]> git.proxmox.com Git - rustc.git/blame - library/std/src/sys_common/wtf8.rs
New upstream version 1.48.0~beta.8+dfsg1
[rustc.git] / library / std / src / sys_common / wtf8.rs
CommitLineData
85aaf69f
SL
//! Implementation of [the WTF-8 encoding](https://simonsapin.github.io/wtf-8/).
//!
//! This library uses Rust’s type system to maintain
//! [well-formedness](https://simonsapin.github.io/wtf-8/#well-formed),
//! like the `String` and `&str` types do for UTF-8.
//!
//! Since [WTF-8 must not be used
//! for interchange](https://simonsapin.github.io/wtf-8/#intended-audience),
//! this library deliberately does not provide access to the underlying bytes
//! of WTF-8 strings,
//! nor can it decode WTF-8 from arbitrary bytes.
//! WTF-8 strings can be obtained from UTF-8, UTF-16, or code points.
13
c34b1796
AL
14// this module is imported from @SimonSapin's repo and has tons of dead code on
15// unix (it's mostly used on windows), so don't worry about dead code here.
16#![allow(dead_code)]
17
1b1a35ee
XL
18#[cfg(test)]
19mod tests;
20
62682a34 21use core::str::next_code_point;
85aaf69f 22
532ac7d7
XL
23use crate::borrow::Cow;
24use crate::char;
25use crate::fmt;
26use crate::hash::{Hash, Hasher};
27use crate::iter::FromIterator;
28use crate::mem;
29use crate::ops;
30use crate::rc::Rc;
31use crate::slice;
32use crate::str;
33use crate::sync::Arc;
34use crate::sys_common::AsInner;
85aaf69f 35
0731742a 36const UTF8_REPLACEMENT_CHARACTER: &str = "\u{FFFD}";
85aaf69f
SL
37
/// A Unicode code point: any value from U+0000 to U+10FFFF, surrogates included.
///
/// Contrast this with the `char` type, which represents a Unicode scalar value:
/// a code point that is not a surrogate (U+D800 to U+DFFF).
#[derive(Eq, PartialEq, Ord, PartialOrd, Clone, Copy)]
pub struct CodePoint {
    value: u32,
}

/// Formats the code point as `U+` followed by four to six uppercase hexadecimal digits.
/// Example: `U+1F4A9`
impl fmt::Debug for CodePoint {
    #[inline]
    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
        formatter.write_fmt(format_args!("U+{:04X}", self.value))
    }
}

impl CodePoint {
    /// Wraps `value` in a `CodePoint` without validating it.
    ///
    /// # Safety
    ///
    /// The caller must guarantee that `value` is less than or equal to 0x10FFFF.
    #[inline]
    pub unsafe fn from_u32_unchecked(value: u32) -> CodePoint {
        CodePoint { value }
    }

    /// Wraps `value` in a `CodePoint` if it is a valid code point.
    ///
    /// Returns `None` if `value` is above 0x10FFFF.
    #[inline]
    pub fn from_u32(value: u32) -> Option<CodePoint> {
        if value <= 0x10FFFF { Some(CodePoint { value }) } else { None }
    }

    /// Converts a `char` into a `CodePoint`.
    ///
    /// Every Unicode scalar value is a code point, so this cannot fail.
    #[inline]
    pub fn from_char(value: char) -> CodePoint {
        CodePoint { value: u32::from(value) }
    }

    /// Returns the numeric value of the code point.
    #[inline]
    pub fn to_u32(&self) -> u32 {
        self.value
    }

    /// Returns the code point as a `char`, if it is a Unicode scalar value.
    ///
    /// Returns `None` if the code point is a surrogate (from U+D800 to U+DFFF).
    #[inline]
    pub fn to_char(&self) -> Option<char> {
        if (0xD800..=0xDFFF).contains(&self.value) {
            return None;
        }
        // SAFETY: the value is not a surrogate, and the constructors require it
        // to be at most 0x10FFFF, so it is a Unicode scalar value.
        Some(unsafe { char::from_u32_unchecked(self.value) })
    }

    /// Returns the code point as a `char`, substituting surrogates.
    ///
    /// Returns `'\u{FFFD}'` (the replacement character “�”)
    /// if the code point is a surrogate (from U+D800 to U+DFFF).
    #[inline]
    pub fn to_char_lossy(&self) -> char {
        match self.to_char() {
            Some(scalar) => scalar,
            None => '\u{FFFD}',
        }
    }
}
111
/// An owned, growable buffer holding well-formed WTF-8 data.
///
/// Comparable to `String`, except that it may also contain surrogate code points
/// as long as they are not part of a surrogate pair.
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct Wtf8Buf {
    bytes: Vec<u8>,
}
120
121impl ops::Deref for Wtf8Buf {
122 type Target = Wtf8;
123
124 fn deref(&self) -> &Wtf8 {
125 self.as_slice()
126 }
127}
128
ff7c6d11
XL
129impl ops::DerefMut for Wtf8Buf {
130 fn deref_mut(&mut self) -> &mut Wtf8 {
131 self.as_mut_slice()
132 }
133}
134
85aaf69f
SL
135/// Format the string with double quotes,
136/// and surrogates as `\u` followed by four hexadecimal digits.
137/// Example: `"a\u{D800}"` for a string with code points [U+0061, U+D800]
138impl fmt::Debug for Wtf8Buf {
139 #[inline]
532ac7d7 140 fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
85aaf69f
SL
141 fmt::Debug::fmt(&**self, formatter)
142 }
143}
144
145impl Wtf8Buf {
b039eaaf 146 /// Creates a new, empty WTF-8 string.
85aaf69f
SL
147 #[inline]
148 pub fn new() -> Wtf8Buf {
149 Wtf8Buf { bytes: Vec::new() }
150 }
151
48663c56 152 /// Creates a new, empty WTF-8 string with pre-allocated capacity for `capacity` bytes.
85aaf69f 153 #[inline]
48663c56
XL
154 pub fn with_capacity(capacity: usize) -> Wtf8Buf {
155 Wtf8Buf { bytes: Vec::with_capacity(capacity) }
85aaf69f
SL
156 }
157
d9579d0f 158 /// Creates a WTF-8 string from a UTF-8 `String`.
85aaf69f
SL
159 ///
160 /// This takes ownership of the `String` and does not copy.
161 ///
162 /// Since WTF-8 is a superset of UTF-8, this always succeeds.
163 #[inline]
164 pub fn from_string(string: String) -> Wtf8Buf {
165 Wtf8Buf { bytes: string.into_bytes() }
166 }
167
d9579d0f 168 /// Creates a WTF-8 string from a UTF-8 `&str` slice.
85aaf69f
SL
169 ///
170 /// This copies the content of the slice.
171 ///
172 /// Since WTF-8 is a superset of UTF-8, this always succeeds.
173 #[inline]
174 pub fn from_str(str: &str) -> Wtf8Buf {
c34b1796 175 Wtf8Buf { bytes: <[_]>::to_vec(str.as_bytes()) }
85aaf69f
SL
176 }
177
7453a54e
SL
178 pub fn clear(&mut self) {
179 self.bytes.clear()
180 }
181
9346a6ac 182 /// Creates a WTF-8 string from a potentially ill-formed UTF-16 slice of 16-bit code units.
85aaf69f
SL
183 ///
184 /// This is lossless: calling `.encode_wide()` on the resulting string
185 /// will always return the original code units.
186 pub fn from_wide(v: &[u16]) -> Wtf8Buf {
187 let mut string = Wtf8Buf::with_capacity(v.len());
e9174d1e 188 for item in char::decode_utf16(v.iter().cloned()) {
85aaf69f 189 match item {
e9174d1e
SL
190 Ok(ch) => string.push_char(ch),
191 Err(surrogate) => {
54a0048b 192 let surrogate = surrogate.unpaired_surrogate();
85aaf69f 193 // Surrogates are known to be in the code point range.
60c5eb7d 194 let code_point = unsafe { CodePoint::from_u32_unchecked(surrogate as u32) };
85aaf69f 195 // Skip the WTF-8 concatenation check,
e9174d1e 196 // surrogate pairs are already decoded by decode_utf16
85aaf69f
SL
197 string.push_code_point_unchecked(code_point)
198 }
199 }
200 }
201 string
202 }
203
204 /// Copied from String::push
205 /// This does **not** include the WTF-8 concatenation check.
206 fn push_code_point_unchecked(&mut self, code_point: CodePoint) {
c30ab7b3 207 let mut bytes = [0; 4];
f9f354fc 208 let bytes = char::encode_utf8_raw(code_point.value, &mut bytes);
c30ab7b3 209 self.bytes.extend_from_slice(bytes)
85aaf69f
SL
210 }
211
212 #[inline]
213 pub fn as_slice(&self) -> &Wtf8 {
c1a9b12d 214 unsafe { Wtf8::from_bytes_unchecked(&self.bytes) }
85aaf69f
SL
215 }
216
ff7c6d11
XL
217 #[inline]
218 pub fn as_mut_slice(&mut self) -> &mut Wtf8 {
219 unsafe { Wtf8::from_mut_bytes_unchecked(&mut self.bytes) }
220 }
221
85aaf69f
SL
222 /// Reserves capacity for at least `additional` more bytes to be inserted
223 /// in the given `Wtf8Buf`.
224 /// The collection may reserve more space to avoid frequent reallocations.
225 ///
226 /// # Panics
227 ///
c34b1796 228 /// Panics if the new capacity overflows `usize`.
85aaf69f 229 #[inline]
c34b1796 230 pub fn reserve(&mut self, additional: usize) {
85aaf69f
SL
231 self.bytes.reserve(additional)
232 }
233
7453a54e
SL
234 #[inline]
235 pub fn reserve_exact(&mut self, additional: usize) {
236 self.bytes.reserve_exact(additional)
237 }
238
8bb4bdeb
XL
239 #[inline]
240 pub fn shrink_to_fit(&mut self) {
241 self.bytes.shrink_to_fit()
242 }
243
0531ce1d
XL
244 #[inline]
245 pub fn shrink_to(&mut self, min_capacity: usize) {
246 self.bytes.shrink_to(min_capacity)
247 }
248
85aaf69f
SL
249 /// Returns the number of bytes that this string buffer can hold without reallocating.
250 #[inline]
c34b1796 251 pub fn capacity(&self) -> usize {
85aaf69f
SL
252 self.bytes.capacity()
253 }
254
d9579d0f 255 /// Append a UTF-8 slice at the end of the string.
85aaf69f
SL
256 #[inline]
257 pub fn push_str(&mut self, other: &str) {
92a42be0 258 self.bytes.extend_from_slice(other.as_bytes())
85aaf69f
SL
259 }
260
261 /// Append a WTF-8 slice at the end of the string.
262 ///
263 /// This replaces newly paired surrogates at the boundary
264 /// with a supplementary code point,
265 /// like concatenating ill-formed UTF-16 strings effectively would.
266 #[inline]
267 pub fn push_wtf8(&mut self, other: &Wtf8) {
268 match ((&*self).final_lead_surrogate(), other.initial_trail_surrogate()) {
269 // Replace newly paired surrogates by a supplementary code point.
270 (Some(lead), Some(trail)) => {
271 let len_without_lead_surrogate = self.len() - 3;
272 self.bytes.truncate(len_without_lead_surrogate);
273 let other_without_trail_surrogate = &other.bytes[3..];
274 // 4 bytes for the supplementary code point
275 self.bytes.reserve(4 + other_without_trail_surrogate.len());
276 self.push_char(decode_surrogate_pair(lead, trail));
92a42be0 277 self.bytes.extend_from_slice(other_without_trail_surrogate);
85aaf69f 278 }
60c5eb7d 279 _ => self.bytes.extend_from_slice(&other.bytes),
85aaf69f
SL
280 }
281 }
282
283 /// Append a Unicode scalar value at the end of the string.
284 #[inline]
285 pub fn push_char(&mut self, c: char) {
286 self.push_code_point_unchecked(CodePoint::from_char(c))
287 }
288
289 /// Append a code point at the end of the string.
290 ///
291 /// This replaces newly paired surrogates at the boundary
292 /// with a supplementary code point,
293 /// like concatenating ill-formed UTF-16 strings effectively would.
294 #[inline]
295 pub fn push(&mut self, code_point: CodePoint) {
8faf50e0 296 if let trail @ 0xDC00..=0xDFFF = code_point.to_u32() {
e9174d1e
SL
297 if let Some(lead) = (&*self).final_lead_surrogate() {
298 let len_without_lead_surrogate = self.len() - 3;
299 self.bytes.truncate(len_without_lead_surrogate);
300 self.push_char(decode_surrogate_pair(lead, trail as u16));
60c5eb7d 301 return;
85aaf69f 302 }
85aaf69f
SL
303 }
304
305 // No newly paired surrogates at the boundary.
306 self.push_code_point_unchecked(code_point)
307 }
308
309 /// Shortens a string to the specified length.
310 ///
311 /// # Panics
312 ///
313 /// Panics if `new_len` > current length,
314 /// or if `new_len` is not a code point boundary.
315 #[inline]
c34b1796 316 pub fn truncate(&mut self, new_len: usize) {
85aaf69f
SL
317 assert!(is_code_point_boundary(self, new_len));
318 self.bytes.truncate(new_len)
319 }
320
9346a6ac 321 /// Consumes the WTF-8 string and tries to convert it to UTF-8.
85aaf69f
SL
322 ///
323 /// This does not copy the data.
324 ///
325 /// If the contents are not well-formed UTF-8
326 /// (that is, if the string contains surrogates),
327 /// the original WTF-8 string is returned instead.
328 pub fn into_string(self) -> Result<String, Wtf8Buf> {
329 match self.next_surrogate(0) {
330 None => Ok(unsafe { String::from_utf8_unchecked(self.bytes) }),
331 Some(_) => Err(self),
332 }
333 }
334
9346a6ac 335 /// Consumes the WTF-8 string and converts it lossily to UTF-8.
85aaf69f
SL
336 ///
337 /// This does not copy the data (but may overwrite parts of it in place).
338 ///
339 /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”)
340 pub fn into_string_lossy(mut self) -> String {
341 let mut pos = 0;
342 loop {
343 match self.next_surrogate(pos) {
344 Some((surrogate_pos, _)) => {
345 pos = surrogate_pos + 3;
92a42be0 346 self.bytes[surrogate_pos..pos]
041b39d2 347 .copy_from_slice(UTF8_REPLACEMENT_CHARACTER.as_bytes());
60c5eb7d
XL
348 }
349 None => return unsafe { String::from_utf8_unchecked(self.bytes) },
85aaf69f
SL
350 }
351 }
352 }
8bb4bdeb
XL
353
354 /// Converts this `Wtf8Buf` into a boxed `Wtf8`.
355 #[inline]
356 pub fn into_box(self) -> Box<Wtf8> {
357 unsafe { mem::transmute(self.bytes.into_boxed_slice()) }
358 }
cc61c64b
XL
359
360 /// Converts a `Box<Wtf8>` into a `Wtf8Buf`.
361 pub fn from_box(boxed: Box<Wtf8>) -> Wtf8Buf {
362 let bytes: Box<[u8]> = unsafe { mem::transmute(boxed) };
363 Wtf8Buf { bytes: bytes.into_vec() }
364 }
85aaf69f
SL
365}
366
9fa01778 367/// Creates a new WTF-8 string from an iterator of code points.
85aaf69f
SL
368///
369/// This replaces surrogate code point pairs with supplementary code points,
370/// like concatenating ill-formed UTF-16 strings effectively would.
371impl FromIterator<CodePoint> for Wtf8Buf {
60c5eb7d 372 fn from_iter<T: IntoIterator<Item = CodePoint>>(iter: T) -> Wtf8Buf {
85aaf69f
SL
373 let mut string = Wtf8Buf::new();
374 string.extend(iter);
375 string
376 }
377}
378
379/// Append code points from an iterator to the string.
380///
381/// This replaces surrogate code point pairs with supplementary code points,
382/// like concatenating ill-formed UTF-16 strings effectively would.
383impl Extend<CodePoint> for Wtf8Buf {
60c5eb7d 384 fn extend<T: IntoIterator<Item = CodePoint>>(&mut self, iter: T) {
54a0048b 385 let iterator = iter.into_iter();
85aaf69f
SL
386 let (low, _high) = iterator.size_hint();
387 // Lower bound of one byte per code point (ASCII only)
388 self.bytes.reserve(low);
532ac7d7 389 iterator.for_each(move |code_point| self.push(code_point));
85aaf69f 390 }
f9f354fc
XL
391
392 #[inline]
393 fn extend_one(&mut self, code_point: CodePoint) {
394 self.push(code_point);
395 }
396
397 #[inline]
398 fn extend_reserve(&mut self, additional: usize) {
399 // Lower bound of one byte per code point (ASCII only)
400 self.bytes.reserve(additional);
401 }
85aaf69f
SL
402}
403
/// A borrowed slice of well-formed WTF-8 data.
///
/// Comparable to `&str`, except that it may also contain surrogate code points
/// as long as they are not part of a surrogate pair.
#[derive(PartialEq, Eq, PartialOrd, Ord)]
pub struct Wtf8 {
    bytes: [u8],
}
412
413impl AsInner<[u8]> for Wtf8 {
60c5eb7d
XL
414 fn as_inner(&self) -> &[u8] {
415 &self.bytes
416 }
85aaf69f
SL
417}
418
85aaf69f
SL
419/// Format the slice with double quotes,
420/// and surrogates as `\u` followed by four hexadecimal digits.
421/// Example: `"a\u{D800}"` for a slice with code points [U+0061, U+D800]
422impl fmt::Debug for Wtf8 {
532ac7d7
XL
423 fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
424 fn write_str_escaped(f: &mut fmt::Formatter<'_>, s: &str) -> fmt::Result {
425 use crate::fmt::Write;
5bcae85e 426 for c in s.chars().flat_map(|c| c.escape_debug()) {
54a0048b 427 f.write_char(c)?
c1a9b12d
SL
428 }
429 Ok(())
430 }
431
54a0048b 432 formatter.write_str("\"")?;
85aaf69f 433 let mut pos = 0;
0531ce1d 434 while let Some((surrogate_pos, surrogate)) = self.next_surrogate(pos) {
60c5eb7d
XL
435 write_str_escaped(formatter, unsafe {
436 str::from_utf8_unchecked(&self.bytes[pos..surrogate_pos])
437 })?;
0531ce1d
XL
438 write!(formatter, "\\u{{{:x}}}", surrogate)?;
439 pos = surrogate_pos + 3;
85aaf69f 440 }
60c5eb7d 441 write_str_escaped(formatter, unsafe { str::from_utf8_unchecked(&self.bytes[pos..]) })?;
85aaf69f
SL
442 formatter.write_str("\"")
443 }
444}
445
041b39d2 446impl fmt::Display for Wtf8 {
532ac7d7 447 fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
041b39d2
XL
448 let wtf8_bytes = &self.bytes;
449 let mut pos = 0;
450 loop {
451 match self.next_surrogate(pos) {
452 Some((surrogate_pos, _)) => {
453 formatter.write_str(unsafe {
60c5eb7d 454 str::from_utf8_unchecked(&wtf8_bytes[pos..surrogate_pos])
041b39d2
XL
455 })?;
456 formatter.write_str(UTF8_REPLACEMENT_CHARACTER)?;
457 pos = surrogate_pos + 3;
60c5eb7d 458 }
041b39d2 459 None => {
60c5eb7d
XL
460 let s = unsafe { str::from_utf8_unchecked(&wtf8_bytes[pos..]) };
461 if pos == 0 { return s.fmt(formatter) } else { return formatter.write_str(s) }
041b39d2
XL
462 }
463 }
464 }
465 }
466}
467
85aaf69f 468impl Wtf8 {
9346a6ac 469 /// Creates a WTF-8 slice from a UTF-8 `&str` slice.
85aaf69f
SL
470 ///
471 /// Since WTF-8 is a superset of UTF-8, this always succeeds.
472 #[inline]
473 pub fn from_str(value: &str) -> &Wtf8 {
c1a9b12d
SL
474 unsafe { Wtf8::from_bytes_unchecked(value.as_bytes()) }
475 }
476
477 /// Creates a WTF-8 slice from a WTF-8 byte slice.
478 ///
479 /// Since the byte slice is not checked for valid WTF-8, this functions is
480 /// marked unsafe.
481 #[inline]
482 unsafe fn from_bytes_unchecked(value: &[u8]) -> &Wtf8 {
483 mem::transmute(value)
85aaf69f
SL
484 }
485
ff7c6d11
XL
486 /// Creates a mutable WTF-8 slice from a mutable WTF-8 byte slice.
487 ///
488 /// Since the byte slice is not checked for valid WTF-8, this functions is
489 /// marked unsafe.
490 #[inline]
491 unsafe fn from_mut_bytes_unchecked(value: &mut [u8]) -> &mut Wtf8 {
492 mem::transmute(value)
493 }
494
9346a6ac 495 /// Returns the length, in WTF-8 bytes.
85aaf69f 496 #[inline]
c34b1796 497 pub fn len(&self) -> usize {
85aaf69f
SL
498 self.bytes.len()
499 }
500
7453a54e
SL
501 #[inline]
502 pub fn is_empty(&self) -> bool {
503 self.bytes.is_empty()
504 }
505
9346a6ac 506 /// Returns the code point at `position` if it is in the ASCII range,
85aaf69f
SL
507 /// or `b'\xFF' otherwise.
508 ///
509 /// # Panics
510 ///
511 /// Panics if `position` is beyond the end of the string.
512 #[inline]
c34b1796 513 pub fn ascii_byte_at(&self, position: usize) -> u8 {
85aaf69f 514 match self.bytes[position] {
60c5eb7d
XL
515 ascii_byte @ 0x00..=0x7F => ascii_byte,
516 _ => 0xFF,
85aaf69f
SL
517 }
518 }
519
9346a6ac 520 /// Returns an iterator for the string’s code points.
85aaf69f 521 #[inline]
532ac7d7 522 pub fn code_points(&self) -> Wtf8CodePoints<'_> {
85aaf69f
SL
523 Wtf8CodePoints { bytes: self.bytes.iter() }
524 }
525
9346a6ac 526 /// Tries to convert the string to UTF-8 and return a `&str` slice.
85aaf69f 527 ///
9346a6ac 528 /// Returns `None` if the string contains surrogates.
85aaf69f
SL
529 ///
530 /// This does not copy the data.
531 #[inline]
532 pub fn as_str(&self) -> Option<&str> {
533 // Well-formed WTF-8 is also well-formed UTF-8
534 // if and only if it contains no surrogate.
535 match self.next_surrogate(0) {
536 None => Some(unsafe { str::from_utf8_unchecked(&self.bytes) }),
537 Some(_) => None,
538 }
539 }
540
9346a6ac 541 /// Lossily converts the string to UTF-8.
d9579d0f 542 /// Returns a UTF-8 `&str` slice if the contents are well-formed in UTF-8.
85aaf69f
SL
543 ///
544 /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”).
545 ///
546 /// This only copies the data if necessary (if it contains any surrogate).
532ac7d7 547 pub fn to_string_lossy(&self) -> Cow<'_, str> {
85aaf69f
SL
548 let surrogate_pos = match self.next_surrogate(0) {
549 None => return Cow::Borrowed(unsafe { str::from_utf8_unchecked(&self.bytes) }),
550 Some((pos, _)) => pos,
551 };
552 let wtf8_bytes = &self.bytes;
553 let mut utf8_bytes = Vec::with_capacity(self.len());
92a42be0 554 utf8_bytes.extend_from_slice(&wtf8_bytes[..surrogate_pos]);
041b39d2 555 utf8_bytes.extend_from_slice(UTF8_REPLACEMENT_CHARACTER.as_bytes());
85aaf69f
SL
556 let mut pos = surrogate_pos + 3;
557 loop {
558 match self.next_surrogate(pos) {
559 Some((surrogate_pos, _)) => {
60c5eb7d 560 utf8_bytes.extend_from_slice(&wtf8_bytes[pos..surrogate_pos]);
041b39d2 561 utf8_bytes.extend_from_slice(UTF8_REPLACEMENT_CHARACTER.as_bytes());
85aaf69f 562 pos = surrogate_pos + 3;
60c5eb7d 563 }
85aaf69f 564 None => {
92a42be0 565 utf8_bytes.extend_from_slice(&wtf8_bytes[pos..]);
60c5eb7d 566 return Cow::Owned(unsafe { String::from_utf8_unchecked(utf8_bytes) });
85aaf69f
SL
567 }
568 }
569 }
570 }
571
9346a6ac 572 /// Converts the WTF-8 string to potentially ill-formed UTF-16
85aaf69f
SL
573 /// and return an iterator of 16-bit code units.
574 ///
575 /// This is lossless:
576 /// calling `Wtf8Buf::from_ill_formed_utf16` on the resulting code units
577 /// would always return the original WTF-8 string.
578 #[inline]
532ac7d7 579 pub fn encode_wide(&self) -> EncodeWide<'_> {
85aaf69f
SL
580 EncodeWide { code_points: self.code_points(), extra: 0 }
581 }
582
583 #[inline]
c34b1796 584 fn next_surrogate(&self, mut pos: usize) -> Option<(usize, u16)> {
85aaf69f
SL
585 let mut iter = self.bytes[pos..].iter();
586 loop {
ff7c6d11 587 let b = *iter.next()?;
85aaf69f
SL
588 if b < 0x80 {
589 pos += 1;
590 } else if b < 0xE0 {
591 iter.next();
592 pos += 2;
593 } else if b == 0xED {
594 match (iter.next(), iter.next()) {
595 (Some(&b2), Some(&b3)) if b2 >= 0xA0 => {
60c5eb7d 596 return Some((pos, decode_surrogate(b2, b3)));
85aaf69f 597 }
60c5eb7d 598 _ => pos += 3,
85aaf69f
SL
599 }
600 } else if b < 0xF0 {
601 iter.next();
602 iter.next();
603 pos += 3;
604 } else {
605 iter.next();
606 iter.next();
607 iter.next();
608 pos += 4;
609 }
610 }
611 }
612
613 #[inline]
614 fn final_lead_surrogate(&self) -> Option<u16> {
ba9703b0
XL
615 match self.bytes {
616 [.., 0xED, b2 @ 0xA0..=0xAF, b3] => Some(decode_surrogate(b2, b3)),
60c5eb7d 617 _ => None,
85aaf69f
SL
618 }
619 }
620
621 #[inline]
622 fn initial_trail_surrogate(&self) -> Option<u16> {
ba9703b0
XL
623 match self.bytes {
624 [0xED, b2 @ 0xB0..=0xBF, b3, ..] => Some(decode_surrogate(b2, b3)),
60c5eb7d 625 _ => None,
85aaf69f
SL
626 }
627 }
8bb4bdeb 628
ba9703b0
XL
629 pub fn clone_into(&self, buf: &mut Wtf8Buf) {
630 self.bytes.clone_into(&mut buf.bytes)
631 }
632
8bb4bdeb
XL
633 /// Boxes this `Wtf8`.
634 #[inline]
635 pub fn into_box(&self) -> Box<Wtf8> {
636 let boxed: Box<[u8]> = self.bytes.into();
637 unsafe { mem::transmute(boxed) }
638 }
639
640 /// Creates a boxed, empty `Wtf8`.
641 pub fn empty_box() -> Box<Wtf8> {
642 let boxed: Box<[u8]> = Default::default();
643 unsafe { mem::transmute(boxed) }
644 }
ff7c6d11
XL
645
646 #[inline]
647 pub fn into_arc(&self) -> Arc<Wtf8> {
648 let arc: Arc<[u8]> = Arc::from(&self.bytes);
649 unsafe { Arc::from_raw(Arc::into_raw(arc) as *const Wtf8) }
650 }
651
652 #[inline]
653 pub fn into_rc(&self) -> Rc<Wtf8> {
654 let rc: Rc<[u8]> = Rc::from(&self.bytes);
655 unsafe { Rc::from_raw(Rc::into_raw(rc) as *const Wtf8) }
656 }
ba9703b0
XL
657
658 #[inline]
659 pub fn make_ascii_lowercase(&mut self) {
660 self.bytes.make_ascii_lowercase()
661 }
662
663 #[inline]
664 pub fn make_ascii_uppercase(&mut self) {
665 self.bytes.make_ascii_uppercase()
666 }
667
668 #[inline]
669 pub fn to_ascii_lowercase(&self) -> Wtf8Buf {
670 Wtf8Buf { bytes: self.bytes.to_ascii_lowercase() }
671 }
672
673 #[inline]
674 pub fn to_ascii_uppercase(&self) -> Wtf8Buf {
675 Wtf8Buf { bytes: self.bytes.to_ascii_uppercase() }
676 }
677
678 #[inline]
679 pub fn is_ascii(&self) -> bool {
680 self.bytes.is_ascii()
681 }
682
683 #[inline]
684 pub fn eq_ignore_ascii_case(&self, other: &Self) -> bool {
685 self.bytes.eq_ignore_ascii_case(&other.bytes)
686 }
85aaf69f
SL
687}
688
9fa01778 689/// Returns a slice of the given string for the byte range [`begin`..`end`).
85aaf69f
SL
690///
691/// # Panics
692///
693/// Panics when `begin` and `end` do not point to code point boundaries,
694/// or point beyond the end of the string.
695impl ops::Index<ops::Range<usize>> for Wtf8 {
696 type Output = Wtf8;
697
698 #[inline]
c34b1796 699 fn index(&self, range: ops::Range<usize>) -> &Wtf8 {
85aaf69f 700 // is_code_point_boundary checks that the index is in [0, .len()]
60c5eb7d
XL
701 if range.start <= range.end
702 && is_code_point_boundary(self, range.start)
703 && is_code_point_boundary(self, range.end)
704 {
85aaf69f
SL
705 unsafe { slice_unchecked(self, range.start, range.end) }
706 } else {
707 slice_error_fail(self, range.start, range.end)
708 }
709 }
710}
711
9fa01778 712/// Returns a slice of the given string from byte `begin` to its end.
85aaf69f
SL
713///
714/// # Panics
715///
716/// Panics when `begin` is not at a code point boundary,
717/// or is beyond the end of the string.
718impl ops::Index<ops::RangeFrom<usize>> for Wtf8 {
719 type Output = Wtf8;
720
721 #[inline]
c34b1796 722 fn index(&self, range: ops::RangeFrom<usize>) -> &Wtf8 {
85aaf69f
SL
723 // is_code_point_boundary checks that the index is in [0, .len()]
724 if is_code_point_boundary(self, range.start) {
725 unsafe { slice_unchecked(self, range.start, self.len()) }
726 } else {
727 slice_error_fail(self, range.start, self.len())
728 }
729 }
730}
731
9fa01778 732/// Returns a slice of the given string from its beginning to byte `end`.
85aaf69f
SL
733///
734/// # Panics
735///
736/// Panics when `end` is not at a code point boundary,
737/// or is beyond the end of the string.
738impl ops::Index<ops::RangeTo<usize>> for Wtf8 {
739 type Output = Wtf8;
740
741 #[inline]
c34b1796 742 fn index(&self, range: ops::RangeTo<usize>) -> &Wtf8 {
85aaf69f
SL
743 // is_code_point_boundary checks that the index is in [0, .len()]
744 if is_code_point_boundary(self, range.end) {
745 unsafe { slice_unchecked(self, 0, range.end) }
746 } else {
747 slice_error_fail(self, 0, range.end)
748 }
749 }
750}
751
752impl ops::Index<ops::RangeFull> for Wtf8 {
753 type Output = Wtf8;
754
755 #[inline]
c34b1796 756 fn index(&self, _range: ops::RangeFull) -> &Wtf8 {
85aaf69f
SL
757 self
758 }
759}
760
/// Decodes the surrogate stored in a three-byte WTF-8 sequence from its
/// second and third bytes.
#[inline]
fn decode_surrogate(second_byte: u8, third_byte: u8) -> u16 {
    // The first byte is assumed to be 0xED
    let high_bits = (u16::from(second_byte) & 0x3F) << 6;
    let low_bits = u16::from(third_byte) & 0x3F;
    0xD800 | high_bits | low_bits
}
766
/// Combines a lead and a trail surrogate into the supplementary code point
/// they encode.
#[inline]
fn decode_surrogate_pair(lead: u16, trail: u16) -> char {
    let high_bits = u32::from(lead - 0xD800) << 10;
    let low_bits = u32::from(trail - 0xDC00);
    let code_point = 0x10000 + (high_bits | low_bits);
    // Callers pass a lead surrogate (0xD800..=0xDBFF) and a trail surrogate
    // (0xDC00..=0xDFFF), which puts the result in 0x10000..=0x10FFFF.
    unsafe { char::from_u32_unchecked(code_point) }
}
772
773/// Copied from core::str::StrPrelude::is_char_boundary
774#[inline]
c34b1796 775pub fn is_code_point_boundary(slice: &Wtf8, index: usize) -> bool {
60c5eb7d
XL
776 if index == slice.len() {
777 return true;
778 }
85aaf69f
SL
779 match slice.bytes.get(index) {
780 None => false,
c34b1796 781 Some(&b) => b < 128 || b >= 192,
85aaf69f
SL
782 }
783}
784
785/// Copied from core::str::raw::slice_unchecked
786#[inline]
c34b1796
AL
787pub unsafe fn slice_unchecked(s: &Wtf8, begin: usize, end: usize) -> &Wtf8 {
788 // memory layout of an &[u8] and &Wtf8 are the same
60c5eb7d 789 Wtf8::from_bytes_unchecked(slice::from_raw_parts(s.bytes.as_ptr().add(begin), end - begin))
85aaf69f
SL
790}
791
792/// Copied from core::str::raw::slice_error_fail
793#[inline(never)]
c34b1796 794pub fn slice_error_fail(s: &Wtf8, begin: usize, end: usize) -> ! {
85aaf69f 795 assert!(begin <= end);
60c5eb7d 796 panic!("index {} and/or {} in `{:?}` do not lie on character boundary", begin, end, s);
85aaf69f
SL
797}
798
/// Iterator over the code points of a WTF-8 string.
///
/// Obtained by calling the method `.code_points()`.
#[derive(Clone)]
pub struct Wtf8CodePoints<'a> {
    bytes: slice::Iter<'a, u8>,
}
806
807impl<'a> Iterator for Wtf8CodePoints<'a> {
808 type Item = CodePoint;
809
810 #[inline]
811 fn next(&mut self) -> Option<CodePoint> {
812 next_code_point(&mut self.bytes).map(|c| CodePoint { value: c })
813 }
814
815 #[inline]
c34b1796 816 fn size_hint(&self) -> (usize, Option<usize>) {
3157f602 817 let len = self.bytes.len();
85aaf69f
SL
818 (len.saturating_add(3) / 4, Some(len))
819 }
820}
821
7cac9316 822/// Generates a wide character sequence for potentially ill-formed UTF-16.
92a42be0 823#[stable(feature = "rust1", since = "1.0.0")]
85aaf69f
SL
824#[derive(Clone)]
825pub struct EncodeWide<'a> {
826 code_points: Wtf8CodePoints<'a>,
60c5eb7d 827 extra: u16,
85aaf69f
SL
828}
829
830// Copied from libunicode/u_str.rs
92a42be0 831#[stable(feature = "rust1", since = "1.0.0")]
85aaf69f
SL
832impl<'a> Iterator for EncodeWide<'a> {
833 type Item = u16;
834
835 #[inline]
836 fn next(&mut self) -> Option<u16> {
837 if self.extra != 0 {
838 let tmp = self.extra;
839 self.extra = 0;
840 return Some(tmp);
841 }
842
c30ab7b3 843 let mut buf = [0; 2];
85aaf69f 844 self.code_points.next().map(|code_point| {
f9f354fc 845 let n = char::encode_utf16_raw(code_point.value, &mut buf).len();
c30ab7b3
SL
846 if n == 2 {
847 self.extra = buf[1];
54a0048b 848 }
c30ab7b3 849 buf[0]
85aaf69f
SL
850 })
851 }
852
853 #[inline]
c34b1796 854 fn size_hint(&self) -> (usize, Option<usize>) {
85aaf69f
SL
855 let (low, high) = self.code_points.size_hint();
856 // every code point gets either one u16 or two u16,
857 // so this iterator is between 1 or 2 times as
858 // long as the underlying iterator.
859 (low, high.and_then(|n| n.checked_mul(2)))
860 }
861}
862
85aaf69f
SL
863impl Hash for CodePoint {
864 #[inline]
865 fn hash<H: Hasher>(&self, state: &mut H) {
866 self.value.hash(state)
867 }
868}
869
85aaf69f
SL
870impl Hash for Wtf8Buf {
871 #[inline]
872 fn hash<H: Hasher>(&self, state: &mut H) {
873 state.write(&self.bytes);
874 0xfeu8.hash(state)
875 }
876}
877
85aaf69f
SL
878impl Hash for Wtf8 {
879 #[inline]
880 fn hash<H: Hasher>(&self, state: &mut H) {
881 state.write(&self.bytes);
882 0xfeu8.hash(state)
883 }
884}