]>
Commit | Line | Data |
---|---|---|
85aaf69f SL |
1 | //! Implementation of [the WTF-8 encoding](https://simonsapin.github.io/wtf-8/). |
2 | //! | |
3 | //! This library uses Rust’s type system to maintain | |
4 | //! [well-formedness](https://simonsapin.github.io/wtf-8/#well-formed), | |
5 | //! like the `String` and `&str` types do for UTF-8. | |
6 | //! | |
7 | //! Since [WTF-8 must not be used | |
8 | //! for interchange](https://simonsapin.github.io/wtf-8/#intended-audience), | |
9 | //! this library deliberately does not provide access to the underlying bytes | |
10 | //! of WTF-8 strings, | |
11 | //! nor can it decode WTF-8 from arbitrary bytes. | |
12 | //! WTF-8 strings can be obtained from UTF-8, UTF-16, or code points. | |
13 | ||
c34b1796 AL |
14 | // this module is imported from @SimonSapin's repo and has tons of dead code on |
15 | // unix (it's mostly used on windows), so don't worry about dead code here. | |
16 | #![allow(dead_code)] | |
17 | ||
1b1a35ee XL |
18 | #[cfg(test)] |
19 | mod tests; | |
20 | ||
62682a34 | 21 | use core::str::next_code_point; |
85aaf69f | 22 | |
532ac7d7 XL |
23 | use crate::borrow::Cow; |
24 | use crate::char; | |
25 | use crate::fmt; | |
26 | use crate::hash::{Hash, Hasher}; | |
27 | use crate::iter::FromIterator; | |
28 | use crate::mem; | |
29 | use crate::ops; | |
30 | use crate::rc::Rc; | |
31 | use crate::slice; | |
32 | use crate::str; | |
33 | use crate::sync::Arc; | |
34 | use crate::sys_common::AsInner; | |
85aaf69f | 35 | |
/// U+FFFD REPLACEMENT CHARACTER as a UTF-8 string; it is written in place of
/// surrogates by the lossy conversions in this module.
const UTF8_REPLACEMENT_CHARACTER: &str = "\u{FFFD}";
85aaf69f SL |
37 | |
/// A Unicode code point: any value from U+0000 to U+10FFFF.
///
/// Compare with the `char` type, which represents a Unicode scalar value:
/// a code point that is not a surrogate (U+D800 to U+DFFF).
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub struct CodePoint {
    /// Numeric value of the code point.
    value: u32,
}
47 | ||
48 | /// Format the code point as `U+` followed by four to six hexadecimal digits. | |
49 | /// Example: `U+1F4A9` | |
50 | impl fmt::Debug for CodePoint { | |
51 | #[inline] | |
532ac7d7 | 52 | fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { |
85aaf69f SL |
53 | write!(formatter, "U+{:04X}", self.value) |
54 | } | |
55 | } | |
56 | ||
57 | impl CodePoint { | |
9346a6ac | 58 | /// Unsafely creates a new `CodePoint` without checking the value. |
85aaf69f SL |
59 | /// |
60 | /// Only use when `value` is known to be less than or equal to 0x10FFFF. | |
61 | #[inline] | |
62 | pub unsafe fn from_u32_unchecked(value: u32) -> CodePoint { | |
a1dfa0c6 | 63 | CodePoint { value } |
85aaf69f SL |
64 | } |
65 | ||
9346a6ac | 66 | /// Creates a new `CodePoint` if the value is a valid code point. |
85aaf69f | 67 | /// |
9346a6ac | 68 | /// Returns `None` if `value` is above 0x10FFFF. |
85aaf69f SL |
69 | #[inline] |
70 | pub fn from_u32(value: u32) -> Option<CodePoint> { | |
71 | match value { | |
60c5eb7d XL |
72 | 0..=0x10FFFF => Some(CodePoint { value }), |
73 | _ => None, | |
85aaf69f SL |
74 | } |
75 | } | |
76 | ||
9346a6ac | 77 | /// Creates a new `CodePoint` from a `char`. |
85aaf69f SL |
78 | /// |
79 | /// Since all Unicode scalar values are code points, this always succeeds. | |
80 | #[inline] | |
81 | pub fn from_char(value: char) -> CodePoint { | |
82 | CodePoint { value: value as u32 } | |
83 | } | |
84 | ||
9346a6ac | 85 | /// Returns the numeric value of the code point. |
85aaf69f SL |
86 | #[inline] |
87 | pub fn to_u32(&self) -> u32 { | |
88 | self.value | |
89 | } | |
90 | ||
9346a6ac | 91 | /// Optionally returns a Unicode scalar value for the code point. |
85aaf69f | 92 | /// |
9346a6ac | 93 | /// Returns `None` if the code point is a surrogate (from U+D800 to U+DFFF). |
85aaf69f SL |
94 | #[inline] |
95 | pub fn to_char(&self) -> Option<char> { | |
96 | match self.value { | |
60c5eb7d XL |
97 | 0xD800..=0xDFFF => None, |
98 | _ => Some(unsafe { char::from_u32_unchecked(self.value) }), | |
85aaf69f SL |
99 | } |
100 | } | |
101 | ||
9346a6ac | 102 | /// Returns a Unicode scalar value for the code point. |
85aaf69f | 103 | /// |
9346a6ac | 104 | /// Returns `'\u{FFFD}'` (the replacement character “�”) |
85aaf69f SL |
105 | /// if the code point is a surrogate (from U+D800 to U+DFFF). |
106 | #[inline] | |
107 | pub fn to_char_lossy(&self) -> char { | |
108 | self.to_char().unwrap_or('\u{FFFD}') | |
109 | } | |
110 | } | |
111 | ||
/// An owned, growable buffer holding well-formed WTF-8 data.
///
/// This is the WTF-8 counterpart of `String`: in addition to everything a
/// `String` can hold, it may contain surrogate code points
/// as long as they are not part of a surrogate pair.
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct Wtf8Buf {
    /// The WTF-8 encoded contents.
    bytes: Vec<u8>,
}
120 | ||
121 | impl ops::Deref for Wtf8Buf { | |
122 | type Target = Wtf8; | |
123 | ||
124 | fn deref(&self) -> &Wtf8 { | |
125 | self.as_slice() | |
126 | } | |
127 | } | |
128 | ||
ff7c6d11 XL |
129 | impl ops::DerefMut for Wtf8Buf { |
130 | fn deref_mut(&mut self) -> &mut Wtf8 { | |
131 | self.as_mut_slice() | |
132 | } | |
133 | } | |
134 | ||
85aaf69f SL |
135 | /// Format the string with double quotes, |
136 | /// and surrogates as `\u` followed by four hexadecimal digits. | |
137 | /// Example: `"a\u{D800}"` for a string with code points [U+0061, U+D800] | |
138 | impl fmt::Debug for Wtf8Buf { | |
139 | #[inline] | |
532ac7d7 | 140 | fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { |
85aaf69f SL |
141 | fmt::Debug::fmt(&**self, formatter) |
142 | } | |
143 | } | |
144 | ||
145 | impl Wtf8Buf { | |
b039eaaf | 146 | /// Creates a new, empty WTF-8 string. |
85aaf69f SL |
147 | #[inline] |
148 | pub fn new() -> Wtf8Buf { | |
149 | Wtf8Buf { bytes: Vec::new() } | |
150 | } | |
151 | ||
48663c56 | 152 | /// Creates a new, empty WTF-8 string with pre-allocated capacity for `capacity` bytes. |
85aaf69f | 153 | #[inline] |
48663c56 XL |
154 | pub fn with_capacity(capacity: usize) -> Wtf8Buf { |
155 | Wtf8Buf { bytes: Vec::with_capacity(capacity) } | |
85aaf69f SL |
156 | } |
157 | ||
d9579d0f | 158 | /// Creates a WTF-8 string from a UTF-8 `String`. |
85aaf69f SL |
159 | /// |
160 | /// This takes ownership of the `String` and does not copy. | |
161 | /// | |
162 | /// Since WTF-8 is a superset of UTF-8, this always succeeds. | |
163 | #[inline] | |
164 | pub fn from_string(string: String) -> Wtf8Buf { | |
165 | Wtf8Buf { bytes: string.into_bytes() } | |
166 | } | |
167 | ||
d9579d0f | 168 | /// Creates a WTF-8 string from a UTF-8 `&str` slice. |
85aaf69f SL |
169 | /// |
170 | /// This copies the content of the slice. | |
171 | /// | |
172 | /// Since WTF-8 is a superset of UTF-8, this always succeeds. | |
173 | #[inline] | |
174 | pub fn from_str(str: &str) -> Wtf8Buf { | |
c34b1796 | 175 | Wtf8Buf { bytes: <[_]>::to_vec(str.as_bytes()) } |
85aaf69f SL |
176 | } |
177 | ||
7453a54e SL |
178 | pub fn clear(&mut self) { |
179 | self.bytes.clear() | |
180 | } | |
181 | ||
9346a6ac | 182 | /// Creates a WTF-8 string from a potentially ill-formed UTF-16 slice of 16-bit code units. |
85aaf69f SL |
183 | /// |
184 | /// This is lossless: calling `.encode_wide()` on the resulting string | |
185 | /// will always return the original code units. | |
186 | pub fn from_wide(v: &[u16]) -> Wtf8Buf { | |
187 | let mut string = Wtf8Buf::with_capacity(v.len()); | |
e9174d1e | 188 | for item in char::decode_utf16(v.iter().cloned()) { |
85aaf69f | 189 | match item { |
e9174d1e SL |
190 | Ok(ch) => string.push_char(ch), |
191 | Err(surrogate) => { | |
54a0048b | 192 | let surrogate = surrogate.unpaired_surrogate(); |
85aaf69f | 193 | // Surrogates are known to be in the code point range. |
60c5eb7d | 194 | let code_point = unsafe { CodePoint::from_u32_unchecked(surrogate as u32) }; |
85aaf69f | 195 | // Skip the WTF-8 concatenation check, |
e9174d1e | 196 | // surrogate pairs are already decoded by decode_utf16 |
85aaf69f SL |
197 | string.push_code_point_unchecked(code_point) |
198 | } | |
199 | } | |
200 | } | |
201 | string | |
202 | } | |
203 | ||
204 | /// Copied from String::push | |
205 | /// This does **not** include the WTF-8 concatenation check. | |
206 | fn push_code_point_unchecked(&mut self, code_point: CodePoint) { | |
c30ab7b3 | 207 | let mut bytes = [0; 4]; |
f9f354fc | 208 | let bytes = char::encode_utf8_raw(code_point.value, &mut bytes); |
c30ab7b3 | 209 | self.bytes.extend_from_slice(bytes) |
85aaf69f SL |
210 | } |
211 | ||
212 | #[inline] | |
213 | pub fn as_slice(&self) -> &Wtf8 { | |
c1a9b12d | 214 | unsafe { Wtf8::from_bytes_unchecked(&self.bytes) } |
85aaf69f SL |
215 | } |
216 | ||
ff7c6d11 XL |
217 | #[inline] |
218 | pub fn as_mut_slice(&mut self) -> &mut Wtf8 { | |
219 | unsafe { Wtf8::from_mut_bytes_unchecked(&mut self.bytes) } | |
220 | } | |
221 | ||
85aaf69f SL |
222 | /// Reserves capacity for at least `additional` more bytes to be inserted |
223 | /// in the given `Wtf8Buf`. | |
224 | /// The collection may reserve more space to avoid frequent reallocations. | |
225 | /// | |
226 | /// # Panics | |
227 | /// | |
c34b1796 | 228 | /// Panics if the new capacity overflows `usize`. |
85aaf69f | 229 | #[inline] |
c34b1796 | 230 | pub fn reserve(&mut self, additional: usize) { |
85aaf69f SL |
231 | self.bytes.reserve(additional) |
232 | } | |
233 | ||
7453a54e SL |
234 | #[inline] |
235 | pub fn reserve_exact(&mut self, additional: usize) { | |
236 | self.bytes.reserve_exact(additional) | |
237 | } | |
238 | ||
8bb4bdeb XL |
239 | #[inline] |
240 | pub fn shrink_to_fit(&mut self) { | |
241 | self.bytes.shrink_to_fit() | |
242 | } | |
243 | ||
0531ce1d XL |
244 | #[inline] |
245 | pub fn shrink_to(&mut self, min_capacity: usize) { | |
246 | self.bytes.shrink_to(min_capacity) | |
247 | } | |
248 | ||
85aaf69f SL |
249 | /// Returns the number of bytes that this string buffer can hold without reallocating. |
250 | #[inline] | |
c34b1796 | 251 | pub fn capacity(&self) -> usize { |
85aaf69f SL |
252 | self.bytes.capacity() |
253 | } | |
254 | ||
d9579d0f | 255 | /// Append a UTF-8 slice at the end of the string. |
85aaf69f SL |
256 | #[inline] |
257 | pub fn push_str(&mut self, other: &str) { | |
92a42be0 | 258 | self.bytes.extend_from_slice(other.as_bytes()) |
85aaf69f SL |
259 | } |
260 | ||
261 | /// Append a WTF-8 slice at the end of the string. | |
262 | /// | |
263 | /// This replaces newly paired surrogates at the boundary | |
264 | /// with a supplementary code point, | |
265 | /// like concatenating ill-formed UTF-16 strings effectively would. | |
266 | #[inline] | |
267 | pub fn push_wtf8(&mut self, other: &Wtf8) { | |
268 | match ((&*self).final_lead_surrogate(), other.initial_trail_surrogate()) { | |
269 | // Replace newly paired surrogates by a supplementary code point. | |
270 | (Some(lead), Some(trail)) => { | |
271 | let len_without_lead_surrogate = self.len() - 3; | |
272 | self.bytes.truncate(len_without_lead_surrogate); | |
273 | let other_without_trail_surrogate = &other.bytes[3..]; | |
274 | // 4 bytes for the supplementary code point | |
275 | self.bytes.reserve(4 + other_without_trail_surrogate.len()); | |
276 | self.push_char(decode_surrogate_pair(lead, trail)); | |
92a42be0 | 277 | self.bytes.extend_from_slice(other_without_trail_surrogate); |
85aaf69f | 278 | } |
60c5eb7d | 279 | _ => self.bytes.extend_from_slice(&other.bytes), |
85aaf69f SL |
280 | } |
281 | } | |
282 | ||
283 | /// Append a Unicode scalar value at the end of the string. | |
284 | #[inline] | |
285 | pub fn push_char(&mut self, c: char) { | |
286 | self.push_code_point_unchecked(CodePoint::from_char(c)) | |
287 | } | |
288 | ||
289 | /// Append a code point at the end of the string. | |
290 | /// | |
291 | /// This replaces newly paired surrogates at the boundary | |
292 | /// with a supplementary code point, | |
293 | /// like concatenating ill-formed UTF-16 strings effectively would. | |
294 | #[inline] | |
295 | pub fn push(&mut self, code_point: CodePoint) { | |
8faf50e0 | 296 | if let trail @ 0xDC00..=0xDFFF = code_point.to_u32() { |
e9174d1e SL |
297 | if let Some(lead) = (&*self).final_lead_surrogate() { |
298 | let len_without_lead_surrogate = self.len() - 3; | |
299 | self.bytes.truncate(len_without_lead_surrogate); | |
300 | self.push_char(decode_surrogate_pair(lead, trail as u16)); | |
60c5eb7d | 301 | return; |
85aaf69f | 302 | } |
85aaf69f SL |
303 | } |
304 | ||
305 | // No newly paired surrogates at the boundary. | |
306 | self.push_code_point_unchecked(code_point) | |
307 | } | |
308 | ||
309 | /// Shortens a string to the specified length. | |
310 | /// | |
311 | /// # Panics | |
312 | /// | |
313 | /// Panics if `new_len` > current length, | |
314 | /// or if `new_len` is not a code point boundary. | |
315 | #[inline] | |
c34b1796 | 316 | pub fn truncate(&mut self, new_len: usize) { |
85aaf69f SL |
317 | assert!(is_code_point_boundary(self, new_len)); |
318 | self.bytes.truncate(new_len) | |
319 | } | |
320 | ||
9346a6ac | 321 | /// Consumes the WTF-8 string and tries to convert it to UTF-8. |
85aaf69f SL |
322 | /// |
323 | /// This does not copy the data. | |
324 | /// | |
325 | /// If the contents are not well-formed UTF-8 | |
326 | /// (that is, if the string contains surrogates), | |
327 | /// the original WTF-8 string is returned instead. | |
328 | pub fn into_string(self) -> Result<String, Wtf8Buf> { | |
329 | match self.next_surrogate(0) { | |
330 | None => Ok(unsafe { String::from_utf8_unchecked(self.bytes) }), | |
331 | Some(_) => Err(self), | |
332 | } | |
333 | } | |
334 | ||
9346a6ac | 335 | /// Consumes the WTF-8 string and converts it lossily to UTF-8. |
85aaf69f SL |
336 | /// |
337 | /// This does not copy the data (but may overwrite parts of it in place). | |
338 | /// | |
339 | /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”) | |
340 | pub fn into_string_lossy(mut self) -> String { | |
341 | let mut pos = 0; | |
342 | loop { | |
343 | match self.next_surrogate(pos) { | |
344 | Some((surrogate_pos, _)) => { | |
345 | pos = surrogate_pos + 3; | |
92a42be0 | 346 | self.bytes[surrogate_pos..pos] |
041b39d2 | 347 | .copy_from_slice(UTF8_REPLACEMENT_CHARACTER.as_bytes()); |
60c5eb7d XL |
348 | } |
349 | None => return unsafe { String::from_utf8_unchecked(self.bytes) }, | |
85aaf69f SL |
350 | } |
351 | } | |
352 | } | |
8bb4bdeb XL |
353 | |
354 | /// Converts this `Wtf8Buf` into a boxed `Wtf8`. | |
355 | #[inline] | |
356 | pub fn into_box(self) -> Box<Wtf8> { | |
357 | unsafe { mem::transmute(self.bytes.into_boxed_slice()) } | |
358 | } | |
cc61c64b XL |
359 | |
360 | /// Converts a `Box<Wtf8>` into a `Wtf8Buf`. | |
361 | pub fn from_box(boxed: Box<Wtf8>) -> Wtf8Buf { | |
362 | let bytes: Box<[u8]> = unsafe { mem::transmute(boxed) }; | |
363 | Wtf8Buf { bytes: bytes.into_vec() } | |
364 | } | |
85aaf69f SL |
365 | } |
366 | ||
9fa01778 | 367 | /// Creates a new WTF-8 string from an iterator of code points. |
85aaf69f SL |
368 | /// |
369 | /// This replaces surrogate code point pairs with supplementary code points, | |
370 | /// like concatenating ill-formed UTF-16 strings effectively would. | |
371 | impl FromIterator<CodePoint> for Wtf8Buf { | |
60c5eb7d | 372 | fn from_iter<T: IntoIterator<Item = CodePoint>>(iter: T) -> Wtf8Buf { |
85aaf69f SL |
373 | let mut string = Wtf8Buf::new(); |
374 | string.extend(iter); | |
375 | string | |
376 | } | |
377 | } | |
378 | ||
379 | /// Append code points from an iterator to the string. | |
380 | /// | |
381 | /// This replaces surrogate code point pairs with supplementary code points, | |
382 | /// like concatenating ill-formed UTF-16 strings effectively would. | |
383 | impl Extend<CodePoint> for Wtf8Buf { | |
60c5eb7d | 384 | fn extend<T: IntoIterator<Item = CodePoint>>(&mut self, iter: T) { |
54a0048b | 385 | let iterator = iter.into_iter(); |
85aaf69f SL |
386 | let (low, _high) = iterator.size_hint(); |
387 | // Lower bound of one byte per code point (ASCII only) | |
388 | self.bytes.reserve(low); | |
532ac7d7 | 389 | iterator.for_each(move |code_point| self.push(code_point)); |
85aaf69f | 390 | } |
f9f354fc XL |
391 | |
392 | #[inline] | |
393 | fn extend_one(&mut self, code_point: CodePoint) { | |
394 | self.push(code_point); | |
395 | } | |
396 | ||
397 | #[inline] | |
398 | fn extend_reserve(&mut self, additional: usize) { | |
399 | // Lower bound of one byte per code point (ASCII only) | |
400 | self.bytes.reserve(additional); | |
401 | } | |
85aaf69f SL |
402 | } |
403 | ||
/// A borrowed slice of well-formed WTF-8 data.
///
/// Similar to `&str`, but can additionally contain surrogate code points
/// if they’re not in a surrogate pair.
///
/// The type is `repr(transparent)` over `[u8]`: this module converts between
/// `[u8]` and `Wtf8` behind references, `Box`, `Rc` and `Arc` with
/// `mem::transmute` and raw pointer casts, which is only sound when both
/// types are guaranteed to share the same layout.
#[derive(Eq, Ord, PartialEq, PartialOrd)]
#[repr(transparent)]
pub struct Wtf8 {
    /// The WTF-8 encoded contents.
    bytes: [u8],
}
412 | ||
413 | impl AsInner<[u8]> for Wtf8 { | |
60c5eb7d XL |
414 | fn as_inner(&self) -> &[u8] { |
415 | &self.bytes | |
416 | } | |
85aaf69f SL |
417 | } |
418 | ||
85aaf69f SL |
419 | /// Format the slice with double quotes, |
420 | /// and surrogates as `\u` followed by four hexadecimal digits. | |
421 | /// Example: `"a\u{D800}"` for a slice with code points [U+0061, U+D800] | |
422 | impl fmt::Debug for Wtf8 { | |
532ac7d7 XL |
423 | fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { |
424 | fn write_str_escaped(f: &mut fmt::Formatter<'_>, s: &str) -> fmt::Result { | |
425 | use crate::fmt::Write; | |
5bcae85e | 426 | for c in s.chars().flat_map(|c| c.escape_debug()) { |
54a0048b | 427 | f.write_char(c)? |
c1a9b12d SL |
428 | } |
429 | Ok(()) | |
430 | } | |
431 | ||
54a0048b | 432 | formatter.write_str("\"")?; |
85aaf69f | 433 | let mut pos = 0; |
0531ce1d | 434 | while let Some((surrogate_pos, surrogate)) = self.next_surrogate(pos) { |
60c5eb7d XL |
435 | write_str_escaped(formatter, unsafe { |
436 | str::from_utf8_unchecked(&self.bytes[pos..surrogate_pos]) | |
437 | })?; | |
0531ce1d XL |
438 | write!(formatter, "\\u{{{:x}}}", surrogate)?; |
439 | pos = surrogate_pos + 3; | |
85aaf69f | 440 | } |
60c5eb7d | 441 | write_str_escaped(formatter, unsafe { str::from_utf8_unchecked(&self.bytes[pos..]) })?; |
85aaf69f SL |
442 | formatter.write_str("\"") |
443 | } | |
444 | } | |
445 | ||
041b39d2 | 446 | impl fmt::Display for Wtf8 { |
532ac7d7 | 447 | fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { |
041b39d2 XL |
448 | let wtf8_bytes = &self.bytes; |
449 | let mut pos = 0; | |
450 | loop { | |
451 | match self.next_surrogate(pos) { | |
452 | Some((surrogate_pos, _)) => { | |
453 | formatter.write_str(unsafe { | |
60c5eb7d | 454 | str::from_utf8_unchecked(&wtf8_bytes[pos..surrogate_pos]) |
041b39d2 XL |
455 | })?; |
456 | formatter.write_str(UTF8_REPLACEMENT_CHARACTER)?; | |
457 | pos = surrogate_pos + 3; | |
60c5eb7d | 458 | } |
041b39d2 | 459 | None => { |
60c5eb7d XL |
460 | let s = unsafe { str::from_utf8_unchecked(&wtf8_bytes[pos..]) }; |
461 | if pos == 0 { return s.fmt(formatter) } else { return formatter.write_str(s) } | |
041b39d2 XL |
462 | } |
463 | } | |
464 | } | |
465 | } | |
466 | } | |
467 | ||
85aaf69f | 468 | impl Wtf8 { |
9346a6ac | 469 | /// Creates a WTF-8 slice from a UTF-8 `&str` slice. |
85aaf69f SL |
470 | /// |
471 | /// Since WTF-8 is a superset of UTF-8, this always succeeds. | |
472 | #[inline] | |
473 | pub fn from_str(value: &str) -> &Wtf8 { | |
c1a9b12d SL |
474 | unsafe { Wtf8::from_bytes_unchecked(value.as_bytes()) } |
475 | } | |
476 | ||
477 | /// Creates a WTF-8 slice from a WTF-8 byte slice. | |
478 | /// | |
479 | /// Since the byte slice is not checked for valid WTF-8, this functions is | |
480 | /// marked unsafe. | |
481 | #[inline] | |
482 | unsafe fn from_bytes_unchecked(value: &[u8]) -> &Wtf8 { | |
483 | mem::transmute(value) | |
85aaf69f SL |
484 | } |
485 | ||
ff7c6d11 XL |
486 | /// Creates a mutable WTF-8 slice from a mutable WTF-8 byte slice. |
487 | /// | |
488 | /// Since the byte slice is not checked for valid WTF-8, this functions is | |
489 | /// marked unsafe. | |
490 | #[inline] | |
491 | unsafe fn from_mut_bytes_unchecked(value: &mut [u8]) -> &mut Wtf8 { | |
492 | mem::transmute(value) | |
493 | } | |
494 | ||
9346a6ac | 495 | /// Returns the length, in WTF-8 bytes. |
85aaf69f | 496 | #[inline] |
c34b1796 | 497 | pub fn len(&self) -> usize { |
85aaf69f SL |
498 | self.bytes.len() |
499 | } | |
500 | ||
7453a54e SL |
501 | #[inline] |
502 | pub fn is_empty(&self) -> bool { | |
503 | self.bytes.is_empty() | |
504 | } | |
505 | ||
9346a6ac | 506 | /// Returns the code point at `position` if it is in the ASCII range, |
85aaf69f SL |
507 | /// or `b'\xFF' otherwise. |
508 | /// | |
509 | /// # Panics | |
510 | /// | |
511 | /// Panics if `position` is beyond the end of the string. | |
512 | #[inline] | |
c34b1796 | 513 | pub fn ascii_byte_at(&self, position: usize) -> u8 { |
85aaf69f | 514 | match self.bytes[position] { |
60c5eb7d XL |
515 | ascii_byte @ 0x00..=0x7F => ascii_byte, |
516 | _ => 0xFF, | |
85aaf69f SL |
517 | } |
518 | } | |
519 | ||
9346a6ac | 520 | /// Returns an iterator for the string’s code points. |
85aaf69f | 521 | #[inline] |
532ac7d7 | 522 | pub fn code_points(&self) -> Wtf8CodePoints<'_> { |
85aaf69f SL |
523 | Wtf8CodePoints { bytes: self.bytes.iter() } |
524 | } | |
525 | ||
9346a6ac | 526 | /// Tries to convert the string to UTF-8 and return a `&str` slice. |
85aaf69f | 527 | /// |
9346a6ac | 528 | /// Returns `None` if the string contains surrogates. |
85aaf69f SL |
529 | /// |
530 | /// This does not copy the data. | |
531 | #[inline] | |
532 | pub fn as_str(&self) -> Option<&str> { | |
533 | // Well-formed WTF-8 is also well-formed UTF-8 | |
534 | // if and only if it contains no surrogate. | |
535 | match self.next_surrogate(0) { | |
536 | None => Some(unsafe { str::from_utf8_unchecked(&self.bytes) }), | |
537 | Some(_) => None, | |
538 | } | |
539 | } | |
540 | ||
9346a6ac | 541 | /// Lossily converts the string to UTF-8. |
d9579d0f | 542 | /// Returns a UTF-8 `&str` slice if the contents are well-formed in UTF-8. |
85aaf69f SL |
543 | /// |
544 | /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”). | |
545 | /// | |
546 | /// This only copies the data if necessary (if it contains any surrogate). | |
532ac7d7 | 547 | pub fn to_string_lossy(&self) -> Cow<'_, str> { |
85aaf69f SL |
548 | let surrogate_pos = match self.next_surrogate(0) { |
549 | None => return Cow::Borrowed(unsafe { str::from_utf8_unchecked(&self.bytes) }), | |
550 | Some((pos, _)) => pos, | |
551 | }; | |
552 | let wtf8_bytes = &self.bytes; | |
553 | let mut utf8_bytes = Vec::with_capacity(self.len()); | |
92a42be0 | 554 | utf8_bytes.extend_from_slice(&wtf8_bytes[..surrogate_pos]); |
041b39d2 | 555 | utf8_bytes.extend_from_slice(UTF8_REPLACEMENT_CHARACTER.as_bytes()); |
85aaf69f SL |
556 | let mut pos = surrogate_pos + 3; |
557 | loop { | |
558 | match self.next_surrogate(pos) { | |
559 | Some((surrogate_pos, _)) => { | |
60c5eb7d | 560 | utf8_bytes.extend_from_slice(&wtf8_bytes[pos..surrogate_pos]); |
041b39d2 | 561 | utf8_bytes.extend_from_slice(UTF8_REPLACEMENT_CHARACTER.as_bytes()); |
85aaf69f | 562 | pos = surrogate_pos + 3; |
60c5eb7d | 563 | } |
85aaf69f | 564 | None => { |
92a42be0 | 565 | utf8_bytes.extend_from_slice(&wtf8_bytes[pos..]); |
60c5eb7d | 566 | return Cow::Owned(unsafe { String::from_utf8_unchecked(utf8_bytes) }); |
85aaf69f SL |
567 | } |
568 | } | |
569 | } | |
570 | } | |
571 | ||
9346a6ac | 572 | /// Converts the WTF-8 string to potentially ill-formed UTF-16 |
85aaf69f SL |
573 | /// and return an iterator of 16-bit code units. |
574 | /// | |
575 | /// This is lossless: | |
576 | /// calling `Wtf8Buf::from_ill_formed_utf16` on the resulting code units | |
577 | /// would always return the original WTF-8 string. | |
578 | #[inline] | |
532ac7d7 | 579 | pub fn encode_wide(&self) -> EncodeWide<'_> { |
85aaf69f SL |
580 | EncodeWide { code_points: self.code_points(), extra: 0 } |
581 | } | |
582 | ||
583 | #[inline] | |
c34b1796 | 584 | fn next_surrogate(&self, mut pos: usize) -> Option<(usize, u16)> { |
85aaf69f SL |
585 | let mut iter = self.bytes[pos..].iter(); |
586 | loop { | |
ff7c6d11 | 587 | let b = *iter.next()?; |
85aaf69f SL |
588 | if b < 0x80 { |
589 | pos += 1; | |
590 | } else if b < 0xE0 { | |
591 | iter.next(); | |
592 | pos += 2; | |
593 | } else if b == 0xED { | |
594 | match (iter.next(), iter.next()) { | |
595 | (Some(&b2), Some(&b3)) if b2 >= 0xA0 => { | |
60c5eb7d | 596 | return Some((pos, decode_surrogate(b2, b3))); |
85aaf69f | 597 | } |
60c5eb7d | 598 | _ => pos += 3, |
85aaf69f SL |
599 | } |
600 | } else if b < 0xF0 { | |
601 | iter.next(); | |
602 | iter.next(); | |
603 | pos += 3; | |
604 | } else { | |
605 | iter.next(); | |
606 | iter.next(); | |
607 | iter.next(); | |
608 | pos += 4; | |
609 | } | |
610 | } | |
611 | } | |
612 | ||
613 | #[inline] | |
614 | fn final_lead_surrogate(&self) -> Option<u16> { | |
ba9703b0 XL |
615 | match self.bytes { |
616 | [.., 0xED, b2 @ 0xA0..=0xAF, b3] => Some(decode_surrogate(b2, b3)), | |
60c5eb7d | 617 | _ => None, |
85aaf69f SL |
618 | } |
619 | } | |
620 | ||
621 | #[inline] | |
622 | fn initial_trail_surrogate(&self) -> Option<u16> { | |
ba9703b0 XL |
623 | match self.bytes { |
624 | [0xED, b2 @ 0xB0..=0xBF, b3, ..] => Some(decode_surrogate(b2, b3)), | |
60c5eb7d | 625 | _ => None, |
85aaf69f SL |
626 | } |
627 | } | |
8bb4bdeb | 628 | |
ba9703b0 XL |
629 | pub fn clone_into(&self, buf: &mut Wtf8Buf) { |
630 | self.bytes.clone_into(&mut buf.bytes) | |
631 | } | |
632 | ||
8bb4bdeb XL |
633 | /// Boxes this `Wtf8`. |
634 | #[inline] | |
635 | pub fn into_box(&self) -> Box<Wtf8> { | |
636 | let boxed: Box<[u8]> = self.bytes.into(); | |
637 | unsafe { mem::transmute(boxed) } | |
638 | } | |
639 | ||
640 | /// Creates a boxed, empty `Wtf8`. | |
641 | pub fn empty_box() -> Box<Wtf8> { | |
642 | let boxed: Box<[u8]> = Default::default(); | |
643 | unsafe { mem::transmute(boxed) } | |
644 | } | |
ff7c6d11 XL |
645 | |
646 | #[inline] | |
647 | pub fn into_arc(&self) -> Arc<Wtf8> { | |
648 | let arc: Arc<[u8]> = Arc::from(&self.bytes); | |
649 | unsafe { Arc::from_raw(Arc::into_raw(arc) as *const Wtf8) } | |
650 | } | |
651 | ||
652 | #[inline] | |
653 | pub fn into_rc(&self) -> Rc<Wtf8> { | |
654 | let rc: Rc<[u8]> = Rc::from(&self.bytes); | |
655 | unsafe { Rc::from_raw(Rc::into_raw(rc) as *const Wtf8) } | |
656 | } | |
ba9703b0 XL |
657 | |
658 | #[inline] | |
659 | pub fn make_ascii_lowercase(&mut self) { | |
660 | self.bytes.make_ascii_lowercase() | |
661 | } | |
662 | ||
663 | #[inline] | |
664 | pub fn make_ascii_uppercase(&mut self) { | |
665 | self.bytes.make_ascii_uppercase() | |
666 | } | |
667 | ||
668 | #[inline] | |
669 | pub fn to_ascii_lowercase(&self) -> Wtf8Buf { | |
670 | Wtf8Buf { bytes: self.bytes.to_ascii_lowercase() } | |
671 | } | |
672 | ||
673 | #[inline] | |
674 | pub fn to_ascii_uppercase(&self) -> Wtf8Buf { | |
675 | Wtf8Buf { bytes: self.bytes.to_ascii_uppercase() } | |
676 | } | |
677 | ||
678 | #[inline] | |
679 | pub fn is_ascii(&self) -> bool { | |
680 | self.bytes.is_ascii() | |
681 | } | |
682 | ||
683 | #[inline] | |
684 | pub fn eq_ignore_ascii_case(&self, other: &Self) -> bool { | |
685 | self.bytes.eq_ignore_ascii_case(&other.bytes) | |
686 | } | |
85aaf69f SL |
687 | } |
688 | ||
9fa01778 | 689 | /// Returns a slice of the given string for the byte range [`begin`..`end`). |
85aaf69f SL |
690 | /// |
691 | /// # Panics | |
692 | /// | |
693 | /// Panics when `begin` and `end` do not point to code point boundaries, | |
694 | /// or point beyond the end of the string. | |
695 | impl ops::Index<ops::Range<usize>> for Wtf8 { | |
696 | type Output = Wtf8; | |
697 | ||
698 | #[inline] | |
c34b1796 | 699 | fn index(&self, range: ops::Range<usize>) -> &Wtf8 { |
85aaf69f | 700 | // is_code_point_boundary checks that the index is in [0, .len()] |
60c5eb7d XL |
701 | if range.start <= range.end |
702 | && is_code_point_boundary(self, range.start) | |
703 | && is_code_point_boundary(self, range.end) | |
704 | { | |
85aaf69f SL |
705 | unsafe { slice_unchecked(self, range.start, range.end) } |
706 | } else { | |
707 | slice_error_fail(self, range.start, range.end) | |
708 | } | |
709 | } | |
710 | } | |
711 | ||
9fa01778 | 712 | /// Returns a slice of the given string from byte `begin` to its end. |
85aaf69f SL |
713 | /// |
714 | /// # Panics | |
715 | /// | |
716 | /// Panics when `begin` is not at a code point boundary, | |
717 | /// or is beyond the end of the string. | |
718 | impl ops::Index<ops::RangeFrom<usize>> for Wtf8 { | |
719 | type Output = Wtf8; | |
720 | ||
721 | #[inline] | |
c34b1796 | 722 | fn index(&self, range: ops::RangeFrom<usize>) -> &Wtf8 { |
85aaf69f SL |
723 | // is_code_point_boundary checks that the index is in [0, .len()] |
724 | if is_code_point_boundary(self, range.start) { | |
725 | unsafe { slice_unchecked(self, range.start, self.len()) } | |
726 | } else { | |
727 | slice_error_fail(self, range.start, self.len()) | |
728 | } | |
729 | } | |
730 | } | |
731 | ||
9fa01778 | 732 | /// Returns a slice of the given string from its beginning to byte `end`. |
85aaf69f SL |
733 | /// |
734 | /// # Panics | |
735 | /// | |
736 | /// Panics when `end` is not at a code point boundary, | |
737 | /// or is beyond the end of the string. | |
738 | impl ops::Index<ops::RangeTo<usize>> for Wtf8 { | |
739 | type Output = Wtf8; | |
740 | ||
741 | #[inline] | |
c34b1796 | 742 | fn index(&self, range: ops::RangeTo<usize>) -> &Wtf8 { |
85aaf69f SL |
743 | // is_code_point_boundary checks that the index is in [0, .len()] |
744 | if is_code_point_boundary(self, range.end) { | |
745 | unsafe { slice_unchecked(self, 0, range.end) } | |
746 | } else { | |
747 | slice_error_fail(self, 0, range.end) | |
748 | } | |
749 | } | |
750 | } | |
751 | ||
752 | impl ops::Index<ops::RangeFull> for Wtf8 { | |
753 | type Output = Wtf8; | |
754 | ||
755 | #[inline] | |
c34b1796 | 756 | fn index(&self, _range: ops::RangeFull) -> &Wtf8 { |
85aaf69f SL |
757 | self |
758 | } | |
759 | } | |
760 | ||
/// Rebuilds a 16-bit surrogate from the second and third bytes of its
/// three-byte encoding, taking the low six bits of each.
#[inline]
fn decode_surrogate(second_byte: u8, third_byte: u8) -> u16 {
    // The first byte is assumed to be 0xED
    let upper = (u16::from(second_byte) & 0x3F) << 6;
    let lower = u16::from(third_byte) & 0x3F;
    0xD800 | upper | lower
}
766 | ||
/// Combines a lead and a trail surrogate into the supplementary code point
/// they encode: `0x10000 + ((lead - 0xD800) << 10 | (trail - 0xDC00))`.
#[inline]
fn decode_surrogate_pair(lead: u16, trail: u16) -> char {
    let upper = ((lead - 0xD800) as u32) << 10;
    let lower = (trail - 0xDC00) as u32;
    let code_point = 0x10000 + (upper | lower);
    // Callers pass a real lead/trail pair, which always yields a scalar value.
    unsafe { char::from_u32_unchecked(code_point) }
}
772 | ||
773 | /// Copied from core::str::StrPrelude::is_char_boundary | |
774 | #[inline] | |
c34b1796 | 775 | pub fn is_code_point_boundary(slice: &Wtf8, index: usize) -> bool { |
60c5eb7d XL |
776 | if index == slice.len() { |
777 | return true; | |
778 | } | |
85aaf69f SL |
779 | match slice.bytes.get(index) { |
780 | None => false, | |
c34b1796 | 781 | Some(&b) => b < 128 || b >= 192, |
85aaf69f SL |
782 | } |
783 | } | |
784 | ||
785 | /// Copied from core::str::raw::slice_unchecked | |
786 | #[inline] | |
c34b1796 AL |
787 | pub unsafe fn slice_unchecked(s: &Wtf8, begin: usize, end: usize) -> &Wtf8 { |
788 | // memory layout of an &[u8] and &Wtf8 are the same | |
60c5eb7d | 789 | Wtf8::from_bytes_unchecked(slice::from_raw_parts(s.bytes.as_ptr().add(begin), end - begin)) |
85aaf69f SL |
790 | } |
791 | ||
792 | /// Copied from core::str::raw::slice_error_fail | |
793 | #[inline(never)] | |
c34b1796 | 794 | pub fn slice_error_fail(s: &Wtf8, begin: usize, end: usize) -> ! { |
85aaf69f | 795 | assert!(begin <= end); |
60c5eb7d | 796 | panic!("index {} and/or {} in `{:?}` do not lie on character boundary", begin, end, s); |
85aaf69f SL |
797 | } |
798 | ||
/// Iterator over the code points of a WTF-8 string.
///
/// Obtained by calling `.code_points()` on a WTF-8 slice.
#[derive(Clone)]
pub struct Wtf8CodePoints<'a> {
    /// Bytes of the string that have not been decoded yet.
    bytes: slice::Iter<'a, u8>,
}
806 | ||
807 | impl<'a> Iterator for Wtf8CodePoints<'a> { | |
808 | type Item = CodePoint; | |
809 | ||
810 | #[inline] | |
811 | fn next(&mut self) -> Option<CodePoint> { | |
812 | next_code_point(&mut self.bytes).map(|c| CodePoint { value: c }) | |
813 | } | |
814 | ||
815 | #[inline] | |
c34b1796 | 816 | fn size_hint(&self) -> (usize, Option<usize>) { |
3157f602 | 817 | let len = self.bytes.len(); |
85aaf69f SL |
818 | (len.saturating_add(3) / 4, Some(len)) |
819 | } | |
820 | } | |
821 | ||
7cac9316 | 822 | /// Generates a wide character sequence for potentially ill-formed UTF-16. |
92a42be0 | 823 | #[stable(feature = "rust1", since = "1.0.0")] |
85aaf69f SL |
824 | #[derive(Clone)] |
825 | pub struct EncodeWide<'a> { | |
826 | code_points: Wtf8CodePoints<'a>, | |
60c5eb7d | 827 | extra: u16, |
85aaf69f SL |
828 | } |
829 | ||
830 | // Copied from libunicode/u_str.rs | |
92a42be0 | 831 | #[stable(feature = "rust1", since = "1.0.0")] |
85aaf69f SL |
832 | impl<'a> Iterator for EncodeWide<'a> { |
833 | type Item = u16; | |
834 | ||
835 | #[inline] | |
836 | fn next(&mut self) -> Option<u16> { | |
837 | if self.extra != 0 { | |
838 | let tmp = self.extra; | |
839 | self.extra = 0; | |
840 | return Some(tmp); | |
841 | } | |
842 | ||
c30ab7b3 | 843 | let mut buf = [0; 2]; |
85aaf69f | 844 | self.code_points.next().map(|code_point| { |
f9f354fc | 845 | let n = char::encode_utf16_raw(code_point.value, &mut buf).len(); |
c30ab7b3 SL |
846 | if n == 2 { |
847 | self.extra = buf[1]; | |
54a0048b | 848 | } |
c30ab7b3 | 849 | buf[0] |
85aaf69f SL |
850 | }) |
851 | } | |
852 | ||
853 | #[inline] | |
c34b1796 | 854 | fn size_hint(&self) -> (usize, Option<usize>) { |
85aaf69f SL |
855 | let (low, high) = self.code_points.size_hint(); |
856 | // every code point gets either one u16 or two u16, | |
857 | // so this iterator is between 1 or 2 times as | |
858 | // long as the underlying iterator. | |
859 | (low, high.and_then(|n| n.checked_mul(2))) | |
860 | } | |
861 | } | |
862 | ||
85aaf69f SL |
863 | impl Hash for CodePoint { |
864 | #[inline] | |
865 | fn hash<H: Hasher>(&self, state: &mut H) { | |
866 | self.value.hash(state) | |
867 | } | |
868 | } | |
869 | ||
85aaf69f SL |
870 | impl Hash for Wtf8Buf { |
871 | #[inline] | |
872 | fn hash<H: Hasher>(&self, state: &mut H) { | |
873 | state.write(&self.bytes); | |
874 | 0xfeu8.hash(state) | |
875 | } | |
876 | } | |
877 | ||
85aaf69f SL |
878 | impl Hash for Wtf8 { |
879 | #[inline] | |
880 | fn hash<H: Hasher>(&self, state: &mut H) { | |
881 | state.write(&self.bytes); | |
882 | 0xfeu8.hash(state) | |
883 | } | |
884 | } |