]>
Commit | Line | Data |
---|---|---|
85aaf69f SL |
1 | // Copyright 2015 The Rust Project Developers. See the COPYRIGHT |
2 | // file at the top-level directory of this distribution and at | |
3 | // http://rust-lang.org/COPYRIGHT. | |
4 | // | |
5 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or | |
6 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license | |
7 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your | |
8 | // option. This file may not be copied, modified, or distributed | |
9 | // except according to those terms. | |
10 | ||
11 | //! Implementation of [the WTF-8 encoding](https://simonsapin.github.io/wtf-8/). | |
12 | //! | |
13 | //! This library uses Rust’s type system to maintain | |
14 | //! [well-formedness](https://simonsapin.github.io/wtf-8/#well-formed), | |
15 | //! like the `String` and `&str` types do for UTF-8. | |
16 | //! | |
17 | //! Since [WTF-8 must not be used | |
18 | //! for interchange](https://simonsapin.github.io/wtf-8/#intended-audience), | |
19 | //! this library deliberately does not provide access to the underlying bytes | |
20 | //! of WTF-8 strings, | |
21 | //! nor can it decode WTF-8 from arbitrary bytes. | |
22 | //! WTF-8 strings can be obtained from UTF-8, UTF-16, or code points. | |
23 | ||
c34b1796 AL |
24 | // this module is imported from @SimonSapin's repo and has tons of dead code on |
25 | // unix (it's mostly used on windows), so don't worry about dead code here. | |
26 | #![allow(dead_code)] | |
27 | ||
62682a34 | 28 | use core::str::next_code_point; |
85aaf69f SL |
29 | |
30 | use ascii::*; | |
31 | use borrow::Cow; | |
c1a9b12d | 32 | use char; |
85aaf69f SL |
33 | use fmt; |
34 | use hash::{Hash, Hasher}; | |
9346a6ac | 35 | use iter::FromIterator; |
85aaf69f | 36 | use mem; |
85aaf69f SL |
37 | use ops; |
38 | use slice; | |
39 | use str; | |
c34b1796 | 40 | use string::String; |
85aaf69f | 41 | use sys_common::AsInner; |
85aaf69f SL |
42 | use vec::Vec; |
43 | ||
/// UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER.
///
/// The lossy conversions to UTF-8 write these three bytes in place of
/// each encoded surrogate.
const UTF8_REPLACEMENT_CHARACTER: &'static [u8] = b"\xEF\xBF\xBD";
85aaf69f SL |
45 | |
/// A Unicode code point: any value from U+0000 to U+10FFFF.
///
/// This differs from the `char` type,
/// which only represents Unicode scalar values:
/// code points that are not surrogates (U+D800 to U+DFFF).
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub struct CodePoint {
    /// Numeric value of the code point.
    value: u32,
}
55 | ||
56 | /// Format the code point as `U+` followed by four to six hexadecimal digits. | |
57 | /// Example: `U+1F4A9` | |
58 | impl fmt::Debug for CodePoint { | |
59 | #[inline] | |
60 | fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> { | |
61 | write!(formatter, "U+{:04X}", self.value) | |
62 | } | |
63 | } | |
64 | ||
65 | impl CodePoint { | |
9346a6ac | 66 | /// Unsafely creates a new `CodePoint` without checking the value. |
85aaf69f SL |
67 | /// |
68 | /// Only use when `value` is known to be less than or equal to 0x10FFFF. | |
69 | #[inline] | |
70 | pub unsafe fn from_u32_unchecked(value: u32) -> CodePoint { | |
71 | CodePoint { value: value } | |
72 | } | |
73 | ||
9346a6ac | 74 | /// Creates a new `CodePoint` if the value is a valid code point. |
85aaf69f | 75 | /// |
9346a6ac | 76 | /// Returns `None` if `value` is above 0x10FFFF. |
85aaf69f SL |
77 | #[inline] |
78 | pub fn from_u32(value: u32) -> Option<CodePoint> { | |
79 | match value { | |
80 | 0 ... 0x10FFFF => Some(CodePoint { value: value }), | |
81 | _ => None | |
82 | } | |
83 | } | |
84 | ||
9346a6ac | 85 | /// Creates a new `CodePoint` from a `char`. |
85aaf69f SL |
86 | /// |
87 | /// Since all Unicode scalar values are code points, this always succeeds. | |
88 | #[inline] | |
89 | pub fn from_char(value: char) -> CodePoint { | |
90 | CodePoint { value: value as u32 } | |
91 | } | |
92 | ||
9346a6ac | 93 | /// Returns the numeric value of the code point. |
85aaf69f SL |
94 | #[inline] |
95 | pub fn to_u32(&self) -> u32 { | |
96 | self.value | |
97 | } | |
98 | ||
9346a6ac | 99 | /// Optionally returns a Unicode scalar value for the code point. |
85aaf69f | 100 | /// |
9346a6ac | 101 | /// Returns `None` if the code point is a surrogate (from U+D800 to U+DFFF). |
85aaf69f SL |
102 | #[inline] |
103 | pub fn to_char(&self) -> Option<char> { | |
104 | match self.value { | |
105 | 0xD800 ... 0xDFFF => None, | |
c1a9b12d | 106 | _ => Some(unsafe { char::from_u32_unchecked(self.value) }) |
85aaf69f SL |
107 | } |
108 | } | |
109 | ||
9346a6ac | 110 | /// Returns a Unicode scalar value for the code point. |
85aaf69f | 111 | /// |
9346a6ac | 112 | /// Returns `'\u{FFFD}'` (the replacement character “�”) |
85aaf69f SL |
113 | /// if the code point is a surrogate (from U+D800 to U+DFFF). |
114 | #[inline] | |
115 | pub fn to_char_lossy(&self) -> char { | |
116 | self.to_char().unwrap_or('\u{FFFD}') | |
117 | } | |
118 | } | |
119 | ||
/// An owned, growable buffer of well-formed WTF-8 data.
///
/// Comparable to `String`, except that it may also hold surrogate
/// code points as long as they do not form a surrogate pair.
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct Wtf8Buf {
    /// The WTF-8 encoded contents.
    bytes: Vec<u8>,
}
128 | ||
129 | impl ops::Deref for Wtf8Buf { | |
130 | type Target = Wtf8; | |
131 | ||
132 | fn deref(&self) -> &Wtf8 { | |
133 | self.as_slice() | |
134 | } | |
135 | } | |
136 | ||
137 | /// Format the string with double quotes, | |
138 | /// and surrogates as `\u` followed by four hexadecimal digits. | |
139 | /// Example: `"a\u{D800}"` for a string with code points [U+0061, U+D800] | |
140 | impl fmt::Debug for Wtf8Buf { | |
141 | #[inline] | |
142 | fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> { | |
143 | fmt::Debug::fmt(&**self, formatter) | |
144 | } | |
145 | } | |
146 | ||
147 | impl Wtf8Buf { | |
b039eaaf | 148 | /// Creates a new, empty WTF-8 string. |
85aaf69f SL |
149 | #[inline] |
150 | pub fn new() -> Wtf8Buf { | |
151 | Wtf8Buf { bytes: Vec::new() } | |
152 | } | |
153 | ||
b039eaaf | 154 | /// Creates a new, empty WTF-8 string with pre-allocated capacity for `n` bytes. |
85aaf69f | 155 | #[inline] |
c34b1796 | 156 | pub fn with_capacity(n: usize) -> Wtf8Buf { |
85aaf69f SL |
157 | Wtf8Buf { bytes: Vec::with_capacity(n) } |
158 | } | |
159 | ||
d9579d0f | 160 | /// Creates a WTF-8 string from a UTF-8 `String`. |
85aaf69f SL |
161 | /// |
162 | /// This takes ownership of the `String` and does not copy. | |
163 | /// | |
164 | /// Since WTF-8 is a superset of UTF-8, this always succeeds. | |
165 | #[inline] | |
166 | pub fn from_string(string: String) -> Wtf8Buf { | |
167 | Wtf8Buf { bytes: string.into_bytes() } | |
168 | } | |
169 | ||
d9579d0f | 170 | /// Creates a WTF-8 string from a UTF-8 `&str` slice. |
85aaf69f SL |
171 | /// |
172 | /// This copies the content of the slice. | |
173 | /// | |
174 | /// Since WTF-8 is a superset of UTF-8, this always succeeds. | |
175 | #[inline] | |
176 | pub fn from_str(str: &str) -> Wtf8Buf { | |
c34b1796 | 177 | Wtf8Buf { bytes: <[_]>::to_vec(str.as_bytes()) } |
85aaf69f SL |
178 | } |
179 | ||
7453a54e SL |
180 | pub fn clear(&mut self) { |
181 | self.bytes.clear() | |
182 | } | |
183 | ||
9346a6ac | 184 | /// Creates a WTF-8 string from a potentially ill-formed UTF-16 slice of 16-bit code units. |
85aaf69f SL |
185 | /// |
186 | /// This is lossless: calling `.encode_wide()` on the resulting string | |
187 | /// will always return the original code units. | |
188 | pub fn from_wide(v: &[u16]) -> Wtf8Buf { | |
189 | let mut string = Wtf8Buf::with_capacity(v.len()); | |
e9174d1e | 190 | for item in char::decode_utf16(v.iter().cloned()) { |
85aaf69f | 191 | match item { |
e9174d1e SL |
192 | Ok(ch) => string.push_char(ch), |
193 | Err(surrogate) => { | |
54a0048b | 194 | let surrogate = surrogate.unpaired_surrogate(); |
85aaf69f | 195 | // Surrogates are known to be in the code point range. |
54a0048b SL |
196 | let code_point = unsafe { |
197 | CodePoint::from_u32_unchecked(surrogate as u32) | |
198 | }; | |
85aaf69f | 199 | // Skip the WTF-8 concatenation check, |
e9174d1e | 200 | // surrogate pairs are already decoded by decode_utf16 |
85aaf69f SL |
201 | string.push_code_point_unchecked(code_point) |
202 | } | |
203 | } | |
204 | } | |
205 | string | |
206 | } | |
207 | ||
208 | /// Copied from String::push | |
209 | /// This does **not** include the WTF-8 concatenation check. | |
210 | fn push_code_point_unchecked(&mut self, code_point: CodePoint) { | |
54a0048b SL |
211 | let bytes = unsafe { |
212 | char::from_u32_unchecked(code_point.value).encode_utf8() | |
213 | }; | |
214 | self.bytes.extend_from_slice(bytes.as_slice()); | |
85aaf69f SL |
215 | } |
216 | ||
217 | #[inline] | |
218 | pub fn as_slice(&self) -> &Wtf8 { | |
c1a9b12d | 219 | unsafe { Wtf8::from_bytes_unchecked(&self.bytes) } |
85aaf69f SL |
220 | } |
221 | ||
222 | /// Reserves capacity for at least `additional` more bytes to be inserted | |
223 | /// in the given `Wtf8Buf`. | |
224 | /// The collection may reserve more space to avoid frequent reallocations. | |
225 | /// | |
226 | /// # Panics | |
227 | /// | |
c34b1796 | 228 | /// Panics if the new capacity overflows `usize`. |
85aaf69f | 229 | #[inline] |
c34b1796 | 230 | pub fn reserve(&mut self, additional: usize) { |
85aaf69f SL |
231 | self.bytes.reserve(additional) |
232 | } | |
233 | ||
7453a54e SL |
234 | #[inline] |
235 | pub fn reserve_exact(&mut self, additional: usize) { | |
236 | self.bytes.reserve_exact(additional) | |
237 | } | |
238 | ||
85aaf69f SL |
239 | /// Returns the number of bytes that this string buffer can hold without reallocating. |
240 | #[inline] | |
c34b1796 | 241 | pub fn capacity(&self) -> usize { |
85aaf69f SL |
242 | self.bytes.capacity() |
243 | } | |
244 | ||
d9579d0f | 245 | /// Append a UTF-8 slice at the end of the string. |
85aaf69f SL |
246 | #[inline] |
247 | pub fn push_str(&mut self, other: &str) { | |
92a42be0 | 248 | self.bytes.extend_from_slice(other.as_bytes()) |
85aaf69f SL |
249 | } |
250 | ||
251 | /// Append a WTF-8 slice at the end of the string. | |
252 | /// | |
253 | /// This replaces newly paired surrogates at the boundary | |
254 | /// with a supplementary code point, | |
255 | /// like concatenating ill-formed UTF-16 strings effectively would. | |
256 | #[inline] | |
257 | pub fn push_wtf8(&mut self, other: &Wtf8) { | |
258 | match ((&*self).final_lead_surrogate(), other.initial_trail_surrogate()) { | |
259 | // Replace newly paired surrogates by a supplementary code point. | |
260 | (Some(lead), Some(trail)) => { | |
261 | let len_without_lead_surrogate = self.len() - 3; | |
262 | self.bytes.truncate(len_without_lead_surrogate); | |
263 | let other_without_trail_surrogate = &other.bytes[3..]; | |
264 | // 4 bytes for the supplementary code point | |
265 | self.bytes.reserve(4 + other_without_trail_surrogate.len()); | |
266 | self.push_char(decode_surrogate_pair(lead, trail)); | |
92a42be0 | 267 | self.bytes.extend_from_slice(other_without_trail_surrogate); |
85aaf69f | 268 | } |
92a42be0 | 269 | _ => self.bytes.extend_from_slice(&other.bytes) |
85aaf69f SL |
270 | } |
271 | } | |
272 | ||
273 | /// Append a Unicode scalar value at the end of the string. | |
274 | #[inline] | |
275 | pub fn push_char(&mut self, c: char) { | |
276 | self.push_code_point_unchecked(CodePoint::from_char(c)) | |
277 | } | |
278 | ||
279 | /// Append a code point at the end of the string. | |
280 | /// | |
281 | /// This replaces newly paired surrogates at the boundary | |
282 | /// with a supplementary code point, | |
283 | /// like concatenating ill-formed UTF-16 strings effectively would. | |
284 | #[inline] | |
285 | pub fn push(&mut self, code_point: CodePoint) { | |
e9174d1e SL |
286 | if let trail @ 0xDC00...0xDFFF = code_point.to_u32() { |
287 | if let Some(lead) = (&*self).final_lead_surrogate() { | |
288 | let len_without_lead_surrogate = self.len() - 3; | |
289 | self.bytes.truncate(len_without_lead_surrogate); | |
290 | self.push_char(decode_surrogate_pair(lead, trail as u16)); | |
291 | return | |
85aaf69f | 292 | } |
85aaf69f SL |
293 | } |
294 | ||
295 | // No newly paired surrogates at the boundary. | |
296 | self.push_code_point_unchecked(code_point) | |
297 | } | |
298 | ||
299 | /// Shortens a string to the specified length. | |
300 | /// | |
301 | /// # Panics | |
302 | /// | |
303 | /// Panics if `new_len` > current length, | |
304 | /// or if `new_len` is not a code point boundary. | |
305 | #[inline] | |
c34b1796 | 306 | pub fn truncate(&mut self, new_len: usize) { |
85aaf69f SL |
307 | assert!(is_code_point_boundary(self, new_len)); |
308 | self.bytes.truncate(new_len) | |
309 | } | |
310 | ||
9346a6ac | 311 | /// Consumes the WTF-8 string and tries to convert it to UTF-8. |
85aaf69f SL |
312 | /// |
313 | /// This does not copy the data. | |
314 | /// | |
315 | /// If the contents are not well-formed UTF-8 | |
316 | /// (that is, if the string contains surrogates), | |
317 | /// the original WTF-8 string is returned instead. | |
318 | pub fn into_string(self) -> Result<String, Wtf8Buf> { | |
319 | match self.next_surrogate(0) { | |
320 | None => Ok(unsafe { String::from_utf8_unchecked(self.bytes) }), | |
321 | Some(_) => Err(self), | |
322 | } | |
323 | } | |
324 | ||
9346a6ac | 325 | /// Consumes the WTF-8 string and converts it lossily to UTF-8. |
85aaf69f SL |
326 | /// |
327 | /// This does not copy the data (but may overwrite parts of it in place). | |
328 | /// | |
329 | /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”) | |
330 | pub fn into_string_lossy(mut self) -> String { | |
331 | let mut pos = 0; | |
332 | loop { | |
333 | match self.next_surrogate(pos) { | |
334 | Some((surrogate_pos, _)) => { | |
335 | pos = surrogate_pos + 3; | |
92a42be0 | 336 | self.bytes[surrogate_pos..pos] |
7453a54e | 337 | .copy_from_slice(UTF8_REPLACEMENT_CHARACTER); |
85aaf69f SL |
338 | }, |
339 | None => return unsafe { String::from_utf8_unchecked(self.bytes) } | |
340 | } | |
341 | } | |
342 | } | |
343 | } | |
344 | ||
345 | /// Create a new WTF-8 string from an iterator of code points. | |
346 | /// | |
347 | /// This replaces surrogate code point pairs with supplementary code points, | |
348 | /// like concatenating ill-formed UTF-16 strings effectively would. | |
349 | impl FromIterator<CodePoint> for Wtf8Buf { | |
350 | fn from_iter<T: IntoIterator<Item=CodePoint>>(iter: T) -> Wtf8Buf { | |
351 | let mut string = Wtf8Buf::new(); | |
352 | string.extend(iter); | |
353 | string | |
354 | } | |
355 | } | |
356 | ||
357 | /// Append code points from an iterator to the string. | |
358 | /// | |
359 | /// This replaces surrogate code point pairs with supplementary code points, | |
360 | /// like concatenating ill-formed UTF-16 strings effectively would. | |
361 | impl Extend<CodePoint> for Wtf8Buf { | |
54a0048b SL |
362 | fn extend<T: IntoIterator<Item=CodePoint>>(&mut self, iter: T) { |
363 | let iterator = iter.into_iter(); | |
85aaf69f SL |
364 | let (low, _high) = iterator.size_hint(); |
365 | // Lower bound of one byte per code point (ASCII only) | |
366 | self.bytes.reserve(low); | |
367 | for code_point in iterator { | |
368 | self.push(code_point); | |
369 | } | |
370 | } | |
371 | } | |
372 | ||
/// A borrowed slice of well-formed WTF-8 data.
///
/// Comparable to `&str`, except that it may also hold surrogate
/// code points as long as they do not form a surrogate pair.
#[derive(PartialEq, Eq, PartialOrd, Ord)]
pub struct Wtf8 {
    /// The WTF-8 encoded contents.
    bytes: [u8],
}
381 | ||
382 | impl AsInner<[u8]> for Wtf8 { | |
383 | fn as_inner(&self) -> &[u8] { &self.bytes } | |
384 | } | |
385 | ||
85aaf69f SL |
386 | /// Format the slice with double quotes, |
387 | /// and surrogates as `\u` followed by four hexadecimal digits. | |
388 | /// Example: `"a\u{D800}"` for a slice with code points [U+0061, U+D800] | |
389 | impl fmt::Debug for Wtf8 { | |
c1a9b12d SL |
390 | fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result { |
391 | fn write_str_escaped(f: &mut fmt::Formatter, s: &str) -> fmt::Result { | |
392 | use fmt::Write; | |
393 | for c in s.chars().flat_map(|c| c.escape_default()) { | |
54a0048b | 394 | f.write_char(c)? |
c1a9b12d SL |
395 | } |
396 | Ok(()) | |
397 | } | |
398 | ||
54a0048b | 399 | formatter.write_str("\"")?; |
85aaf69f SL |
400 | let mut pos = 0; |
401 | loop { | |
402 | match self.next_surrogate(pos) { | |
403 | None => break, | |
404 | Some((surrogate_pos, surrogate)) => { | |
54a0048b | 405 | write_str_escaped( |
c1a9b12d SL |
406 | formatter, |
407 | unsafe { str::from_utf8_unchecked( | |
408 | &self.bytes[pos .. surrogate_pos] | |
409 | )}, | |
54a0048b SL |
410 | )?; |
411 | write!(formatter, "\\u{{{:X}}}", surrogate)?; | |
85aaf69f SL |
412 | pos = surrogate_pos + 3; |
413 | } | |
414 | } | |
415 | } | |
54a0048b | 416 | write_str_escaped( |
c1a9b12d SL |
417 | formatter, |
418 | unsafe { str::from_utf8_unchecked(&self.bytes[pos..]) }, | |
54a0048b | 419 | )?; |
85aaf69f SL |
420 | formatter.write_str("\"") |
421 | } | |
422 | } | |
423 | ||
424 | impl Wtf8 { | |
9346a6ac | 425 | /// Creates a WTF-8 slice from a UTF-8 `&str` slice. |
85aaf69f SL |
426 | /// |
427 | /// Since WTF-8 is a superset of UTF-8, this always succeeds. | |
428 | #[inline] | |
429 | pub fn from_str(value: &str) -> &Wtf8 { | |
c1a9b12d SL |
430 | unsafe { Wtf8::from_bytes_unchecked(value.as_bytes()) } |
431 | } | |
432 | ||
433 | /// Creates a WTF-8 slice from a WTF-8 byte slice. | |
434 | /// | |
435 | /// Since the byte slice is not checked for valid WTF-8, this functions is | |
436 | /// marked unsafe. | |
437 | #[inline] | |
438 | unsafe fn from_bytes_unchecked(value: &[u8]) -> &Wtf8 { | |
439 | mem::transmute(value) | |
85aaf69f SL |
440 | } |
441 | ||
9346a6ac | 442 | /// Returns the length, in WTF-8 bytes. |
85aaf69f | 443 | #[inline] |
c34b1796 | 444 | pub fn len(&self) -> usize { |
85aaf69f SL |
445 | self.bytes.len() |
446 | } | |
447 | ||
7453a54e SL |
448 | #[inline] |
449 | pub fn is_empty(&self) -> bool { | |
450 | self.bytes.is_empty() | |
451 | } | |
452 | ||
9346a6ac | 453 | /// Returns the code point at `position` if it is in the ASCII range, |
85aaf69f SL |
454 | /// or `b'\xFF' otherwise. |
455 | /// | |
456 | /// # Panics | |
457 | /// | |
458 | /// Panics if `position` is beyond the end of the string. | |
459 | #[inline] | |
c34b1796 | 460 | pub fn ascii_byte_at(&self, position: usize) -> u8 { |
85aaf69f SL |
461 | match self.bytes[position] { |
462 | ascii_byte @ 0x00 ... 0x7F => ascii_byte, | |
463 | _ => 0xFF | |
464 | } | |
465 | } | |
466 | ||
9346a6ac | 467 | /// Returns an iterator for the string’s code points. |
85aaf69f SL |
468 | #[inline] |
469 | pub fn code_points(&self) -> Wtf8CodePoints { | |
470 | Wtf8CodePoints { bytes: self.bytes.iter() } | |
471 | } | |
472 | ||
9346a6ac | 473 | /// Tries to convert the string to UTF-8 and return a `&str` slice. |
85aaf69f | 474 | /// |
9346a6ac | 475 | /// Returns `None` if the string contains surrogates. |
85aaf69f SL |
476 | /// |
477 | /// This does not copy the data. | |
478 | #[inline] | |
479 | pub fn as_str(&self) -> Option<&str> { | |
480 | // Well-formed WTF-8 is also well-formed UTF-8 | |
481 | // if and only if it contains no surrogate. | |
482 | match self.next_surrogate(0) { | |
483 | None => Some(unsafe { str::from_utf8_unchecked(&self.bytes) }), | |
484 | Some(_) => None, | |
485 | } | |
486 | } | |
487 | ||
9346a6ac | 488 | /// Lossily converts the string to UTF-8. |
d9579d0f | 489 | /// Returns a UTF-8 `&str` slice if the contents are well-formed in UTF-8. |
85aaf69f SL |
490 | /// |
491 | /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”). | |
492 | /// | |
493 | /// This only copies the data if necessary (if it contains any surrogate). | |
c34b1796 | 494 | pub fn to_string_lossy(&self) -> Cow<str> { |
85aaf69f SL |
495 | let surrogate_pos = match self.next_surrogate(0) { |
496 | None => return Cow::Borrowed(unsafe { str::from_utf8_unchecked(&self.bytes) }), | |
497 | Some((pos, _)) => pos, | |
498 | }; | |
499 | let wtf8_bytes = &self.bytes; | |
500 | let mut utf8_bytes = Vec::with_capacity(self.len()); | |
92a42be0 SL |
501 | utf8_bytes.extend_from_slice(&wtf8_bytes[..surrogate_pos]); |
502 | utf8_bytes.extend_from_slice(UTF8_REPLACEMENT_CHARACTER); | |
85aaf69f SL |
503 | let mut pos = surrogate_pos + 3; |
504 | loop { | |
505 | match self.next_surrogate(pos) { | |
506 | Some((surrogate_pos, _)) => { | |
92a42be0 SL |
507 | utf8_bytes.extend_from_slice(&wtf8_bytes[pos .. surrogate_pos]); |
508 | utf8_bytes.extend_from_slice(UTF8_REPLACEMENT_CHARACTER); | |
85aaf69f SL |
509 | pos = surrogate_pos + 3; |
510 | }, | |
511 | None => { | |
92a42be0 | 512 | utf8_bytes.extend_from_slice(&wtf8_bytes[pos..]); |
85aaf69f SL |
513 | return Cow::Owned(unsafe { String::from_utf8_unchecked(utf8_bytes) }) |
514 | } | |
515 | } | |
516 | } | |
517 | } | |
518 | ||
9346a6ac | 519 | /// Converts the WTF-8 string to potentially ill-formed UTF-16 |
85aaf69f SL |
520 | /// and return an iterator of 16-bit code units. |
521 | /// | |
522 | /// This is lossless: | |
523 | /// calling `Wtf8Buf::from_ill_formed_utf16` on the resulting code units | |
524 | /// would always return the original WTF-8 string. | |
525 | #[inline] | |
526 | pub fn encode_wide(&self) -> EncodeWide { | |
527 | EncodeWide { code_points: self.code_points(), extra: 0 } | |
528 | } | |
529 | ||
530 | #[inline] | |
c34b1796 | 531 | fn next_surrogate(&self, mut pos: usize) -> Option<(usize, u16)> { |
85aaf69f SL |
532 | let mut iter = self.bytes[pos..].iter(); |
533 | loop { | |
534 | let b = match iter.next() { | |
535 | None => return None, | |
536 | Some(&b) => b, | |
537 | }; | |
538 | if b < 0x80 { | |
539 | pos += 1; | |
540 | } else if b < 0xE0 { | |
541 | iter.next(); | |
542 | pos += 2; | |
543 | } else if b == 0xED { | |
544 | match (iter.next(), iter.next()) { | |
545 | (Some(&b2), Some(&b3)) if b2 >= 0xA0 => { | |
546 | return Some((pos, decode_surrogate(b2, b3))) | |
547 | } | |
548 | _ => pos += 3 | |
549 | } | |
550 | } else if b < 0xF0 { | |
551 | iter.next(); | |
552 | iter.next(); | |
553 | pos += 3; | |
554 | } else { | |
555 | iter.next(); | |
556 | iter.next(); | |
557 | iter.next(); | |
558 | pos += 4; | |
559 | } | |
560 | } | |
561 | } | |
562 | ||
563 | #[inline] | |
564 | fn final_lead_surrogate(&self) -> Option<u16> { | |
565 | let len = self.len(); | |
566 | if len < 3 { | |
567 | return None | |
568 | } | |
569 | match &self.bytes[(len - 3)..] { | |
570 | [0xED, b2 @ 0xA0...0xAF, b3] => Some(decode_surrogate(b2, b3)), | |
571 | _ => None | |
572 | } | |
573 | } | |
574 | ||
575 | #[inline] | |
576 | fn initial_trail_surrogate(&self) -> Option<u16> { | |
577 | let len = self.len(); | |
578 | if len < 3 { | |
579 | return None | |
580 | } | |
581 | match &self.bytes[..3] { | |
582 | [0xED, b2 @ 0xB0...0xBF, b3] => Some(decode_surrogate(b2, b3)), | |
583 | _ => None | |
584 | } | |
585 | } | |
586 | } | |
587 | ||
588 | ||
589 | /// Return a slice of the given string for the byte range [`begin`..`end`). | |
590 | /// | |
591 | /// # Panics | |
592 | /// | |
593 | /// Panics when `begin` and `end` do not point to code point boundaries, | |
594 | /// or point beyond the end of the string. | |
595 | impl ops::Index<ops::Range<usize>> for Wtf8 { | |
596 | type Output = Wtf8; | |
597 | ||
598 | #[inline] | |
c34b1796 | 599 | fn index(&self, range: ops::Range<usize>) -> &Wtf8 { |
85aaf69f SL |
600 | // is_code_point_boundary checks that the index is in [0, .len()] |
601 | if range.start <= range.end && | |
602 | is_code_point_boundary(self, range.start) && | |
603 | is_code_point_boundary(self, range.end) { | |
604 | unsafe { slice_unchecked(self, range.start, range.end) } | |
605 | } else { | |
606 | slice_error_fail(self, range.start, range.end) | |
607 | } | |
608 | } | |
609 | } | |
610 | ||
611 | /// Return a slice of the given string from byte `begin` to its end. | |
612 | /// | |
613 | /// # Panics | |
614 | /// | |
615 | /// Panics when `begin` is not at a code point boundary, | |
616 | /// or is beyond the end of the string. | |
617 | impl ops::Index<ops::RangeFrom<usize>> for Wtf8 { | |
618 | type Output = Wtf8; | |
619 | ||
620 | #[inline] | |
c34b1796 | 621 | fn index(&self, range: ops::RangeFrom<usize>) -> &Wtf8 { |
85aaf69f SL |
622 | // is_code_point_boundary checks that the index is in [0, .len()] |
623 | if is_code_point_boundary(self, range.start) { | |
624 | unsafe { slice_unchecked(self, range.start, self.len()) } | |
625 | } else { | |
626 | slice_error_fail(self, range.start, self.len()) | |
627 | } | |
628 | } | |
629 | } | |
630 | ||
631 | /// Return a slice of the given string from its beginning to byte `end`. | |
632 | /// | |
633 | /// # Panics | |
634 | /// | |
635 | /// Panics when `end` is not at a code point boundary, | |
636 | /// or is beyond the end of the string. | |
637 | impl ops::Index<ops::RangeTo<usize>> for Wtf8 { | |
638 | type Output = Wtf8; | |
639 | ||
640 | #[inline] | |
c34b1796 | 641 | fn index(&self, range: ops::RangeTo<usize>) -> &Wtf8 { |
85aaf69f SL |
642 | // is_code_point_boundary checks that the index is in [0, .len()] |
643 | if is_code_point_boundary(self, range.end) { | |
644 | unsafe { slice_unchecked(self, 0, range.end) } | |
645 | } else { | |
646 | slice_error_fail(self, 0, range.end) | |
647 | } | |
648 | } | |
649 | } | |
650 | ||
651 | impl ops::Index<ops::RangeFull> for Wtf8 { | |
652 | type Output = Wtf8; | |
653 | ||
654 | #[inline] | |
c34b1796 | 655 | fn index(&self, _range: ops::RangeFull) -> &Wtf8 { |
85aaf69f SL |
656 | self |
657 | } | |
658 | } | |
659 | ||
/// Decodes the 16-bit surrogate value from the second and third bytes
/// of its three-byte WTF-8 sequence.
#[inline]
fn decode_surrogate(second_byte: u8, third_byte: u8) -> u16 {
    // The first byte is assumed to be 0xED
    let high_bits = ((second_byte & 0x3F) as u16) << 6;
    let low_bits = (third_byte & 0x3F) as u16;
    0xD800 | high_bits | low_bits
}
665 | ||
/// Combines a lead and a trail surrogate into the supplementary
/// code point that the pair encodes in UTF-16.
#[inline]
fn decode_surrogate_pair(lead: u16, trail: u16) -> char {
    let high = ((lead - 0xD800) as u32) << 10;
    let low = (trail - 0xDC00) as u32;
    let code_point = 0x10000 + (high | low);
    unsafe { char::from_u32_unchecked(code_point) }
}
671 | ||
672 | /// Copied from core::str::StrPrelude::is_char_boundary | |
673 | #[inline] | |
c34b1796 | 674 | pub fn is_code_point_boundary(slice: &Wtf8, index: usize) -> bool { |
85aaf69f SL |
675 | if index == slice.len() { return true; } |
676 | match slice.bytes.get(index) { | |
677 | None => false, | |
c34b1796 | 678 | Some(&b) => b < 128 || b >= 192, |
85aaf69f SL |
679 | } |
680 | } | |
681 | ||
682 | /// Copied from core::str::raw::slice_unchecked | |
683 | #[inline] | |
c34b1796 AL |
684 | pub unsafe fn slice_unchecked(s: &Wtf8, begin: usize, end: usize) -> &Wtf8 { |
685 | // memory layout of an &[u8] and &Wtf8 are the same | |
c1a9b12d | 686 | Wtf8::from_bytes_unchecked(slice::from_raw_parts( |
c34b1796 AL |
687 | s.bytes.as_ptr().offset(begin as isize), |
688 | end - begin | |
689 | )) | |
85aaf69f SL |
690 | } |
691 | ||
692 | /// Copied from core::str::raw::slice_error_fail | |
693 | #[inline(never)] | |
c34b1796 | 694 | pub fn slice_error_fail(s: &Wtf8, begin: usize, end: usize) -> ! { |
85aaf69f SL |
695 | assert!(begin <= end); |
696 | panic!("index {} and/or {} in `{:?}` do not lie on character boundary", | |
697 | begin, end, s); | |
698 | } | |
699 | ||
/// Iterator over the code points of a WTF-8 string.
///
/// Obtained by calling `.code_points()` on a `Wtf8` slice.
#[derive(Clone)]
pub struct Wtf8CodePoints<'a> {
    /// The bytes that have not been decoded yet.
    bytes: slice::Iter<'a, u8>,
}
707 | ||
708 | impl<'a> Iterator for Wtf8CodePoints<'a> { | |
709 | type Item = CodePoint; | |
710 | ||
711 | #[inline] | |
712 | fn next(&mut self) -> Option<CodePoint> { | |
713 | next_code_point(&mut self.bytes).map(|c| CodePoint { value: c }) | |
714 | } | |
715 | ||
716 | #[inline] | |
c34b1796 | 717 | fn size_hint(&self) -> (usize, Option<usize>) { |
85aaf69f SL |
718 | let (len, _) = self.bytes.size_hint(); |
719 | (len.saturating_add(3) / 4, Some(len)) | |
720 | } | |
721 | } | |
722 | ||
92a42be0 | 723 | #[stable(feature = "rust1", since = "1.0.0")] |
85aaf69f SL |
724 | #[derive(Clone)] |
725 | pub struct EncodeWide<'a> { | |
726 | code_points: Wtf8CodePoints<'a>, | |
727 | extra: u16 | |
728 | } | |
729 | ||
730 | // Copied from libunicode/u_str.rs | |
92a42be0 | 731 | #[stable(feature = "rust1", since = "1.0.0")] |
85aaf69f SL |
732 | impl<'a> Iterator for EncodeWide<'a> { |
733 | type Item = u16; | |
734 | ||
735 | #[inline] | |
736 | fn next(&mut self) -> Option<u16> { | |
737 | if self.extra != 0 { | |
738 | let tmp = self.extra; | |
739 | self.extra = 0; | |
740 | return Some(tmp); | |
741 | } | |
742 | ||
85aaf69f | 743 | self.code_points.next().map(|code_point| { |
54a0048b SL |
744 | let n = unsafe { |
745 | char::from_u32_unchecked(code_point.value).encode_utf16() | |
746 | }; | |
747 | let n = n.as_slice(); | |
748 | if n.len() == 2 { | |
749 | self.extra = n[1]; | |
750 | } | |
751 | n[0] | |
85aaf69f SL |
752 | }) |
753 | } | |
754 | ||
755 | #[inline] | |
c34b1796 | 756 | fn size_hint(&self) -> (usize, Option<usize>) { |
85aaf69f SL |
757 | let (low, high) = self.code_points.size_hint(); |
758 | // every code point gets either one u16 or two u16, | |
759 | // so this iterator is between 1 or 2 times as | |
760 | // long as the underlying iterator. | |
761 | (low, high.and_then(|n| n.checked_mul(2))) | |
762 | } | |
763 | } | |
764 | ||
85aaf69f SL |
765 | impl Hash for CodePoint { |
766 | #[inline] | |
767 | fn hash<H: Hasher>(&self, state: &mut H) { | |
768 | self.value.hash(state) | |
769 | } | |
770 | } | |
771 | ||
85aaf69f SL |
772 | impl Hash for Wtf8Buf { |
773 | #[inline] | |
774 | fn hash<H: Hasher>(&self, state: &mut H) { | |
775 | state.write(&self.bytes); | |
776 | 0xfeu8.hash(state) | |
777 | } | |
778 | } | |
779 | ||
85aaf69f SL |
780 | impl Hash for Wtf8 { |
781 | #[inline] | |
782 | fn hash<H: Hasher>(&self, state: &mut H) { | |
783 | state.write(&self.bytes); | |
784 | 0xfeu8.hash(state) | |
785 | } | |
786 | } | |
787 | ||
788 | impl AsciiExt for Wtf8 { | |
789 | type Owned = Wtf8Buf; | |
790 | ||
791 | fn is_ascii(&self) -> bool { | |
792 | self.bytes.is_ascii() | |
793 | } | |
794 | fn to_ascii_uppercase(&self) -> Wtf8Buf { | |
795 | Wtf8Buf { bytes: self.bytes.to_ascii_uppercase() } | |
796 | } | |
797 | fn to_ascii_lowercase(&self) -> Wtf8Buf { | |
798 | Wtf8Buf { bytes: self.bytes.to_ascii_lowercase() } | |
799 | } | |
800 | fn eq_ignore_ascii_case(&self, other: &Wtf8) -> bool { | |
801 | self.bytes.eq_ignore_ascii_case(&other.bytes) | |
802 | } | |
803 | ||
804 | fn make_ascii_uppercase(&mut self) { self.bytes.make_ascii_uppercase() } | |
805 | fn make_ascii_lowercase(&mut self) { self.bytes.make_ascii_lowercase() } | |
806 | } | |
807 | ||
808 | #[cfg(test)] | |
809 | mod tests { | |
810 | use prelude::v1::*; | |
811 | use borrow::Cow; | |
812 | use super::*; | |
85aaf69f SL |
813 | |
814 | #[test] | |
815 | fn code_point_from_u32() { | |
816 | assert!(CodePoint::from_u32(0).is_some()); | |
817 | assert!(CodePoint::from_u32(0xD800).is_some()); | |
818 | assert!(CodePoint::from_u32(0x10FFFF).is_some()); | |
819 | assert!(CodePoint::from_u32(0x110000).is_none()); | |
820 | } | |
821 | ||
822 | #[test] | |
823 | fn code_point_to_u32() { | |
824 | fn c(value: u32) -> CodePoint { CodePoint::from_u32(value).unwrap() } | |
825 | assert_eq!(c(0).to_u32(), 0); | |
826 | assert_eq!(c(0xD800).to_u32(), 0xD800); | |
827 | assert_eq!(c(0x10FFFF).to_u32(), 0x10FFFF); | |
828 | } | |
829 | ||
// `from_char` keeps the scalar value of the character.
#[test]
fn code_point_from_char() {
    let ascii = CodePoint::from_char('a');
    let astral = CodePoint::from_char('💩');
    assert_eq!(ascii.to_u32(), 0x61);
    assert_eq!(astral.to_u32(), 0x1F4A9);
}
835 | ||
// The `Debug` rendering of a code point is `U+` followed by uppercase hex,
// at least four digits wide for the values checked here.
#[test]
fn code_point_to_string() {
    let ascii = format!("{:?}", CodePoint::from_char('a'));
    assert_eq!(ascii, "U+0061");

    let astral = format!("{:?}", CodePoint::from_char('💩'));
    assert_eq!(astral, "U+1F4A9");
}
841 | ||
// `to_char` yields `Some` for the two scalar values and `None` for 0xD800.
#[test]
fn code_point_to_char() {
    let as_char = |value: u32| CodePoint::from_u32(value).unwrap().to_char();
    assert_eq!(as_char(0x61), Some('a'));
    assert_eq!(as_char(0x1F4A9), Some('💩'));
    assert_eq!(as_char(0xD800), None);
}
849 | ||
// `to_char_lossy` maps 0xD800 to U+FFFD and leaves the other values intact.
#[test]
fn code_point_to_char_lossy() {
    let lossy = |value: u32| CodePoint::from_u32(value).unwrap().to_char_lossy();
    assert_eq!(lossy(0x61), 'a');
    assert_eq!(lossy(0x1F4A9), '💩');
    assert_eq!(lossy(0xD800), '\u{FFFD}');
}
857 | ||
// A freshly created buffer holds no bytes.
#[test]
fn wtf8buf_new() {
    let empty = Wtf8Buf::new();
    assert_eq!(empty.bytes, b"");
}
862 | ||
// Building from `&str` stores the string's UTF-8 bytes unchanged.
#[test]
fn wtf8buf_from_str() {
    let empty = Wtf8Buf::from_str("");
    assert_eq!(empty.bytes, b"");

    let mixed = Wtf8Buf::from_str("aé 💩");
    assert_eq!(mixed.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
}
869 | ||
// Building from an owned `String` stores the string's UTF-8 bytes unchanged.
#[test]
fn wtf8buf_from_string() {
    let empty = Wtf8Buf::from_string(String::new());
    assert_eq!(empty.bytes, b"");

    let source = String::from("aé 💩");
    let mixed = Wtf8Buf::from_string(source);
    assert_eq!(mixed.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
}
876 | ||
// `from_wide` converts 16-bit units: the paired surrogates 0xD83D 0xDCA9
// become one four-byte sequence, while the preceding lone 0xD83D is kept as
// a three-byte sequence.
#[test]
fn wtf8buf_from_wide() {
    let no_units: [u16; 0] = [];
    assert_eq!(Wtf8Buf::from_wide(&no_units).bytes, b"");

    let units: [u16; 6] = [0x61, 0xE9, 0x20, 0xD83D, 0xD83D, 0xDCA9];
    let converted = Wtf8Buf::from_wide(&units);
    assert_eq!(converted.bytes, b"a\xC3\xA9 \xED\xA0\xBD\xF0\x9F\x92\xA9");
}
884 | ||
// `push_str` appends the UTF-8 bytes of the given string.
#[test]
fn wtf8buf_push_str() {
    let mut buf = Wtf8Buf::new();
    assert_eq!(buf.bytes, b"");

    buf.push_str("aé 💩");
    assert_eq!(buf.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
}
892 | ||
// `push_char` appends the UTF-8 encoding of one `char`.
#[test]
fn wtf8buf_push_char() {
    let mut buf = Wtf8Buf::from_str("aé ");
    assert_eq!(buf.bytes, b"a\xC3\xA9 ");

    buf.push_char('💩');
    assert_eq!(buf.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
}
900 | ||
// `push` appends one code point at a time. A lead surrogate immediately
// followed by a trail surrogate is merged into a single four-byte sequence;
// every other combination keeps each surrogate as its own three-byte sequence.
#[test]
fn wtf8buf_push() {
    let mut buf = Wtf8Buf::from_str("aé ");
    assert_eq!(buf.bytes, b"a\xC3\xA9 ");
    buf.push(CodePoint::from_char('💩'));
    assert_eq!(buf.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");

    // Builds a fresh buffer by pushing every value in order.
    fn pushed(values: &[u32]) -> Wtf8Buf {
        let mut out = Wtf8Buf::new();
        for &value in values {
            out.push(CodePoint::from_u32(value).unwrap());
        }
        out
    }

    // lead, trail
    assert_eq!(pushed(&[0xD83D, 0xDCA9]).bytes, b"\xF0\x9F\x92\xA9"); // Magic!

    // lead, not surrogate, trail
    assert_eq!(pushed(&[0xD83D, 0x20, 0xDCA9]).bytes, b"\xED\xA0\xBD \xED\xB2\xA9");

    // lead, lead
    assert_eq!(pushed(&[0xD800, 0xDBFF]).bytes, b"\xED\xA0\x80\xED\xAF\xBF");

    // lead, not surrogate
    assert_eq!(pushed(&[0xD800, 0xE000]).bytes, b"\xED\xA0\x80\xEE\x80\x80");

    // not surrogate, trail
    assert_eq!(pushed(&[0xD7FF, 0xDC00]).bytes, b"\xED\x9F\xBF\xED\xB0\x80");

    // not surrogate (< 3 bytes), trail
    assert_eq!(pushed(&[0x61, 0xDC00]).bytes, b"\x61\xED\xB0\x80");

    // trail only
    assert_eq!(pushed(&[0xDC00]).bytes, b"\xED\xB0\x80");
}
945 | ||
// `push_wtf8` appends a whole slice. When the buffer ends with a lead
// surrogate and the appended slice starts with a trail surrogate, the two are
// merged into one four-byte sequence; other combinations are simply
// concatenated.
#[test]
fn wtf8buf_push_wtf8() {
    let mut buf = Wtf8Buf::from_str("aé");
    assert_eq!(buf.bytes, b"a\xC3\xA9");
    buf.push_wtf8(Wtf8::from_str(" 💩"));
    assert_eq!(buf.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");

    // Views hand-written bytes as `Wtf8` without validation.
    // NOTE(review): the inputs below are presumably required to be
    // well-formed WTF-8 — confirm against `from_bytes_unchecked`.
    fn w(v: &[u8]) -> &Wtf8 { unsafe { Wtf8::from_bytes_unchecked(v) } }

    // Builds a fresh buffer by appending every part in order.
    fn joined(parts: &[&Wtf8]) -> Wtf8Buf {
        let mut out = Wtf8Buf::new();
        for &part in parts {
            out.push_wtf8(part);
        }
        out
    }

    // lead, trail
    assert_eq!(joined(&[w(b"\xED\xA0\xBD"), w(b"\xED\xB2\xA9")]).bytes,
               b"\xF0\x9F\x92\xA9"); // Magic!

    // lead, not surrogate, trail
    assert_eq!(joined(&[w(b"\xED\xA0\xBD"), w(b" "), w(b"\xED\xB2\xA9")]).bytes,
               b"\xED\xA0\xBD \xED\xB2\xA9");

    // lead, lead
    assert_eq!(joined(&[w(b"\xED\xA0\x80"), w(b"\xED\xAF\xBF")]).bytes,
               b"\xED\xA0\x80\xED\xAF\xBF");

    // lead, not surrogate
    assert_eq!(joined(&[w(b"\xED\xA0\x80"), w(b"\xEE\x80\x80")]).bytes,
               b"\xED\xA0\x80\xEE\x80\x80");

    // not surrogate, trail
    assert_eq!(joined(&[w(b"\xED\x9F\xBF"), w(b"\xED\xB0\x80")]).bytes,
               b"\xED\x9F\xBF\xED\xB0\x80");

    // not surrogate (< 3 bytes), trail
    assert_eq!(joined(&[w(b"a"), w(b"\xED\xB0\x80")]).bytes,
               b"\x61\xED\xB0\x80");

    // trail only
    assert_eq!(joined(&[w(b"\xED\xB0\x80")]).bytes, b"\xED\xB0\x80");
}
990 | ||
// Truncating "aé" to length 1 leaves only the leading `a`.
#[test]
fn wtf8buf_truncate() {
    let mut buf = Wtf8Buf::from_str("aé");
    buf.truncate(1);
    assert_eq!(buf.bytes, b"a");
}
997 | ||
// Length 2 falls between the two bytes of `é`, so `truncate` must panic.
#[test]
#[should_panic]
fn wtf8buf_truncate_fail_code_point_boundary() {
    let mut buf = Wtf8Buf::from_str("aé");
    buf.truncate(2);
}
1004 | ||
// "aé" is three bytes long; asking for length 4 must panic.
#[test]
#[should_panic]
fn wtf8buf_truncate_fail_longer() {
    let mut buf = Wtf8Buf::from_str("aé");
    buf.truncate(4);
}
1011 | ||
// `into_string` succeeds for well-formed content and hands the buffer back
// unchanged in `Err` once it contains a lone surrogate.
#[test]
fn wtf8buf_into_string() {
    let mut buf = Wtf8Buf::from_str("aé 💩");
    let well_formed = buf.clone().into_string();
    assert_eq!(well_formed, Ok(String::from("aé 💩")));

    buf.push(CodePoint::from_u32(0xD800).unwrap());
    let with_surrogate = buf.clone().into_string();
    assert_eq!(with_surrogate, Err(buf));
}
1019 | ||
// `into_string_lossy` always yields a `String`; a lone surrogate is replaced
// by U+FFFD.
#[test]
fn wtf8buf_into_string_lossy() {
    let mut buf = Wtf8Buf::from_str("aé 💩");
    let well_formed = buf.clone().into_string_lossy();
    assert_eq!(well_formed, String::from("aé 💩"));

    buf.push(CodePoint::from_u32(0xD800).unwrap());
    let with_surrogate = buf.clone().into_string_lossy();
    assert_eq!(with_surrogate, String::from("aé 💩�"));
}
1027 | ||
// Collecting code points into a `Wtf8Buf` gives the same bytes as pushing
// them one by one, including the merging of a lead/trail surrogate pair.
#[test]
fn wtf8buf_from_iterator() {
    // Collects the given values, interpreted as code points, into a buffer.
    fn collected(values: &[u32]) -> Wtf8Buf {
        values.iter()
              .map(|value| CodePoint::from_u32(*value).unwrap())
              .collect()
    }

    let plain = collected(&[0x61, 0xE9, 0x20, 0x1F4A9]);
    assert_eq!(plain.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");

    assert_eq!(collected(&[0xD83D, 0xDCA9]).bytes, b"\xF0\x9F\x92\xA9"); // Magic!
    assert_eq!(collected(&[0xD83D, 0x20, 0xDCA9]).bytes, b"\xED\xA0\xBD \xED\xB2\xA9");
    assert_eq!(collected(&[0xD800, 0xDBFF]).bytes, b"\xED\xA0\x80\xED\xAF\xBF");
    assert_eq!(collected(&[0xD800, 0xE000]).bytes, b"\xED\xA0\x80\xEE\x80\x80");
    assert_eq!(collected(&[0xD7FF, 0xDC00]).bytes, b"\xED\x9F\xBF\xED\xB0\x80");
    assert_eq!(collected(&[0x61, 0xDC00]).bytes, b"\x61\xED\xB0\x80");
    assert_eq!(collected(&[0xDC00]).bytes, b"\xED\xB0\x80");
}
1043 | ||
// `extend` on an existing buffer gives the same bytes as pushing the extra
// code points one by one; a lead surrogate at the end of the buffer is merged
// with a trail surrogate at the start of the extension.
#[test]
fn wtf8buf_extend() {
    // Collects `initial`, then extends the result with `extra`.
    fn extended(initial: &[u32], extra: &[u32]) -> Wtf8Buf {
        fn code_point(value: &u32) -> CodePoint {
            CodePoint::from_u32(*value).unwrap()
        }
        let mut buf: Wtf8Buf = initial.iter().map(code_point).collect();
        buf.extend(extra.iter().map(code_point));
        buf
    }

    let plain = extended(&[0x61, 0xE9], &[0x20, 0x1F4A9]);
    assert_eq!(plain.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");

    assert_eq!(extended(&[0xD83D], &[0xDCA9]).bytes, b"\xF0\x9F\x92\xA9"); // Magic!
    assert_eq!(extended(&[0xD83D, 0x20], &[0xDCA9]).bytes, b"\xED\xA0\xBD \xED\xB2\xA9");
    assert_eq!(extended(&[0xD800], &[0xDBFF]).bytes, b"\xED\xA0\x80\xED\xAF\xBF");
    assert_eq!(extended(&[0xD800], &[0xE000]).bytes, b"\xED\xA0\x80\xEE\x80\x80");
    assert_eq!(extended(&[0xD7FF], &[0xDC00]).bytes, b"\xED\x9F\xBF\xED\xB0\x80");
    assert_eq!(extended(&[0x61], &[0xDC00]).bytes, b"\x61\xED\xB0\x80");
    assert_eq!(extended(&[], &[0xDC00]).bytes, b"\xED\xB0\x80");
}
1064 | ||
// The `Debug` output is quoted and escaped; the trailing lone surrogate is
// rendered as `\u{D800}`.
#[test]
fn wtf8buf_show() {
    let mut buf = Wtf8Buf::from_str("a\té 💩\r");
    buf.push(CodePoint::from_u32(0xD800).unwrap());

    let rendered = format!("{:?}", buf);
    assert_eq!(rendered, r#""a\t\u{e9} \u{1f4a9}\r\u{D800}""#);
}
1071 | ||
// Borrowing an owned buffer as a slice compares equal to the slice made
// from the same `&str`.
#[test]
fn wtf8buf_as_slice() {
    let owned = Wtf8Buf::from_str("aé");
    assert_eq!(owned.as_slice(), Wtf8::from_str("aé"));
}
1076 | ||
// For this input the `Debug` output of the buffer matches the `Debug`
// output of the `&str` it was built from.
#[test]
fn wtf8buf_show_str() {
    let text = "a\té 💩\r";
    let buf = Wtf8Buf::from_str(text);

    let from_str = format!("{:?}", text);
    let from_buf = format!("{:?}", buf);
    assert_eq!(from_str, from_buf);
}
1083 | ||
// A `Wtf8` slice made from `&str` exposes the string's UTF-8 bytes.
#[test]
fn wtf8_from_str() {
    let empty = Wtf8::from_str("");
    assert_eq!(&empty.bytes, b"");

    let mixed = Wtf8::from_str("aé 💩");
    assert_eq!(&mixed.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
}
1089 | ||
// `len` counts bytes: "aé 💩" is 1 + 2 + 1 + 4 = 8 bytes long.
#[test]
fn wtf8_len() {
    let empty = Wtf8::from_str("");
    assert_eq!(empty.len(), 0);

    let mixed = Wtf8::from_str("aé 💩");
    assert_eq!(mixed.len(), 8);
}
1095 | ||
// Slicing with `start..end` on code point boundaries returns those bytes.
#[test]
fn wtf8_slice() {
    let whole = Wtf8::from_str("aé 💩");
    let middle = &whole[1..4];
    assert_eq!(&middle.bytes, b"\xC3\xA9 ");
}
1100 | ||
// Index 2 falls between the two bytes of `é`, so `2..4` must panic.
#[test]
#[should_panic]
fn wtf8_slice_not_code_point_boundary() {
    let whole = Wtf8::from_str("aé 💩");
    let _ = &whole[2..4];
}
1106 | ||
// Slicing with `start..` on a code point boundary returns the tail bytes.
#[test]
fn wtf8_slice_from() {
    let whole = Wtf8::from_str("aé 💩");
    let tail = &whole[1..];
    assert_eq!(&tail.bytes, b"\xC3\xA9 \xF0\x9F\x92\xA9");
}
1111 | ||
// Index 2 falls between the two bytes of `é`, so `2..` must panic.
#[test]
#[should_panic]
fn wtf8_slice_from_not_code_point_boundary() {
    let whole = Wtf8::from_str("aé 💩");
    let _ = &whole[2..];
}
1117 | ||
// Slicing with `..end` on a code point boundary returns the head bytes.
#[test]
fn wtf8_slice_to() {
    let whole = Wtf8::from_str("aé 💩");
    let head = &whole[..4];
    assert_eq!(&head.bytes, b"a\xC3\xA9 ");
}
1122 | ||
// Slicing with `..end` must panic when `end` is not a code point boundary.
// In "aé 💩" the four-byte encoding of '💩' occupies bytes 4..8, so index 5
// lies inside it.
//
// The range here is `..5` (a "slice to"); the earlier `5..` form exercised
// the "slice from" path a second time and left this path untested.
#[test]
#[should_panic]
fn wtf8_slice_to_not_code_point_boundary() {
    let _ = &Wtf8::from_str("aé 💩")[..5];
}
1128 | ||
// `ascii_byte_at` returns the byte itself at ASCII positions and 0xFF at
// positions holding part of a multi-byte sequence.
#[test]
fn wtf8_ascii_byte_at() {
    let slice = Wtf8::from_str("aé 💩");
    let expected = [b'a', b'\xFF', b'\xFF', b' ', b'\xFF'];
    for (position, &byte) in expected.iter().enumerate() {
        assert_eq!(slice.ascii_byte_at(position), byte);
    }
}
1138 | ||
85aaf69f SL |
// `code_points` iterates decoded code points: a lone lead surrogate shows up
// as a non-`char` item, and pushing the matching trail surrogate turns the
// pair into a single '💩'.
#[test]
fn wtf8_code_points() {
    // Decodes the buffer and maps every code point through `to_char`.
    fn chars_of(buf: &Wtf8Buf) -> Vec<Option<char>> {
        buf.code_points().map(|point| point.to_char()).collect()
    }

    let mut buf = Wtf8Buf::from_str("é ");
    assert_eq!(chars_of(&buf), [Some('é'), Some(' ')]);

    buf.push(CodePoint::from_u32(0xD83D).unwrap());
    assert_eq!(chars_of(&buf), [Some('é'), Some(' '), None]);

    buf.push(CodePoint::from_u32(0xDCA9).unwrap());
    assert_eq!(chars_of(&buf), [Some('é'), Some(' '), Some('💩')]);
}
1152 | ||
// `as_str` returns the content as `&str` when it is well-formed and `None`
// when it contains a lone surrogate.
#[test]
fn wtf8_as_str() {
    let empty = Wtf8::from_str("");
    assert_eq!(empty.as_str(), Some(""));

    let mixed = Wtf8::from_str("aé 💩");
    assert_eq!(mixed.as_str(), Some("aé 💩"));

    let mut lone_surrogate = Wtf8Buf::new();
    lone_surrogate.push(CodePoint::from_u32(0xD800).unwrap());
    assert_eq!(lone_surrogate.as_str(), None);
}
1161 | ||
// `to_string_lossy` borrows when the content is well-formed and returns an
// owned string, with U+FFFD in place of the lone surrogate, when it is not.
#[test]
fn wtf8_to_string_lossy() {
    let empty = Wtf8::from_str("").to_string_lossy();
    assert_eq!(empty, Cow::Borrowed(""));

    let mixed = Wtf8::from_str("aé 💩").to_string_lossy();
    assert_eq!(mixed, Cow::Borrowed("aé 💩"));

    let mut buf = Wtf8Buf::from_str("aé 💩");
    buf.push(CodePoint::from_u32(0xD800).unwrap());
    let expected: Cow<str> = Cow::Owned(String::from("aé 💩�"));
    assert_eq!(buf.to_string_lossy(), expected);
}
1171 | ||
// `encode_wide` yields 16-bit units: the lone lead surrogate comes back as
// is, and '💩' becomes the surrogate pair 0xD83D 0xDCA9.
#[test]
fn wtf8_encode_wide() {
    let mut buf = Wtf8Buf::from_str("aé ");
    buf.push(CodePoint::from_u32(0xD83D).unwrap());
    buf.push_char('💩');

    let units: Vec<u16> = buf.encode_wide().collect();
    assert_eq!(units, vec![0x61, 0xE9, 0x20, 0xD83D, 0xD83D, 0xDCA9]);
}
1180 | } |