]>
Commit | Line | Data |
---|---|---|
85aaf69f SL |
1 | // Copyright 2015 The Rust Project Developers. See the COPYRIGHT |
2 | // file at the top-level directory of this distribution and at | |
3 | // http://rust-lang.org/COPYRIGHT. | |
4 | // | |
5 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or | |
6 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license | |
7 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your | |
8 | // option. This file may not be copied, modified, or distributed | |
9 | // except according to those terms. | |
10 | ||
11 | //! Implementation of [the WTF-8 encoding](https://simonsapin.github.io/wtf-8/). | |
12 | //! | |
13 | //! This library uses Rust’s type system to maintain | |
14 | //! [well-formedness](https://simonsapin.github.io/wtf-8/#well-formed), | |
15 | //! like the `String` and `&str` types do for UTF-8. | |
16 | //! | |
17 | //! Since [WTF-8 must not be used | |
18 | //! for interchange](https://simonsapin.github.io/wtf-8/#intended-audience), | |
19 | //! this library deliberately does not provide access to the underlying bytes | |
20 | //! of WTF-8 strings, | |
21 | //! nor can it decode WTF-8 from arbitrary bytes. | |
22 | //! WTF-8 strings can be obtained from UTF-8, UTF-16, or code points. | |
23 | ||
c34b1796 AL |
24 | // this module is imported from @SimonSapin's repo and has tons of dead code on |
25 | // unix (it's mostly used on windows), so don't worry about dead code here. | |
26 | #![allow(dead_code)] | |
27 | ||
85aaf69f SL |
28 | use core::prelude::*; |
29 | ||
30 | use core::char::{encode_utf8_raw, encode_utf16_raw}; | |
62682a34 | 31 | use core::str::next_code_point; |
85aaf69f SL |
32 | |
33 | use ascii::*; | |
34 | use borrow::Cow; | |
c1a9b12d | 35 | use char; |
85aaf69f SL |
36 | use cmp; |
37 | use fmt; | |
38 | use hash::{Hash, Hasher}; | |
9346a6ac | 39 | use iter::FromIterator; |
85aaf69f | 40 | use mem; |
85aaf69f | 41 | use ops; |
c1a9b12d | 42 | use rustc_unicode::str::{Utf16Item, utf16_items}; |
85aaf69f SL |
43 | use slice; |
44 | use str; | |
c34b1796 | 45 | use string::String; |
85aaf69f | 46 | use sys_common::AsInner; |
85aaf69f SL |
47 | use vec::Vec; |
48 | ||
c34b1796 | 49 | const UTF8_REPLACEMENT_CHARACTER: &'static [u8] = b"\xEF\xBF\xBD"; |
85aaf69f SL |
50 | |
/// A Unicode code point, i.e. any value from U+0000 through U+10FFFF.
///
/// Unlike `char`, which only holds Unicode scalar values,
/// a `CodePoint` may also be a surrogate (U+D800 to U+DFFF).
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub struct CodePoint {
    value: u32,
}
60 | ||
61 | /// Format the code point as `U+` followed by four to six hexadecimal digits. | |
62 | /// Example: `U+1F4A9` | |
63 | impl fmt::Debug for CodePoint { | |
64 | #[inline] | |
65 | fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> { | |
66 | write!(formatter, "U+{:04X}", self.value) | |
67 | } | |
68 | } | |
69 | ||
70 | impl CodePoint { | |
9346a6ac | 71 | /// Unsafely creates a new `CodePoint` without checking the value. |
85aaf69f SL |
72 | /// |
73 | /// Only use when `value` is known to be less than or equal to 0x10FFFF. | |
74 | #[inline] | |
75 | pub unsafe fn from_u32_unchecked(value: u32) -> CodePoint { | |
76 | CodePoint { value: value } | |
77 | } | |
78 | ||
9346a6ac | 79 | /// Creates a new `CodePoint` if the value is a valid code point. |
85aaf69f | 80 | /// |
9346a6ac | 81 | /// Returns `None` if `value` is above 0x10FFFF. |
85aaf69f SL |
82 | #[inline] |
83 | pub fn from_u32(value: u32) -> Option<CodePoint> { | |
84 | match value { | |
85 | 0 ... 0x10FFFF => Some(CodePoint { value: value }), | |
86 | _ => None | |
87 | } | |
88 | } | |
89 | ||
9346a6ac | 90 | /// Creates a new `CodePoint` from a `char`. |
85aaf69f SL |
91 | /// |
92 | /// Since all Unicode scalar values are code points, this always succeeds. | |
93 | #[inline] | |
94 | pub fn from_char(value: char) -> CodePoint { | |
95 | CodePoint { value: value as u32 } | |
96 | } | |
97 | ||
9346a6ac | 98 | /// Returns the numeric value of the code point. |
85aaf69f SL |
99 | #[inline] |
100 | pub fn to_u32(&self) -> u32 { | |
101 | self.value | |
102 | } | |
103 | ||
9346a6ac | 104 | /// Optionally returns a Unicode scalar value for the code point. |
85aaf69f | 105 | /// |
9346a6ac | 106 | /// Returns `None` if the code point is a surrogate (from U+D800 to U+DFFF). |
85aaf69f SL |
107 | #[inline] |
108 | pub fn to_char(&self) -> Option<char> { | |
109 | match self.value { | |
110 | 0xD800 ... 0xDFFF => None, | |
c1a9b12d | 111 | _ => Some(unsafe { char::from_u32_unchecked(self.value) }) |
85aaf69f SL |
112 | } |
113 | } | |
114 | ||
9346a6ac | 115 | /// Returns a Unicode scalar value for the code point. |
85aaf69f | 116 | /// |
9346a6ac | 117 | /// Returns `'\u{FFFD}'` (the replacement character “�”) |
85aaf69f SL |
118 | /// if the code point is a surrogate (from U+D800 to U+DFFF). |
119 | #[inline] | |
120 | pub fn to_char_lossy(&self) -> char { | |
121 | self.to_char().unwrap_or('\u{FFFD}') | |
122 | } | |
123 | } | |
124 | ||
/// An owned, growable buffer holding well-formed WTF-8 data.
///
/// It plays the role `String` plays for UTF-8, except that it may also
/// hold surrogate code points as long as they do not form a surrogate pair.
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct Wtf8Buf {
    bytes: Vec<u8>,
}
133 | ||
134 | impl ops::Deref for Wtf8Buf { | |
135 | type Target = Wtf8; | |
136 | ||
137 | fn deref(&self) -> &Wtf8 { | |
138 | self.as_slice() | |
139 | } | |
140 | } | |
141 | ||
142 | /// Format the string with double quotes, | |
143 | /// and surrogates as `\u` followed by four hexadecimal digits. | |
144 | /// Example: `"a\u{D800}"` for a string with code points [U+0061, U+D800] | |
145 | impl fmt::Debug for Wtf8Buf { | |
146 | #[inline] | |
147 | fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> { | |
148 | fmt::Debug::fmt(&**self, formatter) | |
149 | } | |
150 | } | |
151 | ||
152 | impl Wtf8Buf { | |
9346a6ac | 153 | /// Creates an new, empty WTF-8 string. |
85aaf69f SL |
154 | #[inline] |
155 | pub fn new() -> Wtf8Buf { | |
156 | Wtf8Buf { bytes: Vec::new() } | |
157 | } | |
158 | ||
9346a6ac | 159 | /// Creates an new, empty WTF-8 string with pre-allocated capacity for `n` bytes. |
85aaf69f | 160 | #[inline] |
c34b1796 | 161 | pub fn with_capacity(n: usize) -> Wtf8Buf { |
85aaf69f SL |
162 | Wtf8Buf { bytes: Vec::with_capacity(n) } |
163 | } | |
164 | ||
d9579d0f | 165 | /// Creates a WTF-8 string from a UTF-8 `String`. |
85aaf69f SL |
166 | /// |
167 | /// This takes ownership of the `String` and does not copy. | |
168 | /// | |
169 | /// Since WTF-8 is a superset of UTF-8, this always succeeds. | |
170 | #[inline] | |
171 | pub fn from_string(string: String) -> Wtf8Buf { | |
172 | Wtf8Buf { bytes: string.into_bytes() } | |
173 | } | |
174 | ||
d9579d0f | 175 | /// Creates a WTF-8 string from a UTF-8 `&str` slice. |
85aaf69f SL |
176 | /// |
177 | /// This copies the content of the slice. | |
178 | /// | |
179 | /// Since WTF-8 is a superset of UTF-8, this always succeeds. | |
180 | #[inline] | |
181 | pub fn from_str(str: &str) -> Wtf8Buf { | |
c34b1796 | 182 | Wtf8Buf { bytes: <[_]>::to_vec(str.as_bytes()) } |
85aaf69f SL |
183 | } |
184 | ||
9346a6ac | 185 | /// Creates a WTF-8 string from a potentially ill-formed UTF-16 slice of 16-bit code units. |
85aaf69f SL |
186 | /// |
187 | /// This is lossless: calling `.encode_wide()` on the resulting string | |
188 | /// will always return the original code units. | |
189 | pub fn from_wide(v: &[u16]) -> Wtf8Buf { | |
190 | let mut string = Wtf8Buf::with_capacity(v.len()); | |
191 | for item in utf16_items(v) { | |
192 | match item { | |
193 | Utf16Item::ScalarValue(c) => string.push_char(c), | |
194 | Utf16Item::LoneSurrogate(s) => { | |
195 | // Surrogates are known to be in the code point range. | |
196 | let code_point = unsafe { CodePoint::from_u32_unchecked(s as u32) }; | |
197 | // Skip the WTF-8 concatenation check, | |
198 | // surrogate pairs are already decoded by utf16_items | |
199 | string.push_code_point_unchecked(code_point) | |
200 | } | |
201 | } | |
202 | } | |
203 | string | |
204 | } | |
205 | ||
206 | /// Copied from String::push | |
207 | /// This does **not** include the WTF-8 concatenation check. | |
208 | fn push_code_point_unchecked(&mut self, code_point: CodePoint) { | |
209 | let cur_len = self.len(); | |
210 | // This may use up to 4 bytes. | |
211 | self.reserve(4); | |
212 | ||
213 | unsafe { | |
214 | // Attempt to not use an intermediate buffer by just pushing bytes | |
215 | // directly onto this string. | |
c34b1796 | 216 | let slice = slice::from_raw_parts_mut( |
c1a9b12d | 217 | self.bytes.as_mut_ptr().offset(cur_len as isize), 4 |
c34b1796 | 218 | ); |
c1a9b12d | 219 | let used = encode_utf8_raw(code_point.value, slice).unwrap(); |
85aaf69f SL |
220 | self.bytes.set_len(cur_len + used); |
221 | } | |
222 | } | |
223 | ||
224 | #[inline] | |
225 | pub fn as_slice(&self) -> &Wtf8 { | |
c1a9b12d | 226 | unsafe { Wtf8::from_bytes_unchecked(&self.bytes) } |
85aaf69f SL |
227 | } |
228 | ||
229 | /// Reserves capacity for at least `additional` more bytes to be inserted | |
230 | /// in the given `Wtf8Buf`. | |
231 | /// The collection may reserve more space to avoid frequent reallocations. | |
232 | /// | |
233 | /// # Panics | |
234 | /// | |
c34b1796 | 235 | /// Panics if the new capacity overflows `usize`. |
85aaf69f | 236 | #[inline] |
c34b1796 | 237 | pub fn reserve(&mut self, additional: usize) { |
85aaf69f SL |
238 | self.bytes.reserve(additional) |
239 | } | |
240 | ||
241 | /// Returns the number of bytes that this string buffer can hold without reallocating. | |
242 | #[inline] | |
c34b1796 | 243 | pub fn capacity(&self) -> usize { |
85aaf69f SL |
244 | self.bytes.capacity() |
245 | } | |
246 | ||
d9579d0f | 247 | /// Append a UTF-8 slice at the end of the string. |
85aaf69f SL |
248 | #[inline] |
249 | pub fn push_str(&mut self, other: &str) { | |
250 | self.bytes.push_all(other.as_bytes()) | |
251 | } | |
252 | ||
253 | /// Append a WTF-8 slice at the end of the string. | |
254 | /// | |
255 | /// This replaces newly paired surrogates at the boundary | |
256 | /// with a supplementary code point, | |
257 | /// like concatenating ill-formed UTF-16 strings effectively would. | |
258 | #[inline] | |
259 | pub fn push_wtf8(&mut self, other: &Wtf8) { | |
260 | match ((&*self).final_lead_surrogate(), other.initial_trail_surrogate()) { | |
261 | // Replace newly paired surrogates by a supplementary code point. | |
262 | (Some(lead), Some(trail)) => { | |
263 | let len_without_lead_surrogate = self.len() - 3; | |
264 | self.bytes.truncate(len_without_lead_surrogate); | |
265 | let other_without_trail_surrogate = &other.bytes[3..]; | |
266 | // 4 bytes for the supplementary code point | |
267 | self.bytes.reserve(4 + other_without_trail_surrogate.len()); | |
268 | self.push_char(decode_surrogate_pair(lead, trail)); | |
269 | self.bytes.push_all(other_without_trail_surrogate); | |
270 | } | |
271 | _ => self.bytes.push_all(&other.bytes) | |
272 | } | |
273 | } | |
274 | ||
275 | /// Append a Unicode scalar value at the end of the string. | |
276 | #[inline] | |
277 | pub fn push_char(&mut self, c: char) { | |
278 | self.push_code_point_unchecked(CodePoint::from_char(c)) | |
279 | } | |
280 | ||
281 | /// Append a code point at the end of the string. | |
282 | /// | |
283 | /// This replaces newly paired surrogates at the boundary | |
284 | /// with a supplementary code point, | |
285 | /// like concatenating ill-formed UTF-16 strings effectively would. | |
286 | #[inline] | |
287 | pub fn push(&mut self, code_point: CodePoint) { | |
288 | match code_point.to_u32() { | |
289 | trail @ 0xDC00...0xDFFF => { | |
290 | match (&*self).final_lead_surrogate() { | |
291 | Some(lead) => { | |
292 | let len_without_lead_surrogate = self.len() - 3; | |
293 | self.bytes.truncate(len_without_lead_surrogate); | |
294 | self.push_char(decode_surrogate_pair(lead, trail as u16)); | |
295 | return | |
296 | } | |
297 | _ => {} | |
298 | } | |
299 | } | |
300 | _ => {} | |
301 | } | |
302 | ||
303 | // No newly paired surrogates at the boundary. | |
304 | self.push_code_point_unchecked(code_point) | |
305 | } | |
306 | ||
307 | /// Shortens a string to the specified length. | |
308 | /// | |
309 | /// # Panics | |
310 | /// | |
311 | /// Panics if `new_len` > current length, | |
312 | /// or if `new_len` is not a code point boundary. | |
313 | #[inline] | |
c34b1796 | 314 | pub fn truncate(&mut self, new_len: usize) { |
85aaf69f SL |
315 | assert!(is_code_point_boundary(self, new_len)); |
316 | self.bytes.truncate(new_len) | |
317 | } | |
318 | ||
9346a6ac | 319 | /// Consumes the WTF-8 string and tries to convert it to UTF-8. |
85aaf69f SL |
320 | /// |
321 | /// This does not copy the data. | |
322 | /// | |
323 | /// If the contents are not well-formed UTF-8 | |
324 | /// (that is, if the string contains surrogates), | |
325 | /// the original WTF-8 string is returned instead. | |
326 | pub fn into_string(self) -> Result<String, Wtf8Buf> { | |
327 | match self.next_surrogate(0) { | |
328 | None => Ok(unsafe { String::from_utf8_unchecked(self.bytes) }), | |
329 | Some(_) => Err(self), | |
330 | } | |
331 | } | |
332 | ||
9346a6ac | 333 | /// Consumes the WTF-8 string and converts it lossily to UTF-8. |
85aaf69f SL |
334 | /// |
335 | /// This does not copy the data (but may overwrite parts of it in place). | |
336 | /// | |
337 | /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”) | |
338 | pub fn into_string_lossy(mut self) -> String { | |
339 | let mut pos = 0; | |
340 | loop { | |
341 | match self.next_surrogate(pos) { | |
342 | Some((surrogate_pos, _)) => { | |
343 | pos = surrogate_pos + 3; | |
344 | slice::bytes::copy_memory( | |
c34b1796 | 345 | UTF8_REPLACEMENT_CHARACTER, |
85aaf69f | 346 | &mut self.bytes[surrogate_pos .. pos], |
85aaf69f SL |
347 | ); |
348 | }, | |
349 | None => return unsafe { String::from_utf8_unchecked(self.bytes) } | |
350 | } | |
351 | } | |
352 | } | |
353 | } | |
354 | ||
355 | /// Create a new WTF-8 string from an iterator of code points. | |
356 | /// | |
357 | /// This replaces surrogate code point pairs with supplementary code points, | |
358 | /// like concatenating ill-formed UTF-16 strings effectively would. | |
359 | impl FromIterator<CodePoint> for Wtf8Buf { | |
360 | fn from_iter<T: IntoIterator<Item=CodePoint>>(iter: T) -> Wtf8Buf { | |
361 | let mut string = Wtf8Buf::new(); | |
362 | string.extend(iter); | |
363 | string | |
364 | } | |
365 | } | |
366 | ||
367 | /// Append code points from an iterator to the string. | |
368 | /// | |
369 | /// This replaces surrogate code point pairs with supplementary code points, | |
370 | /// like concatenating ill-formed UTF-16 strings effectively would. | |
371 | impl Extend<CodePoint> for Wtf8Buf { | |
372 | fn extend<T: IntoIterator<Item=CodePoint>>(&mut self, iterable: T) { | |
373 | let iterator = iterable.into_iter(); | |
374 | let (low, _high) = iterator.size_hint(); | |
375 | // Lower bound of one byte per code point (ASCII only) | |
376 | self.bytes.reserve(low); | |
377 | for code_point in iterator { | |
378 | self.push(code_point); | |
379 | } | |
380 | } | |
381 | } | |
382 | ||
/// A borrowed slice of well-formed WTF-8 data.
///
/// It plays the role `&str` plays for UTF-8, except that it may also
/// hold surrogate code points as long as they do not form a surrogate pair.
pub struct Wtf8 {
    bytes: [u8],
}
390 | ||
391 | impl AsInner<[u8]> for Wtf8 { | |
392 | fn as_inner(&self) -> &[u8] { &self.bytes } | |
393 | } | |
394 | ||
395 | // FIXME: https://github.com/rust-lang/rust/issues/18805 | |
396 | impl PartialEq for Wtf8 { | |
397 | fn eq(&self, other: &Wtf8) -> bool { self.bytes.eq(&other.bytes) } | |
398 | } | |
399 | ||
400 | // FIXME: https://github.com/rust-lang/rust/issues/18805 | |
401 | impl Eq for Wtf8 {} | |
402 | ||
403 | // FIXME: https://github.com/rust-lang/rust/issues/18738 | |
404 | impl PartialOrd for Wtf8 { | |
405 | #[inline] | |
406 | fn partial_cmp(&self, other: &Wtf8) -> Option<cmp::Ordering> { | |
407 | self.bytes.partial_cmp(&other.bytes) | |
408 | } | |
409 | #[inline] | |
410 | fn lt(&self, other: &Wtf8) -> bool { self.bytes.lt(&other.bytes) } | |
411 | #[inline] | |
412 | fn le(&self, other: &Wtf8) -> bool { self.bytes.le(&other.bytes) } | |
413 | #[inline] | |
414 | fn gt(&self, other: &Wtf8) -> bool { self.bytes.gt(&other.bytes) } | |
415 | #[inline] | |
416 | fn ge(&self, other: &Wtf8) -> bool { self.bytes.ge(&other.bytes) } | |
417 | } | |
418 | ||
419 | // FIXME: https://github.com/rust-lang/rust/issues/18738 | |
420 | impl Ord for Wtf8 { | |
421 | #[inline] | |
422 | fn cmp(&self, other: &Wtf8) -> cmp::Ordering { self.bytes.cmp(&other.bytes) } | |
423 | } | |
424 | ||
425 | /// Format the slice with double quotes, | |
426 | /// and surrogates as `\u` followed by four hexadecimal digits. | |
427 | /// Example: `"a\u{D800}"` for a slice with code points [U+0061, U+D800] | |
428 | impl fmt::Debug for Wtf8 { | |
c1a9b12d SL |
429 | fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result { |
430 | fn write_str_escaped(f: &mut fmt::Formatter, s: &str) -> fmt::Result { | |
431 | use fmt::Write; | |
432 | for c in s.chars().flat_map(|c| c.escape_default()) { | |
433 | try!(f.write_char(c)) | |
434 | } | |
435 | Ok(()) | |
436 | } | |
437 | ||
85aaf69f SL |
438 | try!(formatter.write_str("\"")); |
439 | let mut pos = 0; | |
440 | loop { | |
441 | match self.next_surrogate(pos) { | |
442 | None => break, | |
443 | Some((surrogate_pos, surrogate)) => { | |
c1a9b12d SL |
444 | try!(write_str_escaped( |
445 | formatter, | |
446 | unsafe { str::from_utf8_unchecked( | |
447 | &self.bytes[pos .. surrogate_pos] | |
448 | )}, | |
449 | )); | |
85aaf69f SL |
450 | try!(write!(formatter, "\\u{{{:X}}}", surrogate)); |
451 | pos = surrogate_pos + 3; | |
452 | } | |
453 | } | |
454 | } | |
c1a9b12d SL |
455 | try!(write_str_escaped( |
456 | formatter, | |
457 | unsafe { str::from_utf8_unchecked(&self.bytes[pos..]) }, | |
458 | )); | |
85aaf69f SL |
459 | formatter.write_str("\"") |
460 | } | |
461 | } | |
462 | ||
463 | impl Wtf8 { | |
9346a6ac | 464 | /// Creates a WTF-8 slice from a UTF-8 `&str` slice. |
85aaf69f SL |
465 | /// |
466 | /// Since WTF-8 is a superset of UTF-8, this always succeeds. | |
467 | #[inline] | |
468 | pub fn from_str(value: &str) -> &Wtf8 { | |
c1a9b12d SL |
469 | unsafe { Wtf8::from_bytes_unchecked(value.as_bytes()) } |
470 | } | |
471 | ||
472 | /// Creates a WTF-8 slice from a WTF-8 byte slice. | |
473 | /// | |
474 | /// Since the byte slice is not checked for valid WTF-8, this functions is | |
475 | /// marked unsafe. | |
476 | #[inline] | |
477 | unsafe fn from_bytes_unchecked(value: &[u8]) -> &Wtf8 { | |
478 | mem::transmute(value) | |
85aaf69f SL |
479 | } |
480 | ||
9346a6ac | 481 | /// Returns the length, in WTF-8 bytes. |
85aaf69f | 482 | #[inline] |
c34b1796 | 483 | pub fn len(&self) -> usize { |
85aaf69f SL |
484 | self.bytes.len() |
485 | } | |
486 | ||
9346a6ac | 487 | /// Returns the code point at `position` if it is in the ASCII range, |
85aaf69f SL |
488 | /// or `b'\xFF' otherwise. |
489 | /// | |
490 | /// # Panics | |
491 | /// | |
492 | /// Panics if `position` is beyond the end of the string. | |
493 | #[inline] | |
c34b1796 | 494 | pub fn ascii_byte_at(&self, position: usize) -> u8 { |
85aaf69f SL |
495 | match self.bytes[position] { |
496 | ascii_byte @ 0x00 ... 0x7F => ascii_byte, | |
497 | _ => 0xFF | |
498 | } | |
499 | } | |
500 | ||
9346a6ac | 501 | /// Returns an iterator for the string’s code points. |
85aaf69f SL |
502 | #[inline] |
503 | pub fn code_points(&self) -> Wtf8CodePoints { | |
504 | Wtf8CodePoints { bytes: self.bytes.iter() } | |
505 | } | |
506 | ||
9346a6ac | 507 | /// Tries to convert the string to UTF-8 and return a `&str` slice. |
85aaf69f | 508 | /// |
9346a6ac | 509 | /// Returns `None` if the string contains surrogates. |
85aaf69f SL |
510 | /// |
511 | /// This does not copy the data. | |
512 | #[inline] | |
513 | pub fn as_str(&self) -> Option<&str> { | |
514 | // Well-formed WTF-8 is also well-formed UTF-8 | |
515 | // if and only if it contains no surrogate. | |
516 | match self.next_surrogate(0) { | |
517 | None => Some(unsafe { str::from_utf8_unchecked(&self.bytes) }), | |
518 | Some(_) => None, | |
519 | } | |
520 | } | |
521 | ||
9346a6ac | 522 | /// Lossily converts the string to UTF-8. |
d9579d0f | 523 | /// Returns a UTF-8 `&str` slice if the contents are well-formed in UTF-8. |
85aaf69f SL |
524 | /// |
525 | /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”). | |
526 | /// | |
527 | /// This only copies the data if necessary (if it contains any surrogate). | |
c34b1796 | 528 | pub fn to_string_lossy(&self) -> Cow<str> { |
85aaf69f SL |
529 | let surrogate_pos = match self.next_surrogate(0) { |
530 | None => return Cow::Borrowed(unsafe { str::from_utf8_unchecked(&self.bytes) }), | |
531 | Some((pos, _)) => pos, | |
532 | }; | |
533 | let wtf8_bytes = &self.bytes; | |
534 | let mut utf8_bytes = Vec::with_capacity(self.len()); | |
535 | utf8_bytes.push_all(&wtf8_bytes[..surrogate_pos]); | |
536 | utf8_bytes.push_all(UTF8_REPLACEMENT_CHARACTER); | |
537 | let mut pos = surrogate_pos + 3; | |
538 | loop { | |
539 | match self.next_surrogate(pos) { | |
540 | Some((surrogate_pos, _)) => { | |
541 | utf8_bytes.push_all(&wtf8_bytes[pos .. surrogate_pos]); | |
542 | utf8_bytes.push_all(UTF8_REPLACEMENT_CHARACTER); | |
543 | pos = surrogate_pos + 3; | |
544 | }, | |
545 | None => { | |
546 | utf8_bytes.push_all(&wtf8_bytes[pos..]); | |
547 | return Cow::Owned(unsafe { String::from_utf8_unchecked(utf8_bytes) }) | |
548 | } | |
549 | } | |
550 | } | |
551 | } | |
552 | ||
9346a6ac | 553 | /// Converts the WTF-8 string to potentially ill-formed UTF-16 |
85aaf69f SL |
554 | /// and return an iterator of 16-bit code units. |
555 | /// | |
556 | /// This is lossless: | |
557 | /// calling `Wtf8Buf::from_ill_formed_utf16` on the resulting code units | |
558 | /// would always return the original WTF-8 string. | |
559 | #[inline] | |
560 | pub fn encode_wide(&self) -> EncodeWide { | |
561 | EncodeWide { code_points: self.code_points(), extra: 0 } | |
562 | } | |
563 | ||
564 | #[inline] | |
c34b1796 | 565 | fn next_surrogate(&self, mut pos: usize) -> Option<(usize, u16)> { |
85aaf69f SL |
566 | let mut iter = self.bytes[pos..].iter(); |
567 | loop { | |
568 | let b = match iter.next() { | |
569 | None => return None, | |
570 | Some(&b) => b, | |
571 | }; | |
572 | if b < 0x80 { | |
573 | pos += 1; | |
574 | } else if b < 0xE0 { | |
575 | iter.next(); | |
576 | pos += 2; | |
577 | } else if b == 0xED { | |
578 | match (iter.next(), iter.next()) { | |
579 | (Some(&b2), Some(&b3)) if b2 >= 0xA0 => { | |
580 | return Some((pos, decode_surrogate(b2, b3))) | |
581 | } | |
582 | _ => pos += 3 | |
583 | } | |
584 | } else if b < 0xF0 { | |
585 | iter.next(); | |
586 | iter.next(); | |
587 | pos += 3; | |
588 | } else { | |
589 | iter.next(); | |
590 | iter.next(); | |
591 | iter.next(); | |
592 | pos += 4; | |
593 | } | |
594 | } | |
595 | } | |
596 | ||
597 | #[inline] | |
598 | fn final_lead_surrogate(&self) -> Option<u16> { | |
599 | let len = self.len(); | |
600 | if len < 3 { | |
601 | return None | |
602 | } | |
603 | match &self.bytes[(len - 3)..] { | |
604 | [0xED, b2 @ 0xA0...0xAF, b3] => Some(decode_surrogate(b2, b3)), | |
605 | _ => None | |
606 | } | |
607 | } | |
608 | ||
609 | #[inline] | |
610 | fn initial_trail_surrogate(&self) -> Option<u16> { | |
611 | let len = self.len(); | |
612 | if len < 3 { | |
613 | return None | |
614 | } | |
615 | match &self.bytes[..3] { | |
616 | [0xED, b2 @ 0xB0...0xBF, b3] => Some(decode_surrogate(b2, b3)), | |
617 | _ => None | |
618 | } | |
619 | } | |
620 | } | |
621 | ||
622 | ||
623 | /// Return a slice of the given string for the byte range [`begin`..`end`). | |
624 | /// | |
625 | /// # Panics | |
626 | /// | |
627 | /// Panics when `begin` and `end` do not point to code point boundaries, | |
628 | /// or point beyond the end of the string. | |
629 | impl ops::Index<ops::Range<usize>> for Wtf8 { | |
630 | type Output = Wtf8; | |
631 | ||
632 | #[inline] | |
c34b1796 | 633 | fn index(&self, range: ops::Range<usize>) -> &Wtf8 { |
85aaf69f SL |
634 | // is_code_point_boundary checks that the index is in [0, .len()] |
635 | if range.start <= range.end && | |
636 | is_code_point_boundary(self, range.start) && | |
637 | is_code_point_boundary(self, range.end) { | |
638 | unsafe { slice_unchecked(self, range.start, range.end) } | |
639 | } else { | |
640 | slice_error_fail(self, range.start, range.end) | |
641 | } | |
642 | } | |
643 | } | |
644 | ||
645 | /// Return a slice of the given string from byte `begin` to its end. | |
646 | /// | |
647 | /// # Panics | |
648 | /// | |
649 | /// Panics when `begin` is not at a code point boundary, | |
650 | /// or is beyond the end of the string. | |
651 | impl ops::Index<ops::RangeFrom<usize>> for Wtf8 { | |
652 | type Output = Wtf8; | |
653 | ||
654 | #[inline] | |
c34b1796 | 655 | fn index(&self, range: ops::RangeFrom<usize>) -> &Wtf8 { |
85aaf69f SL |
656 | // is_code_point_boundary checks that the index is in [0, .len()] |
657 | if is_code_point_boundary(self, range.start) { | |
658 | unsafe { slice_unchecked(self, range.start, self.len()) } | |
659 | } else { | |
660 | slice_error_fail(self, range.start, self.len()) | |
661 | } | |
662 | } | |
663 | } | |
664 | ||
665 | /// Return a slice of the given string from its beginning to byte `end`. | |
666 | /// | |
667 | /// # Panics | |
668 | /// | |
669 | /// Panics when `end` is not at a code point boundary, | |
670 | /// or is beyond the end of the string. | |
671 | impl ops::Index<ops::RangeTo<usize>> for Wtf8 { | |
672 | type Output = Wtf8; | |
673 | ||
674 | #[inline] | |
c34b1796 | 675 | fn index(&self, range: ops::RangeTo<usize>) -> &Wtf8 { |
85aaf69f SL |
676 | // is_code_point_boundary checks that the index is in [0, .len()] |
677 | if is_code_point_boundary(self, range.end) { | |
678 | unsafe { slice_unchecked(self, 0, range.end) } | |
679 | } else { | |
680 | slice_error_fail(self, 0, range.end) | |
681 | } | |
682 | } | |
683 | } | |
684 | ||
685 | impl ops::Index<ops::RangeFull> for Wtf8 { | |
686 | type Output = Wtf8; | |
687 | ||
688 | #[inline] | |
c34b1796 | 689 | fn index(&self, _range: ops::RangeFull) -> &Wtf8 { |
85aaf69f SL |
690 | self |
691 | } | |
692 | } | |
693 | ||
/// Decodes the 16-bit surrogate value from the second and third bytes of
/// its three-byte encoding. The first byte is assumed to be 0xED.
#[inline]
fn decode_surrogate(second_byte: u8, third_byte: u8) -> u16 {
    // Each continuation byte contributes its low six bits.
    let high_bits = (second_byte as u16 & 0x3F) << 6;
    let low_bits = third_byte as u16 & 0x3F;
    0xD800 | high_bits | low_bits
}
699 | ||
/// Combines a lead surrogate (from 0xD800) and a trail surrogate
/// (from 0xDC00) into the supplementary scalar value they encode.
#[inline]
fn decode_surrogate_pair(lead: u16, trail: u16) -> char {
    // Each surrogate carries ten bits of the offset above U+10000.
    let high = (lead - 0xD800) as u32;
    let low = (trail - 0xDC00) as u32;
    let code_point = 0x10000 + ((high << 10) | low);
    unsafe { char::from_u32_unchecked(code_point) }
}
705 | ||
706 | /// Copied from core::str::StrPrelude::is_char_boundary | |
707 | #[inline] | |
c34b1796 | 708 | pub fn is_code_point_boundary(slice: &Wtf8, index: usize) -> bool { |
85aaf69f SL |
709 | if index == slice.len() { return true; } |
710 | match slice.bytes.get(index) { | |
711 | None => false, | |
c34b1796 | 712 | Some(&b) => b < 128 || b >= 192, |
85aaf69f SL |
713 | } |
714 | } | |
715 | ||
716 | /// Copied from core::str::raw::slice_unchecked | |
717 | #[inline] | |
c34b1796 AL |
718 | pub unsafe fn slice_unchecked(s: &Wtf8, begin: usize, end: usize) -> &Wtf8 { |
719 | // memory layout of an &[u8] and &Wtf8 are the same | |
c1a9b12d | 720 | Wtf8::from_bytes_unchecked(slice::from_raw_parts( |
c34b1796 AL |
721 | s.bytes.as_ptr().offset(begin as isize), |
722 | end - begin | |
723 | )) | |
85aaf69f SL |
724 | } |
725 | ||
726 | /// Copied from core::str::raw::slice_error_fail | |
727 | #[inline(never)] | |
c34b1796 | 728 | pub fn slice_error_fail(s: &Wtf8, begin: usize, end: usize) -> ! { |
85aaf69f SL |
729 | assert!(begin <= end); |
730 | panic!("index {} and/or {} in `{:?}` do not lie on character boundary", | |
731 | begin, end, s); | |
732 | } | |
733 | ||
/// Iterator over the code points of a WTF-8 string.
///
/// Obtained by calling `.code_points()` on a `Wtf8`.
#[derive(Clone)]
pub struct Wtf8CodePoints<'a> {
    bytes: slice::Iter<'a, u8>,
}
741 | ||
742 | impl<'a> Iterator for Wtf8CodePoints<'a> { | |
743 | type Item = CodePoint; | |
744 | ||
745 | #[inline] | |
746 | fn next(&mut self) -> Option<CodePoint> { | |
747 | next_code_point(&mut self.bytes).map(|c| CodePoint { value: c }) | |
748 | } | |
749 | ||
750 | #[inline] | |
c34b1796 | 751 | fn size_hint(&self) -> (usize, Option<usize>) { |
85aaf69f SL |
752 | let (len, _) = self.bytes.size_hint(); |
753 | (len.saturating_add(3) / 4, Some(len)) | |
754 | } | |
755 | } | |
756 | ||
757 | #[derive(Clone)] | |
758 | pub struct EncodeWide<'a> { | |
759 | code_points: Wtf8CodePoints<'a>, | |
760 | extra: u16 | |
761 | } | |
762 | ||
763 | // Copied from libunicode/u_str.rs | |
764 | impl<'a> Iterator for EncodeWide<'a> { | |
765 | type Item = u16; | |
766 | ||
767 | #[inline] | |
768 | fn next(&mut self) -> Option<u16> { | |
769 | if self.extra != 0 { | |
770 | let tmp = self.extra; | |
771 | self.extra = 0; | |
772 | return Some(tmp); | |
773 | } | |
774 | ||
c34b1796 | 775 | let mut buf = [0; 2]; |
85aaf69f SL |
776 | self.code_points.next().map(|code_point| { |
777 | let n = encode_utf16_raw(code_point.value, &mut buf) | |
778 | .unwrap_or(0); | |
779 | if n == 2 { self.extra = buf[1]; } | |
780 | buf[0] | |
781 | }) | |
782 | } | |
783 | ||
784 | #[inline] | |
c34b1796 | 785 | fn size_hint(&self) -> (usize, Option<usize>) { |
85aaf69f SL |
786 | let (low, high) = self.code_points.size_hint(); |
787 | // every code point gets either one u16 or two u16, | |
788 | // so this iterator is between 1 or 2 times as | |
789 | // long as the underlying iterator. | |
790 | (low, high.and_then(|n| n.checked_mul(2))) | |
791 | } | |
792 | } | |
793 | ||
85aaf69f SL |
794 | impl Hash for CodePoint { |
795 | #[inline] | |
796 | fn hash<H: Hasher>(&self, state: &mut H) { | |
797 | self.value.hash(state) | |
798 | } | |
799 | } | |
800 | ||
85aaf69f SL |
801 | impl Hash for Wtf8Buf { |
802 | #[inline] | |
803 | fn hash<H: Hasher>(&self, state: &mut H) { | |
804 | state.write(&self.bytes); | |
805 | 0xfeu8.hash(state) | |
806 | } | |
807 | } | |
808 | ||
85aaf69f SL |
809 | impl Hash for Wtf8 { |
810 | #[inline] | |
811 | fn hash<H: Hasher>(&self, state: &mut H) { | |
812 | state.write(&self.bytes); | |
813 | 0xfeu8.hash(state) | |
814 | } | |
815 | } | |
816 | ||
817 | impl AsciiExt for Wtf8 { | |
818 | type Owned = Wtf8Buf; | |
819 | ||
820 | fn is_ascii(&self) -> bool { | |
821 | self.bytes.is_ascii() | |
822 | } | |
823 | fn to_ascii_uppercase(&self) -> Wtf8Buf { | |
824 | Wtf8Buf { bytes: self.bytes.to_ascii_uppercase() } | |
825 | } | |
826 | fn to_ascii_lowercase(&self) -> Wtf8Buf { | |
827 | Wtf8Buf { bytes: self.bytes.to_ascii_lowercase() } | |
828 | } | |
829 | fn eq_ignore_ascii_case(&self, other: &Wtf8) -> bool { | |
830 | self.bytes.eq_ignore_ascii_case(&other.bytes) | |
831 | } | |
832 | ||
833 | fn make_ascii_uppercase(&mut self) { self.bytes.make_ascii_uppercase() } | |
834 | fn make_ascii_lowercase(&mut self) { self.bytes.make_ascii_lowercase() } | |
835 | } | |
836 | ||
/// Unit tests for `CodePoint`, `Wtf8Buf` and `Wtf8`.
///
/// Byte-level expectations are spelled out as literals so that the exact
/// encoding produced for surrogates and surrogate pairs is pinned down.
#[cfg(test)]
mod tests {
    use prelude::v1::*;
    use borrow::Cow;
    use super::*;

    // ---- CodePoint -------------------------------------------------------

    /// Surrogates are accepted; values above U+10FFFF are rejected.
    #[test]
    fn code_point_from_u32() {
        assert!(CodePoint::from_u32(0).is_some());
        assert!(CodePoint::from_u32(0xD800).is_some());
        assert!(CodePoint::from_u32(0x10FFFF).is_some());
        assert!(CodePoint::from_u32(0x110000).is_none());
    }

    /// `to_u32` round-trips the value given to `from_u32`.
    #[test]
    fn code_point_to_u32() {
        fn c(value: u32) -> CodePoint { CodePoint::from_u32(value).unwrap() }
        assert_eq!(c(0).to_u32(), 0);
        assert_eq!(c(0xD800).to_u32(), 0xD800);
        assert_eq!(c(0x10FFFF).to_u32(), 0x10FFFF);
    }

    #[test]
    fn code_point_from_char() {
        assert_eq!(CodePoint::from_char('a').to_u32(), 0x61);
        assert_eq!(CodePoint::from_char('💩').to_u32(), 0x1F4A9);
    }

    /// `Debug` output uses the `U+XXXX` notation.
    #[test]
    fn code_point_to_string() {
        assert_eq!(format!("{:?}", CodePoint::from_char('a')), "U+0061");
        assert_eq!(format!("{:?}", CodePoint::from_char('💩')), "U+1F4A9");
    }

    /// A surrogate has no `char` equivalent.
    #[test]
    fn code_point_to_char() {
        fn c(value: u32) -> CodePoint { CodePoint::from_u32(value).unwrap() }
        assert_eq!(c(0x61).to_char(), Some('a'));
        assert_eq!(c(0x1F4A9).to_char(), Some('💩'));
        assert_eq!(c(0xD800).to_char(), None);
    }

    /// A surrogate is replaced by U+FFFD in the lossy conversion.
    #[test]
    fn code_point_to_char_lossy() {
        fn c(value: u32) -> CodePoint { CodePoint::from_u32(value).unwrap() }
        assert_eq!(c(0x61).to_char_lossy(), 'a');
        assert_eq!(c(0x1F4A9).to_char_lossy(), '💩');
        assert_eq!(c(0xD800).to_char_lossy(), '\u{FFFD}');
    }

    // ---- Wtf8Buf ---------------------------------------------------------

    #[test]
    fn wtf8buf_new() {
        assert_eq!(Wtf8Buf::new().bytes, b"");
    }

    #[test]
    fn wtf8buf_from_str() {
        assert_eq!(Wtf8Buf::from_str("").bytes, b"");
        assert_eq!(Wtf8Buf::from_str("aé 💩").bytes,
                   b"a\xC3\xA9 \xF0\x9F\x92\xA9");
    }

    #[test]
    fn wtf8buf_from_string() {
        assert_eq!(Wtf8Buf::from_string(String::from("")).bytes, b"");
        assert_eq!(Wtf8Buf::from_string(String::from("aé 💩")).bytes,
                   b"a\xC3\xA9 \xF0\x9F\x92\xA9");
    }

    /// An unpaired lead surrogate is kept as-is; a proper pair is combined.
    #[test]
    fn wtf8buf_from_wide() {
        assert_eq!(Wtf8Buf::from_wide(&[]).bytes, b"");
        assert_eq!(Wtf8Buf::from_wide(
                      &[0x61, 0xE9, 0x20, 0xD83D, 0xD83D, 0xDCA9]).bytes,
                   b"a\xC3\xA9 \xED\xA0\xBD\xF0\x9F\x92\xA9");
    }

    #[test]
    fn wtf8buf_push_str() {
        let mut string = Wtf8Buf::new();
        assert_eq!(string.bytes, b"");
        string.push_str("aé 💩");
        assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
    }

    #[test]
    fn wtf8buf_push_char() {
        let mut string = Wtf8Buf::from_str("aé ");
        assert_eq!(string.bytes, b"a\xC3\xA9 ");
        string.push_char('💩');
        assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
    }

    /// Pushing code points one at a time: a lead surrogate immediately
    /// followed by a trail surrogate is merged into one 4-byte sequence;
    /// every other combination is stored unchanged.
    #[test]
    fn wtf8buf_push() {
        let mut string = Wtf8Buf::from_str("aé ");
        assert_eq!(string.bytes, b"a\xC3\xA9 ");
        string.push(CodePoint::from_char('💩'));
        assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");

        fn c(value: u32) -> CodePoint { CodePoint::from_u32(value).unwrap() }

        let mut string = Wtf8Buf::new();
        string.push(c(0xD83D));  // lead
        string.push(c(0xDCA9));  // trail
        assert_eq!(string.bytes, b"\xF0\x9F\x92\xA9");  // Magic!

        let mut string = Wtf8Buf::new();
        string.push(c(0xD83D));  // lead
        string.push(c(0x20));  // not surrogate
        string.push(c(0xDCA9));  // trail
        assert_eq!(string.bytes, b"\xED\xA0\xBD \xED\xB2\xA9");

        let mut string = Wtf8Buf::new();
        string.push(c(0xD800));  // lead
        string.push(c(0xDBFF));  // lead
        assert_eq!(string.bytes, b"\xED\xA0\x80\xED\xAF\xBF");

        let mut string = Wtf8Buf::new();
        string.push(c(0xD800));  // lead
        string.push(c(0xE000));  // not surrogate
        assert_eq!(string.bytes, b"\xED\xA0\x80\xEE\x80\x80");

        let mut string = Wtf8Buf::new();
        string.push(c(0xD7FF));  // not surrogate
        string.push(c(0xDC00));  // trail
        assert_eq!(string.bytes, b"\xED\x9F\xBF\xED\xB0\x80");

        let mut string = Wtf8Buf::new();
        string.push(c(0x61));  // not surrogate, < 3 bytes
        string.push(c(0xDC00));  // trail
        assert_eq!(string.bytes, b"\x61\xED\xB0\x80");

        let mut string = Wtf8Buf::new();
        string.push(c(0xDC00));  // trail
        assert_eq!(string.bytes, b"\xED\xB0\x80");
    }

    /// Same matrix as `wtf8buf_push`, but appending whole `Wtf8` slices.
    #[test]
    fn wtf8buf_push_wtf8() {
        let mut string = Wtf8Buf::from_str("aé");
        assert_eq!(string.bytes, b"a\xC3\xA9");
        string.push_wtf8(Wtf8::from_str(" 💩"));
        assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");

        // Test-only shortcut: every literal passed below is a hand-written
        // encoding of a single code point (or ASCII).
        fn w(v: &[u8]) -> &Wtf8 { unsafe { Wtf8::from_bytes_unchecked(v) } }

        let mut string = Wtf8Buf::new();
        string.push_wtf8(w(b"\xED\xA0\xBD"));  // lead
        string.push_wtf8(w(b"\xED\xB2\xA9"));  // trail
        assert_eq!(string.bytes, b"\xF0\x9F\x92\xA9");  // Magic!

        let mut string = Wtf8Buf::new();
        string.push_wtf8(w(b"\xED\xA0\xBD"));  // lead
        string.push_wtf8(w(b" "));  // not surrogate
        string.push_wtf8(w(b"\xED\xB2\xA9"));  // trail
        assert_eq!(string.bytes, b"\xED\xA0\xBD \xED\xB2\xA9");

        let mut string = Wtf8Buf::new();
        string.push_wtf8(w(b"\xED\xA0\x80"));  // lead
        string.push_wtf8(w(b"\xED\xAF\xBF"));  // lead
        assert_eq!(string.bytes, b"\xED\xA0\x80\xED\xAF\xBF");

        let mut string = Wtf8Buf::new();
        string.push_wtf8(w(b"\xED\xA0\x80"));  // lead
        string.push_wtf8(w(b"\xEE\x80\x80"));  // not surrogate
        assert_eq!(string.bytes, b"\xED\xA0\x80\xEE\x80\x80");

        let mut string = Wtf8Buf::new();
        string.push_wtf8(w(b"\xED\x9F\xBF"));  // not surrogate
        string.push_wtf8(w(b"\xED\xB0\x80"));  // trail
        assert_eq!(string.bytes, b"\xED\x9F\xBF\xED\xB0\x80");

        let mut string = Wtf8Buf::new();
        string.push_wtf8(w(b"a"));  // not surrogate, < 3 bytes
        string.push_wtf8(w(b"\xED\xB0\x80"));  // trail
        assert_eq!(string.bytes, b"\x61\xED\xB0\x80");

        let mut string = Wtf8Buf::new();
        string.push_wtf8(w(b"\xED\xB0\x80"));  // trail
        assert_eq!(string.bytes, b"\xED\xB0\x80");
    }

    #[test]
    fn wtf8buf_truncate() {
        let mut string = Wtf8Buf::from_str("aé");
        string.truncate(1);
        assert_eq!(string.bytes, b"a");
    }

    /// Index 2 falls inside the two-byte encoding of 'é'.
    #[test]
    #[should_panic]
    fn wtf8buf_truncate_fail_code_point_boundary() {
        let mut string = Wtf8Buf::from_str("aé");
        string.truncate(2);
    }

    /// "aé" is only 3 bytes long, so 4 is past the end.
    #[test]
    #[should_panic]
    fn wtf8buf_truncate_fail_longer() {
        let mut string = Wtf8Buf::from_str("aé");
        string.truncate(4);
    }

    /// A buffer holding a surrogate is handed back unchanged in `Err`.
    #[test]
    fn wtf8buf_into_string() {
        let mut string = Wtf8Buf::from_str("aé 💩");
        assert_eq!(string.clone().into_string(), Ok(String::from("aé 💩")));
        string.push(CodePoint::from_u32(0xD800).unwrap());
        assert_eq!(string.clone().into_string(), Err(string));
    }

    /// A surrogate becomes U+FFFD in the lossy conversion.
    #[test]
    fn wtf8buf_into_string_lossy() {
        let mut string = Wtf8Buf::from_str("aé 💩");
        assert_eq!(string.clone().into_string_lossy(), String::from("aé 💩"));
        string.push(CodePoint::from_u32(0xD800).unwrap());
        assert_eq!(string.clone().into_string_lossy(), String::from("aé 💩\u{FFFD}"));
    }

    /// Collecting code points behaves like pushing them one by one.
    #[test]
    fn wtf8buf_from_iterator() {
        fn f(values: &[u32]) -> Wtf8Buf {
            values.iter().map(|&c| CodePoint::from_u32(c).unwrap()).collect::<Wtf8Buf>()
        }
        assert_eq!(f(&[0x61, 0xE9, 0x20, 0x1F4A9]).bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");

        assert_eq!(f(&[0xD83D, 0xDCA9]).bytes, b"\xF0\x9F\x92\xA9");  // Magic!
        assert_eq!(f(&[0xD83D, 0x20, 0xDCA9]).bytes, b"\xED\xA0\xBD \xED\xB2\xA9");
        assert_eq!(f(&[0xD800, 0xDBFF]).bytes, b"\xED\xA0\x80\xED\xAF\xBF");
        assert_eq!(f(&[0xD800, 0xE000]).bytes, b"\xED\xA0\x80\xEE\x80\x80");
        assert_eq!(f(&[0xD7FF, 0xDC00]).bytes, b"\xED\x9F\xBF\xED\xB0\x80");
        assert_eq!(f(&[0x61, 0xDC00]).bytes, b"\x61\xED\xB0\x80");
        assert_eq!(f(&[0xDC00]).bytes, b"\xED\xB0\x80");
    }

    /// Extending joins a lead surrogate at the end of the existing buffer
    /// with a trail surrogate at the start of the new items.
    #[test]
    fn wtf8buf_extend() {
        fn e(initial: &[u32], extended: &[u32]) -> Wtf8Buf {
            fn c(value: &u32) -> CodePoint { CodePoint::from_u32(*value).unwrap() }
            let mut string = initial.iter().map(c).collect::<Wtf8Buf>();
            string.extend(extended.iter().map(c));
            string
        }

        assert_eq!(e(&[0x61, 0xE9], &[0x20, 0x1F4A9]).bytes,
                   b"a\xC3\xA9 \xF0\x9F\x92\xA9");

        assert_eq!(e(&[0xD83D], &[0xDCA9]).bytes, b"\xF0\x9F\x92\xA9");  // Magic!
        assert_eq!(e(&[0xD83D, 0x20], &[0xDCA9]).bytes, b"\xED\xA0\xBD \xED\xB2\xA9");
        assert_eq!(e(&[0xD800], &[0xDBFF]).bytes, b"\xED\xA0\x80\xED\xAF\xBF");
        assert_eq!(e(&[0xD800], &[0xE000]).bytes, b"\xED\xA0\x80\xEE\x80\x80");
        assert_eq!(e(&[0xD7FF], &[0xDC00]).bytes, b"\xED\x9F\xBF\xED\xB0\x80");
        assert_eq!(e(&[0x61], &[0xDC00]).bytes, b"\x61\xED\xB0\x80");
        assert_eq!(e(&[], &[0xDC00]).bytes, b"\xED\xB0\x80");
    }

    /// `Debug` output escapes like `str`, and prints surrogates as `\u{XXXX}`.
    #[test]
    fn wtf8buf_show() {
        let mut string = Wtf8Buf::from_str("a\té 💩\r");
        string.push(CodePoint::from_u32(0xD800).unwrap());
        assert_eq!(format!("{:?}", string), r#""a\t\u{e9} \u{1f4a9}\r\u{D800}""#);
    }

    #[test]
    fn wtf8buf_as_slice() {
        assert_eq!(Wtf8Buf::from_str("aé").as_slice(), Wtf8::from_str("aé"));
    }

    /// For valid UTF-8 content, `Debug` output matches that of `str`.
    #[test]
    fn wtf8buf_show_str() {
        let text = "a\té 💩\r";
        let string = Wtf8Buf::from_str(text);
        assert_eq!(format!("{:?}", text), format!("{:?}", string));
    }

    // ---- Wtf8 ------------------------------------------------------------

    #[test]
    fn wtf8_from_str() {
        assert_eq!(&Wtf8::from_str("").bytes, b"");
        assert_eq!(&Wtf8::from_str("aé 💩").bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
    }

    /// `len` is measured in bytes.
    #[test]
    fn wtf8_len() {
        assert_eq!(Wtf8::from_str("").len(), 0);
        assert_eq!(Wtf8::from_str("aé 💩").len(), 8);
    }

    // Byte layout of "aé 💩" used by the slicing tests below:
    //   0: 'a'   1-2: 'é'   3: ' '   4-7: '💩'

    #[test]
    fn wtf8_slice() {
        assert_eq!(&Wtf8::from_str("aé 💩")[1..4].bytes, b"\xC3\xA9 ");
    }

    /// Start index 2 falls inside 'é'.
    #[test]
    #[should_panic]
    fn wtf8_slice_not_code_point_boundary() {
        let _ = &Wtf8::from_str("aé 💩")[2..4];
    }

    #[test]
    fn wtf8_slice_from() {
        assert_eq!(&Wtf8::from_str("aé 💩")[1..].bytes, b"\xC3\xA9 \xF0\x9F\x92\xA9");
    }

    /// Start index 2 falls inside 'é'.
    #[test]
    #[should_panic]
    fn wtf8_slice_from_not_code_point_boundary() {
        let _ = &Wtf8::from_str("aé 💩")[2..];
    }

    #[test]
    fn wtf8_slice_to() {
        assert_eq!(&Wtf8::from_str("aé 💩")[..4].bytes, b"a\xC3\xA9 ");
    }

    /// End index 5 falls inside '💩'. This must use `..5` (not `5..`) so
    /// that the range-to indexing path is the one being exercised.
    #[test]
    #[should_panic]
    fn wtf8_slice_to_not_code_point_boundary() {
        let _ = &Wtf8::from_str("aé 💩")[..5];
    }

    /// Non-ASCII positions report `0xFF`.
    #[test]
    fn wtf8_ascii_byte_at() {
        let slice = Wtf8::from_str("aé 💩");
        assert_eq!(slice.ascii_byte_at(0), b'a');
        assert_eq!(slice.ascii_byte_at(1), b'\xFF');
        assert_eq!(slice.ascii_byte_at(2), b'\xFF');
        assert_eq!(slice.ascii_byte_at(3), b' ');
        assert_eq!(slice.ascii_byte_at(4), b'\xFF');
    }

    /// A lone lead surrogate iterates as a non-`char` code point; once the
    /// trail surrogate is pushed the pair iterates as one `char`.
    #[test]
    fn wtf8_code_points() {
        fn c(value: u32) -> CodePoint { CodePoint::from_u32(value).unwrap() }
        fn cp(string: &Wtf8Buf) -> Vec<Option<char>> {
            string.code_points().map(|c| c.to_char()).collect::<Vec<_>>()
        }
        let mut string = Wtf8Buf::from_str("é ");
        assert_eq!(cp(&string), [Some('é'), Some(' ')]);
        string.push(c(0xD83D));
        assert_eq!(cp(&string), [Some('é'), Some(' '), None]);
        string.push(c(0xDCA9));
        assert_eq!(cp(&string), [Some('é'), Some(' '), Some('💩')]);
    }

    /// `as_str` yields `None` when a surrogate is present.
    #[test]
    fn wtf8_as_str() {
        assert_eq!(Wtf8::from_str("").as_str(), Some(""));
        assert_eq!(Wtf8::from_str("aé 💩").as_str(), Some("aé 💩"));
        let mut string = Wtf8Buf::new();
        string.push(CodePoint::from_u32(0xD800).unwrap());
        assert_eq!(string.as_str(), None);
    }

    /// Borrowed when already valid UTF-8; owned with U+FFFD otherwise.
    #[test]
    fn wtf8_to_string_lossy() {
        assert_eq!(Wtf8::from_str("").to_string_lossy(), Cow::Borrowed(""));
        assert_eq!(Wtf8::from_str("aé 💩").to_string_lossy(), Cow::Borrowed("aé 💩"));
        let mut string = Wtf8Buf::from_str("aé 💩");
        string.push(CodePoint::from_u32(0xD800).unwrap());
        let expected: Cow<str> = Cow::Owned(String::from("aé 💩\u{FFFD}"));
        assert_eq!(string.to_string_lossy(), expected);
    }

    /// An unpaired lead surrogate is emitted as a single u16; a
    /// supplementary character is emitted as a surrogate pair.
    #[test]
    fn wtf8_encode_wide() {
        let mut string = Wtf8Buf::from_str("aé ");
        string.push(CodePoint::from_u32(0xD83D).unwrap());
        string.push_char('💩');
        assert_eq!(string.encode_wide().collect::<Vec<_>>(),
                   vec![0x61, 0xE9, 0x20, 0xD83D, 0xD83D, 0xDCA9]);
    }
}