]>
Commit | Line | Data |
---|---|---|
1a4d82fc JJ |
1 | // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT |
2 | // file at the top-level directory of this distribution and at | |
3 | // http://rust-lang.org/COPYRIGHT. | |
4 | // | |
5 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or | |
6 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license | |
7 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your | |
8 | // option. This file may not be copied, modified, or distributed | |
9 | // except according to those terms. | |
10 | ||
11 | //! Character manipulation. | |
12 | //! | |
d9579d0f | 13 | //! For more details, see ::rustc_unicode::char (a.k.a. std::char) |
1a4d82fc JJ |
14 | |
15 | #![allow(non_snake_case)] | |
62682a34 | 16 | #![stable(feature = "core_char", since = "1.2.0")] |
1a4d82fc | 17 | |
5bcae85e | 18 | use char_private::is_printable; |
9e0c209e SL |
19 | use convert::TryFrom; |
20 | use fmt; | |
21 | use iter::FusedIterator; | |
1a4d82fc | 22 | use mem::transmute; |
1a4d82fc JJ |
23 | |
// UTF-8 ranges and tags for encoding characters
//
// The `TAG_*` values are the fixed high bits OR-ed into the bytes produced by
// `encode_utf8`; the `MAX_*` values are the exclusive upper bounds that
// `len_utf8`/`encode_utf8` compare a code point against to pick a length.

// High bits of a continuation byte (`10xx_xxxx`).
const TAG_CONT: u8    = 0b1000_0000;
// High bits of the leading byte of a two-byte sequence (`110x_xxxx`).
const TAG_TWO_B: u8   = 0b1100_0000;
// High bits of the leading byte of a three-byte sequence (`1110_xxxx`).
const TAG_THREE_B: u8 = 0b1110_0000;
// High bits of the leading byte of a four-byte sequence (`1111_0xxx`).
const TAG_FOUR_B: u8  = 0b1111_0000;
// Code points below this value are encoded in one byte.
const MAX_ONE_B: u32   =     0x80;
// Code points below this value (and not below `MAX_ONE_B`) take two bytes.
const MAX_TWO_B: u32   =    0x800;
// Code points below this value (and not below `MAX_TWO_B`) take three bytes;
// everything else takes four.
const MAX_THREE_B: u32 =  0x10000;
1a4d82fc JJ |
32 | |
33 | /* | |
34 | Lu Uppercase_Letter an uppercase letter | |
35 | Ll Lowercase_Letter a lowercase letter | |
36 | Lt Titlecase_Letter a digraphic character, with first part uppercase | |
37 | Lm Modifier_Letter a modifier letter | |
38 | Lo Other_Letter other letters, including syllables and ideographs | |
39 | Mn Nonspacing_Mark a nonspacing combining mark (zero advance width) | |
40 | Mc Spacing_Mark a spacing combining mark (positive advance width) | |
41 | Me Enclosing_Mark an enclosing combining mark | |
42 | Nd Decimal_Number a decimal digit | |
43 | Nl Letter_Number a letterlike numeric character | |
44 | No Other_Number a numeric character of other type | |
45 | Pc Connector_Punctuation a connecting punctuation mark, like a tie | |
46 | Pd Dash_Punctuation a dash or hyphen punctuation mark | |
47 | Ps Open_Punctuation an opening punctuation mark (of a pair) | |
48 | Pe Close_Punctuation a closing punctuation mark (of a pair) | |
49 | Pi Initial_Punctuation an initial quotation mark | |
50 | Pf Final_Punctuation a final quotation mark | |
51 | Po Other_Punctuation a punctuation mark of other type | |
52 | Sm Math_Symbol a symbol of primarily mathematical use | |
53 | Sc Currency_Symbol a currency sign | |
54 | Sk Modifier_Symbol a non-letterlike modifier symbol | |
55 | So Other_Symbol a symbol of other type | |
56 | Zs Space_Separator a space character (of various non-zero widths) | |
57 | Zl Line_Separator U+2028 LINE SEPARATOR only | |
58 | Zp Paragraph_Separator U+2029 PARAGRAPH SEPARATOR only | |
59 | Cc Control a C0 or C1 control code | |
60 | Cf Format a format control character | |
61 | Cs Surrogate a surrogate code point | |
62 | Co Private_Use a private-use character | |
63 | Cn Unassigned a reserved unassigned code point or a noncharacter | |
64 | */ | |
65 | ||
92a42be0 SL |
/// The highest valid code point a `char` can have.
///
/// A [`char`] is a [Unicode Scalar Value], which means that it is a [Code
/// Point], but only ones within a certain range. `MAX` is the highest valid
/// code point that's a valid [Unicode Scalar Value], namely `U+10FFFF`.
///
/// [`char`]: ../../std/primitive.char.html
/// [Unicode Scalar Value]: http://www.unicode.org/glossary/#unicode_scalar_value
/// [Code Point]: http://www.unicode.org/glossary/#code_point
#[stable(feature = "rust1", since = "1.0.0")]
pub const MAX: char = '\u{10ffff}';
77 | ||
92a42be0 SL |
78 | /// Converts a `u32` to a `char`. |
79 | /// | |
80 | /// Note that all [`char`]s are valid [`u32`]s, and can be casted to one with | |
81 | /// [`as`]: | |
82 | /// | |
83 | /// ``` | |
84 | /// let c = '💯'; | |
85 | /// let i = c as u32; | |
86 | /// | |
87 | /// assert_eq!(128175, i); | |
88 | /// ``` | |
89 | /// | |
90 | /// However, the reverse is not true: not all valid [`u32`]s are valid | |
91 | /// [`char`]s. `from_u32()` will return `None` if the input is not a valid value | |
92 | /// for a [`char`]. | |
93 | /// | |
54a0048b SL |
94 | /// [`char`]: ../../std/primitive.char.html |
95 | /// [`u32`]: ../../std/primitive.u32.html | |
9cc50fc6 | 96 | /// [`as`]: ../../book/casting-between-types.html#as |
92a42be0 SL |
97 | /// |
98 | /// For an unsafe version of this function which ignores these checks, see | |
99 | /// [`from_u32_unchecked()`]. | |
100 | /// | |
101 | /// [`from_u32_unchecked()`]: fn.from_u32_unchecked.html | |
85aaf69f SL |
102 | /// |
103 | /// # Examples | |
104 | /// | |
92a42be0 SL |
105 | /// Basic usage: |
106 | /// | |
85aaf69f SL |
107 | /// ``` |
108 | /// use std::char; | |
109 | /// | |
92a42be0 SL |
110 | /// let c = char::from_u32(0x2764); |
111 | /// | |
112 | /// assert_eq!(Some('❤'), c); | |
113 | /// ``` | |
114 | /// | |
115 | /// Returning `None` when the input is not a valid [`char`]: | |
116 | /// | |
117 | /// ``` | |
118 | /// use std::char; | |
119 | /// | |
120 | /// let c = char::from_u32(0x110000); | |
121 | /// | |
122 | /// assert_eq!(None, c); | |
85aaf69f | 123 | /// ``` |
1a4d82fc | 124 | #[inline] |
85aaf69f | 125 | #[stable(feature = "rust1", since = "1.0.0")] |
1a4d82fc | 126 | pub fn from_u32(i: u32) -> Option<char> { |
9e0c209e | 127 | char::try_from(i).ok() |
1a4d82fc JJ |
128 | } |
129 | ||
92a42be0 SL |
/// Converts a `u32` to a `char`, ignoring validity.
///
/// Note that all [`char`]s are valid [`u32`]s, and can be casted to one with
/// [`as`]:
///
/// ```
/// let c = '💯';
/// let i = c as u32;
///
/// assert_eq!(128175, i);
/// ```
///
/// However, the reverse is not true: not all valid [`u32`]s are valid
/// [`char`]s. `from_u32_unchecked()` will ignore this, and blindly cast to
/// [`char`], possibly creating an invalid one.
///
/// [`char`]: ../../std/primitive.char.html
/// [`u32`]: ../../std/primitive.u32.html
/// [`as`]: ../../book/casting-between-types.html#as
///
/// # Safety
///
/// This function is unsafe, as it may construct invalid `char` values.
/// The caller is responsible for passing only values that the checked
/// conversion would accept.
///
/// For a safe version of this function, see the [`from_u32()`] function.
///
/// [`from_u32()`]: fn.from_u32.html
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// use std::char;
///
/// let c = unsafe { char::from_u32_unchecked(0x2764) };
///
/// assert_eq!('❤', c);
/// ```
#[inline]
#[stable(feature = "char_from_unchecked", since = "1.5.0")]
pub unsafe fn from_u32_unchecked(i: u32) -> char {
    // Reinterprets the bits of `i` as a `char`; no validity check happens here.
    transmute(i)
}
174 | ||
9e0c209e SL |
/// Converts a `char` into a `u32` by casting it with `as`.
#[stable(feature = "char_convert", since = "1.13.0")]
impl From<char> for u32 {
    #[inline]
    fn from(c: char) -> Self {
        c as u32
    }
}
182 | ||
/// Maps a byte in 0x00...0xFF to a `char` whose code point has the same value, in U+0000 to U+00FF.
///
/// Unicode is designed such that this effectively decodes bytes
/// with the character encoding that IANA calls ISO-8859-1.
/// This encoding is compatible with ASCII.
///
/// Note that this is different from ISO/IEC 8859-1 a.k.a. ISO 8859-1 (with one less hyphen),
/// which leaves some "blanks", byte values that are not assigned to any character.
/// ISO-8859-1 (the IANA one) assigns them to the C0 and C1 control codes.
///
/// Note that this is *also* different from Windows-1252 a.k.a. code page 1252,
/// which is a superset of ISO/IEC 8859-1 that assigns some (not all!) blanks
/// to punctuation and various Latin characters.
///
/// To confuse things further, [on the Web](https://encoding.spec.whatwg.org/)
/// `ascii`, `iso-8859-1`, and `windows-1252` are all aliases
/// for a superset of Windows-1252 that fills the remaining blanks with corresponding
/// C0 and C1 control codes.
#[stable(feature = "char_convert", since = "1.13.0")]
impl From<u8> for char {
    #[inline]
    fn from(i: u8) -> Self {
        // The code point of the result equals the numeric value of the byte.
        i as char
    }
}
208 | ||
209 | #[unstable(feature = "try_from", issue = "33417")] | |
210 | impl TryFrom<u32> for char { | |
211 | type Err = CharTryFromError; | |
212 | ||
213 | #[inline] | |
214 | fn try_from(i: u32) -> Result<Self, Self::Err> { | |
215 | if (i > MAX as u32) || (i >= 0xD800 && i <= 0xDFFF) { | |
216 | Err(CharTryFromError(())) | |
217 | } else { | |
218 | Ok(unsafe { from_u32_unchecked(i) }) | |
219 | } | |
220 | } | |
221 | } | |
222 | ||
223 | /// The error type returned when a conversion from u32 to char fails. | |
224 | #[unstable(feature = "try_from", issue = "33417")] | |
225 | #[derive(Copy, Clone, Debug, PartialEq, Eq)] | |
226 | pub struct CharTryFromError(()); | |
227 | ||
228 | #[unstable(feature = "try_from", issue = "33417")] | |
229 | impl fmt::Display for CharTryFromError { | |
230 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | |
231 | "converted integer out of range for `char`".fmt(f) | |
232 | } | |
233 | } | |
234 | ||
92a42be0 | 235 | /// Converts a digit in the given radix to a `char`. |
1a4d82fc | 236 | /// |
92a42be0 SL |
237 | /// A 'radix' here is sometimes also called a 'base'. A radix of two |
238 | /// indicates a binary number, a radix of ten, decimal, and a radix of | |
9cc50fc6 | 239 | /// sixteen, hexadecimal, to give some common values. Arbitrary |
92a42be0 | 240 | /// radicum are supported. |
1a4d82fc | 241 | /// |
92a42be0 SL |
242 | /// `from_digit()` will return `None` if the input is not a digit in |
243 | /// the given radix. | |
1a4d82fc JJ |
244 | /// |
245 | /// # Panics | |
246 | /// | |
92a42be0 | 247 | /// Panics if given a radix larger than 36. |
1a4d82fc | 248 | /// |
85aaf69f SL |
249 | /// # Examples |
250 | /// | |
92a42be0 SL |
251 | /// Basic usage: |
252 | /// | |
85aaf69f SL |
253 | /// ``` |
254 | /// use std::char; | |
255 | /// | |
256 | /// let c = char::from_digit(4, 10); | |
257 | /// | |
92a42be0 SL |
258 | /// assert_eq!(Some('4'), c); |
259 | /// | |
260 | /// // Decimal 11 is a single digit in base 16 | |
261 | /// let c = char::from_digit(11, 16); | |
262 | /// | |
263 | /// assert_eq!(Some('b'), c); | |
264 | /// ``` | |
265 | /// | |
266 | /// Returning `None` when the input is not a digit: | |
267 | /// | |
268 | /// ``` | |
269 | /// use std::char; | |
270 | /// | |
271 | /// let c = char::from_digit(20, 10); | |
272 | /// | |
273 | /// assert_eq!(None, c); | |
274 | /// ``` | |
275 | /// | |
276 | /// Passing a large radix, causing a panic: | |
277 | /// | |
278 | /// ``` | |
279 | /// use std::thread; | |
280 | /// use std::char; | |
281 | /// | |
282 | /// let result = thread::spawn(|| { | |
283 | /// // this panics | |
284 | /// let c = char::from_digit(1, 37); | |
285 | /// }).join(); | |
286 | /// | |
287 | /// assert!(result.is_err()); | |
85aaf69f | 288 | /// ``` |
1a4d82fc | 289 | #[inline] |
c34b1796 | 290 | #[stable(feature = "rust1", since = "1.0.0")] |
85aaf69f | 291 | pub fn from_digit(num: u32, radix: u32) -> Option<char> { |
1a4d82fc JJ |
292 | if radix > 36 { |
293 | panic!("from_digit: radix is too high (maximum 36)"); | |
294 | } | |
295 | if num < radix { | |
c1a9b12d SL |
296 | let num = num as u8; |
297 | if num < 10 { | |
298 | Some((b'0' + num) as char) | |
299 | } else { | |
300 | Some((b'a' + num - 10) as char) | |
1a4d82fc JJ |
301 | } |
302 | } else { | |
303 | None | |
304 | } | |
305 | } | |
306 | ||
c34b1796 AL |
// NB: the stabilization and documentation for this trait is in
// unicode/char.rs, not here
//
// The notes on the methods below describe the `char` implementation that
// accompanies this trait in this file.
#[allow(missing_docs)] // docs in libunicode/u_char.rs
#[doc(hidden)]
#[unstable(feature = "core_char_ext",
           reason = "the stable interface is `impl char` in later crate",
           issue = "32110")]
pub trait CharExt {
    // True when `to_digit(radix)` yields a value.
    #[stable(feature = "core", since = "1.6.0")]
    fn is_digit(self, radix: u32) -> bool;
    // Numeric value of an ASCII digit or letter in `radix`, or `None` when
    // the character is not a digit in that radix; panics for `radix > 36`.
    #[stable(feature = "core", since = "1.6.0")]
    fn to_digit(self, radix: u32) -> Option<u32>;
    // Iterator over the `\u{...}` escape of the character.
    #[stable(feature = "core", since = "1.6.0")]
    fn escape_unicode(self) -> EscapeUnicode;
    // Iterator over the escape: `\t`, `\r`, `\n`, backslash-escaped quotes
    // and backslash, the character itself for `'\x20'`...`'\x7e'`, and the
    // `\u{...}` form for everything else.
    #[stable(feature = "core", since = "1.6.0")]
    fn escape_default(self) -> EscapeDefault;
    // Like `escape_default`, but every character accepted by `is_printable`
    // is yielded unescaped.
    #[unstable(feature = "char_escape_debug", issue = "35068")]
    fn escape_debug(self) -> EscapeDebug;
    // Number of bytes in the UTF-8 encoding (1 to 4).
    #[stable(feature = "core", since = "1.6.0")]
    fn len_utf8(self) -> usize;
    // Number of `u16` units in the UTF-16 encoding (1 or 2).
    #[stable(feature = "core", since = "1.6.0")]
    fn len_utf16(self) -> usize;
    // Iterator over the bytes of the UTF-8 encoding.
    #[unstable(feature = "unicode", issue = "27784")]
    fn encode_utf8(self) -> EncodeUtf8;
    // Iterator over the `u16` units of the UTF-16 encoding.
    #[unstable(feature = "unicode", issue = "27784")]
    fn encode_utf16(self) -> EncodeUtf16;
}
334 | ||
92a42be0 | 335 | #[stable(feature = "core", since = "1.6.0")] |
1a4d82fc | 336 | impl CharExt for char { |
62682a34 | 337 | #[inline] |
85aaf69f | 338 | fn is_digit(self, radix: u32) -> bool { |
1a4d82fc JJ |
339 | self.to_digit(radix).is_some() |
340 | } | |
341 | ||
62682a34 | 342 | #[inline] |
85aaf69f | 343 | fn to_digit(self, radix: u32) -> Option<u32> { |
1a4d82fc JJ |
344 | if radix > 36 { |
345 | panic!("to_digit: radix is too high (maximum 36)"); | |
346 | } | |
347 | let val = match self { | |
85aaf69f SL |
348 | '0' ... '9' => self as u32 - '0' as u32, |
349 | 'a' ... 'z' => self as u32 - 'a' as u32 + 10, | |
350 | 'A' ... 'Z' => self as u32 - 'A' as u32 + 10, | |
1a4d82fc JJ |
351 | _ => return None, |
352 | }; | |
353 | if val < radix { Some(val) } | |
354 | else { None } | |
355 | } | |
356 | ||
62682a34 | 357 | #[inline] |
1a4d82fc | 358 | fn escape_unicode(self) -> EscapeUnicode { |
a7813a04 XL |
359 | let c = self as u32; |
360 | ||
361 | // or-ing 1 ensures that for c==0 the code computes that one | |
362 | // digit should be printed and (which is the same) avoids the | |
363 | // (31 - 32) underflow | |
364 | let msb = 31 - (c | 1).leading_zeros(); | |
365 | ||
366 | // the index of the most significant hex digit | |
367 | let ms_hex_digit = msb / 4; | |
368 | EscapeUnicode { | |
369 | c: self, | |
370 | state: EscapeUnicodeState::Backslash, | |
371 | hex_digit_idx: ms_hex_digit as usize, | |
372 | } | |
1a4d82fc JJ |
373 | } |
374 | ||
62682a34 | 375 | #[inline] |
1a4d82fc JJ |
376 | fn escape_default(self) -> EscapeDefault { |
377 | let init_state = match self { | |
378 | '\t' => EscapeDefaultState::Backslash('t'), | |
379 | '\r' => EscapeDefaultState::Backslash('r'), | |
380 | '\n' => EscapeDefaultState::Backslash('n'), | |
b039eaaf | 381 | '\\' | '\'' | '"' => EscapeDefaultState::Backslash(self), |
1a4d82fc JJ |
382 | '\x20' ... '\x7e' => EscapeDefaultState::Char(self), |
383 | _ => EscapeDefaultState::Unicode(self.escape_unicode()) | |
384 | }; | |
385 | EscapeDefault { state: init_state } | |
386 | } | |
387 | ||
5bcae85e SL |
388 | #[inline] |
389 | fn escape_debug(self) -> EscapeDebug { | |
390 | let init_state = match self { | |
391 | '\t' => EscapeDefaultState::Backslash('t'), | |
392 | '\r' => EscapeDefaultState::Backslash('r'), | |
393 | '\n' => EscapeDefaultState::Backslash('n'), | |
394 | '\\' | '\'' | '"' => EscapeDefaultState::Backslash(self), | |
395 | c if is_printable(c) => EscapeDefaultState::Char(c), | |
396 | c => EscapeDefaultState::Unicode(c.escape_unicode()), | |
397 | }; | |
398 | EscapeDebug(EscapeDefault { state: init_state }) | |
399 | } | |
400 | ||
1a4d82fc | 401 | #[inline] |
85aaf69f | 402 | fn len_utf8(self) -> usize { |
1a4d82fc | 403 | let code = self as u32; |
c34b1796 AL |
404 | if code < MAX_ONE_B { |
405 | 1 | |
406 | } else if code < MAX_TWO_B { | |
407 | 2 | |
408 | } else if code < MAX_THREE_B { | |
409 | 3 | |
410 | } else { | |
411 | 4 | |
1a4d82fc JJ |
412 | } |
413 | } | |
414 | ||
415 | #[inline] | |
85aaf69f | 416 | fn len_utf16(self) -> usize { |
1a4d82fc | 417 | let ch = self as u32; |
c34b1796 | 418 | if (ch & 0xFFFF) == ch { 1 } else { 2 } |
1a4d82fc JJ |
419 | } |
420 | ||
421 | #[inline] | |
54a0048b SL |
422 | fn encode_utf8(self) -> EncodeUtf8 { |
423 | let code = self as u32; | |
424 | let mut buf = [0; 4]; | |
425 | let pos = if code < MAX_ONE_B { | |
426 | buf[3] = code as u8; | |
427 | 3 | |
428 | } else if code < MAX_TWO_B { | |
429 | buf[2] = (code >> 6 & 0x1F) as u8 | TAG_TWO_B; | |
430 | buf[3] = (code & 0x3F) as u8 | TAG_CONT; | |
431 | 2 | |
432 | } else if code < MAX_THREE_B { | |
433 | buf[1] = (code >> 12 & 0x0F) as u8 | TAG_THREE_B; | |
434 | buf[2] = (code >> 6 & 0x3F) as u8 | TAG_CONT; | |
435 | buf[3] = (code & 0x3F) as u8 | TAG_CONT; | |
436 | 1 | |
437 | } else { | |
438 | buf[0] = (code >> 18 & 0x07) as u8 | TAG_FOUR_B; | |
439 | buf[1] = (code >> 12 & 0x3F) as u8 | TAG_CONT; | |
440 | buf[2] = (code >> 6 & 0x3F) as u8 | TAG_CONT; | |
441 | buf[3] = (code & 0x3F) as u8 | TAG_CONT; | |
442 | 0 | |
443 | }; | |
444 | EncodeUtf8 { buf: buf, pos: pos } | |
1a4d82fc JJ |
445 | } |
446 | ||
447 | #[inline] | |
54a0048b SL |
448 | fn encode_utf16(self) -> EncodeUtf16 { |
449 | let mut buf = [0; 2]; | |
450 | let mut code = self as u32; | |
451 | let pos = if (code & 0xFFFF) == code { | |
452 | // The BMP falls through (assuming non-surrogate, as it should) | |
453 | buf[1] = code as u16; | |
454 | 1 | |
455 | } else { | |
456 | // Supplementary planes break into surrogates. | |
457 | code -= 0x1_0000; | |
458 | buf[0] = 0xD800 | ((code >> 10) as u16); | |
459 | buf[1] = 0xDC00 | ((code as u16) & 0x3FF); | |
460 | 0 | |
461 | }; | |
462 | EncodeUtf16 { buf: buf, pos: pos } | |
1a4d82fc JJ |
463 | } |
464 | } | |
465 | ||
92a42be0 SL |
466 | /// Returns an iterator that yields the hexadecimal Unicode escape of a |
467 | /// character, as `char`s. | |
468 | /// | |
469 | /// This `struct` is created by the [`escape_unicode()`] method on [`char`]. See | |
470 | /// its documentation for more. | |
471 | /// | |
54a0048b SL |
472 | /// [`escape_unicode()`]: ../../std/primitive.char.html#method.escape_unicode |
473 | /// [`char`]: ../../std/primitive.char.html | |
474 | #[derive(Clone, Debug)] | |
85aaf69f | 475 | #[stable(feature = "rust1", since = "1.0.0")] |
1a4d82fc JJ |
476 | pub struct EscapeUnicode { |
477 | c: char, | |
a7813a04 XL |
478 | state: EscapeUnicodeState, |
479 | ||
480 | // The index of the next hex digit to be printed (0 if none), | |
481 | // i.e. the number of remaining hex digits to be printed; | |
482 | // increasing from the least significant digit: 0x543210 | |
483 | hex_digit_idx: usize, | |
1a4d82fc JJ |
484 | } |
485 | ||
3157f602 XL |
486 | // The enum values are ordered so that their representation is the |
487 | // same as the remaining length (besides the hexadecimal digits). This | |
488 | // likely makes `len()` a single load from memory) and inline-worth. | |
54a0048b | 489 | #[derive(Clone, Debug)] |
1a4d82fc | 490 | enum EscapeUnicodeState { |
1a4d82fc | 491 | Done, |
3157f602 XL |
492 | RightBrace, |
493 | Value, | |
494 | LeftBrace, | |
495 | Type, | |
496 | Backslash, | |
1a4d82fc JJ |
497 | } |
498 | ||
85aaf69f | 499 | #[stable(feature = "rust1", since = "1.0.0")] |
1a4d82fc JJ |
500 | impl Iterator for EscapeUnicode { |
501 | type Item = char; | |
502 | ||
503 | fn next(&mut self) -> Option<char> { | |
504 | match self.state { | |
505 | EscapeUnicodeState::Backslash => { | |
506 | self.state = EscapeUnicodeState::Type; | |
507 | Some('\\') | |
508 | } | |
509 | EscapeUnicodeState::Type => { | |
510 | self.state = EscapeUnicodeState::LeftBrace; | |
511 | Some('u') | |
512 | } | |
513 | EscapeUnicodeState::LeftBrace => { | |
a7813a04 | 514 | self.state = EscapeUnicodeState::Value; |
1a4d82fc JJ |
515 | Some('{') |
516 | } | |
a7813a04 XL |
517 | EscapeUnicodeState::Value => { |
518 | let hex_digit = ((self.c as u32) >> (self.hex_digit_idx * 4)) & 0xf; | |
519 | let c = from_digit(hex_digit, 16).unwrap(); | |
520 | if self.hex_digit_idx == 0 { | |
1a4d82fc JJ |
521 | self.state = EscapeUnicodeState::RightBrace; |
522 | } else { | |
a7813a04 | 523 | self.hex_digit_idx -= 1; |
1a4d82fc | 524 | } |
c1a9b12d | 525 | Some(c) |
1a4d82fc JJ |
526 | } |
527 | EscapeUnicodeState::RightBrace => { | |
528 | self.state = EscapeUnicodeState::Done; | |
529 | Some('}') | |
530 | } | |
531 | EscapeUnicodeState::Done => None, | |
532 | } | |
533 | } | |
b039eaaf | 534 | |
3157f602 | 535 | #[inline] |
b039eaaf | 536 | fn size_hint(&self) -> (usize, Option<usize>) { |
3157f602 | 537 | let n = self.len(); |
b039eaaf SL |
538 | (n, Some(n)) |
539 | } | |
a7813a04 | 540 | |
3157f602 XL |
541 | #[inline] |
542 | fn count(self) -> usize { | |
543 | self.len() | |
544 | } | |
545 | ||
a7813a04 XL |
546 | fn last(self) -> Option<char> { |
547 | match self.state { | |
548 | EscapeUnicodeState::Done => None, | |
549 | ||
550 | EscapeUnicodeState::RightBrace | | |
551 | EscapeUnicodeState::Value | | |
552 | EscapeUnicodeState::LeftBrace | | |
553 | EscapeUnicodeState::Type | | |
554 | EscapeUnicodeState::Backslash => Some('}'), | |
555 | } | |
556 | } | |
1a4d82fc JJ |
557 | } |
558 | ||
3157f602 XL |
559 | #[stable(feature = "exact_size_escape", since = "1.11.0")] |
560 | impl ExactSizeIterator for EscapeUnicode { | |
561 | #[inline] | |
562 | fn len(&self) -> usize { | |
563 | // The match is a single memory access with no branching | |
564 | self.hex_digit_idx + match self.state { | |
565 | EscapeUnicodeState::Done => 0, | |
566 | EscapeUnicodeState::RightBrace => 1, | |
567 | EscapeUnicodeState::Value => 2, | |
568 | EscapeUnicodeState::LeftBrace => 3, | |
569 | EscapeUnicodeState::Type => 4, | |
570 | EscapeUnicodeState::Backslash => 5, | |
571 | } | |
572 | } | |
573 | } | |
574 | ||
9e0c209e SL |
575 | #[unstable(feature = "fused", issue = "35602")] |
576 | impl FusedIterator for EscapeUnicode {} | |
577 | ||
92a42be0 SL |
578 | /// An iterator that yields the literal escape code of a `char`. |
579 | /// | |
580 | /// This `struct` is created by the [`escape_default()`] method on [`char`]. See | |
581 | /// its documentation for more. | |
582 | /// | |
54a0048b SL |
583 | /// [`escape_default()`]: ../../std/primitive.char.html#method.escape_default |
584 | /// [`char`]: ../../std/primitive.char.html | |
585 | #[derive(Clone, Debug)] | |
85aaf69f | 586 | #[stable(feature = "rust1", since = "1.0.0")] |
1a4d82fc JJ |
587 | pub struct EscapeDefault { |
588 | state: EscapeDefaultState | |
589 | } | |
590 | ||
54a0048b | 591 | #[derive(Clone, Debug)] |
1a4d82fc | 592 | enum EscapeDefaultState { |
1a4d82fc | 593 | Done, |
3157f602 XL |
594 | Char(char), |
595 | Backslash(char), | |
1a4d82fc JJ |
596 | Unicode(EscapeUnicode), |
597 | } | |
598 | ||
85aaf69f | 599 | #[stable(feature = "rust1", since = "1.0.0")] |
1a4d82fc JJ |
600 | impl Iterator for EscapeDefault { |
601 | type Item = char; | |
602 | ||
603 | fn next(&mut self) -> Option<char> { | |
604 | match self.state { | |
605 | EscapeDefaultState::Backslash(c) => { | |
606 | self.state = EscapeDefaultState::Char(c); | |
607 | Some('\\') | |
608 | } | |
609 | EscapeDefaultState::Char(c) => { | |
610 | self.state = EscapeDefaultState::Done; | |
611 | Some(c) | |
612 | } | |
613 | EscapeDefaultState::Done => None, | |
b039eaaf SL |
614 | EscapeDefaultState::Unicode(ref mut iter) => iter.next(), |
615 | } | |
616 | } | |
617 | ||
3157f602 | 618 | #[inline] |
b039eaaf | 619 | fn size_hint(&self) -> (usize, Option<usize>) { |
3157f602 XL |
620 | let n = self.len(); |
621 | (n, Some(n)) | |
1a4d82fc | 622 | } |
9cc50fc6 | 623 | |
3157f602 | 624 | #[inline] |
9cc50fc6 | 625 | fn count(self) -> usize { |
3157f602 | 626 | self.len() |
9cc50fc6 SL |
627 | } |
628 | ||
629 | fn nth(&mut self, n: usize) -> Option<char> { | |
630 | match self.state { | |
631 | EscapeDefaultState::Backslash(c) if n == 0 => { | |
632 | self.state = EscapeDefaultState::Char(c); | |
633 | Some('\\') | |
634 | }, | |
635 | EscapeDefaultState::Backslash(c) if n == 1 => { | |
636 | self.state = EscapeDefaultState::Done; | |
637 | Some(c) | |
638 | }, | |
639 | EscapeDefaultState::Backslash(_) => { | |
640 | self.state = EscapeDefaultState::Done; | |
641 | None | |
642 | }, | |
643 | EscapeDefaultState::Char(c) => { | |
644 | self.state = EscapeDefaultState::Done; | |
645 | ||
646 | if n == 0 { | |
647 | Some(c) | |
648 | } else { | |
649 | None | |
650 | } | |
651 | }, | |
652 | EscapeDefaultState::Done => return None, | |
653 | EscapeDefaultState::Unicode(ref mut i) => return i.nth(n), | |
654 | } | |
655 | } | |
656 | ||
657 | fn last(self) -> Option<char> { | |
658 | match self.state { | |
659 | EscapeDefaultState::Unicode(iter) => iter.last(), | |
660 | EscapeDefaultState::Done => None, | |
661 | EscapeDefaultState::Backslash(c) | EscapeDefaultState::Char(c) => Some(c), | |
662 | } | |
663 | } | |
1a4d82fc | 664 | } |
54a0048b | 665 | |
3157f602 XL |
666 | #[stable(feature = "exact_size_escape", since = "1.11.0")] |
667 | impl ExactSizeIterator for EscapeDefault { | |
668 | fn len(&self) -> usize { | |
669 | match self.state { | |
670 | EscapeDefaultState::Done => 0, | |
671 | EscapeDefaultState::Char(_) => 1, | |
672 | EscapeDefaultState::Backslash(_) => 2, | |
673 | EscapeDefaultState::Unicode(ref iter) => iter.len(), | |
674 | } | |
675 | } | |
676 | } | |
677 | ||
9e0c209e SL |
678 | #[unstable(feature = "fused", issue = "35602")] |
679 | impl FusedIterator for EscapeDefault {} | |
680 | ||
5bcae85e SL |
681 | /// An iterator that yields the literal escape code of a `char`. |
682 | /// | |
683 | /// This `struct` is created by the [`escape_debug()`] method on [`char`]. See its | |
684 | /// documentation for more. | |
685 | /// | |
686 | /// [`escape_debug()`]: ../../std/primitive.char.html#method.escape_debug | |
687 | /// [`char`]: ../../std/primitive.char.html | |
688 | #[unstable(feature = "char_escape_debug", issue = "35068")] | |
689 | #[derive(Clone, Debug)] | |
690 | pub struct EscapeDebug(EscapeDefault); | |
691 | ||
692 | #[unstable(feature = "char_escape_debug", issue = "35068")] | |
693 | impl Iterator for EscapeDebug { | |
694 | type Item = char; | |
695 | fn next(&mut self) -> Option<char> { self.0.next() } | |
696 | fn size_hint(&self) -> (usize, Option<usize>) { self.0.size_hint() } | |
697 | } | |
698 | ||
699 | #[unstable(feature = "char_escape_debug", issue = "35068")] | |
700 | impl ExactSizeIterator for EscapeDebug { } | |
701 | ||
9e0c209e SL |
702 | #[unstable(feature = "fused", issue = "35602")] |
703 | impl FusedIterator for EscapeDebug {} | |
704 | ||
54a0048b SL |
705 | /// An iterator over `u8` entries represending the UTF-8 encoding of a `char` |
706 | /// value. | |
707 | /// | |
708 | /// Constructed via the `.encode_utf8()` method on `char`. | |
709 | #[unstable(feature = "unicode", issue = "27784")] | |
710 | #[derive(Debug)] | |
711 | pub struct EncodeUtf8 { | |
712 | buf: [u8; 4], | |
713 | pos: usize, | |
714 | } | |
715 | ||
716 | impl EncodeUtf8 { | |
717 | /// Returns the remaining bytes of this iterator as a slice. | |
718 | #[unstable(feature = "unicode", issue = "27784")] | |
719 | pub fn as_slice(&self) -> &[u8] { | |
720 | &self.buf[self.pos..] | |
721 | } | |
722 | } | |
723 | ||
724 | #[unstable(feature = "unicode", issue = "27784")] | |
725 | impl Iterator for EncodeUtf8 { | |
726 | type Item = u8; | |
727 | ||
728 | fn next(&mut self) -> Option<u8> { | |
729 | if self.pos == self.buf.len() { | |
730 | None | |
731 | } else { | |
732 | let ret = Some(self.buf[self.pos]); | |
733 | self.pos += 1; | |
734 | ret | |
735 | } | |
736 | } | |
737 | ||
738 | fn size_hint(&self) -> (usize, Option<usize>) { | |
739 | self.as_slice().iter().size_hint() | |
740 | } | |
741 | } | |
742 | ||
9e0c209e SL |
743 | #[unstable(feature = "fused", issue = "35602")] |
744 | impl FusedIterator for EncodeUtf8 {} | |
745 | ||
54a0048b SL |
746 | /// An iterator over `u16` entries represending the UTF-16 encoding of a `char` |
747 | /// value. | |
748 | /// | |
749 | /// Constructed via the `.encode_utf16()` method on `char`. | |
750 | #[unstable(feature = "unicode", issue = "27784")] | |
751 | #[derive(Debug)] | |
752 | pub struct EncodeUtf16 { | |
753 | buf: [u16; 2], | |
754 | pos: usize, | |
755 | } | |
756 | ||
757 | impl EncodeUtf16 { | |
758 | /// Returns the remaining bytes of this iterator as a slice. | |
759 | #[unstable(feature = "unicode", issue = "27784")] | |
760 | pub fn as_slice(&self) -> &[u16] { | |
761 | &self.buf[self.pos..] | |
762 | } | |
763 | } | |
764 | ||
765 | ||
766 | #[unstable(feature = "unicode", issue = "27784")] | |
767 | impl Iterator for EncodeUtf16 { | |
768 | type Item = u16; | |
769 | ||
770 | fn next(&mut self) -> Option<u16> { | |
771 | if self.pos == self.buf.len() { | |
772 | None | |
773 | } else { | |
774 | let ret = Some(self.buf[self.pos]); | |
775 | self.pos += 1; | |
776 | ret | |
777 | } | |
778 | } | |
779 | ||
780 | fn size_hint(&self) -> (usize, Option<usize>) { | |
781 | self.as_slice().iter().size_hint() | |
782 | } | |
783 | } | |
5bcae85e | 784 | |
9e0c209e SL |
785 | #[unstable(feature = "fused", issue = "35602")] |
786 | impl FusedIterator for EncodeUtf16 {} | |
5bcae85e SL |
787 | |
/// An iterator that decodes the bytes produced by an underlying iterator as
/// UTF-8, yielding for each sequence either the decoded `char` or an
/// `InvalidSequence` error.
///
/// Constructed via the `decode_utf8()` function.
#[unstable(feature = "decode_utf8", issue = "33906")]
#[derive(Clone, Debug)]
pub struct DecodeUtf8<I: Iterator<Item = u8>>(::iter::Peekable<I>);
793 | ||
794 | /// Decodes an `Iterator` of bytes as UTF-8. | |
795 | #[unstable(feature = "decode_utf8", issue = "33906")] | |
796 | #[inline] | |
797 | pub fn decode_utf8<I: IntoIterator<Item = u8>>(i: I) -> DecodeUtf8<I::IntoIter> { | |
798 | DecodeUtf8(i.into_iter().peekable()) | |
799 | } | |
800 | ||
/// `<DecodeUtf8 as Iterator>::next` returns this for an invalid input sequence.
///
/// The value carries no further information about the offending bytes.
#[unstable(feature = "decode_utf8", issue = "33906")]
#[derive(PartialEq, Eq, Debug)]
pub struct InvalidSequence(());
805 | ||
#[unstable(feature = "decode_utf8", issue = "33906")]
impl<I: Iterator<Item = u8>> Iterator for DecodeUtf8<I> {
    type Item = Result<char, InvalidSequence>;

    /// Decodes the next UTF-8 sequence from the underlying bytes.
    ///
    /// Returns `None` once the byte source is exhausted, `Some(Ok(c))` for a
    /// well-formed sequence and `Some(Err(InvalidSequence))` otherwise. An
    /// error always consumes at least the first byte, but never the byte
    /// that was found to be unexpected.
    #[inline]
    fn next(&mut self) -> Option<Result<char, InvalidSequence>> {
        self.0.next().map(|first_byte| {
            // Emit InvalidSequence according to
            // Unicode §5.22 Best Practice for U+FFFD Substitution
            // http://www.unicode.org/versions/Unicode9.0.0/ch05.pdf#G40630

            // Roughly: consume at least one byte,
            // then validate one byte at a time and stop before the first unexpected byte
            // (which might be the valid start of the next byte sequence).

            // Accumulates the payload bits; assigned by `first_byte!` in every
            // arm that reaches the end of the match.
            let mut code_point;
            // Initializes `code_point` with the payload bits of the leading byte.
            macro_rules! first_byte {
                ($mask: expr) => {
                    code_point = u32::from(first_byte & $mask)
                }
            }
            // Peeks at the next byte. If it matches the given range (any
            // continuation byte by default), its low six bits are appended to
            // `code_point` and the byte is consumed. Otherwise the byte is
            // left in place and the closure returns an error; the `return`
            // exits this closure, not `next` itself.
            macro_rules! continuation_byte {
                () => { continuation_byte!(0x80...0xBF) };
                ($range: pat) => {
                    match self.0.peek() {
                        Some(&byte @ $range) => {
                            code_point = (code_point << 6) | u32::from(byte & 0b0011_1111);
                            self.0.next();
                        }
                        _ => return Err(InvalidSequence(()))
                    }
                }
            }

            match first_byte {
                // One byte: the byte is the code point.
                0x00...0x7F => {
                    first_byte!(0b1111_1111);
                }
                // Two bytes.
                0xC2...0xDF => {
                    first_byte!(0b0001_1111);
                    continuation_byte!();
                }
                // Three bytes; the second byte is restricted for some leaders.
                0xE0 => {
                    first_byte!(0b0000_1111);
                    continuation_byte!(0xA0...0xBF);  // 0x80...0x9F here are overlong
                    continuation_byte!();
                }
                0xE1...0xEC | 0xEE...0xEF => {
                    first_byte!(0b0000_1111);
                    continuation_byte!();
                    continuation_byte!();
                }
                0xED => {
                    first_byte!(0b0000_1111);
                    continuation_byte!(0x80...0x9F);  // 0xA0...0xBF here are surrogates
                    continuation_byte!();
                }
                // Four bytes; the second byte is restricted for some leaders.
                0xF0 => {
                    first_byte!(0b0000_0111);
                    continuation_byte!(0x90...0xBF);  // 0x80...0x8F here are overlong
                    continuation_byte!();
                    continuation_byte!();
                }
                0xF1...0xF3 => {
                    first_byte!(0b0000_0111);
                    continuation_byte!();
                    continuation_byte!();
                    continuation_byte!();
                }
                0xF4 => {
                    first_byte!(0b0000_0111);
                    continuation_byte!(0x80...0x8F);  // 0x90...0xBF here are beyond char::MAX
                    continuation_byte!();
                    continuation_byte!();
                }
                _ => return Err(InvalidSequence(()))  // Illegal first byte, overlong, or beyond MAX
            }
            // SAFETY: the byte ranges accepted above are the ones annotated
            // as excluding overlong forms, surrogates and values beyond
            // `char::MAX`, so `code_point` is expected to be a valid `char`.
            unsafe {
                Ok(from_u32_unchecked(code_point))
            }
        })
    }
}

// Requires the byte source itself to be fused.
#[unstable(feature = "fused", issue = "35602")]
impl<I: FusedIterator<Item = u8>> FusedIterator for DecodeUtf8<I> {}