]>
Commit | Line | Data |
---|---|---|
83c7162d | 1 | //! impl char {} |
c34b1796 | 2 | |
6a06907d | 3 | use crate::intrinsics::likely; |
48663c56 XL |
4 | use crate::slice; |
5 | use crate::str::from_utf8_unchecked_mut; | |
6 | use crate::unicode::printable::is_printable; | |
dfeec247 | 7 | use crate::unicode::{self, conversions}; |
48663c56 | 8 | |
83c7162d | 9 | use super::*; |
32a655c1 | 10 | |
c34b1796 AL |
11 | #[lang = "char"] |
12 | impl char { | |
f9f354fc XL |
13 | /// The highest valid code point a `char` can have. |
14 | /// | |
15 | /// A `char` is a [Unicode Scalar Value], which means that it is a [Code | |
16 | /// Point], but only ones within a certain range. `MAX` is the highest valid | |
17 | /// code point that's a valid [Unicode Scalar Value]. | |
18 | /// | |
19 | /// [Unicode Scalar Value]: http://www.unicode.org/glossary/#unicode_scalar_value | |
20 | /// [Code Point]: http://www.unicode.org/glossary/#code_point | |
6a06907d | 21 | #[stable(feature = "assoc_char_consts", since = "1.52.0")] |
f9f354fc XL |
22 | pub const MAX: char = '\u{10ffff}'; |
23 | ||
24 | /// `U+FFFD REPLACEMENT CHARACTER` (�) is used in Unicode to represent a | |
25 | /// decoding error. | |
26 | /// | |
27 | /// It can occur, for example, when giving ill-formed UTF-8 bytes to | |
28 | /// [`String::from_utf8_lossy`](string/struct.String.html#method.from_utf8_lossy). | |
6a06907d | 29 | #[stable(feature = "assoc_char_consts", since = "1.52.0")] |
f9f354fc XL |
30 | pub const REPLACEMENT_CHARACTER: char = '\u{FFFD}'; |
31 | ||
32 | /// The version of [Unicode](http://www.unicode.org/) that the Unicode parts of | |
33 | /// `char` and `str` methods are based on. | |
34 | /// | |
35 | /// New versions of Unicode are released regularly and subsequently all methods | |
36 | /// in the standard library depending on Unicode are updated. Therefore the | |
37 | /// behavior of some `char` and `str` methods and the value of this constant | |
38 | /// changes over time. This is *not* considered to be a breaking change. | |
39 | /// | |
40 | /// The version numbering scheme is explained in | |
41 | /// [Unicode 11.0 or later, Section 3.1 Versions of the Unicode Standard](https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf#page=4). | |
6a06907d | 42 | #[stable(feature = "assoc_char_consts", since = "1.52.0")] |
f9f354fc XL |
43 | pub const UNICODE_VERSION: (u8, u8, u8) = crate::unicode::UNICODE_VERSION; |
44 | ||
45 | /// Creates an iterator over the UTF-16 encoded code points in `iter`, | |
46 | /// returning unpaired surrogates as `Err`s. | |
47 | /// | |
48 | /// # Examples | |
49 | /// | |
50 | /// Basic usage: | |
51 | /// | |
52 | /// ``` | |
53 | /// use std::char::decode_utf16; | |
54 | /// | |
55 | /// // 𝄞mus<invalid>ic<invalid> | |
56 | /// let v = [ | |
57 | /// 0xD834, 0xDD1E, 0x006d, 0x0075, 0x0073, 0xDD1E, 0x0069, 0x0063, 0xD834, | |
58 | /// ]; | |
59 | /// | |
60 | /// assert_eq!( | |
61 | /// decode_utf16(v.iter().cloned()) | |
62 | /// .map(|r| r.map_err(|e| e.unpaired_surrogate())) | |
63 | /// .collect::<Vec<_>>(), | |
64 | /// vec![ | |
65 | /// Ok('𝄞'), | |
66 | /// Ok('m'), Ok('u'), Ok('s'), | |
67 | /// Err(0xDD1E), | |
68 | /// Ok('i'), Ok('c'), | |
69 | /// Err(0xD834) | |
70 | /// ] | |
71 | /// ); | |
72 | /// ``` | |
73 | /// | |
74 | /// A lossy decoder can be obtained by replacing `Err` results with the replacement character: | |
75 | /// | |
76 | /// ``` | |
77 | /// use std::char::{decode_utf16, REPLACEMENT_CHARACTER}; | |
78 | /// | |
79 | /// // 𝄞mus<invalid>ic<invalid> | |
80 | /// let v = [ | |
81 | /// 0xD834, 0xDD1E, 0x006d, 0x0075, 0x0073, 0xDD1E, 0x0069, 0x0063, 0xD834, | |
82 | /// ]; | |
83 | /// | |
84 | /// assert_eq!( | |
85 | /// decode_utf16(v.iter().cloned()) | |
86 | /// .map(|r| r.unwrap_or(REPLACEMENT_CHARACTER)) | |
87 | /// .collect::<String>(), | |
88 | /// "𝄞mus�ic�" | |
89 | /// ); | |
90 | /// ``` | |
6a06907d | 91 | #[stable(feature = "assoc_char_funcs", since = "1.52.0")] |
f9f354fc XL |
92 | #[inline] |
93 | pub fn decode_utf16<I: IntoIterator<Item = u16>>(iter: I) -> DecodeUtf16<I::IntoIter> { | |
94 | super::decode::decode_utf16(iter) | |
95 | } | |
96 | ||
97 | /// Converts a `u32` to a `char`. | |
98 | /// | |
99 | /// Note that all `char`s are valid [`u32`]s, and can be cast to one with | |
100 | /// `as`: | |
101 | /// | |
102 | /// ``` | |
103 | /// let c = '💯'; | |
104 | /// let i = c as u32; | |
105 | /// | |
106 | /// assert_eq!(128175, i); | |
107 | /// ``` | |
108 | /// | |
109 | /// However, the reverse is not true: not all valid [`u32`]s are valid | |
110 | /// `char`s. `from_u32()` will return `None` if the input is not a valid value | |
111 | /// for a `char`. | |
112 | /// | |
f9f354fc XL |
113 | /// For an unsafe version of this function which ignores these checks, see |
114 | /// [`from_u32_unchecked`]. | |
115 | /// | |
116 | /// [`from_u32_unchecked`]: #method.from_u32_unchecked | |
117 | /// | |
118 | /// # Examples | |
119 | /// | |
120 | /// Basic usage: | |
121 | /// | |
122 | /// ``` | |
123 | /// use std::char; | |
124 | /// | |
125 | /// let c = char::from_u32(0x2764); | |
126 | /// | |
127 | /// assert_eq!(Some('❤'), c); | |
128 | /// ``` | |
129 | /// | |
130 | /// Returning `None` when the input is not a valid `char`: | |
131 | /// | |
132 | /// ``` | |
133 | /// use std::char; | |
134 | /// | |
135 | /// let c = char::from_u32(0x110000); | |
136 | /// | |
137 | /// assert_eq!(None, c); | |
138 | /// ``` | |
6a06907d | 139 | #[stable(feature = "assoc_char_funcs", since = "1.52.0")] |
f9f354fc XL |
140 | #[inline] |
141 | pub fn from_u32(i: u32) -> Option<char> { | |
142 | super::convert::from_u32(i) | |
143 | } | |
144 | ||
145 | /// Converts a `u32` to a `char`, ignoring validity. | |
146 | /// | |
147 | /// Note that all `char`s are valid [`u32`]s, and can be cast to one with | |
148 | /// `as`: | |
149 | /// | |
150 | /// ``` | |
151 | /// let c = '💯'; | |
152 | /// let i = c as u32; | |
153 | /// | |
154 | /// assert_eq!(128175, i); | |
155 | /// ``` | |
156 | /// | |
157 | /// However, the reverse is not true: not all valid [`u32`]s are valid | |
158 | /// `char`s. `from_u32_unchecked()` will ignore this, and blindly cast to | |
159 | /// `char`, possibly creating an invalid one. | |
160 | /// | |
f9f354fc XL |
161 | /// # Safety |
162 | /// | |
163 | /// This function is unsafe, as it may construct invalid `char` values. | |
164 | /// | |
165 | /// For a safe version of this function, see the [`from_u32`] function. | |
166 | /// | |
167 | /// [`from_u32`]: #method.from_u32 | |
168 | /// | |
169 | /// # Examples | |
170 | /// | |
171 | /// Basic usage: | |
172 | /// | |
173 | /// ``` | |
174 | /// use std::char; | |
175 | /// | |
176 | /// let c = unsafe { char::from_u32_unchecked(0x2764) }; | |
177 | /// | |
178 | /// assert_eq!('❤', c); | |
179 | /// ``` | |
6a06907d | 180 | #[stable(feature = "assoc_char_funcs", since = "1.52.0")] |
f9f354fc XL |
181 | #[inline] |
182 | pub unsafe fn from_u32_unchecked(i: u32) -> char { | |
f035d41b XL |
183 | // SAFETY: the safety contract must be upheld by the caller. |
184 | unsafe { super::convert::from_u32_unchecked(i) } | |
f9f354fc XL |
185 | } |
186 | ||
187 | /// Converts a digit in the given radix to a `char`. | |
188 | /// | |
189 | /// A 'radix' here is sometimes also called a 'base'. A radix of two | |
190 | /// indicates a binary number, a radix of ten, decimal, and a radix of | |
191 | /// sixteen, hexadecimal, to give some common values. Arbitrary | |
192 | /// radices are supported. | |
193 | /// | |
194 | /// `from_digit()` will return `None` if the input is not a digit in | |
195 | /// the given radix. | |
196 | /// | |
197 | /// # Panics | |
198 | /// | |
199 | /// Panics if given a radix larger than 36. | |
200 | /// | |
201 | /// # Examples | |
202 | /// | |
203 | /// Basic usage: | |
204 | /// | |
205 | /// ``` | |
206 | /// use std::char; | |
207 | /// | |
208 | /// let c = char::from_digit(4, 10); | |
209 | /// | |
210 | /// assert_eq!(Some('4'), c); | |
211 | /// | |
212 | /// // Decimal 11 is a single digit in base 16 | |
213 | /// let c = char::from_digit(11, 16); | |
214 | /// | |
215 | /// assert_eq!(Some('b'), c); | |
216 | /// ``` | |
217 | /// | |
218 | /// Returning `None` when the input is not a digit: | |
219 | /// | |
220 | /// ``` | |
221 | /// use std::char; | |
222 | /// | |
223 | /// let c = char::from_digit(20, 10); | |
224 | /// | |
225 | /// assert_eq!(None, c); | |
226 | /// ``` | |
227 | /// | |
228 | /// Passing a large radix, causing a panic: | |
229 | /// | |
f035d41b | 230 | /// ```should_panic |
f9f354fc XL |
231 | /// use std::char; |
232 | /// | |
f035d41b XL |
233 | /// // this panics |
234 | /// char::from_digit(1, 37); | |
f9f354fc | 235 | /// ``` |
6a06907d | 236 | #[stable(feature = "assoc_char_funcs", since = "1.52.0")] |
f9f354fc XL |
237 | #[inline] |
238 | pub fn from_digit(num: u32, radix: u32) -> Option<char> { | |
239 | super::convert::from_digit(num, radix) | |
240 | } | |
241 | ||
b039eaaf SL |
242 | /// Checks if a `char` is a digit in the given radix. |
243 | /// | |
244 | /// A 'radix' here is sometimes also called a 'base'. A radix of two | |
245 | /// indicates a binary number, a radix of ten, decimal, and a radix of | |
9cc50fc6 | 246 | /// sixteen, hexadecimal, to give some common values. Arbitrary |
476ff2be | 247 | /// radices are supported. |
1a4d82fc | 248 | /// |
fc512014 | 249 | /// Compared to [`is_numeric()`], this function only recognizes the characters |
1a4d82fc JJ |
250 | /// `0-9`, `a-z` and `A-Z`. |
251 | /// | |
b039eaaf | 252 | /// 'Digit' is defined to be only the following characters: |
1a4d82fc | 253 | /// |
b039eaaf SL |
254 | /// * `0-9` |
255 | /// * `a-z` | |
256 | /// * `A-Z` | |
257 | /// | |
fc512014 | 258 | /// For a more comprehensive understanding of 'digit', see [`is_numeric()`]. |
b039eaaf | 259 | /// |
fc512014 | 260 | /// [`is_numeric()`]: #method.is_numeric |
1a4d82fc JJ |
261 | /// |
262 | /// # Panics | |
263 | /// | |
b039eaaf | 264 | /// Panics if given a radix larger than 36. |
c34b1796 AL |
265 | /// |
266 | /// # Examples | |
267 | /// | |
b039eaaf SL |
268 | /// Basic usage: |
269 | /// | |
c34b1796 | 270 | /// ``` |
54a0048b SL |
271 | /// assert!('1'.is_digit(10)); |
272 | /// assert!('f'.is_digit(16)); | |
273 | /// assert!(!'f'.is_digit(10)); | |
b039eaaf SL |
274 | /// ``` |
275 | /// | |
276 | /// Passing a large radix, causing a panic: | |
277 | /// | |
f035d41b XL |
278 | /// ```should_panic |
279 | /// // this panics | |
280 | /// '1'.is_digit(37); | |
c34b1796 AL |
281 | /// ``` |
282 | #[stable(feature = "rust1", since = "1.0.0")] | |
283 | #[inline] | |
b039eaaf | 284 | pub fn is_digit(self, radix: u32) -> bool { |
83c7162d | 285 | self.to_digit(radix).is_some() |
b039eaaf | 286 | } |
1a4d82fc | 287 | |
b039eaaf SL |
288 | /// Converts a `char` to a digit in the given radix. |
289 | /// | |
290 | /// A 'radix' here is sometimes also called a 'base'. A radix of two | |
291 | /// indicates a binary number, a radix of ten, decimal, and a radix of | |
9cc50fc6 | 292 | /// sixteen, hexadecimal, to give some common values. Arbitrary |
476ff2be | 293 | /// radices are supported. |
b039eaaf SL |
294 | /// |
295 | /// 'Digit' is defined to be only the following characters: | |
296 | /// | |
297 | /// * `0-9` | |
298 | /// * `a-z` | |
299 | /// * `A-Z` | |
1a4d82fc | 300 | /// |
7453a54e | 301 | /// # Errors |
1a4d82fc | 302 | /// |
b039eaaf | 303 | /// Returns `None` if the `char` does not refer to a digit in the given radix. |
1a4d82fc JJ |
304 | /// |
305 | /// # Panics | |
306 | /// | |
b039eaaf | 307 | /// Panics if given a radix larger than 36. |
c34b1796 AL |
308 | /// |
309 | /// # Examples | |
310 | /// | |
b039eaaf SL |
311 | /// Basic usage: |
312 | /// | |
313 | /// ``` | |
54a0048b SL |
314 | /// assert_eq!('1'.to_digit(10), Some(1)); |
315 | /// assert_eq!('f'.to_digit(16), Some(15)); | |
b039eaaf SL |
316 | /// ``` |
317 | /// | |
318 | /// Passing a non-digit results in failure: | |
319 | /// | |
320 | /// ``` | |
54a0048b SL |
321 | /// assert_eq!('f'.to_digit(10), None); |
322 | /// assert_eq!('z'.to_digit(16), None); | |
b039eaaf SL |
323 | /// ``` |
324 | /// | |
325 | /// Passing a large radix, causing a panic: | |
326 | /// | |
f035d41b XL |
327 | /// ```should_panic |
328 | /// // this panics | |
329 | /// '1'.to_digit(37); | |
c34b1796 AL |
330 | /// ``` |
331 | #[stable(feature = "rust1", since = "1.0.0")] | |
62682a34 | 332 | #[inline] |
b039eaaf | 333 | pub fn to_digit(self, radix: u32) -> Option<u32> { |
a1dfa0c6 | 334 | assert!(radix <= 36, "to_digit: radix is too high (maximum 36)"); |
a1dfa0c6 XL |
335 | // the code is split up here to improve execution speed for cases where |
336 | // the `radix` is constant and 10 or smaller | |
6a06907d XL |
337 | let val = if likely(radix <= 10) { |
338 | // If not a digit, a number greater than radix will be created. | |
339 | (self as u32).wrapping_sub('0' as u32) | |
a1dfa0c6 XL |
340 | } else { |
341 | match self { | |
342 | '0'..='9' => self as u32 - '0' as u32, | |
343 | 'a'..='z' => self as u32 - 'a' as u32 + 10, | |
344 | 'A'..='Z' => self as u32 - 'A' as u32 + 10, | |
345 | _ => return None, | |
346 | } | |
83c7162d | 347 | }; |
a1dfa0c6 | 348 | |
60c5eb7d | 349 | if val < radix { Some(val) } else { None } |
b039eaaf | 350 | } |
1a4d82fc | 351 | |
c34b1796 | 352 | /// Returns an iterator that yields the hexadecimal Unicode escape of a |
32a655c1 | 353 | /// character as `char`s. |
1a4d82fc | 354 | /// |
32a655c1 SL |
355 | /// This will escape characters with the Rust syntax of the form |
356 | /// `\u{NNNNNN}` where `NNNNNN` is a hexadecimal representation. | |
c34b1796 AL |
357 | /// |
358 | /// # Examples | |
359 | /// | |
32a655c1 | 360 | /// As an iterator: |
92a42be0 | 361 | /// |
c34b1796 | 362 | /// ``` |
62682a34 SL |
363 | /// for c in '❤'.escape_unicode() { |
364 | /// print!("{}", c); | |
c34b1796 | 365 | /// } |
32a655c1 | 366 | /// println!(); |
c34b1796 AL |
367 | /// ``` |
368 | /// | |
32a655c1 | 369 | /// Using `println!` directly: |
c34b1796 | 370 | /// |
32a655c1 SL |
371 | /// ``` |
372 | /// println!("{}", '❤'.escape_unicode()); | |
c34b1796 AL |
373 | /// ``` |
374 | /// | |
32a655c1 | 375 | /// Both are equivalent to: |
c34b1796 AL |
376 | /// |
377 | /// ``` | |
32a655c1 SL |
378 | /// println!("\\u{{2764}}"); |
379 | /// ``` | |
c34b1796 | 380 | /// |
32a655c1 SL |
381 | /// Using `to_string`: |
382 | /// | |
383 | /// ``` | |
384 | /// assert_eq!('❤'.escape_unicode().to_string(), "\\u{2764}"); | |
c34b1796 | 385 | /// ``` |
85aaf69f | 386 | #[stable(feature = "rust1", since = "1.0.0")] |
62682a34 | 387 | #[inline] |
b039eaaf | 388 | pub fn escape_unicode(self) -> EscapeUnicode { |
83c7162d XL |
389 | let c = self as u32; |
390 | ||
391 | // or-ing 1 ensures that for c==0 the code computes that one | |
392 | // digit should be printed and (which is the same) avoids the | |
393 | // (31 - 32) underflow | |
394 | let msb = 31 - (c | 1).leading_zeros(); | |
395 | ||
396 | // the index of the most significant hex digit | |
397 | let ms_hex_digit = msb / 4; | |
398 | EscapeUnicode { | |
399 | c: self, | |
400 | state: EscapeUnicodeState::Backslash, | |
401 | hex_digit_idx: ms_hex_digit as usize, | |
402 | } | |
b039eaaf | 403 | } |
1a4d82fc | 404 | |
94b46f34 XL |
405 | /// An extended version of `escape_debug` that optionally permits escaping |
406 | /// Extended Grapheme codepoints. This allows us to format characters like | |
407 | /// nonspacing marks better when they're at the start of a string. | |
94b46f34 | 408 | #[inline] |
9fa01778 | 409 | pub(crate) fn escape_debug_ext(self, escape_grapheme_extended: bool) -> EscapeDebug { |
94b46f34 XL |
410 | let init_state = match self { |
411 | '\t' => EscapeDefaultState::Backslash('t'), | |
412 | '\r' => EscapeDefaultState::Backslash('r'), | |
413 | '\n' => EscapeDefaultState::Backslash('n'), | |
414 | '\\' | '\'' | '"' => EscapeDefaultState::Backslash(self), | |
415 | _ if escape_grapheme_extended && self.is_grapheme_extended() => { | |
416 | EscapeDefaultState::Unicode(self.escape_unicode()) | |
417 | } | |
418 | _ if is_printable(self) => EscapeDefaultState::Char(self), | |
419 | _ => EscapeDefaultState::Unicode(self.escape_unicode()), | |
420 | }; | |
421 | EscapeDebug(EscapeDefault { state: init_state }) | |
422 | } | |
423 | ||
32a655c1 SL |
424 | /// Returns an iterator that yields the literal escape code of a character |
425 | /// as `char`s. | |
5bcae85e SL |
426 | /// |
427 | /// This will escape the characters similar to the `Debug` implementations | |
428 | /// of `str` or `char`. | |
429 | /// | |
430 | /// # Examples | |
431 | /// | |
32a655c1 | 432 | /// As an iterator: |
5bcae85e SL |
433 | /// |
434 | /// ``` | |
32a655c1 SL |
435 | /// for c in '\n'.escape_debug() { |
436 | /// print!("{}", c); | |
5bcae85e | 437 | /// } |
32a655c1 | 438 | /// println!(); |
5bcae85e SL |
439 | /// ``` |
440 | /// | |
32a655c1 | 441 | /// Using `println!` directly: |
5bcae85e | 442 | /// |
32a655c1 | 443 | /// ``` |
32a655c1 | 444 | /// println!("{}", '\n'.escape_debug()); |
5bcae85e SL |
445 | /// ``` |
446 | /// | |
32a655c1 | 447 | /// Both are equivalent to: |
5bcae85e SL |
448 | /// |
449 | /// ``` | |
32a655c1 SL |
450 | /// println!("\\n"); |
451 | /// ``` | |
5bcae85e | 452 | /// |
32a655c1 SL |
453 | /// Using `to_string`: |
454 | /// | |
455 | /// ``` | |
32a655c1 | 456 | /// assert_eq!('\n'.escape_debug().to_string(), "\\n"); |
5bcae85e | 457 | /// ``` |
041b39d2 | 458 | #[stable(feature = "char_escape_debug", since = "1.20.0")] |
5bcae85e SL |
459 | #[inline] |
460 | pub fn escape_debug(self) -> EscapeDebug { | |
94b46f34 | 461 | self.escape_debug_ext(true) |
5bcae85e SL |
462 | } |
463 | ||
32a655c1 SL |
464 | /// Returns an iterator that yields the literal escape code of a character |
465 | /// as `char`s. | |
1a4d82fc JJ |
466 | /// |
467 | /// The default is chosen with a bias toward producing literals that are | |
468 | /// legal in a variety of languages, including C++11 and similar C-family | |
469 | /// languages. The exact rules are: | |
470 | /// | |
b039eaaf SL |
471 | /// * Tab is escaped as `\t`. |
472 | /// * Carriage return is escaped as `\r`. | |
473 | /// * Line feed is escaped as `\n`. | |
474 | /// * Single quote is escaped as `\'`. | |
475 | /// * Double quote is escaped as `\"`. | |
476 | /// * Backslash is escaped as `\\`. | |
477 | /// * Any character in the 'printable ASCII' range `0x20` .. `0x7e` | |
478 | /// inclusive is not escaped. | |
479 | /// * All other characters are given hexadecimal Unicode escapes; see | |
fc512014 | 480 | /// [`escape_unicode`]. |
b039eaaf | 481 | /// |
fc512014 | 482 | /// [`escape_unicode`]: #method.escape_unicode |
c34b1796 AL |
483 | /// |
484 | /// # Examples | |
485 | /// | |
32a655c1 | 486 | /// As an iterator: |
b039eaaf | 487 | /// |
c34b1796 | 488 | /// ``` |
32a655c1 SL |
489 | /// for c in '"'.escape_default() { |
490 | /// print!("{}", c); | |
c34b1796 | 491 | /// } |
32a655c1 | 492 | /// println!(); |
c34b1796 AL |
493 | /// ``` |
494 | /// | |
32a655c1 | 495 | /// Using `println!` directly: |
c34b1796 | 496 | /// |
c34b1796 | 497 | /// ``` |
32a655c1 SL |
498 | /// println!("{}", '"'.escape_default()); |
499 | /// ``` | |
500 | /// | |
32a655c1 | 501 | /// Both are equivalent to: |
c34b1796 AL |
502 | /// |
503 | /// ``` | |
32a655c1 SL |
504 | /// println!("\\\""); |
505 | /// ``` | |
506 | /// | |
507 | /// Using `to_string`: | |
c34b1796 | 508 | /// |
32a655c1 SL |
509 | /// ``` |
510 | /// assert_eq!('"'.escape_default().to_string(), "\\\""); | |
c34b1796 | 511 | /// ``` |
85aaf69f | 512 | #[stable(feature = "rust1", since = "1.0.0")] |
62682a34 | 513 | #[inline] |
b039eaaf | 514 | pub fn escape_default(self) -> EscapeDefault { |
83c7162d XL |
515 | let init_state = match self { |
516 | '\t' => EscapeDefaultState::Backslash('t'), | |
517 | '\r' => EscapeDefaultState::Backslash('r'), | |
518 | '\n' => EscapeDefaultState::Backslash('n'), | |
519 | '\\' | '\'' | '"' => EscapeDefaultState::Backslash(self), | |
e74abb32 XL |
520 | '\x20'..='\x7e' => EscapeDefaultState::Char(self), |
521 | _ => EscapeDefaultState::Unicode(self.escape_unicode()), | |
83c7162d XL |
522 | }; |
523 | EscapeDefault { state: init_state } | |
b039eaaf | 524 | } |
1a4d82fc | 525 | |
b039eaaf SL |
526 | /// Returns the number of bytes this `char` would need if encoded in UTF-8. |
527 | /// | |
528 | /// That number of bytes is always between 1 and 4, inclusive. | |
c34b1796 AL |
529 | /// |
530 | /// # Examples | |
531 | /// | |
b039eaaf SL |
532 | /// Basic usage: |
533 | /// | |
c34b1796 | 534 | /// ``` |
b039eaaf SL |
535 | /// let len = 'A'.len_utf8(); |
536 | /// assert_eq!(len, 1); | |
537 | /// | |
538 | /// let len = 'ß'.len_utf8(); | |
539 | /// assert_eq!(len, 2); | |
540 | /// | |
541 | /// let len = 'ℝ'.len_utf8(); | |
542 | /// assert_eq!(len, 3); | |
c34b1796 | 543 | /// |
b039eaaf SL |
544 | /// let len = '💣'.len_utf8(); |
545 | /// assert_eq!(len, 4); | |
546 | /// ``` | |
547 | /// | |
548 | /// The `&str` type guarantees that its contents are UTF-8, and so we can compare the length it | |
549 | /// would take if each code point was represented as a `char` vs in the `&str` itself: | |
550 | /// | |
551 | /// ``` | |
552 | /// // as chars | |
553 | /// let eastern = '東'; | |
dc9dc135 | 554 | /// let capital = '京'; |
b039eaaf SL |
555 | /// |
556 | /// // both can be represented as three bytes | |
557 | /// assert_eq!(3, eastern.len_utf8()); | |
dc9dc135 | 558 | /// assert_eq!(3, capital.len_utf8()); |
b039eaaf SL |
559 | /// |
560 | /// // as a &str, these two are encoded in UTF-8 | |
561 | /// let tokyo = "東京"; | |
562 | /// | |
dc9dc135 | 563 | /// let len = eastern.len_utf8() + capital.len_utf8(); |
b039eaaf SL |
564 | /// |
565 | /// // we can see that they take six bytes total... | |
566 | /// assert_eq!(6, tokyo.len()); | |
567 | /// | |
568 | /// // ... just like the &str | |
569 | /// assert_eq!(len, tokyo.len()); | |
c34b1796 | 570 | /// ``` |
85aaf69f | 571 | #[stable(feature = "rust1", since = "1.0.0")] |
6a06907d | 572 | #[rustc_const_stable(feature = "const_char_len_utf", since = "1.52.0")] |
62682a34 | 573 | #[inline] |
6a06907d | 574 | pub const fn len_utf8(self) -> usize { |
f9f354fc | 575 | len_utf8(self as u32) |
b039eaaf | 576 | } |
1a4d82fc | 577 | |
b039eaaf | 578 | /// Returns the number of 16-bit code units this `char` would need if |
c34b1796 AL |
579 | /// encoded in UTF-16. |
580 | /// | |
fc512014 | 581 | /// See the documentation for [`len_utf8()`] for more explanation of this |
92a42be0 SL |
582 | /// concept. This function is a mirror, but for UTF-16 instead of UTF-8. |
583 | /// | |
fc512014 | 584 | /// [`len_utf8()`]: #method.len_utf8 |
b039eaaf | 585 | /// |
c34b1796 AL |
586 | /// # Examples |
587 | /// | |
92a42be0 SL |
588 | /// Basic usage: |
589 | /// | |
c34b1796 AL |
590 | /// ``` |
591 | /// let n = 'ß'.len_utf16(); | |
c34b1796 | 592 | /// assert_eq!(n, 1); |
b039eaaf SL |
593 | /// |
594 | /// let len = '💣'.len_utf16(); | |
595 | /// assert_eq!(len, 2); | |
c34b1796 | 596 | /// ``` |
85aaf69f | 597 | #[stable(feature = "rust1", since = "1.0.0")] |
6a06907d | 598 | #[rustc_const_stable(feature = "const_char_len_utf", since = "1.52.0")] |
62682a34 | 599 | #[inline] |
6a06907d | 600 | pub const fn len_utf16(self) -> usize { |
83c7162d XL |
601 | let ch = self as u32; |
602 | if (ch & 0xFFFF) == ch { 1 } else { 2 } | |
b039eaaf | 603 | } |
1a4d82fc | 604 | |
c30ab7b3 SL |
605 | /// Encodes this character as UTF-8 into the provided byte buffer, |
606 | /// and then returns the subslice of the buffer that contains the encoded character. | |
c34b1796 | 607 | /// |
c30ab7b3 SL |
608 | /// # Panics |
609 | /// | |
610 | /// Panics if the buffer is not large enough. | |
611 | /// A buffer of length four is large enough to encode any `char`. | |
c34b1796 AL |
612 | /// |
613 | /// # Examples | |
614 | /// | |
c30ab7b3 SL |
615 | /// In both of these examples, 'ß' takes two bytes to encode. |
616 | /// | |
c34b1796 | 617 | /// ``` |
c30ab7b3 | 618 | /// let mut b = [0; 2]; |
c34b1796 | 619 | /// |
c30ab7b3 SL |
620 | /// let result = 'ß'.encode_utf8(&mut b); |
621 | /// | |
622 | /// assert_eq!(result, "ß"); | |
623 | /// | |
624 | /// assert_eq!(result.len(), 2); | |
625 | /// ``` | |
626 | /// | |
627 | /// A buffer that's too small: | |
628 | /// | |
f035d41b XL |
629 | /// ```should_panic |
630 | /// let mut b = [0; 1]; | |
c30ab7b3 | 631 | /// |
f035d41b XL |
632 | /// // this panics |
633 | /// 'ß'.encode_utf8(&mut b); | |
c34b1796 | 634 | /// ``` |
476ff2be | 635 | #[stable(feature = "unicode_encode_char", since = "1.15.0")] |
62682a34 | 636 | #[inline] |
c30ab7b3 | 637 | pub fn encode_utf8(self, dst: &mut [u8]) -> &mut str { |
f9f354fc XL |
638 | // SAFETY: `char` is not a surrogate, so this is valid UTF-8. |
639 | unsafe { from_utf8_unchecked_mut(encode_utf8_raw(self as u32, dst)) } | |
62682a34 | 640 | } |
1a4d82fc | 641 | |
c30ab7b3 SL |
642 | /// Encodes this character as UTF-16 into the provided `u16` buffer, |
643 | /// and then returns the subslice of the buffer that contains the encoded character. | |
c34b1796 | 644 | /// |
c30ab7b3 SL |
645 | /// # Panics |
646 | /// | |
647 | /// Panics if the buffer is not large enough. | |
648 | /// A buffer of length 2 is large enough to encode any `char`. | |
c34b1796 AL |
649 | /// |
650 | /// # Examples | |
651 | /// | |
c30ab7b3 SL |
652 | /// In both of these examples, '𝕊' takes two `u16`s to encode. |
653 | /// | |
c34b1796 | 654 | /// ``` |
c30ab7b3 | 655 | /// let mut b = [0; 2]; |
c34b1796 | 656 | /// |
c30ab7b3 SL |
657 | /// let result = '𝕊'.encode_utf16(&mut b); |
658 | /// | |
659 | /// assert_eq!(result.len(), 2); | |
660 | /// ``` | |
661 | /// | |
662 | /// A buffer that's too small: | |
663 | /// | |
f035d41b XL |
664 | /// ```should_panic |
665 | /// let mut b = [0; 1]; | |
c30ab7b3 | 666 | /// |
f035d41b XL |
667 | /// // this panics |
668 | /// '𝕊'.encode_utf16(&mut b); | |
c34b1796 | 669 | /// ``` |
476ff2be | 670 | #[stable(feature = "unicode_encode_char", since = "1.15.0")] |
62682a34 | 671 | #[inline] |
c30ab7b3 | 672 | pub fn encode_utf16(self, dst: &mut [u16]) -> &mut [u16] { |
f9f354fc | 673 | encode_utf16_raw(self as u32, dst) |
62682a34 | 674 | } |
1a4d82fc | 675 | |
e74abb32 XL |
676 | /// Returns `true` if this `char` has the `Alphabetic` property. |
677 | /// | |
678 | /// `Alphabetic` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and | |
679 | /// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`]. | |
680 | /// | |
681 | /// [Unicode Standard]: https://www.unicode.org/versions/latest/ | |
682 | /// [ucd]: https://www.unicode.org/reports/tr44/ | |
683 | /// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt | |
b039eaaf SL |
684 | /// |
685 | /// # Examples | |
686 | /// | |
687 | /// Basic usage: | |
688 | /// | |
689 | /// ``` | |
54a0048b SL |
690 | /// assert!('a'.is_alphabetic()); |
691 | /// assert!('京'.is_alphabetic()); | |
b039eaaf SL |
692 | /// |
693 | /// let c = '💝'; | |
694 | /// // love is many things, but it is not alphabetic | |
695 | /// assert!(!c.is_alphabetic()); | |
696 | /// ``` | |
85aaf69f | 697 | #[stable(feature = "rust1", since = "1.0.0")] |
c34b1796 AL |
698 | #[inline] |
699 | pub fn is_alphabetic(self) -> bool { | |
700 | match self { | |
8faf50e0 | 701 | 'a'..='z' | 'A'..='Z' => true, |
dfeec247 | 702 | c => c > '\x7f' && unicode::Alphabetic(c), |
c34b1796 AL |
703 | } |
704 | } | |
1a4d82fc | 705 | |
e74abb32 | 706 | /// Returns `true` if this `char` has the `Lowercase` property. |
1a4d82fc | 707 | /// |
e74abb32 XL |
708 | /// `Lowercase` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and |
709 | /// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`]. | |
710 | /// | |
711 | /// [Unicode Standard]: https://www.unicode.org/versions/latest/ | |
712 | /// [ucd]: https://www.unicode.org/reports/tr44/ | |
713 | /// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt | |
b039eaaf SL |
714 | /// |
715 | /// # Examples | |
716 | /// | |
717 | /// Basic usage: | |
718 | /// | |
719 | /// ``` | |
54a0048b SL |
720 | /// assert!('a'.is_lowercase()); |
721 | /// assert!('δ'.is_lowercase()); | |
722 | /// assert!(!'A'.is_lowercase()); | |
723 | /// assert!(!'Δ'.is_lowercase()); | |
b039eaaf | 724 | /// |
f9f354fc | 725 | /// // The various Chinese scripts and punctuation do not have case, and so: |
54a0048b | 726 | /// assert!(!'中'.is_lowercase()); |
f9f354fc | 727 | /// assert!(!' '.is_lowercase()); |
b039eaaf | 728 | /// ``` |
85aaf69f | 729 | #[stable(feature = "rust1", since = "1.0.0")] |
c34b1796 AL |
730 | #[inline] |
731 | pub fn is_lowercase(self) -> bool { | |
732 | match self { | |
8faf50e0 | 733 | 'a'..='z' => true, |
dfeec247 | 734 | c => c > '\x7f' && unicode::Lowercase(c), |
c34b1796 AL |
735 | } |
736 | } | |
1a4d82fc | 737 | |
e74abb32 XL |
738 | /// Returns `true` if this `char` has the `Uppercase` property. |
739 | /// | |
740 | /// `Uppercase` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and | |
741 | /// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`]. | |
1a4d82fc | 742 | /// |
e74abb32 XL |
743 | /// [Unicode Standard]: https://www.unicode.org/versions/latest/ |
744 | /// [ucd]: https://www.unicode.org/reports/tr44/ | |
745 | /// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt | |
b039eaaf SL |
746 | /// |
747 | /// # Examples | |
748 | /// | |
749 | /// Basic usage: | |
750 | /// | |
751 | /// ``` | |
54a0048b SL |
752 | /// assert!(!'a'.is_uppercase()); |
753 | /// assert!(!'δ'.is_uppercase()); | |
754 | /// assert!('A'.is_uppercase()); | |
755 | /// assert!('Δ'.is_uppercase()); | |
b039eaaf | 756 | /// |
f9f354fc | 757 | /// // The various Chinese scripts and punctuation do not have case, and so: |
54a0048b | 758 | /// assert!(!'中'.is_uppercase()); |
f9f354fc | 759 | /// assert!(!' '.is_uppercase()); |
b039eaaf | 760 | /// ``` |
85aaf69f | 761 | #[stable(feature = "rust1", since = "1.0.0")] |
c34b1796 AL |
762 | #[inline] |
763 | pub fn is_uppercase(self) -> bool { | |
764 | match self { | |
8faf50e0 | 765 | 'A'..='Z' => true, |
dfeec247 | 766 | c => c > '\x7f' && unicode::Uppercase(c), |
c34b1796 AL |
767 | } |
768 | } | |
1a4d82fc | 769 | |
e74abb32 | 770 | /// Returns `true` if this `char` has the `White_Space` property. |
b039eaaf | 771 | /// |
e74abb32 XL |
772 | /// `White_Space` is specified in the [Unicode Character Database][ucd] [`PropList.txt`]. |
773 | /// | |
774 | /// [ucd]: https://www.unicode.org/reports/tr44/ | |
775 | /// [`PropList.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt | |
b039eaaf SL |
776 | /// |
777 | /// # Examples | |
1a4d82fc | 778 | /// |
b039eaaf SL |
779 | /// Basic usage: |
780 | /// | |
781 | /// ``` | |
54a0048b | 782 | /// assert!(' '.is_whitespace()); |
b039eaaf SL |
783 | /// |
784 | /// // a non-breaking space | |
54a0048b | 785 | /// assert!('\u{A0}'.is_whitespace()); |
b039eaaf | 786 | /// |
54a0048b | 787 | /// assert!(!'越'.is_whitespace()); |
b039eaaf | 788 | /// ``` |
85aaf69f | 789 | #[stable(feature = "rust1", since = "1.0.0")] |
c34b1796 AL |
790 | #[inline] |
791 | pub fn is_whitespace(self) -> bool { | |
792 | match self { | |
8faf50e0 | 793 | ' ' | '\x09'..='\x0d' => true, |
dfeec247 | 794 | c => c > '\x7f' && unicode::White_Space(c), |
c34b1796 AL |
795 | } |
796 | } | |
1a4d82fc | 797 | |
e74abb32 | 798 | /// Returns `true` if this `char` satisfies either [`is_alphabetic()`] or [`is_numeric()`]. |
1a4d82fc | 799 | /// |
e74abb32 XL |
800 | /// [`is_alphabetic()`]: #method.is_alphabetic |
801 | /// [`is_numeric()`]: #method.is_numeric | |
b039eaaf SL |
802 | /// |
803 | /// # Examples | |
804 | /// | |
805 | /// Basic usage: | |
806 | /// | |
807 | /// ``` | |
54a0048b SL |
808 | /// assert!('٣'.is_alphanumeric()); |
809 | /// assert!('7'.is_alphanumeric()); | |
810 | /// assert!('৬'.is_alphanumeric()); | |
b7449926 XL |
811 | /// assert!('¾'.is_alphanumeric()); |
812 | /// assert!('①'.is_alphanumeric()); | |
54a0048b SL |
813 | /// assert!('K'.is_alphanumeric()); |
814 | /// assert!('و'.is_alphanumeric()); | |
815 | /// assert!('藏'.is_alphanumeric()); | |
b039eaaf | 816 | /// ``` |
85aaf69f | 817 | #[stable(feature = "rust1", since = "1.0.0")] |
c34b1796 AL |
818 | #[inline] |
819 | pub fn is_alphanumeric(self) -> bool { | |
820 | self.is_alphabetic() || self.is_numeric() | |
821 | } | |
1a4d82fc | 822 | |
e74abb32 XL |
823 | /// Returns `true` if this `char` has the general category for control codes. |
824 | /// | |
825 | /// Control codes (code points with the general category of `Cc`) are described in Chapter 4 | |
826 | /// (Character Properties) of the [Unicode Standard] and specified in the [Unicode Character | |
827 | /// Database][ucd] [`UnicodeData.txt`]. | |
1a4d82fc | 828 | /// |
e74abb32 XL |
829 | /// [Unicode Standard]: https://www.unicode.org/versions/latest/ |
830 | /// [ucd]: https://www.unicode.org/reports/tr44/ | |
831 | /// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt | |
b039eaaf SL |
832 | /// |
833 | /// # Examples | |
834 | /// | |
835 | /// Basic usage: | |
836 | /// | |
837 | /// ``` | |
838 | /// // U+009C, STRING TERMINATOR | |
54a0048b SL |
839 | /// assert!('\9c'.is_control()); |
840 | /// assert!(!'q'.is_control()); | |
b039eaaf | 841 | /// ``` |
85aaf69f | 842 | #[stable(feature = "rust1", since = "1.0.0")] |
c34b1796 | 843 | #[inline] |
b039eaaf | 844 | pub fn is_control(self) -> bool { |
dfeec247 | 845 | unicode::Cc(self) |
b039eaaf | 846 | } |
1a4d82fc | 847 | |
e74abb32 | 848 | /// Returns `true` if this `char` has the `Grapheme_Extend` property. |
94b46f34 | 849 | /// |
e74abb32 XL |
850 | /// `Grapheme_Extend` is described in [Unicode Standard Annex #29 (Unicode Text |
851 | /// Segmentation)][uax29] and specified in the [Unicode Character Database][ucd] | |
852 | /// [`DerivedCoreProperties.txt`]. | |
853 | /// | |
854 | /// [uax29]: https://www.unicode.org/reports/tr29/ | |
855 | /// [ucd]: https://www.unicode.org/reports/tr44/ | |
856 | /// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt | |
94b46f34 XL |
857 | #[inline] |
858 | pub(crate) fn is_grapheme_extended(self) -> bool { | |
dfeec247 | 859 | unicode::Grapheme_Extend(self) |
94b46f34 XL |
860 | } |
861 | ||
e74abb32 XL |
862 | /// Returns `true` if this `char` has one of the general categories for numbers. |
863 | /// | |
864 | /// The general categories for numbers (`Nd` for decimal digits, `Nl` for letter-like numeric | |
865 | /// characters, and `No` for other numeric characters) are specified in the [Unicode Character | |
866 | /// Database][ucd] [`UnicodeData.txt`]. | |
b039eaaf | 867 | /// |
e74abb32 XL |
868 | /// [Unicode Standard]: https://www.unicode.org/versions/latest/ |
869 | /// [ucd]: https://www.unicode.org/reports/tr44/ | |
870 | /// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt | |
b039eaaf SL |
871 | /// |
872 | /// # Examples | |
873 | /// | |
874 | /// Basic usage: | |
875 | /// | |
876 | /// ``` | |
54a0048b SL |
877 | /// assert!('٣'.is_numeric()); |
878 | /// assert!('7'.is_numeric()); | |
879 | /// assert!('৬'.is_numeric()); | |
b7449926 XL |
880 | /// assert!('¾'.is_numeric()); |
881 | /// assert!('①'.is_numeric()); | |
54a0048b SL |
882 | /// assert!(!'K'.is_numeric()); |
883 | /// assert!(!'و'.is_numeric()); | |
884 | /// assert!(!'藏'.is_numeric()); | |
b039eaaf | 885 | /// ``` |
85aaf69f | 886 | #[stable(feature = "rust1", since = "1.0.0")] |
c34b1796 AL |
887 | #[inline] |
888 | pub fn is_numeric(self) -> bool { | |
889 | match self { | |
8faf50e0 | 890 | '0'..='9' => true, |
dfeec247 | 891 | c => c > '\x7f' && unicode::N(c), |
c34b1796 AL |
892 | } |
893 | } | |
1a4d82fc | 894 | |
e74abb32 XL |
895 | /// Returns an iterator that yields the lowercase mapping of this `char` as one or more |
896 | /// `char`s. | |
1a4d82fc | 897 | /// |
e74abb32 | 898 | /// If this `char` does not have a lowercase mapping, the iterator yields the same `char`. |
1a4d82fc | 899 | /// |
e74abb32 XL |
900 | /// If this `char` has a one-to-one lowercase mapping given by the [Unicode Character |
901 | /// Database][ucd] [`UnicodeData.txt`], the iterator yields that `char`. | |
1a4d82fc | 902 | /// |
e74abb32 XL |
903 | /// [ucd]: https://www.unicode.org/reports/tr44/ |
904 | /// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt | |
b039eaaf | 905 | /// |
e74abb32 XL |
906 | /// If this `char` requires special considerations (e.g. multiple `char`s) the iterator yields |
907 | /// the `char`(s) given by [`SpecialCasing.txt`]. | |
b039eaaf | 908 | /// |
e74abb32 | 909 | /// [`SpecialCasing.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/SpecialCasing.txt |
b039eaaf | 910 | /// |
e74abb32 XL |
911 | /// This operation performs an unconditional mapping without tailoring. That is, the conversion |
912 | /// is independent of context and language. | |
913 | /// | |
914 | /// In the [Unicode Standard], Chapter 4 (Character Properties) discusses case mapping in | |
915 | /// general and Chapter 3 (Conformance) discusses the default algorithm for case conversion. | |
916 | /// | |
917 | /// [Unicode Standard]: https://www.unicode.org/versions/latest/ | |
62682a34 SL |
918 | /// |
919 | /// # Examples | |
920 | /// | |
32a655c1 | 921 | /// As an iterator: |
b039eaaf | 922 | /// |
62682a34 | 923 | /// ``` |
32a655c1 SL |
924 | /// for c in 'İ'.to_lowercase() { |
925 | /// print!("{}", c); | |
926 | /// } | |
927 | /// println!(); | |
928 | /// ``` | |
929 | /// | |
930 | /// Using `println!` directly: | |
931 | /// | |
932 | /// ``` | |
933 | /// println!("{}", 'İ'.to_lowercase()); | |
934 | /// ``` | |
935 | /// | |
936 | /// Both are equivalent to: | |
937 | /// | |
938 | /// ``` | |
939 | /// println!("i\u{307}"); | |
940 | /// ``` | |
941 | /// | |
942 | /// Using `to_string`: | |
943 | /// | |
944 | /// ``` | |
945 | /// assert_eq!('C'.to_lowercase().to_string(), "c"); | |
3157f602 XL |
946 | /// |
947 | /// // Sometimes the result is more than one character: | |
32a655c1 | 948 | /// assert_eq!('İ'.to_lowercase().to_string(), "i\u{307}"); |
b039eaaf | 949 | /// |
cc61c64b XL |
950 | /// // Characters that do not have both uppercase and lowercase |
951 | /// // convert into themselves. | |
32a655c1 | 952 | /// assert_eq!('山'.to_lowercase().to_string(), "山"); |
62682a34 | 953 | /// ``` |
c34b1796 AL |
954 | #[stable(feature = "rust1", since = "1.0.0")] |
955 | #[inline] | |
956 | pub fn to_lowercase(self) -> ToLowercase { | |
62682a34 SL |
957 | ToLowercase(CaseMappingIter::new(conversions::to_lower(self))) |
958 | } | |
959 | ||
e74abb32 XL |
960 | /// Returns an iterator that yields the uppercase mapping of this `char` as one or more |
961 | /// `char`s. | |
962 | /// | |
963 | /// If this `char` does not have a uppercase mapping, the iterator yields the same `char`. | |
964 | /// | |
965 | /// If this `char` has a one-to-one uppercase mapping given by the [Unicode Character | |
966 | /// Database][ucd] [`UnicodeData.txt`], the iterator yields that `char`. | |
1a4d82fc | 967 | /// |
e74abb32 XL |
968 | /// [ucd]: https://www.unicode.org/reports/tr44/ |
969 | /// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt | |
1a4d82fc | 970 | /// |
e74abb32 XL |
971 | /// If this `char` requires special considerations (e.g. multiple `char`s) the iterator yields |
972 | /// the `char`(s) given by [`SpecialCasing.txt`]. | |
1a4d82fc | 973 | /// |
e74abb32 | 974 | /// [`SpecialCasing.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/SpecialCasing.txt |
1a4d82fc | 975 | /// |
e74abb32 XL |
976 | /// This operation performs an unconditional mapping without tailoring. That is, the conversion |
977 | /// is independent of context and language. | |
1a4d82fc | 978 | /// |
e74abb32 XL |
979 | /// In the [Unicode Standard], Chapter 4 (Character Properties) discusses case mapping in |
980 | /// general and Chapter 3 (Conformance) discusses the default algorithm for case conversion. | |
62682a34 | 981 | /// |
e74abb32 | 982 | /// [Unicode Standard]: https://www.unicode.org/versions/latest/ |
1a4d82fc | 983 | /// |
62682a34 SL |
984 | /// # Examples |
985 | /// | |
32a655c1 SL |
986 | /// As an iterator: |
987 | /// | |
988 | /// ``` | |
989 | /// for c in 'ß'.to_uppercase() { | |
990 | /// print!("{}", c); | |
991 | /// } | |
992 | /// println!(); | |
993 | /// ``` | |
994 | /// | |
995 | /// Using `println!` directly: | |
996 | /// | |
997 | /// ``` | |
998 | /// println!("{}", 'ß'.to_uppercase()); | |
999 | /// ``` | |
1000 | /// | |
1001 | /// Both are equivalent to: | |
1002 | /// | |
1003 | /// ``` | |
1004 | /// println!("SS"); | |
1005 | /// ``` | |
1006 | /// | |
1007 | /// Using `to_string`: | |
b039eaaf SL |
1008 | /// |
1009 | /// ``` | |
32a655c1 | 1010 | /// assert_eq!('c'.to_uppercase().to_string(), "C"); |
3157f602 XL |
1011 | /// |
1012 | /// // Sometimes the result is more than one character: | |
32a655c1 | 1013 | /// assert_eq!('ß'.to_uppercase().to_string(), "SS"); |
b039eaaf | 1014 | /// |
cc61c64b XL |
1015 | /// // Characters that do not have both uppercase and lowercase |
1016 | /// // convert into themselves. | |
32a655c1 | 1017 | /// assert_eq!('山'.to_uppercase().to_string(), "山"); |
b039eaaf SL |
1018 | /// ``` |
1019 | /// | |
32a655c1 SL |
1020 | /// # Note on locale |
1021 | /// | |
b039eaaf SL |
1022 | /// In Turkish, the equivalent of 'i' in Latin has five forms instead of two: |
1023 | /// | |
1024 | /// * 'Dotless': I / ı, sometimes written ï | |
1025 | /// * 'Dotted': İ / i | |
1026 | /// | |
1027 | /// Note that the lowercase dotted 'i' is the same as the Latin. Therefore: | |
1028 | /// | |
1029 | /// ``` | |
32a655c1 | 1030 | /// let upper_i = 'i'.to_uppercase().to_string(); |
b039eaaf SL |
1031 | /// ``` |
1032 | /// | |
1033 | /// The value of `upper_i` here relies on the language of the text: if we're | |
3157f602 XL |
1034 | /// in `en-US`, it should be `"I"`, but if we're in `tr_TR`, it should |
1035 | /// be `"İ"`. `to_uppercase()` does not take this into account, and so: | |
b039eaaf | 1036 | /// |
62682a34 | 1037 | /// ``` |
32a655c1 | 1038 | /// let upper_i = 'i'.to_uppercase().to_string(); |
b039eaaf | 1039 | /// |
3157f602 | 1040 | /// assert_eq!(upper_i, "I"); |
62682a34 | 1041 | /// ``` |
b039eaaf SL |
1042 | /// |
1043 | /// holds across languages. | |
c34b1796 AL |
1044 | #[stable(feature = "rust1", since = "1.0.0")] |
1045 | #[inline] | |
1046 | pub fn to_uppercase(self) -> ToUppercase { | |
62682a34 | 1047 | ToUppercase(CaseMappingIter::new(conversions::to_upper(self))) |
c34b1796 | 1048 | } |
abe05a73 XL |
1049 | |
1050 | /// Checks if the value is within the ASCII range. | |
1051 | /// | |
1052 | /// # Examples | |
1053 | /// | |
1054 | /// ``` | |
1055 | /// let ascii = 'a'; | |
1056 | /// let non_ascii = '❤'; | |
1057 | /// | |
1058 | /// assert!(ascii.is_ascii()); | |
1059 | /// assert!(!non_ascii.is_ascii()); | |
1060 | /// ``` | |
ff7c6d11 | 1061 | #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")] |
dfeec247 | 1062 | #[rustc_const_stable(feature = "const_ascii_methods_on_intrinsics", since = "1.32.0")] |
abe05a73 | 1063 | #[inline] |
a1dfa0c6 | 1064 | pub const fn is_ascii(&self) -> bool { |
abe05a73 XL |
1065 | *self as u32 <= 0x7F |
1066 | } | |
1067 | ||
1068 | /// Makes a copy of the value in its ASCII upper case equivalent. | |
1069 | /// | |
1070 | /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z', | |
1071 | /// but non-ASCII letters are unchanged. | |
1072 | /// | |
fc512014 | 1073 | /// To uppercase the value in-place, use [`make_ascii_uppercase()`]. |
abe05a73 XL |
1074 | /// |
1075 | /// To uppercase ASCII characters in addition to non-ASCII characters, use | |
fc512014 | 1076 | /// [`to_uppercase()`]. |
abe05a73 XL |
1077 | /// |
1078 | /// # Examples | |
1079 | /// | |
1080 | /// ``` | |
1081 | /// let ascii = 'a'; | |
1082 | /// let non_ascii = '❤'; | |
1083 | /// | |
1084 | /// assert_eq!('A', ascii.to_ascii_uppercase()); | |
1085 | /// assert_eq!('❤', non_ascii.to_ascii_uppercase()); | |
1086 | /// ``` | |
1087 | /// | |
fc512014 XL |
1088 | /// [`make_ascii_uppercase()`]: #method.make_ascii_uppercase |
1089 | /// [`to_uppercase()`]: #method.to_uppercase | |
ff7c6d11 | 1090 | #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")] |
6a06907d | 1091 | #[rustc_const_stable(feature = "const_ascii_methods_on_intrinsics", since = "1.52.0")] |
abe05a73 | 1092 | #[inline] |
6a06907d XL |
1093 | pub const fn to_ascii_uppercase(&self) -> char { |
1094 | if self.is_ascii_lowercase() { | |
1095 | (*self as u8).ascii_change_case_unchecked() as char | |
1096 | } else { | |
1097 | *self | |
1098 | } | |
abe05a73 XL |
1099 | } |
1100 | ||
1101 | /// Makes a copy of the value in its ASCII lower case equivalent. | |
1102 | /// | |
1103 | /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z', | |
1104 | /// but non-ASCII letters are unchanged. | |
1105 | /// | |
fc512014 | 1106 | /// To lowercase the value in-place, use [`make_ascii_lowercase()`]. |
abe05a73 XL |
1107 | /// |
1108 | /// To lowercase ASCII characters in addition to non-ASCII characters, use | |
fc512014 | 1109 | /// [`to_lowercase()`]. |
abe05a73 XL |
1110 | /// |
1111 | /// # Examples | |
1112 | /// | |
1113 | /// ``` | |
1114 | /// let ascii = 'A'; | |
1115 | /// let non_ascii = '❤'; | |
1116 | /// | |
1117 | /// assert_eq!('a', ascii.to_ascii_lowercase()); | |
1118 | /// assert_eq!('❤', non_ascii.to_ascii_lowercase()); | |
1119 | /// ``` | |
1120 | /// | |
fc512014 XL |
1121 | /// [`make_ascii_lowercase()`]: #method.make_ascii_lowercase |
1122 | /// [`to_lowercase()`]: #method.to_lowercase | |
ff7c6d11 | 1123 | #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")] |
6a06907d | 1124 | #[rustc_const_stable(feature = "const_ascii_methods_on_intrinsics", since = "1.52.0")] |
abe05a73 | 1125 | #[inline] |
6a06907d XL |
1126 | pub const fn to_ascii_lowercase(&self) -> char { |
1127 | if self.is_ascii_uppercase() { | |
1128 | (*self as u8).ascii_change_case_unchecked() as char | |
1129 | } else { | |
1130 | *self | |
1131 | } | |
abe05a73 XL |
1132 | } |
1133 | ||
1134 | /// Checks that two values are an ASCII case-insensitive match. | |
1135 | /// | |
1136 | /// Equivalent to `to_ascii_lowercase(a) == to_ascii_lowercase(b)`. | |
1137 | /// | |
1138 | /// # Examples | |
1139 | /// | |
1140 | /// ``` | |
1141 | /// let upper_a = 'A'; | |
1142 | /// let lower_a = 'a'; | |
1143 | /// let lower_z = 'z'; | |
1144 | /// | |
1145 | /// assert!(upper_a.eq_ignore_ascii_case(&lower_a)); | |
1146 | /// assert!(upper_a.eq_ignore_ascii_case(&upper_a)); | |
1147 | /// assert!(!upper_a.eq_ignore_ascii_case(&lower_z)); | |
1148 | /// ``` | |
ff7c6d11 | 1149 | #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")] |
6a06907d | 1150 | #[rustc_const_stable(feature = "const_ascii_methods_on_intrinsics", since = "1.52.0")] |
abe05a73 | 1151 | #[inline] |
6a06907d | 1152 | pub const fn eq_ignore_ascii_case(&self, other: &char) -> bool { |
abe05a73 XL |
1153 | self.to_ascii_lowercase() == other.to_ascii_lowercase() |
1154 | } | |
1155 | ||
1156 | /// Converts this type to its ASCII upper case equivalent in-place. | |
1157 | /// | |
1158 | /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z', | |
1159 | /// but non-ASCII letters are unchanged. | |
1160 | /// | |
1161 | /// To return a new uppercased value without modifying the existing one, use | |
fc512014 | 1162 | /// [`to_ascii_uppercase()`]. |
abe05a73 XL |
1163 | /// |
1164 | /// # Examples | |
1165 | /// | |
1166 | /// ``` | |
1167 | /// let mut ascii = 'a'; | |
1168 | /// | |
1169 | /// ascii.make_ascii_uppercase(); | |
1170 | /// | |
1171 | /// assert_eq!('A', ascii); | |
1172 | /// ``` | |
1173 | /// | |
fc512014 | 1174 | /// [`to_ascii_uppercase()`]: #method.to_ascii_uppercase |
ff7c6d11 | 1175 | #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")] |
abe05a73 XL |
1176 | #[inline] |
1177 | pub fn make_ascii_uppercase(&mut self) { | |
1178 | *self = self.to_ascii_uppercase(); | |
1179 | } | |
1180 | ||
1181 | /// Converts this type to its ASCII lower case equivalent in-place. | |
1182 | /// | |
1183 | /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z', | |
1184 | /// but non-ASCII letters are unchanged. | |
1185 | /// | |
1186 | /// To return a new lowercased value without modifying the existing one, use | |
fc512014 | 1187 | /// [`to_ascii_lowercase()`]. |
abe05a73 XL |
1188 | /// |
1189 | /// # Examples | |
1190 | /// | |
1191 | /// ``` | |
1192 | /// let mut ascii = 'A'; | |
1193 | /// | |
1194 | /// ascii.make_ascii_lowercase(); | |
1195 | /// | |
1196 | /// assert_eq!('a', ascii); | |
1197 | /// ``` | |
1198 | /// | |
fc512014 | 1199 | /// [`to_ascii_lowercase()`]: #method.to_ascii_lowercase |
ff7c6d11 | 1200 | #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")] |
abe05a73 XL |
1201 | #[inline] |
1202 | pub fn make_ascii_lowercase(&mut self) { | |
1203 | *self = self.to_ascii_lowercase(); | |
1204 | } | |
1205 | ||
1206 | /// Checks if the value is an ASCII alphabetic character: | |
1207 | /// | |
dc9dc135 XL |
1208 | /// - U+0041 'A' ..= U+005A 'Z', or |
1209 | /// - U+0061 'a' ..= U+007A 'z'. | |
abe05a73 XL |
1210 | /// |
1211 | /// # Examples | |
1212 | /// | |
1213 | /// ``` | |
abe05a73 XL |
1214 | /// let uppercase_a = 'A'; |
1215 | /// let uppercase_g = 'G'; | |
1216 | /// let a = 'a'; | |
1217 | /// let g = 'g'; | |
1218 | /// let zero = '0'; | |
1219 | /// let percent = '%'; | |
1220 | /// let space = ' '; | |
1221 | /// let lf = '\n'; | |
1222 | /// let esc: char = 0x1b_u8.into(); | |
1223 | /// | |
1224 | /// assert!(uppercase_a.is_ascii_alphabetic()); | |
1225 | /// assert!(uppercase_g.is_ascii_alphabetic()); | |
1226 | /// assert!(a.is_ascii_alphabetic()); | |
1227 | /// assert!(g.is_ascii_alphabetic()); | |
1228 | /// assert!(!zero.is_ascii_alphabetic()); | |
1229 | /// assert!(!percent.is_ascii_alphabetic()); | |
1230 | /// assert!(!space.is_ascii_alphabetic()); | |
1231 | /// assert!(!lf.is_ascii_alphabetic()); | |
1232 | /// assert!(!esc.is_ascii_alphabetic()); | |
1233 | /// ``` | |
ff7c6d11 | 1234 | #[stable(feature = "ascii_ctype_on_intrinsics", since = "1.24.0")] |
3dfed10e | 1235 | #[rustc_const_stable(feature = "const_ascii_ctype_on_intrinsics", since = "1.47.0")] |
abe05a73 | 1236 | #[inline] |
74b04a01 | 1237 | pub const fn is_ascii_alphabetic(&self) -> bool { |
29967ef6 | 1238 | matches!(*self, 'A'..='Z' | 'a'..='z') |
abe05a73 XL |
1239 | } |
1240 | ||
1241 | /// Checks if the value is an ASCII uppercase character: | |
dc9dc135 | 1242 | /// U+0041 'A' ..= U+005A 'Z'. |
abe05a73 XL |
1243 | /// |
1244 | /// # Examples | |
1245 | /// | |
1246 | /// ``` | |
abe05a73 XL |
1247 | /// let uppercase_a = 'A'; |
1248 | /// let uppercase_g = 'G'; | |
1249 | /// let a = 'a'; | |
1250 | /// let g = 'g'; | |
1251 | /// let zero = '0'; | |
1252 | /// let percent = '%'; | |
1253 | /// let space = ' '; | |
1254 | /// let lf = '\n'; | |
1255 | /// let esc: char = 0x1b_u8.into(); | |
1256 | /// | |
1257 | /// assert!(uppercase_a.is_ascii_uppercase()); | |
1258 | /// assert!(uppercase_g.is_ascii_uppercase()); | |
1259 | /// assert!(!a.is_ascii_uppercase()); | |
1260 | /// assert!(!g.is_ascii_uppercase()); | |
1261 | /// assert!(!zero.is_ascii_uppercase()); | |
1262 | /// assert!(!percent.is_ascii_uppercase()); | |
1263 | /// assert!(!space.is_ascii_uppercase()); | |
1264 | /// assert!(!lf.is_ascii_uppercase()); | |
1265 | /// assert!(!esc.is_ascii_uppercase()); | |
1266 | /// ``` | |
ff7c6d11 | 1267 | #[stable(feature = "ascii_ctype_on_intrinsics", since = "1.24.0")] |
3dfed10e | 1268 | #[rustc_const_stable(feature = "const_ascii_ctype_on_intrinsics", since = "1.47.0")] |
abe05a73 | 1269 | #[inline] |
74b04a01 | 1270 | pub const fn is_ascii_uppercase(&self) -> bool { |
29967ef6 | 1271 | matches!(*self, 'A'..='Z') |
abe05a73 XL |
1272 | } |
1273 | ||
1274 | /// Checks if the value is an ASCII lowercase character: | |
dc9dc135 | 1275 | /// U+0061 'a' ..= U+007A 'z'. |
abe05a73 XL |
1276 | /// |
1277 | /// # Examples | |
1278 | /// | |
1279 | /// ``` | |
abe05a73 XL |
1280 | /// let uppercase_a = 'A'; |
1281 | /// let uppercase_g = 'G'; | |
1282 | /// let a = 'a'; | |
1283 | /// let g = 'g'; | |
1284 | /// let zero = '0'; | |
1285 | /// let percent = '%'; | |
1286 | /// let space = ' '; | |
1287 | /// let lf = '\n'; | |
1288 | /// let esc: char = 0x1b_u8.into(); | |
1289 | /// | |
1290 | /// assert!(!uppercase_a.is_ascii_lowercase()); | |
1291 | /// assert!(!uppercase_g.is_ascii_lowercase()); | |
1292 | /// assert!(a.is_ascii_lowercase()); | |
1293 | /// assert!(g.is_ascii_lowercase()); | |
1294 | /// assert!(!zero.is_ascii_lowercase()); | |
1295 | /// assert!(!percent.is_ascii_lowercase()); | |
1296 | /// assert!(!space.is_ascii_lowercase()); | |
1297 | /// assert!(!lf.is_ascii_lowercase()); | |
1298 | /// assert!(!esc.is_ascii_lowercase()); | |
1299 | /// ``` | |
ff7c6d11 | 1300 | #[stable(feature = "ascii_ctype_on_intrinsics", since = "1.24.0")] |
3dfed10e | 1301 | #[rustc_const_stable(feature = "const_ascii_ctype_on_intrinsics", since = "1.47.0")] |
abe05a73 | 1302 | #[inline] |
74b04a01 | 1303 | pub const fn is_ascii_lowercase(&self) -> bool { |
29967ef6 | 1304 | matches!(*self, 'a'..='z') |
abe05a73 XL |
1305 | } |
1306 | ||
1307 | /// Checks if the value is an ASCII alphanumeric character: | |
1308 | /// | |
dc9dc135 XL |
1309 | /// - U+0041 'A' ..= U+005A 'Z', or |
1310 | /// - U+0061 'a' ..= U+007A 'z', or | |
1311 | /// - U+0030 '0' ..= U+0039 '9'. | |
abe05a73 XL |
1312 | /// |
1313 | /// # Examples | |
1314 | /// | |
1315 | /// ``` | |
abe05a73 XL |
1316 | /// let uppercase_a = 'A'; |
1317 | /// let uppercase_g = 'G'; | |
1318 | /// let a = 'a'; | |
1319 | /// let g = 'g'; | |
1320 | /// let zero = '0'; | |
1321 | /// let percent = '%'; | |
1322 | /// let space = ' '; | |
1323 | /// let lf = '\n'; | |
1324 | /// let esc: char = 0x1b_u8.into(); | |
1325 | /// | |
1326 | /// assert!(uppercase_a.is_ascii_alphanumeric()); | |
1327 | /// assert!(uppercase_g.is_ascii_alphanumeric()); | |
1328 | /// assert!(a.is_ascii_alphanumeric()); | |
1329 | /// assert!(g.is_ascii_alphanumeric()); | |
1330 | /// assert!(zero.is_ascii_alphanumeric()); | |
1331 | /// assert!(!percent.is_ascii_alphanumeric()); | |
1332 | /// assert!(!space.is_ascii_alphanumeric()); | |
1333 | /// assert!(!lf.is_ascii_alphanumeric()); | |
1334 | /// assert!(!esc.is_ascii_alphanumeric()); | |
1335 | /// ``` | |
ff7c6d11 | 1336 | #[stable(feature = "ascii_ctype_on_intrinsics", since = "1.24.0")] |
3dfed10e | 1337 | #[rustc_const_stable(feature = "const_ascii_ctype_on_intrinsics", since = "1.47.0")] |
abe05a73 | 1338 | #[inline] |
74b04a01 | 1339 | pub const fn is_ascii_alphanumeric(&self) -> bool { |
29967ef6 | 1340 | matches!(*self, '0'..='9' | 'A'..='Z' | 'a'..='z') |
abe05a73 XL |
1341 | } |
1342 | ||
1343 | /// Checks if the value is an ASCII decimal digit: | |
dc9dc135 | 1344 | /// U+0030 '0' ..= U+0039 '9'. |
abe05a73 XL |
1345 | /// |
1346 | /// # Examples | |
1347 | /// | |
1348 | /// ``` | |
abe05a73 XL |
1349 | /// let uppercase_a = 'A'; |
1350 | /// let uppercase_g = 'G'; | |
1351 | /// let a = 'a'; | |
1352 | /// let g = 'g'; | |
1353 | /// let zero = '0'; | |
1354 | /// let percent = '%'; | |
1355 | /// let space = ' '; | |
1356 | /// let lf = '\n'; | |
1357 | /// let esc: char = 0x1b_u8.into(); | |
1358 | /// | |
1359 | /// assert!(!uppercase_a.is_ascii_digit()); | |
1360 | /// assert!(!uppercase_g.is_ascii_digit()); | |
1361 | /// assert!(!a.is_ascii_digit()); | |
1362 | /// assert!(!g.is_ascii_digit()); | |
1363 | /// assert!(zero.is_ascii_digit()); | |
1364 | /// assert!(!percent.is_ascii_digit()); | |
1365 | /// assert!(!space.is_ascii_digit()); | |
1366 | /// assert!(!lf.is_ascii_digit()); | |
1367 | /// assert!(!esc.is_ascii_digit()); | |
1368 | /// ``` | |
ff7c6d11 | 1369 | #[stable(feature = "ascii_ctype_on_intrinsics", since = "1.24.0")] |
3dfed10e | 1370 | #[rustc_const_stable(feature = "const_ascii_ctype_on_intrinsics", since = "1.47.0")] |
abe05a73 | 1371 | #[inline] |
74b04a01 | 1372 | pub const fn is_ascii_digit(&self) -> bool { |
29967ef6 | 1373 | matches!(*self, '0'..='9') |
abe05a73 XL |
1374 | } |
1375 | ||
1376 | /// Checks if the value is an ASCII hexadecimal digit: | |
1377 | /// | |
dc9dc135 XL |
1378 | /// - U+0030 '0' ..= U+0039 '9', or |
1379 | /// - U+0041 'A' ..= U+0046 'F', or | |
1380 | /// - U+0061 'a' ..= U+0066 'f'. | |
abe05a73 XL |
1381 | /// |
1382 | /// # Examples | |
1383 | /// | |
1384 | /// ``` | |
abe05a73 XL |
1385 | /// let uppercase_a = 'A'; |
1386 | /// let uppercase_g = 'G'; | |
1387 | /// let a = 'a'; | |
1388 | /// let g = 'g'; | |
1389 | /// let zero = '0'; | |
1390 | /// let percent = '%'; | |
1391 | /// let space = ' '; | |
1392 | /// let lf = '\n'; | |
1393 | /// let esc: char = 0x1b_u8.into(); | |
1394 | /// | |
1395 | /// assert!(uppercase_a.is_ascii_hexdigit()); | |
1396 | /// assert!(!uppercase_g.is_ascii_hexdigit()); | |
1397 | /// assert!(a.is_ascii_hexdigit()); | |
1398 | /// assert!(!g.is_ascii_hexdigit()); | |
1399 | /// assert!(zero.is_ascii_hexdigit()); | |
1400 | /// assert!(!percent.is_ascii_hexdigit()); | |
1401 | /// assert!(!space.is_ascii_hexdigit()); | |
1402 | /// assert!(!lf.is_ascii_hexdigit()); | |
1403 | /// assert!(!esc.is_ascii_hexdigit()); | |
1404 | /// ``` | |
ff7c6d11 | 1405 | #[stable(feature = "ascii_ctype_on_intrinsics", since = "1.24.0")] |
3dfed10e | 1406 | #[rustc_const_stable(feature = "const_ascii_ctype_on_intrinsics", since = "1.47.0")] |
abe05a73 | 1407 | #[inline] |
74b04a01 | 1408 | pub const fn is_ascii_hexdigit(&self) -> bool { |
29967ef6 | 1409 | matches!(*self, '0'..='9' | 'A'..='F' | 'a'..='f') |
abe05a73 XL |
1410 | } |
1411 | ||
1412 | /// Checks if the value is an ASCII punctuation character: | |
1413 | /// | |
dc9dc135 XL |
1414 | /// - U+0021 ..= U+002F `! " # $ % & ' ( ) * + , - . /`, or |
1415 | /// - U+003A ..= U+0040 `: ; < = > ? @`, or | |
1416 | /// - U+005B ..= U+0060 ``[ \ ] ^ _ ` ``, or | |
1417 | /// - U+007B ..= U+007E `{ | } ~` | |
abe05a73 XL |
1418 | /// |
1419 | /// # Examples | |
1420 | /// | |
1421 | /// ``` | |
abe05a73 XL |
1422 | /// let uppercase_a = 'A'; |
1423 | /// let uppercase_g = 'G'; | |
1424 | /// let a = 'a'; | |
1425 | /// let g = 'g'; | |
1426 | /// let zero = '0'; | |
1427 | /// let percent = '%'; | |
1428 | /// let space = ' '; | |
1429 | /// let lf = '\n'; | |
1430 | /// let esc: char = 0x1b_u8.into(); | |
1431 | /// | |
1432 | /// assert!(!uppercase_a.is_ascii_punctuation()); | |
1433 | /// assert!(!uppercase_g.is_ascii_punctuation()); | |
1434 | /// assert!(!a.is_ascii_punctuation()); | |
1435 | /// assert!(!g.is_ascii_punctuation()); | |
1436 | /// assert!(!zero.is_ascii_punctuation()); | |
1437 | /// assert!(percent.is_ascii_punctuation()); | |
1438 | /// assert!(!space.is_ascii_punctuation()); | |
1439 | /// assert!(!lf.is_ascii_punctuation()); | |
1440 | /// assert!(!esc.is_ascii_punctuation()); | |
1441 | /// ``` | |
ff7c6d11 | 1442 | #[stable(feature = "ascii_ctype_on_intrinsics", since = "1.24.0")] |
3dfed10e | 1443 | #[rustc_const_stable(feature = "const_ascii_ctype_on_intrinsics", since = "1.47.0")] |
abe05a73 | 1444 | #[inline] |
74b04a01 | 1445 | pub const fn is_ascii_punctuation(&self) -> bool { |
29967ef6 | 1446 | matches!(*self, '!'..='/' | ':'..='@' | '['..='`' | '{'..='~') |
abe05a73 XL |
1447 | } |
1448 | ||
1449 | /// Checks if the value is an ASCII graphic character: | |
dc9dc135 | 1450 | /// U+0021 '!' ..= U+007E '~'. |
abe05a73 XL |
1451 | /// |
1452 | /// # Examples | |
1453 | /// | |
1454 | /// ``` | |
abe05a73 XL |
1455 | /// let uppercase_a = 'A'; |
1456 | /// let uppercase_g = 'G'; | |
1457 | /// let a = 'a'; | |
1458 | /// let g = 'g'; | |
1459 | /// let zero = '0'; | |
1460 | /// let percent = '%'; | |
1461 | /// let space = ' '; | |
1462 | /// let lf = '\n'; | |
1463 | /// let esc: char = 0x1b_u8.into(); | |
1464 | /// | |
1465 | /// assert!(uppercase_a.is_ascii_graphic()); | |
1466 | /// assert!(uppercase_g.is_ascii_graphic()); | |
1467 | /// assert!(a.is_ascii_graphic()); | |
1468 | /// assert!(g.is_ascii_graphic()); | |
1469 | /// assert!(zero.is_ascii_graphic()); | |
1470 | /// assert!(percent.is_ascii_graphic()); | |
1471 | /// assert!(!space.is_ascii_graphic()); | |
1472 | /// assert!(!lf.is_ascii_graphic()); | |
1473 | /// assert!(!esc.is_ascii_graphic()); | |
1474 | /// ``` | |
ff7c6d11 | 1475 | #[stable(feature = "ascii_ctype_on_intrinsics", since = "1.24.0")] |
3dfed10e | 1476 | #[rustc_const_stable(feature = "const_ascii_ctype_on_intrinsics", since = "1.47.0")] |
abe05a73 | 1477 | #[inline] |
74b04a01 | 1478 | pub const fn is_ascii_graphic(&self) -> bool { |
29967ef6 | 1479 | matches!(*self, '!'..='~') |
abe05a73 XL |
1480 | } |
1481 | ||
1482 | /// Checks if the value is an ASCII whitespace character: | |
1483 | /// U+0020 SPACE, U+0009 HORIZONTAL TAB, U+000A LINE FEED, | |
1484 | /// U+000C FORM FEED, or U+000D CARRIAGE RETURN. | |
1485 | /// | |
1486 | /// Rust uses the WhatWG Infra Standard's [definition of ASCII | |
1487 | /// whitespace][infra-aw]. There are several other definitions in | |
1488 | /// wide use. For instance, [the POSIX locale][pct] includes | |
1489 | /// U+000B VERTICAL TAB as well as all the above characters, | |
1490 | /// but—from the very same specification—[the default rule for | |
1491 | /// "field splitting" in the Bourne shell][bfs] considers *only* | |
1492 | /// SPACE, HORIZONTAL TAB, and LINE FEED as whitespace. | |
1493 | /// | |
1494 | /// If you are writing a program that will process an existing | |
1495 | /// file format, check what that format's definition of whitespace is | |
1496 | /// before using this function. | |
1497 | /// | |
1498 | /// [infra-aw]: https://infra.spec.whatwg.org/#ascii-whitespace | |
1499 | /// [pct]: http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap07.html#tag_07_03_01 | |
1500 | /// [bfs]: http://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html#tag_18_06_05 | |
1501 | /// | |
1502 | /// # Examples | |
1503 | /// | |
1504 | /// ``` | |
abe05a73 XL |
1505 | /// let uppercase_a = 'A'; |
1506 | /// let uppercase_g = 'G'; | |
1507 | /// let a = 'a'; | |
1508 | /// let g = 'g'; | |
1509 | /// let zero = '0'; | |
1510 | /// let percent = '%'; | |
1511 | /// let space = ' '; | |
1512 | /// let lf = '\n'; | |
1513 | /// let esc: char = 0x1b_u8.into(); | |
1514 | /// | |
1515 | /// assert!(!uppercase_a.is_ascii_whitespace()); | |
1516 | /// assert!(!uppercase_g.is_ascii_whitespace()); | |
1517 | /// assert!(!a.is_ascii_whitespace()); | |
1518 | /// assert!(!g.is_ascii_whitespace()); | |
1519 | /// assert!(!zero.is_ascii_whitespace()); | |
1520 | /// assert!(!percent.is_ascii_whitespace()); | |
1521 | /// assert!(space.is_ascii_whitespace()); | |
1522 | /// assert!(lf.is_ascii_whitespace()); | |
1523 | /// assert!(!esc.is_ascii_whitespace()); | |
1524 | /// ``` | |
ff7c6d11 | 1525 | #[stable(feature = "ascii_ctype_on_intrinsics", since = "1.24.0")] |
3dfed10e | 1526 | #[rustc_const_stable(feature = "const_ascii_ctype_on_intrinsics", since = "1.47.0")] |
abe05a73 | 1527 | #[inline] |
74b04a01 | 1528 | pub const fn is_ascii_whitespace(&self) -> bool { |
29967ef6 | 1529 | matches!(*self, '\t' | '\n' | '\x0C' | '\r' | ' ') |
abe05a73 XL |
1530 | } |
1531 | ||
1532 | /// Checks if the value is an ASCII control character: | |
dc9dc135 | 1533 | /// U+0000 NUL ..= U+001F UNIT SEPARATOR, or U+007F DELETE. |
abe05a73 XL |
1534 | /// Note that most ASCII whitespace characters are control |
1535 | /// characters, but SPACE is not. | |
1536 | /// | |
1537 | /// # Examples | |
1538 | /// | |
1539 | /// ``` | |
abe05a73 XL |
1540 | /// let uppercase_a = 'A'; |
1541 | /// let uppercase_g = 'G'; | |
1542 | /// let a = 'a'; | |
1543 | /// let g = 'g'; | |
1544 | /// let zero = '0'; | |
1545 | /// let percent = '%'; | |
1546 | /// let space = ' '; | |
1547 | /// let lf = '\n'; | |
1548 | /// let esc: char = 0x1b_u8.into(); | |
1549 | /// | |
1550 | /// assert!(!uppercase_a.is_ascii_control()); | |
1551 | /// assert!(!uppercase_g.is_ascii_control()); | |
1552 | /// assert!(!a.is_ascii_control()); | |
1553 | /// assert!(!g.is_ascii_control()); | |
1554 | /// assert!(!zero.is_ascii_control()); | |
1555 | /// assert!(!percent.is_ascii_control()); | |
1556 | /// assert!(!space.is_ascii_control()); | |
1557 | /// assert!(lf.is_ascii_control()); | |
1558 | /// assert!(esc.is_ascii_control()); | |
1559 | /// ``` | |
ff7c6d11 | 1560 | #[stable(feature = "ascii_ctype_on_intrinsics", since = "1.24.0")] |
3dfed10e | 1561 | #[rustc_const_stable(feature = "const_ascii_ctype_on_intrinsics", since = "1.47.0")] |
abe05a73 | 1562 | #[inline] |
74b04a01 | 1563 | pub const fn is_ascii_control(&self) -> bool { |
29967ef6 | 1564 | matches!(*self, '\0'..='\x1F' | '\x7F') |
abe05a73 | 1565 | } |
e9174d1e | 1566 | } |
f9f354fc XL |
1567 | |
1568 | #[inline] | |
6a06907d | 1569 | const fn len_utf8(code: u32) -> usize { |
f9f354fc XL |
1570 | if code < MAX_ONE_B { |
1571 | 1 | |
1572 | } else if code < MAX_TWO_B { | |
1573 | 2 | |
1574 | } else if code < MAX_THREE_B { | |
1575 | 3 | |
1576 | } else { | |
1577 | 4 | |
1578 | } | |
1579 | } | |
1580 | ||
1581 | /// Encodes a raw u32 value as UTF-8 into the provided byte buffer, | |
1582 | /// and then returns the subslice of the buffer that contains the encoded character. | |
1583 | /// | |
1584 | /// Unlike `char::encode_utf8`, this method also handles codepoints in the surrogate range. | |
1585 | /// (Creating a `char` in the surrogate range is UB.) | |
1586 | /// The result is valid [generalized UTF-8] but not valid UTF-8. | |
1587 | /// | |
1588 | /// [generalized UTF-8]: https://simonsapin.github.io/wtf-8/#generalized-utf8 | |
1589 | /// | |
1590 | /// # Panics | |
1591 | /// | |
1592 | /// Panics if the buffer is not large enough. | |
1593 | /// A buffer of length four is large enough to encode any `char`. | |
1594 | #[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")] | |
1595 | #[doc(hidden)] | |
1596 | #[inline] | |
1597 | pub fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> &mut [u8] { | |
1598 | let len = len_utf8(code); | |
1599 | match (len, &mut dst[..]) { | |
1600 | (1, [a, ..]) => { | |
1601 | *a = code as u8; | |
1602 | } | |
1603 | (2, [a, b, ..]) => { | |
1604 | *a = (code >> 6 & 0x1F) as u8 | TAG_TWO_B; | |
1605 | *b = (code & 0x3F) as u8 | TAG_CONT; | |
1606 | } | |
1607 | (3, [a, b, c, ..]) => { | |
1608 | *a = (code >> 12 & 0x0F) as u8 | TAG_THREE_B; | |
1609 | *b = (code >> 6 & 0x3F) as u8 | TAG_CONT; | |
1610 | *c = (code & 0x3F) as u8 | TAG_CONT; | |
1611 | } | |
1612 | (4, [a, b, c, d, ..]) => { | |
1613 | *a = (code >> 18 & 0x07) as u8 | TAG_FOUR_B; | |
1614 | *b = (code >> 12 & 0x3F) as u8 | TAG_CONT; | |
1615 | *c = (code >> 6 & 0x3F) as u8 | TAG_CONT; | |
1616 | *d = (code & 0x3F) as u8 | TAG_CONT; | |
1617 | } | |
1618 | _ => panic!( | |
1619 | "encode_utf8: need {} bytes to encode U+{:X}, but the buffer has {}", | |
1620 | len, | |
1621 | code, | |
1622 | dst.len(), | |
1623 | ), | |
1624 | }; | |
1625 | &mut dst[..len] | |
1626 | } | |
1627 | ||
1628 | /// Encodes a raw u32 value as UTF-16 into the provided `u16` buffer, | |
1629 | /// and then returns the subslice of the buffer that contains the encoded character. | |
1630 | /// | |
1631 | /// Unlike `char::encode_utf16`, this method also handles codepoints in the surrogate range. | |
1632 | /// (Creating a `char` in the surrogate range is UB.) | |
1633 | /// | |
1634 | /// # Panics | |
1635 | /// | |
1636 | /// Panics if the buffer is not large enough. | |
1637 | /// A buffer of length 2 is large enough to encode any `char`. | |
1638 | #[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")] | |
1639 | #[doc(hidden)] | |
1640 | #[inline] | |
pub fn encode_utf16_raw(mut code: u32, dst: &mut [u16]) -> &mut [u16] {
    // Number of `u16` units required: one for any value that fits in 16 bits
    // (the BMP, lone surrogates included), two otherwise.
    //
    // This is computed arithmetically rather than through
    // `from_u32_unchecked(code).len_utf16()`: `code` may be a surrogate, and
    // materializing a `char` from it -- even only to format the panic
    // message -- is undefined behavior.
    let len = if (code & 0xFFFF) == code { 1 } else { 2 };
    // Each arm requires both the computed length and a buffer with at least
    // that many units; any mismatch falls through to the panic arm.
    match (len, &mut dst[..]) {
        (1, [unit, ..]) => {
            *unit = code as u16;
        }
        (2, [high, low, ..]) => {
            // Supplementary planes break into a surrogate pair.
            code -= 0x1_0000;
            *high = 0xD800 | ((code >> 10) as u16);
            *low = 0xDC00 | ((code as u16) & 0x3FF);
        }
        _ => panic!(
            "encode_utf16: need {} units to encode U+{:X}, but the buffer has {}",
            len,
            code,
            dst.len(),
        ),
    };
    // SAFETY: reaching this point means one of the slice patterns above
    // matched, so `dst` holds at least `len` initialized `u16` units. The
    // returned slice reborrows `dst` mutably for the same lifetime, so no
    // aliasing is introduced.
    unsafe { slice::from_raw_parts_mut(dst.as_mut_ptr(), len) }
}