]>
Commit | Line | Data |
---|---|---|
83c7162d | 1 | //! impl char {} |
c34b1796 | 2 | |
6a06907d | 3 | use crate::intrinsics::likely; |
48663c56 XL |
4 | use crate::slice; |
5 | use crate::str::from_utf8_unchecked_mut; | |
6 | use crate::unicode::printable::is_printable; | |
dfeec247 | 7 | use crate::unicode::{self, conversions}; |
48663c56 | 8 | |
83c7162d | 9 | use super::*; |
32a655c1 | 10 | |
c34b1796 AL |
11 | #[lang = "char"] |
12 | impl char { | |
f9f354fc XL |
13 | /// The highest valid code point a `char` can have. |
14 | /// | |
15 | /// A `char` is a [Unicode Scalar Value], which means that it is a [Code | |
16 | /// Point], but only ones within a certain range. `MAX` is the highest valid | |
17 | /// code point that's a valid [Unicode Scalar Value]. | |
18 | /// | |
19 | /// [Unicode Scalar Value]: http://www.unicode.org/glossary/#unicode_scalar_value | |
20 | /// [Code Point]: http://www.unicode.org/glossary/#code_point | |
6a06907d | 21 | #[stable(feature = "assoc_char_consts", since = "1.52.0")] |
f9f354fc XL |
22 | pub const MAX: char = '\u{10ffff}'; |
23 | ||
24 | /// `U+FFFD REPLACEMENT CHARACTER` (�) is used in Unicode to represent a | |
25 | /// decoding error. | |
26 | /// | |
27 | /// It can occur, for example, when giving ill-formed UTF-8 bytes to | |
28 | /// [`String::from_utf8_lossy`](string/struct.String.html#method.from_utf8_lossy). | |
6a06907d | 29 | #[stable(feature = "assoc_char_consts", since = "1.52.0")] |
f9f354fc XL |
30 | pub const REPLACEMENT_CHARACTER: char = '\u{FFFD}'; |
31 | ||
32 | /// The version of [Unicode](http://www.unicode.org/) that the Unicode parts of | |
33 | /// `char` and `str` methods are based on. | |
34 | /// | |
35 | /// New versions of Unicode are released regularly and subsequently all methods | |
36 | /// in the standard library depending on Unicode are updated. Therefore the | |
37 | /// behavior of some `char` and `str` methods and the value of this constant | |
38 | /// changes over time. This is *not* considered to be a breaking change. | |
39 | /// | |
40 | /// The version numbering scheme is explained in | |
41 | /// [Unicode 11.0 or later, Section 3.1 Versions of the Unicode Standard](https://www.unicode.org/versions/Unicode11.0.0/ch03.pdf#page=4). | |
6a06907d | 42 | #[stable(feature = "assoc_char_consts", since = "1.52.0")] |
f9f354fc XL |
43 | pub const UNICODE_VERSION: (u8, u8, u8) = crate::unicode::UNICODE_VERSION; |
44 | ||
45 | /// Creates an iterator over the UTF-16 encoded code points in `iter`, | |
46 | /// returning unpaired surrogates as `Err`s. | |
47 | /// | |
48 | /// # Examples | |
49 | /// | |
50 | /// Basic usage: | |
51 | /// | |
52 | /// ``` | |
53 | /// use std::char::decode_utf16; | |
54 | /// | |
55 | /// // 𝄞mus<invalid>ic<invalid> | |
56 | /// let v = [ | |
57 | /// 0xD834, 0xDD1E, 0x006d, 0x0075, 0x0073, 0xDD1E, 0x0069, 0x0063, 0xD834, | |
58 | /// ]; | |
59 | /// | |
60 | /// assert_eq!( | |
61 | /// decode_utf16(v.iter().cloned()) | |
62 | /// .map(|r| r.map_err(|e| e.unpaired_surrogate())) | |
63 | /// .collect::<Vec<_>>(), | |
64 | /// vec![ | |
65 | /// Ok('𝄞'), | |
66 | /// Ok('m'), Ok('u'), Ok('s'), | |
67 | /// Err(0xDD1E), | |
68 | /// Ok('i'), Ok('c'), | |
69 | /// Err(0xD834) | |
70 | /// ] | |
71 | /// ); | |
72 | /// ``` | |
73 | /// | |
74 | /// A lossy decoder can be obtained by replacing `Err` results with the replacement character: | |
75 | /// | |
76 | /// ``` | |
77 | /// use std::char::{decode_utf16, REPLACEMENT_CHARACTER}; | |
78 | /// | |
79 | /// // 𝄞mus<invalid>ic<invalid> | |
80 | /// let v = [ | |
81 | /// 0xD834, 0xDD1E, 0x006d, 0x0075, 0x0073, 0xDD1E, 0x0069, 0x0063, 0xD834, | |
82 | /// ]; | |
83 | /// | |
84 | /// assert_eq!( | |
85 | /// decode_utf16(v.iter().cloned()) | |
86 | /// .map(|r| r.unwrap_or(REPLACEMENT_CHARACTER)) | |
87 | /// .collect::<String>(), | |
88 | /// "𝄞mus�ic�" | |
89 | /// ); | |
90 | /// ``` | |
6a06907d | 91 | #[stable(feature = "assoc_char_funcs", since = "1.52.0")] |
f9f354fc XL |
92 | #[inline] |
93 | pub fn decode_utf16<I: IntoIterator<Item = u16>>(iter: I) -> DecodeUtf16<I::IntoIter> { | |
94 | super::decode::decode_utf16(iter) | |
95 | } | |
96 | ||
97 | /// Converts a `u32` to a `char`. | |
98 | /// | |
99 | /// Note that all `char`s are valid [`u32`]s, and can be cast to one with | |
100 | /// `as`: | |
101 | /// | |
102 | /// ``` | |
103 | /// let c = '💯'; | |
104 | /// let i = c as u32; | |
105 | /// | |
106 | /// assert_eq!(128175, i); | |
107 | /// ``` | |
108 | /// | |
109 | /// However, the reverse is not true: not all valid [`u32`]s are valid | |
110 | /// `char`s. `from_u32()` will return `None` if the input is not a valid value | |
111 | /// for a `char`. | |
112 | /// | |
f9f354fc XL |
113 | /// For an unsafe version of this function which ignores these checks, see |
114 | /// [`from_u32_unchecked`]. | |
115 | /// | |
116 | /// [`from_u32_unchecked`]: #method.from_u32_unchecked | |
117 | /// | |
118 | /// # Examples | |
119 | /// | |
120 | /// Basic usage: | |
121 | /// | |
122 | /// ``` | |
123 | /// use std::char; | |
124 | /// | |
125 | /// let c = char::from_u32(0x2764); | |
126 | /// | |
127 | /// assert_eq!(Some('❤'), c); | |
128 | /// ``` | |
129 | /// | |
130 | /// Returning `None` when the input is not a valid `char`: | |
131 | /// | |
132 | /// ``` | |
133 | /// use std::char; | |
134 | /// | |
135 | /// let c = char::from_u32(0x110000); | |
136 | /// | |
137 | /// assert_eq!(None, c); | |
138 | /// ``` | |
6a06907d | 139 | #[stable(feature = "assoc_char_funcs", since = "1.52.0")] |
f9f354fc XL |
140 | #[inline] |
141 | pub fn from_u32(i: u32) -> Option<char> { | |
142 | super::convert::from_u32(i) | |
143 | } | |
144 | ||
145 | /// Converts a `u32` to a `char`, ignoring validity. | |
146 | /// | |
147 | /// Note that all `char`s are valid [`u32`]s, and can be cast to one with | |
148 | /// `as`: | |
149 | /// | |
150 | /// ``` | |
151 | /// let c = '💯'; | |
152 | /// let i = c as u32; | |
153 | /// | |
154 | /// assert_eq!(128175, i); | |
155 | /// ``` | |
156 | /// | |
157 | /// However, the reverse is not true: not all valid [`u32`]s are valid | |
158 | /// `char`s. `from_u32_unchecked()` will ignore this, and blindly cast to | |
159 | /// `char`, possibly creating an invalid one. | |
160 | /// | |
f9f354fc XL |
161 | /// # Safety |
162 | /// | |
163 | /// This function is unsafe, as it may construct invalid `char` values. | |
164 | /// | |
165 | /// For a safe version of this function, see the [`from_u32`] function. | |
166 | /// | |
167 | /// [`from_u32`]: #method.from_u32 | |
168 | /// | |
169 | /// # Examples | |
170 | /// | |
171 | /// Basic usage: | |
172 | /// | |
173 | /// ``` | |
174 | /// use std::char; | |
175 | /// | |
176 | /// let c = unsafe { char::from_u32_unchecked(0x2764) }; | |
177 | /// | |
178 | /// assert_eq!('❤', c); | |
179 | /// ``` | |
6a06907d | 180 | #[stable(feature = "assoc_char_funcs", since = "1.52.0")] |
f9f354fc XL |
181 | #[inline] |
182 | pub unsafe fn from_u32_unchecked(i: u32) -> char { | |
f035d41b XL |
183 | // SAFETY: the safety contract must be upheld by the caller. |
184 | unsafe { super::convert::from_u32_unchecked(i) } | |
f9f354fc XL |
185 | } |
186 | ||
187 | /// Converts a digit in the given radix to a `char`. | |
188 | /// | |
189 | /// A 'radix' here is sometimes also called a 'base'. A radix of two | |
190 | /// indicates a binary number, a radix of ten, decimal, and a radix of | |
191 | /// sixteen, hexadecimal, to give some common values. Arbitrary | |
192 | /// radices are supported. | |
193 | /// | |
194 | /// `from_digit()` will return `None` if the input is not a digit in | |
195 | /// the given radix. | |
196 | /// | |
197 | /// # Panics | |
198 | /// | |
199 | /// Panics if given a radix larger than 36. | |
200 | /// | |
201 | /// # Examples | |
202 | /// | |
203 | /// Basic usage: | |
204 | /// | |
205 | /// ``` | |
206 | /// use std::char; | |
207 | /// | |
208 | /// let c = char::from_digit(4, 10); | |
209 | /// | |
210 | /// assert_eq!(Some('4'), c); | |
211 | /// | |
212 | /// // Decimal 11 is a single digit in base 16 | |
213 | /// let c = char::from_digit(11, 16); | |
214 | /// | |
215 | /// assert_eq!(Some('b'), c); | |
216 | /// ``` | |
217 | /// | |
218 | /// Returning `None` when the input is not a digit: | |
219 | /// | |
220 | /// ``` | |
221 | /// use std::char; | |
222 | /// | |
223 | /// let c = char::from_digit(20, 10); | |
224 | /// | |
225 | /// assert_eq!(None, c); | |
226 | /// ``` | |
227 | /// | |
228 | /// Passing a large radix, causing a panic: | |
229 | /// | |
f035d41b | 230 | /// ```should_panic |
f9f354fc XL |
231 | /// use std::char; |
232 | /// | |
f035d41b XL |
233 | /// // this panics |
234 | /// char::from_digit(1, 37); | |
f9f354fc | 235 | /// ``` |
6a06907d | 236 | #[stable(feature = "assoc_char_funcs", since = "1.52.0")] |
f9f354fc XL |
237 | #[inline] |
238 | pub fn from_digit(num: u32, radix: u32) -> Option<char> { | |
239 | super::convert::from_digit(num, radix) | |
240 | } | |
241 | ||
b039eaaf SL |
242 | /// Checks if a `char` is a digit in the given radix. |
243 | /// | |
244 | /// A 'radix' here is sometimes also called a 'base'. A radix of two | |
245 | /// indicates a binary number, a radix of ten, decimal, and a radix of | |
9cc50fc6 | 246 | /// sixteen, hexadecimal, to give some common values. Arbitrary |
476ff2be | 247 | /// radices are supported. |
1a4d82fc | 248 | /// |
fc512014 | 249 | /// Compared to [`is_numeric()`], this function only recognizes the characters |
1a4d82fc JJ |
250 | /// `0-9`, `a-z` and `A-Z`. |
251 | /// | |
b039eaaf | 252 | /// 'Digit' is defined to be only the following characters: |
1a4d82fc | 253 | /// |
b039eaaf SL |
254 | /// * `0-9` |
255 | /// * `a-z` | |
256 | /// * `A-Z` | |
257 | /// | |
fc512014 | 258 | /// For a more comprehensive understanding of 'digit', see [`is_numeric()`]. |
b039eaaf | 259 | /// |
fc512014 | 260 | /// [`is_numeric()`]: #method.is_numeric |
1a4d82fc JJ |
261 | /// |
262 | /// # Panics | |
263 | /// | |
b039eaaf | 264 | /// Panics if given a radix larger than 36. |
c34b1796 AL |
265 | /// |
266 | /// # Examples | |
267 | /// | |
b039eaaf SL |
268 | /// Basic usage: |
269 | /// | |
c34b1796 | 270 | /// ``` |
54a0048b SL |
271 | /// assert!('1'.is_digit(10)); |
272 | /// assert!('f'.is_digit(16)); | |
273 | /// assert!(!'f'.is_digit(10)); | |
b039eaaf SL |
274 | /// ``` |
275 | /// | |
276 | /// Passing a large radix, causing a panic: | |
277 | /// | |
f035d41b XL |
278 | /// ```should_panic |
279 | /// // this panics | |
280 | /// '1'.is_digit(37); | |
c34b1796 AL |
281 | /// ``` |
282 | #[stable(feature = "rust1", since = "1.0.0")] | |
283 | #[inline] | |
b039eaaf | 284 | pub fn is_digit(self, radix: u32) -> bool { |
83c7162d | 285 | self.to_digit(radix).is_some() |
b039eaaf | 286 | } |
1a4d82fc | 287 | |
b039eaaf SL |
288 | /// Converts a `char` to a digit in the given radix. |
289 | /// | |
290 | /// A 'radix' here is sometimes also called a 'base'. A radix of two | |
291 | /// indicates a binary number, a radix of ten, decimal, and a radix of | |
9cc50fc6 | 292 | /// sixteen, hexadecimal, to give some common values. Arbitrary |
476ff2be | 293 | /// radices are supported. |
b039eaaf SL |
294 | /// |
295 | /// 'Digit' is defined to be only the following characters: | |
296 | /// | |
297 | /// * `0-9` | |
298 | /// * `a-z` | |
299 | /// * `A-Z` | |
1a4d82fc | 300 | /// |
7453a54e | 301 | /// # Errors |
1a4d82fc | 302 | /// |
b039eaaf | 303 | /// Returns `None` if the `char` does not refer to a digit in the given radix. |
1a4d82fc JJ |
304 | /// |
305 | /// # Panics | |
306 | /// | |
b039eaaf | 307 | /// Panics if given a radix larger than 36. |
c34b1796 AL |
308 | /// |
309 | /// # Examples | |
310 | /// | |
b039eaaf SL |
311 | /// Basic usage: |
312 | /// | |
313 | /// ``` | |
54a0048b SL |
314 | /// assert_eq!('1'.to_digit(10), Some(1)); |
315 | /// assert_eq!('f'.to_digit(16), Some(15)); | |
b039eaaf SL |
316 | /// ``` |
317 | /// | |
318 | /// Passing a non-digit results in failure: | |
319 | /// | |
320 | /// ``` | |
54a0048b SL |
321 | /// assert_eq!('f'.to_digit(10), None); |
322 | /// assert_eq!('z'.to_digit(16), None); | |
b039eaaf SL |
323 | /// ``` |
324 | /// | |
325 | /// Passing a large radix, causing a panic: | |
326 | /// | |
f035d41b XL |
327 | /// ```should_panic |
328 | /// // this panics | |
329 | /// '1'.to_digit(37); | |
c34b1796 AL |
330 | /// ``` |
331 | #[stable(feature = "rust1", since = "1.0.0")] | |
62682a34 | 332 | #[inline] |
b039eaaf | 333 | pub fn to_digit(self, radix: u32) -> Option<u32> { |
a1dfa0c6 | 334 | assert!(radix <= 36, "to_digit: radix is too high (maximum 36)"); |
a1dfa0c6 XL |
335 | // the code is split up here to improve execution speed for cases where |
336 | // the `radix` is constant and 10 or smaller | |
6a06907d XL |
337 | let val = if likely(radix <= 10) { |
338 | // If not a digit, a number greater than radix will be created. | |
339 | (self as u32).wrapping_sub('0' as u32) | |
a1dfa0c6 XL |
340 | } else { |
341 | match self { | |
342 | '0'..='9' => self as u32 - '0' as u32, | |
343 | 'a'..='z' => self as u32 - 'a' as u32 + 10, | |
344 | 'A'..='Z' => self as u32 - 'A' as u32 + 10, | |
345 | _ => return None, | |
346 | } | |
83c7162d | 347 | }; |
a1dfa0c6 | 348 | |
60c5eb7d | 349 | if val < radix { Some(val) } else { None } |
b039eaaf | 350 | } |
1a4d82fc | 351 | |
c34b1796 | 352 | /// Returns an iterator that yields the hexadecimal Unicode escape of a |
32a655c1 | 353 | /// character as `char`s. |
1a4d82fc | 354 | /// |
32a655c1 SL |
355 | /// This will escape characters with the Rust syntax of the form |
356 | /// `\u{NNNNNN}` where `NNNNNN` is a hexadecimal representation. | |
c34b1796 AL |
357 | /// |
358 | /// # Examples | |
359 | /// | |
32a655c1 | 360 | /// As an iterator: |
92a42be0 | 361 | /// |
c34b1796 | 362 | /// ``` |
62682a34 SL |
363 | /// for c in '❤'.escape_unicode() { |
364 | /// print!("{}", c); | |
c34b1796 | 365 | /// } |
32a655c1 | 366 | /// println!(); |
c34b1796 AL |
367 | /// ``` |
368 | /// | |
32a655c1 | 369 | /// Using `println!` directly: |
c34b1796 | 370 | /// |
32a655c1 SL |
371 | /// ``` |
372 | /// println!("{}", '❤'.escape_unicode()); | |
c34b1796 AL |
373 | /// ``` |
374 | /// | |
32a655c1 | 375 | /// Both are equivalent to: |
c34b1796 AL |
376 | /// |
377 | /// ``` | |
32a655c1 SL |
378 | /// println!("\\u{{2764}}"); |
379 | /// ``` | |
c34b1796 | 380 | /// |
32a655c1 SL |
381 | /// Using `to_string`: |
382 | /// | |
383 | /// ``` | |
384 | /// assert_eq!('❤'.escape_unicode().to_string(), "\\u{2764}"); | |
c34b1796 | 385 | /// ``` |
85aaf69f | 386 | #[stable(feature = "rust1", since = "1.0.0")] |
62682a34 | 387 | #[inline] |
b039eaaf | 388 | pub fn escape_unicode(self) -> EscapeUnicode { |
83c7162d XL |
389 | let c = self as u32; |
390 | ||
391 | // or-ing 1 ensures that for c==0 the code computes that one | |
392 | // digit should be printed and (which is the same) avoids the | |
393 | // (31 - 32) underflow | |
394 | let msb = 31 - (c | 1).leading_zeros(); | |
395 | ||
396 | // the index of the most significant hex digit | |
397 | let ms_hex_digit = msb / 4; | |
398 | EscapeUnicode { | |
399 | c: self, | |
400 | state: EscapeUnicodeState::Backslash, | |
401 | hex_digit_idx: ms_hex_digit as usize, | |
402 | } | |
b039eaaf | 403 | } |
1a4d82fc | 404 | |
94b46f34 | 405 | /// An extended version of `escape_debug` that optionally permits escaping |
cdc7bbd5 XL |
406 | /// Extended Grapheme codepoints, single quotes, and double quotes. This |
407 | /// allows us to format characters like nonspacing marks better when they're | |
408 | /// at the start of a string, and allows escaping single quotes in | |
409 | /// characters, and double quotes in strings. | |
94b46f34 | 410 | #[inline] |
cdc7bbd5 | 411 | pub(crate) fn escape_debug_ext(self, args: EscapeDebugExtArgs) -> EscapeDebug { |
94b46f34 XL |
412 | let init_state = match self { |
413 | '\t' => EscapeDefaultState::Backslash('t'), | |
414 | '\r' => EscapeDefaultState::Backslash('r'), | |
415 | '\n' => EscapeDefaultState::Backslash('n'), | |
cdc7bbd5 XL |
416 | '\\' => EscapeDefaultState::Backslash(self), |
417 | '"' if args.escape_double_quote => EscapeDefaultState::Backslash(self), | |
418 | '\'' if args.escape_single_quote => EscapeDefaultState::Backslash(self), | |
419 | _ if args.escape_grapheme_extended && self.is_grapheme_extended() => { | |
94b46f34 XL |
420 | EscapeDefaultState::Unicode(self.escape_unicode()) |
421 | } | |
422 | _ if is_printable(self) => EscapeDefaultState::Char(self), | |
423 | _ => EscapeDefaultState::Unicode(self.escape_unicode()), | |
424 | }; | |
425 | EscapeDebug(EscapeDefault { state: init_state }) | |
426 | } | |
427 | ||
32a655c1 SL |
428 | /// Returns an iterator that yields the literal escape code of a character |
429 | /// as `char`s. | |
5bcae85e SL |
430 | /// |
431 | /// This will escape the characters similar to the `Debug` implementations | |
432 | /// of `str` or `char`. | |
433 | /// | |
434 | /// # Examples | |
435 | /// | |
32a655c1 | 436 | /// As an iterator: |
5bcae85e SL |
437 | /// |
438 | /// ``` | |
32a655c1 SL |
439 | /// for c in '\n'.escape_debug() { |
440 | /// print!("{}", c); | |
5bcae85e | 441 | /// } |
32a655c1 | 442 | /// println!(); |
5bcae85e SL |
443 | /// ``` |
444 | /// | |
32a655c1 | 445 | /// Using `println!` directly: |
5bcae85e | 446 | /// |
32a655c1 | 447 | /// ``` |
32a655c1 | 448 | /// println!("{}", '\n'.escape_debug()); |
5bcae85e SL |
449 | /// ``` |
450 | /// | |
32a655c1 | 451 | /// Both are equivalent to: |
5bcae85e SL |
452 | /// |
453 | /// ``` | |
32a655c1 SL |
454 | /// println!("\\n"); |
455 | /// ``` | |
5bcae85e | 456 | /// |
32a655c1 SL |
457 | /// Using `to_string`: |
458 | /// | |
459 | /// ``` | |
32a655c1 | 460 | /// assert_eq!('\n'.escape_debug().to_string(), "\\n"); |
5bcae85e | 461 | /// ``` |
041b39d2 | 462 | #[stable(feature = "char_escape_debug", since = "1.20.0")] |
5bcae85e SL |
463 | #[inline] |
464 | pub fn escape_debug(self) -> EscapeDebug { | |
cdc7bbd5 | 465 | self.escape_debug_ext(EscapeDebugExtArgs::ESCAPE_ALL) |
5bcae85e SL |
466 | } |
467 | ||
32a655c1 SL |
468 | /// Returns an iterator that yields the literal escape code of a character |
469 | /// as `char`s. | |
1a4d82fc JJ |
470 | /// |
471 | /// The default is chosen with a bias toward producing literals that are | |
472 | /// legal in a variety of languages, including C++11 and similar C-family | |
473 | /// languages. The exact rules are: | |
474 | /// | |
b039eaaf SL |
475 | /// * Tab is escaped as `\t`. |
476 | /// * Carriage return is escaped as `\r`. | |
477 | /// * Line feed is escaped as `\n`. | |
478 | /// * Single quote is escaped as `\'`. | |
479 | /// * Double quote is escaped as `\"`. | |
480 | /// * Backslash is escaped as `\\`. | |
481 | /// * Any character in the 'printable ASCII' range `0x20` .. `0x7e` | |
482 | /// inclusive is not escaped. | |
483 | /// * All other characters are given hexadecimal Unicode escapes; see | |
fc512014 | 484 | /// [`escape_unicode`]. |
b039eaaf | 485 | /// |
fc512014 | 486 | /// [`escape_unicode`]: #method.escape_unicode |
c34b1796 AL |
487 | /// |
488 | /// # Examples | |
489 | /// | |
32a655c1 | 490 | /// As an iterator: |
b039eaaf | 491 | /// |
c34b1796 | 492 | /// ``` |
32a655c1 SL |
493 | /// for c in '"'.escape_default() { |
494 | /// print!("{}", c); | |
c34b1796 | 495 | /// } |
32a655c1 | 496 | /// println!(); |
c34b1796 AL |
497 | /// ``` |
498 | /// | |
32a655c1 | 499 | /// Using `println!` directly: |
c34b1796 | 500 | /// |
c34b1796 | 501 | /// ``` |
32a655c1 SL |
502 | /// println!("{}", '"'.escape_default()); |
503 | /// ``` | |
504 | /// | |
32a655c1 | 505 | /// Both are equivalent to: |
c34b1796 AL |
506 | /// |
507 | /// ``` | |
32a655c1 SL |
508 | /// println!("\\\""); |
509 | /// ``` | |
510 | /// | |
511 | /// Using `to_string`: | |
c34b1796 | 512 | /// |
32a655c1 SL |
513 | /// ``` |
514 | /// assert_eq!('"'.escape_default().to_string(), "\\\""); | |
c34b1796 | 515 | /// ``` |
85aaf69f | 516 | #[stable(feature = "rust1", since = "1.0.0")] |
62682a34 | 517 | #[inline] |
b039eaaf | 518 | pub fn escape_default(self) -> EscapeDefault { |
83c7162d XL |
519 | let init_state = match self { |
520 | '\t' => EscapeDefaultState::Backslash('t'), | |
521 | '\r' => EscapeDefaultState::Backslash('r'), | |
522 | '\n' => EscapeDefaultState::Backslash('n'), | |
523 | '\\' | '\'' | '"' => EscapeDefaultState::Backslash(self), | |
e74abb32 XL |
524 | '\x20'..='\x7e' => EscapeDefaultState::Char(self), |
525 | _ => EscapeDefaultState::Unicode(self.escape_unicode()), | |
83c7162d XL |
526 | }; |
527 | EscapeDefault { state: init_state } | |
b039eaaf | 528 | } |
1a4d82fc | 529 | |
b039eaaf SL |
530 | /// Returns the number of bytes this `char` would need if encoded in UTF-8. |
531 | /// | |
532 | /// That number of bytes is always between 1 and 4, inclusive. | |
c34b1796 AL |
533 | /// |
534 | /// # Examples | |
535 | /// | |
b039eaaf SL |
536 | /// Basic usage: |
537 | /// | |
c34b1796 | 538 | /// ``` |
b039eaaf SL |
539 | /// let len = 'A'.len_utf8(); |
540 | /// assert_eq!(len, 1); | |
541 | /// | |
542 | /// let len = 'ß'.len_utf8(); | |
543 | /// assert_eq!(len, 2); | |
544 | /// | |
545 | /// let len = 'ℝ'.len_utf8(); | |
546 | /// assert_eq!(len, 3); | |
c34b1796 | 547 | /// |
b039eaaf SL |
548 | /// let len = '💣'.len_utf8(); |
549 | /// assert_eq!(len, 4); | |
550 | /// ``` | |
551 | /// | |
552 | /// The `&str` type guarantees that its contents are UTF-8, and so we can compare the length it | |
553 | /// would take if each code point was represented as a `char` vs in the `&str` itself: | |
554 | /// | |
555 | /// ``` | |
556 | /// // as chars | |
557 | /// let eastern = '東'; | |
dc9dc135 | 558 | /// let capital = '京'; |
b039eaaf SL |
559 | /// |
560 | /// // both can be represented as three bytes | |
561 | /// assert_eq!(3, eastern.len_utf8()); | |
dc9dc135 | 562 | /// assert_eq!(3, capital.len_utf8()); |
b039eaaf SL |
563 | /// |
564 | /// // as a &str, these two are encoded in UTF-8 | |
565 | /// let tokyo = "東京"; | |
566 | /// | |
dc9dc135 | 567 | /// let len = eastern.len_utf8() + capital.len_utf8(); |
b039eaaf SL |
568 | /// |
569 | /// // we can see that they take six bytes total... | |
570 | /// assert_eq!(6, tokyo.len()); | |
571 | /// | |
572 | /// // ... just like the &str | |
573 | /// assert_eq!(len, tokyo.len()); | |
c34b1796 | 574 | /// ``` |
85aaf69f | 575 | #[stable(feature = "rust1", since = "1.0.0")] |
6a06907d | 576 | #[rustc_const_stable(feature = "const_char_len_utf", since = "1.52.0")] |
62682a34 | 577 | #[inline] |
6a06907d | 578 | pub const fn len_utf8(self) -> usize { |
f9f354fc | 579 | len_utf8(self as u32) |
b039eaaf | 580 | } |
1a4d82fc | 581 | |
b039eaaf | 582 | /// Returns the number of 16-bit code units this `char` would need if |
c34b1796 AL |
583 | /// encoded in UTF-16. |
584 | /// | |
fc512014 | 585 | /// See the documentation for [`len_utf8()`] for more explanation of this |
92a42be0 SL |
586 | /// concept. This function is a mirror, but for UTF-16 instead of UTF-8. |
587 | /// | |
fc512014 | 588 | /// [`len_utf8()`]: #method.len_utf8 |
b039eaaf | 589 | /// |
c34b1796 AL |
590 | /// # Examples |
591 | /// | |
92a42be0 SL |
592 | /// Basic usage: |
593 | /// | |
c34b1796 AL |
594 | /// ``` |
595 | /// let n = 'ß'.len_utf16(); | |
c34b1796 | 596 | /// assert_eq!(n, 1); |
b039eaaf SL |
597 | /// |
598 | /// let len = '💣'.len_utf16(); | |
599 | /// assert_eq!(len, 2); | |
c34b1796 | 600 | /// ``` |
85aaf69f | 601 | #[stable(feature = "rust1", since = "1.0.0")] |
6a06907d | 602 | #[rustc_const_stable(feature = "const_char_len_utf", since = "1.52.0")] |
62682a34 | 603 | #[inline] |
6a06907d | 604 | pub const fn len_utf16(self) -> usize { |
83c7162d XL |
605 | let ch = self as u32; |
606 | if (ch & 0xFFFF) == ch { 1 } else { 2 } | |
b039eaaf | 607 | } |
1a4d82fc | 608 | |
c30ab7b3 SL |
609 | /// Encodes this character as UTF-8 into the provided byte buffer, |
610 | /// and then returns the subslice of the buffer that contains the encoded character. | |
c34b1796 | 611 | /// |
c30ab7b3 SL |
612 | /// # Panics |
613 | /// | |
614 | /// Panics if the buffer is not large enough. | |
615 | /// A buffer of length four is large enough to encode any `char`. | |
c34b1796 AL |
616 | /// |
617 | /// # Examples | |
618 | /// | |
c30ab7b3 SL |
619 | /// In both of these examples, 'ß' takes two bytes to encode. |
620 | /// | |
c34b1796 | 621 | /// ``` |
c30ab7b3 | 622 | /// let mut b = [0; 2]; |
c34b1796 | 623 | /// |
c30ab7b3 SL |
624 | /// let result = 'ß'.encode_utf8(&mut b); |
625 | /// | |
626 | /// assert_eq!(result, "ß"); | |
627 | /// | |
628 | /// assert_eq!(result.len(), 2); | |
629 | /// ``` | |
630 | /// | |
631 | /// A buffer that's too small: | |
632 | /// | |
f035d41b XL |
633 | /// ```should_panic |
634 | /// let mut b = [0; 1]; | |
c30ab7b3 | 635 | /// |
f035d41b XL |
636 | /// // this panics |
637 | /// 'ß'.encode_utf8(&mut b); | |
c34b1796 | 638 | /// ``` |
476ff2be | 639 | #[stable(feature = "unicode_encode_char", since = "1.15.0")] |
62682a34 | 640 | #[inline] |
c30ab7b3 | 641 | pub fn encode_utf8(self, dst: &mut [u8]) -> &mut str { |
f9f354fc XL |
642 | // SAFETY: `char` is not a surrogate, so this is valid UTF-8. |
643 | unsafe { from_utf8_unchecked_mut(encode_utf8_raw(self as u32, dst)) } | |
62682a34 | 644 | } |
1a4d82fc | 645 | |
c30ab7b3 SL |
646 | /// Encodes this character as UTF-16 into the provided `u16` buffer, |
647 | /// and then returns the subslice of the buffer that contains the encoded character. | |
c34b1796 | 648 | /// |
c30ab7b3 SL |
649 | /// # Panics |
650 | /// | |
651 | /// Panics if the buffer is not large enough. | |
652 | /// A buffer of length 2 is large enough to encode any `char`. | |
c34b1796 AL |
653 | /// |
654 | /// # Examples | |
655 | /// | |
c30ab7b3 SL |
656 | /// In both of these examples, '𝕊' takes two `u16`s to encode. |
657 | /// | |
c34b1796 | 658 | /// ``` |
c30ab7b3 | 659 | /// let mut b = [0; 2]; |
c34b1796 | 660 | /// |
c30ab7b3 SL |
661 | /// let result = '𝕊'.encode_utf16(&mut b); |
662 | /// | |
663 | /// assert_eq!(result.len(), 2); | |
664 | /// ``` | |
665 | /// | |
666 | /// A buffer that's too small: | |
667 | /// | |
f035d41b XL |
668 | /// ```should_panic |
669 | /// let mut b = [0; 1]; | |
c30ab7b3 | 670 | /// |
f035d41b XL |
671 | /// // this panics |
672 | /// '𝕊'.encode_utf16(&mut b); | |
c34b1796 | 673 | /// ``` |
476ff2be | 674 | #[stable(feature = "unicode_encode_char", since = "1.15.0")] |
62682a34 | 675 | #[inline] |
c30ab7b3 | 676 | pub fn encode_utf16(self, dst: &mut [u16]) -> &mut [u16] { |
f9f354fc | 677 | encode_utf16_raw(self as u32, dst) |
62682a34 | 678 | } |
1a4d82fc | 679 | |
e74abb32 XL |
680 | /// Returns `true` if this `char` has the `Alphabetic` property. |
681 | /// | |
682 | /// `Alphabetic` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and | |
683 | /// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`]. | |
684 | /// | |
685 | /// [Unicode Standard]: https://www.unicode.org/versions/latest/ | |
686 | /// [ucd]: https://www.unicode.org/reports/tr44/ | |
687 | /// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt | |
b039eaaf SL |
688 | /// |
689 | /// # Examples | |
690 | /// | |
691 | /// Basic usage: | |
692 | /// | |
693 | /// ``` | |
54a0048b SL |
694 | /// assert!('a'.is_alphabetic()); |
695 | /// assert!('京'.is_alphabetic()); | |
b039eaaf SL |
696 | /// |
697 | /// let c = '💝'; | |
698 | /// // love is many things, but it is not alphabetic | |
699 | /// assert!(!c.is_alphabetic()); | |
700 | /// ``` | |
85aaf69f | 701 | #[stable(feature = "rust1", since = "1.0.0")] |
c34b1796 AL |
702 | #[inline] |
703 | pub fn is_alphabetic(self) -> bool { | |
704 | match self { | |
8faf50e0 | 705 | 'a'..='z' | 'A'..='Z' => true, |
dfeec247 | 706 | c => c > '\x7f' && unicode::Alphabetic(c), |
c34b1796 AL |
707 | } |
708 | } | |
1a4d82fc | 709 | |
e74abb32 | 710 | /// Returns `true` if this `char` has the `Lowercase` property. |
1a4d82fc | 711 | /// |
e74abb32 XL |
712 | /// `Lowercase` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and |
713 | /// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`]. | |
714 | /// | |
715 | /// [Unicode Standard]: https://www.unicode.org/versions/latest/ | |
716 | /// [ucd]: https://www.unicode.org/reports/tr44/ | |
717 | /// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt | |
b039eaaf SL |
718 | /// |
719 | /// # Examples | |
720 | /// | |
721 | /// Basic usage: | |
722 | /// | |
723 | /// ``` | |
54a0048b SL |
724 | /// assert!('a'.is_lowercase()); |
725 | /// assert!('δ'.is_lowercase()); | |
726 | /// assert!(!'A'.is_lowercase()); | |
727 | /// assert!(!'Δ'.is_lowercase()); | |
b039eaaf | 728 | /// |
f9f354fc | 729 | /// // The various Chinese scripts and punctuation do not have case, and so: |
54a0048b | 730 | /// assert!(!'中'.is_lowercase()); |
f9f354fc | 731 | /// assert!(!' '.is_lowercase()); |
b039eaaf | 732 | /// ``` |
85aaf69f | 733 | #[stable(feature = "rust1", since = "1.0.0")] |
c34b1796 AL |
734 | #[inline] |
735 | pub fn is_lowercase(self) -> bool { | |
736 | match self { | |
8faf50e0 | 737 | 'a'..='z' => true, |
dfeec247 | 738 | c => c > '\x7f' && unicode::Lowercase(c), |
c34b1796 AL |
739 | } |
740 | } | |
1a4d82fc | 741 | |
e74abb32 XL |
742 | /// Returns `true` if this `char` has the `Uppercase` property. |
743 | /// | |
744 | /// `Uppercase` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and | |
745 | /// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`]. | |
1a4d82fc | 746 | /// |
e74abb32 XL |
747 | /// [Unicode Standard]: https://www.unicode.org/versions/latest/ |
748 | /// [ucd]: https://www.unicode.org/reports/tr44/ | |
749 | /// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt | |
b039eaaf SL |
750 | /// |
751 | /// # Examples | |
752 | /// | |
753 | /// Basic usage: | |
754 | /// | |
755 | /// ``` | |
54a0048b SL |
756 | /// assert!(!'a'.is_uppercase()); |
757 | /// assert!(!'δ'.is_uppercase()); | |
758 | /// assert!('A'.is_uppercase()); | |
759 | /// assert!('Δ'.is_uppercase()); | |
b039eaaf | 760 | /// |
f9f354fc | 761 | /// // The various Chinese scripts and punctuation do not have case, and so: |
54a0048b | 762 | /// assert!(!'中'.is_uppercase()); |
f9f354fc | 763 | /// assert!(!' '.is_uppercase()); |
b039eaaf | 764 | /// ``` |
85aaf69f | 765 | #[stable(feature = "rust1", since = "1.0.0")] |
c34b1796 AL |
766 | #[inline] |
767 | pub fn is_uppercase(self) -> bool { | |
768 | match self { | |
8faf50e0 | 769 | 'A'..='Z' => true, |
dfeec247 | 770 | c => c > '\x7f' && unicode::Uppercase(c), |
c34b1796 AL |
771 | } |
772 | } | |
1a4d82fc | 773 | |
e74abb32 | 774 | /// Returns `true` if this `char` has the `White_Space` property. |
b039eaaf | 775 | /// |
e74abb32 XL |
776 | /// `White_Space` is specified in the [Unicode Character Database][ucd] [`PropList.txt`]. |
777 | /// | |
778 | /// [ucd]: https://www.unicode.org/reports/tr44/ | |
779 | /// [`PropList.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt | |
b039eaaf SL |
780 | /// |
781 | /// # Examples | |
1a4d82fc | 782 | /// |
b039eaaf SL |
783 | /// Basic usage: |
784 | /// | |
785 | /// ``` | |
54a0048b | 786 | /// assert!(' '.is_whitespace()); |
b039eaaf SL |
787 | /// |
788 | /// // a non-breaking space | |
54a0048b | 789 | /// assert!('\u{A0}'.is_whitespace()); |
b039eaaf | 790 | /// |
54a0048b | 791 | /// assert!(!'越'.is_whitespace()); |
b039eaaf | 792 | /// ``` |
85aaf69f | 793 | #[stable(feature = "rust1", since = "1.0.0")] |
c34b1796 AL |
794 | #[inline] |
795 | pub fn is_whitespace(self) -> bool { | |
796 | match self { | |
8faf50e0 | 797 | ' ' | '\x09'..='\x0d' => true, |
dfeec247 | 798 | c => c > '\x7f' && unicode::White_Space(c), |
c34b1796 AL |
799 | } |
800 | } | |
1a4d82fc | 801 | |
e74abb32 | 802 | /// Returns `true` if this `char` satisfies either [`is_alphabetic()`] or [`is_numeric()`]. |
1a4d82fc | 803 | /// |
e74abb32 XL |
804 | /// [`is_alphabetic()`]: #method.is_alphabetic |
805 | /// [`is_numeric()`]: #method.is_numeric | |
b039eaaf SL |
806 | /// |
807 | /// # Examples | |
808 | /// | |
809 | /// Basic usage: | |
810 | /// | |
811 | /// ``` | |
54a0048b SL |
812 | /// assert!('٣'.is_alphanumeric()); |
813 | /// assert!('7'.is_alphanumeric()); | |
814 | /// assert!('৬'.is_alphanumeric()); | |
b7449926 XL |
815 | /// assert!('¾'.is_alphanumeric()); |
816 | /// assert!('①'.is_alphanumeric()); | |
54a0048b SL |
817 | /// assert!('K'.is_alphanumeric()); |
818 | /// assert!('و'.is_alphanumeric()); | |
819 | /// assert!('藏'.is_alphanumeric()); | |
b039eaaf | 820 | /// ``` |
85aaf69f | 821 | #[stable(feature = "rust1", since = "1.0.0")] |
c34b1796 AL |
822 | #[inline] |
823 | pub fn is_alphanumeric(self) -> bool { | |
824 | self.is_alphabetic() || self.is_numeric() | |
825 | } | |
1a4d82fc | 826 | |
e74abb32 XL |
827 | /// Returns `true` if this `char` has the general category for control codes. |
828 | /// | |
829 | /// Control codes (code points with the general category of `Cc`) are described in Chapter 4 | |
830 | /// (Character Properties) of the [Unicode Standard] and specified in the [Unicode Character | |
831 | /// Database][ucd] [`UnicodeData.txt`]. | |
1a4d82fc | 832 | /// |
e74abb32 XL |
833 | /// [Unicode Standard]: https://www.unicode.org/versions/latest/ |
834 | /// [ucd]: https://www.unicode.org/reports/tr44/ | |
835 | /// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt | |
b039eaaf SL |
836 | /// |
837 | /// # Examples | |
838 | /// | |
839 | /// Basic usage: | |
840 | /// | |
841 | /// ``` | |
842 | /// // U+009C, STRING TERMINATOR | |
54a0048b SL |
843 | /// assert!('\u{9c}'.is_control()); |
844 | /// assert!(!'q'.is_control()); | |
b039eaaf | 845 | /// ``` |
85aaf69f | 846 | #[stable(feature = "rust1", since = "1.0.0")] |
c34b1796 | 847 | #[inline] |
b039eaaf | 848 | pub fn is_control(self) -> bool { |
dfeec247 | 849 | unicode::Cc(self) |
b039eaaf | 850 | } |
1a4d82fc | 851 | |
e74abb32 | 852 | /// Returns `true` if this `char` has the `Grapheme_Extend` property. |
94b46f34 | 853 | /// |
e74abb32 XL |
854 | /// `Grapheme_Extend` is described in [Unicode Standard Annex #29 (Unicode Text |
855 | /// Segmentation)][uax29] and specified in the [Unicode Character Database][ucd] | |
856 | /// [`DerivedCoreProperties.txt`]. | |
857 | /// | |
858 | /// [uax29]: https://www.unicode.org/reports/tr29/ | |
859 | /// [ucd]: https://www.unicode.org/reports/tr44/ | |
860 | /// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt | |
94b46f34 XL |
861 | #[inline] |
862 | pub(crate) fn is_grapheme_extended(self) -> bool { | |
dfeec247 | 863 | unicode::Grapheme_Extend(self) |
94b46f34 XL |
864 | } |
865 | ||
e74abb32 XL |
866 | /// Returns `true` if this `char` has one of the general categories for numbers. |
867 | /// | |
868 | /// The general categories for numbers (`Nd` for decimal digits, `Nl` for letter-like numeric | |
869 | /// characters, and `No` for other numeric characters) are specified in the [Unicode Character | |
870 | /// Database][ucd] [`UnicodeData.txt`]. | |
b039eaaf | 871 | /// |
e74abb32 XL |
872 | /// [Unicode Standard]: https://www.unicode.org/versions/latest/ |
873 | /// [ucd]: https://www.unicode.org/reports/tr44/ | |
874 | /// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt | |
b039eaaf SL |
875 | /// |
876 | /// # Examples | |
877 | /// | |
878 | /// Basic usage: | |
879 | /// | |
880 | /// ``` | |
54a0048b SL |
881 | /// assert!('٣'.is_numeric()); |
882 | /// assert!('7'.is_numeric()); | |
883 | /// assert!('৬'.is_numeric()); | |
b7449926 XL |
884 | /// assert!('¾'.is_numeric()); |
885 | /// assert!('①'.is_numeric()); | |
54a0048b SL |
886 | /// assert!(!'K'.is_numeric()); |
887 | /// assert!(!'و'.is_numeric()); | |
888 | /// assert!(!'藏'.is_numeric()); | |
b039eaaf | 889 | /// ``` |
85aaf69f | 890 | #[stable(feature = "rust1", since = "1.0.0")] |
c34b1796 AL |
891 | #[inline] |
892 | pub fn is_numeric(self) -> bool { | |
893 | match self { | |
8faf50e0 | 894 | '0'..='9' => true, |
dfeec247 | 895 | c => c > '\x7f' && unicode::N(c), |
c34b1796 AL |
896 | } |
897 | } | |
1a4d82fc | 898 | |
e74abb32 XL |
899 | /// Returns an iterator that yields the lowercase mapping of this `char` as one or more |
900 | /// `char`s. | |
1a4d82fc | 901 | /// |
e74abb32 | 902 | /// If this `char` does not have a lowercase mapping, the iterator yields the same `char`. |
1a4d82fc | 903 | /// |
e74abb32 XL |
904 | /// If this `char` has a one-to-one lowercase mapping given by the [Unicode Character |
905 | /// Database][ucd] [`UnicodeData.txt`], the iterator yields that `char`. | |
1a4d82fc | 906 | /// |
e74abb32 XL |
907 | /// [ucd]: https://www.unicode.org/reports/tr44/ |
908 | /// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt | |
b039eaaf | 909 | /// |
e74abb32 XL |
910 | /// If this `char` requires special considerations (e.g. multiple `char`s) the iterator yields |
911 | /// the `char`(s) given by [`SpecialCasing.txt`]. | |
b039eaaf | 912 | /// |
e74abb32 | 913 | /// [`SpecialCasing.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/SpecialCasing.txt |
b039eaaf | 914 | /// |
e74abb32 XL |
915 | /// This operation performs an unconditional mapping without tailoring. That is, the conversion |
916 | /// is independent of context and language. | |
917 | /// | |
918 | /// In the [Unicode Standard], Chapter 4 (Character Properties) discusses case mapping in | |
919 | /// general and Chapter 3 (Conformance) discusses the default algorithm for case conversion. | |
920 | /// | |
921 | /// [Unicode Standard]: https://www.unicode.org/versions/latest/ | |
62682a34 SL |
922 | /// |
923 | /// # Examples | |
924 | /// | |
32a655c1 | 925 | /// As an iterator: |
b039eaaf | 926 | /// |
62682a34 | 927 | /// ``` |
32a655c1 SL |
928 | /// for c in 'İ'.to_lowercase() { |
929 | /// print!("{}", c); | |
930 | /// } | |
931 | /// println!(); | |
932 | /// ``` | |
933 | /// | |
934 | /// Using `println!` directly: | |
935 | /// | |
936 | /// ``` | |
937 | /// println!("{}", 'İ'.to_lowercase()); | |
938 | /// ``` | |
939 | /// | |
940 | /// Both are equivalent to: | |
941 | /// | |
942 | /// ``` | |
943 | /// println!("i\u{307}"); | |
944 | /// ``` | |
945 | /// | |
946 | /// Using `to_string`: | |
947 | /// | |
948 | /// ``` | |
949 | /// assert_eq!('C'.to_lowercase().to_string(), "c"); | |
3157f602 XL |
950 | /// |
951 | /// // Sometimes the result is more than one character: | |
32a655c1 | 952 | /// assert_eq!('İ'.to_lowercase().to_string(), "i\u{307}"); |
b039eaaf | 953 | /// |
cc61c64b XL |
954 | /// // Characters that do not have both uppercase and lowercase |
955 | /// // convert into themselves. | |
32a655c1 | 956 | /// assert_eq!('山'.to_lowercase().to_string(), "山"); |
62682a34 | 957 | /// ``` |
c34b1796 AL |
958 | #[stable(feature = "rust1", since = "1.0.0")] |
959 | #[inline] | |
960 | pub fn to_lowercase(self) -> ToLowercase { | |
62682a34 SL |
961 | ToLowercase(CaseMappingIter::new(conversions::to_lower(self))) |
962 | } | |
963 | ||
e74abb32 XL |
964 | /// Returns an iterator that yields the uppercase mapping of this `char` as one or more |
965 | /// `char`s. | |
966 | /// | |
967 | /// If this `char` does not have an uppercase mapping, the iterator yields the same `char`. | |
968 | /// | |
969 | /// If this `char` has a one-to-one uppercase mapping given by the [Unicode Character | |
970 | /// Database][ucd] [`UnicodeData.txt`], the iterator yields that `char`. | |
1a4d82fc | 971 | /// |
e74abb32 XL |
972 | /// [ucd]: https://www.unicode.org/reports/tr44/ |
973 | /// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt | |
1a4d82fc | 974 | /// |
e74abb32 XL |
975 | /// If this `char` requires special considerations (e.g. multiple `char`s) the iterator yields |
976 | /// the `char`(s) given by [`SpecialCasing.txt`]. | |
1a4d82fc | 977 | /// |
e74abb32 | 978 | /// [`SpecialCasing.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/SpecialCasing.txt |
1a4d82fc | 979 | /// |
e74abb32 XL |
980 | /// This operation performs an unconditional mapping without tailoring. That is, the conversion |
981 | /// is independent of context and language. | |
1a4d82fc | 982 | /// |
e74abb32 XL |
983 | /// In the [Unicode Standard], Chapter 4 (Character Properties) discusses case mapping in |
984 | /// general and Chapter 3 (Conformance) discusses the default algorithm for case conversion. | |
62682a34 | 985 | /// |
e74abb32 | 986 | /// [Unicode Standard]: https://www.unicode.org/versions/latest/ |
1a4d82fc | 987 | /// |
62682a34 SL |
988 | /// # Examples |
989 | /// | |
32a655c1 SL |
990 | /// As an iterator: |
991 | /// | |
992 | /// ``` | |
993 | /// for c in 'ß'.to_uppercase() { | |
994 | /// print!("{}", c); | |
995 | /// } | |
996 | /// println!(); | |
997 | /// ``` | |
998 | /// | |
999 | /// Using `println!` directly: | |
1000 | /// | |
1001 | /// ``` | |
1002 | /// println!("{}", 'ß'.to_uppercase()); | |
1003 | /// ``` | |
1004 | /// | |
1005 | /// Both are equivalent to: | |
1006 | /// | |
1007 | /// ``` | |
1008 | /// println!("SS"); | |
1009 | /// ``` | |
1010 | /// | |
1011 | /// Using `to_string`: | |
b039eaaf SL |
1012 | /// |
1013 | /// ``` | |
32a655c1 | 1014 | /// assert_eq!('c'.to_uppercase().to_string(), "C"); |
3157f602 XL |
1015 | /// |
1016 | /// // Sometimes the result is more than one character: | |
32a655c1 | 1017 | /// assert_eq!('ß'.to_uppercase().to_string(), "SS"); |
b039eaaf | 1018 | /// |
cc61c64b XL |
1019 | /// // Characters that do not have both uppercase and lowercase |
1020 | /// // convert into themselves. | |
32a655c1 | 1021 | /// assert_eq!('山'.to_uppercase().to_string(), "山"); |
b039eaaf SL |
1022 | /// ``` |
1023 | /// | |
32a655c1 SL |
1024 | /// # Note on locale |
1025 | /// | |
b039eaaf SL |
1026 | /// In Turkish, the equivalent of 'i' in Latin has five forms instead of two: |
1027 | /// | |
1028 | /// * 'Dotless': I / ı, sometimes written ï | |
1029 | /// * 'Dotted': İ / i | |
1030 | /// | |
1031 | /// Note that the lowercase dotted 'i' is the same as the Latin. Therefore: | |
1032 | /// | |
1033 | /// ``` | |
32a655c1 | 1034 | /// let upper_i = 'i'.to_uppercase().to_string(); |
b039eaaf SL |
1035 | /// ``` |
1036 | /// | |
1037 | /// The value of `upper_i` here relies on the language of the text: if we're | |
3157f602 XL |
1038 | /// in `en-US`, it should be `"I"`, but if we're in `tr_TR`, it should |
1039 | /// be `"İ"`. `to_uppercase()` does not take this into account, and so: | |
b039eaaf | 1040 | /// |
62682a34 | 1041 | /// ``` |
32a655c1 | 1042 | /// let upper_i = 'i'.to_uppercase().to_string(); |
b039eaaf | 1043 | /// |
3157f602 | 1044 | /// assert_eq!(upper_i, "I"); |
62682a34 | 1045 | /// ``` |
b039eaaf SL |
1046 | /// |
1047 | /// holds across languages. | |
c34b1796 AL |
1048 | #[stable(feature = "rust1", since = "1.0.0")] |
1049 | #[inline] | |
1050 | pub fn to_uppercase(self) -> ToUppercase { | |
62682a34 | 1051 | ToUppercase(CaseMappingIter::new(conversions::to_upper(self))) |
c34b1796 | 1052 | } |
abe05a73 XL |
1053 | |
1054 | /// Checks if the value is within the ASCII range. | |
1055 | /// | |
1056 | /// # Examples | |
1057 | /// | |
1058 | /// ``` | |
1059 | /// let ascii = 'a'; | |
1060 | /// let non_ascii = '❤'; | |
1061 | /// | |
1062 | /// assert!(ascii.is_ascii()); | |
1063 | /// assert!(!non_ascii.is_ascii()); | |
1064 | /// ``` | |
ff7c6d11 | 1065 | #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")] |
dfeec247 | 1066 | #[rustc_const_stable(feature = "const_ascii_methods_on_intrinsics", since = "1.32.0")] |
abe05a73 | 1067 | #[inline] |
a1dfa0c6 | 1068 | pub const fn is_ascii(&self) -> bool { |
abe05a73 XL |
1069 | *self as u32 <= 0x7F |
1070 | } | |
1071 | ||
1072 | /// Makes a copy of the value in its ASCII upper case equivalent. | |
1073 | /// | |
1074 | /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z', | |
1075 | /// but non-ASCII letters are unchanged. | |
1076 | /// | |
fc512014 | 1077 | /// To uppercase the value in-place, use [`make_ascii_uppercase()`]. |
abe05a73 XL |
1078 | /// |
1079 | /// To uppercase ASCII characters in addition to non-ASCII characters, use | |
fc512014 | 1080 | /// [`to_uppercase()`]. |
abe05a73 XL |
1081 | /// |
1082 | /// # Examples | |
1083 | /// | |
1084 | /// ``` | |
1085 | /// let ascii = 'a'; | |
1086 | /// let non_ascii = '❤'; | |
1087 | /// | |
1088 | /// assert_eq!('A', ascii.to_ascii_uppercase()); | |
1089 | /// assert_eq!('❤', non_ascii.to_ascii_uppercase()); | |
1090 | /// ``` | |
1091 | /// | |
fc512014 XL |
1092 | /// [`make_ascii_uppercase()`]: #method.make_ascii_uppercase |
1093 | /// [`to_uppercase()`]: #method.to_uppercase | |
ff7c6d11 | 1094 | #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")] |
6a06907d | 1095 | #[rustc_const_stable(feature = "const_ascii_methods_on_intrinsics", since = "1.52.0")] |
abe05a73 | 1096 | #[inline] |
6a06907d XL |
1097 | pub const fn to_ascii_uppercase(&self) -> char { |
1098 | if self.is_ascii_lowercase() { | |
1099 | (*self as u8).ascii_change_case_unchecked() as char | |
1100 | } else { | |
1101 | *self | |
1102 | } | |
abe05a73 XL |
1103 | } |
1104 | ||
1105 | /// Makes a copy of the value in its ASCII lower case equivalent. | |
1106 | /// | |
1107 | /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z', | |
1108 | /// but non-ASCII letters are unchanged. | |
1109 | /// | |
fc512014 | 1110 | /// To lowercase the value in-place, use [`make_ascii_lowercase()`]. |
abe05a73 XL |
1111 | /// |
1112 | /// To lowercase ASCII characters in addition to non-ASCII characters, use | |
fc512014 | 1113 | /// [`to_lowercase()`]. |
abe05a73 XL |
1114 | /// |
1115 | /// # Examples | |
1116 | /// | |
1117 | /// ``` | |
1118 | /// let ascii = 'A'; | |
1119 | /// let non_ascii = '❤'; | |
1120 | /// | |
1121 | /// assert_eq!('a', ascii.to_ascii_lowercase()); | |
1122 | /// assert_eq!('❤', non_ascii.to_ascii_lowercase()); | |
1123 | /// ``` | |
1124 | /// | |
fc512014 XL |
1125 | /// [`make_ascii_lowercase()`]: #method.make_ascii_lowercase |
1126 | /// [`to_lowercase()`]: #method.to_lowercase | |
ff7c6d11 | 1127 | #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")] |
6a06907d | 1128 | #[rustc_const_stable(feature = "const_ascii_methods_on_intrinsics", since = "1.52.0")] |
abe05a73 | 1129 | #[inline] |
6a06907d XL |
1130 | pub const fn to_ascii_lowercase(&self) -> char { |
1131 | if self.is_ascii_uppercase() { | |
1132 | (*self as u8).ascii_change_case_unchecked() as char | |
1133 | } else { | |
1134 | *self | |
1135 | } | |
abe05a73 XL |
1136 | } |
1137 | ||
1138 | /// Checks that two values are an ASCII case-insensitive match. | |
1139 | /// | |
1140 | /// Equivalent to `to_ascii_lowercase(a) == to_ascii_lowercase(b)`. | |
1141 | /// | |
1142 | /// # Examples | |
1143 | /// | |
1144 | /// ``` | |
1145 | /// let upper_a = 'A'; | |
1146 | /// let lower_a = 'a'; | |
1147 | /// let lower_z = 'z'; | |
1148 | /// | |
1149 | /// assert!(upper_a.eq_ignore_ascii_case(&lower_a)); | |
1150 | /// assert!(upper_a.eq_ignore_ascii_case(&upper_a)); | |
1151 | /// assert!(!upper_a.eq_ignore_ascii_case(&lower_z)); | |
1152 | /// ``` | |
ff7c6d11 | 1153 | #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")] |
6a06907d | 1154 | #[rustc_const_stable(feature = "const_ascii_methods_on_intrinsics", since = "1.52.0")] |
abe05a73 | 1155 | #[inline] |
6a06907d | 1156 | pub const fn eq_ignore_ascii_case(&self, other: &char) -> bool { |
abe05a73 XL |
1157 | self.to_ascii_lowercase() == other.to_ascii_lowercase() |
1158 | } | |
1159 | ||
1160 | /// Converts this type to its ASCII upper case equivalent in-place. | |
1161 | /// | |
1162 | /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z', | |
1163 | /// but non-ASCII letters are unchanged. | |
1164 | /// | |
1165 | /// To return a new uppercased value without modifying the existing one, use | |
fc512014 | 1166 | /// [`to_ascii_uppercase()`]. |
abe05a73 XL |
1167 | /// |
1168 | /// # Examples | |
1169 | /// | |
1170 | /// ``` | |
1171 | /// let mut ascii = 'a'; | |
1172 | /// | |
1173 | /// ascii.make_ascii_uppercase(); | |
1174 | /// | |
1175 | /// assert_eq!('A', ascii); | |
1176 | /// ``` | |
1177 | /// | |
fc512014 | 1178 | /// [`to_ascii_uppercase()`]: #method.to_ascii_uppercase |
ff7c6d11 | 1179 | #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")] |
abe05a73 XL |
1180 | #[inline] |
1181 | pub fn make_ascii_uppercase(&mut self) { | |
1182 | *self = self.to_ascii_uppercase(); | |
1183 | } | |
1184 | ||
1185 | /// Converts this type to its ASCII lower case equivalent in-place. | |
1186 | /// | |
1187 | /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z', | |
1188 | /// but non-ASCII letters are unchanged. | |
1189 | /// | |
1190 | /// To return a new lowercased value without modifying the existing one, use | |
fc512014 | 1191 | /// [`to_ascii_lowercase()`]. |
abe05a73 XL |
1192 | /// |
1193 | /// # Examples | |
1194 | /// | |
1195 | /// ``` | |
1196 | /// let mut ascii = 'A'; | |
1197 | /// | |
1198 | /// ascii.make_ascii_lowercase(); | |
1199 | /// | |
1200 | /// assert_eq!('a', ascii); | |
1201 | /// ``` | |
1202 | /// | |
fc512014 | 1203 | /// [`to_ascii_lowercase()`]: #method.to_ascii_lowercase |
ff7c6d11 | 1204 | #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")] |
abe05a73 XL |
1205 | #[inline] |
1206 | pub fn make_ascii_lowercase(&mut self) { | |
1207 | *self = self.to_ascii_lowercase(); | |
1208 | } | |
1209 | ||
1210 | /// Checks if the value is an ASCII alphabetic character: | |
1211 | /// | |
dc9dc135 XL |
1212 | /// - U+0041 'A' ..= U+005A 'Z', or |
1213 | /// - U+0061 'a' ..= U+007A 'z'. | |
abe05a73 XL |
1214 | /// |
1215 | /// # Examples | |
1216 | /// | |
1217 | /// ``` | |
abe05a73 XL |
1218 | /// let uppercase_a = 'A'; |
1219 | /// let uppercase_g = 'G'; | |
1220 | /// let a = 'a'; | |
1221 | /// let g = 'g'; | |
1222 | /// let zero = '0'; | |
1223 | /// let percent = '%'; | |
1224 | /// let space = ' '; | |
1225 | /// let lf = '\n'; | |
1226 | /// let esc: char = 0x1b_u8.into(); | |
1227 | /// | |
1228 | /// assert!(uppercase_a.is_ascii_alphabetic()); | |
1229 | /// assert!(uppercase_g.is_ascii_alphabetic()); | |
1230 | /// assert!(a.is_ascii_alphabetic()); | |
1231 | /// assert!(g.is_ascii_alphabetic()); | |
1232 | /// assert!(!zero.is_ascii_alphabetic()); | |
1233 | /// assert!(!percent.is_ascii_alphabetic()); | |
1234 | /// assert!(!space.is_ascii_alphabetic()); | |
1235 | /// assert!(!lf.is_ascii_alphabetic()); | |
1236 | /// assert!(!esc.is_ascii_alphabetic()); | |
1237 | /// ``` | |
ff7c6d11 | 1238 | #[stable(feature = "ascii_ctype_on_intrinsics", since = "1.24.0")] |
3dfed10e | 1239 | #[rustc_const_stable(feature = "const_ascii_ctype_on_intrinsics", since = "1.47.0")] |
abe05a73 | 1240 | #[inline] |
74b04a01 | 1241 | pub const fn is_ascii_alphabetic(&self) -> bool { |
29967ef6 | 1242 | matches!(*self, 'A'..='Z' | 'a'..='z') |
abe05a73 XL |
1243 | } |
1244 | ||
1245 | /// Checks if the value is an ASCII uppercase character: | |
dc9dc135 | 1246 | /// U+0041 'A' ..= U+005A 'Z'. |
abe05a73 XL |
1247 | /// |
1248 | /// # Examples | |
1249 | /// | |
1250 | /// ``` | |
abe05a73 XL |
1251 | /// let uppercase_a = 'A'; |
1252 | /// let uppercase_g = 'G'; | |
1253 | /// let a = 'a'; | |
1254 | /// let g = 'g'; | |
1255 | /// let zero = '0'; | |
1256 | /// let percent = '%'; | |
1257 | /// let space = ' '; | |
1258 | /// let lf = '\n'; | |
1259 | /// let esc: char = 0x1b_u8.into(); | |
1260 | /// | |
1261 | /// assert!(uppercase_a.is_ascii_uppercase()); | |
1262 | /// assert!(uppercase_g.is_ascii_uppercase()); | |
1263 | /// assert!(!a.is_ascii_uppercase()); | |
1264 | /// assert!(!g.is_ascii_uppercase()); | |
1265 | /// assert!(!zero.is_ascii_uppercase()); | |
1266 | /// assert!(!percent.is_ascii_uppercase()); | |
1267 | /// assert!(!space.is_ascii_uppercase()); | |
1268 | /// assert!(!lf.is_ascii_uppercase()); | |
1269 | /// assert!(!esc.is_ascii_uppercase()); | |
1270 | /// ``` | |
ff7c6d11 | 1271 | #[stable(feature = "ascii_ctype_on_intrinsics", since = "1.24.0")] |
3dfed10e | 1272 | #[rustc_const_stable(feature = "const_ascii_ctype_on_intrinsics", since = "1.47.0")] |
abe05a73 | 1273 | #[inline] |
74b04a01 | 1274 | pub const fn is_ascii_uppercase(&self) -> bool { |
29967ef6 | 1275 | matches!(*self, 'A'..='Z') |
abe05a73 XL |
1276 | } |
1277 | ||
1278 | /// Checks if the value is an ASCII lowercase character: | |
dc9dc135 | 1279 | /// U+0061 'a' ..= U+007A 'z'. |
abe05a73 XL |
1280 | /// |
1281 | /// # Examples | |
1282 | /// | |
1283 | /// ``` | |
abe05a73 XL |
1284 | /// let uppercase_a = 'A'; |
1285 | /// let uppercase_g = 'G'; | |
1286 | /// let a = 'a'; | |
1287 | /// let g = 'g'; | |
1288 | /// let zero = '0'; | |
1289 | /// let percent = '%'; | |
1290 | /// let space = ' '; | |
1291 | /// let lf = '\n'; | |
1292 | /// let esc: char = 0x1b_u8.into(); | |
1293 | /// | |
1294 | /// assert!(!uppercase_a.is_ascii_lowercase()); | |
1295 | /// assert!(!uppercase_g.is_ascii_lowercase()); | |
1296 | /// assert!(a.is_ascii_lowercase()); | |
1297 | /// assert!(g.is_ascii_lowercase()); | |
1298 | /// assert!(!zero.is_ascii_lowercase()); | |
1299 | /// assert!(!percent.is_ascii_lowercase()); | |
1300 | /// assert!(!space.is_ascii_lowercase()); | |
1301 | /// assert!(!lf.is_ascii_lowercase()); | |
1302 | /// assert!(!esc.is_ascii_lowercase()); | |
1303 | /// ``` | |
ff7c6d11 | 1304 | #[stable(feature = "ascii_ctype_on_intrinsics", since = "1.24.0")] |
3dfed10e | 1305 | #[rustc_const_stable(feature = "const_ascii_ctype_on_intrinsics", since = "1.47.0")] |
abe05a73 | 1306 | #[inline] |
74b04a01 | 1307 | pub const fn is_ascii_lowercase(&self) -> bool { |
29967ef6 | 1308 | matches!(*self, 'a'..='z') |
abe05a73 XL |
1309 | } |
1310 | ||
1311 | /// Checks if the value is an ASCII alphanumeric character: | |
1312 | /// | |
dc9dc135 XL |
1313 | /// - U+0041 'A' ..= U+005A 'Z', or |
1314 | /// - U+0061 'a' ..= U+007A 'z', or | |
1315 | /// - U+0030 '0' ..= U+0039 '9'. | |
abe05a73 XL |
1316 | /// |
1317 | /// # Examples | |
1318 | /// | |
1319 | /// ``` | |
abe05a73 XL |
1320 | /// let uppercase_a = 'A'; |
1321 | /// let uppercase_g = 'G'; | |
1322 | /// let a = 'a'; | |
1323 | /// let g = 'g'; | |
1324 | /// let zero = '0'; | |
1325 | /// let percent = '%'; | |
1326 | /// let space = ' '; | |
1327 | /// let lf = '\n'; | |
1328 | /// let esc: char = 0x1b_u8.into(); | |
1329 | /// | |
1330 | /// assert!(uppercase_a.is_ascii_alphanumeric()); | |
1331 | /// assert!(uppercase_g.is_ascii_alphanumeric()); | |
1332 | /// assert!(a.is_ascii_alphanumeric()); | |
1333 | /// assert!(g.is_ascii_alphanumeric()); | |
1334 | /// assert!(zero.is_ascii_alphanumeric()); | |
1335 | /// assert!(!percent.is_ascii_alphanumeric()); | |
1336 | /// assert!(!space.is_ascii_alphanumeric()); | |
1337 | /// assert!(!lf.is_ascii_alphanumeric()); | |
1338 | /// assert!(!esc.is_ascii_alphanumeric()); | |
1339 | /// ``` | |
ff7c6d11 | 1340 | #[stable(feature = "ascii_ctype_on_intrinsics", since = "1.24.0")] |
3dfed10e | 1341 | #[rustc_const_stable(feature = "const_ascii_ctype_on_intrinsics", since = "1.47.0")] |
abe05a73 | 1342 | #[inline] |
74b04a01 | 1343 | pub const fn is_ascii_alphanumeric(&self) -> bool { |
29967ef6 | 1344 | matches!(*self, '0'..='9' | 'A'..='Z' | 'a'..='z') |
abe05a73 XL |
1345 | } |
1346 | ||
1347 | /// Checks if the value is an ASCII decimal digit: | |
dc9dc135 | 1348 | /// U+0030 '0' ..= U+0039 '9'. |
abe05a73 XL |
1349 | /// |
1350 | /// # Examples | |
1351 | /// | |
1352 | /// ``` | |
abe05a73 XL |
1353 | /// let uppercase_a = 'A'; |
1354 | /// let uppercase_g = 'G'; | |
1355 | /// let a = 'a'; | |
1356 | /// let g = 'g'; | |
1357 | /// let zero = '0'; | |
1358 | /// let percent = '%'; | |
1359 | /// let space = ' '; | |
1360 | /// let lf = '\n'; | |
1361 | /// let esc: char = 0x1b_u8.into(); | |
1362 | /// | |
1363 | /// assert!(!uppercase_a.is_ascii_digit()); | |
1364 | /// assert!(!uppercase_g.is_ascii_digit()); | |
1365 | /// assert!(!a.is_ascii_digit()); | |
1366 | /// assert!(!g.is_ascii_digit()); | |
1367 | /// assert!(zero.is_ascii_digit()); | |
1368 | /// assert!(!percent.is_ascii_digit()); | |
1369 | /// assert!(!space.is_ascii_digit()); | |
1370 | /// assert!(!lf.is_ascii_digit()); | |
1371 | /// assert!(!esc.is_ascii_digit()); | |
1372 | /// ``` | |
ff7c6d11 | 1373 | #[stable(feature = "ascii_ctype_on_intrinsics", since = "1.24.0")] |
3dfed10e | 1374 | #[rustc_const_stable(feature = "const_ascii_ctype_on_intrinsics", since = "1.47.0")] |
abe05a73 | 1375 | #[inline] |
74b04a01 | 1376 | pub const fn is_ascii_digit(&self) -> bool { |
29967ef6 | 1377 | matches!(*self, '0'..='9') |
abe05a73 XL |
1378 | } |
1379 | ||
1380 | /// Checks if the value is an ASCII hexadecimal digit: | |
1381 | /// | |
dc9dc135 XL |
1382 | /// - U+0030 '0' ..= U+0039 '9', or |
1383 | /// - U+0041 'A' ..= U+0046 'F', or | |
1384 | /// - U+0061 'a' ..= U+0066 'f'. | |
abe05a73 XL |
1385 | /// |
1386 | /// # Examples | |
1387 | /// | |
1388 | /// ``` | |
abe05a73 XL |
1389 | /// let uppercase_a = 'A'; |
1390 | /// let uppercase_g = 'G'; | |
1391 | /// let a = 'a'; | |
1392 | /// let g = 'g'; | |
1393 | /// let zero = '0'; | |
1394 | /// let percent = '%'; | |
1395 | /// let space = ' '; | |
1396 | /// let lf = '\n'; | |
1397 | /// let esc: char = 0x1b_u8.into(); | |
1398 | /// | |
1399 | /// assert!(uppercase_a.is_ascii_hexdigit()); | |
1400 | /// assert!(!uppercase_g.is_ascii_hexdigit()); | |
1401 | /// assert!(a.is_ascii_hexdigit()); | |
1402 | /// assert!(!g.is_ascii_hexdigit()); | |
1403 | /// assert!(zero.is_ascii_hexdigit()); | |
1404 | /// assert!(!percent.is_ascii_hexdigit()); | |
1405 | /// assert!(!space.is_ascii_hexdigit()); | |
1406 | /// assert!(!lf.is_ascii_hexdigit()); | |
1407 | /// assert!(!esc.is_ascii_hexdigit()); | |
1408 | /// ``` | |
ff7c6d11 | 1409 | #[stable(feature = "ascii_ctype_on_intrinsics", since = "1.24.0")] |
3dfed10e | 1410 | #[rustc_const_stable(feature = "const_ascii_ctype_on_intrinsics", since = "1.47.0")] |
abe05a73 | 1411 | #[inline] |
74b04a01 | 1412 | pub const fn is_ascii_hexdigit(&self) -> bool { |
29967ef6 | 1413 | matches!(*self, '0'..='9' | 'A'..='F' | 'a'..='f') |
abe05a73 XL |
1414 | } |
1415 | ||
1416 | /// Checks if the value is an ASCII punctuation character: | |
1417 | /// | |
dc9dc135 XL |
1418 | /// - U+0021 ..= U+002F `! " # $ % & ' ( ) * + , - . /`, or |
1419 | /// - U+003A ..= U+0040 `: ; < = > ? @`, or | |
1420 | /// - U+005B ..= U+0060 ``[ \ ] ^ _ ` ``, or | |
1421 | /// - U+007B ..= U+007E `{ | } ~` | |
abe05a73 XL |
1422 | /// |
1423 | /// # Examples | |
1424 | /// | |
1425 | /// ``` | |
abe05a73 XL |
1426 | /// let uppercase_a = 'A'; |
1427 | /// let uppercase_g = 'G'; | |
1428 | /// let a = 'a'; | |
1429 | /// let g = 'g'; | |
1430 | /// let zero = '0'; | |
1431 | /// let percent = '%'; | |
1432 | /// let space = ' '; | |
1433 | /// let lf = '\n'; | |
1434 | /// let esc: char = 0x1b_u8.into(); | |
1435 | /// | |
1436 | /// assert!(!uppercase_a.is_ascii_punctuation()); | |
1437 | /// assert!(!uppercase_g.is_ascii_punctuation()); | |
1438 | /// assert!(!a.is_ascii_punctuation()); | |
1439 | /// assert!(!g.is_ascii_punctuation()); | |
1440 | /// assert!(!zero.is_ascii_punctuation()); | |
1441 | /// assert!(percent.is_ascii_punctuation()); | |
1442 | /// assert!(!space.is_ascii_punctuation()); | |
1443 | /// assert!(!lf.is_ascii_punctuation()); | |
1444 | /// assert!(!esc.is_ascii_punctuation()); | |
1445 | /// ``` | |
ff7c6d11 | 1446 | #[stable(feature = "ascii_ctype_on_intrinsics", since = "1.24.0")] |
3dfed10e | 1447 | #[rustc_const_stable(feature = "const_ascii_ctype_on_intrinsics", since = "1.47.0")] |
abe05a73 | 1448 | #[inline] |
74b04a01 | 1449 | pub const fn is_ascii_punctuation(&self) -> bool { |
29967ef6 | 1450 | matches!(*self, '!'..='/' | ':'..='@' | '['..='`' | '{'..='~') |
abe05a73 XL |
1451 | } |
1452 | ||
1453 | /// Checks if the value is an ASCII graphic character: | |
dc9dc135 | 1454 | /// U+0021 '!' ..= U+007E '~'. |
abe05a73 XL |
1455 | /// |
1456 | /// # Examples | |
1457 | /// | |
1458 | /// ``` | |
abe05a73 XL |
1459 | /// let uppercase_a = 'A'; |
1460 | /// let uppercase_g = 'G'; | |
1461 | /// let a = 'a'; | |
1462 | /// let g = 'g'; | |
1463 | /// let zero = '0'; | |
1464 | /// let percent = '%'; | |
1465 | /// let space = ' '; | |
1466 | /// let lf = '\n'; | |
1467 | /// let esc: char = 0x1b_u8.into(); | |
1468 | /// | |
1469 | /// assert!(uppercase_a.is_ascii_graphic()); | |
1470 | /// assert!(uppercase_g.is_ascii_graphic()); | |
1471 | /// assert!(a.is_ascii_graphic()); | |
1472 | /// assert!(g.is_ascii_graphic()); | |
1473 | /// assert!(zero.is_ascii_graphic()); | |
1474 | /// assert!(percent.is_ascii_graphic()); | |
1475 | /// assert!(!space.is_ascii_graphic()); | |
1476 | /// assert!(!lf.is_ascii_graphic()); | |
1477 | /// assert!(!esc.is_ascii_graphic()); | |
1478 | /// ``` | |
ff7c6d11 | 1479 | #[stable(feature = "ascii_ctype_on_intrinsics", since = "1.24.0")] |
3dfed10e | 1480 | #[rustc_const_stable(feature = "const_ascii_ctype_on_intrinsics", since = "1.47.0")] |
abe05a73 | 1481 | #[inline] |
74b04a01 | 1482 | pub const fn is_ascii_graphic(&self) -> bool { |
29967ef6 | 1483 | matches!(*self, '!'..='~') |
abe05a73 XL |
1484 | } |
1485 | ||
1486 | /// Checks if the value is an ASCII whitespace character: | |
1487 | /// U+0020 SPACE, U+0009 HORIZONTAL TAB, U+000A LINE FEED, | |
1488 | /// U+000C FORM FEED, or U+000D CARRIAGE RETURN. | |
1489 | /// | |
1490 | /// Rust uses the WhatWG Infra Standard's [definition of ASCII | |
1491 | /// whitespace][infra-aw]. There are several other definitions in | |
1492 | /// wide use. For instance, [the POSIX locale][pct] includes | |
1493 | /// U+000B VERTICAL TAB as well as all the above characters, | |
1494 | /// but—from the very same specification—[the default rule for | |
1495 | /// "field splitting" in the Bourne shell][bfs] considers *only* | |
1496 | /// SPACE, HORIZONTAL TAB, and LINE FEED as whitespace. | |
1497 | /// | |
1498 | /// If you are writing a program that will process an existing | |
1499 | /// file format, check what that format's definition of whitespace is | |
1500 | /// before using this function. | |
1501 | /// | |
1502 | /// [infra-aw]: https://infra.spec.whatwg.org/#ascii-whitespace | |
1503 | /// [pct]: http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap07.html#tag_07_03_01 | |
1504 | /// [bfs]: http://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html#tag_18_06_05 | |
1505 | /// | |
1506 | /// # Examples | |
1507 | /// | |
1508 | /// ``` | |
abe05a73 XL |
1509 | /// let uppercase_a = 'A'; |
1510 | /// let uppercase_g = 'G'; | |
1511 | /// let a = 'a'; | |
1512 | /// let g = 'g'; | |
1513 | /// let zero = '0'; | |
1514 | /// let percent = '%'; | |
1515 | /// let space = ' '; | |
1516 | /// let lf = '\n'; | |
1517 | /// let esc: char = 0x1b_u8.into(); | |
1518 | /// | |
1519 | /// assert!(!uppercase_a.is_ascii_whitespace()); | |
1520 | /// assert!(!uppercase_g.is_ascii_whitespace()); | |
1521 | /// assert!(!a.is_ascii_whitespace()); | |
1522 | /// assert!(!g.is_ascii_whitespace()); | |
1523 | /// assert!(!zero.is_ascii_whitespace()); | |
1524 | /// assert!(!percent.is_ascii_whitespace()); | |
1525 | /// assert!(space.is_ascii_whitespace()); | |
1526 | /// assert!(lf.is_ascii_whitespace()); | |
1527 | /// assert!(!esc.is_ascii_whitespace()); | |
1528 | /// ``` | |
ff7c6d11 | 1529 | #[stable(feature = "ascii_ctype_on_intrinsics", since = "1.24.0")] |
3dfed10e | 1530 | #[rustc_const_stable(feature = "const_ascii_ctype_on_intrinsics", since = "1.47.0")] |
abe05a73 | 1531 | #[inline] |
74b04a01 | 1532 | pub const fn is_ascii_whitespace(&self) -> bool { |
29967ef6 | 1533 | matches!(*self, '\t' | '\n' | '\x0C' | '\r' | ' ') |
abe05a73 XL |
1534 | } |
1535 | ||
1536 | /// Checks if the value is an ASCII control character: | |
dc9dc135 | 1537 | /// U+0000 NUL ..= U+001F UNIT SEPARATOR, or U+007F DELETE. |
abe05a73 XL |
1538 | /// Note that most ASCII whitespace characters are control |
1539 | /// characters, but SPACE is not. | |
1540 | /// | |
1541 | /// # Examples | |
1542 | /// | |
1543 | /// ``` | |
abe05a73 XL |
1544 | /// let uppercase_a = 'A'; |
1545 | /// let uppercase_g = 'G'; | |
1546 | /// let a = 'a'; | |
1547 | /// let g = 'g'; | |
1548 | /// let zero = '0'; | |
1549 | /// let percent = '%'; | |
1550 | /// let space = ' '; | |
1551 | /// let lf = '\n'; | |
1552 | /// let esc: char = 0x1b_u8.into(); | |
1553 | /// | |
1554 | /// assert!(!uppercase_a.is_ascii_control()); | |
1555 | /// assert!(!uppercase_g.is_ascii_control()); | |
1556 | /// assert!(!a.is_ascii_control()); | |
1557 | /// assert!(!g.is_ascii_control()); | |
1558 | /// assert!(!zero.is_ascii_control()); | |
1559 | /// assert!(!percent.is_ascii_control()); | |
1560 | /// assert!(!space.is_ascii_control()); | |
1561 | /// assert!(lf.is_ascii_control()); | |
1562 | /// assert!(esc.is_ascii_control()); | |
1563 | /// ``` | |
ff7c6d11 | 1564 | #[stable(feature = "ascii_ctype_on_intrinsics", since = "1.24.0")] |
3dfed10e | 1565 | #[rustc_const_stable(feature = "const_ascii_ctype_on_intrinsics", since = "1.47.0")] |
abe05a73 | 1566 | #[inline] |
74b04a01 | 1567 | pub const fn is_ascii_control(&self) -> bool { |
29967ef6 | 1568 | matches!(*self, '\0'..='\x1F' | '\x7F') |
abe05a73 | 1569 | } |
e9174d1e | 1570 | } |
f9f354fc | 1571 | |
cdc7bbd5 XL |
/// Set of switches selecting which optional categories of characters get
/// escaped. Each flag is independent of the others.
pub(crate) struct EscapeDebugExtArgs {
    /// When `true`, Extended Grapheme codepoints are escaped.
    pub(crate) escape_grapheme_extended: bool,

    /// When `true`, single quotes are escaped.
    pub(crate) escape_single_quote: bool,

    /// When `true`, double quotes are escaped.
    pub(crate) escape_double_quote: bool,
}

impl EscapeDebugExtArgs {
    /// Preset with every escape switch turned on.
    pub(crate) const ESCAPE_ALL: EscapeDebugExtArgs = EscapeDebugExtArgs {
        escape_double_quote: true,
        escape_single_quote: true,
        escape_grapheme_extended: true,
    };
}
1590 | ||
f9f354fc | 1591 | #[inline] |
6a06907d | 1592 | const fn len_utf8(code: u32) -> usize { |
f9f354fc XL |
1593 | if code < MAX_ONE_B { |
1594 | 1 | |
1595 | } else if code < MAX_TWO_B { | |
1596 | 2 | |
1597 | } else if code < MAX_THREE_B { | |
1598 | 3 | |
1599 | } else { | |
1600 | 4 | |
1601 | } | |
1602 | } | |
1603 | ||
1604 | /// Encodes a raw u32 value as UTF-8 into the provided byte buffer, | |
1605 | /// and then returns the subslice of the buffer that contains the encoded character. | |
1606 | /// | |
1607 | /// Unlike `char::encode_utf8`, this method also handles codepoints in the surrogate range. | |
1608 | /// (Creating a `char` in the surrogate range is UB.) | |
1609 | /// The result is valid [generalized UTF-8] but not valid UTF-8. | |
1610 | /// | |
1611 | /// [generalized UTF-8]: https://simonsapin.github.io/wtf-8/#generalized-utf8 | |
1612 | /// | |
1613 | /// # Panics | |
1614 | /// | |
1615 | /// Panics if the buffer is not large enough. | |
1616 | /// A buffer of length four is large enough to encode any `char`. | |
1617 | #[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")] | |
1618 | #[doc(hidden)] | |
1619 | #[inline] | |
1620 | pub fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> &mut [u8] { | |
1621 | let len = len_utf8(code); | |
1622 | match (len, &mut dst[..]) { | |
1623 | (1, [a, ..]) => { | |
1624 | *a = code as u8; | |
1625 | } | |
1626 | (2, [a, b, ..]) => { | |
1627 | *a = (code >> 6 & 0x1F) as u8 | TAG_TWO_B; | |
1628 | *b = (code & 0x3F) as u8 | TAG_CONT; | |
1629 | } | |
1630 | (3, [a, b, c, ..]) => { | |
1631 | *a = (code >> 12 & 0x0F) as u8 | TAG_THREE_B; | |
1632 | *b = (code >> 6 & 0x3F) as u8 | TAG_CONT; | |
1633 | *c = (code & 0x3F) as u8 | TAG_CONT; | |
1634 | } | |
1635 | (4, [a, b, c, d, ..]) => { | |
1636 | *a = (code >> 18 & 0x07) as u8 | TAG_FOUR_B; | |
1637 | *b = (code >> 12 & 0x3F) as u8 | TAG_CONT; | |
1638 | *c = (code >> 6 & 0x3F) as u8 | TAG_CONT; | |
1639 | *d = (code & 0x3F) as u8 | TAG_CONT; | |
1640 | } | |
1641 | _ => panic!( | |
1642 | "encode_utf8: need {} bytes to encode U+{:X}, but the buffer has {}", | |
1643 | len, | |
1644 | code, | |
1645 | dst.len(), | |
1646 | ), | |
1647 | }; | |
1648 | &mut dst[..len] | |
1649 | } | |
1650 | ||
1651 | /// Encodes a raw u32 value as UTF-16 into the provided `u16` buffer, | |
1652 | /// and then returns the subslice of the buffer that contains the encoded character. | |
1653 | /// | |
1654 | /// Unlike `char::encode_utf16`, this method also handles codepoints in the surrogate range. | |
1655 | /// (Creating a `char` in the surrogate range is UB.) | |
1656 | /// | |
1657 | /// # Panics | |
1658 | /// | |
1659 | /// Panics if the buffer is not large enough. | |
1660 | /// A buffer of length 2 is large enough to encode any `char`. | |
1661 | #[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")] | |
1662 | #[doc(hidden)] | |
1663 | #[inline] | |
pub fn encode_utf16_raw(mut code: u32, dst: &mut [u16]) -> &mut [u16] {
    // Number of UTF-16 code units required, derived directly from the raw
    // value. This deliberately does not go through `char`: `code` may lie in
    // the surrogate range, and materializing such a `char` -- even only to
    // build a panic message -- is undefined behaviour.
    let len: usize = if code <= 0xFFFF { 1 } else { 2 };

    // Each arm only matches when the buffer has at least the units it writes,
    // so no unchecked access is needed.
    match (len, &mut dst[..]) {
        (1, [a, ..]) => {
            // Values up to U+FFFF (lone surrogates included) are one unit.
            *a = code as u16;
        }
        (2, [a, b, ..]) => {
            // Supplementary planes break into surrogates.
            code -= 0x1_0000;
            *a = 0xD800 | ((code >> 10) as u16);
            *b = 0xDC00 | ((code as u16) & 0x3FF);
        }
        // Buffer too small: `code` is still the caller's value here because
        // the subtraction above only happens in the arm that succeeds.
        _ => panic!(
            "encode_utf16: need {} units to encode U+{:X}, but the buffer has {}",
            len,
            code,
            dst.len(),
        ),
    };
    &mut dst[..len]
}