]> git.proxmox.com Git - rustc.git/blob - library/core/src/char/mod.rs
New upstream version 1.65.0+dfsg1
[rustc.git] / library / core / src / char / mod.rs
1 //! Utilities for the `char` primitive type.
2 //!
3 //! *[See also the `char` primitive type](primitive@char).*
4 //!
5 //! The `char` type represents a single character. More specifically, since
6 //! 'character' isn't a well-defined concept in Unicode, `char` is a '[Unicode
7 //! scalar value]', which is similar to, but not the same as, a '[Unicode code
8 //! point]'.
9 //!
10 //! [Unicode scalar value]: https://www.unicode.org/glossary/#unicode_scalar_value
11 //! [Unicode code point]: https://www.unicode.org/glossary/#code_point
12 //!
//! This module exists for technical reasons; the primary documentation for
//! `char` is directly on [the `char` primitive type][char] itself.
15 //!
16 //! This module is the home of the iterator implementations for the iterators
17 //! implemented on `char`, as well as some useful constants and conversion
18 //! functions that convert various types to `char`.
19
20 #![allow(non_snake_case)]
21 #![stable(feature = "core_char", since = "1.2.0")]
22
23 mod convert;
24 mod decode;
25 mod methods;
26
27 // stable re-exports
28 #[stable(feature = "try_from", since = "1.34.0")]
29 pub use self::convert::CharTryFromError;
30 #[stable(feature = "char_from_str", since = "1.20.0")]
31 pub use self::convert::ParseCharError;
32 #[stable(feature = "decode_utf16", since = "1.9.0")]
33 pub use self::decode::{DecodeUtf16, DecodeUtf16Error};
34
35 // perma-unstable re-exports
36 #[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
37 pub use self::methods::encode_utf16_raw;
38 #[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
39 pub use self::methods::encode_utf8_raw;
40
41 #[cfg(not(bootstrap))]
42 use crate::error::Error;
43 use crate::fmt::{self, Write};
44 use crate::iter::FusedIterator;
45
46 pub(crate) use self::methods::EscapeDebugExtArgs;
47
// Tag bits and range thresholds for encoding characters as UTF-8.
//
// Each `TAG_*` value is the fixed high-order bit pattern of one kind of byte:
// `10xxxxxx` for `TAG_CONT`, and `110xxxxx` / `1110xxxx` / `11110xxx` for the
// two-, three- and four-byte tags.
const TAG_CONT: u8 = 0b1000_0000;
const TAG_TWO_B: u8 = 0b1100_0000;
const TAG_THREE_B: u8 = 0b1110_0000;
const TAG_FOUR_B: u8 = 0b1111_0000;
// Thresholds are powers of two (2^7, 2^11 and 2^16).
// NOTE(review): presumably used as exclusive upper bounds on the code point
// for one-, two- and three-byte encodings; confirm against the encoder.
const MAX_ONE_B: u32 = 1 << 7;
const MAX_TWO_B: u32 = 1 << 11;
const MAX_THREE_B: u32 = 1 << 16;
56
57 /*
58 Lu Uppercase_Letter an uppercase letter
59 Ll Lowercase_Letter a lowercase letter
60 Lt Titlecase_Letter a digraphic character, with first part uppercase
61 Lm Modifier_Letter a modifier letter
62 Lo Other_Letter other letters, including syllables and ideographs
63 Mn Nonspacing_Mark a nonspacing combining mark (zero advance width)
64 Mc Spacing_Mark a spacing combining mark (positive advance width)
65 Me Enclosing_Mark an enclosing combining mark
66 Nd Decimal_Number a decimal digit
67 Nl Letter_Number a letterlike numeric character
68 No Other_Number a numeric character of other type
69 Pc Connector_Punctuation a connecting punctuation mark, like a tie
70 Pd Dash_Punctuation a dash or hyphen punctuation mark
71 Ps Open_Punctuation an opening punctuation mark (of a pair)
72 Pe Close_Punctuation a closing punctuation mark (of a pair)
73 Pi Initial_Punctuation an initial quotation mark
74 Pf Final_Punctuation a final quotation mark
75 Po Other_Punctuation a punctuation mark of other type
76 Sm Math_Symbol a symbol of primarily mathematical use
77 Sc Currency_Symbol a currency sign
78 Sk Modifier_Symbol a non-letterlike modifier symbol
79 So Other_Symbol a symbol of other type
80 Zs Space_Separator a space character (of various non-zero widths)
81 Zl Line_Separator U+2028 LINE SEPARATOR only
82 Zp Paragraph_Separator U+2029 PARAGRAPH SEPARATOR only
83 Cc Control a C0 or C1 control code
84 Cf Format a format control character
85 Cs Surrogate a surrogate code point
86 Co Private_Use a private-use character
87 Cn Unassigned a reserved unassigned code point or a noncharacter
88 */
89
/// The highest valid code point a `char` can have, `'\u{10FFFF}'`. Use [`char::MAX`] instead.
///
/// This module-level constant takes its value directly from [`char::MAX`].
#[stable(feature = "rust1", since = "1.0.0")]
pub const MAX: char = char::MAX;
93
/// `U+FFFD REPLACEMENT CHARACTER` (�) is used in Unicode to represent a
/// decoding error. Use [`char::REPLACEMENT_CHARACTER`] instead.
///
/// This module-level constant takes its value directly from
/// [`char::REPLACEMENT_CHARACTER`].
#[stable(feature = "decode_utf16", since = "1.9.0")]
pub const REPLACEMENT_CHARACTER: char = char::REPLACEMENT_CHARACTER;
98
/// The version of [Unicode](https://www.unicode.org/) that the Unicode parts of
/// `char` and `str` methods are based on. Use [`char::UNICODE_VERSION`] instead.
///
/// The value is a three-component tuple copied from [`char::UNICODE_VERSION`].
#[stable(feature = "unicode_version", since = "1.45.0")]
pub const UNICODE_VERSION: (u8, u8, u8) = char::UNICODE_VERSION;
103
/// Creates an iterator over the UTF-16 encoded code points in `iter`, returning
/// unpaired surrogates as `Err`s. Use [`char::decode_utf16`] instead.
///
/// `iter` may be anything convertible into an iterator of `u16` code units; the
/// returned [`DecodeUtf16`] wraps that iterator.
#[stable(feature = "decode_utf16", since = "1.9.0")]
#[inline]
pub fn decode_utf16<I: IntoIterator<Item = u16>>(iter: I) -> DecodeUtf16<I::IntoIter> {
    // Thin wrapper: the work is done by the `decode` submodule.
    self::decode::decode_utf16(iter)
}
111
/// Converts a `u32` to a `char`. Use [`char::from_u32`] instead.
///
/// Returns whatever the `convert` submodule's `from_u32` returns for `i`;
/// the result is an `Option`, so a conversion can be rejected with `None`.
#[stable(feature = "rust1", since = "1.0.0")]
#[rustc_const_unstable(feature = "const_char_convert", issue = "89259")]
#[must_use]
#[inline]
pub const fn from_u32(i: u32) -> Option<char> {
    // Thin wrapper: the work is done by the `convert` submodule.
    self::convert::from_u32(i)
}
120
/// Converts a `u32` to a `char`, ignoring validity. Use [`char::from_u32_unchecked`]
/// instead.
///
/// # Safety
///
/// This function performs no checks of its own: it forwards `i` straight to
/// the `convert` submodule's unchecked conversion, so the caller must uphold
/// that conversion's safety contract (see [`char::from_u32_unchecked`]).
#[stable(feature = "char_from_unchecked", since = "1.5.0")]
#[rustc_const_unstable(feature = "const_char_convert", issue = "89259")]
#[must_use]
#[inline]
pub const unsafe fn from_u32_unchecked(i: u32) -> char {
    // SAFETY: the safety contract must be upheld by the caller.
    unsafe { self::convert::from_u32_unchecked(i) }
}
131
/// Converts a digit in the given radix to a `char`. Use [`char::from_digit`] instead.
///
/// `num` is the digit value and `radix` the base it is interpreted in. The
/// result is an `Option`, so a conversion can be rejected with `None`.
#[stable(feature = "rust1", since = "1.0.0")]
#[rustc_const_unstable(feature = "const_char_convert", issue = "89259")]
#[must_use]
#[inline]
pub const fn from_digit(num: u32, radix: u32) -> Option<char> {
    // Thin wrapper: the work is done by the `convert` submodule.
    self::convert::from_digit(num, radix)
}
140
/// An iterator that yields the hexadecimal Unicode escape of a
/// character, as `char`s.
///
/// This `struct` is created by the [`escape_unicode`] method on [`char`]. See
/// its documentation for more.
///
/// [`escape_unicode`]: char::escape_unicode
#[derive(Clone, Debug)]
#[stable(feature = "rust1", since = "1.0.0")]
pub struct EscapeUnicode {
    // The character being escaped.
    c: char,
    // Which part of the `\u{…}` sequence is emitted next.
    state: EscapeUnicodeState,

    // The index of the next hex digit to be printed, counted from the least
    // significant digit: 0x543210. While the iterator has not yet finished
    // the `Value` state, `hex_digit_idx + 1` hex digits remain to be printed;
    // the index stops at 0 once the last digit has been emitted.
    hex_digit_idx: usize,
}
159
// Position inside the `\u{…}` escape sequence.
//
// The variants are ordered so that each discriminant equals the number of
// characters still to be emitted from that state (not counting the hex digits
// tracked by `hex_digit_idx`). This likely makes `len()` a single load from
// memory and worth inlining. The discriminants are spelled out to make that
// correspondence visible; they are the same values the compiler would assign
// implicitly.
#[derive(Clone, Debug)]
enum EscapeUnicodeState {
    Done = 0,
    RightBrace = 1,
    Value = 2,
    LeftBrace = 3,
    Type = 4,
    Backslash = 5,
}
172
173 #[stable(feature = "rust1", since = "1.0.0")]
174 impl Iterator for EscapeUnicode {
175 type Item = char;
176
177 fn next(&mut self) -> Option<char> {
178 match self.state {
179 EscapeUnicodeState::Backslash => {
180 self.state = EscapeUnicodeState::Type;
181 Some('\\')
182 }
183 EscapeUnicodeState::Type => {
184 self.state = EscapeUnicodeState::LeftBrace;
185 Some('u')
186 }
187 EscapeUnicodeState::LeftBrace => {
188 self.state = EscapeUnicodeState::Value;
189 Some('{')
190 }
191 EscapeUnicodeState::Value => {
192 let hex_digit = ((self.c as u32) >> (self.hex_digit_idx * 4)) & 0xf;
193 let c = from_digit(hex_digit, 16).unwrap();
194 if self.hex_digit_idx == 0 {
195 self.state = EscapeUnicodeState::RightBrace;
196 } else {
197 self.hex_digit_idx -= 1;
198 }
199 Some(c)
200 }
201 EscapeUnicodeState::RightBrace => {
202 self.state = EscapeUnicodeState::Done;
203 Some('}')
204 }
205 EscapeUnicodeState::Done => None,
206 }
207 }
208
209 #[inline]
210 fn size_hint(&self) -> (usize, Option<usize>) {
211 let n = self.len();
212 (n, Some(n))
213 }
214
215 #[inline]
216 fn count(self) -> usize {
217 self.len()
218 }
219
220 fn last(self) -> Option<char> {
221 match self.state {
222 EscapeUnicodeState::Done => None,
223
224 EscapeUnicodeState::RightBrace
225 | EscapeUnicodeState::Value
226 | EscapeUnicodeState::LeftBrace
227 | EscapeUnicodeState::Type
228 | EscapeUnicodeState::Backslash => Some('}'),
229 }
230 }
231 }
232
233 #[stable(feature = "exact_size_escape", since = "1.11.0")]
234 impl ExactSizeIterator for EscapeUnicode {
235 #[inline]
236 fn len(&self) -> usize {
237 // The match is a single memory access with no branching
238 self.hex_digit_idx
239 + match self.state {
240 EscapeUnicodeState::Done => 0,
241 EscapeUnicodeState::RightBrace => 1,
242 EscapeUnicodeState::Value => 2,
243 EscapeUnicodeState::LeftBrace => 3,
244 EscapeUnicodeState::Type => 4,
245 EscapeUnicodeState::Backslash => 5,
246 }
247 }
248 }
249
// Marker impl: once `next` reaches the `Done` state it returns `None` without
// changing state, so the iterator keeps returning `None`.
#[stable(feature = "fused", since = "1.26.0")]
impl FusedIterator for EscapeUnicode {}
252
253 #[stable(feature = "char_struct_display", since = "1.16.0")]
254 impl fmt::Display for EscapeUnicode {
255 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
256 for c in self.clone() {
257 f.write_char(c)?;
258 }
259 Ok(())
260 }
261 }
262
/// An iterator that yields the literal escape code of a `char`.
///
/// This `struct` is created by the [`escape_default`] method on [`char`]. See
/// its documentation for more.
///
/// [`escape_default`]: char::escape_default
#[derive(Clone, Debug)]
#[stable(feature = "rust1", since = "1.0.0")]
pub struct EscapeDefault {
    // What remains to be emitted; see `EscapeDefaultState`.
    state: EscapeDefaultState,
}
274
// Remaining output of an `EscapeDefault` iterator.
#[derive(Clone, Debug)]
enum EscapeDefaultState {
    // Nothing left to emit.
    Done,
    // Only the contained character is left.
    Char(char),
    // A backslash followed by the contained character is left.
    Backslash(char),
    // The rest of the output comes from the wrapped `EscapeUnicode` iterator.
    Unicode(EscapeUnicode),
}
282
283 #[stable(feature = "rust1", since = "1.0.0")]
284 impl Iterator for EscapeDefault {
285 type Item = char;
286
287 fn next(&mut self) -> Option<char> {
288 match self.state {
289 EscapeDefaultState::Backslash(c) => {
290 self.state = EscapeDefaultState::Char(c);
291 Some('\\')
292 }
293 EscapeDefaultState::Char(c) => {
294 self.state = EscapeDefaultState::Done;
295 Some(c)
296 }
297 EscapeDefaultState::Done => None,
298 EscapeDefaultState::Unicode(ref mut iter) => iter.next(),
299 }
300 }
301
302 #[inline]
303 fn size_hint(&self) -> (usize, Option<usize>) {
304 let n = self.len();
305 (n, Some(n))
306 }
307
308 #[inline]
309 fn count(self) -> usize {
310 self.len()
311 }
312
313 fn nth(&mut self, n: usize) -> Option<char> {
314 match self.state {
315 EscapeDefaultState::Backslash(c) if n == 0 => {
316 self.state = EscapeDefaultState::Char(c);
317 Some('\\')
318 }
319 EscapeDefaultState::Backslash(c) if n == 1 => {
320 self.state = EscapeDefaultState::Done;
321 Some(c)
322 }
323 EscapeDefaultState::Backslash(_) => {
324 self.state = EscapeDefaultState::Done;
325 None
326 }
327 EscapeDefaultState::Char(c) => {
328 self.state = EscapeDefaultState::Done;
329
330 if n == 0 { Some(c) } else { None }
331 }
332 EscapeDefaultState::Done => None,
333 EscapeDefaultState::Unicode(ref mut i) => i.nth(n),
334 }
335 }
336
337 fn last(self) -> Option<char> {
338 match self.state {
339 EscapeDefaultState::Unicode(iter) => iter.last(),
340 EscapeDefaultState::Done => None,
341 EscapeDefaultState::Backslash(c) | EscapeDefaultState::Char(c) => Some(c),
342 }
343 }
344 }
345
346 #[stable(feature = "exact_size_escape", since = "1.11.0")]
347 impl ExactSizeIterator for EscapeDefault {
348 fn len(&self) -> usize {
349 match self.state {
350 EscapeDefaultState::Done => 0,
351 EscapeDefaultState::Char(_) => 1,
352 EscapeDefaultState::Backslash(_) => 2,
353 EscapeDefaultState::Unicode(ref iter) => iter.len(),
354 }
355 }
356 }
357
// Marker impl: in the `Done` state `next` returns `None` and leaves the state
// as it is.
#[stable(feature = "fused", since = "1.26.0")]
impl FusedIterator for EscapeDefault {}
360
361 #[stable(feature = "char_struct_display", since = "1.16.0")]
362 impl fmt::Display for EscapeDefault {
363 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
364 for c in self.clone() {
365 f.write_char(c)?;
366 }
367 Ok(())
368 }
369 }
370
/// An iterator that yields the literal escape code of a `char`.
///
/// This `struct` is created by the [`escape_debug`] method on [`char`]. See its
/// documentation for more.
///
/// This is a newtype around [`EscapeDefault`]; every method below forwards to
/// the wrapped iterator.
///
/// [`escape_debug`]: char::escape_debug
#[stable(feature = "char_escape_debug", since = "1.20.0")]
#[derive(Clone, Debug)]
pub struct EscapeDebug(EscapeDefault);

#[stable(feature = "char_escape_debug", since = "1.20.0")]
impl Iterator for EscapeDebug {
    type Item = char;
    // Forwarded to the wrapped `EscapeDefault`.
    fn next(&mut self) -> Option<char> {
        self.0.next()
    }
    // Forwarded to the wrapped `EscapeDefault`.
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.0.size_hint()
    }
}

// Relies on the provided `len`, which is derived from `size_hint` above.
#[stable(feature = "char_escape_debug", since = "1.20.0")]
impl ExactSizeIterator for EscapeDebug {}

#[stable(feature = "fused", since = "1.26.0")]
impl FusedIterator for EscapeDebug {}

#[stable(feature = "char_escape_debug", since = "1.20.0")]
impl fmt::Display for EscapeDebug {
    // Uses the `Display` implementation of the wrapped `EscapeDefault`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        fmt::Display::fmt(&self.0, f)
    }
}
404
/// An iterator that yields the lowercase equivalent of a `char`.
///
/// This `struct` is created by the [`to_lowercase`] method on [`char`]. See
/// its documentation for more.
///
/// This is a newtype around the private `CaseMappingIter`; every method below
/// forwards to it.
///
/// [`to_lowercase`]: char::to_lowercase
#[stable(feature = "rust1", since = "1.0.0")]
#[derive(Debug, Clone)]
pub struct ToLowercase(CaseMappingIter);

#[stable(feature = "rust1", since = "1.0.0")]
impl Iterator for ToLowercase {
    type Item = char;
    // Forwarded to the wrapped `CaseMappingIter`.
    fn next(&mut self) -> Option<char> {
        self.0.next()
    }
    // Forwarded to the wrapped `CaseMappingIter`.
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.0.size_hint()
    }
}

#[stable(feature = "case_mapping_double_ended", since = "1.59.0")]
impl DoubleEndedIterator for ToLowercase {
    // Forwarded to the wrapped `CaseMappingIter`.
    fn next_back(&mut self) -> Option<char> {
        self.0.next_back()
    }
}

#[stable(feature = "fused", since = "1.26.0")]
impl FusedIterator for ToLowercase {}

// Relies on the provided `len`, which is derived from `size_hint` above.
#[stable(feature = "exact_size_case_mapping_iter", since = "1.35.0")]
impl ExactSizeIterator for ToLowercase {}
438
/// An iterator that yields the uppercase equivalent of a `char`.
///
/// This `struct` is created by the [`to_uppercase`] method on [`char`]. See
/// its documentation for more.
///
/// This is a newtype around the private `CaseMappingIter`; every method below
/// forwards to it.
///
/// [`to_uppercase`]: char::to_uppercase
#[stable(feature = "rust1", since = "1.0.0")]
#[derive(Debug, Clone)]
pub struct ToUppercase(CaseMappingIter);

#[stable(feature = "rust1", since = "1.0.0")]
impl Iterator for ToUppercase {
    type Item = char;
    // Forwarded to the wrapped `CaseMappingIter`.
    fn next(&mut self) -> Option<char> {
        self.0.next()
    }
    // Forwarded to the wrapped `CaseMappingIter`.
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.0.size_hint()
    }
}

#[stable(feature = "case_mapping_double_ended", since = "1.59.0")]
impl DoubleEndedIterator for ToUppercase {
    // Forwarded to the wrapped `CaseMappingIter`.
    fn next_back(&mut self) -> Option<char> {
        self.0.next_back()
    }
}

#[stable(feature = "fused", since = "1.26.0")]
impl FusedIterator for ToUppercase {}

// Relies on the provided `len`, which is derived from `size_hint` above.
#[stable(feature = "exact_size_case_mapping_iter", since = "1.35.0")]
impl ExactSizeIterator for ToUppercase {}
472
// Iterator over the one to three characters of a case mapping. The variant
// records how many characters are still pending.
#[derive(Debug, Clone)]
enum CaseMappingIter {
    Three(char, char, char),
    Two(char, char),
    One(char),
    Zero,
}

impl CaseMappingIter {
    // Builds the iterator from a fixed-size table entry in which unused
    // trailing slots hold `'\0'`.
    fn new(chars: [char; 3]) -> CaseMappingIter {
        match chars {
            // Also covers an entry consisting solely of `'\0'`.
            [first, '\0', '\0'] => CaseMappingIter::One(first),
            [first, second, '\0'] => CaseMappingIter::Two(first, second),
            [first, second, third] => CaseMappingIter::Three(first, second, third),
        }
    }
}

impl Iterator for CaseMappingIter {
    type Item = char;

    // Pops the front character and shrinks to the next smaller variant.
    fn next(&mut self) -> Option<char> {
        let (front, rest) = match *self {
            CaseMappingIter::Zero => return None,
            CaseMappingIter::One(only) => (only, CaseMappingIter::Zero),
            CaseMappingIter::Two(first, second) => (first, CaseMappingIter::One(second)),
            CaseMappingIter::Three(first, second, third) => {
                (first, CaseMappingIter::Two(second, third))
            }
        };
        *self = rest;
        Some(front)
    }

    // The variant alone determines the exact remaining length.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining = match *self {
            CaseMappingIter::Zero => 0,
            CaseMappingIter::One(_) => 1,
            CaseMappingIter::Two(..) => 2,
            CaseMappingIter::Three(..) => 3,
        };
        (remaining, Some(remaining))
    }
}

impl DoubleEndedIterator for CaseMappingIter {
    // Pops the back character and shrinks to the next smaller variant.
    fn next_back(&mut self) -> Option<char> {
        let (back, rest) = match *self {
            CaseMappingIter::Zero => return None,
            CaseMappingIter::One(only) => (only, CaseMappingIter::Zero),
            CaseMappingIter::Two(first, second) => (second, CaseMappingIter::One(first)),
            CaseMappingIter::Three(first, second, third) => {
                (third, CaseMappingIter::Two(first, second))
            }
        };
        *self = rest;
        Some(back)
    }
}

impl fmt::Display for CaseMappingIter {
    // Writes the pending characters front to back, stopping at the first
    // write error. A clone is consumed, so `self` is not advanced.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        self.clone().try_for_each(|ch| f.write_char(ch))
    }
}
563
#[stable(feature = "char_struct_display", since = "1.16.0")]
impl fmt::Display for ToLowercase {
    // Uses the `Display` implementation of the wrapped `CaseMappingIter`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        fmt::Display::fmt(&self.0, f)
    }
}
570
#[stable(feature = "char_struct_display", since = "1.16.0")]
impl fmt::Display for ToUppercase {
    // Uses the `Display` implementation of the wrapped `CaseMappingIter`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        fmt::Display::fmt(&self.0, f)
    }
}
577
/// The error type returned when a checked char conversion fails.
///
/// The single field is a crate-visible unit value, so the error carries no
/// data and cannot be constructed outside this crate.
#[stable(feature = "u8_from_char", since = "1.59.0")]
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub struct TryFromCharError(pub(crate) ());
582
583 #[stable(feature = "u8_from_char", since = "1.59.0")]
584 impl fmt::Display for TryFromCharError {
585 fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
586 "unicode code point out of range".fmt(fmt)
587 }
588 }
589
// Compiled only when the `bootstrap` cfg is not set, matching the equally
// gated `use crate::error::Error` import. All trait methods keep their
// provided implementations.
#[cfg(not(bootstrap))]
#[stable(feature = "u8_from_char", since = "1.59.0")]
impl Error for TryFromCharError {}