]> git.proxmox.com Git - rustc.git/blob - src/libstd_unicode/char.rs
New upstream version 1.25.0+dfsg1
[rustc.git] / src / libstd_unicode / char.rs
1 // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10
11 //! A character type.
12 //!
13 //! The `char` type represents a single character. More specifically, since
14 //! 'character' isn't a well-defined concept in Unicode, `char` is a '[Unicode
15 //! scalar value]', which is similar to, but not the same as, a '[Unicode code
16 //! point]'.
17 //!
18 //! [Unicode scalar value]: http://www.unicode.org/glossary/#unicode_scalar_value
19 //! [Unicode code point]: http://www.unicode.org/glossary/#code_point
20 //!
21 //! This module exists for technical reasons, the primary documentation for
22 //! `char` is directly on [the `char` primitive type](../../std/primitive.char.html)
23 //! itself.
24 //!
25 //! This module is the home of the iterator implementations for the iterators
26 //! implemented on `char`, as well as some useful constants and conversion
27 //! functions that convert various types to `char`.
28
29 #![stable(feature = "rust1", since = "1.0.0")]
30
31 use core::char::CharExt as C;
32 use core::iter::FusedIterator;
33 use core::fmt::{self, Write};
34 use tables::{conversions, derived_property, general_category, property};
35
36 // stable re-exports
37 #[stable(feature = "rust1", since = "1.0.0")]
38 pub use core::char::{MAX, from_digit, from_u32, from_u32_unchecked};
39 #[stable(feature = "rust1", since = "1.0.0")]
40 pub use core::char::{EscapeDebug, EscapeDefault, EscapeUnicode};
41 #[stable(feature = "char_from_str", since = "1.20.0")]
42 pub use core::char::ParseCharError;
43
44 // unstable re-exports
45 #[unstable(feature = "try_from", issue = "33417")]
46 pub use core::char::CharTryFromError;
47 #[unstable(feature = "decode_utf8", issue = "33906")]
48 pub use core::char::{DecodeUtf8, decode_utf8};
49 #[unstable(feature = "unicode", issue = "27783")]
50 pub use tables::{UNICODE_VERSION};
51 #[unstable(feature = "unicode", issue = "27783")]
52 pub use version::UnicodeVersion;
53
54 /// Returns an iterator that yields the lowercase equivalent of a `char`.
55 ///
56 /// This `struct` is created by the [`to_lowercase`] method on [`char`]. See
57 /// its documentation for more.
58 ///
59 /// [`to_lowercase`]: ../../std/primitive.char.html#method.to_lowercase
60 /// [`char`]: ../../std/primitive.char.html
61 #[stable(feature = "rust1", since = "1.0.0")]
62 #[derive(Debug)]
63 pub struct ToLowercase(CaseMappingIter);
64
65 #[stable(feature = "rust1", since = "1.0.0")]
66 impl Iterator for ToLowercase {
67 type Item = char;
68 fn next(&mut self) -> Option<char> {
69 self.0.next()
70 }
71 }
72
73 #[unstable(feature = "fused", issue = "35602")]
74 impl FusedIterator for ToLowercase {}
75
76 /// Returns an iterator that yields the uppercase equivalent of a `char`.
77 ///
78 /// This `struct` is created by the [`to_uppercase`] method on [`char`]. See
79 /// its documentation for more.
80 ///
81 /// [`to_uppercase`]: ../../std/primitive.char.html#method.to_uppercase
82 /// [`char`]: ../../std/primitive.char.html
83 #[stable(feature = "rust1", since = "1.0.0")]
84 #[derive(Debug)]
85 pub struct ToUppercase(CaseMappingIter);
86
87 #[stable(feature = "rust1", since = "1.0.0")]
88 impl Iterator for ToUppercase {
89 type Item = char;
90 fn next(&mut self) -> Option<char> {
91 self.0.next()
92 }
93 }
94
95 #[unstable(feature = "fused", issue = "35602")]
96 impl FusedIterator for ToUppercase {}
97
/// The characters of a case mapping that have not been yielded yet.
///
/// A mapping expands to at most three characters; each variant holds exactly
/// the characters still pending, in the order they will be produced.
#[derive(Debug)]
enum CaseMappingIter {
    Three(char, char, char),
    Two(char, char),
    One(char),
    Zero,
}

impl CaseMappingIter {
    /// Builds the iterator from a three-slot mapping in which trailing `'\0'`
    /// slots mean "unused".
    ///
    /// The first slot is always yielded, so an all-`'\0'` input produces a
    /// single `'\0'`.
    fn new(chars: [char; 3]) -> CaseMappingIter {
        match (chars[1], chars[2]) {
            // Only the first slot is used (this includes chars[0] == '\0').
            ('\0', '\0') => CaseMappingIter::One(chars[0]),
            (second, '\0') => CaseMappingIter::Two(chars[0], second),
            (second, third) => CaseMappingIter::Three(chars[0], second, third),
        }
    }
}

impl Iterator for CaseMappingIter {
    type Item = char;

    /// Yields the next pending character and shrinks the state by one.
    fn next(&mut self) -> Option<char> {
        let (head, rest) = match *self {
            CaseMappingIter::Three(a, b, c) => (a, CaseMappingIter::Two(b, c)),
            CaseMappingIter::Two(b, c) => (b, CaseMappingIter::One(c)),
            CaseMappingIter::One(c) => (c, CaseMappingIter::Zero),
            CaseMappingIter::Zero => return None,
        };
        *self = rest;
        Some(head)
    }

    /// Reports the exact number of characters left; the variant encodes it.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining = match *self {
            CaseMappingIter::Three(..) => 3,
            CaseMappingIter::Two(..) => 2,
            CaseMappingIter::One(..) => 1,
            CaseMappingIter::Zero => 0,
        };
        (remaining, Some(remaining))
    }
}

impl fmt::Display for CaseMappingIter {
    /// Writes the pending characters, in order, without consuming them.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // Copy the pending characters into a fixed buffer; only the first
        // `len` entries are meaningful, the rest is padding.
        let (pending, len) = match *self {
            CaseMappingIter::Three(a, b, c) => ([a, b, c], 3),
            CaseMappingIter::Two(b, c) => ([b, c, '\0'], 2),
            CaseMappingIter::One(c) => ([c, '\0', '\0'], 1),
            CaseMappingIter::Zero => (['\0'; 3], 0),
        };
        for &ch in &pending[..len] {
            f.write_char(ch)?;
        }
        Ok(())
    }
}
160
161 #[stable(feature = "char_struct_display", since = "1.16.0")]
162 impl fmt::Display for ToLowercase {
163 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
164 fmt::Display::fmt(&self.0, f)
165 }
166 }
167
168 #[stable(feature = "char_struct_display", since = "1.16.0")]
169 impl fmt::Display for ToUppercase {
170 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
171 fmt::Display::fmt(&self.0, f)
172 }
173 }
174
175 #[lang = "char"]
176 impl char {
177 /// Checks if a `char` is a digit in the given radix.
178 ///
179 /// A 'radix' here is sometimes also called a 'base'. A radix of two
180 /// indicates a binary number, a radix of ten, decimal, and a radix of
181 /// sixteen, hexadecimal, to give some common values. Arbitrary
182 /// radices are supported.
183 ///
184 /// Compared to `is_numeric()`, this function only recognizes the characters
185 /// `0-9`, `a-z` and `A-Z`.
186 ///
187 /// 'Digit' is defined to be only the following characters:
188 ///
189 /// * `0-9`
190 /// * `a-z`
191 /// * `A-Z`
192 ///
193 /// For a more comprehensive understanding of 'digit', see [`is_numeric`][is_numeric].
194 ///
195 /// [is_numeric]: #method.is_numeric
196 ///
197 /// # Panics
198 ///
199 /// Panics if given a radix larger than 36.
200 ///
201 /// # Examples
202 ///
203 /// Basic usage:
204 ///
205 /// ```
206 /// assert!('1'.is_digit(10));
207 /// assert!('f'.is_digit(16));
208 /// assert!(!'f'.is_digit(10));
209 /// ```
210 ///
211 /// Passing a large radix, causing a panic:
212 ///
213 /// ```
214 /// use std::thread;
215 ///
216 /// let result = thread::spawn(|| {
217 /// // this panics
218 /// '1'.is_digit(37);
219 /// }).join();
220 ///
221 /// assert!(result.is_err());
222 /// ```
223 #[stable(feature = "rust1", since = "1.0.0")]
224 #[inline]
225 pub fn is_digit(self, radix: u32) -> bool {
226 C::is_digit(self, radix)
227 }
228
229 /// Converts a `char` to a digit in the given radix.
230 ///
231 /// A 'radix' here is sometimes also called a 'base'. A radix of two
232 /// indicates a binary number, a radix of ten, decimal, and a radix of
233 /// sixteen, hexadecimal, to give some common values. Arbitrary
234 /// radices are supported.
235 ///
236 /// 'Digit' is defined to be only the following characters:
237 ///
238 /// * `0-9`
239 /// * `a-z`
240 /// * `A-Z`
241 ///
242 /// # Errors
243 ///
244 /// Returns `None` if the `char` does not refer to a digit in the given radix.
245 ///
246 /// # Panics
247 ///
248 /// Panics if given a radix larger than 36.
249 ///
250 /// # Examples
251 ///
252 /// Basic usage:
253 ///
254 /// ```
255 /// assert_eq!('1'.to_digit(10), Some(1));
256 /// assert_eq!('f'.to_digit(16), Some(15));
257 /// ```
258 ///
259 /// Passing a non-digit results in failure:
260 ///
261 /// ```
262 /// assert_eq!('f'.to_digit(10), None);
263 /// assert_eq!('z'.to_digit(16), None);
264 /// ```
265 ///
266 /// Passing a large radix, causing a panic:
267 ///
268 /// ```
269 /// use std::thread;
270 ///
271 /// let result = thread::spawn(|| {
272 /// '1'.to_digit(37);
273 /// }).join();
274 ///
275 /// assert!(result.is_err());
276 /// ```
277 #[stable(feature = "rust1", since = "1.0.0")]
278 #[inline]
279 pub fn to_digit(self, radix: u32) -> Option<u32> {
280 C::to_digit(self, radix)
281 }
282
283 /// Returns an iterator that yields the hexadecimal Unicode escape of a
284 /// character as `char`s.
285 ///
286 /// This will escape characters with the Rust syntax of the form
287 /// `\u{NNNNNN}` where `NNNNNN` is a hexadecimal representation.
288 ///
289 /// # Examples
290 ///
291 /// As an iterator:
292 ///
293 /// ```
294 /// for c in '❤'.escape_unicode() {
295 /// print!("{}", c);
296 /// }
297 /// println!();
298 /// ```
299 ///
300 /// Using `println!` directly:
301 ///
302 /// ```
303 /// println!("{}", '❤'.escape_unicode());
304 /// ```
305 ///
306 /// Both are equivalent to:
307 ///
308 /// ```
309 /// println!("\\u{{2764}}");
310 /// ```
311 ///
312 /// Using `to_string`:
313 ///
314 /// ```
315 /// assert_eq!('❤'.escape_unicode().to_string(), "\\u{2764}");
316 /// ```
317 #[stable(feature = "rust1", since = "1.0.0")]
318 #[inline]
319 pub fn escape_unicode(self) -> EscapeUnicode {
320 C::escape_unicode(self)
321 }
322
323 /// Returns an iterator that yields the literal escape code of a character
324 /// as `char`s.
325 ///
326 /// This will escape the characters similar to the `Debug` implementations
327 /// of `str` or `char`.
328 ///
329 /// # Examples
330 ///
331 /// As an iterator:
332 ///
333 /// ```
334 /// for c in '\n'.escape_debug() {
335 /// print!("{}", c);
336 /// }
337 /// println!();
338 /// ```
339 ///
340 /// Using `println!` directly:
341 ///
342 /// ```
343 /// println!("{}", '\n'.escape_debug());
344 /// ```
345 ///
346 /// Both are equivalent to:
347 ///
348 /// ```
349 /// println!("\\n");
350 /// ```
351 ///
352 /// Using `to_string`:
353 ///
354 /// ```
355 /// assert_eq!('\n'.escape_debug().to_string(), "\\n");
356 /// ```
357 #[stable(feature = "char_escape_debug", since = "1.20.0")]
358 #[inline]
359 pub fn escape_debug(self) -> EscapeDebug {
360 C::escape_debug(self)
361 }
362
363 /// Returns an iterator that yields the literal escape code of a character
364 /// as `char`s.
365 ///
366 /// The default is chosen with a bias toward producing literals that are
367 /// legal in a variety of languages, including C++11 and similar C-family
368 /// languages. The exact rules are:
369 ///
370 /// * Tab is escaped as `\t`.
371 /// * Carriage return is escaped as `\r`.
372 /// * Line feed is escaped as `\n`.
373 /// * Single quote is escaped as `\'`.
374 /// * Double quote is escaped as `\"`.
375 /// * Backslash is escaped as `\\`.
376 /// * Any character in the 'printable ASCII' range `0x20` .. `0x7e`
377 /// inclusive is not escaped.
378 /// * All other characters are given hexadecimal Unicode escapes; see
379 /// [`escape_unicode`][escape_unicode].
380 ///
381 /// [escape_unicode]: #method.escape_unicode
382 ///
383 /// # Examples
384 ///
385 /// As an iterator:
386 ///
387 /// ```
388 /// for c in '"'.escape_default() {
389 /// print!("{}", c);
390 /// }
391 /// println!();
392 /// ```
393 ///
394 /// Using `println!` directly:
395 ///
396 /// ```
397 /// println!("{}", '"'.escape_default());
398 /// ```
399 ///
400 ///
401 /// Both are equivalent to:
402 ///
403 /// ```
404 /// println!("\\\"");
405 /// ```
406 ///
407 /// Using `to_string`:
408 ///
409 /// ```
410 /// assert_eq!('"'.escape_default().to_string(), "\\\"");
411 /// ```
412 #[stable(feature = "rust1", since = "1.0.0")]
413 #[inline]
414 pub fn escape_default(self) -> EscapeDefault {
415 C::escape_default(self)
416 }
417
418 /// Returns the number of bytes this `char` would need if encoded in UTF-8.
419 ///
420 /// That number of bytes is always between 1 and 4, inclusive.
421 ///
422 /// # Examples
423 ///
424 /// Basic usage:
425 ///
426 /// ```
427 /// let len = 'A'.len_utf8();
428 /// assert_eq!(len, 1);
429 ///
430 /// let len = 'ß'.len_utf8();
431 /// assert_eq!(len, 2);
432 ///
433 /// let len = 'ℝ'.len_utf8();
434 /// assert_eq!(len, 3);
435 ///
436 /// let len = '💣'.len_utf8();
437 /// assert_eq!(len, 4);
438 /// ```
439 ///
440 /// The `&str` type guarantees that its contents are UTF-8, and so we can compare the length it
441 /// would take if each code point was represented as a `char` vs in the `&str` itself:
442 ///
443 /// ```
444 /// // as chars
445 /// let eastern = '東';
446 /// let capitol = '京';
447 ///
448 /// // both can be represented as three bytes
449 /// assert_eq!(3, eastern.len_utf8());
450 /// assert_eq!(3, capitol.len_utf8());
451 ///
452 /// // as a &str, these two are encoded in UTF-8
453 /// let tokyo = "東京";
454 ///
455 /// let len = eastern.len_utf8() + capitol.len_utf8();
456 ///
457 /// // we can see that they take six bytes total...
458 /// assert_eq!(6, tokyo.len());
459 ///
460 /// // ... just like the &str
461 /// assert_eq!(len, tokyo.len());
462 /// ```
463 #[stable(feature = "rust1", since = "1.0.0")]
464 #[inline]
465 pub fn len_utf8(self) -> usize {
466 C::len_utf8(self)
467 }
468
469 /// Returns the number of 16-bit code units this `char` would need if
470 /// encoded in UTF-16.
471 ///
472 /// See the documentation for [`len_utf8`] for more explanation of this
473 /// concept. This function is a mirror, but for UTF-16 instead of UTF-8.
474 ///
475 /// [`len_utf8`]: #method.len_utf8
476 ///
477 /// # Examples
478 ///
479 /// Basic usage:
480 ///
481 /// ```
482 /// let n = 'ß'.len_utf16();
483 /// assert_eq!(n, 1);
484 ///
485 /// let len = '💣'.len_utf16();
486 /// assert_eq!(len, 2);
487 /// ```
488 #[stable(feature = "rust1", since = "1.0.0")]
489 #[inline]
490 pub fn len_utf16(self) -> usize {
491 C::len_utf16(self)
492 }
493
494 /// Encodes this character as UTF-8 into the provided byte buffer,
495 /// and then returns the subslice of the buffer that contains the encoded character.
496 ///
497 /// # Panics
498 ///
499 /// Panics if the buffer is not large enough.
500 /// A buffer of length four is large enough to encode any `char`.
501 ///
502 /// # Examples
503 ///
504 /// In both of these examples, 'ß' takes two bytes to encode.
505 ///
506 /// ```
507 /// let mut b = [0; 2];
508 ///
509 /// let result = 'ß'.encode_utf8(&mut b);
510 ///
511 /// assert_eq!(result, "ß");
512 ///
513 /// assert_eq!(result.len(), 2);
514 /// ```
515 ///
516 /// A buffer that's too small:
517 ///
518 /// ```
519 /// use std::thread;
520 ///
521 /// let result = thread::spawn(|| {
522 /// let mut b = [0; 1];
523 ///
524 /// // this panics
525 /// 'ß'.encode_utf8(&mut b);
526 /// }).join();
527 ///
528 /// assert!(result.is_err());
529 /// ```
530 #[stable(feature = "unicode_encode_char", since = "1.15.0")]
531 #[inline]
532 pub fn encode_utf8(self, dst: &mut [u8]) -> &mut str {
533 C::encode_utf8(self, dst)
534 }
535
536 /// Encodes this character as UTF-16 into the provided `u16` buffer,
537 /// and then returns the subslice of the buffer that contains the encoded character.
538 ///
539 /// # Panics
540 ///
541 /// Panics if the buffer is not large enough.
542 /// A buffer of length 2 is large enough to encode any `char`.
543 ///
544 /// # Examples
545 ///
546 /// In both of these examples, '𝕊' takes two `u16`s to encode.
547 ///
548 /// ```
549 /// let mut b = [0; 2];
550 ///
551 /// let result = '𝕊'.encode_utf16(&mut b);
552 ///
553 /// assert_eq!(result.len(), 2);
554 /// ```
555 ///
556 /// A buffer that's too small:
557 ///
558 /// ```
559 /// use std::thread;
560 ///
561 /// let result = thread::spawn(|| {
562 /// let mut b = [0; 1];
563 ///
564 /// // this panics
565 /// '𝕊'.encode_utf16(&mut b);
566 /// }).join();
567 ///
568 /// assert!(result.is_err());
569 /// ```
570 #[stable(feature = "unicode_encode_char", since = "1.15.0")]
571 #[inline]
572 pub fn encode_utf16(self, dst: &mut [u16]) -> &mut [u16] {
573 C::encode_utf16(self, dst)
574 }
575
576 /// Returns true if this `char` is an alphabetic code point, and false if not.
577 ///
578 /// # Examples
579 ///
580 /// Basic usage:
581 ///
582 /// ```
583 /// assert!('a'.is_alphabetic());
584 /// assert!('京'.is_alphabetic());
585 ///
586 /// let c = '💝';
587 /// // love is many things, but it is not alphabetic
588 /// assert!(!c.is_alphabetic());
589 /// ```
590 #[stable(feature = "rust1", since = "1.0.0")]
591 #[inline]
592 pub fn is_alphabetic(self) -> bool {
593 match self {
594 'a'...'z' | 'A'...'Z' => true,
595 c if c > '\x7f' => derived_property::Alphabetic(c),
596 _ => false,
597 }
598 }
599
600 /// Returns true if this `char` satisfies the 'XID_Start' Unicode property, and false
601 /// otherwise.
602 ///
603 /// 'XID_Start' is a Unicode Derived Property specified in
604 /// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
605 /// mostly similar to `ID_Start` but modified for closure under `NFKx`.
606 #[unstable(feature = "rustc_private",
607 reason = "mainly needed for compiler internals",
608 issue = "27812")]
609 #[inline]
610 pub fn is_xid_start(self) -> bool {
611 derived_property::XID_Start(self)
612 }
613
614 /// Returns true if this `char` satisfies the 'XID_Continue' Unicode property, and false
615 /// otherwise.
616 ///
617 /// 'XID_Continue' is a Unicode Derived Property specified in
618 /// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
619 /// mostly similar to 'ID_Continue' but modified for closure under NFKx.
620 #[unstable(feature = "rustc_private",
621 reason = "mainly needed for compiler internals",
622 issue = "27812")]
623 #[inline]
624 pub fn is_xid_continue(self) -> bool {
625 derived_property::XID_Continue(self)
626 }
627
628 /// Returns true if this `char` is lowercase, and false otherwise.
629 ///
630 /// 'Lowercase' is defined according to the terms of the Unicode Derived Core
631 /// Property `Lowercase`.
632 ///
633 /// # Examples
634 ///
635 /// Basic usage:
636 ///
637 /// ```
638 /// assert!('a'.is_lowercase());
639 /// assert!('δ'.is_lowercase());
640 /// assert!(!'A'.is_lowercase());
641 /// assert!(!'Δ'.is_lowercase());
642 ///
643 /// // The various Chinese scripts do not have case, and so:
644 /// assert!(!'中'.is_lowercase());
645 /// ```
646 #[stable(feature = "rust1", since = "1.0.0")]
647 #[inline]
648 pub fn is_lowercase(self) -> bool {
649 match self {
650 'a'...'z' => true,
651 c if c > '\x7f' => derived_property::Lowercase(c),
652 _ => false,
653 }
654 }
655
656 /// Returns true if this `char` is uppercase, and false otherwise.
657 ///
658 /// 'Uppercase' is defined according to the terms of the Unicode Derived Core
659 /// Property `Uppercase`.
660 ///
661 /// # Examples
662 ///
663 /// Basic usage:
664 ///
665 /// ```
666 /// assert!(!'a'.is_uppercase());
667 /// assert!(!'δ'.is_uppercase());
668 /// assert!('A'.is_uppercase());
669 /// assert!('Δ'.is_uppercase());
670 ///
671 /// // The various Chinese scripts do not have case, and so:
672 /// assert!(!'中'.is_uppercase());
673 /// ```
674 #[stable(feature = "rust1", since = "1.0.0")]
675 #[inline]
676 pub fn is_uppercase(self) -> bool {
677 match self {
678 'A'...'Z' => true,
679 c if c > '\x7f' => derived_property::Uppercase(c),
680 _ => false,
681 }
682 }
683
684 /// Returns true if this `char` is whitespace, and false otherwise.
685 ///
686 /// 'Whitespace' is defined according to the terms of the Unicode Derived Core
687 /// Property `White_Space`.
688 ///
689 /// # Examples
690 ///
691 /// Basic usage:
692 ///
693 /// ```
694 /// assert!(' '.is_whitespace());
695 ///
696 /// // a non-breaking space
697 /// assert!('\u{A0}'.is_whitespace());
698 ///
699 /// assert!(!'越'.is_whitespace());
700 /// ```
701 #[stable(feature = "rust1", since = "1.0.0")]
702 #[inline]
703 pub fn is_whitespace(self) -> bool {
704 match self {
705 ' ' | '\x09'...'\x0d' => true,
706 c if c > '\x7f' => property::White_Space(c),
707 _ => false,
708 }
709 }
710
711 /// Returns true if this `char` is alphanumeric, and false otherwise.
712 ///
713 /// 'Alphanumeric'-ness is defined in terms of the Unicode General Categories
714 /// 'Nd', 'Nl', 'No' and the Derived Core Property 'Alphabetic'.
715 ///
716 /// # Examples
717 ///
718 /// Basic usage:
719 ///
720 /// ```
721 /// assert!('٣'.is_alphanumeric());
722 /// assert!('7'.is_alphanumeric());
723 /// assert!('৬'.is_alphanumeric());
724 /// assert!('K'.is_alphanumeric());
725 /// assert!('و'.is_alphanumeric());
726 /// assert!('藏'.is_alphanumeric());
727 /// assert!(!'¾'.is_alphanumeric());
728 /// assert!(!'①'.is_alphanumeric());
729 /// ```
730 #[stable(feature = "rust1", since = "1.0.0")]
731 #[inline]
732 pub fn is_alphanumeric(self) -> bool {
733 self.is_alphabetic() || self.is_numeric()
734 }
735
736 /// Returns true if this `char` is a control code point, and false otherwise.
737 ///
738 /// 'Control code point' is defined in terms of the Unicode General
739 /// Category `Cc`.
740 ///
741 /// # Examples
742 ///
743 /// Basic usage:
744 ///
745 /// ```
746 /// // U+009C, STRING TERMINATOR
747 /// assert!('\9c'.is_control());
748 /// assert!(!'q'.is_control());
749 /// ```
750 #[stable(feature = "rust1", since = "1.0.0")]
751 #[inline]
752 pub fn is_control(self) -> bool {
753 general_category::Cc(self)
754 }
755
756 /// Returns true if this `char` is numeric, and false otherwise.
757 ///
758 /// 'Numeric'-ness is defined in terms of the Unicode General Categories
759 /// 'Nd', 'Nl', 'No'.
760 ///
761 /// # Examples
762 ///
763 /// Basic usage:
764 ///
765 /// ```
766 /// assert!('٣'.is_numeric());
767 /// assert!('7'.is_numeric());
768 /// assert!('৬'.is_numeric());
769 /// assert!(!'K'.is_numeric());
770 /// assert!(!'و'.is_numeric());
771 /// assert!(!'藏'.is_numeric());
772 /// assert!(!'¾'.is_numeric());
773 /// assert!(!'①'.is_numeric());
774 /// ```
775 #[stable(feature = "rust1", since = "1.0.0")]
776 #[inline]
777 pub fn is_numeric(self) -> bool {
778 match self {
779 '0'...'9' => true,
780 c if c > '\x7f' => general_category::N(c),
781 _ => false,
782 }
783 }
784
785 /// Returns an iterator that yields the lowercase equivalent of a `char`
786 /// as one or more `char`s.
787 ///
788 /// If a character does not have a lowercase equivalent, the same character
789 /// will be returned back by the iterator.
790 ///
791 /// This performs complex unconditional mappings with no tailoring: it maps
792 /// one Unicode character to its lowercase equivalent according to the
793 /// [Unicode database] and the additional complex mappings
794 /// [`SpecialCasing.txt`]. Conditional mappings (based on context or
795 /// language) are not considered here.
796 ///
797 /// For a full reference, see [here][reference].
798 ///
799 /// [Unicode database]: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
800 ///
801 /// [`SpecialCasing.txt`]: ftp://ftp.unicode.org/Public/UNIDATA/SpecialCasing.txt
802 ///
803 /// [reference]: http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G33992
804 ///
805 /// # Examples
806 ///
807 /// As an iterator:
808 ///
809 /// ```
810 /// for c in 'İ'.to_lowercase() {
811 /// print!("{}", c);
812 /// }
813 /// println!();
814 /// ```
815 ///
816 /// Using `println!` directly:
817 ///
818 /// ```
819 /// println!("{}", 'İ'.to_lowercase());
820 /// ```
821 ///
822 /// Both are equivalent to:
823 ///
824 /// ```
825 /// println!("i\u{307}");
826 /// ```
827 ///
828 /// Using `to_string`:
829 ///
830 /// ```
831 /// assert_eq!('C'.to_lowercase().to_string(), "c");
832 ///
833 /// // Sometimes the result is more than one character:
834 /// assert_eq!('İ'.to_lowercase().to_string(), "i\u{307}");
835 ///
836 /// // Characters that do not have both uppercase and lowercase
837 /// // convert into themselves.
838 /// assert_eq!('山'.to_lowercase().to_string(), "山");
839 /// ```
840 #[stable(feature = "rust1", since = "1.0.0")]
841 #[inline]
842 pub fn to_lowercase(self) -> ToLowercase {
843 ToLowercase(CaseMappingIter::new(conversions::to_lower(self)))
844 }
845
846 /// Returns an iterator that yields the uppercase equivalent of a `char`
847 /// as one or more `char`s.
848 ///
849 /// If a character does not have an uppercase equivalent, the same character
850 /// will be returned back by the iterator.
851 ///
852 /// This performs complex unconditional mappings with no tailoring: it maps
853 /// one Unicode character to its uppercase equivalent according to the
854 /// [Unicode database] and the additional complex mappings
855 /// [`SpecialCasing.txt`]. Conditional mappings (based on context or
856 /// language) are not considered here.
857 ///
858 /// For a full reference, see [here][reference].
859 ///
860 /// [Unicode database]: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
861 ///
862 /// [`SpecialCasing.txt`]: ftp://ftp.unicode.org/Public/UNIDATA/SpecialCasing.txt
863 ///
864 /// [reference]: http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G33992
865 ///
866 /// # Examples
867 ///
868 /// As an iterator:
869 ///
870 /// ```
871 /// for c in 'ß'.to_uppercase() {
872 /// print!("{}", c);
873 /// }
874 /// println!();
875 /// ```
876 ///
877 /// Using `println!` directly:
878 ///
879 /// ```
880 /// println!("{}", 'ß'.to_uppercase());
881 /// ```
882 ///
883 /// Both are equivalent to:
884 ///
885 /// ```
886 /// println!("SS");
887 /// ```
888 ///
889 /// Using `to_string`:
890 ///
891 /// ```
892 /// assert_eq!('c'.to_uppercase().to_string(), "C");
893 ///
894 /// // Sometimes the result is more than one character:
895 /// assert_eq!('ß'.to_uppercase().to_string(), "SS");
896 ///
897 /// // Characters that do not have both uppercase and lowercase
898 /// // convert into themselves.
899 /// assert_eq!('山'.to_uppercase().to_string(), "山");
900 /// ```
901 ///
902 /// # Note on locale
903 ///
904 /// In Turkish, the equivalent of 'i' in Latin has five forms instead of two:
905 ///
906 /// * 'Dotless': I / ı, sometimes written ï
907 /// * 'Dotted': İ / i
908 ///
909 /// Note that the lowercase dotted 'i' is the same as the Latin. Therefore:
910 ///
911 /// ```
912 /// let upper_i = 'i'.to_uppercase().to_string();
913 /// ```
914 ///
915 /// The value of `upper_i` here relies on the language of the text: if we're
916 /// in `en-US`, it should be `"I"`, but if we're in `tr_TR`, it should
917 /// be `"İ"`. `to_uppercase()` does not take this into account, and so:
918 ///
919 /// ```
920 /// let upper_i = 'i'.to_uppercase().to_string();
921 ///
922 /// assert_eq!(upper_i, "I");
923 /// ```
924 ///
925 /// holds across languages.
926 #[stable(feature = "rust1", since = "1.0.0")]
927 #[inline]
928 pub fn to_uppercase(self) -> ToUppercase {
929 ToUppercase(CaseMappingIter::new(conversions::to_upper(self)))
930 }
931
932 /// Checks if the value is within the ASCII range.
933 ///
934 /// # Examples
935 ///
936 /// ```
937 /// let ascii = 'a';
938 /// let non_ascii = '❤';
939 ///
940 /// assert!(ascii.is_ascii());
941 /// assert!(!non_ascii.is_ascii());
942 /// ```
943 #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")]
944 #[inline]
945 pub fn is_ascii(&self) -> bool {
946 *self as u32 <= 0x7F
947 }
948
949 /// Makes a copy of the value in its ASCII upper case equivalent.
950 ///
951 /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z',
952 /// but non-ASCII letters are unchanged.
953 ///
954 /// To uppercase the value in-place, use [`make_ascii_uppercase`].
955 ///
956 /// To uppercase ASCII characters in addition to non-ASCII characters, use
957 /// [`to_uppercase`].
958 ///
959 /// # Examples
960 ///
961 /// ```
962 /// let ascii = 'a';
963 /// let non_ascii = '❤';
964 ///
965 /// assert_eq!('A', ascii.to_ascii_uppercase());
966 /// assert_eq!('❤', non_ascii.to_ascii_uppercase());
967 /// ```
968 ///
969 /// [`make_ascii_uppercase`]: #method.make_ascii_uppercase
970 /// [`to_uppercase`]: #method.to_uppercase
971 #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")]
972 #[inline]
973 pub fn to_ascii_uppercase(&self) -> char {
974 if self.is_ascii() {
975 (*self as u8).to_ascii_uppercase() as char
976 } else {
977 *self
978 }
979 }
980
981 /// Makes a copy of the value in its ASCII lower case equivalent.
982 ///
983 /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',
984 /// but non-ASCII letters are unchanged.
985 ///
986 /// To lowercase the value in-place, use [`make_ascii_lowercase`].
987 ///
988 /// To lowercase ASCII characters in addition to non-ASCII characters, use
989 /// [`to_lowercase`].
990 ///
991 /// # Examples
992 ///
993 /// ```
994 /// let ascii = 'A';
995 /// let non_ascii = '❤';
996 ///
997 /// assert_eq!('a', ascii.to_ascii_lowercase());
998 /// assert_eq!('❤', non_ascii.to_ascii_lowercase());
999 /// ```
1000 ///
1001 /// [`make_ascii_lowercase`]: #method.make_ascii_lowercase
1002 /// [`to_lowercase`]: #method.to_lowercase
1003 #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")]
1004 #[inline]
1005 pub fn to_ascii_lowercase(&self) -> char {
1006 if self.is_ascii() {
1007 (*self as u8).to_ascii_lowercase() as char
1008 } else {
1009 *self
1010 }
1011 }
1012
1013 /// Checks that two values are an ASCII case-insensitive match.
1014 ///
1015 /// Equivalent to `to_ascii_lowercase(a) == to_ascii_lowercase(b)`.
1016 ///
1017 /// # Examples
1018 ///
1019 /// ```
1020 /// let upper_a = 'A';
1021 /// let lower_a = 'a';
1022 /// let lower_z = 'z';
1023 ///
1024 /// assert!(upper_a.eq_ignore_ascii_case(&lower_a));
1025 /// assert!(upper_a.eq_ignore_ascii_case(&upper_a));
1026 /// assert!(!upper_a.eq_ignore_ascii_case(&lower_z));
1027 /// ```
1028 #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")]
1029 #[inline]
1030 pub fn eq_ignore_ascii_case(&self, other: &char) -> bool {
1031 self.to_ascii_lowercase() == other.to_ascii_lowercase()
1032 }
1033
1034 /// Converts this type to its ASCII upper case equivalent in-place.
1035 ///
1036 /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z',
1037 /// but non-ASCII letters are unchanged.
1038 ///
1039 /// To return a new uppercased value without modifying the existing one, use
1040 /// [`to_ascii_uppercase`].
1041 ///
1042 /// # Examples
1043 ///
1044 /// ```
1045 /// let mut ascii = 'a';
1046 ///
1047 /// ascii.make_ascii_uppercase();
1048 ///
1049 /// assert_eq!('A', ascii);
1050 /// ```
1051 ///
1052 /// [`to_ascii_uppercase`]: #method.to_ascii_uppercase
1053 #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")]
1054 #[inline]
1055 pub fn make_ascii_uppercase(&mut self) {
1056 *self = self.to_ascii_uppercase();
1057 }
1058
1059 /// Converts this type to its ASCII lower case equivalent in-place.
1060 ///
1061 /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',
1062 /// but non-ASCII letters are unchanged.
1063 ///
1064 /// To return a new lowercased value without modifying the existing one, use
1065 /// [`to_ascii_lowercase`].
1066 ///
1067 /// # Examples
1068 ///
1069 /// ```
1070 /// let mut ascii = 'A';
1071 ///
1072 /// ascii.make_ascii_lowercase();
1073 ///
1074 /// assert_eq!('a', ascii);
1075 /// ```
1076 ///
1077 /// [`to_ascii_lowercase`]: #method.to_ascii_lowercase
1078 #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")]
1079 #[inline]
1080 pub fn make_ascii_lowercase(&mut self) {
1081 *self = self.to_ascii_lowercase();
1082 }
1083
1084 /// Checks if the value is an ASCII alphabetic character:
1085 ///
1086 /// - U+0041 'A' ... U+005A 'Z', or
1087 /// - U+0061 'a' ... U+007A 'z'.
1088 ///
1089 /// # Examples
1090 ///
1091 /// ```
1092 /// #![feature(ascii_ctype)]
1093 ///
1094 /// let uppercase_a = 'A';
1095 /// let uppercase_g = 'G';
1096 /// let a = 'a';
1097 /// let g = 'g';
1098 /// let zero = '0';
1099 /// let percent = '%';
1100 /// let space = ' ';
1101 /// let lf = '\n';
1102 /// let esc: char = 0x1b_u8.into();
1103 ///
1104 /// assert!(uppercase_a.is_ascii_alphabetic());
1105 /// assert!(uppercase_g.is_ascii_alphabetic());
1106 /// assert!(a.is_ascii_alphabetic());
1107 /// assert!(g.is_ascii_alphabetic());
1108 /// assert!(!zero.is_ascii_alphabetic());
1109 /// assert!(!percent.is_ascii_alphabetic());
1110 /// assert!(!space.is_ascii_alphabetic());
1111 /// assert!(!lf.is_ascii_alphabetic());
1112 /// assert!(!esc.is_ascii_alphabetic());
1113 /// ```
1114 #[stable(feature = "ascii_ctype_on_intrinsics", since = "1.24.0")]
1115 #[inline]
1116 pub fn is_ascii_alphabetic(&self) -> bool {
1117 self.is_ascii() && (*self as u8).is_ascii_alphabetic()
1118 }
1119
1120 /// Checks if the value is an ASCII uppercase character:
1121 /// U+0041 'A' ... U+005A 'Z'.
1122 ///
1123 /// # Examples
1124 ///
1125 /// ```
1126 /// #![feature(ascii_ctype)]
1127 ///
1128 /// let uppercase_a = 'A';
1129 /// let uppercase_g = 'G';
1130 /// let a = 'a';
1131 /// let g = 'g';
1132 /// let zero = '0';
1133 /// let percent = '%';
1134 /// let space = ' ';
1135 /// let lf = '\n';
1136 /// let esc: char = 0x1b_u8.into();
1137 ///
1138 /// assert!(uppercase_a.is_ascii_uppercase());
1139 /// assert!(uppercase_g.is_ascii_uppercase());
1140 /// assert!(!a.is_ascii_uppercase());
1141 /// assert!(!g.is_ascii_uppercase());
1142 /// assert!(!zero.is_ascii_uppercase());
1143 /// assert!(!percent.is_ascii_uppercase());
1144 /// assert!(!space.is_ascii_uppercase());
1145 /// assert!(!lf.is_ascii_uppercase());
1146 /// assert!(!esc.is_ascii_uppercase());
1147 /// ```
1148 #[stable(feature = "ascii_ctype_on_intrinsics", since = "1.24.0")]
1149 #[inline]
1150 pub fn is_ascii_uppercase(&self) -> bool {
1151 self.is_ascii() && (*self as u8).is_ascii_uppercase()
1152 }
1153
1154 /// Checks if the value is an ASCII lowercase character:
1155 /// U+0061 'a' ... U+007A 'z'.
1156 ///
1157 /// # Examples
1158 ///
1159 /// ```
1160 /// #![feature(ascii_ctype)]
1161 ///
1162 /// let uppercase_a = 'A';
1163 /// let uppercase_g = 'G';
1164 /// let a = 'a';
1165 /// let g = 'g';
1166 /// let zero = '0';
1167 /// let percent = '%';
1168 /// let space = ' ';
1169 /// let lf = '\n';
1170 /// let esc: char = 0x1b_u8.into();
1171 ///
1172 /// assert!(!uppercase_a.is_ascii_lowercase());
1173 /// assert!(!uppercase_g.is_ascii_lowercase());
1174 /// assert!(a.is_ascii_lowercase());
1175 /// assert!(g.is_ascii_lowercase());
1176 /// assert!(!zero.is_ascii_lowercase());
1177 /// assert!(!percent.is_ascii_lowercase());
1178 /// assert!(!space.is_ascii_lowercase());
1179 /// assert!(!lf.is_ascii_lowercase());
1180 /// assert!(!esc.is_ascii_lowercase());
1181 /// ```
1182 #[stable(feature = "ascii_ctype_on_intrinsics", since = "1.24.0")]
1183 #[inline]
1184 pub fn is_ascii_lowercase(&self) -> bool {
1185 self.is_ascii() && (*self as u8).is_ascii_lowercase()
1186 }
1187
1188 /// Checks if the value is an ASCII alphanumeric character:
1189 ///
1190 /// - U+0041 'A' ... U+005A 'Z', or
1191 /// - U+0061 'a' ... U+007A 'z', or
1192 /// - U+0030 '0' ... U+0039 '9'.
1193 ///
1194 /// # Examples
1195 ///
1196 /// ```
1197 /// #![feature(ascii_ctype)]
1198 ///
1199 /// let uppercase_a = 'A';
1200 /// let uppercase_g = 'G';
1201 /// let a = 'a';
1202 /// let g = 'g';
1203 /// let zero = '0';
1204 /// let percent = '%';
1205 /// let space = ' ';
1206 /// let lf = '\n';
1207 /// let esc: char = 0x1b_u8.into();
1208 ///
1209 /// assert!(uppercase_a.is_ascii_alphanumeric());
1210 /// assert!(uppercase_g.is_ascii_alphanumeric());
1211 /// assert!(a.is_ascii_alphanumeric());
1212 /// assert!(g.is_ascii_alphanumeric());
1213 /// assert!(zero.is_ascii_alphanumeric());
1214 /// assert!(!percent.is_ascii_alphanumeric());
1215 /// assert!(!space.is_ascii_alphanumeric());
1216 /// assert!(!lf.is_ascii_alphanumeric());
1217 /// assert!(!esc.is_ascii_alphanumeric());
1218 /// ```
1219 #[stable(feature = "ascii_ctype_on_intrinsics", since = "1.24.0")]
1220 #[inline]
1221 pub fn is_ascii_alphanumeric(&self) -> bool {
1222 self.is_ascii() && (*self as u8).is_ascii_alphanumeric()
1223 }
1224
1225 /// Checks if the value is an ASCII decimal digit:
1226 /// U+0030 '0' ... U+0039 '9'.
1227 ///
1228 /// # Examples
1229 ///
1230 /// ```
1231 /// #![feature(ascii_ctype)]
1232 ///
1233 /// let uppercase_a = 'A';
1234 /// let uppercase_g = 'G';
1235 /// let a = 'a';
1236 /// let g = 'g';
1237 /// let zero = '0';
1238 /// let percent = '%';
1239 /// let space = ' ';
1240 /// let lf = '\n';
1241 /// let esc: char = 0x1b_u8.into();
1242 ///
1243 /// assert!(!uppercase_a.is_ascii_digit());
1244 /// assert!(!uppercase_g.is_ascii_digit());
1245 /// assert!(!a.is_ascii_digit());
1246 /// assert!(!g.is_ascii_digit());
1247 /// assert!(zero.is_ascii_digit());
1248 /// assert!(!percent.is_ascii_digit());
1249 /// assert!(!space.is_ascii_digit());
1250 /// assert!(!lf.is_ascii_digit());
1251 /// assert!(!esc.is_ascii_digit());
1252 /// ```
1253 #[stable(feature = "ascii_ctype_on_intrinsics", since = "1.24.0")]
1254 #[inline]
1255 pub fn is_ascii_digit(&self) -> bool {
1256 self.is_ascii() && (*self as u8).is_ascii_digit()
1257 }
1258
1259 /// Checks if the value is an ASCII hexadecimal digit:
1260 ///
1261 /// - U+0030 '0' ... U+0039 '9', or
1262 /// - U+0041 'A' ... U+0046 'F', or
1263 /// - U+0061 'a' ... U+0066 'f'.
1264 ///
1265 /// # Examples
1266 ///
1267 /// ```
1268 /// #![feature(ascii_ctype)]
1269 ///
1270 /// let uppercase_a = 'A';
1271 /// let uppercase_g = 'G';
1272 /// let a = 'a';
1273 /// let g = 'g';
1274 /// let zero = '0';
1275 /// let percent = '%';
1276 /// let space = ' ';
1277 /// let lf = '\n';
1278 /// let esc: char = 0x1b_u8.into();
1279 ///
1280 /// assert!(uppercase_a.is_ascii_hexdigit());
1281 /// assert!(!uppercase_g.is_ascii_hexdigit());
1282 /// assert!(a.is_ascii_hexdigit());
1283 /// assert!(!g.is_ascii_hexdigit());
1284 /// assert!(zero.is_ascii_hexdigit());
1285 /// assert!(!percent.is_ascii_hexdigit());
1286 /// assert!(!space.is_ascii_hexdigit());
1287 /// assert!(!lf.is_ascii_hexdigit());
1288 /// assert!(!esc.is_ascii_hexdigit());
1289 /// ```
1290 #[stable(feature = "ascii_ctype_on_intrinsics", since = "1.24.0")]
1291 #[inline]
1292 pub fn is_ascii_hexdigit(&self) -> bool {
1293 self.is_ascii() && (*self as u8).is_ascii_hexdigit()
1294 }
1295
1296 /// Checks if the value is an ASCII punctuation character:
1297 ///
1298 /// - U+0021 ... U+002F `! " # $ % & ' ( ) * + , - . /`, or
1299 /// - U+003A ... U+0040 `: ; < = > ? @`, or
1300 /// - U+005B ... U+0060 ``[ \ ] ^ _ ` ``, or
1301 /// - U+007B ... U+007E `{ | } ~`
1302 ///
1303 /// # Examples
1304 ///
1305 /// ```
1306 /// #![feature(ascii_ctype)]
1307 ///
1308 /// let uppercase_a = 'A';
1309 /// let uppercase_g = 'G';
1310 /// let a = 'a';
1311 /// let g = 'g';
1312 /// let zero = '0';
1313 /// let percent = '%';
1314 /// let space = ' ';
1315 /// let lf = '\n';
1316 /// let esc: char = 0x1b_u8.into();
1317 ///
1318 /// assert!(!uppercase_a.is_ascii_punctuation());
1319 /// assert!(!uppercase_g.is_ascii_punctuation());
1320 /// assert!(!a.is_ascii_punctuation());
1321 /// assert!(!g.is_ascii_punctuation());
1322 /// assert!(!zero.is_ascii_punctuation());
1323 /// assert!(percent.is_ascii_punctuation());
1324 /// assert!(!space.is_ascii_punctuation());
1325 /// assert!(!lf.is_ascii_punctuation());
1326 /// assert!(!esc.is_ascii_punctuation());
1327 /// ```
1328 #[stable(feature = "ascii_ctype_on_intrinsics", since = "1.24.0")]
1329 #[inline]
1330 pub fn is_ascii_punctuation(&self) -> bool {
1331 self.is_ascii() && (*self as u8).is_ascii_punctuation()
1332 }
1333
1334 /// Checks if the value is an ASCII graphic character:
1335 /// U+0021 '@' ... U+007E '~'.
1336 ///
1337 /// # Examples
1338 ///
1339 /// ```
1340 /// #![feature(ascii_ctype)]
1341 ///
1342 /// let uppercase_a = 'A';
1343 /// let uppercase_g = 'G';
1344 /// let a = 'a';
1345 /// let g = 'g';
1346 /// let zero = '0';
1347 /// let percent = '%';
1348 /// let space = ' ';
1349 /// let lf = '\n';
1350 /// let esc: char = 0x1b_u8.into();
1351 ///
1352 /// assert!(uppercase_a.is_ascii_graphic());
1353 /// assert!(uppercase_g.is_ascii_graphic());
1354 /// assert!(a.is_ascii_graphic());
1355 /// assert!(g.is_ascii_graphic());
1356 /// assert!(zero.is_ascii_graphic());
1357 /// assert!(percent.is_ascii_graphic());
1358 /// assert!(!space.is_ascii_graphic());
1359 /// assert!(!lf.is_ascii_graphic());
1360 /// assert!(!esc.is_ascii_graphic());
1361 /// ```
1362 #[stable(feature = "ascii_ctype_on_intrinsics", since = "1.24.0")]
1363 #[inline]
1364 pub fn is_ascii_graphic(&self) -> bool {
1365 self.is_ascii() && (*self as u8).is_ascii_graphic()
1366 }
1367
1368 /// Checks if the value is an ASCII whitespace character:
1369 /// U+0020 SPACE, U+0009 HORIZONTAL TAB, U+000A LINE FEED,
1370 /// U+000C FORM FEED, or U+000D CARRIAGE RETURN.
1371 ///
1372 /// Rust uses the WhatWG Infra Standard's [definition of ASCII
1373 /// whitespace][infra-aw]. There are several other definitions in
1374 /// wide use. For instance, [the POSIX locale][pct] includes
1375 /// U+000B VERTICAL TAB as well as all the above characters,
1376 /// but—from the very same specification—[the default rule for
1377 /// "field splitting" in the Bourne shell][bfs] considers *only*
1378 /// SPACE, HORIZONTAL TAB, and LINE FEED as whitespace.
1379 ///
1380 /// If you are writing a program that will process an existing
1381 /// file format, check what that format's definition of whitespace is
1382 /// before using this function.
1383 ///
1384 /// [infra-aw]: https://infra.spec.whatwg.org/#ascii-whitespace
1385 /// [pct]: http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap07.html#tag_07_03_01
1386 /// [bfs]: http://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html#tag_18_06_05
1387 ///
1388 /// # Examples
1389 ///
1390 /// ```
1391 /// #![feature(ascii_ctype)]
1392 ///
1393 /// let uppercase_a = 'A';
1394 /// let uppercase_g = 'G';
1395 /// let a = 'a';
1396 /// let g = 'g';
1397 /// let zero = '0';
1398 /// let percent = '%';
1399 /// let space = ' ';
1400 /// let lf = '\n';
1401 /// let esc: char = 0x1b_u8.into();
1402 ///
1403 /// assert!(!uppercase_a.is_ascii_whitespace());
1404 /// assert!(!uppercase_g.is_ascii_whitespace());
1405 /// assert!(!a.is_ascii_whitespace());
1406 /// assert!(!g.is_ascii_whitespace());
1407 /// assert!(!zero.is_ascii_whitespace());
1408 /// assert!(!percent.is_ascii_whitespace());
1409 /// assert!(space.is_ascii_whitespace());
1410 /// assert!(lf.is_ascii_whitespace());
1411 /// assert!(!esc.is_ascii_whitespace());
1412 /// ```
1413 #[stable(feature = "ascii_ctype_on_intrinsics", since = "1.24.0")]
1414 #[inline]
1415 pub fn is_ascii_whitespace(&self) -> bool {
1416 self.is_ascii() && (*self as u8).is_ascii_whitespace()
1417 }
1418
1419 /// Checks if the value is an ASCII control character:
1420 /// U+0000 NUL ... U+001F UNIT SEPARATOR, or U+007F DELETE.
1421 /// Note that most ASCII whitespace characters are control
1422 /// characters, but SPACE is not.
1423 ///
1424 /// # Examples
1425 ///
1426 /// ```
1427 /// #![feature(ascii_ctype)]
1428 ///
1429 /// let uppercase_a = 'A';
1430 /// let uppercase_g = 'G';
1431 /// let a = 'a';
1432 /// let g = 'g';
1433 /// let zero = '0';
1434 /// let percent = '%';
1435 /// let space = ' ';
1436 /// let lf = '\n';
1437 /// let esc: char = 0x1b_u8.into();
1438 ///
1439 /// assert!(!uppercase_a.is_ascii_control());
1440 /// assert!(!uppercase_g.is_ascii_control());
1441 /// assert!(!a.is_ascii_control());
1442 /// assert!(!g.is_ascii_control());
1443 /// assert!(!zero.is_ascii_control());
1444 /// assert!(!percent.is_ascii_control());
1445 /// assert!(!space.is_ascii_control());
1446 /// assert!(lf.is_ascii_control());
1447 /// assert!(esc.is_ascii_control());
1448 /// ```
1449 #[stable(feature = "ascii_ctype_on_intrinsics", since = "1.24.0")]
1450 #[inline]
1451 pub fn is_ascii_control(&self) -> bool {
1452 self.is_ascii() && (*self as u8).is_ascii_control()
1453 }
1454 }
1455
1456 /// An iterator that decodes UTF-16 encoded code points from an iterator of `u16`s.
1457 #[stable(feature = "decode_utf16", since = "1.9.0")]
1458 #[derive(Clone, Debug)]
1459 pub struct DecodeUtf16<I>
1460 where I: Iterator<Item = u16>
1461 {
1462 iter: I,
1463 buf: Option<u16>,
1464 }
1465
1466 /// An error that can be returned when decoding UTF-16 code points.
1467 #[stable(feature = "decode_utf16", since = "1.9.0")]
1468 #[derive(Debug, Clone, Eq, PartialEq)]
1469 pub struct DecodeUtf16Error {
1470 code: u16,
1471 }
1472
1473 /// Create an iterator over the UTF-16 encoded code points in `iter`,
1474 /// returning unpaired surrogates as `Err`s.
1475 ///
1476 /// # Examples
1477 ///
1478 /// Basic usage:
1479 ///
1480 /// ```
1481 /// use std::char::decode_utf16;
1482 ///
1483 /// fn main() {
1484 /// // 𝄞mus<invalid>ic<invalid>
1485 /// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
1486 /// 0x0073, 0xDD1E, 0x0069, 0x0063,
1487 /// 0xD834];
1488 ///
1489 /// assert_eq!(decode_utf16(v.iter().cloned())
1490 /// .map(|r| r.map_err(|e| e.unpaired_surrogate()))
1491 /// .collect::<Vec<_>>(),
1492 /// vec![Ok('𝄞'),
1493 /// Ok('m'), Ok('u'), Ok('s'),
1494 /// Err(0xDD1E),
1495 /// Ok('i'), Ok('c'),
1496 /// Err(0xD834)]);
1497 /// }
1498 /// ```
1499 ///
1500 /// A lossy decoder can be obtained by replacing `Err` results with the replacement character:
1501 ///
1502 /// ```
1503 /// use std::char::{decode_utf16, REPLACEMENT_CHARACTER};
1504 ///
1505 /// fn main() {
1506 /// // 𝄞mus<invalid>ic<invalid>
1507 /// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
1508 /// 0x0073, 0xDD1E, 0x0069, 0x0063,
1509 /// 0xD834];
1510 ///
1511 /// assert_eq!(decode_utf16(v.iter().cloned())
1512 /// .map(|r| r.unwrap_or(REPLACEMENT_CHARACTER))
1513 /// .collect::<String>(),
1514 /// "𝄞mus�ic�");
1515 /// }
1516 /// ```
1517 #[stable(feature = "decode_utf16", since = "1.9.0")]
1518 #[inline]
1519 pub fn decode_utf16<I: IntoIterator<Item = u16>>(iter: I) -> DecodeUtf16<I::IntoIter> {
1520 DecodeUtf16 {
1521 iter: iter.into_iter(),
1522 buf: None,
1523 }
1524 }
1525
1526 #[stable(feature = "decode_utf16", since = "1.9.0")]
1527 impl<I: Iterator<Item = u16>> Iterator for DecodeUtf16<I> {
1528 type Item = Result<char, DecodeUtf16Error>;
1529
1530 fn next(&mut self) -> Option<Result<char, DecodeUtf16Error>> {
1531 let u = match self.buf.take() {
1532 Some(buf) => buf,
1533 None => self.iter.next()?
1534 };
1535
1536 if u < 0xD800 || 0xDFFF < u {
1537 // not a surrogate
1538 Some(Ok(unsafe { from_u32_unchecked(u as u32) }))
1539 } else if u >= 0xDC00 {
1540 // a trailing surrogate
1541 Some(Err(DecodeUtf16Error { code: u }))
1542 } else {
1543 let u2 = match self.iter.next() {
1544 Some(u2) => u2,
1545 // eof
1546 None => return Some(Err(DecodeUtf16Error { code: u })),
1547 };
1548 if u2 < 0xDC00 || u2 > 0xDFFF {
1549 // not a trailing surrogate so we're not a valid
1550 // surrogate pair, so rewind to redecode u2 next time.
1551 self.buf = Some(u2);
1552 return Some(Err(DecodeUtf16Error { code: u }));
1553 }
1554
1555 // all ok, so lets decode it.
1556 let c = (((u - 0xD800) as u32) << 10 | (u2 - 0xDC00) as u32) + 0x1_0000;
1557 Some(Ok(unsafe { from_u32_unchecked(c) }))
1558 }
1559 }
1560
1561 #[inline]
1562 fn size_hint(&self) -> (usize, Option<usize>) {
1563 let (low, high) = self.iter.size_hint();
1564 // we could be entirely valid surrogates (2 elements per
1565 // char), or entirely non-surrogates (1 element per char)
1566 (low / 2, high)
1567 }
1568 }
1569
1570 impl DecodeUtf16Error {
1571 /// Returns the unpaired surrogate which caused this error.
1572 #[stable(feature = "decode_utf16", since = "1.9.0")]
1573 pub fn unpaired_surrogate(&self) -> u16 {
1574 self.code
1575 }
1576 }
1577
1578 #[stable(feature = "decode_utf16", since = "1.9.0")]
1579 impl fmt::Display for DecodeUtf16Error {
1580 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1581 write!(f, "unpaired surrogate found: {:x}", self.code)
1582 }
1583 }
1584
1585 /// `U+FFFD REPLACEMENT CHARACTER` (�) is used in Unicode to represent a
1586 /// decoding error.
1587 ///
1588 /// It can occur, for example, when giving ill-formed UTF-8 bytes to
1589 /// [`String::from_utf8_lossy`](../../std/string/struct.String.html#method.from_utf8_lossy).
1590 #[stable(feature = "decode_utf16", since = "1.9.0")]
1591 pub const REPLACEMENT_CHARACTER: char = '\u{FFFD}';