src/librustc_unicode/char.rs

   1 // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
   2 // file at the top-level directory of this distribution and at
   3 // http://rust-lang.org/COPYRIGHT.
   4 //
   5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8 // option. This file may not be copied, modified, or distributed
   9 // except according to those terms.
  10
  11 //! Character manipulation (`char` type, Unicode Scalar Value)
  12 //!
//! This module provides the inherent methods of the primitive `char` type,
//! together with the iterators returned by its case-mapping methods, in
//! order to allow basic character manipulation.
  16 //!
  17 //! A `char` actually represents a
  18 //! *[Unicode Scalar
  19 //! Value](http://www.unicode.org/glossary/#unicode_scalar_value)*, as it can
  20 //! contain any Unicode code point except high-surrogate and low-surrogate code
  21 //! points.
  22 //!
  23 //! As such, only values in the ranges \[0x0,0xD7FF\] and \[0xE000,0x10FFFF\]
  24 //! (inclusive) are allowed. A `char` can always be safely cast to a `u32`;
  25 //! however the converse is not always true due to the above range limits
  26 //! and, as such, should be performed via the `from_u32` function.
  27
  28 #![stable(feature = "rust1", since = "1.0.0")]
  29 #![doc(primitive = "char")]
  30
  31 use core::char::CharExt as C;
  32 use core::option::Option::{self, Some, None};
  33 use core::iter::Iterator;
  34 use tables::{derived_property, property, general_category, conversions, charwidth};
  35
  36 // stable reexports
  37 pub use core::char::{MAX, from_u32, from_digit, EscapeUnicode, EscapeDefault};
  38
  39 // unstable reexports
  40 #[allow(deprecated)]
  41 pub use normalize::{decompose_canonical, decompose_compatible, compose};
  42 #[allow(deprecated)]
  43 pub use tables::normalization::canonical_combining_class;
  44 pub use tables::UNICODE_VERSION;
  45
  46 /// An iterator over the lowercase mapping of a given character, returned from
  47 /// the [`to_lowercase` method](../primitive.char.html#method.to_lowercase) on
  48 /// characters.
  49 #[stable(feature = "rust1", since = "1.0.0")]
  50 pub struct ToLowercase(CaseMappingIter);
  51
  52 #[stable(feature = "rust1", since = "1.0.0")]
  53 impl Iterator for ToLowercase {
  54     type Item = char;
  55     fn next(&mut self) -> Option<char> { self.0.next() }
  56 }
  57
  58 /// An iterator over the uppercase mapping of a given character, returned from
  59 /// the [`to_uppercase` method](../primitive.char.html#method.to_uppercase) on
  60 /// characters.
  61 #[stable(feature = "rust1", since = "1.0.0")]
  62 pub struct ToUppercase(CaseMappingIter);
  63
  64 #[stable(feature = "rust1", since = "1.0.0")]
  65 impl Iterator for ToUppercase {
  66     type Item = char;
  67     fn next(&mut self) -> Option<char> { self.0.next() }
  68 }
  69
  70 /// An iterator over the titlecase mapping of a given character, returned from
  71 /// the [`to_titlecase` method](../primitive.char.html#method.to_titlecase) on
  72 /// characters.
  73 #[unstable(feature = "unicode", reason = "recently added")]
  74 pub struct ToTitlecase(CaseMappingIter);
  75
  76 #[stable(feature = "unicode_case_mapping", since = "1.2.0")]
  77 impl Iterator for ToTitlecase {
  78     type Item = char;
  79     fn next(&mut self) -> Option<char> { self.0.next() }
  80 }
  81
  82
  83 enum CaseMappingIter {
  84     Three(char, char, char),
  85     Two(char, char),
  86     One(char),
  87     Zero
  88 }
  89
  90 impl CaseMappingIter {
  91     fn new(chars: [char; 3]) -> CaseMappingIter {
  92         if chars[2] == '\0' {
  93             if chars[1] == '\0' {
  94                 CaseMappingIter::One(chars[0])  // Including if chars[0] == '\0'
  95             } else {
  96                 CaseMappingIter::Two(chars[0], chars[1])
  97             }
  98         } else {
  99             CaseMappingIter::Three(chars[0], chars[1], chars[2])
 100         }
 101     }
 102 }
 103
 104 impl Iterator for CaseMappingIter {
 105     type Item = char;
 106     fn next(&mut self) -> Option<char> {
 107         match *self {
 108             CaseMappingIter::Three(a, b, c) => {
 109                 *self = CaseMappingIter::Two(b, c);
 110                 Some(a)
 111             }
 112             CaseMappingIter::Two(b, c) => {
 113                 *self = CaseMappingIter::One(c);
 114                 Some(b)
 115             }
 116             CaseMappingIter::One(c) => {
 117                 *self = CaseMappingIter::Zero;
 118                 Some(c)
 119             }
 120             CaseMappingIter::Zero => None,
 121         }
 122     }
 123 }
 124
 125 #[stable(feature = "rust1", since = "1.0.0")]
 126 #[lang = "char"]
 127 impl char {
 128     /// Checks if a `char` parses as a numeric digit in the given radix.
 129     ///
 130     /// Compared to `is_numeric()`, this function only recognizes the characters
 131     /// `0-9`, `a-z` and `A-Z`.
 132     ///
 133     /// # Return value
 134     ///
 135     /// Returns `true` if `c` is a valid digit under `radix`, and `false`
 136     /// otherwise.
 137     ///
 138     /// # Panics
 139     ///
 140     /// Panics if given a radix > 36.
 141     ///
 142     /// # Examples
 143     ///
 144     /// ```
 145     /// let c = '1';
 146     ///
 147     /// assert!(c.is_digit(10));
 148     ///
 149     /// assert!('f'.is_digit(16));
 150     /// ```
 151     #[stable(feature = "rust1", since = "1.0.0")]
 152     #[inline]
 153     pub fn is_digit(self, radix: u32) -> bool { C::is_digit(self, radix) }
 154
 155     /// Converts a character to the corresponding digit.
 156     ///
 157     /// # Return value
 158     ///
 159     /// If `c` is between '0' and '9', the corresponding value between 0 and
 160     /// 9. If `c` is 'a' or 'A', 10. If `c` is 'b' or 'B', 11, etc. Returns
 161     /// none if the character does not refer to a digit in the given radix.
 162     ///
 163     /// # Panics
 164     ///
 165     /// Panics if given a radix outside the range [0..36].
 166     ///
 167     /// # Examples
 168     ///
 169     /// ```
 170     /// let c = '1';
 171     ///
 172     /// assert_eq!(c.to_digit(10), Some(1));
 173     ///
 174     /// assert_eq!('f'.to_digit(16), Some(15));
 175     /// ```
 176     #[stable(feature = "rust1", since = "1.0.0")]
 177     #[inline]
 178     pub fn to_digit(self, radix: u32) -> Option<u32> { C::to_digit(self, radix) }
 179
 180     /// Returns an iterator that yields the hexadecimal Unicode escape of a
 181     /// character, as `char`s.
 182     ///
 183     /// All characters are escaped with Rust syntax of the form `\\u{NNNN}`
 184     /// where `NNNN` is the shortest hexadecimal representation of the code
 185     /// point.
 186     ///
 187     /// # Examples
 188     ///
 189     /// ```
 190     /// for c in '❤'.escape_unicode() {
 191     ///     print!("{}", c);
 192     /// }
 193     /// println!("");
 194     /// ```
 195     ///
 196     /// This prints:
 197     ///
 198     /// ```text
 199     /// \u{2764}
 200     /// ```
 201     ///
 202     /// Collecting into a `String`:
 203     ///
 204     /// ```
 205     /// let heart: String = '❤'.escape_unicode().collect();
 206     ///
 207     /// assert_eq!(heart, r"\u{2764}");
 208     /// ```
 209     #[stable(feature = "rust1", since = "1.0.0")]
 210     #[inline]
 211     pub fn escape_unicode(self) -> EscapeUnicode { C::escape_unicode(self) }
 212
 213     /// Returns an iterator that yields the 'default' ASCII and
 214     /// C++11-like literal escape of a character, as `char`s.
 215     ///
 216     /// The default is chosen with a bias toward producing literals that are
 217     /// legal in a variety of languages, including C++11 and similar C-family
 218     /// languages. The exact rules are:
 219     ///
 220     /// * Tab, CR and LF are escaped as '\t', '\r' and '\n' respectively.
 221     /// * Single-quote, double-quote and backslash chars are backslash-
 222     ///   escaped.
 223     /// * Any other chars in the range [0x20,0x7e] are not escaped.
 224     /// * Any other chars are given hex Unicode escapes; see `escape_unicode`.
 225     ///
 226     /// # Examples
 227     ///
 228     /// ```
 229     /// for i in '"'.escape_default() {
 230     ///     println!("{}", i);
 231     /// }
 232     /// ```
 233     ///
 234     /// This prints:
 235     ///
 236     /// ```text
 237     /// \
 238     /// "
 239     /// ```
 240     ///
 241     /// Collecting into a `String`:
 242     ///
 243     /// ```
 244     /// let quote: String = '"'.escape_default().collect();
 245     ///
 246     /// assert_eq!(quote, "\\\"");
 247     /// ```
 248     #[stable(feature = "rust1", since = "1.0.0")]
 249     #[inline]
 250     pub fn escape_default(self) -> EscapeDefault { C::escape_default(self) }
 251
 252     /// Returns the number of bytes this character would need if encoded in
 253     /// UTF-8.
 254     ///
 255     /// # Examples
 256     ///
 257     /// ```
 258     /// let n = 'ß'.len_utf8();
 259     ///
 260     /// assert_eq!(n, 2);
 261     /// ```
 262     #[stable(feature = "rust1", since = "1.0.0")]
 263     #[inline]
 264     pub fn len_utf8(self) -> usize { C::len_utf8(self) }
 265
 266     /// Returns the number of 16-bit code units this character would need if
 267     /// encoded in UTF-16.
 268     ///
 269     /// # Examples
 270     ///
 271     /// ```
 272     /// let n = 'ß'.len_utf16();
 273     ///
 274     /// assert_eq!(n, 1);
 275     /// ```
 276     #[stable(feature = "rust1", since = "1.0.0")]
 277     #[inline]
 278     pub fn len_utf16(self) -> usize { C::len_utf16(self) }
 279
 280     /// Encodes this character as UTF-8 into the provided byte buffer, and then
 281     /// returns the number of bytes written.
 282     ///
 283     /// If the buffer is not large enough, nothing will be written into it and a
 284     /// `None` will be returned. A buffer of length four is large enough to
 285     /// encode any `char`.
 286     ///
 287     /// # Examples
 288     ///
 289     /// In both of these examples, 'ß' takes two bytes to encode.
 290     ///
 291     /// ```
 292     /// # #![feature(unicode)]
 293     /// let mut b = [0; 2];
 294     ///
 295     /// let result = 'ß'.encode_utf8(&mut b);
 296     ///
 297     /// assert_eq!(result, Some(2));
 298     /// ```
 299     ///
 300     /// A buffer that's too small:
 301     ///
 302     /// ```
 303     /// # #![feature(unicode)]
 304     /// let mut b = [0; 1];
 305     ///
 306     /// let result = 'ß'.encode_utf8(&mut b);
 307     ///
 308     /// assert_eq!(result, None);
 309     /// ```
 310     #[unstable(feature = "unicode",
 311                reason = "pending decision about Iterator/Writer/Reader")]
 312     #[inline]
 313     pub fn encode_utf8(self, dst: &mut [u8]) -> Option<usize> {
 314         C::encode_utf8(self, dst)
 315     }
 316
 317     /// Encodes this character as UTF-16 into the provided `u16` buffer, and
 318     /// then returns the number of `u16`s written.
 319     ///
 320     /// If the buffer is not large enough, nothing will be written into it and a
 321     /// `None` will be returned. A buffer of length 2 is large enough to encode
 322     /// any `char`.
 323     ///
 324     /// # Examples
 325     ///
 326     /// In both of these examples, 'ß' takes one `u16` to encode.
 327     ///
 328     /// ```
 329     /// # #![feature(unicode)]
 330     /// let mut b = [0; 1];
 331     ///
 332     /// let result = 'ß'.encode_utf16(&mut b);
 333     ///
 334     /// assert_eq!(result, Some(1));
 335     /// ```
 336     ///
 337     /// A buffer that's too small:
 338     ///
 339     /// ```
 340     /// # #![feature(unicode)]
 341     /// let mut b = [0; 0];
 342     ///
 343     /// let result = 'ß'.encode_utf8(&mut b);
 344     ///
 345     /// assert_eq!(result, None);
 346     /// ```
 347     #[unstable(feature = "unicode",
 348                reason = "pending decision about Iterator/Writer/Reader")]
 349     #[inline]
 350     pub fn encode_utf16(self, dst: &mut [u16]) -> Option<usize> {
 351         C::encode_utf16(self, dst)
 352     }
 353
 354     /// Returns whether the specified character is considered a Unicode
 355     /// alphabetic code point.
 356     #[stable(feature = "rust1", since = "1.0.0")]
 357     #[inline]
 358     pub fn is_alphabetic(self) -> bool {
 359         match self {
 360             'a' ... 'z' | 'A' ... 'Z' => true,
 361             c if c > '\x7f' => derived_property::Alphabetic(c),
 362             _ => false
 363         }
 364     }
 365
 366     /// Returns whether the specified character satisfies the 'XID_Start'
 367     /// Unicode property.
 368     ///
 369     /// 'XID_Start' is a Unicode Derived Property specified in
 370     /// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
 371     /// mostly similar to ID_Start but modified for closure under NFKx.
 372     #[unstable(feature = "unicode",
 373                reason = "mainly needed for compiler internals")]
 374     #[inline]
 375     pub fn is_xid_start(self) -> bool { derived_property::XID_Start(self) }
 376
 377     /// Returns whether the specified `char` satisfies the 'XID_Continue'
 378     /// Unicode property.
 379     ///
 380     /// 'XID_Continue' is a Unicode Derived Property specified in
 381     /// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
 382     /// mostly similar to 'ID_Continue' but modified for closure under NFKx.
 383     #[unstable(feature = "unicode",
 384                reason = "mainly needed for compiler internals")]
 385     #[inline]
 386     pub fn is_xid_continue(self) -> bool { derived_property::XID_Continue(self) }
 387
 388     /// Indicates whether a character is in lowercase.
 389     ///
 390     /// This is defined according to the terms of the Unicode Derived Core
 391     /// Property `Lowercase`.
 392     #[stable(feature = "rust1", since = "1.0.0")]
 393     #[inline]
 394     pub fn is_lowercase(self) -> bool {
 395         match self {
 396             'a' ... 'z' => true,
 397             c if c > '\x7f' => derived_property::Lowercase(c),
 398             _ => false
 399         }
 400     }
 401
 402     /// Indicates whether a character is in uppercase.
 403     ///
 404     /// This is defined according to the terms of the Unicode Derived Core
 405     /// Property `Uppercase`.
 406     #[stable(feature = "rust1", since = "1.0.0")]
 407     #[inline]
 408     pub fn is_uppercase(self) -> bool {
 409         match self {
 410             'A' ... 'Z' => true,
 411             c if c > '\x7f' => derived_property::Uppercase(c),
 412             _ => false
 413         }
 414     }
 415
 416     /// Indicates whether a character is whitespace.
 417     ///
 418     /// Whitespace is defined in terms of the Unicode Property `White_Space`.
 419     #[stable(feature = "rust1", since = "1.0.0")]
 420     #[inline]
 421     pub fn is_whitespace(self) -> bool {
 422         match self {
 423             ' ' | '\x09' ... '\x0d' => true,
 424             c if c > '\x7f' => property::White_Space(c),
 425             _ => false
 426         }
 427     }
 428
 429     /// Indicates whether a character is alphanumeric.
 430     ///
 431     /// Alphanumericness is defined in terms of the Unicode General Categories
 432     /// 'Nd', 'Nl', 'No' and the Derived Core Property 'Alphabetic'.
 433     #[stable(feature = "rust1", since = "1.0.0")]
 434     #[inline]
 435     pub fn is_alphanumeric(self) -> bool {
 436         self.is_alphabetic() || self.is_numeric()
 437     }
 438
 439     /// Indicates whether a character is a control code point.
 440     ///
 441     /// Control code points are defined in terms of the Unicode General
 442     /// Category `Cc`.
 443     #[stable(feature = "rust1", since = "1.0.0")]
 444     #[inline]
 445     pub fn is_control(self) -> bool { general_category::Cc(self) }
 446
 447     /// Indicates whether the character is numeric (Nd, Nl, or No).
 448     #[stable(feature = "rust1", since = "1.0.0")]
 449     #[inline]
 450     pub fn is_numeric(self) -> bool {
 451         match self {
 452             '0' ... '9' => true,
 453             c if c > '\x7f' => general_category::N(c),
 454             _ => false
 455         }
 456     }
 457
 458     /// Converts a character to its lowercase equivalent.
 459     ///
 460     /// This performs complex unconditional mappings with no tailoring.
 461     /// See `to_uppercase()` for references and more information.
 462     ///
 463     /// # Return value
 464     ///
 465     /// Returns an iterator which yields the characters corresponding to the
 466     /// lowercase equivalent of the character. If no conversion is possible then
 467     /// an iterator with just the input character is returned.
 468     ///
 469     /// # Examples
 470     ///
 471     /// ```
 472     /// assert_eq!(Some('c'), 'C'.to_lowercase().next());
 473     /// ```
 474     #[stable(feature = "rust1", since = "1.0.0")]
 475     #[inline]
 476     pub fn to_lowercase(self) -> ToLowercase {
 477         ToLowercase(CaseMappingIter::new(conversions::to_lower(self)))
 478     }
 479
 480     /// Converts a character to its titlecase equivalent.
 481     ///
 482     /// This performs complex unconditional mappings with no tailoring.
 483     /// See `to_uppercase()` for references and more information.
 484     ///
 485     /// This differs from `to_uppercase()` since Unicode contains
 486     /// digraphs and ligature characters.
 487     /// For example, U+01F3 “ǳ” and U+FB01 “ﬁ”
 488     /// map to U+01F1 “Ǳ” and U+0046 U+0069 “Fi”, respectively.
 489     ///
 490     /// # Return value
 491     ///
 492     /// Returns an iterator which yields the characters corresponding to the
 493     /// titlecase equivalent of the character. If no conversion is possible then
 494     /// an iterator with just the input character is returned.
 495     #[unstable(feature = "unicode", reason = "recently added")]
 496     #[inline]
 497     pub fn to_titlecase(self) -> ToTitlecase {
 498         ToTitlecase(CaseMappingIter::new(conversions::to_title(self)))
 499     }
 500
 501     /// Converts a character to its uppercase equivalent.
 502     ///
 503     /// This performs complex unconditional mappings with no tailoring:
 504     /// it maps one Unicode character to its uppercase equivalent
 505     /// according to the Unicode database [1]
 506     /// and the additional complex mappings [`SpecialCasing.txt`].
 507     /// Conditional mappings (based on context or language) are not considerd here.
 508     ///
 509     /// A full reference can be found here [2].
 510     ///
 511     /// # Return value
 512     ///
 513     /// Returns an iterator which yields the characters corresponding to the
 514     /// uppercase equivalent of the character. If no conversion is possible then
 515     /// an iterator with just the input character is returned.
 516     ///
 517     /// [1]: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
 518     ///
 519     /// [`SpecialCasing.txt`]: ftp://ftp.unicode.org/Public/UNIDATA/SpecialCasing.txt
 520     ///
 521     /// [2]: http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G33992
 522     ///
 523     /// # Examples
 524     ///
 525     /// ```
 526     /// assert_eq!(Some('C'), 'c'.to_uppercase().next());
 527     /// ```
 528     #[stable(feature = "rust1", since = "1.0.0")]
 529     #[inline]
 530     pub fn to_uppercase(self) -> ToUppercase {
 531         ToUppercase(CaseMappingIter::new(conversions::to_upper(self)))
 532     }
 533
 534     /// Returns this character's displayed width in columns, or `None` if it is a
 535     /// control character other than `'\x00'`.
 536     ///
 537     /// `is_cjk` determines behavior for characters in the Ambiguous category:
 538     /// if `is_cjk` is `true`, these are 2 columns wide; otherwise, they are 1.
 539     /// In CJK contexts, `is_cjk` should be `true`, else it should be `false`.
 540     /// [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
 541     /// recommends that these characters be treated as 1 column (i.e.,
 542     /// `is_cjk` = `false`) if the context cannot be reliably determined.
 543     #[deprecated(reason = "use the crates.io `unicode-width` library instead",
 544                  since = "1.0.0")]
 545     #[unstable(feature = "unicode",
 546                reason = "needs expert opinion. is_cjk flag stands out as ugly")]
 547     #[inline]
 548     pub fn width(self, is_cjk: bool) -> Option<usize> {
 549         charwidth::width(self, is_cjk)
 550     }
 551 }