src/libcore/char.rs

   1 // Copyright 2012 The Rust Project Developers. See the COPYRIGHT
   2 // file at the top-level directory of this distribution and at
   3 // http://rust-lang.org/COPYRIGHT.
   4 //
   5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8 // option. This file may not be copied, modified, or distributed
   9 // except according to those terms.
  10
  11 //! Utilities for manipulating the char type
  12
  13 use option::{None, Option, Some};
  14 use str;
  15 use u32;
  16 use uint;
  17 use unicode;
  18
  19 #[cfg(notest)] use cmp::Eq;
  20
  21 /*
  22     Lu  Uppercase_Letter    an uppercase letter
  23     Ll  Lowercase_Letter    a lowercase letter
  24     Lt  Titlecase_Letter    a digraphic character, with first part uppercase
  25     Lm  Modifier_Letter     a modifier letter
  26     Lo  Other_Letter    other letters, including syllables and ideographs
  27     Mn  Nonspacing_Mark     a nonspacing combining mark (zero advance width)
  28     Mc  Spacing_Mark    a spacing combining mark (positive advance width)
  29     Me  Enclosing_Mark  an enclosing combining mark
  30     Nd  Decimal_Number  a decimal digit
  31     Nl  Letter_Number   a letterlike numeric character
  32     No  Other_Number    a numeric character of other type
  33     Pc  Connector_Punctuation   a connecting punctuation mark, like a tie
  34     Pd  Dash_Punctuation    a dash or hyphen punctuation mark
  35     Ps  Open_Punctuation    an opening punctuation mark (of a pair)
  36     Pe  Close_Punctuation   a closing punctuation mark (of a pair)
  37     Pi  Initial_Punctuation     an initial quotation mark
  38     Pf  Final_Punctuation   a final quotation mark
  39     Po  Other_Punctuation   a punctuation mark of other type
  40     Sm  Math_Symbol     a symbol of primarily mathematical use
  41     Sc  Currency_Symbol     a currency sign
  42     Sk  Modifier_Symbol     a non-letterlike modifier symbol
  43     So  Other_Symbol    a symbol of other type
  44     Zs  Space_Separator     a space character (of various non-zero widths)
  45     Zl  Line_Separator  U+2028 LINE SEPARATOR only
  46     Zp  Paragraph_Separator     U+2029 PARAGRAPH SEPARATOR only
  47     Cc  Control     a C0 or C1 control code
  48     Cf  Format  a format control character
  49     Cs  Surrogate   a surrogate code point
  50     Co  Private_Use     a private-use character
  51     Cn  Unassigned  a reserved unassigned code point or a noncharacter
  52 */
  53
  54 pub use is_alphabetic = unicode::derived_property::Alphabetic;
  55 pub use is_XID_start = unicode::derived_property::XID_Start;
  56 pub use is_XID_continue = unicode::derived_property::XID_Continue;
  57
  58
  59 /**
  60  * Indicates whether a character is in lower case, defined
  61  * in terms of the Unicode General Category 'Ll'
  62  */
  63 #[inline(always)]
  64 pub fn is_lowercase(c: char) -> bool {
  65     return unicode::general_category::Ll(c);
  66 }
  67
  68 /**
  69  * Indicates whether a character is in upper case, defined
  70  * in terms of the Unicode General Category 'Lu'.
  71  */
  72 #[inline(always)]
  73 pub fn is_uppercase(c: char) -> bool {
  74     return unicode::general_category::Lu(c);
  75 }
  76
  77 /**
  78  * Indicates whether a character is whitespace. Whitespace is defined in
  79  * terms of the Unicode General Categories 'Zs', 'Zl', 'Zp'
  80  * additional 'Cc'-category control codes in the range [0x09, 0x0d]
  81  */
  82 #[inline(always)]
  83 pub fn is_whitespace(c: char) -> bool {
  84     return ('\x09' <= c && c <= '\x0d')
  85         || unicode::general_category::Zs(c)
  86         || unicode::general_category::Zl(c)
  87         || unicode::general_category::Zp(c);
  88 }
  89
  90 /**
  91  * Indicates whether a character is alphanumeric. Alphanumericness is
  92  * defined in terms of the Unicode General Categories 'Nd', 'Nl', 'No'
  93  * and the Derived Core Property 'Alphabetic'.
  94  */
  95 #[inline(always)]
  96 pub fn is_alphanumeric(c: char) -> bool {
  97     return unicode::derived_property::Alphabetic(c) ||
  98         unicode::general_category::Nd(c) ||
  99         unicode::general_category::Nl(c) ||
 100         unicode::general_category::No(c);
 101 }
 102
 103 /// Indicates whether the character is an ASCII character
 104 #[inline(always)]
 105 pub fn is_ascii(c: char) -> bool {
 106    c - ('\x7F' & c) == '\x00'
 107 }
 108
 109 /// Indicates whether the character is numeric (Nd, Nl, or No)
 110 #[inline(always)]
 111 pub fn is_digit(c: char) -> bool {
 112     return unicode::general_category::Nd(c) ||
 113         unicode::general_category::Nl(c) ||
 114         unicode::general_category::No(c);
 115 }
 116
 117 /**
 118  * Checks if a character parses as a numeric digit in the given radix.
 119  * Compared to `is_digit()`, this function only recognizes the ascii
 120  * characters `0-9`, `a-z` and `A-Z`.
 121  *
 122  * Returns `true` if `c` is a valid digit under `radix`, and `false`
 123  * otherwise.
 124  *
 125  * Fails if given a `radix` > 36.
 126  *
 127  * Note: This just wraps `to_digit()`.
 128  */
 129 #[inline(always)]
 130 pub fn is_digit_radix(c: char, radix: uint) -> bool {
 131     match to_digit(c, radix) {
 132         Some(_) => true,
 133         None    => false
 134     }
 135 }
 136
 137 /**
 138  * Convert a char to the corresponding digit.
 139  *
 140  * # Return value
 141  *
 142  * If `c` is between '0' and '9', the corresponding value
 143  * between 0 and 9. If `c` is 'a' or 'A', 10. If `c` is
 144  * 'b' or 'B', 11, etc. Returns none if the char does not
 145  * refer to a digit in the given radix.
 146  *
 147  * # Failure
 148  * Fails if given a `radix` outside the range `[0..36]`.
 149  */
 150 #[inline]
 151 pub fn to_digit(c: char, radix: uint) -> Option<uint> {
 152     if radix > 36 {
 153         fail!(fmt!("to_digit: radix %? is to high (maximum 36)", radix));
 154     }
 155     let val = match c {
 156       '0' .. '9' => c as uint - ('0' as uint),
 157       'a' .. 'z' => c as uint + 10u - ('a' as uint),
 158       'A' .. 'Z' => c as uint + 10u - ('A' as uint),
 159       _ => return None
 160     };
 161     if val < radix { Some(val) }
 162     else { None }
 163 }
 164
 165 /**
 166  * Converts a number to the ascii character representing it.
 167  *
 168  * Returns `Some(char)` if `num` represents one digit under `radix`,
 169  * using one character of `0-9` or `a-z`, or `None` if it doesn't.
 170  *
 171  * Fails if given an `radix` > 36.
 172  */
 173 #[inline]
 174 pub fn from_digit(num: uint, radix: uint) -> Option<char> {
 175     if radix > 36 {
 176         fail!(fmt!("from_digit: radix %? is to high (maximum 36)", num));
 177     }
 178     if num < radix {
 179         if num < 10 {
 180             Some(('0' as uint + num) as char)
 181         } else {
 182             Some(('a' as uint + num - 10u) as char)
 183         }
 184     } else {
 185         None
 186     }
 187 }
 188
 189 /**
 190  * Return the hexadecimal unicode escape of a char.
 191  *
 192  * The rules are as follows:
 193  *
 194  *   - chars in [0,0xff] get 2-digit escapes: `\\xNN`
 195  *   - chars in [0x100,0xffff] get 4-digit escapes: `\\uNNNN`
 196  *   - chars above 0x10000 get 8-digit escapes: `\\UNNNNNNNN`
 197  */
 198 pub fn escape_unicode(c: char) -> ~str {
 199     let s = u32::to_str_radix(c as u32, 16u);
 200     let (c, pad) = (if c <= '\xff' { ('x', 2u) }
 201                     else if c <= '\uffff' { ('u', 4u) }
 202                     else { ('U', 8u) });
 203     assert!(str::len(s) <= pad);
 204     let mut out = ~"\\";
 205     unsafe {
 206         str::push_str(&mut out, str::from_char(c));
 207         for uint::range(str::len(s), pad) |_i|
 208             { str::push_str(&mut out, ~"0"); }
 209         str::push_str(&mut out, s);
 210     }
 211     out
 212 }
 213
 214 /**
 215  * Return a 'default' ASCII and C++11-like char-literal escape of a char.
 216  *
 217  * The default is chosen with a bias toward producing literals that are
 218  * legal in a variety of languages, including C++11 and similar C-family
 219  * languages. The exact rules are:
 220  *
 221  *   - Tab, CR and LF are escaped as '\t', '\r' and '\n' respectively.
 222  *   - Single-quote, double-quote and backslash chars are backslash-escaped.
 223  *   - Any other chars in the range [0x20,0x7e] are not escaped.
 224  *   - Any other chars are given hex unicode escapes; see `escape_unicode`.
 225  */
 226 pub fn escape_default(c: char) -> ~str {
 227     match c {
 228       '\t' => ~"\\t",
 229       '\r' => ~"\\r",
 230       '\n' => ~"\\n",
 231       '\\' => ~"\\\\",
 232       '\'' => ~"\\'",
 233       '"'  => ~"\\\"",
 234       '\x20' .. '\x7e' => str::from_char(c),
 235       _ => escape_unicode(c)
 236     }
 237 }
 238
 239 /**
 240  * Compare two chars
 241  *
 242  * # Return value
 243  *
 244  * -1 if a < b, 0 if a == b, +1 if a > b
 245  */
 246 #[inline(always)]
 247 pub fn cmp(a: char, b: char) -> int {
 248     return  if b > a { -1 }
 249     else if b < a { 1 }
 250     else { 0 }
 251 }
 252
 253 #[cfg(notest)]
 254 impl Eq for char {
 255     fn eq(&self, other: &char) -> bool { (*self) == (*other) }
 256     fn ne(&self, other: &char) -> bool { (*self) != (*other) }
 257 }
 258
 259 #[test]
 260 fn test_is_lowercase() {
 261     assert!(is_lowercase('a'));
 262     assert!(is_lowercase('ö'));
 263     assert!(is_lowercase('ß'));
 264     assert!(!is_lowercase('Ü'));
 265     assert!(!is_lowercase('P'));
 266 }
 267
 268 #[test]
 269 fn test_is_uppercase() {
 270     assert!(!is_uppercase('h'));
 271     assert!(!is_uppercase('ä'));
 272     assert!(!is_uppercase('ß'));
 273     assert!(is_uppercase('Ö'));
 274     assert!(is_uppercase('T'));
 275 }
 276
 277 #[test]
 278 fn test_is_whitespace() {
 279     assert!(is_whitespace(' '));
 280     assert!(is_whitespace('\u2007'));
 281     assert!(is_whitespace('\t'));
 282     assert!(is_whitespace('\n'));
 283
 284     assert!(!is_whitespace('a'));
 285     assert!(!is_whitespace('_'));
 286     assert!(!is_whitespace('\u0000'));
 287 }
 288
 289 #[test]
 290 fn test_to_digit() {
 291     assert_eq!(to_digit('0', 10u), Some(0u));
 292     assert_eq!(to_digit('1', 2u), Some(1u));
 293     assert_eq!(to_digit('2', 3u), Some(2u));
 294     assert_eq!(to_digit('9', 10u), Some(9u));
 295     assert_eq!(to_digit('a', 16u), Some(10u));
 296     assert_eq!(to_digit('A', 16u), Some(10u));
 297     assert_eq!(to_digit('b', 16u), Some(11u));
 298     assert_eq!(to_digit('B', 16u), Some(11u));
 299     assert_eq!(to_digit('z', 36u), Some(35u));
 300     assert_eq!(to_digit('Z', 36u), Some(35u));
 301
 302     assert!(to_digit(' ', 10u).is_none());
 303     assert!(to_digit('$', 36u).is_none());
 304 }
 305
 306 #[test]
 307 fn test_is_ascii() {
 308    assert!(str::all(~"banana", is_ascii));
 309    assert!(! str::all(~"ประเทศไทย中华Việt Nam", is_ascii));
 310 }
 311
 312 #[test]
 313 fn test_is_digit() {
 314    assert!(is_digit('2'));
 315    assert!(is_digit('7'));
 316    assert!(! is_digit('c'));
 317    assert!(! is_digit('i'));
 318    assert!(! is_digit('z'));
 319    assert!(! is_digit('Q'));
 320 }
 321
 322 #[test]
 323 fn test_escape_default() {
 324     assert_eq!(escape_default('\n'), ~"\\n");
 325     assert_eq!(escape_default('\r'), ~"\\r");
 326     assert_eq!(escape_default('\''), ~"\\'");
 327     assert_eq!(escape_default('"'), ~"\\\"");
 328     assert_eq!(escape_default(' '), ~" ");
 329     assert_eq!(escape_default('a'), ~"a");
 330     assert_eq!(escape_default('~'), ~"~");
 331     assert_eq!(escape_default('\x00'), ~"\\x00");
 332     assert_eq!(escape_default('\x1f'), ~"\\x1f");
 333     assert_eq!(escape_default('\x7f'), ~"\\x7f");
 334     assert_eq!(escape_default('\xff'), ~"\\xff");
 335     assert_eq!(escape_default('\u011b'), ~"\\u011b");
 336     assert_eq!(escape_default('\U0001d4b6'), ~"\\U0001d4b6");
 337 }
 338
 339
 340 #[test]
 341 fn test_escape_unicode() {
 342     assert_eq!(escape_unicode('\x00'), ~"\\x00");
 343     assert_eq!(escape_unicode('\n'), ~"\\x0a");
 344     assert_eq!(escape_unicode(' '), ~"\\x20");
 345     assert_eq!(escape_unicode('a'), ~"\\x61");
 346     assert_eq!(escape_unicode('\u011b'), ~"\\u011b");
 347     assert_eq!(escape_unicode('\U0001d4b6'), ~"\\U0001d4b6");
 348 }