1 // Copyright 2012 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
11 //! Utilities for manipulating the char type
13 use option
::{None, Option, Some}
;
19 #[cfg(notest)] use cmp::Eq;
22 Lu Uppercase_Letter an uppercase letter
23 Ll Lowercase_Letter a lowercase letter
24 Lt Titlecase_Letter a digraphic character, with first part uppercase
25 Lm Modifier_Letter a modifier letter
26 Lo Other_Letter other letters, including syllables and ideographs
27 Mn Nonspacing_Mark a nonspacing combining mark (zero advance width)
28 Mc Spacing_Mark a spacing combining mark (positive advance width)
29 Me Enclosing_Mark an enclosing combining mark
30 Nd Decimal_Number a decimal digit
31 Nl Letter_Number a letterlike numeric character
32 No Other_Number a numeric character of other type
33 Pc Connector_Punctuation a connecting punctuation mark, like a tie
34 Pd Dash_Punctuation a dash or hyphen punctuation mark
35 Ps Open_Punctuation an opening punctuation mark (of a pair)
36 Pe Close_Punctuation a closing punctuation mark (of a pair)
37 Pi Initial_Punctuation an initial quotation mark
38 Pf Final_Punctuation a final quotation mark
39 Po Other_Punctuation a punctuation mark of other type
40 Sm Math_Symbol a symbol of primarily mathematical use
41 Sc Currency_Symbol a currency sign
42 Sk Modifier_Symbol a non-letterlike modifier symbol
43 So Other_Symbol a symbol of other type
44 Zs Space_Separator a space character (of various non-zero widths)
45 Zl Line_Separator U+2028 LINE SEPARATOR only
46 Zp Paragraph_Separator U+2029 PARAGRAPH SEPARATOR only
47 Cc Control a C0 or C1 control code
48 Cf Format a format control character
49 Cs Surrogate a surrogate code point
50 Co Private_Use a private-use character
51 Cn Unassigned a reserved unassigned code point or a noncharacter
54 pub use is_alphabetic
= unicode
::derived_property
::Alphabetic
;
55 pub use is_XID_start
= unicode
::derived_property
::XID_Start
;
56 pub use is_XID_continue
= unicode
::derived_property
::XID_Continue
;
60 * Indicates whether a character is in lower case, defined
61 * in terms of the Unicode General Category 'Ll'
64 pub fn is_lowercase(c
: char) -> bool
{
65 return unicode
::general_category
::Ll(c
);
69 * Indicates whether a character is in upper case, defined
70 * in terms of the Unicode General Category 'Lu'.
73 pub fn is_uppercase(c
: char) -> bool
{
74 return unicode
::general_category
::Lu(c
);
78 * Indicates whether a character is whitespace. Whitespace is defined in
79 * terms of the Unicode General Categories 'Zs', 'Zl', 'Zp'
80 * additional 'Cc'-category control codes in the range [0x09, 0x0d]
83 pub fn is_whitespace(c
: char) -> bool
{
84 return ('
\x09'
<= c
&& c
<= '
\x0d'
)
85 || unicode
::general_category
::Zs(c
)
86 || unicode
::general_category
::Zl(c
)
87 || unicode
::general_category
::Zp(c
);
91 * Indicates whether a character is alphanumeric. Alphanumericness is
92 * defined in terms of the Unicode General Categories 'Nd', 'Nl', 'No'
93 * and the Derived Core Property 'Alphabetic'.
96 pub fn is_alphanumeric(c
: char) -> bool
{
97 return unicode
::derived_property
::Alphabetic(c
) ||
98 unicode
::general_category
::Nd(c
) ||
99 unicode
::general_category
::Nl(c
) ||
100 unicode
::general_category
::No(c
);
/// Indicates whether the character is an ASCII character, i.e. whether
/// its code point is no greater than 0x7f.
pub fn is_ascii(c: char) -> bool {
    // A direct comparison says the same thing as the former
    // subtract-and-mask trick, without doing arithmetic on chars.
    c <= '\x7f'
}
109 /// Indicates whether the character is numeric (Nd, Nl, or No)
111 pub fn is_digit(c
: char) -> bool
{
112 return unicode
::general_category
::Nd(c
) ||
113 unicode
::general_category
::Nl(c
) ||
114 unicode
::general_category
::No(c
);
118 * Checks if a character parses as a numeric digit in the given radix.
119 * Compared to `is_digit()`, this function only recognizes the ascii
120 * characters `0-9`, `a-z` and `A-Z`.
122 * Returns `true` if `c` is a valid digit under `radix`, and `false`
125 * Fails if given a `radix` > 36.
127 * Note: This just wraps `to_digit()`.
130 pub fn is_digit_radix(c
: char, radix
: uint
) -> bool
{
131 match to_digit(c
, radix
) {
138 * Convert a char to the corresponding digit.
142 * If `c` is between '0' and '9', the corresponding value
143 * between 0 and 9. If `c` is 'a' or 'A', 10. If `c` is
144 * 'b' or 'B', 11, etc. Returns none if the char does not
145 * refer to a digit in the given radix.
148 * Fails if given a `radix` outside the range `[0..36]`.
151 pub fn to_digit(c
: char, radix
: uint
) -> Option
<uint
> {
153 fail
!(fmt
!("to_digit: radix %? is to high (maximum 36)", radix
));
156 '
0'
.. '
9'
=> c
as uint
- ('
0'
as uint
),
157 'a'
.. 'z'
=> c
as uint
+ 10u - ('a'
as uint
),
158 'A'
.. 'Z'
=> c
as uint
+ 10u - ('A'
as uint
),
161 if val
< radix { Some(val) }
166 * Converts a number to the ascii character representing it.
168 * Returns `Some(char)` if `num` represents one digit under `radix`,
169 * using one character of `0-9` or `a-z`, or `None` if it doesn't.
171 * Fails if given an `radix` > 36.
174 pub fn from_digit(num
: uint
, radix
: uint
) -> Option
<char> {
176 fail
!(fmt
!("from_digit: radix %? is to high (maximum 36)", num
));
180 Some(('
0'
as uint
+ num
) as char)
182 Some(('a'
as uint
+ num
- 10u) as char)
190 * Return the hexadecimal unicode escape of a char.
192 * The rules are as follows:
194 * - chars in [0,0xff] get 2-digit escapes: `\\xNN`
195 * - chars in [0x100,0xffff] get 4-digit escapes: `\\uNNNN`
196 * - chars above 0x10000 get 8-digit escapes: `\\UNNNNNNNN`
198 pub fn escape_unicode(c
: char) -> ~str {
199 let s
= u32::to_str_radix(c
as u32, 16u);
200 let (c
, pad
) = (if c
<= '
\xff' { ('x', 2u) }
201 else if c
<= '
\uffff' { ('u', 4u) }
203 assert
!(str::len(s
) <= pad
);
206 str::push_str(&mut out
, str::from_char(c
));
207 for uint
::range(str::len(s
), pad
) |_i
|
208 { str::push_str(&mut out, ~"0"); }
209 str::push_str(&mut out
, s
);
215 * Return a 'default' ASCII and C++11-like char-literal escape of a char.
217 * The default is chosen with a bias toward producing literals that are
218 * legal in a variety of languages, including C++11 and similar C-family
219 * languages. The exact rules are:
221 * - Tab, CR and LF are escaped as '\t', '\r' and '\n' respectively.
222 * - Single-quote, double-quote and backslash chars are backslash-escaped.
223 * - Any other chars in the range [0x20,0x7e] are not escaped.
224 * - Any other chars are given hex unicode escapes; see `escape_unicode`.
226 pub fn escape_default(c
: char) -> ~str {
234 '\x20' .. '\x7e' => str::from_char(c),
235 _ => escape_unicode(c)
244 * -1 if a < b, 0 if a == b, +1 if a > b
247 pub fn cmp(a: char, b: char) -> int {
248 return if b > a { -1 }
255 fn eq(&self, other: &char) -> bool { (*self) == (*other) }
256 fn ne(&self, other: &char) -> bool { (*self) != (*other) }
260 fn test_is_lowercase() {
261 assert!(is_lowercase('a'));
262 assert!(is_lowercase('ö'));
263 assert!(is_lowercase('ß'));
264 assert!(!is_lowercase('Ü'));
265 assert!(!is_lowercase('P'));
269 fn test_is_uppercase() {
270 assert!(!is_uppercase('h'));
271 assert!(!is_uppercase('ä'));
272 assert!(!is_uppercase('ß'));
273 assert!(is_uppercase('Ö'));
274 assert!(is_uppercase('T'));
278 fn test_is_whitespace() {
279 assert!(is_whitespace(' '));
280 assert!(is_whitespace('\u2007'));
281 assert!(is_whitespace('\t'));
282 assert!(is_whitespace('\n'));
284 assert!(!is_whitespace('a'));
285 assert!(!is_whitespace('_'));
286 assert!(!is_whitespace('\u0000'));
291 assert_eq!(to_digit('0', 10u), Some(0u));
292 assert_eq!(to_digit('1', 2u), Some(1u));
293 assert_eq!(to_digit('2', 3u), Some(2u));
294 assert_eq!(to_digit('9', 10u), Some(9u));
295 assert_eq!(to_digit('a', 16u), Some(10u));
296 assert_eq!(to_digit('A', 16u), Some(10u));
297 assert_eq!(to_digit('b', 16u), Some(11u));
298 assert_eq!(to_digit('B', 16u), Some(11u));
299 assert_eq!(to_digit('z', 36u), Some(35u));
300 assert_eq!(to_digit('Z', 36u), Some(35u));
302 assert!(to_digit(' ', 10u).is_none());
303 assert!(to_digit('$', 36u).is_none());
308 assert!(str::all(~"banana
", is_ascii));
309 assert!(! str::all(~"ประเทศไทย中华Việt Nam
", is_ascii));
314 assert!(is_digit('2'));
315 assert!(is_digit('7'));
316 assert!(! is_digit('c'));
317 assert!(! is_digit('i'));
318 assert!(! is_digit('z'));
319 assert!(! is_digit('Q'));
323 fn test_escape_default() {
324 assert_eq!(escape_default('\n'), ~"\\n
");
325 assert_eq!(escape_default('\r'), ~"\\r
");
326 assert_eq!(escape_default('\''), ~"\\'
");
327 assert_eq!(escape_default('"'
), ~"\\\"");
328 assert_eq
!(escape_default(' '
), ~" ");
329 assert_eq
!(escape_default('a'
), ~"a");
330 assert_eq
!(escape_default('
~'
), ~"~");
331 assert_eq
!(escape_default('
\x00'
), ~"\\x00");
332 assert_eq
!(escape_default('
\x1f'
), ~"\\x1f");
333 assert_eq
!(escape_default('
\x7f'
), ~"\\x7f");
334 assert_eq
!(escape_default('
\xff'
), ~"\\xff");
335 assert_eq
!(escape_default('
\u011b'
), ~"\\u011b");
336 assert_eq
!(escape_default('
\U0001d4b6'
), ~"\\U0001d4b6");
341 fn test_escape_unicode() {
342 assert_eq
!(escape_unicode('
\x00'
), ~"\\x00");
343 assert_eq
!(escape_unicode('
\n'
), ~"\\x0a");
344 assert_eq
!(escape_unicode(' '
), ~"\\x20");
345 assert_eq
!(escape_unicode('a'
), ~"\\x61");
346 assert_eq
!(escape_unicode('
\u011b'
), ~"\\u011b");
347 assert_eq
!(escape_unicode('
\U0001d4b6'
), ~"\\U0001d4b6");