]> git.proxmox.com Git - rustc.git/blob - src/libcore/char.rs
Merge tag 'debian/0.6-0_exp1'
[rustc.git] / src / libcore / char.rs
1 // Copyright 2012 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10
11 //! Utilities for manipulating the char type
12
13 use option::{None, Option, Some};
14 use str;
15 use u32;
16 use uint;
17 use unicode;
18
19 #[cfg(notest)] use cmp::Eq;
20
21 /*
22 Lu Uppercase_Letter an uppercase letter
23 Ll Lowercase_Letter a lowercase letter
24 Lt Titlecase_Letter a digraphic character, with first part uppercase
25 Lm Modifier_Letter a modifier letter
26 Lo Other_Letter other letters, including syllables and ideographs
27 Mn Nonspacing_Mark a nonspacing combining mark (zero advance width)
28 Mc Spacing_Mark a spacing combining mark (positive advance width)
29 Me Enclosing_Mark an enclosing combining mark
30 Nd Decimal_Number a decimal digit
31 Nl Letter_Number a letterlike numeric character
32 No Other_Number a numeric character of other type
33 Pc Connector_Punctuation a connecting punctuation mark, like a tie
34 Pd Dash_Punctuation a dash or hyphen punctuation mark
35 Ps Open_Punctuation an opening punctuation mark (of a pair)
36 Pe Close_Punctuation a closing punctuation mark (of a pair)
37 Pi Initial_Punctuation an initial quotation mark
38 Pf Final_Punctuation a final quotation mark
39 Po Other_Punctuation a punctuation mark of other type
40 Sm Math_Symbol a symbol of primarily mathematical use
41 Sc Currency_Symbol a currency sign
42 Sk Modifier_Symbol a non-letterlike modifier symbol
43 So Other_Symbol a symbol of other type
44 Zs Space_Separator a space character (of various non-zero widths)
45 Zl Line_Separator U+2028 LINE SEPARATOR only
46 Zp Paragraph_Separator U+2029 PARAGRAPH SEPARATOR only
47 Cc Control a C0 or C1 control code
48 Cf Format a format control character
49 Cs Surrogate a surrogate code point
50 Co Private_Use a private-use character
51 Cn Unassigned a reserved unassigned code point or a noncharacter
52 */
53
54 pub use is_alphabetic = unicode::derived_property::Alphabetic;
55 pub use is_XID_start = unicode::derived_property::XID_Start;
56 pub use is_XID_continue = unicode::derived_property::XID_Continue;
57
58
59 /**
60 * Indicates whether a character is in lower case, defined
61 * in terms of the Unicode General Category 'Ll'
62 */
63 #[inline(always)]
64 pub fn is_lowercase(c: char) -> bool {
65 return unicode::general_category::Ll(c);
66 }
67
68 /**
69 * Indicates whether a character is in upper case, defined
70 * in terms of the Unicode General Category 'Lu'.
71 */
72 #[inline(always)]
73 pub fn is_uppercase(c: char) -> bool {
74 return unicode::general_category::Lu(c);
75 }
76
77 /**
78 * Indicates whether a character is whitespace. Whitespace is defined in
79 * terms of the Unicode General Categories 'Zs', 'Zl', 'Zp'
80 * additional 'Cc'-category control codes in the range [0x09, 0x0d]
81 */
82 #[inline(always)]
83 pub fn is_whitespace(c: char) -> bool {
84 return ('\x09' <= c && c <= '\x0d')
85 || unicode::general_category::Zs(c)
86 || unicode::general_category::Zl(c)
87 || unicode::general_category::Zp(c);
88 }
89
90 /**
91 * Indicates whether a character is alphanumeric. Alphanumericness is
92 * defined in terms of the Unicode General Categories 'Nd', 'Nl', 'No'
93 * and the Derived Core Property 'Alphabetic'.
94 */
95 #[inline(always)]
96 pub fn is_alphanumeric(c: char) -> bool {
97 return unicode::derived_property::Alphabetic(c) ||
98 unicode::general_category::Nd(c) ||
99 unicode::general_category::Nl(c) ||
100 unicode::general_category::No(c);
101 }
102
/// Returns `true` if `c` is an ASCII character, i.e. its code point
/// is no greater than 0x7f
#[inline(always)]
pub fn is_ascii(c: char) -> bool {
    c <= '\x7f'
}
108
109 /// Indicates whether the character is numeric (Nd, Nl, or No)
110 #[inline(always)]
111 pub fn is_digit(c: char) -> bool {
112 return unicode::general_category::Nd(c) ||
113 unicode::general_category::Nl(c) ||
114 unicode::general_category::No(c);
115 }
116
117 /**
118 * Checks if a character parses as a numeric digit in the given radix.
119 * Compared to `is_digit()`, this function only recognizes the ascii
120 * characters `0-9`, `a-z` and `A-Z`.
121 *
122 * Returns `true` if `c` is a valid digit under `radix`, and `false`
123 * otherwise.
124 *
125 * Fails if given a `radix` > 36.
126 *
127 * Note: This just wraps `to_digit()`.
128 */
129 #[inline(always)]
130 pub fn is_digit_radix(c: char, radix: uint) -> bool {
131 match to_digit(c, radix) {
132 Some(_) => true,
133 None => false
134 }
135 }
136
137 /**
138 * Convert a char to the corresponding digit.
139 *
140 * # Return value
141 *
142 * If `c` is between '0' and '9', the corresponding value
143 * between 0 and 9. If `c` is 'a' or 'A', 10. If `c` is
144 * 'b' or 'B', 11, etc. Returns none if the char does not
145 * refer to a digit in the given radix.
146 *
147 * # Failure
148 * Fails if given a `radix` outside the range `[0..36]`.
149 */
150 #[inline]
151 pub fn to_digit(c: char, radix: uint) -> Option<uint> {
152 if radix > 36 {
153 fail!(fmt!("to_digit: radix %? is to high (maximum 36)", radix));
154 }
155 let val = match c {
156 '0' .. '9' => c as uint - ('0' as uint),
157 'a' .. 'z' => c as uint + 10u - ('a' as uint),
158 'A' .. 'Z' => c as uint + 10u - ('A' as uint),
159 _ => return None
160 };
161 if val < radix { Some(val) }
162 else { None }
163 }
164
165 /**
166 * Converts a number to the ascii character representing it.
167 *
168 * Returns `Some(char)` if `num` represents one digit under `radix`,
169 * using one character of `0-9` or `a-z`, or `None` if it doesn't.
170 *
171 * Fails if given an `radix` > 36.
172 */
173 #[inline]
174 pub fn from_digit(num: uint, radix: uint) -> Option<char> {
175 if radix > 36 {
176 fail!(fmt!("from_digit: radix %? is to high (maximum 36)", num));
177 }
178 if num < radix {
179 if num < 10 {
180 Some(('0' as uint + num) as char)
181 } else {
182 Some(('a' as uint + num - 10u) as char)
183 }
184 } else {
185 None
186 }
187 }
188
189 /**
190 * Return the hexadecimal unicode escape of a char.
191 *
192 * The rules are as follows:
193 *
194 * - chars in [0,0xff] get 2-digit escapes: `\\xNN`
195 * - chars in [0x100,0xffff] get 4-digit escapes: `\\uNNNN`
196 * - chars above 0x10000 get 8-digit escapes: `\\UNNNNNNNN`
197 */
198 pub fn escape_unicode(c: char) -> ~str {
199 let s = u32::to_str_radix(c as u32, 16u);
200 let (c, pad) = (if c <= '\xff' { ('x', 2u) }
201 else if c <= '\uffff' { ('u', 4u) }
202 else { ('U', 8u) });
203 assert!(str::len(s) <= pad);
204 let mut out = ~"\\";
205 unsafe {
206 str::push_str(&mut out, str::from_char(c));
207 for uint::range(str::len(s), pad) |_i|
208 { str::push_str(&mut out, ~"0"); }
209 str::push_str(&mut out, s);
210 }
211 out
212 }
213
214 /**
215 * Return a 'default' ASCII and C++11-like char-literal escape of a char.
216 *
217 * The default is chosen with a bias toward producing literals that are
218 * legal in a variety of languages, including C++11 and similar C-family
219 * languages. The exact rules are:
220 *
221 * - Tab, CR and LF are escaped as '\t', '\r' and '\n' respectively.
222 * - Single-quote, double-quote and backslash chars are backslash-escaped.
223 * - Any other chars in the range [0x20,0x7e] are not escaped.
224 * - Any other chars are given hex unicode escapes; see `escape_unicode`.
225 */
226 pub fn escape_default(c: char) -> ~str {
227 match c {
228 '\t' => ~"\\t",
229 '\r' => ~"\\r",
230 '\n' => ~"\\n",
231 '\\' => ~"\\\\",
232 '\'' => ~"\\'",
233 '"' => ~"\\\"",
234 '\x20' .. '\x7e' => str::from_char(c),
235 _ => escape_unicode(c)
236 }
237 }
238
239 /**
240 * Compare two chars
241 *
242 * # Return value
243 *
244 * -1 if a < b, 0 if a == b, +1 if a > b
245 */
246 #[inline(always)]
247 pub fn cmp(a: char, b: char) -> int {
248 return if b > a { -1 }
249 else if b < a { 1 }
250 else { 0 }
251 }
252
// Equality on `char` simply forwards to the built-in comparison operators.
#[cfg(notest)]
impl Eq for char {
    fn eq(&self, other: &char) -> bool {
        *self == *other
    }
    fn ne(&self, other: &char) -> bool {
        *self != *other
    }
}
258
#[test]
fn test_is_lowercase() {
    // Uppercase letters must be rejected.
    assert!(!is_lowercase('P'));
    assert!(!is_lowercase('Ü'));

    // Lowercase letters, ASCII and non-ASCII alike, must be accepted.
    assert!(is_lowercase('a'));
    assert!(is_lowercase('ß'));
    assert!(is_lowercase('ö'));
}
267
#[test]
fn test_is_uppercase() {
    // Uppercase letters, ASCII and non-ASCII alike, must be accepted.
    assert!(is_uppercase('T'));
    assert!(is_uppercase('Ö'));

    // Lowercase letters must be rejected.
    assert!(!is_uppercase('h'));
    assert!(!is_uppercase('ß'));
    assert!(!is_uppercase('ä'));
}
276
#[test]
fn test_is_whitespace() {
    assert!(is_whitespace(' '));
    assert!(is_whitespace('\u2007'));
    assert!(is_whitespace('\t'));
    assert!(is_whitespace('\n'));

    // Every control code in [0x09, 0x0d] counts as whitespace...
    assert!(is_whitespace('\x0b'));
    assert!(is_whitespace('\x0c'));
    assert!(is_whitespace('\r'));
    // ...but the control codes on either side of that range do not.
    assert!(!is_whitespace('\x08'));
    assert!(!is_whitespace('\x0e'));

    // 'Zl' and 'Zp' each consist of a single separator character.
    assert!(is_whitespace('\u2028'));
    assert!(is_whitespace('\u2029'));

    assert!(!is_whitespace('a'));
    assert!(!is_whitespace('_'));
    assert!(!is_whitespace('\u0000'));
}
288
#[test]
fn test_to_digit() {
    assert_eq!(to_digit('0', 10u), Some(0u));
    assert_eq!(to_digit('1', 2u), Some(1u));
    assert_eq!(to_digit('2', 3u), Some(2u));
    assert_eq!(to_digit('9', 10u), Some(9u));
    assert_eq!(to_digit('a', 16u), Some(10u));
    assert_eq!(to_digit('A', 16u), Some(10u));
    assert_eq!(to_digit('b', 16u), Some(11u));
    assert_eq!(to_digit('B', 16u), Some(11u));
    assert_eq!(to_digit('z', 36u), Some(35u));
    assert_eq!(to_digit('Z', 36u), Some(35u));

    // Characters that are not ASCII digits or letters are never digits.
    assert!(to_digit(' ', 10u).is_none());
    assert!(to_digit('$', 36u).is_none());

    // Valid digit characters whose value does not fit in the radix.
    assert!(to_digit('2', 2u).is_none());
    assert!(to_digit('8', 8u).is_none());
    assert!(to_digit('a', 10u).is_none());
    assert!(to_digit('g', 16u).is_none());
    assert!(to_digit('G', 16u).is_none());
}
305
306 #[test]
307 fn test_is_ascii() {
308 assert!(str::all(~"banana", is_ascii));
309 assert!(! str::all(~"ประเทศไทย中华Việt Nam", is_ascii));
310 }
311
312 #[test]
313 fn test_is_digit() {
314 assert!(is_digit('2'));
315 assert!(is_digit('7'));
316 assert!(! is_digit('c'));
317 assert!(! is_digit('i'));
318 assert!(! is_digit('z'));
319 assert!(! is_digit('Q'));
320 }
321
#[test]
fn test_escape_default() {
    // Short escapes for the common control characters.
    assert_eq!(escape_default('\n'), ~"\\n");
    assert_eq!(escape_default('\r'), ~"\\r");
    assert_eq!(escape_default('\t'), ~"\\t");

    // Quotes and backslash are backslash-escaped.
    assert_eq!(escape_default('\''), ~"\\'");
    assert_eq!(escape_default('"'), ~"\\\"");
    assert_eq!(escape_default('\\'), ~"\\\\");

    // Other printable ASCII, including both ends of [0x20,0x7e], is untouched.
    assert_eq!(escape_default(' '), ~" ");
    assert_eq!(escape_default('a'), ~"a");
    assert_eq!(escape_default('~'), ~"~");

    // Everything else falls through to the hex unicode escapes.
    assert_eq!(escape_default('\x00'), ~"\\x00");
    assert_eq!(escape_default('\x1f'), ~"\\x1f");
    assert_eq!(escape_default('\x7f'), ~"\\x7f");
    assert_eq!(escape_default('\xff'), ~"\\xff");
    assert_eq!(escape_default('\u011b'), ~"\\u011b");
    assert_eq!(escape_default('\U0001d4b6'), ~"\\U0001d4b6");
}
338
339
340 #[test]
341 fn test_escape_unicode() {
342 assert_eq!(escape_unicode('\x00'), ~"\\x00");
343 assert_eq!(escape_unicode('\n'), ~"\\x0a");
344 assert_eq!(escape_unicode(' '), ~"\\x20");
345 assert_eq!(escape_unicode('a'), ~"\\x61");
346 assert_eq!(escape_unicode('\u011b'), ~"\\u011b");
347 assert_eq!(escape_unicode('\U0001d4b6'), ~"\\U0001d4b6");
348 }