]>
Commit | Line | Data |
---|---|---|
1a4d82fc JJ |
1 | // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT |
2 | // file at the top-level directory of this distribution and at | |
3 | // http://rust-lang.org/COPYRIGHT. | |
4 | // | |
5 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or | |
6 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license | |
7 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your | |
8 | // option. This file may not be copied, modified, or distributed | |
9 | // except according to those terms. | |
10 | ||
92a42be0 | 11 | //! A character type. |
1a4d82fc | 12 | //! |
92a42be0 SL |
13 | //! The `char` type represents a single character. More specifically, since |
14 | //! 'character' isn't a well-defined concept in Unicode, `char` is a '[Unicode | |
15 | //! scalar value]', which is similar to, but not the same as, a '[Unicode code | |
16 | //! point]'. | |
c34b1796 | 17 | //! |
92a42be0 SL |
18 | //! [Unicode scalar value]: http://www.unicode.org/glossary/#unicode_scalar_value |
19 | //! [Unicode code point]: http://www.unicode.org/glossary/#code_point | |
c34b1796 | 20 | //! |
92a42be0 | 21 | //! This module exists for technical reasons, the primary documentation for |
54a0048b | 22 | //! `char` is directly on [the `char` primitive type](../../std/primitive.char.html) |
92a42be0 | 23 | //! itself. |
c1a9b12d | 24 | //! |
92a42be0 SL |
25 | //! This module is the home of the iterator implementations for the iterators |
26 | //! implemented on `char`, as well as some useful constants and conversion | |
27 | //! functions that convert various types to `char`. | |
c34b1796 AL |
28 | |
29 | #![stable(feature = "rust1", since = "1.0.0")] | |
1a4d82fc | 30 | |
1a4d82fc | 31 | use core::char::CharExt as C; |
54a0048b | 32 | use core::fmt; |
3157f602 | 33 | use tables::{conversions, derived_property, general_category, property}; |
1a4d82fc | 34 | |
c34b1796 | 35 | // stable reexports |
92a42be0 | 36 | #[stable(feature = "rust1", since = "1.0.0")] |
3157f602 | 37 | pub use core::char::{MAX, from_digit, from_u32, from_u32_unchecked}; |
54a0048b | 38 | #[stable(feature = "rust1", since = "1.0.0")] |
3157f602 | 39 | pub use core::char::{EncodeUtf16, EncodeUtf8, EscapeDefault, EscapeUnicode}; |
c34b1796 AL |
40 | |
41 | // unstable reexports | |
92a42be0 | 42 | #[unstable(feature = "unicode", issue = "27783")] |
c34b1796 AL |
43 | pub use tables::UNICODE_VERSION; |
44 | ||
92a42be0 SL |
45 | /// Returns an iterator that yields the lowercase equivalent of a `char`. |
46 | /// | |
47 | /// This `struct` is created by the [`to_lowercase()`] method on [`char`]. See | |
48 | /// its documentation for more. | |
49 | /// | |
54a0048b SL |
50 | /// [`to_lowercase()`]: ../../std/primitive.char.html#method.to_lowercase |
51 | /// [`char`]: ../../std/primitive.char.html | |
c34b1796 | 52 | #[stable(feature = "rust1", since = "1.0.0")] |
62682a34 | 53 | pub struct ToLowercase(CaseMappingIter); |
c34b1796 AL |
54 | |
55 | #[stable(feature = "rust1", since = "1.0.0")] | |
56 | impl Iterator for ToLowercase { | |
57 | type Item = char; | |
b039eaaf SL |
58 | fn next(&mut self) -> Option<char> { |
59 | self.0.next() | |
60 | } | |
c34b1796 AL |
61 | } |
62 | ||
92a42be0 SL |
63 | /// Returns an iterator that yields the uppercase equivalent of a `char`. |
64 | /// | |
65 | /// This `struct` is created by the [`to_uppercase()`] method on [`char`]. See | |
66 | /// its documentation for more. | |
67 | /// | |
54a0048b SL |
68 | /// [`to_uppercase()`]: ../../std/primitive.char.html#method.to_uppercase |
69 | /// [`char`]: ../../std/primitive.char.html | |
c34b1796 | 70 | #[stable(feature = "rust1", since = "1.0.0")] |
62682a34 | 71 | pub struct ToUppercase(CaseMappingIter); |
c34b1796 AL |
72 | |
73 | #[stable(feature = "rust1", since = "1.0.0")] | |
74 | impl Iterator for ToUppercase { | |
75 | type Item = char; | |
b039eaaf SL |
76 | fn next(&mut self) -> Option<char> { |
77 | self.0.next() | |
78 | } | |
62682a34 SL |
79 | } |
80 | ||
62682a34 SL |
81 | |
/// Iterator state wrapped by `ToLowercase` and `ToUppercase`.
///
/// Each variant holds the characters that are still to be yielded, in
/// order; `Zero` means the iterator is exhausted.
enum CaseMappingIter {
    Three(char, char, char),
    Two(char, char),
    One(char),
    Zero,
}

impl CaseMappingIter {
    /// Builds the iterator from a fixed three-slot mapping.
    ///
    /// Trailing `'\0'` slots are treated as padding and dropped. The first
    /// slot is always kept, so an all-`'\0'` input yields a single `'\0'`.
    fn new(chars: [char; 3]) -> CaseMappingIter {
        if chars[2] == '\0' {
            if chars[1] == '\0' {
                CaseMappingIter::One(chars[0]) // Including if chars[0] == '\0'
            } else {
                CaseMappingIter::Two(chars[0], chars[1])
            }
        } else {
            CaseMappingIter::Three(chars[0], chars[1], chars[2])
        }
    }
}

impl Iterator for CaseMappingIter {
    type Item = char;

    /// Yields the front character and steps down to the next smaller variant.
    fn next(&mut self) -> Option<char> {
        match *self {
            CaseMappingIter::Three(a, b, c) => {
                *self = CaseMappingIter::Two(b, c);
                Some(a)
            }
            CaseMappingIter::Two(b, c) => {
                *self = CaseMappingIter::One(c);
                Some(b)
            }
            CaseMappingIter::One(c) => {
                *self = CaseMappingIter::Zero;
                Some(c)
            }
            CaseMappingIter::Zero => None,
        }
    }

    /// The number of characters left is known exactly from the variant.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining = match *self {
            CaseMappingIter::Three(..) => 3,
            CaseMappingIter::Two(..) => 2,
            CaseMappingIter::One(..) => 1,
            CaseMappingIter::Zero => 0,
        };
        (remaining, Some(remaining))
    }
}
123 | ||
c34b1796 AL |
124 | #[lang = "char"] |
125 | impl char { | |
b039eaaf SL |
126 | /// Checks if a `char` is a digit in the given radix. |
127 | /// | |
128 | /// A 'radix' here is sometimes also called a 'base'. A radix of two | |
129 | /// indicates a binary number, a radix of ten, decimal, and a radix of | |
9cc50fc6 | 130 | /// sixteen, hexadecimal, to give some common values. Arbitrary |
b039eaaf | 131 | /// radices are supported. |
1a4d82fc JJ |
132 | /// |
133 | /// Compared to `is_numeric()`, this function only recognizes the characters | |
134 | /// `0-9`, `a-z` and `A-Z`. | |
135 | /// | |
b039eaaf | 136 | /// 'Digit' is defined to be only the following characters: |
1a4d82fc | 137 | /// |
b039eaaf SL |
138 | /// * `0-9` |
139 | /// * `a-z` | |
140 | /// * `A-Z` | |
141 | /// | |
142 | /// For a more comprehensive understanding of 'digit', see [`is_numeric()`][is_numeric]. | |
143 | /// | |
144 | /// [is_numeric]: #method.is_numeric | |
1a4d82fc JJ |
145 | /// |
146 | /// # Panics | |
147 | /// | |
b039eaaf | 148 | /// Panics if given a radix larger than 36. |
c34b1796 AL |
149 | /// |
150 | /// # Examples | |
151 | /// | |
b039eaaf SL |
152 | /// Basic usage: |
153 | /// | |
c34b1796 | 154 | /// ``` |
54a0048b SL |
155 | /// assert!('1'.is_digit(10)); |
156 | /// assert!('f'.is_digit(16)); | |
157 | /// assert!(!'f'.is_digit(10)); | |
b039eaaf SL |
158 | /// ``` |
159 | /// | |
160 | /// Passing a large radix, causing a panic: | |
161 | /// | |
162 | /// ``` | |
163 | /// use std::thread; | |
164 | /// | |
165 | /// let result = thread::spawn(|| { | |
b039eaaf | 166 | /// // this panics |
54a0048b | 167 | /// '1'.is_digit(37); |
b039eaaf SL |
168 | /// }).join(); |
169 | /// | |
170 | /// assert!(result.is_err()); | |
c34b1796 AL |
171 | /// ``` |
172 | #[stable(feature = "rust1", since = "1.0.0")] | |
173 | #[inline] | |
b039eaaf SL |
174 | pub fn is_digit(self, radix: u32) -> bool { |
175 | C::is_digit(self, radix) | |
176 | } | |
1a4d82fc | 177 | |
b039eaaf SL |
178 | /// Converts a `char` to a digit in the given radix. |
179 | /// | |
180 | /// A 'radix' here is sometimes also called a 'base'. A radix of two | |
181 | /// indicates a binary number, a radix of ten, decimal, and a radix of | |
9cc50fc6 | 182 | /// sixteen, hexadecimal, to give some common values. Arbitrary |
b039eaaf SL |
183 | /// radices are supported. |
184 | /// | |
185 | /// 'Digit' is defined to be only the following characters: | |
186 | /// | |
187 | /// * `0-9` | |
188 | /// * `a-z` | |
189 | /// * `A-Z` | |
1a4d82fc | 190 | /// |
7453a54e | 191 | /// # Errors |
1a4d82fc | 192 | /// |
b039eaaf | 193 | /// Returns `None` if the `char` does not refer to a digit in the given radix. |
1a4d82fc JJ |
194 | /// |
195 | /// # Panics | |
196 | /// | |
b039eaaf | 197 | /// Panics if given a radix larger than 36. |
c34b1796 AL |
198 | /// |
199 | /// # Examples | |
200 | /// | |
b039eaaf SL |
201 | /// Basic usage: |
202 | /// | |
203 | /// ``` | |
54a0048b SL |
204 | /// assert_eq!('1'.to_digit(10), Some(1)); |
205 | /// assert_eq!('f'.to_digit(16), Some(15)); | |
b039eaaf SL |
206 | /// ``` |
207 | /// | |
208 | /// Passing a non-digit results in failure: | |
209 | /// | |
210 | /// ``` | |
54a0048b SL |
211 | /// assert_eq!('f'.to_digit(10), None); |
212 | /// assert_eq!('z'.to_digit(16), None); | |
b039eaaf SL |
213 | /// ``` |
214 | /// | |
215 | /// Passing a large radix, causing a panic: | |
216 | /// | |
c34b1796 | 217 | /// ``` |
b039eaaf | 218 | /// use std::thread; |
c34b1796 | 219 | /// |
b039eaaf | 220 | /// let result = thread::spawn(|| { |
54a0048b | 221 | /// '1'.to_digit(37); |
b039eaaf SL |
222 | /// }).join(); |
223 | /// | |
224 | /// assert!(result.is_err()); | |
c34b1796 AL |
225 | /// ``` |
226 | #[stable(feature = "rust1", since = "1.0.0")] | |
62682a34 | 227 | #[inline] |
b039eaaf SL |
228 | pub fn to_digit(self, radix: u32) -> Option<u32> { |
229 | C::to_digit(self, radix) | |
230 | } | |
1a4d82fc | 231 | |
c34b1796 AL |
232 | /// Returns an iterator that yields the hexadecimal Unicode escape of a |
233 | /// character, as `char`s. | |
1a4d82fc JJ |
234 | /// |
235 | /// All characters are escaped with Rust syntax of the form `\\u{NNNN}` | |
92a42be0 | 236 | /// where `NNNN` is the shortest hexadecimal representation. |
c34b1796 AL |
237 | /// |
238 | /// # Examples | |
239 | /// | |
92a42be0 SL |
240 | /// Basic usage: |
241 | /// | |
c34b1796 | 242 | /// ``` |
62682a34 SL |
243 | /// for c in '❤'.escape_unicode() { |
244 | /// print!("{}", c); | |
c34b1796 | 245 | /// } |
62682a34 | 246 | /// println!(""); |
c34b1796 AL |
247 | /// ``` |
248 | /// | |
249 | /// This prints: | |
250 | /// | |
251 | /// ```text | |
62682a34 | 252 | /// \u{2764} |
c34b1796 AL |
253 | /// ``` |
254 | /// | |
255 | /// Collecting into a `String`: | |
256 | /// | |
257 | /// ``` | |
258 | /// let heart: String = '❤'.escape_unicode().collect(); | |
259 | /// | |
260 | /// assert_eq!(heart, r"\u{2764}"); | |
261 | /// ``` | |
85aaf69f | 262 | #[stable(feature = "rust1", since = "1.0.0")] |
62682a34 | 263 | #[inline] |
b039eaaf SL |
264 | pub fn escape_unicode(self) -> EscapeUnicode { |
265 | C::escape_unicode(self) | |
266 | } | |
1a4d82fc | 267 | |
b039eaaf | 268 | /// Returns an iterator that yields the literal escape code of a `char`. |
1a4d82fc JJ |
269 | /// |
270 | /// The default is chosen with a bias toward producing literals that are | |
271 | /// legal in a variety of languages, including C++11 and similar C-family | |
272 | /// languages. The exact rules are: | |
273 | /// | |
b039eaaf SL |
274 | /// * Tab is escaped as `\t`. |
275 | /// * Carriage return is escaped as `\r`. | |
276 | /// * Line feed is escaped as `\n`. | |
277 | /// * Single quote is escaped as `\'`. | |
278 | /// * Double quote is escaped as `\"`. | |
279 | /// * Backslash is escaped as `\\`. | |
280 | /// * Any character in the 'printable ASCII' range `0x20` .. `0x7e` | |
281 | /// inclusive is not escaped. | |
282 | /// * All other characters are given hexadecimal Unicode escapes; see | |
283 | /// [`escape_unicode`][escape_unicode]. | |
284 | /// | |
285 | /// [escape_unicode]: #method.escape_unicode | |
c34b1796 AL |
286 | /// |
287 | /// # Examples | |
288 | /// | |
b039eaaf SL |
289 | /// Basic usage: |
290 | /// | |
c34b1796 AL |
291 | /// ``` |
292 | /// for i in '"'.escape_default() { | |
293 | /// println!("{}", i); | |
294 | /// } | |
295 | /// ``` | |
296 | /// | |
297 | /// This prints: | |
298 | /// | |
299 | /// ```text | |
300 | /// \ | |
301 | /// " | |
302 | /// ``` | |
303 | /// | |
304 | /// Collecting into a `String`: | |
305 | /// | |
306 | /// ``` | |
307 | /// let quote: String = '"'.escape_default().collect(); | |
308 | /// | |
309 | /// assert_eq!(quote, "\\\""); | |
310 | /// ``` | |
85aaf69f | 311 | #[stable(feature = "rust1", since = "1.0.0")] |
62682a34 | 312 | #[inline] |
b039eaaf SL |
313 | pub fn escape_default(self) -> EscapeDefault { |
314 | C::escape_default(self) | |
315 | } | |
1a4d82fc | 316 | |
b039eaaf SL |
317 | /// Returns the number of bytes this `char` would need if encoded in UTF-8. |
318 | /// | |
319 | /// That number of bytes is always between 1 and 4, inclusive. | |
c34b1796 AL |
320 | /// |
321 | /// # Examples | |
322 | /// | |
b039eaaf SL |
323 | /// Basic usage: |
324 | /// | |
c34b1796 | 325 | /// ``` |
b039eaaf SL |
326 | /// let len = 'A'.len_utf8(); |
327 | /// assert_eq!(len, 1); | |
328 | /// | |
329 | /// let len = 'ß'.len_utf8(); | |
330 | /// assert_eq!(len, 2); | |
331 | /// | |
332 | /// let len = 'ℝ'.len_utf8(); | |
333 | /// assert_eq!(len, 3); | |
c34b1796 | 334 | /// |
b039eaaf SL |
335 | /// let len = '💣'.len_utf8(); |
336 | /// assert_eq!(len, 4); | |
337 | /// ``` | |
338 | /// | |
339 | /// The `&str` type guarantees that its contents are UTF-8, and so we can compare the length it | |
340 | /// would take if each code point was represented as a `char` vs in the `&str` itself: | |
341 | /// | |
342 | /// ``` | |
343 | /// // as chars | |
344 | /// let eastern = '東'; | |
345 | /// let capitol = '京'; | |
346 | /// | |
347 | /// // both can be represented as three bytes | |
348 | /// assert_eq!(3, eastern.len_utf8()); | |
349 | /// assert_eq!(3, capitol.len_utf8()); | |
350 | /// | |
351 | /// // as a &str, these two are encoded in UTF-8 | |
352 | /// let tokyo = "東京"; | |
353 | /// | |
354 | /// let len = eastern.len_utf8() + capitol.len_utf8(); | |
355 | /// | |
356 | /// // we can see that they take six bytes total... | |
357 | /// assert_eq!(6, tokyo.len()); | |
358 | /// | |
359 | /// // ... just like the &str | |
360 | /// assert_eq!(len, tokyo.len()); | |
c34b1796 | 361 | /// ``` |
85aaf69f | 362 | #[stable(feature = "rust1", since = "1.0.0")] |
62682a34 | 363 | #[inline] |
b039eaaf SL |
364 | pub fn len_utf8(self) -> usize { |
365 | C::len_utf8(self) | |
366 | } | |
1a4d82fc | 367 | |
b039eaaf | 368 | /// Returns the number of 16-bit code units this `char` would need if |
c34b1796 AL |
369 | /// encoded in UTF-16. |
370 | /// | |
92a42be0 SL |
371 | /// See the documentation for [`len_utf8()`] for more explanation of this |
372 | /// concept. This function is a mirror, but for UTF-16 instead of UTF-8. | |
373 | /// | |
374 | /// [`len_utf8()`]: #method.len_utf8 | |
b039eaaf | 375 | /// |
c34b1796 AL |
376 | /// # Examples |
377 | /// | |
92a42be0 SL |
378 | /// Basic usage: |
379 | /// | |
c34b1796 AL |
380 | /// ``` |
381 | /// let n = 'ß'.len_utf16(); | |
c34b1796 | 382 | /// assert_eq!(n, 1); |
b039eaaf SL |
383 | /// |
384 | /// let len = '💣'.len_utf16(); | |
385 | /// assert_eq!(len, 2); | |
c34b1796 | 386 | /// ``` |
85aaf69f | 387 | #[stable(feature = "rust1", since = "1.0.0")] |
62682a34 | 388 | #[inline] |
b039eaaf SL |
389 | pub fn len_utf16(self) -> usize { |
390 | C::len_utf16(self) | |
391 | } | |
1a4d82fc | 392 | |
54a0048b | 393 | /// Returns an iterator over the bytes of this character as UTF-8. |
c34b1796 | 394 | /// |
54a0048b SL |
395 | /// The returned iterator also has an `as_slice()` method to view the |
396 | /// encoded bytes as a byte slice. | |
c34b1796 AL |
397 | /// |
398 | /// # Examples | |
399 | /// | |
c34b1796 | 400 | /// ``` |
c1a9b12d SL |
401 | /// #![feature(unicode)] |
402 | /// | |
54a0048b SL |
403 | /// let iterator = 'ß'.encode_utf8(); |
404 | /// assert_eq!(iterator.as_slice(), [0xc3, 0x9f]); | |
c34b1796 | 405 | /// |
54a0048b SL |
406 | /// for (i, byte) in iterator.enumerate() { |
407 | /// println!("byte {}: {:x}", i, byte); | |
408 | /// } | |
c34b1796 | 409 | /// ``` |
54a0048b | 410 | #[unstable(feature = "unicode", issue = "27784")] |
62682a34 | 411 | #[inline] |
54a0048b SL |
412 | pub fn encode_utf8(self) -> EncodeUtf8 { |
413 | C::encode_utf8(self) | |
62682a34 | 414 | } |
1a4d82fc | 415 | |
54a0048b | 416 | /// Returns an iterator over the `u16` entries of this character as UTF-16. |
c34b1796 | 417 | /// |
54a0048b SL |
418 | /// The returned iterator also has an `as_slice()` method to view the |
419 | /// encoded form as a slice. | |
c34b1796 AL |
420 | /// |
421 | /// # Examples | |
422 | /// | |
c34b1796 | 423 | /// ``` |
c1a9b12d SL |
424 | /// #![feature(unicode)] |
425 | /// | |
54a0048b SL |
426 | /// let iterator = '𝕊'.encode_utf16(); |
427 | /// assert_eq!(iterator.as_slice(), [0xd835, 0xdd4a]); | |
c34b1796 | 428 | /// |
54a0048b SL |
429 | /// for (i, val) in iterator.enumerate() { |
430 | /// println!("entry {}: {:x}", i, val); | |
431 | /// } | |
c34b1796 | 432 | /// ``` |
54a0048b | 433 | #[unstable(feature = "unicode", issue = "27784")] |
62682a34 | 434 | #[inline] |
54a0048b SL |
435 | pub fn encode_utf16(self) -> EncodeUtf16 { |
436 | C::encode_utf16(self) | |
62682a34 | 437 | } |
1a4d82fc | 438 | |
b039eaaf SL |
439 | /// Returns true if this `char` is an alphabetic code point, and false if not. |
440 | /// | |
441 | /// # Examples | |
442 | /// | |
443 | /// Basic usage: | |
444 | /// | |
445 | /// ``` | |
54a0048b SL |
446 | /// assert!('a'.is_alphabetic()); |
447 | /// assert!('京'.is_alphabetic()); | |
b039eaaf SL |
448 | /// |
449 | /// let c = '💝'; | |
450 | /// // love is many things, but it is not alphabetic | |
451 | /// assert!(!c.is_alphabetic()); | |
452 | /// ``` | |
85aaf69f | 453 | #[stable(feature = "rust1", since = "1.0.0")] |
c34b1796 AL |
454 | #[inline] |
455 | pub fn is_alphabetic(self) -> bool { | |
456 | match self { | |
b039eaaf | 457 | 'a'...'z' | 'A'...'Z' => true, |
c34b1796 | 458 | c if c > '\x7f' => derived_property::Alphabetic(c), |
b039eaaf | 459 | _ => false, |
c34b1796 AL |
460 | } |
461 | } | |
1a4d82fc | 462 | |
b039eaaf SL |
463 | /// Returns true if this `char` satisfies the 'XID_Start' Unicode property, and false |
464 | /// otherwise. | |
1a4d82fc JJ |
465 | /// |
466 | /// 'XID_Start' is a Unicode Derived Property specified in | |
467 | /// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications), | |
b039eaaf | 468 | /// mostly similar to `ID_Start` but modified for closure under `NFKx`. |
85aaf69f | 469 | #[unstable(feature = "unicode", |
e9174d1e SL |
470 | reason = "mainly needed for compiler internals", |
471 | issue = "0")] | |
c34b1796 | 472 | #[inline] |
b039eaaf SL |
473 | pub fn is_xid_start(self) -> bool { |
474 | derived_property::XID_Start(self) | |
475 | } | |
1a4d82fc | 476 | |
b039eaaf SL |
477 | /// Returns true if this `char` satisfies the 'XID_Continue' Unicode property, and false |
478 | /// otherwise. | |
1a4d82fc JJ |
479 | /// |
480 | /// 'XID_Continue' is a Unicode Derived Property specified in | |
481 | /// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications), | |
482 | /// mostly similar to 'ID_Continue' but modified for closure under NFKx. | |
85aaf69f | 483 | #[unstable(feature = "unicode", |
e9174d1e SL |
484 | reason = "mainly needed for compiler internals", |
485 | issue = "0")] | |
c34b1796 | 486 | #[inline] |
b039eaaf SL |
487 | pub fn is_xid_continue(self) -> bool { |
488 | derived_property::XID_Continue(self) | |
489 | } | |
1a4d82fc | 490 | |
b039eaaf | 491 | /// Returns true if this `char` is lowercase, and false otherwise. |
1a4d82fc | 492 | /// |
b039eaaf | 493 | /// 'Lowercase' is defined according to the terms of the Unicode Derived Core |
1a4d82fc | 494 | /// Property `Lowercase`. |
b039eaaf SL |
495 | /// |
496 | /// # Examples | |
497 | /// | |
498 | /// Basic usage: | |
499 | /// | |
500 | /// ``` | |
54a0048b SL |
501 | /// assert!('a'.is_lowercase()); |
502 | /// assert!('δ'.is_lowercase()); | |
503 | /// assert!(!'A'.is_lowercase()); | |
504 | /// assert!(!'Δ'.is_lowercase()); | |
b039eaaf SL |
505 | /// |
506 | /// // The various Chinese scripts do not have case, and so: | |
54a0048b | 507 | /// assert!(!'中'.is_lowercase()); |
b039eaaf | 508 | /// ``` |
85aaf69f | 509 | #[stable(feature = "rust1", since = "1.0.0")] |
c34b1796 AL |
510 | #[inline] |
511 | pub fn is_lowercase(self) -> bool { | |
512 | match self { | |
b039eaaf | 513 | 'a'...'z' => true, |
c34b1796 | 514 | c if c > '\x7f' => derived_property::Lowercase(c), |
b039eaaf | 515 | _ => false, |
c34b1796 AL |
516 | } |
517 | } | |
1a4d82fc | 518 | |
b039eaaf | 519 | /// Returns true if this `char` is uppercase, and false otherwise. |
1a4d82fc | 520 | /// |
b039eaaf | 521 | /// 'Uppercase' is defined according to the terms of the Unicode Derived Core |
1a4d82fc | 522 | /// Property `Uppercase`. |
b039eaaf SL |
523 | /// |
524 | /// # Examples | |
525 | /// | |
526 | /// Basic usage: | |
527 | /// | |
528 | /// ``` | |
54a0048b SL |
529 | /// assert!(!'a'.is_uppercase()); |
530 | /// assert!(!'δ'.is_uppercase()); | |
531 | /// assert!('A'.is_uppercase()); | |
532 | /// assert!('Δ'.is_uppercase()); | |
b039eaaf SL |
533 | /// |
534 | /// // The various Chinese scripts do not have case, and so: | |
54a0048b | 535 | /// assert!(!'中'.is_uppercase()); |
b039eaaf | 536 | /// ``` |
85aaf69f | 537 | #[stable(feature = "rust1", since = "1.0.0")] |
c34b1796 AL |
538 | #[inline] |
539 | pub fn is_uppercase(self) -> bool { | |
540 | match self { | |
b039eaaf | 541 | 'A'...'Z' => true, |
c34b1796 | 542 | c if c > '\x7f' => derived_property::Uppercase(c), |
b039eaaf | 543 | _ => false, |
c34b1796 AL |
544 | } |
545 | } | |
1a4d82fc | 546 | |
b039eaaf SL |
547 | /// Returns true if this `char` is whitespace, and false otherwise. |
548 | /// | |
549 | /// 'Whitespace' is defined according to the terms of the Unicode Derived Core | |
550 | /// Property `White_Space`. | |
551 | /// | |
552 | /// # Examples | |
1a4d82fc | 553 | /// |
b039eaaf SL |
554 | /// Basic usage: |
555 | /// | |
556 | /// ``` | |
54a0048b | 557 | /// assert!(' '.is_whitespace()); |
b039eaaf SL |
558 | /// |
559 | /// // a non-breaking space | |
54a0048b | 560 | /// assert!('\u{A0}'.is_whitespace()); |
b039eaaf | 561 | /// |
54a0048b | 562 | /// assert!(!'越'.is_whitespace()); |
b039eaaf | 563 | /// ``` |
85aaf69f | 564 | #[stable(feature = "rust1", since = "1.0.0")] |
c34b1796 AL |
565 | #[inline] |
566 | pub fn is_whitespace(self) -> bool { | |
567 | match self { | |
b039eaaf | 568 | ' ' | '\x09'...'\x0d' => true, |
c34b1796 | 569 | c if c > '\x7f' => property::White_Space(c), |
b039eaaf | 570 | _ => false, |
c34b1796 AL |
571 | } |
572 | } | |
1a4d82fc | 573 | |
b039eaaf | 574 | /// Returns true if this `char` is alphanumeric, and false otherwise. |
1a4d82fc | 575 | /// |
b039eaaf | 576 | /// 'Alphanumeric'-ness is defined in terms of the Unicode General Categories |
1a4d82fc | 577 | /// 'Nd', 'Nl', 'No' and the Derived Core Property 'Alphabetic'. |
b039eaaf SL |
578 | /// |
579 | /// # Examples | |
580 | /// | |
581 | /// Basic usage: | |
582 | /// | |
583 | /// ``` | |
54a0048b SL |
584 | /// assert!('٣'.is_alphanumeric()); |
585 | /// assert!('7'.is_alphanumeric()); | |
586 | /// assert!('৬'.is_alphanumeric()); | |
587 | /// assert!('K'.is_alphanumeric()); | |
588 | /// assert!('و'.is_alphanumeric()); | |
589 | /// assert!('藏'.is_alphanumeric()); | |
590 | /// assert!(!'¾'.is_alphanumeric()); | |
591 | /// assert!(!'①'.is_alphanumeric()); | |
b039eaaf | 592 | /// ``` |
85aaf69f | 593 | #[stable(feature = "rust1", since = "1.0.0")] |
c34b1796 AL |
594 | #[inline] |
595 | pub fn is_alphanumeric(self) -> bool { | |
596 | self.is_alphabetic() || self.is_numeric() | |
597 | } | |
1a4d82fc | 598 | |
b039eaaf | 599 | /// Returns true if this `char` is a control code point, and false otherwise. |
1a4d82fc | 600 | /// |
b039eaaf | 601 | /// 'Control code point' is defined in terms of the Unicode General |
1a4d82fc | 602 | /// Category `Cc`. |
b039eaaf SL |
603 | /// |
604 | /// # Examples | |
605 | /// | |
606 | /// Basic usage: | |
607 | /// | |
608 | /// ``` | |
609 | /// // U+009C, STRING TERMINATOR | |
54a0048b SL |
610 | /// assert!('\u{9c}'.is_control()); |
611 | /// assert!(!'q'.is_control()); | |
b039eaaf | 612 | /// ``` |
85aaf69f | 613 | #[stable(feature = "rust1", since = "1.0.0")] |
c34b1796 | 614 | #[inline] |
b039eaaf SL |
615 | pub fn is_control(self) -> bool { |
616 | general_category::Cc(self) | |
617 | } | |
1a4d82fc | 618 | |
b039eaaf SL |
619 | /// Returns true if this `char` is numeric, and false otherwise. |
620 | /// | |
621 | /// 'Numeric'-ness is defined in terms of the Unicode General Categories | |
622 | /// 'Nd', 'Nl', 'No'. | |
623 | /// | |
624 | /// # Examples | |
625 | /// | |
626 | /// Basic usage: | |
627 | /// | |
628 | /// ``` | |
54a0048b SL |
629 | /// assert!('٣'.is_numeric()); |
630 | /// assert!('7'.is_numeric()); | |
631 | /// assert!('৬'.is_numeric()); | |
632 | /// assert!(!'K'.is_numeric()); | |
633 | /// assert!(!'و'.is_numeric()); | |
634 | /// assert!(!'藏'.is_numeric()); | |
635 | /// assert!(!'¾'.is_numeric()); | |
636 | /// assert!(!'①'.is_numeric()); | |
b039eaaf | 637 | /// ``` |
85aaf69f | 638 | #[stable(feature = "rust1", since = "1.0.0")] |
c34b1796 AL |
639 | #[inline] |
640 | pub fn is_numeric(self) -> bool { | |
641 | match self { | |
b039eaaf | 642 | '0'...'9' => true, |
c34b1796 | 643 | c if c > '\x7f' => general_category::N(c), |
b039eaaf | 644 | _ => false, |
c34b1796 AL |
645 | } |
646 | } | |
1a4d82fc | 647 | |
b039eaaf | 648 | /// Returns an iterator that yields the lowercase equivalent of a `char`. |
1a4d82fc | 649 | /// |
b039eaaf | 650 | /// If no conversion is possible then an iterator with just the input character is returned. |
1a4d82fc | 651 | /// |
b039eaaf SL |
652 | /// This performs complex unconditional mappings with no tailoring: it maps |
653 | /// one Unicode character to its lowercase equivalent according to the | |
654 | /// [Unicode database] and the additional complex mappings | |
655 | /// [`SpecialCasing.txt`]. Conditional mappings (based on context or | |
656 | /// language) are not considered here. | |
1a4d82fc | 657 | /// |
b039eaaf SL |
658 | /// For a full reference, see [here][reference]. |
659 | /// | |
660 | /// [Unicode database]: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt | |
661 | /// | |
662 | /// [`SpecialCasing.txt`]: ftp://ftp.unicode.org/Public/UNIDATA/SpecialCasing.txt | |
663 | /// | |
664 | /// [reference]: http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G33992 | |
62682a34 SL |
665 | /// |
666 | /// # Examples | |
667 | /// | |
b039eaaf SL |
668 | /// Basic usage: |
669 | /// | |
62682a34 | 670 | /// ``` |
3157f602 XL |
671 | /// assert_eq!('C'.to_lowercase().collect::<String>(), "c"); |
672 | /// | |
673 | /// // Sometimes the result is more than one character: | |
674 | /// assert_eq!('İ'.to_lowercase().collect::<String>(), "i\u{307}"); | |
b039eaaf SL |
675 | /// |
676 | /// // Japanese scripts do not have case, and so: | |
3157f602 | 677 | /// assert_eq!('山'.to_lowercase().collect::<String>(), "山"); |
62682a34 | 678 | /// ``` |
c34b1796 AL |
679 | #[stable(feature = "rust1", since = "1.0.0")] |
680 | #[inline] | |
681 | pub fn to_lowercase(self) -> ToLowercase { | |
62682a34 SL |
682 | ToLowercase(CaseMappingIter::new(conversions::to_lower(self))) |
683 | } | |
684 | ||
b039eaaf | 685 | /// Returns an iterator that yields the uppercase equivalent of a `char`. |
1a4d82fc | 686 | /// |
b039eaaf | 687 | /// If no conversion is possible then an iterator with just the input character is returned. |
1a4d82fc | 688 | /// |
b039eaaf SL |
689 | /// This performs complex unconditional mappings with no tailoring: it maps |
690 | /// one Unicode character to its uppercase equivalent according to the | |
691 | /// [Unicode database] and the additional complex mappings | |
692 | /// [`SpecialCasing.txt`]. Conditional mappings (based on context or | |
693 | /// language) are not considered here. | |
1a4d82fc | 694 | /// |
b039eaaf | 695 | /// For a full reference, see [here][reference]. |
1a4d82fc | 696 | /// |
b039eaaf | 697 | /// [Unicode database]: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt |
1a4d82fc | 698 | /// |
62682a34 SL |
699 | /// [`SpecialCasing.txt`]: ftp://ftp.unicode.org/Public/UNIDATA/SpecialCasing.txt |
700 | /// | |
b039eaaf | 701 | /// [reference]: http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G33992 |
1a4d82fc | 702 | /// |
62682a34 SL |
703 | /// # Examples |
704 | /// | |
b039eaaf SL |
705 | /// Basic usage: |
706 | /// | |
707 | /// ``` | |
3157f602 XL |
708 | /// assert_eq!('c'.to_uppercase().collect::<String>(), "C"); |
709 | /// | |
710 | /// // Sometimes the result is more than one character: | |
711 | /// assert_eq!('ß'.to_uppercase().collect::<String>(), "SS"); | |
b039eaaf SL |
712 | /// |
713 | /// // Japanese does not have case, and so: | |
3157f602 | 714 | /// assert_eq!('山'.to_uppercase().collect::<String>(), "山"); |
b039eaaf SL |
715 | /// ``` |
716 | /// | |
717 | /// In Turkish, the equivalent of 'i' in Latin has five forms instead of two: | |
718 | /// | |
719 | /// * 'Dotless': I / ı, sometimes written ï | |
720 | /// * 'Dotted': İ / i | |
721 | /// | |
722 | /// Note that the lowercase dotted 'i' is the same as the Latin. Therefore: | |
723 | /// | |
724 | /// ``` | |
3157f602 | 725 | /// let upper_i: String = 'i'.to_uppercase().collect(); |
b039eaaf SL |
726 | /// ``` |
727 | /// | |
728 | /// The value of `upper_i` here relies on the language of the text: if we're | |
3157f602 XL |
729 | /// in `en-US`, it should be `"I"`, but if we're in `tr_TR`, it should |
730 | /// be `"İ"`. `to_uppercase()` does not take this into account, and so: | |
b039eaaf | 731 | /// |
62682a34 | 732 | /// ``` |
3157f602 | 733 | /// let upper_i: String = 'i'.to_uppercase().collect(); |
b039eaaf | 734 | /// |
3157f602 | 735 | /// assert_eq!(upper_i, "I"); |
62682a34 | 736 | /// ``` |
b039eaaf SL |
737 | /// |
738 | /// holds across languages. | |
c34b1796 AL |
739 | #[stable(feature = "rust1", since = "1.0.0")] |
740 | #[inline] | |
741 | pub fn to_uppercase(self) -> ToUppercase { | |
62682a34 | 742 | ToUppercase(CaseMappingIter::new(conversions::to_upper(self))) |
c34b1796 | 743 | } |
e9174d1e SL |
744 | } |
745 | ||
b039eaaf | 746 | /// An iterator that decodes UTF-16 encoded code points from an iterator of `u16`s. |
54a0048b | 747 | #[stable(feature = "decode_utf16", since = "1.9.0")] |
e9174d1e | 748 | #[derive(Clone)] |
b039eaaf SL |
749 | pub struct DecodeUtf16<I> |
750 | where I: Iterator<Item = u16> | |
751 | { | |
e9174d1e SL |
752 | iter: I, |
753 | buf: Option<u16>, | |
754 | } | |
755 | ||
54a0048b SL |
756 | /// An iterator that decodes UTF-16 encoded code points from an iterator of `u16`s. |
757 | #[stable(feature = "decode_utf16", since = "1.9.0")] | |
758 | #[derive(Debug, Clone, Eq, PartialEq)] | |
759 | pub struct DecodeUtf16Error { | |
760 | code: u16, | |
761 | } | |
762 | ||
763 | /// Create an iterator over the UTF-16 encoded code points in `iter`, | |
e9174d1e SL |
764 | /// returning unpaired surrogates as `Err`s. |
765 | /// | |
766 | /// # Examples | |
767 | /// | |
92a42be0 SL |
768 | /// Basic usage: |
769 | /// | |
e9174d1e | 770 | /// ``` |
e9174d1e SL |
771 | /// use std::char::decode_utf16; |
772 | /// | |
773 | /// fn main() { | |
774 | /// // 𝄞mus<invalid>ic<invalid> | |
775 | /// let v = [0xD834, 0xDD1E, 0x006d, 0x0075, | |
776 | /// 0x0073, 0xDD1E, 0x0069, 0x0063, | |
777 | /// 0xD834]; | |
778 | /// | |
54a0048b SL |
779 | /// assert_eq!(decode_utf16(v.iter().cloned()) |
780 | /// .map(|r| r.map_err(|e| e.unpaired_surrogate())) | |
781 | /// .collect::<Vec<_>>(), | |
e9174d1e SL |
782 | /// vec![Ok('𝄞'), |
783 | /// Ok('m'), Ok('u'), Ok('s'), | |
784 | /// Err(0xDD1E), | |
785 | /// Ok('i'), Ok('c'), | |
786 | /// Err(0xD834)]); | |
787 | /// } | |
788 | /// ``` | |
789 | /// | |
790 | /// A lossy decoder can be obtained by replacing `Err` results with the replacement character: | |
791 | /// | |
792 | /// ``` | |
e9174d1e SL |
793 | /// use std::char::{decode_utf16, REPLACEMENT_CHARACTER}; |
794 | /// | |
795 | /// fn main() { | |
796 | /// // 𝄞mus<invalid>ic<invalid> | |
797 | /// let v = [0xD834, 0xDD1E, 0x006d, 0x0075, | |
798 | /// 0x0073, 0xDD1E, 0x0069, 0x0063, | |
799 | /// 0xD834]; | |
800 | /// | |
801 | /// assert_eq!(decode_utf16(v.iter().cloned()) | |
802 | /// .map(|r| r.unwrap_or(REPLACEMENT_CHARACTER)) | |
803 | /// .collect::<String>(), | |
804 | /// "𝄞mus�ic�"); | |
805 | /// } | |
806 | /// ``` | |
54a0048b | 807 | #[stable(feature = "decode_utf16", since = "1.9.0")] |
e9174d1e | 808 | #[inline] |
54a0048b | 809 | pub fn decode_utf16<I: IntoIterator<Item = u16>>(iter: I) -> DecodeUtf16<I::IntoIter> { |
e9174d1e | 810 | DecodeUtf16 { |
54a0048b | 811 | iter: iter.into_iter(), |
e9174d1e SL |
812 | buf: None, |
813 | } | |
814 | } | |
815 | ||
54a0048b | 816 | #[stable(feature = "decode_utf16", since = "1.9.0")] |
3157f602 | 817 | impl<I: Iterator<Item = u16>> Iterator for DecodeUtf16<I> { |
54a0048b | 818 | type Item = Result<char, DecodeUtf16Error>; |
e9174d1e | 819 | |
54a0048b | 820 | fn next(&mut self) -> Option<Result<char, DecodeUtf16Error>> { |
e9174d1e SL |
821 | let u = match self.buf.take() { |
822 | Some(buf) => buf, | |
3157f602 XL |
823 | None => { |
824 | match self.iter.next() { | |
825 | Some(u) => u, | |
826 | None => return None, | |
827 | } | |
828 | } | |
e9174d1e SL |
829 | }; |
830 | ||
831 | if u < 0xD800 || 0xDFFF < u { | |
832 | // not a surrogate | |
833 | Some(Ok(unsafe { from_u32_unchecked(u as u32) })) | |
834 | } else if u >= 0xDC00 { | |
835 | // a trailing surrogate | |
54a0048b | 836 | Some(Err(DecodeUtf16Error { code: u })) |
e9174d1e SL |
837 | } else { |
838 | let u2 = match self.iter.next() { | |
839 | Some(u2) => u2, | |
840 | // eof | |
54a0048b | 841 | None => return Some(Err(DecodeUtf16Error { code: u })), |
e9174d1e SL |
842 | }; |
843 | if u2 < 0xDC00 || u2 > 0xDFFF { | |
844 | // not a trailing surrogate so we're not a valid | |
845 | // surrogate pair, so rewind to redecode u2 next time. | |
846 | self.buf = Some(u2); | |
54a0048b | 847 | return Some(Err(DecodeUtf16Error { code: u })); |
e9174d1e SL |
848 | } |
849 | ||
850 | // all ok, so lets decode it. | |
851 | let c = (((u - 0xD800) as u32) << 10 | (u2 - 0xDC00) as u32) + 0x1_0000; | |
852 | Some(Ok(unsafe { from_u32_unchecked(c) })) | |
853 | } | |
854 | } | |
1a4d82fc | 855 | |
62682a34 | 856 | #[inline] |
e9174d1e SL |
857 | fn size_hint(&self) -> (usize, Option<usize>) { |
858 | let (low, high) = self.iter.size_hint(); | |
859 | // we could be entirely valid surrogates (2 elements per | |
860 | // char), or entirely non-surrogates (1 element per char) | |
861 | (low / 2, high) | |
62682a34 | 862 | } |
1a4d82fc | 863 | } |
e9174d1e | 864 | |
54a0048b SL |
865 | impl DecodeUtf16Error { |
866 | /// Returns the unpaired surrogate which caused this error. | |
867 | #[stable(feature = "decode_utf16", since = "1.9.0")] | |
868 | pub fn unpaired_surrogate(&self) -> u16 { | |
869 | self.code | |
870 | } | |
871 | } | |
872 | ||
873 | #[stable(feature = "decode_utf16", since = "1.9.0")] | |
874 | impl fmt::Display for DecodeUtf16Error { | |
875 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | |
876 | write!(f, "unpaired surrogate found: {:x}", self.code) | |
877 | } | |
878 | } | |
879 | ||
880 | /// `U+FFFD REPLACEMENT CHARACTER` (�) is used in Unicode to represent a | |
881 | /// decoding error. | |
882 | /// | |
b039eaaf | 883 | /// It can occur, for example, when giving ill-formed UTF-8 bytes to |
54a0048b SL |
884 | /// [`String::from_utf8_lossy`](../../std/string/struct.String.html#method.from_utf8_lossy). |
885 | #[stable(feature = "decode_utf16", since = "1.9.0")] | |
e9174d1e | 886 | pub const REPLACEMENT_CHARACTER: char = '\u{FFFD}'; |