1 // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
11 //! Character manipulation.
13 //! For more details, see ::unicode::char (a.k.a. std::char)
15 #![allow(non_snake_case)]
16 #![doc(primitive = "char")]
20 use option
::Option
::{None, Some}
;
// UTF-8 ranges and tags for encoding characters.
//
// The `TAG_*` values are OR-ed into the payload bits of an encoded byte;
// the `MAX_*` values are exclusive upper bounds on the code point that
// select how many bytes the encoding needs.

/// Tag bits of a UTF-8 continuation byte (`10xxxxxx`).
const TAG_CONT: u8 = 0b1000_0000;
/// Tag bits of the leading byte of a two-byte sequence (`110xxxxx`).
const TAG_TWO_B: u8 = 0b1100_0000;
/// Tag bits of the leading byte of a three-byte sequence (`1110xxxx`).
const TAG_THREE_B: u8 = 0b1110_0000;
/// Tag bits of the leading byte of a four-byte sequence (`11110xxx`).
const TAG_FOUR_B: u8 = 0b1111_0000;
/// Code points strictly below this value are encoded in one byte.
const MAX_ONE_B: u32 = 0x80;
/// Code points strictly below this value are encoded in at most two bytes.
const MAX_TWO_B: u32 = 0x800;
/// Code points strictly below this value are encoded in at most three bytes.
const MAX_THREE_B: u32 = 0x10000;
// Unicode general categories (abbreviation, long name, description):
34 //   Lu  Uppercase_Letter       an uppercase letter
35 //   Ll  Lowercase_Letter       a lowercase letter
36 //   Lt  Titlecase_Letter       a digraphic character, with first part uppercase
37 //   Lm  Modifier_Letter        a modifier letter
38 //   Lo  Other_Letter           other letters, including syllables and ideographs
39 //   Mn  Nonspacing_Mark        a nonspacing combining mark (zero advance width)
40 //   Mc  Spacing_Mark           a spacing combining mark (positive advance width)
41 //   Me  Enclosing_Mark         an enclosing combining mark
42 //   Nd  Decimal_Number         a decimal digit
43 //   Nl  Letter_Number          a letterlike numeric character
44 //   No  Other_Number           a numeric character of other type
45 //   Pc  Connector_Punctuation  a connecting punctuation mark, like a tie
46 //   Pd  Dash_Punctuation       a dash or hyphen punctuation mark
47 //   Ps  Open_Punctuation       an opening punctuation mark (of a pair)
48 //   Pe  Close_Punctuation      a closing punctuation mark (of a pair)
49 //   Pi  Initial_Punctuation    an initial quotation mark
50 //   Pf  Final_Punctuation      a final quotation mark
51 //   Po  Other_Punctuation      a punctuation mark of other type
52 //   Sm  Math_Symbol            a symbol of primarily mathematical use
53 //   Sc  Currency_Symbol        a currency sign
54 //   Sk  Modifier_Symbol        a non-letterlike modifier symbol
55 //   So  Other_Symbol           a symbol of other type
56 //   Zs  Space_Separator        a space character (of various non-zero widths)
57 //   Zl  Line_Separator         U+2028 LINE SEPARATOR only
58 //   Zp  Paragraph_Separator    U+2029 PARAGRAPH SEPARATOR only
59 //   Cc  Control                a C0 or C1 control code
60 //   Cf  Format                 a format control character
61 //   Cs  Surrogate              a surrogate code point
62 //   Co  Private_Use            a private-use character
63 //   Cn  Unassigned             a reserved unassigned code point or a noncharacter
66 /// The highest valid code point a `char` can hold: `U+10FFFF`.
67 #[stable(feature = "rust1", since = "1.0.0")]
68 pub const MAX
: char = '
\u{10ffff}'
;
70 /// Converts a `u32` to an `Option<char>`.
77 /// let c = char::from_u32(10084); // produces `Some(❤)`
78 /// assert_eq!(c, Some('❤'));
81 /// An invalid character:
86 /// let none = char::from_u32(1114112);
87 /// assert_eq!(none, None);
90 #[stable(feature = "rust1", since = "1.0.0")]
91 pub fn from_u32(i
: u32) -> Option
<char> {
92 // catch out-of-bounds and surrogates
93 if (i
> MAX
as u32) || (i
>= 0xD800 && i
<= 0xDFFF) {
96 Some(unsafe { transmute(i) }
)
100 /// Converts a number to the character representing it.
104 /// Returns `Some(char)` if `num` represents one digit under `radix`,
105 /// using one character of `0-9` or `a-z`, or `None` if it doesn't.
109 /// Panics if given a `radix` greater than 36.
116 /// let c = char::from_digit(4, 10);
118 /// assert_eq!(c, Some('4'));
121 #[stable(feature = "rust1", since = "1.0.0")]
122 pub fn from_digit(num
: u32, radix
: u32) -> Option
<char> {
124 panic
!("from_digit: radix is too high (maximum 36)");
129 Some(transmute('
0'
as u32 + num
))
131 Some(transmute('a'
as u32 + num
- 10))
139 // NB: the stabilization and documentation for this trait are in
140 // unicode/char.rs, not here
141 #[allow(missing_docs)] // docs in libunicode/u_char.rs
144 fn is_digit(self, radix
: u32) -> bool
;
145 fn to_digit(self, radix
: u32) -> Option
<u32>;
146 fn escape_unicode(self) -> EscapeUnicode
;
147 fn escape_default(self) -> EscapeDefault
;
148 fn len_utf8(self) -> usize;
149 fn len_utf16(self) -> usize;
150 fn encode_utf8(self, dst
: &mut [u8]) -> Option
<usize>;
151 fn encode_utf16(self, dst
: &mut [u16]) -> Option
<usize>;
154 impl CharExt
for char {
155 fn is_digit(self, radix
: u32) -> bool
{
156 self.to_digit(radix
).is_some()
159 fn to_digit(self, radix
: u32) -> Option
<u32> {
161 panic
!("to_digit: radix is too high (maximum 36)");
163 let val
= match self {
164 '
0'
... '
9'
=> self as u32 - '
0'
as u32,
165 'a'
... 'z'
=> self as u32 - 'a'
as u32 + 10,
166 'A'
... 'Z'
=> self as u32 - 'A'
as u32 + 10,
169 if val
< radix { Some(val) }
173 fn escape_unicode(self) -> EscapeUnicode
{
174 EscapeUnicode { c: self, state: EscapeUnicodeState::Backslash }
177 fn escape_default(self) -> EscapeDefault
{
178 let init_state
= match self {
179 '
\t'
=> EscapeDefaultState
::Backslash('t'
),
180 '
\r'
=> EscapeDefaultState
::Backslash('r'
),
181 '
\n'
=> EscapeDefaultState
::Backslash('n'
),
182 '
\\'
=> EscapeDefaultState
::Backslash('
\\'
),
183 '
\''
=> EscapeDefaultState
::Backslash('
\''
),
184 '
"' => EscapeDefaultState::Backslash('"'
),
185 '
\x20'
... '
\x7e'
=> EscapeDefaultState
::Char(self),
186 _
=> EscapeDefaultState
::Unicode(self.escape_unicode())
188 EscapeDefault { state: init_state }
192 fn len_utf8(self) -> usize {
193 let code
= self as u32;
194 if code
< MAX_ONE_B
{
196 } else if code
< MAX_TWO_B
{
198 } else if code
< MAX_THREE_B
{
206 fn len_utf16(self) -> usize {
207 let ch
= self as u32;
208 if (ch
& 0xFFFF) == ch { 1 }
else { 2 }
212 fn encode_utf8(self, dst
: &mut [u8]) -> Option
<usize> {
213 encode_utf8_raw(self as u32, dst
)
217 fn encode_utf16(self, dst
: &mut [u16]) -> Option
<usize> {
218 encode_utf16_raw(self as u32, dst
)
222 /// Encodes a raw u32 value as UTF-8 into the provided byte buffer,
223 /// and then returns the number of bytes written.
225 /// If the buffer is not large enough, nothing will be written into it
226 /// and a `None` will be returned.
228 pub fn encode_utf8_raw(code
: u32, dst
: &mut [u8]) -> Option
<usize> {
229 // Marked #[inline] so that LLVM can optimize it away
230 if code
< MAX_ONE_B
&& !dst
.is_empty() {
233 } else if code
< MAX_TWO_B
&& dst
.len() >= 2 {
234 dst
[0] = (code
>> 6 & 0x1F) as u8 | TAG_TWO_B
;
235 dst
[1] = (code
& 0x3F) as u8 | TAG_CONT
;
237 } else if code
< MAX_THREE_B
&& dst
.len() >= 3 {
238 dst
[0] = (code
>> 12 & 0x0F) as u8 | TAG_THREE_B
;
239 dst
[1] = (code
>> 6 & 0x3F) as u8 | TAG_CONT
;
240 dst
[2] = (code
& 0x3F) as u8 | TAG_CONT
;
242 } else if dst
.len() >= 4 {
243 dst
[0] = (code
>> 18 & 0x07) as u8 | TAG_FOUR_B
;
244 dst
[1] = (code
>> 12 & 0x3F) as u8 | TAG_CONT
;
245 dst
[2] = (code
>> 6 & 0x3F) as u8 | TAG_CONT
;
246 dst
[3] = (code
& 0x3F) as u8 | TAG_CONT
;
253 /// Encodes a raw u32 value as UTF-16 into the provided `u16` buffer,
254 /// and then returns the number of `u16`s written.
256 /// If the buffer is not large enough, nothing will be written into it
257 /// and a `None` will be returned.
259 pub fn encode_utf16_raw(mut ch
: u32, dst
: &mut [u16]) -> Option
<usize> {
260 // Marked #[inline] so that LLVM can optimize it away
261 if (ch
& 0xFFFF) == ch
&& !dst
.is_empty() {
262 // The BMP falls through (assuming non-surrogate, as it should)
265 } else if dst
.len() >= 2 {
266 // Supplementary planes break into surrogates.
268 dst
[0] = 0xD800 | ((ch
>> 10) as u16);
269 dst
[1] = 0xDC00 | ((ch
as u16) & 0x3FF);
276 /// An iterator over the characters that represent a `char`, as escaped by
277 /// Rust's unicode escaping rules.
279 #[stable(feature = "rust1", since = "1.0.0")]
280 pub struct EscapeUnicode
{
282 state
: EscapeUnicodeState
286 enum EscapeUnicodeState
{
295 #[stable(feature = "rust1", since = "1.0.0")]
296 impl Iterator
for EscapeUnicode
{
299 fn next(&mut self) -> Option
<char> {
301 EscapeUnicodeState
::Backslash
=> {
302 self.state
= EscapeUnicodeState
::Type
;
305 EscapeUnicodeState
::Type
=> {
306 self.state
= EscapeUnicodeState
::LeftBrace
;
309 EscapeUnicodeState
::LeftBrace
=> {
311 while (self.c
as u32) >> (4 * (n
+ 1)) != 0 {
314 self.state
= EscapeUnicodeState
::Value(n
);
317 EscapeUnicodeState
::Value(offset
) => {
318 let v
= match ((self.c
as i32) >> (offset
* 4)) & 0xf {
319 i @
0 ... 9 => '
0'
as i32 + i
,
320 i
=> 'a'
as i32 + (i
- 10)
323 self.state
= EscapeUnicodeState
::RightBrace
;
325 self.state
= EscapeUnicodeState
::Value(offset
- 1);
327 Some(unsafe { transmute(v) }
)
329 EscapeUnicodeState
::RightBrace
=> {
330 self.state
= EscapeUnicodeState
::Done
;
333 EscapeUnicodeState
::Done
=> None
,
338 /// An iterator over the characters that represent a `char`, escaped
339 /// for maximum portability.
341 #[stable(feature = "rust1", since = "1.0.0")]
342 pub struct EscapeDefault
{
343 state
: EscapeDefaultState
347 enum EscapeDefaultState
{
351 Unicode(EscapeUnicode
),
354 #[stable(feature = "rust1", since = "1.0.0")]
355 impl Iterator
for EscapeDefault
{
358 fn next(&mut self) -> Option
<char> {
360 EscapeDefaultState
::Backslash(c
) => {
361 self.state
= EscapeDefaultState
::Char(c
);
364 EscapeDefaultState
::Char(c
) => {
365 self.state
= EscapeDefaultState
::Done
;
368 EscapeDefaultState
::Done
=> None
,
369 EscapeDefaultState
::Unicode(ref mut iter
) => iter
.next()