]>
Commit | Line | Data |
---|---|---|
1a4d82fc JJ |
1 | // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT |
2 | // file at the top-level directory of this distribution and at | |
3 | // http://rust-lang.org/COPYRIGHT. | |
4 | // | |
5 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or | |
6 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license | |
7 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your | |
8 | // option. This file may not be copied, modified, or distributed | |
9 | // except according to those terms. | |
10 | ||
11 | //! Character manipulation. | |
12 | //! | |
d9579d0f | 13 | //! For more details, see ::rustc_unicode::char (a.k.a. std::char) |
1a4d82fc JJ |
14 | |
15 | #![allow(non_snake_case)] | |
62682a34 | 16 | #![stable(feature = "core_char", since = "1.2.0")] |
1a4d82fc JJ |
17 | |
18 | use iter::Iterator; | |
19 | use mem::transmute; | |
20 | use option::Option::{None, Some}; | |
21 | use option::Option; | |
22 | use slice::SliceExt; | |
23 | ||
// Building blocks for the UTF-8 encoder: the tag bits OR-ed into each output
// byte, and the exclusive upper bounds of the code points that fit in one,
// two and three bytes.
const TAG_CONT: u8 = 0x80; // 0b1000_0000, tag of a continuation byte
const TAG_TWO_B: u8 = 0xC0; // 0b1100_0000, tag of the first of two bytes
const TAG_THREE_B: u8 = 0xE0; // 0b1110_0000, tag of the first of three bytes
const TAG_FOUR_B: u8 = 0xF0; // 0b1111_0000, tag of the first of four bytes
const MAX_ONE_B: u32 = 1 << 7; // 0x80
const MAX_TWO_B: u32 = 1 << 11; // 0x800
const MAX_THREE_B: u32 = 1 << 16; // 0x10000
1a4d82fc JJ |
32 | |
33 | /* | |
34 | Lu Uppercase_Letter an uppercase letter | |
35 | Ll Lowercase_Letter a lowercase letter | |
36 | Lt Titlecase_Letter a digraphic character, with first part uppercase | |
37 | Lm Modifier_Letter a modifier letter | |
38 | Lo Other_Letter other letters, including syllables and ideographs | |
39 | Mn Nonspacing_Mark a nonspacing combining mark (zero advance width) | |
40 | Mc Spacing_Mark a spacing combining mark (positive advance width) | |
41 | Me Enclosing_Mark an enclosing combining mark | |
42 | Nd Decimal_Number a decimal digit | |
43 | Nl Letter_Number a letterlike numeric character | |
44 | No Other_Number a numeric character of other type | |
45 | Pc Connector_Punctuation a connecting punctuation mark, like a tie | |
46 | Pd Dash_Punctuation a dash or hyphen punctuation mark | |
47 | Ps Open_Punctuation an opening punctuation mark (of a pair) | |
48 | Pe Close_Punctuation a closing punctuation mark (of a pair) | |
49 | Pi Initial_Punctuation an initial quotation mark | |
50 | Pf Final_Punctuation a final quotation mark | |
51 | Po Other_Punctuation a punctuation mark of other type | |
52 | Sm Math_Symbol a symbol of primarily mathematical use | |
53 | Sc Currency_Symbol a currency sign | |
54 | Sk Modifier_Symbol a non-letterlike modifier symbol | |
55 | So Other_Symbol a symbol of other type | |
56 | Zs Space_Separator a space character (of various non-zero widths) | |
57 | Zl Line_Separator U+2028 LINE SEPARATOR only | |
58 | Zp Paragraph_Separator U+2029 PARAGRAPH SEPARATOR only | |
59 | Cc Control a C0 or C1 control code | |
60 | Cf Format a format control character | |
61 | Cs Surrogate a surrogate code point | |
62 | Co Private_Use a private-use character | |
63 | Cn Unassigned a reserved unassigned code point or a noncharacter | |
64 | */ | |
65 | ||
66 | /// The highest valid code point | |
85aaf69f | 67 | #[stable(feature = "rust1", since = "1.0.0")] |
1a4d82fc JJ |
68 | pub const MAX: char = '\u{10ffff}'; |
69 | ||
85aaf69f SL |
70 | /// Converts a `u32` to an `Option<char>`. |
71 | /// | |
72 | /// # Examples | |
73 | /// | |
74 | /// ``` | |
75 | /// use std::char; | |
76 | /// | |
62682a34 SL |
77 | /// assert_eq!(char::from_u32(0x2764), Some('❤')); |
78 | /// assert_eq!(char::from_u32(0x110000), None); // invalid character | |
85aaf69f | 79 | /// ``` |
1a4d82fc | 80 | #[inline] |
85aaf69f | 81 | #[stable(feature = "rust1", since = "1.0.0")] |
1a4d82fc JJ |
82 | pub fn from_u32(i: u32) -> Option<char> { |
83 | // catch out-of-bounds and surrogates | |
84 | if (i > MAX as u32) || (i >= 0xD800 && i <= 0xDFFF) { | |
85 | None | |
86 | } else { | |
c1a9b12d | 87 | Some(unsafe { from_u32_unchecked(i) }) |
1a4d82fc JJ |
88 | } |
89 | } | |
90 | ||
c1a9b12d SL |
91 | /// Converts a `u32` to an `char`, not checking whether it is a valid unicode |
92 | /// codepoint. | |
93 | #[inline] | |
94 | #[unstable(feature = "char_from_unchecked", reason = "recently added API")] | |
95 | pub unsafe fn from_u32_unchecked(i: u32) -> char { | |
96 | transmute(i) | |
97 | } | |
98 | ||
85aaf69f | 99 | /// Converts a number to the character representing it. |
1a4d82fc JJ |
100 | /// |
101 | /// # Return value | |
102 | /// | |
103 | /// Returns `Some(char)` if `num` represents one digit under `radix`, | |
104 | /// using one character of `0-9` or `a-z`, or `None` if it doesn't. | |
105 | /// | |
106 | /// # Panics | |
107 | /// | |
108 | /// Panics if given an `radix` > 36. | |
109 | /// | |
85aaf69f SL |
110 | /// # Examples |
111 | /// | |
112 | /// ``` | |
113 | /// use std::char; | |
114 | /// | |
115 | /// let c = char::from_digit(4, 10); | |
116 | /// | |
117 | /// assert_eq!(c, Some('4')); | |
118 | /// ``` | |
1a4d82fc | 119 | #[inline] |
c34b1796 | 120 | #[stable(feature = "rust1", since = "1.0.0")] |
85aaf69f | 121 | pub fn from_digit(num: u32, radix: u32) -> Option<char> { |
1a4d82fc JJ |
122 | if radix > 36 { |
123 | panic!("from_digit: radix is too high (maximum 36)"); | |
124 | } | |
125 | if num < radix { | |
c1a9b12d SL |
126 | let num = num as u8; |
127 | if num < 10 { | |
128 | Some((b'0' + num) as char) | |
129 | } else { | |
130 | Some((b'a' + num - 10) as char) | |
1a4d82fc JJ |
131 | } |
132 | } else { | |
133 | None | |
134 | } | |
135 | } | |
136 | ||
c34b1796 AL |
137 | // NB: the stabilization and documentation for this trait is in |
138 | // unicode/char.rs, not here | |
139 | #[allow(missing_docs)] // docs in libunicode/u_char.rs | |
9346a6ac | 140 | #[doc(hidden)] |
62682a34 SL |
141 | #[unstable(feature = "core_char_ext", |
142 | reason = "the stable interface is `impl char` in later crate")] | |
1a4d82fc | 143 | pub trait CharExt { |
85aaf69f | 144 | fn is_digit(self, radix: u32) -> bool; |
85aaf69f | 145 | fn to_digit(self, radix: u32) -> Option<u32>; |
1a4d82fc | 146 | fn escape_unicode(self) -> EscapeUnicode; |
1a4d82fc | 147 | fn escape_default(self) -> EscapeDefault; |
85aaf69f | 148 | fn len_utf8(self) -> usize; |
85aaf69f | 149 | fn len_utf16(self) -> usize; |
85aaf69f | 150 | fn encode_utf8(self, dst: &mut [u8]) -> Option<usize>; |
85aaf69f | 151 | fn encode_utf16(self, dst: &mut [u16]) -> Option<usize>; |
1a4d82fc JJ |
152 | } |
153 | ||
1a4d82fc | 154 | impl CharExt for char { |
62682a34 | 155 | #[inline] |
85aaf69f | 156 | fn is_digit(self, radix: u32) -> bool { |
1a4d82fc JJ |
157 | self.to_digit(radix).is_some() |
158 | } | |
159 | ||
62682a34 | 160 | #[inline] |
85aaf69f | 161 | fn to_digit(self, radix: u32) -> Option<u32> { |
1a4d82fc JJ |
162 | if radix > 36 { |
163 | panic!("to_digit: radix is too high (maximum 36)"); | |
164 | } | |
165 | let val = match self { | |
85aaf69f SL |
166 | '0' ... '9' => self as u32 - '0' as u32, |
167 | 'a' ... 'z' => self as u32 - 'a' as u32 + 10, | |
168 | 'A' ... 'Z' => self as u32 - 'A' as u32 + 10, | |
1a4d82fc JJ |
169 | _ => return None, |
170 | }; | |
171 | if val < radix { Some(val) } | |
172 | else { None } | |
173 | } | |
174 | ||
62682a34 | 175 | #[inline] |
1a4d82fc JJ |
176 | fn escape_unicode(self) -> EscapeUnicode { |
177 | EscapeUnicode { c: self, state: EscapeUnicodeState::Backslash } | |
178 | } | |
179 | ||
62682a34 | 180 | #[inline] |
1a4d82fc JJ |
181 | fn escape_default(self) -> EscapeDefault { |
182 | let init_state = match self { | |
183 | '\t' => EscapeDefaultState::Backslash('t'), | |
184 | '\r' => EscapeDefaultState::Backslash('r'), | |
185 | '\n' => EscapeDefaultState::Backslash('n'), | |
186 | '\\' => EscapeDefaultState::Backslash('\\'), | |
187 | '\'' => EscapeDefaultState::Backslash('\''), | |
188 | '"' => EscapeDefaultState::Backslash('"'), | |
189 | '\x20' ... '\x7e' => EscapeDefaultState::Char(self), | |
190 | _ => EscapeDefaultState::Unicode(self.escape_unicode()) | |
191 | }; | |
192 | EscapeDefault { state: init_state } | |
193 | } | |
194 | ||
195 | #[inline] | |
85aaf69f | 196 | fn len_utf8(self) -> usize { |
1a4d82fc | 197 | let code = self as u32; |
c34b1796 AL |
198 | if code < MAX_ONE_B { |
199 | 1 | |
200 | } else if code < MAX_TWO_B { | |
201 | 2 | |
202 | } else if code < MAX_THREE_B { | |
203 | 3 | |
204 | } else { | |
205 | 4 | |
1a4d82fc JJ |
206 | } |
207 | } | |
208 | ||
209 | #[inline] | |
85aaf69f | 210 | fn len_utf16(self) -> usize { |
1a4d82fc | 211 | let ch = self as u32; |
c34b1796 | 212 | if (ch & 0xFFFF) == ch { 1 } else { 2 } |
1a4d82fc JJ |
213 | } |
214 | ||
215 | #[inline] | |
85aaf69f SL |
216 | fn encode_utf8(self, dst: &mut [u8]) -> Option<usize> { |
217 | encode_utf8_raw(self as u32, dst) | |
1a4d82fc JJ |
218 | } |
219 | ||
220 | #[inline] | |
85aaf69f SL |
221 | fn encode_utf16(self, dst: &mut [u16]) -> Option<usize> { |
222 | encode_utf16_raw(self as u32, dst) | |
223 | } | |
224 | } | |
225 | ||
226 | /// Encodes a raw u32 value as UTF-8 into the provided byte buffer, | |
227 | /// and then returns the number of bytes written. | |
228 | /// | |
229 | /// If the buffer is not large enough, nothing will be written into it | |
230 | /// and a `None` will be returned. | |
231 | #[inline] | |
62682a34 SL |
232 | #[unstable(feature = "char_internals", |
233 | reason = "this function should not be exposed publicly")] | |
234 | #[doc(hidden)] | |
85aaf69f SL |
235 | pub fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> Option<usize> { |
236 | // Marked #[inline] to allow llvm optimizing it away | |
9346a6ac | 237 | if code < MAX_ONE_B && !dst.is_empty() { |
85aaf69f SL |
238 | dst[0] = code as u8; |
239 | Some(1) | |
240 | } else if code < MAX_TWO_B && dst.len() >= 2 { | |
c34b1796 AL |
241 | dst[0] = (code >> 6 & 0x1F) as u8 | TAG_TWO_B; |
242 | dst[1] = (code & 0x3F) as u8 | TAG_CONT; | |
85aaf69f SL |
243 | Some(2) |
244 | } else if code < MAX_THREE_B && dst.len() >= 3 { | |
c34b1796 AL |
245 | dst[0] = (code >> 12 & 0x0F) as u8 | TAG_THREE_B; |
246 | dst[1] = (code >> 6 & 0x3F) as u8 | TAG_CONT; | |
247 | dst[2] = (code & 0x3F) as u8 | TAG_CONT; | |
85aaf69f SL |
248 | Some(3) |
249 | } else if dst.len() >= 4 { | |
c34b1796 AL |
250 | dst[0] = (code >> 18 & 0x07) as u8 | TAG_FOUR_B; |
251 | dst[1] = (code >> 12 & 0x3F) as u8 | TAG_CONT; | |
252 | dst[2] = (code >> 6 & 0x3F) as u8 | TAG_CONT; | |
253 | dst[3] = (code & 0x3F) as u8 | TAG_CONT; | |
85aaf69f SL |
254 | Some(4) |
255 | } else { | |
256 | None | |
257 | } | |
258 | } | |
259 | ||
260 | /// Encodes a raw u32 value as UTF-16 into the provided `u16` buffer, | |
261 | /// and then returns the number of `u16`s written. | |
262 | /// | |
263 | /// If the buffer is not large enough, nothing will be written into it | |
264 | /// and a `None` will be returned. | |
265 | #[inline] | |
62682a34 SL |
266 | #[unstable(feature = "char_internals", |
267 | reason = "this function should not be exposed publicly")] | |
268 | #[doc(hidden)] | |
85aaf69f SL |
269 | pub fn encode_utf16_raw(mut ch: u32, dst: &mut [u16]) -> Option<usize> { |
270 | // Marked #[inline] to allow llvm optimizing it away | |
9346a6ac | 271 | if (ch & 0xFFFF) == ch && !dst.is_empty() { |
85aaf69f SL |
272 | // The BMP falls through (assuming non-surrogate, as it should) |
273 | dst[0] = ch as u16; | |
274 | Some(1) | |
275 | } else if dst.len() >= 2 { | |
276 | // Supplementary planes break into surrogates. | |
c34b1796 AL |
277 | ch -= 0x1_0000; |
278 | dst[0] = 0xD800 | ((ch >> 10) as u16); | |
279 | dst[1] = 0xDC00 | ((ch as u16) & 0x3FF); | |
85aaf69f SL |
280 | Some(2) |
281 | } else { | |
282 | None | |
1a4d82fc JJ |
283 | } |
284 | } | |
285 | ||
286 | /// An iterator over the characters that represent a `char`, as escaped by | |
287 | /// Rust's unicode escaping rules. | |
288 | #[derive(Clone)] | |
85aaf69f | 289 | #[stable(feature = "rust1", since = "1.0.0")] |
1a4d82fc JJ |
290 | pub struct EscapeUnicode { |
291 | c: char, | |
292 | state: EscapeUnicodeState | |
293 | } | |
294 | ||
// Position of an `EscapeUnicode` iterator within its output; each variant
// names the piece that is emitted next.
#[derive(Clone)]
enum EscapeUnicodeState {
    // the leading `\`
    Backslash,
    // the `u`
    Type,
    // the `{`
    LeftBrace,
    // a hex digit; the payload is that digit's index counted from the least
    // significant one
    Value(usize),
    // the `}`
    RightBrace,
    // nothing left to emit
    Done,
}
304 | ||
85aaf69f | 305 | #[stable(feature = "rust1", since = "1.0.0")] |
1a4d82fc JJ |
306 | impl Iterator for EscapeUnicode { |
307 | type Item = char; | |
308 | ||
309 | fn next(&mut self) -> Option<char> { | |
310 | match self.state { | |
311 | EscapeUnicodeState::Backslash => { | |
312 | self.state = EscapeUnicodeState::Type; | |
313 | Some('\\') | |
314 | } | |
315 | EscapeUnicodeState::Type => { | |
316 | self.state = EscapeUnicodeState::LeftBrace; | |
317 | Some('u') | |
318 | } | |
319 | EscapeUnicodeState::LeftBrace => { | |
85aaf69f | 320 | let mut n = 0; |
1a4d82fc JJ |
321 | while (self.c as u32) >> (4 * (n + 1)) != 0 { |
322 | n += 1; | |
323 | } | |
324 | self.state = EscapeUnicodeState::Value(n); | |
325 | Some('{') | |
326 | } | |
327 | EscapeUnicodeState::Value(offset) => { | |
c1a9b12d | 328 | let c = from_digit(((self.c as u32) >> (offset * 4)) & 0xf, 16).unwrap(); |
1a4d82fc JJ |
329 | if offset == 0 { |
330 | self.state = EscapeUnicodeState::RightBrace; | |
331 | } else { | |
332 | self.state = EscapeUnicodeState::Value(offset - 1); | |
333 | } | |
c1a9b12d | 334 | Some(c) |
1a4d82fc JJ |
335 | } |
336 | EscapeUnicodeState::RightBrace => { | |
337 | self.state = EscapeUnicodeState::Done; | |
338 | Some('}') | |
339 | } | |
340 | EscapeUnicodeState::Done => None, | |
341 | } | |
342 | } | |
343 | } | |
344 | ||
345 | /// An iterator over the characters that represent a `char`, escaped | |
346 | /// for maximum portability. | |
347 | #[derive(Clone)] | |
85aaf69f | 348 | #[stable(feature = "rust1", since = "1.0.0")] |
1a4d82fc JJ |
349 | pub struct EscapeDefault { |
350 | state: EscapeDefaultState | |
351 | } | |
352 | ||
353 | #[derive(Clone)] | |
1a4d82fc JJ |
354 | enum EscapeDefaultState { |
355 | Backslash(char), | |
356 | Char(char), | |
357 | Done, | |
358 | Unicode(EscapeUnicode), | |
359 | } | |
360 | ||
85aaf69f | 361 | #[stable(feature = "rust1", since = "1.0.0")] |
1a4d82fc JJ |
362 | impl Iterator for EscapeDefault { |
363 | type Item = char; | |
364 | ||
365 | fn next(&mut self) -> Option<char> { | |
366 | match self.state { | |
367 | EscapeDefaultState::Backslash(c) => { | |
368 | self.state = EscapeDefaultState::Char(c); | |
369 | Some('\\') | |
370 | } | |
371 | EscapeDefaultState::Char(c) => { | |
372 | self.state = EscapeDefaultState::Done; | |
373 | Some(c) | |
374 | } | |
375 | EscapeDefaultState::Done => None, | |
376 | EscapeDefaultState::Unicode(ref mut iter) => iter.next() | |
377 | } | |
378 | } | |
379 | } |