]>
Commit | Line | Data |
---|---|---|
1a4d82fc JJ |
1 | // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT |
2 | // file at the top-level directory of this distribution and at | |
3 | // http://rust-lang.org/COPYRIGHT. | |
4 | // | |
5 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or | |
6 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license | |
7 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your | |
8 | // option. This file may not be copied, modified, or distributed | |
9 | // except according to those terms. | |
1a4d82fc JJ |
10 | |
11 | //! Unicode-intensive string manipulations. | |
12 | //! | |
b039eaaf SL |
13 | //! This module provides functionality to `str` that requires the Unicode |
14 | //! methods provided by the unicode parts of the CharExt trait. | |
1a4d82fc | 15 | |
e9174d1e | 16 | use char::{DecodeUtf16, decode_utf16}; |
1a4d82fc | 17 | use core::char; |
e9174d1e | 18 | use core::iter::{Cloned, Filter}; |
1a4d82fc JJ |
19 | use core::slice; |
20 | use core::str::Split; | |
21 | ||
d9579d0f AL |
22 | /// An iterator over the non-whitespace substrings of a string, |
23 | /// separated by any amount of whitespace. | |
24 | #[stable(feature = "split_whitespace", since = "1.1.0")] | |
25 | pub struct SplitWhitespace<'a> { | |
85aaf69f | 26 | inner: Filter<Split<'a, fn(char) -> bool>, fn(&&str) -> bool>, |
1a4d82fc JJ |
27 | } |
28 | ||
29 | /// Methods for Unicode string slices | |
30 | #[allow(missing_docs)] // docs in libcollections | |
31 | pub trait UnicodeStr { | |
d9579d0f | 32 | fn split_whitespace<'a>(&'a self) -> SplitWhitespace<'a>; |
1a4d82fc JJ |
33 | fn is_whitespace(&self) -> bool; |
34 | fn is_alphanumeric(&self) -> bool; | |
1a4d82fc JJ |
35 | fn trim<'a>(&'a self) -> &'a str; |
36 | fn trim_left<'a>(&'a self) -> &'a str; | |
37 | fn trim_right<'a>(&'a self) -> &'a str; | |
38 | } | |
39 | ||
40 | impl UnicodeStr for str { | |
d9579d0f AL |
41 | #[inline] |
42 | fn split_whitespace(&self) -> SplitWhitespace { | |
b039eaaf SL |
43 | fn is_not_empty(s: &&str) -> bool { |
44 | !s.is_empty() | |
45 | } | |
1a4d82fc JJ |
46 | let is_not_empty: fn(&&str) -> bool = is_not_empty; // coerce to fn pointer |
47 | ||
b039eaaf SL |
48 | fn is_whitespace(c: char) -> bool { |
49 | c.is_whitespace() | |
50 | } | |
1a4d82fc JJ |
51 | let is_whitespace: fn(char) -> bool = is_whitespace; // coerce to fn pointer |
52 | ||
d9579d0f | 53 | SplitWhitespace { inner: self.split(is_whitespace).filter(is_not_empty) } |
1a4d82fc JJ |
54 | } |
55 | ||
56 | #[inline] | |
b039eaaf SL |
57 | fn is_whitespace(&self) -> bool { |
58 | self.chars().all(|c| c.is_whitespace()) | |
59 | } | |
1a4d82fc JJ |
60 | |
61 | #[inline] | |
b039eaaf SL |
62 | fn is_alphanumeric(&self) -> bool { |
63 | self.chars().all(|c| c.is_alphanumeric()) | |
64 | } | |
1a4d82fc | 65 | |
1a4d82fc JJ |
66 | #[inline] |
67 | fn trim(&self) -> &str { | |
c34b1796 | 68 | self.trim_matches(|c: char| c.is_whitespace()) |
1a4d82fc JJ |
69 | } |
70 | ||
71 | #[inline] | |
72 | fn trim_left(&self) -> &str { | |
85aaf69f | 73 | self.trim_left_matches(|c: char| c.is_whitespace()) |
1a4d82fc JJ |
74 | } |
75 | ||
76 | #[inline] | |
77 | fn trim_right(&self) -> &str { | |
85aaf69f | 78 | self.trim_right_matches(|c: char| c.is_whitespace()) |
1a4d82fc JJ |
79 | } |
80 | } | |
81 | ||
1a4d82fc JJ |
// Length in bytes of the UTF-8 sequence introduced by each possible first
// byte; see https://tools.ietf.org/html/rfc3629.
//
// An entry of 0 marks a byte that cannot start a sequence: the continuation
// bytes 0x80-0xBF, the lead bytes 0xC0 and 0xC1, and everything from 0xF5 up.
static UTF8_CHAR_WIDTH: [u8; 256] = [
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
    0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
    4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
];

/// Given a first byte, determine how many bytes are in this UTF-8 character.
///
/// Returns a width between 1 and 4, or 0 when `b` cannot be the first byte of
/// a UTF-8 sequence.
#[inline]
pub fn utf8_char_width(b: u8) -> usize {
    // A `u8` index is always in bounds for the 256-entry table, and widening
    // the stored `u8` to `usize` is lossless.
    UTF8_CHAR_WIDTH[b as usize] as usize
}
107 | ||
/// Determines if a vector of `u16` contains valid UTF-16.
///
/// The input is valid when every code unit is either a non-surrogate value or
/// part of a well-formed pair: a high surrogate (0xD800-0xDBFF) immediately
/// followed by a low surrogate (0xDC00-0xDFFF). An empty slice is valid.
pub fn is_utf16(v: &[u16]) -> bool {
    let mut units = v.iter().cloned();
    while let Some(u) = units.next() {
        // Anything outside the surrogate range is a complete scalar value.
        if u < 0xD800 || u > 0xDFFF {
            continue;
        }
        // A low surrogate may only appear as the second half of a pair.
        if u > 0xDBFF {
            return false;
        }
        // `u` is a high surrogate: the next unit must exist and be a low
        // surrogate, otherwise the pair is broken or truncated.
        match units.next() {
            Some(u2) if u2 >= 0xDC00 && u2 <= 0xDFFF => {}
            _ => return false,
        }
    }
    true
}
129 | ||
130 | /// An iterator that decodes UTF-16 encoded codepoints from a vector | |
131 | /// of `u16`s. | |
92a42be0 | 132 | #[rustc_deprecated(since = "1.4.0", reason = "renamed to `char::DecodeUtf16`")] |
e9174d1e SL |
133 | #[unstable(feature = "decode_utf16", reason = "not exposed in std", issue = "27830")] |
134 | #[allow(deprecated)] | |
1a4d82fc JJ |
135 | #[derive(Clone)] |
136 | pub struct Utf16Items<'a> { | |
b039eaaf | 137 | decoder: DecodeUtf16<Cloned<slice::Iter<'a, u16>>>, |
1a4d82fc | 138 | } |
e9174d1e | 139 | |
1a4d82fc | 140 | /// The possibilities for values decoded from a `u16` stream. |
92a42be0 SL |
141 | #[rustc_deprecated(since = "1.4.0", |
142 | reason = "`char::DecodeUtf16` uses `Result<char, u16>` instead")] | |
e9174d1e SL |
143 | #[unstable(feature = "decode_utf16", reason = "not exposed in std", issue = "27830")] |
144 | #[allow(deprecated)] | |
85aaf69f | 145 | #[derive(Copy, PartialEq, Eq, Clone, Debug)] |
1a4d82fc JJ |
146 | pub enum Utf16Item { |
147 | /// A valid codepoint. | |
148 | ScalarValue(char), | |
149 | /// An invalid surrogate without its pair. | |
b039eaaf | 150 | LoneSurrogate(u16), |
1a4d82fc JJ |
151 | } |
152 | ||
e9174d1e | 153 | #[allow(deprecated)] |
1a4d82fc JJ |
154 | impl Utf16Item { |
155 | /// Convert `self` to a `char`, taking `LoneSurrogate`s to the | |
156 | /// replacement character (U+FFFD). | |
157 | #[inline] | |
158 | pub fn to_char_lossy(&self) -> char { | |
159 | match *self { | |
160 | Utf16Item::ScalarValue(c) => c, | |
b039eaaf | 161 | Utf16Item::LoneSurrogate(_) => '\u{FFFD}', |
1a4d82fc JJ |
162 | } |
163 | } | |
164 | } | |
165 | ||
92a42be0 | 166 | #[rustc_deprecated(since = "1.4.0", reason = "use `char::DecodeUtf16` instead")] |
e9174d1e SL |
167 | #[unstable(feature = "decode_utf16", reason = "not exposed in std", issue = "27830")] |
168 | #[allow(deprecated)] | |
1a4d82fc JJ |
169 | impl<'a> Iterator for Utf16Items<'a> { |
170 | type Item = Utf16Item; | |
171 | ||
172 | fn next(&mut self) -> Option<Utf16Item> { | |
b039eaaf SL |
173 | self.decoder.next().map(|result| { |
174 | match result { | |
175 | Ok(c) => Utf16Item::ScalarValue(c), | |
176 | Err(s) => Utf16Item::LoneSurrogate(s), | |
177 | } | |
e9174d1e | 178 | }) |
1a4d82fc JJ |
179 | } |
180 | ||
181 | #[inline] | |
85aaf69f | 182 | fn size_hint(&self) -> (usize, Option<usize>) { |
e9174d1e | 183 | self.decoder.size_hint() |
1a4d82fc JJ |
184 | } |
185 | } | |
186 | ||
187 | /// Create an iterator over the UTF-16 encoded codepoints in `v`, | |
188 | /// returning invalid surrogates as `LoneSurrogate`s. | |
189 | /// | |
c34b1796 AL |
190 | /// # Examples |
191 | /// | |
192 | /// ``` | |
e9174d1e | 193 | /// #![feature(unicode, decode_utf16)] |
92a42be0 | 194 | /// # #![allow(deprecated)] |
c1a9b12d | 195 | /// |
d9579d0f | 196 | /// extern crate rustc_unicode; |
1a4d82fc | 197 | /// |
d9579d0f | 198 | /// use rustc_unicode::str::Utf16Item::{ScalarValue, LoneSurrogate}; |
1a4d82fc | 199 | /// |
c34b1796 AL |
200 | /// fn main() { |
201 | /// // 𝄞mus<invalid>ic<invalid> | |
202 | /// let v = [0xD834, 0xDD1E, 0x006d, 0x0075, | |
203 | /// 0x0073, 0xDD1E, 0x0069, 0x0063, | |
204 | /// 0xD834]; | |
1a4d82fc | 205 | /// |
d9579d0f | 206 | /// assert_eq!(rustc_unicode::str::utf16_items(&v).collect::<Vec<_>>(), |
c34b1796 AL |
207 | /// vec![ScalarValue('𝄞'), |
208 | /// ScalarValue('m'), ScalarValue('u'), ScalarValue('s'), | |
209 | /// LoneSurrogate(0xDD1E), | |
210 | /// ScalarValue('i'), ScalarValue('c'), | |
211 | /// LoneSurrogate(0xD834)]); | |
212 | /// } | |
1a4d82fc | 213 | /// ``` |
92a42be0 | 214 | #[rustc_deprecated(since = "1.4.0", reason = "renamed to `char::decode_utf16`")] |
e9174d1e SL |
215 | #[unstable(feature = "decode_utf16", reason = "not exposed in std", issue = "27830")] |
216 | #[allow(deprecated)] | |
1a4d82fc | 217 | pub fn utf16_items<'a>(v: &'a [u16]) -> Utf16Items<'a> { |
e9174d1e | 218 | Utf16Items { decoder: decode_utf16(v.iter().cloned()) } |
1a4d82fc JJ |
219 | } |
220 | ||
221 | /// Iterator adaptor for encoding `char`s to UTF-16. | |
222 | #[derive(Clone)] | |
223 | pub struct Utf16Encoder<I> { | |
224 | chars: I, | |
b039eaaf | 225 | extra: u16, |
1a4d82fc JJ |
226 | } |
227 | ||
228 | impl<I> Utf16Encoder<I> { | |
d9579d0f | 229 | /// Create a UTF-16 encoder from any `char` iterator. |
b039eaaf SL |
230 | pub fn new(chars: I) -> Utf16Encoder<I> |
231 | where I: Iterator<Item = char> | |
232 | { | |
233 | Utf16Encoder { | |
234 | chars: chars, | |
235 | extra: 0, | |
236 | } | |
1a4d82fc JJ |
237 | } |
238 | } | |
239 | ||
240 | impl<I> Iterator for Utf16Encoder<I> where I: Iterator<Item=char> { | |
241 | type Item = u16; | |
242 | ||
243 | #[inline] | |
244 | fn next(&mut self) -> Option<u16> { | |
245 | if self.extra != 0 { | |
246 | let tmp = self.extra; | |
247 | self.extra = 0; | |
248 | return Some(tmp); | |
249 | } | |
250 | ||
c34b1796 | 251 | let mut buf = [0; 2]; |
1a4d82fc | 252 | self.chars.next().map(|ch| { |
85aaf69f | 253 | let n = CharExt::encode_utf16(ch, &mut buf).unwrap_or(0); |
b039eaaf SL |
254 | if n == 2 { |
255 | self.extra = buf[1]; | |
256 | } | |
1a4d82fc JJ |
257 | buf[0] |
258 | }) | |
259 | } | |
260 | ||
261 | #[inline] | |
85aaf69f | 262 | fn size_hint(&self) -> (usize, Option<usize>) { |
1a4d82fc JJ |
263 | let (low, high) = self.chars.size_hint(); |
264 | // every char gets either one u16 or two u16, | |
265 | // so this iterator is between 1 or 2 times as | |
266 | // long as the underlying iterator. | |
267 | (low, high.and_then(|n| n.checked_mul(2))) | |
268 | } | |
269 | } | |
270 | ||
d9579d0f | 271 | impl<'a> Iterator for SplitWhitespace<'a> { |
1a4d82fc JJ |
272 | type Item = &'a str; |
273 | ||
b039eaaf SL |
274 | fn next(&mut self) -> Option<&'a str> { |
275 | self.inner.next() | |
276 | } | |
1a4d82fc | 277 | } |
d9579d0f | 278 | impl<'a> DoubleEndedIterator for SplitWhitespace<'a> { |
b039eaaf SL |
279 | fn next_back(&mut self) -> Option<&'a str> { |
280 | self.inner.next_back() | |
281 | } | |
1a4d82fc | 282 | } |