]>
Commit | Line | Data |
---|---|---|
1a4d82fc JJ |
1 | // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT |
2 | // file at the top-level directory of this distribution and at | |
3 | // http://rust-lang.org/COPYRIGHT. | |
4 | // | |
5 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or | |
6 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license | |
7 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your | |
8 | // option. This file may not be copied, modified, or distributed | |
9 | // except according to those terms. | |
1a4d82fc JJ |
10 | |
11 | //! Unicode-intensive string manipulations. | |
12 | //! | |
b039eaaf SL |
13 | //! This module provides functionality to `str` that requires the Unicode |
14 | //! methods provided by the unicode parts of the CharExt trait. | |
1a4d82fc | 15 | |
1a4d82fc | 16 | use core::char; |
9e0c209e | 17 | use core::iter::{Filter, FusedIterator}; |
1a4d82fc JJ |
18 | use core::str::Split; |
19 | ||
d9579d0f AL |
20 | /// An iterator over the non-whitespace substrings of a string, |
21 | /// separated by any amount of whitespace. | |
22 | #[stable(feature = "split_whitespace", since = "1.1.0")] | |
23 | pub struct SplitWhitespace<'a> { | |
85aaf69f | 24 | inner: Filter<Split<'a, fn(char) -> bool>, fn(&&str) -> bool>, |
1a4d82fc JJ |
25 | } |
26 | ||
27 | /// Methods for Unicode string slices | |
28 | #[allow(missing_docs)] // docs in libcollections | |
29 | pub trait UnicodeStr { | |
d9579d0f | 30 | fn split_whitespace<'a>(&'a self) -> SplitWhitespace<'a>; |
1a4d82fc JJ |
31 | fn is_whitespace(&self) -> bool; |
32 | fn is_alphanumeric(&self) -> bool; | |
7453a54e SL |
33 | fn trim(&self) -> &str; |
34 | fn trim_left(&self) -> &str; | |
35 | fn trim_right(&self) -> &str; | |
1a4d82fc JJ |
36 | } |
37 | ||
38 | impl UnicodeStr for str { | |
d9579d0f AL |
39 | #[inline] |
40 | fn split_whitespace(&self) -> SplitWhitespace { | |
b039eaaf SL |
41 | fn is_not_empty(s: &&str) -> bool { |
42 | !s.is_empty() | |
43 | } | |
1a4d82fc JJ |
44 | let is_not_empty: fn(&&str) -> bool = is_not_empty; // coerce to fn pointer |
45 | ||
b039eaaf SL |
46 | fn is_whitespace(c: char) -> bool { |
47 | c.is_whitespace() | |
48 | } | |
1a4d82fc JJ |
49 | let is_whitespace: fn(char) -> bool = is_whitespace; // coerce to fn pointer |
50 | ||
d9579d0f | 51 | SplitWhitespace { inner: self.split(is_whitespace).filter(is_not_empty) } |
1a4d82fc JJ |
52 | } |
53 | ||
54 | #[inline] | |
b039eaaf SL |
55 | fn is_whitespace(&self) -> bool { |
56 | self.chars().all(|c| c.is_whitespace()) | |
57 | } | |
1a4d82fc JJ |
58 | |
59 | #[inline] | |
b039eaaf SL |
60 | fn is_alphanumeric(&self) -> bool { |
61 | self.chars().all(|c| c.is_alphanumeric()) | |
62 | } | |
1a4d82fc | 63 | |
1a4d82fc JJ |
64 | #[inline] |
65 | fn trim(&self) -> &str { | |
c34b1796 | 66 | self.trim_matches(|c: char| c.is_whitespace()) |
1a4d82fc JJ |
67 | } |
68 | ||
69 | #[inline] | |
70 | fn trim_left(&self) -> &str { | |
85aaf69f | 71 | self.trim_left_matches(|c: char| c.is_whitespace()) |
1a4d82fc JJ |
72 | } |
73 | ||
74 | #[inline] | |
75 | fn trim_right(&self) -> &str { | |
85aaf69f | 76 | self.trim_right_matches(|c: char| c.is_whitespace()) |
1a4d82fc JJ |
77 | } |
78 | } | |
79 | ||
1a4d82fc JJ |
// Length of a UTF-8 sequence, indexed by the value of its first byte.
// An entry of 0 marks a byte that cannot start a sequence.
// https://tools.ietf.org/html/rfc3629
static UTF8_CHAR_WIDTH: [u8; 256] = [
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x00..0x0F
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x10..0x1F
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x20..0x2F
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x30..0x3F
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x40..0x4F
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x50..0x5F
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x60..0x6F
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x70..0x7F
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x80..0x8F
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x90..0x9F
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xA0..0xAF
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xB0..0xBF
    0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xC0..0xCF
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xD0..0xDF
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xE0..0xEF
    4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xF0..0xFF
];

/// Given a first byte, determines how many bytes are in this UTF-8 character.
///
/// The result is 1 to 4 for a byte that can begin a sequence and 0 for any
/// other byte (`0x80..=0xC1` and `0xF5..=0xFF` in the table).
#[inline]
pub fn utf8_char_width(b: u8) -> usize {
    // A `u8` index can never fall outside the 256-entry table.
    let width = UTF8_CHAR_WIDTH[b as usize];
    width as usize
}
105 | ||
/// Determines if a slice of `u16` contains valid UTF-16.
///
/// Returns `true` when every code unit is either a non-surrogate value or
/// belongs to a correctly ordered surrogate pair: a lead unit in
/// `0xD800..0xDBFF` immediately followed by a trail unit in
/// `0xDC00..0xDFFF` (both ranges inclusive). An empty slice is valid.
pub fn is_utf16(v: &[u16]) -> bool {
    let mut units = v.iter();
    while let Some(&unit) = units.next() {
        // A `u16` can never exceed 0x10FFFF, so `char::from_u32` fails only
        // for surrogate values; `Some` means `unit` stands on its own.
        if char::from_u32(unit as u32).is_some() {
            continue;
        }

        // `unit` is a surrogate, so a second code unit has to follow it.
        let trail = match units.next() {
            Some(&t) => t,
            None => return false, // lone surrogate at the end of the input
        };

        // The first unit must be a lead surrogate and the second a trail
        // surrogate. The lead range starts at 0xD800, not 0xD7FF.
        if unit < 0xD800 || unit > 0xDBFF || trail < 0xDC00 || trail > 0xDFFF {
            return false;
        }
    }
    true
}
127 | ||
1a4d82fc JJ |
/// Adaptor that turns an iterator of `char`s into an iterator of UTF-16
/// code units.
#[derive(Clone)]
pub struct Utf16Encoder<I> {
    // Source of the characters that still have to be encoded.
    chars: I,
    // Second code unit of a two-unit encoding whose first unit has already
    // been yielded; 0 means that nothing is pending.
    extra: u16,
}
134 | ||
135 | impl<I> Utf16Encoder<I> { | |
d9579d0f | 136 | /// Create a UTF-16 encoder from any `char` iterator. |
b039eaaf SL |
137 | pub fn new(chars: I) -> Utf16Encoder<I> |
138 | where I: Iterator<Item = char> | |
139 | { | |
140 | Utf16Encoder { | |
141 | chars: chars, | |
142 | extra: 0, | |
143 | } | |
1a4d82fc JJ |
144 | } |
145 | } | |
146 | ||
3157f602 XL |
147 | impl<I> Iterator for Utf16Encoder<I> |
148 | where I: Iterator<Item = char> | |
149 | { | |
1a4d82fc JJ |
150 | type Item = u16; |
151 | ||
152 | #[inline] | |
153 | fn next(&mut self) -> Option<u16> { | |
154 | if self.extra != 0 { | |
155 | let tmp = self.extra; | |
156 | self.extra = 0; | |
157 | return Some(tmp); | |
158 | } | |
159 | ||
c30ab7b3 | 160 | let mut buf = [0; 2]; |
1a4d82fc | 161 | self.chars.next().map(|ch| { |
c30ab7b3 SL |
162 | let n = CharExt::encode_utf16(ch, &mut buf).len(); |
163 | if n == 2 { | |
164 | self.extra = buf[1]; | |
b039eaaf | 165 | } |
c30ab7b3 | 166 | buf[0] |
1a4d82fc JJ |
167 | }) |
168 | } | |
169 | ||
170 | #[inline] | |
85aaf69f | 171 | fn size_hint(&self) -> (usize, Option<usize>) { |
1a4d82fc JJ |
172 | let (low, high) = self.chars.size_hint(); |
173 | // every char gets either one u16 or two u16, | |
174 | // so this iterator is between 1 or 2 times as | |
175 | // long as the underlying iterator. | |
176 | (low, high.and_then(|n| n.checked_mul(2))) | |
177 | } | |
178 | } | |
179 | ||
9e0c209e SL |
180 | #[unstable(feature = "fused", issue = "35602")] |
181 | impl<I> FusedIterator for Utf16Encoder<I> | |
182 | where I: FusedIterator<Item = char> {} | |
183 | ||
c30ab7b3 | 184 | #[stable(feature = "split_whitespace", since = "1.1.0")] |
d9579d0f | 185 | impl<'a> Iterator for SplitWhitespace<'a> { |
1a4d82fc JJ |
186 | type Item = &'a str; |
187 | ||
b039eaaf SL |
188 | fn next(&mut self) -> Option<&'a str> { |
189 | self.inner.next() | |
190 | } | |
1a4d82fc | 191 | } |
c30ab7b3 SL |
192 | |
193 | #[stable(feature = "split_whitespace", since = "1.1.0")] | |
d9579d0f | 194 | impl<'a> DoubleEndedIterator for SplitWhitespace<'a> { |
b039eaaf SL |
195 | fn next_back(&mut self) -> Option<&'a str> { |
196 | self.inner.next_back() | |
197 | } | |
1a4d82fc | 198 | } |
9e0c209e SL |
199 | |
200 | #[unstable(feature = "fused", issue = "35602")] | |
201 | impl<'a> FusedIterator for SplitWhitespace<'a> {} |