]> git.proxmox.com Git - rustc.git/blame - src/librustc_unicode/u_str.rs
Imported Upstream version 1.6.0+dfsg1
[rustc.git] / src / librustc_unicode / u_str.rs
CommitLineData
1a4d82fc
JJ
1// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2// file at the top-level directory of this distribution and at
3// http://rust-lang.org/COPYRIGHT.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
1a4d82fc
JJ
10
11//! Unicode-intensive string manipulations.
12//!
b039eaaf
SL
13//! This module provides functionality to `str` that requires the Unicode
14//! methods provided by the unicode parts of the CharExt trait.
1a4d82fc 15
e9174d1e 16use char::{DecodeUtf16, decode_utf16};
1a4d82fc 17use core::char;
e9174d1e 18use core::iter::{Cloned, Filter};
1a4d82fc
JJ
19use core::slice;
20use core::str::Split;
21
d9579d0f
AL
22/// An iterator over the non-whitespace substrings of a string,
23/// separated by any amount of whitespace.
24#[stable(feature = "split_whitespace", since = "1.1.0")]
25pub struct SplitWhitespace<'a> {
85aaf69f 26 inner: Filter<Split<'a, fn(char) -> bool>, fn(&&str) -> bool>,
1a4d82fc
JJ
27}
28
29/// Methods for Unicode string slices
30#[allow(missing_docs)] // docs in libcollections
31pub trait UnicodeStr {
d9579d0f 32 fn split_whitespace<'a>(&'a self) -> SplitWhitespace<'a>;
1a4d82fc
JJ
33 fn is_whitespace(&self) -> bool;
34 fn is_alphanumeric(&self) -> bool;
1a4d82fc
JJ
35 fn trim<'a>(&'a self) -> &'a str;
36 fn trim_left<'a>(&'a self) -> &'a str;
37 fn trim_right<'a>(&'a self) -> &'a str;
38}
39
40impl UnicodeStr for str {
d9579d0f
AL
41 #[inline]
42 fn split_whitespace(&self) -> SplitWhitespace {
b039eaaf
SL
43 fn is_not_empty(s: &&str) -> bool {
44 !s.is_empty()
45 }
1a4d82fc
JJ
46 let is_not_empty: fn(&&str) -> bool = is_not_empty; // coerce to fn pointer
47
b039eaaf
SL
48 fn is_whitespace(c: char) -> bool {
49 c.is_whitespace()
50 }
1a4d82fc
JJ
51 let is_whitespace: fn(char) -> bool = is_whitespace; // coerce to fn pointer
52
d9579d0f 53 SplitWhitespace { inner: self.split(is_whitespace).filter(is_not_empty) }
1a4d82fc
JJ
54 }
55
56 #[inline]
b039eaaf
SL
57 fn is_whitespace(&self) -> bool {
58 self.chars().all(|c| c.is_whitespace())
59 }
1a4d82fc
JJ
60
61 #[inline]
b039eaaf
SL
62 fn is_alphanumeric(&self) -> bool {
63 self.chars().all(|c| c.is_alphanumeric())
64 }
1a4d82fc 65
1a4d82fc
JJ
66 #[inline]
67 fn trim(&self) -> &str {
c34b1796 68 self.trim_matches(|c: char| c.is_whitespace())
1a4d82fc
JJ
69 }
70
71 #[inline]
72 fn trim_left(&self) -> &str {
85aaf69f 73 self.trim_left_matches(|c: char| c.is_whitespace())
1a4d82fc
JJ
74 }
75
76 #[inline]
77 fn trim_right(&self) -> &str {
85aaf69f 78 self.trim_right_matches(|c: char| c.is_whitespace())
1a4d82fc
JJ
79 }
80}
81
1a4d82fc
JJ
// Length in bytes of the UTF-8 sequence introduced by each possible first
// byte. An entry of 0 marks a byte that cannot start a sequence.
// https://tools.ietf.org/html/rfc3629
static UTF8_CHAR_WIDTH: [u8; 256] = [
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
];

/// Given a first byte, determine how many bytes are in this UTF-8 character.
///
/// The result is 1 for `0x00..0x7F`, 2 for `0xC2..0xDF`, 3 for `0xE0..0xEF`
/// and 4 for `0xF0..0xF4` (all ranges inclusive). Every other value of `b`
/// (`0x80..0xC1` and `0xF5..0xFF`) yields 0, so callers must be prepared
/// for a zero width.
#[inline]
pub fn utf8_char_width(b: u8) -> usize {
    // `b as usize` is at most 255, so the index is always within the table.
    UTF8_CHAR_WIDTH[b as usize] as usize
}
107
/// Determines if a vector of `u16` contains valid UTF-16.
///
/// Returns `true` when every code unit in `v` is either a non-surrogate
/// value, or a leading surrogate (`0xD800..0xDBFF`) immediately followed by
/// a trailing surrogate (`0xDC00..0xDFFF`). An empty slice is valid.
/// Returns `false` for a trailing surrogate without a preceding leading one,
/// and for a leading surrogate that is followed by anything else or by the
/// end of the input.
pub fn is_utf16(v: &[u16]) -> bool {
    let mut units = v.iter().cloned();

    while let Some(u) = units.next() {
        // Every `u16` outside the surrogate range is a scalar value by itself.
        if char::from_u32(u as u32).is_some() {
            continue;
        }

        // From here on `u` is a surrogate (0xD800..0xDFFF inclusive). Only
        // the leading half of that range may open a pair.
        if u > 0xDBFF {
            return false;
        }

        // A leading surrogate must be completed by a trailing one.
        match units.next() {
            Some(u2) if u2 >= 0xDC00 && u2 <= 0xDFFF => {}
            _ => return false,
        }
    }

    true
}
129
130/// An iterator that decodes UTF-16 encoded codepoints from a vector
131/// of `u16`s.
92a42be0 132#[rustc_deprecated(since = "1.4.0", reason = "renamed to `char::DecodeUtf16`")]
e9174d1e
SL
133#[unstable(feature = "decode_utf16", reason = "not exposed in std", issue = "27830")]
134#[allow(deprecated)]
1a4d82fc
JJ
135#[derive(Clone)]
136pub struct Utf16Items<'a> {
b039eaaf 137 decoder: DecodeUtf16<Cloned<slice::Iter<'a, u16>>>,
1a4d82fc 138}
e9174d1e 139
1a4d82fc 140/// The possibilities for values decoded from a `u16` stream.
92a42be0
SL
141#[rustc_deprecated(since = "1.4.0",
142 reason = "`char::DecodeUtf16` uses `Result<char, u16>` instead")]
e9174d1e
SL
143#[unstable(feature = "decode_utf16", reason = "not exposed in std", issue = "27830")]
144#[allow(deprecated)]
85aaf69f 145#[derive(Copy, PartialEq, Eq, Clone, Debug)]
1a4d82fc
JJ
146pub enum Utf16Item {
147 /// A valid codepoint.
148 ScalarValue(char),
149 /// An invalid surrogate without its pair.
b039eaaf 150 LoneSurrogate(u16),
1a4d82fc
JJ
151}
152
e9174d1e 153#[allow(deprecated)]
1a4d82fc
JJ
154impl Utf16Item {
155 /// Convert `self` to a `char`, taking `LoneSurrogate`s to the
156 /// replacement character (U+FFFD).
157 #[inline]
158 pub fn to_char_lossy(&self) -> char {
159 match *self {
160 Utf16Item::ScalarValue(c) => c,
b039eaaf 161 Utf16Item::LoneSurrogate(_) => '\u{FFFD}',
1a4d82fc
JJ
162 }
163 }
164}
165
92a42be0 166#[rustc_deprecated(since = "1.4.0", reason = "use `char::DecodeUtf16` instead")]
e9174d1e
SL
167#[unstable(feature = "decode_utf16", reason = "not exposed in std", issue = "27830")]
168#[allow(deprecated)]
1a4d82fc
JJ
169impl<'a> Iterator for Utf16Items<'a> {
170 type Item = Utf16Item;
171
172 fn next(&mut self) -> Option<Utf16Item> {
b039eaaf
SL
173 self.decoder.next().map(|result| {
174 match result {
175 Ok(c) => Utf16Item::ScalarValue(c),
176 Err(s) => Utf16Item::LoneSurrogate(s),
177 }
e9174d1e 178 })
1a4d82fc
JJ
179 }
180
181 #[inline]
85aaf69f 182 fn size_hint(&self) -> (usize, Option<usize>) {
e9174d1e 183 self.decoder.size_hint()
1a4d82fc
JJ
184 }
185}
186
187/// Create an iterator over the UTF-16 encoded codepoints in `v`,
188/// returning invalid surrogates as `LoneSurrogate`s.
189///
c34b1796
AL
190/// # Examples
191///
192/// ```
e9174d1e 193/// #![feature(unicode, decode_utf16)]
92a42be0 194/// # #![allow(deprecated)]
c1a9b12d 195///
d9579d0f 196/// extern crate rustc_unicode;
1a4d82fc 197///
d9579d0f 198/// use rustc_unicode::str::Utf16Item::{ScalarValue, LoneSurrogate};
1a4d82fc 199///
c34b1796
AL
200/// fn main() {
201/// // 𝄞mus<invalid>ic<invalid>
202/// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
203/// 0x0073, 0xDD1E, 0x0069, 0x0063,
204/// 0xD834];
1a4d82fc 205///
d9579d0f 206/// assert_eq!(rustc_unicode::str::utf16_items(&v).collect::<Vec<_>>(),
c34b1796
AL
207/// vec![ScalarValue('𝄞'),
208/// ScalarValue('m'), ScalarValue('u'), ScalarValue('s'),
209/// LoneSurrogate(0xDD1E),
210/// ScalarValue('i'), ScalarValue('c'),
211/// LoneSurrogate(0xD834)]);
212/// }
1a4d82fc 213/// ```
92a42be0 214#[rustc_deprecated(since = "1.4.0", reason = "renamed to `char::decode_utf16`")]
e9174d1e
SL
215#[unstable(feature = "decode_utf16", reason = "not exposed in std", issue = "27830")]
216#[allow(deprecated)]
1a4d82fc 217pub fn utf16_items<'a>(v: &'a [u16]) -> Utf16Items<'a> {
e9174d1e 218 Utf16Items { decoder: decode_utf16(v.iter().cloned()) }
1a4d82fc
JJ
219}
220
/// Adaptor that turns an iterator of `char`s into an iterator of UTF-16
/// code units.
#[derive(Clone)]
pub struct Utf16Encoder<I> {
    // Source of the characters that are still to be encoded.
    chars: I,
    // Second code unit of a two-unit encoding, held back until the following
    // `next` call; 0 means nothing is pending.
    extra: u16,
}
227
228impl<I> Utf16Encoder<I> {
d9579d0f 229 /// Create a UTF-16 encoder from any `char` iterator.
b039eaaf
SL
230 pub fn new(chars: I) -> Utf16Encoder<I>
231 where I: Iterator<Item = char>
232 {
233 Utf16Encoder {
234 chars: chars,
235 extra: 0,
236 }
1a4d82fc
JJ
237 }
238}
239
240impl<I> Iterator for Utf16Encoder<I> where I: Iterator<Item=char> {
241 type Item = u16;
242
243 #[inline]
244 fn next(&mut self) -> Option<u16> {
245 if self.extra != 0 {
246 let tmp = self.extra;
247 self.extra = 0;
248 return Some(tmp);
249 }
250
c34b1796 251 let mut buf = [0; 2];
1a4d82fc 252 self.chars.next().map(|ch| {
85aaf69f 253 let n = CharExt::encode_utf16(ch, &mut buf).unwrap_or(0);
b039eaaf
SL
254 if n == 2 {
255 self.extra = buf[1];
256 }
1a4d82fc
JJ
257 buf[0]
258 })
259 }
260
261 #[inline]
85aaf69f 262 fn size_hint(&self) -> (usize, Option<usize>) {
1a4d82fc
JJ
263 let (low, high) = self.chars.size_hint();
264 // every char gets either one u16 or two u16,
265 // so this iterator is between 1 or 2 times as
266 // long as the underlying iterator.
267 (low, high.and_then(|n| n.checked_mul(2)))
268 }
269}
270
d9579d0f 271impl<'a> Iterator for SplitWhitespace<'a> {
1a4d82fc
JJ
272 type Item = &'a str;
273
b039eaaf
SL
274 fn next(&mut self) -> Option<&'a str> {
275 self.inner.next()
276 }
1a4d82fc 277}
d9579d0f 278impl<'a> DoubleEndedIterator for SplitWhitespace<'a> {
b039eaaf
SL
279 fn next_back(&mut self) -> Option<&'a str> {
280 self.inner.next_back()
281 }
1a4d82fc 282}