]> git.proxmox.com Git - rustc.git/blame - src/librustc_unicode/u_str.rs
New upstream version 1.14.0+dfsg1
[rustc.git] / src / librustc_unicode / u_str.rs
CommitLineData
1a4d82fc
JJ
1// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2// file at the top-level directory of this distribution and at
3// http://rust-lang.org/COPYRIGHT.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
1a4d82fc
JJ
10
11//! Unicode-intensive string manipulations.
12//!
b039eaaf
SL
13//! This module provides functionality to `str` that requires the Unicode
14//! methods provided by the unicode parts of the CharExt trait.
1a4d82fc 15
1a4d82fc 16use core::char;
9e0c209e 17use core::iter::{Filter, FusedIterator};
1a4d82fc
JJ
18use core::str::Split;
19
d9579d0f
AL
20/// An iterator over the non-whitespace substrings of a string,
21/// separated by any amount of whitespace.
22#[stable(feature = "split_whitespace", since = "1.1.0")]
23pub struct SplitWhitespace<'a> {
85aaf69f 24 inner: Filter<Split<'a, fn(char) -> bool>, fn(&&str) -> bool>,
1a4d82fc
JJ
25}
26
27/// Methods for Unicode string slices
28#[allow(missing_docs)] // docs in libcollections
29pub trait UnicodeStr {
d9579d0f 30 fn split_whitespace<'a>(&'a self) -> SplitWhitespace<'a>;
1a4d82fc
JJ
31 fn is_whitespace(&self) -> bool;
32 fn is_alphanumeric(&self) -> bool;
7453a54e
SL
33 fn trim(&self) -> &str;
34 fn trim_left(&self) -> &str;
35 fn trim_right(&self) -> &str;
1a4d82fc
JJ
36}
37
38impl UnicodeStr for str {
d9579d0f
AL
39 #[inline]
40 fn split_whitespace(&self) -> SplitWhitespace {
b039eaaf
SL
41 fn is_not_empty(s: &&str) -> bool {
42 !s.is_empty()
43 }
1a4d82fc
JJ
44 let is_not_empty: fn(&&str) -> bool = is_not_empty; // coerce to fn pointer
45
b039eaaf
SL
46 fn is_whitespace(c: char) -> bool {
47 c.is_whitespace()
48 }
1a4d82fc
JJ
49 let is_whitespace: fn(char) -> bool = is_whitespace; // coerce to fn pointer
50
d9579d0f 51 SplitWhitespace { inner: self.split(is_whitespace).filter(is_not_empty) }
1a4d82fc
JJ
52 }
53
54 #[inline]
b039eaaf
SL
55 fn is_whitespace(&self) -> bool {
56 self.chars().all(|c| c.is_whitespace())
57 }
1a4d82fc
JJ
58
59 #[inline]
b039eaaf
SL
60 fn is_alphanumeric(&self) -> bool {
61 self.chars().all(|c| c.is_alphanumeric())
62 }
1a4d82fc 63
1a4d82fc
JJ
64 #[inline]
65 fn trim(&self) -> &str {
c34b1796 66 self.trim_matches(|c: char| c.is_whitespace())
1a4d82fc
JJ
67 }
68
69 #[inline]
70 fn trim_left(&self) -> &str {
85aaf69f 71 self.trim_left_matches(|c: char| c.is_whitespace())
1a4d82fc
JJ
72 }
73
74 #[inline]
75 fn trim_right(&self) -> &str {
85aaf69f 76 self.trim_right_matches(|c: char| c.is_whitespace())
1a4d82fc
JJ
77 }
78}
79
1a4d82fc
JJ
// Length of the UTF-8 sequence introduced by each possible first byte, per
// https://tools.ietf.org/html/rfc3629. Bytes that cannot begin a sequence
// (continuation bytes 0x80..=0xBF, plus 0xC0, 0xC1 and 0xF5..=0xFF) map to 0.
static UTF8_CHAR_WIDTH: [u8; 256] = [
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00..=0x0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10..=0x1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20..=0x2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30..=0x3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40..=0x4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50..=0x5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60..=0x6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70..=0x7F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80..=0x8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x90..=0x9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xA0..=0xAF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xB0..=0xBF
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xC0..=0xCF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xD0..=0xDF
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xE0..=0xEF
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xF0..=0xFF
];

/// Looks up how many bytes the UTF-8 sequence starting with byte `b`
/// occupies.
///
/// The result is 1 to 4 for a byte that may begin a sequence and 0 for any
/// other byte.
#[inline]
pub fn utf8_char_width(b: u8) -> usize {
    let width = UTF8_CHAR_WIDTH[b as usize];
    width as usize
}
105
/// Determines if a slice of `u16` contains valid UTF-16.
///
/// Every code unit must either be a scalar value on its own or be a high
/// surrogate (0xD800..=0xDBFF) immediately followed by a low surrogate
/// (0xDC00..=0xDFFF). An empty slice is valid.
pub fn is_utf16(v: &[u16]) -> bool {
    let mut units = v.iter();
    while let Some(&u) = units.next() {
        // `from_u32` accepts every 16-bit value except the surrogates, so
        // anything it accepts is a complete character by itself.
        if char::from_u32(u as u32).is_some() {
            continue;
        }

        // `u` is a surrogate: it has to be a high one and it has to be
        // followed by a low one. Running out of input here is an error too.
        match units.next() {
            Some(&u2) if u >= 0xD800 && u <= 0xDBFF && u2 >= 0xDC00 && u2 <= 0xDFFF => {}
            _ => return false,
        }
    }
    true
}
127
1a4d82fc
JJ
/// Adaptor that turns an iterator of `char`s into an iterator over the
/// UTF-16 code units that encode them.
#[derive(Clone)]
pub struct Utf16Encoder<I> {
    // Characters that have not been encoded yet.
    chars: I,
    // Second code unit of a two-unit character, held back until the next
    // call; 0 means nothing is pending.
    extra: u16,
}

impl<I> Utf16Encoder<I> {
    /// Builds an encoder that reads its characters from `chars`, with no
    /// code unit pending.
    pub fn new(chars: I) -> Utf16Encoder<I>
        where I: Iterator<Item = char>
    {
        Utf16Encoder { chars: chars, extra: 0 }
    }
}
146
3157f602
XL
147impl<I> Iterator for Utf16Encoder<I>
148 where I: Iterator<Item = char>
149{
1a4d82fc
JJ
150 type Item = u16;
151
152 #[inline]
153 fn next(&mut self) -> Option<u16> {
154 if self.extra != 0 {
155 let tmp = self.extra;
156 self.extra = 0;
157 return Some(tmp);
158 }
159
c30ab7b3 160 let mut buf = [0; 2];
1a4d82fc 161 self.chars.next().map(|ch| {
c30ab7b3
SL
162 let n = CharExt::encode_utf16(ch, &mut buf).len();
163 if n == 2 {
164 self.extra = buf[1];
b039eaaf 165 }
c30ab7b3 166 buf[0]
1a4d82fc
JJ
167 })
168 }
169
170 #[inline]
85aaf69f 171 fn size_hint(&self) -> (usize, Option<usize>) {
1a4d82fc
JJ
172 let (low, high) = self.chars.size_hint();
173 // every char gets either one u16 or two u16,
174 // so this iterator is between 1 or 2 times as
175 // long as the underlying iterator.
176 (low, high.and_then(|n| n.checked_mul(2)))
177 }
178}
179
9e0c209e
SL
180#[unstable(feature = "fused", issue = "35602")]
181impl<I> FusedIterator for Utf16Encoder<I>
182 where I: FusedIterator<Item = char> {}
183
c30ab7b3 184#[stable(feature = "split_whitespace", since = "1.1.0")]
d9579d0f 185impl<'a> Iterator for SplitWhitespace<'a> {
1a4d82fc
JJ
186 type Item = &'a str;
187
b039eaaf
SL
188 fn next(&mut self) -> Option<&'a str> {
189 self.inner.next()
190 }
1a4d82fc 191}
c30ab7b3
SL
192
193#[stable(feature = "split_whitespace", since = "1.1.0")]
d9579d0f 194impl<'a> DoubleEndedIterator for SplitWhitespace<'a> {
b039eaaf
SL
195 fn next_back(&mut self) -> Option<&'a str> {
196 self.inner.next_back()
197 }
1a4d82fc 198}
9e0c209e
SL
199
200#[unstable(feature = "fused", issue = "35602")]
201impl<'a> FusedIterator for SplitWhitespace<'a> {}