]> git.proxmox.com Git - rustc.git/blob - src/libunicode/u_str.rs
0e424ea0d708e8a9a1da5130ffc8a2450080ea7c
[rustc.git] / src / libunicode / u_str.rs
1 // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10 //
11 // ignore-lexer-test FIXME #15679
12
13 //! Unicode-intensive string manipulations.
14 //!
15 //! This module provides functionality to `str` that requires the Unicode methods provided by the
16 //! unicode parts of the CharExt trait.
17
18 use self::GraphemeState::*;
19 use core::prelude::*;
20
21 use core::char;
22 use core::cmp;
23 use core::iter::Filter;
24 use core::mem;
25 use core::slice;
26 use core::str::Split;
27
28 use tables::grapheme::GraphemeCat;
29
30 /// An iterator over the words of a string, separated by a sequence of whitespace
31 #[unstable(feature = "str_words",
32 reason = "words() will be replaced by split_whitespace() in 1.1.0")]
33 pub struct Words<'a> {
34 inner: Filter<Split<'a, fn(char) -> bool>, fn(&&str) -> bool>,
35 }
36
37 /// Methods for Unicode string slices
38 #[allow(missing_docs)] // docs in libcollections
39 pub trait UnicodeStr {
40 fn graphemes<'a>(&'a self, is_extended: bool) -> Graphemes<'a>;
41 fn grapheme_indices<'a>(&'a self, is_extended: bool) -> GraphemeIndices<'a>;
42 fn words<'a>(&'a self) -> Words<'a>;
43 fn is_whitespace(&self) -> bool;
44 fn is_alphanumeric(&self) -> bool;
45 fn width(&self, is_cjk: bool) -> usize;
46 fn trim<'a>(&'a self) -> &'a str;
47 fn trim_left<'a>(&'a self) -> &'a str;
48 fn trim_right<'a>(&'a self) -> &'a str;
49 }
50
51 impl UnicodeStr for str {
52 #[inline]
53 fn graphemes(&self, is_extended: bool) -> Graphemes {
54 Graphemes { string: self, extended: is_extended, cat: None, catb: None }
55 }
56
57 #[inline]
58 fn grapheme_indices(&self, is_extended: bool) -> GraphemeIndices {
59 GraphemeIndices { start_offset: self.as_ptr() as usize, iter: self.graphemes(is_extended) }
60 }
61
62 #[inline]
63 fn words(&self) -> Words {
64 fn is_not_empty(s: &&str) -> bool { !s.is_empty() }
65 let is_not_empty: fn(&&str) -> bool = is_not_empty; // coerce to fn pointer
66
67 fn is_whitespace(c: char) -> bool { c.is_whitespace() }
68 let is_whitespace: fn(char) -> bool = is_whitespace; // coerce to fn pointer
69
70 Words { inner: self.split(is_whitespace).filter(is_not_empty) }
71 }
72
73 #[inline]
74 fn is_whitespace(&self) -> bool { self.chars().all(|c| c.is_whitespace()) }
75
76 #[inline]
77 fn is_alphanumeric(&self) -> bool { self.chars().all(|c| c.is_alphanumeric()) }
78
79 #[inline]
80 fn width(&self, is_cjk: bool) -> usize {
81 self.chars().map(|c| c.width(is_cjk).unwrap_or(0)).sum()
82 }
83
84 #[inline]
85 fn trim(&self) -> &str {
86 self.trim_matches(|c: char| c.is_whitespace())
87 }
88
89 #[inline]
90 fn trim_left(&self) -> &str {
91 self.trim_left_matches(|c: char| c.is_whitespace())
92 }
93
94 #[inline]
95 fn trim_right(&self) -> &str {
96 self.trim_right_matches(|c: char| c.is_whitespace())
97 }
98 }
99
100 /// External iterator for grapheme clusters and byte offsets.
101 #[derive(Clone)]
102 pub struct GraphemeIndices<'a> {
103 start_offset: usize,
104 iter: Graphemes<'a>,
105 }
106
107 impl<'a> Iterator for GraphemeIndices<'a> {
108 type Item = (usize, &'a str);
109
110 #[inline]
111 fn next(&mut self) -> Option<(usize, &'a str)> {
112 self.iter.next().map(|s| (s.as_ptr() as usize - self.start_offset, s))
113 }
114
115 #[inline]
116 fn size_hint(&self) -> (usize, Option<usize>) {
117 self.iter.size_hint()
118 }
119 }
120
121 impl<'a> DoubleEndedIterator for GraphemeIndices<'a> {
122 #[inline]
123 fn next_back(&mut self) -> Option<(usize, &'a str)> {
124 self.iter.next_back().map(|s| (s.as_ptr() as usize - self.start_offset, s))
125 }
126 }
127
128 /// External iterator for a string's
129 /// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries).
130 #[derive(Clone)]
131 pub struct Graphemes<'a> {
132 string: &'a str,
133 extended: bool,
134 cat: Option<GraphemeCat>,
135 catb: Option<GraphemeCat>,
136 }
137
138 // state machine for cluster boundary rules
139 #[derive(PartialEq,Eq)]
140 enum GraphemeState {
141 Start,
142 FindExtend,
143 HangulL,
144 HangulLV,
145 HangulLVT,
146 Regional,
147 }
148
149 impl<'a> Iterator for Graphemes<'a> {
150 type Item = &'a str;
151
152 #[inline]
153 fn size_hint(&self) -> (usize, Option<usize>) {
154 let slen = self.string.len();
155 (cmp::min(slen, 1), Some(slen))
156 }
157
158 #[inline]
159 fn next(&mut self) -> Option<&'a str> {
160 use tables::grapheme as gr;
161 if self.string.is_empty() {
162 return None;
163 }
164
165 let mut take_curr = true;
166 let mut idx = 0;
167 let mut state = Start;
168 let mut cat = gr::GC_Any;
169 for (curr, ch) in self.string.char_indices() {
170 idx = curr;
171
172 // retrieve cached category, if any
173 // We do this because most of the time we would end up
174 // looking up each character twice.
175 cat = match self.cat {
176 None => gr::grapheme_category(ch),
177 _ => self.cat.take().unwrap()
178 };
179
180 if match cat {
181 gr::GC_Extend => true,
182 gr::GC_SpacingMark if self.extended => true,
183 _ => false
184 } {
185 state = FindExtend; // rule GB9/GB9a
186 continue;
187 }
188
189 state = match state {
190 Start if '\r' == ch => {
191 let slen = self.string.len();
192 let nidx = idx + 1;
193 if nidx != slen && self.string.char_at(nidx) == '\n' {
194 idx = nidx; // rule GB3
195 }
196 break; // rule GB4
197 }
198 Start => match cat {
199 gr::GC_Control => break,
200 gr::GC_L => HangulL,
201 gr::GC_LV | gr::GC_V => HangulLV,
202 gr::GC_LVT | gr::GC_T => HangulLVT,
203 gr::GC_Regional_Indicator => Regional,
204 _ => FindExtend
205 },
206 FindExtend => { // found non-extending when looking for extending
207 take_curr = false;
208 break;
209 },
210 HangulL => match cat { // rule GB6: L x (L|V|LV|LVT)
211 gr::GC_L => continue,
212 gr::GC_LV | gr::GC_V => HangulLV,
213 gr::GC_LVT => HangulLVT,
214 _ => {
215 take_curr = false;
216 break;
217 }
218 },
219 HangulLV => match cat { // rule GB7: (LV|V) x (V|T)
220 gr::GC_V => continue,
221 gr::GC_T => HangulLVT,
222 _ => {
223 take_curr = false;
224 break;
225 }
226 },
227 HangulLVT => match cat { // rule GB8: (LVT|T) x T
228 gr::GC_T => continue,
229 _ => {
230 take_curr = false;
231 break;
232 }
233 },
234 Regional => match cat { // rule GB8a
235 gr::GC_Regional_Indicator => continue,
236 _ => {
237 take_curr = false;
238 break;
239 }
240 }
241 }
242 }
243
244 self.cat = if take_curr {
245 idx = idx + self.string.char_at(idx).len_utf8();
246 None
247 } else {
248 Some(cat)
249 };
250
251 let retstr = &self.string[..idx];
252 self.string = &self.string[idx..];
253 Some(retstr)
254 }
255 }
256
257 impl<'a> DoubleEndedIterator for Graphemes<'a> {
258 #[inline]
259 fn next_back(&mut self) -> Option<&'a str> {
260 use tables::grapheme as gr;
261 if self.string.is_empty() {
262 return None;
263 }
264
265 let mut take_curr = true;
266 let mut idx = self.string.len();
267 let mut previdx = idx;
268 let mut state = Start;
269 let mut cat = gr::GC_Any;
270 for (curr, ch) in self.string.char_indices().rev() {
271 previdx = idx;
272 idx = curr;
273
274 // cached category, if any
275 cat = match self.catb {
276 None => gr::grapheme_category(ch),
277 _ => self.catb.take().unwrap()
278 };
279
280 // a matching state machine that runs *backwards* across an input string
281 // note that this has some implications for the Hangul matching, since
282 // we now need to know what the rightward letter is:
283 //
284 // Right to left, we have:
285 // L x L
286 // V x (L|V|LV)
287 // T x (V|T|LV|LVT)
288 // HangulL means the letter to the right is L
289 // HangulLV means the letter to the right is V
290 // HangulLVT means the letter to the right is T
291 state = match state {
292 Start if '\n' == ch => {
293 if idx > 0 && '\r' == self.string.char_at_reverse(idx) {
294 idx -= 1; // rule GB3
295 }
296 break; // rule GB4
297 },
298 Start | FindExtend => match cat {
299 gr::GC_Extend => FindExtend,
300 gr::GC_SpacingMark if self.extended => FindExtend,
301 gr::GC_L | gr::GC_LV | gr::GC_LVT => HangulL,
302 gr::GC_V => HangulLV,
303 gr::GC_T => HangulLVT,
304 gr::GC_Regional_Indicator => Regional,
305 gr::GC_Control => {
306 take_curr = Start == state;
307 break;
308 },
309 _ => break
310 },
311 HangulL => match cat { // char to right is an L
312 gr::GC_L => continue, // L x L is the only legal match
313 _ => {
314 take_curr = false;
315 break;
316 }
317 },
318 HangulLV => match cat { // char to right is a V
319 gr::GC_V => continue, // V x V, right char is still V
320 gr::GC_L | gr::GC_LV => HangulL, // (L|V) x V, right char is now L
321 _ => {
322 take_curr = false;
323 break;
324 }
325 },
326 HangulLVT => match cat { // char to right is a T
327 gr::GC_T => continue, // T x T, right char is still T
328 gr::GC_V => HangulLV, // V x T, right char is now V
329 gr::GC_LV | gr::GC_LVT => HangulL, // (LV|LVT) x T, right char is now L
330 _ => {
331 take_curr = false;
332 break;
333 }
334 },
335 Regional => match cat { // rule GB8a
336 gr::GC_Regional_Indicator => continue,
337 _ => {
338 take_curr = false;
339 break;
340 }
341 }
342 }
343 }
344
345 self.catb = if take_curr {
346 None
347 } else {
348 idx = previdx;
349 Some(cat)
350 };
351
352 let retstr = &self.string[idx..];
353 self.string = &self.string[..idx];
354 Some(retstr)
355 }
356 }
357
358 // https://tools.ietf.org/html/rfc3629
359 static UTF8_CHAR_WIDTH: [u8; 256] = [
360 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
361 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
362 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
363 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
364 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
365 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
366 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
367 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
368 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
369 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
370 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
371 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
372 0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
373 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
374 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
375 4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
376 ];
377
378 /// Given a first byte, determine how many bytes are in this UTF-8 character
379 #[inline]
380 pub fn utf8_char_width(b: u8) -> usize {
381 return UTF8_CHAR_WIDTH[b as usize] as usize;
382 }
383
384 /// Determines if a vector of `u16` contains valid UTF-16
385 pub fn is_utf16(v: &[u16]) -> bool {
386 let mut it = v.iter();
387 macro_rules! next { ($ret:expr) => {
388 match it.next() { Some(u) => *u, None => return $ret }
389 }
390 }
391 loop {
392 let u = next!(true);
393
394 match char::from_u32(u as u32) {
395 Some(_) => {}
396 None => {
397 let u2 = next!(false);
398 if u < 0xD7FF || u > 0xDBFF ||
399 u2 < 0xDC00 || u2 > 0xDFFF { return false; }
400 }
401 }
402 }
403 }
404
405 /// An iterator that decodes UTF-16 encoded codepoints from a vector
406 /// of `u16`s.
407 #[derive(Clone)]
408 pub struct Utf16Items<'a> {
409 iter: slice::Iter<'a, u16>
410 }
411 /// The possibilities for values decoded from a `u16` stream.
412 #[derive(Copy, PartialEq, Eq, Clone, Debug)]
413 pub enum Utf16Item {
414 /// A valid codepoint.
415 ScalarValue(char),
416 /// An invalid surrogate without its pair.
417 LoneSurrogate(u16)
418 }
419
420 impl Utf16Item {
421 /// Convert `self` to a `char`, taking `LoneSurrogate`s to the
422 /// replacement character (U+FFFD).
423 #[inline]
424 pub fn to_char_lossy(&self) -> char {
425 match *self {
426 Utf16Item::ScalarValue(c) => c,
427 Utf16Item::LoneSurrogate(_) => '\u{FFFD}'
428 }
429 }
430 }
431
432 impl<'a> Iterator for Utf16Items<'a> {
433 type Item = Utf16Item;
434
435 fn next(&mut self) -> Option<Utf16Item> {
436 let u = match self.iter.next() {
437 Some(u) => *u,
438 None => return None
439 };
440
441 if u < 0xD800 || 0xDFFF < u {
442 // not a surrogate
443 Some(Utf16Item::ScalarValue(unsafe {mem::transmute(u as u32)}))
444 } else if u >= 0xDC00 {
445 // a trailing surrogate
446 Some(Utf16Item::LoneSurrogate(u))
447 } else {
448 // preserve state for rewinding.
449 let old = self.iter.clone();
450
451 let u2 = match self.iter.next() {
452 Some(u2) => *u2,
453 // eof
454 None => return Some(Utf16Item::LoneSurrogate(u))
455 };
456 if u2 < 0xDC00 || u2 > 0xDFFF {
457 // not a trailing surrogate so we're not a valid
458 // surrogate pair, so rewind to redecode u2 next time.
459 self.iter = old.clone();
460 return Some(Utf16Item::LoneSurrogate(u))
461 }
462
463 // all ok, so lets decode it.
464 let c = (((u - 0xD800) as u32) << 10 | (u2 - 0xDC00) as u32) + 0x1_0000;
465 Some(Utf16Item::ScalarValue(unsafe {mem::transmute(c)}))
466 }
467 }
468
469 #[inline]
470 fn size_hint(&self) -> (usize, Option<usize>) {
471 let (low, high) = self.iter.size_hint();
472 // we could be entirely valid surrogates (2 elements per
473 // char), or entirely non-surrogates (1 element per char)
474 (low / 2, high)
475 }
476 }
477
478 /// Create an iterator over the UTF-16 encoded codepoints in `v`,
479 /// returning invalid surrogates as `LoneSurrogate`s.
480 ///
481 /// # Examples
482 ///
483 /// ```
484 /// # #![feature(unicode)]
485 /// extern crate unicode;
486 ///
487 /// use unicode::str::Utf16Item::{ScalarValue, LoneSurrogate};
488 ///
489 /// fn main() {
490 /// // 𝄞mus<invalid>ic<invalid>
491 /// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
492 /// 0x0073, 0xDD1E, 0x0069, 0x0063,
493 /// 0xD834];
494 ///
495 /// assert_eq!(unicode::str::utf16_items(&v).collect::<Vec<_>>(),
496 /// vec![ScalarValue('𝄞'),
497 /// ScalarValue('m'), ScalarValue('u'), ScalarValue('s'),
498 /// LoneSurrogate(0xDD1E),
499 /// ScalarValue('i'), ScalarValue('c'),
500 /// LoneSurrogate(0xD834)]);
501 /// }
502 /// ```
503 pub fn utf16_items<'a>(v: &'a [u16]) -> Utf16Items<'a> {
504 Utf16Items { iter : v.iter() }
505 }
506
507 /// Iterator adaptor for encoding `char`s to UTF-16.
508 #[derive(Clone)]
509 pub struct Utf16Encoder<I> {
510 chars: I,
511 extra: u16
512 }
513
514 impl<I> Utf16Encoder<I> {
515 /// Create an UTF-16 encoder from any `char` iterator.
516 pub fn new(chars: I) -> Utf16Encoder<I> where I: Iterator<Item=char> {
517 Utf16Encoder { chars: chars, extra: 0 }
518 }
519 }
520
521 impl<I> Iterator for Utf16Encoder<I> where I: Iterator<Item=char> {
522 type Item = u16;
523
524 #[inline]
525 fn next(&mut self) -> Option<u16> {
526 if self.extra != 0 {
527 let tmp = self.extra;
528 self.extra = 0;
529 return Some(tmp);
530 }
531
532 let mut buf = [0; 2];
533 self.chars.next().map(|ch| {
534 let n = CharExt::encode_utf16(ch, &mut buf).unwrap_or(0);
535 if n == 2 { self.extra = buf[1]; }
536 buf[0]
537 })
538 }
539
540 #[inline]
541 fn size_hint(&self) -> (usize, Option<usize>) {
542 let (low, high) = self.chars.size_hint();
543 // every char gets either one u16 or two u16,
544 // so this iterator is between 1 or 2 times as
545 // long as the underlying iterator.
546 (low, high.and_then(|n| n.checked_mul(2)))
547 }
548 }
549
550 impl<'a> Iterator for Words<'a> {
551 type Item = &'a str;
552
553 fn next(&mut self) -> Option<&'a str> { self.inner.next() }
554 }
555 impl<'a> DoubleEndedIterator for Words<'a> {
556 fn next_back(&mut self) -> Option<&'a str> { self.inner.next_back() }
557 }