src/librustc_unicode/u_str.rs

   1 // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
   2 // file at the top-level directory of this distribution and at
   3 // http://rust-lang.org/COPYRIGHT.
   4 //
   5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8 // option. This file may not be copied, modified, or distributed
   9 // except according to those terms.
  10
  11 //! Unicode-intensive string manipulations.
  12 //!
  13 //! This module provides functionality to `str` that requires the Unicode methods provided by the
  14 //! unicode parts of the CharExt trait.
  15
  16 use self::GraphemeState::*;
  17 use core::prelude::*;
  18
  19 use core::char;
  20 use core::cmp;
  21 use core::iter::Filter;
  22 use core::mem;
  23 use core::slice;
  24 use core::str::Split;
  25
  26 use tables::grapheme::GraphemeCat;
  27
  28 #[deprecated(reason = "struct Words is being replaced by struct SplitWhitespace",
  29              since = "1.1.0")]
  30 #[unstable(feature = "str_words",
  31            reason = "words() will be replaced by split_whitespace() in 1.1.0")]
  32 pub type Words<'a> = SplitWhitespace<'a>;
  33
  34 /// An iterator over the non-whitespace substrings of a string,
  35 /// separated by any amount of whitespace.
  36 #[stable(feature = "split_whitespace", since = "1.1.0")]
  37 pub struct SplitWhitespace<'a> {
  38     inner: Filter<Split<'a, fn(char) -> bool>, fn(&&str) -> bool>,
  39 }
  40
  41 /// Methods for Unicode string slices
  42 #[allow(missing_docs)] // docs in libcollections
  43 pub trait UnicodeStr {
  44     fn graphemes<'a>(&'a self, is_extended: bool) -> Graphemes<'a>;
  45     fn grapheme_indices<'a>(&'a self, is_extended: bool) -> GraphemeIndices<'a>;
  46     #[allow(deprecated)]
  47     fn words<'a>(&'a self) -> Words<'a>;
  48     fn split_whitespace<'a>(&'a self) -> SplitWhitespace<'a>;
  49     fn is_whitespace(&self) -> bool;
  50     fn is_alphanumeric(&self) -> bool;
  51     fn width(&self, is_cjk: bool) -> usize;
  52     fn trim<'a>(&'a self) -> &'a str;
  53     fn trim_left<'a>(&'a self) -> &'a str;
  54     fn trim_right<'a>(&'a self) -> &'a str;
  55 }
  56
  57 impl UnicodeStr for str {
  58     #[inline]
  59     fn graphemes(&self, is_extended: bool) -> Graphemes {
  60         Graphemes { string: self, extended: is_extended, cat: None, catb: None }
  61     }
  62
  63     #[inline]
  64     fn grapheme_indices(&self, is_extended: bool) -> GraphemeIndices {
  65         GraphemeIndices { start_offset: self.as_ptr() as usize, iter: self.graphemes(is_extended) }
  66     }
  67
  68     #[allow(deprecated)]
  69     #[inline]
  70     fn words(&self) -> Words {
  71         self.split_whitespace()
  72     }
  73
  74     #[inline]
  75     fn split_whitespace(&self) -> SplitWhitespace {
  76         fn is_not_empty(s: &&str) -> bool { !s.is_empty() }
  77         let is_not_empty: fn(&&str) -> bool = is_not_empty; // coerce to fn pointer
  78
  79         fn is_whitespace(c: char) -> bool { c.is_whitespace() }
  80         let is_whitespace: fn(char) -> bool = is_whitespace; // coerce to fn pointer
  81
  82         SplitWhitespace { inner: self.split(is_whitespace).filter(is_not_empty) }
  83     }
  84
  85     #[inline]
  86     fn is_whitespace(&self) -> bool { self.chars().all(|c| c.is_whitespace()) }
  87
  88     #[inline]
  89     fn is_alphanumeric(&self) -> bool { self.chars().all(|c| c.is_alphanumeric()) }
  90
  91     #[allow(deprecated)]
  92     #[inline]
  93     fn width(&self, is_cjk: bool) -> usize {
  94         self.chars().map(|c| c.width(is_cjk).unwrap_or(0)).sum()
  95     }
  96
  97     #[inline]
  98     fn trim(&self) -> &str {
  99         self.trim_matches(|c: char| c.is_whitespace())
 100     }
 101
 102     #[inline]
 103     fn trim_left(&self) -> &str {
 104         self.trim_left_matches(|c: char| c.is_whitespace())
 105     }
 106
 107     #[inline]
 108     fn trim_right(&self) -> &str {
 109         self.trim_right_matches(|c: char| c.is_whitespace())
 110     }
 111 }
 112
 113 /// External iterator for grapheme clusters and byte offsets.
 114 #[derive(Clone)]
 115 pub struct GraphemeIndices<'a> {
 116     start_offset: usize,
 117     iter: Graphemes<'a>,
 118 }
 119
 120 impl<'a> Iterator for GraphemeIndices<'a> {
 121     type Item = (usize, &'a str);
 122
 123     #[inline]
 124     fn next(&mut self) -> Option<(usize, &'a str)> {
 125         self.iter.next().map(|s| (s.as_ptr() as usize - self.start_offset, s))
 126     }
 127
 128     #[inline]
 129     fn size_hint(&self) -> (usize, Option<usize>) {
 130         self.iter.size_hint()
 131     }
 132 }
 133
 134 impl<'a> DoubleEndedIterator for GraphemeIndices<'a> {
 135     #[inline]
 136     fn next_back(&mut self) -> Option<(usize, &'a str)> {
 137         self.iter.next_back().map(|s| (s.as_ptr() as usize - self.start_offset, s))
 138     }
 139 }
 140
 141 /// External iterator for a string's
 142 /// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries).
 143 #[derive(Clone)]
 144 pub struct Graphemes<'a> {
 145     string: &'a str,
 146     extended: bool,
 147     cat: Option<GraphemeCat>,
 148     catb: Option<GraphemeCat>,
 149 }
 150
 151 // state machine for cluster boundary rules
 152 #[derive(PartialEq,Eq)]
 153 enum GraphemeState {
 154     Start,
 155     FindExtend,
 156     HangulL,
 157     HangulLV,
 158     HangulLVT,
 159     Regional,
 160 }
 161
 162 impl<'a> Iterator for Graphemes<'a> {
 163     type Item = &'a str;
 164
 165     #[inline]
 166     fn size_hint(&self) -> (usize, Option<usize>) {
 167         let slen = self.string.len();
 168         (cmp::min(slen, 1), Some(slen))
 169     }
 170
 171     #[inline]
 172     fn next(&mut self) -> Option<&'a str> {
 173         use tables::grapheme as gr;
 174         if self.string.is_empty() {
 175             return None;
 176         }
 177
 178         let mut take_curr = true;
 179         let mut idx = 0;
 180         let mut state = Start;
 181         let mut cat = gr::GC_Any;
 182         for (curr, ch) in self.string.char_indices() {
 183             idx = curr;
 184
 185             // retrieve cached category, if any
 186             // We do this because most of the time we would end up
 187             // looking up each character twice.
 188             cat = match self.cat {
 189                 None => gr::grapheme_category(ch),
 190                 _ => self.cat.take().unwrap()
 191             };
 192
 193             if match cat {
 194                 gr::GC_Extend => true,
 195                 gr::GC_SpacingMark if self.extended => true,
 196                 _ => false
 197             } {
 198                     state = FindExtend;     // rule GB9/GB9a
 199                     continue;
 200             }
 201
 202             state = match state {
 203                 Start if '\r' == ch => {
 204                     let slen = self.string.len();
 205                     let nidx = idx + 1;
 206                     if nidx != slen && self.string.char_at(nidx) == '\n' {
 207                         idx = nidx;             // rule GB3
 208                     }
 209                     break;                      // rule GB4
 210                 }
 211                 Start => match cat {
 212                     gr::GC_Control => break,
 213                     gr::GC_L => HangulL,
 214                     gr::GC_LV | gr::GC_V => HangulLV,
 215                     gr::GC_LVT | gr::GC_T => HangulLVT,
 216                     gr::GC_Regional_Indicator => Regional,
 217                     _ => FindExtend
 218                 },
 219                 FindExtend => {         // found non-extending when looking for extending
 220                     take_curr = false;
 221                     break;
 222                 },
 223                 HangulL => match cat {      // rule GB6: L x (L|V|LV|LVT)
 224                     gr::GC_L => continue,
 225                     gr::GC_LV | gr::GC_V => HangulLV,
 226                     gr::GC_LVT => HangulLVT,
 227                     _ => {
 228                         take_curr = false;
 229                         break;
 230                     }
 231                 },
 232                 HangulLV => match cat {     // rule GB7: (LV|V) x (V|T)
 233                     gr::GC_V => continue,
 234                     gr::GC_T => HangulLVT,
 235                     _ => {
 236                         take_curr = false;
 237                         break;
 238                     }
 239                 },
 240                 HangulLVT => match cat {    // rule GB8: (LVT|T) x T
 241                     gr::GC_T => continue,
 242                     _ => {
 243                         take_curr = false;
 244                         break;
 245                     }
 246                 },
 247                 Regional => match cat {     // rule GB8a
 248                     gr::GC_Regional_Indicator => continue,
 249                     _ => {
 250                         take_curr = false;
 251                         break;
 252                     }
 253                 }
 254             }
 255         }
 256
 257         self.cat = if take_curr {
 258             idx = idx + self.string.char_at(idx).len_utf8();
 259             None
 260         } else {
 261             Some(cat)
 262         };
 263
 264         let retstr = &self.string[..idx];
 265         self.string = &self.string[idx..];
 266         Some(retstr)
 267     }
 268 }
 269
 270 impl<'a> DoubleEndedIterator for Graphemes<'a> {
 271     #[inline]
 272     fn next_back(&mut self) -> Option<&'a str> {
 273         use tables::grapheme as gr;
 274         if self.string.is_empty() {
 275             return None;
 276         }
 277
 278         let mut take_curr = true;
 279         let mut idx = self.string.len();
 280         let mut previdx = idx;
 281         let mut state = Start;
 282         let mut cat = gr::GC_Any;
 283         for (curr, ch) in self.string.char_indices().rev() {
 284             previdx = idx;
 285             idx = curr;
 286
 287             // cached category, if any
 288             cat = match self.catb {
 289                 None => gr::grapheme_category(ch),
 290                 _ => self.catb.take().unwrap()
 291             };
 292
 293             // a matching state machine that runs *backwards* across an input string
 294             // note that this has some implications for the Hangul matching, since
 295             // we now need to know what the rightward letter is:
 296             //
 297             // Right to left, we have:
 298             //      L x L
 299             //      V x (L|V|LV)
 300             //      T x (V|T|LV|LVT)
 301             // HangulL means the letter to the right is L
 302             // HangulLV means the letter to the right is V
 303             // HangulLVT means the letter to the right is T
 304             state = match state {
 305                 Start if '\n' == ch => {
 306                     if idx > 0 && '\r' == self.string.char_at_reverse(idx) {
 307                         idx -= 1;       // rule GB3
 308                     }
 309                     break;              // rule GB4
 310                 },
 311                 Start | FindExtend => match cat {
 312                     gr::GC_Extend => FindExtend,
 313                     gr::GC_SpacingMark if self.extended => FindExtend,
 314                     gr::GC_L | gr::GC_LV | gr::GC_LVT => HangulL,
 315                     gr::GC_V => HangulLV,
 316                     gr::GC_T => HangulLVT,
 317                     gr::GC_Regional_Indicator => Regional,
 318                     gr::GC_Control => {
 319                         take_curr = Start == state;
 320                         break;
 321                     },
 322                     _ => break
 323                 },
 324                 HangulL => match cat {      // char to right is an L
 325                     gr::GC_L => continue,               // L x L is the only legal match
 326                     _ => {
 327                         take_curr = false;
 328                         break;
 329                     }
 330                 },
 331                 HangulLV => match cat {     // char to right is a V
 332                     gr::GC_V => continue,               // V x V, right char is still V
 333                     gr::GC_L | gr::GC_LV => HangulL,    // (L|V) x V, right char is now L
 334                     _ => {
 335                         take_curr = false;
 336                         break;
 337                     }
 338                 },
 339                 HangulLVT => match cat {    // char to right is a T
 340                     gr::GC_T => continue,               // T x T, right char is still T
 341                     gr::GC_V => HangulLV,               // V x T, right char is now V
 342                     gr::GC_LV | gr::GC_LVT => HangulL,  // (LV|LVT) x T, right char is now L
 343                     _ => {
 344                         take_curr = false;
 345                         break;
 346                     }
 347                 },
 348                 Regional => match cat {     // rule GB8a
 349                     gr::GC_Regional_Indicator => continue,
 350                     _ => {
 351                         take_curr = false;
 352                         break;
 353                     }
 354                 }
 355             }
 356         }
 357
 358         self.catb = if take_curr {
 359             None
 360         } else  {
 361             idx = previdx;
 362             Some(cat)
 363         };
 364
 365         let retstr = &self.string[idx..];
 366         self.string = &self.string[..idx];
 367         Some(retstr)
 368     }
 369 }
 370
 371 // https://tools.ietf.org/html/rfc3629
 372 static UTF8_CHAR_WIDTH: [u8; 256] = [
 373 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 374 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
 375 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 376 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
 377 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 378 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
 379 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 380 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
 381 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 382 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
 383 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 384 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
 385 0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
 386 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
 387 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
 388 4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
 389 ];
 390
 391 /// Given a first byte, determine how many bytes are in this UTF-8 character
 392 #[inline]
 393 pub fn utf8_char_width(b: u8) -> usize {
 394     return UTF8_CHAR_WIDTH[b as usize] as usize;
 395 }
 396
 397 /// Determines if a vector of `u16` contains valid UTF-16
 398 pub fn is_utf16(v: &[u16]) -> bool {
 399     let mut it = v.iter();
 400     macro_rules! next { ($ret:expr) => {
 401             match it.next() { Some(u) => *u, None => return $ret }
 402         }
 403     }
 404     loop {
 405         let u = next!(true);
 406
 407         match char::from_u32(u as u32) {
 408             Some(_) => {}
 409             None => {
 410                 let u2 = next!(false);
 411                 if u < 0xD7FF || u > 0xDBFF ||
 412                     u2 < 0xDC00 || u2 > 0xDFFF { return false; }
 413             }
 414         }
 415     }
 416 }
 417
 418 /// An iterator that decodes UTF-16 encoded codepoints from a vector
 419 /// of `u16`s.
 420 #[derive(Clone)]
 421 pub struct Utf16Items<'a> {
 422     iter: slice::Iter<'a, u16>
 423 }
 424 /// The possibilities for values decoded from a `u16` stream.
 425 #[derive(Copy, PartialEq, Eq, Clone, Debug)]
 426 pub enum Utf16Item {
 427     /// A valid codepoint.
 428     ScalarValue(char),
 429     /// An invalid surrogate without its pair.
 430     LoneSurrogate(u16)
 431 }
 432
 433 impl Utf16Item {
 434     /// Convert `self` to a `char`, taking `LoneSurrogate`s to the
 435     /// replacement character (U+FFFD).
 436     #[inline]
 437     pub fn to_char_lossy(&self) -> char {
 438         match *self {
 439             Utf16Item::ScalarValue(c) => c,
 440             Utf16Item::LoneSurrogate(_) => '\u{FFFD}'
 441         }
 442     }
 443 }
 444
 445 impl<'a> Iterator for Utf16Items<'a> {
 446     type Item = Utf16Item;
 447
 448     fn next(&mut self) -> Option<Utf16Item> {
 449         let u = match self.iter.next() {
 450             Some(u) => *u,
 451             None => return None
 452         };
 453
 454         if u < 0xD800 || 0xDFFF < u {
 455             // not a surrogate
 456             Some(Utf16Item::ScalarValue(unsafe {mem::transmute(u as u32)}))
 457         } else if u >= 0xDC00 {
 458             // a trailing surrogate
 459             Some(Utf16Item::LoneSurrogate(u))
 460         } else {
 461             // preserve state for rewinding.
 462             let old = self.iter.clone();
 463
 464             let u2 = match self.iter.next() {
 465                 Some(u2) => *u2,
 466                 // eof
 467                 None => return Some(Utf16Item::LoneSurrogate(u))
 468             };
 469             if u2 < 0xDC00 || u2 > 0xDFFF {
 470                 // not a trailing surrogate so we're not a valid
 471                 // surrogate pair, so rewind to redecode u2 next time.
 472                 self.iter = old.clone();
 473                 return Some(Utf16Item::LoneSurrogate(u))
 474             }
 475
 476             // all ok, so lets decode it.
 477             let c = (((u - 0xD800) as u32) << 10 | (u2 - 0xDC00) as u32) + 0x1_0000;
 478             Some(Utf16Item::ScalarValue(unsafe {mem::transmute(c)}))
 479         }
 480     }
 481
 482     #[inline]
 483     fn size_hint(&self) -> (usize, Option<usize>) {
 484         let (low, high) = self.iter.size_hint();
 485         // we could be entirely valid surrogates (2 elements per
 486         // char), or entirely non-surrogates (1 element per char)
 487         (low / 2, high)
 488     }
 489 }
 490
 491 /// Create an iterator over the UTF-16 encoded codepoints in `v`,
 492 /// returning invalid surrogates as `LoneSurrogate`s.
 493 ///
 494 /// # Examples
 495 ///
 496 /// ```
 497 /// # #![feature(unicode)]
 498 /// extern crate rustc_unicode;
 499 ///
 500 /// use rustc_unicode::str::Utf16Item::{ScalarValue, LoneSurrogate};
 501 ///
 502 /// fn main() {
 503 ///     // 𝄞mus<invalid>ic<invalid>
 504 ///     let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
 505 ///              0x0073, 0xDD1E, 0x0069, 0x0063,
 506 ///              0xD834];
 507 ///
 508 ///     assert_eq!(rustc_unicode::str::utf16_items(&v).collect::<Vec<_>>(),
 509 ///                vec![ScalarValue('𝄞'),
 510 ///                     ScalarValue('m'), ScalarValue('u'), ScalarValue('s'),
 511 ///                     LoneSurrogate(0xDD1E),
 512 ///                     ScalarValue('i'), ScalarValue('c'),
 513 ///                     LoneSurrogate(0xD834)]);
 514 /// }
 515 /// ```
 516 pub fn utf16_items<'a>(v: &'a [u16]) -> Utf16Items<'a> {
 517     Utf16Items { iter : v.iter() }
 518 }
 519
 520 /// Iterator adaptor for encoding `char`s to UTF-16.
 521 #[derive(Clone)]
 522 pub struct Utf16Encoder<I> {
 523     chars: I,
 524     extra: u16
 525 }
 526
 527 impl<I> Utf16Encoder<I> {
 528     /// Create a UTF-16 encoder from any `char` iterator.
 529     pub fn new(chars: I) -> Utf16Encoder<I> where I: Iterator<Item=char> {
 530         Utf16Encoder { chars: chars, extra: 0 }
 531     }
 532 }
 533
 534 impl<I> Iterator for Utf16Encoder<I> where I: Iterator<Item=char> {
 535     type Item = u16;
 536
 537     #[inline]
 538     fn next(&mut self) -> Option<u16> {
 539         if self.extra != 0 {
 540             let tmp = self.extra;
 541             self.extra = 0;
 542             return Some(tmp);
 543         }
 544
 545         let mut buf = [0; 2];
 546         self.chars.next().map(|ch| {
 547             let n = CharExt::encode_utf16(ch, &mut buf).unwrap_or(0);
 548             if n == 2 { self.extra = buf[1]; }
 549             buf[0]
 550         })
 551     }
 552
 553     #[inline]
 554     fn size_hint(&self) -> (usize, Option<usize>) {
 555         let (low, high) = self.chars.size_hint();
 556         // every char gets either one u16 or two u16,
 557         // so this iterator is between 1 or 2 times as
 558         // long as the underlying iterator.
 559         (low, high.and_then(|n| n.checked_mul(2)))
 560     }
 561 }
 562
 563 impl<'a> Iterator for SplitWhitespace<'a> {
 564     type Item = &'a str;
 565
 566     fn next(&mut self) -> Option<&'a str> { self.inner.next() }
 567 }
 568 impl<'a> DoubleEndedIterator for SplitWhitespace<'a> {
 569     fn next_back(&mut self) -> Option<&'a str> { self.inner.next_back() }
 570 }