src/libunicode/u_str.rs

   1 // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
   2 // file at the top-level directory of this distribution and at
   3 // http://rust-lang.org/COPYRIGHT.
   4 //
   5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8 // option. This file may not be copied, modified, or distributed
   9 // except according to those terms.
  10 //
  11 // ignore-lexer-test FIXME #15679
  12
  13 //! Unicode-intensive string manipulations.
  14 //!
  15 //! This module provides functionality to `str` that requires the Unicode methods provided by the
  16 //! unicode parts of the CharExt trait.
  17
  18 use self::GraphemeState::*;
  19 use core::prelude::*;
  20
  21 use core::char;
  22 use core::cmp;
  23 use core::iter::Filter;
  24 use core::mem;
  25 use core::slice;
  26 use core::str::Split;
  27
  28 use tables::grapheme::GraphemeCat;
  29
  30 /// An iterator over the words of a string, separated by a sequence of whitespace
  31 #[unstable(feature = "str_words",
  32            reason = "words() will be replaced by split_whitespace() in 1.1.0")]
  33 pub struct Words<'a> {
  34     inner: Filter<Split<'a, fn(char) -> bool>, fn(&&str) -> bool>,
  35 }
  36
  37 /// Methods for Unicode string slices
  38 #[allow(missing_docs)] // docs in libcollections
  39 pub trait UnicodeStr {
  40     fn graphemes<'a>(&'a self, is_extended: bool) -> Graphemes<'a>;
  41     fn grapheme_indices<'a>(&'a self, is_extended: bool) -> GraphemeIndices<'a>;
  42     fn words<'a>(&'a self) -> Words<'a>;
  43     fn is_whitespace(&self) -> bool;
  44     fn is_alphanumeric(&self) -> bool;
  45     fn width(&self, is_cjk: bool) -> usize;
  46     fn trim<'a>(&'a self) -> &'a str;
  47     fn trim_left<'a>(&'a self) -> &'a str;
  48     fn trim_right<'a>(&'a self) -> &'a str;
  49 }
  50
  51 impl UnicodeStr for str {
  52     #[inline]
  53     fn graphemes(&self, is_extended: bool) -> Graphemes {
  54         Graphemes { string: self, extended: is_extended, cat: None, catb: None }
  55     }
  56
  57     #[inline]
  58     fn grapheme_indices(&self, is_extended: bool) -> GraphemeIndices {
  59         GraphemeIndices { start_offset: self.as_ptr() as usize, iter: self.graphemes(is_extended) }
  60     }
  61
  62     #[inline]
  63     fn words(&self) -> Words {
  64         fn is_not_empty(s: &&str) -> bool { !s.is_empty() }
  65         let is_not_empty: fn(&&str) -> bool = is_not_empty; // coerce to fn pointer
  66
  67         fn is_whitespace(c: char) -> bool { c.is_whitespace() }
  68         let is_whitespace: fn(char) -> bool = is_whitespace; // coerce to fn pointer
  69
  70         Words { inner: self.split(is_whitespace).filter(is_not_empty) }
  71     }
  72
  73     #[inline]
  74     fn is_whitespace(&self) -> bool { self.chars().all(|c| c.is_whitespace()) }
  75
  76     #[inline]
  77     fn is_alphanumeric(&self) -> bool { self.chars().all(|c| c.is_alphanumeric()) }
  78
  79     #[inline]
  80     fn width(&self, is_cjk: bool) -> usize {
  81         self.chars().map(|c| c.width(is_cjk).unwrap_or(0)).sum()
  82     }
  83
  84     #[inline]
  85     fn trim(&self) -> &str {
  86         self.trim_matches(|c: char| c.is_whitespace())
  87     }
  88
  89     #[inline]
  90     fn trim_left(&self) -> &str {
  91         self.trim_left_matches(|c: char| c.is_whitespace())
  92     }
  93
  94     #[inline]
  95     fn trim_right(&self) -> &str {
  96         self.trim_right_matches(|c: char| c.is_whitespace())
  97     }
  98 }
  99
 100 /// External iterator for grapheme clusters and byte offsets.
 101 #[derive(Clone)]
 102 pub struct GraphemeIndices<'a> {
 103     start_offset: usize,
 104     iter: Graphemes<'a>,
 105 }
 106
 107 impl<'a> Iterator for GraphemeIndices<'a> {
 108     type Item = (usize, &'a str);
 109
 110     #[inline]
 111     fn next(&mut self) -> Option<(usize, &'a str)> {
 112         self.iter.next().map(|s| (s.as_ptr() as usize - self.start_offset, s))
 113     }
 114
 115     #[inline]
 116     fn size_hint(&self) -> (usize, Option<usize>) {
 117         self.iter.size_hint()
 118     }
 119 }
 120
 121 impl<'a> DoubleEndedIterator for GraphemeIndices<'a> {
 122     #[inline]
 123     fn next_back(&mut self) -> Option<(usize, &'a str)> {
 124         self.iter.next_back().map(|s| (s.as_ptr() as usize - self.start_offset, s))
 125     }
 126 }
 127
 128 /// External iterator for a string's
 129 /// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries).
 130 #[derive(Clone)]
 131 pub struct Graphemes<'a> {
 132     string: &'a str,
 133     extended: bool,
 134     cat: Option<GraphemeCat>,
 135     catb: Option<GraphemeCat>,
 136 }
 137
 138 // state machine for cluster boundary rules
 139 #[derive(PartialEq,Eq)]
 140 enum GraphemeState {
 141     Start,
 142     FindExtend,
 143     HangulL,
 144     HangulLV,
 145     HangulLVT,
 146     Regional,
 147 }
 148
 149 impl<'a> Iterator for Graphemes<'a> {
 150     type Item = &'a str;
 151
 152     #[inline]
 153     fn size_hint(&self) -> (usize, Option<usize>) {
 154         let slen = self.string.len();
 155         (cmp::min(slen, 1), Some(slen))
 156     }
 157
 158     #[inline]
 159     fn next(&mut self) -> Option<&'a str> {
 160         use tables::grapheme as gr;
 161         if self.string.is_empty() {
 162             return None;
 163         }
 164
 165         let mut take_curr = true;
 166         let mut idx = 0;
 167         let mut state = Start;
 168         let mut cat = gr::GC_Any;
 169         for (curr, ch) in self.string.char_indices() {
 170             idx = curr;
 171
 172             // retrieve cached category, if any
 173             // We do this because most of the time we would end up
 174             // looking up each character twice.
 175             cat = match self.cat {
 176                 None => gr::grapheme_category(ch),
 177                 _ => self.cat.take().unwrap()
 178             };
 179
 180             if match cat {
 181                 gr::GC_Extend => true,
 182                 gr::GC_SpacingMark if self.extended => true,
 183                 _ => false
 184             } {
 185                     state = FindExtend;     // rule GB9/GB9a
 186                     continue;
 187             }
 188
 189             state = match state {
 190                 Start if '\r' == ch => {
 191                     let slen = self.string.len();
 192                     let nidx = idx + 1;
 193                     if nidx != slen && self.string.char_at(nidx) == '\n' {
 194                         idx = nidx;             // rule GB3
 195                     }
 196                     break;                      // rule GB4
 197                 }
 198                 Start => match cat {
 199                     gr::GC_Control => break,
 200                     gr::GC_L => HangulL,
 201                     gr::GC_LV | gr::GC_V => HangulLV,
 202                     gr::GC_LVT | gr::GC_T => HangulLVT,
 203                     gr::GC_Regional_Indicator => Regional,
 204                     _ => FindExtend
 205                 },
 206                 FindExtend => {         // found non-extending when looking for extending
 207                     take_curr = false;
 208                     break;
 209                 },
 210                 HangulL => match cat {      // rule GB6: L x (L|V|LV|LVT)
 211                     gr::GC_L => continue,
 212                     gr::GC_LV | gr::GC_V => HangulLV,
 213                     gr::GC_LVT => HangulLVT,
 214                     _ => {
 215                         take_curr = false;
 216                         break;
 217                     }
 218                 },
 219                 HangulLV => match cat {     // rule GB7: (LV|V) x (V|T)
 220                     gr::GC_V => continue,
 221                     gr::GC_T => HangulLVT,
 222                     _ => {
 223                         take_curr = false;
 224                         break;
 225                     }
 226                 },
 227                 HangulLVT => match cat {    // rule GB8: (LVT|T) x T
 228                     gr::GC_T => continue,
 229                     _ => {
 230                         take_curr = false;
 231                         break;
 232                     }
 233                 },
 234                 Regional => match cat {     // rule GB8a
 235                     gr::GC_Regional_Indicator => continue,
 236                     _ => {
 237                         take_curr = false;
 238                         break;
 239                     }
 240                 }
 241             }
 242         }
 243
 244         self.cat = if take_curr {
 245             idx = idx + self.string.char_at(idx).len_utf8();
 246             None
 247         } else {
 248             Some(cat)
 249         };
 250
 251         let retstr = &self.string[..idx];
 252         self.string = &self.string[idx..];
 253         Some(retstr)
 254     }
 255 }
 256
 257 impl<'a> DoubleEndedIterator for Graphemes<'a> {
 258     #[inline]
 259     fn next_back(&mut self) -> Option<&'a str> {
 260         use tables::grapheme as gr;
 261         if self.string.is_empty() {
 262             return None;
 263         }
 264
 265         let mut take_curr = true;
 266         let mut idx = self.string.len();
 267         let mut previdx = idx;
 268         let mut state = Start;
 269         let mut cat = gr::GC_Any;
 270         for (curr, ch) in self.string.char_indices().rev() {
 271             previdx = idx;
 272             idx = curr;
 273
 274             // cached category, if any
 275             cat = match self.catb {
 276                 None => gr::grapheme_category(ch),
 277                 _ => self.catb.take().unwrap()
 278             };
 279
 280             // a matching state machine that runs *backwards* across an input string
 281             // note that this has some implications for the Hangul matching, since
 282             // we now need to know what the rightward letter is:
 283             //
 284             // Right to left, we have:
 285             //      L x L
 286             //      V x (L|V|LV)
 287             //      T x (V|T|LV|LVT)
 288             // HangulL means the letter to the right is L
 289             // HangulLV means the letter to the right is V
 290             // HangulLVT means the letter to the right is T
 291             state = match state {
 292                 Start if '\n' == ch => {
 293                     if idx > 0 && '\r' == self.string.char_at_reverse(idx) {
 294                         idx -= 1;       // rule GB3
 295                     }
 296                     break;              // rule GB4
 297                 },
 298                 Start | FindExtend => match cat {
 299                     gr::GC_Extend => FindExtend,
 300                     gr::GC_SpacingMark if self.extended => FindExtend,
 301                     gr::GC_L | gr::GC_LV | gr::GC_LVT => HangulL,
 302                     gr::GC_V => HangulLV,
 303                     gr::GC_T => HangulLVT,
 304                     gr::GC_Regional_Indicator => Regional,
 305                     gr::GC_Control => {
 306                         take_curr = Start == state;
 307                         break;
 308                     },
 309                     _ => break
 310                 },
 311                 HangulL => match cat {      // char to right is an L
 312                     gr::GC_L => continue,               // L x L is the only legal match
 313                     _ => {
 314                         take_curr = false;
 315                         break;
 316                     }
 317                 },
 318                 HangulLV => match cat {     // char to right is a V
 319                     gr::GC_V => continue,               // V x V, right char is still V
 320                     gr::GC_L | gr::GC_LV => HangulL,    // (L|V) x V, right char is now L
 321                     _ => {
 322                         take_curr = false;
 323                         break;
 324                     }
 325                 },
 326                 HangulLVT => match cat {    // char to right is a T
 327                     gr::GC_T => continue,               // T x T, right char is still T
 328                     gr::GC_V => HangulLV,               // V x T, right char is now V
 329                     gr::GC_LV | gr::GC_LVT => HangulL,  // (LV|LVT) x T, right char is now L
 330                     _ => {
 331                         take_curr = false;
 332                         break;
 333                     }
 334                 },
 335                 Regional => match cat {     // rule GB8a
 336                     gr::GC_Regional_Indicator => continue,
 337                     _ => {
 338                         take_curr = false;
 339                         break;
 340                     }
 341                 }
 342             }
 343         }
 344
 345         self.catb = if take_curr {
 346             None
 347         } else  {
 348             idx = previdx;
 349             Some(cat)
 350         };
 351
 352         let retstr = &self.string[idx..];
 353         self.string = &self.string[..idx];
 354         Some(retstr)
 355     }
 356 }
 357
 358 // https://tools.ietf.org/html/rfc3629
 359 static UTF8_CHAR_WIDTH: [u8; 256] = [
 360 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 361 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
 362 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 363 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
 364 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 365 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
 366 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 367 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
 368 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 369 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
 370 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 371 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
 372 0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
 373 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
 374 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
 375 4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
 376 ];
 377
 378 /// Given a first byte, determine how many bytes are in this UTF-8 character
 379 #[inline]
 380 pub fn utf8_char_width(b: u8) -> usize {
 381     return UTF8_CHAR_WIDTH[b as usize] as usize;
 382 }
 383
 384 /// Determines if a vector of `u16` contains valid UTF-16
 385 pub fn is_utf16(v: &[u16]) -> bool {
 386     let mut it = v.iter();
 387     macro_rules! next { ($ret:expr) => {
 388             match it.next() { Some(u) => *u, None => return $ret }
 389         }
 390     }
 391     loop {
 392         let u = next!(true);
 393
 394         match char::from_u32(u as u32) {
 395             Some(_) => {}
 396             None => {
 397                 let u2 = next!(false);
 398                 if u < 0xD7FF || u > 0xDBFF ||
 399                     u2 < 0xDC00 || u2 > 0xDFFF { return false; }
 400             }
 401         }
 402     }
 403 }
 404
 405 /// An iterator that decodes UTF-16 encoded codepoints from a vector
 406 /// of `u16`s.
 407 #[derive(Clone)]
 408 pub struct Utf16Items<'a> {
 409     iter: slice::Iter<'a, u16>
 410 }
 411 /// The possibilities for values decoded from a `u16` stream.
 412 #[derive(Copy, PartialEq, Eq, Clone, Debug)]
 413 pub enum Utf16Item {
 414     /// A valid codepoint.
 415     ScalarValue(char),
 416     /// An invalid surrogate without its pair.
 417     LoneSurrogate(u16)
 418 }
 419
 420 impl Utf16Item {
 421     /// Convert `self` to a `char`, taking `LoneSurrogate`s to the
 422     /// replacement character (U+FFFD).
 423     #[inline]
 424     pub fn to_char_lossy(&self) -> char {
 425         match *self {
 426             Utf16Item::ScalarValue(c) => c,
 427             Utf16Item::LoneSurrogate(_) => '\u{FFFD}'
 428         }
 429     }
 430 }
 431
 432 impl<'a> Iterator for Utf16Items<'a> {
 433     type Item = Utf16Item;
 434
 435     fn next(&mut self) -> Option<Utf16Item> {
 436         let u = match self.iter.next() {
 437             Some(u) => *u,
 438             None => return None
 439         };
 440
 441         if u < 0xD800 || 0xDFFF < u {
 442             // not a surrogate
 443             Some(Utf16Item::ScalarValue(unsafe {mem::transmute(u as u32)}))
 444         } else if u >= 0xDC00 {
 445             // a trailing surrogate
 446             Some(Utf16Item::LoneSurrogate(u))
 447         } else {
 448             // preserve state for rewinding.
 449             let old = self.iter.clone();
 450
 451             let u2 = match self.iter.next() {
 452                 Some(u2) => *u2,
 453                 // eof
 454                 None => return Some(Utf16Item::LoneSurrogate(u))
 455             };
 456             if u2 < 0xDC00 || u2 > 0xDFFF {
 457                 // not a trailing surrogate so we're not a valid
 458                 // surrogate pair, so rewind to redecode u2 next time.
 459                 self.iter = old.clone();
 460                 return Some(Utf16Item::LoneSurrogate(u))
 461             }
 462
 463             // all ok, so lets decode it.
 464             let c = (((u - 0xD800) as u32) << 10 | (u2 - 0xDC00) as u32) + 0x1_0000;
 465             Some(Utf16Item::ScalarValue(unsafe {mem::transmute(c)}))
 466         }
 467     }
 468
 469     #[inline]
 470     fn size_hint(&self) -> (usize, Option<usize>) {
 471         let (low, high) = self.iter.size_hint();
 472         // we could be entirely valid surrogates (2 elements per
 473         // char), or entirely non-surrogates (1 element per char)
 474         (low / 2, high)
 475     }
 476 }
 477
 478 /// Create an iterator over the UTF-16 encoded codepoints in `v`,
 479 /// returning invalid surrogates as `LoneSurrogate`s.
 480 ///
 481 /// # Examples
 482 ///
 483 /// ```
 484 /// # #![feature(unicode)]
 485 /// extern crate unicode;
 486 ///
 487 /// use unicode::str::Utf16Item::{ScalarValue, LoneSurrogate};
 488 ///
 489 /// fn main() {
 490 ///     // 𝄞mus<invalid>ic<invalid>
 491 ///     let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
 492 ///              0x0073, 0xDD1E, 0x0069, 0x0063,
 493 ///              0xD834];
 494 ///
 495 ///     assert_eq!(unicode::str::utf16_items(&v).collect::<Vec<_>>(),
 496 ///                vec![ScalarValue('𝄞'),
 497 ///                     ScalarValue('m'), ScalarValue('u'), ScalarValue('s'),
 498 ///                     LoneSurrogate(0xDD1E),
 499 ///                     ScalarValue('i'), ScalarValue('c'),
 500 ///                     LoneSurrogate(0xD834)]);
 501 /// }
 502 /// ```
 503 pub fn utf16_items<'a>(v: &'a [u16]) -> Utf16Items<'a> {
 504     Utf16Items { iter : v.iter() }
 505 }
 506
 507 /// Iterator adaptor for encoding `char`s to UTF-16.
 508 #[derive(Clone)]
 509 pub struct Utf16Encoder<I> {
 510     chars: I,
 511     extra: u16
 512 }
 513
 514 impl<I> Utf16Encoder<I> {
 515     /// Create an UTF-16 encoder from any `char` iterator.
 516     pub fn new(chars: I) -> Utf16Encoder<I> where I: Iterator<Item=char> {
 517         Utf16Encoder { chars: chars, extra: 0 }
 518     }
 519 }
 520
 521 impl<I> Iterator for Utf16Encoder<I> where I: Iterator<Item=char> {
 522     type Item = u16;
 523
 524     #[inline]
 525     fn next(&mut self) -> Option<u16> {
 526         if self.extra != 0 {
 527             let tmp = self.extra;
 528             self.extra = 0;
 529             return Some(tmp);
 530         }
 531
 532         let mut buf = [0; 2];
 533         self.chars.next().map(|ch| {
 534             let n = CharExt::encode_utf16(ch, &mut buf).unwrap_or(0);
 535             if n == 2 { self.extra = buf[1]; }
 536             buf[0]
 537         })
 538     }
 539
 540     #[inline]
 541     fn size_hint(&self) -> (usize, Option<usize>) {
 542         let (low, high) = self.chars.size_hint();
 543         // every char gets either one u16 or two u16,
 544         // so this iterator is between 1 or 2 times as
 545         // long as the underlying iterator.
 546         (low, high.and_then(|n| n.checked_mul(2)))
 547     }
 548 }
 549
 550 impl<'a> Iterator for Words<'a> {
 551     type Item = &'a str;
 552
 553     fn next(&mut self) -> Option<&'a str> { self.inner.next() }
 554 }
 555 impl<'a> DoubleEndedIterator for Words<'a> {
 556     fn next_back(&mut self) -> Option<&'a str> { self.inner.next_back() }
 557 }