1 // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
11 // ignore-lexer-test FIXME #15679
13 //! Unicode-intensive string manipulations.
15 //! This module provides functionality to `str` that requires the Unicode methods provided by the
16 //! unicode parts of the CharExt trait.
18 use self::GraphemeState
::*;
23 use core
::iter
::Filter
;
28 use tables
::grapheme
::GraphemeCat
;
30 /// An iterator over the words of a string, separated by a sequence of whitespace
31 #[unstable(feature = "str_words",
32 reason
= "words() will be replaced by split_whitespace() in 1.1.0")]
33 pub struct Words
<'a
> {
34 inner
: Filter
<Split
<'a
, fn(char) -> bool
>, fn(&&str) -> bool
>,
37 /// Methods for Unicode string slices
38 #[allow(missing_docs)] // docs in libcollections
39 pub trait UnicodeStr
{
40 fn graphemes
<'a
>(&'a
self, is_extended
: bool
) -> Graphemes
<'a
>;
41 fn grapheme_indices
<'a
>(&'a
self, is_extended
: bool
) -> GraphemeIndices
<'a
>;
42 fn words
<'a
>(&'a
self) -> Words
<'a
>;
43 fn is_whitespace(&self) -> bool
;
44 fn is_alphanumeric(&self) -> bool
;
45 fn width(&self, is_cjk
: bool
) -> usize;
46 fn trim
<'a
>(&'a
self) -> &'a
str;
47 fn trim_left
<'a
>(&'a
self) -> &'a
str;
48 fn trim_right
<'a
>(&'a
self) -> &'a
str;
51 impl UnicodeStr
for str {
53 fn graphemes(&self, is_extended
: bool
) -> Graphemes
{
54 Graphemes { string: self, extended: is_extended, cat: None, catb: None }
58 fn grapheme_indices(&self, is_extended
: bool
) -> GraphemeIndices
{
59 GraphemeIndices { start_offset: self.as_ptr() as usize, iter: self.graphemes(is_extended) }
63 fn words(&self) -> Words
{
64 fn is_not_empty(s
: &&str) -> bool { !s.is_empty() }
65 let is_not_empty
: fn(&&str) -> bool
= is_not_empty
; // coerce to fn pointer
67 fn is_whitespace(c
: char) -> bool { c.is_whitespace() }
68 let is_whitespace
: fn(char) -> bool
= is_whitespace
; // coerce to fn pointer
70 Words { inner: self.split(is_whitespace).filter(is_not_empty) }
74 fn is_whitespace(&self) -> bool { self.chars().all(|c| c.is_whitespace()) }
77 fn is_alphanumeric(&self) -> bool { self.chars().all(|c| c.is_alphanumeric()) }
80 fn width(&self, is_cjk
: bool
) -> usize {
81 self.chars().map(|c
| c
.width(is_cjk
).unwrap_or(0)).sum()
85 fn trim(&self) -> &str {
86 self.trim_matches(|c
: char| c
.is_whitespace())
90 fn trim_left(&self) -> &str {
91 self.trim_left_matches(|c
: char| c
.is_whitespace())
95 fn trim_right(&self) -> &str {
96 self.trim_right_matches(|c
: char| c
.is_whitespace())
100 /// External iterator for grapheme clusters and byte offsets.
102 pub struct GraphemeIndices
<'a
> {
107 impl<'a
> Iterator
for GraphemeIndices
<'a
> {
108 type Item
= (usize, &'a
str);
111 fn next(&mut self) -> Option
<(usize, &'a
str)> {
112 self.iter
.next().map(|s
| (s
.as_ptr() as usize - self.start_offset
, s
))
116 fn size_hint(&self) -> (usize, Option
<usize>) {
117 self.iter
.size_hint()
121 impl<'a
> DoubleEndedIterator
for GraphemeIndices
<'a
> {
123 fn next_back(&mut self) -> Option
<(usize, &'a
str)> {
124 self.iter
.next_back().map(|s
| (s
.as_ptr() as usize - self.start_offset
, s
))
128 /// External iterator for a string's
129 /// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries).
131 pub struct Graphemes
<'a
> {
134 cat
: Option
<GraphemeCat
>,
135 catb
: Option
<GraphemeCat
>,
138 // state machine for cluster boundary rules
139 #[derive(PartialEq,Eq)]
149 impl<'a
> Iterator
for Graphemes
<'a
> {
153 fn size_hint(&self) -> (usize, Option
<usize>) {
154 let slen
= self.string
.len();
155 (cmp
::min(slen
, 1), Some(slen
))
159 fn next(&mut self) -> Option
<&'a
str> {
160 use tables
::grapheme
as gr
;
161 if self.string
.is_empty() {
165 let mut take_curr
= true;
167 let mut state
= Start
;
168 let mut cat
= gr
::GC_Any
;
169 for (curr
, ch
) in self.string
.char_indices() {
172 // retrieve cached category, if any
173 // We do this because most of the time we would end up
174 // looking up each character twice.
175 cat
= match self.cat
{
176 None
=> gr
::grapheme_category(ch
),
177 _
=> self.cat
.take().unwrap()
181 gr
::GC_Extend
=> true,
182 gr
::GC_SpacingMark
if self.extended
=> true,
185 state
= FindExtend
; // rule GB9/GB9a
189 state
= match state
{
190 Start
if '
\r'
== ch
=> {
191 let slen
= self.string
.len();
193 if nidx
!= slen
&& self.string
.char_at(nidx
) == '
\n'
{
194 idx
= nidx
; // rule GB3
199 gr
::GC_Control
=> break,
201 gr
::GC_LV
| gr
::GC_V
=> HangulLV
,
202 gr
::GC_LVT
| gr
::GC_T
=> HangulLVT
,
203 gr
::GC_Regional_Indicator
=> Regional
,
206 FindExtend
=> { // found non-extending when looking for extending
210 HangulL
=> match cat
{ // rule GB6: L x (L|V|LV|LVT)
211 gr
::GC_L
=> continue,
212 gr
::GC_LV
| gr
::GC_V
=> HangulLV
,
213 gr
::GC_LVT
=> HangulLVT
,
219 HangulLV
=> match cat
{ // rule GB7: (LV|V) x (V|T)
220 gr
::GC_V
=> continue,
221 gr
::GC_T
=> HangulLVT
,
227 HangulLVT
=> match cat
{ // rule GB8: (LVT|T) x T
228 gr
::GC_T
=> continue,
234 Regional
=> match cat
{ // rule GB8a
235 gr
::GC_Regional_Indicator
=> continue,
244 self.cat
= if take_curr
{
245 idx
= idx
+ self.string
.char_at(idx
).len_utf8();
251 let retstr
= &self.string
[..idx
];
252 self.string
= &self.string
[idx
..];
257 impl<'a
> DoubleEndedIterator
for Graphemes
<'a
> {
259 fn next_back(&mut self) -> Option
<&'a
str> {
260 use tables
::grapheme
as gr
;
261 if self.string
.is_empty() {
265 let mut take_curr
= true;
266 let mut idx
= self.string
.len();
267 let mut previdx
= idx
;
268 let mut state
= Start
;
269 let mut cat
= gr
::GC_Any
;
270 for (curr
, ch
) in self.string
.char_indices().rev() {
274 // cached category, if any
275 cat
= match self.catb
{
276 None
=> gr
::grapheme_category(ch
),
277 _
=> self.catb
.take().unwrap()
280 // a matching state machine that runs *backwards* across an input string
281 // note that this has some implications for the Hangul matching, since
282 // we now need to know what the rightward letter is:
284 // Right to left, we have:
288 // HangulL means the letter to the right is L
289 // HangulLV means the letter to the right is V
290 // HangulLVT means the letter to the right is T
291 state
= match state
{
292 Start
if '
\n'
== ch
=> {
293 if idx
> 0 && '
\r'
== self.string
.char_at_reverse(idx
) {
294 idx
-= 1; // rule GB3
298 Start
| FindExtend
=> match cat
{
299 gr
::GC_Extend
=> FindExtend
,
300 gr
::GC_SpacingMark
if self.extended
=> FindExtend
,
301 gr
::GC_L
| gr
::GC_LV
| gr
::GC_LVT
=> HangulL
,
302 gr
::GC_V
=> HangulLV
,
303 gr
::GC_T
=> HangulLVT
,
304 gr
::GC_Regional_Indicator
=> Regional
,
306 take_curr
= Start
== state
;
311 HangulL
=> match cat
{ // char to right is an L
312 gr
::GC_L
=> continue, // L x L is the only legal match
318 HangulLV
=> match cat
{ // char to right is a V
319 gr
::GC_V
=> continue, // V x V, right char is still V
320 gr
::GC_L
| gr
::GC_LV
=> HangulL
, // (L|V) x V, right char is now L
326 HangulLVT
=> match cat
{ // char to right is a T
327 gr
::GC_T
=> continue, // T x T, right char is still T
328 gr
::GC_V
=> HangulLV
, // V x T, right char is now V
329 gr
::GC_LV
| gr
::GC_LVT
=> HangulL
, // (LV|LVT) x T, right char is now L
335 Regional
=> match cat
{ // rule GB8a
336 gr
::GC_Regional_Indicator
=> continue,
345 self.catb
= if take_curr
{
352 let retstr
= &self.string
[idx
..];
353 self.string
= &self.string
[..idx
];
358 // https://tools.ietf.org/html/rfc3629
359 static UTF8_CHAR_WIDTH
: [u8; 256] = [
360 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
361 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
362 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
363 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
364 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
365 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
366 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
367 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
368 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
369 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
370 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
371 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
372 0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
373 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
374 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
375 4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
378 /// Given a first byte, determine how many bytes are in this UTF-8 character
380 pub fn utf8_char_width(b
: u8) -> usize {
381 return UTF8_CHAR_WIDTH
[b
as usize] as usize;
384 /// Determines if a vector of `u16` contains valid UTF-16
385 pub fn is_utf16(v
: &[u16]) -> bool
{
386 let mut it
= v
.iter();
387 macro_rules
! next
{ ($ret
:expr
) => {
388 match it
.next() { Some(u) => *u, None => return $ret }
394 match char::from_u32(u
as u32) {
397 let u2
= next
!(false);
398 if u
< 0xD7FF || u
> 0xDBFF ||
399 u2
< 0xDC00 || u2
> 0xDFFF { return false; }
405 /// An iterator that decodes UTF-16 encoded codepoints from a vector
408 pub struct Utf16Items
<'a
> {
409 iter
: slice
::Iter
<'a
, u16>
411 /// The possibilities for values decoded from a `u16` stream.
412 #[derive(Copy, PartialEq, Eq, Clone, Debug)]
414 /// A valid codepoint.
416 /// An invalid surrogate without its pair.
421 /// Convert `self` to a `char`, taking `LoneSurrogate`s to the
422 /// replacement character (U+FFFD).
424 pub fn to_char_lossy(&self) -> char {
426 Utf16Item
::ScalarValue(c
) => c
,
427 Utf16Item
::LoneSurrogate(_
) => '
\u{FFFD}'
432 impl<'a
> Iterator
for Utf16Items
<'a
> {
433 type Item
= Utf16Item
;
435 fn next(&mut self) -> Option
<Utf16Item
> {
436 let u
= match self.iter
.next() {
441 if u
< 0xD800 || 0xDFFF < u
{
443 Some(Utf16Item
::ScalarValue(unsafe {mem::transmute(u as u32)}
))
444 } else if u
>= 0xDC00 {
445 // a trailing surrogate
446 Some(Utf16Item
::LoneSurrogate(u
))
448 // preserve state for rewinding.
449 let old
= self.iter
.clone();
451 let u2
= match self.iter
.next() {
454 None
=> return Some(Utf16Item
::LoneSurrogate(u
))
456 if u2
< 0xDC00 || u2
> 0xDFFF {
457 // not a trailing surrogate so we're not a valid
458 // surrogate pair, so rewind to redecode u2 next time.
459 self.iter
= old
.clone();
460 return Some(Utf16Item
::LoneSurrogate(u
))
463 // all ok, so lets decode it.
464 let c
= (((u
- 0xD800) as u32) << 10 | (u2
- 0xDC00) as u32) + 0x1_0000;
465 Some(Utf16Item
::ScalarValue(unsafe {mem::transmute(c)}
))
470 fn size_hint(&self) -> (usize, Option
<usize>) {
471 let (low
, high
) = self.iter
.size_hint();
472 // we could be entirely valid surrogates (2 elements per
473 // char), or entirely non-surrogates (1 element per char)
478 /// Create an iterator over the UTF-16 encoded codepoints in `v`,
479 /// returning invalid surrogates as `LoneSurrogate`s.
484 /// # #![feature(unicode)]
485 /// extern crate unicode;
487 /// use unicode::str::Utf16Item::{ScalarValue, LoneSurrogate};
490 /// // 𝄞mus<invalid>ic<invalid>
491 /// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
492 /// 0x0073, 0xDD1E, 0x0069, 0x0063,
495 /// assert_eq!(unicode::str::utf16_items(&v).collect::<Vec<_>>(),
496 /// vec![ScalarValue('𝄞'),
497 /// ScalarValue('m'), ScalarValue('u'), ScalarValue('s'),
498 /// LoneSurrogate(0xDD1E),
499 /// ScalarValue('i'), ScalarValue('c'),
500 /// LoneSurrogate(0xD834)]);
503 pub fn utf16_items
<'a
>(v
: &'a
[u16]) -> Utf16Items
<'a
> {
504 Utf16Items { iter : v.iter() }
507 /// Iterator adaptor for encoding `char`s to UTF-16.
509 pub struct Utf16Encoder
<I
> {
514 impl<I
> Utf16Encoder
<I
> {
515 /// Create an UTF-16 encoder from any `char` iterator.
516 pub fn new(chars
: I
) -> Utf16Encoder
<I
> where I
: Iterator
<Item
=char> {
517 Utf16Encoder { chars: chars, extra: 0 }
521 impl<I
> Iterator
for Utf16Encoder
<I
> where I
: Iterator
<Item
=char> {
525 fn next(&mut self) -> Option
<u16> {
527 let tmp
= self.extra
;
532 let mut buf
= [0; 2];
533 self.chars
.next().map(|ch
| {
534 let n
= CharExt
::encode_utf16(ch
, &mut buf
).unwrap_or(0);
535 if n
== 2 { self.extra = buf[1]; }
541 fn size_hint(&self) -> (usize, Option
<usize>) {
542 let (low
, high
) = self.chars
.size_hint();
543 // every char gets either one u16 or two u16,
544 // so this iterator is between 1 or 2 times as
545 // long as the underlying iterator.
546 (low
, high
.and_then(|n
| n
.checked_mul(2)))
550 impl<'a
> Iterator
for Words
<'a
> {
553 fn next(&mut self) -> Option
<&'a
str> { self.inner.next() }
555 impl<'a
> DoubleEndedIterator
for Words
<'a
> {
556 fn next_back(&mut self) -> Option
<&'a
str> { self.inner.next_back() }