[rustc.git] / src / librustc_unicode / u_str.rs

// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//! Unicode-intensive string manipulations.
//!
//! This module provides functionality to `str` that requires the Unicode
//! methods provided by the unicode parts of the CharExt trait.

use core::char;
use core::iter::{Filter, FusedIterator};
use core::str::Split;

/// An iterator over the non-whitespace substrings of a string,
/// separated by any amount of whitespace.
///
/// Values of this type are created by `UnicodeStr::split_whitespace`.
#[stable(feature = "split_whitespace", since = "1.1.0")]
pub struct SplitWhitespace<'a> {
    // A `Split` driven by a `fn(char) -> bool` separator predicate, wrapped in
    // a `Filter` driven by a `fn(&&str) -> bool` predicate. Plain fn pointers
    // are used so that the full iterator type can be named in this field.
    inner: Filter<Split<'a, fn(char) -> bool>, fn(&&str) -> bool>,
}

/// Methods for Unicode string slices
#[allow(missing_docs)] // docs in libcollections
pub trait UnicodeStr {
    // Iterates over the non-empty pieces found between runs of whitespace.
    fn split_whitespace(&self) -> SplitWhitespace;

    // `true` when every `char` is whitespace (so also for the empty string).
    fn is_whitespace(&self) -> bool;

    // `true` when every `char` is alphanumeric (so also for the empty string).
    fn is_alphanumeric(&self) -> bool;

    // The slice with whitespace stripped from both ends.
    fn trim(&self) -> &str;

    // The slice with whitespace stripped from the start only.
    fn trim_left(&self) -> &str;

    // The slice with whitespace stripped from the end only.
    fn trim_right(&self) -> &str;
}

impl UnicodeStr for str {
    /// Splits on Unicode whitespace and drops the empty pieces that adjacent,
    /// leading or trailing whitespace would otherwise produce.
    #[inline]
    fn split_whitespace(&self) -> SplitWhitespace {
        // Named functions (not closures) so they can be cast to the plain
        // fn-pointer types that `SplitWhitespace` names in its field.
        fn is_separator(ch: char) -> bool {
            ch.is_whitespace()
        }
        fn has_content(piece: &&str) -> bool {
            !piece.is_empty()
        }

        let pieces = self.split(is_separator as fn(char) -> bool);
        SplitWhitespace { inner: pieces.filter(has_content as fn(&&str) -> bool) }
    }

    /// Returns `true` if no `char` in the slice is non-whitespace.
    #[inline]
    fn is_whitespace(&self) -> bool {
        !self.chars().any(|ch| !ch.is_whitespace())
    }

    /// Returns `true` if no `char` in the slice is non-alphanumeric.
    #[inline]
    fn is_alphanumeric(&self) -> bool {
        !self.chars().any(|ch| !ch.is_alphanumeric())
    }

    /// Strips whitespace from both ends of the slice.
    #[inline]
    fn trim(&self) -> &str {
        self.trim_matches(|ch: char| ch.is_whitespace())
    }

    /// Strips whitespace from the start of the slice.
    #[inline]
    fn trim_left(&self) -> &str {
        self.trim_left_matches(|ch: char| ch.is_whitespace())
    }

    /// Strips whitespace from the end of the slice.
    #[inline]
    fn trim_right(&self) -> &str {
        self.trim_right_matches(|ch: char| ch.is_whitespace())
    }
}

// Sequence length implied by each possible leading byte, indexed by the byte
// value; `0` marks bytes that cannot start a sequence.
// https://tools.ietf.org/html/rfc3629
static UTF8_CHAR_WIDTH: [u8; 256] = [
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x00..=0x0F
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x10..=0x1F
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x20..=0x2F
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x30..=0x3F
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x40..=0x4F
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x50..=0x5F
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x60..=0x6F
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x70..=0x7F
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x80..=0x8F
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x90..=0x9F
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xA0..=0xAF
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xB0..=0xBF
0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xC0..=0xCF
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xD0..=0xDF
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xE0..=0xEF
4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xF0..=0xFF
];

/// Given a first byte, determine how many bytes are in this UTF-8 character.
///
/// Returns `0` when `b` is not a valid first byte according to the lookup
/// table (`0x80..=0xC1` and `0xF5..=0xFF`).
#[inline]
pub fn utf8_char_width(b: u8) -> usize {
    // Both conversions are lossless widenings from `u8`.
    let width = UTF8_CHAR_WIDTH[usize::from(b)];
    usize::from(width)
}

/// Determines if a slice of `u16` contains valid UTF-16.
///
/// Returns `true` when every code unit is either a non-surrogate value or
/// part of a well-formed surrogate pair (a leading surrogate in
/// `0xD800..=0xDBFF` immediately followed by a trailing surrogate in
/// `0xDC00..=0xDFFF`). The empty slice is valid. Returns `false` for a
/// surrogate at the end of the input, a trailing surrogate with no leading
/// one, or a leading surrogate not followed by a trailing one.
pub fn is_utf16(v: &[u16]) -> bool {
    let mut units = v.iter().cloned();

    while let Some(u) = units.next() {
        // Anything that is a scalar value on its own needs no partner.
        if char::from_u32(u as u32).is_some() {
            continue;
        }

        // `u` is a surrogate, so it must be the first half of a pair;
        // running out of input here means the pair is incomplete.
        let u2 = match units.next() {
            Some(u2) => u2,
            None => return false,
        };

        let lead_ok = 0xD800 <= u && u <= 0xDBFF;
        let trail_ok = 0xDC00 <= u2 && u2 <= 0xDFFF;
        if !(lead_ok && trail_ok) {
            return false;
        }
    }

    true
}

/// Iterator adaptor for encoding `char`s to UTF-16.
///
/// Each `char` taken from the wrapped iterator is yielded as one `u16`, or
/// as two consecutive `u16`s when its UTF-16 encoding is a surrogate pair.
#[derive(Clone)]
pub struct Utf16Encoder<I> {
    chars: I,
    // Second unit of a two-unit encoding that still has to be yielded.
    // `0` means nothing is pending; `next` relies on a pending second unit
    // never being `0`.
    extra: u16,
}

impl<I> Utf16Encoder<I> {
    /// Create a UTF-16 encoder from any `char` iterator.
    pub fn new(chars: I) -> Utf16Encoder<I>
        where I: Iterator<Item = char>
    {
        Utf16Encoder {
            chars: chars,
            extra: 0,
        }
    }
}

impl<I> Iterator for Utf16Encoder<I>
    where I: Iterator<Item = char>
{
    type Item = u16;

    /// Yields the next UTF-16 code unit, or `None` once the wrapped iterator
    /// is exhausted and no second unit is pending.
    #[inline]
    fn next(&mut self) -> Option<u16> {
        // Finish a two-unit encoding before pulling another `char`.
        if self.extra != 0 {
            let tmp = self.extra;
            self.extra = 0;
            return Some(tmp);
        }

        let mut buf = [0; 2];
        self.chars.next().map(|ch| {
            let n = ch.encode_utf16(&mut buf).len();
            if n == 2 {
                // Hold the second unit back for the following call.
                self.extra = buf[1];
            }
            buf[0]
        })
    }

    /// Bounds the number of remaining code units.
    ///
    /// Every remaining `char` produces either one or two `u16`s, so the
    /// wrapped iterator's bounds are scaled by 1 and 2 respectively. A second
    /// unit already buffered in `extra` is one more guaranteed item and is
    /// added to both bounds. The upper bound becomes `None` on overflow.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let (low, high) = self.chars.size_hint();
        let pending = if self.extra != 0 { 1 } else { 0 };
        let upper = high.and_then(|n| n.checked_mul(2))
                        .and_then(|n| n.checked_add(pending));
        (low.saturating_add(pending), upper)
    }
}

// Marker impl, available only when the wrapped `char` iterator is itself fused.
#[unstable(feature = "fused", issue = "35602")]
impl<I: FusedIterator<Item = char>> FusedIterator for Utf16Encoder<I> {}

#[stable(feature = "split_whitespace", since = "1.1.0")]
impl<'a> Iterator for SplitWhitespace<'a> {
    type Item = &'a str;

    /// Yields the next piece from the front by delegating to the wrapped
    /// split-and-filter iterator.
    fn next(&mut self) -> Option<Self::Item> {
        let pieces = &mut self.inner;
        pieces.next()
    }
}

#[stable(feature = "split_whitespace", since = "1.1.0")]
impl<'a> DoubleEndedIterator for SplitWhitespace<'a> {
    /// Yields the next piece from the back by delegating to the wrapped
    /// split-and-filter iterator.
    fn next_back(&mut self) -> Option<&'a str> {
        let pieces = &mut self.inner;
        pieces.next_back()
    }
}

// Marker impl with no methods: declares `SplitWhitespace` as a fused iterator.
#[unstable(feature = "fused", issue = "35602")]
impl<'a> FusedIterator for SplitWhitespace<'a> {}
Commit	Line	Data
1a4d82fc JJ	1	// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
	2	// file at the top-level directory of this distribution and at
	3	// http://rust-lang.org/COPYRIGHT.
	4	//
	5	// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
	6	// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
	7	// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
	8	// option. This file may not be copied, modified, or distributed
	9	// except according to those terms.
1a4d82fc JJ	10
	11	//! Unicode-intensive string manipulations.
	12	//!
b039eaaf SL	13	//! This module provides functionality to `str` that requires the Unicode
b039eaaf SL	14	//! methods provided by the unicode parts of the CharExt trait.
1a4d82fc	15
1a4d82fc	16	use core::char;
9e0c209e	17	use core::iter::{Filter, FusedIterator};
1a4d82fc JJ	18	use core::str::Split;
1a4d82fc JJ	19
d9579d0f AL	20	/// An iterator over the non-whitespace substrings of a string,
	21	/// separated by any amount of whitespace.
	22	#[stable(feature = "split_whitespace", since = "1.1.0")]
	23	pub struct SplitWhitespace<'a> {
85aaf69f	24	inner: Filter<Split<'a, fn(char) -> bool>, fn(&&str) -> bool>,
1a4d82fc JJ	25	}
	26
	27	/// Methods for Unicode string slices
	28	#[allow(missing_docs)] // docs in libcollections
	29	pub trait UnicodeStr {
d9579d0f	30	fn split_whitespace<'a>(&'a self) -> SplitWhitespace<'a>;
1a4d82fc JJ	31	fn is_whitespace(&self) -> bool;
1a4d82fc JJ	32	fn is_alphanumeric(&self) -> bool;
7453a54e SL	33	fn trim(&self) -> &str;
	34	fn trim_left(&self) -> &str;
	35	fn trim_right(&self) -> &str;
1a4d82fc JJ	36	}
	37
	38	impl UnicodeStr for str {
d9579d0f AL	39	#[inline]
d9579d0f AL	40	fn split_whitespace(&self) -> SplitWhitespace {
b039eaaf SL	41	fn is_not_empty(s: &&str) -> bool {
	42	!s.is_empty()
	43	}
1a4d82fc JJ	44	let is_not_empty: fn(&&str) -> bool = is_not_empty; // coerce to fn pointer
1a4d82fc JJ	45
b039eaaf SL	46	fn is_whitespace(c: char) -> bool {
	47	c.is_whitespace()
	48	}
1a4d82fc JJ	49	let is_whitespace: fn(char) -> bool = is_whitespace; // coerce to fn pointer
1a4d82fc JJ	50
d9579d0f	51	SplitWhitespace { inner: self.split(is_whitespace).filter(is_not_empty) }
1a4d82fc JJ	52	}
	53
	54	#[inline]
b039eaaf SL	55	fn is_whitespace(&self) -> bool {
	56	self.chars().all(\|c\| c.is_whitespace())
	57	}
1a4d82fc JJ	58
1a4d82fc JJ	59	#[inline]
b039eaaf SL	60	fn is_alphanumeric(&self) -> bool {
	61	self.chars().all(\|c\| c.is_alphanumeric())
	62	}
1a4d82fc	63
1a4d82fc JJ	64	#[inline]
1a4d82fc JJ	65	fn trim(&self) -> &str {
c34b1796	66	self.trim_matches(\|c: char\| c.is_whitespace())
1a4d82fc JJ	67	}
	68
	69	#[inline]
	70	fn trim_left(&self) -> &str {
85aaf69f	71	self.trim_left_matches(\|c: char\| c.is_whitespace())
1a4d82fc JJ	72	}
	73
	74	#[inline]
	75	fn trim_right(&self) -> &str {
85aaf69f	76	self.trim_right_matches(\|c: char\| c.is_whitespace())
1a4d82fc JJ	77	}
	78	}
	79
1a4d82fc JJ	80	// https://tools.ietf.org/html/rfc3629
	81	static UTF8_CHAR_WIDTH: [u8; 256] = [
	82	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	83	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
	84	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	85	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
	86	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	87	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
	88	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	89	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
	90	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	91	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
	92	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	93	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
	94	0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
	95	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
	96	3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
	97	4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
	98	];
	99
	100	/// Given a first byte, determine how many bytes are in this UTF-8 character
	101	#[inline]
85aaf69f SL	102	pub fn utf8_char_width(b: u8) -> usize {
85aaf69f SL	103	return UTF8_CHAR_WIDTH[b as usize] as usize;
1a4d82fc JJ	104	}
	105
	106	/// Determines if a vector of `u16` contains valid UTF-16
	107	pub fn is_utf16(v: &[u16]) -> bool {
	108	let mut it = v.iter();
	109	macro_rules! next { ($ret:expr) => {
	110	match it.next() { Some(u) => *u, None => return $ret }
	111	}
	112	}
	113	loop {
	114	let u = next!(true);
	115
	116	match char::from_u32(u as u32) {
	117	Some(_) => {}
	118	None => {
	119	let u2 = next!(false);
b039eaaf SL	120	if u < 0xD7FF \|\| u > 0xDBFF \|\| u2 < 0xDC00 \|\| u2 > 0xDFFF {
	121	return false;
	122	}
1a4d82fc JJ	123	}
	124	}
	125	}
	126	}
	127
1a4d82fc JJ	128	/// Iterator adaptor for encoding `char`s to UTF-16.
	129	#[derive(Clone)]
	130	pub struct Utf16Encoder<I> {
	131	chars: I,
b039eaaf	132	extra: u16,
1a4d82fc JJ	133	}
	134
	135	impl<I> Utf16Encoder<I> {
d9579d0f	136	/// Create a UTF-16 encoder from any `char` iterator.
b039eaaf SL	137	pub fn new(chars: I) -> Utf16Encoder<I>
	138	where I: Iterator<Item = char>
	139	{
	140	Utf16Encoder {
	141	chars: chars,
	142	extra: 0,
	143	}
1a4d82fc JJ	144	}
	145	}
	146
3157f602 XL	147	impl<I> Iterator for Utf16Encoder<I>
	148	where I: Iterator<Item = char>
	149	{
1a4d82fc JJ	150	type Item = u16;
	151
	152	#[inline]
	153	fn next(&mut self) -> Option<u16> {
	154	if self.extra != 0 {
	155	let tmp = self.extra;
	156	self.extra = 0;
	157	return Some(tmp);
	158	}
	159
c30ab7b3	160	let mut buf = [0; 2];
1a4d82fc	161	self.chars.next().map(\|ch\| {
c30ab7b3 SL	162	let n = CharExt::encode_utf16(ch, &mut buf).len();
	163	if n == 2 {
	164	self.extra = buf[1];
b039eaaf	165	}
c30ab7b3	166	buf[0]
1a4d82fc JJ	167	})
	168	}
	169
	170	#[inline]
85aaf69f	171	fn size_hint(&self) -> (usize, Option<usize>) {
1a4d82fc JJ	172	let (low, high) = self.chars.size_hint();
	173	// every char gets either one u16 or two u16,
	174	// so this iterator is between 1 or 2 times as
	175	// long as the underlying iterator.
	176	(low, high.and_then(\|n\| n.checked_mul(2)))
	177	}
	178	}
	179
9e0c209e SL	180	#[unstable(feature = "fused", issue = "35602")]
	181	impl<I> FusedIterator for Utf16Encoder<I>
	182	where I: FusedIterator<Item = char> {}
	183
c30ab7b3	184	#[stable(feature = "split_whitespace", since = "1.1.0")]
d9579d0f	185	impl<'a> Iterator for SplitWhitespace<'a> {
1a4d82fc JJ	186	type Item = &'a str;
1a4d82fc JJ	187
b039eaaf SL	188	fn next(&mut self) -> Option<&'a str> {
	189	self.inner.next()
	190	}
1a4d82fc	191	}
c30ab7b3 SL	192
c30ab7b3 SL	193	#[stable(feature = "split_whitespace", since = "1.1.0")]
d9579d0f	194	impl<'a> DoubleEndedIterator for SplitWhitespace<'a> {
b039eaaf SL	195	fn next_back(&mut self) -> Option<&'a str> {
	196	self.inner.next_back()
	197	}
1a4d82fc	198	}
9e0c209e SL	199
	200	#[unstable(feature = "fused", issue = "35602")]
	201	impl<'a> FusedIterator for SplitWhitespace<'a> {}