[rustc.git] / src / librustc_unicode / u_str.rs

// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//! Unicode-intensive string manipulations.
//!
//! This module provides functionality to `str` that requires the Unicode
//! methods provided by the unicode parts of the CharExt trait.

use char::{DecodeUtf16, decode_utf16};
use core::char;
use core::iter::{Cloned, Filter};
use core::slice;
use core::str::Split;

/// An iterator over the non-whitespace substrings of a string,
/// separated by any amount of whitespace.
///
/// Values of this type are created by `UnicodeStr::split_whitespace`.
#[stable(feature = "split_whitespace", since = "1.1.0")]
pub struct SplitWhitespace<'a> {
    // The string split on a `char` predicate, with a second predicate
    // filtering the resulting pieces. Both callbacks are stored as plain
    // `fn` pointers so that the complete iterator type can be named here.
    inner: Filter<Split<'a, fn(char) -> bool>, fn(&&str) -> bool>,
}

/// Methods for Unicode string slices
#[allow(missing_docs)] // docs in libcollections
pub trait UnicodeStr {
    // Iterator over the non-empty substrings separated by whitespace.
    fn split_whitespace<'a>(&'a self) -> SplitWhitespace<'a>;
    // Whether every `char` of the string is whitespace.
    fn is_whitespace(&self) -> bool;
    // Whether every `char` of the string is alphanumeric.
    fn is_alphanumeric(&self) -> bool;
    // The string with whitespace removed from both ends.
    fn trim<'a>(&'a self) -> &'a str;
    // The string with whitespace removed from its start only.
    fn trim_left<'a>(&'a self) -> &'a str;
    // The string with whitespace removed from its end only.
    fn trim_right<'a>(&'a self) -> &'a str;
}

impl UnicodeStr for str {
    // Splits on every whitespace `char` and drops the empty pieces that the
    // split leaves behind.
    #[inline]
    fn split_whitespace(&self) -> SplitWhitespace {
        fn whitespace(c: char) -> bool {
            c.is_whitespace()
        }
        fn non_empty(piece: &&str) -> bool {
            !piece.is_empty()
        }

        // `SplitWhitespace` names plain `fn` pointer types, so both helpers
        // are coerced to pointers before being handed to the adaptors.
        let split_on: fn(char) -> bool = whitespace;
        let keep: fn(&&str) -> bool = non_empty;

        let pieces = self.split(split_on).filter(keep);
        SplitWhitespace { inner: pieces }
    }

    // True unless some `char` of the string is not whitespace.
    #[inline]
    fn is_whitespace(&self) -> bool {
        !self.chars().any(|c| !c.is_whitespace())
    }

    // True unless some `char` of the string is not alphanumeric.
    #[inline]
    fn is_alphanumeric(&self) -> bool {
        !self.chars().any(|c| !c.is_alphanumeric())
    }

    // Strips whitespace from both ends.
    #[inline]
    fn trim(&self) -> &str {
        let whitespace = |c: char| c.is_whitespace();
        self.trim_matches(whitespace)
    }

    // Strips whitespace from the start only.
    #[inline]
    fn trim_left(&self) -> &str {
        let whitespace = |c: char| c.is_whitespace();
        self.trim_left_matches(whitespace)
    }

    // Strips whitespace from the end only.
    #[inline]
    fn trim_right(&self) -> &str {
        let whitespace = |c: char| c.is_whitespace();
        self.trim_right_matches(whitespace)
    }
}

// Lookup table indexed by the first byte of a UTF-8 sequence, giving the
// total number of bytes in that sequence. Entries are 0 for bytes that
// cannot start a sequence: the continuation bytes 0x80-0xBF, and 0xC0,
// 0xC1 and 0xF5-0xFF, which never appear in UTF-8.
// The trailing comment on a row gives the index of that row's last entry.
//
// https://tools.ietf.org/html/rfc3629
static UTF8_CHAR_WIDTH: [u8; 256] = [
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
];

/// Returns the length in bytes of the UTF-8 sequence that starts with `b`.
///
/// The answer is read straight from `UTF8_CHAR_WIDTH`, so it is 0 when `b`
/// is not a byte that can begin a sequence.
#[inline]
pub fn utf8_char_width(b: u8) -> usize {
    let width = UTF8_CHAR_WIDTH[b as usize];
    width as usize
}

/// Determines whether a slice of `u16` code units is well-formed UTF-16.
///
/// Returns `true` when every code unit is either a scalar value on its own
/// or part of a lead surrogate (`0xD800..=0xDBFF`) immediately followed by
/// a trail surrogate (`0xDC00..=0xDFFF`). An empty slice is valid.
///
/// Returns `false` as soon as a surrogate is found that is not part of
/// such a pair: a trail surrogate with no lead before it, a lead surrogate
/// followed by anything other than a trail surrogate, or a lead surrogate
/// at the very end of the input.
pub fn is_utf16(v: &[u16]) -> bool {
    let mut units = v.iter();
    while let Some(&u) = units.next() {
        // Every code unit outside the surrogate block is a complete
        // scalar value; nothing more to check for it.
        if char::from_u32(u as u32).is_some() {
            continue;
        }

        // From here on `u` is a surrogate, which is only valid as the
        // first half of a pair, so a second unit must exist.
        let u2 = match units.next() {
            Some(&trail) => trail,
            None => return false,
        };

        let lead_ok = 0xD800 <= u && u <= 0xDBFF;
        let trail_ok = 0xDC00 <= u2 && u2 <= 0xDFFF;
        if !lead_ok || !trail_ok {
            return false;
        }
    }
    true
}

/// An iterator that decodes UTF-16 encoded codepoints from a vector
/// of `u16`s.
///
/// Deprecated wrapper around `char::DecodeUtf16`; it yields `Utf16Item`
/// values instead of that iterator's `Result`s.
#[rustc_deprecated(since = "1.4.0", reason = "renamed to `char::DecodeUtf16`")]
#[unstable(feature = "decode_utf16", reason = "not exposed in std", issue = "27830")]
#[allow(deprecated)]
#[derive(Clone)]
pub struct Utf16Items<'a> {
    // The replacement decoder, fed with copies of the slice's code units.
    decoder: DecodeUtf16<Cloned<slice::Iter<'a, u16>>>,
}

/// The possibilities for values decoded from a `u16` stream.
///
/// This is the item type of the deprecated `Utf16Items` iterator.
#[rustc_deprecated(since = "1.4.0",
                   reason = "`char::DecodeUtf16` uses `Result<char, u16>` instead")]
#[unstable(feature = "decode_utf16", reason = "not exposed in std", issue = "27830")]
#[allow(deprecated)]
#[derive(Copy, PartialEq, Eq, Clone, Debug)]
pub enum Utf16Item {
    /// A valid codepoint.
    ScalarValue(char),
    /// An invalid surrogate without its pair. The payload is the offending
    /// code unit itself.
    LoneSurrogate(u16),
}

#[allow(deprecated)]
impl Utf16Item {
    /// Turns this item into a `char`, substituting the replacement
    /// character (U+FFFD) for a `LoneSurrogate`.
    #[inline]
    pub fn to_char_lossy(&self) -> char {
        if let Utf16Item::ScalarValue(c) = *self {
            c
        } else {
            '\u{FFFD}'
        }
    }
}

#[rustc_deprecated(since = "1.4.0", reason = "use `char::DecodeUtf16` instead")]
#[unstable(feature = "decode_utf16", reason = "not exposed in std", issue = "27830")]
#[allow(deprecated)]
impl<'a> Iterator for Utf16Items<'a> {
    type Item = Utf16Item;

    // Pulls the next result from the wrapped decoder and translates it into
    // the legacy item type: `Ok` becomes `ScalarValue`, `Err` becomes
    // `LoneSurrogate`.
    fn next(&mut self) -> Option<Utf16Item> {
        match self.decoder.next() {
            Some(Ok(c)) => Some(Utf16Item::ScalarValue(c)),
            Some(Err(s)) => Some(Utf16Item::LoneSurrogate(s)),
            None => None,
        }
    }

    // The wrapper yields exactly one item per decoder item, so the decoder's
    // bounds are passed through unchanged.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let bounds = self.decoder.size_hint();
        bounds
    }
}

/// Builds an iterator that decodes the UTF-16 code units in `v`,
/// reporting every unpaired surrogate as a `LoneSurrogate`.
///
/// # Examples
///
/// ```
/// #![feature(unicode, decode_utf16)]
/// # #![allow(deprecated)]
///
/// extern crate rustc_unicode;
///
/// use rustc_unicode::str::Utf16Item::{ScalarValue, LoneSurrogate};
///
/// fn main() {
///     // 𝄞mus<invalid>ic<invalid>
///     let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
///              0x0073, 0xDD1E, 0x0069, 0x0063,
///              0xD834];
///
///     assert_eq!(rustc_unicode::str::utf16_items(&v).collect::<Vec<_>>(),
///                vec![ScalarValue('𝄞'),
///                     ScalarValue('m'), ScalarValue('u'), ScalarValue('s'),
///                     LoneSurrogate(0xDD1E),
///                     ScalarValue('i'), ScalarValue('c'),
///                     LoneSurrogate(0xD834)]);
/// }
/// ```
#[rustc_deprecated(since = "1.4.0", reason = "renamed to `char::decode_utf16`")]
#[unstable(feature = "decode_utf16", reason = "not exposed in std", issue = "27830")]
#[allow(deprecated)]
pub fn utf16_items<'a>(v: &'a [u16]) -> Utf16Items<'a> {
    // The decoder consumes `u16` values, so hand it copies of the elements.
    let units = v.iter().cloned();
    Utf16Items { decoder: decode_utf16(units) }
}

/// Iterator adaptor for encoding `char`s to UTF-16.
#[derive(Clone)]
pub struct Utf16Encoder<I> {
    // Source of the `char`s still to be encoded.
    chars: I,
    // Second code unit of a two-unit encoding, held back until the next call
    // to `next`. The value 0 means nothing is pending.
    extra: u16,
}

impl<I> Utf16Encoder<I> {
    /// Wraps any iterator of `char`s in a UTF-16 encoder.
    ///
    /// The encoder starts out with no code unit pending.
    pub fn new(chars: I) -> Utf16Encoder<I>
        where I: Iterator<Item = char>
    {
        // 0 is the "nothing buffered" marker for `extra`.
        let nothing_pending = 0;
        Utf16Encoder { chars: chars, extra: nothing_pending }
    }
}

impl<I> Iterator for Utf16Encoder<I> where I: Iterator<Item=char> {
    type Item = u16;

    // Yields the next UTF-16 code unit. When a `char` encodes to two units,
    // the first is returned right away and the second is parked in `extra`
    // to be returned by the following call.
    #[inline]
    fn next(&mut self) -> Option<u16> {
        // Flush the parked second unit before touching the source again.
        if self.extra != 0 {
            let tmp = self.extra;
            self.extra = 0;
            return Some(tmp);
        }

        let mut buf = [0; 2];
        self.chars.next().map(|ch| {
            let n = CharExt::encode_utf16(ch, &mut buf).unwrap_or(0);
            if n == 2 {
                self.extra = buf[1];
            }
            buf[0]
        })
    }

    // Bounds on the number of code units still to be produced.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let (low, high) = self.chars.size_hint();
        // A parked code unit is produced in addition to whatever the
        // remaining `char`s encode to, so it counts towards both bounds.
        // Without it the upper bound could be smaller than the number of
        // items actually left.
        let pending = if self.extra != 0 { 1 } else { 0 };
        // every char gets either one u16 or two u16,
        // so the remaining chars contribute between 1 and 2 times
        // the length of the underlying iterator.
        let lower = low.saturating_add(pending);
        let upper = high.and_then(|n| n.checked_mul(2))
                        .and_then(|n| n.checked_add(pending));
        (lower, upper)
    }
}

impl<'a> Iterator for SplitWhitespace<'a> {
    type Item = &'a str;

    // Forwards to the wrapped split-and-filter iterator.
    fn next(&mut self) -> Option<&'a str> {
        Iterator::next(&mut self.inner)
    }
}
impl<'a> DoubleEndedIterator for SplitWhitespace<'a> {
    // Forwards to the wrapped split-and-filter iterator, taking from the
    // back of the string.
    fn next_back(&mut self) -> Option<&'a str> {
        DoubleEndedIterator::next_back(&mut self.inner)
    }
}
Commit	Line	Data
1a4d82fc JJ	1	// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
	2	// file at the top-level directory of this distribution and at
	3	// http://rust-lang.org/COPYRIGHT.
	4	//
	5	// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
	6	// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
	7	// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
	8	// option. This file may not be copied, modified, or distributed
	9	// except according to those terms.
1a4d82fc JJ	10
	11	//! Unicode-intensive string manipulations.
	12	//!
b039eaaf SL	13	//! This module provides functionality to `str` that requires the Unicode
b039eaaf SL	14	//! methods provided by the unicode parts of the CharExt trait.
1a4d82fc	15
e9174d1e	16	use char::{DecodeUtf16, decode_utf16};
1a4d82fc	17	use core::char;
e9174d1e	18	use core::iter::{Cloned, Filter};
1a4d82fc JJ	19	use core::slice;
	20	use core::str::Split;
	21
d9579d0f AL	22	/// An iterator over the non-whitespace substrings of a string,
	23	/// separated by any amount of whitespace.
	24	#[stable(feature = "split_whitespace", since = "1.1.0")]
	25	pub struct SplitWhitespace<'a> {
85aaf69f	26	inner: Filter<Split<'a, fn(char) -> bool>, fn(&&str) -> bool>,
1a4d82fc JJ	27	}
	28
	29	/// Methods for Unicode string slices
	30	#[allow(missing_docs)] // docs in libcollections
	31	pub trait UnicodeStr {
d9579d0f	32	fn split_whitespace<'a>(&'a self) -> SplitWhitespace<'a>;
1a4d82fc JJ	33	fn is_whitespace(&self) -> bool;
1a4d82fc JJ	34	fn is_alphanumeric(&self) -> bool;
1a4d82fc JJ	35	fn trim<'a>(&'a self) -> &'a str;
	36	fn trim_left<'a>(&'a self) -> &'a str;
	37	fn trim_right<'a>(&'a self) -> &'a str;
	38	}
	39
	40	impl UnicodeStr for str {
d9579d0f AL	41	#[inline]
d9579d0f AL	42	fn split_whitespace(&self) -> SplitWhitespace {
b039eaaf SL	43	fn is_not_empty(s: &&str) -> bool {
	44	!s.is_empty()
	45	}
1a4d82fc JJ	46	let is_not_empty: fn(&&str) -> bool = is_not_empty; // coerce to fn pointer
1a4d82fc JJ	47
b039eaaf SL	48	fn is_whitespace(c: char) -> bool {
	49	c.is_whitespace()
	50	}
1a4d82fc JJ	51	let is_whitespace: fn(char) -> bool = is_whitespace; // coerce to fn pointer
1a4d82fc JJ	52
d9579d0f	53	SplitWhitespace { inner: self.split(is_whitespace).filter(is_not_empty) }
1a4d82fc JJ	54	}
	55
	56	#[inline]
b039eaaf SL	57	fn is_whitespace(&self) -> bool {
	58	self.chars().all(\|c\| c.is_whitespace())
	59	}
1a4d82fc JJ	60
1a4d82fc JJ	61	#[inline]
b039eaaf SL	62	fn is_alphanumeric(&self) -> bool {
	63	self.chars().all(\|c\| c.is_alphanumeric())
	64	}
1a4d82fc	65
1a4d82fc JJ	66	#[inline]
1a4d82fc JJ	67	fn trim(&self) -> &str {
c34b1796	68	self.trim_matches(\|c: char\| c.is_whitespace())
1a4d82fc JJ	69	}
	70
	71	#[inline]
	72	fn trim_left(&self) -> &str {
85aaf69f	73	self.trim_left_matches(\|c: char\| c.is_whitespace())
1a4d82fc JJ	74	}
	75
	76	#[inline]
	77	fn trim_right(&self) -> &str {
85aaf69f	78	self.trim_right_matches(\|c: char\| c.is_whitespace())
1a4d82fc JJ	79	}
	80	}
	81
1a4d82fc JJ	82	// https://tools.ietf.org/html/rfc3629
	83	static UTF8_CHAR_WIDTH: [u8; 256] = [
	84	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	85	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
	86	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	87	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
	88	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	89	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
	90	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	91	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
	92	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	93	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
	94	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	95	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
	96	0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
	97	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
	98	3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
	99	4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
	100	];
	101
	102	/// Given a first byte, determine how many bytes are in this UTF-8 character
	103	#[inline]
85aaf69f SL	104	pub fn utf8_char_width(b: u8) -> usize {
85aaf69f SL	105	return UTF8_CHAR_WIDTH[b as usize] as usize;
1a4d82fc JJ	106	}
	107
	108	/// Determines if a vector of `u16` contains valid UTF-16
	109	pub fn is_utf16(v: &[u16]) -> bool {
	110	let mut it = v.iter();
	111	macro_rules! next { ($ret:expr) => {
	112	match it.next() { Some(u) => *u, None => return $ret }
	113	}
	114	}
	115	loop {
	116	let u = next!(true);
	117
	118	match char::from_u32(u as u32) {
	119	Some(_) => {}
	120	None => {
	121	let u2 = next!(false);
b039eaaf SL	122	if u < 0xD7FF \|\| u > 0xDBFF \|\| u2 < 0xDC00 \|\| u2 > 0xDFFF {
	123	return false;
	124	}
1a4d82fc JJ	125	}
	126	}
	127	}
	128	}
	129
	130	/// An iterator that decodes UTF-16 encoded codepoints from a vector
	131	/// of `u16`s.
92a42be0	132	#[rustc_deprecated(since = "1.4.0", reason = "renamed to `char::DecodeUtf16`")]
e9174d1e SL	133	#[unstable(feature = "decode_utf16", reason = "not exposed in std", issue = "27830")]
e9174d1e SL	134	#[allow(deprecated)]
1a4d82fc JJ	135	#[derive(Clone)]
1a4d82fc JJ	136	pub struct Utf16Items<'a> {
b039eaaf	137	decoder: DecodeUtf16<Cloned<slice::Iter<'a, u16>>>,
1a4d82fc	138	}
e9174d1e	139
1a4d82fc	140	/// The possibilities for values decoded from a `u16` stream.
92a42be0 SL	141	#[rustc_deprecated(since = "1.4.0",
92a42be0 SL	142	reason = "`char::DecodeUtf16` uses `Result<char, u16>` instead")]
e9174d1e SL	143	#[unstable(feature = "decode_utf16", reason = "not exposed in std", issue = "27830")]
e9174d1e SL	144	#[allow(deprecated)]
85aaf69f	145	#[derive(Copy, PartialEq, Eq, Clone, Debug)]
1a4d82fc JJ	146	pub enum Utf16Item {
	147	/// A valid codepoint.
	148	ScalarValue(char),
	149	/// An invalid surrogate without its pair.
b039eaaf	150	LoneSurrogate(u16),
1a4d82fc JJ	151	}
1a4d82fc JJ	152
e9174d1e	153	#[allow(deprecated)]
1a4d82fc JJ	154	impl Utf16Item {
	155	/// Convert `self` to a `char`, taking `LoneSurrogate`s to the
	156	/// replacement character (U+FFFD).
	157	#[inline]
	158	pub fn to_char_lossy(&self) -> char {
	159	match *self {
	160	Utf16Item::ScalarValue(c) => c,
b039eaaf	161	Utf16Item::LoneSurrogate(_) => '\u{FFFD}',
1a4d82fc JJ	162	}
	163	}
	164	}
	165
92a42be0	166	#[rustc_deprecated(since = "1.4.0", reason = "use `char::DecodeUtf16` instead")]
e9174d1e SL	167	#[unstable(feature = "decode_utf16", reason = "not exposed in std", issue = "27830")]
e9174d1e SL	168	#[allow(deprecated)]
1a4d82fc JJ	169	impl<'a> Iterator for Utf16Items<'a> {
	170	type Item = Utf16Item;
	171
	172	fn next(&mut self) -> Option<Utf16Item> {
b039eaaf SL	173	self.decoder.next().map(\|result\| {
	174	match result {
	175	Ok(c) => Utf16Item::ScalarValue(c),
	176	Err(s) => Utf16Item::LoneSurrogate(s),
	177	}
e9174d1e	178	})
1a4d82fc JJ	179	}
	180
	181	#[inline]
85aaf69f	182	fn size_hint(&self) -> (usize, Option<usize>) {
e9174d1e	183	self.decoder.size_hint()
1a4d82fc JJ	184	}
	185	}
	186
	187	/// Create an iterator over the UTF-16 encoded codepoints in `v`,
	188	/// returning invalid surrogates as `LoneSurrogate`s.
	189	///
c34b1796 AL	190	/// # Examples
	191	///
	192	/// ```
e9174d1e	193	/// #![feature(unicode, decode_utf16)]
92a42be0	194	/// # #![allow(deprecated)]
c1a9b12d	195	///
d9579d0f	196	/// extern crate rustc_unicode;
1a4d82fc	197	///
d9579d0f	198	/// use rustc_unicode::str::Utf16Item::{ScalarValue, LoneSurrogate};
1a4d82fc	199	///
c34b1796 AL	200	/// fn main() {
	201	/// // 𝄞mus<invalid>ic<invalid>
	202	/// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
	203	/// 0x0073, 0xDD1E, 0x0069, 0x0063,
	204	/// 0xD834];
1a4d82fc	205	///
d9579d0f	206	/// assert_eq!(rustc_unicode::str::utf16_items(&v).collect::<Vec<_>>(),
c34b1796 AL	207	/// vec![ScalarValue('𝄞'),
	208	/// ScalarValue('m'), ScalarValue('u'), ScalarValue('s'),
	209	/// LoneSurrogate(0xDD1E),
	210	/// ScalarValue('i'), ScalarValue('c'),
	211	/// LoneSurrogate(0xD834)]);
	212	/// }
1a4d82fc	213	/// ```
92a42be0	214	#[rustc_deprecated(since = "1.4.0", reason = "renamed to `char::decode_utf16`")]
e9174d1e SL	215	#[unstable(feature = "decode_utf16", reason = "not exposed in std", issue = "27830")]
e9174d1e SL	216	#[allow(deprecated)]
1a4d82fc	217	pub fn utf16_items<'a>(v: &'a [u16]) -> Utf16Items<'a> {
e9174d1e	218	Utf16Items { decoder: decode_utf16(v.iter().cloned()) }
1a4d82fc JJ	219	}
	220
	221	/// Iterator adaptor for encoding `char`s to UTF-16.
	222	#[derive(Clone)]
	223	pub struct Utf16Encoder<I> {
	224	chars: I,
b039eaaf	225	extra: u16,
1a4d82fc JJ	226	}
	227
	228	impl<I> Utf16Encoder<I> {
d9579d0f	229	/// Create a UTF-16 encoder from any `char` iterator.
b039eaaf SL	230	pub fn new(chars: I) -> Utf16Encoder<I>
	231	where I: Iterator<Item = char>
	232	{
	233	Utf16Encoder {
	234	chars: chars,
	235	extra: 0,
	236	}
1a4d82fc JJ	237	}
	238	}
	239
	240	impl<I> Iterator for Utf16Encoder<I> where I: Iterator<Item=char> {
	241	type Item = u16;
	242
	243	#[inline]
	244	fn next(&mut self) -> Option<u16> {
	245	if self.extra != 0 {
	246	let tmp = self.extra;
	247	self.extra = 0;
	248	return Some(tmp);
	249	}
	250
c34b1796	251	let mut buf = [0; 2];
1a4d82fc	252	self.chars.next().map(\|ch\| {
85aaf69f	253	let n = CharExt::encode_utf16(ch, &mut buf).unwrap_or(0);
b039eaaf SL	254	if n == 2 {
	255	self.extra = buf[1];
	256	}
1a4d82fc JJ	257	buf[0]
	258	})
	259	}
	260
	261	#[inline]
85aaf69f	262	fn size_hint(&self) -> (usize, Option<usize>) {
1a4d82fc JJ	263	let (low, high) = self.chars.size_hint();
	264	// every char gets either one u16 or two u16,
	265	// so this iterator is between 1 or 2 times as
	266	// long as the underlying iterator.
	267	(low, high.and_then(\|n\| n.checked_mul(2)))
	268	}
	269	}
	270
d9579d0f	271	impl<'a> Iterator for SplitWhitespace<'a> {
1a4d82fc JJ	272	type Item = &'a str;
1a4d82fc JJ	273
b039eaaf SL	274	fn next(&mut self) -> Option<&'a str> {
	275	self.inner.next()
	276	}
1a4d82fc	277	}
d9579d0f	278	impl<'a> DoubleEndedIterator for SplitWhitespace<'a> {
b039eaaf SL	279	fn next_back(&mut self) -> Option<&'a str> {
	280	self.inner.next_back()
	281	}
1a4d82fc	282	}