[rustc.git] / src / libcore / char.rs

// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//! Character manipulation.
//!
//! For more details, see ::rustc_unicode::char (a.k.a. std::char)

#![allow(non_snake_case)]
#![stable(feature = "core_char", since = "1.2.0")]

use iter::Iterator;
use mem::transmute;
use option::Option::{None, Some};
use option::Option;
use slice::SliceExt;

// UTF-8 ranges and tags for encoding characters
//
// The `TAG_*` values are the fixed high bits OR-ed into an encoded byte:
// `TAG_CONT` marks every continuation byte, while the other three mark the
// leading byte of a two-, three- or four-byte sequence respectively.
const TAG_CONT: u8    = 0b1000_0000;
const TAG_TWO_B: u8   = 0b1100_0000;
const TAG_THREE_B: u8 = 0b1110_0000;
const TAG_FOUR_B: u8  = 0b1111_0000;
// The `MAX_*` values are *exclusive* upper bounds: a code point strictly
// below `MAX_ONE_B` is encoded in one byte, strictly below `MAX_TWO_B` in
// two bytes, strictly below `MAX_THREE_B` in three, and anything else in four.
const MAX_ONE_B: u32   =     0x80;
const MAX_TWO_B: u32   =    0x800;
const MAX_THREE_B: u32 =  0x10000;

/*
    Lu  Uppercase_Letter        an uppercase letter
    Ll  Lowercase_Letter        a lowercase letter
    Lt  Titlecase_Letter        a digraphic character, with first part uppercase
    Lm  Modifier_Letter         a modifier letter
    Lo  Other_Letter            other letters, including syllables and ideographs
    Mn  Nonspacing_Mark         a nonspacing combining mark (zero advance width)
    Mc  Spacing_Mark            a spacing combining mark (positive advance width)
    Me  Enclosing_Mark          an enclosing combining mark
    Nd  Decimal_Number          a decimal digit
    Nl  Letter_Number           a letterlike numeric character
    No  Other_Number            a numeric character of other type
    Pc  Connector_Punctuation   a connecting punctuation mark, like a tie
    Pd  Dash_Punctuation        a dash or hyphen punctuation mark
    Ps  Open_Punctuation        an opening punctuation mark (of a pair)
    Pe  Close_Punctuation       a closing punctuation mark (of a pair)
    Pi  Initial_Punctuation     an initial quotation mark
    Pf  Final_Punctuation       a final quotation mark
    Po  Other_Punctuation       a punctuation mark of other type
    Sm  Math_Symbol             a symbol of primarily mathematical use
    Sc  Currency_Symbol         a currency sign
    Sk  Modifier_Symbol         a non-letterlike modifier symbol
    So  Other_Symbol            a symbol of other type
    Zs  Space_Separator         a space character (of various non-zero widths)
    Zl  Line_Separator          U+2028 LINE SEPARATOR only
    Zp  Paragraph_Separator     U+2029 PARAGRAPH SEPARATOR only
    Cc  Control                 a C0 or C1 control code
    Cf  Format                  a format control character
    Cs  Surrogate               a surrogate code point
    Co  Private_Use             a private-use character
    Cn  Unassigned              a reserved unassigned code point or a noncharacter
*/

/// The highest valid code point a `char` can hold, `U+10FFFF`.
///
/// `from_u32` rejects every value greater than this constant.
#[stable(feature = "rust1", since = "1.0.0")]
pub const MAX: char = '\u{10ffff}';

/// Attempts to turn a raw `u32` into a `char`.
///
/// Returns `None` when `i` is not a value a `char` may hold, that is when it
/// is greater than `MAX` or falls in the surrogate range `0xD800` to `0xDFFF`
/// (both ends included). Every other value is returned wrapped in `Some`.
///
/// # Examples
///
/// ```
/// use std::char;
///
/// assert_eq!(char::from_u32(0x2764), Some('❤'));
/// assert_eq!(char::from_u32(0x110000), None); // invalid character
/// ```
#[inline]
#[stable(feature = "rust1", since = "1.0.0")]
pub fn from_u32(i: u32) -> Option<char> {
    let is_surrogate = 0xD800 <= i && i <= 0xDFFF;
    let beyond_max = i > MAX as u32;
    if beyond_max || is_surrogate {
        return None;
    }
    // Both invalid classes were rejected above, so the unchecked conversion
    // only ever sees a valid code point.
    Some(unsafe { from_u32_unchecked(i) })
}

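/// Converts a `u32` to a `char`, not checking whether it is a valid Unicode
/// code point.
///
/// The caller must guarantee that `i` is a valid code point: no greater than
/// `MAX` and not a surrogate in the range `0xD800` to `0xDFFF`. Use
/// `from_u32` when that cannot be guaranteed.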
#[inline]
#[unstable(feature = "char_from_unchecked", reason = "recently added API")]
pub unsafe fn from_u32_unchecked(i: u32) -> char {
    // Plain bit reinterpretation: no range or surrogate validation happens
    // here, which is why the function is `unsafe`.
    transmute::<u32, char>(i)
}

/// Converts a number to the character representing it.
///
/// # Return value
///
/// Returns `Some(char)` if `num` represents one digit under `radix`,
/// using one character of `0-9` or `a-z`, or `None` if it doesn't.
///
/// # Panics
///
/// Panics if given a `radix` > 36.
///
/// # Examples
///
/// ```
/// use std::char;
///
/// let c = char::from_digit(4, 10);
///
/// assert_eq!(c, Some('4'));
/// ```
#[inline]
#[stable(feature = "rust1", since = "1.0.0")]
pub fn from_digit(num: u32, radix: u32) -> Option<char> {
    if radix > 36 {
        panic!("from_digit: radix is too high (maximum 36)");
    }
    // A value that is not a single digit in this radix has no representation.
    if num >= radix {
        return None;
    }
    // `num < radix <= 36` here, so narrowing to `u8` cannot lose information.
    let digit = num as u8;
    let ascii = if digit < 10 {
        b'0' + digit
    } else {
        // Digits ten and above map onto the lowercase letters `a` to `z`.
        b'a' + digit - 10
    };
    Some(ascii as char)
}

// NB: the stabilization and documentation for this trait is in
// unicode/char.rs, not here
//
// The short notes on each method below only summarize what the `char`
// implementation in this file does.
#[allow(missing_docs)] // docs in libunicode/u_char.rs
#[doc(hidden)]
#[unstable(feature = "core_char_ext",
           reason = "the stable interface is `impl char` in later crate")]
pub trait CharExt {
    // Whether `self` is a digit in `radix`; for `char`, true exactly when
    // `to_digit` returns `Some`.
    fn is_digit(self, radix: u32) -> bool;
    // Numeric value of `self` as a digit in `radix`, or `None`; the `char`
    // implementation panics when `radix` is greater than 36.
    fn to_digit(self, radix: u32) -> Option<u32>;
    // Iterator yielding the `\u{...}` escape of `self`.
    fn escape_unicode(self) -> EscapeUnicode;
    // Iterator yielding the default escape of `self` (backslash escapes for a
    // few characters, printable ASCII unchanged, `\u{...}` otherwise).
    fn escape_default(self) -> EscapeDefault;
    // Number of bytes `self` occupies when encoded as UTF-8 (1 to 4).
    fn len_utf8(self) -> usize;
    // Number of `u16` units `self` occupies when encoded as UTF-16 (1 or 2).
    fn len_utf16(self) -> usize;
    // Writes the UTF-8 encoding into `dst`; `Some(bytes written)`, or `None`
    // when `dst` is too small.
    fn encode_utf8(self, dst: &mut [u8]) -> Option<usize>;
    // Writes the UTF-16 encoding into `dst`; `Some(units written)`, or `None`
    // when `dst` is too small.
    fn encode_utf16(self, dst: &mut [u16]) -> Option<usize>;
}

impl CharExt for char {
    // A character is a digit in `radix` exactly when it has a digit value.
    #[inline]
    fn is_digit(self, radix: u32) -> bool {
        match self.to_digit(radix) {
            Some(_) => true,
            None => false,
        }
    }

    // Maps `0-9`, `a-z` and `A-Z` to their numeric value (letters are case
    // insensitive and start at ten) and keeps the value only if it is a valid
    // digit in `radix`. The radix check runs first, so an oversized radix
    // panics even for characters that are not digits at all.
    #[inline]
    fn to_digit(self, radix: u32) -> Option<u32> {
        if radix > 36 {
            panic!("to_digit: radix is too high (maximum 36)");
        }
        let code = self as u32;
        let digit = if '0' <= self && self <= '9' {
            code - '0' as u32
        } else if 'a' <= self && self <= 'z' {
            code - 'a' as u32 + 10
        } else if 'A' <= self && self <= 'Z' {
            code - 'A' as u32 + 10
        } else {
            return None;
        };
        if digit >= radix {
            return None;
        }
        Some(digit)
    }

    // The iterator starts at the leading backslash of `\u{...}`.
    #[inline]
    fn escape_unicode(self) -> EscapeUnicode {
        EscapeUnicode { state: EscapeUnicodeState::Backslash, c: self }
    }

    // Picks the first state of the default escape sequence for `self`.
    #[inline]
    fn escape_default(self) -> EscapeDefault {
        let first = match self {
            // Whitespace controls get their conventional letter.
            '\t' => EscapeDefaultState::Backslash('t'),
            '\r' => EscapeDefaultState::Backslash('r'),
            '\n' => EscapeDefaultState::Backslash('n'),
            // Backslash and both quote characters escape to themselves.
            '\\' | '\'' | '"' => EscapeDefaultState::Backslash(self),
            // Remaining printable ASCII is emitted unchanged.
            '\x20' ... '\x7e' => EscapeDefaultState::Char(self),
            // Everything else falls back to the `\u{...}` form.
            _ => EscapeDefaultState::Unicode(self.escape_unicode())
        };
        EscapeDefault { state: first }
    }

    // Byte count of the UTF-8 encoding, chosen by the exclusive `MAX_*` bounds.
    #[inline]
    fn len_utf8(self) -> usize {
        match self as u32 {
            code if code < MAX_ONE_B => 1,
            code if code < MAX_TWO_B => 2,
            code if code < MAX_THREE_B => 3,
            _ => 4,
        }
    }

    // One unit when the value fits in 16 bits, otherwise a surrogate pair.
    #[inline]
    fn len_utf16(self) -> usize {
        let code = self as u32;
        if code <= 0xFFFF {
            1
        } else {
            2
        }
    }

    // Thin wrapper over the raw encoder working on the scalar value.
    #[inline]
    fn encode_utf8(self, dst: &mut [u8]) -> Option<usize> {
        let code = self as u32;
        encode_utf8_raw(code, dst)
    }

    // Thin wrapper over the raw encoder working on the scalar value.
    #[inline]
    fn encode_utf16(self, dst: &mut [u16]) -> Option<usize> {
        let code = self as u32;
        encode_utf16_raw(code, dst)
    }
}

/// Writes the UTF-8 encoding of the raw value `code` to the start of `dst`
/// and returns how many bytes were written (between one and four).
///
/// The value is not validated: anything at or above `0x10000` is written in
/// the four-byte form, using only the bits that fit into that form.
///
/// When `dst` is shorter than the encoding requires, `dst` is left untouched
/// and `None` is returned.
#[inline]
#[unstable(feature = "char_internals",
           reason = "this function should not be exposed publicly")]
#[doc(hidden)]
pub fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> Option<usize> {
    // Marked #[inline] to allow llvm optimizing it away
    let needed = if code < MAX_ONE_B {
        1
    } else if code < MAX_TWO_B {
        2
    } else if code < MAX_THREE_B {
        3
    } else {
        4
    };
    // Check the capacity up front so that a short buffer is never modified.
    if dst.len() < needed {
        return None;
    }
    match needed {
        1 => {
            dst[0] = code as u8;
        }
        2 => {
            dst[0] = (code >> 6 & 0x1F) as u8 | TAG_TWO_B;
            dst[1] = (code & 0x3F) as u8 | TAG_CONT;
        }
        3 => {
            dst[0] = (code >> 12 & 0x0F) as u8 | TAG_THREE_B;
            dst[1] = (code >> 6 & 0x3F) as u8 | TAG_CONT;
            dst[2] = (code & 0x3F) as u8 | TAG_CONT;
        }
        _ => {
            dst[0] = (code >> 18 & 0x07) as u8 | TAG_FOUR_B;
            dst[1] = (code >> 12 & 0x3F) as u8 | TAG_CONT;
            dst[2] = (code >> 6 & 0x3F) as u8 | TAG_CONT;
            dst[3] = (code & 0x3F) as u8 | TAG_CONT;
        }
    }
    Some(needed)
}

/// Encodes a raw u32 value as UTF-16 into the provided `u16` buffer,
/// and then returns the number of `u16`s written.
///
/// If the buffer is not large enough, nothing will be written into it
/// and a `None` will be returned.
#[inline]
#[unstable(feature = "char_internals",
           reason = "this function should not be exposed publicly")]
#[doc(hidden)]
pub fn encode_utf16_raw(ch: u32, dst: &mut [u16]) -> Option<usize> {
    // Marked #[inline] to allow llvm optimizing it away
    // Values that fit in 16 bits take one unit; everything else takes two.
    let units = if ch <= 0xFFFF { 1 } else { 2 };
    // Check the capacity up front so that a short buffer is never modified.
    if dst.len() < units {
        return None;
    }
    if units == 1 {
        // The BMP falls through (assuming non-surrogate, as it should)
        dst[0] = ch as u16;
    } else {
        // Supplementary planes break into surrogates: the offset from
        // 0x1_0000 is split into a high part and a low ten-bit part.
        let offset = ch - 0x1_0000;
        dst[0] = 0xD800 | ((offset >> 10) as u16);
        dst[1] = 0xDC00 | ((offset as u16) & 0x3FF);
    }
    Some(units)
}

/// An iterator over the characters that represent a `char`, as escaped by
/// Rust's unicode escaping rules.
///
/// The sequence produced is a backslash, `u`, `{`, the hexadecimal digits of
/// the code point without leading zeros (lowercase, at least one digit), and
/// finally `}`.
#[derive(Clone)]
#[stable(feature = "rust1", since = "1.0.0")]
pub struct EscapeUnicode {
    // The character being escaped.
    c: char,
    // Which part of the escape sequence `next` will produce next.
    state: EscapeUnicodeState
}

// Position of an `EscapeUnicode` iterator within its output; each variant
// names the item that the next call to `next` will produce.
#[derive(Clone)]
enum EscapeUnicodeState {
    // About to emit the leading `\`.
    Backslash,
    // About to emit the `u`.
    Type,
    // About to emit `{`; the index of the first hex digit is computed here.
    LeftBrace,
    // About to emit the hex digit at this nibble index (0 is the least
    // significant nibble); the index counts down to 0.
    Value(usize),
    // About to emit the closing `}`.
    RightBrace,
    // The sequence is complete; `next` returns `None`.
    Done,
}

#[stable(feature = "rust1", since = "1.0.0")]
impl Iterator for EscapeUnicode {
    type Item = char;

    /// Produces the next character of the `\u{...}` escape, or `None` once
    /// the closing brace has been returned.
    fn next(&mut self) -> Option<char> {
        let code = self.c as u32;
        // Each arm yields the character to emit together with the state the
        // iterator moves to afterwards.
        let (emitted, following) = match self.state {
            EscapeUnicodeState::Backslash => ('\\', EscapeUnicodeState::Type),
            EscapeUnicodeState::Type => ('u', EscapeUnicodeState::LeftBrace),
            EscapeUnicodeState::LeftBrace => {
                // Find the index of the most significant non-zero nibble so
                // that no leading zeros are printed (index 0 for `'\0'`).
                let mut top = 0;
                while code >> (4 * (top + 1)) != 0 {
                    top += 1;
                }
                ('{', EscapeUnicodeState::Value(top))
            }
            EscapeUnicodeState::Value(offset) => {
                // A masked nibble is always below 16, so this never fails.
                let digit = from_digit((code >> (offset * 4)) & 0xf, 16).unwrap();
                let following = if offset == 0 {
                    EscapeUnicodeState::RightBrace
                } else {
                    EscapeUnicodeState::Value(offset - 1)
                };
                (digit, following)
            }
            EscapeUnicodeState::RightBrace => ('}', EscapeUnicodeState::Done),
            EscapeUnicodeState::Done => return None,
        };
        self.state = following;
        Some(emitted)
    }
}

/// An iterator over the characters that represent a `char`, escaped
/// for maximum portability.
///
/// Depending on the character this yields a two-character backslash escape,
/// the character itself (printable ASCII), or its `\u{...}` escape.
#[derive(Clone)]
#[stable(feature = "rust1", since = "1.0.0")]
pub struct EscapeDefault {
    // What remains to be emitted; chosen initially by `escape_default`.
    state: EscapeDefaultState
}

// What an `EscapeDefault` iterator still has to produce.
#[derive(Clone)]
enum EscapeDefaultState {
    // About to emit `\`, followed afterwards by the contained character.
    Backslash(char),
    // About to emit the contained character as the final item.
    Char(char),
    // Nothing left; `next` returns `None`.
    Done,
    // Delegate every remaining item to the wrapped `\u{...}` iterator.
    Unicode(EscapeUnicode),
}

#[stable(feature = "rust1", since = "1.0.0")]
impl Iterator for EscapeDefault {
    type Item = char;

    /// Produces the next character of the default escape, or `None` once the
    /// sequence is exhausted.
    fn next(&mut self) -> Option<char> {
        // The simple states yield one character and a successor state; the
        // terminal and delegating states return directly.
        let (emitted, following) = match self.state {
            EscapeDefaultState::Backslash(c) => ('\\', EscapeDefaultState::Char(c)),
            EscapeDefaultState::Char(c) => (c, EscapeDefaultState::Done),
            EscapeDefaultState::Done => return None,
            // The wrapped iterator tracks its own progress, so the outer
            // state stays as it is.
            EscapeDefaultState::Unicode(ref mut iter) => return iter.next(),
        };
        self.state = following;
        Some(emitted)
    }
}
Commit	Line	Data
1a4d82fc JJ	1	// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
	2	// file at the top-level directory of this distribution and at
	3	// http://rust-lang.org/COPYRIGHT.
	4	//
	5	// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
	6	// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
	7	// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
	8	// option. This file may not be copied, modified, or distributed
	9	// except according to those terms.
	10
	11	//! Character manipulation.
	12	//!
d9579d0f	13	//! For more details, see ::rustc_unicode::char (a.k.a. std::char)
1a4d82fc JJ	14
1a4d82fc JJ	15	#![allow(non_snake_case)]
62682a34	16	#![stable(feature = "core_char", since = "1.2.0")]
1a4d82fc JJ	17
	18	use iter::Iterator;
	19	use mem::transmute;
	20	use option::Option::{None, Some};
	21	use option::Option;
	22	use slice::SliceExt;
	23
	24	// UTF-8 ranges and tags for encoding characters
c34b1796 AL	25	const TAG_CONT: u8 = 0b1000_0000;
	26	const TAG_TWO_B: u8 = 0b1100_0000;
	27	const TAG_THREE_B: u8 = 0b1110_0000;
	28	const TAG_FOUR_B: u8 = 0b1111_0000;
	29	const MAX_ONE_B: u32 = 0x80;
	30	const MAX_TWO_B: u32 = 0x800;
	31	const MAX_THREE_B: u32 = 0x10000;
1a4d82fc JJ	32
	33	/*
	34	Lu Uppercase_Letter an uppercase letter
	35	Ll Lowercase_Letter a lowercase letter
	36	Lt Titlecase_Letter a digraphic character, with first part uppercase
	37	Lm Modifier_Letter a modifier letter
	38	Lo Other_Letter other letters, including syllables and ideographs
	39	Mn Nonspacing_Mark a nonspacing combining mark (zero advance width)
	40	Mc Spacing_Mark a spacing combining mark (positive advance width)
	41	Me Enclosing_Mark an enclosing combining mark
	42	Nd Decimal_Number a decimal digit
	43	Nl Letter_Number a letterlike numeric character
	44	No Other_Number a numeric character of other type
	45	Pc Connector_Punctuation a connecting punctuation mark, like a tie
	46	Pd Dash_Punctuation a dash or hyphen punctuation mark
	47	Ps Open_Punctuation an opening punctuation mark (of a pair)
	48	Pe Close_Punctuation a closing punctuation mark (of a pair)
	49	Pi Initial_Punctuation an initial quotation mark
	50	Pf Final_Punctuation a final quotation mark
	51	Po Other_Punctuation a punctuation mark of other type
	52	Sm Math_Symbol a symbol of primarily mathematical use
	53	Sc Currency_Symbol a currency sign
	54	Sk Modifier_Symbol a non-letterlike modifier symbol
	55	So Other_Symbol a symbol of other type
	56	Zs Space_Separator a space character (of various non-zero widths)
	57	Zl Line_Separator U+2028 LINE SEPARATOR only
	58	Zp Paragraph_Separator U+2029 PARAGRAPH SEPARATOR only
	59	Cc Control a C0 or C1 control code
	60	Cf Format a format control character
	61	Cs Surrogate a surrogate code point
	62	Co Private_Use a private-use character
	63	Cn Unassigned a reserved unassigned code point or a noncharacter
	64	*/
	65
	66	/// The highest valid code point
85aaf69f	67	#[stable(feature = "rust1", since = "1.0.0")]
1a4d82fc JJ	68	pub const MAX: char = '\u{10ffff}';
1a4d82fc JJ	69
85aaf69f SL	70	/// Converts a `u32` to an `Option<char>`.
	71	///
	72	/// # Examples
	73	///
	74	/// ```
	75	/// use std::char;
	76	///
62682a34 SL	77	/// assert_eq!(char::from_u32(0x2764), Some('❤'));
62682a34 SL	78	/// assert_eq!(char::from_u32(0x110000), None); // invalid character
85aaf69f	79	/// ```
1a4d82fc	80	#[inline]
85aaf69f	81	#[stable(feature = "rust1", since = "1.0.0")]
1a4d82fc JJ	82	pub fn from_u32(i: u32) -> Option<char> {
	83	// catch out-of-bounds and surrogates
	84	if (i > MAX as u32) \|\| (i >= 0xD800 && i <= 0xDFFF) {
	85	None
	86	} else {
c1a9b12d	87	Some(unsafe { from_u32_unchecked(i) })
1a4d82fc JJ	88	}
	89	}
	90
c1a9b12d SL	91	/// Converts a `u32` to an `char`, not checking whether it is a valid unicode
	92	/// codepoint.
	93	#[inline]
	94	#[unstable(feature = "char_from_unchecked", reason = "recently added API")]
	95	pub unsafe fn from_u32_unchecked(i: u32) -> char {
	96	transmute(i)
	97	}
	98
85aaf69f	99	/// Converts a number to the character representing it.
1a4d82fc JJ	100	///
	101	/// # Return value
	102	///
	103	/// Returns `Some(char)` if `num` represents one digit under `radix`,
	104	/// using one character of `0-9` or `a-z`, or `None` if it doesn't.
	105	///
	106	/// # Panics
	107	///
	108	/// Panics if given an `radix` > 36.
	109	///
85aaf69f SL	110	/// # Examples
	111	///
	112	/// ```
	113	/// use std::char;
	114	///
	115	/// let c = char::from_digit(4, 10);
	116	///
	117	/// assert_eq!(c, Some('4'));
	118	/// ```
1a4d82fc	119	#[inline]
c34b1796	120	#[stable(feature = "rust1", since = "1.0.0")]
85aaf69f	121	pub fn from_digit(num: u32, radix: u32) -> Option<char> {
1a4d82fc JJ	122	if radix > 36 {
	123	panic!("from_digit: radix is too high (maximum 36)");
	124	}
	125	if num < radix {
c1a9b12d SL	126	let num = num as u8;
	127	if num < 10 {
	128	Some((b'0' + num) as char)
	129	} else {
	130	Some((b'a' + num - 10) as char)
1a4d82fc JJ	131	}
	132	} else {
	133	None
	134	}
	135	}
	136
c34b1796 AL	137	// NB: the stabilization and documentation for this trait is in
	138	// unicode/char.rs, not here
	139	#[allow(missing_docs)] // docs in libunicode/u_char.rs
9346a6ac	140	#[doc(hidden)]
62682a34 SL	141	#[unstable(feature = "core_char_ext",
62682a34 SL	142	reason = "the stable interface is `impl char` in later crate")]
1a4d82fc	143	pub trait CharExt {
85aaf69f	144	fn is_digit(self, radix: u32) -> bool;
85aaf69f	145	fn to_digit(self, radix: u32) -> Option<u32>;
1a4d82fc	146	fn escape_unicode(self) -> EscapeUnicode;
1a4d82fc	147	fn escape_default(self) -> EscapeDefault;
85aaf69f	148	fn len_utf8(self) -> usize;
85aaf69f	149	fn len_utf16(self) -> usize;
85aaf69f	150	fn encode_utf8(self, dst: &mut [u8]) -> Option<usize>;
85aaf69f	151	fn encode_utf16(self, dst: &mut [u16]) -> Option<usize>;
1a4d82fc JJ	152	}
1a4d82fc JJ	153
1a4d82fc	154	impl CharExt for char {
62682a34	155	#[inline]
85aaf69f	156	fn is_digit(self, radix: u32) -> bool {
1a4d82fc JJ	157	self.to_digit(radix).is_some()
	158	}
	159
62682a34	160	#[inline]
85aaf69f	161	fn to_digit(self, radix: u32) -> Option<u32> {
1a4d82fc JJ	162	if radix > 36 {
	163	panic!("to_digit: radix is too high (maximum 36)");
	164	}
	165	let val = match self {
85aaf69f SL	166	'0' ... '9' => self as u32 - '0' as u32,
	167	'a' ... 'z' => self as u32 - 'a' as u32 + 10,
	168	'A' ... 'Z' => self as u32 - 'A' as u32 + 10,
1a4d82fc JJ	169	_ => return None,
	170	};
	171	if val < radix { Some(val) }
	172	else { None }
	173	}
	174
62682a34	175	#[inline]
1a4d82fc JJ	176	fn escape_unicode(self) -> EscapeUnicode {
	177	EscapeUnicode { c: self, state: EscapeUnicodeState::Backslash }
	178	}
	179
62682a34	180	#[inline]
1a4d82fc JJ	181	fn escape_default(self) -> EscapeDefault {
	182	let init_state = match self {
	183	'\t' => EscapeDefaultState::Backslash('t'),
	184	'\r' => EscapeDefaultState::Backslash('r'),
	185	'\n' => EscapeDefaultState::Backslash('n'),
	186	'\\' => EscapeDefaultState::Backslash('\\'),
	187	'\'' => EscapeDefaultState::Backslash('\''),
	188	'"' => EscapeDefaultState::Backslash('"'),
	189	'\x20' ... '\x7e' => EscapeDefaultState::Char(self),
	190	_ => EscapeDefaultState::Unicode(self.escape_unicode())
	191	};
	192	EscapeDefault { state: init_state }
	193	}
	194
	195	#[inline]
85aaf69f	196	fn len_utf8(self) -> usize {
1a4d82fc	197	let code = self as u32;
c34b1796 AL	198	if code < MAX_ONE_B {
	199	1
	200	} else if code < MAX_TWO_B {
	201	2
	202	} else if code < MAX_THREE_B {
	203	3
	204	} else {
	205	4
1a4d82fc JJ	206	}
	207	}
	208
	209	#[inline]
85aaf69f	210	fn len_utf16(self) -> usize {
1a4d82fc	211	let ch = self as u32;
c34b1796	212	if (ch & 0xFFFF) == ch { 1 } else { 2 }
1a4d82fc JJ	213	}
	214
	215	#[inline]
85aaf69f SL	216	fn encode_utf8(self, dst: &mut [u8]) -> Option<usize> {
85aaf69f SL	217	encode_utf8_raw(self as u32, dst)
1a4d82fc JJ	218	}
	219
	220	#[inline]
85aaf69f SL	221	fn encode_utf16(self, dst: &mut [u16]) -> Option<usize> {
	222	encode_utf16_raw(self as u32, dst)
	223	}
	224	}
	225
	226	/// Encodes a raw u32 value as UTF-8 into the provided byte buffer,
	227	/// and then returns the number of bytes written.
	228	///
	229	/// If the buffer is not large enough, nothing will be written into it
	230	/// and a `None` will be returned.
	231	#[inline]
62682a34 SL	232	#[unstable(feature = "char_internals",
	233	reason = "this function should not be exposed publicly")]
	234	#[doc(hidden)]
85aaf69f SL	235	pub fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> Option<usize> {
85aaf69f SL	236	// Marked #[inline] to allow llvm optimizing it away
9346a6ac	237	if code < MAX_ONE_B && !dst.is_empty() {
85aaf69f SL	238	dst[0] = code as u8;
	239	Some(1)
	240	} else if code < MAX_TWO_B && dst.len() >= 2 {
c34b1796 AL	241	dst[0] = (code >> 6 & 0x1F) as u8 \| TAG_TWO_B;
c34b1796 AL	242	dst[1] = (code & 0x3F) as u8 \| TAG_CONT;
85aaf69f SL	243	Some(2)
85aaf69f SL	244	} else if code < MAX_THREE_B && dst.len() >= 3 {
c34b1796 AL	245	dst[0] = (code >> 12 & 0x0F) as u8 \| TAG_THREE_B;
	246	dst[1] = (code >> 6 & 0x3F) as u8 \| TAG_CONT;
	247	dst[2] = (code & 0x3F) as u8 \| TAG_CONT;
85aaf69f SL	248	Some(3)
85aaf69f SL	249	} else if dst.len() >= 4 {
c34b1796 AL	250	dst[0] = (code >> 18 & 0x07) as u8 \| TAG_FOUR_B;
	251	dst[1] = (code >> 12 & 0x3F) as u8 \| TAG_CONT;
	252	dst[2] = (code >> 6 & 0x3F) as u8 \| TAG_CONT;
	253	dst[3] = (code & 0x3F) as u8 \| TAG_CONT;
85aaf69f SL	254	Some(4)
	255	} else {
	256	None
	257	}
	258	}
	259
	260	/// Encodes a raw u32 value as UTF-16 into the provided `u16` buffer,
	261	/// and then returns the number of `u16`s written.
	262	///
	263	/// If the buffer is not large enough, nothing will be written into it
	264	/// and a `None` will be returned.
	265	#[inline]
62682a34 SL	266	#[unstable(feature = "char_internals",
	267	reason = "this function should not be exposed publicly")]
	268	#[doc(hidden)]
85aaf69f SL	269	pub fn encode_utf16_raw(mut ch: u32, dst: &mut [u16]) -> Option<usize> {
85aaf69f SL	270	// Marked #[inline] to allow llvm optimizing it away
9346a6ac	271	if (ch & 0xFFFF) == ch && !dst.is_empty() {
85aaf69f SL	272	// The BMP falls through (assuming non-surrogate, as it should)
	273	dst[0] = ch as u16;
	274	Some(1)
	275	} else if dst.len() >= 2 {
	276	// Supplementary planes break into surrogates.
c34b1796 AL	277	ch -= 0x1_0000;
	278	dst[0] = 0xD800 \| ((ch >> 10) as u16);
	279	dst[1] = 0xDC00 \| ((ch as u16) & 0x3FF);
85aaf69f SL	280	Some(2)
	281	} else {
	282	None
1a4d82fc JJ	283	}
	284	}
	285
	286	/// An iterator over the characters that represent a `char`, as escaped by
	287	/// Rust's unicode escaping rules.
	288	#[derive(Clone)]
85aaf69f	289	#[stable(feature = "rust1", since = "1.0.0")]
1a4d82fc JJ	290	pub struct EscapeUnicode {
	291	c: char,
	292	state: EscapeUnicodeState
	293	}
	294
	295	#[derive(Clone)]
1a4d82fc JJ	296	enum EscapeUnicodeState {
	297	Backslash,
	298	Type,
	299	LeftBrace,
85aaf69f	300	Value(usize),
1a4d82fc JJ	301	RightBrace,
	302	Done,
	303	}
	304
85aaf69f	305	#[stable(feature = "rust1", since = "1.0.0")]
1a4d82fc JJ	306	impl Iterator for EscapeUnicode {
	307	type Item = char;
	308
	309	fn next(&mut self) -> Option<char> {
	310	match self.state {
	311	EscapeUnicodeState::Backslash => {
	312	self.state = EscapeUnicodeState::Type;
	313	Some('\\')
	314	}
	315	EscapeUnicodeState::Type => {
	316	self.state = EscapeUnicodeState::LeftBrace;
	317	Some('u')
	318	}
	319	EscapeUnicodeState::LeftBrace => {
85aaf69f	320	let mut n = 0;
1a4d82fc JJ	321	while (self.c as u32) >> (4 * (n + 1)) != 0 {
	322	n += 1;
	323	}
	324	self.state = EscapeUnicodeState::Value(n);
	325	Some('{')
	326	}
	327	EscapeUnicodeState::Value(offset) => {
c1a9b12d	328	let c = from_digit(((self.c as u32) >> (offset * 4)) & 0xf, 16).unwrap();
1a4d82fc JJ	329	if offset == 0 {
	330	self.state = EscapeUnicodeState::RightBrace;
	331	} else {
	332	self.state = EscapeUnicodeState::Value(offset - 1);
	333	}
c1a9b12d	334	Some(c)
1a4d82fc JJ	335	}
	336	EscapeUnicodeState::RightBrace => {
	337	self.state = EscapeUnicodeState::Done;
	338	Some('}')
	339	}
	340	EscapeUnicodeState::Done => None,
	341	}
	342	}
	343	}
	344
	345	/// An iterator over the characters that represent a `char`, escaped
	346	/// for maximum portability.
	347	#[derive(Clone)]
85aaf69f	348	#[stable(feature = "rust1", since = "1.0.0")]
1a4d82fc JJ	349	pub struct EscapeDefault {
	350	state: EscapeDefaultState
	351	}
	352
	353	#[derive(Clone)]
1a4d82fc JJ	354	enum EscapeDefaultState {
	355	Backslash(char),
	356	Char(char),
	357	Done,
	358	Unicode(EscapeUnicode),
	359	}
	360
85aaf69f	361	#[stable(feature = "rust1", since = "1.0.0")]
1a4d82fc JJ	362	impl Iterator for EscapeDefault {
	363	type Item = char;
	364
	365	fn next(&mut self) -> Option<char> {
	366	match self.state {
	367	EscapeDefaultState::Backslash(c) => {
	368	self.state = EscapeDefaultState::Char(c);
	369	Some('\\')
	370	}
	371	EscapeDefaultState::Char(c) => {
	372	self.state = EscapeDefaultState::Done;
	373	Some(c)
	374	}
	375	EscapeDefaultState::Done => None,
	376	EscapeDefaultState::Unicode(ref mut iter) => iter.next()
	377	}
	378	}
	379	}