[rustc.git] / vendor / tendril / src / fmt.rs

// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//! Marker types for formats.
//!
//! This module defines the types and traits used to mark a `Tendril`
//! with the format of data it contains. It includes those formats
//! for which `Tendril` supports at least some operations without
//! conversion.
//!
//! To convert a string tendril to/from a byte tendril in an arbitrary
//! character encoding, see the `encode` and `decode` methods on
//! `Tendril`.
//!
//! `Tendril` operations may become memory-unsafe if data invalid for
//! the format sneaks in. For that reason, these traits require
//! `unsafe impl`.

use std::default::Default;
use std::{char, mem, str};

use futf::{self, Codepoint, Meaning};

/// Implementation details.
///
/// You don't need these unless you are implementing
/// a new format.
pub mod imp {
    use std::default::Default;
    use std::{iter, slice};

    /// Describes how to fix up encodings when concatenating.
    ///
    /// We can drop characters on either side of the splice,
    /// and insert up to 4 bytes in the middle.
    pub struct Fixup {
        /// Amount to drop from the left-hand side of the splice.
        pub drop_left: u32,
        /// Amount to drop from the right-hand side of the splice.
        pub drop_right: u32,
        /// How many leading bytes of `insert_bytes` are to be inserted.
        pub insert_len: u32,
        /// Storage for the bytes inserted between the two sides.
        pub insert_bytes: [u8; 4],
    }

    impl Default for Fixup {
        /// The "do nothing" fixup: drop nothing, insert nothing.
        #[inline(always)]
        fn default() -> Fixup {
            Fixup {
                drop_left: 0,
                drop_right: 0,
                insert_len: 0,
                insert_bytes: [0; 4],
            }
        }
    }

    /// Iterator over `(byte index, char)` pairs for single-byte encodings
    /// in which every byte denotes the code point with the same value.
    pub struct SingleByteCharIndices<'a> {
        inner: iter::Enumerate<slice::Iter<'a, u8>>,
    }

    impl<'a> Iterator for SingleByteCharIndices<'a> {
        type Item = (usize, char);

        #[inline]
        fn next(&mut self) -> Option<(usize, char)> {
            // `char::from(u8)` is the safe, lossless mapping of a byte to the
            // code point U+0000..=U+00FF of the same value; no `unsafe` needed.
            self.inner.next().map(|(i, &b)| (i, char::from(b)))
        }

        /// Exactly one item is produced per remaining byte.
        #[inline]
        fn size_hint(&self) -> (usize, Option<usize>) {
            self.inner.size_hint()
        }
    }

    impl<'a> SingleByteCharIndices<'a> {
        /// Creates an iterator over the bytes of `buf`, starting at index 0.
        #[inline]
        pub fn new(buf: &'a [u8]) -> SingleByteCharIndices<'a> {
            SingleByteCharIndices {
                inner: buf.iter().enumerate(),
            }
        }
    }
}

/// Trait for format marker types.
///
/// The type implementing this trait is usually not instantiated.
/// It's used with a phantom type parameter of `Tendril`.
pub unsafe trait Format {
    /// Check whether the buffer is valid for this format.
    fn validate(buf: &[u8]) -> bool;

    /// Check whether the buffer is valid for this format.
    ///
    /// You may assume the buffer is a prefix of a valid buffer.
    #[inline]
    fn validate_prefix(buf: &[u8]) -> bool {
        <Self as Format>::validate(buf)
    }

    /// Check whether the buffer is valid for this format.
    ///
    /// You may assume the buffer is a suffix of a valid buffer.
    #[inline]
    fn validate_suffix(buf: &[u8]) -> bool {
        <Self as Format>::validate(buf)
    }

    /// Check whether the buffer is valid for this format.
    ///
    /// You may assume the buffer is a contiguous subsequence
    /// of a valid buffer, but not necessarily a prefix or
    /// a suffix.
    #[inline]
    fn validate_subseq(buf: &[u8]) -> bool {
        <Self as Format>::validate(buf)
    }

    /// Compute any fixup needed when concatenating buffers.
    ///
    /// The default is to do nothing.
    ///
    /// The function is `unsafe` because it may assume the input
    /// buffers are already valid for the format. Also, no
    /// bounds-checking is performed on the return value!
    #[inline(always)]
    unsafe fn fixup(_lhs: &[u8], _rhs: &[u8]) -> imp::Fixup {
        Default::default()
    }
}

/// Declares that this format is a subset of the format `Super`.
///
/// Data in the subset format can be reinterpreted as the superset
/// format for free.
///
/// # Safety
///
/// Every buffer valid for `Self` must also be valid for `Super`.
pub unsafe trait SubsetOf<Super: Format>: Format {
    /// Checks the *reverse* conversion: does this buffer, taken from the
    /// superset format, also conform to the subset format?
    ///
    /// The provided implementation runs `Self::validate`; an implementor
    /// may substitute a check that is cheaper than validating from
    /// scratch.
    fn revalidate_subset(x: &[u8]) -> bool {
        <Self as Format>::validate(x)
    }
}

/// Indicates a format which corresponds to a Rust slice type,
/// representing exactly the same invariants.
///
/// In this file, `Bytes` is paired with `[u8]` and `UTF8` with `str`.
///
/// # Safety
///
/// Because the slice type is declared to carry exactly the format's
/// invariants, data that passes `Format::validate` must be a legal value
/// of `Slice`.
pub unsafe trait SliceFormat: Format + Sized {
    /// The (possibly unsized) Rust slice type with the same invariants as
    /// this format.
    type Slice: ?Sized + Slice;
}

/// Indicates a format which contains characters from Unicode
/// (all of it, or some proper subset).
///
/// The lifetime `'a` is the lifetime of the buffer borrowed by the
/// iterator returned from `char_indices`.
pub unsafe trait CharFormat<'a>: Format {
    /// Iterator for characters and their byte indices.
    type Iter: Iterator<Item = (usize, char)>;

    /// Iterate over the characters of the string and their byte
    /// indices.
    ///
    /// # Safety
    ///
    /// Implementations may assume the buffer is *already validated* for
    /// `Format`, so callers must only pass validated data.
    unsafe fn char_indices(buf: &'a [u8]) -> Self::Iter;

    /// Encode the character as bytes and pass them to a continuation.
    ///
    /// Returns `Err(())` iff the character cannot be represented.
    /// The implementations in this file do not invoke `cont` when they
    /// return `Err(())`.
    fn encode_char<F>(ch: char, cont: F) -> Result<(), ()>
    where
        F: FnOnce(&[u8]);
}

/// Indicates a Rust slice type that is represented in memory as bytes.
///
/// Implemented in this file for `[u8]` and `str`.
pub unsafe trait Slice {
    /// Access the raw bytes of the slice.
    fn as_bytes(&self) -> &[u8];

    /// Convert a byte slice to this kind of slice.
    ///
    /// # Safety
    ///
    /// Implementations may assume the buffer is *already validated*
    /// for `Format`; callers must uphold that.
    unsafe fn from_bytes(x: &[u8]) -> &Self;

    /// Convert a mutable byte slice to a mutable slice of this kind.
    ///
    /// # Safety
    ///
    /// Implementations may assume the buffer is *already validated*
    /// for `Format`; callers must uphold that.
    unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut Self;
}

/// Marker type for uninterpreted bytes.
///
/// Validation will never fail for this format. Its corresponding Rust
/// slice type (via `SliceFormat`) is `[u8]`.
#[derive(Copy, Clone, Default, Debug)]
pub struct Bytes;

// SAFETY: `Bytes` places no constraint on its contents, so accepting
// every buffer cannot let invalid data in.
unsafe impl Format for Bytes {
    /// Always succeeds: any byte sequence is valid uninterpreted bytes.
    ///
    /// The prefix/suffix/subsequence checks are not overridden, so they
    /// use the trait defaults, which call this function.
    #[inline(always)]
    fn validate(_: &[u8]) -> bool {
        true
    }
}

// SAFETY: both `Bytes` and `[u8]` admit every byte sequence, so they carry
// the same (empty) set of invariants.
unsafe impl SliceFormat for Bytes {
    // Uninterpreted bytes are exposed as a plain byte slice.
    type Slice = [u8];
}

// SAFETY: a `[u8]` is its own byte representation, so all three conversions
// are the identity.
unsafe impl Slice for [u8] {
    /// Returns the slice itself.
    #[inline(always)]
    fn as_bytes(&self) -> &[u8] {
        self
    }

    /// Returns `x` unchanged; there is nothing to reinterpret.
    #[inline(always)]
    unsafe fn from_bytes(x: &[u8]) -> &[u8] {
        x
    }

    /// Returns `x` unchanged; there is nothing to reinterpret.
    #[inline(always)]
    unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut [u8] {
        x
    }
}

/// Marker type for ASCII text.
///
/// A buffer is valid for this format when every byte is at most 127.
/// The format is declared a subset of both `UTF8` and `Latin1`.
#[derive(Copy, Clone, Default, Debug)]
pub struct ASCII;

unsafe impl Format for ASCII {
    /// A buffer is ASCII when every byte is in `0x00..=0x7F`.
    #[inline]
    fn validate(buf: &[u8]) -> bool {
        buf.is_ascii()
    }

    /// No check needed: a prefix of an all-ASCII buffer is all ASCII.
    #[inline(always)]
    fn validate_prefix(_buf: &[u8]) -> bool {
        true
    }

    /// No check needed: a suffix of an all-ASCII buffer is all ASCII.
    #[inline(always)]
    fn validate_suffix(_buf: &[u8]) -> bool {
        true
    }

    /// No check needed: any contiguous piece of an all-ASCII buffer is
    /// all ASCII.
    #[inline(always)]
    fn validate_subseq(_buf: &[u8]) -> bool {
        true
    }
}

// SAFETY: every byte in 0x00..=0x7F is, on its own, the UTF-8 encoding of
// the code point with the same value, so ASCII data is valid UTF-8.
unsafe impl SubsetOf<UTF8> for ASCII {}
// SAFETY: `Latin1` accepts every byte sequence, ASCII included.
unsafe impl SubsetOf<Latin1> for ASCII {}

unsafe impl<'a> CharFormat<'a> for ASCII {
    type Iter = imp::SingleByteCharIndices<'a>;

    /// Yields each byte of `buf` as the character with the same value,
    /// paired with its index.
    #[inline]
    unsafe fn char_indices(buf: &'a [u8]) -> imp::SingleByteCharIndices<'a> {
        imp::SingleByteCharIndices::new(buf)
    }

    /// Passes the single byte encoding `ch` to `cont`.
    ///
    /// Characters above U+007F are not representable: `Err(())` is
    /// returned and `cont` is not called.
    #[inline]
    fn encode_char<F>(ch: char, cont: F) -> Result<(), ()>
    where
        F: FnOnce(&[u8]),
    {
        if !ch.is_ascii() {
            return Err(());
        }
        let byte = ch as u8;
        cont(&[byte]);
        Ok(())
    }
}

/// Marker type for UTF-8 text.
///
/// Full validation uses `str::from_utf8`; the corresponding Rust slice
/// type (via `SliceFormat`) is `str`.
#[derive(Copy, Clone, Default, Debug)]
pub struct UTF8;

unsafe impl Format for UTF8 {
    /// Full validation: the buffer must be accepted by `str::from_utf8`.
    #[inline]
    fn validate(buf: &[u8]) -> bool {
        str::from_utf8(buf).is_ok()
    }

    /// Checks only the tail of a buffer assumed to be a prefix of valid
    /// UTF-8.
    ///
    /// An empty buffer is valid. Otherwise `futf::classify` is asked about
    /// the final byte (index `len - 1`) and only a `Meaning::Whole`
    /// result is accepted.
    #[inline]
    fn validate_prefix(buf: &[u8]) -> bool {
        buf.is_empty()
            || matches!(
                futf::classify(buf, buf.len() - 1),
                Some(Codepoint {
                    meaning: Meaning::Whole(_),
                    ..
                })
            )
    }

    /// Checks only the head of a buffer assumed to be a suffix of valid
    /// UTF-8.
    ///
    /// An empty buffer is valid. Otherwise `futf::classify` is asked about
    /// the first byte (index 0) and only a `Meaning::Whole` result is
    /// accepted.
    #[inline]
    fn validate_suffix(buf: &[u8]) -> bool {
        buf.is_empty()
            || matches!(
                futf::classify(buf, 0),
                Some(Codepoint {
                    meaning: Meaning::Whole(_),
                    ..
                })
            )
    }

    /// A subsequence of valid UTF-8 is valid when both of its ends are:
    /// the prefix check covers the tail, the suffix check covers the head.
    #[inline]
    fn validate_subseq(buf: &[u8]) -> bool {
        Self::validate_prefix(buf) && Self::validate_suffix(buf)
    }
}

// SAFETY: WTF-8 is a superset of UTF-8 (it additionally admits unpaired
// surrogates), so UTF-8 data needs no conversion to be treated as WTF-8.
unsafe impl SubsetOf<WTF8> for UTF8 {}

// SAFETY: `UTF8::validate` is `str::from_utf8(..).is_ok()`, which is
// exactly the validity requirement of `str`.
unsafe impl SliceFormat for UTF8 {
    // UTF-8 text is exposed as a string slice.
    type Slice = str;
}

unsafe impl Slice for str {
    #[inline(always)]
    fn as_bytes(&self) -> &[u8] {
        str::as_bytes(self)
    }

    #[inline(always)]
    unsafe fn from_bytes(x: &[u8]) -> &str {
        str::from_utf8_unchecked(x)
    }

    #[inline(always)]
    unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut str {
        mem::transmute(x)
    }
}

unsafe impl<'a> CharFormat<'a> for UTF8 {
    type Iter = str::CharIndices<'a>;

    /// Views `buf` as a `str` without re-checking it and iterates over
    /// its characters together with their byte offsets.
    #[inline]
    unsafe fn char_indices(buf: &'a [u8]) -> str::CharIndices<'a> {
        let text: &'a str = str::from_utf8_unchecked(buf);
        text.char_indices()
    }

    /// Encodes `ch` as UTF-8 into a stack buffer and hands the encoded
    /// bytes to `cont`.
    ///
    /// This implementation always returns `Ok(())`.
    #[inline]
    fn encode_char<F>(ch: char, cont: F) -> Result<(), ()>
    where
        F: FnOnce(&[u8]),
    {
        let mut scratch = [0_u8; 4];
        let encoded = ch.encode_utf8(&mut scratch);
        cont(encoded.as_bytes());
        Ok(())
    }
}

/// Marker type for WTF-8 text.
///
/// See the [WTF-8 spec](https://simonsapin.github.io/wtf-8/).
///
/// Validation in this file accepts whole code points as well as lead and
/// trail surrogates, but rejects a trail surrogate that directly follows
/// a lead surrogate.
#[derive(Copy, Clone, Default, Debug)]
pub struct WTF8;

/// Returns `true` for the classifications WTF-8 accepts as a complete
/// unit: a whole code point, a lead surrogate, or a trail surrogate.
/// Every other classification yields `false`.
#[inline]
fn wtf8_meaningful(m: Meaning) -> bool {
    matches!(
        m,
        Meaning::Whole(_) | Meaning::LeadSurrogate(_) | Meaning::TrailSurrogate(_)
    )
}

unsafe impl Format for WTF8 {
    /// Walks the buffer one classified code point at a time.
    ///
    /// Validation fails when `futf::classify` yields nothing for a
    /// position, when a code point is neither whole nor a surrogate, or
    /// when a trail surrogate directly follows a lead surrogate.
    #[inline]
    fn validate(buf: &[u8]) -> bool {
        let mut pos = 0;
        let mut after_lead = false;
        while pos < buf.len() {
            let cp = unwrap_or_return!(futf::classify(buf, pos), false);
            after_lead = match cp.meaning {
                Meaning::LeadSurrogate(_) => true,
                Meaning::Whole(_) => false,
                // A lead surrogate immediately followed by a trail
                // surrogate is not allowed.
                Meaning::TrailSurrogate(_) if after_lead => return false,
                Meaning::TrailSurrogate(_) => false,
                // Anything that is not a complete unit is invalid.
                _ => return false,
            };
            pos += cp.bytes.len();
        }

        true
    }

    /// Checks only the tail of a buffer assumed to be a prefix of valid
    /// WTF-8: the classification of the final byte must be meaningful.
    /// An empty buffer is valid.
    #[inline]
    fn validate_prefix(buf: &[u8]) -> bool {
        if buf.is_empty() {
            return true;
        }
        futf::classify(buf, buf.len() - 1).map_or(false, |c| wtf8_meaningful(c.meaning))
    }

    /// Checks only the head of a buffer assumed to be a suffix of valid
    /// WTF-8: the classification of the first byte must be meaningful.
    /// An empty buffer is valid.
    #[inline]
    fn validate_suffix(buf: &[u8]) -> bool {
        if buf.is_empty() {
            return true;
        }
        futf::classify(buf, 0).map_or(false, |c| wtf8_meaningful(c.meaning))
    }

    /// A subsequence is valid when both its tail (prefix check) and its
    /// head (suffix check) are.
    #[inline]
    fn validate_subseq(buf: &[u8]) -> bool {
        Self::validate_prefix(buf) && Self::validate_suffix(buf)
    }

    /// Merges a surrogate pair that would be created by concatenation.
    ///
    /// When `lhs` ends in a lead surrogate and `rhs` begins with a trail
    /// surrogate, the returned fixup drops 3 from each side and inserts
    /// the UTF-8 encoding of the combined code point. In every other
    /// case the default (no-op) fixup is returned.
    ///
    /// # Panics
    ///
    /// Panics with "WTF8: internal error" if the combined value is not a
    /// valid `char`.
    #[inline]
    unsafe fn fixup(lhs: &[u8], rhs: &[u8]) -> imp::Fixup {
        const ERR: &'static str = "WTF8: internal error";

        // Both sides must be at least 3 long to be considered at all.
        if lhs.len() < 3 || rhs.len() < 3 {
            return Default::default();
        }

        let (hi, lo) = match (futf::classify(lhs, lhs.len() - 1), futf::classify(rhs, 0)) {
            (
                Some(Codepoint {
                    meaning: Meaning::LeadSurrogate(hi),
                    ..
                }),
                Some(Codepoint {
                    meaning: Meaning::TrailSurrogate(lo),
                    ..
                }),
            ) => (hi, lo),
            _ => return Default::default(),
        };

        // Combine the two surrogate payloads into one code point.
        let n = 0x10000 + ((hi as u32) << 10) + (lo as u32);
        let ch = char::from_u32(n).expect(ERR);

        let mut insert_bytes = [0_u8; 4];
        let insert_len = ch.encode_utf8(&mut insert_bytes).len() as u32;

        imp::Fixup {
            drop_left: 3,
            drop_right: 3,
            insert_len,
            insert_bytes,
        }
    }
}

/// Marker type for the single-byte encoding of the first 256 Unicode codepoints.
///
/// This is IANA's "ISO-8859-1". It's ISO's "ISO 8859-1" with the addition of the
/// C0 and C1 control characters from ECMA-48 / ISO 6429.
///
/// Not to be confused with WHATWG's "latin1" or "iso8859-1" labels (or the
/// many other aliases), which actually stand for Windows-1252.
///
/// Every byte sequence is valid for this format; characters above U+00FF
/// cannot be encoded in it.
#[derive(Copy, Clone, Default, Debug)]
pub struct Latin1;

// SAFETY: the format covers all 256 byte values, so accepting every buffer
// cannot let invalid data in.
unsafe impl Format for Latin1 {
    /// Always succeeds: every byte is a valid Latin-1 character.
    #[inline(always)]
    fn validate(_: &[u8]) -> bool {
        true
    }

    /// Always succeeds, like `validate`.
    #[inline(always)]
    fn validate_prefix(_: &[u8]) -> bool {
        true
    }

    /// Always succeeds, like `validate`.
    #[inline(always)]
    fn validate_suffix(_: &[u8]) -> bool {
        true
    }

    /// Always succeeds, like `validate`.
    #[inline(always)]
    fn validate_subseq(_: &[u8]) -> bool {
        true
    }
}

unsafe impl<'a> CharFormat<'a> for Latin1 {
    type Iter = imp::SingleByteCharIndices<'a>;

    /// Yields each byte of `buf` as the character with the same value,
    /// paired with its index.
    #[inline]
    unsafe fn char_indices(buf: &'a [u8]) -> imp::SingleByteCharIndices<'a> {
        imp::SingleByteCharIndices::new(buf)
    }

    /// Passes the single byte encoding `ch` to `cont`.
    ///
    /// Characters above U+00FF are not representable: `Err(())` is
    /// returned and `cont` is not called.
    #[inline]
    fn encode_char<F>(ch: char, cont: F) -> Result<(), ()>
    where
        F: FnOnce(&[u8]),
    {
        let code = ch as u32;
        if code <= 0xFF {
            cont(&[code as u8]);
            Ok(())
        } else {
            Err(())
        }
    }
}
Commit	Line	Data
83c7162d	1	// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
923072b8 FG	2	// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
923072b8 FG	3	// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
83c7162d XL	4	// option. This file may not be copied, modified, or distributed
	5	// except according to those terms.
	6
	7	//! Marker types for formats.
	8	//!
	9	//! This module defines the types and traits used to mark a `Tendril`
	10	//! with the format of data it contains. It includes those formats
	11	//! for which `Tendril` supports at least some operations without
	12	//! conversion.
	13	//!
	14	//! To convert a string tendril to/from a byte tendril in an arbitrary
	15	//! character encoding, see the `encode` and `decode` methods on
	16	//! `Tendril`.
	17	//!
	18	//! `Tendril` operations may become memory-unsafe if data invalid for
	19	//! the format sneaks in. For that reason, these traits require
	20	//! `unsafe impl`.
	21
83c7162d	22	use std::default::Default;
923072b8	23	use std::{char, mem, str};
83c7162d XL	24
	25	use futf::{self, Codepoint, Meaning};
	26
83c7162d XL	27	/// Implementation details.
	28	///
	29	/// You don't need these unless you are implementing
	30	/// a new format.
	31	pub mod imp {
83c7162d	32	use std::default::Default;
923072b8	33	use std::{iter, mem, slice};
83c7162d XL	34
	35	/// Describes how to fix up encodings when concatenating.
	36	///
	37	/// We can drop characters on either side of the splice,
	38	/// and insert up to 4 bytes in the middle.
	39	pub struct Fixup {
	40	pub drop_left: u32,
	41	pub drop_right: u32,
	42	pub insert_len: u32,
	43	pub insert_bytes: [u8; 4],
	44	}
	45
	46	impl Default for Fixup {
	47	#[inline(always)]
	48	fn default() -> Fixup {
	49	Fixup {
	50	drop_left: 0,
	51	drop_right: 0,
	52	insert_len: 0,
	53	insert_bytes: [0; 4],
	54	}
	55	}
	56	}
	57
	58	#[inline(always)]
	59	unsafe fn from_u32_unchecked(n: u32) -> char {
	60	mem::transmute(n)
	61	}
	62
	63	pub struct SingleByteCharIndices<'a> {
	64	inner: iter::Enumerate<slice::Iter<'a, u8>>,
	65	}
	66
	67	impl<'a> Iterator for SingleByteCharIndices<'a> {
	68	type Item = (usize, char);
	69
	70	#[inline]
	71	fn next(&mut self) -> Option<(usize, char)> {
923072b8 FG	72	self.inner
	73	.next()
	74	.map(\|(i, &b)\| unsafe { (i, from_u32_unchecked(b as u32)) })
83c7162d XL	75	}
	76	}
	77
	78	impl<'a> SingleByteCharIndices<'a> {
	79	#[inline]
	80	pub fn new(buf: &'a [u8]) -> SingleByteCharIndices<'a> {
	81	SingleByteCharIndices {
	82	inner: buf.iter().enumerate(),
	83	}
	84	}
	85	}
	86	}
	87
	88	/// Trait for format marker types.
	89	///
	90	/// The type implementing this trait is usually not instantiated.
	91	/// It's used with a phantom type parameter of `Tendril`.
	92	pub unsafe trait Format {
	93	/// Check whether the buffer is valid for this format.
	94	fn validate(buf: &[u8]) -> bool;
	95
	96	/// Check whether the buffer is valid for this format.
	97	///
	98	/// You may assume the buffer is a prefix of a valid buffer.
	99	#[inline]
	100	fn validate_prefix(buf: &[u8]) -> bool {
	101	<Self as Format>::validate(buf)
	102	}
	103
	104	/// Check whether the buffer is valid for this format.
	105	///
	106	/// You may assume the buffer is a suffix of a valid buffer.
	107	#[inline]
	108	fn validate_suffix(buf: &[u8]) -> bool {
	109	<Self as Format>::validate(buf)
	110	}
	111
	112	/// Check whether the buffer is valid for this format.
	113	///
	114	/// You may assume the buffer is a contiguous subsequence
	115	/// of a valid buffer, but not necessarily a prefix or
	116	/// a suffix.
	117	#[inline]
	118	fn validate_subseq(buf: &[u8]) -> bool {
	119	<Self as Format>::validate(buf)
	120	}
	121
	122	/// Compute any fixup needed when concatenating buffers.
	123	///
	124	/// The default is to do nothing.
	125	///
	126	/// The function is `unsafe` because it may assume the input
	127	/// buffers are already valid for the format. Also, no
	128	/// bounds-checking is performed on the return value!
	129	#[inline(always)]
	130	unsafe fn fixup(_lhs: &[u8], _rhs: &[u8]) -> imp::Fixup {
	131	Default::default()
	132	}
	133	}
	134
	135	/// Indicates that one format is a subset of another.
	136	///
	137	/// The subset format can be converted to the superset format
	138	/// for free.
139	pub unsafe trait SubsetOf<Super>: Format
923072b8 FG	140	where
923072b8 FG	141	Super: Format,
83c7162d XL	142	{
	143	/// Validate the other direction of conversion; check if
	144	/// this buffer from the superset format conforms to the
	145	/// subset format.
	146	///
	147	/// The default calls `Self::validate`, but some conversions
	148	/// may implement a check which is cheaper than validating
	149	/// from scratch.
	150	fn revalidate_subset(x: &[u8]) -> bool {
	151	Self::validate(x)
	152	}
	153	}
	154
	155	/// Indicates a format which corresponds to a Rust slice type,
	156	/// representing exactly the same invariants.
	157	pub unsafe trait SliceFormat: Format + Sized {
	158	type Slice: ?Sized + Slice;
	159	}
	160
	161	/// Indicates a format which contains characters from Unicode
	162	/// (all of it, or some proper subset).
	163	pub unsafe trait CharFormat<'a>: Format {
	164	/// Iterator for characters and their byte indices.
923072b8	165	type Iter: Iterator<Item = (usize, char)>;
83c7162d XL	166
	167	/// Iterate over the characters of the string and their byte
	168	/// indices.
	169	///
	170	/// You may assume the buffer is already validated for `Format`.
	171	unsafe fn char_indices(buf: &'a [u8]) -> Self::Iter;
	172
	173	/// Encode the character as bytes and pass them to a continuation.
	174	///
	175	/// Returns `Err(())` iff the character cannot be represented.
	176	fn encode_char<F>(ch: char, cont: F) -> Result<(), ()>
923072b8 FG	177	where
923072b8 FG	178	F: FnOnce(&[u8]);
83c7162d XL	179	}
	180
	181	/// Indicates a Rust slice type that is represented in memory as bytes.
	182	pub unsafe trait Slice {
	183	/// Access the raw bytes of the slice.
	184	fn as_bytes(&self) -> &[u8];
	185
	186	/// Convert a byte slice to this kind of slice.
	187	///
	188	/// You may assume the buffer is already validated
	189	/// for `Format`.
	190	unsafe fn from_bytes(x: &[u8]) -> &Self;
	191
	192	/// Convert a byte slice to this kind of slice.
	193	///
	194	/// You may assume the buffer is already validated
	195	/// for `Format`.
	196	unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut Self;
	197	}
	198
	199	/// Marker type for uninterpreted bytes.
	200	///
	201	/// Validation will never fail for this format.
	202	#[derive(Copy, Clone, Default, Debug)]
	203	pub struct Bytes;
	204
	205	unsafe impl Format for Bytes {
	206	#[inline(always)]
	207	fn validate(_: &[u8]) -> bool {
	208	true
	209	}
	210	}
	211
	212	unsafe impl SliceFormat for Bytes {
	213	type Slice = [u8];
	214	}
	215
	216	unsafe impl Slice for [u8] {
	217	#[inline(always)]
	218	fn as_bytes(&self) -> &[u8] {
	219	self
	220	}
	221
	222	#[inline(always)]
	223	unsafe fn from_bytes(x: &[u8]) -> &[u8] {
	224	x
	225	}
	226
	227	#[inline(always)]
	228	unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut [u8] {
	229	x
	230	}
	231	}
	232
	233	/// Marker type for ASCII text.
	234	#[derive(Copy, Clone, Default, Debug)]
	235	pub struct ASCII;
	236
	237	unsafe impl Format for ASCII {
	238	#[inline]
	239	fn validate(buf: &[u8]) -> bool {
	240	buf.iter().all(\|&n\| n <= 127)
	241	}
	242
243	#[inline(always)]
244	fn validate_prefix(_: &[u8]) -> bool {
245	true
246	}
247
248	#[inline(always)]
249	fn validate_suffix(_: &[u8]) -> bool {
250	true
251	}
252
253	#[inline(always)]
254	fn validate_subseq(_: &[u8]) -> bool {
255	true
256	}
257	}
258
923072b8 FG	259	unsafe impl SubsetOf<UTF8> for ASCII {}
923072b8 FG	260	unsafe impl SubsetOf<Latin1> for ASCII {}
83c7162d XL	261
	262	unsafe impl<'a> CharFormat<'a> for ASCII {
	263	type Iter = imp::SingleByteCharIndices<'a>;
	264
	265	#[inline]
	266	unsafe fn char_indices(buf: &'a [u8]) -> imp::SingleByteCharIndices<'a> {
	267	imp::SingleByteCharIndices::new(buf)
	268	}
	269
	270	#[inline]
	271	fn encode_char<F>(ch: char, cont: F) -> Result<(), ()>
923072b8 FG	272	where
923072b8 FG	273	F: FnOnce(&[u8]),
83c7162d XL	274	{
83c7162d XL	275	let n = ch as u32;
923072b8 FG	276	if n > 0x7F {
	277	return Err(());
	278	}
83c7162d XL	279	cont(&[n as u8]);
	280	Ok(())
	281	}
	282	}
	283
	284	/// Marker type for UTF-8 text.
	285	#[derive(Copy, Clone, Default, Debug)]
	286	pub struct UTF8;
	287
	288	unsafe impl Format for UTF8 {
	289	#[inline]
	290	fn validate(buf: &[u8]) -> bool {
	291	str::from_utf8(buf).is_ok()
	292	}
	293
	294	#[inline]
	295	fn validate_prefix(buf: &[u8]) -> bool {
	296	if buf.len() == 0 {
	297	return true;
	298	}
	299	match futf::classify(buf, buf.len() - 1) {
923072b8 FG	300	Some(Codepoint {
	301	meaning: Meaning::Whole(_),
	302	..
	303	}) => true,
83c7162d XL	304	_ => false,
	305	}
	306	}
	307
	308	#[inline]
	309	fn validate_suffix(buf: &[u8]) -> bool {
	310	if buf.len() == 0 {
	311	return true;
	312	}
	313	match futf::classify(buf, 0) {
923072b8 FG	314	Some(Codepoint {
	315	meaning: Meaning::Whole(_),
	316	..
	317	}) => true,
83c7162d XL	318	_ => false,
	319	}
	320	}
	321
	322	#[inline]
	323	fn validate_subseq(buf: &[u8]) -> bool {
923072b8	324	<Self as Format>::validate_prefix(buf) && <Self as Format>::validate_suffix(buf)
83c7162d XL	325	}
	326	}
	327
923072b8	328	unsafe impl SubsetOf<WTF8> for UTF8 {}
83c7162d XL	329
	330	unsafe impl SliceFormat for UTF8 {
	331	type Slice = str;
	332	}
	333
	334	unsafe impl Slice for str {
	335	#[inline(always)]
	336	fn as_bytes(&self) -> &[u8] {
	337	str::as_bytes(self)
	338	}
	339
	340	#[inline(always)]
	341	unsafe fn from_bytes(x: &[u8]) -> &str {
	342	str::from_utf8_unchecked(x)
	343	}
	344
	345	#[inline(always)]
	346	unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut str {
	347	mem::transmute(x)
	348	}
	349	}
	350
	351	unsafe impl<'a> CharFormat<'a> for UTF8 {
	352	type Iter = str::CharIndices<'a>;
	353
	354	#[inline]
	355	unsafe fn char_indices(buf: &'a [u8]) -> str::CharIndices<'a> {
	356	str::from_utf8_unchecked(buf).char_indices()
	357	}
	358
	359	#[inline]
	360	fn encode_char<F>(ch: char, cont: F) -> Result<(), ()>
923072b8 FG	361	where
923072b8 FG	362	F: FnOnce(&[u8]),
83c7162d	363	{
923072b8 FG	364	cont(ch.encode_utf8(&mut [0_u8; 4]).as_bytes());
923072b8 FG	365	Ok(())
83c7162d XL	366	}
	367	}
	368
	369	/// Marker type for WTF-8 text.
	370	///
923072b8	371	/// See the [WTF-8 spec](https://simonsapin.github.io/wtf-8/).
83c7162d XL	372	#[derive(Copy, Clone, Default, Debug)]
	373	pub struct WTF8;
	374
	375	#[inline]
	376	fn wtf8_meaningful(m: Meaning) -> bool {
	377	match m {
923072b8	378	Meaning::Whole(_) \| Meaning::LeadSurrogate(_) \| Meaning::TrailSurrogate(_) => true,
83c7162d XL	379	_ => false,
	380	}
	381	}
	382
	383	unsafe impl Format for WTF8 {
	384	#[inline]
	385	fn validate(buf: &[u8]) -> bool {
	386	let mut i = 0;
	387	let mut prev_lead = false;
	388	while i < buf.len() {
	389	let codept = unwrap_or_return!(futf::classify(buf, i), false);
	390	if !wtf8_meaningful(codept.meaning) {
	391	return false;
	392	}
	393	i += codept.bytes.len();
	394	prev_lead = match codept.meaning {
	395	Meaning::TrailSurrogate(_) if prev_lead => return false,
	396	Meaning::LeadSurrogate(_) => true,
	397	_ => false,
	398	};
	399	}
	400
	401	true
	402	}
	403
	404	#[inline]
	405	fn validate_prefix(buf: &[u8]) -> bool {
	406	if buf.len() == 0 {
	407	return true;
	408	}
	409	match futf::classify(buf, buf.len() - 1) {
	410	Some(c) => wtf8_meaningful(c.meaning),
	411	_ => false,
	412	}
	413	}
	414
	415	#[inline]
	416	fn validate_suffix(buf: &[u8]) -> bool {
	417	if buf.len() == 0 {
	418	return true;
	419	}
	420	match futf::classify(buf, 0) {
	421	Some(c) => wtf8_meaningful(c.meaning),
	422	_ => false,
	423	}
	424	}
	425
	426	#[inline]
	427	fn validate_subseq(buf: &[u8]) -> bool {
923072b8	428	<Self as Format>::validate_prefix(buf) && <Self as Format>::validate_suffix(buf)
83c7162d XL	429	}
	430
	431	#[inline]
	432	unsafe fn fixup(lhs: &[u8], rhs: &[u8]) -> imp::Fixup {
	433	const ERR: &'static str = "WTF8: internal error";
	434
	435	if lhs.len() >= 3 && rhs.len() >= 3 {
923072b8 FG	436	if let (
	437	Some(Codepoint {
	438	meaning: Meaning::LeadSurrogate(hi),
	439	..
	440	}),
	441	Some(Codepoint {
	442	meaning: Meaning::TrailSurrogate(lo),
	443	..
	444	}),
	445	) = (futf::classify(lhs, lhs.len() - 1), futf::classify(rhs, 0))
83c7162d XL	446	{
	447	let mut fixup = imp::Fixup {
	448	drop_left: 3,
	449	drop_right: 3,
	450	insert_len: 0,
923072b8	451	insert_bytes: [0_u8; 4],
83c7162d XL	452	};
	453
	454	let n = 0x10000 + ((hi as u32) << 10) + (lo as u32);
	455
923072b8 FG	456	let ch = char::from_u32(n).expect(ERR);
923072b8 FG	457	fixup.insert_len = ch.encode_utf8(&mut fixup.insert_bytes).len() as u32;
83c7162d XL	458
	459	return fixup;
	460	}
	461	}
	462
	463	Default::default()
	464	}
	465	}
	466
	467	/// Marker type for the single-byte encoding of the first 256 Unicode codepoints.
	468	///
	469	/// This is IANA's "ISO-8859-1". It's ISO's "ISO 8859-1" with the addition of the
	470	/// C0 and C1 control characters from ECMA-48 / ISO 6429.
	471	///
	472	/// Not to be confused with WHATWG's "latin1" or "iso8859-1" labels (or the
	473	/// many other aliases), which actually stand for Windows-1252.
	474	#[derive(Copy, Clone, Default, Debug)]
	475	pub struct Latin1;
	476
	477	unsafe impl Format for Latin1 {
	478	#[inline(always)]
	479	fn validate(_: &[u8]) -> bool {
	480	true
	481	}
	482
	483	#[inline(always)]
	484	fn validate_prefix(_: &[u8]) -> bool {
	485	true
	486	}
	487
	488	#[inline(always)]
	489	fn validate_suffix(_: &[u8]) -> bool {
	490	true
	491	}
	492
	493	#[inline(always)]
	494	fn validate_subseq(_: &[u8]) -> bool {
	495	true
	496	}
	497	}
	498
	499	unsafe impl<'a> CharFormat<'a> for Latin1 {
	500	type Iter = imp::SingleByteCharIndices<'a>;
	501
	502	#[inline]
	503	unsafe fn char_indices(buf: &'a [u8]) -> imp::SingleByteCharIndices<'a> {
	504	imp::SingleByteCharIndices::new(buf)
	505	}
	506
	507	#[inline]
	508	fn encode_char<F>(ch: char, cont: F) -> Result<(), ()>
923072b8 FG	509	where
923072b8 FG	510	F: FnOnce(&[u8]),
83c7162d XL	511	{
83c7162d XL	512	let n = ch as u32;
923072b8 FG	513	if n > 0xFF {
	514	return Err(());
	515	}
83c7162d XL	516	cont(&[n as u8]);
	517	Ok(())
	518	}
	519	}