[rustc.git] / library / core / src / str / validations.rs

//! Operations related to UTF-8 validation.

use crate::mem;

use super::Utf8Error;

/// Returns the initial codepoint accumulator for the first byte.
/// The first byte is special, only want bottom 5 bits for width 2, 4 bits
/// for width 3, and 3 bits for width 4.
#[inline]
const fn utf8_first_byte(byte: u8, width: u32) -> u32 {
    (byte & (0x7F >> width)) as u32
}

/// Returns the value of `ch` updated with continuation byte `byte`.
#[inline]
const fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
    (ch << 6) | (byte & CONT_MASK) as u32
}

/// Checks whether the byte is a UTF-8 continuation byte (i.e., starts with the
/// bits `10`).
#[inline]
pub(super) const fn utf8_is_cont_byte(byte: u8) -> bool {
    (byte as i8) < -64
}

/// Reads the next code point out of a byte iterator (assuming a
/// UTF-8-like encoding).
///
/// # Safety
///
/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string
#[unstable(feature = "str_internals", issue = "none")]
#[inline]
pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> {
    // Decode UTF-8
    let x = *bytes.next()?;
    if x < 128 {
        return Some(x as u32);
    }

    // Multibyte case follows
    // Decode from a byte combination out of: [[[x y] z] w]
    // NOTE: Performance is sensitive to the exact formulation here
    let init = utf8_first_byte(x, 2);
    // SAFETY: `bytes` produces an UTF-8-like string,
    // so the iterator must produce a value here.
    let y = unsafe { *bytes.next().unwrap_unchecked() };
    let mut ch = utf8_acc_cont_byte(init, y);
    if x >= 0xE0 {
        // [[x y z] w] case
        // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
        // SAFETY: `bytes` produces an UTF-8-like string,
        // so the iterator must produce a value here.
        let z = unsafe { *bytes.next().unwrap_unchecked() };
        let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
        ch = init << 12 | y_z;
        if x >= 0xF0 {
            // [x y z w] case
            // use only the lower 3 bits of `init`
            // SAFETY: `bytes` produces an UTF-8-like string,
            // so the iterator must produce a value here.
            let w = unsafe { *bytes.next().unwrap_unchecked() };
            ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w);
        }
    }

    Some(ch)
}

/// Reads the last code point out of a byte iterator (assuming a
/// UTF-8-like encoding).
///
/// # Safety
///
/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string
#[inline]
pub(super) unsafe fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option<u32>
where
    I: DoubleEndedIterator<Item = &'a u8>,
{
    // Decode UTF-8
    let w = match *bytes.next_back()? {
        next_byte if next_byte < 128 => return Some(next_byte as u32),
        back_byte => back_byte,
    };

    // Multibyte case follows
    // Decode from a byte combination out of: [x [y [z w]]]
    let mut ch;
    // SAFETY: `bytes` produces an UTF-8-like string,
    // so the iterator must produce a value here.
    let z = unsafe { *bytes.next_back().unwrap_unchecked() };
    ch = utf8_first_byte(z, 2);
    if utf8_is_cont_byte(z) {
        // SAFETY: `bytes` produces an UTF-8-like string,
        // so the iterator must produce a value here.
        let y = unsafe { *bytes.next_back().unwrap_unchecked() };
        ch = utf8_first_byte(y, 3);
        if utf8_is_cont_byte(y) {
            // SAFETY: `bytes` produces an UTF-8-like string,
            // so the iterator must produce a value here.
            let x = unsafe { *bytes.next_back().unwrap_unchecked() };
            ch = utf8_first_byte(x, 4);
            ch = utf8_acc_cont_byte(ch, y);
        }
        ch = utf8_acc_cont_byte(ch, z);
    }
    ch = utf8_acc_cont_byte(ch, w);

    Some(ch)
}

const NONASCII_MASK: usize = usize::repeat_u8(0x80);

/// Returns `true` if any byte in the word `x` is nonascii (>= 128).
#[inline]
const fn contains_nonascii(x: usize) -> bool {
    (x & NONASCII_MASK) != 0
}

/// Walks through `v` checking that it's a valid UTF-8 sequence,
/// returning `Ok(())` in that case, or, if it is invalid, `Err(err)`.
#[inline(always)]
#[rustc_const_unstable(feature = "str_internals", issue = "none")]
pub(super) const fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
    let mut index = 0;
    let len = v.len();

    let usize_bytes = mem::size_of::<usize>();
    let ascii_block_size = 2 * usize_bytes;
    let blocks_end = if len >= ascii_block_size { len - ascii_block_size + 1 } else { 0 };
    let align = v.as_ptr().align_offset(usize_bytes);

    while index < len {
        let old_offset = index;
        macro_rules! err {
            ($error_len: expr) => {
                return Err(Utf8Error { valid_up_to: old_offset, error_len: $error_len })
            };
        }

        macro_rules! next {
            () => {{
                index += 1;
                // we needed data, but there was none: error!
                if index >= len {
                    err!(None)
                }
                v[index]
            }};
        }

        let first = v[index];
        if first >= 128 {
            let w = utf8_char_width(first);
            // 2-byte encoding is for codepoints  \u{0080} to  \u{07ff}
            //        first  C2 80        last DF BF
            // 3-byte encoding is for codepoints  \u{0800} to  \u{ffff}
            //        first  E0 A0 80     last EF BF BF
            //   excluding surrogates codepoints  \u{d800} to  \u{dfff}
            //               ED A0 80 to       ED BF BF
            // 4-byte encoding is for codepoints \u{1000}0 to \u{10ff}ff
            //        first  F0 90 80 80  last F4 8F BF BF
            //
            // Use the UTF-8 syntax from the RFC
            //
            // https://tools.ietf.org/html/rfc3629
            // UTF8-1      = %x00-7F
            // UTF8-2      = %xC2-DF UTF8-tail
            // UTF8-3      = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
            //               %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
            // UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
            //               %xF4 %x80-8F 2( UTF8-tail )
            match w {
                2 => {
                    if next!() as i8 >= -64 {
                        err!(Some(1))
                    }
                }
                3 => {
                    match (first, next!()) {
                        (0xE0, 0xA0..=0xBF)
                        | (0xE1..=0xEC, 0x80..=0xBF)
                        | (0xED, 0x80..=0x9F)
                        | (0xEE..=0xEF, 0x80..=0xBF) => {}
                        _ => err!(Some(1)),
                    }
                    if next!() as i8 >= -64 {
                        err!(Some(2))
                    }
                }
                4 => {
                    match (first, next!()) {
                        (0xF0, 0x90..=0xBF) | (0xF1..=0xF3, 0x80..=0xBF) | (0xF4, 0x80..=0x8F) => {}
                        _ => err!(Some(1)),
                    }
                    if next!() as i8 >= -64 {
                        err!(Some(2))
                    }
                    if next!() as i8 >= -64 {
                        err!(Some(3))
                    }
                }
                _ => err!(Some(1)),
            }
            index += 1;
        } else {
            // Ascii case, try to skip forward quickly.
            // When the pointer is aligned, read 2 words of data per iteration
            // until we find a word containing a non-ascii byte.
            if align != usize::MAX && align.wrapping_sub(index) % usize_bytes == 0 {
                let ptr = v.as_ptr();
                while index < blocks_end {
                    // SAFETY: since `align - index` and `ascii_block_size` are
                    // multiples of `usize_bytes`, `block = ptr.add(index)` is
                    // always aligned with a `usize` so it's safe to dereference
                    // both `block` and `block.offset(1)`.
                    unsafe {
                        let block = ptr.add(index) as *const usize;
                        // break if there is a nonascii byte
                        let zu = contains_nonascii(*block);
                        let zv = contains_nonascii(*block.offset(1));
                        if zu || zv {
                            break;
                        }
                    }
                    index += ascii_block_size;
                }
                // step from the point where the wordwise loop stopped
                while index < len && v[index] < 128 {
                    index += 1;
                }
            } else {
                index += 1;
            }
        }
    }

    Ok(())
}

// https://tools.ietf.org/html/rfc3629
const UTF8_CHAR_WIDTH: &[u8; 256] = &[
    // 1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 2
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 3
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // F
];

/// Given a first byte, determines how many bytes are in this UTF-8 character.
#[unstable(feature = "str_internals", issue = "none")]
#[must_use]
#[inline]
pub const fn utf8_char_width(b: u8) -> usize {
    UTF8_CHAR_WIDTH[b as usize] as usize
}

/// Mask of the value bits of a continuation byte.
const CONT_MASK: u8 = 0b0011_1111;
Commit	Line	Data
1b1a35ee XL	1	//! Operations related to UTF-8 validation.
	2
	3	use crate::mem;
	4
	5	use super::Utf8Error;
	6
	7	/// Returns the initial codepoint accumulator for the first byte.
	8	/// The first byte is special, only want bottom 5 bits for width 2, 4 bits
	9	/// for width 3, and 3 bits for width 4.
	10	#[inline]
3c0e092e	11	const fn utf8_first_byte(byte: u8, width: u32) -> u32 {
1b1a35ee XL	12	(byte & (0x7F >> width)) as u32
	13	}
	14
	15	/// Returns the value of `ch` updated with continuation byte `byte`.
	16	#[inline]
3c0e092e	17	const fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
1b1a35ee XL	18	(ch << 6) \| (byte & CONT_MASK) as u32
	19	}
	20
	21	/// Checks whether the byte is a UTF-8 continuation byte (i.e., starts with the
	22	/// bits `10`).
	23	#[inline]
3c0e092e	24	pub(super) const fn utf8_is_cont_byte(byte: u8) -> bool {
c295e0f8	25	(byte as i8) < -64
1b1a35ee XL	26	}
1b1a35ee XL	27
1b1a35ee XL	28	/// Reads the next code point out of a byte iterator (assuming a
1b1a35ee XL	29	/// UTF-8-like encoding).
3c0e092e XL	30	///
	31	/// # Safety
	32	///
	33	/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string
1b1a35ee XL	34	#[unstable(feature = "str_internals", issue = "none")]
1b1a35ee XL	35	#[inline]
3c0e092e	36	pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> {
1b1a35ee XL	37	// Decode UTF-8
	38	let x = *bytes.next()?;
	39	if x < 128 {
	40	return Some(x as u32);
	41	}
	42
	43	// Multibyte case follows
	44	// Decode from a byte combination out of: [[[x y] z] w]
	45	// NOTE: Performance is sensitive to the exact formulation here
	46	let init = utf8_first_byte(x, 2);
3c0e092e XL	47	// SAFETY: `bytes` produces an UTF-8-like string,
	48	// so the iterator must produce a value here.
	49	let y = unsafe { *bytes.next().unwrap_unchecked() };
1b1a35ee XL	50	let mut ch = utf8_acc_cont_byte(init, y);
	51	if x >= 0xE0 {
	52	// [[x y z] w] case
	53	// 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
3c0e092e XL	54	// SAFETY: `bytes` produces an UTF-8-like string,
	55	// so the iterator must produce a value here.
	56	let z = unsafe { *bytes.next().unwrap_unchecked() };
1b1a35ee XL	57	let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
	58	ch = init << 12 \| y_z;
	59	if x >= 0xF0 {
	60	// [x y z w] case
	61	// use only the lower 3 bits of `init`
3c0e092e XL	62	// SAFETY: `bytes` produces an UTF-8-like string,
	63	// so the iterator must produce a value here.
	64	let w = unsafe { *bytes.next().unwrap_unchecked() };
1b1a35ee XL	65	ch = (init & 7) << 18 \| utf8_acc_cont_byte(y_z, w);
	66	}
	67	}
	68
	69	Some(ch)
	70	}
	71
	72	/// Reads the last code point out of a byte iterator (assuming a
	73	/// UTF-8-like encoding).
3c0e092e XL	74	///
	75	/// # Safety
	76	///
	77	/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string
1b1a35ee	78	#[inline]
3c0e092e	79	pub(super) unsafe fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option<u32>
1b1a35ee XL	80	where
	81	I: DoubleEndedIterator<Item = &'a u8>,
	82	{
	83	// Decode UTF-8
	84	let w = match *bytes.next_back()? {
	85	next_byte if next_byte < 128 => return Some(next_byte as u32),
	86	back_byte => back_byte,
	87	};
	88
	89	// Multibyte case follows
	90	// Decode from a byte combination out of: [x [y [z w]]]
	91	let mut ch;
3c0e092e XL	92	// SAFETY: `bytes` produces an UTF-8-like string,
	93	// so the iterator must produce a value here.
	94	let z = unsafe { *bytes.next_back().unwrap_unchecked() };
1b1a35ee XL	95	ch = utf8_first_byte(z, 2);
1b1a35ee XL	96	if utf8_is_cont_byte(z) {
3c0e092e XL	97	// SAFETY: `bytes` produces an UTF-8-like string,
	98	// so the iterator must produce a value here.
	99	let y = unsafe { *bytes.next_back().unwrap_unchecked() };
1b1a35ee XL	100	ch = utf8_first_byte(y, 3);
1b1a35ee XL	101	if utf8_is_cont_byte(y) {
3c0e092e XL	102	// SAFETY: `bytes` produces an UTF-8-like string,
	103	// so the iterator must produce a value here.
	104	let x = unsafe { *bytes.next_back().unwrap_unchecked() };
1b1a35ee XL	105	ch = utf8_first_byte(x, 4);
	106	ch = utf8_acc_cont_byte(ch, y);
	107	}
	108	ch = utf8_acc_cont_byte(ch, z);
	109	}
	110	ch = utf8_acc_cont_byte(ch, w);
	111
	112	Some(ch)
	113	}
	114
04454e1e	115	const NONASCII_MASK: usize = usize::repeat_u8(0x80);
1b1a35ee XL	116
	117	/// Returns `true` if any byte in the word `x` is nonascii (>= 128).
	118	#[inline]
3c0e092e	119	const fn contains_nonascii(x: usize) -> bool {
1b1a35ee XL	120	(x & NONASCII_MASK) != 0
	121	}
	122
	123	/// Walks through `v` checking that it's a valid UTF-8 sequence,
	124	/// returning `Ok(())` in that case, or, if it is invalid, `Err(err)`.
	125	#[inline(always)]
3c0e092e XL	126	#[rustc_const_unstable(feature = "str_internals", issue = "none")]
3c0e092e XL	127	pub(super) const fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
1b1a35ee XL	128	let mut index = 0;
	129	let len = v.len();
	130
	131	let usize_bytes = mem::size_of::<usize>();
	132	let ascii_block_size = 2 * usize_bytes;
	133	let blocks_end = if len >= ascii_block_size { len - ascii_block_size + 1 } else { 0 };
	134	let align = v.as_ptr().align_offset(usize_bytes);
	135
	136	while index < len {
	137	let old_offset = index;
	138	macro_rules! err {
	139	($error_len: expr) => {
fc512014	140	return Err(Utf8Error { valid_up_to: old_offset, error_len: $error_len })
1b1a35ee XL	141	};
	142	}
	143
	144	macro_rules! next {
	145	() => {{
	146	index += 1;
	147	// we needed data, but there was none: error!
	148	if index >= len {
	149	err!(None)
	150	}
	151	v[index]
	152	}};
	153	}
	154
	155	let first = v[index];
	156	if first >= 128 {
3c0e092e	157	let w = utf8_char_width(first);
1b1a35ee XL	158	// 2-byte encoding is for codepoints \u{0080} to \u{07ff}
	159	// first C2 80 last DF BF
	160	// 3-byte encoding is for codepoints \u{0800} to \u{ffff}
	161	// first E0 A0 80 last EF BF BF
	162	// excluding surrogates codepoints \u{d800} to \u{dfff}
	163	// ED A0 80 to ED BF BF
	164	// 4-byte encoding is for codepoints \u{1000}0 to \u{10ff}ff
	165	// first F0 90 80 80 last F4 8F BF BF
	166	//
	167	// Use the UTF-8 syntax from the RFC
	168	//
	169	// https://tools.ietf.org/html/rfc3629
	170	// UTF8-1 = %x00-7F
	171	// UTF8-2 = %xC2-DF UTF8-tail
	172	// UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
	173	// %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
	174	// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
	175	// %xF4 %x80-8F 2( UTF8-tail )
	176	match w {
	177	2 => {
c295e0f8	178	if next!() as i8 >= -64 {
1b1a35ee XL	179	err!(Some(1))
	180	}
	181	}
	182	3 => {
	183	match (first, next!()) {
	184	(0xE0, 0xA0..=0xBF)
	185	\| (0xE1..=0xEC, 0x80..=0xBF)
	186	\| (0xED, 0x80..=0x9F)
	187	\| (0xEE..=0xEF, 0x80..=0xBF) => {}
	188	_ => err!(Some(1)),
	189	}
c295e0f8	190	if next!() as i8 >= -64 {
1b1a35ee XL	191	err!(Some(2))
	192	}
	193	}
	194	4 => {
	195	match (first, next!()) {
	196	(0xF0, 0x90..=0xBF) \| (0xF1..=0xF3, 0x80..=0xBF) \| (0xF4, 0x80..=0x8F) => {}
	197	_ => err!(Some(1)),
	198	}
c295e0f8	199	if next!() as i8 >= -64 {
1b1a35ee XL	200	err!(Some(2))
1b1a35ee XL	201	}
c295e0f8	202	if next!() as i8 >= -64 {
1b1a35ee XL	203	err!(Some(3))
	204	}
	205	}
	206	_ => err!(Some(1)),
	207	}
	208	index += 1;
	209	} else {
	210	// Ascii case, try to skip forward quickly.
	211	// When the pointer is aligned, read 2 words of data per iteration
	212	// until we find a word containing a non-ascii byte.
	213	if align != usize::MAX && align.wrapping_sub(index) % usize_bytes == 0 {
	214	let ptr = v.as_ptr();
	215	while index < blocks_end {
	216	// SAFETY: since `align - index` and `ascii_block_size` are
	217	// multiples of `usize_bytes`, `block = ptr.add(index)` is
	218	// always aligned with a `usize` so it's safe to dereference
	219	// both `block` and `block.offset(1)`.
	220	unsafe {
	221	let block = ptr.add(index) as *const usize;
	222	// break if there is a nonascii byte
	223	let zu = contains_nonascii(*block);
	224	let zv = contains_nonascii(*block.offset(1));
3c0e092e	225	if zu \|\| zv {
1b1a35ee XL	226	break;
	227	}
	228	}
	229	index += ascii_block_size;
	230	}
	231	// step from the point where the wordwise loop stopped
	232	while index < len && v[index] < 128 {
	233	index += 1;
	234	}
	235	} else {
	236	index += 1;
	237	}
	238	}
	239	}
	240
	241	Ok(())
	242	}
	243
	244	// https://tools.ietf.org/html/rfc3629
3c0e092e XL	245	const UTF8_CHAR_WIDTH: &[u8; 256] = &[
	246	// 1 2 3 4 5 6 7 8 9 A B C D E F
	247	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0
	248	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1
	249	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 2
	250	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 3
	251	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4
	252	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5
	253	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6
	254	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7
	255	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8
	256	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9
	257	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A
	258	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B
	259	0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C
	260	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D
	261	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E
	262	4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // F
1b1a35ee XL	263	];
	264
	265	/// Given a first byte, determines how many bytes are in this UTF-8 character.
	266	#[unstable(feature = "str_internals", issue = "none")]
3c0e092e	267	#[must_use]
1b1a35ee	268	#[inline]
3c0e092e	269	pub const fn utf8_char_width(b: u8) -> usize {
1b1a35ee XL	270	UTF8_CHAR_WIDTH[b as usize] as usize
	271	}
	272
	273	/// Mask of the value bits of a continuation byte.
	274	const CONT_MASK: u8 = 0b0011_1111;