]> git.proxmox.com Git - rustc.git/blame - library/core/src/char/decode.rs
New upstream version 1.60.0+dfsg1
[rustc.git] / library / core / src / char / decode.rs
CommitLineData
83c7162d
XL
1//! UTF-8 and UTF-16 decoding iterators
2
48663c56
XL
3use crate::fmt;
4
83c7162d
XL
5use super::from_u32_unchecked;
6
83c7162d 7/// An iterator that decodes UTF-16 encoded code points from an iterator of `u16`s.
94222f64
XL
8///
9/// This `struct` is created by the [`decode_utf16`] method on [`char`]. See its
10/// documentation for more.
11///
12/// [`decode_utf16`]: char::decode_utf16
83c7162d
XL
13#[stable(feature = "decode_utf16", since = "1.9.0")]
14#[derive(Clone, Debug)]
15pub struct DecodeUtf16<I>
60c5eb7d
XL
16where
17 I: Iterator<Item = u16>,
83c7162d
XL
18{
19 iter: I,
20 buf: Option<u16>,
21}
22
23/// An error that can be returned when decoding UTF-16 code points.
94222f64
XL
24///
25/// This `struct` is created when using the [`DecodeUtf16`] type.
83c7162d
XL
26#[stable(feature = "decode_utf16", since = "1.9.0")]
27#[derive(Debug, Clone, Eq, PartialEq)]
28pub struct DecodeUtf16Error {
29 code: u16,
30}
31
9fa01778 32/// Creates an iterator over the UTF-16 encoded code points in `iter`,
83c7162d
XL
33/// returning unpaired surrogates as `Err`s.
34///
35/// # Examples
36///
37/// Basic usage:
38///
39/// ```
40/// use std::char::decode_utf16;
41///
e74abb32
XL
42/// // 𝄞mus<invalid>ic<invalid>
43/// let v = [
44/// 0xD834, 0xDD1E, 0x006d, 0x0075, 0x0073, 0xDD1E, 0x0069, 0x0063, 0xD834,
45/// ];
83c7162d 46///
e74abb32
XL
47/// assert_eq!(
48/// decode_utf16(v.iter().cloned())
49/// .map(|r| r.map_err(|e| e.unpaired_surrogate()))
50/// .collect::<Vec<_>>(),
51/// vec![
52/// Ok('𝄞'),
53/// Ok('m'), Ok('u'), Ok('s'),
54/// Err(0xDD1E),
55/// Ok('i'), Ok('c'),
56/// Err(0xD834)
57/// ]
58/// );
83c7162d
XL
59/// ```
60///
61/// A lossy decoder can be obtained by replacing `Err` results with the replacement character:
62///
63/// ```
64/// use std::char::{decode_utf16, REPLACEMENT_CHARACTER};
65///
e74abb32
XL
66/// // 𝄞mus<invalid>ic<invalid>
67/// let v = [
68/// 0xD834, 0xDD1E, 0x006d, 0x0075, 0x0073, 0xDD1E, 0x0069, 0x0063, 0xD834,
69/// ];
83c7162d 70///
e74abb32
XL
71/// assert_eq!(
72/// decode_utf16(v.iter().cloned())
73/// .map(|r| r.unwrap_or(REPLACEMENT_CHARACTER))
74/// .collect::<String>(),
75/// "𝄞mus�ic�"
76/// );
83c7162d
XL
77/// ```
78#[stable(feature = "decode_utf16", since = "1.9.0")]
79#[inline]
80pub fn decode_utf16<I: IntoIterator<Item = u16>>(iter: I) -> DecodeUtf16<I::IntoIter> {
60c5eb7d 81 DecodeUtf16 { iter: iter.into_iter(), buf: None }
83c7162d
XL
82}
83
84#[stable(feature = "decode_utf16", since = "1.9.0")]
85impl<I: Iterator<Item = u16>> Iterator for DecodeUtf16<I> {
86 type Item = Result<char, DecodeUtf16Error>;
87
88 fn next(&mut self) -> Option<Result<char, DecodeUtf16Error>> {
89 let u = match self.buf.take() {
90 Some(buf) => buf,
60c5eb7d 91 None => self.iter.next()?,
83c7162d
XL
92 };
93
94 if u < 0xD800 || 0xDFFF < u {
60c5eb7d 95 // SAFETY: not a surrogate
83c7162d
XL
96 Some(Ok(unsafe { from_u32_unchecked(u as u32) }))
97 } else if u >= 0xDC00 {
98 // a trailing surrogate
99 Some(Err(DecodeUtf16Error { code: u }))
100 } else {
101 let u2 = match self.iter.next() {
102 Some(u2) => u2,
103 // eof
104 None => return Some(Err(DecodeUtf16Error { code: u })),
105 };
106 if u2 < 0xDC00 || u2 > 0xDFFF {
107 // not a trailing surrogate so we're not a valid
108 // surrogate pair, so rewind to redecode u2 next time.
109 self.buf = Some(u2);
110 return Some(Err(DecodeUtf16Error { code: u }));
111 }
112
113 // all ok, so lets decode it.
114 let c = (((u - 0xD800) as u32) << 10 | (u2 - 0xDC00) as u32) + 0x1_0000;
60c5eb7d 115 // SAFETY: we checked that it's a legal unicode value
83c7162d
XL
116 Some(Ok(unsafe { from_u32_unchecked(c) }))
117 }
118 }
119
120 #[inline]
121 fn size_hint(&self) -> (usize, Option<usize>) {
122 let (low, high) = self.iter.size_hint();
5099ac24
FG
123
124 let (low_buf, high_buf) = match self.buf {
125 // buf is empty, no additional elements from it.
126 None => (0, 0),
127 // `u` is a non surrogate, so it's always an additional character.
128 Some(u) if u < 0xD800 || 0xDFFF < u => (1, 1),
129 // `u` is a leading surrogate (it can never be a trailing surrogate and
130 // it's a surrogate due to the previous branch) and `self.iter` is empty.
131 //
132 // `u` can't be paired, since the `self.iter` is empty,
133 // so it will always become an additional element (error).
134 Some(_u) if high == Some(0) => (1, 1),
135 // `u` is a leading surrogate and `iter` may be non-empty.
136 //
137 // `u` can either pair with a trailing surrogate, in which case no additional elements
138 // are produced, or it can become an error, in which case it's an additional character (error).
139 Some(_u) => (0, 1),
140 };
141
142 // `self.iter` could contain entirely valid surrogates (2 elements per
143 // char), or entirely non-surrogates (1 element per char).
144 //
145 // On odd lower bound, at least one element must stay unpaired
146 // (with other elements from `self.iter`), so we round up.
147 let low = low.div_ceil(2) + low_buf;
148 let high = high.and_then(|h| h.checked_add(high_buf));
149
150 (low, high)
83c7162d
XL
151 }
152}
153
154impl DecodeUtf16Error {
155 /// Returns the unpaired surrogate which caused this error.
3c0e092e 156 #[must_use]
83c7162d
XL
157 #[stable(feature = "decode_utf16", since = "1.9.0")]
158 pub fn unpaired_surrogate(&self) -> u16 {
159 self.code
160 }
161}
162
163#[stable(feature = "decode_utf16", since = "1.9.0")]
164impl fmt::Display for DecodeUtf16Error {
48663c56 165 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
83c7162d
XL
166 write!(f, "unpaired surrogate found: {:x}", self.code)
167 }
168}