]>
Commit | Line | Data |
---|---|---|
83c7162d XL |
1 | //! UTF-8 and UTF-16 decoding iterators |
2 | ||
48663c56 XL |
3 | use crate::fmt; |
4 | ||
83c7162d XL |
5 | use super::from_u32_unchecked; |
6 | ||
83c7162d | 7 | /// An iterator that decodes UTF-16 encoded code points from an iterator of `u16`s. |
94222f64 XL |
8 | /// |
9 | /// This `struct` is created by the [`decode_utf16`] method on [`char`]. See its | |
10 | /// documentation for more. | |
11 | /// | |
12 | /// [`decode_utf16`]: char::decode_utf16 | |
83c7162d XL |
13 | #[stable(feature = "decode_utf16", since = "1.9.0")] |
14 | #[derive(Clone, Debug)] | |
15 | pub struct DecodeUtf16<I> | |
60c5eb7d XL |
16 | where |
17 | I: Iterator<Item = u16>, | |
83c7162d XL |
18 | { |
19 | iter: I, | |
20 | buf: Option<u16>, | |
21 | } | |
22 | ||
23 | /// An error that can be returned when decoding UTF-16 code points. | |
94222f64 XL |
24 | /// |
25 | /// This `struct` is created when using the [`DecodeUtf16`] type. | |
83c7162d XL |
26 | #[stable(feature = "decode_utf16", since = "1.9.0")] |
27 | #[derive(Debug, Clone, Eq, PartialEq)] | |
28 | pub struct DecodeUtf16Error { | |
29 | code: u16, | |
30 | } | |
31 | ||
9fa01778 | 32 | /// Creates an iterator over the UTF-16 encoded code points in `iter`, |
83c7162d XL |
33 | /// returning unpaired surrogates as `Err`s. |
34 | /// | |
35 | /// # Examples | |
36 | /// | |
37 | /// Basic usage: | |
38 | /// | |
39 | /// ``` | |
40 | /// use std::char::decode_utf16; | |
41 | /// | |
e74abb32 XL |
42 | /// // 𝄞mus<invalid>ic<invalid> |
43 | /// let v = [ | |
44 | /// 0xD834, 0xDD1E, 0x006d, 0x0075, 0x0073, 0xDD1E, 0x0069, 0x0063, 0xD834, | |
45 | /// ]; | |
83c7162d | 46 | /// |
e74abb32 XL |
47 | /// assert_eq!( |
48 | /// decode_utf16(v.iter().cloned()) | |
49 | /// .map(|r| r.map_err(|e| e.unpaired_surrogate())) | |
50 | /// .collect::<Vec<_>>(), | |
51 | /// vec![ | |
52 | /// Ok('𝄞'), | |
53 | /// Ok('m'), Ok('u'), Ok('s'), | |
54 | /// Err(0xDD1E), | |
55 | /// Ok('i'), Ok('c'), | |
56 | /// Err(0xD834) | |
57 | /// ] | |
58 | /// ); | |
83c7162d XL |
59 | /// ``` |
60 | /// | |
61 | /// A lossy decoder can be obtained by replacing `Err` results with the replacement character: | |
62 | /// | |
63 | /// ``` | |
64 | /// use std::char::{decode_utf16, REPLACEMENT_CHARACTER}; | |
65 | /// | |
e74abb32 XL |
66 | /// // 𝄞mus<invalid>ic<invalid> |
67 | /// let v = [ | |
68 | /// 0xD834, 0xDD1E, 0x006d, 0x0075, 0x0073, 0xDD1E, 0x0069, 0x0063, 0xD834, | |
69 | /// ]; | |
83c7162d | 70 | /// |
e74abb32 XL |
71 | /// assert_eq!( |
72 | /// decode_utf16(v.iter().cloned()) | |
73 | /// .map(|r| r.unwrap_or(REPLACEMENT_CHARACTER)) | |
74 | /// .collect::<String>(), | |
75 | /// "𝄞mus�ic�" | |
76 | /// ); | |
83c7162d XL |
77 | /// ``` |
78 | #[stable(feature = "decode_utf16", since = "1.9.0")] | |
79 | #[inline] | |
80 | pub fn decode_utf16<I: IntoIterator<Item = u16>>(iter: I) -> DecodeUtf16<I::IntoIter> { | |
60c5eb7d | 81 | DecodeUtf16 { iter: iter.into_iter(), buf: None } |
83c7162d XL |
82 | } |
83 | ||
84 | #[stable(feature = "decode_utf16", since = "1.9.0")] | |
85 | impl<I: Iterator<Item = u16>> Iterator for DecodeUtf16<I> { | |
86 | type Item = Result<char, DecodeUtf16Error>; | |
87 | ||
88 | fn next(&mut self) -> Option<Result<char, DecodeUtf16Error>> { | |
89 | let u = match self.buf.take() { | |
90 | Some(buf) => buf, | |
60c5eb7d | 91 | None => self.iter.next()?, |
83c7162d XL |
92 | }; |
93 | ||
94 | if u < 0xD800 || 0xDFFF < u { | |
60c5eb7d | 95 | // SAFETY: not a surrogate |
83c7162d XL |
96 | Some(Ok(unsafe { from_u32_unchecked(u as u32) })) |
97 | } else if u >= 0xDC00 { | |
98 | // a trailing surrogate | |
99 | Some(Err(DecodeUtf16Error { code: u })) | |
100 | } else { | |
101 | let u2 = match self.iter.next() { | |
102 | Some(u2) => u2, | |
103 | // eof | |
104 | None => return Some(Err(DecodeUtf16Error { code: u })), | |
105 | }; | |
106 | if u2 < 0xDC00 || u2 > 0xDFFF { | |
107 | // not a trailing surrogate so we're not a valid | |
108 | // surrogate pair, so rewind to redecode u2 next time. | |
109 | self.buf = Some(u2); | |
110 | return Some(Err(DecodeUtf16Error { code: u })); | |
111 | } | |
112 | ||
113 | // all ok, so lets decode it. | |
114 | let c = (((u - 0xD800) as u32) << 10 | (u2 - 0xDC00) as u32) + 0x1_0000; | |
60c5eb7d | 115 | // SAFETY: we checked that it's a legal unicode value |
83c7162d XL |
116 | Some(Ok(unsafe { from_u32_unchecked(c) })) |
117 | } | |
118 | } | |
119 | ||
120 | #[inline] | |
121 | fn size_hint(&self) -> (usize, Option<usize>) { | |
122 | let (low, high) = self.iter.size_hint(); | |
5099ac24 FG |
123 | |
124 | let (low_buf, high_buf) = match self.buf { | |
125 | // buf is empty, no additional elements from it. | |
126 | None => (0, 0), | |
127 | // `u` is a non surrogate, so it's always an additional character. | |
128 | Some(u) if u < 0xD800 || 0xDFFF < u => (1, 1), | |
129 | // `u` is a leading surrogate (it can never be a trailing surrogate and | |
130 | // it's a surrogate due to the previous branch) and `self.iter` is empty. | |
131 | // | |
132 | // `u` can't be paired, since the `self.iter` is empty, | |
133 | // so it will always become an additional element (error). | |
134 | Some(_u) if high == Some(0) => (1, 1), | |
135 | // `u` is a leading surrogate and `iter` may be non-empty. | |
136 | // | |
137 | // `u` can either pair with a trailing surrogate, in which case no additional elements | |
138 | // are produced, or it can become an error, in which case it's an additional character (error). | |
139 | Some(_u) => (0, 1), | |
140 | }; | |
141 | ||
142 | // `self.iter` could contain entirely valid surrogates (2 elements per | |
143 | // char), or entirely non-surrogates (1 element per char). | |
144 | // | |
145 | // On odd lower bound, at least one element must stay unpaired | |
146 | // (with other elements from `self.iter`), so we round up. | |
147 | let low = low.div_ceil(2) + low_buf; | |
148 | let high = high.and_then(|h| h.checked_add(high_buf)); | |
149 | ||
150 | (low, high) | |
83c7162d XL |
151 | } |
152 | } | |
153 | ||
154 | impl DecodeUtf16Error { | |
155 | /// Returns the unpaired surrogate which caused this error. | |
3c0e092e | 156 | #[must_use] |
83c7162d XL |
157 | #[stable(feature = "decode_utf16", since = "1.9.0")] |
158 | pub fn unpaired_surrogate(&self) -> u16 { | |
159 | self.code | |
160 | } | |
161 | } | |
162 | ||
163 | #[stable(feature = "decode_utf16", since = "1.9.0")] | |
164 | impl fmt::Display for DecodeUtf16Error { | |
48663c56 | 165 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
83c7162d XL |
166 | write!(f, "unpaired surrogate found: {:x}", self.code) |
167 | } | |
168 | } |