]>
Commit | Line | Data |
---|---|---|
f20569fa XL |
1 | //! Utilities for validating string and char literals and turning them into |
2 | //! values they represent. | |
3 | ||
4 | use std::ops::Range; | |
5 | use std::str::Chars; | |
6 | ||
7 | #[cfg(test)] | |
8 | mod tests; | |
9 | ||
/// Every way in which unescaping a literal can fail.
#[derive(Debug, PartialEq, Eq)]
pub enum EscapeError {
    /// A single char was required, but the literal is empty.
    ZeroChars,
    /// A single char was required, but the literal holds more than one.
    MoreThanOneChar,

    /// A '\' that is not followed by anything.
    LoneSlash,
    /// A '\' followed by a character that does not start any known escape
    /// (e.g. '\z').
    InvalidEscape,
    /// An unescaped '\r' in a non-raw literal.
    BareCarriageReturn,
    /// An unescaped '\r' in a raw string.
    BareCarriageReturnInRawString,
    /// A character that may only appear in escaped form was written
    /// literally (e.g. a raw '\t').
    EscapeOnlyChar,

    /// Fewer than two digits follow '\x' (e.g. '\x1').
    TooShortHexEscape,
    /// A non-hexadecimal character inside a '\x' escape (e.g. '\xz').
    InvalidCharInHexEscape,
    /// A '\x' escape whose value is not ASCII (e.g. '\xFF').
    OutOfRangeHexEscape,

    /// '\u' that is not immediately followed by '{'.
    NoBraceInUnicodeEscape,
    /// A non-hexadecimal character inside '\u{..}'.
    InvalidCharInUnicodeEscape,
    /// A unicode escape without any digits: '\u{}'.
    EmptyUnicodeEscape,
    /// '\u{..' without the closing brace, e.g. '\u{12'.
    UnclosedUnicodeEscape,
    /// A unicode escape that starts with an underscore: '\u{_12}'.
    LeadingUnderscoreUnicodeEscape,
    /// More than 6 hexadecimal digits in '\u{..}', e.g. '\u{10FFFF_FF}'.
    OverlongUnicodeEscape,
    /// A code point within bounds that is not a valid char, e.g. '\u{DFFF}'.
    LoneSurrogateUnicodeEscape,
    /// A code point beyond the unicode range, e.g. '\u{FFFFFF}'.
    OutOfRangeUnicodeEscape,

    /// A '\u{..}' escape used inside a byte literal.
    UnicodeEscapeInByte,
    /// A non-ASCII character inside a byte literal.
    NonAsciiCharInByte,
    /// A non-ASCII character inside a byte string literal.
    NonAsciiCharInByteString,
}
60 | ||
61 | /// Takes a contents of a literal (without quotes) and produces a | |
62 | /// sequence of escaped characters or errors. | |
63 | /// Values are returned through invoking of the provided callback. | |
64 | pub fn unescape_literal<F>(literal_text: &str, mode: Mode, callback: &mut F) | |
65 | where | |
66 | F: FnMut(Range<usize>, Result<char, EscapeError>), | |
67 | { | |
68 | match mode { | |
69 | Mode::Char | Mode::Byte => { | |
70 | let mut chars = literal_text.chars(); | |
71 | let result = unescape_char_or_byte(&mut chars, mode); | |
72 | // The Chars iterator moved forward. | |
73 | callback(0..(literal_text.len() - chars.as_str().len()), result); | |
74 | } | |
75 | Mode::Str | Mode::ByteStr => unescape_str_or_byte_str(literal_text, mode, callback), | |
76 | // NOTE: Raw strings do not perform any explicit character escaping, here we | |
77 | // only translate CRLF to LF and produce errors on bare CR. | |
78 | Mode::RawStr | Mode::RawByteStr => { | |
79 | unescape_raw_str_or_byte_str(literal_text, mode, callback) | |
80 | } | |
81 | } | |
82 | } | |
83 | ||
84 | /// Takes a contents of a byte, byte string or raw byte string (without quotes) | |
85 | /// and produces a sequence of bytes or errors. | |
86 | /// Values are returned through invoking of the provided callback. | |
87 | pub fn unescape_byte_literal<F>(literal_text: &str, mode: Mode, callback: &mut F) | |
88 | where | |
89 | F: FnMut(Range<usize>, Result<u8, EscapeError>), | |
90 | { | |
91 | assert!(mode.is_bytes()); | |
92 | unescape_literal(literal_text, mode, &mut |range, result| { | |
93 | callback(range, result.map(byte_from_char)); | |
94 | }) | |
95 | } | |
96 | ||
97 | /// Takes a contents of a char literal (without quotes), and returns an | |
98 | /// unescaped char or an error | |
99 | pub fn unescape_char(literal_text: &str) -> Result<char, (usize, EscapeError)> { | |
100 | let mut chars = literal_text.chars(); | |
101 | unescape_char_or_byte(&mut chars, Mode::Char) | |
102 | .map_err(|err| (literal_text.len() - chars.as_str().len(), err)) | |
103 | } | |
104 | ||
105 | /// Takes a contents of a byte literal (without quotes), and returns an | |
106 | /// unescaped byte or an error. | |
107 | pub fn unescape_byte(literal_text: &str) -> Result<u8, (usize, EscapeError)> { | |
108 | let mut chars = literal_text.chars(); | |
109 | unescape_char_or_byte(&mut chars, Mode::Byte) | |
110 | .map(byte_from_char) | |
111 | .map_err(|err| (literal_text.len() - chars.as_str().len(), err)) | |
112 | } | |
113 | ||
/// The kind of literal being unescaped.
///
/// The mode decides which quote character must be escaped and whether the
/// literal is restricted to byte values.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Mode {
    /// A char literal.
    Char,
    /// A string literal.
    Str,
    /// A byte literal.
    Byte,
    /// A byte string literal.
    ByteStr,
    /// A raw string literal.
    RawStr,
    /// A raw byte string literal.
    RawByteStr,
}

impl Mode {
    /// Returns `true` for the single-character literal kinds (`Char` and
    /// `Byte`), which are delimited by single quotes.
    pub fn in_single_quotes(self) -> bool {
        // Exhaustive on purpose: adding a variant must force a decision here.
        match self {
            Mode::Char | Mode::Byte => true,
            Mode::Str | Mode::ByteStr | Mode::RawStr | Mode::RawByteStr => false,
        }
    }

    /// Returns `true` for every string-like literal kind, i.e. exactly when
    /// [`Mode::in_single_quotes`] returns `false`.
    pub fn in_double_quotes(self) -> bool {
        !self.in_single_quotes()
    }

    /// Returns `true` for the kinds that produce bytes rather than chars
    /// (`Byte`, `ByteStr` and `RawByteStr`).
    pub fn is_bytes(self) -> bool {
        // Exhaustive on purpose: adding a variant must force a decision here.
        match self {
            Mode::Byte | Mode::ByteStr | Mode::RawByteStr => true,
            Mode::Char | Mode::Str | Mode::RawStr => false,
        }
    }
}
144 | ||
145 | fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> { | |
146 | if first_char != '\\' { | |
147 | // Previous character was not a slash, and we don't expect it to be | |
148 | // an escape-only character. | |
149 | return match first_char { | |
150 | '\t' | '\n' => Err(EscapeError::EscapeOnlyChar), | |
151 | '\r' => Err(EscapeError::BareCarriageReturn), | |
152 | '\'' if mode.in_single_quotes() => Err(EscapeError::EscapeOnlyChar), | |
153 | '"' if mode.in_double_quotes() => Err(EscapeError::EscapeOnlyChar), | |
154 | _ => { | |
155 | if mode.is_bytes() && !first_char.is_ascii() { | |
156 | // Byte literal can't be a non-ascii character. | |
157 | return Err(EscapeError::NonAsciiCharInByte); | |
158 | } | |
159 | Ok(first_char) | |
160 | } | |
161 | }; | |
162 | } | |
163 | ||
164 | // Previous character is '\\', try to unescape it. | |
165 | ||
166 | let second_char = chars.next().ok_or(EscapeError::LoneSlash)?; | |
167 | ||
168 | let res = match second_char { | |
169 | '"' => '"', | |
170 | 'n' => '\n', | |
171 | 'r' => '\r', | |
172 | 't' => '\t', | |
173 | '\\' => '\\', | |
174 | '\'' => '\'', | |
175 | '0' => '\0', | |
176 | ||
177 | 'x' => { | |
178 | // Parse hexadecimal character code. | |
179 | ||
180 | let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?; | |
181 | let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?; | |
182 | ||
183 | let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?; | |
184 | let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?; | |
185 | ||
186 | let value = hi * 16 + lo; | |
187 | ||
188 | // For a byte literal verify that it is within ASCII range. | |
189 | if !mode.is_bytes() && !is_ascii(value) { | |
190 | return Err(EscapeError::OutOfRangeHexEscape); | |
191 | } | |
192 | let value = value as u8; | |
193 | ||
194 | value as char | |
195 | } | |
196 | ||
197 | 'u' => { | |
198 | // We've parsed '\u', now we have to parse '{..}'. | |
199 | ||
200 | if chars.next() != Some('{') { | |
201 | return Err(EscapeError::NoBraceInUnicodeEscape); | |
202 | } | |
203 | ||
204 | // First character must be a hexadecimal digit. | |
205 | let mut n_digits = 1; | |
206 | let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? { | |
207 | '_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape), | |
208 | '}' => return Err(EscapeError::EmptyUnicodeEscape), | |
209 | c => c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?, | |
210 | }; | |
211 | ||
212 | // First character is valid, now parse the rest of the number | |
213 | // and closing brace. | |
214 | loop { | |
215 | match chars.next() { | |
216 | None => return Err(EscapeError::UnclosedUnicodeEscape), | |
217 | Some('_') => continue, | |
218 | Some('}') => { | |
219 | if n_digits > 6 { | |
220 | return Err(EscapeError::OverlongUnicodeEscape); | |
221 | } | |
222 | ||
223 | // Incorrect syntax has higher priority for error reporting | |
224 | // than unallowed value for a literal. | |
225 | if mode.is_bytes() { | |
226 | return Err(EscapeError::UnicodeEscapeInByte); | |
227 | } | |
228 | ||
229 | break std::char::from_u32(value).ok_or_else(|| { | |
230 | if value > 0x10FFFF { | |
231 | EscapeError::OutOfRangeUnicodeEscape | |
232 | } else { | |
233 | EscapeError::LoneSurrogateUnicodeEscape | |
234 | } | |
235 | })?; | |
236 | } | |
237 | Some(c) => { | |
238 | let digit = | |
239 | c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?; | |
240 | n_digits += 1; | |
241 | if n_digits > 6 { | |
242 | // Stop updating value since we're sure that it's is incorrect already. | |
243 | continue; | |
244 | } | |
245 | let digit = digit as u32; | |
246 | value = value * 16 + digit; | |
247 | } | |
248 | }; | |
249 | } | |
250 | } | |
251 | _ => return Err(EscapeError::InvalidEscape), | |
252 | }; | |
253 | Ok(res) | |
254 | } | |
255 | ||
256 | fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> { | |
257 | let first_char = chars.next().ok_or(EscapeError::ZeroChars)?; | |
258 | let res = scan_escape(first_char, chars, mode)?; | |
259 | if chars.next().is_some() { | |
260 | return Err(EscapeError::MoreThanOneChar); | |
261 | } | |
262 | Ok(res) | |
263 | } | |
264 | ||
265 | /// Takes a contents of a string literal (without quotes) and produces a | |
266 | /// sequence of escaped characters or errors. | |
267 | fn unescape_str_or_byte_str<F>(src: &str, mode: Mode, callback: &mut F) | |
268 | where | |
269 | F: FnMut(Range<usize>, Result<char, EscapeError>), | |
270 | { | |
271 | assert!(mode.in_double_quotes()); | |
272 | let initial_len = src.len(); | |
273 | let mut chars = src.chars(); | |
274 | while let Some(first_char) = chars.next() { | |
275 | let start = initial_len - chars.as_str().len() - first_char.len_utf8(); | |
276 | ||
277 | let unescaped_char = match first_char { | |
278 | '\\' => { | |
279 | let second_char = chars.clone().next(); | |
280 | match second_char { | |
281 | Some('\n') => { | |
282 | // Rust language specification requires us to skip whitespaces | |
283 | // if unescaped '\' character is followed by '\n'. | |
284 | // For details see [Rust language reference] | |
285 | // (https://doc.rust-lang.org/reference/tokens.html#string-literals). | |
286 | skip_ascii_whitespace(&mut chars); | |
287 | continue; | |
288 | } | |
289 | _ => scan_escape(first_char, &mut chars, mode), | |
290 | } | |
291 | } | |
292 | '\n' => Ok('\n'), | |
293 | '\t' => Ok('\t'), | |
294 | _ => scan_escape(first_char, &mut chars, mode), | |
295 | }; | |
296 | let end = initial_len - chars.as_str().len(); | |
297 | callback(start..end, unescaped_char); | |
298 | } | |
299 | ||
300 | fn skip_ascii_whitespace(chars: &mut Chars<'_>) { | |
301 | let str = chars.as_str(); | |
302 | let first_non_space = str | |
303 | .bytes() | |
304 | .position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r') | |
305 | .unwrap_or(str.len()); | |
306 | *chars = str[first_non_space..].chars() | |
307 | } | |
308 | } | |
309 | ||
310 | /// Takes a contents of a string literal (without quotes) and produces a | |
311 | /// sequence of characters or errors. | |
312 | /// NOTE: Raw strings do not perform any explicit character escaping, here we | |
313 | /// only translate CRLF to LF and produce errors on bare CR. | |
314 | fn unescape_raw_str_or_byte_str<F>(literal_text: &str, mode: Mode, callback: &mut F) | |
315 | where | |
316 | F: FnMut(Range<usize>, Result<char, EscapeError>), | |
317 | { | |
318 | assert!(mode.in_double_quotes()); | |
319 | let initial_len = literal_text.len(); | |
320 | ||
321 | let mut chars = literal_text.chars(); | |
322 | while let Some(curr) = chars.next() { | |
323 | let start = initial_len - chars.as_str().len() - curr.len_utf8(); | |
324 | ||
325 | let result = match curr { | |
326 | '\r' => Err(EscapeError::BareCarriageReturnInRawString), | |
327 | c if mode.is_bytes() && !c.is_ascii() => Err(EscapeError::NonAsciiCharInByteString), | |
328 | c => Ok(c), | |
329 | }; | |
330 | let end = initial_len - chars.as_str().len(); | |
331 | ||
332 | callback(start..end, result); | |
333 | } | |
334 | } | |
335 | ||
/// Narrows a char produced by unescaping in a byte mode to the byte it
/// stands for.
///
/// # Panics
///
/// Panics if the code point of `c` does not fit into a `u8`.
fn byte_from_char(c: char) -> u8 {
    let code = u32::from(c);
    assert!(code <= u32::from(u8::MAX), "guaranteed because of Mode::ByteStr");
    code as u8
}
341 | ||
/// Returns `true` when `x` lies in the ASCII range `0x00..=0x7F`.
fn is_ascii(x: u32) -> bool {
    x < 0x80
}