//! Utilities for validating string and char literals and turning them into
//! the values they represent.
/// Errors and warnings that can occur during string unescaping.
///
/// The last two variants are warnings rather than hard errors; use
/// [`EscapeError::is_fatal`] to tell the two groups apart.
#[derive(Debug, PartialEq, Eq)]
pub enum EscapeError {
    /// Expected 1 char, but 0 were found.
    ZeroChars,
    /// Expected 1 char, but more than 1 were found.
    MoreThanOneChar,

    /// Escaped '\' character without continuation.
    LoneSlash,
    /// Invalid escape character (e.g. '\z').
    InvalidEscape,
    /// Raw '\r' encountered.
    BareCarriageReturn,
    /// Raw '\r' encountered in raw string.
    BareCarriageReturnInRawString,
    /// Unescaped character that was expected to be escaped (e.g. raw '\t').
    EscapeOnlyChar,

    /// Numeric character escape is too short (e.g. '\x1').
    TooShortHexEscape,
    /// Invalid character in numeric escape (e.g. '\xz').
    InvalidCharInHexEscape,
    /// Character code in numeric escape is non-ascii (e.g. '\xFF').
    OutOfRangeHexEscape,

    /// '\u' not followed by '{'.
    NoBraceInUnicodeEscape,
    /// Non-hexadecimal value in '\u{..}'.
    InvalidCharInUnicodeEscape,
    /// Empty braces, i.e. '\u{}'.
    EmptyUnicodeEscape,
    /// No closing brace in '\u{..}', e.g. '\u{12'.
    UnclosedUnicodeEscape,
    /// Underscore right after the opening brace, e.g. '\u{_12}'.
    LeadingUnderscoreUnicodeEscape,
    /// More than 6 characters in '\u{..}', e.g. '\u{10FFFF_FF}'.
    OverlongUnicodeEscape,
    /// Invalid in-bound unicode character code, e.g. '\u{DFFF}'.
    LoneSurrogateUnicodeEscape,
    /// Out of bounds unicode character code, e.g. '\u{FFFFFF}'.
    OutOfRangeUnicodeEscape,

    /// Unicode escape code in byte literal.
    UnicodeEscapeInByte,
    /// Non-ascii character in byte literal, byte string literal, or raw byte string literal.
    NonAsciiCharInByte,

    /// After a line ending with '\', the next line contains whitespace
    /// characters that are not skipped.
    UnskippedWhitespaceWarning,

    /// After a line ending with '\', multiple lines are skipped.
    MultipleSkippedLinesWarning,
}

impl EscapeError {
    /// Returns true for actual errors, as opposed to warnings.
    ///
    /// Only the two `*Warning` variants are non-fatal; every other variant
    /// is a hard error.
    pub fn is_fatal(&self) -> bool {
        !matches!(
            self,
            EscapeError::UnskippedWhitespaceWarning | EscapeError::MultipleSkippedLinesWarning
        )
    }
}
76 /// Takes a contents of a literal (without quotes) and produces a
77 /// sequence of escaped characters or errors.
78 /// Values are returned through invoking of the provided callback.
79 pub fn unescape_literal
<F
>(src
: &str, mode
: Mode
, callback
: &mut F
)
81 F
: FnMut(Range
<usize>, Result
<char, EscapeError
>),
84 Mode
::Char
| Mode
::Byte
=> {
85 let mut chars
= src
.chars();
86 let res
= unescape_char_or_byte(&mut chars
, mode
== Mode
::Byte
);
87 callback(0..(src
.len() - chars
.as_str().len()), res
);
89 Mode
::Str
| Mode
::ByteStr
=> unescape_str_or_byte_str(src
, mode
== Mode
::ByteStr
, callback
),
90 Mode
::RawStr
| Mode
::RawByteStr
=> {
91 unescape_raw_str_or_raw_byte_str(src
, mode
== Mode
::RawByteStr
, callback
)
96 /// Takes a contents of a char literal (without quotes), and returns an
97 /// unescaped char or an error.
98 pub fn unescape_char(src
: &str) -> Result
<char, EscapeError
> {
99 unescape_char_or_byte(&mut src
.chars(), false)
102 /// Takes a contents of a byte literal (without quotes), and returns an
103 /// unescaped byte or an error.
104 pub fn unescape_byte(src
: &str) -> Result
<u8, EscapeError
> {
105 unescape_char_or_byte(&mut src
.chars(), true).map(byte_from_char
)
/// What kind of literal do we parse.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum Mode {
    /// Character literal.
    Char,
    /// String literal.
    Str,
    /// Byte literal.
    Byte,
    /// Byte string literal.
    ByteStr,
    /// Raw string literal.
    RawStr,
    /// Raw byte string literal.
    RawByteStr,
}

impl Mode {
    /// Returns true for the string-like modes and false for the
    /// single-character modes (`Char`, `Byte`).
    pub fn in_double_quotes(self) -> bool {
        match self {
            Mode::Str | Mode::ByteStr | Mode::RawStr | Mode::RawByteStr => true,
            Mode::Char | Mode::Byte => false,
        }
    }

    /// Returns true for the byte-oriented modes (`Byte`, `ByteStr`,
    /// `RawByteStr`) and false for the others.
    pub fn is_byte(self) -> bool {
        match self {
            Mode::Byte | Mode::ByteStr | Mode::RawByteStr => true,
            Mode::Char | Mode::Str | Mode::RawStr => false,
        }
    }
}
135 fn scan_escape(chars
: &mut Chars
<'_
>, is_byte
: bool
) -> Result
<char, EscapeError
> {
136 // Previous character was '\\', unescape what follows.
137 let res
= match chars
.next().ok_or(EscapeError
::LoneSlash
)?
{
147 // Parse hexadecimal character code.
149 let hi
= chars
.next().ok_or(EscapeError
::TooShortHexEscape
)?
;
150 let hi
= hi
.to_digit(16).ok_or(EscapeError
::InvalidCharInHexEscape
)?
;
152 let lo
= chars
.next().ok_or(EscapeError
::TooShortHexEscape
)?
;
153 let lo
= lo
.to_digit(16).ok_or(EscapeError
::InvalidCharInHexEscape
)?
;
155 let value
= hi
* 16 + lo
;
157 // For a non-byte literal verify that it is within ASCII range.
158 if !is_byte
&& !is_ascii(value
) {
159 return Err(EscapeError
::OutOfRangeHexEscape
);
161 let value
= value
as u8;
167 // We've parsed '\u', now we have to parse '{..}'.
169 if chars
.next() != Some('
{'
) {
170 return Err(EscapeError
::NoBraceInUnicodeEscape
);
173 // First character must be a hexadecimal digit.
174 let mut n_digits
= 1;
175 let mut value
: u32 = match chars
.next().ok_or(EscapeError
::UnclosedUnicodeEscape
)?
{
176 '_'
=> return Err(EscapeError
::LeadingUnderscoreUnicodeEscape
),
177 '
}'
=> return Err(EscapeError
::EmptyUnicodeEscape
),
178 c
=> c
.to_digit(16).ok_or(EscapeError
::InvalidCharInUnicodeEscape
)?
,
181 // First character is valid, now parse the rest of the number
182 // and closing brace.
185 None
=> return Err(EscapeError
::UnclosedUnicodeEscape
),
186 Some('_'
) => continue,
189 return Err(EscapeError
::OverlongUnicodeEscape
);
192 // Incorrect syntax has higher priority for error reporting
193 // than unallowed value for a literal.
195 return Err(EscapeError
::UnicodeEscapeInByte
);
198 break std
::char::from_u32(value
).ok_or_else(|| {
199 if value
> 0x10FFFF {
200 EscapeError
::OutOfRangeUnicodeEscape
202 EscapeError
::LoneSurrogateUnicodeEscape
208 c
.to_digit(16).ok_or(EscapeError
::InvalidCharInUnicodeEscape
)?
;
211 // Stop updating value since we're sure that it's incorrect already.
214 let digit
= digit
as u32;
215 value
= value
* 16 + digit
;
220 _
=> return Err(EscapeError
::InvalidEscape
),
226 fn ascii_check(c
: char, is_byte
: bool
) -> Result
<char, EscapeError
> {
227 if is_byte
&& !c
.is_ascii() {
228 // Byte literal can't be a non-ascii character.
229 Err(EscapeError
::NonAsciiCharInByte
)
235 fn unescape_char_or_byte(chars
: &mut Chars
<'_
>, is_byte
: bool
) -> Result
<char, EscapeError
> {
236 let c
= chars
.next().ok_or(EscapeError
::ZeroChars
)?
;
238 '
\\'
=> scan_escape(chars
, is_byte
),
239 '
\n'
| '
\t'
| '
\''
=> Err(EscapeError
::EscapeOnlyChar
),
240 '
\r'
=> Err(EscapeError
::BareCarriageReturn
),
241 _
=> ascii_check(c
, is_byte
),
243 if chars
.next().is_some() {
244 return Err(EscapeError
::MoreThanOneChar
);
249 /// Takes a contents of a string literal (without quotes) and produces a
250 /// sequence of escaped characters or errors.
251 fn unescape_str_or_byte_str
<F
>(src
: &str, is_byte
: bool
, callback
: &mut F
)
253 F
: FnMut(Range
<usize>, Result
<char, EscapeError
>),
255 let mut chars
= src
.chars();
257 // The `start` and `end` computation here is complicated because
258 // `skip_ascii_whitespace` makes us to skip over chars without counting
259 // them in the range computation.
260 while let Some(c
) = chars
.next() {
261 let start
= src
.len() - chars
.as_str().len() - c
.len_utf8();
264 match chars
.clone().next() {
266 // Rust language specification requires us to skip whitespaces
267 // if unescaped '\' character is followed by '\n'.
268 // For details see [Rust language reference]
269 // (https://doc.rust-lang.org/reference/tokens.html#string-literals).
270 skip_ascii_whitespace(&mut chars
, start
, callback
);
273 _
=> scan_escape(&mut chars
, is_byte
),
278 '
"' => Err(EscapeError::EscapeOnlyChar),
279 '\r' => Err(EscapeError::BareCarriageReturn),
280 _ => ascii_check(c, is_byte),
282 let end = src.len() - chars.as_str().len();
283 callback(start..end, res);
286 fn skip_ascii_whitespace<F>(chars: &mut Chars<'_>, start: usize, callback: &mut F)
288 F: FnMut(Range<usize>, Result<char, EscapeError>),
290 let tail = chars.as_str();
291 let first_non_space = tail
293 .position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r')
294 .unwrap_or(tail.len());
295 if tail[1..first_non_space].contains('\n') {
296 // The +1 accounts for the escaping slash.
297 let end = start + first_non_space + 1;
298 callback(start..end, Err(EscapeError::MultipleSkippedLinesWarning));
300 let tail = &tail[first_non_space..];
301 if let Some(c) = tail.chars().nth(0) {
302 // For error reporting, we would like the span to contain the character that was not
303 // skipped. The +1 is necessary to account for the leading \ that started the escape.
304 let end = start + first_non_space + c.len_utf8() + 1;
305 if c.is_whitespace() {
306 callback(start..end, Err(EscapeError::UnskippedWhitespaceWarning));
309 *chars = tail.chars();
313 /// Takes a contents of a string literal (without quotes) and produces a
314 /// sequence of characters or errors.
315 /// NOTE: Raw strings do not perform any explicit character escaping, here we
316 /// only produce errors on bare CR.
317 fn unescape_raw_str_or_raw_byte_str<F>(src: &str, is_byte: bool, callback: &mut F)
319 F: FnMut(Range<usize>, Result<char, EscapeError>),
321 let mut chars = src.chars();
323 // The `start` and `end` computation here matches the one in
324 // `unescape_str_or_byte_str` for consistency, even though this function
325 // doesn't have to worry about skipping any chars.
326 while let Some(c) = chars.next() {
327 let start = src.len() - chars.as_str().len() - c.len_utf8();
329 '\r' => Err(EscapeError::BareCarriageReturnInRawString),
330 _ => ascii_check(c, is_byte),
332 let end = src.len() - chars.as_str().len();
333 callback(start..end, res);
/// Converts a `char` produced while unescaping a byte or byte string
/// literal into the `u8` it stands for.
///
/// The caller must pass a character whose code point fits in one byte. This
/// is checked with a `debug_assert!` only, so in builds without debug
/// assertions a larger code point is silently truncated.
pub fn byte_from_char(c: char) -> u8 {
    let res = c as u32;
    debug_assert!(res <= u8::MAX as u32, "guaranteed because of Mode::ByteStr");
    res as u8
}
344 fn is_ascii(x: u32) -> bool {