src/librustc_lexer/src/unescape.rs

   1 //! Utilities for validating string and char literals and turning them into
   2 //! values they represent.
   3
   4 use std::ops::Range;
   5 use std::str::Chars;
   6
   7 #[cfg(test)]
   8 mod tests;
   9
  10 /// Errors that can occur during string unescaping.
  11 #[derive(Debug, PartialEq, Eq)]
  12 pub enum EscapeError {
  13     /// Expected 1 char, but 0 were found.
  14     ZeroChars,
  15     /// Expected 1 char, but more than 1 were found.
  16     MoreThanOneChar,
  17
  18     /// Escaped '\' character without continuation.
  19     LoneSlash,
  20     /// Invalid escape character (e.g. '\z').
  21     InvalidEscape,
  22     /// Raw '\r' encountered.
  23     BareCarriageReturn,
  24     /// Raw '\r' encountered in raw string.
  25     BareCarriageReturnInRawString,
  26     /// Unescaped character that was expected to be escaped (e.g. raw '\t').
  27     EscapeOnlyChar,
  28
  29     /// Numeric character escape is too short (e.g. '\x1').
  30     TooShortHexEscape,
  31     /// Invalid character in numeric escape (e.g. '\xz')
  32     InvalidCharInHexEscape,
  33     /// Character code in numeric escape is non-ascii (e.g. '\xFF').
  34     OutOfRangeHexEscape,
  35
  36     /// '\u' not followed by '{'.
  37     NoBraceInUnicodeEscape,
  38     /// Non-hexadecimal value in '\u{..}'.
  39     InvalidCharInUnicodeEscape,
  40     /// '\u{}'
  41     EmptyUnicodeEscape,
  42     /// No closing brace in '\u{..}', e.g. '\u{12'.
  43     UnclosedUnicodeEscape,
  44     /// '\u{_12}'
  45     LeadingUnderscoreUnicodeEscape,
  46     /// More than 6 characters in '\u{..}', e.g. '\u{10FFFF_FF}'
  47     OverlongUnicodeEscape,
  48     /// Invalid in-bound unicode character code, e.g. '\u{DFFF}'.
  49     LoneSurrogateUnicodeEscape,
  50     /// Out of bounds unicode character code, e.g. '\u{FFFFFF}'.
  51     OutOfRangeUnicodeEscape,
  52
  53     /// Unicode escape code in byte literal.
  54     UnicodeEscapeInByte,
  55     /// Non-ascii character in byte literal.
  56     NonAsciiCharInByte,
  57     /// Non-ascii character in byte string literal.
  58     NonAsciiCharInByteString,
  59 }
  60
  61 /// Takes a contents of a literal (without quotes) and produces a
  62 /// sequence of escaped characters or errors.
  63 /// Values are returned through invoking of the provided callback.
  64 pub fn unescape_literal<F>(literal_text: &str, mode: Mode, callback: &mut F)
  65 where
  66     F: FnMut(Range<usize>, Result<char, EscapeError>),
  67 {
  68     match mode {
  69         Mode::Char | Mode::Byte => {
  70             let mut chars = literal_text.chars();
  71             let result = unescape_char_or_byte(&mut chars, mode);
  72             // The Chars iterator moved forward.
  73             callback(0..(literal_text.len() - chars.as_str().len()), result);
  74         }
  75         Mode::Str | Mode::ByteStr => unescape_str_or_byte_str(literal_text, mode, callback),
  76         // NOTE: Raw strings do not perform any explicit character escaping, here we
  77         // only translate CRLF to LF and produce errors on bare CR.
  78         Mode::RawStr | Mode::RawByteStr => {
  79             unescape_raw_str_or_byte_str(literal_text, mode, callback)
  80         }
  81     }
  82 }
  83
  84 /// Takes a contents of a byte, byte string or raw byte string (without quotes)
  85 /// and produces a sequence of bytes or errors.
  86 /// Values are returned through invoking of the provided callback.
  87 pub fn unescape_byte_literal<F>(literal_text: &str, mode: Mode, callback: &mut F)
  88 where
  89     F: FnMut(Range<usize>, Result<u8, EscapeError>),
  90 {
  91     assert!(mode.is_bytes());
  92     unescape_literal(literal_text, mode, &mut |range, result| {
  93         callback(range, result.map(byte_from_char));
  94     })
  95 }
  96
  97 /// Takes a contents of a char literal (without quotes), and returns an
  98 /// unescaped char or an error
  99 pub fn unescape_char(literal_text: &str) -> Result<char, (usize, EscapeError)> {
 100     let mut chars = literal_text.chars();
 101     unescape_char_or_byte(&mut chars, Mode::Char)
 102         .map_err(|err| (literal_text.len() - chars.as_str().len(), err))
 103 }
 104
 105 /// Takes a contents of a byte literal (without quotes), and returns an
 106 /// unescaped byte or an error.
 107 pub fn unescape_byte(literal_text: &str) -> Result<u8, (usize, EscapeError)> {
 108     let mut chars = literal_text.chars();
 109     unescape_char_or_byte(&mut chars, Mode::Byte)
 110         .map(byte_from_char)
 111         .map_err(|err| (literal_text.len() - chars.as_str().len(), err))
 112 }
 113
 114 /// What kind of literal do we parse.
 115 #[derive(Debug, Clone, Copy)]
 116 pub enum Mode {
 117     Char,
 118     Str,
 119     Byte,
 120     ByteStr,
 121     RawStr,
 122     RawByteStr,
 123 }
 124
 125 impl Mode {
 126     pub fn in_single_quotes(self) -> bool {
 127         match self {
 128             Mode::Char | Mode::Byte => true,
 129             Mode::Str | Mode::ByteStr | Mode::RawStr | Mode::RawByteStr => false,
 130         }
 131     }
 132
 133     pub fn in_double_quotes(self) -> bool {
 134         !self.in_single_quotes()
 135     }
 136
 137     pub fn is_bytes(self) -> bool {
 138         match self {
 139             Mode::Byte | Mode::ByteStr | Mode::RawByteStr => true,
 140             Mode::Char | Mode::Str | Mode::RawStr => false,
 141         }
 142     }
 143 }
 144
 145 fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
 146     if first_char != '\\' {
 147         // Previous character was not a slash, and we don't expect it to be
 148         // an escape-only character.
 149         return match first_char {
 150             '\t' | '\n' => Err(EscapeError::EscapeOnlyChar),
 151             '\r' => Err(EscapeError::BareCarriageReturn),
 152             '\'' if mode.in_single_quotes() => Err(EscapeError::EscapeOnlyChar),
 153             '"' if mode.in_double_quotes() => Err(EscapeError::EscapeOnlyChar),
 154             _ => {
 155                 if mode.is_bytes() && !first_char.is_ascii() {
 156                     // Byte literal can't be a non-ascii character.
 157                     return Err(EscapeError::NonAsciiCharInByte);
 158                 }
 159                 Ok(first_char)
 160             }
 161         };
 162     }
 163
 164     // Previous character is '\\', try to unescape it.
 165
 166     let second_char = chars.next().ok_or(EscapeError::LoneSlash)?;
 167
 168     let res = match second_char {
 169         '"' => '"',
 170         'n' => '\n',
 171         'r' => '\r',
 172         't' => '\t',
 173         '\\' => '\\',
 174         '\'' => '\'',
 175         '0' => '\0',
 176
 177         'x' => {
 178             // Parse hexadecimal character code.
 179
 180             let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
 181             let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
 182
 183             let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
 184             let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
 185
 186             let value = hi * 16 + lo;
 187
 188             // For a byte literal verify that it is within ASCII range.
 189             if !mode.is_bytes() && !is_ascii(value) {
 190                 return Err(EscapeError::OutOfRangeHexEscape);
 191             }
 192             let value = value as u8;
 193
 194             value as char
 195         }
 196
 197         'u' => {
 198             // We've parsed '\u', now we have to parse '{..}'.
 199
 200             if chars.next() != Some('{') {
 201                 return Err(EscapeError::NoBraceInUnicodeEscape);
 202             }
 203
 204             // First characrer must be a hexadecimal digit.
 205             let mut n_digits = 1;
 206             let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? {
 207                 '_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape),
 208                 '}' => return Err(EscapeError::EmptyUnicodeEscape),
 209                 c => c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?,
 210             };
 211
 212             // First character is valid, now parse the rest of the number
 213             // and closing brace.
 214             loop {
 215                 match chars.next() {
 216                     None => return Err(EscapeError::UnclosedUnicodeEscape),
 217                     Some('_') => continue,
 218                     Some('}') => {
 219                         if n_digits > 6 {
 220                             return Err(EscapeError::OverlongUnicodeEscape);
 221                         }
 222
 223                         // Incorrect syntax has higher priority for error reporting
 224                         // than unallowed value for a literal.
 225                         if mode.is_bytes() {
 226                             return Err(EscapeError::UnicodeEscapeInByte);
 227                         }
 228
 229                         break std::char::from_u32(value).ok_or_else(|| {
 230                             if value > 0x10FFFF {
 231                                 EscapeError::OutOfRangeUnicodeEscape
 232                             } else {
 233                                 EscapeError::LoneSurrogateUnicodeEscape
 234                             }
 235                         })?;
 236                     }
 237                     Some(c) => {
 238                         let digit =
 239                             c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?;
 240                         n_digits += 1;
 241                         if n_digits > 6 {
 242                             // Stop updating value since we're sure that it's is incorrect already.
 243                             continue;
 244                         }
 245                         let digit = digit as u32;
 246                         value = value * 16 + digit;
 247                     }
 248                 };
 249             }
 250         }
 251         _ => return Err(EscapeError::InvalidEscape),
 252     };
 253     Ok(res)
 254 }
 255
 256 fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
 257     let first_char = chars.next().ok_or(EscapeError::ZeroChars)?;
 258     let res = scan_escape(first_char, chars, mode)?;
 259     if chars.next().is_some() {
 260         return Err(EscapeError::MoreThanOneChar);
 261     }
 262     Ok(res)
 263 }
 264
 265 /// Takes a contents of a string literal (without quotes) and produces a
 266 /// sequence of escaped characters or errors.
 267 fn unescape_str_or_byte_str<F>(src: &str, mode: Mode, callback: &mut F)
 268 where
 269     F: FnMut(Range<usize>, Result<char, EscapeError>),
 270 {
 271     assert!(mode.in_double_quotes());
 272     let initial_len = src.len();
 273     let mut chars = src.chars();
 274     while let Some(first_char) = chars.next() {
 275         let start = initial_len - chars.as_str().len() - first_char.len_utf8();
 276
 277         let unescaped_char = match first_char {
 278             '\\' => {
 279                 let second_char = chars.clone().next();
 280                 match second_char {
 281                     Some('\n') => {
 282                         // Rust language specification requires us to skip whitespaces
 283                         // if unescaped '\' character is followed by '\n'.
 284                         // For details see [Rust language reference]
 285                         // (https://doc.rust-lang.org/reference/tokens.html#string-literals).
 286                         skip_ascii_whitespace(&mut chars);
 287                         continue;
 288                     }
 289                     _ => scan_escape(first_char, &mut chars, mode),
 290                 }
 291             }
 292             '\n' => Ok('\n'),
 293             '\t' => Ok('\t'),
 294             _ => scan_escape(first_char, &mut chars, mode),
 295         };
 296         let end = initial_len - chars.as_str().len();
 297         callback(start..end, unescaped_char);
 298     }
 299
 300     fn skip_ascii_whitespace(chars: &mut Chars<'_>) {
 301         let str = chars.as_str();
 302         let first_non_space = str
 303             .bytes()
 304             .position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r')
 305             .unwrap_or(str.len());
 306         *chars = str[first_non_space..].chars()
 307     }
 308 }
 309
 310 /// Takes a contents of a string literal (without quotes) and produces a
 311 /// sequence of characters or errors.
 312 /// NOTE: Raw strings do not perform any explicit character escaping, here we
 313 /// only translate CRLF to LF and produce errors on bare CR.
 314 fn unescape_raw_str_or_byte_str<F>(literal_text: &str, mode: Mode, callback: &mut F)
 315 where
 316     F: FnMut(Range<usize>, Result<char, EscapeError>),
 317 {
 318     assert!(mode.in_double_quotes());
 319     let initial_len = literal_text.len();
 320
 321     let mut chars = literal_text.chars();
 322     while let Some(curr) = chars.next() {
 323         let start = initial_len - chars.as_str().len() - curr.len_utf8();
 324
 325         let result = match curr {
 326             '\r' => Err(EscapeError::BareCarriageReturnInRawString),
 327             c if mode.is_bytes() && !c.is_ascii() => Err(EscapeError::NonAsciiCharInByteString),
 328             c => Ok(c),
 329         };
 330         let end = initial_len - chars.as_str().len();
 331
 332         callback(start..end, result);
 333     }
 334 }
 335
 336 fn byte_from_char(c: char) -> u8 {
 337     let res = c as u32;
 338     assert!(res <= u8::max_value() as u32, "guaranteed because of Mode::ByteStr");
 339     res as u8
 340 }
 341
 342 fn is_ascii(x: u32) -> bool {
 343     x <= 0x7F
 344 }