]>
Commit | Line | Data |
---|---|---|
e74abb32 XL |
1 | //! Low-level Rust lexer. |
2 | //! | |
cdc7bbd5 | 3 | //! The idea with `rustc_lexer` is to make a reusable library, |
f9f354fc | 4 | //! by separating out pure lexing and rustc-specific concerns, like spans, |
1b1a35ee | 5 | //! error reporting, and interning. So, rustc_lexer operates directly on `&str`, |
f9f354fc XL |
6 | //! produces simple tokens which are a pair of type-tag and a bit of original text, |
7 | //! and does not report errors, instead storing them as flags on the token. | |
8 | //! | |
f035d41b | 9 | //! Tokens produced by this lexer are not yet ready for parsing the Rust syntax. |
cdc7bbd5 | 10 | //! For that see [`rustc_parse::lexer`], which converts this basic token stream |
e74abb32 XL |
11 | //! into wide tokens used by the actual parser. |
12 | //! | |
13 | //! The purpose of this crate is to convert raw sources into a labeled sequence | |
14 | //! of well-known token types, so building an actual Rust token stream will | |
15 | //! be easier. | |
16 | //! | |
f035d41b | 17 | //! The main entity of this crate is the [`TokenKind`] enum which represents common |
e74abb32 | 18 | //! lexeme types. |
f035d41b | 19 | //! |
cdc7bbd5 | 20 | //! [`rustc_parse::lexer`]: ../rustc_parse/lexer/index.html |
e1599b0c XL |
21 | // We want to be able to build this crate with a stable compiler, so no |
22 | // `#![feature]` attributes should be added. | |
416331ca XL |
23 | |
24 | mod cursor; | |
25 | pub mod unescape; | |
26 | ||
ba9703b0 XL |
27 | #[cfg(test)] |
28 | mod tests; | |
29 | ||
60c5eb7d | 30 | use self::LiteralKind::*; |
dfeec247 XL |
31 | use self::TokenKind::*; |
32 | use crate::cursor::{Cursor, EOF_CHAR}; | |
f035d41b | 33 | use std::convert::TryFrom; |
416331ca | 34 | |
e74abb32 XL |
35 | /// Parsed token. |
36 | /// It doesn't contain information about data that has been parsed, | |
37 | /// only the type of the token and its size. | |
3dfed10e | 38 | #[derive(Debug)] |
416331ca XL |
39 | pub struct Token { |
40 | pub kind: TokenKind, | |
41 | pub len: usize, | |
42 | } | |
43 | ||
e74abb32 XL |
44 | impl Token { |
45 | fn new(kind: TokenKind, len: usize) -> Token { | |
46 | Token { kind, len } | |
47 | } | |
48 | } | |
49 | ||
74b04a01 | 50 | /// Enum representing common lexeme types. |
29967ef6 | 51 | // perf note: Changing all `usize` to `u32` doesn't change performance. See #77629 |
416331ca XL |
52 | #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] |
53 | pub enum TokenKind { | |
e74abb32 | 54 | // Multi-char tokens: |
e74abb32 | 55 | /// "// comment" |
3dfed10e | 56 | LineComment { doc_style: Option<DocStyle> }, |
f035d41b XL |
57 | /// `/* block comment */` |
58 | /// | |
59 | /// Block comments can be recursive, so the sequence like `/* /* */` | |
e74abb32 | 60 | /// will not be considered terminated and will result in a parsing error. |
3dfed10e | 61 | BlockComment { doc_style: Option<DocStyle>, terminated: bool }, |
e74abb32 | 62 | /// Any whitespace characters sequence. |
416331ca | 63 | Whitespace, |
e74abb32 XL |
64 | /// "ident" or "continue" |
65 | /// At this step keywords are also considered identifiers. | |
416331ca | 66 | Ident, |
e74abb32 | 67 | /// "r#ident" |
416331ca | 68 | RawIdent, |
136023e0 XL |
69 | /// An unknown prefix like `foo#`, `foo'`, `foo"`. Note that only the |
70 | /// prefix (`foo`) is included in the token, not the separator (which is | |
71 | /// lexed as its own distinct token). In Rust 2021 and later, reserved | |
72 | /// prefixes are reported as errors; in earlier editions, they result in a | |
73 | /// (allowed by default) lint, and are treated as regular identifier | |
74 | /// tokens. | |
75 | UnknownPrefix, | |
e74abb32 | 76 | /// "12_u8", "1.0e-40", "b"123"". See `LiteralKind` for more details. |
416331ca | 77 | Literal { kind: LiteralKind, suffix_start: usize }, |
e74abb32 | 78 | /// "'a" |
416331ca | 79 | Lifetime { starts_with_number: bool }, |
e74abb32 XL |
80 | |
81 | // One-char tokens: | |
e74abb32 | 82 | /// ";" |
416331ca | 83 | Semi, |
e74abb32 | 84 | /// "," |
416331ca | 85 | Comma, |
e74abb32 | 86 | /// "." |
416331ca | 87 | Dot, |
e74abb32 | 88 | /// "(" |
416331ca | 89 | OpenParen, |
e74abb32 | 90 | /// ")" |
416331ca | 91 | CloseParen, |
e74abb32 | 92 | /// "{" |
416331ca | 93 | OpenBrace, |
e74abb32 | 94 | /// "}" |
416331ca | 95 | CloseBrace, |
e74abb32 | 96 | /// "[" |
416331ca | 97 | OpenBracket, |
e74abb32 | 98 | /// "]" |
416331ca | 99 | CloseBracket, |
e74abb32 | 100 | /// "@" |
416331ca | 101 | At, |
e74abb32 | 102 | /// "#" |
416331ca | 103 | Pound, |
e74abb32 | 104 | /// "~" |
416331ca | 105 | Tilde, |
e74abb32 | 106 | /// "?" |
416331ca | 107 | Question, |
e74abb32 | 108 | /// ":" |
416331ca | 109 | Colon, |
e74abb32 | 110 | /// "$" |
416331ca | 111 | Dollar, |
e74abb32 | 112 | /// "=" |
416331ca | 113 | Eq, |
e74abb32 | 114 | /// "!" |
3dfed10e | 115 | Bang, |
e74abb32 | 116 | /// "<" |
416331ca | 117 | Lt, |
e74abb32 | 118 | /// ">" |
416331ca | 119 | Gt, |
e74abb32 | 120 | /// "-" |
416331ca | 121 | Minus, |
e74abb32 | 122 | /// "&" |
416331ca | 123 | And, |
e74abb32 | 124 | /// "|" |
416331ca | 125 | Or, |
e74abb32 | 126 | /// "+" |
416331ca | 127 | Plus, |
e74abb32 | 128 | /// "*" |
416331ca | 129 | Star, |
e74abb32 | 130 | /// "/" |
416331ca | 131 | Slash, |
e74abb32 | 132 | /// "^" |
416331ca | 133 | Caret, |
e74abb32 | 134 | /// "%" |
416331ca | 135 | Percent, |
e74abb32 XL |
136 | |
137 | /// Unknown token, not expected by the lexer, e.g. "№" | |
416331ca XL |
138 | Unknown, |
139 | } | |
416331ca | 140 | |
3dfed10e XL |
/// Which flavour of doc comment a comment token is.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum DocStyle {
    /// Outer doc comment (`///` or `/** */`).
    Outer,
    /// Inner doc comment (`//!` or `/*! */`).
    Inner,
}
146 | ||
416331ca XL |
147 | #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] |
148 | pub enum LiteralKind { | |
e74abb32 | 149 | /// "12_u8", "0o100", "0b120i99" |
416331ca | 150 | Int { base: Base, empty_int: bool }, |
e74abb32 | 151 | /// "12.34f32", "0b100.100" |
416331ca | 152 | Float { base: Base, empty_exponent: bool }, |
e74abb32 | 153 | /// "'a'", "'\\'", "'''", "';" |
416331ca | 154 | Char { terminated: bool }, |
e74abb32 | 155 | /// "b'a'", "b'\\'", "b'''", "b';" |
416331ca | 156 | Byte { terminated: bool }, |
e74abb32 | 157 | /// ""abc"", ""abc" |
416331ca | 158 | Str { terminated: bool }, |
e74abb32 | 159 | /// "b"abc"", "b"abc" |
416331ca | 160 | ByteStr { terminated: bool }, |
e74abb32 | 161 | /// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a" |
f035d41b | 162 | RawStr { n_hashes: u16, err: Option<RawStrError> }, |
e74abb32 | 163 | /// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a" |
f035d41b | 164 | RawByteStr { n_hashes: u16, err: Option<RawStrError> }, |
ba9703b0 XL |
165 | } |
166 | ||
/// Error produced while validating a raw string. Represents cases like:
/// - `r##~"abcde"##`: `InvalidStarter { bad_char: '~' }`
/// - `r###"abcde"##`: `NoTerminator { expected: 3, found: 2, possible_terminator_offset: Some(11) }`
/// - Too many `#`s (>65535): `TooManyDelimiters`
// perf note: It doesn't matter that this makes `Token` 36 bytes bigger. See #77629
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum RawStrError {
    /// A character other than `#` appears between the `r` and the opening
    /// `"`, e.g. `r#~"..`. `bad_char` is the offending character.
    InvalidStarter { bad_char: char },
    /// The string was never terminated. `expected` is the number of opening
    /// `#`s and `found` is the longest run of closing `#`s seen after any
    /// `"`. `possible_terminator_offset` is the number of characters after
    /// `r` or `br` where they may have intended to terminate it.
    NoTerminator { expected: usize, found: usize, possible_terminator_offset: Option<usize> },
    /// More than 65535 `#`s exist. `found` is the actual count.
    TooManyDelimiters { found: usize },
}
416331ca | 182 | |
/// Numeric base of a literal, as determined by its prefix.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum Base {
    /// The literal begins with `0b`.
    Binary,
    /// The literal begins with `0o`.
    Octal,
    /// The literal begins with `0x`.
    Hexadecimal,
    /// The literal has no base prefix.
    Decimal,
}
195 | ||
e74abb32 | 196 | /// `rustc` allows files to have a shebang, e.g. "#!/usr/bin/rustrun", |
f9f354fc | 197 | /// but shebang isn't a part of rust syntax. |
416331ca | 198 | pub fn strip_shebang(input: &str) -> Option<usize> { |
f9f354fc XL |
199 | // Shebang must start with `#!` literally, without any preceding whitespace. |
200 | // For simplicity we consider any line starting with `#!` a shebang, | |
201 | // regardless of restrictions put on shebangs by specific platforms. | |
202 | if let Some(input_tail) = input.strip_prefix("#!") { | |
1b1a35ee | 203 | // Ok, this is a shebang but if the next non-whitespace token is `[`, |
f9f354fc | 204 | // then it may be valid Rust code, so consider it Rust code. |
1b1a35ee XL |
205 | let next_non_whitespace_token = tokenize(input_tail).map(|tok| tok.kind).find(|tok| { |
206 | !matches!( | |
207 | tok, | |
208 | TokenKind::Whitespace | |
209 | | TokenKind::LineComment { doc_style: None } | |
210 | | TokenKind::BlockComment { doc_style: None, .. } | |
211 | ) | |
212 | }); | |
f9f354fc XL |
213 | if next_non_whitespace_token != Some(TokenKind::OpenBracket) { |
214 | // No other choice than to consider this a shebang. | |
215 | return Some(2 + input_tail.lines().next().unwrap_or_default().len()); | |
216 | } | |
416331ca | 217 | } |
f9f354fc | 218 | None |
416331ca XL |
219 | } |
220 | ||
e74abb32 | 221 | /// Parses the first token from the provided input string. |
416331ca XL |
222 | pub fn first_token(input: &str) -> Token { |
223 | debug_assert!(!input.is_empty()); | |
224 | Cursor::new(input).advance_token() | |
225 | } | |
226 | ||
e74abb32 | 227 | /// Creates an iterator that produces tokens from the input string. |
416331ca XL |
228 | pub fn tokenize(mut input: &str) -> impl Iterator<Item = Token> + '_ { |
229 | std::iter::from_fn(move || { | |
230 | if input.is_empty() { | |
231 | return None; | |
232 | } | |
233 | let token = first_token(input); | |
234 | input = &input[token.len..]; | |
235 | Some(token) | |
236 | }) | |
237 | } | |
238 | ||
/// Returns `true` if `c` counts as whitespace under the Rust language definition.
/// See the [Rust language reference](https://doc.rust-lang.org/reference/whitespace.html)
/// for the definition of this character class.
pub fn is_whitespace(c: char) -> bool {
    // This is Pattern_White_Space.
    //
    // The set is stable (it does not change between Unicode versions), so the
    // values are simply hard-coded.
    match c {
        // Usual ASCII suspects: \t, \n, vertical tab, form feed, \r
        '\u{0009}'..='\u{000D}' => true,
        // Space
        '\u{0020}' => true,
        // NEXT LINE from latin1
        '\u{0085}' => true,
        // Bidi markers: LEFT-TO-RIGHT MARK, RIGHT-TO-LEFT MARK
        '\u{200E}'..='\u{200F}' => true,
        // Dedicated whitespace characters from Unicode:
        // LINE SEPARATOR, PARAGRAPH SEPARATOR
        '\u{2028}'..='\u{2029}' => true,
        _ => false,
    }
}
270 | ||
271 | /// True if `c` is valid as a first character of an identifier. | |
e74abb32 XL |
272 | /// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for |
273 | /// a formal definition of valid identifier name. | |
e1599b0c XL |
274 | pub fn is_id_start(c: char) -> bool { |
275 | // This is XID_Start OR '_' (which formally is not a XID_Start). | |
94222f64 | 276 | c == '_' || unicode_xid::UnicodeXID::is_xid_start(c) |
e1599b0c XL |
277 | } |
278 | ||
279 | /// True if `c` is valid as a non-first character of an identifier. | |
e74abb32 XL |
280 | /// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for |
281 | /// a formal definition of valid identifier name. | |
e1599b0c | 282 | pub fn is_id_continue(c: char) -> bool { |
94222f64 | 283 | unicode_xid::UnicodeXID::is_xid_continue(c) |
e1599b0c XL |
284 | } |
285 | ||
3dfed10e XL |
286 | /// The passed string is lexically an identifier. |
287 | pub fn is_ident(string: &str) -> bool { | |
288 | let mut chars = string.chars(); | |
289 | if let Some(start) = chars.next() { | |
290 | is_id_start(start) && chars.all(is_id_continue) | |
291 | } else { | |
292 | false | |
293 | } | |
294 | } | |
295 | ||
416331ca | 296 | impl Cursor<'_> { |
e74abb32 | 297 | /// Parses a token from the input string. |
416331ca XL |
298 | fn advance_token(&mut self) -> Token { |
299 | let first_char = self.bump().unwrap(); | |
300 | let token_kind = match first_char { | |
e74abb32 | 301 | // Slash, comment or block comment. |
60c5eb7d | 302 | '/' => match self.first() { |
416331ca XL |
303 | '/' => self.line_comment(), |
304 | '*' => self.block_comment(), | |
e1599b0c | 305 | _ => Slash, |
416331ca | 306 | }, |
e74abb32 XL |
307 | |
308 | // Whitespace sequence. | |
e1599b0c | 309 | c if is_whitespace(c) => self.whitespace(), |
e74abb32 | 310 | |
60c5eb7d XL |
311 | // Raw identifier, raw string literal or identifier. |
312 | 'r' => match (self.first(), self.second()) { | |
e1599b0c | 313 | ('#', c1) if is_id_start(c1) => self.raw_ident(), |
416331ca | 314 | ('#', _) | ('"', _) => { |
f035d41b | 315 | let (n_hashes, err) = self.raw_double_quoted_string(1); |
416331ca | 316 | let suffix_start = self.len_consumed(); |
f035d41b | 317 | if err.is_none() { |
416331ca XL |
318 | self.eat_literal_suffix(); |
319 | } | |
f035d41b | 320 | let kind = RawStr { n_hashes, err }; |
416331ca XL |
321 | Literal { kind, suffix_start } |
322 | } | |
136023e0 | 323 | _ => self.ident_or_unknown_prefix(), |
416331ca | 324 | }, |
e74abb32 XL |
325 | |
326 | // Byte literal, byte string literal, raw byte string literal or identifier. | |
60c5eb7d | 327 | 'b' => match (self.first(), self.second()) { |
416331ca XL |
328 | ('\'', _) => { |
329 | self.bump(); | |
330 | let terminated = self.single_quoted_string(); | |
331 | let suffix_start = self.len_consumed(); | |
332 | if terminated { | |
333 | self.eat_literal_suffix(); | |
334 | } | |
335 | let kind = Byte { terminated }; | |
336 | Literal { kind, suffix_start } | |
337 | } | |
338 | ('"', _) => { | |
339 | self.bump(); | |
340 | let terminated = self.double_quoted_string(); | |
341 | let suffix_start = self.len_consumed(); | |
342 | if terminated { | |
343 | self.eat_literal_suffix(); | |
344 | } | |
345 | let kind = ByteStr { terminated }; | |
346 | Literal { kind, suffix_start } | |
347 | } | |
348 | ('r', '"') | ('r', '#') => { | |
349 | self.bump(); | |
f035d41b | 350 | let (n_hashes, err) = self.raw_double_quoted_string(2); |
416331ca | 351 | let suffix_start = self.len_consumed(); |
f035d41b | 352 | if err.is_none() { |
416331ca XL |
353 | self.eat_literal_suffix(); |
354 | } | |
f035d41b | 355 | let kind = RawByteStr { n_hashes, err }; |
416331ca XL |
356 | Literal { kind, suffix_start } |
357 | } | |
136023e0 | 358 | _ => self.ident_or_unknown_prefix(), |
416331ca | 359 | }, |
e74abb32 XL |
360 | |
361 | // Identifier (this should be checked after other variant that can | |
362 | // start as identifier). | |
136023e0 | 363 | c if is_id_start(c) => self.ident_or_unknown_prefix(), |
e74abb32 XL |
364 | |
365 | // Numeric literal. | |
416331ca XL |
366 | c @ '0'..='9' => { |
367 | let literal_kind = self.number(c); | |
368 | let suffix_start = self.len_consumed(); | |
369 | self.eat_literal_suffix(); | |
370 | TokenKind::Literal { kind: literal_kind, suffix_start } | |
371 | } | |
e74abb32 XL |
372 | |
373 | // One-symbol tokens. | |
416331ca XL |
374 | ';' => Semi, |
375 | ',' => Comma, | |
e1599b0c | 376 | '.' => Dot, |
416331ca XL |
377 | '(' => OpenParen, |
378 | ')' => CloseParen, | |
379 | '{' => OpenBrace, | |
380 | '}' => CloseBrace, | |
381 | '[' => OpenBracket, | |
382 | ']' => CloseBracket, | |
383 | '@' => At, | |
384 | '#' => Pound, | |
385 | '~' => Tilde, | |
386 | '?' => Question, | |
e1599b0c | 387 | ':' => Colon, |
416331ca | 388 | '$' => Dollar, |
e1599b0c | 389 | '=' => Eq, |
3dfed10e | 390 | '!' => Bang, |
e1599b0c XL |
391 | '<' => Lt, |
392 | '>' => Gt, | |
393 | '-' => Minus, | |
394 | '&' => And, | |
395 | '|' => Or, | |
396 | '+' => Plus, | |
397 | '*' => Star, | |
398 | '^' => Caret, | |
399 | '%' => Percent, | |
e74abb32 XL |
400 | |
401 | // Lifetime or character literal. | |
416331ca | 402 | '\'' => self.lifetime_or_char(), |
e74abb32 XL |
403 | |
404 | // String literal. | |
416331ca XL |
405 | '"' => { |
406 | let terminated = self.double_quoted_string(); | |
407 | let suffix_start = self.len_consumed(); | |
408 | if terminated { | |
409 | self.eat_literal_suffix(); | |
410 | } | |
411 | let kind = Str { terminated }; | |
412 | Literal { kind, suffix_start } | |
413 | } | |
414 | _ => Unknown, | |
415 | }; | |
416 | Token::new(token_kind, self.len_consumed()) | |
417 | } | |
418 | ||
419 | fn line_comment(&mut self) -> TokenKind { | |
60c5eb7d | 420 | debug_assert!(self.prev() == '/' && self.first() == '/'); |
416331ca | 421 | self.bump(); |
3dfed10e XL |
422 | |
423 | let doc_style = match self.first() { | |
424 | // `//!` is an inner line doc comment. | |
425 | '!' => Some(DocStyle::Inner), | |
426 | // `////` (more than 3 slashes) is not considered a doc comment. | |
427 | '/' if self.second() != '/' => Some(DocStyle::Outer), | |
428 | _ => None, | |
429 | }; | |
430 | ||
60c5eb7d | 431 | self.eat_while(|c| c != '\n'); |
3dfed10e | 432 | LineComment { doc_style } |
416331ca XL |
433 | } |
434 | ||
435 | fn block_comment(&mut self) -> TokenKind { | |
60c5eb7d | 436 | debug_assert!(self.prev() == '/' && self.first() == '*'); |
416331ca | 437 | self.bump(); |
3dfed10e XL |
438 | |
439 | let doc_style = match self.first() { | |
440 | // `/*!` is an inner block doc comment. | |
441 | '!' => Some(DocStyle::Inner), | |
442 | // `/***` (more than 2 stars) is not considered a doc comment. | |
443 | // `/**/` is not considered a doc comment. | |
444 | '*' if !matches!(self.second(), '*' | '/') => Some(DocStyle::Outer), | |
445 | _ => None, | |
446 | }; | |
447 | ||
416331ca XL |
448 | let mut depth = 1usize; |
449 | while let Some(c) = self.bump() { | |
450 | match c { | |
60c5eb7d | 451 | '/' if self.first() == '*' => { |
416331ca XL |
452 | self.bump(); |
453 | depth += 1; | |
454 | } | |
60c5eb7d | 455 | '*' if self.first() == '/' => { |
416331ca XL |
456 | self.bump(); |
457 | depth -= 1; | |
458 | if depth == 0 { | |
e74abb32 XL |
459 | // This block comment is closed, so for a construction like "/* */ */" |
460 | // there will be a successfully parsed block comment "/* */" | |
461 | // and " */" will be processed separately. | |
416331ca XL |
462 | break; |
463 | } | |
464 | } | |
465 | _ => (), | |
466 | } | |
467 | } | |
468 | ||
3dfed10e | 469 | BlockComment { doc_style, terminated: depth == 0 } |
416331ca XL |
470 | } |
471 | ||
472 | fn whitespace(&mut self) -> TokenKind { | |
e1599b0c | 473 | debug_assert!(is_whitespace(self.prev())); |
60c5eb7d | 474 | self.eat_while(is_whitespace); |
416331ca XL |
475 | Whitespace |
476 | } | |
477 | ||
478 | fn raw_ident(&mut self) -> TokenKind { | |
dfeec247 | 479 | debug_assert!(self.prev() == 'r' && self.first() == '#' && is_id_start(self.second())); |
60c5eb7d | 480 | // Eat "#" symbol. |
416331ca | 481 | self.bump(); |
60c5eb7d XL |
482 | // Eat the identifier part of RawIdent. |
483 | self.eat_identifier(); | |
416331ca XL |
484 | RawIdent |
485 | } | |
486 | ||
136023e0 | 487 | fn ident_or_unknown_prefix(&mut self) -> TokenKind { |
e1599b0c | 488 | debug_assert!(is_id_start(self.prev())); |
60c5eb7d XL |
489 | // Start is already eaten, eat the rest of identifier. |
490 | self.eat_while(is_id_continue); | |
136023e0 | 491 | // Known prefixes must have been handled earlier. So if |
94222f64 | 492 | // we see a prefix here, it is definitely an unknown prefix. |
136023e0 XL |
493 | match self.first() { |
494 | '#' | '"' | '\'' => UnknownPrefix, | |
495 | _ => Ident, | |
496 | } | |
416331ca XL |
497 | } |
498 | ||
499 | fn number(&mut self, first_digit: char) -> LiteralKind { | |
500 | debug_assert!('0' <= self.prev() && self.prev() <= '9'); | |
501 | let mut base = Base::Decimal; | |
502 | if first_digit == '0' { | |
e74abb32 | 503 | // Attempt to parse encoding base. |
60c5eb7d | 504 | let has_digits = match self.first() { |
416331ca XL |
505 | 'b' => { |
506 | base = Base::Binary; | |
507 | self.bump(); | |
508 | self.eat_decimal_digits() | |
509 | } | |
510 | 'o' => { | |
511 | base = Base::Octal; | |
512 | self.bump(); | |
513 | self.eat_decimal_digits() | |
514 | } | |
515 | 'x' => { | |
516 | base = Base::Hexadecimal; | |
517 | self.bump(); | |
518 | self.eat_hexadecimal_digits() | |
519 | } | |
e74abb32 | 520 | // Not a base prefix. |
416331ca XL |
521 | '0'..='9' | '_' | '.' | 'e' | 'E' => { |
522 | self.eat_decimal_digits(); | |
523 | true | |
524 | } | |
e74abb32 | 525 | // Just a 0. |
416331ca XL |
526 | _ => return Int { base, empty_int: false }, |
527 | }; | |
e74abb32 XL |
528 | // Base prefix was provided, but there were no digits |
529 | // after it, e.g. "0x". | |
416331ca XL |
530 | if !has_digits { |
531 | return Int { base, empty_int: true }; | |
532 | } | |
533 | } else { | |
e74abb32 | 534 | // No base prefix, parse number in the usual way. |
416331ca XL |
535 | self.eat_decimal_digits(); |
536 | }; | |
537 | ||
60c5eb7d | 538 | match self.first() { |
416331ca XL |
539 | // Don't be greedy if this is actually an |
540 | // integer literal followed by field/method access or a range pattern | |
541 | // (`0..2` and `12.foo()`) | |
dfeec247 | 542 | '.' if self.second() != '.' && !is_id_start(self.second()) => { |
416331ca XL |
543 | // might have stuff after the ., and if it does, it needs to start |
544 | // with a number | |
545 | self.bump(); | |
546 | let mut empty_exponent = false; | |
60c5eb7d | 547 | if self.first().is_digit(10) { |
416331ca | 548 | self.eat_decimal_digits(); |
60c5eb7d | 549 | match self.first() { |
416331ca XL |
550 | 'e' | 'E' => { |
551 | self.bump(); | |
60c5eb7d | 552 | empty_exponent = !self.eat_float_exponent(); |
416331ca XL |
553 | } |
554 | _ => (), | |
555 | } | |
556 | } | |
557 | Float { base, empty_exponent } | |
558 | } | |
559 | 'e' | 'E' => { | |
560 | self.bump(); | |
60c5eb7d | 561 | let empty_exponent = !self.eat_float_exponent(); |
416331ca XL |
562 | Float { base, empty_exponent } |
563 | } | |
564 | _ => Int { base, empty_int: false }, | |
565 | } | |
566 | } | |
567 | ||
568 | fn lifetime_or_char(&mut self) -> TokenKind { | |
569 | debug_assert!(self.prev() == '\''); | |
e74abb32 | 570 | |
60c5eb7d XL |
571 | let can_be_a_lifetime = if self.second() == '\'' { |
572 | // It's surely not a lifetime. | |
573 | false | |
574 | } else { | |
575 | // If the first symbol is valid for identifier, it can be a lifetime. | |
576 | // Also check if it's a number for a better error reporting (so '0 will | |
577 | // be reported as invalid lifetime and not as unterminated char literal). | |
578 | is_id_start(self.first()) || self.first().is_digit(10) | |
579 | }; | |
416331ca | 580 | |
60c5eb7d XL |
581 | if !can_be_a_lifetime { |
582 | let terminated = self.single_quoted_string(); | |
583 | let suffix_start = self.len_consumed(); | |
584 | if terminated { | |
585 | self.eat_literal_suffix(); | |
586 | } | |
587 | let kind = Char { terminated }; | |
588 | return Literal { kind, suffix_start }; | |
416331ca | 589 | } |
e74abb32 | 590 | |
60c5eb7d XL |
591 | // Either a lifetime or a character literal with |
592 | // length greater than 1. | |
593 | ||
594 | let starts_with_number = self.first().is_digit(10); | |
595 | ||
596 | // Skip the literal contents. | |
597 | // First symbol can be a number (which isn't a valid identifier start), | |
598 | // so skip it without any checks. | |
599 | self.bump(); | |
600 | self.eat_while(is_id_continue); | |
601 | ||
602 | // Check if after skipping literal contents we've met a closing | |
603 | // single quote (which means that user attempted to create a | |
604 | // string with single quotes). | |
605 | if self.first() == '\'' { | |
606 | self.bump(); | |
607 | let kind = Char { terminated: true }; | |
ba9703b0 XL |
608 | Literal { kind, suffix_start: self.len_consumed() } |
609 | } else { | |
610 | Lifetime { starts_with_number } | |
416331ca | 611 | } |
416331ca XL |
612 | } |
613 | ||
614 | fn single_quoted_string(&mut self) -> bool { | |
615 | debug_assert!(self.prev() == '\''); | |
60c5eb7d XL |
616 | // Check if it's a one-symbol literal. |
617 | if self.second() == '\'' && self.first() != '\\' { | |
618 | self.bump(); | |
416331ca | 619 | self.bump(); |
60c5eb7d | 620 | return true; |
416331ca | 621 | } |
60c5eb7d XL |
622 | |
623 | // Literal has more than one symbol. | |
624 | ||
e74abb32 | 625 | // Parse until either quotes are terminated or error is detected. |
416331ca | 626 | loop { |
60c5eb7d | 627 | match self.first() { |
e74abb32 | 628 | // Quotes are terminated, finish parsing. |
416331ca XL |
629 | '\'' => { |
630 | self.bump(); | |
631 | return true; | |
632 | } | |
60c5eb7d XL |
633 | // Probably beginning of the comment, which we don't want to include |
634 | // to the error report. | |
635 | '/' => break, | |
636 | // Newline without following '\'' means unclosed quote, stop parsing. | |
637 | '\n' if self.second() != '\'' => break, | |
638 | // End of file, stop parsing. | |
639 | EOF_CHAR if self.is_eof() => break, | |
e74abb32 | 640 | // Escaped slash is considered one character, so bump twice. |
416331ca XL |
641 | '\\' => { |
642 | self.bump(); | |
643 | self.bump(); | |
644 | } | |
e74abb32 | 645 | // Skip the character. |
416331ca XL |
646 | _ => { |
647 | self.bump(); | |
648 | } | |
649 | } | |
416331ca | 650 | } |
60c5eb7d | 651 | // String was not terminated. |
416331ca XL |
652 | false |
653 | } | |
654 | ||
e74abb32 XL |
655 | /// Eats double-quoted string and returns true |
656 | /// if string is terminated. | |
416331ca XL |
657 | fn double_quoted_string(&mut self) -> bool { |
658 | debug_assert!(self.prev() == '"'); | |
60c5eb7d XL |
659 | while let Some(c) = self.bump() { |
660 | match c { | |
416331ca | 661 | '"' => { |
416331ca XL |
662 | return true; |
663 | } | |
60c5eb7d XL |
664 | '\\' if self.first() == '\\' || self.first() == '"' => { |
665 | // Bump again to skip escaped character. | |
416331ca XL |
666 | self.bump(); |
667 | } | |
668 | _ => (), | |
669 | } | |
416331ca | 670 | } |
60c5eb7d XL |
671 | // End of file reached. |
672 | false | |
416331ca XL |
673 | } |
674 | ||
f035d41b XL |
675 | /// Eats the double-quoted string and returns `n_hashes` and an error if encountered. |
676 | fn raw_double_quoted_string(&mut self, prefix_len: usize) -> (u16, Option<RawStrError>) { | |
677 | // Wrap the actual function to handle the error with too many hashes. | |
678 | // This way, it eats the whole raw string. | |
679 | let (n_hashes, err) = self.raw_string_unvalidated(prefix_len); | |
680 | // Only up to 65535 `#`s are allowed in raw strings | |
681 | match u16::try_from(n_hashes) { | |
682 | Ok(num) => (num, err), | |
683 | // We lie about the number of hashes here :P | |
684 | Err(_) => (0, Some(RawStrError::TooManyDelimiters { found: n_hashes })), | |
685 | } | |
686 | } | |
687 | ||
688 | fn raw_string_unvalidated(&mut self, prefix_len: usize) -> (usize, Option<RawStrError>) { | |
416331ca | 689 | debug_assert!(self.prev() == 'r'); |
ba9703b0 | 690 | let start_pos = self.len_consumed(); |
f035d41b XL |
691 | let mut possible_terminator_offset = None; |
692 | let mut max_hashes = 0; | |
60c5eb7d | 693 | |
e74abb32 | 694 | // Count opening '#' symbols. |
29967ef6 XL |
695 | let mut eaten = 0; |
696 | while self.first() == '#' { | |
697 | eaten += 1; | |
698 | self.bump(); | |
699 | } | |
700 | let n_start_hashes = eaten; | |
60c5eb7d XL |
701 | |
702 | // Check that string is started. | |
703 | match self.bump() { | |
f035d41b XL |
704 | Some('"') => (), |
705 | c => { | |
706 | let c = c.unwrap_or(EOF_CHAR); | |
707 | return (n_start_hashes, Some(RawStrError::InvalidStarter { bad_char: c })); | |
ba9703b0 | 708 | } |
60c5eb7d XL |
709 | } |
710 | ||
711 | // Skip the string contents and on each '#' character met, check if this is | |
712 | // a raw string termination. | |
ba9703b0 | 713 | loop { |
60c5eb7d XL |
714 | self.eat_while(|c| c != '"'); |
715 | ||
716 | if self.is_eof() { | |
f035d41b | 717 | return ( |
ba9703b0 | 718 | n_start_hashes, |
f035d41b XL |
719 | Some(RawStrError::NoTerminator { |
720 | expected: n_start_hashes, | |
721 | found: max_hashes, | |
722 | possible_terminator_offset, | |
723 | }), | |
724 | ); | |
416331ca | 725 | } |
416331ca | 726 | |
60c5eb7d XL |
727 | // Eat closing double quote. |
728 | self.bump(); | |
729 | ||
730 | // Check that amount of closing '#' symbols | |
731 | // is equal to the amount of opening ones. | |
f9f354fc | 732 | // Note that this will not consume extra trailing `#` characters: |
f035d41b | 733 | // `r###"abcde"####` is lexed as a `RawStr { n_hashes: 3 }` |
f9f354fc | 734 | // followed by a `#` token. |
29967ef6 XL |
735 | let mut n_end_hashes = 0; |
736 | while self.first() == '#' && n_end_hashes < n_start_hashes { | |
737 | n_end_hashes += 1; | |
738 | self.bump(); | |
739 | } | |
ba9703b0 XL |
740 | |
741 | if n_end_hashes == n_start_hashes { | |
f035d41b | 742 | return (n_start_hashes, None); |
ba9703b0 | 743 | } else if n_end_hashes > max_hashes { |
f9f354fc XL |
744 | // Keep track of possible terminators to give a hint about |
745 | // where there might be a missing terminator | |
ba9703b0 XL |
746 | possible_terminator_offset = |
747 | Some(self.len_consumed() - start_pos - n_end_hashes + prefix_len); | |
748 | max_hashes = n_end_hashes; | |
749 | } | |
416331ca XL |
750 | } |
751 | } | |
752 | ||
753 | fn eat_decimal_digits(&mut self) -> bool { | |
754 | let mut has_digits = false; | |
755 | loop { | |
60c5eb7d | 756 | match self.first() { |
416331ca XL |
757 | '_' => { |
758 | self.bump(); | |
759 | } | |
760 | '0'..='9' => { | |
761 | has_digits = true; | |
762 | self.bump(); | |
763 | } | |
764 | _ => break, | |
765 | } | |
766 | } | |
767 | has_digits | |
768 | } | |
769 | ||
770 | fn eat_hexadecimal_digits(&mut self) -> bool { | |
771 | let mut has_digits = false; | |
772 | loop { | |
60c5eb7d | 773 | match self.first() { |
416331ca XL |
774 | '_' => { |
775 | self.bump(); | |
776 | } | |
777 | '0'..='9' | 'a'..='f' | 'A'..='F' => { | |
778 | has_digits = true; | |
779 | self.bump(); | |
780 | } | |
781 | _ => break, | |
782 | } | |
783 | } | |
784 | has_digits | |
785 | } | |
786 | ||
60c5eb7d XL |
787 | /// Eats the float exponent. Returns true if at least one digit was met, |
788 | /// and returns false otherwise. | |
789 | fn eat_float_exponent(&mut self) -> bool { | |
416331ca | 790 | debug_assert!(self.prev() == 'e' || self.prev() == 'E'); |
60c5eb7d | 791 | if self.first() == '-' || self.first() == '+' { |
416331ca XL |
792 | self.bump(); |
793 | } | |
60c5eb7d | 794 | self.eat_decimal_digits() |
416331ca XL |
795 | } |
796 | ||
60c5eb7d | 797 | // Eats the suffix of the literal, e.g. "_u8". |
416331ca | 798 | fn eat_literal_suffix(&mut self) { |
60c5eb7d XL |
799 | self.eat_identifier(); |
800 | } | |
801 | ||
802 | // Eats the identifier. | |
803 | fn eat_identifier(&mut self) { | |
804 | if !is_id_start(self.first()) { | |
416331ca XL |
805 | return; |
806 | } | |
807 | self.bump(); | |
808 | ||
60c5eb7d XL |
809 | self.eat_while(is_id_continue); |
810 | } | |
811 | ||
812 | /// Eats symbols while predicate returns true or until the end of file is reached. | |
29967ef6 | 813 | fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) { |
60c5eb7d | 814 | while predicate(self.first()) && !self.is_eof() { |
416331ca XL |
815 | self.bump(); |
816 | } | |
817 | } | |
416331ca | 818 | } |