]>
Commit | Line | Data |
---|---|---|
e74abb32 XL |
1 | //! Low-level Rust lexer. |
2 | //! | |
3 | //! Tokens produced by this lexer are not yet ready for parsing the Rust syntax; | |
60c5eb7d | 4 | //! for that, see `librustc_parse::lexer`, which converts this basic token stream |
e74abb32 XL |
5 | //! into wide tokens used by the actual parser. |
6 | //! | |
7 | //! The purpose of this crate is to convert raw sources into a labeled sequence | |
8 | //! of well-known token types, so building an actual Rust token stream will | |
9 | //! be easier. | |
10 | //! | |
11 | //! The main entity of this crate is the [`TokenKind`] enum, which represents common | |
12 | //! lexeme types. | |
13 | ||
e1599b0c XL |
14 | // We want to be able to build this crate with a stable compiler, so no |
15 | // `#![feature]` attributes should be added. | |
416331ca XL |
16 | |
17 | mod cursor; | |
18 | pub mod unescape; | |
19 | ||
ba9703b0 XL |
20 | #[cfg(test)] |
21 | mod tests; | |
22 | ||
60c5eb7d | 23 | use self::LiteralKind::*; |
dfeec247 XL |
24 | use self::TokenKind::*; |
25 | use crate::cursor::{Cursor, EOF_CHAR}; | |
ba9703b0 | 26 | use std::convert::TryInto; |
416331ca | 27 | |
e74abb32 XL |
28 | /// Parsed token. |
29 | /// It doesn't contain information about data that has been parsed, | |
30 | /// only the type of the token and its size. | |
416331ca XL |
31 | pub struct Token { |
32 | pub kind: TokenKind, | |
33 | pub len: usize, | |
34 | } | |
35 | ||
e74abb32 XL |
36 | impl Token { |
37 | fn new(kind: TokenKind, len: usize) -> Token { | |
38 | Token { kind, len } | |
39 | } | |
40 | } | |
41 | ||
74b04a01 | 42 | /// Enum representing common lexeme types. |
416331ca XL |
43 | #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] |
44 | pub enum TokenKind { | |
e74abb32 | 45 | // Multi-char tokens: |
e74abb32 | 46 | /// "// comment" |
416331ca | 47 | LineComment, |
e74abb32 XL |
48 | /// "/* block comment */" |
49 | /// Block comments can be recursive, so the sequence like "/* /* */" | |
50 | /// will not be considered terminated and will result in a parsing error. | |
416331ca | 51 | BlockComment { terminated: bool }, |
e74abb32 | 52 | /// Any whitespace characters sequence. |
416331ca | 53 | Whitespace, |
e74abb32 XL |
54 | /// "ident" or "continue" |
55 | /// At this step keywords are also considered identifiers. | |
416331ca | 56 | Ident, |
e74abb32 | 57 | /// "r#ident" |
416331ca | 58 | RawIdent, |
e74abb32 | 59 | /// "12_u8", "1.0e-40", "b"123"". See `LiteralKind` for more details. |
416331ca | 60 | Literal { kind: LiteralKind, suffix_start: usize }, |
e74abb32 | 61 | /// "'a" |
416331ca | 62 | Lifetime { starts_with_number: bool }, |
e74abb32 XL |
63 | |
64 | // One-char tokens: | |
e74abb32 | 65 | /// ";" |
416331ca | 66 | Semi, |
e74abb32 | 67 | /// "," |
416331ca | 68 | Comma, |
e74abb32 | 69 | /// "." |
416331ca | 70 | Dot, |
e74abb32 | 71 | /// "(" |
416331ca | 72 | OpenParen, |
e74abb32 | 73 | /// ")" |
416331ca | 74 | CloseParen, |
e74abb32 | 75 | /// "{" |
416331ca | 76 | OpenBrace, |
e74abb32 | 77 | /// "}" |
416331ca | 78 | CloseBrace, |
e74abb32 | 79 | /// "[" |
416331ca | 80 | OpenBracket, |
e74abb32 | 81 | /// "]" |
416331ca | 82 | CloseBracket, |
e74abb32 | 83 | /// "@" |
416331ca | 84 | At, |
e74abb32 | 85 | /// "#" |
416331ca | 86 | Pound, |
e74abb32 | 87 | /// "~" |
416331ca | 88 | Tilde, |
e74abb32 | 89 | /// "?" |
416331ca | 90 | Question, |
e74abb32 | 91 | /// ":" |
416331ca | 92 | Colon, |
e74abb32 | 93 | /// "$" |
416331ca | 94 | Dollar, |
e74abb32 | 95 | /// "=" |
416331ca | 96 | Eq, |
e74abb32 | 97 | /// "!" |
416331ca | 98 | Not, |
e74abb32 | 99 | /// "<" |
416331ca | 100 | Lt, |
e74abb32 | 101 | /// ">" |
416331ca | 102 | Gt, |
e74abb32 | 103 | /// "-" |
416331ca | 104 | Minus, |
e74abb32 | 105 | /// "&" |
416331ca | 106 | And, |
e74abb32 | 107 | /// "|" |
416331ca | 108 | Or, |
e74abb32 | 109 | /// "+" |
416331ca | 110 | Plus, |
e74abb32 | 111 | /// "*" |
416331ca | 112 | Star, |
e74abb32 | 113 | /// "/" |
416331ca | 114 | Slash, |
e74abb32 | 115 | /// "^" |
416331ca | 116 | Caret, |
e74abb32 | 117 | /// "%" |
416331ca | 118 | Percent, |
e74abb32 XL |
119 | |
120 | /// Unknown token, not expected by the lexer, e.g. "№" | |
416331ca XL |
121 | Unknown, |
122 | } | |
416331ca XL |
123 | |
124 | #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] | |
125 | pub enum LiteralKind { | |
e74abb32 | 126 | /// "12_u8", "0o100", "0b120i99" |
416331ca | 127 | Int { base: Base, empty_int: bool }, |
e74abb32 | 128 | /// "12.34f32", "0b100.100" |
416331ca | 129 | Float { base: Base, empty_exponent: bool }, |
e74abb32 | 130 | /// "'a'", "'\\'", "'''", "';" |
416331ca | 131 | Char { terminated: bool }, |
e74abb32 | 132 | /// "b'a'", "b'\\'", "b'''", "b';" |
416331ca | 133 | Byte { terminated: bool }, |
e74abb32 | 134 | /// ""abc"", ""abc" |
416331ca | 135 | Str { terminated: bool }, |
e74abb32 | 136 | /// "b"abc"", "b"abc" |
416331ca | 137 | ByteStr { terminated: bool }, |
e74abb32 | 138 | /// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a" |
ba9703b0 | 139 | RawStr(UnvalidatedRawStr), |
e74abb32 | 140 | /// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a" |
ba9703b0 XL |
141 | RawByteStr(UnvalidatedRawStr), |
142 | } | |
143 | ||
/// Something that was lexed as a raw string but has not been checked for
/// problems yet. Call `.validate()` to turn it into something usable.
///
/// The field order matters: the derived ordering compares fields in
/// declaration order.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub struct UnvalidatedRawStr {
    /// Whether the prefix (`r###"`) is valid.
    valid_start: bool,

    /// Whether the postfix (`"###`) is valid.
    valid_end: bool,

    /// How many `#` characters open the string.
    n_start_hashes: usize,
    /// How many `#` characters close the string.
    /// `n_end_hashes` <= `n_start_hashes`.
    n_end_hashes: usize,
    /// Offset, counted from the `r` or `br`, of the place where the user may
    /// have intended to end the string.
    /// Currently, it is the longest sequence of pattern `"#+"`.
    possible_terminator_offset: Option<usize>,
}
163 | ||
/// Ways in which validating a raw string can fail. For example:
/// - `r##~"abcde"##`: `LexRawStrError::InvalidStarter`
/// - `r###"abcde"##`:
///   `LexRawStrError::NoTerminator { expected: 3, found: 2, possible_terminator_offset: Some(11) }`
/// - More `#`s than fit in a `u16` (>65535): `LexRawStrError::TooManyDelimiters`
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum LexRawStrError {
    /// Something other than `#` appears between the `r` and the `"`, e.g. `r#~"..`.
    InvalidStarter,
    /// The string was never terminated. `possible_terminator_offset` is the
    /// number of characters after `r` or `br` where the user may have intended
    /// to terminate it.
    NoTerminator { expected: usize, found: usize, possible_terminator_offset: Option<usize> },
    /// More than 65535 `#`s open the string.
    TooManyDelimiters,
}
178 | ||
/// A raw string whose prefix (`#+"`) and postfix (`"#+`) are both valid and
/// contain the same number of `#` characters.
///
/// Extra trailing `#` characters are not consumed: `r###"abcde"####` is lexed
/// as a `ValidatedRawStr { n_hashes: 3 }` followed by a `#` token.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct ValidatedRawStr {
    /// Number of `#` characters on each side of the string.
    n_hashes: u16,
}

impl ValidatedRawStr {
    /// Returns the number of `#` characters delimiting the raw string.
    pub fn num_hashes(&self) -> u16 {
        let ValidatedRawStr { n_hashes } = *self;
        n_hashes
    }
}
193 | ||
194 | impl UnvalidatedRawStr { | |
195 | pub fn validate(self) -> Result<ValidatedRawStr, LexRawStrError> { | |
196 | if !self.valid_start { | |
197 | return Err(LexRawStrError::InvalidStarter); | |
198 | } | |
199 | ||
200 | // Only up to 65535 `#`s are allowed in raw strings | |
201 | let n_start_safe: u16 = | |
202 | self.n_start_hashes.try_into().map_err(|_| LexRawStrError::TooManyDelimiters)?; | |
203 | ||
204 | if self.n_start_hashes > self.n_end_hashes || !self.valid_end { | |
205 | Err(LexRawStrError::NoTerminator { | |
206 | expected: self.n_start_hashes, | |
207 | found: self.n_end_hashes, | |
208 | possible_terminator_offset: self.possible_terminator_offset, | |
209 | }) | |
210 | } else { | |
211 | // Since the lexer should never produce a literal with n_end > n_start, if n_start <= n_end, | |
212 | // they must be equal. | |
213 | debug_assert_eq!(self.n_start_hashes, self.n_end_hashes); | |
214 | Ok(ValidatedRawStr { n_hashes: n_start_safe }) | |
215 | } | |
216 | } | |
416331ca | 217 | } |
416331ca | 218 | |
/// Numeric base of a literal, as selected by its prefix.
///
/// The variant order is significant for the derived ordering.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum Base {
    /// The literal starts with `0b`.
    Binary,
    /// The literal starts with `0o`.
    Octal,
    /// The literal starts with `0x`.
    Hexadecimal,
    /// The literal has no base prefix.
    Decimal,
}
231 | ||
/// `rustc` allows files to start with a shebang, e.g. `#!/usr/bin/rustrun`,
/// but a shebang is not part of Rust syntax, so this function reports how much
/// of the input should be skipped when the first line is one.
///
/// Returns `Some(len)` when `input` starts with `#!`, where `len` is the byte
/// length of the first line (excluding the `\n`, or the whole input if it has
/// no newline). Returns `None` when the input does not start with `#!`, or when
/// it starts with `#![`, which is the beginning of an inner attribute such as
/// `#![deny(missing_docs)]`.
///
/// Empty input is accepted and yields `None`; it cannot contain a shebang, so
/// it is not treated as a caller error.
pub fn strip_shebang(input: &str) -> Option<usize> {
    let looks_like_shebang = input.starts_with("#!") && !input.starts_with("#![");
    if !looks_like_shebang {
        return None;
    }
    // The shebang extends to the end of the first line.
    Some(input.find('\n').unwrap_or(input.len()))
}
244 | ||
e74abb32 | 245 | /// Parses the first token from the provided input string. |
416331ca XL |
246 | pub fn first_token(input: &str) -> Token { |
247 | debug_assert!(!input.is_empty()); | |
248 | Cursor::new(input).advance_token() | |
249 | } | |
250 | ||
e74abb32 | 251 | /// Creates an iterator that produces tokens from the input string. |
416331ca XL |
252 | pub fn tokenize(mut input: &str) -> impl Iterator<Item = Token> + '_ { |
253 | std::iter::from_fn(move || { | |
254 | if input.is_empty() { | |
255 | return None; | |
256 | } | |
257 | let token = first_token(input); | |
258 | input = &input[token.len..]; | |
259 | Some(token) | |
260 | }) | |
261 | } | |
262 | ||
/// Returns `true` if `c` counts as whitespace in the Rust language definition.
///
/// See the [Rust language reference](https://doc.rust-lang.org/reference/whitespace.html)
/// for the definition of this class.
pub fn is_whitespace(c: char) -> bool {
    // This is Pattern_White_Space.
    //
    // The set is stable (it does not change between Unicode versions), so the
    // values are simply hard-coded.
    const PATTERN_WHITE_SPACE: [char; 11] = [
        // Usual ASCII suspects
        '\u{0009}', // \t
        '\u{000A}', // \n
        '\u{000B}', // vertical tab
        '\u{000C}', // form feed
        '\u{000D}', // \r
        '\u{0020}', // space
        // NEXT LINE from latin1
        '\u{0085}',
        // Bidi markers
        '\u{200E}', // LEFT-TO-RIGHT MARK
        '\u{200F}', // RIGHT-TO-LEFT MARK
        // Dedicated whitespace characters from Unicode
        '\u{2028}', // LINE SEPARATOR
        '\u{2029}', // PARAGRAPH SEPARATOR
    ];

    PATTERN_WHITE_SPACE.contains(&c)
}
295 | ||
296 | /// True if `c` is valid as a first character of an identifier. | |
e74abb32 XL |
297 | /// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for |
298 | /// a formal definition of valid identifier name. | |
e1599b0c XL |
299 | pub fn is_id_start(c: char) -> bool { |
300 | // This is XID_Start OR '_' (which formally is not a XID_Start). | |
301 | // We also add fast-path for ascii idents | |
302 | ('a' <= c && c <= 'z') | |
303 | || ('A' <= c && c <= 'Z') | |
304 | || c == '_' | |
305 | || (c > '\x7f' && unicode_xid::UnicodeXID::is_xid_start(c)) | |
306 | } | |
307 | ||
308 | /// True if `c` is valid as a non-first character of an identifier. | |
e74abb32 XL |
309 | /// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for |
310 | /// a formal definition of valid identifier name. | |
e1599b0c XL |
311 | pub fn is_id_continue(c: char) -> bool { |
312 | // This is exactly XID_Continue. | |
313 | // We also add fast-path for ascii idents | |
314 | ('a' <= c && c <= 'z') | |
315 | || ('A' <= c && c <= 'Z') | |
316 | || ('0' <= c && c <= '9') | |
317 | || c == '_' | |
318 | || (c > '\x7f' && unicode_xid::UnicodeXID::is_xid_continue(c)) | |
319 | } | |
320 | ||
416331ca | 321 | impl Cursor<'_> { |
e74abb32 | 322 | /// Parses a token from the input string. |
416331ca XL |
323 | fn advance_token(&mut self) -> Token { |
324 | let first_char = self.bump().unwrap(); | |
325 | let token_kind = match first_char { | |
e74abb32 | 326 | // Slash, comment or block comment. |
60c5eb7d | 327 | '/' => match self.first() { |
416331ca XL |
328 | '/' => self.line_comment(), |
329 | '*' => self.block_comment(), | |
e1599b0c | 330 | _ => Slash, |
416331ca | 331 | }, |
e74abb32 XL |
332 | |
333 | // Whitespace sequence. | |
e1599b0c | 334 | c if is_whitespace(c) => self.whitespace(), |
e74abb32 | 335 | |
60c5eb7d XL |
336 | // Raw identifier, raw string literal or identifier. |
337 | 'r' => match (self.first(), self.second()) { | |
e1599b0c | 338 | ('#', c1) if is_id_start(c1) => self.raw_ident(), |
416331ca | 339 | ('#', _) | ('"', _) => { |
ba9703b0 | 340 | let raw_str_i = self.raw_double_quoted_string(1); |
416331ca | 341 | let suffix_start = self.len_consumed(); |
ba9703b0 | 342 | if raw_str_i.n_end_hashes == raw_str_i.n_start_hashes { |
416331ca XL |
343 | self.eat_literal_suffix(); |
344 | } | |
ba9703b0 | 345 | let kind = RawStr(raw_str_i); |
416331ca XL |
346 | Literal { kind, suffix_start } |
347 | } | |
348 | _ => self.ident(), | |
349 | }, | |
e74abb32 XL |
350 | |
351 | // Byte literal, byte string literal, raw byte string literal or identifier. | |
60c5eb7d | 352 | 'b' => match (self.first(), self.second()) { |
416331ca XL |
353 | ('\'', _) => { |
354 | self.bump(); | |
355 | let terminated = self.single_quoted_string(); | |
356 | let suffix_start = self.len_consumed(); | |
357 | if terminated { | |
358 | self.eat_literal_suffix(); | |
359 | } | |
360 | let kind = Byte { terminated }; | |
361 | Literal { kind, suffix_start } | |
362 | } | |
363 | ('"', _) => { | |
364 | self.bump(); | |
365 | let terminated = self.double_quoted_string(); | |
366 | let suffix_start = self.len_consumed(); | |
367 | if terminated { | |
368 | self.eat_literal_suffix(); | |
369 | } | |
370 | let kind = ByteStr { terminated }; | |
371 | Literal { kind, suffix_start } | |
372 | } | |
373 | ('r', '"') | ('r', '#') => { | |
374 | self.bump(); | |
ba9703b0 | 375 | let raw_str_i = self.raw_double_quoted_string(2); |
416331ca | 376 | let suffix_start = self.len_consumed(); |
ba9703b0 | 377 | let terminated = raw_str_i.n_start_hashes == raw_str_i.n_end_hashes; |
416331ca XL |
378 | if terminated { |
379 | self.eat_literal_suffix(); | |
380 | } | |
ba9703b0 XL |
381 | |
382 | let kind = RawByteStr(raw_str_i); | |
416331ca XL |
383 | Literal { kind, suffix_start } |
384 | } | |
385 | _ => self.ident(), | |
386 | }, | |
e74abb32 XL |
387 | |
388 | // Identifier (this should be checked after other variant that can | |
389 | // start as identifier). | |
e1599b0c | 390 | c if is_id_start(c) => self.ident(), |
e74abb32 XL |
391 | |
392 | // Numeric literal. | |
416331ca XL |
393 | c @ '0'..='9' => { |
394 | let literal_kind = self.number(c); | |
395 | let suffix_start = self.len_consumed(); | |
396 | self.eat_literal_suffix(); | |
397 | TokenKind::Literal { kind: literal_kind, suffix_start } | |
398 | } | |
e74abb32 XL |
399 | |
400 | // One-symbol tokens. | |
416331ca XL |
401 | ';' => Semi, |
402 | ',' => Comma, | |
e1599b0c | 403 | '.' => Dot, |
416331ca XL |
404 | '(' => OpenParen, |
405 | ')' => CloseParen, | |
406 | '{' => OpenBrace, | |
407 | '}' => CloseBrace, | |
408 | '[' => OpenBracket, | |
409 | ']' => CloseBracket, | |
410 | '@' => At, | |
411 | '#' => Pound, | |
412 | '~' => Tilde, | |
413 | '?' => Question, | |
e1599b0c | 414 | ':' => Colon, |
416331ca | 415 | '$' => Dollar, |
e1599b0c XL |
416 | '=' => Eq, |
417 | '!' => Not, | |
418 | '<' => Lt, | |
419 | '>' => Gt, | |
420 | '-' => Minus, | |
421 | '&' => And, | |
422 | '|' => Or, | |
423 | '+' => Plus, | |
424 | '*' => Star, | |
425 | '^' => Caret, | |
426 | '%' => Percent, | |
e74abb32 XL |
427 | |
428 | // Lifetime or character literal. | |
416331ca | 429 | '\'' => self.lifetime_or_char(), |
e74abb32 XL |
430 | |
431 | // String literal. | |
416331ca XL |
432 | '"' => { |
433 | let terminated = self.double_quoted_string(); | |
434 | let suffix_start = self.len_consumed(); | |
435 | if terminated { | |
436 | self.eat_literal_suffix(); | |
437 | } | |
438 | let kind = Str { terminated }; | |
439 | Literal { kind, suffix_start } | |
440 | } | |
441 | _ => Unknown, | |
442 | }; | |
443 | Token::new(token_kind, self.len_consumed()) | |
444 | } | |
445 | ||
446 | fn line_comment(&mut self) -> TokenKind { | |
60c5eb7d | 447 | debug_assert!(self.prev() == '/' && self.first() == '/'); |
416331ca | 448 | self.bump(); |
60c5eb7d | 449 | self.eat_while(|c| c != '\n'); |
416331ca XL |
450 | LineComment |
451 | } | |
452 | ||
453 | fn block_comment(&mut self) -> TokenKind { | |
60c5eb7d | 454 | debug_assert!(self.prev() == '/' && self.first() == '*'); |
416331ca XL |
455 | self.bump(); |
456 | let mut depth = 1usize; | |
457 | while let Some(c) = self.bump() { | |
458 | match c { | |
60c5eb7d | 459 | '/' if self.first() == '*' => { |
416331ca XL |
460 | self.bump(); |
461 | depth += 1; | |
462 | } | |
60c5eb7d | 463 | '*' if self.first() == '/' => { |
416331ca XL |
464 | self.bump(); |
465 | depth -= 1; | |
466 | if depth == 0 { | |
e74abb32 XL |
467 | // This block comment is closed, so for a construction like "/* */ */" |
468 | // there will be a successfully parsed block comment "/* */" | |
469 | // and " */" will be processed separately. | |
416331ca XL |
470 | break; |
471 | } | |
472 | } | |
473 | _ => (), | |
474 | } | |
475 | } | |
476 | ||
477 | BlockComment { terminated: depth == 0 } | |
478 | } | |
479 | ||
480 | fn whitespace(&mut self) -> TokenKind { | |
e1599b0c | 481 | debug_assert!(is_whitespace(self.prev())); |
60c5eb7d | 482 | self.eat_while(is_whitespace); |
416331ca XL |
483 | Whitespace |
484 | } | |
485 | ||
486 | fn raw_ident(&mut self) -> TokenKind { | |
dfeec247 | 487 | debug_assert!(self.prev() == 'r' && self.first() == '#' && is_id_start(self.second())); |
60c5eb7d | 488 | // Eat "#" symbol. |
416331ca | 489 | self.bump(); |
60c5eb7d XL |
490 | // Eat the identifier part of RawIdent. |
491 | self.eat_identifier(); | |
416331ca XL |
492 | RawIdent |
493 | } | |
494 | ||
495 | fn ident(&mut self) -> TokenKind { | |
e1599b0c | 496 | debug_assert!(is_id_start(self.prev())); |
60c5eb7d XL |
497 | // Start is already eaten, eat the rest of identifier. |
498 | self.eat_while(is_id_continue); | |
416331ca XL |
499 | Ident |
500 | } | |
501 | ||
502 | fn number(&mut self, first_digit: char) -> LiteralKind { | |
503 | debug_assert!('0' <= self.prev() && self.prev() <= '9'); | |
504 | let mut base = Base::Decimal; | |
505 | if first_digit == '0' { | |
e74abb32 | 506 | // Attempt to parse encoding base. |
60c5eb7d | 507 | let has_digits = match self.first() { |
416331ca XL |
508 | 'b' => { |
509 | base = Base::Binary; | |
510 | self.bump(); | |
511 | self.eat_decimal_digits() | |
512 | } | |
513 | 'o' => { | |
514 | base = Base::Octal; | |
515 | self.bump(); | |
516 | self.eat_decimal_digits() | |
517 | } | |
518 | 'x' => { | |
519 | base = Base::Hexadecimal; | |
520 | self.bump(); | |
521 | self.eat_hexadecimal_digits() | |
522 | } | |
e74abb32 | 523 | // Not a base prefix. |
416331ca XL |
524 | '0'..='9' | '_' | '.' | 'e' | 'E' => { |
525 | self.eat_decimal_digits(); | |
526 | true | |
527 | } | |
e74abb32 | 528 | // Just a 0. |
416331ca XL |
529 | _ => return Int { base, empty_int: false }, |
530 | }; | |
e74abb32 XL |
531 | // Base prefix was provided, but there were no digits |
532 | // after it, e.g. "0x". | |
416331ca XL |
533 | if !has_digits { |
534 | return Int { base, empty_int: true }; | |
535 | } | |
536 | } else { | |
e74abb32 | 537 | // No base prefix, parse number in the usual way. |
416331ca XL |
538 | self.eat_decimal_digits(); |
539 | }; | |
540 | ||
60c5eb7d | 541 | match self.first() { |
416331ca XL |
542 | // Don't be greedy if this is actually an |
543 | // integer literal followed by field/method access or a range pattern | |
544 | // (`0..2` and `12.foo()`) | |
dfeec247 | 545 | '.' if self.second() != '.' && !is_id_start(self.second()) => { |
416331ca XL |
546 | // might have stuff after the ., and if it does, it needs to start |
547 | // with a number | |
548 | self.bump(); | |
549 | let mut empty_exponent = false; | |
60c5eb7d | 550 | if self.first().is_digit(10) { |
416331ca | 551 | self.eat_decimal_digits(); |
60c5eb7d | 552 | match self.first() { |
416331ca XL |
553 | 'e' | 'E' => { |
554 | self.bump(); | |
60c5eb7d | 555 | empty_exponent = !self.eat_float_exponent(); |
416331ca XL |
556 | } |
557 | _ => (), | |
558 | } | |
559 | } | |
560 | Float { base, empty_exponent } | |
561 | } | |
562 | 'e' | 'E' => { | |
563 | self.bump(); | |
60c5eb7d | 564 | let empty_exponent = !self.eat_float_exponent(); |
416331ca XL |
565 | Float { base, empty_exponent } |
566 | } | |
567 | _ => Int { base, empty_int: false }, | |
568 | } | |
569 | } | |
570 | ||
571 | fn lifetime_or_char(&mut self) -> TokenKind { | |
572 | debug_assert!(self.prev() == '\''); | |
e74abb32 | 573 | |
60c5eb7d XL |
574 | let can_be_a_lifetime = if self.second() == '\'' { |
575 | // It's surely not a lifetime. | |
576 | false | |
577 | } else { | |
578 | // If the first symbol is valid for identifier, it can be a lifetime. | |
579 | // Also check if it's a number for a better error reporting (so '0 will | |
580 | // be reported as invalid lifetime and not as unterminated char literal). | |
581 | is_id_start(self.first()) || self.first().is_digit(10) | |
582 | }; | |
416331ca | 583 | |
60c5eb7d XL |
584 | if !can_be_a_lifetime { |
585 | let terminated = self.single_quoted_string(); | |
586 | let suffix_start = self.len_consumed(); | |
587 | if terminated { | |
588 | self.eat_literal_suffix(); | |
589 | } | |
590 | let kind = Char { terminated }; | |
591 | return Literal { kind, suffix_start }; | |
416331ca | 592 | } |
e74abb32 | 593 | |
60c5eb7d XL |
594 | // Either a lifetime or a character literal with |
595 | // length greater than 1. | |
596 | ||
597 | let starts_with_number = self.first().is_digit(10); | |
598 | ||
599 | // Skip the literal contents. | |
600 | // First symbol can be a number (which isn't a valid identifier start), | |
601 | // so skip it without any checks. | |
602 | self.bump(); | |
603 | self.eat_while(is_id_continue); | |
604 | ||
605 | // Check if after skipping literal contents we've met a closing | |
606 | // single quote (which means that user attempted to create a | |
607 | // string with single quotes). | |
608 | if self.first() == '\'' { | |
609 | self.bump(); | |
610 | let kind = Char { terminated: true }; | |
ba9703b0 XL |
611 | Literal { kind, suffix_start: self.len_consumed() } |
612 | } else { | |
613 | Lifetime { starts_with_number } | |
416331ca | 614 | } |
416331ca XL |
615 | } |
616 | ||
617 | fn single_quoted_string(&mut self) -> bool { | |
618 | debug_assert!(self.prev() == '\''); | |
60c5eb7d XL |
619 | // Check if it's a one-symbol literal. |
620 | if self.second() == '\'' && self.first() != '\\' { | |
621 | self.bump(); | |
416331ca | 622 | self.bump(); |
60c5eb7d | 623 | return true; |
416331ca | 624 | } |
60c5eb7d XL |
625 | |
626 | // Literal has more than one symbol. | |
627 | ||
e74abb32 | 628 | // Parse until either quotes are terminated or error is detected. |
416331ca | 629 | loop { |
60c5eb7d | 630 | match self.first() { |
e74abb32 | 631 | // Quotes are terminated, finish parsing. |
416331ca XL |
632 | '\'' => { |
633 | self.bump(); | |
634 | return true; | |
635 | } | |
60c5eb7d XL |
636 | // Probably beginning of the comment, which we don't want to include |
637 | // to the error report. | |
638 | '/' => break, | |
639 | // Newline without following '\'' means unclosed quote, stop parsing. | |
640 | '\n' if self.second() != '\'' => break, | |
641 | // End of file, stop parsing. | |
642 | EOF_CHAR if self.is_eof() => break, | |
e74abb32 | 643 | // Escaped slash is considered one character, so bump twice. |
416331ca XL |
644 | '\\' => { |
645 | self.bump(); | |
646 | self.bump(); | |
647 | } | |
e74abb32 | 648 | // Skip the character. |
416331ca XL |
649 | _ => { |
650 | self.bump(); | |
651 | } | |
652 | } | |
416331ca | 653 | } |
60c5eb7d | 654 | // String was not terminated. |
416331ca XL |
655 | false |
656 | } | |
657 | ||
e74abb32 XL |
658 | /// Eats double-quoted string and returns true |
659 | /// if string is terminated. | |
416331ca XL |
660 | fn double_quoted_string(&mut self) -> bool { |
661 | debug_assert!(self.prev() == '"'); | |
60c5eb7d XL |
662 | while let Some(c) = self.bump() { |
663 | match c { | |
416331ca | 664 | '"' => { |
416331ca XL |
665 | return true; |
666 | } | |
60c5eb7d XL |
667 | '\\' if self.first() == '\\' || self.first() == '"' => { |
668 | // Bump again to skip escaped character. | |
416331ca XL |
669 | self.bump(); |
670 | } | |
671 | _ => (), | |
672 | } | |
416331ca | 673 | } |
60c5eb7d XL |
674 | // End of file reached. |
675 | false | |
416331ca XL |
676 | } |
677 | ||
ba9703b0 XL |
678 | /// Eats the double-quoted string and returns an `UnvalidatedRawStr`. |
679 | fn raw_double_quoted_string(&mut self, prefix_len: usize) -> UnvalidatedRawStr { | |
416331ca | 680 | debug_assert!(self.prev() == 'r'); |
ba9703b0 XL |
681 | let mut valid_start: bool = false; |
682 | let start_pos = self.len_consumed(); | |
683 | let (mut possible_terminator_offset, mut max_hashes) = (None, 0); | |
60c5eb7d | 684 | |
e74abb32 | 685 | // Count opening '#' symbols. |
ba9703b0 | 686 | let n_start_hashes = self.eat_while(|c| c == '#'); |
60c5eb7d XL |
687 | |
688 | // Check that string is started. | |
689 | match self.bump() { | |
ba9703b0 XL |
690 | Some('"') => valid_start = true, |
691 | _ => { | |
692 | return UnvalidatedRawStr { | |
693 | valid_start, | |
694 | valid_end: false, | |
695 | n_start_hashes, | |
696 | n_end_hashes: 0, | |
697 | possible_terminator_offset, | |
698 | }; | |
699 | } | |
60c5eb7d XL |
700 | } |
701 | ||
702 | // Skip the string contents and on each '#' character met, check if this is | |
703 | // a raw string termination. | |
ba9703b0 | 704 | loop { |
60c5eb7d XL |
705 | self.eat_while(|c| c != '"'); |
706 | ||
707 | if self.is_eof() { | |
ba9703b0 XL |
708 | return UnvalidatedRawStr { |
709 | valid_start, | |
710 | valid_end: false, | |
711 | n_start_hashes, | |
712 | n_end_hashes: max_hashes, | |
713 | possible_terminator_offset, | |
714 | }; | |
416331ca | 715 | } |
416331ca | 716 | |
60c5eb7d XL |
717 | // Eat closing double quote. |
718 | self.bump(); | |
719 | ||
720 | // Check that amount of closing '#' symbols | |
721 | // is equal to the amount of opening ones. | |
ba9703b0 | 722 | let mut hashes_left = n_start_hashes; |
60c5eb7d XL |
723 | let is_closing_hash = |c| { |
724 | if c == '#' && hashes_left != 0 { | |
725 | hashes_left -= 1; | |
726 | true | |
727 | } else { | |
728 | false | |
416331ca | 729 | } |
60c5eb7d | 730 | }; |
ba9703b0 XL |
731 | let n_end_hashes = self.eat_while(is_closing_hash); |
732 | ||
733 | if n_end_hashes == n_start_hashes { | |
734 | return UnvalidatedRawStr { | |
735 | valid_start, | |
736 | valid_end: true, | |
737 | n_start_hashes, | |
738 | n_end_hashes, | |
739 | possible_terminator_offset: None, | |
740 | }; | |
741 | } else if n_end_hashes > max_hashes { | |
742 | // Keep track of possible terminators to give a hint about where there might be | |
743 | // a missing terminator | |
744 | possible_terminator_offset = | |
745 | Some(self.len_consumed() - start_pos - n_end_hashes + prefix_len); | |
746 | max_hashes = n_end_hashes; | |
747 | } | |
416331ca XL |
748 | } |
749 | } | |
750 | ||
751 | fn eat_decimal_digits(&mut self) -> bool { | |
752 | let mut has_digits = false; | |
753 | loop { | |
60c5eb7d | 754 | match self.first() { |
416331ca XL |
755 | '_' => { |
756 | self.bump(); | |
757 | } | |
758 | '0'..='9' => { | |
759 | has_digits = true; | |
760 | self.bump(); | |
761 | } | |
762 | _ => break, | |
763 | } | |
764 | } | |
765 | has_digits | |
766 | } | |
767 | ||
768 | fn eat_hexadecimal_digits(&mut self) -> bool { | |
769 | let mut has_digits = false; | |
770 | loop { | |
60c5eb7d | 771 | match self.first() { |
416331ca XL |
772 | '_' => { |
773 | self.bump(); | |
774 | } | |
775 | '0'..='9' | 'a'..='f' | 'A'..='F' => { | |
776 | has_digits = true; | |
777 | self.bump(); | |
778 | } | |
779 | _ => break, | |
780 | } | |
781 | } | |
782 | has_digits | |
783 | } | |
784 | ||
60c5eb7d XL |
785 | /// Eats the float exponent. Returns true if at least one digit was met, |
786 | /// and returns false otherwise. | |
787 | fn eat_float_exponent(&mut self) -> bool { | |
416331ca | 788 | debug_assert!(self.prev() == 'e' || self.prev() == 'E'); |
60c5eb7d | 789 | if self.first() == '-' || self.first() == '+' { |
416331ca XL |
790 | self.bump(); |
791 | } | |
60c5eb7d | 792 | self.eat_decimal_digits() |
416331ca XL |
793 | } |
794 | ||
60c5eb7d | 795 | // Eats the suffix of the literal, e.g. "_u8". |
416331ca | 796 | fn eat_literal_suffix(&mut self) { |
60c5eb7d XL |
797 | self.eat_identifier(); |
798 | } | |
799 | ||
800 | // Eats the identifier. | |
801 | fn eat_identifier(&mut self) { | |
802 | if !is_id_start(self.first()) { | |
416331ca XL |
803 | return; |
804 | } | |
805 | self.bump(); | |
806 | ||
60c5eb7d XL |
807 | self.eat_while(is_id_continue); |
808 | } | |
809 | ||
810 | /// Eats symbols while predicate returns true or until the end of file is reached. | |
811 | /// Returns amount of eaten symbols. | |
812 | fn eat_while<F>(&mut self, mut predicate: F) -> usize | |
813 | where | |
dfeec247 | 814 | F: FnMut(char) -> bool, |
60c5eb7d XL |
815 | { |
816 | let mut eaten: usize = 0; | |
817 | while predicate(self.first()) && !self.is_eof() { | |
818 | eaten += 1; | |
416331ca XL |
819 | self.bump(); |
820 | } | |
60c5eb7d XL |
821 | |
822 | eaten | |
416331ca | 823 | } |
416331ca | 824 | } |