]>
Commit | Line | Data |
---|---|---|
f20569fa XL |
1 | //! Low-level Rust lexer. |
2 | //! | |
3 | //! The idea with `librustc_lexer` is to make a reusable library, | |
4 | //! by separating out pure lexing and rustc-specific concerns, like spans, | |
5 | //! error reporting, and interning. So, rustc_lexer operates directly on `&str`, | |
6 | //! produces simple tokens which are a pair of type-tag and a bit of original text, | |
7 | //! and does not report errors, instead storing them as flags on the token. | |
8 | //! | |
9 | //! Tokens produced by this lexer are not yet ready for parsing the Rust syntax. | |
10 | //! For that see [`librustc_parse::lexer`], which converts this basic token stream | |
11 | //! into wide tokens used by the actual parser. | |
12 | //! | |
13 | //! The purpose of this crate is to convert raw sources into a labeled sequence | |
14 | //! of well-known token types, so building an actual Rust token stream will | |
15 | //! be easier. | |
16 | //! | |
17 | //! The main entity of this crate is the [`TokenKind`] enum which represents common | |
18 | //! lexeme types. | |
19 | //! | |
20 | //! [`librustc_parse::lexer`]: ../rustc_parse/lexer/index.html | |
21 | // We want to be able to build this crate with a stable compiler, so no | |
22 | // `#![feature]` attributes should be added. | |
23 | ||
24 | mod cursor; | |
25 | pub mod unescape; | |
26 | ||
27 | #[cfg(test)] | |
28 | mod tests; | |
29 | ||
30 | use self::LiteralKind::*; | |
31 | use self::TokenKind::*; | |
32 | use crate::cursor::{Cursor, EOF_CHAR}; | |
33 | use std::convert::TryFrom; | |
34 | ||
35 | /// Parsed token. | |
36 | /// It doesn't contain information about data that has been parsed, | |
37 | /// only the type of the token and its size. | |
38 | #[derive(Debug)] | |
39 | pub struct Token { | |
40 | pub kind: TokenKind, | |
41 | pub len: usize, | |
42 | } | |
43 | ||
44 | impl Token { | |
45 | fn new(kind: TokenKind, len: usize) -> Token { | |
46 | Token { kind, len } | |
47 | } | |
48 | } | |
49 | ||
50 | /// Enum representing common lexeme types. | |
51 | // perf note: Changing all `usize` to `u32` doesn't change performance. See #77629 | |
52 | #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] | |
53 | pub enum TokenKind { | |
54 | // Multi-char tokens: | |
55 | /// "// comment" | |
56 | LineComment { doc_style: Option<DocStyle> }, | |
57 | /// `/* block comment */` | |
58 | /// | |
59 | /// Block comments can be recursive, so the sequence like `/* /* */` | |
60 | /// will not be considered terminated and will result in a parsing error. | |
61 | BlockComment { doc_style: Option<DocStyle>, terminated: bool }, | |
62 | /// Any whitespace characters sequence. | |
63 | Whitespace, | |
64 | /// "ident" or "continue" | |
65 | /// At this step keywords are also considered identifiers. | |
66 | Ident, | |
67 | /// "r#ident" | |
68 | RawIdent, | |
69 | /// "12_u8", "1.0e-40", "b"123"". See `LiteralKind` for more details. | |
70 | Literal { kind: LiteralKind, suffix_start: usize }, | |
71 | /// "'a" | |
72 | Lifetime { starts_with_number: bool }, | |
73 | ||
74 | // One-char tokens: | |
75 | /// ";" | |
76 | Semi, | |
77 | /// "," | |
78 | Comma, | |
79 | /// "." | |
80 | Dot, | |
81 | /// "(" | |
82 | OpenParen, | |
83 | /// ")" | |
84 | CloseParen, | |
85 | /// "{" | |
86 | OpenBrace, | |
87 | /// "}" | |
88 | CloseBrace, | |
89 | /// "[" | |
90 | OpenBracket, | |
91 | /// "]" | |
92 | CloseBracket, | |
93 | /// "@" | |
94 | At, | |
95 | /// "#" | |
96 | Pound, | |
97 | /// "~" | |
98 | Tilde, | |
99 | /// "?" | |
100 | Question, | |
101 | /// ":" | |
102 | Colon, | |
103 | /// "$" | |
104 | Dollar, | |
105 | /// "=" | |
106 | Eq, | |
107 | /// "!" | |
108 | Bang, | |
109 | /// "<" | |
110 | Lt, | |
111 | /// ">" | |
112 | Gt, | |
113 | /// "-" | |
114 | Minus, | |
115 | /// "&" | |
116 | And, | |
117 | /// "|" | |
118 | Or, | |
119 | /// "+" | |
120 | Plus, | |
121 | /// "*" | |
122 | Star, | |
123 | /// "/" | |
124 | Slash, | |
125 | /// "^" | |
126 | Caret, | |
127 | /// "%" | |
128 | Percent, | |
129 | ||
130 | /// Unknown token, not expected by the lexer, e.g. "№" | |
131 | Unknown, | |
132 | } | |
133 | ||
/// Style of a doc comment, as recorded on comment tokens.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum DocStyle {
    /// Assigned to `///` line comments and `/**` block comments.
    Outer,
    /// Assigned to `//!` line comments and `/*!` block comments.
    Inner,
}
139 | ||
140 | #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] | |
141 | pub enum LiteralKind { | |
142 | /// "12_u8", "0o100", "0b120i99" | |
143 | Int { base: Base, empty_int: bool }, | |
144 | /// "12.34f32", "0b100.100" | |
145 | Float { base: Base, empty_exponent: bool }, | |
146 | /// "'a'", "'\\'", "'''", "';" | |
147 | Char { terminated: bool }, | |
148 | /// "b'a'", "b'\\'", "b'''", "b';" | |
149 | Byte { terminated: bool }, | |
150 | /// ""abc"", ""abc" | |
151 | Str { terminated: bool }, | |
152 | /// "b"abc"", "b"abc" | |
153 | ByteStr { terminated: bool }, | |
154 | /// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a" | |
155 | RawStr { n_hashes: u16, err: Option<RawStrError> }, | |
156 | /// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a" | |
157 | RawByteStr { n_hashes: u16, err: Option<RawStrError> }, | |
158 | } | |
159 | ||
/// Problems found while validating a raw string. Examples:
/// - `r##~"abcde"##`: `InvalidStarter`
/// - `r###"abcde"##`: `NoTerminator { expected: 3, found: 2, possible_terminator_offset: Some(11) }`
/// - Too many `#`s (>65535): `TooManyDelimiters`
// perf note: It doesn't matter that this makes `Token` 36 bytes bigger. See #77629
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum RawStrError {
    /// Something other than `#` sits between the `r` and the `"`, e.g. `r#~"..`.
    InvalidStarter { bad_char: char },
    /// The string was never terminated. `possible_terminator_offset` is the number of
    /// characters after `r` or `br` where the author may have intended to terminate it.
    NoTerminator { expected: usize, found: usize, possible_terminator_offset: Option<usize> },
    /// More than 65535 `#`s were used.
    TooManyDelimiters { found: usize },
}
175 | ||
/// Numeric base of a literal, as indicated by its prefix.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum Base {
    /// The literal begins with `0b`.
    Binary,
    /// The literal begins with `0o`.
    Octal,
    /// The literal begins with `0x`.
    Hexadecimal,
    /// The literal has no base prefix.
    Decimal,
}
188 | ||
189 | /// `rustc` allows files to have a shebang, e.g. "#!/usr/bin/rustrun", | |
190 | /// but shebang isn't a part of rust syntax. | |
191 | pub fn strip_shebang(input: &str) -> Option<usize> { | |
192 | // Shebang must start with `#!` literally, without any preceding whitespace. | |
193 | // For simplicity we consider any line starting with `#!` a shebang, | |
194 | // regardless of restrictions put on shebangs by specific platforms. | |
195 | if let Some(input_tail) = input.strip_prefix("#!") { | |
196 | // Ok, this is a shebang but if the next non-whitespace token is `[`, | |
197 | // then it may be valid Rust code, so consider it Rust code. | |
198 | let next_non_whitespace_token = tokenize(input_tail).map(|tok| tok.kind).find(|tok| { | |
199 | !matches!( | |
200 | tok, | |
201 | TokenKind::Whitespace | |
202 | | TokenKind::LineComment { doc_style: None } | |
203 | | TokenKind::BlockComment { doc_style: None, .. } | |
204 | ) | |
205 | }); | |
206 | if next_non_whitespace_token != Some(TokenKind::OpenBracket) { | |
207 | // No other choice than to consider this a shebang. | |
208 | return Some(2 + input_tail.lines().next().unwrap_or_default().len()); | |
209 | } | |
210 | } | |
211 | None | |
212 | } | |
213 | ||
214 | /// Parses the first token from the provided input string. | |
215 | pub fn first_token(input: &str) -> Token { | |
216 | debug_assert!(!input.is_empty()); | |
217 | Cursor::new(input).advance_token() | |
218 | } | |
219 | ||
220 | /// Creates an iterator that produces tokens from the input string. | |
221 | pub fn tokenize(mut input: &str) -> impl Iterator<Item = Token> + '_ { | |
222 | std::iter::from_fn(move || { | |
223 | if input.is_empty() { | |
224 | return None; | |
225 | } | |
226 | let token = first_token(input); | |
227 | input = &input[token.len..]; | |
228 | Some(token) | |
229 | }) | |
230 | } | |
231 | ||
/// Returns true if `c` counts as whitespace in the Rust language.
/// See [Rust language reference](https://doc.rust-lang.org/reference/whitespace.html)
/// for definitions of these classes.
pub fn is_whitespace(c: char) -> bool {
    // This is Pattern_White_Space.
    //
    // The set is stable (i.e. it does not change between Unicode versions),
    // so hard-coding the values is fine.
    matches!(
        c,
        // ASCII: \t, \n, vertical tab, form feed, \r
        '\u{0009}'..='\u{000D}'
        // ASCII space
        | '\u{0020}'
        // NEXT LINE from latin1
        | '\u{0085}'
        // Bidi markers: LEFT-TO-RIGHT MARK, RIGHT-TO-LEFT MARK
        | '\u{200E}'
        | '\u{200F}'
        // Dedicated Unicode whitespace: LINE SEPARATOR, PARAGRAPH SEPARATOR
        | '\u{2028}'
        | '\u{2029}'
    )
}
263 | ||
264 | /// True if `c` is valid as a first character of an identifier. | |
265 | /// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for | |
266 | /// a formal definition of valid identifier name. | |
267 | pub fn is_id_start(c: char) -> bool { | |
268 | // This is XID_Start OR '_' (which formally is not a XID_Start). | |
269 | // We also add fast-path for ascii idents | |
270 | ('a'..='z').contains(&c) | |
271 | || ('A'..='Z').contains(&c) | |
272 | || c == '_' | |
273 | || (c > '\x7f' && unicode_xid::UnicodeXID::is_xid_start(c)) | |
274 | } | |
275 | ||
276 | /// True if `c` is valid as a non-first character of an identifier. | |
277 | /// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for | |
278 | /// a formal definition of valid identifier name. | |
279 | pub fn is_id_continue(c: char) -> bool { | |
280 | // This is exactly XID_Continue. | |
281 | // We also add fast-path for ascii idents | |
282 | ('a'..='z').contains(&c) | |
283 | || ('A'..='Z').contains(&c) | |
284 | || ('0'..='9').contains(&c) | |
285 | || c == '_' | |
286 | || (c > '\x7f' && unicode_xid::UnicodeXID::is_xid_continue(c)) | |
287 | } | |
288 | ||
289 | /// The passed string is lexically an identifier. | |
290 | pub fn is_ident(string: &str) -> bool { | |
291 | let mut chars = string.chars(); | |
292 | if let Some(start) = chars.next() { | |
293 | is_id_start(start) && chars.all(is_id_continue) | |
294 | } else { | |
295 | false | |
296 | } | |
297 | } | |
298 | ||
299 | impl Cursor<'_> { | |
300 | /// Parses a token from the input string. | |
301 | fn advance_token(&mut self) -> Token { | |
302 | let first_char = self.bump().unwrap(); | |
303 | let token_kind = match first_char { | |
304 | // Slash, comment or block comment. | |
305 | '/' => match self.first() { | |
306 | '/' => self.line_comment(), | |
307 | '*' => self.block_comment(), | |
308 | _ => Slash, | |
309 | }, | |
310 | ||
311 | // Whitespace sequence. | |
312 | c if is_whitespace(c) => self.whitespace(), | |
313 | ||
314 | // Raw identifier, raw string literal or identifier. | |
315 | 'r' => match (self.first(), self.second()) { | |
316 | ('#', c1) if is_id_start(c1) => self.raw_ident(), | |
317 | ('#', _) | ('"', _) => { | |
318 | let (n_hashes, err) = self.raw_double_quoted_string(1); | |
319 | let suffix_start = self.len_consumed(); | |
320 | if err.is_none() { | |
321 | self.eat_literal_suffix(); | |
322 | } | |
323 | let kind = RawStr { n_hashes, err }; | |
324 | Literal { kind, suffix_start } | |
325 | } | |
326 | _ => self.ident(), | |
327 | }, | |
328 | ||
329 | // Byte literal, byte string literal, raw byte string literal or identifier. | |
330 | 'b' => match (self.first(), self.second()) { | |
331 | ('\'', _) => { | |
332 | self.bump(); | |
333 | let terminated = self.single_quoted_string(); | |
334 | let suffix_start = self.len_consumed(); | |
335 | if terminated { | |
336 | self.eat_literal_suffix(); | |
337 | } | |
338 | let kind = Byte { terminated }; | |
339 | Literal { kind, suffix_start } | |
340 | } | |
341 | ('"', _) => { | |
342 | self.bump(); | |
343 | let terminated = self.double_quoted_string(); | |
344 | let suffix_start = self.len_consumed(); | |
345 | if terminated { | |
346 | self.eat_literal_suffix(); | |
347 | } | |
348 | let kind = ByteStr { terminated }; | |
349 | Literal { kind, suffix_start } | |
350 | } | |
351 | ('r', '"') | ('r', '#') => { | |
352 | self.bump(); | |
353 | let (n_hashes, err) = self.raw_double_quoted_string(2); | |
354 | let suffix_start = self.len_consumed(); | |
355 | if err.is_none() { | |
356 | self.eat_literal_suffix(); | |
357 | } | |
358 | let kind = RawByteStr { n_hashes, err }; | |
359 | Literal { kind, suffix_start } | |
360 | } | |
361 | _ => self.ident(), | |
362 | }, | |
363 | ||
364 | // Identifier (this should be checked after other variant that can | |
365 | // start as identifier). | |
366 | c if is_id_start(c) => self.ident(), | |
367 | ||
368 | // Numeric literal. | |
369 | c @ '0'..='9' => { | |
370 | let literal_kind = self.number(c); | |
371 | let suffix_start = self.len_consumed(); | |
372 | self.eat_literal_suffix(); | |
373 | TokenKind::Literal { kind: literal_kind, suffix_start } | |
374 | } | |
375 | ||
376 | // One-symbol tokens. | |
377 | ';' => Semi, | |
378 | ',' => Comma, | |
379 | '.' => Dot, | |
380 | '(' => OpenParen, | |
381 | ')' => CloseParen, | |
382 | '{' => OpenBrace, | |
383 | '}' => CloseBrace, | |
384 | '[' => OpenBracket, | |
385 | ']' => CloseBracket, | |
386 | '@' => At, | |
387 | '#' => Pound, | |
388 | '~' => Tilde, | |
389 | '?' => Question, | |
390 | ':' => Colon, | |
391 | '$' => Dollar, | |
392 | '=' => Eq, | |
393 | '!' => Bang, | |
394 | '<' => Lt, | |
395 | '>' => Gt, | |
396 | '-' => Minus, | |
397 | '&' => And, | |
398 | '|' => Or, | |
399 | '+' => Plus, | |
400 | '*' => Star, | |
401 | '^' => Caret, | |
402 | '%' => Percent, | |
403 | ||
404 | // Lifetime or character literal. | |
405 | '\'' => self.lifetime_or_char(), | |
406 | ||
407 | // String literal. | |
408 | '"' => { | |
409 | let terminated = self.double_quoted_string(); | |
410 | let suffix_start = self.len_consumed(); | |
411 | if terminated { | |
412 | self.eat_literal_suffix(); | |
413 | } | |
414 | let kind = Str { terminated }; | |
415 | Literal { kind, suffix_start } | |
416 | } | |
417 | _ => Unknown, | |
418 | }; | |
419 | Token::new(token_kind, self.len_consumed()) | |
420 | } | |
421 | ||
422 | fn line_comment(&mut self) -> TokenKind { | |
423 | debug_assert!(self.prev() == '/' && self.first() == '/'); | |
424 | self.bump(); | |
425 | ||
426 | let doc_style = match self.first() { | |
427 | // `//!` is an inner line doc comment. | |
428 | '!' => Some(DocStyle::Inner), | |
429 | // `////` (more than 3 slashes) is not considered a doc comment. | |
430 | '/' if self.second() != '/' => Some(DocStyle::Outer), | |
431 | _ => None, | |
432 | }; | |
433 | ||
434 | self.eat_while(|c| c != '\n'); | |
435 | LineComment { doc_style } | |
436 | } | |
437 | ||
438 | fn block_comment(&mut self) -> TokenKind { | |
439 | debug_assert!(self.prev() == '/' && self.first() == '*'); | |
440 | self.bump(); | |
441 | ||
442 | let doc_style = match self.first() { | |
443 | // `/*!` is an inner block doc comment. | |
444 | '!' => Some(DocStyle::Inner), | |
445 | // `/***` (more than 2 stars) is not considered a doc comment. | |
446 | // `/**/` is not considered a doc comment. | |
447 | '*' if !matches!(self.second(), '*' | '/') => Some(DocStyle::Outer), | |
448 | _ => None, | |
449 | }; | |
450 | ||
451 | let mut depth = 1usize; | |
452 | while let Some(c) = self.bump() { | |
453 | match c { | |
454 | '/' if self.first() == '*' => { | |
455 | self.bump(); | |
456 | depth += 1; | |
457 | } | |
458 | '*' if self.first() == '/' => { | |
459 | self.bump(); | |
460 | depth -= 1; | |
461 | if depth == 0 { | |
462 | // This block comment is closed, so for a construction like "/* */ */" | |
463 | // there will be a successfully parsed block comment "/* */" | |
464 | // and " */" will be processed separately. | |
465 | break; | |
466 | } | |
467 | } | |
468 | _ => (), | |
469 | } | |
470 | } | |
471 | ||
472 | BlockComment { doc_style, terminated: depth == 0 } | |
473 | } | |
474 | ||
475 | fn whitespace(&mut self) -> TokenKind { | |
476 | debug_assert!(is_whitespace(self.prev())); | |
477 | self.eat_while(is_whitespace); | |
478 | Whitespace | |
479 | } | |
480 | ||
481 | fn raw_ident(&mut self) -> TokenKind { | |
482 | debug_assert!(self.prev() == 'r' && self.first() == '#' && is_id_start(self.second())); | |
483 | // Eat "#" symbol. | |
484 | self.bump(); | |
485 | // Eat the identifier part of RawIdent. | |
486 | self.eat_identifier(); | |
487 | RawIdent | |
488 | } | |
489 | ||
490 | fn ident(&mut self) -> TokenKind { | |
491 | debug_assert!(is_id_start(self.prev())); | |
492 | // Start is already eaten, eat the rest of identifier. | |
493 | self.eat_while(is_id_continue); | |
494 | Ident | |
495 | } | |
496 | ||
497 | fn number(&mut self, first_digit: char) -> LiteralKind { | |
498 | debug_assert!('0' <= self.prev() && self.prev() <= '9'); | |
499 | let mut base = Base::Decimal; | |
500 | if first_digit == '0' { | |
501 | // Attempt to parse encoding base. | |
502 | let has_digits = match self.first() { | |
503 | 'b' => { | |
504 | base = Base::Binary; | |
505 | self.bump(); | |
506 | self.eat_decimal_digits() | |
507 | } | |
508 | 'o' => { | |
509 | base = Base::Octal; | |
510 | self.bump(); | |
511 | self.eat_decimal_digits() | |
512 | } | |
513 | 'x' => { | |
514 | base = Base::Hexadecimal; | |
515 | self.bump(); | |
516 | self.eat_hexadecimal_digits() | |
517 | } | |
518 | // Not a base prefix. | |
519 | '0'..='9' | '_' | '.' | 'e' | 'E' => { | |
520 | self.eat_decimal_digits(); | |
521 | true | |
522 | } | |
523 | // Just a 0. | |
524 | _ => return Int { base, empty_int: false }, | |
525 | }; | |
526 | // Base prefix was provided, but there were no digits | |
527 | // after it, e.g. "0x". | |
528 | if !has_digits { | |
529 | return Int { base, empty_int: true }; | |
530 | } | |
531 | } else { | |
532 | // No base prefix, parse number in the usual way. | |
533 | self.eat_decimal_digits(); | |
534 | }; | |
535 | ||
536 | match self.first() { | |
537 | // Don't be greedy if this is actually an | |
538 | // integer literal followed by field/method access or a range pattern | |
539 | // (`0..2` and `12.foo()`) | |
540 | '.' if self.second() != '.' && !is_id_start(self.second()) => { | |
541 | // might have stuff after the ., and if it does, it needs to start | |
542 | // with a number | |
543 | self.bump(); | |
544 | let mut empty_exponent = false; | |
545 | if self.first().is_digit(10) { | |
546 | self.eat_decimal_digits(); | |
547 | match self.first() { | |
548 | 'e' | 'E' => { | |
549 | self.bump(); | |
550 | empty_exponent = !self.eat_float_exponent(); | |
551 | } | |
552 | _ => (), | |
553 | } | |
554 | } | |
555 | Float { base, empty_exponent } | |
556 | } | |
557 | 'e' | 'E' => { | |
558 | self.bump(); | |
559 | let empty_exponent = !self.eat_float_exponent(); | |
560 | Float { base, empty_exponent } | |
561 | } | |
562 | _ => Int { base, empty_int: false }, | |
563 | } | |
564 | } | |
565 | ||
566 | fn lifetime_or_char(&mut self) -> TokenKind { | |
567 | debug_assert!(self.prev() == '\''); | |
568 | ||
569 | let can_be_a_lifetime = if self.second() == '\'' { | |
570 | // It's surely not a lifetime. | |
571 | false | |
572 | } else { | |
573 | // If the first symbol is valid for identifier, it can be a lifetime. | |
574 | // Also check if it's a number for a better error reporting (so '0 will | |
575 | // be reported as invalid lifetime and not as unterminated char literal). | |
576 | is_id_start(self.first()) || self.first().is_digit(10) | |
577 | }; | |
578 | ||
579 | if !can_be_a_lifetime { | |
580 | let terminated = self.single_quoted_string(); | |
581 | let suffix_start = self.len_consumed(); | |
582 | if terminated { | |
583 | self.eat_literal_suffix(); | |
584 | } | |
585 | let kind = Char { terminated }; | |
586 | return Literal { kind, suffix_start }; | |
587 | } | |
588 | ||
589 | // Either a lifetime or a character literal with | |
590 | // length greater than 1. | |
591 | ||
592 | let starts_with_number = self.first().is_digit(10); | |
593 | ||
594 | // Skip the literal contents. | |
595 | // First symbol can be a number (which isn't a valid identifier start), | |
596 | // so skip it without any checks. | |
597 | self.bump(); | |
598 | self.eat_while(is_id_continue); | |
599 | ||
600 | // Check if after skipping literal contents we've met a closing | |
601 | // single quote (which means that user attempted to create a | |
602 | // string with single quotes). | |
603 | if self.first() == '\'' { | |
604 | self.bump(); | |
605 | let kind = Char { terminated: true }; | |
606 | Literal { kind, suffix_start: self.len_consumed() } | |
607 | } else { | |
608 | Lifetime { starts_with_number } | |
609 | } | |
610 | } | |
611 | ||
612 | fn single_quoted_string(&mut self) -> bool { | |
613 | debug_assert!(self.prev() == '\''); | |
614 | // Check if it's a one-symbol literal. | |
615 | if self.second() == '\'' && self.first() != '\\' { | |
616 | self.bump(); | |
617 | self.bump(); | |
618 | return true; | |
619 | } | |
620 | ||
621 | // Literal has more than one symbol. | |
622 | ||
623 | // Parse until either quotes are terminated or error is detected. | |
624 | loop { | |
625 | match self.first() { | |
626 | // Quotes are terminated, finish parsing. | |
627 | '\'' => { | |
628 | self.bump(); | |
629 | return true; | |
630 | } | |
631 | // Probably beginning of the comment, which we don't want to include | |
632 | // to the error report. | |
633 | '/' => break, | |
634 | // Newline without following '\'' means unclosed quote, stop parsing. | |
635 | '\n' if self.second() != '\'' => break, | |
636 | // End of file, stop parsing. | |
637 | EOF_CHAR if self.is_eof() => break, | |
638 | // Escaped slash is considered one character, so bump twice. | |
639 | '\\' => { | |
640 | self.bump(); | |
641 | self.bump(); | |
642 | } | |
643 | // Skip the character. | |
644 | _ => { | |
645 | self.bump(); | |
646 | } | |
647 | } | |
648 | } | |
649 | // String was not terminated. | |
650 | false | |
651 | } | |
652 | ||
653 | /// Eats double-quoted string and returns true | |
654 | /// if string is terminated. | |
655 | fn double_quoted_string(&mut self) -> bool { | |
656 | debug_assert!(self.prev() == '"'); | |
657 | while let Some(c) = self.bump() { | |
658 | match c { | |
659 | '"' => { | |
660 | return true; | |
661 | } | |
662 | '\\' if self.first() == '\\' || self.first() == '"' => { | |
663 | // Bump again to skip escaped character. | |
664 | self.bump(); | |
665 | } | |
666 | _ => (), | |
667 | } | |
668 | } | |
669 | // End of file reached. | |
670 | false | |
671 | } | |
672 | ||
673 | /// Eats the double-quoted string and returns `n_hashes` and an error if encountered. | |
674 | fn raw_double_quoted_string(&mut self, prefix_len: usize) -> (u16, Option<RawStrError>) { | |
675 | // Wrap the actual function to handle the error with too many hashes. | |
676 | // This way, it eats the whole raw string. | |
677 | let (n_hashes, err) = self.raw_string_unvalidated(prefix_len); | |
678 | // Only up to 65535 `#`s are allowed in raw strings | |
679 | match u16::try_from(n_hashes) { | |
680 | Ok(num) => (num, err), | |
681 | // We lie about the number of hashes here :P | |
682 | Err(_) => (0, Some(RawStrError::TooManyDelimiters { found: n_hashes })), | |
683 | } | |
684 | } | |
685 | ||
686 | fn raw_string_unvalidated(&mut self, prefix_len: usize) -> (usize, Option<RawStrError>) { | |
687 | debug_assert!(self.prev() == 'r'); | |
688 | let start_pos = self.len_consumed(); | |
689 | let mut possible_terminator_offset = None; | |
690 | let mut max_hashes = 0; | |
691 | ||
692 | // Count opening '#' symbols. | |
693 | let mut eaten = 0; | |
694 | while self.first() == '#' { | |
695 | eaten += 1; | |
696 | self.bump(); | |
697 | } | |
698 | let n_start_hashes = eaten; | |
699 | ||
700 | // Check that string is started. | |
701 | match self.bump() { | |
702 | Some('"') => (), | |
703 | c => { | |
704 | let c = c.unwrap_or(EOF_CHAR); | |
705 | return (n_start_hashes, Some(RawStrError::InvalidStarter { bad_char: c })); | |
706 | } | |
707 | } | |
708 | ||
709 | // Skip the string contents and on each '#' character met, check if this is | |
710 | // a raw string termination. | |
711 | loop { | |
712 | self.eat_while(|c| c != '"'); | |
713 | ||
714 | if self.is_eof() { | |
715 | return ( | |
716 | n_start_hashes, | |
717 | Some(RawStrError::NoTerminator { | |
718 | expected: n_start_hashes, | |
719 | found: max_hashes, | |
720 | possible_terminator_offset, | |
721 | }), | |
722 | ); | |
723 | } | |
724 | ||
725 | // Eat closing double quote. | |
726 | self.bump(); | |
727 | ||
728 | // Check that amount of closing '#' symbols | |
729 | // is equal to the amount of opening ones. | |
730 | // Note that this will not consume extra trailing `#` characters: | |
731 | // `r###"abcde"####` is lexed as a `RawStr { n_hashes: 3 }` | |
732 | // followed by a `#` token. | |
733 | let mut n_end_hashes = 0; | |
734 | while self.first() == '#' && n_end_hashes < n_start_hashes { | |
735 | n_end_hashes += 1; | |
736 | self.bump(); | |
737 | } | |
738 | ||
739 | if n_end_hashes == n_start_hashes { | |
740 | return (n_start_hashes, None); | |
741 | } else if n_end_hashes > max_hashes { | |
742 | // Keep track of possible terminators to give a hint about | |
743 | // where there might be a missing terminator | |
744 | possible_terminator_offset = | |
745 | Some(self.len_consumed() - start_pos - n_end_hashes + prefix_len); | |
746 | max_hashes = n_end_hashes; | |
747 | } | |
748 | } | |
749 | } | |
750 | ||
751 | fn eat_decimal_digits(&mut self) -> bool { | |
752 | let mut has_digits = false; | |
753 | loop { | |
754 | match self.first() { | |
755 | '_' => { | |
756 | self.bump(); | |
757 | } | |
758 | '0'..='9' => { | |
759 | has_digits = true; | |
760 | self.bump(); | |
761 | } | |
762 | _ => break, | |
763 | } | |
764 | } | |
765 | has_digits | |
766 | } | |
767 | ||
768 | fn eat_hexadecimal_digits(&mut self) -> bool { | |
769 | let mut has_digits = false; | |
770 | loop { | |
771 | match self.first() { | |
772 | '_' => { | |
773 | self.bump(); | |
774 | } | |
775 | '0'..='9' | 'a'..='f' | 'A'..='F' => { | |
776 | has_digits = true; | |
777 | self.bump(); | |
778 | } | |
779 | _ => break, | |
780 | } | |
781 | } | |
782 | has_digits | |
783 | } | |
784 | ||
785 | /// Eats the float exponent. Returns true if at least one digit was met, | |
786 | /// and returns false otherwise. | |
787 | fn eat_float_exponent(&mut self) -> bool { | |
788 | debug_assert!(self.prev() == 'e' || self.prev() == 'E'); | |
789 | if self.first() == '-' || self.first() == '+' { | |
790 | self.bump(); | |
791 | } | |
792 | self.eat_decimal_digits() | |
793 | } | |
794 | ||
795 | // Eats the suffix of the literal, e.g. "_u8". | |
796 | fn eat_literal_suffix(&mut self) { | |
797 | self.eat_identifier(); | |
798 | } | |
799 | ||
800 | // Eats the identifier. | |
801 | fn eat_identifier(&mut self) { | |
802 | if !is_id_start(self.first()) { | |
803 | return; | |
804 | } | |
805 | self.bump(); | |
806 | ||
807 | self.eat_while(is_id_continue); | |
808 | } | |
809 | ||
810 | /// Eats symbols while predicate returns true or until the end of file is reached. | |
811 | fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) { | |
812 | while predicate(self.first()) && !self.is_eof() { | |
813 | self.bump(); | |
814 | } | |
815 | } | |
816 | } |