//! Low-level Rust lexer.
//!
//! The idea with `librustc_lexer` is to make a reusable library,
//! by separating out pure lexing and rustc-specific concerns, like spans,
//! error reporting, and interning. So, rustc_lexer operates directly on `&str`,
//! produces simple tokens which are a pair of type-tag and a bit of original text,
//! and does not report errors, instead storing them as flags on the token.
//!
//! Tokens produced by this lexer are not yet ready for parsing the Rust syntax.
//! For that see [`librustc_parse::lexer`], which converts this basic token stream
//! into wide tokens used by actual parser.
//!
//! The purpose of this crate is to convert raw sources into a labeled sequence
//! of well-known token types, so building an actual Rust token stream will
//! be easier.
//!
//! The main entity of this crate is the [`TokenKind`] enum which represents common
//! lexeme types.
//!
//! [`librustc_parse::lexer`]: ../rustc_parse/lexer/index.html

// We want to be able to build this crate with a stable compiler, so no
// `#![feature]` attributes should be added.
23 | ||
24 | mod cursor; | |
25 | pub mod unescape; | |
26 | ||
27 | #[cfg(test)] | |
28 | mod tests; | |
29 | ||
30 | use self::LiteralKind::*; | |
31 | use self::TokenKind::*; | |
32 | use crate::cursor::{Cursor, EOF_CHAR}; | |
33 | use std::convert::TryFrom; | |
34 | ||
35 | /// Parsed token. | |
36 | /// It doesn't contain information about data that has been parsed, | |
37 | /// only the type of the token and its size. | |
38 | pub struct Token { | |
39 | pub kind: TokenKind, | |
40 | pub len: usize, | |
41 | } | |
42 | ||
43 | impl Token { | |
44 | fn new(kind: TokenKind, len: usize) -> Token { | |
45 | Token { kind, len } | |
46 | } | |
47 | } | |
48 | ||
49 | /// Enum representing common lexeme types. | |
50 | #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] | |
51 | pub enum TokenKind { | |
52 | // Multi-char tokens: | |
53 | /// "// comment" | |
54 | LineComment, | |
3dfed10e XL |
55 | /// `/* block comment */` |
56 | /// | |
57 | /// Block comments can be recursive, so the sequence like `/* /* */` | |
f035d41b XL |
58 | /// will not be considered terminated and will result in a parsing error. |
59 | BlockComment { terminated: bool }, | |
60 | /// Any whitespace characters sequence. | |
61 | Whitespace, | |
62 | /// "ident" or "continue" | |
63 | /// At this step keywords are also considered identifiers. | |
64 | Ident, | |
65 | /// "r#ident" | |
66 | RawIdent, | |
67 | /// "12_u8", "1.0e-40", "b"123"". See `LiteralKind` for more details. | |
68 | Literal { kind: LiteralKind, suffix_start: usize }, | |
69 | /// "'a" | |
70 | Lifetime { starts_with_number: bool }, | |
71 | ||
72 | // One-char tokens: | |
73 | /// ";" | |
74 | Semi, | |
75 | /// "," | |
76 | Comma, | |
77 | /// "." | |
78 | Dot, | |
79 | /// "(" | |
80 | OpenParen, | |
81 | /// ")" | |
82 | CloseParen, | |
83 | /// "{" | |
84 | OpenBrace, | |
85 | /// "}" | |
86 | CloseBrace, | |
87 | /// "[" | |
88 | OpenBracket, | |
89 | /// "]" | |
90 | CloseBracket, | |
91 | /// "@" | |
92 | At, | |
93 | /// "#" | |
94 | Pound, | |
95 | /// "~" | |
96 | Tilde, | |
97 | /// "?" | |
98 | Question, | |
99 | /// ":" | |
100 | Colon, | |
101 | /// "$" | |
102 | Dollar, | |
103 | /// "=" | |
104 | Eq, | |
105 | /// "!" | |
106 | Not, | |
107 | /// "<" | |
108 | Lt, | |
109 | /// ">" | |
110 | Gt, | |
111 | /// "-" | |
112 | Minus, | |
113 | /// "&" | |
114 | And, | |
115 | /// "|" | |
116 | Or, | |
117 | /// "+" | |
118 | Plus, | |
119 | /// "*" | |
120 | Star, | |
121 | /// "/" | |
122 | Slash, | |
123 | /// "^" | |
124 | Caret, | |
125 | /// "%" | |
126 | Percent, | |
127 | ||
128 | /// Unknown token, not expected by the lexer, e.g. "№" | |
129 | Unknown, | |
130 | } | |
131 | ||
132 | #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] | |
133 | pub enum LiteralKind { | |
134 | /// "12_u8", "0o100", "0b120i99" | |
135 | Int { base: Base, empty_int: bool }, | |
136 | /// "12.34f32", "0b100.100" | |
137 | Float { base: Base, empty_exponent: bool }, | |
138 | /// "'a'", "'\\'", "'''", "';" | |
139 | Char { terminated: bool }, | |
140 | /// "b'a'", "b'\\'", "b'''", "b';" | |
141 | Byte { terminated: bool }, | |
142 | /// ""abc"", ""abc" | |
143 | Str { terminated: bool }, | |
144 | /// "b"abc"", "b"abc" | |
145 | ByteStr { terminated: bool }, | |
146 | /// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a" | |
147 | RawStr { n_hashes: u16, err: Option<RawStrError> }, | |
148 | /// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a" | |
149 | RawByteStr { n_hashes: u16, err: Option<RawStrError> }, | |
150 | } | |
151 | ||
/// Error produced validating a raw string. Represents cases like:
/// - `r##~"abcde"##`: `InvalidStarter`
/// - `r###"abcde"##`: `NoTerminator { expected: 3, found: 2, possible_terminator_offset: Some(11)`
/// - Too many `#`s (>65535): `TooManyDelimiters`
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum RawStrError {
    /// Non `#` characters exist between `r` and `"` eg. `r#~"..`
    InvalidStarter { bad_char: char },
    /// The string was never terminated. `possible_terminator_offset` is the number of characters after `r` or `br` where they
    /// may have intended to terminate it.
    NoTerminator { expected: usize, found: usize, possible_terminator_offset: Option<usize> },
    /// More than 65535 `#`s exist.
    TooManyDelimiters { found: usize },
}
166 | ||
/// Base of numeric literal encoding according to its prefix.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum Base {
    /// Literal starts with "0b".
    Binary,
    /// Literal starts with "0o".
    Octal,
    /// Literal starts with "0x".
    Hexadecimal,
    /// Literal doesn't contain a prefix.
    Decimal,
}
179 | ||
180 | /// `rustc` allows files to have a shebang, e.g. "#!/usr/bin/rustrun", | |
181 | /// but shebang isn't a part of rust syntax. | |
182 | pub fn strip_shebang(input: &str) -> Option<usize> { | |
183 | // Shebang must start with `#!` literally, without any preceding whitespace. | |
3dfed10e XL |
184 | // For simplicity we consider any line starting with `#!` a shebang, |
185 | // regardless of restrictions put on shebangs by specific platforms. | |
186 | if let Some(input_tail) = input.strip_prefix("#!") { | |
187 | // Ok, this is a shebang but if the next non-whitespace token is `[` or maybe | |
188 | // a doc comment (due to `TokenKind::(Line,Block)Comment` ambiguity at lexer level), | |
189 | // then it may be valid Rust code, so consider it Rust code. | |
190 | let next_non_whitespace_token = tokenize(input_tail).map(|tok| tok.kind).find(|tok| | |
191 | !matches!(tok, TokenKind::Whitespace | TokenKind::LineComment | TokenKind::BlockComment { .. }) | |
192 | ); | |
193 | if next_non_whitespace_token != Some(TokenKind::OpenBracket) { | |
194 | // No other choice than to consider this a shebang. | |
195 | return Some(2 + input_tail.lines().next().unwrap_or_default().len()); | |
f035d41b XL |
196 | } |
197 | } | |
198 | None | |
199 | } | |
200 | ||
201 | /// Parses the first token from the provided input string. | |
202 | pub fn first_token(input: &str) -> Token { | |
203 | debug_assert!(!input.is_empty()); | |
204 | Cursor::new(input).advance_token() | |
205 | } | |
206 | ||
207 | /// Creates an iterator that produces tokens from the input string. | |
208 | pub fn tokenize(mut input: &str) -> impl Iterator<Item = Token> + '_ { | |
209 | std::iter::from_fn(move || { | |
210 | if input.is_empty() { | |
211 | return None; | |
212 | } | |
213 | let token = first_token(input); | |
214 | input = &input[token.len..]; | |
215 | Some(token) | |
216 | }) | |
217 | } | |
218 | ||
/// True if `c` is considered a whitespace according to Rust language definition.
/// See [Rust language reference](https://doc.rust-lang.org/reference/whitespace.html)
/// for definitions of these classes.
pub fn is_whitespace(c: char) -> bool {
    // This is Pattern_White_Space.
    //
    // Note that this set is stable (ie, it doesn't change with different
    // Unicode versions), so it's ok to just hard-code the values.

    match c {
        // Usual ASCII suspects
        | '\u{0009}' // \t
        | '\u{000A}' // \n
        | '\u{000B}' // vertical tab
        | '\u{000C}' // form feed
        | '\u{000D}' // \r
        | '\u{0020}' // space

        // NEXT LINE from latin1
        | '\u{0085}'

        // Bidi markers
        | '\u{200E}' // LEFT-TO-RIGHT MARK
        | '\u{200F}' // RIGHT-TO-LEFT MARK

        // Dedicated whitespace characters from Unicode
        | '\u{2028}' // LINE SEPARATOR
        | '\u{2029}' // PARAGRAPH SEPARATOR
        => true,
        _ => false,
    }
}
251 | ||
252 | /// True if `c` is valid as a first character of an identifier. | |
253 | /// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for | |
254 | /// a formal definition of valid identifier name. | |
255 | pub fn is_id_start(c: char) -> bool { | |
256 | // This is XID_Start OR '_' (which formally is not a XID_Start). | |
257 | // We also add fast-path for ascii idents | |
258 | ('a' <= c && c <= 'z') | |
259 | || ('A' <= c && c <= 'Z') | |
260 | || c == '_' | |
261 | || (c > '\x7f' && unicode_xid::UnicodeXID::is_xid_start(c)) | |
262 | } | |
263 | ||
264 | /// True if `c` is valid as a non-first character of an identifier. | |
265 | /// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for | |
266 | /// a formal definition of valid identifier name. | |
267 | pub fn is_id_continue(c: char) -> bool { | |
268 | // This is exactly XID_Continue. | |
269 | // We also add fast-path for ascii idents | |
270 | ('a' <= c && c <= 'z') | |
271 | || ('A' <= c && c <= 'Z') | |
272 | || ('0' <= c && c <= '9') | |
273 | || c == '_' | |
274 | || (c > '\x7f' && unicode_xid::UnicodeXID::is_xid_continue(c)) | |
275 | } | |
276 | ||
277 | impl Cursor<'_> { | |
278 | /// Parses a token from the input string. | |
279 | fn advance_token(&mut self) -> Token { | |
280 | let first_char = self.bump().unwrap(); | |
281 | let token_kind = match first_char { | |
282 | // Slash, comment or block comment. | |
283 | '/' => match self.first() { | |
284 | '/' => self.line_comment(), | |
285 | '*' => self.block_comment(), | |
286 | _ => Slash, | |
287 | }, | |
288 | ||
289 | // Whitespace sequence. | |
290 | c if is_whitespace(c) => self.whitespace(), | |
291 | ||
292 | // Raw identifier, raw string literal or identifier. | |
293 | 'r' => match (self.first(), self.second()) { | |
294 | ('#', c1) if is_id_start(c1) => self.raw_ident(), | |
295 | ('#', _) | ('"', _) => { | |
296 | let (n_hashes, err) = self.raw_double_quoted_string(1); | |
297 | let suffix_start = self.len_consumed(); | |
298 | if err.is_none() { | |
299 | self.eat_literal_suffix(); | |
300 | } | |
301 | let kind = RawStr { n_hashes, err }; | |
302 | Literal { kind, suffix_start } | |
303 | } | |
304 | _ => self.ident(), | |
305 | }, | |
306 | ||
307 | // Byte literal, byte string literal, raw byte string literal or identifier. | |
308 | 'b' => match (self.first(), self.second()) { | |
309 | ('\'', _) => { | |
310 | self.bump(); | |
311 | let terminated = self.single_quoted_string(); | |
312 | let suffix_start = self.len_consumed(); | |
313 | if terminated { | |
314 | self.eat_literal_suffix(); | |
315 | } | |
316 | let kind = Byte { terminated }; | |
317 | Literal { kind, suffix_start } | |
318 | } | |
319 | ('"', _) => { | |
320 | self.bump(); | |
321 | let terminated = self.double_quoted_string(); | |
322 | let suffix_start = self.len_consumed(); | |
323 | if terminated { | |
324 | self.eat_literal_suffix(); | |
325 | } | |
326 | let kind = ByteStr { terminated }; | |
327 | Literal { kind, suffix_start } | |
328 | } | |
329 | ('r', '"') | ('r', '#') => { | |
330 | self.bump(); | |
331 | let (n_hashes, err) = self.raw_double_quoted_string(2); | |
332 | let suffix_start = self.len_consumed(); | |
333 | if err.is_none() { | |
334 | self.eat_literal_suffix(); | |
335 | } | |
336 | let kind = RawByteStr { n_hashes, err }; | |
337 | Literal { kind, suffix_start } | |
338 | } | |
339 | _ => self.ident(), | |
340 | }, | |
341 | ||
342 | // Identifier (this should be checked after other variant that can | |
343 | // start as identifier). | |
344 | c if is_id_start(c) => self.ident(), | |
345 | ||
346 | // Numeric literal. | |
347 | c @ '0'..='9' => { | |
348 | let literal_kind = self.number(c); | |
349 | let suffix_start = self.len_consumed(); | |
350 | self.eat_literal_suffix(); | |
351 | TokenKind::Literal { kind: literal_kind, suffix_start } | |
352 | } | |
353 | ||
354 | // One-symbol tokens. | |
355 | ';' => Semi, | |
356 | ',' => Comma, | |
357 | '.' => Dot, | |
358 | '(' => OpenParen, | |
359 | ')' => CloseParen, | |
360 | '{' => OpenBrace, | |
361 | '}' => CloseBrace, | |
362 | '[' => OpenBracket, | |
363 | ']' => CloseBracket, | |
364 | '@' => At, | |
365 | '#' => Pound, | |
366 | '~' => Tilde, | |
367 | '?' => Question, | |
368 | ':' => Colon, | |
369 | '$' => Dollar, | |
370 | '=' => Eq, | |
371 | '!' => Not, | |
372 | '<' => Lt, | |
373 | '>' => Gt, | |
374 | '-' => Minus, | |
375 | '&' => And, | |
376 | '|' => Or, | |
377 | '+' => Plus, | |
378 | '*' => Star, | |
379 | '^' => Caret, | |
380 | '%' => Percent, | |
381 | ||
382 | // Lifetime or character literal. | |
383 | '\'' => self.lifetime_or_char(), | |
384 | ||
385 | // String literal. | |
386 | '"' => { | |
387 | let terminated = self.double_quoted_string(); | |
388 | let suffix_start = self.len_consumed(); | |
389 | if terminated { | |
390 | self.eat_literal_suffix(); | |
391 | } | |
392 | let kind = Str { terminated }; | |
393 | Literal { kind, suffix_start } | |
394 | } | |
395 | _ => Unknown, | |
396 | }; | |
397 | Token::new(token_kind, self.len_consumed()) | |
398 | } | |
399 | ||
400 | fn line_comment(&mut self) -> TokenKind { | |
401 | debug_assert!(self.prev() == '/' && self.first() == '/'); | |
402 | self.bump(); | |
403 | self.eat_while(|c| c != '\n'); | |
404 | LineComment | |
405 | } | |
406 | ||
407 | fn block_comment(&mut self) -> TokenKind { | |
408 | debug_assert!(self.prev() == '/' && self.first() == '*'); | |
409 | self.bump(); | |
410 | let mut depth = 1usize; | |
411 | while let Some(c) = self.bump() { | |
412 | match c { | |
413 | '/' if self.first() == '*' => { | |
414 | self.bump(); | |
415 | depth += 1; | |
416 | } | |
417 | '*' if self.first() == '/' => { | |
418 | self.bump(); | |
419 | depth -= 1; | |
420 | if depth == 0 { | |
421 | // This block comment is closed, so for a construction like "/* */ */" | |
422 | // there will be a successfully parsed block comment "/* */" | |
423 | // and " */" will be processed separately. | |
424 | break; | |
425 | } | |
426 | } | |
427 | _ => (), | |
428 | } | |
429 | } | |
430 | ||
431 | BlockComment { terminated: depth == 0 } | |
432 | } | |
433 | ||
434 | fn whitespace(&mut self) -> TokenKind { | |
435 | debug_assert!(is_whitespace(self.prev())); | |
436 | self.eat_while(is_whitespace); | |
437 | Whitespace | |
438 | } | |
439 | ||
440 | fn raw_ident(&mut self) -> TokenKind { | |
441 | debug_assert!(self.prev() == 'r' && self.first() == '#' && is_id_start(self.second())); | |
442 | // Eat "#" symbol. | |
443 | self.bump(); | |
444 | // Eat the identifier part of RawIdent. | |
445 | self.eat_identifier(); | |
446 | RawIdent | |
447 | } | |
448 | ||
449 | fn ident(&mut self) -> TokenKind { | |
450 | debug_assert!(is_id_start(self.prev())); | |
451 | // Start is already eaten, eat the rest of identifier. | |
452 | self.eat_while(is_id_continue); | |
453 | Ident | |
454 | } | |
455 | ||
456 | fn number(&mut self, first_digit: char) -> LiteralKind { | |
457 | debug_assert!('0' <= self.prev() && self.prev() <= '9'); | |
458 | let mut base = Base::Decimal; | |
459 | if first_digit == '0' { | |
460 | // Attempt to parse encoding base. | |
461 | let has_digits = match self.first() { | |
462 | 'b' => { | |
463 | base = Base::Binary; | |
464 | self.bump(); | |
465 | self.eat_decimal_digits() | |
466 | } | |
467 | 'o' => { | |
468 | base = Base::Octal; | |
469 | self.bump(); | |
470 | self.eat_decimal_digits() | |
471 | } | |
472 | 'x' => { | |
473 | base = Base::Hexadecimal; | |
474 | self.bump(); | |
475 | self.eat_hexadecimal_digits() | |
476 | } | |
477 | // Not a base prefix. | |
478 | '0'..='9' | '_' | '.' | 'e' | 'E' => { | |
479 | self.eat_decimal_digits(); | |
480 | true | |
481 | } | |
482 | // Just a 0. | |
483 | _ => return Int { base, empty_int: false }, | |
484 | }; | |
485 | // Base prefix was provided, but there were no digits | |
486 | // after it, e.g. "0x". | |
487 | if !has_digits { | |
488 | return Int { base, empty_int: true }; | |
489 | } | |
490 | } else { | |
491 | // No base prefix, parse number in the usual way. | |
492 | self.eat_decimal_digits(); | |
493 | }; | |
494 | ||
495 | match self.first() { | |
496 | // Don't be greedy if this is actually an | |
497 | // integer literal followed by field/method access or a range pattern | |
498 | // (`0..2` and `12.foo()`) | |
499 | '.' if self.second() != '.' && !is_id_start(self.second()) => { | |
500 | // might have stuff after the ., and if it does, it needs to start | |
501 | // with a number | |
502 | self.bump(); | |
503 | let mut empty_exponent = false; | |
504 | if self.first().is_digit(10) { | |
505 | self.eat_decimal_digits(); | |
506 | match self.first() { | |
507 | 'e' | 'E' => { | |
508 | self.bump(); | |
509 | empty_exponent = !self.eat_float_exponent(); | |
510 | } | |
511 | _ => (), | |
512 | } | |
513 | } | |
514 | Float { base, empty_exponent } | |
515 | } | |
516 | 'e' | 'E' => { | |
517 | self.bump(); | |
518 | let empty_exponent = !self.eat_float_exponent(); | |
519 | Float { base, empty_exponent } | |
520 | } | |
521 | _ => Int { base, empty_int: false }, | |
522 | } | |
523 | } | |
524 | ||
525 | fn lifetime_or_char(&mut self) -> TokenKind { | |
526 | debug_assert!(self.prev() == '\''); | |
527 | ||
528 | let can_be_a_lifetime = if self.second() == '\'' { | |
529 | // It's surely not a lifetime. | |
530 | false | |
531 | } else { | |
532 | // If the first symbol is valid for identifier, it can be a lifetime. | |
533 | // Also check if it's a number for a better error reporting (so '0 will | |
534 | // be reported as invalid lifetime and not as unterminated char literal). | |
535 | is_id_start(self.first()) || self.first().is_digit(10) | |
536 | }; | |
537 | ||
538 | if !can_be_a_lifetime { | |
539 | let terminated = self.single_quoted_string(); | |
540 | let suffix_start = self.len_consumed(); | |
541 | if terminated { | |
542 | self.eat_literal_suffix(); | |
543 | } | |
544 | let kind = Char { terminated }; | |
545 | return Literal { kind, suffix_start }; | |
546 | } | |
547 | ||
548 | // Either a lifetime or a character literal with | |
549 | // length greater than 1. | |
550 | ||
551 | let starts_with_number = self.first().is_digit(10); | |
552 | ||
553 | // Skip the literal contents. | |
554 | // First symbol can be a number (which isn't a valid identifier start), | |
555 | // so skip it without any checks. | |
556 | self.bump(); | |
557 | self.eat_while(is_id_continue); | |
558 | ||
559 | // Check if after skipping literal contents we've met a closing | |
560 | // single quote (which means that user attempted to create a | |
561 | // string with single quotes). | |
562 | if self.first() == '\'' { | |
563 | self.bump(); | |
564 | let kind = Char { terminated: true }; | |
565 | Literal { kind, suffix_start: self.len_consumed() } | |
566 | } else { | |
567 | Lifetime { starts_with_number } | |
568 | } | |
569 | } | |
570 | ||
571 | fn single_quoted_string(&mut self) -> bool { | |
572 | debug_assert!(self.prev() == '\''); | |
573 | // Check if it's a one-symbol literal. | |
574 | if self.second() == '\'' && self.first() != '\\' { | |
575 | self.bump(); | |
576 | self.bump(); | |
577 | return true; | |
578 | } | |
579 | ||
580 | // Literal has more than one symbol. | |
581 | ||
582 | // Parse until either quotes are terminated or error is detected. | |
583 | loop { | |
584 | match self.first() { | |
585 | // Quotes are terminated, finish parsing. | |
586 | '\'' => { | |
587 | self.bump(); | |
588 | return true; | |
589 | } | |
590 | // Probably beginning of the comment, which we don't want to include | |
591 | // to the error report. | |
592 | '/' => break, | |
593 | // Newline without following '\'' means unclosed quote, stop parsing. | |
594 | '\n' if self.second() != '\'' => break, | |
595 | // End of file, stop parsing. | |
596 | EOF_CHAR if self.is_eof() => break, | |
597 | // Escaped slash is considered one character, so bump twice. | |
598 | '\\' => { | |
599 | self.bump(); | |
600 | self.bump(); | |
601 | } | |
602 | // Skip the character. | |
603 | _ => { | |
604 | self.bump(); | |
605 | } | |
606 | } | |
607 | } | |
608 | // String was not terminated. | |
609 | false | |
610 | } | |
611 | ||
612 | /// Eats double-quoted string and returns true | |
613 | /// if string is terminated. | |
614 | fn double_quoted_string(&mut self) -> bool { | |
615 | debug_assert!(self.prev() == '"'); | |
616 | while let Some(c) = self.bump() { | |
617 | match c { | |
618 | '"' => { | |
619 | return true; | |
620 | } | |
621 | '\\' if self.first() == '\\' || self.first() == '"' => { | |
622 | // Bump again to skip escaped character. | |
623 | self.bump(); | |
624 | } | |
625 | _ => (), | |
626 | } | |
627 | } | |
628 | // End of file reached. | |
629 | false | |
630 | } | |
631 | ||
632 | /// Eats the double-quoted string and returns `n_hashes` and an error if encountered. | |
633 | fn raw_double_quoted_string(&mut self, prefix_len: usize) -> (u16, Option<RawStrError>) { | |
634 | // Wrap the actual function to handle the error with too many hashes. | |
635 | // This way, it eats the whole raw string. | |
636 | let (n_hashes, err) = self.raw_string_unvalidated(prefix_len); | |
637 | // Only up to 65535 `#`s are allowed in raw strings | |
638 | match u16::try_from(n_hashes) { | |
639 | Ok(num) => (num, err), | |
640 | // We lie about the number of hashes here :P | |
641 | Err(_) => (0, Some(RawStrError::TooManyDelimiters { found: n_hashes })), | |
642 | } | |
643 | } | |
644 | ||
645 | fn raw_string_unvalidated(&mut self, prefix_len: usize) -> (usize, Option<RawStrError>) { | |
646 | debug_assert!(self.prev() == 'r'); | |
647 | let start_pos = self.len_consumed(); | |
648 | let mut possible_terminator_offset = None; | |
649 | let mut max_hashes = 0; | |
650 | ||
651 | // Count opening '#' symbols. | |
652 | let n_start_hashes = self.eat_while(|c| c == '#'); | |
653 | ||
654 | // Check that string is started. | |
655 | match self.bump() { | |
656 | Some('"') => (), | |
657 | c => { | |
658 | let c = c.unwrap_or(EOF_CHAR); | |
659 | return (n_start_hashes, Some(RawStrError::InvalidStarter { bad_char: c })); | |
660 | } | |
661 | } | |
662 | ||
663 | // Skip the string contents and on each '#' character met, check if this is | |
664 | // a raw string termination. | |
665 | loop { | |
666 | self.eat_while(|c| c != '"'); | |
667 | ||
668 | if self.is_eof() { | |
669 | return ( | |
670 | n_start_hashes, | |
671 | Some(RawStrError::NoTerminator { | |
672 | expected: n_start_hashes, | |
673 | found: max_hashes, | |
674 | possible_terminator_offset, | |
675 | }), | |
676 | ); | |
677 | } | |
678 | ||
679 | // Eat closing double quote. | |
680 | self.bump(); | |
681 | ||
682 | // Check that amount of closing '#' symbols | |
683 | // is equal to the amount of opening ones. | |
684 | // Note that this will not consume extra trailing `#` characters: | |
685 | // `r###"abcde"####` is lexed as a `RawStr { n_hashes: 3 }` | |
686 | // followed by a `#` token. | |
687 | let mut hashes_left = n_start_hashes; | |
688 | let is_closing_hash = |c| { | |
689 | if c == '#' && hashes_left != 0 { | |
690 | hashes_left -= 1; | |
691 | true | |
692 | } else { | |
693 | false | |
694 | } | |
695 | }; | |
696 | let n_end_hashes = self.eat_while(is_closing_hash); | |
697 | ||
698 | if n_end_hashes == n_start_hashes { | |
699 | return (n_start_hashes, None); | |
700 | } else if n_end_hashes > max_hashes { | |
701 | // Keep track of possible terminators to give a hint about | |
702 | // where there might be a missing terminator | |
703 | possible_terminator_offset = | |
704 | Some(self.len_consumed() - start_pos - n_end_hashes + prefix_len); | |
705 | max_hashes = n_end_hashes; | |
706 | } | |
707 | } | |
708 | } | |
709 | ||
710 | fn eat_decimal_digits(&mut self) -> bool { | |
711 | let mut has_digits = false; | |
712 | loop { | |
713 | match self.first() { | |
714 | '_' => { | |
715 | self.bump(); | |
716 | } | |
717 | '0'..='9' => { | |
718 | has_digits = true; | |
719 | self.bump(); | |
720 | } | |
721 | _ => break, | |
722 | } | |
723 | } | |
724 | has_digits | |
725 | } | |
726 | ||
727 | fn eat_hexadecimal_digits(&mut self) -> bool { | |
728 | let mut has_digits = false; | |
729 | loop { | |
730 | match self.first() { | |
731 | '_' => { | |
732 | self.bump(); | |
733 | } | |
734 | '0'..='9' | 'a'..='f' | 'A'..='F' => { | |
735 | has_digits = true; | |
736 | self.bump(); | |
737 | } | |
738 | _ => break, | |
739 | } | |
740 | } | |
741 | has_digits | |
742 | } | |
743 | ||
744 | /// Eats the float exponent. Returns true if at least one digit was met, | |
745 | /// and returns false otherwise. | |
746 | fn eat_float_exponent(&mut self) -> bool { | |
747 | debug_assert!(self.prev() == 'e' || self.prev() == 'E'); | |
748 | if self.first() == '-' || self.first() == '+' { | |
749 | self.bump(); | |
750 | } | |
751 | self.eat_decimal_digits() | |
752 | } | |
753 | ||
754 | // Eats the suffix of the literal, e.g. "_u8". | |
755 | fn eat_literal_suffix(&mut self) { | |
756 | self.eat_identifier(); | |
757 | } | |
758 | ||
759 | // Eats the identifier. | |
760 | fn eat_identifier(&mut self) { | |
761 | if !is_id_start(self.first()) { | |
762 | return; | |
763 | } | |
764 | self.bump(); | |
765 | ||
766 | self.eat_while(is_id_continue); | |
767 | } | |
768 | ||
769 | /// Eats symbols while predicate returns true or until the end of file is reached. | |
770 | /// Returns amount of eaten symbols. | |
771 | fn eat_while<F>(&mut self, mut predicate: F) -> usize | |
772 | where | |
773 | F: FnMut(char) -> bool, | |
774 | { | |
775 | let mut eaten: usize = 0; | |
776 | while predicate(self.first()) && !self.is_eof() { | |
777 | eaten += 1; | |
778 | self.bump(); | |
779 | } | |
780 | ||
781 | eaten | |
782 | } | |
783 | } |