]> git.proxmox.com Git - rustc.git/blame - src/librustc_lexer/src/lib.rs
New upstream version 1.44.1+dfsg1
[rustc.git] / src / librustc_lexer / src / lib.rs
CommitLineData
e74abb32
XL
//! Low-level Rust lexer.
//!
//! Tokens produced by this lexer are not yet ready for parsing the Rust syntax;
//! for that, see `librustc_parse::lexer`, which converts this basic token stream
//! into the wide tokens used by the actual parser.
//!
//! The purpose of this crate is to convert raw sources into a labeled sequence
//! of well-known token types, so that building an actual Rust token stream will
//! be easier.
//!
//! The main entity of this crate is the [`TokenKind`] enum, which represents
//! common lexeme types.
13
e1599b0c
XL
14// We want to be able to build this crate with a stable compiler, so no
15// `#![feature]` attributes should be added.
416331ca
XL
16
17mod cursor;
18pub mod unescape;
19
ba9703b0
XL
20#[cfg(test)]
21mod tests;
22
60c5eb7d 23use self::LiteralKind::*;
dfeec247
XL
24use self::TokenKind::*;
25use crate::cursor::{Cursor, EOF_CHAR};
ba9703b0 26use std::convert::TryInto;
416331ca 27
e74abb32
XL
28/// Parsed token.
29/// It doesn't contain information about data that has been parsed,
30/// only the type of the token and its size.
416331ca
XL
31pub struct Token {
32 pub kind: TokenKind,
33 pub len: usize,
34}
35
e74abb32
XL
36impl Token {
37 fn new(kind: TokenKind, len: usize) -> Token {
38 Token { kind, len }
39 }
40}
41
/// Enum representing common lexeme types.
///
/// The variant order is significant: `PartialOrd`/`Ord` are derived from it.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum TokenKind {
    // Multi-char tokens:
    /// `// comment`
    LineComment,
    /// `/* block comment */`
    ///
    /// Block comments can be nested, so a sequence like `/* /* */` is not
    /// considered terminated (`terminated` is `false`) and will result in a
    /// parsing error.
    BlockComment { terminated: bool },
    /// Any sequence of whitespace characters.
    Whitespace,
    /// `ident` or `continue`
    ///
    /// At this step keywords are also considered identifiers.
    Ident,
    /// `r#ident`
    RawIdent,
    /// `12_u8`, `1.0e-40`, `b"123"`. See `LiteralKind` for more details.
    ///
    /// `suffix_start` is the cursor's `len_consumed()` value taken right after
    /// the literal body was eaten, i.e. where a suffix (if any) begins.
    Literal { kind: LiteralKind, suffix_start: usize },
    /// `'a`
    ///
    /// `starts_with_number` is set when the character after the quote is a
    /// decimal digit (e.g. `'0`), which is not a valid lifetime name.
    Lifetime { starts_with_number: bool },

    // One-char tokens:
    /// `;`
    Semi,
    /// `,`
    Comma,
    /// `.`
    Dot,
    /// `(`
    OpenParen,
    /// `)`
    CloseParen,
    /// `{`
    OpenBrace,
    /// `}`
    CloseBrace,
    /// `[`
    OpenBracket,
    /// `]`
    CloseBracket,
    /// `@`
    At,
    /// `#`
    Pound,
    /// `~`
    Tilde,
    /// `?`
    Question,
    /// `:`
    Colon,
    /// `$`
    Dollar,
    /// `=`
    Eq,
    /// `!`
    Not,
    /// `<`
    Lt,
    /// `>`
    Gt,
    /// `-`
    Minus,
    /// `&`
    And,
    /// `|`
    Or,
    /// `+`
    Plus,
    /// `*`
    Star,
    /// `/`
    Slash,
    /// `^`
    Caret,
    /// `%`
    Percent,

    /// Unknown token, not expected by the lexer, e.g. `№`
    Unknown,
}
416331ca
XL
123
/// Kind of a literal token, together with the flags recorded while lexing it.
///
/// The variant order is significant: `PartialOrd`/`Ord` are derived from it.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum LiteralKind {
    /// `12_u8`, `0o100`, `0b120i99`
    ///
    /// `empty_int` is set when a base prefix was given but no digits followed
    /// it, e.g. `0x`.
    Int { base: Base, empty_int: bool },
    /// `12.34f32`, `0b100.100`
    ///
    /// `empty_exponent` is set when an `e`/`E` exponent marker was not followed
    /// by at least one digit.
    Float { base: Base, empty_exponent: bool },
    /// `'a'`, `'\\'`, `'''`, `';` (the last one is unterminated)
    Char { terminated: bool },
    /// `b'a'`, `b'\\'`, `b'''`, `b';` (the last one is unterminated)
    Byte { terminated: bool },
    /// `"abc"`, `"abc` (the last one is unterminated)
    Str { terminated: bool },
    /// `b"abc"`, `b"abc` (the last one is unterminated)
    ByteStr { terminated: bool },
    /// `r"abc"`, `r#"abc"#`, `r####"ab"###"c"####`, `r#"a`
    RawStr(UnvalidatedRawStr),
    /// `br"abc"`, `br#"abc"#`, `br####"ab"###"c"####`, `br#"a`
    RawByteStr(UnvalidatedRawStr),
}
143
/// Something that looks like a raw string literal but has not been checked for
/// well-formedness yet. Call `.validate()` to obtain either a
/// `ValidatedRawStr` or the reason the literal is malformed.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub struct UnvalidatedRawStr {
    /// Whether the prefix (`r###"`) is valid.
    valid_start: bool,

    /// Whether the postfix (`"###`) is valid.
    valid_end: bool,

    /// Number of leading `#` characters.
    n_start_hashes: usize,
    /// Number of trailing `#` characters; `n_end_hashes` <= `n_start_hashes`.
    n_end_hashes: usize,
    /// Offset, counted from the `r` or `br` prefix, at which the user may have
    /// intended to end the string. Currently this points at the longest
    /// `"#+` sequence seen.
    possible_terminator_offset: Option<usize>,
}

/// Error produced while validating a raw string. Represents cases like:
/// - `r##~"abcde"##`: `LexRawStrError::InvalidStarter`
/// - `r###"abcde"##`:
///   `LexRawStrError::NoTerminator { expected: 3, found: 2, possible_terminator_offset: Some(11) }`
/// - more than 65535 `#`s: `LexRawStrError::TooManyDelimiters`
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum LexRawStrError {
    /// Non-`#` characters exist between `r` and `"`, e.g. `r#~"..`.
    InvalidStarter,
    /// The string was never terminated. `possible_terminator_offset` is the
    /// number of characters after `r` or `br` where the user may have intended
    /// to terminate it.
    NoTerminator { expected: usize, found: usize, possible_terminator_offset: Option<usize> },
    /// More than 65535 `#`s exist (the count does not fit in a `u16`).
    TooManyDelimiters,
}

/// Raw string that contains a valid prefix (`#+"`) and postfix (`"#+`) with a
/// matching number of `#` characters in both. Note that extra trailing `#`
/// characters are not consumed: `r###"abcde"####` is lexed as a
/// `ValidatedRawStr { n_hashes: 3 }` followed by a `#` token.
#[derive(Debug, Eq, PartialEq, Copy, Clone)]
pub struct ValidatedRawStr {
    n_hashes: u16,
}

impl ValidatedRawStr {
    /// Number of `#` characters on each side of the raw string.
    pub fn num_hashes(&self) -> u16 {
        let ValidatedRawStr { n_hashes } = *self;
        n_hashes
    }
}

impl UnvalidatedRawStr {
    /// Checks the recorded lexing facts and converts them into a
    /// `ValidatedRawStr`.
    ///
    /// Errors are reported in this order: an invalid starter, then too many
    /// delimiters, then a missing terminator.
    pub fn validate(self) -> Result<ValidatedRawStr, LexRawStrError> {
        let UnvalidatedRawStr {
            valid_start,
            valid_end,
            n_start_hashes,
            n_end_hashes,
            possible_terminator_offset,
        } = self;

        if !valid_start {
            return Err(LexRawStrError::InvalidStarter);
        }

        // Only up to 65535 `#`s are allowed in raw strings.
        let narrowed: Result<u16, _> = n_start_hashes.try_into();
        let n_hashes = match narrowed {
            Ok(count) => count,
            Err(_) => return Err(LexRawStrError::TooManyDelimiters),
        };

        let terminated = valid_end && n_start_hashes <= n_end_hashes;
        if !terminated {
            return Err(LexRawStrError::NoTerminator {
                expected: n_start_hashes,
                found: n_end_hashes,
                possible_terminator_offset,
            });
        }

        // The lexer never produces a literal with more closing than opening
        // hashes, so at this point both counts must be equal.
        debug_assert_eq!(n_start_hashes, n_end_hashes);
        Ok(ValidatedRawStr { n_hashes })
    }
}
416331ca 218
/// Base of numeric literal encoding according to its prefix.
///
/// The variant order is significant: `PartialOrd`/`Ord` are derived from it.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum Base {
    /// Literal starts with `0b`.
    Binary,
    /// Literal starts with `0o`.
    Octal,
    /// Literal starts with `0x`.
    Hexadecimal,
    /// Literal doesn't contain a prefix.
    Decimal,
}
231
e74abb32
XL
/// `rustc` allows files to start with a shebang, e.g. `#!/usr/bin/rustrun`,
/// but a shebang isn't part of Rust syntax, so this function reports how many
/// bytes of the first line should be skipped.
///
/// Returns `None` when the input does not start with `#!`, or when it starts
/// with `#![` (which is valid Rust syntax, e.g. `#![deny(missing_docs)]`).
/// Otherwise returns the byte index of the first `\n`, or the input length
/// when there is no newline.
pub fn strip_shebang(input: &str) -> Option<usize> {
    debug_assert!(!input.is_empty());
    let looks_like_shebang = input.starts_with("#!") && !input.starts_with("#![");
    if looks_like_shebang {
        let line_end = match input.find('\n') {
            Some(newline_at) => newline_at,
            None => input.len(),
        };
        Some(line_end)
    } else {
        None
    }
}
244
e74abb32 245/// Parses the first token from the provided input string.
416331ca
XL
246pub fn first_token(input: &str) -> Token {
247 debug_assert!(!input.is_empty());
248 Cursor::new(input).advance_token()
249}
250
e74abb32 251/// Creates an iterator that produces tokens from the input string.
416331ca
XL
252pub fn tokenize(mut input: &str) -> impl Iterator<Item = Token> + '_ {
253 std::iter::from_fn(move || {
254 if input.is_empty() {
255 return None;
256 }
257 let token = first_token(input);
258 input = &input[token.len..];
259 Some(token)
260 })
261}
262
/// True if `c` is considered whitespace according to the Rust language definition.
/// See the [Rust language reference](https://doc.rust-lang.org/reference/whitespace.html)
/// for the definition of this character class.
pub fn is_whitespace(c: char) -> bool {
    // This is Pattern_White_Space.
    //
    // The set is stable (it doesn't change between Unicode versions), so the
    // values are simply hard-coded.
    matches!(
        c,
        // ASCII: \t, \n, vertical tab, form feed, \r
        '\u{0009}'..='\u{000D}'
        // ASCII space
        | '\u{0020}'
        // NEXT LINE from latin1
        | '\u{0085}'
        // Bidi markers: LEFT-TO-RIGHT MARK, RIGHT-TO-LEFT MARK
        | '\u{200E}'
        | '\u{200F}'
        // Dedicated Unicode whitespace: LINE SEPARATOR, PARAGRAPH SEPARATOR
        | '\u{2028}'
        | '\u{2029}'
    )
}
295
296/// True if `c` is valid as a first character of an identifier.
e74abb32
XL
297/// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for
298/// a formal definition of valid identifier name.
e1599b0c
XL
299pub fn is_id_start(c: char) -> bool {
300 // This is XID_Start OR '_' (which formally is not a XID_Start).
301 // We also add fast-path for ascii idents
302 ('a' <= c && c <= 'z')
303 || ('A' <= c && c <= 'Z')
304 || c == '_'
305 || (c > '\x7f' && unicode_xid::UnicodeXID::is_xid_start(c))
306}
307
308/// True if `c` is valid as a non-first character of an identifier.
e74abb32
XL
309/// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for
310/// a formal definition of valid identifier name.
e1599b0c
XL
311pub fn is_id_continue(c: char) -> bool {
312 // This is exactly XID_Continue.
313 // We also add fast-path for ascii idents
314 ('a' <= c && c <= 'z')
315 || ('A' <= c && c <= 'Z')
316 || ('0' <= c && c <= '9')
317 || c == '_'
318 || (c > '\x7f' && unicode_xid::UnicodeXID::is_xid_continue(c))
319}
320
416331ca 321impl Cursor<'_> {
e74abb32 322 /// Parses a token from the input string.
416331ca
XL
323 fn advance_token(&mut self) -> Token {
324 let first_char = self.bump().unwrap();
325 let token_kind = match first_char {
e74abb32 326 // Slash, comment or block comment.
60c5eb7d 327 '/' => match self.first() {
416331ca
XL
328 '/' => self.line_comment(),
329 '*' => self.block_comment(),
e1599b0c 330 _ => Slash,
416331ca 331 },
e74abb32
XL
332
333 // Whitespace sequence.
e1599b0c 334 c if is_whitespace(c) => self.whitespace(),
e74abb32 335
60c5eb7d
XL
336 // Raw identifier, raw string literal or identifier.
337 'r' => match (self.first(), self.second()) {
e1599b0c 338 ('#', c1) if is_id_start(c1) => self.raw_ident(),
416331ca 339 ('#', _) | ('"', _) => {
ba9703b0 340 let raw_str_i = self.raw_double_quoted_string(1);
416331ca 341 let suffix_start = self.len_consumed();
ba9703b0 342 if raw_str_i.n_end_hashes == raw_str_i.n_start_hashes {
416331ca
XL
343 self.eat_literal_suffix();
344 }
ba9703b0 345 let kind = RawStr(raw_str_i);
416331ca
XL
346 Literal { kind, suffix_start }
347 }
348 _ => self.ident(),
349 },
e74abb32
XL
350
351 // Byte literal, byte string literal, raw byte string literal or identifier.
60c5eb7d 352 'b' => match (self.first(), self.second()) {
416331ca
XL
353 ('\'', _) => {
354 self.bump();
355 let terminated = self.single_quoted_string();
356 let suffix_start = self.len_consumed();
357 if terminated {
358 self.eat_literal_suffix();
359 }
360 let kind = Byte { terminated };
361 Literal { kind, suffix_start }
362 }
363 ('"', _) => {
364 self.bump();
365 let terminated = self.double_quoted_string();
366 let suffix_start = self.len_consumed();
367 if terminated {
368 self.eat_literal_suffix();
369 }
370 let kind = ByteStr { terminated };
371 Literal { kind, suffix_start }
372 }
373 ('r', '"') | ('r', '#') => {
374 self.bump();
ba9703b0 375 let raw_str_i = self.raw_double_quoted_string(2);
416331ca 376 let suffix_start = self.len_consumed();
ba9703b0 377 let terminated = raw_str_i.n_start_hashes == raw_str_i.n_end_hashes;
416331ca
XL
378 if terminated {
379 self.eat_literal_suffix();
380 }
ba9703b0
XL
381
382 let kind = RawByteStr(raw_str_i);
416331ca
XL
383 Literal { kind, suffix_start }
384 }
385 _ => self.ident(),
386 },
e74abb32
XL
387
388 // Identifier (this should be checked after other variant that can
389 // start as identifier).
e1599b0c 390 c if is_id_start(c) => self.ident(),
e74abb32
XL
391
392 // Numeric literal.
416331ca
XL
393 c @ '0'..='9' => {
394 let literal_kind = self.number(c);
395 let suffix_start = self.len_consumed();
396 self.eat_literal_suffix();
397 TokenKind::Literal { kind: literal_kind, suffix_start }
398 }
e74abb32
XL
399
400 // One-symbol tokens.
416331ca
XL
401 ';' => Semi,
402 ',' => Comma,
e1599b0c 403 '.' => Dot,
416331ca
XL
404 '(' => OpenParen,
405 ')' => CloseParen,
406 '{' => OpenBrace,
407 '}' => CloseBrace,
408 '[' => OpenBracket,
409 ']' => CloseBracket,
410 '@' => At,
411 '#' => Pound,
412 '~' => Tilde,
413 '?' => Question,
e1599b0c 414 ':' => Colon,
416331ca 415 '$' => Dollar,
e1599b0c
XL
416 '=' => Eq,
417 '!' => Not,
418 '<' => Lt,
419 '>' => Gt,
420 '-' => Minus,
421 '&' => And,
422 '|' => Or,
423 '+' => Plus,
424 '*' => Star,
425 '^' => Caret,
426 '%' => Percent,
e74abb32
XL
427
428 // Lifetime or character literal.
416331ca 429 '\'' => self.lifetime_or_char(),
e74abb32
XL
430
431 // String literal.
416331ca
XL
432 '"' => {
433 let terminated = self.double_quoted_string();
434 let suffix_start = self.len_consumed();
435 if terminated {
436 self.eat_literal_suffix();
437 }
438 let kind = Str { terminated };
439 Literal { kind, suffix_start }
440 }
441 _ => Unknown,
442 };
443 Token::new(token_kind, self.len_consumed())
444 }
445
446 fn line_comment(&mut self) -> TokenKind {
60c5eb7d 447 debug_assert!(self.prev() == '/' && self.first() == '/');
416331ca 448 self.bump();
60c5eb7d 449 self.eat_while(|c| c != '\n');
416331ca
XL
450 LineComment
451 }
452
453 fn block_comment(&mut self) -> TokenKind {
60c5eb7d 454 debug_assert!(self.prev() == '/' && self.first() == '*');
416331ca
XL
455 self.bump();
456 let mut depth = 1usize;
457 while let Some(c) = self.bump() {
458 match c {
60c5eb7d 459 '/' if self.first() == '*' => {
416331ca
XL
460 self.bump();
461 depth += 1;
462 }
60c5eb7d 463 '*' if self.first() == '/' => {
416331ca
XL
464 self.bump();
465 depth -= 1;
466 if depth == 0 {
e74abb32
XL
467 // This block comment is closed, so for a construction like "/* */ */"
468 // there will be a successfully parsed block comment "/* */"
469 // and " */" will be processed separately.
416331ca
XL
470 break;
471 }
472 }
473 _ => (),
474 }
475 }
476
477 BlockComment { terminated: depth == 0 }
478 }
479
480 fn whitespace(&mut self) -> TokenKind {
e1599b0c 481 debug_assert!(is_whitespace(self.prev()));
60c5eb7d 482 self.eat_while(is_whitespace);
416331ca
XL
483 Whitespace
484 }
485
486 fn raw_ident(&mut self) -> TokenKind {
dfeec247 487 debug_assert!(self.prev() == 'r' && self.first() == '#' && is_id_start(self.second()));
60c5eb7d 488 // Eat "#" symbol.
416331ca 489 self.bump();
60c5eb7d
XL
490 // Eat the identifier part of RawIdent.
491 self.eat_identifier();
416331ca
XL
492 RawIdent
493 }
494
495 fn ident(&mut self) -> TokenKind {
e1599b0c 496 debug_assert!(is_id_start(self.prev()));
60c5eb7d
XL
497 // Start is already eaten, eat the rest of identifier.
498 self.eat_while(is_id_continue);
416331ca
XL
499 Ident
500 }
501
502 fn number(&mut self, first_digit: char) -> LiteralKind {
503 debug_assert!('0' <= self.prev() && self.prev() <= '9');
504 let mut base = Base::Decimal;
505 if first_digit == '0' {
e74abb32 506 // Attempt to parse encoding base.
60c5eb7d 507 let has_digits = match self.first() {
416331ca
XL
508 'b' => {
509 base = Base::Binary;
510 self.bump();
511 self.eat_decimal_digits()
512 }
513 'o' => {
514 base = Base::Octal;
515 self.bump();
516 self.eat_decimal_digits()
517 }
518 'x' => {
519 base = Base::Hexadecimal;
520 self.bump();
521 self.eat_hexadecimal_digits()
522 }
e74abb32 523 // Not a base prefix.
416331ca
XL
524 '0'..='9' | '_' | '.' | 'e' | 'E' => {
525 self.eat_decimal_digits();
526 true
527 }
e74abb32 528 // Just a 0.
416331ca
XL
529 _ => return Int { base, empty_int: false },
530 };
e74abb32
XL
531 // Base prefix was provided, but there were no digits
532 // after it, e.g. "0x".
416331ca
XL
533 if !has_digits {
534 return Int { base, empty_int: true };
535 }
536 } else {
e74abb32 537 // No base prefix, parse number in the usual way.
416331ca
XL
538 self.eat_decimal_digits();
539 };
540
60c5eb7d 541 match self.first() {
416331ca
XL
542 // Don't be greedy if this is actually an
543 // integer literal followed by field/method access or a range pattern
544 // (`0..2` and `12.foo()`)
dfeec247 545 '.' if self.second() != '.' && !is_id_start(self.second()) => {
416331ca
XL
546 // might have stuff after the ., and if it does, it needs to start
547 // with a number
548 self.bump();
549 let mut empty_exponent = false;
60c5eb7d 550 if self.first().is_digit(10) {
416331ca 551 self.eat_decimal_digits();
60c5eb7d 552 match self.first() {
416331ca
XL
553 'e' | 'E' => {
554 self.bump();
60c5eb7d 555 empty_exponent = !self.eat_float_exponent();
416331ca
XL
556 }
557 _ => (),
558 }
559 }
560 Float { base, empty_exponent }
561 }
562 'e' | 'E' => {
563 self.bump();
60c5eb7d 564 let empty_exponent = !self.eat_float_exponent();
416331ca
XL
565 Float { base, empty_exponent }
566 }
567 _ => Int { base, empty_int: false },
568 }
569 }
570
571 fn lifetime_or_char(&mut self) -> TokenKind {
572 debug_assert!(self.prev() == '\'');
e74abb32 573
60c5eb7d
XL
574 let can_be_a_lifetime = if self.second() == '\'' {
575 // It's surely not a lifetime.
576 false
577 } else {
578 // If the first symbol is valid for identifier, it can be a lifetime.
579 // Also check if it's a number for a better error reporting (so '0 will
580 // be reported as invalid lifetime and not as unterminated char literal).
581 is_id_start(self.first()) || self.first().is_digit(10)
582 };
416331ca 583
60c5eb7d
XL
584 if !can_be_a_lifetime {
585 let terminated = self.single_quoted_string();
586 let suffix_start = self.len_consumed();
587 if terminated {
588 self.eat_literal_suffix();
589 }
590 let kind = Char { terminated };
591 return Literal { kind, suffix_start };
416331ca 592 }
e74abb32 593
60c5eb7d
XL
594 // Either a lifetime or a character literal with
595 // length greater than 1.
596
597 let starts_with_number = self.first().is_digit(10);
598
599 // Skip the literal contents.
600 // First symbol can be a number (which isn't a valid identifier start),
601 // so skip it without any checks.
602 self.bump();
603 self.eat_while(is_id_continue);
604
605 // Check if after skipping literal contents we've met a closing
606 // single quote (which means that user attempted to create a
607 // string with single quotes).
608 if self.first() == '\'' {
609 self.bump();
610 let kind = Char { terminated: true };
ba9703b0
XL
611 Literal { kind, suffix_start: self.len_consumed() }
612 } else {
613 Lifetime { starts_with_number }
416331ca 614 }
416331ca
XL
615 }
616
617 fn single_quoted_string(&mut self) -> bool {
618 debug_assert!(self.prev() == '\'');
60c5eb7d
XL
619 // Check if it's a one-symbol literal.
620 if self.second() == '\'' && self.first() != '\\' {
621 self.bump();
416331ca 622 self.bump();
60c5eb7d 623 return true;
416331ca 624 }
60c5eb7d
XL
625
626 // Literal has more than one symbol.
627
e74abb32 628 // Parse until either quotes are terminated or error is detected.
416331ca 629 loop {
60c5eb7d 630 match self.first() {
e74abb32 631 // Quotes are terminated, finish parsing.
416331ca
XL
632 '\'' => {
633 self.bump();
634 return true;
635 }
60c5eb7d
XL
636 // Probably beginning of the comment, which we don't want to include
637 // to the error report.
638 '/' => break,
639 // Newline without following '\'' means unclosed quote, stop parsing.
640 '\n' if self.second() != '\'' => break,
641 // End of file, stop parsing.
642 EOF_CHAR if self.is_eof() => break,
e74abb32 643 // Escaped slash is considered one character, so bump twice.
416331ca
XL
644 '\\' => {
645 self.bump();
646 self.bump();
647 }
e74abb32 648 // Skip the character.
416331ca
XL
649 _ => {
650 self.bump();
651 }
652 }
416331ca 653 }
60c5eb7d 654 // String was not terminated.
416331ca
XL
655 false
656 }
657
e74abb32
XL
658 /// Eats double-quoted string and returns true
659 /// if string is terminated.
416331ca
XL
660 fn double_quoted_string(&mut self) -> bool {
661 debug_assert!(self.prev() == '"');
60c5eb7d
XL
662 while let Some(c) = self.bump() {
663 match c {
416331ca 664 '"' => {
416331ca
XL
665 return true;
666 }
60c5eb7d
XL
667 '\\' if self.first() == '\\' || self.first() == '"' => {
668 // Bump again to skip escaped character.
416331ca
XL
669 self.bump();
670 }
671 _ => (),
672 }
416331ca 673 }
60c5eb7d
XL
674 // End of file reached.
675 false
416331ca
XL
676 }
677
ba9703b0
XL
678 /// Eats the double-quoted string and returns an `UnvalidatedRawStr`.
679 fn raw_double_quoted_string(&mut self, prefix_len: usize) -> UnvalidatedRawStr {
416331ca 680 debug_assert!(self.prev() == 'r');
ba9703b0
XL
681 let mut valid_start: bool = false;
682 let start_pos = self.len_consumed();
683 let (mut possible_terminator_offset, mut max_hashes) = (None, 0);
60c5eb7d 684
e74abb32 685 // Count opening '#' symbols.
ba9703b0 686 let n_start_hashes = self.eat_while(|c| c == '#');
60c5eb7d
XL
687
688 // Check that string is started.
689 match self.bump() {
ba9703b0
XL
690 Some('"') => valid_start = true,
691 _ => {
692 return UnvalidatedRawStr {
693 valid_start,
694 valid_end: false,
695 n_start_hashes,
696 n_end_hashes: 0,
697 possible_terminator_offset,
698 };
699 }
60c5eb7d
XL
700 }
701
702 // Skip the string contents and on each '#' character met, check if this is
703 // a raw string termination.
ba9703b0 704 loop {
60c5eb7d
XL
705 self.eat_while(|c| c != '"');
706
707 if self.is_eof() {
ba9703b0
XL
708 return UnvalidatedRawStr {
709 valid_start,
710 valid_end: false,
711 n_start_hashes,
712 n_end_hashes: max_hashes,
713 possible_terminator_offset,
714 };
416331ca 715 }
416331ca 716
60c5eb7d
XL
717 // Eat closing double quote.
718 self.bump();
719
720 // Check that amount of closing '#' symbols
721 // is equal to the amount of opening ones.
ba9703b0 722 let mut hashes_left = n_start_hashes;
60c5eb7d
XL
723 let is_closing_hash = |c| {
724 if c == '#' && hashes_left != 0 {
725 hashes_left -= 1;
726 true
727 } else {
728 false
416331ca 729 }
60c5eb7d 730 };
ba9703b0
XL
731 let n_end_hashes = self.eat_while(is_closing_hash);
732
733 if n_end_hashes == n_start_hashes {
734 return UnvalidatedRawStr {
735 valid_start,
736 valid_end: true,
737 n_start_hashes,
738 n_end_hashes,
739 possible_terminator_offset: None,
740 };
741 } else if n_end_hashes > max_hashes {
742 // Keep track of possible terminators to give a hint about where there might be
743 // a missing terminator
744 possible_terminator_offset =
745 Some(self.len_consumed() - start_pos - n_end_hashes + prefix_len);
746 max_hashes = n_end_hashes;
747 }
416331ca
XL
748 }
749 }
750
751 fn eat_decimal_digits(&mut self) -> bool {
752 let mut has_digits = false;
753 loop {
60c5eb7d 754 match self.first() {
416331ca
XL
755 '_' => {
756 self.bump();
757 }
758 '0'..='9' => {
759 has_digits = true;
760 self.bump();
761 }
762 _ => break,
763 }
764 }
765 has_digits
766 }
767
768 fn eat_hexadecimal_digits(&mut self) -> bool {
769 let mut has_digits = false;
770 loop {
60c5eb7d 771 match self.first() {
416331ca
XL
772 '_' => {
773 self.bump();
774 }
775 '0'..='9' | 'a'..='f' | 'A'..='F' => {
776 has_digits = true;
777 self.bump();
778 }
779 _ => break,
780 }
781 }
782 has_digits
783 }
784
60c5eb7d
XL
785 /// Eats the float exponent. Returns true if at least one digit was met,
786 /// and returns false otherwise.
787 fn eat_float_exponent(&mut self) -> bool {
416331ca 788 debug_assert!(self.prev() == 'e' || self.prev() == 'E');
60c5eb7d 789 if self.first() == '-' || self.first() == '+' {
416331ca
XL
790 self.bump();
791 }
60c5eb7d 792 self.eat_decimal_digits()
416331ca
XL
793 }
794
60c5eb7d 795 // Eats the suffix of the literal, e.g. "_u8".
416331ca 796 fn eat_literal_suffix(&mut self) {
60c5eb7d
XL
797 self.eat_identifier();
798 }
799
800 // Eats the identifier.
801 fn eat_identifier(&mut self) {
802 if !is_id_start(self.first()) {
416331ca
XL
803 return;
804 }
805 self.bump();
806
60c5eb7d
XL
807 self.eat_while(is_id_continue);
808 }
809
810 /// Eats symbols while predicate returns true or until the end of file is reached.
811 /// Returns amount of eaten symbols.
812 fn eat_while<F>(&mut self, mut predicate: F) -> usize
813 where
dfeec247 814 F: FnMut(char) -> bool,
60c5eb7d
XL
815 {
816 let mut eaten: usize = 0;
817 while predicate(self.first()) && !self.is_eof() {
818 eaten += 1;
416331ca
XL
819 self.bump();
820 }
60c5eb7d
XL
821
822 eaten
416331ca 823 }
416331ca 824}