1 use crate::parse
::ParseSess
;
2 use crate::parse
::token
::{self, Token, TokenKind}
;
3 use crate::symbol
::{sym, Symbol}
;
4 use crate::parse
::unescape_error_reporting
::{emit_unescape_error, push_escaped_char}
;
6 use errors
::{FatalError, DiagnosticBuilder}
;
7 use syntax_pos
::{BytePos, Pos, Span}
;
9 use rustc_lexer
::unescape
;
14 use std
::convert
::TryInto
;
15 use rustc_data_structures
::sync
::Lrc
;
25 #[derive(Clone, Debug)]
26 pub struct UnmatchedBrace
{
27 pub expected_delim
: token
::DelimToken
,
28 pub found_delim
: token
::DelimToken
,
30 pub unclosed_span
: Option
<Span
>,
31 pub candidate_span
: Option
<Span
>,
34 pub struct StringReader
<'a
> {
36 /// Initial position, read-only.
38 /// The absolute offset within the source_map of the current character.
40 /// Stop reading src at this index.
42 /// Source text to tokenize.
44 override_span
: Option
<Span
>,
47 impl<'a
> StringReader
<'a
> {
48 pub fn new(sess
: &'a ParseSess
,
49 source_file
: Lrc
<syntax_pos
::SourceFile
>,
50 override_span
: Option
<Span
>) -> Self {
51 if source_file
.src
.is_none() {
52 sess
.span_diagnostic
.bug(&format
!("Cannot lex source_file without source: {}",
56 let src
= (*source_file
.src
.as_ref().unwrap()).clone();
60 start_pos
: source_file
.start_pos
,
61 pos
: source_file
.start_pos
,
62 end_src_index
: src
.len(),
68 pub fn retokenize(sess
: &'a ParseSess
, mut span
: Span
) -> Self {
69 let begin
= sess
.source_map().lookup_byte_offset(span
.lo());
70 let end
= sess
.source_map().lookup_byte_offset(span
.hi());
72 // Make the range zero-length if the span is invalid.
73 if span
.lo() > span
.hi() || begin
.sf
.start_pos
!= end
.sf
.start_pos
{
74 span
= span
.shrink_to_lo();
77 let mut sr
= StringReader
::new(sess
, begin
.sf
, None
);
79 // Seek the lexer to the right byte range.
80 sr
.end_src_index
= sr
.src_index(span
.hi());
86 fn mk_sp(&self, lo
: BytePos
, hi
: BytePos
) -> Span
{
87 self.override_span
.unwrap_or_else(|| Span
::with_root_ctxt(lo
, hi
))
90 /// Returns the next token, including trivia like whitespace or comments.
92 /// `Err(())` means that some errors were encountered, which can be
93 /// retrieved using `buffer_fatal_errors`.
94 pub fn next_token(&mut self) -> Token
{
95 let start_src_index
= self.src_index(self.pos
);
96 let text
: &str = &self.src
[start_src_index
..self.end_src_index
];
99 let span
= self.mk_sp(self.pos
, self.pos
);
100 return Token
::new(token
::Eof
, span
);
104 let is_beginning_of_file
= self.pos
== self.start_pos
;
105 if is_beginning_of_file
{
106 if let Some(shebang_len
) = rustc_lexer
::strip_shebang(text
) {
107 let start
= self.pos
;
108 self.pos
= self.pos
+ BytePos
::from_usize(shebang_len
);
110 let sym
= self.symbol_from(start
+ BytePos
::from_usize("#!".len()));
111 let kind
= token
::Shebang(sym
);
113 let span
= self.mk_sp(start
, self.pos
);
114 return Token
::new(kind
, span
);
119 let token
= rustc_lexer
::first_token(text
);
121 let start
= self.pos
;
122 self.pos
= self.pos
+ BytePos
::from_usize(token
.len
);
124 debug
!("try_next_token: {:?}({:?})", token
.kind
, self.str_from(start
));
126 // This could use `?`, but that makes code significantly (10-20%) slower.
127 // https://github.com/rust-lang/rust/issues/37939
128 let kind
= self.cook_lexer_token(token
.kind
, start
);
130 let span
= self.mk_sp(start
, self.pos
);
131 Token
::new(kind
, span
)
134 /// Report a fatal lexical error with a given span.
135 fn fatal_span(&self, sp
: Span
, m
: &str) -> FatalError
{
136 self.sess
.span_diagnostic
.span_fatal(sp
, m
)
139 /// Report a lexical error with a given span.
140 fn err_span(&self, sp
: Span
, m
: &str) {
141 self.sess
.span_diagnostic
.struct_span_err(sp
, m
).emit();
145 /// Report a fatal error spanning [`from_pos`, `to_pos`).
146 fn fatal_span_(&self, from_pos
: BytePos
, to_pos
: BytePos
, m
: &str) -> FatalError
{
147 self.fatal_span(self.mk_sp(from_pos
, to_pos
), m
)
150 /// Report a lexical error spanning [`from_pos`, `to_pos`).
151 fn err_span_(&self, from_pos
: BytePos
, to_pos
: BytePos
, m
: &str) {
152 self.err_span(self.mk_sp(from_pos
, to_pos
), m
)
155 fn struct_span_fatal(&self, from_pos
: BytePos
, to_pos
: BytePos
, m
: &str)
156 -> DiagnosticBuilder
<'a
>
158 self.sess
.span_diagnostic
.struct_span_fatal(self.mk_sp(from_pos
, to_pos
), m
)
161 fn struct_fatal_span_char(&self, from_pos
: BytePos
, to_pos
: BytePos
, m
: &str, c
: char)
162 -> DiagnosticBuilder
<'a
>
164 let mut m
= m
.to_string();
166 push_escaped_char(&mut m
, c
);
168 self.sess
.span_diagnostic
.struct_span_fatal(self.mk_sp(from_pos
, to_pos
), &m
[..])
171 /// Turns simple `rustc_lexer::TokenKind` enum into a rich
172 /// `libsyntax::TokenKind`. This turns strings into interned
173 /// symbols and runs additional validation.
176 token
: rustc_lexer
::TokenKind
,
180 rustc_lexer
::TokenKind
::LineComment
=> {
181 let string
= self.str_from(start
);
182 // comments with only more "/"s are not doc comments
183 let tok
= if is_doc_comment(string
) {
186 idx
= match string
[idx
..].find('
\r'
) {
188 Some(it
) => idx
+ it
+ 1
190 if string
[idx
..].chars().next() != Some('
\n'
) {
191 self.err_span_(start
+ BytePos(idx
as u32 - 1),
192 start
+ BytePos(idx
as u32),
193 "bare CR not allowed in doc-comment");
196 token
::DocComment(Symbol
::intern(string
))
203 rustc_lexer
::TokenKind
::BlockComment { terminated }
=> {
204 let string
= self.str_from(start
);
205 // block comments starting with "/**" or "/*!" are doc-comments
206 // but comments with only "*"s between two "/"s are not
207 let is_doc_comment
= is_block_doc_comment(string
);
210 let msg
= if is_doc_comment
{
211 "unterminated block doc-comment"
213 "unterminated block comment"
215 let last_bpos
= self.pos
;
216 self.fatal_span_(start
, last_bpos
, msg
).raise();
219 let tok
= if is_doc_comment
{
220 let has_cr
= string
.contains('
\r'
);
221 let string
= if has_cr
{
222 self.translate_crlf(start
,
224 "bare CR not allowed in block doc-comment")
228 token
::DocComment(Symbol
::intern(&string
[..]))
235 rustc_lexer
::TokenKind
::Whitespace
=> token
::Whitespace
,
236 rustc_lexer
::TokenKind
::Ident
| rustc_lexer
::TokenKind
::RawIdent
=> {
237 let is_raw_ident
= token
== rustc_lexer
::TokenKind
::RawIdent
;
238 let mut ident_start
= start
;
240 ident_start
= ident_start
+ BytePos(2);
242 // FIXME: perform NFKC normalization here. (Issue #2253)
243 let sym
= self.symbol_from(ident_start
);
245 let span
= self.mk_sp(start
, self.pos
);
246 if !sym
.can_be_raw() {
247 self.err_span(span
, &format
!("`{}` cannot be a raw identifier", sym
));
249 self.sess
.raw_identifier_spans
.borrow_mut().push(span
);
251 token
::Ident(sym
, is_raw_ident
)
253 rustc_lexer
::TokenKind
::Literal { kind, suffix_start }
=> {
254 let suffix_start
= start
+ BytePos(suffix_start
as u32);
255 let (kind
, symbol
) = self.cook_lexer_literal(start
, suffix_start
, kind
);
256 let suffix
= if suffix_start
< self.pos
{
257 let string
= self.str_from(suffix_start
);
259 self.sess
.span_diagnostic
260 .struct_span_warn(self.mk_sp(suffix_start
, self.pos
),
261 "underscore literal suffix is not allowed")
262 .warn("this was previously accepted by the compiler but is \
263 being phased out; it will become a hard error in \
265 .note("for more information, see issue #42326 \
266 <https://github.com/rust-lang/rust/issues/42326>")
270 Some(Symbol
::intern(string
))
275 token
::Literal(token
::Lit { kind, symbol, suffix }
)
277 rustc_lexer
::TokenKind
::Lifetime { starts_with_number }
=> {
278 // Include the leading `'` in the real identifier, for macro
279 // expansion purposes. See #12512 for the gory details of why
280 // this is necessary.
281 let lifetime_name
= self.str_from(start
);
282 if starts_with_number
{
286 "lifetimes cannot start with a number",
289 let ident
= Symbol
::intern(lifetime_name
);
290 token
::Lifetime(ident
)
292 rustc_lexer
::TokenKind
::Semi
=> token
::Semi
,
293 rustc_lexer
::TokenKind
::Comma
=> token
::Comma
,
294 rustc_lexer
::TokenKind
::Dot
=> token
::Dot
,
295 rustc_lexer
::TokenKind
::OpenParen
=> token
::OpenDelim(token
::Paren
),
296 rustc_lexer
::TokenKind
::CloseParen
=> token
::CloseDelim(token
::Paren
),
297 rustc_lexer
::TokenKind
::OpenBrace
=> token
::OpenDelim(token
::Brace
),
298 rustc_lexer
::TokenKind
::CloseBrace
=> token
::CloseDelim(token
::Brace
),
299 rustc_lexer
::TokenKind
::OpenBracket
=> token
::OpenDelim(token
::Bracket
),
300 rustc_lexer
::TokenKind
::CloseBracket
=> token
::CloseDelim(token
::Bracket
),
301 rustc_lexer
::TokenKind
::At
=> token
::At
,
302 rustc_lexer
::TokenKind
::Pound
=> token
::Pound
,
303 rustc_lexer
::TokenKind
::Tilde
=> token
::Tilde
,
304 rustc_lexer
::TokenKind
::Question
=> token
::Question
,
305 rustc_lexer
::TokenKind
::Colon
=> token
::Colon
,
306 rustc_lexer
::TokenKind
::Dollar
=> token
::Dollar
,
307 rustc_lexer
::TokenKind
::Eq
=> token
::Eq
,
308 rustc_lexer
::TokenKind
::Not
=> token
::Not
,
309 rustc_lexer
::TokenKind
::Lt
=> token
::Lt
,
310 rustc_lexer
::TokenKind
::Gt
=> token
::Gt
,
311 rustc_lexer
::TokenKind
::Minus
=> token
::BinOp(token
::Minus
),
312 rustc_lexer
::TokenKind
::And
=> token
::BinOp(token
::And
),
313 rustc_lexer
::TokenKind
::Or
=> token
::BinOp(token
::Or
),
314 rustc_lexer
::TokenKind
::Plus
=> token
::BinOp(token
::Plus
),
315 rustc_lexer
::TokenKind
::Star
=> token
::BinOp(token
::Star
),
316 rustc_lexer
::TokenKind
::Slash
=> token
::BinOp(token
::Slash
),
317 rustc_lexer
::TokenKind
::Caret
=> token
::BinOp(token
::Caret
),
318 rustc_lexer
::TokenKind
::Percent
=> token
::BinOp(token
::Percent
),
320 rustc_lexer
::TokenKind
::Unknown
=> {
321 let c
= self.str_from(start
).chars().next().unwrap();
322 let mut err
= self.struct_fatal_span_char(start
,
324 "unknown start of token",
// FIXME: the lexer could be used to turn the ASCII version of unicode homoglyphs
// into the token, instead of keeping a table in `check_for_substitution`. Ideally,
// this should be inside `rustc_lexer`. However, we should first remove compound
// tokens like `<<` from `rustc_lexer`, and then add fancier error recovery to it,
// as there will be less overall work to do this way.
331 let token
= unicode_chars
::check_for_substitution(self, start
, c
, &mut err
)
332 .unwrap_or_else(|| token
::Unknown(self.symbol_from(start
)));
339 fn cook_lexer_literal(
342 suffix_start
: BytePos
,
343 kind
: rustc_lexer
::LiteralKind
344 ) -> (token
::LitKind
, Symbol
) {
346 rustc_lexer
::LiteralKind
::Char { terminated }
=> {
348 self.fatal_span_(start
, suffix_start
,
349 "unterminated character literal".into())
352 let content_start
= start
+ BytePos(1);
353 let content_end
= suffix_start
- BytePos(1);
354 self.validate_char_escape(content_start
, content_end
);
355 let id
= self.symbol_from_to(content_start
, content_end
);
358 rustc_lexer
::LiteralKind
::Byte { terminated }
=> {
360 self.fatal_span_(start
+ BytePos(1), suffix_start
,
361 "unterminated byte constant".into())
364 let content_start
= start
+ BytePos(2);
365 let content_end
= suffix_start
- BytePos(1);
366 self.validate_byte_escape(content_start
, content_end
);
367 let id
= self.symbol_from_to(content_start
, content_end
);
370 rustc_lexer
::LiteralKind
::Str { terminated }
=> {
372 self.fatal_span_(start
, suffix_start
,
373 "unterminated double quote string".into())
376 let content_start
= start
+ BytePos(1);
377 let content_end
= suffix_start
- BytePos(1);
378 self.validate_str_escape(content_start
, content_end
);
379 let id
= self.symbol_from_to(content_start
, content_end
);
382 rustc_lexer
::LiteralKind
::ByteStr { terminated }
=> {
384 self.fatal_span_(start
+ BytePos(1), suffix_start
,
385 "unterminated double quote byte string".into())
388 let content_start
= start
+ BytePos(2);
389 let content_end
= suffix_start
- BytePos(1);
390 self.validate_byte_str_escape(content_start
, content_end
);
391 let id
= self.symbol_from_to(content_start
, content_end
);
394 rustc_lexer
::LiteralKind
::RawStr { n_hashes, started, terminated }
=> {
396 self.report_non_started_raw_string(start
);
399 self.report_unterminated_raw_string(start
, n_hashes
)
401 let n_hashes
: u16 = self.restrict_n_hashes(start
, n_hashes
);
402 let n
= u32::from(n_hashes
);
403 let content_start
= start
+ BytePos(2 + n
);
404 let content_end
= suffix_start
- BytePos(1 + n
);
405 self.validate_raw_str_escape(content_start
, content_end
);
406 let id
= self.symbol_from_to(content_start
, content_end
);
407 (token
::StrRaw(n_hashes
), id
)
409 rustc_lexer
::LiteralKind
::RawByteStr { n_hashes, started, terminated }
=> {
411 self.report_non_started_raw_string(start
);
414 self.report_unterminated_raw_string(start
, n_hashes
)
416 let n_hashes
: u16 = self.restrict_n_hashes(start
, n_hashes
);
417 let n
= u32::from(n_hashes
);
418 let content_start
= start
+ BytePos(3 + n
);
419 let content_end
= suffix_start
- BytePos(1 + n
);
420 self.validate_raw_byte_str_escape(content_start
, content_end
);
421 let id
= self.symbol_from_to(content_start
, content_end
);
422 (token
::ByteStrRaw(n_hashes
), id
)
424 rustc_lexer
::LiteralKind
::Int { base, empty_int }
=> {
426 self.err_span_(start
, suffix_start
, "no valid digits found for number");
427 (token
::Integer
, sym
::integer(0))
429 self.validate_int_literal(base
, start
, suffix_start
);
430 (token
::Integer
, self.symbol_from_to(start
, suffix_start
))
433 rustc_lexer
::LiteralKind
::Float { base, empty_exponent }
=> {
435 let mut err
= self.struct_span_fatal(
437 "expected at least one digit in exponent"
443 Base
::Hexadecimal
=> {
444 self.err_span_(start
, suffix_start
,
445 "hexadecimal float literal is not supported")
448 self.err_span_(start
, suffix_start
,
449 "octal float literal is not supported")
452 self.err_span_(start
, suffix_start
,
453 "binary float literal is not supported")
458 let id
= self.symbol_from_to(start
, suffix_start
);
465 fn src_index(&self, pos
: BytePos
) -> usize {
466 (pos
- self.start_pos
).to_usize()
469 /// Slice of the source text from `start` up to but excluding `self.pos`,
470 /// meaning the slice does not include the character `self.ch`.
471 fn str_from(&self, start
: BytePos
) -> &str
473 self.str_from_to(start
, self.pos
)
476 /// Creates a Symbol from a given offset to the current offset.
477 fn symbol_from(&self, start
: BytePos
) -> Symbol
{
478 debug
!("taking an ident from {:?} to {:?}", start
, self.pos
);
479 Symbol
::intern(self.str_from(start
))
482 /// As symbol_from, with an explicit endpoint.
483 fn symbol_from_to(&self, start
: BytePos
, end
: BytePos
) -> Symbol
{
484 debug
!("taking an ident from {:?} to {:?}", start
, end
);
485 Symbol
::intern(self.str_from_to(start
, end
))
488 /// Slice of the source text spanning from `start` up to but excluding `end`.
489 fn str_from_to(&self, start
: BytePos
, end
: BytePos
) -> &str
491 &self.src
[self.src_index(start
)..self.src_index(end
)]
494 /// Converts CRLF to LF in the given string, raising an error on bare CR.
495 fn translate_crlf
<'b
>(&self, start
: BytePos
, s
: &'b
str, errmsg
: &'b
str) -> Cow
<'b
, str> {
496 let mut chars
= s
.char_indices().peekable();
497 while let Some((i
, ch
)) = chars
.next() {
499 if let Some((lf_idx
, '
\n'
)) = chars
.peek() {
500 return translate_crlf_(self, start
, s
, *lf_idx
, chars
, errmsg
).into();
502 let pos
= start
+ BytePos(i
as u32);
503 let end_pos
= start
+ BytePos((i
+ ch
.len_utf8()) as u32);
504 self.err_span_(pos
, end_pos
, errmsg
);
509 fn translate_crlf_(rdr
: &StringReader
<'_
>,
513 mut chars
: iter
::Peekable
<impl Iterator
<Item
= (usize, char)>>,
516 let mut buf
= String
::with_capacity(s
.len());
518 buf
.push_str(&s
[.. j
- 1]);
519 while let Some((i
, ch
)) = chars
.next() {
522 buf
.push_str(&s
[j
..i
]);
524 let next
= i
+ ch
.len_utf8();
526 if chars
.peek().map(|(_
, ch
)| *ch
) != Some('
\n'
) {
527 let pos
= start
+ BytePos(i
as u32);
528 let end_pos
= start
+ BytePos(next
as u32);
529 rdr
.err_span_(pos
, end_pos
, errmsg
);
534 buf
.push_str(&s
[j
..]);
540 fn report_non_started_raw_string(&self, start
: BytePos
) -> ! {
541 let bad_char
= self.str_from(start
).chars().last().unwrap();
543 .struct_fatal_span_char(
546 "found invalid character; only `#` is allowed \
547 in raw string delimitation",
554 fn report_unterminated_raw_string(&self, start
: BytePos
, n_hashes
: usize) -> ! {
555 let mut err
= self.struct_span_fatal(
557 "unterminated raw string",
560 self.mk_sp(start
, start
),
561 "unterminated raw string",
565 err
.note(&format
!("this raw string should be terminated with `\"{}`",
566 "#".repeat(n_hashes
as usize)));
573 fn restrict_n_hashes(&self, start
: BytePos
, n_hashes
: usize) -> u16 {
574 match n_hashes
.try_into() {
575 Ok(n_hashes
) => n_hashes
,
577 self.fatal_span_(start
,
579 "too many `#` symbols: raw strings may be \
580 delimited by up to 65535 `#` symbols").raise();
585 fn validate_char_escape(&self, content_start
: BytePos
, content_end
: BytePos
) {
586 let lit
= self.str_from_to(content_start
, content_end
);
587 if let Err((off
, err
)) = unescape
::unescape_char(lit
) {
589 &self.sess
.span_diagnostic
,
591 self.mk_sp(content_start
- BytePos(1), content_end
+ BytePos(1)),
592 unescape
::Mode
::Char
,
599 fn validate_byte_escape(&self, content_start
: BytePos
, content_end
: BytePos
) {
600 let lit
= self.str_from_to(content_start
, content_end
);
601 if let Err((off
, err
)) = unescape
::unescape_byte(lit
) {
603 &self.sess
.span_diagnostic
,
605 self.mk_sp(content_start
- BytePos(1), content_end
+ BytePos(1)),
606 unescape
::Mode
::Byte
,
613 fn validate_str_escape(&self, content_start
: BytePos
, content_end
: BytePos
) {
614 let lit
= self.str_from_to(content_start
, content_end
);
615 unescape
::unescape_str(lit
, &mut |range
, c
| {
616 if let Err(err
) = c
{
618 &self.sess
.span_diagnostic
,
620 self.mk_sp(content_start
- BytePos(1), content_end
+ BytePos(1)),
629 fn validate_raw_str_escape(&self, content_start
: BytePos
, content_end
: BytePos
) {
630 let lit
= self.str_from_to(content_start
, content_end
);
631 unescape
::unescape_raw_str(lit
, &mut |range
, c
| {
632 if let Err(err
) = c
{
634 &self.sess
.span_diagnostic
,
636 self.mk_sp(content_start
- BytePos(1), content_end
+ BytePos(1)),
645 fn validate_raw_byte_str_escape(&self, content_start
: BytePos
, content_end
: BytePos
) {
646 let lit
= self.str_from_to(content_start
, content_end
);
647 unescape
::unescape_raw_byte_str(lit
, &mut |range
, c
| {
648 if let Err(err
) = c
{
650 &self.sess
.span_diagnostic
,
652 self.mk_sp(content_start
- BytePos(1), content_end
+ BytePos(1)),
653 unescape
::Mode
::ByteStr
,
661 fn validate_byte_str_escape(&self, content_start
: BytePos
, content_end
: BytePos
) {
662 let lit
= self.str_from_to(content_start
, content_end
);
663 unescape
::unescape_byte_str(lit
, &mut |range
, c
| {
664 if let Err(err
) = c
{
666 &self.sess
.span_diagnostic
,
668 self.mk_sp(content_start
- BytePos(1), content_end
+ BytePos(1)),
669 unescape
::Mode
::ByteStr
,
677 fn validate_int_literal(&self, base
: Base
, content_start
: BytePos
, content_end
: BytePos
) {
678 let base
= match base
{
683 let s
= self.str_from_to(content_start
+ BytePos(2), content_end
);
684 for (idx
, c
) in s
.char_indices() {
685 let idx
= idx
as u32;
686 if c
!= '_'
&& c
.to_digit(base
).is_none() {
687 let lo
= content_start
+ BytePos(2 + idx
);
688 let hi
= content_start
+ BytePos(2 + idx
+ c
.len_utf8() as u32);
689 self.err_span_(lo
, hi
,
690 &format
!("invalid digit for a base {} literal", base
));
697 fn is_doc_comment(s
: &str) -> bool
{
698 let res
= (s
.starts_with("///") && *s
.as_bytes().get(3).unwrap_or(&b' '
) != b'
/'
) ||
699 s
.starts_with("//!");
700 debug
!("is {:?} a doc comment? {}", s
, res
);
704 fn is_block_doc_comment(s
: &str) -> bool
{
705 // Prevent `/**/` from being parsed as a doc comment
706 let res
= ((s
.starts_with("/**") && *s
.as_bytes().get(3).unwrap_or(&b' '
) != b'
*'
) ||
707 s
.starts_with("/*!")) && s
.len() >= 5;
708 debug
!("is {:?} a doc comment? {}", s
, res
);