1 use rustc_ast
::ast
::AttrStyle
;
2 use rustc_ast
::token
::{self, CommentKind, Token, TokenKind}
;
3 use rustc_ast
::tokenstream
::{Spacing, TokenStream}
;
4 use rustc_errors
::{error_code, Applicability, DiagnosticBuilder, FatalError, PResult}
;
5 use rustc_lexer
::unescape
::{self, Mode}
;
6 use rustc_lexer
::{Base, DocStyle, RawStrError}
;
7 use rustc_session
::parse
::ParseSess
;
8 use rustc_span
::symbol
::{sym, Symbol}
;
9 use rustc_span
::{BytePos, Pos, Span}
;
14 mod unescape_error_reporting
;
17 use unescape_error_reporting
::{emit_unescape_error, push_escaped_char}
;
19 #[derive(Clone, Debug)]
20 pub struct UnmatchedBrace
{
21 pub expected_delim
: token
::DelimToken
,
22 pub found_delim
: Option
<token
::DelimToken
>,
24 pub unclosed_span
: Option
<Span
>,
25 pub candidate_span
: Option
<Span
>,
28 crate fn parse_token_trees
<'a
>(
32 override_span
: Option
<Span
>,
33 ) -> (PResult
<'a
, TokenStream
>, Vec
<UnmatchedBrace
>) {
34 StringReader { sess, start_pos, pos: start_pos, end_src_index: src.len(), src, override_span }
38 struct StringReader
<'a
> {
40 /// Initial position, read-only.
42 /// The absolute offset within the source_map of the current character.
44 /// Stop reading src at this index.
46 /// Source text to tokenize.
48 override_span
: Option
<Span
>,
51 impl<'a
> StringReader
<'a
> {
52 fn mk_sp(&self, lo
: BytePos
, hi
: BytePos
) -> Span
{
53 self.override_span
.unwrap_or_else(|| Span
::with_root_ctxt(lo
, hi
))
56 /// Returns the next token, and info about preceding whitespace, if any.
57 fn next_token(&mut self) -> (Spacing
, Token
) {
58 let mut spacing
= Spacing
::Joint
;
60 // Skip `#!` at the start of the file
61 let start_src_index
= self.src_index(self.pos
);
62 let text
: &str = &self.src
[start_src_index
..self.end_src_index
];
63 let is_beginning_of_file
= self.pos
== self.start_pos
;
64 if is_beginning_of_file
{
65 if let Some(shebang_len
) = rustc_lexer
::strip_shebang(text
) {
66 self.pos
= self.pos
+ BytePos
::from_usize(shebang_len
);
67 spacing
= Spacing
::Alone
;
71 // Skip trivial (whitespace & comments) tokens
73 let start_src_index
= self.src_index(self.pos
);
74 let text
: &str = &self.src
[start_src_index
..self.end_src_index
];
77 let span
= self.mk_sp(self.pos
, self.pos
);
78 return (spacing
, Token
::new(token
::Eof
, span
));
81 let token
= rustc_lexer
::first_token(text
);
84 self.pos
= self.pos
+ BytePos
::from_usize(token
.len
);
86 debug
!("next_token: {:?}({:?})", token
.kind
, self.str_from(start
));
88 match self.cook_lexer_token(token
.kind
, start
) {
90 let span
= self.mk_sp(start
, self.pos
);
91 return (spacing
, Token
::new(kind
, span
));
93 None
=> spacing
= Spacing
::Alone
,
98 /// Report a fatal lexical error with a given span.
99 fn fatal_span(&self, sp
: Span
, m
: &str) -> FatalError
{
100 self.sess
.span_diagnostic
.span_fatal(sp
, m
)
103 /// Report a lexical error with a given span.
104 fn err_span(&self, sp
: Span
, m
: &str) {
105 self.sess
.span_diagnostic
.struct_span_err(sp
, m
).emit();
108 /// Report a fatal error spanning [`from_pos`, `to_pos`).
109 fn fatal_span_(&self, from_pos
: BytePos
, to_pos
: BytePos
, m
: &str) -> FatalError
{
110 self.fatal_span(self.mk_sp(from_pos
, to_pos
), m
)
113 /// Report a lexical error spanning [`from_pos`, `to_pos`).
114 fn err_span_(&self, from_pos
: BytePos
, to_pos
: BytePos
, m
: &str) {
115 self.err_span(self.mk_sp(from_pos
, to_pos
), m
)
118 fn struct_fatal_span_char(
124 ) -> DiagnosticBuilder
<'a
> {
125 let mut m
= m
.to_string();
127 push_escaped_char(&mut m
, c
);
129 self.sess
.span_diagnostic
.struct_span_fatal(self.mk_sp(from_pos
, to_pos
), &m
[..])
132 /// Turns simple `rustc_lexer::TokenKind` enum into a rich
133 /// `rustc_ast::token::TokenKind`. This turns strings into interned
134 /// symbols and runs additional validation.
135 fn cook_lexer_token(&self, token
: rustc_lexer
::TokenKind
, start
: BytePos
) -> Option
<TokenKind
> {
137 rustc_lexer
::TokenKind
::LineComment { doc_style }
=> {
138 // Skip non-doc comments
139 let doc_style
= doc_style?
;
141 // The opening delimiter, of length 3, is not included in the symbol.
142 let content_start
= start
+ BytePos(3);
143 let content
= self.str_from(content_start
);
144 self.cook_doc_comment(content_start
, content
, CommentKind
::Line
, doc_style
)
146 rustc_lexer
::TokenKind
::BlockComment { doc_style, terminated }
=> {
148 let msg
= match doc_style
{
149 Some(_
) => "unterminated block doc-comment",
150 None
=> "unterminated block comment",
152 let last_bpos
= self.pos
;
155 .struct_span_fatal_with_code(
156 self.mk_sp(start
, last_bpos
),
164 // Skip non-doc comments
165 let doc_style
= doc_style?
;
167 // The opening delimiter, of length 3, and the closing delimiter, of length 2,
168 // are not included in the symbol.
169 let content_start
= start
+ BytePos(3);
170 let content_end
= self.pos
- BytePos(if terminated { 2 }
else { 0 }
);
171 let content
= self.str_from_to(content_start
, content_end
);
172 self.cook_doc_comment(content_start
, content
, CommentKind
::Block
, doc_style
)
174 rustc_lexer
::TokenKind
::Whitespace
=> return None
,
175 rustc_lexer
::TokenKind
::Ident
| rustc_lexer
::TokenKind
::RawIdent
=> {
176 let is_raw_ident
= token
== rustc_lexer
::TokenKind
::RawIdent
;
177 let mut ident_start
= start
;
179 ident_start
= ident_start
+ BytePos(2);
181 let sym
= nfc_normalize(self.str_from(ident_start
));
182 let span
= self.mk_sp(start
, self.pos
);
183 self.sess
.symbol_gallery
.insert(sym
, span
);
185 if !sym
.can_be_raw() {
186 self.err_span(span
, &format
!("`{}` cannot be a raw identifier", sym
));
188 self.sess
.raw_identifier_spans
.borrow_mut().push(span
);
190 token
::Ident(sym
, is_raw_ident
)
192 rustc_lexer
::TokenKind
::Literal { kind, suffix_start }
=> {
193 let suffix_start
= start
+ BytePos(suffix_start
as u32);
194 let (kind
, symbol
) = self.cook_lexer_literal(start
, suffix_start
, kind
);
195 let suffix
= if suffix_start
< self.pos
{
196 let string
= self.str_from(suffix_start
);
201 self.mk_sp(suffix_start
, self.pos
),
202 "underscore literal suffix is not allowed",
205 "this was previously accepted by the compiler but is \
206 being phased out; it will become a hard error in \
211 <https://github.com/rust-lang/rust/issues/42326> \
212 for more information",
217 Some(Symbol
::intern(string
))
222 token
::Literal(token
::Lit { kind, symbol, suffix }
)
224 rustc_lexer
::TokenKind
::Lifetime { starts_with_number }
=> {
225 // Include the leading `'` in the real identifier, for macro
226 // expansion purposes. See #12512 for the gory details of why
227 // this is necessary.
228 let lifetime_name
= self.str_from(start
);
229 if starts_with_number
{
230 self.err_span_(start
, self.pos
, "lifetimes cannot start with a number");
232 let ident
= Symbol
::intern(lifetime_name
);
233 token
::Lifetime(ident
)
235 rustc_lexer
::TokenKind
::Semi
=> token
::Semi
,
236 rustc_lexer
::TokenKind
::Comma
=> token
::Comma
,
237 rustc_lexer
::TokenKind
::Dot
=> token
::Dot
,
238 rustc_lexer
::TokenKind
::OpenParen
=> token
::OpenDelim(token
::Paren
),
239 rustc_lexer
::TokenKind
::CloseParen
=> token
::CloseDelim(token
::Paren
),
240 rustc_lexer
::TokenKind
::OpenBrace
=> token
::OpenDelim(token
::Brace
),
241 rustc_lexer
::TokenKind
::CloseBrace
=> token
::CloseDelim(token
::Brace
),
242 rustc_lexer
::TokenKind
::OpenBracket
=> token
::OpenDelim(token
::Bracket
),
243 rustc_lexer
::TokenKind
::CloseBracket
=> token
::CloseDelim(token
::Bracket
),
244 rustc_lexer
::TokenKind
::At
=> token
::At
,
245 rustc_lexer
::TokenKind
::Pound
=> token
::Pound
,
246 rustc_lexer
::TokenKind
::Tilde
=> token
::Tilde
,
247 rustc_lexer
::TokenKind
::Question
=> token
::Question
,
248 rustc_lexer
::TokenKind
::Colon
=> token
::Colon
,
249 rustc_lexer
::TokenKind
::Dollar
=> token
::Dollar
,
250 rustc_lexer
::TokenKind
::Eq
=> token
::Eq
,
251 rustc_lexer
::TokenKind
::Bang
=> token
::Not
,
252 rustc_lexer
::TokenKind
::Lt
=> token
::Lt
,
253 rustc_lexer
::TokenKind
::Gt
=> token
::Gt
,
254 rustc_lexer
::TokenKind
::Minus
=> token
::BinOp(token
::Minus
),
255 rustc_lexer
::TokenKind
::And
=> token
::BinOp(token
::And
),
256 rustc_lexer
::TokenKind
::Or
=> token
::BinOp(token
::Or
),
257 rustc_lexer
::TokenKind
::Plus
=> token
::BinOp(token
::Plus
),
258 rustc_lexer
::TokenKind
::Star
=> token
::BinOp(token
::Star
),
259 rustc_lexer
::TokenKind
::Slash
=> token
::BinOp(token
::Slash
),
260 rustc_lexer
::TokenKind
::Caret
=> token
::BinOp(token
::Caret
),
261 rustc_lexer
::TokenKind
::Percent
=> token
::BinOp(token
::Percent
),
263 rustc_lexer
::TokenKind
::Unknown
=> {
264 let c
= self.str_from(start
).chars().next().unwrap();
266 self.struct_fatal_span_char(start
, self.pos
, "unknown start of token", c
);
267 // FIXME: the lexer could be used to turn the ASCII version of unicode homoglyphs,
268 // instead of keeping a table in `check_for_substitution`, into the token. Ideally,
269 // this should be inside `rustc_lexer`. However, we should first remove compound
270 // tokens like `<<` from `rustc_lexer`, and then add fancier error recovery to it,
271 // as there will be less overall work to do this way.
272 let token
= unicode_chars
::check_for_substitution(self, start
, c
, &mut err
);
281 content_start
: BytePos
,
283 comment_kind
: CommentKind
,
286 if content
.contains('
\r'
) {
287 for (idx
, _
) in content
.char_indices().filter(|&(_
, c
)| c
== '
\r'
) {
289 content_start
+ BytePos(idx
as u32),
290 content_start
+ BytePos(idx
as u32 + 1),
292 CommentKind
::Line
=> "bare CR not allowed in doc-comment",
293 CommentKind
::Block
=> "bare CR not allowed in block doc-comment",
299 let attr_style
= match doc_style
{
300 DocStyle
::Outer
=> AttrStyle
::Outer
,
301 DocStyle
::Inner
=> AttrStyle
::Inner
,
304 token
::DocComment(comment_kind
, attr_style
, Symbol
::intern(content
))
307 fn cook_lexer_literal(
310 suffix_start
: BytePos
,
311 kind
: rustc_lexer
::LiteralKind
,
312 ) -> (token
::LitKind
, Symbol
) {
313 // prefix means `"` or `br"` or `r###"`, ...
314 let (lit_kind
, mode
, prefix_len
, postfix_len
) = match kind
{
315 rustc_lexer
::LiteralKind
::Char { terminated }
=> {
319 .struct_span_fatal_with_code(
320 self.mk_sp(start
, suffix_start
),
321 "unterminated character literal",
327 (token
::Char
, Mode
::Char
, 1, 1) // ' '
329 rustc_lexer
::LiteralKind
::Byte { terminated }
=> {
333 .struct_span_fatal_with_code(
334 self.mk_sp(start
+ BytePos(1), suffix_start
),
335 "unterminated byte constant",
341 (token
::Byte
, Mode
::Byte
, 2, 1) // b' '
343 rustc_lexer
::LiteralKind
::Str { terminated }
=> {
347 .struct_span_fatal_with_code(
348 self.mk_sp(start
, suffix_start
),
349 "unterminated double quote string",
355 (token
::Str
, Mode
::Str
, 1, 1) // " "
357 rustc_lexer
::LiteralKind
::ByteStr { terminated }
=> {
361 .struct_span_fatal_with_code(
362 self.mk_sp(start
+ BytePos(1), suffix_start
),
363 "unterminated double quote byte string",
369 (token
::ByteStr
, Mode
::ByteStr
, 2, 1) // b" "
371 rustc_lexer
::LiteralKind
::RawStr { n_hashes, err }
=> {
372 self.report_raw_str_error(start
, err
);
373 let n
= u32::from(n_hashes
);
374 (token
::StrRaw(n_hashes
), Mode
::RawStr
, 2 + n
, 1 + n
) // r##" "##
376 rustc_lexer
::LiteralKind
::RawByteStr { n_hashes, err }
=> {
377 self.report_raw_str_error(start
, err
);
378 let n
= u32::from(n_hashes
);
379 (token
::ByteStrRaw(n_hashes
), Mode
::RawByteStr
, 3 + n
, 1 + n
) // br##" "##
381 rustc_lexer
::LiteralKind
::Int { base, empty_int }
=> {
382 return if empty_int
{
385 .struct_span_err_with_code(
386 self.mk_sp(start
, suffix_start
),
387 "no valid digits found for number",
391 (token
::Integer
, sym
::integer(0))
393 self.validate_int_literal(base
, start
, suffix_start
);
394 (token
::Integer
, self.symbol_from_to(start
, suffix_start
))
397 rustc_lexer
::LiteralKind
::Float { base, empty_exponent }
=> {
399 self.err_span_(start
, self.pos
, "expected at least one digit in exponent");
403 Base
::Hexadecimal
=> self.err_span_(
406 "hexadecimal float literal is not supported",
409 self.err_span_(start
, suffix_start
, "octal float literal is not supported")
412 self.err_span_(start
, suffix_start
, "binary float literal is not supported")
417 let id
= self.symbol_from_to(start
, suffix_start
);
418 return (token
::Float
, id
);
421 let content_start
= start
+ BytePos(prefix_len
);
422 let content_end
= suffix_start
- BytePos(postfix_len
);
423 let id
= self.symbol_from_to(content_start
, content_end
);
424 self.validate_literal_escape(mode
, content_start
, content_end
);
429 fn src_index(&self, pos
: BytePos
) -> usize {
430 (pos
- self.start_pos
).to_usize()
433 /// Slice of the source text from `start` up to but excluding `self.pos`,
434 /// meaning the slice does not include the character at `self.pos`.
435 fn str_from(&self, start
: BytePos
) -> &str {
436 self.str_from_to(start
, self.pos
)
439 /// Interns the source text from `start` up to but excluding `end` as a `Symbol`.
440 fn symbol_from_to(&self, start
: BytePos
, end
: BytePos
) -> Symbol
{
441 debug
!("taking an ident from {:?} to {:?}", start
, end
);
442 Symbol
::intern(self.str_from_to(start
, end
))
445 /// Slice of the source text spanning from `start` up to but excluding `end`.
446 fn str_from_to(&self, start
: BytePos
, end
: BytePos
) -> &str {
447 &self.src
[self.src_index(start
)..self.src_index(end
)]
450 fn report_raw_str_error(&self, start
: BytePos
, opt_err
: Option
<RawStrError
>) {
452 Some(RawStrError
::InvalidStarter { bad_char }
) => {
453 self.report_non_started_raw_string(start
, bad_char
)
455 Some(RawStrError
::NoTerminator { expected, found, possible_terminator_offset }
) => self
456 .report_unterminated_raw_string(start
, expected
, possible_terminator_offset
, found
),
457 Some(RawStrError
::TooManyDelimiters { found }
) => {
458 self.report_too_many_hashes(start
, found
)
464 fn report_non_started_raw_string(&self, start
: BytePos
, bad_char
: char) -> ! {
465 self.struct_fatal_span_char(
468 "found invalid character; only `#` is allowed in raw string delimitation",
475 fn report_unterminated_raw_string(
479 possible_offset
: Option
<usize>,
480 found_terminators
: usize,
482 let mut err
= self.sess
.span_diagnostic
.struct_span_fatal_with_code(
483 self.mk_sp(start
, start
),
484 "unterminated raw string",
488 err
.span_label(self.mk_sp(start
, start
), "unterminated raw string");
492 "this raw string should be terminated with `\"{}`",
497 if let Some(possible_offset
) = possible_offset
{
498 let lo
= start
+ BytePos(possible_offset
as u32);
499 let hi
= lo
+ BytePos(found_terminators
as u32);
500 let span
= self.mk_sp(lo
, hi
);
503 "consider terminating the string here",
504 "#".repeat(n_hashes
),
505 Applicability
::MaybeIncorrect
,
513 /// Note: It was decided to not add a test case, because it would be too big.
514 /// <https://github.com/rust-lang/rust/pull/50296#issuecomment-392135180>
515 fn report_too_many_hashes(&self, start
: BytePos
, found
: usize) -> ! {
520 "too many `#` symbols: raw strings may be delimited \
521 by up to 65535 `#` symbols, but found {}",
528 fn validate_literal_escape(&self, mode
: Mode
, content_start
: BytePos
, content_end
: BytePos
) {
529 let lit_content
= self.str_from_to(content_start
, content_end
);
530 unescape
::unescape_literal(lit_content
, mode
, &mut |range
, result
| {
531 // Here we only check for errors. The actual unescaping is done later.
532 if let Err(err
) = result
{
533 let span_with_quotes
=
534 self.mk_sp(content_start
- BytePos(1), content_end
+ BytePos(1));
536 &self.sess
.span_diagnostic
,
547 fn validate_int_literal(&self, base
: Base
, content_start
: BytePos
, content_end
: BytePos
) {
548 let base
= match base
{
553 let s
= self.str_from_to(content_start
+ BytePos(2), content_end
);
554 for (idx
, c
) in s
.char_indices() {
555 let idx
= idx
as u32;
556 if c
!= '_'
&& c
.to_digit(base
).is_none() {
557 let lo
= content_start
+ BytePos(2 + idx
);
558 let hi
= content_start
+ BytePos(2 + idx
+ c
.len_utf8() as u32);
559 self.err_span_(lo
, hi
, &format
!("invalid digit for a base {} literal", base
));
565 pub fn nfc_normalize(string
: &str) -> Symbol
{
566 use unicode_normalization
::{is_nfc_quick, IsNormalized, UnicodeNormalization}
;
567 match is_nfc_quick(string
.chars()) {
568 IsNormalized
::Yes
=> Symbol
::intern(string
),
570 let normalized_str
: String
= string
.chars().nfc().collect();
571 Symbol
::intern(&normalized_str
)