1 use crate::parse
::token
::{self, Token, TokenKind}
;
2 use crate::sess
::ParseSess
;
3 use crate::symbol
::{sym, Symbol}
;
4 use crate::parse
::unescape_error_reporting
::{emit_unescape_error, push_escaped_char}
;
6 use errors
::{FatalError, DiagnosticBuilder}
;
7 use syntax_pos
::{BytePos, Pos, Span}
;
9 use rustc_lexer
::unescape
;
12 use std
::convert
::TryInto
;
13 use rustc_data_structures
::sync
::Lrc
;
23 #[derive(Clone, Debug)]
24 pub struct UnmatchedBrace
{
25 pub expected_delim
: token
::DelimToken
,
26 pub found_delim
: Option
<token
::DelimToken
>,
28 pub unclosed_span
: Option
<Span
>,
29 pub candidate_span
: Option
<Span
>,
32 pub struct StringReader
<'a
> {
34 /// Initial position, read-only.
36 /// The absolute offset within the source_map of the current character.
38 /// Stop reading src at this index.
40 /// Source text to tokenize.
42 override_span
: Option
<Span
>,
45 impl<'a
> StringReader
<'a
> {
46 pub fn new(sess
: &'a ParseSess
,
47 source_file
: Lrc
<syntax_pos
::SourceFile
>,
48 override_span
: Option
<Span
>) -> Self {
49 if source_file
.src
.is_none() {
50 sess
.span_diagnostic
.bug(&format
!("cannot lex `source_file` without source: {}",
54 let src
= (*source_file
.src
.as_ref().unwrap()).clone();
58 start_pos
: source_file
.start_pos
,
59 pos
: source_file
.start_pos
,
60 end_src_index
: src
.len(),
66 pub fn retokenize(sess
: &'a ParseSess
, mut span
: Span
) -> Self {
67 let begin
= sess
.source_map().lookup_byte_offset(span
.lo());
68 let end
= sess
.source_map().lookup_byte_offset(span
.hi());
70 // Make the range zero-length if the span is invalid.
71 if span
.lo() > span
.hi() || begin
.sf
.start_pos
!= end
.sf
.start_pos
{
72 span
= span
.shrink_to_lo();
75 let mut sr
= StringReader
::new(sess
, begin
.sf
, None
);
77 // Seek the lexer to the right byte range.
78 sr
.end_src_index
= sr
.src_index(span
.hi());
84 fn mk_sp(&self, lo
: BytePos
, hi
: BytePos
) -> Span
{
85 self.override_span
.unwrap_or_else(|| Span
::with_root_ctxt(lo
, hi
))
88 /// Returns the next token, including trivia like whitespace or comments.
/// The token is returned directly rather than wrapped in a `Result`; lexical
/// errors are reported through `self.sess.span_diagnostic` when they are found.
92 pub fn next_token(&mut self) -> Token
{
93 let start_src_index
= self.src_index(self.pos
);
94 let text
: &str = &self.src
[start_src_index
..self.end_src_index
];
97 let span
= self.mk_sp(self.pos
, self.pos
);
98 return Token
::new(token
::Eof
, span
);
102 let is_beginning_of_file
= self.pos
== self.start_pos
;
103 if is_beginning_of_file
{
104 if let Some(shebang_len
) = rustc_lexer
::strip_shebang(text
) {
105 let start
= self.pos
;
106 self.pos
= self.pos
+ BytePos
::from_usize(shebang_len
);
108 let sym
= self.symbol_from(start
+ BytePos
::from_usize("#!".len()));
109 let kind
= token
::Shebang(sym
);
111 let span
= self.mk_sp(start
, self.pos
);
112 return Token
::new(kind
, span
);
117 let token
= rustc_lexer
::first_token(text
);
119 let start
= self.pos
;
120 self.pos
= self.pos
+ BytePos
::from_usize(token
.len
);
122 debug
!("try_next_token: {:?}({:?})", token
.kind
, self.str_from(start
));
124 // This could use `?`, but that makes code significantly (10-20%) slower.
125 // https://github.com/rust-lang/rust/issues/37939
126 let kind
= self.cook_lexer_token(token
.kind
, start
);
128 let span
= self.mk_sp(start
, self.pos
);
129 Token
::new(kind
, span
)
132 /// Report a fatal lexical error with a given span.
133 fn fatal_span(&self, sp
: Span
, m
: &str) -> FatalError
{
134 self.sess
.span_diagnostic
.span_fatal(sp
, m
)
137 /// Report a lexical error with a given span.
138 fn err_span(&self, sp
: Span
, m
: &str) {
139 self.sess
.span_diagnostic
.struct_span_err(sp
, m
).emit();
143 /// Report a fatal error spanning [`from_pos`, `to_pos`).
144 fn fatal_span_(&self, from_pos
: BytePos
, to_pos
: BytePos
, m
: &str) -> FatalError
{
145 self.fatal_span(self.mk_sp(from_pos
, to_pos
), m
)
148 /// Report a lexical error spanning [`from_pos`, `to_pos`).
149 fn err_span_(&self, from_pos
: BytePos
, to_pos
: BytePos
, m
: &str) {
150 self.err_span(self.mk_sp(from_pos
, to_pos
), m
)
153 fn struct_span_fatal(&self, from_pos
: BytePos
, to_pos
: BytePos
, m
: &str)
154 -> DiagnosticBuilder
<'a
>
156 self.sess
.span_diagnostic
.struct_span_fatal(self.mk_sp(from_pos
, to_pos
), m
)
159 fn struct_fatal_span_char(&self, from_pos
: BytePos
, to_pos
: BytePos
, m
: &str, c
: char)
160 -> DiagnosticBuilder
<'a
>
162 let mut m
= m
.to_string();
164 push_escaped_char(&mut m
, c
);
166 self.sess
.span_diagnostic
.struct_span_fatal(self.mk_sp(from_pos
, to_pos
), &m
[..])
169 /// Turns simple `rustc_lexer::TokenKind` enum into a rich
170 /// `libsyntax::TokenKind`. This turns strings into interned
171 /// symbols and runs additional validation.
174 token
: rustc_lexer
::TokenKind
,
178 rustc_lexer
::TokenKind
::LineComment
=> {
179 let string
= self.str_from(start
);
180 // comments with only more "/"s are not doc comments
181 let tok
= if is_doc_comment(string
) {
182 self.forbid_bare_cr(start
, string
, "bare CR not allowed in doc-comment");
183 token
::DocComment(Symbol
::intern(string
))
190 rustc_lexer
::TokenKind
::BlockComment { terminated }
=> {
191 let string
= self.str_from(start
);
192 // block comments starting with "/**" or "/*!" are doc-comments
193 // but comments with only "*"s between two "/"s are not
194 let is_doc_comment
= is_block_doc_comment(string
);
197 let msg
= if is_doc_comment
{
198 "unterminated block doc-comment"
200 "unterminated block comment"
202 let last_bpos
= self.pos
;
203 self.fatal_span_(start
, last_bpos
, msg
).raise();
206 let tok
= if is_doc_comment
{
207 self.forbid_bare_cr(start
,
209 "bare CR not allowed in block doc-comment");
210 token
::DocComment(Symbol
::intern(string
))
217 rustc_lexer
::TokenKind
::Whitespace
=> token
::Whitespace
,
218 rustc_lexer
::TokenKind
::Ident
| rustc_lexer
::TokenKind
::RawIdent
=> {
219 let is_raw_ident
= token
== rustc_lexer
::TokenKind
::RawIdent
;
220 let mut ident_start
= start
;
222 ident_start
= ident_start
+ BytePos(2);
224 // FIXME: perform NFKC normalization here. (Issue #2253)
225 let sym
= self.symbol_from(ident_start
);
227 let span
= self.mk_sp(start
, self.pos
);
228 if !sym
.can_be_raw() {
229 self.err_span(span
, &format
!("`{}` cannot be a raw identifier", sym
));
231 self.sess
.raw_identifier_spans
.borrow_mut().push(span
);
233 token
::Ident(sym
, is_raw_ident
)
235 rustc_lexer
::TokenKind
::Literal { kind, suffix_start }
=> {
236 let suffix_start
= start
+ BytePos(suffix_start
as u32);
237 let (kind
, symbol
) = self.cook_lexer_literal(start
, suffix_start
, kind
);
238 let suffix
= if suffix_start
< self.pos
{
239 let string
= self.str_from(suffix_start
);
241 self.sess
.span_diagnostic
242 .struct_span_warn(self.mk_sp(suffix_start
, self.pos
),
243 "underscore literal suffix is not allowed")
244 .warn("this was previously accepted by the compiler but is \
245 being phased out; it will become a hard error in \
247 .note("for more information, see issue #42326 \
248 <https://github.com/rust-lang/rust/issues/42326>")
252 Some(Symbol
::intern(string
))
257 token
::Literal(token
::Lit { kind, symbol, suffix }
)
259 rustc_lexer
::TokenKind
::Lifetime { starts_with_number }
=> {
260 // Include the leading `'` in the real identifier, for macro
261 // expansion purposes. See #12512 for the gory details of why
262 // this is necessary.
263 let lifetime_name
= self.str_from(start
);
264 if starts_with_number
{
268 "lifetimes cannot start with a number",
271 let ident
= Symbol
::intern(lifetime_name
);
272 token
::Lifetime(ident
)
274 rustc_lexer
::TokenKind
::Semi
=> token
::Semi
,
275 rustc_lexer
::TokenKind
::Comma
=> token
::Comma
,
276 rustc_lexer
::TokenKind
::Dot
=> token
::Dot
,
277 rustc_lexer
::TokenKind
::OpenParen
=> token
::OpenDelim(token
::Paren
),
278 rustc_lexer
::TokenKind
::CloseParen
=> token
::CloseDelim(token
::Paren
),
279 rustc_lexer
::TokenKind
::OpenBrace
=> token
::OpenDelim(token
::Brace
),
280 rustc_lexer
::TokenKind
::CloseBrace
=> token
::CloseDelim(token
::Brace
),
281 rustc_lexer
::TokenKind
::OpenBracket
=> token
::OpenDelim(token
::Bracket
),
282 rustc_lexer
::TokenKind
::CloseBracket
=> token
::CloseDelim(token
::Bracket
),
283 rustc_lexer
::TokenKind
::At
=> token
::At
,
284 rustc_lexer
::TokenKind
::Pound
=> token
::Pound
,
285 rustc_lexer
::TokenKind
::Tilde
=> token
::Tilde
,
286 rustc_lexer
::TokenKind
::Question
=> token
::Question
,
287 rustc_lexer
::TokenKind
::Colon
=> token
::Colon
,
288 rustc_lexer
::TokenKind
::Dollar
=> token
::Dollar
,
289 rustc_lexer
::TokenKind
::Eq
=> token
::Eq
,
290 rustc_lexer
::TokenKind
::Not
=> token
::Not
,
291 rustc_lexer
::TokenKind
::Lt
=> token
::Lt
,
292 rustc_lexer
::TokenKind
::Gt
=> token
::Gt
,
293 rustc_lexer
::TokenKind
::Minus
=> token
::BinOp(token
::Minus
),
294 rustc_lexer
::TokenKind
::And
=> token
::BinOp(token
::And
),
295 rustc_lexer
::TokenKind
::Or
=> token
::BinOp(token
::Or
),
296 rustc_lexer
::TokenKind
::Plus
=> token
::BinOp(token
::Plus
),
297 rustc_lexer
::TokenKind
::Star
=> token
::BinOp(token
::Star
),
298 rustc_lexer
::TokenKind
::Slash
=> token
::BinOp(token
::Slash
),
299 rustc_lexer
::TokenKind
::Caret
=> token
::BinOp(token
::Caret
),
300 rustc_lexer
::TokenKind
::Percent
=> token
::BinOp(token
::Percent
),
302 rustc_lexer
::TokenKind
::Unknown
=> {
303 let c
= self.str_from(start
).chars().next().unwrap();
304 let mut err
= self.struct_fatal_span_char(start
,
306 "unknown start of token",
// FIXME: the lexer could be used to turn the ASCII version of unicode homoglyphs into
// the token, instead of keeping a table in `check_for_substitution`. Ideally,
310 // this should be inside `rustc_lexer`. However, we should first remove compound
311 // tokens like `<<` from `rustc_lexer`, and then add fancier error recovery to it,
312 // as there will be less overall work to do this way.
313 let token
= unicode_chars
::check_for_substitution(self, start
, c
, &mut err
)
314 .unwrap_or_else(|| token
::Unknown(self.symbol_from(start
)));
321 fn cook_lexer_literal(
324 suffix_start
: BytePos
,
325 kind
: rustc_lexer
::LiteralKind
326 ) -> (token
::LitKind
, Symbol
) {
328 rustc_lexer
::LiteralKind
::Char { terminated }
=> {
330 self.fatal_span_(start
, suffix_start
,
331 "unterminated character literal".into())
334 let content_start
= start
+ BytePos(1);
335 let content_end
= suffix_start
- BytePos(1);
336 self.validate_char_escape(content_start
, content_end
);
337 let id
= self.symbol_from_to(content_start
, content_end
);
340 rustc_lexer
::LiteralKind
::Byte { terminated }
=> {
342 self.fatal_span_(start
+ BytePos(1), suffix_start
,
343 "unterminated byte constant".into())
346 let content_start
= start
+ BytePos(2);
347 let content_end
= suffix_start
- BytePos(1);
348 self.validate_byte_escape(content_start
, content_end
);
349 let id
= self.symbol_from_to(content_start
, content_end
);
352 rustc_lexer
::LiteralKind
::Str { terminated }
=> {
354 self.fatal_span_(start
, suffix_start
,
355 "unterminated double quote string".into())
358 let content_start
= start
+ BytePos(1);
359 let content_end
= suffix_start
- BytePos(1);
360 self.validate_str_escape(content_start
, content_end
);
361 let id
= self.symbol_from_to(content_start
, content_end
);
364 rustc_lexer
::LiteralKind
::ByteStr { terminated }
=> {
366 self.fatal_span_(start
+ BytePos(1), suffix_start
,
367 "unterminated double quote byte string".into())
370 let content_start
= start
+ BytePos(2);
371 let content_end
= suffix_start
- BytePos(1);
372 self.validate_byte_str_escape(content_start
, content_end
);
373 let id
= self.symbol_from_to(content_start
, content_end
);
376 rustc_lexer
::LiteralKind
::RawStr { n_hashes, started, terminated }
=> {
378 self.report_non_started_raw_string(start
);
381 self.report_unterminated_raw_string(start
, n_hashes
)
383 let n_hashes
: u16 = self.restrict_n_hashes(start
, n_hashes
);
384 let n
= u32::from(n_hashes
);
385 let content_start
= start
+ BytePos(2 + n
);
386 let content_end
= suffix_start
- BytePos(1 + n
);
387 self.validate_raw_str_escape(content_start
, content_end
);
388 let id
= self.symbol_from_to(content_start
, content_end
);
389 (token
::StrRaw(n_hashes
), id
)
391 rustc_lexer
::LiteralKind
::RawByteStr { n_hashes, started, terminated }
=> {
393 self.report_non_started_raw_string(start
);
396 self.report_unterminated_raw_string(start
, n_hashes
)
398 let n_hashes
: u16 = self.restrict_n_hashes(start
, n_hashes
);
399 let n
= u32::from(n_hashes
);
400 let content_start
= start
+ BytePos(3 + n
);
401 let content_end
= suffix_start
- BytePos(1 + n
);
402 self.validate_raw_byte_str_escape(content_start
, content_end
);
403 let id
= self.symbol_from_to(content_start
, content_end
);
404 (token
::ByteStrRaw(n_hashes
), id
)
406 rustc_lexer
::LiteralKind
::Int { base, empty_int }
=> {
408 self.err_span_(start
, suffix_start
, "no valid digits found for number");
409 (token
::Integer
, sym
::integer(0))
411 self.validate_int_literal(base
, start
, suffix_start
);
412 (token
::Integer
, self.symbol_from_to(start
, suffix_start
))
415 rustc_lexer
::LiteralKind
::Float { base, empty_exponent }
=> {
417 let mut err
= self.struct_span_fatal(
419 "expected at least one digit in exponent"
425 Base
::Hexadecimal
=> {
426 self.err_span_(start
, suffix_start
,
427 "hexadecimal float literal is not supported")
430 self.err_span_(start
, suffix_start
,
431 "octal float literal is not supported")
434 self.err_span_(start
, suffix_start
,
435 "binary float literal is not supported")
440 let id
= self.symbol_from_to(start
, suffix_start
);
447 fn src_index(&self, pos
: BytePos
) -> usize {
448 (pos
- self.start_pos
).to_usize()
451 /// Slice of the source text from `start` up to but excluding `self.pos`,
452 /// meaning the slice does not include the character `self.ch`.
453 fn str_from(&self, start
: BytePos
) -> &str
455 self.str_from_to(start
, self.pos
)
458 /// Creates a Symbol from a given offset to the current offset.
459 fn symbol_from(&self, start
: BytePos
) -> Symbol
{
460 debug
!("taking an ident from {:?} to {:?}", start
, self.pos
);
461 Symbol
::intern(self.str_from(start
))
464 /// As symbol_from, with an explicit endpoint.
465 fn symbol_from_to(&self, start
: BytePos
, end
: BytePos
) -> Symbol
{
466 debug
!("taking an ident from {:?} to {:?}", start
, end
);
467 Symbol
::intern(self.str_from_to(start
, end
))
470 /// Slice of the source text spanning from `start` up to but excluding `end`.
471 fn str_from_to(&self, start
: BytePos
, end
: BytePos
) -> &str
473 &self.src
[self.src_index(start
)..self.src_index(end
)]
476 fn forbid_bare_cr(&self, start
: BytePos
, s
: &str, errmsg
: &str) {
479 idx
= match s
[idx
..].find('
\r'
) {
481 Some(it
) => idx
+ it
+ 1
483 self.err_span_(start
+ BytePos(idx
as u32 - 1),
484 start
+ BytePos(idx
as u32),
489 fn report_non_started_raw_string(&self, start
: BytePos
) -> ! {
490 let bad_char
= self.str_from(start
).chars().last().unwrap();
492 .struct_fatal_span_char(
495 "found invalid character; only `#` is allowed \
496 in raw string delimitation",
503 fn report_unterminated_raw_string(&self, start
: BytePos
, n_hashes
: usize) -> ! {
504 let mut err
= self.struct_span_fatal(
506 "unterminated raw string",
509 self.mk_sp(start
, start
),
510 "unterminated raw string",
514 err
.note(&format
!("this raw string should be terminated with `\"{}`",
515 "#".repeat(n_hashes
as usize)));
522 fn restrict_n_hashes(&self, start
: BytePos
, n_hashes
: usize) -> u16 {
523 match n_hashes
.try_into() {
524 Ok(n_hashes
) => n_hashes
,
526 self.fatal_span_(start
,
528 "too many `#` symbols: raw strings may be \
529 delimited by up to 65535 `#` symbols").raise();
534 fn validate_char_escape(&self, content_start
: BytePos
, content_end
: BytePos
) {
535 let lit
= self.str_from_to(content_start
, content_end
);
536 if let Err((off
, err
)) = unescape
::unescape_char(lit
) {
538 &self.sess
.span_diagnostic
,
540 self.mk_sp(content_start
- BytePos(1), content_end
+ BytePos(1)),
541 unescape
::Mode
::Char
,
548 fn validate_byte_escape(&self, content_start
: BytePos
, content_end
: BytePos
) {
549 let lit
= self.str_from_to(content_start
, content_end
);
550 if let Err((off
, err
)) = unescape
::unescape_byte(lit
) {
552 &self.sess
.span_diagnostic
,
554 self.mk_sp(content_start
- BytePos(1), content_end
+ BytePos(1)),
555 unescape
::Mode
::Byte
,
562 fn validate_str_escape(&self, content_start
: BytePos
, content_end
: BytePos
) {
563 let lit
= self.str_from_to(content_start
, content_end
);
564 unescape
::unescape_str(lit
, &mut |range
, c
| {
565 if let Err(err
) = c
{
567 &self.sess
.span_diagnostic
,
569 self.mk_sp(content_start
- BytePos(1), content_end
+ BytePos(1)),
578 fn validate_raw_str_escape(&self, content_start
: BytePos
, content_end
: BytePos
) {
579 let lit
= self.str_from_to(content_start
, content_end
);
580 unescape
::unescape_raw_str(lit
, &mut |range
, c
| {
581 if let Err(err
) = c
{
583 &self.sess
.span_diagnostic
,
585 self.mk_sp(content_start
- BytePos(1), content_end
+ BytePos(1)),
594 fn validate_raw_byte_str_escape(&self, content_start
: BytePos
, content_end
: BytePos
) {
595 let lit
= self.str_from_to(content_start
, content_end
);
596 unescape
::unescape_raw_byte_str(lit
, &mut |range
, c
| {
597 if let Err(err
) = c
{
599 &self.sess
.span_diagnostic
,
601 self.mk_sp(content_start
- BytePos(1), content_end
+ BytePos(1)),
602 unescape
::Mode
::ByteStr
,
610 fn validate_byte_str_escape(&self, content_start
: BytePos
, content_end
: BytePos
) {
611 let lit
= self.str_from_to(content_start
, content_end
);
612 unescape
::unescape_byte_str(lit
, &mut |range
, c
| {
613 if let Err(err
) = c
{
615 &self.sess
.span_diagnostic
,
617 self.mk_sp(content_start
- BytePos(1), content_end
+ BytePos(1)),
618 unescape
::Mode
::ByteStr
,
626 fn validate_int_literal(&self, base
: Base
, content_start
: BytePos
, content_end
: BytePos
) {
627 let base
= match base
{
632 let s
= self.str_from_to(content_start
+ BytePos(2), content_end
);
633 for (idx
, c
) in s
.char_indices() {
634 let idx
= idx
as u32;
635 if c
!= '_'
&& c
.to_digit(base
).is_none() {
636 let lo
= content_start
+ BytePos(2 + idx
);
637 let hi
= content_start
+ BytePos(2 + idx
+ c
.len_utf8() as u32);
638 self.err_span_(lo
, hi
,
639 &format
!("invalid digit for a base {} literal", base
));
/// Returns `true` when `s`, the full text of a line comment, is a doc comment.
///
/// `//!` comments always qualify. `///` comments qualify unless the fourth byte
/// is another `/`, so `////...` is treated as an ordinary comment.
fn is_doc_comment(s: &str) -> bool {
    // A bare `///` has no fourth byte and still counts as a doc comment.
    let res = (s.starts_with("///") && s.as_bytes().get(3) != Some(&b'/'))
        || s.starts_with("//!");
    debug!("is {:?} a doc comment? {}", s, res);
    res
}
/// Returns `true` when `s`, the full text of a block comment, is a doc comment.
///
/// The comment must be at least five bytes long and start with `/*!`, or with
/// `/**` not immediately followed by another `*`.
fn is_block_doc_comment(s: &str) -> bool {
    // Prevent `/**/` from being parsed as a doc comment
    let res = ((s.starts_with("/**") && s.as_bytes().get(3) != Some(&b'*'))
        || s.starts_with("/*!"))
        && s.len() >= 5;
    debug!("is {:?} a doc comment? {}", s, res);
    res
}