// compiler/rustc_parse/src/lexer/mod.rs (upstream rustc 1.71.1)
use std::ops::Range;

use crate::errors;
use crate::lexer::unicode_chars::UNICODE_ARRAY;
use crate::make_unclosed_delims_error;
use rustc_ast::ast::{self, AttrStyle};
use rustc_ast::token::{self, CommentKind, Delimiter, Token, TokenKind};
use rustc_ast::tokenstream::TokenStream;
use rustc_ast::util::unicode::contains_text_flow_control_chars;
use rustc_errors::{error_code, Applicability, Diagnostic, DiagnosticBuilder, StashKey};
use rustc_lexer::unescape::{self, EscapeError, Mode};
use rustc_lexer::Cursor;
use rustc_lexer::{Base, DocStyle, RawStrError};
use rustc_session::lint::builtin::{
    RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX, TEXT_DIRECTION_CODEPOINT_IN_COMMENT,
};
use rustc_session::lint::BuiltinLintDiagnostics;
use rustc_session::parse::ParseSess;
use rustc_span::symbol::{sym, Symbol};
use rustc_span::{edition::Edition, BytePos, Pos, Span};

mod diagnostics;
mod tokentrees;
mod unescape_error_reporting;
mod unicode_chars;

use unescape_error_reporting::{emit_unescape_error, escaped_char};

// This type is used a lot. Make sure it doesn't unintentionally get bigger.
//
// This assertion is in this crate, rather than in `rustc_lexer`, because that
// crate cannot depend on `rustc_data_structures`.
#[cfg(all(target_arch = "x86_64", target_pointer_width = "64"))]
rustc_data_structures::static_assert_size!(rustc_lexer::Token, 12);

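/// A delimiter mismatch recorded while building token trees: the delimiter that
/// was expected, the one actually found, and the spans needed to report it.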
#[derive(Clone, Debug)]
pub struct UnmatchedDelim {
    pub expected_delim: Delimiter,
    pub found_delim: Option<Delimiter>,
    pub found_span: Span,
    pub unclosed_span: Option<Span>,
    pub candidate_span: Option<Span>,
}

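/// Lexes `src` into a `TokenStream`, buffering any unmatched- or
/// unclosed-delimiter diagnostics and returning them to the caller instead of
/// emitting them directly.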
pub(crate) fn parse_token_trees<'a>(
    sess: &'a ParseSess,
    mut src: &'a str,
    mut start_pos: BytePos,
    override_span: Option<Span>,
) -> Result<TokenStream, Vec<Diagnostic>> {
    // Skip `#!`, if present.
    if let Some(shebang_len) = rustc_lexer::strip_shebang(src) {
        src = &src[shebang_len..];
        start_pos = start_pos + BytePos::from_usize(shebang_len);
    }

    let cursor = Cursor::new(src);
    let string_reader = StringReader {
        sess,
        start_pos,
        pos: start_pos,
        src,
        cursor,
        override_span,
        nbsp_is_whitespace: false,
    };
    let (token_trees, unmatched_delims) =
        tokentrees::TokenTreesReader::parse_all_token_trees(string_reader);
    match token_trees {
        Ok(stream) if unmatched_delims.is_empty() => Ok(stream),
        _ => {
            // Return an error if there are unmatched delimiters or unclosed delimiters.
            // Delimiter mismatch errors are emitted first, then the unclosed-delimiter
            // error, because the mismatch is more likely to be the root cause.

            let mut buffer = Vec::with_capacity(1);
            // Not using `emit_unclosed_delims` here so that the errors can be buffered
            // with `err.buffer` instead of being emitted immediately.
            for unmatched in unmatched_delims {
                if let Some(err) = make_unclosed_delims_error(unmatched, &sess) {
                    err.buffer(&mut buffer);
                }
            }
            if let Err(err) = token_trees {
                // Add the unclosed-delimiter error.
                err.buffer(&mut buffer);
            }
            Err(buffer)
        }
    }
}

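/// Low-level lexer driver: wraps a `rustc_lexer::Cursor` over `src` and "cooks"
/// its raw tokens into `rustc_ast` tokens, emitting diagnostics along the way.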
struct StringReader<'a> {
    sess: &'a ParseSess,
    /// Initial position, read-only.
    start_pos: BytePos,
    /// The absolute offset within the source_map of the current character.
    pos: BytePos,
    /// Source text to tokenize.
    src: &'a str,
    /// Cursor for getting lexer tokens.
    cursor: Cursor<'a>,
    override_span: Option<Span>,
    /// When an "unknown start of token: \u{a0}" error has already been emitted
    /// earlier in this file, it's safe to treat further occurrences of the
    /// non-breaking space character as whitespace.
    nbsp_is_whitespace: bool,
}

impl<'a> StringReader<'a> {
    fn mk_sp(&self, lo: BytePos, hi: BytePos) -> Span {
        self.override_span.unwrap_or_else(|| Span::with_root_ctxt(lo, hi))
    }

    /// Returns the next token, paired with a bool indicating if the token was
    /// preceded by whitespace.
    fn next_token(&mut self) -> (Token, bool) {
        let mut preceded_by_whitespace = false;
        let mut swallow_next_invalid = 0;
        // Skip trivial (whitespace & comments) tokens.
        loop {
            let token = self.cursor.advance_token();
            let start = self.pos;
            self.pos = self.pos + BytePos(token.len);

            debug!("next_token: {:?}({:?})", token.kind, self.str_from(start));

            // Now "cook" the token, converting the simple `rustc_lexer::TokenKind` enum into a
            // rich `rustc_ast::TokenKind`. This turns strings into interned symbols and runs
            // additional validation.
            let kind = match token.kind {
                rustc_lexer::TokenKind::LineComment { doc_style } => {
                    // Skip non-doc comments.
                    let Some(doc_style) = doc_style else {
                        self.lint_unicode_text_flow(start);
                        preceded_by_whitespace = true;
                        continue;
                    };

                    // The opening delimiter (of length 3) is not included in the symbol.
                    let content_start = start + BytePos(3);
                    let content = self.str_from(content_start);
                    self.cook_doc_comment(content_start, content, CommentKind::Line, doc_style)
                }
                rustc_lexer::TokenKind::BlockComment { doc_style, terminated } => {
                    if !terminated {
                        self.report_unterminated_block_comment(start, doc_style);
                    }

                    // Skip non-doc comments.
                    let Some(doc_style) = doc_style else {
                        self.lint_unicode_text_flow(start);
                        preceded_by_whitespace = true;
                        continue;
                    };

                    // The opening delimiter (of length 3) and closing delimiter (of length 2)
                    // are not included in the symbol.
                    let content_start = start + BytePos(3);
                    let content_end = self.pos - BytePos(if terminated { 2 } else { 0 });
                    let content = self.str_from_to(content_start, content_end);
                    self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style)
                }
                rustc_lexer::TokenKind::Whitespace => {
                    preceded_by_whitespace = true;
                    continue;
                }
                rustc_lexer::TokenKind::Ident => {
                    let sym = nfc_normalize(self.str_from(start));
                    let span = self.mk_sp(start, self.pos);
                    self.sess.symbol_gallery.insert(sym, span);
                    token::Ident(sym, false)
                }
                rustc_lexer::TokenKind::RawIdent => {
                    let sym = nfc_normalize(self.str_from(start + BytePos(2)));
                    let span = self.mk_sp(start, self.pos);
                    self.sess.symbol_gallery.insert(sym, span);
                    if !sym.can_be_raw() {
                        self.sess.emit_err(errors::CannotBeRawIdent { span, ident: sym });
                    }
                    self.sess.raw_identifier_spans.push(span);
                    token::Ident(sym, true)
                }
                rustc_lexer::TokenKind::UnknownPrefix => {
                    self.report_unknown_prefix(start);
                    let sym = nfc_normalize(self.str_from(start));
                    let span = self.mk_sp(start, self.pos);
                    self.sess.symbol_gallery.insert(sym, span);
                    token::Ident(sym, false)
                }
                rustc_lexer::TokenKind::InvalidIdent
                    // Do not recover an identifier with emoji if the codepoint is a confusable
                    // with a recoverable substitution token, like `➖`.
                    if !UNICODE_ARRAY
                        .iter()
                        .any(|&(c, _, _)| {
                            let sym = self.str_from(start);
                            sym.chars().count() == 1 && c == sym.chars().next().unwrap()
                        }) =>
                {
                    let sym = nfc_normalize(self.str_from(start));
                    let span = self.mk_sp(start, self.pos);
                    self.sess.bad_unicode_identifiers.borrow_mut().entry(sym).or_default()
                        .push(span);
                    token::Ident(sym, false)
                }
                rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
                    let suffix_start = start + BytePos(suffix_start);
                    let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
                    if let token::LitKind::CStr | token::LitKind::CStrRaw(_) = kind {
                        self.sess.gated_spans.gate(sym::c_str_literals, self.mk_sp(start, self.pos));
                    }
                    let suffix = if suffix_start < self.pos {
                        let string = self.str_from(suffix_start);
                        if string == "_" {
                            self.sess
                                .span_diagnostic
                                .emit_err(errors::UnderscoreLiteralSuffix { span: self.mk_sp(suffix_start, self.pos) });
                            None
                        } else {
                            Some(Symbol::intern(string))
                        }
                    } else {
                        None
                    };
                    token::Literal(token::Lit { kind, symbol, suffix })
                }
                rustc_lexer::TokenKind::Lifetime { starts_with_number } => {
                    // Include the leading `'` in the real identifier, for macro
                    // expansion purposes. See #12512 for the gory details of why
                    // this is necessary.
                    let lifetime_name = self.str_from(start);
                    if starts_with_number {
                        let span = self.mk_sp(start, self.pos);
                        let mut diag = self.sess.struct_err("lifetimes cannot start with a number");
                        diag.set_span(span);
                        diag.stash(span, StashKey::LifetimeIsChar);
                    }
                    let ident = Symbol::intern(lifetime_name);
                    token::Lifetime(ident)
                }
                rustc_lexer::TokenKind::Semi => token::Semi,
                rustc_lexer::TokenKind::Comma => token::Comma,
                rustc_lexer::TokenKind::Dot => token::Dot,
                rustc_lexer::TokenKind::OpenParen => token::OpenDelim(Delimiter::Parenthesis),
                rustc_lexer::TokenKind::CloseParen => token::CloseDelim(Delimiter::Parenthesis),
                rustc_lexer::TokenKind::OpenBrace => token::OpenDelim(Delimiter::Brace),
                rustc_lexer::TokenKind::CloseBrace => token::CloseDelim(Delimiter::Brace),
                rustc_lexer::TokenKind::OpenBracket => token::OpenDelim(Delimiter::Bracket),
                rustc_lexer::TokenKind::CloseBracket => token::CloseDelim(Delimiter::Bracket),
                rustc_lexer::TokenKind::At => token::At,
                rustc_lexer::TokenKind::Pound => token::Pound,
                rustc_lexer::TokenKind::Tilde => token::Tilde,
                rustc_lexer::TokenKind::Question => token::Question,
                rustc_lexer::TokenKind::Colon => token::Colon,
                rustc_lexer::TokenKind::Dollar => token::Dollar,
                rustc_lexer::TokenKind::Eq => token::Eq,
                rustc_lexer::TokenKind::Bang => token::Not,
                rustc_lexer::TokenKind::Lt => token::Lt,
                rustc_lexer::TokenKind::Gt => token::Gt,
                rustc_lexer::TokenKind::Minus => token::BinOp(token::Minus),
                rustc_lexer::TokenKind::And => token::BinOp(token::And),
                rustc_lexer::TokenKind::Or => token::BinOp(token::Or),
                rustc_lexer::TokenKind::Plus => token::BinOp(token::Plus),
                rustc_lexer::TokenKind::Star => token::BinOp(token::Star),
                rustc_lexer::TokenKind::Slash => token::BinOp(token::Slash),
                rustc_lexer::TokenKind::Caret => token::BinOp(token::Caret),
                rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent),

                rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => {
                    // Don't emit diagnostics for sequences of the same invalid token.
                    if swallow_next_invalid > 0 {
                        swallow_next_invalid -= 1;
                        continue;
                    }
                    let mut it = self.str_from_to_end(start).chars();
                    let c = it.next().unwrap();
                    if c == '\u{00a0}' {
                        // If an error has already been reported on non-breaking
                        // space characters earlier in the file, treat all
                        // subsequent occurrences as whitespace.
                        if self.nbsp_is_whitespace {
                            preceded_by_whitespace = true;
                            continue;
                        }
                        self.nbsp_is_whitespace = true;
                    }
                    let repeats = it.take_while(|c1| *c1 == c).count();
                    // FIXME: the lexer could turn unicode homoglyphs into the corresponding
                    // ASCII token, instead of keeping a table in `check_for_substitution`.
                    // Ideally, this should be inside `rustc_lexer`. However, we should first
                    // remove compound tokens like `<<` from `rustc_lexer`, and then add fancier
                    // error recovery to it, as there will be less overall work to do this way.
                    let (token, sugg) = unicode_chars::check_for_substitution(self, start, c, repeats + 1);
                    self.sess.emit_err(errors::UnknownTokenStart {
                        span: self.mk_sp(start, self.pos + Pos::from_usize(repeats * c.len_utf8())),
                        escaped: escaped_char(c),
                        sugg,
                        null: if c == '\x00' { Some(errors::UnknownTokenNull) } else { None },
                        repeat: if repeats > 0 {
                            swallow_next_invalid = repeats;
                            Some(errors::UnknownTokenRepeat { repeats })
                        } else {
                            None
                        },
                    });

                    if let Some(token) = token {
                        token
                    } else {
                        preceded_by_whitespace = true;
                        continue;
                    }
                }
                rustc_lexer::TokenKind::Eof => token::Eof,
            };
            let span = self.mk_sp(start, self.pos);
            return (Token::new(kind, span), preceded_by_whitespace);
        }
    }

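    /// Builds (without emitting) a fatal diagnostic of the form `"<m>: <char>"`,
    /// with the character escaped for display, spanning `from_pos..to_pos`.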
    fn struct_fatal_span_char(
        &self,
        from_pos: BytePos,
        to_pos: BytePos,
        m: &str,
        c: char,
    ) -> DiagnosticBuilder<'a, !> {
        self.sess
            .span_diagnostic
            .struct_span_fatal(self.mk_sp(from_pos, to_pos), format!("{}: {}", m, escaped_char(c)))
    }

    /// Detect usages of Unicode codepoints changing the direction of the text on screen and loudly
    /// complain about it.
    fn lint_unicode_text_flow(&self, start: BytePos) {
        // The opening delimiter (of length 2) is not included in the comment text.
        let content_start = start + BytePos(2);
        let content = self.str_from(content_start);
        if contains_text_flow_control_chars(content) {
            let span = self.mk_sp(start, self.pos);
            self.sess.buffer_lint_with_diagnostic(
                &TEXT_DIRECTION_CODEPOINT_IN_COMMENT,
                span,
                ast::CRATE_NODE_ID,
                "unicode codepoint changing visible direction of text present in comment",
                BuiltinLintDiagnostics::UnicodeTextFlow(span, content.to_string()),
            );
        }
    }

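    /// Turns the body of a doc comment into a `DocComment` token, reporting any
    /// bare CR characters, which are not allowed in doc comments.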
    fn cook_doc_comment(
        &self,
        content_start: BytePos,
        content: &str,
        comment_kind: CommentKind,
        doc_style: DocStyle,
    ) -> TokenKind {
        if content.contains('\r') {
            for (idx, _) in content.char_indices().filter(|&(_, c)| c == '\r') {
                let span = self.mk_sp(
                    content_start + BytePos(idx as u32),
                    content_start + BytePos(idx as u32 + 1),
                );
                let block = matches!(comment_kind, CommentKind::Block);
                self.sess.emit_err(errors::CrDocComment { span, block });
            }
        }

        let attr_style = match doc_style {
            DocStyle::Outer => AttrStyle::Outer,
            DocStyle::Inner => AttrStyle::Inner,
        };

        token::DocComment(comment_kind, attr_style, Symbol::intern(content))
    }

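    /// Converts a `rustc_lexer::LiteralKind` into a `token::LitKind` plus the
    /// interned literal text, reporting unterminated or otherwise malformed
    /// literals along the way.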
    fn cook_lexer_literal(
        &self,
        start: BytePos,
        end: BytePos,
        kind: rustc_lexer::LiteralKind,
    ) -> (token::LitKind, Symbol) {
        match kind {
            rustc_lexer::LiteralKind::Char { terminated } => {
                if !terminated {
                    self.sess.span_diagnostic.span_fatal_with_code(
                        self.mk_sp(start, end),
                        "unterminated character literal",
                        error_code!(E0762),
                    )
                }
                self.cook_quoted(token::Char, Mode::Char, start, end, 1, 1) // ' '
            }
            rustc_lexer::LiteralKind::Byte { terminated } => {
                if !terminated {
                    self.sess.span_diagnostic.span_fatal_with_code(
                        self.mk_sp(start + BytePos(1), end),
                        "unterminated byte constant",
                        error_code!(E0763),
                    )
                }
                self.cook_quoted(token::Byte, Mode::Byte, start, end, 2, 1) // b' '
            }
            rustc_lexer::LiteralKind::Str { terminated } => {
                if !terminated {
                    self.sess.span_diagnostic.span_fatal_with_code(
                        self.mk_sp(start, end),
                        "unterminated double quote string",
                        error_code!(E0765),
                    )
                }
                self.cook_quoted(token::Str, Mode::Str, start, end, 1, 1) // " "
            }
            rustc_lexer::LiteralKind::ByteStr { terminated } => {
                if !terminated {
                    self.sess.span_diagnostic.span_fatal_with_code(
                        self.mk_sp(start + BytePos(1), end),
                        "unterminated double quote byte string",
                        error_code!(E0766),
                    )
                }
                self.cook_quoted(token::ByteStr, Mode::ByteStr, start, end, 2, 1) // b" "
            }
            rustc_lexer::LiteralKind::CStr { terminated } => {
                if !terminated {
                    self.sess.span_diagnostic.span_fatal_with_code(
                        self.mk_sp(start + BytePos(1), end),
                        "unterminated C string",
                        error_code!(E0767),
                    )
                }
                self.cook_c_string(token::CStr, Mode::CStr, start, end, 2, 1) // c" "
            }
            rustc_lexer::LiteralKind::RawStr { n_hashes } => {
                if let Some(n_hashes) = n_hashes {
                    let n = u32::from(n_hashes);
                    let kind = token::StrRaw(n_hashes);
                    self.cook_quoted(kind, Mode::RawStr, start, end, 2 + n, 1 + n) // r##" "##
                } else {
                    self.report_raw_str_error(start, 1);
                }
            }
            rustc_lexer::LiteralKind::RawByteStr { n_hashes } => {
                if let Some(n_hashes) = n_hashes {
                    let n = u32::from(n_hashes);
                    let kind = token::ByteStrRaw(n_hashes);
                    self.cook_quoted(kind, Mode::RawByteStr, start, end, 3 + n, 1 + n) // br##" "##
                } else {
                    self.report_raw_str_error(start, 2);
                }
            }
            rustc_lexer::LiteralKind::RawCStr { n_hashes } => {
                if let Some(n_hashes) = n_hashes {
                    let n = u32::from(n_hashes);
                    let kind = token::CStrRaw(n_hashes);
                    self.cook_c_string(kind, Mode::RawCStr, start, end, 3 + n, 1 + n) // cr##" "##
                } else {
                    self.report_raw_str_error(start, 2);
                }
            }
            rustc_lexer::LiteralKind::Int { base, empty_int } => {
                if empty_int {
                    let span = self.mk_sp(start, end);
                    self.sess.emit_err(errors::NoDigitsLiteral { span });
                    (token::Integer, sym::integer(0))
                } else {
                    if matches!(base, Base::Binary | Base::Octal) {
                        let base = base as u32;
                        let s = self.str_from_to(start + BytePos(2), end);
                        for (idx, c) in s.char_indices() {
                            let span = self.mk_sp(
                                start + BytePos::from_usize(2 + idx),
                                start + BytePos::from_usize(2 + idx + c.len_utf8()),
                            );
                            if c != '_' && c.to_digit(base).is_none() {
                                self.sess.emit_err(errors::InvalidDigitLiteral { span, base });
                            }
                        }
                    }
                    (token::Integer, self.symbol_from_to(start, end))
                }
            }
            rustc_lexer::LiteralKind::Float { base, empty_exponent } => {
                if empty_exponent {
                    let span = self.mk_sp(start, self.pos);
                    self.sess.emit_err(errors::EmptyExponentFloat { span });
                }
                let base = match base {
                    Base::Hexadecimal => Some("hexadecimal"),
                    Base::Octal => Some("octal"),
                    Base::Binary => Some("binary"),
                    _ => None,
                };
                if let Some(base) = base {
                    let span = self.mk_sp(start, end);
                    self.sess.emit_err(errors::FloatLiteralUnsupportedBase { span, base });
                }
                (token::Float, self.symbol_from_to(start, end))
            }
        }
    }

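    /// Converts an absolute `BytePos` into an index into `self.src`.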
    #[inline]
    fn src_index(&self, pos: BytePos) -> usize {
        (pos - self.start_pos).to_usize()
    }

    /// Slice of the source text from `start` up to but excluding `self.pos`,
    /// meaning the slice does not include the character at `self.pos`.
    fn str_from(&self, start: BytePos) -> &'a str {
        self.str_from_to(start, self.pos)
    }

    /// Interns the slice of source text from `start` up to but excluding `end`.
    fn symbol_from_to(&self, start: BytePos, end: BytePos) -> Symbol {
        debug!("taking an ident from {:?} to {:?}", start, end);
        Symbol::intern(self.str_from_to(start, end))
    }

    /// Slice of the source text spanning from `start` up to but excluding `end`.
    fn str_from_to(&self, start: BytePos, end: BytePos) -> &'a str {
        &self.src[self.src_index(start)..self.src_index(end)]
    }

    /// Slice of the source text spanning from `start` until the end.
    fn str_from_to_end(&self, start: BytePos) -> &'a str {
        &self.src[self.src_index(start)..]
    }

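    /// Re-validates a raw string literal that failed to lex, so that the precise
    /// failure (bad starter, missing terminator, too many `#`s) can be reported.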
    fn report_raw_str_error(&self, start: BytePos, prefix_len: u32) -> ! {
        match rustc_lexer::validate_raw_str(self.str_from(start), prefix_len) {
            Err(RawStrError::InvalidStarter { bad_char }) => {
                self.report_non_started_raw_string(start, bad_char)
            }
            Err(RawStrError::NoTerminator { expected, found, possible_terminator_offset }) => self
                .report_unterminated_raw_string(start, expected, possible_terminator_offset, found),
            Err(RawStrError::TooManyDelimiters { found }) => {
                self.report_too_many_hashes(start, found)
            }
            Ok(()) => panic!("no error found for supposedly invalid raw string literal"),
        }
    }

    fn report_non_started_raw_string(&self, start: BytePos, bad_char: char) -> ! {
        self.struct_fatal_span_char(
            start,
            self.pos,
            "found invalid character; only `#` is allowed in raw string delimitation",
            bad_char,
        )
        .emit()
    }

    fn report_unterminated_raw_string(
        &self,
        start: BytePos,
        n_hashes: u32,
        possible_offset: Option<u32>,
        found_terminators: u32,
    ) -> ! {
        let mut err = self.sess.span_diagnostic.struct_span_fatal_with_code(
            self.mk_sp(start, start),
            "unterminated raw string",
            error_code!(E0748),
        );

        err.span_label(self.mk_sp(start, start), "unterminated raw string");

        if n_hashes > 0 {
            err.note(format!(
                "this raw string should be terminated with `\"{}`",
                "#".repeat(n_hashes as usize)
            ));
        }

        if let Some(possible_offset) = possible_offset {
            let lo = start + BytePos(possible_offset);
            let hi = lo + BytePos(found_terminators);
            let span = self.mk_sp(lo, hi);
            err.span_suggestion(
                span,
                "consider terminating the string here",
                "#".repeat(n_hashes as usize),
                Applicability::MaybeIncorrect,
            );
        }

        err.emit()
    }

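    /// Reports an unterminated block comment. If the comment contains nested
    /// `/* ... */` pairs, the last nested pair is pointed out as the likely culprit.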
    fn report_unterminated_block_comment(&self, start: BytePos, doc_style: Option<DocStyle>) {
        let msg = match doc_style {
            Some(_) => "unterminated block doc-comment",
            None => "unterminated block comment",
        };
        let last_bpos = self.pos;
        let mut err = self.sess.span_diagnostic.struct_span_fatal_with_code(
            self.mk_sp(start, last_bpos),
            msg,
            error_code!(E0758),
        );
        let mut nested_block_comment_open_idxs = vec![];
        let mut last_nested_block_comment_idxs = None;
        let mut content_chars = self.str_from(start).char_indices().peekable();

        while let Some((idx, current_char)) = content_chars.next() {
            match content_chars.peek() {
                Some((_, '*')) if current_char == '/' => {
                    nested_block_comment_open_idxs.push(idx);
                }
                Some((_, '/')) if current_char == '*' => {
                    last_nested_block_comment_idxs =
                        nested_block_comment_open_idxs.pop().map(|open_idx| (open_idx, idx));
                }
                _ => {}
            };
        }

        if let Some((nested_open_idx, nested_close_idx)) = last_nested_block_comment_idxs {
            err.span_label(self.mk_sp(start, start + BytePos(2)), msg)
                .span_label(
                    self.mk_sp(
                        start + BytePos(nested_open_idx as u32),
                        start + BytePos(nested_open_idx as u32 + 2),
                    ),
                    "...as last nested comment starts here, maybe you want to close this instead?",
                )
                .span_label(
                    self.mk_sp(
                        start + BytePos(nested_close_idx as u32),
                        start + BytePos(nested_close_idx as u32 + 2),
                    ),
                    "...and last nested comment terminates here.",
                );
        }

        err.emit();
    }

    // RFC 3101 introduced the idea of (reserved) prefixes. As of Rust 2021,
    // using an (unknown) prefix is an error. In earlier editions, however,
    // unknown prefixes only result in an (allow-by-default) lint, and are
    // treated as regular identifier tokens.
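    // (A prefix here is an identifier immediately followed by `#`, `"`, or `'`
    // with no whitespace in between, e.g. `foo#bar` or `foo"bar"`.)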
    fn report_unknown_prefix(&self, start: BytePos) {
        let prefix_span = self.mk_sp(start, self.pos);
        let prefix = self.str_from_to(start, self.pos);

        let expn_data = prefix_span.ctxt().outer_expn_data();

        if expn_data.edition >= Edition::Edition2021 {
            // In Rust 2021, this is a hard error.
            let sugg = if prefix == "rb" {
                Some(errors::UnknownPrefixSugg::UseBr(prefix_span))
            } else if expn_data.is_root() {
                Some(errors::UnknownPrefixSugg::Whitespace(prefix_span.shrink_to_hi()))
            } else {
                None
            };
            self.sess.emit_err(errors::UnknownPrefix { span: prefix_span, prefix, sugg });
        } else {
            // Before Rust 2021, only emit a lint for migration.
            self.sess.buffer_lint_with_diagnostic(
                &RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX,
                prefix_span,
                ast::CRATE_NODE_ID,
                format!("prefix `{prefix}` is unknown"),
                BuiltinLintDiagnostics::ReservedPrefix(prefix_span),
            );
        }
    }

    fn report_too_many_hashes(&self, start: BytePos, num: u32) -> ! {
        self.sess.emit_fatal(errors::TooManyHashes { span: self.mk_sp(start, self.pos), num });
    }

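    /// Shared literal-cooking logic: strips `prefix_len`/`postfix_len` delimiter
    /// bytes, runs the given `unescape` check over the remaining contents, and
    /// interns the result (or falls back to `token::Err` on a fatal escape error).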
    fn cook_common(
        &self,
        kind: token::LitKind,
        mode: Mode,
        start: BytePos,
        end: BytePos,
        prefix_len: u32,
        postfix_len: u32,
        unescape: fn(&str, Mode, &mut dyn FnMut(Range<usize>, Result<(), EscapeError>)),
    ) -> (token::LitKind, Symbol) {
        let mut has_fatal_err = false;
        let content_start = start + BytePos(prefix_len);
        let content_end = end - BytePos(postfix_len);
        let lit_content = self.str_from_to(content_start, content_end);
        unescape(lit_content, mode, &mut |range, result| {
            // Here we only check for errors. The actual unescaping is done later.
            if let Err(err) = result {
                let span_with_quotes = self.mk_sp(start, end);
                let (start, end) = (range.start as u32, range.end as u32);
                let lo = content_start + BytePos(start);
                let hi = lo + BytePos(end - start);
                let span = self.mk_sp(lo, hi);
                if err.is_fatal() {
                    has_fatal_err = true;
                }
                emit_unescape_error(
                    &self.sess.span_diagnostic,
                    lit_content,
                    span_with_quotes,
                    span,
                    mode,
                    range,
                    err,
                );
            }
        });

        // We normally exclude the quotes from the symbol, but for errors we
        // include them because that results in clearer error messages.
        if !has_fatal_err {
            (kind, Symbol::intern(lit_content))
        } else {
            (token::Err, self.symbol_from_to(start, end))
        }
    }

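    /// Cooks ordinary quoted literals (char, byte, string, byte string, and their
    /// raw forms) by checking their escapes with `unescape::unescape_literal`.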
    fn cook_quoted(
        &self,
        kind: token::LitKind,
        mode: Mode,
        start: BytePos,
        end: BytePos,
        prefix_len: u32,
        postfix_len: u32,
    ) -> (token::LitKind, Symbol) {
        self.cook_common(kind, mode, start, end, prefix_len, postfix_len, |src, mode, callback| {
            unescape::unescape_literal(src, mode, &mut |span, result| {
                callback(span, result.map(drop))
            })
        })
    }

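    /// Cooks C string literals (`c"..."`, `cr#"..."#`) by checking their escapes
    /// with `unescape::unescape_c_string`.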
    fn cook_c_string(
        &self,
        kind: token::LitKind,
        mode: Mode,
        start: BytePos,
        end: BytePos,
        prefix_len: u32,
        postfix_len: u32,
    ) -> (token::LitKind, Symbol) {
        self.cook_common(kind, mode, start, end, prefix_len, postfix_len, |src, mode, callback| {
            unescape::unescape_c_string(src, mode, &mut |span, result| {
                callback(span, result.map(drop))
            })
        })
    }
}

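/// Interns `string`, normalizing it to Unicode NFC first; identifiers are
/// compared and stored in their NFC form.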
pub fn nfc_normalize(string: &str) -> Symbol {
    use unicode_normalization::{is_nfc_quick, IsNormalized, UnicodeNormalization};
    match is_nfc_quick(string.chars()) {
        IsNormalized::Yes => Symbol::intern(string),
        _ => {
            let normalized_str: String = string.chars().nfc().collect();
            Symbol::intern(&normalized_str)
        }
    }
}