git.proxmox.com Git - rustc.git/blob - compiler/rustc_parse/src/lexer/mod.rs
New upstream version 1.76.0+dfsg1
[rustc.git] / compiler / rustc_parse / src / lexer / mod.rs
1 use std::ops::Range;
2
3 use crate::errors;
4 use crate::lexer::unicode_chars::UNICODE_ARRAY;
5 use crate::make_unclosed_delims_error;
6 use rustc_ast::ast::{self, AttrStyle};
7 use rustc_ast::token::{self, CommentKind, Delimiter, Token, TokenKind};
8 use rustc_ast::tokenstream::TokenStream;
9 use rustc_ast::util::unicode::contains_text_flow_control_chars;
10 use rustc_errors::{error_code, Applicability, Diagnostic, DiagnosticBuilder, StashKey};
11 use rustc_lexer::unescape::{self, EscapeError, Mode};
12 use rustc_lexer::{Base, DocStyle, RawStrError};
13 use rustc_lexer::{Cursor, LiteralKind};
14 use rustc_session::lint::builtin::{
15 RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX, TEXT_DIRECTION_CODEPOINT_IN_COMMENT,
16 };
17 use rustc_session::lint::BuiltinLintDiagnostics;
18 use rustc_session::parse::ParseSess;
19 use rustc_span::symbol::{sym, Symbol};
20 use rustc_span::{edition::Edition, BytePos, Pos, Span};
21
22 mod diagnostics;
23 mod tokentrees;
24 mod unescape_error_reporting;
25 mod unicode_chars;
26
27 use unescape_error_reporting::{emit_unescape_error, escaped_char};
28
// This type is used a lot. Make sure it doesn't unintentionally get bigger
// (12 bytes is the current size on 64-bit x86).
//
// This assertion is in this crate, rather than in `rustc_lexer`, because that
// crate cannot depend on `rustc_data_structures`.
#[cfg(all(target_arch = "x86_64", target_pointer_width = "64"))]
rustc_data_structures::static_assert_size!(rustc_lexer::Token, 12);
35
/// Records one unmatched delimiter found during token-tree parsing, used to
/// build "unclosed delimiter" / "mismatched delimiter" diagnostics.
#[derive(Clone, Debug)]
pub struct UnmatchedDelim {
    /// The delimiter that was expected at `found_span`.
    pub expected_delim: Delimiter,
    /// The delimiter actually found, if any.
    pub found_delim: Option<Delimiter>,
    /// Where the unexpected delimiter (or EOF) was encountered.
    pub found_span: Span,
    /// Span of the opening delimiter that was never closed, if known.
    pub unclosed_span: Option<Span>,
    /// A span that might be the intended match — NOTE(review): semantics
    /// inferred from the field name; confirm against `diagnostics` module.
    pub candidate_span: Option<Span>,
}
44
/// Lexes `src` (starting at absolute offset `start_pos`) into a `TokenStream`.
///
/// A leading shebang (`#!...`) is skipped before lexing. On failure, returns
/// the buffered diagnostics for unmatched/unclosed delimiters instead of
/// emitting them directly, so the caller decides when to emit.
pub(crate) fn parse_token_trees<'a>(
    sess: &'a ParseSess,
    mut src: &'a str,
    mut start_pos: BytePos,
    override_span: Option<Span>,
) -> Result<TokenStream, Vec<Diagnostic>> {
    // Skip `#!`, if present.
    if let Some(shebang_len) = rustc_lexer::strip_shebang(src) {
        src = &src[shebang_len..];
        start_pos = start_pos + BytePos::from_usize(shebang_len);
    }

    let cursor = Cursor::new(src);
    let string_reader = StringReader {
        sess,
        start_pos,
        pos: start_pos,
        src,
        cursor,
        override_span,
        nbsp_is_whitespace: false,
    };
    let (stream, res, unmatched_delims) =
        tokentrees::TokenTreesReader::parse_all_token_trees(string_reader);
    match res {
        Ok(_open_spacing) if unmatched_delims.is_empty() => Ok(stream),
        _ => {
            // Return error if there are unmatched delimiters or unclosed delimiters.
            // We emit delimiter mismatch errors first, then emit the unclosing delimiter mismatch
            // because the delimiter mismatch is more likely to be the root cause of error

            let mut buffer = Vec::with_capacity(1);
            for unmatched in unmatched_delims {
                if let Some(err) = make_unclosed_delims_error(unmatched, sess) {
                    err.buffer(&mut buffer);
                }
            }
            if let Err(errs) = res {
                // Add unclosing delimiter or diff marker errors
                for err in errs {
                    err.buffer(&mut buffer);
                }
            }
            Err(buffer)
        }
    }
}
92
/// Low-level lexer state: wraps a `rustc_lexer::Cursor` and "cooks" its raw
/// tokens into `rustc_ast` tokens, emitting diagnostics along the way.
struct StringReader<'a> {
    sess: &'a ParseSess,
    /// Initial position, read-only.
    start_pos: BytePos,
    /// The absolute offset within the source_map of the current character.
    pos: BytePos,
    /// Source text to tokenize.
    src: &'a str,
    /// Cursor for getting lexer tokens.
    cursor: Cursor<'a>,
    /// If set, every produced span is replaced by this one (used e.g. when
    /// re-lexing synthesized source).
    override_span: Option<Span>,
    /// When an "unknown start of token: \u{a0}" error has already been emitted
    /// earlier in this file, it's safe to treat further occurrences of the
    /// non-breaking space character as whitespace.
    nbsp_is_whitespace: bool,
}
109
110 impl<'a> StringReader<'a> {
111 fn mk_sp(&self, lo: BytePos, hi: BytePos) -> Span {
112 self.override_span.unwrap_or_else(|| Span::with_root_ctxt(lo, hi))
113 }
114
    /// Returns the next token, paired with a bool indicating if the token was
    /// preceded by whitespace.
    fn next_token(&mut self) -> (Token, bool) {
        let mut preceded_by_whitespace = false;
        // Number of further identical invalid tokens to skip without emitting
        // a fresh diagnostic (set when a run of repeats is reported below).
        let mut swallow_next_invalid = 0;
        // Skip trivial (whitespace & comments) tokens
        loop {
            let str_before = self.cursor.as_str();
            let token = self.cursor.advance_token();
            let start = self.pos;
            self.pos = self.pos + BytePos(token.len);

            debug!("next_token: {:?}({:?})", token.kind, self.str_from(start));

            // Now "cook" the token, converting the simple `rustc_lexer::TokenKind` enum into a
            // rich `rustc_ast::TokenKind`. This turns strings into interned symbols and runs
            // additional validation.
            let kind = match token.kind {
                rustc_lexer::TokenKind::LineComment { doc_style } => {
                    // Skip non-doc comments
                    let Some(doc_style) = doc_style else {
                        self.lint_unicode_text_flow(start);
                        preceded_by_whitespace = true;
                        continue;
                    };

                    // Opening delimiter of the length 3 is not included into the symbol.
                    let content_start = start + BytePos(3);
                    let content = self.str_from(content_start);
                    self.cook_doc_comment(content_start, content, CommentKind::Line, doc_style)
                }
                rustc_lexer::TokenKind::BlockComment { doc_style, terminated } => {
                    if !terminated {
                        self.report_unterminated_block_comment(start, doc_style);
                    }

                    // Skip non-doc comments
                    let Some(doc_style) = doc_style else {
                        self.lint_unicode_text_flow(start);
                        preceded_by_whitespace = true;
                        continue;
                    };

                    // Opening delimiter of the length 3 and closing delimiter of the length 2
                    // are not included into the symbol.
                    let content_start = start + BytePos(3);
                    let content_end = self.pos - BytePos(if terminated { 2 } else { 0 });
                    let content = self.str_from_to(content_start, content_end);
                    self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style)
                }
                rustc_lexer::TokenKind::Whitespace => {
                    preceded_by_whitespace = true;
                    continue;
                }
                rustc_lexer::TokenKind::Ident => {
                    self.ident(start)
                }
                rustc_lexer::TokenKind::RawIdent => {
                    // Skip the `r#` prefix when interning the identifier.
                    let sym = nfc_normalize(self.str_from(start + BytePos(2)));
                    let span = self.mk_sp(start, self.pos);
                    self.sess.symbol_gallery.insert(sym, span);
                    if !sym.can_be_raw() {
                        self.sess.emit_err(errors::CannotBeRawIdent { span, ident: sym });
                    }
                    self.sess.raw_identifier_spans.push(span);
                    token::Ident(sym, true)
                }
                rustc_lexer::TokenKind::UnknownPrefix => {
                    // Report the unknown prefix, then recover by lexing as a
                    // plain identifier.
                    self.report_unknown_prefix(start);
                    self.ident(start)
                }
                rustc_lexer::TokenKind::InvalidIdent
                    // Do not recover an identifier with emoji if the codepoint is a confusable
                    // with a recoverable substitution token, like `➖`.
                    if !UNICODE_ARRAY
                        .iter()
                        .any(|&(c, _, _)| {
                            let sym = self.str_from(start);
                            sym.chars().count() == 1 && c == sym.chars().next().unwrap()
                        }) =>
                {
                    let sym = nfc_normalize(self.str_from(start));
                    let span = self.mk_sp(start, self.pos);
                    self.sess.bad_unicode_identifiers.borrow_mut().entry(sym).or_default()
                        .push(span);
                    token::Ident(sym, false)
                }
                // split up (raw) c string literals to an ident and a string literal when edition < 2021.
                rustc_lexer::TokenKind::Literal {
                    kind: kind @ (LiteralKind::CStr { .. } | LiteralKind::RawCStr { .. }),
                    suffix_start: _,
                } if !self.mk_sp(start, self.pos).edition().at_least_rust_2021() => {
                    let prefix_len = match kind {
                        LiteralKind::CStr { .. } => 1,
                        LiteralKind::RawCStr { .. } => 2,
                        _ => unreachable!(),
                    };

                    // reset the state so that only the prefix ("c" or "cr")
                    // was consumed.
                    let lit_start = start + BytePos(prefix_len);
                    self.pos = lit_start;
                    self.cursor = Cursor::new(&str_before[prefix_len as usize..]);

                    self.report_unknown_prefix(start);
                    let prefix_span = self.mk_sp(start, lit_start);
                    return (Token::new(self.ident(start), prefix_span), preceded_by_whitespace);
                }
                rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
                    let suffix_start = start + BytePos(suffix_start);
                    let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
                    if let token::LitKind::CStr | token::LitKind::CStrRaw(_) = kind {
                        self.sess.gated_spans.gate(sym::c_str_literals, self.mk_sp(start, self.pos));
                    }
                    // Anything between `suffix_start` and `self.pos` is a literal suffix.
                    let suffix = if suffix_start < self.pos {
                        let string = self.str_from(suffix_start);
                        if string == "_" {
                            // A bare `_` suffix is rejected (e.g. `1_` is fine, `1 _` is not).
                            self.sess
                                .dcx
                                .emit_err(errors::UnderscoreLiteralSuffix { span: self.mk_sp(suffix_start, self.pos) });
                            None
                        } else {
                            Some(Symbol::intern(string))
                        }
                    } else {
                        None
                    };
                    token::Literal(token::Lit { kind, symbol, suffix })
                }
                rustc_lexer::TokenKind::Lifetime { starts_with_number } => {
                    // Include the leading `'` in the real identifier, for macro
                    // expansion purposes. See #12512 for the gory details of why
                    // this is necessary.
                    let lifetime_name = self.str_from(start);
                    if starts_with_number {
                        let span = self.mk_sp(start, self.pos);
                        // Stashed rather than emitted: a later parse stage may
                        // recover this as a mistyped char literal.
                        let mut diag = self.sess.struct_err("lifetimes cannot start with a number");
                        diag.set_span(span);
                        diag.stash(span, StashKey::LifetimeIsChar);
                    }
                    let ident = Symbol::intern(lifetime_name);
                    token::Lifetime(ident)
                }
                rustc_lexer::TokenKind::Semi => token::Semi,
                rustc_lexer::TokenKind::Comma => token::Comma,
                rustc_lexer::TokenKind::Dot => token::Dot,
                rustc_lexer::TokenKind::OpenParen => token::OpenDelim(Delimiter::Parenthesis),
                rustc_lexer::TokenKind::CloseParen => token::CloseDelim(Delimiter::Parenthesis),
                rustc_lexer::TokenKind::OpenBrace => token::OpenDelim(Delimiter::Brace),
                rustc_lexer::TokenKind::CloseBrace => token::CloseDelim(Delimiter::Brace),
                rustc_lexer::TokenKind::OpenBracket => token::OpenDelim(Delimiter::Bracket),
                rustc_lexer::TokenKind::CloseBracket => token::CloseDelim(Delimiter::Bracket),
                rustc_lexer::TokenKind::At => token::At,
                rustc_lexer::TokenKind::Pound => token::Pound,
                rustc_lexer::TokenKind::Tilde => token::Tilde,
                rustc_lexer::TokenKind::Question => token::Question,
                rustc_lexer::TokenKind::Colon => token::Colon,
                rustc_lexer::TokenKind::Dollar => token::Dollar,
                rustc_lexer::TokenKind::Eq => token::Eq,
                rustc_lexer::TokenKind::Bang => token::Not,
                rustc_lexer::TokenKind::Lt => token::Lt,
                rustc_lexer::TokenKind::Gt => token::Gt,
                rustc_lexer::TokenKind::Minus => token::BinOp(token::Minus),
                rustc_lexer::TokenKind::And => token::BinOp(token::And),
                rustc_lexer::TokenKind::Or => token::BinOp(token::Or),
                rustc_lexer::TokenKind::Plus => token::BinOp(token::Plus),
                rustc_lexer::TokenKind::Star => token::BinOp(token::Star),
                rustc_lexer::TokenKind::Slash => token::BinOp(token::Slash),
                rustc_lexer::TokenKind::Caret => token::BinOp(token::Caret),
                rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent),

                rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => {
                    // Don't emit diagnostics for sequences of the same invalid token
                    if swallow_next_invalid > 0 {
                        swallow_next_invalid -= 1;
                        continue;
                    }
                    let mut it = self.str_from_to_end(start).chars();
                    let c = it.next().unwrap();
                    if c == '\u{00a0}' {
                        // If an error has already been reported on non-breaking
                        // space characters earlier in the file, treat all
                        // subsequent occurrences as whitespace.
                        if self.nbsp_is_whitespace {
                            preceded_by_whitespace = true;
                            continue;
                        }
                        self.nbsp_is_whitespace = true;
                    }
                    let repeats = it.take_while(|c1| *c1 == c).count();
                    // FIXME: the lexer could be used to turn the ASCII version of unicode
                    // homoglyphs, instead of keeping a table in `check_for_substitution`into the
                    // token. Ideally, this should be inside `rustc_lexer`. However, we should
                    // first remove compound tokens like `<<` from `rustc_lexer`, and then add
                    // fancier error recovery to it, as there will be less overall work to do this
                    // way.
                    let (token, sugg) = unicode_chars::check_for_substitution(self, start, c, repeats+1);
                    self.sess.emit_err(errors::UnknownTokenStart {
                        span: self.mk_sp(start, self.pos + Pos::from_usize(repeats * c.len_utf8())),
                        escaped: escaped_char(c),
                        sugg,
                        null: if c == '\x00' {Some(errors::UnknownTokenNull)} else {None},
                        repeat: if repeats > 0 {
                            swallow_next_invalid = repeats;
                            Some(errors::UnknownTokenRepeat { repeats })
                        } else {None}
                    });

                    if let Some(token) = token {
                        token
                    } else {
                        preceded_by_whitespace = true;
                        continue;
                    }
                }
                rustc_lexer::TokenKind::Eof => token::Eof,
            };
            let span = self.mk_sp(start, self.pos);
            return (Token::new(kind, span), preceded_by_whitespace);
        }
    }
336
337 fn ident(&self, start: BytePos) -> TokenKind {
338 let sym = nfc_normalize(self.str_from(start));
339 let span = self.mk_sp(start, self.pos);
340 self.sess.symbol_gallery.insert(sym, span);
341 token::Ident(sym, false)
342 }
343
    /// Builds (without emitting) a fatal diagnostic of the form
    /// `"{m}: {c}"` spanning `from_pos..to_pos`, with `c` escaped for display.
    fn struct_fatal_span_char(
        &self,
        from_pos: BytePos,
        to_pos: BytePos,
        m: &str,
        c: char,
    ) -> DiagnosticBuilder<'a, !> {
        self.sess
            .dcx
            .struct_span_fatal(self.mk_sp(from_pos, to_pos), format!("{}: {}", m, escaped_char(c)))
    }
355
    /// Detect usages of Unicode codepoints changing the direction of the text on screen and loudly
    /// complain about it.
    fn lint_unicode_text_flow(&self, start: BytePos) {
        // Opening delimiter of the length 2 is not included into the comment text.
        let content_start = start + BytePos(2);
        let content = self.str_from(content_start);
        if contains_text_flow_control_chars(content) {
            let span = self.mk_sp(start, self.pos);
            // Buffered lint, not a hard error: fires later during lint checking.
            self.sess.buffer_lint_with_diagnostic(
                TEXT_DIRECTION_CODEPOINT_IN_COMMENT,
                span,
                ast::CRATE_NODE_ID,
                "unicode codepoint changing visible direction of text present in comment",
                BuiltinLintDiagnostics::UnicodeTextFlow(span, content.to_string()),
            );
        }
    }
373
    /// Turns a doc comment's text into a `DocComment` token, rejecting any
    /// bare carriage returns in its content (one error per `\r`).
    fn cook_doc_comment(
        &self,
        content_start: BytePos,
        content: &str,
        comment_kind: CommentKind,
        doc_style: DocStyle,
    ) -> TokenKind {
        if content.contains('\r') {
            for (idx, _) in content.char_indices().filter(|&(_, c)| c == '\r') {
                let span = self.mk_sp(
                    content_start + BytePos(idx as u32),
                    content_start + BytePos(idx as u32 + 1),
                );
                let block = matches!(comment_kind, CommentKind::Block);
                self.sess.emit_err(errors::CrDocComment { span, block });
            }
        }

        let attr_style = match doc_style {
            DocStyle::Outer => AttrStyle::Outer,
            DocStyle::Inner => AttrStyle::Inner,
        };

        token::DocComment(comment_kind, attr_style, Symbol::intern(content))
    }
399
    /// Validates a raw literal token spanning `start..end` and converts it to
    /// an AST literal kind plus its interned contents. Unterminated quoted
    /// literals are fatal; malformed numeric literals are recovered with a
    /// placeholder value after emitting an error.
    fn cook_lexer_literal(
        &self,
        start: BytePos,
        end: BytePos,
        kind: rustc_lexer::LiteralKind,
    ) -> (token::LitKind, Symbol) {
        match kind {
            rustc_lexer::LiteralKind::Char { terminated } => {
                if !terminated {
                    self.sess.dcx.span_fatal_with_code(
                        self.mk_sp(start, end),
                        "unterminated character literal",
                        error_code!(E0762),
                    )
                }
                self.cook_quoted(token::Char, Mode::Char, start, end, 1, 1) // ' '
            }
            rustc_lexer::LiteralKind::Byte { terminated } => {
                if !terminated {
                    self.sess.dcx.span_fatal_with_code(
                        self.mk_sp(start + BytePos(1), end),
                        "unterminated byte constant",
                        error_code!(E0763),
                    )
                }
                self.cook_quoted(token::Byte, Mode::Byte, start, end, 2, 1) // b' '
            }
            rustc_lexer::LiteralKind::Str { terminated } => {
                if !terminated {
                    self.sess.dcx.span_fatal_with_code(
                        self.mk_sp(start, end),
                        "unterminated double quote string",
                        error_code!(E0765),
                    )
                }
                self.cook_quoted(token::Str, Mode::Str, start, end, 1, 1) // " "
            }
            rustc_lexer::LiteralKind::ByteStr { terminated } => {
                if !terminated {
                    self.sess.dcx.span_fatal_with_code(
                        self.mk_sp(start + BytePos(1), end),
                        "unterminated double quote byte string",
                        error_code!(E0766),
                    )
                }
                self.cook_quoted(token::ByteStr, Mode::ByteStr, start, end, 2, 1) // b" "
            }
            rustc_lexer::LiteralKind::CStr { terminated } => {
                if !terminated {
                    self.sess.dcx.span_fatal_with_code(
                        self.mk_sp(start + BytePos(1), end),
                        "unterminated C string",
                        error_code!(E0767),
                    )
                }
                self.cook_c_string(token::CStr, Mode::CStr, start, end, 2, 1) // c" "
            }
            rustc_lexer::LiteralKind::RawStr { n_hashes } => {
                // `n_hashes == None` means the raw string was invalid.
                if let Some(n_hashes) = n_hashes {
                    let n = u32::from(n_hashes);
                    let kind = token::StrRaw(n_hashes);
                    self.cook_quoted(kind, Mode::RawStr, start, end, 2 + n, 1 + n) // r##" "##
                } else {
                    self.report_raw_str_error(start, 1);
                }
            }
            rustc_lexer::LiteralKind::RawByteStr { n_hashes } => {
                if let Some(n_hashes) = n_hashes {
                    let n = u32::from(n_hashes);
                    let kind = token::ByteStrRaw(n_hashes);
                    self.cook_quoted(kind, Mode::RawByteStr, start, end, 3 + n, 1 + n) // br##" "##
                } else {
                    self.report_raw_str_error(start, 2);
                }
            }
            rustc_lexer::LiteralKind::RawCStr { n_hashes } => {
                if let Some(n_hashes) = n_hashes {
                    let n = u32::from(n_hashes);
                    let kind = token::CStrRaw(n_hashes);
                    self.cook_c_string(kind, Mode::RawCStr, start, end, 3 + n, 1 + n) // cr##" "##
                } else {
                    self.report_raw_str_error(start, 2);
                }
            }
            rustc_lexer::LiteralKind::Int { base, empty_int } => {
                if empty_int {
                    // e.g. `0x` with no digits: recover with the value `0`.
                    let span = self.mk_sp(start, end);
                    self.sess.emit_err(errors::NoDigitsLiteral { span });
                    (token::Integer, sym::integer(0))
                } else {
                    if matches!(base, Base::Binary | Base::Octal) {
                        // Check each digit is valid for the base; `_` separators
                        // are allowed. Skip the 2-byte `0b`/`0o` prefix.
                        let base = base as u32;
                        let s = self.str_from_to(start + BytePos(2), end);
                        for (idx, c) in s.char_indices() {
                            let span = self.mk_sp(
                                start + BytePos::from_usize(2 + idx),
                                start + BytePos::from_usize(2 + idx + c.len_utf8()),
                            );
                            if c != '_' && c.to_digit(base).is_none() {
                                self.sess.emit_err(errors::InvalidDigitLiteral { span, base });
                            }
                        }
                    }
                    (token::Integer, self.symbol_from_to(start, end))
                }
            }
            rustc_lexer::LiteralKind::Float { base, empty_exponent } => {
                if empty_exponent {
                    let span = self.mk_sp(start, self.pos);
                    self.sess.emit_err(errors::EmptyExponentFloat { span });
                }
                // Float literals only support base 10.
                let base = match base {
                    Base::Hexadecimal => Some("hexadecimal"),
                    Base::Octal => Some("octal"),
                    Base::Binary => Some("binary"),
                    _ => None,
                };
                if let Some(base) = base {
                    let span = self.mk_sp(start, end);
                    self.sess.emit_err(errors::FloatLiteralUnsupportedBase { span, base });
                }
                (token::Float, self.symbol_from_to(start, end))
            }
        }
    }
525
    /// Converts an absolute `BytePos` into an index into `self.src`.
    #[inline]
    fn src_index(&self, pos: BytePos) -> usize {
        (pos - self.start_pos).to_usize()
    }
530
    /// Slice of the source text from `start` up to but excluding `self.pos`
    /// (i.e. the text of the token that was just consumed).
    fn str_from(&self, start: BytePos) -> &'a str {
        self.str_from_to(start, self.pos)
    }
536
    /// As `str_from_to`, but interns the slice and returns the `Symbol`.
    fn symbol_from_to(&self, start: BytePos, end: BytePos) -> Symbol {
        debug!("taking an ident from {:?} to {:?}", start, end);
        Symbol::intern(self.str_from_to(start, end))
    }
542
    /// Slice of the source text spanning from `start` up to but excluding `end`.
    fn str_from_to(&self, start: BytePos, end: BytePos) -> &'a str {
        &self.src[self.src_index(start)..self.src_index(end)]
    }
547
    /// Slice of the source text spanning from `start` until the end of the file.
    fn str_from_to_end(&self, start: BytePos) -> &'a str {
        &self.src[self.src_index(start)..]
    }
552
    /// Re-validates an invalid raw string starting at `start` to recover the
    /// precise failure reason, then emits the matching fatal error.
    /// Panics if validation unexpectedly succeeds.
    fn report_raw_str_error(&self, start: BytePos, prefix_len: u32) -> ! {
        match rustc_lexer::validate_raw_str(self.str_from(start), prefix_len) {
            Err(RawStrError::InvalidStarter { bad_char }) => {
                self.report_non_started_raw_string(start, bad_char)
            }
            Err(RawStrError::NoTerminator { expected, found, possible_terminator_offset }) => self
                .report_unterminated_raw_string(start, expected, possible_terminator_offset, found),
            Err(RawStrError::TooManyDelimiters { found }) => {
                self.report_too_many_hashes(start, found)
            }
            Ok(()) => panic!("no error found for supposedly invalid raw string literal"),
        }
    }
566
    /// Fatal error: a raw string's opening delimiter contained a character
    /// other than `#` before the first `"`.
    fn report_non_started_raw_string(&self, start: BytePos, bad_char: char) -> ! {
        self.struct_fatal_span_char(
            start,
            self.pos,
            "found invalid character; only `#` is allowed in raw string delimitation",
            bad_char,
        )
        .emit()
    }
576
    /// Fatal error for a raw string with no closing delimiter. If the lexer
    /// spotted a partial terminator (`possible_offset`), suggests completing
    /// it with the missing `#`s.
    fn report_unterminated_raw_string(
        &self,
        start: BytePos,
        n_hashes: u32,
        possible_offset: Option<u32>,
        found_terminators: u32,
    ) -> ! {
        let mut err = self.sess.dcx.struct_span_fatal_with_code(
            self.mk_sp(start, start),
            "unterminated raw string",
            error_code!(E0748),
        );

        err.span_label(self.mk_sp(start, start), "unterminated raw string");

        if n_hashes > 0 {
            err.note(format!(
                "this raw string should be terminated with `\"{}`",
                "#".repeat(n_hashes as usize)
            ));
        }

        if let Some(possible_offset) = possible_offset {
            let lo = start + BytePos(possible_offset);
            let hi = lo + BytePos(found_terminators);
            let span = self.mk_sp(lo, hi);
            err.span_suggestion(
                span,
                "consider terminating the string here",
                "#".repeat(n_hashes as usize),
                Applicability::MaybeIncorrect,
            );
        }

        err.emit()
    }
613
    /// Emits an error for an unterminated block (doc-)comment. Scans the
    /// comment text for nested `/* ... */` pairs and, if any closed nested
    /// comment is found, points at it — the user likely meant to close the
    /// outer comment instead.
    fn report_unterminated_block_comment(&self, start: BytePos, doc_style: Option<DocStyle>) {
        let msg = match doc_style {
            Some(_) => "unterminated block doc-comment",
            None => "unterminated block comment",
        };
        let last_bpos = self.pos;
        let mut err = self.sess.dcx.struct_span_fatal_with_code(
            self.mk_sp(start, last_bpos),
            msg,
            error_code!(E0758),
        );
        // Stack of byte offsets of `/*` openers; popped when a `*/` is seen,
        // so the last pop records the innermost completed nested comment.
        let mut nested_block_comment_open_idxs = vec![];
        let mut last_nested_block_comment_idxs = None;
        let mut content_chars = self.str_from(start).char_indices().peekable();

        while let Some((idx, current_char)) = content_chars.next() {
            match content_chars.peek() {
                Some((_, '*')) if current_char == '/' => {
                    nested_block_comment_open_idxs.push(idx);
                }
                Some((_, '/')) if current_char == '*' => {
                    last_nested_block_comment_idxs =
                        nested_block_comment_open_idxs.pop().map(|open_idx| (open_idx, idx));
                }
                _ => {}
            };
        }

        if let Some((nested_open_idx, nested_close_idx)) = last_nested_block_comment_idxs {
            err.span_label(self.mk_sp(start, start + BytePos(2)), msg)
                .span_label(
                    self.mk_sp(
                        start + BytePos(nested_open_idx as u32),
                        start + BytePos(nested_open_idx as u32 + 2),
                    ),
                    "...as last nested comment starts here, maybe you want to close this instead?",
                )
                .span_label(
                    self.mk_sp(
                        start + BytePos(nested_close_idx as u32),
                        start + BytePos(nested_close_idx as u32 + 2),
                    ),
                    "...and last nested comment terminates here.",
                );
        }

        err.emit();
    }
662
    // RFC 3101 introduced the idea of (reserved) prefixes. As of Rust 2021,
    // using an (unknown) prefix is an error. In earlier editions, however, it
    // only results in an (allowed-by-default) lint, and the prefix is treated
    // as a regular identifier token.
    fn report_unknown_prefix(&self, start: BytePos) {
        let prefix_span = self.mk_sp(start, self.pos);
        let prefix = self.str_from_to(start, self.pos);

        // The edition is taken from the expansion site, not the crate root,
        // so macro-generated code is judged by its own edition.
        let expn_data = prefix_span.ctxt().outer_expn_data();

        if expn_data.edition >= Edition::Edition2021 {
            // In Rust 2021, this is a hard error.
            let sugg = if prefix == "rb" {
                Some(errors::UnknownPrefixSugg::UseBr(prefix_span))
            } else if expn_data.is_root() {
                Some(errors::UnknownPrefixSugg::Whitespace(prefix_span.shrink_to_hi()))
            } else {
                None
            };
            self.sess.emit_err(errors::UnknownPrefix { span: prefix_span, prefix, sugg });
        } else {
            // Before Rust 2021, only emit a lint for migration.
            self.sess.buffer_lint_with_diagnostic(
                RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX,
                prefix_span,
                ast::CRATE_NODE_ID,
                format!("prefix `{prefix}` is unknown"),
                BuiltinLintDiagnostics::ReservedPrefix(prefix_span),
            );
        }
    }
694
695 fn report_too_many_hashes(&self, start: BytePos, num: u32) -> ! {
696 self.sess.emit_fatal(errors::TooManyHashes { span: self.mk_sp(start, self.pos), num });
697 }
698
    /// Shared worker for cooking quoted literals: strips the `prefix_len` /
    /// `postfix_len` delimiters, runs `unescape` over the content purely to
    /// *check* for escape errors (actual unescaping happens later), and
    /// returns the literal kind plus interned symbol. On a fatal escape error
    /// the kind is downgraded to `token::Err`.
    fn cook_common(
        &self,
        kind: token::LitKind,
        mode: Mode,
        start: BytePos,
        end: BytePos,
        prefix_len: u32,
        postfix_len: u32,
        unescape: fn(&str, Mode, &mut dyn FnMut(Range<usize>, Result<(), EscapeError>)),
    ) -> (token::LitKind, Symbol) {
        let mut has_fatal_err = false;
        let content_start = start + BytePos(prefix_len);
        let content_end = end - BytePos(postfix_len);
        let lit_content = self.str_from_to(content_start, content_end);
        unescape(lit_content, mode, &mut |range, result| {
            // Here we only check for errors. The actual unescaping is done later.
            if let Err(err) = result {
                let span_with_quotes = self.mk_sp(start, end);
                // `range` is relative to the literal content; shift it to
                // absolute positions.
                let (start, end) = (range.start as u32, range.end as u32);
                let lo = content_start + BytePos(start);
                let hi = lo + BytePos(end - start);
                let span = self.mk_sp(lo, hi);
                if err.is_fatal() {
                    has_fatal_err = true;
                }
                emit_unescape_error(
                    &self.sess.dcx,
                    lit_content,
                    span_with_quotes,
                    span,
                    mode,
                    range,
                    err,
                );
            }
        });

        // We normally exclude the quotes for the symbol, but for errors we
        // include it because it results in clearer error messages.
        if !has_fatal_err {
            (kind, Symbol::intern(lit_content))
        } else {
            (token::Err, self.symbol_from_to(start, end))
        }
    }
744
    /// Cooks a (byte/char/)string literal, validating escapes with
    /// `unescape_literal`. See `cook_common` for parameter meanings.
    fn cook_quoted(
        &self,
        kind: token::LitKind,
        mode: Mode,
        start: BytePos,
        end: BytePos,
        prefix_len: u32,
        postfix_len: u32,
    ) -> (token::LitKind, Symbol) {
        self.cook_common(kind, mode, start, end, prefix_len, postfix_len, |src, mode, callback| {
            unescape::unescape_literal(src, mode, &mut |span, result| {
                // Drop the unescaped value: only the error (if any) matters here.
                callback(span, result.map(drop))
            })
        })
    }
760
    /// Cooks a C string literal, validating escapes with `unescape_c_string`.
    /// See `cook_common` for parameter meanings.
    fn cook_c_string(
        &self,
        kind: token::LitKind,
        mode: Mode,
        start: BytePos,
        end: BytePos,
        prefix_len: u32,
        postfix_len: u32,
    ) -> (token::LitKind, Symbol) {
        self.cook_common(kind, mode, start, end, prefix_len, postfix_len, |src, mode, callback| {
            unescape::unescape_c_string(src, mode, &mut |span, result| {
                // Drop the unescaped value: only the error (if any) matters here.
                callback(span, result.map(drop))
            })
        })
    }
776 }
777
/// Interns `string`, first normalizing it to Unicode NFC if the quick check
/// cannot prove it is already normalized (identifiers must compare equal
/// regardless of their source encoding form).
pub fn nfc_normalize(string: &str) -> Symbol {
    use unicode_normalization::{is_nfc_quick, IsNormalized, UnicodeNormalization};
    match is_nfc_quick(string.chars()) {
        IsNormalized::Yes => Symbol::intern(string),
        _ => {
            let normalized_str: String = string.chars().nfc().collect();
            Symbol::intern(&normalized_str)
        }
    }
}
787 }