compiler/rustc_parse/src/lexer/unicode_chars.rs

   1 // Characters and their corresponding confusables were collected from
   2 // https://www.unicode.org/Public/security/10.0.0/confusables.txt
   3
   4 use super::StringReader;
   5 use crate::token;
   6 use rustc_errors::{Applicability, DiagnosticBuilder};
   7 use rustc_span::{symbol::kw, BytePos, Pos, Span};
   8
   9 #[rustfmt::skip] // for line breaks
  10 pub(crate) const UNICODE_ARRAY: &[(char, &str, char)] = &[
  11     (' ', "Line Separator", ' '),
  12     (' ', "Paragraph Separator", ' '),
  13     (' ', "Ogham Space mark", ' '),
  14     (' ', "En Quad", ' '),
  15     (' ', "Em Quad", ' '),
  16     (' ', "En Space", ' '),
  17     (' ', "Em Space", ' '),
  18     (' ', "Three-Per-Em Space", ' '),
  19     (' ', "Four-Per-Em Space", ' '),
  20     (' ', "Six-Per-Em Space", ' '),
  21     (' ', "Punctuation Space", ' '),
  22     (' ', "Thin Space", ' '),
  23     (' ', "Hair Space", ' '),
  24     (' ', "Medium Mathematical Space", ' '),
  25     (' ', "No-Break Space", ' '),
  26     (' ', "Figure Space", ' '),
  27     (' ', "Narrow No-Break Space", ' '),
  28     ('　', "Ideographic Space", ' '),
  29
  30     ('ߺ', "Nko Lajanyalan", '_'),
  31     ('﹍', "Dashed Low Line", '_'),
  32     ('﹎', "Centreline Low Line", '_'),
  33     ('﹏', "Wavy Low Line", '_'),
  34     ('＿', "Fullwidth Low Line", '_'),
  35
  36     ('‐', "Hyphen", '-'),
  37     ('‑', "Non-Breaking Hyphen", '-'),
  38     ('‒', "Figure Dash", '-'),
  39     ('–', "En Dash", '-'),
  40     ('—', "Em Dash", '-'),
  41     ('﹘', "Small Em Dash", '-'),
  42     ('۔', "Arabic Full Stop", '-'),
  43     ('⁃', "Hyphen Bullet", '-'),
  44     ('˗', "Modifier Letter Minus Sign", '-'),
  45     ('−', "Minus Sign", '-'),
  46     ('➖', "Heavy Minus Sign", '-'),
  47     ('Ⲻ', "Coptic Letter Dialect-P Ni", '-'),
  48     ('ー', "Katakana-Hiragana Prolonged Sound Mark", '-'),
  49     ('－', "Fullwidth Hyphen-Minus", '-'),
  50     ('―', "Horizontal Bar", '-'),
  51     ('─', "Box Drawings Light Horizontal", '-'),
  52     ('━', "Box Drawings Heavy Horizontal", '-'),
  53     ('㇐', "CJK Stroke H", '-'),
  54     ('ꟷ', "Latin Epigraphic Letter Sideways I", '-'),
  55     ('ᅳ', "Hangul Jungseong Eu", '-'),
  56     ('ㅡ', "Hangul Letter Eu", '-'),
  57     ('一', "CJK Unified Ideograph-4E00", '-'),
  58     ('⼀', "Kangxi Radical One", '-'),
  59
  60     ('؍', "Arabic Date Separator", ','),
  61     ('٫', "Arabic Decimal Separator", ','),
  62     ('‚', "Single Low-9 Quotation Mark", ','),
  63     ('¸', "Cedilla", ','),
  64     ('ꓹ', "Lisu Letter Tone Na Po", ','),
  65     ('，', "Fullwidth Comma", ','),
  66
  67     (';', "Greek Question Mark", ';'),
  68     ('；', "Fullwidth Semicolon", ';'),
  69     ('︔', "Presentation Form For Vertical Semicolon", ';'),
  70
  71     ('ः', "Devanagari Sign Visarga", ':'),
  72     ('ઃ', "Gujarati Sign Visarga", ':'),
  73     ('：', "Fullwidth Colon", ':'),
  74     ('։', "Armenian Full Stop", ':'),
  75     ('܃', "Syriac Supralinear Colon", ':'),
  76     ('܄', "Syriac Sublinear Colon", ':'),
  77     ('᛬', "Runic Multiple Punctuation", ':'),
  78     ('︰', "Presentation Form For Vertical Two Dot Leader", ':'),
  79     ('᠃', "Mongolian Full Stop", ':'),
  80     ('᠉', "Mongolian Manchu Full Stop", ':'),
  81     ('⁚', "Two Dot Punctuation", ':'),
  82     ('׃', "Hebrew Punctuation Sof Pasuq", ':'),
  83     ('˸', "Modifier Letter Raised Colon", ':'),
  84     ('꞉', "Modifier Letter Colon", ':'),
  85     ('∶', "Ratio", ':'),
  86     ('ː', "Modifier Letter Triangular Colon", ':'),
  87     ('ꓽ', "Lisu Letter Tone Mya Jeu", ':'),
  88     ('︓', "Presentation Form For Vertical Colon", ':'),
  89
  90     ('！', "Fullwidth Exclamation Mark", '!'),
  91     ('ǃ', "Latin Letter Retroflex Click", '!'),
  92     ('ⵑ', "Tifinagh Letter Tuareg Yang", '!'),
  93     ('︕', "Presentation Form For Vertical Exclamation Mark", '!'),
  94
  95     ('ʔ', "Latin Letter Glottal Stop", '?'),
  96     ('Ɂ', "Latin Capital Letter Glottal Stop", '?'),
  97     ('ॽ', "Devanagari Letter Glottal Stop", '?'),
  98     ('Ꭾ', "Cherokee Letter He", '?'),
  99     ('ꛫ', "Bamum Letter Ntuu", '?'),
 100     ('？', "Fullwidth Question Mark", '?'),
 101     ('︖', "Presentation Form For Vertical Question Mark", '?'),
 102
 103     ('𝅭', "Musical Symbol Combining Augmentation Dot", '.'),
 104     ('․', "One Dot Leader", '.'),
 105     ('܁', "Syriac Supralinear Full Stop", '.'),
 106     ('܂', "Syriac Sublinear Full Stop", '.'),
 107     ('꘎', "Vai Full Stop", '.'),
 108     ('𐩐', "Kharoshthi Punctuation Dot", '.'),
 109     ('٠', "Arabic-Indic Digit Zero", '.'),
 110     ('۰', "Extended Arabic-Indic Digit Zero", '.'),
 111     ('ꓸ', "Lisu Letter Tone Mya Ti", '.'),
 112     ('·', "Middle Dot", '.'),
 113     ('・', "Katakana Middle Dot", '.'),
 114     ('･', "Halfwidth Katakana Middle Dot", '.'),
 115     ('᛫', "Runic Single Punctuation", '.'),
 116     ('·', "Greek Ano Teleia", '.'),
 117     ('⸱', "Word Separator Middle Dot", '.'),
 118     ('𐄁', "Aegean Word Separator Dot", '.'),
 119     ('•', "Bullet", '.'),
 120     ('‧', "Hyphenation Point", '.'),
 121     ('∙', "Bullet Operator", '.'),
 122     ('⋅', "Dot Operator", '.'),
 123     ('ꞏ', "Latin Letter Sinological Dot", '.'),
 124     ('ᐧ', "Canadian Syllabics Final Middle Dot", '.'),
 125     ('ᐧ', "Canadian Syllabics Final Middle Dot", '.'),
 126     ('．', "Fullwidth Full Stop", '.'),
 127     ('。', "Ideographic Full Stop", '.'),
 128     ('︒', "Presentation Form For Vertical Ideographic Full Stop", '.'),
 129
 130     ('՝', "Armenian Comma", '\''),
 131     ('＇', "Fullwidth Apostrophe", '\''),
 132     ('‘', "Left Single Quotation Mark", '\''),
 133     ('’', "Right Single Quotation Mark", '\''),
 134     ('‛', "Single High-Reversed-9 Quotation Mark", '\''),
 135     ('′', "Prime", '\''),
 136     ('‵', "Reversed Prime", '\''),
 137     ('՚', "Armenian Apostrophe", '\''),
 138     ('׳', "Hebrew Punctuation Geresh", '\''),
 139     ('`', "Grave Accent", '\''),
 140     ('`', "Greek Varia", '\''),
 141     ('｀', "Fullwidth Grave Accent", '\''),
 142     ('´', "Acute Accent", '\''),
 143     ('΄', "Greek Tonos", '\''),
 144     ('´', "Greek Oxia", '\''),
 145     ('᾽', "Greek Koronis", '\''),
 146     ('᾿', "Greek Psili", '\''),
 147     ('῾', "Greek Dasia", '\''),
 148     ('ʹ', "Modifier Letter Prime", '\''),
 149     ('ʹ', "Greek Numeral Sign", '\''),
 150     ('ˈ', "Modifier Letter Vertical Line", '\''),
 151     ('ˊ', "Modifier Letter Acute Accent", '\''),
 152     ('ˋ', "Modifier Letter Grave Accent", '\''),
 153     ('˴', "Modifier Letter Middle Grave Accent", '\''),
 154     ('ʻ', "Modifier Letter Turned Comma", '\''),
 155     ('ʽ', "Modifier Letter Reversed Comma", '\''),
 156     ('ʼ', "Modifier Letter Apostrophe", '\''),
 157     ('ʾ', "Modifier Letter Right Half Ring", '\''),
 158     ('ꞌ', "Latin Small Letter Saltillo", '\''),
 159     ('י', "Hebrew Letter Yod", '\''),
 160     ('ߴ', "Nko High Tone Apostrophe", '\''),
 161     ('ߵ', "Nko Low Tone Apostrophe", '\''),
 162     ('ᑊ', "Canadian Syllabics West-Cree P", '\''),
 163     ('ᛌ', "Runic Letter Short-Twig-Sol S", '\''),
 164     ('𖽑', "Miao Sign Aspiration", '\''),
 165     ('𖽒', "Miao Sign Reformed Voicing", '\''),
 166
 167     ('᳓', "Vedic Sign Nihshvasa", '"'),
 168     ('＂', "Fullwidth Quotation Mark", '"'),
 169     ('“', "Left Double Quotation Mark", '"'),
 170     ('”', "Right Double Quotation Mark", '"'),
 171     ('‟', "Double High-Reversed-9 Quotation Mark", '"'),
 172     ('″', "Double Prime", '"'),
 173     ('‶', "Reversed Double Prime", '"'),
 174     ('〃', "Ditto Mark", '"'),
 175     ('״', "Hebrew Punctuation Gershayim", '"'),
 176     ('˝', "Double Acute Accent", '"'),
 177     ('ʺ', "Modifier Letter Double Prime", '"'),
 178     ('˶', "Modifier Letter Middle Double Acute Accent", '"'),
 179     ('˵', "Modifier Letter Middle Double Grave Accent", '"'),
 180     ('ˮ', "Modifier Letter Double Apostrophe", '"'),
 181     ('ײ', "Hebrew Ligature Yiddish Double Yod", '"'),
 182     ('❞', "Heavy Double Comma Quotation Mark Ornament", '"'),
 183     ('❝', "Heavy Double Turned Comma Quotation Mark Ornament", '"'),
 184
 185     ('（', "Fullwidth Left Parenthesis", '('),
 186     ('❨', "Medium Left Parenthesis Ornament", '('),
 187     ('﴾', "Ornate Left Parenthesis", '('),
 188
 189     ('）', "Fullwidth Right Parenthesis", ')'),
 190     ('❩', "Medium Right Parenthesis Ornament", ')'),
 191     ('﴿', "Ornate Right Parenthesis", ')'),
 192
 193     ('［', "Fullwidth Left Square Bracket", '['),
 194     ('❲', "Light Left Tortoise Shell Bracket Ornament", '['),
 195     ('「', "Left Corner Bracket", '['),
 196     ('『', "Left White Corner Bracket", '['),
 197     ('【', "Left Black Lenticular Bracket", '['),
 198     ('〔', "Left Tortoise Shell Bracket", '['),
 199     ('〖', "Left White Lenticular Bracket", '['),
 200     ('〘', "Left White Tortoise Shell Bracket", '['),
 201     ('〚', "Left White Square Bracket", '['),
 202
 203     ('］', "Fullwidth Right Square Bracket", ']'),
 204     ('❳', "Light Right Tortoise Shell Bracket Ornament", ']'),
 205     ('」', "Right Corner Bracket", ']'),
 206     ('』', "Right White Corner Bracket", ']'),
 207     ('】', "Right Black Lenticular Bracket", ']'),
 208     ('〕', "Right Tortoise Shell Bracket", ']'),
 209     ('〗', "Right White Lenticular Bracket", ']'),
 210     ('〙', "Right White Tortoise Shell Bracket", ']'),
 211     ('〛', "Right White Square Bracket", ']'),
 212
 213     ('❴', "Medium Left Curly Bracket Ornament", '{'),
 214     ('𝄔', "Musical Symbol Brace", '{'),
 215     ('｛', "Fullwidth Left Curly Bracket", '{'),
 216
 217     ('❵', "Medium Right Curly Bracket Ornament", '}'),
 218     ('｝', "Fullwidth Right Curly Bracket", '}'),
 219
 220     ('⁎', "Low Asterisk", '*'),
 221     ('٭', "Arabic Five Pointed Star", '*'),
 222     ('∗', "Asterisk Operator", '*'),
 223     ('𐌟', "Old Italic Letter Ess", '*'),
 224     ('＊', "Fullwidth Asterisk", '*'),
 225
 226     ('᜵', "Philippine Single Punctuation", '/'),
 227     ('⁁', "Caret Insertion Point", '/'),
 228     ('∕', "Division Slash", '/'),
 229     ('⁄', "Fraction Slash", '/'),
 230     ('╱', "Box Drawings Light Diagonal Upper Right To Lower Left", '/'),
 231     ('⟋', "Mathematical Rising Diagonal", '/'),
 232     ('⧸', "Big Solidus", '/'),
 233     ('𝈺', "Greek Instrumental Notation Symbol-47", '/'),
 234     ('㇓', "CJK Stroke Sp", '/'),
 235     ('〳', "Vertical Kana Repeat Mark Upper Half", '/'),
 236     ('Ⳇ', "Coptic Capital Letter Old Coptic Esh", '/'),
 237     ('ノ', "Katakana Letter No", '/'),
 238     ('丿', "CJK Unified Ideograph-4E3F", '/'),
 239     ('⼃', "Kangxi Radical Slash", '/'),
 240     ('／', "Fullwidth Solidus", '/'),
 241
 242     ('＼', "Fullwidth Reverse Solidus", '\\'),
 243     ('﹨', "Small Reverse Solidus", '\\'),
 244     ('∖', "Set Minus", '\\'),
 245     ('⟍', "Mathematical Falling Diagonal", '\\'),
 246     ('⧵', "Reverse Solidus Operator", '\\'),
 247     ('⧹', "Big Reverse Solidus", '\\'),
 248     ('⧹', "Greek Vocal Notation Symbol-16", '\\'),
 249     ('⧹', "Greek Instrumental Symbol-48", '\\'),
 250     ('㇔', "CJK Stroke D", '\\'),
 251     ('丶', "CJK Unified Ideograph-4E36", '\\'),
 252     ('⼂', "Kangxi Radical Dot", '\\'),
 253     ('、', "Ideographic Comma", '\\'),
 254     ('ヽ', "Katakana Iteration Mark", '\\'),
 255
 256     ('ꝸ', "Latin Small Letter Um", '&'),
 257     ('＆', "Fullwidth Ampersand", '&'),
 258
 259     ('᛭', "Runic Cross Punctuation", '+'),
 260     ('➕', "Heavy Plus Sign", '+'),
 261     ('𐊛', "Lycian Letter H", '+'),
 262     ('﬩', "Hebrew Letter Alternative Plus Sign", '+'),
 263     ('＋', "Fullwidth Plus Sign", '+'),
 264
 265     ('‹', "Single Left-Pointing Angle Quotation Mark", '<'),
 266     ('❮', "Heavy Left-Pointing Angle Quotation Mark Ornament", '<'),
 267     ('˂', "Modifier Letter Left Arrowhead", '<'),
 268     ('𝈶', "Greek Instrumental Symbol-40", '<'),
 269     ('ᐸ', "Canadian Syllabics Pa", '<'),
 270     ('ᚲ', "Runic Letter Kauna", '<'),
 271     ('❬', "Medium Left-Pointing Angle Bracket Ornament", '<'),
 272     ('⟨', "Mathematical Left Angle Bracket", '<'),
 273     ('〈', "Left-Pointing Angle Bracket", '<'),
 274     ('〈', "Left Angle Bracket", '<'),
 275     ('㇛', "CJK Stroke Pd", '<'),
 276     ('く', "Hiragana Letter Ku", '<'),
 277     ('𡿨', "CJK Unified Ideograph-21FE8", '<'),
 278     ('《', "Left Double Angle Bracket", '<'),
 279     ('＜', "Fullwidth Less-Than Sign", '<'),
 280
 281     ('᐀', "Canadian Syllabics Hyphen", '='),
 282     ('⹀', "Double Hyphen", '='),
 283     ('゠', "Katakana-Hiragana Double Hyphen", '='),
 284     ('꓿', "Lisu Punctuation Full Stop", '='),
 285     ('＝', "Fullwidth Equals Sign", '='),
 286
 287     ('›', "Single Right-Pointing Angle Quotation Mark", '>'),
 288     ('❯', "Heavy Right-Pointing Angle Quotation Mark Ornament", '>'),
 289     ('˃', "Modifier Letter Right Arrowhead", '>'),
 290     ('𝈷', "Greek Instrumental Symbol-42", '>'),
 291     ('ᐳ', "Canadian Syllabics Po", '>'),
 292     ('𖼿', "Miao Letter Archaic Zza", '>'),
 293     ('❭', "Medium Right-Pointing Angle Bracket Ornament", '>'),
 294     ('⟩', "Mathematical Right Angle Bracket", '>'),
 295     ('〉', "Right-Pointing Angle Bracket", '>'),
 296     ('〉', "Right Angle Bracket", '>'),
 297     ('》', "Right Double Angle Bracket", '>'),
 298     ('＞', "Fullwidth Greater-Than Sign", '>'),
 299 ];
 300
// FIXME: the lexer could be used to turn the ASCII version of unicode homoglyphs into tokens,
// instead of keeping the substitution token in this table. Ideally, this should be inside
// `rustc_lexer`. However, we should first remove compound tokens like `<<` from `rustc_lexer`,
// and then add fancier error recovery to it, as there will be less overall work to do this way.

/// ASCII characters that the entries of `UNICODE_ARRAY` map to.
///
/// Each entry is `(ascii, name, token)`: `name` is printed in the diagnostic, and `token` is
/// what `check_for_substitution` hands back to its caller for this character (`None` when the
/// table provides no replacement token).
const ASCII_ARRAY: &[(char, &str, Option<token::TokenKind>)] = &[
    (' ', "Space", None),
    ('_', "Underscore", Some(token::Ident(kw::Underscore, false))),
    ('-', "Minus/Hyphen", Some(token::BinOp(token::Minus))),
    (',', "Comma", Some(token::Comma)),
    (';', "Semicolon", Some(token::Semi)),
    (':', "Colon", Some(token::Colon)),
    ('!', "Exclamation Mark", Some(token::Not)),
    ('?', "Question Mark", Some(token::Question)),
    ('.', "Period", Some(token::Dot)),
    ('(', "Left Parenthesis", Some(token::OpenDelim(token::Paren))),
    (')', "Right Parenthesis", Some(token::CloseDelim(token::Paren))),
    ('[', "Left Square Bracket", Some(token::OpenDelim(token::Bracket))),
    (']', "Right Square Bracket", Some(token::CloseDelim(token::Bracket))),
    ('{', "Left Curly Brace", Some(token::OpenDelim(token::Brace))),
    ('}', "Right Curly Brace", Some(token::CloseDelim(token::Brace))),
    ('*', "Asterisk", Some(token::BinOp(token::Star))),
    ('/', "Slash", Some(token::BinOp(token::Slash))),
    ('\\', "Backslash", None),
    ('&', "Ampersand", Some(token::BinOp(token::And))),
    ('+', "Plus Sign", Some(token::BinOp(token::Plus))),
    ('<', "Less-Than Sign", Some(token::Lt)),
    ('=', "Equals Sign", Some(token::Eq)),
    ('>', "Greater-Than Sign", Some(token::Gt)),
    // FIXME: Literals are already lexed by this point, so we can't recover gracefully just by
    // spitting the correct token out.
    ('\'', "Single Quote", None),
    ('"', "Quotation Mark", None),
];
 334
 335 pub(super) fn check_for_substitution<'a>(
 336     reader: &StringReader<'a>,
 337     pos: BytePos,
 338     ch: char,
 339     err: &mut DiagnosticBuilder<'a>,
 340 ) -> Option<token::TokenKind> {
 341     let (u_name, ascii_char) = match UNICODE_ARRAY.iter().find(|&&(c, _, _)| c == ch) {
 342         Some(&(_u_char, u_name, ascii_char)) => (u_name, ascii_char),
 343         None => return None,
 344     };
 345
 346     let span = Span::with_root_ctxt(pos, pos + Pos::from_usize(ch.len_utf8()));
 347
 348     let (ascii_name, token) = match ASCII_ARRAY.iter().find(|&&(c, _, _)| c == ascii_char) {
 349         Some((_ascii_char, ascii_name, token)) => (ascii_name, token),
 350         None => {
 351             let msg = format!("substitution character not found for '{}'", ch);
 352             reader.sess.span_diagnostic.span_bug_no_panic(span, &msg);
 353             return None;
 354         }
 355     };
 356
 357     // special help suggestion for "directed" double quotes
 358     if let Some(s) = peek_delimited(&reader.src[reader.src_index(pos)..], '“', '”') {
 359         let msg = format!(
 360             "Unicode characters '“' (Left Double Quotation Mark) and \
 361              '”' (Right Double Quotation Mark) look like '{}' ({}), but are not",
 362             ascii_char, ascii_name
 363         );
 364         err.span_suggestion(
 365             Span::with_root_ctxt(
 366                 pos,
 367                 pos + Pos::from_usize('“'.len_utf8() + s.len() + '”'.len_utf8()),
 368             ),
 369             &msg,
 370             format!("\"{}\"", s),
 371             Applicability::MaybeIncorrect,
 372         );
 373     } else {
 374         let msg = format!(
 375             "Unicode character '{}' ({}) looks like '{}' ({}), but it is not",
 376             ch, u_name, ascii_char, ascii_name
 377         );
 378         err.span_suggestion(span, &msg, ascii_char.to_string(), Applicability::MaybeIncorrect);
 379     }
 380     token.clone()
 381 }
 382
 383 /// Extract string if found at current position with given delimiters
 384 fn peek_delimited(text: &str, from_ch: char, to_ch: char) -> Option<&str> {
 385     let mut chars = text.chars();
 386     let first_char = chars.next()?;
 387     if first_char != from_ch {
 388         return None;
 389     }
 390     let last_char_idx = chars.as_str().find(to_ch)?;
 391     Some(&chars.as_str()[..last_char_idx])
 392 }