compiler/rustc_lint/src/non_ascii_idents.rs

   1 use crate::{EarlyContext, EarlyLintPass, LintContext};
   2 use rustc_ast as ast;
   3 use rustc_data_structures::fx::FxHashMap;
   4 use rustc_span::symbol::Symbol;
   5
   6 declare_lint! {
   7     /// The `non_ascii_idents` lint detects non-ASCII identifiers.
   8     ///
   9     /// ### Example
  10     ///
  11     /// ```rust,compile_fail
  12     /// # #![allow(unused)]
  13     /// #![feature(non_ascii_idents)]
  14     /// #![deny(non_ascii_idents)]
  15     /// fn main() {
  16     ///     let föö = 1;
  17     /// }
  18     /// ```
  19     ///
  20     /// {{produces}}
  21     ///
  22     /// ### Explanation
  23     ///
  24     /// Currently on stable Rust, identifiers must contain only ASCII
  25     /// characters. The [`non_ascii_idents`] nightly-only feature allows
  26     /// identifiers to contain non-ASCII characters. This lint allows projects
  27     /// that wish to retain the limit of only using ASCII characters to switch
  28     /// this lint to "forbid" (for example to ease collaboration or for
  29     /// security reasons). See [RFC 2457] for more details.
  30     ///
  31     /// [`non_ascii_idents`]: https://doc.rust-lang.org/nightly/unstable-book/language-features/non-ascii-idents.html
  32     /// [RFC 2457]: https://github.com/rust-lang/rfcs/blob/master/text/2457-non-ascii-idents.md
  33     pub NON_ASCII_IDENTS,
  34     Allow,
  35     "detects non-ASCII identifiers",
  36     crate_level_only
  37 }
  38
  39 declare_lint! {
  40     /// The `uncommon_codepoints` lint detects uncommon Unicode codepoints in
  41     /// identifiers.
  42     ///
  43     /// ### Example
  44     ///
  45     /// ```rust
  46     /// # #![allow(unused)]
  47     /// #![feature(non_ascii_idents)]
  48     /// const µ: f64 = 0.000001;
  49     /// ```
  50     ///
  51     /// {{produces}}
  52     ///
  53     /// ### Explanation
  54     ///
  55     /// With the [`non_ascii_idents`] nightly-only feature enabled,
  56     /// identifiers are allowed to use non-ASCII characters. This lint warns
  57     /// about using characters which are not commonly used, and may cause
  58     /// visual confusion.
  59     ///
  60     /// This lint is triggered by identifiers that contain a codepoint that is
  61     /// not part of the set of "Allowed" codepoints as described by [Unicode®
  62     /// Technical Standard #39 Unicode Security Mechanisms Section 3.1 General
  63     /// Security Profile for Identifiers][TR39Allowed].
  64     ///
  65     /// Note that the set of uncommon codepoints may change over time. Beware
  66     /// that if you "forbid" this lint, existing code may fail in the
  67     /// future.
  68     ///
  69     /// [`non_ascii_idents`]: https://doc.rust-lang.org/nightly/unstable-book/language-features/non-ascii-idents.html
  70     /// [TR39Allowed]: https://www.unicode.org/reports/tr39/#General_Security_Profile
  71     pub UNCOMMON_CODEPOINTS,
  72     Warn,
  73     "detects uncommon Unicode codepoints in identifiers",
  74     crate_level_only
  75 }
  76
  77 declare_lint! {
  78     /// The `confusable_idents` lint detects visually confusable pairs between
  79     /// identifiers.
  80     ///
  81     /// ### Example
  82     ///
  83     /// ```rust
  84     /// #![feature(non_ascii_idents)]
  85     ///
  86     /// // Latin Capital Letter E With Caron
  87     /// pub const Ě: i32 = 1;
  88     /// // Latin Capital Letter E With Breve
  89     /// pub const Ĕ: i32 = 2;
  90     /// ```
  91     ///
  92     /// {{produces}}
  93     ///
  94     /// ### Explanation
  95     ///
  96     /// With the [`non_ascii_idents`] nightly-only feature enabled,
  97     /// identifiers are allowed to use non-ASCII characters. This lint warns
  98     /// when different identifiers may appear visually similar, which can
  99     /// cause confusion.
 100     ///
 101     /// The confusable detection algorithm is based on [Unicode® Technical
 102     /// Standard #39 Unicode Security Mechanisms Section 4 Confusable
 103     /// Detection][TR39Confusable]. For every distinct identifier X, execute
 104     /// the function `skeleton(X)`. If there exist two distinct identifiers X
 105     /// and Y in the same crate where `skeleton(X) = skeleton(Y)`, report it.
 106     /// The compiler uses the same mechanism to check if an identifier is too
 107     /// similar to a keyword.
 108     ///
 109     /// Note that the set of confusable characters may change over time.
 110     /// Beware that if you "forbid" this lint, existing code may fail in
 111     /// the future.
 112     ///
 113     /// [`non_ascii_idents`]: https://doc.rust-lang.org/nightly/unstable-book/language-features/non-ascii-idents.html
 114     /// [TR39Confusable]: https://www.unicode.org/reports/tr39/#Confusable_Detection
 115     pub CONFUSABLE_IDENTS,
 116     Warn,
 117     "detects visually confusable pairs between identifiers",
 118     crate_level_only
 119 }
 120
 121 declare_lint! {
 122     /// The `mixed_script_confusables` lint detects visually confusable
 123     /// characters in identifiers between different [scripts].
 124     ///
 125     /// [scripts]: https://en.wikipedia.org/wiki/Script_(Unicode)
 126     ///
 127     /// ### Example
 128     ///
 129     /// ```rust
 130     /// #![feature(non_ascii_idents)]
 131     ///
 132     /// // The Japanese katakana character エ can be confused with the Han character 工.
 133     /// const エ: &'static str = "アイウ";
 134     /// ```
 135     ///
 136     /// {{produces}}
 137     ///
 138     /// ### Explanation
 139     ///
 140     /// With the [`non_ascii_idents`] nightly-only feature enabled,
 141     /// identifiers are allowed to use non-ASCII characters. This lint warns
 142     /// when characters between different scripts may appear visually similar,
 143     /// which can cause confusion.
 144     ///
 145     /// If the crate contains other identifiers in the same script that have
 146     /// non-confusable characters, then this lint will *not* be issued. For
 147     /// example, if the example given above has another identifier with
 148     /// katakana characters (such as `let カタカナ = 123;`), then this indicates
 149     /// that you are intentionally using katakana, and it will not warn about
 150     /// it.
 151     ///
 152     /// Note that the set of confusable characters may change over time.
 153     /// Beware that if you "forbid" this lint, existing code may fail in
 154     /// the future.
 155     ///
 156     /// [`non_ascii_idents`]: https://doc.rust-lang.org/nightly/unstable-book/language-features/non-ascii-idents.html
 157     pub MIXED_SCRIPT_CONFUSABLES,
 158     Warn,
 159     "detects Unicode scripts whose mixed script confusables codepoints are solely used",
 160     crate_level_only
 161 }
 162
 163 // Registers the `NonAsciiIdents` lint pass together with the lints it can emit.
 164 declare_lint_pass!(NonAsciiIdents => [
 165     NON_ASCII_IDENTS,
 166     UNCOMMON_CODEPOINTS,
 167     CONFUSABLE_IDENTS,
 168     MIXED_SCRIPT_CONFUSABLES
 169 ]);
 170
 171 impl EarlyLintPass for NonAsciiIdents {
 172     /// Runs the identifier lints of this pass over the symbols recorded in the
 173     /// session's symbol gallery.
 174     ///
 175     /// Returns right away when all four lints are at `Allow`. The confusable
 176     /// and mixed-script analyses only run when at least one non-ASCII
 177     /// identifier was seen and their respective lint is not at `Allow`.
 178     fn check_crate(&mut self, cx: &EarlyContext<'_>, _: &ast::Crate) {
 179         use rustc_session::lint::Level;
 180         use rustc_span::Span;
 181         use std::collections::BTreeMap;
 182         use unicode_security::GeneralSecurityProfile;
 183
 184         let non_ascii_on = cx.builder.lint_level(NON_ASCII_IDENTS).0 != Level::Allow;
 185         let uncommon_on = cx.builder.lint_level(UNCOMMON_CODEPOINTS).0 != Level::Allow;
 186         let confusable_on = cx.builder.lint_level(CONFUSABLE_IDENTS).0 != Level::Allow;
 187         let mixed_script_on =
 188             cx.builder.lint_level(MIXED_SCRIPT_CONFUSABLES).0 != Level::Allow;
 189
 190         // Nothing to do when every lint of this pass is allowed.
 191         if !(non_ascii_on || uncommon_on || confusable_on || mixed_script_on) {
 192             return;
 193         }
 194
 195         let gallery = cx.sess.parse_sess.symbol_gallery.symbols.lock();
 196
 197         // Order the identifiers by `Span` so that diagnostics follow the order in
 198         // which the identifiers appear in the code.
 199         let mut idents: Vec<_> = gallery.iter().collect();
 200         idents.sort_by_key(|entry| entry.1);
 201
 202         let mut saw_non_ascii = false;
 203         for (symbol, &sp) in idents.iter() {
 204             let name = symbol.as_str();
 205             if !name.is_ascii() {
 206                 saw_non_ascii = true;
 207                 cx.struct_span_lint(NON_ASCII_IDENTS, sp, |lint| {
 208                     lint.build("identifier contains non-ASCII characters").emit()
 209                 });
 210
 211                 let has_uncommon = uncommon_on
 212                     && name.chars().any(|c| !GeneralSecurityProfile::identifier_allowed(c));
 213                 if has_uncommon {
 214                     cx.struct_span_lint(UNCOMMON_CODEPOINTS, sp, |lint| {
 215                         lint.build("identifier contains uncommon Unicode codepoints").emit()
 216                     });
 217                 }
 218             }
 219         }
 220
 221         if saw_non_ascii && confusable_on {
 222             use unicode_security::confusable_detection::skeleton;
 223
 224             // For each skeleton: the identifier remembered for it, as
 225             // `(symbol, span, is_ascii)`.
 226             let mut by_skeleton: FxHashMap<Symbol, (Symbol, Span, bool)> =
 227                 FxHashMap::with_capacity_and_hasher(idents.len(), Default::default());
 228             // Scratch buffer reused for every skeleton computation.
 229             let mut scratch = String::new();
 230
 231             for (&symbol, &sp) in idents.iter() {
 232                 let name = symbol.as_str();
 233                 let is_ascii = name.is_ascii();
 234
 235                 // Compute the skeleton and turn it into a `Symbol`, reusing the
 236                 // identifier's own symbol when it already is its own skeleton.
 237                 scratch.clear();
 238                 scratch.extend(skeleton(&name));
 239                 let key = if *name != *scratch { Symbol::intern(&scratch) } else { symbol };
 240
 241                 by_skeleton
 242                     .entry(key)
 243                     .and_modify(|previous| {
 244                         let (prev_symbol, prev_span, prev_is_ascii) = *previous;
 245
 246                         // A pair of two ASCII identifiers is never reported.
 247                         if !(prev_is_ascii && is_ascii) {
 248                             cx.struct_span_lint(CONFUSABLE_IDENTS, sp, |lint| {
 249                                 let message = format!(
 250                                     "identifier pair considered confusable between `{}` and `{}`",
 251                                     prev_symbol.as_str(),
 252                                     symbol.as_str()
 253                                 );
 254                                 lint.build(&message)
 255                                     .span_label(
 256                                         prev_span,
 257                                         "this is where the previous identifier occurred",
 258                                     )
 259                                     .emit();
 260                             });
 261                         }
 262
 263                         // Keep a non-ASCII identifier as the remembered entry in
 264                         // preference to an ASCII one.
 265                         if prev_is_ascii && !is_ascii {
 266                             *previous = (symbol, sp, is_ascii);
 267                         }
 268                     })
 269                     .or_insert((symbol, sp, is_ascii));
 270             }
 271         }
 272
 273         if saw_non_ascii && mixed_script_on {
 274             use unicode_security::is_potential_mixed_script_confusable_char;
 275             use unicode_security::mixed_script::AugmentedScriptSet;
 276
 277             // How a script set has been used by the identifiers seen so far.
 278             #[derive(Clone)]
 279             enum ScriptSetUsage {
 280                 // Only potential mixed-script confusables were seen: the characters
 281                 // collected so far and the span of the first identifier using them.
 282                 Suspicious(Vec<char>, Span),
 283                 // The script set is treated as intentionally used.
 284                 Verified,
 285             }
 286
 287             let mut usage_by_script: FxHashMap<AugmentedScriptSet, ScriptSetUsage> =
 288                 FxHashMap::default();
 289             // The script set of `'A'` (Latin) is verified from the start.
 290             usage_by_script.insert(AugmentedScriptSet::for_char('A'), ScriptSetUsage::Verified);
 291
 292             let mut found_suspicious = false;
 293             for (symbol, &sp) in idents.iter() {
 294                 let name = symbol.as_str();
 295                 // ASCII characters are covered by the Latin exception, and characters
 296                 // outside the allowed set belong to the `uncommon_codepoints` lint.
 297                 let candidates = name
 298                     .chars()
 299                     .filter(|c| !c.is_ascii() && GeneralSecurityProfile::identifier_allowed(*c));
 300                 for ch in candidates {
 301                     usage_by_script
 302                         .entry(AugmentedScriptSet::for_char(ch))
 303                         .and_modify(|state| {
 304                             if let ScriptSetUsage::Suspicious(seen, _) = state {
 305                                 if !is_potential_mixed_script_confusable_char(ch) {
 306                                     *state = ScriptSetUsage::Verified;
 307                                 } else {
 308                                     seen.push(ch);
 309                                 }
 310                             }
 311                         })
 312                         .or_insert_with(|| {
 313                             if is_potential_mixed_script_confusable_char(ch) {
 314                                 found_suspicious = true;
 315                                 ScriptSetUsage::Suspicious(vec![ch], sp)
 316                             } else {
 317                                 ScriptSetUsage::Verified
 318                             }
 319                         });
 320                 }
 321             }
 322
 323             if found_suspicious {
 324                 let verified_sets = usage_by_script
 325                     .iter()
 326                     .filter_map(|(set, usage)| match usage {
 327                         ScriptSetUsage::Verified => Some(*set),
 328                         ScriptSetUsage::Suspicious(..) => None,
 329                     })
 330                     .collect::<Vec<_>>();
 331
 332                 // Collected into a `BTreeMap` so the reports come out sorted.
 333                 let mut reports: BTreeMap<(Span, Vec<char>), AugmentedScriptSet> =
 334                     BTreeMap::new();
 335
 336                 for (script_set, usage) in usage_by_script {
 337                     let (mut chars, sp) = match usage {
 338                         ScriptSetUsage::Suspicious(chars, sp) => (chars, sp),
 339                         ScriptSetUsage::Verified => continue,
 340                     };
 341
 342                     if script_set.is_all() {
 343                         continue;
 344                     }
 345
 346                     // Skip this script set when its intersection with some verified
 347                     // set is neither empty nor `is_all()`; verified sets that are
 348                     // `is_all()` themselves are not considered.
 349                     let overlaps_verified = verified_sets.iter().any(|verified| {
 350                         if verified.is_all() {
 351                             return false;
 352                         }
 353                         let mut common = *verified;
 354                         common.intersect_with(script_set);
 355                         !common.is_empty() && !common.is_all()
 356                     });
 357                     if overlaps_verified {
 358                         continue;
 359                     }
 360
 361                     // Plain `char`s are being sorted, so an unstable sort is fine.
 362                     chars.sort_unstable();
 363                     chars.dedup();
 364                     reports.insert((sp, chars), script_set);
 365                 }
 366
 367                 for ((sp, chars), script_set) in reports {
 368                     cx.struct_span_lint(MIXED_SCRIPT_CONFUSABLES, sp, |lint| {
 369                         let message = format!(
 370                             "The usage of Script Group `{}` in this crate consists solely of mixed script confusables",
 371                             script_set);
 372                         let listed = chars
 373                             .iter()
 374                             .map(|&ch| format!("'{}' (U+{:04X})", ch, ch as u32))
 375                             .collect::<Vec<_>>()
 376                             .join(", ");
 377                         let mut note = "The usage includes ".to_string();
 378                         note += &listed;
 379                         note += ".";
 380                         lint.build(&message)
 381                             .note(&note)
 382                             .note("Please recheck to make sure their usages are indeed what you want.")
 383                             .emit()
 384                     });
 385                 }
 386             }
 387         }
 388     }
 389 }