]> git.proxmox.com Git - rustc.git/blob - compiler/rustc_lint/src/non_ascii_idents.rs
New upstream version 1.48.0~beta.8+dfsg1
[rustc.git] / compiler / rustc_lint / src / non_ascii_idents.rs
1 use crate::{EarlyContext, EarlyLintPass, LintContext};
2 use rustc_ast as ast;
3 use rustc_data_structures::fx::FxHashMap;
4 use rustc_span::symbol::Symbol;
5
6 declare_lint! {
7 /// The `non_ascii_idents` lint detects non-ASCII identifiers.
8 ///
9 /// ### Example
10 ///
11 /// ```rust,compile_fail
12 /// # #![allow(unused)]
13 /// #![feature(non_ascii_idents)]
14 /// #![deny(non_ascii_idents)]
15 /// fn main() {
16 /// let föö = 1;
17 /// }
18 /// ```
19 ///
20 /// {{produces}}
21 ///
22 /// ### Explanation
23 ///
24 /// Currently on stable Rust, identifiers must contain ASCII characters.
25 /// The [`non_ascii_idents`] nightly-only feature allows identifiers to
26 /// contain non-ASCII characters. This lint allows projects that wish to
27 /// retain the limit of only using ASCII characters to switch this lint to
28 /// "forbid" (for example to ease collaboration or for security reasons).
29 /// See [RFC 2457] for more details.
30 ///
31 /// [`non_ascii_idents`]: https://doc.rust-lang.org/nightly/unstable-book/language-features/non-ascii-idents.html
32 /// [RFC 2457]: https://github.com/rust-lang/rfcs/blob/master/text/2457-non-ascii-idents.md
33 pub NON_ASCII_IDENTS,
34 Allow,
35 "detects non-ASCII identifiers",
36 crate_level_only
37 }
38
39 declare_lint! {
40 /// The `uncommon_codepoints` lint detects uncommon Unicode codepoints in
41 /// identifiers.
42 ///
43 /// ### Example
44 ///
45 /// ```rust
46 /// # #![allow(unused)]
47 /// #![feature(non_ascii_idents)]
48 /// const µ: f64 = 0.000001;
49 /// ```
50 ///
51 /// {{produces}}
52 ///
53 /// ### Explanation
54 ///
55 /// With the [`non_ascii_idents`] nightly-only feature enabled,
56 /// identifiers are allowed to use non-ASCII characters. This lint warns
57 /// about using characters which are not commonly used, and may cause
58 /// visual confusion.
59 ///
60 /// This lint is triggered by identifiers that contain a codepoint that is
61 /// not part of the set of "Allowed" codepoints as described by [Unicode®
62 /// Technical Standard #39 Unicode Security Mechanisms Section 3.1 General
63 /// Security Profile for Identifiers][TR39Allowed].
64 ///
65 /// Note that the set of uncommon codepoints may change over time. Beware
66 /// that if you "forbid" this lint that existing code may fail in the
67 /// future.
68 ///
69 /// [`non_ascii_idents`]: https://doc.rust-lang.org/nightly/unstable-book/language-features/non-ascii-idents.html
70 /// [TR39Allowed]: https://www.unicode.org/reports/tr39/#General_Security_Profile
71 pub UNCOMMON_CODEPOINTS,
72 Warn,
73 "detects uncommon Unicode codepoints in identifiers",
74 crate_level_only
75 }
76
77 declare_lint! {
78 /// The `confusable_idents` lint detects visually confusable pairs between
79 /// identifiers.
80 ///
81 /// ### Example
82 ///
83 /// ```rust
84 /// #![feature(non_ascii_idents)]
85 ///
86 /// // Latin Capital Letter E With Caron
87 /// pub const Ě: i32 = 1;
88 /// // Latin Capital Letter E With Breve
89 /// pub const Ĕ: i32 = 2;
90 /// ```
91 ///
92 /// {{produces}}
93 ///
94 /// ### Explanation
95 ///
96 /// With the [`non_ascii_idents`] nightly-only feature enabled,
97 /// identifiers are allowed to use non-ASCII characters. This lint warns
98 /// when different identifiers may appear visually similar, which can
99 /// cause confusion.
100 ///
101 /// The confusable detection algorithm is based on [Unicode® Technical
102 /// Standard #39 Unicode Security Mechanisms Section 4 Confusable
103 /// Detection][TR39Confusable]. For every distinct identifier X execute
104 /// the function `skeleton(X)`. If there exist two distinct identifiers X
105 /// and Y in the same crate where `skeleton(X) = skeleton(Y)` report it.
106 /// The compiler uses the same mechanism to check if an identifier is too
107 /// similar to a keyword.
108 ///
109 /// Note that the set of confusable characters may change over time.
110 /// Beware that if you "forbid" this lint that existing code may fail in
111 /// the future.
112 ///
113 /// [`non_ascii_idents`]: https://doc.rust-lang.org/nightly/unstable-book/language-features/non-ascii-idents.html
114 /// [TR39Confusable]: https://www.unicode.org/reports/tr39/#Confusable_Detection
115 pub CONFUSABLE_IDENTS,
116 Warn,
117 "detects visually confusable pairs between identifiers",
118 crate_level_only
119 }
120
121 declare_lint! {
122 /// The `mixed_script_confusables` lint detects visually confusable
123 /// characters in identifiers between different [scripts].
124 ///
125 /// [scripts]: https://en.wikipedia.org/wiki/Script_(Unicode)
126 ///
127 /// ### Example
128 ///
129 /// ```rust
130 /// #![feature(non_ascii_idents)]
131 ///
132 /// // The Japanese katakana character エ can be confused with the Han character 工.
133 /// const エ: &'static str = "アイウ";
134 /// ```
135 ///
136 /// {{produces}}
137 ///
138 /// ### Explanation
139 ///
140 /// With the [`non_ascii_idents`] nightly-only feature enabled,
141 /// identifiers are allowed to use non-ASCII characters. This lint warns
142 /// when characters between different scripts may appear visually similar,
143 /// which can cause confusion.
144 ///
145 /// If the crate contains other identifiers in the same script that have
146 /// non-confusable characters, then this lint will *not* be issued. For
147 /// example, if the example given above has another identifier with
148 /// katakana characters (such as `let カタカナ = 123;`), then this indicates
149 /// that you are intentionally using katakana, and it will not warn about
150 /// it.
151 ///
152 /// Note that the set of confusable characters may change over time.
153 /// Beware that if you "forbid" this lint that existing code may fail in
154 /// the future.
155 ///
156 /// [`non_ascii_idents`]: https://doc.rust-lang.org/nightly/unstable-book/language-features/non-ascii-idents.html
157 pub MIXED_SCRIPT_CONFUSABLES,
158 Warn,
159 "detects Unicode scripts whose mixed script confusables codepoints are solely used",
160 crate_level_only
161 }
162
163 declare_lint_pass!(NonAsciiIdents => [NON_ASCII_IDENTS, UNCOMMON_CODEPOINTS, CONFUSABLE_IDENTS, MIXED_SCRIPT_CONFUSABLES]);
164
165 impl EarlyLintPass for NonAsciiIdents {
166 fn check_crate(&mut self, cx: &EarlyContext<'_>, _: &ast::Crate) {
167 use rustc_session::lint::Level;
168 use rustc_span::Span;
169 use std::collections::BTreeMap;
170 use unicode_security::GeneralSecurityProfile;
171
172 let check_non_ascii_idents = cx.builder.lint_level(NON_ASCII_IDENTS).0 != Level::Allow;
173 let check_uncommon_codepoints =
174 cx.builder.lint_level(UNCOMMON_CODEPOINTS).0 != Level::Allow;
175 let check_confusable_idents = cx.builder.lint_level(CONFUSABLE_IDENTS).0 != Level::Allow;
176 let check_mixed_script_confusables =
177 cx.builder.lint_level(MIXED_SCRIPT_CONFUSABLES).0 != Level::Allow;
178
179 if !check_non_ascii_idents
180 && !check_uncommon_codepoints
181 && !check_confusable_idents
182 && !check_mixed_script_confusables
183 {
184 return;
185 }
186
187 let mut has_non_ascii_idents = false;
188 let symbols = cx.sess.parse_sess.symbol_gallery.symbols.lock();
189
190 // Sort by `Span` so that error messages make sense with respect to the
191 // order of identifier locations in the code.
192 let mut symbols: Vec<_> = symbols.iter().collect();
193 symbols.sort_by_key(|k| k.1);
194
195 for (symbol, &sp) in symbols.iter() {
196 let symbol_str = symbol.as_str();
197 if symbol_str.is_ascii() {
198 continue;
199 }
200 has_non_ascii_idents = true;
201 cx.struct_span_lint(NON_ASCII_IDENTS, sp, |lint| {
202 lint.build("identifier contains non-ASCII characters").emit()
203 });
204 if check_uncommon_codepoints
205 && !symbol_str.chars().all(GeneralSecurityProfile::identifier_allowed)
206 {
207 cx.struct_span_lint(UNCOMMON_CODEPOINTS, sp, |lint| {
208 lint.build("identifier contains uncommon Unicode codepoints").emit()
209 })
210 }
211 }
212
213 if has_non_ascii_idents && check_confusable_idents {
214 let mut skeleton_map: FxHashMap<Symbol, (Symbol, Span, bool)> =
215 FxHashMap::with_capacity_and_hasher(symbols.len(), Default::default());
216 let mut skeleton_buf = String::new();
217
218 for (&symbol, &sp) in symbols.iter() {
219 use unicode_security::confusable_detection::skeleton;
220
221 let symbol_str = symbol.as_str();
222 let is_ascii = symbol_str.is_ascii();
223
224 // Get the skeleton as a `Symbol`.
225 skeleton_buf.clear();
226 skeleton_buf.extend(skeleton(&symbol_str));
227 let skeleton_sym = if *symbol_str == *skeleton_buf {
228 symbol
229 } else {
230 Symbol::intern(&skeleton_buf)
231 };
232
233 skeleton_map
234 .entry(skeleton_sym)
235 .and_modify(|(existing_symbol, existing_span, existing_is_ascii)| {
236 if !*existing_is_ascii || !is_ascii {
237 cx.struct_span_lint(CONFUSABLE_IDENTS, sp, |lint| {
238 lint.build(&format!(
239 "identifier pair considered confusable between `{}` and `{}`",
240 existing_symbol.as_str(),
241 symbol.as_str()
242 ))
243 .span_label(
244 *existing_span,
245 "this is where the previous identifier occurred",
246 )
247 .emit();
248 });
249 }
250 if *existing_is_ascii && !is_ascii {
251 *existing_symbol = symbol;
252 *existing_span = sp;
253 *existing_is_ascii = is_ascii;
254 }
255 })
256 .or_insert((symbol, sp, is_ascii));
257 }
258 }
259
260 if has_non_ascii_idents && check_mixed_script_confusables {
261 use unicode_security::is_potential_mixed_script_confusable_char;
262 use unicode_security::mixed_script::AugmentedScriptSet;
263
264 #[derive(Clone)]
265 enum ScriptSetUsage {
266 Suspicious(Vec<char>, Span),
267 Verified,
268 }
269
270 let mut script_states: FxHashMap<AugmentedScriptSet, ScriptSetUsage> =
271 FxHashMap::default();
272 let latin_augmented_script_set = AugmentedScriptSet::for_char('A');
273 script_states.insert(latin_augmented_script_set, ScriptSetUsage::Verified);
274
275 let mut has_suspicous = false;
276 for (symbol, &sp) in symbols.iter() {
277 let symbol_str = symbol.as_str();
278 for ch in symbol_str.chars() {
279 if ch.is_ascii() {
280 // all ascii characters are covered by exception.
281 continue;
282 }
283 if !GeneralSecurityProfile::identifier_allowed(ch) {
284 // this character is covered by `uncommon_codepoints` lint.
285 continue;
286 }
287 let augmented_script_set = AugmentedScriptSet::for_char(ch);
288 script_states
289 .entry(augmented_script_set)
290 .and_modify(|existing_state| {
291 if let ScriptSetUsage::Suspicious(ch_list, _) = existing_state {
292 if is_potential_mixed_script_confusable_char(ch) {
293 ch_list.push(ch);
294 } else {
295 *existing_state = ScriptSetUsage::Verified;
296 }
297 }
298 })
299 .or_insert_with(|| {
300 if !is_potential_mixed_script_confusable_char(ch) {
301 ScriptSetUsage::Verified
302 } else {
303 has_suspicous = true;
304 ScriptSetUsage::Suspicious(vec![ch], sp)
305 }
306 });
307 }
308 }
309
310 if has_suspicous {
311 let verified_augmented_script_sets = script_states
312 .iter()
313 .flat_map(|(k, v)| match v {
314 ScriptSetUsage::Verified => Some(*k),
315 _ => None,
316 })
317 .collect::<Vec<_>>();
318
319 // we're sorting the output here.
320 let mut lint_reports: BTreeMap<(Span, Vec<char>), AugmentedScriptSet> =
321 BTreeMap::new();
322
323 'outerloop: for (augment_script_set, usage) in script_states {
324 let (mut ch_list, sp) = match usage {
325 ScriptSetUsage::Verified => continue,
326 ScriptSetUsage::Suspicious(ch_list, sp) => (ch_list, sp),
327 };
328
329 if augment_script_set.is_all() {
330 continue;
331 }
332
333 for existing in verified_augmented_script_sets.iter() {
334 if existing.is_all() {
335 continue;
336 }
337 let mut intersect = *existing;
338 intersect.intersect_with(augment_script_set);
339 if !intersect.is_empty() && !intersect.is_all() {
340 continue 'outerloop;
341 }
342 }
343
344 // We sort primitive chars here and can use unstable sort
345 ch_list.sort_unstable();
346 ch_list.dedup();
347 lint_reports.insert((sp, ch_list), augment_script_set);
348 }
349
350 for ((sp, ch_list), script_set) in lint_reports {
351 cx.struct_span_lint(MIXED_SCRIPT_CONFUSABLES, sp, |lint| {
352 let message = format!(
353 "The usage of Script Group `{}` in this crate consists solely of mixed script confusables",
354 script_set);
355 let mut note = "The usage includes ".to_string();
356 for (idx, ch) in ch_list.into_iter().enumerate() {
357 if idx != 0 {
358 note += ", ";
359 }
360 let char_info = format!("'{}' (U+{:04X})", ch, ch as u32);
361 note += &char_info;
362 }
363 note += ".";
364 lint.build(&message).note(&note).note("Please recheck to make sure their usages are indeed what you want.").emit()
365 });
366 }
367 }
368 }
369 }
370 }