]> git.proxmox.com Git - rustc.git/blame - compiler/rustc_lint/src/non_ascii_idents.rs
New upstream version 1.68.2+dfsg1
[rustc.git] / compiler / rustc_lint / src / non_ascii_idents.rs
CommitLineData
9c376795
FG
1use crate::lints::{
2 ConfusableIdentifierPair, IdentifierNonAsciiChar, IdentifierUncommonCodepoints,
3 MixedScriptConfusables,
4};
dfeec247 5use crate::{EarlyContext, EarlyLintPass, LintContext};
3dfed10e 6use rustc_ast as ast;
f9f354fc 7use rustc_data_structures::fx::FxHashMap;
3dfed10e 8use rustc_span::symbol::Symbol;
416331ca
XL
9
10declare_lint! {
1b1a35ee
XL
11 /// The `non_ascii_idents` lint detects non-ASCII identifiers.
12 ///
13 /// ### Example
14 ///
15 /// ```rust,compile_fail
16 /// # #![allow(unused)]
1b1a35ee
XL
17 /// #![deny(non_ascii_idents)]
18 /// fn main() {
19 /// let föö = 1;
20 /// }
21 /// ```
22 ///
23 /// {{produces}}
24 ///
25 /// ### Explanation
26 ///
cdc7bbd5
XL
27 /// This lint allows projects that wish to retain the limit of only using
28 /// ASCII characters to switch this lint to "forbid" (for example to ease
29 /// collaboration or for security reasons).
1b1a35ee
XL
30 /// See [RFC 2457] for more details.
31 ///
1b1a35ee 32 /// [RFC 2457]: https://github.com/rust-lang/rfcs/blob/master/text/2457-non-ascii-idents.md
416331ca
XL
33 pub NON_ASCII_IDENTS,
34 Allow,
f035d41b
XL
35 "detects non-ASCII identifiers",
36 crate_level_only
416331ca
XL
37}
38
dfeec247 39declare_lint! {
1b1a35ee
XL
40 /// The `uncommon_codepoints` lint detects uncommon Unicode codepoints in
41 /// identifiers.
42 ///
43 /// ### Example
44 ///
45 /// ```rust
46 /// # #![allow(unused)]
1b1a35ee
XL
47 /// const µ: f64 = 0.000001;
48 /// ```
49 ///
50 /// {{produces}}
51 ///
52 /// ### Explanation
53 ///
cdc7bbd5
XL
54 /// This lint warns about using characters which are not commonly used, and may
55 /// cause visual confusion.
1b1a35ee
XL
56 ///
57 /// This lint is triggered by identifiers that contain a codepoint that is
58 /// not part of the set of "Allowed" codepoints as described by [Unicode®
59 /// Technical Standard #39 Unicode Security Mechanisms Section 3.1 General
60 /// Security Profile for Identifiers][TR39Allowed].
61 ///
62 /// Note that the set of uncommon codepoints may change over time. Beware
63 /// that if you "forbid" this lint that existing code may fail in the
64 /// future.
65 ///
1b1a35ee 66 /// [TR39Allowed]: https://www.unicode.org/reports/tr39/#General_Security_Profile
dfeec247
XL
67 pub UNCOMMON_CODEPOINTS,
68 Warn,
f035d41b
XL
69 "detects uncommon Unicode codepoints in identifiers",
70 crate_level_only
dfeec247
XL
71}
72
f9f354fc 73declare_lint! {
1b1a35ee
XL
74 /// The `confusable_idents` lint detects visually confusable pairs between
75 /// identifiers.
76 ///
77 /// ### Example
78 ///
79 /// ```rust
1b1a35ee
XL
80 /// // Latin Capital Letter E With Caron
81 /// pub const Ě: i32 = 1;
82 /// // Latin Capital Letter E With Breve
83 /// pub const Ĕ: i32 = 2;
84 /// ```
85 ///
86 /// {{produces}}
87 ///
88 /// ### Explanation
89 ///
cdc7bbd5
XL
90 /// This lint warns when different identifiers may appear visually similar,
91 /// which can cause confusion.
1b1a35ee
XL
92 ///
93 /// The confusable detection algorithm is based on [Unicode® Technical
94 /// Standard #39 Unicode Security Mechanisms Section 4 Confusable
95 /// Detection][TR39Confusable]. For every distinct identifier X execute
96 /// the function `skeleton(X)`. If there exist two distinct identifiers X
97 /// and Y in the same crate where `skeleton(X) = skeleton(Y)` report it.
98 /// The compiler uses the same mechanism to check if an identifier is too
99 /// similar to a keyword.
100 ///
101 /// Note that the set of confusable characters may change over time.
102 /// Beware that if you "forbid" this lint that existing code may fail in
103 /// the future.
104 ///
1b1a35ee 105 /// [TR39Confusable]: https://www.unicode.org/reports/tr39/#Confusable_Detection
f9f354fc 106 pub CONFUSABLE_IDENTS,
f035d41b
XL
107 Warn,
108 "detects visually confusable pairs between identifiers",
109 crate_level_only
f9f354fc
XL
110}
111
f035d41b 112declare_lint! {
1b1a35ee
XL
113 /// The `mixed_script_confusables` lint detects visually confusable
114 /// characters in identifiers between different [scripts].
115 ///
116 /// [scripts]: https://en.wikipedia.org/wiki/Script_(Unicode)
117 ///
118 /// ### Example
119 ///
120 /// ```rust
1b1a35ee
XL
121 /// // The Japanese katakana character エ can be confused with the Han character 工.
122 /// const エ: &'static str = "アイウ";
123 /// ```
124 ///
125 /// {{produces}}
126 ///
127 /// ### Explanation
128 ///
cdc7bbd5
XL
129 /// This lint warns when characters between different scripts may appear
130 /// visually similar, which can cause confusion.
1b1a35ee
XL
131 ///
132 /// If the crate contains other identifiers in the same script that have
133 /// non-confusable characters, then this lint will *not* be issued. For
134 /// example, if the example given above has another identifier with
135 /// katakana characters (such as `let カタカナ = 123;`), then this indicates
136 /// that you are intentionally using katakana, and it will not warn about
137 /// it.
138 ///
139 /// Note that the set of confusable characters may change over time.
140 /// Beware that if you "forbid" this lint that existing code may fail in
141 /// the future.
f035d41b
XL
142 pub MIXED_SCRIPT_CONFUSABLES,
143 Warn,
144 "detects Unicode scripts whose mixed script confusables codepoints are solely used",
145 crate_level_only
f9f354fc
XL
146}
147
f035d41b 148declare_lint_pass!(NonAsciiIdents => [NON_ASCII_IDENTS, UNCOMMON_CODEPOINTS, CONFUSABLE_IDENTS, MIXED_SCRIPT_CONFUSABLES]);
416331ca
XL
149
150impl EarlyLintPass for NonAsciiIdents {
f9f354fc
XL
151 fn check_crate(&mut self, cx: &EarlyContext<'_>, _: &ast::Crate) {
152 use rustc_session::lint::Level;
f035d41b
XL
153 use rustc_span::Span;
154 use std::collections::BTreeMap;
155 use unicode_security::GeneralSecurityProfile;
f035d41b
XL
156
157 let check_non_ascii_idents = cx.builder.lint_level(NON_ASCII_IDENTS).0 != Level::Allow;
158 let check_uncommon_codepoints =
159 cx.builder.lint_level(UNCOMMON_CODEPOINTS).0 != Level::Allow;
160 let check_confusable_idents = cx.builder.lint_level(CONFUSABLE_IDENTS).0 != Level::Allow;
161 let check_mixed_script_confusables =
162 cx.builder.lint_level(MIXED_SCRIPT_CONFUSABLES).0 != Level::Allow;
163
164 if !check_non_ascii_idents
165 && !check_uncommon_codepoints
166 && !check_confusable_idents
167 && !check_mixed_script_confusables
168 {
f9f354fc
XL
169 return;
170 }
f035d41b
XL
171
172 let mut has_non_ascii_idents = false;
5099ac24 173 let symbols = cx.sess().parse_sess.symbol_gallery.symbols.lock();
3dfed10e
XL
174
175 // Sort by `Span` so that error messages make sense with respect to the
176 // order of identifier locations in the code.
177 let mut symbols: Vec<_> = symbols.iter().collect();
178 symbols.sort_by_key(|k| k.1);
179
f035d41b 180 for (symbol, &sp) in symbols.iter() {
f9f354fc 181 let symbol_str = symbol.as_str();
f035d41b
XL
182 if symbol_str.is_ascii() {
183 continue;
f9f354fc 184 }
f035d41b 185 has_non_ascii_idents = true;
9c376795 186 cx.emit_spanned_lint(NON_ASCII_IDENTS, sp, IdentifierNonAsciiChar);
f035d41b
XL
187 if check_uncommon_codepoints
188 && !symbol_str.chars().all(GeneralSecurityProfile::identifier_allowed)
189 {
9c376795 190 cx.emit_spanned_lint(UNCOMMON_CODEPOINTS, sp, IdentifierUncommonCodepoints);
f9f354fc
XL
191 }
192 }
f035d41b
XL
193
194 if has_non_ascii_idents && check_confusable_idents {
3dfed10e 195 let mut skeleton_map: FxHashMap<Symbol, (Symbol, Span, bool)> =
f035d41b 196 FxHashMap::with_capacity_and_hasher(symbols.len(), Default::default());
3dfed10e
XL
197 let mut skeleton_buf = String::new();
198
199 for (&symbol, &sp) in symbols.iter() {
200 use unicode_security::confusable_detection::skeleton;
201
f9f354fc 202 let symbol_str = symbol.as_str();
f035d41b 203 let is_ascii = symbol_str.is_ascii();
3dfed10e
XL
204
205 // Get the skeleton as a `Symbol`.
206 skeleton_buf.clear();
207 skeleton_buf.extend(skeleton(&symbol_str));
208 let skeleton_sym = if *symbol_str == *skeleton_buf {
209 symbol
210 } else {
211 Symbol::intern(&skeleton_buf)
212 };
213
f035d41b 214 skeleton_map
3dfed10e
XL
215 .entry(skeleton_sym)
216 .and_modify(|(existing_symbol, existing_span, existing_is_ascii)| {
f035d41b 217 if !*existing_is_ascii || !is_ascii {
9c376795 218 cx.emit_spanned_lint(
2b03887a
FG
219 CONFUSABLE_IDENTS,
220 sp,
9c376795
FG
221 ConfusableIdentifierPair {
222 existing_sym: *existing_symbol,
223 sym: symbol,
224 label: *existing_span,
2b03887a
FG
225 },
226 );
f035d41b
XL
227 }
228 if *existing_is_ascii && !is_ascii {
3dfed10e 229 *existing_symbol = symbol;
f035d41b
XL
230 *existing_span = sp;
231 *existing_is_ascii = is_ascii;
232 }
233 })
3dfed10e 234 .or_insert((symbol, sp, is_ascii));
f9f354fc
XL
235 }
236 }
f035d41b
XL
237
238 if has_non_ascii_idents && check_mixed_script_confusables {
239 use unicode_security::is_potential_mixed_script_confusable_char;
240 use unicode_security::mixed_script::AugmentedScriptSet;
241
242 #[derive(Clone)]
243 enum ScriptSetUsage {
244 Suspicious(Vec<char>, Span),
245 Verified,
246 }
247
248 let mut script_states: FxHashMap<AugmentedScriptSet, ScriptSetUsage> =
249 FxHashMap::default();
250 let latin_augmented_script_set = AugmentedScriptSet::for_char('A');
251 script_states.insert(latin_augmented_script_set, ScriptSetUsage::Verified);
252
253 let mut has_suspicous = false;
254 for (symbol, &sp) in symbols.iter() {
255 let symbol_str = symbol.as_str();
256 for ch in symbol_str.chars() {
257 if ch.is_ascii() {
258 // all ascii characters are covered by exception.
259 continue;
260 }
261 if !GeneralSecurityProfile::identifier_allowed(ch) {
262 // this character is covered by `uncommon_codepoints` lint.
263 continue;
264 }
265 let augmented_script_set = AugmentedScriptSet::for_char(ch);
266 script_states
267 .entry(augmented_script_set)
268 .and_modify(|existing_state| {
269 if let ScriptSetUsage::Suspicious(ch_list, _) = existing_state {
270 if is_potential_mixed_script_confusable_char(ch) {
271 ch_list.push(ch);
272 } else {
273 *existing_state = ScriptSetUsage::Verified;
274 }
275 }
276 })
277 .or_insert_with(|| {
278 if !is_potential_mixed_script_confusable_char(ch) {
279 ScriptSetUsage::Verified
280 } else {
281 has_suspicous = true;
282 ScriptSetUsage::Suspicious(vec![ch], sp)
283 }
284 });
285 }
286 }
287
288 if has_suspicous {
289 let verified_augmented_script_sets = script_states
290 .iter()
291 .flat_map(|(k, v)| match v {
292 ScriptSetUsage::Verified => Some(*k),
293 _ => None,
294 })
295 .collect::<Vec<_>>();
296
297 // we're sorting the output here.
298 let mut lint_reports: BTreeMap<(Span, Vec<char>), AugmentedScriptSet> =
299 BTreeMap::new();
300
301 'outerloop: for (augment_script_set, usage) in script_states {
5e7ed085 302 let ScriptSetUsage::Suspicious(mut ch_list, sp) = usage else { continue };
f035d41b
XL
303
304 if augment_script_set.is_all() {
305 continue;
306 }
307
308 for existing in verified_augmented_script_sets.iter() {
309 if existing.is_all() {
310 continue;
311 }
312 let mut intersect = *existing;
313 intersect.intersect_with(augment_script_set);
314 if !intersect.is_empty() && !intersect.is_all() {
315 continue 'outerloop;
316 }
317 }
318
1b1a35ee
XL
319 // We sort primitive chars here and can use unstable sort
320 ch_list.sort_unstable();
f035d41b
XL
321 ch_list.dedup();
322 lint_reports.insert((sp, ch_list), augment_script_set);
323 }
324
325 for ((sp, ch_list), script_set) in lint_reports {
9c376795
FG
326 let mut includes = String::new();
327 for (idx, ch) in ch_list.into_iter().enumerate() {
328 if idx != 0 {
329 includes += ", ";
330 }
331 let char_info = format!("'{}' (U+{:04X})", ch, ch as u32);
332 includes += &char_info;
333 }
334 cx.emit_spanned_lint(
2b03887a
FG
335 MIXED_SCRIPT_CONFUSABLES,
336 sp,
9c376795 337 MixedScriptConfusables { set: script_set.to_string(), includes },
2b03887a 338 );
f035d41b
XL
339 }
340 }
f9f354fc
XL
341 }
342 }
f035d41b 343}