]> git.proxmox.com Git - rustc.git/blame - compiler/rustc_lint/src/non_ascii_idents.rs
New upstream version 1.59.0+dfsg1
[rustc.git] / compiler / rustc_lint / src / non_ascii_idents.rs
CommitLineData
dfeec247 1use crate::{EarlyContext, EarlyLintPass, LintContext};
3dfed10e 2use rustc_ast as ast;
f9f354fc 3use rustc_data_structures::fx::FxHashMap;
3dfed10e 4use rustc_span::symbol::Symbol;
416331ca
XL
5
6declare_lint! {
1b1a35ee
XL
7 /// The `non_ascii_idents` lint detects non-ASCII identifiers.
8 ///
9 /// ### Example
10 ///
11 /// ```rust,compile_fail
12 /// # #![allow(unused)]
1b1a35ee
XL
13 /// #![deny(non_ascii_idents)]
14 /// fn main() {
15 /// let föö = 1;
16 /// }
17 /// ```
18 ///
19 /// {{produces}}
20 ///
21 /// ### Explanation
22 ///
cdc7bbd5
XL
23 /// This lint allows projects that wish to retain the limit of only using
24 /// ASCII characters to switch this lint to "forbid" (for example to ease
25 /// collaboration or for security reasons).
1b1a35ee
XL
26 /// See [RFC 2457] for more details.
27 ///
1b1a35ee 28 /// [RFC 2457]: https://github.com/rust-lang/rfcs/blob/master/text/2457-non-ascii-idents.md
416331ca
XL
29 pub NON_ASCII_IDENTS,
30 Allow,
f035d41b
XL
31 "detects non-ASCII identifiers",
32 crate_level_only
416331ca
XL
33}
34
dfeec247 35declare_lint! {
1b1a35ee
XL
36 /// The `uncommon_codepoints` lint detects uncommon Unicode codepoints in
37 /// identifiers.
38 ///
39 /// ### Example
40 ///
41 /// ```rust
42 /// # #![allow(unused)]
1b1a35ee
XL
43 /// const µ: f64 = 0.000001;
44 /// ```
45 ///
46 /// {{produces}}
47 ///
48 /// ### Explanation
49 ///
cdc7bbd5
XL
50 /// This lint warns about using characters which are not commonly used, and may
51 /// cause visual confusion.
1b1a35ee
XL
52 ///
53 /// This lint is triggered by identifiers that contain a codepoint that is
54 /// not part of the set of "Allowed" codepoints as described by [Unicode®
55 /// Technical Standard #39 Unicode Security Mechanisms Section 3.1 General
56 /// Security Profile for Identifiers][TR39Allowed].
57 ///
58 /// Note that the set of uncommon codepoints may change over time. Beware
59 /// that if you "forbid" this lint that existing code may fail in the
60 /// future.
61 ///
1b1a35ee 62 /// [TR39Allowed]: https://www.unicode.org/reports/tr39/#General_Security_Profile
dfeec247
XL
63 pub UNCOMMON_CODEPOINTS,
64 Warn,
f035d41b
XL
65 "detects uncommon Unicode codepoints in identifiers",
66 crate_level_only
dfeec247
XL
67}
68
f9f354fc 69declare_lint! {
1b1a35ee
XL
70 /// The `confusable_idents` lint detects visually confusable pairs between
71 /// identifiers.
72 ///
73 /// ### Example
74 ///
75 /// ```rust
1b1a35ee
XL
76 /// // Latin Capital Letter E With Caron
77 /// pub const Ě: i32 = 1;
78 /// // Latin Capital Letter E With Breve
79 /// pub const Ĕ: i32 = 2;
80 /// ```
81 ///
82 /// {{produces}}
83 ///
84 /// ### Explanation
85 ///
cdc7bbd5
XL
86 /// This lint warns when different identifiers may appear visually similar,
87 /// which can cause confusion.
1b1a35ee
XL
88 ///
89 /// The confusable detection algorithm is based on [Unicode® Technical
90 /// Standard #39 Unicode Security Mechanisms Section 4 Confusable
91 /// Detection][TR39Confusable]. For every distinct identifier X execute
92 /// the function `skeleton(X)`. If there exist two distinct identifiers X
93 /// and Y in the same crate where `skeleton(X) = skeleton(Y)` report it.
94 /// The compiler uses the same mechanism to check if an identifier is too
95 /// similar to a keyword.
96 ///
97 /// Note that the set of confusable characters may change over time.
98 /// Beware that if you "forbid" this lint that existing code may fail in
99 /// the future.
100 ///
1b1a35ee 101 /// [TR39Confusable]: https://www.unicode.org/reports/tr39/#Confusable_Detection
f9f354fc 102 pub CONFUSABLE_IDENTS,
f035d41b
XL
103 Warn,
104 "detects visually confusable pairs between identifiers",
105 crate_level_only
f9f354fc
XL
106}
107
f035d41b 108declare_lint! {
1b1a35ee
XL
109 /// The `mixed_script_confusables` lint detects visually confusable
110 /// characters in identifiers between different [scripts].
111 ///
112 /// [scripts]: https://en.wikipedia.org/wiki/Script_(Unicode)
113 ///
114 /// ### Example
115 ///
116 /// ```rust
1b1a35ee
XL
117 /// // The Japanese katakana character エ can be confused with the Han character 工.
118 /// const エ: &'static str = "アイウ";
119 /// ```
120 ///
121 /// {{produces}}
122 ///
123 /// ### Explanation
124 ///
cdc7bbd5
XL
125 /// This lint warns when characters between different scripts may appear
126 /// visually similar, which can cause confusion.
1b1a35ee
XL
127 ///
128 /// If the crate contains other identifiers in the same script that have
129 /// non-confusable characters, then this lint will *not* be issued. For
130 /// example, if the example given above has another identifier with
131 /// katakana characters (such as `let カタカナ = 123;`), then this indicates
132 /// that you are intentionally using katakana, and it will not warn about
133 /// it.
134 ///
135 /// Note that the set of confusable characters may change over time.
136 /// Beware that if you "forbid" this lint that existing code may fail in
137 /// the future.
f035d41b
XL
138 pub MIXED_SCRIPT_CONFUSABLES,
139 Warn,
140 "detects Unicode scripts whose mixed script confusables codepoints are solely used",
141 crate_level_only
f9f354fc
XL
142}
143
f035d41b 144declare_lint_pass!(NonAsciiIdents => [NON_ASCII_IDENTS, UNCOMMON_CODEPOINTS, CONFUSABLE_IDENTS, MIXED_SCRIPT_CONFUSABLES]);
416331ca
XL
145
146impl EarlyLintPass for NonAsciiIdents {
f9f354fc
XL
147 fn check_crate(&mut self, cx: &EarlyContext<'_>, _: &ast::Crate) {
148 use rustc_session::lint::Level;
f035d41b
XL
149 use rustc_span::Span;
150 use std::collections::BTreeMap;
151 use unicode_security::GeneralSecurityProfile;
f035d41b
XL
152
153 let check_non_ascii_idents = cx.builder.lint_level(NON_ASCII_IDENTS).0 != Level::Allow;
154 let check_uncommon_codepoints =
155 cx.builder.lint_level(UNCOMMON_CODEPOINTS).0 != Level::Allow;
156 let check_confusable_idents = cx.builder.lint_level(CONFUSABLE_IDENTS).0 != Level::Allow;
157 let check_mixed_script_confusables =
158 cx.builder.lint_level(MIXED_SCRIPT_CONFUSABLES).0 != Level::Allow;
159
160 if !check_non_ascii_idents
161 && !check_uncommon_codepoints
162 && !check_confusable_idents
163 && !check_mixed_script_confusables
164 {
f9f354fc
XL
165 return;
166 }
f035d41b
XL
167
168 let mut has_non_ascii_idents = false;
f9f354fc 169 let symbols = cx.sess.parse_sess.symbol_gallery.symbols.lock();
3dfed10e
XL
170
171 // Sort by `Span` so that error messages make sense with respect to the
172 // order of identifier locations in the code.
173 let mut symbols: Vec<_> = symbols.iter().collect();
174 symbols.sort_by_key(|k| k.1);
175
f035d41b 176 for (symbol, &sp) in symbols.iter() {
f9f354fc 177 let symbol_str = symbol.as_str();
f035d41b
XL
178 if symbol_str.is_ascii() {
179 continue;
f9f354fc 180 }
f035d41b
XL
181 has_non_ascii_idents = true;
182 cx.struct_span_lint(NON_ASCII_IDENTS, sp, |lint| {
183 lint.build("identifier contains non-ASCII characters").emit()
184 });
185 if check_uncommon_codepoints
186 && !symbol_str.chars().all(GeneralSecurityProfile::identifier_allowed)
187 {
188 cx.struct_span_lint(UNCOMMON_CODEPOINTS, sp, |lint| {
189 lint.build("identifier contains uncommon Unicode codepoints").emit()
190 })
f9f354fc
XL
191 }
192 }
f035d41b
XL
193
194 if has_non_ascii_idents && check_confusable_idents {
3dfed10e 195 let mut skeleton_map: FxHashMap<Symbol, (Symbol, Span, bool)> =
f035d41b 196 FxHashMap::with_capacity_and_hasher(symbols.len(), Default::default());
3dfed10e
XL
197 let mut skeleton_buf = String::new();
198
199 for (&symbol, &sp) in symbols.iter() {
200 use unicode_security::confusable_detection::skeleton;
201
f9f354fc 202 let symbol_str = symbol.as_str();
f035d41b 203 let is_ascii = symbol_str.is_ascii();
3dfed10e
XL
204
205 // Get the skeleton as a `Symbol`.
206 skeleton_buf.clear();
207 skeleton_buf.extend(skeleton(&symbol_str));
208 let skeleton_sym = if *symbol_str == *skeleton_buf {
209 symbol
210 } else {
211 Symbol::intern(&skeleton_buf)
212 };
213
f035d41b 214 skeleton_map
3dfed10e
XL
215 .entry(skeleton_sym)
216 .and_modify(|(existing_symbol, existing_span, existing_is_ascii)| {
f035d41b
XL
217 if !*existing_is_ascii || !is_ascii {
218 cx.struct_span_lint(CONFUSABLE_IDENTS, sp, |lint| {
219 lint.build(&format!(
220 "identifier pair considered confusable between `{}` and `{}`",
a2a8927a 221 existing_symbol, symbol
f035d41b
XL
222 ))
223 .span_label(
224 *existing_span,
225 "this is where the previous identifier occurred",
226 )
227 .emit();
228 });
229 }
230 if *existing_is_ascii && !is_ascii {
3dfed10e 231 *existing_symbol = symbol;
f035d41b
XL
232 *existing_span = sp;
233 *existing_is_ascii = is_ascii;
234 }
235 })
3dfed10e 236 .or_insert((symbol, sp, is_ascii));
f9f354fc
XL
237 }
238 }
f035d41b
XL
239
240 if has_non_ascii_idents && check_mixed_script_confusables {
241 use unicode_security::is_potential_mixed_script_confusable_char;
242 use unicode_security::mixed_script::AugmentedScriptSet;
243
244 #[derive(Clone)]
245 enum ScriptSetUsage {
246 Suspicious(Vec<char>, Span),
247 Verified,
248 }
249
250 let mut script_states: FxHashMap<AugmentedScriptSet, ScriptSetUsage> =
251 FxHashMap::default();
252 let latin_augmented_script_set = AugmentedScriptSet::for_char('A');
253 script_states.insert(latin_augmented_script_set, ScriptSetUsage::Verified);
254
255 let mut has_suspicous = false;
256 for (symbol, &sp) in symbols.iter() {
257 let symbol_str = symbol.as_str();
258 for ch in symbol_str.chars() {
259 if ch.is_ascii() {
260 // all ascii characters are covered by exception.
261 continue;
262 }
263 if !GeneralSecurityProfile::identifier_allowed(ch) {
264 // this character is covered by `uncommon_codepoints` lint.
265 continue;
266 }
267 let augmented_script_set = AugmentedScriptSet::for_char(ch);
268 script_states
269 .entry(augmented_script_set)
270 .and_modify(|existing_state| {
271 if let ScriptSetUsage::Suspicious(ch_list, _) = existing_state {
272 if is_potential_mixed_script_confusable_char(ch) {
273 ch_list.push(ch);
274 } else {
275 *existing_state = ScriptSetUsage::Verified;
276 }
277 }
278 })
279 .or_insert_with(|| {
280 if !is_potential_mixed_script_confusable_char(ch) {
281 ScriptSetUsage::Verified
282 } else {
283 has_suspicous = true;
284 ScriptSetUsage::Suspicious(vec![ch], sp)
285 }
286 });
287 }
288 }
289
290 if has_suspicous {
291 let verified_augmented_script_sets = script_states
292 .iter()
293 .flat_map(|(k, v)| match v {
294 ScriptSetUsage::Verified => Some(*k),
295 _ => None,
296 })
297 .collect::<Vec<_>>();
298
299 // we're sorting the output here.
300 let mut lint_reports: BTreeMap<(Span, Vec<char>), AugmentedScriptSet> =
301 BTreeMap::new();
302
303 'outerloop: for (augment_script_set, usage) in script_states {
304 let (mut ch_list, sp) = match usage {
305 ScriptSetUsage::Verified => continue,
306 ScriptSetUsage::Suspicious(ch_list, sp) => (ch_list, sp),
307 };
308
309 if augment_script_set.is_all() {
310 continue;
311 }
312
313 for existing in verified_augmented_script_sets.iter() {
314 if existing.is_all() {
315 continue;
316 }
317 let mut intersect = *existing;
318 intersect.intersect_with(augment_script_set);
319 if !intersect.is_empty() && !intersect.is_all() {
320 continue 'outerloop;
321 }
322 }
323
1b1a35ee
XL
324 // We sort primitive chars here and can use unstable sort
325 ch_list.sort_unstable();
f035d41b
XL
326 ch_list.dedup();
327 lint_reports.insert((sp, ch_list), augment_script_set);
328 }
329
330 for ((sp, ch_list), script_set) in lint_reports {
331 cx.struct_span_lint(MIXED_SCRIPT_CONFUSABLES, sp, |lint| {
332 let message = format!(
c295e0f8 333 "the usage of Script Group `{}` in this crate consists solely of mixed script confusables",
f035d41b 334 script_set);
c295e0f8 335 let mut note = "the usage includes ".to_string();
f035d41b
XL
336 for (idx, ch) in ch_list.into_iter().enumerate() {
337 if idx != 0 {
338 note += ", ";
339 }
340 let char_info = format!("'{}' (U+{:04X})", ch, ch as u32);
341 note += &char_info;
342 }
c295e0f8 343 lint.build(&message).note(&note).note("please recheck to make sure their usages are indeed what you want").emit()
f9f354fc 344 });
f035d41b
XL
345 }
346 }
f9f354fc
XL
347 }
348 }
f035d41b 349}