]>
Commit | Line | Data |
---|---|---|
9c376795 FG |
1 | use crate::lints::{ |
2 | ConfusableIdentifierPair, IdentifierNonAsciiChar, IdentifierUncommonCodepoints, | |
3 | MixedScriptConfusables, | |
4 | }; | |
dfeec247 | 5 | use crate::{EarlyContext, EarlyLintPass, LintContext}; |
3dfed10e | 6 | use rustc_ast as ast; |
f9f354fc | 7 | use rustc_data_structures::fx::FxHashMap; |
3dfed10e | 8 | use rustc_span::symbol::Symbol; |
416331ca XL |
9 | |
10 | declare_lint! { | |
1b1a35ee XL |
11 | /// The `non_ascii_idents` lint detects non-ASCII identifiers. |
12 | /// | |
13 | /// ### Example | |
14 | /// | |
15 | /// ```rust,compile_fail | |
16 | /// # #![allow(unused)] | |
1b1a35ee XL |
17 | /// #![deny(non_ascii_idents)] |
18 | /// fn main() { | |
19 | /// let föö = 1; | |
20 | /// } | |
21 | /// ``` | |
22 | /// | |
23 | /// {{produces}} | |
24 | /// | |
25 | /// ### Explanation | |
26 | /// | |
cdc7bbd5 XL |
27 | /// This lint allows projects that wish to retain the limit of only using |
28 | /// ASCII characters to switch this lint to "forbid" (for example to ease | |
29 | /// collaboration or for security reasons). | |
1b1a35ee XL |
30 | /// See [RFC 2457] for more details. |
31 | /// | |
1b1a35ee | 32 | /// [RFC 2457]: https://github.com/rust-lang/rfcs/blob/master/text/2457-non-ascii-idents.md |
416331ca XL |
33 | pub NON_ASCII_IDENTS, |
34 | Allow, | |
f035d41b XL |
35 | "detects non-ASCII identifiers", |
36 | crate_level_only | |
416331ca XL |
37 | } |
38 | ||
dfeec247 | 39 | declare_lint! { |
1b1a35ee XL |
40 | /// The `uncommon_codepoints` lint detects uncommon Unicode codepoints in |
41 | /// identifiers. | |
42 | /// | |
43 | /// ### Example | |
44 | /// | |
45 | /// ```rust | |
46 | /// # #![allow(unused)] | |
1b1a35ee XL |
47 | /// const µ: f64 = 0.000001; |
48 | /// ``` | |
49 | /// | |
50 | /// {{produces}} | |
51 | /// | |
52 | /// ### Explanation | |
53 | /// | |
cdc7bbd5 XL |
54 | /// This lint warns about using characters which are not commonly used, and may |
55 | /// cause visual confusion. | |
1b1a35ee XL |
56 | /// |
57 | /// This lint is triggered by identifiers that contain a codepoint that is | |
58 | /// not part of the set of "Allowed" codepoints as described by [Unicode® | |
59 | /// Technical Standard #39 Unicode Security Mechanisms Section 3.1 General | |
60 | /// Security Profile for Identifiers][TR39Allowed]. | |
61 | /// | |
62 | /// Note that the set of uncommon codepoints may change over time. Beware | |
63 | /// that if you "forbid" this lint that existing code may fail in the | |
64 | /// future. | |
65 | /// | |
1b1a35ee | 66 | /// [TR39Allowed]: https://www.unicode.org/reports/tr39/#General_Security_Profile |
dfeec247 XL |
67 | pub UNCOMMON_CODEPOINTS, |
68 | Warn, | |
f035d41b XL |
69 | "detects uncommon Unicode codepoints in identifiers", |
70 | crate_level_only | |
dfeec247 XL |
71 | } |
72 | ||
f9f354fc | 73 | declare_lint! { |
1b1a35ee XL |
74 | /// The `confusable_idents` lint detects visually confusable pairs between |
75 | /// identifiers. | |
76 | /// | |
77 | /// ### Example | |
78 | /// | |
79 | /// ```rust | |
1b1a35ee XL |
80 | /// // Latin Capital Letter E With Caron |
81 | /// pub const Ě: i32 = 1; | |
82 | /// // Latin Capital Letter E With Breve | |
83 | /// pub const Ĕ: i32 = 2; | |
84 | /// ``` | |
85 | /// | |
86 | /// {{produces}} | |
87 | /// | |
88 | /// ### Explanation | |
89 | /// | |
cdc7bbd5 XL |
90 | /// This lint warns when different identifiers may appear visually similar, |
91 | /// which can cause confusion. | |
1b1a35ee XL |
92 | /// |
93 | /// The confusable detection algorithm is based on [Unicode® Technical | |
94 | /// Standard #39 Unicode Security Mechanisms Section 4 Confusable | |
95 | /// Detection][TR39Confusable]. For every distinct identifier X execute | |
96 | /// the function `skeleton(X)`. If there exist two distinct identifiers X | |
97 | /// and Y in the same crate where `skeleton(X) = skeleton(Y)` report it. | |
98 | /// The compiler uses the same mechanism to check if an identifier is too | |
99 | /// similar to a keyword. | |
100 | /// | |
101 | /// Note that the set of confusable characters may change over time. | |
102 | /// Beware that if you "forbid" this lint that existing code may fail in | |
103 | /// the future. | |
104 | /// | |
1b1a35ee | 105 | /// [TR39Confusable]: https://www.unicode.org/reports/tr39/#Confusable_Detection |
f9f354fc | 106 | pub CONFUSABLE_IDENTS, |
f035d41b XL |
107 | Warn, |
108 | "detects visually confusable pairs between identifiers", | |
109 | crate_level_only | |
f9f354fc XL |
110 | } |
111 | ||
f035d41b | 112 | declare_lint! { |
1b1a35ee XL |
113 | /// The `mixed_script_confusables` lint detects visually confusable |
114 | /// characters in identifiers between different [scripts]. | |
115 | /// | |
116 | /// [scripts]: https://en.wikipedia.org/wiki/Script_(Unicode) | |
117 | /// | |
118 | /// ### Example | |
119 | /// | |
120 | /// ```rust | |
1b1a35ee XL |
121 | /// // The Japanese katakana character エ can be confused with the Han character 工. |
122 | /// const エ: &'static str = "アイウ"; | |
123 | /// ``` | |
124 | /// | |
125 | /// {{produces}} | |
126 | /// | |
127 | /// ### Explanation | |
128 | /// | |
cdc7bbd5 XL |
129 | /// This lint warns when characters between different scripts may appear |
130 | /// visually similar, which can cause confusion. | |
1b1a35ee XL |
131 | /// |
132 | /// If the crate contains other identifiers in the same script that have | |
133 | /// non-confusable characters, then this lint will *not* be issued. For | |
134 | /// example, if the example given above has another identifier with | |
135 | /// katakana characters (such as `let カタカナ = 123;`), then this indicates | |
136 | /// that you are intentionally using katakana, and it will not warn about | |
137 | /// it. | |
138 | /// | |
139 | /// Note that the set of confusable characters may change over time. | |
140 | /// Beware that if you "forbid" this lint that existing code may fail in | |
141 | /// the future. | |
f035d41b XL |
142 | pub MIXED_SCRIPT_CONFUSABLES, |
143 | Warn, | |
144 | "detects Unicode scripts whose mixed script confusables codepoints are solely used", | |
145 | crate_level_only | |
f9f354fc XL |
146 | } |
147 | ||
f035d41b | 148 | declare_lint_pass!(NonAsciiIdents => [NON_ASCII_IDENTS, UNCOMMON_CODEPOINTS, CONFUSABLE_IDENTS, MIXED_SCRIPT_CONFUSABLES]); |
416331ca XL |
149 | |
150 | impl EarlyLintPass for NonAsciiIdents { | |
f9f354fc XL |
151 | fn check_crate(&mut self, cx: &EarlyContext<'_>, _: &ast::Crate) { |
152 | use rustc_session::lint::Level; | |
f035d41b XL |
153 | use rustc_span::Span; |
154 | use std::collections::BTreeMap; | |
155 | use unicode_security::GeneralSecurityProfile; | |
f035d41b XL |
156 | |
157 | let check_non_ascii_idents = cx.builder.lint_level(NON_ASCII_IDENTS).0 != Level::Allow; | |
158 | let check_uncommon_codepoints = | |
159 | cx.builder.lint_level(UNCOMMON_CODEPOINTS).0 != Level::Allow; | |
160 | let check_confusable_idents = cx.builder.lint_level(CONFUSABLE_IDENTS).0 != Level::Allow; | |
161 | let check_mixed_script_confusables = | |
162 | cx.builder.lint_level(MIXED_SCRIPT_CONFUSABLES).0 != Level::Allow; | |
163 | ||
164 | if !check_non_ascii_idents | |
165 | && !check_uncommon_codepoints | |
166 | && !check_confusable_idents | |
167 | && !check_mixed_script_confusables | |
168 | { | |
f9f354fc XL |
169 | return; |
170 | } | |
f035d41b XL |
171 | |
172 | let mut has_non_ascii_idents = false; | |
5099ac24 | 173 | let symbols = cx.sess().parse_sess.symbol_gallery.symbols.lock(); |
3dfed10e XL |
174 | |
175 | // Sort by `Span` so that error messages make sense with respect to the | |
176 | // order of identifier locations in the code. | |
177 | let mut symbols: Vec<_> = symbols.iter().collect(); | |
178 | symbols.sort_by_key(|k| k.1); | |
179 | ||
f035d41b | 180 | for (symbol, &sp) in symbols.iter() { |
f9f354fc | 181 | let symbol_str = symbol.as_str(); |
f035d41b XL |
182 | if symbol_str.is_ascii() { |
183 | continue; | |
f9f354fc | 184 | } |
f035d41b | 185 | has_non_ascii_idents = true; |
9c376795 | 186 | cx.emit_spanned_lint(NON_ASCII_IDENTS, sp, IdentifierNonAsciiChar); |
f035d41b XL |
187 | if check_uncommon_codepoints |
188 | && !symbol_str.chars().all(GeneralSecurityProfile::identifier_allowed) | |
189 | { | |
9c376795 | 190 | cx.emit_spanned_lint(UNCOMMON_CODEPOINTS, sp, IdentifierUncommonCodepoints); |
f9f354fc XL |
191 | } |
192 | } | |
f035d41b XL |
193 | |
194 | if has_non_ascii_idents && check_confusable_idents { | |
3dfed10e | 195 | let mut skeleton_map: FxHashMap<Symbol, (Symbol, Span, bool)> = |
f035d41b | 196 | FxHashMap::with_capacity_and_hasher(symbols.len(), Default::default()); |
3dfed10e XL |
197 | let mut skeleton_buf = String::new(); |
198 | ||
199 | for (&symbol, &sp) in symbols.iter() { | |
200 | use unicode_security::confusable_detection::skeleton; | |
201 | ||
f9f354fc | 202 | let symbol_str = symbol.as_str(); |
f035d41b | 203 | let is_ascii = symbol_str.is_ascii(); |
3dfed10e XL |
204 | |
205 | // Get the skeleton as a `Symbol`. | |
206 | skeleton_buf.clear(); | |
207 | skeleton_buf.extend(skeleton(&symbol_str)); | |
208 | let skeleton_sym = if *symbol_str == *skeleton_buf { | |
209 | symbol | |
210 | } else { | |
211 | Symbol::intern(&skeleton_buf) | |
212 | }; | |
213 | ||
f035d41b | 214 | skeleton_map |
3dfed10e XL |
215 | .entry(skeleton_sym) |
216 | .and_modify(|(existing_symbol, existing_span, existing_is_ascii)| { | |
f035d41b | 217 | if !*existing_is_ascii || !is_ascii { |
9c376795 | 218 | cx.emit_spanned_lint( |
2b03887a FG |
219 | CONFUSABLE_IDENTS, |
220 | sp, | |
9c376795 FG |
221 | ConfusableIdentifierPair { |
222 | existing_sym: *existing_symbol, | |
223 | sym: symbol, | |
224 | label: *existing_span, | |
2b03887a FG |
225 | }, |
226 | ); | |
f035d41b XL |
227 | } |
228 | if *existing_is_ascii && !is_ascii { | |
3dfed10e | 229 | *existing_symbol = symbol; |
f035d41b XL |
230 | *existing_span = sp; |
231 | *existing_is_ascii = is_ascii; | |
232 | } | |
233 | }) | |
3dfed10e | 234 | .or_insert((symbol, sp, is_ascii)); |
f9f354fc XL |
235 | } |
236 | } | |
f035d41b XL |
237 | |
238 | if has_non_ascii_idents && check_mixed_script_confusables { | |
239 | use unicode_security::is_potential_mixed_script_confusable_char; | |
240 | use unicode_security::mixed_script::AugmentedScriptSet; | |
241 | ||
242 | #[derive(Clone)] | |
243 | enum ScriptSetUsage { | |
244 | Suspicious(Vec<char>, Span), | |
245 | Verified, | |
246 | } | |
247 | ||
248 | let mut script_states: FxHashMap<AugmentedScriptSet, ScriptSetUsage> = | |
249 | FxHashMap::default(); | |
250 | let latin_augmented_script_set = AugmentedScriptSet::for_char('A'); | |
251 | script_states.insert(latin_augmented_script_set, ScriptSetUsage::Verified); | |
252 | ||
253 | let mut has_suspicous = false; | |
254 | for (symbol, &sp) in symbols.iter() { | |
255 | let symbol_str = symbol.as_str(); | |
256 | for ch in symbol_str.chars() { | |
257 | if ch.is_ascii() { | |
258 | // all ascii characters are covered by exception. | |
259 | continue; | |
260 | } | |
261 | if !GeneralSecurityProfile::identifier_allowed(ch) { | |
262 | // this character is covered by `uncommon_codepoints` lint. | |
263 | continue; | |
264 | } | |
265 | let augmented_script_set = AugmentedScriptSet::for_char(ch); | |
266 | script_states | |
267 | .entry(augmented_script_set) | |
268 | .and_modify(|existing_state| { | |
269 | if let ScriptSetUsage::Suspicious(ch_list, _) = existing_state { | |
270 | if is_potential_mixed_script_confusable_char(ch) { | |
271 | ch_list.push(ch); | |
272 | } else { | |
273 | *existing_state = ScriptSetUsage::Verified; | |
274 | } | |
275 | } | |
276 | }) | |
277 | .or_insert_with(|| { | |
278 | if !is_potential_mixed_script_confusable_char(ch) { | |
279 | ScriptSetUsage::Verified | |
280 | } else { | |
281 | has_suspicous = true; | |
282 | ScriptSetUsage::Suspicious(vec![ch], sp) | |
283 | } | |
284 | }); | |
285 | } | |
286 | } | |
287 | ||
288 | if has_suspicous { | |
289 | let verified_augmented_script_sets = script_states | |
290 | .iter() | |
291 | .flat_map(|(k, v)| match v { | |
292 | ScriptSetUsage::Verified => Some(*k), | |
293 | _ => None, | |
294 | }) | |
295 | .collect::<Vec<_>>(); | |
296 | ||
297 | // we're sorting the output here. | |
298 | let mut lint_reports: BTreeMap<(Span, Vec<char>), AugmentedScriptSet> = | |
299 | BTreeMap::new(); | |
300 | ||
301 | 'outerloop: for (augment_script_set, usage) in script_states { | |
5e7ed085 | 302 | let ScriptSetUsage::Suspicious(mut ch_list, sp) = usage else { continue }; |
f035d41b XL |
303 | |
304 | if augment_script_set.is_all() { | |
305 | continue; | |
306 | } | |
307 | ||
308 | for existing in verified_augmented_script_sets.iter() { | |
309 | if existing.is_all() { | |
310 | continue; | |
311 | } | |
312 | let mut intersect = *existing; | |
313 | intersect.intersect_with(augment_script_set); | |
314 | if !intersect.is_empty() && !intersect.is_all() { | |
315 | continue 'outerloop; | |
316 | } | |
317 | } | |
318 | ||
1b1a35ee XL |
319 | // We sort primitive chars here and can use unstable sort |
320 | ch_list.sort_unstable(); | |
f035d41b XL |
321 | ch_list.dedup(); |
322 | lint_reports.insert((sp, ch_list), augment_script_set); | |
323 | } | |
324 | ||
325 | for ((sp, ch_list), script_set) in lint_reports { | |
9c376795 FG |
326 | let mut includes = String::new(); |
327 | for (idx, ch) in ch_list.into_iter().enumerate() { | |
328 | if idx != 0 { | |
329 | includes += ", "; | |
330 | } | |
331 | let char_info = format!("'{}' (U+{:04X})", ch, ch as u32); | |
332 | includes += &char_info; | |
333 | } | |
334 | cx.emit_spanned_lint( | |
2b03887a FG |
335 | MIXED_SCRIPT_CONFUSABLES, |
336 | sp, | |
9c376795 | 337 | MixedScriptConfusables { set: script_set.to_string(), includes }, |
2b03887a | 338 | ); |
f035d41b XL |
339 | } |
340 | } | |
f9f354fc XL |
341 | } |
342 | } | |
f035d41b | 343 | } |