]>
Commit | Line | Data |
---|---|---|
dfeec247 | 1 | use crate::{EarlyContext, EarlyLintPass, LintContext}; |
3dfed10e | 2 | use rustc_ast as ast; |
f9f354fc | 3 | use rustc_data_structures::fx::FxHashMap; |
3dfed10e | 4 | use rustc_span::symbol::Symbol; |
416331ca XL |
5 | |
6 | declare_lint! { | |
1b1a35ee XL |
7 | /// The `non_ascii_idents` lint detects non-ASCII identifiers. |
8 | /// | |
9 | /// ### Example | |
10 | /// | |
11 | /// ```rust,compile_fail | |
12 | /// # #![allow(unused)] | |
1b1a35ee XL |
13 | /// #![deny(non_ascii_idents)] |
14 | /// fn main() { | |
15 | /// let föö = 1; | |
16 | /// } | |
17 | /// ``` | |
18 | /// | |
19 | /// {{produces}} | |
20 | /// | |
21 | /// ### Explanation | |
22 | /// | |
cdc7bbd5 XL |
23 | /// This lint allows projects that wish to retain the limit of only using |
24 | /// ASCII characters to switch this lint to "forbid" (for example to ease | |
25 | /// collaboration or for security reasons). | |
1b1a35ee XL |
26 | /// See [RFC 2457] for more details. |
27 | /// | |
1b1a35ee | 28 | /// [RFC 2457]: https://github.com/rust-lang/rfcs/blob/master/text/2457-non-ascii-idents.md |
416331ca XL |
29 | pub NON_ASCII_IDENTS, |
30 | Allow, | |
f035d41b XL |
31 | "detects non-ASCII identifiers", |
32 | crate_level_only | |
416331ca XL |
33 | } |
34 | ||
dfeec247 | 35 | declare_lint! { |
1b1a35ee XL |
36 | /// The `uncommon_codepoints` lint detects uncommon Unicode codepoints in |
37 | /// identifiers. | |
38 | /// | |
39 | /// ### Example | |
40 | /// | |
41 | /// ```rust | |
42 | /// # #![allow(unused)] | |
1b1a35ee XL |
43 | /// const µ: f64 = 0.000001; |
44 | /// ``` | |
45 | /// | |
46 | /// {{produces}} | |
47 | /// | |
48 | /// ### Explanation | |
49 | /// | |
cdc7bbd5 XL |
50 | /// This lint warns about using characters which are not commonly used, and may |
51 | /// cause visual confusion. | |
1b1a35ee XL |
52 | /// |
53 | /// This lint is triggered by identifiers that contain a codepoint that is | |
54 | /// not part of the set of "Allowed" codepoints as described by [Unicode® | |
55 | /// Technical Standard #39 Unicode Security Mechanisms Section 3.1 General | |
56 | /// Security Profile for Identifiers][TR39Allowed]. | |
57 | /// | |
58 | /// Note that the set of uncommon codepoints may change over time. Beware | |
59 | /// that if you "forbid" this lint that existing code may fail in the | |
60 | /// future. | |
61 | /// | |
1b1a35ee | 62 | /// [TR39Allowed]: https://www.unicode.org/reports/tr39/#General_Security_Profile |
dfeec247 XL |
63 | pub UNCOMMON_CODEPOINTS, |
64 | Warn, | |
f035d41b XL |
65 | "detects uncommon Unicode codepoints in identifiers", |
66 | crate_level_only | |
dfeec247 XL |
67 | } |
68 | ||
f9f354fc | 69 | declare_lint! { |
1b1a35ee XL |
70 | /// The `confusable_idents` lint detects visually confusable pairs between |
71 | /// identifiers. | |
72 | /// | |
73 | /// ### Example | |
74 | /// | |
75 | /// ```rust | |
1b1a35ee XL |
76 | /// // Latin Capital Letter E With Caron |
77 | /// pub const Ě: i32 = 1; | |
78 | /// // Latin Capital Letter E With Breve | |
79 | /// pub const Ĕ: i32 = 2; | |
80 | /// ``` | |
81 | /// | |
82 | /// {{produces}} | |
83 | /// | |
84 | /// ### Explanation | |
85 | /// | |
cdc7bbd5 XL |
86 | /// This lint warns when different identifiers may appear visually similar, |
87 | /// which can cause confusion. | |
1b1a35ee XL |
88 | /// |
89 | /// The confusable detection algorithm is based on [Unicode® Technical | |
90 | /// Standard #39 Unicode Security Mechanisms Section 4 Confusable | |
91 | /// Detection][TR39Confusable]. For every distinct identifier X execute | |
92 | /// the function `skeleton(X)`. If there exist two distinct identifiers X | |
93 | /// and Y in the same crate where `skeleton(X) = skeleton(Y)` report it. | |
94 | /// The compiler uses the same mechanism to check if an identifier is too | |
95 | /// similar to a keyword. | |
96 | /// | |
97 | /// Note that the set of confusable characters may change over time. | |
98 | /// Beware that if you "forbid" this lint that existing code may fail in | |
99 | /// the future. | |
100 | /// | |
1b1a35ee | 101 | /// [TR39Confusable]: https://www.unicode.org/reports/tr39/#Confusable_Detection |
f9f354fc | 102 | pub CONFUSABLE_IDENTS, |
f035d41b XL |
103 | Warn, |
104 | "detects visually confusable pairs between identifiers", | |
105 | crate_level_only | |
f9f354fc XL |
106 | } |
107 | ||
f035d41b | 108 | declare_lint! { |
1b1a35ee XL |
109 | /// The `mixed_script_confusables` lint detects visually confusable |
110 | /// characters in identifiers between different [scripts]. | |
111 | /// | |
112 | /// [scripts]: https://en.wikipedia.org/wiki/Script_(Unicode) | |
113 | /// | |
114 | /// ### Example | |
115 | /// | |
116 | /// ```rust | |
1b1a35ee XL |
117 | /// // The Japanese katakana character エ can be confused with the Han character 工. |
118 | /// const エ: &'static str = "アイウ"; | |
119 | /// ``` | |
120 | /// | |
121 | /// {{produces}} | |
122 | /// | |
123 | /// ### Explanation | |
124 | /// | |
cdc7bbd5 XL |
125 | /// This lint warns when characters between different scripts may appear |
126 | /// visually similar, which can cause confusion. | |
1b1a35ee XL |
127 | /// |
128 | /// If the crate contains other identifiers in the same script that have | |
129 | /// non-confusable characters, then this lint will *not* be issued. For | |
130 | /// example, if the example given above has another identifier with | |
131 | /// katakana characters (such as `let カタカナ = 123;`), then this indicates | |
132 | /// that you are intentionally using katakana, and it will not warn about | |
133 | /// it. | |
134 | /// | |
135 | /// Note that the set of confusable characters may change over time. | |
136 | /// Beware that if you "forbid" this lint that existing code may fail in | |
137 | /// the future. | |
f035d41b XL |
138 | pub MIXED_SCRIPT_CONFUSABLES, |
139 | Warn, | |
140 | "detects Unicode scripts whose mixed script confusables codepoints are solely used", | |
141 | crate_level_only | |
f9f354fc XL |
142 | } |
143 | ||
f035d41b | 144 | declare_lint_pass!(NonAsciiIdents => [NON_ASCII_IDENTS, UNCOMMON_CODEPOINTS, CONFUSABLE_IDENTS, MIXED_SCRIPT_CONFUSABLES]); |
416331ca XL |
145 | |
146 | impl EarlyLintPass for NonAsciiIdents { | |
f9f354fc XL |
147 | fn check_crate(&mut self, cx: &EarlyContext<'_>, _: &ast::Crate) { |
148 | use rustc_session::lint::Level; | |
f035d41b XL |
149 | use rustc_span::Span; |
150 | use std::collections::BTreeMap; | |
151 | use unicode_security::GeneralSecurityProfile; | |
f035d41b XL |
152 | |
153 | let check_non_ascii_idents = cx.builder.lint_level(NON_ASCII_IDENTS).0 != Level::Allow; | |
154 | let check_uncommon_codepoints = | |
155 | cx.builder.lint_level(UNCOMMON_CODEPOINTS).0 != Level::Allow; | |
156 | let check_confusable_idents = cx.builder.lint_level(CONFUSABLE_IDENTS).0 != Level::Allow; | |
157 | let check_mixed_script_confusables = | |
158 | cx.builder.lint_level(MIXED_SCRIPT_CONFUSABLES).0 != Level::Allow; | |
159 | ||
160 | if !check_non_ascii_idents | |
161 | && !check_uncommon_codepoints | |
162 | && !check_confusable_idents | |
163 | && !check_mixed_script_confusables | |
164 | { | |
f9f354fc XL |
165 | return; |
166 | } | |
f035d41b XL |
167 | |
168 | let mut has_non_ascii_idents = false; | |
f9f354fc | 169 | let symbols = cx.sess.parse_sess.symbol_gallery.symbols.lock(); |
3dfed10e XL |
170 | |
171 | // Sort by `Span` so that error messages make sense with respect to the | |
172 | // order of identifier locations in the code. | |
173 | let mut symbols: Vec<_> = symbols.iter().collect(); | |
174 | symbols.sort_by_key(|k| k.1); | |
175 | ||
f035d41b | 176 | for (symbol, &sp) in symbols.iter() { |
f9f354fc | 177 | let symbol_str = symbol.as_str(); |
f035d41b XL |
178 | if symbol_str.is_ascii() { |
179 | continue; | |
f9f354fc | 180 | } |
f035d41b XL |
181 | has_non_ascii_idents = true; |
182 | cx.struct_span_lint(NON_ASCII_IDENTS, sp, |lint| { | |
183 | lint.build("identifier contains non-ASCII characters").emit() | |
184 | }); | |
185 | if check_uncommon_codepoints | |
186 | && !symbol_str.chars().all(GeneralSecurityProfile::identifier_allowed) | |
187 | { | |
188 | cx.struct_span_lint(UNCOMMON_CODEPOINTS, sp, |lint| { | |
189 | lint.build("identifier contains uncommon Unicode codepoints").emit() | |
190 | }) | |
f9f354fc XL |
191 | } |
192 | } | |
f035d41b XL |
193 | |
194 | if has_non_ascii_idents && check_confusable_idents { | |
3dfed10e | 195 | let mut skeleton_map: FxHashMap<Symbol, (Symbol, Span, bool)> = |
f035d41b | 196 | FxHashMap::with_capacity_and_hasher(symbols.len(), Default::default()); |
3dfed10e XL |
197 | let mut skeleton_buf = String::new(); |
198 | ||
199 | for (&symbol, &sp) in symbols.iter() { | |
200 | use unicode_security::confusable_detection::skeleton; | |
201 | ||
f9f354fc | 202 | let symbol_str = symbol.as_str(); |
f035d41b | 203 | let is_ascii = symbol_str.is_ascii(); |
3dfed10e XL |
204 | |
205 | // Get the skeleton as a `Symbol`. | |
206 | skeleton_buf.clear(); | |
207 | skeleton_buf.extend(skeleton(&symbol_str)); | |
208 | let skeleton_sym = if *symbol_str == *skeleton_buf { | |
209 | symbol | |
210 | } else { | |
211 | Symbol::intern(&skeleton_buf) | |
212 | }; | |
213 | ||
f035d41b | 214 | skeleton_map |
3dfed10e XL |
215 | .entry(skeleton_sym) |
216 | .and_modify(|(existing_symbol, existing_span, existing_is_ascii)| { | |
f035d41b XL |
217 | if !*existing_is_ascii || !is_ascii { |
218 | cx.struct_span_lint(CONFUSABLE_IDENTS, sp, |lint| { | |
219 | lint.build(&format!( | |
220 | "identifier pair considered confusable between `{}` and `{}`", | |
a2a8927a | 221 | existing_symbol, symbol |
f035d41b XL |
222 | )) |
223 | .span_label( | |
224 | *existing_span, | |
225 | "this is where the previous identifier occurred", | |
226 | ) | |
227 | .emit(); | |
228 | }); | |
229 | } | |
230 | if *existing_is_ascii && !is_ascii { | |
3dfed10e | 231 | *existing_symbol = symbol; |
f035d41b XL |
232 | *existing_span = sp; |
233 | *existing_is_ascii = is_ascii; | |
234 | } | |
235 | }) | |
3dfed10e | 236 | .or_insert((symbol, sp, is_ascii)); |
f9f354fc XL |
237 | } |
238 | } | |
f035d41b XL |
239 | |
240 | if has_non_ascii_idents && check_mixed_script_confusables { | |
241 | use unicode_security::is_potential_mixed_script_confusable_char; | |
242 | use unicode_security::mixed_script::AugmentedScriptSet; | |
243 | ||
244 | #[derive(Clone)] | |
245 | enum ScriptSetUsage { | |
246 | Suspicious(Vec<char>, Span), | |
247 | Verified, | |
248 | } | |
249 | ||
250 | let mut script_states: FxHashMap<AugmentedScriptSet, ScriptSetUsage> = | |
251 | FxHashMap::default(); | |
252 | let latin_augmented_script_set = AugmentedScriptSet::for_char('A'); | |
253 | script_states.insert(latin_augmented_script_set, ScriptSetUsage::Verified); | |
254 | ||
255 | let mut has_suspicous = false; | |
256 | for (symbol, &sp) in symbols.iter() { | |
257 | let symbol_str = symbol.as_str(); | |
258 | for ch in symbol_str.chars() { | |
259 | if ch.is_ascii() { | |
260 | // all ascii characters are covered by exception. | |
261 | continue; | |
262 | } | |
263 | if !GeneralSecurityProfile::identifier_allowed(ch) { | |
264 | // this character is covered by `uncommon_codepoints` lint. | |
265 | continue; | |
266 | } | |
267 | let augmented_script_set = AugmentedScriptSet::for_char(ch); | |
268 | script_states | |
269 | .entry(augmented_script_set) | |
270 | .and_modify(|existing_state| { | |
271 | if let ScriptSetUsage::Suspicious(ch_list, _) = existing_state { | |
272 | if is_potential_mixed_script_confusable_char(ch) { | |
273 | ch_list.push(ch); | |
274 | } else { | |
275 | *existing_state = ScriptSetUsage::Verified; | |
276 | } | |
277 | } | |
278 | }) | |
279 | .or_insert_with(|| { | |
280 | if !is_potential_mixed_script_confusable_char(ch) { | |
281 | ScriptSetUsage::Verified | |
282 | } else { | |
283 | has_suspicous = true; | |
284 | ScriptSetUsage::Suspicious(vec![ch], sp) | |
285 | } | |
286 | }); | |
287 | } | |
288 | } | |
289 | ||
290 | if has_suspicous { | |
291 | let verified_augmented_script_sets = script_states | |
292 | .iter() | |
293 | .flat_map(|(k, v)| match v { | |
294 | ScriptSetUsage::Verified => Some(*k), | |
295 | _ => None, | |
296 | }) | |
297 | .collect::<Vec<_>>(); | |
298 | ||
299 | // we're sorting the output here. | |
300 | let mut lint_reports: BTreeMap<(Span, Vec<char>), AugmentedScriptSet> = | |
301 | BTreeMap::new(); | |
302 | ||
303 | 'outerloop: for (augment_script_set, usage) in script_states { | |
304 | let (mut ch_list, sp) = match usage { | |
305 | ScriptSetUsage::Verified => continue, | |
306 | ScriptSetUsage::Suspicious(ch_list, sp) => (ch_list, sp), | |
307 | }; | |
308 | ||
309 | if augment_script_set.is_all() { | |
310 | continue; | |
311 | } | |
312 | ||
313 | for existing in verified_augmented_script_sets.iter() { | |
314 | if existing.is_all() { | |
315 | continue; | |
316 | } | |
317 | let mut intersect = *existing; | |
318 | intersect.intersect_with(augment_script_set); | |
319 | if !intersect.is_empty() && !intersect.is_all() { | |
320 | continue 'outerloop; | |
321 | } | |
322 | } | |
323 | ||
1b1a35ee XL |
324 | // We sort primitive chars here and can use unstable sort |
325 | ch_list.sort_unstable(); | |
f035d41b XL |
326 | ch_list.dedup(); |
327 | lint_reports.insert((sp, ch_list), augment_script_set); | |
328 | } | |
329 | ||
330 | for ((sp, ch_list), script_set) in lint_reports { | |
331 | cx.struct_span_lint(MIXED_SCRIPT_CONFUSABLES, sp, |lint| { | |
332 | let message = format!( | |
c295e0f8 | 333 | "the usage of Script Group `{}` in this crate consists solely of mixed script confusables", |
f035d41b | 334 | script_set); |
c295e0f8 | 335 | let mut note = "the usage includes ".to_string(); |
f035d41b XL |
336 | for (idx, ch) in ch_list.into_iter().enumerate() { |
337 | if idx != 0 { | |
338 | note += ", "; | |
339 | } | |
340 | let char_info = format!("'{}' (U+{:04X})", ch, ch as u32); | |
341 | note += &char_info; | |
342 | } | |
c295e0f8 | 343 | lint.build(&message).note(¬e).note("please recheck to make sure their usages are indeed what you want").emit() |
f9f354fc | 344 | }); |
f035d41b XL |
345 | } |
346 | } | |
f9f354fc XL |
347 | } |
348 | } | |
f035d41b | 349 | } |