1 use crate::{EarlyContext, EarlyLintPass, LintContext}
;
3 use rustc_data_structures
::fx
::FxHashMap
;
4 use rustc_span
::symbol
::Symbol
;
7 /// The `non_ascii_idents` lint detects non-ASCII identifiers.
11 /// ```rust,compile_fail
12 /// # #![allow(unused)]
13 /// #![deny(non_ascii_idents)]
23 /// This lint allows projects that wish to retain the limit of only using
24 /// ASCII characters to switch this lint to "forbid" (for example to ease
25 /// collaboration or for security reasons).
26 /// See [RFC 2457] for more details.
28 /// [RFC 2457]: https://github.com/rust-lang/rfcs/blob/master/text/2457-non-ascii-idents.md
31 "detects non-ASCII identifiers",
36 /// The `uncommon_codepoints` lint detects uncommon Unicode codepoints in
42 /// # #![allow(unused)]
43 /// const µ: f64 = 0.000001;
50 /// This lint warns about using characters which are not commonly used, and may
51 /// cause visual confusion.
53 /// This lint is triggered by identifiers that contain a codepoint that is
54 /// not part of the set of "Allowed" codepoints as described by [Unicode®
55 /// Technical Standard #39 Unicode Security Mechanisms Section 3.1 General
56 /// Security Profile for Identifiers][TR39Allowed].
58 /// Note that the set of uncommon codepoints may change over time. Beware
59 /// that if you "forbid" this lint, existing code may fail in the
62 /// [TR39Allowed]: https://www.unicode.org/reports/tr39/#General_Security_Profile
63 pub UNCOMMON_CODEPOINTS
,
65 "detects uncommon Unicode codepoints in identifiers",
70 /// The `confusable_idents` lint detects visually confusable pairs between
76 /// // Latin Capital Letter E With Caron
77 /// pub const Ě: i32 = 1;
78 /// // Latin Capital Letter E With Breve
79 /// pub const Ĕ: i32 = 2;
86 /// This lint warns when different identifiers may appear visually similar,
87 /// which can cause confusion.
89 /// The confusable detection algorithm is based on [Unicode® Technical
90 /// Standard #39 Unicode Security Mechanisms Section 4 Confusable
91 /// Detection][TR39Confusable]. For every distinct identifier X execute
92 /// the function `skeleton(X)`. If there exist two distinct identifiers X
93 /// and Y in the same crate where `skeleton(X) = skeleton(Y)` report it.
94 /// The compiler uses the same mechanism to check if an identifier is too
95 /// similar to a keyword.
97 /// Note that the set of confusable characters may change over time.
98 /// Beware that if you "forbid" this lint, existing code may fail in
101 /// [TR39Confusable]: https://www.unicode.org/reports/tr39/#Confusable_Detection
102 pub CONFUSABLE_IDENTS
,
104 "detects visually confusable pairs between identifiers",
109 /// The `mixed_script_confusables` lint detects visually confusable
110 /// characters in identifiers between different [scripts].
112 /// [scripts]: https://en.wikipedia.org/wiki/Script_(Unicode)
117 /// // The Japanese katakana character エ can be confused with the Han character 工.
118 /// const エ: &'static str = "アイウ";
125 /// This lint warns when characters between different scripts may appear
126 /// visually similar, which can cause confusion.
128 /// If the crate contains other identifiers in the same script that have
129 /// non-confusable characters, then this lint will *not* be issued. For
130 /// example, if the example given above has another identifier with
131 /// katakana characters (such as `let カタカナ = 123;`), then this indicates
132 /// that you are intentionally using katakana, and it will not warn about
135 /// Note that the set of confusable characters may change over time.
136 /// Beware that if you "forbid" this lint, existing code may fail in
138 pub MIXED_SCRIPT_CONFUSABLES
,
140 "detects Unicode scripts whose mixed script confusables codepoints are solely used",
// Declares the `NonAsciiIdents` lint pass and lists the four lints it is
// associated with: `NON_ASCII_IDENTS`, `UNCOMMON_CODEPOINTS`,
// `CONFUSABLE_IDENTS` and `MIXED_SCRIPT_CONFUSABLES`. All four are emitted
// from the `EarlyLintPass::check_crate` implementation for this pass.
144 declare_lint_pass
!(NonAsciiIdents
=> [NON_ASCII_IDENTS
, UNCOMMON_CODEPOINTS
, CONFUSABLE_IDENTS
, MIXED_SCRIPT_CONFUSABLES
]);
146 impl EarlyLintPass
for NonAsciiIdents
{
147 fn check_crate(&mut self, cx
: &EarlyContext
<'_
>, _
: &ast
::Crate
) {
148 use rustc_session
::lint
::Level
;
149 use rustc_span
::Span
;
150 use std
::collections
::BTreeMap
;
151 use unicode_security
::GeneralSecurityProfile
;
153 let check_non_ascii_idents
= cx
.builder
.lint_level(NON_ASCII_IDENTS
).0 != Level
::Allow
;
154 let check_uncommon_codepoints
=
155 cx
.builder
.lint_level(UNCOMMON_CODEPOINTS
).0 != Level
::Allow
;
156 let check_confusable_idents
= cx
.builder
.lint_level(CONFUSABLE_IDENTS
).0 != Level
::Allow
;
157 let check_mixed_script_confusables
=
158 cx
.builder
.lint_level(MIXED_SCRIPT_CONFUSABLES
).0 != Level
::Allow
;
160 if !check_non_ascii_idents
161 && !check_uncommon_codepoints
162 && !check_confusable_idents
163 && !check_mixed_script_confusables
168 let mut has_non_ascii_idents
= false;
169 let symbols
= cx
.sess().parse_sess
.symbol_gallery
.symbols
.lock();
171 // Sort by `Span` so that error messages make sense with respect to the
172 // order of identifier locations in the code.
173 let mut symbols
: Vec
<_
> = symbols
.iter().collect();
174 symbols
.sort_by_key(|k
| k
.1);
176 for (symbol
, &sp
) in symbols
.iter() {
177 let symbol_str
= symbol
.as_str();
178 if symbol_str
.is_ascii() {
181 has_non_ascii_idents
= true;
182 cx
.struct_span_lint(NON_ASCII_IDENTS
, sp
, |lint
| {
183 lint
.build("identifier contains non-ASCII characters").emit()
185 if check_uncommon_codepoints
186 && !symbol_str
.chars().all(GeneralSecurityProfile
::identifier_allowed
)
188 cx
.struct_span_lint(UNCOMMON_CODEPOINTS
, sp
, |lint
| {
189 lint
.build("identifier contains uncommon Unicode codepoints").emit()
194 if has_non_ascii_idents
&& check_confusable_idents
{
195 let mut skeleton_map
: FxHashMap
<Symbol
, (Symbol
, Span
, bool
)> =
196 FxHashMap
::with_capacity_and_hasher(symbols
.len(), Default
::default());
197 let mut skeleton_buf
= String
::new();
199 for (&symbol
, &sp
) in symbols
.iter() {
200 use unicode_security
::confusable_detection
::skeleton
;
202 let symbol_str
= symbol
.as_str();
203 let is_ascii
= symbol_str
.is_ascii();
205 // Get the skeleton as a `Symbol`.
206 skeleton_buf
.clear();
207 skeleton_buf
.extend(skeleton(&symbol_str
));
208 let skeleton_sym
= if *symbol_str
== *skeleton_buf
{
211 Symbol
::intern(&skeleton_buf
)
216 .and_modify(|(existing_symbol
, existing_span
, existing_is_ascii
)| {
217 if !*existing_is_ascii
|| !is_ascii
{
218 cx
.struct_span_lint(CONFUSABLE_IDENTS
, sp
, |lint
| {
220 "identifier pair considered confusable between `{}` and `{}`",
221 existing_symbol
, symbol
225 "this is where the previous identifier occurred",
230 if *existing_is_ascii
&& !is_ascii
{
231 *existing_symbol
= symbol
;
233 *existing_is_ascii
= is_ascii
;
236 .or_insert((symbol
, sp
, is_ascii
));
240 if has_non_ascii_idents
&& check_mixed_script_confusables
{
241 use unicode_security
::is_potential_mixed_script_confusable_char
;
242 use unicode_security
::mixed_script
::AugmentedScriptSet
;
245 enum ScriptSetUsage
{
246 Suspicious(Vec
<char>, Span
),
250 let mut script_states
: FxHashMap
<AugmentedScriptSet
, ScriptSetUsage
> =
251 FxHashMap
::default();
252 let latin_augmented_script_set
= AugmentedScriptSet
::for_char('A'
);
253 script_states
.insert(latin_augmented_script_set
, ScriptSetUsage
::Verified
);
255 let mut has_suspicous
= false;
256 for (symbol
, &sp
) in symbols
.iter() {
257 let symbol_str
= symbol
.as_str();
258 for ch
in symbol_str
.chars() {
260 // all ascii characters are covered by exception.
263 if !GeneralSecurityProfile
::identifier_allowed(ch
) {
264 // this character is covered by `uncommon_codepoints` lint.
267 let augmented_script_set
= AugmentedScriptSet
::for_char(ch
);
269 .entry(augmented_script_set
)
270 .and_modify(|existing_state
| {
271 if let ScriptSetUsage
::Suspicious(ch_list
, _
) = existing_state
{
272 if is_potential_mixed_script_confusable_char(ch
) {
275 *existing_state
= ScriptSetUsage
::Verified
;
280 if !is_potential_mixed_script_confusable_char(ch
) {
281 ScriptSetUsage
::Verified
283 has_suspicous
= true;
284 ScriptSetUsage
::Suspicious(vec
![ch
], sp
)
291 let verified_augmented_script_sets
= script_states
293 .flat_map(|(k
, v
)| match v
{
294 ScriptSetUsage
::Verified
=> Some(*k
),
297 .collect
::<Vec
<_
>>();
299 // we're sorting the output here.
300 let mut lint_reports
: BTreeMap
<(Span
, Vec
<char>), AugmentedScriptSet
> =
303 'outerloop
: for (augment_script_set
, usage
) in script_states
{
304 let (mut ch_list
, sp
) = match usage
{
305 ScriptSetUsage
::Verified
=> continue,
306 ScriptSetUsage
::Suspicious(ch_list
, sp
) => (ch_list
, sp
),
309 if augment_script_set
.is_all() {
313 for existing
in verified_augmented_script_sets
.iter() {
314 if existing
.is_all() {
317 let mut intersect
= *existing
;
318 intersect
.intersect_with(augment_script_set
);
319 if !intersect
.is_empty() && !intersect
.is_all() {
324 // We sort primitive chars here and can use unstable sort
325 ch_list
.sort_unstable();
327 lint_reports
.insert((sp
, ch_list
), augment_script_set
);
330 for ((sp
, ch_list
), script_set
) in lint_reports
{
331 cx
.struct_span_lint(MIXED_SCRIPT_CONFUSABLES
, sp
, |lint
| {
332 let message
= format
!(
333 "the usage of Script Group `{}` in this crate consists solely of mixed script confusables",
335 let mut note
= "the usage includes ".to_string();
336 for (idx
, ch
) in ch_list
.into_iter().enumerate() {
340 let char_info
= format
!("'{}' (U+{:04X})", ch
, ch
as u32);
343 lint
.build(&message
).note(¬e
).note("please recheck to make sure their usages are indeed what you want").emit()