1 use crate::{EarlyContext, EarlyLintPass, LintContext}
;
3 use rustc_data_structures
::fx
::FxHashMap
;
4 use rustc_span
::symbol
::Symbol
;
7 /// The `non_ascii_idents` lint detects non-ASCII identifiers.
11 /// ```rust,compile_fail
12 /// # #![allow(unused)]
13 /// #![feature(non_ascii_idents)]
14 /// #![deny(non_ascii_idents)]
24 /// Currently on stable Rust, identifiers must contain only ASCII characters.
25 /// The [`non_ascii_idents`] nightly-only feature allows identifiers to
26 /// contain non-ASCII characters. This lint allows projects that wish to
27 /// retain the limit of only using ASCII characters to switch this lint to
28 /// "forbid" (for example to ease collaboration or for security reasons).
29 /// See [RFC 2457] for more details.
31 /// [`non_ascii_idents`]: https://doc.rust-lang.org/nightly/unstable-book/language-features/non-ascii-idents.html
32 /// [RFC 2457]: https://github.com/rust-lang/rfcs/blob/master/text/2457-non-ascii-idents.md
35 "detects non-ASCII identifiers",
40 /// The `uncommon_codepoints` lint detects uncommon Unicode codepoints in
46 /// # #![allow(unused)]
47 /// #![feature(non_ascii_idents)]
48 /// const µ: f64 = 0.000001;
55 /// With the [`non_ascii_idents`] nightly-only feature enabled,
56 /// identifiers are allowed to use non-ASCII characters. This lint warns
57 /// about using characters which are not commonly used, and may cause confusion.
60 /// This lint is triggered by identifiers that contain a codepoint that is
61 /// not part of the set of "Allowed" codepoints as described by [Unicode®
62 /// Technical Standard #39 Unicode Security Mechanisms Section 3.1 General
63 /// Security Profile for Identifiers][TR39Allowed].
65 /// Note that the set of uncommon codepoints may change over time. Beware
66 /// that if you "forbid" this lint that existing code may fail in the future.
69 /// [`non_ascii_idents`]: https://doc.rust-lang.org/nightly/unstable-book/language-features/non-ascii-idents.html
70 /// [TR39Allowed]: https://www.unicode.org/reports/tr39/#General_Security_Profile
71 pub UNCOMMON_CODEPOINTS
,
73 "detects uncommon Unicode codepoints in identifiers",
78 /// The `confusable_idents` lint detects visually confusable pairs between
84 /// #![feature(non_ascii_idents)]
86 /// // Latin Capital Letter E With Caron
87 /// pub const Ě: i32 = 1;
88 /// // Latin Capital Letter E With Breve
89 /// pub const Ĕ: i32 = 2;
96 /// With the [`non_ascii_idents`] nightly-only feature enabled,
97 /// identifiers are allowed to use non-ASCII characters. This lint warns
98 /// when different identifiers may appear visually similar, which can cause confusion.
101 /// The confusable detection algorithm is based on [Unicode® Technical
102 /// Standard #39 Unicode Security Mechanisms Section 4 Confusable
103 /// Detection][TR39Confusable]. For every distinct identifier X execute
104 /// the function `skeleton(X)`. If there exist two distinct identifiers X
105 /// and Y in the same crate where `skeleton(X) = skeleton(Y)` report it.
106 /// The compiler uses the same mechanism to check if an identifier is too
107 /// similar to a keyword.
109 /// Note that the set of confusable characters may change over time.
110 /// Beware that if you "forbid" this lint that existing code may fail in the future.
113 /// [`non_ascii_idents`]: https://doc.rust-lang.org/nightly/unstable-book/language-features/non-ascii-idents.html
114 /// [TR39Confusable]: https://www.unicode.org/reports/tr39/#Confusable_Detection
115 pub CONFUSABLE_IDENTS
,
117 "detects visually confusable pairs between identifiers",
122 /// The `mixed_script_confusables` lint detects visually confusable
123 /// characters in identifiers between different [scripts].
125 /// [scripts]: https://en.wikipedia.org/wiki/Script_(Unicode)
130 /// #![feature(non_ascii_idents)]
132 /// // The Japanese katakana character エ can be confused with the Han character 工.
133 /// const エ: &'static str = "アイウ";
140 /// With the [`non_ascii_idents`] nightly-only feature enabled,
141 /// identifiers are allowed to use non-ASCII characters. This lint warns
142 /// when characters between different scripts may appear visually similar,
143 /// which can cause confusion.
145 /// If the crate contains other identifiers in the same script that have
146 /// non-confusable characters, then this lint will *not* be issued. For
147 /// example, if the example given above has another identifier with
148 /// katakana characters (such as `let カタカナ = 123;`), then this indicates
149 /// that you are intentionally using katakana, and it will not warn about them.
152 /// Note that the set of confusable characters may change over time.
153 /// Beware that if you "forbid" this lint that existing code may fail in the future.
156 /// [`non_ascii_idents`]: https://doc.rust-lang.org/nightly/unstable-book/language-features/non-ascii-idents.html
157 pub MIXED_SCRIPT_CONFUSABLES
,
159 "detects Unicode scripts whose mixed script confusables codepoints are solely used",
163 declare_lint_pass
!(NonAsciiIdents
=> [NON_ASCII_IDENTS
, UNCOMMON_CODEPOINTS
, CONFUSABLE_IDENTS
, MIXED_SCRIPT_CONFUSABLES
]);
165 impl EarlyLintPass
for NonAsciiIdents
{
166 fn check_crate(&mut self, cx
: &EarlyContext
<'_
>, _
: &ast
::Crate
) {
167 use rustc_session
::lint
::Level
;
168 use rustc_span
::Span
;
169 use std
::collections
::BTreeMap
;
170 use unicode_security
::GeneralSecurityProfile
;
172 let check_non_ascii_idents
= cx
.builder
.lint_level(NON_ASCII_IDENTS
).0 != Level
::Allow
;
173 let check_uncommon_codepoints
=
174 cx
.builder
.lint_level(UNCOMMON_CODEPOINTS
).0 != Level
::Allow
;
175 let check_confusable_idents
= cx
.builder
.lint_level(CONFUSABLE_IDENTS
).0 != Level
::Allow
;
176 let check_mixed_script_confusables
=
177 cx
.builder
.lint_level(MIXED_SCRIPT_CONFUSABLES
).0 != Level
::Allow
;
179 if !check_non_ascii_idents
180 && !check_uncommon_codepoints
181 && !check_confusable_idents
182 && !check_mixed_script_confusables
187 let mut has_non_ascii_idents
= false;
188 let symbols
= cx
.sess
.parse_sess
.symbol_gallery
.symbols
.lock();
190 // Sort by `Span` so that error messages make sense with respect to the
191 // order of identifier locations in the code.
192 let mut symbols
: Vec
<_
> = symbols
.iter().collect();
193 symbols
.sort_by_key(|k
| k
.1);
195 for (symbol
, &sp
) in symbols
.iter() {
196 let symbol_str
= symbol
.as_str();
197 if symbol_str
.is_ascii() {
200 has_non_ascii_idents
= true;
201 cx
.struct_span_lint(NON_ASCII_IDENTS
, sp
, |lint
| {
202 lint
.build("identifier contains non-ASCII characters").emit()
204 if check_uncommon_codepoints
205 && !symbol_str
.chars().all(GeneralSecurityProfile
::identifier_allowed
)
207 cx
.struct_span_lint(UNCOMMON_CODEPOINTS
, sp
, |lint
| {
208 lint
.build("identifier contains uncommon Unicode codepoints").emit()
213 if has_non_ascii_idents
&& check_confusable_idents
{
214 let mut skeleton_map
: FxHashMap
<Symbol
, (Symbol
, Span
, bool
)> =
215 FxHashMap
::with_capacity_and_hasher(symbols
.len(), Default
::default());
216 let mut skeleton_buf
= String
::new();
218 for (&symbol
, &sp
) in symbols
.iter() {
219 use unicode_security
::confusable_detection
::skeleton
;
221 let symbol_str
= symbol
.as_str();
222 let is_ascii
= symbol_str
.is_ascii();
224 // Get the skeleton as a `Symbol`.
225 skeleton_buf
.clear();
226 skeleton_buf
.extend(skeleton(&symbol_str
));
227 let skeleton_sym
= if *symbol_str
== *skeleton_buf
{
230 Symbol
::intern(&skeleton_buf
)
235 .and_modify(|(existing_symbol
, existing_span
, existing_is_ascii
)| {
236 if !*existing_is_ascii
|| !is_ascii
{
237 cx
.struct_span_lint(CONFUSABLE_IDENTS
, sp
, |lint
| {
239 "identifier pair considered confusable between `{}` and `{}`",
240 existing_symbol
.as_str(),
245 "this is where the previous identifier occurred",
250 if *existing_is_ascii
&& !is_ascii
{
251 *existing_symbol
= symbol
;
253 *existing_is_ascii
= is_ascii
;
256 .or_insert((symbol
, sp
, is_ascii
));
260 if has_non_ascii_idents
&& check_mixed_script_confusables
{
261 use unicode_security
::is_potential_mixed_script_confusable_char
;
262 use unicode_security
::mixed_script
::AugmentedScriptSet
;
265 enum ScriptSetUsage
{
266 Suspicious(Vec
<char>, Span
),
270 let mut script_states
: FxHashMap
<AugmentedScriptSet
, ScriptSetUsage
> =
271 FxHashMap
::default();
272 let latin_augmented_script_set
= AugmentedScriptSet
::for_char('A'
);
273 script_states
.insert(latin_augmented_script_set
, ScriptSetUsage
::Verified
);
275 let mut has_suspicous
= false;
276 for (symbol
, &sp
) in symbols
.iter() {
277 let symbol_str
= symbol
.as_str();
278 for ch
in symbol_str
.chars() {
280 // all ascii characters are covered by exception.
283 if !GeneralSecurityProfile
::identifier_allowed(ch
) {
284 // this character is covered by `uncommon_codepoints` lint.
287 let augmented_script_set
= AugmentedScriptSet
::for_char(ch
);
289 .entry(augmented_script_set
)
290 .and_modify(|existing_state
| {
291 if let ScriptSetUsage
::Suspicious(ch_list
, _
) = existing_state
{
292 if is_potential_mixed_script_confusable_char(ch
) {
295 *existing_state
= ScriptSetUsage
::Verified
;
300 if !is_potential_mixed_script_confusable_char(ch
) {
301 ScriptSetUsage
::Verified
303 has_suspicous
= true;
304 ScriptSetUsage
::Suspicious(vec
![ch
], sp
)
311 let verified_augmented_script_sets
= script_states
313 .flat_map(|(k
, v
)| match v
{
314 ScriptSetUsage
::Verified
=> Some(*k
),
317 .collect
::<Vec
<_
>>();
319 // we're sorting the output here.
320 let mut lint_reports
: BTreeMap
<(Span
, Vec
<char>), AugmentedScriptSet
> =
323 'outerloop
: for (augment_script_set
, usage
) in script_states
{
324 let (mut ch_list
, sp
) = match usage
{
325 ScriptSetUsage
::Verified
=> continue,
326 ScriptSetUsage
::Suspicious(ch_list
, sp
) => (ch_list
, sp
),
329 if augment_script_set
.is_all() {
333 for existing
in verified_augmented_script_sets
.iter() {
334 if existing
.is_all() {
337 let mut intersect
= *existing
;
338 intersect
.intersect_with(augment_script_set
);
339 if !intersect
.is_empty() && !intersect
.is_all() {
344 // We sort primitive chars here and can use unstable sort
345 ch_list
.sort_unstable();
347 lint_reports
.insert((sp
, ch_list
), augment_script_set
);
350 for ((sp
, ch_list
), script_set
) in lint_reports
{
351 cx
.struct_span_lint(MIXED_SCRIPT_CONFUSABLES
, sp
, |lint
| {
352 let message
= format
!(
353 "The usage of Script Group `{}` in this crate consists solely of mixed script confusables",
355 let mut note
= "The usage includes ".to_string();
356 for (idx
, ch
) in ch_list
.into_iter().enumerate() {
360 let char_info
= format
!("'{}' (U+{:04X})", ch
, ch
as u32);
364 lint
.build(&message
).note(&note
).note("Please recheck to make sure their usages are indeed what you want.").emit()