]>
Commit | Line | Data |
---|---|---|
60c5eb7d XL |
1 | // Copyright 2013-2014 The rust-url developers. |
2 | // | |
3 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or | |
4 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license | |
5 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your | |
6 | // option. This file may not be copied, modified, or distributed | |
7 | // except according to those terms. | |
8 | ||
9 | //! [*Unicode IDNA Compatibility Processing* | |
10 | //! (Unicode Technical Standard #46)](http://www.unicode.org/reports/tr46/) | |
11 | ||
12 | use self::Mapping::*; | |
13 | use punycode; | |
14 | #[allow(unused_imports, deprecated)] | |
15 | use std::ascii::AsciiExt; | |
16 | use std::cmp::Ordering::{Equal, Less, Greater}; | |
17 | use unicode_bidi::{BidiClass, bidi_class}; | |
18 | use unicode_normalization::UnicodeNormalization; | |
19 | use unicode_normalization::char::is_combining_mark; | |
20 | ||
21 | include!("uts46_mapping_table.rs"); | |
22 | ||
23 | ||
/// Prefix marking a domain label as Punycode-encoded.
///
/// Labels starting with this prefix are Punycode-decoded during processing,
/// and `to_ascii` prepends it to every label it Punycode-encodes.
pub static PUNYCODE_PREFIX: &'static str = "xn--";
25 | ||
26 | ||
/// Location of a replacement string inside `STRING_TABLE`.
///
/// The start offset is split into two `u8` halves instead of being stored as
/// one `u16`, so the struct has an alignment of 1 and packs more tightly into
/// the `Mapping` enum.
#[derive(Debug)]
struct StringTableSlice {
    /// Low 8 bits of the byte offset where the string starts.
    byte_start_lo: u8,
    /// High 8 bits of the byte offset where the string starts.
    byte_start_hi: u8,
    /// Length of the string in bytes.
    byte_len: u8,
}
35 | ||
36 | fn decode_slice(slice: &StringTableSlice) -> &'static str { | |
37 | let lo = slice.byte_start_lo as usize; | |
38 | let hi = slice.byte_start_hi as usize; | |
39 | let start = (hi << 8) | lo; | |
40 | let len = slice.byte_len as usize; | |
41 | &STRING_TABLE[start..(start + len)] | |
42 | } | |
43 | ||
44 | #[repr(u8)] | |
45 | #[derive(Debug)] | |
46 | enum Mapping { | |
47 | Valid, | |
48 | Ignored, | |
49 | Mapped(StringTableSlice), | |
50 | Deviation(StringTableSlice), | |
51 | Disallowed, | |
52 | DisallowedStd3Valid, | |
53 | DisallowedStd3Mapped(StringTableSlice), | |
54 | } | |
55 | ||
/// A run of consecutive code points; both ends are inclusive.
struct Range {
    /// First code point of the run.
    from: char,
    /// Last code point of the run.
    to: char,
}
60 | ||
61 | fn find_char(codepoint: char) -> &'static Mapping { | |
62 | let r = TABLE.binary_search_by(|ref range| { | |
63 | if codepoint > range.to { | |
64 | Less | |
65 | } else if codepoint < range.from { | |
66 | Greater | |
67 | } else { | |
68 | Equal | |
69 | } | |
70 | }); | |
71 | r.ok().map(|i| { | |
72 | const SINGLE_MARKER: u16 = 1 << 15; | |
73 | ||
74 | let x = INDEX_TABLE[i]; | |
75 | let single = (x & SINGLE_MARKER) != 0; | |
76 | let offset = !SINGLE_MARKER & x; | |
77 | ||
78 | if single { | |
79 | &MAPPING_TABLE[offset as usize] | |
80 | } else { | |
81 | &MAPPING_TABLE[(offset + (codepoint as u16 - TABLE[i].from as u16)) as usize] | |
82 | } | |
83 | }).unwrap() | |
84 | } | |
85 | ||
86 | fn map_char(codepoint: char, flags: Flags, output: &mut String, errors: &mut Vec<Error>) { | |
87 | match *find_char(codepoint) { | |
88 | Mapping::Valid => output.push(codepoint), | |
89 | Mapping::Ignored => {}, | |
90 | Mapping::Mapped(ref slice) => output.push_str(decode_slice(slice)), | |
91 | Mapping::Deviation(ref slice) => { | |
92 | if flags.transitional_processing { | |
93 | output.push_str(decode_slice(slice)) | |
94 | } else { | |
95 | output.push(codepoint) | |
96 | } | |
97 | } | |
98 | Mapping::Disallowed => { | |
99 | errors.push(Error::DissallowedCharacter); | |
100 | output.push(codepoint); | |
101 | } | |
102 | Mapping::DisallowedStd3Valid => { | |
103 | if flags.use_std3_ascii_rules { | |
104 | errors.push(Error::DissallowedByStd3AsciiRules); | |
105 | } | |
106 | output.push(codepoint) | |
107 | } | |
108 | Mapping::DisallowedStd3Mapped(ref slice) => { | |
109 | if flags.use_std3_ascii_rules { | |
110 | errors.push(Error::DissallowedMappedInStd3); | |
111 | } | |
112 | output.push_str(decode_slice(slice)) | |
113 | } | |
114 | } | |
115 | } | |
116 | ||
117 | // http://tools.ietf.org/html/rfc5893#section-2 | |
118 | fn passes_bidi(label: &str, is_bidi_domain: bool) -> bool { | |
119 | // Rule 0: Bidi Rules apply to Bidi Domain Names: a name with at least one RTL label. A label | |
120 | // is RTL if it contains at least one character of bidi class R, AL or AN. | |
121 | if !is_bidi_domain { | |
122 | return true; | |
123 | } | |
124 | ||
125 | let mut chars = label.chars(); | |
126 | let first_char_class = match chars.next() { | |
127 | Some(c) => bidi_class(c), | |
128 | None => return true, // empty string | |
129 | }; | |
130 | ||
131 | match first_char_class { | |
132 | // LTR label | |
133 | BidiClass::L => { | |
134 | // Rule 5 | |
135 | loop { | |
136 | match chars.next() { | |
137 | Some(c) => { | |
138 | if !matches!(bidi_class(c), | |
139 | BidiClass::L | BidiClass::EN | | |
140 | BidiClass::ES | BidiClass::CS | | |
141 | BidiClass::ET | BidiClass::ON | | |
142 | BidiClass::BN | BidiClass::NSM | |
143 | ) { | |
144 | return false; | |
145 | } | |
146 | }, | |
147 | None => { break; }, | |
148 | } | |
149 | } | |
150 | ||
151 | // Rule 6 | |
152 | // must end in L or EN followed by 0 or more NSM | |
153 | let mut rev_chars = label.chars().rev(); | |
154 | let mut last_non_nsm = rev_chars.next(); | |
155 | loop { | |
156 | match last_non_nsm { | |
157 | Some(c) if bidi_class(c) == BidiClass::NSM => { | |
158 | last_non_nsm = rev_chars.next(); | |
159 | continue; | |
160 | } | |
161 | _ => { break; }, | |
162 | } | |
163 | } | |
164 | match last_non_nsm { | |
165 | Some(c) if bidi_class(c) == BidiClass::L | |
166 | || bidi_class(c) == BidiClass::EN => {}, | |
167 | Some(_) => { return false; }, | |
168 | _ => {} | |
169 | } | |
170 | ||
171 | } | |
172 | ||
173 | // RTL label | |
174 | BidiClass::R | BidiClass::AL => { | |
175 | let mut found_en = false; | |
176 | let mut found_an = false; | |
177 | ||
178 | // Rule 2 | |
179 | loop { | |
180 | match chars.next() { | |
181 | Some(c) => { | |
182 | let char_class = bidi_class(c); | |
183 | ||
184 | if char_class == BidiClass::EN { | |
185 | found_en = true; | |
186 | } | |
187 | if char_class == BidiClass::AN { | |
188 | found_an = true; | |
189 | } | |
190 | ||
191 | if !matches!(char_class, BidiClass::R | BidiClass::AL | | |
192 | BidiClass::AN | BidiClass::EN | | |
193 | BidiClass::ES | BidiClass::CS | | |
194 | BidiClass::ET | BidiClass::ON | | |
195 | BidiClass::BN | BidiClass::NSM) { | |
196 | return false; | |
197 | } | |
198 | }, | |
199 | None => { break; }, | |
200 | } | |
201 | } | |
202 | // Rule 3 | |
203 | let mut rev_chars = label.chars().rev(); | |
204 | let mut last = rev_chars.next(); | |
205 | loop { // must end in L or EN followed by 0 or more NSM | |
206 | match last { | |
207 | Some(c) if bidi_class(c) == BidiClass::NSM => { | |
208 | last = rev_chars.next(); | |
209 | continue; | |
210 | } | |
211 | _ => { break; }, | |
212 | } | |
213 | } | |
214 | match last { | |
215 | Some(c) if matches!(bidi_class(c), BidiClass::R | BidiClass::AL | | |
216 | BidiClass::EN | BidiClass::AN) => {}, | |
217 | _ => { return false; } | |
218 | } | |
219 | ||
220 | // Rule 4 | |
221 | if found_an && found_en { | |
222 | return false; | |
223 | } | |
224 | } | |
225 | ||
226 | // Rule 1: Should start with L or R/AL | |
227 | _ => { | |
228 | return false; | |
229 | } | |
230 | } | |
231 | ||
232 | return true; | |
233 | } | |
234 | ||
235 | /// http://www.unicode.org/reports/tr46/#Validity_Criteria | |
236 | fn validate_full(label: &str, is_bidi_domain: bool, flags: Flags, errors: &mut Vec<Error>) { | |
237 | // V1: Must be in NFC form. | |
238 | if label.nfc().ne(label.chars()) { | |
239 | errors.push(Error::ValidityCriteria); | |
240 | } else { | |
241 | validate(label, is_bidi_domain, flags, errors); | |
242 | } | |
243 | } | |
244 | ||
245 | fn validate(label: &str, is_bidi_domain: bool, flags: Flags, errors: &mut Vec<Error>) { | |
246 | let first_char = label.chars().next(); | |
247 | if first_char == None { | |
248 | // Empty string, pass | |
249 | } | |
250 | ||
251 | // V2: No U+002D HYPHEN-MINUS in both third and fourth positions. | |
252 | // | |
253 | // NOTE: Spec says that the label must not contain a HYPHEN-MINUS character in both the | |
254 | // third and fourth positions. But nobody follows this criteria. See the spec issue below: | |
255 | // https://github.com/whatwg/url/issues/53 | |
256 | // | |
257 | // TODO: Add *CheckHyphens* flag. | |
258 | ||
259 | // V3: neither begin nor end with a U+002D HYPHEN-MINUS | |
260 | else if label.starts_with("-") || label.ends_with("-") { | |
261 | errors.push(Error::ValidityCriteria); | |
262 | } | |
263 | ||
264 | // V4: not contain a U+002E FULL STOP | |
265 | // | |
266 | // Here, label can't contain '.' since the input is from .split('.') | |
267 | ||
268 | // V5: not begin with a GC=Mark | |
269 | else if is_combining_mark(first_char.unwrap()) { | |
270 | errors.push(Error::ValidityCriteria); | |
271 | } | |
272 | ||
273 | // V6: Check against Mapping Table | |
274 | else if label.chars().any(|c| match *find_char(c) { | |
275 | Mapping::Valid => false, | |
276 | Mapping::Deviation(_) => flags.transitional_processing, | |
277 | Mapping::DisallowedStd3Valid => flags.use_std3_ascii_rules, | |
278 | _ => true, | |
279 | }) { | |
280 | errors.push(Error::ValidityCriteria); | |
281 | } | |
282 | ||
283 | // V7: ContextJ rules | |
284 | // | |
285 | // TODO: Implement rules and add *CheckJoiners* flag. | |
286 | ||
287 | // V8: Bidi rules | |
288 | // | |
289 | // TODO: Add *CheckBidi* flag | |
290 | else if !passes_bidi(label, is_bidi_domain) | |
291 | { | |
292 | errors.push(Error::ValidityCriteria); | |
293 | } | |
294 | } | |
295 | ||
296 | /// http://www.unicode.org/reports/tr46/#Processing | |
297 | fn processing(domain: &str, flags: Flags, errors: &mut Vec<Error>) -> String { | |
298 | let mut mapped = String::with_capacity(domain.len()); | |
299 | for c in domain.chars() { | |
300 | map_char(c, flags, &mut mapped, errors) | |
301 | } | |
302 | let mut normalized = String::with_capacity(mapped.len()); | |
303 | normalized.extend(mapped.nfc()); | |
304 | ||
305 | // Find out if it's a Bidi Domain Name | |
306 | // | |
307 | // First, check for literal bidi chars | |
308 | let mut is_bidi_domain = domain.chars().any(|c| | |
309 | matches!(bidi_class(c), BidiClass::R | BidiClass::AL | BidiClass::AN) | |
310 | ); | |
311 | if !is_bidi_domain { | |
312 | // Then check for punycode-encoded bidi chars | |
313 | for label in normalized.split('.') { | |
314 | if label.starts_with(PUNYCODE_PREFIX) { | |
315 | match punycode::decode_to_string(&label[PUNYCODE_PREFIX.len()..]) { | |
316 | Some(decoded_label) => { | |
317 | if decoded_label.chars().any(|c| | |
318 | matches!(bidi_class(c), BidiClass::R | BidiClass::AL | BidiClass::AN) | |
319 | ) { | |
320 | is_bidi_domain = true; | |
321 | } | |
322 | } | |
323 | None => { | |
324 | is_bidi_domain = true; | |
325 | } | |
326 | } | |
327 | } | |
328 | } | |
329 | } | |
330 | ||
331 | let mut validated = String::new(); | |
332 | let mut first = true; | |
333 | for label in normalized.split('.') { | |
334 | if !first { | |
335 | validated.push('.'); | |
336 | } | |
337 | first = false; | |
338 | if label.starts_with(PUNYCODE_PREFIX) { | |
339 | match punycode::decode_to_string(&label[PUNYCODE_PREFIX.len()..]) { | |
340 | Some(decoded_label) => { | |
341 | let flags = Flags { transitional_processing: false, ..flags }; | |
342 | validate_full(&decoded_label, is_bidi_domain, flags, errors); | |
343 | validated.push_str(&decoded_label) | |
344 | } | |
345 | None => errors.push(Error::PunycodeError) | |
346 | } | |
347 | } else { | |
348 | // `normalized` is already `NFC` so we can skip that check | |
349 | validate(label, is_bidi_domain, flags, errors); | |
350 | validated.push_str(label) | |
351 | } | |
352 | } | |
353 | validated | |
354 | } | |
355 | ||
/// Options controlling UTS #46 processing.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub struct Flags {
    /// When set, code points that are only disallowed under the STD3 ASCII
    /// rules are reported as errors; when unset they are accepted.
    pub use_std3_ascii_rules: bool,
    /// When set, deviation characters are replaced by their mapping (and are
    /// rejected by validation); when unset they are kept as they are.
    pub transitional_processing: bool,
    /// When set, `to_ascii` additionally checks that the result is not empty,
    /// has no empty label, is at most 253 bytes long and has no label longer
    /// than 63 bytes (a single trailing dot is ignored for this check).
    pub verify_dns_length: bool,
}
362 | ||
/// Individual problems that can be recorded while processing a domain.
///
/// The "Dissallowed" spelling of some variants is kept as is because other
/// code in this file refers to these names.
#[derive(PartialEq, Eq, Clone, Copy, Debug)]
enum Error {
    /// A label could not be Punycode-decoded or Punycode-encoded.
    PunycodeError,
    /// A label failed one of the UTS #46 validity criteria.
    ValidityCriteria,
    /// A code point that is valid except under STD3 ASCII rules was found
    /// while those rules were enabled.
    DissallowedByStd3AsciiRules,
    /// A code point that is mapped, but disallowed under STD3 ASCII rules,
    /// was found while those rules were enabled.
    DissallowedMappedInStd3,
    /// A disallowed code point was found.
    DissallowedCharacter,
    /// The domain is longer than 253 bytes or a label is longer than 63 bytes.
    TooLongForDns,
    /// The domain or one of its labels is empty.
    TooShortForDns,
}
373 | ||
374 | /// Errors recorded during UTS #46 processing. | |
375 | /// | |
376 | /// This is opaque for now, only indicating the presence of at least one error. | |
377 | /// More details may be exposed in the future. | |
378 | #[derive(Debug)] | |
379 | pub struct Errors(Vec<Error>); | |
380 | ||
381 | /// http://www.unicode.org/reports/tr46/#ToASCII | |
382 | pub fn to_ascii(domain: &str, flags: Flags) -> Result<String, Errors> { | |
383 | let mut errors = Vec::new(); | |
384 | let mut result = String::new(); | |
385 | let mut first = true; | |
386 | for label in processing(domain, flags, &mut errors).split('.') { | |
387 | if !first { | |
388 | result.push('.'); | |
389 | } | |
390 | first = false; | |
391 | if label.is_ascii() { | |
392 | result.push_str(label); | |
393 | } else { | |
394 | match punycode::encode_str(label) { | |
395 | Some(x) => { | |
396 | result.push_str(PUNYCODE_PREFIX); | |
397 | result.push_str(&x); | |
398 | }, | |
399 | None => errors.push(Error::PunycodeError) | |
400 | } | |
401 | } | |
402 | } | |
403 | ||
404 | if flags.verify_dns_length { | |
405 | let domain = if result.ends_with(".") { &result[..result.len()-1] } else { &*result }; | |
406 | if domain.len() < 1 || domain.split('.').any(|label| label.len() < 1) { | |
407 | errors.push(Error::TooShortForDns) | |
408 | } | |
409 | if domain.len() > 253 || domain.split('.').any(|label| label.len() > 63) { | |
410 | errors.push(Error::TooLongForDns) | |
411 | } | |
412 | } | |
413 | if errors.is_empty() { | |
414 | Ok(result) | |
415 | } else { | |
416 | Err(Errors(errors)) | |
417 | } | |
418 | } | |
419 | ||
420 | /// http://www.unicode.org/reports/tr46/#ToUnicode | |
421 | /// | |
422 | /// Only `use_std3_ascii_rules` is used in `flags`. | |
423 | pub fn to_unicode(domain: &str, mut flags: Flags) -> (String, Result<(), Errors>) { | |
424 | flags.transitional_processing = false; | |
425 | let mut errors = Vec::new(); | |
426 | let domain = processing(domain, flags, &mut errors); | |
427 | let errors = if errors.is_empty() { | |
428 | Ok(()) | |
429 | } else { | |
430 | Err(Errors(errors)) | |
431 | }; | |
432 | (domain, errors) | |
433 | } |