]> git.proxmox.com Git - rustc.git/blob - src/vendor/idna/src/uts46.rs
New upstream version 1.23.0+dfsg1
[rustc.git] / src / vendor / idna / src / uts46.rs
1 // Copyright 2013-2014 The rust-url developers.
2 //
3 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6 // option. This file may not be copied, modified, or distributed
7 // except according to those terms.
8
9 //! [*Unicode IDNA Compatibility Processing*
10 //! (Unicode Technical Standard #46)](http://www.unicode.org/reports/tr46/)
11
12 use self::Mapping::*;
13 use punycode;
14 use std::ascii::AsciiExt;
15 use std::cmp::Ordering::{Equal, Less, Greater};
16 use unicode_bidi::{BidiClass, bidi_class};
17 use unicode_normalization::UnicodeNormalization;
18 use unicode_normalization::char::is_combining_mark;
19
20 include!("uts46_mapping_table.rs");
21
22
/// ASCII prefix that marks a domain label as Punycode-encoded.
pub static PUNYCODE_PREFIX: &str = "xn--";
24
25
/// Location of one replacement string inside `STRING_TABLE`.
///
/// The starting byte offset is kept as two separate `u8` halves instead of a
/// single wider integer: with only `u8` fields the struct has an alignment of
/// 1 and therefore packs better into the `Mapping` enum.
#[derive(Debug)]
struct StringTableSlice {
    /// Low 8 bits of the starting byte offset.
    byte_start_lo: u8,
    /// High 8 bits of the starting byte offset.
    byte_start_hi: u8,
    /// Length of the string, in bytes.
    byte_len: u8,
}
34
35 fn decode_slice(slice: &StringTableSlice) -> &'static str {
36 let lo = slice.byte_start_lo as usize;
37 let hi = slice.byte_start_hi as usize;
38 let start = (hi << 8) | lo;
39 let len = slice.byte_len as usize;
40 &STRING_TABLE[start..(start + len)]
41 }
42
43 #[repr(u8)]
44 #[derive(Debug)]
45 enum Mapping {
46 Valid,
47 Ignored,
48 Mapped(StringTableSlice),
49 Deviation(StringTableSlice),
50 Disallowed,
51 DisallowedStd3Valid,
52 DisallowedStd3Mapped(StringTableSlice),
53 }
54
55 struct Range {
56 from: char,
57 to: char,
58 mapping: Mapping,
59 }
60
61 fn find_char(codepoint: char) -> &'static Mapping {
62 let r = TABLE.binary_search_by(|ref range| {
63 if codepoint > range.to {
64 Less
65 } else if codepoint < range.from {
66 Greater
67 } else {
68 Equal
69 }
70 });
71 r.ok().map(|i| &TABLE[i].mapping).unwrap()
72 }
73
74 fn map_char(codepoint: char, flags: Flags, output: &mut String, errors: &mut Vec<Error>) {
75 match *find_char(codepoint) {
76 Mapping::Valid => output.push(codepoint),
77 Mapping::Ignored => {},
78 Mapping::Mapped(ref slice) => output.push_str(decode_slice(slice)),
79 Mapping::Deviation(ref slice) => {
80 if flags.transitional_processing {
81 output.push_str(decode_slice(slice))
82 } else {
83 output.push(codepoint)
84 }
85 }
86 Mapping::Disallowed => {
87 errors.push(Error::DissallowedCharacter);
88 output.push(codepoint);
89 }
90 Mapping::DisallowedStd3Valid => {
91 if flags.use_std3_ascii_rules {
92 errors.push(Error::DissallowedByStd3AsciiRules);
93 }
94 output.push(codepoint)
95 }
96 Mapping::DisallowedStd3Mapped(ref slice) => {
97 if flags.use_std3_ascii_rules {
98 errors.push(Error::DissallowedMappedInStd3);
99 }
100 output.push_str(decode_slice(slice))
101 }
102 }
103 }
104
105 // http://tools.ietf.org/html/rfc5893#section-2
106 fn passes_bidi(label: &str, is_bidi_domain: bool) -> bool {
107 // Rule 0: Bidi Rules apply to Bidi Domain Names: a name with at least one RTL label. A label
108 // is RTL if it contains at least one character of bidi class R, AL or AN.
109 if !is_bidi_domain {
110 return true;
111 }
112
113 let mut chars = label.chars();
114 let first_char_class = match chars.next() {
115 Some(c) => bidi_class(c),
116 None => return true, // empty string
117 };
118
119 match first_char_class {
120 // LTR label
121 BidiClass::L => {
122 // Rule 5
123 loop {
124 match chars.next() {
125 Some(c) => {
126 if !matches!(bidi_class(c),
127 BidiClass::L | BidiClass::EN |
128 BidiClass::ES | BidiClass::CS |
129 BidiClass::ET | BidiClass::ON |
130 BidiClass::BN | BidiClass::NSM
131 ) {
132 return false;
133 }
134 },
135 None => { break; },
136 }
137 }
138
139 // Rule 6
140 // must end in L or EN followed by 0 or more NSM
141 let mut rev_chars = label.chars().rev();
142 let mut last_non_nsm = rev_chars.next();
143 loop {
144 match last_non_nsm {
145 Some(c) if bidi_class(c) == BidiClass::NSM => {
146 last_non_nsm = rev_chars.next();
147 continue;
148 }
149 _ => { break; },
150 }
151 }
152 match last_non_nsm {
153 Some(c) if bidi_class(c) == BidiClass::L
154 || bidi_class(c) == BidiClass::EN => {},
155 Some(_) => { return false; },
156 _ => {}
157 }
158
159 }
160
161 // RTL label
162 BidiClass::R | BidiClass::AL => {
163 let mut found_en = false;
164 let mut found_an = false;
165
166 // Rule 2
167 loop {
168 match chars.next() {
169 Some(c) => {
170 let char_class = bidi_class(c);
171
172 if char_class == BidiClass::EN {
173 found_en = true;
174 }
175 if char_class == BidiClass::AN {
176 found_an = true;
177 }
178
179 if !matches!(char_class, BidiClass::R | BidiClass::AL |
180 BidiClass::AN | BidiClass::EN |
181 BidiClass::ES | BidiClass::CS |
182 BidiClass::ET | BidiClass::ON |
183 BidiClass::BN | BidiClass::NSM) {
184 return false;
185 }
186 },
187 None => { break; },
188 }
189 }
190 // Rule 3
191 let mut rev_chars = label.chars().rev();
192 let mut last = rev_chars.next();
193 loop { // must end in L or EN followed by 0 or more NSM
194 match last {
195 Some(c) if bidi_class(c) == BidiClass::NSM => {
196 last = rev_chars.next();
197 continue;
198 }
199 _ => { break; },
200 }
201 }
202 match last {
203 Some(c) if matches!(bidi_class(c), BidiClass::R | BidiClass::AL |
204 BidiClass::EN | BidiClass::AN) => {},
205 _ => { return false; }
206 }
207
208 // Rule 4
209 if found_an && found_en {
210 return false;
211 }
212 }
213
214 // Rule 1: Should start with L or R/AL
215 _ => {
216 return false;
217 }
218 }
219
220 return true;
221 }
222
223 /// http://www.unicode.org/reports/tr46/#Validity_Criteria
224 fn validate(label: &str, is_bidi_domain: bool, flags: Flags, errors: &mut Vec<Error>) {
225 let first_char = label.chars().next();
226 if first_char == None {
227 // Empty string, pass
228 }
229
230 // V1: Must be in NFC form.
231 else if label.nfc().ne(label.chars()) {
232 errors.push(Error::ValidityCriteria);
233 }
234
235 // V2: No U+002D HYPHEN-MINUS in both third and fourth positions.
236 //
237 // NOTE: Spec says that the label must not contain a HYPHEN-MINUS character in both the
238 // third and fourth positions. But nobody follows this criteria. See the spec issue below:
239 // https://github.com/whatwg/url/issues/53
240 //
241 // TODO: Add *CheckHyphens* flag.
242
243 // V3: neither begin nor end with a U+002D HYPHEN-MINUS
244 else if label.starts_with("-") || label.ends_with("-") {
245 errors.push(Error::ValidityCriteria);
246 }
247
248 // V4: not contain a U+002E FULL STOP
249 //
250 // Here, label can't contain '.' since the input is from .split('.')
251
252 // V5: not begin with a GC=Mark
253 else if is_combining_mark(first_char.unwrap()) {
254 errors.push(Error::ValidityCriteria);
255 }
256
257 // V6: Check against Mapping Table
258 else if label.chars().any(|c| match *find_char(c) {
259 Mapping::Valid => false,
260 Mapping::Deviation(_) => flags.transitional_processing,
261 Mapping::DisallowedStd3Valid => flags.use_std3_ascii_rules,
262 _ => true,
263 }) {
264 errors.push(Error::ValidityCriteria);
265 }
266
267 // V7: ContextJ rules
268 //
269 // TODO: Implement rules and add *CheckJoiners* flag.
270
271 // V8: Bidi rules
272 //
273 // TODO: Add *CheckBidi* flag
274 else if !passes_bidi(label, is_bidi_domain)
275 {
276 errors.push(Error::ValidityCriteria);
277 }
278 }
279
280 /// http://www.unicode.org/reports/tr46/#Processing
281 fn processing(domain: &str, flags: Flags, errors: &mut Vec<Error>) -> String {
282 let mut mapped = String::new();
283 for c in domain.chars() {
284 map_char(c, flags, &mut mapped, errors)
285 }
286 let normalized: String = mapped.nfc().collect();
287
288 // Find out if it's a Bidi Domain Name
289 //
290 // First, check for literal bidi chars
291 let mut is_bidi_domain = domain.chars().any(|c|
292 matches!(bidi_class(c), BidiClass::R | BidiClass::AL | BidiClass::AN)
293 );
294 if !is_bidi_domain {
295 // Then check for punycode-encoded bidi chars
296 for label in normalized.split('.') {
297 if label.starts_with(PUNYCODE_PREFIX) {
298 match punycode::decode_to_string(&label[PUNYCODE_PREFIX.len()..]) {
299 Some(decoded_label) => {
300 if decoded_label.chars().any(|c|
301 matches!(bidi_class(c), BidiClass::R | BidiClass::AL | BidiClass::AN)
302 ) {
303 is_bidi_domain = true;
304 }
305 }
306 None => {
307 is_bidi_domain = true;
308 }
309 }
310 }
311 }
312 }
313
314 let mut validated = String::new();
315 let mut first = true;
316 for label in normalized.split('.') {
317 if !first {
318 validated.push('.');
319 }
320 first = false;
321 if label.starts_with(PUNYCODE_PREFIX) {
322 match punycode::decode_to_string(&label[PUNYCODE_PREFIX.len()..]) {
323 Some(decoded_label) => {
324 let flags = Flags { transitional_processing: false, ..flags };
325 validate(&decoded_label, is_bidi_domain, flags, errors);
326 validated.push_str(&decoded_label)
327 }
328 None => errors.push(Error::PunycodeError)
329 }
330 } else {
331 validate(label, is_bidi_domain, flags, errors);
332 validated.push_str(label)
333 }
334 }
335 validated
336 }
337
/// Options controlling UTS #46 processing.
#[derive(Copy, Clone)]
pub struct Flags {
    /// When set, code points that the mapping table marks as disallowed under
    /// STD3 rules are reported as errors.
    pub use_std3_ascii_rules: bool,
    /// When set, deviation characters are replaced by their mapping instead
    /// of being kept as-is.
    pub transitional_processing: bool,
    /// When set, `to_ascii` additionally checks the length of the result and
    /// of each of its labels.
    pub verify_dns_length: bool,
}
344
/// Individual problems that can be recorded while processing a domain name.
#[derive(PartialEq, Eq, Clone, Copy, Debug)]
enum Error {
    /// A label could not be Punycode-decoded or Punycode-encoded.
    PunycodeError,
    /// A label failed one of the validity criteria.
    ValidityCriteria,
    /// A code point that is valid except under STD3 rules was seen while STD3
    /// rules were in use.
    DissallowedByStd3AsciiRules,
    /// A code point that is mapped except under STD3 rules was seen while
    /// STD3 rules were in use.
    DissallowedMappedInStd3,
    /// A disallowed code point was seen.
    DissallowedCharacter,
    /// The ASCII result, or one of its labels, exceeds the DNS length limit.
    TooLongForDns,
    /// The ASCII result, or one of its labels, is empty.
    TooShortForDns,
}
355
356 /// Errors recorded during UTS #46 processing.
357 ///
358 /// This is opaque for now, only indicating the presence of at least one error.
359 /// More details may be exposed in the future.
360 #[derive(Debug)]
361 pub struct Errors(Vec<Error>);
362
363 /// http://www.unicode.org/reports/tr46/#ToASCII
364 pub fn to_ascii(domain: &str, flags: Flags) -> Result<String, Errors> {
365 let mut errors = Vec::new();
366 let mut result = String::new();
367 let mut first = true;
368 for label in processing(domain, flags, &mut errors).split('.') {
369 if !first {
370 result.push('.');
371 }
372 first = false;
373 if label.is_ascii() {
374 result.push_str(label);
375 } else {
376 match punycode::encode_str(label) {
377 Some(x) => {
378 result.push_str(PUNYCODE_PREFIX);
379 result.push_str(&x);
380 },
381 None => errors.push(Error::PunycodeError)
382 }
383 }
384 }
385
386 if flags.verify_dns_length {
387 let domain = if result.ends_with(".") { &result[..result.len()-1] } else { &*result };
388 if domain.len() < 1 || domain.split('.').any(|label| label.len() < 1) {
389 errors.push(Error::TooShortForDns)
390 }
391 if domain.len() > 253 || domain.split('.').any(|label| label.len() > 63) {
392 errors.push(Error::TooLongForDns)
393 }
394 }
395 if errors.is_empty() {
396 Ok(result)
397 } else {
398 Err(Errors(errors))
399 }
400 }
401
402 /// http://www.unicode.org/reports/tr46/#ToUnicode
403 ///
404 /// Only `use_std3_ascii_rules` is used in `flags`.
405 pub fn to_unicode(domain: &str, mut flags: Flags) -> (String, Result<(), Errors>) {
406 flags.transitional_processing = false;
407 let mut errors = Vec::new();
408 let domain = processing(domain, flags, &mut errors);
409 let errors = if errors.is_empty() {
410 Ok(())
411 } else {
412 Err(Errors(errors))
413 };
414 (domain, errors)
415 }