]> git.proxmox.com Git - rustc.git/blob - vendor/idna-0.1.5/src/uts46.rs
New upstream version 1.41.1+dfsg1
[rustc.git] / vendor / idna-0.1.5 / src / uts46.rs
1 // Copyright 2013-2014 The rust-url developers.
2 //
3 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6 // option. This file may not be copied, modified, or distributed
7 // except according to those terms.
8
9 //! [*Unicode IDNA Compatibility Processing*
10 //! (Unicode Technical Standard #46)](http://www.unicode.org/reports/tr46/)
11
12 use self::Mapping::*;
13 use punycode;
14 #[allow(unused_imports, deprecated)]
15 use std::ascii::AsciiExt;
16 use std::cmp::Ordering::{Equal, Less, Greater};
17 use unicode_bidi::{BidiClass, bidi_class};
18 use unicode_normalization::UnicodeNormalization;
19 use unicode_normalization::char::is_combining_mark;
20
21 include!("uts46_mapping_table.rs");
22
23
/// Prefix that marks a label as Punycode-encoded (an "A-label").
pub static PUNYCODE_PREFIX: &'static str = "xn--";
25
26
/// Location of a replacement string inside `STRING_TABLE`.
///
/// The 16-bit start offset is split into two `u8` halves so that the struct
/// has an alignment of 1 and packs tightly into the `Mapping` enum.
#[derive(Debug)]
struct StringTableSlice {
    /// Low byte of the starting byte offset.
    byte_start_lo: u8,
    /// High byte of the starting byte offset.
    byte_start_hi: u8,
    /// Length of the slice in bytes.
    byte_len: u8,
}
35
36 fn decode_slice(slice: &StringTableSlice) -> &'static str {
37 let lo = slice.byte_start_lo as usize;
38 let hi = slice.byte_start_hi as usize;
39 let start = (hi << 8) | lo;
40 let len = slice.byte_len as usize;
41 &STRING_TABLE[start..(start + len)]
42 }
43
44 #[repr(u8)]
45 #[derive(Debug)]
46 enum Mapping {
47 Valid,
48 Ignored,
49 Mapped(StringTableSlice),
50 Deviation(StringTableSlice),
51 Disallowed,
52 DisallowedStd3Valid,
53 DisallowedStd3Mapped(StringTableSlice),
54 }
55
/// Inclusive range of code points; `find_char` binary-searches a table of these.
struct Range {
    /// First code point of the range.
    from: char,
    /// Last code point of the range (inclusive).
    to: char,
}
60
61 fn find_char(codepoint: char) -> &'static Mapping {
62 let r = TABLE.binary_search_by(|ref range| {
63 if codepoint > range.to {
64 Less
65 } else if codepoint < range.from {
66 Greater
67 } else {
68 Equal
69 }
70 });
71 r.ok().map(|i| {
72 const SINGLE_MARKER: u16 = 1 << 15;
73
74 let x = INDEX_TABLE[i];
75 let single = (x & SINGLE_MARKER) != 0;
76 let offset = !SINGLE_MARKER & x;
77
78 if single {
79 &MAPPING_TABLE[offset as usize]
80 } else {
81 &MAPPING_TABLE[(offset + (codepoint as u16 - TABLE[i].from as u16)) as usize]
82 }
83 }).unwrap()
84 }
85
86 fn map_char(codepoint: char, flags: Flags, output: &mut String, errors: &mut Vec<Error>) {
87 match *find_char(codepoint) {
88 Mapping::Valid => output.push(codepoint),
89 Mapping::Ignored => {},
90 Mapping::Mapped(ref slice) => output.push_str(decode_slice(slice)),
91 Mapping::Deviation(ref slice) => {
92 if flags.transitional_processing {
93 output.push_str(decode_slice(slice))
94 } else {
95 output.push(codepoint)
96 }
97 }
98 Mapping::Disallowed => {
99 errors.push(Error::DissallowedCharacter);
100 output.push(codepoint);
101 }
102 Mapping::DisallowedStd3Valid => {
103 if flags.use_std3_ascii_rules {
104 errors.push(Error::DissallowedByStd3AsciiRules);
105 }
106 output.push(codepoint)
107 }
108 Mapping::DisallowedStd3Mapped(ref slice) => {
109 if flags.use_std3_ascii_rules {
110 errors.push(Error::DissallowedMappedInStd3);
111 }
112 output.push_str(decode_slice(slice))
113 }
114 }
115 }
116
117 // http://tools.ietf.org/html/rfc5893#section-2
118 fn passes_bidi(label: &str, is_bidi_domain: bool) -> bool {
119 // Rule 0: Bidi Rules apply to Bidi Domain Names: a name with at least one RTL label. A label
120 // is RTL if it contains at least one character of bidi class R, AL or AN.
121 if !is_bidi_domain {
122 return true;
123 }
124
125 let mut chars = label.chars();
126 let first_char_class = match chars.next() {
127 Some(c) => bidi_class(c),
128 None => return true, // empty string
129 };
130
131 match first_char_class {
132 // LTR label
133 BidiClass::L => {
134 // Rule 5
135 loop {
136 match chars.next() {
137 Some(c) => {
138 if !matches!(bidi_class(c),
139 BidiClass::L | BidiClass::EN |
140 BidiClass::ES | BidiClass::CS |
141 BidiClass::ET | BidiClass::ON |
142 BidiClass::BN | BidiClass::NSM
143 ) {
144 return false;
145 }
146 },
147 None => { break; },
148 }
149 }
150
151 // Rule 6
152 // must end in L or EN followed by 0 or more NSM
153 let mut rev_chars = label.chars().rev();
154 let mut last_non_nsm = rev_chars.next();
155 loop {
156 match last_non_nsm {
157 Some(c) if bidi_class(c) == BidiClass::NSM => {
158 last_non_nsm = rev_chars.next();
159 continue;
160 }
161 _ => { break; },
162 }
163 }
164 match last_non_nsm {
165 Some(c) if bidi_class(c) == BidiClass::L
166 || bidi_class(c) == BidiClass::EN => {},
167 Some(_) => { return false; },
168 _ => {}
169 }
170
171 }
172
173 // RTL label
174 BidiClass::R | BidiClass::AL => {
175 let mut found_en = false;
176 let mut found_an = false;
177
178 // Rule 2
179 loop {
180 match chars.next() {
181 Some(c) => {
182 let char_class = bidi_class(c);
183
184 if char_class == BidiClass::EN {
185 found_en = true;
186 }
187 if char_class == BidiClass::AN {
188 found_an = true;
189 }
190
191 if !matches!(char_class, BidiClass::R | BidiClass::AL |
192 BidiClass::AN | BidiClass::EN |
193 BidiClass::ES | BidiClass::CS |
194 BidiClass::ET | BidiClass::ON |
195 BidiClass::BN | BidiClass::NSM) {
196 return false;
197 }
198 },
199 None => { break; },
200 }
201 }
202 // Rule 3
203 let mut rev_chars = label.chars().rev();
204 let mut last = rev_chars.next();
205 loop { // must end in L or EN followed by 0 or more NSM
206 match last {
207 Some(c) if bidi_class(c) == BidiClass::NSM => {
208 last = rev_chars.next();
209 continue;
210 }
211 _ => { break; },
212 }
213 }
214 match last {
215 Some(c) if matches!(bidi_class(c), BidiClass::R | BidiClass::AL |
216 BidiClass::EN | BidiClass::AN) => {},
217 _ => { return false; }
218 }
219
220 // Rule 4
221 if found_an && found_en {
222 return false;
223 }
224 }
225
226 // Rule 1: Should start with L or R/AL
227 _ => {
228 return false;
229 }
230 }
231
232 return true;
233 }
234
235 /// http://www.unicode.org/reports/tr46/#Validity_Criteria
236 fn validate_full(label: &str, is_bidi_domain: bool, flags: Flags, errors: &mut Vec<Error>) {
237 // V1: Must be in NFC form.
238 if label.nfc().ne(label.chars()) {
239 errors.push(Error::ValidityCriteria);
240 } else {
241 validate(label, is_bidi_domain, flags, errors);
242 }
243 }
244
245 fn validate(label: &str, is_bidi_domain: bool, flags: Flags, errors: &mut Vec<Error>) {
246 let first_char = label.chars().next();
247 if first_char == None {
248 // Empty string, pass
249 }
250
251 // V2: No U+002D HYPHEN-MINUS in both third and fourth positions.
252 //
253 // NOTE: Spec says that the label must not contain a HYPHEN-MINUS character in both the
254 // third and fourth positions. But nobody follows this criteria. See the spec issue below:
255 // https://github.com/whatwg/url/issues/53
256 //
257 // TODO: Add *CheckHyphens* flag.
258
259 // V3: neither begin nor end with a U+002D HYPHEN-MINUS
260 else if label.starts_with("-") || label.ends_with("-") {
261 errors.push(Error::ValidityCriteria);
262 }
263
264 // V4: not contain a U+002E FULL STOP
265 //
266 // Here, label can't contain '.' since the input is from .split('.')
267
268 // V5: not begin with a GC=Mark
269 else if is_combining_mark(first_char.unwrap()) {
270 errors.push(Error::ValidityCriteria);
271 }
272
273 // V6: Check against Mapping Table
274 else if label.chars().any(|c| match *find_char(c) {
275 Mapping::Valid => false,
276 Mapping::Deviation(_) => flags.transitional_processing,
277 Mapping::DisallowedStd3Valid => flags.use_std3_ascii_rules,
278 _ => true,
279 }) {
280 errors.push(Error::ValidityCriteria);
281 }
282
283 // V7: ContextJ rules
284 //
285 // TODO: Implement rules and add *CheckJoiners* flag.
286
287 // V8: Bidi rules
288 //
289 // TODO: Add *CheckBidi* flag
290 else if !passes_bidi(label, is_bidi_domain)
291 {
292 errors.push(Error::ValidityCriteria);
293 }
294 }
295
296 /// http://www.unicode.org/reports/tr46/#Processing
297 fn processing(domain: &str, flags: Flags, errors: &mut Vec<Error>) -> String {
298 let mut mapped = String::with_capacity(domain.len());
299 for c in domain.chars() {
300 map_char(c, flags, &mut mapped, errors)
301 }
302 let mut normalized = String::with_capacity(mapped.len());
303 normalized.extend(mapped.nfc());
304
305 // Find out if it's a Bidi Domain Name
306 //
307 // First, check for literal bidi chars
308 let mut is_bidi_domain = domain.chars().any(|c|
309 matches!(bidi_class(c), BidiClass::R | BidiClass::AL | BidiClass::AN)
310 );
311 if !is_bidi_domain {
312 // Then check for punycode-encoded bidi chars
313 for label in normalized.split('.') {
314 if label.starts_with(PUNYCODE_PREFIX) {
315 match punycode::decode_to_string(&label[PUNYCODE_PREFIX.len()..]) {
316 Some(decoded_label) => {
317 if decoded_label.chars().any(|c|
318 matches!(bidi_class(c), BidiClass::R | BidiClass::AL | BidiClass::AN)
319 ) {
320 is_bidi_domain = true;
321 }
322 }
323 None => {
324 is_bidi_domain = true;
325 }
326 }
327 }
328 }
329 }
330
331 let mut validated = String::new();
332 let mut first = true;
333 for label in normalized.split('.') {
334 if !first {
335 validated.push('.');
336 }
337 first = false;
338 if label.starts_with(PUNYCODE_PREFIX) {
339 match punycode::decode_to_string(&label[PUNYCODE_PREFIX.len()..]) {
340 Some(decoded_label) => {
341 let flags = Flags { transitional_processing: false, ..flags };
342 validate_full(&decoded_label, is_bidi_domain, flags, errors);
343 validated.push_str(&decoded_label)
344 }
345 None => errors.push(Error::PunycodeError)
346 }
347 } else {
348 // `normalized` is already `NFC` so we can skip that check
349 validate(label, is_bidi_domain, flags, errors);
350 validated.push_str(label)
351 }
352 }
353 validated
354 }
355
/// Options controlling UTS #46 processing.
#[derive(Clone, Copy)]
pub struct Flags {
    /// When set, code points that are disallowed only under the STD3 ASCII
    /// rules are reported as errors.
    pub use_std3_ascii_rules: bool,
    /// When set, deviation characters are replaced by their mapping instead
    /// of being kept.
    pub transitional_processing: bool,
    /// When set, `to_ascii` also checks the DNS length limits of the result.
    pub verify_dns_length: bool,
}
362
/// Individual problems detected while processing a domain name.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Error {
    /// A label could not be Punycode-decoded or Punycode-encoded.
    PunycodeError,
    /// A label failed one of the validity criteria.
    ValidityCriteria,
    /// A code point that is valid except under the STD3 ASCII rules was seen
    /// while those rules were on.
    DissallowedByStd3AsciiRules,
    /// A code point that is mapped except under the STD3 ASCII rules was seen
    /// while those rules were on.
    DissallowedMappedInStd3,
    /// A disallowed code point was seen.
    DissallowedCharacter,
    /// The domain or one of its labels exceeds the DNS length limits.
    TooLongForDns,
    /// The domain or one of its labels is empty.
    TooShortForDns,
}
373
374 /// Errors recorded during UTS #46 processing.
375 ///
376 /// This is opaque for now, only indicating the presence of at least one error.
377 /// More details may be exposed in the future.
378 #[derive(Debug)]
379 pub struct Errors(Vec<Error>);
380
381 /// http://www.unicode.org/reports/tr46/#ToASCII
382 pub fn to_ascii(domain: &str, flags: Flags) -> Result<String, Errors> {
383 let mut errors = Vec::new();
384 let mut result = String::new();
385 let mut first = true;
386 for label in processing(domain, flags, &mut errors).split('.') {
387 if !first {
388 result.push('.');
389 }
390 first = false;
391 if label.is_ascii() {
392 result.push_str(label);
393 } else {
394 match punycode::encode_str(label) {
395 Some(x) => {
396 result.push_str(PUNYCODE_PREFIX);
397 result.push_str(&x);
398 },
399 None => errors.push(Error::PunycodeError)
400 }
401 }
402 }
403
404 if flags.verify_dns_length {
405 let domain = if result.ends_with(".") { &result[..result.len()-1] } else { &*result };
406 if domain.len() < 1 || domain.split('.').any(|label| label.len() < 1) {
407 errors.push(Error::TooShortForDns)
408 }
409 if domain.len() > 253 || domain.split('.').any(|label| label.len() > 63) {
410 errors.push(Error::TooLongForDns)
411 }
412 }
413 if errors.is_empty() {
414 Ok(result)
415 } else {
416 Err(Errors(errors))
417 }
418 }
419
420 /// http://www.unicode.org/reports/tr46/#ToUnicode
421 ///
422 /// Only `use_std3_ascii_rules` is used in `flags`.
423 pub fn to_unicode(domain: &str, mut flags: Flags) -> (String, Result<(), Errors>) {
424 flags.transitional_processing = false;
425 let mut errors = Vec::new();
426 let domain = processing(domain, flags, &mut errors);
427 let errors = if errors.is_empty() {
428 Ok(())
429 } else {
430 Err(Errors(errors))
431 };
432 (domain, errors)
433 }