src/vendor/idna/src/punycode.rs

   1 // Copyright 2013 The rust-url developers.
   2 //
   3 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   4 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   5 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   6 // option. This file may not be copied, modified, or distributed
   7 // except according to those terms.
   8
   9 //! Punycode ([RFC 3492](http://tools.ietf.org/html/rfc3492)) implementation.
  10 //!
  11 //! Since Punycode fundamentally works on Unicode code points,
  12 //! `encode` and `decode` take and return slices and vectors of `char`.
  13 //! `encode_str` and `decode_to_string` provide convenience wrappers
  14 //! that convert from and to Rust’s UTF-8 based `str` and `String` types.
  15
  16 use std::u32;
  17 use std::char;
  18 #[allow(unused_imports, deprecated)]
  19 use std::ascii::AsciiExt;
  20
  21 // Bootstring parameters for Punycode
  22 static BASE: u32 = 36;
  23 static T_MIN: u32 = 1;
  24 static T_MAX: u32 = 26;
  25 static SKEW: u32 = 38;
  26 static DAMP: u32 = 700;
  27 static INITIAL_BIAS: u32 = 72;
  28 static INITIAL_N: u32 = 0x80;
  29 static DELIMITER: char = '-';
  30
  31
  32 #[inline]
  33 fn adapt(mut delta: u32, num_points: u32, first_time: bool) -> u32 {
  34     delta /= if first_time { DAMP } else { 2 };
  35     delta += delta / num_points;
  36     let mut k = 0;
  37     while delta > ((BASE - T_MIN) * T_MAX) / 2 {
  38         delta /= BASE - T_MIN;
  39         k += BASE;
  40     }
  41     k + (((BASE - T_MIN + 1) * delta) / (delta + SKEW))
  42 }
  43
  44
  45 /// Convert Punycode to an Unicode `String`.
  46 ///
  47 /// This is a convenience wrapper around `decode`.
  48 #[inline]
  49 pub fn decode_to_string(input: &str) -> Option<String> {
  50     decode(input).map(|chars| chars.into_iter().collect())
  51 }
  52
  53
  54 /// Convert Punycode to Unicode.
  55 ///
  56 /// Return None on malformed input or overflow.
  57 /// Overflow can only happen on inputs that take more than
  58 /// 63 encoded bytes, the DNS limit on domain name labels.
  59 pub fn decode(input: &str) -> Option<Vec<char>> {
  60     // Handle "basic" (ASCII) code points.
  61     // They are encoded as-is before the last delimiter, if any.
  62     let (mut output, input) = match input.rfind(DELIMITER) {
  63         None => (Vec::new(), input),
  64         Some(position) => (
  65             input[..position].chars().collect(),
  66             if position > 0 { &input[position + 1..] } else { input }
  67         )
  68     };
  69     let mut code_point = INITIAL_N;
  70     let mut bias = INITIAL_BIAS;
  71     let mut i = 0;
  72     let mut iter = input.bytes();
  73     loop {
  74         let previous_i = i;
  75         let mut weight = 1;
  76         let mut k = BASE;
  77         let mut byte = match iter.next() {
  78             None => break,
  79             Some(byte) => byte,
  80         };
  81         // Decode a generalized variable-length integer into delta,
  82         // which gets added to i.
  83         loop {
  84             let digit = match byte {
  85                 byte @ b'0' ... b'9' => byte - b'0' + 26,
  86                 byte @ b'A' ... b'Z' => byte - b'A',
  87                 byte @ b'a' ... b'z' => byte - b'a',
  88                 _ => return None
  89             } as u32;
  90             if digit > (u32::MAX - i) / weight {
  91                 return None  // Overflow
  92             }
  93             i += digit * weight;
  94             let t = if k <= bias { T_MIN }
  95                     else if k >= bias + T_MAX { T_MAX }
  96                     else { k - bias };
  97             if digit < t {
  98                 break
  99             }
 100             if weight > u32::MAX / (BASE - t) {
 101                 return None  // Overflow
 102             }
 103             weight *= BASE - t;
 104             k += BASE;
 105             byte = match iter.next() {
 106                 None => return None,  // End of input before the end of this delta
 107                 Some(byte) => byte,
 108             };
 109         }
 110         let length = output.len() as u32;
 111         bias = adapt(i - previous_i, length + 1, previous_i == 0);
 112         if i / (length + 1) > u32::MAX - code_point {
 113             return None  // Overflow
 114         }
 115         // i was supposed to wrap around from length+1 to 0,
 116         // incrementing code_point each time.
 117         code_point += i / (length + 1);
 118         i %= length + 1;
 119         let c = match char::from_u32(code_point) {
 120             Some(c) => c,
 121             None => return None
 122         };
 123         output.insert(i as usize, c);
 124         i += 1;
 125     }
 126     Some(output)
 127 }
 128
 129
 130 /// Convert an Unicode `str` to Punycode.
 131 ///
 132 /// This is a convenience wrapper around `encode`.
 133 #[inline]
 134 pub fn encode_str(input: &str) -> Option<String> {
 135     encode(&input.chars().collect::<Vec<char>>())
 136 }
 137
 138
 139 /// Convert Unicode to Punycode.
 140 ///
 141 /// Return None on overflow, which can only happen on inputs that would take more than
 142 /// 63 encoded bytes, the DNS limit on domain name labels.
 143 pub fn encode(input: &[char]) -> Option<String> {
 144     // Handle "basic" (ASCII) code points. They are encoded as-is.
 145     let output_bytes = input.iter().filter_map(|&c|
 146         if c.is_ascii() { Some(c as u8) } else { None }
 147     ).collect();
 148     let mut output = unsafe { String::from_utf8_unchecked(output_bytes) };
 149     let basic_length = output.len() as u32;
 150     if basic_length > 0 {
 151         output.push_str("-")
 152     }
 153     let mut code_point = INITIAL_N;
 154     let mut delta = 0;
 155     let mut bias = INITIAL_BIAS;
 156     let mut processed = basic_length;
 157     let input_length = input.len() as u32;
 158     while processed < input_length {
 159         // All code points < code_point have been handled already.
 160         // Find the next larger one.
 161         let min_code_point = input.iter().map(|&c| c as u32)
 162                                   .filter(|&c| c >= code_point).min().unwrap();
 163         if min_code_point - code_point > (u32::MAX - delta) / (processed + 1) {
 164             return None  // Overflow
 165         }
 166         // Increase delta to advance the decoder’s <code_point,i> state to <min_code_point,0>
 167         delta += (min_code_point - code_point) * (processed + 1);
 168         code_point = min_code_point;
 169         for &c in input {
 170             let c = c as u32;
 171             if c < code_point {
 172                 delta += 1;
 173                 if delta == 0 {
 174                     return None  // Overflow
 175                 }
 176             }
 177             if c == code_point {
 178                 // Represent delta as a generalized variable-length integer:
 179                 let mut q = delta;
 180                 let mut k = BASE;
 181                 loop {
 182                     let t = if k <= bias { T_MIN }
 183                             else if k >= bias + T_MAX { T_MAX }
 184                             else { k - bias };
 185                     if q < t {
 186                         break
 187                     }
 188                     let value = t + ((q - t) % (BASE - t));
 189                     output.push(value_to_digit(value));
 190                     q = (q - t) / (BASE - t);
 191                     k += BASE;
 192                 }
 193                 output.push(value_to_digit(q));
 194                 bias = adapt(delta, processed + 1, processed == basic_length);
 195                 delta = 0;
 196                 processed += 1;
 197             }
 198         }
 199         delta += 1;
 200         code_point += 1;
 201     }
 202     Some(output)
 203 }
 204
 205
 206 #[inline]
 207 fn value_to_digit(value: u32) -> char {
 208     match value {
 209         0 ... 25 => (value as u8 + 'a' as u8) as char,  // a..z
 210         26 ... 35 => (value as u8 - 26 + '0' as u8) as char,  // 0..9
 211         _ => panic!()
 212     }
 213 }