src/vendor/idna/src/punycode.rs

   1 // Copyright 2013 The rust-url developers.
   2 //
   3 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   4 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   5 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   6 // option. This file may not be copied, modified, or distributed
   7 // except according to those terms.
   8
   9 //! Punycode ([RFC 3492](http://tools.ietf.org/html/rfc3492)) implementation.
  10 //!
  11 //! Since Punycode fundamentally works on unicode code points,
  12 //! `encode` and `decode` take and return slices and vectors of `char`.
  13 //! `encode_str` and `decode_to_string` provide convenience wrappers
  14 //! that convert from and to Rust’s UTF-8 based `str` and `String` types.
  15
  16 use std::u32;
  17 use std::char;
  18 use std::ascii::AsciiExt;
  19
  20 // Bootstring parameters for Punycode
  21 static BASE: u32 = 36;
  22 static T_MIN: u32 = 1;
  23 static T_MAX: u32 = 26;
  24 static SKEW: u32 = 38;
  25 static DAMP: u32 = 700;
  26 static INITIAL_BIAS: u32 = 72;
  27 static INITIAL_N: u32 = 0x80;
  28 static DELIMITER: char = '-';
  29
  30
  31 #[inline]
  32 fn adapt(mut delta: u32, num_points: u32, first_time: bool) -> u32 {
  33     delta /= if first_time { DAMP } else { 2 };
  34     delta += delta / num_points;
  35     let mut k = 0;
  36     while delta > ((BASE - T_MIN) * T_MAX) / 2 {
  37         delta /= BASE - T_MIN;
  38         k += BASE;
  39     }
  40     k + (((BASE - T_MIN + 1) * delta) / (delta + SKEW))
  41 }
  42
  43
  44 /// Convert Punycode to an Unicode `String`.
  45 ///
  46 /// This is a convenience wrapper around `decode`.
  47 #[inline]
  48 pub fn decode_to_string(input: &str) -> Option<String> {
  49     decode(input).map(|chars| chars.into_iter().collect())
  50 }
  51
  52
  53 /// Convert Punycode to Unicode.
  54 ///
  55 /// Return None on malformed input or overflow.
  56 /// Overflow can only happen on inputs that take more than
  57 /// 63 encoded bytes, the DNS limit on domain name labels.
  58 pub fn decode(input: &str) -> Option<Vec<char>> {
  59     // Handle "basic" (ASCII) code points.
  60     // They are encoded as-is before the last delimiter, if any.
  61     let (mut output, input) = match input.rfind(DELIMITER) {
  62         None => (Vec::new(), input),
  63         Some(position) => (
  64             input[..position].chars().collect(),
  65             if position > 0 { &input[position + 1..] } else { input }
  66         )
  67     };
  68     let mut code_point = INITIAL_N;
  69     let mut bias = INITIAL_BIAS;
  70     let mut i = 0;
  71     let mut iter = input.bytes();
  72     loop {
  73         let previous_i = i;
  74         let mut weight = 1;
  75         let mut k = BASE;
  76         let mut byte = match iter.next() {
  77             None => break,
  78             Some(byte) => byte,
  79         };
  80         // Decode a generalized variable-length integer into delta,
  81         // which gets added to i.
  82         loop {
  83             let digit = match byte {
  84                 byte @ b'0' ... b'9' => byte - b'0' + 26,
  85                 byte @ b'A' ... b'Z' => byte - b'A',
  86                 byte @ b'a' ... b'z' => byte - b'a',
  87                 _ => return None
  88             } as u32;
  89             if digit > (u32::MAX - i) / weight {
  90                 return None  // Overflow
  91             }
  92             i += digit * weight;
  93             let t = if k <= bias { T_MIN }
  94                     else if k >= bias + T_MAX { T_MAX }
  95                     else { k - bias };
  96             if digit < t {
  97                 break
  98             }
  99             if weight > u32::MAX / (BASE - t) {
 100                 return None  // Overflow
 101             }
 102             weight *= BASE - t;
 103             k += BASE;
 104             byte = match iter.next() {
 105                 None => return None,  // End of input before the end of this delta
 106                 Some(byte) => byte,
 107             };
 108         }
 109         let length = output.len() as u32;
 110         bias = adapt(i - previous_i, length + 1, previous_i == 0);
 111         if i / (length + 1) > u32::MAX - code_point {
 112             return None  // Overflow
 113         }
 114         // i was supposed to wrap around from length+1 to 0,
 115         // incrementing code_point each time.
 116         code_point += i / (length + 1);
 117         i %= length + 1;
 118         let c = match char::from_u32(code_point) {
 119             Some(c) => c,
 120             None => return None
 121         };
 122         output.insert(i as usize, c);
 123         i += 1;
 124     }
 125     Some(output)
 126 }
 127
 128
 129 /// Convert an Unicode `str` to Punycode.
 130 ///
 131 /// This is a convenience wrapper around `encode`.
 132 #[inline]
 133 pub fn encode_str(input: &str) -> Option<String> {
 134     encode(&input.chars().collect::<Vec<char>>())
 135 }
 136
 137
 138 /// Convert Unicode to Punycode.
 139 ///
 140 /// Return None on overflow, which can only happen on inputs that would take more than
 141 /// 63 encoded bytes, the DNS limit on domain name labels.
 142 pub fn encode(input: &[char]) -> Option<String> {
 143     // Handle "basic" (ASCII) code points. They are encoded as-is.
 144     let output_bytes = input.iter().filter_map(|&c|
 145         if c.is_ascii() { Some(c as u8) } else { None }
 146     ).collect();
 147     let mut output = unsafe { String::from_utf8_unchecked(output_bytes) };
 148     let basic_length = output.len() as u32;
 149     if basic_length > 0 {
 150         output.push_str("-")
 151     }
 152     let mut code_point = INITIAL_N;
 153     let mut delta = 0;
 154     let mut bias = INITIAL_BIAS;
 155     let mut processed = basic_length;
 156     let input_length = input.len() as u32;
 157     while processed < input_length {
 158         // All code points < code_point have been handled already.
 159         // Find the next larger one.
 160         let min_code_point = input.iter().map(|&c| c as u32)
 161                                   .filter(|&c| c >= code_point).min().unwrap();
 162         if min_code_point - code_point > (u32::MAX - delta) / (processed + 1) {
 163             return None  // Overflow
 164         }
 165         // Increase delta to advance the decoder’s <code_point,i> state to <min_code_point,0>
 166         delta += (min_code_point - code_point) * (processed + 1);
 167         code_point = min_code_point;
 168         for &c in input {
 169             let c = c as u32;
 170             if c < code_point {
 171                 delta += 1;
 172                 if delta == 0 {
 173                     return None  // Overflow
 174                 }
 175             }
 176             if c == code_point {
 177                 // Represent delta as a generalized variable-length integer:
 178                 let mut q = delta;
 179                 let mut k = BASE;
 180                 loop {
 181                     let t = if k <= bias { T_MIN }
 182                             else if k >= bias + T_MAX { T_MAX }
 183                             else { k - bias };
 184                     if q < t {
 185                         break
 186                     }
 187                     let value = t + ((q - t) % (BASE - t));
 188                     output.push(value_to_digit(value));
 189                     q = (q - t) / (BASE - t);
 190                     k += BASE;
 191                 }
 192                 output.push(value_to_digit(q));
 193                 bias = adapt(delta, processed + 1, processed == basic_length);
 194                 delta = 0;
 195                 processed += 1;
 196             }
 197         }
 198         delta += 1;
 199         code_point += 1;
 200     }
 201     Some(output)
 202 }
 203
 204
 205 #[inline]
 206 fn value_to_digit(value: u32) -> char {
 207     match value {
 208         0 ... 25 => (value as u8 + 'a' as u8) as char,  // a..z
 209         26 ... 35 => (value as u8 - 26 + '0' as u8) as char,  // 0..9
 210         _ => panic!()
 211     }
 212 }