1 // Copyright 2013-2014 The rust-url developers.
3 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6 // option. This file may not be copied, modified, or distributed
7 // except according to those terms.
9 //! [*Unicode IDNA Compatibility Processing*
10 //! (Unicode Technical Standard #46)](http://www.unicode.org/reports/tr46/)
14 #[allow(unused_imports, deprecated)]
15 use std
::ascii
::AsciiExt
;
16 use std
::cmp
::Ordering
::{Equal, Less, Greater}
;
17 use unicode_bidi
::{BidiClass, bidi_class}
;
18 use unicode_normalization
::UnicodeNormalization
;
19 use unicode_normalization
::char::is_combining_mark
;
21 include
!("uts46_mapping_table.rs");
/// ASCII prefix (`xn--`) that marks a domain label as Punycode-encoded.
///
/// In this file, labels starting with this prefix are Punycode-decoded during
/// processing, and `to_ascii` writes it in front of labels it Punycode-encodes.
pub static PUNYCODE_PREFIX: &'static str = "xn--";
28 struct StringTableSlice
{
29 // Store these as separate fields so the structure will have an
30 // alignment of 1 and thus pack better into the Mapping enum, below.
36 fn decode_slice(slice
: &StringTableSlice
) -> &'
static str {
37 let lo
= slice
.byte_start_lo
as usize;
38 let hi
= slice
.byte_start_hi
as usize;
39 let start
= (hi
<< 8) | lo
;
40 let len
= slice
.byte_len
as usize;
41 &STRING_TABLE
[start
..(start
+ len
)]
49 Mapped(StringTableSlice
),
50 Deviation(StringTableSlice
),
53 DisallowedStd3Mapped(StringTableSlice
),
61 fn find_char(codepoint
: char) -> &'
static Mapping
{
62 let r
= TABLE
.binary_search_by(|ref range
| {
63 if codepoint
> range
.to
{
65 } else if codepoint
< range
.from
{
72 const SINGLE_MARKER
: u16 = 1 << 15;
74 let x
= INDEX_TABLE
[i
];
75 let single
= (x
& SINGLE_MARKER
) != 0;
76 let offset
= !SINGLE_MARKER
& x
;
79 &MAPPING_TABLE
[offset
as usize]
81 &MAPPING_TABLE
[(offset
+ (codepoint
as u16 - TABLE
[i
].from
as u16)) as usize]
86 fn map_char(codepoint
: char, flags
: Flags
, output
: &mut String
, errors
: &mut Vec
<Error
>) {
87 match *find_char(codepoint
) {
88 Mapping
::Valid
=> output
.push(codepoint
),
89 Mapping
::Ignored
=> {}
,
90 Mapping
::Mapped(ref slice
) => output
.push_str(decode_slice(slice
)),
91 Mapping
::Deviation(ref slice
) => {
92 if flags
.transitional_processing
{
93 output
.push_str(decode_slice(slice
))
95 output
.push(codepoint
)
98 Mapping
::Disallowed
=> {
99 errors
.push(Error
::DissallowedCharacter
);
100 output
.push(codepoint
);
102 Mapping
::DisallowedStd3Valid
=> {
103 if flags
.use_std3_ascii_rules
{
104 errors
.push(Error
::DissallowedByStd3AsciiRules
);
106 output
.push(codepoint
)
108 Mapping
::DisallowedStd3Mapped(ref slice
) => {
109 if flags
.use_std3_ascii_rules
{
110 errors
.push(Error
::DissallowedMappedInStd3
);
112 output
.push_str(decode_slice(slice
))
117 // http://tools.ietf.org/html/rfc5893#section-2
118 fn passes_bidi(label
: &str, is_bidi_domain
: bool
) -> bool
{
119 // Rule 0: Bidi Rules apply to Bidi Domain Names: a name with at least one RTL label. A label
120 // is RTL if it contains at least one character of bidi class R, AL or AN.
125 let mut chars
= label
.chars();
126 let first_char_class
= match chars
.next() {
127 Some(c
) => bidi_class(c
),
128 None
=> return true, // empty string
131 match first_char_class
{
138 if !matches
!(bidi_class(c
),
139 BidiClass
::L
| BidiClass
::EN
|
140 BidiClass
::ES
| BidiClass
::CS
|
141 BidiClass
::ET
| BidiClass
::ON
|
142 BidiClass
::BN
| BidiClass
::NSM
152 // must end in L or EN followed by 0 or more NSM
153 let mut rev_chars
= label
.chars().rev();
154 let mut last_non_nsm
= rev_chars
.next();
157 Some(c
) if bidi_class(c
) == BidiClass
::NSM
=> {
158 last_non_nsm
= rev_chars
.next();
165 Some(c
) if bidi_class(c
) == BidiClass
::L
166 || bidi_class(c
) == BidiClass
::EN
=> {}
,
167 Some(_
) => { return false; }
,
174 BidiClass
::R
| BidiClass
::AL
=> {
175 let mut found_en
= false;
176 let mut found_an
= false;
182 let char_class
= bidi_class(c
);
184 if char_class
== BidiClass
::EN
{
187 if char_class
== BidiClass
::AN
{
191 if !matches
!(char_class
, BidiClass
::R
| BidiClass
::AL
|
192 BidiClass
::AN
| BidiClass
::EN
|
193 BidiClass
::ES
| BidiClass
::CS
|
194 BidiClass
::ET
| BidiClass
::ON
|
195 BidiClass
::BN
| BidiClass
::NSM
) {
203 let mut rev_chars
= label
.chars().rev();
204 let mut last
= rev_chars
.next();
205 loop { // must end in L or EN followed by 0 or more NSM
207 Some(c
) if bidi_class(c
) == BidiClass
::NSM
=> {
208 last
= rev_chars
.next();
215 Some(c
) if matches
!(bidi_class(c
), BidiClass
::R
| BidiClass
::AL
|
216 BidiClass
::EN
| BidiClass
::AN
) => {}
,
217 _
=> { return false; }
221 if found_an
&& found_en
{
226 // Rule 1: Should start with L or R/AL
235 /// http://www.unicode.org/reports/tr46/#Validity_Criteria
236 fn validate_full(label
: &str, is_bidi_domain
: bool
, flags
: Flags
, errors
: &mut Vec
<Error
>) {
237 // V1: Must be in NFC form.
238 if label
.nfc().ne(label
.chars()) {
239 errors
.push(Error
::ValidityCriteria
);
241 validate(label
, is_bidi_domain
, flags
, errors
);
245 fn validate(label
: &str, is_bidi_domain
: bool
, flags
: Flags
, errors
: &mut Vec
<Error
>) {
246 let first_char
= label
.chars().next();
247 if first_char
== None
{
248 // Empty string, pass
251 // V2: No U+002D HYPHEN-MINUS in both third and fourth positions.
253 // NOTE: Spec says that the label must not contain a HYPHEN-MINUS character in both the
254 // third and fourth positions. But nobody follows this criteria. See the spec issue below:
255 // https://github.com/whatwg/url/issues/53
257 // TODO: Add *CheckHyphens* flag.
259 // V3: neither begin nor end with a U+002D HYPHEN-MINUS
260 else if label
.starts_with("-") || label
.ends_with("-") {
261 errors
.push(Error
::ValidityCriteria
);
264 // V4: not contain a U+002E FULL STOP
266 // Here, label can't contain '.' since the input is from .split('.')
268 // V5: not begin with a GC=Mark
269 else if is_combining_mark(first_char
.unwrap()) {
270 errors
.push(Error
::ValidityCriteria
);
273 // V6: Check against Mapping Table
274 else if label
.chars().any(|c
| match *find_char(c
) {
275 Mapping
::Valid
=> false,
276 Mapping
::Deviation(_
) => flags
.transitional_processing
,
277 Mapping
::DisallowedStd3Valid
=> flags
.use_std3_ascii_rules
,
280 errors
.push(Error
::ValidityCriteria
);
283 // V7: ContextJ rules
285 // TODO: Implement rules and add *CheckJoiners* flag.
289 // TODO: Add *CheckBidi* flag
290 else if !passes_bidi(label
, is_bidi_domain
)
292 errors
.push(Error
::ValidityCriteria
);
296 /// http://www.unicode.org/reports/tr46/#Processing
297 fn processing(domain
: &str, flags
: Flags
, errors
: &mut Vec
<Error
>) -> String
{
298 let mut mapped
= String
::with_capacity(domain
.len());
299 for c
in domain
.chars() {
300 map_char(c
, flags
, &mut mapped
, errors
)
302 let mut normalized
= String
::with_capacity(mapped
.len());
303 normalized
.extend(mapped
.nfc());
305 // Find out if it's a Bidi Domain Name
307 // First, check for literal bidi chars
308 let mut is_bidi_domain
= domain
.chars().any(|c
|
309 matches
!(bidi_class(c
), BidiClass
::R
| BidiClass
::AL
| BidiClass
::AN
)
312 // Then check for punycode-encoded bidi chars
313 for label
in normalized
.split('
.'
) {
314 if label
.starts_with(PUNYCODE_PREFIX
) {
315 match punycode
::decode_to_string(&label
[PUNYCODE_PREFIX
.len()..]) {
316 Some(decoded_label
) => {
317 if decoded_label
.chars().any(|c
|
318 matches
!(bidi_class(c
), BidiClass
::R
| BidiClass
::AL
| BidiClass
::AN
)
320 is_bidi_domain
= true;
324 is_bidi_domain
= true;
331 let mut validated
= String
::new();
332 let mut first
= true;
333 for label
in normalized
.split('
.'
) {
338 if label
.starts_with(PUNYCODE_PREFIX
) {
339 match punycode
::decode_to_string(&label
[PUNYCODE_PREFIX
.len()..]) {
340 Some(decoded_label
) => {
341 let flags
= Flags { transitional_processing: false, ..flags }
;
342 validate_full(&decoded_label
, is_bidi_domain
, flags
, errors
);
343 validated
.push_str(&decoded_label
)
345 None
=> errors
.push(Error
::PunycodeError
)
348 // `normalized` is already `NFC` so we can skip that check
349 validate(label
, is_bidi_domain
, flags
, errors
);
350 validated
.push_str(label
)
/// Options that control UTS #46 processing in this module.
#[derive(Copy, Clone, Debug)]
pub struct Flags {
    /// When set, characters whose mapping is "disallowed under STD3" are
    /// reported as errors during mapping and rejected during validation.
    pub use_std3_ascii_rules: bool,
    /// When set, deviation characters are replaced by their mapping instead
    /// of being kept. `to_unicode` always clears this flag, and it is also
    /// cleared when validating Punycode-decoded labels.
    pub transitional_processing: bool,
    /// When set, `to_ascii` additionally checks the result against the DNS
    /// length limits (no empty labels, at most 63 bytes per label, at most
    /// 253 bytes overall, ignoring a single trailing dot).
    pub verify_dns_length: bool,
}
363 #[derive(PartialEq, Eq, Clone, Copy, Debug)]
367 DissallowedByStd3AsciiRules
,
368 DissallowedMappedInStd3
,
369 DissallowedCharacter
,
/// Errors recorded during UTS #46 processing.
///
/// This is opaque for now, only indicating the presence of at least one error.
/// More details may be exposed in the future.
///
/// Internally it wraps the list of individual errors collected while
/// processing; that list is a private field.
pub struct Errors(Vec<Error>);
381 /// http://www.unicode.org/reports/tr46/#ToASCII
382 pub fn to_ascii(domain
: &str, flags
: Flags
) -> Result
<String
, Errors
> {
383 let mut errors
= Vec
::new();
384 let mut result
= String
::new();
385 let mut first
= true;
386 for label
in processing(domain
, flags
, &mut errors
).split('
.'
) {
391 if label
.is_ascii() {
392 result
.push_str(label
);
394 match punycode
::encode_str(label
) {
396 result
.push_str(PUNYCODE_PREFIX
);
399 None
=> errors
.push(Error
::PunycodeError
)
404 if flags
.verify_dns_length
{
405 let domain
= if result
.ends_with(".") { &result[..result.len()-1] }
else { &*result }
;
406 if domain
.len() < 1 || domain
.split('
.'
).any(|label
| label
.len() < 1) {
407 errors
.push(Error
::TooShortForDns
)
409 if domain
.len() > 253 || domain
.split('
.'
).any(|label
| label
.len() > 63) {
410 errors
.push(Error
::TooLongForDns
)
413 if errors
.is_empty() {
420 /// http://www.unicode.org/reports/tr46/#ToUnicode
422 /// Only `use_std3_ascii_rules` is used in `flags`.
423 pub fn to_unicode(domain
: &str, mut flags
: Flags
) -> (String
, Result
<(), Errors
>) {
424 flags
.transitional_processing
= false;
425 let mut errors
= Vec
::new();
426 let domain
= processing(domain
, flags
, &mut errors
);
427 let errors
= if errors
.is_empty() {