1 // Copyright 2013-2014 The rust-url developers.
3 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6 // option. This file may not be copied, modified, or distributed
7 // except according to those terms.
9 //! [*Unicode IDNA Compatibility Processing*
10 //! (Unicode Technical Standard #46)](http://www.unicode.org/reports/tr46/)
14 use std
::ascii
::AsciiExt
;
15 use std
::cmp
::Ordering
::{Equal, Less, Greater}
;
16 use unicode_bidi
::{BidiClass, bidi_class}
;
17 use unicode_normalization
::UnicodeNormalization
;
18 use unicode_normalization
::char::is_combining_mark
;
20 include
!("uts46_mapping_table.rs");
/// Prefix that identifies a Punycode-encoded label.
///
/// Labels starting with this prefix are decoded during processing, and
/// `to_ascii` prepends it to every label it had to Punycode-encode.
pub static PUNYCODE_PREFIX: &'static str = "xn--";
27 struct StringTableSlice
{
28 // Store these as separate fields so the structure will have an
29 // alignment of 1 and thus pack better into the Mapping enum, below.
35 fn decode_slice(slice
: &StringTableSlice
) -> &'
static str {
36 let lo
= slice
.byte_start_lo
as usize;
37 let hi
= slice
.byte_start_hi
as usize;
38 let start
= (hi
<< 8) | lo
;
39 let len
= slice
.byte_len
as usize;
40 &STRING_TABLE
[start
..(start
+ len
)]
48 Mapped(StringTableSlice
),
49 Deviation(StringTableSlice
),
52 DisallowedStd3Mapped(StringTableSlice
),
61 fn find_char(codepoint
: char) -> &'
static Mapping
{
62 let r
= TABLE
.binary_search_by(|ref range
| {
63 if codepoint
> range
.to
{
65 } else if codepoint
< range
.from
{
71 r
.ok().map(|i
| &TABLE
[i
].mapping
).unwrap()
74 fn map_char(codepoint
: char, flags
: Flags
, output
: &mut String
, errors
: &mut Vec
<Error
>) {
75 match *find_char(codepoint
) {
76 Mapping
::Valid
=> output
.push(codepoint
),
77 Mapping
::Ignored
=> {}
,
78 Mapping
::Mapped(ref slice
) => output
.push_str(decode_slice(slice
)),
79 Mapping
::Deviation(ref slice
) => {
80 if flags
.transitional_processing
{
81 output
.push_str(decode_slice(slice
))
83 output
.push(codepoint
)
86 Mapping
::Disallowed
=> {
87 errors
.push(Error
::DissallowedCharacter
);
88 output
.push(codepoint
);
90 Mapping
::DisallowedStd3Valid
=> {
91 if flags
.use_std3_ascii_rules
{
92 errors
.push(Error
::DissallowedByStd3AsciiRules
);
94 output
.push(codepoint
)
96 Mapping
::DisallowedStd3Mapped(ref slice
) => {
97 if flags
.use_std3_ascii_rules
{
98 errors
.push(Error
::DissallowedMappedInStd3
);
100 output
.push_str(decode_slice(slice
))
105 // http://tools.ietf.org/html/rfc5893#section-2
106 fn passes_bidi(label
: &str, is_bidi_domain
: bool
) -> bool
{
107 // Rule 0: Bidi Rules apply to Bidi Domain Names: a name with at least one RTL label. A label
108 // is RTL if it contains at least one character of bidi class R, AL or AN.
113 let mut chars
= label
.chars();
114 let first_char_class
= match chars
.next() {
115 Some(c
) => bidi_class(c
),
116 None
=> return true, // empty string
119 match first_char_class
{
126 if !matches
!(bidi_class(c
),
127 BidiClass
::L
| BidiClass
::EN
|
128 BidiClass
::ES
| BidiClass
::CS
|
129 BidiClass
::ET
| BidiClass
::ON
|
130 BidiClass
::BN
| BidiClass
::NSM
140 // must end in L or EN followed by 0 or more NSM
141 let mut rev_chars
= label
.chars().rev();
142 let mut last_non_nsm
= rev_chars
.next();
145 Some(c
) if bidi_class(c
) == BidiClass
::NSM
=> {
146 last_non_nsm
= rev_chars
.next();
153 Some(c
) if bidi_class(c
) == BidiClass
::L
154 || bidi_class(c
) == BidiClass
::EN
=> {}
,
155 Some(_
) => { return false; }
,
162 BidiClass
::R
| BidiClass
::AL
=> {
163 let mut found_en
= false;
164 let mut found_an
= false;
170 let char_class
= bidi_class(c
);
172 if char_class
== BidiClass
::EN
{
175 if char_class
== BidiClass
::AN
{
179 if !matches
!(char_class
, BidiClass
::R
| BidiClass
::AL
|
180 BidiClass
::AN
| BidiClass
::EN
|
181 BidiClass
::ES
| BidiClass
::CS
|
182 BidiClass
::ET
| BidiClass
::ON
|
183 BidiClass
::BN
| BidiClass
::NSM
) {
191 let mut rev_chars
= label
.chars().rev();
192 let mut last
= rev_chars
.next();
193 loop { // must end in L or EN followed by 0 or more NSM
195 Some(c
) if bidi_class(c
) == BidiClass
::NSM
=> {
196 last
= rev_chars
.next();
203 Some(c
) if matches
!(bidi_class(c
), BidiClass
::R
| BidiClass
::AL
|
204 BidiClass
::EN
| BidiClass
::AN
) => {}
,
205 _
=> { return false; }
209 if found_an
&& found_en
{
214 // Rule 1: Should start with L or R/AL
223 /// http://www.unicode.org/reports/tr46/#Validity_Criteria
224 fn validate(label
: &str, is_bidi_domain
: bool
, flags
: Flags
, errors
: &mut Vec
<Error
>) {
225 let first_char
= label
.chars().next();
226 if first_char
== None
{
227 // Empty string, pass
230 // V1: Must be in NFC form.
231 else if label
.nfc().ne(label
.chars()) {
232 errors
.push(Error
::ValidityCriteria
);
235 // V2: No U+002D HYPHEN-MINUS in both third and fourth positions.
237 // NOTE: Spec says that the label must not contain a HYPHEN-MINUS character in both the
238 // third and fourth positions. But nobody follows this criteria. See the spec issue below:
239 // https://github.com/whatwg/url/issues/53
241 // TODO: Add *CheckHyphens* flag.
243 // V3: neither begin nor end with a U+002D HYPHEN-MINUS
244 else if label
.starts_with("-") || label
.ends_with("-") {
245 errors
.push(Error
::ValidityCriteria
);
248 // V4: not contain a U+002E FULL STOP
250 // Here, label can't contain '.' since the input is from .split('.')
252 // V5: not begin with a GC=Mark
253 else if is_combining_mark(first_char
.unwrap()) {
254 errors
.push(Error
::ValidityCriteria
);
257 // V6: Check against Mapping Table
258 else if label
.chars().any(|c
| match *find_char(c
) {
259 Mapping
::Valid
=> false,
260 Mapping
::Deviation(_
) => flags
.transitional_processing
,
261 Mapping
::DisallowedStd3Valid
=> flags
.use_std3_ascii_rules
,
264 errors
.push(Error
::ValidityCriteria
);
267 // V7: ContextJ rules
269 // TODO: Implement rules and add *CheckJoiners* flag.
273 // TODO: Add *CheckBidi* flag
274 else if !passes_bidi(label
, is_bidi_domain
)
276 errors
.push(Error
::ValidityCriteria
);
280 /// http://www.unicode.org/reports/tr46/#Processing
281 fn processing(domain
: &str, flags
: Flags
, errors
: &mut Vec
<Error
>) -> String
{
282 let mut mapped
= String
::new();
283 for c
in domain
.chars() {
284 map_char(c
, flags
, &mut mapped
, errors
)
286 let normalized
: String
= mapped
.nfc().collect();
288 // Find out if it's a Bidi Domain Name
290 // First, check for literal bidi chars
291 let mut is_bidi_domain
= domain
.chars().any(|c
|
292 matches
!(bidi_class(c
), BidiClass
::R
| BidiClass
::AL
| BidiClass
::AN
)
295 // Then check for punycode-encoded bidi chars
296 for label
in normalized
.split('
.'
) {
297 if label
.starts_with(PUNYCODE_PREFIX
) {
298 match punycode
::decode_to_string(&label
[PUNYCODE_PREFIX
.len()..]) {
299 Some(decoded_label
) => {
300 if decoded_label
.chars().any(|c
|
301 matches
!(bidi_class(c
), BidiClass
::R
| BidiClass
::AL
| BidiClass
::AN
)
303 is_bidi_domain
= true;
307 is_bidi_domain
= true;
314 let mut validated
= String
::new();
315 let mut first
= true;
316 for label
in normalized
.split('
.'
) {
321 if label
.starts_with(PUNYCODE_PREFIX
) {
322 match punycode
::decode_to_string(&label
[PUNYCODE_PREFIX
.len()..]) {
323 Some(decoded_label
) => {
324 let flags
= Flags { transitional_processing: false, ..flags }
;
325 validate(&decoded_label
, is_bidi_domain
, flags
, errors
);
326 validated
.push_str(&decoded_label
)
328 None
=> errors
.push(Error
::PunycodeError
)
331 validate(label
, is_bidi_domain
, flags
, errors
);
332 validated
.push_str(label
)
/// Options that control UTS #46 processing.
#[derive(Copy, Clone)]
pub struct Flags {
    /// When `true`, code points whose mapping-table status is one of the
    /// "disallowed by STD3" statuses are reported as errors.
    pub use_std3_ascii_rules: bool,
    /// When `true`, deviation characters are replaced by their mapping
    /// instead of being kept unchanged.
    pub transitional_processing: bool,
    /// When `true`, `to_ascii` also reports domains and labels that are
    /// empty or exceed 253 and 63 bytes respectively.
    pub verify_dns_length: bool,
}
345 #[derive(PartialEq, Eq, Clone, Copy, Debug)]
349 DissallowedByStd3AsciiRules
,
350 DissallowedMappedInStd3
,
351 DissallowedCharacter
,
356 /// Errors recorded during UTS #46 processing.
358 /// This is opaque for now, only indicating the presence of at least one error.
359 /// More details may be exposed in the future.
361 pub struct Errors(Vec
<Error
>);
363 /// http://www.unicode.org/reports/tr46/#ToASCII
364 pub fn to_ascii(domain
: &str, flags
: Flags
) -> Result
<String
, Errors
> {
365 let mut errors
= Vec
::new();
366 let mut result
= String
::new();
367 let mut first
= true;
368 for label
in processing(domain
, flags
, &mut errors
).split('
.'
) {
373 if label
.is_ascii() {
374 result
.push_str(label
);
376 match punycode
::encode_str(label
) {
378 result
.push_str(PUNYCODE_PREFIX
);
381 None
=> errors
.push(Error
::PunycodeError
)
386 if flags
.verify_dns_length
{
387 let domain
= if result
.ends_with(".") { &result[..result.len()-1] }
else { &*result }
;
388 if domain
.len() < 1 || domain
.split('
.'
).any(|label
| label
.len() < 1) {
389 errors
.push(Error
::TooShortForDns
)
391 if domain
.len() > 253 || domain
.split('
.'
).any(|label
| label
.len() > 63) {
392 errors
.push(Error
::TooLongForDns
)
395 if errors
.is_empty() {
402 /// http://www.unicode.org/reports/tr46/#ToUnicode
404 /// Only `use_std3_ascii_rules` is used in `flags`.
405 pub fn to_unicode(domain
: &str, mut flags
: Flags
) -> (String
, Result
<(), Errors
>) {
406 flags
.transitional_processing
= false;
407 let mut errors
= Vec
::new();
408 let domain
= processing(domain
, flags
, &mut errors
);
409 let errors
= if errors
.is_empty() {