1 // Copyright 2013-2014 The rust-url developers.
3 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6 // option. This file may not be copied, modified, or distributed
7 // except according to those terms.
9 //! [*Unicode IDNA Compatibility Processing*
10 //! (Unicode Technical Standard #46)](http://www.unicode.org/reports/tr46/)
14 use std
::cmp
::Ordering
::{Equal, Greater, Less}
;
15 use unicode_bidi
::{bidi_class, BidiClass}
;
16 use unicode_normalization
::char::is_combining_mark
;
17 use unicode_normalization
::UnicodeNormalization
;
19 include
!("uts46_mapping_table.rs");
/// Prefix that identifies a Punycode-encoded label.
///
/// Labels starting with this prefix have the remainder handed to the Punycode
/// decoder, and the prefix is written in front of every label that had to be
/// Punycode-encoded.
const PUNYCODE_PREFIX: &str = "xn--";
24 struct StringTableSlice
{
25 // Store these as separate fields so the structure will have an
26 // alignment of 1 and thus pack better into the Mapping enum, below.
32 fn decode_slice(slice
: &StringTableSlice
) -> &'
static str {
33 let lo
= slice
.byte_start_lo
as usize;
34 let hi
= slice
.byte_start_hi
as usize;
35 let start
= (hi
<< 8) | lo
;
36 let len
= slice
.byte_len
as usize;
37 &STRING_TABLE
[start
..(start
+ len
)]
45 Mapped(StringTableSlice
),
46 Deviation(StringTableSlice
),
49 DisallowedStd3Mapped(StringTableSlice
),
57 fn find_char(codepoint
: char) -> &'
static Mapping
{
58 let r
= TABLE
.binary_search_by(|ref range
| {
59 if codepoint
> range
.to
{
61 } else if codepoint
< range
.from
{
69 const SINGLE_MARKER
: u16 = 1 << 15;
71 let x
= INDEX_TABLE
[i
];
72 let single
= (x
& SINGLE_MARKER
) != 0;
73 let offset
= !SINGLE_MARKER
& x
;
76 &MAPPING_TABLE
[offset
as usize]
78 &MAPPING_TABLE
[(offset
+ (codepoint
as u16 - TABLE
[i
].from
as u16)) as usize]
84 fn map_char(codepoint
: char, config
: Config
, output
: &mut String
, errors
: &mut Vec
<Error
>) {
85 match *find_char(codepoint
) {
86 Mapping
::Valid
=> output
.push(codepoint
),
87 Mapping
::Ignored
=> {}
88 Mapping
::Mapped(ref slice
) => output
.push_str(decode_slice(slice
)),
89 Mapping
::Deviation(ref slice
) => {
90 if config
.transitional_processing
{
91 output
.push_str(decode_slice(slice
))
93 output
.push(codepoint
)
96 Mapping
::Disallowed
=> {
97 errors
.push(Error
::DissallowedCharacter
);
98 output
.push(codepoint
);
100 Mapping
::DisallowedStd3Valid
=> {
101 if config
.use_std3_ascii_rules
{
102 errors
.push(Error
::DissallowedByStd3AsciiRules
);
104 output
.push(codepoint
)
106 Mapping
::DisallowedStd3Mapped(ref slice
) => {
107 if config
.use_std3_ascii_rules
{
108 errors
.push(Error
::DissallowedMappedInStd3
);
110 output
.push_str(decode_slice(slice
))
115 // http://tools.ietf.org/html/rfc5893#section-2
116 fn passes_bidi(label
: &str, is_bidi_domain
: bool
) -> bool
{
117 // Rule 0: Bidi Rules apply to Bidi Domain Names: a name with at least one RTL label. A label
118 // is RTL if it contains at least one character of bidi class R, AL or AN.
123 let mut chars
= label
.chars();
124 let first_char_class
= match chars
.next() {
125 Some(c
) => bidi_class(c
),
126 None
=> return true, // empty string
129 match first_char_class
{
157 // must end in L or EN followed by 0 or more NSM
158 let mut rev_chars
= label
.chars().rev();
159 let mut last_non_nsm
= rev_chars
.next();
162 Some(c
) if bidi_class(c
) == BidiClass
::NSM
=> {
163 last_non_nsm
= rev_chars
.next();
172 Some(c
) if bidi_class(c
) == BidiClass
::L
|| bidi_class(c
) == BidiClass
::EN
=> {}
181 BidiClass
::R
| BidiClass
::AL
=> {
182 let mut found_en
= false;
183 let mut found_an
= false;
189 let char_class
= bidi_class(c
);
191 if char_class
== BidiClass
::EN
{
194 if char_class
== BidiClass
::AN
{
220 let mut rev_chars
= label
.chars().rev();
221 let mut last
= rev_chars
.next();
223 // must end in L or EN followed by 0 or more NSM
225 Some(c
) if bidi_class(c
) == BidiClass
::NSM
=> {
226 last
= rev_chars
.next();
238 BidiClass
::R
| BidiClass
::AL
| BidiClass
::EN
| BidiClass
::AN
246 if found_an
&& found_en
{
251 // Rule 1: Should start with L or R/AL
260 /// http://www.unicode.org/reports/tr46/#Validity_Criteria
261 fn validate_full(label
: &str, is_bidi_domain
: bool
, config
: Config
, errors
: &mut Vec
<Error
>) {
262 // V1: Must be in NFC form.
263 if label
.nfc().ne(label
.chars()) {
264 errors
.push(Error
::ValidityCriteria
);
266 validate(label
, is_bidi_domain
, config
, errors
);
270 fn validate(label
: &str, is_bidi_domain
: bool
, config
: Config
, errors
: &mut Vec
<Error
>) {
271 let first_char
= label
.chars().next();
272 if first_char
== None
{
273 // Empty string, pass
275 // V2: No U+002D HYPHEN-MINUS in both third and fourth positions.
277 // NOTE: Spec says that the label must not contain a HYPHEN-MINUS character in both the
278 // third and fourth positions. But nobody follows this criteria. See the spec issue below:
279 // https://github.com/whatwg/url/issues/53
281 // V3: neither begin nor end with a U+002D HYPHEN-MINUS
282 else if config
.check_hyphens
&& (label
.starts_with("-") || label
.ends_with("-")) {
283 errors
.push(Error
::ValidityCriteria
);
285 // V4: not contain a U+002E FULL STOP
287 // Here, label can't contain '.' since the input is from .split('.')
289 // V5: not begin with a GC=Mark
290 else if is_combining_mark(first_char
.unwrap()) {
291 errors
.push(Error
::ValidityCriteria
);
293 // V6: Check against Mapping Table
294 else if label
.chars().any(|c
| match *find_char(c
) {
295 Mapping
::Valid
=> false,
296 Mapping
::Deviation(_
) => config
.transitional_processing
,
297 Mapping
::DisallowedStd3Valid
=> config
.use_std3_ascii_rules
,
300 errors
.push(Error
::ValidityCriteria
);
302 // V7: ContextJ rules
304 // TODO: Implement rules and add *CheckJoiners* flag.
308 // TODO: Add *CheckBidi* flag
309 else if !passes_bidi(label
, is_bidi_domain
) {
310 errors
.push(Error
::ValidityCriteria
);
314 /// http://www.unicode.org/reports/tr46/#Processing
315 fn processing(domain
: &str, config
: Config
, errors
: &mut Vec
<Error
>) -> String
{
316 let mut mapped
= String
::with_capacity(domain
.len());
317 for c
in domain
.chars() {
318 map_char(c
, config
, &mut mapped
, errors
)
320 let mut normalized
= String
::with_capacity(mapped
.len());
321 normalized
.extend(mapped
.nfc());
323 // Find out if it's a Bidi Domain Name
325 // First, check for literal bidi chars
326 let mut is_bidi_domain
= domain
328 .any(|c
| matches
!(bidi_class(c
), BidiClass
::R
| BidiClass
::AL
| BidiClass
::AN
));
330 // Then check for punycode-encoded bidi chars
331 for label
in normalized
.split('
.'
) {
332 if label
.starts_with(PUNYCODE_PREFIX
) {
333 match punycode
::decode_to_string(&label
[PUNYCODE_PREFIX
.len()..]) {
334 Some(decoded_label
) => {
335 if decoded_label
.chars().any(|c
| {
336 matches
!(bidi_class(c
), BidiClass
::R
| BidiClass
::AL
| BidiClass
::AN
)
338 is_bidi_domain
= true;
342 is_bidi_domain
= true;
349 let mut validated
= String
::new();
350 let mut first
= true;
351 for label
in normalized
.split('
.'
) {
356 if label
.starts_with(PUNYCODE_PREFIX
) {
357 match punycode
::decode_to_string(&label
[PUNYCODE_PREFIX
.len()..]) {
358 Some(decoded_label
) => {
359 let config
= config
.transitional_processing(false);
360 validate_full(&decoded_label
, is_bidi_domain
, config
, errors
);
361 validated
.push_str(&decoded_label
)
363 None
=> errors
.push(Error
::PunycodeError
),
366 // `normalized` is already `NFC` so we can skip that check
367 validate(label
, is_bidi_domain
, config
, errors
);
368 validated
.push_str(label
)
374 #[derive(Clone, Copy)]
376 use_std3_ascii_rules
: bool
,
377 transitional_processing
: bool
,
378 verify_dns_length
: bool
,
382 /// The defaults are that of https://url.spec.whatwg.org/#idna
383 impl Default
for Config
{
384 fn default() -> Self {
386 use_std3_ascii_rules
: false,
387 transitional_processing
: false,
388 check_hyphens
: false,
390 // check_joiners: true,
392 // Only use for to_ascii, not to_unicode
393 verify_dns_length
: false,
400 pub fn use_std3_ascii_rules(mut self, value
: bool
) -> Self {
401 self.use_std3_ascii_rules
= value
;
406 pub fn transitional_processing(mut self, value
: bool
) -> Self {
407 self.transitional_processing
= value
;
412 pub fn verify_dns_length(mut self, value
: bool
) -> Self {
413 self.verify_dns_length
= value
;
418 pub fn check_hyphens(mut self, value
: bool
) -> Self {
419 self.check_hyphens
= value
;
423 /// http://www.unicode.org/reports/tr46/#ToASCII
424 pub fn to_ascii(self, domain
: &str) -> Result
<String
, Errors
> {
425 let mut errors
= Vec
::new();
426 let mut result
= String
::new();
427 let mut first
= true;
428 for label
in processing(domain
, self, &mut errors
).split('
.'
) {
433 if label
.is_ascii() {
434 result
.push_str(label
);
436 match punycode
::encode_str(label
) {
438 result
.push_str(PUNYCODE_PREFIX
);
441 None
=> errors
.push(Error
::PunycodeError
),
446 if self.verify_dns_length
{
447 let domain
= if result
.ends_with(".") {
448 &result
[..result
.len() - 1]
452 if domain
.len() < 1 || domain
.split('
.'
).any(|label
| label
.len() < 1) {
453 errors
.push(Error
::TooShortForDns
)
455 if domain
.len() > 253 || domain
.split('
.'
).any(|label
| label
.len() > 63) {
456 errors
.push(Error
::TooLongForDns
)
459 if errors
.is_empty() {
466 /// http://www.unicode.org/reports/tr46/#ToUnicode
467 pub fn to_unicode(self, domain
: &str) -> (String
, Result
<(), Errors
>) {
468 let mut errors
= Vec
::new();
469 let domain
= processing(domain
, self, &mut errors
);
470 let errors
= if errors
.is_empty() {
479 #[derive(PartialEq, Eq, Clone, Copy, Debug)]
483 DissallowedByStd3AsciiRules
,
484 DissallowedMappedInStd3
,
485 DissallowedCharacter
,
490 /// Errors recorded during UTS #46 processing.
492 /// This is opaque for now, only indicating the presence of at least one error.
493 /// More details may be exposed in the future.
495 pub struct Errors(Vec
<Error
>);