1 // Copyright 2013-2014 The rust-url developers.
2 //
3 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6 // option. This file may not be copied, modified, or distributed
7 // except according to those terms.
8 
9 //! [*Unicode IDNA Compatibility Processing*
10 //! (Unicode Technical Standard #46)](http://www.unicode.org/reports/tr46/)
11 
12 use self::Mapping::*;
13 use crate::punycode;
14 
15 use alloc::string::String;
16 use core::fmt;
17 use unicode_bidi::{bidi_class, BidiClass};
18 use unicode_normalization::char::is_combining_mark;
19 use unicode_normalization::{is_nfc, UnicodeNormalization};
20 
21 include!("uts46_mapping_table.rs");
22 
/// Prefix that marks a label as Punycode-encoded: it is stripped before a label is handed
/// to the Punycode decoder and prepended before a label is Punycode-encoded.
const PUNYCODE_PREFIX: &str = "xn--";
24 
/// Location of one replacement string inside `STRING_TABLE`.
///
/// `decode_slice` combines the two start bytes into a 16-bit byte offset and returns the
/// `byte_len` bytes starting there.
#[derive(Debug)]
struct StringTableSlice {
    // Store these as separate fields so the structure will have an
    // alignment of 1 and thus pack better into the Mapping enum, below.
    /// Low 8 bits of the starting byte offset.
    byte_start_lo: u8,
    /// High 8 bits of the starting byte offset.
    byte_start_hi: u8,
    /// Length of the replacement string, in bytes.
    byte_len: u8,
}
33 
decode_slice(slice: &StringTableSlice) -> &'static str34 fn decode_slice(slice: &StringTableSlice) -> &'static str {
35     let lo = slice.byte_start_lo as usize;
36     let hi = slice.byte_start_hi as usize;
37     let start = (hi << 8) | lo;
38     let len = slice.byte_len as usize;
39     &STRING_TABLE[start..(start + len)]
40 }
41 
/// Status of a code point in the UTS #46 mapping table, as returned by `find_char`.
///
/// The notes on each variant describe how `Mapper::next` reacts to it.
#[repr(u8)]
#[derive(Debug)]
enum Mapping {
    /// The code point is emitted unchanged.
    Valid,
    /// The code point is dropped from the output.
    Ignored,
    /// The code point is replaced by the referenced string.
    Mapped(StringTableSlice),
    /// Replaced by the referenced string under transitional processing, otherwise emitted
    /// unchanged.
    Deviation(StringTableSlice),
    /// The code point is emitted unchanged and `disallowed_character` is recorded.
    Disallowed,
    /// Emitted unchanged; an error is recorded only when `use_std3_ascii_rules` is set.
    DisallowedStd3Valid,
    /// Replaced by the referenced string; an error is recorded only when
    /// `use_std3_ascii_rules` is set.
    DisallowedStd3Mapped(StringTableSlice),
    /// Emitted unchanged; an error is recorded only when `use_idna_2008_rules` is set.
    DisallowedIdna2008,
}
54 
find_char(codepoint: char) -> &'static Mapping55 fn find_char(codepoint: char) -> &'static Mapping {
56     let idx = match TABLE.binary_search_by_key(&codepoint, |&val| val.0) {
57         Ok(idx) => idx,
58         Err(idx) => idx - 1,
59     };
60 
61     const SINGLE_MARKER: u16 = 1 << 15;
62 
63     let (base, x) = TABLE[idx];
64     let single = (x & SINGLE_MARKER) != 0;
65     let offset = !SINGLE_MARKER & x;
66 
67     if single {
68         &MAPPING_TABLE[offset as usize]
69     } else {
70         &MAPPING_TABLE[(offset + (codepoint as u16 - base as u16)) as usize]
71     }
72 }
73 
/// Iterator that applies the UTS #46 mapping table to a stream of characters.
///
/// Errors met along the way are recorded in `errors` rather than stopping iteration.
struct Mapper<'a> {
    /// Remaining input characters.
    chars: core::str::Chars<'a>,
    /// Flags that decide how deviation / STD3 / IDNA 2008 code points are treated.
    config: Config,
    /// Error flags set as disallowed code points are encountered.
    errors: &'a mut Errors,
    /// Replacement string currently being emitted, if the last input character was mapped
    /// to one and it has not been fully yielded yet.
    slice: Option<core::str::Chars<'static>>,
}
80 
81 impl<'a> Iterator for Mapper<'a> {
82     type Item = char;
83 
next(&mut self) -> Option<Self::Item>84     fn next(&mut self) -> Option<Self::Item> {
85         loop {
86             if let Some(s) = &mut self.slice {
87                 match s.next() {
88                     Some(c) => return Some(c),
89                     None => {
90                         self.slice = None;
91                     }
92                 }
93             }
94 
95             let codepoint = self.chars.next()?;
96             if let '.' | '-' | 'a'..='z' | '0'..='9' = codepoint {
97                 return Some(codepoint);
98             }
99 
100             return Some(match *find_char(codepoint) {
101                 Mapping::Valid => codepoint,
102                 Mapping::Ignored => continue,
103                 Mapping::Mapped(ref slice) => {
104                     self.slice = Some(decode_slice(slice).chars());
105                     continue;
106                 }
107                 Mapping::Deviation(ref slice) => {
108                     if self.config.transitional_processing {
109                         self.slice = Some(decode_slice(slice).chars());
110                         continue;
111                     } else {
112                         codepoint
113                     }
114                 }
115                 Mapping::Disallowed => {
116                     self.errors.disallowed_character = true;
117                     codepoint
118                 }
119                 Mapping::DisallowedStd3Valid => {
120                     if self.config.use_std3_ascii_rules {
121                         self.errors.disallowed_by_std3_ascii_rules = true;
122                     };
123                     codepoint
124                 }
125                 Mapping::DisallowedStd3Mapped(ref slice) => {
126                     if self.config.use_std3_ascii_rules {
127                         self.errors.disallowed_mapped_in_std3 = true;
128                     };
129                     self.slice = Some(decode_slice(slice).chars());
130                     continue;
131                 }
132                 Mapping::DisallowedIdna2008 => {
133                     if self.config.use_idna_2008_rules {
134                         self.errors.disallowed_in_idna_2008 = true;
135                     }
136                     codepoint
137                 }
138             });
139         }
140     }
141 }
142 
143 // http://tools.ietf.org/html/rfc5893#section-2
passes_bidi(label: &str, is_bidi_domain: bool) -> bool144 fn passes_bidi(label: &str, is_bidi_domain: bool) -> bool {
145     // Rule 0: Bidi Rules apply to Bidi Domain Names: a name with at least one RTL label.  A label
146     // is RTL if it contains at least one character of bidi class R, AL or AN.
147     if !is_bidi_domain {
148         return true;
149     }
150 
151     let mut chars = label.chars();
152     let first_char_class = match chars.next() {
153         Some(c) => bidi_class(c),
154         None => return true, // empty string
155     };
156 
157     match first_char_class {
158         // LTR label
159         BidiClass::L => {
160             // Rule 5
161             for c in chars.by_ref() {
162                 if !matches!(
163                     bidi_class(c),
164                     BidiClass::L
165                         | BidiClass::EN
166                         | BidiClass::ES
167                         | BidiClass::CS
168                         | BidiClass::ET
169                         | BidiClass::ON
170                         | BidiClass::BN
171                         | BidiClass::NSM
172                 ) {
173                     return false;
174                 }
175             }
176 
177             // Rule 6
178             // must end in L or EN followed by 0 or more NSM
179             let mut rev_chars = label.chars().rev();
180             let mut last_non_nsm = rev_chars.next();
181             loop {
182                 match last_non_nsm {
183                     Some(c) if bidi_class(c) == BidiClass::NSM => {
184                         last_non_nsm = rev_chars.next();
185                         continue;
186                     }
187                     _ => {
188                         break;
189                     }
190                 }
191             }
192             match last_non_nsm {
193                 Some(c) if bidi_class(c) == BidiClass::L || bidi_class(c) == BidiClass::EN => {}
194                 Some(_) => {
195                     return false;
196                 }
197                 _ => {}
198             }
199         }
200 
201         // RTL label
202         BidiClass::R | BidiClass::AL => {
203             let mut found_en = false;
204             let mut found_an = false;
205 
206             // Rule 2
207             for c in chars {
208                 let char_class = bidi_class(c);
209                 if char_class == BidiClass::EN {
210                     found_en = true;
211                 } else if char_class == BidiClass::AN {
212                     found_an = true;
213                 }
214 
215                 if !matches!(
216                     char_class,
217                     BidiClass::R
218                         | BidiClass::AL
219                         | BidiClass::AN
220                         | BidiClass::EN
221                         | BidiClass::ES
222                         | BidiClass::CS
223                         | BidiClass::ET
224                         | BidiClass::ON
225                         | BidiClass::BN
226                         | BidiClass::NSM
227                 ) {
228                     return false;
229                 }
230             }
231             // Rule 3
232             let mut rev_chars = label.chars().rev();
233             let mut last = rev_chars.next();
234             loop {
235                 // must end in L or EN followed by 0 or more NSM
236                 match last {
237                     Some(c) if bidi_class(c) == BidiClass::NSM => {
238                         last = rev_chars.next();
239                         continue;
240                     }
241                     _ => {
242                         break;
243                     }
244                 }
245             }
246             match last {
247                 Some(c)
248                     if matches!(
249                         bidi_class(c),
250                         BidiClass::R | BidiClass::AL | BidiClass::EN | BidiClass::AN
251                     ) => {}
252                 _ => {
253                     return false;
254                 }
255             }
256 
257             // Rule 4
258             if found_an && found_en {
259                 return false;
260             }
261         }
262 
263         // Rule 1: Should start with L or R/AL
264         _ => {
265             return false;
266         }
267     }
268 
269     true
270 }
271 
272 /// Check the validity criteria for the given label
273 ///
274 /// V1 (NFC) and V8 (Bidi) are checked inside `processing()` to prevent doing duplicate work.
275 ///
276 /// http://www.unicode.org/reports/tr46/#Validity_Criteria
check_validity(label: &str, config: Config, errors: &mut Errors)277 fn check_validity(label: &str, config: Config, errors: &mut Errors) {
278     let first_char = label.chars().next();
279     if first_char.is_none() {
280         // Empty string, pass
281         return;
282     }
283 
284     // V2: No U+002D HYPHEN-MINUS in both third and fourth positions.
285     //
286     // NOTE: Spec says that the label must not contain a HYPHEN-MINUS character in both the
287     // third and fourth positions. But nobody follows this criteria. See the spec issue below:
288     // https://github.com/whatwg/url/issues/53
289 
290     // V3: neither begin nor end with a U+002D HYPHEN-MINUS
291     if config.check_hyphens && (label.starts_with('-') || label.ends_with('-')) {
292         errors.check_hyphens = true;
293         return;
294     }
295 
296     // V4: not contain a U+002E FULL STOP
297     //
298     // Here, label can't contain '.' since the input is from .split('.')
299 
300     // V5: not begin with a GC=Mark
301     if is_combining_mark(first_char.unwrap()) {
302         errors.start_combining_mark = true;
303         return;
304     }
305 
306     // V6: Check against Mapping Table
307     if label.chars().any(|c| match *find_char(c) {
308         Mapping::Valid | Mapping::DisallowedIdna2008 => false,
309         Mapping::Deviation(_) => config.transitional_processing,
310         Mapping::DisallowedStd3Valid => config.use_std3_ascii_rules,
311         _ => true,
312     }) {
313         errors.invalid_mapping = true;
314     }
315 
316     // V7: ContextJ rules
317     //
318     // TODO: Implement rules and add *CheckJoiners* flag.
319 
320     // V8: Bidi rules are checked inside `processing()`
321 }
322 
// Detect simple cases: a non-empty domain made up solely of lowercase ASCII letters, ASCII
// digits and '.' separators. Such a domain can be copied to the output as-is.
//
// Any '-' makes the domain non-simple. That alone rules out labels starting with
// PUNYCODE_PREFIX and labels starting or ending with a hyphen, so no per-label state has
// to be tracked. (The previous implementation tracked such state, but those branches could
// never be reached because every hyphen was already rejected by the character-class check.)
fn is_simple(domain: &str) -> bool {
    if domain.is_empty() {
        return false;
    }
    // Byte-wise is sufficient: every byte of a non-ASCII character is >= 0x80 and is
    // therefore rejected.
    domain
        .bytes()
        .all(|b| matches!(b, b'.' | b'a'..=b'z' | b'0'..=b'9'))
}
357 
358 /// http://www.unicode.org/reports/tr46/#Processing
processing( domain: &str, config: Config, normalized: &mut String, output: &mut String, ) -> Errors359 fn processing(
360     domain: &str,
361     config: Config,
362     normalized: &mut String,
363     output: &mut String,
364 ) -> Errors {
365     normalized.clear();
366     let mut errors = Errors::default();
367     let offset = output.len();
368 
369     let iter = Mapper {
370         chars: domain.chars(),
371         config,
372         errors: &mut errors,
373         slice: None,
374     };
375 
376     normalized.extend(iter.nfc());
377 
378     let mut decoder = punycode::Decoder::default();
379     let non_transitional = config.transitional_processing(false);
380     let (mut first, mut has_bidi_labels) = (true, false);
381     for label in normalized.split('.') {
382         if !first {
383             output.push('.');
384         }
385         first = false;
386         if let Some(remainder) = label.strip_prefix(PUNYCODE_PREFIX) {
387             match decoder.decode(remainder) {
388                 Ok(decode) => {
389                     let start = output.len();
390                     output.extend(decode);
391                     let decoded_label = &output[start..];
392 
393                     if !has_bidi_labels {
394                         has_bidi_labels |= is_bidi_domain(decoded_label);
395                     }
396 
397                     if !errors.is_err() {
398                         if !is_nfc(decoded_label) {
399                             errors.nfc = true;
400                         } else {
401                             check_validity(decoded_label, non_transitional, &mut errors);
402                         }
403                     }
404                 }
405                 Err(()) => {
406                     has_bidi_labels = true;
407                     errors.punycode = true;
408                 }
409             }
410         } else {
411             if !has_bidi_labels {
412                 has_bidi_labels |= is_bidi_domain(label);
413             }
414 
415             // `normalized` is already `NFC` so we can skip that check
416             check_validity(label, config, &mut errors);
417             output.push_str(label)
418         }
419     }
420 
421     for label in output[offset..].split('.') {
422         // V8: Bidi rules
423         //
424         // TODO: Add *CheckBidi* flag
425         if !passes_bidi(label, has_bidi_labels) {
426             errors.check_bidi = true;
427             break;
428         }
429     }
430 
431     errors
432 }
433 
/// UTS #46 converter that keeps its scratch buffers between calls.
#[derive(Default)]
pub struct Idna {
    /// Processing flags applied by every conversion.
    config: Config,
    /// Scratch buffer handed to `processing` for the mapped and normalized domain.
    normalized: String,
    /// Scratch buffer holding the result of `processing` while `to_ascii_inner` re-encodes
    /// it label by label.
    output: String,
}
440 
441 impl Idna {
new(config: Config) -> Self442     pub fn new(config: Config) -> Self {
443         Self {
444             config,
445             normalized: String::new(),
446             output: String::new(),
447         }
448     }
449 
to_ascii_inner(&mut self, domain: &str, out: &mut String) -> Errors450     pub fn to_ascii_inner(&mut self, domain: &str, out: &mut String) -> Errors {
451         if is_simple(domain) {
452             out.push_str(domain);
453             return Errors::default();
454         }
455         let mut errors = processing(domain, self.config, &mut self.normalized, out);
456         self.output = core::mem::replace(out, String::with_capacity(out.len()));
457         let mut first = true;
458         for label in self.output.split('.') {
459             if !first {
460                 out.push('.');
461             }
462             first = false;
463 
464             if label.is_ascii() {
465                 out.push_str(label);
466             } else {
467                 let offset = out.len();
468                 out.push_str(PUNYCODE_PREFIX);
469                 if let Err(()) = punycode::encode_into(label.chars(), out) {
470                     errors.punycode = true;
471                     out.truncate(offset);
472                 }
473             }
474         }
475         errors
476     }
477 
478     /// http://www.unicode.org/reports/tr46/#ToASCII
479     #[allow(clippy::wrong_self_convention)]
to_ascii(&mut self, domain: &str, out: &mut String) -> Result<(), Errors>480     pub fn to_ascii(&mut self, domain: &str, out: &mut String) -> Result<(), Errors> {
481         let mut errors = self.to_ascii_inner(domain, out);
482 
483         if self.config.verify_dns_length {
484             let domain = if out.ends_with('.') {
485                 &out[..out.len() - 1]
486             } else {
487                 &*out
488             };
489             if domain.is_empty() || domain.split('.').any(|label| label.is_empty()) {
490                 errors.too_short_for_dns = true;
491             }
492             if domain.len() > 253 || domain.split('.').any(|label| label.len() > 63) {
493                 errors.too_long_for_dns = true;
494             }
495         }
496 
497         errors.into()
498     }
499 
500     /// http://www.unicode.org/reports/tr46/#ToUnicode
501     #[allow(clippy::wrong_self_convention)]
to_unicode(&mut self, domain: &str, out: &mut String) -> Result<(), Errors>502     pub fn to_unicode(&mut self, domain: &str, out: &mut String) -> Result<(), Errors> {
503         if is_simple(domain) {
504             out.push_str(domain);
505             return Errors::default().into();
506         }
507         processing(domain, self.config, &mut self.normalized, out).into()
508     }
509 }
510 
/// Flags controlling UTS #46 processing. Built by chaining the setter methods.
#[derive(Clone, Copy)]
#[must_use]
pub struct Config {
    /// Report an error for code points whose table status is "disallowed under STD3".
    use_std3_ascii_rules: bool,
    /// Map deviation code points to their replacement instead of keeping them.
    transitional_processing: bool,
    /// Make `to_ascii` check the DNS length limits of the result.
    verify_dns_length: bool,
    /// Report labels that begin or end with '-'.
    check_hyphens: bool,
    /// Report an error for code points whose table status is "disallowed in IDNA 2008".
    use_idna_2008_rules: bool,
}
520 
521 /// The defaults are that of https://url.spec.whatwg.org/#idna
522 impl Default for Config {
default() -> Self523     fn default() -> Self {
524         Config {
525             use_std3_ascii_rules: false,
526             transitional_processing: false,
527             check_hyphens: false,
528             // check_bidi: true,
529             // check_joiners: true,
530 
531             // Only use for to_ascii, not to_unicode
532             verify_dns_length: false,
533             use_idna_2008_rules: false,
534         }
535     }
536 }
537 
538 impl Config {
539     #[inline]
use_std3_ascii_rules(mut self, value: bool) -> Self540     pub fn use_std3_ascii_rules(mut self, value: bool) -> Self {
541         self.use_std3_ascii_rules = value;
542         self
543     }
544 
545     #[inline]
transitional_processing(mut self, value: bool) -> Self546     pub fn transitional_processing(mut self, value: bool) -> Self {
547         self.transitional_processing = value;
548         self
549     }
550 
551     #[inline]
verify_dns_length(mut self, value: bool) -> Self552     pub fn verify_dns_length(mut self, value: bool) -> Self {
553         self.verify_dns_length = value;
554         self
555     }
556 
557     #[inline]
check_hyphens(mut self, value: bool) -> Self558     pub fn check_hyphens(mut self, value: bool) -> Self {
559         self.check_hyphens = value;
560         self
561     }
562 
563     #[inline]
use_idna_2008_rules(mut self, value: bool) -> Self564     pub fn use_idna_2008_rules(mut self, value: bool) -> Self {
565         self.use_idna_2008_rules = value;
566         self
567     }
568 
569     /// http://www.unicode.org/reports/tr46/#ToASCII
to_ascii(self, domain: &str) -> Result<String, Errors>570     pub fn to_ascii(self, domain: &str) -> Result<String, Errors> {
571         let mut result = String::with_capacity(domain.len());
572         let mut codec = Idna::new(self);
573         codec.to_ascii(domain, &mut result).map(|()| result)
574     }
575 
576     /// http://www.unicode.org/reports/tr46/#ToUnicode
to_unicode(self, domain: &str) -> (String, Result<(), Errors>)577     pub fn to_unicode(self, domain: &str) -> (String, Result<(), Errors>) {
578         let mut codec = Idna::new(self);
579         let mut out = String::with_capacity(domain.len());
580         let result = codec.to_unicode(domain, &mut out);
581         (out, result)
582     }
583 }
584 
is_bidi_domain(s: &str) -> bool585 fn is_bidi_domain(s: &str) -> bool {
586     for c in s.chars() {
587         if c.is_ascii_graphic() {
588             continue;
589         }
590         match bidi_class(c) {
591             BidiClass::R | BidiClass::AL | BidiClass::AN => return true,
592             _ => {}
593         }
594     }
595     false
596 }
597 
/// Errors recorded during UTS #46 processing.
///
/// This is opaque for now, indicating what types of errors have been encountered at least once.
/// More details may be exposed in the future.
#[derive(Default)]
pub struct Errors {
    /// A label could not be Punycode-decoded or Punycode-encoded.
    punycode: bool,
    /// A label begins or ends with '-' while `check_hyphens` is enabled.
    check_hyphens: bool,
    /// A label violates the Bidi rule.
    check_bidi: bool,
    /// A label begins with a combining mark.
    start_combining_mark: bool,
    /// A label contains a code point whose mapping-table status is not acceptable.
    invalid_mapping: bool,
    /// A Punycode-decoded label is not in NFC.
    nfc: bool,
    /// A "disallowed STD3 valid" code point was seen while `use_std3_ascii_rules` is enabled.
    disallowed_by_std3_ascii_rules: bool,
    /// A "disallowed STD3 mapped" code point was seen while `use_std3_ascii_rules` is enabled.
    disallowed_mapped_in_std3: bool,
    /// A disallowed code point was seen.
    disallowed_character: bool,
    /// The ASCII domain exceeds 253 bytes or one of its labels exceeds 63 bytes.
    too_long_for_dns: bool,
    /// The ASCII domain, or one of its labels, is empty.
    too_short_for_dns: bool,
    /// A "disallowed in IDNA 2008" code point was seen while `use_idna_2008_rules` is enabled.
    disallowed_in_idna_2008: bool,
}
617 
618 impl Errors {
is_err(&self) -> bool619     fn is_err(&self) -> bool {
620         let Errors {
621             punycode,
622             check_hyphens,
623             check_bidi,
624             start_combining_mark,
625             invalid_mapping,
626             nfc,
627             disallowed_by_std3_ascii_rules,
628             disallowed_mapped_in_std3,
629             disallowed_character,
630             too_long_for_dns,
631             too_short_for_dns,
632             disallowed_in_idna_2008,
633         } = *self;
634         punycode
635             || check_hyphens
636             || check_bidi
637             || start_combining_mark
638             || invalid_mapping
639             || nfc
640             || disallowed_by_std3_ascii_rules
641             || disallowed_mapped_in_std3
642             || disallowed_character
643             || too_long_for_dns
644             || too_short_for_dns
645             || disallowed_in_idna_2008
646     }
647 }
648 
649 impl fmt::Debug for Errors {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result650     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
651         let Errors {
652             punycode,
653             check_hyphens,
654             check_bidi,
655             start_combining_mark,
656             invalid_mapping,
657             nfc,
658             disallowed_by_std3_ascii_rules,
659             disallowed_mapped_in_std3,
660             disallowed_character,
661             too_long_for_dns,
662             too_short_for_dns,
663             disallowed_in_idna_2008,
664         } = *self;
665 
666         let fields = [
667             ("punycode", punycode),
668             ("check_hyphens", check_hyphens),
669             ("check_bidi", check_bidi),
670             ("start_combining_mark", start_combining_mark),
671             ("invalid_mapping", invalid_mapping),
672             ("nfc", nfc),
673             (
674                 "disallowed_by_std3_ascii_rules",
675                 disallowed_by_std3_ascii_rules,
676             ),
677             ("disallowed_mapped_in_std3", disallowed_mapped_in_std3),
678             ("disallowed_character", disallowed_character),
679             ("too_long_for_dns", too_long_for_dns),
680             ("too_short_for_dns", too_short_for_dns),
681             ("disallowed_in_idna_2008", disallowed_in_idna_2008),
682         ];
683 
684         let mut empty = true;
685         f.write_str("Errors { ")?;
686         for (name, val) in &fields {
687             if *val {
688                 if !empty {
689                     f.write_str(", ")?;
690                 }
691                 f.write_str(name)?;
692                 empty = false;
693             }
694         }
695 
696         if !empty {
697             f.write_str(" }")
698         } else {
699             f.write_str("}")
700         }
701     }
702 }
703 
704 impl From<Errors> for Result<(), Errors> {
from(e: Errors) -> Result<(), Errors>705     fn from(e: Errors) -> Result<(), Errors> {
706         if !e.is_err() {
707             Ok(())
708         } else {
709             Err(e)
710         }
711     }
712 }
713 
// `std::error::Error` is only implemented when the `std` feature is enabled; the rest of
// this file relies on `core` and `alloc` only.
#[cfg(feature = "std")]
impl std::error::Error for Errors {}
716 
717 impl fmt::Display for Errors {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result718     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
719         fmt::Debug::fmt(self, f)
720     }
721 }
722 
#[cfg(test)]
mod tests {
    use super::{find_char, Mapping};

    /// `Mapper::next` passes '-', '.', ASCII digits and lowercase ASCII letters through
    /// without a table lookup. That is only correct if the table maps all of them to
    /// `Valid`.
    #[test]
    fn mapping_fast_path() {
        let fast_path_chars = "-.0123456789abcdefghijklmnopqrstuvwxyz";
        for c in fast_path_chars.chars() {
            assert_matches!(find_char(c), &Mapping::Valid);
        }
    }
}
742