vendor/icu_locid/src/langid.rs

   1 // This file is part of ICU4X. For terms of use, please see the file
   2 // called LICENSE at the top level of the ICU4X source tree
   3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
   4
   5 use core::cmp::Ordering;
   6 use core::str::FromStr;
   7
   8 use crate::ordering::SubtagOrderingResult;
   9 use crate::parser::{
  10     parse_language_identifier, parse_language_identifier_with_single_variant, ParserError,
  11     ParserMode, SubtagIterator,
  12 };
  13 use crate::subtags;
  14 use alloc::string::String;
  15 use writeable::Writeable;
  16
  17 /// A core struct representing a [`Unicode BCP47 Language Identifier`].
  18 ///
  19 /// # Examples
  20 ///
  21 /// ```
  22 /// use icu::locid::{
  23 ///     langid, subtags_language as language, subtags_region as region,
  24 /// };
  25 ///
  26 /// let li = langid!("en-US");
  27 ///
  28 /// assert_eq!(li.language, language!("en"));
  29 /// assert_eq!(li.script, None);
  30 /// assert_eq!(li.region, Some(region!("US")));
  31 /// assert_eq!(li.variants.len(), 0);
  32 /// ```
  33 ///
  34 /// # Parsing
  35 ///
  36 /// Unicode recognizes three levels of standard conformance for any language identifier:
  37 ///
  38 ///  * *well-formed* - syntactically correct
  39 ///  * *valid* - well-formed and only uses registered language, region, script and variant subtags
  40 ///  * *canonical* - valid and no deprecated codes or structure.
  41 ///
  42 /// At the moment parsing normalizes a well-formed language identifier, converting
  43 /// `_` separators to `-` and adjusting casing to conform to the Unicode standard.
  44 ///
  45 /// Any subtag that is not well-formed causes parsing to fail with an error.
  46 /// No subtag validation is performed.
  47 ///
  48 /// # Examples
  49 ///
  50 /// ```
  51 /// use icu::locid::{
  52 ///     langid, subtags_language as language, subtags_region as region,
  53 ///     subtags_script as script, subtags_variant as variant,
  54 /// };
  55 ///
  56 /// let li = langid!("eN_latn_Us-Valencia");
  57 ///
  58 /// assert_eq!(li.language, language!("en"));
  59 /// assert_eq!(li.script, Some(script!("Latn")));
  60 /// assert_eq!(li.region, Some(region!("US")));
  61 /// assert_eq!(li.variants.get(0), Some(&variant!("valencia")));
  62 /// ```
  63 ///
  64 /// [`Unicode BCP47 Language Identifier`]: https://unicode.org/reports/tr35/tr35.html#Unicode_language_identifier
  65 #[derive(Default, PartialEq, Eq, Clone, Hash)]
  66 #[allow(clippy::exhaustive_structs)] // This struct is stable (and invoked by a macro)
  67 pub struct LanguageIdentifier {
  68     /// Language subtag of the language identifier (always present).
  69     pub language: subtags::Language,
  70     /// Script subtag of the language identifier, if any.
  71     pub script: Option<subtags::Script>,
  72     /// Region subtag of the language identifier, if any.
  73     pub region: Option<subtags::Region>,
  74     /// Variant subtags of the language identifier (possibly empty).
  75     pub variants: subtags::Variants,
  76 }
  77
  78 impl LanguageIdentifier {
  79     /// A constructor which takes a utf8 slice, parses it and
  80     /// produces a well-formed [`LanguageIdentifier`].
  81     ///
  82     /// # Examples
  83     ///
  84     /// ```
  85     /// use icu::locid::LanguageIdentifier;
  86     ///
  87     /// LanguageIdentifier::try_from_bytes(b"en-US").expect("Parsing failed");
  88     /// ```
  89     pub fn try_from_bytes(v: &[u8]) -> Result<Self, ParserError> {
  90         parse_language_identifier(v, ParserMode::LanguageIdentifier)
  91     }
  92
  93     #[doc(hidden)]
  94     #[allow(clippy::type_complexity)]
  95     // The return type should be `Result<Self, ParserError>` once the `const_precise_live_drops`
  96     // is stabilized ([rust-lang#73255](https://github.com/rust-lang/rust/issues/73255)).
  97     pub const fn try_from_bytes_with_single_variant(
  98         v: &[u8],
  99     ) -> Result<
 100         (
 101             subtags::Language,
 102             Option<subtags::Script>,
 103             Option<subtags::Region>,
 104             Option<subtags::Variant>,
 105         ),
 106         ParserError,
 107     > {
 108         parse_language_identifier_with_single_variant(v, ParserMode::LanguageIdentifier)
 109     }
 110
 111     /// A constructor which takes a utf8 slice which may contain extension keys,
 112     /// parses it and produces a well-formed [`LanguageIdentifier`].
 113     ///
 114     /// # Examples
 115     ///
 116     /// ```
 117     /// use icu::locid::{langid, LanguageIdentifier};
 118     ///
 119     /// let li = LanguageIdentifier::try_from_locale_bytes(b"en-US-x-posix")
 120     ///     .expect("Parsing failed.");
 121     ///
 122     /// assert_eq!(li, langid!("en-US"));
 123     /// ```
 124     ///
 125     /// This method should be used for input that may be a locale identifier.
 126     /// All extensions will be lost.
 127     pub fn try_from_locale_bytes(v: &[u8]) -> Result<Self, ParserError> {
 128         parse_language_identifier(v, ParserMode::Locale)
 129     }
 130
 131     /// The default undefined language "und". Same as [`default()`](Default::default()).
 132     ///
 133     /// # Examples
 134     ///
 135     /// ```
 136     /// use icu::locid::LanguageIdentifier;
 137     ///
 138     /// assert_eq!(LanguageIdentifier::default(), LanguageIdentifier::UND);
 139     /// ```
 140     pub const UND: Self = Self {
 141         language: subtags::Language::UND,
 142         script: None,
 143         region: None,
 144         variants: subtags::Variants::new(),
 145     };
 146
 147     /// This is a best-effort operation that performs all available levels of canonicalization.
 148     ///
 149     /// At the moment the operation will normalize casing and the separator, but in the future
 150     /// it may also validate and update from deprecated subtags to canonical ones.
 151     ///
 152     /// # Examples
 153     ///
 154     /// ```
 155     /// use icu::locid::LanguageIdentifier;
 156     ///
 157     /// assert_eq!(
 158     ///     LanguageIdentifier::canonicalize("pL_latn_pl").as_deref(),
 159     ///     Ok("pl-Latn-PL")
 160     /// );
 161     /// ```
 162     pub fn canonicalize<S: AsRef<[u8]>>(input: S) -> Result<String, ParserError> {
 163         let lang_id = Self::try_from_bytes(input.as_ref())?;
 164         Ok(lang_id.write_to_string().into_owned())
 165     }
 166
 167     /// Compare this [`LanguageIdentifier`] with BCP-47 bytes.
 168     ///
 169     /// The return value is equivalent to what would happen if you first converted this
 170     /// [`LanguageIdentifier`] to a BCP-47 string and then performed a byte comparison.
 171     ///
 172     /// This function is case-sensitive and results in a *total order*, so it is appropriate for
 173     /// binary search. The only argument producing [`Ordering::Equal`] is `self.to_string()`.
 174     ///
 175     /// # Examples
 176     ///
 177     /// ```
 178     /// use icu::locid::LanguageIdentifier;
 179     /// use std::cmp::Ordering;
 180     ///
 181     /// let bcp47_strings: &[&str] = &[
 182     ///     "pl-Latn-PL",
 183     ///     "und",
 184     ///     "und-Adlm",
 185     ///     "und-GB",
 186     ///     "und-ZA",
 187     ///     "und-fonipa",
 188     ///     "zh",
 189     /// ];
 190     ///
 191     /// for ab in bcp47_strings.windows(2) {
 192     ///     let a = ab[0];
 193     ///     let b = ab[1];
 194     ///     assert!(a.cmp(b) == Ordering::Less);
 195     ///     let a_langid = a.parse::<LanguageIdentifier>().unwrap();
 196     ///     assert!(a_langid.strict_cmp(a.as_bytes()) == Ordering::Equal);
 197     ///     assert!(a_langid.strict_cmp(b.as_bytes()) == Ordering::Less);
 198     /// }
 199     /// ```
 200     pub fn strict_cmp(&self, other: &[u8]) -> Ordering {
 201         self.strict_cmp_iter(other.split(|b| *b == b'-')).end()
 202     }
 203
 204     /// Compare this [`LanguageIdentifier`] with an iterator of BCP-47 subtags.
 205     ///
 206     /// This function has the same equality semantics as [`LanguageIdentifier::strict_cmp`]. It is intended as
 207     /// a more modular version that allows multiple subtag iterators to be chained together.
 208     ///
 209     /// For an additional example, see [`SubtagOrderingResult`].
 210     ///
 211     /// # Examples
 212     ///
 213     /// ```
 214     /// use icu::locid::LanguageIdentifier;
 215     /// use std::cmp::Ordering;
 216     ///
 217     /// let subtags: &[&[u8]] = &[b"ca", b"ES", b"valencia"];
 218     ///
 219     /// let loc = "ca-ES-valencia".parse::<LanguageIdentifier>().unwrap();
 220     /// assert_eq!(
 221     ///     Ordering::Equal,
 222     ///     loc.strict_cmp_iter(subtags.iter().copied()).end()
 223     /// );
 224     ///
 225     /// let loc = "ca-ES".parse::<LanguageIdentifier>().unwrap();
 226     /// assert_eq!(
 227     ///     Ordering::Less,
 228     ///     loc.strict_cmp_iter(subtags.iter().copied()).end()
 229     /// );
 230     ///
 231     /// let loc = "ca-ZA".parse::<LanguageIdentifier>().unwrap();
 232     /// assert_eq!(
 233     ///     Ordering::Greater,
 234     ///     loc.strict_cmp_iter(subtags.iter().copied()).end()
 235     /// );
 236     /// ```
 237     pub fn strict_cmp_iter<'l, I>(&self, mut subtags: I) -> SubtagOrderingResult<I>
 238     where
 239         I: Iterator<Item = &'l [u8]>,
 240     {
 241         let r = self.for_each_subtag_str(&mut |subtag| {
 242             if let Some(other) = subtags.next() {
 243                 match subtag.as_bytes().cmp(other) {
 244                     Ordering::Equal => Ok(()),
 245                     not_equal => Err(not_equal),
 246                 }
 247             } else {
 248                 Err(Ordering::Greater)
 249             }
 250         });
 251         match r {
 252             Ok(_) => SubtagOrderingResult::Subtags(subtags),
 253             Err(o) => SubtagOrderingResult::Ordering(o),
 254         }
 255     }
 256
 257     /// Compare this `LanguageIdentifier` with a potentially unnormalized BCP-47 string.
 258     ///
 259     /// The return value is equivalent to what would happen if you first parsed the
 260     /// BCP-47 string to a `LanguageIdentifier` and then performed a structucal comparison.
 261     ///
 262     /// # Examples
 263     ///
 264     /// ```
 265     /// use icu::locid::LanguageIdentifier;
 266     /// use std::cmp::Ordering;
 267     ///
 268     /// let bcp47_strings: &[&str] = &[
 269     ///     "pl-LaTn-pL",
 270     ///     "uNd",
 271     ///     "UnD-adlm",
 272     ///     "uNd-GB",
 273     ///     "UND-FONIPA",
 274     ///     "ZH",
 275     /// ];
 276     ///
 277     /// for a in bcp47_strings {
 278     ///     assert!(a.parse::<LanguageIdentifier>().unwrap().normalizing_eq(a));
 279     /// }
 280     /// ```
 281     pub fn normalizing_eq(&self, other: &str) -> bool {
 282         macro_rules! subtag_matches {
 283             ($T:ty, $iter:ident, $expected:expr) => {
 284                 $iter
 285                     .next()
 286                     .map(|b| <$T>::try_from_bytes(b) == Ok($expected))
 287                     .unwrap_or(false)
 288             };
 289         }
 290
 291         let mut iter = SubtagIterator::new(other.as_bytes());
 292         if !subtag_matches!(subtags::Language, iter, self.language) {
 293             return false;
 294         }
 295         if let Some(ref script) = self.script {
 296             if !subtag_matches!(subtags::Script, iter, *script) {
 297                 return false;
 298             }
 299         }
 300         if let Some(ref region) = self.region {
 301             if !subtag_matches!(subtags::Region, iter, *region) {
 302                 return false;
 303             }
 304         }
 305         for variant in self.variants.iter() {
 306             if !subtag_matches!(subtags::Variant, iter, *variant) {
 307                 return false;
 308             }
 309         }
 310         iter.next() == None
 311     }
 312
 313     pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
 314     where
 315         F: FnMut(&str) -> Result<(), E>,
 316     {
 317         f(self.language.as_str())?;
 318         if let Some(ref script) = self.script {
 319             f(script.as_str())?;
 320         }
 321         if let Some(ref region) = self.region {
 322             f(region.as_str())?;
 323         }
 324         for variant in self.variants.iter() {
 325             f(variant.as_str())?;
 326         }
 327         Ok(())
 328     }
 329 }
 330
 331 impl AsRef<LanguageIdentifier> for LanguageIdentifier {
 332     fn as_ref(&self) -> &Self {
 333         self
 334     }
 335 }
 336
 337 impl AsMut<LanguageIdentifier> for LanguageIdentifier {
 338     fn as_mut(&mut self) -> &mut Self {
 339         self
 340     }
 341 }
 342
 343 impl core::fmt::Debug for LanguageIdentifier {
 344     fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
 345         core::fmt::Display::fmt(&self, f)
 346     }
 347 }
 348
 349 impl FromStr for LanguageIdentifier {
 350     type Err = ParserError;
 351
 352     fn from_str(source: &str) -> Result<Self, Self::Err> {
 353         Self::try_from_bytes(source.as_bytes())
 354     }
 355 }
 356
 357 impl_writeable_for_each_subtag_str_no_test!(LanguageIdentifier, selff, selff.script.is_none() && selff.region.is_none() && selff.variants.is_empty() => selff.language.write_to_string());
 358
 359 #[test]
 360 fn test_writeable() {
 361     use writeable::assert_writeable_eq;
 362     assert_writeable_eq!(LanguageIdentifier::UND, "und");
 363     assert_writeable_eq!("und-001".parse::<LanguageIdentifier>().unwrap(), "und-001");
 364     assert_writeable_eq!(
 365         "und-Mymr".parse::<LanguageIdentifier>().unwrap(),
 366         "und-Mymr",
 367     );
 368     assert_writeable_eq!(
 369         "my-Mymr-MM".parse::<LanguageIdentifier>().unwrap(),
 370         "my-Mymr-MM",
 371     );
 372     assert_writeable_eq!(
 373         "my-Mymr-MM-posix".parse::<LanguageIdentifier>().unwrap(),
 374         "my-Mymr-MM-posix",
 375     );
 376     assert_writeable_eq!(
 377         "zh-macos-posix".parse::<LanguageIdentifier>().unwrap(),
 378         "zh-macos-posix",
 379     );
 380 }
 381
 382 /// # Examples
 383 ///
 384 /// ```
 385 /// use icu::locid::{
 386 ///     langid, subtags_language as language, LanguageIdentifier,
 387 /// };
 388 ///
 389 /// assert_eq!(LanguageIdentifier::from(language!("en")), langid!("en"));
 390 /// ```
 391 impl From<subtags::Language> for LanguageIdentifier {
 392     fn from(language: subtags::Language) -> Self {
 393         Self {
 394             language,
 395             ..Default::default()
 396         }
 397     }
 398 }
 399
 400 /// # Examples
 401 ///
 402 /// ```
 403 /// use icu::locid::{langid, subtags_script as script, LanguageIdentifier};
 404 ///
 405 /// assert_eq!(
 406 ///     LanguageIdentifier::from(Some(script!("latn"))),
 407 ///     langid!("und-Latn")
 408 /// );
 409 /// ```
 410 impl From<Option<subtags::Script>> for LanguageIdentifier {
 411     fn from(script: Option<subtags::Script>) -> Self {
 412         Self {
 413             script,
 414             ..Default::default()
 415         }
 416     }
 417 }
 418
 419 /// # Examples
 420 ///
 421 /// ```
 422 /// use icu::locid::{langid, subtags_region as region, LanguageIdentifier};
 423 ///
 424 /// assert_eq!(
 425 ///     LanguageIdentifier::from(Some(region!("US"))),
 426 ///     langid!("und-US")
 427 /// );
 428 /// ```
 429 impl From<Option<subtags::Region>> for LanguageIdentifier {
 430     fn from(region: Option<subtags::Region>) -> Self {
 431         Self {
 432             region,
 433             ..Default::default()
 434         }
 435     }
 436 }
 437
 438 /// Convert from an LSR tuple to a [`LanguageIdentifier`].
 439 ///
 440 /// # Examples
 441 ///
 442 /// ```
 443 /// use icu::locid::{
 444 ///     langid, subtags_language as language, subtags_region as region,
 445 ///     subtags_script as script, LanguageIdentifier,
 446 /// };
 447 ///
 448 /// let lang = language!("en");
 449 /// let script = script!("Latn");
 450 /// let region = region!("US");
 451 /// assert_eq!(
 452 ///     LanguageIdentifier::from((lang, Some(script), Some(region))),
 453 ///     langid!("en-Latn-US")
 454 /// );
 455 /// ```
 456 impl
 457     From<(
 458         subtags::Language,
 459         Option<subtags::Script>,
 460         Option<subtags::Region>,
 461     )> for LanguageIdentifier
 462 {
 463     fn from(
 464         lsr: (
 465             subtags::Language,
 466             Option<subtags::Script>,
 467             Option<subtags::Region>,
 468         ),
 469     ) -> Self {
 470         Self {
 471             language: lsr.0,
 472             script: lsr.1,
 473             region: lsr.2,
 474             ..Default::default()
 475         }
 476     }
 477 }
 478
 479 /// Convert from a [`LanguageIdentifier`] to an LSR tuple.
 480 ///
 481 /// # Examples
 482 ///
 483 /// ```
 484 /// use icu::locid::{
 485 ///     langid, subtags_language as language, subtags_region as region,
 486 ///     subtags_script as script,
 487 /// };
 488 ///
 489 /// let lid = langid!("en-Latn-US");
 490 /// let (lang, script, region) = (&lid).into();
 491 ///
 492 /// assert_eq!(lang, language!("en"));
 493 /// assert_eq!(script, Some(script!("Latn")));
 494 /// assert_eq!(region, Some(region!("US")));
 495 /// ```
 496 impl From<&LanguageIdentifier>
 497     for (
 498         subtags::Language,
 499         Option<subtags::Script>,
 500         Option<subtags::Region>,
 501     )
 502 {
 503     fn from(langid: &LanguageIdentifier) -> Self {
 504         (langid.language, langid.script, langid.region)
 505     }
 506 }