1 // This file is part of ICU4X. For terms of use, please see the file
2 // called LICENSE at the top level of the ICU4X source tree
3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
5 use core
::cmp
::Ordering
;
6 use core
::str::FromStr
;
8 use crate::ordering
::SubtagOrderingResult
;
10 parse_language_identifier
, parse_language_identifier_with_single_variant
, ParserError
,
11 ParserMode
, SubtagIterator
,
14 use alloc
::string
::String
;
15 use writeable
::Writeable
;
17 /// A core struct representing a [`Unicode BCP47 Language Identifier`].
23 /// langid, subtags_language as language, subtags_region as region,
26 /// let li = langid!("en-US");
28 /// assert_eq!(li.language, language!("en"));
29 /// assert_eq!(li.script, None);
30 /// assert_eq!(li.region, Some(region!("US")));
31 /// assert_eq!(li.variants.len(), 0);
36 /// Unicode recognizes three levels of standard conformance for any language identifier:
38 /// * *well-formed* - syntactically correct
39 /// * *valid* - well-formed and only uses registered language, region, script and variant subtags...
40 /// * *canonical* - valid and no deprecated codes or structure.
42 /// At the moment parsing normalizes a well-formed language identifier converting
43 /// `_` separators to `-` and adjusting casing to conform to the Unicode standard.
45 /// Any bogus subtags will cause the parsing to fail with an error.
46 /// No subtag validation is performed.
52 /// langid, subtags_language as language, subtags_region as region,
53 /// subtags_script as script, subtags_variant as variant,
56 /// let li = langid!("eN_latn_Us-Valencia");
58 /// assert_eq!(li.language, language!("en"));
59 /// assert_eq!(li.script, Some(script!("Latn")));
60 /// assert_eq!(li.region, Some(region!("US")));
61 /// assert_eq!(li.variants.get(0), Some(&variant!("valencia")));
64 /// [`Unicode BCP47 Language Identifier`]: https://unicode.org/reports/tr35/tr35.html#Unicode_language_identifier
65 #[derive(Default, PartialEq, Eq, Clone, Hash)]
66 #[allow(clippy::exhaustive_structs)] // This struct is stable (and invoked by a macro)
67 pub struct LanguageIdentifier
{
68 /// Language subtag of the language identifier.
69 pub language
: subtags
::Language
,
70 /// Script subtag of the language identifier.
71 pub script
: Option
<subtags
::Script
>,
72 /// Region subtag of the language identifier.
73 pub region
: Option
<subtags
::Region
>,
74 /// Variant subtags of the language identifier.
75 pub variants
: subtags
::Variants
,
78 impl LanguageIdentifier
{
79 /// A constructor which takes a utf8 slice, parses it and
80 /// produces a well-formed [`LanguageIdentifier`].
85 /// use icu::locid::LanguageIdentifier;
87 /// LanguageIdentifier::try_from_bytes(b"en-US").expect("Parsing failed");
89 pub fn try_from_bytes(v
: &[u8]) -> Result
<Self, ParserError
> {
90 parse_language_identifier(v
, ParserMode
::LanguageIdentifier
)
94 #[allow(clippy::type_complexity)]
// The return type should be `Result<Self, ParserError>` once the `const_precise_live_drops`
// feature is stabilized ([rust-lang#73255](https://github.com/rust-lang/rust/issues/73255)).
97 pub const fn try_from_bytes_with_single_variant(
102 Option
<subtags
::Script
>,
103 Option
<subtags
::Region
>,
104 Option
<subtags
::Variant
>,
108 parse_language_identifier_with_single_variant(v
, ParserMode
::LanguageIdentifier
)
111 /// A constructor which takes a utf8 slice which may contain extension keys,
112 /// parses it and produces a well-formed [`LanguageIdentifier`].
117 /// use icu::locid::{langid, LanguageIdentifier};
119 /// let li = LanguageIdentifier::try_from_locale_bytes(b"en-US-x-posix")
120 /// .expect("Parsing failed.");
122 /// assert_eq!(li, langid!("en-US"));
125 /// This method should be used for input that may be a locale identifier.
126 /// All extensions will be lost.
127 pub fn try_from_locale_bytes(v
: &[u8]) -> Result
<Self, ParserError
> {
128 parse_language_identifier(v
, ParserMode
::Locale
)
131 /// The default undefined language "und". Same as [`default()`](Default::default()).
136 /// use icu::locid::LanguageIdentifier;
138 /// assert_eq!(LanguageIdentifier::default(), LanguageIdentifier::UND);
140 pub const UND
: Self = Self {
141 language
: subtags
::Language
::UND
,
144 variants
: subtags
::Variants
::new(),
147 /// This is a best-effort operation that performs all available levels of canonicalization.
149 /// At the moment the operation will normalize casing and the separator, but in the future
150 /// it may also validate and update from deprecated subtags to canonical ones.
155 /// use icu::locid::LanguageIdentifier;
158 /// LanguageIdentifier::canonicalize("pL_latn_pl").as_deref(),
162 pub fn canonicalize
<S
: AsRef
<[u8]>>(input
: S
) -> Result
<String
, ParserError
> {
163 let lang_id
= Self::try_from_bytes(input
.as_ref())?
;
164 Ok(lang_id
.write_to_string().into_owned())
167 /// Compare this [`LanguageIdentifier`] with BCP-47 bytes.
169 /// The return value is equivalent to what would happen if you first converted this
170 /// [`LanguageIdentifier`] to a BCP-47 string and then performed a byte comparison.
172 /// This function is case-sensitive and results in a *total order*, so it is appropriate for
173 /// binary search. The only argument producing [`Ordering::Equal`] is `self.to_string()`.
178 /// use icu::locid::LanguageIdentifier;
179 /// use std::cmp::Ordering;
181 /// let bcp47_strings: &[&str] = &[
191 /// for ab in bcp47_strings.windows(2) {
194 /// assert!(a.cmp(b) == Ordering::Less);
195 /// let a_langid = a.parse::<LanguageIdentifier>().unwrap();
196 /// assert!(a_langid.strict_cmp(a.as_bytes()) == Ordering::Equal);
197 /// assert!(a_langid.strict_cmp(b.as_bytes()) == Ordering::Less);
200 pub fn strict_cmp(&self, other
: &[u8]) -> Ordering
{
201 self.strict_cmp_iter(other
.split(|b
| *b
== b'
-'
)).end()
204 /// Compare this [`LanguageIdentifier`] with an iterator of BCP-47 subtags.
206 /// This function has the same equality semantics as [`LanguageIdentifier::strict_cmp`]. It is intended as
207 /// a more modular version that allows multiple subtag iterators to be chained together.
209 /// For an additional example, see [`SubtagOrderingResult`].
214 /// use icu::locid::LanguageIdentifier;
215 /// use std::cmp::Ordering;
217 /// let subtags: &[&[u8]] = &[b"ca", b"ES", b"valencia"];
219 /// let loc = "ca-ES-valencia".parse::<LanguageIdentifier>().unwrap();
222 /// loc.strict_cmp_iter(subtags.iter().copied()).end()
225 /// let loc = "ca-ES".parse::<LanguageIdentifier>().unwrap();
228 /// loc.strict_cmp_iter(subtags.iter().copied()).end()
231 /// let loc = "ca-ZA".parse::<LanguageIdentifier>().unwrap();
233 /// Ordering::Greater,
234 /// loc.strict_cmp_iter(subtags.iter().copied()).end()
237 pub fn strict_cmp_iter
<'l
, I
>(&self, mut subtags
: I
) -> SubtagOrderingResult
<I
>
239 I
: Iterator
<Item
= &'l
[u8]>,
241 let r
= self.for_each_subtag_str(&mut |subtag
| {
242 if let Some(other
) = subtags
.next() {
243 match subtag
.as_bytes().cmp(other
) {
244 Ordering
::Equal
=> Ok(()),
245 not_equal
=> Err(not_equal
),
248 Err(Ordering
::Greater
)
252 Ok(_
) => SubtagOrderingResult
::Subtags(subtags
),
253 Err(o
) => SubtagOrderingResult
::Ordering(o
),
257 /// Compare this `LanguageIdentifier` with a potentially unnormalized BCP-47 string.
259 /// The return value is equivalent to what would happen if you first parsed the
/// BCP-47 string to a `LanguageIdentifier` and then performed a structural comparison.
265 /// use icu::locid::LanguageIdentifier;
266 /// use std::cmp::Ordering;
268 /// let bcp47_strings: &[&str] = &[
277 /// for a in bcp47_strings {
278 /// assert!(a.parse::<LanguageIdentifier>().unwrap().normalizing_eq(a));
281 pub fn normalizing_eq(&self, other
: &str) -> bool
{
282 macro_rules
! subtag_matches
{
283 ($T
:ty
, $iter
:ident
, $expected
:expr
) => {
286 .map(|b
| <$T
>::try_from_bytes(b
) == Ok($expected
))
291 let mut iter
= SubtagIterator
::new(other
.as_bytes());
292 if !subtag_matches
!(subtags
::Language
, iter
, self.language
) {
295 if let Some(ref script
) = self.script
{
296 if !subtag_matches
!(subtags
::Script
, iter
, *script
) {
300 if let Some(ref region
) = self.region
{
301 if !subtag_matches
!(subtags
::Region
, iter
, *region
) {
305 for variant
in self.variants
.iter() {
306 if !subtag_matches
!(subtags
::Variant
, iter
, *variant
) {
313 pub(crate) fn for_each_subtag_str
<E
, F
>(&self, f
: &mut F
) -> Result
<(), E
>
315 F
: FnMut(&str) -> Result
<(), E
>,
317 f(self.language
.as_str())?
;
318 if let Some(ref script
) = self.script
{
321 if let Some(ref region
) = self.region
{
324 for variant
in self.variants
.iter() {
325 f(variant
.as_str())?
;
331 impl AsRef
<LanguageIdentifier
> for LanguageIdentifier
{
332 fn as_ref(&self) -> &Self {
337 impl AsMut
<LanguageIdentifier
> for LanguageIdentifier
{
338 fn as_mut(&mut self) -> &mut Self {
343 impl core
::fmt
::Debug
for LanguageIdentifier
{
344 fn fmt(&self, f
: &mut core
::fmt
::Formatter
) -> core
::fmt
::Result
{
345 core
::fmt
::Display
::fmt(&self, f
)
349 impl FromStr
for LanguageIdentifier
{
350 type Err
= ParserError
;
352 fn from_str(source
: &str) -> Result
<Self, Self::Err
> {
353 Self::try_from_bytes(source
.as_bytes())
357 impl_writeable_for_each_subtag_str_no_test
!(LanguageIdentifier
, selff
, selff
.script
.is_none() && selff
.region
.is_none() && selff
.variants
.is_empty() => selff
.language
.write_to_string());
360 fn test_writeable() {
361 use writeable
::assert_writeable_eq
;
362 assert_writeable_eq
!(LanguageIdentifier
::UND
, "und");
363 assert_writeable_eq
!("und-001".parse
::<LanguageIdentifier
>().unwrap(), "und-001");
364 assert_writeable_eq
!(
365 "und-Mymr".parse
::<LanguageIdentifier
>().unwrap(),
368 assert_writeable_eq
!(
369 "my-Mymr-MM".parse
::<LanguageIdentifier
>().unwrap(),
372 assert_writeable_eq
!(
373 "my-Mymr-MM-posix".parse
::<LanguageIdentifier
>().unwrap(),
376 assert_writeable_eq
!(
377 "zh-macos-posix".parse
::<LanguageIdentifier
>().unwrap(),
385 /// use icu::locid::{
386 /// langid, subtags_language as language, LanguageIdentifier,
389 /// assert_eq!(LanguageIdentifier::from(language!("en")), langid!("en"));
391 impl From
<subtags
::Language
> for LanguageIdentifier
{
392 fn from(language
: subtags
::Language
) -> Self {
403 /// use icu::locid::{langid, subtags_script as script, LanguageIdentifier};
406 /// LanguageIdentifier::from(Some(script!("latn"))),
407 /// langid!("und-Latn")
410 impl From
<Option
<subtags
::Script
>> for LanguageIdentifier
{
411 fn from(script
: Option
<subtags
::Script
>) -> Self {
422 /// use icu::locid::{langid, subtags_region as region, LanguageIdentifier};
425 /// LanguageIdentifier::from(Some(region!("US"))),
426 /// langid!("und-US")
429 impl From
<Option
<subtags
::Region
>> for LanguageIdentifier
{
430 fn from(region
: Option
<subtags
::Region
>) -> Self {
438 /// Convert from an LSR tuple to a [`LanguageIdentifier`].
443 /// use icu::locid::{
444 /// langid, subtags_language as language, subtags_region as region,
445 /// subtags_script as script, LanguageIdentifier,
448 /// let lang = language!("en");
449 /// let script = script!("Latn");
450 /// let region = region!("US");
452 /// LanguageIdentifier::from((lang, Some(script), Some(region))),
453 /// langid!("en-Latn-US")
459 Option
<subtags
::Script
>,
460 Option
<subtags
::Region
>,
461 )> for LanguageIdentifier
466 Option
<subtags
::Script
>,
467 Option
<subtags
::Region
>,
479 /// Convert from a [`LanguageIdentifier`] to an LSR tuple.
484 /// use icu::locid::{
485 /// langid, subtags_language as language, subtags_region as region,
486 /// subtags_script as script,
489 /// let lid = langid!("en-Latn-US");
490 /// let (lang, script, region) = (&lid).into();
492 /// assert_eq!(lang, language!("en"));
493 /// assert_eq!(script, Some(script!("Latn")));
494 /// assert_eq!(region, Some(region!("US")));
496 impl From
<&LanguageIdentifier
>
499 Option
<subtags
::Script
>,
500 Option
<subtags
::Region
>,
503 fn from(langid
: &LanguageIdentifier
) -> Self {
504 (langid
.language
, langid
.script
, langid
.region
)