1 // This file is part of ICU4X. For terms of use, please see the file
2 // called LICENSE at the top level of the ICU4X source tree
3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
5 use crate::ordering
::SubtagOrderingResult
;
7 parse_locale
, parse_locale_with_single_variant_single_keyword_unicode_keyword_extension
,
8 ParserError
, ParserMode
, SubtagIterator
,
10 use crate::{extensions, subtags, LanguageIdentifier}
;
11 use alloc
::string
::String
;
12 use core
::cmp
::Ordering
;
13 use core
::str::FromStr
;
14 use tinystr
::TinyAsciiStr
;
15 use writeable
::Writeable
;
17 /// A core struct representing a [`Unicode Locale Identifier`].
19 /// A locale is made of two parts:
20 /// * Unicode Language Identifier
21 /// * A set of Unicode Extensions
23 /// [`Locale`] exposes all of the same fields and methods as [`LanguageIdentifier`], and
24 /// on top of that is able to parse, manipulate and serialize unicode extension fields.
31 /// extensions_unicode_key as key, extensions_unicode_value as value,
32 /// locale, subtags_language as language, subtags_region as region,
35 /// let loc = locale!("en-US-u-ca-buddhist");
37 /// assert_eq!(loc.id.language, language!("en"));
38 /// assert_eq!(loc.id.script, None);
39 /// assert_eq!(loc.id.region, Some(region!("US")));
40 /// assert_eq!(loc.id.variants.len(), 0);
42 /// loc.extensions.unicode.keywords.get(&key!("ca")),
43 /// Some(&value!("buddhist"))
49 /// Unicode recognizes three levels of standard conformance for a locale:
51 /// * *well-formed* - syntactically correct
52 /// * *valid* - well-formed and only uses registered language subtags, extensions, keywords, types...
53 /// * *canonical* - valid and no deprecated codes or structure.
55 /// At the moment parsing normalizes a well-formed locale identifier converting
56 /// `_` separators to `-` and adjusting casing to conform to the Unicode standard.
58 /// Any bogus subtags will cause the parsing to fail with an error.
59 /// No subtag validation or canonicalization is performed.
64 /// use icu::locid::{subtags::*, Locale};
66 /// let loc: Locale = "eN_latn_Us-Valencia_u-hC-H12"
68 /// .expect("Failed to parse.");
70 /// assert_eq!(loc.id.language, "en".parse::<Language>().unwrap());
71 /// assert_eq!(loc.id.script, "Latn".parse::<Script>().ok());
72 /// assert_eq!(loc.id.region, "US".parse::<Region>().ok());
74 /// loc.id.variants.get(0),
75 /// "valencia".parse::<Variant>().ok().as_ref()
78 /// [`Unicode Locale Identifier`]: https://unicode.org/reports/tr35/tr35.html#Unicode_locale_identifier
// NOTE: `Debug` is not part of this derive list; this file provides a manual
// `core::fmt::Debug` impl for `Locale` that forwards to `Writeable::write_to`.
79 #[derive(Default, PartialEq, Eq, Clone, Hash)]
80 #[allow(clippy::exhaustive_structs)] // This struct is stable (and invoked by a macro)
82 /// The basic language/script/region components in the locale identifier along with any variants.
83 pub id
: LanguageIdentifier
,
84 /// Any extensions present in the locale identifier.
85 pub extensions
: extensions
::Extensions
,
// Asserts the exact `core::mem::size_of` (in bytes) of each locale building block;
// any change to the layout of these types makes the corresponding assertion fail.
90 // Remove when we upgrade to a compiler where the new sizes are default
// `forced_nightly` is true whenever the environment variable is set at all: only
// `is_ok()` is checked, the value itself is never read. It selects the alternate
// expected sizes used for `Keywords`, `Extensions` and `Locale` below.
91 let forced_nightly
= std
::env
::var("ICU4X_BUILDING_WITH_FORCED_NIGHTLY").is_ok();
// Individual subtags of the language identifier.
92 assert_eq
!(core
::mem
::size_of
::<subtags
::Language
>(), 3);
93 assert_eq
!(core
::mem
::size_of
::<subtags
::Script
>(), 4);
94 assert_eq
!(core
::mem
::size_of
::<subtags
::Region
>(), 3);
95 assert_eq
!(core
::mem
::size_of
::<subtags
::Variant
>(), 8);
96 assert_eq
!(core
::mem
::size_of
::<subtags
::Variants
>(), 32);
97 assert_eq
!(core
::mem
::size_of
::<LanguageIdentifier
>(), 48);
// Transform extension and its parts.
99 assert_eq
!(core
::mem
::size_of
::<extensions
::transform
::Transform
>(), 72);
// Same 48 bytes as the bare `LanguageIdentifier` asserted above, i.e. the `Option`
// wrapper is expected to add no size.
100 assert_eq
!(core
::mem
::size_of
::<Option
<LanguageIdentifier
>>(), 48);
101 assert_eq
!(core
::mem
::size_of
::<extensions
::transform
::Fields
>(), 24);
// Unicode extension parts.
103 assert_eq
!(core
::mem
::size_of
::<extensions
::unicode
::Attributes
>(), 24);
// `Keywords`: expected size is 40 when `forced_nightly` is set, 48 otherwise.
105 core
::mem
::size_of
::<extensions
::unicode
::Keywords
>(),
106 if forced_nightly { 40 }
else { 48 }
108 assert_eq
!(core
::mem
::size_of
::<Vec
<extensions
::other
::Other
>>(), 24);
109 assert_eq
!(core
::mem
::size_of
::<extensions
::private
::Private
>(), 24);
// `Extensions` as a whole: 184 when `forced_nightly` is set, 192 otherwise.
111 core
::mem
::size_of
::<extensions
::Extensions
>(),
112 if forced_nightly { 184 }
else { 192 }
// The full `Locale`: 232 when `forced_nightly` is set, 240 otherwise.
116 core
::mem
::size_of
::<Locale
>(),
117 if forced_nightly { 232 }
else { 240 }
122 /// A constructor which takes a utf8 slice, parses it and
123 /// produces a well-formed [`Locale`].
128 /// use icu::locid::Locale;
130 /// Locale::try_from_bytes(b"en-US-u-hc-h12").unwrap();
132 pub fn try_from_bytes(v
: &[u8]) -> Result
<Self, ParserError
> {
136 /// The default undefined locale "und". Same as [`default()`](Default::default()).
141 /// use icu::locid::Locale;
143 /// assert_eq!(Locale::default(), Locale::UND);
// Built field by field from `LanguageIdentifier::UND` plus `Extensions::new()`.
// NOTE(review): `new()` is used here rather than `Default::default()`, presumably
// because this initializer must be usable in a `const` context - confirm.
145 pub const UND
: Self = Self {
146 id
: LanguageIdentifier
::UND
,
147 extensions
: extensions
::Extensions
::new(),
150 /// This is a best-effort operation that performs all available levels of canonicalization.
152 /// At the moment the operation will normalize casing and the separator, but in the future
153 /// it may also validate and update from deprecated subtags to canonical ones.
158 /// use icu::locid::Locale;
161 /// Locale::canonicalize("pL_latn_pl-U-HC-H12").as_deref(),
162 /// Ok("pl-Latn-PL-u-hc-h12")
// Parses `input` (anything viewable as a byte slice) and serializes the parsed
// locale back to a string. A parse failure from `try_from_bytes` is returned to
// the caller unchanged via `?`.
165 pub fn canonicalize
<S
: AsRef
<[u8]>>(input
: S
) -> Result
<String
, ParserError
> {
// Step 1: parse; bail out early with the `ParserError` on failure.
166 let locale
= Self::try_from_bytes(input
.as_ref())?
;
// Step 2: write the parsed locale out; `into_owned()` turns the result of
// `write_to_string()` into the owned `String` that is returned.
167 Ok(locale
.write_to_string().into_owned())
170 /// Compare this [`Locale`] with BCP-47 bytes.
172 /// The return value is equivalent to what would happen if you first converted this
173 /// [`Locale`] to a BCP-47 string and then performed a byte comparison.
175 /// This function is case-sensitive and results in a *total order*, so it is appropriate for
176 /// binary search. The only argument producing [`Ordering::Equal`] is `self.to_string()`.
181 /// use icu::locid::Locale;
182 /// use std::cmp::Ordering;
184 /// let bcp47_strings: &[&str] = &[
189 /// "und-u-ca-hebrew",
190 /// "und-u-ca-japanese",
194 /// for ab in bcp47_strings.windows(2) {
197 /// assert!(a.cmp(b) == Ordering::Less);
198 /// let a_loc = a.parse::<Locale>().unwrap();
199 /// assert!(a_loc.strict_cmp(a.as_bytes()) == Ordering::Equal);
200 /// assert!(a_loc.strict_cmp(b.as_bytes()) == Ordering::Less);
// Thin wrapper around `strict_cmp_iter`: `other` is split on the `-` byte only,
// the resulting subtag slices are compared one by one, and `.end()` collapses the
// `SubtagOrderingResult` into the returned `Ordering`.
203 pub fn strict_cmp(&self, other
: &[u8]) -> Ordering
{
204 self.strict_cmp_iter(other
.split(|b
| *b
== b'
-'
)).end()
207 /// Compare this [`Locale`] with an iterator of BCP-47 subtags.
209 /// This function has the same equality semantics as [`Locale::strict_cmp`]. It is intended as
210 /// a more modular version that allows multiple subtag iterators to be chained together.
212 /// For an additional example, see [`SubtagOrderingResult`].
217 /// use icu::locid::locale;
218 /// use std::cmp::Ordering;
220 /// let subtags: &[&[u8]] =
221 /// &[b"ca", b"ES", b"valencia", b"u", b"ca", b"hebrew"];
223 /// let loc = locale!("ca-ES-valencia-u-ca-hebrew");
226 /// loc.strict_cmp_iter(subtags.iter().copied()).end()
229 /// let loc = locale!("ca-ES-valencia");
232 /// loc.strict_cmp_iter(subtags.iter().copied()).end()
235 /// let loc = locale!("ca-ES-valencia-u-nu-arab");
237 /// Ordering::Greater,
238 /// loc.strict_cmp_iter(subtags.iter().copied()).end()
// Compares this locale's own subtags, in the order `for_each_subtag_str` yields
// them, against the byte slices produced by `subtags`. The iterator is taken by
// value (`mut subtags`) and handed back inside the result when nothing differed.
241 pub fn strict_cmp_iter
<'l
, I
>(&self, mut subtags
: I
) -> SubtagOrderingResult
<I
>
243 I
: Iterator
<Item
= &'l
[u8]>,
// The closure's `Err` channel carries an `Ordering`: returning `Err` is how the
// walk over our own subtags reports the deciding comparison.
245 let r
= self.for_each_subtag_str(&mut |subtag
| {
// Pull exactly one subtag from the other side for each of our own subtags.
246 if let Some(other
) = subtags
.next() {
// Plain byte-wise comparison of the two subtags.
247 match subtag
.as_bytes().cmp(other
) {
// Equal so far: keep walking.
248 Ordering
::Equal
=> Ok(()),
// First differing subtag decides the overall ordering.
249 not_equal
=> Err(not_equal
),
// NOTE(review): presumably the `else` arm of the `if let` above (that line is not
// visible here): the other side ran out of subtags while we still have one, which
// is reported as `Greater`.
252 Err(Ordering
::Greater
)
// No difference found over all of our subtags: return the partially consumed
// iterator so the caller can inspect whatever subtags remain in it.
256 Ok(_
) => SubtagOrderingResult
::Subtags(subtags
),
// A difference was found: return the ordering carried out through `Err`.
257 Err(o
) => SubtagOrderingResult
::Ordering(o
),
261 /// Compare this `Locale` with a potentially unnormalized BCP-47 string.
263 /// The return value is equivalent to what would happen if you first parsed the
264 /// BCP-47 string to a `Locale` and then performed a structural comparison.
269 /// use icu::locid::Locale;
270 /// use std::cmp::Ordering;
272 /// let bcp47_strings: &[&str] = &[
277 /// "uNd-u-CA-Japanese",
281 /// for a in bcp47_strings {
282 /// assert!(a.parse::<Locale>().unwrap().normalizing_eq(a));
// Walks the subtags of `other` in order and checks them against the parts of
// `self` (language, script, region, variants, extensions). The comparison is made
// on parsed subtag values, not on the raw bytes of `other`.
285 pub fn normalizing_eq(&self, other
: &str) -> bool
{
// Local helper macro. The visible part parses a subtag's bytes as `$T` with
// `try_from_bytes` and compares the result against `Ok($expected)`, so a subtag
// that fails to parse never counts as a match.
286 macro_rules
! subtag_matches
{
287 ($T
:ty
, $iter
:ident
, $expected
:expr
) => {
290 .map(|b
| <$T
>::try_from_bytes(b
) == Ok($expected
))
// Single iterator over the subtags of `other`; every check below advances it.
295 let mut iter
= SubtagIterator
::new(other
.as_bytes());
// Language is always checked.
296 if !subtag_matches
!(subtags
::Language
, iter
, self.id
.language
) {
// Script is only checked (and a subtag only consumed) when `self` has one.
299 if let Some(ref script
) = self.id
.script
{
300 if !subtag_matches
!(subtags
::Script
, iter
, *script
) {
// Likewise for region.
304 if let Some(ref region
) = self.id
.region
{
305 if !subtag_matches
!(subtags
::Region
, iter
, *region
) {
// One subtag of `other` is checked per variant of `self`, in `self`'s order.
309 for variant
in self.id
.variants
.iter() {
310 if !subtag_matches
!(subtags
::Variant
, iter
, *variant
) {
// Extensions are parsed out of the remaining subtags only when `self` has any;
// the parsed value is then compared with `self.extensions` using `!=`.
314 if !self.extensions
.is_empty() {
315 match extensions
::Extensions
::try_from_iter(&mut iter
) {
317 if self.extensions
!= exts
{
// NOTE(review): the lint is presumably silenced because of the tuple of several
// `Option<...>` parts listed below (script, region, variant, unicode key with an
// optional 8-byte value), which appears to be this function's return type - confirm.
330 #[allow(clippy::type_complexity)]
331 pub const fn try_from_bytes_with_single_variant_single_keyword_unicode_extension(
336 Option
<subtags
::Script
>,
337 Option
<subtags
::Region
>,
338 Option
<subtags
::Variant
>,
339 Option
<(extensions
::unicode
::Key
, Option
<TinyAsciiStr
<8>>)>,
// Calls the parser helper of the same purpose that this file imports by name.
343 parse_locale_with_single_variant_single_keyword_unicode_keyword_extension(
// Feeds the callback `f` first to the language identifier and then to the
// extensions, in that order. `f` is fallible: the `?` after each call stops the
// walk and returns the first `Err(E)` that `f` produces.
349 pub(crate) fn for_each_subtag_str
<E
, F
>(&self, f
: &mut F
) -> Result
<(), E
>
351 F
: FnMut(&str) -> Result
<(), E
>,
// Subtags of the language identifier part.
353 self.id
.for_each_subtag_str(f
)?
;
// Subtags of the extensions, visited only if the call above returned `Ok`.
354 self.extensions
.for_each_subtag_str(f
)?
;
// Enables `"...".parse::<Locale>()`. Parsing is delegated to `try_from_bytes` on
// the string's bytes, so the error type is the same `ParserError`.
359 impl FromStr
for Locale
{
360 type Err
= ParserError
;
362 fn from_str(source
: &str) -> Result
<Self, Self::Err
> {
363 Self::try_from_bytes(source
.as_bytes())
// Wraps a bare `LanguageIdentifier` in a `Locale`; the `extensions` field is
// filled with `Extensions::default()`.
367 impl From
<LanguageIdentifier
> for Locale
{
368 fn from(id
: LanguageIdentifier
) -> Self {
371 extensions
: extensions
::Extensions
::default(),
376 impl From
<Locale
> for LanguageIdentifier
{
377 fn from(loc
: Locale
) -> Self {
382 impl AsRef
<LanguageIdentifier
> for Locale
{
383 fn as_ref(&self) -> &LanguageIdentifier
{
388 impl AsMut
<LanguageIdentifier
> for Locale
{
389 fn as_mut(&mut self) -> &mut LanguageIdentifier
{
// Manual `Debug` impl: formatting is forwarded to `Writeable::write_to`, so the
// debug output is whatever the `Writeable` impl of `Locale` writes, not a
// field-by-field struct dump.
394 impl core
::fmt
::Debug
for Locale
{
395 fn fmt(&self, f
: &mut core
::fmt
::Formatter
) -> core
::fmt
::Result
{
396 writeable
::Writeable
::write_to(self, f
)
// NOTE(review): the macro definition is not visible in this file. Judging by its
// name it presumably generates the `Writeable` impl for `Locale` on top of
// `for_each_subtag_str`, without the macro's own test. The `cond => expr` argument
// looks like a special case: when the locale has no extensions, use the language
// identifier's `write_to_string()` directly - confirm against the macro definition.
400 impl_writeable_for_each_subtag_str_no_test
!(Locale
, selff
, selff
.extensions
.is_empty() => selff
.id
.write_to_string());
// Checks the written form of `Locale` values: apart from the `UND` constant, each
// case parses a BCP-47 string into a `Locale` and passes it to
// `assert_writeable_eq!` together with the expected text.
403 fn test_writeable() {
404 use writeable
::assert_writeable_eq
;
// Language identifier only: the undefined locale, then region / script / both.
405 assert_writeable_eq
!(Locale
::UND
, "und");
406 assert_writeable_eq
!("und-001".parse
::<Locale
>().unwrap(), "und-001");
407 assert_writeable_eq
!("und-Mymr".parse
::<Locale
>().unwrap(), "und-Mymr");
408 assert_writeable_eq
!("my-Mymr-MM".parse
::<Locale
>().unwrap(), "my-Mymr-MM");
// Inputs carrying one and then two variant subtags.
409 assert_writeable_eq
!(
410 "my-Mymr-MM-posix".parse
::<Locale
>().unwrap(),
413 assert_writeable_eq
!(
414 "zh-macos-posix".parse
::<Locale
>().unwrap(),
// Input with a `-t-` (transform) extension.
417 assert_writeable_eq
!(
418 "my-t-my-d0-zawgyi".parse
::<Locale
>().unwrap(),
// Input with a `-u-` (unicode) extension whose keyword value has two subtags.
421 assert_writeable_eq
!(
422 "ar-SA-u-ca-islamic-civil".parse
::<Locale
>().unwrap(),
423 "ar-SA-u-ca-islamic-civil",
// Input with a `-x-` (private use) extension.
425 assert_writeable_eq
!(
426 "en-001-x-foo-bar".parse
::<Locale
>().unwrap(),
// Transform extension field with the value `true`; the expected text keeps it.
429 assert_writeable_eq
!("und-t-m0-true".parse
::<Locale
>().unwrap(), "und-t-m0-true",);
435 /// use icu::locid::Locale;
436 /// use icu::locid::{locale, subtags_language as language};
438 /// assert_eq!(Locale::from(language!("en")), locale!("en"));
440 impl From
<subtags
::Language
> for Locale
{
441 fn from(language
: subtags
::Language
) -> Self {
452 /// use icu::locid::Locale;
453 /// use icu::locid::{locale, subtags_script as script};
455 /// assert_eq!(Locale::from(Some(script!("latn"))), locale!("und-Latn"));
457 impl From
<Option
<subtags
::Script
>> for Locale
{
458 fn from(script
: Option
<subtags
::Script
>) -> Self {
469 /// use icu::locid::Locale;
470 /// use icu::locid::{locale, subtags_region as region};
472 /// assert_eq!(Locale::from(Some(region!("US"))), locale!("und-US"));
474 impl From
<Option
<subtags
::Region
>> for Locale
{
475 fn from(region
: Option
<subtags
::Region
>) -> Self {
486 /// use icu::locid::Locale;
487 /// use icu::locid::{
488 /// locale, subtags_language as language, subtags_region as region,
489 /// subtags_script as script,
495 /// Some(script!("Latn")),
496 /// Some(region!("US"))
498 /// locale!("en-Latn-US")
504 Option
<subtags
::Script
>,
505 Option
<subtags
::Region
>,
511 Option
<subtags
::Script
>,
512 Option
<subtags
::Region
>,