]> git.proxmox.com Git - rustc.git/blob - vendor/icu_locid/src/locale.rs
New upstream version 1.69.0+dfsg1
[rustc.git] / vendor / icu_locid / src / locale.rs
1 // This file is part of ICU4X. For terms of use, please see the file
2 // called LICENSE at the top level of the ICU4X source tree
3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5 use crate::ordering::SubtagOrderingResult;
6 use crate::parser::{
7 parse_locale, parse_locale_with_single_variant_single_keyword_unicode_keyword_extension,
8 ParserError, ParserMode, SubtagIterator,
9 };
10 use crate::{extensions, subtags, LanguageIdentifier};
11 use alloc::string::String;
12 use core::cmp::Ordering;
13 use core::str::FromStr;
14 use tinystr::TinyAsciiStr;
15 use writeable::Writeable;
16
17 /// A core struct representing a [`Unicode Locale Identifier`].
18 ///
19 /// A locale is made of two parts:
20 /// * Unicode Language Identifier
21 /// * A set of Unicode Extensions
22 ///
23 /// [`Locale`] exposes all of the same fields and methods as [`LanguageIdentifier`], and
24 /// on top of that is able to parse, manipulate and serialize unicode extension fields.
25 ///
26 ///
27 /// # Examples
28 ///
29 /// ```
30 /// use icu_locid::{
31 /// extensions_unicode_key as key, extensions_unicode_value as value,
32 /// locale, subtags_language as language, subtags_region as region,
33 /// };
34 ///
35 /// let loc = locale!("en-US-u-ca-buddhist");
36 ///
37 /// assert_eq!(loc.id.language, language!("en"));
38 /// assert_eq!(loc.id.script, None);
39 /// assert_eq!(loc.id.region, Some(region!("US")));
40 /// assert_eq!(loc.id.variants.len(), 0);
41 /// assert_eq!(
42 /// loc.extensions.unicode.keywords.get(&key!("ca")),
43 /// Some(&value!("buddhist"))
44 /// );
45 /// ```
46 ///
47 /// # Parsing
48 ///
49 /// Unicode recognizes three levels of standard conformance for a locale:
50 ///
51 /// * *well-formed* - syntactically correct
52 /// * *valid* - well-formed and only uses registered language subtags, extensions, keywords, types...
53 /// * *canonical* - valid and no deprecated codes or structure.
54 ///
55 /// At the moment parsing normalizes a well-formed locale identifier converting
56 /// `_` separators to `-` and adjusting casing to conform to the Unicode standard.
57 ///
58 /// Any bogus subtags will cause the parsing to fail with an error.
59 /// No subtag validation or canonicalization is performed.
60 ///
61 /// # Examples
62 ///
63 /// ```
64 /// use icu::locid::{subtags::*, Locale};
65 ///
66 /// let loc: Locale = "eN_latn_Us-Valencia_u-hC-H12"
67 /// .parse()
68 /// .expect("Failed to parse.");
69 ///
70 /// assert_eq!(loc.id.language, "en".parse::<Language>().unwrap());
71 /// assert_eq!(loc.id.script, "Latn".parse::<Script>().ok());
72 /// assert_eq!(loc.id.region, "US".parse::<Region>().ok());
73 /// assert_eq!(
74 /// loc.id.variants.get(0),
75 /// "valencia".parse::<Variant>().ok().as_ref()
76 /// );
77 /// ```
78 /// [`Unicode Locale Identifier`]: https://unicode.org/reports/tr35/tr35.html#Unicode_locale_identifier
79 #[derive(Default, PartialEq, Eq, Clone, Hash)]
80 #[allow(clippy::exhaustive_structs)] // This struct is stable (and invoked by a macro)
81 pub struct Locale {
82 /// The basic language/script/region components in the locale identifier along with any variants.
83 pub id: LanguageIdentifier,
84 /// Any extensions present in the locale identifier.
85 pub extensions: extensions::Extensions,
86 }
87
/// Pins the in-memory sizes of the locale-related types so that accidental
/// layout growth is caught by the test suite.
#[test]
fn test_sizes() {
    use core::mem::size_of;

    // Remove when we upgrade to a compiler where the new sizes are default
    let forced_nightly = std::env::var("ICU4X_BUILDING_WITH_FORCED_NIGHTLY").is_ok();
    // Selects the expected size depending on whether the forced-nightly variable is set.
    let expected = |nightly: usize, default: usize| if forced_nightly { nightly } else { default };

    assert_eq!(size_of::<subtags::Language>(), 3);
    assert_eq!(size_of::<subtags::Script>(), 4);
    assert_eq!(size_of::<subtags::Region>(), 3);
    assert_eq!(size_of::<subtags::Variant>(), 8);
    assert_eq!(size_of::<subtags::Variants>(), 32);
    assert_eq!(size_of::<LanguageIdentifier>(), 48);

    assert_eq!(size_of::<extensions::transform::Transform>(), 72);
    assert_eq!(size_of::<Option<LanguageIdentifier>>(), 48);
    assert_eq!(size_of::<extensions::transform::Fields>(), 24);

    assert_eq!(size_of::<extensions::unicode::Attributes>(), 24);
    assert_eq!(size_of::<extensions::unicode::Keywords>(), expected(40, 48));
    assert_eq!(size_of::<Vec<extensions::other::Other>>(), 24);
    assert_eq!(size_of::<extensions::private::Private>(), 24);
    assert_eq!(size_of::<extensions::Extensions>(), expected(184, 192));

    assert_eq!(size_of::<Locale>(), expected(232, 240));
}
120
121 impl Locale {
122 /// A constructor which takes a utf8 slice, parses it and
123 /// produces a well-formed [`Locale`].
124 ///
125 /// # Examples
126 ///
127 /// ```
128 /// use icu::locid::Locale;
129 ///
130 /// Locale::try_from_bytes(b"en-US-u-hc-h12").unwrap();
131 /// ```
132 pub fn try_from_bytes(v: &[u8]) -> Result<Self, ParserError> {
133 parse_locale(v)
134 }
135
136 /// The default undefined locale "und". Same as [`default()`](Default::default()).
137 ///
138 /// # Examples
139 ///
140 /// ```
141 /// use icu::locid::Locale;
142 ///
143 /// assert_eq!(Locale::default(), Locale::UND);
144 /// ```
145 pub const UND: Self = Self {
146 id: LanguageIdentifier::UND,
147 extensions: extensions::Extensions::new(),
148 };
149
150 /// This is a best-effort operation that performs all available levels of canonicalization.
151 ///
152 /// At the moment the operation will normalize casing and the separator, but in the future
153 /// it may also validate and update from deprecated subtags to canonical ones.
154 ///
155 /// # Examples
156 ///
157 /// ```
158 /// use icu::locid::Locale;
159 ///
160 /// assert_eq!(
161 /// Locale::canonicalize("pL_latn_pl-U-HC-H12").as_deref(),
162 /// Ok("pl-Latn-PL-u-hc-h12")
163 /// );
164 /// ```
165 pub fn canonicalize<S: AsRef<[u8]>>(input: S) -> Result<String, ParserError> {
166 let locale = Self::try_from_bytes(input.as_ref())?;
167 Ok(locale.write_to_string().into_owned())
168 }
169
170 /// Compare this [`Locale`] with BCP-47 bytes.
171 ///
172 /// The return value is equivalent to what would happen if you first converted this
173 /// [`Locale`] to a BCP-47 string and then performed a byte comparison.
174 ///
175 /// This function is case-sensitive and results in a *total order*, so it is appropriate for
176 /// binary search. The only argument producing [`Ordering::Equal`] is `self.to_string()`.
177 ///
178 /// # Examples
179 ///
180 /// ```
181 /// use icu::locid::Locale;
182 /// use std::cmp::Ordering;
183 ///
184 /// let bcp47_strings: &[&str] = &[
185 /// "pl-Latn-PL",
186 /// "und",
187 /// "und-fonipa",
188 /// "und-t-m0-true",
189 /// "und-u-ca-hebrew",
190 /// "und-u-ca-japanese",
191 /// "zh",
192 /// ];
193 ///
194 /// for ab in bcp47_strings.windows(2) {
195 /// let a = ab[0];
196 /// let b = ab[1];
197 /// assert!(a.cmp(b) == Ordering::Less);
198 /// let a_loc = a.parse::<Locale>().unwrap();
199 /// assert!(a_loc.strict_cmp(a.as_bytes()) == Ordering::Equal);
200 /// assert!(a_loc.strict_cmp(b.as_bytes()) == Ordering::Less);
201 /// }
202 /// ```
203 pub fn strict_cmp(&self, other: &[u8]) -> Ordering {
204 self.strict_cmp_iter(other.split(|b| *b == b'-')).end()
205 }
206
207 /// Compare this [`Locale`] with an iterator of BCP-47 subtags.
208 ///
209 /// This function has the same equality semantics as [`Locale::strict_cmp`]. It is intended as
210 /// a more modular version that allows multiple subtag iterators to be chained together.
211 ///
212 /// For an additional example, see [`SubtagOrderingResult`].
213 ///
214 /// # Examples
215 ///
216 /// ```
217 /// use icu::locid::locale;
218 /// use std::cmp::Ordering;
219 ///
220 /// let subtags: &[&[u8]] =
221 /// &[b"ca", b"ES", b"valencia", b"u", b"ca", b"hebrew"];
222 ///
223 /// let loc = locale!("ca-ES-valencia-u-ca-hebrew");
224 /// assert_eq!(
225 /// Ordering::Equal,
226 /// loc.strict_cmp_iter(subtags.iter().copied()).end()
227 /// );
228 ///
229 /// let loc = locale!("ca-ES-valencia");
230 /// assert_eq!(
231 /// Ordering::Less,
232 /// loc.strict_cmp_iter(subtags.iter().copied()).end()
233 /// );
234 ///
235 /// let loc = locale!("ca-ES-valencia-u-nu-arab");
236 /// assert_eq!(
237 /// Ordering::Greater,
238 /// loc.strict_cmp_iter(subtags.iter().copied()).end()
239 /// );
240 /// ```
241 pub fn strict_cmp_iter<'l, I>(&self, mut subtags: I) -> SubtagOrderingResult<I>
242 where
243 I: Iterator<Item = &'l [u8]>,
244 {
245 let r = self.for_each_subtag_str(&mut |subtag| {
246 if let Some(other) = subtags.next() {
247 match subtag.as_bytes().cmp(other) {
248 Ordering::Equal => Ok(()),
249 not_equal => Err(not_equal),
250 }
251 } else {
252 Err(Ordering::Greater)
253 }
254 });
255 match r {
256 Ok(_) => SubtagOrderingResult::Subtags(subtags),
257 Err(o) => SubtagOrderingResult::Ordering(o),
258 }
259 }
260
261 /// Compare this `Locale` with a potentially unnormalized BCP-47 string.
262 ///
263 /// The return value is equivalent to what would happen if you first parsed the
264 /// BCP-47 string to a `Locale` and then performed a structucal comparison.
265 ///
266 /// # Examples
267 ///
268 /// ```
269 /// use icu::locid::Locale;
270 /// use std::cmp::Ordering;
271 ///
272 /// let bcp47_strings: &[&str] = &[
273 /// "pl-LaTn-pL",
274 /// "uNd",
275 /// "UND-FONIPA",
276 /// "UnD-t-m0-TrUe",
277 /// "uNd-u-CA-Japanese",
278 /// "ZH",
279 /// ];
280 ///
281 /// for a in bcp47_strings {
282 /// assert!(a.parse::<Locale>().unwrap().normalizing_eq(a));
283 /// }
284 /// ```
285 pub fn normalizing_eq(&self, other: &str) -> bool {
286 macro_rules! subtag_matches {
287 ($T:ty, $iter:ident, $expected:expr) => {
288 $iter
289 .next()
290 .map(|b| <$T>::try_from_bytes(b) == Ok($expected))
291 .unwrap_or(false)
292 };
293 }
294
295 let mut iter = SubtagIterator::new(other.as_bytes());
296 if !subtag_matches!(subtags::Language, iter, self.id.language) {
297 return false;
298 }
299 if let Some(ref script) = self.id.script {
300 if !subtag_matches!(subtags::Script, iter, *script) {
301 return false;
302 }
303 }
304 if let Some(ref region) = self.id.region {
305 if !subtag_matches!(subtags::Region, iter, *region) {
306 return false;
307 }
308 }
309 for variant in self.id.variants.iter() {
310 if !subtag_matches!(subtags::Variant, iter, *variant) {
311 return false;
312 }
313 }
314 if !self.extensions.is_empty() {
315 match extensions::Extensions::try_from_iter(&mut iter) {
316 Ok(exts) => {
317 if self.extensions != exts {
318 return false;
319 }
320 }
321 Err(_) => {
322 return false;
323 }
324 }
325 }
326 iter.next() == None
327 }
328
329 #[doc(hidden)]
330 #[allow(clippy::type_complexity)]
331 pub const fn try_from_bytes_with_single_variant_single_keyword_unicode_extension(
332 v: &[u8],
333 ) -> Result<
334 (
335 subtags::Language,
336 Option<subtags::Script>,
337 Option<subtags::Region>,
338 Option<subtags::Variant>,
339 Option<(extensions::unicode::Key, Option<TinyAsciiStr<8>>)>,
340 ),
341 ParserError,
342 > {
343 parse_locale_with_single_variant_single_keyword_unicode_keyword_extension(
344 v,
345 ParserMode::Locale,
346 )
347 }
348
349 pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
350 where
351 F: FnMut(&str) -> Result<(), E>,
352 {
353 self.id.for_each_subtag_str(f)?;
354 self.extensions.for_each_subtag_str(f)?;
355 Ok(())
356 }
357 }
358
359 impl FromStr for Locale {
360 type Err = ParserError;
361
362 fn from_str(source: &str) -> Result<Self, Self::Err> {
363 Self::try_from_bytes(source.as_bytes())
364 }
365 }
366
367 impl From<LanguageIdentifier> for Locale {
368 fn from(id: LanguageIdentifier) -> Self {
369 Self {
370 id,
371 extensions: extensions::Extensions::default(),
372 }
373 }
374 }
375
376 impl From<Locale> for LanguageIdentifier {
377 fn from(loc: Locale) -> Self {
378 loc.id
379 }
380 }
381
382 impl AsRef<LanguageIdentifier> for Locale {
383 fn as_ref(&self) -> &LanguageIdentifier {
384 &self.id
385 }
386 }
387
388 impl AsMut<LanguageIdentifier> for Locale {
389 fn as_mut(&mut self) -> &mut LanguageIdentifier {
390 &mut self.id
391 }
392 }
393
394 impl core::fmt::Debug for Locale {
395 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
396 writeable::Writeable::write_to(self, f)
397 }
398 }
399
// NOTE(review): `impl_writeable_for_each_subtag_str_no_test!` is defined outside this file.
// Presumably it generates the `Writeable` (and likely `Display`) impl for `Locale` on top of
// `for_each_subtag_str`, using the expression after `=>` (`id.write_to_string()`) as a
// shortcut when there are no extensions — confirm against the macro definition.
impl_writeable_for_each_subtag_str_no_test!(Locale, selff, selff.extensions.is_empty() => selff.id.write_to_string());
401
/// Checks that locales serialize back to the exact string they were parsed from.
#[test]
fn test_writeable() {
    use writeable::assert_writeable_eq;

    // The undefined locale is built from the constant rather than parsed.
    assert_writeable_eq!(Locale::UND, "und");

    // Each tag below is already normalized, so it must round-trip unchanged.
    let round_trip_tags: &[&str] = &[
        "und-001",
        "und-Mymr",
        "my-Mymr-MM",
        "my-Mymr-MM-posix",
        "zh-macos-posix",
        "my-t-my-d0-zawgyi",
        "ar-SA-u-ca-islamic-civil",
        "en-001-x-foo-bar",
        "und-t-m0-true",
    ];
    for &tag in round_trip_tags {
        assert_writeable_eq!(tag.parse::<Locale>().unwrap(), tag);
    }
}
431
432 /// # Examples
433 ///
434 /// ```
435 /// use icu::locid::Locale;
436 /// use icu::locid::{locale, subtags_language as language};
437 ///
438 /// assert_eq!(Locale::from(language!("en")), locale!("en"));
439 /// ```
440 impl From<subtags::Language> for Locale {
441 fn from(language: subtags::Language) -> Self {
442 Self {
443 id: language.into(),
444 ..Default::default()
445 }
446 }
447 }
448
449 /// # Examples
450 ///
451 /// ```
452 /// use icu::locid::Locale;
453 /// use icu::locid::{locale, subtags_script as script};
454 ///
455 /// assert_eq!(Locale::from(Some(script!("latn"))), locale!("und-Latn"));
456 /// ```
457 impl From<Option<subtags::Script>> for Locale {
458 fn from(script: Option<subtags::Script>) -> Self {
459 Self {
460 id: script.into(),
461 ..Default::default()
462 }
463 }
464 }
465
466 /// # Examples
467 ///
468 /// ```
469 /// use icu::locid::Locale;
470 /// use icu::locid::{locale, subtags_region as region};
471 ///
472 /// assert_eq!(Locale::from(Some(region!("US"))), locale!("und-US"));
473 /// ```
474 impl From<Option<subtags::Region>> for Locale {
475 fn from(region: Option<subtags::Region>) -> Self {
476 Self {
477 id: region.into(),
478 ..Default::default()
479 }
480 }
481 }
482
483 /// # Examples
484 ///
485 /// ```
486 /// use icu::locid::Locale;
487 /// use icu::locid::{
488 /// locale, subtags_language as language, subtags_region as region,
489 /// subtags_script as script,
490 /// };
491 ///
492 /// assert_eq!(
493 /// Locale::from((
494 /// language!("en"),
495 /// Some(script!("Latn")),
496 /// Some(region!("US"))
497 /// )),
498 /// locale!("en-Latn-US")
499 /// );
500 /// ```
501 impl
502 From<(
503 subtags::Language,
504 Option<subtags::Script>,
505 Option<subtags::Region>,
506 )> for Locale
507 {
508 fn from(
509 lsr: (
510 subtags::Language,
511 Option<subtags::Script>,
512 Option<subtags::Region>,
513 ),
514 ) -> Self {
515 Self {
516 id: lsr.into(),
517 ..Default::default()
518 }
519 }
520 }