]> git.proxmox.com Git - rustc.git/blob - vendor/icu_locid/src/langid.rs
New upstream version 1.69.0+dfsg1
[rustc.git] / vendor / icu_locid / src / langid.rs
1 // This file is part of ICU4X. For terms of use, please see the file
2 // called LICENSE at the top level of the ICU4X source tree
3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5 use core::cmp::Ordering;
6 use core::str::FromStr;
7
8 use crate::ordering::SubtagOrderingResult;
9 use crate::parser::{
10 parse_language_identifier, parse_language_identifier_with_single_variant, ParserError,
11 ParserMode, SubtagIterator,
12 };
13 use crate::subtags;
14 use alloc::string::String;
15 use writeable::Writeable;
16
/// A core struct representing a [`Unicode BCP47 Language Identifier`].
///
/// The identifier is stored as its individual subtags: a mandatory language,
/// an optional script, an optional region and a list of variants.
///
/// # Examples
///
/// ```
/// use icu::locid::{
///     langid, subtags_language as language, subtags_region as region,
/// };
///
/// let li = langid!("en-US");
///
/// assert_eq!(li.language, language!("en"));
/// assert_eq!(li.script, None);
/// assert_eq!(li.region, Some(region!("US")));
/// assert_eq!(li.variants.len(), 0);
/// ```
///
/// # Parsing
///
/// Unicode recognizes three levels of standard conformance for any language identifier:
///
///  * *well-formed* - syntactically correct
///  * *valid* - well-formed and only uses registered language, region, script and variant subtags
///  * *canonical* - valid and no deprecated codes or structure.
///
/// At the moment parsing normalizes a well-formed language identifier converting
/// `_` separators to `-` and adjusting casing to conform to the Unicode standard.
///
/// Any bogus subtags will cause the parsing to fail with an error.
/// No subtag validation is performed.
///
/// # Examples
///
/// Separators and casing are normalized while parsing:
///
/// ```
/// use icu::locid::{
///     langid, subtags_language as language, subtags_region as region,
///     subtags_script as script, subtags_variant as variant,
/// };
///
/// let li = langid!("eN_latn_Us-Valencia");
///
/// assert_eq!(li.language, language!("en"));
/// assert_eq!(li.script, Some(script!("Latn")));
/// assert_eq!(li.region, Some(region!("US")));
/// assert_eq!(li.variants.get(0), Some(&variant!("valencia")));
/// ```
///
/// [`Unicode BCP47 Language Identifier`]: https://unicode.org/reports/tr35/tr35.html#Unicode_language_identifier
#[derive(Default, PartialEq, Eq, Clone, Hash)]
#[allow(clippy::exhaustive_structs)] // This struct is stable (and invoked by a macro)
pub struct LanguageIdentifier {
    /// Language subtag of the language identifier.
    pub language: subtags::Language,
    /// Script subtag of the language identifier; `None` when no script is present.
    pub script: Option<subtags::Script>,
    /// Region subtag of the language identifier; `None` when no region is present.
    pub region: Option<subtags::Region>,
    /// Variant subtags of the language identifier.
    pub variants: subtags::Variants,
}
77
78 impl LanguageIdentifier {
79 /// A constructor which takes a utf8 slice, parses it and
80 /// produces a well-formed [`LanguageIdentifier`].
81 ///
82 /// # Examples
83 ///
84 /// ```
85 /// use icu::locid::LanguageIdentifier;
86 ///
87 /// LanguageIdentifier::try_from_bytes(b"en-US").expect("Parsing failed");
88 /// ```
89 pub fn try_from_bytes(v: &[u8]) -> Result<Self, ParserError> {
90 parse_language_identifier(v, ParserMode::LanguageIdentifier)
91 }
92
93 #[doc(hidden)]
94 #[allow(clippy::type_complexity)]
95 // The return type should be `Result<Self, ParserError>` once the `const_precise_live_drops`
96 // is stabilized ([rust-lang#73255](https://github.com/rust-lang/rust/issues/73255)).
97 pub const fn try_from_bytes_with_single_variant(
98 v: &[u8],
99 ) -> Result<
100 (
101 subtags::Language,
102 Option<subtags::Script>,
103 Option<subtags::Region>,
104 Option<subtags::Variant>,
105 ),
106 ParserError,
107 > {
108 parse_language_identifier_with_single_variant(v, ParserMode::LanguageIdentifier)
109 }
110
111 /// A constructor which takes a utf8 slice which may contain extension keys,
112 /// parses it and produces a well-formed [`LanguageIdentifier`].
113 ///
114 /// # Examples
115 ///
116 /// ```
117 /// use icu::locid::{langid, LanguageIdentifier};
118 ///
119 /// let li = LanguageIdentifier::try_from_locale_bytes(b"en-US-x-posix")
120 /// .expect("Parsing failed.");
121 ///
122 /// assert_eq!(li, langid!("en-US"));
123 /// ```
124 ///
125 /// This method should be used for input that may be a locale identifier.
126 /// All extensions will be lost.
127 pub fn try_from_locale_bytes(v: &[u8]) -> Result<Self, ParserError> {
128 parse_language_identifier(v, ParserMode::Locale)
129 }
130
131 /// The default undefined language "und". Same as [`default()`](Default::default()).
132 ///
133 /// # Examples
134 ///
135 /// ```
136 /// use icu::locid::LanguageIdentifier;
137 ///
138 /// assert_eq!(LanguageIdentifier::default(), LanguageIdentifier::UND);
139 /// ```
140 pub const UND: Self = Self {
141 language: subtags::Language::UND,
142 script: None,
143 region: None,
144 variants: subtags::Variants::new(),
145 };
146
147 /// This is a best-effort operation that performs all available levels of canonicalization.
148 ///
149 /// At the moment the operation will normalize casing and the separator, but in the future
150 /// it may also validate and update from deprecated subtags to canonical ones.
151 ///
152 /// # Examples
153 ///
154 /// ```
155 /// use icu::locid::LanguageIdentifier;
156 ///
157 /// assert_eq!(
158 /// LanguageIdentifier::canonicalize("pL_latn_pl").as_deref(),
159 /// Ok("pl-Latn-PL")
160 /// );
161 /// ```
162 pub fn canonicalize<S: AsRef<[u8]>>(input: S) -> Result<String, ParserError> {
163 let lang_id = Self::try_from_bytes(input.as_ref())?;
164 Ok(lang_id.write_to_string().into_owned())
165 }
166
167 /// Compare this [`LanguageIdentifier`] with BCP-47 bytes.
168 ///
169 /// The return value is equivalent to what would happen if you first converted this
170 /// [`LanguageIdentifier`] to a BCP-47 string and then performed a byte comparison.
171 ///
172 /// This function is case-sensitive and results in a *total order*, so it is appropriate for
173 /// binary search. The only argument producing [`Ordering::Equal`] is `self.to_string()`.
174 ///
175 /// # Examples
176 ///
177 /// ```
178 /// use icu::locid::LanguageIdentifier;
179 /// use std::cmp::Ordering;
180 ///
181 /// let bcp47_strings: &[&str] = &[
182 /// "pl-Latn-PL",
183 /// "und",
184 /// "und-Adlm",
185 /// "und-GB",
186 /// "und-ZA",
187 /// "und-fonipa",
188 /// "zh",
189 /// ];
190 ///
191 /// for ab in bcp47_strings.windows(2) {
192 /// let a = ab[0];
193 /// let b = ab[1];
194 /// assert!(a.cmp(b) == Ordering::Less);
195 /// let a_langid = a.parse::<LanguageIdentifier>().unwrap();
196 /// assert!(a_langid.strict_cmp(a.as_bytes()) == Ordering::Equal);
197 /// assert!(a_langid.strict_cmp(b.as_bytes()) == Ordering::Less);
198 /// }
199 /// ```
200 pub fn strict_cmp(&self, other: &[u8]) -> Ordering {
201 self.strict_cmp_iter(other.split(|b| *b == b'-')).end()
202 }
203
204 /// Compare this [`LanguageIdentifier`] with an iterator of BCP-47 subtags.
205 ///
206 /// This function has the same equality semantics as [`LanguageIdentifier::strict_cmp`]. It is intended as
207 /// a more modular version that allows multiple subtag iterators to be chained together.
208 ///
209 /// For an additional example, see [`SubtagOrderingResult`].
210 ///
211 /// # Examples
212 ///
213 /// ```
214 /// use icu::locid::LanguageIdentifier;
215 /// use std::cmp::Ordering;
216 ///
217 /// let subtags: &[&[u8]] = &[b"ca", b"ES", b"valencia"];
218 ///
219 /// let loc = "ca-ES-valencia".parse::<LanguageIdentifier>().unwrap();
220 /// assert_eq!(
221 /// Ordering::Equal,
222 /// loc.strict_cmp_iter(subtags.iter().copied()).end()
223 /// );
224 ///
225 /// let loc = "ca-ES".parse::<LanguageIdentifier>().unwrap();
226 /// assert_eq!(
227 /// Ordering::Less,
228 /// loc.strict_cmp_iter(subtags.iter().copied()).end()
229 /// );
230 ///
231 /// let loc = "ca-ZA".parse::<LanguageIdentifier>().unwrap();
232 /// assert_eq!(
233 /// Ordering::Greater,
234 /// loc.strict_cmp_iter(subtags.iter().copied()).end()
235 /// );
236 /// ```
237 pub fn strict_cmp_iter<'l, I>(&self, mut subtags: I) -> SubtagOrderingResult<I>
238 where
239 I: Iterator<Item = &'l [u8]>,
240 {
241 let r = self.for_each_subtag_str(&mut |subtag| {
242 if let Some(other) = subtags.next() {
243 match subtag.as_bytes().cmp(other) {
244 Ordering::Equal => Ok(()),
245 not_equal => Err(not_equal),
246 }
247 } else {
248 Err(Ordering::Greater)
249 }
250 });
251 match r {
252 Ok(_) => SubtagOrderingResult::Subtags(subtags),
253 Err(o) => SubtagOrderingResult::Ordering(o),
254 }
255 }
256
257 /// Compare this `LanguageIdentifier` with a potentially unnormalized BCP-47 string.
258 ///
259 /// The return value is equivalent to what would happen if you first parsed the
260 /// BCP-47 string to a `LanguageIdentifier` and then performed a structucal comparison.
261 ///
262 /// # Examples
263 ///
264 /// ```
265 /// use icu::locid::LanguageIdentifier;
266 /// use std::cmp::Ordering;
267 ///
268 /// let bcp47_strings: &[&str] = &[
269 /// "pl-LaTn-pL",
270 /// "uNd",
271 /// "UnD-adlm",
272 /// "uNd-GB",
273 /// "UND-FONIPA",
274 /// "ZH",
275 /// ];
276 ///
277 /// for a in bcp47_strings {
278 /// assert!(a.parse::<LanguageIdentifier>().unwrap().normalizing_eq(a));
279 /// }
280 /// ```
281 pub fn normalizing_eq(&self, other: &str) -> bool {
282 macro_rules! subtag_matches {
283 ($T:ty, $iter:ident, $expected:expr) => {
284 $iter
285 .next()
286 .map(|b| <$T>::try_from_bytes(b) == Ok($expected))
287 .unwrap_or(false)
288 };
289 }
290
291 let mut iter = SubtagIterator::new(other.as_bytes());
292 if !subtag_matches!(subtags::Language, iter, self.language) {
293 return false;
294 }
295 if let Some(ref script) = self.script {
296 if !subtag_matches!(subtags::Script, iter, *script) {
297 return false;
298 }
299 }
300 if let Some(ref region) = self.region {
301 if !subtag_matches!(subtags::Region, iter, *region) {
302 return false;
303 }
304 }
305 for variant in self.variants.iter() {
306 if !subtag_matches!(subtags::Variant, iter, *variant) {
307 return false;
308 }
309 }
310 iter.next() == None
311 }
312
313 pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
314 where
315 F: FnMut(&str) -> Result<(), E>,
316 {
317 f(self.language.as_str())?;
318 if let Some(ref script) = self.script {
319 f(script.as_str())?;
320 }
321 if let Some(ref region) = self.region {
322 f(region.as_str())?;
323 }
324 for variant in self.variants.iter() {
325 f(variant.as_str())?;
326 }
327 Ok(())
328 }
329 }
330
331 impl AsRef<LanguageIdentifier> for LanguageIdentifier {
332 fn as_ref(&self) -> &Self {
333 self
334 }
335 }
336
337 impl AsMut<LanguageIdentifier> for LanguageIdentifier {
338 fn as_mut(&mut self) -> &mut Self {
339 self
340 }
341 }
342
343 impl core::fmt::Debug for LanguageIdentifier {
344 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
345 core::fmt::Display::fmt(&self, f)
346 }
347 }
348
349 impl FromStr for LanguageIdentifier {
350 type Err = ParserError;
351
352 fn from_str(source: &str) -> Result<Self, Self::Err> {
353 Self::try_from_bytes(source.as_bytes())
354 }
355 }
356
// NOTE(review): this macro is defined elsewhere in the crate. Presumably it generates
// the `Writeable`/`Display` impls for `LanguageIdentifier` on top of `for_each_subtag_str`,
// with the trailing expression acting as a shortcut when only the language subtag is
// present — confirm against the macro definition.
impl_writeable_for_each_subtag_str_no_test!(LanguageIdentifier, selff, selff.script.is_none() && selff.region.is_none() && selff.variants.is_empty() => selff.language.write_to_string());
358
/// Checks that writing a [`LanguageIdentifier`] reproduces the normalized tag it was
/// parsed from.
#[test]
fn test_writeable() {
    use writeable::assert_writeable_eq;

    assert_writeable_eq!(LanguageIdentifier::UND, "und");

    // Each tag is already normalized, so it must round-trip unchanged.
    let round_trip_tags: [&str; 5] = [
        "und-001",
        "und-Mymr",
        "my-Mymr-MM",
        "my-Mymr-MM-posix",
        "zh-macos-posix",
    ];
    for tag in round_trip_tags.iter().copied() {
        assert_writeable_eq!(tag.parse::<LanguageIdentifier>().unwrap(), tag);
    }
}
381
382 /// # Examples
383 ///
384 /// ```
385 /// use icu::locid::{
386 /// langid, subtags_language as language, LanguageIdentifier,
387 /// };
388 ///
389 /// assert_eq!(LanguageIdentifier::from(language!("en")), langid!("en"));
390 /// ```
391 impl From<subtags::Language> for LanguageIdentifier {
392 fn from(language: subtags::Language) -> Self {
393 Self {
394 language,
395 ..Default::default()
396 }
397 }
398 }
399
400 /// # Examples
401 ///
402 /// ```
403 /// use icu::locid::{langid, subtags_script as script, LanguageIdentifier};
404 ///
405 /// assert_eq!(
406 /// LanguageIdentifier::from(Some(script!("latn"))),
407 /// langid!("und-Latn")
408 /// );
409 /// ```
410 impl From<Option<subtags::Script>> for LanguageIdentifier {
411 fn from(script: Option<subtags::Script>) -> Self {
412 Self {
413 script,
414 ..Default::default()
415 }
416 }
417 }
418
419 /// # Examples
420 ///
421 /// ```
422 /// use icu::locid::{langid, subtags_region as region, LanguageIdentifier};
423 ///
424 /// assert_eq!(
425 /// LanguageIdentifier::from(Some(region!("US"))),
426 /// langid!("und-US")
427 /// );
428 /// ```
429 impl From<Option<subtags::Region>> for LanguageIdentifier {
430 fn from(region: Option<subtags::Region>) -> Self {
431 Self {
432 region,
433 ..Default::default()
434 }
435 }
436 }
437
438 /// Convert from an LSR tuple to a [`LanguageIdentifier`].
439 ///
440 /// # Examples
441 ///
442 /// ```
443 /// use icu::locid::{
444 /// langid, subtags_language as language, subtags_region as region,
445 /// subtags_script as script, LanguageIdentifier,
446 /// };
447 ///
448 /// let lang = language!("en");
449 /// let script = script!("Latn");
450 /// let region = region!("US");
451 /// assert_eq!(
452 /// LanguageIdentifier::from((lang, Some(script), Some(region))),
453 /// langid!("en-Latn-US")
454 /// );
455 /// ```
456 impl
457 From<(
458 subtags::Language,
459 Option<subtags::Script>,
460 Option<subtags::Region>,
461 )> for LanguageIdentifier
462 {
463 fn from(
464 lsr: (
465 subtags::Language,
466 Option<subtags::Script>,
467 Option<subtags::Region>,
468 ),
469 ) -> Self {
470 Self {
471 language: lsr.0,
472 script: lsr.1,
473 region: lsr.2,
474 ..Default::default()
475 }
476 }
477 }
478
479 /// Convert from a [`LanguageIdentifier`] to an LSR tuple.
480 ///
481 /// # Examples
482 ///
483 /// ```
484 /// use icu::locid::{
485 /// langid, subtags_language as language, subtags_region as region,
486 /// subtags_script as script,
487 /// };
488 ///
489 /// let lid = langid!("en-Latn-US");
490 /// let (lang, script, region) = (&lid).into();
491 ///
492 /// assert_eq!(lang, language!("en"));
493 /// assert_eq!(script, Some(script!("Latn")));
494 /// assert_eq!(region, Some(region!("US")));
495 /// ```
496 impl From<&LanguageIdentifier>
497 for (
498 subtags::Language,
499 Option<subtags::Script>,
500 Option<subtags::Region>,
501 )
502 {
503 fn from(langid: &LanguageIdentifier) -> Self {
504 (langid.language, langid.script, langid.region)
505 }
506 }