git.proxmox.com Git - rustc.git/blob - vendor/unicode-script/src/lib.rs
New upstream version 1.49.0+dfsg1
[rustc.git] / vendor / unicode-script / src / lib.rs
1 //! This crate exposes the Unicode `Script` and `Script_Extension`
2 //! properties from [UAX #24](http://www.unicode.org/reports/tr24/)
3
4 #![cfg_attr(not(test), no_std)]
5 #![cfg_attr(feature = "bench", feature(test))]
6
7 #[rustfmt::skip]
8 mod tables;
9
10 use core::convert::TryFrom;
11 use core::fmt;
12 use core::u64;
13 pub use tables::script_extensions;
14 use tables::{get_script, get_script_extension, NEXT_SCRIPT};
15 pub use tables::{Script, UNICODE_VERSION};
16
17 impl Script {
18 /// Get the full name of a script
19 pub fn full_name(self) -> &'static str {
20 self.inner_full_name()
21 }
22
23 /// Get the four-character short name of a script
24 pub fn short_name(self) -> &'static str {
25 self.inner_short_name()
26 }
27
28 /// Is this script "Recommended" according to
29 /// [UAX #31](www.unicode.org/reports/tr31/#Table_Recommended_Scripts)?
30 pub fn is_recommended(self) -> bool {
31 use Script::*;
32 match self {
33 Common | Inherited | Arabic | Armenian | Bengali | Bopomofo | Cyrillic | Devanagari
34 | Ethiopic | Georgian | Greek | Gujarati | Gurmukhi | Han | Hangul | Hebrew
35 | Hiragana | Kannada | Katakana | Khmer | Lao | Latin | Malayalam | Myanmar | Oriya
36 | Sinhala | Tamil | Telugu | Thaana | Thai | Tibetan => true,
37 _ => false,
38 }
39 }
40 }
41
42 impl From<Script> for ScriptExtension {
43 fn from(script: Script) -> Self {
44 if script == Script::Common {
45 ScriptExtension::new_common()
46 } else if script == Script::Inherited {
47 ScriptExtension::new_inherited()
48 } else if script == Script::Unknown {
49 ScriptExtension::new_unknown()
50 } else {
51 let mut first = 0;
52 let mut second = 0;
53 let mut third = 0;
54 let bit = script as u8;
55 // Find out which field it's in, and set the appropriate bit there
56 if bit < 64 {
57 first = 1 << bit as u64;
58 } else if bit < 128 {
59 // offset by 64 since `bit` is an absolute number,
60 // not relative to the chunk
61 second = 1 << (bit - 64) as u64;
62 } else {
63 third = 1 << (bit - 128) as u32;
64 }
65 ScriptExtension::new(first, second, third)
66 }
67 }
68 }
69
70 impl TryFrom<ScriptExtension> for Script {
71 type Error = ();
72 fn try_from(ext: ScriptExtension) -> Result<Self, ()> {
73 if ext.is_common_or_inherited() {
74 if ext.common {
75 Ok(Script::Common)
76 } else {
77 Ok(Script::Inherited)
78 }
79 } else if ext.is_empty() {
80 Ok(Script::Unknown)
81 } else {
82 // filled elements will have set ones
83 let fo = ext.first.count_ones();
84 let so = ext.second.count_ones();
85 let to = ext.third.count_ones();
86 // only one bit set, in the first chunk
87 if fo == 1 && so == 0 && to == 0 {
88 // use trailing_zeroes() to figure out which bit it is
89 Ok(Script::for_integer(ext.first.trailing_zeros() as u8))
90 // only one bit set, in the second chunk
91 } else if fo == 0 && so == 1 && to == 0 {
92 Ok(Script::for_integer(64 + ext.second.trailing_zeros() as u8))
93 // only one bit set, in the third chunk
94 } else if fo == 0 && so == 0 && to == 1 {
95 Ok(Script::for_integer(128 + ext.third.trailing_zeros() as u8))
96 } else {
97 Err(())
98 }
99 }
100 }
101 }
102
103 impl Default for Script {
104 fn default() -> Self {
105 Script::Common
106 }
107 }
108
109 impl From<char> for Script {
110 fn from(o: char) -> Self {
111 o.script()
112 }
113 }
114
115 impl fmt::Display for Script {
116 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
117 write!(f, "{}", self.full_name())
118 }
119 }
120
/// A value for the `Script_Extension` property
///
/// A [`ScriptExtension`] holds one or more [`Script`]s.
///
/// Conceptually it is a `Vec<Script>`, stored compactly as a set of bitfields.
#[non_exhaustive]
#[derive(Clone, Copy, PartialEq, Eq, Hash)]
pub struct ScriptExtension {
    // Bitset for script indices 0..=63
    first: u64,
    // Bitset for script indices 64..=127 (bit `n` stands for index `64 + n`)
    second: u64,
    // Bitset for script indices 128 and up (bit `n` stands for index `128 + n`)
    third: u32,
    // Common and Inherited both have every used bit set;
    // this flag is what tells them apart (`true` means Common).
    common: bool,
}
139
140 impl ScriptExtension {
141 // We don't use the complete u32 of `third`, so the "all" value is not just u32::MAX
142 // Instead, we take the number of the next (unused) script bit, subtract 128 to bring
143 // it in the range of `third`, create a u32 with just that bit set, and subtract 1
144 // to create one with all the lower bits set.
145 const THIRD_MAX: u32 = ((1 << (NEXT_SCRIPT - 128)) - 1);
146
147 pub(crate) const fn new(first: u64, second: u64, third: u32) -> Self {
148 ScriptExtension {
149 first,
150 second,
151 third,
152 common: false,
153 }
154 }
155
156 pub(crate) const fn new_common() -> Self {
157 ScriptExtension {
158 first: u64::MAX,
159 second: u64::MAX,
160 third: Self::THIRD_MAX,
161 common: true,
162 }
163 }
164
165 pub(crate) const fn new_inherited() -> Self {
166 ScriptExtension {
167 first: u64::MAX,
168 second: u64::MAX,
169 third: Self::THIRD_MAX,
170 common: false,
171 }
172 }
173
174 pub(crate) const fn new_unknown() -> Self {
175 ScriptExtension {
176 first: 0,
177 second: 0,
178 third: 0,
179 common: false,
180 }
181 }
182
183 const fn is_common_or_inherited(self) -> bool {
184 (self.first == u64::MAX) & (self.second == u64::MAX) & (self.third == Self::THIRD_MAX)
185 }
186
187 /// Checks if the script extension is Common
188 pub const fn is_common(self) -> bool {
189 self.is_common_or_inherited() & self.common
190 }
191
192 /// Checks if the script extension is Inherited
193 pub const fn is_inherited(self) -> bool {
194 self.is_common_or_inherited() & !self.common
195 }
196
197 /// Checks if the script extension is empty (unknown)
198 pub const fn is_empty(self) -> bool {
199 (self.first == 0) & (self.second == 0) & (self.third == 0)
200 }
201
202 /// Returns the number of scripts in the script extension
203 pub fn len(self) -> usize {
204 if self.is_common_or_inherited() {
205 1
206 } else {
207 (self.first.count_ones() + self.second.count_ones() + self.third.count_ones()) as usize
208 }
209 }
210
211 /// Intersect this `ScriptExtension` with another `ScriptExtension`. Produces `Unknown` if things
212 /// do not intersect. This is equivalent to [`ScriptExtension::intersection`] but it stores the result
213 /// in `self`
214 ///
215 /// "Common" (`Zyyy`) and "Inherited" (`Zinh`) are considered as intersecting
216 /// everything, the intersection of `Common` and `Inherited` is `Inherited`
217 pub fn intersect_with(&mut self, other: Self) {
218 *self = self.intersection(other)
219 }
220
221 /// Find the intersection between two ScriptExtensions. Returns Unknown if things
222 /// do not intersect.
223 ///
224 /// "Common" (`Zyyy`) and "Inherited" (`Zinh`) are considered as intersecting
225 /// everything, the intersection of `Common` and `Inherited` is `Inherited`
226 pub const fn intersection(self, other: Self) -> Self {
227 let first = self.first & other.first;
228 let second = self.second & other.second;
229 let third = self.third & other.third;
230 let common = self.common & other.common;
231 ScriptExtension {
232 first,
233 second,
234 third,
235 common,
236 }
237 }
238
239 /// Find the union between two ScriptExtensions.
240 ///
241 /// "Common" (`Zyyy`) and "Inherited" (`Zinh`) are considered as intersecting
242 /// everything, the union of `Common` and `Inherited` is `Common`
243 pub const fn union(self, other: Self) -> Self {
244 let first = self.first | other.first;
245 let second = self.second | other.second;
246 let third = self.third | other.third;
247 let common = self.common | other.common;
248 ScriptExtension {
249 first,
250 second,
251 third,
252 common,
253 }
254 }
255
256 /// Check if this ScriptExtension contains the given script
257 ///
258 /// Should be used with specific scripts only, this will
259 /// return `true` if `self` is not `Unknown` and `script` is
260 /// `Common` or `Inherited`
261 pub fn contains_script(self, script: Script) -> bool {
262 !self.intersection(script.into()).is_empty()
263 }
264
265 /// Get the intersection of script extensions of all characters
266 /// in a string.
267 pub fn for_str(x: &str) -> Self {
268 let mut ext = ScriptExtension::default();
269 for ch in x.chars() {
270 ext.intersect_with(ch.into());
271 }
272 ext
273 }
274
275 /// Iterate over the scripts in this script extension
276 ///
277 /// Will never yield Script::Unknown
278 pub fn iter(self) -> ScriptIterator {
279 ScriptIterator { ext: self }
280 }
281 }
282
283 impl Default for ScriptExtension {
284 fn default() -> Self {
285 ScriptExtension::new_common()
286 }
287 }
288
289 impl From<char> for ScriptExtension {
290 fn from(o: char) -> Self {
291 o.script_extension()
292 }
293 }
294
295 impl From<&'_ str> for ScriptExtension {
296 fn from(o: &'_ str) -> Self {
297 Self::for_str(o)
298 }
299 }
300
301 impl fmt::Debug for ScriptExtension {
302 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
303 write!(f, "ScriptExtension(")?;
304 fmt::Display::fmt(self, f)?;
305 write!(f, ")")
306 }
307 }
308
309 impl fmt::Display for ScriptExtension {
310 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
311 if self.is_common() {
312 write!(f, "Common")?;
313 } else if self.is_inherited() {
314 write!(f, "Inherited")?;
315 } else if self.is_empty() {
316 write!(f, "Unknown")?;
317 } else {
318 let mut first = true;
319 for script in self.iter() {
320 if !first {
321 write!(f, " + ")?;
322 first = false;
323 }
324 script.full_name().fmt(f)?;
325 }
326 }
327 Ok(())
328 }
329 }
330
331 /// Extension trait on `char` for calculating script properties
332 pub trait UnicodeScript {
333 /// Get the script for a given character
334 fn script(&self) -> Script;
335 /// Get the Script_Extension for a given character
336 fn script_extension(&self) -> ScriptExtension;
337 }
338
339 impl UnicodeScript for char {
340 fn script(&self) -> Script {
341 get_script(*self).unwrap_or(Script::Unknown)
342 }
343
344 fn script_extension(&self) -> ScriptExtension {
345 get_script_extension(*self).unwrap_or_else(|| self.script().into())
346 }
347 }
348
349 /// Iterator over scripts in a [ScriptExtension].
350 ///
351 /// Can be obtained ia [ScriptExtension::iter()]
352 pub struct ScriptIterator {
353 ext: ScriptExtension,
354 }
355
356 impl Iterator for ScriptIterator {
357 type Item = Script;
358
359 fn next(&mut self) -> Option<Script> {
360 if self.ext.is_common_or_inherited() {
361 let common = self.ext.common;
362 self.ext = ScriptExtension::new_unknown();
363 if common {
364 Some(Script::Common)
365 } else {
366 Some(Script::Inherited)
367 }
368 // Are there bits left in the first chunk?
369 } else if self.ext.first != 0 {
370 // Find the next bit
371 let bit = self.ext.first.trailing_zeros();
372 // unset just that bit
373 self.ext.first &= !(1 << bit);
374 Some(Script::for_integer(bit as u8))
375 // Are there bits left in the second chunk?
376 } else if self.ext.second != 0 {
377 let bit = self.ext.second.trailing_zeros();
378 self.ext.second &= !(1 << bit);
379 Some(Script::for_integer(64 + bit as u8))
380 // Are there bits left in the third chunk?
381 } else if self.ext.third != 0 {
382 let bit = self.ext.third.trailing_zeros();
383 self.ext.third &= !(1 << bit);
384 Some(Script::for_integer(128 + bit as u8))
385 } else {
386 // Script::Unknown
387 None
388 }
389 }
390 }
391
#[cfg(test)]
mod tests {
    use crate::*;
    use std::collections::HashSet;
    use std::convert::TryInto;

    #[cfg(feature = "bench")]
    extern crate test;
    #[cfg(feature = "bench")]
    use test::bench::Bencher;

    // Sample text shared by `test_specific` and `bench_string_ext`.
    const DEVANAGARI_TEXT: &str = "सवव मानवी व्यद्क् जन्मतःच स्वतींत्र आहेत व त्ाींना समान प्रवतष्ठा व समान अविकार आहेत. त्ाींना ववचारशद्क् व सवविे कबुद्द्धलाभलेली आहे. व त्ाींनी एकमेकाींशी बींिुत्वाचाभावनेने आचरण करावे.";

    // Every script index must round-trip through `ScriptExtension` and be unique.
    #[test]
    fn test_conversion() {
        let mut seen_scripts = HashSet::new();
        let mut seen_exts = HashSet::new();
        for index in 0..NEXT_SCRIPT {
            let script = Script::for_integer(index);
            let ext: ScriptExtension = script.into();
            // `insert` returns false when the value was already present.
            if !seen_scripts.insert(script) {
                panic!("Found script {:?} twice!", script)
            }
            if !seen_exts.insert(ext) {
                panic!("Found extension {:?} twice!", ext)
            }
            assert_eq!(script as u8, index);

            let with_common = ScriptExtension::new_common().intersection(ext);
            assert!(!with_common.is_empty());
            let with_inherited = ScriptExtension::new_inherited().intersection(ext);
            assert!(!with_inherited.is_empty());
            let with_unknown = ScriptExtension::new_unknown().intersection(ext);
            assert!(with_unknown.is_empty());

            assert_eq!(ext.iter().collect::<Vec<_>>(), vec![script]);
            assert_eq!(Ok(script), ext.try_into());
        }
    }

    // A Devanagari-only string resolves to DEVA and interacts correctly with a
    // larger extension that contains DEVA.
    #[test]
    fn test_specific() {
        let family = script_extensions::DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH;

        let ext = ScriptExtension::for_str(DEVANAGARI_TEXT);
        assert_eq!(ext, script_extensions::DEVA);
        println!("{:?}", family);
        println!("{:?}", ext.intersection(family));
        assert!(!ext.intersection(family).is_empty());

        let u = ext.union(Script::Dogra.into());
        assert_eq!(u.intersection(family), u);
    }

    // `contains_script` must agree with iteration, and a multi-script
    // extension must not convert to a single `Script`.
    #[test]
    fn test_specific_ext() {
        let ext = script_extensions::DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH;

        let all: HashSet<_> = ext.iter().collect();

        for index in 0..NEXT_SCRIPT {
            let script = Script::for_integer(index);
            assert_eq!(ext.contains_script(script), all.contains(&script));
        }

        let expected = [
            Script::Devanagari,
            Script::Dogra,
            Script::Gujarati,
            Script::Gurmukhi,
            Script::Khojki,
            Script::Kaithi,
            Script::Mahajani,
            Script::Modi,
            Script::Khudawadi,
            Script::Takri,
            Script::Tirhuta,
        ];
        for script in expected.iter() {
            assert!(ext.contains_script(*script));
        }

        let scr: Result<Script, _> = ext.try_into();
        assert!(scr.is_err());
    }

    #[cfg(feature = "bench")]
    #[bench]
    fn bench_script_intersection(b: &mut Bencher) {
        b.iter(|| {
            let script = test::black_box(Script::Devanagari);
            let ext = test::black_box(script_extensions::BENG_DEVA_DOGR_GONG_GONM_GRAN_GUJR_GURU_KNDA_MAHJ_MLYM_NAND_ORYA_SIND_SINH_SYLO_TAKR_TAML_TELU_TIRH);
            test::black_box(ext.intersection(script.into()));
        })
    }

    #[cfg(feature = "bench")]
    #[bench]
    fn bench_ext_to_script(b: &mut Bencher) {
        let ext: ScriptExtension = Script::Devanagari.into();
        b.iter(|| {
            let ext = test::black_box(ext);
            let script: Result<Script, _> = ext.try_into();
            let _ = test::black_box(script);
        })
    }

    #[cfg(feature = "bench")]
    #[bench]
    fn bench_script_to_ext(b: &mut Bencher) {
        b.iter(|| {
            let script = test::black_box(Script::Devanagari);
            let ext: ScriptExtension = script.into();
            test::black_box(ext);
        })
    }

    #[cfg(feature = "bench")]
    #[bench]
    fn bench_ext_intersection(b: &mut Bencher) {
        b.iter(|| {
            let e1 = test::black_box(script_extensions::ARAB_ROHG_SYRC_THAA_YEZI);
            let e2 = test::black_box(script_extensions::BENG_DEVA_DOGR_GONG_GONM_GRAN_GUJR_GURU_KNDA_MAHJ_MLYM_NAND_ORYA_SIND_SINH_SYLO_TAKR_TAML_TELU_TIRH);
            test::black_box(e2.intersection(e1));
        })
    }

    #[cfg(feature = "bench")]
    #[bench]
    fn bench_to_vec(b: &mut Bencher) {
        b.iter(|| {
            let ext = test::black_box(script_extensions::BENG_DEVA_DOGR_GONG_GONM_GRAN_GUJR_GURU_KNDA_MAHJ_MLYM_NAND_ORYA_SIND_SINH_SYLO_TAKR_TAML_TELU_TIRH);
            test::black_box(ext.iter().collect::<Vec<_>>());
        })
    }

    #[cfg(feature = "bench")]
    #[bench]
    fn bench_string_ext(b: &mut Bencher) {
        b.iter(|| {
            let s = test::black_box(DEVANAGARI_TEXT);
            test::black_box(ScriptExtension::for_str(s));
        })
    }
}