1 //! This crate exposes the Unicode `Script` and `Script_Extension`
2 //! properties from [UAX #24](http://www.unicode.org/reports/tr24/)
4 #![cfg_attr(not(test), no_std)]
5 #![cfg_attr(feature = "bench", feature(test))]
10 use core
::convert
::TryFrom
;
13 pub use tables
::script_extensions
;
14 use tables
::{get_script, get_script_extension, NEXT_SCRIPT}
;
15 pub use tables
::{Script, UNICODE_VERSION}
;
18 /// Get the full name of a script
19 pub fn full_name(self) -> &'
static str {
20 self.inner_full_name()
23 /// Get the four-character short name of a script
24 pub fn short_name(self) -> &'
static str {
25 self.inner_short_name()
28 /// Is this script "Recommended" according to
29 /// [UAX #31](http://www.unicode.org/reports/tr31/#Table_Recommended_Scripts)?
30 pub fn is_recommended(self) -> bool
{
33 Common
| Inherited
| Arabic
| Armenian
| Bengali
| Bopomofo
| Cyrillic
| Devanagari
34 | Ethiopic
| Georgian
| Greek
| Gujarati
| Gurmukhi
| Han
| Hangul
| Hebrew
35 | Hiragana
| Kannada
| Katakana
| Khmer
| Lao
| Latin
| Malayalam
| Myanmar
| Oriya
36 | Sinhala
| Tamil
| Telugu
| Thaana
| Thai
| Tibetan
=> true,
42 impl From
<Script
> for ScriptExtension
{
43 fn from(script
: Script
) -> Self {
44 if script
== Script
::Common
{
45 ScriptExtension
::new_common()
46 } else if script
== Script
::Inherited
{
47 ScriptExtension
::new_inherited()
48 } else if script
== Script
::Unknown
{
49 ScriptExtension
::new_unknown()
54 let bit
= script
as u8;
55 // Find out which field it's in, and set the appropriate bit there
57 first
= 1 << bit
as u64;
59 // offset by 64 since `bit` is an absolute number,
60 // not relative to the chunk
61 second
= 1 << (bit
- 64) as u64;
63 third
= 1 << (bit
- 128) as u32;
65 ScriptExtension
::new(first
, second
, third
)
70 impl TryFrom
<ScriptExtension
> for Script
{
72 fn try_from(ext
: ScriptExtension
) -> Result
<Self, ()> {
73 if ext
.is_common_or_inherited() {
79 } else if ext
.is_empty() {
82 // filled elements will have set ones
83 let fo
= ext
.first
.count_ones();
84 let so
= ext
.second
.count_ones();
85 let to
= ext
.third
.count_ones();
86 // only one bit set, in the first chunk
87 if fo
== 1 && so
== 0 && to
== 0 {
88 // use trailing_zeros() to figure out which bit it is
89 Ok(Script
::for_integer(ext
.first
.trailing_zeros() as u8))
90 // only one bit set, in the second chunk
91 } else if fo
== 0 && so
== 1 && to
== 0 {
92 Ok(Script
::for_integer(64 + ext
.second
.trailing_zeros() as u8))
93 // only one bit set, in the third chunk
94 } else if fo
== 0 && so
== 0 && to
== 1 {
95 Ok(Script
::for_integer(128 + ext
.third
.trailing_zeros() as u8))
103 impl Default
for Script
{
104 fn default() -> Self {
109 impl From
<char> for Script
{
110 fn from(o
: char) -> Self {
115 impl fmt
::Display
for Script
{
116 fn fmt(&self, f
: &mut fmt
::Formatter
) -> fmt
::Result
{
117 write
!(f
, "{}", self.full_name())
121 #[derive(Clone, Copy, PartialEq, Eq, Hash)]
123 /// A value for the `Script_Extension` property
125 /// [`ScriptExtension`] is one or more [`Script`]
127 /// This is essentially an optimized version of `Vec<Script>` that uses bitfields
128 pub struct ScriptExtension
{
129 // A bitset for the first 64 scripts
131 // A bitset for the scripts 65-128
133 // A bitset for scripts after 128
135 // Both Common and Inherited are represented by all used bits being set,
136 // this flag lets us distinguish the two.
140 impl ScriptExtension
{
141 // We don't use the complete u32 of `third`, so the "all" value is not just u32::MAX
142 // Instead, we take the number of the next (unused) script bit, subtract 128 to bring
143 // it in the range of `third`, create a u32 with just that bit set, and subtract 1
144 // to create one with all the lower bits set.
145 const THIRD_MAX
: u32 = ((1 << (NEXT_SCRIPT
- 128)) - 1);
147 pub(crate) const fn new(first
: u64, second
: u64, third
: u32) -> Self {
156 pub(crate) const fn new_common() -> Self {
160 third
: Self::THIRD_MAX
,
165 pub(crate) const fn new_inherited() -> Self {
169 third
: Self::THIRD_MAX
,
174 pub(crate) const fn new_unknown() -> Self {
183 const fn is_common_or_inherited(self) -> bool
{
184 (self.first
== u64::MAX
) & (self.second
== u64::MAX
) & (self.third
== Self::THIRD_MAX
)
187 /// Checks if the script extension is Common
188 pub const fn is_common(self) -> bool
{
189 self.is_common_or_inherited() & self.common
192 /// Checks if the script extension is Inherited
193 pub const fn is_inherited(self) -> bool
{
194 self.is_common_or_inherited() & !self.common
197 /// Checks if the script extension is empty (unknown)
198 pub const fn is_empty(self) -> bool
{
199 (self.first
== 0) & (self.second
== 0) & (self.third
== 0)
202 /// Returns the number of scripts in the script extension
203 pub fn len(self) -> usize {
204 if self.is_common_or_inherited() {
207 (self.first
.count_ones() + self.second
.count_ones() + self.third
.count_ones()) as usize
211 /// Intersect this `ScriptExtension` with another `ScriptExtension`. Produces `Unknown` if things
212 /// do not intersect. This is equivalent to [`ScriptExtension::intersection`] but it stores the result
215 /// "Common" (`Zyyy`) and "Inherited" (`Zinh`) are considered as intersecting
216 /// everything, the intersection of `Common` and `Inherited` is `Inherited`
217 pub fn intersect_with(&mut self, other
: Self) {
218 *self = self.intersection(other
)
221 /// Find the intersection between two ScriptExtensions. Returns Unknown if things
222 /// do not intersect.
224 /// "Common" (`Zyyy`) and "Inherited" (`Zinh`) are considered as intersecting
225 /// everything, the intersection of `Common` and `Inherited` is `Inherited`
226 pub const fn intersection(self, other
: Self) -> Self {
227 let first
= self.first
& other
.first
;
228 let second
= self.second
& other
.second
;
229 let third
= self.third
& other
.third
;
230 let common
= self.common
& other
.common
;
239 /// Find the union between two ScriptExtensions.
241 /// "Common" (`Zyyy`) and "Inherited" (`Zinh`) are considered as intersecting
242 /// everything, the union of `Common` and `Inherited` is `Common`
243 pub const fn union(self, other
: Self) -> Self {
244 let first
= self.first
| other
.first
;
245 let second
= self.second
| other
.second
;
246 let third
= self.third
| other
.third
;
247 let common
= self.common
| other
.common
;
256 /// Check if this ScriptExtension contains the given script
258 /// Should be used with specific scripts only, this will
259 /// return `true` if `self` is not `Unknown` and `script` is
260 /// `Common` or `Inherited`
261 pub fn contains_script(self, script
: Script
) -> bool
{
262 !self.intersection(script
.into()).is_empty()
265 /// Get the intersection of script extensions of all characters
267 pub fn for_str(x
: &str) -> Self {
268 let mut ext
= ScriptExtension
::default();
269 for ch
in x
.chars() {
270 ext
.intersect_with(ch
.into());
275 /// Iterate over the scripts in this script extension
277 /// Will never yield Script::Unknown
278 pub fn iter(self) -> ScriptIterator
{
279 ScriptIterator { ext: self }
283 impl Default
for ScriptExtension
{
284 fn default() -> Self {
285 ScriptExtension
::new_common()
289 impl From
<char> for ScriptExtension
{
290 fn from(o
: char) -> Self {
295 impl From
<&'_
str> for ScriptExtension
{
296 fn from(o
: &'_
str) -> Self {
301 impl fmt
::Debug
for ScriptExtension
{
302 fn fmt(&self, f
: &mut fmt
::Formatter
) -> fmt
::Result
{
303 write
!(f
, "ScriptExtension(")?
;
304 fmt
::Display
::fmt(self, f
)?
;
309 impl fmt
::Display
for ScriptExtension
{
310 fn fmt(&self, f
: &mut fmt
::Formatter
) -> fmt
::Result
{
311 if self.is_common() {
312 write
!(f
, "Common")?
;
313 } else if self.is_inherited() {
314 write
!(f
, "Inherited")?
;
315 } else if self.is_empty() {
316 write
!(f
, "Unknown")?
;
318 let mut first
= true;
319 for script
in self.iter() {
324 script
.full_name().fmt(f
)?
;
331 /// Extension trait on `char` for calculating script properties
332 pub trait UnicodeScript
{
333 /// Get the script for a given character
334 fn script(&self) -> Script
;
335 /// Get the Script_Extension for a given character
336 fn script_extension(&self) -> ScriptExtension
;
339 impl UnicodeScript
for char {
340 fn script(&self) -> Script
{
341 get_script(*self).unwrap_or(Script
::Unknown
)
344 fn script_extension(&self) -> ScriptExtension
{
345 get_script_extension(*self).unwrap_or_else(|| self.script().into())
349 /// Iterator over scripts in a [ScriptExtension].
351 /// Can be obtained via [ScriptExtension::iter()]
352 pub struct ScriptIterator
{
353 ext
: ScriptExtension
,
356 impl Iterator
for ScriptIterator
{
359 fn next(&mut self) -> Option
<Script
> {
360 if self.ext
.is_common_or_inherited() {
361 let common
= self.ext
.common
;
362 self.ext
= ScriptExtension
::new_unknown();
366 Some(Script
::Inherited
)
368 // Are there bits left in the first chunk?
369 } else if self.ext
.first
!= 0 {
371 let bit
= self.ext
.first
.trailing_zeros();
372 // unset just that bit
373 self.ext
.first
&= !(1 << bit
);
374 Some(Script
::for_integer(bit
as u8))
375 // Are there bits left in the second chunk?
376 } else if self.ext
.second
!= 0 {
377 let bit
= self.ext
.second
.trailing_zeros();
378 self.ext
.second
&= !(1 << bit
);
379 Some(Script
::for_integer(64 + bit
as u8))
380 // Are there bits left in the third chunk?
381 } else if self.ext
.third
!= 0 {
382 let bit
= self.ext
.third
.trailing_zeros();
383 self.ext
.third
&= !(1 << bit
);
384 Some(Script
::for_integer(128 + bit
as u8))
395 use std
::collections
::HashSet
;
396 use std
::convert
::TryInto
;
398 #[cfg(feature = "bench")]
399 use test
::bench
::Bencher
;
400 #[cfg(feature = "bench")]
404 fn test_conversion() {
405 let mut seen_scripts
= HashSet
::new();
406 let mut seen_exts
= HashSet
::new();
407 for bit
in 0..NEXT_SCRIPT
{
408 let script
= Script
::for_integer(bit
);
409 let ext
= script
.into();
410 if seen_scripts
.contains(&script
) {
411 panic
!("Found script {:?} twice!", script
)
413 if seen_exts
.contains(&ext
) {
414 panic
!("Found extension {:?} twice!", ext
)
416 seen_scripts
.insert(script
);
417 seen_exts
.insert(ext
);
418 assert_eq
!(script
as u8, bit
);
419 assert
!(!ScriptExtension
::new_common().intersection(ext
).is_empty());
420 assert
!(!ScriptExtension
::new_inherited()
423 assert
!(ScriptExtension
::new_unknown().intersection(ext
).is_empty());
424 assert_eq
!(ext
.iter().collect
::<Vec
<_
>>(), vec
![script
]);
425 assert_eq
!(Ok(script
), ext
.try_into());
431 let s
= "सवव मानवी व्यद्क् जन्मतःच स्वतींत्र आहेत व त्ाींना समान प्रवतष्ठा व समान अविकार आहेत. त्ाींना ववचारशद्क् व सवविे कबुद्द्धलाभलेली आहे. व त्ाींनी एकमेकाींशी बींिुत्वाचाभावनेने आचरण करावे.";
432 let ext
= ScriptExtension
::for_str(s
);
433 assert_eq
!(ext
, script_extensions
::DEVA
);
436 script_extensions
::DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH
441 script_extensions
::DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH
445 .intersection(script_extensions
::DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH
)
448 let u
= ext
.union(Script
::Dogra
.into());
451 script_extensions
::DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH
458 fn test_specific_ext() {
459 let ext
= script_extensions
::DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH
;
461 let all
: HashSet
<_
> = ext
.iter().collect();
463 for bit
in 0..NEXT_SCRIPT
{
464 let script
= Script
::for_integer(bit
);
466 if all
.contains(&script
) {
467 assert
!(ext
.contains_script(script
))
469 assert
!(!ext
.contains_script(script
))
473 assert
!(ext
.contains_script(Script
::Devanagari
));
474 assert
!(ext
.contains_script(Script
::Dogra
));
475 assert
!(ext
.contains_script(Script
::Gujarati
));
476 assert
!(ext
.contains_script(Script
::Gurmukhi
));
477 assert
!(ext
.contains_script(Script
::Khojki
));
478 assert
!(ext
.contains_script(Script
::Kaithi
));
479 assert
!(ext
.contains_script(Script
::Mahajani
));
480 assert
!(ext
.contains_script(Script
::Modi
));
481 assert
!(ext
.contains_script(Script
::Khudawadi
));
482 assert
!(ext
.contains_script(Script
::Takri
));
483 assert
!(ext
.contains_script(Script
::Tirhuta
));
485 let scr
: Result
<Script
, _
> = ext
.try_into();
486 assert
!(scr
.is_err());
489 #[cfg(feature = "bench")]
491 fn bench_script_intersection(b
: &mut Bencher
) {
493 let script
= test
::black_box(Script
::Devanagari
);
494 let ext
= test
::black_box(script_extensions
::BENG_DEVA_DOGR_GONG_GONM_GRAN_GUJR_GURU_KNDA_MAHJ_MLYM_NAND_ORYA_SIND_SINH_SYLO_TAKR_TAML_TELU_TIRH
);
495 test
::black_box(ext
.intersection(script
.into()));
499 #[cfg(feature = "bench")]
501 fn bench_ext_to_script(b
: &mut Bencher
) {
502 let ext
: ScriptExtension
= Script
::Devanagari
.into();
504 let ext
= test
::black_box(ext
);
505 let script
: Result
<Script
, _
> = ext
.try_into();
506 let _
= test
::black_box(script
);
510 #[cfg(feature = "bench")]
512 fn bench_script_to_ext(b
: &mut Bencher
) {
514 let script
= test
::black_box(Script
::Devanagari
);
515 let ext
: ScriptExtension
= script
.into();
516 test
::black_box(ext
);
520 #[cfg(feature = "bench")]
522 fn bench_ext_intersection(b
: &mut Bencher
) {
524 let e1
= test
::black_box(script_extensions
::ARAB_ROHG_SYRC_THAA_YEZI
);
525 let e2
= test
::black_box(script_extensions
::BENG_DEVA_DOGR_GONG_GONM_GRAN_GUJR_GURU_KNDA_MAHJ_MLYM_NAND_ORYA_SIND_SINH_SYLO_TAKR_TAML_TELU_TIRH
);
526 test
::black_box(e2
.intersection(e1
));
530 #[cfg(feature = "bench")]
532 fn bench_to_vec(b
: &mut Bencher
) {
534 let ext
= test
::black_box(script_extensions
::BENG_DEVA_DOGR_GONG_GONM_GRAN_GUJR_GURU_KNDA_MAHJ_MLYM_NAND_ORYA_SIND_SINH_SYLO_TAKR_TAML_TELU_TIRH
);
535 test
::black_box(ext
.iter().collect
::<Vec
<_
>>());
539 #[cfg(feature = "bench")]
541 fn bench_string_ext(b
: &mut Bencher
) {
543 let s
= test
::black_box("सवव मानवी व्यद्क् जन्मतःच स्वतींत्र आहेत व त्ाींना समान प्रवतष्ठा व समान अविकार आहेत. त्ाींना ववचारशद्क् व सवविे कबुद्द्धलाभलेली आहे. व त्ाींनी एकमेकाींशी बींिुत्वाचाभावनेने आचरण करावे.");
544 test
::black_box(ScriptExtension
::for_str(s
));