9 use common
::{UcdFile, UcdFileByCodepoint, Codepoint, CodepointIter}
;
12 /// Represents a single row in the `UnicodeData.txt` file.
14 /// These fields were taken from UAX44, Table 9, as part of the documentation
16 /// [`UnicodeData.txt` file](http://www.unicode.org/reports/tr44/#UnicodeData.txt).
17 #[derive(Clone, Debug, Default, Eq, PartialEq)]
18 pub struct UnicodeData
{
19 /// The codepoint corresponding to this row.
20 pub codepoint
: Codepoint
,
21 /// The name of this codepoint.
23 /// The "general category" of this codepoint.
24 pub general_category
: String
,
25 /// The class of this codepoint used in the Canonical Ordering Algorithm.
27 /// Note that some classes map to a particular symbol. See
28 /// [UAX44, Table 15](http://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values).
29 pub canonical_combining_class
: u8,
30 /// The bidirectional class of this codepoint.
32 /// Possible values are listed in
33 /// [UAX44, Table 13](http://www.unicode.org/reports/tr44/#Bidi_Class_Values).
34 pub bidi_class
: String
,
35 /// The decomposition mapping for this codepoint. This includes its
36 /// formatting tag (if present).
37 pub decomposition
: UnicodeDataDecomposition
,
38 /// A decimal numeric representation of this codepoint, if it has the
39 /// property `Numeric_Type=Decimal`.
40 pub numeric_type_decimal
: Option
<u8>,
41 /// A decimal numeric representation of this codepoint, if it has the
42 /// property `Numeric_Type=Digit`. Note that while this field is still
43 /// populated for existing codepoints, no new codepoints will have this
45 pub numeric_type_digit
: Option
<u8>,
46 /// A decimal or rational numeric representation of this codepoint, if it
47 /// has the property `Numeric_Type=Numeric`.
48 pub numeric_type_numeric
: Option
<UnicodeDataNumeric
>,
49 /// A boolean indicating whether this codepoint is "mirrored" in
50 /// bidirectional text.
51 pub bidi_mirrored
: bool
,
52 /// The "old" Unicode 1.0 or ISO 6429 name of this codepoint. Note that
53 /// this field is empty unless it is significantly different from
55 pub unicode1_name
: String
,
56 /// The ISO 10464 comment field. This no longer contains any non-NULL
58 pub iso_comment
: String
,
59 /// This codepoint's simple uppercase mapping, if it exists.
60 pub simple_uppercase_mapping
: Option
<Codepoint
>,
61 /// This codepoint's simple lowercase mapping, if it exists.
62 pub simple_lowercase_mapping
: Option
<Codepoint
>,
63 /// This codepoint's simple titlecase mapping, if it exists.
64 pub simple_titlecase_mapping
: Option
<Codepoint
>,
67 impl UcdFile
for UnicodeData
{
68 fn relative_file_path() -> &'
static Path
{
69 Path
::new("UnicodeData.txt")
73 impl UcdFileByCodepoint
for UnicodeData
{
74 fn codepoints(&self) -> CodepointIter
{
75 self.codepoint
.into_iter()
80 /// Returns true if and only if this record corresponds to the start of a
82 pub fn is_range_start(&self) -> bool
{
83 self.name
.starts_with('
<'
)
84 && self.name
.ends_with('
>'
)
85 && self.name
.contains("First")
88 /// Returns true if and only if this record corresponds to the end of a
90 pub fn is_range_end(&self) -> bool
{
91 self.name
.starts_with('
<'
)
92 && self.name
.ends_with('
>'
)
93 && self.name
.contains("Last")
97 impl FromStr
for UnicodeData
{
100 fn from_str(line
: &str) -> Result
<UnicodeData
, Error
> {
102 static ref PARTS
: Regex
= Regex
::new(
105 ([A-Z0-9]+); # 1; codepoint
107 ([^;]+); # 3; general category
108 ([0-9]+); # 4; canonical combining class
109 ([^;]+); # 5; bidi class
110 ([^;]*); # 6; decomposition
111 ([0-9]*); # 7; numeric type decimal
112 ([0-9]*); # 8; numeric type digit
113 ([-0-9/]*); # 9; numeric type numeric
114 ([YN]); # 10; bidi mirrored
115 ([^;]*); # 11; unicode1 name
116 ([^;]*); # 12; ISO comment
117 ([^;]*); # 13; simple uppercase mapping
118 ([^;]*); # 14; simple lowercase mapping
119 ([^;]*) # 15; simple titlecase mapping
124 let caps
= match PARTS
.captures(line
.trim()) {
126 None
=> return err
!("invalid UnicodeData line"),
128 let capget
= |n
| caps
.get(n
).unwrap().as_str();
129 let mut data
= UnicodeData
::default();
131 data
.codepoint
= capget(1).parse()?
;
132 data
.name
= capget(2).to_string();
133 data
.general_category
= capget(3).to_string();
134 data
.canonical_combining_class
= match capget(4).parse() {
136 Err(err
) => return err
!(
137 "failed to parse canonical combining class '{}': {}",
140 data
.bidi_class
= capget(5).to_string();
141 if !caps
[6].is_empty() {
142 data
.decomposition
= caps
[6].parse()?
;
144 data
.decomposition
.push(data
.codepoint
)?
;
146 if !capget(7).is_empty() {
147 data
.numeric_type_decimal
= Some(match capget(7).parse() {
149 Err(err
) => return err
!(
150 "failed to parse numeric type decimal '{}': {}",
154 if !capget(8).is_empty() {
155 data
.numeric_type_digit
= Some(match capget(8).parse() {
157 Err(err
) => return err
!(
158 "failed to parse numeric type digit '{}': {}",
162 if !capget(9).is_empty() {
163 data
.numeric_type_numeric
= Some(capget(9).parse()?
);
165 data
.bidi_mirrored
= capget(10) == "Y";
166 data
.unicode1_name
= capget(11).to_string();
167 data
.iso_comment
= capget(12).to_string();
168 if !capget(13).is_empty() {
169 data
.simple_uppercase_mapping
= Some(capget(13).parse()?
);
171 if !capget(14).is_empty() {
172 data
.simple_lowercase_mapping
= Some(capget(14).parse()?
);
174 if !capget(15).is_empty() {
175 data
.simple_titlecase_mapping
= Some(capget(15).parse()?
);
181 impl fmt
::Display
for UnicodeData
{
182 fn fmt(&self, f
: &mut fmt
::Formatter
) -> fmt
::Result
{
183 write
!(f
, "{};", self.codepoint
)?
;
184 write
!(f
, "{};", self.name
)?
;
185 write
!(f
, "{};", self.general_category
)?
;
186 write
!(f
, "{};", self.canonical_combining_class
)?
;
187 write
!(f
, "{};", self.bidi_class
)?
;
188 if self.decomposition
.is_canonical()
189 && self.decomposition
.mapping() == &[self.codepoint
]
193 write
!(f
, "{};", self.decomposition
)?
;
195 if let Some(n
) = self.numeric_type_decimal
{
196 write
!(f
, "{};", n
)?
;
200 if let Some(n
) = self.numeric_type_digit
{
201 write
!(f
, "{};", n
)?
;
205 if let Some(n
) = self.numeric_type_numeric
{
206 write
!(f
, "{};", n
)?
;
210 write
!(f
, "{};", if self.bidi_mirrored { "Y" }
else { "N" }
)?
;
211 write
!(f
, "{};", self.unicode1_name
)?
;
212 write
!(f
, "{};", self.iso_comment
)?
;
213 if let Some(cp
) = self.simple_uppercase_mapping
{
214 write
!(f
, "{};", cp
)?
;
218 if let Some(cp
) = self.simple_lowercase_mapping
{
219 write
!(f
, "{};", cp
)?
;
223 if let Some(cp
) = self.simple_titlecase_mapping
{
224 write
!(f
, "{}", cp
)?
;
230 /// Represents a decomposition mapping of a single row in the
231 /// `UnicodeData.txt` file.
232 #[derive(Clone, Debug, Default, Eq, PartialEq)]
233 pub struct UnicodeDataDecomposition
{
234 /// The formatting tag associated with this mapping, if present.
235 pub tag
: Option
<UnicodeDataDecompositionTag
>,
236 /// The number of codepoints in this mapping.
238 /// The codepoints in the mapping. Entries beyond `len` in the mapping
239 /// are always U+0000. If no mapping was present, then this always contains
240 /// a single codepoint corresponding to this row's character.
241 pub mapping
: [Codepoint
; 18],
244 impl UnicodeDataDecomposition
{
245 /// Create a new decomposition mapping with the given tag and codepoints.
247 /// If there are too many codepoints, then an error is returned.
249 tag
: Option
<UnicodeDataDecompositionTag
>,
250 mapping
: &[Codepoint
],
251 ) -> Result
<UnicodeDataDecomposition
, Error
> {
252 let mut x
= UnicodeDataDecomposition
::default();
260 /// Add a new codepoint to this decomposition's mapping.
262 /// If the mapping is already full, then this returns an error.
263 pub fn push(&mut self, cp
: Codepoint
) -> Result
<(), Error
> {
264 if self.len
>= self.mapping
.len() {
265 return err
!("invalid decomposition mapping (too many codepoints)");
267 self.mapping
[self.len
] = cp
;
272 /// Return the mapping as a slice of codepoints. The slice returned
273 /// has length equivalent to the number of codepoints in this mapping.
274 pub fn mapping(&self) -> &[Codepoint
] {
275 &self.mapping
[..self.len
]
278 /// Returns true if and only if this decomposition mapping is canonical.
279 pub fn is_canonical(&self) -> bool
{
284 impl FromStr
for UnicodeDataDecomposition
{
287 fn from_str(s
: &str) -> Result
<UnicodeDataDecomposition
, Error
> {
289 static ref WITH_TAG
: Regex
= Regex
::new(
290 r
"^(?:<(?P<tag>[^>]+)>)?\s*(?P<chars>[\s0-9A-F]+)$"
292 static ref CHARS
: Regex
= Regex
::new(r
"[0-9A-F]+").unwrap();
295 return err
!("expected non-empty string for \
296 UnicodeDataDecomposition value");
298 let caps
= match WITH_TAG
.captures(s
) {
300 None
=> return err
!("invalid decomposition value"),
302 let mut decomp
= UnicodeDataDecomposition
::default();
303 let mut codepoints
= s
;
304 if let Some(m
) = caps
.name("tag") {
305 decomp
.tag
= Some(m
.as_str().parse()?
);
306 codepoints
= &caps
["chars"];
308 for m
in CHARS
.find_iter(codepoints
) {
309 let cp
= m
.as_str().parse()?
;
316 impl fmt
::Display
for UnicodeDataDecomposition
{
317 fn fmt(&self, f
: &mut fmt
::Formatter
) -> fmt
::Result
{
318 if let Some(ref tag
) = self.tag
{
319 write
!(f
, "<{}> ", tag
)?
;
321 let mut first
= true;
322 for cp
in self.mapping() {
327 write
!(f
, "{}", cp
)?
;
333 /// The formatting tag on a decomposition mapping.
335 /// This is taken from
336 /// [UAX44, Table 14](http://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings).
337 #[derive(Clone, Debug, Eq, PartialEq)]
338 pub enum UnicodeDataDecompositionTag
{
373 impl FromStr
for UnicodeDataDecompositionTag
{
376 fn from_str(s
: &str) -> Result
<UnicodeDataDecompositionTag
, Error
> {
377 use self::UnicodeDataDecompositionTag
::*;
380 "noBreak" => NoBreak
,
381 "initial" => Initial
,
384 "isolated" => Isolated
,
388 "vertical" => Vertical
,
393 "fraction" => Fraction
,
395 _
=> return err
!("invalid decomposition formatting tag: {}", s
),
400 impl fmt
::Display
for UnicodeDataDecompositionTag
{
401 fn fmt(&self, f
: &mut fmt
::Formatter
) -> fmt
::Result
{
402 use self::UnicodeDataDecompositionTag
::*;
403 let s
= match *self {
405 NoBreak
=> "noBreak",
406 Initial
=> "initial",
409 Isolated
=> "isolated",
413 Vertical
=> "vertical",
418 Fraction
=> "fraction",
425 /// A numeric value corresponding to characters with `Numeric_Type=Numeric`.
427 /// A numeric value can either be a signed integer or a rational number.
428 #[derive(Clone, Copy, Debug, Eq, PartialEq)]
429 pub enum UnicodeDataNumeric
{
432 /// A rational number. The first is the numerator and the latter is the
437 impl FromStr
for UnicodeDataNumeric
{
440 fn from_str(s
: &str) -> Result
<UnicodeDataNumeric
, Error
> {
443 "expected non-empty string for UnicodeDataNumeric value");
445 if let Some(pos
) = s
.find('
/'
) {
446 let (snum
, sden
) = (&s
[..pos
], &s
[pos
+1..]);
447 let num
= match snum
.parse() {
451 "invalid integer numerator '{}': {}", snum
, err
);
454 let den
= match sden
.parse() {
458 "invalid integer denominator '{}': {}", sden
, err
);
461 Ok(UnicodeDataNumeric
::Rational(num
, den
))
464 Ok(den
) => Ok(UnicodeDataNumeric
::Integer(den
)),
467 "invalid integer denominator '{}': {}", s
, err
);
474 impl fmt
::Display
for UnicodeDataNumeric
{
475 fn fmt(&self, f
: &mut fmt
::Formatter
) -> fmt
::Result
{
477 UnicodeDataNumeric
::Integer(n
) => write
!(f
, "{}", n
),
478 UnicodeDataNumeric
::Rational(n
, d
) => write
!(f
, "{}/{}", n
, d
),
483 /// An iterator adapter that expands rows in `UnicodeData.txt`.
485 /// Throughout `UnicodeData.txt`, some assigned codepoints are not explicitly
486 /// represented. Instead, they are represented by a pair of rows, indicating
487 /// a range of codepoints with the same properties. For example, the Hangul
488 /// syllable codepoints are represented by these two rows:
491 /// AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
492 /// D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
495 /// This iterator will wrap any iterator of `UnicodeData` and, when a range of
496 /// Unicode codepoints is found, it will be expanded to the appropriate
497 /// sequence of `UnicodeData` values. Note that all such expanded records will
498 /// have an empty name.
499 pub struct UnicodeDataExpander
<I
: Iterator
> {
500 /// The underlying iterator.
501 it
: iter
::Peekable
<I
>,
502 /// A range of codepoints to emit when we've found a pair. Otherwise,
504 range
: CodepointRange
,
507 struct CodepointRange
{
508 /// The codepoint range.
510 /// The start record. All subsequent records in this range are generated
511 /// by cloning this and updating the codepoint/name.
512 start_record
: UnicodeData
,
515 impl<I
: Iterator
<Item
=UnicodeData
>> UnicodeDataExpander
<I
> {
516 /// Create a new iterator that expands pairs of `UnicodeData` range
517 /// records. All other records are passed through as-is.
518 pub fn new
<T
>(it
: T
) -> UnicodeDataExpander
<I
>
519 where T
: IntoIterator
<IntoIter
=I
, Item
=I
::Item
>
521 UnicodeDataExpander
{
522 it
: it
.into_iter().peekable(),
523 range
: CodepointRange
{
525 start_record
: UnicodeData
::default(),
531 impl<I
: Iterator
<Item
=UnicodeData
>>
532 Iterator
for UnicodeDataExpander
<I
>
534 type Item
= UnicodeData
;
536 fn next(&mut self) -> Option
<UnicodeData
> {
537 if let Some(udata
) = self.range
.next() {
540 let row1
= match self.it
.next() {
544 if !row1
.is_range_start()
545 || !self.it
.peek().map_or(false, |row2
| row2
.is_range_end())
549 let row2
= self.it
.next().unwrap();
550 self.range
= CodepointRange
{
551 range
: row1
.codepoint
.value()..(row2
.codepoint
.value() + 1),
558 impl Iterator
for CodepointRange
{
559 type Item
= UnicodeData
;
561 fn next(&mut self) -> Option
<UnicodeData
> {
562 let cp
= match self.range
.next() {
567 codepoint
: Codepoint
::from_u32(cp
).unwrap(),
568 name
: "".to_string(),
569 ..self.start_record
.clone()
576 use common
::Codepoint
;
579 UnicodeData
, UnicodeDataNumeric
,
580 UnicodeDataDecomposition
, UnicodeDataDecompositionTag
,
583 fn codepoint(n
: u32) -> Codepoint
{
584 Codepoint
::from_u32(n
).unwrap()
587 fn s(string
: &str) -> String
{
593 let line
= "249D;PARENTHESIZED LATIN SMALL LETTER B;So;0;L;<compat> 0028 0062 0029;;;;N;;;;;\n";
594 let data
: UnicodeData
= line
.parse().unwrap();
595 assert_eq
!(data
, UnicodeData
{
596 codepoint
: codepoint(0x249d),
597 name
: s("PARENTHESIZED LATIN SMALL LETTER B"),
598 general_category
: s("So"),
599 canonical_combining_class
: 0,
601 decomposition
: UnicodeDataDecomposition
::new(
602 Some(UnicodeDataDecompositionTag
::Compat
),
603 &[codepoint(0x28), codepoint(0x62), codepoint(0x29)],
605 numeric_type_decimal
: None
,
606 numeric_type_digit
: None
,
607 numeric_type_numeric
: None
,
608 bidi_mirrored
: false,
609 unicode1_name
: s(""),
611 simple_uppercase_mapping
: None
,
612 simple_lowercase_mapping
: None
,
613 simple_titlecase_mapping
: None
,
619 let line
= "000D;<control>;Cc;0;B;;;;;N;CARRIAGE RETURN (CR);;;;\n";
620 let data
: UnicodeData
= line
.parse().unwrap();
621 assert_eq
!(data
, UnicodeData
{
622 codepoint
: codepoint(0x000D),
623 name
: s("<control>"),
624 general_category
: s("Cc"),
625 canonical_combining_class
: 0,
627 decomposition
: UnicodeDataDecomposition
::new(
628 None
, &[codepoint(0x000D)]).unwrap(),
629 numeric_type_decimal
: None
,
630 numeric_type_digit
: None
,
631 numeric_type_numeric
: None
,
632 bidi_mirrored
: false,
633 unicode1_name
: s("CARRIAGE RETURN (CR)"),
635 simple_uppercase_mapping
: None
,
636 simple_lowercase_mapping
: None
,
637 simple_titlecase_mapping
: None
,
643 let line
= "00BC;VULGAR FRACTION ONE QUARTER;No;0;ON;<fraction> 0031 2044 0034;;;1/4;N;FRACTION ONE QUARTER;;;;\n";
644 let data
: UnicodeData
= line
.parse().unwrap();
645 assert_eq
!(data
, UnicodeData
{
646 codepoint
: codepoint(0x00BC),
647 name
: s("VULGAR FRACTION ONE QUARTER"),
648 general_category
: s("No"),
649 canonical_combining_class
: 0,
651 decomposition
: UnicodeDataDecomposition
::new(
652 Some(UnicodeDataDecompositionTag
::Fraction
),
653 &[codepoint(0x31), codepoint(0x2044), codepoint(0x34)],
655 numeric_type_decimal
: None
,
656 numeric_type_digit
: None
,
657 numeric_type_numeric
: Some(UnicodeDataNumeric
::Rational(1, 4)),
658 bidi_mirrored
: false,
659 unicode1_name
: s("FRACTION ONE QUARTER"),
661 simple_uppercase_mapping
: None
,
662 simple_lowercase_mapping
: None
,
663 simple_titlecase_mapping
: None
,
669 let line
= "0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;\n";
670 let data
: UnicodeData
= line
.parse().unwrap();
671 assert_eq
!(data
, UnicodeData
{
672 codepoint
: codepoint(0x0041),
673 name
: s("LATIN CAPITAL LETTER A"),
674 general_category
: s("Lu"),
675 canonical_combining_class
: 0,
677 decomposition
: UnicodeDataDecomposition
::new(
678 None
, &[codepoint(0x0041)]).unwrap(),
679 numeric_type_decimal
: None
,
680 numeric_type_digit
: None
,
681 numeric_type_numeric
: None
,
682 bidi_mirrored
: false,
683 unicode1_name
: s(""),
685 simple_uppercase_mapping
: None
,
686 simple_lowercase_mapping
: Some(codepoint(0x0061)),
687 simple_titlecase_mapping
: None
,
693 let line
= "0F33;TIBETAN DIGIT HALF ZERO;No;0;L;;;;-1/2;N;;;;;\n";
694 let data
: UnicodeData
= line
.parse().unwrap();
695 assert_eq
!(data
, UnicodeData
{
696 codepoint
: codepoint(0x0F33),
697 name
: s("TIBETAN DIGIT HALF ZERO"),
698 general_category
: s("No"),
699 canonical_combining_class
: 0,
701 decomposition
: UnicodeDataDecomposition
::new(
702 None
, &[codepoint(0x0F33)]).unwrap(),
703 numeric_type_decimal
: None
,
704 numeric_type_digit
: None
,
705 numeric_type_numeric
: Some(UnicodeDataNumeric
::Rational(-1, 2)),
706 bidi_mirrored
: false,
707 unicode1_name
: s(""),
709 simple_uppercase_mapping
: None
,
710 simple_lowercase_mapping
: None
,
711 simple_titlecase_mapping
: None
,
717 use common
::UcdLineParser
;
718 use super::UnicodeDataExpander
;
721 ABF9;MEETEI MAYEK DIGIT NINE;Nd;0;L;;9;9;9;N;;;;;
722 AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
723 D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
724 D7B0;HANGUL JUNGSEONG O-YEO;Lo;0;L;;;;;N;;;;;
726 let records
= UcdLineParser
::new(None
, data
.as_bytes())
727 .collect
::<Result
<Vec
<_
>, _
>>()
729 assert_eq
!(UnicodeDataExpander
::new(records
).count(), 11174);