]> git.proxmox.com Git - rustc.git/blob - vendor/ucd-parse/src/unicode_data.rs
New upstream version 1.42.0+dfsg1
[rustc.git] / vendor / ucd-parse / src / unicode_data.rs
1 use std::fmt;
2 use std::iter;
3 use std::ops::Range;
4 use std::path::Path;
5 use std::str::FromStr;
6
7 use regex::Regex;
8
9 use common::{UcdFile, UcdFileByCodepoint, Codepoint, CodepointIter};
10 use error::Error;
11
12 /// Represents a single row in the `UnicodeData.txt` file.
13 ///
14 /// These fields were taken from UAX44, Table 9, as part of the documentation
15 /// for the
16 /// [`UnicodeData.txt` file](http://www.unicode.org/reports/tr44/#UnicodeData.txt).
17 #[derive(Clone, Debug, Default, Eq, PartialEq)]
18 pub struct UnicodeData {
19 /// The codepoint corresponding to this row.
20 pub codepoint: Codepoint,
21 /// The name of this codepoint.
22 pub name: String,
23 /// The "general category" of this codepoint.
24 pub general_category: String,
25 /// The class of this codepoint used in the Canonical Ordering Algorithm.
26 ///
27 /// Note that some classes map to a particular symbol. See
28 /// [UAX44, Table 15](http://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values).
29 pub canonical_combining_class: u8,
30 /// The bidirectional class of this codepoint.
31 ///
32 /// Possible values are listed in
33 /// [UAX44, Table 13](http://www.unicode.org/reports/tr44/#Bidi_Class_Values).
34 pub bidi_class: String,
35 /// The decomposition mapping for this codepoint. This includes its
36 /// formatting tag (if present).
37 pub decomposition: UnicodeDataDecomposition,
38 /// A decimal numeric representation of this codepoint, if it has the
39 /// property `Numeric_Type=Decimal`.
40 pub numeric_type_decimal: Option<u8>,
41 /// A decimal numeric representation of this codepoint, if it has the
42 /// property `Numeric_Type=Digit`. Note that while this field is still
43 /// populated for existing codepoints, no new codepoints will have this
44 /// field populated.
45 pub numeric_type_digit: Option<u8>,
46 /// A decimal or rational numeric representation of this codepoint, if it
47 /// has the property `Numeric_Type=Numeric`.
48 pub numeric_type_numeric: Option<UnicodeDataNumeric>,
49 /// A boolean indicating whether this codepoint is "mirrored" in
50 /// bidirectional text.
51 pub bidi_mirrored: bool,
52 /// The "old" Unicode 1.0 or ISO 6429 name of this codepoint. Note that
53 /// this field is empty unless it is significantly different from
54 /// the `name` field.
55 pub unicode1_name: String,
56 /// The ISO 10464 comment field. This no longer contains any non-NULL
57 /// values.
58 pub iso_comment: String,
59 /// This codepoint's simple uppercase mapping, if it exists.
60 pub simple_uppercase_mapping: Option<Codepoint>,
61 /// This codepoint's simple lowercase mapping, if it exists.
62 pub simple_lowercase_mapping: Option<Codepoint>,
63 /// This codepoint's simple titlecase mapping, if it exists.
64 pub simple_titlecase_mapping: Option<Codepoint>,
65 }
66
67 impl UcdFile for UnicodeData {
68 fn relative_file_path() -> &'static Path {
69 Path::new("UnicodeData.txt")
70 }
71 }
72
73 impl UcdFileByCodepoint for UnicodeData {
74 fn codepoints(&self) -> CodepointIter {
75 self.codepoint.into_iter()
76 }
77 }
78
79 impl UnicodeData {
80 /// Returns true if and only if this record corresponds to the start of a
81 /// range.
82 pub fn is_range_start(&self) -> bool {
83 self.name.starts_with('<')
84 && self.name.ends_with('>')
85 && self.name.contains("First")
86 }
87
88 /// Returns true if and only if this record corresponds to the end of a
89 /// range.
90 pub fn is_range_end(&self) -> bool {
91 self.name.starts_with('<')
92 && self.name.ends_with('>')
93 && self.name.contains("Last")
94 }
95 }
96
97 impl FromStr for UnicodeData {
98 type Err = Error;
99
100 fn from_str(line: &str) -> Result<UnicodeData, Error> {
101 lazy_static! {
102 static ref PARTS: Regex = Regex::new(
103 r"(?x)
104 ^
105 ([A-Z0-9]+); # 1; codepoint
106 ([^;]+); # 2; name
107 ([^;]+); # 3; general category
108 ([0-9]+); # 4; canonical combining class
109 ([^;]+); # 5; bidi class
110 ([^;]*); # 6; decomposition
111 ([0-9]*); # 7; numeric type decimal
112 ([0-9]*); # 8; numeric type digit
113 ([-0-9/]*); # 9; numeric type numeric
114 ([YN]); # 10; bidi mirrored
115 ([^;]*); # 11; unicode1 name
116 ([^;]*); # 12; ISO comment
117 ([^;]*); # 13; simple uppercase mapping
118 ([^;]*); # 14; simple lowercase mapping
119 ([^;]*) # 15; simple titlecase mapping
120 $
121 "
122 ).unwrap();
123 };
124 let caps = match PARTS.captures(line.trim()) {
125 Some(caps) => caps,
126 None => return err!("invalid UnicodeData line"),
127 };
128 let capget = |n| caps.get(n).unwrap().as_str();
129 let mut data = UnicodeData::default();
130
131 data.codepoint = capget(1).parse()?;
132 data.name = capget(2).to_string();
133 data.general_category = capget(3).to_string();
134 data.canonical_combining_class = match capget(4).parse() {
135 Ok(n) => n,
136 Err(err) => return err!(
137 "failed to parse canonical combining class '{}': {}",
138 capget(4), err),
139 };
140 data.bidi_class = capget(5).to_string();
141 if !caps[6].is_empty() {
142 data.decomposition = caps[6].parse()?;
143 } else {
144 data.decomposition.push(data.codepoint)?;
145 }
146 if !capget(7).is_empty() {
147 data.numeric_type_decimal = Some(match capget(7).parse() {
148 Ok(n) => n,
149 Err(err) => return err!(
150 "failed to parse numeric type decimal '{}': {}",
151 capget(7), err),
152 });
153 }
154 if !capget(8).is_empty() {
155 data.numeric_type_digit = Some(match capget(8).parse() {
156 Ok(n) => n,
157 Err(err) => return err!(
158 "failed to parse numeric type digit '{}': {}",
159 capget(8), err),
160 });
161 }
162 if !capget(9).is_empty() {
163 data.numeric_type_numeric = Some(capget(9).parse()?);
164 }
165 data.bidi_mirrored = capget(10) == "Y";
166 data.unicode1_name = capget(11).to_string();
167 data.iso_comment = capget(12).to_string();
168 if !capget(13).is_empty() {
169 data.simple_uppercase_mapping = Some(capget(13).parse()?);
170 }
171 if !capget(14).is_empty() {
172 data.simple_lowercase_mapping = Some(capget(14).parse()?);
173 }
174 if !capget(15).is_empty() {
175 data.simple_titlecase_mapping = Some(capget(15).parse()?);
176 }
177 Ok(data)
178 }
179 }
180
181 impl fmt::Display for UnicodeData {
182 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
183 write!(f, "{};", self.codepoint)?;
184 write!(f, "{};", self.name)?;
185 write!(f, "{};", self.general_category)?;
186 write!(f, "{};", self.canonical_combining_class)?;
187 write!(f, "{};", self.bidi_class)?;
188 if self.decomposition.is_canonical()
189 && self.decomposition.mapping() == &[self.codepoint]
190 {
191 write!(f, ";")?;
192 } else {
193 write!(f, "{};", self.decomposition)?;
194 }
195 if let Some(n) = self.numeric_type_decimal {
196 write!(f, "{};", n)?;
197 } else {
198 write!(f, ";")?;
199 }
200 if let Some(n) = self.numeric_type_digit {
201 write!(f, "{};", n)?;
202 } else {
203 write!(f, ";")?;
204 }
205 if let Some(n) = self.numeric_type_numeric {
206 write!(f, "{};", n)?;
207 } else {
208 write!(f, ";")?;
209 }
210 write!(f, "{};", if self.bidi_mirrored { "Y" } else { "N" })?;
211 write!(f, "{};", self.unicode1_name)?;
212 write!(f, "{};", self.iso_comment)?;
213 if let Some(cp) = self.simple_uppercase_mapping {
214 write!(f, "{};", cp)?;
215 } else {
216 write!(f, ";")?;
217 }
218 if let Some(cp) = self.simple_lowercase_mapping {
219 write!(f, "{};", cp)?;
220 } else {
221 write!(f, ";")?;
222 }
223 if let Some(cp) = self.simple_titlecase_mapping {
224 write!(f, "{}", cp)?;
225 }
226 Ok(())
227 }
228 }
229
230 /// Represents a decomposition mapping of a single row in the
231 /// `UnicodeData.txt` file.
232 #[derive(Clone, Debug, Default, Eq, PartialEq)]
233 pub struct UnicodeDataDecomposition {
234 /// The formatting tag associated with this mapping, if present.
235 pub tag: Option<UnicodeDataDecompositionTag>,
236 /// The number of codepoints in this mapping.
237 pub len: usize,
238 /// The codepoints in the mapping. Entries beyond `len` in the mapping
239 /// are always U+0000. If no mapping was present, then this always contains
240 /// a single codepoint corresponding to this row's character.
241 pub mapping: [Codepoint; 18],
242 }
243
244 impl UnicodeDataDecomposition {
245 /// Create a new decomposition mapping with the given tag and codepoints.
246 ///
247 /// If there are too many codepoints, then an error is returned.
248 pub fn new(
249 tag: Option<UnicodeDataDecompositionTag>,
250 mapping: &[Codepoint],
251 ) -> Result<UnicodeDataDecomposition, Error> {
252 let mut x = UnicodeDataDecomposition::default();
253 x.tag = tag;
254 for &cp in mapping {
255 x.push(cp)?;
256 }
257 Ok(x)
258 }
259
260 /// Add a new codepoint to this decomposition's mapping.
261 ///
262 /// If the mapping is already full, then this returns an error.
263 pub fn push(&mut self, cp: Codepoint) -> Result<(), Error> {
264 if self.len >= self.mapping.len() {
265 return err!("invalid decomposition mapping (too many codepoints)");
266 }
267 self.mapping[self.len] = cp;
268 self.len += 1;
269 Ok(())
270 }
271
272 /// Return the mapping as a slice of codepoints. The slice returned
273 /// has length equivalent to the number of codepoints in this mapping.
274 pub fn mapping(&self) -> &[Codepoint] {
275 &self.mapping[..self.len]
276 }
277
278 /// Returns true if and only if this decomposition mapping is canonical.
279 pub fn is_canonical(&self) -> bool {
280 self.tag.is_none()
281 }
282 }
283
284 impl FromStr for UnicodeDataDecomposition {
285 type Err = Error;
286
287 fn from_str(s: &str) -> Result<UnicodeDataDecomposition, Error> {
288 lazy_static! {
289 static ref WITH_TAG: Regex = Regex::new(
290 r"^(?:<(?P<tag>[^>]+)>)?\s*(?P<chars>[\s0-9A-F]+)$"
291 ).unwrap();
292 static ref CHARS: Regex = Regex::new(r"[0-9A-F]+").unwrap();
293 };
294 if s.is_empty() {
295 return err!("expected non-empty string for \
296 UnicodeDataDecomposition value");
297 }
298 let caps = match WITH_TAG.captures(s) {
299 Some(caps) => caps,
300 None => return err!("invalid decomposition value"),
301 };
302 let mut decomp = UnicodeDataDecomposition::default();
303 let mut codepoints = s;
304 if let Some(m) = caps.name("tag") {
305 decomp.tag = Some(m.as_str().parse()?);
306 codepoints = &caps["chars"];
307 }
308 for m in CHARS.find_iter(codepoints) {
309 let cp = m.as_str().parse()?;
310 decomp.push(cp)?;
311 }
312 Ok(decomp)
313 }
314 }
315
316 impl fmt::Display for UnicodeDataDecomposition {
317 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
318 if let Some(ref tag) = self.tag {
319 write!(f, "<{}> ", tag)?;
320 }
321 let mut first = true;
322 for cp in self.mapping() {
323 if !first {
324 write!(f, " ")?;
325 }
326 first = false;
327 write!(f, "{}", cp)?;
328 }
329 Ok(())
330 }
331 }
332
/// The formatting tag that may precede a decomposition mapping.
///
/// The variants are taken from
/// [UAX44, Table 14](http://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings).
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum UnicodeDataDecompositionTag {
    /// The `<font>` tag.
    Font,
    /// The `<noBreak>` tag.
    NoBreak,
    /// The `<initial>` tag.
    Initial,
    /// The `<medial>` tag.
    Medial,
    /// The `<final>` tag.
    Final,
    /// The `<isolated>` tag.
    Isolated,
    /// The `<circle>` tag.
    Circle,
    /// The `<super>` tag.
    Super,
    /// The `<sub>` tag.
    Sub,
    /// The `<vertical>` tag.
    Vertical,
    /// The `<wide>` tag.
    Wide,
    /// The `<narrow>` tag.
    Narrow,
    /// The `<small>` tag.
    Small,
    /// The `<square>` tag.
    Square,
    /// The `<fraction>` tag.
    Fraction,
    /// The `<compat>` tag.
    Compat,
}
372
373 impl FromStr for UnicodeDataDecompositionTag {
374 type Err = Error;
375
376 fn from_str(s: &str) -> Result<UnicodeDataDecompositionTag, Error> {
377 use self::UnicodeDataDecompositionTag::*;
378 Ok(match s {
379 "font" => Font,
380 "noBreak" => NoBreak,
381 "initial" => Initial,
382 "medial" => Medial,
383 "final" => Final,
384 "isolated" => Isolated,
385 "circle" => Circle,
386 "super" => Super,
387 "sub" => Sub,
388 "vertical" => Vertical,
389 "wide" => Wide,
390 "narrow" => Narrow,
391 "small" => Small,
392 "square" => Square,
393 "fraction" => Fraction,
394 "compat" => Compat,
395 _ => return err!("invalid decomposition formatting tag: {}", s),
396 })
397 }
398 }
399
400 impl fmt::Display for UnicodeDataDecompositionTag {
401 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
402 use self::UnicodeDataDecompositionTag::*;
403 let s = match *self {
404 Font => "font",
405 NoBreak => "noBreak",
406 Initial => "initial",
407 Medial => "medial",
408 Final => "final",
409 Isolated => "isolated",
410 Circle => "circle",
411 Super => "super",
412 Sub => "sub",
413 Vertical => "vertical",
414 Wide => "wide",
415 Narrow => "narrow",
416 Small => "small",
417 Square => "square",
418 Fraction => "fraction",
419 Compat => "compat",
420 };
421 write!(f, "{}", s)
422 }
423 }
424
/// The numeric value of a character with `Numeric_Type=Numeric`.
///
/// The value is either a signed integer or a rational number.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum UnicodeDataNumeric {
    /// A signed integer value.
    Integer(i64),
    /// A rational value: the numerator first, then the denominator.
    Rational(i64, i64),
}
436
437 impl FromStr for UnicodeDataNumeric {
438 type Err = Error;
439
440 fn from_str(s: &str) -> Result<UnicodeDataNumeric, Error> {
441 if s.is_empty() {
442 return err!(
443 "expected non-empty string for UnicodeDataNumeric value");
444 }
445 if let Some(pos) = s.find('/') {
446 let (snum, sden) = (&s[..pos], &s[pos+1..]);
447 let num = match snum.parse() {
448 Ok(num) => num,
449 Err(err) => {
450 return err!(
451 "invalid integer numerator '{}': {}", snum, err);
452 }
453 };
454 let den = match sden.parse() {
455 Ok(den) => den,
456 Err(err) => {
457 return err!(
458 "invalid integer denominator '{}': {}", sden, err);
459 }
460 };
461 Ok(UnicodeDataNumeric::Rational(num, den))
462 } else {
463 match s.parse() {
464 Ok(den) => Ok(UnicodeDataNumeric::Integer(den)),
465 Err(err) => {
466 return err!(
467 "invalid integer denominator '{}': {}", s, err);
468 }
469 }
470 }
471 }
472 }
473
474 impl fmt::Display for UnicodeDataNumeric {
475 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
476 match *self {
477 UnicodeDataNumeric::Integer(n) => write!(f, "{}", n),
478 UnicodeDataNumeric::Rational(n, d) => write!(f, "{}/{}", n, d),
479 }
480 }
481 }
482
483 /// An iterator adapter that expands rows in `UnicodeData.txt`.
484 ///
485 /// Throughout `UnicodeData.txt`, some assigned codepoints are not explicitly
486 /// represented. Instead, they are represented by a pair of rows, indicating
487 /// a range of codepoints with the same properties. For example, the Hangul
488 /// syllable codepoints are represented by these two rows:
489 ///
490 /// ```ignore
491 /// AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
492 /// D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
493 /// ```
494 ///
495 /// This iterator will wrap any iterator of `UnicodeData` and, when a range of
496 /// Unicode codepoints is found, it will be expanded to the appropriate
497 /// sequence of `UnicodeData` values. Note that all such expanded records will
498 /// have an empty name.
499 pub struct UnicodeDataExpander<I: Iterator> {
500 /// The underlying iterator.
501 it: iter::Peekable<I>,
502 /// A range of codepoints to emit when we've found a pair. Otherwise,
503 /// `None`.
504 range: CodepointRange,
505 }
506
507 struct CodepointRange {
508 /// The codepoint range.
509 range: Range<u32>,
510 /// The start record. All subsequent records in this range are generated
511 /// by cloning this and updating the codepoint/name.
512 start_record: UnicodeData,
513 }
514
515 impl<I: Iterator<Item=UnicodeData>> UnicodeDataExpander<I> {
516 /// Create a new iterator that expands pairs of `UnicodeData` range
517 /// records. All other records are passed through as-is.
518 pub fn new<T>(it: T) -> UnicodeDataExpander<I>
519 where T: IntoIterator<IntoIter=I, Item=I::Item>
520 {
521 UnicodeDataExpander {
522 it: it.into_iter().peekable(),
523 range: CodepointRange {
524 range: 0..0,
525 start_record: UnicodeData::default(),
526 },
527 }
528 }
529 }
530
531 impl<I: Iterator<Item=UnicodeData>>
532 Iterator for UnicodeDataExpander<I>
533 {
534 type Item = UnicodeData;
535
536 fn next(&mut self) -> Option<UnicodeData> {
537 if let Some(udata) = self.range.next() {
538 return Some(udata);
539 }
540 let row1 = match self.it.next() {
541 None => return None,
542 Some(row1) => row1,
543 };
544 if !row1.is_range_start()
545 || !self.it.peek().map_or(false, |row2| row2.is_range_end())
546 {
547 return Some(row1)
548 }
549 let row2 = self.it.next().unwrap();
550 self.range = CodepointRange {
551 range: row1.codepoint.value()..(row2.codepoint.value() + 1),
552 start_record: row1,
553 };
554 self.next()
555 }
556 }
557
558 impl Iterator for CodepointRange {
559 type Item = UnicodeData;
560
561 fn next(&mut self) -> Option<UnicodeData> {
562 let cp = match self.range.next() {
563 None => return None,
564 Some(cp) => cp,
565 };
566 Some(UnicodeData {
567 codepoint: Codepoint::from_u32(cp).unwrap(),
568 name: "".to_string(),
569 ..self.start_record.clone()
570 })
571 }
572 }
573
#[cfg(test)]
mod tests {
    use common::Codepoint;

    use super::{
        UnicodeData, UnicodeDataNumeric,
        UnicodeDataDecomposition, UnicodeDataDecompositionTag,
    };

    /// Builds a `Codepoint` from a scalar that is known to be valid.
    fn codepoint(n: u32) -> Codepoint {
        Codepoint::from_u32(n).unwrap()
    }

    /// Shorthand for creating an owned `String`.
    fn s(string: &str) -> String {
        string.to_string()
    }

    /// Parses a line that is expected to be a valid `UnicodeData.txt` row.
    fn parse(line: &str) -> UnicodeData {
        line.parse().unwrap()
    }

    // In the expected values below, fields left out of the struct literal
    // take their `Default` value: `0`, `false`, `None` or the empty string.

    #[test]
    fn parse1() {
        let line = "249D;PARENTHESIZED LATIN SMALL LETTER B;So;0;L;<compat> 0028 0062 0029;;;;N;;;;;\n";
        let expected = UnicodeData {
            codepoint: codepoint(0x249d),
            name: s("PARENTHESIZED LATIN SMALL LETTER B"),
            general_category: s("So"),
            bidi_class: s("L"),
            decomposition: UnicodeDataDecomposition::new(
                Some(UnicodeDataDecompositionTag::Compat),
                &[codepoint(0x28), codepoint(0x62), codepoint(0x29)],
            ).unwrap(),
            ..UnicodeData::default()
        };
        assert_eq!(parse(line), expected);
    }

    #[test]
    fn parse2() {
        let line = "000D;<control>;Cc;0;B;;;;;N;CARRIAGE RETURN (CR);;;;\n";
        let expected = UnicodeData {
            codepoint: codepoint(0x000D),
            name: s("<control>"),
            general_category: s("Cc"),
            bidi_class: s("B"),
            decomposition: UnicodeDataDecomposition::new(
                None, &[codepoint(0x000D)]).unwrap(),
            unicode1_name: s("CARRIAGE RETURN (CR)"),
            ..UnicodeData::default()
        };
        assert_eq!(parse(line), expected);
    }

    #[test]
    fn parse3() {
        let line = "00BC;VULGAR FRACTION ONE QUARTER;No;0;ON;<fraction> 0031 2044 0034;;;1/4;N;FRACTION ONE QUARTER;;;;\n";
        let expected = UnicodeData {
            codepoint: codepoint(0x00BC),
            name: s("VULGAR FRACTION ONE QUARTER"),
            general_category: s("No"),
            bidi_class: s("ON"),
            decomposition: UnicodeDataDecomposition::new(
                Some(UnicodeDataDecompositionTag::Fraction),
                &[codepoint(0x31), codepoint(0x2044), codepoint(0x34)],
            ).unwrap(),
            numeric_type_numeric: Some(UnicodeDataNumeric::Rational(1, 4)),
            unicode1_name: s("FRACTION ONE QUARTER"),
            ..UnicodeData::default()
        };
        assert_eq!(parse(line), expected);
    }

    #[test]
    fn parse4() {
        let line = "0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;\n";
        let expected = UnicodeData {
            codepoint: codepoint(0x0041),
            name: s("LATIN CAPITAL LETTER A"),
            general_category: s("Lu"),
            bidi_class: s("L"),
            decomposition: UnicodeDataDecomposition::new(
                None, &[codepoint(0x0041)]).unwrap(),
            simple_lowercase_mapping: Some(codepoint(0x0061)),
            ..UnicodeData::default()
        };
        assert_eq!(parse(line), expected);
    }

    #[test]
    fn parse5() {
        let line = "0F33;TIBETAN DIGIT HALF ZERO;No;0;L;;;;-1/2;N;;;;;\n";
        let expected = UnicodeData {
            codepoint: codepoint(0x0F33),
            name: s("TIBETAN DIGIT HALF ZERO"),
            general_category: s("No"),
            bidi_class: s("L"),
            decomposition: UnicodeDataDecomposition::new(
                None, &[codepoint(0x0F33)]).unwrap(),
            numeric_type_numeric: Some(UnicodeDataNumeric::Rational(-1, 2)),
            ..UnicodeData::default()
        };
        assert_eq!(parse(line), expected);
    }

    #[test]
    fn expander() {
        use common::UcdLineParser;
        use super::UnicodeDataExpander;

        let data = "\
ABF9;MEETEI MAYEK DIGIT NINE;Nd;0;L;;9;9;9;N;;;;;
AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
D7B0;HANGUL JUNGSEONG O-YEO;Lo;0;L;;;;;N;;;;;
";
        let records = UcdLineParser::new(None, data.as_bytes())
            .collect::<Result<Vec<_>, _>>()
            .unwrap();
        // Two ordinary rows plus the 0xD7A3 - 0xAC00 + 1 = 11172 codepoints
        // of the expanded Hangul syllable range.
        let expanded = UnicodeDataExpander::new(records);
        assert_eq!(expanded.count(), 11174);
    }
}