vendor/ucd-parse/src/unicode_data.rs

   1 use std::fmt;
   2 use std::iter;
   3 use std::ops::Range;
   4 use std::path::Path;
   5 use std::str::FromStr;
   6
   7 use regex::Regex;
   8
   9 use common::{UcdFile, UcdFileByCodepoint, Codepoint, CodepointIter};
  10 use error::Error;
  11
/// Represents a single row in the `UnicodeData.txt` file.
///
/// These fields were taken from UAX44, Table 9, as part of the documentation
/// for the
/// [`UnicodeData.txt` file](http://www.unicode.org/reports/tr44/#UnicodeData.txt).
///
/// A row can be parsed from its semicolon-separated textual form via
/// `FromStr` and written back in that form via `Display`.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct UnicodeData {
    /// The codepoint corresponding to this row.
    pub codepoint: Codepoint,
    /// The name of this codepoint.
    ///
    /// Rows that delimit a range carry a bracketed marker here instead of a
    /// real name, e.g. `<Hangul Syllable, First>`.
    pub name: String,
    /// The "general category" of this codepoint.
    pub general_category: String,
    /// The class of this codepoint used in the Canonical Ordering Algorithm.
    ///
    /// Note that some classes map to a particular symbol. See
    /// [UAX44, Table 15](http://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values).
    pub canonical_combining_class: u8,
    /// The bidirectional class of this codepoint.
    ///
    /// Possible values are listed in
    /// [UAX44, Table 13](http://www.unicode.org/reports/tr44/#Bidi_Class_Values).
    pub bidi_class: String,
    /// The decomposition mapping for this codepoint. This includes its
    /// formatting tag (if present).
    ///
    /// When a parsed row has an empty decomposition field, the parser stores
    /// a tag-less mapping that contains only this row's own codepoint.
    pub decomposition: UnicodeDataDecomposition,
    /// A decimal numeric representation of this codepoint, if it has the
    /// property `Numeric_Type=Decimal`.
    pub numeric_type_decimal: Option<u8>,
    /// A decimal numeric representation of this codepoint, if it has the
    /// property `Numeric_Type=Digit`. Note that while this field is still
    /// populated for existing codepoints, no new codepoints will have this
    /// field populated.
    pub numeric_type_digit: Option<u8>,
    /// A decimal or rational numeric representation of this codepoint, if it
    /// has the property `Numeric_Type=Numeric`.
    pub numeric_type_numeric: Option<UnicodeDataNumeric>,
    /// A boolean indicating whether this codepoint is "mirrored" in
    /// bidirectional text.
    pub bidi_mirrored: bool,
    /// The "old" Unicode 1.0 or ISO 6429 name of this codepoint. Note that
    /// this field is empty unless it is significantly different from
    /// the `name` field.
    pub unicode1_name: String,
    /// The ISO 10646 comment field. This no longer contains any non-NULL
    /// values.
    pub iso_comment: String,
    /// This codepoint's simple uppercase mapping, if it exists.
    pub simple_uppercase_mapping: Option<Codepoint>,
    /// This codepoint's simple lowercase mapping, if it exists.
    pub simple_lowercase_mapping: Option<Codepoint>,
    /// This codepoint's simple titlecase mapping, if it exists.
    pub simple_titlecase_mapping: Option<Codepoint>,
}
  66
  67 impl UcdFile for UnicodeData {
  68     fn relative_file_path() -> &'static Path {
  69         Path::new("UnicodeData.txt")
  70     }
  71 }
  72
  73 impl UcdFileByCodepoint for UnicodeData {
  74     fn codepoints(&self) -> CodepointIter {
  75         self.codepoint.into_iter()
  76     }
  77 }
  78
  79 impl UnicodeData {
  80     /// Returns true if and only if this record corresponds to the start of a
  81     /// range.
  82     pub fn is_range_start(&self) -> bool {
  83         self.name.starts_with('<')
  84         && self.name.ends_with('>')
  85         && self.name.contains("First")
  86     }
  87
  88     /// Returns true if and only if this record corresponds to the end of a
  89     /// range.
  90     pub fn is_range_end(&self) -> bool {
  91         self.name.starts_with('<')
  92         && self.name.ends_with('>')
  93         && self.name.contains("Last")
  94     }
  95 }
  96
  97 impl FromStr for UnicodeData {
  98     type Err = Error;
  99
 100     fn from_str(line: &str) -> Result<UnicodeData, Error> {
 101         lazy_static! {
 102             static ref PARTS: Regex = Regex::new(
 103                 r"(?x)
 104                 ^
 105                 ([A-Z0-9]+);  #  1; codepoint
 106                 ([^;]+);      #  2; name
 107                 ([^;]+);      #  3; general category
 108                 ([0-9]+);     #  4; canonical combining class
 109                 ([^;]+);      #  5; bidi class
 110                 ([^;]*);      #  6; decomposition
 111                 ([0-9]*);     #  7; numeric type decimal
 112                 ([0-9]*);     #  8; numeric type digit
 113                 ([-0-9/]*);   #  9; numeric type numeric
 114                 ([YN]);       # 10; bidi mirrored
 115                 ([^;]*);      # 11; unicode1 name
 116                 ([^;]*);      # 12; ISO comment
 117                 ([^;]*);      # 13; simple uppercase mapping
 118                 ([^;]*);      # 14; simple lowercase mapping
 119                 ([^;]*)       # 15; simple titlecase mapping
 120                 $
 121                 "
 122             ).unwrap();
 123         };
 124         let caps = match PARTS.captures(line.trim()) {
 125             Some(caps) => caps,
 126             None => return err!("invalid UnicodeData line"),
 127         };
 128         let capget = |n| caps.get(n).unwrap().as_str();
 129         let mut data = UnicodeData::default();
 130
 131         data.codepoint = capget(1).parse()?;
 132         data.name = capget(2).to_string();
 133         data.general_category = capget(3).to_string();
 134         data.canonical_combining_class = match capget(4).parse() {
 135             Ok(n) => n,
 136             Err(err) => return err!(
 137                 "failed to parse canonical combining class '{}': {}",
 138                 capget(4), err),
 139         };
 140         data.bidi_class = capget(5).to_string();
 141         if !caps[6].is_empty() {
 142             data.decomposition = caps[6].parse()?;
 143         } else {
 144             data.decomposition.push(data.codepoint)?;
 145         }
 146         if !capget(7).is_empty() {
 147             data.numeric_type_decimal = Some(match capget(7).parse() {
 148                 Ok(n) => n,
 149                 Err(err) => return err!(
 150                     "failed to parse numeric type decimal '{}': {}",
 151                     capget(7), err),
 152             });
 153         }
 154         if !capget(8).is_empty() {
 155             data.numeric_type_digit = Some(match capget(8).parse() {
 156                 Ok(n) => n,
 157                 Err(err) => return err!(
 158                     "failed to parse numeric type digit '{}': {}",
 159                     capget(8), err),
 160             });
 161         }
 162         if !capget(9).is_empty() {
 163             data.numeric_type_numeric = Some(capget(9).parse()?);
 164         }
 165         data.bidi_mirrored = capget(10) == "Y";
 166         data.unicode1_name = capget(11).to_string();
 167         data.iso_comment = capget(12).to_string();
 168         if !capget(13).is_empty() {
 169             data.simple_uppercase_mapping = Some(capget(13).parse()?);
 170         }
 171         if !capget(14).is_empty() {
 172             data.simple_lowercase_mapping = Some(capget(14).parse()?);
 173         }
 174         if !capget(15).is_empty() {
 175             data.simple_titlecase_mapping = Some(capget(15).parse()?);
 176         }
 177         Ok(data)
 178     }
 179 }
 180
 181 impl fmt::Display for UnicodeData {
 182     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
 183         write!(f, "{};", self.codepoint)?;
 184         write!(f, "{};", self.name)?;
 185         write!(f, "{};", self.general_category)?;
 186         write!(f, "{};", self.canonical_combining_class)?;
 187         write!(f, "{};", self.bidi_class)?;
 188         if self.decomposition.is_canonical()
 189             && self.decomposition.mapping() == &[self.codepoint]
 190         {
 191             write!(f, ";")?;
 192         } else {
 193             write!(f, "{};", self.decomposition)?;
 194         }
 195         if let Some(n) = self.numeric_type_decimal {
 196             write!(f, "{};", n)?;
 197         } else {
 198             write!(f, ";")?;
 199         }
 200         if let Some(n) = self.numeric_type_digit {
 201             write!(f, "{};", n)?;
 202         } else {
 203             write!(f, ";")?;
 204         }
 205         if let Some(n) = self.numeric_type_numeric {
 206             write!(f, "{};", n)?;
 207         } else {
 208             write!(f, ";")?;
 209         }
 210         write!(f, "{};", if self.bidi_mirrored { "Y" } else { "N" })?;
 211         write!(f, "{};", self.unicode1_name)?;
 212         write!(f, "{};", self.iso_comment)?;
 213         if let Some(cp) = self.simple_uppercase_mapping {
 214             write!(f, "{};", cp)?;
 215         } else {
 216             write!(f, ";")?;
 217         }
 218         if let Some(cp) = self.simple_lowercase_mapping {
 219             write!(f, "{};", cp)?;
 220         } else {
 221             write!(f, ";")?;
 222         }
 223         if let Some(cp) = self.simple_titlecase_mapping {
 224             write!(f, "{}", cp)?;
 225         }
 226         Ok(())
 227     }
 228 }
 229
/// Represents a decomposition mapping of a single row in the
/// `UnicodeData.txt` file.
///
/// The codepoints are stored inline in a fixed-size array of 18 entries, of
/// which only the first `len` are part of the mapping.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct UnicodeDataDecomposition {
    /// The formatting tag associated with this mapping, if present. A
    /// mapping without a tag is reported as canonical by `is_canonical`.
    pub tag: Option<UnicodeDataDecompositionTag>,
    /// The number of codepoints in this mapping.
    pub len: usize,
    /// The codepoints in the mapping. Entries beyond `len` in the mapping
    /// are always U+0000. If no mapping was present, then this always contains
    /// a single codepoint corresponding to this row's character.
    pub mapping: [Codepoint; 18],
}
 243
 244 impl UnicodeDataDecomposition {
 245     /// Create a new decomposition mapping with the given tag and codepoints.
 246     ///
 247     /// If there are too many codepoints, then an error is returned.
 248     pub fn new(
 249         tag: Option<UnicodeDataDecompositionTag>,
 250         mapping: &[Codepoint],
 251     ) -> Result<UnicodeDataDecomposition, Error> {
 252         let mut x = UnicodeDataDecomposition::default();
 253         x.tag = tag;
 254         for &cp in mapping {
 255             x.push(cp)?;
 256         }
 257         Ok(x)
 258     }
 259
 260     /// Add a new codepoint to this decomposition's mapping.
 261     ///
 262     /// If the mapping is already full, then this returns an error.
 263     pub fn push(&mut self, cp: Codepoint) -> Result<(), Error> {
 264         if self.len >= self.mapping.len() {
 265             return err!("invalid decomposition mapping (too many codepoints)");
 266         }
 267         self.mapping[self.len] = cp;
 268         self.len += 1;
 269         Ok(())
 270     }
 271
 272     /// Return the mapping as a slice of codepoints. The slice returned
 273     /// has length equivalent to the number of codepoints in this mapping.
 274     pub fn mapping(&self) -> &[Codepoint] {
 275         &self.mapping[..self.len]
 276     }
 277
 278     /// Returns true if and only if this decomposition mapping is canonical.
 279     pub fn is_canonical(&self) -> bool {
 280         self.tag.is_none()
 281     }
 282 }
 283
 284 impl FromStr for UnicodeDataDecomposition {
 285     type Err = Error;
 286
 287     fn from_str(s: &str) -> Result<UnicodeDataDecomposition, Error> {
 288         lazy_static! {
 289             static ref WITH_TAG: Regex = Regex::new(
 290                 r"^(?:<(?P<tag>[^>]+)>)?\s*(?P<chars>[\s0-9A-F]+)$"
 291             ).unwrap();
 292             static ref CHARS: Regex = Regex::new(r"[0-9A-F]+").unwrap();
 293         };
 294         if s.is_empty() {
 295             return err!("expected non-empty string for \
 296                          UnicodeDataDecomposition value");
 297         }
 298         let caps = match WITH_TAG.captures(s) {
 299             Some(caps) => caps,
 300             None => return err!("invalid decomposition value"),
 301         };
 302         let mut decomp = UnicodeDataDecomposition::default();
 303         let mut codepoints = s;
 304         if let Some(m) = caps.name("tag") {
 305             decomp.tag = Some(m.as_str().parse()?);
 306             codepoints = &caps["chars"];
 307         }
 308         for m in CHARS.find_iter(codepoints) {
 309             let cp = m.as_str().parse()?;
 310             decomp.push(cp)?;
 311         }
 312         Ok(decomp)
 313     }
 314 }
 315
 316 impl fmt::Display for UnicodeDataDecomposition {
 317     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
 318         if let Some(ref tag) = self.tag {
 319             write!(f, "<{}> ", tag)?;
 320         }
 321         let mut first = true;
 322         for cp in self.mapping() {
 323             if !first {
 324                 write!(f, " ")?;
 325             }
 326             first = false;
 327             write!(f, "{}", cp)?;
 328         }
 329         Ok(())
 330     }
 331 }
 332
/// The formatting tag on a decomposition mapping.
///
/// This is taken from
/// [UAX44, Table 14](http://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings).
///
/// Each variant is parsed from, and displayed as, the tag name without its
/// surrounding angle brackets (e.g. `noBreak`).
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum UnicodeDataDecompositionTag {
    /// `<font>`
    Font,
    /// `<noBreak>`
    NoBreak,
    /// `<initial>`
    Initial,
    /// `<medial>`
    Medial,
    /// `<final>`
    Final,
    /// `<isolated>`
    Isolated,
    /// `<circle>`
    Circle,
    /// `<super>`
    Super,
    /// `<sub>`
    Sub,
    /// `<vertical>`
    Vertical,
    /// `<wide>`
    Wide,
    /// `<narrow>`
    Narrow,
    /// `<small>`
    Small,
    /// `<square>`
    Square,
    /// `<fraction>`
    Fraction,
    /// `<compat>`
    Compat,
}
 372
 373 impl FromStr for UnicodeDataDecompositionTag {
 374     type Err = Error;
 375
 376     fn from_str(s: &str) -> Result<UnicodeDataDecompositionTag, Error> {
 377         use self::UnicodeDataDecompositionTag::*;
 378         Ok(match s {
 379             "font" => Font,
 380             "noBreak" => NoBreak,
 381             "initial" => Initial,
 382             "medial" => Medial,
 383             "final" => Final,
 384             "isolated" => Isolated,
 385             "circle" => Circle,
 386             "super" => Super,
 387             "sub" => Sub,
 388             "vertical" => Vertical,
 389             "wide" => Wide,
 390             "narrow" => Narrow,
 391             "small" => Small,
 392             "square" => Square,
 393             "fraction" => Fraction,
 394             "compat" => Compat,
 395             _ => return err!("invalid decomposition formatting tag: {}", s),
 396         })
 397     }
 398 }
 399
 400 impl fmt::Display for UnicodeDataDecompositionTag {
 401     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
 402         use self::UnicodeDataDecompositionTag::*;
 403         let s = match *self {
 404             Font => "font",
 405             NoBreak => "noBreak",
 406             Initial => "initial",
 407             Medial => "medial",
 408             Final => "final",
 409             Isolated => "isolated",
 410             Circle => "circle",
 411             Super => "super",
 412             Sub => "sub",
 413             Vertical => "vertical",
 414             Wide => "wide",
 415             Narrow => "narrow",
 416             Small => "small",
 417             Square => "square",
 418             Fraction => "fraction",
 419             Compat => "compat",
 420         };
 421         write!(f, "{}", s)
 422     }
 423 }
 424
/// A numeric value corresponding to characters with `Numeric_Type=Numeric`.
///
/// A numeric value can either be a signed integer or a rational number. Its
/// textual form is `N` for an integer and `N/D` for a rational.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum UnicodeDataNumeric {
    /// An integer.
    Integer(i64),
    /// A rational number. The first is the numerator and the latter is the
    /// denominator.
    Rational(i64, i64),
}
 436
 437 impl FromStr for UnicodeDataNumeric {
 438     type Err = Error;
 439
 440     fn from_str(s: &str) -> Result<UnicodeDataNumeric, Error> {
 441         if s.is_empty() {
 442             return err!(
 443                 "expected non-empty string for UnicodeDataNumeric value");
 444         }
 445         if let Some(pos) = s.find('/') {
 446             let (snum, sden) = (&s[..pos], &s[pos+1..]);
 447             let num = match snum.parse() {
 448                 Ok(num) => num,
 449                 Err(err) => {
 450                     return err!(
 451                         "invalid integer numerator '{}': {}", snum, err);
 452                 }
 453             };
 454             let den = match sden.parse() {
 455                 Ok(den) => den,
 456                 Err(err) => {
 457                     return err!(
 458                         "invalid integer denominator '{}': {}", sden, err);
 459                 }
 460             };
 461             Ok(UnicodeDataNumeric::Rational(num, den))
 462         } else {
 463             match s.parse() {
 464                 Ok(den) => Ok(UnicodeDataNumeric::Integer(den)),
 465                 Err(err) => {
 466                     return err!(
 467                         "invalid integer denominator '{}': {}", s, err);
 468                 }
 469             }
 470         }
 471     }
 472 }
 473
 474 impl fmt::Display for UnicodeDataNumeric {
 475     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
 476         match *self {
 477             UnicodeDataNumeric::Integer(n) => write!(f, "{}", n),
 478             UnicodeDataNumeric::Rational(n, d) => write!(f, "{}/{}", n, d),
 479         }
 480     }
 481 }
 482
/// An iterator adapter that expands rows in `UnicodeData.txt`.
///
/// Throughout `UnicodeData.txt`, some assigned codepoints are not explicitly
/// represented. Instead, they are represented by a pair of rows, indicating
/// a range of codepoints with the same properties. For example, the Hangul
/// syllable codepoints are represented by these two rows:
///
/// ```ignore
/// AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
/// D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
/// ```
///
/// This iterator will wrap any iterator of `UnicodeData` and, when a range of
/// Unicode codepoints is found, it will be expanded to the appropriate
/// sequence of `UnicodeData` values. Note that all such expanded records will
/// have an empty name.
///
/// A range-start row is only expanded when the row immediately after it is a
/// range-end row; otherwise it is passed through unchanged.
pub struct UnicodeDataExpander<I: Iterator> {
    /// The underlying iterator.
    it: iter::Peekable<I>,
    /// The range of codepoints still to be emitted for the pair found most
    /// recently. It starts out empty (`0..0`) and is empty again once every
    /// codepoint of a pair has been emitted.
    range: CodepointRange,
}
 506
/// The expansion state for one first/last pair of range rows: the codepoint
/// values that remain to be emitted plus the record they are cloned from.
struct CodepointRange {
    /// The codepoint range, as raw `u32` values with an exclusive end.
    range: Range<u32>,
    /// The start record. All subsequent records in this range are generated
    /// by cloning this and updating the codepoint/name.
    start_record: UnicodeData,
}
 514
 515 impl<I: Iterator<Item=UnicodeData>> UnicodeDataExpander<I> {
 516     /// Create a new iterator that expands pairs of `UnicodeData` range
 517     /// records. All other records are passed through as-is.
 518     pub fn new<T>(it: T) -> UnicodeDataExpander<I>
 519             where T: IntoIterator<IntoIter=I, Item=I::Item>
 520     {
 521         UnicodeDataExpander {
 522             it: it.into_iter().peekable(),
 523             range: CodepointRange {
 524                 range: 0..0,
 525                 start_record: UnicodeData::default(),
 526             },
 527         }
 528     }
 529 }
 530
 531 impl<I: Iterator<Item=UnicodeData>>
 532     Iterator for UnicodeDataExpander<I>
 533 {
 534     type Item = UnicodeData;
 535
 536     fn next(&mut self) -> Option<UnicodeData> {
 537         if let Some(udata) = self.range.next() {
 538             return Some(udata);
 539         }
 540         let row1 = match self.it.next() {
 541             None => return None,
 542             Some(row1) => row1,
 543         };
 544         if !row1.is_range_start()
 545             || !self.it.peek().map_or(false, |row2| row2.is_range_end())
 546         {
 547             return Some(row1)
 548         }
 549         let row2 = self.it.next().unwrap();
 550         self.range = CodepointRange {
 551             range: row1.codepoint.value()..(row2.codepoint.value() + 1),
 552             start_record: row1,
 553         };
 554         self.next()
 555     }
 556 }
 557
 558 impl Iterator for CodepointRange {
 559     type Item = UnicodeData;
 560
 561     fn next(&mut self) -> Option<UnicodeData> {
 562         let cp = match self.range.next() {
 563             None => return None,
 564             Some(cp) => cp,
 565         };
 566         Some(UnicodeData {
 567             codepoint: Codepoint::from_u32(cp).unwrap(),
 568             name: "".to_string(),
 569             ..self.start_record.clone()
 570         })
 571     }
 572 }
 573
 574 #[cfg(test)]
 575 mod tests {
 576     use common::Codepoint;
 577
 578     use super::{
 579         UnicodeData, UnicodeDataNumeric,
 580         UnicodeDataDecomposition, UnicodeDataDecompositionTag,
 581     };
 582
 583     fn codepoint(n: u32) -> Codepoint {
 584         Codepoint::from_u32(n).unwrap()
 585     }
 586
 587     fn s(string: &str) -> String {
 588         string.to_string()
 589     }
 590
 591     #[test]
 592     fn parse1() {
 593         let line = "249D;PARENTHESIZED LATIN SMALL LETTER B;So;0;L;<compat> 0028 0062 0029;;;;N;;;;;\n";
 594         let data: UnicodeData = line.parse().unwrap();
 595         assert_eq!(data, UnicodeData {
 596             codepoint: codepoint(0x249d),
 597             name: s("PARENTHESIZED LATIN SMALL LETTER B"),
 598             general_category: s("So"),
 599             canonical_combining_class: 0,
 600             bidi_class: s("L"),
 601             decomposition: UnicodeDataDecomposition::new(
 602                 Some(UnicodeDataDecompositionTag::Compat),
 603                 &[codepoint(0x28), codepoint(0x62), codepoint(0x29)],
 604             ).unwrap(),
 605             numeric_type_decimal: None,
 606             numeric_type_digit: None,
 607             numeric_type_numeric: None,
 608             bidi_mirrored: false,
 609             unicode1_name: s(""),
 610             iso_comment: s(""),
 611             simple_uppercase_mapping: None,
 612             simple_lowercase_mapping: None,
 613             simple_titlecase_mapping: None,
 614         });
 615     }
 616
 617     #[test]
 618     fn parse2() {
 619         let line = "000D;<control>;Cc;0;B;;;;;N;CARRIAGE RETURN (CR);;;;\n";
 620         let data: UnicodeData = line.parse().unwrap();
 621         assert_eq!(data, UnicodeData {
 622             codepoint: codepoint(0x000D),
 623             name: s("<control>"),
 624             general_category: s("Cc"),
 625             canonical_combining_class: 0,
 626             bidi_class: s("B"),
 627             decomposition: UnicodeDataDecomposition::new(
 628                 None, &[codepoint(0x000D)]).unwrap(),
 629             numeric_type_decimal: None,
 630             numeric_type_digit: None,
 631             numeric_type_numeric: None,
 632             bidi_mirrored: false,
 633             unicode1_name: s("CARRIAGE RETURN (CR)"),
 634             iso_comment: s(""),
 635             simple_uppercase_mapping: None,
 636             simple_lowercase_mapping: None,
 637             simple_titlecase_mapping: None,
 638         });
 639     }
 640
 641     #[test]
 642     fn parse3() {
 643         let line = "00BC;VULGAR FRACTION ONE QUARTER;No;0;ON;<fraction> 0031 2044 0034;;;1/4;N;FRACTION ONE QUARTER;;;;\n";
 644         let data: UnicodeData = line.parse().unwrap();
 645         assert_eq!(data, UnicodeData {
 646             codepoint: codepoint(0x00BC),
 647             name: s("VULGAR FRACTION ONE QUARTER"),
 648             general_category: s("No"),
 649             canonical_combining_class: 0,
 650             bidi_class: s("ON"),
 651             decomposition: UnicodeDataDecomposition::new(
 652                 Some(UnicodeDataDecompositionTag::Fraction),
 653                 &[codepoint(0x31), codepoint(0x2044), codepoint(0x34)],
 654             ).unwrap(),
 655             numeric_type_decimal: None,
 656             numeric_type_digit: None,
 657             numeric_type_numeric: Some(UnicodeDataNumeric::Rational(1, 4)),
 658             bidi_mirrored: false,
 659             unicode1_name: s("FRACTION ONE QUARTER"),
 660             iso_comment: s(""),
 661             simple_uppercase_mapping: None,
 662             simple_lowercase_mapping: None,
 663             simple_titlecase_mapping: None,
 664         });
 665     }
 666
 667     #[test]
 668     fn parse4() {
 669         let line = "0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;\n";
 670         let data: UnicodeData = line.parse().unwrap();
 671         assert_eq!(data, UnicodeData {
 672             codepoint: codepoint(0x0041),
 673             name: s("LATIN CAPITAL LETTER A"),
 674             general_category: s("Lu"),
 675             canonical_combining_class: 0,
 676             bidi_class: s("L"),
 677             decomposition: UnicodeDataDecomposition::new(
 678                 None, &[codepoint(0x0041)]).unwrap(),
 679             numeric_type_decimal: None,
 680             numeric_type_digit: None,
 681             numeric_type_numeric: None,
 682             bidi_mirrored: false,
 683             unicode1_name: s(""),
 684             iso_comment: s(""),
 685             simple_uppercase_mapping: None,
 686             simple_lowercase_mapping: Some(codepoint(0x0061)),
 687             simple_titlecase_mapping: None,
 688         });
 689     }
 690
 691     #[test]
 692     fn parse5() {
 693         let line = "0F33;TIBETAN DIGIT HALF ZERO;No;0;L;;;;-1/2;N;;;;;\n";
 694         let data: UnicodeData = line.parse().unwrap();
 695         assert_eq!(data, UnicodeData {
 696             codepoint: codepoint(0x0F33),
 697             name: s("TIBETAN DIGIT HALF ZERO"),
 698             general_category: s("No"),
 699             canonical_combining_class: 0,
 700             bidi_class: s("L"),
 701             decomposition: UnicodeDataDecomposition::new(
 702                 None, &[codepoint(0x0F33)]).unwrap(),
 703             numeric_type_decimal: None,
 704             numeric_type_digit: None,
 705             numeric_type_numeric: Some(UnicodeDataNumeric::Rational(-1, 2)),
 706             bidi_mirrored: false,
 707             unicode1_name: s(""),
 708             iso_comment: s(""),
 709             simple_uppercase_mapping: None,
 710             simple_lowercase_mapping: None,
 711             simple_titlecase_mapping: None,
 712         });
 713     }
 714
 715     #[test]
 716     fn expander() {
 717         use common::UcdLineParser;
 718         use super::UnicodeDataExpander;
 719
 720         let data = "\
 721 ABF9;MEETEI MAYEK DIGIT NINE;Nd;0;L;;9;9;9;N;;;;;
 722 AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
 723 D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
 724 D7B0;HANGUL JUNGSEONG O-YEO;Lo;0;L;;;;;N;;;;;
 725 ";
 726         let records = UcdLineParser::new(None, data.as_bytes())
 727             .collect::<Result<Vec<_>, _>>()
 728             .unwrap();
 729         assert_eq!(UnicodeDataExpander::new(records).count(), 11174);
 730     }
 731 }