vendor/utf8-ranges/src/lib.rs

   1 /*!
   2 Crate `utf8-ranges` converts ranges of Unicode scalar values to equivalent
   3 ranges of UTF-8 bytes. This is useful for constructing byte based automatons
   4 that need to embed UTF-8 decoding.
   5
   6 See the documentation on the `Utf8Sequences` iterator for more details and
   7 an example.
   8
   9 # Wait, what is this?
  10
  11 This is simplest to explain with an example. Let's say you wanted to test
  12 whether a particular byte sequence was a Cyrillic character. One possible
  13 scalar value range is `[0400-04FF]`. The set of allowed bytes for this
  14 range can be expressed as a sequence of byte ranges:
  15
  16 ```ignore
  17 [D0-D3][80-BF]
  18 ```
  19
  20 This is simple enough: simply encode the boundaries, `0400` encodes to
  21 `D0 80` and `04FF` encodes to `D3 BF`, and create ranges from each
  22 corresponding pair of bytes: `D0` to `D3` and `80` to `BF`.
  23
  24 However, what if you wanted to add the Cyrillic Supplementary characters to
  25 your range? Your range might then become `[0400-052F]`. The same procedure
  26 as above doesn't quite work because `052F` encodes to `D4 AF`. The byte ranges
  27 you'd get from the previous transformation would be `[D0-D4][80-AF]`. However,
  28 this isn't quite correct because this range doesn't capture many characters,
  29 for example, `04FF` (because its last byte, `BF` isn't in the range `80-AF`).
  30
  31 Instead, you need multiple sequences of byte ranges:
  32
  33 ```ignore
  34 [D0-D3][80-BF]  # matches codepoints 0400-04FF
  35 [D4][80-AF]     # matches codepoints 0500-052F
  36 ```
  37
  38 This gets even more complicated if you want bigger ranges, particularly if
  39 they naively contain surrogate codepoints. For example, the sequence of byte
  40 ranges for the basic multilingual plane (`[0000-FFFF]`) look like this:
  41
  42 ```ignore
  43 [0-7F]
  44 [C2-DF][80-BF]
  45 [E0][A0-BF][80-BF]
  46 [E1-EC][80-BF][80-BF]
  47 [ED][80-9F][80-BF]
  48 [EE-EF][80-BF][80-BF]
  49 ```
  50
  51 Note that the byte ranges above will *not* match any erroneous encoding of
  52 UTF-8, including encodings of surrogate codepoints.
  53
  54 And, of course, for all of Unicode (`[000000-10FFFF]`):
  55
  56 ```ignore
  57 [0-7F]
  58 [C2-DF][80-BF]
  59 [E0][A0-BF][80-BF]
  60 [E1-EC][80-BF][80-BF]
  61 [ED][80-9F][80-BF]
  62 [EE-EF][80-BF][80-BF]
  63 [F0][90-BF][80-BF][80-BF]
  64 [F1-F3][80-BF][80-BF][80-BF]
  65 [F4][80-8F][80-BF][80-BF]
  66 ```
  67
  68 This crate automates the process of creating these byte ranges from ranges of
  69 Unicode scalar values.
  70
  71 # Why would I ever use this?
  72
  73 You probably won't ever need this. In 99% of cases, you just decode the byte
  74 sequence into a Unicode scalar value and compare scalar values directly.
  75 However, this explicit decoding step isn't always possible. For example, the
  76 construction of some finite state machines may benefit from converting ranges
  77 of scalar values into UTF-8 decoder automata (e.g., for character classes in
  78 regular expressions).
  79
  80 # Lineage
  81
  82 I got the idea and general implementation strategy from Russ Cox in his
  83 [article on regexps](https://web.archive.org/web/20160404141123/https://swtch.com/~rsc/regexp/regexp3.html) and RE2.
  84 Russ Cox got it from Ken Thompson's `grep` (no source, folk lore?).
  85 I also got the idea from
  86 [Lucene](https://github.com/apache/lucene-solr/blob/ae93f4e7ac6a3908046391de35d4f50a0d3c59ca/lucene/core/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java),
  87 which uses it for executing automata on their term index.
  88 */
  89
  90 #![deny(missing_docs)]
  91
  92 #[cfg(test)] extern crate quickcheck;
  93
  94 use std::char;
  95 use std::fmt;
  96 use std::slice;
  97
  98 use char_utf8::encode_utf8;
  99
 100 const MAX_UTF8_BYTES: usize = 4;
 101
 102 mod char_utf8;
 103
 104 /// Utf8Sequence represents a sequence of byte ranges.
 105 ///
 106 /// To match a Utf8Sequence, a candidate byte sequence must match each
 107 /// successive range.
 108 ///
 109 /// For example, if there are two ranges, `[C2-DF][80-BF]`, then the byte
 110 /// sequence `\xDD\x61` would not match because `0x61 < 0x80`.
 111 #[derive(Copy, Clone, Eq, PartialEq)]
 112 pub enum Utf8Sequence {
 113     /// One byte range.
 114     One(Utf8Range),
 115     /// Two successive byte ranges.
 116     Two([Utf8Range; 2]),
 117     /// Three successive byte ranges.
 118     Three([Utf8Range; 3]),
 119     /// Four successive byte ranges.
 120     Four([Utf8Range; 4]),
 121 }
 122
 123 impl Utf8Sequence {
 124     /// Creates a new UTF-8 sequence from the encoded bytes of a scalar value
 125     /// range.
 126     ///
 127     /// This assumes that `start` and `end` have the same length.
 128     fn from_encoded_range(start: &[u8], end: &[u8]) -> Self {
 129         assert_eq!(start.len(), end.len());
 130         match start.len() {
 131             2 => Utf8Sequence::Two([
 132                 Utf8Range::new(start[0], end[0]),
 133                 Utf8Range::new(start[1], end[1]),
 134             ]),
 135             3 => Utf8Sequence::Three([
 136                 Utf8Range::new(start[0], end[0]),
 137                 Utf8Range::new(start[1], end[1]),
 138                 Utf8Range::new(start[2], end[2]),
 139             ]),
 140             4 => Utf8Sequence::Four([
 141                 Utf8Range::new(start[0], end[0]),
 142                 Utf8Range::new(start[1], end[1]),
 143                 Utf8Range::new(start[2], end[2]),
 144                 Utf8Range::new(start[3], end[3]),
 145             ]),
 146             n => unreachable!("invalid encoded length: {}", n),
 147         }
 148     }
 149
 150     /// Returns the underlying sequence of byte ranges as a slice.
 151     pub fn as_slice(&self) -> &[Utf8Range] {
 152         use self::Utf8Sequence::*;
 153         match *self {
 154             One(ref r) => unsafe { slice::from_raw_parts(r, 1) },
 155             Two(ref r) => &r[..],
 156             Three(ref r) => &r[..],
 157             Four(ref r) => &r[..],
 158         }
 159     }
 160
 161     /// Returns the number of byte ranges in this sequence.
 162     ///
 163     /// The length is guaranteed to be in the closed interval `[1, 4]`.
 164     pub fn len(&self) -> usize {
 165         self.as_slice().len()
 166     }
 167
 168     /// Returns true if and only if a prefix of `bytes` matches this sequence
 169     /// of byte ranges.
 170     pub fn matches(&self, bytes: &[u8]) -> bool {
 171         if bytes.len() < self.len() {
 172             return false;
 173         }
 174         for (&b, r) in bytes.iter().zip(self) {
 175             if !r.matches(b) {
 176                 return false;
 177             }
 178         }
 179         true
 180     }
 181 }
 182
 183 impl<'a> IntoIterator for &'a Utf8Sequence {
 184     type IntoIter = slice::Iter<'a, Utf8Range>;
 185     type Item = &'a Utf8Range;
 186
 187     fn into_iter(self) -> Self::IntoIter {
 188         self.as_slice().into_iter()
 189     }
 190 }
 191
 192 impl fmt::Debug for Utf8Sequence {
 193     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
 194         use self::Utf8Sequence::*;
 195         match *self {
 196             One(ref r) => write!(f, "{:?}", r),
 197             Two(ref r) => write!(f, "{:?}{:?}", r[0], r[1]),
 198             Three(ref r) => write!(f, "{:?}{:?}{:?}", r[0], r[1], r[2]),
 199             Four(ref r) => write!(f, "{:?}{:?}{:?}{:?}",
 200                                   r[0], r[1], r[2], r[3]),
 201         }
 202     }
 203 }
 204
 205 /// A single inclusive range of UTF-8 bytes.
 206 #[derive(Clone, Copy, PartialEq, Eq)]
 207 pub struct Utf8Range {
 208     /// Start of byte range (inclusive).
 209     pub start: u8,
 210     /// End of byte range (inclusive).
 211     pub end: u8,
 212 }
 213
 214 impl Utf8Range {
 215     fn new(start: u8, end: u8) -> Self {
 216         Utf8Range { start: start, end: end }
 217     }
 218
 219     /// Returns true if and only if the given byte is in this range.
 220     pub fn matches(&self, b: u8) -> bool {
 221         self.start <= b && b <= self.end
 222     }
 223 }
 224
 225 impl fmt::Debug for Utf8Range {
 226     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
 227         if self.start == self.end {
 228             write!(f, "[{:X}]", self.start)
 229         } else {
 230             write!(f, "[{:X}-{:X}]", self.start, self.end)
 231         }
 232     }
 233 }
 234
 235 /// An iterator over ranges of matching UTF-8 byte sequences.
 236 ///
 237 /// The iteration represents an alternation of comprehensive byte sequences
 238 /// that match precisely the set of UTF-8 encoded scalar values.
 239 ///
 240 /// A byte sequence corresponds to one of the scalar values in the range given
 241 /// if and only if it completely matches exactly one of the sequences of byte
 242 /// ranges produced by this iterator.
 243 ///
 244 /// Each sequence of byte ranges matches a unique set of bytes. That is, no two
 245 /// sequences will match the same bytes.
 246 ///
 247 /// # Example
 248 ///
 249 /// This shows how to match an arbitrary byte sequence against a range of
 250 /// scalar values.
 251 ///
 252 /// ```rust
 253 /// use utf8_ranges::{Utf8Sequences, Utf8Sequence};
 254 ///
 255 /// fn matches(seqs: &[Utf8Sequence], bytes: &[u8]) -> bool {
 256 ///     for range in seqs {
 257 ///         if range.matches(bytes) {
 258 ///             return true;
 259 ///         }
 260 ///     }
 261 ///     false
 262 /// }
 263 ///
 264 /// // Test the basic multilingual plane.
 265 /// let seqs: Vec<_> = Utf8Sequences::new('\u{0}', '\u{FFFF}').collect();
 266 ///
 267 /// // UTF-8 encoding of 'a'.
 268 /// assert!(matches(&seqs, &[0x61]));
 269 /// // UTF-8 encoding of '☃' (`\u{2603}`).
 270 /// assert!(matches(&seqs, &[0xE2, 0x98, 0x83]));
 271 /// // UTF-8 encoding of `\u{10348}` (outside the BMP).
 272 /// assert!(!matches(&seqs, &[0xF0, 0x90, 0x8D, 0x88]));
 273 /// // Tries to match against a UTF-8 encoding of a surrogate codepoint,
 274 /// // which is invalid UTF-8, and therefore fails, despite the fact that
 275 /// // the corresponding codepoint (0xD800) falls in the range given.
 276 /// assert!(!matches(&seqs, &[0xED, 0xA0, 0x80]));
 277 /// // And fails against plain old invalid UTF-8.
 278 /// assert!(!matches(&seqs, &[0xFF, 0xFF]));
 279 /// ```
 280 ///
 281 /// If this example seems circuitous, that's because it is! It's meant to be
 282 /// illustrative. In practice, you could just try to decode your byte sequence
 283 /// and compare it with the scalar value range directly. However, this is not
 284 /// always possible (for example, in a byte based automaton).
 285 pub struct Utf8Sequences {
 286     range_stack: Vec<ScalarRange>,
 287 }
 288
 289 impl Utf8Sequences {
 290     /// Create a new iterator over UTF-8 byte ranges for the scalar value range
 291     /// given.
 292     pub fn new(start: char, end: char) -> Self {
 293         let mut it = Utf8Sequences { range_stack: vec![] };
 294         it.push(start as u32, end as u32);
 295         it
 296     }
 297
 298     /// reset resets the scalar value range.
 299     /// Any existing state is cleared, but resources may be reused.
 300     ///
 301     /// N.B. Benchmarks say that this method is dubious.
 302     #[doc(hidden)]
 303     pub fn reset(&mut self, start: char, end: char) {
 304         self.range_stack.clear();
 305         self.push(start as u32, end as u32);
 306     }
 307
 308     fn push(&mut self, start: u32, end: u32) {
 309         self.range_stack.push(ScalarRange { start: start, end: end });
 310     }
 311 }
 312
 313 struct ScalarRange {
 314     start: u32,
 315     end: u32,
 316 }
 317
 318 impl fmt::Debug for ScalarRange {
 319     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
 320         write!(f, "ScalarRange({:X}, {:X})", self.start, self.end)
 321     }
 322 }
 323
 324 impl Iterator for Utf8Sequences {
 325     type Item = Utf8Sequence;
 326
 327     fn next(&mut self) -> Option<Self::Item> {
 328     'TOP:
 329         while let Some(mut r) = self.range_stack.pop() {
 330         'INNER:
 331             loop {
 332                 if let Some((r1, r2)) = r.split() {
 333                     self.push(r2.start, r2.end);
 334                     r.start = r1.start;
 335                     r.end = r1.end;
 336                     continue 'INNER;
 337                 }
 338                 if !r.is_valid() {
 339                     continue 'TOP;
 340                 }
 341                 for i in 1..MAX_UTF8_BYTES {
 342                     let max = max_scalar_value(i);
 343                     if r.start <= max && max < r.end {
 344                         self.push(max + 1, r.end);
 345                         r.end = max;
 346                         continue 'INNER;
 347                     }
 348                 }
 349                 if let Some(ascii_range) = r.as_ascii() {
 350                     return Some(Utf8Sequence::One(ascii_range));
 351                 }
 352                 for i in 1..MAX_UTF8_BYTES {
 353                     let m = (1 << (6 * i)) - 1;
 354                     if (r.start & !m) != (r.end & !m) {
 355                         if (r.start & m) != 0 {
 356                             self.push((r.start | m) + 1, r.end);
 357                             r.end = r.start | m;
 358                             continue 'INNER;
 359                         }
 360                         if (r.end & m) != m {
 361                             self.push(r.end & !m, r.end);
 362                             r.end = (r.end & !m) - 1;
 363                             continue 'INNER;
 364                         }
 365                     }
 366                 }
 367                 let mut start = [0; MAX_UTF8_BYTES];
 368                 let mut end = [0; MAX_UTF8_BYTES];
 369                 let n = r.encode(&mut start, &mut end);
 370                 return Some(Utf8Sequence::from_encoded_range(
 371                     &start[0..n], &end[0..n]));
 372             }
 373         }
 374         None
 375     }
 376 }
 377
 378 impl ScalarRange {
 379     /// split splits this range if it overlaps with a surrogate codepoint.
 380     ///
 381     /// Either or both ranges may be invalid.
 382     fn split(&self) -> Option<(ScalarRange, ScalarRange)> {
 383         if self.start < 0xE000 && self.end > 0xD7FF {
 384             Some((ScalarRange {
 385                 start: self.start,
 386                 end: 0xD7FF,
 387             }, ScalarRange {
 388                 start: 0xE000,
 389                 end: self.end,
 390             }))
 391         } else {
 392             None
 393         }
 394     }
 395
 396     /// is_valid returns true if and only if start <= end.
 397     fn is_valid(&self) -> bool {
 398         self.start <= self.end
 399     }
 400
 401     /// as_ascii returns this range as a Utf8Range if and only if all scalar
 402     /// values in this range can be encoded as a single byte.
 403     fn as_ascii(&self) -> Option<Utf8Range> {
 404         if self.is_ascii() {
 405             Some(Utf8Range::new(self.start as u8, self.end as u8))
 406         } else {
 407             None
 408         }
 409     }
 410
 411     /// is_ascii returns true if the range is ASCII only (i.e., takes a single
 412     /// byte to encode any scalar value).
 413     fn is_ascii(&self) -> bool {
 414         self.is_valid() && self.end <= 0x7f
 415     }
 416
 417     /// encode writes the UTF-8 encoding of the start and end of this range
 418     /// to the corresponding destination slices.
 419     ///
 420     /// The slices should have room for at least `MAX_UTF8_BYTES`.
 421     fn encode(&self, start: &mut [u8], end: &mut [u8]) -> usize {
 422         let cs = char::from_u32(self.start).unwrap();
 423         let ce = char::from_u32(self.end).unwrap();
 424         let n = encode_utf8(cs, start).unwrap();
 425         let m = encode_utf8(ce, end).unwrap();
 426         assert_eq!(n, m);
 427         n
 428     }
 429 }
 430
 431 fn max_scalar_value(nbytes: usize) -> u32 {
 432     match nbytes {
 433         1 => 0x007F,
 434         2 => 0x07FF,
 435         3 => 0xFFFF,
 436         4 => 0x10FFFF,
 437         _ => unreachable!("invalid UTF-8 byte sequence size"),
 438     }
 439 }
 440
 441 #[cfg(test)]
 442 mod tests {
 443     use std::char;
 444
 445     use quickcheck::{TestResult, quickcheck};
 446
 447     use char_utf8::encode_utf8;
 448     use {MAX_UTF8_BYTES, Utf8Range, Utf8Sequences};
 449
 450     fn rutf8(s: u8, e: u8) -> Utf8Range {
 451         Utf8Range::new(s, e)
 452     }
 453
 454     fn never_accepts_surrogate_codepoints(start: char, end: char) {
 455         let mut buf = [0; MAX_UTF8_BYTES];
 456         for cp in 0xD800..0xE000 {
 457             let c = unsafe { ::std::mem::transmute(cp) };
 458             let n = encode_utf8(c, &mut buf).unwrap();
 459             for r in Utf8Sequences::new(start, end) {
 460                 if r.matches(&buf[0..n]) {
 461                     panic!("Sequence ({:X}, {:X}) contains range {:?}, \
 462                             which matches surrogate code point {:X} \
 463                             with encoded bytes {:?}",
 464                            start as u32, end as u32, r, cp, &buf[0..n]);
 465                 }
 466             }
 467         }
 468     }
 469
 470     #[test]
 471     fn codepoints_no_surrogates() {
 472         never_accepts_surrogate_codepoints('\u{0}', '\u{FFFF}');
 473         never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFF}');
 474         never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFE}');
 475         never_accepts_surrogate_codepoints('\u{80}', '\u{10FFFF}');
 476         never_accepts_surrogate_codepoints('\u{D7FF}', '\u{E000}');
 477     }
 478
 479     #[test]
 480     fn single_codepoint_one_sequence() {
 481         // Tests that every range of scalar values that contains a single
 482         // scalar value is recognized by one sequence of byte ranges.
 483         for i in 0x0..(0x10FFFF + 1) {
 484             let c = match char::from_u32(i) {
 485                 None => continue,
 486                 Some(c) => c,
 487             };
 488             let seqs: Vec<_> = Utf8Sequences::new(c, c).collect();
 489             assert_eq!(seqs.len(), 1);
 490         }
 491     }
 492
 493     #[test]
 494     fn qc_codepoints_no_surrogate() {
 495         fn p(s: char, e: char) -> TestResult {
 496             if s > e {
 497                 return TestResult::discard();
 498             }
 499             never_accepts_surrogate_codepoints(s, e);
 500             TestResult::passed()
 501         }
 502         quickcheck(p as fn(char, char) -> TestResult);
 503     }
 504
 505     #[test]
 506     fn bmp() {
 507         use Utf8Sequence::*;
 508
 509         let seqs = Utf8Sequences::new('\u{0}', '\u{FFFF}')
 510                                  .collect::<Vec<_>>();
 511         assert_eq!(seqs, vec![
 512             One(rutf8(0x0, 0x7F)),
 513             Two([rutf8(0xC2, 0xDF), rutf8(0x80, 0xBF)]),
 514             Three([rutf8(0xE0, 0xE0), rutf8(0xA0, 0xBF), rutf8(0x80, 0xBF)]),
 515             Three([rutf8(0xE1, 0xEC), rutf8(0x80, 0xBF), rutf8(0x80, 0xBF)]),
 516             Three([rutf8(0xED, 0xED), rutf8(0x80, 0x9F), rutf8(0x80, 0xBF)]),
 517             Three([rutf8(0xEE, 0xEF), rutf8(0x80, 0xBF), rutf8(0x80, 0xBF)]),
 518         ]);
 519     }
 520
 521     #[test]
 522     fn scratch() {
 523         for range in Utf8Sequences::new('\u{0}', '\u{FFFF}') {
 524             println!("{:?}", range);
 525         }
 526     }
 527 }