vendor/tendril/src/fmt.rs

   1 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   2 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   3 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
   4 // option. This file may not be copied, modified, or distributed
   5 // except according to those terms.
   6
   7 //! Marker types for formats.
   8 //!
   9 //! This module defines the types and traits used to mark a `Tendril`
  10 //! with the format of data it contains. It includes those formats
  11 //! for which `Tendril` supports at least some operations without
  12 //! conversion.
  13 //!
  14 //! To convert a string tendril to/from a byte tendril in an arbitrary
  15 //! character encoding, see the `encode` and `decode` methods on
  16 //! `Tendril`.
  17 //!
  18 //! `Tendril` operations may become memory-unsafe if data invalid for
  19 //! the format sneaks in. For that reason, these traits require
  20 //! `unsafe impl`.
  21
  22 use std::default::Default;
  23 use std::{char, mem, str};
  24
  25 use futf::{self, Codepoint, Meaning};
  26
  27 /// Implementation details.
  28 ///
  29 /// You don't need these unless you are implementing
  30 /// a new format.
  31 pub mod imp {
  32     use std::default::Default;
  33     use std::{iter, mem, slice};
  34
  35     /// Describes how to fix up encodings when concatenating.
  36     ///
  37     /// We can drop characters on either side of the splice,
  38     /// and insert up to 4 bytes in the middle.
  39     pub struct Fixup {
  40         pub drop_left: u32,
  41         pub drop_right: u32,
  42         pub insert_len: u32,
  43         pub insert_bytes: [u8; 4],
  44     }
  45
  46     impl Default for Fixup {
  47         #[inline(always)]
  48         fn default() -> Fixup {
  49             Fixup {
  50                 drop_left: 0,
  51                 drop_right: 0,
  52                 insert_len: 0,
  53                 insert_bytes: [0; 4],
  54             }
  55         }
  56     }
  57
  58     #[inline(always)]
  59     unsafe fn from_u32_unchecked(n: u32) -> char {
  60         mem::transmute(n)
  61     }
  62
  63     pub struct SingleByteCharIndices<'a> {
  64         inner: iter::Enumerate<slice::Iter<'a, u8>>,
  65     }
  66
  67     impl<'a> Iterator for SingleByteCharIndices<'a> {
  68         type Item = (usize, char);
  69
  70         #[inline]
  71         fn next(&mut self) -> Option<(usize, char)> {
  72             self.inner
  73                 .next()
  74                 .map(|(i, &b)| unsafe { (i, from_u32_unchecked(b as u32)) })
  75         }
  76     }
  77
  78     impl<'a> SingleByteCharIndices<'a> {
  79         #[inline]
  80         pub fn new(buf: &'a [u8]) -> SingleByteCharIndices<'a> {
  81             SingleByteCharIndices {
  82                 inner: buf.iter().enumerate(),
  83             }
  84         }
  85     }
  86 }
  87
  88 /// Trait for format marker types.
  89 ///
  90 /// The type implementing this trait is usually not instantiated.
  91 /// It's used with a phantom type parameter of `Tendril`.
  92 pub unsafe trait Format {
  93     /// Check whether the buffer is valid for this format.
  94     fn validate(buf: &[u8]) -> bool;
  95
  96     /// Check whether the buffer is valid for this format.
  97     ///
  98     /// You may assume the buffer is a prefix of a valid buffer.
  99     #[inline]
 100     fn validate_prefix(buf: &[u8]) -> bool {
 101         <Self as Format>::validate(buf)
 102     }
 103
 104     /// Check whether the buffer is valid for this format.
 105     ///
 106     /// You may assume the buffer is a suffix of a valid buffer.
 107     #[inline]
 108     fn validate_suffix(buf: &[u8]) -> bool {
 109         <Self as Format>::validate(buf)
 110     }
 111
 112     /// Check whether the buffer is valid for this format.
 113     ///
 114     /// You may assume the buffer is a contiguous subsequence
 115     /// of a valid buffer, but not necessarily a prefix or
 116     /// a suffix.
 117     #[inline]
 118     fn validate_subseq(buf: &[u8]) -> bool {
 119         <Self as Format>::validate(buf)
 120     }
 121
 122     /// Compute any fixup needed when concatenating buffers.
 123     ///
 124     /// The default is to do nothing.
 125     ///
 126     /// The function is `unsafe` because it may assume the input
 127     /// buffers are already valid for the format. Also, no
 128     /// bounds-checking is performed on the return value!
 129     #[inline(always)]
 130     unsafe fn fixup(_lhs: &[u8], _rhs: &[u8]) -> imp::Fixup {
 131         Default::default()
 132     }
 133 }
 134
 135 /// Indicates that one format is a subset of another.
 136 ///
 137 /// The subset format can be converted to the superset format
 138 /// for free.
 139 pub unsafe trait SubsetOf<Super>: Format
 140 where
 141     Super: Format,
 142 {
 143     /// Validate the *other* direction of conversion; check if
 144     /// this buffer from the superset format conforms to the
 145     /// subset format.
 146     ///
 147     /// The default calls `Self::validate`, but some conversions
 148     /// may implement a check which is cheaper than validating
 149     /// from scratch.
 150     fn revalidate_subset(x: &[u8]) -> bool {
 151         Self::validate(x)
 152     }
 153 }
 154
/// Indicates a format which corresponds to a Rust slice type,
/// representing exactly the same invariants.
pub unsafe trait SliceFormat: Format + Sized {
    /// The Rust slice type corresponding to this format
    /// (for example `[u8]` for `Bytes` and `str` for `UTF8`).
    type Slice: ?Sized + Slice;
}
 160
/// Indicates a format which contains characters from Unicode
/// (all of it, or some proper subset).
pub unsafe trait CharFormat<'a>: Format {
    /// Iterator for characters and their byte indices.
    type Iter: Iterator<Item = (usize, char)>;

    /// Iterate over the characters of the string and their byte
    /// indices.
    ///
    /// # Safety
    ///
    /// You may assume the buffer is *already validated* for `Format`;
    /// callers must therefore only pass validated buffers.
    unsafe fn char_indices(buf: &'a [u8]) -> Self::Iter;

    /// Encode the character as bytes and pass them to a continuation.
    ///
    /// Returns `Err(())` iff the character cannot be represented
    /// in this format.
    fn encode_char<F>(ch: char, cont: F) -> Result<(), ()>
    where
        F: FnOnce(&[u8]);
}
 180
/// Indicates a Rust slice type that is represented in memory as bytes.
pub unsafe trait Slice {
    /// Access the raw bytes of the slice.
    fn as_bytes(&self) -> &[u8];

    /// Convert a byte slice to this kind of slice.
    ///
    /// # Safety
    ///
    /// You may assume the buffer is *already validated*
    /// for `Format`; callers must only pass validated bytes.
    unsafe fn from_bytes(x: &[u8]) -> &Self;

    /// Convert a mutable byte slice to this kind of slice.
    ///
    /// # Safety
    ///
    /// You may assume the buffer is *already validated*
    /// for `Format`; callers must only pass validated bytes.
    unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut Self;
}
 198
 199 /// Marker type for uninterpreted bytes.
 200 ///
 201 /// Validation will never fail for this format.
 202 #[derive(Copy, Clone, Default, Debug)]
 203 pub struct Bytes;
 204
 205 unsafe impl Format for Bytes {
 206     #[inline(always)]
 207     fn validate(_: &[u8]) -> bool {
 208         true
 209     }
 210 }
 211
// `Bytes` places no constraints on its contents, which matches a plain
// byte slice.
unsafe impl SliceFormat for Bytes {
    type Slice = [u8];
}
 215
 216 unsafe impl Slice for [u8] {
 217     #[inline(always)]
 218     fn as_bytes(&self) -> &[u8] {
 219         self
 220     }
 221
 222     #[inline(always)]
 223     unsafe fn from_bytes(x: &[u8]) -> &[u8] {
 224         x
 225     }
 226
 227     #[inline(always)]
 228     unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut [u8] {
 229         x
 230     }
 231 }
 232
 233 /// Marker type for ASCII text.
 234 #[derive(Copy, Clone, Default, Debug)]
 235 pub struct ASCII;
 236
 237 unsafe impl Format for ASCII {
 238     #[inline]
 239     fn validate(buf: &[u8]) -> bool {
 240         buf.iter().all(|&n| n <= 127)
 241     }
 242
 243     #[inline(always)]
 244     fn validate_prefix(_: &[u8]) -> bool {
 245         true
 246     }
 247
 248     #[inline(always)]
 249     fn validate_suffix(_: &[u8]) -> bool {
 250         true
 251     }
 252
 253     #[inline(always)]
 254     fn validate_subseq(_: &[u8]) -> bool {
 255         true
 256     }
 257 }
 258
// ASCII buffers can be reinterpreted as UTF-8 or Latin-1 for free.
// Converting back uses the default `revalidate_subset`, i.e. a full
// `ASCII::validate` pass.
unsafe impl SubsetOf<UTF8> for ASCII {}
unsafe impl SubsetOf<Latin1> for ASCII {}
 261
 262 unsafe impl<'a> CharFormat<'a> for ASCII {
 263     type Iter = imp::SingleByteCharIndices<'a>;
 264
 265     #[inline]
 266     unsafe fn char_indices(buf: &'a [u8]) -> imp::SingleByteCharIndices<'a> {
 267         imp::SingleByteCharIndices::new(buf)
 268     }
 269
 270     #[inline]
 271     fn encode_char<F>(ch: char, cont: F) -> Result<(), ()>
 272     where
 273         F: FnOnce(&[u8]),
 274     {
 275         let n = ch as u32;
 276         if n > 0x7F {
 277             return Err(());
 278         }
 279         cont(&[n as u8]);
 280         Ok(())
 281     }
 282 }
 283
 284 /// Marker type for UTF-8 text.
 285 #[derive(Copy, Clone, Default, Debug)]
 286 pub struct UTF8;
 287
 288 unsafe impl Format for UTF8 {
 289     #[inline]
 290     fn validate(buf: &[u8]) -> bool {
 291         str::from_utf8(buf).is_ok()
 292     }
 293
 294     #[inline]
 295     fn validate_prefix(buf: &[u8]) -> bool {
 296         if buf.len() == 0 {
 297             return true;
 298         }
 299         match futf::classify(buf, buf.len() - 1) {
 300             Some(Codepoint {
 301                 meaning: Meaning::Whole(_),
 302                 ..
 303             }) => true,
 304             _ => false,
 305         }
 306     }
 307
 308     #[inline]
 309     fn validate_suffix(buf: &[u8]) -> bool {
 310         if buf.len() == 0 {
 311             return true;
 312         }
 313         match futf::classify(buf, 0) {
 314             Some(Codepoint {
 315                 meaning: Meaning::Whole(_),
 316                 ..
 317             }) => true,
 318             _ => false,
 319         }
 320     }
 321
 322     #[inline]
 323     fn validate_subseq(buf: &[u8]) -> bool {
 324         <Self as Format>::validate_prefix(buf) && <Self as Format>::validate_suffix(buf)
 325     }
 326 }
 327
// UTF-8 buffers can be reinterpreted as WTF-8 for free. Converting back
// uses the default `revalidate_subset`, i.e. a full `UTF8::validate` pass.
unsafe impl SubsetOf<WTF8> for UTF8 {}
 329
// `UTF8` corresponds to Rust's `str`: `UTF8::validate` is exactly
// `str::from_utf8(..).is_ok()`.
unsafe impl SliceFormat for UTF8 {
    type Slice = str;
}
 333
 334 unsafe impl Slice for str {
 335     #[inline(always)]
 336     fn as_bytes(&self) -> &[u8] {
 337         str::as_bytes(self)
 338     }
 339
 340     #[inline(always)]
 341     unsafe fn from_bytes(x: &[u8]) -> &str {
 342         str::from_utf8_unchecked(x)
 343     }
 344
 345     #[inline(always)]
 346     unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut str {
 347         mem::transmute(x)
 348     }
 349 }
 350
 351 unsafe impl<'a> CharFormat<'a> for UTF8 {
 352     type Iter = str::CharIndices<'a>;
 353
 354     #[inline]
 355     unsafe fn char_indices(buf: &'a [u8]) -> str::CharIndices<'a> {
 356         str::from_utf8_unchecked(buf).char_indices()
 357     }
 358
 359     #[inline]
 360     fn encode_char<F>(ch: char, cont: F) -> Result<(), ()>
 361     where
 362         F: FnOnce(&[u8]),
 363     {
 364         cont(ch.encode_utf8(&mut [0_u8; 4]).as_bytes());
 365         Ok(())
 366     }
 367 }
 368
 369 /// Marker type for WTF-8 text.
 370 ///
 371 /// See the [WTF-8 spec](https://simonsapin.github.io/wtf-8/).
 372 #[derive(Copy, Clone, Default, Debug)]
 373 pub struct WTF8;
 374
 375 #[inline]
 376 fn wtf8_meaningful(m: Meaning) -> bool {
 377     match m {
 378         Meaning::Whole(_) | Meaning::LeadSurrogate(_) | Meaning::TrailSurrogate(_) => true,
 379         _ => false,
 380     }
 381 }
 382
 383 unsafe impl Format for WTF8 {
 384     #[inline]
 385     fn validate(buf: &[u8]) -> bool {
 386         let mut i = 0;
 387         let mut prev_lead = false;
 388         while i < buf.len() {
 389             let codept = unwrap_or_return!(futf::classify(buf, i), false);
 390             if !wtf8_meaningful(codept.meaning) {
 391                 return false;
 392             }
 393             i += codept.bytes.len();
 394             prev_lead = match codept.meaning {
 395                 Meaning::TrailSurrogate(_) if prev_lead => return false,
 396                 Meaning::LeadSurrogate(_) => true,
 397                 _ => false,
 398             };
 399         }
 400
 401         true
 402     }
 403
 404     #[inline]
 405     fn validate_prefix(buf: &[u8]) -> bool {
 406         if buf.len() == 0 {
 407             return true;
 408         }
 409         match futf::classify(buf, buf.len() - 1) {
 410             Some(c) => wtf8_meaningful(c.meaning),
 411             _ => false,
 412         }
 413     }
 414
 415     #[inline]
 416     fn validate_suffix(buf: &[u8]) -> bool {
 417         if buf.len() == 0 {
 418             return true;
 419         }
 420         match futf::classify(buf, 0) {
 421             Some(c) => wtf8_meaningful(c.meaning),
 422             _ => false,
 423         }
 424     }
 425
 426     #[inline]
 427     fn validate_subseq(buf: &[u8]) -> bool {
 428         <Self as Format>::validate_prefix(buf) && <Self as Format>::validate_suffix(buf)
 429     }
 430
 431     #[inline]
 432     unsafe fn fixup(lhs: &[u8], rhs: &[u8]) -> imp::Fixup {
 433         const ERR: &'static str = "WTF8: internal error";
 434
 435         if lhs.len() >= 3 && rhs.len() >= 3 {
 436             if let (
 437                 Some(Codepoint {
 438                     meaning: Meaning::LeadSurrogate(hi),
 439                     ..
 440                 }),
 441                 Some(Codepoint {
 442                     meaning: Meaning::TrailSurrogate(lo),
 443                     ..
 444                 }),
 445             ) = (futf::classify(lhs, lhs.len() - 1), futf::classify(rhs, 0))
 446             {
 447                 let mut fixup = imp::Fixup {
 448                     drop_left: 3,
 449                     drop_right: 3,
 450                     insert_len: 0,
 451                     insert_bytes: [0_u8; 4],
 452                 };
 453
 454                 let n = 0x10000 + ((hi as u32) << 10) + (lo as u32);
 455
 456                 let ch = char::from_u32(n).expect(ERR);
 457                 fixup.insert_len = ch.encode_utf8(&mut fixup.insert_bytes).len() as u32;
 458
 459                 return fixup;
 460             }
 461         }
 462
 463         Default::default()
 464     }
 465 }
 466
 467 /// Marker type for the single-byte encoding of the first 256 Unicode codepoints.
 468 ///
 469 /// This is IANA's "ISO-8859-1". It's ISO's "ISO 8859-1" with the addition of the
 470 /// C0 and C1 control characters from ECMA-48 / ISO 6429.
 471 ///
 472 /// Not to be confused with WHATWG's "latin1" or "iso8859-1" labels (or the
 473 /// many other aliases), which actually stand for Windows-1252.
 474 #[derive(Copy, Clone, Default, Debug)]
 475 pub struct Latin1;
 476
 477 unsafe impl Format for Latin1 {
 478     #[inline(always)]
 479     fn validate(_: &[u8]) -> bool {
 480         true
 481     }
 482
 483     #[inline(always)]
 484     fn validate_prefix(_: &[u8]) -> bool {
 485         true
 486     }
 487
 488     #[inline(always)]
 489     fn validate_suffix(_: &[u8]) -> bool {
 490         true
 491     }
 492
 493     #[inline(always)]
 494     fn validate_subseq(_: &[u8]) -> bool {
 495         true
 496     }
 497 }
 498
 499 unsafe impl<'a> CharFormat<'a> for Latin1 {
 500     type Iter = imp::SingleByteCharIndices<'a>;
 501
 502     #[inline]
 503     unsafe fn char_indices(buf: &'a [u8]) -> imp::SingleByteCharIndices<'a> {
 504         imp::SingleByteCharIndices::new(buf)
 505     }
 506
 507     #[inline]
 508     fn encode_char<F>(ch: char, cont: F) -> Result<(), ()>
 509     where
 510         F: FnOnce(&[u8]),
 511     {
 512         let n = ch as u32;
 513         if n > 0xFF {
 514             return Err(());
 515         }
 516         cont(&[n as u8]);
 517         Ok(())
 518     }
 519 }