vendor/regex-automata/src/dfa/accel.rs

   1 // This module defines some core types for dealing with accelerated DFA states.
   2 // Briefly, a DFA state can be "accelerated" if all of its transitions except
   3 // for a few loop back to itself. This directly implies that the only way out
   4 // of such a state is if a byte corresponding to one of those non-loopback
   5 // transitions is found. Such states are often found in simple repetitions in
   6 // non-Unicode regexes. For example, consider '(?-u)[^a]+a'. We can look at its
   7 // DFA with regex-cli:
   8 //
   9 //     $ regex-cli debug dfa dense '(?-u)[^a]+a' -BbC
  10 //     dense::DFA(
  11 //     D 000000:
  12 //     Q 000001:
  13 //      *000002:
  14 //     A 000003: \x00-` => 3, a => 5, b-\xFF => 3
  15 //      >000004: \x00-` => 3, a => 4, b-\xFF => 3
  16 //       000005: \x00-\xFF => 2, EOI => 2
  17 //     )
  18 //
  19 // In particular, state 3 is accelerated (shown via the 'A' indicator) since
  20 // the only way to leave that state once entered is to see an 'a' byte. If
  21 // there is a long run of non-'a' bytes, then using something like 'memchr'
  22 // to find the next 'a' byte can be significantly faster than just using the
  23 // standard byte-at-a-time state machine.
  24 //
  25 // Unfortunately, this optimization rarely applies when Unicode is enabled.
  26 // For example, patterns like '[^a]' don't actually match any byte that isn't
  27 // 'a', but rather, any UTF-8 encoding of a Unicode scalar value that isn't
  28 // 'a'. This makes the state machine much more complex---far beyond a single
  29 // state---and removes the ability to easily accelerate it. (Because if the
  30 // machine sees a non-UTF-8 sequence, then the machine won't match through it.)
  31 //
  32 // In practice, we only consider accelerating states that have 3 or fewer
  33 // non-loop transitions. At a certain point, you get diminishing returns, but
  34 // also because that's what the memchr crate supports. The structures below
  35 // hard-code this assumption and provide (de)serialization APIs for use inside
  36 // a DFA.
  37 //
  38 // And finally, note that there is some trickery involved in making it very
  39 // fast to not only check whether a state is accelerated at search time, but
  40 // also to access the bytes to search for to implement the acceleration itself.
  41 // dfa/special.rs provides more detail, but the short story is that all
  42 // accelerated states appear contiguously in a DFA. This means we can represent
  43 // the ID space of all accelerated DFA states with a single range. So given
  44 // a state ID, we can determine whether it's accelerated via
  45 //
  46 //     min_accel_id <= id <= max_accel_id
  47 //
  48 // And find its corresponding accelerator with:
  49 //
  50 //     accels.get((id - min_accel_id) / dfa_stride)
  51
  52 use core::convert::{TryFrom, TryInto};
  53
  54 #[cfg(feature = "alloc")]
  55 use alloc::{vec, vec::Vec};
  56
  57 use crate::util::bytes::{self, DeserializeError, Endian, SerializeError};
  58
  59 /// The base type used to represent a collection of accelerators.
  60 ///
  61 /// While an `Accel` is represented as a fixed size array of bytes, a
  62 /// *collection* of `Accel`s (called `Accels`) is represented internally as a
  63 /// slice of u32. While it's a bit unnatural to do this and costs us a bit of
  64 /// fairly low-risk not-safe code, it lets us remove the need for a second type
  65 /// parameter in the definition of dense::DFA. (Which really wants everything
  66 /// to be a slice of u32.)
  67 type AccelTy = u32;
  68
  69 /// The size of the unit of representation for accelerators.
  70 ///
  71 /// ACCEL_CAP *must* be a multiple of this size.
  72 const ACCEL_TY_SIZE: usize = core::mem::size_of::<AccelTy>();
  73
  74 /// The maximum length in bytes that a single Accel can be. This is distinct
  75 /// from the capacity of an accelerator in that the length represents only the
  76 /// bytes that should be read.
  77 const ACCEL_LEN: usize = 4;
  78
  79 /// The capacity of each accelerator, in bytes. We set this to 8 since it's a
  80 /// multiple of 4 (our ID size) and because it gives us a little wiggle room
  81 /// if we want to support more accel bytes in the future without a breaking
  82 /// change.
  83 ///
  84 /// This MUST be a multiple of ACCEL_TY_SIZE.
  85 const ACCEL_CAP: usize = 8;
  86
  87 /// Search for between 1 and 3 needle bytes in the given haystack, starting the
  88 /// search at the given position. If `needles` has a length other than 1-3,
  89 /// then this panics.
  90 #[inline(always)]
  91 pub(crate) fn find_fwd(
  92     needles: &[u8],
  93     haystack: &[u8],
  94     at: usize,
  95 ) -> Option<usize> {
  96     let bs = needles;
  97     let i = match needles.len() {
  98         1 => memchr::memchr(bs[0], &haystack[at..])?,
  99         2 => memchr::memchr2(bs[0], bs[1], &haystack[at..])?,
 100         3 => memchr::memchr3(bs[0], bs[1], bs[2], &haystack[at..])?,
 101         0 => panic!("cannot find with empty needles"),
 102         n => panic!("invalid needles length: {}", n),
 103     };
 104     Some(at + i)
 105 }
 106
 107 /// Search for between 1 and 3 needle bytes in the given haystack in reverse,
 108 /// starting the search at the given position. If `needles` has a length other
 109 /// than 1-3, then this panics.
 110 #[inline(always)]
 111 pub(crate) fn find_rev(
 112     needles: &[u8],
 113     haystack: &[u8],
 114     at: usize,
 115 ) -> Option<usize> {
 116     let bs = needles;
 117     match needles.len() {
 118         1 => memchr::memrchr(bs[0], &haystack[..at]),
 119         2 => memchr::memrchr2(bs[0], bs[1], &haystack[..at]),
 120         3 => memchr::memrchr3(bs[0], bs[1], bs[2], &haystack[..at]),
 121         0 => panic!("cannot find with empty needles"),
 122         n => panic!("invalid needles length: {}", n),
 123     }
 124 }
 125
 126 /// Represents the accelerators for all accelerated states in a dense DFA.
 127 ///
 128 /// The `A` type parameter represents the type of the underlying bytes.
 129 /// Generally, this is either `&[AccelTy]` or `Vec<AccelTy>`.
 130 #[derive(Clone)]
 131 pub(crate) struct Accels<A> {
 132     /// A length prefixed slice of contiguous accelerators. See the top comment
 133     /// in this module for more details on how we can jump from a DFA's state
 134     /// ID to an accelerator in this list.
 135     ///
 136     /// The first 4 bytes always correspond to the number of accelerators
 137     /// that follow.
 138     accels: A,
 139 }
 140
 141 #[cfg(feature = "alloc")]
 142 impl Accels<Vec<AccelTy>> {
 143     /// Create an empty sequence of accelerators for a DFA.
 144     pub fn empty() -> Accels<Vec<AccelTy>> {
 145         Accels { accels: vec![0] }
 146     }
 147
 148     /// Add an accelerator to this sequence.
 149     ///
 150     /// This adds to the accelerator to the end of the sequence and therefore
 151     /// should be done in correspondence with its state in the DFA.
 152     ///
 153     /// This panics if this results in more accelerators than AccelTy::MAX.
 154     pub fn add(&mut self, accel: Accel) {
 155         self.accels.extend_from_slice(&accel.as_accel_tys());
 156         let len = self.len();
 157         self.set_len(len + 1);
 158     }
 159
 160     /// Set the number of accelerators in this sequence, which is encoded in
 161     /// the first 4 bytes of the underlying bytes.
 162     fn set_len(&mut self, new_len: usize) {
 163         // The only way an accelerator gets added is if a state exists for
 164         // it, and if a state exists, then its index is guaranteed to be
 165         // representable by a AccelTy by virtue of the guarantees provided by
 166         // StateID.
 167         let new_len = AccelTy::try_from(new_len).unwrap();
 168         self.accels[0] = new_len;
 169     }
 170 }
 171
 172 impl<'a> Accels<&'a [AccelTy]> {
 173     /// Deserialize a sequence of accelerators from the given bytes. If there
 174     /// was a problem deserializing, then an error is returned.
 175     ///
 176     /// This is guaranteed to run in constant time. This does not guarantee
 177     /// that every accelerator in the returned collection is valid. Thus,
 178     /// accessing one may panic, or not-safe code that relies on accelerators
 179     /// being correct my result in UB.
 180     ///
 181     /// Callers may check the validity of every accelerator with the `validate`
 182     /// method.
 183     pub unsafe fn from_bytes_unchecked(
 184         mut slice: &'a [u8],
 185     ) -> Result<(Accels<&'a [AccelTy]>, usize), DeserializeError> {
 186         let slice_start = slice.as_ptr() as usize;
 187
 188         let (count, _) =
 189             bytes::try_read_u32_as_usize(slice, "accelerators count")?;
 190         // The accelerator count is part of the accel_tys slice that
 191         // we deserialize. This is perhaps a bit idiosyncratic. It would
 192         // probably be better to split out the count into a real field.
 193
 194         let accel_tys_count = bytes::add(
 195             bytes::mul(count, 2, "total number of accelerator accel_tys")?,
 196             1,
 197             "total number of accel_tys",
 198         )?;
 199         let accel_tys_len = bytes::mul(
 200             ACCEL_TY_SIZE,
 201             accel_tys_count,
 202             "total number of bytes in accelerators",
 203         )?;
 204         bytes::check_slice_len(slice, accel_tys_len, "accelerators")?;
 205         bytes::check_alignment::<AccelTy>(slice)?;
 206         let accel_tys = &slice[..accel_tys_len];
 207         slice = &slice[accel_tys_len..];
 208         // SAFETY: We've checked the length and alignment above, and since
 209         // slice is just bytes, we can safely cast to a slice of &[AccelTy].
 210         #[allow(unused_unsafe)]
 211         let accels = unsafe {
 212             core::slice::from_raw_parts(
 213                 accel_tys.as_ptr() as *const AccelTy,
 214                 accel_tys_count,
 215             )
 216         };
 217         Ok((Accels { accels }, slice.as_ptr() as usize - slice_start))
 218     }
 219 }
 220
 221 impl<A: AsRef<[AccelTy]>> Accels<A> {
 222     /// Return an owned version of the accelerators.
 223     #[cfg(feature = "alloc")]
 224     pub fn to_owned(&self) -> Accels<Vec<AccelTy>> {
 225         Accels { accels: self.accels.as_ref().to_vec() }
 226     }
 227
 228     /// Return a borrowed version of the accelerators.
 229     pub fn as_ref(&self) -> Accels<&[AccelTy]> {
 230         Accels { accels: self.accels.as_ref() }
 231     }
 232
 233     /// Return the bytes representing the serialization of the accelerators.
 234     pub fn as_bytes(&self) -> &[u8] {
 235         let accels = self.accels.as_ref();
 236         // SAFETY: This is safe because accels is a just a slice of AccelTy,
 237         // and u8 always has a smaller alignment.
 238         unsafe {
 239             core::slice::from_raw_parts(
 240                 accels.as_ptr() as *const u8,
 241                 accels.len() * ACCEL_TY_SIZE,
 242             )
 243         }
 244     }
 245
 246     /// Returns the memory usage, in bytes, of these accelerators.
 247     ///
 248     /// The memory usage is computed based on the number of bytes used to
 249     /// represent all of the accelerators.
 250     ///
 251     /// This does **not** include the stack size used by this value.
 252     pub fn memory_usage(&self) -> usize {
 253         self.as_bytes().len()
 254     }
 255
 256     /// Return the bytes to search for corresponding to the accelerator in this
 257     /// sequence at index `i`. If no such accelerator exists, then this panics.
 258     ///
 259     /// The significance of the index is that it should be in correspondence
 260     /// with the index of the corresponding DFA. That is, accelerated DFA
 261     /// states are stored contiguously in the DFA and have an ordering implied
 262     /// by their respective state IDs. The state's index in that sequence
 263     /// corresponds to the index of its corresponding accelerator.
 264     #[inline(always)]
 265     pub fn needles(&self, i: usize) -> &[u8] {
 266         if i >= self.len() {
 267             panic!("invalid accelerator index {}", i);
 268         }
 269         let bytes = self.as_bytes();
 270         let offset = ACCEL_TY_SIZE + i * ACCEL_CAP;
 271         let len = bytes[offset] as usize;
 272         &bytes[offset + 1..offset + 1 + len]
 273     }
 274
 275     /// Return the total number of accelerators in this sequence.
 276     pub fn len(&self) -> usize {
 277         // This should never panic since deserialization checks that the
 278         // length can fit into a usize.
 279         usize::try_from(self.accels.as_ref()[0]).unwrap()
 280     }
 281
 282     /// Return the accelerator in this sequence at index `i`. If no such
 283     /// accelerator exists, then this returns None.
 284     ///
 285     /// See the docs for `needles` on the significance of the index.
 286     fn get(&self, i: usize) -> Option<Accel> {
 287         if i >= self.len() {
 288             return None;
 289         }
 290         let offset = ACCEL_TY_SIZE + i * ACCEL_CAP;
 291         let accel = Accel::from_slice(&self.as_bytes()[offset..])
 292             .expect("Accels must contain valid accelerators");
 293         Some(accel)
 294     }
 295
 296     /// Returns an iterator of accelerators in this sequence.
 297     fn iter(&self) -> IterAccels<'_, A> {
 298         IterAccels { accels: self, i: 0 }
 299     }
 300
 301     /// Writes these accelerators to the given byte buffer using the indicated
 302     /// endianness. If the given buffer is too small, then an error is
 303     /// returned. Upon success, the total number of bytes written is returned.
 304     /// The number of bytes written is guaranteed to be a multiple of 8.
 305     pub fn write_to<E: Endian>(
 306         &self,
 307         dst: &mut [u8],
 308     ) -> Result<usize, SerializeError> {
 309         let nwrite = self.write_to_len();
 310         assert_eq!(
 311             nwrite % ACCEL_TY_SIZE,
 312             0,
 313             "expected accelerator bytes written to be a multiple of {}",
 314             ACCEL_TY_SIZE,
 315         );
 316         if dst.len() < nwrite {
 317             return Err(SerializeError::buffer_too_small("accelerators"));
 318         }
 319
 320         // The number of accelerators can never exceed AccelTy::MAX.
 321         E::write_u32(AccelTy::try_from(self.len()).unwrap(), dst);
 322         // The actual accelerators are just raw bytes and thus their endianness
 323         // is irrelevant. So we can copy them as bytes.
 324         dst[ACCEL_TY_SIZE..nwrite]
 325             .copy_from_slice(&self.as_bytes()[ACCEL_TY_SIZE..nwrite]);
 326         Ok(nwrite)
 327     }
 328
 329     /// Validates that every accelerator in this collection can be successfully
 330     /// deserialized as a valid accelerator.
 331     pub fn validate(&self) -> Result<(), DeserializeError> {
 332         for chunk in self.as_bytes()[ACCEL_TY_SIZE..].chunks(ACCEL_CAP) {
 333             let _ = Accel::from_slice(chunk)?;
 334         }
 335         Ok(())
 336     }
 337
 338     /// Returns the total number of bytes written by `write_to`.
 339     pub fn write_to_len(&self) -> usize {
 340         self.as_bytes().len()
 341     }
 342 }
 343
 344 impl<A: AsRef<[AccelTy]>> core::fmt::Debug for Accels<A> {
 345     fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
 346         write!(f, "Accels(")?;
 347         let mut list = f.debug_list();
 348         for a in self.iter() {
 349             list.entry(&a);
 350         }
 351         list.finish()?;
 352         write!(f, ")")
 353     }
 354 }
 355
 356 #[derive(Debug)]
 357 struct IterAccels<'a, A: AsRef<[AccelTy]>> {
 358     accels: &'a Accels<A>,
 359     i: usize,
 360 }
 361
 362 impl<'a, A: AsRef<[AccelTy]>> Iterator for IterAccels<'a, A> {
 363     type Item = Accel;
 364
 365     fn next(&mut self) -> Option<Accel> {
 366         let accel = self.accels.get(self.i)?;
 367         self.i += 1;
 368         Some(accel)
 369     }
 370 }
 371
 372 /// Accel represents a structure for determining how to "accelerate" a DFA
 373 /// state.
 374 ///
 375 /// Namely, it contains zero or more bytes that must be seen in order for the
 376 /// DFA to leave the state it is associated with. In practice, the actual range
 377 /// is 1 to 3 bytes.
 378 ///
 379 /// The purpose of acceleration is to identify states whose vast majority
 380 /// of transitions are just loops back to the same state. For example,
 381 /// in the regex `(?-u)^[^a]+b`, the corresponding DFA will have a state
 382 /// (corresponding to `[^a]+`) where all transitions *except* for `a` and
 383 /// `b` loop back to itself. Thus, this state can be "accelerated" by simply
 384 /// looking for the next occurrence of either `a` or `b` instead of explicitly
 385 /// following transitions. (In this case, `b` transitions to the next state
 386 /// where as `a` would transition to the dead state.)
 387 #[derive(Clone)]
 388 pub(crate) struct Accel {
 389     /// The first byte is the length. Subsequent bytes are the accelerated
 390     /// bytes.
 391     ///
 392     /// Note that we make every accelerator 8 bytes as a slightly wasteful
 393     /// way of making sure alignment is always correct for state ID sizes of
 394     /// 1, 2, 4 and 8. This should be okay since accelerated states aren't
 395     /// particularly common, especially when Unicode is enabled.
 396     bytes: [u8; ACCEL_CAP],
 397 }
 398
 399 impl Accel {
 400     /// Returns an empty accel, where no bytes are accelerated.
 401     #[cfg(feature = "alloc")]
 402     pub fn new() -> Accel {
 403         Accel { bytes: [0; ACCEL_CAP] }
 404     }
 405
 406     /// Returns a verified accelerator derived from the beginning of the given
 407     /// slice.
 408     ///
 409     /// If the slice is not long enough or contains invalid bytes for an
 410     /// accelerator, then this returns an error.
 411     pub fn from_slice(mut slice: &[u8]) -> Result<Accel, DeserializeError> {
 412         slice = &slice[..core::cmp::min(ACCEL_LEN, slice.len())];
 413         let bytes = slice
 414             .try_into()
 415             .map_err(|_| DeserializeError::buffer_too_small("accelerator"))?;
 416         Accel::from_bytes(bytes)
 417     }
 418
 419     /// Returns a verified accelerator derived from raw bytes.
 420     ///
 421     /// If the given bytes are invalid, then this returns an error.
 422     fn from_bytes(bytes: [u8; 4]) -> Result<Accel, DeserializeError> {
 423         if bytes[0] as usize >= ACCEL_LEN {
 424             return Err(DeserializeError::generic(
 425                 "accelerator bytes cannot have length more than 3",
 426             ));
 427         }
 428         Ok(Accel::from_bytes_unchecked(bytes))
 429     }
 430
 431     /// Returns an accelerator derived from raw bytes.
 432     ///
 433     /// This does not check whether the given bytes are valid. Invalid bytes
 434     /// cannot sacrifice memory safety, but may result in panics or silent
 435     /// logic bugs.
 436     fn from_bytes_unchecked(bytes: [u8; 4]) -> Accel {
 437         Accel { bytes: [bytes[0], bytes[1], bytes[2], bytes[3], 0, 0, 0, 0] }
 438     }
 439
 440     /// Attempts to add the given byte to this accelerator. If the accelerator
 441     /// is already full then this returns false. Otherwise, returns true.
 442     ///
 443     /// If the given byte is already in this accelerator, then it panics.
 444     #[cfg(feature = "alloc")]
 445     pub fn add(&mut self, byte: u8) -> bool {
 446         if self.len() >= 3 {
 447             return false;
 448         }
 449         assert!(
 450             !self.contains(byte),
 451             "accelerator already contains {:?}",
 452             crate::util::DebugByte(byte)
 453         );
 454         self.bytes[self.len() + 1] = byte;
 455         self.bytes[0] += 1;
 456         true
 457     }
 458
 459     /// Return the number of bytes in this accelerator.
 460     pub fn len(&self) -> usize {
 461         self.bytes[0] as usize
 462     }
 463
 464     /// Returns true if and only if there are no bytes in this accelerator.
 465     #[cfg(feature = "alloc")]
 466     pub fn is_empty(&self) -> bool {
 467         self.len() == 0
 468     }
 469
 470     /// Returns the slice of bytes to accelerate.
 471     ///
 472     /// If this accelerator is empty, then this returns an empty slice.
 473     fn needles(&self) -> &[u8] {
 474         &self.bytes[1..1 + self.len()]
 475     }
 476
 477     /// Returns true if and only if this accelerator will accelerate the given
 478     /// byte.
 479     #[cfg(feature = "alloc")]
 480     fn contains(&self, byte: u8) -> bool {
 481         self.needles().iter().position(|&b| b == byte).is_some()
 482     }
 483
 484     /// Returns the accelerator bytes as an array of AccelTys.
 485     #[cfg(feature = "alloc")]
 486     fn as_accel_tys(&self) -> [AccelTy; 2] {
 487         assert_eq!(ACCEL_CAP, 8);
 488         // These unwraps are OK since ACCEL_CAP is set to 8.
 489         let first =
 490             AccelTy::from_ne_bytes(self.bytes[0..4].try_into().unwrap());
 491         let second =
 492             AccelTy::from_ne_bytes(self.bytes[4..8].try_into().unwrap());
 493         [first, second]
 494     }
 495 }
 496
 497 impl core::fmt::Debug for Accel {
 498     fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
 499         write!(f, "Accel(")?;
 500         let mut set = f.debug_set();
 501         for &b in self.needles() {
 502             set.entry(&crate::util::DebugByte(b));
 503         }
 504         set.finish()?;
 505         write!(f, ")")
 506     }
 507 }