]> git.proxmox.com Git - rustc.git/blob - vendor/regex-automata/src/dfa/accel.rs
New upstream version 1.67.1+dfsg1
[rustc.git] / vendor / regex-automata / src / dfa / accel.rs
1 // This module defines some core types for dealing with accelerated DFA states.
2 // Briefly, a DFA state can be "accelerated" if all of its transitions except
3 // for a few loop back to itself. This directly implies that the only way out
4 // of such a state is if a byte corresponding to one of those non-loopback
5 // transitions is found. Such states are often found in simple repetitions in
6 // non-Unicode regexes. For example, consider '(?-u)[^a]+a'. We can look at its
7 // DFA with regex-cli:
8 //
9 // $ regex-cli debug dfa dense '(?-u)[^a]+a' -BbC
10 // dense::DFA(
11 // D 000000:
12 // Q 000001:
13 // *000002:
14 // A 000003: \x00-` => 3, a => 5, b-\xFF => 3
15 // >000004: \x00-` => 3, a => 4, b-\xFF => 3
16 // 000005: \x00-\xFF => 2, EOI => 2
17 // )
18 //
19 // In particular, state 3 is accelerated (shown via the 'A' indicator) since
20 // the only way to leave that state once entered is to see an 'a' byte. If
21 // there is a long run of non-'a' bytes, then using something like 'memchr'
22 // to find the next 'a' byte can be significantly faster than just using the
23 // standard byte-at-a-time state machine.
24 //
25 // Unfortunately, this optimization rarely applies when Unicode is enabled.
26 // For example, patterns like '[^a]' don't actually match any byte that isn't
27 // 'a', but rather, any UTF-8 encoding of a Unicode scalar value that isn't
28 // 'a'. This makes the state machine much more complex---far beyond a single
29 // state---and removes the ability to easily accelerate it. (Because if the
30 // machine sees a non-UTF-8 sequence, then the machine won't match through it.)
31 //
32 // In practice, we only consider accelerating states that have 3 or fewer
33 // non-loop transitions. At a certain point, you get diminishing returns, but
34 // also because that's what the memchr crate supports. The structures below
35 // hard-code this assumption and provide (de)serialization APIs for use inside
36 // a DFA.
37 //
38 // And finally, note that there is some trickery involved in making it very
39 // fast to not only check whether a state is accelerated at search time, but
40 // also to access the bytes to search for to implement the acceleration itself.
41 // dfa/special.rs provides more detail, but the short story is that all
42 // accelerated states appear contiguously in a DFA. This means we can represent
43 // the ID space of all accelerated DFA states with a single range. So given
44 // a state ID, we can determine whether it's accelerated via
45 //
46 // min_accel_id <= id <= max_accel_id
47 //
48 // And find its corresponding accelerator with:
49 //
50 // accels.get((id - min_accel_id) / dfa_stride)
51
52 use core::convert::{TryFrom, TryInto};
53
54 #[cfg(feature = "alloc")]
55 use alloc::{vec, vec::Vec};
56
57 use crate::util::bytes::{self, DeserializeError, Endian, SerializeError};
58
59 /// The base type used to represent a collection of accelerators.
60 ///
61 /// While an `Accel` is represented as a fixed size array of bytes, a
62 /// *collection* of `Accel`s (called `Accels`) is represented internally as a
63 /// slice of u32. While it's a bit unnatural to do this and costs us a bit of
64 /// fairly low-risk not-safe code, it lets us remove the need for a second type
65 /// parameter in the definition of dense::DFA. (Which really wants everything
66 /// to be a slice of u32.)
67 type AccelTy = u32;
68
69 /// The size of the unit of representation for accelerators.
70 ///
71 /// ACCEL_CAP *must* be a multiple of this size.
72 const ACCEL_TY_SIZE: usize = core::mem::size_of::<AccelTy>();
73
74 /// The maximum length in bytes that a single Accel can be. This is distinct
75 /// from the capacity of an accelerator in that the length represents only the
76 /// bytes that should be read.
77 const ACCEL_LEN: usize = 4;
78
79 /// The capacity of each accelerator, in bytes. We set this to 8 since it's a
80 /// multiple of 4 (our ID size) and because it gives us a little wiggle room
81 /// if we want to support more accel bytes in the future without a breaking
82 /// change.
83 ///
84 /// This MUST be a multiple of ACCEL_TY_SIZE.
85 const ACCEL_CAP: usize = 8;
86
87 /// Search for between 1 and 3 needle bytes in the given haystack, starting the
88 /// search at the given position. If `needles` has a length other than 1-3,
89 /// then this panics.
90 #[inline(always)]
91 pub(crate) fn find_fwd(
92 needles: &[u8],
93 haystack: &[u8],
94 at: usize,
95 ) -> Option<usize> {
96 let bs = needles;
97 let i = match needles.len() {
98 1 => memchr::memchr(bs[0], &haystack[at..])?,
99 2 => memchr::memchr2(bs[0], bs[1], &haystack[at..])?,
100 3 => memchr::memchr3(bs[0], bs[1], bs[2], &haystack[at..])?,
101 0 => panic!("cannot find with empty needles"),
102 n => panic!("invalid needles length: {}", n),
103 };
104 Some(at + i)
105 }
106
107 /// Search for between 1 and 3 needle bytes in the given haystack in reverse,
108 /// starting the search at the given position. If `needles` has a length other
109 /// than 1-3, then this panics.
110 #[inline(always)]
111 pub(crate) fn find_rev(
112 needles: &[u8],
113 haystack: &[u8],
114 at: usize,
115 ) -> Option<usize> {
116 let bs = needles;
117 match needles.len() {
118 1 => memchr::memrchr(bs[0], &haystack[..at]),
119 2 => memchr::memrchr2(bs[0], bs[1], &haystack[..at]),
120 3 => memchr::memrchr3(bs[0], bs[1], bs[2], &haystack[..at]),
121 0 => panic!("cannot find with empty needles"),
122 n => panic!("invalid needles length: {}", n),
123 }
124 }
125
126 /// Represents the accelerators for all accelerated states in a dense DFA.
127 ///
128 /// The `A` type parameter represents the type of the underlying bytes.
129 /// Generally, this is either `&[AccelTy]` or `Vec<AccelTy>`.
130 #[derive(Clone)]
131 pub(crate) struct Accels<A> {
132 /// A length prefixed slice of contiguous accelerators. See the top comment
133 /// in this module for more details on how we can jump from a DFA's state
134 /// ID to an accelerator in this list.
135 ///
136 /// The first 4 bytes always correspond to the number of accelerators
137 /// that follow.
138 accels: A,
139 }
140
141 #[cfg(feature = "alloc")]
142 impl Accels<Vec<AccelTy>> {
143 /// Create an empty sequence of accelerators for a DFA.
144 pub fn empty() -> Accels<Vec<AccelTy>> {
145 Accels { accels: vec![0] }
146 }
147
148 /// Add an accelerator to this sequence.
149 ///
150 /// This adds to the accelerator to the end of the sequence and therefore
151 /// should be done in correspondence with its state in the DFA.
152 ///
153 /// This panics if this results in more accelerators than AccelTy::MAX.
154 pub fn add(&mut self, accel: Accel) {
155 self.accels.extend_from_slice(&accel.as_accel_tys());
156 let len = self.len();
157 self.set_len(len + 1);
158 }
159
160 /// Set the number of accelerators in this sequence, which is encoded in
161 /// the first 4 bytes of the underlying bytes.
162 fn set_len(&mut self, new_len: usize) {
163 // The only way an accelerator gets added is if a state exists for
164 // it, and if a state exists, then its index is guaranteed to be
165 // representable by a AccelTy by virtue of the guarantees provided by
166 // StateID.
167 let new_len = AccelTy::try_from(new_len).unwrap();
168 self.accels[0] = new_len;
169 }
170 }
171
172 impl<'a> Accels<&'a [AccelTy]> {
173 /// Deserialize a sequence of accelerators from the given bytes. If there
174 /// was a problem deserializing, then an error is returned.
175 ///
176 /// This is guaranteed to run in constant time. This does not guarantee
177 /// that every accelerator in the returned collection is valid. Thus,
178 /// accessing one may panic, or not-safe code that relies on accelerators
179 /// being correct my result in UB.
180 ///
181 /// Callers may check the validity of every accelerator with the `validate`
182 /// method.
183 pub unsafe fn from_bytes_unchecked(
184 mut slice: &'a [u8],
185 ) -> Result<(Accels<&'a [AccelTy]>, usize), DeserializeError> {
186 let slice_start = slice.as_ptr() as usize;
187
188 let (count, _) =
189 bytes::try_read_u32_as_usize(slice, "accelerators count")?;
190 // The accelerator count is part of the accel_tys slice that
191 // we deserialize. This is perhaps a bit idiosyncratic. It would
192 // probably be better to split out the count into a real field.
193
194 let accel_tys_count = bytes::add(
195 bytes::mul(count, 2, "total number of accelerator accel_tys")?,
196 1,
197 "total number of accel_tys",
198 )?;
199 let accel_tys_len = bytes::mul(
200 ACCEL_TY_SIZE,
201 accel_tys_count,
202 "total number of bytes in accelerators",
203 )?;
204 bytes::check_slice_len(slice, accel_tys_len, "accelerators")?;
205 bytes::check_alignment::<AccelTy>(slice)?;
206 let accel_tys = &slice[..accel_tys_len];
207 slice = &slice[accel_tys_len..];
208 // SAFETY: We've checked the length and alignment above, and since
209 // slice is just bytes, we can safely cast to a slice of &[AccelTy].
210 #[allow(unused_unsafe)]
211 let accels = unsafe {
212 core::slice::from_raw_parts(
213 accel_tys.as_ptr() as *const AccelTy,
214 accel_tys_count,
215 )
216 };
217 Ok((Accels { accels }, slice.as_ptr() as usize - slice_start))
218 }
219 }
220
221 impl<A: AsRef<[AccelTy]>> Accels<A> {
222 /// Return an owned version of the accelerators.
223 #[cfg(feature = "alloc")]
224 pub fn to_owned(&self) -> Accels<Vec<AccelTy>> {
225 Accels { accels: self.accels.as_ref().to_vec() }
226 }
227
228 /// Return a borrowed version of the accelerators.
229 pub fn as_ref(&self) -> Accels<&[AccelTy]> {
230 Accels { accels: self.accels.as_ref() }
231 }
232
233 /// Return the bytes representing the serialization of the accelerators.
234 pub fn as_bytes(&self) -> &[u8] {
235 let accels = self.accels.as_ref();
236 // SAFETY: This is safe because accels is a just a slice of AccelTy,
237 // and u8 always has a smaller alignment.
238 unsafe {
239 core::slice::from_raw_parts(
240 accels.as_ptr() as *const u8,
241 accels.len() * ACCEL_TY_SIZE,
242 )
243 }
244 }
245
246 /// Returns the memory usage, in bytes, of these accelerators.
247 ///
248 /// The memory usage is computed based on the number of bytes used to
249 /// represent all of the accelerators.
250 ///
251 /// This does **not** include the stack size used by this value.
252 pub fn memory_usage(&self) -> usize {
253 self.as_bytes().len()
254 }
255
256 /// Return the bytes to search for corresponding to the accelerator in this
257 /// sequence at index `i`. If no such accelerator exists, then this panics.
258 ///
259 /// The significance of the index is that it should be in correspondence
260 /// with the index of the corresponding DFA. That is, accelerated DFA
261 /// states are stored contiguously in the DFA and have an ordering implied
262 /// by their respective state IDs. The state's index in that sequence
263 /// corresponds to the index of its corresponding accelerator.
264 #[inline(always)]
265 pub fn needles(&self, i: usize) -> &[u8] {
266 if i >= self.len() {
267 panic!("invalid accelerator index {}", i);
268 }
269 let bytes = self.as_bytes();
270 let offset = ACCEL_TY_SIZE + i * ACCEL_CAP;
271 let len = bytes[offset] as usize;
272 &bytes[offset + 1..offset + 1 + len]
273 }
274
275 /// Return the total number of accelerators in this sequence.
276 pub fn len(&self) -> usize {
277 // This should never panic since deserialization checks that the
278 // length can fit into a usize.
279 usize::try_from(self.accels.as_ref()[0]).unwrap()
280 }
281
282 /// Return the accelerator in this sequence at index `i`. If no such
283 /// accelerator exists, then this returns None.
284 ///
285 /// See the docs for `needles` on the significance of the index.
286 fn get(&self, i: usize) -> Option<Accel> {
287 if i >= self.len() {
288 return None;
289 }
290 let offset = ACCEL_TY_SIZE + i * ACCEL_CAP;
291 let accel = Accel::from_slice(&self.as_bytes()[offset..])
292 .expect("Accels must contain valid accelerators");
293 Some(accel)
294 }
295
296 /// Returns an iterator of accelerators in this sequence.
297 fn iter(&self) -> IterAccels<'_, A> {
298 IterAccels { accels: self, i: 0 }
299 }
300
301 /// Writes these accelerators to the given byte buffer using the indicated
302 /// endianness. If the given buffer is too small, then an error is
303 /// returned. Upon success, the total number of bytes written is returned.
304 /// The number of bytes written is guaranteed to be a multiple of 8.
305 pub fn write_to<E: Endian>(
306 &self,
307 dst: &mut [u8],
308 ) -> Result<usize, SerializeError> {
309 let nwrite = self.write_to_len();
310 assert_eq!(
311 nwrite % ACCEL_TY_SIZE,
312 0,
313 "expected accelerator bytes written to be a multiple of {}",
314 ACCEL_TY_SIZE,
315 );
316 if dst.len() < nwrite {
317 return Err(SerializeError::buffer_too_small("accelerators"));
318 }
319
320 // The number of accelerators can never exceed AccelTy::MAX.
321 E::write_u32(AccelTy::try_from(self.len()).unwrap(), dst);
322 // The actual accelerators are just raw bytes and thus their endianness
323 // is irrelevant. So we can copy them as bytes.
324 dst[ACCEL_TY_SIZE..nwrite]
325 .copy_from_slice(&self.as_bytes()[ACCEL_TY_SIZE..nwrite]);
326 Ok(nwrite)
327 }
328
329 /// Validates that every accelerator in this collection can be successfully
330 /// deserialized as a valid accelerator.
331 pub fn validate(&self) -> Result<(), DeserializeError> {
332 for chunk in self.as_bytes()[ACCEL_TY_SIZE..].chunks(ACCEL_CAP) {
333 let _ = Accel::from_slice(chunk)?;
334 }
335 Ok(())
336 }
337
338 /// Returns the total number of bytes written by `write_to`.
339 pub fn write_to_len(&self) -> usize {
340 self.as_bytes().len()
341 }
342 }
343
344 impl<A: AsRef<[AccelTy]>> core::fmt::Debug for Accels<A> {
345 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
346 write!(f, "Accels(")?;
347 let mut list = f.debug_list();
348 for a in self.iter() {
349 list.entry(&a);
350 }
351 list.finish()?;
352 write!(f, ")")
353 }
354 }
355
356 #[derive(Debug)]
357 struct IterAccels<'a, A: AsRef<[AccelTy]>> {
358 accels: &'a Accels<A>,
359 i: usize,
360 }
361
362 impl<'a, A: AsRef<[AccelTy]>> Iterator for IterAccels<'a, A> {
363 type Item = Accel;
364
365 fn next(&mut self) -> Option<Accel> {
366 let accel = self.accels.get(self.i)?;
367 self.i += 1;
368 Some(accel)
369 }
370 }
371
372 /// Accel represents a structure for determining how to "accelerate" a DFA
373 /// state.
374 ///
375 /// Namely, it contains zero or more bytes that must be seen in order for the
376 /// DFA to leave the state it is associated with. In practice, the actual range
377 /// is 1 to 3 bytes.
378 ///
379 /// The purpose of acceleration is to identify states whose vast majority
380 /// of transitions are just loops back to the same state. For example,
381 /// in the regex `(?-u)^[^a]+b`, the corresponding DFA will have a state
382 /// (corresponding to `[^a]+`) where all transitions *except* for `a` and
383 /// `b` loop back to itself. Thus, this state can be "accelerated" by simply
384 /// looking for the next occurrence of either `a` or `b` instead of explicitly
385 /// following transitions. (In this case, `b` transitions to the next state
386 /// where as `a` would transition to the dead state.)
387 #[derive(Clone)]
388 pub(crate) struct Accel {
389 /// The first byte is the length. Subsequent bytes are the accelerated
390 /// bytes.
391 ///
392 /// Note that we make every accelerator 8 bytes as a slightly wasteful
393 /// way of making sure alignment is always correct for state ID sizes of
394 /// 1, 2, 4 and 8. This should be okay since accelerated states aren't
395 /// particularly common, especially when Unicode is enabled.
396 bytes: [u8; ACCEL_CAP],
397 }
398
399 impl Accel {
400 /// Returns an empty accel, where no bytes are accelerated.
401 #[cfg(feature = "alloc")]
402 pub fn new() -> Accel {
403 Accel { bytes: [0; ACCEL_CAP] }
404 }
405
406 /// Returns a verified accelerator derived from the beginning of the given
407 /// slice.
408 ///
409 /// If the slice is not long enough or contains invalid bytes for an
410 /// accelerator, then this returns an error.
411 pub fn from_slice(mut slice: &[u8]) -> Result<Accel, DeserializeError> {
412 slice = &slice[..core::cmp::min(ACCEL_LEN, slice.len())];
413 let bytes = slice
414 .try_into()
415 .map_err(|_| DeserializeError::buffer_too_small("accelerator"))?;
416 Accel::from_bytes(bytes)
417 }
418
419 /// Returns a verified accelerator derived from raw bytes.
420 ///
421 /// If the given bytes are invalid, then this returns an error.
422 fn from_bytes(bytes: [u8; 4]) -> Result<Accel, DeserializeError> {
423 if bytes[0] as usize >= ACCEL_LEN {
424 return Err(DeserializeError::generic(
425 "accelerator bytes cannot have length more than 3",
426 ));
427 }
428 Ok(Accel::from_bytes_unchecked(bytes))
429 }
430
431 /// Returns an accelerator derived from raw bytes.
432 ///
433 /// This does not check whether the given bytes are valid. Invalid bytes
434 /// cannot sacrifice memory safety, but may result in panics or silent
435 /// logic bugs.
436 fn from_bytes_unchecked(bytes: [u8; 4]) -> Accel {
437 Accel { bytes: [bytes[0], bytes[1], bytes[2], bytes[3], 0, 0, 0, 0] }
438 }
439
440 /// Attempts to add the given byte to this accelerator. If the accelerator
441 /// is already full then this returns false. Otherwise, returns true.
442 ///
443 /// If the given byte is already in this accelerator, then it panics.
444 #[cfg(feature = "alloc")]
445 pub fn add(&mut self, byte: u8) -> bool {
446 if self.len() >= 3 {
447 return false;
448 }
449 assert!(
450 !self.contains(byte),
451 "accelerator already contains {:?}",
452 crate::util::DebugByte(byte)
453 );
454 self.bytes[self.len() + 1] = byte;
455 self.bytes[0] += 1;
456 true
457 }
458
459 /// Return the number of bytes in this accelerator.
460 pub fn len(&self) -> usize {
461 self.bytes[0] as usize
462 }
463
464 /// Returns true if and only if there are no bytes in this accelerator.
465 #[cfg(feature = "alloc")]
466 pub fn is_empty(&self) -> bool {
467 self.len() == 0
468 }
469
470 /// Returns the slice of bytes to accelerate.
471 ///
472 /// If this accelerator is empty, then this returns an empty slice.
473 fn needles(&self) -> &[u8] {
474 &self.bytes[1..1 + self.len()]
475 }
476
477 /// Returns true if and only if this accelerator will accelerate the given
478 /// byte.
479 #[cfg(feature = "alloc")]
480 fn contains(&self, byte: u8) -> bool {
481 self.needles().iter().position(|&b| b == byte).is_some()
482 }
483
484 /// Returns the accelerator bytes as an array of AccelTys.
485 #[cfg(feature = "alloc")]
486 fn as_accel_tys(&self) -> [AccelTy; 2] {
487 assert_eq!(ACCEL_CAP, 8);
488 // These unwraps are OK since ACCEL_CAP is set to 8.
489 let first =
490 AccelTy::from_ne_bytes(self.bytes[0..4].try_into().unwrap());
491 let second =
492 AccelTy::from_ne_bytes(self.bytes[4..8].try_into().unwrap());
493 [first, second]
494 }
495 }
496
497 impl core::fmt::Debug for Accel {
498 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
499 write!(f, "Accel(")?;
500 let mut set = f.debug_set();
501 for &b in self.needles() {
502 set.entry(&crate::util::DebugByte(b));
503 }
504 set.finish()?;
505 write!(f, ")")
506 }
507 }