vendor/regex-automata/src/dfa/search_unsafe.rs

   1 use crate::dfa::automaton::{Automaton, State};
   2 use crate::MatchError;
   3
   4 /// This is marked as `inline(always)` specifically because it supports
   5 /// multiple modes of searching. Namely, the 'earliest' boolean getting inlined
   6 /// permits eliminating a few crucial branches.
   7 #[inline(always)]
   8 pub fn find_fwd<A: Automaton + ?Sized>(
   9     dfa: &A,
  10     bytes: &[u8],
  11     start: usize,
  12     end: usize,
  13     earliest: bool,
  14 ) -> Result<Option<usize>, MatchError> {
  15     assert!(start <= end);
  16     assert!(start <= bytes.len());
  17     assert!(end <= bytes.len());
  18
  19     let (mut state, mut last_match) = init_fwd(dfa, bytes, start, end)?;
  20     if earliest && last_match.is_some() {
  21         return Ok(last_match);
  22     }
  23
  24     let mut at = start;
  25     while at < end {
  26         let byte = bytes[at];
  27         state = dfa.next_state(state, byte);
  28         at += 1;
  29         if dfa.is_special_state(state) {
  30             if dfa.is_dead_state(state) {
  31                 return Ok(last_match);
  32             } else if dfa.is_quit_state(state) {
  33                 return Err(MatchError::Quit { byte, offset: at - 1 });
  34             }
  35             last_match = Some(at - dfa.match_offset());
  36             if earliest {
  37                 return Ok(last_match);
  38             }
  39         }
  40     }
  41     /*
  42     unsafe {
  43         let mut p = bytes.as_ptr().add(start);
  44         while p < bytes[end..].as_ptr() {
  45             let byte = *p;
  46             state = dfa.next_state_unchecked(state, byte);
  47             p = p.add(1);
  48             if dfa.is_special_state(state) {
  49                 if dfa.is_dead_state(state) {
  50                     return Ok(last_match);
  51                 } else if dfa.is_quit_state(state) {
  52                     return Err(MatchError::Quit {
  53                         byte,
  54                         offset: offset(bytes, p) - 1,
  55                     });
  56                 }
  57                 last_match = Some(offset(bytes, p) - dfa.match_offset());
  58                 if earliest {
  59                     return Ok(last_match);
  60                 }
  61             }
  62         }
  63     }
  64     */
  65     Ok(eof_fwd(dfa, bytes, end, &mut state)?.or(last_match))
  66 }
  67
  68 /// This is marked as `inline(always)` specifically because it supports
  69 /// multiple modes of searching. Namely, the 'earliest' boolean getting inlined
  70 /// permits eliminating a few crucial branches.
  71 #[inline(always)]
  72 pub fn find_rev<A: Automaton + ?Sized>(
  73     dfa: &A,
  74     bytes: &[u8],
  75     start: usize,
  76     end: usize,
  77     earliest: bool,
  78 ) -> Result<Option<usize>, MatchError> {
  79     assert!(start <= end);
  80     assert!(start <= bytes.len());
  81     assert!(end <= bytes.len());
  82
  83     let (mut state, mut last_match) = init_rev(dfa, bytes, start, end)?;
  84     if earliest && last_match.is_some() {
  85         return Ok(last_match);
  86     }
  87
  88     let mut at = end;
  89     while at > start {
  90         at -= 1;
  91         let byte = bytes[at];
  92         state = dfa.next_state(state, byte);
  93         if dfa.is_special_state(state) {
  94             if dfa.is_dead_state(state) {
  95                 return Ok(last_match);
  96             } else if dfa.is_quit_state(state) {
  97                 return Err(MatchError::Quit { byte, offset: at });
  98             }
  99             last_match = Some(at + dfa.match_offset());
 100             if earliest {
 101                 return Ok(last_match);
 102             }
 103         }
 104     }
 105     /*
 106     unsafe {
 107         let mut p = bytes.as_ptr().add(end);
 108         while p > bytes[start..].as_ptr() {
 109             p = p.sub(1);
 110             let byte = *p;
 111             state = dfa.next_state_unchecked(state, byte);
 112             if dfa.is_special_state(state) {
 113                 if dfa.is_dead_state(state) {
 114                     return Ok(last_match);
 115                 } else if dfa.is_quit_state(state) {
 116                     return Err(MatchError::Quit {
 117                         byte,
 118                         offset: offset(bytes, p),
 119                     });
 120                 }
 121                 last_match = Some(offset(bytes, p) + dfa.match_offset());
 122                 if earliest {
 123                     return Ok(last_match);
 124                 }
 125             }
 126         }
 127     }
 128     */
 129     Ok(eof_rev(dfa, state, bytes, start)?.or(last_match))
 130 }
 131
 132 pub fn find_overlapping_fwd<A: Automaton + ?Sized>(
 133     dfa: &A,
 134     bytes: &[u8],
 135     mut start: usize,
 136     end: usize,
 137     caller_state: &mut State<A::ID>,
 138 ) -> Result<Option<usize>, MatchError> {
 139     assert!(start <= end);
 140     assert!(start <= bytes.len());
 141     assert!(end <= bytes.len());
 142
 143     let (mut state, mut last_match) = match caller_state.as_option() {
 144         None => init_fwd(dfa, bytes, start, end)?,
 145         Some(id) => {
 146             // This is a subtle but critical detail. If the caller provides a
 147             // non-None state ID, then it must be the case that the state ID
 148             // corresponds to one set by this function. The state ID therefore
 149             // corresponds to a match state, a dead state or some other state.
 150             // However, "some other" state _only_ occurs when the input has
 151             // been exhausted because the only way to stop before then is to
 152             // see a match or a dead/quit state.
 153             //
 154             // If the input is exhausted or if it's a dead state, then
 155             // incrementing the starting position has no relevance on
 156             // correctness, since the loop below will either not execute
 157             // at all or will immediately stop due to being in a dead state.
 158             // (Once in a dead state it is impossible to leave it.)
 159             //
 160             // Therefore, the only case we need to consider is when
 161             // caller_state is a match state. In this case, since our machines
 162             // support the ability to delay a match by a certain number of
 163             // bytes (to support look-around), it follows that we actually
 164             // consumed that many additional bytes on our previous search. When
 165             // the caller resumes their search to find subsequent matches, they
 166             // will use the ending location from the previous match as the next
 167             // starting point, which is `match_offset` bytes PRIOR to where
 168             // we scanned to on the previous search. Therefore, we need to
 169             // compensate by bumping `start` up by `match_offset` bytes.
 170             start += dfa.match_offset();
 171             // Since match_offset could be any arbitrary value and we use
 172             // `start` in pointer arithmetic below, we check that we are still
 173             // in bounds. Otherwise, we could materialize a pointer that is
 174             // more than one past the end point of `bytes`, which is UB.
 175             if start > end {
 176                 return Ok(None);
 177             }
 178             (id, None)
 179         }
 180     };
 181     if last_match.is_some() {
 182         caller_state.set(state);
 183         return Ok(last_match);
 184     }
 185
 186     let mut at = start;
 187     while at < end {
 188         let byte = bytes[at];
 189         state = dfa.next_state(state, byte);
 190         at += 1;
 191         if dfa.is_special_state(state) {
 192             caller_state.set(state);
 193             if dfa.is_dead_state(state) {
 194                 return Ok(None);
 195             } else if dfa.is_quit_state(state) {
 196                 return Err(MatchError::Quit { byte, offset: at - 1 });
 197             } else {
 198                 return Ok(Some(at - dfa.match_offset()));
 199             }
 200         }
 201     }
 202     /*
 203     // SAFETY: Other than the normal pointer arithmetic happening here, a
 204     // unique aspect of safety for this function is the fact that the caller
 205     // can provide the state that the search routine will start with. If this
 206     // state were invalid, it would be possible to incorrectly index the
 207     // transition table. We however prevent this from happening by guaranteeing
 208     // that State is valid. Namely, callers cannot mutate a State. All they can
 209     // do is create a "start" state or otherwise reuse a previously set state.
 210     // Since callers can't mutate a state, it follows that a previously set
 211     // state can only be retrieved by crate internal functions. Therefore, our
 212     // use of it is safe since this code will only ever set the provided state
 213     // to a valid state.
 214     unsafe {
 215         let mut p = bytes.as_ptr().add(start);
 216         while p < bytes[end..].as_ptr() {
 217             let byte = *p;
 218             state = dfa.next_state_unchecked(state, byte);
 219             p = p.add(1);
 220             if dfa.is_special_state(state) {
 221                 caller_state.set(state);
 222                 return if dfa.is_dead_state(state) {
 223                     Ok(None)
 224                 } else if dfa.is_quit_state(state) {
 225                     Err(MatchError::Quit { byte, offset: offset(bytes, p) - 1 })
 226                 } else {
 227                     Ok(Some(offset(bytes, p) - dfa.match_offset()))
 228                 };
 229             }
 230         }
 231     }
 232     */
 233
 234     let result = eof_fwd(dfa, bytes, end, &mut state);
 235     caller_state.set(state);
 236     result
 237 }
 238
 239 fn init_fwd<A: Automaton + ?Sized>(
 240     dfa: &A,
 241     bytes: &[u8],
 242     start: usize,
 243     end: usize,
 244 ) -> Result<(A::ID, Option<usize>), MatchError> {
 245     let state = dfa.start_state_forward(bytes, start, end);
 246     if dfa.is_match_state(state) {
 247         Ok((state, Some(start - dfa.match_offset())))
 248     } else {
 249         Ok((state, None))
 250     }
 251 }
 252
 253 fn init_rev<A: Automaton + ?Sized>(
 254     dfa: &A,
 255     bytes: &[u8],
 256     start: usize,
 257     end: usize,
 258 ) -> Result<(A::ID, Option<usize>), MatchError> {
 259     let state = dfa.start_state_reverse(bytes, start, end);
 260     if dfa.is_match_state(state) {
 261         Ok((state, Some(end + dfa.match_offset())))
 262     } else {
 263         Ok((state, None))
 264     }
 265 }
 266
 267 fn eof_fwd<A: Automaton + ?Sized>(
 268     dfa: &A,
 269     bytes: &[u8],
 270     end: usize,
 271     state: &mut A::ID,
 272 ) -> Result<Option<usize>, MatchError> {
 273     match bytes.get(end) {
 274         Some(&b) => {
 275             *state = dfa.next_state(*state, b);
 276             if dfa.is_match_state(*state) {
 277                 Ok(Some(end))
 278             } else {
 279                 Ok(None)
 280             }
 281         }
 282         None => {
 283             *state = dfa.next_eof_state(*state);
 284             if dfa.is_match_state(*state) {
 285                 Ok(Some(bytes.len()))
 286             } else {
 287                 Ok(None)
 288             }
 289         }
 290     }
 291 }
 292
 293 fn eof_rev<A: Automaton + ?Sized>(
 294     dfa: &A,
 295     state: A::ID,
 296     bytes: &[u8],
 297     start: usize,
 298 ) -> Result<Option<usize>, MatchError> {
 299     if start > 0 {
 300         if dfa.is_match_state(dfa.next_state(state, bytes[start - 1])) {
 301             Ok(Some(start))
 302         } else {
 303             Ok(None)
 304         }
 305     } else {
 306         if dfa.is_match_state(dfa.next_eof_state(state)) {
 307             Ok(Some(0))
 308         } else {
 309             Ok(None)
 310         }
 311     }
 312 }
 313
 314 /// Returns the distance between the given pointer and the start of `bytes`.
 315 /// This assumes that the given pointer points to somewhere in the `bytes`
 316 /// slice given.
 317 fn offset(bytes: &[u8], p: *const u8) -> usize {
 318     debug_assert!(bytes.as_ptr() <= p);
 319     debug_assert!(bytes[bytes.len()..].as_ptr() >= p);
 320     ((p as isize) - (bytes.as_ptr() as isize)) as usize
 321 }