use core::{arch::x86_64::*, cmp, mem::size_of};

const VECTOR_SIZE: usize = size_of::<__m128i>();
const VECTOR_ALIGN: usize = VECTOR_SIZE - 1;

// The number of bytes to loop at in one iteration of memchr/memrchr.
const LOOP_SIZE: usize = 4 * VECTOR_SIZE;

// The number of bytes to loop at in one iteration of memchr2/memrchr2 and
// memchr3/memrchr3. There was no observable difference between 64 and 32
// bytes in benchmarks. memchr3 in particular only gets a very slight speed up
// from the loop unrolling.
const LOOP_SIZE2: usize = 2 * VECTOR_SIZE;

#[target_feature(enable = "sse2")]
pub unsafe fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> {
    // What follows is a fast SSE2-only algorithm to detect the position of
    // `n1` in `haystack` if it exists. From what I know, this is the "classic"
    // algorithm. I believe it can be found in places like glibc and Go's
    // standard library. It appears to be well known and is elaborated on in
    // more detail here: https://gms.tf/stdfind-and-memchr-optimizations.html
    //
    // While this routine is very long, the basic idea is actually very simple
    // and can be expressed straight-forwardly in pseudo code:
    //
    //     needle = (n1 << 15) | (n1 << 14) | ... | (n1 << 1) | n1
    //     // Note: shift amount is in bytes
    //
    //     while i <= haystack.len() - 16:
    //       // A 16 byte vector. Each byte in chunk corresponds to a byte in
    //       // the haystack.
    //       chunk = haystack[i:i+16]
    //       // Compare bytes in needle with bytes in chunk. The result is a 16
    //       // byte chunk where each byte is 0xFF if the corresponding bytes
    //       // in needle and chunk were equal, or 0x00 otherwise.
    //       eqs = cmpeq(needle, chunk)
    //       // Return a 32 bit integer where the most significant 16 bits
    //       // are always 0 and the lower 16 bits correspond to whether the
    //       // most significant bit in the corresponding byte in `eqs` is set.
    //       // In other words, `mask as u16` has bit i set if and only if
    //       // needle[i] == chunk[i].
    //       mask = movemask(eqs)
    //
    //       // Mask is 0 if there is no match, and non-zero otherwise.
    //       if mask != 0:
    //         // trailing_zeros tells us the position of the least significant
    //         // bit that is set.
    //         return i + trailing_zeros(mask)
    //
    //     // haystack length may not be a multiple of 16, so search the rest.
    //     while i < haystack.len():
    //       if haystack[i] == n1:
    //         return i
    //
    //     // No match found.
    //     return NULL
    //
    // In fact, we could loosely translate the above code to Rust line-for-line
    // and it would be a pretty fast algorithm. But, we pull out all the stops
    // to go as fast as possible:
    //
    // 1. We use aligned loads. That is, we do some finagling to make sure our
    //    primary loop not only proceeds in increments of 16 bytes, but that
    //    the address of haystack's pointer that we dereference is aligned to
    //    16 bytes. 16 is a magic number here because it is the size of an
    //    SSE2 128-bit vector. (For the AVX2 algorithm, 32 is the magic
    //    number.) Therefore, to get aligned loads, our pointer's address must
    //    be evenly divisible by 16.
    // 2. Our primary loop proceeds 64 bytes at a time instead of 16. It's
    //    kind of like loop unrolling, but we combine the equality comparisons
    //    using a vector OR such that we only need to extract a single mask to
    //    determine whether a match exists or not. If so, then we do some
    //    book-keeping to determine the precise location but otherwise mush on.
    // 3. We use our "chunk" comparison routine in as many places as possible,
    //    even if it means using unaligned loads. In particular, if haystack
    //    starts with an unaligned address, then we do an unaligned load to
    //    search the first 16 bytes. We then start our primary loop at the
    //    smallest subsequent aligned address, which will actually overlap with
    //    previously searched bytes. But we're OK with that. We do a similar
    //    dance at the end of our primary loop. Finally, to avoid a
    //    byte-at-a-time loop at the end, we do a final 16 byte unaligned load
    //    that may overlap with a previous load. This is OK because it converts
    //    a loop into a small number of very fast vector instructions.
    //
    // The primary downside of this algorithm is that it's effectively
    // completely unsafe. Therefore, we have to be super careful to avoid
    // undefined behavior:
    //
    // 1. We use raw pointers everywhere. Not only does dereferencing a pointer
    //    require the pointer to be valid, but we actually can't even store the
    //    address of an invalid pointer (unless it's 1 past the end of
    //    haystack) without sacrificing performance.
    // 2. _mm_loadu_si128 is used when you don't care about alignment, and
    //    _mm_load_si128 is used when you do care. You cannot use the latter
    //    on unaligned pointers.
    // 3. We make liberal use of debug_assert! to check assumptions.
    // 4. We make a concerted effort to stick with pointers instead of indices.
    //    Indices are nicer because there's less to worry about with them (see
    //    above about pointer offsets), but I could not get the compiler to
    //    produce as good of code as what the below produces. In any case,
    //    pointers are what we really care about here, and alignment is
    //    expressed a bit more naturally with them.
    //
    // In general, most of the algorithms in this crate have a similar
    // structure to what you see below, so this comment applies fairly well
    // to all of them.
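    //
    // As a concrete (and purely hypothetical) illustration of the pseudo code
    // above: with needle byte b'z' and chunk b"abzabzabzabzabza", cmpeq yields
    // 0xFF at byte offsets 2, 5, 8, 11 and 14, movemask packs those high bits
    // into 0b0100_1001_0010_0100, and trailing_zeros of that mask is 2, the
    // offset of the first match within the chunk. Similarly, for the
    // alignment adjustment performed below: if start_ptr happened to be
    // 0x1003, then `start_ptr as usize & VECTOR_ALIGN` is 3, so the primary
    // loop starts at `start_ptr.add(16 - 3)`, i.e. the 16-byte aligned
    // address 0x1010.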

    let vn1 = _mm_set1_epi8(n1 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = start_ptr.add(haystack.len());
    let mut ptr = start_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr < end_ptr {
            if *ptr == n1 {
                return Some(sub(ptr, start_ptr));
            }
            ptr = ptr.offset(1);
        }
        return None;
    }

    if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) {
        return Some(i);
    }

    ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
    debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
    while loop_size == LOOP_SIZE && ptr <= end_ptr.sub(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        let a = _mm_load_si128(ptr as *const __m128i);
        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
        let c = _mm_load_si128(ptr.add(2 * VECTOR_SIZE) as *const __m128i);
        let d = _mm_load_si128(ptr.add(3 * VECTOR_SIZE) as *const __m128i);
        let eqa = _mm_cmpeq_epi8(vn1, a);
        let eqb = _mm_cmpeq_epi8(vn1, b);
        let eqc = _mm_cmpeq_epi8(vn1, c);
        let eqd = _mm_cmpeq_epi8(vn1, d);
        let or1 = _mm_or_si128(eqa, eqb);
        let or2 = _mm_or_si128(eqc, eqd);
        let or3 = _mm_or_si128(or1, or2);
        if _mm_movemask_epi8(or3) != 0 {
            let mut at = sub(ptr, start_ptr);
            let mask = _mm_movemask_epi8(eqa);
            if mask != 0 {
                return Some(at + forward_pos(mask));
            }

            at += VECTOR_SIZE;
            let mask = _mm_movemask_epi8(eqb);
            if mask != 0 {
                return Some(at + forward_pos(mask));
            }

            at += VECTOR_SIZE;
            let mask = _mm_movemask_epi8(eqc);
            if mask != 0 {
                return Some(at + forward_pos(mask));
            }

            at += VECTOR_SIZE;
            let mask = _mm_movemask_epi8(eqd);
            debug_assert!(mask != 0);
            return Some(at + forward_pos(mask));
        }
        ptr = ptr.add(loop_size);
    }
    while ptr <= end_ptr.sub(VECTOR_SIZE) {
        debug_assert!(sub(end_ptr, ptr) >= VECTOR_SIZE);

        if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) {
            return Some(i);
        }
        ptr = ptr.add(VECTOR_SIZE);
    }
    if ptr < end_ptr {
        debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
        ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
        debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);

        return forward_search1(start_ptr, end_ptr, ptr, vn1);
    }
    None
}

#[target_feature(enable = "sse2")]
pub unsafe fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
    let vn1 = _mm_set1_epi8(n1 as i8);
    let vn2 = _mm_set1_epi8(n2 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE2, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = start_ptr.add(haystack.len());
    let mut ptr = start_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr < end_ptr {
            if *ptr == n1 || *ptr == n2 {
                return Some(sub(ptr, start_ptr));
            }
            ptr = ptr.offset(1);
        }
        return None;
    }

    if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
        return Some(i);
    }

    ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
    debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
    while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        let a = _mm_load_si128(ptr as *const __m128i);
        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
        let eqa1 = _mm_cmpeq_epi8(vn1, a);
        let eqb1 = _mm_cmpeq_epi8(vn1, b);
        let eqa2 = _mm_cmpeq_epi8(vn2, a);
        let eqb2 = _mm_cmpeq_epi8(vn2, b);
        let or1 = _mm_or_si128(eqa1, eqb1);
        let or2 = _mm_or_si128(eqa2, eqb2);
        let or3 = _mm_or_si128(or1, or2);
        if _mm_movemask_epi8(or3) != 0 {
            let mut at = sub(ptr, start_ptr);
            let mask1 = _mm_movemask_epi8(eqa1);
            let mask2 = _mm_movemask_epi8(eqa2);
            if mask1 != 0 || mask2 != 0 {
                return Some(at + forward_pos2(mask1, mask2));
            }

            at += VECTOR_SIZE;
            let mask1 = _mm_movemask_epi8(eqb1);
            let mask2 = _mm_movemask_epi8(eqb2);
            return Some(at + forward_pos2(mask1, mask2));
        }
        ptr = ptr.add(loop_size);
    }
    while ptr <= end_ptr.sub(VECTOR_SIZE) {
        if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
            return Some(i);
        }
        ptr = ptr.add(VECTOR_SIZE);
    }
    if ptr < end_ptr {
        debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
        ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
        debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);

        return forward_search2(start_ptr, end_ptr, ptr, vn1, vn2);
    }
    None
}

#[target_feature(enable = "sse2")]
pub unsafe fn memchr3(
    n1: u8,
    n2: u8,
    n3: u8,
    haystack: &[u8],
) -> Option<usize> {
    let vn1 = _mm_set1_epi8(n1 as i8);
    let vn2 = _mm_set1_epi8(n2 as i8);
    let vn3 = _mm_set1_epi8(n3 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE2, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = start_ptr.add(haystack.len());
    let mut ptr = start_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr < end_ptr {
            if *ptr == n1 || *ptr == n2 || *ptr == n3 {
                return Some(sub(ptr, start_ptr));
            }
            ptr = ptr.offset(1);
        }
        return None;
    }

    if let Some(i) = forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
        return Some(i);
    }

    ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
    debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
    while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        let a = _mm_load_si128(ptr as *const __m128i);
        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
        let eqa1 = _mm_cmpeq_epi8(vn1, a);
        let eqb1 = _mm_cmpeq_epi8(vn1, b);
        let eqa2 = _mm_cmpeq_epi8(vn2, a);
        let eqb2 = _mm_cmpeq_epi8(vn2, b);
        let eqa3 = _mm_cmpeq_epi8(vn3, a);
        let eqb3 = _mm_cmpeq_epi8(vn3, b);
        let or1 = _mm_or_si128(eqa1, eqb1);
        let or2 = _mm_or_si128(eqa2, eqb2);
        let or3 = _mm_or_si128(eqa3, eqb3);
        let or4 = _mm_or_si128(or1, or2);
        let or5 = _mm_or_si128(or3, or4);
        if _mm_movemask_epi8(or5) != 0 {
            let mut at = sub(ptr, start_ptr);
            let mask1 = _mm_movemask_epi8(eqa1);
            let mask2 = _mm_movemask_epi8(eqa2);
            let mask3 = _mm_movemask_epi8(eqa3);
            if mask1 != 0 || mask2 != 0 || mask3 != 0 {
                return Some(at + forward_pos3(mask1, mask2, mask3));
            }

            at += VECTOR_SIZE;
            let mask1 = _mm_movemask_epi8(eqb1);
            let mask2 = _mm_movemask_epi8(eqb2);
            let mask3 = _mm_movemask_epi8(eqb3);
            return Some(at + forward_pos3(mask1, mask2, mask3));
        }
        ptr = ptr.add(loop_size);
    }
    while ptr <= end_ptr.sub(VECTOR_SIZE) {
        if let Some(i) =
            forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3)
        {
            return Some(i);
        }
        ptr = ptr.add(VECTOR_SIZE);
    }
    if ptr < end_ptr {
        debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
        ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
        debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);

        return forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3);
    }
    None
}

#[target_feature(enable = "sse2")]
pub unsafe fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> {
    let vn1 = _mm_set1_epi8(n1 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = start_ptr.add(haystack.len());
    let mut ptr = end_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr > start_ptr {
            ptr = ptr.offset(-1);
            if *ptr == n1 {
                return Some(sub(ptr, start_ptr));
            }
        }
        return None;
    }

    ptr = ptr.sub(VECTOR_SIZE);
    if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) {
        return Some(i);
    }

    ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
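    // A hypothetical illustration of the rounding above: if end_ptr were the
    // address 0x2009, then `!VECTOR_ALIGN` is ...fff0 and
    // `end_ptr as usize & !VECTOR_ALIGN` is 0x2000, the largest 16-byte
    // aligned address that does not exceed end_ptr.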
    debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
    while loop_size == LOOP_SIZE && ptr >= start_ptr.add(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        ptr = ptr.sub(loop_size);
        let a = _mm_load_si128(ptr as *const __m128i);
        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
        let c = _mm_load_si128(ptr.add(2 * VECTOR_SIZE) as *const __m128i);
        let d = _mm_load_si128(ptr.add(3 * VECTOR_SIZE) as *const __m128i);
        let eqa = _mm_cmpeq_epi8(vn1, a);
        let eqb = _mm_cmpeq_epi8(vn1, b);
        let eqc = _mm_cmpeq_epi8(vn1, c);
        let eqd = _mm_cmpeq_epi8(vn1, d);
        let or1 = _mm_or_si128(eqa, eqb);
        let or2 = _mm_or_si128(eqc, eqd);
        let or3 = _mm_or_si128(or1, or2);
        if _mm_movemask_epi8(or3) != 0 {
            let mut at = sub(ptr.add(3 * VECTOR_SIZE), start_ptr);
            let mask = _mm_movemask_epi8(eqd);
            if mask != 0 {
                return Some(at + reverse_pos(mask));
            }

            at -= VECTOR_SIZE;
            let mask = _mm_movemask_epi8(eqc);
            if mask != 0 {
                return Some(at + reverse_pos(mask));
            }

            at -= VECTOR_SIZE;
            let mask = _mm_movemask_epi8(eqb);
            if mask != 0 {
                return Some(at + reverse_pos(mask));
            }

            at -= VECTOR_SIZE;
            let mask = _mm_movemask_epi8(eqa);
            debug_assert!(mask != 0);
            return Some(at + reverse_pos(mask));
        }
    }
    while ptr >= start_ptr.add(VECTOR_SIZE) {
        ptr = ptr.sub(VECTOR_SIZE);
        if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) {
            return Some(i);
        }
    }
    if ptr > start_ptr {
        debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
        return reverse_search1(start_ptr, end_ptr, start_ptr, vn1);
    }
    None
}

#[target_feature(enable = "sse2")]
pub unsafe fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
    let vn1 = _mm_set1_epi8(n1 as i8);
    let vn2 = _mm_set1_epi8(n2 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE2, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = start_ptr.add(haystack.len());
    let mut ptr = end_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr > start_ptr {
            ptr = ptr.offset(-1);
            if *ptr == n1 || *ptr == n2 {
                return Some(sub(ptr, start_ptr));
            }
        }
        return None;
    }

    ptr = ptr.sub(VECTOR_SIZE);
    if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
        return Some(i);
    }

    ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
    debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
    while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        ptr = ptr.sub(loop_size);
        let a = _mm_load_si128(ptr as *const __m128i);
        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
        let eqa1 = _mm_cmpeq_epi8(vn1, a);
        let eqb1 = _mm_cmpeq_epi8(vn1, b);
        let eqa2 = _mm_cmpeq_epi8(vn2, a);
        let eqb2 = _mm_cmpeq_epi8(vn2, b);
        let or1 = _mm_or_si128(eqa1, eqb1);
        let or2 = _mm_or_si128(eqa2, eqb2);
        let or3 = _mm_or_si128(or1, or2);
        if _mm_movemask_epi8(or3) != 0 {
            let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr);
            let mask1 = _mm_movemask_epi8(eqb1);
            let mask2 = _mm_movemask_epi8(eqb2);
            if mask1 != 0 || mask2 != 0 {
                return Some(at + reverse_pos2(mask1, mask2));
            }

            at -= VECTOR_SIZE;
            let mask1 = _mm_movemask_epi8(eqa1);
            let mask2 = _mm_movemask_epi8(eqa2);
            return Some(at + reverse_pos2(mask1, mask2));
        }
    }
    while ptr >= start_ptr.add(VECTOR_SIZE) {
        ptr = ptr.sub(VECTOR_SIZE);
        if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
            return Some(i);
        }
    }
    if ptr > start_ptr {
        debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
        return reverse_search2(start_ptr, end_ptr, start_ptr, vn1, vn2);
    }
    None
}

#[target_feature(enable = "sse2")]
pub unsafe fn memrchr3(
    n1: u8,
    n2: u8,
    n3: u8,
    haystack: &[u8],
) -> Option<usize> {
    let vn1 = _mm_set1_epi8(n1 as i8);
    let vn2 = _mm_set1_epi8(n2 as i8);
    let vn3 = _mm_set1_epi8(n3 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE2, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = start_ptr.add(haystack.len());
    let mut ptr = end_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr > start_ptr {
            ptr = ptr.offset(-1);
            if *ptr == n1 || *ptr == n2 || *ptr == n3 {
                return Some(sub(ptr, start_ptr));
            }
        }
        return None;
    }

    ptr = ptr.sub(VECTOR_SIZE);
    if let Some(i) = reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
        return Some(i);
    }

    ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
    debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
    while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        ptr = ptr.sub(loop_size);
        let a = _mm_load_si128(ptr as *const __m128i);
        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
        let eqa1 = _mm_cmpeq_epi8(vn1, a);
        let eqb1 = _mm_cmpeq_epi8(vn1, b);
        let eqa2 = _mm_cmpeq_epi8(vn2, a);
        let eqb2 = _mm_cmpeq_epi8(vn2, b);
        let eqa3 = _mm_cmpeq_epi8(vn3, a);
        let eqb3 = _mm_cmpeq_epi8(vn3, b);
        let or1 = _mm_or_si128(eqa1, eqb1);
        let or2 = _mm_or_si128(eqa2, eqb2);
        let or3 = _mm_or_si128(eqa3, eqb3);
        let or4 = _mm_or_si128(or1, or2);
        let or5 = _mm_or_si128(or3, or4);
        if _mm_movemask_epi8(or5) != 0 {
            let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr);
            let mask1 = _mm_movemask_epi8(eqb1);
            let mask2 = _mm_movemask_epi8(eqb2);
            let mask3 = _mm_movemask_epi8(eqb3);
            if mask1 != 0 || mask2 != 0 || mask3 != 0 {
                return Some(at + reverse_pos3(mask1, mask2, mask3));
            }

            at -= VECTOR_SIZE;
            let mask1 = _mm_movemask_epi8(eqa1);
            let mask2 = _mm_movemask_epi8(eqa2);
            let mask3 = _mm_movemask_epi8(eqa3);
            return Some(at + reverse_pos3(mask1, mask2, mask3));
        }
    }
    while ptr >= start_ptr.add(VECTOR_SIZE) {
        ptr = ptr.sub(VECTOR_SIZE);
        if let Some(i) =
            reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3)
        {
            return Some(i);
        }
    }
    if ptr > start_ptr {
        debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
        return reverse_search3(start_ptr, end_ptr, start_ptr, vn1, vn2, vn3);
    }
    None
}

#[target_feature(enable = "sse2")]
pub unsafe fn forward_search1(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m128i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm_loadu_si128(ptr as *const __m128i);
    let mask = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, vn1));
    if mask != 0 {
        Some(sub(ptr, start_ptr) + forward_pos(mask))
    } else {
        None
    }
}

#[target_feature(enable = "sse2")]
unsafe fn forward_search2(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m128i,
    vn2: __m128i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm_loadu_si128(ptr as *const __m128i);
    let eq1 = _mm_cmpeq_epi8(chunk, vn1);
    let eq2 = _mm_cmpeq_epi8(chunk, vn2);
    if _mm_movemask_epi8(_mm_or_si128(eq1, eq2)) != 0 {
        let mask1 = _mm_movemask_epi8(eq1);
        let mask2 = _mm_movemask_epi8(eq2);
        Some(sub(ptr, start_ptr) + forward_pos2(mask1, mask2))
    } else {
        None
    }
}

#[target_feature(enable = "sse2")]
pub unsafe fn forward_search3(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m128i,
    vn2: __m128i,
    vn3: __m128i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm_loadu_si128(ptr as *const __m128i);
    let eq1 = _mm_cmpeq_epi8(chunk, vn1);
    let eq2 = _mm_cmpeq_epi8(chunk, vn2);
    let eq3 = _mm_cmpeq_epi8(chunk, vn3);
    let or = _mm_or_si128(eq1, eq2);
    if _mm_movemask_epi8(_mm_or_si128(or, eq3)) != 0 {
        let mask1 = _mm_movemask_epi8(eq1);
        let mask2 = _mm_movemask_epi8(eq2);
        let mask3 = _mm_movemask_epi8(eq3);
        Some(sub(ptr, start_ptr) + forward_pos3(mask1, mask2, mask3))
    } else {
        None
    }
}

#[target_feature(enable = "sse2")]
unsafe fn reverse_search1(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m128i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm_loadu_si128(ptr as *const __m128i);
    let mask = _mm_movemask_epi8(_mm_cmpeq_epi8(vn1, chunk));
    if mask != 0 {
        Some(sub(ptr, start_ptr) + reverse_pos(mask))
    } else {
        None
    }
}

#[target_feature(enable = "sse2")]
unsafe fn reverse_search2(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m128i,
    vn2: __m128i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm_loadu_si128(ptr as *const __m128i);
    let eq1 = _mm_cmpeq_epi8(chunk, vn1);
    let eq2 = _mm_cmpeq_epi8(chunk, vn2);
    if _mm_movemask_epi8(_mm_or_si128(eq1, eq2)) != 0 {
        let mask1 = _mm_movemask_epi8(eq1);
        let mask2 = _mm_movemask_epi8(eq2);
        Some(sub(ptr, start_ptr) + reverse_pos2(mask1, mask2))
    } else {
        None
    }
}

#[target_feature(enable = "sse2")]
unsafe fn reverse_search3(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m128i,
    vn2: __m128i,
    vn3: __m128i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm_loadu_si128(ptr as *const __m128i);
    let eq1 = _mm_cmpeq_epi8(chunk, vn1);
    let eq2 = _mm_cmpeq_epi8(chunk, vn2);
    let eq3 = _mm_cmpeq_epi8(chunk, vn3);
    let or = _mm_or_si128(eq1, eq2);
    if _mm_movemask_epi8(_mm_or_si128(or, eq3)) != 0 {
        let mask1 = _mm_movemask_epi8(eq1);
        let mask2 = _mm_movemask_epi8(eq2);
        let mask3 = _mm_movemask_epi8(eq3);
        Some(sub(ptr, start_ptr) + reverse_pos3(mask1, mask2, mask3))
    } else {
        None
    }
}

/// Compute the position of the first matching byte from the given mask. The
/// position returned is always in the range [0, 15].
///
/// The mask given is expected to be the result of _mm_movemask_epi8.
fn forward_pos(mask: i32) -> usize {
    // We are dealing with little endian here, where the most significant byte
    // is at a higher address. That means the least significant bit that is
    // set corresponds to the position of our first matching byte. That
    // position corresponds to the number of zeros after the least significant
    // bit.
    mask.trailing_zeros() as usize
}

/// Compute the position of the first matching byte from the given masks. The
/// position returned is always in the range [0, 15]. Each mask corresponds to
/// the equality comparison of a single byte.
///
/// The masks given are expected to be the result of _mm_movemask_epi8, where
/// at least one of the masks is non-zero (i.e., indicates a match).
fn forward_pos2(mask1: i32, mask2: i32) -> usize {
    debug_assert!(mask1 != 0 || mask2 != 0);

    forward_pos(mask1 | mask2)
}

/// Compute the position of the first matching byte from the given masks. The
/// position returned is always in the range [0, 15]. Each mask corresponds to
/// the equality comparison of a single byte.
///
/// The masks given are expected to be the result of _mm_movemask_epi8, where
/// at least one of the masks is non-zero (i.e., indicates a match).
fn forward_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize {
    debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0);

    forward_pos(mask1 | mask2 | mask3)
}

/// Compute the position of the last matching byte from the given mask. The
/// position returned is always in the range [0, 15].
///
/// The mask given is expected to be the result of _mm_movemask_epi8.
fn reverse_pos(mask: i32) -> usize {
    // We are dealing with little endian here, where the most significant byte
    // is at a higher address. That means the most significant bit that is set
    // corresponds to the position of our last matching byte. The position from
    // the end of the mask is therefore the number of leading zeros in a 16
    // bit integer, and the position from the start of the mask is therefore
    // 16 - (leading zeros) - 1.
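    //
    // For example (a hypothetical mask, not taken from the original source):
    // if matches occurred at byte offsets 2 and 5, the mask is
    // 0b0000_0000_0010_0100, its 16-bit leading zero count is 10, and the
    // expression below yields 16 - 10 - 1 = 5, the offset of the last
    // matching byte.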
    VECTOR_SIZE - (mask as u16).leading_zeros() as usize - 1
}

/// Compute the position of the last matching byte from the given masks. The
/// position returned is always in the range [0, 15]. Each mask corresponds to
/// the equality comparison of a single byte.
///
/// The masks given are expected to be the result of _mm_movemask_epi8, where
/// at least one of the masks is non-zero (i.e., indicates a match).
fn reverse_pos2(mask1: i32, mask2: i32) -> usize {
    debug_assert!(mask1 != 0 || mask2 != 0);

    reverse_pos(mask1 | mask2)
}

/// Compute the position of the last matching byte from the given masks. The
/// position returned is always in the range [0, 15]. Each mask corresponds to
/// the equality comparison of a single byte.
///
/// The masks given are expected to be the result of _mm_movemask_epi8, where
/// at least one of the masks is non-zero (i.e., indicates a match).
fn reverse_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize {
    debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0);

    reverse_pos(mask1 | mask2 | mask3)
}

/// Subtract `b` from `a` and return the difference. `a` should be greater
/// than or equal to `b`.
fn sub(a: *const u8, b: *const u8) -> usize {
    debug_assert!(a >= b);
    (a as usize) - (b as usize)
}
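
// A minimal usage sketch (hypothetical test code, not part of the original
// module): these routines are `unsafe` `target_feature` functions, so the
// caller is expected to verify SSE2 support at runtime before invoking them.
// This sketch assumes `std` is available in the test build for the
// feature-detection macro.
#[cfg(test)]
mod example_usage {
    #[test]
    fn memchr_finds_first_and_last() {
        if is_x86_feature_detected!("sse2") {
            let haystack = b"the quick brown fox jumps over the lazy dog";
            // Safety: SSE2 availability was checked just above.
            unsafe {
                assert_eq!(super::memchr(b'q', haystack), Some(4));
                assert_eq!(super::memrchr(b'o', haystack), Some(41));
                assert_eq!(super::memchr(b'!', haystack), None);
            }
        }
    }
}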