use core::{arch::x86_64::*, cmp, mem::size_of};

const VECTOR_SIZE: usize = size_of::<__m128i>();
const VECTOR_ALIGN: usize = VECTOR_SIZE - 1;

// The number of bytes to loop at in one iteration of memchr/memrchr.
const LOOP_SIZE: usize = 4 * VECTOR_SIZE;

// The number of bytes to loop at in one iteration of memchr2/memrchr2 and
// memchr3/memrchr3. There was no observable difference between 64 and 32
// bytes in benchmarks. memchr3 in particular only gets a very slight speed up
// from the loop unrolling.
const LOOP_SIZE2: usize = 2 * VECTOR_SIZE;

#[target_feature(enable = "sse2")]
pub unsafe fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> {
    // What follows is a fast SSE2-only algorithm to detect the position of
    // `n1` in `haystack` if it exists. From what I know, this is the "classic"
    // algorithm. I believe it can be found in places like glibc and Go's
    // standard library. It appears to be well known and is elaborated on in
    // more detail here: https://gms.tf/stdfind-and-memchr-optimizations.html
    //
    // While this routine is very long, the basic idea is actually very simple
    // and can be expressed straight-forwardly in pseudo code:
    //
    //     needle = (n1 << 15) | (n1 << 14) | ... | (n1 << 1) | n1
    //     // Note: shift amount is in bytes
    //
    //     while i <= haystack.len() - 16:
    //       // A 16 byte vector. Each byte in chunk corresponds to a byte in
    //       // the haystack.
    //       chunk = haystack[i:i+16]
    //       // Compare bytes in needle with bytes in chunk. The result is a 16
    //       // byte chunk where each byte is 0xFF if the corresponding bytes
    //       // in needle and chunk were equal, or 0x00 otherwise.
    //       eqs = cmpeq(needle, chunk)
    //       // Return a 32 bit integer where the most significant 16 bits
    //       // are always 0 and the lower 16 bits correspond to whether the
    //       // most significant bit in the corresponding byte in `eqs` is set.
    //       // In other words, `mask as u16` has bit i set if and only if
    //       // needle[i] == chunk[i].
    //       mask = movemask(eqs)
    //
    //       // Mask is 0 if there is no match, and non-zero otherwise.
    //       if mask != 0:
    //         // trailing_zeros tells us the position of the least significant
    //         // bit that is set.
    //         return i + trailing_zeros(mask)
    //
    //     // haystack length may not be a multiple of 16, so search the rest.
    //     while i < haystack.len():
    //       if haystack[i] == n1:
    //         return i
    //
    //     // No match found.
    //     return NULL
    //
    // In fact, we could loosely translate the above code to Rust line-for-line
    // and it would be a pretty fast algorithm. But, we pull out all the stops
    // to go as fast as possible:
    //
    // 1. We use aligned loads. That is, we do some finagling to make sure our
    //    primary loop not only proceeds in increments of 16 bytes, but that
    //    the address of haystack's pointer that we dereference is aligned to
    //    16 bytes. 16 is a magic number here because it is the size of an
    //    SSE2 128-bit vector. (For the AVX2 algorithm, 32 is the magic
    //    number.) Therefore, to get aligned loads, our pointer's address must
    //    be evenly divisible by 16.
    // 2. Our primary loop proceeds 64 bytes at a time instead of 16. It's
    //    kind of like loop unrolling, but we combine the equality comparisons
    //    using a vector OR such that we only need to extract a single mask to
    //    determine whether a match exists or not. If so, then we do some
    //    book-keeping to determine the precise location but otherwise mush on.
    // 3. We use our "chunk" comparison routine in as many places as possible,
    //    even if it means using unaligned loads. In particular, if haystack
    //    starts with an unaligned address, then we do an unaligned load to
    //    search the first 16 bytes. We then start our primary loop at the
    //    smallest subsequent aligned address, which will actually overlap with
    //    previously searched bytes. But we're OK with that. We do a similar
    //    dance at the end of our primary loop. Finally, to avoid a
    //    byte-at-a-time loop at the end, we do a final 16 byte unaligned load
    //    that may overlap with a previous load. This is OK because it converts
    //    a loop into a small number of very fast vector instructions.
    //
    // The primary downside of this algorithm is that it's effectively
    // completely unsafe. Therefore, we have to be super careful to avoid
    // undefined behavior:
    //
    // 1. We use raw pointers everywhere. Not only does dereferencing a pointer
    //    require the pointer to be valid, but we actually can't even store the
    //    address of an invalid pointer (unless it's 1 past the end of
    //    haystack) without sacrificing performance.
    // 2. _mm_loadu_si128 is used when you don't care about alignment, and
    //    _mm_load_si128 is used when you do care. You cannot use the latter
    //    on unaligned pointers.
    // 3. We make liberal use of debug_assert! to check assumptions.
    // 4. We make a concerted effort to stick with pointers instead of indices.
    //    Indices are nicer because there's less to worry about with them (see
    //    above about pointer offsets), but I could not get the compiler to
    //    produce as good of code as what the below produces. In any case,
    //    pointers are what we really care about here, and alignment is
    //    expressed a bit more naturally with them.
    //
    // In general, most of the algorithms in this crate have a similar
    // structure to what you see below, so this comment applies fairly well
    // to all of them.
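    //
    // As a concrete (and purely hypothetical) illustration of the pseudo code
    // above: with needle byte b'z' and chunk b"abzabzabzabzabza", cmpeq yields
    // 0xFF at byte offsets 2, 5, 8, 11 and 14, movemask packs those high bits
    // into 0b0100_1001_0010_0100, and trailing_zeros of that mask is 2, the
    // offset of the first match within the chunk. Similarly, for the
    // alignment adjustment performed below: if start_ptr happened to be
    // 0x1003, then `start_ptr as usize & VECTOR_ALIGN` is 3, so the primary
    // loop starts at `start_ptr.add(16 - 3)`, i.e. the 16-byte aligned
    // address 0x1010.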

    let vn1 = _mm_set1_epi8(n1 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = start_ptr.add(haystack.len());
    let mut ptr = start_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr < end_ptr {
            if *ptr == n1 {
                return Some(sub(ptr, start_ptr));
            }
            ptr = ptr.offset(1);
        }
        return None;
    }

    if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) {
        return Some(i);
    }

    ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
    debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
    while loop_size == LOOP_SIZE && ptr <= end_ptr.sub(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        let a = _mm_load_si128(ptr as *const __m128i);
        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
        let c = _mm_load_si128(ptr.add(2 * VECTOR_SIZE) as *const __m128i);
        let d = _mm_load_si128(ptr.add(3 * VECTOR_SIZE) as *const __m128i);
        let eqa = _mm_cmpeq_epi8(vn1, a);
        let eqb = _mm_cmpeq_epi8(vn1, b);
        let eqc = _mm_cmpeq_epi8(vn1, c);
        let eqd = _mm_cmpeq_epi8(vn1, d);
        let or1 = _mm_or_si128(eqa, eqb);
        let or2 = _mm_or_si128(eqc, eqd);
        let or3 = _mm_or_si128(or1, or2);
        if _mm_movemask_epi8(or3) != 0 {
            let mut at = sub(ptr, start_ptr);
            let mask = _mm_movemask_epi8(eqa);
            if mask != 0 {
                return Some(at + forward_pos(mask));
            }

            at += VECTOR_SIZE;
            let mask = _mm_movemask_epi8(eqb);
            if mask != 0 {
                return Some(at + forward_pos(mask));
            }

            at += VECTOR_SIZE;
            let mask = _mm_movemask_epi8(eqc);
            if mask != 0 {
                return Some(at + forward_pos(mask));
            }

            at += VECTOR_SIZE;
            let mask = _mm_movemask_epi8(eqd);
            debug_assert!(mask != 0);
            return Some(at + forward_pos(mask));
        }
        ptr = ptr.add(loop_size);
    }
    while ptr <= end_ptr.sub(VECTOR_SIZE) {
        debug_assert!(sub(end_ptr, ptr) >= VECTOR_SIZE);

        if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) {
            return Some(i);
        }
        ptr = ptr.add(VECTOR_SIZE);
    }
    if ptr < end_ptr {
        debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
        ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
        debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);

        return forward_search1(start_ptr, end_ptr, ptr, vn1);
    }
    None
}

#[target_feature(enable = "sse2")]
pub unsafe fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
    let vn1 = _mm_set1_epi8(n1 as i8);
    let vn2 = _mm_set1_epi8(n2 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE2, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = start_ptr.add(haystack.len());
    let mut ptr = start_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr < end_ptr {
            if *ptr == n1 || *ptr == n2 {
                return Some(sub(ptr, start_ptr));
            }
            ptr = ptr.offset(1);
        }
        return None;
    }

    if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
        return Some(i);
    }

    ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
    debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
    while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        let a = _mm_load_si128(ptr as *const __m128i);
        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
        let eqa1 = _mm_cmpeq_epi8(vn1, a);
        let eqb1 = _mm_cmpeq_epi8(vn1, b);
        let eqa2 = _mm_cmpeq_epi8(vn2, a);
        let eqb2 = _mm_cmpeq_epi8(vn2, b);
        let or1 = _mm_or_si128(eqa1, eqb1);
        let or2 = _mm_or_si128(eqa2, eqb2);
        let or3 = _mm_or_si128(or1, or2);
        if _mm_movemask_epi8(or3) != 0 {
            let mut at = sub(ptr, start_ptr);
            let mask1 = _mm_movemask_epi8(eqa1);
            let mask2 = _mm_movemask_epi8(eqa2);
            if mask1 != 0 || mask2 != 0 {
                return Some(at + forward_pos2(mask1, mask2));
            }

            at += VECTOR_SIZE;
            let mask1 = _mm_movemask_epi8(eqb1);
            let mask2 = _mm_movemask_epi8(eqb2);
            return Some(at + forward_pos2(mask1, mask2));
        }
        ptr = ptr.add(loop_size);
    }
    while ptr <= end_ptr.sub(VECTOR_SIZE) {
        if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
            return Some(i);
        }
        ptr = ptr.add(VECTOR_SIZE);
    }
    if ptr < end_ptr {
        debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
        ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
        debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);

        return forward_search2(start_ptr, end_ptr, ptr, vn1, vn2);
    }
    None
}

#[target_feature(enable = "sse2")]
pub unsafe fn memchr3(
    n1: u8,
    n2: u8,
    n3: u8,
    haystack: &[u8],
) -> Option<usize> {
    let vn1 = _mm_set1_epi8(n1 as i8);
    let vn2 = _mm_set1_epi8(n2 as i8);
    let vn3 = _mm_set1_epi8(n3 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE2, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = start_ptr.add(haystack.len());
    let mut ptr = start_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr < end_ptr {
            if *ptr == n1 || *ptr == n2 || *ptr == n3 {
                return Some(sub(ptr, start_ptr));
            }
            ptr = ptr.offset(1);
        }
        return None;
    }

    if let Some(i) = forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
        return Some(i);
    }

    ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
    debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr);
    while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        let a = _mm_load_si128(ptr as *const __m128i);
        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
        let eqa1 = _mm_cmpeq_epi8(vn1, a);
        let eqb1 = _mm_cmpeq_epi8(vn1, b);
        let eqa2 = _mm_cmpeq_epi8(vn2, a);
        let eqb2 = _mm_cmpeq_epi8(vn2, b);
        let eqa3 = _mm_cmpeq_epi8(vn3, a);
        let eqb3 = _mm_cmpeq_epi8(vn3, b);
        let or1 = _mm_or_si128(eqa1, eqb1);
        let or2 = _mm_or_si128(eqa2, eqb2);
        let or3 = _mm_or_si128(eqa3, eqb3);
        let or4 = _mm_or_si128(or1, or2);
        let or5 = _mm_or_si128(or3, or4);
        if _mm_movemask_epi8(or5) != 0 {
            let mut at = sub(ptr, start_ptr);
            let mask1 = _mm_movemask_epi8(eqa1);
            let mask2 = _mm_movemask_epi8(eqa2);
            let mask3 = _mm_movemask_epi8(eqa3);
            if mask1 != 0 || mask2 != 0 || mask3 != 0 {
                return Some(at + forward_pos3(mask1, mask2, mask3));
            }

            at += VECTOR_SIZE;
            let mask1 = _mm_movemask_epi8(eqb1);
            let mask2 = _mm_movemask_epi8(eqb2);
            let mask3 = _mm_movemask_epi8(eqb3);
            return Some(at + forward_pos3(mask1, mask2, mask3));
        }
        ptr = ptr.add(loop_size);
    }
    while ptr <= end_ptr.sub(VECTOR_SIZE) {
        if let Some(i) =
            forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3)
        {
            return Some(i);
        }
        ptr = ptr.add(VECTOR_SIZE);
    }
    if ptr < end_ptr {
        debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE);
        ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr));
        debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE);

        return forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3);
    }
    None
}

#[target_feature(enable = "sse2")]
pub unsafe fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> {
    let vn1 = _mm_set1_epi8(n1 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = start_ptr.add(haystack.len());
    let mut ptr = end_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr > start_ptr {
            ptr = ptr.offset(-1);
            if *ptr == n1 {
                return Some(sub(ptr, start_ptr));
            }
        }
        return None;
    }

    ptr = ptr.sub(VECTOR_SIZE);
    if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) {
        return Some(i);
    }

    ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
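    // A hypothetical illustration of the rounding above: if end_ptr were the
    // address 0x2009, then `!VECTOR_ALIGN` is ...fff0 and
    // `end_ptr as usize & !VECTOR_ALIGN` is 0x2000, the largest 16-byte
    // aligned address that does not exceed end_ptr.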
    debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
    while loop_size == LOOP_SIZE && ptr >= start_ptr.add(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        ptr = ptr.sub(loop_size);
        let a = _mm_load_si128(ptr as *const __m128i);
        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
        let c = _mm_load_si128(ptr.add(2 * VECTOR_SIZE) as *const __m128i);
        let d = _mm_load_si128(ptr.add(3 * VECTOR_SIZE) as *const __m128i);
        let eqa = _mm_cmpeq_epi8(vn1, a);
        let eqb = _mm_cmpeq_epi8(vn1, b);
        let eqc = _mm_cmpeq_epi8(vn1, c);
        let eqd = _mm_cmpeq_epi8(vn1, d);
        let or1 = _mm_or_si128(eqa, eqb);
        let or2 = _mm_or_si128(eqc, eqd);
        let or3 = _mm_or_si128(or1, or2);
        if _mm_movemask_epi8(or3) != 0 {
            let mut at = sub(ptr.add(3 * VECTOR_SIZE), start_ptr);
            let mask = _mm_movemask_epi8(eqd);
            if mask != 0 {
                return Some(at + reverse_pos(mask));
            }

            at -= VECTOR_SIZE;
            let mask = _mm_movemask_epi8(eqc);
            if mask != 0 {
                return Some(at + reverse_pos(mask));
            }

            at -= VECTOR_SIZE;
            let mask = _mm_movemask_epi8(eqb);
            if mask != 0 {
                return Some(at + reverse_pos(mask));
            }

            at -= VECTOR_SIZE;
            let mask = _mm_movemask_epi8(eqa);
            debug_assert!(mask != 0);
            return Some(at + reverse_pos(mask));
        }
    }
    while ptr >= start_ptr.add(VECTOR_SIZE) {
        ptr = ptr.sub(VECTOR_SIZE);
        if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) {
            return Some(i);
        }
    }
    if ptr > start_ptr {
        debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
        return reverse_search1(start_ptr, end_ptr, start_ptr, vn1);
    }
    None
}

#[target_feature(enable = "sse2")]
pub unsafe fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
    let vn1 = _mm_set1_epi8(n1 as i8);
    let vn2 = _mm_set1_epi8(n2 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE2, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = start_ptr.add(haystack.len());
    let mut ptr = end_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr > start_ptr {
            ptr = ptr.offset(-1);
            if *ptr == n1 || *ptr == n2 {
                return Some(sub(ptr, start_ptr));
            }
        }
        return None;
    }

    ptr = ptr.sub(VECTOR_SIZE);
    if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
        return Some(i);
    }

    ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
    debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
    while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        ptr = ptr.sub(loop_size);
        let a = _mm_load_si128(ptr as *const __m128i);
        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
        let eqa1 = _mm_cmpeq_epi8(vn1, a);
        let eqb1 = _mm_cmpeq_epi8(vn1, b);
        let eqa2 = _mm_cmpeq_epi8(vn2, a);
        let eqb2 = _mm_cmpeq_epi8(vn2, b);
        let or1 = _mm_or_si128(eqa1, eqb1);
        let or2 = _mm_or_si128(eqa2, eqb2);
        let or3 = _mm_or_si128(or1, or2);
        if _mm_movemask_epi8(or3) != 0 {
            let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr);
            let mask1 = _mm_movemask_epi8(eqb1);
            let mask2 = _mm_movemask_epi8(eqb2);
            if mask1 != 0 || mask2 != 0 {
                return Some(at + reverse_pos2(mask1, mask2));
            }

            at -= VECTOR_SIZE;
            let mask1 = _mm_movemask_epi8(eqa1);
            let mask2 = _mm_movemask_epi8(eqa2);
            return Some(at + reverse_pos2(mask1, mask2));
        }
    }
    while ptr >= start_ptr.add(VECTOR_SIZE) {
        ptr = ptr.sub(VECTOR_SIZE);
        if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) {
            return Some(i);
        }
    }
    if ptr > start_ptr {
        debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
        return reverse_search2(start_ptr, end_ptr, start_ptr, vn1, vn2);
    }
    None
}

#[target_feature(enable = "sse2")]
pub unsafe fn memrchr3(
    n1: u8,
    n2: u8,
    n3: u8,
    haystack: &[u8],
) -> Option<usize> {
    let vn1 = _mm_set1_epi8(n1 as i8);
    let vn2 = _mm_set1_epi8(n2 as i8);
    let vn3 = _mm_set1_epi8(n3 as i8);
    let len = haystack.len();
    let loop_size = cmp::min(LOOP_SIZE2, len);
    let start_ptr = haystack.as_ptr();
    let end_ptr = start_ptr.add(haystack.len());
    let mut ptr = end_ptr;

    if haystack.len() < VECTOR_SIZE {
        while ptr > start_ptr {
            ptr = ptr.offset(-1);
            if *ptr == n1 || *ptr == n2 || *ptr == n3 {
                return Some(sub(ptr, start_ptr));
            }
        }
        return None;
    }

    ptr = ptr.sub(VECTOR_SIZE);
    if let Some(i) = reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) {
        return Some(i);
    }

    ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8;
    debug_assert!(start_ptr <= ptr && ptr <= end_ptr);
    while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) {
        debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

        ptr = ptr.sub(loop_size);
        let a = _mm_load_si128(ptr as *const __m128i);
        let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
        let eqa1 = _mm_cmpeq_epi8(vn1, a);
        let eqb1 = _mm_cmpeq_epi8(vn1, b);
        let eqa2 = _mm_cmpeq_epi8(vn2, a);
        let eqb2 = _mm_cmpeq_epi8(vn2, b);
        let eqa3 = _mm_cmpeq_epi8(vn3, a);
        let eqb3 = _mm_cmpeq_epi8(vn3, b);
        let or1 = _mm_or_si128(eqa1, eqb1);
        let or2 = _mm_or_si128(eqa2, eqb2);
        let or3 = _mm_or_si128(eqa3, eqb3);
        let or4 = _mm_or_si128(or1, or2);
        let or5 = _mm_or_si128(or3, or4);
        if _mm_movemask_epi8(or5) != 0 {
            let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr);
            let mask1 = _mm_movemask_epi8(eqb1);
            let mask2 = _mm_movemask_epi8(eqb2);
            let mask3 = _mm_movemask_epi8(eqb3);
            if mask1 != 0 || mask2 != 0 || mask3 != 0 {
                return Some(at + reverse_pos3(mask1, mask2, mask3));
            }

            at -= VECTOR_SIZE;
            let mask1 = _mm_movemask_epi8(eqa1);
            let mask2 = _mm_movemask_epi8(eqa2);
            let mask3 = _mm_movemask_epi8(eqa3);
            return Some(at + reverse_pos3(mask1, mask2, mask3));
        }
    }
    while ptr >= start_ptr.add(VECTOR_SIZE) {
        ptr = ptr.sub(VECTOR_SIZE);
        if let Some(i) =
            reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3)
        {
            return Some(i);
        }
    }
    if ptr > start_ptr {
        debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE);
        return reverse_search3(start_ptr, end_ptr, start_ptr, vn1, vn2, vn3);
    }
    None
}

#[target_feature(enable = "sse2")]
pub unsafe fn forward_search1(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m128i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm_loadu_si128(ptr as *const __m128i);
    let mask = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, vn1));
    if mask != 0 {
        Some(sub(ptr, start_ptr) + forward_pos(mask))
    } else {
        None
    }
}

#[target_feature(enable = "sse2")]
unsafe fn forward_search2(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m128i,
    vn2: __m128i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm_loadu_si128(ptr as *const __m128i);
    let eq1 = _mm_cmpeq_epi8(chunk, vn1);
    let eq2 = _mm_cmpeq_epi8(chunk, vn2);
    if _mm_movemask_epi8(_mm_or_si128(eq1, eq2)) != 0 {
        let mask1 = _mm_movemask_epi8(eq1);
        let mask2 = _mm_movemask_epi8(eq2);
        Some(sub(ptr, start_ptr) + forward_pos2(mask1, mask2))
    } else {
        None
    }
}

#[target_feature(enable = "sse2")]
pub unsafe fn forward_search3(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m128i,
    vn2: __m128i,
    vn3: __m128i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm_loadu_si128(ptr as *const __m128i);
    let eq1 = _mm_cmpeq_epi8(chunk, vn1);
    let eq2 = _mm_cmpeq_epi8(chunk, vn2);
    let eq3 = _mm_cmpeq_epi8(chunk, vn3);
    let or = _mm_or_si128(eq1, eq2);
    if _mm_movemask_epi8(_mm_or_si128(or, eq3)) != 0 {
        let mask1 = _mm_movemask_epi8(eq1);
        let mask2 = _mm_movemask_epi8(eq2);
        let mask3 = _mm_movemask_epi8(eq3);
        Some(sub(ptr, start_ptr) + forward_pos3(mask1, mask2, mask3))
    } else {
        None
    }
}

#[target_feature(enable = "sse2")]
unsafe fn reverse_search1(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m128i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm_loadu_si128(ptr as *const __m128i);
    let mask = _mm_movemask_epi8(_mm_cmpeq_epi8(vn1, chunk));
    if mask != 0 {
        Some(sub(ptr, start_ptr) + reverse_pos(mask))
    } else {
        None
    }
}

#[target_feature(enable = "sse2")]
unsafe fn reverse_search2(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m128i,
    vn2: __m128i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm_loadu_si128(ptr as *const __m128i);
    let eq1 = _mm_cmpeq_epi8(chunk, vn1);
    let eq2 = _mm_cmpeq_epi8(chunk, vn2);
    if _mm_movemask_epi8(_mm_or_si128(eq1, eq2)) != 0 {
        let mask1 = _mm_movemask_epi8(eq1);
        let mask2 = _mm_movemask_epi8(eq2);
        Some(sub(ptr, start_ptr) + reverse_pos2(mask1, mask2))
    } else {
        None
    }
}

#[target_feature(enable = "sse2")]
unsafe fn reverse_search3(
    start_ptr: *const u8,
    end_ptr: *const u8,
    ptr: *const u8,
    vn1: __m128i,
    vn2: __m128i,
    vn3: __m128i,
) -> Option<usize> {
    debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE);
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE));

    let chunk = _mm_loadu_si128(ptr as *const __m128i);
    let eq1 = _mm_cmpeq_epi8(chunk, vn1);
    let eq2 = _mm_cmpeq_epi8(chunk, vn2);
    let eq3 = _mm_cmpeq_epi8(chunk, vn3);
    let or = _mm_or_si128(eq1, eq2);
    if _mm_movemask_epi8(_mm_or_si128(or, eq3)) != 0 {
        let mask1 = _mm_movemask_epi8(eq1);
        let mask2 = _mm_movemask_epi8(eq2);
        let mask3 = _mm_movemask_epi8(eq3);
        Some(sub(ptr, start_ptr) + reverse_pos3(mask1, mask2, mask3))
    } else {
        None
    }
}

/// Compute the position of the first matching byte from the given mask. The
/// position returned is always in the range [0, 15].
///
/// The mask given is expected to be the result of _mm_movemask_epi8.
fn forward_pos(mask: i32) -> usize {
    // We are dealing with little endian here, where the most significant byte
    // is at a higher address. That means the least significant bit that is
    // set corresponds to the position of our first matching byte. That
    // position corresponds to the number of zeros after the least significant
    // bit.
    mask.trailing_zeros() as usize
}

/// Compute the position of the first matching byte from the given masks. The
/// position returned is always in the range [0, 15]. Each mask corresponds to
/// the equality comparison of a single byte.
///
/// The masks given are expected to be the result of _mm_movemask_epi8, where
/// at least one of the masks is non-zero (i.e., indicates a match).
fn forward_pos2(mask1: i32, mask2: i32) -> usize {
    debug_assert!(mask1 != 0 || mask2 != 0);

    forward_pos(mask1 | mask2)
}

/// Compute the position of the first matching byte from the given masks. The
/// position returned is always in the range [0, 15]. Each mask corresponds to
/// the equality comparison of a single byte.
///
/// The masks given are expected to be the result of _mm_movemask_epi8, where
/// at least one of the masks is non-zero (i.e., indicates a match).
fn forward_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize {
    debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0);

    forward_pos(mask1 | mask2 | mask3)
}

/// Compute the position of the last matching byte from the given mask. The
/// position returned is always in the range [0, 15].
///
/// The mask given is expected to be the result of _mm_movemask_epi8.
fn reverse_pos(mask: i32) -> usize {
    // We are dealing with little endian here, where the most significant byte
    // is at a higher address. That means the most significant bit that is set
    // corresponds to the position of our last matching byte. The position from
    // the end of the mask is therefore the number of leading zeros in a 16
    // bit integer, and the position from the start of the mask is therefore
    // 16 - (leading zeros) - 1.
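    //
    // For example (a hypothetical mask, not taken from the original source):
    // if matches occurred at byte offsets 2 and 5, the mask is
    // 0b0000_0000_0010_0100, its 16-bit leading zero count is 10, and the
    // expression below yields 16 - 10 - 1 = 5, the offset of the last
    // matching byte.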
    VECTOR_SIZE - (mask as u16).leading_zeros() as usize - 1
}

/// Compute the position of the last matching byte from the given masks. The
/// position returned is always in the range [0, 15]. Each mask corresponds to
/// the equality comparison of a single byte.
///
/// The masks given are expected to be the result of _mm_movemask_epi8, where
/// at least one of the masks is non-zero (i.e., indicates a match).
fn reverse_pos2(mask1: i32, mask2: i32) -> usize {
    debug_assert!(mask1 != 0 || mask2 != 0);

    reverse_pos(mask1 | mask2)
}

/// Compute the position of the last matching byte from the given masks. The
/// position returned is always in the range [0, 15]. Each mask corresponds to
/// the equality comparison of a single byte.
///
/// The masks given are expected to be the result of _mm_movemask_epi8, where
/// at least one of the masks is non-zero (i.e., indicates a match).
fn reverse_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize {
    debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0);

    reverse_pos(mask1 | mask2 | mask3)
}

/// Subtract `b` from `a` and return the difference. `a` should be greater
/// than or equal to `b`.
fn sub(a: *const u8, b: *const u8) -> usize {
    debug_assert!(a >= b);
    (a as usize) - (b as usize)
}
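
// A minimal usage sketch (hypothetical test code, not part of the original
// module): these routines are `unsafe` `target_feature` functions, so the
// caller is expected to verify SSE2 support at runtime before invoking them.
// This sketch assumes `std` is available in the test build for the
// feature-detection macro.
#[cfg(test)]
mod example_usage {
    #[test]
    fn memchr_finds_first_and_last() {
        if is_x86_feature_detected!("sse2") {
            let haystack = b"the quick brown fox jumps over the lazy dog";
            // Safety: SSE2 availability was checked just above.
            unsafe {
                assert_eq!(super::memchr(b'q', haystack), Some(4));
                assert_eq!(super::memrchr(b'o', haystack), Some(41));
                assert_eq!(super::memchr(b'!', haystack), None);
            }
        }
    }
}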