use crate::soft::{x2, x4};
use crate::types::*;
use crate::vec128_storage;
use crate::x86_64::Avx2Machine;
use crate::x86_64::SseMachine as Machine86;
use crate::x86_64::{NoS3, NoS4, YesS3, YesS4};
use core::arch::x86_64::*;
use core::marker::PhantomData;
use core::ops::{
    Add, AddAssign, BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign, Not,
};
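
// The S3, S4 and NI type parameters track instruction-set availability at the
// type level: S3 is YesS3/NoS3 for SSSE3, S4 is YesS4/NoS4 for SSE4.1, and NI
// (presumably AES-NI) is carried through unchanged, so each wrapper type can
// select a different instruction sequence per feature level.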
macro_rules! impl_binop {
    ($vec:ident, $trait:ident, $fn:ident, $impl_fn:ident) => {
        impl<S3, S4, NI> $trait for $vec<S3, S4, NI> {
            type Output = Self;
            fn $fn(self, rhs: Self) -> Self::Output {
                Self::new(unsafe { $impl_fn(self.x, rhs.x) })
            }
        }
    };
}
macro_rules! impl_binop_assign {
    ($vec:ident, $trait:ident, $fn_assign:ident, $fn:ident) => {
        impl<S3, S4, NI> $trait for $vec<S3, S4, NI>
        where
            $vec<S3, S4, NI>: Copy,
        {
            fn $fn_assign(&mut self, rhs: Self) {
                *self = self.$fn(rhs);
            }
        }
    };
}
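
// def_vec generates one SSE2-backed 128-bit vector type wrapping a raw __m128i,
// together with its storage conversions, byte-level load/store, Default, Not,
// and the bitwise-operator impls shared by u32x4/u64x2/u128x1 below.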
macro_rules! def_vec {
    ($vec:ident, $word:ident) => {
        #[allow(non_camel_case_types)]
        #[derive(Copy, Clone)]
        pub struct $vec<S3, S4, NI> {
            x: __m128i,
            s3: PhantomData<S3>,
            s4: PhantomData<S4>,
            ni: PhantomData<NI>,
        }

        impl<S3, S4, NI> Store<vec128_storage> for $vec<S3, S4, NI> {
            unsafe fn unpack(x: vec128_storage) -> Self {
                Self::new(x.sse2)
            }
        }
        impl<S3, S4, NI> From<$vec<S3, S4, NI>> for vec128_storage {
            fn from(x: $vec<S3, S4, NI>) -> Self {
                vec128_storage { sse2: x.x }
            }
        }
        impl<S3, S4, NI> $vec<S3, S4, NI> {
            fn new(x: __m128i) -> Self {
                $vec {
                    x,
                    s3: PhantomData,
                    s4: PhantomData,
                    ni: PhantomData,
                }
            }
        }

        impl<S3, S4, NI> StoreBytes for $vec<S3, S4, NI>
        where
            Self: BSwap,
        {
            unsafe fn unsafe_read_le(input: &[u8]) -> Self {
                assert_eq!(input.len(), 16);
                Self::new(_mm_loadu_si128(input.as_ptr() as *const _))
            }
            unsafe fn unsafe_read_be(input: &[u8]) -> Self {
                assert_eq!(input.len(), 16);
                Self::new(_mm_loadu_si128(input.as_ptr() as *const _)).bswap()
            }
            fn write_le(self, out: &mut [u8]) {
                assert_eq!(out.len(), 16);
                unsafe { _mm_storeu_si128(out.as_mut_ptr() as *mut _, self.x) }
            }
            fn write_be(self, out: &mut [u8]) {
                assert_eq!(out.len(), 16);
                let x = self.bswap().x;
                unsafe {
                    _mm_storeu_si128(out.as_mut_ptr() as *mut _, x);
                }
            }
        }

        impl<S3, S4, NI> Default for $vec<S3, S4, NI> {
            fn default() -> Self {
                Self::new(unsafe { _mm_setzero_si128() })
            }
        }

        impl<S3, S4, NI> Not for $vec<S3, S4, NI> {
            type Output = Self;
            fn not(self) -> Self::Output {
                unsafe {
                    // xor with an all-ones mask implements bitwise NOT
                    let ff = _mm_set1_epi64x(-1i64);
                    self ^ Self::new(ff)
                }
            }
        }

        impl<S3: Copy, S4: Copy, NI: Copy> BitOps0 for $vec<S3, S4, NI> {}
        impl_binop!($vec, BitAnd, bitand, _mm_and_si128);
        impl_binop!($vec, BitOr, bitor, _mm_or_si128);
        impl_binop!($vec, BitXor, bitxor, _mm_xor_si128);
        impl_binop_assign!($vec, BitAndAssign, bitand_assign, bitand);
        impl_binop_assign!($vec, BitOrAssign, bitor_assign, bitor);
        impl_binop_assign!($vec, BitXorAssign, bitxor_assign, bitxor);
        impl<S3: Copy, S4: Copy, NI: Copy> AndNot for $vec<S3, S4, NI> {
            type Output = Self;
            fn andnot(self, rhs: Self) -> Self {
                Self::new(unsafe { _mm_andnot_si128(self.x, rhs.x) })
            }
        }
    };
}
macro_rules! impl_bitops32 {
    ($vec:ident) => {
        impl<S3: Copy, S4: Copy, NI: Copy> BitOps32 for $vec<S3, S4, NI> where
            $vec<S3, S4, NI>: RotateEachWord32
        {
        }
    };
}

macro_rules! impl_bitops64 {
    ($vec:ident) => {
        impl_bitops32!($vec);
        impl<S3: Copy, S4: Copy, NI: Copy> BitOps64 for $vec<S3, S4, NI> where
            $vec<S3, S4, NI>: RotateEachWord64 + RotateEachWord32
        {
        }
    };
}

macro_rules! impl_bitops128 {
    ($vec:ident) => {
        impl_bitops64!($vec);
        impl<S3: Copy, S4: Copy, NI: Copy> BitOps128 for $vec<S3, S4, NI> where
            $vec<S3, S4, NI>: RotateEachWord128
        {
        }
    };
}
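
// Per-word rotations come in two flavours: on SSSE3 (YesS3), rotations by a
// multiple of 8 bits are a single pshufb byte shuffle, while the generic form
// uses the usual (x >> n) | (x << (width - n)) shift-and-or sequence.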
macro_rules! rotr_32_s3 {
    ($name:ident, $k0:expr, $k1:expr) => {
        fn $name(self) -> Self {
            Self::new(unsafe {
                _mm_shuffle_epi8(
                    self.x,
                    _mm_set_epi64x($k0, $k1),
                )
            })
        }
    };
}
macro_rules! rotr_32 {
    ($name:ident, $i:expr) => {
        fn $name(self) -> Self {
            Self::new(unsafe {
                _mm_or_si128(
                    _mm_srli_epi32(self.x, $i as i32),
                    _mm_slli_epi32(self.x, 32 - $i as i32),
                )
            })
        }
    };
}
impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<YesS3, S4, NI> {
    rotr_32!(rotate_each_word_right7, 7);
    rotr_32_s3!(
        rotate_each_word_right8,
        0x0c0f_0e0d_080b_0a09,
        0x0407_0605_0003_0201
    );
    rotr_32!(rotate_each_word_right11, 11);
    rotr_32!(rotate_each_word_right12, 12);
    rotr_32_s3!(
        rotate_each_word_right16,
        0x0d0c_0f0e_0908_0b0a,
        0x0504_0706_0100_0302
    );
    rotr_32!(rotate_each_word_right20, 20);
    rotr_32_s3!(
        rotate_each_word_right24,
        0x0e0d_0c0f_0a09_080b,
        0x0605_0407_0201_0003
    );
    rotr_32!(rotate_each_word_right25, 25);
}
impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<NoS3, S4, NI> {
    rotr_32!(rotate_each_word_right7, 7);
    rotr_32!(rotate_each_word_right8, 8);
    rotr_32!(rotate_each_word_right11, 11);
    rotr_32!(rotate_each_word_right12, 12);
    fn rotate_each_word_right16(self) -> Self {
        Self::new(swap16_s2(self.x))
    }
    rotr_32!(rotate_each_word_right20, 20);
    rotr_32!(rotate_each_word_right24, 24);
    rotr_32!(rotate_each_word_right25, 25);
}
macro_rules! rotr_64_s3 {
    ($name:ident, $k0:expr, $k1:expr) => {
        fn $name(self) -> Self {
            Self::new(unsafe {
                _mm_shuffle_epi8(
                    self.x,
                    _mm_set_epi64x($k0, $k1),
                )
            })
        }
    };
}
macro_rules! rotr_64 {
    ($name:ident, $i:expr) => {
        fn $name(self) -> Self {
            Self::new(unsafe {
                _mm_or_si128(
                    _mm_srli_epi64(self.x, $i as i32),
                    _mm_slli_epi64(self.x, 64 - $i as i32),
                )
            })
        }
    };
}
impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<YesS3, S4, NI> {
    rotr_64!(rotate_each_word_right7, 7);
    rotr_64_s3!(
        rotate_each_word_right8,
        0x080f_0e0d_0c0b_0a09,
        0x0007_0605_0403_0201
    );
    rotr_64!(rotate_each_word_right11, 11);
    rotr_64!(rotate_each_word_right12, 12);
    rotr_64_s3!(
        rotate_each_word_right16,
        0x0908_0f0e_0d0c_0b0a,
        0x0100_0706_0504_0302
    );
    rotr_64!(rotate_each_word_right20, 20);
    rotr_64_s3!(
        rotate_each_word_right24,
        0x0a09_080f_0e0d_0c0b,
        0x0201_0007_0605_0403
    );
    rotr_64!(rotate_each_word_right25, 25);
}
impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<NoS3, S4, NI> {
    rotr_64!(rotate_each_word_right7, 7);
    rotr_64!(rotate_each_word_right8, 8);
    rotr_64!(rotate_each_word_right11, 11);
    rotr_64!(rotate_each_word_right12, 12);
    fn rotate_each_word_right16(self) -> Self {
        Self::new(swap16_s2(self.x))
    }
    rotr_64!(rotate_each_word_right20, 20);
    rotr_64!(rotate_each_word_right24, 24);
    rotr_64!(rotate_each_word_right25, 25);
}
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u64x2_sse2<S3, S4, NI> {
    fn rotate_each_word_right32(self) -> Self {
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b10110001) })
    }
}
macro_rules! rotr_128 {
    ($name:ident, $i:expr) => {
        fn $name(self) -> Self {
            Self::new(unsafe {
                _mm_or_si128(
                    _mm_srli_si128(self.x, $i as i32),
                    _mm_slli_si128(self.x, 128 - $i as i32),
                )
            })
        }
    };
}
// TODO: completely unoptimized
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord32 for u128x1_sse2<S3, S4, NI> {
    rotr_128!(rotate_each_word_right7, 7);
    rotr_128!(rotate_each_word_right8, 8);
    rotr_128!(rotate_each_word_right11, 11);
    rotr_128!(rotate_each_word_right12, 12);
    rotr_128!(rotate_each_word_right16, 16);
    rotr_128!(rotate_each_word_right20, 20);
    rotr_128!(rotate_each_word_right24, 24);
    rotr_128!(rotate_each_word_right25, 25);
}
// TODO: completely unoptimized
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u128x1_sse2<S3, S4, NI> {
    rotr_128!(rotate_each_word_right32, 32);
}
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord128 for u128x1_sse2<S3, S4, NI> {}

def_vec!(u32x4_sse2, u32);
def_vec!(u64x2_sse2, u64);
def_vec!(u128x1_sse2, u128);
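
// Lane access: with SSE4.1 (YesS4) lanes are moved directly with pextr/pinsr;
// the plain-SSE2 (NoS4) impls below emulate the same operations with moves,
// shifts and shuffles.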
impl<S3, NI> MultiLane<[u32; 4]> for u32x4_sse2<S3, YesS4, NI> {
    fn to_lanes(self) -> [u32; 4] {
        unsafe {
            let x = _mm_cvtsi128_si64(self.x) as u64;
            let y = _mm_extract_epi64(self.x, 1) as u64;
            [x as u32, (x >> 32) as u32, y as u32, (y >> 32) as u32]
        }
    }
    fn from_lanes(xs: [u32; 4]) -> Self {
        unsafe {
            let mut x = _mm_cvtsi64_si128((xs[0] as u64 | ((xs[1] as u64) << 32)) as i64);
            x = _mm_insert_epi64(x, (xs[2] as u64 | ((xs[3] as u64) << 32)) as i64, 1);
            Self::new(x)
        }
    }
}
impl<S3, NI> MultiLane<[u32; 4]> for u32x4_sse2<S3, NoS4, NI> {
    fn to_lanes(self) -> [u32; 4] {
        unsafe {
            let x = _mm_cvtsi128_si64(self.x) as u64;
            let y = _mm_cvtsi128_si64(_mm_shuffle_epi32(self.x, 0b11101110)) as u64;
            [x as u32, (x >> 32) as u32, y as u32, (y >> 32) as u32]
        }
    }
    fn from_lanes(xs: [u32; 4]) -> Self {
        unsafe {
            let x = (xs[0] as u64 | ((xs[1] as u64) << 32)) as i64;
            let y = (xs[2] as u64 | ((xs[3] as u64) << 32)) as i64;
            let x = _mm_cvtsi64_si128(x);
            let y = _mm_slli_si128(_mm_cvtsi64_si128(y), 8);
            Self::new(_mm_or_si128(x, y))
        }
    }
}
impl<S3, NI> MultiLane<[u64; 2]> for u64x2_sse2<S3, YesS4, NI> {
    fn to_lanes(self) -> [u64; 2] {
        unsafe {
            [
                _mm_cvtsi128_si64(self.x) as u64,
                _mm_extract_epi64(self.x, 1) as u64,
            ]
        }
    }
    fn from_lanes(xs: [u64; 2]) -> Self {
        unsafe {
            let mut x = _mm_cvtsi64_si128(xs[0] as i64);
            x = _mm_insert_epi64(x, xs[1] as i64, 1);
            Self::new(x)
        }
    }
}
impl<S3, NI> MultiLane<[u64; 2]> for u64x2_sse2<S3, NoS4, NI> {
    fn to_lanes(self) -> [u64; 2] {
        unsafe {
            [
                _mm_cvtsi128_si64(self.x) as u64,
                _mm_cvtsi128_si64(_mm_srli_si128(self.x, 8)) as u64,
            ]
        }
    }
    fn from_lanes(xs: [u64; 2]) -> Self {
        unsafe {
            let x = _mm_cvtsi64_si128(xs[0] as i64);
            let y = _mm_slli_si128(_mm_cvtsi64_si128(xs[1] as i64), 8);
            Self::new(_mm_or_si128(x, y))
        }
    }
}
impl<S3, S4, NI> MultiLane<[u128; 1]> for u128x1_sse2<S3, S4, NI> {
    fn to_lanes(self) -> [u128; 1] {
        unimplemented!()
    }
    fn from_lanes(xs: [u128; 1]) -> Self {
        unimplemented!()
    }
}
impl<S3, S4, NI> MultiLane<[u64; 4]> for u64x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: MultiLane<[u64; 2]> + Copy,
{
    fn to_lanes(self) -> [u64; 4] {
        let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes());
        [a[0], a[1], b[0], b[1]]
    }
    fn from_lanes(xs: [u64; 4]) -> Self {
        x2::new([
            u64x2_sse2::from_lanes([xs[0], xs[1]]),
            u64x2_sse2::from_lanes([xs[2], xs[3]]),
        ])
    }
}
macro_rules! impl_into {
    ($from:ident, $to:ident) => {
        impl<S3, S4, NI> From<$from<S3, S4, NI>> for $to<S3, S4, NI> {
            fn from(x: $from<S3, S4, NI>) -> Self {
                $to::new(x.x)
            }
        }
    };
}

impl_into!(u128x1_sse2, u32x4_sse2);
impl_into!(u128x1_sse2, u64x2_sse2);

impl_bitops32!(u32x4_sse2);
impl_bitops64!(u64x2_sse2);
impl_bitops128!(u128x1_sse2);
impl<S3: Copy, S4: Copy, NI: Copy> ArithOps for u32x4_sse2<S3, S4, NI> where
    u32x4_sse2<S3, S4, NI>: BSwap
{
}
impl<S3: Copy, S4: Copy, NI: Copy> ArithOps for u64x2_sse2<S3, S4, NI> where
    u64x2_sse2<S3, S4, NI>: BSwap
{
}
impl_binop!(u32x4_sse2, Add, add, _mm_add_epi32);
impl_binop!(u64x2_sse2, Add, add, _mm_add_epi64);
impl_binop_assign!(u32x4_sse2, AddAssign, add_assign, add);
impl_binop_assign!(u64x2_sse2, AddAssign, add_assign, add);
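
// The impls below are empty marker impls: they assert that the SSE2-backed types
// satisfy the machine-level vector traits (u32x4, u64x2, ...) for the generic
// SSE machine and, when SSSE3/SSE4.1 are available, for the AVX2 machine; all
// behaviour comes from the super-traits listed in the where clauses.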
impl<S3: Copy, S4: Copy, NI: Copy> u32x4<Machine86<S3, S4, NI>> for u32x4_sse2<S3, S4, NI>
where
    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap + MultiLane<[u32; 4]> + Vec4<u32>,
    Machine86<S3, S4, NI>: Machine,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x2<Machine86<S3, S4, NI>> for u64x2_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>:
        RotateEachWord64 + RotateEachWord32 + BSwap + MultiLane<[u64; 2]> + Vec2<u64>,
    Machine86<S3, S4, NI>: Machine,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u128x1<Machine86<S3, S4, NI>> for u128x1_sse2<S3, S4, NI>
where
    u128x1_sse2<S3, S4, NI>: Swap64 + RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u128x1_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4>,
    u128x1_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2>,
{
}

impl<NI: Copy> u32x4<Avx2Machine<NI>> for u32x4_sse2<YesS3, YesS4, NI>
where
    u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap + MultiLane<[u32; 4]> + Vec4<u32>,
    Machine86<YesS3, YesS4, NI>: Machine,
{
}
impl<NI: Copy> u64x2<Avx2Machine<NI>> for u64x2_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>:
        RotateEachWord64 + RotateEachWord32 + BSwap + MultiLane<[u64; 2]> + Vec2<u64>,
    Machine86<YesS3, YesS4, NI>: Machine,
{
}
impl<NI: Copy> u128x1<Avx2Machine<NI>> for u128x1_sse2<YesS3, YesS4, NI>
where
    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<YesS3, YesS4, NI>: Machine,
    u128x1_sse2<YesS3, YesS4, NI>: Into<<Machine86<YesS3, YesS4, NI> as Machine>::u32x4>,
    u128x1_sse2<YesS3, YesS4, NI>: Into<<Machine86<YesS3, YesS4, NI> as Machine>::u64x2>,
{
}
impl<S3, S4, NI> UnsafeFrom<[u32; 4]> for u32x4_sse2<S3, S4, NI> {
    unsafe fn unsafe_from(xs: [u32; 4]) -> Self {
        Self::new(_mm_set_epi32(
            xs[3] as i32,
            xs[2] as i32,
            xs[1] as i32,
            xs[0] as i32,
        ))
    }
}
impl<S3, NI> Vec4<u32> for u32x4_sse2<S3, YesS4, NI>
where
    Self: MultiLane<[u32; 4]>,
{
    fn extract(self, i: u32) -> u32 {
        self.to_lanes()[i as usize]
    }
    fn insert(self, v: u32, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => _mm_insert_epi32(self.x, v as i32, 0),
                1 => _mm_insert_epi32(self.x, v as i32, 1),
                2 => _mm_insert_epi32(self.x, v as i32, 2),
                3 => _mm_insert_epi32(self.x, v as i32, 3),
                _ => unreachable!(),
            }
        })
    }
}
impl<S3, NI> Vec4<u32> for u32x4_sse2<S3, NoS4, NI>
where
    Self: MultiLane<[u32; 4]>,
{
    fn extract(self, i: u32) -> u32 {
        self.to_lanes()[i as usize]
    }
    fn insert(self, v: u32, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => {
                    let x = _mm_andnot_si128(_mm_cvtsi32_si128(-1), self.x);
                    _mm_or_si128(x, _mm_cvtsi32_si128(v as i32))
                }
                1 => {
                    let mut x = _mm_shuffle_epi32(self.x, 0b0111_1000);
                    x = _mm_slli_si128(x, 4);
                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
                    _mm_shuffle_epi32(x, 0b1110_0001)
                }
                2 => {
                    let mut x = _mm_shuffle_epi32(self.x, 0b1011_0100);
                    x = _mm_slli_si128(x, 4);
                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
                    _mm_shuffle_epi32(x, 0b1100_1001)
                }
                3 => {
                    let mut x = _mm_slli_si128(self.x, 4);
                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
                    _mm_shuffle_epi32(x, 0b0011_1001)
                }
                _ => unreachable!(),
            }
        })
    }
}
impl<S3, S4, NI> LaneWords4 for u32x4_sse2<S3, S4, NI> {
    fn shuffle_lane_words2301(self) -> Self {
        self.shuffle2301()
    }
    fn shuffle_lane_words1230(self) -> Self {
        self.shuffle1230()
    }
    fn shuffle_lane_words3012(self) -> Self {
        self.shuffle3012()
    }
}

impl<S3, S4, NI> Words4 for u32x4_sse2<S3, S4, NI> {
    fn shuffle2301(self) -> Self {
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
    }
    fn shuffle1230(self) -> Self {
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b1001_0011) })
    }
    fn shuffle3012(self) -> Self {
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b0011_1001) })
    }
}
impl<S4, NI> Words4 for u64x4_sse2<YesS3, S4, NI> {
    fn shuffle2301(self) -> Self {
        x2::new([u64x2_sse2::new(self.0[1].x), u64x2_sse2::new(self.0[0].x)])
    }
    fn shuffle3012(self) -> Self {
        unsafe {
            x2::new([
                u64x2_sse2::new(_mm_alignr_epi8(self.0[1].x, self.0[0].x, 8)),
                u64x2_sse2::new(_mm_alignr_epi8(self.0[0].x, self.0[1].x, 8)),
            ])
        }
    }
    fn shuffle1230(self) -> Self {
        unsafe {
            x2::new([
                u64x2_sse2::new(_mm_alignr_epi8(self.0[0].x, self.0[1].x, 8)),
                u64x2_sse2::new(_mm_alignr_epi8(self.0[1].x, self.0[0].x, 8)),
            ])
        }
    }
}
impl<S4, NI> Words4 for u64x4_sse2<NoS3, S4, NI> {
    fn shuffle2301(self) -> Self {
        x2::new([u64x2_sse2::new(self.0[1].x), u64x2_sse2::new(self.0[0].x)])
    }
    fn shuffle3012(self) -> Self {
        unsafe {
            let a = _mm_srli_si128(self.0[0].x, 8);
            let b = _mm_slli_si128(self.0[0].x, 8);
            let c = _mm_srli_si128(self.0[1].x, 8);
            let d = _mm_slli_si128(self.0[1].x, 8);
            let da = _mm_or_si128(d, a);
            let bc = _mm_or_si128(b, c);
            x2::new([u64x2_sse2::new(da), u64x2_sse2::new(bc)])
        }
    }
    fn shuffle1230(self) -> Self {
        unsafe {
            let a = _mm_srli_si128(self.0[0].x, 8);
            let b = _mm_slli_si128(self.0[0].x, 8);
            let c = _mm_srli_si128(self.0[1].x, 8);
            let d = _mm_slli_si128(self.0[1].x, 8);
            let da = _mm_or_si128(d, a);
            let bc = _mm_or_si128(b, c);
            x2::new([u64x2_sse2::new(bc), u64x2_sse2::new(da)])
        }
    }
}
impl<S3, S4, NI> UnsafeFrom<[u64; 2]> for u64x2_sse2<S3, S4, NI> {
    unsafe fn unsafe_from(xs: [u64; 2]) -> Self {
        Self::new(_mm_set_epi64x(xs[1] as i64, xs[0] as i64))
    }
}

impl<S3, NI> Vec2<u64> for u64x2_sse2<S3, YesS4, NI> {
    fn extract(self, i: u32) -> u64 {
        unsafe {
            match i {
                0 => _mm_cvtsi128_si64(self.x) as u64,
                1 => _mm_extract_epi64(self.x, 1) as u64,
                _ => unreachable!(),
            }
        }
    }
    fn insert(self, x: u64, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => _mm_insert_epi64(self.x, x as i64, 0),
                1 => _mm_insert_epi64(self.x, x as i64, 1),
                _ => unreachable!(),
            }
        })
    }
}
impl<S3, NI> Vec2<u64> for u64x2_sse2<S3, NoS4, NI> {
    fn extract(self, i: u32) -> u64 {
        unsafe {
            match i {
                0 => _mm_cvtsi128_si64(self.x) as u64,
                1 => _mm_cvtsi128_si64(_mm_shuffle_epi32(self.x, 0b11101110)) as u64,
                _ => unreachable!(),
            }
        }
    }
    fn insert(self, x: u64, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => _mm_or_si128(
                    _mm_andnot_si128(_mm_cvtsi64_si128(-1), self.x),
                    _mm_cvtsi64_si128(x as i64),
                ),
                1 => _mm_or_si128(
                    _mm_move_epi64(self.x),
                    _mm_slli_si128(_mm_cvtsi64_si128(x as i64), 8),
                ),
                _ => unreachable!(),
            }
        })
    }
}
impl<S4, NI> BSwap for u32x4_sse2<YesS3, S4, NI> {
    fn bswap(self) -> Self {
        Self::new(unsafe {
            let k = _mm_set_epi64x(0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}

fn bswap32_s2(x: __m128i) -> __m128i {
    unsafe {
        let mut y = _mm_unpacklo_epi8(x, _mm_setzero_si128());
        y = _mm_shufflehi_epi16(y, 0b0001_1011);
        y = _mm_shufflelo_epi16(y, 0b0001_1011);
        let mut z = _mm_unpackhi_epi8(x, _mm_setzero_si128());
        z = _mm_shufflehi_epi16(z, 0b0001_1011);
        z = _mm_shufflelo_epi16(z, 0b0001_1011);
        _mm_packus_epi16(y, z)
    }
}
impl<S4, NI> BSwap for u32x4_sse2<NoS3, S4, NI> {
    fn bswap(self) -> Self {
        Self::new(bswap32_s2(self.x))
    }
}

impl<S4, NI> BSwap for u64x2_sse2<YesS3, S4, NI> {
    fn bswap(self) -> Self {
        Self::new(unsafe {
            let k = _mm_set_epi64x(0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}
impl<S4, NI> BSwap for u64x2_sse2<NoS3, S4, NI> {
    fn bswap(self) -> Self {
        Self::new(unsafe { bswap32_s2(_mm_shuffle_epi32(self.x, 0b1011_0001)) })
    }
}

impl<S4, NI> BSwap for u128x1_sse2<YesS3, S4, NI> {
    fn bswap(self) -> Self {
        Self::new(unsafe {
            let k = _mm_set_epi64x(0x0f0e_0d0c_0b0a_0908, 0x0706_0504_0302_0100);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}
impl<S4, NI> BSwap for u128x1_sse2<NoS3, S4, NI> {
    fn bswap(self) -> Self {
        Self::new(unsafe { unimplemented!() })
    }
}
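
// swapi swaps adjacent $i-bit groups using a mask/shift/or sequence (the 0xaa,
// 0xcc and 0xf0 masks select the high half of each 2-, 4- and 8-bit group);
// swap16_s2 swaps adjacent 16-bit units with pshuflw/pshufhw, needing only SSE2.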
macro_rules! swapi {
    ($x:expr, $i:expr, $k:expr) => {
        unsafe {
            const K: u8 = $k;
            let k = _mm_set1_epi8(K as i8);
            u128x1_sse2::new(_mm_or_si128(
                _mm_srli_epi16(_mm_and_si128($x.x, k), $i),
                _mm_and_si128(_mm_slli_epi16($x.x, $i), k),
            ))
        }
    };
}

fn swap16_s2(x: __m128i) -> __m128i {
    unsafe { _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, 0b1011_0001), 0b1011_0001) }
}
impl<S4, NI> Swap64 for u128x1_sse2<YesS3, S4, NI> {
    fn swap1(self) -> Self {
        swapi!(self, 1, 0xaa)
    }
    fn swap2(self) -> Self {
        swapi!(self, 2, 0xcc)
    }
    fn swap4(self) -> Self {
        swapi!(self, 4, 0xf0)
    }
    fn swap8(self) -> Self {
        u128x1_sse2::new(unsafe {
            let k = _mm_set_epi64x(0x0e0f_0c0d_0a0b_0809, 0x0607_0405_0203_0001);
            _mm_shuffle_epi8(self.x, k)
        })
    }
    fn swap16(self) -> Self {
        u128x1_sse2::new(unsafe {
            let k = _mm_set_epi64x(0x0d0c_0f0e_0908_0b0a, 0x0504_0706_0100_0302);
            _mm_shuffle_epi8(self.x, k)
        })
    }
    fn swap32(self) -> Self {
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) })
    }
    fn swap64(self) -> Self {
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
    }
}
impl<S4, NI> Swap64 for u128x1_sse2<NoS3, S4, NI> {
    fn swap1(self) -> Self {
        swapi!(self, 1, 0xaa)
    }
    fn swap2(self) -> Self {
        swapi!(self, 2, 0xcc)
    }
    fn swap4(self) -> Self {
        swapi!(self, 4, 0xf0)
    }
    fn swap8(self) -> Self {
        u128x1_sse2::new(unsafe {
            _mm_or_si128(_mm_slli_epi16(self.x, 8), _mm_srli_epi16(self.x, 8))
        })
    }
    fn swap16(self) -> Self {
        u128x1_sse2::new(swap16_s2(self.x))
    }
    fn swap32(self) -> Self {
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) })
    }
    fn swap64(self) -> Self {
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
    }
}
#[derive(Copy, Clone)]
pub struct G0;
#[derive(Copy, Clone)]
pub struct G1;

#[allow(non_camel_case_types)]
pub type u32x4x2_sse2<S3, S4, NI> = x2<u32x4_sse2<S3, S4, NI>, G0>;
#[allow(non_camel_case_types)]
pub type u64x2x2_sse2<S3, S4, NI> = x2<u64x2_sse2<S3, S4, NI>, G0>;
#[allow(non_camel_case_types)]
pub type u64x4_sse2<S3, S4, NI> = x2<u64x2_sse2<S3, S4, NI>, G1>;
#[allow(non_camel_case_types)]
pub type u128x2_sse2<S3, S4, NI> = x2<u128x1_sse2<S3, S4, NI>, G0>;

#[allow(non_camel_case_types)]
pub type u32x4x4_sse2<S3, S4, NI> = x4<u32x4_sse2<S3, S4, NI>>;
#[allow(non_camel_case_types)]
pub type u64x2x4_sse2<S3, S4, NI> = x4<u64x2_sse2<S3, S4, NI>>;
#[allow(non_camel_case_types)]
pub type u128x4_sse2<S3, S4, NI> = x4<u128x1_sse2<S3, S4, NI>>;
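
// The 256/512-bit logical types are built from the generic x2/x4 wrappers in
// crate::soft, so each half or quarter is still one SSE2 register. The G0/G1
// markers only keep otherwise-identical x2 instantiations (u64x2x2 vs u64x4)
// distinct so they can carry different trait impls.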
impl<S3: Copy, S4: Copy, NI: Copy> u32x4x2<Machine86<S3, S4, NI>> for u32x4x2_sse2<S3, S4, NI>
where
    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u32x4x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 2]>,
    u32x4x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u32x4>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x2x2<Machine86<S3, S4, NI>> for u64x2x2_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u64x2x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u64x2; 2]>,
    u64x2x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u64x2>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x4<Machine86<S3, S4, NI>> for u64x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u64x4_sse2<S3, S4, NI>: MultiLane<[u64; 4]> + Vec4<u64> + Words4,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u128x2<Machine86<S3, S4, NI>> for u128x2_sse2<S3, S4, NI>
where
    u128x1_sse2<S3, S4, NI>: Swap64 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u128x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u128x1; 2]>,
    u128x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u128x1>,
    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4x2>,
    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2x2>,
    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x4>,
{
}
impl<NI: Copy> u32x4x2<Avx2Machine<NI>> for u32x4x2_sse2<YesS3, YesS4, NI>
where
    u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u32x4x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u32x4; 2]>,
    u32x4x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u32x4>,
{
}
impl<NI: Copy> u64x2x2<Avx2Machine<NI>> for u64x2x2_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u64x2x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u64x2; 2]>,
    u64x2x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u64x2>,
{
}
impl<NI: Copy> u64x4<Avx2Machine<NI>> for u64x4_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u64x4_sse2<YesS3, YesS4, NI>: MultiLane<[u64; 4]> + Vec4<u64> + Words4,
{
}
impl<NI: Copy> u128x2<Avx2Machine<NI>> for u128x2_sse2<YesS3, YesS4, NI>
where
    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + BSwap,
    Avx2Machine<NI>: Machine,
    u128x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u128x1; 2]>,
    u128x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u128x1>,
    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u32x4x2>,
    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x2x2>,
    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x4>,
{
}
impl<S3, S4, NI> Vec4<u64> for u64x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: Copy + Vec2<u64>,
{
    fn extract(self, i: u32) -> u64 {
        match i {
            0 => self.0[0].extract(0),
            1 => self.0[0].extract(1),
            2 => self.0[1].extract(0),
            3 => self.0[1].extract(1),
            _ => unreachable!(),
        }
    }
    fn insert(mut self, w: u64, i: u32) -> Self {
        match i {
            0 => self.0[0] = self.0[0].insert(w, 0),
            1 => self.0[0] = self.0[0].insert(w, 1),
            2 => self.0[1] = self.0[1].insert(w, 0),
            3 => self.0[1] = self.0[1].insert(w, 1),
            _ => unreachable!(),
        }
        self
    }
}
impl<S3: Copy, S4: Copy, NI: Copy> u32x4x4<Machine86<S3, S4, NI>> for u32x4x4_sse2<S3, S4, NI>
where
    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u32x4x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 4]>,
    u32x4x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u32x4>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x2x4<Machine86<S3, S4, NI>> for u64x2x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u64x2x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u64x2; 4]>,
    u64x2x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u64x2>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u128x4<Machine86<S3, S4, NI>> for u128x4_sse2<S3, S4, NI>
where
    u128x1_sse2<S3, S4, NI>: Swap64 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u128x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u128x1; 4]>,
    u128x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u128x1>,
    u128x4_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4x4>,
    u128x4_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2x4>,
{
}
impl<NI: Copy> u32x4x4<Avx2Machine<NI>> for u32x4x4_sse2<YesS3, YesS4, NI>
where
    u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u32x4x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u32x4; 4]>,
    u32x4x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u32x4>,
{
}
impl<NI: Copy> u64x2x4<Avx2Machine<NI>> for u64x2x4_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u64x2x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u64x2; 4]>,
    u64x2x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u64x2>,
{
}
impl<NI: Copy> u128x4<Avx2Machine<NI>> for u128x4_sse2<YesS3, YesS4, NI>
where
    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + BSwap,
    Avx2Machine<NI>: Machine,
    u128x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u128x1; 4]>,
    u128x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u128x1>,
    u128x4_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u32x4x4>,
    u128x4_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x2x4>,
{
}
macro_rules! impl_into_x {
    ($from:ident, $to:ident) => {
        impl<S3: Copy, S4: Copy, NI: Copy, Gf, Gt> From<x2<$from<S3, S4, NI>, Gf>>
            for x2<$to<S3, S4, NI>, Gt>
        {
            fn from(x: x2<$from<S3, S4, NI>, Gf>) -> Self {
                x2::new([$to::from(x.0[0]), $to::from(x.0[1])])
            }
        }
        impl<S3: Copy, S4: Copy, NI: Copy> From<x4<$from<S3, S4, NI>>> for x4<$to<S3, S4, NI>> {
            fn from(x: x4<$from<S3, S4, NI>>) -> Self {
                x4::new([
                    $to::from(x.0[0]),
                    $to::from(x.0[1]),
                    $to::from(x.0[2]),
                    $to::from(x.0[3]),
                ])
            }
        }
    };
}
impl_into_x!(u128x1_sse2, u64x2_sse2);
impl_into_x!(u128x1_sse2, u32x4_sse2);
use core::fmt::{Debug, Formatter, Result};

impl<W: PartialEq, G> PartialEq for x2<W, G> {
    fn eq(&self, rhs: &Self) -> bool {
        self.0[0] == rhs.0[0] && self.0[1] == rhs.0[1]
    }
}

unsafe fn eq128_s4(x: __m128i, y: __m128i) -> bool {
    let q = _mm_shuffle_epi32(_mm_cmpeq_epi64(x, y), 0b1100_0110);
    _mm_cvtsi128_si64(q) == -1
}

unsafe fn eq128_s2(x: __m128i, y: __m128i) -> bool {
    let q = _mm_cmpeq_epi32(x, y);
    let p = _mm_cvtsi128_si64(_mm_srli_si128(q, 8));
    let q = _mm_cvtsi128_si64(q);
    (p & q) == -1
}

impl<S3, S4, NI> PartialEq for u32x4_sse2<S3, S4, NI> {
    fn eq(&self, rhs: &Self) -> bool {
        unsafe { eq128_s2(self.x, rhs.x) }
    }
}
impl<S3, S4, NI> Debug for u32x4_sse2<S3, S4, NI>
where
    Self: Copy + MultiLane<[u32; 4]>,
{
    fn fmt(&self, fmt: &mut Formatter) -> Result {
        fmt.write_fmt(format_args!("{:08x?}", &self.to_lanes()))
    }
}

impl<S3, S4, NI> PartialEq for u64x2_sse2<S3, S4, NI> {
    fn eq(&self, rhs: &Self) -> bool {
        unsafe { eq128_s2(self.x, rhs.x) }
    }
}
impl<S3, S4, NI> Debug for u64x2_sse2<S3, S4, NI>
where
    Self: Copy + MultiLane<[u64; 2]>,
{
    fn fmt(&self, fmt: &mut Formatter) -> Result {
        fmt.write_fmt(format_args!("{:016x?}", &self.to_lanes()))
    }
}

impl<S3, S4, NI> Debug for u64x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: Copy + MultiLane<[u64; 2]>,
{
    fn fmt(&self, fmt: &mut Formatter) -> Result {
        let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes());
        fmt.write_fmt(format_args!("{:016x?}", &[a[0], a[1], b[0], b[1]]))
    }
}
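
// The tests below cross-check the SSE2-only fallback paths against the SSSE3 and
// SSE4.1 implementations, and check lane extraction/insertion against known values.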
#[cfg(test)]
mod test {
    use super::*;
    use crate::x86_64::{SSE2, SSE41, SSSE3};

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn test_bswap32_s2_vs_s3() {
        let xs = [0x0f0e_0d0c, 0x0b0a_0908, 0x0706_0504, 0x0302_0100];
        let ys = [0x0c0d_0e0f, 0x0809_0a0b, 0x0405_0607, 0x0001_0203];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
        let x_s2 = x_s2.bswap();

        let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
        let x_s3 = x_s3.bswap();

        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });
        assert_eq!(x_s2, s2.vec(ys));
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn test_bswap64_s2_vs_s3() {
        let xs = [0x0f0e_0d0c_0b0a_0908, 0x0706_0504_0302_0100];
        let ys = [0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
        let x_s2 = x_s2.bswap();

        let x_s3: <SSSE3 as Machine>::u64x2 = s3.vec(xs);
        let x_s3 = x_s3.bswap();

        assert_eq!(x_s2, s2.vec(ys));
        assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) });
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn test_shuffle32_s2_vs_s3() {
        let xs = [0x0, 0x1, 0x2, 0x3];
        let ys = [0x2, 0x3, 0x0, 0x1];
        let zs = [0x1, 0x2, 0x3, 0x0];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
        let x_s2 = x_s2.shuffle2301();
        let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
        let x_s3 = x_s3.shuffle2301();
        assert_eq!(x_s2, s2.vec(ys));
        assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) });

        let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
        let x_s2 = x_s2.shuffle3012();
        let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
        let x_s3 = x_s3.shuffle3012();
        assert_eq!(x_s2, s2.vec(zs));
        assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) });

        let x_s2 = x_s2.shuffle1230();
        let x_s3 = x_s3.shuffle1230();
        assert_eq!(x_s2, s2.vec(xs));
        assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) });
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn test_shuffle64_s2_vs_s3() {
        let xs = [0x0, 0x1, 0x2, 0x3];
        let ys = [0x2, 0x3, 0x0, 0x1];
        let zs = [0x1, 0x2, 0x3, 0x0];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2: <SSE2 as Machine>::u64x4 = s2.vec(xs);
        let x_s2 = x_s2.shuffle2301();
        let x_s3: <SSSE3 as Machine>::u64x4 = s3.vec(xs);
        let x_s3 = x_s3.shuffle2301();
        assert_eq!(x_s2, s2.vec(ys));
        assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) });

        let x_s2: <SSE2 as Machine>::u64x4 = s2.vec(xs);
        let x_s2 = x_s2.shuffle3012();
        let x_s3: <SSSE3 as Machine>::u64x4 = s3.vec(xs);
        let x_s3 = x_s3.shuffle3012();
        assert_eq!(x_s2, s2.vec(zs));
        assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) });

        let x_s2 = x_s2.shuffle1230();
        let x_s3 = x_s3.shuffle1230();
        assert_eq!(x_s2, s2.vec(xs));
        assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) });
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn test_lanes_u32x4() {
        let xs = [0x1, 0x2, 0x3, 0x4];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };
        let s4 = unsafe { SSE41::instance() };

        let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
        let y_s2 = <SSE2 as Machine>::u32x4::from_lanes(xs);
        assert_eq!(x_s2, y_s2);
        assert_eq!(xs, y_s2.to_lanes());

        let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
        let y_s3 = <SSSE3 as Machine>::u32x4::from_lanes(xs);
        assert_eq!(x_s3, y_s3);
        assert_eq!(xs, y_s3.to_lanes());

        let x_s4: <SSE41 as Machine>::u32x4 = s4.vec(xs);
        let y_s4 = <SSE41 as Machine>::u32x4::from_lanes(xs);
        assert_eq!(x_s4, y_s4);
        assert_eq!(xs, y_s4.to_lanes());
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn test_lanes_u64x2() {
        let xs = [0x1, 0x2];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };
        let s4 = unsafe { SSE41::instance() };

        let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
        let y_s2 = <SSE2 as Machine>::u64x2::from_lanes(xs);
        assert_eq!(x_s2, y_s2);
        assert_eq!(xs, y_s2.to_lanes());

        let x_s3: <SSSE3 as Machine>::u64x2 = s3.vec(xs);
        let y_s3 = <SSSE3 as Machine>::u64x2::from_lanes(xs);
        assert_eq!(x_s3, y_s3);
        assert_eq!(xs, y_s3.to_lanes());

        let x_s4: <SSE41 as Machine>::u64x2 = s4.vec(xs);
        let y_s4 = <SSE41 as Machine>::u64x2::from_lanes(xs);
        assert_eq!(x_s4, y_s4);
        assert_eq!(xs, y_s4.to_lanes());
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn test_vec4_u32x4_s2() {
        let xs = [1, 2, 3, 4];
        let s2 = unsafe { SSE2::instance() };
        let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
        assert_eq!(x_s2.extract(0), 1);
        assert_eq!(x_s2.extract(1), 2);
        assert_eq!(x_s2.extract(2), 3);
        assert_eq!(x_s2.extract(3), 4);
        assert_eq!(x_s2.insert(0xf, 0), s2.vec([0xf, 2, 3, 4]));
        assert_eq!(x_s2.insert(0xf, 1), s2.vec([1, 0xf, 3, 4]));
        assert_eq!(x_s2.insert(0xf, 2), s2.vec([1, 2, 0xf, 4]));
        assert_eq!(x_s2.insert(0xf, 3), s2.vec([1, 2, 3, 0xf]));
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn test_vec4_u32x4_s4() {
        let xs = [1, 2, 3, 4];
        let s4 = unsafe { SSE41::instance() };
        let x_s4: <SSE41 as Machine>::u32x4 = s4.vec(xs);
        assert_eq!(x_s4.extract(0), 1);
        assert_eq!(x_s4.extract(1), 2);
        assert_eq!(x_s4.extract(2), 3);
        assert_eq!(x_s4.extract(3), 4);
        assert_eq!(x_s4.insert(0xf, 0), s4.vec([0xf, 2, 3, 4]));
        assert_eq!(x_s4.insert(0xf, 1), s4.vec([1, 0xf, 3, 4]));
        assert_eq!(x_s4.insert(0xf, 2), s4.vec([1, 2, 0xf, 4]));
        assert_eq!(x_s4.insert(0xf, 3), s4.vec([1, 2, 3, 0xf]));
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn test_vec2_u64x2_s2() {
        let xs = [0x1, 0x2];
        let s2 = unsafe { SSE2::instance() };
        let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
        assert_eq!(x_s2.extract(0), 1);
        assert_eq!(x_s2.extract(1), 2);
        assert_eq!(x_s2.insert(0xf, 0), s2.vec([0xf, 2]));
        assert_eq!(x_s2.insert(0xf, 1), s2.vec([1, 0xf]));
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn test_vec4_u64x2_s4() {
        let xs = [0x1, 0x2];
        let s4 = unsafe { SSE41::instance() };
        let x_s4: <SSE41 as Machine>::u64x2 = s4.vec(xs);
        assert_eq!(x_s4.extract(0), 1);
        assert_eq!(x_s4.extract(1), 2);
        assert_eq!(x_s4.insert(0xf, 0), s4.vec([0xf, 2]));
        assert_eq!(x_s4.insert(0xf, 1), s4.vec([1, 0xf]));
    }
}
pub mod avx2 {
    #![allow(non_camel_case_types)]
    use crate::soft::x4;
    use crate::types::*;
    use crate::x86_64::sse2::{u128x1_sse2, u32x4_sse2};
    use crate::x86_64::{vec256_storage, vec512_storage, Avx2Machine, YesS3, YesS4};
    use core::arch::x86_64::*;
    use core::marker::PhantomData;
    use core::ops::*;
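
    // u32x4x4_avx2 packs the four logical 128-bit lanes into two __m256i
    // registers, so 4-way-parallel 32-bit operations use full-width AVX2
    // instructions while exposing the same u32x4x4 interface as the SSE2 form.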
    #[derive(Copy, Clone)]
    pub struct u32x4x4_avx2<NI> {
        x: [__m256i; 2],
        ni: PhantomData<NI>,
    }

    impl<NI> u32x4x4_avx2<NI> {
        fn new(x: [__m256i; 2]) -> Self {
            Self { x, ni: PhantomData }
        }
    }
    impl<NI> u32x4x4<Avx2Machine<NI>> for u32x4x4_avx2<NI> where NI: Copy {}
    impl<NI> Store<vec512_storage> for u32x4x4_avx2<NI> {
        unsafe fn unpack(p: vec512_storage) -> Self {
            Self::new([p.avx[0].avx, p.avx[1].avx])
        }
    }
    impl<NI> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 4]> for u32x4x4_avx2<NI> {
        fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 4] {
            unsafe {
                [
                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 0)),
                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 1)),
                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 0)),
                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 1)),
                ]
            }
        }
        fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 4]) -> Self {
            Self::new(unsafe {
                [
                    _mm256_setr_m128i(x[0].x, x[1].x),
                    _mm256_setr_m128i(x[2].x, x[3].x),
                ]
            })
        }
    }
    impl<NI> Vec4<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> {
        fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> {
            unsafe {
                match i {
                    0 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 0)),
                    1 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 1)),
                    2 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 0)),
                    3 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 1)),
                    _ => unreachable!(),
                }
            }
        }
        fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self {
            Self::new(unsafe {
                match i {
                    0 => [_mm256_inserti128_si256(self.x[0], w.x, 0), self.x[1]],
                    1 => [_mm256_inserti128_si256(self.x[0], w.x, 1), self.x[1]],
                    2 => [self.x[0], _mm256_inserti128_si256(self.x[1], w.x, 0)],
                    3 => [self.x[0], _mm256_inserti128_si256(self.x[1], w.x, 1)],
                    _ => unreachable!(),
                }
            })
        }
    }
    impl<NI> LaneWords4 for u32x4x4_avx2<NI> {
        fn shuffle_lane_words1230(self) -> Self {
            Self::new(unsafe {
                [
                    _mm256_shuffle_epi32(self.x[0], 0b1001_0011),
                    _mm256_shuffle_epi32(self.x[1], 0b1001_0011),
                ]
            })
        }
        fn shuffle_lane_words2301(self) -> Self {
            Self::new(unsafe {
                [
                    _mm256_shuffle_epi32(self.x[0], 0b0100_1110),
                    _mm256_shuffle_epi32(self.x[1], 0b0100_1110),
                ]
            })
        }
        fn shuffle_lane_words3012(self) -> Self {
            Self::new(unsafe {
                [
                    _mm256_shuffle_epi32(self.x[0], 0b0011_1001),
                    _mm256_shuffle_epi32(self.x[1], 0b0011_1001),
                ]
            })
        }
    }
    impl<NI> BitOps32 for u32x4x4_avx2<NI> where NI: Copy {}
    impl<NI> ArithOps for u32x4x4_avx2<NI> where NI: Copy {}
    macro_rules! shuf_lane_bytes {
        ($name:ident, $k0:expr, $k1:expr) => {
            fn $name(self) -> Self {
                Self::new(unsafe {
                    [
                        _mm256_shuffle_epi8(
                            self.x[0],
                            _mm256_set_epi64x($k0, $k1, $k0, $k1),
                        ),
                        _mm256_shuffle_epi8(
                            self.x[1],
                            _mm256_set_epi64x($k0, $k1, $k0, $k1),
                        ),
                    ]
                })
            }
        };
    }
    macro_rules! rotr_32 {
        ($name:ident, $i:expr) => {
            fn $name(self) -> Self {
                Self::new(unsafe {
                    [
                        _mm256_or_si256(
                            _mm256_srli_epi32(self.x[0], $i as i32),
                            _mm256_slli_epi32(self.x[0], 32 - $i as i32),
                        ),
                        _mm256_or_si256(
                            _mm256_srli_epi32(self.x[1], $i as i32),
                            _mm256_slli_epi32(self.x[1], 32 - $i as i32),
                        ),
                    ]
                })
            }
        };
    }
    impl<NI: Copy> RotateEachWord32 for u32x4x4_avx2<NI> {
        rotr_32!(rotate_each_word_right7, 7);
        shuf_lane_bytes!(
            rotate_each_word_right8,
            0x0c0f0e0d_080b0a09,
            0x04070605_00030201
        );
        rotr_32!(rotate_each_word_right11, 11);
        rotr_32!(rotate_each_word_right12, 12);
        shuf_lane_bytes!(
            rotate_each_word_right16,
            0x0d0c0f0e_09080b0a,
            0x05040706_01000302
        );
        rotr_32!(rotate_each_word_right20, 20);
        shuf_lane_bytes!(
            rotate_each_word_right24,
            0x0e0d0c0f_0a09080b,
            0x06050407_02010003
        );
        rotr_32!(rotate_each_word_right25, 25);
    }
    impl<NI> BitOps0 for u32x4x4_avx2<NI> where NI: Copy {}
    impl<NI> From<u32x4x4_avx2<NI>> for vec512_storage {
        fn from(x: u32x4x4_avx2<NI>) -> Self {
            Self {
                avx: [
                    vec256_storage { avx: x.x[0] },
                    vec256_storage { avx: x.x[1] },
                ],
            }
        }
    }
    macro_rules! impl_assign {
        ($vec:ident, $Assign:ident, $assign_fn:ident, $bin_fn:ident) => {
            impl<NI> $Assign for $vec<NI>
            where
                NI: Copy,
            {
                fn $assign_fn(&mut self, rhs: Self) {
                    *self = self.$bin_fn(rhs);
                }
            }
        };
    }
    impl_assign!(u32x4x4_avx2, BitXorAssign, bitxor_assign, bitxor);
    impl_assign!(u32x4x4_avx2, BitOrAssign, bitor_assign, bitor);
    impl_assign!(u32x4x4_avx2, BitAndAssign, bitand_assign, bitand);
    impl_assign!(u32x4x4_avx2, AddAssign, add_assign, add);
    macro_rules! impl_bitop_x2 {
        ($vec:ident, $Op:ident, $op_fn:ident, $impl_fn:ident) => {
            impl<NI> $Op for $vec<NI> {
                type Output = Self;
                fn $op_fn(self, rhs: Self) -> Self::Output {
                    Self::new(unsafe {
                        [$impl_fn(self.x[0], rhs.x[0]), $impl_fn(self.x[1], rhs.x[1])]
                    })
                }
            }
        };
    }
    impl_bitop_x2!(u32x4x4_avx2, BitXor, bitxor, _mm256_xor_si256);
    impl_bitop_x2!(u32x4x4_avx2, BitOr, bitor, _mm256_or_si256);
    impl_bitop_x2!(u32x4x4_avx2, BitAnd, bitand, _mm256_and_si256);
    impl_bitop_x2!(u32x4x4_avx2, AndNot, andnot, _mm256_andnot_si256);
    impl_bitop_x2!(u32x4x4_avx2, Add, add, _mm256_add_epi32);
    impl<NI> Not for u32x4x4_avx2<NI> {
        type Output = Self;
        fn not(self) -> Self::Output {
            unsafe {
                let f = _mm256_set1_epi8(-0x7f);
                Self::new([f, f]) ^ self
            }
        }
    }

    impl<NI> BSwap for u32x4x4_avx2<NI> {
        shuf_lane_bytes!(bswap, 0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203);
    }
    impl<NI> From<x4<u128x1_sse2<YesS3, YesS4, NI>>> for u32x4x4_avx2<NI>
    where
        NI: Copy,
    {
        fn from(x: x4<u128x1_sse2<YesS3, YesS4, NI>>) -> Self {
            Self::new(unsafe {
                [
                    _mm256_setr_m128i(x.0[0].x, x.0[1].x),
                    _mm256_setr_m128i(x.0[2].x, x.0[3].x),
                ]
            })
        }
    }
}