use crate::soft::{x2, x4};
use crate::types::*;
use crate::vec128_storage;
use crate::x86_64::Avx2Machine;
use crate::x86_64::SseMachine as Machine86;
use crate::x86_64::{NoS3, NoS4, YesS3, YesS4};
use core::arch::x86_64::*;
use core::marker::PhantomData;
use core::ops::{
    Add, AddAssign, BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign, Not,
};
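
// The S3, S4 and NI type parameters track instruction-set availability at the
// type level: S3 is YesS3/NoS3 for SSSE3, S4 is YesS4/NoS4 for SSE4.1, and NI
// (presumably AES-NI) is carried through unchanged, so each wrapper type can
// select a different instruction sequence per feature level.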
macro_rules! impl_binop {
    ($vec:ident, $trait:ident, $fn:ident, $impl_fn:ident) => {
        impl<S3, S4, NI> $trait for $vec<S3, S4, NI> {
            type Output = Self;
            fn $fn(self, rhs: Self) -> Self::Output {
                Self::new(unsafe { $impl_fn(self.x, rhs.x) })
            }
        }
    };
}
macro_rules! impl_binop_assign {
    ($vec:ident, $trait:ident, $fn_assign:ident, $fn:ident) => {
        impl<S3, S4, NI> $trait for $vec<S3, S4, NI>
        where
            $vec<S3, S4, NI>: Copy,
        {
            fn $fn_assign(&mut self, rhs: Self) {
                *self = self.$fn(rhs);
            }
        }
    };
}
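
// def_vec generates one SSE2-backed 128-bit vector type wrapping a raw __m128i,
// together with its storage conversions, byte-level load/store, Default, Not,
// and the bitwise-operator impls shared by u32x4/u64x2/u128x1 below.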
macro_rules! def_vec {
    ($vec:ident, $word:ident) => {
        #[allow(non_camel_case_types)]
        #[derive(Copy, Clone)]
        pub struct $vec<S3, S4, NI> {
            x: __m128i,
            s3: PhantomData<S3>,
            s4: PhantomData<S4>,
            ni: PhantomData<NI>,
        }

        impl<S3, S4, NI> Store<vec128_storage> for $vec<S3, S4, NI> {
            unsafe fn unpack(x: vec128_storage) -> Self {
                Self::new(x.sse2)
            }
        }
        impl<S3, S4, NI> From<$vec<S3, S4, NI>> for vec128_storage {
            fn from(x: $vec<S3, S4, NI>) -> Self {
                vec128_storage { sse2: x.x }
            }
        }
        impl<S3, S4, NI> $vec<S3, S4, NI> {
            fn new(x: __m128i) -> Self {
                $vec {
                    x,
                    s3: PhantomData,
                    s4: PhantomData,
                    ni: PhantomData,
                }
            }
        }

        impl<S3, S4, NI> StoreBytes for $vec<S3, S4, NI>
        where
            Self: BSwap,
        {
            unsafe fn unsafe_read_le(input: &[u8]) -> Self {
                assert_eq!(input.len(), 16);
                Self::new(_mm_loadu_si128(input.as_ptr() as *const _))
            }
            unsafe fn unsafe_read_be(input: &[u8]) -> Self {
                assert_eq!(input.len(), 16);
                Self::new(_mm_loadu_si128(input.as_ptr() as *const _)).bswap()
            }
            fn write_le(self, out: &mut [u8]) {
                assert_eq!(out.len(), 16);
                unsafe { _mm_storeu_si128(out.as_mut_ptr() as *mut _, self.x) }
            }
            fn write_be(self, out: &mut [u8]) {
                assert_eq!(out.len(), 16);
                let x = self.bswap().x;
                unsafe {
                    _mm_storeu_si128(out.as_mut_ptr() as *mut _, x);
                }
            }
        }

        impl<S3, S4, NI> Default for $vec<S3, S4, NI> {
            fn default() -> Self {
                Self::new(unsafe { _mm_setzero_si128() })
            }
        }

        impl<S3, S4, NI> Not for $vec<S3, S4, NI> {
            type Output = Self;
            fn not(self) -> Self::Output {
                unsafe {
                    // xor with an all-ones mask implements bitwise NOT
                    let ff = _mm_set1_epi64x(-1i64);
                    self ^ Self::new(ff)
                }
            }
        }

        impl<S3: Copy, S4: Copy, NI: Copy> BitOps0 for $vec<S3, S4, NI> {}
        impl_binop!($vec, BitAnd, bitand, _mm_and_si128);
        impl_binop!($vec, BitOr, bitor, _mm_or_si128);
        impl_binop!($vec, BitXor, bitxor, _mm_xor_si128);
        impl_binop_assign!($vec, BitAndAssign, bitand_assign, bitand);
        impl_binop_assign!($vec, BitOrAssign, bitor_assign, bitor);
        impl_binop_assign!($vec, BitXorAssign, bitxor_assign, bitxor);
        impl<S3: Copy, S4: Copy, NI: Copy> AndNot for $vec<S3, S4, NI> {
            type Output = Self;
            fn andnot(self, rhs: Self) -> Self {
                Self::new(unsafe { _mm_andnot_si128(self.x, rhs.x) })
            }
        }
    };
}
macro_rules! impl_bitops32 {
    ($vec:ident) => {
        impl<S3: Copy, S4: Copy, NI: Copy> BitOps32 for $vec<S3, S4, NI> where
            $vec<S3, S4, NI>: RotateEachWord32
        {
        }
    };
}

macro_rules! impl_bitops64 {
    ($vec:ident) => {
        impl_bitops32!($vec);
        impl<S3: Copy, S4: Copy, NI: Copy> BitOps64 for $vec<S3, S4, NI> where
            $vec<S3, S4, NI>: RotateEachWord64 + RotateEachWord32
        {
        }
    };
}

macro_rules! impl_bitops128 {
    ($vec:ident) => {
        impl_bitops64!($vec);
        impl<S3: Copy, S4: Copy, NI: Copy> BitOps128 for $vec<S3, S4, NI> where
            $vec<S3, S4, NI>: RotateEachWord128
        {
        }
    };
}
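
// Per-word rotations come in two flavours: on SSSE3 (YesS3), rotations by a
// multiple of 8 bits are a single pshufb byte shuffle, while the generic form
// uses the usual (x >> n) | (x << (width - n)) shift-and-or sequence.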
macro_rules! rotr_32_s3 {
    ($name:ident, $k0:expr, $k1:expr) => {
        fn $name(self) -> Self {
            Self::new(unsafe {
                _mm_shuffle_epi8(
                    self.x,
                    _mm_set_epi64x($k0, $k1),
                )
            })
        }
    };
}
macro_rules! rotr_32 {
    ($name:ident, $i:expr) => {
        fn $name(self) -> Self {
            Self::new(unsafe {
                _mm_or_si128(
                    _mm_srli_epi32(self.x, $i as i32),
                    _mm_slli_epi32(self.x, 32 - $i as i32),
                )
            })
        }
    };
}
impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<YesS3, S4, NI> {
    rotr_32!(rotate_each_word_right7, 7);
    rotr_32_s3!(
        rotate_each_word_right8,
        0x0c0f_0e0d_080b_0a09,
        0x0407_0605_0003_0201
    );
    rotr_32!(rotate_each_word_right11, 11);
    rotr_32!(rotate_each_word_right12, 12);
    rotr_32_s3!(
        rotate_each_word_right16,
        0x0d0c_0f0e_0908_0b0a,
        0x0504_0706_0100_0302
    );
    rotr_32!(rotate_each_word_right20, 20);
    rotr_32_s3!(
        rotate_each_word_right24,
        0x0e0d_0c0f_0a09_080b,
        0x0605_0407_0201_0003
    );
    rotr_32!(rotate_each_word_right25, 25);
}
impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<NoS3, S4, NI> {
    rotr_32!(rotate_each_word_right7, 7);
    rotr_32!(rotate_each_word_right8, 8);
    rotr_32!(rotate_each_word_right11, 11);
    rotr_32!(rotate_each_word_right12, 12);
    fn rotate_each_word_right16(self) -> Self {
        Self::new(swap16_s2(self.x))
    }
    rotr_32!(rotate_each_word_right20, 20);
    rotr_32!(rotate_each_word_right24, 24);
    rotr_32!(rotate_each_word_right25, 25);
}
macro_rules! rotr_64_s3 {
    ($name:ident, $k0:expr, $k1:expr) => {
        fn $name(self) -> Self {
            Self::new(unsafe {
                _mm_shuffle_epi8(
                    self.x,
                    _mm_set_epi64x($k0, $k1),
                )
            })
        }
    };
}
macro_rules! rotr_64 {
    ($name:ident, $i:expr) => {
        fn $name(self) -> Self {
            Self::new(unsafe {
                _mm_or_si128(
                    _mm_srli_epi64(self.x, $i as i32),
                    _mm_slli_epi64(self.x, 64 - $i as i32),
                )
            })
        }
    };
}
impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<YesS3, S4, NI> {
    rotr_64!(rotate_each_word_right7, 7);
    rotr_64_s3!(
        rotate_each_word_right8,
        0x080f_0e0d_0c0b_0a09,
        0x0007_0605_0403_0201
    );
    rotr_64!(rotate_each_word_right11, 11);
    rotr_64!(rotate_each_word_right12, 12);
    rotr_64_s3!(
        rotate_each_word_right16,
        0x0908_0f0e_0d0c_0b0a,
        0x0100_0706_0504_0302
    );
    rotr_64!(rotate_each_word_right20, 20);
    rotr_64_s3!(
        rotate_each_word_right24,
        0x0a09_080f_0e0d_0c0b,
        0x0201_0007_0605_0403
    );
    rotr_64!(rotate_each_word_right25, 25);
}
impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<NoS3, S4, NI> {
    rotr_64!(rotate_each_word_right7, 7);
    rotr_64!(rotate_each_word_right8, 8);
    rotr_64!(rotate_each_word_right11, 11);
    rotr_64!(rotate_each_word_right12, 12);
    fn rotate_each_word_right16(self) -> Self {
        Self::new(swap16_s2(self.x))
    }
    rotr_64!(rotate_each_word_right20, 20);
    rotr_64!(rotate_each_word_right24, 24);
    rotr_64!(rotate_each_word_right25, 25);
}
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u64x2_sse2<S3, S4, NI> {
    fn rotate_each_word_right32(self) -> Self {
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b10110001) })
    }
}
macro_rules! rotr_128 {
    ($name:ident, $i:expr) => {
        fn $name(self) -> Self {
            Self::new(unsafe {
                _mm_or_si128(
                    _mm_srli_si128(self.x, $i as i32),
                    _mm_slli_si128(self.x, 128 - $i as i32),
                )
            })
        }
    };
}
// TODO: completely unoptimized
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord32 for u128x1_sse2<S3, S4, NI> {
    rotr_128!(rotate_each_word_right7, 7);
    rotr_128!(rotate_each_word_right8, 8);
    rotr_128!(rotate_each_word_right11, 11);
    rotr_128!(rotate_each_word_right12, 12);
    rotr_128!(rotate_each_word_right16, 16);
    rotr_128!(rotate_each_word_right20, 20);
    rotr_128!(rotate_each_word_right24, 24);
    rotr_128!(rotate_each_word_right25, 25);
}
// TODO: completely unoptimized
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u128x1_sse2<S3, S4, NI> {
    rotr_128!(rotate_each_word_right32, 32);
}
impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord128 for u128x1_sse2<S3, S4, NI> {}

def_vec!(u32x4_sse2, u32);
def_vec!(u64x2_sse2, u64);
def_vec!(u128x1_sse2, u128);
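
// Lane access: with SSE4.1 (YesS4) lanes are moved directly with pextr/pinsr;
// the plain-SSE2 (NoS4) impls below emulate the same operations with moves,
// shifts and shuffles.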
impl<S3, NI> MultiLane<[u32; 4]> for u32x4_sse2<S3, YesS4, NI> {
    fn to_lanes(self) -> [u32; 4] {
        unsafe {
            let x = _mm_cvtsi128_si64(self.x) as u64;
            let y = _mm_extract_epi64(self.x, 1) as u64;
            [x as u32, (x >> 32) as u32, y as u32, (y >> 32) as u32]
        }
    }
    fn from_lanes(xs: [u32; 4]) -> Self {
        unsafe {
            let mut x = _mm_cvtsi64_si128((xs[0] as u64 | ((xs[1] as u64) << 32)) as i64);
            x = _mm_insert_epi64(x, (xs[2] as u64 | ((xs[3] as u64) << 32)) as i64, 1);
            Self::new(x)
        }
    }
}
impl<S3, NI> MultiLane<[u32; 4]> for u32x4_sse2<S3, NoS4, NI> {
    fn to_lanes(self) -> [u32; 4] {
        unsafe {
            let x = _mm_cvtsi128_si64(self.x) as u64;
            let y = _mm_cvtsi128_si64(_mm_shuffle_epi32(self.x, 0b11101110)) as u64;
            [x as u32, (x >> 32) as u32, y as u32, (y >> 32) as u32]
        }
    }
    fn from_lanes(xs: [u32; 4]) -> Self {
        unsafe {
            let x = (xs[0] as u64 | ((xs[1] as u64) << 32)) as i64;
            let y = (xs[2] as u64 | ((xs[3] as u64) << 32)) as i64;
            let x = _mm_cvtsi64_si128(x);
            let y = _mm_slli_si128(_mm_cvtsi64_si128(y), 8);
            Self::new(_mm_or_si128(x, y))
        }
    }
}
impl<S3, NI> MultiLane<[u64; 2]> for u64x2_sse2<S3, YesS4, NI> {
    fn to_lanes(self) -> [u64; 2] {
        unsafe {
            [
                _mm_cvtsi128_si64(self.x) as u64,
                _mm_extract_epi64(self.x, 1) as u64,
            ]
        }
    }
    fn from_lanes(xs: [u64; 2]) -> Self {
        unsafe {
            let mut x = _mm_cvtsi64_si128(xs[0] as i64);
            x = _mm_insert_epi64(x, xs[1] as i64, 1);
            Self::new(x)
        }
    }
}
impl<S3, NI> MultiLane<[u64; 2]> for u64x2_sse2<S3, NoS4, NI> {
    fn to_lanes(self) -> [u64; 2] {
        unsafe {
            [
                _mm_cvtsi128_si64(self.x) as u64,
                _mm_cvtsi128_si64(_mm_srli_si128(self.x, 8)) as u64,
            ]
        }
    }
    fn from_lanes(xs: [u64; 2]) -> Self {
        unsafe {
            let x = _mm_cvtsi64_si128(xs[0] as i64);
            let y = _mm_slli_si128(_mm_cvtsi64_si128(xs[1] as i64), 8);
            Self::new(_mm_or_si128(x, y))
        }
    }
}
impl<S3, S4, NI> MultiLane<[u128; 1]> for u128x1_sse2<S3, S4, NI> {
    fn to_lanes(self) -> [u128; 1] {
        unimplemented!()
    }
    fn from_lanes(xs: [u128; 1]) -> Self {
        unimplemented!()
    }
}
impl<S3, S4, NI> MultiLane<[u64; 4]> for u64x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: MultiLane<[u64; 2]> + Copy,
{
    fn to_lanes(self) -> [u64; 4] {
        let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes());
        [a[0], a[1], b[0], b[1]]
    }
    fn from_lanes(xs: [u64; 4]) -> Self {
        x2::new([
            u64x2_sse2::from_lanes([xs[0], xs[1]]),
            u64x2_sse2::from_lanes([xs[2], xs[3]]),
        ])
    }
}
macro_rules! impl_into {
    ($from:ident, $to:ident) => {
        impl<S3, S4, NI> From<$from<S3, S4, NI>> for $to<S3, S4, NI> {
            fn from(x: $from<S3, S4, NI>) -> Self {
                $to::new(x.x)
            }
        }
    };
}

impl_into!(u128x1_sse2, u32x4_sse2);
impl_into!(u128x1_sse2, u64x2_sse2);

impl_bitops32!(u32x4_sse2);
impl_bitops64!(u64x2_sse2);
impl_bitops128!(u128x1_sse2);
impl<S3: Copy, S4: Copy, NI: Copy> ArithOps for u32x4_sse2<S3, S4, NI> where
    u32x4_sse2<S3, S4, NI>: BSwap
{
}
impl<S3: Copy, S4: Copy, NI: Copy> ArithOps for u64x2_sse2<S3, S4, NI> where
    u64x2_sse2<S3, S4, NI>: BSwap
{
}
impl_binop!(u32x4_sse2, Add, add, _mm_add_epi32);
impl_binop!(u64x2_sse2, Add, add, _mm_add_epi64);
impl_binop_assign!(u32x4_sse2, AddAssign, add_assign, add);
impl_binop_assign!(u64x2_sse2, AddAssign, add_assign, add);
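
// The impls below are empty marker impls: they assert that the SSE2-backed types
// satisfy the machine-level vector traits (u32x4, u64x2, ...) for the generic
// SSE machine and, when SSSE3/SSE4.1 are available, for the AVX2 machine; all
// behaviour comes from the super-traits listed in the where clauses.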
impl<S3: Copy, S4: Copy, NI: Copy> u32x4<Machine86<S3, S4, NI>> for u32x4_sse2<S3, S4, NI>
where
    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap + MultiLane<[u32; 4]> + Vec4<u32>,
    Machine86<S3, S4, NI>: Machine,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x2<Machine86<S3, S4, NI>> for u64x2_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>:
        RotateEachWord64 + RotateEachWord32 + BSwap + MultiLane<[u64; 2]> + Vec2<u64>,
    Machine86<S3, S4, NI>: Machine,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u128x1<Machine86<S3, S4, NI>> for u128x1_sse2<S3, S4, NI>
where
    u128x1_sse2<S3, S4, NI>: Swap64 + RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u128x1_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4>,
    u128x1_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2>,
{
}

impl<NI: Copy> u32x4<Avx2Machine<NI>> for u32x4_sse2<YesS3, YesS4, NI>
where
    u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap + MultiLane<[u32; 4]> + Vec4<u32>,
    Machine86<YesS3, YesS4, NI>: Machine,
{
}
impl<NI: Copy> u64x2<Avx2Machine<NI>> for u64x2_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>:
        RotateEachWord64 + RotateEachWord32 + BSwap + MultiLane<[u64; 2]> + Vec2<u64>,
    Machine86<YesS3, YesS4, NI>: Machine,
{
}
impl<NI: Copy> u128x1<Avx2Machine<NI>> for u128x1_sse2<YesS3, YesS4, NI>
where
    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<YesS3, YesS4, NI>: Machine,
    u128x1_sse2<YesS3, YesS4, NI>: Into<<Machine86<YesS3, YesS4, NI> as Machine>::u32x4>,
    u128x1_sse2<YesS3, YesS4, NI>: Into<<Machine86<YesS3, YesS4, NI> as Machine>::u64x2>,
{
}
impl<S3, S4, NI> UnsafeFrom<[u32; 4]> for u32x4_sse2<S3, S4, NI> {
    unsafe fn unsafe_from(xs: [u32; 4]) -> Self {
        Self::new(_mm_set_epi32(
            xs[3] as i32,
            xs[2] as i32,
            xs[1] as i32,
            xs[0] as i32,
        ))
    }
}
impl<S3, NI> Vec4<u32> for u32x4_sse2<S3, YesS4, NI>
where
    Self: MultiLane<[u32; 4]>,
{
    fn extract(self, i: u32) -> u32 {
        self.to_lanes()[i as usize]
    }
    fn insert(self, v: u32, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => _mm_insert_epi32(self.x, v as i32, 0),
                1 => _mm_insert_epi32(self.x, v as i32, 1),
                2 => _mm_insert_epi32(self.x, v as i32, 2),
                3 => _mm_insert_epi32(self.x, v as i32, 3),
                _ => unreachable!(),
            }
        })
    }
}
impl<S3, NI> Vec4<u32> for u32x4_sse2<S3, NoS4, NI>
where
    Self: MultiLane<[u32; 4]>,
{
    fn extract(self, i: u32) -> u32 {
        self.to_lanes()[i as usize]
    }
    fn insert(self, v: u32, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => {
                    let x = _mm_andnot_si128(_mm_cvtsi32_si128(-1), self.x);
                    _mm_or_si128(x, _mm_cvtsi32_si128(v as i32))
                }
                1 => {
                    let mut x = _mm_shuffle_epi32(self.x, 0b0111_1000);
                    x = _mm_slli_si128(x, 4);
                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
                    _mm_shuffle_epi32(x, 0b1110_0001)
                }
                2 => {
                    let mut x = _mm_shuffle_epi32(self.x, 0b1011_0100);
                    x = _mm_slli_si128(x, 4);
                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
                    _mm_shuffle_epi32(x, 0b1100_1001)
                }
                3 => {
                    let mut x = _mm_slli_si128(self.x, 4);
                    x = _mm_or_si128(x, _mm_cvtsi32_si128(v as i32));
                    _mm_shuffle_epi32(x, 0b0011_1001)
                }
                _ => unreachable!(),
            }
        })
    }
}
impl<S3, S4, NI> LaneWords4 for u32x4_sse2<S3, S4, NI> {
    fn shuffle_lane_words2301(self) -> Self {
        self.shuffle2301()
    }
    fn shuffle_lane_words1230(self) -> Self {
        self.shuffle1230()
    }
    fn shuffle_lane_words3012(self) -> Self {
        self.shuffle3012()
    }
}

impl<S3, S4, NI> Words4 for u32x4_sse2<S3, S4, NI> {
    fn shuffle2301(self) -> Self {
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
    }
    fn shuffle1230(self) -> Self {
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b1001_0011) })
    }
    fn shuffle3012(self) -> Self {
        Self::new(unsafe { _mm_shuffle_epi32(self.x, 0b0011_1001) })
    }
}
impl<S4, NI> Words4 for u64x4_sse2<YesS3, S4, NI> {
    fn shuffle2301(self) -> Self {
        x2::new([u64x2_sse2::new(self.0[1].x), u64x2_sse2::new(self.0[0].x)])
    }
    fn shuffle3012(self) -> Self {
        unsafe {
            x2::new([
                u64x2_sse2::new(_mm_alignr_epi8(self.0[1].x, self.0[0].x, 8)),
                u64x2_sse2::new(_mm_alignr_epi8(self.0[0].x, self.0[1].x, 8)),
            ])
        }
    }
    fn shuffle1230(self) -> Self {
        unsafe {
            x2::new([
                u64x2_sse2::new(_mm_alignr_epi8(self.0[0].x, self.0[1].x, 8)),
                u64x2_sse2::new(_mm_alignr_epi8(self.0[1].x, self.0[0].x, 8)),
            ])
        }
    }
}
impl<S4, NI> Words4 for u64x4_sse2<NoS3, S4, NI> {
    fn shuffle2301(self) -> Self {
        x2::new([u64x2_sse2::new(self.0[1].x), u64x2_sse2::new(self.0[0].x)])
    }
    fn shuffle3012(self) -> Self {
        unsafe {
            let a = _mm_srli_si128(self.0[0].x, 8);
            let b = _mm_slli_si128(self.0[0].x, 8);
            let c = _mm_srli_si128(self.0[1].x, 8);
            let d = _mm_slli_si128(self.0[1].x, 8);
            let da = _mm_or_si128(d, a);
            let bc = _mm_or_si128(b, c);
            x2::new([u64x2_sse2::new(da), u64x2_sse2::new(bc)])
        }
    }
    fn shuffle1230(self) -> Self {
        unsafe {
            let a = _mm_srli_si128(self.0[0].x, 8);
            let b = _mm_slli_si128(self.0[0].x, 8);
            let c = _mm_srli_si128(self.0[1].x, 8);
            let d = _mm_slli_si128(self.0[1].x, 8);
            let da = _mm_or_si128(d, a);
            let bc = _mm_or_si128(b, c);
            x2::new([u64x2_sse2::new(bc), u64x2_sse2::new(da)])
        }
    }
}
impl<S3, S4, NI> UnsafeFrom<[u64; 2]> for u64x2_sse2<S3, S4, NI> {
    unsafe fn unsafe_from(xs: [u64; 2]) -> Self {
        Self::new(_mm_set_epi64x(xs[1] as i64, xs[0] as i64))
    }
}

impl<S3, NI> Vec2<u64> for u64x2_sse2<S3, YesS4, NI> {
    fn extract(self, i: u32) -> u64 {
        unsafe {
            match i {
                0 => _mm_cvtsi128_si64(self.x) as u64,
                1 => _mm_extract_epi64(self.x, 1) as u64,
                _ => unreachable!(),
            }
        }
    }
    fn insert(self, x: u64, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => _mm_insert_epi64(self.x, x as i64, 0),
                1 => _mm_insert_epi64(self.x, x as i64, 1),
                _ => unreachable!(),
            }
        })
    }
}
impl<S3, NI> Vec2<u64> for u64x2_sse2<S3, NoS4, NI> {
    fn extract(self, i: u32) -> u64 {
        unsafe {
            match i {
                0 => _mm_cvtsi128_si64(self.x) as u64,
                1 => _mm_cvtsi128_si64(_mm_shuffle_epi32(self.x, 0b11101110)) as u64,
                _ => unreachable!(),
            }
        }
    }
    fn insert(self, x: u64, i: u32) -> Self {
        Self::new(unsafe {
            match i {
                0 => _mm_or_si128(
                    _mm_andnot_si128(_mm_cvtsi64_si128(-1), self.x),
                    _mm_cvtsi64_si128(x as i64),
                ),
                1 => _mm_or_si128(
                    _mm_move_epi64(self.x),
                    _mm_slli_si128(_mm_cvtsi64_si128(x as i64), 8),
                ),
                _ => unreachable!(),
            }
        })
    }
}
impl<S4, NI> BSwap for u32x4_sse2<YesS3, S4, NI> {
    fn bswap(self) -> Self {
        Self::new(unsafe {
            let k = _mm_set_epi64x(0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}

fn bswap32_s2(x: __m128i) -> __m128i {
    unsafe {
        let mut y = _mm_unpacklo_epi8(x, _mm_setzero_si128());
        y = _mm_shufflehi_epi16(y, 0b0001_1011);
        y = _mm_shufflelo_epi16(y, 0b0001_1011);
        let mut z = _mm_unpackhi_epi8(x, _mm_setzero_si128());
        z = _mm_shufflehi_epi16(z, 0b0001_1011);
        z = _mm_shufflelo_epi16(z, 0b0001_1011);
        _mm_packus_epi16(y, z)
    }
}
impl<S4, NI> BSwap for u32x4_sse2<NoS3, S4, NI> {
    fn bswap(self) -> Self {
        Self::new(bswap32_s2(self.x))
    }
}

impl<S4, NI> BSwap for u64x2_sse2<YesS3, S4, NI> {
    fn bswap(self) -> Self {
        Self::new(unsafe {
            let k = _mm_set_epi64x(0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}
impl<S4, NI> BSwap for u64x2_sse2<NoS3, S4, NI> {
    fn bswap(self) -> Self {
        Self::new(unsafe { bswap32_s2(_mm_shuffle_epi32(self.x, 0b1011_0001)) })
    }
}

impl<S4, NI> BSwap for u128x1_sse2<YesS3, S4, NI> {
    fn bswap(self) -> Self {
        Self::new(unsafe {
            let k = _mm_set_epi64x(0x0f0e_0d0c_0b0a_0908, 0x0706_0504_0302_0100);
            _mm_shuffle_epi8(self.x, k)
        })
    }
}
impl<S4, NI> BSwap for u128x1_sse2<NoS3, S4, NI> {
    fn bswap(self) -> Self {
        Self::new(unsafe { unimplemented!() })
    }
}
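
// swapi swaps adjacent $i-bit groups using a mask/shift/or sequence (the 0xaa,
// 0xcc and 0xf0 masks select the high half of each 2-, 4- and 8-bit group);
// swap16_s2 swaps adjacent 16-bit units with pshuflw/pshufhw, needing only SSE2.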
macro_rules! swapi {
    ($x:expr, $i:expr, $k:expr) => {
        unsafe {
            const K: u8 = $k;
            let k = _mm_set1_epi8(K as i8);
            u128x1_sse2::new(_mm_or_si128(
                _mm_srli_epi16(_mm_and_si128($x.x, k), $i),
                _mm_and_si128(_mm_slli_epi16($x.x, $i), k),
            ))
        }
    };
}

fn swap16_s2(x: __m128i) -> __m128i {
    unsafe { _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, 0b1011_0001), 0b1011_0001) }
}
impl<S4, NI> Swap64 for u128x1_sse2<YesS3, S4, NI> {
    fn swap1(self) -> Self {
        swapi!(self, 1, 0xaa)
    }
    fn swap2(self) -> Self {
        swapi!(self, 2, 0xcc)
    }
    fn swap4(self) -> Self {
        swapi!(self, 4, 0xf0)
    }
    fn swap8(self) -> Self {
        u128x1_sse2::new(unsafe {
            let k = _mm_set_epi64x(0x0e0f_0c0d_0a0b_0809, 0x0607_0405_0203_0001);
            _mm_shuffle_epi8(self.x, k)
        })
    }
    fn swap16(self) -> Self {
        u128x1_sse2::new(unsafe {
            let k = _mm_set_epi64x(0x0d0c_0f0e_0908_0b0a, 0x0504_0706_0100_0302);
            _mm_shuffle_epi8(self.x, k)
        })
    }
    fn swap32(self) -> Self {
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) })
    }
    fn swap64(self) -> Self {
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
    }
}
impl<S4, NI> Swap64 for u128x1_sse2<NoS3, S4, NI> {
    fn swap1(self) -> Self {
        swapi!(self, 1, 0xaa)
    }
    fn swap2(self) -> Self {
        swapi!(self, 2, 0xcc)
    }
    fn swap4(self) -> Self {
        swapi!(self, 4, 0xf0)
    }
    fn swap8(self) -> Self {
        u128x1_sse2::new(unsafe {
            _mm_or_si128(_mm_slli_epi16(self.x, 8), _mm_srli_epi16(self.x, 8))
        })
    }
    fn swap16(self) -> Self {
        u128x1_sse2::new(swap16_s2(self.x))
    }
    fn swap32(self) -> Self {
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b1011_0001) })
    }
    fn swap64(self) -> Self {
        u128x1_sse2::new(unsafe { _mm_shuffle_epi32(self.x, 0b0100_1110) })
    }
}
#[derive(Copy, Clone)]
pub struct G0;
#[derive(Copy, Clone)]
pub struct G1;

#[allow(non_camel_case_types)]
pub type u32x4x2_sse2<S3, S4, NI> = x2<u32x4_sse2<S3, S4, NI>, G0>;
#[allow(non_camel_case_types)]
pub type u64x2x2_sse2<S3, S4, NI> = x2<u64x2_sse2<S3, S4, NI>, G0>;
#[allow(non_camel_case_types)]
pub type u64x4_sse2<S3, S4, NI> = x2<u64x2_sse2<S3, S4, NI>, G1>;
#[allow(non_camel_case_types)]
pub type u128x2_sse2<S3, S4, NI> = x2<u128x1_sse2<S3, S4, NI>, G0>;

#[allow(non_camel_case_types)]
pub type u32x4x4_sse2<S3, S4, NI> = x4<u32x4_sse2<S3, S4, NI>>;
#[allow(non_camel_case_types)]
pub type u64x2x4_sse2<S3, S4, NI> = x4<u64x2_sse2<S3, S4, NI>>;
#[allow(non_camel_case_types)]
pub type u128x4_sse2<S3, S4, NI> = x4<u128x1_sse2<S3, S4, NI>>;
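
// The 256/512-bit logical types are built from the generic x2/x4 wrappers in
// crate::soft, so each half or quarter is still one SSE2 register. The G0/G1
// markers only keep otherwise-identical x2 instantiations (u64x2x2 vs u64x4)
// distinct so they can carry different trait impls.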
impl<S3: Copy, S4: Copy, NI: Copy> u32x4x2<Machine86<S3, S4, NI>> for u32x4x2_sse2<S3, S4, NI>
where
    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u32x4x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 2]>,
    u32x4x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u32x4>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x2x2<Machine86<S3, S4, NI>> for u64x2x2_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u64x2x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u64x2; 2]>,
    u64x2x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u64x2>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x4<Machine86<S3, S4, NI>> for u64x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u64x4_sse2<S3, S4, NI>: MultiLane<[u64; 4]> + Vec4<u64> + Words4,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u128x2<Machine86<S3, S4, NI>> for u128x2_sse2<S3, S4, NI>
where
    u128x1_sse2<S3, S4, NI>: Swap64 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u128x2_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u128x1; 2]>,
    u128x2_sse2<S3, S4, NI>: Vec2<<Machine86<S3, S4, NI> as Machine>::u128x1>,
    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4x2>,
    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2x2>,
    u128x2_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x4>,
{
}
impl<NI: Copy> u32x4x2<Avx2Machine<NI>> for u32x4x2_sse2<YesS3, YesS4, NI>
where
    u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u32x4x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u32x4; 2]>,
    u32x4x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u32x4>,
{
}
impl<NI: Copy> u64x2x2<Avx2Machine<NI>> for u64x2x2_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u64x2x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u64x2; 2]>,
    u64x2x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u64x2>,
{
}
impl<NI: Copy> u64x4<Avx2Machine<NI>> for u64x4_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u64x4_sse2<YesS3, YesS4, NI>: MultiLane<[u64; 4]> + Vec4<u64> + Words4,
{
}
impl<NI: Copy> u128x2<Avx2Machine<NI>> for u128x2_sse2<YesS3, YesS4, NI>
where
    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + BSwap,
    Avx2Machine<NI>: Machine,
    u128x2_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u128x1; 2]>,
    u128x2_sse2<YesS3, YesS4, NI>: Vec2<<Avx2Machine<NI> as Machine>::u128x1>,
    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u32x4x2>,
    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x2x2>,
    u128x2_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x4>,
{
}
impl<S3, S4, NI> Vec4<u64> for u64x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: Copy + Vec2<u64>,
{
    fn extract(self, i: u32) -> u64 {
        match i {
            0 => self.0[0].extract(0),
            1 => self.0[0].extract(1),
            2 => self.0[1].extract(0),
            3 => self.0[1].extract(1),
            _ => unreachable!(),
        }
    }
    fn insert(mut self, w: u64, i: u32) -> Self {
        match i {
            0 => self.0[0] = self.0[0].insert(w, 0),
            1 => self.0[0] = self.0[0].insert(w, 1),
            2 => self.0[1] = self.0[1].insert(w, 0),
            3 => self.0[1] = self.0[1].insert(w, 1),
            _ => unreachable!(),
        }
        self
    }
}
impl<S3: Copy, S4: Copy, NI: Copy> u32x4x4<Machine86<S3, S4, NI>> for u32x4x4_sse2<S3, S4, NI>
where
    u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u32x4x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 4]>,
    u32x4x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u32x4>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u64x2x4<Machine86<S3, S4, NI>> for u64x2x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u64x2x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u64x2; 4]>,
    u64x2x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u64x2>,
{
}
impl<S3: Copy, S4: Copy, NI: Copy> u128x4<Machine86<S3, S4, NI>> for u128x4_sse2<S3, S4, NI>
where
    u128x1_sse2<S3, S4, NI>: Swap64 + BSwap,
    Machine86<S3, S4, NI>: Machine,
    u128x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u128x1; 4]>,
    u128x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u128x1>,
    u128x4_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u32x4x4>,
    u128x4_sse2<S3, S4, NI>: Into<<Machine86<S3, S4, NI> as Machine>::u64x2x4>,
{
}
impl<NI: Copy> u32x4x4<Avx2Machine<NI>> for u32x4x4_sse2<YesS3, YesS4, NI>
where
    u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u32x4x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u32x4; 4]>,
    u32x4x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u32x4>,
{
}
impl<NI: Copy> u64x2x4<Avx2Machine<NI>> for u64x2x4_sse2<YesS3, YesS4, NI>
where
    u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
    Avx2Machine<NI>: Machine,
    u64x2x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u64x2; 4]>,
    u64x2x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u64x2>,
{
}
impl<NI: Copy> u128x4<Avx2Machine<NI>> for u128x4_sse2<YesS3, YesS4, NI>
where
    u128x1_sse2<YesS3, YesS4, NI>: Swap64 + BSwap,
    Avx2Machine<NI>: Machine,
    u128x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u128x1; 4]>,
    u128x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u128x1>,
    u128x4_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u32x4x4>,
    u128x4_sse2<YesS3, YesS4, NI>: Into<<Avx2Machine<NI> as Machine>::u64x2x4>,
{
}
macro_rules! impl_into_x {
    ($from:ident, $to:ident) => {
        impl<S3: Copy, S4: Copy, NI: Copy, Gf, Gt> From<x2<$from<S3, S4, NI>, Gf>>
            for x2<$to<S3, S4, NI>, Gt>
        {
            fn from(x: x2<$from<S3, S4, NI>, Gf>) -> Self {
                x2::new([$to::from(x.0[0]), $to::from(x.0[1])])
            }
        }
        impl<S3: Copy, S4: Copy, NI: Copy> From<x4<$from<S3, S4, NI>>> for x4<$to<S3, S4, NI>> {
            fn from(x: x4<$from<S3, S4, NI>>) -> Self {
                x4::new([
                    $to::from(x.0[0]),
                    $to::from(x.0[1]),
                    $to::from(x.0[2]),
                    $to::from(x.0[3]),
                ])
            }
        }
    };
}
impl_into_x!(u128x1_sse2, u64x2_sse2);
impl_into_x!(u128x1_sse2, u32x4_sse2);
use core::fmt::{Debug, Formatter, Result};

impl<W: PartialEq, G> PartialEq for x2<W, G> {
    fn eq(&self, rhs: &Self) -> bool {
        self.0[0] == rhs.0[0] && self.0[1] == rhs.0[1]
    }
}

unsafe fn eq128_s4(x: __m128i, y: __m128i) -> bool {
    let q = _mm_shuffle_epi32(_mm_cmpeq_epi64(x, y), 0b1100_0110);
    _mm_cvtsi128_si64(q) == -1
}

unsafe fn eq128_s2(x: __m128i, y: __m128i) -> bool {
    let q = _mm_cmpeq_epi32(x, y);
    let p = _mm_cvtsi128_si64(_mm_srli_si128(q, 8));
    let q = _mm_cvtsi128_si64(q);
    (p & q) == -1
}

impl<S3, S4, NI> PartialEq for u32x4_sse2<S3, S4, NI> {
    fn eq(&self, rhs: &Self) -> bool {
        unsafe { eq128_s2(self.x, rhs.x) }
    }
}
impl<S3, S4, NI> Debug for u32x4_sse2<S3, S4, NI>
where
    Self: Copy + MultiLane<[u32; 4]>,
{
    fn fmt(&self, fmt: &mut Formatter) -> Result {
        fmt.write_fmt(format_args!("{:08x?}", &self.to_lanes()))
    }
}

impl<S3, S4, NI> PartialEq for u64x2_sse2<S3, S4, NI> {
    fn eq(&self, rhs: &Self) -> bool {
        unsafe { eq128_s2(self.x, rhs.x) }
    }
}
impl<S3, S4, NI> Debug for u64x2_sse2<S3, S4, NI>
where
    Self: Copy + MultiLane<[u64; 2]>,
{
    fn fmt(&self, fmt: &mut Formatter) -> Result {
        fmt.write_fmt(format_args!("{:016x?}", &self.to_lanes()))
    }
}

impl<S3, S4, NI> Debug for u64x4_sse2<S3, S4, NI>
where
    u64x2_sse2<S3, S4, NI>: Copy + MultiLane<[u64; 2]>,
{
    fn fmt(&self, fmt: &mut Formatter) -> Result {
        let (a, b) = (self.0[0].to_lanes(), self.0[1].to_lanes());
        fmt.write_fmt(format_args!("{:016x?}", &[a[0], a[1], b[0], b[1]]))
    }
}
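
// The tests below cross-check the SSE2-only fallback paths against the SSSE3 and
// SSE4.1 implementations, and check lane extraction/insertion against known values.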
#[cfg(test)]
mod test {
    use super::*;
    use crate::x86_64::{SSE2, SSE41, SSSE3};

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn test_bswap32_s2_vs_s3() {
        let xs = [0x0f0e_0d0c, 0x0b0a_0908, 0x0706_0504, 0x0302_0100];
        let ys = [0x0c0d_0e0f, 0x0809_0a0b, 0x0405_0607, 0x0001_0203];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
        let x_s2 = x_s2.bswap();

        let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
        let x_s3 = x_s3.bswap();

        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });
        assert_eq!(x_s2, s2.vec(ys));
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn test_bswap64_s2_vs_s3() {
        let xs = [0x0f0e_0d0c_0b0a_0908, 0x0706_0504_0302_0100];
        let ys = [0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
        let x_s2 = x_s2.bswap();

        let x_s3: <SSSE3 as Machine>::u64x2 = s3.vec(xs);
        let x_s3 = x_s3.bswap();

        assert_eq!(x_s2, s2.vec(ys));
        assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) });
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn test_shuffle32_s2_vs_s3() {
        let xs = [0x0, 0x1, 0x2, 0x3];
        let ys = [0x2, 0x3, 0x0, 0x1];
        let zs = [0x1, 0x2, 0x3, 0x0];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
        let x_s2 = x_s2.shuffle2301();
        let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
        let x_s3 = x_s3.shuffle2301();
        assert_eq!(x_s2, s2.vec(ys));
        assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) });

        let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
        let x_s2 = x_s2.shuffle3012();
        let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
        let x_s3 = x_s3.shuffle3012();
        assert_eq!(x_s2, s2.vec(zs));
        assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) });

        let x_s2 = x_s2.shuffle1230();
        let x_s3 = x_s3.shuffle1230();
        assert_eq!(x_s2, s2.vec(xs));
        assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) });
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn test_shuffle64_s2_vs_s3() {
        let xs = [0x0, 0x1, 0x2, 0x3];
        let ys = [0x2, 0x3, 0x0, 0x1];
        let zs = [0x1, 0x2, 0x3, 0x0];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };

        let x_s2: <SSE2 as Machine>::u64x4 = s2.vec(xs);
        let x_s2 = x_s2.shuffle2301();
        let x_s3: <SSSE3 as Machine>::u64x4 = s3.vec(xs);
        let x_s3 = x_s3.shuffle2301();
        assert_eq!(x_s2, s2.vec(ys));
        assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) });

        let x_s2: <SSE2 as Machine>::u64x4 = s2.vec(xs);
        let x_s2 = x_s2.shuffle3012();
        let x_s3: <SSSE3 as Machine>::u64x4 = s3.vec(xs);
        let x_s3 = x_s3.shuffle3012();
        assert_eq!(x_s2, s2.vec(zs));
        assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) });

        let x_s2 = x_s2.shuffle1230();
        let x_s3 = x_s3.shuffle1230();
        assert_eq!(x_s2, s2.vec(xs));
        assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) });
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn test_lanes_u32x4() {
        let xs = [0x1, 0x2, 0x3, 0x4];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };
        let s4 = unsafe { SSE41::instance() };

        let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
        let y_s2 = <SSE2 as Machine>::u32x4::from_lanes(xs);
        assert_eq!(x_s2, y_s2);
        assert_eq!(xs, y_s2.to_lanes());

        let x_s3: <SSSE3 as Machine>::u32x4 = s3.vec(xs);
        let y_s3 = <SSSE3 as Machine>::u32x4::from_lanes(xs);
        assert_eq!(x_s3, y_s3);
        assert_eq!(xs, y_s3.to_lanes());

        let x_s4: <SSE41 as Machine>::u32x4 = s4.vec(xs);
        let y_s4 = <SSE41 as Machine>::u32x4::from_lanes(xs);
        assert_eq!(x_s4, y_s4);
        assert_eq!(xs, y_s4.to_lanes());
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn test_lanes_u64x2() {
        let xs = [0x1, 0x2];

        let s2 = unsafe { SSE2::instance() };
        let s3 = unsafe { SSSE3::instance() };
        let s4 = unsafe { SSE41::instance() };

        let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
        let y_s2 = <SSE2 as Machine>::u64x2::from_lanes(xs);
        assert_eq!(x_s2, y_s2);
        assert_eq!(xs, y_s2.to_lanes());

        let x_s3: <SSSE3 as Machine>::u64x2 = s3.vec(xs);
        let y_s3 = <SSSE3 as Machine>::u64x2::from_lanes(xs);
        assert_eq!(x_s3, y_s3);
        assert_eq!(xs, y_s3.to_lanes());

        let x_s4: <SSE41 as Machine>::u64x2 = s4.vec(xs);
        let y_s4 = <SSE41 as Machine>::u64x2::from_lanes(xs);
        assert_eq!(x_s4, y_s4);
        assert_eq!(xs, y_s4.to_lanes());
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn test_vec4_u32x4_s2() {
        let xs = [1, 2, 3, 4];
        let s2 = unsafe { SSE2::instance() };
        let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
        assert_eq!(x_s2.extract(0), 1);
        assert_eq!(x_s2.extract(1), 2);
        assert_eq!(x_s2.extract(2), 3);
        assert_eq!(x_s2.extract(3), 4);
        assert_eq!(x_s2.insert(0xf, 0), s2.vec([0xf, 2, 3, 4]));
        assert_eq!(x_s2.insert(0xf, 1), s2.vec([1, 0xf, 3, 4]));
        assert_eq!(x_s2.insert(0xf, 2), s2.vec([1, 2, 0xf, 4]));
        assert_eq!(x_s2.insert(0xf, 3), s2.vec([1, 2, 3, 0xf]));
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn test_vec4_u32x4_s4() {
        let xs = [1, 2, 3, 4];
        let s4 = unsafe { SSE41::instance() };
        let x_s4: <SSE41 as Machine>::u32x4 = s4.vec(xs);
        assert_eq!(x_s4.extract(0), 1);
        assert_eq!(x_s4.extract(1), 2);
        assert_eq!(x_s4.extract(2), 3);
        assert_eq!(x_s4.extract(3), 4);
        assert_eq!(x_s4.insert(0xf, 0), s4.vec([0xf, 2, 3, 4]));
        assert_eq!(x_s4.insert(0xf, 1), s4.vec([1, 0xf, 3, 4]));
        assert_eq!(x_s4.insert(0xf, 2), s4.vec([1, 2, 0xf, 4]));
        assert_eq!(x_s4.insert(0xf, 3), s4.vec([1, 2, 3, 0xf]));
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn test_vec2_u64x2_s2() {
        let xs = [0x1, 0x2];
        let s2 = unsafe { SSE2::instance() };
        let x_s2: <SSE2 as Machine>::u64x2 = s2.vec(xs);
        assert_eq!(x_s2.extract(0), 1);
        assert_eq!(x_s2.extract(1), 2);
        assert_eq!(x_s2.insert(0xf, 0), s2.vec([0xf, 2]));
        assert_eq!(x_s2.insert(0xf, 1), s2.vec([1, 0xf]));
    }

    #[test]
    #[cfg(target_arch = "x86_64")]
    fn test_vec4_u64x2_s4() {
        let xs = [0x1, 0x2];
        let s4 = unsafe { SSE41::instance() };
        let x_s4: <SSE41 as Machine>::u64x2 = s4.vec(xs);
        assert_eq!(x_s4.extract(0), 1);
        assert_eq!(x_s4.extract(1), 2);
        assert_eq!(x_s4.insert(0xf, 0), s4.vec([0xf, 2]));
        assert_eq!(x_s4.insert(0xf, 1), s4.vec([1, 0xf]));
    }
}
pub mod avx2 {
    #![allow(non_camel_case_types)]
    use crate::soft::x4;
    use crate::types::*;
    use crate::x86_64::sse2::{u128x1_sse2, u32x4_sse2};
    use crate::x86_64::{vec256_storage, vec512_storage, Avx2Machine, YesS3, YesS4};
    use core::arch::x86_64::*;
    use core::marker::PhantomData;
    use core::ops::*;
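
    // u32x4x4_avx2 packs the four logical 128-bit lanes into two __m256i
    // registers, so 4-way-parallel 32-bit operations use full-width AVX2
    // instructions while exposing the same u32x4x4 interface as the SSE2 form.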
    #[derive(Copy, Clone)]
    pub struct u32x4x4_avx2<NI> {
        x: [__m256i; 2],
        ni: PhantomData<NI>,
    }

    impl<NI> u32x4x4_avx2<NI> {
        fn new(x: [__m256i; 2]) -> Self {
            Self { x, ni: PhantomData }
        }
    }
    impl<NI> u32x4x4<Avx2Machine<NI>> for u32x4x4_avx2<NI> where NI: Copy {}
    impl<NI> Store<vec512_storage> for u32x4x4_avx2<NI> {
        unsafe fn unpack(p: vec512_storage) -> Self {
            Self::new([p.avx[0].avx, p.avx[1].avx])
        }
    }
    impl<NI> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 4]> for u32x4x4_avx2<NI> {
        fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 4] {
            unsafe {
                [
                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 0)),
                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 1)),
                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 0)),
                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 1)),
                ]
            }
        }
        fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 4]) -> Self {
            Self::new(unsafe {
                [
                    _mm256_setr_m128i(x[0].x, x[1].x),
                    _mm256_setr_m128i(x[2].x, x[3].x),
                ]
            })
        }
    }
    impl<NI> Vec4<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> {
        fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> {
            unsafe {
                match i {
                    0 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 0)),
                    1 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 1)),
                    2 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 0)),
                    3 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 1)),
                    _ => unreachable!(),
                }
            }
        }
        fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self {
            Self::new(unsafe {
                match i {
                    0 => [_mm256_inserti128_si256(self.x[0], w.x, 0), self.x[1]],
                    1 => [_mm256_inserti128_si256(self.x[0], w.x, 1), self.x[1]],
                    2 => [self.x[0], _mm256_inserti128_si256(self.x[1], w.x, 0)],
                    3 => [self.x[0], _mm256_inserti128_si256(self.x[1], w.x, 1)],
                    _ => unreachable!(),
                }
            })
        }
    }
    impl<NI> LaneWords4 for u32x4x4_avx2<NI> {
        fn shuffle_lane_words1230(self) -> Self {
            Self::new(unsafe {
                [
                    _mm256_shuffle_epi32(self.x[0], 0b1001_0011),
                    _mm256_shuffle_epi32(self.x[1], 0b1001_0011),
                ]
            })
        }
        fn shuffle_lane_words2301(self) -> Self {
            Self::new(unsafe {
                [
                    _mm256_shuffle_epi32(self.x[0], 0b0100_1110),
                    _mm256_shuffle_epi32(self.x[1], 0b0100_1110),
                ]
            })
        }
        fn shuffle_lane_words3012(self) -> Self {
            Self::new(unsafe {
                [
                    _mm256_shuffle_epi32(self.x[0], 0b0011_1001),
                    _mm256_shuffle_epi32(self.x[1], 0b0011_1001),
                ]
            })
        }
    }
    impl<NI> BitOps32 for u32x4x4_avx2<NI> where NI: Copy {}
    impl<NI> ArithOps for u32x4x4_avx2<NI> where NI: Copy {}
    macro_rules! shuf_lane_bytes {
        ($name:ident, $k0:expr, $k1:expr) => {
            fn $name(self) -> Self {
                Self::new(unsafe {
                    [
                        _mm256_shuffle_epi8(
                            self.x[0],
                            _mm256_set_epi64x($k0, $k1, $k0, $k1),
                        ),
                        _mm256_shuffle_epi8(
                            self.x[1],
                            _mm256_set_epi64x($k0, $k1, $k0, $k1),
                        ),
                    ]
                })
            }
        };
    }
    macro_rules! rotr_32 {
        ($name:ident, $i:expr) => {
            fn $name(self) -> Self {
                Self::new(unsafe {
                    [
                        _mm256_or_si256(
                            _mm256_srli_epi32(self.x[0], $i as i32),
                            _mm256_slli_epi32(self.x[0], 32 - $i as i32),
                        ),
                        _mm256_or_si256(
                            _mm256_srli_epi32(self.x[1], $i as i32),
                            _mm256_slli_epi32(self.x[1], 32 - $i as i32),
                        ),
                    ]
                })
            }
        };
    }
    impl<NI: Copy> RotateEachWord32 for u32x4x4_avx2<NI> {
        rotr_32!(rotate_each_word_right7, 7);
        shuf_lane_bytes!(
            rotate_each_word_right8,
            0x0c0f0e0d_080b0a09,
            0x04070605_00030201
        );
        rotr_32!(rotate_each_word_right11, 11);
        rotr_32!(rotate_each_word_right12, 12);
        shuf_lane_bytes!(
            rotate_each_word_right16,
            0x0d0c0f0e_09080b0a,
            0x05040706_01000302
        );
        rotr_32!(rotate_each_word_right20, 20);
        shuf_lane_bytes!(
            rotate_each_word_right24,
            0x0e0d0c0f_0a09080b,
            0x06050407_02010003
        );
        rotr_32!(rotate_each_word_right25, 25);
    }
    impl<NI> BitOps0 for u32x4x4_avx2<NI> where NI: Copy {}
    impl<NI> From<u32x4x4_avx2<NI>> for vec512_storage {
        fn from(x: u32x4x4_avx2<NI>) -> Self {
            Self {
                avx: [
                    vec256_storage { avx: x.x[0] },
                    vec256_storage { avx: x.x[1] },
                ],
            }
        }
    }
    macro_rules! impl_assign {
        ($vec:ident, $Assign:ident, $assign_fn:ident, $bin_fn:ident) => {
            impl<NI> $Assign for $vec<NI>
            where
                NI: Copy,
            {
                fn $assign_fn(&mut self, rhs: Self) {
                    *self = self.$bin_fn(rhs);
                }
            }
        };
    }
    impl_assign!(u32x4x4_avx2, BitXorAssign, bitxor_assign, bitxor);
    impl_assign!(u32x4x4_avx2, BitOrAssign, bitor_assign, bitor);
    impl_assign!(u32x4x4_avx2, BitAndAssign, bitand_assign, bitand);
    impl_assign!(u32x4x4_avx2, AddAssign, add_assign, add);
    macro_rules! impl_bitop_x2 {
        ($vec:ident, $Op:ident, $op_fn:ident, $impl_fn:ident) => {
            impl<NI> $Op for $vec<NI> {
                type Output = Self;
                fn $op_fn(self, rhs: Self) -> Self::Output {
                    Self::new(unsafe {
                        [$impl_fn(self.x[0], rhs.x[0]), $impl_fn(self.x[1], rhs.x[1])]
                    })
                }
            }
        };
    }
    impl_bitop_x2!(u32x4x4_avx2, BitXor, bitxor, _mm256_xor_si256);
    impl_bitop_x2!(u32x4x4_avx2, BitOr, bitor, _mm256_or_si256);
    impl_bitop_x2!(u32x4x4_avx2, BitAnd, bitand, _mm256_and_si256);
    impl_bitop_x2!(u32x4x4_avx2, AndNot, andnot, _mm256_andnot_si256);
    impl_bitop_x2!(u32x4x4_avx2, Add, add, _mm256_add_epi32);
    impl<NI> Not for u32x4x4_avx2<NI> {
        type Output = Self;
        fn not(self) -> Self::Output {
            unsafe {
                let f = _mm256_set1_epi8(-0x7f);
                Self::new([f, f]) ^ self
            }
        }
    }

    impl<NI> BSwap for u32x4x4_avx2<NI> {
        shuf_lane_bytes!(bswap, 0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203);
    }
    impl<NI> From<x4<u128x1_sse2<YesS3, YesS4, NI>>> for u32x4x4_avx2<NI>
    where
        NI: Copy,
    {
        fn from(x: x4<u128x1_sse2<YesS3, YesS4, NI>>) -> Self {
            Self::new(unsafe {
                [
                    _mm256_setr_m128i(x.0[0].x, x.0[1].x),
                    _mm256_setr_m128i(x.0[2].x, x.0[3].x),
                ]
            })
        }
    }
}