library/stdarch/crates/core_arch/src/x86/avx2.rs (rustc.git, upstream version 1.48.0~beta.8+dfsg1)
1 //! Advanced Vector Extensions 2 (AVX2)
2 //!
3 //! AVX2 expands most AVX commands to 256-bit wide vector registers and
4 //! adds [FMA](https://en.wikipedia.org/wiki/Fused_multiply-accumulate).
5 //!
6 //! The references are:
7 //!
8 //! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
9 //! Instruction Set Reference, A-Z][intel64_ref].
10 //! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
11 //! System Instructions][amd64_ref].
12 //!
13 //! Wikipedia's [AVX][wiki_avx] and [FMA][wiki_fma] pages provide a quick
14 //! overview of the instructions available.
15 //!
16 //! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
17 //! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
18 //! [wiki_avx]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions
19 //! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate
20
21 use crate::{
22 core_arch::{simd::*, simd_llvm::*, x86::*},
23 mem::transmute,
24 };
25
26 #[cfg(test)]
27 use stdarch_test::assert_instr;
28
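// Illustrative sketch (not part of the upstream source): how callers typically reach the
// intrinsics in this module. The helper names `add_arrays` / `add_arrays_avx2` and the
// fixed element count of eight are made up for the example; the AVX2 path is selected at
// runtime with `is_x86_feature_detected!` and a scalar loop is the fallback.
//
//     #[cfg(target_arch = "x86_64")]
//     fn add_arrays(a: &[i32; 8], b: &[i32; 8]) -> [i32; 8] {
//         if is_x86_feature_detected!("avx2") {
//             // SAFETY: AVX2 support was just verified at runtime.
//             unsafe { add_arrays_avx2(a, b) }
//         } else {
//             let mut out = [0i32; 8];
//             for i in 0..8 {
//                 out[i] = a[i].wrapping_add(b[i]); // matches the wrapping `vpaddd` semantics
//             }
//             out
//         }
//     }
//
//     #[cfg(target_arch = "x86_64")]
//     #[target_feature(enable = "avx2")]
//     unsafe fn add_arrays_avx2(a: &[i32; 8], b: &[i32; 8]) -> [i32; 8] {
//         use std::arch::x86_64::*;
//         let va = _mm256_loadu_si256(a.as_ptr() as *const __m256i);
//         let vb = _mm256_loadu_si256(b.as_ptr() as *const __m256i);
//         let vr = _mm256_add_epi32(va, vb);
//         let mut out = [0i32; 8];
//         _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, vr);
//         out
//     }
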
29 /// Computes the absolute values of packed 32-bit integers in `a`.
30 ///
31 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_abs_epi32)
32 #[inline]
33 #[target_feature(enable = "avx2")]
34 #[cfg_attr(test, assert_instr(vpabsd))]
35 #[stable(feature = "simd_x86", since = "1.27.0")]
36 pub unsafe fn _mm256_abs_epi32(a: __m256i) -> __m256i {
37 transmute(pabsd(a.as_i32x8()))
38 }
39
40 /// Computes the absolute values of packed 16-bit integers in `a`.
41 ///
42 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_abs_epi16)
43 #[inline]
44 #[target_feature(enable = "avx2")]
45 #[cfg_attr(test, assert_instr(vpabsw))]
46 #[stable(feature = "simd_x86", since = "1.27.0")]
47 pub unsafe fn _mm256_abs_epi16(a: __m256i) -> __m256i {
48 transmute(pabsw(a.as_i16x16()))
49 }
50
51 /// Computes the absolute values of packed 8-bit integers in `a`.
52 ///
53 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_abs_epi8)
54 #[inline]
55 #[target_feature(enable = "avx2")]
56 #[cfg_attr(test, assert_instr(vpabsb))]
57 #[stable(feature = "simd_x86", since = "1.27.0")]
58 pub unsafe fn _mm256_abs_epi8(a: __m256i) -> __m256i {
59 transmute(pabsb(a.as_i8x32()))
60 }
61
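// Illustrative sketch (not part of the upstream source): the absolute-value intrinsics as
// seen from caller code. `abs_demo` and the element values are made up; note that, like
// the underlying `vpabsd` instruction, `i32::MIN` has no positive counterpart and comes
// back unchanged.
//
//     use std::arch::x86_64::*;
//
//     #[target_feature(enable = "avx2")]
//     unsafe fn abs_demo() {
//         let a = _mm256_setr_epi32(-1, 2, -3, 4, -5, 6, -7, i32::MIN);
//         let r = _mm256_abs_epi32(a); // [1, 2, 3, 4, 5, 6, 7, i32::MIN]
//     }
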
62 /// Adds packed 64-bit integers in `a` and `b`.
63 ///
64 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_epi64)
65 #[inline]
66 #[target_feature(enable = "avx2")]
67 #[cfg_attr(test, assert_instr(vpaddq))]
68 #[stable(feature = "simd_x86", since = "1.27.0")]
69 pub unsafe fn _mm256_add_epi64(a: __m256i, b: __m256i) -> __m256i {
70 transmute(simd_add(a.as_i64x4(), b.as_i64x4()))
71 }
72
73 /// Adds packed 32-bit integers in `a` and `b`.
74 ///
75 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_epi32)
76 #[inline]
77 #[target_feature(enable = "avx2")]
78 #[cfg_attr(test, assert_instr(vpaddd))]
79 #[stable(feature = "simd_x86", since = "1.27.0")]
80 pub unsafe fn _mm256_add_epi32(a: __m256i, b: __m256i) -> __m256i {
81 transmute(simd_add(a.as_i32x8(), b.as_i32x8()))
82 }
83
84 /// Adds packed 16-bit integers in `a` and `b`.
85 ///
86 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_epi16)
87 #[inline]
88 #[target_feature(enable = "avx2")]
89 #[cfg_attr(test, assert_instr(vpaddw))]
90 #[stable(feature = "simd_x86", since = "1.27.0")]
91 pub unsafe fn _mm256_add_epi16(a: __m256i, b: __m256i) -> __m256i {
92 transmute(simd_add(a.as_i16x16(), b.as_i16x16()))
93 }
94
95 /// Adds packed 8-bit integers in `a` and `b`.
96 ///
97 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_epi8)
98 #[inline]
99 #[target_feature(enable = "avx2")]
100 #[cfg_attr(test, assert_instr(vpaddb))]
101 #[stable(feature = "simd_x86", since = "1.27.0")]
102 pub unsafe fn _mm256_add_epi8(a: __m256i, b: __m256i) -> __m256i {
103 transmute(simd_add(a.as_i8x32(), b.as_i8x32()))
104 }
105
106 /// Adds packed 8-bit integers in `a` and `b` using saturation.
107 ///
108 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_adds_epi8)
109 #[inline]
110 #[target_feature(enable = "avx2")]
111 #[cfg_attr(test, assert_instr(vpaddsb))]
112 #[stable(feature = "simd_x86", since = "1.27.0")]
113 pub unsafe fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i {
114 transmute(simd_saturating_add(a.as_i8x32(), b.as_i8x32()))
115 }
116
117 /// Adds packed 16-bit integers in `a` and `b` using saturation.
118 ///
119 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_adds_epi16)
120 #[inline]
121 #[target_feature(enable = "avx2")]
122 #[cfg_attr(test, assert_instr(vpaddsw))]
123 #[stable(feature = "simd_x86", since = "1.27.0")]
124 pub unsafe fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i {
125 transmute(simd_saturating_add(a.as_i16x16(), b.as_i16x16()))
126 }
127
128 /// Adds packed unsigned 8-bit integers in `a` and `b` using saturation.
129 ///
130 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_adds_epu8)
131 #[inline]
132 #[target_feature(enable = "avx2")]
133 #[cfg_attr(test, assert_instr(vpaddusb))]
134 #[stable(feature = "simd_x86", since = "1.27.0")]
135 pub unsafe fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i {
136 transmute(simd_saturating_add(a.as_u8x32(), b.as_u8x32()))
137 }
138
139 /// Adds packed unsigned 16-bit integers in `a` and `b` using saturation.
140 ///
141 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_adds_epu16)
142 #[inline]
143 #[target_feature(enable = "avx2")]
144 #[cfg_attr(test, assert_instr(vpaddusw))]
145 #[stable(feature = "simd_x86", since = "1.27.0")]
146 pub unsafe fn _mm256_adds_epu16(a: __m256i, b: __m256i) -> __m256i {
147 transmute(simd_saturating_add(a.as_u16x16(), b.as_u16x16()))
148 }
149
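// Illustrative sketch (not part of the upstream source): the difference between the
// wrapping additions above and the saturating ones. `saturation_demo` and the byte values
// are made up for the example.
//
//     use std::arch::x86_64::*;
//
//     #[target_feature(enable = "avx2")]
//     unsafe fn saturation_demo() {
//         let a = _mm256_set1_epi8(-56);        // the byte 200 when viewed as unsigned
//         let b = _mm256_set1_epi8(100);
//         let wrapped = _mm256_add_epi8(a, b);  // 200 + 100 wraps around to 44
//         let clamped = _mm256_adds_epu8(a, b); // 200 + 100 saturates to 255
//     }
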
150 /// Concatenates pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary
151 /// result, shifts the result right by `n` bytes, and returns the low 16 bytes.
152 ///
153 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_alignr_epi8)
154 #[inline]
155 #[target_feature(enable = "avx2")]
156 #[cfg_attr(test, assert_instr(vpalignr, n = 7))]
157 #[rustc_args_required_const(2)]
158 #[stable(feature = "simd_x86", since = "1.27.0")]
159 pub unsafe fn _mm256_alignr_epi8(a: __m256i, b: __m256i, n: i32) -> __m256i {
160 let n = n as u32;
161 // If `palignr` is shifting the pair of vectors more than the size of two
162 // lanes, emit zero.
163 if n > 32 {
164 return _mm256_set1_epi8(0);
165 }
166 // If `palignr` is shifting the pair of input vectors more than one lane,
167 // but less than two lanes, convert to shifting in zeroes.
168 let (a, b, n) = if n > 16 {
169 (_mm256_set1_epi8(0), a, n - 16)
170 } else {
171 (a, b, n)
172 };
173
174 let a = a.as_i8x32();
175 let b = b.as_i8x32();
176
177 let r: i8x32 = match n {
178 0 => simd_shuffle32(
179 b,
180 a,
181 [
182 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
183 23, 24, 25, 26, 27, 28, 29, 30, 31,
184 ],
185 ),
186 1 => simd_shuffle32(
187 b,
188 a,
189 [
190 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 17, 18, 19, 20, 21, 22, 23,
191 24, 25, 26, 27, 28, 29, 30, 31, 48,
192 ],
193 ),
194 2 => simd_shuffle32(
195 b,
196 a,
197 [
198 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 18, 19, 20, 21, 22, 23, 24,
199 25, 26, 27, 28, 29, 30, 31, 48, 49,
200 ],
201 ),
202 3 => simd_shuffle32(
203 b,
204 a,
205 [
206 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 19, 20, 21, 22, 23, 24,
207 25, 26, 27, 28, 29, 30, 31, 48, 49, 50,
208 ],
209 ),
210 4 => simd_shuffle32(
211 b,
212 a,
213 [
214 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 20, 21, 22, 23, 24, 25,
215 26, 27, 28, 29, 30, 31, 48, 49, 50, 51,
216 ],
217 ),
218 5 => simd_shuffle32(
219 b,
220 a,
221 [
222 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 21, 22, 23, 24, 25, 26,
223 27, 28, 29, 30, 31, 48, 49, 50, 51, 52,
224 ],
225 ),
226 6 => simd_shuffle32(
227 b,
228 a,
229 [
230 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 22, 23, 24, 25, 26, 27,
231 28, 29, 30, 31, 48, 49, 50, 51, 52, 53,
232 ],
233 ),
234 7 => simd_shuffle32(
235 b,
236 a,
237 [
238 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 23, 24, 25, 26, 27,
239 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54,
240 ],
241 ),
242 8 => simd_shuffle32(
243 b,
244 a,
245 [
246 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 24, 25, 26, 27, 28,
247 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55,
248 ],
249 ),
250 9 => simd_shuffle32(
251 b,
252 a,
253 [
254 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 25, 26, 27, 28, 29,
255 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56,
256 ],
257 ),
258 10 => simd_shuffle32(
259 b,
260 a,
261 [
262 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 26, 27, 28, 29, 30,
263 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
264 ],
265 ),
266 11 => simd_shuffle32(
267 b,
268 a,
269 [
270 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 27, 28, 29, 30, 31,
271 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58,
272 ],
273 ),
274 12 => simd_shuffle32(
275 b,
276 a,
277 [
278 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 28, 29, 30, 31, 48,
279 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
280 ],
281 ),
282 13 => simd_shuffle32(
283 b,
284 a,
285 [
286 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 29, 30, 31, 48, 49,
287 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
288 ],
289 ),
290 14 => simd_shuffle32(
291 b,
292 a,
293 [
294 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 30, 31, 48, 49, 50,
295 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
296 ],
297 ),
298 15 => simd_shuffle32(
299 b,
300 a,
301 [
302 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 31, 48, 49, 50, 51,
303 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
304 ],
305 ),
306 _ => b,
307 };
308 transmute(r)
309 }
310
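// Illustrative sketch (not part of the upstream source): `_mm256_alignr_epi8` works on
// each 128-bit lane separately, so passing the same vector twice with a shift of one
// rotates every 16-byte half by one byte. `alignr_demo` and the byte values are made up.
//
//     use std::arch::x86_64::*;
//
//     #[target_feature(enable = "avx2")]
//     unsafe fn alignr_demo() {
//         let a = _mm256_setr_epi8(
//             0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
//             16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
//         );
//         // Low lane: [1, 2, ..., 15, 0]; high lane: [17, 18, ..., 31, 16].
//         let r = _mm256_alignr_epi8(a, a, 1);
//     }
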
311 /// Computes the bitwise AND of 256 bits (representing integer data)
312 /// in `a` and `b`.
313 ///
314 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_and_si256)
315 #[inline]
316 #[target_feature(enable = "avx2")]
317 #[cfg_attr(test, assert_instr(vandps))]
318 #[stable(feature = "simd_x86", since = "1.27.0")]
319 pub unsafe fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i {
320 transmute(simd_and(a.as_i64x4(), b.as_i64x4()))
321 }
322
323 /// Computes the bitwise NOT of 256 bits (representing integer data)
324 /// in `a` and then AND with `b`.
325 ///
326 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_andnot_si256)
327 #[inline]
328 #[target_feature(enable = "avx2")]
329 #[cfg_attr(test, assert_instr(vandnps))]
330 #[stable(feature = "simd_x86", since = "1.27.0")]
331 pub unsafe fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i {
332 let all_ones = _mm256_set1_epi8(-1);
333 transmute(simd_and(
334 simd_xor(a.as_i64x4(), all_ones.as_i64x4()),
335 b.as_i64x4(),
336 ))
337 }
338
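// Illustrative sketch (not part of the upstream source): `_mm256_andnot_si256(a, b)`
// computes `(!a) & b`, which makes it convenient for clearing the bits of `b` selected by
// a mask. `andnot_demo` and the constants are made up for the example.
//
//     use std::arch::x86_64::*;
//
//     #[target_feature(enable = "avx2")]
//     unsafe fn andnot_demo() {
//         let low_nibbles = _mm256_set1_epi8(0x0f);
//         let data = _mm256_set1_epi8(0x5a);
//         let kept = _mm256_and_si256(low_nibbles, data);       // 0x0a in every byte
//         let cleared = _mm256_andnot_si256(low_nibbles, data); // 0x50 in every byte
//     }
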
339 /// Averages packed unsigned 16-bit integers in `a` and `b`.
340 ///
341 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_avg_epu16)
342 #[inline]
343 #[target_feature(enable = "avx2")]
344 #[cfg_attr(test, assert_instr(vpavgw))]
345 #[stable(feature = "simd_x86", since = "1.27.0")]
346 pub unsafe fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i {
347 transmute(pavgw(a.as_u16x16(), b.as_u16x16()))
348 }
349
350 /// Averages packed unsigned 8-bit integers in `a` and `b`.
351 ///
352 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_avg_epu8)
353 #[inline]
354 #[target_feature(enable = "avx2")]
355 #[cfg_attr(test, assert_instr(vpavgb))]
356 #[stable(feature = "simd_x86", since = "1.27.0")]
357 pub unsafe fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i {
358 transmute(pavgb(a.as_u8x32(), b.as_u8x32()))
359 }
360
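// Illustrative sketch (not part of the upstream source): the packed averages round up,
// i.e. each output element is `(a + b + 1) >> 1` computed in a wider intermediate, so the
// sum cannot overflow. `avg_demo` and the values are made up for the example.
//
//     use std::arch::x86_64::*;
//
//     #[target_feature(enable = "avx2")]
//     unsafe fn avg_demo() {
//         let a = _mm256_set1_epi16(250);
//         let b = _mm256_set1_epi16(255);
//         let r = _mm256_avg_epu16(a, b); // (250 + 255 + 1) >> 1 = 253 in every element
//     }
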
361 /// Blends packed 32-bit integers from `a` and `b` using control mask `imm8`.
362 ///
363 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_epi32)
364 #[inline]
365 #[target_feature(enable = "avx2")]
366 #[cfg_attr(test, assert_instr(vblendps, imm8 = 9))]
367 #[rustc_args_required_const(2)]
368 #[stable(feature = "simd_x86", since = "1.27.0")]
369 pub unsafe fn _mm_blend_epi32(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
370 let imm8 = (imm8 & 0xFF) as u8;
371 let a = a.as_i32x4();
372 let b = b.as_i32x4();
373 macro_rules! blend2 {
374 ($a:expr, $b:expr, $c:expr, $d:expr) => {
375 simd_shuffle4(a, b, [$a, $b, $c, $d]);
376 };
377 }
378 macro_rules! blend1 {
379 ($a:expr, $b:expr) => {
380 match (imm8 >> 2) & 0b11 {
381 0b00 => blend2!($a, $b, 2, 3),
382 0b01 => blend2!($a, $b, 6, 3),
383 0b10 => blend2!($a, $b, 2, 7),
384 _ => blend2!($a, $b, 6, 7),
385 }
386 };
387 }
388 let r: i32x4 = match imm8 & 0b11 {
389 0b00 => blend1!(0, 1),
390 0b01 => blend1!(4, 1),
391 0b10 => blend1!(0, 5),
392 _ => blend1!(4, 5),
393 };
394 transmute(r)
395 }
396
397 /// Blends packed 32-bit integers from `a` and `b` using control mask `imm8`.
398 ///
399 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_blend_epi32)
400 #[inline]
401 #[target_feature(enable = "avx2")]
402 #[cfg_attr(test, assert_instr(vblendps, imm8 = 9))]
403 #[rustc_args_required_const(2)]
404 #[stable(feature = "simd_x86", since = "1.27.0")]
405 pub unsafe fn _mm256_blend_epi32(a: __m256i, b: __m256i, imm8: i32) -> __m256i {
406 let imm8 = (imm8 & 0xFF) as u8;
407 let a = a.as_i32x8();
408 let b = b.as_i32x8();
409 macro_rules! blend4 {
410 (
411 $a:expr,
412 $b:expr,
413 $c:expr,
414 $d:expr,
415 $e:expr,
416 $f:expr,
417 $g:expr,
418 $h:expr
419 ) => {
420 simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h]);
421 };
422 }
423 macro_rules! blend3 {
424 ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr) => {
425 match (imm8 >> 6) & 0b11 {
426 0b00 => blend4!($a, $b, $c, $d, $e, $f, 6, 7),
427 0b01 => blend4!($a, $b, $c, $d, $e, $f, 14, 7),
428 0b10 => blend4!($a, $b, $c, $d, $e, $f, 6, 15),
429 _ => blend4!($a, $b, $c, $d, $e, $f, 14, 15),
430 }
431 };
432 }
433 macro_rules! blend2 {
434 ($a:expr, $b:expr, $c:expr, $d:expr) => {
435 match (imm8 >> 4) & 0b11 {
436 0b00 => blend3!($a, $b, $c, $d, 4, 5),
437 0b01 => blend3!($a, $b, $c, $d, 12, 5),
438 0b10 => blend3!($a, $b, $c, $d, 4, 13),
439 _ => blend3!($a, $b, $c, $d, 12, 13),
440 }
441 };
442 }
443 macro_rules! blend1 {
444 ($a:expr, $b:expr) => {
445 match (imm8 >> 2) & 0b11 {
446 0b00 => blend2!($a, $b, 2, 3),
447 0b01 => blend2!($a, $b, 10, 3),
448 0b10 => blend2!($a, $b, 2, 11),
449 _ => blend2!($a, $b, 10, 11),
450 }
451 };
452 }
453 let r: i32x8 = match imm8 & 0b11 {
454 0b00 => blend1!(0, 1),
455 0b01 => blend1!(8, 1),
456 0b10 => blend1!(0, 9),
457 _ => blend1!(8, 9),
458 };
459 transmute(r)
460 }
461
462 /// Blends packed 16-bit integers from `a` and `b` using control mask `imm8`.
463 ///
464 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_blend_epi16)
465 #[inline]
466 #[target_feature(enable = "avx2")]
467 #[cfg_attr(test, assert_instr(vpblendw, imm8 = 9))]
468 #[rustc_args_required_const(2)]
469 #[stable(feature = "simd_x86", since = "1.27.0")]
470 pub unsafe fn _mm256_blend_epi16(a: __m256i, b: __m256i, imm8: i32) -> __m256i {
471 let imm8 = (imm8 & 0xFF) as u8;
472 let a = a.as_i16x16();
473 let b = b.as_i16x16();
474 macro_rules! blend4 {
475 (
476 $a:expr,
477 $b:expr,
478 $c:expr,
479 $d:expr,
480 $e:expr,
481 $f:expr,
482 $g:expr,
483 $h:expr,
484 $i:expr,
485 $j:expr,
486 $k:expr,
487 $l:expr,
488 $m:expr,
489 $n:expr,
490 $o:expr,
491 $p:expr
492 ) => {
493 simd_shuffle16(
494 a,
495 b,
496 [
497 $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p,
498 ],
499 )
500 };
501 }
502 macro_rules! blend3 {
503 (
504 $a:expr,
505 $b:expr,
506 $c:expr,
507 $d:expr,
508 $e:expr,
509 $f:expr,
510 $a2:expr,
511 $b2:expr,
512 $c2:expr,
513 $d2:expr,
514 $e2:expr,
515 $f2:expr
516 ) => {
517 match (imm8 >> 6) & 0b11 {
518 0b00 => blend4!($a, $b, $c, $d, $e, $f, 6, 7, $a2, $b2, $c2, $d2, $e2, $f2, 14, 15),
519 0b01 => {
520 blend4!($a, $b, $c, $d, $e, $f, 22, 7, $a2, $b2, $c2, $d2, $e2, $f2, 30, 15)
521 }
522 0b10 => {
523 blend4!($a, $b, $c, $d, $e, $f, 6, 23, $a2, $b2, $c2, $d2, $e2, $f2, 14, 31)
524 }
525 _ => blend4!($a, $b, $c, $d, $e, $f, 22, 23, $a2, $b2, $c2, $d2, $e2, $f2, 30, 31),
526 }
527 };
528 }
529 macro_rules! blend2 {
530 (
531 $a:expr,
532 $b:expr,
533 $c:expr,
534 $d:expr,
535 $a2:expr,
536 $b2:expr,
537 $c2:expr,
538 $d2:expr
539 ) => {
540 match (imm8 >> 4) & 0b11 {
541 0b00 => blend3!($a, $b, $c, $d, 4, 5, $a2, $b2, $c2, $d2, 12, 13),
542 0b01 => blend3!($a, $b, $c, $d, 20, 5, $a2, $b2, $c2, $d2, 28, 13),
543 0b10 => blend3!($a, $b, $c, $d, 4, 21, $a2, $b2, $c2, $d2, 12, 29),
544 _ => blend3!($a, $b, $c, $d, 20, 21, $a2, $b2, $c2, $d2, 28, 29),
545 }
546 };
547 }
548 macro_rules! blend1 {
549 ($a1:expr, $b1:expr, $a2:expr, $b2:expr) => {
550 match (imm8 >> 2) & 0b11 {
551 0b00 => blend2!($a1, $b1, 2, 3, $a2, $b2, 10, 11),
552 0b01 => blend2!($a1, $b1, 18, 3, $a2, $b2, 26, 11),
553 0b10 => blend2!($a1, $b1, 2, 19, $a2, $b2, 10, 27),
554 _ => blend2!($a1, $b1, 18, 19, $a2, $b2, 26, 27),
555 }
556 };
557 }
558 let r: i16x16 = match imm8 & 0b11 {
559 0b00 => blend1!(0, 1, 8, 9),
560 0b01 => blend1!(16, 1, 24, 9),
561 0b10 => blend1!(0, 17, 8, 25),
562 _ => blend1!(16, 17, 24, 25),
563 };
564 transmute(r)
565 }
566
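// Illustrative sketch (not part of the upstream source): in the immediate-controlled
// blends, bit `i` of `imm8` selects element `i` of `b` when set and element `i` of `a`
// when clear. `blend_demo` and the constants are made up for the example.
//
//     use std::arch::x86_64::*;
//
//     #[target_feature(enable = "avx2")]
//     unsafe fn blend_demo() {
//         let a = _mm256_set1_epi32(1);
//         let b = _mm256_set1_epi32(2);
//         // 0b1010_1010 takes `b` in the odd 32-bit positions and `a` in the even ones,
//         // producing [1, 2, 1, 2, 1, 2, 1, 2].
//         let r = _mm256_blend_epi32(a, b, 0b1010_1010);
//     }
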
567 /// Blends packed 8-bit integers from `a` and `b` using `mask`.
568 ///
569 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_blendv_epi8)
570 #[inline]
571 #[target_feature(enable = "avx2")]
572 #[cfg_attr(test, assert_instr(vpblendvb))]
573 #[stable(feature = "simd_x86", since = "1.27.0")]
574 pub unsafe fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i {
575 transmute(pblendvb(a.as_i8x32(), b.as_i8x32(), mask.as_i8x32()))
576 }
577
578 /// Broadcasts the low packed 8-bit integer from `a` to all elements of
579 /// the 128-bit returned value.
580 ///
581 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastb_epi8)
582 #[inline]
583 #[target_feature(enable = "avx2")]
584 #[cfg_attr(test, assert_instr(vpbroadcastb))]
585 #[stable(feature = "simd_x86", since = "1.27.0")]
586 pub unsafe fn _mm_broadcastb_epi8(a: __m128i) -> __m128i {
587 let zero = _mm_setzero_si128();
588 let ret = simd_shuffle16(a.as_i8x16(), zero.as_i8x16(), [0_u32; 16]);
589 transmute::<i8x16, _>(ret)
590 }
591
592 /// Broadcasts the low packed 8-bit integer from `a` to all elements of
593 /// the 256-bit returned value.
594 ///
595 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastb_epi8)
596 #[inline]
597 #[target_feature(enable = "avx2")]
598 #[cfg_attr(test, assert_instr(vpbroadcastb))]
599 #[stable(feature = "simd_x86", since = "1.27.0")]
600 pub unsafe fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i {
601 let zero = _mm_setzero_si128();
602 let ret = simd_shuffle32(a.as_i8x16(), zero.as_i8x16(), [0_u32; 32]);
603 transmute::<i8x32, _>(ret)
604 }
605
606 // N.B., `simd_shuffle4` with integer data types for `a` and `b` is
607 // often compiled to `vbroadcastss`.
608 /// Broadcasts the low packed 32-bit integer from `a` to all elements of
609 /// the 128-bit returned value.
610 ///
611 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastd_epi32)
612 #[inline]
613 #[target_feature(enable = "avx2")]
614 #[cfg_attr(test, assert_instr(vbroadcastss))]
615 #[stable(feature = "simd_x86", since = "1.27.0")]
616 pub unsafe fn _mm_broadcastd_epi32(a: __m128i) -> __m128i {
617 let zero = _mm_setzero_si128();
618 let ret = simd_shuffle4(a.as_i32x4(), zero.as_i32x4(), [0_u32; 4]);
619 transmute::<i32x4, _>(ret)
620 }
621
622 // N.B., `simd_shuffle8` with integer data types for `a` and `b` is
623 // often compiled to `vbroadcastss`.
624 /// Broadcasts the low packed 32-bit integer from `a` to all elements of
625 /// the 256-bit returned value.
626 ///
627 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastd_epi32)
628 #[inline]
629 #[target_feature(enable = "avx2")]
630 #[cfg_attr(test, assert_instr(vbroadcastss))]
631 #[stable(feature = "simd_x86", since = "1.27.0")]
632 pub unsafe fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i {
633 let zero = _mm_setzero_si128();
634 let ret = simd_shuffle8(a.as_i32x4(), zero.as_i32x4(), [0_u32; 8]);
635 transmute::<i32x8, _>(ret)
636 }
637
638 /// Broadcasts the low packed 64-bit integer from `a` to all elements of
639 /// the 128-bit returned value.
640 ///
641 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastq_epi64)
642 #[inline]
643 #[target_feature(enable = "avx2")]
644 // FIXME: https://github.com/rust-lang/stdarch/issues/791
645 #[cfg_attr(test, assert_instr(vmovddup))]
646 #[stable(feature = "simd_x86", since = "1.27.0")]
647 pub unsafe fn _mm_broadcastq_epi64(a: __m128i) -> __m128i {
648 let ret = simd_shuffle2(a.as_i64x2(), a.as_i64x2(), [0_u32; 2]);
649 transmute::<i64x2, _>(ret)
650 }
651
652 /// Broadcasts the low packed 64-bit integer from `a` to all elements of
653 /// the 256-bit returned value.
654 ///
655 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastq_epi64)
656 #[inline]
657 #[target_feature(enable = "avx2")]
658 #[cfg_attr(test, assert_instr(vbroadcastsd))]
659 #[stable(feature = "simd_x86", since = "1.27.0")]
660 pub unsafe fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i {
661 let ret = simd_shuffle4(a.as_i64x2(), a.as_i64x2(), [0_u32; 4]);
662 transmute::<i64x4, _>(ret)
663 }
664
665 /// Broadcasts the low double-precision (64-bit) floating-point element
666 /// from `a` to all elements of the 128-bit returned value.
667 ///
668 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastsd_pd)
669 #[inline]
670 #[target_feature(enable = "avx2")]
671 #[cfg_attr(test, assert_instr(vmovddup))]
672 #[stable(feature = "simd_x86", since = "1.27.0")]
673 pub unsafe fn _mm_broadcastsd_pd(a: __m128d) -> __m128d {
674 simd_shuffle2(a, _mm_setzero_pd(), [0_u32; 2])
675 }
676
677 /// Broadcasts the low double-precision (64-bit) floating-point element
678 /// from `a` to all elements of the 256-bit returned value.
679 ///
680 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastsd_pd)
681 #[inline]
682 #[target_feature(enable = "avx2")]
683 #[cfg_attr(test, assert_instr(vbroadcastsd))]
684 #[stable(feature = "simd_x86", since = "1.27.0")]
685 pub unsafe fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d {
686 simd_shuffle4(a, _mm_setzero_pd(), [0_u32; 4])
687 }
688
689 // N.B., `broadcastsi128_si256` is often compiled to `vinsertf128` or
690 // `vbroadcastf128`.
691 /// Broadcasts 128 bits of integer data from `a` to all 128-bit lanes in
692 /// the 256-bit returned value.
693 ///
694 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastsi128_si256)
695 #[inline]
696 #[target_feature(enable = "avx2")]
697 #[stable(feature = "simd_x86", since = "1.27.0")]
698 pub unsafe fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i {
699 let zero = _mm_setzero_si128();
700 let ret = simd_shuffle4(a.as_i64x2(), zero.as_i64x2(), [0, 1, 0, 1]);
701 transmute::<i64x4, _>(ret)
702 }
703
704 /// Broadcasts the low single-precision (32-bit) floating-point element
705 /// from `a` to all elements of the 128-bit returned value.
706 ///
707 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastss_ps)
708 #[inline]
709 #[target_feature(enable = "avx2")]
710 #[cfg_attr(test, assert_instr(vbroadcastss))]
711 #[stable(feature = "simd_x86", since = "1.27.0")]
712 pub unsafe fn _mm_broadcastss_ps(a: __m128) -> __m128 {
713 simd_shuffle4(a, _mm_setzero_ps(), [0_u32; 4])
714 }
715
716 /// Broadcasts the low single-precision (32-bit) floating-point element
717 /// from `a` to all elements of the 256-bit returned value.
718 ///
719 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastss_ps)
720 #[inline]
721 #[target_feature(enable = "avx2")]
722 #[cfg_attr(test, assert_instr(vbroadcastss))]
723 #[stable(feature = "simd_x86", since = "1.27.0")]
724 pub unsafe fn _mm256_broadcastss_ps(a: __m128) -> __m256 {
725 simd_shuffle8(a, _mm_setzero_ps(), [0_u32; 8])
726 }
727
728 /// Broadcasts the low packed 16-bit integer from `a` to all elements of
729 /// the 128-bit returned value.
730 ///
731 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastw_epi16)
732 #[inline]
733 #[target_feature(enable = "avx2")]
734 #[cfg_attr(test, assert_instr(vpbroadcastw))]
735 #[stable(feature = "simd_x86", since = "1.27.0")]
736 pub unsafe fn _mm_broadcastw_epi16(a: __m128i) -> __m128i {
737 let zero = _mm_setzero_si128();
738 let ret = simd_shuffle8(a.as_i16x8(), zero.as_i16x8(), [0_u32; 8]);
739 transmute::<i16x8, _>(ret)
740 }
741
742 /// Broadcasts the low packed 16-bit integer from `a` to all elements of
743 /// the 256-bit returned value.
744 ///
745 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastw_epi16)
746 #[inline]
747 #[target_feature(enable = "avx2")]
748 #[cfg_attr(test, assert_instr(vpbroadcastw))]
749 #[stable(feature = "simd_x86", since = "1.27.0")]
750 pub unsafe fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i {
751 let zero = _mm_setzero_si128();
752 let ret = simd_shuffle16(a.as_i16x8(), zero.as_i16x8(), [0_u32; 16]);
753 transmute::<i16x16, _>(ret)
754 }
755
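// Illustrative sketch (not part of the upstream source): splatting a single scalar into
// every element of a 256-bit vector via the broadcast intrinsics. `broadcast_demo` is a
// made-up helper; in practice `_mm256_set1_epi32` compiles to the same broadcast.
//
//     use std::arch::x86_64::*;
//
//     #[target_feature(enable = "avx2")]
//     unsafe fn broadcast_demo(coeff: i32) -> __m256i {
//         let v = _mm_cvtsi32_si128(coeff); // [coeff, 0, 0, 0]
//         _mm256_broadcastd_epi32(v)        // `coeff` in all eight elements
//     }
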
756 /// Compares packed 64-bit integers in `a` and `b` for equality.
757 ///
758 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epi64)
759 #[inline]
760 #[target_feature(enable = "avx2")]
761 #[cfg_attr(test, assert_instr(vpcmpeqq))]
762 #[stable(feature = "simd_x86", since = "1.27.0")]
763 pub unsafe fn _mm256_cmpeq_epi64(a: __m256i, b: __m256i) -> __m256i {
764 transmute::<i64x4, _>(simd_eq(a.as_i64x4(), b.as_i64x4()))
765 }
766
767 /// Compares packed 32-bit integers in `a` and `b` for equality.
768 ///
769 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epi32)
770 #[inline]
771 #[target_feature(enable = "avx2")]
772 #[cfg_attr(test, assert_instr(vpcmpeqd))]
773 #[stable(feature = "simd_x86", since = "1.27.0")]
774 pub unsafe fn _mm256_cmpeq_epi32(a: __m256i, b: __m256i) -> __m256i {
775 transmute::<i32x8, _>(simd_eq(a.as_i32x8(), b.as_i32x8()))
776 }
777
778 /// Compares packed 16-bit integers in `a` and `b` for equality.
779 ///
780 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epi16)
781 #[inline]
782 #[target_feature(enable = "avx2")]
783 #[cfg_attr(test, assert_instr(vpcmpeqw))]
784 #[stable(feature = "simd_x86", since = "1.27.0")]
785 pub unsafe fn _mm256_cmpeq_epi16(a: __m256i, b: __m256i) -> __m256i {
786 transmute::<i16x16, _>(simd_eq(a.as_i16x16(), b.as_i16x16()))
787 }
788
789 /// Compares packed 8-bit integers in `a` and `b` for equality.
790 ///
791 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epi8)
792 #[inline]
793 #[target_feature(enable = "avx2")]
794 #[cfg_attr(test, assert_instr(vpcmpeqb))]
795 #[stable(feature = "simd_x86", since = "1.27.0")]
796 pub unsafe fn _mm256_cmpeq_epi8(a: __m256i, b: __m256i) -> __m256i {
797 transmute::<i8x32, _>(simd_eq(a.as_i8x32(), b.as_i8x32()))
798 }
799
800 /// Compares packed 64-bit integers in `a` and `b` for greater-than.
801 ///
802 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epi64)
803 #[inline]
804 #[target_feature(enable = "avx2")]
805 #[cfg_attr(test, assert_instr(vpcmpgtq))]
806 #[stable(feature = "simd_x86", since = "1.27.0")]
807 pub unsafe fn _mm256_cmpgt_epi64(a: __m256i, b: __m256i) -> __m256i {
808 transmute::<i64x4, _>(simd_gt(a.as_i64x4(), b.as_i64x4()))
809 }
810
811 /// Compares packed 32-bit integers in `a` and `b` for greater-than.
812 ///
813 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epi32)
814 #[inline]
815 #[target_feature(enable = "avx2")]
816 #[cfg_attr(test, assert_instr(vpcmpgtd))]
817 #[stable(feature = "simd_x86", since = "1.27.0")]
818 pub unsafe fn _mm256_cmpgt_epi32(a: __m256i, b: __m256i) -> __m256i {
819 transmute::<i32x8, _>(simd_gt(a.as_i32x8(), b.as_i32x8()))
820 }
821
822 /// Compares packed 16-bit integers in `a` and `b` for greater-than.
823 ///
824 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epi16)
825 #[inline]
826 #[target_feature(enable = "avx2")]
827 #[cfg_attr(test, assert_instr(vpcmpgtw))]
828 #[stable(feature = "simd_x86", since = "1.27.0")]
829 pub unsafe fn _mm256_cmpgt_epi16(a: __m256i, b: __m256i) -> __m256i {
830 transmute::<i16x16, _>(simd_gt(a.as_i16x16(), b.as_i16x16()))
831 }
832
833 /// Compares packed 8-bit integers in `a` and `b` for greater-than.
834 ///
835 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epi8)
836 #[inline]
837 #[target_feature(enable = "avx2")]
838 #[cfg_attr(test, assert_instr(vpcmpgtb))]
839 #[stable(feature = "simd_x86", since = "1.27.0")]
840 pub unsafe fn _mm256_cmpgt_epi8(a: __m256i, b: __m256i) -> __m256i {
841 transmute::<i8x32, _>(simd_gt(a.as_i8x32(), b.as_i8x32()))
842 }
843
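// Illustrative sketch (not part of the upstream source): the comparisons return all-ones
// (that is, -1) in every element where the predicate holds and zero elsewhere, which
// composes directly with `_mm256_blendv_epi8` or `_mm256_movemask_epi8`. `compare_demo`
// and the values are made up for the example.
//
//     use std::arch::x86_64::*;
//
//     #[target_feature(enable = "avx2")]
//     unsafe fn compare_demo() {
//         let x = _mm256_setr_epi32(5, -3, 7, -1, 0, -9, 2, -4);
//         let zero = _mm256_setzero_si256();
//         let gt = _mm256_cmpgt_epi32(x, zero);             // -1 where x > 0, else 0
//         let relu = _mm256_blendv_epi8(zero, x, gt);       // max(x, 0) per element
//         let any_positive = _mm256_movemask_epi8(gt) != 0; // scalar view of the mask
//     }
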
844 /// Sign-extend 16-bit integers to 32-bit integers.
845 ///
846 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi16_epi32)
847 #[inline]
848 #[target_feature(enable = "avx2")]
849 #[cfg_attr(test, assert_instr(vpmovsxwd))]
850 #[stable(feature = "simd_x86", since = "1.27.0")]
851 pub unsafe fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i {
852 transmute::<i32x8, _>(simd_cast(a.as_i16x8()))
853 }
854
855 /// Sign-extend 16-bit integers to 64-bit integers.
856 ///
857 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi16_epi64)
858 #[inline]
859 #[target_feature(enable = "avx2")]
860 #[cfg_attr(test, assert_instr(vpmovsxwq))]
861 #[stable(feature = "simd_x86", since = "1.27.0")]
862 pub unsafe fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i {
863 let a = a.as_i16x8();
864 let v64: i16x4 = simd_shuffle4(a, a, [0, 1, 2, 3]);
865 transmute::<i64x4, _>(simd_cast(v64))
866 }
867
868 /// Sign-extend 32-bit integers to 64-bit integers.
869 ///
870 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi32_epi64)
871 #[inline]
872 #[target_feature(enable = "avx2")]
873 #[cfg_attr(test, assert_instr(vpmovsxdq))]
874 #[stable(feature = "simd_x86", since = "1.27.0")]
875 pub unsafe fn _mm256_cvtepi32_epi64(a: __m128i) -> __m256i {
876 transmute::<i64x4, _>(simd_cast(a.as_i32x4()))
877 }
878
879 /// Sign-extend 8-bit integers to 16-bit integers.
880 ///
881 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi8_epi16)
882 #[inline]
883 #[target_feature(enable = "avx2")]
884 #[cfg_attr(test, assert_instr(vpmovsxbw))]
885 #[stable(feature = "simd_x86", since = "1.27.0")]
886 pub unsafe fn _mm256_cvtepi8_epi16(a: __m128i) -> __m256i {
887 transmute::<i16x16, _>(simd_cast(a.as_i8x16()))
888 }
889
890 /// Sign-extend 8-bit integers to 32-bit integers.
891 ///
892 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi8_epi32)
893 #[inline]
894 #[target_feature(enable = "avx2")]
895 #[cfg_attr(test, assert_instr(vpmovsxbd))]
896 #[stable(feature = "simd_x86", since = "1.27.0")]
897 pub unsafe fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i {
898 let a = a.as_i8x16();
899 let v64: i8x8 = simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
900 transmute::<i32x8, _>(simd_cast(v64))
901 }
902
903 /// Sign-extend 8-bit integers to 64-bit integers.
904 ///
905 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi8_epi64)
906 #[inline]
907 #[target_feature(enable = "avx2")]
908 #[cfg_attr(test, assert_instr(vpmovsxbq))]
909 #[stable(feature = "simd_x86", since = "1.27.0")]
910 pub unsafe fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i {
911 let a = a.as_i8x16();
912 let v32: i8x4 = simd_shuffle4(a, a, [0, 1, 2, 3]);
913 transmute::<i64x4, _>(simd_cast(v32))
914 }
915
916 /// Zero-extend unsigned 16-bit integers in `a` to packed 32-bit
917 /// integers.
918 ///
919 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu16_epi32)
920 #[inline]
921 #[target_feature(enable = "avx2")]
922 #[cfg_attr(test, assert_instr(vpmovzxwd))]
923 #[stable(feature = "simd_x86", since = "1.27.0")]
924 pub unsafe fn _mm256_cvtepu16_epi32(a: __m128i) -> __m256i {
925 transmute::<i32x8, _>(simd_cast(a.as_u16x8()))
926 }
927
928 /// Zero-extend the lower four unsigned 16-bit integers in `a` to 64-bit
929 /// integers. The upper four elements of `a` are unused.
930 ///
931 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu16_epi64)
932 #[inline]
933 #[target_feature(enable = "avx2")]
934 #[cfg_attr(test, assert_instr(vpmovzxwq))]
935 #[stable(feature = "simd_x86", since = "1.27.0")]
936 pub unsafe fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i {
937 let a = a.as_u16x8();
938 let v64: u16x4 = simd_shuffle4(a, a, [0, 1, 2, 3]);
939 transmute::<i64x4, _>(simd_cast(v64))
940 }
941
942 /// Zero-extend unsigned 32-bit integers in `a` to 64-bit integers.
943 ///
944 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu32_epi64)
945 #[inline]
946 #[target_feature(enable = "avx2")]
947 #[cfg_attr(test, assert_instr(vpmovzxdq))]
948 #[stable(feature = "simd_x86", since = "1.27.0")]
949 pub unsafe fn _mm256_cvtepu32_epi64(a: __m128i) -> __m256i {
950 transmute::<i64x4, _>(simd_cast(a.as_u32x4()))
951 }
952
953 /// Zero-extend unsigned 8-bit integers in `a` to 16-bit integers.
954 ///
955 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu8_epi16)
956 #[inline]
957 #[target_feature(enable = "avx2")]
958 #[cfg_attr(test, assert_instr(vpmovzxbw))]
959 #[stable(feature = "simd_x86", since = "1.27.0")]
960 pub unsafe fn _mm256_cvtepu8_epi16(a: __m128i) -> __m256i {
961 transmute::<i16x16, _>(simd_cast(a.as_u8x16()))
962 }
963
964 /// Zero-extend the lower eight unsigned 8-bit integers in `a` to 32-bit
965 /// integers. The upper eight elements of `a` are unused.
966 ///
967 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu8_epi32)
968 #[inline]
969 #[target_feature(enable = "avx2")]
970 #[cfg_attr(test, assert_instr(vpmovzxbd))]
971 #[stable(feature = "simd_x86", since = "1.27.0")]
972 pub unsafe fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i {
973 let a = a.as_u8x16();
974 let v64: u8x8 = simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
975 transmute::<i32x8, _>(simd_cast(v64))
976 }
977
978 /// Zero-extend the lower four unsigned 8-bit integers in `a` to 64-bit
979 /// integers. The upper twelve elements of `a` are unused.
980 ///
981 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu8_epi64)
982 #[inline]
983 #[target_feature(enable = "avx2")]
984 #[cfg_attr(test, assert_instr(vpmovzxbq))]
985 #[stable(feature = "simd_x86", since = "1.27.0")]
986 pub unsafe fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i {
987 let a = a.as_u8x16();
988 let v32: u8x4 = simd_shuffle4(a, a, [0, 1, 2, 3]);
989 transmute::<i64x4, _>(simd_cast(v32))
990 }
991
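// Illustrative sketch (not part of the upstream source): widening unsigned bytes before
// doing arithmetic that would overflow in 8 bits. `widen_demo` and the buffer layout are
// made up for the example.
//
//     use std::arch::x86_64::*;
//
//     #[target_feature(enable = "avx2")]
//     unsafe fn widen_demo(pixels: &[u8; 16]) {
//         let v = _mm_loadu_si128(pixels.as_ptr() as *const __m128i);
//         let as_u16 = _mm256_cvtepu8_epi16(v); // all 16 bytes, zero-extended to 16 bits
//         let as_u32 = _mm256_cvtepu8_epi32(v); // low 8 bytes, zero-extended to 32 bits
//     }
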
992 /// Extracts 128 bits (of integer data) from `a` selected with `imm8`.
993 ///
994 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extracti128_si256)
995 #[inline]
996 #[target_feature(enable = "avx2")]
997 #[cfg_attr(
998 all(test, not(target_os = "windows")),
999 assert_instr(vextractf128, imm8 = 1)
1000 )]
1001 #[rustc_args_required_const(1)]
1002 #[stable(feature = "simd_x86", since = "1.27.0")]
1003 pub unsafe fn _mm256_extracti128_si256(a: __m256i, imm8: i32) -> __m128i {
1004 let a = a.as_i64x4();
1005 let b = _mm256_undefined_si256().as_i64x4();
1006 let dst: i64x2 = match imm8 & 0b01 {
1007 0 => simd_shuffle2(a, b, [0, 1]),
1008 _ => simd_shuffle2(a, b, [2, 3]),
1009 };
1010 transmute(dst)
1011 }
1012
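// Illustrative sketch (not part of the upstream source): splitting a 256-bit vector into
// its 128-bit halves, a common final step of a reduction. `split_demo` is a made-up
// helper; `_mm256_castsi256_si128` and `_mm_add_epi32` come from the AVX and SSE2 modules.
//
//     use std::arch::x86_64::*;
//
//     #[target_feature(enable = "avx2")]
//     unsafe fn split_demo(v: __m256i) -> __m128i {
//         let lo = _mm256_castsi256_si128(v);      // low 128 bits (no instruction emitted)
//         let hi = _mm256_extracti128_si256(v, 1); // high 128 bits
//         _mm_add_epi32(lo, hi)                    // combine the halves pairwise
//     }
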
1013 /// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`.
1014 ///
1015 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hadd_epi16)
1016 #[inline]
1017 #[target_feature(enable = "avx2")]
1018 #[cfg_attr(test, assert_instr(vphaddw))]
1019 #[stable(feature = "simd_x86", since = "1.27.0")]
1020 pub unsafe fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i {
1021 transmute(phaddw(a.as_i16x16(), b.as_i16x16()))
1022 }
1023
1024 /// Horizontally adds adjacent pairs of 32-bit integers in `a` and `b`.
1025 ///
1026 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hadd_epi32)
1027 #[inline]
1028 #[target_feature(enable = "avx2")]
1029 #[cfg_attr(test, assert_instr(vphaddd))]
1030 #[stable(feature = "simd_x86", since = "1.27.0")]
1031 pub unsafe fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i {
1032 transmute(phaddd(a.as_i32x8(), b.as_i32x8()))
1033 }
1034
1035 /// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`
1036 /// using saturation.
1037 ///
1038 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hadds_epi16)
1039 #[inline]
1040 #[target_feature(enable = "avx2")]
1041 #[cfg_attr(test, assert_instr(vphaddsw))]
1042 #[stable(feature = "simd_x86", since = "1.27.0")]
1043 pub unsafe fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i {
1044 transmute(phaddsw(a.as_i16x16(), b.as_i16x16()))
1045 }
1046
1047 /// Horizontally subtracts adjacent pairs of 16-bit integers in `a` and `b`.
1048 ///
1049 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hsub_epi16)
1050 #[inline]
1051 #[target_feature(enable = "avx2")]
1052 #[cfg_attr(test, assert_instr(vphsubw))]
1053 #[stable(feature = "simd_x86", since = "1.27.0")]
1054 pub unsafe fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i {
1055 transmute(phsubw(a.as_i16x16(), b.as_i16x16()))
1056 }
1057
1058 /// Horizontally subtracts adjacent pairs of 32-bit integers in `a` and `b`.
1059 ///
1060 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hsub_epi32)
1061 #[inline]
1062 #[target_feature(enable = "avx2")]
1063 #[cfg_attr(test, assert_instr(vphsubd))]
1064 #[stable(feature = "simd_x86", since = "1.27.0")]
1065 pub unsafe fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i {
1066 transmute(phsubd(a.as_i32x8(), b.as_i32x8()))
1067 }
1068
1069 /// Horizontally subtracts adjacent pairs of 16-bit integers in `a` and `b`
1070 /// using saturation.
1071 ///
1072 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hsubs_epi16)
1073 #[inline]
1074 #[target_feature(enable = "avx2")]
1075 #[cfg_attr(test, assert_instr(vphsubsw))]
1076 #[stable(feature = "simd_x86", since = "1.27.0")]
1077 pub unsafe fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i {
1078 transmute(phsubsw(a.as_i16x16(), b.as_i16x16()))
1079 }
1080
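// Illustrative sketch (not part of the upstream source): the horizontal operations work
// within each 128-bit lane and interleave results from `a` and `b`, so the output is not
// a straight concatenation. `hadd_demo` and the values are made up for the example.
//
//     use std::arch::x86_64::*;
//
//     #[target_feature(enable = "avx2")]
//     unsafe fn hadd_demo() {
//         let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
//         let b = _mm256_setr_epi32(8, 9, 10, 11, 12, 13, 14, 15);
//         // Low lane: [0+1, 2+3, 8+9, 10+11]; high lane: [4+5, 6+7, 12+13, 14+15].
//         let r = _mm256_hadd_epi32(a, b); // [1, 5, 17, 21, 9, 13, 25, 29]
//     }
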
1081 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1082 /// where
1083 /// `scale` should be 1, 2, 4 or 8.
1084 ///
1085 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32gather_epi32)
1086 #[inline]
1087 #[target_feature(enable = "avx2")]
1088 #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))]
1089 #[rustc_args_required_const(2)]
1090 #[stable(feature = "simd_x86", since = "1.27.0")]
1091 pub unsafe fn _mm_i32gather_epi32(slice: *const i32, offsets: __m128i, scale: i32) -> __m128i {
1092 let zero = _mm_setzero_si128().as_i32x4();
1093 let neg_one = _mm_set1_epi32(-1).as_i32x4();
1094 let offsets = offsets.as_i32x4();
1095 let slice = slice as *const i8;
1096 macro_rules! call {
1097 ($imm8:expr) => {
1098 pgatherdd(zero, slice, offsets, neg_one, $imm8)
1099 };
1100 }
1101 let r = constify_imm8!(scale, call);
1102 transmute(r)
1103 }
1104
1105 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1106 /// where
1107 /// `scale` should be 1, 2, 4 or 8. If an element's mask bit (its highest bit) is
1108 /// not set, the value from `src` in that position is returned instead.
1109 ///
1110 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32gather_epi32)
1111 #[inline]
1112 #[target_feature(enable = "avx2")]
1113 #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))]
1114 #[rustc_args_required_const(4)]
1115 #[stable(feature = "simd_x86", since = "1.27.0")]
1116 pub unsafe fn _mm_mask_i32gather_epi32(
1117 src: __m128i,
1118 slice: *const i32,
1119 offsets: __m128i,
1120 mask: __m128i,
1121 scale: i32,
1122 ) -> __m128i {
1123 let src = src.as_i32x4();
1124 let mask = mask.as_i32x4();
1125 let offsets = offsets.as_i32x4();
1126 let slice = slice as *const i8;
1127 macro_rules! call {
1128 ($imm8:expr) => {
1129 pgatherdd(src, slice, offsets, mask, $imm8)
1130 };
1131 }
1132 let r = constify_imm8!(scale, call);
1133 transmute(r)
1134 }
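// Illustrative sketch (not part of the upstream source): gathering four 32-bit values
// from arbitrary indices of a table. The scale is 4 because `offsets` holds element
// indices and each element is four bytes wide; `gather_demo` and the index values are
// made up, and every index must stay in bounds of `table`.
//
//     use std::arch::x86_64::*;
//
//     #[target_feature(enable = "avx2")]
//     unsafe fn gather_demo(table: &[i32]) -> __m128i {
//         let idx = _mm_setr_epi32(3, 0, 2, 1);
//         _mm_i32gather_epi32(table.as_ptr(), idx, 4)
//     }
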
1135
1136 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1137 /// where
1138 /// `scale` should be 1, 2, 4 or 8.
1139 ///
1140 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32gather_epi32)
1141 #[inline]
1142 #[target_feature(enable = "avx2")]
1143 #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))]
1144 #[rustc_args_required_const(2)]
1145 #[stable(feature = "simd_x86", since = "1.27.0")]
1146 pub unsafe fn _mm256_i32gather_epi32(slice: *const i32, offsets: __m256i, scale: i32) -> __m256i {
1147 let zero = _mm256_setzero_si256().as_i32x8();
1148 let neg_one = _mm256_set1_epi32(-1).as_i32x8();
1149 let offsets = offsets.as_i32x8();
1150 let slice = slice as *const i8;
1151 macro_rules! call {
1152 ($imm8:expr) => {
1153 vpgatherdd(zero, slice, offsets, neg_one, $imm8)
1154 };
1155 }
1156 let r = constify_imm8!(scale, call);
1157 transmute(r)
1158 }
1159
1160 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1161 /// where
1162 /// `scale` should be 1, 2, 4 or 8. If an element's mask bit (its highest bit) is
1163 /// not set, the value from `src` in that position is returned instead.
1164 ///
1165 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32gather_epi32)
1166 #[inline]
1167 #[target_feature(enable = "avx2")]
1168 #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))]
1169 #[rustc_args_required_const(4)]
1170 #[stable(feature = "simd_x86", since = "1.27.0")]
1171 pub unsafe fn _mm256_mask_i32gather_epi32(
1172 src: __m256i,
1173 slice: *const i32,
1174 offsets: __m256i,
1175 mask: __m256i,
1176 scale: i32,
1177 ) -> __m256i {
1178 let src = src.as_i32x8();
1179 let mask = mask.as_i32x8();
1180 let offsets = offsets.as_i32x8();
1181 let slice = slice as *const i8;
1182 macro_rules! call {
1183 ($imm8:expr) => {
1184 vpgatherdd(src, slice, offsets, mask, $imm8)
1185 };
1186 }
1187 let r = constify_imm8!(scale, call);
1188 transmute(r)
1189 }
1190
1191 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1192 /// where
1193 /// `scale` should be 1, 2, 4 or 8.
1194 ///
1195 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32gather_ps)
1196 #[inline]
1197 #[target_feature(enable = "avx2")]
1198 #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))]
1199 #[rustc_args_required_const(2)]
1200 #[stable(feature = "simd_x86", since = "1.27.0")]
1201 pub unsafe fn _mm_i32gather_ps(slice: *const f32, offsets: __m128i, scale: i32) -> __m128 {
1202 let zero = _mm_setzero_ps();
1203 let neg_one = _mm_set1_ps(-1.0);
1204 let offsets = offsets.as_i32x4();
1205 let slice = slice as *const i8;
1206 macro_rules! call {
1207 ($imm8:expr) => {
1208 pgatherdps(zero, slice, offsets, neg_one, $imm8)
1209 };
1210 }
1211 constify_imm8!(scale, call)
1212 }
1213
1214 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1215 /// where
1216 /// `scale` should be 1, 2, 4 or 8. If an element's mask bit (its highest bit) is
1217 /// not set, the value from `src` in that position is returned instead.
1218 ///
1219 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32gather_ps)
1220 #[inline]
1221 #[target_feature(enable = "avx2")]
1222 #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))]
1223 #[rustc_args_required_const(4)]
1224 #[stable(feature = "simd_x86", since = "1.27.0")]
1225 pub unsafe fn _mm_mask_i32gather_ps(
1226 src: __m128,
1227 slice: *const f32,
1228 offsets: __m128i,
1229 mask: __m128,
1230 scale: i32,
1231 ) -> __m128 {
1232 let offsets = offsets.as_i32x4();
1233 let slice = slice as *const i8;
1234 macro_rules! call {
1235 ($imm8:expr) => {
1236 pgatherdps(src, slice, offsets, mask, $imm8)
1237 };
1238 }
1239 constify_imm8!(scale, call)
1240 }
1241
1242 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1243 /// where
1244 /// `scale` should be 1, 2, 4 or 8.
1245 ///
1246 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32gather_ps)
1247 #[inline]
1248 #[target_feature(enable = "avx2")]
1249 #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))]
1250 #[rustc_args_required_const(2)]
1251 #[stable(feature = "simd_x86", since = "1.27.0")]
1252 pub unsafe fn _mm256_i32gather_ps(slice: *const f32, offsets: __m256i, scale: i32) -> __m256 {
1253 let zero = _mm256_setzero_ps();
1254 let neg_one = _mm256_set1_ps(-1.0);
1255 let offsets = offsets.as_i32x8();
1256 let slice = slice as *const i8;
1257 macro_rules! call {
1258 ($imm8:expr) => {
1259 vpgatherdps(zero, slice, offsets, neg_one, $imm8)
1260 };
1261 }
1262 constify_imm8!(scale, call)
1263 }
1264
1265 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1266 /// where
1267 /// `scale` should be 1, 2, 4 or 8. If an element's mask bit (its highest bit) is
1268 /// not set, the value from `src` in that position is returned instead.
1269 ///
1270 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32gather_ps)
1271 #[inline]
1272 #[target_feature(enable = "avx2")]
1273 #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))]
1274 #[rustc_args_required_const(4)]
1275 #[stable(feature = "simd_x86", since = "1.27.0")]
1276 pub unsafe fn _mm256_mask_i32gather_ps(
1277 src: __m256,
1278 slice: *const f32,
1279 offsets: __m256i,
1280 mask: __m256,
1281 scale: i32,
1282 ) -> __m256 {
1283 let offsets = offsets.as_i32x8();
1284 let slice = slice as *const i8;
1285 macro_rules! call {
1286 ($imm8:expr) => {
1287 vpgatherdps(src, slice, offsets, mask, $imm8)
1288 };
1289 }
1290 constify_imm8!(scale, call)
1291 }
1292
1293 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1294 /// where
1295 /// `scale` should be 1, 2, 4 or 8.
1296 ///
1297 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32gather_epi64)
1298 #[inline]
1299 #[target_feature(enable = "avx2")]
1300 #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))]
1301 #[rustc_args_required_const(2)]
1302 #[stable(feature = "simd_x86", since = "1.27.0")]
1303 pub unsafe fn _mm_i32gather_epi64(slice: *const i64, offsets: __m128i, scale: i32) -> __m128i {
1304 let zero = _mm_setzero_si128().as_i64x2();
1305 let neg_one = _mm_set1_epi64x(-1).as_i64x2();
1306 let offsets = offsets.as_i32x4();
1307 let slice = slice as *const i8;
1308 macro_rules! call {
1309 ($imm8:expr) => {
1310 pgatherdq(zero, slice, offsets, neg_one, $imm8)
1311 };
1312 }
1313 let r = constify_imm8!(scale, call);
1314 transmute(r)
1315 }
1316
1317 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1318 /// where
1319 /// `scale` should be 1, 2, 4 or 8. If an element's mask bit (its highest bit) is
1320 /// not set, the value from `src` in that position is returned instead.
1321 ///
1322 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32gather_epi64)
1323 #[inline]
1324 #[target_feature(enable = "avx2")]
1325 #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))]
1326 #[rustc_args_required_const(4)]
1327 #[stable(feature = "simd_x86", since = "1.27.0")]
1328 pub unsafe fn _mm_mask_i32gather_epi64(
1329 src: __m128i,
1330 slice: *const i64,
1331 offsets: __m128i,
1332 mask: __m128i,
1333 scale: i32,
1334 ) -> __m128i {
1335 let src = src.as_i64x2();
1336 let mask = mask.as_i64x2();
1337 let offsets = offsets.as_i32x4();
1338 let slice = slice as *const i8;
1339 macro_rules! call {
1340 ($imm8:expr) => {
1341 pgatherdq(src, slice, offsets, mask, $imm8)
1342 };
1343 }
1344 let r = constify_imm8!(scale, call);
1345 transmute(r)
1346 }
1347
1348 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1349 /// where
1350 /// `scale` should be 1, 2, 4 or 8.
1351 ///
1352 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32gather_epi64)
1353 #[inline]
1354 #[target_feature(enable = "avx2")]
1355 #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))]
1356 #[rustc_args_required_const(2)]
1357 #[stable(feature = "simd_x86", since = "1.27.0")]
1358 pub unsafe fn _mm256_i32gather_epi64(slice: *const i64, offsets: __m128i, scale: i32) -> __m256i {
1359 let zero = _mm256_setzero_si256().as_i64x4();
1360 let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
1361 let offsets = offsets.as_i32x4();
1362 let slice = slice as *const i8;
1363 macro_rules! call {
1364 ($imm8:expr) => {
1365 vpgatherdq(zero, slice, offsets, neg_one, $imm8)
1366 };
1367 }
1368 let r = constify_imm8!(scale, call);
1369 transmute(r)
1370 }
1371
1372 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1373 /// where
1374 /// `scale` should be 1, 2, 4 or 8. If an element's mask bit (its highest bit) is
1375 /// not set, the value from `src` in that position is returned instead.
1376 ///
1377 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32gather_epi64)
1378 #[inline]
1379 #[target_feature(enable = "avx2")]
1380 #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))]
1381 #[rustc_args_required_const(4)]
1382 #[stable(feature = "simd_x86", since = "1.27.0")]
1383 pub unsafe fn _mm256_mask_i32gather_epi64(
1384 src: __m256i,
1385 slice: *const i64,
1386 offsets: __m128i,
1387 mask: __m256i,
1388 scale: i32,
1389 ) -> __m256i {
1390 let src = src.as_i64x4();
1391 let mask = mask.as_i64x4();
1392 let offsets = offsets.as_i32x4();
1393 let slice = slice as *const i8;
1394 macro_rules! call {
1395 ($imm8:expr) => {
1396 vpgatherdq(src, slice, offsets, mask, $imm8)
1397 };
1398 }
1399 let r = constify_imm8!(scale, call);
1400 transmute(r)
1401 }
1402
1403 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1404 /// where
1405 /// `scale` should be 1, 2, 4 or 8.
1406 ///
1407 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32gather_pd)
1408 #[inline]
1409 #[target_feature(enable = "avx2")]
1410 #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))]
1411 #[rustc_args_required_const(2)]
1412 #[stable(feature = "simd_x86", since = "1.27.0")]
1413 pub unsafe fn _mm_i32gather_pd(slice: *const f64, offsets: __m128i, scale: i32) -> __m128d {
1414 let zero = _mm_setzero_pd();
1415 let neg_one = _mm_set1_pd(-1.0);
1416 let offsets = offsets.as_i32x4();
1417 let slice = slice as *const i8;
1418 macro_rules! call {
1419 ($imm8:expr) => {
1420 pgatherdpd(zero, slice, offsets, neg_one, $imm8)
1421 };
1422 }
1423 constify_imm8!(scale, call)
1424 }
1425
1426 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1427 /// where
1428 /// `scale` should be 1, 2, 4 or 8. If an element's mask bit (its highest bit) is
1429 /// not set, the value from `src` in that position is returned instead.
1430 ///
1431 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32gather_pd)
1432 #[inline]
1433 #[target_feature(enable = "avx2")]
1434 #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))]
1435 #[rustc_args_required_const(4)]
1436 #[stable(feature = "simd_x86", since = "1.27.0")]
1437 pub unsafe fn _mm_mask_i32gather_pd(
1438 src: __m128d,
1439 slice: *const f64,
1440 offsets: __m128i,
1441 mask: __m128d,
1442 scale: i32,
1443 ) -> __m128d {
1444 let offsets = offsets.as_i32x4();
1445 let slice = slice as *const i8;
1446 macro_rules! call {
1447 ($imm8:expr) => {
1448 pgatherdpd(src, slice, offsets, mask, $imm8)
1449 };
1450 }
1451 constify_imm8!(scale, call)
1452 }
1453
1454 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1455 /// where
1456 /// `scale` should be 1, 2, 4 or 8.
1457 ///
1458 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32gather_pd)
1459 #[inline]
1460 #[target_feature(enable = "avx2")]
1461 #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))]
1462 #[rustc_args_required_const(2)]
1463 #[stable(feature = "simd_x86", since = "1.27.0")]
1464 pub unsafe fn _mm256_i32gather_pd(slice: *const f64, offsets: __m128i, scale: i32) -> __m256d {
1465 let zero = _mm256_setzero_pd();
1466 let neg_one = _mm256_set1_pd(-1.0);
1467 let offsets = offsets.as_i32x4();
1468 let slice = slice as *const i8;
1469 macro_rules! call {
1470 ($imm8:expr) => {
1471 vpgatherdpd(zero, slice, offsets, neg_one, $imm8)
1472 };
1473 }
1474 constify_imm8!(scale, call)
1475 }
1476
1477 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1478 /// where
1479 /// `scale` should be 1, 2, 4 or 8. If an element's mask bit (its highest bit) is
1480 /// not set, the value from `src` in that position is returned instead.
1481 ///
1482 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32gather_pd)
1483 #[inline]
1484 #[target_feature(enable = "avx2")]
1485 #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))]
1486 #[rustc_args_required_const(4)]
1487 #[stable(feature = "simd_x86", since = "1.27.0")]
1488 pub unsafe fn _mm256_mask_i32gather_pd(
1489 src: __m256d,
1490 slice: *const f64,
1491 offsets: __m128i,
1492 mask: __m256d,
1493 scale: i32,
1494 ) -> __m256d {
1495 let offsets = offsets.as_i32x4();
1496 let slice = slice as *const i8;
1497 macro_rules! call {
1498 ($imm8:expr) => {
1499 vpgatherdpd(src, slice, offsets, mask, $imm8)
1500 };
1501 }
1502 constify_imm8!(scale, call)
1503 }
1504
1505 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1506 /// where
1507 /// `scale` should be 1, 2, 4 or 8.
1508 ///
1509 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64gather_epi32)
1510 #[inline]
1511 #[target_feature(enable = "avx2")]
1512 #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))]
1513 #[rustc_args_required_const(2)]
1514 #[stable(feature = "simd_x86", since = "1.27.0")]
1515 pub unsafe fn _mm_i64gather_epi32(slice: *const i32, offsets: __m128i, scale: i32) -> __m128i {
1516 let zero = _mm_setzero_si128().as_i32x4();
1517 let neg_one = _mm_set1_epi64x(-1).as_i32x4();
1518 let offsets = offsets.as_i64x2();
1519 let slice = slice as *const i8;
1520 macro_rules! call {
1521 ($imm8:expr) => {
1522 pgatherqd(zero, slice, offsets, neg_one, $imm8)
1523 };
1524 }
1525 let r = constify_imm8!(scale, call);
1526 transmute(r)
1527 }
1528
1529 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1530 /// where
1531 /// `scale` should be 1, 2, 4 or 8. If the highest bit is not set in the
1532 /// corresponding element of `mask`, that element is copied from `src` instead.
1533 ///
1534 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64gather_epi32)
1535 #[inline]
1536 #[target_feature(enable = "avx2")]
1537 #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))]
1538 #[rustc_args_required_const(4)]
1539 #[stable(feature = "simd_x86", since = "1.27.0")]
1540 pub unsafe fn _mm_mask_i64gather_epi32(
1541 src: __m128i,
1542 slice: *const i32,
1543 offsets: __m128i,
1544 mask: __m128i,
1545 scale: i32,
1546 ) -> __m128i {
1547 let src = src.as_i32x4();
1548 let mask = mask.as_i32x4();
1549 let offsets = offsets.as_i64x2();
1550 let slice = slice as *const i8;
1551 macro_rules! call {
1552 ($imm8:expr) => {
1553 pgatherqd(src, slice, offsets, mask, $imm8)
1554 };
1555 }
1556 let r = constify_imm8!(scale, call);
1557 transmute(r)
1558 }
1559
1560 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1561 /// where
1562 /// `scale` should be 1, 2, 4 or 8.
1563 ///
1564 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64gather_epi32)
1565 #[inline]
1566 #[target_feature(enable = "avx2")]
1567 #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))]
1568 #[rustc_args_required_const(2)]
1569 #[stable(feature = "simd_x86", since = "1.27.0")]
1570 pub unsafe fn _mm256_i64gather_epi32(slice: *const i32, offsets: __m256i, scale: i32) -> __m128i {
1571 let zero = _mm_setzero_si128().as_i32x4();
1572 let neg_one = _mm_set1_epi64x(-1).as_i32x4();
1573 let offsets = offsets.as_i64x4();
1574 let slice = slice as *const i8;
1575 macro_rules! call {
1576 ($imm8:expr) => {
1577 vpgatherqd(zero, slice, offsets, neg_one, $imm8)
1578 };
1579 }
1580 let r = constify_imm8!(scale, call);
1581 transmute(r)
1582 }
1583
1584 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1585 /// where
1586 /// `scale` should be 1, 2, 4 or 8. If the highest bit is not set in the
1587 /// corresponding element of `mask`, that element is copied from `src` instead.
1588 ///
1589 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64gather_epi32)
1590 #[inline]
1591 #[target_feature(enable = "avx2")]
1592 #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))]
1593 #[rustc_args_required_const(4)]
1594 #[stable(feature = "simd_x86", since = "1.27.0")]
1595 pub unsafe fn _mm256_mask_i64gather_epi32(
1596 src: __m128i,
1597 slice: *const i32,
1598 offsets: __m256i,
1599 mask: __m128i,
1600 scale: i32,
1601 ) -> __m128i {
1602 let src = src.as_i32x4();
1603 let mask = mask.as_i32x4();
1604 let offsets = offsets.as_i64x4();
1605 let slice = slice as *const i8;
1606 macro_rules! call {
1607 ($imm8:expr) => {
1608 vpgatherqd(src, slice, offsets, mask, $imm8)
1609 };
1610 }
1611 let r = constify_imm8!(scale, call);
1612 transmute(r)
1613 }
1614
1615 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1616 /// where
1617 /// `scale` should be 1, 2, 4 or 8.
1618 ///
1619 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64gather_ps)
1620 #[inline]
1621 #[target_feature(enable = "avx2")]
1622 #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))]
1623 #[rustc_args_required_const(2)]
1624 #[stable(feature = "simd_x86", since = "1.27.0")]
1625 pub unsafe fn _mm_i64gather_ps(slice: *const f32, offsets: __m128i, scale: i32) -> __m128 {
1626 let zero = _mm_setzero_ps();
1627 let neg_one = _mm_set1_ps(-1.0);
1628 let offsets = offsets.as_i64x2();
1629 let slice = slice as *const i8;
1630 macro_rules! call {
1631 ($imm8:expr) => {
1632 pgatherqps(zero, slice, offsets, neg_one, $imm8)
1633 };
1634 }
1635 constify_imm8!(scale, call)
1636 }
1637
1638 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1639 /// where
1640 /// `scale` should be 1, 2, 4 or 8. If the highest bit is not set in the
1641 /// corresponding element of `mask`, that element is copied from `src` instead.
1642 ///
1643 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64gather_ps)
1644 #[inline]
1645 #[target_feature(enable = "avx2")]
1646 #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))]
1647 #[rustc_args_required_const(4)]
1648 #[stable(feature = "simd_x86", since = "1.27.0")]
1649 pub unsafe fn _mm_mask_i64gather_ps(
1650 src: __m128,
1651 slice: *const f32,
1652 offsets: __m128i,
1653 mask: __m128,
1654 scale: i32,
1655 ) -> __m128 {
1656 let offsets = offsets.as_i64x2();
1657 let slice = slice as *const i8;
1658 macro_rules! call {
1659 ($imm8:expr) => {
1660 pgatherqps(src, slice, offsets, mask, $imm8)
1661 };
1662 }
1663 constify_imm8!(scale, call)
1664 }
1665
1666 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1667 /// where
1668 /// `scale` should be 1, 2, 4 or 8.
1669 ///
1670 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64gather_ps)
1671 #[inline]
1672 #[target_feature(enable = "avx2")]
1673 #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))]
1674 #[rustc_args_required_const(2)]
1675 #[stable(feature = "simd_x86", since = "1.27.0")]
1676 pub unsafe fn _mm256_i64gather_ps(slice: *const f32, offsets: __m256i, scale: i32) -> __m128 {
1677 let zero = _mm_setzero_ps();
1678 let neg_one = _mm_set1_ps(-1.0);
1679 let offsets = offsets.as_i64x4();
1680 let slice = slice as *const i8;
1681 macro_rules! call {
1682 ($imm8:expr) => {
1683 vpgatherqps(zero, slice, offsets, neg_one, $imm8)
1684 };
1685 }
1686 constify_imm8!(scale, call)
1687 }
1688
1689 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1690 /// where
1691 /// `scale` should be 1, 2, 4 or 8. If the highest bit is not set in the
1692 /// corresponding element of `mask`, that element is copied from `src` instead.
1693 ///
1694 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64gather_ps)
1695 #[inline]
1696 #[target_feature(enable = "avx2")]
1697 #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))]
1698 #[rustc_args_required_const(4)]
1699 #[stable(feature = "simd_x86", since = "1.27.0")]
1700 pub unsafe fn _mm256_mask_i64gather_ps(
1701 src: __m128,
1702 slice: *const f32,
1703 offsets: __m256i,
1704 mask: __m128,
1705 scale: i32,
1706 ) -> __m128 {
1707 let offsets = offsets.as_i64x4();
1708 let slice = slice as *const i8;
1709 macro_rules! call {
1710 ($imm8:expr) => {
1711 vpgatherqps(src, slice, offsets, mask, $imm8)
1712 };
1713 }
1714 constify_imm8!(scale, call)
1715 }
1716
1717 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1718 /// where
1719 /// `scale` should be 1, 2, 4 or 8.
1720 ///
1721 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64gather_epi64)
1722 #[inline]
1723 #[target_feature(enable = "avx2")]
1724 #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))]
1725 #[rustc_args_required_const(2)]
1726 #[stable(feature = "simd_x86", since = "1.27.0")]
1727 pub unsafe fn _mm_i64gather_epi64(slice: *const i64, offsets: __m128i, scale: i32) -> __m128i {
1728 let zero = _mm_setzero_si128().as_i64x2();
1729 let neg_one = _mm_set1_epi64x(-1).as_i64x2();
1730 let slice = slice as *const i8;
1731 let offsets = offsets.as_i64x2();
1732 macro_rules! call {
1733 ($imm8:expr) => {
1734 pgatherqq(zero, slice, offsets, neg_one, $imm8)
1735 };
1736 }
1737 let r = constify_imm8!(scale, call);
1738 transmute(r)
1739 }
1740
1741 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1742 /// where
1743 /// `scale` should be 1, 2, 4 or 8. If the highest bit is not set in the
1744 /// corresponding element of `mask`, that element is copied from `src` instead.
1745 ///
1746 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64gather_epi64)
1747 #[inline]
1748 #[target_feature(enable = "avx2")]
1749 #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))]
1750 #[rustc_args_required_const(4)]
1751 #[stable(feature = "simd_x86", since = "1.27.0")]
1752 pub unsafe fn _mm_mask_i64gather_epi64(
1753 src: __m128i,
1754 slice: *const i64,
1755 offsets: __m128i,
1756 mask: __m128i,
1757 scale: i32,
1758 ) -> __m128i {
1759 let src = src.as_i64x2();
1760 let mask = mask.as_i64x2();
1761 let offsets = offsets.as_i64x2();
1762 let slice = slice as *const i8;
1763 macro_rules! call {
1764 ($imm8:expr) => {
1765 pgatherqq(src, slice, offsets, mask, $imm8)
1766 };
1767 }
1768 let r = constify_imm8!(scale, call);
1769 transmute(r)
1770 }
1771
1772 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1773 /// where
1774 /// `scale` should be 1, 2, 4 or 8.
1775 ///
1776 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64gather_epi64)
1777 #[inline]
1778 #[target_feature(enable = "avx2")]
1779 #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))]
1780 #[rustc_args_required_const(2)]
1781 #[stable(feature = "simd_x86", since = "1.27.0")]
1782 pub unsafe fn _mm256_i64gather_epi64(slice: *const i64, offsets: __m256i, scale: i32) -> __m256i {
1783 let zero = _mm256_setzero_si256().as_i64x4();
1784 let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
1785 let slice = slice as *const i8;
1786 let offsets = offsets.as_i64x4();
1787 macro_rules! call {
1788 ($imm8:expr) => {
1789 vpgatherqq(zero, slice, offsets, neg_one, $imm8)
1790 };
1791 }
1792 let r = constify_imm8!(scale, call);
1793 transmute(r)
1794 }
1795
1796 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1797 /// where
1798 /// `scale` should be 1, 2, 4 or 8. If the highest bit is not set in the
1799 /// corresponding element of `mask`, that element is copied from `src` instead.
1800 ///
1801 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64gather_epi64)
1802 #[inline]
1803 #[target_feature(enable = "avx2")]
1804 #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))]
1805 #[rustc_args_required_const(4)]
1806 #[stable(feature = "simd_x86", since = "1.27.0")]
1807 pub unsafe fn _mm256_mask_i64gather_epi64(
1808 src: __m256i,
1809 slice: *const i64,
1810 offsets: __m256i,
1811 mask: __m256i,
1812 scale: i32,
1813 ) -> __m256i {
1814 let src = src.as_i64x4();
1815 let mask = mask.as_i64x4();
1816 let offsets = offsets.as_i64x4();
1817 let slice = slice as *const i8;
1818 macro_rules! call {
1819 ($imm8:expr) => {
1820 vpgatherqq(src, slice, offsets, mask, $imm8)
1821 };
1822 }
1823 let r = constify_imm8!(scale, call);
1824 transmute(r)
1825 }
1826
1827 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1828 /// where
1829 /// `scale` should be 1, 2, 4 or 8.
1830 ///
1831 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64gather_pd)
1832 #[inline]
1833 #[target_feature(enable = "avx2")]
1834 #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))]
1835 #[rustc_args_required_const(2)]
1836 #[stable(feature = "simd_x86", since = "1.27.0")]
1837 pub unsafe fn _mm_i64gather_pd(slice: *const f64, offsets: __m128i, scale: i32) -> __m128d {
1838 let zero = _mm_setzero_pd();
1839 let neg_one = _mm_set1_pd(-1.0);
1840 let slice = slice as *const i8;
1841 let offsets = offsets.as_i64x2();
1842 macro_rules! call {
1843 ($imm8:expr) => {
1844 pgatherqpd(zero, slice, offsets, neg_one, $imm8)
1845 };
1846 }
1847 constify_imm8!(scale, call)
1848 }
1849
1850 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1851 /// where
1852 /// `scale` should be 1, 2, 4 or 8. If the highest bit is not set in the
1853 /// corresponding element of `mask`, that element is copied from `src` instead.
1854 ///
1855 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64gather_pd)
1856 #[inline]
1857 #[target_feature(enable = "avx2")]
1858 #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))]
1859 #[rustc_args_required_const(4)]
1860 #[stable(feature = "simd_x86", since = "1.27.0")]
1861 pub unsafe fn _mm_mask_i64gather_pd(
1862 src: __m128d,
1863 slice: *const f64,
1864 offsets: __m128i,
1865 mask: __m128d,
1866 scale: i32,
1867 ) -> __m128d {
1868 let slice = slice as *const i8;
1869 let offsets = offsets.as_i64x2();
1870 macro_rules! call {
1871 ($imm8:expr) => {
1872 pgatherqpd(src, slice, offsets, mask, $imm8)
1873 };
1874 }
1875 constify_imm8!(scale, call)
1876 }
1877
1878 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1879 /// where
1880 /// `scale` should be 1, 2, 4 or 8.
1881 ///
1882 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64gather_pd)
1883 #[inline]
1884 #[target_feature(enable = "avx2")]
1885 #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))]
1886 #[rustc_args_required_const(2)]
1887 #[stable(feature = "simd_x86", since = "1.27.0")]
1888 pub unsafe fn _mm256_i64gather_pd(slice: *const f64, offsets: __m256i, scale: i32) -> __m256d {
1889 let zero = _mm256_setzero_pd();
1890 let neg_one = _mm256_set1_pd(-1.0);
1891 let slice = slice as *const i8;
1892 let offsets = offsets.as_i64x4();
1893 macro_rules! call {
1894 ($imm8:expr) => {
1895 vpgatherqpd(zero, slice, offsets, neg_one, $imm8)
1896 };
1897 }
1898 constify_imm8!(scale, call)
1899 }
1900
1901 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1902 /// where
1903 /// `scale` should be 1, 2, 4 or 8. If the highest bit is not set in the
1904 /// corresponding element of `mask`, that element is copied from `src` instead.
1905 ///
1906 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64gather_pd)
1907 #[inline]
1908 #[target_feature(enable = "avx2")]
1909 #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))]
1910 #[rustc_args_required_const(4)]
1911 #[stable(feature = "simd_x86", since = "1.27.0")]
1912 pub unsafe fn _mm256_mask_i64gather_pd(
1913 src: __m256d,
1914 slice: *const f64,
1915 offsets: __m256i,
1916 mask: __m256d,
1917 scale: i32,
1918 ) -> __m256d {
1919 let slice = slice as *const i8;
1920 let offsets = offsets.as_i64x4();
1921 macro_rules! call {
1922 ($imm8:expr) => {
1923 vpgatherqpd(src, slice, offsets, mask, $imm8)
1924 };
1925 }
1926 constify_imm8!(scale, call)
1927 }
1928
1929 /// Copies `a` to `dst`, then inserts 128 bits (of integer data) from `b` at the
1930 /// location specified by `imm8`.
1931 ///
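/// A short illustrative example (arbitrary values), assuming AVX2 is available:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_set1_epi64x(1);
/// let b = _mm_set1_epi64x(7);
/// // `imm8 = 1` replaces the upper 128-bit half of `a` with `b`.
/// let r = _mm256_inserti128_si256(a, b, 1);
/// let expected = _mm256_setr_epi64x(1, 1, 7, 7);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi64(r, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///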
1932 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_inserti128_si256)
1933 #[inline]
1934 #[target_feature(enable = "avx2")]
1935 #[cfg_attr(
1936 all(test, not(target_os = "windows")),
1937 assert_instr(vinsertf128, imm8 = 1)
1938 )]
1939 #[rustc_args_required_const(2)]
1940 #[stable(feature = "simd_x86", since = "1.27.0")]
1941 pub unsafe fn _mm256_inserti128_si256(a: __m256i, b: __m128i, imm8: i32) -> __m256i {
1942 let a = a.as_i64x4();
1943 let b = _mm256_castsi128_si256(b).as_i64x4();
1944 let dst: i64x4 = match imm8 & 0b01 {
1945 0 => simd_shuffle4(a, b, [4, 5, 2, 3]),
1946 _ => simd_shuffle4(a, b, [0, 1, 4, 5]),
1947 };
1948 transmute(dst)
1949 }
1950
1951 /// Multiplies packed signed 16-bit integers in `a` and `b`, producing
1952 /// intermediate signed 32-bit integers. Horizontally adds adjacent pairs
1953 /// of intermediate 32-bit integers.
1954 ///
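/// A small sketch of the pairwise multiply-add, with values chosen only for
/// illustration:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_set1_epi16(2);
/// let b = _mm256_setr_epi16(
///     1, 2, 3, 4, 5, 6, 7, 8,
///     9, 10, 11, 12, 13, 14, 15, 16,
/// );
/// // Each 32-bit result is 2 * b[2i] + 2 * b[2i + 1].
/// let r = _mm256_madd_epi16(a, b);
/// let expected = _mm256_setr_epi32(6, 14, 22, 30, 38, 46, 54, 62);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///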
1955 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_madd_epi16)
1956 #[inline]
1957 #[target_feature(enable = "avx2")]
1958 #[cfg_attr(test, assert_instr(vpmaddwd))]
1959 #[stable(feature = "simd_x86", since = "1.27.0")]
1960 pub unsafe fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i {
1961 transmute(pmaddwd(a.as_i16x16(), b.as_i16x16()))
1962 }
1963
1964 /// Vertically multiplies each unsigned 8-bit integer from `a` with the
1965 /// corresponding signed 8-bit integer from `b`, producing intermediate
1966 /// signed 16-bit integers. Horizontally adds adjacent pairs of intermediate
1967 /// signed 16-bit integers.
1968 ///
1969 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maddubs_epi16)
1970 #[inline]
1971 #[target_feature(enable = "avx2")]
1972 #[cfg_attr(test, assert_instr(vpmaddubsw))]
1973 #[stable(feature = "simd_x86", since = "1.27.0")]
1974 pub unsafe fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i {
1975 transmute(pmaddubsw(a.as_u8x32(), b.as_u8x32()))
1976 }
1977
1978 /// Loads packed 32-bit integers from memory pointed to by `mem_addr` using `mask`
1979 /// (elements are zeroed out when the highest bit is not set in the
1980 /// corresponding element).
1981 ///
1982 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskload_epi32)
1983 #[inline]
1984 #[target_feature(enable = "avx2")]
1985 #[cfg_attr(test, assert_instr(vpmaskmovd))]
1986 #[stable(feature = "simd_x86", since = "1.27.0")]
1987 pub unsafe fn _mm_maskload_epi32(mem_addr: *const i32, mask: __m128i) -> __m128i {
1988 transmute(maskloadd(mem_addr as *const i8, mask.as_i32x4()))
1989 }
1990
1991 /// Loads packed 32-bit integers from memory pointed to by `mem_addr` using `mask`
1992 /// (elements are zeroed out when the highest bit is not set in the
1993 /// corresponding element).
1994 ///
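/// An illustrative sketch (arbitrary mask and data), assuming AVX2 support:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let data = [1_i32, 2, 3, 4, 5, 6, 7, 8];
/// // Only elements whose mask lane has the highest bit set are loaded;
/// // the remaining lanes are zeroed.
/// let mask = _mm256_setr_epi32(-1, 0, -1, 0, -1, 0, -1, 0);
/// let r = _mm256_maskload_epi32(data.as_ptr(), mask);
/// let expected = _mm256_setr_epi32(1, 0, 3, 0, 5, 0, 7, 0);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///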
1995 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskload_epi32)
1996 #[inline]
1997 #[target_feature(enable = "avx2")]
1998 #[cfg_attr(test, assert_instr(vpmaskmovd))]
1999 #[stable(feature = "simd_x86", since = "1.27.0")]
2000 pub unsafe fn _mm256_maskload_epi32(mem_addr: *const i32, mask: __m256i) -> __m256i {
2001 transmute(maskloadd256(mem_addr as *const i8, mask.as_i32x8()))
2002 }
2003
2004 /// Loads packed 64-bit integers from memory pointed to by `mem_addr` using `mask`
2005 /// (elements are zeroed out when the highest bit is not set in the
2006 /// corresponding element).
2007 ///
2008 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskload_epi64)
2009 #[inline]
2010 #[target_feature(enable = "avx2")]
2011 #[cfg_attr(test, assert_instr(vpmaskmovq))]
2012 #[stable(feature = "simd_x86", since = "1.27.0")]
2013 pub unsafe fn _mm_maskload_epi64(mem_addr: *const i64, mask: __m128i) -> __m128i {
2014 transmute(maskloadq(mem_addr as *const i8, mask.as_i64x2()))
2015 }
2016
2017 /// Loads packed 64-bit integers from memory pointed to by `mem_addr` using `mask`
2018 /// (elements are zeroed out when the highest bit is not set in the
2019 /// corresponding element).
2020 ///
2021 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskload_epi64)
2022 #[inline]
2023 #[target_feature(enable = "avx2")]
2024 #[cfg_attr(test, assert_instr(vpmaskmovq))]
2025 #[stable(feature = "simd_x86", since = "1.27.0")]
2026 pub unsafe fn _mm256_maskload_epi64(mem_addr: *const i64, mask: __m256i) -> __m256i {
2027 transmute(maskloadq256(mem_addr as *const i8, mask.as_i64x4()))
2028 }
2029
2030 /// Stores packed 32-bit integers from `a` into memory pointed to by `mem_addr`
2031 /// using `mask` (elements are not stored when the highest bit is not set
2032 /// in the corresponding element).
2033 ///
2034 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskstore_epi32)
2035 #[inline]
2036 #[target_feature(enable = "avx2")]
2037 #[cfg_attr(test, assert_instr(vpmaskmovd))]
2038 #[stable(feature = "simd_x86", since = "1.27.0")]
2039 pub unsafe fn _mm_maskstore_epi32(mem_addr: *mut i32, mask: __m128i, a: __m128i) {
2040 maskstored(mem_addr as *mut i8, mask.as_i32x4(), a.as_i32x4())
2041 }
2042
2043 /// Stores packed 32-bit integers from `a` into memory pointed to by `mem_addr`
2044 /// using `mask` (elements are not stored when the highest bit is not set
2045 /// in the corresponding element).
2046 ///
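/// A small sketch of the masked store, with arbitrary example values:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let mut out = [0_i32; 8];
/// let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
/// // Only the even-indexed lanes are written; the others keep their old value.
/// let mask = _mm256_setr_epi32(-1, 0, -1, 0, -1, 0, -1, 0);
/// _mm256_maskstore_epi32(out.as_mut_ptr(), mask, a);
/// assert_eq!(out, [1, 0, 3, 0, 5, 0, 7, 0]);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///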
2047 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskstore_epi32)
2048 #[inline]
2049 #[target_feature(enable = "avx2")]
2050 #[cfg_attr(test, assert_instr(vpmaskmovd))]
2051 #[stable(feature = "simd_x86", since = "1.27.0")]
2052 pub unsafe fn _mm256_maskstore_epi32(mem_addr: *mut i32, mask: __m256i, a: __m256i) {
2053 maskstored256(mem_addr as *mut i8, mask.as_i32x8(), a.as_i32x8())
2054 }
2055
2056 /// Stores packed 64-bit integers from `a` into memory pointed to by `mem_addr`
2057 /// using `mask` (elements are not stored when the highest bit is not set
2058 /// in the corresponding element).
2059 ///
2060 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskstore_epi64)
2061 #[inline]
2062 #[target_feature(enable = "avx2")]
2063 #[cfg_attr(test, assert_instr(vpmaskmovq))]
2064 #[stable(feature = "simd_x86", since = "1.27.0")]
2065 pub unsafe fn _mm_maskstore_epi64(mem_addr: *mut i64, mask: __m128i, a: __m128i) {
2066 maskstoreq(mem_addr as *mut i8, mask.as_i64x2(), a.as_i64x2())
2067 }
2068
2069 /// Stores packed 64-bit integers from `a` into memory pointed to by `mem_addr`
2070 /// using `mask` (elements are not stored when the highest bit is not set
2071 /// in the corresponding element).
2072 ///
2073 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskstore_epi64)
2074 #[inline]
2075 #[target_feature(enable = "avx2")]
2076 #[cfg_attr(test, assert_instr(vpmaskmovq))]
2077 #[stable(feature = "simd_x86", since = "1.27.0")]
2078 pub unsafe fn _mm256_maskstore_epi64(mem_addr: *mut i64, mask: __m256i, a: __m256i) {
2079 maskstoreq256(mem_addr as *mut i8, mask.as_i64x4(), a.as_i64x4())
2080 }
2081
2082 /// Compares packed 16-bit integers in `a` and `b`, and returns the packed
2083 /// maximum values.
2084 ///
2085 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epi16)
2086 #[inline]
2087 #[target_feature(enable = "avx2")]
2088 #[cfg_attr(test, assert_instr(vpmaxsw))]
2089 #[stable(feature = "simd_x86", since = "1.27.0")]
2090 pub unsafe fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i {
2091 transmute(pmaxsw(a.as_i16x16(), b.as_i16x16()))
2092 }
2093
2094 /// Compares packed 32-bit integers in `a` and `b`, and returns the packed
2095 /// maximum values.
2096 ///
2097 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epi32)
2098 #[inline]
2099 #[target_feature(enable = "avx2")]
2100 #[cfg_attr(test, assert_instr(vpmaxsd))]
2101 #[stable(feature = "simd_x86", since = "1.27.0")]
2102 pub unsafe fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i {
2103 transmute(pmaxsd(a.as_i32x8(), b.as_i32x8()))
2104 }
2105
2106 /// Compares packed 8-bit integers in `a` and `b`, and returns the packed
2107 /// maximum values.
2108 ///
2109 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epi8)
2110 #[inline]
2111 #[target_feature(enable = "avx2")]
2112 #[cfg_attr(test, assert_instr(vpmaxsb))]
2113 #[stable(feature = "simd_x86", since = "1.27.0")]
2114 pub unsafe fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i {
2115 transmute(pmaxsb(a.as_i8x32(), b.as_i8x32()))
2116 }
2117
2118 /// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
2119 /// the packed maximum values.
2120 ///
2121 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epu16)
2122 #[inline]
2123 #[target_feature(enable = "avx2")]
2124 #[cfg_attr(test, assert_instr(vpmaxuw))]
2125 #[stable(feature = "simd_x86", since = "1.27.0")]
2126 pub unsafe fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i {
2127 transmute(pmaxuw(a.as_u16x16(), b.as_u16x16()))
2128 }
2129
2130 /// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
2131 /// the packed maximum values.
2132 ///
2133 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epu32)
2134 #[inline]
2135 #[target_feature(enable = "avx2")]
2136 #[cfg_attr(test, assert_instr(vpmaxud))]
2137 #[stable(feature = "simd_x86", since = "1.27.0")]
2138 pub unsafe fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i {
2139 transmute(pmaxud(a.as_u32x8(), b.as_u32x8()))
2140 }
2141
2142 /// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
2143 /// the packed maximum values.
2144 ///
2145 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epu8)
2146 #[inline]
2147 #[target_feature(enable = "avx2")]
2148 #[cfg_attr(test, assert_instr(vpmaxub))]
2149 #[stable(feature = "simd_x86", since = "1.27.0")]
2150 pub unsafe fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i {
2151 transmute(pmaxub(a.as_u8x32(), b.as_u8x32()))
2152 }
2153
2154 /// Compares packed 16-bit integers in `a` and `b`, and returns the packed
2155 /// minimum values.
2156 ///
2157 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epi16)
2158 #[inline]
2159 #[target_feature(enable = "avx2")]
2160 #[cfg_attr(test, assert_instr(vpminsw))]
2161 #[stable(feature = "simd_x86", since = "1.27.0")]
2162 pub unsafe fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i {
2163 transmute(pminsw(a.as_i16x16(), b.as_i16x16()))
2164 }
2165
2166 /// Compares packed 32-bit integers in `a` and `b`, and returns the packed
2167 /// minimum values.
2168 ///
2169 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epi32)
2170 #[inline]
2171 #[target_feature(enable = "avx2")]
2172 #[cfg_attr(test, assert_instr(vpminsd))]
2173 #[stable(feature = "simd_x86", since = "1.27.0")]
2174 pub unsafe fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i {
2175 transmute(pminsd(a.as_i32x8(), b.as_i32x8()))
2176 }
2177
2178 /// Compares packed 8-bit integers in `a` and `b`, and returns the packed
2179 /// minimum values.
2180 ///
2181 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epi8)
2182 #[inline]
2183 #[target_feature(enable = "avx2")]
2184 #[cfg_attr(test, assert_instr(vpminsb))]
2185 #[stable(feature = "simd_x86", since = "1.27.0")]
2186 pub unsafe fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i {
2187 transmute(pminsb(a.as_i8x32(), b.as_i8x32()))
2188 }
2189
2190 /// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
2191 /// the packed minimum values.
2192 ///
2193 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epu16)
2194 #[inline]
2195 #[target_feature(enable = "avx2")]
2196 #[cfg_attr(test, assert_instr(vpminuw))]
2197 #[stable(feature = "simd_x86", since = "1.27.0")]
2198 pub unsafe fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i {
2199 transmute(pminuw(a.as_u16x16(), b.as_u16x16()))
2200 }
2201
2202 /// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
2203 /// the packed minimum values.
2204 ///
2205 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epu32)
2206 #[inline]
2207 #[target_feature(enable = "avx2")]
2208 #[cfg_attr(test, assert_instr(vpminud))]
2209 #[stable(feature = "simd_x86", since = "1.27.0")]
2210 pub unsafe fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i {
2211 transmute(pminud(a.as_u32x8(), b.as_u32x8()))
2212 }
2213
2214 /// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
2215 /// the packed minimum values.
2216 ///
2217 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epu8)
2218 #[inline]
2219 #[target_feature(enable = "avx2")]
2220 #[cfg_attr(test, assert_instr(vpminub))]
2221 #[stable(feature = "simd_x86", since = "1.27.0")]
2222 pub unsafe fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i {
2223 transmute(pminub(a.as_u8x32(), b.as_u8x32()))
2224 }
2225
2226 /// Creates a mask from the most significant bit of each 8-bit element in `a`
2227 /// and returns the result.
2228 ///
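/// A brief illustration with arbitrary values:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// // Only byte 0 and byte 31 have their most significant bit set.
/// let a = _mm256_setr_epi8(
///     -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
///     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1,
/// );
/// let m = _mm256_movemask_epi8(a);
/// assert_eq!(m as u32, 0x8000_0001);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///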
2229 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_movemask_epi8)
2230 #[inline]
2231 #[target_feature(enable = "avx2")]
2232 #[cfg_attr(test, assert_instr(vpmovmskb))]
2233 #[stable(feature = "simd_x86", since = "1.27.0")]
2234 pub unsafe fn _mm256_movemask_epi8(a: __m256i) -> i32 {
2235 pmovmskb(a.as_i8x32())
2236 }
2237
2238 /// Computes the sum of absolute differences (SADs) of quadruplets of unsigned
2239 /// 8-bit integers in `a` compared to those in `b`, and stores the 16-bit
2240 /// results in dst. Eight SADs are performed for each 128-bit lane using one
2241 /// quadruplet from `b` and eight quadruplets from `a`. One quadruplet is
2242 /// selected from `b` starting at the offset specified in `imm8`. Eight
2243 /// quadruplets are formed from sequential 8-bit integers selected from `a`
2244 /// starting at the offset specified in `imm8`.
2245 ///
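/// A worked sketch with arbitrary inputs; with `b` all zeros and `imm8 = 0`,
/// each 16-bit result is simply the sum of four consecutive bytes of `a`
/// within its 128-bit lane:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_setr_epi8(
///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
///     16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
/// );
/// let b = _mm256_setzero_si256();
/// let r = _mm256_mpsadbw_epu8(a, b, 0);
/// // e.g. the first result is 0 + 1 + 2 + 3 = 6, the second 1 + 2 + 3 + 4 = 10, ...
/// let expected = _mm256_setr_epi16(
///     6, 10, 14, 18, 22, 26, 30, 34,
///     70, 74, 78, 82, 86, 90, 94, 98,
/// );
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi16(r, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///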
2246 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mpsadbw_epu8)
2247 #[inline]
2248 #[target_feature(enable = "avx2")]
2249 #[cfg_attr(test, assert_instr(vmpsadbw, imm8 = 0))]
2250 #[rustc_args_required_const(2)]
2251 #[stable(feature = "simd_x86", since = "1.27.0")]
2252 pub unsafe fn _mm256_mpsadbw_epu8(a: __m256i, b: __m256i, imm8: i32) -> __m256i {
2253 let a = a.as_u8x32();
2254 let b = b.as_u8x32();
2255 macro_rules! call {
2256 ($imm8:expr) => {
2257 mpsadbw(a, b, $imm8)
2258 };
2259 }
2260 let r = constify_imm8!(imm8, call);
2261 transmute(r)
2262 }
2263
2264 /// Multiplies the low 32-bit integers from each packed 64-bit element in
2265 /// `a` and `b`
2266 ///
2267 /// Returns the 64-bit results.
2268 ///
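/// A short sketch (arbitrary values); only the low 32 bits of each 64-bit
/// element take part in the multiplication:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_setr_epi32(1, 0, 2, 0, 3, 0, -4, 0);
/// let b = _mm256_setr_epi32(10, 0, 20, 0, 30, 0, 40, 0);
/// let r = _mm256_mul_epi32(a, b);
/// let expected = _mm256_setr_epi64x(10, 40, 90, -160);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi64(r, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///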
2269 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_epi32)
2270 #[inline]
2271 #[target_feature(enable = "avx2")]
2272 #[cfg_attr(test, assert_instr(vpmuldq))]
2273 #[stable(feature = "simd_x86", since = "1.27.0")]
2274 pub unsafe fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i {
2275 transmute(pmuldq(a.as_i32x8(), b.as_i32x8()))
2276 }
2277
2278 /// Multiplies the low unsigned 32-bit integers from each packed 64-bit
2279 /// element in `a` and `b`
2280 ///
2281 /// Returns the unsigned 64-bit results.
2282 ///
2283 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_epu32)
2284 #[inline]
2285 #[target_feature(enable = "avx2")]
2286 #[cfg_attr(test, assert_instr(vpmuludq))]
2287 #[stable(feature = "simd_x86", since = "1.27.0")]
2288 pub unsafe fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i {
2289 transmute(pmuludq(a.as_u32x8(), b.as_u32x8()))
2290 }
2291
2292 /// Multiplies the packed 16-bit integers in `a` and `b`, producing
2293 /// intermediate 32-bit integers and returning the high 16 bits of the
2294 /// intermediate integers.
2295 ///
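/// A quick sketch with arbitrary values:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// // 1000 * 1000 = 1_000_000 = 0x000F_4240; the high 16 bits are 0x000F = 15.
/// let a = _mm256_set1_epi16(1000);
/// let b = _mm256_set1_epi16(1000);
/// let r = _mm256_mulhi_epi16(a, b);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi16(r, _mm256_set1_epi16(15))), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///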
2296 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mulhi_epi16)
2297 #[inline]
2298 #[target_feature(enable = "avx2")]
2299 #[cfg_attr(test, assert_instr(vpmulhw))]
2300 #[stable(feature = "simd_x86", since = "1.27.0")]
2301 pub unsafe fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i {
2302 transmute(pmulhw(a.as_i16x16(), b.as_i16x16()))
2303 }
2304
2305 /// Multiplies the packed unsigned 16-bit integers in `a` and `b`, producing
2306 /// intermediate 32-bit integers and returning the high 16 bits of the
2307 /// intermediate integers.
2308 ///
2309 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mulhi_epu16)
2310 #[inline]
2311 #[target_feature(enable = "avx2")]
2312 #[cfg_attr(test, assert_instr(vpmulhuw))]
2313 #[stable(feature = "simd_x86", since = "1.27.0")]
2314 pub unsafe fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i {
2315 transmute(pmulhuw(a.as_u16x16(), b.as_u16x16()))
2316 }
2317
2318 /// Multiplies the packed 16-bit integers in `a` and `b`, producing
2319 /// intermediate 32-bit integers, and returns the low 16 bits of the
2320 /// intermediate integers
2321 ///
2322 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mullo_epi16)
2323 #[inline]
2324 #[target_feature(enable = "avx2")]
2325 #[cfg_attr(test, assert_instr(vpmullw))]
2326 #[stable(feature = "simd_x86", since = "1.27.0")]
2327 pub unsafe fn _mm256_mullo_epi16(a: __m256i, b: __m256i) -> __m256i {
2328 transmute(simd_mul(a.as_i16x16(), b.as_i16x16()))
2329 }
2330
2331 /// Multiplies the packed 32-bit integers in `a` and `b`, producing
2332 /// intermediate 64-bit integers, and returns the low 32 bits of the
2333 /// intermediate integers
2334 ///
2335 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mullo_epi32)
2336 #[inline]
2337 #[target_feature(enable = "avx2")]
2338 #[cfg_attr(test, assert_instr(vpmulld))]
2339 #[stable(feature = "simd_x86", since = "1.27.0")]
2340 pub unsafe fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i {
2341 transmute(simd_mul(a.as_i32x8(), b.as_i32x8()))
2342 }
2343
2344 /// Multiplies packed 16-bit integers in `a` and `b`, producing
2345 /// intermediate signed 32-bit integers. Truncates each intermediate
2346 /// integer to the 18 most significant bits, rounds by adding 1, and
2347 /// returns bits `[16:1]`.
2348 ///
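/// A small fixed-point sketch, interpreting the inputs as Q15 fractions
/// (values chosen only for illustration):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// // In Q15: 0x4000 = 0.5, 0x2000 = 0.25, and 0.5 * 0.25 = 0.125 = 0x1000.
/// let a = _mm256_set1_epi16(0x4000);
/// let b = _mm256_set1_epi16(0x2000);
/// let r = _mm256_mulhrs_epi16(a, b);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi16(r, _mm256_set1_epi16(0x1000))), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///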
2349 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mulhrs_epi16)
2350 #[inline]
2351 #[target_feature(enable = "avx2")]
2352 #[cfg_attr(test, assert_instr(vpmulhrsw))]
2353 #[stable(feature = "simd_x86", since = "1.27.0")]
2354 pub unsafe fn _mm256_mulhrs_epi16(a: __m256i, b: __m256i) -> __m256i {
2355 transmute(pmulhrsw(a.as_i16x16(), b.as_i16x16()))
2356 }
2357
2358 /// Computes the bitwise OR of 256 bits (representing integer data) in `a`
2359 /// and `b`
2360 ///
2361 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_or_si256)
2362 #[inline]
2363 #[target_feature(enable = "avx2")]
2364 #[cfg_attr(test, assert_instr(vorps))]
2365 #[stable(feature = "simd_x86", since = "1.27.0")]
2366 pub unsafe fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i {
2367 transmute(simd_or(a.as_i32x8(), b.as_i32x8()))
2368 }
2369
2370 /// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
2371 /// using signed saturation
2372 ///
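/// A saturation sketch with arbitrary values:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_set1_epi16(300); // saturates to 127 as an `i8`
/// let b = _mm256_set1_epi16(-300); // saturates to -128 as an `i8`
/// let r = _mm256_packs_epi16(a, b);
/// // Each 128-bit lane holds 8 saturated bytes from `a` followed by 8 from `b`.
/// let expected = _mm256_setr_epi8(
///     127, 127, 127, 127, 127, 127, 127, 127,
///     -128, -128, -128, -128, -128, -128, -128, -128,
///     127, 127, 127, 127, 127, 127, 127, 127,
///     -128, -128, -128, -128, -128, -128, -128, -128,
/// );
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///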
2373 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_packs_epi16)
2374 #[inline]
2375 #[target_feature(enable = "avx2")]
2376 #[cfg_attr(test, assert_instr(vpacksswb))]
2377 #[stable(feature = "simd_x86", since = "1.27.0")]
2378 pub unsafe fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i {
2379 transmute(packsswb(a.as_i16x16(), b.as_i16x16()))
2380 }
2381
2382 /// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
2383 /// using signed saturation
2384 ///
2385 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_packs_epi32)
2386 #[inline]
2387 #[target_feature(enable = "avx2")]
2388 #[cfg_attr(test, assert_instr(vpackssdw))]
2389 #[stable(feature = "simd_x86", since = "1.27.0")]
2390 pub unsafe fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i {
2391 transmute(packssdw(a.as_i32x8(), b.as_i32x8()))
2392 }
2393
2394 /// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
2395 /// using unsigned saturation
2396 ///
2397 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_packus_epi16)
2398 #[inline]
2399 #[target_feature(enable = "avx2")]
2400 #[cfg_attr(test, assert_instr(vpackuswb))]
2401 #[stable(feature = "simd_x86", since = "1.27.0")]
2402 pub unsafe fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i {
2403 transmute(packuswb(a.as_i16x16(), b.as_i16x16()))
2404 }
2405
2406 /// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
2407 /// using unsigned saturation
2408 ///
2409 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_packus_epi32)
2410 #[inline]
2411 #[target_feature(enable = "avx2")]
2412 #[cfg_attr(test, assert_instr(vpackusdw))]
2413 #[stable(feature = "simd_x86", since = "1.27.0")]
2414 pub unsafe fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i {
2415 transmute(packusdw(a.as_i32x8(), b.as_i32x8()))
2416 }
2417
2418 /// Permutes packed 32-bit integers from `a` according to the content of `b`.
2419 ///
2420 /// The lowest 3 bits of each integer of `b` are used as addresses into the 8
2421 /// integers of `a`.
2422 ///
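/// A reversal sketch with arbitrary values:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_setr_epi32(10, 11, 12, 13, 14, 15, 16, 17);
/// // Indices 7..0 reverse the eight 32-bit integers of `a`.
/// let idx = _mm256_setr_epi32(7, 6, 5, 4, 3, 2, 1, 0);
/// let r = _mm256_permutevar8x32_epi32(a, idx);
/// let expected = _mm256_setr_epi32(17, 16, 15, 14, 13, 12, 11, 10);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///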
2423 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutevar8x32_epi32)
2424 #[inline]
2425 #[target_feature(enable = "avx2")]
2426 #[cfg_attr(test, assert_instr(vpermps))]
2427 #[stable(feature = "simd_x86", since = "1.27.0")]
2428 pub unsafe fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i {
2429 transmute(permd(a.as_u32x8(), b.as_u32x8()))
2430 }
2431
2432 /// Permutes 64-bit integers from `a` using control mask `imm8`.
2433 ///
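/// A small sketch with an arbitrary control value; each 2-bit field of `imm8`
/// selects the source lane for one output lane:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_setr_epi64x(10, 11, 12, 13);
/// // 0b00_01_10_11 selects lanes 3, 2, 1, 0, i.e. it reverses `a`.
/// let r = _mm256_permute4x64_epi64(a, 0b00_01_10_11);
/// let expected = _mm256_setr_epi64x(13, 12, 11, 10);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi64(r, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///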
2434 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute4x64_epi64)
2435 #[inline]
2436 #[target_feature(enable = "avx2")]
2437 #[cfg_attr(test, assert_instr(vpermpd, imm8 = 9))]
2438 #[rustc_args_required_const(1)]
2439 #[stable(feature = "simd_x86", since = "1.27.0")]
2440 pub unsafe fn _mm256_permute4x64_epi64(a: __m256i, imm8: i32) -> __m256i {
2441 let imm8 = (imm8 & 0xFF) as u8;
2442 let zero = _mm256_setzero_si256().as_i64x4();
2443 let a = a.as_i64x4();
2444 macro_rules! permute4 {
2445 ($a:expr, $b:expr, $c:expr, $d:expr) => {
2446 simd_shuffle4(a, zero, [$a, $b, $c, $d]);
2447 };
2448 }
2449 macro_rules! permute3 {
2450 ($a:expr, $b:expr, $c:expr) => {
2451 match (imm8 >> 6) & 0b11 {
2452 0b00 => permute4!($a, $b, $c, 0),
2453 0b01 => permute4!($a, $b, $c, 1),
2454 0b10 => permute4!($a, $b, $c, 2),
2455 _ => permute4!($a, $b, $c, 3),
2456 }
2457 };
2458 }
2459 macro_rules! permute2 {
2460 ($a:expr, $b:expr) => {
2461 match (imm8 >> 4) & 0b11 {
2462 0b00 => permute3!($a, $b, 0),
2463 0b01 => permute3!($a, $b, 1),
2464 0b10 => permute3!($a, $b, 2),
2465 _ => permute3!($a, $b, 3),
2466 }
2467 };
2468 }
2469 macro_rules! permute1 {
2470 ($a:expr) => {
2471 match (imm8 >> 2) & 0b11 {
2472 0b00 => permute2!($a, 0),
2473 0b01 => permute2!($a, 1),
2474 0b10 => permute2!($a, 2),
2475 _ => permute2!($a, 3),
2476 }
2477 };
2478 }
2479 let r: i64x4 = match imm8 & 0b11 {
2480 0b00 => permute1!(0),
2481 0b01 => permute1!(1),
2482 0b10 => permute1!(2),
2483 _ => permute1!(3),
2484 };
2485 transmute(r)
2486 }
2487
2488 /// Shuffles 128 bits of integer data selected by `imm8` from `a` and `b`.
2489 ///
2490 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute2x128_si256)
2491 #[inline]
2492 #[target_feature(enable = "avx2")]
2493 #[cfg_attr(test, assert_instr(vperm2f128, imm8 = 9))]
2494 #[rustc_args_required_const(2)]
2495 #[stable(feature = "simd_x86", since = "1.27.0")]
2496 pub unsafe fn _mm256_permute2x128_si256(a: __m256i, b: __m256i, imm8: i32) -> __m256i {
2497 let a = a.as_i64x4();
2498 let b = b.as_i64x4();
2499 macro_rules! call {
2500 ($imm8:expr) => {
2501 vperm2i128(a, b, $imm8)
2502 };
2503 }
2504 transmute(constify_imm8!(imm8, call))
2505 }
2506
2507 /// Shuffles 64-bit floating-point elements in `a` across lanes using the
2508 /// control in `imm8`.
2509 ///
2510 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute4x64_pd)
2511 #[inline]
2512 #[target_feature(enable = "avx2")]
2513 #[cfg_attr(test, assert_instr(vpermpd, imm8 = 1))]
2514 #[rustc_args_required_const(1)]
2515 #[stable(feature = "simd_x86", since = "1.27.0")]
2516 pub unsafe fn _mm256_permute4x64_pd(a: __m256d, imm8: i32) -> __m256d {
2517 let imm8 = (imm8 & 0xFF) as u8;
2518 let undef = _mm256_undefined_pd();
2519 macro_rules! shuffle_done {
2520 ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
2521 simd_shuffle4(a, undef, [$x01, $x23, $x45, $x67])
2522 };
2523 }
2524 macro_rules! shuffle_x67 {
2525 ($x01:expr, $x23:expr, $x45:expr) => {
2526 match (imm8 >> 6) & 0b11 {
2527 0b00 => shuffle_done!($x01, $x23, $x45, 0),
2528 0b01 => shuffle_done!($x01, $x23, $x45, 1),
2529 0b10 => shuffle_done!($x01, $x23, $x45, 2),
2530 _ => shuffle_done!($x01, $x23, $x45, 3),
2531 }
2532 };
2533 }
2534 macro_rules! shuffle_x45 {
2535 ($x01:expr, $x23:expr) => {
2536 match (imm8 >> 4) & 0b11 {
2537 0b00 => shuffle_x67!($x01, $x23, 0),
2538 0b01 => shuffle_x67!($x01, $x23, 1),
2539 0b10 => shuffle_x67!($x01, $x23, 2),
2540 _ => shuffle_x67!($x01, $x23, 3),
2541 }
2542 };
2543 }
2544 macro_rules! shuffle_x23 {
2545 ($x01:expr) => {
2546 match (imm8 >> 2) & 0b11 {
2547 0b00 => shuffle_x45!($x01, 0),
2548 0b01 => shuffle_x45!($x01, 1),
2549 0b10 => shuffle_x45!($x01, 2),
2550 _ => shuffle_x45!($x01, 3),
2551 }
2552 };
2553 }
2554 match imm8 & 0b11 {
2555 0b00 => shuffle_x23!(0),
2556 0b01 => shuffle_x23!(1),
2557 0b10 => shuffle_x23!(2),
2558 _ => shuffle_x23!(3),
2559 }
2560 }
2561
2562 /// Shuffles eight 32-bit floating-point elements in `a` across lanes using
2563 /// the corresponding 32-bit integer index in `idx`.
2564 ///
2565 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutevar8x32_ps)
2566 #[inline]
2567 #[target_feature(enable = "avx2")]
2568 #[cfg_attr(test, assert_instr(vpermps))]
2569 #[stable(feature = "simd_x86", since = "1.27.0")]
2570 pub unsafe fn _mm256_permutevar8x32_ps(a: __m256, idx: __m256i) -> __m256 {
2571 permps(a, idx.as_i32x8())
2572 }
2573
2574 /// Computes the absolute differences of packed unsigned 8-bit integers in `a`
2575 /// and `b`, then horizontally sums each consecutive 8 differences to
2576 /// produce four unsigned 16-bit integers, and packs these unsigned 16-bit
2577 /// integers in the low 16 bits of the four 64-bit elements of the return value.
2578 ///
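/// A short sketch with arbitrary values:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_set1_epi8(3);
/// let b = _mm256_set1_epi8(1);
/// // Every group of 8 bytes contributes 8 * |3 - 1| = 16.
/// let r = _mm256_sad_epu8(a, b);
/// let expected = _mm256_setr_epi64x(16, 16, 16, 16);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi64(r, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///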
2579 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sad_epu8)
2580 #[inline]
2581 #[target_feature(enable = "avx2")]
2582 #[cfg_attr(test, assert_instr(vpsadbw))]
2583 #[stable(feature = "simd_x86", since = "1.27.0")]
2584 pub unsafe fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i {
2585 transmute(psadbw(a.as_u8x32(), b.as_u8x32()))
2586 }
2587
2588 /// Shuffles bytes from `a` according to the content of `b`.
2589 ///
2590 /// The lowest 4 bits of each byte of `b` are used as addresses into the 32 bytes
2591 /// of `a`.
2592 ///
2593 /// In addition, if the most significant bit of a byte of `b` is set, the
2594 /// respective destination byte is set to 0.
2595 ///
2596 /// The low and high halves of the vectors are shuffled separately.
2597 ///
2598 /// Picturing `a` and `b` as `[u8; 32]`, `_mm256_shuffle_epi8` is logically
2599 /// equivalent to:
2600 ///
2601 /// ```
2602 /// fn mm256_shuffle_epi8(a: [u8; 32], b: [u8; 32]) -> [u8; 32] {
2603 /// let mut r = [0; 32];
2604 /// for i in 0..16 {
2605 /// // if the most significant bit of b is set,
2606 /// // then the destination byte is set to 0.
2607 /// if b[i] & 0x80 == 0u8 {
2608 /// r[i] = a[(b[i] % 16) as usize];
2609 /// }
2610 /// if b[i + 16] & 0x80 == 0u8 {
2611 /// r[i + 16] = a[(b[i + 16] % 16 + 16) as usize];
2612 /// }
2613 /// }
2614 /// r
2615 /// }
2616 /// ```
2617 ///
2618 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_epi8)
2619 #[inline]
2620 #[target_feature(enable = "avx2")]
2621 #[cfg_attr(test, assert_instr(vpshufb))]
2622 #[stable(feature = "simd_x86", since = "1.27.0")]
2623 pub unsafe fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i {
2624 transmute(pshufb(a.as_u8x32(), b.as_u8x32()))
2625 }
2626
2627 /// Shuffles 32-bit integers in 128-bit lanes of `a` using the control in
2628 /// `imm8`.
2629 ///
2630 /// ```rust
2631 /// #[cfg(target_arch = "x86")]
2632 /// use std::arch::x86::*;
2633 /// #[cfg(target_arch = "x86_64")]
2634 /// use std::arch::x86_64::*;
2635 ///
2636 /// # fn main() {
2637 /// # if is_x86_feature_detected!("avx2") {
2638 /// # #[target_feature(enable = "avx2")]
2639 /// # unsafe fn worker() {
2640 /// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
2641 ///
2642 /// let c1 = _mm256_shuffle_epi32(a, 0b00_11_10_01);
2643 /// let c2 = _mm256_shuffle_epi32(a, 0b01_00_10_11);
2644 ///
2645 /// let expected1 = _mm256_setr_epi32(1, 2, 3, 0, 5, 6, 7, 4);
2646 /// let expected2 = _mm256_setr_epi32(3, 2, 0, 1, 7, 6, 4, 5);
2647 ///
2648 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c1, expected1)), !0);
2649 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c2, expected2)), !0);
2650 /// # }
2651 /// # unsafe { worker(); }
2652 /// # }
2653 /// # }
2654 /// ```
2655 ///
2656 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_epi32)
2657 #[inline]
2658 #[target_feature(enable = "avx2")]
2659 #[cfg_attr(test, assert_instr(vpermilps, imm8 = 9))]
2660 #[rustc_args_required_const(1)]
2661 #[stable(feature = "simd_x86", since = "1.27.0")]
2662 pub unsafe fn _mm256_shuffle_epi32(a: __m256i, imm8: i32) -> __m256i {
2663 // simd_shuffleX requires that its selector parameter be made up of
2664 // constant values, but we can't enforce that here. In spirit, we need
2665 // to write a `match` on all possible values of a byte, and for each value,
2666 // hard-code the correct `simd_shuffleX` call using only constants. We
2667 // then hope for LLVM to do the rest.
2668 //
2669 // Of course, that's... awful. So we try to use macros to do it for us.
2670 let imm8 = (imm8 & 0xFF) as u8;
2671
2672 let a = a.as_i32x8();
2673 macro_rules! shuffle_done {
2674 ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
2675 simd_shuffle8(
2676 a,
2677 a,
2678 [
2679 $x01,
2680 $x23,
2681 $x45,
2682 $x67,
2683 4 + $x01,
2684 4 + $x23,
2685 4 + $x45,
2686 4 + $x67,
2687 ],
2688 )
2689 };
2690 }
2691 macro_rules! shuffle_x67 {
2692 ($x01:expr, $x23:expr, $x45:expr) => {
2693 match (imm8 >> 6) & 0b11 {
2694 0b00 => shuffle_done!($x01, $x23, $x45, 0),
2695 0b01 => shuffle_done!($x01, $x23, $x45, 1),
2696 0b10 => shuffle_done!($x01, $x23, $x45, 2),
2697 _ => shuffle_done!($x01, $x23, $x45, 3),
2698 }
2699 };
2700 }
2701 macro_rules! shuffle_x45 {
2702 ($x01:expr, $x23:expr) => {
2703 match (imm8 >> 4) & 0b11 {
2704 0b00 => shuffle_x67!($x01, $x23, 0),
2705 0b01 => shuffle_x67!($x01, $x23, 1),
2706 0b10 => shuffle_x67!($x01, $x23, 2),
2707 _ => shuffle_x67!($x01, $x23, 3),
2708 }
2709 };
2710 }
2711 macro_rules! shuffle_x23 {
2712 ($x01:expr) => {
2713 match (imm8 >> 2) & 0b11 {
2714 0b00 => shuffle_x45!($x01, 0),
2715 0b01 => shuffle_x45!($x01, 1),
2716 0b10 => shuffle_x45!($x01, 2),
2717 _ => shuffle_x45!($x01, 3),
2718 }
2719 };
2720 }
2721 let r: i32x8 = match imm8 & 0b11 {
2722 0b00 => shuffle_x23!(0),
2723 0b01 => shuffle_x23!(1),
2724 0b10 => shuffle_x23!(2),
2725 _ => shuffle_x23!(3),
2726 };
2727 transmute(r)
2728 }
2729
2730 /// Shuffles 16-bit integers in the high 64 bits of 128-bit lanes of `a` using
2731 /// the control in `imm8`. The low 64 bits of 128-bit lanes of `a` are copied
2732 /// to the output.
2733 ///
2734 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shufflehi_epi16)
2735 #[inline]
2736 #[target_feature(enable = "avx2")]
2737 #[cfg_attr(test, assert_instr(vpshufhw, imm8 = 9))]
2738 #[rustc_args_required_const(1)]
2739 #[stable(feature = "simd_x86", since = "1.27.0")]
2740 pub unsafe fn _mm256_shufflehi_epi16(a: __m256i, imm8: i32) -> __m256i {
2741 let imm8 = (imm8 & 0xFF) as u8;
2742 let a = a.as_i16x16();
2743 macro_rules! shuffle_done {
2744 ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
2745 #[rustfmt::skip]
2746 simd_shuffle16(a, a, [
2747 0, 1, 2, 3, 4+$x01, 4+$x23, 4+$x45, 4+$x67,
2748 8, 9, 10, 11, 12+$x01, 12+$x23, 12+$x45, 12+$x67
2749 ]);
2750 };
2751 }
2752 macro_rules! shuffle_x67 {
2753 ($x01:expr, $x23:expr, $x45:expr) => {
2754 match (imm8 >> 6) & 0b11 {
2755 0b00 => shuffle_done!($x01, $x23, $x45, 0),
2756 0b01 => shuffle_done!($x01, $x23, $x45, 1),
2757 0b10 => shuffle_done!($x01, $x23, $x45, 2),
2758 _ => shuffle_done!($x01, $x23, $x45, 3),
2759 }
2760 };
2761 }
2762 macro_rules! shuffle_x45 {
2763 ($x01:expr, $x23:expr) => {
2764 match (imm8 >> 4) & 0b11 {
2765 0b00 => shuffle_x67!($x01, $x23, 0),
2766 0b01 => shuffle_x67!($x01, $x23, 1),
2767 0b10 => shuffle_x67!($x01, $x23, 2),
2768 _ => shuffle_x67!($x01, $x23, 3),
2769 }
2770 };
2771 }
2772 macro_rules! shuffle_x23 {
2773 ($x01:expr) => {
2774 match (imm8 >> 2) & 0b11 {
2775 0b00 => shuffle_x45!($x01, 0),
2776 0b01 => shuffle_x45!($x01, 1),
2777 0b10 => shuffle_x45!($x01, 2),
2778 _ => shuffle_x45!($x01, 3),
2779 }
2780 };
2781 }
2782 let r: i16x16 = match imm8 & 0b11 {
2783 0b00 => shuffle_x23!(0),
2784 0b01 => shuffle_x23!(1),
2785 0b10 => shuffle_x23!(2),
2786 _ => shuffle_x23!(3),
2787 };
2788 transmute(r)
2789 }
2790
2791 /// Shuffles 16-bit integers in the low 64 bits of 128-bit lanes of `a` using
2792 /// the control in `imm8`. The high 64 bits of 128-bit lanes of `a` are copied
2793 /// to the output.
2794 ///
2795 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shufflelo_epi16)
2796 #[inline]
2797 #[target_feature(enable = "avx2")]
2798 #[cfg_attr(test, assert_instr(vpshuflw, imm8 = 9))]
2799 #[rustc_args_required_const(1)]
2800 #[stable(feature = "simd_x86", since = "1.27.0")]
2801 pub unsafe fn _mm256_shufflelo_epi16(a: __m256i, imm8: i32) -> __m256i {
2802 let imm8 = (imm8 & 0xFF) as u8;
2803 let a = a.as_i16x16();
2804 macro_rules! shuffle_done {
2805 ($x01: expr, $x23: expr, $x45: expr, $x67: expr) => {
2806 #[rustfmt::skip]
2807 simd_shuffle16(a, a, [
2808 0+$x01, 0+$x23, 0+$x45, 0+$x67, 4, 5, 6, 7,
2809 8+$x01, 8+$x23, 8+$x45, 8+$x67, 12, 13, 14, 15,
2810 ]);
2811 };
2812 }
2813 macro_rules! shuffle_x67 {
2814 ($x01:expr, $x23:expr, $x45:expr) => {
2815 match (imm8 >> 6) & 0b11 {
2816 0b00 => shuffle_done!($x01, $x23, $x45, 0),
2817 0b01 => shuffle_done!($x01, $x23, $x45, 1),
2818 0b10 => shuffle_done!($x01, $x23, $x45, 2),
2819 _ => shuffle_done!($x01, $x23, $x45, 3),
2820 }
2821 };
2822 }
2823 macro_rules! shuffle_x45 {
2824 ($x01:expr, $x23:expr) => {
2825 match (imm8 >> 4) & 0b11 {
2826 0b00 => shuffle_x67!($x01, $x23, 0),
2827 0b01 => shuffle_x67!($x01, $x23, 1),
2828 0b10 => shuffle_x67!($x01, $x23, 2),
2829 _ => shuffle_x67!($x01, $x23, 3),
2830 }
2831 };
2832 }
2833 macro_rules! shuffle_x23 {
2834 ($x01:expr) => {
2835 match (imm8 >> 2) & 0b11 {
2836 0b00 => shuffle_x45!($x01, 0),
2837 0b01 => shuffle_x45!($x01, 1),
2838 0b10 => shuffle_x45!($x01, 2),
2839 _ => shuffle_x45!($x01, 3),
2840 }
2841 };
2842 }
2843 let r: i16x16 = match imm8 & 0b11 {
2844 0b00 => shuffle_x23!(0),
2845 0b01 => shuffle_x23!(1),
2846 0b10 => shuffle_x23!(2),
2847 _ => shuffle_x23!(3),
2848 };
2849 transmute(r)
2850 }
2851
2852 /// Negates packed 16-bit integers in `a` when the corresponding signed
2853 /// 16-bit integer in `b` is negative, and returns the results.
2854 /// Results are zeroed out when the corresponding element in `b` is zero.
2855 ///
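/// An illustrative sketch with arbitrary values:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_set1_epi16(5);
/// let b = _mm256_setr_epi16(
///     -1, 0, 1, -2, 2, 0, -3, 3,
///     -1, 0, 1, -2, 2, 0, -3, 3,
/// );
/// // Negative `b` negates, zero `b` zeroes, positive `b` keeps the value.
/// let r = _mm256_sign_epi16(a, b);
/// let expected = _mm256_setr_epi16(
///     -5, 0, 5, -5, 5, 0, -5, 5,
///     -5, 0, 5, -5, 5, 0, -5, 5,
/// );
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi16(r, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///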
2856 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sign_epi16)
2857 #[inline]
2858 #[target_feature(enable = "avx2")]
2859 #[cfg_attr(test, assert_instr(vpsignw))]
2860 #[stable(feature = "simd_x86", since = "1.27.0")]
2861 pub unsafe fn _mm256_sign_epi16(a: __m256i, b: __m256i) -> __m256i {
2862 transmute(psignw(a.as_i16x16(), b.as_i16x16()))
2863 }
2864
2865 /// Negates packed 32-bit integers in `a` when the corresponding signed
2866 /// 32-bit integer in `b` is negative, and returns the results.
2867 /// Results are zeroed out when the corresponding element in `b` is zero.
2868 ///
2869 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sign_epi32)
2870 #[inline]
2871 #[target_feature(enable = "avx2")]
2872 #[cfg_attr(test, assert_instr(vpsignd))]
2873 #[stable(feature = "simd_x86", since = "1.27.0")]
2874 pub unsafe fn _mm256_sign_epi32(a: __m256i, b: __m256i) -> __m256i {
2875 transmute(psignd(a.as_i32x8(), b.as_i32x8()))
2876 }
2877
2878 /// Negates packed 8-bit integers in `a` when the corresponding signed
2879 /// 8-bit integer in `b` is negative, and returns the results.
2880 /// Results are zeroed out when the corresponding element in `b` is zero.
2881 ///
2882 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sign_epi8)
2883 #[inline]
2884 #[target_feature(enable = "avx2")]
2885 #[cfg_attr(test, assert_instr(vpsignb))]
2886 #[stable(feature = "simd_x86", since = "1.27.0")]
2887 pub unsafe fn _mm256_sign_epi8(a: __m256i, b: __m256i) -> __m256i {
2888 transmute(psignb(a.as_i8x32(), b.as_i8x32()))
2889 }
2890
2891 /// Shifts packed 16-bit integers in `a` left by `count` while
2892 /// shifting in zeros, and returns the result
2893 ///
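/// A brief sketch with an arbitrary shift count; the shift amount is taken from
/// the low 64 bits of `count`:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_set1_epi16(1);
/// let count = _mm_set_epi64x(0, 3);
/// let r = _mm256_sll_epi16(a, count);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi16(r, _mm256_set1_epi16(8))), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///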
2894 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sll_epi16)
2895 #[inline]
2896 #[target_feature(enable = "avx2")]
2897 #[cfg_attr(test, assert_instr(vpsllw))]
2898 #[stable(feature = "simd_x86", since = "1.27.0")]
2899 pub unsafe fn _mm256_sll_epi16(a: __m256i, count: __m128i) -> __m256i {
2900 transmute(psllw(a.as_i16x16(), count.as_i16x8()))
2901 }
2902
2903 /// Shifts packed 32-bit integers in `a` left by `count` while
2904 /// shifting in zeros, and returns the result
2905 ///
2906 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sll_epi32)
2907 #[inline]
2908 #[target_feature(enable = "avx2")]
2909 #[cfg_attr(test, assert_instr(vpslld))]
2910 #[stable(feature = "simd_x86", since = "1.27.0")]
2911 pub unsafe fn _mm256_sll_epi32(a: __m256i, count: __m128i) -> __m256i {
2912 transmute(pslld(a.as_i32x8(), count.as_i32x4()))
2913 }
2914
2915 /// Shifts packed 64-bit integers in `a` left by `count` while
2916 /// shifting in zeros, and returns the result
2917 ///
2918 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sll_epi64)
2919 #[inline]
2920 #[target_feature(enable = "avx2")]
2921 #[cfg_attr(test, assert_instr(vpsllq))]
2922 #[stable(feature = "simd_x86", since = "1.27.0")]
2923 pub unsafe fn _mm256_sll_epi64(a: __m256i, count: __m128i) -> __m256i {
2924 transmute(psllq(a.as_i64x4(), count.as_i64x2()))
2925 }
2926
2927 /// Shifts packed 16-bit integers in `a` left by `imm8` while
2928 /// shifting in zeros, and returns the results.
2929 ///
2930 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_slli_epi16)
2931 #[inline]
2932 #[target_feature(enable = "avx2")]
2933 #[cfg_attr(test, assert_instr(vpsllw))]
2934 #[stable(feature = "simd_x86", since = "1.27.0")]
2935 pub unsafe fn _mm256_slli_epi16(a: __m256i, imm8: i32) -> __m256i {
2936 transmute(pslliw(a.as_i16x16(), imm8))
2937 }
2938
2939 /// Shifts packed 32-bit integers in `a` left by `imm8` while
2940 /// shifting in zeros, and returns the results.
2941 ///
2942 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_slli_epi32)
2943 #[inline]
2944 #[target_feature(enable = "avx2")]
2945 #[cfg_attr(test, assert_instr(vpslld))]
2946 #[stable(feature = "simd_x86", since = "1.27.0")]
2947 pub unsafe fn _mm256_slli_epi32(a: __m256i, imm8: i32) -> __m256i {
2948 transmute(psllid(a.as_i32x8(), imm8))
2949 }
2950
2951 /// Shifts packed 64-bit integers in `a` left by `imm8` while
2952 /// shifting in zeros, and returns the results.
2953 ///
2954 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_slli_epi64)
2955 #[inline]
2956 #[target_feature(enable = "avx2")]
2957 #[cfg_attr(test, assert_instr(vpsllq))]
2958 #[stable(feature = "simd_x86", since = "1.27.0")]
2959 pub unsafe fn _mm256_slli_epi64(a: __m256i, imm8: i32) -> __m256i {
2960 transmute(pslliq(a.as_i64x4(), imm8))
2961 }
2962
2963 /// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros.
2964 ///
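/// A minimal usage sketch (illustrative only, not from Intel's docs; it uses
/// the `(a, imm8)` form declared in this module, an arbitrary 4-byte shift,
/// and assumes runtime AVX2 detection). Note that each 128-bit lane is
/// shifted independently:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
///
/// // Shift each 128-bit lane left by 4 bytes (one 32-bit element).
/// let r = _mm256_slli_si256(a, 4);
///
/// let expected = _mm256_setr_epi32(0, 1, 2, 3, 0, 5, 6, 7);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, expected)), !0);
///
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///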
2965 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_slli_si256)
2966 #[inline]
2967 #[target_feature(enable = "avx2")]
2968 #[cfg_attr(test, assert_instr(vpslldq, imm8 = 3))]
2969 #[rustc_args_required_const(1)]
2970 #[stable(feature = "simd_x86", since = "1.27.0")]
2971 pub unsafe fn _mm256_slli_si256(a: __m256i, imm8: i32) -> __m256i {
2972 let a = a.as_i64x4();
2973 macro_rules! call {
2974 ($imm8:expr) => {
2975 vpslldq(a, $imm8)
2976 };
2977 }
2978 transmute(constify_imm8!(imm8 * 8, call))
2979 }
2980
2981 /// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros.
2982 ///
2983 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bslli_epi128)
2984 #[inline]
2985 #[target_feature(enable = "avx2")]
2986 #[cfg_attr(test, assert_instr(vpslldq, imm8 = 3))]
2987 #[rustc_args_required_const(1)]
2988 #[stable(feature = "simd_x86", since = "1.27.0")]
2989 pub unsafe fn _mm256_bslli_epi128(a: __m256i, imm8: i32) -> __m256i {
2990 let a = a.as_i64x4();
2991 macro_rules! call {
2992 ($imm8:expr) => {
2993 vpslldq(a, $imm8)
2994 };
2995 }
2996 transmute(constify_imm8!(imm8 * 8, call))
2997 }
2998
2999 /// Shifts packed 32-bit integers in `a` left by the amount
3000 /// specified by the corresponding element in `count` while
3001 /// shifting in zeros, and returns the result.
3002 ///
3003 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sllv_epi32)
3004 #[inline]
3005 #[target_feature(enable = "avx2")]
3006 #[cfg_attr(test, assert_instr(vpsllvd))]
3007 #[stable(feature = "simd_x86", since = "1.27.0")]
3008 pub unsafe fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i {
3009 transmute(psllvd(a.as_i32x4(), count.as_i32x4()))
3010 }
3011
3012 /// Shifts packed 32-bit integers in `a` left by the amount
3013 /// specified by the corresponding element in `count` while
3014 /// shifting in zeros, and returns the result.
3015 ///
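/// A minimal usage sketch (illustrative only, not from Intel's docs; the shift
/// counts are arbitrary and runtime AVX2 detection is assumed):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// // Each element is shifted by the matching element of `count`.
/// let a = _mm256_set1_epi32(1);
/// let count = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
///
/// let r = _mm256_sllv_epi32(a, count);
///
/// let expected = _mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 128);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, expected)), !0);
///
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///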
3016 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sllv_epi32)
3017 #[inline]
3018 #[target_feature(enable = "avx2")]
3019 #[cfg_attr(test, assert_instr(vpsllvd))]
3020 #[stable(feature = "simd_x86", since = "1.27.0")]
3021 pub unsafe fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i {
3022 transmute(psllvd256(a.as_i32x8(), count.as_i32x8()))
3023 }
3024
3025 /// Shifts packed 64-bit integers in `a` left by the amount
3026 /// specified by the corresponding element in `count` while
3027 /// shifting in zeros, and returns the result.
3028 ///
3029 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sllv_epi64)
3030 #[inline]
3031 #[target_feature(enable = "avx2")]
3032 #[cfg_attr(test, assert_instr(vpsllvq))]
3033 #[stable(feature = "simd_x86", since = "1.27.0")]
3034 pub unsafe fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i {
3035 transmute(psllvq(a.as_i64x2(), count.as_i64x2()))
3036 }
3037
3038 /// Shifts packed 64-bit integers in `a` left by the amount
3039 /// specified by the corresponding element in `count` while
3040 /// shifting in zeros, and returns the result.
3041 ///
3042 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sllv_epi64)
3043 #[inline]
3044 #[target_feature(enable = "avx2")]
3045 #[cfg_attr(test, assert_instr(vpsllvq))]
3046 #[stable(feature = "simd_x86", since = "1.27.0")]
3047 pub unsafe fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i {
3048 transmute(psllvq256(a.as_i64x4(), count.as_i64x4()))
3049 }
3050
3051 /// Shifts packed 16-bit integers in `a` right by `count` while
3052 /// shifting in sign bits.
3053 ///
3054 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sra_epi16)
3055 #[inline]
3056 #[target_feature(enable = "avx2")]
3057 #[cfg_attr(test, assert_instr(vpsraw))]
3058 #[stable(feature = "simd_x86", since = "1.27.0")]
3059 pub unsafe fn _mm256_sra_epi16(a: __m256i, count: __m128i) -> __m256i {
3060 transmute(psraw(a.as_i16x16(), count.as_i16x8()))
3061 }
3062
3063 /// Shifts packed 32-bit integers in `a` right by `count` while
3064 /// shifting in sign bits.
3065 ///
3066 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sra_epi32)
3067 #[inline]
3068 #[target_feature(enable = "avx2")]
3069 #[cfg_attr(test, assert_instr(vpsrad))]
3070 #[stable(feature = "simd_x86", since = "1.27.0")]
3071 pub unsafe fn _mm256_sra_epi32(a: __m256i, count: __m128i) -> __m256i {
3072 transmute(psrad(a.as_i32x8(), count.as_i32x4()))
3073 }
3074
3075 /// Shifts packed 16-bit integers in `a` right by `imm8` while
3076 /// shifting in sign bits.
3077 ///
3078 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srai_epi16)
3079 #[inline]
3080 #[target_feature(enable = "avx2")]
3081 #[cfg_attr(test, assert_instr(vpsraw))]
3082 #[stable(feature = "simd_x86", since = "1.27.0")]
3083 pub unsafe fn _mm256_srai_epi16(a: __m256i, imm8: i32) -> __m256i {
3084 transmute(psraiw(a.as_i16x16(), imm8))
3085 }
3086
3087 /// Shifts packed 32-bit integers in `a` right by `imm8` while
3088 /// shifting in sign bits.
3089 ///
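/// A minimal usage sketch (illustrative only, not from Intel's docs; it uses
/// the `(a, imm8)` form declared in this module and assumes runtime AVX2
/// detection):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// // Arithmetic right shifts replicate the sign bit.
/// let a = _mm256_set1_epi32(-16);
///
/// let r = _mm256_srai_epi32(a, 2);
///
/// let expected = _mm256_set1_epi32(-4);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, expected)), !0);
///
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///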
3090 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srai_epi32)
3091 #[inline]
3092 #[target_feature(enable = "avx2")]
3093 #[cfg_attr(test, assert_instr(vpsrad))]
3094 #[stable(feature = "simd_x86", since = "1.27.0")]
3095 pub unsafe fn _mm256_srai_epi32(a: __m256i, imm8: i32) -> __m256i {
3096 transmute(psraid(a.as_i32x8(), imm8))
3097 }
3098
3099 /// Shifts packed 32-bit integers in `a` right by the amount specified by the
3100 /// corresponding element in `count` while shifting in sign bits.
3101 ///
3102 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srav_epi32)
3103 #[inline]
3104 #[target_feature(enable = "avx2")]
3105 #[cfg_attr(test, assert_instr(vpsravd))]
3106 #[stable(feature = "simd_x86", since = "1.27.0")]
3107 pub unsafe fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i {
3108 transmute(psravd(a.as_i32x4(), count.as_i32x4()))
3109 }
3110
3111 /// Shifts packed 32-bit integers in `a` right by the amount specified by the
3112 /// corresponding element in `count` while shifting in sign bits.
3113 ///
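/// A minimal usage sketch (illustrative only, not from Intel's docs; operand
/// values are arbitrary and runtime AVX2 detection is assumed):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_setr_epi32(-8, -8, -8, -8, 8, 8, 8, 8);
/// let count = _mm256_setr_epi32(0, 1, 2, 3, 0, 1, 2, 3);
///
/// let r = _mm256_srav_epi32(a, count);
///
/// // Sign bits are shifted in, so negative elements stay negative.
/// let expected = _mm256_setr_epi32(-8, -4, -2, -1, 8, 4, 2, 1);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, expected)), !0);
///
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///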
3114 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srav_epi32)
3115 #[inline]
3116 #[target_feature(enable = "avx2")]
3117 #[cfg_attr(test, assert_instr(vpsravd))]
3118 #[stable(feature = "simd_x86", since = "1.27.0")]
3119 pub unsafe fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i {
3120 transmute(psravd256(a.as_i32x8(), count.as_i32x8()))
3121 }
3122
3123 /// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros.
3124 ///
3125 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srli_si256)
3126 #[inline]
3127 #[target_feature(enable = "avx2")]
3128 #[cfg_attr(test, assert_instr(vpsrldq, imm8 = 3))]
3129 #[rustc_args_required_const(1)]
3130 #[stable(feature = "simd_x86", since = "1.27.0")]
3131 pub unsafe fn _mm256_srli_si256(a: __m256i, imm8: i32) -> __m256i {
3132 let a = a.as_i64x4();
3133 macro_rules! call {
3134 ($imm8:expr) => {
3135 vpsrldq(a, $imm8)
3136 };
3137 }
3138 transmute(constify_imm8!(imm8 * 8, call))
3139 }
3140
3141 /// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros.
3142 ///
3143 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bsrli_epi128)
3144 #[inline]
3145 #[target_feature(enable = "avx2")]
3146 #[cfg_attr(test, assert_instr(vpsrldq, imm8 = 3))]
3147 #[rustc_args_required_const(1)]
3148 #[stable(feature = "simd_x86", since = "1.27.0")]
3149 pub unsafe fn _mm256_bsrli_epi128(a: __m256i, imm8: i32) -> __m256i {
3150 let a = a.as_i64x4();
3151 macro_rules! call {
3152 ($imm8:expr) => {
3153 vpsrldq(a, $imm8)
3154 };
3155 }
3156 transmute(constify_imm8!(imm8 * 8, call))
3157 }
3158
3159 /// Shifts packed 16-bit integers in `a` right by `count` while shifting in
3160 /// zeros.
3161 ///
3162 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srl_epi16)
3163 #[inline]
3164 #[target_feature(enable = "avx2")]
3165 #[cfg_attr(test, assert_instr(vpsrlw))]
3166 #[stable(feature = "simd_x86", since = "1.27.0")]
3167 pub unsafe fn _mm256_srl_epi16(a: __m256i, count: __m128i) -> __m256i {
3168 transmute(psrlw(a.as_i16x16(), count.as_i16x8()))
3169 }
3170
3171 /// Shifts packed 32-bit integers in `a` right by `count` while shifting in
3172 /// zeros.
3173 ///
3174 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srl_epi32)
3175 #[inline]
3176 #[target_feature(enable = "avx2")]
3177 #[cfg_attr(test, assert_instr(vpsrld))]
3178 #[stable(feature = "simd_x86", since = "1.27.0")]
3179 pub unsafe fn _mm256_srl_epi32(a: __m256i, count: __m128i) -> __m256i {
3180 transmute(psrld(a.as_i32x8(), count.as_i32x4()))
3181 }
3182
3183 /// Shifts packed 64-bit integers in `a` right by `count` while shifting in
3184 /// zeros.
3185 ///
3186 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srl_epi64)
3187 #[inline]
3188 #[target_feature(enable = "avx2")]
3189 #[cfg_attr(test, assert_instr(vpsrlq))]
3190 #[stable(feature = "simd_x86", since = "1.27.0")]
3191 pub unsafe fn _mm256_srl_epi64(a: __m256i, count: __m128i) -> __m256i {
3192 transmute(psrlq(a.as_i64x4(), count.as_i64x2()))
3193 }
3194
3195 /// Shifts packed 16-bit integers in `a` right by `imm8` while shifting in
3196 /// zeros.
3197 ///
3198 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srli_epi16)
3199 #[inline]
3200 #[target_feature(enable = "avx2")]
3201 #[cfg_attr(test, assert_instr(vpsrlw))]
3202 #[stable(feature = "simd_x86", since = "1.27.0")]
3203 pub unsafe fn _mm256_srli_epi16(a: __m256i, imm8: i32) -> __m256i {
3204 transmute(psrliw(a.as_i16x16(), imm8))
3205 }
3206
3207 /// Shifts packed 32-bit integers in `a` right by `imm8` while shifting in
3208 /// zeros.
3209 ///
3210 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srli_epi32)
3211 #[inline]
3212 #[target_feature(enable = "avx2")]
3213 #[cfg_attr(test, assert_instr(vpsrld))]
3214 #[stable(feature = "simd_x86", since = "1.27.0")]
3215 pub unsafe fn _mm256_srli_epi32(a: __m256i, imm8: i32) -> __m256i {
3216 transmute(psrlid(a.as_i32x8(), imm8))
3217 }
3218
3219 /// Shifts packed 64-bit integers in `a` right by `imm8` while shifting in
3220 /// zeros.
3221 ///
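/// A minimal usage sketch (illustrative only, not from Intel's docs; it uses
/// the `(a, imm8)` form declared in this module and assumes runtime AVX2
/// detection):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// // A logical shift fills the vacated bits with zeros, even for
/// // negative inputs.
/// let a = _mm256_set1_epi64x(-1);
///
/// let r = _mm256_srli_epi64(a, 60);
///
/// let expected = _mm256_set1_epi64x(0xF);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi64(r, expected)), !0);
///
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///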
3222 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srli_epi64)
3223 #[inline]
3224 #[target_feature(enable = "avx2")]
3225 #[cfg_attr(test, assert_instr(vpsrlq))]
3226 #[stable(feature = "simd_x86", since = "1.27.0")]
3227 pub unsafe fn _mm256_srli_epi64(a: __m256i, imm8: i32) -> __m256i {
3228 transmute(psrliq(a.as_i64x4(), imm8))
3229 }
3230
3231 /// Shifts packed 32-bit integers in `a` right by the amount specified by
3232 /// the corresponding element in `count` while shifting in zeros.
3233 ///
3234 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srlv_epi32)
3235 #[inline]
3236 #[target_feature(enable = "avx2")]
3237 #[cfg_attr(test, assert_instr(vpsrlvd))]
3238 #[stable(feature = "simd_x86", since = "1.27.0")]
3239 pub unsafe fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i {
3240 transmute(psrlvd(a.as_i32x4(), count.as_i32x4()))
3241 }
3242
3243 /// Shifts packed 32-bit integers in `a` right by the amount specified by
3244 /// the corresponding element in `count` while shifting in zeros.
3245 ///
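/// A minimal usage sketch (illustrative only, not from Intel's docs; the shift
/// counts are arbitrary and runtime AVX2 detection is assumed):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_set1_epi32(256);
/// let count = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
///
/// let r = _mm256_srlv_epi32(a, count);
///
/// let expected = _mm256_setr_epi32(256, 128, 64, 32, 16, 8, 4, 2);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, expected)), !0);
///
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///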
3246 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srlv_epi32)
3247 #[inline]
3248 #[target_feature(enable = "avx2")]
3249 #[cfg_attr(test, assert_instr(vpsrlvd))]
3250 #[stable(feature = "simd_x86", since = "1.27.0")]
3251 pub unsafe fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i {
3252 transmute(psrlvd256(a.as_i32x8(), count.as_i32x8()))
3253 }
3254
3255 /// Shifts packed 64-bit integers in `a` right by the amount specified by
3256 /// the corresponding element in `count` while shifting in zeros.
3257 ///
3258 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srlv_epi64)
3259 #[inline]
3260 #[target_feature(enable = "avx2")]
3261 #[cfg_attr(test, assert_instr(vpsrlvq))]
3262 #[stable(feature = "simd_x86", since = "1.27.0")]
3263 pub unsafe fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i {
3264 transmute(psrlvq(a.as_i64x2(), count.as_i64x2()))
3265 }
3266
3267 /// Shifts packed 64-bit integers in `a` right by the amount specified by
3268 /// the corresponding element in `count` while shifting in zeros.
3269 ///
3270 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srlv_epi64)
3271 #[inline]
3272 #[target_feature(enable = "avx2")]
3273 #[cfg_attr(test, assert_instr(vpsrlvq))]
3274 #[stable(feature = "simd_x86", since = "1.27.0")]
3275 pub unsafe fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i {
3276 transmute(psrlvq256(a.as_i64x4(), count.as_i64x4()))
3277 }
3278
3279 // TODO _mm256_stream_load_si256 (__m256i const* mem_addr)
3280
3281 /// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`.
3282 ///
3283 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_epi16)
3284 #[inline]
3285 #[target_feature(enable = "avx2")]
3286 #[cfg_attr(test, assert_instr(vpsubw))]
3287 #[stable(feature = "simd_x86", since = "1.27.0")]
3288 pub unsafe fn _mm256_sub_epi16(a: __m256i, b: __m256i) -> __m256i {
3289 transmute(simd_sub(a.as_i16x16(), b.as_i16x16()))
3290 }
3291
3292 /// Subtracts packed 32-bit integers in `b` from packed 32-bit integers in `a`.
3293 ///
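/// A minimal usage sketch (illustrative only, not from Intel's docs; operand
/// values are arbitrary and runtime AVX2 detection is assumed):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_setr_epi32(10, 20, 30, 40, 50, 60, 70, 80);
/// let b = _mm256_set1_epi32(5);
///
/// let r = _mm256_sub_epi32(a, b);
///
/// let expected = _mm256_setr_epi32(5, 15, 25, 35, 45, 55, 65, 75);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, expected)), !0);
///
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///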
3294 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_epi32)
3295 #[inline]
3296 #[target_feature(enable = "avx2")]
3297 #[cfg_attr(test, assert_instr(vpsubd))]
3298 #[stable(feature = "simd_x86", since = "1.27.0")]
3299 pub unsafe fn _mm256_sub_epi32(a: __m256i, b: __m256i) -> __m256i {
3300 transmute(simd_sub(a.as_i32x8(), b.as_i32x8()))
3301 }
3302
3303 /// Subtracts packed 64-bit integers in `b` from packed 64-bit integers in `a`.
3304 ///
3305 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_epi64)
3306 #[inline]
3307 #[target_feature(enable = "avx2")]
3308 #[cfg_attr(test, assert_instr(vpsubq))]
3309 #[stable(feature = "simd_x86", since = "1.27.0")]
3310 pub unsafe fn _mm256_sub_epi64(a: __m256i, b: __m256i) -> __m256i {
3311 transmute(simd_sub(a.as_i64x4(), b.as_i64x4()))
3312 }
3313
3314 /// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`.
3315 ///
3316 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_epi8)
3317 #[inline]
3318 #[target_feature(enable = "avx2")]
3319 #[cfg_attr(test, assert_instr(vpsubb))]
3320 #[stable(feature = "simd_x86", since = "1.27.0")]
3321 pub unsafe fn _mm256_sub_epi8(a: __m256i, b: __m256i) -> __m256i {
3322 transmute(simd_sub(a.as_i8x32(), b.as_i8x32()))
3323 }
3324
3325 /// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in
3326 /// `a` using saturation.
3327 ///
3328 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_subs_epi16)
3329 #[inline]
3330 #[target_feature(enable = "avx2")]
3331 #[cfg_attr(test, assert_instr(vpsubsw))]
3332 #[stable(feature = "simd_x86", since = "1.27.0")]
3333 pub unsafe fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i {
3334 transmute(simd_saturating_sub(a.as_i16x16(), b.as_i16x16()))
3335 }
3336
3337 /// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in
3338 /// `a` using saturation.
3339 ///
3340 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_subs_epi8)
3341 #[inline]
3342 #[target_feature(enable = "avx2")]
3343 #[cfg_attr(test, assert_instr(vpsubsb))]
3344 #[stable(feature = "simd_x86", since = "1.27.0")]
3345 pub unsafe fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i {
3346 transmute(simd_saturating_sub(a.as_i8x32(), b.as_i8x32()))
3347 }
3348
3349 /// Subtracts packed unsigned 16-bit integers in `b` from packed unsigned
3350 /// 16-bit integers in `a` using saturation.
3351 ///
3352 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_subs_epu16)
3353 #[inline]
3354 #[target_feature(enable = "avx2")]
3355 #[cfg_attr(test, assert_instr(vpsubusw))]
3356 #[stable(feature = "simd_x86", since = "1.27.0")]
3357 pub unsafe fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> __m256i {
3358 transmute(simd_saturating_sub(a.as_u16x16(), b.as_u16x16()))
3359 }
3360
3361 /// Subtracts packed unsigned 8-bit integers in `b` from packed unsigned
3362 /// 8-bit integers in `a` using saturation.
3363 ///
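/// A minimal usage sketch (illustrative only, not from Intel's docs; operand
/// values are arbitrary and runtime AVX2 detection is assumed):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// // 10 - 20 would wrap around for an unsigned byte, so the result
/// // saturates to 0 instead.
/// let a = _mm256_set1_epi8(10);
/// let b = _mm256_set1_epi8(20);
///
/// let r = _mm256_subs_epu8(a, b);
///
/// let expected = _mm256_set1_epi8(0);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
///
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///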
3364 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_subs_epu8)
3365 #[inline]
3366 #[target_feature(enable = "avx2")]
3367 #[cfg_attr(test, assert_instr(vpsubusb))]
3368 #[stable(feature = "simd_x86", since = "1.27.0")]
3369 pub unsafe fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i {
3370 transmute(simd_saturating_sub(a.as_u8x32(), b.as_u8x32()))
3371 }
3372
3373 /// Unpacks and interleaves 8-bit integers from the high half of each
3374 /// 128-bit lane of `a` and `b`.
3375 ///
3376 /// ```rust
3377 /// #[cfg(target_arch = "x86")]
3378 /// use std::arch::x86::*;
3379 /// #[cfg(target_arch = "x86_64")]
3380 /// use std::arch::x86_64::*;
3381 ///
3382 /// # fn main() {
3383 /// # if is_x86_feature_detected!("avx2") {
3384 /// # #[target_feature(enable = "avx2")]
3385 /// # unsafe fn worker() {
3386 /// let a = _mm256_setr_epi8(
3387 /// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
3388 /// 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3389 /// );
3390 /// let b = _mm256_setr_epi8(
3391 /// 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3392 /// -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
3393 /// -30, -31,
3394 /// );
3395 ///
3396 /// let c = _mm256_unpackhi_epi8(a, b);
3397 ///
3398 /// let expected = _mm256_setr_epi8(
3399 /// 8, -8, 9, -9, 10, -10, 11, -11, 12, -12, 13, -13, 14, -14, 15, -15,
3400 /// 24, -24, 25, -25, 26, -26, 27, -27, 28, -28, 29, -29, 30, -30, 31,
3401 /// -31,
3402 /// );
3403 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3404 ///
3405 /// # }
3406 /// # unsafe { worker(); }
3407 /// # }
3408 /// # }
3409 /// ```
3410 ///
3411 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_epi8)
3412 #[inline]
3413 #[target_feature(enable = "avx2")]
3414 #[cfg_attr(test, assert_instr(vpunpckhbw))]
3415 #[stable(feature = "simd_x86", since = "1.27.0")]
3416 pub unsafe fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i {
3417 #[rustfmt::skip]
3418 let r: i8x32 = simd_shuffle32(a.as_i8x32(), b.as_i8x32(), [
3419 8, 40, 9, 41, 10, 42, 11, 43,
3420 12, 44, 13, 45, 14, 46, 15, 47,
3421 24, 56, 25, 57, 26, 58, 27, 59,
3422 28, 60, 29, 61, 30, 62, 31, 63,
3423 ]);
3424 transmute(r)
3425 }
3426
3427 /// Unpacks and interleaves 8-bit integers from the low half of each
3428 /// 128-bit lane of `a` and `b`.
3429 ///
3430 /// ```rust
3431 /// #[cfg(target_arch = "x86")]
3432 /// use std::arch::x86::*;
3433 /// #[cfg(target_arch = "x86_64")]
3434 /// use std::arch::x86_64::*;
3435 ///
3436 /// # fn main() {
3437 /// # if is_x86_feature_detected!("avx2") {
3438 /// # #[target_feature(enable = "avx2")]
3439 /// # unsafe fn worker() {
3440 /// let a = _mm256_setr_epi8(
3441 /// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
3442 /// 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3443 /// );
3444 /// let b = _mm256_setr_epi8(
3445 /// 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3446 /// -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
3447 /// -30, -31,
3448 /// );
3449 ///
3450 /// let c = _mm256_unpacklo_epi8(a, b);
3451 ///
3452 /// let expected = _mm256_setr_epi8(
3453 /// 0, 0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7, 16, -16, 17,
3454 /// -17, 18, -18, 19, -19, 20, -20, 21, -21, 22, -22, 23, -23,
3455 /// );
3456 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3457 ///
3458 /// # }
3459 /// # unsafe { worker(); }
3460 /// # }
3461 /// # }
3462 /// ```
3463 ///
3464 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_epi8)
3465 #[inline]
3466 #[target_feature(enable = "avx2")]
3467 #[cfg_attr(test, assert_instr(vpunpcklbw))]
3468 #[stable(feature = "simd_x86", since = "1.27.0")]
3469 pub unsafe fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i {
3470 #[rustfmt::skip]
3471 let r: i8x32 = simd_shuffle32(a.as_i8x32(), b.as_i8x32(), [
3472 0, 32, 1, 33, 2, 34, 3, 35,
3473 4, 36, 5, 37, 6, 38, 7, 39,
3474 16, 48, 17, 49, 18, 50, 19, 51,
3475 20, 52, 21, 53, 22, 54, 23, 55,
3476 ]);
3477 transmute(r)
3478 }
3479
3480 /// Unpacks and interleaves 16-bit integers from the high half of each
3481 /// 128-bit lane of `a` and `b`.
3482 ///
3483 /// ```rust
3484 /// #[cfg(target_arch = "x86")]
3485 /// use std::arch::x86::*;
3486 /// #[cfg(target_arch = "x86_64")]
3487 /// use std::arch::x86_64::*;
3488 ///
3489 /// # fn main() {
3490 /// # if is_x86_feature_detected!("avx2") {
3491 /// # #[target_feature(enable = "avx2")]
3492 /// # unsafe fn worker() {
3493 /// let a = _mm256_setr_epi16(
3494 /// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3495 /// );
3496 /// let b = _mm256_setr_epi16(
3497 /// 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3498 /// );
3499 ///
3500 /// let c = _mm256_unpackhi_epi16(a, b);
3501 ///
3502 /// let expected = _mm256_setr_epi16(
3503 /// 4, -4, 5, -5, 6, -6, 7, -7, 12, -12, 13, -13, 14, -14, 15, -15,
3504 /// );
3505 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3506 ///
3507 /// # }
3508 /// # unsafe { worker(); }
3509 /// # }
3510 /// # }
3511 /// ```
3512 ///
3513 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_epi16)
3514 #[inline]
3515 #[target_feature(enable = "avx2")]
3516 #[cfg_attr(test, assert_instr(vpunpckhwd))]
3517 #[stable(feature = "simd_x86", since = "1.27.0")]
3518 pub unsafe fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i {
3519 let r: i16x16 = simd_shuffle16(
3520 a.as_i16x16(),
3521 b.as_i16x16(),
3522 [4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31],
3523 );
3524 transmute(r)
3525 }
3526
3527 /// Unpacks and interleaves 16-bit integers from the low half of each
3528 /// 128-bit lane of `a` and `b`.
3529 ///
3530 /// ```rust
3531 /// #[cfg(target_arch = "x86")]
3532 /// use std::arch::x86::*;
3533 /// #[cfg(target_arch = "x86_64")]
3534 /// use std::arch::x86_64::*;
3535 ///
3536 /// # fn main() {
3537 /// # if is_x86_feature_detected!("avx2") {
3538 /// # #[target_feature(enable = "avx2")]
3539 /// # unsafe fn worker() {
3540 ///
3541 /// let a = _mm256_setr_epi16(
3542 /// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3543 /// );
3544 /// let b = _mm256_setr_epi16(
3545 /// 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3546 /// );
3547 ///
3548 /// let c = _mm256_unpacklo_epi16(a, b);
3549 ///
3550 /// let expected = _mm256_setr_epi16(
3551 /// 0, 0, 1, -1, 2, -2, 3, -3, 8, -8, 9, -9, 10, -10, 11, -11,
3552 /// );
3553 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3554 ///
3555 /// # }
3556 /// # unsafe { worker(); }
3557 /// # }
3558 /// # }
3559 /// ```
3560 ///
3561 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_epi16)
3562 #[inline]
3563 #[target_feature(enable = "avx2")]
3564 #[cfg_attr(test, assert_instr(vpunpcklwd))]
3565 #[stable(feature = "simd_x86", since = "1.27.0")]
3566 pub unsafe fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i {
3567 let r: i16x16 = simd_shuffle16(
3568 a.as_i16x16(),
3569 b.as_i16x16(),
3570 [0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27],
3571 );
3572 transmute(r)
3573 }
3574
3575 /// Unpacks and interleaves 32-bit integers from the high half of each
3576 /// 128-bit lane of `a` and `b`.
3577 ///
3578 /// ```rust
3579 /// #[cfg(target_arch = "x86")]
3580 /// use std::arch::x86::*;
3581 /// #[cfg(target_arch = "x86_64")]
3582 /// use std::arch::x86_64::*;
3583 ///
3584 /// # fn main() {
3585 /// # if is_x86_feature_detected!("avx2") {
3586 /// # #[target_feature(enable = "avx2")]
3587 /// # unsafe fn worker() {
3588 /// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
3589 /// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7);
3590 ///
3591 /// let c = _mm256_unpackhi_epi32(a, b);
3592 ///
3593 /// let expected = _mm256_setr_epi32(2, -2, 3, -3, 6, -6, 7, -7);
3594 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3595 ///
3596 /// # }
3597 /// # unsafe { worker(); }
3598 /// # }
3599 /// # }
3600 /// ```
3601 ///
3602 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_epi32)
3603 #[inline]
3604 #[target_feature(enable = "avx2")]
3605 #[cfg_attr(test, assert_instr(vunpckhps))]
3606 #[stable(feature = "simd_x86", since = "1.27.0")]
3607 pub unsafe fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i {
3608 let r: i32x8 = simd_shuffle8(a.as_i32x8(), b.as_i32x8(), [2, 10, 3, 11, 6, 14, 7, 15]);
3609 transmute(r)
3610 }
3611
3612 /// Unpacks and interleaves 32-bit integers from the low half of each
3613 /// 128-bit lane of `a` and `b`.
3614 ///
3615 /// ```rust
3616 /// #[cfg(target_arch = "x86")]
3617 /// use std::arch::x86::*;
3618 /// #[cfg(target_arch = "x86_64")]
3619 /// use std::arch::x86_64::*;
3620 ///
3621 /// # fn main() {
3622 /// # if is_x86_feature_detected!("avx2") {
3623 /// # #[target_feature(enable = "avx2")]
3624 /// # unsafe fn worker() {
3625 /// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
3626 /// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7);
3627 ///
3628 /// let c = _mm256_unpacklo_epi32(a, b);
3629 ///
3630 /// let expected = _mm256_setr_epi32(0, 0, 1, -1, 4, -4, 5, -5);
3631 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3632 ///
3633 /// # }
3634 /// # unsafe { worker(); }
3635 /// # }
3636 /// # }
3637 /// ```
3638 ///
3639 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_epi32)
3640 #[inline]
3641 #[target_feature(enable = "avx2")]
3642 #[cfg_attr(test, assert_instr(vunpcklps))]
3643 #[stable(feature = "simd_x86", since = "1.27.0")]
3644 pub unsafe fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i {
3645 let r: i32x8 = simd_shuffle8(a.as_i32x8(), b.as_i32x8(), [0, 8, 1, 9, 4, 12, 5, 13]);
3646 transmute(r)
3647 }
3648
3649 /// Unpacks and interleaves 64-bit integers from the high half of each
3650 /// 128-bit lane of `a` and `b`.
3651 ///
3652 /// ```rust
3653 /// #[cfg(target_arch = "x86")]
3654 /// use std::arch::x86::*;
3655 /// #[cfg(target_arch = "x86_64")]
3656 /// use std::arch::x86_64::*;
3657 ///
3658 /// # fn main() {
3659 /// # if is_x86_feature_detected!("avx2") {
3660 /// # #[target_feature(enable = "avx2")]
3661 /// # unsafe fn worker() {
3662 /// let a = _mm256_setr_epi64x(0, 1, 2, 3);
3663 /// let b = _mm256_setr_epi64x(0, -1, -2, -3);
3664 ///
3665 /// let c = _mm256_unpackhi_epi64(a, b);
3666 ///
3667 /// let expected = _mm256_setr_epi64x(1, -1, 3, -3);
3668 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3669 ///
3670 /// # }
3671 /// # unsafe { worker(); }
3672 /// # }
3673 /// # }
3674 /// ```
3675 ///
3676 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_epi64)
3677 #[inline]
3678 #[target_feature(enable = "avx2")]
3679 #[cfg_attr(test, assert_instr(vunpckhpd))]
3680 #[stable(feature = "simd_x86", since = "1.27.0")]
3681 pub unsafe fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i {
3682 let r: i64x4 = simd_shuffle4(a.as_i64x4(), b.as_i64x4(), [1, 5, 3, 7]);
3683 transmute(r)
3684 }
3685
3686 /// Unpacks and interleaves 64-bit integers from the low half of each
3687 /// 128-bit lane of `a` and `b`.
3688 ///
3689 /// ```rust
3690 /// #[cfg(target_arch = "x86")]
3691 /// use std::arch::x86::*;
3692 /// #[cfg(target_arch = "x86_64")]
3693 /// use std::arch::x86_64::*;
3694 ///
3695 /// # fn main() {
3696 /// # if is_x86_feature_detected!("avx2") {
3697 /// # #[target_feature(enable = "avx2")]
3698 /// # unsafe fn worker() {
3699 /// let a = _mm256_setr_epi64x(0, 1, 2, 3);
3700 /// let b = _mm256_setr_epi64x(0, -1, -2, -3);
3701 ///
3702 /// let c = _mm256_unpacklo_epi64(a, b);
3703 ///
3704 /// let expected = _mm256_setr_epi64x(0, 0, 2, -2);
3705 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3706 ///
3707 /// # }
3708 /// # unsafe { worker(); }
3709 /// # }
3710 /// # }
3711 /// ```
3712 ///
3713 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_epi64)
3714 #[inline]
3715 #[target_feature(enable = "avx2")]
3716 #[cfg_attr(test, assert_instr(vunpcklpd))]
3717 #[stable(feature = "simd_x86", since = "1.27.0")]
3718 pub unsafe fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i {
3719 let r: i64x4 = simd_shuffle4(a.as_i64x4(), b.as_i64x4(), [0, 4, 2, 6]);
3720 transmute(r)
3721 }
3722
3723 /// Computes the bitwise XOR of 256 bits (representing integer data)
3724 /// in `a` and `b`.
3725 ///
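/// A minimal usage sketch (illustrative only, not from Intel's docs; operand
/// values are arbitrary and runtime AVX2 detection is assumed):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_set1_epi8(0b0101);
/// let b = _mm256_set1_epi8(0b0011);
///
/// let r = _mm256_xor_si256(a, b);
///
/// let expected = _mm256_set1_epi8(0b0110);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
///
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///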
3726 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_xor_si256)
3727 #[inline]
3728 #[target_feature(enable = "avx2")]
3729 #[cfg_attr(test, assert_instr(vxorps))]
3730 #[stable(feature = "simd_x86", since = "1.27.0")]
3731 pub unsafe fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i {
3732 transmute(simd_xor(a.as_i64x4(), b.as_i64x4()))
3733 }
3734
3735 /// Extracts an 8-bit integer from `a`, selected with `imm8`. Returns a 32-bit
3736 /// integer containing the zero-extended integer data.
3737 ///
3738 /// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
3739 ///
3740 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extract_epi8)
3741 #[inline]
3742 #[target_feature(enable = "avx2")]
3743 // This intrinsic has no corresponding instruction.
3744 #[rustc_args_required_const(1)]
3745 #[stable(feature = "simd_x86", since = "1.27.0")]
3746 pub unsafe fn _mm256_extract_epi8(a: __m256i, imm8: i32) -> i32 {
3747 let a = a.as_u8x32();
3748 macro_rules! call {
3749 ($imm5:expr) => {
3750 simd_extract::<_, u8>(a, $imm5) as i32
3751 };
3752 }
3753 constify_imm5!(imm8, call)
3754 }
3755
3756 /// Extracts a 16-bit integer from `a`, selected with `imm8`. Returns a 32-bit
3757 /// integer containing the zero-extended integer data.
3758 ///
3759 /// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
3760 ///
3761 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extract_epi16)
3762 #[inline]
3763 #[target_feature(enable = "avx2")]
3764 // This intrinsic has no corresponding instruction.
3765 #[rustc_args_required_const(1)]
3766 #[stable(feature = "simd_x86", since = "1.27.0")]
3767 pub unsafe fn _mm256_extract_epi16(a: __m256i, imm8: i32) -> i32 {
3768 let a = a.as_u16x16();
3769 macro_rules! call {
3770 ($imm4:expr) => {
3771 simd_extract::<_, u16>(a, $imm4) as i32
3772 };
3773 }
3774 constify_imm4!((imm8 & 15), call)
3775 }
3776
3777 /// Extracts a 32-bit integer from `a`, selected with `imm8`.
3778 ///
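/// A minimal usage sketch (illustrative only, not from Intel's docs; it uses
/// the `(a, imm8)` form declared in this module and assumes runtime AVX2
/// detection):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_setr_epi32(0, 10, 20, 30, 40, 50, 60, 70);
///
/// // Extract the element at index 3.
/// assert_eq!(_mm256_extract_epi32(a, 3), 30);
///
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///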
3779 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extract_epi32)
3780 #[inline]
3781 #[target_feature(enable = "avx2")]
3782 // This intrinsic has no corresponding instruction.
3783 #[rustc_args_required_const(1)]
3784 #[stable(feature = "simd_x86", since = "1.27.0")]
3785 pub unsafe fn _mm256_extract_epi32(a: __m256i, imm8: i32) -> i32 {
3786 let a = a.as_i32x8();
3787 macro_rules! call {
3788 ($imm3:expr) => {
3789 simd_extract(a, $imm3)
3790 };
3791 }
3792 constify_imm3!((imm8 & 7), call)
3793 }
3794
3795 /// Returns the first element of the input vector of `[4 x double]`.
3796 ///
3797 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsd_f64)
3798 #[inline]
3799 #[target_feature(enable = "avx2")]
3800 //#[cfg_attr(test, assert_instr(movsd))] FIXME
3801 #[stable(feature = "simd_x86", since = "1.27.0")]
3802 pub unsafe fn _mm256_cvtsd_f64(a: __m256d) -> f64 {
3803 simd_extract(a, 0)
3804 }
3805
3806 /// Returns the first element of the input vector of `[8 x i32]`.
3807 ///
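/// A minimal usage sketch (illustrative only, not from Intel's docs; operand
/// values are arbitrary and runtime AVX2 detection is assumed):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_setr_epi32(9, 1, 2, 3, 4, 5, 6, 7);
///
/// // The lowest 32-bit element is returned.
/// assert_eq!(_mm256_cvtsi256_si32(a), 9);
///
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///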
3808 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsi256_si32)
3809 #[inline]
3810 #[target_feature(enable = "avx2")]
3811 //#[cfg_attr(test, assert_instr(movd))] FIXME
3812 #[stable(feature = "simd_x86", since = "1.27.0")]
3813 pub unsafe fn _mm256_cvtsi256_si32(a: __m256i) -> i32 {
3814 simd_extract(a.as_i32x8(), 0)
3815 }
3816
3817 #[allow(improper_ctypes)]
3818 extern "C" {
3819 #[link_name = "llvm.x86.avx2.pabs.b"]
3820 fn pabsb(a: i8x32) -> u8x32;
3821 #[link_name = "llvm.x86.avx2.pabs.w"]
3822 fn pabsw(a: i16x16) -> u16x16;
3823 #[link_name = "llvm.x86.avx2.pabs.d"]
3824 fn pabsd(a: i32x8) -> u32x8;
3825 #[link_name = "llvm.x86.avx2.pavg.b"]
3826 fn pavgb(a: u8x32, b: u8x32) -> u8x32;
3827 #[link_name = "llvm.x86.avx2.pavg.w"]
3828 fn pavgw(a: u16x16, b: u16x16) -> u16x16;
3829 #[link_name = "llvm.x86.avx2.pblendvb"]
3830 fn pblendvb(a: i8x32, b: i8x32, mask: i8x32) -> i8x32;
3831 #[link_name = "llvm.x86.avx2.phadd.w"]
3832 fn phaddw(a: i16x16, b: i16x16) -> i16x16;
3833 #[link_name = "llvm.x86.avx2.phadd.d"]
3834 fn phaddd(a: i32x8, b: i32x8) -> i32x8;
3835 #[link_name = "llvm.x86.avx2.phadd.sw"]
3836 fn phaddsw(a: i16x16, b: i16x16) -> i16x16;
3837 #[link_name = "llvm.x86.avx2.phsub.w"]
3838 fn phsubw(a: i16x16, b: i16x16) -> i16x16;
3839 #[link_name = "llvm.x86.avx2.phsub.d"]
3840 fn phsubd(a: i32x8, b: i32x8) -> i32x8;
3841 #[link_name = "llvm.x86.avx2.phsub.sw"]
3842 fn phsubsw(a: i16x16, b: i16x16) -> i16x16;
3843 #[link_name = "llvm.x86.avx2.pmadd.wd"]
3844 fn pmaddwd(a: i16x16, b: i16x16) -> i32x8;
3845 #[link_name = "llvm.x86.avx2.pmadd.ub.sw"]
3846 fn pmaddubsw(a: u8x32, b: u8x32) -> i16x16;
3847 #[link_name = "llvm.x86.avx2.maskload.d"]
3848 fn maskloadd(mem_addr: *const i8, mask: i32x4) -> i32x4;
3849 #[link_name = "llvm.x86.avx2.maskload.d.256"]
3850 fn maskloadd256(mem_addr: *const i8, mask: i32x8) -> i32x8;
3851 #[link_name = "llvm.x86.avx2.maskload.q"]
3852 fn maskloadq(mem_addr: *const i8, mask: i64x2) -> i64x2;
3853 #[link_name = "llvm.x86.avx2.maskload.q.256"]
3854 fn maskloadq256(mem_addr: *const i8, mask: i64x4) -> i64x4;
3855 #[link_name = "llvm.x86.avx2.maskstore.d"]
3856 fn maskstored(mem_addr: *mut i8, mask: i32x4, a: i32x4);
3857 #[link_name = "llvm.x86.avx2.maskstore.d.256"]
3858 fn maskstored256(mem_addr: *mut i8, mask: i32x8, a: i32x8);
3859 #[link_name = "llvm.x86.avx2.maskstore.q"]
3860 fn maskstoreq(mem_addr: *mut i8, mask: i64x2, a: i64x2);
3861 #[link_name = "llvm.x86.avx2.maskstore.q.256"]
3862 fn maskstoreq256(mem_addr: *mut i8, mask: i64x4, a: i64x4);
3863 #[link_name = "llvm.x86.avx2.pmaxs.w"]
3864 fn pmaxsw(a: i16x16, b: i16x16) -> i16x16;
3865 #[link_name = "llvm.x86.avx2.pmaxs.d"]
3866 fn pmaxsd(a: i32x8, b: i32x8) -> i32x8;
3867 #[link_name = "llvm.x86.avx2.pmaxs.b"]
3868 fn pmaxsb(a: i8x32, b: i8x32) -> i8x32;
3869 #[link_name = "llvm.x86.avx2.pmaxu.w"]
3870 fn pmaxuw(a: u16x16, b: u16x16) -> u16x16;
3871 #[link_name = "llvm.x86.avx2.pmaxu.d"]
3872 fn pmaxud(a: u32x8, b: u32x8) -> u32x8;
3873 #[link_name = "llvm.x86.avx2.pmaxu.b"]
3874 fn pmaxub(a: u8x32, b: u8x32) -> u8x32;
3875 #[link_name = "llvm.x86.avx2.pmins.w"]
3876 fn pminsw(a: i16x16, b: i16x16) -> i16x16;
3877 #[link_name = "llvm.x86.avx2.pmins.d"]
3878 fn pminsd(a: i32x8, b: i32x8) -> i32x8;
3879 #[link_name = "llvm.x86.avx2.pmins.b"]
3880 fn pminsb(a: i8x32, b: i8x32) -> i8x32;
3881 #[link_name = "llvm.x86.avx2.pminu.w"]
3882 fn pminuw(a: u16x16, b: u16x16) -> u16x16;
3883 #[link_name = "llvm.x86.avx2.pminu.d"]
3884 fn pminud(a: u32x8, b: u32x8) -> u32x8;
3885 #[link_name = "llvm.x86.avx2.pminu.b"]
3886 fn pminub(a: u8x32, b: u8x32) -> u8x32;
3887 #[link_name = "llvm.x86.avx2.pmovmskb"]
3888 fn pmovmskb(a: i8x32) -> i32;
3889 #[link_name = "llvm.x86.avx2.mpsadbw"]
3890 fn mpsadbw(a: u8x32, b: u8x32, imm8: i32) -> u16x16;
3891 #[link_name = "llvm.x86.avx2.pmulhu.w"]
3892 fn pmulhuw(a: u16x16, b: u16x16) -> u16x16;
3893 #[link_name = "llvm.x86.avx2.pmulh.w"]
3894 fn pmulhw(a: i16x16, b: i16x16) -> i16x16;
3895 #[link_name = "llvm.x86.avx2.pmul.dq"]
3896 fn pmuldq(a: i32x8, b: i32x8) -> i64x4;
3897 #[link_name = "llvm.x86.avx2.pmulu.dq"]
3898 fn pmuludq(a: u32x8, b: u32x8) -> u64x4;
3899 #[link_name = "llvm.x86.avx2.pmul.hr.sw"]
3900 fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16;
3901 #[link_name = "llvm.x86.avx2.packsswb"]
3902 fn packsswb(a: i16x16, b: i16x16) -> i8x32;
3903 #[link_name = "llvm.x86.avx2.packssdw"]
3904 fn packssdw(a: i32x8, b: i32x8) -> i16x16;
3905 #[link_name = "llvm.x86.avx2.packuswb"]
3906 fn packuswb(a: i16x16, b: i16x16) -> u8x32;
3907 #[link_name = "llvm.x86.avx2.packusdw"]
3908 fn packusdw(a: i32x8, b: i32x8) -> u16x16;
3909 #[link_name = "llvm.x86.avx2.psad.bw"]
3910 fn psadbw(a: u8x32, b: u8x32) -> u64x4;
3911 #[link_name = "llvm.x86.avx2.psign.b"]
3912 fn psignb(a: i8x32, b: i8x32) -> i8x32;
3913 #[link_name = "llvm.x86.avx2.psign.w"]
3914 fn psignw(a: i16x16, b: i16x16) -> i16x16;
3915 #[link_name = "llvm.x86.avx2.psign.d"]
3916 fn psignd(a: i32x8, b: i32x8) -> i32x8;
3917 #[link_name = "llvm.x86.avx2.psll.w"]
3918 fn psllw(a: i16x16, count: i16x8) -> i16x16;
3919 #[link_name = "llvm.x86.avx2.psll.d"]
3920 fn pslld(a: i32x8, count: i32x4) -> i32x8;
3921 #[link_name = "llvm.x86.avx2.psll.q"]
3922 fn psllq(a: i64x4, count: i64x2) -> i64x4;
3923 #[link_name = "llvm.x86.avx2.pslli.w"]
3924 fn pslliw(a: i16x16, imm8: i32) -> i16x16;
3925 #[link_name = "llvm.x86.avx2.pslli.d"]
3926 fn psllid(a: i32x8, imm8: i32) -> i32x8;
3927 #[link_name = "llvm.x86.avx2.pslli.q"]
3928 fn pslliq(a: i64x4, imm8: i32) -> i64x4;
3929 #[link_name = "llvm.x86.avx2.psllv.d"]
3930 fn psllvd(a: i32x4, count: i32x4) -> i32x4;
3931 #[link_name = "llvm.x86.avx2.psllv.d.256"]
3932 fn psllvd256(a: i32x8, count: i32x8) -> i32x8;
3933 #[link_name = "llvm.x86.avx2.psllv.q"]
3934 fn psllvq(a: i64x2, count: i64x2) -> i64x2;
3935 #[link_name = "llvm.x86.avx2.psllv.q.256"]
3936 fn psllvq256(a: i64x4, count: i64x4) -> i64x4;
3937 #[link_name = "llvm.x86.avx2.psra.w"]
3938 fn psraw(a: i16x16, count: i16x8) -> i16x16;
3939 #[link_name = "llvm.x86.avx2.psra.d"]
3940 fn psrad(a: i32x8, count: i32x4) -> i32x8;
3941 #[link_name = "llvm.x86.avx2.psrai.w"]
3942 fn psraiw(a: i16x16, imm8: i32) -> i16x16;
3943 #[link_name = "llvm.x86.avx2.psrai.d"]
3944 fn psraid(a: i32x8, imm8: i32) -> i32x8;
3945 #[link_name = "llvm.x86.avx2.psrav.d"]
3946 fn psravd(a: i32x4, count: i32x4) -> i32x4;
3947 #[link_name = "llvm.x86.avx2.psrav.d.256"]
3948 fn psravd256(a: i32x8, count: i32x8) -> i32x8;
3949 #[link_name = "llvm.x86.avx2.psrl.w"]
3950 fn psrlw(a: i16x16, count: i16x8) -> i16x16;
3951 #[link_name = "llvm.x86.avx2.psrl.d"]
3952 fn psrld(a: i32x8, count: i32x4) -> i32x8;
3953 #[link_name = "llvm.x86.avx2.psrl.q"]
3954 fn psrlq(a: i64x4, count: i64x2) -> i64x4;
3955 #[link_name = "llvm.x86.avx2.psrli.w"]
3956 fn psrliw(a: i16x16, imm8: i32) -> i16x16;
3957 #[link_name = "llvm.x86.avx2.psrli.d"]
3958 fn psrlid(a: i32x8, imm8: i32) -> i32x8;
3959 #[link_name = "llvm.x86.avx2.psrli.q"]
3960 fn psrliq(a: i64x4, imm8: i32) -> i64x4;
3961 #[link_name = "llvm.x86.avx2.psrlv.d"]
3962 fn psrlvd(a: i32x4, count: i32x4) -> i32x4;
3963 #[link_name = "llvm.x86.avx2.psrlv.d.256"]
3964 fn psrlvd256(a: i32x8, count: i32x8) -> i32x8;
3965 #[link_name = "llvm.x86.avx2.psrlv.q"]
3966 fn psrlvq(a: i64x2, count: i64x2) -> i64x2;
3967 #[link_name = "llvm.x86.avx2.psrlv.q.256"]
3968 fn psrlvq256(a: i64x4, count: i64x4) -> i64x4;
3969 #[link_name = "llvm.x86.avx2.pshuf.b"]
3970 fn pshufb(a: u8x32, b: u8x32) -> u8x32;
3971 #[link_name = "llvm.x86.avx2.permd"]
3972 fn permd(a: u32x8, b: u32x8) -> u32x8;
3973 #[link_name = "llvm.x86.avx2.permps"]
3974 fn permps(a: __m256, b: i32x8) -> __m256;
3975 #[link_name = "llvm.x86.avx2.vperm2i128"]
3976 fn vperm2i128(a: i64x4, b: i64x4, imm8: i8) -> i64x4;
3977 #[link_name = "llvm.x86.avx2.gather.d.d"]
3978 fn pgatherdd(src: i32x4, slice: *const i8, offsets: i32x4, mask: i32x4, scale: i8) -> i32x4;
3979 #[link_name = "llvm.x86.avx2.gather.d.d.256"]
3980 fn vpgatherdd(src: i32x8, slice: *const i8, offsets: i32x8, mask: i32x8, scale: i8) -> i32x8;
3981 #[link_name = "llvm.x86.avx2.gather.d.q"]
3982 fn pgatherdq(src: i64x2, slice: *const i8, offsets: i32x4, mask: i64x2, scale: i8) -> i64x2;
3983 #[link_name = "llvm.x86.avx2.gather.d.q.256"]
3984 fn vpgatherdq(src: i64x4, slice: *const i8, offsets: i32x4, mask: i64x4, scale: i8) -> i64x4;
3985 #[link_name = "llvm.x86.avx2.gather.q.d"]
3986 fn pgatherqd(src: i32x4, slice: *const i8, offsets: i64x2, mask: i32x4, scale: i8) -> i32x4;
3987 #[link_name = "llvm.x86.avx2.gather.q.d.256"]
3988 fn vpgatherqd(src: i32x4, slice: *const i8, offsets: i64x4, mask: i32x4, scale: i8) -> i32x4;
3989 #[link_name = "llvm.x86.avx2.gather.q.q"]
3990 fn pgatherqq(src: i64x2, slice: *const i8, offsets: i64x2, mask: i64x2, scale: i8) -> i64x2;
3991 #[link_name = "llvm.x86.avx2.gather.q.q.256"]
3992 fn vpgatherqq(src: i64x4, slice: *const i8, offsets: i64x4, mask: i64x4, scale: i8) -> i64x4;
3993 #[link_name = "llvm.x86.avx2.gather.d.pd"]
3994 fn pgatherdpd(
3995 src: __m128d,
3996 slice: *const i8,
3997 offsets: i32x4,
3998 mask: __m128d,
3999 scale: i8,
4000 ) -> __m128d;
4001 #[link_name = "llvm.x86.avx2.gather.d.pd.256"]
4002 fn vpgatherdpd(
4003 src: __m256d,
4004 slice: *const i8,
4005 offsets: i32x4,
4006 mask: __m256d,
4007 scale: i8,
4008 ) -> __m256d;
4009 #[link_name = "llvm.x86.avx2.gather.q.pd"]
4010 fn pgatherqpd(
4011 src: __m128d,
4012 slice: *const i8,
4013 offsets: i64x2,
4014 mask: __m128d,
4015 scale: i8,
4016 ) -> __m128d;
4017 #[link_name = "llvm.x86.avx2.gather.q.pd.256"]
4018 fn vpgatherqpd(
4019 src: __m256d,
4020 slice: *const i8,
4021 offsets: i64x4,
4022 mask: __m256d,
4023 scale: i8,
4024 ) -> __m256d;
4025 #[link_name = "llvm.x86.avx2.gather.d.ps"]
4026 fn pgatherdps(src: __m128, slice: *const i8, offsets: i32x4, mask: __m128, scale: i8)
4027 -> __m128;
4028 #[link_name = "llvm.x86.avx2.gather.d.ps.256"]
4029 fn vpgatherdps(
4030 src: __m256,
4031 slice: *const i8,
4032 offsets: i32x8,
4033 mask: __m256,
4034 scale: i8,
4035 ) -> __m256;
4036 #[link_name = "llvm.x86.avx2.gather.q.ps"]
4037 fn pgatherqps(src: __m128, slice: *const i8, offsets: i64x2, mask: __m128, scale: i8)
4038 -> __m128;
4039 #[link_name = "llvm.x86.avx2.gather.q.ps.256"]
4040 fn vpgatherqps(
4041 src: __m128,
4042 slice: *const i8,
4043 offsets: i64x4,
4044 mask: __m128,
4045 scale: i8,
4046 ) -> __m128;
4047 #[link_name = "llvm.x86.avx2.psll.dq"]
4048 fn vpslldq(a: i64x4, b: i32) -> i64x4;
4049 #[link_name = "llvm.x86.avx2.psrl.dq"]
4050 fn vpsrldq(a: i64x4, b: i32) -> i64x4;
4051 }
4052
4053 #[cfg(test)]
4054 mod tests {
4055 use std;
4056 use stdarch_test::simd_test;
4057
4058 use crate::core_arch::x86::*;
4059
4060 #[simd_test(enable = "avx2")]
4061 unsafe fn test_mm256_abs_epi32() {
4062 #[rustfmt::skip]
4063 let a = _mm256_setr_epi32(
4064 0, 1, -1, i32::MAX,
4065 i32::MIN, 100, -100, -32,
4066 );
4067 let r = _mm256_abs_epi32(a);
4068 #[rustfmt::skip]
4069 let e = _mm256_setr_epi32(
4070 0, 1, 1, i32::MAX,
4071 i32::MAX.wrapping_add(1), 100, 100, 32,
4072 );
4073 assert_eq_m256i(r, e);
4074 }
4075
4076 #[simd_test(enable = "avx2")]
4077 unsafe fn test_mm256_abs_epi16() {
4078 #[rustfmt::skip]
4079 let a = _mm256_setr_epi16(
4080 0, 1, -1, 2, -2, 3, -3, 4,
4081 -4, 5, -5, i16::MAX, i16::MIN, 100, -100, -32,
4082 );
4083 let r = _mm256_abs_epi16(a);
4084 #[rustfmt::skip]
4085 let e = _mm256_setr_epi16(
4086 0, 1, 1, 2, 2, 3, 3, 4,
4087 4, 5, 5, i16::MAX, i16::MAX.wrapping_add(1), 100, 100, 32,
4088 );
4089 assert_eq_m256i(r, e);
4090 }
4091
4092 #[simd_test(enable = "avx2")]
4093 unsafe fn test_mm256_abs_epi8() {
4094 #[rustfmt::skip]
4095 let a = _mm256_setr_epi8(
4096 0, 1, -1, 2, -2, 3, -3, 4,
4097 -4, 5, -5, i8::MAX, i8::MIN, 100, -100, -32,
4098 0, 1, -1, 2, -2, 3, -3, 4,
4099 -4, 5, -5, i8::MAX, i8::MIN, 100, -100, -32,
4100 );
4101 let r = _mm256_abs_epi8(a);
4102 #[rustfmt::skip]
4103 let e = _mm256_setr_epi8(
4104 0, 1, 1, 2, 2, 3, 3, 4,
4105 4, 5, 5, i8::MAX, i8::MAX.wrapping_add(1), 100, 100, 32,
4106 0, 1, 1, 2, 2, 3, 3, 4,
4107 4, 5, 5, i8::MAX, i8::MAX.wrapping_add(1), 100, 100, 32,
4108 );
4109 assert_eq_m256i(r, e);
4110 }
4111
4112 #[simd_test(enable = "avx2")]
4113 unsafe fn test_mm256_add_epi64() {
4114 let a = _mm256_setr_epi64x(-10, 0, 100, 1_000_000_000);
4115 let b = _mm256_setr_epi64x(-1, 0, 1, 2);
4116 let r = _mm256_add_epi64(a, b);
4117 let e = _mm256_setr_epi64x(-11, 0, 101, 1_000_000_002);
4118 assert_eq_m256i(r, e);
4119 }
4120
4121 #[simd_test(enable = "avx2")]
4122 unsafe fn test_mm256_add_epi32() {
4123 let a = _mm256_setr_epi32(-1, 0, 1, 2, 3, 4, 5, 6);
4124 let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4125 let r = _mm256_add_epi32(a, b);
4126 let e = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14);
4127 assert_eq_m256i(r, e);
4128 }
4129
4130 #[simd_test(enable = "avx2")]
4131 unsafe fn test_mm256_add_epi16() {
4132 #[rustfmt::skip]
4133 let a = _mm256_setr_epi16(
4134 0, 1, 2, 3, 4, 5, 6, 7,
4135 8, 9, 10, 11, 12, 13, 14, 15,
4136 );
4137 #[rustfmt::skip]
4138 let b = _mm256_setr_epi16(
4139 0, 1, 2, 3, 4, 5, 6, 7,
4140 8, 9, 10, 11, 12, 13, 14, 15,
4141 );
4142 let r = _mm256_add_epi16(a, b);
4143 #[rustfmt::skip]
4144 let e = _mm256_setr_epi16(
4145 0, 2, 4, 6, 8, 10, 12, 14,
4146 16, 18, 20, 22, 24, 26, 28, 30,
4147 );
4148 assert_eq_m256i(r, e);
4149 }
4150
4151 #[simd_test(enable = "avx2")]
4152 unsafe fn test_mm256_add_epi8() {
4153 #[rustfmt::skip]
4154 let a = _mm256_setr_epi8(
4155 0, 1, 2, 3, 4, 5, 6, 7,
4156 8, 9, 10, 11, 12, 13, 14, 15,
4157 16, 17, 18, 19, 20, 21, 22, 23,
4158 24, 25, 26, 27, 28, 29, 30, 31,
4159 );
4160 #[rustfmt::skip]
4161 let b = _mm256_setr_epi8(
4162 0, 1, 2, 3, 4, 5, 6, 7,
4163 8, 9, 10, 11, 12, 13, 14, 15,
4164 16, 17, 18, 19, 20, 21, 22, 23,
4165 24, 25, 26, 27, 28, 29, 30, 31,
4166 );
4167 let r = _mm256_add_epi8(a, b);
4168 #[rustfmt::skip]
4169 let e = _mm256_setr_epi8(
4170 0, 2, 4, 6, 8, 10, 12, 14,
4171 16, 18, 20, 22, 24, 26, 28, 30,
4172 32, 34, 36, 38, 40, 42, 44, 46,
4173 48, 50, 52, 54, 56, 58, 60, 62,
4174 );
4175 assert_eq_m256i(r, e);
4176 }
4177
4178 #[simd_test(enable = "avx2")]
4179 unsafe fn test_mm256_adds_epi8() {
4180 #[rustfmt::skip]
4181 let a = _mm256_setr_epi8(
4182 0, 1, 2, 3, 4, 5, 6, 7,
4183 8, 9, 10, 11, 12, 13, 14, 15,
4184 16, 17, 18, 19, 20, 21, 22, 23,
4185 24, 25, 26, 27, 28, 29, 30, 31,
4186 );
4187 #[rustfmt::skip]
4188 let b = _mm256_setr_epi8(
4189 32, 33, 34, 35, 36, 37, 38, 39,
4190 40, 41, 42, 43, 44, 45, 46, 47,
4191 48, 49, 50, 51, 52, 53, 54, 55,
4192 56, 57, 58, 59, 60, 61, 62, 63,
4193 );
4194 let r = _mm256_adds_epi8(a, b);
4195 #[rustfmt::skip]
4196 let e = _mm256_setr_epi8(
4197 32, 34, 36, 38, 40, 42, 44, 46,
4198 48, 50, 52, 54, 56, 58, 60, 62,
4199 64, 66, 68, 70, 72, 74, 76, 78,
4200 80, 82, 84, 86, 88, 90, 92, 94,
4201 );
4202 assert_eq_m256i(r, e);
4203 }
4204
4205 #[simd_test(enable = "avx2")]
4206 unsafe fn test_mm256_adds_epi8_saturate_positive() {
4207 let a = _mm256_set1_epi8(0x7F);
4208 let b = _mm256_set1_epi8(1);
4209 let r = _mm256_adds_epi8(a, b);
4210 assert_eq_m256i(r, a);
4211 }
4212
4213 #[simd_test(enable = "avx2")]
4214 unsafe fn test_mm256_adds_epi8_saturate_negative() {
4215 let a = _mm256_set1_epi8(-0x80);
4216 let b = _mm256_set1_epi8(-1);
4217 let r = _mm256_adds_epi8(a, b);
4218 assert_eq_m256i(r, a);
4219 }
4220
4221 #[simd_test(enable = "avx2")]
4222 unsafe fn test_mm256_adds_epi16() {
4223 #[rustfmt::skip]
4224 let a = _mm256_setr_epi16(
4225 0, 1, 2, 3, 4, 5, 6, 7,
4226 8, 9, 10, 11, 12, 13, 14, 15,
4227 );
4228 #[rustfmt::skip]
4229 let b = _mm256_setr_epi16(
4230 32, 33, 34, 35, 36, 37, 38, 39,
4231 40, 41, 42, 43, 44, 45, 46, 47,
4232 );
4233 let r = _mm256_adds_epi16(a, b);
4234 #[rustfmt::skip]
4235 let e = _mm256_setr_epi16(
4236 32, 34, 36, 38, 40, 42, 44, 46,
4237 48, 50, 52, 54, 56, 58, 60, 62,
4238 );
4239
4240 assert_eq_m256i(r, e);
4241 }
4242
4243 #[simd_test(enable = "avx2")]
4244 unsafe fn test_mm256_adds_epi16_saturate_positive() {
4245 let a = _mm256_set1_epi16(0x7FFF);
4246 let b = _mm256_set1_epi16(1);
4247 let r = _mm256_adds_epi16(a, b);
4248 assert_eq_m256i(r, a);
4249 }
4250
4251 #[simd_test(enable = "avx2")]
4252 unsafe fn test_mm256_adds_epi16_saturate_negative() {
4253 let a = _mm256_set1_epi16(-0x8000);
4254 let b = _mm256_set1_epi16(-1);
4255 let r = _mm256_adds_epi16(a, b);
4256 assert_eq_m256i(r, a);
4257 }
4258
4259 #[simd_test(enable = "avx2")]
4260 unsafe fn test_mm256_adds_epu8() {
4261 #[rustfmt::skip]
4262 let a = _mm256_setr_epi8(
4263 0, 1, 2, 3, 4, 5, 6, 7,
4264 8, 9, 10, 11, 12, 13, 14, 15,
4265 16, 17, 18, 19, 20, 21, 22, 23,
4266 24, 25, 26, 27, 28, 29, 30, 31,
4267 );
4268 #[rustfmt::skip]
4269 let b = _mm256_setr_epi8(
4270 32, 33, 34, 35, 36, 37, 38, 39,
4271 40, 41, 42, 43, 44, 45, 46, 47,
4272 48, 49, 50, 51, 52, 53, 54, 55,
4273 56, 57, 58, 59, 60, 61, 62, 63,
4274 );
4275 let r = _mm256_adds_epu8(a, b);
4276 #[rustfmt::skip]
4277 let e = _mm256_setr_epi8(
4278 32, 34, 36, 38, 40, 42, 44, 46,
4279 48, 50, 52, 54, 56, 58, 60, 62,
4280 64, 66, 68, 70, 72, 74, 76, 78,
4281 80, 82, 84, 86, 88, 90, 92, 94,
4282 );
4283 assert_eq_m256i(r, e);
4284 }
4285
4286 #[simd_test(enable = "avx2")]
4287 unsafe fn test_mm256_adds_epu8_saturate() {
4288 let a = _mm256_set1_epi8(!0);
4289 let b = _mm256_set1_epi8(1);
4290 let r = _mm256_adds_epu8(a, b);
4291 assert_eq_m256i(r, a);
4292 }
4293
4294 #[simd_test(enable = "avx2")]
4295 unsafe fn test_mm256_adds_epu16() {
4296 #[rustfmt::skip]
4297 let a = _mm256_setr_epi16(
4298 0, 1, 2, 3, 4, 5, 6, 7,
4299 8, 9, 10, 11, 12, 13, 14, 15,
4300 );
4301 #[rustfmt::skip]
4302 let b = _mm256_setr_epi16(
4303 32, 33, 34, 35, 36, 37, 38, 39,
4304 40, 41, 42, 43, 44, 45, 46, 47,
4305 );
4306 let r = _mm256_adds_epu16(a, b);
4307 #[rustfmt::skip]
4308 let e = _mm256_setr_epi16(
4309 32, 34, 36, 38, 40, 42, 44, 46,
4310 48, 50, 52, 54, 56, 58, 60, 62,
4311 );
4312
4313 assert_eq_m256i(r, e);
4314 }
4315
4316 #[simd_test(enable = "avx2")]
4317 unsafe fn test_mm256_adds_epu16_saturate() {
4318 let a = _mm256_set1_epi16(!0);
4319 let b = _mm256_set1_epi16(1);
4320 let r = _mm256_adds_epu16(a, b);
4321 assert_eq_m256i(r, a);
4322 }
4323
4324 #[simd_test(enable = "avx2")]
4325 unsafe fn test_mm256_and_si256() {
4326 let a = _mm256_set1_epi8(5);
4327 let b = _mm256_set1_epi8(3);
4328 let got = _mm256_and_si256(a, b);
4329 assert_eq_m256i(got, _mm256_set1_epi8(1));
4330 }
4331
4332 #[simd_test(enable = "avx2")]
4333 unsafe fn test_mm256_andnot_si256() {
4334 let a = _mm256_set1_epi8(5);
4335 let b = _mm256_set1_epi8(3);
4336 let got = _mm256_andnot_si256(a, b);
4337 assert_eq_m256i(got, _mm256_set1_epi8(2));
4338 }
4339
4340 #[simd_test(enable = "avx2")]
4341 unsafe fn test_mm256_avg_epu8() {
4342 let (a, b) = (_mm256_set1_epi8(3), _mm256_set1_epi8(9));
4343 let r = _mm256_avg_epu8(a, b);
4344 assert_eq_m256i(r, _mm256_set1_epi8(6));
4345 }
4346
4347 #[simd_test(enable = "avx2")]
4348 unsafe fn test_mm256_avg_epu16() {
4349 let (a, b) = (_mm256_set1_epi16(3), _mm256_set1_epi16(9));
4350 let r = _mm256_avg_epu16(a, b);
4351 assert_eq_m256i(r, _mm256_set1_epi16(6));
4352 }
4353
4354 #[simd_test(enable = "avx2")]
4355 unsafe fn test_mm_blend_epi32() {
4356 let (a, b) = (_mm_set1_epi32(3), _mm_set1_epi32(9));
4357 let e = _mm_setr_epi32(9, 3, 3, 3);
4358 let r = _mm_blend_epi32(a, b, 0x01 as i32);
4359 assert_eq_m128i(r, e);
4360
4361 let r = _mm_blend_epi32(b, a, 0x0E as i32);
4362 assert_eq_m128i(r, e);
4363 }
4364
4365 #[simd_test(enable = "avx2")]
4366 unsafe fn test_mm256_blend_epi32() {
4367 let (a, b) = (_mm256_set1_epi32(3), _mm256_set1_epi32(9));
4368 let e = _mm256_setr_epi32(9, 3, 3, 3, 3, 3, 3, 3);
4369 let r = _mm256_blend_epi32(a, b, 0x01 as i32);
4370 assert_eq_m256i(r, e);
4371
4372 let e = _mm256_setr_epi32(3, 9, 3, 3, 3, 3, 3, 9);
4373 let r = _mm256_blend_epi32(a, b, 0x82 as i32);
4374 assert_eq_m256i(r, e);
4375
4376 let e = _mm256_setr_epi32(3, 3, 9, 9, 9, 9, 9, 3);
4377 let r = _mm256_blend_epi32(a, b, 0x7C as i32);
4378 assert_eq_m256i(r, e);
4379 }
4380
4381 #[simd_test(enable = "avx2")]
4382 unsafe fn test_mm256_blend_epi16() {
4383 let (a, b) = (_mm256_set1_epi16(3), _mm256_set1_epi16(9));
4384 let e = _mm256_setr_epi16(9, 3, 3, 3, 3, 3, 3, 3, 9, 3, 3, 3, 3, 3, 3, 3);
4385 let r = _mm256_blend_epi16(a, b, 0x01 as i32);
4386 assert_eq_m256i(r, e);
4387
4388 let r = _mm256_blend_epi16(b, a, 0xFE as i32);
4389 assert_eq_m256i(r, e);
4390 }
4391
4392 #[simd_test(enable = "avx2")]
4393 unsafe fn test_mm256_blendv_epi8() {
4394 let (a, b) = (_mm256_set1_epi8(4), _mm256_set1_epi8(2));
4395 let mask = _mm256_insert_epi8(_mm256_set1_epi8(0), -1, 2);
4396 let e = _mm256_insert_epi8(_mm256_set1_epi8(4), 2, 2);
4397 let r = _mm256_blendv_epi8(a, b, mask);
4398 assert_eq_m256i(r, e);
4399 }
4400
4401 #[simd_test(enable = "avx2")]
4402 unsafe fn test_mm_broadcastb_epi8() {
4403 let a = _mm_insert_epi8(_mm_set1_epi8(0x00), 0x2a, 0);
4404 let res = _mm_broadcastb_epi8(a);
4405 assert_eq_m128i(res, _mm_set1_epi8(0x2a));
4406 }
4407
4408 #[simd_test(enable = "avx2")]
4409 unsafe fn test_mm256_broadcastb_epi8() {
4410 let a = _mm_insert_epi8(_mm_set1_epi8(0x00), 0x2a, 0);
4411 let res = _mm256_broadcastb_epi8(a);
4412 assert_eq_m256i(res, _mm256_set1_epi8(0x2a));
4413 }
4414
4415 #[simd_test(enable = "avx2")]
4416 unsafe fn test_mm_broadcastd_epi32() {
4417 let a = _mm_setr_epi32(0x2a, 0x8000000, 0, 0);
4418 let res = _mm_broadcastd_epi32(a);
4419 assert_eq_m128i(res, _mm_set1_epi32(0x2a));
4420 }
4421
4422 #[simd_test(enable = "avx2")]
4423 unsafe fn test_mm256_broadcastd_epi32() {
4424 let a = _mm_setr_epi32(0x2a, 0x8000000, 0, 0);
4425 let res = _mm256_broadcastd_epi32(a);
4426 assert_eq_m256i(res, _mm256_set1_epi32(0x2a));
4427 }
4428
4429 #[simd_test(enable = "avx2")]
4430 unsafe fn test_mm_broadcastq_epi64() {
4431 let a = _mm_setr_epi64x(0x1ffffffff, 0);
4432 let res = _mm_broadcastq_epi64(a);
4433 assert_eq_m128i(res, _mm_set1_epi64x(0x1ffffffff));
4434 }
4435
4436 #[simd_test(enable = "avx2")]
4437 unsafe fn test_mm256_broadcastq_epi64() {
4438 let a = _mm_setr_epi64x(0x1ffffffff, 0);
4439 let res = _mm256_broadcastq_epi64(a);
4440 assert_eq_m256i(res, _mm256_set1_epi64x(0x1ffffffff));
4441 }
4442
4443 #[simd_test(enable = "avx2")]
4444 unsafe fn test_mm_broadcastsd_pd() {
4445 let a = _mm_setr_pd(6.28, 3.14);
4446 let res = _mm_broadcastsd_pd(a);
4447 assert_eq_m128d(res, _mm_set1_pd(6.28f64));
4448 }
4449
4450 #[simd_test(enable = "avx2")]
4451 unsafe fn test_mm256_broadcastsd_pd() {
4452 let a = _mm_setr_pd(6.28, 3.14);
4453 let res = _mm256_broadcastsd_pd(a);
4454 assert_eq_m256d(res, _mm256_set1_pd(6.28f64));
4455 }
4456
4457 #[simd_test(enable = "avx2")]
4458 unsafe fn test_mm256_broadcastsi128_si256() {
4459 let a = _mm_setr_epi64x(0x0987654321012334, 0x5678909876543210);
4460 let res = _mm256_broadcastsi128_si256(a);
4461 let retval = _mm256_setr_epi64x(
4462 0x0987654321012334,
4463 0x5678909876543210,
4464 0x0987654321012334,
4465 0x5678909876543210,
4466 );
4467 assert_eq_m256i(res, retval);
4468 }
4469
4470 #[simd_test(enable = "avx2")]
4471 unsafe fn test_mm_broadcastss_ps() {
4472 let a = _mm_setr_ps(6.28, 3.14, 0.0, 0.0);
4473 let res = _mm_broadcastss_ps(a);
4474 assert_eq_m128(res, _mm_set1_ps(6.28f32));
4475 }
4476
4477 #[simd_test(enable = "avx2")]
4478 unsafe fn test_mm256_broadcastss_ps() {
4479 let a = _mm_setr_ps(6.28, 3.14, 0.0, 0.0);
4480 let res = _mm256_broadcastss_ps(a);
4481 assert_eq_m256(res, _mm256_set1_ps(6.28f32));
4482 }
4483
4484 #[simd_test(enable = "avx2")]
4485 unsafe fn test_mm_broadcastw_epi16() {
4486 let a = _mm_insert_epi16(_mm_set1_epi16(0x2a), 0x22b, 0);
4487 let res = _mm_broadcastw_epi16(a);
4488 assert_eq_m128i(res, _mm_set1_epi16(0x22b));
4489 }
4490
4491 #[simd_test(enable = "avx2")]
4492 unsafe fn test_mm256_broadcastw_epi16() {
4493 let a = _mm_insert_epi16(_mm_set1_epi16(0x2a), 0x22b, 0);
4494 let res = _mm256_broadcastw_epi16(a);
4495 assert_eq_m256i(res, _mm256_set1_epi16(0x22b));
4496 }
4497
4498 #[simd_test(enable = "avx2")]
4499 unsafe fn test_mm256_cmpeq_epi8() {
4500 #[rustfmt::skip]
4501 let a = _mm256_setr_epi8(
4502 0, 1, 2, 3, 4, 5, 6, 7,
4503 8, 9, 10, 11, 12, 13, 14, 15,
4504 16, 17, 18, 19, 20, 21, 22, 23,
4505 24, 25, 26, 27, 28, 29, 30, 31,
4506 );
4507 #[rustfmt::skip]
4508 let b = _mm256_setr_epi8(
4509 31, 30, 2, 28, 27, 26, 25, 24,
4510 23, 22, 21, 20, 19, 18, 17, 16,
4511 15, 14, 13, 12, 11, 10, 9, 8,
4512 7, 6, 5, 4, 3, 2, 1, 0,
4513 );
4514 let r = _mm256_cmpeq_epi8(a, b);
4515 assert_eq_m256i(r, _mm256_insert_epi8(_mm256_set1_epi8(0), !0, 2));
4516 }
4517
4518 #[simd_test(enable = "avx2")]
4519 unsafe fn test_mm256_cmpeq_epi16() {
4520 #[rustfmt::skip]
4521 let a = _mm256_setr_epi16(
4522 0, 1, 2, 3, 4, 5, 6, 7,
4523 8, 9, 10, 11, 12, 13, 14, 15,
4524 );
4525 #[rustfmt::skip]
4526 let b = _mm256_setr_epi16(
4527 15, 14, 2, 12, 11, 10, 9, 8,
4528 7, 6, 5, 4, 3, 2, 1, 0,
4529 );
4530 let r = _mm256_cmpeq_epi16(a, b);
4531 assert_eq_m256i(r, _mm256_insert_epi16(_mm256_set1_epi16(0), !0, 2));
4532 }
4533
4534 #[simd_test(enable = "avx2")]
4535 unsafe fn test_mm256_cmpeq_epi32() {
4536 let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
4537 let b = _mm256_setr_epi32(7, 6, 2, 4, 3, 2, 1, 0);
4538 let r = _mm256_cmpeq_epi32(a, b);
4539 let e = _mm256_set1_epi32(0);
4540 let e = _mm256_insert_epi32(e, !0, 2);
4541 assert_eq_m256i(r, e);
4542 }
4543
4544 #[simd_test(enable = "avx2")]
4545 unsafe fn test_mm256_cmpeq_epi64() {
4546 let a = _mm256_setr_epi64x(0, 1, 2, 3);
4547 let b = _mm256_setr_epi64x(3, 2, 2, 0);
4548 let r = _mm256_cmpeq_epi64(a, b);
4549 assert_eq_m256i(r, _mm256_insert_epi64(_mm256_set1_epi64x(0), !0, 2));
4550 }
4551
4552 #[simd_test(enable = "avx2")]
4553 unsafe fn test_mm256_cmpgt_epi8() {
4554 let a = _mm256_insert_epi8(_mm256_set1_epi8(0), 5, 0);
4555 let b = _mm256_set1_epi8(0);
4556 let r = _mm256_cmpgt_epi8(a, b);
4557 assert_eq_m256i(r, _mm256_insert_epi8(_mm256_set1_epi8(0), !0, 0));
4558 }
4559
4560 #[simd_test(enable = "avx2")]
4561 unsafe fn test_mm256_cmpgt_epi16() {
4562 let a = _mm256_insert_epi16(_mm256_set1_epi16(0), 5, 0);
4563 let b = _mm256_set1_epi16(0);
4564 let r = _mm256_cmpgt_epi16(a, b);
4565 assert_eq_m256i(r, _mm256_insert_epi16(_mm256_set1_epi16(0), !0, 0));
4566 }
4567
4568 #[simd_test(enable = "avx2")]
4569 unsafe fn test_mm256_cmpgt_epi32() {
4570 let a = _mm256_insert_epi32(_mm256_set1_epi32(0), 5, 0);
4571 let b = _mm256_set1_epi32(0);
4572 let r = _mm256_cmpgt_epi32(a, b);
4573 assert_eq_m256i(r, _mm256_insert_epi32(_mm256_set1_epi32(0), !0, 0));
4574 }
4575
4576 #[simd_test(enable = "avx2")]
4577 unsafe fn test_mm256_cmpgt_epi64() {
4578 let a = _mm256_insert_epi64(_mm256_set1_epi64x(0), 5, 0);
4579 let b = _mm256_set1_epi64x(0);
4580 let r = _mm256_cmpgt_epi64(a, b);
4581 assert_eq_m256i(r, _mm256_insert_epi64(_mm256_set1_epi64x(0), !0, 0));
4582 }
4583
4584 #[simd_test(enable = "avx2")]
4585 unsafe fn test_mm256_cvtepi8_epi16() {
4586 #[rustfmt::skip]
4587 let a = _mm_setr_epi8(
4588 0, 0, -1, 1, -2, 2, -3, 3,
4589 -4, 4, -5, 5, -6, 6, -7, 7,
4590 );
4591 #[rustfmt::skip]
4592 let r = _mm256_setr_epi16(
4593 0, 0, -1, 1, -2, 2, -3, 3,
4594 -4, 4, -5, 5, -6, 6, -7, 7,
4595 );
4596 assert_eq_m256i(r, _mm256_cvtepi8_epi16(a));
4597 }
4598
4599 #[simd_test(enable = "avx2")]
4600 unsafe fn test_mm256_cvtepi8_epi32() {
4601 #[rustfmt::skip]
4602 let a = _mm_setr_epi8(
4603 0, 0, -1, 1, -2, 2, -3, 3,
4604 -4, 4, -5, 5, -6, 6, -7, 7,
4605 );
4606 let r = _mm256_setr_epi32(0, 0, -1, 1, -2, 2, -3, 3);
4607 assert_eq_m256i(r, _mm256_cvtepi8_epi32(a));
4608 }
4609
4610 #[simd_test(enable = "avx2")]
4611 unsafe fn test_mm256_cvtepi8_epi64() {
4612 #[rustfmt::skip]
4613 let a = _mm_setr_epi8(
4614 0, 0, -1, 1, -2, 2, -3, 3,
4615 -4, 4, -5, 5, -6, 6, -7, 7,
4616 );
4617 let r = _mm256_setr_epi64x(0, 0, -1, 1);
4618 assert_eq_m256i(r, _mm256_cvtepi8_epi64(a));
4619 }
4620
4621 #[simd_test(enable = "avx2")]
4622 unsafe fn test_mm256_cvtepi16_epi32() {
4623 let a = _mm_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3);
4624 let r = _mm256_setr_epi32(0, 0, -1, 1, -2, 2, -3, 3);
4625 assert_eq_m256i(r, _mm256_cvtepi16_epi32(a));
4626 }
4627
4628 #[simd_test(enable = "avx2")]
4629 unsafe fn test_mm256_cvtepi16_epi64() {
4630 let a = _mm_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3);
4631 let r = _mm256_setr_epi64x(0, 0, -1, 1);
4632 assert_eq_m256i(r, _mm256_cvtepi16_epi64(a));
4633 }
4634
4635 #[simd_test(enable = "avx2")]
4636 unsafe fn test_mm256_cvtepi32_epi64() {
4637 let a = _mm_setr_epi32(0, 0, -1, 1);
4638 let r = _mm256_setr_epi64x(0, 0, -1, 1);
4639 assert_eq_m256i(r, _mm256_cvtepi32_epi64(a));
4640 }
4641
4642 #[simd_test(enable = "avx2")]
4643 unsafe fn test_mm256_cvtepu16_epi32() {
4644 let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4645 let r = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
4646 assert_eq_m256i(r, _mm256_cvtepu16_epi32(a));
4647 }
4648
4649 #[simd_test(enable = "avx2")]
4650 unsafe fn test_mm256_cvtepu16_epi64() {
4651 let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4652 let r = _mm256_setr_epi64x(0, 1, 2, 3);
4653 assert_eq_m256i(r, _mm256_cvtepu16_epi64(a));
4654 }
4655
4656 #[simd_test(enable = "avx2")]
4657 unsafe fn test_mm256_cvtepu32_epi64() {
4658 let a = _mm_setr_epi32(0, 1, 2, 3);
4659 let r = _mm256_setr_epi64x(0, 1, 2, 3);
4660 assert_eq_m256i(r, _mm256_cvtepu32_epi64(a));
4661 }
4662
4663 #[simd_test(enable = "avx2")]
4664 unsafe fn test_mm256_cvtepu8_epi16() {
4665 #[rustfmt::skip]
4666 let a = _mm_setr_epi8(
4667 0, 1, 2, 3, 4, 5, 6, 7,
4668 8, 9, 10, 11, 12, 13, 14, 15,
4669 );
4670 #[rustfmt::skip]
4671 let r = _mm256_setr_epi16(
4672 0, 1, 2, 3, 4, 5, 6, 7,
4673 8, 9, 10, 11, 12, 13, 14, 15,
4674 );
4675 assert_eq_m256i(r, _mm256_cvtepu8_epi16(a));
4676 }
4677
4678 #[simd_test(enable = "avx2")]
4679 unsafe fn test_mm256_cvtepu8_epi32() {
4680 #[rustfmt::skip]
4681 let a = _mm_setr_epi8(
4682 0, 1, 2, 3, 4, 5, 6, 7,
4683 8, 9, 10, 11, 12, 13, 14, 15,
4684 );
4685 let r = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
4686 assert_eq_m256i(r, _mm256_cvtepu8_epi32(a));
4687 }
4688
4689 #[simd_test(enable = "avx2")]
4690 unsafe fn test_mm256_cvtepu8_epi64() {
4691 #[rustfmt::skip]
4692 let a = _mm_setr_epi8(
4693 0, 1, 2, 3, 4, 5, 6, 7,
4694 8, 9, 10, 11, 12, 13, 14, 15,
4695 );
4696 let r = _mm256_setr_epi64x(0, 1, 2, 3);
4697 assert_eq_m256i(r, _mm256_cvtepu8_epi64(a));
4698 }
4699
4700 #[simd_test(enable = "avx2")]
4701 unsafe fn test_mm256_extracti128_si256() {
4702 let a = _mm256_setr_epi64x(1, 2, 3, 4);
4703 let r = _mm256_extracti128_si256(a, 0b01);
4704 let e = _mm_setr_epi64x(3, 4);
4705 assert_eq_m128i(r, e);
4706 }
4707
4708 #[simd_test(enable = "avx2")]
4709 unsafe fn test_mm256_hadd_epi16() {
4710 let a = _mm256_set1_epi16(2);
4711 let b = _mm256_set1_epi16(4);
4712 let r = _mm256_hadd_epi16(a, b);
4713 let e = _mm256_setr_epi16(4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8);
4714 assert_eq_m256i(r, e);
4715 }
4716
4717 #[simd_test(enable = "avx2")]
4718 unsafe fn test_mm256_hadd_epi32() {
4719 let a = _mm256_set1_epi32(2);
4720 let b = _mm256_set1_epi32(4);
4721 let r = _mm256_hadd_epi32(a, b);
4722 let e = _mm256_setr_epi32(4, 4, 8, 8, 4, 4, 8, 8);
4723 assert_eq_m256i(r, e);
4724 }
4725
4726 #[simd_test(enable = "avx2")]
4727 unsafe fn test_mm256_hadds_epi16() {
4728 let a = _mm256_set1_epi16(2);
4729 let a = _mm256_insert_epi16(a, 0x7fff, 0);
4730 let a = _mm256_insert_epi16(a, 1, 1);
4731 let b = _mm256_set1_epi16(4);
4732 let r = _mm256_hadds_epi16(a, b);
4733 #[rustfmt::skip]
4734 let e = _mm256_setr_epi16(
4735 0x7FFF, 4, 4, 4, 8, 8, 8, 8,
4736 4, 4, 4, 4, 8, 8, 8, 8,
4737 );
4738 assert_eq_m256i(r, e);
4739 }
4740
4741 #[simd_test(enable = "avx2")]
4742 unsafe fn test_mm256_hsub_epi16() {
4743 let a = _mm256_set1_epi16(2);
4744 let b = _mm256_set1_epi16(4);
4745 let r = _mm256_hsub_epi16(a, b);
4746 let e = _mm256_set1_epi16(0);
4747 assert_eq_m256i(r, e);
4748 }
4749
4750 #[simd_test(enable = "avx2")]
4751 unsafe fn test_mm256_hsub_epi32() {
4752 let a = _mm256_set1_epi32(2);
4753 let b = _mm256_set1_epi32(4);
4754 let r = _mm256_hsub_epi32(a, b);
4755 let e = _mm256_set1_epi32(0);
4756 assert_eq_m256i(r, e);
4757 }
4758
4759 #[simd_test(enable = "avx2")]
4760 unsafe fn test_mm256_hsubs_epi16() {
4761 let a = _mm256_set1_epi16(2);
4762 let a = _mm256_insert_epi16(a, 0x7fff, 0);
4763 let a = _mm256_insert_epi16(a, -1, 1);
4764 let b = _mm256_set1_epi16(4);
4765 let r = _mm256_hsubs_epi16(a, b);
4766 let e = _mm256_insert_epi16(_mm256_set1_epi16(0), 0x7FFF, 0);
4767 assert_eq_m256i(r, e);
4768 }
4769
4770 #[simd_test(enable = "avx2")]
4771 unsafe fn test_mm256_madd_epi16() {
4772 let a = _mm256_set1_epi16(2);
4773 let b = _mm256_set1_epi16(4);
4774 let r = _mm256_madd_epi16(a, b);
4775 let e = _mm256_set1_epi32(16);
4776 assert_eq_m256i(r, e);
4777 }
4778
4779 #[simd_test(enable = "avx2")]
4780 unsafe fn test_mm256_inserti128_si256() {
4781 let a = _mm256_setr_epi64x(1, 2, 3, 4);
4782 let b = _mm_setr_epi64x(7, 8);
4783 let r = _mm256_inserti128_si256(a, b, 0b01);
4784 let e = _mm256_setr_epi64x(1, 2, 7, 8);
4785 assert_eq_m256i(r, e);
4786 }
4787
4788 #[simd_test(enable = "avx2")]
4789 unsafe fn test_mm256_maddubs_epi16() {
4790 let a = _mm256_set1_epi8(2);
4791 let b = _mm256_set1_epi8(4);
4792 let r = _mm256_maddubs_epi16(a, b);
4793 let e = _mm256_set1_epi16(16);
4794 assert_eq_m256i(r, e);
4795 }
4796
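// The maskload/maskstore tests below rely on the masked-memory semantics:
// only elements whose mask element has its MSB set are transferred;
// masked-off lanes read back as 0 on loads and leave the destination memory
// untouched on stores.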
4797 #[simd_test(enable = "avx2")]
4798 unsafe fn test_mm_maskload_epi32() {
4799 let nums = [1, 2, 3, 4];
4800 let a = &nums as *const i32;
4801 let mask = _mm_setr_epi32(-1, 0, 0, -1);
4802 let r = _mm_maskload_epi32(a, mask);
4803 let e = _mm_setr_epi32(1, 0, 0, 4);
4804 assert_eq_m128i(r, e);
4805 }
4806
4807 #[simd_test(enable = "avx2")]
4808 unsafe fn test_mm256_maskload_epi32() {
4809 let nums = [1, 2, 3, 4, 5, 6, 7, 8];
4810 let a = &nums as *const i32;
4811 let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0);
4812 let r = _mm256_maskload_epi32(a, mask);
4813 let e = _mm256_setr_epi32(1, 0, 0, 4, 0, 6, 7, 0);
4814 assert_eq_m256i(r, e);
4815 }
4816
4817 #[simd_test(enable = "avx2")]
4818 unsafe fn test_mm_maskload_epi64() {
4819 let nums = [1_i64, 2_i64];
4820 let a = &nums as *const i64;
4821 let mask = _mm_setr_epi64x(0, -1);
4822 let r = _mm_maskload_epi64(a, mask);
4823 let e = _mm_setr_epi64x(0, 2);
4824 assert_eq_m128i(r, e);
4825 }
4826
4827 #[simd_test(enable = "avx2")]
4828 unsafe fn test_mm256_maskload_epi64() {
4829 let nums = [1_i64, 2_i64, 3_i64, 4_i64];
4830 let a = &nums as *const i64;
4831 let mask = _mm256_setr_epi64x(0, -1, -1, 0);
4832 let r = _mm256_maskload_epi64(a, mask);
4833 let e = _mm256_setr_epi64x(0, 2, 3, 0);
4834 assert_eq_m256i(r, e);
4835 }
4836
4837 #[simd_test(enable = "avx2")]
4838 unsafe fn test_mm_maskstore_epi32() {
4839 let a = _mm_setr_epi32(1, 2, 3, 4);
4840 let mut arr = [-1, -1, -1, -1];
4841 let mask = _mm_setr_epi32(-1, 0, 0, -1);
4842 _mm_maskstore_epi32(arr.as_mut_ptr(), mask, a);
4843 let e = [1, -1, -1, 4];
4844 assert_eq!(arr, e);
4845 }
4846
4847 #[simd_test(enable = "avx2")]
4848 unsafe fn test_mm256_maskstore_epi32() {
4849 let a = _mm256_setr_epi32(1, 0x6d726f, 3, 42, 0x777161, 6, 7, 8);
4850 let mut arr = [-1, -1, -1, 0x776173, -1, 0x68657265, -1, -1];
4851 let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0);
4852 _mm256_maskstore_epi32(arr.as_mut_ptr(), mask, a);
4853 let e = [1, -1, -1, 42, -1, 6, 7, -1];
4854 assert_eq!(arr, e);
4855 }
4856
4857 #[simd_test(enable = "avx2")]
4858 unsafe fn test_mm_maskstore_epi64() {
4859 let a = _mm_setr_epi64x(1_i64, 2_i64);
4860 let mut arr = [-1_i64, -1_i64];
4861 let mask = _mm_setr_epi64x(0, -1);
4862 _mm_maskstore_epi64(arr.as_mut_ptr(), mask, a);
4863 let e = [-1, 2];
4864 assert_eq!(arr, e);
4865 }
4866
4867 #[simd_test(enable = "avx2")]
4868 unsafe fn test_mm256_maskstore_epi64() {
4869 let a = _mm256_setr_epi64x(1_i64, 2_i64, 3_i64, 4_i64);
4870 let mut arr = [-1_i64, -1_i64, -1_i64, -1_i64];
4871 let mask = _mm256_setr_epi64x(0, -1, -1, 0);
4872 _mm256_maskstore_epi64(arr.as_mut_ptr(), mask, a);
4873 let e = [-1, 2, 3, -1];
4874 assert_eq!(arr, e);
4875 }
4876
4877 #[simd_test(enable = "avx2")]
4878 unsafe fn test_mm256_max_epi16() {
4879 let a = _mm256_set1_epi16(2);
4880 let b = _mm256_set1_epi16(4);
4881 let r = _mm256_max_epi16(a, b);
4882 assert_eq_m256i(r, b);
4883 }
4884
4885 #[simd_test(enable = "avx2")]
4886 unsafe fn test_mm256_max_epi32() {
4887 let a = _mm256_set1_epi32(2);
4888 let b = _mm256_set1_epi32(4);
4889 let r = _mm256_max_epi32(a, b);
4890 assert_eq_m256i(r, b);
4891 }
4892
4893 #[simd_test(enable = "avx2")]
4894 unsafe fn test_mm256_max_epi8() {
4895 let a = _mm256_set1_epi8(2);
4896 let b = _mm256_set1_epi8(4);
4897 let r = _mm256_max_epi8(a, b);
4898 assert_eq_m256i(r, b);
4899 }
4900
4901 #[simd_test(enable = "avx2")]
4902 unsafe fn test_mm256_max_epu16() {
4903 let a = _mm256_set1_epi16(2);
4904 let b = _mm256_set1_epi16(4);
4905 let r = _mm256_max_epu16(a, b);
4906 assert_eq_m256i(r, b);
4907 }
4908
4909 #[simd_test(enable = "avx2")]
4910 unsafe fn test_mm256_max_epu32() {
4911 let a = _mm256_set1_epi32(2);
4912 let b = _mm256_set1_epi32(4);
4913 let r = _mm256_max_epu32(a, b);
4914 assert_eq_m256i(r, b);
4915 }
4916
4917 #[simd_test(enable = "avx2")]
4918 unsafe fn test_mm256_max_epu8() {
4919 let a = _mm256_set1_epi8(2);
4920 let b = _mm256_set1_epi8(4);
4921 let r = _mm256_max_epu8(a, b);
4922 assert_eq_m256i(r, b);
4923 }
4924
4925 #[simd_test(enable = "avx2")]
4926 unsafe fn test_mm256_min_epi16() {
4927 let a = _mm256_set1_epi16(2);
4928 let b = _mm256_set1_epi16(4);
4929 let r = _mm256_min_epi16(a, b);
4930 assert_eq_m256i(r, a);
4931 }
4932
4933 #[simd_test(enable = "avx2")]
4934 unsafe fn test_mm256_min_epi32() {
4935 let a = _mm256_set1_epi32(2);
4936 let b = _mm256_set1_epi32(4);
4937 let r = _mm256_min_epi32(a, b);
4938 assert_eq_m256i(r, a);
4939 }
4940
4941 #[simd_test(enable = "avx2")]
4942 unsafe fn test_mm256_min_epi8() {
4943 let a = _mm256_set1_epi8(2);
4944 let b = _mm256_set1_epi8(4);
4945 let r = _mm256_min_epi8(a, b);
4946 assert_eq_m256i(r, a);
4947 }
4948
4949 #[simd_test(enable = "avx2")]
4950 unsafe fn test_mm256_min_epu16() {
4951 let a = _mm256_set1_epi16(2);
4952 let b = _mm256_set1_epi16(4);
4953 let r = _mm256_min_epu16(a, b);
4954 assert_eq_m256i(r, a);
4955 }
4956
4957 #[simd_test(enable = "avx2")]
4958 unsafe fn test_mm256_min_epu32() {
4959 let a = _mm256_set1_epi32(2);
4960 let b = _mm256_set1_epi32(4);
4961 let r = _mm256_min_epu32(a, b);
4962 assert_eq_m256i(r, a);
4963 }
4964
4965 #[simd_test(enable = "avx2")]
4966 unsafe fn test_mm256_min_epu8() {
4967 let a = _mm256_set1_epi8(2);
4968 let b = _mm256_set1_epi8(4);
4969 let r = _mm256_min_epu8(a, b);
4970 assert_eq_m256i(r, a);
4971 }
4972
4973 #[simd_test(enable = "avx2")]
4974 unsafe fn test_mm256_movemask_epi8() {
4975 let a = _mm256_set1_epi8(-1);
4976 let r = _mm256_movemask_epi8(a);
4977 let e = -1;
4978 assert_eq!(r, e);
4979 }
4980
4981 #[simd_test(enable = "avx2")]
4982 unsafe fn test_mm256_mpsadbw_epu8() {
4983 let a = _mm256_set1_epi8(2);
4984 let b = _mm256_set1_epi8(4);
4985 let r = _mm256_mpsadbw_epu8(a, b, 0);
4986 let e = _mm256_set1_epi16(8);
4987 assert_eq_m256i(r, e);
4988 }
4989
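// `_mm256_mul_epi32`/`_mm256_mul_epu32` multiply only the low 32-bit half of
// each 64-bit element (the even-indexed dwords 0, 2, 4, 6), widening to
// 64-bit products; hence the expected (0, 0, 2*5, 2*7) = (0, 0, 10, 14).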
4990 #[simd_test(enable = "avx2")]
4991 unsafe fn test_mm256_mul_epi32() {
4992 let a = _mm256_setr_epi32(0, 0, 0, 0, 2, 2, 2, 2);
4993 let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4994 let r = _mm256_mul_epi32(a, b);
4995 let e = _mm256_setr_epi64x(0, 0, 10, 14);
4996 assert_eq_m256i(r, e);
4997 }
4998
4999 #[simd_test(enable = "avx2")]
5000 unsafe fn test_mm256_mul_epu32() {
5001 let a = _mm256_setr_epi32(0, 0, 0, 0, 2, 2, 2, 2);
5002 let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
5003 let r = _mm256_mul_epu32(a, b);
5004 let e = _mm256_setr_epi64x(0, 0, 10, 14);
5005 assert_eq_m256i(r, e);
5006 }
5007
5008 #[simd_test(enable = "avx2")]
5009 unsafe fn test_mm256_mulhi_epi16() {
5010 let a = _mm256_set1_epi16(6535);
5011 let b = _mm256_set1_epi16(6535);
5012 let r = _mm256_mulhi_epi16(a, b);
5013 let e = _mm256_set1_epi16(651);
5014 assert_eq_m256i(r, e);
5015 }
5016
5017 #[simd_test(enable = "avx2")]
5018 unsafe fn test_mm256_mulhi_epu16() {
5019 let a = _mm256_set1_epi16(6535);
5020 let b = _mm256_set1_epi16(6535);
5021 let r = _mm256_mulhi_epu16(a, b);
5022 let e = _mm256_set1_epi16(651);
5023 assert_eq_m256i(r, e);
5024 }
5025
5026 #[simd_test(enable = "avx2")]
5027 unsafe fn test_mm256_mullo_epi16() {
5028 let a = _mm256_set1_epi16(2);
5029 let b = _mm256_set1_epi16(4);
5030 let r = _mm256_mullo_epi16(a, b);
5031 let e = _mm256_set1_epi16(8);
5032 assert_eq_m256i(r, e);
5033 }
5034
5035 #[simd_test(enable = "avx2")]
5036 unsafe fn test_mm256_mullo_epi32() {
5037 let a = _mm256_set1_epi32(2);
5038 let b = _mm256_set1_epi32(4);
5039 let r = _mm256_mullo_epi32(a, b);
5040 let e = _mm256_set1_epi32(8);
5041 assert_eq_m256i(r, e);
5042 }
5043
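// `_mm256_mulhrs_epi16` computes (a * b + 0x4000) >> 15 per element
// (multiply, round, keep the high half), so 2 * 4 rounds down to 0 below.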
5044 #[simd_test(enable = "avx2")]
5045 unsafe fn test_mm256_mulhrs_epi16() {
5046 let a = _mm256_set1_epi16(2);
5047 let b = _mm256_set1_epi16(4);
5048 let r = _mm256_mulhrs_epi16(a, b);
5049 let e = _mm256_set1_epi16(0);
5050 assert_eq_m256i(r, e);
5051 }
5052
5053 #[simd_test(enable = "avx2")]
5054 unsafe fn test_mm256_or_si256() {
5055 let a = _mm256_set1_epi8(-1);
5056 let b = _mm256_set1_epi8(0);
5057 let r = _mm256_or_si256(a, b);
5058 assert_eq_m256i(r, a);
5059 }
5060
5061 #[simd_test(enable = "avx2")]
5062 unsafe fn test_mm256_packs_epi16() {
5063 let a = _mm256_set1_epi16(2);
5064 let b = _mm256_set1_epi16(4);
5065 let r = _mm256_packs_epi16(a, b);
5066 #[rustfmt::skip]
5067 let e = _mm256_setr_epi8(
5068 2, 2, 2, 2, 2, 2, 2, 2,
5069 4, 4, 4, 4, 4, 4, 4, 4,
5070 2, 2, 2, 2, 2, 2, 2, 2,
5071 4, 4, 4, 4, 4, 4, 4, 4,
5072 );
5073
5074 assert_eq_m256i(r, e);
5075 }
5076
5077 #[simd_test(enable = "avx2")]
5078 unsafe fn test_mm256_packs_epi32() {
5079 let a = _mm256_set1_epi32(2);
5080 let b = _mm256_set1_epi32(4);
5081 let r = _mm256_packs_epi32(a, b);
5082 let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
5083
5084 assert_eq_m256i(r, e);
5085 }
5086
5087 #[simd_test(enable = "avx2")]
5088 unsafe fn test_mm256_packus_epi16() {
5089 let a = _mm256_set1_epi16(2);
5090 let b = _mm256_set1_epi16(4);
5091 let r = _mm256_packus_epi16(a, b);
5092 #[rustfmt::skip]
5093 let e = _mm256_setr_epi8(
5094 2, 2, 2, 2, 2, 2, 2, 2,
5095 4, 4, 4, 4, 4, 4, 4, 4,
5096 2, 2, 2, 2, 2, 2, 2, 2,
5097 4, 4, 4, 4, 4, 4, 4, 4,
5098 );
5099
5100 assert_eq_m256i(r, e);
5101 }
5102
5103 #[simd_test(enable = "avx2")]
5104 unsafe fn test_mm256_packus_epi32() {
5105 let a = _mm256_set1_epi32(2);
5106 let b = _mm256_set1_epi32(4);
5107 let r = _mm256_packus_epi32(a, b);
5108 let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
5109
5110 assert_eq_m256i(r, e);
5111 }
5112
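// `_mm256_sad_epu8` sums the absolute differences of each group of eight
// unsigned bytes into one 64-bit result: 8 * |2 - 4| = 16.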
5113 #[simd_test(enable = "avx2")]
5114 unsafe fn test_mm256_sad_epu8() {
5115 let a = _mm256_set1_epi8(2);
5116 let b = _mm256_set1_epi8(4);
5117 let r = _mm256_sad_epu8(a, b);
5118 let e = _mm256_set1_epi64x(16);
5119 assert_eq_m256i(r, e);
5120 }
5121
5122 #[simd_test(enable = "avx2")]
5123 unsafe fn test_mm256_shufflehi_epi16() {
5124 #[rustfmt::skip]
5125 let a = _mm256_setr_epi16(
5126 0, 1, 2, 3, 11, 22, 33, 44,
5127 4, 5, 6, 7, 55, 66, 77, 88,
5128 );
5129 #[rustfmt::skip]
5130 let e = _mm256_setr_epi16(
5131 0, 1, 2, 3, 44, 22, 22, 11,
5132 4, 5, 6, 7, 88, 66, 66, 55,
5133 );
5134 let r = _mm256_shufflehi_epi16(a, 0b00_01_01_11);
5135 assert_eq_m256i(r, e);
5136 }
5137
5138 #[simd_test(enable = "avx2")]
5139 unsafe fn test_mm256_shufflelo_epi16() {
5140 #[rustfmt::skip]
5141 let a = _mm256_setr_epi16(
5142 11, 22, 33, 44, 0, 1, 2, 3,
5143 55, 66, 77, 88, 4, 5, 6, 7,
5144 );
5145 #[rustfmt::skip]
5146 let e = _mm256_setr_epi16(
5147 44, 22, 22, 11, 0, 1, 2, 3,
5148 88, 66, 66, 55, 4, 5, 6, 7,
5149 );
5150 let r = _mm256_shufflelo_epi16(a, 0b00_01_01_11);
5151 assert_eq_m256i(r, e);
5152 }
5153
5154 #[simd_test(enable = "avx2")]
5155 unsafe fn test_mm256_sign_epi16() {
5156 let a = _mm256_set1_epi16(2);
5157 let b = _mm256_set1_epi16(-1);
5158 let r = _mm256_sign_epi16(a, b);
5159 let e = _mm256_set1_epi16(-2);
5160 assert_eq_m256i(r, e);
5161 }
5162
5163 #[simd_test(enable = "avx2")]
5164 unsafe fn test_mm256_sign_epi32() {
5165 let a = _mm256_set1_epi32(2);
5166 let b = _mm256_set1_epi32(-1);
5167 let r = _mm256_sign_epi32(a, b);
5168 let e = _mm256_set1_epi32(-2);
5169 assert_eq_m256i(r, e);
5170 }
5171
5172 #[simd_test(enable = "avx2")]
5173 unsafe fn test_mm256_sign_epi8() {
5174 let a = _mm256_set1_epi8(2);
5175 let b = _mm256_set1_epi8(-1);
5176 let r = _mm256_sign_epi8(a, b);
5177 let e = _mm256_set1_epi8(-2);
5178 assert_eq_m256i(r, e);
5179 }
5180
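// Shift-left variants tested below: `_mm256_sll_*` take the shift count from
// the low 64 bits of an `__m128i`, `_mm256_slli_*` take an immediate count,
// and `_mm256_sllv_*` shift each element by the matching element of a count
// vector.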
5181 #[simd_test(enable = "avx2")]
5182 unsafe fn test_mm256_sll_epi16() {
5183 let a = _mm256_set1_epi16(0xFF);
5184 let b = _mm_insert_epi16(_mm_set1_epi16(0), 4, 0);
5185 let r = _mm256_sll_epi16(a, b);
5186 assert_eq_m256i(r, _mm256_set1_epi16(0xFF0));
5187 }
5188
5189 #[simd_test(enable = "avx2")]
5190 unsafe fn test_mm256_sll_epi32() {
5191 let a = _mm256_set1_epi32(0xFFFF);
5192 let b = _mm_insert_epi32(_mm_set1_epi32(0), 4, 0);
5193 let r = _mm256_sll_epi32(a, b);
5194 assert_eq_m256i(r, _mm256_set1_epi32(0xFFFF0));
5195 }
5196
5197 #[simd_test(enable = "avx2")]
5198 unsafe fn test_mm256_sll_epi64() {
5199 let a = _mm256_set1_epi64x(0xFFFFFFFF);
5200 let b = _mm_insert_epi64(_mm_set1_epi64x(0), 4, 0);
5201 let r = _mm256_sll_epi64(a, b);
5202 assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFFF0));
5203 }
5204
5205 #[simd_test(enable = "avx2")]
5206 unsafe fn test_mm256_slli_epi16() {
5207 assert_eq_m256i(
5208 _mm256_slli_epi16(_mm256_set1_epi16(0xFF), 4),
5209 _mm256_set1_epi16(0xFF0),
5210 );
5211 }
5212
5213 #[simd_test(enable = "avx2")]
5214 unsafe fn test_mm256_slli_epi32() {
5215 assert_eq_m256i(
5216 _mm256_slli_epi32(_mm256_set1_epi32(0xFFFF), 4),
5217 _mm256_set1_epi32(0xFFFF0),
5218 );
5219 }
5220
5221 #[simd_test(enable = "avx2")]
5222 unsafe fn test_mm256_slli_epi64() {
5223 assert_eq_m256i(
5224 _mm256_slli_epi64(_mm256_set1_epi64x(0xFFFFFFFF), 4),
5225 _mm256_set1_epi64x(0xFFFFFFFF0),
5226 );
5227 }
5228
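// `_mm256_slli_si256`/`_mm256_srli_si256` shift by whole bytes and operate on
// each 128-bit lane independently, so zero bytes are shifted in at both lane
// boundaries.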
5229 #[simd_test(enable = "avx2")]
5230 unsafe fn test_mm256_slli_si256() {
5231 let a = _mm256_set1_epi64x(0xFFFFFFFF);
5232 let r = _mm256_slli_si256(a, 3);
5233 assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFFF000000));
5234 }
5235
5236 #[simd_test(enable = "avx2")]
5237 unsafe fn test_mm_sllv_epi32() {
5238 let a = _mm_set1_epi32(2);
5239 let b = _mm_set1_epi32(1);
5240 let r = _mm_sllv_epi32(a, b);
5241 let e = _mm_set1_epi32(4);
5242 assert_eq_m128i(r, e);
5243 }
5244
5245 #[simd_test(enable = "avx2")]
5246 unsafe fn test_mm256_sllv_epi32() {
5247 let a = _mm256_set1_epi32(2);
5248 let b = _mm256_set1_epi32(1);
5249 let r = _mm256_sllv_epi32(a, b);
5250 let e = _mm256_set1_epi32(4);
5251 assert_eq_m256i(r, e);
5252 }
5253
5254 #[simd_test(enable = "avx2")]
5255 unsafe fn test_mm_sllv_epi64() {
5256 let a = _mm_set1_epi64x(2);
5257 let b = _mm_set1_epi64x(1);
5258 let r = _mm_sllv_epi64(a, b);
5259 let e = _mm_set1_epi64x(4);
5260 assert_eq_m128i(r, e);
5261 }
5262
5263 #[simd_test(enable = "avx2")]
5264 unsafe fn test_mm256_sllv_epi64() {
5265 let a = _mm256_set1_epi64x(2);
5266 let b = _mm256_set1_epi64x(1);
5267 let r = _mm256_sllv_epi64(a, b);
5268 let e = _mm256_set1_epi64x(4);
5269 assert_eq_m256i(r, e);
5270 }
5271
5272 #[simd_test(enable = "avx2")]
5273 unsafe fn test_mm256_sra_epi16() {
5274 let a = _mm256_set1_epi16(-1);
5275 let b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
5276 let r = _mm256_sra_epi16(a, b);
5277 assert_eq_m256i(r, _mm256_set1_epi16(-1));
5278 }
5279
5280 #[simd_test(enable = "avx2")]
5281 unsafe fn test_mm256_sra_epi32() {
5282 let a = _mm256_set1_epi32(-1);
5283 let b = _mm_insert_epi32(_mm_set1_epi32(0), 1, 0);
5284 let r = _mm256_sra_epi32(a, b);
5285 assert_eq_m256i(r, _mm256_set1_epi32(-1));
5286 }
5287
5288 #[simd_test(enable = "avx2")]
5289 unsafe fn test_mm256_srai_epi16() {
5290 assert_eq_m256i(
5291 _mm256_srai_epi16(_mm256_set1_epi16(-1), 1),
5292 _mm256_set1_epi16(-1),
5293 );
5294 }
5295
5296 #[simd_test(enable = "avx2")]
5297 unsafe fn test_mm256_srai_epi32() {
5298 assert_eq_m256i(
5299 _mm256_srai_epi32(_mm256_set1_epi32(-1), 1),
5300 _mm256_set1_epi32(-1),
5301 );
5302 }
5303
5304 #[simd_test(enable = "avx2")]
5305 unsafe fn test_mm_srav_epi32() {
5306 let a = _mm_set1_epi32(4);
5307 let count = _mm_set1_epi32(1);
5308 let r = _mm_srav_epi32(a, count);
5309 let e = _mm_set1_epi32(2);
5310 assert_eq_m128i(r, e);
5311 }
5312
5313 #[simd_test(enable = "avx2")]
5314 unsafe fn test_mm256_srav_epi32() {
5315 let a = _mm256_set1_epi32(4);
5316 let count = _mm256_set1_epi32(1);
5317 let r = _mm256_srav_epi32(a, count);
5318 let e = _mm256_set1_epi32(2);
5319 assert_eq_m256i(r, e);
5320 }
5321
5322 #[simd_test(enable = "avx2")]
5323 unsafe fn test_mm256_srli_si256() {
5324 #[rustfmt::skip]
5325 let a = _mm256_setr_epi8(
5326 1, 2, 3, 4, 5, 6, 7, 8,
5327 9, 10, 11, 12, 13, 14, 15, 16,
5328 17, 18, 19, 20, 21, 22, 23, 24,
5329 25, 26, 27, 28, 29, 30, 31, 32,
5330 );
5331 let r = _mm256_srli_si256(a, 3);
5332 #[rustfmt::skip]
5333 let e = _mm256_setr_epi8(
5334 4, 5, 6, 7, 8, 9, 10, 11,
5335 12, 13, 14, 15, 16, 0, 0, 0,
5336 20, 21, 22, 23, 24, 25, 26, 27,
5337 28, 29, 30, 31, 32, 0, 0, 0,
5338 );
5339 assert_eq_m256i(r, e);
5340 }
5341
5342 #[simd_test(enable = "avx2")]
5343 unsafe fn test_mm256_srl_epi16() {
5344 let a = _mm256_set1_epi16(0xFF);
5345 let b = _mm_insert_epi16(_mm_set1_epi16(0), 4, 0);
5346 let r = _mm256_srl_epi16(a, b);
5347 assert_eq_m256i(r, _mm256_set1_epi16(0xF));
5348 }
5349
5350 #[simd_test(enable = "avx2")]
5351 unsafe fn test_mm256_srl_epi32() {
5352 let a = _mm256_set1_epi32(0xFFFF);
5353 let b = _mm_insert_epi32(_mm_set1_epi32(0), 4, 0);
5354 let r = _mm256_srl_epi32(a, b);
5355 assert_eq_m256i(r, _mm256_set1_epi32(0xFFF));
5356 }
5357
5358 #[simd_test(enable = "avx2")]
5359 unsafe fn test_mm256_srl_epi64() {
5360 let a = _mm256_set1_epi64x(0xFFFFFFFF);
5361 let b = _mm_setr_epi64x(4, 0);
5362 let r = _mm256_srl_epi64(a, b);
5363 assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFF));
5364 }
5365
5366 #[simd_test(enable = "avx2")]
5367 unsafe fn test_mm256_srli_epi16() {
5368 assert_eq_m256i(
5369 _mm256_srli_epi16(_mm256_set1_epi16(0xFF), 4),
5370 _mm256_set1_epi16(0xF),
5371 );
5372 }
5373
5374 #[simd_test(enable = "avx2")]
5375 unsafe fn test_mm256_srli_epi32() {
5376 assert_eq_m256i(
5377 _mm256_srli_epi32(_mm256_set1_epi32(0xFFFF), 4),
5378 _mm256_set1_epi32(0xFFF),
5379 );
5380 }
5381
5382 #[simd_test(enable = "avx2")]
5383 unsafe fn test_mm256_srli_epi64() {
5384 assert_eq_m256i(
5385 _mm256_srli_epi64(_mm256_set1_epi64x(0xFFFFFFFF), 4),
5386 _mm256_set1_epi64x(0xFFFFFFF),
5387 );
5388 }
5389
5390 #[simd_test(enable = "avx2")]
5391 unsafe fn test_mm_srlv_epi32() {
5392 let a = _mm_set1_epi32(2);
5393 let count = _mm_set1_epi32(1);
5394 let r = _mm_srlv_epi32(a, count);
5395 let e = _mm_set1_epi32(1);
5396 assert_eq_m128i(r, e);
5397 }
5398
5399 #[simd_test(enable = "avx2")]
5400 unsafe fn test_mm256_srlv_epi32() {
5401 let a = _mm256_set1_epi32(2);
5402 let count = _mm256_set1_epi32(1);
5403 let r = _mm256_srlv_epi32(a, count);
5404 let e = _mm256_set1_epi32(1);
5405 assert_eq_m256i(r, e);
5406 }
5407
5408 #[simd_test(enable = "avx2")]
5409 unsafe fn test_mm_srlv_epi64() {
5410 let a = _mm_set1_epi64x(2);
5411 let count = _mm_set1_epi64x(1);
5412 let r = _mm_srlv_epi64(a, count);
5413 let e = _mm_set1_epi64x(1);
5414 assert_eq_m128i(r, e);
5415 }
5416
5417 #[simd_test(enable = "avx2")]
5418 unsafe fn test_mm256_srlv_epi64() {
5419 let a = _mm256_set1_epi64x(2);
5420 let count = _mm256_set1_epi64x(1);
5421 let r = _mm256_srlv_epi64(a, count);
5422 let e = _mm256_set1_epi64x(1);
5423 assert_eq_m256i(r, e);
5424 }
5425
5426 #[simd_test(enable = "avx2")]
5427 unsafe fn test_mm256_sub_epi16() {
5428 let a = _mm256_set1_epi16(4);
5429 let b = _mm256_set1_epi16(2);
5430 let r = _mm256_sub_epi16(a, b);
5431 assert_eq_m256i(r, b);
5432 }
5433
5434 #[simd_test(enable = "avx2")]
5435 unsafe fn test_mm256_sub_epi32() {
5436 let a = _mm256_set1_epi32(4);
5437 let b = _mm256_set1_epi32(2);
5438 let r = _mm256_sub_epi32(a, b);
5439 assert_eq_m256i(r, b);
5440 }
5441
5442 #[simd_test(enable = "avx2")]
5443 unsafe fn test_mm256_sub_epi64() {
5444 let a = _mm256_set1_epi64x(4);
5445 let b = _mm256_set1_epi64x(2);
5446 let r = _mm256_sub_epi64(a, b);
5447 assert_eq_m256i(r, b);
5448 }
5449
5450 #[simd_test(enable = "avx2")]
5451 unsafe fn test_mm256_sub_epi8() {
5452 let a = _mm256_set1_epi8(4);
5453 let b = _mm256_set1_epi8(2);
5454 let r = _mm256_sub_epi8(a, b);
5455 assert_eq_m256i(r, b);
5456 }
5457
5458 #[simd_test(enable = "avx2")]
5459 unsafe fn test_mm256_subs_epi16() {
5460 let a = _mm256_set1_epi16(4);
5461 let b = _mm256_set1_epi16(2);
5462 let r = _mm256_subs_epi16(a, b);
5463 assert_eq_m256i(r, b);
5464 }
5465
5466 #[simd_test(enable = "avx2")]
5467 unsafe fn test_mm256_subs_epi8() {
5468 let a = _mm256_set1_epi8(4);
5469 let b = _mm256_set1_epi8(2);
5470 let r = _mm256_subs_epi8(a, b);
5471 assert_eq_m256i(r, b);
5472 }
5473
5474 #[simd_test(enable = "avx2")]
5475 unsafe fn test_mm256_subs_epu16() {
5476 let a = _mm256_set1_epi16(4);
5477 let b = _mm256_set1_epi16(2);
5478 let r = _mm256_subs_epu16(a, b);
5479 assert_eq_m256i(r, b);
5480 }
5481
5482 #[simd_test(enable = "avx2")]
5483 unsafe fn test_mm256_subs_epu8() {
5484 let a = _mm256_set1_epi8(4);
5485 let b = _mm256_set1_epi8(2);
5486 let r = _mm256_subs_epu8(a, b);
5487 assert_eq_m256i(r, b);
5488 }
5489
5490 #[simd_test(enable = "avx2")]
5491 unsafe fn test_mm256_xor_si256() {
5492 let a = _mm256_set1_epi8(5);
5493 let b = _mm256_set1_epi8(3);
5494 let r = _mm256_xor_si256(a, b);
5495 assert_eq_m256i(r, _mm256_set1_epi8(6));
5496 }
5497
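// `_mm256_alignr_epi8` works per 128-bit lane: it concatenates the lane of
// `a` (high) with the lane of `b` (low), shifts the 32-byte value right by the
// byte offset, and keeps the low 16 bytes. Offsets of 32 or more produce
// zero, offset 16 yields `a`, and offset 0 yields `b`, as asserted below.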
5498 #[simd_test(enable = "avx2")]
5499 unsafe fn test_mm256_alignr_epi8() {
5500 #[rustfmt::skip]
5501 let a = _mm256_setr_epi8(
5502 1, 2, 3, 4, 5, 6, 7, 8,
5503 9, 10, 11, 12, 13, 14, 15, 16,
5504 17, 18, 19, 20, 21, 22, 23, 24,
5505 25, 26, 27, 28, 29, 30, 31, 32,
5506 );
5507 #[rustfmt::skip]
5508 let b = _mm256_setr_epi8(
5509 -1, -2, -3, -4, -5, -6, -7, -8,
5510 -9, -10, -11, -12, -13, -14, -15, -16,
5511 -17, -18, -19, -20, -21, -22, -23, -24,
5512 -25, -26, -27, -28, -29, -30, -31, -32,
5513 );
5514 let r = _mm256_alignr_epi8(a, b, 33);
5515 assert_eq_m256i(r, _mm256_set1_epi8(0));
5516
5517 let r = _mm256_alignr_epi8(a, b, 17);
5518 #[rustfmt::skip]
5519 let expected = _mm256_setr_epi8(
5520 2, 3, 4, 5, 6, 7, 8, 9,
5521 10, 11, 12, 13, 14, 15, 16, 0,
5522 18, 19, 20, 21, 22, 23, 24, 25,
5523 26, 27, 28, 29, 30, 31, 32, 0,
5524 );
5525 assert_eq_m256i(r, expected);
5526
5527 let r = _mm256_alignr_epi8(a, b, 4);
5528 #[rustfmt::skip]
5529 let expected = _mm256_setr_epi8(
5530 -5, -6, -7, -8, -9, -10, -11, -12,
5531 -13, -14, -15, -16, 1, 2, 3, 4,
5532 -21, -22, -23, -24, -25, -26, -27, -28,
5533 -29, -30, -31, -32, 17, 18, 19, 20,
5534 );
5535 assert_eq_m256i(r, expected);
5536
5537 #[rustfmt::skip]
5538 let expected = _mm256_setr_epi8(
5539 1, 2, 3, 4, 5, 6, 7, 8,
5540 9, 10, 11, 12, 13, 14, 15, 16,
5541 17, 18, 19, 20, 21, 22, 23, 24,
5542 25, 26, 27, 28, 29, 30, 31, 32,
5543 );
5544 let r = _mm256_alignr_epi8(a, b, 16);
5545 assert_eq_m256i(r, expected);
5546
5547 let r = _mm256_alignr_epi8(a, b, 15);
5548 #[rustfmt::skip]
5549 let expected = _mm256_setr_epi8(
5550 -16, 1, 2, 3, 4, 5, 6, 7,
5551 8, 9, 10, 11, 12, 13, 14, 15,
5552 -32, 17, 18, 19, 20, 21, 22, 23,
5553 24, 25, 26, 27, 28, 29, 30, 31,
5554 );
5555 assert_eq_m256i(r, expected);
5556
5557 let r = _mm256_alignr_epi8(a, b, 0);
5558 assert_eq_m256i(r, b);
5559 }
5560
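// `_mm256_shuffle_epi8` is a per-lane byte shuffle: the low 4 bits of each
// control byte index into the same 128-bit lane of `a`, and a set MSB
// (e.g. the `128u8` controls below) zeroes that output byte.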
5561 #[simd_test(enable = "avx2")]
5562 unsafe fn test_mm256_shuffle_epi8() {
5563 #[rustfmt::skip]
5564 let a = _mm256_setr_epi8(
5565 1, 2, 3, 4, 5, 6, 7, 8,
5566 9, 10, 11, 12, 13, 14, 15, 16,
5567 17, 18, 19, 20, 21, 22, 23, 24,
5568 25, 26, 27, 28, 29, 30, 31, 32,
5569 );
5570 #[rustfmt::skip]
5571 let b = _mm256_setr_epi8(
5572 4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
5573 12, 5, 5, 10, 4, 1, 8, 0,
5574 4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
5575 12, 5, 5, 10, 4, 1, 8, 0,
5576 );
5577 #[rustfmt::skip]
5578 let expected = _mm256_setr_epi8(
5579 5, 0, 5, 4, 9, 13, 7, 4,
5580 13, 6, 6, 11, 5, 2, 9, 1,
5581 21, 0, 21, 20, 25, 29, 23, 20,
5582 29, 22, 22, 27, 21, 18, 25, 17,
5583 );
5584 let r = _mm256_shuffle_epi8(a, b);
5585 assert_eq_m256i(r, expected);
5586 }
5587
5588 #[simd_test(enable = "avx2")]
5589 unsafe fn test_mm256_permutevar8x32_epi32() {
5590 let a = _mm256_setr_epi32(100, 200, 300, 400, 500, 600, 700, 800);
5591 let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
5592 let expected = _mm256_setr_epi32(600, 100, 600, 200, 800, 700, 400, 500);
5593 let r = _mm256_permutevar8x32_epi32(a, b);
5594 assert_eq_m256i(r, expected);
5595 }
5596
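// `_mm256_permute4x64_epi64` packs four 2-bit source selectors into the
// immediate: 0b00_01_00_11 reads lanes [3, 0, 1, 0], giving
// (400, 100, 200, 100) below.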
5597 #[simd_test(enable = "avx2")]
5598 unsafe fn test_mm256_permute4x64_epi64() {
5599 let a = _mm256_setr_epi64x(100, 200, 300, 400);
5600 let expected = _mm256_setr_epi64x(400, 100, 200, 100);
5601 let r = _mm256_permute4x64_epi64(a, 0b00010011);
5602 assert_eq_m256i(r, expected);
5603 }
5604
5605 #[simd_test(enable = "avx2")]
5606 unsafe fn test_mm256_permute2x128_si256() {
5607 let a = _mm256_setr_epi64x(100, 200, 500, 600);
5608 let b = _mm256_setr_epi64x(300, 400, 700, 800);
5609 let r = _mm256_permute2x128_si256(a, b, 0b00_01_00_11);
5610 let e = _mm256_setr_epi64x(700, 800, 500, 600);
5611 assert_eq_m256i(r, e);
5612 }
5613
5614 #[simd_test(enable = "avx2")]
5615 unsafe fn test_mm256_permute4x64_pd() {
5616 let a = _mm256_setr_pd(1., 2., 3., 4.);
5617 let r = _mm256_permute4x64_pd(a, 0b00_01_00_11);
5618 let e = _mm256_setr_pd(4., 1., 2., 1.);
5619 assert_eq_m256d(r, e);
5620 }
5621
5622 #[simd_test(enable = "avx2")]
5623 unsafe fn test_mm256_permutevar8x32_ps() {
5624 let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
5625 let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
5626 let r = _mm256_permutevar8x32_ps(a, b);
5627 let e = _mm256_setr_ps(6., 1., 6., 2., 8., 7., 4., 5.);
5628 assert_eq_m256(r, e);
5629 }
5630
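// The gather tests below load from base + index * scale, where the scale is
// given in bytes; a scale of 4 therefore steps through whole i32/f32 elements
// and a scale of 8 through whole i64/f64 elements.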
5631 #[simd_test(enable = "avx2")]
5632 unsafe fn test_mm_i32gather_epi32() {
5633 let mut arr = [0i32; 128];
5634 for i in 0..128i32 {
5635 arr[i as usize] = i;
5636 }
5637 // A multiplier of 4 is word-addressing
5638 let r = _mm_i32gather_epi32(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48), 4);
5639 assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48));
5640 }
5641
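// For the masked gathers, only indices whose mask element has its MSB set are
// fetched; the remaining lanes are taken from the `src` argument (256 or
// 256.0 in these tests).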
5642 #[simd_test(enable = "avx2")]
5643 unsafe fn test_mm_mask_i32gather_epi32() {
5644 let mut arr = [0i32; 128];
5645 for i in 0..128i32 {
5646 arr[i as usize] = i;
5647 }
5648 // A multiplier of 4 is word-addressing
5649 let r = _mm_mask_i32gather_epi32(
5650 _mm_set1_epi32(256),
5651 arr.as_ptr(),
5652 _mm_setr_epi32(0, 16, 64, 96),
5653 _mm_setr_epi32(-1, -1, -1, 0),
5654 4,
5655 );
5656 assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256));
5657 }
5658
5659 #[simd_test(enable = "avx2")]
5660 unsafe fn test_mm256_i32gather_epi32() {
5661 let mut arr = [0i32; 128];
5662 for i in 0..128i32 {
5663 arr[i as usize] = i;
5664 }
5665 // A multiplier of 4 is word-addressing
5666 let r = _mm256_i32gather_epi32(
5667 arr.as_ptr(),
5668 _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4),
5669 4,
5670 );
5671 assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
5672 }
5673
5674 #[simd_test(enable = "avx2")]
5675 unsafe fn test_mm256_mask_i32gather_epi32() {
5676 let mut arr = [0i32; 128];
5677 for i in 0..128i32 {
5678 arr[i as usize] = i;
5679 }
5680 // A multiplier of 4 is word-addressing
5681 let r = _mm256_mask_i32gather_epi32(
5682 _mm256_set1_epi32(256),
5683 arr.as_ptr(),
5684 _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0),
5685 _mm256_setr_epi32(-1, -1, -1, 0, 0, 0, 0, 0),
5686 4,
5687 );
5688 assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 64, 256, 256, 256, 256, 256));
5689 }
5690
5691 #[simd_test(enable = "avx2")]
5692 unsafe fn test_mm_i32gather_ps() {
5693 let mut arr = [0.0f32; 128];
5694 let mut j = 0.0;
5695 for i in 0..128usize {
5696 arr[i] = j;
5697 j += 1.0;
5698 }
5699 // A multiplier of 4 is word-addressing for f32s
5700 let r = _mm_i32gather_ps(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48), 4);
5701 assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0));
5702 }
5703
5704 #[simd_test(enable = "avx2")]
5705 unsafe fn test_mm_mask_i32gather_ps() {
5706 let mut arr = [0.0f32; 128];
5707 let mut j = 0.0;
5708 for i in 0..128usize {
5709 arr[i] = j;
5710 j += 1.0;
5711 }
5712 // A multiplier of 4 is word-addressing for f32s
5713 let r = _mm_mask_i32gather_ps(
5714 _mm_set1_ps(256.0),
5715 arr.as_ptr(),
5716 _mm_setr_epi32(0, 16, 64, 96),
5717 _mm_setr_ps(-1.0, -1.0, -1.0, 0.0),
5718 4,
5719 );
5720 assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0));
5721 }
5722
5723 #[simd_test(enable = "avx2")]
5724 unsafe fn test_mm256_i32gather_ps() {
5725 let mut arr = [0.0f32; 128];
5726 let mut j = 0.0;
5727 for i in 0..128usize {
5728 arr[i] = j;
5729 j += 1.0;
5730 }
5731 // A multiplier of 4 is word-addressing for f32s
5732 let r = _mm256_i32gather_ps(
5733 arr.as_ptr(),
5734 _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4),
5735 4,
5736 );
5737 assert_eq_m256(r, _mm256_setr_ps(0.0, 16.0, 32.0, 48.0, 1.0, 2.0, 3.0, 4.0));
5738 }
5739
5740 #[simd_test(enable = "avx2")]
5741 unsafe fn test_mm256_mask_i32gather_ps() {
5742 let mut arr = [0.0f32; 128];
5743 let mut j = 0.0;
5744 for i in 0..128usize {
5745 arr[i] = j;
5746 j += 1.0;
5747 }
5748 // A multiplier of 4 is word-addressing for f32s
5749 let r = _mm256_mask_i32gather_ps(
5750 _mm256_set1_ps(256.0),
5751 arr.as_ptr(),
5752 _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0),
5753 _mm256_setr_ps(-1.0, -1.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0),
5754 4,
5755 );
5756 assert_eq_m256(
5757 r,
5758 _mm256_setr_ps(0.0, 16.0, 64.0, 256.0, 256.0, 256.0, 256.0, 256.0),
5759 );
5760 }
5761
5762 #[simd_test(enable = "avx2")]
5763 unsafe fn test_mm_i32gather_epi64() {
5764 let mut arr = [0i64; 128];
5765 for i in 0..128i64 {
5766 arr[i as usize] = i;
5767 }
5768 // A multiplier of 8 is word-addressing for i64s
5769 let r = _mm_i32gather_epi64(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0), 8);
5770 assert_eq_m128i(r, _mm_setr_epi64x(0, 16));
5771 }
5772
5773 #[simd_test(enable = "avx2")]
5774 unsafe fn test_mm_mask_i32gather_epi64() {
5775 let mut arr = [0i64; 128];
5776 for i in 0..128i64 {
5777 arr[i as usize] = i;
5778 }
5779 // A multiplier of 8 is word-addressing for i64s
5780 let r = _mm_mask_i32gather_epi64(
5781 _mm_set1_epi64x(256),
5782 arr.as_ptr(),
5783 _mm_setr_epi32(16, 16, 16, 16),
5784 _mm_setr_epi64x(-1, 0),
5785 8,
5786 );
5787 assert_eq_m128i(r, _mm_setr_epi64x(16, 256));
5788 }
5789
5790 #[simd_test(enable = "avx2")]
5791 unsafe fn test_mm256_i32gather_epi64() {
5792 let mut arr = [0i64; 128];
5793 for i in 0..128i64 {
5794 arr[i as usize] = i;
5795 }
5796 // A multiplier of 8 is word-addressing for i64s
5797 let r = _mm256_i32gather_epi64(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48), 8);
5798 assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48));
5799 }
5800
5801 #[simd_test(enable = "avx2")]
5802 unsafe fn test_mm256_mask_i32gather_epi64() {
5803 let mut arr = [0i64; 128];
5804 for i in 0..128i64 {
5805 arr[i as usize] = i;
5806 }
5807 // A multiplier of 8 is word-addressing for i64s
5808 let r = _mm256_mask_i32gather_epi64(
5809 _mm256_set1_epi64x(256),
5810 arr.as_ptr(),
5811 _mm_setr_epi32(0, 16, 64, 96),
5812 _mm256_setr_epi64x(-1, -1, -1, 0),
5813 8,
5814 );
5815 assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256));
5816 }
5817
5818 #[simd_test(enable = "avx2")]
5819 unsafe fn test_mm_i32gather_pd() {
5820 let mut arr = [0.0f64; 128];
5821 let mut j = 0.0;
5822 for i in 0..128usize {
5823 arr[i] = j;
5824 j += 1.0;
5825 }
5826 // A multiplier of 8 is word-addressing for f64s
5827 let r = _mm_i32gather_pd(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0), 8);
5828 assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0));
5829 }
5830
5831 #[simd_test(enable = "avx2")]
5832 unsafe fn test_mm_mask_i32gather_pd() {
5833 let mut arr = [0.0f64; 128];
5834 let mut j = 0.0;
5835 for i in 0..128usize {
5836 arr[i] = j;
5837 j += 1.0;
5838 }
5839 // A multiplier of 8 is word-addressing for f64s
5840 let r = _mm_mask_i32gather_pd(
5841 _mm_set1_pd(256.0),
5842 arr.as_ptr(),
5843 _mm_setr_epi32(16, 16, 16, 16),
5844 _mm_setr_pd(-1.0, 0.0),
5845 8,
5846 );
5847 assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0));
5848 }
5849
5850 #[simd_test(enable = "avx2")]
5851 unsafe fn test_mm256_i32gather_pd() {
5852 let mut arr = [0.0f64; 128];
5853 let mut j = 0.0;
5854 for i in 0..128usize {
5855 arr[i] = j;
5856 j += 1.0;
5857 }
5858 // A multiplier of 8 is word-addressing for f64s
5859 let r = _mm256_i32gather_pd(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48), 8);
5860 assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0));
5861 }
5862
5863 #[simd_test(enable = "avx2")]
5864 unsafe fn test_mm256_mask_i32gather_pd() {
5865 let mut arr = [0.0f64; 128];
5866 let mut j = 0.0;
5867 for i in 0..128usize {
5868 arr[i] = j;
5869 j += 1.0;
5870 }
5871 // A multiplier of 8 is word-addressing for f64s
5872 let r = _mm256_mask_i32gather_pd(
5873 _mm256_set1_pd(256.0),
5874 arr.as_ptr(),
5875 _mm_setr_epi32(0, 16, 64, 96),
5876 _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0),
5877 8,
5878 );
5879 assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0));
5880 }
5881
5882 #[simd_test(enable = "avx2")]
5883 unsafe fn test_mm_i64gather_epi32() {
5884 let mut arr = [0i32; 128];
5885 for i in 0..128i32 {
5886 arr[i as usize] = i;
5887 }
5888 // A multiplier of 4 is word-addressing
5889 let r = _mm_i64gather_epi32(arr.as_ptr(), _mm_setr_epi64x(0, 16), 4);
5890 assert_eq_m128i(r, _mm_setr_epi32(0, 16, 0, 0));
5891 }
5892
5893 #[simd_test(enable = "avx2")]
5894 unsafe fn test_mm_mask_i64gather_epi32() {
5895 let mut arr = [0i32; 128];
5896 for i in 0..128i32 {
5897 arr[i as usize] = i;
5898 }
5899 // A multiplier of 4 is word-addressing
5900 let r = _mm_mask_i64gather_epi32(
5901 _mm_set1_epi32(256),
5902 arr.as_ptr(),
5903 _mm_setr_epi64x(0, 16),
5904 _mm_setr_epi32(-1, 0, -1, 0),
5905 4,
5906 );
5907 assert_eq_m128i(r, _mm_setr_epi32(0, 256, 0, 0));
5908 }
5909
5910 #[simd_test(enable = "avx2")]
5911 unsafe fn test_mm256_i64gather_epi32() {
5912 let mut arr = [0i32; 128];
5913 for i in 0..128i32 {
5914 arr[i as usize] = i;
5915 }
5916 // A multiplier of 4 is word-addressing
5917 let r = _mm256_i64gather_epi32(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48), 4);
5918 assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48));
5919 }
5920
5921 #[simd_test(enable = "avx2")]
5922 unsafe fn test_mm256_mask_i64gather_epi32() {
5923 let mut arr = [0i32; 128];
5924 for i in 0..128i32 {
5925 arr[i as usize] = i;
5926 }
5927 // A multiplier of 4 is word-addressing
5928 let r = _mm256_mask_i64gather_epi32(
5929 _mm_set1_epi32(256),
5930 arr.as_ptr(),
5931 _mm256_setr_epi64x(0, 16, 64, 96),
5932 _mm_setr_epi32(-1, -1, -1, 0),
5933 4,
5934 );
5935 assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256));
5936 }
5937
5938 #[simd_test(enable = "avx2")]
5939 unsafe fn test_mm_i64gather_ps() {
5940 let mut arr = [0.0f32; 128];
5941 let mut j = 0.0;
5942 for i in 0..128usize {
5943 arr[i] = j;
5944 j += 1.0;
5945 }
5946 // A multiplier of 4 is word-addressing for f32s
5947 let r = _mm_i64gather_ps(arr.as_ptr(), _mm_setr_epi64x(0, 16), 4);
5948 assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 0.0, 0.0));
5949 }
5950
5951 #[simd_test(enable = "avx2")]
5952 unsafe fn test_mm_mask_i64gather_ps() {
5953 let mut arr = [0.0f32; 128];
5954 let mut j = 0.0;
5955 for i in 0..128usize {
5956 arr[i] = j;
5957 j += 1.0;
5958 }
5959 // A multiplier of 4 is word-addressing for f32s
5960 let r = _mm_mask_i64gather_ps(
5961 _mm_set1_ps(256.0),
5962 arr.as_ptr(),
5963 _mm_setr_epi64x(0, 16),
5964 _mm_setr_ps(-1.0, 0.0, -1.0, 0.0),
5965 4,
5966 );
5967 assert_eq_m128(r, _mm_setr_ps(0.0, 256.0, 0.0, 0.0));
5968 }
5969
5970 #[simd_test(enable = "avx2")]
5971 unsafe fn test_mm256_i64gather_ps() {
5972 let mut arr = [0.0f32; 128];
5973 let mut j = 0.0;
5974 for i in 0..128usize {
5975 arr[i] = j;
5976 j += 1.0;
5977 }
5978 // A multiplier of 4 is word-addressing for f32s
5979 let r = _mm256_i64gather_ps(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48), 4);
5980 assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0));
5981 }
5982
5983 #[simd_test(enable = "avx2")]
5984 unsafe fn test_mm256_mask_i64gather_ps() {
5985 let mut arr = [0.0f32; 128];
5986 let mut j = 0.0;
5987 for i in 0..128usize {
5988 arr[i] = j;
5989 j += 1.0;
5990 }
5991 // A multiplier of 4 is word-addressing for f32s
5992 let r = _mm256_mask_i64gather_ps(
5993 _mm_set1_ps(256.0),
5994 arr.as_ptr(),
5995 _mm256_setr_epi64x(0, 16, 64, 96),
5996 _mm_setr_ps(-1.0, -1.0, -1.0, 0.0),
5997 4,
5998 );
5999 assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0));
6000 }
6001
6002 #[simd_test(enable = "avx2")]
6003 unsafe fn test_mm_i64gather_epi64() {
6004 let mut arr = [0i64; 128];
6005 for i in 0..128i64 {
6006 arr[i as usize] = i;
6007 }
6008 // A multiplier of 8 is word-addressing for i64s
6009 let r = _mm_i64gather_epi64(arr.as_ptr(), _mm_setr_epi64x(0, 16), 8);
6010 assert_eq_m128i(r, _mm_setr_epi64x(0, 16));
6011 }
6012
6013 #[simd_test(enable = "avx2")]
6014 unsafe fn test_mm_mask_i64gather_epi64() {
6015 let mut arr = [0i64; 128];
6016 for i in 0..128i64 {
6017 arr[i as usize] = i;
6018 }
6019 // A multiplier of 8 is word-addressing for i64s
6020 let r = _mm_mask_i64gather_epi64(
6021 _mm_set1_epi64x(256),
6022 arr.as_ptr(),
6023 _mm_setr_epi64x(16, 16),
6024 _mm_setr_epi64x(-1, 0),
6025 8,
6026 );
6027 assert_eq_m128i(r, _mm_setr_epi64x(16, 256));
6028 }
6029
6030 #[simd_test(enable = "avx2")]
6031 unsafe fn test_mm256_i64gather_epi64() {
6032 let mut arr = [0i64; 128];
6033 for i in 0..128i64 {
6034 arr[i as usize] = i;
6035 }
6036 // A multiplier of 8 is word-addressing for i64s
6037 let r = _mm256_i64gather_epi64(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48), 8);
6038 assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48));
6039 }
6040
6041 #[simd_test(enable = "avx2")]
6042 unsafe fn test_mm256_mask_i64gather_epi64() {
6043 let mut arr = [0i64; 128];
6044 for i in 0..128i64 {
6045 arr[i as usize] = i;
6046 }
6047 // A multiplier of 8 is word-addressing for i64s
6048 let r = _mm256_mask_i64gather_epi64(
6049 _mm256_set1_epi64x(256),
6050 arr.as_ptr(),
6051 _mm256_setr_epi64x(0, 16, 64, 96),
6052 _mm256_setr_epi64x(-1, -1, -1, 0),
6053 8,
6054 );
6055 assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256));
6056 }
6057
6058 #[simd_test(enable = "avx2")]
6059 unsafe fn test_mm_i64gather_pd() {
6060 let mut arr = [0.0f64; 128];
6061 let mut j = 0.0;
6062 for i in 0..128usize {
6063 arr[i] = j;
6064 j += 1.0;
6065 }
6066 // A multiplier of 8 is word-addressing for f64s
6067 let r = _mm_i64gather_pd(arr.as_ptr(), _mm_setr_epi64x(0, 16), 8);
6068 assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0));
6069 }
6070
6071 #[simd_test(enable = "avx2")]
6072 unsafe fn test_mm_mask_i64gather_pd() {
6073 let mut arr = [0.0f64; 128];
6074 let mut j = 0.0;
6075 for i in 0..128usize {
6076 arr[i] = j;
6077 j += 1.0;
6078 }
6079 // A multiplier of 8 is word-addressing for f64s
6080 let r = _mm_mask_i64gather_pd(
6081 _mm_set1_pd(256.0),
6082 arr.as_ptr(),
6083 _mm_setr_epi64x(16, 16),
6084 _mm_setr_pd(-1.0, 0.0),
6085 8,
6086 );
6087 assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0));
6088 }
6089
6090 #[simd_test(enable = "avx2")]
6091 unsafe fn test_mm256_i64gather_pd() {
6092 let mut arr = [0.0f64; 128];
6093 let mut j = 0.0;
6094 for i in 0..128usize {
6095 arr[i] = j;
6096 j += 1.0;
6097 }
6098 // A multiplier of 8 is word-addressing for f64s
6099 let r = _mm256_i64gather_pd(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48), 8);
6100 assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0));
6101 }
6102
6103 #[simd_test(enable = "avx2")]
6104 unsafe fn test_mm256_mask_i64gather_pd() {
6105 let mut arr = [0.0f64; 128];
6106 let mut j = 0.0;
6107 for i in 0..128usize {
6108 arr[i] = j;
6109 j += 1.0;
6110 }
6111 // A multiplier of 8 is word-addressing for f64s
6112 let r = _mm256_mask_i64gather_pd(
6113 _mm256_set1_pd(256.0),
6114 arr.as_ptr(),
6115 _mm256_setr_epi64x(0, 16, 64, 96),
6116 _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0),
6117 8,
6118 );
6119 assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0));
6120 }
6121
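// The extract tests below rely on the index being reduced modulo the element
// count (35 & 31 == 3, 19 & 15 == 3, 11 & 7 == 3) and on the 8/16-bit
// extracts being zero-extended, which is why -1 reads back as 0xFF / 0xFFFF,
// while the 32-bit extract returns the value as-is.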
6122 #[simd_test(enable = "avx2")]
6123 unsafe fn test_mm256_extract_epi8() {
6124 #[rustfmt::skip]
6125 let a = _mm256_setr_epi8(
6126 -1, 1, 2, 3, 4, 5, 6, 7,
6127 8, 9, 10, 11, 12, 13, 14, 15,
6128 16, 17, 18, 19, 20, 21, 22, 23,
6129 24, 25, 26, 27, 28, 29, 30, 31
6130 );
6131 let r1 = _mm256_extract_epi8(a, 0);
6132 let r2 = _mm256_extract_epi8(a, 35);
6133 assert_eq!(r1, 0xFF);
6134 assert_eq!(r2, 3);
6135 }
6136
6137 #[simd_test(enable = "avx2")]
6138 unsafe fn test_mm256_extract_epi16() {
6139 #[rustfmt::skip]
6140 let a = _mm256_setr_epi16(
6141 -1, 1, 2, 3, 4, 5, 6, 7,
6142 8, 9, 10, 11, 12, 13, 14, 15,
6143 );
6144 let r1 = _mm256_extract_epi16(a, 0);
6145 let r2 = _mm256_extract_epi16(a, 19);
6146 assert_eq!(r1, 0xFFFF);
6147 assert_eq!(r2, 3);
6148 }
6149
6150 #[simd_test(enable = "avx2")]
6151 unsafe fn test_mm256_extract_epi32() {
6152 let a = _mm256_setr_epi32(-1, 1, 2, 3, 4, 5, 6, 7);
6153 let r1 = _mm256_extract_epi32(a, 0);
6154 let r2 = _mm256_extract_epi32(a, 11);
6155 assert_eq!(r1, -1);
6156 assert_eq!(r2, 3);
6157 }
6158
6159 #[simd_test(enable = "avx2")]
6160 unsafe fn test_mm256_cvtsd_f64() {
6161 let a = _mm256_setr_pd(1., 2., 3., 4.);
6162 let r = _mm256_cvtsd_f64(a);
6163 assert_eq!(r, 1.);
6164 }
6165
6166 #[simd_test(enable = "avx2")]
6167 unsafe fn test_mm256_cvtsi256_si32() {
6168 let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
6169 let r = _mm256_cvtsi256_si32(a);
6170 assert_eq!(r, 1);
6171 }
6172 }