1 //! Advanced Vector Extensions 2 (AVX2)
2 //!
3 //! AVX2 expands most AVX commands to 256-bit wide vector registers and
4 //! adds [FMA](https://en.wikipedia.org/wiki/Fused_multiply-accumulate).
5 //!
6 //! The references are:
7 //!
8 //! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
9 //! Instruction Set Reference, A-Z][intel64_ref].
10 //! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
11 //! System Instructions][amd64_ref].
12 //!
13 //! Wikipedia's [AVX][wiki_avx] and [FMA][wiki_fma] pages provide a quick
14 //! overview of the instructions available.
15 //!
16 //! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
17 //! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
18 //! [wiki_avx]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions
19 //! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate
20
21 use crate::{
22 core_arch::{simd::*, simd_llvm::*, x86::*},
23 mem::transmute,
24 };
25
26 #[cfg(test)]
27 use stdarch_test::assert_instr;
28
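// Usage note (caller-side sketch, not part of this module): these intrinsics are
// all `unsafe` and compiled with `#[target_feature(enable = "avx2")]`, so callers
// outside `core` are expected to guard them with a runtime check, e.g.
// (`do_avx2_work` is a hypothetical user-defined wrapper):
//
//     if is_x86_feature_detected!("avx2") {
//         unsafe { do_avx2_work() };
//     }
//
// `is_x86_feature_detected!` is provided by `std`, which is why the check lives
// in user code rather than here.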
29 /// Computes the absolute values of packed 32-bit integers in `a`.
30 ///
31 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_abs_epi32)
32 #[inline]
33 #[target_feature(enable = "avx2")]
34 #[cfg_attr(test, assert_instr(vpabsd))]
35 #[stable(feature = "simd_x86", since = "1.27.0")]
36 pub unsafe fn _mm256_abs_epi32(a: __m256i) -> __m256i {
37 transmute(pabsd(a.as_i32x8()))
38 }
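// A minimal sketch of the lane-wise behaviour (illustration only, not upstream
// code; the function name and values are assumptions):
#[cfg(test)]
#[target_feature(enable = "avx2")]
unsafe fn _abs_epi32_demo() {
    let a = _mm256_setr_epi32(-1, 2, -3, 4, -5, 6, -7, 8);
    // Each 32-bit lane is replaced by its absolute value.
    let r: [i32; 8] = transmute(_mm256_abs_epi32(a));
    assert_eq!(r, [1, 2, 3, 4, 5, 6, 7, 8]);
}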
39
40 /// Computes the absolute values of packed 16-bit integers in `a`.
41 ///
42 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_abs_epi16)
43 #[inline]
44 #[target_feature(enable = "avx2")]
45 #[cfg_attr(test, assert_instr(vpabsw))]
46 #[stable(feature = "simd_x86", since = "1.27.0")]
47 pub unsafe fn _mm256_abs_epi16(a: __m256i) -> __m256i {
48 transmute(pabsw(a.as_i16x16()))
49 }
50
51 /// Computes the absolute values of packed 8-bit integers in `a`.
52 ///
53 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_abs_epi8)
54 #[inline]
55 #[target_feature(enable = "avx2")]
56 #[cfg_attr(test, assert_instr(vpabsb))]
57 #[stable(feature = "simd_x86", since = "1.27.0")]
58 pub unsafe fn _mm256_abs_epi8(a: __m256i) -> __m256i {
59 transmute(pabsb(a.as_i8x32()))
60 }
61
62 /// Adds packed 64-bit integers in `a` and `b`.
63 ///
64 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_epi64)
65 #[inline]
66 #[target_feature(enable = "avx2")]
67 #[cfg_attr(test, assert_instr(vpaddq))]
68 #[stable(feature = "simd_x86", since = "1.27.0")]
69 pub unsafe fn _mm256_add_epi64(a: __m256i, b: __m256i) -> __m256i {
70 transmute(simd_add(a.as_i64x4(), b.as_i64x4()))
71 }
72
73 /// Adds packed 32-bit integers in `a` and `b`.
74 ///
75 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_epi32)
76 #[inline]
77 #[target_feature(enable = "avx2")]
78 #[cfg_attr(test, assert_instr(vpaddd))]
79 #[stable(feature = "simd_x86", since = "1.27.0")]
80 pub unsafe fn _mm256_add_epi32(a: __m256i, b: __m256i) -> __m256i {
81 transmute(simd_add(a.as_i32x8(), b.as_i32x8()))
82 }
83
84 /// Adds packed 16-bit integers in `a` and `b`.
85 ///
86 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_epi16)
87 #[inline]
88 #[target_feature(enable = "avx2")]
89 #[cfg_attr(test, assert_instr(vpaddw))]
90 #[stable(feature = "simd_x86", since = "1.27.0")]
91 pub unsafe fn _mm256_add_epi16(a: __m256i, b: __m256i) -> __m256i {
92 transmute(simd_add(a.as_i16x16(), b.as_i16x16()))
93 }
94
95 /// Adds packed 8-bit integers in `a` and `b`.
96 ///
97 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_epi8)
98 #[inline]
99 #[target_feature(enable = "avx2")]
100 #[cfg_attr(test, assert_instr(vpaddb))]
101 #[stable(feature = "simd_x86", since = "1.27.0")]
102 pub unsafe fn _mm256_add_epi8(a: __m256i, b: __m256i) -> __m256i {
103 transmute(simd_add(a.as_i8x32(), b.as_i8x32()))
104 }
105
106 /// Adds packed 8-bit integers in `a` and `b` using saturation.
107 ///
108 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_adds_epi8)
109 #[inline]
110 #[target_feature(enable = "avx2")]
111 #[cfg_attr(test, assert_instr(vpaddsb))]
112 #[stable(feature = "simd_x86", since = "1.27.0")]
113 pub unsafe fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i {
114 transmute(paddsb(a.as_i8x32(), b.as_i8x32()))
115 }
116
117 /// Adds packed 16-bit integers in `a` and `b` using saturation.
118 ///
119 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_adds_epi16)
120 #[inline]
121 #[target_feature(enable = "avx2")]
122 #[cfg_attr(test, assert_instr(vpaddsw))]
123 #[stable(feature = "simd_x86", since = "1.27.0")]
124 pub unsafe fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i {
125 transmute(paddsw(a.as_i16x16(), b.as_i16x16()))
126 }
127
128 /// Adds packed unsigned 8-bit integers in `a` and `b` using saturation.
129 ///
130 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_adds_epu8)
131 #[inline]
132 #[target_feature(enable = "avx2")]
133 #[cfg_attr(test, assert_instr(vpaddusb))]
134 #[stable(feature = "simd_x86", since = "1.27.0")]
135 pub unsafe fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i {
136 transmute(paddusb(a.as_u8x32(), b.as_u8x32()))
137 }
138
139 /// Adds packed unsigned 16-bit integers in `a` and `b` using saturation.
140 ///
141 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_adds_epu16)
142 #[inline]
143 #[target_feature(enable = "avx2")]
144 #[cfg_attr(test, assert_instr(vpaddusw))]
145 #[stable(feature = "simd_x86", since = "1.27.0")]
146 pub unsafe fn _mm256_adds_epu16(a: __m256i, b: __m256i) -> __m256i {
147 transmute(paddusw(a.as_u16x16(), b.as_u16x16()))
148 }
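// Sketch of wrapping vs. saturating addition (illustration only; name and values
// are assumptions):
#[cfg(test)]
#[target_feature(enable = "avx2")]
unsafe fn _adds_epi8_demo() {
    let a = _mm256_set1_epi8(120);
    let b = _mm256_set1_epi8(20);
    // `_mm256_add_epi8` wraps: 120 + 20 == 140, which wraps to -116 as an i8.
    let wrapped: [i8; 32] = transmute(_mm256_add_epi8(a, b));
    // `_mm256_adds_epi8` saturates at i8::MAX == 127.
    let saturated: [i8; 32] = transmute(_mm256_adds_epi8(a, b));
    assert_eq!(wrapped[0], -116);
    assert_eq!(saturated[0], 127);
}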
149
150 /// Concatenates pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary
151 /// result, shifts the result right by `n` bytes, and returns the low 16 bytes.
152 ///
153 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_alignr_epi8)
154 #[inline]
155 #[target_feature(enable = "avx2")]
156 #[cfg_attr(test, assert_instr(vpalignr, n = 7))]
157 #[rustc_args_required_const(2)]
158 #[stable(feature = "simd_x86", since = "1.27.0")]
159 pub unsafe fn _mm256_alignr_epi8(a: __m256i, b: __m256i, n: i32) -> __m256i {
160 let n = n as u32;
161 // If `palignr` is shifting the pair of vectors more than the size of two
162 // lanes, emit zero.
163 if n > 32 {
164 return _mm256_set1_epi8(0);
165 }
166 // If `palignr` is shifting the pair of input vectors more than one lane,
167 // but less than two lanes, convert to shifting in zeroes.
168 let (a, b, n) = if n > 16 {
169 (_mm256_set1_epi8(0), a, n - 16)
170 } else {
171 (a, b, n)
172 };
173
174 let a = a.as_i8x32();
175 let b = b.as_i8x32();
176
177 let r: i8x32 = match n {
178 0 => simd_shuffle32(
179 b,
180 a,
181 [
182 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
183 23, 24, 25, 26, 27, 28, 29, 30, 31,
184 ],
185 ),
186 1 => simd_shuffle32(
187 b,
188 a,
189 [
190 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 17, 18, 19, 20, 21, 22, 23,
191 24, 25, 26, 27, 28, 29, 30, 31, 48,
192 ],
193 ),
194 2 => simd_shuffle32(
195 b,
196 a,
197 [
198 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 18, 19, 20, 21, 22, 23, 24,
199 25, 26, 27, 28, 29, 30, 31, 48, 49,
200 ],
201 ),
202 3 => simd_shuffle32(
203 b,
204 a,
205 [
206 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 19, 20, 21, 22, 23, 24,
207 25, 26, 27, 28, 29, 30, 31, 48, 49, 50,
208 ],
209 ),
210 4 => simd_shuffle32(
211 b,
212 a,
213 [
214 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 20, 21, 22, 23, 24, 25,
215 26, 27, 28, 29, 30, 31, 48, 49, 50, 51,
216 ],
217 ),
218 5 => simd_shuffle32(
219 b,
220 a,
221 [
222 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 21, 22, 23, 24, 25, 26,
223 27, 28, 29, 30, 31, 48, 49, 50, 51, 52,
224 ],
225 ),
226 6 => simd_shuffle32(
227 b,
228 a,
229 [
230 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 22, 23, 24, 25, 26, 27,
231 28, 29, 30, 31, 48, 49, 50, 51, 52, 53,
232 ],
233 ),
234 7 => simd_shuffle32(
235 b,
236 a,
237 [
238 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 23, 24, 25, 26, 27,
239 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54,
240 ],
241 ),
242 8 => simd_shuffle32(
243 b,
244 a,
245 [
246 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 24, 25, 26, 27, 28,
247 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55,
248 ],
249 ),
250 9 => simd_shuffle32(
251 b,
252 a,
253 [
254 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 25, 26, 27, 28, 29,
255 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56,
256 ],
257 ),
258 10 => simd_shuffle32(
259 b,
260 a,
261 [
262 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 26, 27, 28, 29, 30,
263 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
264 ],
265 ),
266 11 => simd_shuffle32(
267 b,
268 a,
269 [
270 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 27, 28, 29, 30, 31,
271 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58,
272 ],
273 ),
274 12 => simd_shuffle32(
275 b,
276 a,
277 [
278 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 28, 29, 30, 31, 48,
279 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
280 ],
281 ),
282 13 => simd_shuffle32(
283 b,
284 a,
285 [
286 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 29, 30, 31, 48, 49,
287 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
288 ],
289 ),
290 14 => simd_shuffle32(
291 b,
292 a,
293 [
294 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 30, 31, 48, 49, 50,
295 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
296 ],
297 ),
298 15 => simd_shuffle32(
299 b,
300 a,
301 [
302 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 31, 48, 49, 50, 51,
303 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
304 ],
305 ),
306 _ => a, // `n == 16` lands here: the result is `a` (and `a` is already zero when the original `n` was 32)
307 };
308 transmute(r)
309 }
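// Sketch of the per-128-bit-lane shift performed above (illustration only; name
// and values are assumptions): with `n == 4`, each 16-byte half of the result is
// the last 12 bytes of the matching half of `b` followed by the first 4 bytes of
// the matching half of `a`.
#[cfg(test)]
#[target_feature(enable = "avx2")]
unsafe fn _alignr_epi8_demo() {
    let a = _mm256_set1_epi8(1);
    let b = _mm256_set1_epi8(2);
    let r: [i8; 32] = transmute(_mm256_alignr_epi8(a, b, 4));
    assert_eq!(r[0], 2); // bytes 0..12 of each lane come from `b`
    assert_eq!(r[12], 1); // bytes 12..16 of each lane come from `a`
}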
310
311 /// Computes the bitwise AND of 256 bits (representing integer data)
312 /// in `a` and `b`.
313 ///
314 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_and_si256)
315 #[inline]
316 #[target_feature(enable = "avx2")]
317 #[cfg_attr(test, assert_instr(vandps))]
318 #[stable(feature = "simd_x86", since = "1.27.0")]
319 pub unsafe fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i {
320 transmute(simd_and(a.as_i64x4(), b.as_i64x4()))
321 }
322
323 /// Computes the bitwise NOT of 256 bits (representing integer data)
324 /// in `a` and then AND with `b`.
325 ///
326 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_andnot_si256)
327 #[inline]
328 #[target_feature(enable = "avx2")]
329 #[cfg_attr(test, assert_instr(vandnps))]
330 #[stable(feature = "simd_x86", since = "1.27.0")]
331 pub unsafe fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i {
332 let all_ones = _mm256_set1_epi8(-1);
333 transmute(simd_and(
334 simd_xor(a.as_i64x4(), all_ones.as_i64x4()),
335 b.as_i64x4(),
336 ))
337 }
338
339 /// Averages packed unsigned 16-bit integers in `a` and `b`.
340 ///
341 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_avg_epu16)
342 #[inline]
343 #[target_feature(enable = "avx2")]
344 #[cfg_attr(test, assert_instr(vpavgw))]
345 #[stable(feature = "simd_x86", since = "1.27.0")]
346 pub unsafe fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i {
347 transmute(pavgw(a.as_u16x16(), b.as_u16x16()))
348 }
349
350 /// Averages packed unsigned 8-bit integers in `a` and `b`.
351 ///
352 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_avg_epu8)
353 #[inline]
354 #[target_feature(enable = "avx2")]
355 #[cfg_attr(test, assert_instr(vpavgb))]
356 #[stable(feature = "simd_x86", since = "1.27.0")]
357 pub unsafe fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i {
358 transmute(pavgb(a.as_u8x32(), b.as_u8x32()))
359 }
360
361 /// Blends packed 32-bit integers from `a` and `b` using control mask `imm8`.
362 ///
363 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_epi32)
364 #[inline]
365 #[target_feature(enable = "avx2")]
366 #[cfg_attr(test, assert_instr(vblendps, imm8 = 9))]
367 #[rustc_args_required_const(2)]
368 #[stable(feature = "simd_x86", since = "1.27.0")]
369 pub unsafe fn _mm_blend_epi32(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
370 let imm8 = (imm8 & 0xFF) as u8;
371 let a = a.as_i32x4();
372 let b = b.as_i32x4();
373 macro_rules! blend2 {
374 ($a:expr, $b:expr, $c:expr, $d:expr) => {
375 simd_shuffle4(a, b, [$a, $b, $c, $d]);
376 };
377 }
378 macro_rules! blend1 {
379 ($a:expr, $b:expr) => {
380 match (imm8 >> 2) & 0b11 {
381 0b00 => blend2!($a, $b, 2, 3),
382 0b01 => blend2!($a, $b, 6, 3),
383 0b10 => blend2!($a, $b, 2, 7),
384 _ => blend2!($a, $b, 6, 7),
385 }
386 };
387 }
388 let r: i32x4 = match imm8 & 0b11 {
389 0b00 => blend1!(0, 1),
390 0b01 => blend1!(4, 1),
391 0b10 => blend1!(0, 5),
392 _ => blend1!(4, 5),
393 };
394 transmute(r)
395 }
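// Sketch of how `imm8` drives `_mm_blend_epi32` (illustration only; name and
// values are assumptions): bit `i` set takes lane `i` from `b`, clear keeps `a`.
#[cfg(test)]
#[target_feature(enable = "avx2")]
unsafe fn _blend_epi32_demo() {
    let a = _mm_setr_epi32(10, 11, 12, 13);
    let b = _mm_setr_epi32(20, 21, 22, 23);
    // 0b0101: lanes 0 and 2 come from `b`, lanes 1 and 3 from `a`.
    let r: [i32; 4] = transmute(_mm_blend_epi32(a, b, 0b0101));
    assert_eq!(r, [20, 11, 22, 13]);
}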
396
397 /// Blends packed 32-bit integers from `a` and `b` using control mask `imm8`.
398 ///
399 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_blend_epi32)
400 #[inline]
401 #[target_feature(enable = "avx2")]
402 #[cfg_attr(test, assert_instr(vblendps, imm8 = 9))]
403 #[rustc_args_required_const(2)]
404 #[stable(feature = "simd_x86", since = "1.27.0")]
405 pub unsafe fn _mm256_blend_epi32(a: __m256i, b: __m256i, imm8: i32) -> __m256i {
406 let imm8 = (imm8 & 0xFF) as u8;
407 let a = a.as_i32x8();
408 let b = b.as_i32x8();
409 macro_rules! blend4 {
410 (
411 $a:expr,
412 $b:expr,
413 $c:expr,
414 $d:expr,
415 $e:expr,
416 $f:expr,
417 $g:expr,
418 $h:expr
419 ) => {
420 simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h]);
421 };
422 }
423 macro_rules! blend3 {
424 ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr) => {
425 match (imm8 >> 6) & 0b11 {
426 0b00 => blend4!($a, $b, $c, $d, $e, $f, 6, 7),
427 0b01 => blend4!($a, $b, $c, $d, $e, $f, 14, 7),
428 0b10 => blend4!($a, $b, $c, $d, $e, $f, 6, 15),
429 _ => blend4!($a, $b, $c, $d, $e, $f, 14, 15),
430 }
431 };
432 }
433 macro_rules! blend2 {
434 ($a:expr, $b:expr, $c:expr, $d:expr) => {
435 match (imm8 >> 4) & 0b11 {
436 0b00 => blend3!($a, $b, $c, $d, 4, 5),
437 0b01 => blend3!($a, $b, $c, $d, 12, 5),
438 0b10 => blend3!($a, $b, $c, $d, 4, 13),
439 _ => blend3!($a, $b, $c, $d, 12, 13),
440 }
441 };
442 }
443 macro_rules! blend1 {
444 ($a:expr, $b:expr) => {
445 match (imm8 >> 2) & 0b11 {
446 0b00 => blend2!($a, $b, 2, 3),
447 0b01 => blend2!($a, $b, 10, 3),
448 0b10 => blend2!($a, $b, 2, 11),
449 _ => blend2!($a, $b, 10, 11),
450 }
451 };
452 }
453 let r: i32x8 = match imm8 & 0b11 {
454 0b00 => blend1!(0, 1),
455 0b01 => blend1!(8, 1),
456 0b10 => blend1!(0, 9),
457 _ => blend1!(8, 9),
458 };
459 transmute(r)
460 }
461
462 /// Blends packed 16-bit integers from `a` and `b` using control mask `imm8`.
463 ///
464 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_blend_epi16)
465 #[inline]
466 #[target_feature(enable = "avx2")]
467 #[cfg_attr(test, assert_instr(vpblendw, imm8 = 9))]
468 #[rustc_args_required_const(2)]
469 #[stable(feature = "simd_x86", since = "1.27.0")]
470 pub unsafe fn _mm256_blend_epi16(a: __m256i, b: __m256i, imm8: i32) -> __m256i {
471 let imm8 = (imm8 & 0xFF) as u8;
472 let a = a.as_i16x16();
473 let b = b.as_i16x16();
474 macro_rules! blend4 {
475 (
476 $a:expr,
477 $b:expr,
478 $c:expr,
479 $d:expr,
480 $e:expr,
481 $f:expr,
482 $g:expr,
483 $h:expr,
484 $i:expr,
485 $j:expr,
486 $k:expr,
487 $l:expr,
488 $m:expr,
489 $n:expr,
490 $o:expr,
491 $p:expr
492 ) => {
493 simd_shuffle16(
494 a,
495 b,
496 [
497 $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p,
498 ],
499 )
500 };
501 }
502 macro_rules! blend3 {
503 (
504 $a:expr,
505 $b:expr,
506 $c:expr,
507 $d:expr,
508 $e:expr,
509 $f:expr,
510 $a2:expr,
511 $b2:expr,
512 $c2:expr,
513 $d2:expr,
514 $e2:expr,
515 $f2:expr
516 ) => {
517 match (imm8 >> 6) & 0b11 {
518 0b00 => blend4!($a, $b, $c, $d, $e, $f, 6, 7, $a2, $b2, $c2, $d2, $e2, $f2, 14, 15),
519 0b01 => {
520 blend4!($a, $b, $c, $d, $e, $f, 22, 7, $a2, $b2, $c2, $d2, $e2, $f2, 30, 15)
521 }
522 0b10 => {
523 blend4!($a, $b, $c, $d, $e, $f, 6, 23, $a2, $b2, $c2, $d2, $e2, $f2, 14, 31)
524 }
525 _ => blend4!($a, $b, $c, $d, $e, $f, 22, 23, $a2, $b2, $c2, $d2, $e2, $f2, 30, 31),
526 }
527 };
528 }
529 macro_rules! blend2 {
530 (
531 $a:expr,
532 $b:expr,
533 $c:expr,
534 $d:expr,
535 $a2:expr,
536 $b2:expr,
537 $c2:expr,
538 $d2:expr
539 ) => {
540 match (imm8 >> 4) & 0b11 {
541 0b00 => blend3!($a, $b, $c, $d, 4, 5, $a2, $b2, $c2, $d2, 12, 13),
542 0b01 => blend3!($a, $b, $c, $d, 20, 5, $a2, $b2, $c2, $d2, 28, 13),
543 0b10 => blend3!($a, $b, $c, $d, 4, 21, $a2, $b2, $c2, $d2, 12, 29),
544 _ => blend3!($a, $b, $c, $d, 20, 21, $a2, $b2, $c2, $d2, 28, 29),
545 }
546 };
547 }
548 macro_rules! blend1 {
549 ($a1:expr, $b1:expr, $a2:expr, $b2:expr) => {
550 match (imm8 >> 2) & 0b11 {
551 0b00 => blend2!($a1, $b1, 2, 3, $a2, $b2, 10, 11),
552 0b01 => blend2!($a1, $b1, 18, 3, $a2, $b2, 26, 11),
553 0b10 => blend2!($a1, $b1, 2, 19, $a2, $b2, 10, 27),
554 _ => blend2!($a1, $b1, 18, 19, $a2, $b2, 26, 27),
555 }
556 };
557 }
558 let r: i16x16 = match imm8 & 0b11 {
559 0b00 => blend1!(0, 1, 8, 9),
560 0b01 => blend1!(16, 1, 24, 9),
561 0b10 => blend1!(0, 17, 8, 25),
562 _ => blend1!(16, 17, 24, 25),
563 };
564 transmute(r)
565 }
566
567 /// Blends packed 8-bit integers from `a` and `b` using `mask`.
568 ///
569 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_blendv_epi8)
570 #[inline]
571 #[target_feature(enable = "avx2")]
572 #[cfg_attr(test, assert_instr(vpblendvb))]
573 #[stable(feature = "simd_x86", since = "1.27.0")]
574 pub unsafe fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i {
575 transmute(pblendvb(a.as_i8x32(), b.as_i8x32(), mask.as_i8x32()))
576 }
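// Sketch: a comparison result can be fed straight into `_mm256_blendv_epi8`,
// e.g. to build a per-byte signed max (illustration only; the helper name is an
// assumption):
#[cfg(test)]
#[target_feature(enable = "avx2")]
unsafe fn _blendv_max_i8_demo(a: __m256i, b: __m256i) -> __m256i {
    // Where `b > a` the mask byte is all-ones (high bit set), so `b` is chosen.
    let mask = _mm256_cmpgt_epi8(b, a);
    _mm256_blendv_epi8(a, b, mask)
}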
577
578 /// Broadcasts the low packed 8-bit integer from `a` to all elements of
579 /// the 128-bit returned value.
580 ///
581 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastb_epi8)
582 #[inline]
583 #[target_feature(enable = "avx2")]
584 #[cfg_attr(test, assert_instr(vpbroadcastb))]
585 #[stable(feature = "simd_x86", since = "1.27.0")]
586 pub unsafe fn _mm_broadcastb_epi8(a: __m128i) -> __m128i {
587 let zero = _mm_setzero_si128();
588 let ret = simd_shuffle16(a.as_i8x16(), zero.as_i8x16(), [0_u32; 16]);
589 transmute::<i8x16, _>(ret)
590 }
591
592 /// Broadcasts the low packed 8-bit integer from `a` to all elements of
593 /// the 256-bit returned value.
594 ///
595 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastb_epi8)
596 #[inline]
597 #[target_feature(enable = "avx2")]
598 #[cfg_attr(test, assert_instr(vpbroadcastb))]
599 #[stable(feature = "simd_x86", since = "1.27.0")]
600 pub unsafe fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i {
601 let zero = _mm_setzero_si128();
602 let ret = simd_shuffle32(a.as_i8x16(), zero.as_i8x16(), [0_u32; 32]);
603 transmute::<i8x32, _>(ret)
604 }
605
606 // N.B., `simd_shuffle4` with integer data types for `a` and `b` is
607 // often compiled to `vbroadcastss`.
608 /// Broadcasts the low packed 32-bit integer from `a` to all elements of
609 /// the 128-bit returned value.
610 ///
611 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastd_epi32)
612 #[inline]
613 #[target_feature(enable = "avx2")]
614 #[cfg_attr(test, assert_instr(vbroadcastss))]
615 #[stable(feature = "simd_x86", since = "1.27.0")]
616 pub unsafe fn _mm_broadcastd_epi32(a: __m128i) -> __m128i {
617 let zero = _mm_setzero_si128();
618 let ret = simd_shuffle4(a.as_i32x4(), zero.as_i32x4(), [0_u32; 4]);
619 transmute::<i32x4, _>(ret)
620 }
621
622 // N.B., `simd_shuffle4` with integer data types for `a` and `b` is
623 // often compiled to `vbroadcastss`.
624 /// Broadcasts the low packed 32-bit integer from `a` to all elements of
625 /// the 256-bit returned value.
626 ///
627 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastd_epi32)
628 #[inline]
629 #[target_feature(enable = "avx2")]
630 #[cfg_attr(test, assert_instr(vbroadcastss))]
631 #[stable(feature = "simd_x86", since = "1.27.0")]
632 pub unsafe fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i {
633 let zero = _mm_setzero_si128();
634 let ret = simd_shuffle8(a.as_i32x4(), zero.as_i32x4(), [0_u32; 8]);
635 transmute::<i32x8, _>(ret)
636 }
637
638 /// Broadcasts the low packed 64-bit integer from `a` to all elements of
639 /// the 128-bit returned value.
640 ///
641 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastq_epi64)
642 #[inline]
643 #[target_feature(enable = "avx2")]
644 #[cfg_attr(test, assert_instr(vpbroadcastq))]
645 #[stable(feature = "simd_x86", since = "1.27.0")]
646 pub unsafe fn _mm_broadcastq_epi64(a: __m128i) -> __m128i {
647 let zero = _mm_setzero_si128().as_i64x2();
648 let ret = simd_shuffle2(a.as_i64x2(), zero, [0_u32; 2]);
649 transmute::<i64x2, _>(ret)
650 }
651
652 // N.B. `simd_shuffle4` with integer data types for `a` and `b` is
653 // often compiled to `vbroadcastsd`.
654 /// Broadcasts the low packed 64-bit integer from `a` to all elements of
655 /// the 256-bit returned value.
656 ///
657 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastq_epi64)
658 #[inline]
659 #[target_feature(enable = "avx2")]
660 #[cfg_attr(test, assert_instr(vbroadcastsd))]
661 #[stable(feature = "simd_x86", since = "1.27.0")]
662 pub unsafe fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i {
663 let zero = _mm_setzero_si128();
664 let ret = simd_shuffle4(a.as_i64x2(), zero.as_i64x2(), [0_u32; 4]);
665 transmute::<i64x4, _>(ret)
666 }
667
668 /// Broadcasts the low double-precision (64-bit) floating-point element
669 /// from `a` to all elements of the 128-bit returned value.
670 ///
671 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastsd_pd)
672 #[inline]
673 #[target_feature(enable = "avx2")]
674 #[cfg_attr(test, assert_instr(vmovddup))]
675 #[stable(feature = "simd_x86", since = "1.27.0")]
676 pub unsafe fn _mm_broadcastsd_pd(a: __m128d) -> __m128d {
677 simd_shuffle2(a, _mm_setzero_pd(), [0_u32; 2])
678 }
679
680 /// Broadcasts the low double-precision (64-bit) floating-point element
681 /// from `a` to all elements of the 256-bit returned value.
682 ///
683 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastsd_pd)
684 #[inline]
685 #[target_feature(enable = "avx2")]
686 #[cfg_attr(test, assert_instr(vbroadcastsd))]
687 #[stable(feature = "simd_x86", since = "1.27.0")]
688 pub unsafe fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d {
689 simd_shuffle4(a, _mm_setzero_pd(), [0_u32; 4])
690 }
691
692 // N.B., `broadcastsi128_si256` is often compiled to `vinsertf128` or
693 // `vbroadcastf128`.
694 /// Broadcasts 128 bits of integer data from `a` to all 128-bit lanes in
695 /// the 256-bit returned value.
696 ///
697 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastsi128_si256)
698 #[inline]
699 #[target_feature(enable = "avx2")]
700 #[stable(feature = "simd_x86", since = "1.27.0")]
701 pub unsafe fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i {
702 let zero = _mm_setzero_si128();
703 let ret = simd_shuffle4(a.as_i64x2(), zero.as_i64x2(), [0, 1, 0, 1]);
704 transmute::<i64x4, _>(ret)
705 }
706
707 /// Broadcasts the low single-precision (32-bit) floating-point element
708 /// from `a` to all elements of the 128-bit returned value.
709 ///
710 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastss_ps)
711 #[inline]
712 #[target_feature(enable = "avx2")]
713 #[cfg_attr(test, assert_instr(vbroadcastss))]
714 #[stable(feature = "simd_x86", since = "1.27.0")]
715 pub unsafe fn _mm_broadcastss_ps(a: __m128) -> __m128 {
716 simd_shuffle4(a, _mm_setzero_ps(), [0_u32; 4])
717 }
718
719 /// Broadcasts the low single-precision (32-bit) floating-point element
720 /// from `a` to all elements of the 256-bit returned value.
721 ///
722 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastss_ps)
723 #[inline]
724 #[target_feature(enable = "avx2")]
725 #[cfg_attr(test, assert_instr(vbroadcastss))]
726 #[stable(feature = "simd_x86", since = "1.27.0")]
727 pub unsafe fn _mm256_broadcastss_ps(a: __m128) -> __m256 {
728 simd_shuffle8(a, _mm_setzero_ps(), [0_u32; 8])
729 }
730
731 /// Broadcasts the low packed 16-bit integer from `a` to all elements of
732 /// the 128-bit returned value.
733 ///
734 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastw_epi16)
735 #[inline]
736 #[target_feature(enable = "avx2")]
737 #[cfg_attr(test, assert_instr(vpbroadcastw))]
738 #[stable(feature = "simd_x86", since = "1.27.0")]
739 pub unsafe fn _mm_broadcastw_epi16(a: __m128i) -> __m128i {
740 let zero = _mm_setzero_si128();
741 let ret = simd_shuffle8(a.as_i16x8(), zero.as_i16x8(), [0_u32; 8]);
742 transmute::<i16x8, _>(ret)
743 }
744
745 /// Broadcasts the low packed 16-bit integer from `a` to all elements of
746 /// the 256-bit returned value.
747 ///
748 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastw_epi16)
749 #[inline]
750 #[target_feature(enable = "avx2")]
751 #[cfg_attr(test, assert_instr(vpbroadcastw))]
752 #[stable(feature = "simd_x86", since = "1.27.0")]
753 pub unsafe fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i {
754 let zero = _mm_setzero_si128();
755 let ret = simd_shuffle16(a.as_i16x8(), zero.as_i16x8(), [0_u32; 16]);
756 transmute::<i16x16, _>(ret)
757 }
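// Sketch of the broadcast family above (illustration only; name and values are
// assumptions): only the lowest element of the source vector is used.
#[cfg(test)]
#[target_feature(enable = "avx2")]
unsafe fn _broadcastd_demo() {
    // `_mm_setr_epi32` places 7 in the lowest 32-bit lane.
    let a = _mm_setr_epi32(7, 8, 9, 10);
    let r: [i32; 8] = transmute(_mm256_broadcastd_epi32(a));
    assert_eq!(r, [7; 8]);
}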
758
759 /// Compares packed 64-bit integers in `a` and `b` for equality.
760 ///
761 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epi64)
762 #[inline]
763 #[target_feature(enable = "avx2")]
764 #[cfg_attr(test, assert_instr(vpcmpeqq))]
765 #[stable(feature = "simd_x86", since = "1.27.0")]
766 pub unsafe fn _mm256_cmpeq_epi64(a: __m256i, b: __m256i) -> __m256i {
767 transmute::<i64x4, _>(simd_eq(a.as_i64x4(), b.as_i64x4()))
768 }
769
770 /// Compares packed 32-bit integers in `a` and `b` for equality.
771 ///
772 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epi32)
773 #[inline]
774 #[target_feature(enable = "avx2")]
775 #[cfg_attr(test, assert_instr(vpcmpeqd))]
776 #[stable(feature = "simd_x86", since = "1.27.0")]
777 pub unsafe fn _mm256_cmpeq_epi32(a: __m256i, b: __m256i) -> __m256i {
778 transmute::<i32x8, _>(simd_eq(a.as_i32x8(), b.as_i32x8()))
779 }
780
781 /// Compares packed 16-bit integers in `a` and `b` for equality.
782 ///
783 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epi16)
784 #[inline]
785 #[target_feature(enable = "avx2")]
786 #[cfg_attr(test, assert_instr(vpcmpeqw))]
787 #[stable(feature = "simd_x86", since = "1.27.0")]
788 pub unsafe fn _mm256_cmpeq_epi16(a: __m256i, b: __m256i) -> __m256i {
789 transmute::<i16x16, _>(simd_eq(a.as_i16x16(), b.as_i16x16()))
790 }
791
792 /// Compares packed 8-bit integers in `a` and `b` for equality.
793 ///
794 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epi8)
795 #[inline]
796 #[target_feature(enable = "avx2")]
797 #[cfg_attr(test, assert_instr(vpcmpeqb))]
798 #[stable(feature = "simd_x86", since = "1.27.0")]
799 pub unsafe fn _mm256_cmpeq_epi8(a: __m256i, b: __m256i) -> __m256i {
800 transmute::<i8x32, _>(simd_eq(a.as_i8x32(), b.as_i8x32()))
801 }
802
803 /// Compares packed 64-bit integers in `a` and `b` for greater-than.
804 ///
805 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epi64)
806 #[inline]
807 #[target_feature(enable = "avx2")]
808 #[cfg_attr(test, assert_instr(vpcmpgtq))]
809 #[stable(feature = "simd_x86", since = "1.27.0")]
810 pub unsafe fn _mm256_cmpgt_epi64(a: __m256i, b: __m256i) -> __m256i {
811 transmute::<i64x4, _>(simd_gt(a.as_i64x4(), b.as_i64x4()))
812 }
813
814 /// Compares packed 32-bit integers in `a` and `b` for greater-than.
815 ///
816 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epi32)
817 #[inline]
818 #[target_feature(enable = "avx2")]
819 #[cfg_attr(test, assert_instr(vpcmpgtd))]
820 #[stable(feature = "simd_x86", since = "1.27.0")]
821 pub unsafe fn _mm256_cmpgt_epi32(a: __m256i, b: __m256i) -> __m256i {
822 transmute::<i32x8, _>(simd_gt(a.as_i32x8(), b.as_i32x8()))
823 }
824
825 /// Compares packed 16-bit integers in `a` and `b` for greater-than.
826 ///
827 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epi16)
828 #[inline]
829 #[target_feature(enable = "avx2")]
830 #[cfg_attr(test, assert_instr(vpcmpgtw))]
831 #[stable(feature = "simd_x86", since = "1.27.0")]
832 pub unsafe fn _mm256_cmpgt_epi16(a: __m256i, b: __m256i) -> __m256i {
833 transmute::<i16x16, _>(simd_gt(a.as_i16x16(), b.as_i16x16()))
834 }
835
836 /// Compares packed 8-bit integers in `a` and `b` for greater-than.
837 ///
838 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epi8)
839 #[inline]
840 #[target_feature(enable = "avx2")]
841 #[cfg_attr(test, assert_instr(vpcmpgtb))]
842 #[stable(feature = "simd_x86", since = "1.27.0")]
843 pub unsafe fn _mm256_cmpgt_epi8(a: __m256i, b: __m256i) -> __m256i {
844 transmute::<i8x32, _>(simd_gt(a.as_i8x32(), b.as_i8x32()))
845 }
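// Sketch: the comparisons above return all-ones or all-zero lanes, which pairs
// naturally with `_mm256_movemask_epi8` (defined elsewhere in this module) to
// obtain one bit per byte (illustration only; name and values are assumptions):
#[cfg(test)]
#[target_feature(enable = "avx2")]
unsafe fn _cmpgt_movemask_demo() {
    let a = _mm256_set1_epi8(5);
    let b = _mm256_set1_epi8(3);
    let gt = _mm256_cmpgt_epi8(a, b); // every byte is 0xFF
    let mask = _mm256_movemask_epi8(gt); // one bit per byte lane
    assert_eq!(mask.count_ones(), 32);
}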
846
847 /// Sign-extend 16-bit integers to 32-bit integers.
848 ///
849 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi16_epi32)
850 #[inline]
851 #[target_feature(enable = "avx2")]
852 #[cfg_attr(test, assert_instr(vpmovsxwd))]
853 #[stable(feature = "simd_x86", since = "1.27.0")]
854 pub unsafe fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i {
855 transmute::<i32x8, _>(simd_cast(a.as_i16x8()))
856 }
857
858 /// Sign-extend 16-bit integers to 64-bit integers.
859 ///
860 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi16_epi64)
861 #[inline]
862 #[target_feature(enable = "avx2")]
863 #[cfg_attr(test, assert_instr(vpmovsxwq))]
864 #[stable(feature = "simd_x86", since = "1.27.0")]
865 pub unsafe fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i {
866 let a = a.as_i16x8();
867 let v64: i16x4 = simd_shuffle4(a, a, [0, 1, 2, 3]);
868 transmute::<i64x4, _>(simd_cast(v64))
869 }
870
871 /// Sign-extend 32-bit integers to 64-bit integers.
872 ///
873 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi32_epi64)
874 #[inline]
875 #[target_feature(enable = "avx2")]
876 #[cfg_attr(test, assert_instr(vpmovsxdq))]
877 #[stable(feature = "simd_x86", since = "1.27.0")]
878 pub unsafe fn _mm256_cvtepi32_epi64(a: __m128i) -> __m256i {
879 transmute::<i64x4, _>(simd_cast(a.as_i32x4()))
880 }
881
882 /// Sign-extend 8-bit integers to 16-bit integers.
883 ///
884 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi8_epi16)
885 #[inline]
886 #[target_feature(enable = "avx2")]
887 #[cfg_attr(test, assert_instr(vpmovsxbw))]
888 #[stable(feature = "simd_x86", since = "1.27.0")]
889 pub unsafe fn _mm256_cvtepi8_epi16(a: __m128i) -> __m256i {
890 transmute::<i16x16, _>(simd_cast(a.as_i8x16()))
891 }
892
893 /// Sign-extend 8-bit integers to 32-bit integers.
894 ///
895 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi8_epi32)
896 #[inline]
897 #[target_feature(enable = "avx2")]
898 #[cfg_attr(test, assert_instr(vpmovsxbd))]
899 #[stable(feature = "simd_x86", since = "1.27.0")]
900 pub unsafe fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i {
901 let a = a.as_i8x16();
902 let v64: i8x8 = simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
903 transmute::<i32x8, _>(simd_cast(v64))
904 }
905
906 /// Sign-extend 8-bit integers to 64-bit integers.
907 ///
908 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi8_epi64)
909 #[inline]
910 #[target_feature(enable = "avx2")]
911 #[cfg_attr(test, assert_instr(vpmovsxbq))]
912 #[stable(feature = "simd_x86", since = "1.27.0")]
913 pub unsafe fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i {
914 let a = a.as_i8x16();
915 let v32: i8x4 = simd_shuffle4(a, a, [0, 1, 2, 3]);
916 transmute::<i64x4, _>(simd_cast(v32))
917 }
918
919 /// Zero-extend packed unsigned 16-bit integers in `a` to packed 32-bit
920 /// integers.
921 ///
922 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu16_epi32)
923 #[inline]
924 #[target_feature(enable = "avx2")]
925 #[cfg_attr(test, assert_instr(vpmovzxwd))]
926 #[stable(feature = "simd_x86", since = "1.27.0")]
927 pub unsafe fn _mm256_cvtepu16_epi32(a: __m128i) -> __m256i {
928 transmute::<i32x8, _>(simd_cast(a.as_u16x8()))
929 }
930
931 /// Zero-extend the lower four unsigned 16-bit integers in `a` to 64-bit
932 /// integers. The upper four elements of `a` are unused.
933 ///
934 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu16_epi64)
935 #[inline]
936 #[target_feature(enable = "avx2")]
937 #[cfg_attr(test, assert_instr(vpmovzxwq))]
938 #[stable(feature = "simd_x86", since = "1.27.0")]
939 pub unsafe fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i {
940 let a = a.as_u16x8();
941 let v64: u16x4 = simd_shuffle4(a, a, [0, 1, 2, 3]);
942 transmute::<i64x4, _>(simd_cast(v64))
943 }
944
945 /// Zero-extend unsigned 32-bit integers in `a` to 64-bit integers.
946 ///
947 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu32_epi64)
948 #[inline]
949 #[target_feature(enable = "avx2")]
950 #[cfg_attr(test, assert_instr(vpmovzxdq))]
951 #[stable(feature = "simd_x86", since = "1.27.0")]
952 pub unsafe fn _mm256_cvtepu32_epi64(a: __m128i) -> __m256i {
953 transmute::<i64x4, _>(simd_cast(a.as_u32x4()))
954 }
955
956 /// Zero-extend unsigned 8-bit integers in `a` to 16-bit integers.
957 ///
958 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu8_epi16)
959 #[inline]
960 #[target_feature(enable = "avx2")]
961 #[cfg_attr(test, assert_instr(vpmovzxbw))]
962 #[stable(feature = "simd_x86", since = "1.27.0")]
963 pub unsafe fn _mm256_cvtepu8_epi16(a: __m128i) -> __m256i {
964 transmute::<i16x16, _>(simd_cast(a.as_u8x16()))
965 }
966
967 /// Zero-extend the lower eight unsigned 8-bit integers in `a` to 32-bit
968 /// integers. The upper eight elements of `a` are unused.
969 ///
970 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu8_epi32)
971 #[inline]
972 #[target_feature(enable = "avx2")]
973 #[cfg_attr(test, assert_instr(vpmovzxbd))]
974 #[stable(feature = "simd_x86", since = "1.27.0")]
975 pub unsafe fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i {
976 let a = a.as_u8x16();
977 let v64: u8x8 = simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
978 transmute::<i32x8, _>(simd_cast(v64))
979 }
980
981 /// Zero-extend the lower four unsigned 8-bit integers in `a` to 64-bit
982 /// integers. The upper twelve elements of `a` are unused.
983 ///
984 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu8_epi64)
985 #[inline]
986 #[target_feature(enable = "avx2")]
987 #[cfg_attr(test, assert_instr(vpmovzxbq))]
988 #[stable(feature = "simd_x86", since = "1.27.0")]
989 pub unsafe fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i {
990 let a = a.as_u8x16();
991 let v32: u8x4 = simd_shuffle4(a, a, [0, 1, 2, 3]);
992 transmute::<i64x4, _>(simd_cast(v32))
993 }
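// Sketch of sign- vs. zero-extension (illustration only; name and values are
// assumptions): the same 0xFF source byte becomes -1 or 255.
#[cfg(test)]
#[target_feature(enable = "avx2")]
unsafe fn _extend_demo() {
    let a = _mm_set1_epi8(-1); // every byte is 0xFF
    let signed: [i16; 16] = transmute(_mm256_cvtepi8_epi16(a));
    let unsigned: [i16; 16] = transmute(_mm256_cvtepu8_epi16(a));
    assert_eq!(signed[0], -1);
    assert_eq!(unsigned[0], 255);
}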
994
995 /// Extracts 128 bits (of integer data) from `a` selected with `imm8`.
996 ///
997 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extracti128_si256)
998 #[inline]
999 #[target_feature(enable = "avx2")]
1000 #[cfg_attr(
1001 all(test, not(target_os = "windows")),
1002 assert_instr(vextractf128, imm8 = 1)
1003 )]
1004 #[rustc_args_required_const(1)]
1005 #[stable(feature = "simd_x86", since = "1.27.0")]
1006 pub unsafe fn _mm256_extracti128_si256(a: __m256i, imm8: i32) -> __m128i {
1007 let a = a.as_i64x4();
1008 let b = _mm256_undefined_si256().as_i64x4();
1009 let dst: i64x2 = match imm8 & 0b01 {
1010 0 => simd_shuffle2(a, b, [0, 1]),
1011 _ => simd_shuffle2(a, b, [2, 3]),
1012 };
1013 transmute(dst)
1014 }
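// Sketch of `_mm256_extracti128_si256` (illustration only; name and values are
// assumptions): `imm8 & 1` selects the low (0) or high (1) 128-bit half.
#[cfg(test)]
#[target_feature(enable = "avx2")]
unsafe fn _extracti128_demo() {
    let a = _mm256_setr_epi64x(0, 1, 2, 3);
    let hi: [i64; 2] = transmute(_mm256_extracti128_si256(a, 1));
    assert_eq!(hi, [2, 3]);
}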
1015
1016 /// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`.
1017 ///
1018 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hadd_epi16)
1019 #[inline]
1020 #[target_feature(enable = "avx2")]
1021 #[cfg_attr(test, assert_instr(vphaddw))]
1022 #[stable(feature = "simd_x86", since = "1.27.0")]
1023 pub unsafe fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i {
1024 transmute(phaddw(a.as_i16x16(), b.as_i16x16()))
1025 }
1026
1027 /// Horizontally adds adjacent pairs of 32-bit integers in `a` and `b`.
1028 ///
1029 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hadd_epi32)
1030 #[inline]
1031 #[target_feature(enable = "avx2")]
1032 #[cfg_attr(test, assert_instr(vphaddd))]
1033 #[stable(feature = "simd_x86", since = "1.27.0")]
1034 pub unsafe fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i {
1035 transmute(phaddd(a.as_i32x8(), b.as_i32x8()))
1036 }
1037
1038 /// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`
1039 /// using saturation.
1040 ///
1041 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hadds_epi16)
1042 #[inline]
1043 #[target_feature(enable = "avx2")]
1044 #[cfg_attr(test, assert_instr(vphaddsw))]
1045 #[stable(feature = "simd_x86", since = "1.27.0")]
1046 pub unsafe fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i {
1047 transmute(phaddsw(a.as_i16x16(), b.as_i16x16()))
1048 }
1049
1050 /// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`.
1051 ///
1052 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hsub_epi16)
1053 #[inline]
1054 #[target_feature(enable = "avx2")]
1055 #[cfg_attr(test, assert_instr(vphsubw))]
1056 #[stable(feature = "simd_x86", since = "1.27.0")]
1057 pub unsafe fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i {
1058 transmute(phsubw(a.as_i16x16(), b.as_i16x16()))
1059 }
1060
1061 /// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`.
1062 ///
1063 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hsub_epi32)
1064 #[inline]
1065 #[target_feature(enable = "avx2")]
1066 #[cfg_attr(test, assert_instr(vphsubd))]
1067 #[stable(feature = "simd_x86", since = "1.27.0")]
1068 pub unsafe fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i {
1069 transmute(phsubd(a.as_i32x8(), b.as_i32x8()))
1070 }
1071
1072 /// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`
1073 /// using saturation.
1074 ///
1075 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hsubs_epi16)
1076 #[inline]
1077 #[target_feature(enable = "avx2")]
1078 #[cfg_attr(test, assert_instr(vphsubsw))]
1079 #[stable(feature = "simd_x86", since = "1.27.0")]
1080 pub unsafe fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i {
1081 transmute(phsubsw(a.as_i16x16(), b.as_i16x16()))
1082 }
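// Sketch of the horizontal-add lane ordering (illustration only; name and values
// are assumptions): within each 128-bit lane the pair sums from `a` come first,
// then the pair sums from `b`.
#[cfg(test)]
#[target_feature(enable = "avx2")]
unsafe fn _hadd_epi32_demo() {
    let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    let b = _mm256_setr_epi32(10, 11, 12, 13, 14, 15, 16, 17);
    let r: [i32; 8] = transmute(_mm256_hadd_epi32(a, b));
    assert_eq!(r, [1, 5, 21, 25, 9, 13, 29, 33]);
}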
1083
1084 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1085 /// where
1086 /// `scale` should be 1, 2, 4 or 8.
1087 ///
1088 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32gather_epi32)
1089 #[inline]
1090 #[target_feature(enable = "avx2")]
1091 #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))]
1092 #[rustc_args_required_const(2)]
1093 #[stable(feature = "simd_x86", since = "1.27.0")]
1094 pub unsafe fn _mm_i32gather_epi32(slice: *const i32, offsets: __m128i, scale: i32) -> __m128i {
1095 let zero = _mm_setzero_si128().as_i32x4();
1096 let neg_one = _mm_set1_epi32(-1).as_i32x4();
1097 let offsets = offsets.as_i32x4();
1098 let slice = slice as *const i8;
1099 macro_rules! call {
1100 ($imm8:expr) => {
1101 pgatherdd(zero, slice, offsets, neg_one, $imm8)
1102 };
1103 }
1104 let r = constify_imm8!(scale, call);
1105 transmute(r)
1106 }
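// Sketch of gathering from a slice of `i32` (illustration only; name and values
// are assumptions): with `scale == 4`, the offsets act as element indices.
#[cfg(test)]
#[target_feature(enable = "avx2")]
unsafe fn _i32gather_demo() {
    let data: [i32; 8] = [10, 11, 12, 13, 14, 15, 16, 17];
    // Result lane `i` reads `data[offsets[i]]`.
    let offsets = _mm_setr_epi32(7, 5, 3, 1);
    let r: [i32; 4] = transmute(_mm_i32gather_epi32(data.as_ptr(), offsets, 4));
    assert_eq!(r, [17, 15, 13, 11]);
}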
1107
1108 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1109 /// where
1110 /// `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
1111 /// element in `mask` is not set, the value from `src` is used instead.
1112 ///
1113 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32gather_epi32)
1114 #[inline]
1115 #[target_feature(enable = "avx2")]
1116 #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))]
1117 #[rustc_args_required_const(4)]
1118 #[stable(feature = "simd_x86", since = "1.27.0")]
1119 pub unsafe fn _mm_mask_i32gather_epi32(
1120 src: __m128i,
1121 slice: *const i32,
1122 offsets: __m128i,
1123 mask: __m128i,
1124 scale: i32,
1125 ) -> __m128i {
1126 let src = src.as_i32x4();
1127 let mask = mask.as_i32x4();
1128 let offsets = offsets.as_i32x4();
1129 let slice = slice as *const i8;
1130 macro_rules! call {
1131 ($imm8:expr) => {
1132 pgatherdd(src, slice, offsets, mask, $imm8)
1133 };
1134 }
1135 let r = constify_imm8!(scale, call);
1136 transmute(r)
1137 }
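// Sketch of the masked form (illustration only; name and values are assumptions):
// lanes whose `mask` element has its highest bit clear keep the value from `src`.
#[cfg(test)]
#[target_feature(enable = "avx2")]
unsafe fn _mask_i32gather_demo() {
    let data: [i32; 4] = [100, 101, 102, 103];
    let src = _mm_set1_epi32(-7);
    let offsets = _mm_setr_epi32(0, 1, 2, 3);
    // Only lanes 0 and 2 are gathered; lanes 1 and 3 fall back to `src`.
    let mask = _mm_setr_epi32(-1, 0, -1, 0);
    let r: [i32; 4] =
        transmute(_mm_mask_i32gather_epi32(src, data.as_ptr(), offsets, mask, 4));
    assert_eq!(r, [100, -7, 102, -7]);
}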
1138
1139 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1140 /// where
1141 /// `scale` should be 1, 2, 4 or 8.
1142 ///
1143 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32gather_epi32)
1144 #[inline]
1145 #[target_feature(enable = "avx2")]
1146 #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))]
1147 #[rustc_args_required_const(2)]
1148 #[stable(feature = "simd_x86", since = "1.27.0")]
1149 pub unsafe fn _mm256_i32gather_epi32(slice: *const i32, offsets: __m256i, scale: i32) -> __m256i {
1150 let zero = _mm256_setzero_si256().as_i32x8();
1151 let neg_one = _mm256_set1_epi32(-1).as_i32x8();
1152 let offsets = offsets.as_i32x8();
1153 let slice = slice as *const i8;
1154 macro_rules! call {
1155 ($imm8:expr) => {
1156 vpgatherdd(zero, slice, offsets, neg_one, $imm8)
1157 };
1158 }
1159 let r = constify_imm8!(scale, call);
1160 transmute(r)
1161 }
1162
1163 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1164 /// where
1165 /// `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
1166 /// element in `mask` is not set, the value from `src` is used instead.
1167 ///
1168 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32gather_epi32)
1169 #[inline]
1170 #[target_feature(enable = "avx2")]
1171 #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))]
1172 #[rustc_args_required_const(4)]
1173 #[stable(feature = "simd_x86", since = "1.27.0")]
1174 pub unsafe fn _mm256_mask_i32gather_epi32(
1175 src: __m256i,
1176 slice: *const i32,
1177 offsets: __m256i,
1178 mask: __m256i,
1179 scale: i32,
1180 ) -> __m256i {
1181 let src = src.as_i32x8();
1182 let mask = mask.as_i32x8();
1183 let offsets = offsets.as_i32x8();
1184 let slice = slice as *const i8;
1185 macro_rules! call {
1186 ($imm8:expr) => {
1187 vpgatherdd(src, slice, offsets, mask, $imm8)
1188 };
1189 }
1190 let r = constify_imm8!(scale, call);
1191 transmute(r)
1192 }
1193
1194 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1195 /// where
1196 /// `scale` should be 1, 2, 4 or 8.
1197 ///
1198 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32gather_ps)
1199 #[inline]
1200 #[target_feature(enable = "avx2")]
1201 #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))]
1202 #[rustc_args_required_const(2)]
1203 #[stable(feature = "simd_x86", since = "1.27.0")]
1204 pub unsafe fn _mm_i32gather_ps(slice: *const f32, offsets: __m128i, scale: i32) -> __m128 {
1205 let zero = _mm_setzero_ps();
1206 let neg_one = _mm_set1_ps(-1.0);
1207 let offsets = offsets.as_i32x4();
1208 let slice = slice as *const i8;
1209 macro_rules! call {
1210 ($imm8:expr) => {
1211 pgatherdps(zero, slice, offsets, neg_one, $imm8)
1212 };
1213 }
1214 constify_imm8!(scale, call)
1215 }
1216
1217 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1218 /// where
1219 /// `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
1220 /// element in `mask` is not set, the value from `src` is used instead.
1221 ///
1222 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32gather_ps)
1223 #[inline]
1224 #[target_feature(enable = "avx2")]
1225 #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))]
1226 #[rustc_args_required_const(4)]
1227 #[stable(feature = "simd_x86", since = "1.27.0")]
1228 pub unsafe fn _mm_mask_i32gather_ps(
1229 src: __m128,
1230 slice: *const f32,
1231 offsets: __m128i,
1232 mask: __m128,
1233 scale: i32,
1234 ) -> __m128 {
1235 let offsets = offsets.as_i32x4();
1236 let slice = slice as *const i8;
1237 macro_rules! call {
1238 ($imm8:expr) => {
1239 pgatherdps(src, slice, offsets, mask, $imm8)
1240 };
1241 }
1242 constify_imm8!(scale, call)
1243 }
1244
1245 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1246 /// where
1247 /// `scale` should be 1, 2, 4 or 8.
1248 ///
1249 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32gather_ps)
1250 #[inline]
1251 #[target_feature(enable = "avx2")]
1252 #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))]
1253 #[rustc_args_required_const(2)]
1254 #[stable(feature = "simd_x86", since = "1.27.0")]
1255 pub unsafe fn _mm256_i32gather_ps(slice: *const f32, offsets: __m256i, scale: i32) -> __m256 {
1256 let zero = _mm256_setzero_ps();
1257 let neg_one = _mm256_set1_ps(-1.0);
1258 let offsets = offsets.as_i32x8();
1259 let slice = slice as *const i8;
1260 macro_rules! call {
1261 ($imm8:expr) => {
1262 vpgatherdps(zero, slice, offsets, neg_one, $imm8)
1263 };
1264 }
1265 constify_imm8!(scale, call)
1266 }
1267
1268 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1269 /// where
1270 /// `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
1271 /// element in `mask` is not set, the value from `src` is used instead.
1272 ///
1273 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32gather_ps)
1274 #[inline]
1275 #[target_feature(enable = "avx2")]
1276 #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))]
1277 #[rustc_args_required_const(4)]
1278 #[stable(feature = "simd_x86", since = "1.27.0")]
1279 pub unsafe fn _mm256_mask_i32gather_ps(
1280 src: __m256,
1281 slice: *const f32,
1282 offsets: __m256i,
1283 mask: __m256,
1284 scale: i32,
1285 ) -> __m256 {
1286 let offsets = offsets.as_i32x8();
1287 let slice = slice as *const i8;
1288 macro_rules! call {
1289 ($imm8:expr) => {
1290 vpgatherdps(src, slice, offsets, mask, $imm8)
1291 };
1292 }
1293 constify_imm8!(scale, call)
1294 }
1295
1296 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1297 /// where
1298 /// `scale` should be 1, 2, 4 or 8.
1299 ///
1300 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32gather_epi64)
1301 #[inline]
1302 #[target_feature(enable = "avx2")]
1303 #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))]
1304 #[rustc_args_required_const(2)]
1305 #[stable(feature = "simd_x86", since = "1.27.0")]
1306 pub unsafe fn _mm_i32gather_epi64(slice: *const i64, offsets: __m128i, scale: i32) -> __m128i {
1307 let zero = _mm_setzero_si128().as_i64x2();
1308 let neg_one = _mm_set1_epi64x(-1).as_i64x2();
1309 let offsets = offsets.as_i32x4();
1310 let slice = slice as *const i8;
1311 macro_rules! call {
1312 ($imm8:expr) => {
1313 pgatherdq(zero, slice, offsets, neg_one, $imm8)
1314 };
1315 }
1316 let r = constify_imm8!(scale, call);
1317 transmute(r)
1318 }
1319
1320 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1321 /// where
1322 /// `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
1323 /// element in `mask` is not set, the value from `src` is used instead.
1324 ///
1325 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32gather_epi64)
1326 #[inline]
1327 #[target_feature(enable = "avx2")]
1328 #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))]
1329 #[rustc_args_required_const(4)]
1330 #[stable(feature = "simd_x86", since = "1.27.0")]
1331 pub unsafe fn _mm_mask_i32gather_epi64(
1332 src: __m128i,
1333 slice: *const i64,
1334 offsets: __m128i,
1335 mask: __m128i,
1336 scale: i32,
1337 ) -> __m128i {
1338 let src = src.as_i64x2();
1339 let mask = mask.as_i64x2();
1340 let offsets = offsets.as_i32x4();
1341 let slice = slice as *const i8;
1342 macro_rules! call {
1343 ($imm8:expr) => {
1344 pgatherdq(src, slice, offsets, mask, $imm8)
1345 };
1346 }
1347 let r = constify_imm8!(scale, call);
1348 transmute(r)
1349 }
1350
1351 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1352 /// where
1353 /// `scale` should be 1, 2, 4 or 8.
1354 ///
1355 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32gather_epi64)
1356 #[inline]
1357 #[target_feature(enable = "avx2")]
1358 #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))]
1359 #[rustc_args_required_const(2)]
1360 #[stable(feature = "simd_x86", since = "1.27.0")]
1361 pub unsafe fn _mm256_i32gather_epi64(slice: *const i64, offsets: __m128i, scale: i32) -> __m256i {
1362 let zero = _mm256_setzero_si256().as_i64x4();
1363 let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
1364 let offsets = offsets.as_i32x4();
1365 let slice = slice as *const i8;
1366 macro_rules! call {
1367 ($imm8:expr) => {
1368 vpgatherdq(zero, slice, offsets, neg_one, $imm8)
1369 };
1370 }
1371 let r = constify_imm8!(scale, call);
1372 transmute(r)
1373 }
1374
1375 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1376 /// where
1377 /// `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
1378 /// element in `mask` is not set, the value from `src` is used instead.
1379 ///
1380 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32gather_epi64)
1381 #[inline]
1382 #[target_feature(enable = "avx2")]
1383 #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))]
1384 #[rustc_args_required_const(4)]
1385 #[stable(feature = "simd_x86", since = "1.27.0")]
1386 pub unsafe fn _mm256_mask_i32gather_epi64(
1387 src: __m256i,
1388 slice: *const i64,
1389 offsets: __m128i,
1390 mask: __m256i,
1391 scale: i32,
1392 ) -> __m256i {
1393 let src = src.as_i64x4();
1394 let mask = mask.as_i64x4();
1395 let offsets = offsets.as_i32x4();
1396 let slice = slice as *const i8;
1397 macro_rules! call {
1398 ($imm8:expr) => {
1399 vpgatherdq(src, slice, offsets, mask, $imm8)
1400 };
1401 }
1402 let r = constify_imm8!(scale, call);
1403 transmute(r)
1404 }
1405
1406 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1407 /// where
1408 /// `scale` should be 1, 2, 4 or 8.
1409 ///
1410 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32gather_pd)
1411 #[inline]
1412 #[target_feature(enable = "avx2")]
1413 #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))]
1414 #[rustc_args_required_const(2)]
1415 #[stable(feature = "simd_x86", since = "1.27.0")]
1416 pub unsafe fn _mm_i32gather_pd(slice: *const f64, offsets: __m128i, scale: i32) -> __m128d {
1417 let zero = _mm_setzero_pd();
1418 let neg_one = _mm_set1_pd(-1.0);
1419 let offsets = offsets.as_i32x4();
1420 let slice = slice as *const i8;
1421 macro_rules! call {
1422 ($imm8:expr) => {
1423 pgatherdpd(zero, slice, offsets, neg_one, $imm8)
1424 };
1425 }
1426 constify_imm8!(scale, call)
1427 }
1428
1429 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1430 /// where
1431 /// `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
1432 /// element in `mask` is not set, the value from `src` is used instead.
1433 ///
1434 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32gather_pd)
1435 #[inline]
1436 #[target_feature(enable = "avx2")]
1437 #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))]
1438 #[rustc_args_required_const(4)]
1439 #[stable(feature = "simd_x86", since = "1.27.0")]
1440 pub unsafe fn _mm_mask_i32gather_pd(
1441 src: __m128d,
1442 slice: *const f64,
1443 offsets: __m128i,
1444 mask: __m128d,
1445 scale: i32,
1446 ) -> __m128d {
1447 let offsets = offsets.as_i32x4();
1448 let slice = slice as *const i8;
1449 macro_rules! call {
1450 ($imm8:expr) => {
1451 pgatherdpd(src, slice, offsets, mask, $imm8)
1452 };
1453 }
1454 constify_imm8!(scale, call)
1455 }
1456
1457 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1458 /// where
1459 /// `scale` should be 1, 2, 4 or 8.
1460 ///
1461 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32gather_pd)
1462 #[inline]
1463 #[target_feature(enable = "avx2")]
1464 #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))]
1465 #[rustc_args_required_const(2)]
1466 #[stable(feature = "simd_x86", since = "1.27.0")]
1467 pub unsafe fn _mm256_i32gather_pd(slice: *const f64, offsets: __m128i, scale: i32) -> __m256d {
1468 let zero = _mm256_setzero_pd();
1469 let neg_one = _mm256_set1_pd(-1.0);
1470 let offsets = offsets.as_i32x4();
1471 let slice = slice as *const i8;
1472 macro_rules! call {
1473 ($imm8:expr) => {
1474 vpgatherdpd(zero, slice, offsets, neg_one, $imm8)
1475 };
1476 }
1477 constify_imm8!(scale, call)
1478 }
1479
1480 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1481 /// where
1482 /// `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
1483 /// element in `mask` is not set, the value from `src` is used instead.
1484 ///
1485 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32gather_pd)
1486 #[inline]
1487 #[target_feature(enable = "avx2")]
1488 #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))]
1489 #[rustc_args_required_const(4)]
1490 #[stable(feature = "simd_x86", since = "1.27.0")]
1491 pub unsafe fn _mm256_mask_i32gather_pd(
1492 src: __m256d,
1493 slice: *const f64,
1494 offsets: __m128i,
1495 mask: __m256d,
1496 scale: i32,
1497 ) -> __m256d {
1498 let offsets = offsets.as_i32x4();
1499 let slice = slice as *const i8;
1500 macro_rules! call {
1501 ($imm8:expr) => {
1502 vpgatherdpd(src, slice, offsets, mask, $imm8)
1503 };
1504 }
1505 constify_imm8!(scale, call)
1506 }
1507
1508 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1509 /// where
1510 /// `scale` should be 1, 2, 4 or 8.
1511 ///
1512 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64gather_epi32)
1513 #[inline]
1514 #[target_feature(enable = "avx2")]
1515 #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))]
1516 #[rustc_args_required_const(2)]
1517 #[stable(feature = "simd_x86", since = "1.27.0")]
1518 pub unsafe fn _mm_i64gather_epi32(slice: *const i32, offsets: __m128i, scale: i32) -> __m128i {
1519 let zero = _mm_setzero_si128().as_i32x4();
1520 let neg_one = _mm_set1_epi64x(-1).as_i32x4();
1521 let offsets = offsets.as_i64x2();
1522 let slice = slice as *const i8;
1523 macro_rules! call {
1524 ($imm8:expr) => {
1525 pgatherqd(zero, slice, offsets, neg_one, $imm8)
1526 };
1527 }
1528 let r = constify_imm8!(scale, call);
1529 transmute(r)
1530 }
1531
1532 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1533 /// where
1534 /// `scale` should be 1, 2, 4 or 8. If the highest bit is not set in the
1535 /// corresponding element of `mask`, the element from `src` is used instead.
1536 ///
1537 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64gather_epi32)
1538 #[inline]
1539 #[target_feature(enable = "avx2")]
1540 #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))]
1541 #[rustc_args_required_const(4)]
1542 #[stable(feature = "simd_x86", since = "1.27.0")]
1543 pub unsafe fn _mm_mask_i64gather_epi32(
1544 src: __m128i,
1545 slice: *const i32,
1546 offsets: __m128i,
1547 mask: __m128i,
1548 scale: i32,
1549 ) -> __m128i {
1550 let src = src.as_i32x4();
1551 let mask = mask.as_i32x4();
1552 let offsets = offsets.as_i64x2();
1553 let slice = slice as *const i8;
1554 macro_rules! call {
1555 ($imm8:expr) => {
1556 pgatherqd(src, slice, offsets, mask, $imm8)
1557 };
1558 }
1559 let r = constify_imm8!(scale, call);
1560 transmute(r)
1561 }
1562
1563 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1564 /// where
1565 /// `scale` should be 1, 2, 4 or 8.
1566 ///
1567 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64gather_epi32)
1568 #[inline]
1569 #[target_feature(enable = "avx2")]
1570 #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))]
1571 #[rustc_args_required_const(2)]
1572 #[stable(feature = "simd_x86", since = "1.27.0")]
1573 pub unsafe fn _mm256_i64gather_epi32(slice: *const i32, offsets: __m256i, scale: i32) -> __m128i {
1574 let zero = _mm_setzero_si128().as_i32x4();
1575 let neg_one = _mm_set1_epi64x(-1).as_i32x4();
1576 let offsets = offsets.as_i64x4();
1577 let slice = slice as *const i8;
1578 macro_rules! call {
1579 ($imm8:expr) => {
1580 vpgatherqd(zero, slice, offsets, neg_one, $imm8)
1581 };
1582 }
1583 let r = constify_imm8!(scale, call);
1584 transmute(r)
1585 }
1586
1587 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1588 /// where
1589 /// `scale` should be 1, 2, 4 or 8. If the highest bit is not set in the
1590 /// corresponding element of `mask`, the element from `src` is used instead.
1591 ///
1592 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64gather_epi32)
1593 #[inline]
1594 #[target_feature(enable = "avx2")]
1595 #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))]
1596 #[rustc_args_required_const(4)]
1597 #[stable(feature = "simd_x86", since = "1.27.0")]
1598 pub unsafe fn _mm256_mask_i64gather_epi32(
1599 src: __m128i,
1600 slice: *const i32,
1601 offsets: __m256i,
1602 mask: __m128i,
1603 scale: i32,
1604 ) -> __m128i {
1605 let src = src.as_i32x4();
1606 let mask = mask.as_i32x4();
1607 let offsets = offsets.as_i64x4();
1608 let slice = slice as *const i8;
1609 macro_rules! call {
1610 ($imm8:expr) => {
1611 vpgatherqd(src, slice, offsets, mask, $imm8)
1612 };
1613 }
1614 let r = constify_imm8!(scale, call);
1615 transmute(r)
1616 }
1617
1618 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1619 /// where
1620 /// `scale` should be 1, 2, 4 or 8.
1621 ///
1622 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64gather_ps)
1623 #[inline]
1624 #[target_feature(enable = "avx2")]
1625 #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))]
1626 #[rustc_args_required_const(2)]
1627 #[stable(feature = "simd_x86", since = "1.27.0")]
1628 pub unsafe fn _mm_i64gather_ps(slice: *const f32, offsets: __m128i, scale: i32) -> __m128 {
1629 let zero = _mm_setzero_ps();
1630 let neg_one = _mm_set1_ps(-1.0);
1631 let offsets = offsets.as_i64x2();
1632 let slice = slice as *const i8;
1633 macro_rules! call {
1634 ($imm8:expr) => {
1635 pgatherqps(zero, slice, offsets, neg_one, $imm8)
1636 };
1637 }
1638 constify_imm8!(scale, call)
1639 }
1640
1641 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1642 /// where
1643 /// `scale` should be 1, 2, 4 or 8. If the highest bit is not set in the
1644 /// corresponding element of `mask`, the element from `src` is used instead.
1645 ///
1646 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64gather_ps)
1647 #[inline]
1648 #[target_feature(enable = "avx2")]
1649 #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))]
1650 #[rustc_args_required_const(4)]
1651 #[stable(feature = "simd_x86", since = "1.27.0")]
1652 pub unsafe fn _mm_mask_i64gather_ps(
1653 src: __m128,
1654 slice: *const f32,
1655 offsets: __m128i,
1656 mask: __m128,
1657 scale: i32,
1658 ) -> __m128 {
1659 let offsets = offsets.as_i64x2();
1660 let slice = slice as *const i8;
1661 macro_rules! call {
1662 ($imm8:expr) => {
1663 pgatherqps(src, slice, offsets, mask, $imm8)
1664 };
1665 }
1666 constify_imm8!(scale, call)
1667 }
1668
1669 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1670 /// where
1671 /// `scale` should be 1, 2, 4 or 8.
1672 ///
1673 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64gather_ps)
1674 #[inline]
1675 #[target_feature(enable = "avx2")]
1676 #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))]
1677 #[rustc_args_required_const(2)]
1678 #[stable(feature = "simd_x86", since = "1.27.0")]
1679 pub unsafe fn _mm256_i64gather_ps(slice: *const f32, offsets: __m256i, scale: i32) -> __m128 {
1680 let zero = _mm_setzero_ps();
1681 let neg_one = _mm_set1_ps(-1.0);
1682 let offsets = offsets.as_i64x4();
1683 let slice = slice as *const i8;
1684 macro_rules! call {
1685 ($imm8:expr) => {
1686 vpgatherqps(zero, slice, offsets, neg_one, $imm8)
1687 };
1688 }
1689 constify_imm8!(scale, call)
1690 }
1691
1692 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1693 /// where
1694 /// `scale` should be 1, 2, 4 or 8. If the highest bit is not set in the
1695 /// corresponding element of `mask`, the element from `src` is used instead.
1696 ///
1697 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64gather_ps)
1698 #[inline]
1699 #[target_feature(enable = "avx2")]
1700 #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))]
1701 #[rustc_args_required_const(4)]
1702 #[stable(feature = "simd_x86", since = "1.27.0")]
1703 pub unsafe fn _mm256_mask_i64gather_ps(
1704 src: __m128,
1705 slice: *const f32,
1706 offsets: __m256i,
1707 mask: __m128,
1708 scale: i32,
1709 ) -> __m128 {
1710 let offsets = offsets.as_i64x4();
1711 let slice = slice as *const i8;
1712 macro_rules! call {
1713 ($imm8:expr) => {
1714 vpgatherqps(src, slice, offsets, mask, $imm8)
1715 };
1716 }
1717 constify_imm8!(scale, call)
1718 }
1719
1720 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1721 /// where
1722 /// `scale` should be 1, 2, 4 or 8.
1723 ///
1724 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64gather_epi64)
1725 #[inline]
1726 #[target_feature(enable = "avx2")]
1727 #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))]
1728 #[rustc_args_required_const(2)]
1729 #[stable(feature = "simd_x86", since = "1.27.0")]
1730 pub unsafe fn _mm_i64gather_epi64(slice: *const i64, offsets: __m128i, scale: i32) -> __m128i {
1731 let zero = _mm_setzero_si128().as_i64x2();
1732 let neg_one = _mm_set1_epi64x(-1).as_i64x2();
1733 let slice = slice as *const i8;
1734 let offsets = offsets.as_i64x2();
1735 macro_rules! call {
1736 ($imm8:expr) => {
1737 pgatherqq(zero, slice, offsets, neg_one, $imm8)
1738 };
1739 }
1740 let r = constify_imm8!(scale, call);
1741 transmute(r)
1742 }
1743
1744 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1745 /// where
1746 /// `scale` should be 1, 2, 4 or 8. If the highest bit is not set in the
1747 /// corresponding element of `mask`, the element from `src` is used instead.
1748 ///
1749 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64gather_epi64)
1750 #[inline]
1751 #[target_feature(enable = "avx2")]
1752 #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))]
1753 #[rustc_args_required_const(4)]
1754 #[stable(feature = "simd_x86", since = "1.27.0")]
1755 pub unsafe fn _mm_mask_i64gather_epi64(
1756 src: __m128i,
1757 slice: *const i64,
1758 offsets: __m128i,
1759 mask: __m128i,
1760 scale: i32,
1761 ) -> __m128i {
1762 let src = src.as_i64x2();
1763 let mask = mask.as_i64x2();
1764 let offsets = offsets.as_i64x2();
1765 let slice = slice as *const i8;
1766 macro_rules! call {
1767 ($imm8:expr) => {
1768 pgatherqq(src, slice, offsets, mask, $imm8)
1769 };
1770 }
1771 let r = constify_imm8!(scale, call);
1772 transmute(r)
1773 }
1774
1775 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1776 /// where
1777 /// `scale` should be 1, 2, 4 or 8.
1778 ///
1779 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64gather_epi64)
1780 #[inline]
1781 #[target_feature(enable = "avx2")]
1782 #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))]
1783 #[rustc_args_required_const(2)]
1784 #[stable(feature = "simd_x86", since = "1.27.0")]
1785 pub unsafe fn _mm256_i64gather_epi64(slice: *const i64, offsets: __m256i, scale: i32) -> __m256i {
1786 let zero = _mm256_setzero_si256().as_i64x4();
1787 let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
1788 let slice = slice as *const i8;
1789 let offsets = offsets.as_i64x4();
1790 macro_rules! call {
1791 ($imm8:expr) => {
1792 vpgatherqq(zero, slice, offsets, neg_one, $imm8)
1793 };
1794 }
1795 let r = constify_imm8!(scale, call);
1796 transmute(r)
1797 }
1798
1799 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1800 /// where
1801 /// `scale` should be 1, 2, 4 or 8. If the highest bit is not set in the
1802 /// corresponding element of `mask`, the element from `src` is used instead.
1803 ///
1804 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64gather_epi64)
1805 #[inline]
1806 #[target_feature(enable = "avx2")]
1807 #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))]
1808 #[rustc_args_required_const(4)]
1809 #[stable(feature = "simd_x86", since = "1.27.0")]
1810 pub unsafe fn _mm256_mask_i64gather_epi64(
1811 src: __m256i,
1812 slice: *const i64,
1813 offsets: __m256i,
1814 mask: __m256i,
1815 scale: i32,
1816 ) -> __m256i {
1817 let src = src.as_i64x4();
1818 let mask = mask.as_i64x4();
1819 let offsets = offsets.as_i64x4();
1820 let slice = slice as *const i8;
1821 macro_rules! call {
1822 ($imm8:expr) => {
1823 vpgatherqq(src, slice, offsets, mask, $imm8)
1824 };
1825 }
1826 let r = constify_imm8!(scale, call);
1827 transmute(r)
1828 }
1829
1830 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1831 /// where
1832 /// `scale` should be 1, 2, 4 or 8.
1833 ///
1834 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64gather_pd)
1835 #[inline]
1836 #[target_feature(enable = "avx2")]
1837 #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))]
1838 #[rustc_args_required_const(2)]
1839 #[stable(feature = "simd_x86", since = "1.27.0")]
1840 pub unsafe fn _mm_i64gather_pd(slice: *const f64, offsets: __m128i, scale: i32) -> __m128d {
1841 let zero = _mm_setzero_pd();
1842 let neg_one = _mm_set1_pd(-1.0);
1843 let slice = slice as *const i8;
1844 let offsets = offsets.as_i64x2();
1845 macro_rules! call {
1846 ($imm8:expr) => {
1847 pgatherqpd(zero, slice, offsets, neg_one, $imm8)
1848 };
1849 }
1850 constify_imm8!(scale, call)
1851 }
1852
1853 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1854 /// where
1855 /// `scale` should be 1, 2, 4 or 8. If the highest bit is not set in the
1856 /// corresponding element of `mask`, the element from `src` is used instead.
1857 ///
1858 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64gather_pd)
1859 #[inline]
1860 #[target_feature(enable = "avx2")]
1861 #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))]
1862 #[rustc_args_required_const(4)]
1863 #[stable(feature = "simd_x86", since = "1.27.0")]
1864 pub unsafe fn _mm_mask_i64gather_pd(
1865 src: __m128d,
1866 slice: *const f64,
1867 offsets: __m128i,
1868 mask: __m128d,
1869 scale: i32,
1870 ) -> __m128d {
1871 let slice = slice as *const i8;
1872 let offsets = offsets.as_i64x2();
1873 macro_rules! call {
1874 ($imm8:expr) => {
1875 pgatherqpd(src, slice, offsets, mask, $imm8)
1876 };
1877 }
1878 constify_imm8!(scale, call)
1879 }
1880
1881 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1882 /// where
1883 /// `scale` should be 1, 2, 4 or 8.
1884 ///
1885 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64gather_pd)
1886 #[inline]
1887 #[target_feature(enable = "avx2")]
1888 #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))]
1889 #[rustc_args_required_const(2)]
1890 #[stable(feature = "simd_x86", since = "1.27.0")]
1891 pub unsafe fn _mm256_i64gather_pd(slice: *const f64, offsets: __m256i, scale: i32) -> __m256d {
1892 let zero = _mm256_setzero_pd();
1893 let neg_one = _mm256_set1_pd(-1.0);
1894 let slice = slice as *const i8;
1895 let offsets = offsets.as_i64x4();
1896 macro_rules! call {
1897 ($imm8:expr) => {
1898 vpgatherqpd(zero, slice, offsets, neg_one, $imm8)
1899 };
1900 }
1901 constify_imm8!(scale, call)
1902 }
1903
1904 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1905 /// where
1906 /// `scale` should be 1, 2, 4 or 8. If the highest bit is not set in the
1907 /// corresponding element of `mask`, the element from `src` is used instead.
1908 ///
1909 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64gather_pd)
1910 #[inline]
1911 #[target_feature(enable = "avx2")]
1912 #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))]
1913 #[rustc_args_required_const(4)]
1914 #[stable(feature = "simd_x86", since = "1.27.0")]
1915 pub unsafe fn _mm256_mask_i64gather_pd(
1916 src: __m256d,
1917 slice: *const f64,
1918 offsets: __m256i,
1919 mask: __m256d,
1920 scale: i32,
1921 ) -> __m256d {
1922 let slice = slice as *const i8;
1923 let offsets = offsets.as_i64x4();
1924 macro_rules! call {
1925 ($imm8:expr) => {
1926 vpgatherqpd(src, slice, offsets, mask, $imm8)
1927 };
1928 }
1929 constify_imm8!(scale, call)
1930 }
1931
1932 /// Copies `a` to `dst`, then inserts 128 bits (of integer data) from `b` at the
1933 /// location specified by `imm8`.
1934 ///
1935 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_inserti128_si256)
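///
/// A minimal usage sketch with illustrative values (assumes `avx2` is
/// detected at runtime; `imm8 = 1` replaces the upper 128-bit half):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_setr_epi64x(0, 1, 2, 3);
/// let b = _mm_set_epi64x(8, 7); // lanes [7, 8] in memory order
/// let r = _mm256_inserti128_si256(a, b, 1);
/// let e = _mm256_setr_epi64x(0, 1, 7, 8);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, e)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```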
1936 #[inline]
1937 #[target_feature(enable = "avx2")]
1938 #[cfg_attr(
1939 all(test, not(target_os = "windows")),
1940 assert_instr(vinsertf128, imm8 = 1)
1941 )]
1942 #[rustc_args_required_const(2)]
1943 #[stable(feature = "simd_x86", since = "1.27.0")]
1944 pub unsafe fn _mm256_inserti128_si256(a: __m256i, b: __m128i, imm8: i32) -> __m256i {
1945 let a = a.as_i64x4();
1946 let b = _mm256_castsi128_si256(b).as_i64x4();
1947 let dst: i64x4 = match imm8 & 0b01 {
1948 0 => simd_shuffle4(a, b, [4, 5, 2, 3]),
1949 _ => simd_shuffle4(a, b, [0, 1, 4, 5]),
1950 };
1951 transmute(dst)
1952 }
1953
1954 /// Multiplies packed signed 16-bit integers in `a` and `b`, producing
1955 /// intermediate signed 32-bit integers. Horizontally adds adjacent pairs
1956 /// of intermediate 32-bit integers.
1957 ///
1958 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_madd_epi16)
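///
/// A minimal usage sketch with illustrative values (assumes `avx2` is
/// detected at runtime; each 32-bit lane is the sum of one multiplied pair):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_set1_epi16(2);
/// let b = _mm256_setr_epi16(
///     1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
/// );
/// // First lane: 2 * 1 + 2 * 2 = 6, and so on for each pair.
/// let r = _mm256_madd_epi16(a, b);
/// let e = _mm256_setr_epi32(6, 14, 22, 30, 38, 46, 54, 62);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, e)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```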
1959 #[inline]
1960 #[target_feature(enable = "avx2")]
1961 #[cfg_attr(test, assert_instr(vpmaddwd))]
1962 #[stable(feature = "simd_x86", since = "1.27.0")]
1963 pub unsafe fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i {
1964 transmute(pmaddwd(a.as_i16x16(), b.as_i16x16()))
1965 }
1966
1967 /// Vertically multiplies each unsigned 8-bit integer from `a` with the
1968 /// corresponding signed 8-bit integer from `b`, producing intermediate
1969 /// signed 16-bit integers. Horizontally adds adjacent pairs of intermediate
1970 /// signed 16-bit integers.
1971 ///
1972 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maddubs_epi16)
1973 #[inline]
1974 #[target_feature(enable = "avx2")]
1975 #[cfg_attr(test, assert_instr(vpmaddubsw))]
1976 #[stable(feature = "simd_x86", since = "1.27.0")]
1977 pub unsafe fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i {
1978 transmute(pmaddubsw(a.as_u8x32(), b.as_u8x32()))
1979 }
1980
1981 /// Loads packed 32-bit integers from memory pointed by `mem_addr` using `mask`
1982 /// (elements are zeroed out when the highest bit is not set in the
1983 /// corresponding element).
1984 ///
1985 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskload_epi32)
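///
/// A minimal usage sketch with illustrative values (assumes `avx2` is
/// detected at runtime; lanes with a clear mask bit read as zero):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let data = [1_i32, 2, 3, 4];
/// // Only lanes whose mask element has its highest bit set are loaded.
/// let mask = _mm_setr_epi32(-1, 0, -1, 0);
/// let r = _mm_maskload_epi32(data.as_ptr(), mask);
/// let e = _mm_setr_epi32(1, 0, 3, 0);
/// assert_eq!(_mm_movemask_epi8(_mm_cmpeq_epi32(r, e)), 0xFFFF);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```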
1986 #[inline]
1987 #[target_feature(enable = "avx2")]
1988 #[cfg_attr(test, assert_instr(vpmaskmovd))]
1989 #[stable(feature = "simd_x86", since = "1.27.0")]
1990 pub unsafe fn _mm_maskload_epi32(mem_addr: *const i32, mask: __m128i) -> __m128i {
1991 transmute(maskloadd(mem_addr as *const i8, mask.as_i32x4()))
1992 }
1993
1994 /// Loads packed 32-bit integers from memory pointed by `mem_addr` using `mask`
1995 /// (elements are zeroed out when the highest bit is not set in the
1996 /// corresponding element).
1997 ///
1998 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskload_epi32)
1999 #[inline]
2000 #[target_feature(enable = "avx2")]
2001 #[cfg_attr(test, assert_instr(vpmaskmovd))]
2002 #[stable(feature = "simd_x86", since = "1.27.0")]
2003 pub unsafe fn _mm256_maskload_epi32(mem_addr: *const i32, mask: __m256i) -> __m256i {
2004 transmute(maskloadd256(mem_addr as *const i8, mask.as_i32x8()))
2005 }
2006
2007 /// Loads packed 64-bit integers from memory pointed by `mem_addr` using `mask`
2008 /// (elements are zeroed out when the highest bit is not set in the
2009 /// corresponding element).
2010 ///
2011 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskload_epi64)
2012 #[inline]
2013 #[target_feature(enable = "avx2")]
2014 #[cfg_attr(test, assert_instr(vpmaskmovq))]
2015 #[stable(feature = "simd_x86", since = "1.27.0")]
2016 pub unsafe fn _mm_maskload_epi64(mem_addr: *const i64, mask: __m128i) -> __m128i {
2017 transmute(maskloadq(mem_addr as *const i8, mask.as_i64x2()))
2018 }
2019
2020 /// Loads packed 64-bit integers from memory pointed by `mem_addr` using `mask`
2021 /// (elements are zeroed out when the highest bit is not set in the
2022 /// corresponding element).
2023 ///
2024 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskload_epi64)
2025 #[inline]
2026 #[target_feature(enable = "avx2")]
2027 #[cfg_attr(test, assert_instr(vpmaskmovq))]
2028 #[stable(feature = "simd_x86", since = "1.27.0")]
2029 pub unsafe fn _mm256_maskload_epi64(mem_addr: *const i64, mask: __m256i) -> __m256i {
2030 transmute(maskloadq256(mem_addr as *const i8, mask.as_i64x4()))
2031 }
2032
2033 /// Stores packed 32-bit integers from `a` into memory pointed by `mem_addr`
2034 /// using `mask` (elements are not stored when the highest bit is not set
2035 /// in the corresponding element).
2036 ///
2037 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskstore_epi32)
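///
/// A minimal usage sketch with illustrative values (assumes `avx2` is
/// detected at runtime; lanes with a clear mask bit are left untouched):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let mut out = [0_i32; 4];
/// let a = _mm_setr_epi32(1, 2, 3, 4);
/// // Only lanes whose mask element has its highest bit set are written.
/// let mask = _mm_setr_epi32(-1, 0, 0, -1);
/// _mm_maskstore_epi32(out.as_mut_ptr(), mask, a);
/// assert_eq!(out, [1, 0, 0, 4]);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```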
2038 #[inline]
2039 #[target_feature(enable = "avx2")]
2040 #[cfg_attr(test, assert_instr(vpmaskmovd))]
2041 #[stable(feature = "simd_x86", since = "1.27.0")]
2042 pub unsafe fn _mm_maskstore_epi32(mem_addr: *mut i32, mask: __m128i, a: __m128i) {
2043 maskstored(mem_addr as *mut i8, mask.as_i32x4(), a.as_i32x4())
2044 }
2045
2046 /// Stores packed 32-bit integers from `a` into memory pointed by `mem_addr`
2047 /// using `mask` (elements are not stored when the highest bit is not set
2048 /// in the corresponding element).
2049 ///
2050 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskstore_epi32)
2051 #[inline]
2052 #[target_feature(enable = "avx2")]
2053 #[cfg_attr(test, assert_instr(vpmaskmovd))]
2054 #[stable(feature = "simd_x86", since = "1.27.0")]
2055 pub unsafe fn _mm256_maskstore_epi32(mem_addr: *mut i32, mask: __m256i, a: __m256i) {
2056 maskstored256(mem_addr as *mut i8, mask.as_i32x8(), a.as_i32x8())
2057 }
2058
2059 /// Stores packed 64-bit integers from `a` into memory pointed by `mem_addr`
2060 /// using `mask` (elements are not stored when the highest bit is not set
2061 /// in the corresponding element).
2062 ///
2063 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskstore_epi64)
2064 #[inline]
2065 #[target_feature(enable = "avx2")]
2066 #[cfg_attr(test, assert_instr(vpmaskmovq))]
2067 #[stable(feature = "simd_x86", since = "1.27.0")]
2068 pub unsafe fn _mm_maskstore_epi64(mem_addr: *mut i64, mask: __m128i, a: __m128i) {
2069 maskstoreq(mem_addr as *mut i8, mask.as_i64x2(), a.as_i64x2())
2070 }
2071
2072 /// Stores packed 64-bit integers from `a` into memory pointed by `mem_addr`
2073 /// using `mask` (elements are not stored when the highest bit is not set
2074 /// in the corresponding element).
2075 ///
2076 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskstore_epi64)
2077 #[inline]
2078 #[target_feature(enable = "avx2")]
2079 #[cfg_attr(test, assert_instr(vpmaskmovq))]
2080 #[stable(feature = "simd_x86", since = "1.27.0")]
2081 pub unsafe fn _mm256_maskstore_epi64(mem_addr: *mut i64, mask: __m256i, a: __m256i) {
2082 maskstoreq256(mem_addr as *mut i8, mask.as_i64x4(), a.as_i64x4())
2083 }
2084
2085 /// Compares packed 16-bit integers in `a` and `b`, and returns the packed
2086 /// maximum values.
2087 ///
2088 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epi16)
2089 #[inline]
2090 #[target_feature(enable = "avx2")]
2091 #[cfg_attr(test, assert_instr(vpmaxsw))]
2092 #[stable(feature = "simd_x86", since = "1.27.0")]
2093 pub unsafe fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i {
2094 transmute(pmaxsw(a.as_i16x16(), b.as_i16x16()))
2095 }
2096
2097 /// Compares packed 32-bit integers in `a` and `b`, and returns the packed
2098 /// maximum values.
2099 ///
2100 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epi32)
2101 #[inline]
2102 #[target_feature(enable = "avx2")]
2103 #[cfg_attr(test, assert_instr(vpmaxsd))]
2104 #[stable(feature = "simd_x86", since = "1.27.0")]
2105 pub unsafe fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i {
2106 transmute(pmaxsd(a.as_i32x8(), b.as_i32x8()))
2107 }
2108
2109 /// Compares packed 8-bit integers in `a` and `b`, and returns the packed
2110 /// maximum values.
2111 ///
2112 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epi8)
2113 #[inline]
2114 #[target_feature(enable = "avx2")]
2115 #[cfg_attr(test, assert_instr(vpmaxsb))]
2116 #[stable(feature = "simd_x86", since = "1.27.0")]
2117 pub unsafe fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i {
2118 transmute(pmaxsb(a.as_i8x32(), b.as_i8x32()))
2119 }
2120
2121 /// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
2122 /// the packed maximum values.
2123 ///
2124 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epu16)
2125 #[inline]
2126 #[target_feature(enable = "avx2")]
2127 #[cfg_attr(test, assert_instr(vpmaxuw))]
2128 #[stable(feature = "simd_x86", since = "1.27.0")]
2129 pub unsafe fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i {
2130 transmute(pmaxuw(a.as_u16x16(), b.as_u16x16()))
2131 }
2132
2133 /// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
2134 /// the packed maximum values.
2135 ///
2136 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epu32)
2137 #[inline]
2138 #[target_feature(enable = "avx2")]
2139 #[cfg_attr(test, assert_instr(vpmaxud))]
2140 #[stable(feature = "simd_x86", since = "1.27.0")]
2141 pub unsafe fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i {
2142 transmute(pmaxud(a.as_u32x8(), b.as_u32x8()))
2143 }
2144
2145 /// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
2146 /// the packed maximum values.
2147 ///
2148 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epu8)
2149 #[inline]
2150 #[target_feature(enable = "avx2")]
2151 #[cfg_attr(test, assert_instr(vpmaxub))]
2152 #[stable(feature = "simd_x86", since = "1.27.0")]
2153 pub unsafe fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i {
2154 transmute(pmaxub(a.as_u8x32(), b.as_u8x32()))
2155 }
2156
2157 /// Compares packed 16-bit integers in `a` and `b`, and returns the packed
2158 /// minimum values.
2159 ///
2160 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epi16)
2161 #[inline]
2162 #[target_feature(enable = "avx2")]
2163 #[cfg_attr(test, assert_instr(vpminsw))]
2164 #[stable(feature = "simd_x86", since = "1.27.0")]
2165 pub unsafe fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i {
2166 transmute(pminsw(a.as_i16x16(), b.as_i16x16()))
2167 }
2168
2169 /// Compares packed 32-bit integers in `a` and `b`, and returns the packed
2170 /// minimum values.
2171 ///
2172 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epi32)
2173 #[inline]
2174 #[target_feature(enable = "avx2")]
2175 #[cfg_attr(test, assert_instr(vpminsd))]
2176 #[stable(feature = "simd_x86", since = "1.27.0")]
2177 pub unsafe fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i {
2178 transmute(pminsd(a.as_i32x8(), b.as_i32x8()))
2179 }
2180
2181 /// Compares packed 8-bit integers in `a` and `b`, and returns the packed
2182 /// minimum values.
2183 ///
2184 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epi8)
2185 #[inline]
2186 #[target_feature(enable = "avx2")]
2187 #[cfg_attr(test, assert_instr(vpminsb))]
2188 #[stable(feature = "simd_x86", since = "1.27.0")]
2189 pub unsafe fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i {
2190 transmute(pminsb(a.as_i8x32(), b.as_i8x32()))
2191 }
2192
2193 /// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
2194 /// the packed minimum values.
2195 ///
2196 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epu16)
2197 #[inline]
2198 #[target_feature(enable = "avx2")]
2199 #[cfg_attr(test, assert_instr(vpminuw))]
2200 #[stable(feature = "simd_x86", since = "1.27.0")]
2201 pub unsafe fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i {
2202 transmute(pminuw(a.as_u16x16(), b.as_u16x16()))
2203 }
2204
2205 /// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
2206 /// the packed minimum values.
2207 ///
2208 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epu32)
2209 #[inline]
2210 #[target_feature(enable = "avx2")]
2211 #[cfg_attr(test, assert_instr(vpminud))]
2212 #[stable(feature = "simd_x86", since = "1.27.0")]
2213 pub unsafe fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i {
2214 transmute(pminud(a.as_u32x8(), b.as_u32x8()))
2215 }
2216
2217 /// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
2218 /// the packed minimum values.
2219 ///
2220 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epu8)
2221 #[inline]
2222 #[target_feature(enable = "avx2")]
2223 #[cfg_attr(test, assert_instr(vpminub))]
2224 #[stable(feature = "simd_x86", since = "1.27.0")]
2225 pub unsafe fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i {
2226 transmute(pminub(a.as_u8x32(), b.as_u8x32()))
2227 }
2228
2229 /// Creates a mask from the most significant bit of each 8-bit element in `a`,
2230 /// and returns the result.
2231 ///
2232 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_movemask_epi8)
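///
/// A minimal usage sketch with illustrative values (assumes `avx2` is
/// detected at runtime; bit `i` of the result is the sign bit of byte `i`):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_setr_epi8(
///     -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
///     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1,
/// );
/// // Bytes 0 and 31 have their most significant bit set.
/// assert_eq!(_mm256_movemask_epi8(a) as u32, 0x8000_0001);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```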
2233 #[inline]
2234 #[target_feature(enable = "avx2")]
2235 #[cfg_attr(test, assert_instr(vpmovmskb))]
2236 #[stable(feature = "simd_x86", since = "1.27.0")]
2237 pub unsafe fn _mm256_movemask_epi8(a: __m256i) -> i32 {
2238 pmovmskb(a.as_i8x32())
2239 }
2240
2241 /// Computes the sum of absolute differences (SADs) of quadruplets of unsigned
2242 /// 8-bit integers in `a` compared to those in `b`, and stores the 16-bit
2243 /// results in dst. Eight SADs are performed for each 128-bit lane using one
2244 /// quadruplet from `b` and eight quadruplets from `a`. One quadruplet is
2245 /// selected from `b` starting at the offset specified in `imm8`. Eight
2246 /// quadruplets are formed from sequential 8-bit integers selected from `a`
2247 /// starting at the offset specified in `imm8`.
2248 ///
2249 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mpsadbw_epu8)
2250 #[inline]
2251 #[target_feature(enable = "avx2")]
2252 #[cfg_attr(test, assert_instr(vmpsadbw, imm8 = 0))]
2253 #[rustc_args_required_const(2)]
2254 #[stable(feature = "simd_x86", since = "1.27.0")]
2255 pub unsafe fn _mm256_mpsadbw_epu8(a: __m256i, b: __m256i, imm8: i32) -> __m256i {
2256 let a = a.as_u8x32();
2257 let b = b.as_u8x32();
2258 macro_rules! call {
2259 ($imm8:expr) => {
2260 mpsadbw(a, b, $imm8)
2261 };
2262 }
2263 let r = constify_imm8!(imm8, call);
2264 transmute(r)
2265 }
2266
2267 /// Multiplies the low 32-bit integers from each packed 64-bit element in
2268 /// `a` and `b`
2269 ///
2270 /// Returns the 64-bit results.
2271 ///
2272 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_epi32)
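///
/// A minimal usage sketch with illustrative values (assumes `avx2` is
/// detected at runtime; only the even-indexed 32-bit lanes take part):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_setr_epi32(1, 0, -2, 0, 3, 0, -4, 0);
/// let b = _mm256_set1_epi32(10);
/// // The low 32 bits of each 64-bit element are sign-extended and multiplied.
/// let r = _mm256_mul_epi32(a, b);
/// let e = _mm256_setr_epi64x(10, -20, 30, -40);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi64(r, e)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```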
2273 #[inline]
2274 #[target_feature(enable = "avx2")]
2275 #[cfg_attr(test, assert_instr(vpmuldq))]
2276 #[stable(feature = "simd_x86", since = "1.27.0")]
2277 pub unsafe fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i {
2278 transmute(pmuldq(a.as_i32x8(), b.as_i32x8()))
2279 }
2280
2281 /// Multiplies the low unsigned 32-bit integers from each packed 64-bit
2282 /// element in `a` and `b`
2283 ///
2284 /// Returns the unsigned 64-bit results.
2285 ///
2286 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_epu32)
2287 #[inline]
2288 #[target_feature(enable = "avx2")]
2289 #[cfg_attr(test, assert_instr(vpmuludq))]
2290 #[stable(feature = "simd_x86", since = "1.27.0")]
2291 pub unsafe fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i {
2292 transmute(pmuludq(a.as_u32x8(), b.as_u32x8()))
2293 }
2294
2295 /// Multiplies the packed 16-bit integers in `a` and `b`, producing
2296 /// intermediate 32-bit integers and returning the high 16 bits of the
2297 /// intermediate integers.
2298 ///
2299 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mulhi_epi16)
2300 #[inline]
2301 #[target_feature(enable = "avx2")]
2302 #[cfg_attr(test, assert_instr(vpmulhw))]
2303 #[stable(feature = "simd_x86", since = "1.27.0")]
2304 pub unsafe fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i {
2305 transmute(pmulhw(a.as_i16x16(), b.as_i16x16()))
2306 }
2307
2308 /// Multiplies the packed unsigned 16-bit integers in `a` and `b`, producing
2309 /// intermediate 32-bit integers and returning the high 16 bits of the
2310 /// intermediate integers.
2311 ///
2312 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mulhi_epu16)
2313 #[inline]
2314 #[target_feature(enable = "avx2")]
2315 #[cfg_attr(test, assert_instr(vpmulhuw))]
2316 #[stable(feature = "simd_x86", since = "1.27.0")]
2317 pub unsafe fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i {
2318 transmute(pmulhuw(a.as_u16x16(), b.as_u16x16()))
2319 }
2320
2321 /// Multiplies the packed 16-bit integers in `a` and `b`, producing
2322 /// intermediate 32-bit integers, and returns the low 16 bits of the
2323 /// intermediate integers
2324 ///
2325 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mullo_epi16)
2326 #[inline]
2327 #[target_feature(enable = "avx2")]
2328 #[cfg_attr(test, assert_instr(vpmullw))]
2329 #[stable(feature = "simd_x86", since = "1.27.0")]
2330 pub unsafe fn _mm256_mullo_epi16(a: __m256i, b: __m256i) -> __m256i {
2331 transmute(simd_mul(a.as_i16x16(), b.as_i16x16()))
2332 }
2333
2334 /// Multiplies the packed 32-bit integers in `a` and `b`, producing
2335 /// intermediate 64-bit integers, and returns the low 32 bits of the
2336 /// intermediate integers
2337 ///
2338 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mullo_epi32)
2339 #[inline]
2340 #[target_feature(enable = "avx2")]
2341 #[cfg_attr(test, assert_instr(vpmulld))]
2342 #[stable(feature = "simd_x86", since = "1.27.0")]
2343 pub unsafe fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i {
2344 transmute(simd_mul(a.as_i32x8(), b.as_i32x8()))
2345 }
2346
2347 /// Multiplies packed 16-bit integers in `a` and `b`, producing
2348 /// intermediate signed 32-bit integers. Truncate each intermediate
2349 /// integer to the 18 most significant bits, round by adding 1, and
2350 /// return bits `[16:1]`.
2351 ///
2352 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mulhrs_epi16)
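///
/// A minimal usage sketch reading the inputs as Q15 fixed-point values (an
/// illustrative interpretation, assuming `avx2` is detected at runtime):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// // In Q15, 16384 is 0.5; the rounded product 0.5 * 0.5 = 0.25 is 8192.
/// let a = _mm256_set1_epi16(16384);
/// let b = _mm256_set1_epi16(16384);
/// let r = _mm256_mulhrs_epi16(a, b);
/// let e = _mm256_set1_epi16(8192);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi16(r, e)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```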
2353 #[inline]
2354 #[target_feature(enable = "avx2")]
2355 #[cfg_attr(test, assert_instr(vpmulhrsw))]
2356 #[stable(feature = "simd_x86", since = "1.27.0")]
2357 pub unsafe fn _mm256_mulhrs_epi16(a: __m256i, b: __m256i) -> __m256i {
2358 transmute(pmulhrsw(a.as_i16x16(), b.as_i16x16()))
2359 }
2360
2361 /// Computes the bitwise OR of 256 bits (representing integer data) in `a`
2362 /// and `b`
2363 ///
2364 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_or_si256)
2365 #[inline]
2366 #[target_feature(enable = "avx2")]
2367 #[cfg_attr(test, assert_instr(vorps))]
2368 #[stable(feature = "simd_x86", since = "1.27.0")]
2369 pub unsafe fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i {
2370 transmute(simd_or(a.as_i32x8(), b.as_i32x8()))
2371 }
2372
2373 /// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
2374 /// using signed saturation
2375 ///
2376 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_packs_epi16)
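///
/// A minimal usage sketch with illustrative values (assumes `avx2` is
/// detected at runtime; note the per-128-bit-lane interleaving of `a` and `b`):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_set1_epi16(300);
/// let b = _mm256_set1_epi16(-300);
/// // 300 saturates to 127 and -300 to -128; each 128-bit lane holds eight
/// // bytes from `a` followed by eight bytes from `b`.
/// let r = _mm256_packs_epi16(a, b);
/// let e = _mm256_setr_epi8(
///     127, 127, 127, 127, 127, 127, 127, 127,
///     -128, -128, -128, -128, -128, -128, -128, -128,
///     127, 127, 127, 127, 127, 127, 127, 127,
///     -128, -128, -128, -128, -128, -128, -128, -128,
/// );
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, e)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```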
2377 #[inline]
2378 #[target_feature(enable = "avx2")]
2379 #[cfg_attr(test, assert_instr(vpacksswb))]
2380 #[stable(feature = "simd_x86", since = "1.27.0")]
2381 pub unsafe fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i {
2382 transmute(packsswb(a.as_i16x16(), b.as_i16x16()))
2383 }
2384
2385 /// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
2386 /// using signed saturation
2387 ///
2388 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_packs_epi32)
2389 #[inline]
2390 #[target_feature(enable = "avx2")]
2391 #[cfg_attr(test, assert_instr(vpackssdw))]
2392 #[stable(feature = "simd_x86", since = "1.27.0")]
2393 pub unsafe fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i {
2394 transmute(packssdw(a.as_i32x8(), b.as_i32x8()))
2395 }
2396
2397 /// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
2398 /// using unsigned saturation
2399 ///
2400 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_packus_epi16)
2401 #[inline]
2402 #[target_feature(enable = "avx2")]
2403 #[cfg_attr(test, assert_instr(vpackuswb))]
2404 #[stable(feature = "simd_x86", since = "1.27.0")]
2405 pub unsafe fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i {
2406 transmute(packuswb(a.as_i16x16(), b.as_i16x16()))
2407 }
2408
2409 /// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
2410 /// using unsigned saturation
2411 ///
2412 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_packus_epi32)
2413 #[inline]
2414 #[target_feature(enable = "avx2")]
2415 #[cfg_attr(test, assert_instr(vpackusdw))]
2416 #[stable(feature = "simd_x86", since = "1.27.0")]
2417 pub unsafe fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i {
2418 transmute(packusdw(a.as_i32x8(), b.as_i32x8()))
2419 }
2420
2421 /// Permutes packed 32-bit integers from `a` according to the content of `b`.
2422 ///
2423 /// The lowest 3 bits of each integer of `b` are used as addresses into the 8
2424 /// integers of `a`.
2425 ///
2426 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutevar8x32_epi32)
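///
/// A minimal usage sketch with illustrative values (assumes `avx2` is
/// detected at runtime; the index vector here reverses the eight lanes):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_setr_epi32(10, 11, 12, 13, 14, 15, 16, 17);
/// // Each result lane picks the element of `a` indexed by the low 3 bits of `b`.
/// let b = _mm256_setr_epi32(7, 6, 5, 4, 3, 2, 1, 0);
/// let r = _mm256_permutevar8x32_epi32(a, b);
/// let e = _mm256_setr_epi32(17, 16, 15, 14, 13, 12, 11, 10);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, e)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```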
2427 #[inline]
2428 #[target_feature(enable = "avx2")]
2429 #[cfg_attr(test, assert_instr(vpermps))]
2430 #[stable(feature = "simd_x86", since = "1.27.0")]
2431 pub unsafe fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i {
2432 transmute(permd(a.as_u32x8(), b.as_u32x8()))
2433 }
2434
2435 /// Permutes 64-bit integers from `a` using control mask `imm8`.
2436 ///
2437 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute4x64_epi64)
2438 #[inline]
2439 #[target_feature(enable = "avx2")]
2440 #[cfg_attr(test, assert_instr(vpermpd, imm8 = 9))]
2441 #[rustc_args_required_const(1)]
2442 #[stable(feature = "simd_x86", since = "1.27.0")]
2443 pub unsafe fn _mm256_permute4x64_epi64(a: __m256i, imm8: i32) -> __m256i {
2444 let imm8 = (imm8 & 0xFF) as u8;
2445 let zero = _mm256_setzero_si256().as_i64x4();
2446 let a = a.as_i64x4();
2447 macro_rules! permute4 {
2448 ($a:expr, $b:expr, $c:expr, $d:expr) => {
2449 simd_shuffle4(a, zero, [$a, $b, $c, $d])
2450 };
2451 }
2452 macro_rules! permute3 {
2453 ($a:expr, $b:expr, $c:expr) => {
2454 match (imm8 >> 6) & 0b11 {
2455 0b00 => permute4!($a, $b, $c, 0),
2456 0b01 => permute4!($a, $b, $c, 1),
2457 0b10 => permute4!($a, $b, $c, 2),
2458 _ => permute4!($a, $b, $c, 3),
2459 }
2460 };
2461 }
2462 macro_rules! permute2 {
2463 ($a:expr, $b:expr) => {
2464 match (imm8 >> 4) & 0b11 {
2465 0b00 => permute3!($a, $b, 0),
2466 0b01 => permute3!($a, $b, 1),
2467 0b10 => permute3!($a, $b, 2),
2468 _ => permute3!($a, $b, 3),
2469 }
2470 };
2471 }
2472 macro_rules! permute1 {
2473 ($a:expr) => {
2474 match (imm8 >> 2) & 0b11 {
2475 0b00 => permute2!($a, 0),
2476 0b01 => permute2!($a, 1),
2477 0b10 => permute2!($a, 2),
2478 _ => permute2!($a, 3),
2479 }
2480 };
2481 }
2482 let r: i64x4 = match imm8 & 0b11 {
2483 0b00 => permute1!(0),
2484 0b01 => permute1!(1),
2485 0b10 => permute1!(2),
2486 _ => permute1!(3),
2487 };
2488 transmute(r)
2489 }
2490
2491 /// Shuffles 128 bits of integer data selected by `imm8` from `a` and `b`.
2492 ///
2493 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute2x128_si256)
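///
/// A minimal usage sketch with illustrative values (assumes `avx2` is
/// detected at runtime; `0x20` combines the low halves of `a` and `b`):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_setr_epi64x(10, 11, 12, 13);
/// let b = _mm256_setr_epi64x(20, 21, 22, 23);
/// // Low nibble 0 selects the low half of `a`, high nibble 2 the low half of `b`.
/// let r = _mm256_permute2x128_si256(a, b, 0x20);
/// let e = _mm256_setr_epi64x(10, 11, 20, 21);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi64(r, e)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```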
2494 #[inline]
2495 #[target_feature(enable = "avx2")]
2496 #[cfg_attr(test, assert_instr(vperm2f128, imm8 = 9))]
2497 #[rustc_args_required_const(2)]
2498 #[stable(feature = "simd_x86", since = "1.27.0")]
2499 pub unsafe fn _mm256_permute2x128_si256(a: __m256i, b: __m256i, imm8: i32) -> __m256i {
2500 let a = a.as_i64x4();
2501 let b = b.as_i64x4();
2502 macro_rules! call {
2503 ($imm8:expr) => {
2504 vperm2i128(a, b, $imm8)
2505 };
2506 }
2507 transmute(constify_imm8!(imm8, call))
2508 }
2509
2510 /// Shuffles 64-bit floating-point elements in `a` across lanes using the
2511 /// control in `imm8`.
2512 ///
2513 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute4x64_pd)
2514 #[inline]
2515 #[target_feature(enable = "avx2")]
2516 #[cfg_attr(test, assert_instr(vpermpd, imm8 = 1))]
2517 #[rustc_args_required_const(1)]
2518 #[stable(feature = "simd_x86", since = "1.27.0")]
2519 pub unsafe fn _mm256_permute4x64_pd(a: __m256d, imm8: i32) -> __m256d {
2520 let imm8 = (imm8 & 0xFF) as u8;
2521 let undef = _mm256_undefined_pd();
2522 macro_rules! shuffle_done {
2523 ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
2524 simd_shuffle4(a, undef, [$x01, $x23, $x45, $x67])
2525 };
2526 }
2527 macro_rules! shuffle_x67 {
2528 ($x01:expr, $x23:expr, $x45:expr) => {
2529 match (imm8 >> 6) & 0b11 {
2530 0b00 => shuffle_done!($x01, $x23, $x45, 0),
2531 0b01 => shuffle_done!($x01, $x23, $x45, 1),
2532 0b10 => shuffle_done!($x01, $x23, $x45, 2),
2533 _ => shuffle_done!($x01, $x23, $x45, 3),
2534 }
2535 };
2536 }
2537 macro_rules! shuffle_x45 {
2538 ($x01:expr, $x23:expr) => {
2539 match (imm8 >> 4) & 0b11 {
2540 0b00 => shuffle_x67!($x01, $x23, 0),
2541 0b01 => shuffle_x67!($x01, $x23, 1),
2542 0b10 => shuffle_x67!($x01, $x23, 2),
2543 _ => shuffle_x67!($x01, $x23, 3),
2544 }
2545 };
2546 }
2547 macro_rules! shuffle_x23 {
2548 ($x01:expr) => {
2549 match (imm8 >> 2) & 0b11 {
2550 0b00 => shuffle_x45!($x01, 0),
2551 0b01 => shuffle_x45!($x01, 1),
2552 0b10 => shuffle_x45!($x01, 2),
2553 _ => shuffle_x45!($x01, 3),
2554 }
2555 };
2556 }
2557 match imm8 & 0b11 {
2558 0b00 => shuffle_x23!(0),
2559 0b01 => shuffle_x23!(1),
2560 0b10 => shuffle_x23!(2),
2561 _ => shuffle_x23!(3),
2562 }
2563 }
2564
2565 /// Shuffles eight 32-bit floating-point elements in `a` across lanes using
2566 /// the corresponding 32-bit integer index in `idx`.
2567 ///
2568 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutevar8x32_ps)
2569 #[inline]
2570 #[target_feature(enable = "avx2")]
2571 #[cfg_attr(test, assert_instr(vpermps))]
2572 #[stable(feature = "simd_x86", since = "1.27.0")]
2573 pub unsafe fn _mm256_permutevar8x32_ps(a: __m256, idx: __m256i) -> __m256 {
2574 permps(a, idx.as_i32x8())
2575 }
2576
2577 /// Computes the absolute differences of packed unsigned 8-bit integers in `a`
2578 /// and `b`, then horizontally sums each consecutive 8 differences to
2579 /// produce four unsigned 16-bit integers, and packs these unsigned 16-bit
2580 /// integers in the low 16 bits of the four 64-bit elements of the result.
2581 ///
2582 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sad_epu8)
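///
/// A minimal usage sketch with illustrative values (assumes `avx2` is
/// detected at runtime; each 64-bit element sums eight absolute differences):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_set1_epi8(3);
/// let b = _mm256_set1_epi8(1);
/// // Eight differences of |3 - 1| = 2 sum to 16 in each 64-bit element.
/// let r = _mm256_sad_epu8(a, b);
/// let e = _mm256_set1_epi64x(16);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi64(r, e)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```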
2583 #[inline]
2584 #[target_feature(enable = "avx2")]
2585 #[cfg_attr(test, assert_instr(vpsadbw))]
2586 #[stable(feature = "simd_x86", since = "1.27.0")]
2587 pub unsafe fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i {
2588 transmute(psadbw(a.as_u8x32(), b.as_u8x32()))
2589 }
2590
2591 /// Shuffles bytes from `a` according to the content of `b`.
2592 ///
2593 /// The lowest 4 bits of each byte of `b` are used as addresses into the 32 bytes
2594 /// of `a`.
2595 ///
2596 /// In addition, if the most significant bit of a byte of `b` is set, the
2597 /// respective destination byte is set to 0.
2598 ///
2599 /// The low and high halves of the vectors are shuffled separately.
2600 ///
2601 /// Picturing `a` and `b` as `[u8; 32]`, `_mm256_shuffle_epi8` is logically
2602 /// equivalent to:
2603 ///
2604 /// ```
2605 /// fn mm256_shuffle_epi8(a: [u8; 32], b: [u8; 32]) -> [u8; 32] {
2606 /// let mut r = [0; 32];
2607 /// for i in 0..16 {
2608 /// // if the most significant bit of b is set,
2609 /// // then the destination byte is set to 0.
2610 /// if b[i] & 0x80 == 0u8 {
2611 /// r[i] = a[(b[i] % 16) as usize];
2612 /// }
2613 /// if b[i + 16] & 0x80 == 0u8 {
2614 /// r[i + 16] = a[(b[i + 16] % 16 + 16) as usize];
2615 /// }
2616 /// }
2617 /// r
2618 /// }
2619 /// ```
2620 ///
2621 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_epi8)
2622 #[inline]
2623 #[target_feature(enable = "avx2")]
2624 #[cfg_attr(test, assert_instr(vpshufb))]
2625 #[stable(feature = "simd_x86", since = "1.27.0")]
2626 pub unsafe fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i {
2627 transmute(pshufb(a.as_u8x32(), b.as_u8x32()))
2628 }
2629
2630 /// Shuffles 32-bit integers in 128-bit lanes of `a` using the control in
2631 /// `imm8`.
2632 ///
2633 /// ```rust
2634 /// #[cfg(target_arch = "x86")]
2635 /// use std::arch::x86::*;
2636 /// #[cfg(target_arch = "x86_64")]
2637 /// use std::arch::x86_64::*;
2638 ///
2639 /// # fn main() {
2640 /// # if is_x86_feature_detected!("avx2") {
2641 /// # #[target_feature(enable = "avx2")]
2642 /// # unsafe fn worker() {
2643 /// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
2644 ///
2645 /// let c1 = _mm256_shuffle_epi32(a, 0b00_11_10_01);
2646 /// let c2 = _mm256_shuffle_epi32(a, 0b01_00_10_11);
2647 ///
2648 /// let expected1 = _mm256_setr_epi32(1, 2, 3, 0, 5, 6, 7, 4);
2649 /// let expected2 = _mm256_setr_epi32(3, 2, 0, 1, 7, 6, 4, 5);
2650 ///
2651 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c1, expected1)), !0);
2652 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c2, expected2)), !0);
2653 /// # }
2654 /// # unsafe { worker(); }
2655 /// # }
2656 /// # }
2657 /// ```
2658 ///
2659 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_epi32)
2660 #[inline]
2661 #[target_feature(enable = "avx2")]
2662 #[cfg_attr(test, assert_instr(vpermilps, imm8 = 9))]
2663 #[rustc_args_required_const(1)]
2664 #[stable(feature = "simd_x86", since = "1.27.0")]
2665 pub unsafe fn _mm256_shuffle_epi32(a: __m256i, imm8: i32) -> __m256i {
2666 // simd_shuffleX requires that its selector parameter be made up of
2667 // constant values, but we can't enforce that here. In spirit, we need
2668 // to write a `match` on all possible values of a byte, and for each value,
2669 // hard-code the correct `simd_shuffleX` call using only constants. We
2670 // then hope for LLVM to do the rest.
2671 //
2672 // Of course, that's... awful. So we try to use macros to do it for us.
2673 let imm8 = (imm8 & 0xFF) as u8;
2674
2675 let a = a.as_i32x8();
2676 macro_rules! shuffle_done {
2677 ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
2678 simd_shuffle8(
2679 a,
2680 a,
2681 [
2682 $x01,
2683 $x23,
2684 $x45,
2685 $x67,
2686 4 + $x01,
2687 4 + $x23,
2688 4 + $x45,
2689 4 + $x67,
2690 ],
2691 )
2692 };
2693 }
2694 macro_rules! shuffle_x67 {
2695 ($x01:expr, $x23:expr, $x45:expr) => {
2696 match (imm8 >> 6) & 0b11 {
2697 0b00 => shuffle_done!($x01, $x23, $x45, 0),
2698 0b01 => shuffle_done!($x01, $x23, $x45, 1),
2699 0b10 => shuffle_done!($x01, $x23, $x45, 2),
2700 _ => shuffle_done!($x01, $x23, $x45, 3),
2701 }
2702 };
2703 }
2704 macro_rules! shuffle_x45 {
2705 ($x01:expr, $x23:expr) => {
2706 match (imm8 >> 4) & 0b11 {
2707 0b00 => shuffle_x67!($x01, $x23, 0),
2708 0b01 => shuffle_x67!($x01, $x23, 1),
2709 0b10 => shuffle_x67!($x01, $x23, 2),
2710 _ => shuffle_x67!($x01, $x23, 3),
2711 }
2712 };
2713 }
2714 macro_rules! shuffle_x23 {
2715 ($x01:expr) => {
2716 match (imm8 >> 2) & 0b11 {
2717 0b00 => shuffle_x45!($x01, 0),
2718 0b01 => shuffle_x45!($x01, 1),
2719 0b10 => shuffle_x45!($x01, 2),
2720 _ => shuffle_x45!($x01, 3),
2721 }
2722 };
2723 }
2724 let r: i32x8 = match imm8 & 0b11 {
2725 0b00 => shuffle_x23!(0),
2726 0b01 => shuffle_x23!(1),
2727 0b10 => shuffle_x23!(2),
2728 _ => shuffle_x23!(3),
2729 };
2730 transmute(r)
2731 }
2732
2733 /// Shuffles 16-bit integers in the high 64 bits of 128-bit lanes of `a` using
2734 /// the control in `imm8`. The low 64 bits of 128-bit lanes of `a` are copied
2735 /// to the output.
2736 ///
2737 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shufflehi_epi16)
2738 #[inline]
2739 #[target_feature(enable = "avx2")]
2740 #[cfg_attr(test, assert_instr(vpshufhw, imm8 = 9))]
2741 #[rustc_args_required_const(1)]
2742 #[stable(feature = "simd_x86", since = "1.27.0")]
2743 pub unsafe fn _mm256_shufflehi_epi16(a: __m256i, imm8: i32) -> __m256i {
2744 let imm8 = (imm8 & 0xFF) as u8;
2745 let a = a.as_i16x16();
2746 macro_rules! shuffle_done {
2747 ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
2748 #[rustfmt::skip]
2749 simd_shuffle16(a, a, [
2750 0, 1, 2, 3, 4+$x01, 4+$x23, 4+$x45, 4+$x67,
2751 8, 9, 10, 11, 12+$x01, 12+$x23, 12+$x45, 12+$x67
2752 ])
2753 };
2754 }
2755 macro_rules! shuffle_x67 {
2756 ($x01:expr, $x23:expr, $x45:expr) => {
2757 match (imm8 >> 6) & 0b11 {
2758 0b00 => shuffle_done!($x01, $x23, $x45, 0),
2759 0b01 => shuffle_done!($x01, $x23, $x45, 1),
2760 0b10 => shuffle_done!($x01, $x23, $x45, 2),
2761 _ => shuffle_done!($x01, $x23, $x45, 3),
2762 }
2763 };
2764 }
2765 macro_rules! shuffle_x45 {
2766 ($x01:expr, $x23:expr) => {
2767 match (imm8 >> 4) & 0b11 {
2768 0b00 => shuffle_x67!($x01, $x23, 0),
2769 0b01 => shuffle_x67!($x01, $x23, 1),
2770 0b10 => shuffle_x67!($x01, $x23, 2),
2771 _ => shuffle_x67!($x01, $x23, 3),
2772 }
2773 };
2774 }
2775 macro_rules! shuffle_x23 {
2776 ($x01:expr) => {
2777 match (imm8 >> 2) & 0b11 {
2778 0b00 => shuffle_x45!($x01, 0),
2779 0b01 => shuffle_x45!($x01, 1),
2780 0b10 => shuffle_x45!($x01, 2),
2781 _ => shuffle_x45!($x01, 3),
2782 }
2783 };
2784 }
2785 let r: i16x16 = match imm8 & 0b11 {
2786 0b00 => shuffle_x23!(0),
2787 0b01 => shuffle_x23!(1),
2788 0b10 => shuffle_x23!(2),
2789 _ => shuffle_x23!(3),
2790 };
2791 transmute(r)
2792 }
2793
2794 /// Shuffles 16-bit integers in the low 64 bits of 128-bit lanes of `a` using
2795 /// the control in `imm8`. The high 64 bits of 128-bit lanes of `a` are copied
2796 /// to the output.
2797 ///
2798 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shufflelo_epi16)
2799 #[inline]
2800 #[target_feature(enable = "avx2")]
2801 #[cfg_attr(test, assert_instr(vpshuflw, imm8 = 9))]
2802 #[rustc_args_required_const(1)]
2803 #[stable(feature = "simd_x86", since = "1.27.0")]
2804 pub unsafe fn _mm256_shufflelo_epi16(a: __m256i, imm8: i32) -> __m256i {
2805 let imm8 = (imm8 & 0xFF) as u8;
2806 let a = a.as_i16x16();
2807 macro_rules! shuffle_done {
2808 ($x01: expr, $x23: expr, $x45: expr, $x67: expr) => {
2809 #[rustfmt::skip]
2810 simd_shuffle16(a, a, [
2811 0+$x01, 0+$x23, 0+$x45, 0+$x67, 4, 5, 6, 7,
2812 8+$x01, 8+$x23, 8+$x45, 8+$x67, 12, 13, 14, 15,
2813 ])
2814 };
2815 }
2816 macro_rules! shuffle_x67 {
2817 ($x01:expr, $x23:expr, $x45:expr) => {
2818 match (imm8 >> 6) & 0b11 {
2819 0b00 => shuffle_done!($x01, $x23, $x45, 0),
2820 0b01 => shuffle_done!($x01, $x23, $x45, 1),
2821 0b10 => shuffle_done!($x01, $x23, $x45, 2),
2822 _ => shuffle_done!($x01, $x23, $x45, 3),
2823 }
2824 };
2825 }
2826 macro_rules! shuffle_x45 {
2827 ($x01:expr, $x23:expr) => {
2828 match (imm8 >> 4) & 0b11 {
2829 0b00 => shuffle_x67!($x01, $x23, 0),
2830 0b01 => shuffle_x67!($x01, $x23, 1),
2831 0b10 => shuffle_x67!($x01, $x23, 2),
2832 _ => shuffle_x67!($x01, $x23, 3),
2833 }
2834 };
2835 }
2836 macro_rules! shuffle_x23 {
2837 ($x01:expr) => {
2838 match (imm8 >> 2) & 0b11 {
2839 0b00 => shuffle_x45!($x01, 0),
2840 0b01 => shuffle_x45!($x01, 1),
2841 0b10 => shuffle_x45!($x01, 2),
2842 _ => shuffle_x45!($x01, 3),
2843 }
2844 };
2845 }
2846 let r: i16x16 = match imm8 & 0b11 {
2847 0b00 => shuffle_x23!(0),
2848 0b01 => shuffle_x23!(1),
2849 0b10 => shuffle_x23!(2),
2850 _ => shuffle_x23!(3),
2851 };
2852 transmute(r)
2853 }
2854
2855 /// Negates packed 16-bit integers in `a` when the corresponding signed
2856 /// 16-bit integer in `b` is negative, and returns the results.
2857 /// Results are zeroed out when the corresponding element in `b` is zero.
2858 ///
2859 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sign_epi16)
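///
/// A minimal usage sketch with illustrative values (assumes `avx2` is
/// detected at runtime; negative, zero and positive control values):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_set1_epi16(5);
/// let b = _mm256_setr_epi16(
///     -1, 0, 1, -1, 0, 1, -1, 0, 1, -1, 0, 1, -1, 0, 1, -1,
/// );
/// // Negative in `b` negates, zero zeroes, positive passes through.
/// let r = _mm256_sign_epi16(a, b);
/// let e = _mm256_setr_epi16(
///     -5, 0, 5, -5, 0, 5, -5, 0, 5, -5, 0, 5, -5, 0, 5, -5,
/// );
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi16(r, e)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```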
2860 #[inline]
2861 #[target_feature(enable = "avx2")]
2862 #[cfg_attr(test, assert_instr(vpsignw))]
2863 #[stable(feature = "simd_x86", since = "1.27.0")]
2864 pub unsafe fn _mm256_sign_epi16(a: __m256i, b: __m256i) -> __m256i {
2865 transmute(psignw(a.as_i16x16(), b.as_i16x16()))
2866 }
2867
2868 /// Negates packed 32-bit integers in `a` when the corresponding signed
2869 /// 32-bit integer in `b` is negative, and returns the results.
2870 /// Results are zeroed out when the corresponding element in `b` is zero.
2871 ///
2872 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sign_epi32)
2873 #[inline]
2874 #[target_feature(enable = "avx2")]
2875 #[cfg_attr(test, assert_instr(vpsignd))]
2876 #[stable(feature = "simd_x86", since = "1.27.0")]
2877 pub unsafe fn _mm256_sign_epi32(a: __m256i, b: __m256i) -> __m256i {
2878 transmute(psignd(a.as_i32x8(), b.as_i32x8()))
2879 }
2880
2881 /// Negates packed 8-bit integers in `a` when the corresponding signed
2882 /// 8-bit integer in `b` is negative, and returns the results.
2883 /// Results are zeroed out when the corresponding element in `b` is zero.
2884 ///
2885 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sign_epi8)
2886 #[inline]
2887 #[target_feature(enable = "avx2")]
2888 #[cfg_attr(test, assert_instr(vpsignb))]
2889 #[stable(feature = "simd_x86", since = "1.27.0")]
2890 pub unsafe fn _mm256_sign_epi8(a: __m256i, b: __m256i) -> __m256i {
2891 transmute(psignb(a.as_i8x32(), b.as_i8x32()))
2892 }
2893
2894 /// Shifts packed 16-bit integers in `a` left by `count` while
2895 /// shifting in zeros, and returns the result
2896 ///
2897 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sll_epi16)
2898 #[inline]
2899 #[target_feature(enable = "avx2")]
2900 #[cfg_attr(test, assert_instr(vpsllw))]
2901 #[stable(feature = "simd_x86", since = "1.27.0")]
2902 pub unsafe fn _mm256_sll_epi16(a: __m256i, count: __m128i) -> __m256i {
2903 transmute(psllw(a.as_i16x16(), count.as_i16x8()))
2904 }
2905
2906 /// Shifts packed 32-bit integers in `a` left by `count` while
2907 /// shifting in zeros, and returns the result
2908 ///
2909 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sll_epi32)
2910 #[inline]
2911 #[target_feature(enable = "avx2")]
2912 #[cfg_attr(test, assert_instr(vpslld))]
2913 #[stable(feature = "simd_x86", since = "1.27.0")]
2914 pub unsafe fn _mm256_sll_epi32(a: __m256i, count: __m128i) -> __m256i {
2915 transmute(pslld(a.as_i32x8(), count.as_i32x4()))
2916 }
2917
2918 /// Shifts packed 64-bit integers in `a` left by `count` while
2919 /// shifting in zeros, and returns the result
2920 ///
2921 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sll_epi64)
2922 #[inline]
2923 #[target_feature(enable = "avx2")]
2924 #[cfg_attr(test, assert_instr(vpsllq))]
2925 #[stable(feature = "simd_x86", since = "1.27.0")]
2926 pub unsafe fn _mm256_sll_epi64(a: __m256i, count: __m128i) -> __m256i {
2927 transmute(psllq(a.as_i64x4(), count.as_i64x2()))
2928 }
2929
2930 /// Shifts packed 16-bit integers in `a` left by `imm8` while
2931 /// shifting in zeros, and returns the result.
2932 ///
2933 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_slli_epi16)
2934 #[inline]
2935 #[target_feature(enable = "avx2")]
2936 #[cfg_attr(test, assert_instr(vpsllw))]
2937 #[stable(feature = "simd_x86", since = "1.27.0")]
2938 pub unsafe fn _mm256_slli_epi16(a: __m256i, imm8: i32) -> __m256i {
2939 transmute(pslliw(a.as_i16x16(), imm8))
2940 }
2941
2942 /// Shifts packed 32-bit integers in `a` left by `imm8` while
2943 /// shifting in zeros, and returns the result.
2944 ///
2945 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_slli_epi32)
2946 #[inline]
2947 #[target_feature(enable = "avx2")]
2948 #[cfg_attr(test, assert_instr(vpslld))]
2949 #[stable(feature = "simd_x86", since = "1.27.0")]
2950 pub unsafe fn _mm256_slli_epi32(a: __m256i, imm8: i32) -> __m256i {
2951 transmute(psllid(a.as_i32x8(), imm8))
2952 }
2953
2954 /// Shifts packed 64-bit integers in `a` left by `imm8` while
2955 /// shifting in zeros, and returns the result.
2956 ///
2957 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_slli_epi64)
2958 #[inline]
2959 #[target_feature(enable = "avx2")]
2960 #[cfg_attr(test, assert_instr(vpsllq))]
2961 #[stable(feature = "simd_x86", since = "1.27.0")]
2962 pub unsafe fn _mm256_slli_epi64(a: __m256i, imm8: i32) -> __m256i {
2963 transmute(pslliq(a.as_i64x4(), imm8))
2964 }
2965
2966 /// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros.
2967 ///
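/// An illustrative sketch (not upstream documentation), assuming AVX2 is
/// detected at runtime; the byte count of 4 is arbitrary:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_set1_epi32(1);
/// // each 128-bit lane is shifted left by 4 bytes, so its lowest
/// // 32-bit element becomes zero
/// let r = _mm256_slli_si256(a, 4);
/// let expected = _mm256_setr_epi32(0, 1, 1, 1, 0, 1, 1, 1);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///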
2968 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_slli_si256)
2969 #[inline]
2970 #[target_feature(enable = "avx2")]
2971 #[cfg_attr(test, assert_instr(vpslldq, imm8 = 3))]
2972 #[rustc_args_required_const(1)]
2973 #[stable(feature = "simd_x86", since = "1.27.0")]
2974 pub unsafe fn _mm256_slli_si256(a: __m256i, imm8: i32) -> __m256i {
2975 let a = a.as_i64x4();
2976 macro_rules! call {
2977 ($imm8:expr) => {
2978 vpslldq(a, $imm8)
2979 };
2980 }
2981 transmute(constify_imm8!(imm8 * 8, call))
2982 }
2983
2984 /// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros.
2985 ///
2986 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bslli_epi128)
2987 #[inline]
2988 #[target_feature(enable = "avx2")]
2989 #[cfg_attr(test, assert_instr(vpslldq, imm8 = 3))]
2990 #[rustc_args_required_const(1)]
2991 #[stable(feature = "simd_x86", since = "1.27.0")]
2992 pub unsafe fn _mm256_bslli_epi128(a: __m256i, imm8: i32) -> __m256i {
2993 let a = a.as_i64x4();
2994 macro_rules! call {
2995 ($imm8:expr) => {
2996 vpslldq(a, $imm8)
2997 };
2998 }
2999 transmute(constify_imm8!(imm8 * 8, call))
3000 }
3001
3002 /// Shifts packed 32-bit integers in `a` left by the amount
3003 /// specified by the corresponding element in `count` while
3004 /// shifting in zeros, and returns the result.
3005 ///
3006 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sllv_epi32)
3007 #[inline]
3008 #[target_feature(enable = "avx2")]
3009 #[cfg_attr(test, assert_instr(vpsllvd))]
3010 #[stable(feature = "simd_x86", since = "1.27.0")]
3011 pub unsafe fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i {
3012 transmute(psllvd(a.as_i32x4(), count.as_i32x4()))
3013 }
3014
3015 /// Shifts packed 32-bit integers in `a` left by the amount
3016 /// specified by the corresponding element in `count` while
3017 /// shifting in zeros, and returns the result.
3018 ///
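/// An illustrative sketch (not upstream documentation), assuming AVX2 is
/// detected at runtime; each 32-bit lane uses its own shift count:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_set1_epi32(1);
/// let count = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
/// let r = _mm256_sllv_epi32(a, count);
/// let expected = _mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 128);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///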
3019 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sllv_epi32)
3020 #[inline]
3021 #[target_feature(enable = "avx2")]
3022 #[cfg_attr(test, assert_instr(vpsllvd))]
3023 #[stable(feature = "simd_x86", since = "1.27.0")]
3024 pub unsafe fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i {
3025 transmute(psllvd256(a.as_i32x8(), count.as_i32x8()))
3026 }
3027
3028 /// Shifts packed 64-bit integers in `a` left by the amount
3029 /// specified by the corresponding element in `count` while
3030 /// shifting in zeros, and returns the result.
3031 ///
3032 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sllv_epi64)
3033 #[inline]
3034 #[target_feature(enable = "avx2")]
3035 #[cfg_attr(test, assert_instr(vpsllvq))]
3036 #[stable(feature = "simd_x86", since = "1.27.0")]
3037 pub unsafe fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i {
3038 transmute(psllvq(a.as_i64x2(), count.as_i64x2()))
3039 }
3040
3041 /// Shifts packed 64-bit integers in `a` left by the amount
3042 /// specified by the corresponding element in `count` while
3043 /// shifting in zeros, and returns the result.
3044 ///
3045 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sllv_epi64)
3046 #[inline]
3047 #[target_feature(enable = "avx2")]
3048 #[cfg_attr(test, assert_instr(vpsllvq))]
3049 #[stable(feature = "simd_x86", since = "1.27.0")]
3050 pub unsafe fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i {
3051 transmute(psllvq256(a.as_i64x4(), count.as_i64x4()))
3052 }
3053
3054 /// Shifts packed 16-bit integers in `a` right by `count` while
3055 /// shifting in sign bits.
3056 ///
3057 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sra_epi16)
3058 #[inline]
3059 #[target_feature(enable = "avx2")]
3060 #[cfg_attr(test, assert_instr(vpsraw))]
3061 #[stable(feature = "simd_x86", since = "1.27.0")]
3062 pub unsafe fn _mm256_sra_epi16(a: __m256i, count: __m128i) -> __m256i {
3063 transmute(psraw(a.as_i16x16(), count.as_i16x8()))
3064 }
3065
3066 /// Shifts packed 32-bit integers in `a` right by `count` while
3067 /// shifting in sign bits.
3068 ///
3069 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sra_epi32)
3070 #[inline]
3071 #[target_feature(enable = "avx2")]
3072 #[cfg_attr(test, assert_instr(vpsrad))]
3073 #[stable(feature = "simd_x86", since = "1.27.0")]
3074 pub unsafe fn _mm256_sra_epi32(a: __m256i, count: __m128i) -> __m256i {
3075 transmute(psrad(a.as_i32x8(), count.as_i32x4()))
3076 }
3077
3078 /// Shifts packed 16-bit integers in `a` right by `imm8` while
3079 /// shifting in sign bits.
3080 ///
3081 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srai_epi16)
3082 #[inline]
3083 #[target_feature(enable = "avx2")]
3084 #[cfg_attr(test, assert_instr(vpsraw))]
3085 #[stable(feature = "simd_x86", since = "1.27.0")]
3086 pub unsafe fn _mm256_srai_epi16(a: __m256i, imm8: i32) -> __m256i {
3087 transmute(psraiw(a.as_i16x16(), imm8))
3088 }
3089
3090 /// Shifts packed 32-bit integers in `a` right by `imm8` while
3091 /// shifting in sign bits.
3092 ///
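/// An illustrative sketch (not upstream documentation), assuming AVX2 is
/// detected at runtime; it shows that the arithmetic shift keeps the sign:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_set1_epi32(-8);
/// // the sign bit is replicated: -8 >> 2 == -2
/// let r = _mm256_srai_epi32(a, 2);
/// let expected = _mm256_set1_epi32(-2);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///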
3093 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srai_epi32)
3094 #[inline]
3095 #[target_feature(enable = "avx2")]
3096 #[cfg_attr(test, assert_instr(vpsrad))]
3097 #[stable(feature = "simd_x86", since = "1.27.0")]
3098 pub unsafe fn _mm256_srai_epi32(a: __m256i, imm8: i32) -> __m256i {
3099 transmute(psraid(a.as_i32x8(), imm8))
3100 }
3101
3102 /// Shifts packed 32-bit integers in `a` right by the amount specified by the
3103 /// corresponding element in `count` while shifting in sign bits.
3104 ///
3105 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srav_epi32)
3106 #[inline]
3107 #[target_feature(enable = "avx2")]
3108 #[cfg_attr(test, assert_instr(vpsravd))]
3109 #[stable(feature = "simd_x86", since = "1.27.0")]
3110 pub unsafe fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i {
3111 transmute(psravd(a.as_i32x4(), count.as_i32x4()))
3112 }
3113
3114 /// Shifts packed 32-bit integers in `a` right by the amount specified by the
3115 /// corresponding element in `count` while shifting in sign bits.
3116 ///
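/// An illustrative sketch (not upstream documentation), assuming AVX2 is
/// detected at runtime; negative lanes keep their sign while each lane uses
/// its own shift count:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_setr_epi32(-16, -16, -16, -16, 16, 16, 16, 16);
/// let count = _mm256_setr_epi32(1, 2, 3, 4, 1, 2, 3, 4);
/// let r = _mm256_srav_epi32(a, count);
/// let expected = _mm256_setr_epi32(-8, -4, -2, -1, 8, 4, 2, 1);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///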
3117 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srav_epi32)
3118 #[inline]
3119 #[target_feature(enable = "avx2")]
3120 #[cfg_attr(test, assert_instr(vpsravd))]
3121 #[stable(feature = "simd_x86", since = "1.27.0")]
3122 pub unsafe fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i {
3123 transmute(psravd256(a.as_i32x8(), count.as_i32x8()))
3124 }
3125
3126 /// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros.
3127 ///
3128 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srli_si256)
3129 #[inline]
3130 #[target_feature(enable = "avx2")]
3131 #[cfg_attr(test, assert_instr(vpsrldq, imm8 = 3))]
3132 #[rustc_args_required_const(1)]
3133 #[stable(feature = "simd_x86", since = "1.27.0")]
3134 pub unsafe fn _mm256_srli_si256(a: __m256i, imm8: i32) -> __m256i {
3135 let a = a.as_i64x4();
3136 macro_rules! call {
3137 ($imm8:expr) => {
3138 vpsrldq(a, $imm8)
3139 };
3140 }
3141 transmute(constify_imm8!(imm8 * 8, call))
3142 }
3143
3144 /// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros.
3145 ///
3146 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bsrli_epi128)
3147 #[inline]
3148 #[target_feature(enable = "avx2")]
3149 #[cfg_attr(test, assert_instr(vpsrldq, imm8 = 3))]
3150 #[rustc_args_required_const(1)]
3151 #[stable(feature = "simd_x86", since = "1.27.0")]
3152 pub unsafe fn _mm256_bsrli_epi128(a: __m256i, imm8: i32) -> __m256i {
3153 let a = a.as_i64x4();
3154 macro_rules! call {
3155 ($imm8:expr) => {
3156 vpsrldq(a, $imm8)
3157 };
3158 }
3159 transmute(constify_imm8!(imm8 * 8, call))
3160 }
3161
3162 /// Shifts packed 16-bit integers in `a` right by `count` while shifting in
3163 /// zeros.
3164 ///
3165 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srl_epi16)
3166 #[inline]
3167 #[target_feature(enable = "avx2")]
3168 #[cfg_attr(test, assert_instr(vpsrlw))]
3169 #[stable(feature = "simd_x86", since = "1.27.0")]
3170 pub unsafe fn _mm256_srl_epi16(a: __m256i, count: __m128i) -> __m256i {
3171 transmute(psrlw(a.as_i16x16(), count.as_i16x8()))
3172 }
3173
3174 /// Shifts packed 32-bit integers in `a` right by `count` while shifting in
3175 /// zeros.
3176 ///
3177 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srl_epi32)
3178 #[inline]
3179 #[target_feature(enable = "avx2")]
3180 #[cfg_attr(test, assert_instr(vpsrld))]
3181 #[stable(feature = "simd_x86", since = "1.27.0")]
3182 pub unsafe fn _mm256_srl_epi32(a: __m256i, count: __m128i) -> __m256i {
3183 transmute(psrld(a.as_i32x8(), count.as_i32x4()))
3184 }
3185
3186 /// Shifts packed 64-bit integers in `a` right by `count` while shifting in
3187 /// zeros.
3188 ///
3189 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srl_epi64)
3190 #[inline]
3191 #[target_feature(enable = "avx2")]
3192 #[cfg_attr(test, assert_instr(vpsrlq))]
3193 #[stable(feature = "simd_x86", since = "1.27.0")]
3194 pub unsafe fn _mm256_srl_epi64(a: __m256i, count: __m128i) -> __m256i {
3195 transmute(psrlq(a.as_i64x4(), count.as_i64x2()))
3196 }
3197
3198 /// Shifts packed 16-bit integers in `a` right by `imm8` while shifting in
3199 /// zeros.
3200 ///
3201 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srli_epi16)
3202 #[inline]
3203 #[target_feature(enable = "avx2")]
3204 #[cfg_attr(test, assert_instr(vpsrlw))]
3205 #[stable(feature = "simd_x86", since = "1.27.0")]
3206 pub unsafe fn _mm256_srli_epi16(a: __m256i, imm8: i32) -> __m256i {
3207 transmute(psrliw(a.as_i16x16(), imm8))
3208 }
3209
3210 /// Shifts packed 32-bit integers in `a` right by `imm8` while shifting in
3211 /// zeros.
3212 ///
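/// An illustrative sketch (not upstream documentation), assuming AVX2 is
/// detected at runtime; it shows that the logical shift fills with zeros:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_set1_epi32(-1);
/// // 0xFFFF_FFFF logically shifted right by 28 is 0xF
/// let r = _mm256_srli_epi32(a, 28);
/// let expected = _mm256_set1_epi32(0xF);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///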
3213 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srli_epi32)
3214 #[inline]
3215 #[target_feature(enable = "avx2")]
3216 #[cfg_attr(test, assert_instr(vpsrld))]
3217 #[stable(feature = "simd_x86", since = "1.27.0")]
3218 pub unsafe fn _mm256_srli_epi32(a: __m256i, imm8: i32) -> __m256i {
3219 transmute(psrlid(a.as_i32x8(), imm8))
3220 }
3221
3222 /// Shifts packed 64-bit integers in `a` right by `imm8` while shifting in
3223 /// zeros.
3224 ///
3225 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srli_epi64)
3226 #[inline]
3227 #[target_feature(enable = "avx2")]
3228 #[cfg_attr(test, assert_instr(vpsrlq))]
3229 #[stable(feature = "simd_x86", since = "1.27.0")]
3230 pub unsafe fn _mm256_srli_epi64(a: __m256i, imm8: i32) -> __m256i {
3231 transmute(psrliq(a.as_i64x4(), imm8))
3232 }
3233
3234 /// Shifts packed 32-bit integers in `a` right by the amount specified by
3235 /// the corresponding element in `count` while shifting in zeros.
3236 ///
3237 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srlv_epi32)
3238 #[inline]
3239 #[target_feature(enable = "avx2")]
3240 #[cfg_attr(test, assert_instr(vpsrlvd))]
3241 #[stable(feature = "simd_x86", since = "1.27.0")]
3242 pub unsafe fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i {
3243 transmute(psrlvd(a.as_i32x4(), count.as_i32x4()))
3244 }
3245
3246 /// Shifts packed 32-bit integers in `a` right by the amount specified by
3247 /// the corresponding element in `count` while shifting in zeros.
3248 ///
3249 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srlv_epi32)
3250 #[inline]
3251 #[target_feature(enable = "avx2")]
3252 #[cfg_attr(test, assert_instr(vpsrlvd))]
3253 #[stable(feature = "simd_x86", since = "1.27.0")]
3254 pub unsafe fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i {
3255 transmute(psrlvd256(a.as_i32x8(), count.as_i32x8()))
3256 }
3257
3258 /// Shifts packed 64-bit integers in `a` right by the amount specified by
3259 /// the corresponding element in `count` while shifting in zeros.
3260 ///
3261 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srlv_epi64)
3262 #[inline]
3263 #[target_feature(enable = "avx2")]
3264 #[cfg_attr(test, assert_instr(vpsrlvq))]
3265 #[stable(feature = "simd_x86", since = "1.27.0")]
3266 pub unsafe fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i {
3267 transmute(psrlvq(a.as_i64x2(), count.as_i64x2()))
3268 }
3269
3270 /// Shifts packed 64-bit integers in `a` right by the amount specified by
3271 /// the corresponding element in `count` while shifting in zeros.
3272 ///
3273 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srlv_epi64)
3274 #[inline]
3275 #[target_feature(enable = "avx2")]
3276 #[cfg_attr(test, assert_instr(vpsrlvq))]
3277 #[stable(feature = "simd_x86", since = "1.27.0")]
3278 pub unsafe fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i {
3279 transmute(psrlvq256(a.as_i64x4(), count.as_i64x4()))
3280 }
3281
3282 // TODO _mm256_stream_load_si256 (__m256i const* mem_addr)
3283
3284 /// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`.
3285 ///
3286 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_epi16)
3287 #[inline]
3288 #[target_feature(enable = "avx2")]
3289 #[cfg_attr(test, assert_instr(vpsubw))]
3290 #[stable(feature = "simd_x86", since = "1.27.0")]
3291 pub unsafe fn _mm256_sub_epi16(a: __m256i, b: __m256i) -> __m256i {
3292 transmute(simd_sub(a.as_i16x16(), b.as_i16x16()))
3293 }
3294
3295 /// Subtracts packed 32-bit integers in `b` from packed 32-bit integers in `a`.
3296 ///
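/// A short usage sketch, added for illustration (not upstream documentation);
/// it assumes AVX2 is detected at runtime and uses arbitrary values:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_setr_epi32(10, 20, 30, 40, 50, 60, 70, 80);
/// let b = _mm256_set1_epi32(5);
/// let r = _mm256_sub_epi32(a, b);
/// let expected = _mm256_setr_epi32(5, 15, 25, 35, 45, 55, 65, 75);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///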
3297 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_epi32)
3298 #[inline]
3299 #[target_feature(enable = "avx2")]
3300 #[cfg_attr(test, assert_instr(vpsubd))]
3301 #[stable(feature = "simd_x86", since = "1.27.0")]
3302 pub unsafe fn _mm256_sub_epi32(a: __m256i, b: __m256i) -> __m256i {
3303 transmute(simd_sub(a.as_i32x8(), b.as_i32x8()))
3304 }
3305
3306 /// Subtracts packed 64-bit integers in `b` from packed 64-bit integers in `a`.
3307 ///
3308 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_epi64)
3309 #[inline]
3310 #[target_feature(enable = "avx2")]
3311 #[cfg_attr(test, assert_instr(vpsubq))]
3312 #[stable(feature = "simd_x86", since = "1.27.0")]
3313 pub unsafe fn _mm256_sub_epi64(a: __m256i, b: __m256i) -> __m256i {
3314 transmute(simd_sub(a.as_i64x4(), b.as_i64x4()))
3315 }
3316
3317 /// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`.
3318 ///
3319 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_epi8)
3320 #[inline]
3321 #[target_feature(enable = "avx2")]
3322 #[cfg_attr(test, assert_instr(vpsubb))]
3323 #[stable(feature = "simd_x86", since = "1.27.0")]
3324 pub unsafe fn _mm256_sub_epi8(a: __m256i, b: __m256i) -> __m256i {
3325 transmute(simd_sub(a.as_i8x32(), b.as_i8x32()))
3326 }
3327
3328 /// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in
3329 /// `a` using saturation.
3330 ///
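/// An illustrative sketch (not upstream documentation), assuming AVX2 is
/// detected at runtime; it shows the result clamping at `i16::MIN` instead
/// of wrapping:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_set1_epi16(std::i16::MIN);
/// let b = _mm256_set1_epi16(1);
/// // `i16::MIN - 1` saturates to `i16::MIN`
/// let r = _mm256_subs_epi16(a, b);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi16(r, a)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///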
3331 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_subs_epi16)
3332 #[inline]
3333 #[target_feature(enable = "avx2")]
3334 #[cfg_attr(test, assert_instr(vpsubsw))]
3335 #[stable(feature = "simd_x86", since = "1.27.0")]
3336 pub unsafe fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i {
3337 transmute(psubsw(a.as_i16x16(), b.as_i16x16()))
3338 }
3339
3340 /// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in
3341 /// `a` using saturation.
3342 ///
3343 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_subs_epi8)
3344 #[inline]
3345 #[target_feature(enable = "avx2")]
3346 #[cfg_attr(test, assert_instr(vpsubsb))]
3347 #[stable(feature = "simd_x86", since = "1.27.0")]
3348 pub unsafe fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i {
3349 transmute(psubsb(a.as_i8x32(), b.as_i8x32()))
3350 }
3351
3352 /// Subtracts packed unsigned 16-bit integers in `b` from packed unsigned
3353 /// 16-bit integers in `a` using saturation.
3354 ///
3355 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_subs_epu16)
3356 #[inline]
3357 #[target_feature(enable = "avx2")]
3358 #[cfg_attr(test, assert_instr(vpsubusw))]
3359 #[stable(feature = "simd_x86", since = "1.27.0")]
3360 pub unsafe fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> __m256i {
3361 transmute(psubusw(a.as_u16x16(), b.as_u16x16()))
3362 }
3363
3364 /// Subtracts packed unsigned 8-bit integers in `b` from packed unsigned
3365 /// 8-bit integers in `a` using saturation.
3366 ///
3367 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_subs_epu8)
3368 #[inline]
3369 #[target_feature(enable = "avx2")]
3370 #[cfg_attr(test, assert_instr(vpsubusb))]
3371 #[stable(feature = "simd_x86", since = "1.27.0")]
3372 pub unsafe fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i {
3373 transmute(psubusb(a.as_u8x32(), b.as_u8x32()))
3374 }
3375
3376 /// Unpacks and interleaves 8-bit integers from the high half of each
3377 /// 128-bit lane in `a` and `b`.
3378 ///
3379 /// ```rust
3380 /// #[cfg(target_arch = "x86")]
3381 /// use std::arch::x86::*;
3382 /// #[cfg(target_arch = "x86_64")]
3383 /// use std::arch::x86_64::*;
3384 ///
3385 /// # fn main() {
3386 /// # if is_x86_feature_detected!("avx2") {
3387 /// # #[target_feature(enable = "avx2")]
3388 /// # unsafe fn worker() {
3389 /// let a = _mm256_setr_epi8(
3390 /// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
3391 /// 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3392 /// );
3393 /// let b = _mm256_setr_epi8(
3394 /// 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3395 /// -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
3396 /// -30, -31,
3397 /// );
3398 ///
3399 /// let c = _mm256_unpackhi_epi8(a, b);
3400 ///
3401 /// let expected = _mm256_setr_epi8(
3402 /// 8, -8, 9, -9, 10, -10, 11, -11, 12, -12, 13, -13, 14, -14, 15, -15,
3403 /// 24, -24, 25, -25, 26, -26, 27, -27, 28, -28, 29, -29, 30, -30, 31,
3404 /// -31,
3405 /// );
3406 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3407 ///
3408 /// # }
3409 /// # unsafe { worker(); }
3410 /// # }
3411 /// # }
3412 /// ```
3413 ///
3414 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_epi8)
3415 #[inline]
3416 #[target_feature(enable = "avx2")]
3417 #[cfg_attr(test, assert_instr(vpunpckhbw))]
3418 #[stable(feature = "simd_x86", since = "1.27.0")]
3419 pub unsafe fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i {
3420 #[rustfmt::skip]
3421 let r: i8x32 = simd_shuffle32(a.as_i8x32(), b.as_i8x32(), [
3422 8, 40, 9, 41, 10, 42, 11, 43,
3423 12, 44, 13, 45, 14, 46, 15, 47,
3424 24, 56, 25, 57, 26, 58, 27, 59,
3425 28, 60, 29, 61, 30, 62, 31, 63,
3426 ]);
3427 transmute(r)
3428 }
3429
3430 /// Unpacks and interleaves 8-bit integers from the low half of each
3431 /// 128-bit lane of `a` and `b`.
3432 ///
3433 /// ```rust
3434 /// #[cfg(target_arch = "x86")]
3435 /// use std::arch::x86::*;
3436 /// #[cfg(target_arch = "x86_64")]
3437 /// use std::arch::x86_64::*;
3438 ///
3439 /// # fn main() {
3440 /// # if is_x86_feature_detected!("avx2") {
3441 /// # #[target_feature(enable = "avx2")]
3442 /// # unsafe fn worker() {
3443 /// let a = _mm256_setr_epi8(
3444 /// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
3445 /// 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3446 /// );
3447 /// let b = _mm256_setr_epi8(
3448 /// 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3449 /// -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
3450 /// -30, -31,
3451 /// );
3452 ///
3453 /// let c = _mm256_unpacklo_epi8(a, b);
3454 ///
3455 /// let expected = _mm256_setr_epi8(
3456 /// 0, 0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7, 16, -16, 17,
3457 /// -17, 18, -18, 19, -19, 20, -20, 21, -21, 22, -22, 23, -23,
3458 /// );
3459 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3460 ///
3461 /// # }
3462 /// # unsafe { worker(); }
3463 /// # }
3464 /// # }
3465 /// ```
3466 ///
3467 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_epi8)
3468 #[inline]
3469 #[target_feature(enable = "avx2")]
3470 #[cfg_attr(test, assert_instr(vpunpcklbw))]
3471 #[stable(feature = "simd_x86", since = "1.27.0")]
3472 pub unsafe fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i {
3473 #[rustfmt::skip]
3474 let r: i8x32 = simd_shuffle32(a.as_i8x32(), b.as_i8x32(), [
3475 0, 32, 1, 33, 2, 34, 3, 35,
3476 4, 36, 5, 37, 6, 38, 7, 39,
3477 16, 48, 17, 49, 18, 50, 19, 51,
3478 20, 52, 21, 53, 22, 54, 23, 55,
3479 ]);
3480 transmute(r)
3481 }
3482
3483 /// Unpacks and interleaves 16-bit integers from the high half of each
3484 /// 128-bit lane of `a` and `b`.
3485 ///
3486 /// ```rust
3487 /// #[cfg(target_arch = "x86")]
3488 /// use std::arch::x86::*;
3489 /// #[cfg(target_arch = "x86_64")]
3490 /// use std::arch::x86_64::*;
3491 ///
3492 /// # fn main() {
3493 /// # if is_x86_feature_detected!("avx2") {
3494 /// # #[target_feature(enable = "avx2")]
3495 /// # unsafe fn worker() {
3496 /// let a = _mm256_setr_epi16(
3497 /// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3498 /// );
3499 /// let b = _mm256_setr_epi16(
3500 /// 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3501 /// );
3502 ///
3503 /// let c = _mm256_unpackhi_epi16(a, b);
3504 ///
3505 /// let expected = _mm256_setr_epi16(
3506 /// 4, -4, 5, -5, 6, -6, 7, -7, 12, -12, 13, -13, 14, -14, 15, -15,
3507 /// );
3508 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3509 ///
3510 /// # }
3511 /// # unsafe { worker(); }
3512 /// # }
3513 /// # }
3514 /// ```
3515 ///
3516 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_epi16)
3517 #[inline]
3518 #[target_feature(enable = "avx2")]
3519 #[cfg_attr(test, assert_instr(vpunpckhwd))]
3520 #[stable(feature = "simd_x86", since = "1.27.0")]
3521 pub unsafe fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i {
3522 let r: i16x16 = simd_shuffle16(
3523 a.as_i16x16(),
3524 b.as_i16x16(),
3525 [4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31],
3526 );
3527 transmute(r)
3528 }
3529
3530 /// Unpacks and interleaves 16-bit integers from the low half of each
3531 /// 128-bit lane of `a` and `b`.
3532 ///
3533 /// ```rust
3534 /// #[cfg(target_arch = "x86")]
3535 /// use std::arch::x86::*;
3536 /// #[cfg(target_arch = "x86_64")]
3537 /// use std::arch::x86_64::*;
3538 ///
3539 /// # fn main() {
3540 /// # if is_x86_feature_detected!("avx2") {
3541 /// # #[target_feature(enable = "avx2")]
3542 /// # unsafe fn worker() {
3544 /// let a = _mm256_setr_epi16(
3545 /// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3546 /// );
3547 /// let b = _mm256_setr_epi16(
3548 /// 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3549 /// );
3550 ///
3551 /// let c = _mm256_unpacklo_epi16(a, b);
3552 ///
3553 /// let expected = _mm256_setr_epi16(
3554 /// 0, 0, 1, -1, 2, -2, 3, -3, 8, -8, 9, -9, 10, -10, 11, -11,
3555 /// );
3556 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3557 ///
3558 /// # }
3559 /// # unsafe { worker(); }
3560 /// # }
3561 /// # }
3562 /// ```
3563 ///
3564 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_epi16)
3565 #[inline]
3566 #[target_feature(enable = "avx2")]
3567 #[cfg_attr(test, assert_instr(vpunpcklwd))]
3568 #[stable(feature = "simd_x86", since = "1.27.0")]
3569 pub unsafe fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i {
3570 let r: i16x16 = simd_shuffle16(
3571 a.as_i16x16(),
3572 b.as_i16x16(),
3573 [0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27],
3574 );
3575 transmute(r)
3576 }
3577
3578 /// Unpacks and interleaves 32-bit integers from the high half of each
3579 /// 128-bit lane of `a` and `b`.
3580 ///
3581 /// ```rust
3582 /// #[cfg(target_arch = "x86")]
3583 /// use std::arch::x86::*;
3584 /// #[cfg(target_arch = "x86_64")]
3585 /// use std::arch::x86_64::*;
3586 ///
3587 /// # fn main() {
3588 /// # if is_x86_feature_detected!("avx2") {
3589 /// # #[target_feature(enable = "avx2")]
3590 /// # unsafe fn worker() {
3591 /// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
3592 /// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7);
3593 ///
3594 /// let c = _mm256_unpackhi_epi32(a, b);
3595 ///
3596 /// let expected = _mm256_setr_epi32(2, -2, 3, -3, 6, -6, 7, -7);
3597 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3598 ///
3599 /// # }
3600 /// # unsafe { worker(); }
3601 /// # }
3602 /// # }
3603 /// ```
3604 ///
3605 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_epi32)
3606 #[inline]
3607 #[target_feature(enable = "avx2")]
3608 #[cfg_attr(test, assert_instr(vunpckhps))]
3609 #[stable(feature = "simd_x86", since = "1.27.0")]
3610 pub unsafe fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i {
3611 let r: i32x8 = simd_shuffle8(a.as_i32x8(), b.as_i32x8(), [2, 10, 3, 11, 6, 14, 7, 15]);
3612 transmute(r)
3613 }
3614
3615 /// Unpacks and interleaves 32-bit integers from the low half of each
3616 /// 128-bit lane of `a` and `b`.
3617 ///
3618 /// ```rust
3619 /// #[cfg(target_arch = "x86")]
3620 /// use std::arch::x86::*;
3621 /// #[cfg(target_arch = "x86_64")]
3622 /// use std::arch::x86_64::*;
3623 ///
3624 /// # fn main() {
3625 /// # if is_x86_feature_detected!("avx2") {
3626 /// # #[target_feature(enable = "avx2")]
3627 /// # unsafe fn worker() {
3628 /// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
3629 /// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7);
3630 ///
3631 /// let c = _mm256_unpacklo_epi32(a, b);
3632 ///
3633 /// let expected = _mm256_setr_epi32(0, 0, 1, -1, 4, -4, 5, -5);
3634 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3635 ///
3636 /// # }
3637 /// # unsafe { worker(); }
3638 /// # }
3639 /// # }
3640 /// ```
3641 ///
3642 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_epi32)
3643 #[inline]
3644 #[target_feature(enable = "avx2")]
3645 #[cfg_attr(test, assert_instr(vunpcklps))]
3646 #[stable(feature = "simd_x86", since = "1.27.0")]
3647 pub unsafe fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i {
3648 let r: i32x8 = simd_shuffle8(a.as_i32x8(), b.as_i32x8(), [0, 8, 1, 9, 4, 12, 5, 13]);
3649 transmute(r)
3650 }
3651
3652 /// Unpacks and interleaves 64-bit integers from the high half of each
3653 /// 128-bit lane of `a` and `b`.
3654 ///
3655 /// ```rust
3656 /// #[cfg(target_arch = "x86")]
3657 /// use std::arch::x86::*;
3658 /// #[cfg(target_arch = "x86_64")]
3659 /// use std::arch::x86_64::*;
3660 ///
3661 /// # fn main() {
3662 /// # if is_x86_feature_detected!("avx2") {
3663 /// # #[target_feature(enable = "avx2")]
3664 /// # unsafe fn worker() {
3665 /// let a = _mm256_setr_epi64x(0, 1, 2, 3);
3666 /// let b = _mm256_setr_epi64x(0, -1, -2, -3);
3667 ///
3668 /// let c = _mm256_unpackhi_epi64(a, b);
3669 ///
3670 /// let expected = _mm256_setr_epi64x(1, -1, 3, -3);
3671 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3672 ///
3673 /// # }
3674 /// # unsafe { worker(); }
3675 /// # }
3676 /// # }
3677 /// ```
3678 ///
3679 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_epi64)
3680 #[inline]
3681 #[target_feature(enable = "avx2")]
3682 #[cfg_attr(test, assert_instr(vunpckhpd))]
3683 #[stable(feature = "simd_x86", since = "1.27.0")]
3684 pub unsafe fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i {
3685 let r: i64x4 = simd_shuffle4(a.as_i64x4(), b.as_i64x4(), [1, 5, 3, 7]);
3686 transmute(r)
3687 }
3688
3689 /// Unpacks and interleaves 64-bit integers from the low half of each
3690 /// 128-bit lane of `a` and `b`.
3691 ///
3692 /// ```rust
3693 /// #[cfg(target_arch = "x86")]
3694 /// use std::arch::x86::*;
3695 /// #[cfg(target_arch = "x86_64")]
3696 /// use std::arch::x86_64::*;
3697 ///
3698 /// # fn main() {
3699 /// # if is_x86_feature_detected!("avx2") {
3700 /// # #[target_feature(enable = "avx2")]
3701 /// # unsafe fn worker() {
3702 /// let a = _mm256_setr_epi64x(0, 1, 2, 3);
3703 /// let b = _mm256_setr_epi64x(0, -1, -2, -3);
3704 ///
3705 /// let c = _mm256_unpacklo_epi64(a, b);
3706 ///
3707 /// let expected = _mm256_setr_epi64x(0, 0, 2, -2);
3708 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3709 ///
3710 /// # }
3711 /// # unsafe { worker(); }
3712 /// # }
3713 /// # }
3714 /// ```
3715 ///
3716 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_epi64)
3717 #[inline]
3718 #[target_feature(enable = "avx2")]
3719 #[cfg_attr(test, assert_instr(vunpcklpd))]
3720 #[stable(feature = "simd_x86", since = "1.27.0")]
3721 pub unsafe fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i {
3722 let r: i64x4 = simd_shuffle4(a.as_i64x4(), b.as_i64x4(), [0, 4, 2, 6]);
3723 transmute(r)
3724 }
3725
3726 /// Computes the bitwise XOR of 256 bits (representing integer data)
3727 /// in `a` and `b`.
3728 ///
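/// A short usage sketch, added for illustration (not upstream documentation);
/// it assumes AVX2 is detected at runtime and XORs two arbitrary bit patterns:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_set1_epi8(0b0101);
/// let b = _mm256_set1_epi8(0b0011);
/// let r = _mm256_xor_si256(a, b);
/// // 0b0101 ^ 0b0011 == 0b0110 in every byte
/// let expected = _mm256_set1_epi8(0b0110);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///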
3729 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_xor_si256)
3730 #[inline]
3731 #[target_feature(enable = "avx2")]
3732 #[cfg_attr(test, assert_instr(vxorps))]
3733 #[stable(feature = "simd_x86", since = "1.27.0")]
3734 pub unsafe fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i {
3735 transmute(simd_xor(a.as_i64x4(), b.as_i64x4()))
3736 }
3737
3738 /// Extracts an 8-bit integer from `a`, selected with `imm8`. Returns the
3739 /// extracted 8-bit integer.
3740 ///
3741 /// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
3742 ///
3743 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extract_epi8)
3744 #[inline]
3745 #[target_feature(enable = "avx2")]
3746 // This intrinsic has no corresponding instruction.
3747 #[rustc_args_required_const(1)]
3748 #[stable(feature = "simd_x86", since = "1.27.0")]
3749 pub unsafe fn _mm256_extract_epi8(a: __m256i, imm8: i32) -> i8 {
3750 let imm8 = (imm8 & 31) as u32;
3751 simd_extract(a.as_i8x32(), imm8)
3752 }
3753
3754 /// Extracts a 16-bit integer from `a`, selected with `imm8`. Returns the
3755 /// extracted 16-bit integer.
3756 ///
3757 /// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
3758 ///
3759 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extract_epi16)
3760 #[inline]
3761 #[target_feature(enable = "avx2")]
3762 // This intrinsic has no corresponding instruction.
3763 #[rustc_args_required_const(1)]
3764 #[stable(feature = "simd_x86", since = "1.27.0")]
3765 pub unsafe fn _mm256_extract_epi16(a: __m256i, imm8: i32) -> i16 {
3766 let imm8 = (imm8 & 15) as u32;
3767 simd_extract(a.as_i16x16(), imm8)
3768 }
3769
3770 /// Extracts a 32-bit integer from `a`, selected with `imm8`.
3771 ///
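/// A short usage sketch, added for illustration (not upstream documentation);
/// it assumes AVX2 is detected at runtime and extracts element index 3:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
/// assert_eq!(_mm256_extract_epi32(a, 3), 3);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///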
3772 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extract_epi32)
3773 #[inline]
3774 #[target_feature(enable = "avx2")]
3775 // This intrinsic has no corresponding instruction.
3776 #[rustc_args_required_const(1)]
3777 #[stable(feature = "simd_x86", since = "1.27.0")]
3778 pub unsafe fn _mm256_extract_epi32(a: __m256i, imm8: i32) -> i32 {
3779 let imm8 = (imm8 & 7) as u32;
3780 simd_extract(a.as_i32x8(), imm8)
3781 }
3782
3783 /// Returns the first element of the input vector of `[4 x double]`.
3784 ///
3785 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsd_f64)
3786 #[inline]
3787 #[target_feature(enable = "avx2")]
3788 //#[cfg_attr(test, assert_instr(movsd))] FIXME
3789 #[stable(feature = "simd_x86", since = "1.27.0")]
3790 pub unsafe fn _mm256_cvtsd_f64(a: __m256d) -> f64 {
3791 simd_extract(a, 0)
3792 }
3793
3794 /// Returns the first element of the input vector of `[8 x i32]`.
3795 ///
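/// A short usage sketch, added for illustration (not upstream documentation);
/// it assumes AVX2 is detected at runtime and reads the lowest 32-bit lane:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_setr_epi32(7, 1, 2, 3, 4, 5, 6, 0);
/// assert_eq!(_mm256_cvtsi256_si32(a), 7);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///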
3796 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsi256_si32)
3797 #[inline]
3798 #[target_feature(enable = "avx2")]
3799 //#[cfg_attr(test, assert_instr(movd))] FIXME
3800 #[stable(feature = "simd_x86", since = "1.27.0")]
3801 pub unsafe fn _mm256_cvtsi256_si32(a: __m256i) -> i32 {
3802 simd_extract(a.as_i32x8(), 0)
3803 }
3804
3805 #[allow(improper_ctypes)]
3806 extern "C" {
3807 #[link_name = "llvm.x86.avx2.pabs.b"]
3808 fn pabsb(a: i8x32) -> u8x32;
3809 #[link_name = "llvm.x86.avx2.pabs.w"]
3810 fn pabsw(a: i16x16) -> u16x16;
3811 #[link_name = "llvm.x86.avx2.pabs.d"]
3812 fn pabsd(a: i32x8) -> u32x8;
3813 #[link_name = "llvm.x86.avx2.padds.b"]
3814 fn paddsb(a: i8x32, b: i8x32) -> i8x32;
3815 #[link_name = "llvm.x86.avx2.padds.w"]
3816 fn paddsw(a: i16x16, b: i16x16) -> i16x16;
3817 #[link_name = "llvm.x86.avx2.paddus.b"]
3818 fn paddusb(a: u8x32, b: u8x32) -> u8x32;
3819 #[link_name = "llvm.x86.avx2.paddus.w"]
3820 fn paddusw(a: u16x16, b: u16x16) -> u16x16;
3821 #[link_name = "llvm.x86.avx2.pavg.b"]
3822 fn pavgb(a: u8x32, b: u8x32) -> u8x32;
3823 #[link_name = "llvm.x86.avx2.pavg.w"]
3824 fn pavgw(a: u16x16, b: u16x16) -> u16x16;
3825 #[link_name = "llvm.x86.avx2.pblendvb"]
3826 fn pblendvb(a: i8x32, b: i8x32, mask: i8x32) -> i8x32;
3827 #[link_name = "llvm.x86.avx2.phadd.w"]
3828 fn phaddw(a: i16x16, b: i16x16) -> i16x16;
3829 #[link_name = "llvm.x86.avx2.phadd.d"]
3830 fn phaddd(a: i32x8, b: i32x8) -> i32x8;
3831 #[link_name = "llvm.x86.avx2.phadd.sw"]
3832 fn phaddsw(a: i16x16, b: i16x16) -> i16x16;
3833 #[link_name = "llvm.x86.avx2.phsub.w"]
3834 fn phsubw(a: i16x16, b: i16x16) -> i16x16;
3835 #[link_name = "llvm.x86.avx2.phsub.d"]
3836 fn phsubd(a: i32x8, b: i32x8) -> i32x8;
3837 #[link_name = "llvm.x86.avx2.phsub.sw"]
3838 fn phsubsw(a: i16x16, b: i16x16) -> i16x16;
3839 #[link_name = "llvm.x86.avx2.pmadd.wd"]
3840 fn pmaddwd(a: i16x16, b: i16x16) -> i32x8;
3841 #[link_name = "llvm.x86.avx2.pmadd.ub.sw"]
3842 fn pmaddubsw(a: u8x32, b: u8x32) -> i16x16;
3843 #[link_name = "llvm.x86.avx2.maskload.d"]
3844 fn maskloadd(mem_addr: *const i8, mask: i32x4) -> i32x4;
3845 #[link_name = "llvm.x86.avx2.maskload.d.256"]
3846 fn maskloadd256(mem_addr: *const i8, mask: i32x8) -> i32x8;
3847 #[link_name = "llvm.x86.avx2.maskload.q"]
3848 fn maskloadq(mem_addr: *const i8, mask: i64x2) -> i64x2;
3849 #[link_name = "llvm.x86.avx2.maskload.q.256"]
3850 fn maskloadq256(mem_addr: *const i8, mask: i64x4) -> i64x4;
3851 #[link_name = "llvm.x86.avx2.maskstore.d"]
3852 fn maskstored(mem_addr: *mut i8, mask: i32x4, a: i32x4);
3853 #[link_name = "llvm.x86.avx2.maskstore.d.256"]
3854 fn maskstored256(mem_addr: *mut i8, mask: i32x8, a: i32x8);
3855 #[link_name = "llvm.x86.avx2.maskstore.q"]
3856 fn maskstoreq(mem_addr: *mut i8, mask: i64x2, a: i64x2);
3857 #[link_name = "llvm.x86.avx2.maskstore.q.256"]
3858 fn maskstoreq256(mem_addr: *mut i8, mask: i64x4, a: i64x4);
3859 #[link_name = "llvm.x86.avx2.pmaxs.w"]
3860 fn pmaxsw(a: i16x16, b: i16x16) -> i16x16;
3861 #[link_name = "llvm.x86.avx2.pmaxs.d"]
3862 fn pmaxsd(a: i32x8, b: i32x8) -> i32x8;
3863 #[link_name = "llvm.x86.avx2.pmaxs.b"]
3864 fn pmaxsb(a: i8x32, b: i8x32) -> i8x32;
3865 #[link_name = "llvm.x86.avx2.pmaxu.w"]
3866 fn pmaxuw(a: u16x16, b: u16x16) -> u16x16;
3867 #[link_name = "llvm.x86.avx2.pmaxu.d"]
3868 fn pmaxud(a: u32x8, b: u32x8) -> u32x8;
3869 #[link_name = "llvm.x86.avx2.pmaxu.b"]
3870 fn pmaxub(a: u8x32, b: u8x32) -> u8x32;
3871 #[link_name = "llvm.x86.avx2.pmins.w"]
3872 fn pminsw(a: i16x16, b: i16x16) -> i16x16;
3873 #[link_name = "llvm.x86.avx2.pmins.d"]
3874 fn pminsd(a: i32x8, b: i32x8) -> i32x8;
3875 #[link_name = "llvm.x86.avx2.pmins.b"]
3876 fn pminsb(a: i8x32, b: i8x32) -> i8x32;
3877 #[link_name = "llvm.x86.avx2.pminu.w"]
3878 fn pminuw(a: u16x16, b: u16x16) -> u16x16;
3879 #[link_name = "llvm.x86.avx2.pminu.d"]
3880 fn pminud(a: u32x8, b: u32x8) -> u32x8;
3881 #[link_name = "llvm.x86.avx2.pminu.b"]
3882 fn pminub(a: u8x32, b: u8x32) -> u8x32;
3883 #[link_name = "llvm.x86.avx2.pmovmskb"]
3884 fn pmovmskb(a: i8x32) -> i32;
3885 #[link_name = "llvm.x86.avx2.mpsadbw"]
3886 fn mpsadbw(a: u8x32, b: u8x32, imm8: i32) -> u16x16;
3887 #[link_name = "llvm.x86.avx2.pmulhu.w"]
3888 fn pmulhuw(a: u16x16, b: u16x16) -> u16x16;
3889 #[link_name = "llvm.x86.avx2.pmulh.w"]
3890 fn pmulhw(a: i16x16, b: i16x16) -> i16x16;
3891 #[link_name = "llvm.x86.avx2.pmul.dq"]
3892 fn pmuldq(a: i32x8, b: i32x8) -> i64x4;
3893 #[link_name = "llvm.x86.avx2.pmulu.dq"]
3894 fn pmuludq(a: u32x8, b: u32x8) -> u64x4;
3895 #[link_name = "llvm.x86.avx2.pmul.hr.sw"]
3896 fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16;
3897 #[link_name = "llvm.x86.avx2.packsswb"]
3898 fn packsswb(a: i16x16, b: i16x16) -> i8x32;
3899 #[link_name = "llvm.x86.avx2.packssdw"]
3900 fn packssdw(a: i32x8, b: i32x8) -> i16x16;
3901 #[link_name = "llvm.x86.avx2.packuswb"]
3902 fn packuswb(a: i16x16, b: i16x16) -> u8x32;
3903 #[link_name = "llvm.x86.avx2.packusdw"]
3904 fn packusdw(a: i32x8, b: i32x8) -> u16x16;
3905 #[link_name = "llvm.x86.avx2.psad.bw"]
3906 fn psadbw(a: u8x32, b: u8x32) -> u64x4;
3907 #[link_name = "llvm.x86.avx2.psign.b"]
3908 fn psignb(a: i8x32, b: i8x32) -> i8x32;
3909 #[link_name = "llvm.x86.avx2.psign.w"]
3910 fn psignw(a: i16x16, b: i16x16) -> i16x16;
3911 #[link_name = "llvm.x86.avx2.psign.d"]
3912 fn psignd(a: i32x8, b: i32x8) -> i32x8;
3913 #[link_name = "llvm.x86.avx2.psll.w"]
3914 fn psllw(a: i16x16, count: i16x8) -> i16x16;
3915 #[link_name = "llvm.x86.avx2.psll.d"]
3916 fn pslld(a: i32x8, count: i32x4) -> i32x8;
3917 #[link_name = "llvm.x86.avx2.psll.q"]
3918 fn psllq(a: i64x4, count: i64x2) -> i64x4;
3919 #[link_name = "llvm.x86.avx2.pslli.w"]
3920 fn pslliw(a: i16x16, imm8: i32) -> i16x16;
3921 #[link_name = "llvm.x86.avx2.pslli.d"]
3922 fn psllid(a: i32x8, imm8: i32) -> i32x8;
3923 #[link_name = "llvm.x86.avx2.pslli.q"]
3924 fn pslliq(a: i64x4, imm8: i32) -> i64x4;
3925 #[link_name = "llvm.x86.avx2.psllv.d"]
3926 fn psllvd(a: i32x4, count: i32x4) -> i32x4;
3927 #[link_name = "llvm.x86.avx2.psllv.d.256"]
3928 fn psllvd256(a: i32x8, count: i32x8) -> i32x8;
3929 #[link_name = "llvm.x86.avx2.psllv.q"]
3930 fn psllvq(a: i64x2, count: i64x2) -> i64x2;
3931 #[link_name = "llvm.x86.avx2.psllv.q.256"]
3932 fn psllvq256(a: i64x4, count: i64x4) -> i64x4;
3933 #[link_name = "llvm.x86.avx2.psra.w"]
3934 fn psraw(a: i16x16, count: i16x8) -> i16x16;
3935 #[link_name = "llvm.x86.avx2.psra.d"]
3936 fn psrad(a: i32x8, count: i32x4) -> i32x8;
3937 #[link_name = "llvm.x86.avx2.psrai.w"]
3938 fn psraiw(a: i16x16, imm8: i32) -> i16x16;
3939 #[link_name = "llvm.x86.avx2.psrai.d"]
3940 fn psraid(a: i32x8, imm8: i32) -> i32x8;
3941 #[link_name = "llvm.x86.avx2.psrav.d"]
3942 fn psravd(a: i32x4, count: i32x4) -> i32x4;
3943 #[link_name = "llvm.x86.avx2.psrav.d.256"]
3944 fn psravd256(a: i32x8, count: i32x8) -> i32x8;
3945 #[link_name = "llvm.x86.avx2.psrl.w"]
3946 fn psrlw(a: i16x16, count: i16x8) -> i16x16;
3947 #[link_name = "llvm.x86.avx2.psrl.d"]
3948 fn psrld(a: i32x8, count: i32x4) -> i32x8;
3949 #[link_name = "llvm.x86.avx2.psrl.q"]
3950 fn psrlq(a: i64x4, count: i64x2) -> i64x4;
3951 #[link_name = "llvm.x86.avx2.psrli.w"]
3952 fn psrliw(a: i16x16, imm8: i32) -> i16x16;
3953 #[link_name = "llvm.x86.avx2.psrli.d"]
3954 fn psrlid(a: i32x8, imm8: i32) -> i32x8;
3955 #[link_name = "llvm.x86.avx2.psrli.q"]
3956 fn psrliq(a: i64x4, imm8: i32) -> i64x4;
3957 #[link_name = "llvm.x86.avx2.psrlv.d"]
3958 fn psrlvd(a: i32x4, count: i32x4) -> i32x4;
3959 #[link_name = "llvm.x86.avx2.psrlv.d.256"]
3960 fn psrlvd256(a: i32x8, count: i32x8) -> i32x8;
3961 #[link_name = "llvm.x86.avx2.psrlv.q"]
3962 fn psrlvq(a: i64x2, count: i64x2) -> i64x2;
3963 #[link_name = "llvm.x86.avx2.psrlv.q.256"]
3964 fn psrlvq256(a: i64x4, count: i64x4) -> i64x4;
3965 #[link_name = "llvm.x86.avx2.psubs.b"]
3966 fn psubsb(a: i8x32, b: i8x32) -> i8x32;
3967 #[link_name = "llvm.x86.avx2.psubs.w"]
3968 fn psubsw(a: i16x16, b: i16x16) -> i16x16;
3969 #[link_name = "llvm.x86.avx2.psubus.b"]
3970 fn psubusb(a: u8x32, b: u8x32) -> u8x32;
3971 #[link_name = "llvm.x86.avx2.psubus.w"]
3972 fn psubusw(a: u16x16, b: u16x16) -> u16x16;
3973 #[link_name = "llvm.x86.avx2.pshuf.b"]
3974 fn pshufb(a: u8x32, b: u8x32) -> u8x32;
3975 #[link_name = "llvm.x86.avx2.permd"]
3976 fn permd(a: u32x8, b: u32x8) -> u32x8;
3977 #[link_name = "llvm.x86.avx2.permps"]
3978 fn permps(a: __m256, b: i32x8) -> __m256;
3979 #[link_name = "llvm.x86.avx2.vperm2i128"]
3980 fn vperm2i128(a: i64x4, b: i64x4, imm8: i8) -> i64x4;
3981 #[link_name = "llvm.x86.avx2.gather.d.d"]
3982 fn pgatherdd(src: i32x4, slice: *const i8, offsets: i32x4, mask: i32x4, scale: i8) -> i32x4;
3983 #[link_name = "llvm.x86.avx2.gather.d.d.256"]
3984 fn vpgatherdd(src: i32x8, slice: *const i8, offsets: i32x8, mask: i32x8, scale: i8) -> i32x8;
3985 #[link_name = "llvm.x86.avx2.gather.d.q"]
3986 fn pgatherdq(src: i64x2, slice: *const i8, offsets: i32x4, mask: i64x2, scale: i8) -> i64x2;
3987 #[link_name = "llvm.x86.avx2.gather.d.q.256"]
3988 fn vpgatherdq(src: i64x4, slice: *const i8, offsets: i32x4, mask: i64x4, scale: i8) -> i64x4;
3989 #[link_name = "llvm.x86.avx2.gather.q.d"]
3990 fn pgatherqd(src: i32x4, slice: *const i8, offsets: i64x2, mask: i32x4, scale: i8) -> i32x4;
3991 #[link_name = "llvm.x86.avx2.gather.q.d.256"]
3992 fn vpgatherqd(src: i32x4, slice: *const i8, offsets: i64x4, mask: i32x4, scale: i8) -> i32x4;
3993 #[link_name = "llvm.x86.avx2.gather.q.q"]
3994 fn pgatherqq(src: i64x2, slice: *const i8, offsets: i64x2, mask: i64x2, scale: i8) -> i64x2;
3995 #[link_name = "llvm.x86.avx2.gather.q.q.256"]
3996 fn vpgatherqq(src: i64x4, slice: *const i8, offsets: i64x4, mask: i64x4, scale: i8) -> i64x4;
3997 #[link_name = "llvm.x86.avx2.gather.d.pd"]
3998 fn pgatherdpd(
3999 src: __m128d,
4000 slice: *const i8,
4001 offsets: i32x4,
4002 mask: __m128d,
4003 scale: i8,
4004 ) -> __m128d;
4005 #[link_name = "llvm.x86.avx2.gather.d.pd.256"]
4006 fn vpgatherdpd(
4007 src: __m256d,
4008 slice: *const i8,
4009 offsets: i32x4,
4010 mask: __m256d,
4011 scale: i8,
4012 ) -> __m256d;
4013 #[link_name = "llvm.x86.avx2.gather.q.pd"]
4014 fn pgatherqpd(
4015 src: __m128d,
4016 slice: *const i8,
4017 offsets: i64x2,
4018 mask: __m128d,
4019 scale: i8,
4020 ) -> __m128d;
4021 #[link_name = "llvm.x86.avx2.gather.q.pd.256"]
4022 fn vpgatherqpd(
4023 src: __m256d,
4024 slice: *const i8,
4025 offsets: i64x4,
4026 mask: __m256d,
4027 scale: i8,
4028 ) -> __m256d;
4029 #[link_name = "llvm.x86.avx2.gather.d.ps"]
4030 fn pgatherdps(src: __m128, slice: *const i8, offsets: i32x4, mask: __m128, scale: i8)
4031 -> __m128;
4032 #[link_name = "llvm.x86.avx2.gather.d.ps.256"]
4033 fn vpgatherdps(
4034 src: __m256,
4035 slice: *const i8,
4036 offsets: i32x8,
4037 mask: __m256,
4038 scale: i8,
4039 ) -> __m256;
4040 #[link_name = "llvm.x86.avx2.gather.q.ps"]
4041 fn pgatherqps(src: __m128, slice: *const i8, offsets: i64x2, mask: __m128, scale: i8)
4042 -> __m128;
4043 #[link_name = "llvm.x86.avx2.gather.q.ps.256"]
4044 fn vpgatherqps(
4045 src: __m128,
4046 slice: *const i8,
4047 offsets: i64x4,
4048 mask: __m128,
4049 scale: i8,
4050 ) -> __m128;
4051 #[link_name = "llvm.x86.avx2.psll.dq"]
4052 fn vpslldq(a: i64x4, b: i32) -> i64x4;
4053 #[link_name = "llvm.x86.avx2.psrl.dq"]
4054 fn vpsrldq(a: i64x4, b: i32) -> i64x4;
4055 }
4056
4057 #[cfg(test)]
4058 mod tests {
4059 use std;
4060 use stdarch_test::simd_test;
4061
4062 use crate::core_arch::x86::*;
4063
4064 #[simd_test(enable = "avx2")]
4065 unsafe fn test_mm256_abs_epi32() {
4066 #[rustfmt::skip]
4067 let a = _mm256_setr_epi32(
4068 0, 1, -1, std::i32::MAX,
4069 std::i32::MIN, 100, -100, -32,
4070 );
4071 let r = _mm256_abs_epi32(a);
4072 #[rustfmt::skip]
4073 let e = _mm256_setr_epi32(
4074 0, 1, 1, std::i32::MAX,
4075 std::i32::MAX.wrapping_add(1), 100, 100, 32,
4076 );
4077 assert_eq_m256i(r, e);
4078 }
4079
4080 #[simd_test(enable = "avx2")]
4081 unsafe fn test_mm256_abs_epi16() {
4082 #[rustfmt::skip]
4083 let a = _mm256_setr_epi16(
4084 0, 1, -1, 2, -2, 3, -3, 4,
4085 -4, 5, -5, std::i16::MAX, std::i16::MIN, 100, -100, -32,
4086 );
4087 let r = _mm256_abs_epi16(a);
4088 #[rustfmt::skip]
4089 let e = _mm256_setr_epi16(
4090 0, 1, 1, 2, 2, 3, 3, 4,
4091 4, 5, 5, std::i16::MAX, std::i16::MAX.wrapping_add(1), 100, 100, 32,
4092 );
4093 assert_eq_m256i(r, e);
4094 }
4095
4096 #[simd_test(enable = "avx2")]
4097 unsafe fn test_mm256_abs_epi8() {
4098 #[rustfmt::skip]
4099 let a = _mm256_setr_epi8(
4100 0, 1, -1, 2, -2, 3, -3, 4,
4101 -4, 5, -5, std::i8::MAX, std::i8::MIN, 100, -100, -32,
4102 0, 1, -1, 2, -2, 3, -3, 4,
4103 -4, 5, -5, std::i8::MAX, std::i8::MIN, 100, -100, -32,
4104 );
4105 let r = _mm256_abs_epi8(a);
4106 #[rustfmt::skip]
4107 let e = _mm256_setr_epi8(
4108 0, 1, 1, 2, 2, 3, 3, 4,
4109 4, 5, 5, std::i8::MAX, std::i8::MAX.wrapping_add(1), 100, 100, 32,
4110 0, 1, 1, 2, 2, 3, 3, 4,
4111 4, 5, 5, std::i8::MAX, std::i8::MAX.wrapping_add(1), 100, 100, 32,
4112 );
4113 assert_eq_m256i(r, e);
4114 }
4115
4116 #[simd_test(enable = "avx2")]
4117 unsafe fn test_mm256_add_epi64() {
4118 let a = _mm256_setr_epi64x(-10, 0, 100, 1_000_000_000);
4119 let b = _mm256_setr_epi64x(-1, 0, 1, 2);
4120 let r = _mm256_add_epi64(a, b);
4121 let e = _mm256_setr_epi64x(-11, 0, 101, 1_000_000_002);
4122 assert_eq_m256i(r, e);
4123 }
4124
4125 #[simd_test(enable = "avx2")]
4126 unsafe fn test_mm256_add_epi32() {
4127 let a = _mm256_setr_epi32(-1, 0, 1, 2, 3, 4, 5, 6);
4128 let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4129 let r = _mm256_add_epi32(a, b);
4130 let e = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14);
4131 assert_eq_m256i(r, e);
4132 }
4133
4134 #[simd_test(enable = "avx2")]
4135 unsafe fn test_mm256_add_epi16() {
4136 #[rustfmt::skip]
4137 let a = _mm256_setr_epi16(
4138 0, 1, 2, 3, 4, 5, 6, 7,
4139 8, 9, 10, 11, 12, 13, 14, 15,
4140 );
4141 #[rustfmt::skip]
4142 let b = _mm256_setr_epi16(
4143 0, 1, 2, 3, 4, 5, 6, 7,
4144 8, 9, 10, 11, 12, 13, 14, 15,
4145 );
4146 let r = _mm256_add_epi16(a, b);
4147 #[rustfmt::skip]
4148 let e = _mm256_setr_epi16(
4149 0, 2, 4, 6, 8, 10, 12, 14,
4150 16, 18, 20, 22, 24, 26, 28, 30,
4151 );
4152 assert_eq_m256i(r, e);
4153 }
4154
4155 #[simd_test(enable = "avx2")]
4156 unsafe fn test_mm256_add_epi8() {
4157 #[rustfmt::skip]
4158 let a = _mm256_setr_epi8(
4159 0, 1, 2, 3, 4, 5, 6, 7,
4160 8, 9, 10, 11, 12, 13, 14, 15,
4161 16, 17, 18, 19, 20, 21, 22, 23,
4162 24, 25, 26, 27, 28, 29, 30, 31,
4163 );
4164 #[rustfmt::skip]
4165 let b = _mm256_setr_epi8(
4166 0, 1, 2, 3, 4, 5, 6, 7,
4167 8, 9, 10, 11, 12, 13, 14, 15,
4168 16, 17, 18, 19, 20, 21, 22, 23,
4169 24, 25, 26, 27, 28, 29, 30, 31,
4170 );
4171 let r = _mm256_add_epi8(a, b);
4172 #[rustfmt::skip]
4173 let e = _mm256_setr_epi8(
4174 0, 2, 4, 6, 8, 10, 12, 14,
4175 16, 18, 20, 22, 24, 26, 28, 30,
4176 32, 34, 36, 38, 40, 42, 44, 46,
4177 48, 50, 52, 54, 56, 58, 60, 62,
4178 );
4179 assert_eq_m256i(r, e);
4180 }
4181
4182 #[simd_test(enable = "avx2")]
4183 unsafe fn test_mm256_adds_epi8() {
4184 #[rustfmt::skip]
4185 let a = _mm256_setr_epi8(
4186 0, 1, 2, 3, 4, 5, 6, 7,
4187 8, 9, 10, 11, 12, 13, 14, 15,
4188 16, 17, 18, 19, 20, 21, 22, 23,
4189 24, 25, 26, 27, 28, 29, 30, 31,
4190 );
4191 #[rustfmt::skip]
4192 let b = _mm256_setr_epi8(
4193 32, 33, 34, 35, 36, 37, 38, 39,
4194 40, 41, 42, 43, 44, 45, 46, 47,
4195 48, 49, 50, 51, 52, 53, 54, 55,
4196 56, 57, 58, 59, 60, 61, 62, 63,
4197 );
4198 let r = _mm256_adds_epi8(a, b);
4199 #[rustfmt::skip]
4200 let e = _mm256_setr_epi8(
4201 32, 34, 36, 38, 40, 42, 44, 46,
4202 48, 50, 52, 54, 56, 58, 60, 62,
4203 64, 66, 68, 70, 72, 74, 76, 78,
4204 80, 82, 84, 86, 88, 90, 92, 94,
4205 );
4206 assert_eq_m256i(r, e);
4207 }
4208
4209 #[simd_test(enable = "avx2")]
4210 unsafe fn test_mm256_adds_epi8_saturate_positive() {
4211 let a = _mm256_set1_epi8(0x7F);
4212 let b = _mm256_set1_epi8(1);
4213 let r = _mm256_adds_epi8(a, b);
4214 assert_eq_m256i(r, a);
4215 }
4216
4217 #[simd_test(enable = "avx2")]
4218 unsafe fn test_mm256_adds_epi8_saturate_negative() {
4219 let a = _mm256_set1_epi8(-0x80);
4220 let b = _mm256_set1_epi8(-1);
4221 let r = _mm256_adds_epi8(a, b);
4222 assert_eq_m256i(r, a);
4223 }
4224
4225 #[simd_test(enable = "avx2")]
4226 unsafe fn test_mm256_adds_epi16() {
4227 #[rustfmt::skip]
4228 let a = _mm256_setr_epi16(
4229 0, 1, 2, 3, 4, 5, 6, 7,
4230 8, 9, 10, 11, 12, 13, 14, 15,
4231 );
4232 #[rustfmt::skip]
4233 let b = _mm256_setr_epi16(
4234 32, 33, 34, 35, 36, 37, 38, 39,
4235 40, 41, 42, 43, 44, 45, 46, 47,
4236 );
4237 let r = _mm256_adds_epi16(a, b);
4238 #[rustfmt::skip]
4239 let e = _mm256_setr_epi16(
4240 32, 34, 36, 38, 40, 42, 44, 46,
4241 48, 50, 52, 54, 56, 58, 60, 62,
4242 );
4243
4244 assert_eq_m256i(r, e);
4245 }
4246
4247 #[simd_test(enable = "avx2")]
4248 unsafe fn test_mm256_adds_epi16_saturate_positive() {
4249 let a = _mm256_set1_epi16(0x7FFF);
4250 let b = _mm256_set1_epi16(1);
4251 let r = _mm256_adds_epi16(a, b);
4252 assert_eq_m256i(r, a);
4253 }
4254
4255 #[simd_test(enable = "avx2")]
4256 unsafe fn test_mm256_adds_epi16_saturate_negative() {
4257 let a = _mm256_set1_epi16(-0x8000);
4258 let b = _mm256_set1_epi16(-1);
4259 let r = _mm256_adds_epi16(a, b);
4260 assert_eq_m256i(r, a);
4261 }
4262
4263 #[simd_test(enable = "avx2")]
4264 unsafe fn test_mm256_adds_epu8() {
4265 #[rustfmt::skip]
4266 let a = _mm256_setr_epi8(
4267 0, 1, 2, 3, 4, 5, 6, 7,
4268 8, 9, 10, 11, 12, 13, 14, 15,
4269 16, 17, 18, 19, 20, 21, 22, 23,
4270 24, 25, 26, 27, 28, 29, 30, 31,
4271 );
4272 #[rustfmt::skip]
4273 let b = _mm256_setr_epi8(
4274 32, 33, 34, 35, 36, 37, 38, 39,
4275 40, 41, 42, 43, 44, 45, 46, 47,
4276 48, 49, 50, 51, 52, 53, 54, 55,
4277 56, 57, 58, 59, 60, 61, 62, 63,
4278 );
4279 let r = _mm256_adds_epu8(a, b);
4280 #[rustfmt::skip]
4281 let e = _mm256_setr_epi8(
4282 32, 34, 36, 38, 40, 42, 44, 46,
4283 48, 50, 52, 54, 56, 58, 60, 62,
4284 64, 66, 68, 70, 72, 74, 76, 78,
4285 80, 82, 84, 86, 88, 90, 92, 94,
4286 );
4287 assert_eq_m256i(r, e);
4288 }
4289
4290 #[simd_test(enable = "avx2")]
4291 unsafe fn test_mm256_adds_epu8_saturate() {
4292 let a = _mm256_set1_epi8(!0);
4293 let b = _mm256_set1_epi8(1);
4294 let r = _mm256_adds_epu8(a, b);
4295 assert_eq_m256i(r, a);
4296 }
4297
4298 #[simd_test(enable = "avx2")]
4299 unsafe fn test_mm256_adds_epu16() {
4300 #[rustfmt::skip]
4301 let a = _mm256_setr_epi16(
4302 0, 1, 2, 3, 4, 5, 6, 7,
4303 8, 9, 10, 11, 12, 13, 14, 15,
4304 );
4305 #[rustfmt::skip]
4306 let b = _mm256_setr_epi16(
4307 32, 33, 34, 35, 36, 37, 38, 39,
4308 40, 41, 42, 43, 44, 45, 46, 47,
4309 );
4310 let r = _mm256_adds_epu16(a, b);
4311 #[rustfmt::skip]
4312 let e = _mm256_setr_epi16(
4313 32, 34, 36, 38, 40, 42, 44, 46,
4314 48, 50, 52, 54, 56, 58, 60, 62,
4315 );
4316
4317 assert_eq_m256i(r, e);
4318 }
4319
4320 #[simd_test(enable = "avx2")]
4321 unsafe fn test_mm256_adds_epu16_saturate() {
4322 let a = _mm256_set1_epi16(!0);
4323 let b = _mm256_set1_epi16(1);
4324 let r = _mm256_adds_epu16(a, b);
4325 assert_eq_m256i(r, a);
4326 }
4327
4328 #[simd_test(enable = "avx2")]
4329 unsafe fn test_mm256_and_si256() {
4330 let a = _mm256_set1_epi8(5);
4331 let b = _mm256_set1_epi8(3);
4332 let got = _mm256_and_si256(a, b);
4333 assert_eq_m256i(got, _mm256_set1_epi8(1));
4334 }
4335
4336 #[simd_test(enable = "avx2")]
4337 unsafe fn test_mm256_andnot_si256() {
4338 let a = _mm256_set1_epi8(5);
4339 let b = _mm256_set1_epi8(3);
4340 let got = _mm256_andnot_si256(a, b);
4341 assert_eq_m256i(got, _mm256_set1_epi8(2));
4342 }
4343
4344 #[simd_test(enable = "avx2")]
4345 unsafe fn test_mm256_avg_epu8() {
4346 let (a, b) = (_mm256_set1_epi8(3), _mm256_set1_epi8(9));
4347 let r = _mm256_avg_epu8(a, b);
4348 assert_eq_m256i(r, _mm256_set1_epi8(6));
4349 }
4350
4351 #[simd_test(enable = "avx2")]
4352 unsafe fn test_mm256_avg_epu16() {
4353 let (a, b) = (_mm256_set1_epi16(3), _mm256_set1_epi16(9));
4354 let r = _mm256_avg_epu16(a, b);
4355 assert_eq_m256i(r, _mm256_set1_epi16(6));
4356 }
4357
4358 #[simd_test(enable = "avx2")]
4359 unsafe fn test_mm_blend_epi32() {
4360 let (a, b) = (_mm_set1_epi32(3), _mm_set1_epi32(9));
4361 let e = _mm_setr_epi32(9, 3, 3, 3);
4362 let r = _mm_blend_epi32(a, b, 0x01 as i32);
4363 assert_eq_m128i(r, e);
4364
4365 let r = _mm_blend_epi32(b, a, 0x0E as i32);
4366 assert_eq_m128i(r, e);
4367 }
4368
4369 #[simd_test(enable = "avx2")]
4370 unsafe fn test_mm256_blend_epi32() {
4371 let (a, b) = (_mm256_set1_epi32(3), _mm256_set1_epi32(9));
4372 let e = _mm256_setr_epi32(9, 3, 3, 3, 3, 3, 3, 3);
4373 let r = _mm256_blend_epi32(a, b, 0x01 as i32);
4374 assert_eq_m256i(r, e);
4375
4376 let e = _mm256_setr_epi32(3, 9, 3, 3, 3, 3, 3, 9);
4377 let r = _mm256_blend_epi32(a, b, 0x82 as i32);
4378 assert_eq_m256i(r, e);
4379
4380 let e = _mm256_setr_epi32(3, 3, 9, 9, 9, 9, 9, 3);
4381 let r = _mm256_blend_epi32(a, b, 0x7C as i32);
4382 assert_eq_m256i(r, e);
4383 }
4384
4385 #[simd_test(enable = "avx2")]
4386 unsafe fn test_mm256_blend_epi16() {
4387 let (a, b) = (_mm256_set1_epi16(3), _mm256_set1_epi16(9));
4388 let e = _mm256_setr_epi16(9, 3, 3, 3, 3, 3, 3, 3, 9, 3, 3, 3, 3, 3, 3, 3);
4389 let r = _mm256_blend_epi16(a, b, 0x01 as i32);
4390 assert_eq_m256i(r, e);
4391
4392 let r = _mm256_blend_epi16(b, a, 0xFE as i32);
4393 assert_eq_m256i(r, e);
4394 }
4395
4396 #[simd_test(enable = "avx2")]
4397 unsafe fn test_mm256_blendv_epi8() {
4398 let (a, b) = (_mm256_set1_epi8(4), _mm256_set1_epi8(2));
4399 let mask = _mm256_insert_epi8(_mm256_set1_epi8(0), -1, 2);
4400 let e = _mm256_insert_epi8(_mm256_set1_epi8(4), 2, 2);
4401 let r = _mm256_blendv_epi8(a, b, mask);
4402 assert_eq_m256i(r, e);
4403 }
4404
4405 #[simd_test(enable = "avx2")]
4406 unsafe fn test_mm_broadcastb_epi8() {
4407 let a = _mm_insert_epi8(_mm_set1_epi8(0x00), 0x2a, 0);
4408 let res = _mm_broadcastb_epi8(a);
4409 assert_eq_m128i(res, _mm_set1_epi8(0x2a));
4410 }
4411
4412 #[simd_test(enable = "avx2")]
4413 unsafe fn test_mm256_broadcastb_epi8() {
4414 let a = _mm_insert_epi8(_mm_set1_epi8(0x00), 0x2a, 0);
4415 let res = _mm256_broadcastb_epi8(a);
4416 assert_eq_m256i(res, _mm256_set1_epi8(0x2a));
4417 }
4418
4419 #[simd_test(enable = "avx2")]
4420 unsafe fn test_mm_broadcastd_epi32() {
4421 let a = _mm_setr_epi32(0x2a, 0x8000000, 0, 0);
4422 let res = _mm_broadcastd_epi32(a);
4423 assert_eq_m128i(res, _mm_set1_epi32(0x2a));
4424 }
4425
4426 #[simd_test(enable = "avx2")]
4427 unsafe fn test_mm256_broadcastd_epi32() {
4428 let a = _mm_setr_epi32(0x2a, 0x8000000, 0, 0);
4429 let res = _mm256_broadcastd_epi32(a);
4430 assert_eq_m256i(res, _mm256_set1_epi32(0x2a));
4431 }
4432
4433 #[simd_test(enable = "avx2")]
4434 unsafe fn test_mm_broadcastq_epi64() {
4435 let a = _mm_setr_epi64x(0x1ffffffff, 0);
4436 let res = _mm_broadcastq_epi64(a);
4437 assert_eq_m128i(res, _mm_set1_epi64x(0x1ffffffff));
4438 }
4439
4440 #[simd_test(enable = "avx2")]
4441 unsafe fn test_mm256_broadcastq_epi64() {
4442 let a = _mm_setr_epi64x(0x1ffffffff, 0);
4443 let res = _mm256_broadcastq_epi64(a);
4444 assert_eq_m256i(res, _mm256_set1_epi64x(0x1ffffffff));
4445 }
4446
4447 #[simd_test(enable = "avx2")]
4448 unsafe fn test_mm_broadcastsd_pd() {
4449 let a = _mm_setr_pd(6.28, 3.14);
4450 let res = _mm_broadcastsd_pd(a);
4451 assert_eq_m128d(res, _mm_set1_pd(6.28f64));
4452 }
4453
4454 #[simd_test(enable = "avx2")]
4455 unsafe fn test_mm256_broadcastsd_pd() {
4456 let a = _mm_setr_pd(6.28, 3.14);
4457 let res = _mm256_broadcastsd_pd(a);
4458 assert_eq_m256d(res, _mm256_set1_pd(6.28f64));
4459 }
4460
4461 #[simd_test(enable = "avx2")]
4462 unsafe fn test_mm256_broadcastsi128_si256() {
4463 let a = _mm_setr_epi64x(0x0987654321012334, 0x5678909876543210);
4464 let res = _mm256_broadcastsi128_si256(a);
4465 let retval = _mm256_setr_epi64x(
4466 0x0987654321012334,
4467 0x5678909876543210,
4468 0x0987654321012334,
4469 0x5678909876543210,
4470 );
4471 assert_eq_m256i(res, retval);
4472 }
4473
4474 #[simd_test(enable = "avx2")]
4475 unsafe fn test_mm_broadcastss_ps() {
4476 let a = _mm_setr_ps(6.28, 3.14, 0.0, 0.0);
4477 let res = _mm_broadcastss_ps(a);
4478 assert_eq_m128(res, _mm_set1_ps(6.28f32));
4479 }
4480
4481 #[simd_test(enable = "avx2")]
4482 unsafe fn test_mm256_broadcastss_ps() {
4483 let a = _mm_setr_ps(6.28, 3.14, 0.0, 0.0);
4484 let res = _mm256_broadcastss_ps(a);
4485 assert_eq_m256(res, _mm256_set1_ps(6.28f32));
4486 }
4487
4488 #[simd_test(enable = "avx2")]
4489 unsafe fn test_mm_broadcastw_epi16() {
4490 let a = _mm_insert_epi16(_mm_set1_epi16(0x2a), 0x22b, 0);
4491 let res = _mm_broadcastw_epi16(a);
4492 assert_eq_m128i(res, _mm_set1_epi16(0x22b));
4493 }
4494
4495 #[simd_test(enable = "avx2")]
4496 unsafe fn test_mm256_broadcastw_epi16() {
4497 let a = _mm_insert_epi16(_mm_set1_epi16(0x2a), 0x22b, 0);
4498 let res = _mm256_broadcastw_epi16(a);
4499 assert_eq_m256i(res, _mm256_set1_epi16(0x22b));
4500 }
4501
4502 #[simd_test(enable = "avx2")]
4503 unsafe fn test_mm256_cmpeq_epi8() {
4504 #[rustfmt::skip]
4505 let a = _mm256_setr_epi8(
4506 0, 1, 2, 3, 4, 5, 6, 7,
4507 8, 9, 10, 11, 12, 13, 14, 15,
4508 16, 17, 18, 19, 20, 21, 22, 23,
4509 24, 25, 26, 27, 28, 29, 30, 31,
4510 );
4511 #[rustfmt::skip]
4512 let b = _mm256_setr_epi8(
4513 31, 30, 2, 28, 27, 26, 25, 24,
4514 23, 22, 21, 20, 19, 18, 17, 16,
4515 15, 14, 13, 12, 11, 10, 9, 8,
4516 7, 6, 5, 4, 3, 2, 1, 0,
4517 );
4518 let r = _mm256_cmpeq_epi8(a, b);
4519 assert_eq_m256i(r, _mm256_insert_epi8(_mm256_set1_epi8(0), !0, 2));
4520 }
4521
4522 #[simd_test(enable = "avx2")]
4523 unsafe fn test_mm256_cmpeq_epi16() {
4524 #[rustfmt::skip]
4525 let a = _mm256_setr_epi16(
4526 0, 1, 2, 3, 4, 5, 6, 7,
4527 8, 9, 10, 11, 12, 13, 14, 15,
4528 );
4529 #[rustfmt::skip]
4530 let b = _mm256_setr_epi16(
4531 15, 14, 2, 12, 11, 10, 9, 8,
4532 7, 6, 5, 4, 3, 2, 1, 0,
4533 );
4534 let r = _mm256_cmpeq_epi16(a, b);
4535 assert_eq_m256i(r, _mm256_insert_epi16(_mm256_set1_epi16(0), !0, 2));
4536 }
4537
4538 #[simd_test(enable = "avx2")]
4539 unsafe fn test_mm256_cmpeq_epi32() {
4540 let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
4541 let b = _mm256_setr_epi32(7, 6, 2, 4, 3, 2, 1, 0);
4542 let r = _mm256_cmpeq_epi32(a, b);
4543 let e = _mm256_set1_epi32(0);
4544 let e = _mm256_insert_epi32(e, !0, 2);
4545 assert_eq_m256i(r, e);
4546 }
4547
4548 #[simd_test(enable = "avx2")]
4549 unsafe fn test_mm256_cmpeq_epi64() {
4550 let a = _mm256_setr_epi64x(0, 1, 2, 3);
4551 let b = _mm256_setr_epi64x(3, 2, 2, 0);
4552 let r = _mm256_cmpeq_epi64(a, b);
4553 assert_eq_m256i(r, _mm256_insert_epi64(_mm256_set1_epi64x(0), !0, 2));
4554 }
4555
4556 #[simd_test(enable = "avx2")]
4557 unsafe fn test_mm256_cmpgt_epi8() {
4558 let a = _mm256_insert_epi8(_mm256_set1_epi8(0), 5, 0);
4559 let b = _mm256_set1_epi8(0);
4560 let r = _mm256_cmpgt_epi8(a, b);
4561 assert_eq_m256i(r, _mm256_insert_epi8(_mm256_set1_epi8(0), !0, 0));
4562 }
4563
4564 #[simd_test(enable = "avx2")]
4565 unsafe fn test_mm256_cmpgt_epi16() {
4566 let a = _mm256_insert_epi16(_mm256_set1_epi16(0), 5, 0);
4567 let b = _mm256_set1_epi16(0);
4568 let r = _mm256_cmpgt_epi16(a, b);
4569 assert_eq_m256i(r, _mm256_insert_epi16(_mm256_set1_epi16(0), !0, 0));
4570 }
4571
4572 #[simd_test(enable = "avx2")]
4573 unsafe fn test_mm256_cmpgt_epi32() {
4574 let a = _mm256_insert_epi32(_mm256_set1_epi32(0), 5, 0);
4575 let b = _mm256_set1_epi32(0);
4576 let r = _mm256_cmpgt_epi32(a, b);
4577 assert_eq_m256i(r, _mm256_insert_epi32(_mm256_set1_epi32(0), !0, 0));
4578 }
4579
4580 #[simd_test(enable = "avx2")]
4581 unsafe fn test_mm256_cmpgt_epi64() {
4582 let a = _mm256_insert_epi64(_mm256_set1_epi64x(0), 5, 0);
4583 let b = _mm256_set1_epi64x(0);
4584 let r = _mm256_cmpgt_epi64(a, b);
4585 assert_eq_m256i(r, _mm256_insert_epi64(_mm256_set1_epi64x(0), !0, 0));
4586 }
4587
4588 #[simd_test(enable = "avx2")]
4589 unsafe fn test_mm256_cvtepi8_epi16() {
4590 #[rustfmt::skip]
4591 let a = _mm_setr_epi8(
4592 0, 0, -1, 1, -2, 2, -3, 3,
4593 -4, 4, -5, 5, -6, 6, -7, 7,
4594 );
4595 #[rustfmt::skip]
4596 let r = _mm256_setr_epi16(
4597 0, 0, -1, 1, -2, 2, -3, 3,
4598 -4, 4, -5, 5, -6, 6, -7, 7,
4599 );
4600 assert_eq_m256i(r, _mm256_cvtepi8_epi16(a));
4601 }
4602
4603 #[simd_test(enable = "avx2")]
4604 unsafe fn test_mm256_cvtepi8_epi32() {
4605 #[rustfmt::skip]
4606 let a = _mm_setr_epi8(
4607 0, 0, -1, 1, -2, 2, -3, 3,
4608 -4, 4, -5, 5, -6, 6, -7, 7,
4609 );
4610 let r = _mm256_setr_epi32(0, 0, -1, 1, -2, 2, -3, 3);
4611 assert_eq_m256i(r, _mm256_cvtepi8_epi32(a));
4612 }
4613
4614 #[simd_test(enable = "avx2")]
4615 unsafe fn test_mm256_cvtepi8_epi64() {
4616 #[rustfmt::skip]
4617 let a = _mm_setr_epi8(
4618 0, 0, -1, 1, -2, 2, -3, 3,
4619 -4, 4, -5, 5, -6, 6, -7, 7,
4620 );
4621 let r = _mm256_setr_epi64x(0, 0, -1, 1);
4622 assert_eq_m256i(r, _mm256_cvtepi8_epi64(a));
4623 }
4624
4625 #[simd_test(enable = "avx2")]
4626 unsafe fn test_mm256_cvtepi16_epi32() {
4627 let a = _mm_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3);
4628 let r = _mm256_setr_epi32(0, 0, -1, 1, -2, 2, -3, 3);
4629 assert_eq_m256i(r, _mm256_cvtepi16_epi32(a));
4630 }
4631
4632 #[simd_test(enable = "avx2")]
4633 unsafe fn test_mm256_cvtepi16_epi64() {
4634 let a = _mm_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3);
4635 let r = _mm256_setr_epi64x(0, 0, -1, 1);
4636 assert_eq_m256i(r, _mm256_cvtepi16_epi64(a));
4637 }
4638
4639 #[simd_test(enable = "avx2")]
4640 unsafe fn test_mm256_cvtepi32_epi64() {
4641 let a = _mm_setr_epi32(0, 0, -1, 1);
4642 let r = _mm256_setr_epi64x(0, 0, -1, 1);
4643 assert_eq_m256i(r, _mm256_cvtepi32_epi64(a));
4644 }
4645
4646 #[simd_test(enable = "avx2")]
4647 unsafe fn test_mm256_cvtepu16_epi32() {
4648 let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4649 let r = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
4650 assert_eq_m256i(r, _mm256_cvtepu16_epi32(a));
4651 }
4652
4653 #[simd_test(enable = "avx2")]
4654 unsafe fn test_mm256_cvtepu16_epi64() {
4655 let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4656 let r = _mm256_setr_epi64x(0, 1, 2, 3);
4657 assert_eq_m256i(r, _mm256_cvtepu16_epi64(a));
4658 }
4659
4660 #[simd_test(enable = "avx2")]
4661 unsafe fn test_mm256_cvtepu32_epi64() {
4662 let a = _mm_setr_epi32(0, 1, 2, 3);
4663 let r = _mm256_setr_epi64x(0, 1, 2, 3);
4664 assert_eq_m256i(r, _mm256_cvtepu32_epi64(a));
4665 }
4666
4667 #[simd_test(enable = "avx2")]
4668 unsafe fn test_mm256_cvtepu8_epi16() {
4669 #[rustfmt::skip]
4670 let a = _mm_setr_epi8(
4671 0, 1, 2, 3, 4, 5, 6, 7,
4672 8, 9, 10, 11, 12, 13, 14, 15,
4673 );
4674 #[rustfmt::skip]
4675 let r = _mm256_setr_epi16(
4676 0, 1, 2, 3, 4, 5, 6, 7,
4677 8, 9, 10, 11, 12, 13, 14, 15,
4678 );
4679 assert_eq_m256i(r, _mm256_cvtepu8_epi16(a));
4680 }
4681
4682 #[simd_test(enable = "avx2")]
4683 unsafe fn test_mm256_cvtepu8_epi32() {
4684 #[rustfmt::skip]
4685 let a = _mm_setr_epi8(
4686 0, 1, 2, 3, 4, 5, 6, 7,
4687 8, 9, 10, 11, 12, 13, 14, 15,
4688 );
4689 let r = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
4690 assert_eq_m256i(r, _mm256_cvtepu8_epi32(a));
4691 }
4692
4693 #[simd_test(enable = "avx2")]
4694 unsafe fn test_mm256_cvtepu8_epi64() {
4695 #[rustfmt::skip]
4696 let a = _mm_setr_epi8(
4697 0, 1, 2, 3, 4, 5, 6, 7,
4698 8, 9, 10, 11, 12, 13, 14, 15,
4699 );
4700 let r = _mm256_setr_epi64x(0, 1, 2, 3);
4701 assert_eq_m256i(r, _mm256_cvtepu8_epi64(a));
4702 }
4703
4704 #[simd_test(enable = "avx2")]
4705 unsafe fn test_mm256_extracti128_si256() {
4706 let a = _mm256_setr_epi64x(1, 2, 3, 4);
4707 let r = _mm256_extracti128_si256(a, 0b01);
4708 let e = _mm_setr_epi64x(3, 4);
4709 assert_eq_m128i(r, e);
4710 }
4711
4712 #[simd_test(enable = "avx2")]
4713 unsafe fn test_mm256_hadd_epi16() {
4714 let a = _mm256_set1_epi16(2);
4715 let b = _mm256_set1_epi16(4);
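// Horizontal sums: pairs from a give 2 + 2 == 4, pairs from b give 4 + 4 == 8, grouped per 128-bit lane.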
4716 let r = _mm256_hadd_epi16(a, b);
4717 let e = _mm256_setr_epi16(4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8);
4718 assert_eq_m256i(r, e);
4719 }
4720
4721 #[simd_test(enable = "avx2")]
4722 unsafe fn test_mm256_hadd_epi32() {
4723 let a = _mm256_set1_epi32(2);
4724 let b = _mm256_set1_epi32(4);
4725 let r = _mm256_hadd_epi32(a, b);
4726 let e = _mm256_setr_epi32(4, 4, 8, 8, 4, 4, 8, 8);
4727 assert_eq_m256i(r, e);
4728 }
4729
4730 #[simd_test(enable = "avx2")]
4731 unsafe fn test_mm256_hadds_epi16() {
4732 let a = _mm256_set1_epi16(2);
4733 let a = _mm256_insert_epi16(a, 0x7fff, 0);
4734 let a = _mm256_insert_epi16(a, 1, 1);
4735 let b = _mm256_set1_epi16(4);
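// The first horizontal sum, 0x7fff + 1, saturates to 0x7fff.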
4736 let r = _mm256_hadds_epi16(a, b);
4737 #[rustfmt::skip]
4738 let e = _mm256_setr_epi16(
4739 0x7FFF, 4, 4, 4, 8, 8, 8, 8,
4740 4, 4, 4, 4, 8, 8, 8, 8,
4741 );
4742 assert_eq_m256i(r, e);
4743 }
4744
4745 #[simd_test(enable = "avx2")]
4746 unsafe fn test_mm256_hsub_epi16() {
4747 let a = _mm256_set1_epi16(2);
4748 let b = _mm256_set1_epi16(4);
4749 let r = _mm256_hsub_epi16(a, b);
4750 let e = _mm256_set1_epi16(0);
4751 assert_eq_m256i(r, e);
4752 }
4753
4754 #[simd_test(enable = "avx2")]
4755 unsafe fn test_mm256_hsub_epi32() {
4756 let a = _mm256_set1_epi32(2);
4757 let b = _mm256_set1_epi32(4);
4758 let r = _mm256_hsub_epi32(a, b);
4759 let e = _mm256_set1_epi32(0);
4760 assert_eq_m256i(r, e);
4761 }
4762
4763 #[simd_test(enable = "avx2")]
4764 unsafe fn test_mm256_hsubs_epi16() {
4765 let a = _mm256_set1_epi16(2);
4766 let a = _mm256_insert_epi16(a, 0x7fff, 0);
4767 let a = _mm256_insert_epi16(a, -1, 1);
4768 let b = _mm256_set1_epi16(4);
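// The first horizontal difference, 0x7fff - (-1), saturates to 0x7fff.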
4769 let r = _mm256_hsubs_epi16(a, b);
4770 let e = _mm256_insert_epi16(_mm256_set1_epi16(0), 0x7FFF, 0);
4771 assert_eq_m256i(r, e);
4772 }
4773
4774 #[simd_test(enable = "avx2")]
4775 unsafe fn test_mm256_madd_epi16() {
4776 let a = _mm256_set1_epi16(2);
4777 let b = _mm256_set1_epi16(4);
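// Adjacent 16-bit products are summed into 32-bit lanes: 2 * 4 + 2 * 4 == 16.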
4778 let r = _mm256_madd_epi16(a, b);
4779 let e = _mm256_set1_epi32(16);
4780 assert_eq_m256i(r, e);
4781 }
4782
4783 #[simd_test(enable = "avx2")]
4784 unsafe fn test_mm256_inserti128_si256() {
4785 let a = _mm256_setr_epi64x(1, 2, 3, 4);
4786 let b = _mm_setr_epi64x(7, 8);
4787 let r = _mm256_inserti128_si256(a, b, 0b01);
4788 let e = _mm256_setr_epi64x(1, 2, 7, 8);
4789 assert_eq_m256i(r, e);
4790 }
4791
4792 #[simd_test(enable = "avx2")]
4793 unsafe fn test_mm256_maddubs_epi16() {
4794 let a = _mm256_set1_epi8(2);
4795 let b = _mm256_set1_epi8(4);
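// Unsigned bytes from a times signed bytes from b, adjacent pairs summed: 2 * 4 + 2 * 4 == 16.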
4796 let r = _mm256_maddubs_epi16(a, b);
4797 let e = _mm256_set1_epi16(16);
4798 assert_eq_m256i(r, e);
4799 }
4800
4801 #[simd_test(enable = "avx2")]
4802 unsafe fn test_mm_maskload_epi32() {
4803 let nums = [1, 2, 3, 4];
4804 let a = &nums as *const i32;
4805 let mask = _mm_setr_epi32(-1, 0, 0, -1);
4806 let r = _mm_maskload_epi32(a, mask);
4807 let e = _mm_setr_epi32(1, 0, 0, 4);
4808 assert_eq_m128i(r, e);
4809 }
4810
4811 #[simd_test(enable = "avx2")]
4812 unsafe fn test_mm256_maskload_epi32() {
4813 let nums = [1, 2, 3, 4, 5, 6, 7, 8];
4814 let a = &nums as *const i32;
4815 let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0);
4816 let r = _mm256_maskload_epi32(a, mask);
4817 let e = _mm256_setr_epi32(1, 0, 0, 4, 0, 6, 7, 0);
4818 assert_eq_m256i(r, e);
4819 }
4820
4821 #[simd_test(enable = "avx2")]
4822 unsafe fn test_mm_maskload_epi64() {
4823 let nums = [1_i64, 2_i64];
4824 let a = &nums as *const i64;
4825 let mask = _mm_setr_epi64x(0, -1);
4826 let r = _mm_maskload_epi64(a, mask);
4827 let e = _mm_setr_epi64x(0, 2);
4828 assert_eq_m128i(r, e);
4829 }
4830
4831 #[simd_test(enable = "avx2")]
4832 unsafe fn test_mm256_maskload_epi64() {
4833 let nums = [1_i64, 2_i64, 3_i64, 4_i64];
4834 let a = &nums as *const i64;
4835 let mask = _mm256_setr_epi64x(0, -1, -1, 0);
4836 let r = _mm256_maskload_epi64(a, mask);
4837 let e = _mm256_setr_epi64x(0, 2, 3, 0);
4838 assert_eq_m256i(r, e);
4839 }
4840
4841 #[simd_test(enable = "avx2")]
4842 unsafe fn test_mm_maskstore_epi32() {
4843 let a = _mm_setr_epi32(1, 2, 3, 4);
4844 let mut arr = [-1, -1, -1, -1];
4845 let mask = _mm_setr_epi32(-1, 0, 0, -1);
4846 _mm_maskstore_epi32(arr.as_mut_ptr(), mask, a);
4847 let e = [1, -1, -1, 4];
4848 assert_eq!(arr, e);
4849 }
4850
4851 #[simd_test(enable = "avx2")]
4852 unsafe fn test_mm256_maskstore_epi32() {
4853 let a = _mm256_setr_epi32(1, 0x6d726f, 3, 42, 0x777161, 6, 7, 8);
4854 let mut arr = [-1, -1, -1, 0x776173, -1, 0x68657265, -1, -1];
4855 let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0);
4856 _mm256_maskstore_epi32(arr.as_mut_ptr(), mask, a);
4857 let e = [1, -1, -1, 42, -1, 6, 7, -1];
4858 assert_eq!(arr, e);
4859 }
4860
4861 #[simd_test(enable = "avx2")]
4862 unsafe fn test_mm_maskstore_epi64() {
4863 let a = _mm_setr_epi64x(1_i64, 2_i64);
4864 let mut arr = [-1_i64, -1_i64];
4865 let mask = _mm_setr_epi64x(0, -1);
4866 _mm_maskstore_epi64(arr.as_mut_ptr(), mask, a);
4867 let e = [-1, 2];
4868 assert_eq!(arr, e);
4869 }
4870
4871 #[simd_test(enable = "avx2")]
4872 unsafe fn test_mm256_maskstore_epi64() {
4873 let a = _mm256_setr_epi64x(1_i64, 2_i64, 3_i64, 4_i64);
4874 let mut arr = [-1_i64, -1_i64, -1_i64, -1_i64];
4875 let mask = _mm256_setr_epi64x(0, -1, -1, 0);
4876 _mm256_maskstore_epi64(arr.as_mut_ptr(), mask, a);
4877 let e = [-1, 2, 3, -1];
4878 assert_eq!(arr, e);
4879 }
4880
4881 #[simd_test(enable = "avx2")]
4882 unsafe fn test_mm256_max_epi16() {
4883 let a = _mm256_set1_epi16(2);
4884 let b = _mm256_set1_epi16(4);
4885 let r = _mm256_max_epi16(a, b);
4886 assert_eq_m256i(r, b);
4887 }
4888
4889 #[simd_test(enable = "avx2")]
4890 unsafe fn test_mm256_max_epi32() {
4891 let a = _mm256_set1_epi32(2);
4892 let b = _mm256_set1_epi32(4);
4893 let r = _mm256_max_epi32(a, b);
4894 assert_eq_m256i(r, b);
4895 }
4896
4897 #[simd_test(enable = "avx2")]
4898 unsafe fn test_mm256_max_epi8() {
4899 let a = _mm256_set1_epi8(2);
4900 let b = _mm256_set1_epi8(4);
4901 let r = _mm256_max_epi8(a, b);
4902 assert_eq_m256i(r, b);
4903 }
4904
4905 #[simd_test(enable = "avx2")]
4906 unsafe fn test_mm256_max_epu16() {
4907 let a = _mm256_set1_epi16(2);
4908 let b = _mm256_set1_epi16(4);
4909 let r = _mm256_max_epu16(a, b);
4910 assert_eq_m256i(r, b);
4911 }
4912
4913 #[simd_test(enable = "avx2")]
4914 unsafe fn test_mm256_max_epu32() {
4915 let a = _mm256_set1_epi32(2);
4916 let b = _mm256_set1_epi32(4);
4917 let r = _mm256_max_epu32(a, b);
4918 assert_eq_m256i(r, b);
4919 }
4920
4921 #[simd_test(enable = "avx2")]
4922 unsafe fn test_mm256_max_epu8() {
4923 let a = _mm256_set1_epi8(2);
4924 let b = _mm256_set1_epi8(4);
4925 let r = _mm256_max_epu8(a, b);
4926 assert_eq_m256i(r, b);
4927 }
4928
4929 #[simd_test(enable = "avx2")]
4930 unsafe fn test_mm256_min_epi16() {
4931 let a = _mm256_set1_epi16(2);
4932 let b = _mm256_set1_epi16(4);
4933 let r = _mm256_min_epi16(a, b);
4934 assert_eq_m256i(r, a);
4935 }
4936
4937 #[simd_test(enable = "avx2")]
4938 unsafe fn test_mm256_min_epi32() {
4939 let a = _mm256_set1_epi32(2);
4940 let b = _mm256_set1_epi32(4);
4941 let r = _mm256_min_epi32(a, b);
4942 assert_eq_m256i(r, a);
4943 }
4944
4945 #[simd_test(enable = "avx2")]
4946 unsafe fn test_mm256_min_epi8() {
4947 let a = _mm256_set1_epi8(2);
4948 let b = _mm256_set1_epi8(4);
4949 let r = _mm256_min_epi8(a, b);
4950 assert_eq_m256i(r, a);
4951 }
4952
4953 #[simd_test(enable = "avx2")]
4954 unsafe fn test_mm256_min_epu16() {
4955 let a = _mm256_set1_epi16(2);
4956 let b = _mm256_set1_epi16(4);
4957 let r = _mm256_min_epu16(a, b);
4958 assert_eq_m256i(r, a);
4959 }
4960
4961 #[simd_test(enable = "avx2")]
4962 unsafe fn test_mm256_min_epu32() {
4963 let a = _mm256_set1_epi32(2);
4964 let b = _mm256_set1_epi32(4);
4965 let r = _mm256_min_epu32(a, b);
4966 assert_eq_m256i(r, a);
4967 }
4968
4969 #[simd_test(enable = "avx2")]
4970 unsafe fn test_mm256_min_epu8() {
4971 let a = _mm256_set1_epi8(2);
4972 let b = _mm256_set1_epi8(4);
4973 let r = _mm256_min_epu8(a, b);
4974 assert_eq_m256i(r, a);
4975 }
4976
4977 #[simd_test(enable = "avx2")]
4978 unsafe fn test_mm256_movemask_epi8() {
4979 let a = _mm256_set1_epi8(-1);
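// Every byte has its sign bit set, so the 32-bit mask is all ones, i.e. -1.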
4980 let r = _mm256_movemask_epi8(a);
4981 let e = -1;
4982 assert_eq!(r, e);
4983 }
4984
4985 #[simd_test(enable = "avx2")]
4986 unsafe fn test_mm256_mpsadbw_epu8() {
4987 let a = _mm256_set1_epi8(2);
4988 let b = _mm256_set1_epi8(4);
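// Each result sums four absolute byte differences: 4 * |2 - 4| == 8.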
4989 let r = _mm256_mpsadbw_epu8(a, b, 0);
4990 let e = _mm256_set1_epi16(8);
4991 assert_eq_m256i(r, e);
4992 }
4993
4994 #[simd_test(enable = "avx2")]
4995 unsafe fn test_mm256_mul_epi32() {
4996 let a = _mm256_setr_epi32(0, 0, 0, 0, 2, 2, 2, 2);
4997 let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
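// Only the even-indexed 32-bit elements are multiplied: 0*1, 0*3, 2*5, 2*7.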
4998 let r = _mm256_mul_epi32(a, b);
4999 let e = _mm256_setr_epi64x(0, 0, 10, 14);
5000 assert_eq_m256i(r, e);
5001 }
5002
5003 #[simd_test(enable = "avx2")]
5004 unsafe fn test_mm256_mul_epu32() {
5005 let a = _mm256_setr_epi32(0, 0, 0, 0, 2, 2, 2, 2);
5006 let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
5007 let r = _mm256_mul_epu32(a, b);
5008 let e = _mm256_setr_epi64x(0, 0, 10, 14);
5009 assert_eq_m256i(r, e);
5010 }
5011
5012 #[simd_test(enable = "avx2")]
5013 unsafe fn test_mm256_mulhi_epi16() {
5014 let a = _mm256_set1_epi16(6535);
5015 let b = _mm256_set1_epi16(6535);
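// 6535 * 6535 == 42_706_225; its high 16 bits are 42_706_225 >> 16 == 651.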
5016 let r = _mm256_mulhi_epi16(a, b);
5017 let e = _mm256_set1_epi16(651);
5018 assert_eq_m256i(r, e);
5019 }
5020
5021 #[simd_test(enable = "avx2")]
5022 unsafe fn test_mm256_mulhi_epu16() {
5023 let a = _mm256_set1_epi16(6535);
5024 let b = _mm256_set1_epi16(6535);
5025 let r = _mm256_mulhi_epu16(a, b);
5026 let e = _mm256_set1_epi16(651);
5027 assert_eq_m256i(r, e);
5028 }
5029
5030 #[simd_test(enable = "avx2")]
5031 unsafe fn test_mm256_mullo_epi16() {
5032 let a = _mm256_set1_epi16(2);
5033 let b = _mm256_set1_epi16(4);
5034 let r = _mm256_mullo_epi16(a, b);
5035 let e = _mm256_set1_epi16(8);
5036 assert_eq_m256i(r, e);
5037 }
5038
5039 #[simd_test(enable = "avx2")]
5040 unsafe fn test_mm256_mullo_epi32() {
5041 let a = _mm256_set1_epi32(2);
5042 let b = _mm256_set1_epi32(4);
5043 let r = _mm256_mullo_epi32(a, b);
5044 let e = _mm256_set1_epi32(8);
5045 assert_eq_m256i(r, e);
5046 }
5047
5048 #[simd_test(enable = "avx2")]
5049 unsafe fn test_mm256_mulhrs_epi16() {
5050 let a = _mm256_set1_epi16(2);
5051 let b = _mm256_set1_epi16(4);
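// Fixed-point rounding multiply: (((2 * 4) >> 14) + 1) >> 1 == 0.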
5052 let r = _mm256_mulhrs_epi16(a, b);
5053 let e = _mm256_set1_epi16(0);
5054 assert_eq_m256i(r, e);
5055 }
5056
5057 #[simd_test(enable = "avx2")]
5058 unsafe fn test_mm256_or_si256() {
5059 let a = _mm256_set1_epi8(-1);
5060 let b = _mm256_set1_epi8(0);
5061 let r = _mm256_or_si256(a, b);
5062 assert_eq_m256i(r, a);
5063 }
5064
5065 #[simd_test(enable = "avx2")]
5066 unsafe fn test_mm256_packs_epi16() {
5067 let a = _mm256_set1_epi16(2);
5068 let b = _mm256_set1_epi16(4);
5069 let r = _mm256_packs_epi16(a, b);
5070 #[rustfmt::skip]
5071 let e = _mm256_setr_epi8(
5072 2, 2, 2, 2, 2, 2, 2, 2,
5073 4, 4, 4, 4, 4, 4, 4, 4,
5074 2, 2, 2, 2, 2, 2, 2, 2,
5075 4, 4, 4, 4, 4, 4, 4, 4,
5076 );
5077
5078 assert_eq_m256i(r, e);
5079 }
5080
5081 #[simd_test(enable = "avx2")]
5082 unsafe fn test_mm256_packs_epi32() {
5083 let a = _mm256_set1_epi32(2);
5084 let b = _mm256_set1_epi32(4);
5085 let r = _mm256_packs_epi32(a, b);
5086 let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
5087
5088 assert_eq_m256i(r, e);
5089 }
5090
5091 #[simd_test(enable = "avx2")]
5092 unsafe fn test_mm256_packus_epi16() {
5093 let a = _mm256_set1_epi16(2);
5094 let b = _mm256_set1_epi16(4);
5095 let r = _mm256_packus_epi16(a, b);
5096 #[rustfmt::skip]
5097 let e = _mm256_setr_epi8(
5098 2, 2, 2, 2, 2, 2, 2, 2,
5099 4, 4, 4, 4, 4, 4, 4, 4,
5100 2, 2, 2, 2, 2, 2, 2, 2,
5101 4, 4, 4, 4, 4, 4, 4, 4,
5102 );
5103
5104 assert_eq_m256i(r, e);
5105 }
5106
5107 #[simd_test(enable = "avx2")]
5108 unsafe fn test_mm256_packus_epi32() {
5109 let a = _mm256_set1_epi32(2);
5110 let b = _mm256_set1_epi32(4);
5111 let r = _mm256_packus_epi32(a, b);
5112 let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
5113
5114 assert_eq_m256i(r, e);
5115 }
5116
5117 #[simd_test(enable = "avx2")]
5118 unsafe fn test_mm256_sad_epu8() {
5119 let a = _mm256_set1_epi8(2);
5120 let b = _mm256_set1_epi8(4);
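// Each 64-bit lane sums eight absolute byte differences: 8 * |2 - 4| == 16.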
5121 let r = _mm256_sad_epu8(a, b);
5122 let e = _mm256_set1_epi64x(16);
5123 assert_eq_m256i(r, e);
5124 }
5125
5126 #[simd_test(enable = "avx2")]
5127 unsafe fn test_mm256_shufflehi_epi16() {
5128 #[rustfmt::skip]
5129 let a = _mm256_setr_epi16(
5130 0, 1, 2, 3, 11, 22, 33, 44,
5131 4, 5, 6, 7, 55, 66, 77, 88,
5132 );
5133 #[rustfmt::skip]
5134 let e = _mm256_setr_epi16(
5135 0, 1, 2, 3, 44, 22, 22, 11,
5136 4, 5, 6, 7, 88, 66, 66, 55,
5137 );
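// 0b00_01_01_11 reorders the upper four words of each lane to source indices 3, 1, 1, 0.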
5138 let r = _mm256_shufflehi_epi16(a, 0b00_01_01_11);
5139 assert_eq_m256i(r, e);
5140 }
5141
5142 #[simd_test(enable = "avx2")]
5143 unsafe fn test_mm256_shufflelo_epi16() {
5144 #[rustfmt::skip]
5145 let a = _mm256_setr_epi16(
5146 11, 22, 33, 44, 0, 1, 2, 3,
5147 55, 66, 77, 88, 4, 5, 6, 7,
5148 );
5149 #[rustfmt::skip]
5150 let e = _mm256_setr_epi16(
5151 44, 22, 22, 11, 0, 1, 2, 3,
5152 88, 66, 66, 55, 4, 5, 6, 7,
5153 );
5154 let r = _mm256_shufflelo_epi16(a, 0b00_01_01_11);
5155 assert_eq_m256i(r, e);
5156 }
5157
5158 #[simd_test(enable = "avx2")]
5159 unsafe fn test_mm256_sign_epi16() {
5160 let a = _mm256_set1_epi16(2);
5161 let b = _mm256_set1_epi16(-1);
5162 let r = _mm256_sign_epi16(a, b);
5163 let e = _mm256_set1_epi16(-2);
5164 assert_eq_m256i(r, e);
5165 }
5166
5167 #[simd_test(enable = "avx2")]
5168 unsafe fn test_mm256_sign_epi32() {
5169 let a = _mm256_set1_epi32(2);
5170 let b = _mm256_set1_epi32(-1);
5171 let r = _mm256_sign_epi32(a, b);
5172 let e = _mm256_set1_epi32(-2);
5173 assert_eq_m256i(r, e);
5174 }
5175
5176 #[simd_test(enable = "avx2")]
5177 unsafe fn test_mm256_sign_epi8() {
5178 let a = _mm256_set1_epi8(2);
5179 let b = _mm256_set1_epi8(-1);
5180 let r = _mm256_sign_epi8(a, b);
5181 let e = _mm256_set1_epi8(-2);
5182 assert_eq_m256i(r, e);
5183 }
5184
5185 #[simd_test(enable = "avx2")]
5186 unsafe fn test_mm256_sll_epi16() {
5187 let a = _mm256_set1_epi16(0xFF);
5188 let b = _mm_insert_epi16(_mm_set1_epi16(0), 4, 0);
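// The shift count comes from the low 64 bits of b; 0xff << 4 == 0xff0.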
5189 let r = _mm256_sll_epi16(a, b);
5190 assert_eq_m256i(r, _mm256_set1_epi16(0xFF0));
5191 }
5192
5193 #[simd_test(enable = "avx2")]
5194 unsafe fn test_mm256_sll_epi32() {
5195 let a = _mm256_set1_epi32(0xFFFF);
5196 let b = _mm_insert_epi32(_mm_set1_epi32(0), 4, 0);
5197 let r = _mm256_sll_epi32(a, b);
5198 assert_eq_m256i(r, _mm256_set1_epi32(0xFFFF0));
5199 }
5200
5201 #[simd_test(enable = "avx2")]
5202 unsafe fn test_mm256_sll_epi64() {
5203 let a = _mm256_set1_epi64x(0xFFFFFFFF);
5204 let b = _mm_insert_epi64(_mm_set1_epi64x(0), 4, 0);
5205 let r = _mm256_sll_epi64(a, b);
5206 assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFFF0));
5207 }
5208
5209 #[simd_test(enable = "avx2")]
5210 unsafe fn test_mm256_slli_epi16() {
5211 assert_eq_m256i(
5212 _mm256_slli_epi16(_mm256_set1_epi16(0xFF), 4),
5213 _mm256_set1_epi16(0xFF0),
5214 );
5215 }
5216
5217 #[simd_test(enable = "avx2")]
5218 unsafe fn test_mm256_slli_epi32() {
5219 assert_eq_m256i(
5220 _mm256_slli_epi32(_mm256_set1_epi32(0xFFFF), 4),
5221 _mm256_set1_epi32(0xFFFF0),
5222 );
5223 }
5224
5225 #[simd_test(enable = "avx2")]
5226 unsafe fn test_mm256_slli_epi64() {
5227 assert_eq_m256i(
5228 _mm256_slli_epi64(_mm256_set1_epi64x(0xFFFFFFFF), 4),
5229 _mm256_set1_epi64x(0xFFFFFFFF0),
5230 );
5231 }
5232
5233 #[simd_test(enable = "avx2")]
5234 unsafe fn test_mm256_slli_si256() {
5235 let a = _mm256_set1_epi64x(0xFFFFFFFF);
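// The byte shift applies to each 128-bit lane independently.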
5236 let r = _mm256_slli_si256(a, 3);
5237 assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFFF000000));
5238 }
5239
5240 #[simd_test(enable = "avx2")]
5241 unsafe fn test_mm_sllv_epi32() {
5242 let a = _mm_set1_epi32(2);
5243 let b = _mm_set1_epi32(1);
5244 let r = _mm_sllv_epi32(a, b);
5245 let e = _mm_set1_epi32(4);
5246 assert_eq_m128i(r, e);
5247 }
5248
5249 #[simd_test(enable = "avx2")]
5250 unsafe fn test_mm256_sllv_epi32() {
5251 let a = _mm256_set1_epi32(2);
5252 let b = _mm256_set1_epi32(1);
5253 let r = _mm256_sllv_epi32(a, b);
5254 let e = _mm256_set1_epi32(4);
5255 assert_eq_m256i(r, e);
5256 }
5257
5258 #[simd_test(enable = "avx2")]
5259 unsafe fn test_mm_sllv_epi64() {
5260 let a = _mm_set1_epi64x(2);
5261 let b = _mm_set1_epi64x(1);
5262 let r = _mm_sllv_epi64(a, b);
5263 let e = _mm_set1_epi64x(4);
5264 assert_eq_m128i(r, e);
5265 }
5266
5267 #[simd_test(enable = "avx2")]
5268 unsafe fn test_mm256_sllv_epi64() {
5269 let a = _mm256_set1_epi64x(2);
5270 let b = _mm256_set1_epi64x(1);
5271 let r = _mm256_sllv_epi64(a, b);
5272 let e = _mm256_set1_epi64x(4);
5273 assert_eq_m256i(r, e);
5274 }
5275
5276 #[simd_test(enable = "avx2")]
5277 unsafe fn test_mm256_sra_epi16() {
5278 let a = _mm256_set1_epi16(-1);
5279 let b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
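// The low 64 bits of b give a count of 1; the arithmetic shift keeps the sign, so -1 >> 1 == -1.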
5280 let r = _mm256_sra_epi16(a, b);
5281 assert_eq_m256i(r, _mm256_set1_epi16(-1));
5282 }
5283
5284 #[simd_test(enable = "avx2")]
5285 unsafe fn test_mm256_sra_epi32() {
5286 let a = _mm256_set1_epi32(-1);
5287 let b = _mm_insert_epi32(_mm_set1_epi32(0), 1, 0);
5288 let r = _mm256_sra_epi32(a, b);
5289 assert_eq_m256i(r, _mm256_set1_epi32(-1));
5290 }
5291
5292 #[simd_test(enable = "avx2")]
5293 unsafe fn test_mm256_srai_epi16() {
5294 assert_eq_m256i(
5295 _mm256_srai_epi16(_mm256_set1_epi16(-1), 1),
5296 _mm256_set1_epi16(-1),
5297 );
5298 }
5299
5300 #[simd_test(enable = "avx2")]
5301 unsafe fn test_mm256_srai_epi32() {
5302 assert_eq_m256i(
5303 _mm256_srai_epi32(_mm256_set1_epi32(-1), 1),
5304 _mm256_set1_epi32(-1),
5305 );
5306 }
5307
5308 #[simd_test(enable = "avx2")]
5309 unsafe fn test_mm_srav_epi32() {
5310 let a = _mm_set1_epi32(4);
5311 let count = _mm_set1_epi32(1);
5312 let r = _mm_srav_epi32(a, count);
5313 let e = _mm_set1_epi32(2);
5314 assert_eq_m128i(r, e);
5315 }
5316
5317 #[simd_test(enable = "avx2")]
5318 unsafe fn test_mm256_srav_epi32() {
5319 let a = _mm256_set1_epi32(4);
5320 let count = _mm256_set1_epi32(1);
5321 let r = _mm256_srav_epi32(a, count);
5322 let e = _mm256_set1_epi32(2);
5323 assert_eq_m256i(r, e);
5324 }
5325
5326 #[simd_test(enable = "avx2")]
5327 unsafe fn test_mm256_srli_si256() {
5328 #[rustfmt::skip]
5329 let a = _mm256_setr_epi8(
5330 1, 2, 3, 4, 5, 6, 7, 8,
5331 9, 10, 11, 12, 13, 14, 15, 16,
5332 17, 18, 19, 20, 21, 22, 23, 24,
5333 25, 26, 27, 28, 29, 30, 31, 32,
5334 );
5335 let r = _mm256_srli_si256(a, 3);
5336 #[rustfmt::skip]
5337 let e = _mm256_setr_epi8(
5338 4, 5, 6, 7, 8, 9, 10, 11,
5339 12, 13, 14, 15, 16, 0, 0, 0,
5340 20, 21, 22, 23, 24, 25, 26, 27,
5341 28, 29, 30, 31, 32, 0, 0, 0,
5342 );
5343 assert_eq_m256i(r, e);
5344 }
5345
5346 #[simd_test(enable = "avx2")]
5347 unsafe fn test_mm256_srl_epi16() {
5348 let a = _mm256_set1_epi16(0xFF);
5349 let b = _mm_insert_epi16(_mm_set1_epi16(0), 4, 0);
5350 let r = _mm256_srl_epi16(a, b);
5351 assert_eq_m256i(r, _mm256_set1_epi16(0xF));
5352 }
5353
5354 #[simd_test(enable = "avx2")]
5355 unsafe fn test_mm256_srl_epi32() {
5356 let a = _mm256_set1_epi32(0xFFFF);
5357 let b = _mm_insert_epi32(_mm_set1_epi32(0), 4, 0);
5358 let r = _mm256_srl_epi32(a, b);
5359 assert_eq_m256i(r, _mm256_set1_epi32(0xFFF));
5360 }
5361
5362 #[simd_test(enable = "avx2")]
5363 unsafe fn test_mm256_srl_epi64() {
5364 let a = _mm256_set1_epi64x(0xFFFFFFFF);
5365 let b = _mm_setr_epi64x(4, 0);
5366 let r = _mm256_srl_epi64(a, b);
5367 assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFF));
5368 }
5369
5370 #[simd_test(enable = "avx2")]
5371 unsafe fn test_mm256_srli_epi16() {
5372 assert_eq_m256i(
5373 _mm256_srli_epi16(_mm256_set1_epi16(0xFF), 4),
5374 _mm256_set1_epi16(0xF),
5375 );
5376 }
5377
5378 #[simd_test(enable = "avx2")]
5379 unsafe fn test_mm256_srli_epi32() {
5380 assert_eq_m256i(
5381 _mm256_srli_epi32(_mm256_set1_epi32(0xFFFF), 4),
5382 _mm256_set1_epi32(0xFFF),
5383 );
5384 }
5385
5386 #[simd_test(enable = "avx2")]
5387 unsafe fn test_mm256_srli_epi64() {
5388 assert_eq_m256i(
5389 _mm256_srli_epi64(_mm256_set1_epi64x(0xFFFFFFFF), 4),
5390 _mm256_set1_epi64x(0xFFFFFFF),
5391 );
5392 }
5393
5394 #[simd_test(enable = "avx2")]
5395 unsafe fn test_mm_srlv_epi32() {
5396 let a = _mm_set1_epi32(2);
5397 let count = _mm_set1_epi32(1);
5398 let r = _mm_srlv_epi32(a, count);
5399 let e = _mm_set1_epi32(1);
5400 assert_eq_m128i(r, e);
5401 }
5402
5403 #[simd_test(enable = "avx2")]
5404 unsafe fn test_mm256_srlv_epi32() {
5405 let a = _mm256_set1_epi32(2);
5406 let count = _mm256_set1_epi32(1);
5407 let r = _mm256_srlv_epi32(a, count);
5408 let e = _mm256_set1_epi32(1);
5409 assert_eq_m256i(r, e);
5410 }
5411
5412 #[simd_test(enable = "avx2")]
5413 unsafe fn test_mm_srlv_epi64() {
5414 let a = _mm_set1_epi64x(2);
5415 let count = _mm_set1_epi64x(1);
5416 let r = _mm_srlv_epi64(a, count);
5417 let e = _mm_set1_epi64x(1);
5418 assert_eq_m128i(r, e);
5419 }
5420
5421 #[simd_test(enable = "avx2")]
5422 unsafe fn test_mm256_srlv_epi64() {
5423 let a = _mm256_set1_epi64x(2);
5424 let count = _mm256_set1_epi64x(1);
5425 let r = _mm256_srlv_epi64(a, count);
5426 let e = _mm256_set1_epi64x(1);
5427 assert_eq_m256i(r, e);
5428 }
5429
5430 #[simd_test(enable = "avx2")]
5431 unsafe fn test_mm256_sub_epi16() {
5432 let a = _mm256_set1_epi16(4);
5433 let b = _mm256_set1_epi16(2);
5434 let r = _mm256_sub_epi16(a, b);
5435 assert_eq_m256i(r, b);
5436 }
5437
5438 #[simd_test(enable = "avx2")]
5439 unsafe fn test_mm256_sub_epi32() {
5440 let a = _mm256_set1_epi32(4);
5441 let b = _mm256_set1_epi32(2);
5442 let r = _mm256_sub_epi32(a, b);
5443 assert_eq_m256i(r, b);
5444 }
5445
5446 #[simd_test(enable = "avx2")]
5447 unsafe fn test_mm256_sub_epi64() {
5448 let a = _mm256_set1_epi64x(4);
5449 let b = _mm256_set1_epi64x(2);
5450 let r = _mm256_sub_epi64(a, b);
5451 assert_eq_m256i(r, b);
5452 }
5453
5454 #[simd_test(enable = "avx2")]
5455 unsafe fn test_mm256_sub_epi8() {
5456 let a = _mm256_set1_epi8(4);
5457 let b = _mm256_set1_epi8(2);
5458 let r = _mm256_sub_epi8(a, b);
5459 assert_eq_m256i(r, b);
5460 }
5461
5462 #[simd_test(enable = "avx2")]
5463 unsafe fn test_mm256_subs_epi16() {
5464 let a = _mm256_set1_epi16(4);
5465 let b = _mm256_set1_epi16(2);
5466 let r = _mm256_subs_epi16(a, b);
5467 assert_eq_m256i(r, b);
5468 }
5469
5470 #[simd_test(enable = "avx2")]
5471 unsafe fn test_mm256_subs_epi8() {
5472 let a = _mm256_set1_epi8(4);
5473 let b = _mm256_set1_epi8(2);
5474 let r = _mm256_subs_epi8(a, b);
5475 assert_eq_m256i(r, b);
5476 }
5477
5478 #[simd_test(enable = "avx2")]
5479 unsafe fn test_mm256_subs_epu16() {
5480 let a = _mm256_set1_epi16(4);
5481 let b = _mm256_set1_epi16(2);
5482 let r = _mm256_subs_epu16(a, b);
5483 assert_eq_m256i(r, b);
5484 }
5485
5486 #[simd_test(enable = "avx2")]
5487 unsafe fn test_mm256_subs_epu8() {
5488 let a = _mm256_set1_epi8(4);
5489 let b = _mm256_set1_epi8(2);
5490 let r = _mm256_subs_epu8(a, b);
5491 assert_eq_m256i(r, b);
5492 }
5493
5494 #[simd_test(enable = "avx2")]
5495 unsafe fn test_mm256_xor_si256() {
5496 let a = _mm256_set1_epi8(5);
5497 let b = _mm256_set1_epi8(3);
5498 let r = _mm256_xor_si256(a, b);
5499 assert_eq_m256i(r, _mm256_set1_epi8(6));
5500 }
5501
5502 #[simd_test(enable = "avx2")]
5503 unsafe fn test_mm256_alignr_epi8() {
5504 #[rustfmt::skip]
5505 let a = _mm256_setr_epi8(
5506 1, 2, 3, 4, 5, 6, 7, 8,
5507 9, 10, 11, 12, 13, 14, 15, 16,
5508 17, 18, 19, 20, 21, 22, 23, 24,
5509 25, 26, 27, 28, 29, 30, 31, 32,
5510 );
5511 #[rustfmt::skip]
5512 let b = _mm256_setr_epi8(
5513 -1, -2, -3, -4, -5, -6, -7, -8,
5514 -9, -10, -11, -12, -13, -14, -15, -16,
5515 -17, -18, -19, -20, -21, -22, -23, -24,
5516 -25, -26, -27, -28, -29, -30, -31, -32,
5517 );
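// A byte offset of 33 exceeds the 32-byte concatenation of each lane pair, so the result is all zeros.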
5518 let r = _mm256_alignr_epi8(a, b, 33);
5519 assert_eq_m256i(r, _mm256_set1_epi8(0));
5520
5521 let r = _mm256_alignr_epi8(a, b, 17);
5522 #[rustfmt::skip]
5523 let expected = _mm256_setr_epi8(
5524 2, 3, 4, 5, 6, 7, 8, 9,
5525 10, 11, 12, 13, 14, 15, 16, 0,
5526 18, 19, 20, 21, 22, 23, 24, 25,
5527 26, 27, 28, 29, 30, 31, 32, 0,
5528 );
5529 assert_eq_m256i(r, expected);
5530
5531 let r = _mm256_alignr_epi8(a, b, 4);
5532 #[rustfmt::skip]
5533 let expected = _mm256_setr_epi8(
5534 -5, -6, -7, -8, -9, -10, -11, -12,
5535 -13, -14, -15, -16, 1, 2, 3, 4,
5536 -21, -22, -23, -24, -25, -26, -27, -28,
5537 -29, -30, -31, -32, 17, 18, 19, 20,
5538 );
5539 assert_eq_m256i(r, expected);
5540
5541 #[rustfmt::skip]
5542 let expected = _mm256_setr_epi8(
5543 -1, -2, -3, -4, -5, -6, -7, -8,
5544 -9, -10, -11, -12, -13, -14, -15, -16,
5545 -17, -18, -19, -20, -21, -22, -23, -24,
5546 -25, -26, -27, -28, -29, -30, -31, -32,
5547 );
5548 let r = _mm256_alignr_epi8(a, b, 16);
5549 assert_eq_m256i(r, expected);
5550
5551 let r = _mm256_alignr_epi8(a, b, 15);
5552 #[rustfmt::skip]
5553 let expected = _mm256_setr_epi8(
5554 -16, 1, 2, 3, 4, 5, 6, 7,
5555 8, 9, 10, 11, 12, 13, 14, 15,
5556 -32, 17, 18, 19, 20, 21, 22, 23,
5557 24, 25, 26, 27, 28, 29, 30, 31,
5558 );
5559 assert_eq_m256i(r, expected);
5560
5561 let r = _mm256_alignr_epi8(a, b, 0);
5562 assert_eq_m256i(r, b);
5563 }
5564
5565 #[simd_test(enable = "avx2")]
5566 unsafe fn test_mm256_shuffle_epi8() {
5567 #[rustfmt::skip]
5568 let a = _mm256_setr_epi8(
5569 1, 2, 3, 4, 5, 6, 7, 8,
5570 9, 10, 11, 12, 13, 14, 15, 16,
5571 17, 18, 19, 20, 21, 22, 23, 24,
5572 25, 26, 27, 28, 29, 30, 31, 32,
5573 );
5574 #[rustfmt::skip]
5575 let b = _mm256_setr_epi8(
5576 4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
5577 12, 5, 5, 10, 4, 1, 8, 0,
5578 4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
5579 12, 5, 5, 10, 4, 1, 8, 0,
5580 );
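// Control bytes with the high bit set (128) zero the result byte; otherwise the low four bits index within the same 128-bit lane.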
5581 #[rustfmt::skip]
5582 let expected = _mm256_setr_epi8(
5583 5, 0, 5, 4, 9, 13, 7, 4,
5584 13, 6, 6, 11, 5, 2, 9, 1,
5585 21, 0, 21, 20, 25, 29, 23, 20,
5586 29, 22, 22, 27, 21, 18, 25, 17,
5587 );
5588 let r = _mm256_shuffle_epi8(a, b);
5589 assert_eq_m256i(r, expected);
5590 }
5591
5592 #[simd_test(enable = "avx2")]
5593 unsafe fn test_mm256_permutevar8x32_epi32() {
5594 let a = _mm256_setr_epi32(100, 200, 300, 400, 500, 600, 700, 800);
5595 let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
5596 let expected = _mm256_setr_epi32(600, 100, 600, 200, 800, 700, 400, 500);
5597 let r = _mm256_permutevar8x32_epi32(a, b);
5598 assert_eq_m256i(r, expected);
5599 }
5600
5601 #[simd_test(enable = "avx2")]
5602 unsafe fn test_mm256_permute4x64_epi64() {
5603 let a = _mm256_setr_epi64x(100, 200, 300, 400);
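// Selector 0b00_01_00_11 picks source elements 3, 0, 1, 0.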
5604 let expected = _mm256_setr_epi64x(400, 100, 200, 100);
5605 let r = _mm256_permute4x64_epi64(a, 0b00010011);
5606 assert_eq_m256i(r, expected);
5607 }
5608
5609 #[simd_test(enable = "avx2")]
5610 unsafe fn test_mm256_permute2x128_si256() {
5611 let a = _mm256_setr_epi64x(100, 200, 500, 600);
5612 let b = _mm256_setr_epi64x(300, 400, 700, 800);
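// Selector 0b00_01_00_11: the result's low half takes b's high 128 bits, the high half takes a's high 128 bits.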
5613 let r = _mm256_permute2x128_si256(a, b, 0b00_01_00_11);
5614 let e = _mm256_setr_epi64x(700, 800, 500, 600);
5615 assert_eq_m256i(r, e);
5616 }
5617
5618 #[simd_test(enable = "avx2")]
5619 unsafe fn test_mm256_permute4x64_pd() {
5620 let a = _mm256_setr_pd(1., 2., 3., 4.);
5621 let r = _mm256_permute4x64_pd(a, 0b00_01_00_11);
5622 let e = _mm256_setr_pd(4., 1., 2., 1.);
5623 assert_eq_m256d(r, e);
5624 }
5625
5626 #[simd_test(enable = "avx2")]
5627 unsafe fn test_mm256_permutevar8x32_ps() {
5628 let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
5629 let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
5630 let r = _mm256_permutevar8x32_ps(a, b);
5631 let e = _mm256_setr_ps(6., 1., 6., 2., 8., 7., 4., 5.);
5632 assert_eq_m256(r, e);
5633 }
5634
5635 #[simd_test(enable = "avx2")]
5636 unsafe fn test_mm_i32gather_epi32() {
5637 let mut arr = [0i32; 128];
5638 for i in 0..128i32 {
5639 arr[i as usize] = i;
5640 }
5641 // A multiplier of 4 is word-addressing
5642 let r = _mm_i32gather_epi32(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48), 4);
5643 assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48));
5644 }
5645
5646 #[simd_test(enable = "avx2")]
5647 unsafe fn test_mm_mask_i32gather_epi32() {
5648 let mut arr = [0i32; 128];
5649 for i in 0..128i32 {
5650 arr[i as usize] = i;
5651 }
5652 // A multiplier of 4 is word-addressing
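// Scalar sketch of the masked gather as used here (offsets are element indices at scale 4):
//   dst[i] = if mask[i] < 0 { arr[offset[i]] } else { src[i] }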
5653 let r = _mm_mask_i32gather_epi32(
5654 _mm_set1_epi32(256),
5655 arr.as_ptr(),
5656 _mm_setr_epi32(0, 16, 64, 96),
5657 _mm_setr_epi32(-1, -1, -1, 0),
5658 4,
5659 );
5660 assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256));
5661 }
5662
5663 #[simd_test(enable = "avx2")]
5664 unsafe fn test_mm256_i32gather_epi32() {
5665 let mut arr = [0i32; 128];
5666 for i in 0..128i32 {
5667 arr[i as usize] = i;
5668 }
5669 // A multiplier of 4 is word-addressing
5670 let r = _mm256_i32gather_epi32(
5671 arr.as_ptr(),
5672 _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4),
5673 4,
5674 );
5675 assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
5676 }
5677
5678 #[simd_test(enable = "avx2")]
5679 unsafe fn test_mm256_mask_i32gather_epi32() {
5680 let mut arr = [0i32; 128];
5681 for i in 0..128i32 {
5682 arr[i as usize] = i;
5683 }
5684 // A multiplier of 4 is word-addressing
5685 let r = _mm256_mask_i32gather_epi32(
5686 _mm256_set1_epi32(256),
5687 arr.as_ptr(),
5688 _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0),
5689 _mm256_setr_epi32(-1, -1, -1, 0, 0, 0, 0, 0),
5690 4,
5691 );
5692 assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 64, 256, 256, 256, 256, 256));
5693 }
5694
5695 #[simd_test(enable = "avx2")]
5696 unsafe fn test_mm_i32gather_ps() {
5697 let mut arr = [0.0f32; 128];
5698 let mut j = 0.0;
5699 for i in 0..128usize {
5700 arr[i] = j;
5701 j += 1.0;
5702 }
5703 // A multiplier of 4 is word-addressing for f32s
5704 let r = _mm_i32gather_ps(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48), 4);
5705 assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0));
5706 }
5707
5708 #[simd_test(enable = "avx2")]
5709 unsafe fn test_mm_mask_i32gather_ps() {
5710 let mut arr = [0.0f32; 128];
5711 let mut j = 0.0;
5712 for i in 0..128usize {
5713 arr[i] = j;
5714 j += 1.0;
5715 }
5716 // A multiplier of 4 is word-addressing for f32s
5717 let r = _mm_mask_i32gather_ps(
5718 _mm_set1_ps(256.0),
5719 arr.as_ptr(),
5720 _mm_setr_epi32(0, 16, 64, 96),
5721 _mm_setr_ps(-1.0, -1.0, -1.0, 0.0),
5722 4,
5723 );
5724 assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0));
5725 }
5726
5727 #[simd_test(enable = "avx2")]
5728 unsafe fn test_mm256_i32gather_ps() {
5729 let mut arr = [0.0f32; 128];
5730 let mut j = 0.0;
5731 for i in 0..128usize {
5732 arr[i] = j;
5733 j += 1.0;
5734 }
5735 // A multiplier of 4 is word-addressing for f32s
5736 let r = _mm256_i32gather_ps(
5737 arr.as_ptr(),
5738 _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4),
5739 4,
5740 );
5741 assert_eq_m256(r, _mm256_setr_ps(0.0, 16.0, 32.0, 48.0, 1.0, 2.0, 3.0, 4.0));
5742 }
5743
5744 #[simd_test(enable = "avx2")]
5745 unsafe fn test_mm256_mask_i32gather_ps() {
5746 let mut arr = [0.0f32; 128];
5747 let mut j = 0.0;
5748 for i in 0..128usize {
5749 arr[i] = j;
5750 j += 1.0;
5751 }
5752 // A multiplier of 4 is word-addressing for f32s
5753 let r = _mm256_mask_i32gather_ps(
5754 _mm256_set1_ps(256.0),
5755 arr.as_ptr(),
5756 _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0),
5757 _mm256_setr_ps(-1.0, -1.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0),
5758 4,
5759 );
5760 assert_eq_m256(
5761 r,
5762 _mm256_setr_ps(0.0, 16.0, 64.0, 256.0, 256.0, 256.0, 256.0, 256.0),
5763 );
5764 }
5765
5766 #[simd_test(enable = "avx2")]
5767 unsafe fn test_mm_i32gather_epi64() {
5768 let mut arr = [0i64; 128];
5769 for i in 0..128i64 {
5770 arr[i as usize] = i;
5771 }
5772 // A multiplier of 8 is word-addressing for i64s
5773 let r = _mm_i32gather_epi64(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0), 8);
5774 assert_eq_m128i(r, _mm_setr_epi64x(0, 16));
5775 }
5776
5777 #[simd_test(enable = "avx2")]
5778 unsafe fn test_mm_mask_i32gather_epi64() {
5779 let mut arr = [0i64; 128];
5780 for i in 0..128i64 {
5781 arr[i as usize] = i;
5782 }
5783 // A multiplier of 8 is word-addressing for i64s
5784 let r = _mm_mask_i32gather_epi64(
5785 _mm_set1_epi64x(256),
5786 arr.as_ptr(),
5787 _mm_setr_epi32(16, 16, 16, 16),
5788 _mm_setr_epi64x(-1, 0),
5789 8,
5790 );
5791 assert_eq_m128i(r, _mm_setr_epi64x(16, 256));
5792 }
5793
5794 #[simd_test(enable = "avx2")]
5795 unsafe fn test_mm256_i32gather_epi64() {
5796 let mut arr = [0i64; 128];
5797 for i in 0..128i64 {
5798 arr[i as usize] = i;
5799 }
5800 // A multiplier of 8 is word-addressing for i64s
5801 let r = _mm256_i32gather_epi64(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48), 8);
5802 assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48));
5803 }
5804
5805 #[simd_test(enable = "avx2")]
5806 unsafe fn test_mm256_mask_i32gather_epi64() {
5807 let mut arr = [0i64; 128];
5808 for i in 0..128i64 {
5809 arr[i as usize] = i;
5810 }
5811 // A multiplier of 8 is word-addressing for i64s
5812 let r = _mm256_mask_i32gather_epi64(
5813 _mm256_set1_epi64x(256),
5814 arr.as_ptr(),
5815 _mm_setr_epi32(0, 16, 64, 96),
5816 _mm256_setr_epi64x(-1, -1, -1, 0),
5817 8,
5818 );
5819 assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256));
5820 }
5821
5822 #[simd_test(enable = "avx2")]
5823 unsafe fn test_mm_i32gather_pd() {
5824 let mut arr = [0.0f64; 128];
5825 let mut j = 0.0;
5826 for i in 0..128usize {
5827 arr[i] = j;
5828 j += 1.0;
5829 }
5830 // A multiplier of 8 is word-addressing for f64s
5831 let r = _mm_i32gather_pd(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0), 8);
5832 assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0));
5833 }
5834
5835 #[simd_test(enable = "avx2")]
5836 unsafe fn test_mm_mask_i32gather_pd() {
5837 let mut arr = [0.0f64; 128];
5838 let mut j = 0.0;
5839 for i in 0..128usize {
5840 arr[i] = j;
5841 j += 1.0;
5842 }
5843 // A multiplier of 8 is word-addressing for f64s
5844 let r = _mm_mask_i32gather_pd(
5845 _mm_set1_pd(256.0),
5846 arr.as_ptr(),
5847 _mm_setr_epi32(16, 16, 16, 16),
5848 _mm_setr_pd(-1.0, 0.0),
5849 8,
5850 );
5851 assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0));
5852 }
5853
5854 #[simd_test(enable = "avx2")]
5855 unsafe fn test_mm256_i32gather_pd() {
5856 let mut arr = [0.0f64; 128];
5857 let mut j = 0.0;
5858 for i in 0..128usize {
5859 arr[i] = j;
5860 j += 1.0;
5861 }
5862 // A multiplier of 8 is word-addressing for f64s
5863 let r = _mm256_i32gather_pd(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48), 8);
5864 assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0));
5865 }
5866
5867 #[simd_test(enable = "avx2")]
5868 unsafe fn test_mm256_mask_i32gather_pd() {
5869 let mut arr = [0.0f64; 128];
5870 let mut j = 0.0;
5871 for i in 0..128usize {
5872 arr[i] = j;
5873 j += 1.0;
5874 }
5875 // A multiplier of 8 is word-addressing for f64s
5876 let r = _mm256_mask_i32gather_pd(
5877 _mm256_set1_pd(256.0),
5878 arr.as_ptr(),
5879 _mm_setr_epi32(0, 16, 64, 96),
5880 _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0),
5881 8,
5882 );
5883 assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0));
5884 }
5885
5886 #[simd_test(enable = "avx2")]
5887 unsafe fn test_mm_i64gather_epi32() {
5888 let mut arr = [0i32; 128];
5889 for i in 0..128i32 {
5890 arr[i as usize] = i;
5891 }
5892 // A multiplier of 4 is word-addressing
5893 let r = _mm_i64gather_epi32(arr.as_ptr(), _mm_setr_epi64x(0, 16), 4);
5894 assert_eq_m128i(r, _mm_setr_epi32(0, 16, 0, 0));
5895 }
5896
5897 #[simd_test(enable = "avx2")]
5898 unsafe fn test_mm_mask_i64gather_epi32() {
5899 let mut arr = [0i32; 128];
5900 for i in 0..128i32 {
5901 arr[i as usize] = i;
5902 }
5903 // A multiplier of 4 is word-addressing
5904 let r = _mm_mask_i64gather_epi32(
5905 _mm_set1_epi32(256),
5906 arr.as_ptr(),
5907 _mm_setr_epi64x(0, 16),
5908 _mm_setr_epi32(-1, 0, -1, 0),
5909 4,
5910 );
5911 assert_eq_m128i(r, _mm_setr_epi32(0, 256, 0, 0));
5912 }
5913
5914 #[simd_test(enable = "avx2")]
5915 unsafe fn test_mm256_i64gather_epi32() {
5916 let mut arr = [0i32; 128];
5917 for i in 0..128i32 {
5918 arr[i as usize] = i;
5919 }
5920 // A multiplier of 4 is word-addressing
5921 let r = _mm256_i64gather_epi32(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48), 4);
5922 assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48));
5923 }
5924
5925 #[simd_test(enable = "avx2")]
5926 unsafe fn test_mm256_mask_i64gather_epi32() {
5927 let mut arr = [0i32; 128];
5928 for i in 0..128i32 {
5929 arr[i as usize] = i;
5930 }
5931 // A multiplier of 4 is word-addressing
5932 let r = _mm256_mask_i64gather_epi32(
5933 _mm_set1_epi32(256),
5934 arr.as_ptr(),
5935 _mm256_setr_epi64x(0, 16, 64, 96),
5936 _mm_setr_epi32(-1, -1, -1, 0),
5937 4,
5938 );
5939 assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256));
5940 }
5941
5942 #[simd_test(enable = "avx2")]
5943 unsafe fn test_mm_i64gather_ps() {
5944 let mut arr = [0.0f32; 128];
5945 let mut j = 0.0;
5946 for i in 0..128usize {
5947 arr[i] = j;
5948 j += 1.0;
5949 }
5950 // A multiplier of 4 is word-addressing for f32s
5951 let r = _mm_i64gather_ps(arr.as_ptr(), _mm_setr_epi64x(0, 16), 4);
5952 assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 0.0, 0.0));
5953 }
5954
5955 #[simd_test(enable = "avx2")]
5956 unsafe fn test_mm_mask_i64gather_ps() {
5957 let mut arr = [0.0f32; 128];
5958 let mut j = 0.0;
5959 for i in 0..128usize {
5960 arr[i] = j;
5961 j += 1.0;
5962 }
5963 // A multiplier of 4 is word-addressing for f32s
5964 let r = _mm_mask_i64gather_ps(
5965 _mm_set1_ps(256.0),
5966 arr.as_ptr(),
5967 _mm_setr_epi64x(0, 16),
5968 _mm_setr_ps(-1.0, 0.0, -1.0, 0.0),
5969 4,
5970 );
5971 assert_eq_m128(r, _mm_setr_ps(0.0, 256.0, 0.0, 0.0));
5972 }
5973
5974 #[simd_test(enable = "avx2")]
5975 unsafe fn test_mm256_i64gather_ps() {
5976 let mut arr = [0.0f32; 128];
5977 let mut j = 0.0;
5978 for i in 0..128usize {
5979 arr[i] = j;
5980 j += 1.0;
5981 }
5982 // A multiplier of 4 is word-addressing for f32s
5983 let r = _mm256_i64gather_ps(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48), 4);
5984 assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0));
5985 }
5986
5987 #[simd_test(enable = "avx2")]
5988 unsafe fn test_mm256_mask_i64gather_ps() {
5989 let mut arr = [0.0f32; 128];
5990 let mut j = 0.0;
5991 for i in 0..128usize {
5992 arr[i] = j;
5993 j += 1.0;
5994 }
5995 // A multiplier of 4 is word-addressing for f32s
5996 let r = _mm256_mask_i64gather_ps(
5997 _mm_set1_ps(256.0),
5998 arr.as_ptr(),
5999 _mm256_setr_epi64x(0, 16, 64, 96),
6000 _mm_setr_ps(-1.0, -1.0, -1.0, 0.0),
6001 4,
6002 );
6003 assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0));
6004 }
6005
6006 #[simd_test(enable = "avx2")]
6007 unsafe fn test_mm_i64gather_epi64() {
6008 let mut arr = [0i64; 128];
6009 for i in 0..128i64 {
6010 arr[i as usize] = i;
6011 }
6012 // A multiplier of 8 is word-addressing for i64s
6013 let r = _mm_i64gather_epi64(arr.as_ptr(), _mm_setr_epi64x(0, 16), 8);
6014 assert_eq_m128i(r, _mm_setr_epi64x(0, 16));
6015 }
6016
6017 #[simd_test(enable = "avx2")]
6018 unsafe fn test_mm_mask_i64gather_epi64() {
6019 let mut arr = [0i64; 128];
6020 for i in 0..128i64 {
6021 arr[i as usize] = i;
6022 }
6023 // A multiplier of 8 is word-addressing for i64s
6024 let r = _mm_mask_i64gather_epi64(
6025 _mm_set1_epi64x(256),
6026 arr.as_ptr(),
6027 _mm_setr_epi64x(16, 16),
6028 _mm_setr_epi64x(-1, 0),
6029 8,
6030 );
6031 assert_eq_m128i(r, _mm_setr_epi64x(16, 256));
6032 }
6033
6034 #[simd_test(enable = "avx2")]
6035 unsafe fn test_mm256_i64gather_epi64() {
6036 let mut arr = [0i64; 128];
6037 for i in 0..128i64 {
6038 arr[i as usize] = i;
6039 }
6040 // A multiplier of 8 is word-addressing for i64s
6041 let r = _mm256_i64gather_epi64(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48), 8);
6042 assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48));
6043 }
6044
6045 #[simd_test(enable = "avx2")]
6046 unsafe fn test_mm256_mask_i64gather_epi64() {
6047 let mut arr = [0i64; 128];
6048 for i in 0..128i64 {
6049 arr[i as usize] = i;
6050 }
6051 // A multiplier of 8 is word-addressing for i64s
6052 let r = _mm256_mask_i64gather_epi64(
6053 _mm256_set1_epi64x(256),
6054 arr.as_ptr(),
6055 _mm256_setr_epi64x(0, 16, 64, 96),
6056 _mm256_setr_epi64x(-1, -1, -1, 0),
6057 8,
6058 );
6059 assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256));
6060 }
6061
6062 #[simd_test(enable = "avx2")]
6063 unsafe fn test_mm_i64gather_pd() {
6064 let mut arr = [0.0f64; 128];
6065 let mut j = 0.0;
6066 for i in 0..128usize {
6067 arr[i] = j;
6068 j += 1.0;
6069 }
6070 // A multiplier of 8 is word-addressing for f64s
6071 let r = _mm_i64gather_pd(arr.as_ptr(), _mm_setr_epi64x(0, 16), 8);
6072 assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0));
6073 }
6074
6075 #[simd_test(enable = "avx2")]
6076 unsafe fn test_mm_mask_i64gather_pd() {
6077 let mut arr = [0.0f64; 128];
6078 let mut j = 0.0;
6079 for i in 0..128usize {
6080 arr[i] = j;
6081 j += 1.0;
6082 }
6083 // A multiplier of 8 is word-addressing for f64s
6084 let r = _mm_mask_i64gather_pd(
6085 _mm_set1_pd(256.0),
6086 arr.as_ptr(),
6087 _mm_setr_epi64x(16, 16),
6088 _mm_setr_pd(-1.0, 0.0),
6089 8,
6090 );
6091 assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0));
6092 }
6093
6094 #[simd_test(enable = "avx2")]
6095 unsafe fn test_mm256_i64gather_pd() {
6096 let mut arr = [0.0f64; 128];
6097 let mut j = 0.0;
6098 for i in 0..128usize {
6099 arr[i] = j;
6100 j += 1.0;
6101 }
6102 // A multiplier of 8 is word-addressing for f64s
6103 let r = _mm256_i64gather_pd(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48), 8);
6104 assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0));
6105 }
6106
6107 #[simd_test(enable = "avx2")]
6108 unsafe fn test_mm256_mask_i64gather_pd() {
6109 let mut arr = [0.0f64; 128];
6110 let mut j = 0.0;
6111 for i in 0..128usize {
6112 arr[i] = j;
6113 j += 1.0;
6114 }
6115 // A multiplier of 8 is word-addressing for f64s
6116 let r = _mm256_mask_i64gather_pd(
6117 _mm256_set1_pd(256.0),
6118 arr.as_ptr(),
6119 _mm256_setr_epi64x(0, 16, 64, 96),
6120 _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0),
6121 8,
6122 );
6123 assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0));
6124 }
6125
6126 #[simd_test(enable = "avx2")]
6127 unsafe fn test_mm256_extract_epi8() {
6128 #[rustfmt::skip]
6129 let a = _mm256_setr_epi8(
6130 -1, 1, 2, 3, 4, 5, 6, 7,
6131 8, 9, 10, 11, 12, 13, 14, 15,
6132 16, 17, 18, 19, 20, 21, 22, 23,
6133 24, 25, 26, 27, 28, 29, 30, 31
6134 );
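// Out-of-range indices are masked: 35 & 31 == 3; the epi16/epi32 extract tests below rely on the same wrapping.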
6135 let r1 = _mm256_extract_epi8(a, 0);
6136 let r2 = _mm256_extract_epi8(a, 35);
6137 assert_eq!(r1, -1);
6138 assert_eq!(r2, 3);
6139 }
6140
6141 #[simd_test(enable = "avx2")]
6142 unsafe fn test_mm256_extract_epi16() {
6143 #[rustfmt::skip]
6144 let a = _mm256_setr_epi16(
6145 -1, 1, 2, 3, 4, 5, 6, 7,
6146 8, 9, 10, 11, 12, 13, 14, 15,
6147 );
6148 let r1 = _mm256_extract_epi16(a, 0);
6149 let r2 = _mm256_extract_epi16(a, 19);
6150 assert_eq!(r1, -1);
6151 assert_eq!(r2, 3);
6152 }
6153
6154 #[simd_test(enable = "avx2")]
6155 unsafe fn test_mm256_extract_epi32() {
6156 let a = _mm256_setr_epi32(-1, 1, 2, 3, 4, 5, 6, 7);
6157 let r1 = _mm256_extract_epi32(a, 0);
6158 let r2 = _mm256_extract_epi32(a, 11);
6159 assert_eq!(r1, -1);
6160 assert_eq!(r2, 3);
6161 }
6162
6163 #[simd_test(enable = "avx2")]
6164 unsafe fn test_mm256_cvtsd_f64() {
6165 let a = _mm256_setr_pd(1., 2., 3., 4.);
6166 let r = _mm256_cvtsd_f64(a);
6167 assert_eq!(r, 1.);
6168 }
6169
6170 #[simd_test(enable = "avx2")]
6171 unsafe fn test_mm256_cvtsi256_si32() {
6172 let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
6173 let r = _mm256_cvtsi256_si32(a);
6174 assert_eq!(r, 1);
6175 }
6176 }