src/stdarch/crates/core_arch/src/x86/avx2.rs (rustc.git, upstream version 1.44.1)
1 //! Advanced Vector Extensions 2 (AVX2)
2 //!
3 //! AVX2 expands most AVX commands to 256-bit wide vector registers and
4 //! adds [FMA](https://en.wikipedia.org/wiki/Fused_multiply-accumulate).
5 //!
6 //! The references are:
7 //!
8 //! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
9 //! Instruction Set Reference, A-Z][intel64_ref].
10 //! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
11 //! System Instructions][amd64_ref].
12 //!
13 //! Wikipedia's [AVX][wiki_avx] and [FMA][wiki_fma] pages provide a quick
14 //! overview of the instructions available.
15 //!
16 //! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
17 //! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
18 //! [wiki_avx]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions
19 //! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate
20
21 use crate::{
22 core_arch::{simd::*, simd_llvm::*, x86::*},
23 mem::transmute,
24 };
25
26 #[cfg(test)]
27 use stdarch_test::assert_instr;
28
29 /// Computes the absolute values of packed 32-bit integers in `a`.
30 ///
31 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_abs_epi32)
32 #[inline]
33 #[target_feature(enable = "avx2")]
34 #[cfg_attr(test, assert_instr(vpabsd))]
35 #[stable(feature = "simd_x86", since = "1.27.0")]
36 pub unsafe fn _mm256_abs_epi32(a: __m256i) -> __m256i {
37 transmute(pabsd(a.as_i32x8()))
38 }
39
40 /// Computes the absolute values of packed 16-bit integers in `a`.
41 ///
42 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_abs_epi16)
43 #[inline]
44 #[target_feature(enable = "avx2")]
45 #[cfg_attr(test, assert_instr(vpabsw))]
46 #[stable(feature = "simd_x86", since = "1.27.0")]
47 pub unsafe fn _mm256_abs_epi16(a: __m256i) -> __m256i {
48 transmute(pabsw(a.as_i16x16()))
49 }
50
51 /// Computes the absolute values of packed 8-bit integers in `a`.
52 ///
53 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_abs_epi8)
54 #[inline]
55 #[target_feature(enable = "avx2")]
56 #[cfg_attr(test, assert_instr(vpabsb))]
57 #[stable(feature = "simd_x86", since = "1.27.0")]
58 pub unsafe fn _mm256_abs_epi8(a: __m256i) -> __m256i {
59 transmute(pabsb(a.as_i8x32()))
60 }
61
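// Illustrative sketch (not part of the upstream file): a minimal caller for the
// absolute-value intrinsics above. The helper name `abs_all_i32_example` is
// invented here and assumes the caller has already verified AVX2 support.
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn abs_all_i32_example(v: __m256i) -> __m256i {
    // Each of the eight packed 32-bit lanes becomes its absolute value,
    // e.g. [-1, 2, -3, ...] -> [1, 2, 3, ...].
    _mm256_abs_epi32(v)
}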
62 /// Adds packed 64-bit integers in `a` and `b`.
63 ///
64 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_epi64)
65 #[inline]
66 #[target_feature(enable = "avx2")]
67 #[cfg_attr(test, assert_instr(vpaddq))]
68 #[stable(feature = "simd_x86", since = "1.27.0")]
69 pub unsafe fn _mm256_add_epi64(a: __m256i, b: __m256i) -> __m256i {
70 transmute(simd_add(a.as_i64x4(), b.as_i64x4()))
71 }
72
73 /// Adds packed 32-bit integers in `a` and `b`.
74 ///
75 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_epi32)
76 #[inline]
77 #[target_feature(enable = "avx2")]
78 #[cfg_attr(test, assert_instr(vpaddd))]
79 #[stable(feature = "simd_x86", since = "1.27.0")]
80 pub unsafe fn _mm256_add_epi32(a: __m256i, b: __m256i) -> __m256i {
81 transmute(simd_add(a.as_i32x8(), b.as_i32x8()))
82 }
83
84 /// Adds packed 16-bit integers in `a` and `b`.
85 ///
86 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_epi16)
87 #[inline]
88 #[target_feature(enable = "avx2")]
89 #[cfg_attr(test, assert_instr(vpaddw))]
90 #[stable(feature = "simd_x86", since = "1.27.0")]
91 pub unsafe fn _mm256_add_epi16(a: __m256i, b: __m256i) -> __m256i {
92 transmute(simd_add(a.as_i16x16(), b.as_i16x16()))
93 }
94
95 /// Adds packed 8-bit integers in `a` and `b`.
96 ///
97 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_epi8)
98 #[inline]
99 #[target_feature(enable = "avx2")]
100 #[cfg_attr(test, assert_instr(vpaddb))]
101 #[stable(feature = "simd_x86", since = "1.27.0")]
102 pub unsafe fn _mm256_add_epi8(a: __m256i, b: __m256i) -> __m256i {
103 transmute(simd_add(a.as_i8x32(), b.as_i8x32()))
104 }
105
106 /// Adds packed 8-bit integers in `a` and `b` using saturation.
107 ///
108 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_adds_epi8)
109 #[inline]
110 #[target_feature(enable = "avx2")]
111 #[cfg_attr(test, assert_instr(vpaddsb))]
112 #[stable(feature = "simd_x86", since = "1.27.0")]
113 pub unsafe fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i {
114 transmute(simd_saturating_add(a.as_i8x32(), b.as_i8x32()))
115 }
116
117 /// Adds packed 16-bit integers in `a` and `b` using saturation.
118 ///
119 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_adds_epi16)
120 #[inline]
121 #[target_feature(enable = "avx2")]
122 #[cfg_attr(test, assert_instr(vpaddsw))]
123 #[stable(feature = "simd_x86", since = "1.27.0")]
124 pub unsafe fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i {
125 transmute(simd_saturating_add(a.as_i16x16(), b.as_i16x16()))
126 }
127
128 /// Adds packed unsigned 8-bit integers in `a` and `b` using saturation.
129 ///
130 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_adds_epu8)
131 #[inline]
132 #[target_feature(enable = "avx2")]
133 #[cfg_attr(test, assert_instr(vpaddusb))]
134 #[stable(feature = "simd_x86", since = "1.27.0")]
135 pub unsafe fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i {
136 transmute(simd_saturating_add(a.as_u8x32(), b.as_u8x32()))
137 }
138
139 /// Adds packed unsigned 16-bit integers in `a` and `b` using saturation.
140 ///
141 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_adds_epu16)
142 #[inline]
143 #[target_feature(enable = "avx2")]
144 #[cfg_attr(test, assert_instr(vpaddusw))]
145 #[stable(feature = "simd_x86", since = "1.27.0")]
146 pub unsafe fn _mm256_adds_epu16(a: __m256i, b: __m256i) -> __m256i {
147 transmute(simd_saturating_add(a.as_u16x16(), b.as_u16x16()))
148 }
149
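// Illustrative sketch (not part of the upstream file): contrasts wrapping and
// saturating byte addition. The helper name `adds_epu8_example` is invented here.
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn adds_epu8_example() -> (__m256i, __m256i) {
    let a = _mm256_set1_epi8(-56); // 0xC8, i.e. 200 when viewed as an unsigned byte
    let b = _mm256_set1_epi8(100);
    // Wrapping add: 200 + 100 = 300 wraps to 44 in every unsigned byte lane.
    let wrapped = _mm256_add_epi8(a, b);
    // Saturating unsigned add: 200 + 100 clamps to 255 in every lane.
    let saturated = _mm256_adds_epu8(a, b);
    (wrapped, saturated)
}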
150 /// Concatenates pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary
151 /// result, shifts the result right by `n` bytes, and returns the low 16 bytes.
152 ///
153 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_alignr_epi8)
154 #[inline]
155 #[target_feature(enable = "avx2")]
156 #[cfg_attr(test, assert_instr(vpalignr, n = 7))]
157 #[rustc_args_required_const(2)]
158 #[stable(feature = "simd_x86", since = "1.27.0")]
159 pub unsafe fn _mm256_alignr_epi8(a: __m256i, b: __m256i, n: i32) -> __m256i {
160 let n = n as u32;
161 // If `palignr` is shifting the pair of vectors by two lanes (32 bytes) or
162 // more, emit zero.
163 if n >= 32 {
164 return _mm256_set1_epi8(0);
165 }
166 // If `palignr` is shifting the pair of input vectors by at least one lane
167 // (16 bytes) but less than two, convert to shifting zeroes in from `a`.
168 let (a, b, n) = if n >= 16 {
169 (_mm256_set1_epi8(0), a, n - 16)
170 } else {
171 (a, b, n)
172 };
173
174 let a = a.as_i8x32();
175 let b = b.as_i8x32();
176
177 let r: i8x32 = match n {
178 0 => simd_shuffle32(
179 b,
180 a,
181 [
182 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
183 23, 24, 25, 26, 27, 28, 29, 30, 31,
184 ],
185 ),
186 1 => simd_shuffle32(
187 b,
188 a,
189 [
190 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 17, 18, 19, 20, 21, 22, 23,
191 24, 25, 26, 27, 28, 29, 30, 31, 48,
192 ],
193 ),
194 2 => simd_shuffle32(
195 b,
196 a,
197 [
198 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 18, 19, 20, 21, 22, 23, 24,
199 25, 26, 27, 28, 29, 30, 31, 48, 49,
200 ],
201 ),
202 3 => simd_shuffle32(
203 b,
204 a,
205 [
206 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 19, 20, 21, 22, 23, 24,
207 25, 26, 27, 28, 29, 30, 31, 48, 49, 50,
208 ],
209 ),
210 4 => simd_shuffle32(
211 b,
212 a,
213 [
214 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 20, 21, 22, 23, 24, 25,
215 26, 27, 28, 29, 30, 31, 48, 49, 50, 51,
216 ],
217 ),
218 5 => simd_shuffle32(
219 b,
220 a,
221 [
222 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 21, 22, 23, 24, 25, 26,
223 27, 28, 29, 30, 31, 48, 49, 50, 51, 52,
224 ],
225 ),
226 6 => simd_shuffle32(
227 b,
228 a,
229 [
230 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 22, 23, 24, 25, 26, 27,
231 28, 29, 30, 31, 48, 49, 50, 51, 52, 53,
232 ],
233 ),
234 7 => simd_shuffle32(
235 b,
236 a,
237 [
238 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 23, 24, 25, 26, 27,
239 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54,
240 ],
241 ),
242 8 => simd_shuffle32(
243 b,
244 a,
245 [
246 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 24, 25, 26, 27, 28,
247 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55,
248 ],
249 ),
250 9 => simd_shuffle32(
251 b,
252 a,
253 [
254 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 25, 26, 27, 28, 29,
255 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56,
256 ],
257 ),
258 10 => simd_shuffle32(
259 b,
260 a,
261 [
262 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 26, 27, 28, 29, 30,
263 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
264 ],
265 ),
266 11 => simd_shuffle32(
267 b,
268 a,
269 [
270 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 27, 28, 29, 30, 31,
271 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58,
272 ],
273 ),
274 12 => simd_shuffle32(
275 b,
276 a,
277 [
278 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 28, 29, 30, 31, 48,
279 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
280 ],
281 ),
282 13 => simd_shuffle32(
283 b,
284 a,
285 [
286 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 29, 30, 31, 48, 49,
287 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
288 ],
289 ),
290 14 => simd_shuffle32(
291 b,
292 a,
293 [
294 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 30, 31, 48, 49, 50,
295 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
296 ],
297 ),
298 15 => simd_shuffle32(
299 b,
300 a,
301 [
302 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 31, 48, 49, 50, 51,
303 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
304 ],
305 ),
306 _ => b,
307 };
308 transmute(r)
309 }
310
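// Illustrative sketch (not part of the upstream file): `_mm256_alignr_epi8`
// works per 128-bit lane, so a shift of 1 yields, for each lane,
// [b[1..16], a[0]] rather than a full 256-bit byte rotation. The helper name
// `alignr_by_one_example` is invented here.
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn alignr_by_one_example(a: __m256i, b: __m256i) -> __m256i {
    // Concatenate a:b within each 128-bit lane and shift right by one byte.
    _mm256_alignr_epi8(a, b, 1)
}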
311 /// Computes the bitwise AND of 256 bits (representing integer data)
312 /// in `a` and `b`.
313 ///
314 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_and_si256)
315 #[inline]
316 #[target_feature(enable = "avx2")]
317 #[cfg_attr(test, assert_instr(vandps))]
318 #[stable(feature = "simd_x86", since = "1.27.0")]
319 pub unsafe fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i {
320 transmute(simd_and(a.as_i64x4(), b.as_i64x4()))
321 }
322
323 /// Computes the bitwise NOT of 256 bits (representing integer data)
324 /// in `a` and then AND with `b`.
325 ///
326 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_andnot_si256)
327 #[inline]
328 #[target_feature(enable = "avx2")]
329 #[cfg_attr(test, assert_instr(vandnps))]
330 #[stable(feature = "simd_x86", since = "1.27.0")]
331 pub unsafe fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i {
332 let all_ones = _mm256_set1_epi8(-1);
333 transmute(simd_and(
334 simd_xor(a.as_i64x4(), all_ones.as_i64x4()),
335 b.as_i64x4(),
336 ))
337 }
338
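// Illustrative sketch (not part of the upstream file): the classic
// `mask ? x : y` bit-select built from the AND/ANDNOT pair above. The helper
// name `bitselect_example` is invented here; `mask` lanes are expected to be
// all-ones or all-zeros (e.g. the output of a compare intrinsic).
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn bitselect_example(mask: __m256i, x: __m256i, y: __m256i) -> __m256i {
    // (mask AND x) OR (NOT mask AND y)
    _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, y))
}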
339 /// Averages packed unsigned 16-bit integers in `a` and `b`.
340 ///
341 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_avg_epu16)
342 #[inline]
343 #[target_feature(enable = "avx2")]
344 #[cfg_attr(test, assert_instr(vpavgw))]
345 #[stable(feature = "simd_x86", since = "1.27.0")]
346 pub unsafe fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i {
347 transmute(pavgw(a.as_u16x16(), b.as_u16x16()))
348 }
349
350 /// Averages packed unsigned 8-bit integers in `a` and `b`.
351 ///
352 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_avg_epu8)
353 #[inline]
354 #[target_feature(enable = "avx2")]
355 #[cfg_attr(test, assert_instr(vpavgb))]
356 #[stable(feature = "simd_x86", since = "1.27.0")]
357 pub unsafe fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i {
358 transmute(pavgb(a.as_u8x32(), b.as_u8x32()))
359 }
360
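// Illustrative sketch (not part of the upstream file): `vpavgw`/`vpavgb`
// compute a *rounding* average, i.e. (a + b + 1) >> 1 evaluated in a widened
// type. The helper name `avg_rounding_example` is invented here.
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn avg_rounding_example() -> __m256i {
    let a = _mm256_set1_epi16(1);
    let b = _mm256_set1_epi16(2);
    // (1 + 2 + 1) >> 1 = 2 in every unsigned 16-bit lane, not 1.
    _mm256_avg_epu16(a, b)
}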
361 /// Blends packed 32-bit integers from `a` and `b` using control mask `imm8`.
362 ///
363 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_epi32)
364 #[inline]
365 #[target_feature(enable = "avx2")]
366 #[cfg_attr(test, assert_instr(vblendps, imm8 = 9))]
367 #[rustc_args_required_const(2)]
368 #[stable(feature = "simd_x86", since = "1.27.0")]
369 pub unsafe fn _mm_blend_epi32(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
370 let imm8 = (imm8 & 0xFF) as u8;
371 let a = a.as_i32x4();
372 let b = b.as_i32x4();
373 macro_rules! blend2 {
374 ($a:expr, $b:expr, $c:expr, $d:expr) => {
375 simd_shuffle4(a, b, [$a, $b, $c, $d]);
376 };
377 }
378 macro_rules! blend1 {
379 ($a:expr, $b:expr) => {
380 match (imm8 >> 2) & 0b11 {
381 0b00 => blend2!($a, $b, 2, 3),
382 0b01 => blend2!($a, $b, 6, 3),
383 0b10 => blend2!($a, $b, 2, 7),
384 _ => blend2!($a, $b, 6, 7),
385 }
386 };
387 }
388 let r: i32x4 = match imm8 & 0b11 {
389 0b00 => blend1!(0, 1),
390 0b01 => blend1!(4, 1),
391 0b10 => blend1!(0, 5),
392 _ => blend1!(4, 5),
393 };
394 transmute(r)
395 }
396
397 /// Blends packed 32-bit integers from `a` and `b` using control mask `imm8`.
398 ///
399 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_blend_epi32)
400 #[inline]
401 #[target_feature(enable = "avx2")]
402 #[cfg_attr(test, assert_instr(vblendps, imm8 = 9))]
403 #[rustc_args_required_const(2)]
404 #[stable(feature = "simd_x86", since = "1.27.0")]
405 pub unsafe fn _mm256_blend_epi32(a: __m256i, b: __m256i, imm8: i32) -> __m256i {
406 let imm8 = (imm8 & 0xFF) as u8;
407 let a = a.as_i32x8();
408 let b = b.as_i32x8();
409 macro_rules! blend4 {
410 (
411 $a:expr,
412 $b:expr,
413 $c:expr,
414 $d:expr,
415 $e:expr,
416 $f:expr,
417 $g:expr,
418 $h:expr
419 ) => {
420 simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h]);
421 };
422 }
423 macro_rules! blend3 {
424 ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr) => {
425 match (imm8 >> 6) & 0b11 {
426 0b00 => blend4!($a, $b, $c, $d, $e, $f, 6, 7),
427 0b01 => blend4!($a, $b, $c, $d, $e, $f, 14, 7),
428 0b10 => blend4!($a, $b, $c, $d, $e, $f, 6, 15),
429 _ => blend4!($a, $b, $c, $d, $e, $f, 14, 15),
430 }
431 };
432 }
433 macro_rules! blend2 {
434 ($a:expr, $b:expr, $c:expr, $d:expr) => {
435 match (imm8 >> 4) & 0b11 {
436 0b00 => blend3!($a, $b, $c, $d, 4, 5),
437 0b01 => blend3!($a, $b, $c, $d, 12, 5),
438 0b10 => blend3!($a, $b, $c, $d, 4, 13),
439 _ => blend3!($a, $b, $c, $d, 12, 13),
440 }
441 };
442 }
443 macro_rules! blend1 {
444 ($a:expr, $b:expr) => {
445 match (imm8 >> 2) & 0b11 {
446 0b00 => blend2!($a, $b, 2, 3),
447 0b01 => blend2!($a, $b, 10, 3),
448 0b10 => blend2!($a, $b, 2, 11),
449 _ => blend2!($a, $b, 10, 11),
450 }
451 };
452 }
453 let r: i32x8 = match imm8 & 0b11 {
454 0b00 => blend1!(0, 1),
455 0b01 => blend1!(8, 1),
456 0b10 => blend1!(0, 9),
457 _ => blend1!(8, 9),
458 };
459 transmute(r)
460 }
461
462 /// Blends packed 16-bit integers from `a` and `b` using control mask `imm8`.
463 ///
464 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_blend_epi16)
465 #[inline]
466 #[target_feature(enable = "avx2")]
467 #[cfg_attr(test, assert_instr(vpblendw, imm8 = 9))]
468 #[rustc_args_required_const(2)]
469 #[stable(feature = "simd_x86", since = "1.27.0")]
470 pub unsafe fn _mm256_blend_epi16(a: __m256i, b: __m256i, imm8: i32) -> __m256i {
471 let imm8 = (imm8 & 0xFF) as u8;
472 let a = a.as_i16x16();
473 let b = b.as_i16x16();
474 macro_rules! blend4 {
475 (
476 $a:expr,
477 $b:expr,
478 $c:expr,
479 $d:expr,
480 $e:expr,
481 $f:expr,
482 $g:expr,
483 $h:expr,
484 $i:expr,
485 $j:expr,
486 $k:expr,
487 $l:expr,
488 $m:expr,
489 $n:expr,
490 $o:expr,
491 $p:expr
492 ) => {
493 simd_shuffle16(
494 a,
495 b,
496 [
497 $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p,
498 ],
499 )
500 };
501 }
502 macro_rules! blend3 {
503 (
504 $a:expr,
505 $b:expr,
506 $c:expr,
507 $d:expr,
508 $e:expr,
509 $f:expr,
510 $a2:expr,
511 $b2:expr,
512 $c2:expr,
513 $d2:expr,
514 $e2:expr,
515 $f2:expr
516 ) => {
517 match (imm8 >> 6) & 0b11 {
518 0b00 => blend4!($a, $b, $c, $d, $e, $f, 6, 7, $a2, $b2, $c2, $d2, $e2, $f2, 14, 15),
519 0b01 => {
520 blend4!($a, $b, $c, $d, $e, $f, 22, 7, $a2, $b2, $c2, $d2, $e2, $f2, 30, 15)
521 }
522 0b10 => {
523 blend4!($a, $b, $c, $d, $e, $f, 6, 23, $a2, $b2, $c2, $d2, $e2, $f2, 14, 31)
524 }
525 _ => blend4!($a, $b, $c, $d, $e, $f, 22, 23, $a2, $b2, $c2, $d2, $e2, $f2, 30, 31),
526 }
527 };
528 }
529 macro_rules! blend2 {
530 (
531 $a:expr,
532 $b:expr,
533 $c:expr,
534 $d:expr,
535 $a2:expr,
536 $b2:expr,
537 $c2:expr,
538 $d2:expr
539 ) => {
540 match (imm8 >> 4) & 0b11 {
541 0b00 => blend3!($a, $b, $c, $d, 4, 5, $a2, $b2, $c2, $d2, 12, 13),
542 0b01 => blend3!($a, $b, $c, $d, 20, 5, $a2, $b2, $c2, $d2, 28, 13),
543 0b10 => blend3!($a, $b, $c, $d, 4, 21, $a2, $b2, $c2, $d2, 12, 29),
544 _ => blend3!($a, $b, $c, $d, 20, 21, $a2, $b2, $c2, $d2, 28, 29),
545 }
546 };
547 }
548 macro_rules! blend1 {
549 ($a1:expr, $b1:expr, $a2:expr, $b2:expr) => {
550 match (imm8 >> 2) & 0b11 {
551 0b00 => blend2!($a1, $b1, 2, 3, $a2, $b2, 10, 11),
552 0b01 => blend2!($a1, $b1, 18, 3, $a2, $b2, 26, 11),
553 0b10 => blend2!($a1, $b1, 2, 19, $a2, $b2, 10, 27),
554 _ => blend2!($a1, $b1, 18, 19, $a2, $b2, 26, 27),
555 }
556 };
557 }
558 let r: i16x16 = match imm8 & 0b11 {
559 0b00 => blend1!(0, 1, 8, 9),
560 0b01 => blend1!(16, 1, 24, 9),
561 0b10 => blend1!(0, 17, 8, 25),
562 _ => blend1!(16, 17, 24, 25),
563 };
564 transmute(r)
565 }
566
567 /// Blends packed 8-bit integers from `a` and `b` using `mask`.
568 ///
569 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_blendv_epi8)
570 #[inline]
571 #[target_feature(enable = "avx2")]
572 #[cfg_attr(test, assert_instr(vpblendvb))]
573 #[stable(feature = "simd_x86", since = "1.27.0")]
574 pub unsafe fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i {
575 transmute(pblendvb(a.as_i8x32(), b.as_i8x32(), mask.as_i8x32()))
576 }
577
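// Illustrative sketch (not part of the upstream file): `_mm256_blendv_epi8`
// selects from `b` where the most significant bit of the corresponding `mask`
// byte is 1 and from `a` otherwise, so a compare result can be used directly
// as the mask. The helper name `clamp_bytes_example` is invented here.
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn clamp_bytes_example(v: __m256i, limit: __m256i) -> __m256i {
    // Where v > limit (signed byte compare), take limit; otherwise keep v.
    let too_big = _mm256_cmpgt_epi8(v, limit);
    _mm256_blendv_epi8(v, limit, too_big)
}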
578 /// Broadcasts the low packed 8-bit integer from `a` to all elements of
579 /// the 128-bit returned value.
580 ///
581 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastb_epi8)
582 #[inline]
583 #[target_feature(enable = "avx2")]
584 #[cfg_attr(test, assert_instr(vpbroadcastb))]
585 #[stable(feature = "simd_x86", since = "1.27.0")]
586 pub unsafe fn _mm_broadcastb_epi8(a: __m128i) -> __m128i {
587 let zero = _mm_setzero_si128();
588 let ret = simd_shuffle16(a.as_i8x16(), zero.as_i8x16(), [0_u32; 16]);
589 transmute::<i8x16, _>(ret)
590 }
591
592 /// Broadcasts the low packed 8-bit integer from `a` to all elements of
593 /// the 256-bit returned value.
594 ///
595 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastb_epi8)
596 #[inline]
597 #[target_feature(enable = "avx2")]
598 #[cfg_attr(test, assert_instr(vpbroadcastb))]
599 #[stable(feature = "simd_x86", since = "1.27.0")]
600 pub unsafe fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i {
601 let zero = _mm_setzero_si128();
602 let ret = simd_shuffle32(a.as_i8x16(), zero.as_i8x16(), [0_u32; 32]);
603 transmute::<i8x32, _>(ret)
604 }
605
606 // N.B., `simd_shuffle4` with integer data types for `a` and `b` is
607 // often compiled to `vbroadcastss`.
608 /// Broadcasts the low packed 32-bit integer from `a` to all elements of
609 /// the 128-bit returned value.
610 ///
611 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastd_epi32)
612 #[inline]
613 #[target_feature(enable = "avx2")]
614 #[cfg_attr(test, assert_instr(vbroadcastss))]
615 #[stable(feature = "simd_x86", since = "1.27.0")]
616 pub unsafe fn _mm_broadcastd_epi32(a: __m128i) -> __m128i {
617 let zero = _mm_setzero_si128();
618 let ret = simd_shuffle4(a.as_i32x4(), zero.as_i32x4(), [0_u32; 4]);
619 transmute::<i32x4, _>(ret)
620 }
621
622 // N.B., `simd_shuffle4` with integer data types for `a` and `b` is
623 // often compiled to `vbroadcastss`.
624 /// Broadcasts the low packed 32-bit integer from `a` to all elements of
625 /// the 256-bit returned value.
626 ///
627 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastd_epi32)
628 #[inline]
629 #[target_feature(enable = "avx2")]
630 #[cfg_attr(test, assert_instr(vbroadcastss))]
631 #[stable(feature = "simd_x86", since = "1.27.0")]
632 pub unsafe fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i {
633 let zero = _mm_setzero_si128();
634 let ret = simd_shuffle8(a.as_i32x4(), zero.as_i32x4(), [0_u32; 8]);
635 transmute::<i32x8, _>(ret)
636 }
637
638 /// Broadcasts the low packed 64-bit integer from `a` to all elements of
639 /// the 128-bit returned value.
640 ///
641 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastq_epi64)
642 #[inline]
643 #[target_feature(enable = "avx2")]
644 // FIXME: https://github.com/rust-lang/stdarch/issues/791
645 #[cfg_attr(test, assert_instr(vmovddup))]
646 #[stable(feature = "simd_x86", since = "1.27.0")]
647 pub unsafe fn _mm_broadcastq_epi64(a: __m128i) -> __m128i {
648 let ret = simd_shuffle2(a.as_i64x2(), a.as_i64x2(), [0_u32; 2]);
649 transmute::<i64x2, _>(ret)
650 }
651
652 /// Broadcasts the low packed 64-bit integer from `a` to all elements of
653 /// the 256-bit returned value.
654 ///
655 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastq_epi64)
656 #[inline]
657 #[target_feature(enable = "avx2")]
658 #[cfg_attr(test, assert_instr(vbroadcastsd))]
659 #[stable(feature = "simd_x86", since = "1.27.0")]
660 pub unsafe fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i {
661 let ret = simd_shuffle4(a.as_i64x2(), a.as_i64x2(), [0_u32; 4]);
662 transmute::<i64x4, _>(ret)
663 }
664
665 /// Broadcasts the low double-precision (64-bit) floating-point element
666 /// from `a` to all elements of the 128-bit returned value.
667 ///
668 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastsd_pd)
669 #[inline]
670 #[target_feature(enable = "avx2")]
671 #[cfg_attr(test, assert_instr(vmovddup))]
672 #[stable(feature = "simd_x86", since = "1.27.0")]
673 pub unsafe fn _mm_broadcastsd_pd(a: __m128d) -> __m128d {
674 simd_shuffle2(a, _mm_setzero_pd(), [0_u32; 2])
675 }
676
677 /// Broadcasts the low double-precision (64-bit) floating-point element
678 /// from `a` to all elements of the 256-bit returned value.
679 ///
680 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastsd_pd)
681 #[inline]
682 #[target_feature(enable = "avx2")]
683 #[cfg_attr(test, assert_instr(vbroadcastsd))]
684 #[stable(feature = "simd_x86", since = "1.27.0")]
685 pub unsafe fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d {
686 simd_shuffle4(a, _mm_setzero_pd(), [0_u32; 4])
687 }
688
689 // N.B., `broadcastsi128_si256` is often compiled to `vinsertf128` or
690 // `vbroadcastf128`.
691 /// Broadcasts 128 bits of integer data from `a` to all 128-bit lanes in
692 /// the 256-bit returned value.
693 ///
694 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastsi128_si256)
695 #[inline]
696 #[target_feature(enable = "avx2")]
697 #[stable(feature = "simd_x86", since = "1.27.0")]
698 pub unsafe fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i {
699 let zero = _mm_setzero_si128();
700 let ret = simd_shuffle4(a.as_i64x2(), zero.as_i64x2(), [0, 1, 0, 1]);
701 transmute::<i64x4, _>(ret)
702 }
703
704 /// Broadcasts the low single-precision (32-bit) floating-point element
705 /// from `a` to all elements of the 128-bit returned value.
706 ///
707 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastss_ps)
708 #[inline]
709 #[target_feature(enable = "avx2")]
710 #[cfg_attr(test, assert_instr(vbroadcastss))]
711 #[stable(feature = "simd_x86", since = "1.27.0")]
712 pub unsafe fn _mm_broadcastss_ps(a: __m128) -> __m128 {
713 simd_shuffle4(a, _mm_setzero_ps(), [0_u32; 4])
714 }
715
716 /// Broadcasts the low single-precision (32-bit) floating-point element
717 /// from `a` to all elements of the 256-bit returned value.
718 ///
719 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastss_ps)
720 #[inline]
721 #[target_feature(enable = "avx2")]
722 #[cfg_attr(test, assert_instr(vbroadcastss))]
723 #[stable(feature = "simd_x86", since = "1.27.0")]
724 pub unsafe fn _mm256_broadcastss_ps(a: __m128) -> __m256 {
725 simd_shuffle8(a, _mm_setzero_ps(), [0_u32; 8])
726 }
727
728 /// Broadcasts the low packed 16-bit integer from `a` to all elements of
729 /// the 128-bit returned value.
730 ///
731 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastw_epi16)
732 #[inline]
733 #[target_feature(enable = "avx2")]
734 #[cfg_attr(test, assert_instr(vpbroadcastw))]
735 #[stable(feature = "simd_x86", since = "1.27.0")]
736 pub unsafe fn _mm_broadcastw_epi16(a: __m128i) -> __m128i {
737 let zero = _mm_setzero_si128();
738 let ret = simd_shuffle8(a.as_i16x8(), zero.as_i16x8(), [0_u32; 8]);
739 transmute::<i16x8, _>(ret)
740 }
741
742 /// Broadcasts the low packed 16-bit integer from `a` to all elements of
743 /// the 256-bit returned value.
744 ///
745 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastw_epi16)
746 #[inline]
747 #[target_feature(enable = "avx2")]
748 #[cfg_attr(test, assert_instr(vpbroadcastw))]
749 #[stable(feature = "simd_x86", since = "1.27.0")]
750 pub unsafe fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i {
751 let zero = _mm_setzero_si128();
752 let ret = simd_shuffle16(a.as_i16x8(), zero.as_i16x8(), [0_u32; 16]);
753 transmute::<i16x16, _>(ret)
754 }
755
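// Illustrative sketch (not part of the upstream file): broadcasting a scalar
// into every lane, a common way to build a constant operand for the packed
// operations in this module. The helper name `splat_i16_example` is invented
// here.
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn splat_i16_example(x: i16) -> __m256i {
    // `_mm_set1_epi16` materializes `x` in a 128-bit register; the broadcast
    // then copies lane 0 into all sixteen 16-bit lanes of the 256-bit result.
    _mm256_broadcastw_epi16(_mm_set1_epi16(x))
}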
756 /// Compares packed 64-bit integers in `a` and `b` for equality.
757 ///
758 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epi64)
759 #[inline]
760 #[target_feature(enable = "avx2")]
761 #[cfg_attr(test, assert_instr(vpcmpeqq))]
762 #[stable(feature = "simd_x86", since = "1.27.0")]
763 pub unsafe fn _mm256_cmpeq_epi64(a: __m256i, b: __m256i) -> __m256i {
764 transmute::<i64x4, _>(simd_eq(a.as_i64x4(), b.as_i64x4()))
765 }
766
767 /// Compares packed 32-bit integers in `a` and `b` for equality.
768 ///
769 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epi32)
770 #[inline]
771 #[target_feature(enable = "avx2")]
772 #[cfg_attr(test, assert_instr(vpcmpeqd))]
773 #[stable(feature = "simd_x86", since = "1.27.0")]
774 pub unsafe fn _mm256_cmpeq_epi32(a: __m256i, b: __m256i) -> __m256i {
775 transmute::<i32x8, _>(simd_eq(a.as_i32x8(), b.as_i32x8()))
776 }
777
778 /// Compares packed 16-bit integers in `a` and `b` for equality.
779 ///
780 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epi16)
781 #[inline]
782 #[target_feature(enable = "avx2")]
783 #[cfg_attr(test, assert_instr(vpcmpeqw))]
784 #[stable(feature = "simd_x86", since = "1.27.0")]
785 pub unsafe fn _mm256_cmpeq_epi16(a: __m256i, b: __m256i) -> __m256i {
786 transmute::<i16x16, _>(simd_eq(a.as_i16x16(), b.as_i16x16()))
787 }
788
789 /// Compares packed 8-bit integers in `a` and `b` for equality.
790 ///
791 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epi8)
792 #[inline]
793 #[target_feature(enable = "avx2")]
794 #[cfg_attr(test, assert_instr(vpcmpeqb))]
795 #[stable(feature = "simd_x86", since = "1.27.0")]
796 pub unsafe fn _mm256_cmpeq_epi8(a: __m256i, b: __m256i) -> __m256i {
797 transmute::<i8x32, _>(simd_eq(a.as_i8x32(), b.as_i8x32()))
798 }
799
800 /// Compares packed 64-bit integers in `a` and `b` for greater-than.
801 ///
802 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epi64)
803 #[inline]
804 #[target_feature(enable = "avx2")]
805 #[cfg_attr(test, assert_instr(vpcmpgtq))]
806 #[stable(feature = "simd_x86", since = "1.27.0")]
807 pub unsafe fn _mm256_cmpgt_epi64(a: __m256i, b: __m256i) -> __m256i {
808 transmute::<i64x4, _>(simd_gt(a.as_i64x4(), b.as_i64x4()))
809 }
810
811 /// Compares packed 32-bit integers in `a` and `b` for greater-than.
812 ///
813 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epi32)
814 #[inline]
815 #[target_feature(enable = "avx2")]
816 #[cfg_attr(test, assert_instr(vpcmpgtd))]
817 #[stable(feature = "simd_x86", since = "1.27.0")]
818 pub unsafe fn _mm256_cmpgt_epi32(a: __m256i, b: __m256i) -> __m256i {
819 transmute::<i32x8, _>(simd_gt(a.as_i32x8(), b.as_i32x8()))
820 }
821
822 /// Compares packed 16-bit integers in `a` and `b` for greater-than.
823 ///
824 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epi16)
825 #[inline]
826 #[target_feature(enable = "avx2")]
827 #[cfg_attr(test, assert_instr(vpcmpgtw))]
828 #[stable(feature = "simd_x86", since = "1.27.0")]
829 pub unsafe fn _mm256_cmpgt_epi16(a: __m256i, b: __m256i) -> __m256i {
830 transmute::<i16x16, _>(simd_gt(a.as_i16x16(), b.as_i16x16()))
831 }
832
833 /// Compares packed 8-bit integers in `a` and `b` for greater-than.
834 ///
835 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epi8)
836 #[inline]
837 #[target_feature(enable = "avx2")]
838 #[cfg_attr(test, assert_instr(vpcmpgtb))]
839 #[stable(feature = "simd_x86", since = "1.27.0")]
840 pub unsafe fn _mm256_cmpgt_epi8(a: __m256i, b: __m256i) -> __m256i {
841 transmute::<i8x32, _>(simd_gt(a.as_i8x32(), b.as_i8x32()))
842 }
843
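// Illustrative sketch (not part of the upstream file): the compare intrinsics
// above return all-ones/all-zeros lanes, which pair naturally with
// `_mm256_movemask_epi8` (defined later in this file) to count matches. The
// helper name `count_gt_bytes_example` is invented here.
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn count_gt_bytes_example(v: __m256i, threshold: __m256i) -> u32 {
    // One mask bit per byte lane; count how many lanes compared greater-than.
    let gt = _mm256_cmpgt_epi8(v, threshold);
    _mm256_movemask_epi8(gt).count_ones()
}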
844 /// Sign-extend 16-bit integers to 32-bit integers.
845 ///
846 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi16_epi32)
847 #[inline]
848 #[target_feature(enable = "avx2")]
849 #[cfg_attr(test, assert_instr(vpmovsxwd))]
850 #[stable(feature = "simd_x86", since = "1.27.0")]
851 pub unsafe fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i {
852 transmute::<i32x8, _>(simd_cast(a.as_i16x8()))
853 }
854
855 /// Sign-extend 16-bit integers to 64-bit integers.
856 ///
857 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi16_epi64)
858 #[inline]
859 #[target_feature(enable = "avx2")]
860 #[cfg_attr(test, assert_instr(vpmovsxwq))]
861 #[stable(feature = "simd_x86", since = "1.27.0")]
862 pub unsafe fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i {
863 let a = a.as_i16x8();
864 let v64: i16x4 = simd_shuffle4(a, a, [0, 1, 2, 3]);
865 transmute::<i64x4, _>(simd_cast(v64))
866 }
867
868 /// Sign-extend 32-bit integers to 64-bit integers.
869 ///
870 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi32_epi64)
871 #[inline]
872 #[target_feature(enable = "avx2")]
873 #[cfg_attr(test, assert_instr(vpmovsxdq))]
874 #[stable(feature = "simd_x86", since = "1.27.0")]
875 pub unsafe fn _mm256_cvtepi32_epi64(a: __m128i) -> __m256i {
876 transmute::<i64x4, _>(simd_cast(a.as_i32x4()))
877 }
878
879 /// Sign-extend 8-bit integers to 16-bit integers.
880 ///
881 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi8_epi16)
882 #[inline]
883 #[target_feature(enable = "avx2")]
884 #[cfg_attr(test, assert_instr(vpmovsxbw))]
885 #[stable(feature = "simd_x86", since = "1.27.0")]
886 pub unsafe fn _mm256_cvtepi8_epi16(a: __m128i) -> __m256i {
887 transmute::<i16x16, _>(simd_cast(a.as_i8x16()))
888 }
889
890 /// Sign-extend 8-bit integers to 32-bit integers.
891 ///
892 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi8_epi32)
893 #[inline]
894 #[target_feature(enable = "avx2")]
895 #[cfg_attr(test, assert_instr(vpmovsxbd))]
896 #[stable(feature = "simd_x86", since = "1.27.0")]
897 pub unsafe fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i {
898 let a = a.as_i8x16();
899 let v64: i8x8 = simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
900 transmute::<i32x8, _>(simd_cast(v64))
901 }
902
903 /// Sign-extend 8-bit integers to 64-bit integers.
904 ///
905 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi8_epi64)
906 #[inline]
907 #[target_feature(enable = "avx2")]
908 #[cfg_attr(test, assert_instr(vpmovsxbq))]
909 #[stable(feature = "simd_x86", since = "1.27.0")]
910 pub unsafe fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i {
911 let a = a.as_i8x16();
912 let v32: i8x4 = simd_shuffle4(a, a, [0, 1, 2, 3]);
913 transmute::<i64x4, _>(simd_cast(v32))
914 }
915
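// Illustrative sketch (not part of the upstream file): widening before
// arithmetic so that intermediate sums cannot overflow the narrow type. The
// helper name `widening_add_example` is invented here.
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn widening_add_example(a: __m128i, b: __m128i) -> __m256i {
    // Sign-extend sixteen i8 values to sixteen i16 values, then add without
    // any risk of i8 overflow.
    _mm256_add_epi16(_mm256_cvtepi8_epi16(a), _mm256_cvtepi8_epi16(b))
}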
916 /// Zero-extend packed unsigned 16-bit integers in `a` to packed 32-bit
917 /// integers.
918 ///
919 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu16_epi32)
920 #[inline]
921 #[target_feature(enable = "avx2")]
922 #[cfg_attr(test, assert_instr(vpmovzxwd))]
923 #[stable(feature = "simd_x86", since = "1.27.0")]
924 pub unsafe fn _mm256_cvtepu16_epi32(a: __m128i) -> __m256i {
925 transmute::<i32x8, _>(simd_cast(a.as_u16x8()))
926 }
927
928 /// Zero-extend the lower four unsigned 16-bit integers in `a` to 64-bit
929 /// integers. The upper four elements of `a` are unused.
930 ///
931 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu16_epi64)
932 #[inline]
933 #[target_feature(enable = "avx2")]
934 #[cfg_attr(test, assert_instr(vpmovzxwq))]
935 #[stable(feature = "simd_x86", since = "1.27.0")]
936 pub unsafe fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i {
937 let a = a.as_u16x8();
938 let v64: u16x4 = simd_shuffle4(a, a, [0, 1, 2, 3]);
939 transmute::<i64x4, _>(simd_cast(v64))
940 }
941
942 /// Zero-extend unsigned 32-bit integers in `a` to 64-bit integers.
943 ///
944 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu32_epi64)
945 #[inline]
946 #[target_feature(enable = "avx2")]
947 #[cfg_attr(test, assert_instr(vpmovzxdq))]
948 #[stable(feature = "simd_x86", since = "1.27.0")]
949 pub unsafe fn _mm256_cvtepu32_epi64(a: __m128i) -> __m256i {
950 transmute::<i64x4, _>(simd_cast(a.as_u32x4()))
951 }
952
953 /// Zero-extend unsigned 8-bit integers in `a` to 16-bit integers.
954 ///
955 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu8_epi16)
956 #[inline]
957 #[target_feature(enable = "avx2")]
958 #[cfg_attr(test, assert_instr(vpmovzxbw))]
959 #[stable(feature = "simd_x86", since = "1.27.0")]
960 pub unsafe fn _mm256_cvtepu8_epi16(a: __m128i) -> __m256i {
961 transmute::<i16x16, _>(simd_cast(a.as_u8x16()))
962 }
963
964 /// Zero-extend the lower eight unsigned 8-bit integers in `a` to 32-bit
965 /// integers. The upper eight elements of `a` are unused.
966 ///
967 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu8_epi32)
968 #[inline]
969 #[target_feature(enable = "avx2")]
970 #[cfg_attr(test, assert_instr(vpmovzxbd))]
971 #[stable(feature = "simd_x86", since = "1.27.0")]
972 pub unsafe fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i {
973 let a = a.as_u8x16();
974 let v64: u8x8 = simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
975 transmute::<i32x8, _>(simd_cast(v64))
976 }
977
978 /// Zero-extend the lower four unsigned 8-bit integers in `a` to 64-bit
979 /// integers. The upper twelve elements of `a` are unused.
980 ///
981 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu8_epi64)
982 #[inline]
983 #[target_feature(enable = "avx2")]
984 #[cfg_attr(test, assert_instr(vpmovzxbq))]
985 #[stable(feature = "simd_x86", since = "1.27.0")]
986 pub unsafe fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i {
987 let a = a.as_u8x16();
988 let v32: u8x4 = simd_shuffle4(a, a, [0, 1, 2, 3]);
989 transmute::<i64x4, _>(simd_cast(v32))
990 }
991
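// Illustrative sketch (not part of the upstream file): zero-extending packed
// bytes, e.g. 8-bit pixel data, before accumulating in 32-bit lanes. The
// helper name `bytes_to_u32_example` is invented here.
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn bytes_to_u32_example(pixels: __m128i) -> __m256i {
    // Only the low eight bytes of `pixels` are used; each becomes a 32-bit lane.
    _mm256_cvtepu8_epi32(pixels)
}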
992 /// Extracts 128 bits (of integer data) from `a` selected with `imm8`.
993 ///
994 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extracti128_si256)
995 #[inline]
996 #[target_feature(enable = "avx2")]
997 #[cfg_attr(
998 all(test, not(target_os = "windows")),
999 assert_instr(vextractf128, imm8 = 1)
1000 )]
1001 #[rustc_args_required_const(1)]
1002 #[stable(feature = "simd_x86", since = "1.27.0")]
1003 pub unsafe fn _mm256_extracti128_si256(a: __m256i, imm8: i32) -> __m128i {
1004 let a = a.as_i64x4();
1005 let b = _mm256_undefined_si256().as_i64x4();
1006 let dst: i64x2 = match imm8 & 0b01 {
1007 0 => simd_shuffle2(a, b, [0, 1]),
1008 _ => simd_shuffle2(a, b, [2, 3]),
1009 };
1010 transmute(dst)
1011 }
1012
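// Illustrative sketch (not part of the upstream file): splitting a 256-bit
// vector into its two 128-bit halves; the low half can also be taken with the
// cheaper `_mm256_castsi256_si128`. The helper name `split_halves_example` is
// invented here.
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn split_halves_example(v: __m256i) -> (__m128i, __m128i) {
    let lo = _mm256_extracti128_si256(v, 0);
    let hi = _mm256_extracti128_si256(v, 1);
    (lo, hi)
}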
1013 /// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`.
1014 ///
1015 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hadd_epi16)
1016 #[inline]
1017 #[target_feature(enable = "avx2")]
1018 #[cfg_attr(test, assert_instr(vphaddw))]
1019 #[stable(feature = "simd_x86", since = "1.27.0")]
1020 pub unsafe fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i {
1021 transmute(phaddw(a.as_i16x16(), b.as_i16x16()))
1022 }
1023
1024 /// Horizontally adds adjacent pairs of 32-bit integers in `a` and `b`.
1025 ///
1026 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hadd_epi32)
1027 #[inline]
1028 #[target_feature(enable = "avx2")]
1029 #[cfg_attr(test, assert_instr(vphaddd))]
1030 #[stable(feature = "simd_x86", since = "1.27.0")]
1031 pub unsafe fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i {
1032 transmute(phaddd(a.as_i32x8(), b.as_i32x8()))
1033 }
1034
1035 /// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`
1036 /// using saturation.
1037 ///
1038 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hadds_epi16)
1039 #[inline]
1040 #[target_feature(enable = "avx2")]
1041 #[cfg_attr(test, assert_instr(vphaddsw))]
1042 #[stable(feature = "simd_x86", since = "1.27.0")]
1043 pub unsafe fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i {
1044 transmute(phaddsw(a.as_i16x16(), b.as_i16x16()))
1045 }
1046
1047 /// Horizontally subtracts adjacent pairs of 16-bit integers in `a` and `b`.
1048 ///
1049 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hsub_epi16)
1050 #[inline]
1051 #[target_feature(enable = "avx2")]
1052 #[cfg_attr(test, assert_instr(vphsubw))]
1053 #[stable(feature = "simd_x86", since = "1.27.0")]
1054 pub unsafe fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i {
1055 transmute(phsubw(a.as_i16x16(), b.as_i16x16()))
1056 }
1057
1058 /// Horizontally subtracts adjacent pairs of 32-bit integers in `a` and `b`.
1059 ///
1060 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hsub_epi32)
1061 #[inline]
1062 #[target_feature(enable = "avx2")]
1063 #[cfg_attr(test, assert_instr(vphsubd))]
1064 #[stable(feature = "simd_x86", since = "1.27.0")]
1065 pub unsafe fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i {
1066 transmute(phsubd(a.as_i32x8(), b.as_i32x8()))
1067 }
1068
1069 /// Horizontally subtracts adjacent pairs of 16-bit integers in `a` and `b`
1070 /// using saturation.
1071 ///
1072 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hsubs_epi16)
1073 #[inline]
1074 #[target_feature(enable = "avx2")]
1075 #[cfg_attr(test, assert_instr(vphsubsw))]
1076 #[stable(feature = "simd_x86", since = "1.27.0")]
1077 pub unsafe fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i {
1078 transmute(phsubsw(a.as_i16x16(), b.as_i16x16()))
1079 }
1080
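// Illustrative sketch (not part of the upstream file): reducing eight 32-bit
// lanes to a single sum with the horizontal adds above. Note that
// `_mm256_hadd_epi32` operates within each 128-bit lane, so the two lane sums
// still have to be combined at the end. The helper name `sum_i32x8_example`
// is invented here.
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn sum_i32x8_example(v: __m256i) -> i32 {
    // After two in-lane horizontal adds, lane 0 of each 128-bit half holds
    // that half's total.
    let s = _mm256_hadd_epi32(v, v);
    let s = _mm256_hadd_epi32(s, s);
    let lo = _mm256_castsi256_si128(s);
    let hi = _mm256_extracti128_si256(s, 1);
    _mm_cvtsi128_si32(_mm_add_epi32(lo, hi))
}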
1081 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1082 /// where
1083 /// `scale` should be 1, 2, 4 or 8.
1084 ///
1085 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32gather_epi32)
1086 #[inline]
1087 #[target_feature(enable = "avx2")]
1088 #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))]
1089 #[rustc_args_required_const(2)]
1090 #[stable(feature = "simd_x86", since = "1.27.0")]
1091 pub unsafe fn _mm_i32gather_epi32(slice: *const i32, offsets: __m128i, scale: i32) -> __m128i {
1092 let zero = _mm_setzero_si128().as_i32x4();
1093 let neg_one = _mm_set1_epi32(-1).as_i32x4();
1094 let offsets = offsets.as_i32x4();
1095 let slice = slice as *const i8;
1096 macro_rules! call {
1097 ($imm8:expr) => {
1098 pgatherdd(zero, slice, offsets, neg_one, $imm8)
1099 };
1100 }
1101 let r = constify_imm8!(scale, call);
1102 transmute(r)
1103 }
1104
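// Illustrative sketch (not part of the upstream file): gathering four table
// entries with `vpgatherdd`. The scale of 4 matches the size of an `i32`
// element, and the array name `TABLE` is invented here.
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn gather_example() -> __m128i {
    static TABLE: [i32; 8] = [10, 11, 12, 13, 14, 15, 16, 17];
    // The byte address of each element is `TABLE.as_ptr() + index * 4`.
    let indices = _mm_setr_epi32(0, 2, 4, 6);
    // Loads [10, 12, 14, 16].
    _mm_i32gather_epi32(TABLE.as_ptr(), indices, 4)
}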
1105 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1106 /// where
1107 /// `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
1108 /// element of `mask` is not set, the value from `src` at that position is used instead.
1109 ///
1110 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32gather_epi32)
1111 #[inline]
1112 #[target_feature(enable = "avx2")]
1113 #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))]
1114 #[rustc_args_required_const(4)]
1115 #[stable(feature = "simd_x86", since = "1.27.0")]
1116 pub unsafe fn _mm_mask_i32gather_epi32(
1117 src: __m128i,
1118 slice: *const i32,
1119 offsets: __m128i,
1120 mask: __m128i,
1121 scale: i32,
1122 ) -> __m128i {
1123 let src = src.as_i32x4();
1124 let mask = mask.as_i32x4();
1125 let offsets = offsets.as_i32x4();
1126 let slice = slice as *const i8;
1127 macro_rules! call {
1128 ($imm8:expr) => {
1129 pgatherdd(src, slice, offsets, mask, $imm8)
1130 };
1131 }
1132 let r = constify_imm8!(scale, call);
1133 transmute(r)
1134 }
1135
1136 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1137 /// where
1138 /// `scale` should be 1, 2, 4 or 8.
1139 ///
1140 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32gather_epi32)
1141 #[inline]
1142 #[target_feature(enable = "avx2")]
1143 #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))]
1144 #[rustc_args_required_const(2)]
1145 #[stable(feature = "simd_x86", since = "1.27.0")]
1146 pub unsafe fn _mm256_i32gather_epi32(slice: *const i32, offsets: __m256i, scale: i32) -> __m256i {
1147 let zero = _mm256_setzero_si256().as_i32x8();
1148 let neg_one = _mm256_set1_epi32(-1).as_i32x8();
1149 let offsets = offsets.as_i32x8();
1150 let slice = slice as *const i8;
1151 macro_rules! call {
1152 ($imm8:expr) => {
1153 vpgatherdd(zero, slice, offsets, neg_one, $imm8)
1154 };
1155 }
1156 let r = constify_imm8!(scale, call);
1157 transmute(r)
1158 }
1159
1160 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1161 /// where
1162 /// `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
1163 /// element of `mask` is not set, the value from `src` at that position is used instead.
1164 ///
1165 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32gather_epi32)
1166 #[inline]
1167 #[target_feature(enable = "avx2")]
1168 #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))]
1169 #[rustc_args_required_const(4)]
1170 #[stable(feature = "simd_x86", since = "1.27.0")]
1171 pub unsafe fn _mm256_mask_i32gather_epi32(
1172 src: __m256i,
1173 slice: *const i32,
1174 offsets: __m256i,
1175 mask: __m256i,
1176 scale: i32,
1177 ) -> __m256i {
1178 let src = src.as_i32x8();
1179 let mask = mask.as_i32x8();
1180 let offsets = offsets.as_i32x8();
1181 let slice = slice as *const i8;
1182 macro_rules! call {
1183 ($imm8:expr) => {
1184 vpgatherdd(src, slice, offsets, mask, $imm8)
1185 };
1186 }
1187 let r = constify_imm8!(scale, call);
1188 transmute(r)
1189 }
1190
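// Illustrative sketch (not part of the upstream file): a masked gather that
// only touches memory for lanes whose mask element has its highest bit set;
// the remaining lanes keep the fallback value from `src`. The names used here
// are invented for the example, and the caller must ensure the unmasked
// indices are in bounds for `table`.
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn masked_gather_example(table: *const i32, indices: __m256i) -> __m256i {
    // Gather only the first four lanes; the last four stay zero.
    let mask = _mm256_setr_epi32(-1, -1, -1, -1, 0, 0, 0, 0);
    let fallback = _mm256_set1_epi32(0);
    _mm256_mask_i32gather_epi32(fallback, table, indices, mask, 4)
}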
1191 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1192 /// where
1193 /// `scale` should be 1, 2, 4 or 8.
1194 ///
1195 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32gather_ps)
1196 #[inline]
1197 #[target_feature(enable = "avx2")]
1198 #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))]
1199 #[rustc_args_required_const(2)]
1200 #[stable(feature = "simd_x86", since = "1.27.0")]
1201 pub unsafe fn _mm_i32gather_ps(slice: *const f32, offsets: __m128i, scale: i32) -> __m128 {
1202 let zero = _mm_setzero_ps();
1203 let neg_one = _mm_set1_ps(-1.0);
1204 let offsets = offsets.as_i32x4();
1205 let slice = slice as *const i8;
1206 macro_rules! call {
1207 ($imm8:expr) => {
1208 pgatherdps(zero, slice, offsets, neg_one, $imm8)
1209 };
1210 }
1211 constify_imm8!(scale, call)
1212 }
1213
1214 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1215 /// where
1216 /// `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
1217 /// element of `mask` is not set, the value from `src` at that position is used instead.
1218 ///
1219 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32gather_ps)
1220 #[inline]
1221 #[target_feature(enable = "avx2")]
1222 #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))]
1223 #[rustc_args_required_const(4)]
1224 #[stable(feature = "simd_x86", since = "1.27.0")]
1225 pub unsafe fn _mm_mask_i32gather_ps(
1226 src: __m128,
1227 slice: *const f32,
1228 offsets: __m128i,
1229 mask: __m128,
1230 scale: i32,
1231 ) -> __m128 {
1232 let offsets = offsets.as_i32x4();
1233 let slice = slice as *const i8;
1234 macro_rules! call {
1235 ($imm8:expr) => {
1236 pgatherdps(src, slice, offsets, mask, $imm8)
1237 };
1238 }
1239 constify_imm8!(scale, call)
1240 }
1241
1242 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1243 /// where
1244 /// `scale` should be 1, 2, 4 or 8.
1245 ///
1246 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32gather_ps)
1247 #[inline]
1248 #[target_feature(enable = "avx2")]
1249 #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))]
1250 #[rustc_args_required_const(2)]
1251 #[stable(feature = "simd_x86", since = "1.27.0")]
1252 pub unsafe fn _mm256_i32gather_ps(slice: *const f32, offsets: __m256i, scale: i32) -> __m256 {
1253 let zero = _mm256_setzero_ps();
1254 let neg_one = _mm256_set1_ps(-1.0);
1255 let offsets = offsets.as_i32x8();
1256 let slice = slice as *const i8;
1257 macro_rules! call {
1258 ($imm8:expr) => {
1259 vpgatherdps(zero, slice, offsets, neg_one, $imm8)
1260 };
1261 }
1262 constify_imm8!(scale, call)
1263 }
1264
1265 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1266 /// where
1267 /// `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
1268 /// element of `mask` is not set, the value from `src` at that position is used instead.
1269 ///
1270 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32gather_ps)
1271 #[inline]
1272 #[target_feature(enable = "avx2")]
1273 #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))]
1274 #[rustc_args_required_const(4)]
1275 #[stable(feature = "simd_x86", since = "1.27.0")]
1276 pub unsafe fn _mm256_mask_i32gather_ps(
1277 src: __m256,
1278 slice: *const f32,
1279 offsets: __m256i,
1280 mask: __m256,
1281 scale: i32,
1282 ) -> __m256 {
1283 let offsets = offsets.as_i32x8();
1284 let slice = slice as *const i8;
1285 macro_rules! call {
1286 ($imm8:expr) => {
1287 vpgatherdps(src, slice, offsets, mask, $imm8)
1288 };
1289 }
1290 constify_imm8!(scale, call)
1291 }
1292
1293 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1294 /// where
1295 /// `scale` should be 1, 2, 4 or 8.
1296 ///
1297 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32gather_epi64)
1298 #[inline]
1299 #[target_feature(enable = "avx2")]
1300 #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))]
1301 #[rustc_args_required_const(2)]
1302 #[stable(feature = "simd_x86", since = "1.27.0")]
1303 pub unsafe fn _mm_i32gather_epi64(slice: *const i64, offsets: __m128i, scale: i32) -> __m128i {
1304 let zero = _mm_setzero_si128().as_i64x2();
1305 let neg_one = _mm_set1_epi64x(-1).as_i64x2();
1306 let offsets = offsets.as_i32x4();
1307 let slice = slice as *const i8;
1308 macro_rules! call {
1309 ($imm8:expr) => {
1310 pgatherdq(zero, slice, offsets, neg_one, $imm8)
1311 };
1312 }
1313 let r = constify_imm8!(scale, call);
1314 transmute(r)
1315 }
1316
1317 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1318 /// where
1319 /// `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
1320 /// element of `mask` is not set, the value from `src` at that position is used instead.
1321 ///
1322 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32gather_epi64)
1323 #[inline]
1324 #[target_feature(enable = "avx2")]
1325 #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))]
1326 #[rustc_args_required_const(4)]
1327 #[stable(feature = "simd_x86", since = "1.27.0")]
1328 pub unsafe fn _mm_mask_i32gather_epi64(
1329 src: __m128i,
1330 slice: *const i64,
1331 offsets: __m128i,
1332 mask: __m128i,
1333 scale: i32,
1334 ) -> __m128i {
1335 let src = src.as_i64x2();
1336 let mask = mask.as_i64x2();
1337 let offsets = offsets.as_i32x4();
1338 let slice = slice as *const i8;
1339 macro_rules! call {
1340 ($imm8:expr) => {
1341 pgatherdq(src, slice, offsets, mask, $imm8)
1342 };
1343 }
1344 let r = constify_imm8!(scale, call);
1345 transmute(r)
1346 }
1347
1348 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1349 /// where
1350 /// `scale` should be 1, 2, 4 or 8.
1351 ///
1352 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32gather_epi64)
1353 #[inline]
1354 #[target_feature(enable = "avx2")]
1355 #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))]
1356 #[rustc_args_required_const(2)]
1357 #[stable(feature = "simd_x86", since = "1.27.0")]
1358 pub unsafe fn _mm256_i32gather_epi64(slice: *const i64, offsets: __m128i, scale: i32) -> __m256i {
1359 let zero = _mm256_setzero_si256().as_i64x4();
1360 let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
1361 let offsets = offsets.as_i32x4();
1362 let slice = slice as *const i8;
1363 macro_rules! call {
1364 ($imm8:expr) => {
1365 vpgatherdq(zero, slice, offsets, neg_one, $imm8)
1366 };
1367 }
1368 let r = constify_imm8!(scale, call);
1369 transmute(r)
1370 }
1371
1372 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1373 /// where
1374 /// `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
1375 /// element of `mask` is not set, the value from `src` at that position is used instead.
1376 ///
1377 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32gather_epi64)
1378 #[inline]
1379 #[target_feature(enable = "avx2")]
1380 #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))]
1381 #[rustc_args_required_const(4)]
1382 #[stable(feature = "simd_x86", since = "1.27.0")]
1383 pub unsafe fn _mm256_mask_i32gather_epi64(
1384 src: __m256i,
1385 slice: *const i64,
1386 offsets: __m128i,
1387 mask: __m256i,
1388 scale: i32,
1389 ) -> __m256i {
1390 let src = src.as_i64x4();
1391 let mask = mask.as_i64x4();
1392 let offsets = offsets.as_i32x4();
1393 let slice = slice as *const i8;
1394 macro_rules! call {
1395 ($imm8:expr) => {
1396 vpgatherdq(src, slice, offsets, mask, $imm8)
1397 };
1398 }
1399 let r = constify_imm8!(scale, call);
1400 transmute(r)
1401 }
1402
1403 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1404 /// where
1405 /// `scale` should be 1, 2, 4 or 8.
1406 ///
1407 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32gather_pd)
1408 #[inline]
1409 #[target_feature(enable = "avx2")]
1410 #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))]
1411 #[rustc_args_required_const(2)]
1412 #[stable(feature = "simd_x86", since = "1.27.0")]
1413 pub unsafe fn _mm_i32gather_pd(slice: *const f64, offsets: __m128i, scale: i32) -> __m128d {
1414 let zero = _mm_setzero_pd();
1415 let neg_one = _mm_set1_pd(-1.0);
1416 let offsets = offsets.as_i32x4();
1417 let slice = slice as *const i8;
1418 macro_rules! call {
1419 ($imm8:expr) => {
1420 pgatherdpd(zero, slice, offsets, neg_one, $imm8)
1421 };
1422 }
1423 constify_imm8!(scale, call)
1424 }
1425
1426 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1427 /// where
1428 /// `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
1429 /// element of `mask` is not set, the value from `src` at that position is used instead.
1430 ///
1431 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32gather_pd)
1432 #[inline]
1433 #[target_feature(enable = "avx2")]
1434 #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))]
1435 #[rustc_args_required_const(4)]
1436 #[stable(feature = "simd_x86", since = "1.27.0")]
1437 pub unsafe fn _mm_mask_i32gather_pd(
1438 src: __m128d,
1439 slice: *const f64,
1440 offsets: __m128i,
1441 mask: __m128d,
1442 scale: i32,
1443 ) -> __m128d {
1444 let offsets = offsets.as_i32x4();
1445 let slice = slice as *const i8;
1446 macro_rules! call {
1447 ($imm8:expr) => {
1448 pgatherdpd(src, slice, offsets, mask, $imm8)
1449 };
1450 }
1451 constify_imm8!(scale, call)
1452 }
1453
1454 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1455 /// where
1456 /// `scale` should be 1, 2, 4 or 8.
1457 ///
1458 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32gather_pd)
1459 #[inline]
1460 #[target_feature(enable = "avx2")]
1461 #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))]
1462 #[rustc_args_required_const(2)]
1463 #[stable(feature = "simd_x86", since = "1.27.0")]
1464 pub unsafe fn _mm256_i32gather_pd(slice: *const f64, offsets: __m128i, scale: i32) -> __m256d {
1465 let zero = _mm256_setzero_pd();
1466 let neg_one = _mm256_set1_pd(-1.0);
1467 let offsets = offsets.as_i32x4();
1468 let slice = slice as *const i8;
1469 macro_rules! call {
1470 ($imm8:expr) => {
1471 vpgatherdpd(zero, slice, offsets, neg_one, $imm8)
1472 };
1473 }
1474 constify_imm8!(scale, call)
1475 }
1476
1477 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1478 /// where
1479 /// `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
1480 /// element of `mask` is not set, the value from `src` at that position is used instead.
1481 ///
1482 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32gather_pd)
1483 #[inline]
1484 #[target_feature(enable = "avx2")]
1485 #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))]
1486 #[rustc_args_required_const(4)]
1487 #[stable(feature = "simd_x86", since = "1.27.0")]
1488 pub unsafe fn _mm256_mask_i32gather_pd(
1489 src: __m256d,
1490 slice: *const f64,
1491 offsets: __m128i,
1492 mask: __m256d,
1493 scale: i32,
1494 ) -> __m256d {
1495 let offsets = offsets.as_i32x4();
1496 let slice = slice as *const i8;
1497 macro_rules! call {
1498 ($imm8:expr) => {
1499 vpgatherdpd(src, slice, offsets, mask, $imm8)
1500 };
1501 }
1502 constify_imm8!(scale, call)
1503 }
1504
1505 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1506 /// where
1507 /// `scale` should be 1, 2, 4 or 8.
1508 ///
1509 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64gather_epi32)
1510 #[inline]
1511 #[target_feature(enable = "avx2")]
1512 #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))]
1513 #[rustc_args_required_const(2)]
1514 #[stable(feature = "simd_x86", since = "1.27.0")]
1515 pub unsafe fn _mm_i64gather_epi32(slice: *const i32, offsets: __m128i, scale: i32) -> __m128i {
1516 let zero = _mm_setzero_si128().as_i32x4();
1517 let neg_one = _mm_set1_epi64x(-1).as_i32x4();
1518 let offsets = offsets.as_i64x2();
1519 let slice = slice as *const i8;
1520 macro_rules! call {
1521 ($imm8:expr) => {
1522 pgatherqd(zero, slice, offsets, neg_one, $imm8)
1523 };
1524 }
1525 let r = constify_imm8!(scale, call);
1526 transmute(r)
1527 }
1528
1529 /// Returns values from `slice` at byte offsets determined by `offsets * scale`,
1530 /// where `scale` is 1, 2, 4, or 8. For positions whose `mask` element does not
1531 /// have its highest bit set, the value is copied from `src` instead of being
1532 /// loaded from memory.
1533 ///
1534 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64gather_epi32)
1535 #[inline]
1536 #[target_feature(enable = "avx2")]
1537 #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))]
1538 #[rustc_args_required_const(4)]
1539 #[stable(feature = "simd_x86", since = "1.27.0")]
1540 pub unsafe fn _mm_mask_i64gather_epi32(
1541 src: __m128i,
1542 slice: *const i32,
1543 offsets: __m128i,
1544 mask: __m128i,
1545 scale: i32,
1546 ) -> __m128i {
1547 let src = src.as_i32x4();
1548 let mask = mask.as_i32x4();
1549 let offsets = offsets.as_i64x2();
1550 let slice = slice as *const i8;
1551 macro_rules! call {
1552 ($imm8:expr) => {
1553 pgatherqd(src, slice, offsets, mask, $imm8)
1554 };
1555 }
1556 let r = constify_imm8!(scale, call);
1557 transmute(r)
1558 }
1559
1560 /// Returns values from `slice` at byte offsets determined by `offsets * scale`,
1561 /// where `scale` is 1, 2, 4, or 8.
1563 ///
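/// A short illustrative sketch (not from Intel's documentation) showing the
/// four 64-bit offsets producing a 128-bit result of 32-bit values:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let data = [100_i32, 101, 102, 103, 104, 105, 106, 107];
/// let offsets = _mm256_setr_epi64x(0, 3, 5, 7);
/// // `scale` is 4 because each offset indexes a 4-byte `i32`.
/// let r = _mm256_i64gather_epi32(data.as_ptr(), offsets, 4);
///
/// let mut out = [0_i32; 4];
/// _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, r);
/// assert_eq!(out, [100, 103, 105, 107]);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///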
1564 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64gather_epi32)
1565 #[inline]
1566 #[target_feature(enable = "avx2")]
1567 #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))]
1568 #[rustc_args_required_const(2)]
1569 #[stable(feature = "simd_x86", since = "1.27.0")]
1570 pub unsafe fn _mm256_i64gather_epi32(slice: *const i32, offsets: __m256i, scale: i32) -> __m128i {
1571 let zero = _mm_setzero_si128().as_i32x4();
1572 let neg_one = _mm_set1_epi64x(-1).as_i32x4();
1573 let offsets = offsets.as_i64x4();
1574 let slice = slice as *const i8;
1575 macro_rules! call {
1576 ($imm8:expr) => {
1577 vpgatherqd(zero, slice, offsets, neg_one, $imm8)
1578 };
1579 }
1580 let r = constify_imm8!(scale, call);
1581 transmute(r)
1582 }
1583
1584 /// Returns values from `slice` at byte offsets determined by `offsets * scale`,
1585 /// where `scale` is 1, 2, 4, or 8. For positions whose `mask` element does not
1586 /// have its highest bit set, the value is copied from `src` instead of being
1587 /// loaded from memory.
1588 ///
1589 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64gather_epi32)
1590 #[inline]
1591 #[target_feature(enable = "avx2")]
1592 #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))]
1593 #[rustc_args_required_const(4)]
1594 #[stable(feature = "simd_x86", since = "1.27.0")]
1595 pub unsafe fn _mm256_mask_i64gather_epi32(
1596 src: __m128i,
1597 slice: *const i32,
1598 offsets: __m256i,
1599 mask: __m128i,
1600 scale: i32,
1601 ) -> __m128i {
1602 let src = src.as_i32x4();
1603 let mask = mask.as_i32x4();
1604 let offsets = offsets.as_i64x4();
1605 let slice = slice as *const i8;
1606 macro_rules! call {
1607 ($imm8:expr) => {
1608 vpgatherqd(src, slice, offsets, mask, $imm8)
1609 };
1610 }
1611 let r = constify_imm8!(scale, call);
1612 transmute(r)
1613 }
1614
1615 /// Returns values from `slice` at byte offsets determined by `offsets * scale`,
1616 /// where `scale` is 1, 2, 4, or 8.
1618 ///
1619 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64gather_ps)
1620 #[inline]
1621 #[target_feature(enable = "avx2")]
1622 #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))]
1623 #[rustc_args_required_const(2)]
1624 #[stable(feature = "simd_x86", since = "1.27.0")]
1625 pub unsafe fn _mm_i64gather_ps(slice: *const f32, offsets: __m128i, scale: i32) -> __m128 {
1626 let zero = _mm_setzero_ps();
1627 let neg_one = _mm_set1_ps(-1.0);
1628 let offsets = offsets.as_i64x2();
1629 let slice = slice as *const i8;
1630 macro_rules! call {
1631 ($imm8:expr) => {
1632 pgatherqps(zero, slice, offsets, neg_one, $imm8)
1633 };
1634 }
1635 constify_imm8!(scale, call)
1636 }
1637
1638 /// Returns values from `slice` at byte offsets determined by `offsets * scale`,
1639 /// where `scale` is 1, 2, 4, or 8. For positions whose `mask` element does not
1640 /// have its highest bit set, the value is copied from `src` instead of being
1641 /// loaded from memory.
1642 ///
1643 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64gather_ps)
1644 #[inline]
1645 #[target_feature(enable = "avx2")]
1646 #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))]
1647 #[rustc_args_required_const(4)]
1648 #[stable(feature = "simd_x86", since = "1.27.0")]
1649 pub unsafe fn _mm_mask_i64gather_ps(
1650 src: __m128,
1651 slice: *const f32,
1652 offsets: __m128i,
1653 mask: __m128,
1654 scale: i32,
1655 ) -> __m128 {
1656 let offsets = offsets.as_i64x2();
1657 let slice = slice as *const i8;
1658 macro_rules! call {
1659 ($imm8:expr) => {
1660 pgatherqps(src, slice, offsets, mask, $imm8)
1661 };
1662 }
1663 constify_imm8!(scale, call)
1664 }
1665
1666 /// Returns values from `slice` at byte offsets determined by `offsets * scale`,
1667 /// where `scale` is 1, 2, 4, or 8.
1669 ///
1670 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64gather_ps)
1671 #[inline]
1672 #[target_feature(enable = "avx2")]
1673 #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))]
1674 #[rustc_args_required_const(2)]
1675 #[stable(feature = "simd_x86", since = "1.27.0")]
1676 pub unsafe fn _mm256_i64gather_ps(slice: *const f32, offsets: __m256i, scale: i32) -> __m128 {
1677 let zero = _mm_setzero_ps();
1678 let neg_one = _mm_set1_ps(-1.0);
1679 let offsets = offsets.as_i64x4();
1680 let slice = slice as *const i8;
1681 macro_rules! call {
1682 ($imm8:expr) => {
1683 vpgatherqps(zero, slice, offsets, neg_one, $imm8)
1684 };
1685 }
1686 constify_imm8!(scale, call)
1687 }
1688
1689 /// Returns values from `slice` at byte offsets determined by `offsets * scale`,
1690 /// where `scale` is 1, 2, 4, or 8. For positions whose `mask` element does not
1691 /// have its highest bit set, the value is copied from `src` instead of being
1692 /// loaded from memory.
1693 ///
1694 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64gather_ps)
1695 #[inline]
1696 #[target_feature(enable = "avx2")]
1697 #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))]
1698 #[rustc_args_required_const(4)]
1699 #[stable(feature = "simd_x86", since = "1.27.0")]
1700 pub unsafe fn _mm256_mask_i64gather_ps(
1701 src: __m128,
1702 slice: *const f32,
1703 offsets: __m256i,
1704 mask: __m128,
1705 scale: i32,
1706 ) -> __m128 {
1707 let offsets = offsets.as_i64x4();
1708 let slice = slice as *const i8;
1709 macro_rules! call {
1710 ($imm8:expr) => {
1711 vpgatherqps(src, slice, offsets, mask, $imm8)
1712 };
1713 }
1714 constify_imm8!(scale, call)
1715 }
1716
1717 /// Returns values from `slice` at byte offsets determined by `offsets * scale`,
1718 /// where `scale` is 1, 2, 4, or 8.
1720 ///
1721 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64gather_epi64)
1722 #[inline]
1723 #[target_feature(enable = "avx2")]
1724 #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))]
1725 #[rustc_args_required_const(2)]
1726 #[stable(feature = "simd_x86", since = "1.27.0")]
1727 pub unsafe fn _mm_i64gather_epi64(slice: *const i64, offsets: __m128i, scale: i32) -> __m128i {
1728 let zero = _mm_setzero_si128().as_i64x2();
1729 let neg_one = _mm_set1_epi64x(-1).as_i64x2();
1730 let slice = slice as *const i8;
1731 let offsets = offsets.as_i64x2();
1732 macro_rules! call {
1733 ($imm8:expr) => {
1734 pgatherqq(zero, slice, offsets, neg_one, $imm8)
1735 };
1736 }
1737 let r = constify_imm8!(scale, call);
1738 transmute(r)
1739 }
1740
1741 /// Returns values from `slice` at byte offsets determined by `offsets * scale`,
1742 /// where `scale` is 1, 2, 4, or 8. For positions whose `mask` element does not
1743 /// have its highest bit set, the value is copied from `src` instead of being
1744 /// loaded from memory.
1745 ///
1746 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64gather_epi64)
1747 #[inline]
1748 #[target_feature(enable = "avx2")]
1749 #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))]
1750 #[rustc_args_required_const(4)]
1751 #[stable(feature = "simd_x86", since = "1.27.0")]
1752 pub unsafe fn _mm_mask_i64gather_epi64(
1753 src: __m128i,
1754 slice: *const i64,
1755 offsets: __m128i,
1756 mask: __m128i,
1757 scale: i32,
1758 ) -> __m128i {
1759 let src = src.as_i64x2();
1760 let mask = mask.as_i64x2();
1761 let offsets = offsets.as_i64x2();
1762 let slice = slice as *const i8;
1763 macro_rules! call {
1764 ($imm8:expr) => {
1765 pgatherqq(src, slice, offsets, mask, $imm8)
1766 };
1767 }
1768 let r = constify_imm8!(scale, call);
1769 transmute(r)
1770 }
1771
1772 /// Returns values from `slice` at byte offsets determined by `offsets * scale`,
1773 /// where `scale` is 1, 2, 4, or 8.
1775 ///
1776 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64gather_epi64)
1777 #[inline]
1778 #[target_feature(enable = "avx2")]
1779 #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))]
1780 #[rustc_args_required_const(2)]
1781 #[stable(feature = "simd_x86", since = "1.27.0")]
1782 pub unsafe fn _mm256_i64gather_epi64(slice: *const i64, offsets: __m256i, scale: i32) -> __m256i {
1783 let zero = _mm256_setzero_si256().as_i64x4();
1784 let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
1785 let slice = slice as *const i8;
1786 let offsets = offsets.as_i64x4();
1787 macro_rules! call {
1788 ($imm8:expr) => {
1789 vpgatherqq(zero, slice, offsets, neg_one, $imm8)
1790 };
1791 }
1792 let r = constify_imm8!(scale, call);
1793 transmute(r)
1794 }
1795
1796 /// Returns values from `slice` at byte offsets determined by `offsets * scale`,
1797 /// where `scale` is 1, 2, 4, or 8. For positions whose `mask` element does not
1798 /// have its highest bit set, the value is copied from `src` instead of being
1799 /// loaded from memory.
1800 ///
1801 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64gather_epi64)
1802 #[inline]
1803 #[target_feature(enable = "avx2")]
1804 #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))]
1805 #[rustc_args_required_const(4)]
1806 #[stable(feature = "simd_x86", since = "1.27.0")]
1807 pub unsafe fn _mm256_mask_i64gather_epi64(
1808 src: __m256i,
1809 slice: *const i64,
1810 offsets: __m256i,
1811 mask: __m256i,
1812 scale: i32,
1813 ) -> __m256i {
1814 let src = src.as_i64x4();
1815 let mask = mask.as_i64x4();
1816 let offsets = offsets.as_i64x4();
1817 let slice = slice as *const i8;
1818 macro_rules! call {
1819 ($imm8:expr) => {
1820 vpgatherqq(src, slice, offsets, mask, $imm8)
1821 };
1822 }
1823 let r = constify_imm8!(scale, call);
1824 transmute(r)
1825 }
1826
1827 /// Returns values from `slice` at byte offsets determined by `offsets * scale`,
1828 /// where `scale` is 1, 2, 4, or 8.
1830 ///
1831 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64gather_pd)
1832 #[inline]
1833 #[target_feature(enable = "avx2")]
1834 #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))]
1835 #[rustc_args_required_const(2)]
1836 #[stable(feature = "simd_x86", since = "1.27.0")]
1837 pub unsafe fn _mm_i64gather_pd(slice: *const f64, offsets: __m128i, scale: i32) -> __m128d {
1838 let zero = _mm_setzero_pd();
1839 let neg_one = _mm_set1_pd(-1.0);
1840 let slice = slice as *const i8;
1841 let offsets = offsets.as_i64x2();
1842 macro_rules! call {
1843 ($imm8:expr) => {
1844 pgatherqpd(zero, slice, offsets, neg_one, $imm8)
1845 };
1846 }
1847 constify_imm8!(scale, call)
1848 }
1849
1850 /// Returns values from `slice` at byte offsets determined by `offsets * scale`,
1851 /// where `scale` is 1, 2, 4, or 8. For positions whose `mask` element does not
1852 /// have its highest bit set, the value is copied from `src` instead of being
1853 /// loaded from memory.
1854 ///
1855 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64gather_pd)
1856 #[inline]
1857 #[target_feature(enable = "avx2")]
1858 #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))]
1859 #[rustc_args_required_const(4)]
1860 #[stable(feature = "simd_x86", since = "1.27.0")]
1861 pub unsafe fn _mm_mask_i64gather_pd(
1862 src: __m128d,
1863 slice: *const f64,
1864 offsets: __m128i,
1865 mask: __m128d,
1866 scale: i32,
1867 ) -> __m128d {
1868 let slice = slice as *const i8;
1869 let offsets = offsets.as_i64x2();
1870 macro_rules! call {
1871 ($imm8:expr) => {
1872 pgatherqpd(src, slice, offsets, mask, $imm8)
1873 };
1874 }
1875 constify_imm8!(scale, call)
1876 }
1877
1878 /// Returns values from `slice` at byte offsets determined by `offsets * scale`,
1879 /// where `scale` is 1, 2, 4, or 8.
1881 ///
1882 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64gather_pd)
1883 #[inline]
1884 #[target_feature(enable = "avx2")]
1885 #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))]
1886 #[rustc_args_required_const(2)]
1887 #[stable(feature = "simd_x86", since = "1.27.0")]
1888 pub unsafe fn _mm256_i64gather_pd(slice: *const f64, offsets: __m256i, scale: i32) -> __m256d {
1889 let zero = _mm256_setzero_pd();
1890 let neg_one = _mm256_set1_pd(-1.0);
1891 let slice = slice as *const i8;
1892 let offsets = offsets.as_i64x4();
1893 macro_rules! call {
1894 ($imm8:expr) => {
1895 vpgatherqpd(zero, slice, offsets, neg_one, $imm8)
1896 };
1897 }
1898 constify_imm8!(scale, call)
1899 }
1900
1901 /// Returns values from `slice` at byte offsets determined by `offsets * scale`,
1902 /// where `scale` is 1, 2, 4, or 8. For positions whose `mask` element does not
1903 /// have its highest bit set, the value is copied from `src` instead of being
1904 /// loaded from memory.
1905 ///
1906 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64gather_pd)
1907 #[inline]
1908 #[target_feature(enable = "avx2")]
1909 #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))]
1910 #[rustc_args_required_const(4)]
1911 #[stable(feature = "simd_x86", since = "1.27.0")]
1912 pub unsafe fn _mm256_mask_i64gather_pd(
1913 src: __m256d,
1914 slice: *const f64,
1915 offsets: __m256i,
1916 mask: __m256d,
1917 scale: i32,
1918 ) -> __m256d {
1919 let slice = slice as *const i8;
1920 let offsets = offsets.as_i64x4();
1921 macro_rules! call {
1922 ($imm8:expr) => {
1923 vpgatherqpd(src, slice, offsets, mask, $imm8)
1924 };
1925 }
1926 constify_imm8!(scale, call)
1927 }
1928
1929 /// Copies `a` to `dst`, then inserts 128 bits (of integer data) from `b` at the
1930 /// location specified by `imm8`.
1931 ///
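/// A usage sketch (added for illustration, not from Intel's documentation):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_setr_epi64x(0, 1, 2, 3);
/// let b = _mm_set_epi64x(11, 10); // highest element first
/// // `imm8 = 1` replaces the upper 128-bit half of `a` with `b`.
/// let r = _mm256_inserti128_si256(a, b, 1);
///
/// let mut out = [0_i64; 4];
/// _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
/// assert_eq!(out, [0, 1, 10, 11]);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///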
1932 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_inserti128_si256)
1933 #[inline]
1934 #[target_feature(enable = "avx2")]
1935 #[cfg_attr(
1936 all(test, not(target_os = "windows")),
1937 assert_instr(vinsertf128, imm8 = 1)
1938 )]
1939 #[rustc_args_required_const(2)]
1940 #[stable(feature = "simd_x86", since = "1.27.0")]
1941 pub unsafe fn _mm256_inserti128_si256(a: __m256i, b: __m128i, imm8: i32) -> __m256i {
1942 let a = a.as_i64x4();
1943 let b = _mm256_castsi128_si256(b).as_i64x4();
1944 let dst: i64x4 = match imm8 & 0b01 {
1945 0 => simd_shuffle4(a, b, [4, 5, 2, 3]),
1946 _ => simd_shuffle4(a, b, [0, 1, 4, 5]),
1947 };
1948 transmute(dst)
1949 }
1950
1951 /// Multiplies packed signed 16-bit integers in `a` and `b`, producing
1952 /// intermediate signed 32-bit integers. Horizontally adds adjacent pairs
1953 /// of intermediate 32-bit integers.
1954 ///
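/// A small illustrative sketch (not part of Intel's documentation):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_set1_epi16(2);
/// let b = _mm256_set1_epi16(3);
/// // Each 32-bit result is 2 * 3 + 2 * 3 = 12.
/// let r = _mm256_madd_epi16(a, b);
///
/// let mut out = [0_i32; 8];
/// _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
/// assert_eq!(out, [12; 8]);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///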
1955 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_madd_epi16)
1956 #[inline]
1957 #[target_feature(enable = "avx2")]
1958 #[cfg_attr(test, assert_instr(vpmaddwd))]
1959 #[stable(feature = "simd_x86", since = "1.27.0")]
1960 pub unsafe fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i {
1961 transmute(pmaddwd(a.as_i16x16(), b.as_i16x16()))
1962 }
1963
1964 /// Vertically multiplies each unsigned 8-bit integer from `a` with the
1965 /// corresponding signed 8-bit integer from `b`, producing intermediate
1966 /// signed 16-bit integers. Horizontally adds adjacent pairs of intermediate
1967 /// signed 16-bit integers.
1968 ///
1969 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maddubs_epi16)
1970 #[inline]
1971 #[target_feature(enable = "avx2")]
1972 #[cfg_attr(test, assert_instr(vpmaddubsw))]
1973 #[stable(feature = "simd_x86", since = "1.27.0")]
1974 pub unsafe fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i {
1975 transmute(pmaddubsw(a.as_u8x32(), b.as_u8x32()))
1976 }
1977
1978 /// Loads packed 32-bit integers from memory pointed to by `mem_addr` using `mask`
1979 /// (elements are zeroed out when the highest bit is not set in the
1980 /// corresponding element).
1981 ///
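/// A usage sketch (illustrative only, not from Intel's documentation):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let data = [1_i32, 2, 3, 4];
/// // Only lanes whose mask element has its highest bit set are loaded.
/// let mask = _mm_setr_epi32(-1, 0, -1, 0);
/// let r = _mm_maskload_epi32(data.as_ptr(), mask);
///
/// let mut out = [0_i32; 4];
/// _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, r);
/// assert_eq!(out, [1, 0, 3, 0]);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///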
1982 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskload_epi32)
1983 #[inline]
1984 #[target_feature(enable = "avx2")]
1985 #[cfg_attr(test, assert_instr(vpmaskmovd))]
1986 #[stable(feature = "simd_x86", since = "1.27.0")]
1987 pub unsafe fn _mm_maskload_epi32(mem_addr: *const i32, mask: __m128i) -> __m128i {
1988 transmute(maskloadd(mem_addr as *const i8, mask.as_i32x4()))
1989 }
1990
1991 /// Loads packed 32-bit integers from memory pointed to by `mem_addr` using `mask`
1992 /// (elements are zeroed out when the highest bit is not set in the
1993 /// corresponding element).
1994 ///
1995 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskload_epi32)
1996 #[inline]
1997 #[target_feature(enable = "avx2")]
1998 #[cfg_attr(test, assert_instr(vpmaskmovd))]
1999 #[stable(feature = "simd_x86", since = "1.27.0")]
2000 pub unsafe fn _mm256_maskload_epi32(mem_addr: *const i32, mask: __m256i) -> __m256i {
2001 transmute(maskloadd256(mem_addr as *const i8, mask.as_i32x8()))
2002 }
2003
2004 /// Loads packed 64-bit integers from memory pointed to by `mem_addr` using `mask`
2005 /// (elements are zeroed out when the highest bit is not set in the
2006 /// corresponding element).
2007 ///
2008 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskload_epi64)
2009 #[inline]
2010 #[target_feature(enable = "avx2")]
2011 #[cfg_attr(test, assert_instr(vpmaskmovq))]
2012 #[stable(feature = "simd_x86", since = "1.27.0")]
2013 pub unsafe fn _mm_maskload_epi64(mem_addr: *const i64, mask: __m128i) -> __m128i {
2014 transmute(maskloadq(mem_addr as *const i8, mask.as_i64x2()))
2015 }
2016
2017 /// Loads packed 64-bit integers from memory pointed to by `mem_addr` using `mask`
2018 /// (elements are zeroed out when the highest bit is not set in the
2019 /// corresponding element).
2020 ///
2021 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskload_epi64)
2022 #[inline]
2023 #[target_feature(enable = "avx2")]
2024 #[cfg_attr(test, assert_instr(vpmaskmovq))]
2025 #[stable(feature = "simd_x86", since = "1.27.0")]
2026 pub unsafe fn _mm256_maskload_epi64(mem_addr: *const i64, mask: __m256i) -> __m256i {
2027 transmute(maskloadq256(mem_addr as *const i8, mask.as_i64x4()))
2028 }
2029
2030 /// Stores packed 32-bit integers from `a` into memory pointed to by `mem_addr`
2031 /// using `mask` (elements are not stored when the highest bit is not set
2032 /// in the corresponding element).
2033 ///
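/// An illustrative sketch of typical usage (not part of Intel's
/// documentation):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm_setr_epi32(1, 2, 3, 4);
/// let mask = _mm_setr_epi32(-1, 0, 0, -1);
/// let mut out = [9_i32; 4];
/// // Only lanes whose mask element has its highest bit set are written.
/// _mm_maskstore_epi32(out.as_mut_ptr(), mask, a);
/// assert_eq!(out, [1, 9, 9, 4]);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///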
2034 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskstore_epi32)
2035 #[inline]
2036 #[target_feature(enable = "avx2")]
2037 #[cfg_attr(test, assert_instr(vpmaskmovd))]
2038 #[stable(feature = "simd_x86", since = "1.27.0")]
2039 pub unsafe fn _mm_maskstore_epi32(mem_addr: *mut i32, mask: __m128i, a: __m128i) {
2040 maskstored(mem_addr as *mut i8, mask.as_i32x4(), a.as_i32x4())
2041 }
2042
2043 /// Stores packed 32-bit integers from `a` into memory pointed to by `mem_addr`
2044 /// using `mask` (elements are not stored when the highest bit is not set
2045 /// in the corresponding element).
2046 ///
2047 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskstore_epi32)
2048 #[inline]
2049 #[target_feature(enable = "avx2")]
2050 #[cfg_attr(test, assert_instr(vpmaskmovd))]
2051 #[stable(feature = "simd_x86", since = "1.27.0")]
2052 pub unsafe fn _mm256_maskstore_epi32(mem_addr: *mut i32, mask: __m256i, a: __m256i) {
2053 maskstored256(mem_addr as *mut i8, mask.as_i32x8(), a.as_i32x8())
2054 }
2055
2056 /// Stores packed 64-bit integers from `a` into memory pointed to by `mem_addr`
2057 /// using `mask` (elements are not stored when the highest bit is not set
2058 /// in the corresponding element).
2059 ///
2060 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskstore_epi64)
2061 #[inline]
2062 #[target_feature(enable = "avx2")]
2063 #[cfg_attr(test, assert_instr(vpmaskmovq))]
2064 #[stable(feature = "simd_x86", since = "1.27.0")]
2065 pub unsafe fn _mm_maskstore_epi64(mem_addr: *mut i64, mask: __m128i, a: __m128i) {
2066 maskstoreq(mem_addr as *mut i8, mask.as_i64x2(), a.as_i64x2())
2067 }
2068
2069 /// Stores packed 64-bit integers from `a` into memory pointed to by `mem_addr`
2070 /// using `mask` (elements are not stored when the highest bit is not set
2071 /// in the corresponding element).
2072 ///
2073 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskstore_epi64)
2074 #[inline]
2075 #[target_feature(enable = "avx2")]
2076 #[cfg_attr(test, assert_instr(vpmaskmovq))]
2077 #[stable(feature = "simd_x86", since = "1.27.0")]
2078 pub unsafe fn _mm256_maskstore_epi64(mem_addr: *mut i64, mask: __m256i, a: __m256i) {
2079 maskstoreq256(mem_addr as *mut i8, mask.as_i64x4(), a.as_i64x4())
2080 }
2081
2082 /// Compares packed 16-bit integers in `a` and `b`, and returns the packed
2083 /// maximum values.
2084 ///
2085 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epi16)
2086 #[inline]
2087 #[target_feature(enable = "avx2")]
2088 #[cfg_attr(test, assert_instr(vpmaxsw))]
2089 #[stable(feature = "simd_x86", since = "1.27.0")]
2090 pub unsafe fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i {
2091 transmute(pmaxsw(a.as_i16x16(), b.as_i16x16()))
2092 }
2093
2094 /// Compares packed 32-bit integers in `a` and `b`, and returns the packed
2095 /// maximum values.
2096 ///
2097 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epi32)
2098 #[inline]
2099 #[target_feature(enable = "avx2")]
2100 #[cfg_attr(test, assert_instr(vpmaxsd))]
2101 #[stable(feature = "simd_x86", since = "1.27.0")]
2102 pub unsafe fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i {
2103 transmute(pmaxsd(a.as_i32x8(), b.as_i32x8()))
2104 }
2105
2106 /// Compares packed 8-bit integers in `a` and `b`, and returns the packed
2107 /// maximum values.
2108 ///
2109 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epi8)
2110 #[inline]
2111 #[target_feature(enable = "avx2")]
2112 #[cfg_attr(test, assert_instr(vpmaxsb))]
2113 #[stable(feature = "simd_x86", since = "1.27.0")]
2114 pub unsafe fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i {
2115 transmute(pmaxsb(a.as_i8x32(), b.as_i8x32()))
2116 }
2117
2118 /// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
2119 /// the packed maximum values.
2120 ///
2121 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epu16)
2122 #[inline]
2123 #[target_feature(enable = "avx2")]
2124 #[cfg_attr(test, assert_instr(vpmaxuw))]
2125 #[stable(feature = "simd_x86", since = "1.27.0")]
2126 pub unsafe fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i {
2127 transmute(pmaxuw(a.as_u16x16(), b.as_u16x16()))
2128 }
2129
2130 /// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
2131 /// the packed maximum values.
2132 ///
2133 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epu32)
2134 #[inline]
2135 #[target_feature(enable = "avx2")]
2136 #[cfg_attr(test, assert_instr(vpmaxud))]
2137 #[stable(feature = "simd_x86", since = "1.27.0")]
2138 pub unsafe fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i {
2139 transmute(pmaxud(a.as_u32x8(), b.as_u32x8()))
2140 }
2141
2142 /// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
2143 /// the packed maximum values.
2144 ///
2145 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epu8)
2146 #[inline]
2147 #[target_feature(enable = "avx2")]
2148 #[cfg_attr(test, assert_instr(vpmaxub))]
2149 #[stable(feature = "simd_x86", since = "1.27.0")]
2150 pub unsafe fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i {
2151 transmute(pmaxub(a.as_u8x32(), b.as_u8x32()))
2152 }
2153
2154 /// Compares packed 16-bit integers in `a` and `b`, and returns the packed
2155 /// minimum values.
2156 ///
2157 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epi16)
2158 #[inline]
2159 #[target_feature(enable = "avx2")]
2160 #[cfg_attr(test, assert_instr(vpminsw))]
2161 #[stable(feature = "simd_x86", since = "1.27.0")]
2162 pub unsafe fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i {
2163 transmute(pminsw(a.as_i16x16(), b.as_i16x16()))
2164 }
2165
2166 /// Compares packed 32-bit integers in `a` and `b`, and returns the packed
2167 /// minimum values.
2168 ///
2169 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epi32)
2170 #[inline]
2171 #[target_feature(enable = "avx2")]
2172 #[cfg_attr(test, assert_instr(vpminsd))]
2173 #[stable(feature = "simd_x86", since = "1.27.0")]
2174 pub unsafe fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i {
2175 transmute(pminsd(a.as_i32x8(), b.as_i32x8()))
2176 }
2177
2178 /// Compares packed 8-bit integers in `a` and `b`, and returns the packed
2179 /// minimum values.
2180 ///
2181 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epi8)
2182 #[inline]
2183 #[target_feature(enable = "avx2")]
2184 #[cfg_attr(test, assert_instr(vpminsb))]
2185 #[stable(feature = "simd_x86", since = "1.27.0")]
2186 pub unsafe fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i {
2187 transmute(pminsb(a.as_i8x32(), b.as_i8x32()))
2188 }
2189
2190 /// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
2191 /// the packed minimum values.
2192 ///
2193 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epu16)
2194 #[inline]
2195 #[target_feature(enable = "avx2")]
2196 #[cfg_attr(test, assert_instr(vpminuw))]
2197 #[stable(feature = "simd_x86", since = "1.27.0")]
2198 pub unsafe fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i {
2199 transmute(pminuw(a.as_u16x16(), b.as_u16x16()))
2200 }
2201
2202 /// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
2203 /// the packed minimum values.
2204 ///
2205 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epu32)
2206 #[inline]
2207 #[target_feature(enable = "avx2")]
2208 #[cfg_attr(test, assert_instr(vpminud))]
2209 #[stable(feature = "simd_x86", since = "1.27.0")]
2210 pub unsafe fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i {
2211 transmute(pminud(a.as_u32x8(), b.as_u32x8()))
2212 }
2213
2214 /// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
2215 /// the packed minimum values.
2216 ///
2217 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epu8)
2218 #[inline]
2219 #[target_feature(enable = "avx2")]
2220 #[cfg_attr(test, assert_instr(vpminub))]
2221 #[stable(feature = "simd_x86", since = "1.27.0")]
2222 pub unsafe fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i {
2223 transmute(pminub(a.as_u8x32(), b.as_u8x32()))
2224 }
2225
2226 /// Creates a mask from the most significant bit of each 8-bit element in `a`
2227 /// and returns the result.
2228 ///
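/// A brief usage sketch (illustrative, not from Intel's documentation):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// // All 32 sign bits set -> all 32 mask bits set.
/// let a = _mm256_set1_epi8(-1);
/// assert_eq!(_mm256_movemask_epi8(a), !0);
/// // No sign bit set -> empty mask.
/// let b = _mm256_set1_epi8(0);
/// assert_eq!(_mm256_movemask_epi8(b), 0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///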
2229 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_movemask_epi8)
2230 #[inline]
2231 #[target_feature(enable = "avx2")]
2232 #[cfg_attr(test, assert_instr(vpmovmskb))]
2233 #[stable(feature = "simd_x86", since = "1.27.0")]
2234 pub unsafe fn _mm256_movemask_epi8(a: __m256i) -> i32 {
2235 pmovmskb(a.as_i8x32())
2236 }
2237
2238 /// Computes the sum of absolute differences (SADs) of quadruplets of unsigned
2239 /// 8-bit integers in `a` compared to those in `b`, and stores the 16-bit
2240 /// results in dst. Eight SADs are performed for each 128-bit lane using one
2241 /// quadruplet from `b` and eight quadruplets from `a`. One quadruplet is
2242 /// selected from `b` starting at the offset specified in `imm8`. Eight
2243 /// quadruplets are formed from sequential 8-bit integers selected from `a`
2244 /// starting at the offset specified in `imm8`.
2245 ///
2246 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mpsadbw_epu8)
2247 #[inline]
2248 #[target_feature(enable = "avx2")]
2249 #[cfg_attr(test, assert_instr(vmpsadbw, imm8 = 0))]
2250 #[rustc_args_required_const(2)]
2251 #[stable(feature = "simd_x86", since = "1.27.0")]
2252 pub unsafe fn _mm256_mpsadbw_epu8(a: __m256i, b: __m256i, imm8: i32) -> __m256i {
2253 let a = a.as_u8x32();
2254 let b = b.as_u8x32();
2255 macro_rules! call {
2256 ($imm8:expr) => {
2257 mpsadbw(a, b, $imm8)
2258 };
2259 }
2260 let r = constify_imm8!(imm8, call);
2261 transmute(r)
2262 }
2263
2264 /// Multiplies the low 32-bit integers from each packed 64-bit element in
2265 /// `a` and `b`
2266 ///
2267 /// Returns the 64-bit results.
2268 ///
2269 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_epi32)
2270 #[inline]
2271 #[target_feature(enable = "avx2")]
2272 #[cfg_attr(test, assert_instr(vpmuldq))]
2273 #[stable(feature = "simd_x86", since = "1.27.0")]
2274 pub unsafe fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i {
2275 transmute(pmuldq(a.as_i32x8(), b.as_i32x8()))
2276 }
2277
2278 /// Multiplies the low unsigned 32-bit integers from each packed 64-bit
2279 /// element in `a` and `b`
2280 ///
2281 /// Returns the unsigned 64-bit results.
2282 ///
2283 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_epu32)
2284 #[inline]
2285 #[target_feature(enable = "avx2")]
2286 #[cfg_attr(test, assert_instr(vpmuludq))]
2287 #[stable(feature = "simd_x86", since = "1.27.0")]
2288 pub unsafe fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i {
2289 transmute(pmuludq(a.as_u32x8(), b.as_u32x8()))
2290 }
2291
2292 /// Multiplies the packed 16-bit integers in `a` and `b`, producing
2293 /// intermediate 32-bit integers and returning the high 16 bits of the
2294 /// intermediate integers.
2295 ///
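/// A small illustrative sketch (not part of Intel's documentation):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_set1_epi16(300);
/// let b = _mm256_set1_epi16(300);
/// // 300 * 300 = 90_000; the high 16 bits of 90_000 are 1.
/// let r = _mm256_mulhi_epi16(a, b);
///
/// let mut out = [0_i16; 16];
/// _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
/// assert_eq!(out, [1; 16]);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///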
2296 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mulhi_epi16)
2297 #[inline]
2298 #[target_feature(enable = "avx2")]
2299 #[cfg_attr(test, assert_instr(vpmulhw))]
2300 #[stable(feature = "simd_x86", since = "1.27.0")]
2301 pub unsafe fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i {
2302 transmute(pmulhw(a.as_i16x16(), b.as_i16x16()))
2303 }
2304
2305 /// Multiplies the packed unsigned 16-bit integers in `a` and `b`, producing
2306 /// intermediate 32-bit integers and returning the high 16 bits of the
2307 /// intermediate integers.
2308 ///
2309 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mulhi_epu16)
2310 #[inline]
2311 #[target_feature(enable = "avx2")]
2312 #[cfg_attr(test, assert_instr(vpmulhuw))]
2313 #[stable(feature = "simd_x86", since = "1.27.0")]
2314 pub unsafe fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i {
2315 transmute(pmulhuw(a.as_u16x16(), b.as_u16x16()))
2316 }
2317
2318 /// Multiplies the packed 16-bit integers in `a` and `b`, producing
2319 /// intermediate 32-bit integers, and returns the low 16 bits of the
2320 /// intermediate integers
2321 ///
2322 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mullo_epi16)
2323 #[inline]
2324 #[target_feature(enable = "avx2")]
2325 #[cfg_attr(test, assert_instr(vpmullw))]
2326 #[stable(feature = "simd_x86", since = "1.27.0")]
2327 pub unsafe fn _mm256_mullo_epi16(a: __m256i, b: __m256i) -> __m256i {
2328 transmute(simd_mul(a.as_i16x16(), b.as_i16x16()))
2329 }
2330
2331 /// Multiplies the packed 32-bit integers in `a` and `b`, producing
2332 /// intermediate 64-bit integers, and returns the low 32 bits of the
2333 /// intermediate integers
2334 ///
2335 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mullo_epi32)
2336 #[inline]
2337 #[target_feature(enable = "avx2")]
2338 #[cfg_attr(test, assert_instr(vpmulld))]
2339 #[stable(feature = "simd_x86", since = "1.27.0")]
2340 pub unsafe fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i {
2341 transmute(simd_mul(a.as_i32x8(), b.as_i32x8()))
2342 }
2343
2344 /// Multiplies packed 16-bit integers in `a` and `b`, producing
2345 /// intermediate signed 32-bit integers. Truncates each intermediate
2346 /// integer to the 18 most significant bits, rounds by adding 1, and
2347 /// returns bits `[16:1]`.
2348 ///
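/// A usage sketch (added for illustration, not from Intel's documentation),
/// reading the inputs as Q15 fixed-point values:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_set1_epi16(0x4000); // 0.5 in Q15
/// let b = _mm256_set1_epi16(0x2000); // 0.25 in Q15
/// // 0.5 * 0.25 = 0.125, which is 0x1000 in Q15.
/// let r = _mm256_mulhrs_epi16(a, b);
///
/// let mut out = [0_i16; 16];
/// _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
/// assert_eq!(out, [0x1000; 16]);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///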
2349 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mulhrs_epi16)
2350 #[inline]
2351 #[target_feature(enable = "avx2")]
2352 #[cfg_attr(test, assert_instr(vpmulhrsw))]
2353 #[stable(feature = "simd_x86", since = "1.27.0")]
2354 pub unsafe fn _mm256_mulhrs_epi16(a: __m256i, b: __m256i) -> __m256i {
2355 transmute(pmulhrsw(a.as_i16x16(), b.as_i16x16()))
2356 }
2357
2358 /// Computes the bitwise OR of 256 bits (representing integer data) in `a`
2359 /// and `b`
2360 ///
2361 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_or_si256)
2362 #[inline]
2363 #[target_feature(enable = "avx2")]
2364 #[cfg_attr(test, assert_instr(vorps))]
2365 #[stable(feature = "simd_x86", since = "1.27.0")]
2366 pub unsafe fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i {
2367 transmute(simd_or(a.as_i32x8(), b.as_i32x8()))
2368 }
2369
2370 /// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
2371 /// using signed saturation
2372 ///
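/// An illustrative sketch (not part of Intel's documentation) showing the
/// saturation and the per-lane ordering of the packed result:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_set1_epi16(300); // saturates to 127
/// let b = _mm256_set1_epi16(-300); // saturates to -128
/// let r = _mm256_packs_epi16(a, b);
///
/// let mut out = [0_i8; 32];
/// _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
/// // Within each 128-bit lane: eight bytes from `a`, then eight from `b`.
/// for (i, &v) in out.iter().enumerate() {
///     assert_eq!(v, if (i / 8) % 2 == 0 { 127 } else { -128 });
/// }
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///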
2373 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_packs_epi16)
2374 #[inline]
2375 #[target_feature(enable = "avx2")]
2376 #[cfg_attr(test, assert_instr(vpacksswb))]
2377 #[stable(feature = "simd_x86", since = "1.27.0")]
2378 pub unsafe fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i {
2379 transmute(packsswb(a.as_i16x16(), b.as_i16x16()))
2380 }
2381
2382 /// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
2383 /// using signed saturation
2384 ///
2385 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_packs_epi32)
2386 #[inline]
2387 #[target_feature(enable = "avx2")]
2388 #[cfg_attr(test, assert_instr(vpackssdw))]
2389 #[stable(feature = "simd_x86", since = "1.27.0")]
2390 pub unsafe fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i {
2391 transmute(packssdw(a.as_i32x8(), b.as_i32x8()))
2392 }
2393
2394 /// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
2395 /// using unsigned saturation
2396 ///
2397 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_packus_epi16)
2398 #[inline]
2399 #[target_feature(enable = "avx2")]
2400 #[cfg_attr(test, assert_instr(vpackuswb))]
2401 #[stable(feature = "simd_x86", since = "1.27.0")]
2402 pub unsafe fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i {
2403 transmute(packuswb(a.as_i16x16(), b.as_i16x16()))
2404 }
2405
2406 /// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
2407 /// using unsigned saturation
2408 ///
2409 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_packus_epi32)
2410 #[inline]
2411 #[target_feature(enable = "avx2")]
2412 #[cfg_attr(test, assert_instr(vpackusdw))]
2413 #[stable(feature = "simd_x86", since = "1.27.0")]
2414 pub unsafe fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i {
2415 transmute(packusdw(a.as_i32x8(), b.as_i32x8()))
2416 }
2417
2418 /// Permutes packed 32-bit integers from `a` according to the content of `b`.
2419 ///
2420 /// The lowest 3 bits of each integer of `b` are used as indices into the 8
2421 /// integers of `a`.
2422 ///
2423 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutevar8x32_epi32)
2424 #[inline]
2425 #[target_feature(enable = "avx2")]
2426 #[cfg_attr(test, assert_instr(vpermps))]
2427 #[stable(feature = "simd_x86", since = "1.27.0")]
2428 pub unsafe fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i {
2429 transmute(permd(a.as_u32x8(), b.as_u32x8()))
2430 }
2431
2432 /// Permutes 64-bit integers from `a` using control mask `imm8`.
2433 ///
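/// A usage sketch (added for illustration, not from Intel's documentation):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_setr_epi64x(10, 20, 30, 40);
/// // Each 2-bit field of `imm8` selects a source element; 0b00_01_10_11
/// // reverses the four 64-bit elements.
/// let r = _mm256_permute4x64_epi64(a, 0b00_01_10_11);
///
/// let mut out = [0_i64; 4];
/// _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
/// assert_eq!(out, [40, 30, 20, 10]);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///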
2434 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute4x64_epi64)
2435 #[inline]
2436 #[target_feature(enable = "avx2")]
2437 #[cfg_attr(test, assert_instr(vpermpd, imm8 = 9))]
2438 #[rustc_args_required_const(1)]
2439 #[stable(feature = "simd_x86", since = "1.27.0")]
2440 pub unsafe fn _mm256_permute4x64_epi64(a: __m256i, imm8: i32) -> __m256i {
2441 let imm8 = (imm8 & 0xFF) as u8;
2442 let zero = _mm256_setzero_si256().as_i64x4();
2443 let a = a.as_i64x4();
2444 macro_rules! permute4 {
2445 ($a:expr, $b:expr, $c:expr, $d:expr) => {
2446 simd_shuffle4(a, zero, [$a, $b, $c, $d]);
2447 };
2448 }
2449 macro_rules! permute3 {
2450 ($a:expr, $b:expr, $c:expr) => {
2451 match (imm8 >> 6) & 0b11 {
2452 0b00 => permute4!($a, $b, $c, 0),
2453 0b01 => permute4!($a, $b, $c, 1),
2454 0b10 => permute4!($a, $b, $c, 2),
2455 _ => permute4!($a, $b, $c, 3),
2456 }
2457 };
2458 }
2459 macro_rules! permute2 {
2460 ($a:expr, $b:expr) => {
2461 match (imm8 >> 4) & 0b11 {
2462 0b00 => permute3!($a, $b, 0),
2463 0b01 => permute3!($a, $b, 1),
2464 0b10 => permute3!($a, $b, 2),
2465 _ => permute3!($a, $b, 3),
2466 }
2467 };
2468 }
2469 macro_rules! permute1 {
2470 ($a:expr) => {
2471 match (imm8 >> 2) & 0b11 {
2472 0b00 => permute2!($a, 0),
2473 0b01 => permute2!($a, 1),
2474 0b10 => permute2!($a, 2),
2475 _ => permute2!($a, 3),
2476 }
2477 };
2478 }
2479 let r: i64x4 = match imm8 & 0b11 {
2480 0b00 => permute1!(0),
2481 0b01 => permute1!(1),
2482 0b10 => permute1!(2),
2483 _ => permute1!(3),
2484 };
2485 transmute(r)
2486 }
2487
2488 /// Shuffles 128 bits of integer data selected by `imm8` from `a` and `b`.
2489 ///
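/// An illustrative sketch (not part of Intel's documentation):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_setr_epi64x(1, 2, 3, 4);
/// let b = _mm256_setr_epi64x(5, 6, 7, 8);
/// // 0x20 selects the low half of `a` for the low half of the result and
/// // the low half of `b` for the high half.
/// let r = _mm256_permute2x128_si256(a, b, 0x20);
///
/// let mut out = [0_i64; 4];
/// _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
/// assert_eq!(out, [1, 2, 5, 6]);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///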
2490 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute2x128_si256)
2491 #[inline]
2492 #[target_feature(enable = "avx2")]
2493 #[cfg_attr(test, assert_instr(vperm2f128, imm8 = 9))]
2494 #[rustc_args_required_const(2)]
2495 #[stable(feature = "simd_x86", since = "1.27.0")]
2496 pub unsafe fn _mm256_permute2x128_si256(a: __m256i, b: __m256i, imm8: i32) -> __m256i {
2497 let a = a.as_i64x4();
2498 let b = b.as_i64x4();
2499 macro_rules! call {
2500 ($imm8:expr) => {
2501 vperm2i128(a, b, $imm8)
2502 };
2503 }
2504 transmute(constify_imm8!(imm8, call))
2505 }
2506
2507 /// Shuffles 64-bit floating-point elements in `a` across lanes using the
2508 /// control in `imm8`.
2509 ///
2510 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute4x64_pd)
2511 #[inline]
2512 #[target_feature(enable = "avx2")]
2513 #[cfg_attr(test, assert_instr(vpermpd, imm8 = 1))]
2514 #[rustc_args_required_const(1)]
2515 #[stable(feature = "simd_x86", since = "1.27.0")]
2516 pub unsafe fn _mm256_permute4x64_pd(a: __m256d, imm8: i32) -> __m256d {
2517 let imm8 = (imm8 & 0xFF) as u8;
2518 let undef = _mm256_undefined_pd();
2519 macro_rules! shuffle_done {
2520 ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
2521 simd_shuffle4(a, undef, [$x01, $x23, $x45, $x67])
2522 };
2523 }
2524 macro_rules! shuffle_x67 {
2525 ($x01:expr, $x23:expr, $x45:expr) => {
2526 match (imm8 >> 6) & 0b11 {
2527 0b00 => shuffle_done!($x01, $x23, $x45, 0),
2528 0b01 => shuffle_done!($x01, $x23, $x45, 1),
2529 0b10 => shuffle_done!($x01, $x23, $x45, 2),
2530 _ => shuffle_done!($x01, $x23, $x45, 3),
2531 }
2532 };
2533 }
2534 macro_rules! shuffle_x45 {
2535 ($x01:expr, $x23:expr) => {
2536 match (imm8 >> 4) & 0b11 {
2537 0b00 => shuffle_x67!($x01, $x23, 0),
2538 0b01 => shuffle_x67!($x01, $x23, 1),
2539 0b10 => shuffle_x67!($x01, $x23, 2),
2540 _ => shuffle_x67!($x01, $x23, 3),
2541 }
2542 };
2543 }
2544 macro_rules! shuffle_x23 {
2545 ($x01:expr) => {
2546 match (imm8 >> 2) & 0b11 {
2547 0b00 => shuffle_x45!($x01, 0),
2548 0b01 => shuffle_x45!($x01, 1),
2549 0b10 => shuffle_x45!($x01, 2),
2550 _ => shuffle_x45!($x01, 3),
2551 }
2552 };
2553 }
2554 match imm8 & 0b11 {
2555 0b00 => shuffle_x23!(0),
2556 0b01 => shuffle_x23!(1),
2557 0b10 => shuffle_x23!(2),
2558 _ => shuffle_x23!(3),
2559 }
2560 }
2561
2562 /// Shuffles eight 32-bit floating-point elements in `a` across lanes using
2563 /// the corresponding 32-bit integer index in `idx`.
2564 ///
2565 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutevar8x32_ps)
2566 #[inline]
2567 #[target_feature(enable = "avx2")]
2568 #[cfg_attr(test, assert_instr(vpermps))]
2569 #[stable(feature = "simd_x86", since = "1.27.0")]
2570 pub unsafe fn _mm256_permutevar8x32_ps(a: __m256, idx: __m256i) -> __m256 {
2571 permps(a, idx.as_i32x8())
2572 }
2573
2574 /// Computes the absolute differences of packed unsigned 8-bit integers in `a`
2575 /// and `b`, then horizontally sums each consecutive 8 differences to
2576 /// produce four unsigned 16-bit integers, and packs these unsigned 16-bit
2577 /// integers in the low 16 bits of each 64-bit element of the return value.
2578 ///
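/// A small usage sketch (illustrative, not from Intel's documentation):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_set1_epi8(5);
/// let b = _mm256_set1_epi8(3);
/// // Each group of eight |5 - 3| differences sums to 16.
/// let r = _mm256_sad_epu8(a, b);
///
/// let mut out = [0_i64; 4];
/// _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
/// assert_eq!(out, [16; 4]);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///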
2579 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sad_epu8)
2580 #[inline]
2581 #[target_feature(enable = "avx2")]
2582 #[cfg_attr(test, assert_instr(vpsadbw))]
2583 #[stable(feature = "simd_x86", since = "1.27.0")]
2584 pub unsafe fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i {
2585 transmute(psadbw(a.as_u8x32(), b.as_u8x32()))
2586 }
2587
2588 /// Shuffles bytes from `a` according to the content of `b`.
2589 ///
2590 /// The lowest 4 bits of each byte of `b` are used as indices into the 32 bytes
2591 /// of `a`.
2592 ///
2593 /// In addition, if the most significant bit of a byte of `b` is set, the
2594 /// respective destination byte is set to 0.
2595 ///
2596 /// The low and high halves of the vectors are shuffled separately.
2597 ///
2598 /// Picturing `a` and `b` as `[u8; 32]`, `_mm256_shuffle_epi8` is logically
2599 /// equivalent to:
2600 ///
2601 /// ```
2602 /// fn mm256_shuffle_epi8(a: [u8; 32], b: [u8; 32]) -> [u8; 32] {
2603 /// let mut r = [0; 32];
2604 /// for i in 0..16 {
2605 /// // if the most significant bit of b is set,
2606 /// // then the destination byte is set to 0.
2607 /// if b[i] & 0x80 == 0u8 {
2608 /// r[i] = a[(b[i] % 16) as usize];
2609 /// }
2610 /// if b[i + 16] & 0x80 == 0u8 {
2611 /// r[i + 16] = a[(b[i + 16] % 16 + 16) as usize];
2612 /// }
2613 /// }
2614 /// r
2615 /// }
2616 /// ```
2617 ///
2618 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_epi8)
2619 #[inline]
2620 #[target_feature(enable = "avx2")]
2621 #[cfg_attr(test, assert_instr(vpshufb))]
2622 #[stable(feature = "simd_x86", since = "1.27.0")]
2623 pub unsafe fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i {
2624 transmute(pshufb(a.as_u8x32(), b.as_u8x32()))
2625 }
2626
2627 /// Shuffles 32-bit integers in 128-bit lanes of `a` using the control in
2628 /// `imm8`.
2629 ///
2630 /// ```rust
2631 /// #[cfg(target_arch = "x86")]
2632 /// use std::arch::x86::*;
2633 /// #[cfg(target_arch = "x86_64")]
2634 /// use std::arch::x86_64::*;
2635 ///
2636 /// # fn main() {
2637 /// # if is_x86_feature_detected!("avx2") {
2638 /// # #[target_feature(enable = "avx2")]
2639 /// # unsafe fn worker() {
2640 /// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
2641 ///
2642 /// let c1 = _mm256_shuffle_epi32(a, 0b00_11_10_01);
2643 /// let c2 = _mm256_shuffle_epi32(a, 0b01_00_10_11);
2644 ///
2645 /// let expected1 = _mm256_setr_epi32(1, 2, 3, 0, 5, 6, 7, 4);
2646 /// let expected2 = _mm256_setr_epi32(3, 2, 0, 1, 7, 6, 4, 5);
2647 ///
2648 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c1, expected1)), !0);
2649 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c2, expected2)), !0);
2650 /// # }
2651 /// # unsafe { worker(); }
2652 /// # }
2653 /// # }
2654 /// ```
2655 ///
2656 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_epi32)
2657 #[inline]
2658 #[target_feature(enable = "avx2")]
2659 #[cfg_attr(test, assert_instr(vpermilps, imm8 = 9))]
2660 #[rustc_args_required_const(1)]
2661 #[stable(feature = "simd_x86", since = "1.27.0")]
2662 pub unsafe fn _mm256_shuffle_epi32(a: __m256i, imm8: i32) -> __m256i {
2663 // simd_shuffleX requires that its selector parameter be made up of
2664 // constant values, but we can't enforce that here. In spirit, we need
2665 // to write a `match` on all possible values of a byte, and for each value,
2666 // hard-code the correct `simd_shuffleX` call using only constants. We
2667 // then hope for LLVM to do the rest.
2668 //
2669 // Of course, that's... awful. So we try to use macros to do it for us.
2670 let imm8 = (imm8 & 0xFF) as u8;
2671
2672 let a = a.as_i32x8();
2673 macro_rules! shuffle_done {
2674 ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
2675 simd_shuffle8(
2676 a,
2677 a,
2678 [
2679 $x01,
2680 $x23,
2681 $x45,
2682 $x67,
2683 4 + $x01,
2684 4 + $x23,
2685 4 + $x45,
2686 4 + $x67,
2687 ],
2688 )
2689 };
2690 }
2691 macro_rules! shuffle_x67 {
2692 ($x01:expr, $x23:expr, $x45:expr) => {
2693 match (imm8 >> 6) & 0b11 {
2694 0b00 => shuffle_done!($x01, $x23, $x45, 0),
2695 0b01 => shuffle_done!($x01, $x23, $x45, 1),
2696 0b10 => shuffle_done!($x01, $x23, $x45, 2),
2697 _ => shuffle_done!($x01, $x23, $x45, 3),
2698 }
2699 };
2700 }
2701 macro_rules! shuffle_x45 {
2702 ($x01:expr, $x23:expr) => {
2703 match (imm8 >> 4) & 0b11 {
2704 0b00 => shuffle_x67!($x01, $x23, 0),
2705 0b01 => shuffle_x67!($x01, $x23, 1),
2706 0b10 => shuffle_x67!($x01, $x23, 2),
2707 _ => shuffle_x67!($x01, $x23, 3),
2708 }
2709 };
2710 }
2711 macro_rules! shuffle_x23 {
2712 ($x01:expr) => {
2713 match (imm8 >> 2) & 0b11 {
2714 0b00 => shuffle_x45!($x01, 0),
2715 0b01 => shuffle_x45!($x01, 1),
2716 0b10 => shuffle_x45!($x01, 2),
2717 _ => shuffle_x45!($x01, 3),
2718 }
2719 };
2720 }
2721 let r: i32x8 = match imm8 & 0b11 {
2722 0b00 => shuffle_x23!(0),
2723 0b01 => shuffle_x23!(1),
2724 0b10 => shuffle_x23!(2),
2725 _ => shuffle_x23!(3),
2726 };
2727 transmute(r)
2728 }
2729
2730 /// Shuffles 16-bit integers in the high 64 bits of 128-bit lanes of `a` using
2731 /// the control in `imm8`. The low 64 bits of 128-bit lanes of `a` are copied
2732 /// to the output.
2733 ///
2734 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shufflehi_epi16)
2735 #[inline]
2736 #[target_feature(enable = "avx2")]
2737 #[cfg_attr(test, assert_instr(vpshufhw, imm8 = 9))]
2738 #[rustc_args_required_const(1)]
2739 #[stable(feature = "simd_x86", since = "1.27.0")]
2740 pub unsafe fn _mm256_shufflehi_epi16(a: __m256i, imm8: i32) -> __m256i {
2741 let imm8 = (imm8 & 0xFF) as u8;
2742 let a = a.as_i16x16();
2743 macro_rules! shuffle_done {
2744 ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
2745 #[rustfmt::skip]
2746 simd_shuffle16(a, a, [
2747 0, 1, 2, 3, 4+$x01, 4+$x23, 4+$x45, 4+$x67,
2748 8, 9, 10, 11, 12+$x01, 12+$x23, 12+$x45, 12+$x67
2749 ]);
2750 };
2751 }
2752 macro_rules! shuffle_x67 {
2753 ($x01:expr, $x23:expr, $x45:expr) => {
2754 match (imm8 >> 6) & 0b11 {
2755 0b00 => shuffle_done!($x01, $x23, $x45, 0),
2756 0b01 => shuffle_done!($x01, $x23, $x45, 1),
2757 0b10 => shuffle_done!($x01, $x23, $x45, 2),
2758 _ => shuffle_done!($x01, $x23, $x45, 3),
2759 }
2760 };
2761 }
2762 macro_rules! shuffle_x45 {
2763 ($x01:expr, $x23:expr) => {
2764 match (imm8 >> 4) & 0b11 {
2765 0b00 => shuffle_x67!($x01, $x23, 0),
2766 0b01 => shuffle_x67!($x01, $x23, 1),
2767 0b10 => shuffle_x67!($x01, $x23, 2),
2768 _ => shuffle_x67!($x01, $x23, 3),
2769 }
2770 };
2771 }
2772 macro_rules! shuffle_x23 {
2773 ($x01:expr) => {
2774 match (imm8 >> 2) & 0b11 {
2775 0b00 => shuffle_x45!($x01, 0),
2776 0b01 => shuffle_x45!($x01, 1),
2777 0b10 => shuffle_x45!($x01, 2),
2778 _ => shuffle_x45!($x01, 3),
2779 }
2780 };
2781 }
2782 let r: i16x16 = match imm8 & 0b11 {
2783 0b00 => shuffle_x23!(0),
2784 0b01 => shuffle_x23!(1),
2785 0b10 => shuffle_x23!(2),
2786 _ => shuffle_x23!(3),
2787 };
2788 transmute(r)
2789 }
2790
2791 /// Shuffles 16-bit integers in the low 64 bits of 128-bit lanes of `a` using
2792 /// the control in `imm8`. The high 64 bits of 128-bit lanes of `a` are copied
2793 /// to the output.
2794 ///
2795 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shufflelo_epi16)
2796 #[inline]
2797 #[target_feature(enable = "avx2")]
2798 #[cfg_attr(test, assert_instr(vpshuflw, imm8 = 9))]
2799 #[rustc_args_required_const(1)]
2800 #[stable(feature = "simd_x86", since = "1.27.0")]
2801 pub unsafe fn _mm256_shufflelo_epi16(a: __m256i, imm8: i32) -> __m256i {
2802 let imm8 = (imm8 & 0xFF) as u8;
2803 let a = a.as_i16x16();
2804 macro_rules! shuffle_done {
2805 ($x01: expr, $x23: expr, $x45: expr, $x67: expr) => {
2806 #[rustfmt::skip]
2807 simd_shuffle16(a, a, [
2808 0+$x01, 0+$x23, 0+$x45, 0+$x67, 4, 5, 6, 7,
2809 8+$x01, 8+$x23, 8+$x45, 8+$x67, 12, 13, 14, 15,
2810 ]);
2811 };
2812 }
2813 macro_rules! shuffle_x67 {
2814 ($x01:expr, $x23:expr, $x45:expr) => {
2815 match (imm8 >> 6) & 0b11 {
2816 0b00 => shuffle_done!($x01, $x23, $x45, 0),
2817 0b01 => shuffle_done!($x01, $x23, $x45, 1),
2818 0b10 => shuffle_done!($x01, $x23, $x45, 2),
2819 _ => shuffle_done!($x01, $x23, $x45, 3),
2820 }
2821 };
2822 }
2823 macro_rules! shuffle_x45 {
2824 ($x01:expr, $x23:expr) => {
2825 match (imm8 >> 4) & 0b11 {
2826 0b00 => shuffle_x67!($x01, $x23, 0),
2827 0b01 => shuffle_x67!($x01, $x23, 1),
2828 0b10 => shuffle_x67!($x01, $x23, 2),
2829 _ => shuffle_x67!($x01, $x23, 3),
2830 }
2831 };
2832 }
2833 macro_rules! shuffle_x23 {
2834 ($x01:expr) => {
2835 match (imm8 >> 2) & 0b11 {
2836 0b00 => shuffle_x45!($x01, 0),
2837 0b01 => shuffle_x45!($x01, 1),
2838 0b10 => shuffle_x45!($x01, 2),
2839 _ => shuffle_x45!($x01, 3),
2840 }
2841 };
2842 }
2843 let r: i16x16 = match imm8 & 0b11 {
2844 0b00 => shuffle_x23!(0),
2845 0b01 => shuffle_x23!(1),
2846 0b10 => shuffle_x23!(2),
2847 _ => shuffle_x23!(3),
2848 };
2849 transmute(r)
2850 }
2851
2852 /// Negates packed 16-bit integers in `a` when the corresponding signed
2853 /// 16-bit integer in `b` is negative, and returns the results.
2854 /// Results are zeroed out when the corresponding element in `b` is zero.
2855 ///
2856 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sign_epi16)
2857 #[inline]
2858 #[target_feature(enable = "avx2")]
2859 #[cfg_attr(test, assert_instr(vpsignw))]
2860 #[stable(feature = "simd_x86", since = "1.27.0")]
2861 pub unsafe fn _mm256_sign_epi16(a: __m256i, b: __m256i) -> __m256i {
2862 transmute(psignw(a.as_i16x16(), b.as_i16x16()))
2863 }
2864
2865 /// Negates packed 32-bit integers in `a` when the corresponding signed
2866 /// 32-bit integer in `b` is negative, and returns the results.
2867 /// Results are zeroed out when the corresponding element in `b` is zero.
2868 ///
2869 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sign_epi32)
2870 #[inline]
2871 #[target_feature(enable = "avx2")]
2872 #[cfg_attr(test, assert_instr(vpsignd))]
2873 #[stable(feature = "simd_x86", since = "1.27.0")]
2874 pub unsafe fn _mm256_sign_epi32(a: __m256i, b: __m256i) -> __m256i {
2875 transmute(psignd(a.as_i32x8(), b.as_i32x8()))
2876 }
2877
2878 /// Negates packed 8-bit integers in `a` when the corresponding signed
2879 /// 8-bit integer in `b` is negative, and returns the results.
2880 /// Results are zeroed out when the corresponding element in `b` is zero.
2881 ///
2882 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sign_epi8)
2883 #[inline]
2884 #[target_feature(enable = "avx2")]
2885 #[cfg_attr(test, assert_instr(vpsignb))]
2886 #[stable(feature = "simd_x86", since = "1.27.0")]
2887 pub unsafe fn _mm256_sign_epi8(a: __m256i, b: __m256i) -> __m256i {
2888 transmute(psignb(a.as_i8x32(), b.as_i8x32()))
2889 }
2890
2891 /// Shifts packed 16-bit integers in `a` left by `count` while
2892 /// shifting in zeros, and returns the result
2893 ///
2894 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sll_epi16)
2895 #[inline]
2896 #[target_feature(enable = "avx2")]
2897 #[cfg_attr(test, assert_instr(vpsllw))]
2898 #[stable(feature = "simd_x86", since = "1.27.0")]
2899 pub unsafe fn _mm256_sll_epi16(a: __m256i, count: __m128i) -> __m256i {
2900 transmute(psllw(a.as_i16x16(), count.as_i16x8()))
2901 }
2902
2903 /// Shifts packed 32-bit integers in `a` left by `count` while
2904 /// shifting in zeros, and returns the result
2905 ///
2906 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sll_epi32)
2907 #[inline]
2908 #[target_feature(enable = "avx2")]
2909 #[cfg_attr(test, assert_instr(vpslld))]
2910 #[stable(feature = "simd_x86", since = "1.27.0")]
2911 pub unsafe fn _mm256_sll_epi32(a: __m256i, count: __m128i) -> __m256i {
2912 transmute(pslld(a.as_i32x8(), count.as_i32x4()))
2913 }
2914
2915 /// Shifts packed 64-bit integers in `a` left by `count` while
2916 /// shifting in zeros, and returns the result
2917 ///
2918 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sll_epi64)
2919 #[inline]
2920 #[target_feature(enable = "avx2")]
2921 #[cfg_attr(test, assert_instr(vpsllq))]
2922 #[stable(feature = "simd_x86", since = "1.27.0")]
2923 pub unsafe fn _mm256_sll_epi64(a: __m256i, count: __m128i) -> __m256i {
2924 transmute(psllq(a.as_i64x4(), count.as_i64x2()))
2925 }
2926
2927 /// Shifts packed 16-bit integers in `a` left by `imm8` while
2928 /// shifting in zeros, and returns the results.
2929 ///
2930 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_slli_epi16)
2931 #[inline]
2932 #[target_feature(enable = "avx2")]
2933 #[cfg_attr(test, assert_instr(vpsllw))]
2934 #[stable(feature = "simd_x86", since = "1.27.0")]
2935 pub unsafe fn _mm256_slli_epi16(a: __m256i, imm8: i32) -> __m256i {
2936 transmute(pslliw(a.as_i16x16(), imm8))
2937 }
2938
2939 /// Shifts packed 32-bit integers in `a` left by `imm8` while
2940 /// shifting in zeros, and returns the results.
2941 ///
2942 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_slli_epi32)
2943 #[inline]
2944 #[target_feature(enable = "avx2")]
2945 #[cfg_attr(test, assert_instr(vpslld))]
2946 #[stable(feature = "simd_x86", since = "1.27.0")]
2947 pub unsafe fn _mm256_slli_epi32(a: __m256i, imm8: i32) -> __m256i {
2948 transmute(psllid(a.as_i32x8(), imm8))
2949 }
2950
2951 /// Shifts packed 64-bit integers in `a` left by `imm8` while
2952 /// shifting in zeros, and returns the results.
2953 ///
2954 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_slli_epi64)
2955 #[inline]
2956 #[target_feature(enable = "avx2")]
2957 #[cfg_attr(test, assert_instr(vpsllq))]
2958 #[stable(feature = "simd_x86", since = "1.27.0")]
2959 pub unsafe fn _mm256_slli_epi64(a: __m256i, imm8: i32) -> __m256i {
2960 transmute(pslliq(a.as_i64x4(), imm8))
2961 }
2962
2963 /// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros.
2964 ///
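/// A minimal usage sketch (illustrative, not part of the upstream docs),
/// assuming AVX2 support has been detected at runtime; each 128-bit lane is
/// shifted independently:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// // shifting each 128-bit lane left by 8 bytes moves the low 64-bit
/// // element of every lane into the high 64-bit element
/// let a = _mm256_set1_epi64x(1);
/// let c = _mm256_slli_si256(a, 8);
/// let expected = _mm256_setr_epi64x(0, 1, 0, 1);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///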
2965 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_slli_si256)
2966 #[inline]
2967 #[target_feature(enable = "avx2")]
2968 #[cfg_attr(test, assert_instr(vpslldq, imm8 = 3))]
2969 #[rustc_args_required_const(1)]
2970 #[stable(feature = "simd_x86", since = "1.27.0")]
2971 pub unsafe fn _mm256_slli_si256(a: __m256i, imm8: i32) -> __m256i {
2972 let a = a.as_i64x4();
2973 macro_rules! call {
2974 ($imm8:expr) => {
2975 vpslldq(a, $imm8)
2976 };
2977 }
2978 transmute(constify_imm8!(imm8 * 8, call))
2979 }
2980
2981 /// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros.
2982 ///
2983 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bslli_epi128)
2984 #[inline]
2985 #[target_feature(enable = "avx2")]
2986 #[cfg_attr(test, assert_instr(vpslldq, imm8 = 3))]
2987 #[rustc_args_required_const(1)]
2988 #[stable(feature = "simd_x86", since = "1.27.0")]
2989 pub unsafe fn _mm256_bslli_epi128(a: __m256i, imm8: i32) -> __m256i {
2990 let a = a.as_i64x4();
2991 macro_rules! call {
2992 ($imm8:expr) => {
2993 vpslldq(a, $imm8)
2994 };
2995 }
2996 transmute(constify_imm8!(imm8 * 8, call))
2997 }
2998
2999 /// Shifts packed 32-bit integers in `a` left by the amount
3000 /// specified by the corresponding element in `count` while
3001 /// shifting in zeros, and returns the result.
3002 ///
3003 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sllv_epi32)
3004 #[inline]
3005 #[target_feature(enable = "avx2")]
3006 #[cfg_attr(test, assert_instr(vpsllvd))]
3007 #[stable(feature = "simd_x86", since = "1.27.0")]
3008 pub unsafe fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i {
3009 transmute(psllvd(a.as_i32x4(), count.as_i32x4()))
3010 }
3011
3012 /// Shifts packed 32-bit integers in `a` left by the amount
3013 /// specified by the corresponding element in `count` while
3014 /// shifting in zeros, and returns the result.
3015 ///
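/// A minimal usage sketch (illustrative, not part of the upstream docs),
/// assuming AVX2 support has been detected at runtime; unlike `_mm256_sll_epi32`,
/// each element gets its own shift amount:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// // each 32-bit element is shifted by the matching element of `count`
/// let a = _mm256_set1_epi32(1);
/// let count = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
/// let c = _mm256_sllv_epi32(a, count);
/// let expected = _mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 128);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///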
3016 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sllv_epi32)
3017 #[inline]
3018 #[target_feature(enable = "avx2")]
3019 #[cfg_attr(test, assert_instr(vpsllvd))]
3020 #[stable(feature = "simd_x86", since = "1.27.0")]
3021 pub unsafe fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i {
3022 transmute(psllvd256(a.as_i32x8(), count.as_i32x8()))
3023 }
3024
3025 /// Shifts packed 64-bit integers in `a` left by the amount
3026 /// specified by the corresponding element in `count` while
3027 /// shifting in zeros, and returns the result.
3028 ///
3029 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sllv_epi64)
3030 #[inline]
3031 #[target_feature(enable = "avx2")]
3032 #[cfg_attr(test, assert_instr(vpsllvq))]
3033 #[stable(feature = "simd_x86", since = "1.27.0")]
3034 pub unsafe fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i {
3035 transmute(psllvq(a.as_i64x2(), count.as_i64x2()))
3036 }
3037
3038 /// Shifts packed 64-bit integers in `a` left by the amount
3039 /// specified by the corresponding element in `count` while
3040 /// shifting in zeros, and returns the result.
3041 ///
3042 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sllv_epi64)
3043 #[inline]
3044 #[target_feature(enable = "avx2")]
3045 #[cfg_attr(test, assert_instr(vpsllvq))]
3046 #[stable(feature = "simd_x86", since = "1.27.0")]
3047 pub unsafe fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i {
3048 transmute(psllvq256(a.as_i64x4(), count.as_i64x4()))
3049 }
3050
3051 /// Shifts packed 16-bit integers in `a` right by `count` while
3052 /// shifting in sign bits.
3053 ///
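/// A minimal usage sketch (illustrative, not part of the upstream docs),
/// assuming AVX2 support has been detected at runtime:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// // arithmetic shift: sign bits are replicated into the vacated positions
/// let a = _mm256_set1_epi16(-32);
/// let count = _mm_cvtsi32_si128(2);
/// let c = _mm256_sra_epi16(a, count);
/// let expected = _mm256_set1_epi16(-8);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///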
3054 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sra_epi16)
3055 #[inline]
3056 #[target_feature(enable = "avx2")]
3057 #[cfg_attr(test, assert_instr(vpsraw))]
3058 #[stable(feature = "simd_x86", since = "1.27.0")]
3059 pub unsafe fn _mm256_sra_epi16(a: __m256i, count: __m128i) -> __m256i {
3060 transmute(psraw(a.as_i16x16(), count.as_i16x8()))
3061 }
3062
3063 /// Shifts packed 32-bit integers in `a` right by `count` while
3064 /// shifting in sign bits.
3065 ///
3066 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sra_epi32)
3067 #[inline]
3068 #[target_feature(enable = "avx2")]
3069 #[cfg_attr(test, assert_instr(vpsrad))]
3070 #[stable(feature = "simd_x86", since = "1.27.0")]
3071 pub unsafe fn _mm256_sra_epi32(a: __m256i, count: __m128i) -> __m256i {
3072 transmute(psrad(a.as_i32x8(), count.as_i32x4()))
3073 }
3074
3075 /// Shifts packed 16-bit integers in `a` right by `imm8` while
3076 /// shifting in sign bits.
3077 ///
3078 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srai_epi16)
3079 #[inline]
3080 #[target_feature(enable = "avx2")]
3081 #[cfg_attr(test, assert_instr(vpsraw))]
3082 #[stable(feature = "simd_x86", since = "1.27.0")]
3083 pub unsafe fn _mm256_srai_epi16(a: __m256i, imm8: i32) -> __m256i {
3084 transmute(psraiw(a.as_i16x16(), imm8))
3085 }
3086
3087 /// Shifts packed 32-bit integers in `a` right by `imm8` while
3088 /// shifting in sign bits.
3089 ///
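/// A minimal usage sketch (illustrative, not part of the upstream docs),
/// assuming AVX2 support has been detected at runtime:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// // arithmetic shift by an immediate: the sign is preserved
/// let a = _mm256_set1_epi32(-64);
/// let c = _mm256_srai_epi32(a, 3);
/// let expected = _mm256_set1_epi32(-8);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///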
3090 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srai_epi32)
3091 #[inline]
3092 #[target_feature(enable = "avx2")]
3093 #[cfg_attr(test, assert_instr(vpsrad))]
3094 #[stable(feature = "simd_x86", since = "1.27.0")]
3095 pub unsafe fn _mm256_srai_epi32(a: __m256i, imm8: i32) -> __m256i {
3096 transmute(psraid(a.as_i32x8(), imm8))
3097 }
3098
3099 /// Shifts packed 32-bit integers in `a` right by the amount specified by the
3100 /// corresponding element in `count` while shifting in sign bits.
3101 ///
3102 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srav_epi32)
3103 #[inline]
3104 #[target_feature(enable = "avx2")]
3105 #[cfg_attr(test, assert_instr(vpsravd))]
3106 #[stable(feature = "simd_x86", since = "1.27.0")]
3107 pub unsafe fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i {
3108 transmute(psravd(a.as_i32x4(), count.as_i32x4()))
3109 }
3110
3111 /// Shifts packed 32-bit integers in `a` right by the amount specified by the
3112 /// corresponding element in `count` while shifting in sign bits.
3113 ///
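/// A minimal usage sketch (illustrative, not part of the upstream docs),
/// assuming AVX2 support has been detected at runtime:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// // each 32-bit element is shifted right arithmetically by the matching
/// // element of `count`
/// let a = _mm256_set1_epi32(-16);
/// let count = _mm256_setr_epi32(0, 1, 2, 3, 4, 0, 1, 2);
/// let c = _mm256_srav_epi32(a, count);
/// let expected = _mm256_setr_epi32(-16, -8, -4, -2, -1, -16, -8, -4);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///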
3114 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srav_epi32)
3115 #[inline]
3116 #[target_feature(enable = "avx2")]
3117 #[cfg_attr(test, assert_instr(vpsravd))]
3118 #[stable(feature = "simd_x86", since = "1.27.0")]
3119 pub unsafe fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i {
3120 transmute(psravd256(a.as_i32x8(), count.as_i32x8()))
3121 }
3122
3123 /// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros.
3124 ///
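/// A minimal usage sketch (illustrative, not part of the upstream docs),
/// assuming AVX2 support has been detected at runtime; each 128-bit lane is
/// shifted independently:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// // shifting each 128-bit lane right by 8 bytes moves the high 64-bit
/// // element of every lane into the low 64-bit element
/// let a = _mm256_setr_epi64x(1, 2, 3, 4);
/// let c = _mm256_srli_si256(a, 8);
/// let expected = _mm256_setr_epi64x(2, 0, 4, 0);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///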
3125 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srli_si256)
3126 #[inline]
3127 #[target_feature(enable = "avx2")]
3128 #[cfg_attr(test, assert_instr(vpsrldq, imm8 = 3))]
3129 #[rustc_args_required_const(1)]
3130 #[stable(feature = "simd_x86", since = "1.27.0")]
3131 pub unsafe fn _mm256_srli_si256(a: __m256i, imm8: i32) -> __m256i {
3132 let a = a.as_i64x4();
3133 macro_rules! call {
3134 ($imm8:expr) => {
3135 vpsrldq(a, $imm8)
3136 };
3137 }
3138 transmute(constify_imm8!(imm8 * 8, call))
3139 }
3140
3141 /// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros.
3142 ///
3143 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bsrli_epi128)
3144 #[inline]
3145 #[target_feature(enable = "avx2")]
3146 #[cfg_attr(test, assert_instr(vpsrldq, imm8 = 3))]
3147 #[rustc_args_required_const(1)]
3148 #[stable(feature = "simd_x86", since = "1.27.0")]
3149 pub unsafe fn _mm256_bsrli_epi128(a: __m256i, imm8: i32) -> __m256i {
3150 let a = a.as_i64x4();
3151 macro_rules! call {
3152 ($imm8:expr) => {
3153 vpsrldq(a, $imm8)
3154 };
3155 }
3156 transmute(constify_imm8!(imm8 * 8, call))
3157 }
3158
3159 /// Shifts packed 16-bit integers in `a` right by `count` while shifting in
3160 /// zeros.
3161 ///
3162 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srl_epi16)
3163 #[inline]
3164 #[target_feature(enable = "avx2")]
3165 #[cfg_attr(test, assert_instr(vpsrlw))]
3166 #[stable(feature = "simd_x86", since = "1.27.0")]
3167 pub unsafe fn _mm256_srl_epi16(a: __m256i, count: __m128i) -> __m256i {
3168 transmute(psrlw(a.as_i16x16(), count.as_i16x8()))
3169 }
3170
3171 /// Shifts packed 32-bit integers in `a` right by `count` while shifting in
3172 /// zeros.
3173 ///
3174 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srl_epi32)
3175 #[inline]
3176 #[target_feature(enable = "avx2")]
3177 #[cfg_attr(test, assert_instr(vpsrld))]
3178 #[stable(feature = "simd_x86", since = "1.27.0")]
3179 pub unsafe fn _mm256_srl_epi32(a: __m256i, count: __m128i) -> __m256i {
3180 transmute(psrld(a.as_i32x8(), count.as_i32x4()))
3181 }
3182
3183 /// Shifts packed 64-bit integers in `a` right by `count` while shifting in
3184 /// zeros.
3185 ///
3186 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srl_epi64)
3187 #[inline]
3188 #[target_feature(enable = "avx2")]
3189 #[cfg_attr(test, assert_instr(vpsrlq))]
3190 #[stable(feature = "simd_x86", since = "1.27.0")]
3191 pub unsafe fn _mm256_srl_epi64(a: __m256i, count: __m128i) -> __m256i {
3192 transmute(psrlq(a.as_i64x4(), count.as_i64x2()))
3193 }
3194
3195 /// Shifts packed 16-bit integers in `a` right by `imm8` while shifting in
3196 /// zeros.
3197 ///
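/// A minimal usage sketch (illustrative, not part of the upstream docs),
/// assuming AVX2 support has been detected at runtime:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// // logical shift: zeros are shifted in from the left
/// let a = _mm256_set1_epi16(64);
/// let c = _mm256_srli_epi16(a, 4);
/// let expected = _mm256_set1_epi16(4);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///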
3198 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srli_epi16)
3199 #[inline]
3200 #[target_feature(enable = "avx2")]
3201 #[cfg_attr(test, assert_instr(vpsrlw))]
3202 #[stable(feature = "simd_x86", since = "1.27.0")]
3203 pub unsafe fn _mm256_srli_epi16(a: __m256i, imm8: i32) -> __m256i {
3204 transmute(psrliw(a.as_i16x16(), imm8))
3205 }
3206
3207 /// Shifts packed 32-bit integers in `a` right by `imm8` while shifting in
3208 /// zeros.
3209 ///
3210 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srli_epi32)
3211 #[inline]
3212 #[target_feature(enable = "avx2")]
3213 #[cfg_attr(test, assert_instr(vpsrld))]
3214 #[stable(feature = "simd_x86", since = "1.27.0")]
3215 pub unsafe fn _mm256_srli_epi32(a: __m256i, imm8: i32) -> __m256i {
3216 transmute(psrlid(a.as_i32x8(), imm8))
3217 }
3218
3219 /// Shifts packed 64-bit integers in `a` right by `imm8` while shifting in
3220 /// zeros.
3221 ///
3222 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srli_epi64)
3223 #[inline]
3224 #[target_feature(enable = "avx2")]
3225 #[cfg_attr(test, assert_instr(vpsrlq))]
3226 #[stable(feature = "simd_x86", since = "1.27.0")]
3227 pub unsafe fn _mm256_srli_epi64(a: __m256i, imm8: i32) -> __m256i {
3228 transmute(psrliq(a.as_i64x4(), imm8))
3229 }
3230
3231 /// Shifts packed 32-bit integers in `a` right by the amount specified by
3232 /// the corresponding element in `count` while shifting in zeros.
3233 ///
3234 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srlv_epi32)
3235 #[inline]
3236 #[target_feature(enable = "avx2")]
3237 #[cfg_attr(test, assert_instr(vpsrlvd))]
3238 #[stable(feature = "simd_x86", since = "1.27.0")]
3239 pub unsafe fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i {
3240 transmute(psrlvd(a.as_i32x4(), count.as_i32x4()))
3241 }
3242
3243 /// Shifts packed 32-bit integers in `a` right by the amount specified by
3244 /// the corresponding element in `count` while shifting in zeros.
3245 ///
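/// A minimal usage sketch (illustrative, not part of the upstream docs),
/// assuming AVX2 support has been detected at runtime:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// // each 32-bit element is shifted right logically by the matching
/// // element of `count`
/// let a = _mm256_set1_epi32(128);
/// let count = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
/// let c = _mm256_srlv_epi32(a, count);
/// let expected = _mm256_setr_epi32(128, 64, 32, 16, 8, 4, 2, 1);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///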
3246 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srlv_epi32)
3247 #[inline]
3248 #[target_feature(enable = "avx2")]
3249 #[cfg_attr(test, assert_instr(vpsrlvd))]
3250 #[stable(feature = "simd_x86", since = "1.27.0")]
3251 pub unsafe fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i {
3252 transmute(psrlvd256(a.as_i32x8(), count.as_i32x8()))
3253 }
3254
3255 /// Shifts packed 64-bit integers in `a` right by the amount specified by
3256 /// the corresponding element in `count` while shifting in zeros.
3257 ///
3258 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srlv_epi64)
3259 #[inline]
3260 #[target_feature(enable = "avx2")]
3261 #[cfg_attr(test, assert_instr(vpsrlvq))]
3262 #[stable(feature = "simd_x86", since = "1.27.0")]
3263 pub unsafe fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i {
3264 transmute(psrlvq(a.as_i64x2(), count.as_i64x2()))
3265 }
3266
3267 /// Shifts packed 64-bit integers in `a` right by the amount specified by
3268 /// the corresponding element in `count` while shifting in zeros.
3269 ///
3270 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srlv_epi64)
3271 #[inline]
3272 #[target_feature(enable = "avx2")]
3273 #[cfg_attr(test, assert_instr(vpsrlvq))]
3274 #[stable(feature = "simd_x86", since = "1.27.0")]
3275 pub unsafe fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i {
3276 transmute(psrlvq256(a.as_i64x4(), count.as_i64x4()))
3277 }
3278
3279 // TODO _mm256_stream_load_si256 (__m256i const* mem_addr)
3280
3281 /// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`.
3282 ///
3283 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_epi16)
3284 #[inline]
3285 #[target_feature(enable = "avx2")]
3286 #[cfg_attr(test, assert_instr(vpsubw))]
3287 #[stable(feature = "simd_x86", since = "1.27.0")]
3288 pub unsafe fn _mm256_sub_epi16(a: __m256i, b: __m256i) -> __m256i {
3289 transmute(simd_sub(a.as_i16x16(), b.as_i16x16()))
3290 }
3291
3292 /// Subtracts packed 32-bit integers in `b` from packed 32-bit integers in `a`.
3293 ///
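/// A minimal usage sketch (illustrative, not part of the upstream docs),
/// assuming AVX2 support has been detected at runtime:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// // element-wise wrapping subtraction, `a - b`
/// let a = _mm256_setr_epi32(10, 20, 30, 40, 50, 60, 70, 80);
/// let b = _mm256_set1_epi32(5);
/// let c = _mm256_sub_epi32(a, b);
/// let expected = _mm256_setr_epi32(5, 15, 25, 35, 45, 55, 65, 75);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///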
3294 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_epi32)
3295 #[inline]
3296 #[target_feature(enable = "avx2")]
3297 #[cfg_attr(test, assert_instr(vpsubd))]
3298 #[stable(feature = "simd_x86", since = "1.27.0")]
3299 pub unsafe fn _mm256_sub_epi32(a: __m256i, b: __m256i) -> __m256i {
3300 transmute(simd_sub(a.as_i32x8(), b.as_i32x8()))
3301 }
3302
3303 /// Subtracts packed 64-bit integers in `b` from packed 64-bit integers in `a`.
3304 ///
3305 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_epi64)
3306 #[inline]
3307 #[target_feature(enable = "avx2")]
3308 #[cfg_attr(test, assert_instr(vpsubq))]
3309 #[stable(feature = "simd_x86", since = "1.27.0")]
3310 pub unsafe fn _mm256_sub_epi64(a: __m256i, b: __m256i) -> __m256i {
3311 transmute(simd_sub(a.as_i64x4(), b.as_i64x4()))
3312 }
3313
3314 /// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`.
3315 ///
3316 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_epi8)
3317 #[inline]
3318 #[target_feature(enable = "avx2")]
3319 #[cfg_attr(test, assert_instr(vpsubb))]
3320 #[stable(feature = "simd_x86", since = "1.27.0")]
3321 pub unsafe fn _mm256_sub_epi8(a: __m256i, b: __m256i) -> __m256i {
3322 transmute(simd_sub(a.as_i8x32(), b.as_i8x32()))
3323 }
3324
3325 /// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in
3326 /// `a` using saturation.
3327 ///
3328 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_subs_epi16)
3329 #[inline]
3330 #[target_feature(enable = "avx2")]
3331 #[cfg_attr(test, assert_instr(vpsubsw))]
3332 #[stable(feature = "simd_x86", since = "1.27.0")]
3333 pub unsafe fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i {
3334 transmute(simd_saturating_sub(a.as_i16x16(), b.as_i16x16()))
3335 }
3336
3337 /// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in
3338 /// `a` using saturation.
3339 ///
3340 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_subs_epi8)
3341 #[inline]
3342 #[target_feature(enable = "avx2")]
3343 #[cfg_attr(test, assert_instr(vpsubsb))]
3344 #[stable(feature = "simd_x86", since = "1.27.0")]
3345 pub unsafe fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i {
3346 transmute(simd_saturating_sub(a.as_i8x32(), b.as_i8x32()))
3347 }
3348
3349 /// Subtracts packed unsigned 16-bit integers in `b` from packed unsigned
3350 /// 16-bit integers in `a` using saturation.
3351 ///
3352 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_subs_epu16)
3353 #[inline]
3354 #[target_feature(enable = "avx2")]
3355 #[cfg_attr(test, assert_instr(vpsubusw))]
3356 #[stable(feature = "simd_x86", since = "1.27.0")]
3357 pub unsafe fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> __m256i {
3358 transmute(simd_saturating_sub(a.as_u16x16(), b.as_u16x16()))
3359 }
3360
3361 /// Subtracts packed unsigned 8-bit integers in `b` from packed unsigned
3362 /// 8-bit integers in `a` using saturation.
3363 ///
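/// A minimal usage sketch (illustrative, not part of the upstream docs),
/// assuming AVX2 support has been detected at runtime:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// // unsigned saturating subtraction clamps at zero instead of wrapping
/// let a = _mm256_set1_epi8(1);
/// let b = _mm256_set1_epi8(2);
/// let c = _mm256_subs_epu8(a, b);
/// let expected = _mm256_set1_epi8(0);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///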
3364 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_subs_epu8)
3365 #[inline]
3366 #[target_feature(enable = "avx2")]
3367 #[cfg_attr(test, assert_instr(vpsubusb))]
3368 #[stable(feature = "simd_x86", since = "1.27.0")]
3369 pub unsafe fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i {
3370 transmute(simd_saturating_sub(a.as_u8x32(), b.as_u8x32()))
3371 }
3372
3373 /// Unpacks and interleaves 8-bit integers from the high half of each
3374 /// 128-bit lane of `a` and `b`.
3375 ///
3376 /// ```rust
3377 /// #[cfg(target_arch = "x86")]
3378 /// use std::arch::x86::*;
3379 /// #[cfg(target_arch = "x86_64")]
3380 /// use std::arch::x86_64::*;
3381 ///
3382 /// # fn main() {
3383 /// # if is_x86_feature_detected!("avx2") {
3384 /// # #[target_feature(enable = "avx2")]
3385 /// # unsafe fn worker() {
3386 /// let a = _mm256_setr_epi8(
3387 /// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
3388 /// 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3389 /// );
3390 /// let b = _mm256_setr_epi8(
3391 /// 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3392 /// -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
3393 /// -30, -31,
3394 /// );
3395 ///
3396 /// let c = _mm256_unpackhi_epi8(a, b);
3397 ///
3398 /// let expected = _mm256_setr_epi8(
3399 /// 8, -8, 9, -9, 10, -10, 11, -11, 12, -12, 13, -13, 14, -14, 15, -15,
3400 /// 24, -24, 25, -25, 26, -26, 27, -27, 28, -28, 29, -29, 30, -30, 31,
3401 /// -31,
3402 /// );
3403 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3404 ///
3405 /// # }
3406 /// # unsafe { worker(); }
3407 /// # }
3408 /// # }
3409 /// ```
3410 ///
3411 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_epi8)
3412 #[inline]
3413 #[target_feature(enable = "avx2")]
3414 #[cfg_attr(test, assert_instr(vpunpckhbw))]
3415 #[stable(feature = "simd_x86", since = "1.27.0")]
3416 pub unsafe fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i {
3417 #[rustfmt::skip]
3418 let r: i8x32 = simd_shuffle32(a.as_i8x32(), b.as_i8x32(), [
3419 8, 40, 9, 41, 10, 42, 11, 43,
3420 12, 44, 13, 45, 14, 46, 15, 47,
3421 24, 56, 25, 57, 26, 58, 27, 59,
3422 28, 60, 29, 61, 30, 62, 31, 63,
3423 ]);
3424 transmute(r)
3425 }
3426
3427 /// Unpacks and interleaves 8-bit integers from the low half of each
3428 /// 128-bit lane of `a` and `b`.
3429 ///
3430 /// ```rust
3431 /// #[cfg(target_arch = "x86")]
3432 /// use std::arch::x86::*;
3433 /// #[cfg(target_arch = "x86_64")]
3434 /// use std::arch::x86_64::*;
3435 ///
3436 /// # fn main() {
3437 /// # if is_x86_feature_detected!("avx2") {
3438 /// # #[target_feature(enable = "avx2")]
3439 /// # unsafe fn worker() {
3440 /// let a = _mm256_setr_epi8(
3441 /// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
3442 /// 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3443 /// );
3444 /// let b = _mm256_setr_epi8(
3445 /// 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3446 /// -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
3447 /// -30, -31,
3448 /// );
3449 ///
3450 /// let c = _mm256_unpacklo_epi8(a, b);
3451 ///
3452 /// let expected = _mm256_setr_epi8(
3453 /// 0, 0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7, 16, -16, 17,
3454 /// -17, 18, -18, 19, -19, 20, -20, 21, -21, 22, -22, 23, -23,
3455 /// );
3456 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3457 ///
3458 /// # }
3459 /// # unsafe { worker(); }
3460 /// # }
3461 /// # }
3462 /// ```
3463 ///
3464 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_epi8)
3465 #[inline]
3466 #[target_feature(enable = "avx2")]
3467 #[cfg_attr(test, assert_instr(vpunpcklbw))]
3468 #[stable(feature = "simd_x86", since = "1.27.0")]
3469 pub unsafe fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i {
3470 #[rustfmt::skip]
3471 let r: i8x32 = simd_shuffle32(a.as_i8x32(), b.as_i8x32(), [
3472 0, 32, 1, 33, 2, 34, 3, 35,
3473 4, 36, 5, 37, 6, 38, 7, 39,
3474 16, 48, 17, 49, 18, 50, 19, 51,
3475 20, 52, 21, 53, 22, 54, 23, 55,
3476 ]);
3477 transmute(r)
3478 }
3479
3480 /// Unpacks and interleaves 16-bit integers from the high half of each
3481 /// 128-bit lane of `a` and `b`.
3482 ///
3483 /// ```rust
3484 /// #[cfg(target_arch = "x86")]
3485 /// use std::arch::x86::*;
3486 /// #[cfg(target_arch = "x86_64")]
3487 /// use std::arch::x86_64::*;
3488 ///
3489 /// # fn main() {
3490 /// # if is_x86_feature_detected!("avx2") {
3491 /// # #[target_feature(enable = "avx2")]
3492 /// # unsafe fn worker() {
3493 /// let a = _mm256_setr_epi16(
3494 /// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3495 /// );
3496 /// let b = _mm256_setr_epi16(
3497 /// 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3498 /// );
3499 ///
3500 /// let c = _mm256_unpackhi_epi16(a, b);
3501 ///
3502 /// let expected = _mm256_setr_epi16(
3503 /// 4, -4, 5, -5, 6, -6, 7, -7, 12, -12, 13, -13, 14, -14, 15, -15,
3504 /// );
3505 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3506 ///
3507 /// # }
3508 /// # unsafe { worker(); }
3509 /// # }
3510 /// # }
3511 /// ```
3512 ///
3513 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_epi16)
3514 #[inline]
3515 #[target_feature(enable = "avx2")]
3516 #[cfg_attr(test, assert_instr(vpunpckhwd))]
3517 #[stable(feature = "simd_x86", since = "1.27.0")]
3518 pub unsafe fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i {
3519 let r: i16x16 = simd_shuffle16(
3520 a.as_i16x16(),
3521 b.as_i16x16(),
3522 [4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31],
3523 );
3524 transmute(r)
3525 }
3526
3527 /// Unpacks and interleaves 16-bit integers from the low half of each
3528 /// 128-bit lane of `a` and `b`.
3529 ///
3530 /// ```rust
3531 /// #[cfg(target_arch = "x86")]
3532 /// use std::arch::x86::*;
3533 /// #[cfg(target_arch = "x86_64")]
3534 /// use std::arch::x86_64::*;
3535 ///
3536 /// # fn main() {
3537 /// # if is_x86_feature_detected!("avx2") {
3538 /// # #[target_feature(enable = "avx2")]
3539 /// # unsafe fn worker() {
3540 ///
3541 /// let a = _mm256_setr_epi16(
3542 /// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3543 /// );
3544 /// let b = _mm256_setr_epi16(
3545 /// 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3546 /// );
3547 ///
3548 /// let c = _mm256_unpacklo_epi16(a, b);
3549 ///
3550 /// let expected = _mm256_setr_epi16(
3551 /// 0, 0, 1, -1, 2, -2, 3, -3, 8, -8, 9, -9, 10, -10, 11, -11,
3552 /// );
3553 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3554 ///
3555 /// # }
3556 /// # unsafe { worker(); }
3557 /// # }
3558 /// # }
3559 /// ```
3560 ///
3561 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_epi16)
3562 #[inline]
3563 #[target_feature(enable = "avx2")]
3564 #[cfg_attr(test, assert_instr(vpunpcklwd))]
3565 #[stable(feature = "simd_x86", since = "1.27.0")]
3566 pub unsafe fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i {
3567 let r: i16x16 = simd_shuffle16(
3568 a.as_i16x16(),
3569 b.as_i16x16(),
3570 [0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27],
3571 );
3572 transmute(r)
3573 }
3574
3575 /// Unpacks and interleaves 32-bit integers from the high half of each
3576 /// 128-bit lane of `a` and `b`.
3577 ///
3578 /// ```rust
3579 /// #[cfg(target_arch = "x86")]
3580 /// use std::arch::x86::*;
3581 /// #[cfg(target_arch = "x86_64")]
3582 /// use std::arch::x86_64::*;
3583 ///
3584 /// # fn main() {
3585 /// # if is_x86_feature_detected!("avx2") {
3586 /// # #[target_feature(enable = "avx2")]
3587 /// # unsafe fn worker() {
3588 /// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
3589 /// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7);
3590 ///
3591 /// let c = _mm256_unpackhi_epi32(a, b);
3592 ///
3593 /// let expected = _mm256_setr_epi32(2, -2, 3, -3, 6, -6, 7, -7);
3594 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3595 ///
3596 /// # }
3597 /// # unsafe { worker(); }
3598 /// # }
3599 /// # }
3600 /// ```
3601 ///
3602 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_epi32)
3603 #[inline]
3604 #[target_feature(enable = "avx2")]
3605 #[cfg_attr(test, assert_instr(vunpckhps))]
3606 #[stable(feature = "simd_x86", since = "1.27.0")]
3607 pub unsafe fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i {
3608 let r: i32x8 = simd_shuffle8(a.as_i32x8(), b.as_i32x8(), [2, 10, 3, 11, 6, 14, 7, 15]);
3609 transmute(r)
3610 }
3611
3612 /// Unpacks and interleaves 32-bit integers from the low half of each
3613 /// 128-bit lane of `a` and `b`.
3614 ///
3615 /// ```rust
3616 /// #[cfg(target_arch = "x86")]
3617 /// use std::arch::x86::*;
3618 /// #[cfg(target_arch = "x86_64")]
3619 /// use std::arch::x86_64::*;
3620 ///
3621 /// # fn main() {
3622 /// # if is_x86_feature_detected!("avx2") {
3623 /// # #[target_feature(enable = "avx2")]
3624 /// # unsafe fn worker() {
3625 /// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
3626 /// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7);
3627 ///
3628 /// let c = _mm256_unpacklo_epi32(a, b);
3629 ///
3630 /// let expected = _mm256_setr_epi32(0, 0, 1, -1, 4, -4, 5, -5);
3631 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3632 ///
3633 /// # }
3634 /// # unsafe { worker(); }
3635 /// # }
3636 /// # }
3637 /// ```
3638 ///
3639 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_epi32)
3640 #[inline]
3641 #[target_feature(enable = "avx2")]
3642 #[cfg_attr(test, assert_instr(vunpcklps))]
3643 #[stable(feature = "simd_x86", since = "1.27.0")]
3644 pub unsafe fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i {
3645 let r: i32x8 = simd_shuffle8(a.as_i32x8(), b.as_i32x8(), [0, 8, 1, 9, 4, 12, 5, 13]);
3646 transmute(r)
3647 }
3648
3649 /// Unpacks and interleaves 64-bit integers from the high half of each
3650 /// 128-bit lane of `a` and `b`.
3651 ///
3652 /// ```rust
3653 /// #[cfg(target_arch = "x86")]
3654 /// use std::arch::x86::*;
3655 /// #[cfg(target_arch = "x86_64")]
3656 /// use std::arch::x86_64::*;
3657 ///
3658 /// # fn main() {
3659 /// # if is_x86_feature_detected!("avx2") {
3660 /// # #[target_feature(enable = "avx2")]
3661 /// # unsafe fn worker() {
3662 /// let a = _mm256_setr_epi64x(0, 1, 2, 3);
3663 /// let b = _mm256_setr_epi64x(0, -1, -2, -3);
3664 ///
3665 /// let c = _mm256_unpackhi_epi64(a, b);
3666 ///
3667 /// let expected = _mm256_setr_epi64x(1, -1, 3, -3);
3668 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3669 ///
3670 /// # }
3671 /// # unsafe { worker(); }
3672 /// # }
3673 /// # }
3674 /// ```
3675 ///
3676 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_epi64)
3677 #[inline]
3678 #[target_feature(enable = "avx2")]
3679 #[cfg_attr(test, assert_instr(vunpckhpd))]
3680 #[stable(feature = "simd_x86", since = "1.27.0")]
3681 pub unsafe fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i {
3682 let r: i64x4 = simd_shuffle4(a.as_i64x4(), b.as_i64x4(), [1, 5, 3, 7]);
3683 transmute(r)
3684 }
3685
3686 /// Unpacks and interleaves 64-bit integers from the low half of each
3687 /// 128-bit lane of `a` and `b`.
3688 ///
3689 /// ```rust
3690 /// #[cfg(target_arch = "x86")]
3691 /// use std::arch::x86::*;
3692 /// #[cfg(target_arch = "x86_64")]
3693 /// use std::arch::x86_64::*;
3694 ///
3695 /// # fn main() {
3696 /// # if is_x86_feature_detected!("avx2") {
3697 /// # #[target_feature(enable = "avx2")]
3698 /// # unsafe fn worker() {
3699 /// let a = _mm256_setr_epi64x(0, 1, 2, 3);
3700 /// let b = _mm256_setr_epi64x(0, -1, -2, -3);
3701 ///
3702 /// let c = _mm256_unpacklo_epi64(a, b);
3703 ///
3704 /// let expected = _mm256_setr_epi64x(0, 0, 2, -2);
3705 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3706 ///
3707 /// # }
3708 /// # unsafe { worker(); }
3709 /// # }
3710 /// # }
3711 /// ```
3712 ///
3713 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_epi64)
3714 #[inline]
3715 #[target_feature(enable = "avx2")]
3716 #[cfg_attr(test, assert_instr(vunpcklpd))]
3717 #[stable(feature = "simd_x86", since = "1.27.0")]
3718 pub unsafe fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i {
3719 let r: i64x4 = simd_shuffle4(a.as_i64x4(), b.as_i64x4(), [0, 4, 2, 6]);
3720 transmute(r)
3721 }
3722
3723 /// Computes the bitwise XOR of 256 bits (representing integer data)
3724 /// in `a` and `b`.
3725 ///
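/// A minimal usage sketch (illustrative, not part of the upstream docs),
/// assuming AVX2 support has been detected at runtime:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// // the XOR operates on the full 256 bits regardless of element size
/// let a = _mm256_set1_epi8(0b0101);
/// let b = _mm256_set1_epi8(0b0011);
/// let c = _mm256_xor_si256(a, b);
/// let expected = _mm256_set1_epi8(0b0110);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///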
3726 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_xor_si256)
3727 #[inline]
3728 #[target_feature(enable = "avx2")]
3729 #[cfg_attr(test, assert_instr(vxorps))]
3730 #[stable(feature = "simd_x86", since = "1.27.0")]
3731 pub unsafe fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i {
3732 transmute(simd_xor(a.as_i64x4(), b.as_i64x4()))
3733 }
3734
3735 /// Extracts an 8-bit integer from `a`, selected with `imm8`. Returns a 32-bit
3736 /// integer containing the zero-extended integer data.
3737 ///
3738 /// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
3739 ///
3740 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extract_epi8)
3741 #[inline]
3742 #[target_feature(enable = "avx2")]
3743 // This intrinsic has no corresponding instruction.
3744 #[rustc_args_required_const(1)]
3745 #[stable(feature = "simd_x86", since = "1.27.0")]
3746 pub unsafe fn _mm256_extract_epi8(a: __m256i, imm8: i32) -> i8 {
3747 let imm8 = (imm8 & 31) as u32;
3748 simd_extract(a.as_i8x32(), imm8)
3749 }
3750
3751 /// Extracts a 16-bit integer from `a`, selected with `imm8`. Returns a 32-bit
3752 /// integer containing the zero-extended integer data.
3753 ///
3754 /// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
3755 ///
3756 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extract_epi16)
3757 #[inline]
3758 #[target_feature(enable = "avx2")]
3759 // This intrinsic has no corresponding instruction.
3760 #[rustc_args_required_const(1)]
3761 #[stable(feature = "simd_x86", since = "1.27.0")]
3762 pub unsafe fn _mm256_extract_epi16(a: __m256i, imm8: i32) -> i16 {
3763 let imm8 = (imm8 & 15) as u32;
3764 simd_extract(a.as_i16x16(), imm8)
3765 }
3766
3767 /// Extracts a 32-bit integer from `a`, selected with `imm8`.
3768 ///
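/// A minimal usage sketch (illustrative, not part of the upstream docs),
/// assuming AVX2 support has been detected at runtime:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// // `imm8` selects one of the eight 32-bit elements
/// let a = _mm256_setr_epi32(0, 10, 20, 30, 40, 50, 60, 70);
/// assert_eq!(_mm256_extract_epi32(a, 3), 30);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///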
3769 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extract_epi32)
3770 #[inline]
3771 #[target_feature(enable = "avx2")]
3772 // This intrinsic has no corresponding instruction.
3773 #[rustc_args_required_const(1)]
3774 #[stable(feature = "simd_x86", since = "1.27.0")]
3775 pub unsafe fn _mm256_extract_epi32(a: __m256i, imm8: i32) -> i32 {
3776 let imm8 = (imm8 & 7) as u32;
3777 simd_extract(a.as_i32x8(), imm8)
3778 }
3779
3780 /// Returns the first element of the input vector of `[4 x double]`.
3781 ///
3782 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsd_f64)
3783 #[inline]
3784 #[target_feature(enable = "avx2")]
3785 //#[cfg_attr(test, assert_instr(movsd))] FIXME
3786 #[stable(feature = "simd_x86", since = "1.27.0")]
3787 pub unsafe fn _mm256_cvtsd_f64(a: __m256d) -> f64 {
3788 simd_extract(a, 0)
3789 }
3790
3791 /// Returns the first element of the input vector of `[8 x i32]`.
3792 ///
3793 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsi256_si32)
3794 #[inline]
3795 #[target_feature(enable = "avx2")]
3796 //#[cfg_attr(test, assert_instr(movd))] FIXME
3797 #[stable(feature = "simd_x86", since = "1.27.0")]
3798 pub unsafe fn _mm256_cvtsi256_si32(a: __m256i) -> i32 {
3799 simd_extract(a.as_i32x8(), 0)
3800 }
3801
3802 #[allow(improper_ctypes)]
3803 extern "C" {
3804 #[link_name = "llvm.x86.avx2.pabs.b"]
3805 fn pabsb(a: i8x32) -> u8x32;
3806 #[link_name = "llvm.x86.avx2.pabs.w"]
3807 fn pabsw(a: i16x16) -> u16x16;
3808 #[link_name = "llvm.x86.avx2.pabs.d"]
3809 fn pabsd(a: i32x8) -> u32x8;
3810 #[link_name = "llvm.x86.avx2.pavg.b"]
3811 fn pavgb(a: u8x32, b: u8x32) -> u8x32;
3812 #[link_name = "llvm.x86.avx2.pavg.w"]
3813 fn pavgw(a: u16x16, b: u16x16) -> u16x16;
3814 #[link_name = "llvm.x86.avx2.pblendvb"]
3815 fn pblendvb(a: i8x32, b: i8x32, mask: i8x32) -> i8x32;
3816 #[link_name = "llvm.x86.avx2.phadd.w"]
3817 fn phaddw(a: i16x16, b: i16x16) -> i16x16;
3818 #[link_name = "llvm.x86.avx2.phadd.d"]
3819 fn phaddd(a: i32x8, b: i32x8) -> i32x8;
3820 #[link_name = "llvm.x86.avx2.phadd.sw"]
3821 fn phaddsw(a: i16x16, b: i16x16) -> i16x16;
3822 #[link_name = "llvm.x86.avx2.phsub.w"]
3823 fn phsubw(a: i16x16, b: i16x16) -> i16x16;
3824 #[link_name = "llvm.x86.avx2.phsub.d"]
3825 fn phsubd(a: i32x8, b: i32x8) -> i32x8;
3826 #[link_name = "llvm.x86.avx2.phsub.sw"]
3827 fn phsubsw(a: i16x16, b: i16x16) -> i16x16;
3828 #[link_name = "llvm.x86.avx2.pmadd.wd"]
3829 fn pmaddwd(a: i16x16, b: i16x16) -> i32x8;
3830 #[link_name = "llvm.x86.avx2.pmadd.ub.sw"]
3831 fn pmaddubsw(a: u8x32, b: u8x32) -> i16x16;
3832 #[link_name = "llvm.x86.avx2.maskload.d"]
3833 fn maskloadd(mem_addr: *const i8, mask: i32x4) -> i32x4;
3834 #[link_name = "llvm.x86.avx2.maskload.d.256"]
3835 fn maskloadd256(mem_addr: *const i8, mask: i32x8) -> i32x8;
3836 #[link_name = "llvm.x86.avx2.maskload.q"]
3837 fn maskloadq(mem_addr: *const i8, mask: i64x2) -> i64x2;
3838 #[link_name = "llvm.x86.avx2.maskload.q.256"]
3839 fn maskloadq256(mem_addr: *const i8, mask: i64x4) -> i64x4;
3840 #[link_name = "llvm.x86.avx2.maskstore.d"]
3841 fn maskstored(mem_addr: *mut i8, mask: i32x4, a: i32x4);
3842 #[link_name = "llvm.x86.avx2.maskstore.d.256"]
3843 fn maskstored256(mem_addr: *mut i8, mask: i32x8, a: i32x8);
3844 #[link_name = "llvm.x86.avx2.maskstore.q"]
3845 fn maskstoreq(mem_addr: *mut i8, mask: i64x2, a: i64x2);
3846 #[link_name = "llvm.x86.avx2.maskstore.q.256"]
3847 fn maskstoreq256(mem_addr: *mut i8, mask: i64x4, a: i64x4);
3848 #[link_name = "llvm.x86.avx2.pmaxs.w"]
3849 fn pmaxsw(a: i16x16, b: i16x16) -> i16x16;
3850 #[link_name = "llvm.x86.avx2.pmaxs.d"]
3851 fn pmaxsd(a: i32x8, b: i32x8) -> i32x8;
3852 #[link_name = "llvm.x86.avx2.pmaxs.b"]
3853 fn pmaxsb(a: i8x32, b: i8x32) -> i8x32;
3854 #[link_name = "llvm.x86.avx2.pmaxu.w"]
3855 fn pmaxuw(a: u16x16, b: u16x16) -> u16x16;
3856 #[link_name = "llvm.x86.avx2.pmaxu.d"]
3857 fn pmaxud(a: u32x8, b: u32x8) -> u32x8;
3858 #[link_name = "llvm.x86.avx2.pmaxu.b"]
3859 fn pmaxub(a: u8x32, b: u8x32) -> u8x32;
3860 #[link_name = "llvm.x86.avx2.pmins.w"]
3861 fn pminsw(a: i16x16, b: i16x16) -> i16x16;
3862 #[link_name = "llvm.x86.avx2.pmins.d"]
3863 fn pminsd(a: i32x8, b: i32x8) -> i32x8;
3864 #[link_name = "llvm.x86.avx2.pmins.b"]
3865 fn pminsb(a: i8x32, b: i8x32) -> i8x32;
3866 #[link_name = "llvm.x86.avx2.pminu.w"]
3867 fn pminuw(a: u16x16, b: u16x16) -> u16x16;
3868 #[link_name = "llvm.x86.avx2.pminu.d"]
3869 fn pminud(a: u32x8, b: u32x8) -> u32x8;
3870 #[link_name = "llvm.x86.avx2.pminu.b"]
3871 fn pminub(a: u8x32, b: u8x32) -> u8x32;
3872 #[link_name = "llvm.x86.avx2.pmovmskb"]
3873 fn pmovmskb(a: i8x32) -> i32;
3874 #[link_name = "llvm.x86.avx2.mpsadbw"]
3875 fn mpsadbw(a: u8x32, b: u8x32, imm8: i32) -> u16x16;
3876 #[link_name = "llvm.x86.avx2.pmulhu.w"]
3877 fn pmulhuw(a: u16x16, b: u16x16) -> u16x16;
3878 #[link_name = "llvm.x86.avx2.pmulh.w"]
3879 fn pmulhw(a: i16x16, b: i16x16) -> i16x16;
3880 #[link_name = "llvm.x86.avx2.pmul.dq"]
3881 fn pmuldq(a: i32x8, b: i32x8) -> i64x4;
3882 #[link_name = "llvm.x86.avx2.pmulu.dq"]
3883 fn pmuludq(a: u32x8, b: u32x8) -> u64x4;
3884 #[link_name = "llvm.x86.avx2.pmul.hr.sw"]
3885 fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16;
3886 #[link_name = "llvm.x86.avx2.packsswb"]
3887 fn packsswb(a: i16x16, b: i16x16) -> i8x32;
3888 #[link_name = "llvm.x86.avx2.packssdw"]
3889 fn packssdw(a: i32x8, b: i32x8) -> i16x16;
3890 #[link_name = "llvm.x86.avx2.packuswb"]
3891 fn packuswb(a: i16x16, b: i16x16) -> u8x32;
3892 #[link_name = "llvm.x86.avx2.packusdw"]
3893 fn packusdw(a: i32x8, b: i32x8) -> u16x16;
3894 #[link_name = "llvm.x86.avx2.psad.bw"]
3895 fn psadbw(a: u8x32, b: u8x32) -> u64x4;
3896 #[link_name = "llvm.x86.avx2.psign.b"]
3897 fn psignb(a: i8x32, b: i8x32) -> i8x32;
3898 #[link_name = "llvm.x86.avx2.psign.w"]
3899 fn psignw(a: i16x16, b: i16x16) -> i16x16;
3900 #[link_name = "llvm.x86.avx2.psign.d"]
3901 fn psignd(a: i32x8, b: i32x8) -> i32x8;
3902 #[link_name = "llvm.x86.avx2.psll.w"]
3903 fn psllw(a: i16x16, count: i16x8) -> i16x16;
3904 #[link_name = "llvm.x86.avx2.psll.d"]
3905 fn pslld(a: i32x8, count: i32x4) -> i32x8;
3906 #[link_name = "llvm.x86.avx2.psll.q"]
3907 fn psllq(a: i64x4, count: i64x2) -> i64x4;
3908 #[link_name = "llvm.x86.avx2.pslli.w"]
3909 fn pslliw(a: i16x16, imm8: i32) -> i16x16;
3910 #[link_name = "llvm.x86.avx2.pslli.d"]
3911 fn psllid(a: i32x8, imm8: i32) -> i32x8;
3912 #[link_name = "llvm.x86.avx2.pslli.q"]
3913 fn pslliq(a: i64x4, imm8: i32) -> i64x4;
3914 #[link_name = "llvm.x86.avx2.psllv.d"]
3915 fn psllvd(a: i32x4, count: i32x4) -> i32x4;
3916 #[link_name = "llvm.x86.avx2.psllv.d.256"]
3917 fn psllvd256(a: i32x8, count: i32x8) -> i32x8;
3918 #[link_name = "llvm.x86.avx2.psllv.q"]
3919 fn psllvq(a: i64x2, count: i64x2) -> i64x2;
3920 #[link_name = "llvm.x86.avx2.psllv.q.256"]
3921 fn psllvq256(a: i64x4, count: i64x4) -> i64x4;
3922 #[link_name = "llvm.x86.avx2.psra.w"]
3923 fn psraw(a: i16x16, count: i16x8) -> i16x16;
3924 #[link_name = "llvm.x86.avx2.psra.d"]
3925 fn psrad(a: i32x8, count: i32x4) -> i32x8;
3926 #[link_name = "llvm.x86.avx2.psrai.w"]
3927 fn psraiw(a: i16x16, imm8: i32) -> i16x16;
3928 #[link_name = "llvm.x86.avx2.psrai.d"]
3929 fn psraid(a: i32x8, imm8: i32) -> i32x8;
3930 #[link_name = "llvm.x86.avx2.psrav.d"]
3931 fn psravd(a: i32x4, count: i32x4) -> i32x4;
3932 #[link_name = "llvm.x86.avx2.psrav.d.256"]
3933 fn psravd256(a: i32x8, count: i32x8) -> i32x8;
3934 #[link_name = "llvm.x86.avx2.psrl.w"]
3935 fn psrlw(a: i16x16, count: i16x8) -> i16x16;
3936 #[link_name = "llvm.x86.avx2.psrl.d"]
3937 fn psrld(a: i32x8, count: i32x4) -> i32x8;
3938 #[link_name = "llvm.x86.avx2.psrl.q"]
3939 fn psrlq(a: i64x4, count: i64x2) -> i64x4;
3940 #[link_name = "llvm.x86.avx2.psrli.w"]
3941 fn psrliw(a: i16x16, imm8: i32) -> i16x16;
3942 #[link_name = "llvm.x86.avx2.psrli.d"]
3943 fn psrlid(a: i32x8, imm8: i32) -> i32x8;
3944 #[link_name = "llvm.x86.avx2.psrli.q"]
3945 fn psrliq(a: i64x4, imm8: i32) -> i64x4;
3946 #[link_name = "llvm.x86.avx2.psrlv.d"]
3947 fn psrlvd(a: i32x4, count: i32x4) -> i32x4;
3948 #[link_name = "llvm.x86.avx2.psrlv.d.256"]
3949 fn psrlvd256(a: i32x8, count: i32x8) -> i32x8;
3950 #[link_name = "llvm.x86.avx2.psrlv.q"]
3951 fn psrlvq(a: i64x2, count: i64x2) -> i64x2;
3952 #[link_name = "llvm.x86.avx2.psrlv.q.256"]
3953 fn psrlvq256(a: i64x4, count: i64x4) -> i64x4;
3954 #[link_name = "llvm.x86.avx2.pshuf.b"]
3955 fn pshufb(a: u8x32, b: u8x32) -> u8x32;
3956 #[link_name = "llvm.x86.avx2.permd"]
3957 fn permd(a: u32x8, b: u32x8) -> u32x8;
3958 #[link_name = "llvm.x86.avx2.permps"]
3959 fn permps(a: __m256, b: i32x8) -> __m256;
3960 #[link_name = "llvm.x86.avx2.vperm2i128"]
3961 fn vperm2i128(a: i64x4, b: i64x4, imm8: i8) -> i64x4;
3962 #[link_name = "llvm.x86.avx2.gather.d.d"]
3963 fn pgatherdd(src: i32x4, slice: *const i8, offsets: i32x4, mask: i32x4, scale: i8) -> i32x4;
3964 #[link_name = "llvm.x86.avx2.gather.d.d.256"]
3965 fn vpgatherdd(src: i32x8, slice: *const i8, offsets: i32x8, mask: i32x8, scale: i8) -> i32x8;
3966 #[link_name = "llvm.x86.avx2.gather.d.q"]
3967 fn pgatherdq(src: i64x2, slice: *const i8, offsets: i32x4, mask: i64x2, scale: i8) -> i64x2;
3968 #[link_name = "llvm.x86.avx2.gather.d.q.256"]
3969 fn vpgatherdq(src: i64x4, slice: *const i8, offsets: i32x4, mask: i64x4, scale: i8) -> i64x4;
3970 #[link_name = "llvm.x86.avx2.gather.q.d"]
3971 fn pgatherqd(src: i32x4, slice: *const i8, offsets: i64x2, mask: i32x4, scale: i8) -> i32x4;
3972 #[link_name = "llvm.x86.avx2.gather.q.d.256"]
3973 fn vpgatherqd(src: i32x4, slice: *const i8, offsets: i64x4, mask: i32x4, scale: i8) -> i32x4;
3974 #[link_name = "llvm.x86.avx2.gather.q.q"]
3975 fn pgatherqq(src: i64x2, slice: *const i8, offsets: i64x2, mask: i64x2, scale: i8) -> i64x2;
3976 #[link_name = "llvm.x86.avx2.gather.q.q.256"]
3977 fn vpgatherqq(src: i64x4, slice: *const i8, offsets: i64x4, mask: i64x4, scale: i8) -> i64x4;
3978 #[link_name = "llvm.x86.avx2.gather.d.pd"]
3979 fn pgatherdpd(
3980 src: __m128d,
3981 slice: *const i8,
3982 offsets: i32x4,
3983 mask: __m128d,
3984 scale: i8,
3985 ) -> __m128d;
3986 #[link_name = "llvm.x86.avx2.gather.d.pd.256"]
3987 fn vpgatherdpd(
3988 src: __m256d,
3989 slice: *const i8,
3990 offsets: i32x4,
3991 mask: __m256d,
3992 scale: i8,
3993 ) -> __m256d;
3994 #[link_name = "llvm.x86.avx2.gather.q.pd"]
3995 fn pgatherqpd(
3996 src: __m128d,
3997 slice: *const i8,
3998 offsets: i64x2,
3999 mask: __m128d,
4000 scale: i8,
4001 ) -> __m128d;
4002 #[link_name = "llvm.x86.avx2.gather.q.pd.256"]
4003 fn vpgatherqpd(
4004 src: __m256d,
4005 slice: *const i8,
4006 offsets: i64x4,
4007 mask: __m256d,
4008 scale: i8,
4009 ) -> __m256d;
4010 #[link_name = "llvm.x86.avx2.gather.d.ps"]
4011 fn pgatherdps(src: __m128, slice: *const i8, offsets: i32x4, mask: __m128, scale: i8)
4012 -> __m128;
4013 #[link_name = "llvm.x86.avx2.gather.d.ps.256"]
4014 fn vpgatherdps(
4015 src: __m256,
4016 slice: *const i8,
4017 offsets: i32x8,
4018 mask: __m256,
4019 scale: i8,
4020 ) -> __m256;
4021 #[link_name = "llvm.x86.avx2.gather.q.ps"]
4022 fn pgatherqps(src: __m128, slice: *const i8, offsets: i64x2, mask: __m128, scale: i8)
4023 -> __m128;
4024 #[link_name = "llvm.x86.avx2.gather.q.ps.256"]
4025 fn vpgatherqps(
4026 src: __m128,
4027 slice: *const i8,
4028 offsets: i64x4,
4029 mask: __m128,
4030 scale: i8,
4031 ) -> __m128;
4032 #[link_name = "llvm.x86.avx2.psll.dq"]
4033 fn vpslldq(a: i64x4, b: i32) -> i64x4;
4034 #[link_name = "llvm.x86.avx2.psrl.dq"]
4035 fn vpsrldq(a: i64x4, b: i32) -> i64x4;
4036 }
4037
4038 #[cfg(test)]
4039 mod tests {
4040 use std;
4041 use stdarch_test::simd_test;
4042
4043 use crate::core_arch::x86::*;
4044
4045 #[simd_test(enable = "avx2")]
4046 unsafe fn test_mm256_abs_epi32() {
4047 #[rustfmt::skip]
4048 let a = _mm256_setr_epi32(
4049 0, 1, -1, i32::MAX,
4050 i32::MIN, 100, -100, -32,
4051 );
4052 let r = _mm256_abs_epi32(a);
4053 #[rustfmt::skip]
4054 let e = _mm256_setr_epi32(
4055 0, 1, 1, i32::MAX,
4056 i32::MAX.wrapping_add(1), 100, 100, 32,
4057 );
4058 assert_eq_m256i(r, e);
4059 }
4060
4061 #[simd_test(enable = "avx2")]
4062 unsafe fn test_mm256_abs_epi16() {
4063 #[rustfmt::skip]
4064 let a = _mm256_setr_epi16(
4065 0, 1, -1, 2, -2, 3, -3, 4,
4066 -4, 5, -5, i16::MAX, i16::MIN, 100, -100, -32,
4067 );
4068 let r = _mm256_abs_epi16(a);
4069 #[rustfmt::skip]
4070 let e = _mm256_setr_epi16(
4071 0, 1, 1, 2, 2, 3, 3, 4,
4072 4, 5, 5, i16::MAX, i16::MAX.wrapping_add(1), 100, 100, 32,
4073 );
4074 assert_eq_m256i(r, e);
4075 }
4076
4077 #[simd_test(enable = "avx2")]
4078 unsafe fn test_mm256_abs_epi8() {
4079 #[rustfmt::skip]
4080 let a = _mm256_setr_epi8(
4081 0, 1, -1, 2, -2, 3, -3, 4,
4082 -4, 5, -5, i8::MAX, i8::MIN, 100, -100, -32,
4083 0, 1, -1, 2, -2, 3, -3, 4,
4084 -4, 5, -5, i8::MAX, i8::MIN, 100, -100, -32,
4085 );
4086 let r = _mm256_abs_epi8(a);
4087 #[rustfmt::skip]
4088 let e = _mm256_setr_epi8(
4089 0, 1, 1, 2, 2, 3, 3, 4,
4090 4, 5, 5, i8::MAX, i8::MAX.wrapping_add(1), 100, 100, 32,
4091 0, 1, 1, 2, 2, 3, 3, 4,
4092 4, 5, 5, i8::MAX, i8::MAX.wrapping_add(1), 100, 100, 32,
4093 );
4094 assert_eq_m256i(r, e);
4095 }
4096
4097 #[simd_test(enable = "avx2")]
4098 unsafe fn test_mm256_add_epi64() {
4099 let a = _mm256_setr_epi64x(-10, 0, 100, 1_000_000_000);
4100 let b = _mm256_setr_epi64x(-1, 0, 1, 2);
4101 let r = _mm256_add_epi64(a, b);
4102 let e = _mm256_setr_epi64x(-11, 0, 101, 1_000_000_002);
4103 assert_eq_m256i(r, e);
4104 }
4105
4106 #[simd_test(enable = "avx2")]
4107 unsafe fn test_mm256_add_epi32() {
4108 let a = _mm256_setr_epi32(-1, 0, 1, 2, 3, 4, 5, 6);
4109 let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4110 let r = _mm256_add_epi32(a, b);
4111 let e = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14);
4112 assert_eq_m256i(r, e);
4113 }
4114
4115 #[simd_test(enable = "avx2")]
4116 unsafe fn test_mm256_add_epi16() {
4117 #[rustfmt::skip]
4118 let a = _mm256_setr_epi16(
4119 0, 1, 2, 3, 4, 5, 6, 7,
4120 8, 9, 10, 11, 12, 13, 14, 15,
4121 );
4122 #[rustfmt::skip]
4123 let b = _mm256_setr_epi16(
4124 0, 1, 2, 3, 4, 5, 6, 7,
4125 8, 9, 10, 11, 12, 13, 14, 15,
4126 );
4127 let r = _mm256_add_epi16(a, b);
4128 #[rustfmt::skip]
4129 let e = _mm256_setr_epi16(
4130 0, 2, 4, 6, 8, 10, 12, 14,
4131 16, 18, 20, 22, 24, 26, 28, 30,
4132 );
4133 assert_eq_m256i(r, e);
4134 }
4135
4136 #[simd_test(enable = "avx2")]
4137 unsafe fn test_mm256_add_epi8() {
4138 #[rustfmt::skip]
4139 let a = _mm256_setr_epi8(
4140 0, 1, 2, 3, 4, 5, 6, 7,
4141 8, 9, 10, 11, 12, 13, 14, 15,
4142 16, 17, 18, 19, 20, 21, 22, 23,
4143 24, 25, 26, 27, 28, 29, 30, 31,
4144 );
4145 #[rustfmt::skip]
4146 let b = _mm256_setr_epi8(
4147 0, 1, 2, 3, 4, 5, 6, 7,
4148 8, 9, 10, 11, 12, 13, 14, 15,
4149 16, 17, 18, 19, 20, 21, 22, 23,
4150 24, 25, 26, 27, 28, 29, 30, 31,
4151 );
4152 let r = _mm256_add_epi8(a, b);
4153 #[rustfmt::skip]
4154 let e = _mm256_setr_epi8(
4155 0, 2, 4, 6, 8, 10, 12, 14,
4156 16, 18, 20, 22, 24, 26, 28, 30,
4157 32, 34, 36, 38, 40, 42, 44, 46,
4158 48, 50, 52, 54, 56, 58, 60, 62,
4159 );
4160 assert_eq_m256i(r, e);
4161 }
4162
4163 #[simd_test(enable = "avx2")]
4164 unsafe fn test_mm256_adds_epi8() {
4165 #[rustfmt::skip]
4166 let a = _mm256_setr_epi8(
4167 0, 1, 2, 3, 4, 5, 6, 7,
4168 8, 9, 10, 11, 12, 13, 14, 15,
4169 16, 17, 18, 19, 20, 21, 22, 23,
4170 24, 25, 26, 27, 28, 29, 30, 31,
4171 );
4172 #[rustfmt::skip]
4173 let b = _mm256_setr_epi8(
4174 32, 33, 34, 35, 36, 37, 38, 39,
4175 40, 41, 42, 43, 44, 45, 46, 47,
4176 48, 49, 50, 51, 52, 53, 54, 55,
4177 56, 57, 58, 59, 60, 61, 62, 63,
4178 );
4179 let r = _mm256_adds_epi8(a, b);
4180 #[rustfmt::skip]
4181 let e = _mm256_setr_epi8(
4182 32, 34, 36, 38, 40, 42, 44, 46,
4183 48, 50, 52, 54, 56, 58, 60, 62,
4184 64, 66, 68, 70, 72, 74, 76, 78,
4185 80, 82, 84, 86, 88, 90, 92, 94,
4186 );
4187 assert_eq_m256i(r, e);
4188 }
4189
4190 #[simd_test(enable = "avx2")]
4191 unsafe fn test_mm256_adds_epi8_saturate_positive() {
4192 let a = _mm256_set1_epi8(0x7F);
4193 let b = _mm256_set1_epi8(1);
4194 let r = _mm256_adds_epi8(a, b);
4195 assert_eq_m256i(r, a);
4196 }
4197
4198 #[simd_test(enable = "avx2")]
4199 unsafe fn test_mm256_adds_epi8_saturate_negative() {
4200 let a = _mm256_set1_epi8(-0x80);
4201 let b = _mm256_set1_epi8(-1);
4202 let r = _mm256_adds_epi8(a, b);
4203 assert_eq_m256i(r, a);
4204 }
4205
4206 #[simd_test(enable = "avx2")]
4207 unsafe fn test_mm256_adds_epi16() {
4208 #[rustfmt::skip]
4209 let a = _mm256_setr_epi16(
4210 0, 1, 2, 3, 4, 5, 6, 7,
4211 8, 9, 10, 11, 12, 13, 14, 15,
4212 );
4213 #[rustfmt::skip]
4214 let b = _mm256_setr_epi16(
4215 32, 33, 34, 35, 36, 37, 38, 39,
4216 40, 41, 42, 43, 44, 45, 46, 47,
4217 );
4218 let r = _mm256_adds_epi16(a, b);
4219 #[rustfmt::skip]
4220 let e = _mm256_setr_epi16(
4221 32, 34, 36, 38, 40, 42, 44, 46,
4222 48, 50, 52, 54, 56, 58, 60, 62,
4223 );
4224
4225 assert_eq_m256i(r, e);
4226 }
4227
4228 #[simd_test(enable = "avx2")]
4229 unsafe fn test_mm256_adds_epi16_saturate_positive() {
4230 let a = _mm256_set1_epi16(0x7FFF);
4231 let b = _mm256_set1_epi16(1);
4232 let r = _mm256_adds_epi16(a, b);
4233 assert_eq_m256i(r, a);
4234 }
4235
4236 #[simd_test(enable = "avx2")]
4237 unsafe fn test_mm256_adds_epi16_saturate_negative() {
4238 let a = _mm256_set1_epi16(-0x8000);
4239 let b = _mm256_set1_epi16(-1);
4240 let r = _mm256_adds_epi16(a, b);
4241 assert_eq_m256i(r, a);
4242 }
4243
4244 #[simd_test(enable = "avx2")]
4245 unsafe fn test_mm256_adds_epu8() {
4246 #[rustfmt::skip]
4247 let a = _mm256_setr_epi8(
4248 0, 1, 2, 3, 4, 5, 6, 7,
4249 8, 9, 10, 11, 12, 13, 14, 15,
4250 16, 17, 18, 19, 20, 21, 22, 23,
4251 24, 25, 26, 27, 28, 29, 30, 31,
4252 );
4253 #[rustfmt::skip]
4254 let b = _mm256_setr_epi8(
4255 32, 33, 34, 35, 36, 37, 38, 39,
4256 40, 41, 42, 43, 44, 45, 46, 47,
4257 48, 49, 50, 51, 52, 53, 54, 55,
4258 56, 57, 58, 59, 60, 61, 62, 63,
4259 );
4260 let r = _mm256_adds_epu8(a, b);
4261 #[rustfmt::skip]
4262 let e = _mm256_setr_epi8(
4263 32, 34, 36, 38, 40, 42, 44, 46,
4264 48, 50, 52, 54, 56, 58, 60, 62,
4265 64, 66, 68, 70, 72, 74, 76, 78,
4266 80, 82, 84, 86, 88, 90, 92, 94,
4267 );
4268 assert_eq_m256i(r, e);
4269 }
4270
4271 #[simd_test(enable = "avx2")]
4272 unsafe fn test_mm256_adds_epu8_saturate() {
4273 let a = _mm256_set1_epi8(!0);
4274 let b = _mm256_set1_epi8(1);
4275 let r = _mm256_adds_epu8(a, b);
4276 assert_eq_m256i(r, a);
4277 }
4278
4279 #[simd_test(enable = "avx2")]
4280 unsafe fn test_mm256_adds_epu16() {
4281 #[rustfmt::skip]
4282 let a = _mm256_setr_epi16(
4283 0, 1, 2, 3, 4, 5, 6, 7,
4284 8, 9, 10, 11, 12, 13, 14, 15,
4285 );
4286 #[rustfmt::skip]
4287 let b = _mm256_setr_epi16(
4288 32, 33, 34, 35, 36, 37, 38, 39,
4289 40, 41, 42, 43, 44, 45, 46, 47,
4290 );
4291 let r = _mm256_adds_epu16(a, b);
4292 #[rustfmt::skip]
4293 let e = _mm256_setr_epi16(
4294 32, 34, 36, 38, 40, 42, 44, 46,
4295 48, 50, 52, 54, 56, 58, 60, 62,
4296 );
4297
4298 assert_eq_m256i(r, e);
4299 }
4300
4301 #[simd_test(enable = "avx2")]
4302 unsafe fn test_mm256_adds_epu16_saturate() {
4303 let a = _mm256_set1_epi16(!0);
4304 let b = _mm256_set1_epi16(1);
4305 let r = _mm256_adds_epu16(a, b);
4306 assert_eq_m256i(r, a);
4307 }
4308
4309 #[simd_test(enable = "avx2")]
4310 unsafe fn test_mm256_and_si256() {
4311 let a = _mm256_set1_epi8(5);
4312 let b = _mm256_set1_epi8(3);
4313 let got = _mm256_and_si256(a, b);
4314 assert_eq_m256i(got, _mm256_set1_epi8(1));
4315 }
4316
4317 #[simd_test(enable = "avx2")]
4318 unsafe fn test_mm256_andnot_si256() {
4319 let a = _mm256_set1_epi8(5);
4320 let b = _mm256_set1_epi8(3);
4321 let got = _mm256_andnot_si256(a, b);
4322 assert_eq_m256i(got, _mm256_set1_epi8(2));
4323 }
4324
4325 #[simd_test(enable = "avx2")]
4326 unsafe fn test_mm256_avg_epu8() {
4327 let (a, b) = (_mm256_set1_epi8(3), _mm256_set1_epi8(9));
4328 let r = _mm256_avg_epu8(a, b);
4329 assert_eq_m256i(r, _mm256_set1_epi8(6));
4330 }
4331
4332 #[simd_test(enable = "avx2")]
4333 unsafe fn test_mm256_avg_epu16() {
4334 let (a, b) = (_mm256_set1_epi16(3), _mm256_set1_epi16(9));
4335 let r = _mm256_avg_epu16(a, b);
4336 assert_eq_m256i(r, _mm256_set1_epi16(6));
4337 }
4338
4339 #[simd_test(enable = "avx2")]
4340 unsafe fn test_mm_blend_epi32() {
4341 let (a, b) = (_mm_set1_epi32(3), _mm_set1_epi32(9));
4342 let e = _mm_setr_epi32(9, 3, 3, 3);
4343 let r = _mm_blend_epi32(a, b, 0x01 as i32);
4344 assert_eq_m128i(r, e);
4345
4346 let r = _mm_blend_epi32(b, a, 0x0E as i32);
4347 assert_eq_m128i(r, e);
4348 }
4349
4350 #[simd_test(enable = "avx2")]
4351 unsafe fn test_mm256_blend_epi32() {
4352 let (a, b) = (_mm256_set1_epi32(3), _mm256_set1_epi32(9));
4353 let e = _mm256_setr_epi32(9, 3, 3, 3, 3, 3, 3, 3);
4354 let r = _mm256_blend_epi32(a, b, 0x01 as i32);
4355 assert_eq_m256i(r, e);
4356
4357 let e = _mm256_setr_epi32(3, 9, 3, 3, 3, 3, 3, 9);
4358 let r = _mm256_blend_epi32(a, b, 0x82 as i32);
4359 assert_eq_m256i(r, e);
4360
4361 let e = _mm256_setr_epi32(3, 3, 9, 9, 9, 9, 9, 3);
4362 let r = _mm256_blend_epi32(a, b, 0x7C as i32);
4363 assert_eq_m256i(r, e);
4364 }
4365
4366 #[simd_test(enable = "avx2")]
4367 unsafe fn test_mm256_blend_epi16() {
4368 let (a, b) = (_mm256_set1_epi16(3), _mm256_set1_epi16(9));
4369 let e = _mm256_setr_epi16(9, 3, 3, 3, 3, 3, 3, 3, 9, 3, 3, 3, 3, 3, 3, 3);
4370 let r = _mm256_blend_epi16(a, b, 0x01 as i32);
4371 assert_eq_m256i(r, e);
4372
4373 let r = _mm256_blend_epi16(b, a, 0xFE as i32);
4374 assert_eq_m256i(r, e);
4375 }
4376
4377 #[simd_test(enable = "avx2")]
4378 unsafe fn test_mm256_blendv_epi8() {
4379 let (a, b) = (_mm256_set1_epi8(4), _mm256_set1_epi8(2));
4380 let mask = _mm256_insert_epi8(_mm256_set1_epi8(0), -1, 2);
4381 let e = _mm256_insert_epi8(_mm256_set1_epi8(4), 2, 2);
4382 let r = _mm256_blendv_epi8(a, b, mask);
4383 assert_eq_m256i(r, e);
4384 }
4385
4386 #[simd_test(enable = "avx2")]
4387 unsafe fn test_mm_broadcastb_epi8() {
4388 let a = _mm_insert_epi8(_mm_set1_epi8(0x00), 0x2a, 0);
4389 let res = _mm_broadcastb_epi8(a);
4390 assert_eq_m128i(res, _mm_set1_epi8(0x2a));
4391 }
4392
4393 #[simd_test(enable = "avx2")]
4394 unsafe fn test_mm256_broadcastb_epi8() {
4395 let a = _mm_insert_epi8(_mm_set1_epi8(0x00), 0x2a, 0);
4396 let res = _mm256_broadcastb_epi8(a);
4397 assert_eq_m256i(res, _mm256_set1_epi8(0x2a));
4398 }
4399
4400 #[simd_test(enable = "avx2")]
4401 unsafe fn test_mm_broadcastd_epi32() {
4402 let a = _mm_setr_epi32(0x2a, 0x8000000, 0, 0);
4403 let res = _mm_broadcastd_epi32(a);
4404 assert_eq_m128i(res, _mm_set1_epi32(0x2a));
4405 }
4406
4407 #[simd_test(enable = "avx2")]
4408 unsafe fn test_mm256_broadcastd_epi32() {
4409 let a = _mm_setr_epi32(0x2a, 0x8000000, 0, 0);
4410 let res = _mm256_broadcastd_epi32(a);
4411 assert_eq_m256i(res, _mm256_set1_epi32(0x2a));
4412 }
4413
4414 #[simd_test(enable = "avx2")]
4415 unsafe fn test_mm_broadcastq_epi64() {
4416 let a = _mm_setr_epi64x(0x1ffffffff, 0);
4417 let res = _mm_broadcastq_epi64(a);
4418 assert_eq_m128i(res, _mm_set1_epi64x(0x1ffffffff));
4419 }
4420
4421 #[simd_test(enable = "avx2")]
4422 unsafe fn test_mm256_broadcastq_epi64() {
4423 let a = _mm_setr_epi64x(0x1ffffffff, 0);
4424 let res = _mm256_broadcastq_epi64(a);
4425 assert_eq_m256i(res, _mm256_set1_epi64x(0x1ffffffff));
4426 }
4427
4428 #[simd_test(enable = "avx2")]
4429 unsafe fn test_mm_broadcastsd_pd() {
4430 let a = _mm_setr_pd(6.28, 3.14);
4431 let res = _mm_broadcastsd_pd(a);
4432 assert_eq_m128d(res, _mm_set1_pd(6.28f64));
4433 }
4434
4435 #[simd_test(enable = "avx2")]
4436 unsafe fn test_mm256_broadcastsd_pd() {
4437 let a = _mm_setr_pd(6.28, 3.14);
4438 let res = _mm256_broadcastsd_pd(a);
4439 assert_eq_m256d(res, _mm256_set1_pd(6.28f64));
4440 }
4441
4442 #[simd_test(enable = "avx2")]
4443 unsafe fn test_mm256_broadcastsi128_si256() {
4444 let a = _mm_setr_epi64x(0x0987654321012334, 0x5678909876543210);
4445 let res = _mm256_broadcastsi128_si256(a);
4446 let retval = _mm256_setr_epi64x(
4447 0x0987654321012334,
4448 0x5678909876543210,
4449 0x0987654321012334,
4450 0x5678909876543210,
4451 );
4452 assert_eq_m256i(res, retval);
4453 }
4454
4455 #[simd_test(enable = "avx2")]
4456 unsafe fn test_mm_broadcastss_ps() {
4457 let a = _mm_setr_ps(6.28, 3.14, 0.0, 0.0);
4458 let res = _mm_broadcastss_ps(a);
4459 assert_eq_m128(res, _mm_set1_ps(6.28f32));
4460 }
4461
4462 #[simd_test(enable = "avx2")]
4463 unsafe fn test_mm256_broadcastss_ps() {
4464 let a = _mm_setr_ps(6.28, 3.14, 0.0, 0.0);
4465 let res = _mm256_broadcastss_ps(a);
4466 assert_eq_m256(res, _mm256_set1_ps(6.28f32));
4467 }
4468
4469 #[simd_test(enable = "avx2")]
4470 unsafe fn test_mm_broadcastw_epi16() {
4471 let a = _mm_insert_epi16(_mm_set1_epi16(0x2a), 0x22b, 0);
4472 let res = _mm_broadcastw_epi16(a);
4473 assert_eq_m128i(res, _mm_set1_epi16(0x22b));
4474 }
4475
4476 #[simd_test(enable = "avx2")]
4477 unsafe fn test_mm256_broadcastw_epi16() {
4478 let a = _mm_insert_epi16(_mm_set1_epi16(0x2a), 0x22b, 0);
4479 let res = _mm256_broadcastw_epi16(a);
4480 assert_eq_m256i(res, _mm256_set1_epi16(0x22b));
4481 }
4482
4483 #[simd_test(enable = "avx2")]
4484 unsafe fn test_mm256_cmpeq_epi8() {
4485 #[rustfmt::skip]
4486 let a = _mm256_setr_epi8(
4487 0, 1, 2, 3, 4, 5, 6, 7,
4488 8, 9, 10, 11, 12, 13, 14, 15,
4489 16, 17, 18, 19, 20, 21, 22, 23,
4490 24, 25, 26, 27, 28, 29, 30, 31,
4491 );
4492 #[rustfmt::skip]
4493 let b = _mm256_setr_epi8(
4494 31, 30, 2, 28, 27, 26, 25, 24,
4495 23, 22, 21, 20, 19, 18, 17, 16,
4496 15, 14, 13, 12, 11, 10, 9, 8,
4497 7, 6, 5, 4, 3, 2, 1, 0,
4498 );
4499 let r = _mm256_cmpeq_epi8(a, b);
4500 assert_eq_m256i(r, _mm256_insert_epi8(_mm256_set1_epi8(0), !0, 2));
4501 }
4502
4503 #[simd_test(enable = "avx2")]
4504 unsafe fn test_mm256_cmpeq_epi16() {
4505 #[rustfmt::skip]
4506 let a = _mm256_setr_epi16(
4507 0, 1, 2, 3, 4, 5, 6, 7,
4508 8, 9, 10, 11, 12, 13, 14, 15,
4509 );
4510 #[rustfmt::skip]
4511 let b = _mm256_setr_epi16(
4512 15, 14, 2, 12, 11, 10, 9, 8,
4513 7, 6, 5, 4, 3, 2, 1, 0,
4514 );
4515 let r = _mm256_cmpeq_epi16(a, b);
4516 assert_eq_m256i(r, _mm256_insert_epi16(_mm256_set1_epi16(0), !0, 2));
4517 }
4518
4519 #[simd_test(enable = "avx2")]
4520 unsafe fn test_mm256_cmpeq_epi32() {
4521 let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
4522 let b = _mm256_setr_epi32(7, 6, 2, 4, 3, 2, 1, 0);
4523 let r = _mm256_cmpeq_epi32(a, b);
4524 let e = _mm256_set1_epi32(0);
4525 let e = _mm256_insert_epi32(e, !0, 2);
4526 assert_eq_m256i(r, e);
4527 }
4528
4529 #[simd_test(enable = "avx2")]
4530 unsafe fn test_mm256_cmpeq_epi64() {
4531 let a = _mm256_setr_epi64x(0, 1, 2, 3);
4532 let b = _mm256_setr_epi64x(3, 2, 2, 0);
4533 let r = _mm256_cmpeq_epi64(a, b);
4534 assert_eq_m256i(r, _mm256_insert_epi64(_mm256_set1_epi64x(0), !0, 2));
4535 }
4536
4537 #[simd_test(enable = "avx2")]
4538 unsafe fn test_mm256_cmpgt_epi8() {
4539 let a = _mm256_insert_epi8(_mm256_set1_epi8(0), 5, 0);
4540 let b = _mm256_set1_epi8(0);
4541 let r = _mm256_cmpgt_epi8(a, b);
4542 assert_eq_m256i(r, _mm256_insert_epi8(_mm256_set1_epi8(0), !0, 0));
4543 }
4544
4545 #[simd_test(enable = "avx2")]
4546 unsafe fn test_mm256_cmpgt_epi16() {
4547 let a = _mm256_insert_epi16(_mm256_set1_epi16(0), 5, 0);
4548 let b = _mm256_set1_epi16(0);
4549 let r = _mm256_cmpgt_epi16(a, b);
4550 assert_eq_m256i(r, _mm256_insert_epi16(_mm256_set1_epi16(0), !0, 0));
4551 }
4552
4553 #[simd_test(enable = "avx2")]
4554 unsafe fn test_mm256_cmpgt_epi32() {
4555 let a = _mm256_insert_epi32(_mm256_set1_epi32(0), 5, 0);
4556 let b = _mm256_set1_epi32(0);
4557 let r = _mm256_cmpgt_epi32(a, b);
4558 assert_eq_m256i(r, _mm256_insert_epi32(_mm256_set1_epi32(0), !0, 0));
4559 }
4560
4561 #[simd_test(enable = "avx2")]
4562 unsafe fn test_mm256_cmpgt_epi64() {
4563 let a = _mm256_insert_epi64(_mm256_set1_epi64x(0), 5, 0);
4564 let b = _mm256_set1_epi64x(0);
4565 let r = _mm256_cmpgt_epi64(a, b);
4566 assert_eq_m256i(r, _mm256_insert_epi64(_mm256_set1_epi64x(0), !0, 0));
4567 }
4568
4569 #[simd_test(enable = "avx2")]
4570 unsafe fn test_mm256_cvtepi8_epi16() {
4571 #[rustfmt::skip]
4572 let a = _mm_setr_epi8(
4573 0, 0, -1, 1, -2, 2, -3, 3,
4574 -4, 4, -5, 5, -6, 6, -7, 7,
4575 );
4576 #[rustfmt::skip]
4577 let r = _mm256_setr_epi16(
4578 0, 0, -1, 1, -2, 2, -3, 3,
4579 -4, 4, -5, 5, -6, 6, -7, 7,
4580 );
4581 assert_eq_m256i(r, _mm256_cvtepi8_epi16(a));
4582 }
4583
4584 #[simd_test(enable = "avx2")]
4585 unsafe fn test_mm256_cvtepi8_epi32() {
4586 #[rustfmt::skip]
4587 let a = _mm_setr_epi8(
4588 0, 0, -1, 1, -2, 2, -3, 3,
4589 -4, 4, -5, 5, -6, 6, -7, 7,
4590 );
4591 let r = _mm256_setr_epi32(0, 0, -1, 1, -2, 2, -3, 3);
4592 assert_eq_m256i(r, _mm256_cvtepi8_epi32(a));
4593 }
4594
4595 #[simd_test(enable = "avx2")]
4596 unsafe fn test_mm256_cvtepi8_epi64() {
4597 #[rustfmt::skip]
4598 let a = _mm_setr_epi8(
4599 0, 0, -1, 1, -2, 2, -3, 3,
4600 -4, 4, -5, 5, -6, 6, -7, 7,
4601 );
4602 let r = _mm256_setr_epi64x(0, 0, -1, 1);
4603 assert_eq_m256i(r, _mm256_cvtepi8_epi64(a));
4604 }
4605
4606 #[simd_test(enable = "avx2")]
4607 unsafe fn test_mm256_cvtepi16_epi32() {
4608 let a = _mm_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3);
4609 let r = _mm256_setr_epi32(0, 0, -1, 1, -2, 2, -3, 3);
4610 assert_eq_m256i(r, _mm256_cvtepi16_epi32(a));
4611 }
4612
4613 #[simd_test(enable = "avx2")]
4614 unsafe fn test_mm256_cvtepi16_epi64() {
4615 let a = _mm_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3);
4616 let r = _mm256_setr_epi64x(0, 0, -1, 1);
4617 assert_eq_m256i(r, _mm256_cvtepi16_epi64(a));
4618 }
4619
4620 #[simd_test(enable = "avx2")]
4621 unsafe fn test_mm256_cvtepi32_epi64() {
4622 let a = _mm_setr_epi32(0, 0, -1, 1);
4623 let r = _mm256_setr_epi64x(0, 0, -1, 1);
4624 assert_eq_m256i(r, _mm256_cvtepi32_epi64(a));
4625 }
4626
4627 #[simd_test(enable = "avx2")]
4628 unsafe fn test_mm256_cvtepu16_epi32() {
4629 let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4630 let r = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
4631 assert_eq_m256i(r, _mm256_cvtepu16_epi32(a));
4632 }
4633
4634 #[simd_test(enable = "avx2")]
4635 unsafe fn test_mm256_cvtepu16_epi64() {
4636 let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4637 let r = _mm256_setr_epi64x(0, 1, 2, 3);
4638 assert_eq_m256i(r, _mm256_cvtepu16_epi64(a));
4639 }
4640
4641 #[simd_test(enable = "avx2")]
4642 unsafe fn test_mm256_cvtepu32_epi64() {
4643 let a = _mm_setr_epi32(0, 1, 2, 3);
4644 let r = _mm256_setr_epi64x(0, 1, 2, 3);
4645 assert_eq_m256i(r, _mm256_cvtepu32_epi64(a));
4646 }
4647
4648 #[simd_test(enable = "avx2")]
4649 unsafe fn test_mm256_cvtepu8_epi16() {
4650 #[rustfmt::skip]
4651 let a = _mm_setr_epi8(
4652 0, 1, 2, 3, 4, 5, 6, 7,
4653 8, 9, 10, 11, 12, 13, 14, 15,
4654 );
4655 #[rustfmt::skip]
4656 let r = _mm256_setr_epi16(
4657 0, 1, 2, 3, 4, 5, 6, 7,
4658 8, 9, 10, 11, 12, 13, 14, 15,
4659 );
4660 assert_eq_m256i(r, _mm256_cvtepu8_epi16(a));
4661 }
4662
4663 #[simd_test(enable = "avx2")]
4664 unsafe fn test_mm256_cvtepu8_epi32() {
4665 #[rustfmt::skip]
4666 let a = _mm_setr_epi8(
4667 0, 1, 2, 3, 4, 5, 6, 7,
4668 8, 9, 10, 11, 12, 13, 14, 15,
4669 );
4670 let r = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
4671 assert_eq_m256i(r, _mm256_cvtepu8_epi32(a));
4672 }
4673
4674 #[simd_test(enable = "avx2")]
4675 unsafe fn test_mm256_cvtepu8_epi64() {
4676 #[rustfmt::skip]
4677 let a = _mm_setr_epi8(
4678 0, 1, 2, 3, 4, 5, 6, 7,
4679 8, 9, 10, 11, 12, 13, 14, 15,
4680 );
4681 let r = _mm256_setr_epi64x(0, 1, 2, 3);
4682 assert_eq_m256i(r, _mm256_cvtepu8_epi64(a));
4683 }
4684
4685 #[simd_test(enable = "avx2")]
4686 unsafe fn test_mm256_extracti128_si256() {
4687 let a = _mm256_setr_epi64x(1, 2, 3, 4);
4688 let r = _mm256_extracti128_si256(a, 0b01);
4689 let e = _mm_setr_epi64x(3, 4);
4690 assert_eq_m128i(r, e);
4691 }
4692
4693 #[simd_test(enable = "avx2")]
4694 unsafe fn test_mm256_hadd_epi16() {
4695 let a = _mm256_set1_epi16(2);
4696 let b = _mm256_set1_epi16(4);
4697 let r = _mm256_hadd_epi16(a, b);
4698 let e = _mm256_setr_epi16(4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8);
4699 assert_eq_m256i(r, e);
4700 }
4701
4702 #[simd_test(enable = "avx2")]
4703 unsafe fn test_mm256_hadd_epi32() {
4704 let a = _mm256_set1_epi32(2);
4705 let b = _mm256_set1_epi32(4);
4706 let r = _mm256_hadd_epi32(a, b);
4707 let e = _mm256_setr_epi32(4, 4, 8, 8, 4, 4, 8, 8);
4708 assert_eq_m256i(r, e);
4709 }
4710
4711 #[simd_test(enable = "avx2")]
4712 unsafe fn test_mm256_hadds_epi16() {
4713 let a = _mm256_set1_epi16(2);
4714 let a = _mm256_insert_epi16(a, 0x7fff, 0);
4715 let a = _mm256_insert_epi16(a, 1, 1);
4716 let b = _mm256_set1_epi16(4);
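// Horizontal add with signed saturation: 0x7fff + 1 saturates to 0x7FFF
// in the first result lane.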
4717 let r = _mm256_hadds_epi16(a, b);
4718 #[rustfmt::skip]
4719 let e = _mm256_setr_epi16(
4720 0x7FFF, 4, 4, 4, 8, 8, 8, 8,
4721 4, 4, 4, 4, 8, 8, 8, 8,
4722 );
4723 assert_eq_m256i(r, e);
4724 }
4725
4726 #[simd_test(enable = "avx2")]
4727 unsafe fn test_mm256_hsub_epi16() {
4728 let a = _mm256_set1_epi16(2);
4729 let b = _mm256_set1_epi16(4);
4730 let r = _mm256_hsub_epi16(a, b);
4731 let e = _mm256_set1_epi16(0);
4732 assert_eq_m256i(r, e);
4733 }
4734
4735 #[simd_test(enable = "avx2")]
4736 unsafe fn test_mm256_hsub_epi32() {
4737 let a = _mm256_set1_epi32(2);
4738 let b = _mm256_set1_epi32(4);
4739 let r = _mm256_hsub_epi32(a, b);
4740 let e = _mm256_set1_epi32(0);
4741 assert_eq_m256i(r, e);
4742 }
4743
4744 #[simd_test(enable = "avx2")]
4745 unsafe fn test_mm256_hsubs_epi16() {
4746 let a = _mm256_set1_epi16(2);
4747 let a = _mm256_insert_epi16(a, 0x7fff, 0);
4748 let a = _mm256_insert_epi16(a, -1, 1);
4749 let b = _mm256_set1_epi16(4);
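// Horizontal subtract with signed saturation: 0x7fff - (-1) saturates to 0x7FFF.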
4750 let r = _mm256_hsubs_epi16(a, b);
4751 let e = _mm256_insert_epi16(_mm256_set1_epi16(0), 0x7FFF, 0);
4752 assert_eq_m256i(r, e);
4753 }
4754
4755 #[simd_test(enable = "avx2")]
4756 unsafe fn test_mm256_madd_epi16() {
4757 let a = _mm256_set1_epi16(2);
4758 let b = _mm256_set1_epi16(4);
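// Adjacent signed 16-bit products are summed into each 32-bit lane: 2*4 + 2*4 = 16.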
4759 let r = _mm256_madd_epi16(a, b);
4760 let e = _mm256_set1_epi32(16);
4761 assert_eq_m256i(r, e);
4762 }
4763
4764 #[simd_test(enable = "avx2")]
4765 unsafe fn test_mm256_inserti128_si256() {
4766 let a = _mm256_setr_epi64x(1, 2, 3, 4);
4767 let b = _mm_setr_epi64x(7, 8);
4768 let r = _mm256_inserti128_si256(a, b, 0b01);
4769 let e = _mm256_setr_epi64x(1, 2, 7, 8);
4770 assert_eq_m256i(r, e);
4771 }
4772
4773 #[simd_test(enable = "avx2")]
4774 unsafe fn test_mm256_maddubs_epi16() {
4775 let a = _mm256_set1_epi8(2);
4776 let b = _mm256_set1_epi8(4);
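// Unsigned bytes of `a` times signed bytes of `b`, adjacent products summed: 2*4 + 2*4 = 16.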
4777 let r = _mm256_maddubs_epi16(a, b);
4778 let e = _mm256_set1_epi16(16);
4779 assert_eq_m256i(r, e);
4780 }
4781
4782 #[simd_test(enable = "avx2")]
4783 unsafe fn test_mm_maskload_epi32() {
4784 let nums = [1, 2, 3, 4];
4785 let a = &nums as *const i32;
4786 let mask = _mm_setr_epi32(-1, 0, 0, -1);
4787 let r = _mm_maskload_epi32(a, mask);
4788 let e = _mm_setr_epi32(1, 0, 0, 4);
4789 assert_eq_m128i(r, e);
4790 }
4791
4792 #[simd_test(enable = "avx2")]
4793 unsafe fn test_mm256_maskload_epi32() {
4794 let nums = [1, 2, 3, 4, 5, 6, 7, 8];
4795 let a = &nums as *const i32;
4796 let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0);
4797 let r = _mm256_maskload_epi32(a, mask);
4798 let e = _mm256_setr_epi32(1, 0, 0, 4, 0, 6, 7, 0);
4799 assert_eq_m256i(r, e);
4800 }
4801
4802 #[simd_test(enable = "avx2")]
4803 unsafe fn test_mm_maskload_epi64() {
4804 let nums = [1_i64, 2_i64];
4805 let a = &nums as *const i64;
4806 let mask = _mm_setr_epi64x(0, -1);
4807 let r = _mm_maskload_epi64(a, mask);
4808 let e = _mm_setr_epi64x(0, 2);
4809 assert_eq_m128i(r, e);
4810 }
4811
4812 #[simd_test(enable = "avx2")]
4813 unsafe fn test_mm256_maskload_epi64() {
4814 let nums = [1_i64, 2_i64, 3_i64, 4_i64];
4815 let a = &nums as *const i64;
4816 let mask = _mm256_setr_epi64x(0, -1, -1, 0);
4817 let r = _mm256_maskload_epi64(a, mask);
4818 let e = _mm256_setr_epi64x(0, 2, 3, 0);
4819 assert_eq_m256i(r, e);
4820 }
4821
4822 #[simd_test(enable = "avx2")]
4823 unsafe fn test_mm_maskstore_epi32() {
4824 let a = _mm_setr_epi32(1, 2, 3, 4);
4825 let mut arr = [-1, -1, -1, -1];
4826 let mask = _mm_setr_epi32(-1, 0, 0, -1);
4827 _mm_maskstore_epi32(arr.as_mut_ptr(), mask, a);
4828 let e = [1, -1, -1, 4];
4829 assert_eq!(arr, e);
4830 }
4831
4832 #[simd_test(enable = "avx2")]
4833 unsafe fn test_mm256_maskstore_epi32() {
4834 let a = _mm256_setr_epi32(1, 0x6d726f, 3, 42, 0x777161, 6, 7, 8);
4835 let mut arr = [-1, -1, -1, 0x776173, -1, 0x68657265, -1, -1];
4836 let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0);
4837 _mm256_maskstore_epi32(arr.as_mut_ptr(), mask, a);
4838 let e = [1, -1, -1, 42, -1, 6, 7, -1];
4839 assert_eq!(arr, e);
4840 }
4841
4842 #[simd_test(enable = "avx2")]
4843 unsafe fn test_mm_maskstore_epi64() {
4844 let a = _mm_setr_epi64x(1_i64, 2_i64);
4845 let mut arr = [-1_i64, -1_i64];
4846 let mask = _mm_setr_epi64x(0, -1);
4847 _mm_maskstore_epi64(arr.as_mut_ptr(), mask, a);
4848 let e = [-1, 2];
4849 assert_eq!(arr, e);
4850 }
4851
4852 #[simd_test(enable = "avx2")]
4853 unsafe fn test_mm256_maskstore_epi64() {
4854 let a = _mm256_setr_epi64x(1_i64, 2_i64, 3_i64, 4_i64);
4855 let mut arr = [-1_i64, -1_i64, -1_i64, -1_i64];
4856 let mask = _mm256_setr_epi64x(0, -1, -1, 0);
4857 _mm256_maskstore_epi64(arr.as_mut_ptr(), mask, a);
4858 let e = [-1, 2, 3, -1];
4859 assert_eq!(arr, e);
4860 }
4861
4862 #[simd_test(enable = "avx2")]
4863 unsafe fn test_mm256_max_epi16() {
4864 let a = _mm256_set1_epi16(2);
4865 let b = _mm256_set1_epi16(4);
4866 let r = _mm256_max_epi16(a, b);
4867 assert_eq_m256i(r, b);
4868 }
4869
4870 #[simd_test(enable = "avx2")]
4871 unsafe fn test_mm256_max_epi32() {
4872 let a = _mm256_set1_epi32(2);
4873 let b = _mm256_set1_epi32(4);
4874 let r = _mm256_max_epi32(a, b);
4875 assert_eq_m256i(r, b);
4876 }
4877
4878 #[simd_test(enable = "avx2")]
4879 unsafe fn test_mm256_max_epi8() {
4880 let a = _mm256_set1_epi8(2);
4881 let b = _mm256_set1_epi8(4);
4882 let r = _mm256_max_epi8(a, b);
4883 assert_eq_m256i(r, b);
4884 }
4885
4886 #[simd_test(enable = "avx2")]
4887 unsafe fn test_mm256_max_epu16() {
4888 let a = _mm256_set1_epi16(2);
4889 let b = _mm256_set1_epi16(4);
4890 let r = _mm256_max_epu16(a, b);
4891 assert_eq_m256i(r, b);
4892 }
4893
4894 #[simd_test(enable = "avx2")]
4895 unsafe fn test_mm256_max_epu32() {
4896 let a = _mm256_set1_epi32(2);
4897 let b = _mm256_set1_epi32(4);
4898 let r = _mm256_max_epu32(a, b);
4899 assert_eq_m256i(r, b);
4900 }
4901
4902 #[simd_test(enable = "avx2")]
4903 unsafe fn test_mm256_max_epu8() {
4904 let a = _mm256_set1_epi8(2);
4905 let b = _mm256_set1_epi8(4);
4906 let r = _mm256_max_epu8(a, b);
4907 assert_eq_m256i(r, b);
4908 }
4909
4910 #[simd_test(enable = "avx2")]
4911 unsafe fn test_mm256_min_epi16() {
4912 let a = _mm256_set1_epi16(2);
4913 let b = _mm256_set1_epi16(4);
4914 let r = _mm256_min_epi16(a, b);
4915 assert_eq_m256i(r, a);
4916 }
4917
4918 #[simd_test(enable = "avx2")]
4919 unsafe fn test_mm256_min_epi32() {
4920 let a = _mm256_set1_epi32(2);
4921 let b = _mm256_set1_epi32(4);
4922 let r = _mm256_min_epi32(a, b);
4923 assert_eq_m256i(r, a);
4924 }
4925
4926 #[simd_test(enable = "avx2")]
4927 unsafe fn test_mm256_min_epi8() {
4928 let a = _mm256_set1_epi8(2);
4929 let b = _mm256_set1_epi8(4);
4930 let r = _mm256_min_epi8(a, b);
4931 assert_eq_m256i(r, a);
4932 }
4933
4934 #[simd_test(enable = "avx2")]
4935 unsafe fn test_mm256_min_epu16() {
4936 let a = _mm256_set1_epi16(2);
4937 let b = _mm256_set1_epi16(4);
4938 let r = _mm256_min_epu16(a, b);
4939 assert_eq_m256i(r, a);
4940 }
4941
4942 #[simd_test(enable = "avx2")]
4943 unsafe fn test_mm256_min_epu32() {
4944 let a = _mm256_set1_epi32(2);
4945 let b = _mm256_set1_epi32(4);
4946 let r = _mm256_min_epu32(a, b);
4947 assert_eq_m256i(r, a);
4948 }
4949
4950 #[simd_test(enable = "avx2")]
4951 unsafe fn test_mm256_min_epu8() {
4952 let a = _mm256_set1_epi8(2);
4953 let b = _mm256_set1_epi8(4);
4954 let r = _mm256_min_epu8(a, b);
4955 assert_eq_m256i(r, a);
4956 }
4957
4958 #[simd_test(enable = "avx2")]
4959 unsafe fn test_mm256_movemask_epi8() {
4960 let a = _mm256_set1_epi8(-1);
4961 let r = _mm256_movemask_epi8(a);
4962 let e = -1;
4963 assert_eq!(r, e);
4964 }
4965
4966 #[simd_test(enable = "avx2")]
4967 unsafe fn test_mm256_mpsadbw_epu8() {
4968 let a = _mm256_set1_epi8(2);
4969 let b = _mm256_set1_epi8(4);
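// Every absolute byte difference is |2 - 4| = 2, so each 4-byte sum is 8.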
4970 let r = _mm256_mpsadbw_epu8(a, b, 0);
4971 let e = _mm256_set1_epi16(8);
4972 assert_eq_m256i(r, e);
4973 }
4974
4975 #[simd_test(enable = "avx2")]
4976 unsafe fn test_mm256_mul_epi32() {
4977 let a = _mm256_setr_epi32(0, 0, 0, 0, 2, 2, 2, 2);
4978 let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4979 let r = _mm256_mul_epi32(a, b);
4980 let e = _mm256_setr_epi64x(0, 0, 10, 14);
4981 assert_eq_m256i(r, e);
4982 }
4983
4984 #[simd_test(enable = "avx2")]
4985 unsafe fn test_mm256_mul_epu32() {
4986 let a = _mm256_setr_epi32(0, 0, 0, 0, 2, 2, 2, 2);
4987 let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4988 let r = _mm256_mul_epu32(a, b);
4989 let e = _mm256_setr_epi64x(0, 0, 10, 14);
4990 assert_eq_m256i(r, e);
4991 }
4992
4993 #[simd_test(enable = "avx2")]
4994 unsafe fn test_mm256_mulhi_epi16() {
4995 let a = _mm256_set1_epi16(6535);
4996 let b = _mm256_set1_epi16(6535);
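// 6535 * 6535 = 42_706_225; its high 16 bits are 651.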
4997 let r = _mm256_mulhi_epi16(a, b);
4998 let e = _mm256_set1_epi16(651);
4999 assert_eq_m256i(r, e);
5000 }
5001
5002 #[simd_test(enable = "avx2")]
5003 unsafe fn test_mm256_mulhi_epu16() {
5004 let a = _mm256_set1_epi16(6535);
5005 let b = _mm256_set1_epi16(6535);
5006 let r = _mm256_mulhi_epu16(a, b);
5007 let e = _mm256_set1_epi16(651);
5008 assert_eq_m256i(r, e);
5009 }
5010
5011 #[simd_test(enable = "avx2")]
5012 unsafe fn test_mm256_mullo_epi16() {
5013 let a = _mm256_set1_epi16(2);
5014 let b = _mm256_set1_epi16(4);
5015 let r = _mm256_mullo_epi16(a, b);
5016 let e = _mm256_set1_epi16(8);
5017 assert_eq_m256i(r, e);
5018 }
5019
5020 #[simd_test(enable = "avx2")]
5021 unsafe fn test_mm256_mullo_epi32() {
5022 let a = _mm256_set1_epi32(2);
5023 let b = _mm256_set1_epi32(4);
5024 let r = _mm256_mullo_epi32(a, b);
5025 let e = _mm256_set1_epi32(8);
5026 assert_eq_m256i(r, e);
5027 }
5028
5029 #[simd_test(enable = "avx2")]
5030 unsafe fn test_mm256_mulhrs_epi16() {
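// _mm256_mulhrs_epi16 computes ((a * b) >> 14) + 1, then shifts right by 1:
// a rounded Q15 fixed-point multiply, so 0.5 * 0.5 yields 0.25 (0x2000).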
5031 let a = _mm256_set1_epi16(0x4000);
5032 let b = _mm256_set1_epi16(0x4000);
5033 let r = _mm256_mulhrs_epi16(a, b);
5034 let e = _mm256_set1_epi16(0x2000);
5035 assert_eq_m256i(r, e);
5036 }
5037
5038 #[simd_test(enable = "avx2")]
5039 unsafe fn test_mm256_or_si256() {
5040 let a = _mm256_set1_epi8(-1);
5041 let b = _mm256_set1_epi8(0);
5042 let r = _mm256_or_si256(a, b);
5043 assert_eq_m256i(r, a);
5044 }
5045
5046 #[simd_test(enable = "avx2")]
5047 unsafe fn test_mm256_packs_epi16() {
5048 let a = _mm256_set1_epi16(2);
5049 let b = _mm256_set1_epi16(4);
5050 let r = _mm256_packs_epi16(a, b);
5051 #[rustfmt::skip]
5052 let e = _mm256_setr_epi8(
5053 2, 2, 2, 2, 2, 2, 2, 2,
5054 4, 4, 4, 4, 4, 4, 4, 4,
5055 2, 2, 2, 2, 2, 2, 2, 2,
5056 4, 4, 4, 4, 4, 4, 4, 4,
5057 );
5058
5059 assert_eq_m256i(r, e);
5060 }
5061
5062 #[simd_test(enable = "avx2")]
5063 unsafe fn test_mm256_packs_epi32() {
5064 let a = _mm256_set1_epi32(2);
5065 let b = _mm256_set1_epi32(4);
5066 let r = _mm256_packs_epi32(a, b);
5067 let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
5068
5069 assert_eq_m256i(r, e);
5070 }
5071
5072 #[simd_test(enable = "avx2")]
5073 unsafe fn test_mm256_packus_epi16() {
5074 let a = _mm256_set1_epi16(2);
5075 let b = _mm256_set1_epi16(4);
5076 let r = _mm256_packus_epi16(a, b);
5077 #[rustfmt::skip]
5078 let e = _mm256_setr_epi8(
5079 2, 2, 2, 2, 2, 2, 2, 2,
5080 4, 4, 4, 4, 4, 4, 4, 4,
5081 2, 2, 2, 2, 2, 2, 2, 2,
5082 4, 4, 4, 4, 4, 4, 4, 4,
5083 );
5084
5085 assert_eq_m256i(r, e);
5086 }
5087
5088 #[simd_test(enable = "avx2")]
5089 unsafe fn test_mm256_packus_epi32() {
5090 let a = _mm256_set1_epi32(2);
5091 let b = _mm256_set1_epi32(4);
5092 let r = _mm256_packus_epi32(a, b);
5093 let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
5094
5095 assert_eq_m256i(r, e);
5096 }
5097
5098 #[simd_test(enable = "avx2")]
5099 unsafe fn test_mm256_sad_epu8() {
5100 let a = _mm256_set1_epi8(2);
5101 let b = _mm256_set1_epi8(4);
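// |2 - 4| = 2 summed over each group of 8 bytes gives 16 per 64-bit result.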
5102 let r = _mm256_sad_epu8(a, b);
5103 let e = _mm256_set1_epi64x(16);
5104 assert_eq_m256i(r, e);
5105 }
5106
5107 #[simd_test(enable = "avx2")]
5108 unsafe fn test_mm256_shufflehi_epi16() {
5109 #[rustfmt::skip]
5110 let a = _mm256_setr_epi16(
5111 0, 1, 2, 3, 11, 22, 33, 44,
5112 4, 5, 6, 7, 55, 66, 77, 88,
5113 );
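// The 2-bit fields of 0b00_01_01_11 pick words (3, 1, 1, 0) from the upper four
// 16-bit words of each 128-bit lane; _mm256_shufflelo_epi16 below does the same
// for the lower four words.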
5114 #[rustfmt::skip]
5115 let e = _mm256_setr_epi16(
5116 0, 1, 2, 3, 44, 22, 22, 11,
5117 4, 5, 6, 7, 88, 66, 66, 55,
5118 );
5119 let r = _mm256_shufflehi_epi16(a, 0b00_01_01_11);
5120 assert_eq_m256i(r, e);
5121 }
5122
5123 #[simd_test(enable = "avx2")]
5124 unsafe fn test_mm256_shufflelo_epi16() {
5125 #[rustfmt::skip]
5126 let a = _mm256_setr_epi16(
5127 11, 22, 33, 44, 0, 1, 2, 3,
5128 55, 66, 77, 88, 4, 5, 6, 7,
5129 );
5130 #[rustfmt::skip]
5131 let e = _mm256_setr_epi16(
5132 44, 22, 22, 11, 0, 1, 2, 3,
5133 88, 66, 66, 55, 4, 5, 6, 7,
5134 );
5135 let r = _mm256_shufflelo_epi16(a, 0b00_01_01_11);
5136 assert_eq_m256i(r, e);
5137 }
5138
5139 #[simd_test(enable = "avx2")]
5140 unsafe fn test_mm256_sign_epi16() {
5141 let a = _mm256_set1_epi16(2);
5142 let b = _mm256_set1_epi16(-1);
5143 let r = _mm256_sign_epi16(a, b);
5144 let e = _mm256_set1_epi16(-2);
5145 assert_eq_m256i(r, e);
5146 }
5147
5148 #[simd_test(enable = "avx2")]
5149 unsafe fn test_mm256_sign_epi32() {
5150 let a = _mm256_set1_epi32(2);
5151 let b = _mm256_set1_epi32(-1);
5152 let r = _mm256_sign_epi32(a, b);
5153 let e = _mm256_set1_epi32(-2);
5154 assert_eq_m256i(r, e);
5155 }
5156
5157 #[simd_test(enable = "avx2")]
5158 unsafe fn test_mm256_sign_epi8() {
5159 let a = _mm256_set1_epi8(2);
5160 let b = _mm256_set1_epi8(-1);
5161 let r = _mm256_sign_epi8(a, b);
5162 let e = _mm256_set1_epi8(-2);
5163 assert_eq_m256i(r, e);
5164 }
5165
5166 #[simd_test(enable = "avx2")]
5167 unsafe fn test_mm256_sll_epi16() {
5168 let a = _mm256_set1_epi16(0xFF);
5169 let b = _mm_insert_epi16(_mm_set1_epi16(0), 4, 0);
5170 let r = _mm256_sll_epi16(a, b);
5171 assert_eq_m256i(r, _mm256_set1_epi16(0xFF0));
5172 }
5173
5174 #[simd_test(enable = "avx2")]
5175 unsafe fn test_mm256_sll_epi32() {
5176 let a = _mm256_set1_epi32(0xFFFF);
5177 let b = _mm_insert_epi32(_mm_set1_epi32(0), 4, 0);
5178 let r = _mm256_sll_epi32(a, b);
5179 assert_eq_m256i(r, _mm256_set1_epi32(0xFFFF0));
5180 }
5181
5182 #[simd_test(enable = "avx2")]
5183 unsafe fn test_mm256_sll_epi64() {
5184 let a = _mm256_set1_epi64x(0xFFFFFFFF);
5185 let b = _mm_insert_epi64(_mm_set1_epi64x(0), 4, 0);
5186 let r = _mm256_sll_epi64(a, b);
5187 assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFFF0));
5188 }
5189
5190 #[simd_test(enable = "avx2")]
5191 unsafe fn test_mm256_slli_epi16() {
5192 assert_eq_m256i(
5193 _mm256_slli_epi16(_mm256_set1_epi16(0xFF), 4),
5194 _mm256_set1_epi16(0xFF0),
5195 );
5196 }
5197
5198 #[simd_test(enable = "avx2")]
5199 unsafe fn test_mm256_slli_epi32() {
5200 assert_eq_m256i(
5201 _mm256_slli_epi32(_mm256_set1_epi32(0xFFFF), 4),
5202 _mm256_set1_epi32(0xFFFF0),
5203 );
5204 }
5205
5206 #[simd_test(enable = "avx2")]
5207 unsafe fn test_mm256_slli_epi64() {
5208 assert_eq_m256i(
5209 _mm256_slli_epi64(_mm256_set1_epi64x(0xFFFFFFFF), 4),
5210 _mm256_set1_epi64x(0xFFFFFFFF0),
5211 );
5212 }
5213
5214 #[simd_test(enable = "avx2")]
5215 unsafe fn test_mm256_slli_si256() {
5216 let a = _mm256_set1_epi64x(0xFFFFFFFF);
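// The byte shift is applied to each 128-bit lane independently.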
5217 let r = _mm256_slli_si256(a, 3);
5218 assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFFF000000));
5219 }
5220
5221 #[simd_test(enable = "avx2")]
5222 unsafe fn test_mm_sllv_epi32() {
5223 let a = _mm_set1_epi32(2);
5224 let b = _mm_set1_epi32(1);
5225 let r = _mm_sllv_epi32(a, b);
5226 let e = _mm_set1_epi32(4);
5227 assert_eq_m128i(r, e);
5228 }
5229
5230 #[simd_test(enable = "avx2")]
5231 unsafe fn test_mm256_sllv_epi32() {
5232 let a = _mm256_set1_epi32(2);
5233 let b = _mm256_set1_epi32(1);
5234 let r = _mm256_sllv_epi32(a, b);
5235 let e = _mm256_set1_epi32(4);
5236 assert_eq_m256i(r, e);
5237 }
5238
5239 #[simd_test(enable = "avx2")]
5240 unsafe fn test_mm_sllv_epi64() {
5241 let a = _mm_set1_epi64x(2);
5242 let b = _mm_set1_epi64x(1);
5243 let r = _mm_sllv_epi64(a, b);
5244 let e = _mm_set1_epi64x(4);
5245 assert_eq_m128i(r, e);
5246 }
5247
5248 #[simd_test(enable = "avx2")]
5249 unsafe fn test_mm256_sllv_epi64() {
5250 let a = _mm256_set1_epi64x(2);
5251 let b = _mm256_set1_epi64x(1);
5252 let r = _mm256_sllv_epi64(a, b);
5253 let e = _mm256_set1_epi64x(4);
5254 assert_eq_m256i(r, e);
5255 }
5256
5257 #[simd_test(enable = "avx2")]
5258 unsafe fn test_mm256_sra_epi16() {
5259 let a = _mm256_set1_epi16(-1);
5260 let b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
5261 let r = _mm256_sra_epi16(a, b);
5262 assert_eq_m256i(r, _mm256_set1_epi16(-1));
5263 }
5264
5265 #[simd_test(enable = "avx2")]
5266 unsafe fn test_mm256_sra_epi32() {
5267 let a = _mm256_set1_epi32(-1);
5268 let b = _mm_insert_epi32(_mm_set1_epi32(0), 1, 0);
5269 let r = _mm256_sra_epi32(a, b);
5270 assert_eq_m256i(r, _mm256_set1_epi32(-1));
5271 }
5272
5273 #[simd_test(enable = "avx2")]
5274 unsafe fn test_mm256_srai_epi16() {
5275 assert_eq_m256i(
5276 _mm256_srai_epi16(_mm256_set1_epi16(-1), 1),
5277 _mm256_set1_epi16(-1),
5278 );
5279 }
5280
5281 #[simd_test(enable = "avx2")]
5282 unsafe fn test_mm256_srai_epi32() {
5283 assert_eq_m256i(
5284 _mm256_srai_epi32(_mm256_set1_epi32(-1), 1),
5285 _mm256_set1_epi32(-1),
5286 );
5287 }
5288
5289 #[simd_test(enable = "avx2")]
5290 unsafe fn test_mm_srav_epi32() {
5291 let a = _mm_set1_epi32(4);
5292 let count = _mm_set1_epi32(1);
5293 let r = _mm_srav_epi32(a, count);
5294 let e = _mm_set1_epi32(2);
5295 assert_eq_m128i(r, e);
5296 }
5297
5298 #[simd_test(enable = "avx2")]
5299 unsafe fn test_mm256_srav_epi32() {
5300 let a = _mm256_set1_epi32(4);
5301 let count = _mm256_set1_epi32(1);
5302 let r = _mm256_srav_epi32(a, count);
5303 let e = _mm256_set1_epi32(2);
5304 assert_eq_m256i(r, e);
5305 }
5306
5307 #[simd_test(enable = "avx2")]
5308 unsafe fn test_mm256_srli_si256() {
5309 #[rustfmt::skip]
5310 let a = _mm256_setr_epi8(
5311 1, 2, 3, 4, 5, 6, 7, 8,
5312 9, 10, 11, 12, 13, 14, 15, 16,
5313 17, 18, 19, 20, 21, 22, 23, 24,
5314 25, 26, 27, 28, 29, 30, 31, 32,
5315 );
5316 let r = _mm256_srli_si256(a, 3);
5317 #[rustfmt::skip]
5318 let e = _mm256_setr_epi8(
5319 4, 5, 6, 7, 8, 9, 10, 11,
5320 12, 13, 14, 15, 16, 0, 0, 0,
5321 20, 21, 22, 23, 24, 25, 26, 27,
5322 28, 29, 30, 31, 32, 0, 0, 0,
5323 );
5324 assert_eq_m256i(r, e);
5325 }
5326
5327 #[simd_test(enable = "avx2")]
5328 unsafe fn test_mm256_srl_epi16() {
5329 let a = _mm256_set1_epi16(0xFF);
5330 let b = _mm_insert_epi16(_mm_set1_epi16(0), 4, 0);
5331 let r = _mm256_srl_epi16(a, b);
5332 assert_eq_m256i(r, _mm256_set1_epi16(0xF));
5333 }
5334
5335 #[simd_test(enable = "avx2")]
5336 unsafe fn test_mm256_srl_epi32() {
5337 let a = _mm256_set1_epi32(0xFFFF);
5338 let b = _mm_insert_epi32(_mm_set1_epi32(0), 4, 0);
5339 let r = _mm256_srl_epi32(a, b);
5340 assert_eq_m256i(r, _mm256_set1_epi32(0xFFF));
5341 }
5342
5343 #[simd_test(enable = "avx2")]
5344 unsafe fn test_mm256_srl_epi64() {
5345 let a = _mm256_set1_epi64x(0xFFFFFFFF);
5346 let b = _mm_setr_epi64x(4, 0);
5347 let r = _mm256_srl_epi64(a, b);
5348 assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFF));
5349 }
5350
5351 #[simd_test(enable = "avx2")]
5352 unsafe fn test_mm256_srli_epi16() {
5353 assert_eq_m256i(
5354 _mm256_srli_epi16(_mm256_set1_epi16(0xFF), 4),
5355 _mm256_set1_epi16(0xF),
5356 );
5357 }
5358
5359 #[simd_test(enable = "avx2")]
5360 unsafe fn test_mm256_srli_epi32() {
5361 assert_eq_m256i(
5362 _mm256_srli_epi32(_mm256_set1_epi32(0xFFFF), 4),
5363 _mm256_set1_epi32(0xFFF),
5364 );
5365 }
5366
5367 #[simd_test(enable = "avx2")]
5368 unsafe fn test_mm256_srli_epi64() {
5369 assert_eq_m256i(
5370 _mm256_srli_epi64(_mm256_set1_epi64x(0xFFFFFFFF), 4),
5371 _mm256_set1_epi64x(0xFFFFFFF),
5372 );
5373 }
5374
5375 #[simd_test(enable = "avx2")]
5376 unsafe fn test_mm_srlv_epi32() {
5377 let a = _mm_set1_epi32(2);
5378 let count = _mm_set1_epi32(1);
5379 let r = _mm_srlv_epi32(a, count);
5380 let e = _mm_set1_epi32(1);
5381 assert_eq_m128i(r, e);
5382 }
5383
5384 #[simd_test(enable = "avx2")]
5385 unsafe fn test_mm256_srlv_epi32() {
5386 let a = _mm256_set1_epi32(2);
5387 let count = _mm256_set1_epi32(1);
5388 let r = _mm256_srlv_epi32(a, count);
5389 let e = _mm256_set1_epi32(1);
5390 assert_eq_m256i(r, e);
5391 }
5392
5393 #[simd_test(enable = "avx2")]
5394 unsafe fn test_mm_srlv_epi64() {
5395 let a = _mm_set1_epi64x(2);
5396 let count = _mm_set1_epi64x(1);
5397 let r = _mm_srlv_epi64(a, count);
5398 let e = _mm_set1_epi64x(1);
5399 assert_eq_m128i(r, e);
5400 }
5401
5402 #[simd_test(enable = "avx2")]
5403 unsafe fn test_mm256_srlv_epi64() {
5404 let a = _mm256_set1_epi64x(2);
5405 let count = _mm256_set1_epi64x(1);
5406 let r = _mm256_srlv_epi64(a, count);
5407 let e = _mm256_set1_epi64x(1);
5408 assert_eq_m256i(r, e);
5409 }
5410
5411 #[simd_test(enable = "avx2")]
5412 unsafe fn test_mm256_sub_epi16() {
5413 let a = _mm256_set1_epi16(4);
5414 let b = _mm256_set1_epi16(2);
5415 let r = _mm256_sub_epi16(a, b);
5416 assert_eq_m256i(r, b);
5417 }
5418
5419 #[simd_test(enable = "avx2")]
5420 unsafe fn test_mm256_sub_epi32() {
5421 let a = _mm256_set1_epi32(4);
5422 let b = _mm256_set1_epi32(2);
5423 let r = _mm256_sub_epi32(a, b);
5424 assert_eq_m256i(r, b);
5425 }
5426
5427 #[simd_test(enable = "avx2")]
5428 unsafe fn test_mm256_sub_epi64() {
5429 let a = _mm256_set1_epi64x(4);
5430 let b = _mm256_set1_epi64x(2);
5431 let r = _mm256_sub_epi64(a, b);
5432 assert_eq_m256i(r, b);
5433 }
5434
5435 #[simd_test(enable = "avx2")]
5436 unsafe fn test_mm256_sub_epi8() {
5437 let a = _mm256_set1_epi8(4);
5438 let b = _mm256_set1_epi8(2);
5439 let r = _mm256_sub_epi8(a, b);
5440 assert_eq_m256i(r, b);
5441 }
5442
5443 #[simd_test(enable = "avx2")]
5444 unsafe fn test_mm256_subs_epi16() {
5445 let a = _mm256_set1_epi16(4);
5446 let b = _mm256_set1_epi16(2);
5447 let r = _mm256_subs_epi16(a, b);
5448 assert_eq_m256i(r, b);
5449 }
5450
5451 #[simd_test(enable = "avx2")]
5452 unsafe fn test_mm256_subs_epi8() {
5453 let a = _mm256_set1_epi8(4);
5454 let b = _mm256_set1_epi8(2);
5455 let r = _mm256_subs_epi8(a, b);
5456 assert_eq_m256i(r, b);
5457 }
5458
5459 #[simd_test(enable = "avx2")]
5460 unsafe fn test_mm256_subs_epu16() {
5461 let a = _mm256_set1_epi16(4);
5462 let b = _mm256_set1_epi16(2);
5463 let r = _mm256_subs_epu16(a, b);
5464 assert_eq_m256i(r, b);
5465 }
5466
5467 #[simd_test(enable = "avx2")]
5468 unsafe fn test_mm256_subs_epu8() {
5469 let a = _mm256_set1_epi8(4);
5470 let b = _mm256_set1_epi8(2);
5471 let r = _mm256_subs_epu8(a, b);
5472 assert_eq_m256i(r, b);
5473 }
5474
5475 #[simd_test(enable = "avx2")]
5476 unsafe fn test_mm256_xor_si256() {
5477 let a = _mm256_set1_epi8(5);
5478 let b = _mm256_set1_epi8(3);
5479 let r = _mm256_xor_si256(a, b);
5480 assert_eq_m256i(r, _mm256_set1_epi8(6));
5481 }
5482
5483 #[simd_test(enable = "avx2")]
5484 unsafe fn test_mm256_alignr_epi8() {
5485 #[rustfmt::skip]
5486 let a = _mm256_setr_epi8(
5487 1, 2, 3, 4, 5, 6, 7, 8,
5488 9, 10, 11, 12, 13, 14, 15, 16,
5489 17, 18, 19, 20, 21, 22, 23, 24,
5490 25, 26, 27, 28, 29, 30, 31, 32,
5491 );
5492 #[rustfmt::skip]
5493 let b = _mm256_setr_epi8(
5494 -1, -2, -3, -4, -5, -6, -7, -8,
5495 -9, -10, -11, -12, -13, -14, -15, -16,
5496 -17, -18, -19, -20, -21, -22, -23, -24,
5497 -25, -26, -27, -28, -29, -30, -31, -32,
5498 );
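// The alignment operates on each pair of 128-bit lanes independently,
// and a shift of 32 or more produces all zeros.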
5499 let r = _mm256_alignr_epi8(a, b, 33);
5500 assert_eq_m256i(r, _mm256_set1_epi8(0));
5501
5502 let r = _mm256_alignr_epi8(a, b, 17);
5503 #[rustfmt::skip]
5504 let expected = _mm256_setr_epi8(
5505 2, 3, 4, 5, 6, 7, 8, 9,
5506 10, 11, 12, 13, 14, 15, 16, 0,
5507 18, 19, 20, 21, 22, 23, 24, 25,
5508 26, 27, 28, 29, 30, 31, 32, 0,
5509 );
5510 assert_eq_m256i(r, expected);
5511
5512 let r = _mm256_alignr_epi8(a, b, 4);
5513 #[rustfmt::skip]
5514 let expected = _mm256_setr_epi8(
5515 -5, -6, -7, -8, -9, -10, -11, -12,
5516 -13, -14, -15, -16, 1, 2, 3, 4,
5517 -21, -22, -23, -24, -25, -26, -27, -28,
5518 -29, -30, -31, -32, 17, 18, 19, 20,
5519 );
5520 assert_eq_m256i(r, expected);
5521
5522 #[rustfmt::skip]
5523 let expected = _mm256_setr_epi8(
5524 -1, -2, -3, -4, -5, -6, -7, -8,
5525 -9, -10, -11, -12, -13, -14, -15, -16,
5526 -17, -18, -19, -20, -21, -22, -23, -24,
5527 -25, -26, -27, -28, -29, -30, -31, -32,
5528 );
5529 let r = _mm256_alignr_epi8(a, b, 16);
5530 assert_eq_m256i(r, expected);
5531
5532 let r = _mm256_alignr_epi8(a, b, 15);
5533 #[rustfmt::skip]
5534 let expected = _mm256_setr_epi8(
5535 -16, 1, 2, 3, 4, 5, 6, 7,
5536 8, 9, 10, 11, 12, 13, 14, 15,
5537 -32, 17, 18, 19, 20, 21, 22, 23,
5538 24, 25, 26, 27, 28, 29, 30, 31,
5539 );
5540 assert_eq_m256i(r, expected);
5541
5542 let r = _mm256_alignr_epi8(a, b, 0);
5543 assert_eq_m256i(r, b);
5544 }
5545
5546 #[simd_test(enable = "avx2")]
5547 unsafe fn test_mm256_shuffle_epi8() {
5548 #[rustfmt::skip]
5549 let a = _mm256_setr_epi8(
5550 1, 2, 3, 4, 5, 6, 7, 8,
5551 9, 10, 11, 12, 13, 14, 15, 16,
5552 17, 18, 19, 20, 21, 22, 23, 24,
5553 25, 26, 27, 28, 29, 30, 31, 32,
5554 );
5555 #[rustfmt::skip]
5556 let b = _mm256_setr_epi8(
5557 4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
5558 12, 5, 5, 10, 4, 1, 8, 0,
5559 4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
5560 12, 5, 5, 10, 4, 1, 8, 0,
5561 );
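// Control bytes with the high bit set (128) zero the destination byte;
// the low 4 bits index within the same 128-bit lane.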
5562 #[rustfmt::skip]
5563 let expected = _mm256_setr_epi8(
5564 5, 0, 5, 4, 9, 13, 7, 4,
5565 13, 6, 6, 11, 5, 2, 9, 1,
5566 21, 0, 21, 20, 25, 29, 23, 20,
5567 29, 22, 22, 27, 21, 18, 25, 17,
5568 );
5569 let r = _mm256_shuffle_epi8(a, b);
5570 assert_eq_m256i(r, expected);
5571 }
5572
5573 #[simd_test(enable = "avx2")]
5574 unsafe fn test_mm256_permutevar8x32_epi32() {
5575 let a = _mm256_setr_epi32(100, 200, 300, 400, 500, 600, 700, 800);
5576 let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
5577 let expected = _mm256_setr_epi32(600, 100, 600, 200, 800, 700, 400, 500);
5578 let r = _mm256_permutevar8x32_epi32(a, b);
5579 assert_eq_m256i(r, expected);
5580 }
5581
5582 #[simd_test(enable = "avx2")]
5583 unsafe fn test_mm256_permute4x64_epi64() {
5584 let a = _mm256_setr_epi64x(100, 200, 300, 400);
5585 let expected = _mm256_setr_epi64x(400, 100, 200, 100);
5586 let r = _mm256_permute4x64_epi64(a, 0b00010011);
5587 assert_eq_m256i(r, expected);
5588 }
5589
5590 #[simd_test(enable = "avx2")]
5591 unsafe fn test_mm256_permute2x128_si256() {
5592 let a = _mm256_setr_epi64x(100, 200, 500, 600);
5593 let b = _mm256_setr_epi64x(300, 400, 700, 800);
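// imm 0b00_01_00_11: bits [1:0] = 3 select the high half of `b` for the low
// 128-bit lane, bits [5:4] = 1 select the high half of `a` for the high lane.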
5594 let r = _mm256_permute2x128_si256(a, b, 0b00_01_00_11);
5595 let e = _mm256_setr_epi64x(700, 800, 500, 600);
5596 assert_eq_m256i(r, e);
5597 }
5598
5599 #[simd_test(enable = "avx2")]
5600 unsafe fn test_mm256_permute4x64_pd() {
5601 let a = _mm256_setr_pd(1., 2., 3., 4.);
5602 let r = _mm256_permute4x64_pd(a, 0b00_01_00_11);
5603 let e = _mm256_setr_pd(4., 1., 2., 1.);
5604 assert_eq_m256d(r, e);
5605 }
5606
5607 #[simd_test(enable = "avx2")]
5608 unsafe fn test_mm256_permutevar8x32_ps() {
5609 let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
5610 let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
5611 let r = _mm256_permutevar8x32_ps(a, b);
5612 let e = _mm256_setr_ps(6., 1., 6., 2., 8., 7., 4., 5.);
5613 assert_eq_m256(r, e);
5614 }
5615
5616 #[simd_test(enable = "avx2")]
5617 unsafe fn test_mm_i32gather_epi32() {
5618 let mut arr = [0i32; 128];
5619 for i in 0..128i32 {
5620 arr[i as usize] = i;
5621 }
5622 // A multiplier of 4 is word-addressing
5623 let r = _mm_i32gather_epi32(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48), 4);
5624 assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48));
5625 }
5626
5627 #[simd_test(enable = "avx2")]
5628 unsafe fn test_mm_mask_i32gather_epi32() {
5629 let mut arr = [0i32; 128];
5630 for i in 0..128i32 {
5631 arr[i as usize] = i;
5632 }
5633 // A multiplier of 4 is word-addressing
5634 let r = _mm_mask_i32gather_epi32(
5635 _mm_set1_epi32(256),
5636 arr.as_ptr(),
5637 _mm_setr_epi32(0, 16, 64, 96),
5638 _mm_setr_epi32(-1, -1, -1, 0),
5639 4,
5640 );
5641 assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256));
5642 }
5643
5644 #[simd_test(enable = "avx2")]
5645 unsafe fn test_mm256_i32gather_epi32() {
5646 let mut arr = [0i32; 128];
5647 for i in 0..128i32 {
5648 arr[i as usize] = i;
5649 }
5650 // A multiplier of 4 is word-addressing
5651 let r = _mm256_i32gather_epi32(
5652 arr.as_ptr(),
5653 _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4),
5654 4,
5655 );
5656 assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
5657 }
5658
5659 #[simd_test(enable = "avx2")]
5660 unsafe fn test_mm256_mask_i32gather_epi32() {
5661 let mut arr = [0i32; 128];
5662 for i in 0..128i32 {
5663 arr[i as usize] = i;
5664 }
5665 // A multiplier of 4 is word-addressing
5666 let r = _mm256_mask_i32gather_epi32(
5667 _mm256_set1_epi32(256),
5668 arr.as_ptr(),
5669 _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0),
5670 _mm256_setr_epi32(-1, -1, -1, 0, 0, 0, 0, 0),
5671 4,
5672 );
5673 assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 64, 256, 256, 256, 256, 256));
5674 }
5675
5676 #[simd_test(enable = "avx2")]
5677 unsafe fn test_mm_i32gather_ps() {
5678 let mut arr = [0.0f32; 128];
5679 let mut j = 0.0;
5680 for i in 0..128usize {
5681 arr[i] = j;
5682 j += 1.0;
5683 }
5684 // A multiplier of 4 is word-addressing for f32s
5685 let r = _mm_i32gather_ps(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48), 4);
5686 assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0));
5687 }
5688
5689 #[simd_test(enable = "avx2")]
5690 unsafe fn test_mm_mask_i32gather_ps() {
5691 let mut arr = [0.0f32; 128];
5692 let mut j = 0.0;
5693 for i in 0..128usize {
5694 arr[i] = j;
5695 j += 1.0;
5696 }
5697 // A multiplier of 4 is word-addressing for f32s
5698 let r = _mm_mask_i32gather_ps(
5699 _mm_set1_ps(256.0),
5700 arr.as_ptr(),
5701 _mm_setr_epi32(0, 16, 64, 96),
5702 _mm_setr_ps(-1.0, -1.0, -1.0, 0.0),
5703 4,
5704 );
5705 assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0));
5706 }
5707
5708 #[simd_test(enable = "avx2")]
5709 unsafe fn test_mm256_i32gather_ps() {
5710 let mut arr = [0.0f32; 128];
5711 let mut j = 0.0;
5712 for i in 0..128usize {
5713 arr[i] = j;
5714 j += 1.0;
5715 }
5716 // A multiplier of 4 is word-addressing for f32s
5717 let r = _mm256_i32gather_ps(
5718 arr.as_ptr(),
5719 _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4),
5720 4,
5721 );
5722 assert_eq_m256(r, _mm256_setr_ps(0.0, 16.0, 32.0, 48.0, 1.0, 2.0, 3.0, 4.0));
5723 }
5724
5725 #[simd_test(enable = "avx2")]
5726 unsafe fn test_mm256_mask_i32gather_ps() {
5727 let mut arr = [0.0f32; 128];
5728 let mut j = 0.0;
5729 for i in 0..128usize {
5730 arr[i] = j;
5731 j += 1.0;
5732 }
5733 // A multiplier of 4 is word-addressing for f32s
5734 let r = _mm256_mask_i32gather_ps(
5735 _mm256_set1_ps(256.0),
5736 arr.as_ptr(),
5737 _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0),
5738 _mm256_setr_ps(-1.0, -1.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0),
5739 4,
5740 );
5741 assert_eq_m256(
5742 r,
5743 _mm256_setr_ps(0.0, 16.0, 64.0, 256.0, 256.0, 256.0, 256.0, 256.0),
5744 );
5745 }
5746
5747 #[simd_test(enable = "avx2")]
5748 unsafe fn test_mm_i32gather_epi64() {
5749 let mut arr = [0i64; 128];
5750 for i in 0..128i64 {
5751 arr[i as usize] = i;
5752 }
5753 // A multiplier of 8 is word-addressing for i64s
5754 let r = _mm_i32gather_epi64(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0), 8);
5755 assert_eq_m128i(r, _mm_setr_epi64x(0, 16));
5756 }
5757
5758 #[simd_test(enable = "avx2")]
5759 unsafe fn test_mm_mask_i32gather_epi64() {
5760 let mut arr = [0i64; 128];
5761 for i in 0..128i64 {
5762 arr[i as usize] = i;
5763 }
5764 // A multiplier of 8 is word-addressing for i64s
5765 let r = _mm_mask_i32gather_epi64(
5766 _mm_set1_epi64x(256),
5767 arr.as_ptr(),
5768 _mm_setr_epi32(16, 16, 16, 16),
5769 _mm_setr_epi64x(-1, 0),
5770 8,
5771 );
5772 assert_eq_m128i(r, _mm_setr_epi64x(16, 256));
5773 }
5774
5775 #[simd_test(enable = "avx2")]
5776 unsafe fn test_mm256_i32gather_epi64() {
5777 let mut arr = [0i64; 128];
5778 for i in 0..128i64 {
5779 arr[i as usize] = i;
5780 }
5781 // A multiplier of 8 is word-addressing for i64s
5782 let r = _mm256_i32gather_epi64(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48), 8);
5783 assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48));
5784 }
5785
5786 #[simd_test(enable = "avx2")]
5787 unsafe fn test_mm256_mask_i32gather_epi64() {
5788 let mut arr = [0i64; 128];
5789 for i in 0..128i64 {
5790 arr[i as usize] = i;
5791 }
5792 // A multiplier of 8 is word-addressing for i64s
5793 let r = _mm256_mask_i32gather_epi64(
5794 _mm256_set1_epi64x(256),
5795 arr.as_ptr(),
5796 _mm_setr_epi32(0, 16, 64, 96),
5797 _mm256_setr_epi64x(-1, -1, -1, 0),
5798 8,
5799 );
5800 assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256));
5801 }
5802
5803 #[simd_test(enable = "avx2")]
5804 unsafe fn test_mm_i32gather_pd() {
5805 let mut arr = [0.0f64; 128];
5806 let mut j = 0.0;
5807 for i in 0..128usize {
5808 arr[i] = j;
5809 j += 1.0;
5810 }
5811 // A multiplier of 8 is word-addressing for f64s
5812 let r = _mm_i32gather_pd(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0), 8);
5813 assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0));
5814 }
5815
5816 #[simd_test(enable = "avx2")]
5817 unsafe fn test_mm_mask_i32gather_pd() {
5818 let mut arr = [0.0f64; 128];
5819 let mut j = 0.0;
5820 for i in 0..128usize {
5821 arr[i] = j;
5822 j += 1.0;
5823 }
5824 // A multiplier of 8 is word-addressing for f64s
5825 let r = _mm_mask_i32gather_pd(
5826 _mm_set1_pd(256.0),
5827 arr.as_ptr(),
5828 _mm_setr_epi32(16, 16, 16, 16),
5829 _mm_setr_pd(-1.0, 0.0),
5830 8,
5831 );
5832 assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0));
5833 }
5834
5835 #[simd_test(enable = "avx2")]
5836 unsafe fn test_mm256_i32gather_pd() {
5837 let mut arr = [0.0f64; 128];
5838 let mut j = 0.0;
5839 for i in 0..128usize {
5840 arr[i] = j;
5841 j += 1.0;
5842 }
5843 // A multiplier of 8 is word-addressing for f64s
5844 let r = _mm256_i32gather_pd(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48), 8);
5845 assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0));
5846 }
5847
5848 #[simd_test(enable = "avx2")]
5849 unsafe fn test_mm256_mask_i32gather_pd() {
5850 let mut arr = [0.0f64; 128];
5851 let mut j = 0.0;
5852 for i in 0..128usize {
5853 arr[i] = j;
5854 j += 1.0;
5855 }
5856 // A multiplier of 8 is word-addressing for f64s
5857 let r = _mm256_mask_i32gather_pd(
5858 _mm256_set1_pd(256.0),
5859 arr.as_ptr(),
5860 _mm_setr_epi32(0, 16, 64, 96),
5861 _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0),
5862 8,
5863 );
5864 assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0));
5865 }
5866
5867 #[simd_test(enable = "avx2")]
5868 unsafe fn test_mm_i64gather_epi32() {
5869 let mut arr = [0i32; 128];
5870 for i in 0..128i32 {
5871 arr[i as usize] = i;
5872 }
5873 // A multiplier of 4 is word-addressing
5874 let r = _mm_i64gather_epi32(arr.as_ptr(), _mm_setr_epi64x(0, 16), 4);
5875 assert_eq_m128i(r, _mm_setr_epi32(0, 16, 0, 0));
5876 }
5877
5878 #[simd_test(enable = "avx2")]
5879 unsafe fn test_mm_mask_i64gather_epi32() {
5880 let mut arr = [0i32; 128];
5881 for i in 0..128i32 {
5882 arr[i as usize] = i;
5883 }
5884 // A multiplier of 4 is word-addressing
5885 let r = _mm_mask_i64gather_epi32(
5886 _mm_set1_epi32(256),
5887 arr.as_ptr(),
5888 _mm_setr_epi64x(0, 16),
5889 _mm_setr_epi32(-1, 0, -1, 0),
5890 4,
5891 );
5892 assert_eq_m128i(r, _mm_setr_epi32(0, 256, 0, 0));
5893 }
5894
5895 #[simd_test(enable = "avx2")]
5896 unsafe fn test_mm256_i64gather_epi32() {
5897 let mut arr = [0i32; 128];
5898 for i in 0..128i32 {
5899 arr[i as usize] = i;
5900 }
5901 // A multiplier of 4 is word-addressing
5902 let r = _mm256_i64gather_epi32(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48), 4);
5903 assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48));
5904 }
5905
5906 #[simd_test(enable = "avx2")]
5907 unsafe fn test_mm256_mask_i64gather_epi32() {
5908 let mut arr = [0i32; 128];
5909 for i in 0..128i32 {
5910 arr[i as usize] = i;
5911 }
5912 // A multiplier of 4 is word-addressing
5913 let r = _mm256_mask_i64gather_epi32(
5914 _mm_set1_epi32(256),
5915 arr.as_ptr(),
5916 _mm256_setr_epi64x(0, 16, 64, 96),
5917 _mm_setr_epi32(-1, -1, -1, 0),
5918 4,
5919 );
5920 assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256));
5921 }
5922
5923 #[simd_test(enable = "avx2")]
5924 unsafe fn test_mm_i64gather_ps() {
5925 let mut arr = [0.0f32; 128];
5926 let mut j = 0.0;
5927 for i in 0..128usize {
5928 arr[i] = j;
5929 j += 1.0;
5930 }
5931 // A multiplier of 4 is word-addressing for f32s
5932 let r = _mm_i64gather_ps(arr.as_ptr(), _mm_setr_epi64x(0, 16), 4);
5933 assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 0.0, 0.0));
5934 }
5935
5936 #[simd_test(enable = "avx2")]
5937 unsafe fn test_mm_mask_i64gather_ps() {
5938 let mut arr = [0.0f32; 128];
5939 let mut j = 0.0;
5940 for i in 0..128usize {
5941 arr[i] = j;
5942 j += 1.0;
5943 }
5944 // A multiplier of 4 is word-addressing for f32s
5945 let r = _mm_mask_i64gather_ps(
5946 _mm_set1_ps(256.0),
5947 arr.as_ptr(),
5948 _mm_setr_epi64x(0, 16),
5949 _mm_setr_ps(-1.0, 0.0, -1.0, 0.0),
5950 4,
5951 );
5952 assert_eq_m128(r, _mm_setr_ps(0.0, 256.0, 0.0, 0.0));
5953 }
5954
5955 #[simd_test(enable = "avx2")]
5956 unsafe fn test_mm256_i64gather_ps() {
5957 let mut arr = [0.0f32; 128];
5958 let mut j = 0.0;
5959 for i in 0..128usize {
5960 arr[i] = j;
5961 j += 1.0;
5962 }
5963 // A multiplier of 4 is word-addressing for f32s
5964 let r = _mm256_i64gather_ps(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48), 4);
5965 assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0));
5966 }
5967
5968 #[simd_test(enable = "avx2")]
5969 unsafe fn test_mm256_mask_i64gather_ps() {
5970 let mut arr = [0.0f32; 128];
5971 let mut j = 0.0;
5972 for i in 0..128usize {
5973 arr[i] = j;
5974 j += 1.0;
5975 }
5976 // A multiplier of 4 is word-addressing for f32s
5977 let r = _mm256_mask_i64gather_ps(
5978 _mm_set1_ps(256.0),
5979 arr.as_ptr(),
5980 _mm256_setr_epi64x(0, 16, 64, 96),
5981 _mm_setr_ps(-1.0, -1.0, -1.0, 0.0),
5982 4,
5983 );
5984 assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0));
5985 }
5986
5987 #[simd_test(enable = "avx2")]
5988 unsafe fn test_mm_i64gather_epi64() {
5989 let mut arr = [0i64; 128];
5990 for i in 0..128i64 {
5991 arr[i as usize] = i;
5992 }
5993 // A multiplier of 8 is word-addressing for i64s
5994 let r = _mm_i64gather_epi64(arr.as_ptr(), _mm_setr_epi64x(0, 16), 8);
5995 assert_eq_m128i(r, _mm_setr_epi64x(0, 16));
5996 }
5997
5998 #[simd_test(enable = "avx2")]
5999 unsafe fn test_mm_mask_i64gather_epi64() {
6000 let mut arr = [0i64; 128];
6001 for i in 0..128i64 {
6002 arr[i as usize] = i;
6003 }
6004 // A multiplier of 8 is word-addressing for i64s
6005 let r = _mm_mask_i64gather_epi64(
6006 _mm_set1_epi64x(256),
6007 arr.as_ptr(),
6008 _mm_setr_epi64x(16, 16),
6009 _mm_setr_epi64x(-1, 0),
6010 8,
6011 );
6012 assert_eq_m128i(r, _mm_setr_epi64x(16, 256));
6013 }
6014
6015 #[simd_test(enable = "avx2")]
6016 unsafe fn test_mm256_i64gather_epi64() {
6017 let mut arr = [0i64; 128];
6018 for i in 0..128i64 {
6019 arr[i as usize] = i;
6020 }
6021 // A multiplier of 8 is word-addressing for i64s
6022 let r = _mm256_i64gather_epi64(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48), 8);
6023 assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48));
6024 }
6025
6026 #[simd_test(enable = "avx2")]
6027 unsafe fn test_mm256_mask_i64gather_epi64() {
6028 let mut arr = [0i64; 128];
6029 for i in 0..128i64 {
6030 arr[i as usize] = i;
6031 }
6032 // A multiplier of 8 is word-addressing for i64s
6033 let r = _mm256_mask_i64gather_epi64(
6034 _mm256_set1_epi64x(256),
6035 arr.as_ptr(),
6036 _mm256_setr_epi64x(0, 16, 64, 96),
6037 _mm256_setr_epi64x(-1, -1, -1, 0),
6038 8,
6039 );
6040 assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256));
6041 }
6042
6043 #[simd_test(enable = "avx2")]
6044 unsafe fn test_mm_i64gather_pd() {
6045 let mut arr = [0.0f64; 128];
6046 let mut j = 0.0;
6047 for i in 0..128usize {
6048 arr[i] = j;
6049 j += 1.0;
6050 }
6051 // A multiplier of 8 is word-addressing for f64s
6052 let r = _mm_i64gather_pd(arr.as_ptr(), _mm_setr_epi64x(0, 16), 8);
6053 assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0));
6054 }
6055
6056 #[simd_test(enable = "avx2")]
6057 unsafe fn test_mm_mask_i64gather_pd() {
6058 let mut arr = [0.0f64; 128];
6059 let mut j = 0.0;
6060 for i in 0..128usize {
6061 arr[i] = j;
6062 j += 1.0;
6063 }
6064 // A multiplier of 8 is word-addressing for f64s
6065 let r = _mm_mask_i64gather_pd(
6066 _mm_set1_pd(256.0),
6067 arr.as_ptr(),
6068 _mm_setr_epi64x(16, 16),
6069 _mm_setr_pd(-1.0, 0.0),
6070 8,
6071 );
6072 assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0));
6073 }
6074
6075 #[simd_test(enable = "avx2")]
6076 unsafe fn test_mm256_i64gather_pd() {
6077 let mut arr = [0.0f64; 128];
6078 let mut j = 0.0;
6079 for i in 0..128usize {
6080 arr[i] = j;
6081 j += 1.0;
6082 }
6083 // A multiplier of 8 is word-addressing for f64s
6084 let r = _mm256_i64gather_pd(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48), 8);
6085 assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0));
6086 }
6087
6088 #[simd_test(enable = "avx2")]
6089 unsafe fn test_mm256_mask_i64gather_pd() {
6090 let mut arr = [0.0f64; 128];
6091 let mut j = 0.0;
6092 for i in 0..128usize {
6093 arr[i] = j;
6094 j += 1.0;
6095 }
6096 // A multiplier of 8 is word-addressing for f64s
6097 let r = _mm256_mask_i64gather_pd(
6098 _mm256_set1_pd(256.0),
6099 arr.as_ptr(),
6100 _mm256_setr_epi64x(0, 16, 64, 96),
6101 _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0),
6102 8,
6103 );
6104 assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0));
6105 }
6106
6107 #[simd_test(enable = "avx2")]
6108 unsafe fn test_mm256_extract_epi8() {
6109 #[rustfmt::skip]
6110 let a = _mm256_setr_epi8(
6111 -1, 1, 2, 3, 4, 5, 6, 7,
6112 8, 9, 10, 11, 12, 13, 14, 15,
6113 16, 17, 18, 19, 20, 21, 22, 23,
6114 24, 25, 26, 27, 28, 29, 30, 31
6115 );
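// The index is masked to the lane count, so 35 & 31 == 3; the extract_epi16
// and extract_epi32 tests below rely on the same wrapping.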
6116 let r1 = _mm256_extract_epi8(a, 0);
6117 let r2 = _mm256_extract_epi8(a, 35);
6118 assert_eq!(r1, -1);
6119 assert_eq!(r2, 3);
6120 }
6121
6122 #[simd_test(enable = "avx2")]
6123 unsafe fn test_mm256_extract_epi16() {
6124 #[rustfmt::skip]
6125 let a = _mm256_setr_epi16(
6126 -1, 1, 2, 3, 4, 5, 6, 7,
6127 8, 9, 10, 11, 12, 13, 14, 15,
6128 );
6129 let r1 = _mm256_extract_epi16(a, 0);
6130 let r2 = _mm256_extract_epi16(a, 19);
6131 assert_eq!(r1, -1);
6132 assert_eq!(r2, 3);
6133 }
6134
6135 #[simd_test(enable = "avx2")]
6136 unsafe fn test_mm256_extract_epi32() {
6137 let a = _mm256_setr_epi32(-1, 1, 2, 3, 4, 5, 6, 7);
6138 let r1 = _mm256_extract_epi32(a, 0);
6139 let r2 = _mm256_extract_epi32(a, 11);
6140 assert_eq!(r1, -1);
6141 assert_eq!(r2, 3);
6142 }
6143
6144 #[simd_test(enable = "avx2")]
6145 unsafe fn test_mm256_cvtsd_f64() {
6146 let a = _mm256_setr_pd(1., 2., 3., 4.);
6147 let r = _mm256_cvtsd_f64(a);
6148 assert_eq!(r, 1.);
6149 }
6150
6151 #[simd_test(enable = "avx2")]
6152 unsafe fn test_mm256_cvtsi256_si32() {
6153 let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
6154 let r = _mm256_cvtsi256_si32(a);
6155 assert_eq!(r, 1);
6156 }
6157 }