]> git.proxmox.com Git - rustc.git/blob - library/stdarch/crates/core_arch/src/x86/avx512vbmi2.rs
New upstream version 1.53.0+dfsg1
[rustc.git] / library / stdarch / crates / core_arch / src / x86 / avx512vbmi2.rs
1 use crate::core_arch::{simd::*, simd_llvm::*, x86::*};
2
3 #[cfg(test)]
4 use stdarch_test::assert_instr;
5
6 /// Contiguously store the active 16-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
7 ///
8 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compress_epi16&expand=1192)
9 #[inline]
10 #[target_feature(enable = "avx512vbmi2")]
11 #[cfg_attr(test, assert_instr(vpcompressw))]
12 pub unsafe fn _mm512_mask_compress_epi16(src: __m512i, k: __mmask32, a: __m512i) -> __m512i {
13 transmute(vpcompressw(a.as_i16x32(), src.as_i16x32(), k))
14 }
15
16 /// Contiguously store the active 16-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
17 ///
18 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_compress_epi16&expand=1193)
19 #[inline]
20 #[target_feature(enable = "avx512vbmi2")]
21 #[cfg_attr(test, assert_instr(vpcompressw))]
22 pub unsafe fn _mm512_maskz_compress_epi16(k: __mmask32, a: __m512i) -> __m512i {
23 transmute(vpcompressw(
24 a.as_i16x32(),
25 _mm512_setzero_si512().as_i16x32(),
26 k,
27 ))
28 }
29
30 /// Contiguously store the active 16-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
31 ///
32 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compress_epi16&expand=1190)
33 #[inline]
34 #[target_feature(enable = "avx512vbmi2,avx512vl")]
35 #[cfg_attr(test, assert_instr(vpcompressw))]
36 pub unsafe fn _mm256_mask_compress_epi16(src: __m256i, k: __mmask16, a: __m256i) -> __m256i {
37 transmute(vpcompressw256(a.as_i16x16(), src.as_i16x16(), k))
38 }
39
40 /// Contiguously store the active 16-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
41 ///
42 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_compress_epi16&expand=1191)
43 #[inline]
44 #[target_feature(enable = "avx512vbmi2,avx512vl")]
45 #[cfg_attr(test, assert_instr(vpcompressw))]
46 pub unsafe fn _mm256_maskz_compress_epi16(k: __mmask16, a: __m256i) -> __m256i {
47 transmute(vpcompressw256(
48 a.as_i16x16(),
49 _mm256_setzero_si256().as_i16x16(),
50 k,
51 ))
52 }
53
54 /// Contiguously store the active 16-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
55 ///
56 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compress_epi16&expand=1188)
57 #[inline]
58 #[target_feature(enable = "avx512vbmi2,avx512vl")]
59 #[cfg_attr(test, assert_instr(vpcompressw))]
60 pub unsafe fn _mm_mask_compress_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
61 transmute(vpcompressw128(a.as_i16x8(), src.as_i16x8(), k))
62 }
63
64 /// Contiguously store the active 16-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
65 ///
66 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_compress_epi16&expand=1189)
67 #[inline]
68 #[target_feature(enable = "avx512vbmi2,avx512vl")]
69 #[cfg_attr(test, assert_instr(vpcompressw))]
70 pub unsafe fn _mm_maskz_compress_epi16(k: __mmask8, a: __m128i) -> __m128i {
71 transmute(vpcompressw128(
72 a.as_i16x8(),
73 _mm_setzero_si128().as_i16x8(),
74 k,
75 ))
76 }
77
78 /// Contiguously store the active 8-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
79 ///
80 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compress_epi8&expand=1210)
81 #[inline]
82 #[target_feature(enable = "avx512vbmi2")]
83 #[cfg_attr(test, assert_instr(vpcompressb))]
84 pub unsafe fn _mm512_mask_compress_epi8(src: __m512i, k: __mmask64, a: __m512i) -> __m512i {
85 transmute(vpcompressb(a.as_i8x64(), src.as_i8x64(), k))
86 }
87
88 /// Contiguously store the active 8-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
89 ///
90 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_compress_epi8&expand=1211)
91 #[inline]
92 #[target_feature(enable = "avx512vbmi2")]
93 #[cfg_attr(test, assert_instr(vpcompressb))]
94 pub unsafe fn _mm512_maskz_compress_epi8(k: __mmask64, a: __m512i) -> __m512i {
95 transmute(vpcompressb(
96 a.as_i8x64(),
97 _mm512_setzero_si512().as_i8x64(),
98 k,
99 ))
100 }
101
102 /// Contiguously store the active 8-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
103 ///
104 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compress_epi8&expand=1208)
105 #[inline]
106 #[target_feature(enable = "avx512vbmi2,avx512vl")]
107 #[cfg_attr(test, assert_instr(vpcompressb))]
108 pub unsafe fn _mm256_mask_compress_epi8(src: __m256i, k: __mmask32, a: __m256i) -> __m256i {
109 transmute(vpcompressb256(a.as_i8x32(), src.as_i8x32(), k))
110 }
111
112 /// Contiguously store the active 8-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
113 ///
114 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_compress_epi8&expand=1209)
115 #[inline]
116 #[target_feature(enable = "avx512vbmi2,avx512vl")]
117 #[cfg_attr(test, assert_instr(vpcompressb))]
118 pub unsafe fn _mm256_maskz_compress_epi8(k: __mmask32, a: __m256i) -> __m256i {
119 transmute(vpcompressb256(
120 a.as_i8x32(),
121 _mm256_setzero_si256().as_i8x32(),
122 k,
123 ))
124 }
125
126 /// Contiguously store the active 8-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
127 ///
128 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compress_epi8&expand=1206)
129 #[inline]
130 #[target_feature(enable = "avx512vbmi2,avx512vl")]
131 #[cfg_attr(test, assert_instr(vpcompressb))]
132 pub unsafe fn _mm_mask_compress_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __m128i {
133 transmute(vpcompressb128(a.as_i8x16(), src.as_i8x16(), k))
134 }
135
136 /// Contiguously store the active 8-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
137 ///
138 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_compress_epi8&expand=1207)
139 #[inline]
140 #[target_feature(enable = "avx512vbmi2,avx512vl")]
141 #[cfg_attr(test, assert_instr(vpcompressb))]
142 pub unsafe fn _mm_maskz_compress_epi8(k: __mmask16, a: __m128i) -> __m128i {
143 transmute(vpcompressb128(
144 a.as_i8x16(),
145 _mm_setzero_si128().as_i8x16(),
146 k,
147 ))
148 }
149
150 /// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
151 ///
152 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_expand_epi16&expand=2310)
153 #[inline]
154 #[target_feature(enable = "avx512vbmi2")]
155 #[cfg_attr(test, assert_instr(vpexpandw))]
156 pub unsafe fn _mm512_mask_expand_epi16(src: __m512i, k: __mmask32, a: __m512i) -> __m512i {
157 transmute(vpexpandw(a.as_i16x32(), src.as_i16x32(), k))
158 }
159
160 /// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
161 ///
162 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_expand_epi16&expand=2311)
163 #[inline]
164 #[target_feature(enable = "avx512vbmi2")]
165 #[cfg_attr(test, assert_instr(vpexpandw))]
166 pub unsafe fn _mm512_maskz_expand_epi16(k: __mmask32, a: __m512i) -> __m512i {
167 transmute(vpexpandw(
168 a.as_i16x32(),
169 _mm512_setzero_si512().as_i16x32(),
170 k,
171 ))
172 }
173
174 /// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
175 ///
176 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_expand_epi16&expand=2308)
177 #[inline]
178 #[target_feature(enable = "avx512vbmi2,avx512vl")]
179 #[cfg_attr(test, assert_instr(vpexpandw))]
180 pub unsafe fn _mm256_mask_expand_epi16(src: __m256i, k: __mmask16, a: __m256i) -> __m256i {
181 transmute(vpexpandw256(a.as_i16x16(), src.as_i16x16(), k))
182 }
183
184 /// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
185 ///
186 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_expand_epi16&expand=2309)
187 #[inline]
188 #[target_feature(enable = "avx512vbmi2,avx512vl")]
189 #[cfg_attr(test, assert_instr(vpexpandw))]
190 pub unsafe fn _mm256_maskz_expand_epi16(k: __mmask16, a: __m256i) -> __m256i {
191 transmute(vpexpandw256(
192 a.as_i16x16(),
193 _mm256_setzero_si256().as_i16x16(),
194 k,
195 ))
196 }
197
198 /// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
199 ///
200 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_expand_epi16&expand=2306)
201 #[inline]
202 #[target_feature(enable = "avx512vbmi2,avx512vl")]
203 #[cfg_attr(test, assert_instr(vpexpandw))]
204 pub unsafe fn _mm_mask_expand_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
205 transmute(vpexpandw128(a.as_i16x8(), src.as_i16x8(), k))
206 }
207
208 /// Load contiguous active 16-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
209 ///
210 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_expand_epi16&expand=2307)
211 #[inline]
212 #[target_feature(enable = "avx512vbmi2,avx512vl")]
213 #[cfg_attr(test, assert_instr(vpexpandw))]
214 pub unsafe fn _mm_maskz_expand_epi16(k: __mmask8, a: __m128i) -> __m128i {
215 transmute(vpexpandw128(
216 a.as_i16x8(),
217 _mm_setzero_si128().as_i16x8(),
218 k,
219 ))
220 }
221
222 /// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
223 ///
224 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_expand_epi8&expand=2328)
225 #[inline]
226 #[target_feature(enable = "avx512vbmi2")]
227 #[cfg_attr(test, assert_instr(vpexpandb))]
228 pub unsafe fn _mm512_mask_expand_epi8(src: __m512i, k: __mmask64, a: __m512i) -> __m512i {
229 transmute(vpexpandb(a.as_i8x64(), src.as_i8x64(), k))
230 }
231
232 /// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
233 ///
234 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_expand_epi8&expand=2329)
235 #[inline]
236 #[target_feature(enable = "avx512vbmi2")]
237 #[cfg_attr(test, assert_instr(vpexpandb))]
238 pub unsafe fn _mm512_maskz_expand_epi8(k: __mmask64, a: __m512i) -> __m512i {
239 transmute(vpexpandb(
240 a.as_i8x64(),
241 _mm512_setzero_si512().as_i8x64(),
242 k,
243 ))
244 }
245
246 /// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
247 ///
248 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_expand_epi8&expand=2326)
249 #[inline]
250 #[target_feature(enable = "avx512vbmi2,avx512vl")]
251 #[cfg_attr(test, assert_instr(vpexpandb))]
252 pub unsafe fn _mm256_mask_expand_epi8(src: __m256i, k: __mmask32, a: __m256i) -> __m256i {
253 transmute(vpexpandb256(a.as_i8x32(), src.as_i8x32(), k))
254 }
255
256 /// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
257 ///
258 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_expand_epi8&expand=2327)
259 #[inline]
260 #[target_feature(enable = "avx512vbmi2,avx512vl")]
261 #[cfg_attr(test, assert_instr(vpexpandb))]
262 pub unsafe fn _mm256_maskz_expand_epi8(k: __mmask32, a: __m256i) -> __m256i {
263 transmute(vpexpandb256(
264 a.as_i8x32(),
265 _mm256_setzero_si256().as_i8x32(),
266 k,
267 ))
268 }
269
270 /// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
271 ///
272 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_expand_epi8&expand=2324)
273 #[inline]
274 #[target_feature(enable = "avx512vbmi2,avx512vl")]
275 #[cfg_attr(test, assert_instr(vpexpandb))]
276 pub unsafe fn _mm_mask_expand_epi8(src: __m128i, k: __mmask16, a: __m128i) -> __m128i {
277 transmute(vpexpandb128(a.as_i8x16(), src.as_i8x16(), k))
278 }
279
280 /// Load contiguous active 8-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
281 ///
282 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_expand_epi8&expand=2325)
283 #[inline]
284 #[target_feature(enable = "avx512vbmi2,avx512vl")]
285 #[cfg_attr(test, assert_instr(vpexpandb))]
286 pub unsafe fn _mm_maskz_expand_epi8(k: __mmask16, a: __m128i) -> __m128i {
287 transmute(vpexpandb128(
288 a.as_i8x16(),
289 _mm_setzero_si128().as_i8x16(),
290 k,
291 ))
292 }
293
294 /// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst.
295 ///
296 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shldv_epi64&expand=5087)
297 #[inline]
298 #[target_feature(enable = "avx512vbmi2")]
299 #[cfg_attr(test, assert_instr(vpshldvq))]
300 pub unsafe fn _mm512_shldv_epi64(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
301 transmute(vpshldvq(a.as_i64x8(), b.as_i64x8(), c.as_i64x8()))
302 }
303
304 /// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
305 ///
306 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shldv_epi64&expand=5085)
307 #[inline]
308 #[target_feature(enable = "avx512vbmi2")]
309 #[cfg_attr(test, assert_instr(vpshldvq))]
310 pub unsafe fn _mm512_mask_shldv_epi64(a: __m512i, k: __mmask8, b: __m512i, c: __m512i) -> __m512i {
311 let shf = _mm512_shldv_epi64(a, b, c).as_i64x8();
312 transmute(simd_select_bitmask(k, shf, a.as_i64x8()))
313 }
314
315 /// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
316 ///
317 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shldv_epi64&expand=5086)
318 #[inline]
319 #[target_feature(enable = "avx512vbmi2")]
320 #[cfg_attr(test, assert_instr(vpshldvq))]
321 pub unsafe fn _mm512_maskz_shldv_epi64(k: __mmask8, a: __m512i, b: __m512i, c: __m512i) -> __m512i {
322 let shf = _mm512_shldv_epi64(a, b, c).as_i64x8();
323 let zero = _mm512_setzero_si512().as_i64x8();
324 transmute(simd_select_bitmask(k, shf, zero))
325 }
326
327 /// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst.
328 ///
329 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shldv_epi64&expand=5084)
330 #[inline]
331 #[target_feature(enable = "avx512vbmi2,avx512vl")]
332 #[cfg_attr(test, assert_instr(vpshldvq))]
333 pub unsafe fn _mm256_shldv_epi64(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
334 transmute(vpshldvq256(a.as_i64x4(), b.as_i64x4(), c.as_i64x4()))
335 }
336
337 /// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
338 ///
339 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shldv_epi64&expand=5082)
340 #[inline]
341 #[target_feature(enable = "avx512vbmi2,avx512vl")]
342 #[cfg_attr(test, assert_instr(vpshldvq))]
343 pub unsafe fn _mm256_mask_shldv_epi64(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i {
344 let shf = _mm256_shldv_epi64(a, b, c).as_i64x4();
345 transmute(simd_select_bitmask(k, shf, a.as_i64x4()))
346 }
347
348 /// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
349 ///
350 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shldv_epi64&expand=5083)
351 #[inline]
352 #[target_feature(enable = "avx512vbmi2,avx512vl")]
353 #[cfg_attr(test, assert_instr(vpshldvq))]
354 pub unsafe fn _mm256_maskz_shldv_epi64(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i {
355 let shf = _mm256_shldv_epi64(a, b, c).as_i64x4();
356 let zero = _mm256_setzero_si256().as_i64x4();
357 transmute(simd_select_bitmask(k, shf, zero))
358 }
359
360 /// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst.
361 ///
362 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shldv_epi64&expand=5081)
363 #[inline]
364 #[target_feature(enable = "avx512vbmi2,avx512vl")]
365 #[cfg_attr(test, assert_instr(vpshldvq))]
366 pub unsafe fn _mm_shldv_epi64(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
367 transmute(vpshldvq128(a.as_i64x2(), b.as_i64x2(), c.as_i64x2()))
368 }
369
370 /// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
371 ///
372 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shldv_epi64&expand=5079)
373 #[inline]
374 #[target_feature(enable = "avx512vbmi2,avx512vl")]
375 #[cfg_attr(test, assert_instr(vpshldvq))]
376 pub unsafe fn _mm_mask_shldv_epi64(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i {
377 let shf = _mm_shldv_epi64(a, b, c).as_i64x2();
378 transmute(simd_select_bitmask(k, shf, a.as_i64x2()))
379 }
380
381 /// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
382 ///
383 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shldv_epi64&expand=5080)
384 #[inline]
385 #[target_feature(enable = "avx512vbmi2,avx512vl")]
386 #[cfg_attr(test, assert_instr(vpshldvq))]
387 pub unsafe fn _mm_maskz_shldv_epi64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i {
388 let shf = _mm_shldv_epi64(a, b, c).as_i64x2();
389 let zero = _mm_setzero_si128().as_i64x2();
390 transmute(simd_select_bitmask(k, shf, zero))
391 }
392
393 /// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst.
394 ///
395 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shldv_epi32&expand=5078)
396 #[inline]
397 #[target_feature(enable = "avx512vbmi2")]
398 #[cfg_attr(test, assert_instr(vpshldvd))]
399 pub unsafe fn _mm512_shldv_epi32(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
400 transmute(vpshldvd(a.as_i32x16(), b.as_i32x16(), c.as_i32x16()))
401 }
402
403 /// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
404 ///
405 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shldv_epi32&expand=5076)
406 #[inline]
407 #[target_feature(enable = "avx512vbmi2")]
408 #[cfg_attr(test, assert_instr(vpshldvd))]
409 pub unsafe fn _mm512_mask_shldv_epi32(a: __m512i, k: __mmask16, b: __m512i, c: __m512i) -> __m512i {
410 let shf = _mm512_shldv_epi32(a, b, c).as_i32x16();
411 transmute(simd_select_bitmask(k, shf, a.as_i32x16()))
412 }
413
414 /// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
415 ///
416 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shldv_epi32&expand=5077)
417 #[inline]
418 #[target_feature(enable = "avx512vbmi2")]
419 #[cfg_attr(test, assert_instr(vpshldvd))]
420 pub unsafe fn _mm512_maskz_shldv_epi32(
421 k: __mmask16,
422 a: __m512i,
423 b: __m512i,
424 c: __m512i,
425 ) -> __m512i {
426 let shf = _mm512_shldv_epi32(a, b, c).as_i32x16();
427 let zero = _mm512_setzero_si512().as_i32x16();
428 transmute(simd_select_bitmask(k, shf, zero))
429 }
430
431 /// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst.
432 ///
433 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shldv_epi32&expand=5075)
434 #[inline]
435 #[target_feature(enable = "avx512vbmi2,avx512vl")]
436 #[cfg_attr(test, assert_instr(vpshldvd))]
437 pub unsafe fn _mm256_shldv_epi32(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
438 transmute(vpshldvd256(a.as_i32x8(), b.as_i32x8(), c.as_i32x8()))
439 }
440
441 /// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
442 ///
443 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shldv_epi32&expand=5073)
444 #[inline]
445 #[target_feature(enable = "avx512vbmi2,avx512vl")]
446 #[cfg_attr(test, assert_instr(vpshldvd))]
447 pub unsafe fn _mm256_mask_shldv_epi32(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i {
448 let shf = _mm256_shldv_epi32(a, b, c).as_i32x8();
449 transmute(simd_select_bitmask(k, shf, a.as_i32x8()))
450 }
451
452 /// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
453 ///
454 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shldv_epi32&expand=5074)
455 #[inline]
456 #[target_feature(enable = "avx512vbmi2,avx512vl")]
457 #[cfg_attr(test, assert_instr(vpshldvd))]
458 pub unsafe fn _mm256_maskz_shldv_epi32(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i {
459 let shf = _mm256_shldv_epi32(a, b, c).as_i32x8();
460 let zero = _mm256_setzero_si256().as_i32x8();
461 transmute(simd_select_bitmask(k, shf, zero))
462 }
463
464 /// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst.
465 ///
466 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shldv_epi32&expand=5072)
467 #[inline]
468 #[target_feature(enable = "avx512vbmi2,avx512vl")]
469 #[cfg_attr(test, assert_instr(vpshldvd))]
470 pub unsafe fn _mm_shldv_epi32(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
471 transmute(vpshldvd128(a.as_i32x4(), b.as_i32x4(), c.as_i32x4()))
472 }
473
474 /// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
475 ///
476 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shldv_epi32&expand=5070)
477 #[inline]
478 #[target_feature(enable = "avx512vbmi2,avx512vl")]
479 #[cfg_attr(test, assert_instr(vpshldvd))]
480 pub unsafe fn _mm_mask_shldv_epi32(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i {
481 let shf = _mm_shldv_epi32(a, b, c).as_i32x4();
482 transmute(simd_select_bitmask(k, shf, a.as_i32x4()))
483 }
484
485 /// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
486 ///
487 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shldv_epi32&expand=5071)
488 #[inline]
489 #[target_feature(enable = "avx512vbmi2,avx512vl")]
490 #[cfg_attr(test, assert_instr(vpshldvd))]
491 pub unsafe fn _mm_maskz_shldv_epi32(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i {
492 let shf = _mm_shldv_epi32(a, b, c).as_i32x4();
493 let zero = _mm_setzero_si128().as_i32x4();
494 transmute(simd_select_bitmask(k, shf, zero))
495 }
496
497 /// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst.
498 ///
499 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shldv_epi16&expand=5069)
500 #[inline]
501 #[target_feature(enable = "avx512vbmi2")]
502 #[cfg_attr(test, assert_instr(vpshldvw))]
503 pub unsafe fn _mm512_shldv_epi16(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
504 transmute(vpshldvw(a.as_i16x32(), b.as_i16x32(), c.as_i16x32()))
505 }
506
507 /// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
508 ///
509 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shldv_epi16&expand=5067)
510 #[inline]
511 #[target_feature(enable = "avx512vbmi2")]
512 #[cfg_attr(test, assert_instr(vpshldvw))]
513 pub unsafe fn _mm512_mask_shldv_epi16(a: __m512i, k: __mmask32, b: __m512i, c: __m512i) -> __m512i {
514 let shf = _mm512_shldv_epi16(a, b, c).as_i16x32();
515 transmute(simd_select_bitmask(k, shf, a.as_i16x32()))
516 }
517
518 /// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
519 ///
520 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shldv_epi16&expand=5068)
521 #[inline]
522 #[target_feature(enable = "avx512vbmi2")]
523 #[cfg_attr(test, assert_instr(vpshldvw))]
524 pub unsafe fn _mm512_maskz_shldv_epi16(
525 k: __mmask32,
526 a: __m512i,
527 b: __m512i,
528 c: __m512i,
529 ) -> __m512i {
530 let shf = _mm512_shldv_epi16(a, b, c).as_i16x32();
531 let zero = _mm512_setzero_si512().as_i16x32();
532 transmute(simd_select_bitmask(k, shf, zero))
533 }
534
535 /// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst.
536 ///
537 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shldv_epi16&expand=5066)
538 #[inline]
539 #[target_feature(enable = "avx512vbmi2,avx512vl")]
540 #[cfg_attr(test, assert_instr(vpshldvw))]
541 pub unsafe fn _mm256_shldv_epi16(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
542 transmute(vpshldvw256(a.as_i16x16(), b.as_i16x16(), c.as_i16x16()))
543 }
544
545 /// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
546 ///
547 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shldv_epi16&expand=5064)
548 #[inline]
549 #[target_feature(enable = "avx512vbmi2,avx512vl")]
550 #[cfg_attr(test, assert_instr(vpshldvw))]
551 pub unsafe fn _mm256_mask_shldv_epi16(a: __m256i, k: __mmask16, b: __m256i, c: __m256i) -> __m256i {
552 let shf = _mm256_shldv_epi16(a, b, c).as_i16x16();
553 transmute(simd_select_bitmask(k, shf, a.as_i16x16()))
554 }
555
556 /// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
557 ///
558 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shldv_epi16&expand=5065)
559 #[inline]
560 #[target_feature(enable = "avx512vbmi2,avx512vl")]
561 #[cfg_attr(test, assert_instr(vpshldvw))]
562 pub unsafe fn _mm256_maskz_shldv_epi16(
563 k: __mmask16,
564 a: __m256i,
565 b: __m256i,
566 c: __m256i,
567 ) -> __m256i {
568 let shf = _mm256_shldv_epi16(a, b, c).as_i16x16();
569 let zero = _mm256_setzero_si256().as_i16x16();
570 transmute(simd_select_bitmask(k, shf, zero))
571 }
572
573 /// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst.
574 ///
575 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shldv_epi16&expand=5063)
576 #[inline]
577 #[target_feature(enable = "avx512vbmi2,avx512vl")]
578 #[cfg_attr(test, assert_instr(vpshldvw))]
579 pub unsafe fn _mm_shldv_epi16(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
580 transmute(vpshldvw128(a.as_i16x8(), b.as_i16x8(), c.as_i16x8()))
581 }
582
583 /// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
584 ///
585 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shldv_epi16&expand=5061)
586 #[inline]
587 #[target_feature(enable = "avx512vbmi2,avx512vl")]
588 #[cfg_attr(test, assert_instr(vpshldvw))]
589 pub unsafe fn _mm_mask_shldv_epi16(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i {
590 let shf = _mm_shldv_epi16(a, b, c).as_i16x8();
591 transmute(simd_select_bitmask(k, shf, a.as_i16x8()))
592 }
593
594 /// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by the amount specified in the corresponding element of c, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
595 ///
596 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shldv_epi16&expand=5062)
597 #[inline]
598 #[target_feature(enable = "avx512vbmi2,avx512vl")]
599 #[cfg_attr(test, assert_instr(vpshldvw))]
600 pub unsafe fn _mm_maskz_shldv_epi16(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i {
601 let shf = _mm_shldv_epi16(a, b, c).as_i16x8();
602 let zero = _mm_setzero_si128().as_i16x8();
603 transmute(simd_select_bitmask(k, shf, zero))
604 }
605
606 /// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst.
607 ///
608 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shrdv_epi64&expand=5141)
609 #[inline]
610 #[target_feature(enable = "avx512vbmi2")]
611 #[cfg_attr(test, assert_instr(vpshrdvq))]
612 pub unsafe fn _mm512_shrdv_epi64(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
613 transmute(vpshrdvq(a.as_i64x8(), b.as_i64x8(), c.as_i64x8()))
614 }
615
616 /// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
617 ///
618 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shrdv_epi64&expand=5139)
619 #[inline]
620 #[target_feature(enable = "avx512vbmi2")]
621 #[cfg_attr(test, assert_instr(vpshrdvq))]
622 pub unsafe fn _mm512_mask_shrdv_epi64(a: __m512i, k: __mmask8, b: __m512i, c: __m512i) -> __m512i {
623 let shf = _mm512_shrdv_epi64(a, b, c).as_i64x8();
624 transmute(simd_select_bitmask(k, shf, a.as_i64x8()))
625 }
626
627 /// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
628 ///
629 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shrdv_epi64&expand=5140)
630 #[inline]
631 #[target_feature(enable = "avx512vbmi2")]
632 #[cfg_attr(test, assert_instr(vpshrdvq))]
633 pub unsafe fn _mm512_maskz_shrdv_epi64(k: __mmask8, a: __m512i, b: __m512i, c: __m512i) -> __m512i {
634 let shf = _mm512_shrdv_epi64(a, b, c).as_i64x8();
635 let zero = _mm512_setzero_si512().as_i64x8();
636 transmute(simd_select_bitmask(k, shf, zero))
637 }
638
639 /// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst.
640 ///
641 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shrdv_epi64&expand=5138)
642 #[inline]
643 #[target_feature(enable = "avx512vbmi2,avx512vl")]
644 #[cfg_attr(test, assert_instr(vpshrdvq))]
645 pub unsafe fn _mm256_shrdv_epi64(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
646 transmute(vpshrdvq256(a.as_i64x4(), b.as_i64x4(), c.as_i64x4()))
647 }
648
649 /// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
650 ///
651 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shrdv_epi64&expand=5136)
652 #[inline]
653 #[target_feature(enable = "avx512vbmi2,avx512vl")]
654 #[cfg_attr(test, assert_instr(vpshrdvq))]
655 pub unsafe fn _mm256_mask_shrdv_epi64(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i {
656 let shf = _mm256_shrdv_epi64(a, b, c).as_i64x4();
657 transmute(simd_select_bitmask(k, shf, a.as_i64x4()))
658 }
659
660 /// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
661 ///
662 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shrdv_epi64&expand=5137)
663 #[inline]
664 #[target_feature(enable = "avx512vbmi2,avx512vl")]
665 #[cfg_attr(test, assert_instr(vpshrdvq))]
666 pub unsafe fn _mm256_maskz_shrdv_epi64(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i {
667 let shf = _mm256_shrdv_epi64(a, b, c).as_i64x4();
668 let zero = _mm256_setzero_si256().as_i64x4();
669 transmute(simd_select_bitmask(k, shf, zero))
670 }
671
672 /// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst.
673 ///
674 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shrdv_epi64&expand=5135)
675 #[inline]
676 #[target_feature(enable = "avx512vbmi2,avx512vl")]
677 #[cfg_attr(test, assert_instr(vpshrdvq))]
678 pub unsafe fn _mm_shrdv_epi64(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
679 transmute(vpshrdvq128(a.as_i64x2(), b.as_i64x2(), c.as_i64x2()))
680 }
681
682 /// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
683 ///
684 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shrdv_epi64&expand=5133)
685 #[inline]
686 #[target_feature(enable = "avx512vbmi2,avx512vl")]
687 #[cfg_attr(test, assert_instr(vpshrdvq))]
688 pub unsafe fn _mm_mask_shrdv_epi64(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i {
689 let shf = _mm_shrdv_epi64(a, b, c).as_i64x2();
690 transmute(simd_select_bitmask(k, shf, a.as_i64x2()))
691 }
692
693 /// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
694 ///
695 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shrdv_epi64&expand=5134)
696 #[inline]
697 #[target_feature(enable = "avx512vbmi2,avx512vl")]
698 #[cfg_attr(test, assert_instr(vpshrdvq))]
699 pub unsafe fn _mm_maskz_shrdv_epi64(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i {
700 let shf = _mm_shrdv_epi64(a, b, c).as_i64x2();
701 let zero = _mm_setzero_si128().as_i64x2();
702 transmute(simd_select_bitmask(k, shf, zero))
703 }
704
705 /// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst.
706 ///
707 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shrdv_epi32&expand=5132)
708 #[inline]
709 #[target_feature(enable = "avx512vbmi2")]
710 #[cfg_attr(test, assert_instr(vpshrdvd))]
711 pub unsafe fn _mm512_shrdv_epi32(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
712 transmute(vpshrdvd(a.as_i32x16(), b.as_i32x16(), c.as_i32x16()))
713 }
714
715 /// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
716 ///
717 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shrdv_epi32&expand=5130)
718 #[inline]
719 #[target_feature(enable = "avx512vbmi2")]
720 #[cfg_attr(test, assert_instr(vpshrdvd))]
721 pub unsafe fn _mm512_mask_shrdv_epi32(a: __m512i, k: __mmask16, b: __m512i, c: __m512i) -> __m512i {
722 let shf = _mm512_shrdv_epi32(a, b, c).as_i32x16();
723 transmute(simd_select_bitmask(k, shf, a.as_i32x16()))
724 }
725
726 /// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
727 ///
728 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shrdv_epi32&expand=5131)
729 #[inline]
730 #[target_feature(enable = "avx512vbmi2")]
731 #[cfg_attr(test, assert_instr(vpshrdvd))]
732 pub unsafe fn _mm512_maskz_shrdv_epi32(
733 k: __mmask16,
734 a: __m512i,
735 b: __m512i,
736 c: __m512i,
737 ) -> __m512i {
738 let shf = _mm512_shrdv_epi32(a, b, c).as_i32x16();
739 let zero = _mm512_setzero_si512().as_i32x16();
740 transmute(simd_select_bitmask(k, shf, zero))
741 }
742
743 /// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst.
744 ///
745 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shrdv_epi32&expand=5129)
746 #[inline]
747 #[target_feature(enable = "avx512vbmi2,avx512vl")]
748 #[cfg_attr(test, assert_instr(vpshrdvd))]
749 pub unsafe fn _mm256_shrdv_epi32(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
750 transmute(vpshrdvd256(a.as_i32x8(), b.as_i32x8(), c.as_i32x8()))
751 }
752
753 /// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
754 ///
755 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shrdv_epi32&expand=5127)
756 #[inline]
757 #[target_feature(enable = "avx512vbmi2,avx512vl")]
758 #[cfg_attr(test, assert_instr(vpshrdvd))]
759 pub unsafe fn _mm256_mask_shrdv_epi32(a: __m256i, k: __mmask8, b: __m256i, c: __m256i) -> __m256i {
760 let shf = _mm256_shrdv_epi32(a, b, c).as_i32x8();
761 transmute(simd_select_bitmask(k, shf, a.as_i32x8()))
762 }
763
764 /// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
765 ///
766 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shrdv_epi32&expand=5128)
767 #[inline]
768 #[target_feature(enable = "avx512vbmi2,avx512vl")]
769 #[cfg_attr(test, assert_instr(vpshrdvd))]
770 pub unsafe fn _mm256_maskz_shrdv_epi32(k: __mmask8, a: __m256i, b: __m256i, c: __m256i) -> __m256i {
771 let shf = _mm256_shrdv_epi32(a, b, c).as_i32x8();
772 let zero = _mm256_setzero_si256().as_i32x8();
773 transmute(simd_select_bitmask(k, shf, zero))
774 }
775
776 /// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst.
777 ///
778 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shrdv_epi32&expand=5126)
779 #[inline]
780 #[target_feature(enable = "avx512vbmi2,avx512vl")]
781 #[cfg_attr(test, assert_instr(vpshrdvd))]
782 pub unsafe fn _mm_shrdv_epi32(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
783 transmute(vpshrdvd128(a.as_i32x4(), b.as_i32x4(), c.as_i32x4()))
784 }
785
786 /// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
787 ///
788 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shrdv_epi32&expand=5124)
789 #[inline]
790 #[target_feature(enable = "avx512vbmi2,avx512vl")]
791 #[cfg_attr(test, assert_instr(vpshrdvd))]
792 pub unsafe fn _mm_mask_shrdv_epi32(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i {
793 let shf = _mm_shrdv_epi32(a, b, c).as_i32x4();
794 transmute(simd_select_bitmask(k, shf, a.as_i32x4()))
795 }
796
797 /// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
798 ///
799 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shrdv_epi32&expand=5125)
800 #[inline]
801 #[target_feature(enable = "avx512vbmi2,avx512vl")]
802 #[cfg_attr(test, assert_instr(vpshrdvd))]
803 pub unsafe fn _mm_maskz_shrdv_epi32(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i {
804 let shf = _mm_shrdv_epi32(a, b, c).as_i32x4();
805 let zero = _mm_setzero_si128().as_i32x4();
806 transmute(simd_select_bitmask(k, shf, zero))
807 }
808
809 /// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst.
810 ///
811 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shrdv_epi16&expand=5123)
812 #[inline]
813 #[target_feature(enable = "avx512vbmi2")]
814 #[cfg_attr(test, assert_instr(vpshrdvw))]
815 pub unsafe fn _mm512_shrdv_epi16(a: __m512i, b: __m512i, c: __m512i) -> __m512i {
816 transmute(vpshrdvw(a.as_i16x32(), b.as_i16x32(), c.as_i16x32()))
817 }
818
819 /// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
820 ///
821 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shrdv_epi16&expand=5121)
822 #[inline]
823 #[target_feature(enable = "avx512vbmi2")]
824 #[cfg_attr(test, assert_instr(vpshrdvw))]
825 pub unsafe fn _mm512_mask_shrdv_epi16(a: __m512i, k: __mmask32, b: __m512i, c: __m512i) -> __m512i {
826 let shf = _mm512_shrdv_epi16(a, b, c).as_i16x32();
827 transmute(simd_select_bitmask(k, shf, a.as_i16x32()))
828 }
829
830 /// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
831 ///
832 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shrdv_epi16&expand=5122)
833 #[inline]
834 #[target_feature(enable = "avx512vbmi2")]
835 #[cfg_attr(test, assert_instr(vpshrdvw))]
836 pub unsafe fn _mm512_maskz_shrdv_epi16(
837 k: __mmask32,
838 a: __m512i,
839 b: __m512i,
840 c: __m512i,
841 ) -> __m512i {
842 let shf = _mm512_shrdv_epi16(a, b, c).as_i16x32();
843 let zero = _mm512_setzero_si512().as_i16x32();
844 transmute(simd_select_bitmask(k, shf, zero))
845 }
846
847 /// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst.
848 ///
849 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shrdv_epi16&expand=5120)
850 #[inline]
851 #[target_feature(enable = "avx512vbmi2,avx512vl")]
852 #[cfg_attr(test, assert_instr(vpshrdvw))]
853 pub unsafe fn _mm256_shrdv_epi16(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
854 transmute(vpshrdvw256(a.as_i16x16(), b.as_i16x16(), c.as_i16x16()))
855 }
856
857 /// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
858 ///
859 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shrdv_epi16&expand=5118)
860 #[inline]
861 #[target_feature(enable = "avx512vbmi2,avx512vl")]
862 #[cfg_attr(test, assert_instr(vpshrdvw))]
863 pub unsafe fn _mm256_mask_shrdv_epi16(a: __m256i, k: __mmask16, b: __m256i, c: __m256i) -> __m256i {
864 let shf = _mm256_shrdv_epi16(a, b, c).as_i16x16();
865 transmute(simd_select_bitmask(k, shf, a.as_i16x16()))
866 }
867
868 /// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
869 ///
870 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shrdv_epi16&expand=5119)
871 #[inline]
872 #[target_feature(enable = "avx512vbmi2,avx512vl")]
873 #[cfg_attr(test, assert_instr(vpshrdvw))]
874 pub unsafe fn _mm256_maskz_shrdv_epi16(
875 k: __mmask16,
876 a: __m256i,
877 b: __m256i,
878 c: __m256i,
879 ) -> __m256i {
880 let shf = _mm256_shrdv_epi16(a, b, c).as_i16x16();
881 let zero = _mm256_setzero_si256().as_i16x16();
882 transmute(simd_select_bitmask(k, shf, zero))
883 }
884
885 /// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst.
886 ///
887 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shrdv_epi16&expand=5117)
888 #[inline]
889 #[target_feature(enable = "avx512vbmi2,avx512vl")]
890 #[cfg_attr(test, assert_instr(vpshrdvw))]
891 pub unsafe fn _mm_shrdv_epi16(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
892 transmute(vpshrdvw128(a.as_i16x8(), b.as_i16x8(), c.as_i16x8()))
893 }
894
895 /// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
896 ///
897 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shrdv_epi16&expand=5115)
898 #[inline]
899 #[target_feature(enable = "avx512vbmi2,avx512vl")]
900 #[cfg_attr(test, assert_instr(vpshrdvw))]
901 pub unsafe fn _mm_mask_shrdv_epi16(a: __m128i, k: __mmask8, b: __m128i, c: __m128i) -> __m128i {
902 let shf = _mm_shrdv_epi16(a, b, c).as_i16x8();
903 transmute(simd_select_bitmask(k, shf, a.as_i16x8()))
904 }
905
906 /// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by the amount specified in the corresponding element of c, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
907 ///
908 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shrdv_epi16&expand=5116)
909 #[inline]
910 #[target_feature(enable = "avx512vbmi2,avx512vl")]
911 #[cfg_attr(test, assert_instr(vpshrdvw))]
912 pub unsafe fn _mm_maskz_shrdv_epi16(k: __mmask8, a: __m128i, b: __m128i, c: __m128i) -> __m128i {
913 let shf = _mm_shrdv_epi16(a, b, c).as_i16x8();
914 let zero = _mm_setzero_si128().as_i16x8();
915 transmute(simd_select_bitmask(k, shf, zero))
916 }
917
918 /// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst).
919 ///
920 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shldi_epi64&expand=5060)
921 #[inline]
922 #[target_feature(enable = "avx512vbmi2")]
923 #[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))]
924 #[rustc_args_required_const(2)]
925 pub unsafe fn _mm512_shldi_epi64(a: __m512i, b: __m512i, imm8: i32) -> __m512i {
926 assert!(imm8 >= 0 && imm8 <= 255);
927 transmute(vpshldvq(
928 a.as_i64x8(),
929 b.as_i64x8(),
930 _mm512_set1_epi64(imm8 as i64).as_i64x8(),
931 ))
932 }
933
934 /// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
935 ///
936 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shldi_epi64&expand=5058)
937 #[inline]
938 #[target_feature(enable = "avx512vbmi2")]
939 #[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))]
940 #[rustc_args_required_const(4)]
941 pub unsafe fn _mm512_mask_shldi_epi64(
942 src: __m512i,
943 k: __mmask8,
944 a: __m512i,
945 b: __m512i,
946 imm8: i32,
947 ) -> __m512i {
948 assert!(imm8 >= 0 && imm8 <= 255);
949 let shf: i64x8 = vpshldvq(
950 a.as_i64x8(),
951 b.as_i64x8(),
952 _mm512_set1_epi64(imm8 as i64).as_i64x8(),
953 );
954 transmute(simd_select_bitmask(k, shf, src.as_i64x8()))
955 }
956
957 /// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
958 ///
959 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shldi_epi64&expand=5059)
960 #[inline]
961 #[target_feature(enable = "avx512vbmi2")]
962 #[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))]
963 #[rustc_args_required_const(3)]
964 pub unsafe fn _mm512_maskz_shldi_epi64(k: __mmask8, a: __m512i, b: __m512i, imm8: i32) -> __m512i {
965 assert!(imm8 >= 0 && imm8 <= 255);
966 let shf: i64x8 = vpshldvq(
967 a.as_i64x8(),
968 b.as_i64x8(),
969 _mm512_set1_epi64(imm8 as i64).as_i64x8(),
970 );
971 let zero = _mm512_setzero_si512().as_i64x8();
972 transmute(simd_select_bitmask(k, shf, zero))
973 }
974
975 /// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst.
976 ///
977 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shldi_epi64&expand=5057)
978 #[inline]
979 #[target_feature(enable = "avx512vbmi2,avx512vl")]
980 #[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))]
981 #[rustc_args_required_const(2)]
982 pub unsafe fn _mm256_shldi_epi64(a: __m256i, b: __m256i, imm8: i32) -> __m256i {
983 assert!(imm8 >= 0 && imm8 <= 255);
984 transmute(vpshldvq256(
985 a.as_i64x4(),
986 b.as_i64x4(),
987 _mm256_set1_epi64x(imm8 as i64).as_i64x4(),
988 ))
989 }
990
991 /// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
992 ///
993 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shldi_epi64&expand=5055)
994 #[inline]
995 #[target_feature(enable = "avx512vbmi2,avx512vl")]
996 #[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))]
997 #[rustc_args_required_const(4)]
998 pub unsafe fn _mm256_mask_shldi_epi64(
999 src: __m256i,
1000 k: __mmask8,
1001 a: __m256i,
1002 b: __m256i,
1003 imm8: i32,
1004 ) -> __m256i {
1005 assert!(imm8 >= 0 && imm8 <= 255);
1006 let shf: i64x4 = vpshldvq256(
1007 a.as_i64x4(),
1008 b.as_i64x4(),
1009 _mm256_set1_epi64x(imm8 as i64).as_i64x4(),
1010 );
1011 transmute(simd_select_bitmask(k, shf, src.as_i64x4()))
1012 }
1013
1014 /// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1015 ///
1016 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shldi_epi64&expand=5056)
1017 #[inline]
1018 #[target_feature(enable = "avx512vbmi2,avx512vl")]
1019 #[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))]
1020 #[rustc_args_required_const(3)]
1021 pub unsafe fn _mm256_maskz_shldi_epi64(k: __mmask8, a: __m256i, b: __m256i, imm8: i32) -> __m256i {
1022 assert!(imm8 >= 0 && imm8 <= 255);
1023 let shf: i64x4 = vpshldvq256(
1024 a.as_i64x4(),
1025 b.as_i64x4(),
1026 _mm256_set1_epi64x(imm8 as i64).as_i64x4(),
1027 );
1028 let zero = _mm256_setzero_si256().as_i64x4();
1029 transmute(simd_select_bitmask(k, shf, zero))
1030 }
1031
1032 /// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst.
1033 ///
1034 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shldi_epi64&expand=5054)
1035 #[inline]
1036 #[target_feature(enable = "avx512vbmi2,avx512vl")]
1037 #[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))]
1038 #[rustc_args_required_const(2)]
1039 pub unsafe fn _mm_shldi_epi64(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
1040 assert!(imm8 >= 0 && imm8 <= 255);
1041 transmute(vpshldvq128(
1042 a.as_i64x2(),
1043 b.as_i64x2(),
1044 _mm_set1_epi64x(imm8 as i64).as_i64x2(),
1045 ))
1046 }
1047
1048 /// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
1049 ///
1050 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shldi_epi64&expand=5052)
1051 #[inline]
1052 #[target_feature(enable = "avx512vbmi2,avx512vl")]
1053 #[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))]
1054 #[rustc_args_required_const(4)]
1055 pub unsafe fn _mm_mask_shldi_epi64(
1056 src: __m128i,
1057 k: __mmask8,
1058 a: __m128i,
1059 b: __m128i,
1060 imm8: i32,
1061 ) -> __m128i {
1062 assert!(imm8 >= 0 && imm8 <= 255);
1063 let shf: i64x2 = vpshldvq128(
1064 a.as_i64x2(),
1065 b.as_i64x2(),
1066 _mm_set1_epi64x(imm8 as i64).as_i64x2(),
1067 );
1068 transmute(simd_select_bitmask(k, shf, src.as_i64x2()))
1069 }
1070
1071 /// Concatenate packed 64-bit integers in a and b producing an intermediate 128-bit result. Shift the result left by imm8 bits, and store the upper 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1072 ///
1073 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shldi_epi64&expand=5053)
1074 #[inline]
1075 #[target_feature(enable = "avx512vbmi2,avx512vl")]
1076 #[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))]
1077 #[rustc_args_required_const(3)]
1078 pub unsafe fn _mm_maskz_shldi_epi64(k: __mmask8, a: __m128i, b: __m128i, imm8: i32) -> __m128i {
1079 assert!(imm8 >= 0 && imm8 <= 255);
1080 let shf: i64x2 = vpshldvq128(
1081 a.as_i64x2(),
1082 b.as_i64x2(),
1083 _mm_set1_epi64x(imm8 as i64).as_i64x2(),
1084 );
1085 let zero = _mm_setzero_si128().as_i64x2();
1086 transmute(simd_select_bitmask(k, shf, zero))
1087 }
1088
1089 /// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst.
1090 ///
1091 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shldi_epi32&expand=5051)
1092 #[inline]
1093 #[target_feature(enable = "avx512vbmi2")]
1094 #[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))]
1095 #[rustc_args_required_const(2)]
1096 pub unsafe fn _mm512_shldi_epi32(a: __m512i, b: __m512i, imm8: i32) -> __m512i {
1097 assert!(imm8 >= 0 && imm8 <= 255);
1098 transmute(vpshldvd(
1099 a.as_i32x16(),
1100 b.as_i32x16(),
1101 _mm512_set1_epi32(imm8).as_i32x16(),
1102 ))
1103 }
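// Worked per-lane example (illustrative, not part of the test suite below): with a lane
// of a = 0x12345678, a lane of b = 0x9ABCDEF0 and imm8 = 8, the 64-bit intermediate is
// 0x12345678_9ABCDEF0 and its upper 32 bits after shifting left by 8 are 0x3456789A.
#[allow(dead_code)]
fn shldi32_lane_example() {
    let (a, b, s) = (0x1234_5678u32, 0x9ABC_DEF0u32, 8u32);
    let lane = (a << s) | (b >> (32 - s));
    assert_eq!(lane, 0x3456_789A);
}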
1104
1105 /// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
1106 ///
1107 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shldi_epi32&expand=5049)
1108 #[inline]
1109 #[target_feature(enable = "avx512vbmi2")]
1110 #[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))]
1111 #[rustc_args_required_const(4)]
1112 pub unsafe fn _mm512_mask_shldi_epi32(
1113 src: __m512i,
1114 k: __mmask16,
1115 a: __m512i,
1116 b: __m512i,
1117 imm8: i32,
1118 ) -> __m512i {
1119 assert!(imm8 >= 0 && imm8 <= 255);
1120 let shf: i32x16 = vpshldvd(
1121 a.as_i32x16(),
1122 b.as_i32x16(),
1123 _mm512_set1_epi32(imm8).as_i32x16(),
1124 );
1125 transmute(simd_select_bitmask(k, shf, src.as_i32x16()))
1126 }
1127
1128 /// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1129 ///
1130 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shldi_epi32&expand=5050)
1131 #[inline]
1132 #[target_feature(enable = "avx512vbmi2")]
1133 #[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))]
1134 #[rustc_args_required_const(3)]
1135 pub unsafe fn _mm512_maskz_shldi_epi32(k: __mmask16, a: __m512i, b: __m512i, imm8: i32) -> __m512i {
1136 assert!(imm8 >= 0 && imm8 <= 255);
1137 let shf: i32x16 = vpshldvd(
1138 a.as_i32x16(),
1139 b.as_i32x16(),
1140 _mm512_set1_epi32(imm8).as_i32x16(),
1141 );
1142 let zero = _mm512_setzero_si512().as_i32x16();
1143 transmute(simd_select_bitmask(k, shf, zero))
1144 }
1145
1146 /// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst.
1147 ///
1148 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shldi_epi32&expand=5048)
1149 #[inline]
1150 #[target_feature(enable = "avx512vbmi2,avx512vl")]
1151 #[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))]
1152 #[rustc_args_required_const(2)]
1153 pub unsafe fn _mm256_shldi_epi32(a: __m256i, b: __m256i, imm8: i32) -> __m256i {
1154 assert!(imm8 >= 0 && imm8 <= 255);
1155 transmute(vpshldvd256(
1156 a.as_i32x8(),
1157 b.as_i32x8(),
1158 _mm256_set1_epi32(imm8).as_i32x8(),
1159 ))
1160 }
1161
1162 /// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
1163 ///
1164 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shldi_epi32&expand=5046)
1165 #[inline]
1166 #[target_feature(enable = "avx512vbmi2,avx512vl")]
1167 #[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))]
1168 #[rustc_args_required_const(4)]
1169 pub unsafe fn _mm256_mask_shldi_epi32(
1170 src: __m256i,
1171 k: __mmask8,
1172 a: __m256i,
1173 b: __m256i,
1174 imm8: i32,
1175 ) -> __m256i {
1176 assert!(imm8 >= 0 && imm8 <= 255);
1177 let shf: i32x8 = vpshldvd256(
1178 a.as_i32x8(),
1179 b.as_i32x8(),
1180 _mm256_set1_epi32(imm8).as_i32x8(),
1181 );
1182 transmute(simd_select_bitmask(k, shf, src.as_i32x8()))
1183 }
1184
1185 /// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1186 ///
1187 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shldi_epi32&expand=5047)
1188 #[inline]
1189 #[target_feature(enable = "avx512vbmi2,avx512vl")]
1190 #[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))]
1191 #[rustc_args_required_const(3)]
1192 pub unsafe fn _mm256_maskz_shldi_epi32(k: __mmask8, a: __m256i, b: __m256i, imm8: i32) -> __m256i {
1193 assert!(imm8 >= 0 && imm8 <= 255);
1194 let shf: i32x8 = vpshldvd256(
1195 a.as_i32x8(),
1196 b.as_i32x8(),
1197 _mm256_set1_epi32(imm8).as_i32x8(),
1198 );
1199 let zero = _mm256_setzero_si256().as_i32x8();
1200 transmute(simd_select_bitmask(k, shf, zero))
1201 }
1202
1203 /// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst.
1204 ///
1205 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shldi_epi32&expand=5045)
1206 #[inline]
1207 #[target_feature(enable = "avx512vbmi2,avx512vl")]
1208 #[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))]
1209 #[rustc_args_required_const(2)]
1210 pub unsafe fn _mm_shldi_epi32(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
1211 assert!(imm8 >= 0 && imm8 <= 255);
1212 transmute(vpshldvd128(
1213 a.as_i32x4(),
1214 b.as_i32x4(),
1215 _mm_set1_epi32(imm8).as_i32x4(),
1216 ))
1217 }
1218
1219 /// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
1220 ///
1221 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shldi_epi32&expand=5043)
1222 #[inline]
1223 #[target_feature(enable = "avx512vbmi2,avx512vl")]
1224 #[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))]
1225 #[rustc_args_required_const(4)]
1226 pub unsafe fn _mm_mask_shldi_epi32(
1227 src: __m128i,
1228 k: __mmask8,
1229 a: __m128i,
1230 b: __m128i,
1231 imm8: i32,
1232 ) -> __m128i {
1233 assert!(imm8 >= 0 && imm8 <= 255);
1234 let shf: i32x4 = vpshldvd128(a.as_i32x4(), b.as_i32x4(), _mm_set1_epi32(imm8).as_i32x4());
1235 transmute(simd_select_bitmask(k, shf, src.as_i32x4()))
1236 }
1237
1238 /// Concatenate packed 32-bit integers in a and b producing an intermediate 64-bit result. Shift the result left by imm8 bits, and store the upper 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1239 ///
1240 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shldi_epi32&expand=5044)
1241 #[inline]
1242 #[target_feature(enable = "avx512vbmi2,avx512vl")]
1243 #[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))]
1244 #[rustc_args_required_const(3)]
1245 pub unsafe fn _mm_maskz_shldi_epi32(k: __mmask8, a: __m128i, b: __m128i, imm8: i32) -> __m128i {
1246 assert!(imm8 >= 0 && imm8 <= 255);
1247 let shf: i32x4 = vpshldvd128(a.as_i32x4(), b.as_i32x4(), _mm_set1_epi32(imm8).as_i32x4());
1248 let zero = _mm_setzero_si128().as_i32x4();
1249 transmute(simd_select_bitmask(k, shf, zero))
1250 }
1251
1252 /// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst.
1253 ///
1254 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shldi_epi16&expand=5042)
1255 #[inline]
1256 #[target_feature(enable = "avx512vbmi2")]
1257 #[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))]
1258 #[rustc_args_required_const(2)]
1259 pub unsafe fn _mm512_shldi_epi16(a: __m512i, b: __m512i, imm8: i32) -> __m512i {
1260 assert!(imm8 >= 0 && imm8 <= 255);
1261 transmute(vpshldvw(
1262 a.as_i16x32(),
1263 b.as_i16x32(),
1264 _mm512_set1_epi16(imm8 as i16).as_i16x32(),
1265 ))
1266 }
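// The count for these 16-bit forms is reduced modulo the lane width, so an immediate of
// 20 selects the same bits as an immediate of 4. A scalar check of that equivalence
// (illustrative only; the closure models concat-shift-take-upper on one lane):
#[allow(dead_code)]
fn shldi16_count_wraps() {
    let (a, b) = (0xABCDu32, 0x1234u32);
    let lane = |s: u32| (((a << 16 | b) << (s % 16)) >> 16) & 0xFFFF;
    assert_eq!(lane(20), lane(4));
}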
1267
1268 /// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
1269 ///
1270 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shldi_epi16&expand=5040)
1271 #[inline]
1272 #[target_feature(enable = "avx512vbmi2")]
1273 #[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))]
1274 #[rustc_args_required_const(4)]
1275 pub unsafe fn _mm512_mask_shldi_epi16(
1276 src: __m512i,
1277 k: __mmask32,
1278 a: __m512i,
1279 b: __m512i,
1280 imm8: i32,
1281 ) -> __m512i {
1282 assert!(imm8 >= 0 && imm8 <= 255);
1283 let shf: i16x32 = vpshldvw(
1284 a.as_i16x32(),
1285 b.as_i16x32(),
1286 _mm512_set1_epi16(imm8 as i16).as_i16x32(),
1287 );
1288 transmute(simd_select_bitmask(k, shf, src.as_i16x32()))
1289 }
1290
1291 /// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1292 ///
1293 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shldi_epi16&expand=5041)
1294 #[inline]
1295 #[target_feature(enable = "avx512vbmi2")]
1296 #[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))]
1297 #[rustc_args_required_const(3)]
1298 pub unsafe fn _mm512_maskz_shldi_epi16(k: __mmask32, a: __m512i, b: __m512i, imm8: i32) -> __m512i {
1299 assert!(imm8 >= 0 && imm8 <= 255);
1300 let shf: i16x32 = vpshldvw(
1301 a.as_i16x32(),
1302 b.as_i16x32(),
1303 _mm512_set1_epi16(imm8 as i16).as_i16x32(),
1304 );
1305 let zero = _mm512_setzero_si512().as_i16x32();
1306 transmute(simd_select_bitmask(k, shf, zero))
1307 }
1308
1309 /// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst.
1310 ///
1311 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shldi_epi16&expand=5039)
1312 #[inline]
1313 #[target_feature(enable = "avx512vbmi2,avx512vl")]
1314 #[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))]
1315 #[rustc_args_required_const(2)]
1316 pub unsafe fn _mm256_shldi_epi16(a: __m256i, b: __m256i, imm8: i32) -> __m256i {
1317 assert!(imm8 >= 0 && imm8 <= 255);
1318 transmute(vpshldvw256(
1319 a.as_i16x16(),
1320 b.as_i16x16(),
1321 _mm256_set1_epi16(imm8 as i16).as_i16x16(),
1322 ))
1323 }
1324
1325 /// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
1326 ///
1327 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shldi_epi16&expand=5037)
1328 #[inline]
1329 #[target_feature(enable = "avx512vbmi2,avx512vl")]
1330 #[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))]
1331 #[rustc_args_required_const(4)]
1332 pub unsafe fn _mm256_mask_shldi_epi16(
1333 src: __m256i,
1334 k: __mmask16,
1335 a: __m256i,
1336 b: __m256i,
1337 imm8: i32,
1338 ) -> __m256i {
1339 assert!(imm8 >= 0 && imm8 <= 255);
1340 let shf: i16x16 = vpshldvw256(
1341 a.as_i16x16(),
1342 b.as_i16x16(),
1343 _mm256_set1_epi16(imm8 as i16).as_i16x16(),
1344 );
1345 transmute(simd_select_bitmask(k, shf, src.as_i16x16()))
1346 }
1347
1348 /// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1349 ///
1350 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shldi_epi16&expand=5038)
1351 #[inline]
1352 #[target_feature(enable = "avx512vbmi2,avx512vl")]
1353 #[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))]
1354 #[rustc_args_required_const(3)]
1355 pub unsafe fn _mm256_maskz_shldi_epi16(k: __mmask16, a: __m256i, b: __m256i, imm8: i32) -> __m256i {
1356 let shf: i16x16 = vpshldvw256(
1357 a.as_i16x16(),
1358 b.as_i16x16(),
1359 _mm256_set1_epi16(imm8 as i16).as_i16x16(),
1360 );
1361 let zero = _mm256_setzero_si256().as_i16x16();
1362 transmute(simd_select_bitmask(k, shf, zero))
1363 }
1364
1365 /// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst.
1366 ///
1367 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shldi_epi16&expand=5036)
1368 #[inline]
1369 #[target_feature(enable = "avx512vbmi2,avx512vl")]
1370 #[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))]
1371 #[rustc_args_required_const(2)]
1372 pub unsafe fn _mm_shldi_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
1373 transmute(vpshldvw128(
1374 a.as_i16x8(),
1375 b.as_i16x8(),
1376 _mm_set1_epi16(imm8 as i16).as_i16x8(),
1377 ))
1378 }
1379
1380 /// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
1381 ///
1382 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shldi_epi16&expand=5034)
1383 #[inline]
1384 #[target_feature(enable = "avx512vbmi2,avx512vl")]
1385 #[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))]
1386 #[rustc_args_required_const(4)]
1387 pub unsafe fn _mm_mask_shldi_epi16(
1388 src: __m128i,
1389 k: __mmask8,
1390 a: __m128i,
1391 b: __m128i,
1392 imm8: i32,
1393 ) -> __m128i {
1394 let shf: i16x8 = vpshldvw128(
1395 a.as_i16x8(),
1396 b.as_i16x8(),
1397 _mm_set1_epi16(imm8 as i16).as_i16x8(),
1398 );
1399 transmute(simd_select_bitmask(k, shf, src.as_i16x8()))
1400 }
1401
1402 /// Concatenate packed 16-bit integers in a and b producing an intermediate 32-bit result. Shift the result left by imm8 bits, and store the upper 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1403 ///
1404 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shldi_epi16&expand=5035)
1405 #[inline]
1406 #[target_feature(enable = "avx512vbmi2,avx512vl")]
1407 #[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))]
1408 #[rustc_args_required_const(3)]
1409 pub unsafe fn _mm_maskz_shldi_epi16(k: __mmask8, a: __m128i, b: __m128i, imm8: i32) -> __m128i {
1410 let shf: i16x8 = vpshldvw128(
1411 a.as_i16x8(),
1412 b.as_i16x8(),
1413 _mm_set1_epi16(imm8 as i16).as_i16x8(),
1414 );
1415 let zero = _mm_setzero_si128().as_i16x8();
1416 transmute(simd_select_bitmask(k, shf, zero))
1417 }
1418
1419 /// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst.
1420 ///
1421 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shrdi_epi64&expand=5114)
1422 #[inline]
1423 #[target_feature(enable = "avx512vbmi2")]
1424 #[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] //should be vpshrdq
1425 #[rustc_args_required_const(2)]
1426 pub unsafe fn _mm512_shrdi_epi64(a: __m512i, b: __m512i, imm8: i32) -> __m512i {
1427 assert!(imm8 >= 0 && imm8 <= 255);
1428 transmute(vpshrdvq(
1429 a.as_i64x8(),
1430 b.as_i64x8(),
1431 _mm512_set1_epi64(imm8 as i64).as_i64x8(),
1432 ))
1433 }
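// For the right-shift immediate family the concatenation order flips relative to the
// shldi forms: the lane from b supplies the high half, the lane from a the low half,
// and the low half of the shifted intermediate is kept. Scalar sketch of one 64-bit
// lane (hypothetical helper name, count reduced modulo 64 as in the funnel shift):
#[allow(dead_code)]
fn shrdi64_lane_reference(a: u64, b: u64, imm8: u32) -> u64 {
    let s = imm8 % 64;
    if s == 0 {
        a
    } else {
        (a >> s) | (b << (64 - s))
    }
}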
1434
1435 /// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
1436 ///
1437 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shrdi_epi64&expand=5112)
1438 #[inline]
1439 #[target_feature(enable = "avx512vbmi2")]
1440 #[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] //should be vpshrdq
1441 #[rustc_args_required_const(4)]
1442 pub unsafe fn _mm512_mask_shrdi_epi64(
1443 src: __m512i,
1444 k: __mmask8,
1445 a: __m512i,
1446 b: __m512i,
1447 imm8: i32,
1448 ) -> __m512i {
1449 assert!(imm8 >= 0 && imm8 <= 255);
1450 let shf: i64x8 = vpshrdvq(
1451 a.as_i64x8(),
1452 b.as_i64x8(),
1453 _mm512_set1_epi64(imm8 as i64).as_i64x8(),
1454 );
1455 transmute(simd_select_bitmask(k, shf, src.as_i64x8()))
1456 }
1457
1458 /// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1459 ///
1460 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shrdi_epi64&expand=5113)
1461 #[inline]
1462 #[target_feature(enable = "avx512vbmi2")]
1463 #[cfg_attr(test, assert_instr(vpshldq, imm8 = 255))] //should be vpshrdq
1464 #[rustc_args_required_const(3)]
1465 pub unsafe fn _mm512_maskz_shrdi_epi64(k: __mmask8, a: __m512i, b: __m512i, imm8: i32) -> __m512i {
1466 assert!(imm8 >= 0 && imm8 <= 255);
1467 let shf: i64x8 = vpshrdvq(
1468 a.as_i64x8(),
1469 b.as_i64x8(),
1470 _mm512_set1_epi64(imm8 as i64).as_i64x8(),
1471 );
1472 let zero = _mm512_setzero_si512().as_i64x8();
1473 transmute(simd_select_bitmask(k, shf, zero))
1474 }
1475
1476 /// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst.
1477 ///
1478 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shrdi_epi64&expand=5111)
1479 #[inline]
1480 #[target_feature(enable = "avx512vbmi2,avx512vl")]
1481 #[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] //should be vpshrdq
1482 #[rustc_args_required_const(2)]
1483 pub unsafe fn _mm256_shrdi_epi64(a: __m256i, b: __m256i, imm8: i32) -> __m256i {
1484 assert!(imm8 >= 0 && imm8 <= 255);
1485 transmute(vpshrdvq256(
1486 a.as_i64x4(),
1487 b.as_i64x4(),
1488 _mm256_set1_epi64x(imm8 as i64).as_i64x4(),
1489 ))
1490 }
1491
1492 /// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
1493 ///
1494 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shrdi_epi64&expand=5109)
1495 #[inline]
1496 #[target_feature(enable = "avx512vbmi2,avx512vl")]
1497 #[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] //should be vpshrdq
1498 #[rustc_args_required_const(4)]
1499 pub unsafe fn _mm256_mask_shrdi_epi64(
1500 src: __m256i,
1501 k: __mmask8,
1502 a: __m256i,
1503 b: __m256i,
1504 imm8: i32,
1505 ) -> __m256i {
1506 assert!(imm8 >= 0 && imm8 <= 255);
1507 let shf: i64x4 = vpshrdvq256(
1508 a.as_i64x4(),
1509 b.as_i64x4(),
1510 _mm256_set1_epi64x(imm8 as i64).as_i64x4(),
1511 );
1512 transmute(simd_select_bitmask(k, shf, src.as_i64x4()))
1513 }
1514
1515 /// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1516 ///
1517 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shrdi_epi64&expand=5110)
1518 #[inline]
1519 #[target_feature(enable = "avx512vbmi2,avx512vl")]
1520 #[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] //should be vpshrdq
1521 #[rustc_args_required_const(3)]
1522 pub unsafe fn _mm256_maskz_shrdi_epi64(k: __mmask8, a: __m256i, b: __m256i, imm8: i32) -> __m256i {
1523 assert!(imm8 >= 0 && imm8 <= 255);
1524 let shf: i64x4 = vpshrdvq256(
1525 a.as_i64x4(),
1526 b.as_i64x4(),
1527 _mm256_set1_epi64x(imm8 as i64).as_i64x4(),
1528 );
1529 let zero = _mm256_setzero_si256().as_i64x4();
1530 transmute(simd_select_bitmask(k, shf, zero))
1531 }
1532
1533 /// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst.
1534 ///
1535 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shrdi_epi64&expand=5108)
1536 #[inline]
1537 #[target_feature(enable = "avx512vbmi2,avx512vl")]
1538 #[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] //should be vpshrdq
1539 #[rustc_args_required_const(2)]
1540 pub unsafe fn _mm_shrdi_epi64(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
1541 assert!(imm8 >= 0 && imm8 <= 255);
1542 transmute(vpshrdvq128(
1543 a.as_i64x2(),
1544 b.as_i64x2(),
1545 _mm_set1_epi64x(imm8 as i64).as_i64x2(),
1546 ))
1547 }
1548
1549 /// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
1550 ///
1551 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shrdi_epi64&expand=5106)
1552 #[inline]
1553 #[target_feature(enable = "avx512vbmi2,avx512vl")]
1554 #[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] //should be vpshrdq
1555 #[rustc_args_required_const(4)]
1556 pub unsafe fn _mm_mask_shrdi_epi64(
1557 src: __m128i,
1558 k: __mmask8,
1559 a: __m128i,
1560 b: __m128i,
1561 imm8: i32,
1562 ) -> __m128i {
1563 assert!(imm8 >= 0 && imm8 <= 255);
1564 let shf: i64x2 = vpshrdvq128(
1565 a.as_i64x2(),
1566 b.as_i64x2(),
1567 _mm_set1_epi64x(imm8 as i64).as_i64x2(),
1568 );
1569 transmute(simd_select_bitmask(k, shf, src.as_i64x2()))
1570 }
1571
1572 /// Concatenate packed 64-bit integers in b and a producing an intermediate 128-bit result. Shift the result right by imm8 bits, and store the lower 64-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1573 ///
1574 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shrdi_epi64&expand=5107)
1575 #[inline]
1576 #[target_feature(enable = "avx512vbmi2,avx512vl")]
1577 #[cfg_attr(test, assert_instr(vpshldq, imm8 = 5))] //should be vpshrdq
1578 #[rustc_args_required_const(3)]
1579 pub unsafe fn _mm_maskz_shrdi_epi64(k: __mmask8, a: __m128i, b: __m128i, imm8: i32) -> __m128i {
1580 assert!(imm8 >= 0 && imm8 <= 255);
1581 let shf: i64x2 = vpshrdvq128(
1582 a.as_i64x2(),
1583 b.as_i64x2(),
1584 _mm_set1_epi64x(imm8 as i64).as_i64x2(),
1585 );
1586 let zero = _mm_setzero_si128().as_i64x2();
1587 transmute(simd_select_bitmask(k, shf, zero))
1588 }
1589
1590 /// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst.
1591 ///
1592 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shrdi_epi32&expand=5105)
1593 #[inline]
1594 #[target_feature(enable = "avx512vbmi2")]
1595 #[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] //should be vpshrdd
1596 #[rustc_args_required_const(2)]
1597 pub unsafe fn _mm512_shrdi_epi32(a: __m512i, b: __m512i, imm8: i32) -> __m512i {
1598 assert!(imm8 >= 0 && imm8 <= 255);
1599 transmute(vpshrdvd(
1600 a.as_i32x16(),
1601 b.as_i32x16(),
1602 _mm512_set1_epi32(imm8).as_i32x16(),
1603 ))
1604 }
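// Worked per-lane example (illustrative only): with a lane of a = 0x12345678, a lane of
// b = 0x9ABCDEF0 and imm8 = 8, the 64-bit intermediate is 0x9ABCDEF0_12345678 and the
// low 32 bits after shifting right by 8 are 0xF0123456.
#[allow(dead_code)]
fn shrdi32_lane_example() {
    let (a, b, s) = (0x1234_5678u32, 0x9ABC_DEF0u32, 8u32);
    let lane = (a >> s) | (b << (32 - s));
    assert_eq!(lane, 0xF012_3456);
}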
1605
1606 /// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
1607 ///
1608 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shrdi_epi32&expand=5103)
1609 #[inline]
1610 #[target_feature(enable = "avx512vbmi2")]
1611 #[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] //should be vpshrdd
1612 #[rustc_args_required_const(4)]
1613 pub unsafe fn _mm512_mask_shrdi_epi32(
1614 src: __m512i,
1615 k: __mmask16,
1616 a: __m512i,
1617 b: __m512i,
1618 imm8: i32,
1619 ) -> __m512i {
1620 assert!(imm8 >= 0 && imm8 <= 255);
1621 let shf: i32x16 = vpshrdvd(
1622 a.as_i32x16(),
1623 b.as_i32x16(),
1624 _mm512_set1_epi32(imm8).as_i32x16(),
1625 );
1626 transmute(simd_select_bitmask(k, shf, src.as_i32x16()))
1627 }
1628
1629 /// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1630 ///
1631 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shrdi_epi32&expand=5104)
1632 #[inline]
1633 #[target_feature(enable = "avx512vbmi2")]
1634 #[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] //should be vpshrdd
1635 #[rustc_args_required_const(3)]
1636 pub unsafe fn _mm512_maskz_shrdi_epi32(k: __mmask16, a: __m512i, b: __m512i, imm8: i32) -> __m512i {
1637 assert!(imm8 >= 0 && imm8 <= 255);
1638 let shf: i32x16 = vpshrdvd(
1639 a.as_i32x16(),
1640 b.as_i32x16(),
1641 _mm512_set1_epi32(imm8).as_i32x16(),
1642 );
1643 let zero = _mm512_setzero_si512().as_i32x16();
1644 transmute(simd_select_bitmask(k, shf, zero))
1645 }
1646
1647 /// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst.
1648 ///
1649 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shrdi_epi32&expand=5102)
1650 #[inline]
1651 #[target_feature(enable = "avx512vbmi2,avx512vl")]
1652 #[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] //should be vpshrdd
1653 #[rustc_args_required_const(2)]
1654 pub unsafe fn _mm256_shrdi_epi32(a: __m256i, b: __m256i, imm8: i32) -> __m256i {
1655 assert!(imm8 >= 0 && imm8 <= 255);
1656 transmute(vpshrdvd256(
1657 a.as_i32x8(),
1658 b.as_i32x8(),
1659 _mm256_set1_epi32(imm8).as_i32x8(),
1660 ))
1661 }
1662
1663 /// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
1664 ///
1665 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shrdi_epi32&expand=5100)
1666 #[inline]
1667 #[target_feature(enable = "avx512vbmi2,avx512vl")]
1668 #[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] //should be vpshrdd
1669 #[rustc_args_required_const(4)]
1670 pub unsafe fn _mm256_mask_shrdi_epi32(
1671 src: __m256i,
1672 k: __mmask8,
1673 a: __m256i,
1674 b: __m256i,
1675 imm8: i32,
1676 ) -> __m256i {
1677 assert!(imm8 >= 0 && imm8 <= 255);
1678 let shf: i32x8 = vpshrdvd256(
1679 a.as_i32x8(),
1680 b.as_i32x8(),
1681 _mm256_set1_epi32(imm8).as_i32x8(),
1682 );
1683 transmute(simd_select_bitmask(k, shf, src.as_i32x8()))
1684 }
1685
1686 /// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1687 ///
1688 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shrdi_epi32&expand=5101)
1689 #[inline]
1690 #[target_feature(enable = "avx512vbmi2,avx512vl")]
1691 #[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] //should be vpshrdd
1692 #[rustc_args_required_const(3)]
1693 pub unsafe fn _mm256_maskz_shrdi_epi32(k: __mmask8, a: __m256i, b: __m256i, imm8: i32) -> __m256i {
1694 assert!(imm8 >= 0 && imm8 <= 255);
1695 let shf: i32x8 = vpshrdvd256(
1696 a.as_i32x8(),
1697 b.as_i32x8(),
1698 _mm256_set1_epi32(imm8).as_i32x8(),
1699 );
1700 let zero = _mm256_setzero_si256().as_i32x8();
1701 transmute(simd_select_bitmask(k, shf, zero))
1702 }
1703
1704 /// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst.
1705 ///
1706 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shrdi_epi32&expand=5099)
1707 #[inline]
1708 #[target_feature(enable = "avx512vbmi2,avx512vl")]
1709 #[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] //should be vpshrdd
1710 #[rustc_args_required_const(2)]
1711 pub unsafe fn _mm_shrdi_epi32(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
1712 assert!(imm8 >= 0 && imm8 <= 255);
1713 transmute(vpshrdvd128(
1714 a.as_i32x4(),
1715 b.as_i32x4(),
1716 _mm_set1_epi32(imm8).as_i32x4(),
1717 ))
1718 }
1719
1720 /// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
1721 ///
1722 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shrdi_epi32&expand=5097)
1723 #[inline]
1724 #[target_feature(enable = "avx512vbmi2,avx512vl")]
1725 #[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] //should be vpshrdd
1726 #[rustc_args_required_const(4)]
1727 pub unsafe fn _mm_mask_shrdi_epi32(
1728 src: __m128i,
1729 k: __mmask8,
1730 a: __m128i,
1731 b: __m128i,
1732 imm8: i32,
1733 ) -> __m128i {
1734 assert!(imm8 >= 0 && imm8 <= 255);
1735 let shf: i32x4 = vpshrdvd128(a.as_i32x4(), b.as_i32x4(), _mm_set1_epi32(imm8).as_i32x4());
1736 transmute(simd_select_bitmask(k, shf, src.as_i32x4()))
1737 }
1738
1739 /// Concatenate packed 32-bit integers in b and a producing an intermediate 64-bit result. Shift the result right by imm8 bits, and store the lower 32-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1740 ///
1741 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shrdi_epi32&expand=5098)
1742 #[inline]
1743 #[target_feature(enable = "avx512vbmi2,avx512vl")]
1744 #[cfg_attr(test, assert_instr(vpshldd, imm8 = 5))] //should be vpshrdd
1745 #[rustc_args_required_const(3)]
1746 pub unsafe fn _mm_maskz_shrdi_epi32(k: __mmask8, a: __m128i, b: __m128i, imm8: i32) -> __m128i {
1747 assert!(imm8 >= 0 && imm8 <= 255);
1748 let shf: i32x4 = vpshrdvd128(a.as_i32x4(), b.as_i32x4(), _mm_set1_epi32(imm8).as_i32x4());
1749 let zero = _mm_setzero_si128().as_i32x4();
1750 transmute(simd_select_bitmask(k, shf, zero))
1751 }
1752
1753 /// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst.
1754 ///
1755 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shrdi_epi16&expand=5096)
1756 #[inline]
1757 #[target_feature(enable = "avx512vbmi2")]
1758 #[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] //should be vpshrdw
1759 #[rustc_args_required_const(2)]
1760 pub unsafe fn _mm512_shrdi_epi16(a: __m512i, b: __m512i, imm8: i32) -> __m512i {
1761 assert!(imm8 >= 0 && imm8 <= 255);
1762 transmute(vpshrdvw(
1763 a.as_i16x32(),
1764 b.as_i16x32(),
1765 _mm512_set1_epi16(imm8 as i16).as_i16x32(),
1766 ))
1767 }
1768
1769 /// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
1770 ///
1771 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shrdi_epi16&expand=5094)
1772 #[inline]
1773 #[target_feature(enable = "avx512vbmi2")]
1774 #[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] //should be vpshrdw
1775 #[rustc_args_required_const(4)]
1776 pub unsafe fn _mm512_mask_shrdi_epi16(
1777 src: __m512i,
1778 k: __mmask32,
1779 a: __m512i,
1780 b: __m512i,
1781 imm8: i32,
1782 ) -> __m512i {
1783 assert!(imm8 >= 0 && imm8 <= 255);
1784 let shf: i16x32 = vpshrdvw(
1785 a.as_i16x32(),
1786 b.as_i16x32(),
1787 _mm512_set1_epi16(imm8 as i16).as_i16x32(),
1788 );
1789 transmute(simd_select_bitmask(k, shf, src.as_i16x32()))
1790 }
1791
1792 /// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1793 ///
1794 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shrdi_epi16&expand=5095)
1795 #[inline]
1796 #[target_feature(enable = "avx512vbmi2")]
1797 #[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] //should be vpshrdw
1798 #[rustc_args_required_const(3)]
1799 pub unsafe fn _mm512_maskz_shrdi_epi16(k: __mmask32, a: __m512i, b: __m512i, imm8: i32) -> __m512i {
1800 assert!(imm8 >= 0 && imm8 <= 255);
1801 let shf: i16x32 = vpshrdvw(
1802 a.as_i16x32(),
1803 b.as_i16x32(),
1804 _mm512_set1_epi16(imm8 as i16).as_i16x32(),
1805 );
1806 let zero = _mm512_setzero_si512().as_i16x32();
1807 transmute(simd_select_bitmask(k, shf, zero))
1808 }
1809
1810 /// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst.
1811 ///
1812 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shrdi_epi16&expand=5093)
1813 #[inline]
1814 #[target_feature(enable = "avx512vbmi2,avx512vl")]
1815 #[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] //should be vpshrdw
1816 #[rustc_args_required_const(2)]
1817 pub unsafe fn _mm256_shrdi_epi16(a: __m256i, b: __m256i, imm8: i32) -> __m256i {
1818 assert!(imm8 >= 0 && imm8 <= 255);
1819 transmute(vpshrdvw256(
1820 a.as_i16x16(),
1821 b.as_i16x16(),
1822 _mm256_set1_epi16(imm8 as i16).as_i16x16(),
1823 ))
1824 }
1825
1826 /// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
1827 ///
1828 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shrdi_epi16&expand=5091)
1829 #[inline]
1830 #[target_feature(enable = "avx512vbmi2,avx512vl")]
1831 #[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] //should be vpshrdw
1832 #[rustc_args_required_const(4)]
1833 pub unsafe fn _mm256_mask_shrdi_epi16(
1834 src: __m256i,
1835 k: __mmask16,
1836 a: __m256i,
1837 b: __m256i,
1838 imm8: i32,
1839 ) -> __m256i {
1840 assert!(imm8 >= 0 && imm8 <= 255);
1841 let shf: i16x16 = vpshrdvw256(
1842 a.as_i16x16(),
1843 b.as_i16x16(),
1844 _mm256_set1_epi16(imm8 as i16).as_i16x16(),
1845 );
1846 transmute(simd_select_bitmask(k, shf, src.as_i16x16()))
1847 }
1848
1849 /// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1850 ///
1851 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shrdi_epi16&expand=5092)
1852 #[inline]
1853 #[target_feature(enable = "avx512vbmi2,avx512vl")]
1854 #[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] //should be vpshrdw
1855 #[rustc_args_required_const(3)]
1856 pub unsafe fn _mm256_maskz_shrdi_epi16(k: __mmask16, a: __m256i, b: __m256i, imm8: i32) -> __m256i {
1857 let shf: i16x16 = vpshrdvw256(
1858 a.as_i16x16(),
1859 b.as_i16x16(),
1860 _mm256_set1_epi16(imm8 as i16).as_i16x16(),
1861 );
1862 let zero = _mm256_setzero_si256().as_i16x16();
1863 transmute(simd_select_bitmask(k, shf, zero))
1864 }
1865
1866 /// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst.
1867 ///
1868 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shrdi_epi16&expand=5090)
1869 #[inline]
1870 #[target_feature(enable = "avx512vbmi2,avx512vl")]
1871 #[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] //should be vpshrdw
1872 #[rustc_args_required_const(2)]
1873 pub unsafe fn _mm_shrdi_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
1874 transmute(vpshrdvw128(
1875 a.as_i16x8(),
1876 b.as_i16x8(),
1877 _mm_set1_epi16(imm8 as i16).as_i16x8(),
1878 ))
1879 }
1880
1881 /// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
1882 ///
1883 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shrdi_epi16&expand=5088)
1884 #[inline]
1885 #[target_feature(enable = "avx512vbmi2,avx512vl")]
1886 #[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] //should be vpshrdw
1887 #[rustc_args_required_const(4)]
1888 pub unsafe fn _mm_mask_shrdi_epi16(
1889 src: __m128i,
1890 k: __mmask8,
1891 a: __m128i,
1892 b: __m128i,
1893 imm8: i32,
1894 ) -> __m128i {
1895 let shf: i16x8 = vpshrdvw128(
1896 a.as_i16x8(),
1897 b.as_i16x8(),
1898 _mm_set1_epi16(imm8 as i16).as_i16x8(),
1899 );
1900 transmute(simd_select_bitmask(k, shf, src.as_i16x8()))
1901 }
1902
1903 /// Concatenate packed 16-bit integers in b and a producing an intermediate 32-bit result. Shift the result right by imm8 bits, and store the lower 16-bits in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1904 ///
1905 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shrdi_epi16&expand=5089)
1906 #[inline]
1907 #[target_feature(enable = "avx512vbmi2,avx512vl")]
1908 #[cfg_attr(test, assert_instr(vpshldw, imm8 = 5))] //should be vpshrdw
1909 #[rustc_args_required_const(3)]
1910 pub unsafe fn _mm_maskz_shrdi_epi16(k: __mmask8, a: __m128i, b: __m128i, imm8: i32) -> __m128i {
1911 let shf: i16x8 = vpshrdvw128(
1912 a.as_i16x8(),
1913 b.as_i16x8(),
1914 _mm_set1_epi16(imm8 as i16).as_i16x8(),
1915 );
1916 let zero = _mm_setzero_si128().as_i16x8();
1917 transmute(simd_select_bitmask(k, shf, zero))
1918 }
1919
1920 #[allow(improper_ctypes)]
1921 extern "C" {
1922 #[link_name = "llvm.x86.avx512.mask.compress.w.512"]
1923 fn vpcompressw(a: i16x32, src: i16x32, mask: u32) -> i16x32;
1924 #[link_name = "llvm.x86.avx512.mask.compress.w.256"]
1925 fn vpcompressw256(a: i16x16, src: i16x16, mask: u16) -> i16x16;
1926 #[link_name = "llvm.x86.avx512.mask.compress.w.128"]
1927 fn vpcompressw128(a: i16x8, src: i16x8, mask: u8) -> i16x8;
1928
1929 #[link_name = "llvm.x86.avx512.mask.compress.b.512"]
1930 fn vpcompressb(a: i8x64, src: i8x64, mask: u64) -> i8x64;
1931 #[link_name = "llvm.x86.avx512.mask.compress.b.256"]
1932 fn vpcompressb256(a: i8x32, src: i8x32, mask: u32) -> i8x32;
1933 #[link_name = "llvm.x86.avx512.mask.compress.b.128"]
1934 fn vpcompressb128(a: i8x16, src: i8x16, mask: u16) -> i8x16;
1935
1936 #[link_name = "llvm.x86.avx512.mask.expand.w.512"]
1937 fn vpexpandw(a: i16x32, src: i16x32, mask: u32) -> i16x32;
1938 #[link_name = "llvm.x86.avx512.mask.expand.w.256"]
1939 fn vpexpandw256(a: i16x16, src: i16x16, mask: u16) -> i16x16;
1940 #[link_name = "llvm.x86.avx512.mask.expand.w.128"]
1941 fn vpexpandw128(a: i16x8, src: i16x8, mask: u8) -> i16x8;
1942
1943 #[link_name = "llvm.x86.avx512.mask.expand.b.512"]
1944 fn vpexpandb(a: i8x64, src: i8x64, mask: u64) -> i8x64;
1945 #[link_name = "llvm.x86.avx512.mask.expand.b.256"]
1946 fn vpexpandb256(a: i8x32, src: i8x32, mask: u32) -> i8x32;
1947 #[link_name = "llvm.x86.avx512.mask.expand.b.128"]
1948 fn vpexpandb128(a: i8x16, src: i8x16, mask: u16) -> i8x16;
1949
1950 #[link_name = "llvm.fshl.v8i64"]
1951 fn vpshldvq(a: i64x8, b: i64x8, c: i64x8) -> i64x8;
1952 #[link_name = "llvm.fshl.v4i64"]
1953 fn vpshldvq256(a: i64x4, b: i64x4, c: i64x4) -> i64x4;
1954 #[link_name = "llvm.fshl.v2i64"]
1955 fn vpshldvq128(a: i64x2, b: i64x2, c: i64x2) -> i64x2;
1956 #[link_name = "llvm.fshl.v16i32"]
1957 fn vpshldvd(a: i32x16, b: i32x16, c: i32x16) -> i32x16;
1958 #[link_name = "llvm.fshl.v8i32"]
1959 fn vpshldvd256(a: i32x8, b: i32x8, c: i32x8) -> i32x8;
1960 #[link_name = "llvm.fshl.v4i32"]
1961 fn vpshldvd128(a: i32x4, b: i32x4, c: i32x4) -> i32x4;
1962 #[link_name = "llvm.fshl.v32i16"]
1963 fn vpshldvw(a: i16x32, b: i16x32, c: i16x32) -> i16x32;
1964 #[link_name = "llvm.fshl.v16i16"]
1965 fn vpshldvw256(a: i16x16, b: i16x16, c: i16x16) -> i16x16;
1966 #[link_name = "llvm.fshl.v8i16"]
1967 fn vpshldvw128(a: i16x8, b: i16x8, c: i16x8) -> i16x8;
1968
1969 #[link_name = "llvm.fshr.v8i64"]
1970 fn vpshrdvq(a: i64x8, b: i64x8, c: i64x8) -> i64x8;
1971 #[link_name = "llvm.fshr.v4i64"]
1972 fn vpshrdvq256(a: i64x4, b: i64x4, c: i64x4) -> i64x4;
1973 #[link_name = "llvm.fshr.v2i64"]
1974 fn vpshrdvq128(a: i64x2, b: i64x2, c: i64x2) -> i64x2;
1975 #[link_name = "llvm.fshr.v16i32"]
1976 fn vpshrdvd(a: i32x16, b: i32x16, c: i32x16) -> i32x16;
1977 #[link_name = "llvm.fshr.v8i32"]
1978 fn vpshrdvd256(a: i32x8, b: i32x8, c: i32x8) -> i32x8;
1979 #[link_name = "llvm.fshr.v4i32"]
1980 fn vpshrdvd128(a: i32x4, b: i32x4, c: i32x4) -> i32x4;
1981 #[link_name = "llvm.fshr.v32i16"]
1982 fn vpshrdvw(a: i16x32, b: i16x32, c: i16x32) -> i16x32;
1983 #[link_name = "llvm.fshr.v16i16"]
1984 fn vpshrdvw256(a: i16x16, b: i16x16, c: i16x16) -> i16x16;
1985 #[link_name = "llvm.fshr.v8i16"]
1986 fn vpshrdvw128(a: i16x8, b: i16x8, c: i16x8) -> i16x8;
1987 }
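// The bindings above use LLVM's generic funnel-shift intrinsics (llvm.fshl.*, llvm.fshr.*)
// rather than x86-specific builtins; the "//should be ..." notes on the immediate
// right-shift wrappers record that codegen currently selects the left-shift instruction
// forms for them. One identity worth keeping in mind, sketched on a scalar as an aside:
// a funnel shift whose two data operands are equal is a rotate.
#[allow(dead_code)]
fn funnel_shift_of_equal_operands_is_a_rotate() {
    let (a, s) = (0x0123_4567_89AB_CDEFu64, 12u32);
    let funnel = (a << s) | (a >> (64 - s)); // fshl(a, a, s) with 0 < s < 64
    assert_eq!(funnel, a.rotate_left(s));
}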
1988
1989 #[cfg(test)]
1990 mod tests {
1991
1992 use stdarch_test::simd_test;
1993
1994 use crate::core_arch::x86::*;
1995
1996 #[simd_test(enable = "avx512vbmi2")]
1997 unsafe fn test_mm512_mask_compress_epi16() {
1998 let src = _mm512_set1_epi16(200);
1999 #[rustfmt::skip]
2000 let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
2001 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
2002 let r = _mm512_mask_compress_epi16(src, 0b01010101_01010101_01010101_01010101, a);
2003 #[rustfmt::skip]
2004 let e = _mm512_set_epi16(
2005 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200,
2006 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
2007 );
2008 assert_eq_m512i(r, e);
2009 }
2010
2011 #[simd_test(enable = "avx512vbmi2")]
2012 unsafe fn test_mm512_maskz_compress_epi16() {
2013 #[rustfmt::skip]
2014 let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
2015 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
2016 let r = _mm512_maskz_compress_epi16(0b01010101_01010101_01010101_01010101, a);
2017 #[rustfmt::skip]
2018 let e = _mm512_set_epi16(
2019 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2020 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
2021 );
2022 assert_eq_m512i(r, e);
2023 }
2024
2025 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2026 unsafe fn test_mm256_mask_compress_epi16() {
2027 let src = _mm256_set1_epi16(200);
2028 let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
2029 let r = _mm256_mask_compress_epi16(src, 0b01010101_01010101, a);
2030 let e = _mm256_set_epi16(
2031 200, 200, 200, 200, 200, 200, 200, 200, 1, 3, 5, 7, 9, 11, 13, 15,
2032 );
2033 assert_eq_m256i(r, e);
2034 }
2035
2036 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2037 unsafe fn test_mm256_maskz_compress_epi16() {
2038 let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
2039 let r = _mm256_maskz_compress_epi16(0b01010101_01010101, a);
2040 let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 5, 7, 9, 11, 13, 15);
2041 assert_eq_m256i(r, e);
2042 }
2043
2044 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2045 unsafe fn test_mm_mask_compress_epi16() {
2046 let src = _mm_set1_epi16(200);
2047 let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
2048 let r = _mm_mask_compress_epi16(src, 0b01010101, a);
2049 let e = _mm_set_epi16(200, 200, 200, 200, 1, 3, 5, 7);
2050 assert_eq_m128i(r, e);
2051 }
2052
2053 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2054 unsafe fn test_mm_maskz_compress_epi16() {
2055 let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
2056 let r = _mm_maskz_compress_epi16(0b01010101, a);
2057 let e = _mm_set_epi16(0, 0, 0, 0, 1, 3, 5, 7);
2058 assert_eq_m128i(r, e);
2059 }
2060
2061 #[simd_test(enable = "avx512vbmi2")]
2062 unsafe fn test_mm512_mask_compress_epi8() {
2063 let src = _mm512_set1_epi8(100);
2064 #[rustfmt::skip]
2065 let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
2066 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
2067 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
2068 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
2069 let r = _mm512_mask_compress_epi8(
2070 src,
2071 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101,
2072 a,
2073 );
2074 #[rustfmt::skip]
2075 let e = _mm512_set_epi8(
2076 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
2077 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
2078 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
2079 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63,
2080 );
2081 assert_eq_m512i(r, e);
2082 }
2083
2084 #[simd_test(enable = "avx512vbmi2")]
2085 unsafe fn test_mm512_maskz_compress_epi8() {
2086 #[rustfmt::skip]
2087 let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
2088 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
2089 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
2090 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
2091 let r = _mm512_maskz_compress_epi8(
2092 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101,
2093 a,
2094 );
2095 #[rustfmt::skip]
2096 let e = _mm512_set_epi8(
2097 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2098 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2099 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
2100 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63,
2101 );
2102 assert_eq_m512i(r, e);
2103 }
2104
2105 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2106 unsafe fn test_mm256_mask_compress_epi8() {
2107 let src = _mm256_set1_epi8(100);
2108 #[rustfmt::skip]
2109 let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
2110 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
2111 let r = _mm256_mask_compress_epi8(src, 0b01010101_01010101_01010101_01010101, a);
2112 #[rustfmt::skip]
2113 let e = _mm256_set_epi8(
2114 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
2115 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
2116 );
2117 assert_eq_m256i(r, e);
2118 }
2119
2120 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2121 unsafe fn test_mm256_maskz_compress_epi8() {
2122 #[rustfmt::skip]
2123 let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
2124 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
2125 let r = _mm256_maskz_compress_epi8(0b01010101_01010101_01010101_01010101, a);
2126 #[rustfmt::skip]
2127 let e = _mm256_set_epi8(
2128 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2129 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
2130 );
2131 assert_eq_m256i(r, e);
2132 }
2133
2134 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2135 unsafe fn test_mm_mask_compress_epi8() {
2136 let src = _mm_set1_epi8(100);
2137 let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
2138 let r = _mm_mask_compress_epi8(src, 0b01010101_01010101, a);
2139 let e = _mm_set_epi8(
2140 100, 100, 100, 100, 100, 100, 100, 100, 1, 3, 5, 7, 9, 11, 13, 15,
2141 );
2142 assert_eq_m128i(r, e);
2143 }
2144
2145 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2146 unsafe fn test_mm_maskz_compress_epi8() {
2147 let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
2148 let r = _mm_maskz_compress_epi8(0b01010101_01010101, a);
2149 let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 5, 7, 9, 11, 13, 15);
2150 assert_eq_m128i(r, e);
2151 }
2152
2153 #[simd_test(enable = "avx512vbmi2")]
2154 unsafe fn test_mm512_mask_expand_epi16() {
2155 let src = _mm512_set1_epi16(200);
2156 #[rustfmt::skip]
2157 let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
2158 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
2159 let r = _mm512_mask_expand_epi16(src, 0b01010101_01010101_01010101_01010101, a);
2160 #[rustfmt::skip]
2161 let e = _mm512_set_epi16(
2162 200, 16, 200, 17, 200, 18, 200, 19, 200, 20, 200, 21, 200, 22, 200, 23,
2163 200, 24, 200, 25, 200, 26, 200, 27, 200, 28, 200, 29, 200, 30, 200, 31,
2164 );
2165 assert_eq_m512i(r, e);
2166 }
2167
2168 #[simd_test(enable = "avx512vbmi2")]
2169 unsafe fn test_mm512_maskz_expand_epi16() {
2170 #[rustfmt::skip]
2171 let a = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
2172 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
2173 let r = _mm512_maskz_expand_epi16(0b01010101_01010101_01010101_01010101, a);
2174 #[rustfmt::skip]
2175 let e = _mm512_set_epi16(0, 16, 0, 17, 0, 18, 0, 19, 0, 20, 0, 21, 0, 22, 0, 23,
2176 0, 24, 0, 25, 0, 26, 0, 27, 0, 28, 0, 29, 0, 30, 0, 31);
2177 assert_eq_m512i(r, e);
2178 }
2179
2180 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2181 unsafe fn test_mm256_mask_expand_epi16() {
2182 let src = _mm256_set1_epi16(200);
2183 let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
2184 let r = _mm256_mask_expand_epi16(src, 0b01010101_01010101, a);
2185 let e = _mm256_set_epi16(
2186 200, 8, 200, 9, 200, 10, 200, 11, 200, 12, 200, 13, 200, 14, 200, 15,
2187 );
2188 assert_eq_m256i(r, e);
2189 }
2190
2191 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2192 unsafe fn test_mm256_maskz_expand_epi16() {
2193 let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
2194 let r = _mm256_maskz_expand_epi16(0b01010101_01010101, a);
2195 let e = _mm256_set_epi16(0, 8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15);
2196 assert_eq_m256i(r, e);
2197 }
2198
2199 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2200 unsafe fn test_mm_mask_expand_epi16() {
2201 let src = _mm_set1_epi16(200);
2202 let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
2203 let r = _mm_mask_expand_epi16(src, 0b01010101, a);
2204 let e = _mm_set_epi16(200, 4, 200, 5, 200, 6, 200, 7);
2205 assert_eq_m128i(r, e);
2206 }
2207
2208 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2209 unsafe fn test_mm_maskz_expand_epi16() {
2210 let a = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
2211 let r = _mm_maskz_expand_epi16(0b01010101, a);
2212 let e = _mm_set_epi16(0, 4, 0, 5, 0, 6, 0, 7);
2213 assert_eq_m128i(r, e);
2214 }
2215
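// Byte-granularity expand follows the same pattern as the 16-bit tests above,
// just with twice as many lanes per vector.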
2216 #[simd_test(enable = "avx512vbmi2")]
2217 unsafe fn test_mm512_mask_expand_epi8() {
2218 let src = _mm512_set1_epi8(100);
2219 #[rustfmt::skip]
2220 let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
2221 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
2222 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
2223 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
2224 let r = _mm512_mask_expand_epi8(
2225 src,
2226 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101,
2227 a,
2228 );
2229 #[rustfmt::skip]
2230 let e = _mm512_set_epi8(
2231 100, 32, 100, 33, 100, 34, 100, 35, 100, 36, 100, 37, 100, 38, 100, 39,
2232 100, 40, 100, 41, 100, 42, 100, 43, 100, 44, 100, 45, 100, 46, 100, 47,
2233 100, 48, 100, 49, 100, 50, 100, 51, 100, 52, 100, 53, 100, 54, 100, 55,
2234 100, 56, 100, 57, 100, 58, 100, 59, 100, 60, 100, 61, 100, 62, 100, 63,
2235 );
2236 assert_eq_m512i(r, e);
2237 }
2238
2239 #[simd_test(enable = "avx512vbmi2")]
2240 unsafe fn test_mm512_maskz_expand_epi8() {
2241 #[rustfmt::skip]
2242 let a = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
2243 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
2244 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
2245 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
2246 let r = _mm512_maskz_expand_epi8(
2247 0b01010101_01010101_01010101_01010101_01010101_01010101_01010101_01010101,
2248 a,
2249 );
2250 #[rustfmt::skip]
2251 let e = _mm512_set_epi8(
2252 0, 32, 0, 33, 0, 34, 0, 35, 0, 36, 0, 37, 0, 38, 0, 39,
2253 0, 40, 0, 41, 0, 42, 0, 43, 0, 44, 0, 45, 0, 46, 0, 47,
2254 0, 48, 0, 49, 0, 50, 0, 51, 0, 52, 0, 53, 0, 54, 0, 55,
2255 0, 56, 0, 57, 0, 58, 0, 59, 0, 60, 0, 61, 0, 62, 0, 63,
2256 );
2257 assert_eq_m512i(r, e);
2258 }
2259
2260 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2261 unsafe fn test_mm256_mask_expand_epi8() {
2262 let src = _mm256_set1_epi8(100);
2263 #[rustfmt::skip]
2264 let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
2265 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
2266 let r = _mm256_mask_expand_epi8(src, 0b01010101_01010101_01010101_01010101, a);
2267 #[rustfmt::skip]
2268 let e = _mm256_set_epi8(
2269 100, 16, 100, 17, 100, 18, 100, 19, 100, 20, 100, 21, 100, 22, 100, 23,
2270 100, 24, 100, 25, 100, 26, 100, 27, 100, 28, 100, 29, 100, 30, 100, 31,
2271 );
2272 assert_eq_m256i(r, e);
2273 }
2274
2275 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2276 unsafe fn test_mm256_maskz_expand_epi8() {
2277 #[rustfmt::skip]
2278 let a = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
2279 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
2280 let r = _mm256_maskz_expand_epi8(0b01010101_01010101_01010101_01010101, a);
2281 #[rustfmt::skip]
2282 let e = _mm256_set_epi8(
2283 0, 16, 0, 17, 0, 18, 0, 19, 0, 20, 0, 21, 0, 22, 0, 23,
2284 0, 24, 0, 25, 0, 26, 0, 27, 0, 28, 0, 29, 0, 30, 0, 31,
2285 );
2286 assert_eq_m256i(r, e);
2287 }
2288
2289 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2290 unsafe fn test_mm_mask_expand_epi8() {
2291 let src = _mm_set1_epi8(100);
2292 let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
2293 let r = _mm_mask_expand_epi8(src, 0b01010101_01010101, a);
2294 let e = _mm_set_epi8(
2295 100, 8, 100, 9, 100, 10, 100, 11, 100, 12, 100, 13, 100, 14, 100, 15,
2296 );
2297 assert_eq_m128i(r, e);
2298 }
2299
2300 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2301 unsafe fn test_mm_maskz_expand_epi8() {
2302 let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
2303 let r = _mm_maskz_expand_epi8(0b01010101_01010101, a);
2304 let e = _mm_set_epi8(0, 8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15);
2305 assert_eq_m128i(r, e);
2306 }
2307
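// shldv tests: each lane of `a` (upper half) is concatenated with the matching lane of
// `b` (lower half) into a double-width value, shifted left by the per-lane count in `c`,
// and the upper half is kept. With a = 1, b = MSB-only and a shift of 2, the upper half
// of ((1 << N) | (1 << (N - 1))) << 2 is 0b110 = 6, the value expected throughout.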
2308 #[simd_test(enable = "avx512vbmi2")]
2309 unsafe fn test_mm512_shldv_epi64() {
2310 let a = _mm512_set1_epi64(1);
2311 let b = _mm512_set1_epi64(1 << 63);
2312 let c = _mm512_set1_epi64(2);
2313 let r = _mm512_shldv_epi64(a, b, c);
2314 let e = _mm512_set1_epi64(6);
2315 assert_eq_m512i(r, e);
2316 }
2317
2318 #[simd_test(enable = "avx512vbmi2")]
2319 unsafe fn test_mm512_mask_shldv_epi64() {
2320 let a = _mm512_set1_epi64(1);
2321 let b = _mm512_set1_epi64(1 << 63);
2322 let c = _mm512_set1_epi64(2);
2323 let r = _mm512_mask_shldv_epi64(a, 0, b, c);
2324 assert_eq_m512i(r, a);
2325 let r = _mm512_mask_shldv_epi64(a, 0b11111111, b, c);
2326 let e = _mm512_set1_epi64(6);
2327 assert_eq_m512i(r, e);
2328 }
2329
2330 #[simd_test(enable = "avx512vbmi2")]
2331 unsafe fn test_mm512_maskz_shldv_epi64() {
2332 let a = _mm512_set1_epi64(1);
2333 let b = _mm512_set1_epi64(1 << 63);
2334 let c = _mm512_set1_epi64(2);
2335 let r = _mm512_maskz_shldv_epi64(0, a, b, c);
2336 assert_eq_m512i(r, _mm512_setzero_si512());
2337 let r = _mm512_maskz_shldv_epi64(0b11111111, a, b, c);
2338 let e = _mm512_set1_epi64(6);
2339 assert_eq_m512i(r, e);
2340 }
2341
2342 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2343 unsafe fn test_mm256_shldv_epi64() {
2344 let a = _mm256_set1_epi64x(1);
2345 let b = _mm256_set1_epi64x(1 << 63);
2346 let c = _mm256_set1_epi64x(2);
2347 let r = _mm256_shldv_epi64(a, b, c);
2348 let e = _mm256_set1_epi64x(6);
2349 assert_eq_m256i(r, e);
2350 }
2351
2352 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2353 unsafe fn test_mm256_mask_shldv_epi64() {
2354 let a = _mm256_set1_epi64x(1);
2355 let b = _mm256_set1_epi64x(1 << 63);
2356 let c = _mm256_set1_epi64x(2);
2357 let r = _mm256_mask_shldv_epi64(a, 0, b, c);
2358 assert_eq_m256i(r, a);
2359 let r = _mm256_mask_shldv_epi64(a, 0b00001111, b, c);
2360 let e = _mm256_set1_epi64x(6);
2361 assert_eq_m256i(r, e);
2362 }
2363
2364 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2365 unsafe fn test_mm256_maskz_shldv_epi64() {
2366 let a = _mm256_set1_epi64x(1);
2367 let b = _mm256_set1_epi64x(1 << 63);
2368 let c = _mm256_set1_epi64x(2);
2369 let r = _mm256_maskz_shldv_epi64(0, a, b, c);
2370 assert_eq_m256i(r, _mm256_setzero_si256());
2371 let r = _mm256_maskz_shldv_epi64(0b00001111, a, b, c);
2372 let e = _mm256_set1_epi64x(6);
2373 assert_eq_m256i(r, e);
2374 }
2375
2376 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2377 unsafe fn test_mm_shldv_epi64() {
2378 let a = _mm_set1_epi64x(1);
2379 let b = _mm_set1_epi64x(1 << 63);
2380 let c = _mm_set1_epi64x(2);
2381 let r = _mm_shldv_epi64(a, b, c);
2382 let e = _mm_set1_epi64x(6);
2383 assert_eq_m128i(r, e);
2384 }
2385
2386 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2387 unsafe fn test_mm_mask_shldv_epi64() {
2388 let a = _mm_set1_epi64x(1);
2389 let b = _mm_set1_epi64x(1 << 63);
2390 let c = _mm_set1_epi64x(2);
2391 let r = _mm_mask_shldv_epi64(a, 0, b, c);
2392 assert_eq_m128i(r, a);
2393 let r = _mm_mask_shldv_epi64(a, 0b00000011, b, c);
2394 let e = _mm_set1_epi64x(6);
2395 assert_eq_m128i(r, e);
2396 }
2397
2398 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2399 unsafe fn test_mm_maskz_shldv_epi64() {
2400 let a = _mm_set1_epi64x(1);
2401 let b = _mm_set1_epi64x(1 << 63);
2402 let c = _mm_set1_epi64x(2);
2403 let r = _mm_maskz_shldv_epi64(0, a, b, c);
2404 assert_eq_m128i(r, _mm_setzero_si128());
2405 let r = _mm_maskz_shldv_epi64(0b00000011, a, b, c);
2406 let e = _mm_set1_epi64x(6);
2407 assert_eq_m128i(r, e);
2408 }
2409
2410 #[simd_test(enable = "avx512vbmi2")]
2411 unsafe fn test_mm512_shldv_epi32() {
2412 let a = _mm512_set1_epi32(1);
2413 let b = _mm512_set1_epi32(1 << 31);
2414 let c = _mm512_set1_epi32(2);
2415 let r = _mm512_shldv_epi32(a, b, c);
2416 let e = _mm512_set1_epi32(6);
2417 assert_eq_m512i(r, e);
2418 }
2419
2420 #[simd_test(enable = "avx512vbmi2")]
2421 unsafe fn test_mm512_mask_shldv_epi32() {
2422 let a = _mm512_set1_epi32(1);
2423 let b = _mm512_set1_epi32(1 << 31);
2424 let c = _mm512_set1_epi32(2);
2425 let r = _mm512_mask_shldv_epi32(a, 0, b, c);
2426 assert_eq_m512i(r, a);
2427 let r = _mm512_mask_shldv_epi32(a, 0b11111111_11111111, b, c);
2428 let e = _mm512_set1_epi32(6);
2429 assert_eq_m512i(r, e);
2430 }
2431
2432 #[simd_test(enable = "avx512vbmi2")]
2433 unsafe fn test_mm512_maskz_shldv_epi32() {
2434 let a = _mm512_set1_epi32(1);
2435 let b = _mm512_set1_epi32(1 << 31);
2436 let c = _mm512_set1_epi32(2);
2437 let r = _mm512_maskz_shldv_epi32(0, a, b, c);
2438 assert_eq_m512i(r, _mm512_setzero_si512());
2439 let r = _mm512_maskz_shldv_epi32(0b11111111_11111111, a, b, c);
2440 let e = _mm512_set1_epi32(6);
2441 assert_eq_m512i(r, e);
2442 }
2443
2444 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2445 unsafe fn test_mm256_shldv_epi32() {
2446 let a = _mm256_set1_epi32(1);
2447 let b = _mm256_set1_epi32(1 << 31);
2448 let c = _mm256_set1_epi32(2);
2449 let r = _mm256_shldv_epi32(a, b, c);
2450 let e = _mm256_set1_epi32(6);
2451 assert_eq_m256i(r, e);
2452 }
2453
2454 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2455 unsafe fn test_mm256_mask_shldv_epi32() {
2456 let a = _mm256_set1_epi32(1);
2457 let b = _mm256_set1_epi32(1 << 31);
2458 let c = _mm256_set1_epi32(2);
2459 let r = _mm256_mask_shldv_epi32(a, 0, b, c);
2460 assert_eq_m256i(r, a);
2461 let r = _mm256_mask_shldv_epi32(a, 0b11111111, b, c);
2462 let e = _mm256_set1_epi32(6);
2463 assert_eq_m256i(r, e);
2464 }
2465
2466 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2467 unsafe fn test_mm256_maskz_shldv_epi32() {
2468 let a = _mm256_set1_epi32(1);
2469 let b = _mm256_set1_epi32(1 << 31);
2470 let c = _mm256_set1_epi32(2);
2471 let r = _mm256_maskz_shldv_epi32(0, a, b, c);
2472 assert_eq_m256i(r, _mm256_setzero_si256());
2473 let r = _mm256_maskz_shldv_epi32(0b11111111, a, b, c);
2474 let e = _mm256_set1_epi32(6);
2475 assert_eq_m256i(r, e);
2476 }
2477
2478 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2479 unsafe fn test_mm_shldv_epi32() {
2480 let a = _mm_set1_epi32(1);
2481 let b = _mm_set1_epi32(1 << 31);
2482 let c = _mm_set1_epi32(2);
2483 let r = _mm_shldv_epi32(a, b, c);
2484 let e = _mm_set1_epi32(6);
2485 assert_eq_m128i(r, e);
2486 }
2487
2488 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2489 unsafe fn test_mm_mask_shldv_epi32() {
2490 let a = _mm_set1_epi32(1);
2491 let b = _mm_set1_epi32(1 << 31);
2492 let c = _mm_set1_epi32(2);
2493 let r = _mm_mask_shldv_epi32(a, 0, b, c);
2494 assert_eq_m128i(r, a);
2495 let r = _mm_mask_shldv_epi32(a, 0b00001111, b, c);
2496 let e = _mm_set1_epi32(6);
2497 assert_eq_m128i(r, e);
2498 }
2499
2500 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2501 unsafe fn test_mm_maskz_shldv_epi32() {
2502 let a = _mm_set1_epi32(1);
2503 let b = _mm_set1_epi32(1 << 31);
2504 let c = _mm_set1_epi32(2);
2505 let r = _mm_maskz_shldv_epi32(0, a, b, c);
2506 assert_eq_m128i(r, _mm_setzero_si128());
2507 let r = _mm_maskz_shldv_epi32(0b00001111, a, b, c);
2508 let e = _mm_set1_epi32(6);
2509 assert_eq_m128i(r, e);
2510 }
2511
2512 #[simd_test(enable = "avx512vbmi2")]
2513 unsafe fn test_mm512_shldv_epi16() {
2514 let a = _mm512_set1_epi16(1);
2515 let b = _mm512_set1_epi16(1 << 15);
2516 let c = _mm512_set1_epi16(2);
2517 let r = _mm512_shldv_epi16(a, b, c);
2518 let e = _mm512_set1_epi16(6);
2519 assert_eq_m512i(r, e);
2520 }
2521
2522 #[simd_test(enable = "avx512vbmi2")]
2523 unsafe fn test_mm512_mask_shldv_epi16() {
2524 let a = _mm512_set1_epi16(1);
2525 let b = _mm512_set1_epi16(1 << 15);
2526 let c = _mm512_set1_epi16(2);
2527 let r = _mm512_mask_shldv_epi16(a, 0, b, c);
2528 assert_eq_m512i(r, a);
2529 let r = _mm512_mask_shldv_epi16(a, 0b11111111_11111111_11111111_11111111, b, c);
2530 let e = _mm512_set1_epi16(6);
2531 assert_eq_m512i(r, e);
2532 }
2533
2534 #[simd_test(enable = "avx512vbmi2")]
2535 unsafe fn test_mm512_maskz_shldv_epi16() {
2536 let a = _mm512_set1_epi16(1);
2537 let b = _mm512_set1_epi16(1 << 15);
2538 let c = _mm512_set1_epi16(2);
2539 let r = _mm512_maskz_shldv_epi16(0, a, b, c);
2540 assert_eq_m512i(r, _mm512_setzero_si512());
2541 let r = _mm512_maskz_shldv_epi16(0b11111111_11111111_11111111_11111111, a, b, c);
2542 let e = _mm512_set1_epi16(6);
2543 assert_eq_m512i(r, e);
2544 }
2545
2546 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2547 unsafe fn test_mm256_shldv_epi16() {
2548 let a = _mm256_set1_epi16(1);
2549 let b = _mm256_set1_epi16(1 << 15);
2550 let c = _mm256_set1_epi16(2);
2551 let r = _mm256_shldv_epi16(a, b, c);
2552 let e = _mm256_set1_epi16(6);
2553 assert_eq_m256i(r, e);
2554 }
2555
2556 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2557 unsafe fn test_mm256_mask_shldv_epi16() {
2558 let a = _mm256_set1_epi16(1);
2559 let b = _mm256_set1_epi16(1 << 15);
2560 let c = _mm256_set1_epi16(2);
2561 let r = _mm256_mask_shldv_epi16(a, 0, b, c);
2562 assert_eq_m256i(r, a);
2563 let r = _mm256_mask_shldv_epi16(a, 0b11111111_11111111, b, c);
2564 let e = _mm256_set1_epi16(6);
2565 assert_eq_m256i(r, e);
2566 }
2567
2568 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2569 unsafe fn test_mm256_maskz_shldv_epi16() {
2570 let a = _mm256_set1_epi16(1);
2571 let b = _mm256_set1_epi16(1 << 15);
2572 let c = _mm256_set1_epi16(2);
2573 let r = _mm256_maskz_shldv_epi16(0, a, b, c);
2574 assert_eq_m256i(r, _mm256_setzero_si256());
2575 let r = _mm256_maskz_shldv_epi16(0b11111111_11111111, a, b, c);
2576 let e = _mm256_set1_epi16(6);
2577 assert_eq_m256i(r, e);
2578 }
2579
2580 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2581 unsafe fn test_mm_shldv_epi16() {
2582 let a = _mm_set1_epi16(1);
2583 let b = _mm_set1_epi16(1 << 15);
2584 let c = _mm_set1_epi16(2);
2585 let r = _mm_shldv_epi16(a, b, c);
2586 let e = _mm_set1_epi16(6);
2587 assert_eq_m128i(r, e);
2588 }
2589
2590 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2591 unsafe fn test_mm_mask_shldv_epi16() {
2592 let a = _mm_set1_epi16(1);
2593 let b = _mm_set1_epi16(1 << 15);
2594 let c = _mm_set1_epi16(2);
2595 let r = _mm_mask_shldv_epi16(a, 0, b, c);
2596 assert_eq_m128i(r, a);
2597 let r = _mm_mask_shldv_epi16(a, 0b11111111, b, c);
2598 let e = _mm_set1_epi16(6);
2599 assert_eq_m128i(r, e);
2600 }
2601
2602 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2603 unsafe fn test_mm_maskz_shldv_epi16() {
2604 let a = _mm_set1_epi16(1);
2605 let b = _mm_set1_epi16(1 << 15);
2606 let c = _mm_set1_epi16(2);
2607 let r = _mm_maskz_shldv_epi16(0, a, b, c);
2608 assert_eq_m128i(r, _mm_setzero_si128());
2609 let r = _mm_maskz_shldv_epi16(0b11111111, a, b, c);
2610 let e = _mm_set1_epi16(6);
2611 assert_eq_m128i(r, e);
2612 }
2613
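// shrdv tests: the double-width concatenation of the two source lanes is shifted right
// by the per-lane count in `c` and the lower half is kept. With 8 in the upper half,
// 2 in the lower half and a shift of 1, the low half of ((8 << N) | 2) >> 1 is 1, the
// value expected below. (Intel's guide describes vpshrdv as concatenating b:a; the
// expected values here follow the operand order used by this crate version, with `a`
// supplying the upper half.)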
2614 #[simd_test(enable = "avx512vbmi2")]
2615 unsafe fn test_mm512_shrdv_epi64() {
2616 let a = _mm512_set1_epi64(8);
2617 let b = _mm512_set1_epi64(2);
2618 let c = _mm512_set1_epi64(1);
2619 let r = _mm512_shrdv_epi64(a, b, c);
2620 let e = _mm512_set1_epi64(1);
2621 assert_eq_m512i(r, e);
2622 }
2623
2624 #[simd_test(enable = "avx512vbmi2")]
2625 unsafe fn test_mm512_mask_shrdv_epi64() {
2626 let a = _mm512_set1_epi64(8);
2627 let b = _mm512_set1_epi64(2);
2628 let c = _mm512_set1_epi64(1);
2629 let r = _mm512_mask_shrdv_epi64(a, 0, b, c);
2630 assert_eq_m512i(r, a);
2631 let r = _mm512_mask_shrdv_epi64(a, 0b11111111, b, c);
2632 let e = _mm512_set1_epi64(1);
2633 assert_eq_m512i(r, e);
2634 }
2635
2636 #[simd_test(enable = "avx512vbmi2")]
2637 unsafe fn test_mm512_maskz_shrdv_epi64() {
2638 let a = _mm512_set1_epi64(8);
2639 let b = _mm512_set1_epi64(2);
2640 let c = _mm512_set1_epi64(1);
2641 let r = _mm512_maskz_shrdv_epi64(0, a, b, c);
2642 assert_eq_m512i(r, _mm512_setzero_si512());
2643 let r = _mm512_maskz_shrdv_epi64(0b11111111, a, b, c);
2644 let e = _mm512_set1_epi64(1);
2645 assert_eq_m512i(r, e);
2646 }
2647
2648 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2649 unsafe fn test_mm256_shrdv_epi64() {
2650 let a = _mm256_set1_epi64x(8);
2651 let b = _mm256_set1_epi64x(2);
2652 let c = _mm256_set1_epi64x(1);
2653 let r = _mm256_shrdv_epi64(a, b, c);
2654 let e = _mm256_set1_epi64x(1);
2655 assert_eq_m256i(r, e);
2656 }
2657
2658 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2659 unsafe fn test_mm256_mask_shrdv_epi64() {
2660 let a = _mm256_set1_epi64x(8);
2661 let b = _mm256_set1_epi64x(2);
2662 let c = _mm256_set1_epi64x(1);
2663 let r = _mm256_mask_shrdv_epi64(a, 0, b, c);
2664 assert_eq_m256i(r, a);
2665 let r = _mm256_mask_shrdv_epi64(a, 0b00001111, b, c);
2666 let e = _mm256_set1_epi64x(1);
2667 assert_eq_m256i(r, e);
2668 }
2669
2670 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2671 unsafe fn test_mm256_maskz_shrdv_epi64() {
2672 let a = _mm256_set1_epi64x(8);
2673 let b = _mm256_set1_epi64x(2);
2674 let c = _mm256_set1_epi64x(1);
2675 let r = _mm256_maskz_shrdv_epi64(0, a, b, c);
2676 assert_eq_m256i(r, _mm256_setzero_si256());
2677 let r = _mm256_maskz_shrdv_epi64(0b00001111, a, b, c);
2678 let e = _mm256_set1_epi64x(1);
2679 assert_eq_m256i(r, e);
2680 }
2681
2682 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2683 unsafe fn test_mm_shrdv_epi64() {
2684 let a = _mm_set1_epi64x(8);
2685 let b = _mm_set1_epi64x(2);
2686 let c = _mm_set1_epi64x(1);
2687 let r = _mm_shrdv_epi64(a, b, c);
2688 let e = _mm_set1_epi64x(1);
2689 assert_eq_m128i(r, e);
2690 }
2691
2692 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2693 unsafe fn test_mm_mask_shrdv_epi64() {
2694 let a = _mm_set1_epi64x(8);
2695 let b = _mm_set1_epi64x(2);
2696 let c = _mm_set1_epi64x(1);
2697 let r = _mm_mask_shrdv_epi64(a, 0, b, c);
2698 assert_eq_m128i(r, a);
2699 let r = _mm_mask_shrdv_epi64(a, 0b00000011, b, c);
2700 let e = _mm_set1_epi64x(1);
2701 assert_eq_m128i(r, e);
2702 }
2703
2704 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2705 unsafe fn test_mm_maskz_shrdv_epi64() {
2706 let a = _mm_set1_epi64x(8);
2707 let b = _mm_set1_epi64x(2);
2708 let c = _mm_set1_epi64x(1);
2709 let r = _mm_maskz_shrdv_epi64(0, a, b, c);
2710 assert_eq_m128i(r, _mm_setzero_si128());
2711 let r = _mm_maskz_shrdv_epi64(0b00000011, a, b, c);
2712 let e = _mm_set1_epi64x(1);
2713 assert_eq_m128i(r, e);
2714 }
2715
2716 #[simd_test(enable = "avx512vbmi2")]
2717 unsafe fn test_mm512_shrdv_epi32() {
2718 let a = _mm512_set1_epi32(8);
2719 let b = _mm512_set1_epi32(2);
2720 let c = _mm512_set1_epi32(1);
2721 let r = _mm512_shrdv_epi32(a, b, c);
2722 let e = _mm512_set1_epi32(1);
2723 assert_eq_m512i(r, e);
2724 }
2725
2726 #[simd_test(enable = "avx512vbmi2")]
2727 unsafe fn test_mm512_mask_shrdv_epi32() {
2728 let a = _mm512_set1_epi32(8);
2729 let b = _mm512_set1_epi32(2);
2730 let c = _mm512_set1_epi32(1);
2731 let r = _mm512_mask_shrdv_epi32(a, 0, b, c);
2732 assert_eq_m512i(r, a);
2733 let r = _mm512_mask_shrdv_epi32(a, 0b11111111_11111111, b, c);
2734 let e = _mm512_set1_epi32(1);
2735 assert_eq_m512i(r, e);
2736 }
2737
2738 #[simd_test(enable = "avx512vbmi2")]
2739 unsafe fn test_mm512_maskz_shrdv_epi32() {
2740 let a = _mm512_set1_epi32(8);
2741 let b = _mm512_set1_epi32(2);
2742 let c = _mm512_set1_epi32(1);
2743 let r = _mm512_maskz_shrdv_epi32(0, a, b, c);
2744 assert_eq_m512i(r, _mm512_setzero_si512());
2745 let r = _mm512_maskz_shrdv_epi32(0b11111111_11111111, a, b, c);
2746 let e = _mm512_set1_epi32(1);
2747 assert_eq_m512i(r, e);
2748 }
2749
2750 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2751 unsafe fn test_mm256_shrdv_epi32() {
2752 let a = _mm256_set1_epi32(8);
2753 let b = _mm256_set1_epi32(2);
2754 let c = _mm256_set1_epi32(1);
2755 let r = _mm256_shrdv_epi32(a, b, c);
2756 let e = _mm256_set1_epi32(1);
2757 assert_eq_m256i(r, e);
2758 }
2759
2760 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2761 unsafe fn test_mm256_mask_shrdv_epi32() {
2762 let a = _mm256_set1_epi32(8);
2763 let b = _mm256_set1_epi32(2);
2764 let c = _mm256_set1_epi32(1);
2765 let r = _mm256_mask_shrdv_epi32(a, 0, b, c);
2766 assert_eq_m256i(r, a);
2767 let r = _mm256_mask_shrdv_epi32(a, 0b11111111, b, c);
2768 let e = _mm256_set1_epi32(1);
2769 assert_eq_m256i(r, e);
2770 }
2771
2772 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2773 unsafe fn test_mm256_maskz_shrdv_epi32() {
2774 let a = _mm256_set1_epi32(8);
2775 let b = _mm256_set1_epi32(2);
2776 let c = _mm256_set1_epi32(1);
2777 let r = _mm256_maskz_shrdv_epi32(0, a, b, c);
2778 assert_eq_m256i(r, _mm256_setzero_si256());
2779 let r = _mm256_maskz_shrdv_epi32(0b11111111, a, b, c);
2780 let e = _mm256_set1_epi32(1);
2781 assert_eq_m256i(r, e);
2782 }
2783
2784 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2785 unsafe fn test_mm_shrdv_epi32() {
2786 let a = _mm_set1_epi32(8);
2787 let b = _mm_set1_epi32(2);
2788 let c = _mm_set1_epi32(1);
2789 let r = _mm_shrdv_epi32(a, b, c);
2790 let e = _mm_set1_epi32(1);
2791 assert_eq_m128i(r, e);
2792 }
2793
2794 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2795 unsafe fn test_mm_mask_shrdv_epi32() {
2796 let a = _mm_set1_epi32(8);
2797 let b = _mm_set1_epi32(2);
2798 let c = _mm_set1_epi32(1);
2799 let r = _mm_mask_shrdv_epi32(a, 0, b, c);
2800 assert_eq_m128i(r, a);
2801 let r = _mm_mask_shrdv_epi32(a, 0b00001111, b, c);
2802 let e = _mm_set1_epi32(1);
2803 assert_eq_m128i(r, e);
2804 }
2805
2806 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2807 unsafe fn test_mm_maskz_shrdv_epi32() {
2808 let a = _mm_set1_epi32(8);
2809 let b = _mm_set1_epi32(2);
2810 let c = _mm_set1_epi32(1);
2811 let r = _mm_maskz_shrdv_epi32(0, a, b, c);
2812 assert_eq_m128i(r, _mm_setzero_si128());
2813 let r = _mm_maskz_shrdv_epi32(0b00001111, a, b, c);
2814 let e = _mm_set1_epi32(1);
2815 assert_eq_m128i(r, e);
2816 }
2817
2818 #[simd_test(enable = "avx512vbmi2")]
2819 unsafe fn test_mm512_shrdv_epi16() {
2820 let a = _mm512_set1_epi16(8);
2821 let b = _mm512_set1_epi16(2);
2822 let c = _mm512_set1_epi16(1);
2823 let r = _mm512_shrdv_epi16(a, b, c);
2824 let e = _mm512_set1_epi16(1);
2825 assert_eq_m512i(r, e);
2826 }
2827
2828 #[simd_test(enable = "avx512vbmi2")]
2829 unsafe fn test_mm512_mask_shrdv_epi16() {
2830 let a = _mm512_set1_epi16(8);
2831 let b = _mm512_set1_epi16(2);
2832 let c = _mm512_set1_epi16(1);
2833 let r = _mm512_mask_shrdv_epi16(a, 0, b, c);
2834 assert_eq_m512i(r, a);
2835 let r = _mm512_mask_shrdv_epi16(a, 0b11111111_11111111_11111111_11111111, b, c);
2836 let e = _mm512_set1_epi16(1);
2837 assert_eq_m512i(r, e);
2838 }
2839
2840 #[simd_test(enable = "avx512vbmi2")]
2841 unsafe fn test_mm512_maskz_shrdv_epi16() {
2842 let a = _mm512_set1_epi16(8);
2843 let b = _mm512_set1_epi16(2);
2844 let c = _mm512_set1_epi16(1);
2845 let r = _mm512_maskz_shrdv_epi16(0, a, b, c);
2846 assert_eq_m512i(r, _mm512_setzero_si512());
2847 let r = _mm512_maskz_shrdv_epi16(0b11111111_11111111_11111111_11111111, a, b, c);
2848 let e = _mm512_set1_epi16(1);
2849 assert_eq_m512i(r, e);
2850 }
2851
2852 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2853 unsafe fn test_mm256_shrdv_epi16() {
2854 let a = _mm256_set1_epi16(8);
2855 let b = _mm256_set1_epi16(2);
2856 let c = _mm256_set1_epi16(1);
2857 let r = _mm256_shrdv_epi16(a, b, c);
2858 let e = _mm256_set1_epi16(1);
2859 assert_eq_m256i(r, e);
2860 }
2861
2862 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2863 unsafe fn test_mm256_mask_shrdv_epi16() {
2864 let a = _mm256_set1_epi16(8);
2865 let b = _mm256_set1_epi16(2);
2866 let c = _mm256_set1_epi16(1);
2867 let r = _mm256_mask_shrdv_epi16(a, 0, b, c);
2868 assert_eq_m256i(r, a);
2869 let r = _mm256_mask_shrdv_epi16(a, 0b11111111_11111111, b, c);
2870 let e = _mm256_set1_epi16(1);
2871 assert_eq_m256i(r, e);
2872 }
2873
2874 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2875 unsafe fn test_mm256_maskz_shrdv_epi16() {
2876 let a = _mm256_set1_epi16(8);
2877 let b = _mm256_set1_epi16(2);
2878 let c = _mm256_set1_epi16(1);
2879 let r = _mm256_maskz_shrdv_epi16(0, a, b, c);
2880 assert_eq_m256i(r, _mm256_setzero_si256());
2881 let r = _mm256_maskz_shrdv_epi16(0b11111111_11111111, a, b, c);
2882 let e = _mm256_set1_epi16(1);
2883 assert_eq_m256i(r, e);
2884 }
2885
2886 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2887 unsafe fn test_mm_shrdv_epi16() {
2888 let a = _mm_set1_epi16(8);
2889 let b = _mm_set1_epi16(2);
2890 let c = _mm_set1_epi16(1);
2891 let r = _mm_shrdv_epi16(a, b, c);
2892 let e = _mm_set1_epi16(1);
2893 assert_eq_m128i(r, e);
2894 }
2895
2896 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2897 unsafe fn test_mm_mask_shrdv_epi16() {
2898 let a = _mm_set1_epi16(8);
2899 let b = _mm_set1_epi16(2);
2900 let c = _mm_set1_epi16(1);
2901 let r = _mm_mask_shrdv_epi16(a, 0, b, c);
2902 assert_eq_m128i(r, a);
2903 let r = _mm_mask_shrdv_epi16(a, 0b11111111, b, c);
2904 let e = _mm_set1_epi16(1);
2905 assert_eq_m128i(r, e);
2906 }
2907
2908 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2909 unsafe fn test_mm_maskz_shrdv_epi16() {
2910 let a = _mm_set1_epi16(8);
2911 let b = _mm_set1_epi16(2);
2912 let c = _mm_set1_epi16(1);
2913 let r = _mm_maskz_shrdv_epi16(0, a, b, c);
2914 assert_eq_m128i(r, _mm_setzero_si128());
2915 let r = _mm_maskz_shrdv_epi16(0b11111111, a, b, c);
2916 let e = _mm_set1_epi16(1);
2917 assert_eq_m128i(r, e);
2918 }
2919
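// shldi tests: the immediate-count form of shldv. The shift amount is the constant 2
// rather than a per-lane vector, so the expected results mirror the shldv tests above.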
2920 #[simd_test(enable = "avx512vbmi2")]
2921 unsafe fn test_mm512_shldi_epi64() {
2922 let a = _mm512_set1_epi64(1);
2923 let b = _mm512_set1_epi64(1 << 63);
2924 let r = _mm512_shldi_epi64(a, b, 2);
2925 let e = _mm512_set1_epi64(6);
2926 assert_eq_m512i(r, e);
2927 }
2928
2929 #[simd_test(enable = "avx512vbmi2")]
2930 unsafe fn test_mm512_mask_shldi_epi64() {
2931 let a = _mm512_set1_epi64(1);
2932 let b = _mm512_set1_epi64(1 << 63);
2933 let r = _mm512_mask_shldi_epi64(a, 0, a, b, 2);
2934 assert_eq_m512i(r, a);
2935 let r = _mm512_mask_shldi_epi64(a, 0b11111111, a, b, 2);
2936 let e = _mm512_set1_epi64(6);
2937 assert_eq_m512i(r, e);
2938 }
2939
2940 #[simd_test(enable = "avx512vbmi2")]
2941 unsafe fn test_mm512_maskz_shldi_epi64() {
2942 let a = _mm512_set1_epi64(1);
2943 let b = _mm512_set1_epi64(1 << 63);
2944 let r = _mm512_maskz_shldi_epi64(0, a, b, 2);
2945 assert_eq_m512i(r, _mm512_setzero_si512());
2946 let r = _mm512_maskz_shldi_epi64(0b11111111, a, b, 2);
2947 let e = _mm512_set1_epi64(6);
2948 assert_eq_m512i(r, e);
2949 }
2950
2951 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2952 unsafe fn test_mm256_shldi_epi64() {
2953 let a = _mm256_set1_epi64x(1);
2954 let b = _mm256_set1_epi64x(1 << 63);
2955 let r = _mm256_shldi_epi64(a, b, 2);
2956 let e = _mm256_set1_epi64x(6);
2957 assert_eq_m256i(r, e);
2958 }
2959
2960 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2961 unsafe fn test_mm256_mask_shldi_epi64() {
2962 let a = _mm256_set1_epi64x(1);
2963 let b = _mm256_set1_epi64x(1 << 63);
2964 let r = _mm256_mask_shldi_epi64(a, 0, a, b, 2);
2965 assert_eq_m256i(r, a);
2966 let r = _mm256_mask_shldi_epi64(a, 0b00001111, a, b, 2);
2967 let e = _mm256_set1_epi64x(6);
2968 assert_eq_m256i(r, e);
2969 }
2970
2971 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2972 unsafe fn test_mm256_maskz_shldi_epi64() {
2973 let a = _mm256_set1_epi64x(1);
2974 let b = _mm256_set1_epi64x(1 << 63);
2975 let r = _mm256_maskz_shldi_epi64(0, a, b, 2);
2976 assert_eq_m256i(r, _mm256_setzero_si256());
2977 let r = _mm256_maskz_shldi_epi64(0b00001111, a, b, 2);
2978 let e = _mm256_set1_epi64x(6);
2979 assert_eq_m256i(r, e);
2980 }
2981
2982 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2983 unsafe fn test_mm_shldi_epi64() {
2984 let a = _mm_set1_epi64x(1);
2985 let b = _mm_set1_epi64x(1 << 63);
2986 let r = _mm_shldi_epi64(a, b, 2);
2987 let e = _mm_set1_epi64x(6);
2988 assert_eq_m128i(r, e);
2989 }
2990
2991 #[simd_test(enable = "avx512vbmi2,avx512vl")]
2992 unsafe fn test_mm_mask_shldi_epi64() {
2993 let a = _mm_set1_epi64x(1);
2994 let b = _mm_set1_epi64x(1 << 63);
2995 let r = _mm_mask_shldi_epi64(a, 0, a, b, 2);
2996 assert_eq_m128i(r, a);
2997 let r = _mm_mask_shldi_epi64(a, 0b00000011, a, b, 2);
2998 let e = _mm_set1_epi64x(6);
2999 assert_eq_m128i(r, e);
3000 }
3001
3002 #[simd_test(enable = "avx512vbmi2,avx512vl")]
3003 unsafe fn test_mm_maskz_shldi_epi64() {
3004 let a = _mm_set1_epi64x(1);
3005 let b = _mm_set1_epi64x(1 << 63);
3006 let r = _mm_maskz_shldi_epi64(0, a, b, 2);
3007 assert_eq_m128i(r, _mm_setzero_si128());
3008 let r = _mm_maskz_shldi_epi64(0b00000011, a, b, 2);
3009 let e = _mm_set1_epi64x(6);
3010 assert_eq_m128i(r, e);
3011 }
3012
3013 #[simd_test(enable = "avx512vbmi2")]
3014 unsafe fn test_mm512_shldi_epi32() {
3015 let a = _mm512_set1_epi32(1);
3016 let b = _mm512_set1_epi32(1 << 31);
3017 let r = _mm512_shldi_epi32(a, b, 2);
3018 let e = _mm512_set1_epi32(6);
3019 assert_eq_m512i(r, e);
3020 }
3021
3022 #[simd_test(enable = "avx512vbmi2")]
3023 unsafe fn test_mm512_mask_shldi_epi32() {
3024 let a = _mm512_set1_epi32(1);
3025 let b = _mm512_set1_epi32(1 << 31);
3026 let r = _mm512_mask_shldi_epi32(a, 0, a, b, 2);
3027 assert_eq_m512i(r, a);
3028 let r = _mm512_mask_shldi_epi32(a, 0b11111111_11111111, a, b, 2);
3029 let e = _mm512_set1_epi32(6);
3030 assert_eq_m512i(r, e);
3031 }
3032
3033 #[simd_test(enable = "avx512vbmi2")]
3034 unsafe fn test_mm512_maskz_shldi_epi32() {
3035 let a = _mm512_set1_epi32(1);
3036 let b = _mm512_set1_epi32(1 << 31);
3037 let r = _mm512_maskz_shldi_epi32(0, a, b, 2);
3038 assert_eq_m512i(r, _mm512_setzero_si512());
3039 let r = _mm512_maskz_shldi_epi32(0b11111111_11111111, a, b, 2);
3040 let e = _mm512_set1_epi32(6);
3041 assert_eq_m512i(r, e);
3042 }
3043
3044 #[simd_test(enable = "avx512vbmi2,avx512vl")]
3045 unsafe fn test_mm256_shldi_epi32() {
3046 let a = _mm256_set1_epi32(1);
3047 let b = _mm256_set1_epi32(1 << 31);
3048 let r = _mm256_shldi_epi32(a, b, 2);
3049 let e = _mm256_set1_epi32(6);
3050 assert_eq_m256i(r, e);
3051 }
3052
3053 #[simd_test(enable = "avx512vbmi2,avx512vl")]
3054 unsafe fn test_mm256_mask_shldi_epi32() {
3055 let a = _mm256_set1_epi32(1);
3056 let b = _mm256_set1_epi32(1 << 31);
3057 let r = _mm256_mask_shldi_epi32(a, 0, a, b, 2);
3058 assert_eq_m256i(r, a);
3059 let r = _mm256_mask_shldi_epi32(a, 0b11111111, a, b, 2);
3060 let e = _mm256_set1_epi32(6);
3061 assert_eq_m256i(r, e);
3062 }
3063
3064 #[simd_test(enable = "avx512vbmi2,avx512vl")]
3065 unsafe fn test_mm256_maskz_shldi_epi32() {
3066 let a = _mm256_set1_epi32(1);
3067 let b = _mm256_set1_epi32(1 << 31);
3068 let r = _mm256_maskz_shldi_epi32(0, a, b, 2);
3069 assert_eq_m256i(r, _mm256_setzero_si256());
3070 let r = _mm256_maskz_shldi_epi32(0b11111111, a, b, 2);
3071 let e = _mm256_set1_epi32(6);
3072 assert_eq_m256i(r, e);
3073 }
3074
3075 #[simd_test(enable = "avx512vbmi2,avx512vl")]
3076 unsafe fn test_mm_shldi_epi32() {
3077 let a = _mm_set1_epi32(1);
3078 let b = _mm_set1_epi32(1 << 31);
3079 let r = _mm_shldi_epi32(a, b, 2);
3080 let e = _mm_set1_epi32(6);
3081 assert_eq_m128i(r, e);
3082 }
3083
3084 #[simd_test(enable = "avx512vbmi2,avx512vl")]
3085 unsafe fn test_mm_mask_shldi_epi32() {
3086 let a = _mm_set1_epi32(1);
3087 let b = _mm_set1_epi32(1 << 31);
3088 let r = _mm_mask_shldi_epi32(a, 0, a, b, 2);
3089 assert_eq_m128i(r, a);
3090 let r = _mm_mask_shldi_epi32(a, 0b00001111, a, b, 2);
3091 let e = _mm_set1_epi32(6);
3092 assert_eq_m128i(r, e);
3093 }
3094
3095 #[simd_test(enable = "avx512vbmi2,avx512vl")]
3096 unsafe fn test_mm_maskz_shldi_epi32() {
3097 let a = _mm_set1_epi32(1);
3098 let b = _mm_set1_epi32(1 << 31);
3099 let r = _mm_maskz_shldi_epi32(0, a, b, 2);
3100 assert_eq_m128i(r, _mm_setzero_si128());
3101 let r = _mm_maskz_shldi_epi32(0b00001111, a, b, 2);
3102 let e = _mm_set1_epi32(6);
3103 assert_eq_m128i(r, e);
3104 }
3105
3106 #[simd_test(enable = "avx512vbmi2")]
3107 unsafe fn test_mm512_shldi_epi16() {
3108 let a = _mm512_set1_epi16(1);
3109 let b = _mm512_set1_epi16(1 << 15);
3110 let r = _mm512_shldi_epi16(a, b, 2);
3111 let e = _mm512_set1_epi16(6);
3112 assert_eq_m512i(r, e);
3113 }
3114
3115 #[simd_test(enable = "avx512vbmi2")]
3116 unsafe fn test_mm512_mask_shldi_epi16() {
3117 let a = _mm512_set1_epi16(1);
3118 let b = _mm512_set1_epi16(1 << 15);
3119 let r = _mm512_mask_shldi_epi16(a, 0, a, b, 2);
3120 assert_eq_m512i(r, a);
3121 let r = _mm512_mask_shldi_epi16(a, 0b11111111_11111111_11111111_11111111, a, b, 2);
3122 let e = _mm512_set1_epi16(6);
3123 assert_eq_m512i(r, e);
3124 }
3125
3126 #[simd_test(enable = "avx512vbmi2")]
3127 unsafe fn test_mm512_maskz_shldi_epi16() {
3128 let a = _mm512_set1_epi16(1);
3129 let b = _mm512_set1_epi16(1 << 15);
3130 let r = _mm512_maskz_shldi_epi16(0, a, b, 2);
3131 assert_eq_m512i(r, _mm512_setzero_si512());
3132 let r = _mm512_maskz_shldi_epi16(0b11111111_11111111_11111111_11111111, a, b, 2);
3133 let e = _mm512_set1_epi16(6);
3134 assert_eq_m512i(r, e);
3135 }
3136
3137 #[simd_test(enable = "avx512vbmi2,avx512vl")]
3138 unsafe fn test_mm256_shldi_epi16() {
3139 let a = _mm256_set1_epi16(1);
3140 let b = _mm256_set1_epi16(1 << 15);
3141 let r = _mm256_shldi_epi16(a, b, 2);
3142 let e = _mm256_set1_epi16(6);
3143 assert_eq_m256i(r, e);
3144 }
3145
3146 #[simd_test(enable = "avx512vbmi2,avx512vl")]
3147 unsafe fn test_mm256_mask_shldi_epi16() {
3148 let a = _mm256_set1_epi16(1);
3149 let b = _mm256_set1_epi16(1 << 15);
3150 let r = _mm256_mask_shldi_epi16(a, 0, a, b, 2);
3151 assert_eq_m256i(r, a);
3152 let r = _mm256_mask_shldi_epi16(a, 0b11111111_11111111, a, b, 2);
3153 let e = _mm256_set1_epi16(6);
3154 assert_eq_m256i(r, e);
3155 }
3156
3157 #[simd_test(enable = "avx512vbmi2,avx512vl")]
3158 unsafe fn test_mm256_maskz_shldi_epi16() {
3159 let a = _mm256_set1_epi16(1);
3160 let b = _mm256_set1_epi16(1 << 15);
3161 let r = _mm256_maskz_shldi_epi16(0, a, b, 2);
3162 assert_eq_m256i(r, _mm256_setzero_si256());
3163 let r = _mm256_maskz_shldi_epi16(0b11111111_11111111, a, b, 2);
3164 let e = _mm256_set1_epi16(6);
3165 assert_eq_m256i(r, e);
3166 }
3167
3168 #[simd_test(enable = "avx512vbmi2,avx512vl")]
3169 unsafe fn test_mm_shldi_epi16() {
3170 let a = _mm_set1_epi16(1);
3171 let b = _mm_set1_epi16(1 << 15);
3172 let r = _mm_shldi_epi16(a, b, 2);
3173 let e = _mm_set1_epi16(6);
3174 assert_eq_m128i(r, e);
3175 }
3176
3177 #[simd_test(enable = "avx512vbmi2,avx512vl")]
3178 unsafe fn test_mm_mask_shldi_epi16() {
3179 let a = _mm_set1_epi16(1);
3180 let b = _mm_set1_epi16(1 << 15);
3181 let r = _mm_mask_shldi_epi16(a, 0, a, b, 2);
3182 assert_eq_m128i(r, a);
3183 let r = _mm_mask_shldi_epi16(a, 0b11111111, a, b, 2);
3184 let e = _mm_set1_epi16(6);
3185 assert_eq_m128i(r, e);
3186 }
3187
3188 #[simd_test(enable = "avx512vbmi2,avx512vl")]
3189 unsafe fn test_mm_maskz_shldi_epi16() {
3190 let a = _mm_set1_epi16(1);
3191 let b = _mm_set1_epi16(1 << 15);
3192 let r = _mm_maskz_shldi_epi16(0, a, b, 2);
3193 assert_eq_m128i(r, _mm_setzero_si128());
3194 let r = _mm_maskz_shldi_epi16(0b11111111, a, b, 2);
3195 let e = _mm_set1_epi16(6);
3196 assert_eq_m128i(r, e);
3197 }
3198
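// shrdi tests: the immediate-count form of shrdv, with a constant shift of 1; the
// expected values mirror the shrdv tests above.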
3199 #[simd_test(enable = "avx512vbmi2")]
3200 unsafe fn test_mm512_shrdi_epi64() {
3201 let a = _mm512_set1_epi64(8);
3202 let b = _mm512_set1_epi64(2);
3203 let r = _mm512_shrdi_epi64(a, b, 1);
3204 let e = _mm512_set1_epi64(1);
3205 assert_eq_m512i(r, e);
3206 }
3207
3208 #[simd_test(enable = "avx512vbmi2")]
3209 unsafe fn test_mm512_mask_shrdi_epi64() {
3210 let a = _mm512_set1_epi64(8);
3211 let b = _mm512_set1_epi64(2);
3212 let r = _mm512_mask_shrdi_epi64(a, 0, a, b, 1);
3213 assert_eq_m512i(r, a);
3214 let r = _mm512_mask_shrdi_epi64(a, 0b11111111, a, b, 1);
3215 let e = _mm512_set1_epi64(1);
3216 assert_eq_m512i(r, e);
3217 }
3218
3219 #[simd_test(enable = "avx512vbmi2")]
3220 unsafe fn test_mm512_maskz_shrdi_epi64() {
3221 let a = _mm512_set1_epi64(8);
3222 let b = _mm512_set1_epi64(2);
3223 let r = _mm512_maskz_shrdi_epi64(0, a, b, 1);
3224 assert_eq_m512i(r, _mm512_setzero_si512());
3225 let r = _mm512_maskz_shrdi_epi64(0b11111111, a, b, 1);
3226 let e = _mm512_set1_epi64(1);
3227 assert_eq_m512i(r, e);
3228 }
3229
3230 #[simd_test(enable = "avx512vbmi2,avx512vl")]
3231 unsafe fn test_mm256_shrdi_epi64() {
3232 let a = _mm256_set1_epi64x(8);
3233 let b = _mm256_set1_epi64x(2);
3234 let r = _mm256_shrdi_epi64(a, b, 1);
3235 let e = _mm256_set1_epi64x(1);
3236 assert_eq_m256i(r, e);
3237 }
3238
3239 #[simd_test(enable = "avx512vbmi2,avx512vl")]
3240 unsafe fn test_mm256_mask_shrdi_epi64() {
3241 let a = _mm256_set1_epi64x(8);
3242 let b = _mm256_set1_epi64x(2);
3243 let r = _mm256_mask_shrdi_epi64(a, 0, a, b, 1);
3244 assert_eq_m256i(r, a);
3245 let r = _mm256_mask_shrdi_epi64(a, 0b00001111, a, b, 1);
3246 let e = _mm256_set1_epi64x(1);
3247 assert_eq_m256i(r, e);
3248 }
3249
3250 #[simd_test(enable = "avx512vbmi2,avx512vl")]
3251 unsafe fn test_mm256_maskz_shrdi_epi64() {
3252 let a = _mm256_set1_epi64x(8);
3253 let b = _mm256_set1_epi64x(2);
3254 let r = _mm256_maskz_shrdi_epi64(0, a, b, 1);
3255 assert_eq_m256i(r, _mm256_setzero_si256());
3256 let r = _mm256_maskz_shrdi_epi64(0b00001111, a, b, 1);
3257 let e = _mm256_set1_epi64x(1);
3258 assert_eq_m256i(r, e);
3259 }
3260
3261 #[simd_test(enable = "avx512vbmi2,avx512vl")]
3262 unsafe fn test_mm_shrdi_epi64() {
3263 let a = _mm_set1_epi64x(8);
3264 let b = _mm_set1_epi64x(2);
3265 let r = _mm_shrdi_epi64(a, b, 1);
3266 let e = _mm_set1_epi64x(1);
3267 assert_eq_m128i(r, e);
3268 }
3269
3270 #[simd_test(enable = "avx512vbmi2,avx512vl")]
3271 unsafe fn test_mm_mask_shrdi_epi64() {
3272 let a = _mm_set1_epi64x(8);
3273 let b = _mm_set1_epi64x(2);
3274 let r = _mm_mask_shrdi_epi64(a, 0, a, b, 1);
3275 assert_eq_m128i(r, a);
3276 let r = _mm_mask_shrdi_epi64(a, 0b00000011, a, b, 1);
3277 let e = _mm_set1_epi64x(1);
3278 assert_eq_m128i(r, e);
3279 }
3280
3281 #[simd_test(enable = "avx512vbmi2,avx512vl")]
3282 unsafe fn test_mm_maskz_shrdi_epi64() {
3283 let a = _mm_set1_epi64x(8);
3284 let b = _mm_set1_epi64x(2);
3285 let r = _mm_maskz_shrdi_epi64(0, a, b, 1);
3286 assert_eq_m128i(r, _mm_setzero_si128());
3287 let r = _mm_maskz_shrdi_epi64(0b00000011, a, b, 1);
3288 let e = _mm_set1_epi64x(1);
3289 assert_eq_m128i(r, e);
3290 }
3291
3292 #[simd_test(enable = "avx512vbmi2")]
3293 unsafe fn test_mm512_shrdi_epi32() {
3294 let a = _mm512_set1_epi32(8);
3295 let b = _mm512_set1_epi32(2);
3296 let r = _mm512_shrdi_epi32(a, b, 1);
3297 let e = _mm512_set1_epi32(1);
3298 assert_eq_m512i(r, e);
3299 }
3300
3301 #[simd_test(enable = "avx512vbmi2")]
3302 unsafe fn test_mm512_mask_shrdi_epi32() {
3303 let a = _mm512_set1_epi32(8);
3304 let b = _mm512_set1_epi32(2);
3305 let r = _mm512_mask_shrdi_epi32(a, 0, a, b, 1);
3306 assert_eq_m512i(r, a);
3307 let r = _mm512_mask_shrdi_epi32(a, 0b11111111_11111111, a, b, 1);
3308 let e = _mm512_set1_epi32(1);
3309 assert_eq_m512i(r, e);
3310 }
3311
3312 #[simd_test(enable = "avx512vbmi2")]
3313 unsafe fn test_mm512_maskz_shrdi_epi32() {
3314 let a = _mm512_set1_epi32(8);
3315 let b = _mm512_set1_epi32(2);
3316 let r = _mm512_maskz_shrdi_epi32(0, a, b, 1);
3317 assert_eq_m512i(r, _mm512_setzero_si512());
3318 let r = _mm512_maskz_shrdi_epi32(0b11111111_11111111, a, b, 1);
3319 let e = _mm512_set1_epi32(1);
3320 assert_eq_m512i(r, e);
3321 }
3322
3323 #[simd_test(enable = "avx512vbmi2,avx512vl")]
3324 unsafe fn test_mm256_shrdi_epi32() {
3325 let a = _mm256_set1_epi32(8);
3326 let b = _mm256_set1_epi32(2);
3327 let r = _mm256_shrdi_epi32(a, b, 1);
3328 let e = _mm256_set1_epi32(1);
3329 assert_eq_m256i(r, e);
3330 }
3331
3332 #[simd_test(enable = "avx512vbmi2,avx512vl")]
3333 unsafe fn test_mm256_mask_shrdi_epi32() {
3334 let a = _mm256_set1_epi32(8);
3335 let b = _mm256_set1_epi32(2);
3336 let r = _mm256_mask_shrdi_epi32(a, 0, a, b, 1);
3337 assert_eq_m256i(r, a);
3338 let r = _mm256_mask_shrdi_epi32(a, 0b11111111, a, b, 1);
3339 let e = _mm256_set1_epi32(1);
3340 assert_eq_m256i(r, e);
3341 }
3342
3343 #[simd_test(enable = "avx512vbmi2,avx512vl")]
3344 unsafe fn test_mm256_maskz_shrdi_epi32() {
3345 let a = _mm256_set1_epi32(8);
3346 let b = _mm256_set1_epi32(2);
3347 let r = _mm256_maskz_shrdi_epi32(0, a, b, 1);
3348 assert_eq_m256i(r, _mm256_setzero_si256());
3349 let r = _mm256_maskz_shrdi_epi32(0b11111111, a, b, 1);
3350 let e = _mm256_set1_epi32(1);
3351 assert_eq_m256i(r, e);
3352 }
3353
3354 #[simd_test(enable = "avx512vbmi2,avx512vl")]
3355 unsafe fn test_mm_shrdi_epi32() {
3356 let a = _mm_set1_epi32(8);
3357 let b = _mm_set1_epi32(2);
3358 let r = _mm_shrdi_epi32(a, b, 1);
3359 let e = _mm_set1_epi32(1);
3360 assert_eq_m128i(r, e);
3361 }
3362
3363 #[simd_test(enable = "avx512vbmi2,avx512vl")]
3364 unsafe fn test_mm_mask_shrdi_epi32() {
3365 let a = _mm_set1_epi32(8);
3366 let b = _mm_set1_epi32(2);
3367 let r = _mm_mask_shrdi_epi32(a, 0, a, b, 1);
3368 assert_eq_m128i(r, a);
3369 let r = _mm_mask_shrdi_epi32(a, 0b00001111, a, b, 1);
3370 let e = _mm_set1_epi32(1);
3371 assert_eq_m128i(r, e);
3372 }
3373
3374 #[simd_test(enable = "avx512vbmi2,avx512vl")]
3375 unsafe fn test_mm_maskz_shrdi_epi32() {
3376 let a = _mm_set1_epi32(8);
3377 let b = _mm_set1_epi32(2);
3378 let r = _mm_maskz_shrdi_epi32(0, a, b, 1);
3379 assert_eq_m128i(r, _mm_setzero_si128());
3380 let r = _mm_maskz_shrdi_epi32(0b00001111, a, b, 1);
3381 let e = _mm_set1_epi32(1);
3382 assert_eq_m128i(r, e);
3383 }
3384
3385 #[simd_test(enable = "avx512vbmi2")]
3386 unsafe fn test_mm512_shrdi_epi16() {
3387 let a = _mm512_set1_epi16(8);
3388 let b = _mm512_set1_epi16(2);
3389 let r = _mm512_shrdi_epi16(a, b, 1);
3390 let e = _mm512_set1_epi16(1);
3391 assert_eq_m512i(r, e);
3392 }
3393
3394 #[simd_test(enable = "avx512vbmi2")]
3395 unsafe fn test_mm512_mask_shrdi_epi16() {
3396 let a = _mm512_set1_epi16(8);
3397 let b = _mm512_set1_epi16(2);
3398 let r = _mm512_mask_shrdi_epi16(a, 0, a, b, 1);
3399 assert_eq_m512i(r, a);
3400 let r = _mm512_mask_shrdi_epi16(a, 0b11111111_11111111_11111111_11111111, a, b, 1);
3401 let e = _mm512_set1_epi16(1);
3402 assert_eq_m512i(r, e);
3403 }
3404
3405 #[simd_test(enable = "avx512vbmi2")]
3406 unsafe fn test_mm512_maskz_shrdi_epi16() {
3407 let a = _mm512_set1_epi16(8);
3408 let b = _mm512_set1_epi16(2);
3409 let r = _mm512_maskz_shrdi_epi16(0, a, b, 1);
3410 assert_eq_m512i(r, _mm512_setzero_si512());
3411 let r = _mm512_maskz_shrdi_epi16(0b11111111_11111111_11111111_11111111, a, b, 1);
3412 let e = _mm512_set1_epi16(1);
3413 assert_eq_m512i(r, e);
3414 }
3415
3416 #[simd_test(enable = "avx512vbmi2,avx512vl")]
3417 unsafe fn test_mm256_shrdi_epi16() {
3418 let a = _mm256_set1_epi16(8);
3419 let b = _mm256_set1_epi16(2);
3420 let r = _mm256_shrdi_epi16(a, b, 1);
3421 let e = _mm256_set1_epi16(1);
3422 assert_eq_m256i(r, e);
3423 }
3424
3425 #[simd_test(enable = "avx512vbmi2,avx512vl")]
3426 unsafe fn test_mm256_mask_shrdi_epi16() {
3427 let a = _mm256_set1_epi16(8);
3428 let b = _mm256_set1_epi16(2);
3429 let r = _mm256_mask_shrdi_epi16(a, 0, a, b, 1);
3430 assert_eq_m256i(r, a);
3431 let r = _mm256_mask_shrdi_epi16(a, 0b11111111_11111111, a, b, 1);
3432 let e = _mm256_set1_epi16(1);
3433 assert_eq_m256i(r, e);
3434 }
3435
3436 #[simd_test(enable = "avx512vbmi2,avx512vl")]
3437 unsafe fn test_mm256_maskz_shrdi_epi16() {
3438 let a = _mm256_set1_epi16(8);
3439 let b = _mm256_set1_epi16(2);
3440 let r = _mm256_maskz_shrdi_epi16(0, a, b, 1);
3441 assert_eq_m256i(r, _mm256_setzero_si256());
3442 let r = _mm256_maskz_shrdi_epi16(0b11111111_11111111, a, b, 1);
3443 let e = _mm256_set1_epi16(1);
3444 assert_eq_m256i(r, e);
3445 }
3446
3447 #[simd_test(enable = "avx512vbmi2,avx512vl")]
3448 unsafe fn test_mm_shrdi_epi16() {
3449 let a = _mm_set1_epi16(8);
3450 let b = _mm_set1_epi16(2);
3451 let r = _mm_shrdi_epi16(a, b, 1);
3452 let e = _mm_set1_epi16(1);
3453 assert_eq_m128i(r, e);
3454 }
3455
3456 #[simd_test(enable = "avx512vbmi2,avx512vl")]
3457 unsafe fn test_mm_mask_shrdi_epi16() {
3458 let a = _mm_set1_epi16(8);
3459 let b = _mm_set1_epi16(2);
3460 let r = _mm_mask_shrdi_epi16(a, 0, a, b, 1);
3461 assert_eq_m128i(r, a);
3462 let r = _mm_mask_shrdi_epi16(a, 0b11111111, a, b, 1);
3463 let e = _mm_set1_epi16(1);
3464 assert_eq_m128i(r, e);
3465 }
3466
3467 #[simd_test(enable = "avx512vbmi2,avx512vl")]
3468 unsafe fn test_mm_maskz_shrdi_epi16() {
3469 let a = _mm_set1_epi16(8);
3470 let b = _mm_set1_epi16(2);
3471 let r = _mm_maskz_shrdi_epi16(0, a, b, 1);
3472 assert_eq_m128i(r, _mm_setzero_si128());
3473 let r = _mm_maskz_shrdi_epi16(0b11111111, a, b, 1);
3474 let e = _mm_set1_epi16(1);
3475 assert_eq_m128i(r, e);
3476 }
3477 }