use crate::{
    core_arch::{simd::*, simd_llvm::*, x86::*},
    mem::{self, transmute},
    ptr,
};

#[cfg(test)]
use stdarch_test::assert_instr;

/// Computes the absolute values of packed 32-bit integers in `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#avx512techs=AVX512F&expand=33,34,4990,33&text=_mm512_abs_epi32)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpabsd))]
pub unsafe fn _mm512_abs_epi32(a: __m512i) -> __m512i {
    let a = a.as_i32x16();
    // all-0 is a properly initialized i32x16
    let zero: i32x16 = mem::zeroed();
    let sub = simd_sub(zero, a);
    let cmp: i32x16 = simd_gt(a, zero);
    transmute(simd_select(cmp, a, sub))
}

/// Computes the absolute value of packed 32-bit integers in `a`, and stores the
/// unsigned results in `dst` using writemask `k` (elements are copied from
/// `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#avx512techs=AVX512F&expand=33,34,4990,33&text=_mm512_abs_epi32)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpabsd))]
pub unsafe fn _mm512_mask_abs_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i {
    let abs = _mm512_abs_epi32(a).as_i32x16();
    transmute(simd_select_bitmask(k, abs, src.as_i32x16()))
}

/// Computes the absolute value of packed 32-bit integers in `a`, and stores the
/// unsigned results in `dst` using zeromask `k` (elements are zeroed out when
/// the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#avx512techs=AVX512F&expand=33,34,4990,33,34,35,35&text=_mm512_maskz_abs_epi32)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpabsd))]
pub unsafe fn _mm512_maskz_abs_epi32(k: __mmask16, a: __m512i) -> __m512i {
    let abs = _mm512_abs_epi32(a).as_i32x16();
    let zero = _mm512_setzero_si512().as_i32x16();
    transmute(simd_select_bitmask(k, abs, zero))
}
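// Illustrative sketch, not part of the upstream file: a minimal `#[cfg(test)]` helper
// (the helper name is ours) showing the writemask vs. zeromask behaviour of the two
// masked `abs` variants above. It is only valid to call on a CPU with AVX512F support.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_mask_vs_maskz_abs_epi32() {
    let a = _mm512_set1_epi32(-5);
    let src = _mm512_set1_epi32(7);
    // Low 8 mask bits set: those lanes get |a| = 5; the remaining lanes keep `src`
    // (writemask) or become 0 (zeromask).
    let mask: __mmask16 = 0b0000_0000_1111_1111;
    let masked: [i32; 16] = transmute(_mm512_mask_abs_epi32(src, mask, a));
    let zeroed: [i32; 16] = transmute(_mm512_maskz_abs_epi32(mask, a));
    assert_eq!(masked[..8], [5i32; 8]);
    assert_eq!(masked[8..], [7i32; 8]);
    assert_eq!(zeroed[..8], [5i32; 8]);
    assert_eq!(zeroed[8..], [0i32; 8]);
}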

/// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_abs_epi64&expand=48)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpabsq))]
pub unsafe fn _mm512_abs_epi64(a: __m512i) -> __m512i {
    let a = a.as_i64x8();
    // all-0 is a properly initialized i64x8
    let zero: i64x8 = mem::zeroed();
    let sub = simd_sub(zero, a);
    let cmp: i64x8 = simd_gt(a, zero);
    transmute(simd_select(cmp, a, sub))
}

/// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_abs_epi64&expand=49)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpabsq))]
pub unsafe fn _mm512_mask_abs_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i {
    let abs = _mm512_abs_epi64(a).as_i64x8();
    transmute(simd_select_bitmask(k, abs, src.as_i64x8()))
}

/// Compute the absolute value of packed signed 64-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_abs_epi64&expand=50)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpabsq))]
pub unsafe fn _mm512_maskz_abs_epi64(k: __mmask8, a: __m512i) -> __m512i {
    let abs = _mm512_abs_epi64(a).as_i64x8();
    let zero = _mm512_setzero_si512().as_i64x8();
    transmute(simd_select_bitmask(k, abs, zero))
}

/// Finds the absolute value of each packed single-precision (32-bit) floating-point element in v2, storing the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_abs_ps&expand=65)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpandq))]
pub unsafe fn _mm512_abs_ps(v2: __m512) -> __m512 {
    let a = _mm512_set1_epi32(0x7FFFFFFF); // from LLVM code
    let b = transmute::<f32x16, __m512i>(v2.as_f32x16());
    let abs = _mm512_and_epi32(a, b);
    transmute(abs)
}
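// Illustrative sketch, not part of the upstream file: `_mm512_abs_ps` clears the sign bit
// of every lane by AND-ing with 0x7FFF_FFFF, which matches `f32::abs` for every bit
// pattern (only the sign bit changes; exponent and mantissa are preserved). This scalar
// model of the bit trick needs no AVX512F support.
#[cfg(test)]
#[allow(dead_code)]
fn example_abs_ps_bit_trick() {
    let x = -3.5f32;
    let cleared = f32::from_bits(x.to_bits() & 0x7FFF_FFFF);
    assert_eq!(cleared, 3.5f32);
    assert_eq!(cleared, x.abs());
}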

/// Finds the absolute value of each packed single-precision (32-bit) floating-point element in v2, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_abs_ps&expand=66)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpandd))]
pub unsafe fn _mm512_mask_abs_ps(src: __m512, k: __mmask16, v2: __m512) -> __m512 {
    let abs = _mm512_abs_ps(v2).as_f32x16();
    transmute(simd_select_bitmask(k, abs, src.as_f32x16()))
}

/// Finds the absolute value of each packed double-precision (64-bit) floating-point element in v2, storing the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_abs_pd&expand=60)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpandq))]
pub unsafe fn _mm512_abs_pd(v2: __m512d) -> __m512d {
    let a = _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF); // from LLVM code
    let b = transmute::<f64x8, __m512i>(v2.as_f64x8());
    let abs = _mm512_and_epi64(a, b);
    transmute(abs)
}

/// Finds the absolute value of each packed double-precision (64-bit) floating-point element in v2, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_abs_pd&expand=61)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpandq))]
pub unsafe fn _mm512_mask_abs_pd(src: __m512d, k: __mmask8, v2: __m512d) -> __m512d {
    let abs = _mm512_abs_pd(v2).as_f64x8();
    transmute(simd_select_bitmask(k, abs, src.as_f64x8()))
}

/// Move packed 32-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_mov_epi32&expand=3801)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovdqa32))]
pub unsafe fn _mm512_mask_mov_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i {
    let mov = a.as_i32x16();
    transmute(simd_select_bitmask(k, mov, src.as_i32x16()))
}

/// Move packed 32-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_mov_epi32&expand=3802)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovdqa32))]
pub unsafe fn _mm512_maskz_mov_epi32(k: __mmask16, a: __m512i) -> __m512i {
    let mov = a.as_i32x16();
    let zero = _mm512_setzero_si512().as_i32x16();
    transmute(simd_select_bitmask(k, mov, zero))
}

/// Move packed 64-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_mov_epi64&expand=3807)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovdqa64))]
pub unsafe fn _mm512_mask_mov_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i {
    let mov = a.as_i64x8();
    transmute(simd_select_bitmask(k, mov, src.as_i64x8()))
}

/// Move packed 64-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_mov_epi64&expand=3808)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovdqa64))]
pub unsafe fn _mm512_maskz_mov_epi64(k: __mmask8, a: __m512i) -> __m512i {
    let mov = a.as_i64x8();
    let zero = _mm512_setzero_si512().as_i64x8();
    transmute(simd_select_bitmask(k, mov, zero))
}

/// Move packed single-precision (32-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_mov_ps&expand=3825)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovaps))]
pub unsafe fn _mm512_mask_mov_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 {
    let mov = a.as_f32x16();
    transmute(simd_select_bitmask(k, mov, src.as_f32x16()))
}

/// Move packed single-precision (32-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_mov_ps&expand=3826)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovaps))]
pub unsafe fn _mm512_maskz_mov_ps(k: __mmask16, a: __m512) -> __m512 {
    let mov = a.as_f32x16();
    let zero = _mm512_setzero_ps().as_f32x16();
    transmute(simd_select_bitmask(k, mov, zero))
}

/// Move packed double-precision (64-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_mov_pd&expand=3819)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovapd))]
pub unsafe fn _mm512_mask_mov_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d {
    let mov = a.as_f64x8();
    transmute(simd_select_bitmask(k, mov, src.as_f64x8()))
}

/// Move packed double-precision (64-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_mov_pd&expand=3820)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovapd))]
pub unsafe fn _mm512_maskz_mov_pd(k: __mmask8, a: __m512d) -> __m512d {
    let mov = a.as_f64x8();
    let zero = _mm512_setzero_pd().as_f64x8();
    transmute(simd_select_bitmask(k, mov, zero))
}
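// Illustrative sketch, not part of the upstream file: a masked "mov" is effectively a
// per-lane blend, so `_mm512_mask_mov_epi32(src, k, a)` picks `a[i]` where bit i of `k`
// is set and `src[i]` otherwise. Helper name is ours; AVX512F-only.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_mask_mov_as_blend() {
    let a = _mm512_set1_epi32(1);
    let src = _mm512_set1_epi32(2);
    // Alternate lanes: even lanes take `a`, odd lanes keep `src`.
    let r: [i32; 16] = transmute(_mm512_mask_mov_epi32(src, 0b0101_0101_0101_0101, a));
    for (i, lane) in r.iter().enumerate() {
        assert_eq!(*lane, if i % 2 == 0 { 1 } else { 2 });
    }
}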

/// Add packed 32-bit integers in a and b, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_add_epi32&expand=100)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpaddd))]
pub unsafe fn _mm512_add_epi32(a: __m512i, b: __m512i) -> __m512i {
    transmute(simd_add(a.as_i32x16(), b.as_i32x16()))
}

/// Add packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_add_epi32&expand=101)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpaddd))]
pub unsafe fn _mm512_mask_add_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
    let add = _mm512_add_epi32(a, b).as_i32x16();
    transmute(simd_select_bitmask(k, add, src.as_i32x16()))
}

/// Add packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_add_epi32&expand=102)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpaddd))]
pub unsafe fn _mm512_maskz_add_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
    let add = _mm512_add_epi32(a, b).as_i32x16();
    let zero = _mm512_setzero_si512().as_i32x16();
    transmute(simd_select_bitmask(k, add, zero))
}
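// Illustrative sketch, not part of the upstream file: packed integer addition wraps on
// overflow (the vector analogue of `i32::wrapping_add`); it neither saturates nor panics.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_add_epi32_wraps() {
    let a = _mm512_set1_epi32(i32::MAX);
    let b = _mm512_set1_epi32(1);
    let r: [i32; 16] = transmute(_mm512_add_epi32(a, b));
    assert_eq!(r, [i32::MIN; 16]);
}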

/// Add packed 64-bit integers in a and b, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_add_epi64&expand=109)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpaddq))]
pub unsafe fn _mm512_add_epi64(a: __m512i, b: __m512i) -> __m512i {
    transmute(simd_add(a.as_i64x8(), b.as_i64x8()))
}

/// Add packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_add_epi64&expand=110)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpaddq))]
pub unsafe fn _mm512_mask_add_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
    let add = _mm512_add_epi64(a, b).as_i64x8();
    transmute(simd_select_bitmask(k, add, src.as_i64x8()))
}

/// Add packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_add_epi64&expand=111)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpaddq))]
pub unsafe fn _mm512_maskz_add_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
    let add = _mm512_add_epi64(a, b).as_i64x8();
    let zero = _mm512_setzero_si512().as_i64x8();
    transmute(simd_select_bitmask(k, add, zero))
}

/// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_add_ps&expand=139)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vaddps))]
pub unsafe fn _mm512_add_ps(a: __m512, b: __m512) -> __m512 {
    transmute(simd_add(a.as_f32x16(), b.as_f32x16()))
}

/// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_add_ps&expand=140)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vaddps))]
pub unsafe fn _mm512_mask_add_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 {
    let add = _mm512_add_ps(a, b).as_f32x16();
    transmute(simd_select_bitmask(k, add, src.as_f32x16()))
}

/// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_add_ps&expand=141)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vaddps))]
pub unsafe fn _mm512_maskz_add_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 {
    let add = _mm512_add_ps(a, b).as_f32x16();
    let zero = _mm512_setzero_ps().as_f32x16();
    transmute(simd_select_bitmask(k, add, zero))
}

/// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_add_pd&expand=127)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vaddpd))]
pub unsafe fn _mm512_add_pd(a: __m512d, b: __m512d) -> __m512d {
    transmute(simd_add(a.as_f64x8(), b.as_f64x8()))
}

/// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_add_pd&expand=128)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vaddpd))]
pub unsafe fn _mm512_mask_add_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
    let add = _mm512_add_pd(a, b).as_f64x8();
    transmute(simd_select_bitmask(k, add, src.as_f64x8()))
}

/// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_add_pd&expand=129)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vaddpd))]
pub unsafe fn _mm512_maskz_add_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
    let add = _mm512_add_pd(a, b).as_f64x8();
    let zero = _mm512_setzero_pd().as_f64x8();
    transmute(simd_select_bitmask(k, add, zero))
}

/// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_sub_epi32&expand=5694)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsubd))]
pub unsafe fn _mm512_sub_epi32(a: __m512i, b: __m512i) -> __m512i {
    transmute(simd_sub(a.as_i32x16(), b.as_i32x16()))
}

/// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_sub_epi32&expand=5692)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsubd))]
pub unsafe fn _mm512_mask_sub_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
    let sub = _mm512_sub_epi32(a, b).as_i32x16();
    transmute(simd_select_bitmask(k, sub, src.as_i32x16()))
}

/// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_sub_epi32&expand=5693)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsubd))]
pub unsafe fn _mm512_maskz_sub_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
    let sub = _mm512_sub_epi32(a, b).as_i32x16();
    let zero = _mm512_setzero_si512().as_i32x16();
    transmute(simd_select_bitmask(k, sub, zero))
}

/// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_sub_epi64&expand=5703)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsubq))]
pub unsafe fn _mm512_sub_epi64(a: __m512i, b: __m512i) -> __m512i {
    transmute(simd_sub(a.as_i64x8(), b.as_i64x8()))
}

/// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_sub_epi64&expand=5701)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsubq))]
pub unsafe fn _mm512_mask_sub_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
    let sub = _mm512_sub_epi64(a, b).as_i64x8();
    transmute(simd_select_bitmask(k, sub, src.as_i64x8()))
}

/// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_sub_epi64&expand=5702)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsubq))]
pub unsafe fn _mm512_maskz_sub_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
    let sub = _mm512_sub_epi64(a, b).as_i64x8();
    let zero = _mm512_setzero_si512().as_i64x8();
    transmute(simd_select_bitmask(k, sub, zero))
}

/// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_sub_ps&expand=5733)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsubps))]
pub unsafe fn _mm512_sub_ps(a: __m512, b: __m512) -> __m512 {
    transmute(simd_sub(a.as_f32x16(), b.as_f32x16()))
}

/// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_sub_ps&expand=5731)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsubps))]
pub unsafe fn _mm512_mask_sub_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 {
    let sub = _mm512_sub_ps(a, b).as_f32x16();
    transmute(simd_select_bitmask(k, sub, src.as_f32x16()))
}

/// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_sub_ps&expand=5732)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsubps))]
pub unsafe fn _mm512_maskz_sub_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 {
    let sub = _mm512_sub_ps(a, b).as_f32x16();
    let zero = _mm512_setzero_ps().as_f32x16();
    transmute(simd_select_bitmask(k, sub, zero))
}

/// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_sub_pd&expand=5721)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsubpd))]
pub unsafe fn _mm512_sub_pd(a: __m512d, b: __m512d) -> __m512d {
    transmute(simd_sub(a.as_f64x8(), b.as_f64x8()))
}

/// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_sub_pd&expand=5719)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsubpd))]
pub unsafe fn _mm512_mask_sub_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
    let sub = _mm512_sub_pd(a, b).as_f64x8();
    transmute(simd_select_bitmask(k, sub, src.as_f64x8()))
}

/// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_sub_pd&expand=5720)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsubpd))]
pub unsafe fn _mm512_maskz_sub_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
    let sub = _mm512_sub_pd(a, b).as_f64x8();
    let zero = _mm512_setzero_pd().as_f64x8();
    transmute(simd_select_bitmask(k, sub, zero))
}

/// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mul_epi32&expand=3907)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmuldq))]
pub unsafe fn _mm512_mul_epi32(a: __m512i, b: __m512i) -> __m512i {
    transmute(vpmuldq(a.as_i32x16(), b.as_i32x16()))
}

/// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_mul_epi32&expand=3905)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmuldq))]
pub unsafe fn _mm512_mask_mul_epi32(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
    let mul = _mm512_mul_epi32(a, b).as_i64x8();
    transmute(simd_select_bitmask(k, mul, src.as_i64x8()))
}

/// Multiply the low signed 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_mul_epi32&expand=3906)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmuldq))]
pub unsafe fn _mm512_maskz_mul_epi32(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
    let mul = _mm512_mul_epi32(a, b).as_i64x8();
    let zero = _mm512_setzero_si512().as_i64x8();
    transmute(simd_select_bitmask(k, mul, zero))
}
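// Illustrative sketch, not part of the upstream file: `_mm512_mul_epi32` reads only the
// low, signed 32 bits of each 64-bit lane of `a` and `b`, sign-extends them, and produces
// eight full 64-bit products (the high 32 bits of the inputs are ignored).
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_mul_epi32_uses_low_halves() {
    // The high half of each lane of `a` is ignored; its low half is 3.
    let a = _mm512_set1_epi64(0xDEAD_BEEF_0000_0003u64 as i64);
    let b = _mm512_set1_epi64(-2); // low 32 bits are 0xFFFF_FFFE, i.e. -2 as a signed value
    let r: [i64; 8] = transmute(_mm512_mul_epi32(a, b));
    assert_eq!(r, [-6i64; 8]);
}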

/// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mullo_epi&expand=4005)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmulld))]
pub unsafe fn _mm512_mullo_epi32(a: __m512i, b: __m512i) -> __m512i {
    transmute(simd_mul(a.as_i32x16(), b.as_i32x16()))
}

/// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_mullo_epi32&expand=4003)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmulld))]
pub unsafe fn _mm512_mask_mullo_epi32(
    src: __m512i,
    k: __mmask16,
    a: __m512i,
    b: __m512i,
) -> __m512i {
    let mul = _mm512_mullo_epi32(a, b).as_i32x16();
    transmute(simd_select_bitmask(k, mul, src.as_i32x16()))
}

/// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_mullo_epi32&expand=4004)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmulld))]
pub unsafe fn _mm512_maskz_mullo_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
    let mul = _mm512_mullo_epi32(a, b).as_i32x16();
    let zero = _mm512_setzero_si512().as_i32x16();
    transmute(simd_select_bitmask(k, mul, zero))
}
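// Illustrative sketch, not part of the upstream file: `_mm512_mullo_epi32` keeps only the
// low 32 bits of each product, so per lane it behaves like `i32::wrapping_mul`.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_mullo_epi32_truncates() {
    let a = _mm512_set1_epi32(0x1_0000); // 65536
    let b = _mm512_set1_epi32(0x1_0000); // 65536 * 65536 = 2^32, whose low 32 bits are 0
    let r: [i32; 16] = transmute(_mm512_mullo_epi32(a, b));
    assert_eq!(r, [0i32; 16]);
    assert_eq!(65536i32.wrapping_mul(65536), 0);
}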

/// Multiplies elements in packed 64-bit integer vectors a and b together, storing the lower 64 bits of the result in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mullox_epi64&expand=4017)
///
/// This intrinsic generates a sequence of instructions, which may perform worse than a native instruction. Consider the performance impact of this intrinsic.
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mullox_epi64(a: __m512i, b: __m512i) -> __m512i {
    transmute(simd_mul(a.as_i64x8(), b.as_i64x8()))
}

/// Multiplies elements in packed 64-bit integer vectors a and b together, storing the lower 64 bits of the result in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_mullox&expand=4016)
///
/// This intrinsic generates a sequence of instructions, which may perform worse than a native instruction. Consider the performance impact of this intrinsic.
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mask_mullox_epi64(
    src: __m512i,
    k: __mmask8,
    a: __m512i,
    b: __m512i,
) -> __m512i {
    let mul = _mm512_mullox_epi64(a, b).as_i64x8();
    transmute(simd_select_bitmask(k, mul, src.as_i64x8()))
}
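// Illustrative sketch, not part of the upstream file: `_mm512_mullox_epi64` is the full
// 64-bit low multiply. AVX512F itself has no single instruction for it (`vpmullq` belongs
// to AVX512DQ), which is why the doc comments above warn that a sequence of instructions
// is emitted.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_mullox_epi64() {
    let a = _mm512_set1_epi64(1 << 40);
    let b = _mm512_set1_epi64(4);
    let r: [i64; 8] = transmute(_mm512_mullox_epi64(a, b));
    assert_eq!(r, [1i64 << 42; 8]);
}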

/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mul_epu32&expand=3916)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmuludq))]
pub unsafe fn _mm512_mul_epu32(a: __m512i, b: __m512i) -> __m512i {
    transmute(vpmuludq(a.as_u32x16(), b.as_u32x16()))
}

/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_mul_epu32&expand=3914)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmuludq))]
pub unsafe fn _mm512_mask_mul_epu32(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
    let mul = _mm512_mul_epu32(a, b).as_u64x8();
    transmute(simd_select_bitmask(k, mul, src.as_u64x8()))
}

/// Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_mul_epu32&expand=3915)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmuludq))]
pub unsafe fn _mm512_maskz_mul_epu32(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
    let mul = _mm512_mul_epu32(a, b).as_u64x8();
    let zero = _mm512_setzero_si512().as_u64x8();
    transmute(simd_select_bitmask(k, mul, zero))
}

/// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm512_mul_ps&expand=3934)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmulps))]
pub unsafe fn _mm512_mul_ps(a: __m512, b: __m512) -> __m512 {
    transmute(simd_mul(a.as_f32x16(), b.as_f32x16()))
}

/// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). RM.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_mul_ps&expand=3932)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmulps))]
pub unsafe fn _mm512_mask_mul_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 {
    let mul = _mm512_mul_ps(a, b).as_f32x16();
    transmute(simd_select_bitmask(k, mul, src.as_f32x16()))
}

/// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_mul_ps&expand=3933)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmulps))]
pub unsafe fn _mm512_maskz_mul_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 {
    let mul = _mm512_mul_ps(a, b).as_f32x16();
    let zero = _mm512_setzero_ps().as_f32x16();
    transmute(simd_select_bitmask(k, mul, zero))
}

/// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mul_pd&expand=3925)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmulpd))]
pub unsafe fn _mm512_mul_pd(a: __m512d, b: __m512d) -> __m512d {
    transmute(simd_mul(a.as_f64x8(), b.as_f64x8()))
}

/// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). RM.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_mul_pd&expand=3923)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmulpd))]
pub unsafe fn _mm512_mask_mul_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
    let mul = _mm512_mul_pd(a, b).as_f64x8();
    transmute(simd_select_bitmask(k, mul, src.as_f64x8()))
}

/// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_mul_pd&expand=3924)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmulpd))]
pub unsafe fn _mm512_maskz_mul_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
    let mul = _mm512_mul_pd(a, b).as_f64x8();
    let zero = _mm512_setzero_pd().as_f64x8();
    transmute(simd_select_bitmask(k, mul, zero))
}

/// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_div_ps&expand=2162)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vdivps))]
pub unsafe fn _mm512_div_ps(a: __m512, b: __m512) -> __m512 {
    transmute(simd_div(a.as_f32x16(), b.as_f32x16()))
}

/// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_div_ps&expand=2163)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vdivps))]
pub unsafe fn _mm512_mask_div_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 {
    let div = _mm512_div_ps(a, b).as_f32x16();
    transmute(simd_select_bitmask(k, div, src.as_f32x16()))
}

/// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_div_ps&expand=2164)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vdivps))]
pub unsafe fn _mm512_maskz_div_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 {
    let div = _mm512_div_ps(a, b).as_f32x16();
    let zero = _mm512_setzero_ps().as_f32x16();
    transmute(simd_select_bitmask(k, div, zero))
}
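// Illustrative sketch, not part of the upstream file: packed float division follows
// IEEE-754 rules per lane, so dividing a non-zero value by 0.0 yields infinity rather
// than trapping or panicking. Vectors are built by transmuting plain arrays to stay
// within what this section already provides.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_div_ps_ieee() {
    let a: __m512 = transmute([1.0f32; 16]);
    let b: __m512 = transmute([0.0f32; 16]);
    let r: [f32; 16] = transmute(_mm512_div_ps(a, b));
    assert!(r.iter().all(|x| *x == f32::INFINITY));
}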

/// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_div_pd&expand=2153)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vdivpd))]
pub unsafe fn _mm512_div_pd(a: __m512d, b: __m512d) -> __m512d {
    transmute(simd_div(a.as_f64x8(), b.as_f64x8()))
}

/// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_div_pd&expand=2154)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vdivpd))]
pub unsafe fn _mm512_mask_div_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
    let div = _mm512_div_pd(a, b).as_f64x8();
    transmute(simd_select_bitmask(k, div, src.as_f64x8()))
}

/// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_div_pd&expand=2155)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vdivpd))]
pub unsafe fn _mm512_maskz_div_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
    let div = _mm512_div_pd(a, b).as_f64x8();
    let zero = _mm512_setzero_pd().as_f64x8();
    transmute(simd_select_bitmask(k, div, zero))
}

/// Compare packed signed 32-bit integers in a and b, and store packed maximum values in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_max_epi32&expand=3582)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmaxsd))]
pub unsafe fn _mm512_max_epi32(a: __m512i, b: __m512i) -> __m512i {
    transmute(vpmaxsd(a.as_i32x16(), b.as_i32x16()))
}

/// Compare packed signed 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_max_epi32&expand=3580)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmaxsd))]
pub unsafe fn _mm512_mask_max_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
    let max = _mm512_max_epi32(a, b).as_i32x16();
    transmute(simd_select_bitmask(k, max, src.as_i32x16()))
}

/// Compare packed signed 32-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_max_epi32&expand=3581)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmaxsd))]
pub unsafe fn _mm512_maskz_max_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
    let max = _mm512_max_epi32(a, b).as_i32x16();
    let zero = _mm512_setzero_si512().as_i32x16();
    transmute(simd_select_bitmask(k, max, zero))
}

/// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_max_epi64&expand=3591)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmaxsq))]
pub unsafe fn _mm512_max_epi64(a: __m512i, b: __m512i) -> __m512i {
    transmute(vpmaxsq(a.as_i64x8(), b.as_i64x8()))
}

/// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_max_epi64&expand=3589)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmaxsq))]
pub unsafe fn _mm512_mask_max_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
    let max = _mm512_max_epi64(a, b).as_i64x8();
    transmute(simd_select_bitmask(k, max, src.as_i64x8()))
}

/// Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_max_epi64&expand=3590)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmaxsq))]
pub unsafe fn _mm512_maskz_max_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
    let max = _mm512_max_epi64(a, b).as_i64x8();
    let zero = _mm512_setzero_si512().as_i64x8();
    transmute(simd_select_bitmask(k, max, zero))
}

/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_max_ps&expand=3655)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmaxps))]
pub unsafe fn _mm512_max_ps(a: __m512, b: __m512) -> __m512 {
    transmute(vmaxps(
        a.as_f32x16(),
        b.as_f32x16(),
        _MM_FROUND_CUR_DIRECTION,
    ))
}

/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_max_ps&expand=3653)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmaxps))]
pub unsafe fn _mm512_mask_max_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 {
    let max = _mm512_max_ps(a, b).as_f32x16();
    transmute(simd_select_bitmask(k, max, src.as_f32x16()))
}

/// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_max_ps&expand=3654)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmaxps))]
pub unsafe fn _mm512_maskz_max_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 {
    let max = _mm512_max_ps(a, b).as_f32x16();
    let zero = _mm512_setzero_ps().as_f32x16();
    transmute(simd_select_bitmask(k, max, zero))
}

/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_max_pd&expand=3645)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmaxpd))]
pub unsafe fn _mm512_max_pd(a: __m512d, b: __m512d) -> __m512d {
    transmute(vmaxpd(a.as_f64x8(), b.as_f64x8(), _MM_FROUND_CUR_DIRECTION))
}

/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_max_pd&expand=3643)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmaxpd))]
pub unsafe fn _mm512_mask_max_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
    let max = _mm512_max_pd(a, b).as_f64x8();
    transmute(simd_select_bitmask(k, max, src.as_f64x8()))
}

/// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_max_pd&expand=3644)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmaxpd))]
pub unsafe fn _mm512_maskz_max_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
    let max = _mm512_max_pd(a, b).as_f64x8();
    let zero = _mm512_setzero_pd().as_f64x8();
    transmute(simd_select_bitmask(k, max, zero))
}

/// Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_max_epu32&expand=3618)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmaxud))]
pub unsafe fn _mm512_max_epu32(a: __m512i, b: __m512i) -> __m512i {
    transmute(vpmaxud(a.as_u32x16(), b.as_u32x16()))
}

/// Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_max_epu32&expand=3616)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmaxud))]
pub unsafe fn _mm512_mask_max_epu32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
    let max = _mm512_max_epu32(a, b).as_u32x16();
    transmute(simd_select_bitmask(k, max, src.as_u32x16()))
}

/// Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_max_epu32&expand=3617)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmaxud))]
pub unsafe fn _mm512_maskz_max_epu32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
    let max = _mm512_max_epu32(a, b).as_u32x16();
    let zero = _mm512_setzero_si512().as_u32x16();
    transmute(simd_select_bitmask(k, max, zero))
}
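// Illustrative sketch, not part of the upstream file: the signed and unsigned maxima
// differ exactly when the top bit is involved. -1i32 and u32::MAX share the same bit
// pattern, so the signed maximum of (-1, 0) is 0 while the unsigned maximum is
// 0xFFFF_FFFF.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_signed_vs_unsigned_max() {
    let a = _mm512_set1_epi32(-1); // bit pattern 0xFFFF_FFFF
    let b = _mm512_set1_epi32(0);
    let signed: [i32; 16] = transmute(_mm512_max_epi32(a, b));
    let unsigned: [u32; 16] = transmute(_mm512_max_epu32(a, b));
    assert_eq!(signed, [0i32; 16]);
    assert_eq!(unsigned, [u32::MAX; 16]);
}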
926
927 /// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst.
928 ///
929 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=max_epu64&expand=3627)
930 #[inline]
931 #[target_feature(enable = "avx512f")]
932 #[cfg_attr(test, assert_instr(vpmaxuq))]
933 pub unsafe fn _mm512_max_epu64(a: __m512i, b: __m512i) -> __m512i {
934 transmute(vpmaxuq(a.as_u64x8(), b.as_u64x8()))
935 }
936
937 /// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
938 ///
939 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_max_epu64&expand=3625)
940 #[inline]
941 #[target_feature(enable = "avx512f")]
942 #[cfg_attr(test, assert_instr(vpmaxuq))]
943 pub unsafe fn _mm512_mask_max_epu64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
944 let max = _mm512_max_epu64(a, b).as_u64x8();
945 transmute(simd_select_bitmask(k, max, src.as_u64x8()))
946 }
947
948 /// Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
949 ///
950 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_max_epu&expand=3626)
951 #[inline]
952 #[target_feature(enable = "avx512f")]
953 #[cfg_attr(test, assert_instr(vpmaxuq))]
954 pub unsafe fn _mm512_maskz_max_epu64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
955 let max = _mm512_max_epu64(a, b).as_u64x8();
956 let zero = _mm512_setzero_si512().as_u64x8();
957 transmute(simd_select_bitmask(k, max, zero))
958 }
959
960 /// Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst.
961 ///
962 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_min_epi32&expand=3696)
963 #[inline]
964 #[target_feature(enable = "avx512f")]
965 #[cfg_attr(test, assert_instr(vpminsd))]
966 pub unsafe fn _mm512_min_epi32(a: __m512i, b: __m512i) -> __m512i {
967 transmute(vpminsd(a.as_i32x16(), b.as_i32x16()))
968 }
969
970 /// Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
971 ///
972 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_min_epi32&expand=3694)
973 #[inline]
974 #[target_feature(enable = "avx512f")]
975 #[cfg_attr(test, assert_instr(vpminsd))]
976 pub unsafe fn _mm512_mask_min_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
977 let max = _mm512_min_epi32(a, b).as_i32x16();
978 transmute(simd_select_bitmask(k, max, src.as_i32x16()))
979 }
980
981 /// Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
982 ///
983 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_min_epi32&expand=3695)
984 #[inline]
985 #[target_feature(enable = "avx512f")]
986 #[cfg_attr(test, assert_instr(vpminsd))]
987 pub unsafe fn _mm512_maskz_min_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
988 let max = _mm512_min_epi32(a, b).as_i32x16();
989 let zero = _mm512_setzero_si512().as_i32x16();
990 transmute(simd_select_bitmask(k, max, zero))
991 }
992
993 /// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst.
994 ///
995 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_min_epi64&expand=3705)
996 #[inline]
997 #[target_feature(enable = "avx512f")]
998 #[cfg_attr(test, assert_instr(vpminsq))]
999 pub unsafe fn _mm512_min_epi64(a: __m512i, b: __m512i) -> __m512i {
1000 transmute(vpminsq(a.as_i64x8(), b.as_i64x8()))
1001 }
1002
1003 /// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
1004 ///
1005 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_min_epi64&expand=3703)
1006 #[inline]
1007 #[target_feature(enable = "avx512f")]
1008 #[cfg_attr(test, assert_instr(vpminsq))]
1009 pub unsafe fn _mm512_mask_min_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
1010 let max = _mm512_min_epi64(a, b).as_i64x8();
1011 transmute(simd_select_bitmask(k, max, src.as_i64x8()))
1012 }
1013
1014 /// Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1015 ///
1016 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_min_epi64&expand=3704)
1017 #[inline]
1018 #[target_feature(enable = "avx512f")]
1019 #[cfg_attr(test, assert_instr(vpminsq))]
1020 pub unsafe fn _mm512_maskz_min_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
1021 let max = _mm512_min_epi64(a, b).as_i64x8();
1022 let zero = _mm512_setzero_si512().as_i64x8();
1023 transmute(simd_select_bitmask(k, max, zero))
1024 }
1025
1026 /// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst.
1027 ///
1028 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_min_ps&expand=3769)
1029 #[inline]
1030 #[target_feature(enable = "avx512f")]
1031 #[cfg_attr(test, assert_instr(vminps))]
1032 pub unsafe fn _mm512_min_ps(a: __m512, b: __m512) -> __m512 {
1033 transmute(vminps(
1034 a.as_f32x16(),
1035 b.as_f32x16(),
1036 _MM_FROUND_CUR_DIRECTION,
1037 ))
1038 }
1039
1040 /// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
1041 ///
1042 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_min_ps&expand=3767)
1043 #[inline]
1044 #[target_feature(enable = "avx512f")]
1045 #[cfg_attr(test, assert_instr(vminps))]
1046 pub unsafe fn _mm512_mask_min_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 {
1047 let max = _mm512_min_ps(a, b).as_f32x16();
1048 transmute(simd_select_bitmask(k, max, src.as_f32x16()))
1049 }
1050
1051 /// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1052 ///
1053 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_min_ps&expand=3768)
1054 #[inline]
1055 #[target_feature(enable = "avx512f")]
1056 #[cfg_attr(test, assert_instr(vminps))]
1057 pub unsafe fn _mm512_maskz_min_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 {
1058 let min = _mm512_min_ps(a, b).as_f32x16();
1059 let zero = _mm512_setzero_ps().as_f32x16();
1060 transmute(simd_select_bitmask(k, min, zero))
1061 }
1062
1063 /// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst.
1064 ///
1065 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_min_pd&expand=3759)
1066 #[inline]
1067 #[target_feature(enable = "avx512f")]
1068 #[cfg_attr(test, assert_instr(vminpd))]
1069 pub unsafe fn _mm512_min_pd(a: __m512d, b: __m512d) -> __m512d {
1070 transmute(vminpd(a.as_f64x8(), b.as_f64x8(), _MM_FROUND_CUR_DIRECTION))
1071 }
1072
1073 /// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
1074 ///
1075 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_min_pd&expand=3757)
1076 #[inline]
1077 #[target_feature(enable = "avx512f")]
1078 #[cfg_attr(test, assert_instr(vminpd))]
1079 pub unsafe fn _mm512_mask_min_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
1080 let min = _mm512_min_pd(a, b).as_f64x8();
1081 transmute(simd_select_bitmask(k, min, src.as_f64x8()))
1082 }
1083
1084 /// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1085 ///
1086 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_min_pd&expand=3758)
1087 #[inline]
1088 #[target_feature(enable = "avx512f")]
1089 #[cfg_attr(test, assert_instr(vminpd))]
1090 pub unsafe fn _mm512_maskz_min_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
1091 let min = _mm512_min_pd(a, b).as_f64x8();
1092 let zero = _mm512_setzero_pd().as_f64x8();
1093 transmute(simd_select_bitmask(k, min, zero))
1094 }
1095
1096 /// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst.
1097 ///
1098 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_min_epu32&expand=3732)
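///
/// A sketch of the unsigned comparison (illustrative only, assuming
/// `_mm512_set1_epi32` and an AVX512F-capable CPU):
///
/// ```ignore
/// unsafe {
///     // the bit pattern of -1 reads as u32::MAX in an unsigned lane
///     let a = _mm512_set1_epi32(-1);
///     let b = _mm512_set1_epi32(7);
///     // unsigned minimum: every lane of `r` holds 7
///     let r = _mm512_min_epu32(a, b);
/// }
/// ```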
1099 #[inline]
1100 #[target_feature(enable = "avx512f")]
1101 #[cfg_attr(test, assert_instr(vpminud))]
1102 pub unsafe fn _mm512_min_epu32(a: __m512i, b: __m512i) -> __m512i {
1103 transmute(vpminud(a.as_u32x16(), b.as_u32x16()))
1104 }
1105
1106 /// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
1107 ///
1108 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_min_epu32&expand=3730)
1109 #[inline]
1110 #[target_feature(enable = "avx512f")]
1111 #[cfg_attr(test, assert_instr(vpminud))]
1112 pub unsafe fn _mm512_mask_min_epu32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
1113 let min = _mm512_min_epu32(a, b).as_u32x16();
1114 transmute(simd_select_bitmask(k, min, src.as_u32x16()))
1115 }
1116
1117 /// Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1118 ///
1119 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_min_epu32&expand=3731)
1120 #[inline]
1121 #[target_feature(enable = "avx512f")]
1122 #[cfg_attr(test, assert_instr(vpminud))]
1123 pub unsafe fn _mm512_maskz_min_epu32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
1124 let min = _mm512_min_epu32(a, b).as_u32x16();
1125 let zero = _mm512_setzero_si512().as_u32x16();
1126 transmute(simd_select_bitmask(k, min, zero))
1127 }
1128
1129 /// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst.
1130 ///
1131 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_min_epu64&expand=3741)
1132 #[inline]
1133 #[target_feature(enable = "avx512f")]
1134 #[cfg_attr(test, assert_instr(vpminuq))]
1135 pub unsafe fn _mm512_min_epu64(a: __m512i, b: __m512i) -> __m512i {
1136 transmute(vpminuq(a.as_u64x8(), b.as_u64x8()))
1137 }
1138
1139 /// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
1140 ///
1141 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_min_epu64&expand=3739)
1142 #[inline]
1143 #[target_feature(enable = "avx512f")]
1144 #[cfg_attr(test, assert_instr(vpminuq))]
1145 pub unsafe fn _mm512_mask_min_epu64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
1146 let min = _mm512_min_epu64(a, b).as_u64x8();
1147 transmute(simd_select_bitmask(k, min, src.as_u64x8()))
1148 }
1149
1150 /// Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1151 ///
1152 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_min_epu64&expand=3740)
1153 #[inline]
1154 #[target_feature(enable = "avx512f")]
1155 #[cfg_attr(test, assert_instr(vpminuq))]
1156 pub unsafe fn _mm512_maskz_min_epu64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
1157 let min = _mm512_min_epu64(a, b).as_u64x8();
1158 let zero = _mm512_setzero_si512().as_u64x8();
1159 transmute(simd_select_bitmask(k, min, zero))
1160 }
1161
1162 /// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
1163 ///
1164 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_sqrt_ps&expand=5371)
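///
/// A minimal usage sketch (illustrative only, assuming `_mm512_set1_ps` and an
/// AVX512F-capable CPU):
///
/// ```ignore
/// unsafe {
///     let a = _mm512_set1_ps(9.0);
///     // every lane of `r` holds 3.0
///     let r = _mm512_sqrt_ps(a);
/// }
/// ```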
1165 #[inline]
1166 #[target_feature(enable = "avx512f")]
1167 #[cfg_attr(test, assert_instr(vsqrtps))]
1168 pub unsafe fn _mm512_sqrt_ps(a: __m512) -> __m512 {
1169 transmute(vsqrtps(a.as_f32x16(), _MM_FROUND_CUR_DIRECTION))
1170 }
1171
1172 /// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
1173 ///
1174 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_sqrt_ps&expand=5369)
1175 #[inline]
1176 #[target_feature(enable = "avx512f")]
1177 #[cfg_attr(test, assert_instr(vsqrtps))]
1178 pub unsafe fn _mm512_mask_sqrt_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 {
1179 let sqrt = _mm512_sqrt_ps(a).as_f32x16();
1180 transmute(simd_select_bitmask(k, sqrt, src.as_f32x16()))
1181 }
1182
1183 /// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1184 ///
1185 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_sqrt_ps&expand=5370)
1186 #[inline]
1187 #[target_feature(enable = "avx512f")]
1188 #[cfg_attr(test, assert_instr(vsqrtps))]
1189 pub unsafe fn _mm512_maskz_sqrt_ps(k: __mmask16, a: __m512) -> __m512 {
1190 let sqrt = _mm512_sqrt_ps(a).as_f32x16();
1191 let zero = _mm512_setzero_ps().as_f32x16();
1192 transmute(simd_select_bitmask(k, sqrt, zero))
1193 }
1194
1195 /// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
1196 ///
1197 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_sqrt_pd&expand=5362)
1198 #[inline]
1199 #[target_feature(enable = "avx512f")]
1200 #[cfg_attr(test, assert_instr(vsqrtpd))]
1201 pub unsafe fn _mm512_sqrt_pd(a: __m512d) -> __m512d {
1202 transmute(vsqrtpd(a.as_f64x8(), _MM_FROUND_CUR_DIRECTION))
1203 }
1204
1205 /// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
1206 ///
1207 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_sqrt_pd&expand=5360)
1208 #[inline]
1209 #[target_feature(enable = "avx512f")]
1210 #[cfg_attr(test, assert_instr(vsqrtpd))]
1211 pub unsafe fn _mm512_mask_sqrt_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d {
1212 let sqrt = _mm512_sqrt_pd(a).as_f64x8();
1213 transmute(simd_select_bitmask(k, sqrt, src.as_f64x8()))
1214 }
1215
1216 /// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1217 ///
1218 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_sqrt_pd&expand=5361)
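///
/// A sketch of the zeromask behaviour (illustrative only, assuming
/// `_mm512_set1_pd` and an AVX512F-capable CPU):
///
/// ```ignore
/// unsafe {
///     let a = _mm512_set1_pd(16.0);
///     // lanes 0..4 hold 4.0; lanes 4..8 are zeroed because their mask bits are clear
///     let r = _mm512_maskz_sqrt_pd(0b00001111, a);
/// }
/// ```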
1219 #[inline]
1220 #[target_feature(enable = "avx512f")]
1221 #[cfg_attr(test, assert_instr(vsqrtpd))]
1222 pub unsafe fn _mm512_maskz_sqrt_pd(k: __mmask8, a: __m512d) -> __m512d {
1223 let sqrt = _mm512_sqrt_pd(a).as_f64x8();
1224 let zero = _mm512_setzero_pd().as_f64x8();
1225 transmute(simd_select_bitmask(k, sqrt, zero))
1226 }
1227
1228 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst.
1229 ///
1230 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=fmadd_ps&expand=2557)
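///
/// A minimal usage sketch (illustrative only, assuming `_mm512_set1_ps` and an
/// AVX512F-capable CPU):
///
/// ```ignore
/// unsafe {
///     let a = _mm512_set1_ps(2.0);
///     let b = _mm512_set1_ps(3.0);
///     let c = _mm512_set1_ps(1.0);
///     // every lane of `r` holds 2.0 * 3.0 + 1.0 = 7.0, computed with a single rounding
///     let r = _mm512_fmadd_ps(a, b, c);
/// }
/// ```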
1231 #[inline]
1232 #[target_feature(enable = "avx512f")]
1233 #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps
1234 pub unsafe fn _mm512_fmadd_ps(a: __m512, b: __m512, c: __m512) -> __m512 {
1235 transmute(vfmadd132ps(
1236 a.as_f32x16(),
1237 b.as_f32x16(),
1238 c.as_f32x16(),
1239 _MM_FROUND_CUR_DIRECTION,
1240 ))
1241 }
1242
1243 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
1244 ///
1245 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_fmadd_ps&expand=2558)
1246 #[inline]
1247 #[target_feature(enable = "avx512f")]
1248 #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps
1249 pub unsafe fn _mm512_mask_fmadd_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 {
1250 let fmadd = _mm512_fmadd_ps(a, b, c).as_f32x16();
1251 transmute(simd_select_bitmask(k, fmadd, a.as_f32x16()))
1252 }
1253
1254 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1255 ///
1256 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_fmadd_ps&expand=2560)
1257 #[inline]
1258 #[target_feature(enable = "avx512f")]
1259 #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps
1260 pub unsafe fn _mm512_maskz_fmadd_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 {
1261 let fmadd = _mm512_fmadd_ps(a, b, c).as_f32x16();
1262 let zero = _mm512_setzero_ps().as_f32x16();
1263 transmute(simd_select_bitmask(k, fmadd, zero))
1264 }
1265
1266 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
1267 ///
1268 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask3_fmadd_ps&expand=2559)
1269 #[inline]
1270 #[target_feature(enable = "avx512f")]
1271 #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps
1272 pub unsafe fn _mm512_mask3_fmadd_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 {
1273 let fmadd = _mm512_fmadd_ps(a, b, c).as_f32x16();
1274 transmute(simd_select_bitmask(k, fmadd, c.as_f32x16()))
1275 }
1276
1277 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst.
1278 ///
1279 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_fmadd_pd&expand=2545)
1280 #[inline]
1281 #[target_feature(enable = "avx512f")]
1282 #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd
1283 pub unsafe fn _mm512_fmadd_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d {
1284 transmute(vfmadd132pd(
1285 a.as_f64x8(),
1286 b.as_f64x8(),
1287 c.as_f64x8(),
1288 _MM_FROUND_CUR_DIRECTION,
1289 ))
1290 }
1291
1292 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
1293 ///
1294 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_fmadd_pd&expand=2546)
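///
/// A sketch of the writemask behaviour (illustrative only, assuming
/// `_mm512_set1_pd` and an AVX512F-capable CPU):
///
/// ```ignore
/// unsafe {
///     let a = _mm512_set1_pd(2.0);
///     let b = _mm512_set1_pd(3.0);
///     let c = _mm512_set1_pd(1.0);
///     // lanes 0..4 hold 2.0 * 3.0 + 1.0 = 7.0; lanes 4..8 are copied from `a` (2.0)
///     let r = _mm512_mask_fmadd_pd(a, 0b00001111, b, c);
/// }
/// ```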
1295 #[inline]
1296 #[target_feature(enable = "avx512f")]
1297 #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd
1298 pub unsafe fn _mm512_mask_fmadd_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d {
1299 let fmadd = _mm512_fmadd_pd(a, b, c).as_f64x8();
1300 transmute(simd_select_bitmask(k, fmadd, a.as_f64x8()))
1301 }
1302
1303 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1304 ///
1305 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_fmadd_pd&expand=2548)
1306 #[inline]
1307 #[target_feature(enable = "avx512f")]
1308 #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd
1309 pub unsafe fn _mm512_maskz_fmadd_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d {
1310 let fmadd = _mm512_fmadd_pd(a, b, c).as_f64x8();
1311 let zero = _mm512_setzero_pd().as_f64x8();
1312 transmute(simd_select_bitmask(k, fmadd, zero))
1313 }
1314
1315 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
1316 ///
1317 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask3_fmadd_pd&expand=2547)
1318 #[inline]
1319 #[target_feature(enable = "avx512f")]
1320 #[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd
1321 pub unsafe fn _mm512_mask3_fmadd_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d {
1322 let fmadd = _mm512_fmadd_pd(a, b, c).as_f64x8();
1323 transmute(simd_select_bitmask(k, fmadd, c.as_f64x8()))
1324 }
1325
1326 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst.
1327 ///
1328 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_fmsub_ps&expand=2643)
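///
/// A minimal usage sketch (illustrative only, assuming `_mm512_set1_ps` and an
/// AVX512F-capable CPU):
///
/// ```ignore
/// unsafe {
///     let a = _mm512_set1_ps(2.0);
///     let b = _mm512_set1_ps(3.0);
///     let c = _mm512_set1_ps(1.0);
///     // every lane of `r` holds 2.0 * 3.0 - 1.0 = 5.0
///     let r = _mm512_fmsub_ps(a, b, c);
/// }
/// ```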
1329 #[inline]
1330 #[target_feature(enable = "avx512f")]
1331 #[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub
1332 pub unsafe fn _mm512_fmsub_ps(a: __m512, b: __m512, c: __m512) -> __m512 {
1333 let zero: f32x16 = mem::zeroed();
1334 let sub = simd_sub(zero, c.as_f32x16());
1335 transmute(vfmadd132ps(
1336 a.as_f32x16(),
1337 b.as_f32x16(),
1338 sub,
1339 _MM_FROUND_CUR_DIRECTION,
1340 ))
1341 }
1342
1343 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
1344 ///
1345 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_fmsub_ps&expand=2644)
1346 #[inline]
1347 #[target_feature(enable = "avx512f")]
1348 #[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub
1349 pub unsafe fn _mm512_mask_fmsub_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 {
1350 let fmsub = _mm512_fmsub_ps(a, b, c).as_f32x16();
1351 transmute(simd_select_bitmask(k, fmsub, a.as_f32x16()))
1352 }
1353
1354 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1355 ///
1356 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_fmsub_ps&expand=2646)
1357 #[inline]
1358 #[target_feature(enable = "avx512f")]
1359 #[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub
1360 pub unsafe fn _mm512_maskz_fmsub_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 {
1361 let fmsub = _mm512_fmsub_ps(a, b, c).as_f32x16();
1362 let zero = _mm512_setzero_ps().as_f32x16();
1363 transmute(simd_select_bitmask(k, fmsub, zero))
1364 }
1365
1366 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
1367 ///
1368 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask3_fmsub_ps&expand=2645)
1369 #[inline]
1370 #[target_feature(enable = "avx512f")]
1371 #[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub
1372 pub unsafe fn _mm512_mask3_fmsub_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 {
1373 let fmsub = _mm512_fmsub_ps(a, b, c).as_f32x16();
1374 transmute(simd_select_bitmask(k, fmsub, c.as_f32x16()))
1375 }
1376
1377 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst.
1378 ///
1379 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_fmsub_pd&expand=2631)
1380 #[inline]
1381 #[target_feature(enable = "avx512f")]
1382 #[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub
1383 pub unsafe fn _mm512_fmsub_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d {
1384 let zero: f64x8 = mem::zeroed();
1385 let sub = simd_sub(zero, c.as_f64x8());
1386 transmute(vfmadd132pd(
1387 a.as_f64x8(),
1388 b.as_f64x8(),
1389 sub,
1390 _MM_FROUND_CUR_DIRECTION,
1391 ))
1392 }
1393
1394 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
1395 ///
1396 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_fmsub_pd&expand=2632)
1397 #[inline]
1398 #[target_feature(enable = "avx512f")]
1399 #[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub
1400 pub unsafe fn _mm512_mask_fmsub_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d {
1401 let fmsub = _mm512_fmsub_pd(a, b, c).as_f64x8();
1402 transmute(simd_select_bitmask(k, fmsub, a.as_f64x8()))
1403 }
1404
1405 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1406 ///
1407 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_fmsub_pd&expand=2634)
1408 #[inline]
1409 #[target_feature(enable = "avx512f")]
1410 #[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub
1411 pub unsafe fn _mm512_maskz_fmsub_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d {
1412 let fmsub = _mm512_fmsub_pd(a, b, c).as_f64x8();
1413 let zero = _mm512_setzero_pd().as_f64x8();
1414 transmute(simd_select_bitmask(k, fmsub, zero))
1415 }
1416
1417 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
1418 ///
1419 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask3_fmsub_pd&expand=2633)
1420 #[inline]
1421 #[target_feature(enable = "avx512f")]
1422 #[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub
1423 pub unsafe fn _mm512_mask3_fmsub_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d {
1424 let fmsub = _mm512_fmsub_pd(a, b, c).as_f64x8();
1425 transmute(simd_select_bitmask(k, fmsub, c.as_f64x8()))
1426 }
1427
1428 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst.
1429 ///
1430 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_fmaddsub_ps&expand=2611)
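///
/// A sketch of the alternating add/subtract pattern (illustrative only, assuming
/// `_mm512_set1_ps` and an AVX512F-capable CPU):
///
/// ```ignore
/// unsafe {
///     let a = _mm512_set1_ps(2.0);
///     let b = _mm512_set1_ps(3.0);
///     let c = _mm512_set1_ps(1.0);
///     // even-indexed lanes hold 2.0 * 3.0 - 1.0 = 5.0,
///     // odd-indexed lanes hold 2.0 * 3.0 + 1.0 = 7.0
///     let r = _mm512_fmaddsub_ps(a, b, c);
/// }
/// ```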
1431 #[inline]
1432 #[target_feature(enable = "avx512f")]
1433 #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps
1434 pub unsafe fn _mm512_fmaddsub_ps(a: __m512, b: __m512, c: __m512) -> __m512 {
1435 transmute(vfmaddsub213ps(
1436 a.as_f32x16(),
1437 b.as_f32x16(),
1438 c.as_f32x16(),
1439 _MM_FROUND_CUR_DIRECTION,
1440 ))
1441 }
1442
1443 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
1444 ///
1445 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_fmaddsub_ps&expand=2612)
1446 #[inline]
1447 #[target_feature(enable = "avx512f")]
1448 #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps
1449 pub unsafe fn _mm512_mask_fmaddsub_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 {
1450 let fmaddsub = _mm512_fmaddsub_ps(a, b, c).as_f32x16();
1451 transmute(simd_select_bitmask(k, fmaddsub, a.as_f32x16()))
1452 }
1453
1454 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1455 ///
1456 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_fmaddsub_ps&expand=2614)
1457 #[inline]
1458 #[target_feature(enable = "avx512f")]
1459 #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps
1460 pub unsafe fn _mm512_maskz_fmaddsub_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 {
1461 let fmaddsub = _mm512_fmaddsub_ps(a, b, c).as_f32x16();
1462 let zero = _mm512_setzero_ps().as_f32x16();
1463 transmute(simd_select_bitmask(k, fmaddsub, zero))
1464 }
1465
1466 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
1467 ///
1468 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask3_fmaddsub_ps&expand=2613)
1469 #[inline]
1470 #[target_feature(enable = "avx512f")]
1471 #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps
1472 pub unsafe fn _mm512_mask3_fmaddsub_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 {
1473 let fmaddsub = _mm512_fmaddsub_ps(a, b, c).as_f32x16();
1474 transmute(simd_select_bitmask(k, fmaddsub, c.as_f32x16()))
1475 }
1476
1477 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst.
1478 ///
1479 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_fmaddsub_pd&expand=2599)
1480 #[inline]
1481 #[target_feature(enable = "avx512f")]
1482 #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd
1483 pub unsafe fn _mm512_fmaddsub_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d {
1484 transmute(vfmaddsub213pd(
1485 a.as_f64x8(),
1486 b.as_f64x8(),
1487 c.as_f64x8(),
1488 _MM_FROUND_CUR_DIRECTION,
1489 ))
1490 }
1491
1492 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
1493 ///
1494 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_fmaddsub_pd&expand=2600)
1495 #[inline]
1496 #[target_feature(enable = "avx512f")]
1497 #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd
1498 pub unsafe fn _mm512_mask_fmaddsub_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d {
1499 let fmaddsub = _mm512_fmaddsub_pd(a, b, c).as_f64x8();
1500 transmute(simd_select_bitmask(k, fmaddsub, a.as_f64x8()))
1501 }
1502
1503 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1504 ///
1505 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_fmaddsub_pd&expand=2602)
1506 #[inline]
1507 #[target_feature(enable = "avx512f")]
1508 #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd
1509 pub unsafe fn _mm512_maskz_fmaddsub_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d {
1510 let fmaddsub = _mm512_fmaddsub_pd(a, b, c).as_f64x8();
1511 let zero = _mm512_setzero_pd().as_f64x8();
1512 transmute(simd_select_bitmask(k, fmaddsub, zero))
1513 }
1514
1515 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
1516 ///
1517 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask3_fmaddsub_pd&expand=2601)
1518 #[inline]
1519 #[target_feature(enable = "avx512f")]
1520 #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd
1521 pub unsafe fn _mm512_mask3_fmaddsub_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d {
1522 let fmaddsub = _mm512_fmaddsub_pd(a, b, c).as_f64x8();
1523 transmute(simd_select_bitmask(k, fmaddsub, c.as_f64x8()))
1524 }
1525
1526 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst.
1527 ///
1528 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_fmsubadd_ps&expand=2691)
1529 #[inline]
1530 #[target_feature(enable = "avx512f")]
1531 #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps
1532 pub unsafe fn _mm512_fmsubadd_ps(a: __m512, b: __m512, c: __m512) -> __m512 {
1533 let zero: f32x16 = mem::zeroed();
1534 let sub = simd_sub(zero, c.as_f32x16());
1535 transmute(vfmaddsub213ps(
1536 a.as_f32x16(),
1537 b.as_f32x16(),
1538 sub,
1539 _MM_FROUND_CUR_DIRECTION,
1540 ))
1541 }
1542
1543 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
1544 ///
1545 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_fmsubadd_ps&expand=2692)
1546 #[inline]
1547 #[target_feature(enable = "avx512f")]
1548 #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps
1549 pub unsafe fn _mm512_mask_fmsubadd_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 {
1550 let fmsubadd = _mm512_fmsubadd_ps(a, b, c).as_f32x16();
1551 transmute(simd_select_bitmask(k, fmsubadd, a.as_f32x16()))
1552 }
1553
1554 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1555 ///
1556 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_fmsubadd_ps&expand=2694)
1557 #[inline]
1558 #[target_feature(enable = "avx512f")]
1559 #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps
1560 pub unsafe fn _mm512_maskz_fmsubadd_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 {
1561 let fmsubadd = _mm512_fmsubadd_ps(a, b, c).as_f32x16();
1562 let zero = _mm512_setzero_ps().as_f32x16();
1563 transmute(simd_select_bitmask(k, fmsubadd, zero))
1564 }
1565
1566 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
1567 ///
1568 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask3_fmsubadd_ps&expand=2693)
1569 #[inline]
1570 #[target_feature(enable = "avx512f")]
1571 #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps
1572 pub unsafe fn _mm512_mask3_fmsubadd_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 {
1573 let fmsubadd = _mm512_fmsubadd_ps(a, b, c).as_f32x16();
1574 transmute(simd_select_bitmask(k, fmsubadd, c.as_f32x16()))
1575 }
1576
1577 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst.
1578 ///
1579 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_fmsubadd_pd&expand=2679)
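///
/// A sketch of the alternating subtract/add pattern (illustrative only, assuming
/// `_mm512_set1_pd` and an AVX512F-capable CPU):
///
/// ```ignore
/// unsafe {
///     let a = _mm512_set1_pd(2.0);
///     let b = _mm512_set1_pd(3.0);
///     let c = _mm512_set1_pd(1.0);
///     // even-indexed lanes hold 2.0 * 3.0 + 1.0 = 7.0,
///     // odd-indexed lanes hold 2.0 * 3.0 - 1.0 = 5.0
///     let r = _mm512_fmsubadd_pd(a, b, c);
/// }
/// ```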
1580 #[inline]
1581 #[target_feature(enable = "avx512f")]
1582 #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd
1583 pub unsafe fn _mm512_fmsubadd_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d {
1584 let zero: f64x8 = mem::zeroed();
1585 let sub = simd_sub(zero, c.as_f64x8());
1586 transmute(vfmaddsub213pd(
1587 a.as_f64x8(),
1588 b.as_f64x8(),
1589 sub,
1590 _MM_FROUND_CUR_DIRECTION,
1591 ))
1592 }
1593
1594 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
1595 ///
1596 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_fmsubadd_pd&expand=2680)
1597 #[inline]
1598 #[target_feature(enable = "avx512f")]
1599 #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd
1600 pub unsafe fn _mm512_mask_fmsubadd_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d {
1601 let fmsubadd = _mm512_fmsubadd_pd(a, b, c).as_f64x8();
1602 transmute(simd_select_bitmask(k, fmsubadd, a.as_f64x8()))
1603 }
1604
1605 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1606 ///
1607 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_fmsubadd_pd&expand=2682)
1608 #[inline]
1609 #[target_feature(enable = "avx512f")]
1610 #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd
1611 pub unsafe fn _mm512_maskz_fmsubadd_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d {
1612 let fmsubadd = _mm512_fmsubadd_pd(a, b, c).as_f64x8();
1613 let zero = _mm512_setzero_pd().as_f64x8();
1614 transmute(simd_select_bitmask(k, fmsubadd, zero))
1615 }
1616
1617 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
1618 ///
1619 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask3_fmsubadd_pd&expand=2681)
1620 #[inline]
1621 #[target_feature(enable = "avx512f")]
1622 #[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd
1623 pub unsafe fn _mm512_mask3_fmsubadd_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d {
1624 let fmsubadd = _mm512_fmsubadd_pd(a, b, c).as_f64x8();
1625 transmute(simd_select_bitmask(k, fmsubadd, c.as_f64x8()))
1626 }
1627
1628 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst.
1629 ///
1630 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_fnmadd_ps&expand=2723)
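///
/// A minimal usage sketch (illustrative only, assuming `_mm512_set1_ps` and an
/// AVX512F-capable CPU):
///
/// ```ignore
/// unsafe {
///     let a = _mm512_set1_ps(2.0);
///     let b = _mm512_set1_ps(3.0);
///     let c = _mm512_set1_ps(1.0);
///     // every lane of `r` holds -(2.0 * 3.0) + 1.0 = -5.0
///     let r = _mm512_fnmadd_ps(a, b, c);
/// }
/// ```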
1631 #[inline]
1632 #[target_feature(enable = "avx512f")]
1633 #[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps
1634 pub unsafe fn _mm512_fnmadd_ps(a: __m512, b: __m512, c: __m512) -> __m512 {
1635 let zero: f32x16 = mem::zeroed();
1636 let sub = simd_sub(zero, a.as_f32x16());
1637 transmute(vfmadd132ps(
1638 sub,
1639 b.as_f32x16(),
1640 c.as_f32x16(),
1641 _MM_FROUND_CUR_DIRECTION,
1642 ))
1643 }
1644
1645 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
1646 ///
1647 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_fnmadd_ps&expand=2724)
1648 #[inline]
1649 #[target_feature(enable = "avx512f")]
1650 #[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps
1651 pub unsafe fn _mm512_mask_fnmadd_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 {
1652 let fnmadd = _mm512_fnmadd_ps(a, b, c).as_f32x16();
1653 transmute(simd_select_bitmask(k, fnmadd, a.as_f32x16()))
1654 }
1655
1656 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1657 ///
1658 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_fnmadd_ps&expand=2726)
1659 #[inline]
1660 #[target_feature(enable = "avx512f")]
1661 #[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps
1662 pub unsafe fn _mm512_maskz_fnmadd_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 {
1663 let fnmadd = _mm512_fnmadd_ps(a, b, c).as_f32x16();
1664 let zero = _mm512_setzero_ps().as_f32x16();
1665 transmute(simd_select_bitmask(k, fnmadd, zero))
1666 }
1667
1668 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
1669 ///
1670 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask3_fnmadd_ps&expand=2725)
1671 #[inline]
1672 #[target_feature(enable = "avx512f")]
1673 #[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps
1674 pub unsafe fn _mm512_mask3_fnmadd_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 {
1675 let fnmadd = _mm512_fnmadd_ps(a, b, c).as_f32x16();
1676 transmute(simd_select_bitmask(k, fnmadd, c.as_f32x16()))
1677 }
1678
1679 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst.
1680 ///
1681 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_fnmadd_pd&expand=2711)
1682 #[inline]
1683 #[target_feature(enable = "avx512f")]
1684 #[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd
1685 pub unsafe fn _mm512_fnmadd_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d {
1686 let zero: f64x8 = mem::zeroed();
1687 let sub = simd_sub(zero, a.as_f64x8());
1688 transmute(vfmadd132pd(
1689 sub,
1690 b.as_f64x8(),
1691 c.as_f64x8(),
1692 _MM_FROUND_CUR_DIRECTION,
1693 ))
1694 }
1695
1696 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
1697 ///
1698 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_fnmadd_pd&expand=2712)
1699 #[inline]
1700 #[target_feature(enable = "avx512f")]
1701 #[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd
1702 pub unsafe fn _mm512_mask_fnmadd_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d {
1703 let fnmadd = _mm512_fnmadd_pd(a, b, c).as_f64x8();
1704 transmute(simd_select_bitmask(k, fnmadd, a.as_f64x8()))
1705 }
1706
1707 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1708 ///
1709 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_fnmadd_pd&expand=2714)
1710 #[inline]
1711 #[target_feature(enable = "avx512f")]
1712 #[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd
1713 pub unsafe fn _mm512_maskz_fnmadd_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d {
1714 let fnmadd = _mm512_fnmadd_pd(a, b, c).as_f64x8();
1715 let zero = _mm512_setzero_pd().as_f64x8();
1716 transmute(simd_select_bitmask(k, fnmadd, zero))
1717 }
1718
1719 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
1720 ///
1721 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask3_fnmadd_pd&expand=2713)
1722 #[inline]
1723 #[target_feature(enable = "avx512f")]
1724 #[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd
1725 pub unsafe fn _mm512_mask3_fnmadd_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d {
1726 let fnmadd = _mm512_fnmadd_pd(a, b, c).as_f64x8();
1727 transmute(simd_select_bitmask(k, fnmadd, c.as_f64x8()))
1728 }
1729
1730 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst.
1731 ///
1732 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_fnmsub_ps&expand=2771)
1733 #[inline]
1734 #[target_feature(enable = "avx512f")]
1735 #[cfg_attr(test, assert_instr(vfmadd))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps
1736 pub unsafe fn _mm512_fnmsub_ps(a: __m512, b: __m512, c: __m512) -> __m512 {
1737 let zero: f32x16 = mem::zeroed();
1738 let suba = simd_sub(zero, a.as_f32x16());
1739 let subc = simd_sub(zero, c.as_f32x16());
1740 transmute(vfmadd132ps(
1741 suba,
1742 b.as_f32x16(),
1743 subc,
1744 _MM_FROUND_CUR_DIRECTION,
1745 ))
1746 }
1747
1748 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
1749 ///
1750 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_fnmsub_ps&expand=2772)
1751 #[inline]
1752 #[target_feature(enable = "avx512f")]
1753 #[cfg_attr(test, assert_instr(vfmadd))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps
1754 pub unsafe fn _mm512_mask_fnmsub_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 {
1755 let fnmsub = _mm512_fnmsub_ps(a, b, c).as_f32x16();
1756 transmute(simd_select_bitmask(k, fnmsub, a.as_f32x16()))
1757 }
1758
1759 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1760 ///
1761 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_fnmsub_ps&expand=2774)
1762 #[inline]
1763 #[target_feature(enable = "avx512f")]
1764 #[cfg_attr(test, assert_instr(vfmadd))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps
1765 pub unsafe fn _mm512_maskz_fnmsub_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 {
1766 let fnmsub = _mm512_fnmsub_ps(a, b, c).as_f32x16();
1767 let zero = _mm512_setzero_ps().as_f32x16();
1768 transmute(simd_select_bitmask(k, fnmsub, zero))
1769 }
1770
1771 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
1772 ///
1773 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask3_fnmsub_ps&expand=2773)
1774 #[inline]
1775 #[target_feature(enable = "avx512f")]
1776 #[cfg_attr(test, assert_instr(vfmadd))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps
1777 pub unsafe fn _mm512_mask3_fnmsub_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 {
1778 let fnmsub = _mm512_fnmsub_ps(a, b, c).as_f32x16();
1779 transmute(simd_select_bitmask(k, fnmsub, c.as_f32x16()))
1780 }
1781
1782 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst.
1783 ///
1784 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_fnmsub_pd&expand=2759)
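///
/// A minimal usage sketch (illustrative only, assuming `_mm512_set1_pd` and an
/// AVX512F-capable CPU):
///
/// ```ignore
/// unsafe {
///     let a = _mm512_set1_pd(2.0);
///     let b = _mm512_set1_pd(3.0);
///     let c = _mm512_set1_pd(1.0);
///     // every lane of `r` holds -(2.0 * 3.0) - 1.0 = -7.0
///     let r = _mm512_fnmsub_pd(a, b, c);
/// }
/// ```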
1785 #[inline]
1786 #[target_feature(enable = "avx512f")]
1787 #[cfg_attr(test, assert_instr(vfmadd))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd
1788 pub unsafe fn _mm512_fnmsub_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d {
1789 let zero: f64x8 = mem::zeroed();
1790 let suba = simd_sub(zero, a.as_f64x8());
1791 let subc = simd_sub(zero, c.as_f64x8());
1792 transmute(vfmadd132pd(
1793 suba,
1794 b.as_f64x8(),
1795 subc,
1796 _MM_FROUND_CUR_DIRECTION,
1797 ))
1798 }
1799
1800 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
1801 ///
1802 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_fnmsub_pd&expand=2760)
1803 #[inline]
1804 #[target_feature(enable = "avx512f")]
1805 #[cfg_attr(test, assert_instr(vfmadd))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd
1806 pub unsafe fn _mm512_mask_fnmsub_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d {
1807 let fnmsub = _mm512_fnmsub_pd(a, b, c).as_f64x8();
1808 transmute(simd_select_bitmask(k, fnmsub, a.as_f64x8()))
1809 }
1810
1811 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1812 ///
1813 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_fnmsub_pd&expand=2762)
1814 #[inline]
1815 #[target_feature(enable = "avx512f")]
1816 #[cfg_attr(test, assert_instr(vfmadd))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd
1817 pub unsafe fn _mm512_maskz_fnmsub_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d {
1818 let fnmsub = _mm512_fnmsub_pd(a, b, c).as_f64x8();
1819 let zero = _mm512_setzero_pd().as_f64x8();
1820 transmute(simd_select_bitmask(k, fnmsub, zero))
1821 }
1822
1823 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
1824 ///
1825 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask3_fnmsub_pd&expand=2761)
1826 #[inline]
1827 #[target_feature(enable = "avx512f")]
1828 #[cfg_attr(test, assert_instr(vfmadd))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd
1829 pub unsafe fn _mm512_mask3_fnmsub_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d {
1830 let fnmsub = _mm512_fnmsub_pd(a, b, c).as_f64x8();
1831 transmute(simd_select_bitmask(k, fnmsub, c.as_f64x8()))
1832 }
1833
1834 /// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14.
1835 ///
1836 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_rcp14_ps&expand=4502)
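///
/// A minimal usage sketch (illustrative only, assuming `_mm512_set1_ps` and an
/// AVX512F-capable CPU):
///
/// ```ignore
/// unsafe {
///     let a = _mm512_set1_ps(4.0);
///     // every lane of `r` is approximately 0.25 (relative error below 2^-14)
///     let r = _mm512_rcp14_ps(a);
/// }
/// ```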
1837 #[inline]
1838 #[target_feature(enable = "avx512f")]
1839 #[cfg_attr(test, assert_instr(vrcp14ps))]
1840 pub unsafe fn _mm512_rcp14_ps(a: __m512) -> __m512 {
1841 transmute(vrcp14ps(
1842 a.as_f32x16(),
1843 _mm512_setzero_ps().as_f32x16(),
1844 0b11111111_11111111,
1845 ))
1846 }
1847
1848 /// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
1849 ///
1850 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_rcp14_ps&expand=4500)
1851 #[inline]
1852 #[target_feature(enable = "avx512f")]
1853 #[cfg_attr(test, assert_instr(vrcp14ps))]
1854 pub unsafe fn _mm512_mask_rcp14_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 {
1855 transmute(vrcp14ps(a.as_f32x16(), src.as_f32x16(), k))
1856 }
1857
1858 /// Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
1859 ///
1860 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_rcp14_ps&expand=4501)
1861 #[inline]
1862 #[target_feature(enable = "avx512f")]
1863 #[cfg_attr(test, assert_instr(vrcp14ps))]
1864 pub unsafe fn _mm512_maskz_rcp14_ps(k: __mmask16, a: __m512) -> __m512 {
1865 transmute(vrcp14ps(a.as_f32x16(), _mm512_setzero_ps().as_f32x16(), k))
1866 }
1867
1868 /// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14.
1869 ///
1870 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_rcp14_pd&expand=4493)
1871 #[inline]
1872 #[target_feature(enable = "avx512f")]
1873 #[cfg_attr(test, assert_instr(vrcp14pd))]
1874 pub unsafe fn _mm512_rcp14_pd(a: __m512d) -> __m512d {
1875 transmute(vrcp14pd(
1876 a.as_f64x8(),
1877 _mm512_setzero_pd().as_f64x8(),
1878 0b11111111,
1879 ))
1880 }
1881
1882 /// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
1883 ///
1884 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_rcp14_pd&expand=4491)
1885 #[inline]
1886 #[target_feature(enable = "avx512f")]
1887 #[cfg_attr(test, assert_instr(vrcp14pd))]
1888 pub unsafe fn _mm512_mask_rcp14_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d {
1889 transmute(vrcp14pd(a.as_f64x8(), src.as_f64x8(), k))
1890 }
1891
1892 /// Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
1893 ///
1894 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_rcp14_pd&expand=4492)
1895 #[inline]
1896 #[target_feature(enable = "avx512f")]
1897 #[cfg_attr(test, assert_instr(vrcp14pd))]
1898 pub unsafe fn _mm512_maskz_rcp14_pd(k: __mmask8, a: __m512d) -> __m512d {
1899 transmute(vrcp14pd(a.as_f64x8(), _mm512_setzero_pd().as_f64x8(), k))
1900 }
1901
1902 /// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14.
1903 ///
1904 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_rsqrt14_ps&expand=4819)
1905 #[inline]
1906 #[target_feature(enable = "avx512f")]
1907 #[cfg_attr(test, assert_instr(vrsqrt14ps))]
1908 pub unsafe fn _mm512_rsqrt14_ps(a: __m512) -> __m512 {
1909 transmute(vrsqrt14ps(
1910 a.as_f32x16(),
1911 _mm512_setzero_ps().as_f32x16(),
1912 0b11111111_11111111,
1913 ))
1914 }
1915
1916 /// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
1917 ///
1918 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_rsqrt14_ps&expand=4817)
1919 #[inline]
1920 #[target_feature(enable = "avx512f")]
1921 #[cfg_attr(test, assert_instr(vrsqrt14ps))]
1922 pub unsafe fn _mm512_mask_rsqrt14_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 {
1923 transmute(vrsqrt14ps(a.as_f32x16(), src.as_f32x16(), k))
1924 }
1925
1926 /// Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
1927 ///
1928 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_rsqrt14_ps&expand=4818)
1929 #[inline]
1930 #[target_feature(enable = "avx512f")]
1931 #[cfg_attr(test, assert_instr(vrsqrt14ps))]
1932 pub unsafe fn _mm512_maskz_rsqrt14_ps(k: __mmask16, a: __m512) -> __m512 {
1933 transmute(vrsqrt14ps(
1934 a.as_f32x16(),
1935 _mm512_setzero_ps().as_f32x16(),
1936 k,
1937 ))
1938 }
1939
1940 /// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14.
1941 ///
1942 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_rsqrt14_pd&expand=4812)
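///
/// A minimal usage sketch (illustrative only, assuming `_mm512_set1_pd` and an
/// AVX512F-capable CPU):
///
/// ```ignore
/// unsafe {
///     let a = _mm512_set1_pd(16.0);
///     // every lane of `r` is approximately 1.0 / 4.0 = 0.25
///     let r = _mm512_rsqrt14_pd(a);
/// }
/// ```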
1943 #[inline]
1944 #[target_feature(enable = "avx512f")]
1945 #[cfg_attr(test, assert_instr(vrsqrt14pd))]
1946 pub unsafe fn _mm512_rsqrt14_pd(a: __m512d) -> __m512d {
1947 transmute(vrsqrt14pd(
1948 a.as_f64x8(),
1949 _mm512_setzero_pd().as_f64x8(),
1950 0b11111111,
1951 ))
1952 }
1953
1954 /// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
1955 ///
1956 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_rsqrt14_pd&expand=4810)
1957 #[inline]
1958 #[target_feature(enable = "avx512f")]
1959 #[cfg_attr(test, assert_instr(vrsqrt14pd))]
1960 pub unsafe fn _mm512_mask_rsqrt14_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d {
1961 transmute(vrsqrt14pd(a.as_f64x8(), src.as_f64x8(), k))
1962 }
1963
1964 /// Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
1965 ///
1966 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_rsqrt14_pd&expand=4811)
1967 #[inline]
1968 #[target_feature(enable = "avx512f")]
1969 #[cfg_attr(test, assert_instr(vrsqrt14pd))]
1970 pub unsafe fn _mm512_maskz_rsqrt14_pd(k: __mmask8, a: __m512d) -> __m512d {
1971 transmute(vrsqrt14pd(a.as_f64x8(), _mm512_setzero_pd().as_f64x8(), k))
1972 }
1973
1974 /// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element.
1975 ///
1976 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_getexp_ps&expand=2844)
1977 #[inline]
1978 #[target_feature(enable = "avx512f")]
1979 #[cfg_attr(test, assert_instr(vgetexpps))]
1980 pub unsafe fn _mm512_getexp_ps(a: __m512) -> __m512 {
1981 transmute(vgetexpps(
1982 a.as_f32x16(),
1983 _mm512_setzero_ps().as_f32x16(),
1984 0b11111111_11111111,
1985 _MM_FROUND_CUR_DIRECTION,
1986 ))
1987 }
1988
1989 /// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.
1990 ///
1991 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_getexp_ps&expand=2845)
1992 #[inline]
1993 #[target_feature(enable = "avx512f")]
1994 #[cfg_attr(test, assert_instr(vgetexpps))]
1995 pub unsafe fn _mm512_mask_getexp_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 {
1996 transmute(vgetexpps(
1997 a.as_f32x16(),
1998 src.as_f32x16(),
1999 k,
2000 _MM_FROUND_CUR_DIRECTION,
2001 ))
2002 }
2003
2004 /// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.
2005 ///
2006 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_getexp_ps&expand=2846)
2007 #[inline]
2008 #[target_feature(enable = "avx512f")]
2009 #[cfg_attr(test, assert_instr(vgetexpps))]
2010 pub unsafe fn _mm512_maskz_getexp_ps(k: __mmask16, a: __m512) -> __m512 {
2011 transmute(vgetexpps(
2012 a.as_f32x16(),
2013 _mm512_setzero_ps().as_f32x16(),
2014 k,
2015 _MM_FROUND_CUR_DIRECTION,
2016 ))
2017 }
2018
2019 /// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element.
2020 ///
2021 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_getexp_pd&expand=2835)
2022 #[inline]
2023 #[target_feature(enable = "avx512f")]
2024 #[cfg_attr(test, assert_instr(vgetexppd))]
2025 pub unsafe fn _mm512_getexp_pd(a: __m512d) -> __m512d {
2026 transmute(vgetexppd(
2027 a.as_f64x8(),
2028 _mm512_setzero_pd().as_f64x8(),
2029 0b11111111,
2030 _MM_FROUND_CUR_DIRECTION,
2031 ))
2032 }
2033
2034 /// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.
2035 ///
2036 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_getexp_pd&expand=2836)
2037 #[inline]
2038 #[target_feature(enable = "avx512f")]
2039 #[cfg_attr(test, assert_instr(vgetexppd))]
2040 pub unsafe fn _mm512_mask_getexp_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d {
2041 transmute(vgetexppd(
2042 a.as_f64x8(),
2043 src.as_f64x8(),
2044 k,
2045 _MM_FROUND_CUR_DIRECTION,
2046 ))
2047 }
2048
2049 /// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.
2050 ///
2051 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_getexp_pd&expand=2837)
2052 #[inline]
2053 #[target_feature(enable = "avx512f")]
2054 #[cfg_attr(test, assert_instr(vgetexppd))]
2055 pub unsafe fn _mm512_maskz_getexp_pd(k: __mmask8, a: __m512d) -> __m512d {
2056 transmute(vgetexppd(
2057 a.as_f64x8(),
2058 _mm512_setzero_pd().as_f64x8(),
2059 k,
2060 _MM_FROUND_CUR_DIRECTION,
2061 ))
2062 }
2063
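// Illustrative usage sketch: getexp returns floor(log2(|x|)) of each lane as a
// floating-point value. `getexp_demo` is a hypothetical helper for illustration
// only; AVX512F support at runtime is assumed.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn getexp_demo() {
    let a = _mm512_set1_ps(24.0); // 24 = 1.5 * 2^4, so the unbiased exponent is 4
    let lanes: [f32; 16] = transmute(_mm512_getexp_ps(a));
    assert_eq!(lanes[0], 4.0);
}
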
2064 /// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.\
2065 /// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
2066 /// _MM_FROUND_TO_NEAREST_INT // round to nearest\
2067 /// _MM_FROUND_TO_NEG_INF // round down\
2068 /// _MM_FROUND_TO_POS_INF // round up\
2069 /// _MM_FROUND_TO_ZERO // truncate\
2070 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
2071 ///
2072 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_roundscale_ps&expand=4784)
2073 #[inline]
2074 #[target_feature(enable = "avx512f")]
2075 #[cfg_attr(test, assert_instr(vrndscaleps, imm8 = 0))]
2076 #[rustc_args_required_const(1)]
2077 pub unsafe fn _mm512_roundscale_ps(a: __m512, imm8: i32) -> __m512 {
2078 let a = a.as_f32x16();
2079 let zero = _mm512_setzero_ps().as_f32x16();
2080 macro_rules! call {
2081 ($imm8:expr) => {
2082 vrndscaleps(
2083 a,
2084 $imm8,
2085 zero,
2086 0b11111111_11111111,
2087 _MM_FROUND_CUR_DIRECTION,
2088 )
2089 };
2090 }
2091 let r = constify_imm8_sae!(imm8, call);
2092 transmute(r)
2093 }
2094
2095 /// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
2096 /// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
2097 /// _MM_FROUND_TO_NEAREST_INT // round to nearest\
2098 /// _MM_FROUND_TO_NEG_INF // round down\
2099 /// _MM_FROUND_TO_POS_INF // round up\
2100 /// _MM_FROUND_TO_ZERO // truncate\
2101 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
2102 ///
2103 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_roundscale_ps&expand=4782)
2104 #[inline]
2105 #[target_feature(enable = "avx512f")]
2106 #[cfg_attr(test, assert_instr(vrndscaleps, imm8 = 0))]
2107 #[rustc_args_required_const(3)]
2108 pub unsafe fn _mm512_mask_roundscale_ps(src: __m512, k: __mmask16, a: __m512, imm8: i32) -> __m512 {
2109 let a = a.as_f32x16();
2110 let src = src.as_f32x16();
2111 macro_rules! call {
2112 ($imm8:expr) => {
2113 vrndscaleps(a, $imm8, src, k, _MM_FROUND_CUR_DIRECTION)
2114 };
2115 }
2116 let r = constify_imm8_sae!(imm8, call);
2117 transmute(r)
2118 }
2119
2120 /// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
2121 /// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
2122 /// _MM_FROUND_TO_NEAREST_INT // round to nearest\
2123 /// _MM_FROUND_TO_NEG_INF // round down\
2124 /// _MM_FROUND_TO_POS_INF // round up\
2125 /// _MM_FROUND_TO_ZERO // truncate\
2126 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
2127 ///
2128 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_roundscale_ps&expand=4783)
2129 #[inline]
2130 #[target_feature(enable = "avx512f")]
2131 #[cfg_attr(test, assert_instr(vrndscaleps, imm8 = 0))]
2132 #[rustc_args_required_const(2)]
2133 pub unsafe fn _mm512_maskz_roundscale_ps(k: __mmask16, a: __m512, imm8: i32) -> __m512 {
2134 let a = a.as_f32x16();
2135 let zero = _mm512_setzero_ps().as_f32x16();
2136 macro_rules! call {
2137 ($imm8:expr) => {
2138 vrndscaleps(a, $imm8, zero, k, _MM_FROUND_CUR_DIRECTION)
2139 };
2140 }
2141 let r = constify_imm8_sae!(imm8, call);
2142 transmute(r)
2143 }
2144
2145 /// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.\
2146 /// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
2147 /// _MM_FROUND_TO_NEAREST_INT // round to nearest\
2148 /// _MM_FROUND_TO_NEG_INF // round down\
2149 /// _MM_FROUND_TO_POS_INF // round up\
2150 /// _MM_FROUND_TO_ZERO // truncate\
2151 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
2152 ///
2153 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_roundscale_pd&expand=4775)
2154 #[inline]
2155 #[target_feature(enable = "avx512f")]
2156 #[cfg_attr(test, assert_instr(vrndscalepd, imm8 = 0))]
2157 #[rustc_args_required_const(1)]
2158 pub unsafe fn _mm512_roundscale_pd(a: __m512d, imm8: i32) -> __m512d {
2159 let a = a.as_f64x8();
2160 let zero = _mm512_setzero_pd().as_f64x8();
2161 macro_rules! call {
2162 ($imm8:expr) => {
2163 vrndscalepd(a, $imm8, zero, 0b11111111, _MM_FROUND_CUR_DIRECTION)
2164 };
2165 }
2166 let r = constify_imm8_sae!(imm8, call);
2167 transmute(r)
2168 }
2169
2170 /// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
2171 /// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
2172 /// _MM_FROUND_TO_NEAREST_INT // round to nearest\
2173 /// _MM_FROUND_TO_NEG_INF // round down\
2174 /// _MM_FROUND_TO_POS_INF // round up\
2175 /// _MM_FROUND_TO_ZERO // truncate\
2176 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
2177 ///
2178 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_roundscale_pd&expand=4773)
2179 #[inline]
2180 #[target_feature(enable = "avx512f")]
2181 #[cfg_attr(test, assert_instr(vrndscalepd, imm8 = 0))]
2182 #[rustc_args_required_const(3)]
2183 pub unsafe fn _mm512_mask_roundscale_pd(
2184 src: __m512d,
2185 k: __mmask8,
2186 a: __m512d,
2187 imm8: i32,
2188 ) -> __m512d {
2189 let a = a.as_f64x8();
2190 let src = src.as_f64x8();
2191 macro_rules! call {
2192 ($imm8:expr) => {
2193 vrndscalepd(a, $imm8, src, k, _MM_FROUND_CUR_DIRECTION)
2194 };
2195 }
2196 let r = constify_imm8_sae!(imm8, call);
2197 transmute(r)
2198 }
2199
2200 /// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
2201 /// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
2202 /// _MM_FROUND_TO_NEAREST_INT // round to nearest\
2203 /// _MM_FROUND_TO_NEG_INF // round down\
2204 /// _MM_FROUND_TO_POS_INF // round up\
2205 /// _MM_FROUND_TO_ZERO // truncate\
2206 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
2207 ///
2208 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_roundscale_pd&expand=4774)
2209 #[inline]
2210 #[target_feature(enable = "avx512f")]
2211 #[cfg_attr(test, assert_instr(vrndscalepd, imm8 = 0))]
2212 #[rustc_args_required_const(2)]
2213 pub unsafe fn _mm512_maskz_roundscale_pd(k: __mmask8, a: __m512d, imm8: i32) -> __m512d {
2214 let a = a.as_f64x8();
2215 let zero = _mm512_setzero_pd().as_f64x8();
2216 macro_rules! call {
2217 ($imm8:expr) => {
2218 vrndscalepd(a, $imm8, zero, k, _MM_FROUND_CUR_DIRECTION)
2219 };
2220 }
2221 let r = constify_imm8_sae!(imm8, call);
2222 transmute(r)
2223 }
2224
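// Illustrative usage sketch: with imm8 = 0 the elements are rounded to the
// nearest integer (no fraction bits kept), while _MM_FROUND_TO_ZERO in the low
// bits truncates instead. `roundscale_demo` is a hypothetical helper for
// illustration only; AVX512F support at runtime is assumed.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn roundscale_demo() {
    let a = _mm512_set1_ps(1.75);
    let nearest: [f32; 16] = transmute(_mm512_roundscale_ps(a, 0));
    let truncated: [f32; 16] = transmute(_mm512_roundscale_ps(a, _MM_FROUND_TO_ZERO));
    assert_eq!(nearest[0], 2.0);
    assert_eq!(truncated[0], 1.0);
}
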
2225 /// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst.
2226 ///
2227 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_scalef_ps&expand=4883)
2228 #[inline]
2229 #[target_feature(enable = "avx512f")]
2230 #[cfg_attr(test, assert_instr(vscalefps))]
2231 pub unsafe fn _mm512_scalef_ps(a: __m512, b: __m512) -> __m512 {
2232 transmute(vscalefps(
2233 a.as_f32x16(),
2234 b.as_f32x16(),
2235 _mm512_setzero_ps().as_f32x16(),
2236 0b11111111_11111111,
2237 _MM_FROUND_CUR_DIRECTION,
2238 ))
2239 }
2240
2241 /// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
2242 ///
2243 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_scalef_ps&expand=4881)
2244 #[inline]
2245 #[target_feature(enable = "avx512f")]
2246 #[cfg_attr(test, assert_instr(vscalefps))]
2247 pub unsafe fn _mm512_mask_scalef_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 {
2248 transmute(vscalefps(
2249 a.as_f32x16(),
2250 b.as_f32x16(),
2251 src.as_f32x16(),
2252 k,
2253 _MM_FROUND_CUR_DIRECTION,
2254 ))
2255 }
2256
2257 /// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2258 ///
2259 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_scalef_ps&expand=4882)
2260 #[inline]
2261 #[target_feature(enable = "avx512f")]
2262 #[cfg_attr(test, assert_instr(vscalefps))]
2263 pub unsafe fn _mm512_maskz_scalef_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 {
2264 transmute(vscalefps(
2265 a.as_f32x16(),
2266 b.as_f32x16(),
2267 _mm512_setzero_ps().as_f32x16(),
2268 k,
2269 _MM_FROUND_CUR_DIRECTION,
2270 ))
2271 }
2272
2273 /// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst.
2274 ///
2275 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_scalef_pd&expand=4874)
2276 #[inline]
2277 #[target_feature(enable = "avx512f")]
2278 #[cfg_attr(test, assert_instr(vscalefpd))]
2279 pub unsafe fn _mm512_scalef_pd(a: __m512d, b: __m512d) -> __m512d {
2280 transmute(vscalefpd(
2281 a.as_f64x8(),
2282 b.as_f64x8(),
2283 _mm512_setzero_pd().as_f64x8(),
2284 0b11111111,
2285 _MM_FROUND_CUR_DIRECTION,
2286 ))
2287 }
2288
2289 /// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
2290 ///
2291 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_scalef_pd&expand=4872)
2292 #[inline]
2293 #[target_feature(enable = "avx512f")]
2294 #[cfg_attr(test, assert_instr(vscalefpd))]
2295 pub unsafe fn _mm512_mask_scalef_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
2296 transmute(vscalefpd(
2297 a.as_f64x8(),
2298 b.as_f64x8(),
2299 src.as_f64x8(),
2300 k,
2301 _MM_FROUND_CUR_DIRECTION,
2302 ))
2303 }
2304
2305 /// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2306 ///
2307 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_scalef_pd&expand=4873)
2308 #[inline]
2309 #[target_feature(enable = "avx512f")]
2310 #[cfg_attr(test, assert_instr(vscalefpd))]
2311 pub unsafe fn _mm512_maskz_scalef_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
2312 transmute(vscalefpd(
2313 a.as_f64x8(),
2314 b.as_f64x8(),
2315 _mm512_setzero_pd().as_f64x8(),
2316 k,
2317 _MM_FROUND_CUR_DIRECTION,
2318 ))
2319 }
2320
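// Illustrative usage sketch: scalef computes a * 2^floor(b) per lane.
// `scalef_demo` is a hypothetical helper for illustration only; AVX512F
// support at runtime is assumed.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn scalef_demo() {
    let a = _mm512_set1_ps(3.0);
    let b = _mm512_set1_ps(4.0);
    let lanes: [f32; 16] = transmute(_mm512_scalef_ps(a, b)); // 3.0 * 2^4
    assert_eq!(lanes[0], 48.0);
}
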
2321 /// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting.
2322 ///
2323 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_fixupimm_ps&expand=2499)
2324 #[inline]
2325 #[target_feature(enable = "avx512f")]
2326 #[cfg_attr(test, assert_instr(vfixupimmps, imm8 = 0))]
2327 #[rustc_args_required_const(3)]
2328 pub unsafe fn _mm512_fixupimm_ps(a: __m512, b: __m512, c: __m512i, imm8: i32) -> __m512 {
2329 let a = a.as_f32x16();
2330 let b = b.as_f32x16();
2331 let c = c.as_i32x16();
2332 macro_rules! call {
2333 ($imm8:expr) => {
2334 vfixupimmps(
2335 a,
2336 b,
2337 c,
2338 $imm8,
2339 0b11111111_11111111,
2340 _MM_FROUND_CUR_DIRECTION,
2341 )
2342 };
2343 }
2344 let r = constify_imm8_sae!(imm8, call);
2345 transmute(r)
2346 }
2347
2348 /// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.
2349 ///
2350 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_fixupimm_ps&expand=2500)
2351 #[inline]
2352 #[target_feature(enable = "avx512f")]
2353 #[cfg_attr(test, assert_instr(vfixupimmps, imm8 = 0))]
2354 #[rustc_args_required_const(4)]
2355 pub unsafe fn _mm512_mask_fixupimm_ps(
2356 a: __m512,
2357 k: __mmask16,
2358 b: __m512,
2359 c: __m512i,
2360 imm8: i32,
2361 ) -> __m512 {
2362 let a = a.as_f32x16();
2363 let b = b.as_f32x16();
2364 let c = c.as_i32x16();
2365 macro_rules! call {
2366 ($imm8:expr) => {
2367 vfixupimmps(a, b, c, $imm8, k, _MM_FROUND_CUR_DIRECTION)
2368 };
2369 }
2370 let r = constify_imm8_sae!(imm8, call);
2371 transmute(r)
2372 }
2373
2374 /// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.
2375 ///
2376 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_fixupimm_ps&expand=2501)
2377 #[inline]
2378 #[target_feature(enable = "avx512f")]
2379 #[cfg_attr(test, assert_instr(vfixupimmps, imm8 = 0))]
2380 #[rustc_args_required_const(4)]
2381 pub unsafe fn _mm512_maskz_fixupimm_ps(
2382 k: __mmask16,
2383 a: __m512,
2384 b: __m512,
2385 c: __m512i,
2386 imm8: i32,
2387 ) -> __m512 {
2388 let a = a.as_f32x16();
2389 let b = b.as_f32x16();
2390 let c = c.as_i32x16();
2391 macro_rules! call {
2392 ($imm8:expr) => {
2393 vfixupimmpsz(a, b, c, $imm8, k, _MM_FROUND_CUR_DIRECTION)
2394 };
2395 }
2396 let r = constify_imm8_sae!(imm8, call);
2397 transmute(r)
2398 }
2399
2400 /// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting.
2401 ///
2402 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_fixupimm_pd&expand=2490)
2403 #[inline]
2404 #[target_feature(enable = "avx512f")]
2405 #[cfg_attr(test, assert_instr(vfixupimmpd, imm8 = 0))]
2406 #[rustc_args_required_const(3)]
2407 pub unsafe fn _mm512_fixupimm_pd(a: __m512d, b: __m512d, c: __m512i, imm8: i32) -> __m512d {
2408 let a = a.as_f64x8();
2409 let b = b.as_f64x8();
2410 let c = c.as_i64x8();
2411 macro_rules! call {
2412 ($imm8:expr) => {
2413 vfixupimmpd(a, b, c, $imm8, 0b11111111, _MM_FROUND_CUR_DIRECTION)
2414 };
2415 }
2416 let r = constify_imm8_sae!(imm8, call);
2417 transmute(r)
2418 }
2419
2420 /// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.
2421 ///
2422 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_fixupimm_pd&expand=2491)
2423 #[inline]
2424 #[target_feature(enable = "avx512f")]
2425 #[cfg_attr(test, assert_instr(vfixupimmpd, imm8 = 0))]
2426 #[rustc_args_required_const(4)]
2427 pub unsafe fn _mm512_mask_fixupimm_pd(
2428 a: __m512d,
2429 k: __mmask8,
2430 b: __m512d,
2431 c: __m512i,
2432 imm8: i32,
2433 ) -> __m512d {
2434 let a = a.as_f64x8();
2435 let b = b.as_f64x8();
2436 let c = c.as_i64x8();
2437 macro_rules! call {
2438 ($imm8:expr) => {
2439 vfixupimmpd(a, b, c, $imm8, k, _MM_FROUND_CUR_DIRECTION)
2440 };
2441 }
2442 let r = constify_imm8_sae!(imm8, call);
2443 transmute(r)
2444 }
2445
2446 /// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.
2447 ///
2448 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_fixupimm_pd&expand=2492)
2449 #[inline]
2450 #[target_feature(enable = "avx512f")]
2451 #[cfg_attr(test, assert_instr(vfixupimmpd, imm8 = 0))]
2452 #[rustc_args_required_const(4)]
2453 pub unsafe fn _mm512_maskz_fixupimm_pd(
2454 k: __mmask8,
2455 a: __m512d,
2456 b: __m512d,
2457 c: __m512i,
2458 imm8: i32,
2459 ) -> __m512d {
2460 let a = a.as_f64x8();
2461 let b = b.as_f64x8();
2462 let c = c.as_i64x8();
2463 macro_rules! call {
2464 ($imm8:expr) => {
2465 vfixupimmpdz(a, b, c, $imm8, k, _MM_FROUND_CUR_DIRECTION)
2466 };
2467 }
2468 let r = constify_imm8_sae!(imm8, call);
2469 transmute(r)
2470 }
2471
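// Illustrative usage sketch of the call shape. Each 32-bit element of the third
// operand packs eight 4-bit response tokens, one per input class of b; per
// Intel's token table, an all-zero table requests the "no fixup" response
// (keep the element from a) for every class, and imm8 only controls exception
// reporting. `fixupimm_demo` is a hypothetical helper for illustration only;
// AVX512F support at runtime is assumed.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn fixupimm_demo() {
    let a = _mm512_set1_ps(1.0);
    let b = _mm512_set1_ps(f32::NAN);
    let table = _mm512_set1_epi32(0); // token 0 for every class: keep a
    let _r = _mm512_fixupimm_ps(a, b, table, 0);
}
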
2472 /// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst.
2473 ///
2474 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_ternarylogic_epi32&expand=5867)
2475 #[inline]
2476 #[target_feature(enable = "avx512f")]
2477 #[cfg_attr(test, assert_instr(vpternlogd, imm8 = 114))]
2478 #[rustc_args_required_const(3)]
2479 pub unsafe fn _mm512_ternarylogic_epi32(a: __m512i, b: __m512i, c: __m512i, imm8: i32) -> __m512i {
2480 let a = a.as_i32x16();
2481 let b = b.as_i32x16();
2482 let c = c.as_i32x16();
2483 macro_rules! call {
2484 ($imm8:expr) => {
2485 vpternlogd(a, b, c, $imm8)
2486 };
2487 }
2488 let r = constify_imm8_sae!(imm8, call);
2489 transmute(r)
2490 }
2491
2492 /// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from src, a, and b are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 32-bit granularity (32-bit elements are copied from src when the corresponding mask bit is not set).
2493 ///
2494 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_ternarylogic_epi32&expand=5865)
2495 #[inline]
2496 #[target_feature(enable = "avx512f")]
2497 #[cfg_attr(test, assert_instr(vpternlogd, imm8 = 114))]
2498 #[rustc_args_required_const(4)]
2499 pub unsafe fn _mm512_mask_ternarylogic_epi32(
2500 src: __m512i,
2501 k: __mmask16,
2502 a: __m512i,
2503 b: __m512i,
2504 imm8: i32,
2505 ) -> __m512i {
2506 let src = src.as_i32x16();
2507 let a = a.as_i32x16();
2508 let b = b.as_i32x16();
2509 macro_rules! call {
2510 ($imm8:expr) => {
2511 vpternlogd(src, a, b, $imm8)
2512 };
2513 }
2514 let ternarylogic = constify_imm8_sae!(imm8, call);
2515 transmute(simd_select_bitmask(k, ternarylogic, src))
2516 }
2517
2518 /// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set).
2519 ///
2520 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_ternarylogic_epi32&expand=5866)
2521 #[inline]
2522 #[target_feature(enable = "avx512f")]
2523 #[cfg_attr(test, assert_instr(vpternlogd, imm8 = 114))]
2524 #[rustc_args_required_const(4)]
2525 pub unsafe fn _mm512_maskz_ternarylogic_epi32(
2526 k: __mmask16,
2527 a: __m512i,
2528 b: __m512i,
2529 c: __m512i,
2530 imm8: i32,
2531 ) -> __m512i {
2532 let a = a.as_i32x16();
2533 let b = b.as_i32x16();
2534 let c = c.as_i32x16();
2535 macro_rules! call {
2536 ($imm8:expr) => {
2537 vpternlogd(a, b, c, $imm8)
2538 };
2539 }
2540 let ternarylogic = constify_imm8_sae!(imm8, call);
2541 let zero = _mm512_setzero_si512().as_i32x16();
2542 transmute(simd_select_bitmask(k, ternarylogic, zero))
2543 }
2544
2545 /// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst.
2546 ///
2547 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_ternarylogic_epi64&expand=5876)
2548 #[inline]
2549 #[target_feature(enable = "avx512f")]
2550 #[cfg_attr(test, assert_instr(vpternlogq, imm8 = 114))]
2551 #[rustc_args_required_const(3)]
2552 pub unsafe fn _mm512_ternarylogic_epi64(a: __m512i, b: __m512i, c: __m512i, imm8: i32) -> __m512i {
2553 let a = a.as_i64x8();
2554 let b = b.as_i64x8();
2555 let c = c.as_i64x8();
2556 macro_rules! call {
2557 ($imm8:expr) => {
2558 vpternlogq(a, b, c, $imm8)
2559 };
2560 }
2561 let r = constify_imm8_sae!(imm8, call);
2562 transmute(r)
2563 }
2564
2565 /// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from src, a, and b are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 64-bit granularity (64-bit elements are copied from src when the corresponding mask bit is not set).
2566 ///
2567 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_ternarylogic_epi64&expand=5874)
2568 #[inline]
2569 #[target_feature(enable = "avx512f")]
2570 #[cfg_attr(test, assert_instr(vpternlogq, imm8 = 114))]
2571 #[rustc_args_required_const(4)]
2572 pub unsafe fn _mm512_mask_ternarylogic_epi64(
2573 src: __m512i,
2574 k: __mmask8,
2575 a: __m512i,
2576 b: __m512i,
2577 imm8: i32,
2578 ) -> __m512i {
2579 let src = src.as_i64x8();
2580 let a = a.as_i64x8();
2581 let b = b.as_i64x8();
2582 macro_rules! call {
2583 ($imm8:expr) => {
2584 vpternlogq(src, a, b, $imm8)
2585 };
2586 }
2587 let ternarylogic = constify_imm8_sae!(imm8, call);
2588 transmute(simd_select_bitmask(k, ternarylogic, src))
2589 }
2590
2591 /// Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set).
2592 ///
2593 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_ternarylogic_epi64&expand=5875)
2594 #[inline]
2595 #[target_feature(enable = "avx512f")]
2596 #[cfg_attr(test, assert_instr(vpternlogq, imm8 = 114))]
2597 #[rustc_args_required_const(4)]
2598 pub unsafe fn _mm512_maskz_ternarylogic_epi64(
2599 k: __mmask8,
2600 a: __m512i,
2601 b: __m512i,
2602 c: __m512i,
2603 imm8: i32,
2604 ) -> __m512i {
2605 let a = a.as_i64x8();
2606 let b = b.as_i64x8();
2607 let c = c.as_i64x8();
2608 macro_rules! call {
2609 ($imm8:expr) => {
2610 vpternlogq(a, b, c, $imm8)
2611 };
2612 }
2613 let ternarylogic = constify_imm8_sae!(imm8, call);
2614 let zero = _mm512_setzero_si512().as_i64x8();
2615 transmute(simd_select_bitmask(k, ternarylogic, zero))
2616 }
2617
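// Illustrative usage sketch: imm8 is an 8-entry truth table indexed by
// (a_bit << 2 | b_bit << 1 | c_bit); 0x96 encodes three-way XOR. With the
// canonical patterns below every index 0..7 occurs once in the low byte, so
// each lane's low byte reproduces imm8. `ternarylogic_demo` is a hypothetical
// helper for illustration only; AVX512F support at runtime is assumed.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn ternarylogic_demo() {
    let a = _mm512_set1_epi32(0xF0);
    let b = _mm512_set1_epi32(0xCC);
    let c = _mm512_set1_epi32(0xAA);
    let lanes: [i32; 16] = transmute(_mm512_ternarylogic_epi32(a, b, c, 0x96));
    assert_eq!(lanes[0], 0xF0 ^ 0xCC ^ 0xAA); // == 0x96
}
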
2618 /// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
2619 /// The mantissa is normalized to the interval specified by interv, which can take the following values:\
2620 /// _MM_MANT_NORM_1_2 // interval [1, 2)\
2621 /// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
2622 /// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
2623 /// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
2624 /// The sign is determined by sc which can take the following values:\
2625 /// _MM_MANT_SIGN_src // sign = sign(src)\
2626 /// _MM_MANT_SIGN_zero // sign = 0\
2627 /// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
2628 ///
2629 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_getmant_ps&expand=2880)
2630 #[inline]
2631 #[target_feature(enable = "avx512f")]
2632 #[cfg_attr(test, assert_instr(vgetmantps, norm = 0, sign = 0))]
2633 #[rustc_args_required_const(1, 2)]
2634 pub unsafe fn _mm512_getmant_ps(
2635 a: __m512,
2636 norm: _MM_MANTISSA_NORM_ENUM,
2637 sign: _MM_MANTISSA_SIGN_ENUM,
2638 ) -> __m512 {
2639 macro_rules! call {
2640 ($imm4:expr, $imm2:expr) => {
2641 vgetmantps(
2642 a.as_f32x16(),
2643 $imm2 << 2 | $imm4,
2644 _mm512_setzero_ps().as_f32x16(),
2645 0b11111111_11111111,
2646 _MM_FROUND_CUR_DIRECTION,
2647 )
2648 };
2649 }
2650 let r = constify_imm4_mantissas!(norm, sign, call);
2651 transmute(r)
2652 }
2653
2654 /// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
2655 /// The mantissa is normalized to the interval specified by interv, which can take the following values:\
2656 /// _MM_MANT_NORM_1_2 // interval [1, 2)\
2657 /// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
2658 /// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
2659 /// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
2660 /// The sign is determined by sc which can take the following values:\
2661 /// _MM_MANT_SIGN_src // sign = sign(src)\
2662 /// _MM_MANT_SIGN_zero // sign = 0\
2663 /// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
2664 ///
2665 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_getmant_ps&expand=2881)
2666 #[inline]
2667 #[target_feature(enable = "avx512f")]
2668 #[cfg_attr(test, assert_instr(vgetmantps, norm = 0, sign = 0))]
2669 #[rustc_args_required_const(3, 4)]
2670 pub unsafe fn _mm512_mask_getmant_ps(
2671 src: __m512,
2672 k: __mmask16,
2673 a: __m512,
2674 norm: _MM_MANTISSA_NORM_ENUM,
2675 sign: _MM_MANTISSA_SIGN_ENUM,
2676 ) -> __m512 {
2677 macro_rules! call {
2678 ($imm4:expr, $imm2:expr) => {
2679 vgetmantps(
2680 a.as_f32x16(),
2681 $imm2 << 2 | $imm4,
2682 src.as_f32x16(),
2683 k,
2684 _MM_FROUND_CUR_DIRECTION,
2685 )
2686 };
2687 }
2688 let r = constify_imm4_mantissas!(norm, sign, call);
2689 transmute(r)
2690 }
2691
2692 /// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
2693 /// The mantissa is normalized to the interval specified by interv, which can take the following values:\
2694 /// _MM_MANT_NORM_1_2 // interval [1, 2)\
2695 /// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
2696 /// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
2697 /// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
2698 /// The sign is determined by sc which can take the following values:\
2699 /// _MM_MANT_SIGN_src // sign = sign(src)\
2700 /// _MM_MANT_SIGN_zero // sign = 0\
2701 /// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
2702 ///
2703 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_getmant_ps&expand=2882)
2704 #[inline]
2705 #[target_feature(enable = "avx512f")]
2706 #[cfg_attr(test, assert_instr(vgetmantps, norm = 0, sign = 0))]
2707 #[rustc_args_required_const(2, 3)]
2708 pub unsafe fn _mm512_maskz_getmant_ps(
2709 k: __mmask16,
2710 a: __m512,
2711 norm: _MM_MANTISSA_NORM_ENUM,
2712 sign: _MM_MANTISSA_SIGN_ENUM,
2713 ) -> __m512 {
2714 macro_rules! call {
2715 ($imm4:expr, $imm2:expr) => {
2716 vgetmantps(
2717 a.as_f32x16(),
2718 $imm2 << 2 | $imm4,
2719 _mm512_setzero_ps().as_f32x16(),
2720 k,
2721 _MM_FROUND_CUR_DIRECTION,
2722 )
2723 };
2724 }
2725 let r = constify_imm4_mantissas!(norm, sign, call);
2726 transmute(r)
2727 }
2728
2729 /// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
2730 /// The mantissa is normalized to the interval specified by interv, which can take the following values:\
2731 /// _MM_MANT_NORM_1_2 // interval [1, 2)\
2732 /// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
2733 /// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
2734 /// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
2735 /// The sign is determined by sc which can take the following values:\
2736 /// _MM_MANT_SIGN_src // sign = sign(src)\
2737 /// _MM_MANT_SIGN_zero // sign = 0\
2738 /// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
2739 ///
2740 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_getmant_pd&expand=2871)
2741 #[inline]
2742 #[target_feature(enable = "avx512f")]
2743 #[cfg_attr(test, assert_instr(vgetmantpd, norm = 0, sign = 0))]
2744 #[rustc_args_required_const(1, 2)]
2745 pub unsafe fn _mm512_getmant_pd(
2746 a: __m512d,
2747 norm: _MM_MANTISSA_NORM_ENUM,
2748 sign: _MM_MANTISSA_SIGN_ENUM,
2749 ) -> __m512d {
2750 macro_rules! call {
2751 ($imm4:expr, $imm2:expr) => {
2752 vgetmantpd(
2753 a.as_f64x8(),
2754 $imm2 << 2 | $imm4,
2755 _mm512_setzero_pd().as_f64x8(),
2756 0b11111111,
2757 _MM_FROUND_CUR_DIRECTION,
2758 )
2759 };
2760 }
2761 let r = constify_imm4_mantissas!(norm, sign, call);
2762 transmute(r)
2763 }
2764
2765 /// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
2766 /// The mantissa is normalized to the interval specified by interv, which can take the following values:\
2767 /// _MM_MANT_NORM_1_2 // interval [1, 2)\
2768 /// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
2769 /// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
2770 /// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
2771 /// The sign is determined by sc which can take the following values:\
2772 /// _MM_MANT_SIGN_src // sign = sign(src)\
2773 /// _MM_MANT_SIGN_zero // sign = 0\
2774 /// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
2775 ///
2776 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_getmant_pd&expand=2872)
2777 #[inline]
2778 #[target_feature(enable = "avx512f")]
2779 #[cfg_attr(test, assert_instr(vgetmantpd, norm = 0, sign = 0))]
2780 #[rustc_args_required_const(3, 4)]
2781 pub unsafe fn _mm512_mask_getmant_pd(
2782 src: __m512d,
2783 k: __mmask8,
2784 a: __m512d,
2785 norm: _MM_MANTISSA_NORM_ENUM,
2786 sign: _MM_MANTISSA_SIGN_ENUM,
2787 ) -> __m512d {
2788 macro_rules! call {
2789 ($imm4:expr, $imm2:expr) => {
2790 vgetmantpd(
2791 a.as_f64x8(),
2792 $imm2 << 2 | $imm4,
2793 src.as_f64x8(),
2794 k,
2795 _MM_FROUND_CUR_DIRECTION,
2796 )
2797 };
2798 }
2799 let r = constify_imm4_mantissas!(norm, sign, call);
2800 transmute(r)
2801 }
2802
2803 /// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
2804 /// The mantissa is normalized to the interval specified by interv, which can take the following values:\
2805 /// _MM_MANT_NORM_1_2 // interval [1, 2)\
2806 /// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
2807 /// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
2808 /// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
2809 /// The sign is determined by sc which can take the following values:\
2810 /// _MM_MANT_SIGN_src // sign = sign(src)\
2811 /// _MM_MANT_SIGN_zero // sign = 0\
2812 /// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
2813 ///
2814 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_getmant_pd&expand=2873)
2815 #[inline]
2816 #[target_feature(enable = "avx512f")]
2817 #[cfg_attr(test, assert_instr(vgetmantpd, norm = 0, sign = 0))]
2818 #[rustc_args_required_const(2, 3)]
2819 pub unsafe fn _mm512_maskz_getmant_pd(
2820 k: __mmask8,
2821 a: __m512d,
2822 norm: _MM_MANTISSA_NORM_ENUM,
2823 sign: _MM_MANTISSA_SIGN_ENUM,
2824 ) -> __m512d {
2825 macro_rules! call {
2826 ($imm4:expr, $imm2:expr) => {
2827 vgetmantpd(
2828 a.as_f64x8(),
2829 $imm2 << 2 | $imm4,
2830 _mm512_setzero_pd().as_f64x8(),
2831 k,
2832 _MM_FROUND_CUR_DIRECTION,
2833 )
2834 };
2835 }
2836 let r = constify_imm4_mantissas!(norm, sign, call);
2837 transmute(r)
2838 }
2839
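// Illustrative usage sketch: normalizing the mantissa to [1, 2) while keeping
// the source sign. The literal arguments 0, 0 stand for the _MM_MANT_NORM_1_2 /
// _MM_MANT_SIGN_src settings described above and assume the enum parameters
// are plain integer aliases. `getmant_demo` is a hypothetical helper for
// illustration only; AVX512F support at runtime is assumed.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn getmant_demo() {
    let a = _mm512_set1_ps(12.0); // 12 = 1.5 * 2^3
    let lanes: [f32; 16] = transmute(_mm512_getmant_ps(a, 0, 0));
    assert_eq!(lanes[0], 1.5);
}
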
2840 /// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.\
2841 ///
2842 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
2843 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
2844 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
2845 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
2846 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
2847 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
2848 ///
2849 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_add_round_ps&expand=145)
2850 #[inline]
2851 #[target_feature(enable = "avx512f")]
2852 #[cfg_attr(test, assert_instr(vaddps, rounding = 8))]
2853 #[rustc_args_required_const(2)]
2854 pub unsafe fn _mm512_add_round_ps(a: __m512, b: __m512, rounding: i32) -> __m512 {
2855 let a = a.as_f32x16();
2856 let b = b.as_f32x16();
2857 macro_rules! call {
2858 ($imm4:expr) => {
2859 vaddps(a, b, $imm4)
2860 };
2861 }
2862 let r = constify_imm4_round!(rounding, call);
2863 transmute(r)
2864 }
2865
2866 /// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
2867 ///
2868 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
2869 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
2870 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
2871 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
2872 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
2873 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
2874 ///
2875 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_add_round_ps&expand=146)
2876 #[inline]
2877 #[target_feature(enable = "avx512f")]
2878 #[cfg_attr(test, assert_instr(vaddps, rounding = 8))]
2879 #[rustc_args_required_const(4)]
2880 pub unsafe fn _mm512_mask_add_round_ps(
2881 src: __m512,
2882 k: __mmask16,
2883 a: __m512,
2884 b: __m512,
2885 rounding: i32,
2886 ) -> __m512 {
2887 let a = a.as_f32x16();
2888 let b = b.as_f32x16();
2889 macro_rules! call {
2890 ($imm4:expr) => {
2891 vaddps(a, b, $imm4)
2892 };
2893 }
2894 let addround = constify_imm4_round!(rounding, call);
2895 transmute(simd_select_bitmask(k, addround, src.as_f32x16()))
2896 }
2897
2898 /// Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
2899 ///
2900 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
2901 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
2902 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
2903 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
2904 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
2905 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
2906 ///
2907 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_add_round_ps&expand=147)
2908 #[inline]
2909 #[target_feature(enable = "avx512f")]
2910 #[cfg_attr(test, assert_instr(vaddps, rounding = 8))]
2911 #[rustc_args_required_const(3)]
2912 pub unsafe fn _mm512_maskz_add_round_ps(
2913 k: __mmask16,
2914 a: __m512,
2915 b: __m512,
2916 rounding: i32,
2917 ) -> __m512 {
2918 let a = a.as_f32x16();
2919 let b = b.as_f32x16();
2920 macro_rules! call {
2921 ($imm4:expr) => {
2922 vaddps(a, b, $imm4)
2923 };
2924 }
2925 let addround = constify_imm4_round!(rounding, call);
2926 let zero = _mm512_setzero_ps().as_f32x16();
2927 transmute(simd_select_bitmask(k, addround, zero))
2928 }
2929
2930 /// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.\
2931 ///
2932 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
2933 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
2934 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
2935 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
2936 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
2937 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
2938 ///
2939 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_add_round_pd&expand=142)
2940 #[inline]
2941 #[target_feature(enable = "avx512f")]
2942 #[cfg_attr(test, assert_instr(vaddpd, rounding = 8))]
2943 #[rustc_args_required_const(2)]
2944 pub unsafe fn _mm512_add_round_pd(a: __m512d, b: __m512d, rounding: i32) -> __m512d {
2945 let a = a.as_f64x8();
2946 let b = b.as_f64x8();
2947 macro_rules! call {
2948 ($imm4:expr) => {
2949 vaddpd(a, b, $imm4)
2950 };
2951 }
2952 let r = constify_imm4_round!(rounding, call);
2953 transmute(r)
2954 }
2955
2956 /// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
2957 ///
2958 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
2959 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
2960 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
2961 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
2962 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
2963 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
2964 ///
2965 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_add_round_pd&expand=143)
2966 #[inline]
2967 #[target_feature(enable = "avx512f")]
2968 #[cfg_attr(test, assert_instr(vaddpd, rounding = 8))]
2969 #[rustc_args_required_const(4)]
2970 pub unsafe fn _mm512_mask_add_round_pd(
2971 src: __m512d,
2972 k: __mmask8,
2973 a: __m512d,
2974 b: __m512d,
2975 rounding: i32,
2976 ) -> __m512d {
2977 let a = a.as_f64x8();
2978 let b = b.as_f64x8();
2979 macro_rules! call {
2980 ($imm4:expr) => {
2981 vaddpd(a, b, $imm4)
2982 };
2983 }
2984 let addround = constify_imm4_round!(rounding, call);
2985 transmute(simd_select_bitmask(k, addround, src.as_f64x8()))
2986 }
2987
2988 /// Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
2989 ///
2990 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
2991 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
2992 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
2993 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
2994 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
2995 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
2996 ///
2997 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_add_round_pd&expand=144)
2998 #[inline]
2999 #[target_feature(enable = "avx512f")]
3000 #[cfg_attr(test, assert_instr(vaddpd, rounding = 8))]
3001 #[rustc_args_required_const(3)]
3002 pub unsafe fn _mm512_maskz_add_round_pd(
3003 k: __mmask8,
3004 a: __m512d,
3005 b: __m512d,
3006 rounding: i32,
3007 ) -> __m512d {
3008 let a = a.as_f64x8();
3009 let b = b.as_f64x8();
3010 macro_rules! call {
3011 ($imm4:expr) => {
3012 vaddpd(a, b, $imm4)
3013 };
3014 }
3015 let addround = constify_imm4_round!(rounding, call);
3016 let zero = _mm512_setzero_pd().as_f64x8();
3017 transmute(simd_select_bitmask(k, addround, zero))
3018 }
3019
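// Illustrative usage sketch: the rounding argument must be a compile-time
// constant built from the _MM_FROUND_* flags listed above. `add_round_demo` is
// a hypothetical helper for illustration only; AVX512F support at runtime is
// assumed.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn add_round_demo() {
    let a = _mm512_set1_ps(1.5);
    let b = _mm512_set1_ps(2.25);
    let r = _mm512_add_round_ps(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    let lanes: [f32; 16] = transmute(r);
    assert_eq!(lanes[0], 3.75); // the sum is exact, so every rounding mode agrees
}
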
3020 /// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst.\
3021 ///
3022 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
3023 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
3024 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
3025 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
3026 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
3027 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
3028 ///
3029 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_sub_round_ps&expand=5739)
3030 #[inline]
3031 #[target_feature(enable = "avx512f")]
3032 #[cfg_attr(test, assert_instr(vsubps, rounding = 8))]
3033 #[rustc_args_required_const(2)]
3034 pub unsafe fn _mm512_sub_round_ps(a: __m512, b: __m512, rounding: i32) -> __m512 {
3035 let a = a.as_f32x16();
3036 let b = b.as_f32x16();
3037 macro_rules! call {
3038 ($imm4:expr) => {
3039 vsubps(a, b, $imm4)
3040 };
3041 }
3042 let r = constify_imm4_round!(rounding, call);
3043 transmute(r)
3044 }
3045
3046 /// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
3047 ///
3048 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
3049 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
3050 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
3051 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
3052 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
3053 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
3054 ///
3055 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_sub_round_ps&expand=5737)
3056 #[inline]
3057 #[target_feature(enable = "avx512f")]
3058 #[cfg_attr(test, assert_instr(vsubps, rounding = 8))]
3059 #[rustc_args_required_const(4)]
3060 pub unsafe fn _mm512_mask_sub_round_ps(
3061 src: __m512,
3062 k: __mmask16,
3063 a: __m512,
3064 b: __m512,
3065 rounding: i32,
3066 ) -> __m512 {
3067 let a = a.as_f32x16();
3068 let b = b.as_f32x16();
3069 macro_rules! call {
3070 ($imm4:expr) => {
3071 vsubps(a, b, $imm4)
3072 };
3073 }
3074 let subround = constify_imm4_round!(rounding, call);
3075 transmute(simd_select_bitmask(k, subround, src.as_f32x16()))
3076 }
3077
3078 /// Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
3079 ///
3080 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
3081 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
3082 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
3083 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
3084 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
3085 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
3086 ///
3087 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_sub_round_ps&expand=5738)
3088 #[inline]
3089 #[target_feature(enable = "avx512f")]
3090 #[cfg_attr(test, assert_instr(vsubps, rounding = 8))]
3091 #[rustc_args_required_const(3)]
3092 pub unsafe fn _mm512_maskz_sub_round_ps(
3093 k: __mmask16,
3094 a: __m512,
3095 b: __m512,
3096 rounding: i32,
3097 ) -> __m512 {
3098 let a = a.as_f32x16();
3099 let b = b.as_f32x16();
3100 macro_rules! call {
3101 ($imm4:expr) => {
3102 vsubps(a, b, $imm4)
3103 };
3104 }
3105 let subround = constify_imm4_round!(rounding, call);
3106 let zero = _mm512_setzero_ps().as_f32x16();
3107 transmute(simd_select_bitmask(k, subround, zero))
3108 }
3109
3110 /// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst.\
3111 ///
3112 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
3113 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
3114 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
3115 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
3116 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
3117 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
3118 ///
3119 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_sub_round_pd&expand=5736)
3120 #[inline]
3121 #[target_feature(enable = "avx512f")]
3122 #[cfg_attr(test, assert_instr(vsubpd, rounding = 8))]
3123 #[rustc_args_required_const(2)]
3124 pub unsafe fn _mm512_sub_round_pd(a: __m512d, b: __m512d, rounding: i32) -> __m512d {
3125 let a = a.as_f64x8();
3126 let b = b.as_f64x8();
3127 macro_rules! call {
3128 ($imm4:expr) => {
3129 vsubpd(a, b, $imm4)
3130 };
3131 }
3132 let r = constify_imm4_round!(rounding, call);
3133 transmute(r)
3134 }
3135
3136 /// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
3137 ///
3138 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
3139 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
3140 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
3141 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
3142 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
3143 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
3144 ///
3145 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_sub_round_pd&expand=5734)
3146 #[inline]
3147 #[target_feature(enable = "avx512f")]
3148 #[cfg_attr(test, assert_instr(vsubpd, rounding = 8))]
3149 #[rustc_args_required_const(4)]
3150 pub unsafe fn _mm512_mask_sub_round_pd(
3151 src: __m512d,
3152 k: __mmask8,
3153 a: __m512d,
3154 b: __m512d,
3155 rounding: i32,
3156 ) -> __m512d {
3157 let a = a.as_f64x8();
3158 let b = b.as_f64x8();
3159 macro_rules! call {
3160 ($imm4:expr) => {
3161 vsubpd(a, b, $imm4)
3162 };
3163 }
3164 let subround = constify_imm4_round!(rounding, call);
3165 transmute(simd_select_bitmask(k, subround, src.as_f64x8()))
3166 }
3167
3168 /// Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
3169 ///
3170 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
3171 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
3172 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
3173 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
3174 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
3175 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
3176 ///
3177 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_sub_round_pd&expand=5735)
3178 #[inline]
3179 #[target_feature(enable = "avx512f")]
3180 #[cfg_attr(test, assert_instr(vsubpd, rounding = 8))]
3181 #[rustc_args_required_const(3)]
3182 pub unsafe fn _mm512_maskz_sub_round_pd(
3183 k: __mmask8,
3184 a: __m512d,
3185 b: __m512d,
3186 rounding: i32,
3187 ) -> __m512d {
3188 let a = a.as_f64x8();
3189 let b = b.as_f64x8();
3190 macro_rules! call {
3191 ($imm4:expr) => {
3192 vsubpd(a, b, $imm4)
3193 };
3194 }
3195 let subround = constify_imm4_round!(rounding, call);
3196 let zero = _mm512_setzero_pd().as_f64x8();
3197 transmute(simd_select_bitmask(k, subround, zero))
3198 }
3199
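// Illustrative usage sketch: writemask semantics combined with explicit
// rounding; lanes whose mask bit is clear keep the value from src.
// `sub_round_demo` is a hypothetical helper for illustration only; AVX512F
// support at runtime is assumed.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn sub_round_demo() {
    let src = _mm512_set1_ps(100.0);
    let a = _mm512_set1_ps(5.0);
    let b = _mm512_set1_ps(1.5);
    let r = _mm512_mask_sub_round_ps(
        src,
        0b00000000_00000001, // only lane 0 is computed
        a,
        b,
        _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
    );
    let lanes: [f32; 16] = transmute(r);
    assert_eq!(lanes[0], 3.5);
    assert_eq!(lanes[1], 100.0); // copied from src
}
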
3200 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.\
3201 ///
3202 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
3203 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
3204 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
3205 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
3206 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
3207 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
3208 ///
3209 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mul_round_ps&expand=3940)
3210 #[inline]
3211 #[target_feature(enable = "avx512f")]
3212 #[cfg_attr(test, assert_instr(vmulps, rounding = 8))]
3213 #[rustc_args_required_const(2)]
3214 pub unsafe fn _mm512_mul_round_ps(a: __m512, b: __m512, rounding: i32) -> __m512 {
3215 let a = a.as_f32x16();
3216 let b = b.as_f32x16();
3217 macro_rules! call {
3218 ($imm4:expr) => {
3219 vmulps(a, b, $imm4)
3220 };
3221 }
3222 let r = constify_imm4_round!(rounding, call);
3223 transmute(r)
3224 }
3225
3226 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
3227 ///
3228 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
3229 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
3230 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
3231 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
3232 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
3233 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
3234 ///
3235 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_mul_round_ps&expand=3938)
3236 #[inline]
3237 #[target_feature(enable = "avx512f")]
3238 #[cfg_attr(test, assert_instr(vmulps, rounding = 8))]
3239 #[rustc_args_required_const(4)]
3240 pub unsafe fn _mm512_mask_mul_round_ps(
3241 src: __m512,
3242 k: __mmask16,
3243 a: __m512,
3244 b: __m512,
3245 rounding: i32,
3246 ) -> __m512 {
3247 let a = a.as_f32x16();
3248 let b = b.as_f32x16();
3249 macro_rules! call {
3250 ($imm4:expr) => {
3251 vmulps(a, b, $imm4)
3252 };
3253 }
3254 let mulround = constify_imm4_round!(rounding, call);
3255 transmute(simd_select_bitmask(k, mulround, src.as_f32x16()))
3256 }
3257
3258 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
3259 ///
3260 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
3261 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
3262 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
3263 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
3264 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
3265 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
3266 ///
3267 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_mul_round_ps&expand=3939)
3268 #[inline]
3269 #[target_feature(enable = "avx512f")]
3270 #[cfg_attr(test, assert_instr(vmulps, rounding = 8))]
3271 #[rustc_args_required_const(3)]
3272 pub unsafe fn _mm512_maskz_mul_round_ps(
3273 k: __mmask16,
3274 a: __m512,
3275 b: __m512,
3276 rounding: i32,
3277 ) -> __m512 {
3278 let a = a.as_f32x16();
3279 let b = b.as_f32x16();
3280 macro_rules! call {
3281 ($imm4:expr) => {
3282 vmulps(a, b, $imm4)
3283 };
3284 }
3285 let mulround = constify_imm4_round!(rounding, call);
3286 let zero = _mm512_setzero_ps().as_f32x16();
3287 transmute(simd_select_bitmask(k, mulround, zero))
3288 }
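// Illustrative sketch, not part of upstream stdarch: a product rounded toward zero,
// so each lane of the result is truncated rather than rounded to nearest. The helper
// name is invented for the example.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_mul_round_ps_truncate(a: __m512, b: __m512) -> __m512 {
    _mm512_mul_round_ps(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)
}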
3289
3290 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.\
3291 ///
3292 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
3293 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
3294 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
3295 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
3296 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
3297 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
3298 ///
3299 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mul_round_pd&expand=3937)
3300 #[inline]
3301 #[target_feature(enable = "avx512f")]
3302 #[cfg_attr(test, assert_instr(vmulpd, rounding = 8))]
3303 #[rustc_args_required_const(2)]
3304 pub unsafe fn _mm512_mul_round_pd(a: __m512d, b: __m512d, rounding: i32) -> __m512d {
3305 let a = a.as_f64x8();
3306 let b = b.as_f64x8();
3307 macro_rules! call {
3308 ($imm4:expr) => {
3309 vmulpd(a, b, $imm4)
3310 };
3311 }
3312 let r = constify_imm4_round!(rounding, call);
3313 transmute(r)
3314 }
3315
3316 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
3317 ///
3318 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
3319 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
3320 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
3321 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
3322 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
3323 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
3324 ///
3325 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_mul_round_pd&expand=3935)
3326 #[inline]
3327 #[target_feature(enable = "avx512f")]
3328 #[cfg_attr(test, assert_instr(vmulpd, rounding = 8))]
3329 #[rustc_args_required_const(4)]
3330 pub unsafe fn _mm512_mask_mul_round_pd(
3331 src: __m512d,
3332 k: __mmask8,
3333 a: __m512d,
3334 b: __m512d,
3335 rounding: i32,
3336 ) -> __m512d {
3337 let a = a.as_f64x8();
3338 let b = b.as_f64x8();
3339 macro_rules! call {
3340 ($imm4:expr) => {
3341 vmulpd(a, b, $imm4)
3342 };
3343 }
3344 let mulround = constify_imm4_round!(rounding, call);
3345 transmute(simd_select_bitmask(k, mulround, src.as_f64x8()))
3346 }
3347
3348 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
3349 ///
3350 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
3351 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
3352 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
3353 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
3354 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
3355 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
3356 ///
3357 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_mul_round_pd&expand=3939)
3358 #[inline]
3359 #[target_feature(enable = "avx512f")]
3360 #[cfg_attr(test, assert_instr(vmulpd, rounding = 8))]
3361 #[rustc_args_required_const(3)]
3362 pub unsafe fn _mm512_maskz_mul_round_pd(
3363 k: __mmask8,
3364 a: __m512d,
3365 b: __m512d,
3366 rounding: i32,
3367 ) -> __m512d {
3368 let a = a.as_f64x8();
3369 let b = b.as_f64x8();
3370 macro_rules! call {
3371 ($imm4:expr) => {
3372 vmulpd(a, b, $imm4)
3373 };
3374 }
3375 let mulround = constify_imm4_round!(rounding, call);
3376 let zero = _mm512_setzero_pd().as_f64x8();
3377 transmute(simd_select_bitmask(k, mulround, zero))
3378 }
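// Illustrative sketch, not part of upstream stdarch: passing _MM_FROUND_CUR_DIRECTION
// defers to the rounding mode currently selected by MXCSR.RC instead of encoding one
// in the instruction. The helper name is made up for this example.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_mul_round_pd_mxcsr(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
    // Masked-off lanes are zeroed; kept lanes use whatever MXCSR currently selects.
    _mm512_maskz_mul_round_pd(k, a, b, _MM_FROUND_CUR_DIRECTION)
}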
3379
3380 /// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst.\
3381 ///
3382 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
3383 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
3384 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
3385 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
3386 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
3387 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
3388 ///
3389 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_div_round_ps&expand=2168)
3390 #[inline]
3391 #[target_feature(enable = "avx512f")]
3392 #[cfg_attr(test, assert_instr(vdivps, rounding = 8))]
3393 #[rustc_args_required_const(2)]
3394 pub unsafe fn _mm512_div_round_ps(a: __m512, b: __m512, rounding: i32) -> __m512 {
3395 let a = a.as_f32x16();
3396 let b = b.as_f32x16();
3397 macro_rules! call {
3398 ($imm4:expr) => {
3399 vdivps(a, b, $imm4)
3400 };
3401 }
3402 let r = constify_imm4_round!(rounding, call);
3403 transmute(r)
3404 }
3405
3406 /// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
3407 ///
3408 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
3409 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
3410 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
3411 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
3412 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
3413 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
3414 ///
3415 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_div_round_ps&expand=2169)
3416 #[inline]
3417 #[target_feature(enable = "avx512f")]
3418 #[cfg_attr(test, assert_instr(vdivps, rounding = 8))]
3419 #[rustc_args_required_const(4)]
3420 pub unsafe fn _mm512_mask_div_round_ps(
3421 src: __m512,
3422 k: __mmask16,
3423 a: __m512,
3424 b: __m512,
3425 rounding: i32,
3426 ) -> __m512 {
3427 let a = a.as_f32x16();
3428 let b = b.as_f32x16();
3429 macro_rules! call {
3430 ($imm4:expr) => {
3431 vdivps(a, b, $imm4)
3432 };
3433 }
3434 let divround = constify_imm4_round!(rounding, call);
3435 transmute(simd_select_bitmask(k, divround, src.as_f32x16()))
3436 }
3437
3438 /// Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
3439 ///
3440 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
3441 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
3442 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
3443 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
3444 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
3445 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
3446 ///
3447 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_div_round_ps&expand=2170)
3448 #[inline]
3449 #[target_feature(enable = "avx512f")]
3450 #[cfg_attr(test, assert_instr(vdivps, rounding = 8))]
3451 #[rustc_args_required_const(3)]
3452 pub unsafe fn _mm512_maskz_div_round_ps(
3453 k: __mmask16,
3454 a: __m512,
3455 b: __m512,
3456 rounding: i32,
3457 ) -> __m512 {
3458 let a = a.as_f32x16();
3459 let b = b.as_f32x16();
3460 macro_rules! call {
3461 ($imm4:expr) => {
3462 vdivps(a, b, $imm4)
3463 };
3464 }
3465 let divround = constify_imm4_round!(rounding, call);
3466 let zero = _mm512_setzero_ps().as_f32x16();
3467 transmute(simd_select_bitmask(k, divround, zero))
3468 }
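// Illustrative sketch, not part of upstream stdarch: computing the same quotient once
// rounded down and once rounded up brackets the exact result in every lane, a common
// interval-arithmetic trick. The helper name is invented for the example.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_div_round_ps_bracket(a: __m512, b: __m512) -> (__m512, __m512) {
    let lower = _mm512_div_round_ps(a, b, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
    let upper = _mm512_div_round_ps(a, b, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
    (lower, upper)
}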
3469
3470 /// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst.\
3471 ///
3472 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
3473 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
3474 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
3475 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
3476 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
3477 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
3478 ///
3479 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_div_round_pd&expand=2165)
3480 #[inline]
3481 #[target_feature(enable = "avx512f")]
3482 #[cfg_attr(test, assert_instr(vdivpd, rounding = 8))]
3483 #[rustc_args_required_const(2)]
3484 pub unsafe fn _mm512_div_round_pd(a: __m512d, b: __m512d, rounding: i32) -> __m512d {
3485 let a = a.as_f64x8();
3486 let b = b.as_f64x8();
3487 macro_rules! call {
3488 ($imm4:expr) => {
3489 vdivpd(a, b, $imm4)
3490 };
3491 }
3492 let r = constify_imm4_round!(rounding, call);
3493 transmute(r)
3494 }
3495
3496 /// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
3497 ///
3498 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
3499 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
3500 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
3501 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
3502 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
3503 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
3504 ///
3505 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_div_round_pd&expand=2166)
3506 #[inline]
3507 #[target_feature(enable = "avx512f")]
3508 #[cfg_attr(test, assert_instr(vdivpd, rounding = 8))]
3509 #[rustc_args_required_const(4)]
3510 pub unsafe fn _mm512_mask_div_round_pd(
3511 src: __m512d,
3512 k: __mmask8,
3513 a: __m512d,
3514 b: __m512d,
3515 rounding: i32,
3516 ) -> __m512d {
3517 let a = a.as_f64x8();
3518 let b = b.as_f64x8();
3519 macro_rules! call {
3520 ($imm4:expr) => {
3521 vdivpd(a, b, $imm4)
3522 };
3523 }
3524 let divround = constify_imm4_round!(rounding, call);
3525 transmute(simd_select_bitmask(k, divround, src.as_f64x8()))
3526 }
3527
3528 /// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
3529 ///
3530 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
3531 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
3532 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
3533 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
3534 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
3535 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
3536 ///
3537 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_div_round_pd&expand=2167)
3538 #[inline]
3539 #[target_feature(enable = "avx512f")]
3540 #[cfg_attr(test, assert_instr(vdivpd, rounding = 8))]
3541 #[rustc_args_required_const(3)]
3542 pub unsafe fn _mm512_maskz_div_round_pd(
3543 k: __mmask8,
3544 a: __m512d,
3545 b: __m512d,
3546 rounding: i32,
3547 ) -> __m512d {
3548 let a = a.as_f64x8();
3549 let b = b.as_f64x8();
3550 macro_rules! call {
3551 ($imm4:expr) => {
3552 vdivpd(a, b, $imm4)
3553 };
3554 }
3555 let divround = constify_imm4_round!(rounding, call);
3556 let zero = _mm512_setzero_pd().as_f64x8();
3557 transmute(simd_select_bitmask(k, divround, zero))
3558 }
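// Illustrative sketch, not part of upstream stdarch: merges a rounded-to-nearest
// quotient into `src` lane by lane under a caller-supplied mask, leaving the other
// lanes of `src` untouched.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_div_round_pd_merge(
    src: __m512d,
    k: __mmask8,
    a: __m512d,
    b: __m512d,
) -> __m512d {
    _mm512_mask_div_round_pd(src, k, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
}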
3559
3560 /// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.\
3561 ///
3562 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
3563 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
3564 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
3565 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
3566 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
3567 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
3568 ///
3569 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_sqrt_round_ps&expand=5377)
3570 #[inline]
3571 #[target_feature(enable = "avx512f")]
3572 #[cfg_attr(test, assert_instr(vsqrtps, rounding = 8))]
3573 #[rustc_args_required_const(1)]
3574 pub unsafe fn _mm512_sqrt_round_ps(a: __m512, rounding: i32) -> __m512 {
3575 let a = a.as_f32x16();
3576 macro_rules! call {
3577 ($imm4:expr) => {
3578 vsqrtps(a, $imm4)
3579 };
3580 }
3581 let r = constify_imm4_round!(rounding, call);
3582 transmute(r)
3583 }
3584
3585 /// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
3586 ///
3587 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
3588 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
3589 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
3590 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
3591 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
3592 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
3593 ///
3594 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_sqrt_round_ps&expand=5375)
3595 #[inline]
3596 #[target_feature(enable = "avx512f")]
3597 #[cfg_attr(test, assert_instr(vsqrtps, rounding = 8))]
3598 #[rustc_args_required_const(3)]
3599 pub unsafe fn _mm512_mask_sqrt_round_ps(
3600 src: __m512,
3601 k: __mmask16,
3602 a: __m512,
3603 rounding: i32,
3604 ) -> __m512 {
3605 let a = a.as_f32x16();
3606 macro_rules! call {
3607 ($imm4:expr) => {
3608 vsqrtps(a, $imm4)
3609 };
3610 }
3611 let sqrtround = constify_imm4_round!(rounding, call);
3612 transmute(simd_select_bitmask(k, sqrtround, src.as_f32x16()))
3613 }
3614
3615 /// Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
3616 ///
3617 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
3618 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
3619 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
3620 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
3621 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
3622 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
3623 ///
3624 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_sqrt_round_ps&expand=5376)
3625 #[inline]
3626 #[target_feature(enable = "avx512f")]
3627 #[cfg_attr(test, assert_instr(vsqrtps, rounding = 8))]
3628 #[rustc_args_required_const(2)]
3629 pub unsafe fn _mm512_maskz_sqrt_round_ps(k: __mmask16, a: __m512, rounding: i32) -> __m512 {
3630 let a = a.as_f32x16();
3631 macro_rules! call {
3632 ($imm4:expr) => {
3633 vsqrtps(a, $imm4)
3634 };
3635 }
3636 let sqrtround = constify_imm4_round!(rounding, call);
3637 let zero = _mm512_setzero_ps().as_f32x16();
3638 transmute(simd_select_bitmask(k, sqrtround, zero))
3639 }
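// Illustrative sketch, not part of upstream stdarch: takes the square root of the
// even-indexed lanes only; odd-indexed lanes are zeroed by the zeromask form. The
// mask constant is an arbitrary choice for the example.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_sqrt_round_ps_even_lanes(a: __m512) -> __m512 {
    _mm512_maskz_sqrt_round_ps(
        0b0101_0101_0101_0101,
        a,
        _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
    )
}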
3640
3641 /// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.\
3642 ///
3643 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
3644 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
3645 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
3646 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
3647 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
3648 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
3649 ///
3650 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_sqrt_round_pd&expand=5374)
3651 #[inline]
3652 #[target_feature(enable = "avx512f")]
3653 #[cfg_attr(test, assert_instr(vsqrtpd, rounding = 8))]
3654 #[rustc_args_required_const(1)]
3655 pub unsafe fn _mm512_sqrt_round_pd(a: __m512d, rounding: i32) -> __m512d {
3656 let a = a.as_f64x8();
3657 macro_rules! call {
3658 ($imm4:expr) => {
3659 vsqrtpd(a, $imm4)
3660 };
3661 }
3662 let r = constify_imm4_round!(rounding, call);
3663 transmute(r)
3664 }
3665
3666 /// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
3667 ///
3668 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
3669 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
3670 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
3671 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
3672 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
3673 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
3674 ///
3675 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_sqrt_round_pd&expand=5372)
3676 #[inline]
3677 #[target_feature(enable = "avx512f")]
3678 #[cfg_attr(test, assert_instr(vsqrtpd, rounding = 8))]
3679 #[rustc_args_required_const(3)]
3680 pub unsafe fn _mm512_mask_sqrt_round_pd(
3681 src: __m512d,
3682 k: __mmask8,
3683 a: __m512d,
3684 rounding: i32,
3685 ) -> __m512d {
3686 macro_rules! call {
3687 ($imm4:expr) => {
3688 vsqrtpd(a.as_f64x8(), $imm4)
3689 };
3690 }
3691 let sqrtround = constify_imm4_round!(rounding, call);
3692 transmute(simd_select_bitmask(k, sqrtround, src.as_f64x8()))
3693 }
3694
3695 /// Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
3696 ///
3697 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
3698 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
3699 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
3700 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
3701 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
3702 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
3703 ///
3704 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_sqrt_round_pd&expand=5373)
3705 #[inline]
3706 #[target_feature(enable = "avx512f")]
3707 #[cfg_attr(test, assert_instr(vsqrtpd, rounding = 8))]
3708 #[rustc_args_required_const(2)]
3709 pub unsafe fn _mm512_maskz_sqrt_round_pd(k: __mmask8, a: __m512d, rounding: i32) -> __m512d {
3710 macro_rules! call {
3711 ($imm4:expr) => {
3712 vsqrtpd(a.as_f64x8(), $imm4)
3713 };
3714 }
3715 let sqrtround = constify_imm4_round!(rounding, call);
3716 let zero = _mm512_setzero_pd().as_f64x8();
3717 transmute(simd_select_bitmask(k, sqrtround, zero))
3718 }
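// Illustrative sketch, not part of upstream stdarch: writes the square root (rounded
// per the current MXCSR mode) into lanes whose mask bit is set and keeps the original
// `src` value everywhere else.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_sqrt_round_pd_merge(src: __m512d, k: __mmask8, a: __m512d) -> __m512d {
    _mm512_mask_sqrt_round_pd(src, k, a, _MM_FROUND_CUR_DIRECTION)
}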
3719
3720 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst.\
3721 ///
3722 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
3723 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
3724 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
3725 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
3726 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
3727 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
3728 ///
3729 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_fmadd_round_ps&expand=2565)
3730 #[inline]
3731 #[target_feature(enable = "avx512f")]
3732 #[cfg_attr(test, assert_instr(vfmadd, rounding = 8))] //vfmadd132ps or vfmadd213ps or vfmadd231ps
3733 #[rustc_args_required_const(3)]
3734 pub unsafe fn _mm512_fmadd_round_ps(a: __m512, b: __m512, c: __m512, rounding: i32) -> __m512 {
3735 macro_rules! call {
3736 ($imm4:expr) => {
3737 vfmadd132ps(a.as_f32x16(), b.as_f32x16(), c.as_f32x16(), $imm4)
3738 };
3739 }
3740 let r = constify_imm4_round!(rounding, call);
3741 transmute(r)
3742 }
3743
3744 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\
3745 ///
3746 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
3747 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
3748 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
3749 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
3750 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
3751 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
3752 ///
3753 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_fmadd_round_ps&expand=2566)
3754 #[inline]
3755 #[target_feature(enable = "avx512f")]
3756 #[cfg_attr(test, assert_instr(vfmadd, rounding = 8))] //vfmadd132ps or vfmadd213ps or vfmadd231ps
3757 #[rustc_args_required_const(4)]
3758 pub unsafe fn _mm512_mask_fmadd_round_ps(
3759 a: __m512,
3760 k: __mmask16,
3761 b: __m512,
3762 c: __m512,
3763 rounding: i32,
3764 ) -> __m512 {
3765 macro_rules! call {
3766 ($imm4:expr) => {
3767 vfmadd132ps(a.as_f32x16(), b.as_f32x16(), c.as_f32x16(), $imm4)
3768 };
3769 }
3770 let fmadd = constify_imm4_round!(rounding, call);
3771 transmute(simd_select_bitmask(k, fmadd, a.as_f32x16()))
3772 }
3773
3774 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
3775 ///
3776 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
3777 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
3778 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
3779 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
3780 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
3781 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
3782 ///
3783 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_fmadd_round_ps&expand=2568)
3784 #[inline]
3785 #[target_feature(enable = "avx512f")]
3786 #[cfg_attr(test, assert_instr(vfmadd, rounding = 8))] //vfmadd132ps or vfmadd213ps or vfmadd231ps
3787 #[rustc_args_required_const(4)]
3788 pub unsafe fn _mm512_maskz_fmadd_round_ps(
3789 k: __mmask16,
3790 a: __m512,
3791 b: __m512,
3792 c: __m512,
3793 rounding: i32,
3794 ) -> __m512 {
3795 macro_rules! call {
3796 ($imm4:expr) => {
3797 vfmadd132ps(a.as_f32x16(), b.as_f32x16(), c.as_f32x16(), $imm4)
3798 };
3799 }
3800 let fmadd = constify_imm4_round!(rounding, call);
3801 let zero = _mm512_setzero_ps().as_f32x16();
3802 transmute(simd_select_bitmask(k, fmadd, zero))
3803 }
3804
3805 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\
3806 ///
3807 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
3808 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
3809 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
3810 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
3811 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
3812 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
3813 ///
3814 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask3_fmadd_round_ps&expand=2567)
3815 #[inline]
3816 #[target_feature(enable = "avx512f")]
3817 #[cfg_attr(test, assert_instr(vfmadd, rounding = 8))] //vfmadd132ps or vfmadd213ps or vfmadd231ps
3818 #[rustc_args_required_const(4)]
3819 pub unsafe fn _mm512_mask3_fmadd_round_ps(
3820 a: __m512,
3821 b: __m512,
3822 c: __m512,
3823 k: __mmask16,
3824 rounding: i32,
3825 ) -> __m512 {
3826 macro_rules! call {
3827 ($imm4:expr) => {
3828 vfmadd132ps(a.as_f32x16(), b.as_f32x16(), c.as_f32x16(), $imm4)
3829 };
3830 }
3831 let fmadd = constify_imm4_round!(rounding, call);
3832 transmute(simd_select_bitmask(k, fmadd, c.as_f32x16()))
3833 }
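// Illustrative sketch, not part of upstream stdarch: the three masked fmadd forms
// above differ only in where masked-off lanes come from, as shown side by side here.
// The helper name is made up for this example.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_fmadd_round_ps_forms(
    a: __m512,
    b: __m512,
    c: __m512,
    k: __mmask16,
) -> (__m512, __m512, __m512) {
    (
        // Masked-off lanes copied from `a`.
        _mm512_mask_fmadd_round_ps(a, k, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC),
        // Masked-off lanes zeroed.
        _mm512_maskz_fmadd_round_ps(k, a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC),
        // Masked-off lanes copied from `c`.
        _mm512_mask3_fmadd_round_ps(a, b, c, k, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC),
    )
}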
3834
3835 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst.\
3836 ///
3837 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
3838 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
3839 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
3840 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
3841 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
3842 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
3843 ///
3844 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_fmadd_round_pd&expand=2561)
3845 #[inline]
3846 #[target_feature(enable = "avx512f")]
3847 #[cfg_attr(test, assert_instr(vfmadd, rounding = 8))] //vfmadd132pd or vfmadd213pd or vfmadd231pd
3848 #[rustc_args_required_const(3)]
3849 pub unsafe fn _mm512_fmadd_round_pd(a: __m512d, b: __m512d, c: __m512d, rounding: i32) -> __m512d {
3850 macro_rules! call {
3851 ($imm4:expr) => {
3852 vfmadd132pd(a.as_f64x8(), b.as_f64x8(), c.as_f64x8(), $imm4)
3853 };
3854 }
3855 let r = constify_imm4_round!(rounding, call);
3856 transmute(r)
3857 }
3858
3859 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\
3860 ///
3861 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
3862 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
3863 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
3864 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
3865 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
3866 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
3867 ///
3868 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_fmadd_round_pd&expand=2562)
3869 #[inline]
3870 #[target_feature(enable = "avx512f")]
3871 #[cfg_attr(test, assert_instr(vfmadd, rounding = 8))] //vfmadd132pd or vfmadd213pd or vfmadd231pd
3872 #[rustc_args_required_const(4)]
3873 pub unsafe fn _mm512_mask_fmadd_round_pd(
3874 a: __m512d,
3875 k: __mmask8,
3876 b: __m512d,
3877 c: __m512d,
3878 rounding: i32,
3879 ) -> __m512d {
3880 macro_rules! call {
3881 ($imm4:expr) => {
3882 vfmadd132pd(a.as_f64x8(), b.as_f64x8(), c.as_f64x8(), $imm4)
3883 };
3884 }
3885 let fmadd = constify_imm4_round!(rounding, call);
3886 transmute(simd_select_bitmask(k, fmadd, a.as_f64x8()))
3887 }
3888
3889 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
3890 ///
3891 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
3892 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
3893 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
3894 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
3895 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
3896 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
3897 ///
3898 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_fmadd_round_pd&expand=2564)
3899 #[inline]
3900 #[target_feature(enable = "avx512f")]
3901 #[cfg_attr(test, assert_instr(vfmadd, rounding = 8))] //vfmadd132pd or vfmadd213pd or vfmadd231pd
3902 #[rustc_args_required_const(4)]
3903 pub unsafe fn _mm512_maskz_fmadd_round_pd(
3904 k: __mmask8,
3905 a: __m512d,
3906 b: __m512d,
3907 c: __m512d,
3908 rounding: i32,
3909 ) -> __m512d {
3910 macro_rules! call {
3911 ($imm4:expr) => {
3912 vfmadd132pd(a.as_f64x8(), b.as_f64x8(), c.as_f64x8(), $imm4)
3913 };
3914 }
3915 let fmadd = constify_imm4_round!(rounding, call);
3916 let zero = _mm512_setzero_pd().as_f64x8();
3917 transmute(simd_select_bitmask(k, fmadd, zero))
3918 }
3919
3920 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\
3921 ///
3922 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
3923 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
3924 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
3925 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
3926 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
3927 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
3928 ///
3929 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask3_fmadd_round_pd&expand=2563)
3930 #[inline]
3931 #[target_feature(enable = "avx512f")]
3932 #[cfg_attr(test, assert_instr(vfmadd, rounding = 8))] //vfmadd132pd or vfmadd213pd or vfmadd231pd
3933 #[rustc_args_required_const(4)]
3934 pub unsafe fn _mm512_mask3_fmadd_round_pd(
3935 a: __m512d,
3936 b: __m512d,
3937 c: __m512d,
3938 k: __mmask8,
3939 rounding: i32,
3940 ) -> __m512d {
3941 macro_rules! call {
3942 ($imm4:expr) => {
3943 vfmadd132pd(a.as_f64x8(), b.as_f64x8(), c.as_f64x8(), $imm4)
3944 };
3945 }
3946 let fmadd = constify_imm4_round!(rounding, call);
3947 transmute(simd_select_bitmask(k, fmadd, c.as_f64x8()))
3948 }
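// Illustrative sketch, not part of upstream stdarch: a fused a*b + c whose final
// result is rounded toward negative infinity, useful when a lower bound on the exact
// value is wanted.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_fmadd_round_pd_down(a: __m512d, b: __m512d, c: __m512d) -> __m512d {
    _mm512_fmadd_round_pd(a, b, c, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)
}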
3949
3950 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst.\
3951 ///
3952 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
3953 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
3954 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
3955 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
3956 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
3957 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
3958 ///
3959 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_fmsub_round_ps&expand=2651)
3960 #[inline]
3961 #[target_feature(enable = "avx512f")]
3962 #[cfg_attr(test, assert_instr(vfmadd, rounding = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub
3963 #[rustc_args_required_const(3)]
3964 pub unsafe fn _mm512_fmsub_round_ps(a: __m512, b: __m512, c: __m512, rounding: i32) -> __m512 {
3965 let zero: f32x16 = mem::zeroed();
3966 let sub = simd_sub(zero, c.as_f32x16());
3967 macro_rules! call {
3968 ($imm4:expr) => {
3969 vfmadd132ps(a.as_f32x16(), b.as_f32x16(), sub, $imm4)
3970 };
3971 }
3972 let r = constify_imm4_round!(rounding, call);
3973 transmute(r)
3974 }
3975
3976 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\
3977 ///
3978 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
3979 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
3980 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
3981 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
3982 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
3983 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
3984 ///
3985 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_fmsub_round_ps&expand=2652)
3986 #[inline]
3987 #[target_feature(enable = "avx512f")]
3988 #[cfg_attr(test, assert_instr(vfmadd, rounding = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub
3989 #[rustc_args_required_const(4)]
3990 pub unsafe fn _mm512_mask_fmsub_round_ps(
3991 a: __m512,
3992 k: __mmask16,
3993 b: __m512,
3994 c: __m512,
3995 rounding: i32,
3996 ) -> __m512 {
3997 let zero: f32x16 = mem::zeroed();
3998 let sub = simd_sub(zero, c.as_f32x16());
3999 macro_rules! call {
4000 ($imm4:expr) => {
4001 vfmadd132ps(a.as_f32x16(), b.as_f32x16(), sub, $imm4)
4002 };
4003 }
4004 let fmsub = constify_imm4_round!(rounding, call);
4005 transmute(simd_select_bitmask(k, fmsub, a.as_f32x16()))
4006 }
4007
4008 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
4009 ///
4010 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
4011 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
4012 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
4013 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
4014 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
4015 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
4016 ///
4017 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_fmsub_round_ps&expand=2654)
4018 #[inline]
4019 #[target_feature(enable = "avx512f")]
4020 #[cfg_attr(test, assert_instr(vfmadd, rounding = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub
4021 #[rustc_args_required_const(4)]
4022 pub unsafe fn _mm512_maskz_fmsub_round_ps(
4023 k: __mmask16,
4024 a: __m512,
4025 b: __m512,
4026 c: __m512,
4027 rounding: i32,
4028 ) -> __m512 {
4029 let zero: f32x16 = mem::zeroed();
4030 let sub = simd_sub(zero, c.as_f32x16());
4031 macro_rules! call {
4032 ($imm4:expr) => {
4033 vfmadd132ps(a.as_f32x16(), b.as_f32x16(), sub, $imm4)
4034 };
4035 }
4036 let fmsub = constify_imm4_round!(rounding, call);
4037 transmute(simd_select_bitmask(k, fmsub, zero))
4038 }
4039
4040 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\
4041 ///
4042 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
4043 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
4044 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
4045 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
4046 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
4047 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
4048 ///
4049 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask3_fmsub_round_ps&expand=2653)
4050 #[inline]
4051 #[target_feature(enable = "avx512f")]
4052 #[cfg_attr(test, assert_instr(vfmadd, rounding = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub
4053 #[rustc_args_required_const(4)]
4054 pub unsafe fn _mm512_mask3_fmsub_round_ps(
4055 a: __m512,
4056 b: __m512,
4057 c: __m512,
4058 k: __mmask16,
4059 rounding: i32,
4060 ) -> __m512 {
4061 let zero: f32x16 = mem::zeroed();
4062 let sub = simd_sub(zero, c.as_f32x16());
4063 macro_rules! call {
4064 ($imm4:expr) => {
4065 vfmadd132ps(a.as_f32x16(), b.as_f32x16(), sub, $imm4)
4066 };
4067 }
4068 let fmsub = constify_imm4_round!(rounding, call);
4069 transmute(simd_select_bitmask(k, fmsub, c.as_f32x16()))
4070 }
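// Illustrative sketch, not part of upstream stdarch: fmsub produces a*b - c with a
// single rounding of the final result, which is not in general the same as a multiply
// followed by a separately rounded subtraction.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_fmsub_round_ps(a: __m512, b: __m512, c: __m512) -> __m512 {
    _mm512_fmsub_round_ps(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
}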
4071
4072 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst.\
4073 ///
4074 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
4075 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
4076 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
4077 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
4078 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
4079 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
4080 ///
4081 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_fmsub_round_pd&expand=2647)
4082 #[inline]
4083 #[target_feature(enable = "avx512f")]
4084 #[cfg_attr(test, assert_instr(vfmadd, rounding = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub
4085 #[rustc_args_required_const(3)]
4086 pub unsafe fn _mm512_fmsub_round_pd(a: __m512d, b: __m512d, c: __m512d, rounding: i32) -> __m512d {
4087 let zero: f64x8 = mem::zeroed();
4088 let sub = simd_sub(zero, c.as_f64x8());
4089 macro_rules! call {
4090 ($imm4:expr) => {
4091 vfmadd132pd(a.as_f64x8(), b.as_f64x8(), sub, $imm4)
4092 };
4093 }
4094 let r = constify_imm4_round!(rounding, call);
4095 transmute(r)
4096 }
4097
4098 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\
4099 ///
4100 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
4101 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
4102 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
4103 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
4104 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
4105 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
4106 ///
4107 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_fmsub_round_pd&expand=2648)
4108 #[inline]
4109 #[target_feature(enable = "avx512f")]
4110 #[cfg_attr(test, assert_instr(vfmadd, rounding = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub
4111 #[rustc_args_required_const(4)]
4112 pub unsafe fn _mm512_mask_fmsub_round_pd(
4113 a: __m512d,
4114 k: __mmask8,
4115 b: __m512d,
4116 c: __m512d,
4117 rounding: i32,
4118 ) -> __m512d {
4119 let zero: f64x8 = mem::zeroed();
4120 let sub = simd_sub(zero, c.as_f64x8());
4121 macro_rules! call {
4122 ($imm4:expr) => {
4123 vfmadd132pd(a.as_f64x8(), b.as_f64x8(), sub, $imm4)
4124 };
4125 }
4126 let fmsub = constify_imm4_round!(rounding, call);
4127 transmute(simd_select_bitmask(k, fmsub, a.as_f64x8()))
4128 }
4129
4130 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
4131 ///
4132 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
4133 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
4134 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
4135 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
4136 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
4137 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
4138 ///
4139 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_fmsub_round_pd&expand=2650)
4140 #[inline]
4141 #[target_feature(enable = "avx512f")]
4142 #[cfg_attr(test, assert_instr(vfmadd, rounding = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub
4143 #[rustc_args_required_const(4)]
4144 pub unsafe fn _mm512_maskz_fmsub_round_pd(
4145 k: __mmask8,
4146 a: __m512d,
4147 b: __m512d,
4148 c: __m512d,
4149 rounding: i32,
4150 ) -> __m512d {
4151 let zero: f64x8 = mem::zeroed();
4152 let sub = simd_sub(zero, c.as_f64x8());
4153 macro_rules! call {
4154 ($imm4:expr) => {
4155 vfmadd132pd(a.as_f64x8(), b.as_f64x8(), sub, $imm4)
4156 };
4157 }
4158 let fmsub = constify_imm4_round!(rounding, call);
4159 transmute(simd_select_bitmask(k, fmsub, zero))
4160 }
4161
4162 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\
4163 ///
4164 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
4165 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
4166 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
4167 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
4168 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
4169 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
4170 ///
4171 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask3_fmsub_round_pd&expand=2649)
4172 #[inline]
4173 #[target_feature(enable = "avx512f")]
4174 #[cfg_attr(test, assert_instr(vfmadd, rounding = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub
4175 #[rustc_args_required_const(4)]
4176 pub unsafe fn _mm512_mask3_fmsub_round_pd(
4177 a: __m512d,
4178 b: __m512d,
4179 c: __m512d,
4180 k: __mmask8,
4181 rounding: i32,
4182 ) -> __m512d {
4183 let zero: f64x8 = mem::zeroed();
4184 let sub = simd_sub(zero, c.as_f64x8());
4185 macro_rules! call {
4186 ($imm4:expr) => {
4187 vfmadd132pd(a.as_f64x8(), b.as_f64x8(), sub, $imm4)
4188 };
4189 }
4190 let fmsub = constify_imm4_round!(rounding, call);
4191 transmute(simd_select_bitmask(k, fmsub, c.as_f64x8()))
4192 }
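// Illustrative sketch, not part of upstream stdarch: the mask3 form is convenient
// when the accumulator `c` should survive unchanged in masked-off lanes, e.g. when
// updating a running difference in place.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_fmsub_round_pd_accumulate(
    a: __m512d,
    b: __m512d,
    c: __m512d,
    k: __mmask8,
) -> __m512d {
    _mm512_mask3_fmsub_round_pd(a, b, c, k, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
}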
4193
4194 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst.\
4195 ///
4196 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
4197 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
4198 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
4199 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
4200 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
4201 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
4202 ///
4203 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_fmaddsub_round_ps&expand=2619)
4204 #[inline]
4205 #[target_feature(enable = "avx512f")]
4206 #[cfg_attr(test, assert_instr(vfmaddsub, rounding = 8))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps
4207 #[rustc_args_required_const(3)]
4208 pub unsafe fn _mm512_fmaddsub_round_ps(a: __m512, b: __m512, c: __m512, rounding: i32) -> __m512 {
4209 macro_rules! call {
4210 ($imm4:expr) => {
4211 vfmaddsub213ps(a.as_f32x16(), b.as_f32x16(), c.as_f32x16(), $imm4)
4212 };
4213 }
4214 let r = constify_imm4_round!(rounding, call);
4215 transmute(r)
4216 }
4217
4218 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\
4219 ///
4220 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
4221 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
4222 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
4223 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
4224 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
4225 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
4226 ///
4227 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_fmaddsub_round_ps&expand=2620)
4228 #[inline]
4229 #[target_feature(enable = "avx512f")]
4230 #[cfg_attr(test, assert_instr(vfmaddsub, rounding = 8))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps
4231 #[rustc_args_required_const(4)]
4232 pub unsafe fn _mm512_mask_fmaddsub_round_ps(
4233 a: __m512,
4234 k: __mmask16,
4235 b: __m512,
4236 c: __m512,
4237 rounding: i32,
4238 ) -> __m512 {
4239 macro_rules! call {
4240 ($imm4:expr) => {
4241 vfmaddsub213ps(a.as_f32x16(), b.as_f32x16(), c.as_f32x16(), $imm4)
4242 };
4243 }
4244 let fmaddsub = constify_imm4_round!(rounding, call);
4245 transmute(simd_select_bitmask(k, fmaddsub, a.as_f32x16()))
4246 }
4247
4248 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
4249 ///
4250 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
4251 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
4252 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
4253 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
4254 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
4255 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
4256 ///
4257 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_fmaddsub_round_ps&expand=2622)
4258 #[inline]
4259 #[target_feature(enable = "avx512f")]
4260 #[cfg_attr(test, assert_instr(vfmaddsub, rounding = 8))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps
4261 #[rustc_args_required_const(4)]
4262 pub unsafe fn _mm512_maskz_fmaddsub_round_ps(
4263 k: __mmask16,
4264 a: __m512,
4265 b: __m512,
4266 c: __m512,
4267 rounding: i32,
4268 ) -> __m512 {
4269 macro_rules! call {
4270 ($imm4:expr) => {
4271 vfmaddsub213ps(a.as_f32x16(), b.as_f32x16(), c.as_f32x16(), $imm4)
4272 };
4273 }
4274 let fmaddsub = constify_imm4_round!(rounding, call);
4275 let zero = _mm512_setzero_ps().as_f32x16();
4276 transmute(simd_select_bitmask(k, fmaddsub, zero))
4277 }
4278
4279 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\
4280 ///
4281 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
4282 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
4283 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
4284 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
4285 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
4286 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
4287 ///
4288 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask3_fmaddsub_round_ps&expand=2621)
4289 #[inline]
4290 #[target_feature(enable = "avx512f")]
4291 #[cfg_attr(test, assert_instr(vfmaddsub, rounding = 8))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps
4292 #[rustc_args_required_const(4)]
4293 pub unsafe fn _mm512_mask3_fmaddsub_round_ps(
4294 a: __m512,
4295 b: __m512,
4296 c: __m512,
4297 k: __mmask16,
4298 rounding: i32,
4299 ) -> __m512 {
4300 macro_rules! call {
4301 ($imm4:expr) => {
4302 vfmaddsub213ps(a.as_f32x16(), b.as_f32x16(), c.as_f32x16(), $imm4)
4303 };
4304 }
4305 let fmaddsub = constify_imm4_round!(rounding, call);
4306 transmute(simd_select_bitmask(k, fmaddsub, c.as_f32x16()))
4307 }
4308
4309 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst.\
4310 ///
4311 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
4312 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
4313 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
4314 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
4315 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
4316 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
4317 ///
4318 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_fmaddsub_round_pd&expand=2615)
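///
/// A minimal usage sketch (illustrative only, not from the upstream docs; assumes
/// `avx512f` is available at runtime, so it is not run as a doc-test):
///
/// ```ignore
/// unsafe {
///     let a = _mm512_set1_pd(1.0);
///     let b = _mm512_set1_pd(2.0);
///     let c = _mm512_set1_pd(3.0);
///     // Even lanes: 1.0*2.0 - 3.0 = -1.0; odd lanes: 1.0*2.0 + 3.0 = 5.0.
///     let r = _mm512_fmaddsub_round_pd(a, b, c, _MM_FROUND_CUR_DIRECTION);
/// }
/// ```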
4319 #[inline]
4320 #[target_feature(enable = "avx512f")]
4321 #[cfg_attr(test, assert_instr(vfmaddsub, rounding = 8))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd
4322 #[rustc_args_required_const(3)]
4323 pub unsafe fn _mm512_fmaddsub_round_pd(
4324 a: __m512d,
4325 b: __m512d,
4326 c: __m512d,
4327 rounding: i32,
4328 ) -> __m512d {
4329 macro_rules! call {
4330 ($imm4:expr) => {
4331 vfmaddsub213pd(a.as_f64x8(), b.as_f64x8(), c.as_f64x8(), $imm4)
4332 };
4333 }
4334 let r = constify_imm4_round!(rounding, call);
4335 transmute(r)
4336 }
4337
4338 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\
4339 ///
4340 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
4341 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
4342 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
4343 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
4344 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
4345 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
4346 ///
4347 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_fmaddsub_round_pd&expand=2616)
4348 #[inline]
4349 #[target_feature(enable = "avx512f")]
4350 #[cfg_attr(test, assert_instr(vfmaddsub, rounding = 8))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd
4351 #[rustc_args_required_const(4)]
4352 pub unsafe fn _mm512_mask_fmaddsub_round_pd(
4353 a: __m512d,
4354 k: __mmask8,
4355 b: __m512d,
4356 c: __m512d,
4357 rounding: i32,
4358 ) -> __m512d {
4359 macro_rules! call {
4360 ($imm4:expr) => {
4361 vfmaddsub213pd(a.as_f64x8(), b.as_f64x8(), c.as_f64x8(), $imm4)
4362 };
4363 }
4364 let fmaddsub = constify_imm4_round!(rounding, call);
4365 transmute(simd_select_bitmask(k, fmaddsub, a.as_f64x8()))
4366 }
4367
4368 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
4369 ///
4370 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
4371 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
4372 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
4373 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
4374 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
4375 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
4376 ///
4377 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_fmaddsub_round_pd&expand=2618)
4378 #[inline]
4379 #[target_feature(enable = "avx512f")]
4380 #[cfg_attr(test, assert_instr(vfmaddsub, rounding = 8))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd
4381 #[rustc_args_required_const(4)]
4382 pub unsafe fn _mm512_maskz_fmaddsub_round_pd(
4383 k: __mmask8,
4384 a: __m512d,
4385 b: __m512d,
4386 c: __m512d,
4387 rounding: i32,
4388 ) -> __m512d {
4389 macro_rules! call {
4390 ($imm4:expr) => {
4391 vfmaddsub213pd(a.as_f64x8(), b.as_f64x8(), c.as_f64x8(), $imm4)
4392 };
4393 }
4394 let fmaddsub = constify_imm4_round!(rounding, call);
4395 let zero = _mm512_setzero_pd().as_f64x8();
4396 transmute(simd_select_bitmask(k, fmaddsub, zero))
4397 }
4398
4399 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\
4400 ///
4401 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
4402 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
4403 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
4404 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
4405 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
4406 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
4407 ///
4408 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask3_fmaddsub_round_pd&expand=2617)
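///
/// A minimal sketch of the mask3 form (illustrative only, not from the upstream docs;
/// assumes `avx512f` at runtime, so it is not run as a doc-test):
///
/// ```ignore
/// unsafe {
///     let a = _mm512_set1_pd(1.0);
///     let b = _mm512_set1_pd(2.0);
///     let c = _mm512_set1_pd(3.0);
///     // Lanes 0..4: even lanes -1.0, odd lanes 5.0; lanes 4..8 are copied from c (3.0).
///     let r = _mm512_mask3_fmaddsub_round_pd(a, b, c, 0b00001111, _MM_FROUND_CUR_DIRECTION);
/// }
/// ```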
4409 #[inline]
4410 #[target_feature(enable = "avx512f")]
4411 #[cfg_attr(test, assert_instr(vfmaddsub, rounding = 8))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd
4412 #[rustc_args_required_const(4)]
4413 pub unsafe fn _mm512_mask3_fmaddsub_round_pd(
4414 a: __m512d,
4415 b: __m512d,
4416 c: __m512d,
4417 k: __mmask8,
4418 rounding: i32,
4419 ) -> __m512d {
4420 macro_rules! call {
4421 ($imm4:expr) => {
4422 vfmaddsub213pd(a.as_f64x8(), b.as_f64x8(), c.as_f64x8(), $imm4)
4423 };
4424 }
4425 let fmaddsub = constify_imm4_round!(rounding, call);
4426 transmute(simd_select_bitmask(k, fmaddsub, c.as_f64x8()))
4427 }
4428
4429 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst.\
4430 ///
4431 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
4432 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
4433 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
4434 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
4435 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
4436 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
4437 ///
4438 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_fmsubadd_round_ps&expand=2699)
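///
/// A minimal usage sketch (illustrative only, not from the upstream docs; assumes
/// `avx512f` at runtime, so it is not run as a doc-test):
///
/// ```ignore
/// unsafe {
///     let a = _mm512_set1_ps(1.0);
///     let b = _mm512_set1_ps(2.0);
///     let c = _mm512_set1_ps(3.0);
///     // fmsubadd mirrors fmaddsub: even lanes get 1.0*2.0 + 3.0 = 5.0,
///     // odd lanes get 1.0*2.0 - 3.0 = -1.0.
///     let r = _mm512_fmsubadd_round_ps(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
/// }
/// ```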
4439 #[inline]
4440 #[target_feature(enable = "avx512f")]
4441 #[cfg_attr(test, assert_instr(vfmaddsub, rounding = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps
4442 #[rustc_args_required_const(3)]
4443 pub unsafe fn _mm512_fmsubadd_round_ps(a: __m512, b: __m512, c: __m512, rounding: i32) -> __m512 {
4444 let zero: f32x16 = mem::zeroed();
4445 let sub = simd_sub(zero, c.as_f32x16());
4446 macro_rules! call {
4447 ($imm4:expr) => {
4448 vfmaddsub213ps(a.as_f32x16(), b.as_f32x16(), sub, $imm4)
4449 };
4450 }
4451 let r = constify_imm4_round!(rounding, call);
4452 transmute(r)
4453 }
4454
4455 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\
4456 ///
4457 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
4458 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
4459 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
4460 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
4461 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
4462 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
4463 ///
4464 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_fmsubadd_round_ps&expand=2700)
4465 #[inline]
4466 #[target_feature(enable = "avx512f")]
4467 #[cfg_attr(test, assert_instr(vfmaddsub, rounding = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps
4468 #[rustc_args_required_const(4)]
4469 pub unsafe fn _mm512_mask_fmsubadd_round_ps(
4470 a: __m512,
4471 k: __mmask16,
4472 b: __m512,
4473 c: __m512,
4474 rounding: i32,
4475 ) -> __m512 {
4476 let zero: f32x16 = mem::zeroed();
4477 let sub = simd_sub(zero, c.as_f32x16());
4478 macro_rules! call {
4479 ($imm4:expr) => {
4480 vfmaddsub213ps(a.as_f32x16(), b.as_f32x16(), sub, $imm4)
4481 };
4482 }
4483 let fmsubadd = constify_imm4_round!(rounding, call);
4484 transmute(simd_select_bitmask(k, fmsubadd, a.as_f32x16()))
4485 }
4486
4487 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
4488 ///
4489 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
4490 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
4491 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
4492 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
4493 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
4494 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
4495 ///
4496 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_fmsubadd_round_ps&expand=2702)
4497 #[inline]
4498 #[target_feature(enable = "avx512f")]
4499 #[cfg_attr(test, assert_instr(vfmaddsub, rounding = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps
4500 #[rustc_args_required_const(4)]
4501 pub unsafe fn _mm512_maskz_fmsubadd_round_ps(
4502 k: __mmask16,
4503 a: __m512,
4504 b: __m512,
4505 c: __m512,
4506 rounding: i32,
4507 ) -> __m512 {
4508 let zero: f32x16 = mem::zeroed();
4509 let sub = simd_sub(zero, c.as_f32x16());
4510 macro_rules! call {
4511 ($imm4:expr) => {
4512 vfmaddsub213ps(a.as_f32x16(), b.as_f32x16(), sub, $imm4)
4513 };
4514 }
4515 let fmsubadd = constify_imm4_round!(rounding, call);
4516 transmute(simd_select_bitmask(k, fmsubadd, zero))
4517 }
4518
4519 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\
4520 ///
4521 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
4522 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
4523 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
4524 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
4525 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
4526 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
4527 ///
4528 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask3_fmsubadd_round_ps&expand=2701)
4529 #[inline]
4530 #[target_feature(enable = "avx512f")]
4531 #[cfg_attr(test, assert_instr(vfmaddsub, rounding = 8))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps
4532 #[rustc_args_required_const(4)]
4533 pub unsafe fn _mm512_mask3_fmsubadd_round_ps(
4534 a: __m512,
4535 b: __m512,
4536 c: __m512,
4537 k: __mmask16,
4538 rounding: i32,
4539 ) -> __m512 {
4540 let zero: f32x16 = mem::zeroed();
4541 let sub = simd_sub(zero, c.as_f32x16());
4542 macro_rules! call {
4543 ($imm4:expr) => {
4544 vfmaddsub213ps(a.as_f32x16(), b.as_f32x16(), sub, $imm4)
4545 };
4546 }
4547 let fmsubadd = constify_imm4_round!(rounding, call);
4548 transmute(simd_select_bitmask(k, fmsubadd, c.as_f32x16()))
4549 }
4550
4551 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst.\
4552 ///
4553 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
4554 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
4555 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
4556 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
4557 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
4558 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
4559 ///
4560 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_fmsubadd_round_pd&expand=2695)
4561 #[inline]
4562 #[target_feature(enable = "avx512f")]
4563 #[cfg_attr(test, assert_instr(vfmaddsub, rounding = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd
4564 #[rustc_args_required_const(3)]
4565 pub unsafe fn _mm512_fmsubadd_round_pd(
4566 a: __m512d,
4567 b: __m512d,
4568 c: __m512d,
4569 rounding: i32,
4570 ) -> __m512d {
4571 let zero: f64x8 = mem::zeroed();
4572 let sub = simd_sub(zero, c.as_f64x8());
4573 macro_rules! call {
4574 ($imm4:expr) => {
4575 vfmaddsub213pd(a.as_f64x8(), b.as_f64x8(), sub, $imm4)
4576 };
4577 }
4578 let r = constify_imm4_round!(rounding, call);
4579 transmute(r)
4580 }
4581
4582 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\
4583 ///
4584 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
4585 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
4586 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
4587 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
4588 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
4589 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
4590 ///
4591 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_fmsubadd_round_pd&expand=2696)
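///
/// A minimal sketch of the writemask form (illustrative only, not from the upstream docs;
/// assumes `avx512f` at runtime, so it is not run as a doc-test):
///
/// ```ignore
/// unsafe {
///     let a = _mm512_set1_pd(1.5);
///     let b = _mm512_set1_pd(2.0);
///     let c = _mm512_set1_pd(0.5);
///     // Lanes 0..4: even lanes 1.5*2.0 + 0.5 = 3.5, odd lanes 1.5*2.0 - 0.5 = 2.5;
///     // lanes 4..8 are copied unchanged from a (1.5).
///     let r = _mm512_mask_fmsubadd_round_pd(a, 0b00001111, b, c, _MM_FROUND_CUR_DIRECTION);
/// }
/// ```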
4592 #[inline]
4593 #[target_feature(enable = "avx512f")]
4594 #[cfg_attr(test, assert_instr(vfmaddsub, rounding = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd
4595 #[rustc_args_required_const(4)]
4596 pub unsafe fn _mm512_mask_fmsubadd_round_pd(
4597 a: __m512d,
4598 k: __mmask8,
4599 b: __m512d,
4600 c: __m512d,
4601 rounding: i32,
4602 ) -> __m512d {
4603 let zero: f64x8 = mem::zeroed();
4604 let sub = simd_sub(zero, c.as_f64x8());
4605 macro_rules! call {
4606 ($imm4:expr) => {
4607 vfmaddsub213pd(a.as_f64x8(), b.as_f64x8(), sub, $imm4)
4608 };
4609 }
4610 let fmsubadd = constify_imm4_round!(rounding, call);
4611 transmute(simd_select_bitmask(k, fmsubadd, a.as_f64x8()))
4612 }
4613
4614 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
4615 ///
4616 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
4617 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
4618 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
4619 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
4620 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
4621 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
4622 ///
4623 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_fmsubadd_round_pd&expand=2698)
4624 #[inline]
4625 #[target_feature(enable = "avx512f")]
4626 #[cfg_attr(test, assert_instr(vfmaddsub, rounding = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd
4627 #[rustc_args_required_const(4)]
4628 pub unsafe fn _mm512_maskz_fmsubadd_round_pd(
4629 k: __mmask8,
4630 a: __m512d,
4631 b: __m512d,
4632 c: __m512d,
4633 rounding: i32,
4634 ) -> __m512d {
4635 let zero: f64x8 = mem::zeroed();
4636 let sub = simd_sub(zero, c.as_f64x8());
4637 macro_rules! call {
4638 ($imm4:expr) => {
4639 vfmaddsub213pd(a.as_f64x8(), b.as_f64x8(), sub, $imm4)
4640 };
4641 }
4642 let fmsubadd = constify_imm4_round!(rounding, call);
4643 transmute(simd_select_bitmask(k, fmsubadd, zero))
4644 }
4645
4646 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\
4647 ///
4648 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
4649 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
4650 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
4651 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
4652 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
4653 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
4654 ///
4655 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask3_fmsubadd_round_pd&expand=2697)
4656 #[inline]
4657 #[target_feature(enable = "avx512f")]
4658 #[cfg_attr(test, assert_instr(vfmaddsub, rounding = 8))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd
4659 #[rustc_args_required_const(4)]
4660 pub unsafe fn _mm512_mask3_fmsubadd_round_pd(
4661 a: __m512d,
4662 b: __m512d,
4663 c: __m512d,
4664 k: __mmask8,
4665 rounding: i32,
4666 ) -> __m512d {
4667 let zero: f64x8 = mem::zeroed();
4668 let sub = simd_sub(zero, c.as_f64x8());
4669 macro_rules! call {
4670 ($imm4:expr) => {
4671 vfmaddsub213pd(a.as_f64x8(), b.as_f64x8(), sub, $imm4)
4672 };
4673 }
4674 let fmsubadd = constify_imm4_round!(rounding, call);
4675 transmute(simd_select_bitmask(k, fmsubadd, c.as_f64x8()))
4676 }
4677
4678 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst.\
4679 ///
4680 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
4681 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
4682 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
4683 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
4684 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
4685 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
4686 ///
4687 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_fnmadd_round_ps&expand=2731)
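///
/// A minimal usage sketch (illustrative only, not from the upstream docs; assumes
/// `avx512f` at runtime, so it is not run as a doc-test):
///
/// ```ignore
/// unsafe {
///     let a = _mm512_set1_ps(2.0);
///     let b = _mm512_set1_ps(3.0);
///     let c = _mm512_set1_ps(10.0);
///     // Every lane: -(2.0*3.0) + 10.0 = 4.0.
///     let r = _mm512_fnmadd_round_ps(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
/// }
/// ```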
4688 #[inline]
4689 #[target_feature(enable = "avx512f")]
4690 #[cfg_attr(test, assert_instr(vfmadd, rounding = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps
4691 #[rustc_args_required_const(3)]
4692 pub unsafe fn _mm512_fnmadd_round_ps(a: __m512, b: __m512, c: __m512, rounding: i32) -> __m512 {
4693 let zero: f32x16 = mem::zeroed();
4694 let sub = simd_sub(zero, a.as_f32x16());
4695 macro_rules! call {
4696 ($imm4:expr) => {
4697 vfmadd132ps(sub, b.as_f32x16(), c.as_f32x16(), $imm4)
4698 };
4699 }
4700 let r = constify_imm4_round!(rounding, call);
4701 transmute(r)
4702 }
4703
4704 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\
4705 ///
4706 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
4707 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
4708 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
4709 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
4710 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
4711 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
4712 ///
4713 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_fnmadd_round_ps&expand=2732)
4714 #[inline]
4715 #[target_feature(enable = "avx512f")]
4716 #[cfg_attr(test, assert_instr(vfmadd, rounding = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps
4717 #[rustc_args_required_const(4)]
4718 pub unsafe fn _mm512_mask_fnmadd_round_ps(
4719 a: __m512,
4720 k: __mmask16,
4721 b: __m512,
4722 c: __m512,
4723 rounding: i32,
4724 ) -> __m512 {
4725 let zero: f32x16 = mem::zeroed();
4726 let sub = simd_sub(zero, a.as_f32x16());
4727 macro_rules! call {
4728 ($imm4:expr) => {
4729 vfmadd132ps(sub, b.as_f32x16(), c.as_f32x16(), $imm4)
4730 };
4731 }
4732 let fnmadd = constify_imm4_round!(rounding, call);
4733 transmute(simd_select_bitmask(k, fnmadd, a.as_f32x16()))
4734 }
4735
4736 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
4737 ///
4738 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
4739 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
4740 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
4741 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
4742 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
4743 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
4744 ///
4745 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_fnmadd_round_ps&expand=2734)
4746 #[inline]
4747 #[target_feature(enable = "avx512f")]
4748 #[cfg_attr(test, assert_instr(vfmadd, rounding = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps
4749 #[rustc_args_required_const(4)]
4750 pub unsafe fn _mm512_maskz_fnmadd_round_ps(
4751 k: __mmask16,
4752 a: __m512,
4753 b: __m512,
4754 c: __m512,
4755 rounding: i32,
4756 ) -> __m512 {
4757 let zero: f32x16 = mem::zeroed();
4758 let sub = simd_sub(zero, a.as_f32x16());
4759 macro_rules! call {
4760 ($imm4:expr) => {
4761 vfmadd132ps(sub, b.as_f32x16(), c.as_f32x16(), $imm4)
4762 };
4763 }
4764 let fnmadd = constify_imm4_round!(rounding, call);
4765 transmute(simd_select_bitmask(k, fnmadd, zero))
4766 }
4767
4768 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\
4769 ///
4770 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
4771 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
4772 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
4773 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
4774 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
4775 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
4776 ///
4777 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask3_fnmadd_round_ps&expand=2733)
4778 #[inline]
4779 #[target_feature(enable = "avx512f")]
4780 #[cfg_attr(test, assert_instr(vfmadd, rounding = 8))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps
4781 #[rustc_args_required_const(4)]
4782 pub unsafe fn _mm512_mask3_fnmadd_round_ps(
4783 a: __m512,
4784 b: __m512,
4785 c: __m512,
4786 k: __mmask16,
4787 rounding: i32,
4788 ) -> __m512 {
4789 let zero: f32x16 = mem::zeroed();
4790 let sub = simd_sub(zero, a.as_f32x16());
4791 macro_rules! call {
4792 ($imm4:expr) => {
4793 vfmadd132ps(sub, b.as_f32x16(), c.as_f32x16(), $imm4)
4794 };
4795 }
4796 let fnmadd = constify_imm4_round!(rounding, call);
4797 transmute(simd_select_bitmask(k, fnmadd, c.as_f32x16()))
4798 }
4799
4800 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst.\
4801 ///
4802 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
4803 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
4804 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
4805 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
4806 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
4807 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
4808 ///
4809 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_fnmadd_round_pd&expand=2711)
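///
/// A minimal usage sketch (illustrative only, not from the upstream docs; assumes
/// `avx512f` at runtime, so it is not run as a doc-test):
///
/// ```ignore
/// unsafe {
///     let a = _mm512_set1_pd(1.0);
///     let b = _mm512_set1_pd(2.0);
///     let c = _mm512_set1_pd(0.5);
///     // Every lane: -(1.0*2.0) + 0.5 = -1.5 (exact, so the rounding mode does not matter here).
///     let r = _mm512_fnmadd_round_pd(a, b, c, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
/// }
/// ```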
4810 #[inline]
4811 #[target_feature(enable = "avx512f")]
4812 #[cfg_attr(test, assert_instr(vfmadd, rounding = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd
4813 #[rustc_args_required_const(3)]
4814 pub unsafe fn _mm512_fnmadd_round_pd(a: __m512d, b: __m512d, c: __m512d, rounding: i32) -> __m512d {
4815 let zero: f64x8 = mem::zeroed();
4816 let sub = simd_sub(zero, a.as_f64x8());
4817 macro_rules! call {
4818 ($imm4:expr) => {
4819 vfmadd132pd(sub, b.as_f64x8(), c.as_f64x8(), $imm4)
4820 };
4821 }
4822 let r = constify_imm4_round!(rounding, call);
4823 transmute(r)
4824 }
4825
4826 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\
4827 ///
4828 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
4829 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
4830 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
4831 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
4832 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
4833 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
4834 ///
4835 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_fnmadd_round_pd&expand=2728)
4836 #[inline]
4837 #[target_feature(enable = "avx512f")]
4838 #[cfg_attr(test, assert_instr(vfmadd, rounding = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd
4839 #[rustc_args_required_const(4)]
4840 pub unsafe fn _mm512_mask_fnmadd_round_pd(
4841 a: __m512d,
4842 k: __mmask8,
4843 b: __m512d,
4844 c: __m512d,
4845 rounding: i32,
4846 ) -> __m512d {
4847 let zero: f64x8 = mem::zeroed();
4848 let sub = simd_sub(zero, a.as_f64x8());
4849 macro_rules! call {
4850 ($imm4:expr) => {
4851 vfmadd132pd(sub, b.as_f64x8(), c.as_f64x8(), $imm4)
4852 };
4853 }
4854 let fnmadd = constify_imm4_round!(rounding, call);
4855 transmute(simd_select_bitmask(k, fnmadd, a.as_f64x8()))
4856 }
4857
4858 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
4859 ///
4860 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
4861 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
4862 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
4863 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
4864 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
4865 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
4866 ///
4867 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_fnmadd_round_pd&expand=2730)
4868 #[inline]
4869 #[target_feature(enable = "avx512f")]
4870 #[cfg_attr(test, assert_instr(vfmadd, rounding = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd
4871 #[rustc_args_required_const(4)]
4872 pub unsafe fn _mm512_maskz_fnmadd_round_pd(
4873 k: __mmask8,
4874 a: __m512d,
4875 b: __m512d,
4876 c: __m512d,
4877 rounding: i32,
4878 ) -> __m512d {
4879 let zero: f64x8 = mem::zeroed();
4880 let sub = simd_sub(zero, a.as_f64x8());
4881 macro_rules! call {
4882 ($imm4:expr) => {
4883 vfmadd132pd(sub, b.as_f64x8(), c.as_f64x8(), $imm4)
4884 };
4885 }
4886 let fnmadd = constify_imm4_round!(rounding, call);
4887 transmute(simd_select_bitmask(k, fnmadd, zero))
4888 }
4889
4890 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\
4891 ///
4892 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
4893 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
4894 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
4895 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
4896 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
4897 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
4898 ///
4899 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask3_fnmadd_round_pd&expand=2729)
4900 #[inline]
4901 #[target_feature(enable = "avx512f")]
4902 #[cfg_attr(test, assert_instr(vfmadd, rounding = 8))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd
4903 #[rustc_args_required_const(4)]
4904 pub unsafe fn _mm512_mask3_fnmadd_round_pd(
4905 a: __m512d,
4906 b: __m512d,
4907 c: __m512d,
4908 k: __mmask8,
4909 rounding: i32,
4910 ) -> __m512d {
4911 let zero: f64x8 = mem::zeroed();
4912 let sub = simd_sub(zero, a.as_f64x8());
4913 macro_rules! call {
4914 ($imm4:expr) => {
4915 vfmadd132pd(sub, b.as_f64x8(), c.as_f64x8(), $imm4)
4916 };
4917 }
4918 let fnmadd = constify_imm4_round!(rounding, call);
4919 transmute(simd_select_bitmask(k, fnmadd, c.as_f64x8()))
4920 }
4921
4922 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst.\
4923 ///
4924 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
4925 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
4926 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
4927 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
4928 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
4929 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
4930 ///
4931 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_fnmsub_round_ps&expand=2779)
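///
/// A minimal usage sketch (illustrative only, not from the upstream docs; assumes
/// `avx512f` at runtime, so it is not run as a doc-test):
///
/// ```ignore
/// unsafe {
///     let a = _mm512_set1_ps(1.0);
///     let b = _mm512_set1_ps(2.0);
///     let c = _mm512_set1_ps(3.0);
///     // Every lane: -(1.0*2.0) - 3.0 = -5.0.
///     let r = _mm512_fnmsub_round_ps(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
/// }
/// ```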
4932 #[inline]
4933 #[target_feature(enable = "avx512f")]
4934 #[cfg_attr(test, assert_instr(vfmadd, rounding = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps
4935 #[rustc_args_required_const(3)]
4936 pub unsafe fn _mm512_fnmsub_round_ps(a: __m512, b: __m512, c: __m512, rounding: i32) -> __m512 {
4937 let zero: f32x16 = mem::zeroed();
4938 let suba = simd_sub(zero, a.as_f32x16());
4939 let subc = simd_sub(zero, c.as_f32x16());
4940 macro_rules! call {
4941 ($imm4:expr) => {
4942 vfmadd132ps(suba, b.as_f32x16(), subc, $imm4)
4943 };
4944 }
4945 let r = constify_imm4_round!(rounding, call);
4946 transmute(r)
4947 }
4948
4949 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\
4950 ///
4951 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
4952 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
4953 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
4954 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
4955 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
4956 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
4957 ///
4958 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_fnmsub_round_ps&expand=2780)
4959 #[inline]
4960 #[target_feature(enable = "avx512f")]
4961 #[cfg_attr(test, assert_instr(vfmadd, rounding = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps
4962 #[rustc_args_required_const(4)]
4963 pub unsafe fn _mm512_mask_fnmsub_round_ps(
4964 a: __m512,
4965 k: __mmask16,
4966 b: __m512,
4967 c: __m512,
4968 rounding: i32,
4969 ) -> __m512 {
4970 let zero: f32x16 = mem::zeroed();
4971 let suba = simd_sub(zero, a.as_f32x16());
4972 let subc = simd_sub(zero, c.as_f32x16());
4973 macro_rules! call {
4974 ($imm4:expr) => {
4975 vfmadd132ps(suba, b.as_f32x16(), subc, $imm4)
4976 };
4977 }
4978 let fnmsub = constify_imm4_round!(rounding, call);
4979 transmute(simd_select_bitmask(k, fnmsub, a.as_f32x16()))
4980 }
4981
4982 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
4983 ///
4984 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
4985 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
4986 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
4987 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
4988 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
4989 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
4990 ///
4991 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_fnmsub_round_ps&expand=2782)
4992 #[inline]
4993 #[target_feature(enable = "avx512f")]
4994 #[cfg_attr(test, assert_instr(vfmadd, rounding = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps
4995 #[rustc_args_required_const(4)]
4996 pub unsafe fn _mm512_maskz_fnmsub_round_ps(
4997 k: __mmask16,
4998 a: __m512,
4999 b: __m512,
5000 c: __m512,
5001 rounding: i32,
5002 ) -> __m512 {
5003 let zero: f32x16 = mem::zeroed();
5004 let suba = simd_sub(zero, a.as_f32x16());
5005 let subc = simd_sub(zero, c.as_f32x16());
5006 macro_rules! call {
5007 ($imm4:expr) => {
5008 vfmadd132ps(suba, b.as_f32x16(), subc, $imm4)
5009 };
5010 }
5011 let fnmsub = constify_imm4_round!(rounding, call);
5012 transmute(simd_select_bitmask(k, fnmsub, zero))
5013 }
5014
5015 /// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\
5016 ///
5017 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
5018 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
5019 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
5020 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
5021 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
5022 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
5023 ///
5024 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask3_fnmsub_round_ps&expand=2781)
5025 #[inline]
5026 #[target_feature(enable = "avx512f")]
5027 #[cfg_attr(test, assert_instr(vfmadd, rounding = 8))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps
5028 #[rustc_args_required_const(4)]
5029 pub unsafe fn _mm512_mask3_fnmsub_round_ps(
5030 a: __m512,
5031 b: __m512,
5032 c: __m512,
5033 k: __mmask16,
5034 rounding: i32,
5035 ) -> __m512 {
5036 let zero: f32x16 = mem::zeroed();
5037 let suba = simd_sub(zero, a.as_f32x16());
5038 let subc = simd_sub(zero, c.as_f32x16());
5039 macro_rules! call {
5040 ($imm4:expr) => {
5041 vfmadd132ps(suba, b.as_f32x16(), subc, $imm4)
5042 };
5043 }
5044 let fnmsub = constify_imm4_round!(rounding, call);
5045 transmute(simd_select_bitmask(k, fnmsub, c.as_f32x16()))
5046 }
5047
5048 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst.\
5049 ///
5050 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
5051 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
5052 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
5053 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
5054 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
5055 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
5056 ///
5057 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_fnmsub_round_pd&expand=2775)
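///
/// A minimal usage sketch (illustrative only, not from the upstream docs; assumes
/// `avx512f` at runtime, so it is not run as a doc-test):
///
/// ```ignore
/// unsafe {
///     let a = _mm512_set1_pd(1.0);
///     let b = _mm512_set1_pd(2.0);
///     let c = _mm512_set1_pd(3.0);
///     // Every lane: -(1.0*2.0) - 3.0 = -5.0.
///     let r = _mm512_fnmsub_round_pd(a, b, c, _MM_FROUND_CUR_DIRECTION);
/// }
/// ```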
5058 #[inline]
5059 #[target_feature(enable = "avx512f")]
5060 #[cfg_attr(test, assert_instr(vfmadd, rounding = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd
5061 #[rustc_args_required_const(3)]
5062 pub unsafe fn _mm512_fnmsub_round_pd(a: __m512d, b: __m512d, c: __m512d, rounding: i32) -> __m512d {
5063 let zero: f64x8 = mem::zeroed();
5064 let suba = simd_sub(zero, a.as_f64x8());
5065 let subc = simd_sub(zero, c.as_f64x8());
5066 macro_rules! call {
5067 ($imm4:expr) => {
5068 vfmadd132pd(suba, b.as_f64x8(), subc, $imm4)
5069 };
5070 }
5071 let r = constify_imm4_round!(rounding, call);
5072 transmute(r)
5073 }
5074
5075 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\
5076 ///
5077 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
5078 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
5079 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
5080 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
5081 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
5082 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
5083 ///
5084 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_fnmsub_round_pd&expand=2776)
5085 #[inline]
5086 #[target_feature(enable = "avx512f")]
5087 #[cfg_attr(test, assert_instr(vfmadd, rounding = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd
5088 #[rustc_args_required_const(4)]
5089 pub unsafe fn _mm512_mask_fnmsub_round_pd(
5090 a: __m512d,
5091 k: __mmask8,
5092 b: __m512d,
5093 c: __m512d,
5094 rounding: i32,
5095 ) -> __m512d {
5096 let zero: f64x8 = mem::zeroed();
5097 let suba = simd_sub(zero, a.as_f64x8());
5098 let subc = simd_sub(zero, c.as_f64x8());
5099 macro_rules! call {
5100 ($imm4:expr) => {
5101 vfmadd132pd(suba, b.as_f64x8(), subc, $imm4)
5102 };
5103 }
5104 let fnmsub = constify_imm4_round!(rounding, call);
5105 transmute(simd_select_bitmask(k, fnmsub, a.as_f64x8()))
5106 }
5107
5108 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
5109 ///
5110 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
5111 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
5112 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
5113 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
5114 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
5115 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
5116 ///
5117 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_fnmsub_round_pd&expand=2778)
5118 #[inline]
5119 #[target_feature(enable = "avx512f")]
5120 #[cfg_attr(test, assert_instr(vfmadd, rounding = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd
5121 #[rustc_args_required_const(4)]
5122 pub unsafe fn _mm512_maskz_fnmsub_round_pd(
5123 k: __mmask8,
5124 a: __m512d,
5125 b: __m512d,
5126 c: __m512d,
5127 rounding: i32,
5128 ) -> __m512d {
5129 let zero: f64x8 = mem::zeroed();
5130 let suba = simd_sub(zero, a.as_f64x8());
5131 let subc = simd_sub(zero, c.as_f64x8());
5132 macro_rules! call {
5133 ($imm4:expr) => {
5134 vfmadd132pd(suba, b.as_f64x8(), subc, $imm4)
5135 };
5136 }
5137 let fnmsub = constify_imm4_round!(rounding, call);
5138 transmute(simd_select_bitmask(k, fnmsub, zero))
5139 }
5140
5141 /// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\
5142 ///
5143 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
5144 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
5145 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
5146 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
5147 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
5148 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
5149 ///
5150 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask3_fnmsub_round_pd&expand=2777)
5151 #[inline]
5152 #[target_feature(enable = "avx512f")]
5153 #[cfg_attr(test, assert_instr(vfmadd, rounding = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd
5154 #[rustc_args_required_const(4)]
5155 pub unsafe fn _mm512_mask3_fnmsub_round_pd(
5156 a: __m512d,
5157 b: __m512d,
5158 c: __m512d,
5159 k: __mmask8,
5160 rounding: i32,
5161 ) -> __m512d {
5162 let zero: f64x8 = mem::zeroed();
5163 let suba = simd_sub(zero, a.as_f64x8());
5164 let subc = simd_sub(zero, c.as_f64x8());
5165 macro_rules! call {
5166 ($imm4:expr) => {
5167 vfmadd132pd(suba, b.as_f64x8(), subc, $imm4)
5168 };
5169 }
5170 let fnmsub = constify_imm4_round!(rounding, call);
5171 transmute(simd_select_bitmask(k, fnmsub, c.as_f64x8()))
5172 }
5173
5174 /// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst.\
5175 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
5176 ///
5177 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=max_round_ps&expand=3662)
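///
/// A minimal usage sketch (illustrative only, not from the upstream docs; assumes
/// `avx512f` at runtime, so it is not run as a doc-test):
///
/// ```ignore
/// unsafe {
///     let a = _mm512_set1_ps(1.0);
///     let b = _mm512_set1_ps(2.0);
///     // Every lane: max(1.0, 2.0) = 2.0; _MM_FROUND_NO_EXC suppresses exceptions.
///     let r = _mm512_max_round_ps(a, b, _MM_FROUND_NO_EXC);
/// }
/// ```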
5178 #[inline]
5179 #[target_feature(enable = "avx512f")]
5180 #[cfg_attr(test, assert_instr(vmaxps, sae = 8))]
5181 #[rustc_args_required_const(2)]
5182 pub unsafe fn _mm512_max_round_ps(a: __m512, b: __m512, sae: i32) -> __m512 {
5183 macro_rules! call {
5184 ($imm4:expr) => {
5185 vmaxps(a.as_f32x16(), b.as_f32x16(), $imm4)
5186 };
5187 }
5188 let r = constify_imm4_sae!(sae, call);
5189 transmute(r)
5190 }
5191
5192 /// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
5193 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
5194 ///
5195 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_max_round_ps&expand=3660)
5196 #[inline]
5197 #[target_feature(enable = "avx512f")]
5198 #[cfg_attr(test, assert_instr(vmaxps, sae = 8))]
5199 #[rustc_args_required_const(4)]
5200 pub unsafe fn _mm512_mask_max_round_ps(
5201 src: __m512,
5202 k: __mmask16,
5203 a: __m512,
5204 b: __m512,
5205 sae: i32,
5206 ) -> __m512 {
5207 macro_rules! call {
5208 ($imm4:expr) => {
5209 vmaxps(a.as_f32x16(), b.as_f32x16(), $imm4)
5210 };
5211 }
5212 let max = constify_imm4_sae!(sae, call);
5213 transmute(simd_select_bitmask(k, max, src.as_f32x16()))
5214 }
5215
5216 /// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
5217 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
5218 ///
5219 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_max_round_ps&expand=3661)
5220 #[inline]
5221 #[target_feature(enable = "avx512f")]
5222 #[cfg_attr(test, assert_instr(vmaxps, sae = 8))]
5223 #[rustc_args_required_const(3)]
5224 pub unsafe fn _mm512_maskz_max_round_ps(k: __mmask16, a: __m512, b: __m512, sae: i32) -> __m512 {
5225 macro_rules! call {
5226 ($imm4:expr) => {
5227 vmaxps(a.as_f32x16(), b.as_f32x16(), $imm4)
5228 };
5229 }
5230 let max = constify_imm4_sae!(sae, call);
5231 let zero = _mm512_setzero_ps().as_f32x16();
5232 transmute(simd_select_bitmask(k, max, zero))
5233 }
5234
5235 /// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst.\
5236 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
5237 ///
5238 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_max_round_pd&expand=3659)
5239 #[inline]
5240 #[target_feature(enable = "avx512f")]
5241 #[cfg_attr(test, assert_instr(vmaxpd, sae = 8))]
5242 #[rustc_args_required_const(2)]
5243 pub unsafe fn _mm512_max_round_pd(a: __m512d, b: __m512d, sae: i32) -> __m512d {
5244 macro_rules! call {
5245 ($imm4:expr) => {
5246 vmaxpd(a.as_f64x8(), b.as_f64x8(), $imm4)
5247 };
5248 }
5249 let r = constify_imm4_sae!(sae, call);
5250 transmute(r)
5251 }
5252
5253 /// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
5254 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
5255 ///
5256 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_max_round_pd&expand=3657)
5257 #[inline]
5258 #[target_feature(enable = "avx512f")]
5259 #[cfg_attr(test, assert_instr(vmaxpd, sae = 8))]
5260 #[rustc_args_required_const(4)]
5261 pub unsafe fn _mm512_mask_max_round_pd(
5262 src: __m512d,
5263 k: __mmask8,
5264 a: __m512d,
5265 b: __m512d,
5266 sae: i32,
5267 ) -> __m512d {
5268 macro_rules! call {
5269 ($imm4:expr) => {
5270 vmaxpd(a.as_f64x8(), b.as_f64x8(), $imm4)
5271 };
5272 }
5273 let max = constify_imm4_sae!(sae, call);
5274 transmute(simd_select_bitmask(k, max, src.as_f64x8()))
5275 }
5276
5277 /// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
5278 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
5279 ///
5280 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_max_round_pd&expand=3658)
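///
/// A minimal sketch of the zeromask form (illustrative only, not from the upstream docs;
/// assumes `avx512f` at runtime, so it is not run as a doc-test):
///
/// ```ignore
/// unsafe {
///     let a = _mm512_set1_pd(1.0);
///     let b = _mm512_set1_pd(2.0);
///     // Lanes 0..4: max(1.0, 2.0) = 2.0; lanes 4..8 are zeroed.
///     let r = _mm512_maskz_max_round_pd(0b00001111, a, b, _MM_FROUND_NO_EXC);
/// }
/// ```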
5281 #[inline]
5282 #[target_feature(enable = "avx512f")]
5283 #[cfg_attr(test, assert_instr(vmaxpd, sae = 8))]
5284 #[rustc_args_required_const(3)]
5285 pub unsafe fn _mm512_maskz_max_round_pd(k: __mmask8, a: __m512d, b: __m512d, sae: i32) -> __m512d {
5286 macro_rules! call {
5287 ($imm4:expr) => {
5288 vmaxpd(a.as_f64x8(), b.as_f64x8(), $imm4)
5289 };
5290 }
5291 let max = constify_imm4_sae!(sae, call);
5292 let zero = _mm512_setzero_pd().as_f64x8();
5293 transmute(simd_select_bitmask(k, max, zero))
5294 }
5295
5296 /// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst.\
5297 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
5298 ///
5299 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_min_round_ps&expand=3776)
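///
/// A minimal usage sketch (illustrative only, not from the upstream docs; assumes
/// `avx512f` at runtime, so it is not run as a doc-test):
///
/// ```ignore
/// unsafe {
///     let a = _mm512_set1_ps(1.0);
///     let b = _mm512_set1_ps(2.0);
///     // Every lane: min(1.0, 2.0) = 1.0.
///     let r = _mm512_min_round_ps(a, b, _MM_FROUND_CUR_DIRECTION);
/// }
/// ```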
5300 #[inline]
5301 #[target_feature(enable = "avx512f")]
5302 #[cfg_attr(test, assert_instr(vminps, sae = 8))]
5303 #[rustc_args_required_const(2)]
5304 pub unsafe fn _mm512_min_round_ps(a: __m512, b: __m512, sae: i32) -> __m512 {
5305 macro_rules! call {
5306 ($imm4:expr) => {
5307 vminps(a.as_f32x16(), b.as_f32x16(), $imm4)
5308 };
5309 }
5310 let r = constify_imm4_sae!(sae, call);
5311 transmute(r)
5312 }
5313
5314 /// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
5315 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
5316 ///
5317 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_min_round_ps&expand=3774)
5318 #[inline]
5319 #[target_feature(enable = "avx512f")]
5320 #[cfg_attr(test, assert_instr(vminps, sae = 8))]
5321 #[rustc_args_required_const(4)]
5322 pub unsafe fn _mm512_mask_min_round_ps(
5323 src: __m512,
5324 k: __mmask16,
5325 a: __m512,
5326 b: __m512,
5327 sae: i32,
5328 ) -> __m512 {
5329 macro_rules! call {
5330 ($imm4:expr) => {
5331 vminps(a.as_f32x16(), b.as_f32x16(), $imm4)
5332 };
5333 }
5334 let min = constify_imm4_sae!(sae, call);
5335 transmute(simd_select_bitmask(k, min, src.as_f32x16()))
5336 }
5337
5338 /// Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
5339 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
5340 ///
5341 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_min_round_ps&expand=3775)
5342 #[inline]
5343 #[target_feature(enable = "avx512f")]
5344 #[cfg_attr(test, assert_instr(vminps, sae = 8))]
5345 #[rustc_args_required_const(3)]
5346 pub unsafe fn _mm512_maskz_min_round_ps(k: __mmask16, a: __m512, b: __m512, sae: i32) -> __m512 {
5347 macro_rules! call {
5348 ($imm4:expr) => {
5349 vminps(a.as_f32x16(), b.as_f32x16(), $imm4)
5350 };
5351 }
5352 let min = constify_imm4_sae!(sae, call);
5353 let zero = _mm512_setzero_ps().as_f32x16();
5354 transmute(simd_select_bitmask(k, min, zero))
5355 }
5356
5357 /// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst.\
5358 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
5359 ///
5360 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_min_round_pd&expand=3773)
5361 #[inline]
5362 #[target_feature(enable = "avx512f")]
5363 #[cfg_attr(test, assert_instr(vminpd, sae = 8))]
5364 #[rustc_args_required_const(2)]
5365 pub unsafe fn _mm512_min_round_pd(a: __m512d, b: __m512d, sae: i32) -> __m512d {
5366 macro_rules! call {
5367 ($imm4:expr) => {
5368 vminpd(a.as_f64x8(), b.as_f64x8(), $imm4)
5369 };
5370 }
5371 let r = constify_imm4_sae!(sae, call);
5372 transmute(r)
5373 }
5374
5375 /// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
5376 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
5377 ///
5378 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_min_round_pd&expand=3771)
5379 #[inline]
5380 #[target_feature(enable = "avx512f")]
5381 #[cfg_attr(test, assert_instr(vminpd, sae = 8))]
5382 #[rustc_args_required_const(4)]
5383 pub unsafe fn _mm512_mask_min_round_pd(
5384 src: __m512d,
5385 k: __mmask8,
5386 a: __m512d,
5387 b: __m512d,
5388 sae: i32,
5389 ) -> __m512d {
5390 macro_rules! call {
5391 ($imm4:expr) => {
5392 vminpd(a.as_f64x8(), b.as_f64x8(), $imm4)
5393 };
5394 }
5395 let min = constify_imm4_sae!(sae, call);
5396 transmute(simd_select_bitmask(k, min, src.as_f64x8()))
5397 }
5398
5399 /// Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
5400 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
5401 ///
5402 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_min_round_pd&expand=3772)
5403 #[inline]
5404 #[target_feature(enable = "avx512f")]
5405 #[cfg_attr(test, assert_instr(vminpd, sae = 8))]
5406 #[rustc_args_required_const(3)]
5407 pub unsafe fn _mm512_maskz_min_round_pd(k: __mmask8, a: __m512d, b: __m512d, sae: i32) -> __m512d {
5408 macro_rules! call {
5409 ($imm4:expr) => {
5410 vminpd(a.as_f64x8(), b.as_f64x8(), $imm4)
5411 };
5412 }
5413 let min = constify_imm4_sae!(sae, call);
5414 let zero = _mm512_setzero_pd().as_f64x8();
5415 transmute(simd_select_bitmask(k, min, zero))
5416 }
5417
5418 /// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element.\
5419 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
5420 ///
5421 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_getexp_round_ps&expand=2850)
5422 #[inline]
5423 #[target_feature(enable = "avx512f")]
5424 #[cfg_attr(test, assert_instr(vgetexpps, sae = 8))]
5425 #[rustc_args_required_const(1)]
5426 pub unsafe fn _mm512_getexp_round_ps(a: __m512, sae: i32) -> __m512 {
5427 macro_rules! call {
5428 ($imm4:expr) => {
5429 vgetexpps(
5430 a.as_f32x16(),
5431 _mm512_setzero_ps().as_f32x16(),
5432 0b11111111_11111111,
5433 $imm4,
5434 )
5435 };
5436 }
5437 let r = constify_imm4_sae!(sae, call);
5438 transmute(r)
5439 }
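
// A minimal sketch of the exponent extraction above (illustrative only,
// assumes AVX512F support; the helper name is not part of the public API).
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn _example_getexp_round_ps() {
    let a = _mm512_set1_ps(8.0);
    // floor(log2(8.0)) = 3.0 in every lane; _MM_FROUND_CUR_DIRECTION leaves
    // exception behaviour under MXCSR control.
    let r = _mm512_getexp_round_ps(a, _MM_FROUND_CUR_DIRECTION);
    let _ = r;
}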
5440
5441 /// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.\
5442 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
5443 ///
5444 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_getexp_round_ps&expand=2851)
5445 #[inline]
5446 #[target_feature(enable = "avx512f")]
5447 #[cfg_attr(test, assert_instr(vgetexpps, sae = 8))]
5448 #[rustc_args_required_const(3)]
5449 pub unsafe fn _mm512_mask_getexp_round_ps(
5450 src: __m512,
5451 k: __mmask16,
5452 a: __m512,
5453 sae: i32,
5454 ) -> __m512 {
5455 macro_rules! call {
5456 ($imm4:expr) => {
5457 vgetexpps(a.as_f32x16(), src.as_f32x16(), k, $imm4)
5458 };
5459 }
5460 let r = constify_imm4_sae!(sae, call);
5461 transmute(r)
5462 }
5463
5464 /// Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.\
5465 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
5466 ///
5467 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_getexp_round_ps&expand=2852)
5468 #[inline]
5469 #[target_feature(enable = "avx512f")]
5470 #[cfg_attr(test, assert_instr(vgetexpps, sae = 8))]
5471 #[rustc_args_required_const(2)]
5472 pub unsafe fn _mm512_maskz_getexp_round_ps(k: __mmask16, a: __m512, sae: i32) -> __m512 {
5473 macro_rules! call {
5474 ($imm4:expr) => {
5475 vgetexpps(a.as_f32x16(), _mm512_setzero_ps().as_f32x16(), k, $imm4)
5476 };
5477 }
5478 let r = constify_imm4_sae!(sae, call);
5479 transmute(r)
5480 }
5481
5482 /// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element.\
5483 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
5484 ///
5485 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_getexp_round_pd&expand=2847)
5486 #[inline]
5487 #[target_feature(enable = "avx512f")]
5488 #[cfg_attr(test, assert_instr(vgetexppd, sae = 8))]
5489 #[rustc_args_required_const(1)]
5490 pub unsafe fn _mm512_getexp_round_pd(a: __m512d, sae: i32) -> __m512d {
5491 macro_rules! call {
5492 ($imm4:expr) => {
5493 vgetexppd(
5494 a.as_f64x8(),
5495 _mm512_setzero_pd().as_f64x8(),
5496 0b11111111,
5497 $imm4,
5498 )
5499 };
5500 }
5501 let r = constify_imm4_sae!(sae, call);
5502 transmute(r)
5503 }
5504
5505 /// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.\
5506 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
5507 ///
5508 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_getexp_round_pd&expand=2848)
5509 #[inline]
5510 #[target_feature(enable = "avx512f")]
5511 #[cfg_attr(test, assert_instr(vgetexppd, sae = 8))]
5512 #[rustc_args_required_const(3)]
5513 pub unsafe fn _mm512_mask_getexp_round_pd(
5514 src: __m512d,
5515 k: __mmask8,
5516 a: __m512d,
5517 sae: i32,
5518 ) -> __m512d {
5519 macro_rules! call {
5520 ($imm4:expr) => {
5521 vgetexppd(a.as_f64x8(), src.as_f64x8(), k, $imm4)
5522 };
5523 }
5524 let r = constify_imm4_sae!(sae, call);
5525 transmute(r)
5526 }
5527
5528 /// Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.\
5529 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
5530 ///
5531 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_getexp_round_pd&expand=2849)
5532 #[inline]
5533 #[target_feature(enable = "avx512f")]
5534 #[cfg_attr(test, assert_instr(vgetexppd, sae = 8))]
5535 #[rustc_args_required_const(2)]
5536 pub unsafe fn _mm512_maskz_getexp_round_pd(k: __mmask8, a: __m512d, sae: i32) -> __m512d {
5537 macro_rules! call {
5538 ($imm4:expr) => {
5539 vgetexppd(a.as_f64x8(), _mm512_setzero_pd().as_f64x8(), k, $imm4)
5540 };
5541 }
5542 let r = constify_imm4_sae!(sae, call);
5543 transmute(r)
5544 }
5545
5546 /// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.\
5547 /// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
5548 /// _MM_FROUND_TO_NEAREST_INT // round to nearest\
5549 /// _MM_FROUND_TO_NEG_INF // round down\
5550 /// _MM_FROUND_TO_POS_INF // round up\
5551 /// _MM_FROUND_TO_ZERO // truncate\
5552 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE\
5553 ///
5554 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
5555 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_roundscale_round_ps&expand=4790)
5556 #[inline]
5557 #[target_feature(enable = "avx512f")]
5558 #[cfg_attr(test, assert_instr(vrndscaleps, imm8 = 0, sae = 8))]
5559 #[rustc_args_required_const(1, 2)]
5560 pub unsafe fn _mm512_roundscale_round_ps(a: __m512, imm8: i32, sae: i32) -> __m512 {
5561 let a = a.as_f32x16();
5562 let zero = _mm512_setzero_ps().as_f32x16();
5563 macro_rules! call {
5564 ($imm8:expr, $imm4:expr) => {
5565 vrndscaleps(a, $imm8, zero, 0b11111111_11111111, $imm4)
5566 };
5567 }
5568 let r = constify_imm8_roundscale!(imm8, sae, call);
5569 transmute(r)
5570 }
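
// A minimal sketch of roundscale above: per Intel's vrndscaleps description,
// imm8[7:4] selects how many fraction bits to keep and imm8[2:0] the rounding
// mode (illustrative only, assumes AVX512F support; the helper name is not
// part of the public API).
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn _example_roundscale_round_ps() {
    let a = _mm512_set1_ps(1.1);
    // imm8 = 0: keep 0 fraction bits and round to nearest, so 1.1 -> 1.0.
    let r = _mm512_roundscale_round_ps(a, 0, _MM_FROUND_CUR_DIRECTION);
    let _ = r;
}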
5571
5572 /// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
5573 /// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
5574 /// _MM_FROUND_TO_NEAREST_INT // round to nearest\
5575 /// _MM_FROUND_TO_NEG_INF // round down\
5576 /// _MM_FROUND_TO_POS_INF // round up\
5577 /// _MM_FROUND_TO_ZERO // truncate\
5578 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE\
5579 ///
5580 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
5581 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_roundscale_round_ps&expand=4788)
5582 #[inline]
5583 #[target_feature(enable = "avx512f")]
5584 #[cfg_attr(test, assert_instr(vrndscaleps, imm8 = 0, sae = 8))]
5585 #[rustc_args_required_const(3, 4)]
5586 pub unsafe fn _mm512_mask_roundscale_round_ps(
5587 src: __m512,
5588 k: __mmask16,
5589 a: __m512,
5590 imm8: i32,
5591 sae: i32,
5592 ) -> __m512 {
5593 let a = a.as_f32x16();
5594 let src = src.as_f32x16();
5595 macro_rules! call {
5596 ($imm8:expr, $imm4:expr) => {
5597 vrndscaleps(a, $imm8, src, k, $imm4)
5598 };
5599 }
5600 let r = constify_imm8_roundscale!(imm8, sae, call);
5601 transmute(r)
5602 }
5603
5604 /// Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
5605 /// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
5606 /// _MM_FROUND_TO_NEAREST_INT // round to nearest\
5607 /// _MM_FROUND_TO_NEG_INF // round down\
5608 /// _MM_FROUND_TO_POS_INF // round up\
5609 /// _MM_FROUND_TO_ZERO // truncate\
5610 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE\
5611 ///
5612 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
5613 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_roundscale_round_ps&expand=4789)
5614 #[inline]
5615 #[target_feature(enable = "avx512f")]
5616 #[cfg_attr(test, assert_instr(vrndscaleps, imm8 = 0, sae = 8))]
5617 #[rustc_args_required_const(2, 3)]
5618 pub unsafe fn _mm512_maskz_roundscale_round_ps(
5619 k: __mmask16,
5620 a: __m512,
5621 imm8: i32,
5622 sae: i32,
5623 ) -> __m512 {
5624 let a = a.as_f32x16();
5625 let zero = _mm512_setzero_ps().as_f32x16();
5626 macro_rules! call {
5627 ($imm8:expr, $imm4:expr) => {
5628 vrndscaleps(a, $imm8, zero, k, $imm4)
5629 };
5630 }
5631 let r = constify_imm8_roundscale!(imm8, sae, call);
5632 transmute(r)
5633 }
5634
5635 /// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.\
5636 /// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
5637 /// _MM_FROUND_TO_NEAREST_INT // round to nearest\
5638 /// _MM_FROUND_TO_NEG_INF // round down\
5639 /// _MM_FROUND_TO_POS_INF // round up\
5640 /// _MM_FROUND_TO_ZERO // truncate\
5641 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE\
5642 ///
5643 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
5644 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_roundscale_round_pd&expand=4787)
5645 #[inline]
5646 #[target_feature(enable = "avx512f")]
5647 #[cfg_attr(test, assert_instr(vrndscalepd, imm8 = 0, sae = 8))]
5648 #[rustc_args_required_const(1, 2)]
5649 pub unsafe fn _mm512_roundscale_round_pd(a: __m512d, imm8: i32, sae: i32) -> __m512d {
5650 let a = a.as_f64x8();
5651 let zero = _mm512_setzero_pd().as_f64x8();
5652 macro_rules! call {
5653 ($imm8:expr, $imm4:expr) => {
5654 vrndscalepd(a, $imm8, zero, 0b11111111, $imm4)
5655 };
5656 }
5657 let r = constify_imm8_roundscale!(imm8, sae, call);
5658 transmute(r)
5659 }
5660
5661 /// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
5662 /// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
5663 /// _MM_FROUND_TO_NEAREST_INT // round to nearest\
5664 /// _MM_FROUND_TO_NEG_INF // round down\
5665 /// _MM_FROUND_TO_POS_INF // round up\
5666 /// _MM_FROUND_TO_ZERO // truncate\
5667 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE\
5668 ///
5669 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
5670 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_roundscale_round_pd&expand=4785)
5671 #[inline]
5672 #[target_feature(enable = "avx512f")]
5673 #[cfg_attr(test, assert_instr(vrndscalepd, imm8 = 0, sae = 8))]
5674 #[rustc_args_required_const(3, 4)]
5675 pub unsafe fn _mm512_mask_roundscale_round_pd(
5676 src: __m512d,
5677 k: __mmask8,
5678 a: __m512d,
5679 imm8: i32,
5680 sae: i32,
5681 ) -> __m512d {
5682 let a = a.as_f64x8();
5683 let src = src.as_f64x8();
5684 macro_rules! call {
5685 ($imm8:expr, $imm4:expr) => {
5686 vrndscalepd(a, $imm8, src, k, $imm4)
5687 };
5688 }
5689 let r = constify_imm8_roundscale!(imm8, sae, call);
5690 transmute(r)
5691 }
5692
5693 /// Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
5694 /// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
5695 /// _MM_FROUND_TO_NEAREST_INT // round to nearest\
5696 /// _MM_FROUND_TO_NEG_INF // round down\
5697 /// _MM_FROUND_TO_POS_INF // round up\
5698 /// _MM_FROUND_TO_ZERO // truncate\
5699 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE\
5700 ///
5701 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
5702 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_roundscale_round_pd&expand=4786)
5703 #[inline]
5704 #[target_feature(enable = "avx512f")]
5705 #[cfg_attr(test, assert_instr(vrndscalepd, imm8 = 0, sae = 8))]
5706 #[rustc_args_required_const(2, 3)]
5707 pub unsafe fn _mm512_maskz_roundscale_round_pd(
5708 k: __mmask8,
5709 a: __m512d,
5710 imm8: i32,
5711 sae: i32,
5712 ) -> __m512d {
5713 let a = a.as_f64x8();
5714 let zero = _mm512_setzero_pd().as_f64x8();
5715 macro_rules! call {
5716 ($imm8:expr, $imm4:expr) => {
5717 vrndscalepd(a, $imm8, zero, k, $imm4)
5718 };
5719 }
5720 let r = constify_imm8_roundscale!(imm8, sae, call);
5721 transmute(r)
5722 }
5723
5724 /// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst.\
5725 ///
5726 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
5727 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
5728 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
5729 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
5730 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
5731 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
5732 ///
5733 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_scalef_round_ps&expand=4889)
5734 #[inline]
5735 #[target_feature(enable = "avx512f")]
5736 #[cfg_attr(test, assert_instr(vscalefps, rounding = 8))]
5737 #[rustc_args_required_const(2)]
5738 pub unsafe fn _mm512_scalef_round_ps(a: __m512, b: __m512, rounding: i32) -> __m512 {
5739 macro_rules! call {
5740 ($imm4:expr) => {
5741 vscalefps(
5742 a.as_f32x16(),
5743 b.as_f32x16(),
5744 _mm512_setzero_ps().as_f32x16(),
5745 0b11111111_11111111,
5746 $imm4,
5747 )
5748 };
5749 }
5750 let r = constify_imm4_round!(rounding, call);
5751 transmute(r)
5752 }
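
// A minimal sketch of scalef above, which computes a * 2^floor(b) per lane
// (illustrative only, assumes AVX512F support; the helper name is not part of
// the public API).
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn _example_scalef_round_ps() {
    let a = _mm512_set1_ps(1.5);
    let b = _mm512_set1_ps(3.0);
    // 1.5 * 2^3 = 12.0 in every lane, rounded to nearest with exceptions
    // suppressed.
    let r = _mm512_scalef_round_ps(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    let _ = r;
}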
5753
5754 /// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
5755 ///
5756 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
5757 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
5758 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
5759 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
5760 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
5761 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
5762 ///
5763 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_scalef_round_ps&expand=4887)
5764 #[inline]
5765 #[target_feature(enable = "avx512f")]
5766 #[cfg_attr(test, assert_instr(vscalefps, rounding = 8))]
5767 #[rustc_args_required_const(4)]
5768 pub unsafe fn _mm512_mask_scalef_round_ps(
5769 src: __m512,
5770 k: __mmask16,
5771 a: __m512,
5772 b: __m512,
5773 rounding: i32,
5774 ) -> __m512 {
5775 macro_rules! call {
5776 ($imm4:expr) => {
5777 vscalefps(a.as_f32x16(), b.as_f32x16(), src.as_f32x16(), k, $imm4)
5778 };
5779 }
5780 let r = constify_imm4_round!(rounding, call);
5781 transmute(r)
5782 }
5783
5784 /// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
5785 ///
5786 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
5787 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
5788 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
5789 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
5790 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
5791 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
5792 ///
5793 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_scalef_round_ps&expand=4888)
5794 #[inline]
5795 #[target_feature(enable = "avx512f")]
5796 #[cfg_attr(test, assert_instr(vscalefps, rounding = 8))]
5797 #[rustc_args_required_const(3)]
5798 pub unsafe fn _mm512_maskz_scalef_round_ps(
5799 k: __mmask16,
5800 a: __m512,
5801 b: __m512,
5802 rounding: i32,
5803 ) -> __m512 {
5804 macro_rules! call {
5805 ($imm4:expr) => {
5806 vscalefps(
5807 a.as_f32x16(),
5808 b.as_f32x16(),
5809 _mm512_setzero_ps().as_f32x16(),
5810 k,
5811 $imm4,
5812 )
5813 };
5814 }
5815 let r = constify_imm4_round!(rounding, call);
5816 transmute(r)
5817 }
5818
5819 /// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst.\
5820 ///
5821 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
5822 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
5823 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
5824 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
5825 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
5826 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
5827 ///
5828 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_scalef_round_pd&expand=4886)
5829 #[inline]
5830 #[target_feature(enable = "avx512f")]
5831 #[cfg_attr(test, assert_instr(vscalefpd, rounding = 8))]
5832 #[rustc_args_required_const(2)]
5833 pub unsafe fn _mm512_scalef_round_pd(a: __m512d, b: __m512d, rounding: i32) -> __m512d {
5834 macro_rules! call {
5835 ($imm4:expr) => {
5836 vscalefpd(
5837 a.as_f64x8(),
5838 b.as_f64x8(),
5839 _mm512_setzero_pd().as_f64x8(),
5840 0b11111111,
5841 $imm4,
5842 )
5843 };
5844 }
5845 let r = constify_imm4_round!(rounding, call);
5846 transmute(r)
5847 }
5848
5849 /// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
5850 ///
5851 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
5852 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
5853 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
5854 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
5855 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
5856 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
5857 ///
5858 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_scalef_round_pd&expand=4884)
5859 #[inline]
5860 #[target_feature(enable = "avx512f")]
5861 #[cfg_attr(test, assert_instr(vscalefpd, rounding = 8))]
5862 #[rustc_args_required_const(4)]
5863 pub unsafe fn _mm512_mask_scalef_round_pd(
5864 src: __m512d,
5865 k: __mmask8,
5866 a: __m512d,
5867 b: __m512d,
5868 rounding: i32,
5869 ) -> __m512d {
5870 macro_rules! call {
5871 ($imm4:expr) => {
5872 vscalefpd(a.as_f64x8(), b.as_f64x8(), src.as_f64x8(), k, $imm4)
5873 };
5874 }
5875 let r = constify_imm4_round!(rounding, call);
5876 transmute(r)
5877 }
5878
5879 /// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
5880 ///
5881 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
5882 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
5883 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
5884 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
5885 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
5886 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
5887 ///
5888 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_scalef_round_pd&expand=4885)
5889 #[inline]
5890 #[target_feature(enable = "avx512f")]
5891 #[cfg_attr(test, assert_instr(vscalefpd, rounding = 8))]
5892 #[rustc_args_required_const(3)]
5893 pub unsafe fn _mm512_maskz_scalef_round_pd(
5894 k: __mmask8,
5895 a: __m512d,
5896 b: __m512d,
5897 rounding: i32,
5898 ) -> __m512d {
5899 macro_rules! call {
5900 ($imm4:expr) => {
5901 vscalefpd(
5902 a.as_f64x8(),
5903 b.as_f64x8(),
5904 _mm512_setzero_pd().as_f64x8(),
5905 k,
5906 $imm4,
5907 )
5908 };
5909 }
5910 let r = constify_imm4_round!(rounding, call);
5911 transmute(r)
5912 }
5913
5914 /// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting.\
5915 ///
5916 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
5917 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_fixupimm_round_ps&expand=2505)
5918 #[inline]
5919 #[target_feature(enable = "avx512f")]
5920 #[cfg_attr(test, assert_instr(vfixupimmps, imm8 = 0, sae = 8))]
5921 #[rustc_args_required_const(3, 4)]
5922 pub unsafe fn _mm512_fixupimm_round_ps(
5923 a: __m512,
5924 b: __m512,
5925 c: __m512i,
5926 imm8: i32,
5927 sae: i32,
5928 ) -> __m512 {
5929 let a = a.as_f32x16();
5930 let b = b.as_f32x16();
5931 let c = c.as_i32x16();
5932 macro_rules! call {
5933 ($imm8:expr, $imm4:expr) => {
5934 vfixupimmps(a, b, c, $imm8, 0b11111111_11111111, $imm4)
5935 };
5936 }
5937 let r = constify_imm8_roundscale!(imm8, sae, call);
5938 transmute(r)
5939 }
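
// A minimal call-shape sketch for fixupimm above: each 32-bit lane of `c`
// supplies 4-bit fix-up tokens selected by the classification of the inputs,
// and the imm8 bits choose which of those cases also raise floating-point
// exceptions (illustrative only, assumes AVX512F support; the helper name and
// operand values are assumptions for illustration).
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn _example_fixupimm_round_ps() {
    let a = _mm512_set1_ps(f32::NAN);
    let b = _mm512_set1_ps(1.0);
    // An all-zero token table requests the default "no fix-up" response for
    // every input class, and imm8 = 0 reports no exceptions.
    let c = _mm512_set1_epi32(0);
    let r = _mm512_fixupimm_round_ps(a, b, c, 0, _MM_FROUND_CUR_DIRECTION);
    let _ = r;
}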
5940
5941 /// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.\
5942 ///
5943 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
5944 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_fixupimm_round_ps&expand=2506)
5945 #[inline]
5946 #[target_feature(enable = "avx512f")]
5947 #[cfg_attr(test, assert_instr(vfixupimmps, imm8 = 0, sae = 8))]
5948 #[rustc_args_required_const(4, 5)]
5949 pub unsafe fn _mm512_mask_fixupimm_round_ps(
5950 a: __m512,
5951 k: __mmask16,
5952 b: __m512,
5953 c: __m512i,
5954 imm8: i32,
5955 sae: i32,
5956 ) -> __m512 {
5957 let a = a.as_f32x16();
5958 let b = b.as_f32x16();
5959 let c = c.as_i32x16();
5960 macro_rules! call {
5961 ($imm8:expr, $imm4:expr) => {
5962 vfixupimmps(a, b, c, $imm8, k, $imm4)
5963 };
5964 }
5965 let r = constify_imm8_roundscale!(imm8, sae, call);
5966 transmute(r)
5967 }
5968
5969 /// Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.\
5970 ///
5971 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
5972 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_fixupimm_round_ps&expand=2507)
5973 #[inline]
5974 #[target_feature(enable = "avx512f")]
5975 #[cfg_attr(test, assert_instr(vfixupimmps, imm8 = 0, sae = 8))]
5976 #[rustc_args_required_const(4, 5)]
5977 pub unsafe fn _mm512_maskz_fixupimm_round_ps(
5978 k: __mmask16,
5979 a: __m512,
5980 b: __m512,
5981 c: __m512i,
5982 imm8: i32,
5983 sae: i32,
5984 ) -> __m512 {
5985 let a = a.as_f32x16();
5986 let b = b.as_f32x16();
5987 let c = c.as_i32x16();
5988 macro_rules! call {
5989 ($imm8:expr, $imm4:expr) => {
5990 vfixupimmpsz(a, b, c, $imm8, k, $imm4)
5991 };
5992 }
5993 let r = constify_imm8_roundscale!(imm8, sae, call);
5994 transmute(r)
5995 }
5996
5997 /// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting.\
5998 ///
5999 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
6000 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_fixupimm_round_pd&expand=2502)
6001 #[inline]
6002 #[target_feature(enable = "avx512f")]
6003 #[cfg_attr(test, assert_instr(vfixupimmpd, imm8 = 0, sae = 8))]
6004 #[rustc_args_required_const(3, 4)]
6005 pub unsafe fn _mm512_fixupimm_round_pd(
6006 a: __m512d,
6007 b: __m512d,
6008 c: __m512i,
6009 imm8: i32,
6010 sae: i32,
6011 ) -> __m512d {
6012 let a = a.as_f64x8();
6013 let b = b.as_f64x8();
6014 let c = c.as_i64x8();
6015 macro_rules! call {
6016 ($imm8:expr, $imm4:expr) => {
6017 vfixupimmpd(a, b, c, $imm8, 0b11111111, $imm4)
6018 };
6019 }
6020 let r = constify_imm8_roundscale!(imm8, sae, call);
6021 transmute(r)
6022 }
6023
6024 /// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.\
6025 ///
6026 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
6027 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_fixupimm_round_pd&expand=2503)
6028 #[inline]
6029 #[target_feature(enable = "avx512f")]
6030 #[cfg_attr(test, assert_instr(vfixupimmpd, imm8 = 0, sae = 8))]
6031 #[rustc_args_required_const(4, 5)]
6032 pub unsafe fn _mm512_mask_fixupimm_round_pd(
6033 a: __m512d,
6034 k: __mmask8,
6035 b: __m512d,
6036 c: __m512i,
6037 imm8: i32,
6038 sae: i32,
6039 ) -> __m512d {
6040 let a = a.as_f64x8();
6041 let b = b.as_f64x8();
6042 let c = c.as_i64x8();
6043 macro_rules! call {
6044 ($imm8:expr, $imm4:expr) => {
6045 vfixupimmpd(a, b, c, $imm8, k, $imm4)
6046 };
6047 }
6048 let r = constify_imm8_roundscale!(imm8, sae, call);
6049 transmute(r)
6050 }
6051
6052 /// Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.\
6053 ///
6054 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
6055 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_fixupimm_round_pd&expand=2504)
6056 #[inline]
6057 #[target_feature(enable = "avx512f")]
6058 #[cfg_attr(test, assert_instr(vfixupimmpd, imm8 = 0, sae = 8))]
6059 #[rustc_args_required_const(4, 5)]
6060 pub unsafe fn _mm512_maskz_fixupimm_round_pd(
6061 k: __mmask8,
6062 a: __m512d,
6063 b: __m512d,
6064 c: __m512i,
6065 imm8: i32,
6066 sae: i32,
6067 ) -> __m512d {
6068 let a = a.as_f64x8();
6069 let b = b.as_f64x8();
6070 let c = c.as_i64x8();
6071 macro_rules! call {
6072 ($imm8:expr, $imm4:expr) => {
6073 vfixupimmpdz(a, b, c, $imm8, k, $imm4)
6074 };
6075 }
6076 let r = constify_imm8_roundscale!(imm8, sae, call);
6077 transmute(r)
6078 }
6079
6080 /// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
6081 /// The mantissa is normalized to the interval specified by interv, which can take the following values:\
6082 /// _MM_MANT_NORM_1_2 // interval [1, 2)\
6083 /// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
6084 /// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
6085 /// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
6086 /// The sign is determined by sc which can take the following values:\
6087 /// _MM_MANT_SIGN_src // sign = sign(src)\
6088 /// _MM_MANT_SIGN_zero // sign = 0\
6089 /// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\
6090 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
6091 ///
6092 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_getmant_round_ps&expand=2886)
6093 #[inline]
6094 #[target_feature(enable = "avx512f")]
6095 #[cfg_attr(test, assert_instr(vgetmantps, norm = 0, sign = 0, sae = 4))]
6096 #[rustc_args_required_const(1, 2, 3)]
6097 pub unsafe fn _mm512_getmant_round_ps(
6098 a: __m512,
6099 norm: _MM_MANTISSA_NORM_ENUM,
6100 sign: _MM_MANTISSA_SIGN_ENUM,
6101 sae: i32,
6102 ) -> __m512 {
6103 macro_rules! call {
6104 ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => {
6105 vgetmantps(
6106 a.as_f32x16(),
6107 $imm2 << 2 | $imm4_1,
6108 _mm512_setzero_ps().as_f32x16(),
6109 0b11111111_11111111,
6110 $imm4_2,
6111 )
6112 };
6113 }
6114 let r = constify_imm4_mantissas_sae!(norm, sign, sae, call);
6115 transmute(r)
6116 }
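
// A scalar sketch of the magnitude normalization to [1, 2) described by the
// _MM_MANT_NORM_1_2 interval above, valid for normal finite inputs
// (illustrative only; the helper name is not part of the public API).
#[cfg(test)]
#[allow(dead_code)]
fn _example_getmant_norm_1_2(x: f32) -> f32 {
    // Keep the 23 mantissa bits, clear the sign and force the biased exponent
    // to 127 (2^0): e.g. 10.0 = 1.25 * 2^3 maps to 1.25.
    f32::from_bits((x.to_bits() & 0x007f_ffff) | 0x3f80_0000)
}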
6117
6118 /// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
6119 /// The mantissa is normalized to the interval specified by interv, which can take the following values:\
6120 /// _MM_MANT_NORM_1_2 // interval [1, 2)\
6121 /// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
6122 /// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
6123 /// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
6124 /// The sign is determined by sc which can take the following values:\
6125 /// _MM_MANT_SIGN_src // sign = sign(src)\
6126 /// _MM_MANT_SIGN_zero // sign = 0\
6127 /// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\
6128 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
6129 ///
6130 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_getmant_round_ps&expand=2887)
6131 #[inline]
6132 #[target_feature(enable = "avx512f")]
6133 #[cfg_attr(test, assert_instr(vgetmantps, norm = 0, sign = 0, sae = 4))]
6134 #[rustc_args_required_const(3, 4, 5)]
6135 pub unsafe fn _mm512_mask_getmant_round_ps(
6136 src: __m512,
6137 k: __mmask16,
6138 a: __m512,
6139 norm: _MM_MANTISSA_NORM_ENUM,
6140 sign: _MM_MANTISSA_SIGN_ENUM,
6141 sae: i32,
6142 ) -> __m512 {
6143 macro_rules! call {
6144 ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => {
6145 vgetmantps(
6146 a.as_f32x16(),
6147 $imm2 << 2 | $imm4_1,
6148 src.as_f32x16(),
6149 k,
6150 $imm4_2,
6151 )
6152 };
6153 }
6154 let r = constify_imm4_mantissas_sae!(norm, sign, sae, call);
6155 transmute(r)
6156 }
6157
6158 /// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
6159 /// The mantissa is normalized to the interval specified by interv, which can take the following values:\
6160 /// _MM_MANT_NORM_1_2 // interval [1, 2)\
6161 /// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
6162 /// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
6163 /// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
6164 /// The sign is determined by sc which can take the following values:\
6165 /// _MM_MANT_SIGN_src // sign = sign(src)\
6166 /// _MM_MANT_SIGN_zero // sign = 0\
6167 /// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\
6168 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
6169 ///
6170 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_getmant_round_ps&expand=2888)
6171 #[inline]
6172 #[target_feature(enable = "avx512f")]
6173 #[cfg_attr(test, assert_instr(vgetmantps, norm = 0, sign = 0, sae = 4))]
6174 #[rustc_args_required_const(2, 3, 4)]
6175 pub unsafe fn _mm512_maskz_getmant_round_ps(
6176 k: __mmask16,
6177 a: __m512,
6178 norm: _MM_MANTISSA_NORM_ENUM,
6179 sign: _MM_MANTISSA_SIGN_ENUM,
6180 sae: i32,
6181 ) -> __m512 {
6182 macro_rules! call {
6183 ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => {
6184 vgetmantps(
6185 a.as_f32x16(),
6186 $imm2 << 2 | $imm4_1,
6187 _mm512_setzero_ps().as_f32x16(),
6188 k,
6189 $imm4_2,
6190 )
6191 };
6192 }
6193 let r = constify_imm4_mantissas_sae!(norm, sign, sae, call);
6194 transmute(r)
6195 }
6196
6197 /// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
6198 /// The mantissa is normalized to the interval specified by interv, which can take the following values:\
6199 /// _MM_MANT_NORM_1_2 // interval [1, 2)\
6200 /// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
6201 /// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
6202 /// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
6203 /// The sign is determined by sc which can take the following values:\
6204 /// _MM_MANT_SIGN_src // sign = sign(src)\
6205 /// _MM_MANT_SIGN_zero // sign = 0\
6206 /// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\
6207 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
6208 ///
6209 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_getmant_round_pd&expand=2883)
6210 #[inline]
6211 #[target_feature(enable = "avx512f")]
6212 #[cfg_attr(test, assert_instr(vgetmantpd, norm = 0, sign = 0, sae = 4))]
6213 #[rustc_args_required_const(1, 2, 3)]
6214 pub unsafe fn _mm512_getmant_round_pd(
6215 a: __m512d,
6216 norm: _MM_MANTISSA_NORM_ENUM,
6217 sign: _MM_MANTISSA_SIGN_ENUM,
6218 sae: i32,
6219 ) -> __m512d {
6220 macro_rules! call {
6221 ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => {
6222 vgetmantpd(
6223 a.as_f64x8(),
6224 $imm2 << 2 | $imm4_1,
6225 _mm512_setzero_pd().as_f64x8(),
6226 0b11111111,
6227 $imm4_2,
6228 )
6229 };
6230 }
6231 let r = constify_imm4_mantissas_sae!(norm, sign, sae, call);
6232 transmute(r)
6233 }
6234
6235 /// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
6236 /// The mantissa is normalized to the interval specified by interv, which can take the following values:\
6237 /// _MM_MANT_NORM_1_2 // interval [1, 2)\
6238 /// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
6239 /// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
6240 /// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
6241 /// The sign is determined by sc which can take the following values:\
6242 /// _MM_MANT_SIGN_src // sign = sign(src)\
6243 /// _MM_MANT_SIGN_zero // sign = 0\
6244 /// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\
6245 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
6246 ///
6247 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_getmant_round_pd&expand=2884)
6248 #[inline]
6249 #[target_feature(enable = "avx512f")]
6250 #[cfg_attr(test, assert_instr(vgetmantpd, norm = 0, sign = 0, sae = 4))]
6251 #[rustc_args_required_const(3, 4, 5)]
6252 pub unsafe fn _mm512_mask_getmant_round_pd(
6253 src: __m512d,
6254 k: __mmask8,
6255 a: __m512d,
6256 norm: _MM_MANTISSA_NORM_ENUM,
6257 sign: _MM_MANTISSA_SIGN_ENUM,
6258 sae: i32,
6259 ) -> __m512d {
6260 macro_rules! call {
6261 ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => {
6262 vgetmantpd(
6263 a.as_f64x8(),
6264 $imm2 << 2 | $imm4_1,
6265 src.as_f64x8(),
6266 k,
6267 $imm4_2,
6268 )
6269 };
6270 }
6271 let r = constify_imm4_mantissas_sae!(norm, sign, sae, call);
6272 transmute(r)
6273 }
6274
6275 /// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
6276 /// The mantissa is normalized to the interval specified by interv, which can take the following values:\
6277 /// _MM_MANT_NORM_1_2 // interval [1, 2)\
6278 /// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
6279 /// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
6280 /// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
6281 /// The sign is determined by sc which can take the following values:\
6282 /// _MM_MANT_SIGN_src // sign = sign(src)\
6283 /// _MM_MANT_SIGN_zero // sign = 0\
6284 /// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\
6285 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
6286 ///
6287 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_getmant_round_pd&expand=2885)
6288 #[inline]
6289 #[target_feature(enable = "avx512f")]
6290 #[cfg_attr(test, assert_instr(vgetmantpd, norm = 0, sign = 0, sae = 4))]
6291 #[rustc_args_required_const(2, 3, 4)]
6292 pub unsafe fn _mm512_maskz_getmant_round_pd(
6293 k: __mmask8,
6294 a: __m512d,
6295 norm: _MM_MANTISSA_NORM_ENUM,
6296 sign: _MM_MANTISSA_SIGN_ENUM,
6297 sae: i32,
6298 ) -> __m512d {
6299 macro_rules! call {
6300 ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => {
6301 vgetmantpd(
6302 a.as_f64x8(),
6303 $imm2 << 2 | $imm4_1,
6304 _mm512_setzero_pd().as_f64x8(),
6305 k,
6306 $imm4_2,
6307 )
6308 };
6309 }
6310 let r = constify_imm4_mantissas_sae!(norm, sign, sae, call);
6311 transmute(r)
6312 }
6313
6314 /// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst.
6315 ///
6316 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=cvtps_epi32&expand=1737)
6317 #[inline]
6318 #[target_feature(enable = "avx512f")]
6319 #[cfg_attr(test, assert_instr(vcvtps2dq))]
6320 pub unsafe fn _mm512_cvtps_epi32(a: __m512) -> __m512i {
6321 transmute(vcvtps2dq(
6322 a.as_f32x16(),
6323 _mm512_setzero_si512().as_i32x16(),
6324 0b11111111_11111111,
6325 _MM_FROUND_CUR_DIRECTION,
6326 ))
6327 }
6328
6329 /// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
6330 ///
6331 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvtps_epi32&expand=1738)
6332 #[inline]
6333 #[target_feature(enable = "avx512f")]
6334 #[cfg_attr(test, assert_instr(vcvtps2dq))]
6335 pub unsafe fn _mm512_mask_cvtps_epi32(src: __m512i, k: __mmask16, a: __m512) -> __m512i {
6336 transmute(vcvtps2dq(
6337 a.as_f32x16(),
6338 src.as_i32x16(),
6339 k,
6340 _MM_FROUND_CUR_DIRECTION,
6341 ))
6342 }
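
// A minimal sketch of the writemask merge above (illustrative only, assumes
// AVX512F support; the helper name is not part of the public API).
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn _example_mask_cvtps_epi32() {
    let src = _mm512_set1_epi32(-1);
    let a = _mm512_set1_ps(1.4);
    // The low 8 lanes are converted (1.4 -> 1 under the current rounding
    // mode); the high 8 lanes keep the value -1 from `src`.
    let r = _mm512_mask_cvtps_epi32(src, 0b00000000_11111111, a);
    let _ = r;
}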
6343
6344 /// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
6345 ///
6346 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvtps_epi32&expand=1739)
6347 #[inline]
6348 #[target_feature(enable = "avx512f")]
6349 #[cfg_attr(test, assert_instr(vcvtps2dq))]
6350 pub unsafe fn _mm512_maskz_cvtps_epi32(k: __mmask16, a: __m512) -> __m512i {
6351 transmute(vcvtps2dq(
6352 a.as_f32x16(),
6353 _mm512_setzero_si512().as_i32x16(),
6354 k,
6355 _MM_FROUND_CUR_DIRECTION,
6356 ))
6357 }
6358
6359 /// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst.
6360 ///
6361 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvtps_epu32&expand=1755)
6362 #[inline]
6363 #[target_feature(enable = "avx512f")]
6364 #[cfg_attr(test, assert_instr(vcvtps2udq))]
6365 pub unsafe fn _mm512_cvtps_epu32(a: __m512) -> __m512i {
6366 transmute(vcvtps2udq(
6367 a.as_f32x16(),
6368 _mm512_setzero_si512().as_u32x16(),
6369 0b11111111_11111111,
6370 _MM_FROUND_CUR_DIRECTION,
6371 ))
6372 }
6373
6374 /// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
6375 ///
6376 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvtps_epu32&expand=1756)
6377 #[inline]
6378 #[target_feature(enable = "avx512f")]
6379 #[cfg_attr(test, assert_instr(vcvtps2udq))]
6380 pub unsafe fn _mm512_mask_cvtps_epu32(src: __m512i, k: __mmask16, a: __m512) -> __m512i {
6381 transmute(vcvtps2udq(
6382 a.as_f32x16(),
6383 src.as_u32x16(),
6384 k,
6385 _MM_FROUND_CUR_DIRECTION,
6386 ))
6387 }
6388
6389 /// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
6390 ///
6391 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=maskz_cvt_roundps_epu32&expand=1343)
6392 #[inline]
6393 #[target_feature(enable = "avx512f")]
6394 #[cfg_attr(test, assert_instr(vcvtps2udq))]
6395 pub unsafe fn _mm512_maskz_cvtps_epu32(k: __mmask16, a: __m512) -> __m512i {
6396 transmute(vcvtps2udq(
6397 a.as_f32x16(),
6398 _mm512_setzero_si512().as_u32x16(),
6399 k,
6400 _MM_FROUND_CUR_DIRECTION,
6401 ))
6402 }
6403
6404 /// Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.
6405 ///
6406 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvtps_pd&expand=1769)
6407 #[inline]
6408 #[target_feature(enable = "avx512f")]
6409 #[cfg_attr(test, assert_instr(vcvtps2pd))]
6410 pub unsafe fn _mm512_cvtps_pd(a: __m256) -> __m512d {
6411 transmute(vcvtps2pd(
6412 a.as_f32x8(),
6413 _mm512_setzero_pd().as_f64x8(),
6414 0b11111111,
6415 _MM_FROUND_CUR_DIRECTION,
6416 ))
6417 }
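
// A minimal sketch of widening the 8 single-precision elements above to
// double precision (illustrative only, assumes AVX512F support; the helper
// name is not part of the public API).
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn _example_cvtps_pd() {
    let a = _mm256_set1_ps(0.5);
    // Every f32 lane widens exactly to the f64 value 0.5.
    let r = _mm512_cvtps_pd(a);
    let _ = r;
}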
6418
6419 /// Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
6420 ///
6421 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvtps_pd&expand=1770)
6422 #[inline]
6423 #[target_feature(enable = "avx512f")]
6424 #[cfg_attr(test, assert_instr(vcvtps2pd))]
6425 pub unsafe fn _mm512_mask_cvtps_pd(src: __m512d, k: __mmask8, a: __m256) -> __m512d {
6426 transmute(vcvtps2pd(
6427 a.as_f32x8(),
6428 src.as_f64x8(),
6429 k,
6430 _MM_FROUND_CUR_DIRECTION,
6431 ))
6432 }
6433
6434 /// Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
6435 ///
6436 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvtps_pd&expand=1771)
6437 #[inline]
6438 #[target_feature(enable = "avx512f")]
6439 #[cfg_attr(test, assert_instr(vcvtps2pd))]
6440 pub unsafe fn _mm512_maskz_cvtps_pd(k: __mmask8, a: __m256) -> __m512d {
6441 transmute(vcvtps2pd(
6442 a.as_f32x8(),
6443 _mm512_setzero_pd().as_f64x8(),
6444 k,
6445 _MM_FROUND_CUR_DIRECTION,
6446 ))
6447 }
6448
6449 /// Performs element-by-element conversion of the lower half of packed single-precision (32-bit) floating-point elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst.
6450 ///
6451 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvtpslo_pd&expand=1784)
6452 #[inline]
6453 #[target_feature(enable = "avx512f")]
6454 #[cfg_attr(test, assert_instr(vcvtps2pd))]
6455 pub unsafe fn _mm512_cvtpslo_pd(v2: __m512) -> __m512d {
6456 transmute(vcvtps2pd(
6457 _mm512_castps512_ps256(v2).as_f32x8(),
6458 _mm512_setzero_pd().as_f64x8(),
6459 0b11111111,
6460 _MM_FROUND_CUR_DIRECTION,
6461 ))
6462 }
6463
6464 /// Performs element-by-element conversion of the lower half of packed single-precision (32-bit) floating-point elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
6465 ///
6466 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvtpslo_pd&expand=1785)
6467 #[inline]
6468 #[target_feature(enable = "avx512f")]
6469 #[cfg_attr(test, assert_instr(vcvtps2pd))]
6470 pub unsafe fn _mm512_mask_cvtpslo_pd(src: __m512d, k: __mmask8, v2: __m512) -> __m512d {
6471 transmute(vcvtps2pd(
6472 _mm512_castps512_ps256(v2).as_f32x8(),
6473 src.as_f64x8(),
6474 k,
6475 _MM_FROUND_CUR_DIRECTION,
6476 ))
6477 }
6478
6479 /// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.
6480 ///
6481 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvtpd_ps&expand=1712)
6482 #[inline]
6483 #[target_feature(enable = "avx512f")]
6484 #[cfg_attr(test, assert_instr(vcvtpd2ps))]
6485 pub unsafe fn _mm512_cvtpd_ps(a: __m512d) -> __m256 {
6486 transmute(vcvtpd2ps(
6487 a.as_f64x8(),
6488 _mm256_setzero_ps().as_f32x8(),
6489 0b11111111,
6490 _MM_FROUND_CUR_DIRECTION,
6491 ))
6492 }
6493
6494 /// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
6495 ///
6496 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvtpd_ps&expand=1713)
6497 #[inline]
6498 #[target_feature(enable = "avx512f")]
6499 #[cfg_attr(test, assert_instr(vcvtpd2ps))]
6500 pub unsafe fn _mm512_mask_cvtpd_ps(src: __m256, k: __mmask8, a: __m512d) -> __m256 {
6501 transmute(vcvtpd2ps(
6502 a.as_f64x8(),
6503 src.as_f32x8(),
6504 k,
6505 _MM_FROUND_CUR_DIRECTION,
6506 ))
6507 }
6508
6509 /// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
6510 ///
6511 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvtpd_ps&expand=1714)
6512 #[inline]
6513 #[target_feature(enable = "avx512f")]
6514 #[cfg_attr(test, assert_instr(vcvtpd2ps))]
6515 pub unsafe fn _mm512_maskz_cvtpd_ps(k: __mmask8, a: __m512d) -> __m256 {
6516 transmute(vcvtpd2ps(
6517 a.as_f64x8(),
6518 _mm256_setzero_ps().as_f32x8(),
6519 k,
6520 _MM_FROUND_CUR_DIRECTION,
6521 ))
6522 }
6523
6524 /// Performs an element-by-element conversion of packed double-precision (64-bit) floating-point elements in v2 to single-precision (32-bit) floating-point elements and stores them in dst. The elements are stored in the lower half of the results vector, while the remaining upper half locations are set to 0.
6525 ///
6526 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvtpd_pslo&expand=1715)
6527 #[inline]
6528 #[target_feature(enable = "avx512f")]
6529 #[cfg_attr(test, assert_instr(vcvtpd2ps))]
6530 pub unsafe fn _mm512_cvtpd_pslo(v2: __m512d) -> __m512 {
6531 let r: f32x8 = vcvtpd2ps(
6532 v2.as_f64x8(),
6533 _mm256_setzero_ps().as_f32x8(),
6534 0b11111111,
6535 _MM_FROUND_CUR_DIRECTION,
6536 );
6537 simd_shuffle16(
6538 r,
6539 _mm256_setzero_ps().as_f32x8(),
6540 [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8],
6541 )
6542 }
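
// A minimal sketch of the narrowing conversion above: the 8 converted f32
// values land in the low half of the 512-bit result and the high half is
// zeroed (illustrative only, assumes AVX512F support; the helper name is not
// part of the public API).
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn _example_cvtpd_pslo() {
    let v2 = _mm512_set1_pd(0.25);
    // Lanes 0..7 of `r` are 0.25_f32, lanes 8..15 are 0.0.
    let r = _mm512_cvtpd_pslo(v2);
    let _ = r;
}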
6543
6544 /// Performs an element-by-element conversion of packed double-precision (64-bit) floating-point elements in v2 to single-precision (32-bit) floating-point elements and stores them in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The elements are stored in the lower half of the results vector, while the remaining upper half locations are set to 0.
6545 ///
6546 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvtpd_pslo&expand=1716)
6547 #[inline]
6548 #[target_feature(enable = "avx512f")]
6549 #[cfg_attr(test, assert_instr(vcvtpd2ps))]
6550 pub unsafe fn _mm512_mask_cvtpd_pslo(src: __m512, k: __mmask8, v2: __m512d) -> __m512 {
6551 let r: f32x8 = vcvtpd2ps(
6552 v2.as_f64x8(),
6553 _mm512_castps512_ps256(src).as_f32x8(),
6554 k,
6555 _MM_FROUND_CUR_DIRECTION,
6556 );
6557 simd_shuffle16(
6558 r,
6559 _mm256_setzero_ps().as_f32x8(),
6560 [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8],
6561 )
6562 }
6563
6564 /// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst.
6565 ///
6566 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvtepi8_epi32&expand=1535)
6567 #[inline]
6568 #[target_feature(enable = "avx512f")]
6569 #[cfg_attr(test, assert_instr(vpmovsxbd))]
6570 pub unsafe fn _mm512_cvtepi8_epi32(a: __m128i) -> __m512i {
6571 let a = a.as_i8x16();
6572 transmute::<i32x16, _>(simd_cast(a))
6573 }
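
// A minimal sketch of the sign extension above (illustrative only, assumes
// AVX512F support; the helper name is not part of the public API).
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn _example_cvtepi8_epi32() {
    let a = _mm_set1_epi8(-2);
    // Each i8 lane sign-extends, so every i32 lane of `r` is -2.
    let r = _mm512_cvtepi8_epi32(a);
    let _ = r;
}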
6574
6575 /// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
6576 ///
6577 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvtepi8_epi32&expand=1536)
6578 #[inline]
6579 #[target_feature(enable = "avx512f")]
6580 #[cfg_attr(test, assert_instr(vpmovsxbd))]
6581 pub unsafe fn _mm512_mask_cvtepi8_epi32(src: __m512i, k: __mmask16, a: __m128i) -> __m512i {
6582 let convert = _mm512_cvtepi8_epi32(a).as_i32x16();
6583 transmute(simd_select_bitmask(k, convert, src.as_i32x16()))
6584 }
6585
6586 /// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
6587 ///
6588 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvtepi8_epi32&expand=1537)
6589 #[inline]
6590 #[target_feature(enable = "avx512f")]
6591 #[cfg_attr(test, assert_instr(vpmovsxbd))]
6592 pub unsafe fn _mm512_maskz_cvtepi8_epi32(k: __mmask16, a: __m128i) -> __m512i {
6593 let convert = _mm512_cvtepi8_epi32(a).as_i32x16();
6594 let zero = _mm512_setzero_si512().as_i32x16();
6595 transmute(simd_select_bitmask(k, convert, zero))
6596 }
6597
6598 /// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst.
6599 ///
6600 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvtepi8_epi64&expand=1544)
6601 #[inline]
6602 #[target_feature(enable = "avx512f")]
6603 #[cfg_attr(test, assert_instr(vpmovsxbq))]
6604 pub unsafe fn _mm512_cvtepi8_epi64(a: __m128i) -> __m512i {
6605 let a = a.as_i8x16();
6606 let v64: i8x8 = simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
6607 transmute::<i64x8, _>(simd_cast(v64))
6608 }
6609
6610 /// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
6611 ///
6612 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvtepi8_epi64&expand=1545)
6613 #[inline]
6614 #[target_feature(enable = "avx512f")]
6615 #[cfg_attr(test, assert_instr(vpmovsxbq))]
6616 pub unsafe fn _mm512_mask_cvtepi8_epi64(src: __m512i, k: __mmask8, a: __m128i) -> __m512i {
6617 let convert = _mm512_cvtepi8_epi64(a).as_i64x8();
6618 transmute(simd_select_bitmask(k, convert, src.as_i64x8()))
6619 }
6620
6621 /// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
6622 ///
6623 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvtepi8_epi64&expand=1546)
6624 #[inline]
6625 #[target_feature(enable = "avx512f")]
6626 #[cfg_attr(test, assert_instr(vpmovsxbq))]
6627 pub unsafe fn _mm512_maskz_cvtepi8_epi64(k: __mmask8, a: __m128i) -> __m512i {
6628 let convert = _mm512_cvtepi8_epi64(a).as_i64x8();
6629 let zero = _mm512_setzero_si512().as_i64x8();
6630 transmute(simd_select_bitmask(k, convert, zero))
6631 }
6632
6633 /// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers, and store the results in dst.
6634 ///
6635 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvtepu8_epi32&expand=1621)
6636 #[inline]
6637 #[target_feature(enable = "avx512f")]
6638 #[cfg_attr(test, assert_instr(vpmovzxbd))]
6639 pub unsafe fn _mm512_cvtepu8_epi32(a: __m128i) -> __m512i {
6640 let a = a.as_u8x16();
6641 transmute::<i32x16, _>(simd_cast(a))
6642 }
6643
6644 /// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
6645 ///
6646 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvtepu8_epi32&expand=1622)
6647 #[inline]
6648 #[target_feature(enable = "avx512f")]
6649 #[cfg_attr(test, assert_instr(vpmovzxbd))]
6650 pub unsafe fn _mm512_mask_cvtepu8_epi32(src: __m512i, k: __mmask16, a: __m128i) -> __m512i {
6651 let convert = _mm512_cvtepu8_epi32(a).as_i32x16();
6652 transmute(simd_select_bitmask(k, convert, src.as_i32x16()))
6653 }
6654
6655 /// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
6656 ///
6657 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvtepu8_epi32&expand=1623)
6658 #[inline]
6659 #[target_feature(enable = "avx512f")]
6660 #[cfg_attr(test, assert_instr(vpmovzxbd))]
6661 pub unsafe fn _mm512_maskz_cvtepu8_epi32(k: __mmask16, a: __m128i) -> __m512i {
6662 let convert = _mm512_cvtepu8_epi32(a).as_i32x16();
6663 let zero = _mm512_setzero_si512().as_i32x16();
6664 transmute(simd_select_bitmask(k, convert, zero))
6665 }
6666
6667 /// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst.
6668 ///
6669 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvtepu8_epi64&expand=1630)
6670 #[inline]
6671 #[target_feature(enable = "avx512f")]
6672 #[cfg_attr(test, assert_instr(vpmovzxbq))]
6673 pub unsafe fn _mm512_cvtepu8_epi64(a: __m128i) -> __m512i {
6674 let a = a.as_u8x16();
6675 let v64: u8x8 = simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
6676 transmute::<i64x8, _>(simd_cast(v64))
6677 }
6678
6679 /// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
6680 ///
6681 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvtepu8_epi64&expand=1631)
6682 #[inline]
6683 #[target_feature(enable = "avx512f")]
6684 #[cfg_attr(test, assert_instr(vpmovzxbq))]
6685 pub unsafe fn _mm512_mask_cvtepu8_epi64(src: __m512i, k: __mmask8, a: __m128i) -> __m512i {
6686 let convert = _mm512_cvtepu8_epi64(a).as_i64x8();
6687 transmute(simd_select_bitmask(k, convert, src.as_i64x8()))
6688 }
6689
6690 /// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
6691 ///
6692 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvtepu8_epi64&expand=1632)
6693 #[inline]
6694 #[target_feature(enable = "avx512f")]
6695 #[cfg_attr(test, assert_instr(vpmovzxbq))]
6696 pub unsafe fn _mm512_maskz_cvtepu8_epi64(k: __mmask8, a: __m128i) -> __m512i {
6697 let convert = _mm512_cvtepu8_epi64(a).as_i64x8();
6698 let zero = _mm512_setzero_si512().as_i64x8();
6699 transmute(simd_select_bitmask(k, convert, zero))
6700 }
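
// --- Illustrative usage sketch (not part of upstream stdarch) --------------------
// Zero extension treats the input bytes as unsigned, so 0xFF widens to 255 rather
// than -1. The helper name `sketch_cvtepu8_usage` is hypothetical.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn sketch_cvtepu8_usage() {
    let a = _mm_set1_epi8(-1); // every byte is 0xFF
    // All sixteen 32-bit lanes hold 255 (zero extension, not sign extension).
    let dwords = _mm512_cvtepu8_epi32(a);
    // Only the low 8 bytes are used for the 64-bit widening; every lane holds 255.
    let qwords = _mm512_maskz_cvtepu8_epi64(0b1111_1111, a);
    let _ = (dwords, qwords);
}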
6701
6702 /// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst.
6703 ///
6704 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvtepi16_epi32&expand=1389)
6705 #[inline]
6706 #[target_feature(enable = "avx512f")]
6707 #[cfg_attr(test, assert_instr(vpmovsxwd))]
6708 pub unsafe fn _mm512_cvtepi16_epi32(a: __m256i) -> __m512i {
6709 let a = a.as_i16x16();
6710 transmute::<i32x16, _>(simd_cast(a))
6711 }
6712
6713 /// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
6714 ///
6715 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvtepi16_epi32&expand=1390)
6716 #[inline]
6717 #[target_feature(enable = "avx512f")]
6718 #[cfg_attr(test, assert_instr(vpmovsxwd))]
6719 pub unsafe fn _mm512_mask_cvtepi16_epi32(src: __m512i, k: __mmask16, a: __m256i) -> __m512i {
6720 let convert = _mm512_cvtepi16_epi32(a).as_i32x16();
6721 transmute(simd_select_bitmask(k, convert, src.as_i32x16()))
6722 }
6723
6724 /// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
6725 ///
6726 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvtepi16_epi32&expand=1391)
6727 #[inline]
6728 #[target_feature(enable = "avx512f")]
6729 #[cfg_attr(test, assert_instr(vpmovsxwd))]
6730 pub unsafe fn _mm512_maskz_cvtepi16_epi32(k: __mmask16, a: __m256i) -> __m512i {
6731 let convert = _mm512_cvtepi16_epi32(a).as_i32x16();
6732 let zero = _mm512_setzero_si512().as_i32x16();
6733 transmute(simd_select_bitmask(k, convert, zero))
6734 }
6735
6736 /// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst.
6737 ///
6738 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvtepi16_epi64&expand=1398)
6739 #[inline]
6740 #[target_feature(enable = "avx512f")]
6741 #[cfg_attr(test, assert_instr(vpmovsxwq))]
6742 pub unsafe fn _mm512_cvtepi16_epi64(a: __m128i) -> __m512i {
6743 let a = a.as_i16x8();
6744 transmute::<i64x8, _>(simd_cast(a))
6745 }
6746
6747 /// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
6748 ///
6749 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvtepi16_epi64&expand=1399)
6750 #[inline]
6751 #[target_feature(enable = "avx512f")]
6752 #[cfg_attr(test, assert_instr(vpmovsxwq))]
6753 pub unsafe fn _mm512_mask_cvtepi16_epi64(src: __m512i, k: __mmask8, a: __m128i) -> __m512i {
6754 let convert = _mm512_cvtepi16_epi64(a).as_i64x8();
6755 transmute(simd_select_bitmask(k, convert, src.as_i64x8()))
6756 }
6757
6758 /// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
6759 ///
6760 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvtepi16_epi64&expand=1400)
6761 #[inline]
6762 #[target_feature(enable = "avx512f")]
6763 #[cfg_attr(test, assert_instr(vpmovsxwq))]
6764 pub unsafe fn _mm512_maskz_cvtepi16_epi64(k: __mmask8, a: __m128i) -> __m512i {
6765 let convert = _mm512_cvtepi16_epi64(a).as_i64x8();
6766 let zero = _mm512_setzero_si512().as_i64x8();
6767 transmute(simd_select_bitmask(k, convert, zero))
6768 }
6769
6770 /// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst.
6771 ///
6772 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvtepu16_epi32&expand=1553)
6773 #[inline]
6774 #[target_feature(enable = "avx512f")]
6775 #[cfg_attr(test, assert_instr(vpmovzxwd))]
6776 pub unsafe fn _mm512_cvtepu16_epi32(a: __m256i) -> __m512i {
6777 let a = a.as_u16x16();
6778 transmute::<i32x16, _>(simd_cast(a))
6779 }
6780
6781 /// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
6782 ///
6783 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvtepu16_epi32&expand=1554)
6784 #[inline]
6785 #[target_feature(enable = "avx512f")]
6786 #[cfg_attr(test, assert_instr(vpmovzxwd))]
6787 pub unsafe fn _mm512_mask_cvtepu16_epi32(src: __m512i, k: __mmask16, a: __m256i) -> __m512i {
6788 let convert = _mm512_cvtepu16_epi32(a).as_i32x16();
6789 transmute(simd_select_bitmask(k, convert, src.as_i32x16()))
6790 }
6791
6792 /// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
6793 ///
6794 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvtepu16_epi32&expand=1555)
6795 #[inline]
6796 #[target_feature(enable = "avx512f")]
6797 #[cfg_attr(test, assert_instr(vpmovzxwd))]
6798 pub unsafe fn _mm512_maskz_cvtepu16_epi32(k: __mmask16, a: __m256i) -> __m512i {
6799 let convert = _mm512_cvtepu16_epi32(a).as_i32x16();
6800 let zero = _mm512_setzero_si512().as_i32x16();
6801 transmute(simd_select_bitmask(k, convert, zero))
6802 }
6803
6804 /// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers, and store the results in dst.
6805 ///
6806 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvtepu16_epi64&expand=1562)
6807 #[inline]
6808 #[target_feature(enable = "avx512f")]
6809 #[cfg_attr(test, assert_instr(vpmovzxwq))]
6810 pub unsafe fn _mm512_cvtepu16_epi64(a: __m128i) -> __m512i {
6811 let a = a.as_u16x8();
6812 transmute::<i64x8, _>(simd_cast(a))
6813 }
6814
6815 /// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
6816 ///
6817 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvtepu16_epi64&expand=1563)
6818 #[inline]
6819 #[target_feature(enable = "avx512f")]
6820 #[cfg_attr(test, assert_instr(vpmovzxwq))]
6821 pub unsafe fn _mm512_mask_cvtepu16_epi64(src: __m512i, k: __mmask8, a: __m128i) -> __m512i {
6822 let convert = _mm512_cvtepu16_epi64(a).as_i64x8();
6823 transmute(simd_select_bitmask(k, convert, src.as_i64x8()))
6824 }
6825
6826 /// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
6827 ///
6828 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvtepu16_epi64&expand=1564)
6829 #[inline]
6830 #[target_feature(enable = "avx512f")]
6831 #[cfg_attr(test, assert_instr(vpmovzxwq))]
6832 pub unsafe fn _mm512_maskz_cvtepu16_epi64(k: __mmask8, a: __m128i) -> __m512i {
6833 let convert = _mm512_cvtepu16_epi64(a).as_i64x8();
6834 let zero = _mm512_setzero_si512().as_i64x8();
6835 transmute(simd_select_bitmask(k, convert, zero))
6836 }
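
// --- Illustrative usage sketch (not part of upstream stdarch) --------------------
// The 16-bit widenings mirror the byte variants: `cvtepi16_*` sign extends while
// `cvtepu16_*` zero extends. The helper name `sketch_cvtepi16_usage` is hypothetical.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn sketch_cvtepi16_usage() {
    let w256 = _mm256_set1_epi16(-2);
    let w128 = _mm_set1_epi16(-2);
    let signed_dwords = _mm512_cvtepi16_epi32(w256); // sixteen lanes of -2
    let unsigned_dwords = _mm512_cvtepu16_epi32(w256); // sixteen lanes of 65534
    let signed_qwords = _mm512_cvtepi16_epi64(w128); // eight lanes of -2
    let _ = (signed_dwords, unsigned_dwords, signed_qwords);
}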
6837
6838 /// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst.
6839 ///
6840 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvtepi32_epi64&expand=1428)
6841 #[inline]
6842 #[target_feature(enable = "avx512f")]
6843 #[cfg_attr(test, assert_instr(vpmovsxdq))]
6844 pub unsafe fn _mm512_cvtepi32_epi64(a: __m256i) -> __m512i {
6845 let a = a.as_i32x8();
6846 transmute::<i64x8, _>(simd_cast(a))
6847 }
6848
6849 /// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
6850 ///
6851 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvtepi32_epi64&expand=1429)
6852 #[inline]
6853 #[target_feature(enable = "avx512f")]
6854 #[cfg_attr(test, assert_instr(vpmovsxdq))]
6855 pub unsafe fn _mm512_mask_cvtepi32_epi64(src: __m512i, k: __mmask8, a: __m256i) -> __m512i {
6856 let convert = _mm512_cvtepi32_epi64(a).as_i64x8();
6857 transmute(simd_select_bitmask(k, convert, src.as_i64x8()))
6858 }
6859
6860 /// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
6861 ///
6862 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvtepi32_epi64&expand=1430)
6863 #[inline]
6864 #[target_feature(enable = "avx512f")]
6865 #[cfg_attr(test, assert_instr(vpmovsxdq))]
6866 pub unsafe fn _mm512_maskz_cvtepi32_epi64(k: __mmask8, a: __m256i) -> __m512i {
6867 let convert = _mm512_cvtepi32_epi64(a).as_i64x8();
6868 let zero = _mm512_setzero_si512().as_i64x8();
6869 transmute(simd_select_bitmask(k, convert, zero))
6870 }
6871
6872 /// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst.
6873 ///
6874 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvtepu32_epi64&expand=1571)
6875 #[inline]
6876 #[target_feature(enable = "avx512f")]
6877 #[cfg_attr(test, assert_instr(vpmovzxdq))]
6878 pub unsafe fn _mm512_cvtepu32_epi64(a: __m256i) -> __m512i {
6879 let a = a.as_u32x8();
6880 transmute::<i64x8, _>(simd_cast(a))
6881 }
6882
6883 /// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
6884 ///
6885 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvtepu32_epi64&expand=1572)
6886 #[inline]
6887 #[target_feature(enable = "avx512f")]
6888 #[cfg_attr(test, assert_instr(vpmovzxdq))]
6889 pub unsafe fn _mm512_mask_cvtepu32_epi64(src: __m512i, k: __mmask8, a: __m256i) -> __m512i {
6890 let convert = _mm512_cvtepu32_epi64(a).as_i64x8();
6891 transmute(simd_select_bitmask(k, convert, src.as_i64x8()))
6892 }
6893
6894 /// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
6895 ///
6896 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvtepu32_epi64&expand=1573)
6897 #[inline]
6898 #[target_feature(enable = "avx512f")]
6899 #[cfg_attr(test, assert_instr(vpmovzxdq))]
6900 pub unsafe fn _mm512_maskz_cvtepu32_epi64(k: __mmask8, a: __m256i) -> __m512i {
6901 let convert = _mm512_cvtepu32_epi64(a).as_i64x8();
6902 let zero = _mm512_setzero_si512().as_i64x8();
6903 transmute(simd_select_bitmask(k, convert, zero))
6904 }
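
// --- Illustrative usage sketch (not part of upstream stdarch) --------------------
// 32-bit -> 64-bit widening with and without a writemask. The helper name
// `sketch_cvtepi32_epi64_usage` is hypothetical.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn sketch_cvtepi32_epi64_usage() {
    let a = _mm256_set1_epi32(-7);
    let signed = _mm512_cvtepi32_epi64(a); // eight lanes of -7
    let unsigned = _mm512_cvtepu32_epi64(a); // eight lanes of 0xFFFF_FFF9 (4294967289)
    // Keep the low four widened lanes, copy the upper four from `signed`.
    let mixed = _mm512_mask_cvtepu32_epi64(signed, 0b0000_1111, a);
    let _ = (signed, unsigned, mixed);
}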
6905
6906 /// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.
6907 ///
6908 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvtepi32_ps&expand=1455)
6909 #[inline]
6910 #[target_feature(enable = "avx512f")]
6911 #[cfg_attr(test, assert_instr(vcvtdq2ps))]
6912 pub unsafe fn _mm512_cvtepi32_ps(a: __m512i) -> __m512 {
6913 let a = a.as_i32x16();
6914 transmute::<f32x16, _>(simd_cast(a))
6915 }
6916
6917 /// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
6918 ///
6919 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvtepi32_ps&expand=1456)
6920 #[inline]
6921 #[target_feature(enable = "avx512f")]
6922 #[cfg_attr(test, assert_instr(vcvtdq2ps))]
6923 pub unsafe fn _mm512_mask_cvtepi32_ps(src: __m512, k: __mmask16, a: __m512i) -> __m512 {
6924 let convert = _mm512_cvtepi32_ps(a).as_f32x16();
6925 transmute(simd_select_bitmask(k, convert, src.as_f32x16()))
6926 }
6927
6928 /// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
6929 ///
6930 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvtepi32_ps&expand=1457)
6931 #[inline]
6932 #[target_feature(enable = "avx512f")]
6933 #[cfg_attr(test, assert_instr(vcvtdq2ps))]
6934 pub unsafe fn _mm512_maskz_cvtepi32_ps(k: __mmask16, a: __m512i) -> __m512 {
6935 let convert = _mm512_cvtepi32_ps(a).as_f32x16();
6936 let zero = _mm512_setzero_ps().as_f32x16();
6937 transmute(simd_select_bitmask(k, convert, zero))
6938 }
6939
6940 /// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.
6941 ///
6942 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvtepi32_pd&expand=1446)
6943 #[inline]
6944 #[target_feature(enable = "avx512f")]
6945 #[cfg_attr(test, assert_instr(vcvtdq2pd))]
6946 pub unsafe fn _mm512_cvtepi32_pd(a: __m256i) -> __m512d {
6947 let a = a.as_i32x8();
6948 transmute::<f64x8, _>(simd_cast(a))
6949 }
6950
6951 /// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
6952 ///
6953 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvtepi32_pd&expand=1447)
6954 #[inline]
6955 #[target_feature(enable = "avx512f")]
6956 #[cfg_attr(test, assert_instr(vcvtdq2pd))]
6957 pub unsafe fn _mm512_mask_cvtepi32_pd(src: __m512d, k: __mmask8, a: __m256i) -> __m512d {
6958 let convert = _mm512_cvtepi32_pd(a).as_f64x8();
6959 transmute(simd_select_bitmask(k, convert, src.as_f64x8()))
6960 }
6961
6962 /// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
6963 ///
6964 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvtepi32_pd&expand=1448)
6965 #[inline]
6966 #[target_feature(enable = "avx512f")]
6967 #[cfg_attr(test, assert_instr(vcvtdq2pd))]
6968 pub unsafe fn _mm512_maskz_cvtepi32_pd(k: __mmask8, a: __m256i) -> __m512d {
6969 let convert = _mm512_cvtepi32_pd(a).as_f64x8();
6970 let zero = _mm512_setzero_pd().as_f64x8();
6971 transmute(simd_select_bitmask(k, convert, zero))
6972 }
6973
6974 /// Convert packed unsigned 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.
6975 ///
6976 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvtepu32_ps&expand=1583)
6977 #[inline]
6978 #[target_feature(enable = "avx512f")]
6979 #[cfg_attr(test, assert_instr(vcvtudq2ps))]
6980 pub unsafe fn _mm512_cvtepu32_ps(a: __m512i) -> __m512 {
6981 let a = a.as_u32x16();
6982 transmute::<f32x16, _>(simd_cast(a))
6983 }
6984
6985 /// Convert packed unsigned 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
6986 ///
6987 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvtepu32_ps&expand=1584)
6988 #[inline]
6989 #[target_feature(enable = "avx512f")]
6990 #[cfg_attr(test, assert_instr(vcvtudq2ps))]
6991 pub unsafe fn _mm512_mask_cvtepu32_ps(src: __m512, k: __mmask16, a: __m512i) -> __m512 {
6992 let convert = _mm512_cvtepu32_ps(a).as_f32x16();
6993 transmute(simd_select_bitmask(k, convert, src.as_f32x16()))
6994 }
6995
6996 /// Convert packed unsigned 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
6997 ///
6998 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvtepu32_ps&expand=1585)
6999 #[inline]
7000 #[target_feature(enable = "avx512f")]
7001 #[cfg_attr(test, assert_instr(vcvtudq2ps))]
7002 pub unsafe fn _mm512_maskz_cvtepu32_ps(k: __mmask16, a: __m512i) -> __m512 {
7003 let convert = _mm512_cvtepu32_ps(a).as_f32x16();
7004 let zero = _mm512_setzero_ps().as_f32x16();
7005 transmute(simd_select_bitmask(k, convert, zero))
7006 }
7007
7008 /// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.
7009 ///
7010 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvtepu32_pd&expand=1580)
7011 #[inline]
7012 #[target_feature(enable = "avx512f")]
7013 #[cfg_attr(test, assert_instr(vcvtudq2pd))]
7014 pub unsafe fn _mm512_cvtepu32_pd(a: __m256i) -> __m512d {
7015 let a = a.as_u32x8();
7016 transmute::<f64x8, _>(simd_cast(a))
7017 }
7018
7019 /// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
7020 ///
7021 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvtepu32_pd&expand=1581)
7022 #[inline]
7023 #[target_feature(enable = "avx512f")]
7024 #[cfg_attr(test, assert_instr(vcvtudq2pd))]
7025 pub unsafe fn _mm512_mask_cvtepu32_pd(src: __m512d, k: __mmask8, a: __m256i) -> __m512d {
7026 let convert = _mm512_cvtepu32_pd(a).as_f64x8();
7027 transmute(simd_select_bitmask(k, convert, src.as_f64x8()))
7028 }
7029
7030 /// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
7031 ///
7032 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvtepu32_pd&expand=1582)
7033 #[inline]
7034 #[target_feature(enable = "avx512f")]
7035 #[cfg_attr(test, assert_instr(vcvtudq2pd))]
7036 pub unsafe fn _mm512_maskz_cvtepu32_pd(k: __mmask8, a: __m256i) -> __m512d {
7037 let convert = _mm512_cvtepu32_pd(a).as_f64x8();
7038 let zero = _mm512_setzero_pd().as_f64x8();
7039 transmute(simd_select_bitmask(k, convert, zero))
7040 }
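
// --- Illustrative usage sketch (not part of upstream stdarch) --------------------
// Signed vs. unsigned integer-to-float conversion of the same bit pattern. The helper
// name `sketch_cvt_int_to_float_usage` is hypothetical.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn sketch_cvt_int_to_float_usage() {
    let a = _mm512_set1_epi32(-1);
    let as_signed = _mm512_cvtepi32_ps(a); // sixteen lanes of -1.0_f32
    let as_unsigned = _mm512_cvtepu32_ps(a); // sixteen lanes of u32::MAX as f32
    // The double-precision forms take a 256-bit integer source.
    let b = _mm256_set1_epi32(3);
    let as_f64 = _mm512_cvtepi32_pd(b); // eight lanes of 3.0_f64
    let _ = (as_signed, as_unsigned, as_f64);
}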
7041
7042 /// Performs element-by-element conversion of the lower half of packed 32-bit integer elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst.
7043 ///
7044 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvtepi32lo_pd&expand=1464)
7045 #[inline]
7046 #[target_feature(enable = "avx512f")]
7047 #[cfg_attr(test, assert_instr(vcvtdq2pd))]
7048 pub unsafe fn _mm512_cvtepi32lo_pd(v2: __m512i) -> __m512d {
7049 let v2 = v2.as_i32x16();
7050 let v256: i32x8 = simd_shuffle8(v2, v2, [0, 1, 2, 3, 4, 5, 6, 7]);
7051 transmute::<f64x8, _>(simd_cast(v256))
7052 }
7053
7054 /// Performs element-by-element conversion of the lower half of packed 32-bit integer elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
7055 ///
7056 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvtepi32lo_pd&expand=1465)
7057 #[inline]
7058 #[target_feature(enable = "avx512f")]
7059 #[cfg_attr(test, assert_instr(vcvtdq2pd))]
7060 pub unsafe fn _mm512_mask_cvtepi32lo_pd(src: __m512d, k: __mmask8, v2: __m512i) -> __m512d {
7061 let convert = _mm512_cvtepi32lo_pd(v2).as_f64x8();
7062 transmute(simd_select_bitmask(k, convert, src.as_f64x8()))
7063 }
7064
7065 /// Performs element-by-element conversion of the lower half of packed 32-bit unsigned integer elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst.
7066 ///
7067 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvtepu32lo_pd&expand=1586)
7068 #[inline]
7069 #[target_feature(enable = "avx512f")]
7070 #[cfg_attr(test, assert_instr(vcvtudq2pd))]
7071 pub unsafe fn _mm512_cvtepu32lo_pd(v2: __m512i) -> __m512d {
7072 let v2 = v2.as_u32x16();
7073 let v256: u32x8 = simd_shuffle8(v2, v2, [0, 1, 2, 3, 4, 5, 6, 7]);
7074 transmute::<f64x8, _>(simd_cast(v256))
7075 }
7076
7077 /// Performs element-by-element conversion of the lower half of packed 32-bit unsigned integer elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
7078 ///
7079 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvtepu32lo_pd&expand=1587)
7080 #[inline]
7081 #[target_feature(enable = "avx512f")]
7082 #[cfg_attr(test, assert_instr(vcvtudq2pd))]
7083 pub unsafe fn _mm512_mask_cvtepu32lo_pd(src: __m512d, k: __mmask8, v2: __m512i) -> __m512d {
7084 let convert = _mm512_cvtepu32lo_pd(v2).as_f64x8();
7085 transmute(simd_select_bitmask(k, convert, src.as_f64x8()))
7086 }
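
// --- Illustrative usage sketch (not part of upstream stdarch) --------------------
// The `*lo_pd` forms read only the low eight 32-bit lanes of the 512-bit source. The
// helper name `sketch_cvtepi32lo_pd_usage` is hypothetical.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn sketch_cvtepi32lo_pd_usage() {
    let v2 = _mm512_set1_epi32(-5);
    let signed = _mm512_cvtepi32lo_pd(v2); // eight lanes of -5.0_f64
    let unsigned = _mm512_cvtepu32lo_pd(v2); // eight lanes of (-5_i32 as u32) as f64
    // Writemask form: keep `signed` wherever the mask bit is clear.
    let merged = _mm512_mask_cvtepi32lo_pd(signed, 0b1111_0000, v2);
    let _ = (signed, unsigned, merged);
}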
7087
7088 /// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst.
7089 ///
7090 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvtepi32_epi16&expand=1419)
7091 #[inline]
7092 #[target_feature(enable = "avx512f")]
7093 #[cfg_attr(test, assert_instr(vpmovdw))]
7094 pub unsafe fn _mm512_cvtepi32_epi16(a: __m512i) -> __m256i {
7095 let a = a.as_i32x16();
7096 transmute::<i16x16, _>(simd_cast(a))
7097 }
7098
7099 /// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
7100 ///
7101 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvtepi32_epi16&expand=1420)
7102 #[inline]
7103 #[target_feature(enable = "avx512f")]
7104 #[cfg_attr(test, assert_instr(vpmovdw))]
7105 pub unsafe fn _mm512_mask_cvtepi32_epi16(src: __m256i, k: __mmask16, a: __m512i) -> __m256i {
7106 let convert = _mm512_cvtepi32_epi16(a).as_i16x16();
7107 transmute(simd_select_bitmask(k, convert, src.as_i16x16()))
7108 }
7109
7110 /// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
7111 ///
7112 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvtepi32_epi16&expand=1421)
7113 #[inline]
7114 #[target_feature(enable = "avx512f")]
7115 #[cfg_attr(test, assert_instr(vpmovdw))]
7116 pub unsafe fn _mm512_maskz_cvtepi32_epi16(k: __mmask16, a: __m512i) -> __m256i {
7117 let convert = _mm512_cvtepi32_epi16(a).as_i16x16();
7118 let zero = _mm256_setzero_si256().as_i16x16();
7119 transmute(simd_select_bitmask(k, convert, zero))
7120 }
7121
7122 /// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst.
7123 ///
7124 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvtepi32_epi8&expand=1437)
7125 #[inline]
7126 #[target_feature(enable = "avx512f")]
7127 #[cfg_attr(test, assert_instr(vpmovdb))]
7128 pub unsafe fn _mm512_cvtepi32_epi8(a: __m512i) -> __m128i {
7129 let a = a.as_i32x16();
7130 transmute::<i8x16, _>(simd_cast(a))
7131 }
7132
7133 /// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
7134 ///
7135 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvtepi32_epi8&expand=1438)
7136 #[inline]
7137 #[target_feature(enable = "avx512f")]
7138 #[cfg_attr(test, assert_instr(vpmovdb))]
7139 pub unsafe fn _mm512_mask_cvtepi32_epi8(src: __m128i, k: __mmask16, a: __m512i) -> __m128i {
7140 let convert = _mm512_cvtepi32_epi8(a).as_i8x16();
7141 transmute(simd_select_bitmask(k, convert, src.as_i8x16()))
7142 }
7143
7144 /// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
7145 ///
7146 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvtepi32_epi8&expand=1439)
7147 #[inline]
7148 #[target_feature(enable = "avx512f")]
7149 #[cfg_attr(test, assert_instr(vpmovdb))]
7150 pub unsafe fn _mm512_maskz_cvtepi32_epi8(k: __mmask16, a: __m512i) -> __m128i {
7151 let convert = _mm512_cvtepi32_epi8(a).as_i8x16();
7152 let zero = _mm_setzero_si128().as_i8x16();
7153 transmute(simd_select_bitmask(k, convert, zero))
7154 }
7155
7156 /// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst.
7157 ///
7158 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvtepi64_epi32&expand=1481)
7159 #[inline]
7160 #[target_feature(enable = "avx512f")]
7161 #[cfg_attr(test, assert_instr(vpmovqd))]
7162 pub unsafe fn _mm512_cvtepi64_epi32(a: __m512i) -> __m256i {
7163 let a = a.as_i64x8();
7164 transmute::<i32x8, _>(simd_cast(a))
7165 }
7166
7167 /// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
7168 ///
7169 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvtepi64_epi32&expand=1482)
7170 #[inline]
7171 #[target_feature(enable = "avx512f")]
7172 #[cfg_attr(test, assert_instr(vpmovqd))]
7173 pub unsafe fn _mm512_mask_cvtepi64_epi32(src: __m256i, k: __mmask8, a: __m512i) -> __m256i {
7174 let convert = _mm512_cvtepi64_epi32(a).as_i32x8();
7175 transmute(simd_select_bitmask(k, convert, src.as_i32x8()))
7176 }
7177
7178 /// Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
7179 ///
7180 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvtepi64_epi32&expand=1483)
7181 #[inline]
7182 #[target_feature(enable = "avx512f")]
7183 #[cfg_attr(test, assert_instr(vpmovqd))]
7184 pub unsafe fn _mm512_maskz_cvtepi64_epi32(k: __mmask8, a: __m512i) -> __m256i {
7185 let convert = _mm512_cvtepi64_epi32(a).as_i32x8();
7186 let zero = _mm256_setzero_si256().as_i32x8();
7187 transmute(simd_select_bitmask(k, convert, zero))
7188 }
7189
7190 /// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst.
7191 ///
7192 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvtepi64_epi16&expand=1472)
7193 #[inline]
7194 #[target_feature(enable = "avx512f")]
7195 #[cfg_attr(test, assert_instr(vpmovqw))]
7196 pub unsafe fn _mm512_cvtepi64_epi16(a: __m512i) -> __m128i {
7197 let a = a.as_i64x8();
7198 transmute::<i16x8, _>(simd_cast(a))
7199 }
7200
7201 /// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
7202 ///
7203 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvtepi64_epi16&expand=1473)
7204 #[inline]
7205 #[target_feature(enable = "avx512f")]
7206 #[cfg_attr(test, assert_instr(vpmovqw))]
7207 pub unsafe fn _mm512_mask_cvtepi64_epi16(src: __m128i, k: __mmask8, a: __m512i) -> __m128i {
7208 let convert = _mm512_cvtepi64_epi16(a).as_i16x8();
7209 transmute(simd_select_bitmask(k, convert, src.as_i16x8()))
7210 }
7211
7212 /// Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
7213 ///
7214 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvtepi64_epi16&expand=1474)
7215 #[inline]
7216 #[target_feature(enable = "avx512f")]
7217 #[cfg_attr(test, assert_instr(vpmovqw))]
7218 pub unsafe fn _mm512_maskz_cvtepi64_epi16(k: __mmask8, a: __m512i) -> __m128i {
7219 let convert = _mm512_cvtepi64_epi16(a).as_i16x8();
7220 let zero = _mm_setzero_si128().as_i16x8();
7221 transmute(simd_select_bitmask(k, convert, zero))
7222 }
7223
7224 /// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst.
7225 ///
7226 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvtepi64_epi8&expand=1490)
7227 #[inline]
7228 #[target_feature(enable = "avx512f")]
7229 #[cfg_attr(test, assert_instr(vpmovqb))]
7230 pub unsafe fn _mm512_cvtepi64_epi8(a: __m512i) -> __m128i {
7231 transmute(vpmovqb(
7232 a.as_i64x8(),
7233 _mm_setzero_si128().as_i8x16(),
7234 0b11111111,
7235 ))
7236 }
7237
7238 /// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
7239 ///
7240 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvtepi64_epi8&expand=1491)
7241 #[inline]
7242 #[target_feature(enable = "avx512f")]
7243 #[cfg_attr(test, assert_instr(vpmovqb))]
7244 pub unsafe fn _mm512_mask_cvtepi64_epi8(src: __m128i, k: __mmask8, a: __m512i) -> __m128i {
7245 transmute(vpmovqb(a.as_i64x8(), src.as_i8x16(), k))
7246 }
7247
7248 /// Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
7249 ///
7250 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvtepi64_epi8&expand=1492)
7251 #[inline]
7252 #[target_feature(enable = "avx512f")]
7253 #[cfg_attr(test, assert_instr(vpmovqb))]
7254 pub unsafe fn _mm512_maskz_cvtepi64_epi8(k: __mmask8, a: __m512i) -> __m128i {
7255 transmute(vpmovqb(a.as_i64x8(), _mm_setzero_si128().as_i8x16(), k))
7256 }
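
// --- Illustrative usage sketch (not part of upstream stdarch) --------------------
// Truncating down-conversions simply drop the upper bits of each element; compare the
// saturating `cvtsepi*`/`cvtusepi*` families below. The helper name
// `sketch_truncating_downconvert_usage` is hypothetical.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn sketch_truncating_downconvert_usage() {
    // 0x0001_0203 truncated to 16 bits is 0x0203, and to 8 bits is 0x03.
    let d = _mm512_set1_epi32(0x0001_0203);
    let words = _mm512_cvtepi32_epi16(d); // sixteen lanes of 0x0203
    let bytes = _mm512_cvtepi32_epi8(d); // sixteen lanes of 0x03
    // 64-bit sources narrow the same way.
    let q = _mm512_set1_epi64(0x1_0000_0001);
    let dwords = _mm512_cvtepi64_epi32(q); // eight lanes of 1
    let _ = (words, bytes, dwords);
}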
7257
7258 /// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst.
7259 ///
7260 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvtsepi32_epi16&expand=1819)
7261 #[inline]
7262 #[target_feature(enable = "avx512f")]
7263 #[cfg_attr(test, assert_instr(vpmovsdw))]
7264 pub unsafe fn _mm512_cvtsepi32_epi16(a: __m512i) -> __m256i {
7265 transmute(vpmovsdw(
7266 a.as_i32x16(),
7267 _mm256_setzero_si256().as_i16x16(),
7268 0b11111111_11111111,
7269 ))
7270 }
7271
7272 /// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
7273 ///
7274 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvtsepi32_epi16&expand=1820)
7275 #[inline]
7276 #[target_feature(enable = "avx512f")]
7277 #[cfg_attr(test, assert_instr(vpmovsdw))]
7278 pub unsafe fn _mm512_mask_cvtsepi32_epi16(src: __m256i, k: __mmask16, a: __m512i) -> __m256i {
7279 transmute(vpmovsdw(a.as_i32x16(), src.as_i16x16(), k))
7280 }
7281
7282 /// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
7283 ///
7284 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvtsepi32_epi16&expand=1821)
7285 #[inline]
7286 #[target_feature(enable = "avx512f")]
7287 #[cfg_attr(test, assert_instr(vpmovsdw))]
7288 pub unsafe fn _mm512_maskz_cvtsepi32_epi16(k: __mmask16, a: __m512i) -> __m256i {
7289 transmute(vpmovsdw(
7290 a.as_i32x16(),
7291 _mm256_setzero_si256().as_i16x16(),
7292 k,
7293 ))
7294 }
7295
7296 /// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst.
7297 ///
7298 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvtsepi32_epi8&expand=1828)
7299 #[inline]
7300 #[target_feature(enable = "avx512f")]
7301 #[cfg_attr(test, assert_instr(vpmovsdb))]
7302 pub unsafe fn _mm512_cvtsepi32_epi8(a: __m512i) -> __m128i {
7303 transmute(vpmovsdb(
7304 a.as_i32x16(),
7305 _mm_setzero_si128().as_i8x16(),
7306 0b11111111_11111111,
7307 ))
7308 }
7309
7310 /// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
7311 ///
7312 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvtsepi32_epi8&expand=1829)
7313 #[inline]
7314 #[target_feature(enable = "avx512f")]
7315 #[cfg_attr(test, assert_instr(vpmovsdb))]
7316 pub unsafe fn _mm512_mask_cvtsepi32_epi8(src: __m128i, k: __mmask16, a: __m512i) -> __m128i {
7317 transmute(vpmovsdb(a.as_i32x16(), src.as_i8x16(), k))
7318 }
7319
7320 /// Convert packed signed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
7321 ///
7322 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvtsepi32_epi8&expand=1830)
7323 #[inline]
7324 #[target_feature(enable = "avx512f")]
7325 #[cfg_attr(test, assert_instr(vpmovsdb))]
7326 pub unsafe fn _mm512_maskz_cvtsepi32_epi8(k: __mmask16, a: __m512i) -> __m128i {
7327 transmute(vpmovsdb(a.as_i32x16(), _mm_setzero_si128().as_i8x16(), k))
7328 }
7329
7330 /// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst.
7331 ///
7332 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvtsepi64_epi32&expand=1852)
7333 #[inline]
7334 #[target_feature(enable = "avx512f")]
7335 #[cfg_attr(test, assert_instr(vpmovsqd))]
7336 pub unsafe fn _mm512_cvtsepi64_epi32(a: __m512i) -> __m256i {
7337 transmute(vpmovsqd(
7338 a.as_i64x8(),
7339 _mm256_setzero_si256().as_i32x8(),
7340 0b11111111,
7341 ))
7342 }
7343
7344 /// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
7345 ///
7346 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvtsepi64_epi32&expand=1853)
7347 #[inline]
7348 #[target_feature(enable = "avx512f")]
7349 #[cfg_attr(test, assert_instr(vpmovsqd))]
7350 pub unsafe fn _mm512_mask_cvtsepi64_epi32(src: __m256i, k: __mmask8, a: __m512i) -> __m256i {
7351 transmute(vpmovsqd(a.as_i64x8(), src.as_i32x8(), k))
7352 }
7353
7354 /// Convert packed signed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
7355 ///
7356 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvtsepi64_epi32&expand=1854)
7357 #[inline]
7358 #[target_feature(enable = "avx512f")]
7359 #[cfg_attr(test, assert_instr(vpmovsqd))]
7360 pub unsafe fn _mm512_maskz_cvtsepi64_epi32(k: __mmask8, a: __m512i) -> __m256i {
7361 transmute(vpmovsqd(a.as_i64x8(), _mm256_setzero_si256().as_i32x8(), k))
7362 }
7363
7364 /// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst.
7365 ///
7366 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvtsepi64_epi16&expand=1843)
7367 #[inline]
7368 #[target_feature(enable = "avx512f")]
7369 #[cfg_attr(test, assert_instr(vpmovsqw))]
7370 pub unsafe fn _mm512_cvtsepi64_epi16(a: __m512i) -> __m128i {
7371 transmute(vpmovsqw(
7372 a.as_i64x8(),
7373 _mm_setzero_si128().as_i16x8(),
7374 0b11111111,
7375 ))
7376 }
7377
7378 /// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
7379 ///
7380 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvtsepi64_epi16&expand=1844)
7381 #[inline]
7382 #[target_feature(enable = "avx512f")]
7383 #[cfg_attr(test, assert_instr(vpmovsqw))]
7384 pub unsafe fn _mm512_mask_cvtsepi64_epi16(src: __m128i, k: __mmask8, a: __m512i) -> __m128i {
7385 transmute(vpmovsqw(a.as_i64x8(), src.as_i16x8(), k))
7386 }
7387
7388 /// Convert packed signed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
7389 ///
7390 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvtsepi64_epi16&expand=1845)
7391 #[inline]
7392 #[target_feature(enable = "avx512f")]
7393 #[cfg_attr(test, assert_instr(vpmovsqw))]
7394 pub unsafe fn _mm512_maskz_cvtsepi64_epi16(k: __mmask8, a: __m512i) -> __m128i {
7395 transmute(vpmovsqw(a.as_i64x8(), _mm_setzero_si128().as_i16x8(), k))
7396 }
7397
7398 /// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst.
7399 ///
7400 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvtsepi64_epi8&expand=1861)
7401 #[inline]
7402 #[target_feature(enable = "avx512f")]
7403 #[cfg_attr(test, assert_instr(vpmovsqb))]
7404 pub unsafe fn _mm512_cvtsepi64_epi8(a: __m512i) -> __m128i {
7405 transmute(vpmovsqb(
7406 a.as_i64x8(),
7407 _mm_setzero_si128().as_i8x16(),
7408 0b11111111,
7409 ))
7410 }
7411
7412 /// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
7413 ///
7414 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvtsepi64_epi8&expand=1862)
7415 #[inline]
7416 #[target_feature(enable = "avx512f")]
7417 #[cfg_attr(test, assert_instr(vpmovsqb))]
7418 pub unsafe fn _mm512_mask_cvtsepi64_epi8(src: __m128i, k: __mmask8, a: __m512i) -> __m128i {
7419 transmute(vpmovsqb(a.as_i64x8(), src.as_i8x16(), k))
7420 }
7421
7422 /// Convert packed signed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
7423 ///
7424 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvtsepi64_epi8&expand=1863)
7425 #[inline]
7426 #[target_feature(enable = "avx512f")]
7427 #[cfg_attr(test, assert_instr(vpmovsqb))]
7428 pub unsafe fn _mm512_maskz_cvtsepi64_epi8(k: __mmask8, a: __m512i) -> __m128i {
7429 transmute(vpmovsqb(a.as_i64x8(), _mm_setzero_si128().as_i8x16(), k))
7430 }
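
// --- Illustrative usage sketch (not part of upstream stdarch) --------------------
// Signed saturation clamps out-of-range values to the target type's extremes instead
// of truncating. The helper name `sketch_cvtsepi_usage` is hypothetical.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn sketch_cvtsepi_usage() {
    let d = _mm512_set1_epi32(100_000); // does not fit in i16 or i8
    let words = _mm512_cvtsepi32_epi16(d); // sixteen lanes of i16::MAX (32767)
    let bytes = _mm512_cvtsepi32_epi8(d); // sixteen lanes of i8::MAX (127)
    let q = _mm512_set1_epi64(i64::MIN);
    let dwords = _mm512_cvtsepi64_epi32(q); // eight lanes of i32::MIN
    let _ = (words, bytes, dwords);
}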
7431
7432 /// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst.
7433 ///
7434 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvtusepi32_epi16&expand=2054)
7435 #[inline]
7436 #[target_feature(enable = "avx512f")]
7437 #[cfg_attr(test, assert_instr(vpmovusdw))]
7438 pub unsafe fn _mm512_cvtusepi32_epi16(a: __m512i) -> __m256i {
7439 transmute(vpmovusdw(
7440 a.as_u32x16(),
7441 _mm256_setzero_si256().as_u16x16(),
7442 0b11111111_11111111,
7443 ))
7444 }
7445
7446 /// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
7447 ///
7448 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvtusepi32_epi16&expand=2055)
7449 #[inline]
7450 #[target_feature(enable = "avx512f")]
7451 #[cfg_attr(test, assert_instr(vpmovusdw))]
7452 pub unsafe fn _mm512_mask_cvtusepi32_epi16(src: __m256i, k: __mmask16, a: __m512i) -> __m256i {
7453 transmute(vpmovusdw(a.as_u32x16(), src.as_u16x16(), k))
7454 }
7455
7456 /// Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
7457 ///
7458 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvtusepi32_epi16&expand=2056)
7459 #[inline]
7460 #[target_feature(enable = "avx512f")]
7461 #[cfg_attr(test, assert_instr(vpmovusdw))]
7462 pub unsafe fn _mm512_maskz_cvtusepi32_epi16(k: __mmask16, a: __m512i) -> __m256i {
7463 transmute(vpmovusdw(
7464 a.as_u32x16(),
7465 _mm256_setzero_si256().as_u16x16(),
7466 k,
7467 ))
7468 }
7469
7470 /// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst.
7471 ///
7472 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvtusepi32_epi8&expand=2063)
7473 #[inline]
7474 #[target_feature(enable = "avx512f")]
7475 #[cfg_attr(test, assert_instr(vpmovusdb))]
7476 pub unsafe fn _mm512_cvtusepi32_epi8(a: __m512i) -> __m128i {
7477 transmute(vpmovusdb(
7478 a.as_u32x16(),
7479 _mm_setzero_si128().as_u8x16(),
7480 0b11111111_11111111,
7481 ))
7482 }
7483
7484 /// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
7485 ///
7486 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvtusepi32_epi8&expand=2064)
7487 #[inline]
7488 #[target_feature(enable = "avx512f")]
7489 #[cfg_attr(test, assert_instr(vpmovusdb))]
7490 pub unsafe fn _mm512_mask_cvtusepi32_epi8(src: __m128i, k: __mmask16, a: __m512i) -> __m128i {
7491 transmute(vpmovusdb(a.as_u32x16(), src.as_u8x16(), k))
7492 }
7493
7494 /// Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
7495 ///
7496 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvtusepi32_epi8&expand=2065)
7497 #[inline]
7498 #[target_feature(enable = "avx512f")]
7499 #[cfg_attr(test, assert_instr(vpmovusdb))]
7500 pub unsafe fn _mm512_maskz_cvtusepi32_epi8(k: __mmask16, a: __m512i) -> __m128i {
7501 transmute(vpmovusdb(a.as_u32x16(), _mm_setzero_si128().as_u8x16(), k))
7502 }
7503
7504 /// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst.
7505 ///
7506 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvtusepi64_epi32&expand=2087)
7507 #[inline]
7508 #[target_feature(enable = "avx512f")]
7509 #[cfg_attr(test, assert_instr(vpmovusqd))]
7510 pub unsafe fn _mm512_cvtusepi64_epi32(a: __m512i) -> __m256i {
7511 transmute(vpmovusqd(
7512 a.as_u64x8(),
7513 _mm256_setzero_si256().as_u32x8(),
7514 0b11111111,
7515 ))
7516 }
7517
7518 /// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
7519 ///
7520 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvtusepi64_epi32&expand=2088)
7521 #[inline]
7522 #[target_feature(enable = "avx512f")]
7523 #[cfg_attr(test, assert_instr(vpmovusqd))]
7524 pub unsafe fn _mm512_mask_cvtusepi64_epi32(src: __m256i, k: __mmask8, a: __m512i) -> __m256i {
7525 transmute(vpmovusqd(a.as_u64x8(), src.as_u32x8(), k))
7526 }
7527
7528 /// Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
7529 ///
7530 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvtusepi64_epi32&expand=2089)
7531 #[inline]
7532 #[target_feature(enable = "avx512f")]
7533 #[cfg_attr(test, assert_instr(vpmovusqd))]
7534 pub unsafe fn _mm512_maskz_cvtusepi64_epi32(k: __mmask8, a: __m512i) -> __m256i {
7535 transmute(vpmovusqd(
7536 a.as_u64x8(),
7537 _mm256_setzero_si256().as_u32x8(),
7538 k,
7539 ))
7540 }
7541
7542 /// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst.
7543 ///
7544 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvtusepi64_epi16&expand=2078)
7545 #[inline]
7546 #[target_feature(enable = "avx512f")]
7547 #[cfg_attr(test, assert_instr(vpmovusqw))]
7548 pub unsafe fn _mm512_cvtusepi64_epi16(a: __m512i) -> __m128i {
7549 transmute(vpmovusqw(
7550 a.as_u64x8(),
7551 _mm_setzero_si128().as_u16x8(),
7552 0b11111111,
7553 ))
7554 }
7555
7556 /// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
7557 ///
7558 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvtusepi64_epi16&expand=2079)
7559 #[inline]
7560 #[target_feature(enable = "avx512f")]
7561 #[cfg_attr(test, assert_instr(vpmovusqw))]
7562 pub unsafe fn _mm512_mask_cvtusepi64_epi16(src: __m128i, k: __mmask8, a: __m512i) -> __m128i {
7563 transmute(vpmovusqw(a.as_u64x8(), src.as_u16x8(), k))
7564 }
7565
7566 /// Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
7567 ///
7568 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvtusepi64_epi16&expand=2080)
7569 #[inline]
7570 #[target_feature(enable = "avx512f")]
7571 #[cfg_attr(test, assert_instr(vpmovusqw))]
7572 pub unsafe fn _mm512_maskz_cvtusepi64_epi16(k: __mmask8, a: __m512i) -> __m128i {
7573 transmute(vpmovusqw(a.as_u64x8(), _mm_setzero_si128().as_u16x8(), k))
7574 }
7575
7576 /// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst.
7577 ///
7578 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvtusepi64_epi8&expand=2096)
7579 #[inline]
7580 #[target_feature(enable = "avx512f")]
7581 #[cfg_attr(test, assert_instr(vpmovusqb))]
7582 pub unsafe fn _mm512_cvtusepi64_epi8(a: __m512i) -> __m128i {
7583 transmute(vpmovusqb(
7584 a.as_u64x8(),
7585 _mm_setzero_si128().as_u8x16(),
7586 0b11111111,
7587 ))
7588 }
7589
7590 /// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
7591 ///
7592 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvtusepi64_epi8&expand=2097)
7593 #[inline]
7594 #[target_feature(enable = "avx512f")]
7595 #[cfg_attr(test, assert_instr(vpmovusqb))]
7596 pub unsafe fn _mm512_mask_cvtusepi64_epi8(src: __m128i, k: __mmask8, a: __m512i) -> __m128i {
7597 transmute(vpmovusqb(a.as_u64x8(), src.as_u8x16(), k))
7598 }
7599
7600 /// Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
7601 ///
7602 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvtusepi64_epi8&expand=2098)
7603 #[inline]
7604 #[target_feature(enable = "avx512f")]
7605 #[cfg_attr(test, assert_instr(vpmovusqb))]
7606 pub unsafe fn _mm512_maskz_cvtusepi64_epi8(k: __mmask8, a: __m512i) -> __m128i {
7607 transmute(vpmovusqb(a.as_u64x8(), _mm_setzero_si128().as_u8x16(), k))
7608 }
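
// --- Illustrative usage sketch (not part of upstream stdarch) --------------------
// Unsigned saturation clamps to the unsigned target range, so anything above
// `u8::MAX`/`u16::MAX`/`u32::MAX` becomes that maximum. The helper name
// `sketch_cvtusepi_usage` is hypothetical.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn sketch_cvtusepi_usage() {
    let d = _mm512_set1_epi32(70_000); // exceeds u16::MAX and u8::MAX
    let words = _mm512_cvtusepi32_epi16(d); // sixteen lanes of u16::MAX (65535)
    let bytes = _mm512_cvtusepi32_epi8(d); // sixteen lanes of u8::MAX (255)
    let q = _mm512_set1_epi64(-1); // all bits set: the largest unsigned value
    let dwords = _mm512_cvtusepi64_epi32(q); // eight lanes of u32::MAX
    let _ = (words, bytes, dwords);
}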
7609
7610 /// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst.\
7611 ///
7612 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
7613 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
7614 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
7615 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
7616 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
7617 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
7618 ///
7619 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvt_roundps_epi32&expand=1335)
7620 #[inline]
7621 #[target_feature(enable = "avx512f")]
7622 #[cfg_attr(test, assert_instr(vcvtps2dq, rounding = 8))]
7623 #[rustc_args_required_const(1)]
7624 pub unsafe fn _mm512_cvt_roundps_epi32(a: __m512, rounding: i32) -> __m512i {
7625 macro_rules! call {
7626 ($imm4:expr) => {
7627 vcvtps2dq(
7628 a.as_f32x16(),
7629 _mm512_setzero_si512().as_i32x16(),
7630 0b11111111_11111111,
7631 $imm4,
7632 )
7633 };
7634 }
7635 let r = constify_imm4_round!(rounding, call);
7636 transmute(r)
7637 }
7638
7639 /// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
7640 ///
7641 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
7642 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
7643 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
7644 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
7645 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
7646 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
7647 ///
7648 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvt_roundps_epi32&expand=1336)
7649 #[inline]
7650 #[target_feature(enable = "avx512f")]
7651 #[cfg_attr(test, assert_instr(vcvtps2dq, rounding = 8))]
7652 #[rustc_args_required_const(3)]
7653 pub unsafe fn _mm512_mask_cvt_roundps_epi32(
7654 src: __m512i,
7655 k: __mmask16,
7656 a: __m512,
7657 rounding: i32,
7658 ) -> __m512i {
7659 macro_rules! call {
7660 ($imm4:expr) => {
7661 vcvtps2dq(a.as_f32x16(), src.as_i32x16(), k, $imm4)
7662 };
7663 }
7664 let r = constify_imm4_round!(rounding, call);
7665 transmute(r)
7666 }
7667
7668 /// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
7669 ///
7670 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
7671 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
7672 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
7673 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
7674 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
7675 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
7676 ///
7677 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvt_roundps_epi32&expand=1337)
7678 #[inline]
7679 #[target_feature(enable = "avx512f")]
7680 #[cfg_attr(test, assert_instr(vcvtps2dq, rounding = 8))]
7681 #[rustc_args_required_const(2)]
7682 pub unsafe fn _mm512_maskz_cvt_roundps_epi32(k: __mmask16, a: __m512, rounding: i32) -> __m512i {
7683 macro_rules! call {
7684 ($imm4:expr) => {
7685 vcvtps2dq(a.as_f32x16(), _mm512_setzero_si512().as_i32x16(), k, $imm4)
7686 };
7687 }
7688 let r = constify_imm4_round!(rounding, call);
7689 transmute(r)
7690 }
7691
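// A minimal sketch of the zeromask form, assuming an AVX-512F-capable CPU; values,
// mask, and helper name are illustrative. Lanes whose bit in `k` is clear are zeroed
// rather than copied from a source vector.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_maskz_cvt_roundps_epi32() {
    let a: __m512 = mem::transmute([7.2_f32; 16]);
    let r = _mm512_maskz_cvt_roundps_epi32(
        0b11111111_00000000,
        a,
        _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC,
    );
    let lanes: [i32; 16] = mem::transmute(r);
    // Low eight lanes were masked off (zeroed); high eight hold the truncated value 7.
    assert!(lanes[..8].iter().all(|&x| x == 0) && lanes[8..].iter().all(|&x| x == 7));
}
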
7692 /// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst.\
7693 ///
7694 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
7695 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
7696 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
7697 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
7698 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
7699 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
7700 ///
7701 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvt_roundps_epu32&expand=1341)
7702 #[inline]
7703 #[target_feature(enable = "avx512f")]
7704 #[cfg_attr(test, assert_instr(vcvtps2udq, rounding = 8))]
7705 #[rustc_args_required_const(1)]
7706 pub unsafe fn _mm512_cvt_roundps_epu32(a: __m512, rounding: i32) -> __m512i {
7707 macro_rules! call {
7708 ($imm4:expr) => {
7709 vcvtps2udq(
7710 a.as_f32x16(),
7711 _mm512_setzero_si512().as_u32x16(),
7712 0b11111111_11111111,
7713 $imm4,
7714 )
7715 };
7716 }
7717 let r = constify_imm4_round!(rounding, call);
7718 transmute(r)
7719 }
7720
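// A minimal sketch, assuming an AVX-512F-capable CPU; the input value and helper name
// are illustrative. The epu32 form produces unsigned results, so magnitudes beyond
// i32::MAX are still representable.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_cvt_roundps_epu32() {
    // 3_000_000_000.0 is exactly representable as an f32 and exceeds i32::MAX.
    let a: __m512 = mem::transmute([3_000_000_000.0_f32; 16]);
    let r = _mm512_cvt_roundps_epu32(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    let lanes: [u32; 16] = mem::transmute(r);
    assert!(lanes.iter().all(|&x| x == 3_000_000_000));
}
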
7721 /// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
7722 ///
7723 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
7724 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
7725 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
7726 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
7727 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
7728 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
7729 ///
7730 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvt_roundps_epu32&expand=1342)
7731 #[inline]
7732 #[target_feature(enable = "avx512f")]
7733 #[cfg_attr(test, assert_instr(vcvtps2udq, rounding = 8))]
7734 #[rustc_args_required_const(3)]
7735 pub unsafe fn _mm512_mask_cvt_roundps_epu32(
7736 src: __m512i,
7737 k: __mmask16,
7738 a: __m512,
7739 rounding: i32,
7740 ) -> __m512i {
7741 macro_rules! call {
7742 ($imm4:expr) => {
7743 vcvtps2udq(a.as_f32x16(), src.as_u32x16(), k, $imm4)
7744 };
7745 }
7746 let r = constify_imm4_round!(rounding, call);
7747 transmute(r)
7748 }
7749
7750 /// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
7751 ///
7752 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
7753 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
7754 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
7755 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
7756 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
7757 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
7758 ///
7759 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvt_roundps_epu32&expand=1343)
7760 #[inline]
7761 #[target_feature(enable = "avx512f")]
7762 #[cfg_attr(test, assert_instr(vcvtps2udq, rounding = 8))]
7763 #[rustc_args_required_const(2)]
7764 pub unsafe fn _mm512_maskz_cvt_roundps_epu32(k: __mmask16, a: __m512, rounding: i32) -> __m512i {
7765 macro_rules! call {
7766 ($imm4:expr) => {
7767 vcvtps2udq(a.as_f32x16(), _mm512_setzero_si512().as_u32x16(), k, $imm4)
7768 };
7769 }
7770 let r = constify_imm4_round!(rounding, call);
7771 transmute(r)
7772 }
7773
7774 /// Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.\
7775 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
7776 ///
7777 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=cvt_roundps_pd&expand=1347)
7778 #[inline]
7779 #[target_feature(enable = "avx512f")]
7780 #[cfg_attr(test, assert_instr(vcvtps2pd, sae = 8))]
7781 #[rustc_args_required_const(1)]
7782 pub unsafe fn _mm512_cvt_roundps_pd(a: __m256, sae: i32) -> __m512d {
7783 macro_rules! call {
7784 ($imm4:expr) => {
7785 vcvtps2pd(
7786 a.as_f32x8(),
7787 _mm512_setzero_pd().as_f64x8(),
7788 0b11111111,
7789 $imm4,
7790 )
7791 };
7792 }
7793 let r = constify_imm4_sae!(sae, call);
7794 transmute(r)
7795 }
7796
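// A minimal sketch, assuming an AVX-512F-capable CPU; lane values and the helper name
// are illustrative. Widening f32 to f64 is exact, so `sae` only controls whether
// exceptions are suppressed, as the doc comment above describes.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_cvt_roundps_pd() {
    let a: __m256 = mem::transmute([0.25_f32, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]);
    let r = _mm512_cvt_roundps_pd(a, _MM_FROUND_NO_EXC);
    let lanes: [f64; 8] = mem::transmute(r);
    assert!(lanes == [0.25, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]);
}
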
7797 /// Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
7798 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
7799 ///
7800 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvt_roundps_pd&expand=1348)
7801 #[inline]
7802 #[target_feature(enable = "avx512f")]
7803 #[cfg_attr(test, assert_instr(vcvtps2pd, sae = 8))]
7804 #[rustc_args_required_const(3)]
7805 pub unsafe fn _mm512_mask_cvt_roundps_pd(
7806 src: __m512d,
7807 k: __mmask8,
7808 a: __m256,
7809 sae: i32,
7810 ) -> __m512d {
7811 macro_rules! call {
7812 ($imm4:expr) => {
7813 vcvtps2pd(a.as_f32x8(), src.as_f64x8(), k, $imm4)
7814 };
7815 }
7816 let r = constify_imm4_sae!(sae, call);
7817 transmute(r)
7818 }
7819
7820 /// Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
7821 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
7822 ///
7823 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvt_roundps_pd&expand=1349)
7824 #[inline]
7825 #[target_feature(enable = "avx512f")]
7826 #[cfg_attr(test, assert_instr(vcvtps2pd, sae = 8))]
7827 #[rustc_args_required_const(2)]
7828 pub unsafe fn _mm512_maskz_cvt_roundps_pd(k: __mmask8, a: __m256, sae: i32) -> __m512d {
7829 macro_rules! call {
7830 ($imm4:expr) => {
7831 vcvtps2pd(a.as_f32x8(), _mm512_setzero_pd().as_f64x8(), k, $imm4)
7832 };
7833 }
7834 let r = constify_imm4_sae!(sae, call);
7835 transmute(r)
7836 }
7837
7838 /// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst.\
7839 ///
7840 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
7841 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
7842 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
7843 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
7844 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
7845 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
7846 ///
7847 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvt_roundpd_epi32&expand=1315)
7848 #[inline]
7849 #[target_feature(enable = "avx512f")]
7850 #[cfg_attr(test, assert_instr(vcvtpd2dq, rounding = 8))]
7851 #[rustc_args_required_const(1)]
7852 pub unsafe fn _mm512_cvt_roundpd_epi32(a: __m512d, rounding: i32) -> __m256i {
7853 macro_rules! call {
7854 ($imm4:expr) => {
7855 vcvtpd2dq(
7856 a.as_f64x8(),
7857 _mm256_setzero_si256().as_i32x8(),
7858 0b11111111,
7859 $imm4,
7860 )
7861 };
7862 }
7863 let r = constify_imm4_round!(rounding, call);
7864 transmute(r)
7865 }
7866
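// A minimal sketch, assuming an AVX-512F-capable CPU; lane values and the helper name
// are illustrative. Eight f64 lanes narrow to eight i32 lanes returned in a __m256i,
// and the input is chosen so the rounding mode is visible.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_cvt_roundpd_epi32() {
    let a: __m512d = mem::transmute([-2.5_f64; 8]);
    // Round toward positive infinity: -2.5 becomes -2 in every lane.
    let r = _mm512_cvt_roundpd_epi32(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
    let lanes: [i32; 8] = mem::transmute(r);
    assert!(lanes.iter().all(|&x| x == -2));
}
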
7867 /// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
7868 ///
7869 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
7870 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
7871 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
7872 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
7873 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
7874 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
7875 ///
7876 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvt_roundpd_epi32&expand=1316)
7877 #[inline]
7878 #[target_feature(enable = "avx512f")]
7879 #[cfg_attr(test, assert_instr(vcvtpd2dq, rounding = 8))]
7880 #[rustc_args_required_const(3)]
7881 pub unsafe fn _mm512_mask_cvt_roundpd_epi32(
7882 src: __m256i,
7883 k: __mmask8,
7884 a: __m512d,
7885 rounding: i32,
7886 ) -> __m256i {
7887 macro_rules! call {
7888 ($imm4:expr) => {
7889 vcvtpd2dq(a.as_f64x8(), src.as_i32x8(), k, $imm4)
7890 };
7891 }
7892 let r = constify_imm4_round!(rounding, call);
7893 transmute(r)
7894 }
7895
7896 /// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
7897 ///
7898 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
7899 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
7900 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
7901 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
7902 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
7903 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
7904 ///
7905 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvt_roundpd_epi32&expand=1317)
7906 #[inline]
7907 #[target_feature(enable = "avx512f")]
7908 #[cfg_attr(test, assert_instr(vcvtpd2dq, rounding = 8))]
7909 #[rustc_args_required_const(2)]
7910 pub unsafe fn _mm512_maskz_cvt_roundpd_epi32(k: __mmask8, a: __m512d, rounding: i32) -> __m256i {
7911 macro_rules! call {
7912 ($imm4:expr) => {
7913 vcvtpd2dq(a.as_f64x8(), _mm256_setzero_si256().as_i32x8(), k, $imm4)
7914 };
7915 }
7916 let r = constify_imm4_round!(rounding, call);
7917 transmute(r)
7918 }
7919
7920 /// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst.\
7921 ///
7922 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
7923 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
7924 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
7925 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
7926 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
7927 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
7928 ///
7929 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvt_roundpd_epu32&expand=1321)
7930 #[inline]
7931 #[target_feature(enable = "avx512f")]
7932 #[cfg_attr(test, assert_instr(vcvtpd2udq, rounding = 8))]
7933 #[rustc_args_required_const(1)]
7934 pub unsafe fn _mm512_cvt_roundpd_epu32(a: __m512d, rounding: i32) -> __m256i {
7935 macro_rules! call {
7936 ($imm4:expr) => {
7937 vcvtpd2udq(
7938 a.as_f64x8(),
7939 _mm256_setzero_si256().as_u32x8(),
7940 0b11111111,
7941 $imm4,
7942 )
7943 };
7944 }
7945 let r = constify_imm4_round!(rounding, call);
7946 transmute(r)
7947 }
7948
7949 /// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
7950 ///
7951 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
7952 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
7953 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
7954 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
7955 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
7956 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
7957 ///
7958 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvt_roundpd_epu32&expand=1322)
7959 #[inline]
7960 #[target_feature(enable = "avx512f")]
7961 #[cfg_attr(test, assert_instr(vcvtpd2udq, rounding = 8))]
7962 #[rustc_args_required_const(3)]
7963 pub unsafe fn _mm512_mask_cvt_roundpd_epu32(
7964 src: __m256i,
7965 k: __mmask8,
7966 a: __m512d,
7967 rounding: i32,
7968 ) -> __m256i {
7969 macro_rules! call {
7970 ($imm4:expr) => {
7971 vcvtpd2udq(a.as_f64x8(), src.as_u32x8(), k, $imm4)
7972 };
7973 }
7974 let r = constify_imm4_round!(rounding, call);
7975 transmute(r)
7976 }
7977
7978 /// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
7979 ///
7980 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
7981 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
7982 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
7983 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
7984 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
7985 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
7986 ///
7987 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvt_roundpd_epu32&expand=1323)
7988 #[inline]
7989 #[target_feature(enable = "avx512f")]
7990 #[cfg_attr(test, assert_instr(vcvtpd2udq, rounding = 8))]
7991 #[rustc_args_required_const(2)]
7992 pub unsafe fn _mm512_maskz_cvt_roundpd_epu32(k: __mmask8, a: __m512d, rounding: i32) -> __m256i {
7993 macro_rules! call {
7994 ($imm4:expr) => {
7995 vcvtpd2udq(a.as_f64x8(), _mm256_setzero_si256().as_u32x8(), k, $imm4)
7996 };
7997 }
7998 let r = constify_imm4_round!(rounding, call);
7999 transmute(r)
8000 }
8001
8002 /// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.\
8003 ///
8004 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
8005 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
8006 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
8007 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
8008 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
8009 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
8010 ///
8011 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvt_roundpd_ps&expand=1327)
8012 #[inline]
8013 #[target_feature(enable = "avx512f")]
8014 #[cfg_attr(test, assert_instr(vcvtpd2ps, rounding = 8))]
8015 #[rustc_args_required_const(1)]
8016 pub unsafe fn _mm512_cvt_roundpd_ps(a: __m512d, rounding: i32) -> __m256 {
8017 macro_rules! call {
8018 ($imm4:expr) => {
8019 vcvtpd2ps(
8020 a.as_f64x8(),
8021 _mm256_setzero_ps().as_f32x8(),
8022 0b11111111,
8023 $imm4,
8024 )
8025 };
8026 }
8027 let r = constify_imm4_round!(rounding, call);
8028 transmute(r)
8029 }
8030
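// A minimal sketch, assuming an AVX-512F-capable CPU; the input value and helper name
// are illustrative. Narrowing f64 to f32 can lose precision, which is where the
// rounding mode matters: 1.0000000001 has no exact f32 representation.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_cvt_roundpd_ps() {
    let a: __m512d = mem::transmute([1.000_000_000_1_f64; 8]);
    let up = _mm512_cvt_roundpd_ps(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
    let down = _mm512_cvt_roundpd_ps(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
    let (u, d): ([f32; 8], [f32; 8]) = (mem::transmute(up), mem::transmute(down));
    // Rounding up lands on the f32 just above 1.0; truncating lands on 1.0 itself.
    assert!(u.iter().zip(d.iter()).all(|(&hi, &lo)| hi > 1.0 && lo == 1.0));
}
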
8031 /// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
8032 ///
8033 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
8034 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
8035 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
8036 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
8037 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
8038 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
8039 ///
8040 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvt_roundpd_ps&expand=1328)
8041 #[inline]
8042 #[target_feature(enable = "avx512f")]
8043 #[cfg_attr(test, assert_instr(vcvtpd2ps, rounding = 8))]
8044 #[rustc_args_required_const(3)]
8045 pub unsafe fn _mm512_mask_cvt_roundpd_ps(
8046 src: __m256,
8047 k: __mmask8,
8048 a: __m512d,
8049 rounding: i32,
8050 ) -> __m256 {
8051 macro_rules! call {
8052 ($imm4:expr) => {
8053 vcvtpd2ps(a.as_f64x8(), src.as_f32x8(), k, $imm4)
8054 };
8055 }
8056 let r = constify_imm4_round!(rounding, call);
8057 transmute(r)
8058 }
8059
8060 /// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
8061 ///
8062 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
8063 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
8064 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
8065 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
8066 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
8067 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
8068 ///
8069 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvt_roundpd_ps&expand=1329)
8070 #[inline]
8071 #[target_feature(enable = "avx512f")]
8072 #[cfg_attr(test, assert_instr(vcvtpd2ps, rounding = 8))]
8073 #[rustc_args_required_const(2)]
8074 pub unsafe fn _mm512_maskz_cvt_roundpd_ps(k: __mmask8, a: __m512d, rounding: i32) -> __m256 {
8075 macro_rules! call {
8076 ($imm4:expr) => {
8077 vcvtpd2ps(a.as_f64x8(), _mm256_setzero_ps().as_f32x8(), k, $imm4)
8078 };
8079 }
8080 let r = constify_imm4_round!(rounding, call);
8081 transmute(r)
8082 }
8083
8084 /// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.\
8085 ///
8086 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
8087 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
8088 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
8089 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
8090 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
8091 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
8092 ///
8093 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvt_roundepi32_ps&expand=1294)
8094 #[inline]
8095 #[target_feature(enable = "avx512f")]
8096 #[cfg_attr(test, assert_instr(vcvtdq2ps, rounding = 8))]
8097 #[rustc_args_required_const(1)]
8098 pub unsafe fn _mm512_cvt_roundepi32_ps(a: __m512i, rounding: i32) -> __m512 {
8099 macro_rules! call {
8100 ($imm4:expr) => {
8101 vcvtdq2ps(a.as_i32x16(), $imm4)
8102 };
8103 }
8104 let r = constify_imm4_round!(rounding, call);
8105 transmute(r)
8106 }
8107
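// A minimal sketch, assuming an AVX-512F-capable CPU; the input value and helper name
// are illustrative. i32 -> f32 is exact up to 2^24, so the rounding mode only shows
// for larger magnitudes: 16_777_217 (2^24 + 1) sits exactly between two f32 values.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_cvt_roundepi32_ps() {
    let a: __m512i = mem::transmute([16_777_217_i32; 16]);
    let near = _mm512_cvt_roundepi32_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    let up = _mm512_cvt_roundepi32_ps(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
    let (n, u): ([f32; 16], [f32; 16]) = (mem::transmute(near), mem::transmute(up));
    // Ties-to-even picks 16_777_216.0; rounding up picks 16_777_218.0.
    assert!(n.iter().all(|&x| x == 16_777_216.0) && u.iter().all(|&x| x == 16_777_218.0));
}
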
8108 /// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
8109 ///
8110 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
8111 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
8112 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
8113 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
8114 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
8115 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
8116 ///
8117 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvt_roundepi32_ps&expand=1295)
8118 #[inline]
8119 #[target_feature(enable = "avx512f")]
8120 #[cfg_attr(test, assert_instr(vcvtdq2ps, rounding = 8))]
8121 #[rustc_args_required_const(3)]
8122 pub unsafe fn _mm512_mask_cvt_roundepi32_ps(
8123 src: __m512,
8124 k: __mmask16,
8125 a: __m512i,
8126 rounding: i32,
8127 ) -> __m512 {
8128 macro_rules! call {
8129 ($imm4:expr) => {
8130 vcvtdq2ps(a.as_i32x16(), $imm4)
8131 };
8132 }
8133 let r: f32x16 = constify_imm4_round!(rounding, call);
8134 transmute(simd_select_bitmask(k, r, src.as_f32x16()))
8135 }
8136
8137 /// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
8138 ///
8139 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
8140 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
8141 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
8142 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
8143 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
8144 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
8145 ///
8146 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvt_roundepi32_ps&expand=1296)
8147 #[inline]
8148 #[target_feature(enable = "avx512f")]
8149 #[cfg_attr(test, assert_instr(vcvtdq2ps, rounding = 8))]
8150 #[rustc_args_required_const(2)]
8151 pub unsafe fn _mm512_maskz_cvt_roundepi32_ps(k: __mmask16, a: __m512i, rounding: i32) -> __m512 {
8152 macro_rules! call {
8153 ($imm4:expr) => {
8154 vcvtdq2ps(a.as_i32x16(), $imm4)
8155 };
8156 }
8157 let r = constify_imm4_round!(rounding, call);
8158 let zero = _mm512_setzero_ps().as_f32x16();
8159 transmute(simd_select_bitmask(k, r, zero))
8160 }
8161
8162 /// Convert packed unsigned 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.\
8163 ///
8164 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
8165 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
8166 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
8167 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
8168 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
8169 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
8170 ///
8171 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvt_roundepu32_ps&expand=1303)
8172 #[inline]
8173 #[target_feature(enable = "avx512f")]
8174 #[cfg_attr(test, assert_instr(vcvtudq2ps, rounding = 8))]
8175 #[rustc_args_required_const(1)]
8176 pub unsafe fn _mm512_cvt_roundepu32_ps(a: __m512i, rounding: i32) -> __m512 {
8177 macro_rules! call {
8178 ($imm4:expr) => {
8179 vcvtudq2ps(a.as_u32x16(), $imm4)
8180 };
8181 }
8182 let r = constify_imm4_round!(rounding, call);
8183 transmute(r)
8184 }
8185
8186 /// Convert packed unsigned 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
8187 ///
8188 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
8189 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
8190 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
8191 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
8192 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
8193 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
8194 ///
8195 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvt_roundepu32_ps&expand=1304)
8196 #[inline]
8197 #[target_feature(enable = "avx512f")]
8198 #[cfg_attr(test, assert_instr(vcvtudq2ps, rounding = 8))]
8199 #[rustc_args_required_const(3)]
8200 pub unsafe fn _mm512_mask_cvt_roundepu32_ps(
8201 src: __m512,
8202 k: __mmask16,
8203 a: __m512i,
8204 rounding: i32,
8205 ) -> __m512 {
8206 macro_rules! call {
8207 ($imm4:expr) => {
8208 vcvtudq2ps(a.as_u32x16(), $imm4)
8209 };
8210 }
8211 let r: f32x16 = constify_imm4_round!(rounding, call);
8212 transmute(simd_select_bitmask(k, r, src.as_f32x16()))
8213 }
8214
8215 /// Convert packed unsigned 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
8216 ///
8217 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
8218 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
8219 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
8220 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
8221 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
8222 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
8223 ///
8224 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvt_roundepu32_ps&expand=1305)
8225 #[inline]
8226 #[target_feature(enable = "avx512f")]
8227 #[cfg_attr(test, assert_instr(vcvtudq2ps, rounding = 8))]
8228 #[rustc_args_required_const(2)]
8229 pub unsafe fn _mm512_maskz_cvt_roundepu32_ps(k: __mmask16, a: __m512i, rounding: i32) -> __m512 {
8230 macro_rules! call {
8231 ($imm4:expr) => {
8232 vcvtudq2ps(a.as_u32x16(), $imm4)
8233 };
8234 }
8235 let r = constify_imm4_round!(rounding, call);
8236 let zero = _mm512_setzero_ps().as_f32x16();
8237 transmute(simd_select_bitmask(k, r, zero))
8238 }
8239
8240 /// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst.\
8241 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
8242 ///
8243 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvt_roundps_ph&expand=1354)
8244 #[inline]
8245 #[target_feature(enable = "avx512f")]
8246 #[cfg_attr(test, assert_instr(vcvtps2ph, sae = 8))]
8247 #[rustc_args_required_const(1)]
8248 pub unsafe fn _mm512_cvt_roundps_ph(a: __m512, sae: i32) -> __m256i {
8249 macro_rules! call {
8250 ($imm4:expr) => {
8251 vcvtps2ph(
8252 a.as_f32x16(),
8253 $imm4,
8254 _mm256_setzero_si256().as_i16x16(),
8255 0b11111111_11111111,
8256 )
8257 };
8258 }
8259 let r = constify_imm4_sae!(sae, call);
8260 transmute(r)
8261 }
8262
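// A minimal sketch, assuming an AVX-512F-capable CPU; the input value and helper name
// are illustrative. The result lanes are IEEE 754 binary16 bit patterns packed in a
// __m256i; 1.0f32 encodes as 0x3C00 in half precision.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_cvt_roundps_ph() {
    let a: __m512 = mem::transmute([1.0_f32; 16]);
    let r = _mm512_cvt_roundps_ph(a, _MM_FROUND_NO_EXC);
    let halves: [u16; 16] = mem::transmute(r);
    assert!(halves.iter().all(|&h| h == 0x3C00));
}
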
8263 /// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
8264 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
8265 ///
8266 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvt_roundps_ph&expand=1355)
8267 #[inline]
8268 #[target_feature(enable = "avx512f")]
8269 #[cfg_attr(test, assert_instr(vcvtps2ph, sae = 8))]
8270 #[rustc_args_required_const(3)]
8271 pub unsafe fn _mm512_mask_cvt_roundps_ph(
8272 src: __m256i,
8273 k: __mmask16,
8274 a: __m512,
8275 sae: i32,
8276 ) -> __m256i {
8277 macro_rules! call {
8278 ($imm4:expr) => {
8279 vcvtps2ph(a.as_f32x16(), $imm4, src.as_i16x16(), k)
8280 };
8281 }
8282 let r = constify_imm4_sae!(sae, call);
8283 transmute(r)
8284 }
8285
8286 /// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
8287 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
8288 ///
8289 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvt_roundps_ph&expand=1356)
8290 #[inline]
8291 #[target_feature(enable = "avx512f")]
8292 #[cfg_attr(test, assert_instr(vcvtps2ph, sae = 8))]
8293 #[rustc_args_required_const(2)]
8294 pub unsafe fn _mm512_maskz_cvt_roundps_ph(k: __mmask16, a: __m512, sae: i32) -> __m256i {
8295 macro_rules! call {
8296 ($imm4:expr) => {
8297 vcvtps2ph(a.as_f32x16(), $imm4, _mm256_setzero_si256().as_i16x16(), k)
8298 };
8299 }
8300 let r = constify_imm4_sae!(sae, call);
8301 transmute(r)
8302 }
8303
8304 /// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst.\
8305 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
8306 ///
8307 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvtps_ph&expand=1778)
8308 #[inline]
8309 #[target_feature(enable = "avx512f")]
8310 #[cfg_attr(test, assert_instr(vcvtps2ph, sae = 8))]
8311 #[rustc_args_required_const(1)]
8312 pub unsafe fn _mm512_cvtps_ph(a: __m512, sae: i32) -> __m256i {
8313 macro_rules! call {
8314 ($imm4:expr) => {
8315 vcvtps2ph(
8316 a.as_f32x16(),
8317 $imm4,
8318 _mm256_setzero_si256().as_i16x16(),
8319 0b11111111_11111111,
8320 )
8321 };
8322 }
8323 let r = constify_imm4_sae!(sae, call);
8324 transmute(r)
8325 }
8326
8327 /// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
8328 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
8329 ///
8330 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvtps_ph&expand=1779)
8331 #[inline]
8332 #[target_feature(enable = "avx512f")]
8333 #[cfg_attr(test, assert_instr(vcvtps2ph, sae = 8))]
8334 #[rustc_args_required_const(3)]
8335 pub unsafe fn _mm512_mask_cvtps_ph(src: __m256i, k: __mmask16, a: __m512, sae: i32) -> __m256i {
8336 macro_rules! call {
8337 ($imm4:expr) => {
8338 vcvtps2ph(a.as_f32x16(), $imm4, src.as_i16x16(), k)
8339 };
8340 }
8341 let r = constify_imm4_sae!(sae, call);
8342 transmute(r)
8343 }
8344
8345 /// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
8346 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
8347 ///
8348 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvtps_ph&expand=1780)
8349 #[inline]
8350 #[target_feature(enable = "avx512f")]
8351 #[cfg_attr(test, assert_instr(vcvtps2ph, sae = 8))]
8352 #[rustc_args_required_const(2)]
8353 pub unsafe fn _mm512_maskz_cvtps_ph(k: __mmask16, a: __m512, sae: i32) -> __m256i {
8354 macro_rules! call {
8355 ($imm4:expr) => {
8356 vcvtps2ph(a.as_f32x16(), $imm4, _mm256_setzero_si256().as_i16x16(), k)
8357 };
8358 }
8359 let r = constify_imm4_sae!(sae, call);
8360 transmute(r)
8361 }
8362
8363 /// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.\
8364 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
8365 ///
8366 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvt_roundph_ps&expand=1332)
8367 #[inline]
8368 #[target_feature(enable = "avx512f")]
8369 #[cfg_attr(test, assert_instr(vcvtph2ps, sae = 8))]
8370 #[rustc_args_required_const(1)]
8371 pub unsafe fn _mm512_cvt_roundph_ps(a: __m256i, sae: i32) -> __m512 {
8372 macro_rules! call {
8373 ($imm4:expr) => {
8374 vcvtph2ps(
8375 a.as_i16x16(),
8376 _mm512_setzero_ps().as_f32x16(),
8377 0b11111111_11111111,
8378 $imm4,
8379 )
8380 };
8381 }
8382 let r = constify_imm4_sae!(sae, call);
8383 transmute(r)
8384 }
8385
8386 /// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
8387 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
8388 ///
8389 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvt_roundph_ps&expand=1333)
8390 #[inline]
8391 #[target_feature(enable = "avx512f")]
8392 #[cfg_attr(test, assert_instr(vcvtph2ps, sae = 8))]
8393 #[rustc_args_required_const(3)]
8394 pub unsafe fn _mm512_mask_cvt_roundph_ps(
8395 src: __m512,
8396 k: __mmask16,
8397 a: __m256i,
8398 sae: i32,
8399 ) -> __m512 {
8400 macro_rules! call {
8401 ($imm4:expr) => {
8402 vcvtph2ps(a.as_i16x16(), src.as_f32x16(), k, $imm4)
8403 };
8404 }
8405 let r = constify_imm4_sae!(sae, call);
8406 transmute(r)
8407 }
8408
8409 /// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
8410 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
8411 ///
8412 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvt_roundph_ps&expand=1334)
8413 #[inline]
8414 #[target_feature(enable = "avx512f")]
8415 #[cfg_attr(test, assert_instr(vcvtph2ps, sae = 8))]
8416 #[rustc_args_required_const(2)]
8417 pub unsafe fn _mm512_maskz_cvt_roundph_ps(k: __mmask16, a: __m256i, sae: i32) -> __m512 {
8418 macro_rules! call {
8419 ($imm4:expr) => {
8420 vcvtph2ps(a.as_i16x16(), _mm512_setzero_ps().as_f32x16(), k, $imm4)
8421 };
8422 }
8423 let r = constify_imm4_sae!(sae, call);
8424 transmute(r)
8425 }
8426
8427 /// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.
8428 ///
8429 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvtph_ps&expand=1723)
8430 #[inline]
8431 #[target_feature(enable = "avx512f")]
8432 #[cfg_attr(test, assert_instr(vcvtph2ps))]
8433 pub unsafe fn _mm512_cvtph_ps(a: __m256i) -> __m512 {
8434 transmute(vcvtph2ps(
8435 a.as_i16x16(),
8436 _mm512_setzero_ps().as_f32x16(),
8437 0b11111111_11111111,
8438 _MM_FROUND_NO_EXC,
8439 ))
8440 }
8441
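// A minimal sketch, assuming an AVX-512F-capable CPU; the input bit pattern and helper
// name are illustrative. The input lanes are binary16 values: 0x3C00 is 1.0 in half
// precision and widens exactly to 1.0f32.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_cvtph_ps() {
    let a: __m256i = mem::transmute([0x3C00_u16; 16]);
    let r = _mm512_cvtph_ps(a);
    let lanes: [f32; 16] = mem::transmute(r);
    assert!(lanes.iter().all(|&x| x == 1.0));
}
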
8442 /// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8443 ///
8444 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvtph_ps&expand=1724)
8445 #[inline]
8446 #[target_feature(enable = "avx512f")]
8447 #[cfg_attr(test, assert_instr(vcvtph2ps))]
8448 pub unsafe fn _mm512_mask_cvtph_ps(src: __m512, k: __mmask16, a: __m256i) -> __m512 {
8449 transmute(vcvtph2ps(
8450 a.as_i16x16(),
8451 src.as_f32x16(),
8452 k,
8453 _MM_FROUND_NO_EXC,
8454 ))
8455 }
8456
8457 /// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8458 ///
8459 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvtph_ps&expand=1725)
8460 #[inline]
8461 #[target_feature(enable = "avx512f")]
8462 #[cfg_attr(test, assert_instr(vcvtph2ps))]
8463 pub unsafe fn _mm512_maskz_cvtph_ps(k: __mmask16, a: __m256i) -> __m512 {
8464 transmute(vcvtph2ps(
8465 a.as_i16x16(),
8466 _mm512_setzero_ps().as_f32x16(),
8467 k,
8468 _MM_FROUND_NO_EXC,
8469 ))
8470 }
8471
8472 /// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst.\
8473 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
8474 ///
8475 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvtt_roundps_epi32&expand=1916)
8476 #[inline]
8477 #[target_feature(enable = "avx512f")]
8478 #[cfg_attr(test, assert_instr(vcvttps2dq, sae = 8))]
8479 #[rustc_args_required_const(1)]
8480 pub unsafe fn _mm512_cvtt_roundps_epi32(a: __m512, sae: i32) -> __m512i {
8481 macro_rules! call {
8482 ($imm4:expr) => {
8483 vcvttps2dq(
8484 a.as_f32x16(),
8485 _mm512_setzero_si512().as_i32x16(),
8486 0b11111111_11111111,
8487 $imm4,
8488 )
8489 };
8490 }
8491 let r = constify_imm4_sae!(sae, call);
8492 transmute(r)
8493 }
8494
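// A minimal sketch, assuming an AVX-512F-capable CPU; the input value and helper name
// are illustrative. The `tt` (truncate) forms always round toward zero; `sae` only
// decides whether precision exceptions are suppressed.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_cvtt_roundps_epi32() {
    let a: __m512 = mem::transmute([-1.9_f32; 16]);
    let r = _mm512_cvtt_roundps_epi32(a, _MM_FROUND_NO_EXC);
    let lanes: [i32; 16] = mem::transmute(r);
    // Truncation moves toward zero, so -1.9 becomes -1, not -2.
    assert!(lanes.iter().all(|&x| x == -1));
}
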
8495 /// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
8496 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
8497 ///
8498 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvtt_roundps_epi32&expand=1917)
8499 #[inline]
8500 #[target_feature(enable = "avx512f")]
8501 #[cfg_attr(test, assert_instr(vcvttps2dq, sae = 8))]
8502 #[rustc_args_required_const(3)]
8503 pub unsafe fn _mm512_mask_cvtt_roundps_epi32(
8504 src: __m512i,
8505 k: __mmask16,
8506 a: __m512,
8507 sae: i32,
8508 ) -> __m512i {
8509 macro_rules! call {
8510 ($imm4:expr) => {
8511 vcvttps2dq(a.as_f32x16(), src.as_i32x16(), k, $imm4)
8512 };
8513 }
8514 let r = constify_imm4_sae!(sae, call);
8515 transmute(r)
8516 }
8517
8518 /// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
8519 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
8520 ///
8521 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvtt_roundps_epi32&expand=1918)
8522 #[inline]
8523 #[target_feature(enable = "avx512f")]
8524 #[cfg_attr(test, assert_instr(vcvttps2dq, sae = 8))]
8525 #[rustc_args_required_const(2)]
8526 pub unsafe fn _mm512_maskz_cvtt_roundps_epi32(k: __mmask16, a: __m512, sae: i32) -> __m512i {
8527 macro_rules! call {
8528 ($imm4:expr) => {
8529 vcvttps2dq(a.as_f32x16(), _mm512_setzero_si512().as_i32x16(), k, $imm4)
8530 };
8531 }
8532 let r = constify_imm4_sae!(sae, call);
8533 transmute(r)
8534 }
8535
8536 /// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst.\
8537 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
8538 ///
8539 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvtt_roundps_epu32&expand=1922)
8540 #[inline]
8541 #[target_feature(enable = "avx512f")]
8542 #[cfg_attr(test, assert_instr(vcvttps2udq, sae = 8))]
8543 #[rustc_args_required_const(1)]
8544 pub unsafe fn _mm512_cvtt_roundps_epu32(a: __m512, sae: i32) -> __m512i {
8545 macro_rules! call {
8546 ($imm4:expr) => {
8547 vcvttps2udq(
8548 a.as_f32x16(),
8549 _mm512_setzero_si512().as_i32x16(),
8550 0b11111111_11111111,
8551 $imm4,
8552 )
8553 };
8554 }
8555 let r = constify_imm4_sae!(sae, call);
8556 transmute(r)
8557 }
8558
8559 /// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
8560 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
8561 ///
8562 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvtt_roundps_epu32&expand=1923)
8563 #[inline]
8564 #[target_feature(enable = "avx512f")]
8565 #[cfg_attr(test, assert_instr(vcvttps2udq, sae = 8))]
8566 #[rustc_args_required_const(3)]
8567 pub unsafe fn _mm512_mask_cvtt_roundps_epu32(
8568 src: __m512i,
8569 k: __mmask16,
8570 a: __m512,
8571 sae: i32,
8572 ) -> __m512i {
8573 macro_rules! call {
8574 ($imm4:expr) => {
8575 vcvttps2udq(a.as_f32x16(), src.as_i32x16(), k, $imm4)
8576 };
8577 }
8578 let r = constify_imm4_sae!(sae, call);
8579 transmute(r)
8580 }
8581
8582 /// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
8583 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
8584 ///
8585 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvtt_roundps_epu32&expand=1924)
8586 #[inline]
8587 #[target_feature(enable = "avx512f")]
8588 #[cfg_attr(test, assert_instr(vcvttps2udq, sae = 8))]
8589 #[rustc_args_required_const(2)]
8590 pub unsafe fn _mm512_maskz_cvtt_roundps_epu32(k: __mmask16, a: __m512, sae: i32) -> __m512i {
8591 macro_rules! call {
8592 ($imm4:expr) => {
8593 vcvttps2udq(a.as_f32x16(), _mm512_setzero_si512().as_i32x16(), k, $imm4)
8594 };
8595 }
8596 let r = constify_imm4_sae!(sae, call);
8597 transmute(r)
8598 }
8599
8600 /// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst.\
8601 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
8602 ///
8603 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvtt_roundpd_epi32&expand=1904)
8604 #[inline]
8605 #[target_feature(enable = "avx512f")]
8606 #[cfg_attr(test, assert_instr(vcvttpd2dq, sae = 8))]
8607 #[rustc_args_required_const(1)]
8608 pub unsafe fn _mm512_cvtt_roundpd_epi32(a: __m512d, sae: i32) -> __m256i {
8609 macro_rules! call {
8610 ($imm4:expr) => {
8611 vcvttpd2dq(
8612 a.as_f64x8(),
8613 _mm256_setzero_si256().as_i32x8(),
8614 0b11111111,
8615 $imm4,
8616 )
8617 };
8618 }
8619 let r = constify_imm4_sae!(sae, call);
8620 transmute(r)
8621 }
8622
8623 /// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
8624 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
8625 ///
8626 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvtt_roundpd_epi32&expand=1905)
8627 #[inline]
8628 #[target_feature(enable = "avx512f")]
8629 #[cfg_attr(test, assert_instr(vcvttpd2dq, sae = 8))]
8630 #[rustc_args_required_const(3)]
8631 pub unsafe fn _mm512_mask_cvtt_roundpd_epi32(
8632 src: __m256i,
8633 k: __mmask8,
8634 a: __m512d,
8635 sae: i32,
8636 ) -> __m256i {
8637 macro_rules! call {
8638 ($imm4:expr) => {
8639 vcvttpd2dq(a.as_f64x8(), src.as_i32x8(), k, $imm4)
8640 };
8641 }
8642 let r = constify_imm4_sae!(sae, call);
8643 transmute(r)
8644 }
8645
8646 /// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
8647 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
8648 ///
8649 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvtt_roundpd_epi32&expand=1906)
8650 #[inline]
8651 #[target_feature(enable = "avx512f")]
8652 #[cfg_attr(test, assert_instr(vcvttpd2dq, sae = 8))]
8653 #[rustc_args_required_const(2)]
8654 pub unsafe fn _mm512_maskz_cvtt_roundpd_epi32(k: __mmask8, a: __m512d, sae: i32) -> __m256i {
8655 macro_rules! call {
8656 ($imm4:expr) => {
8657 vcvttpd2dq(a.as_f64x8(), _mm256_setzero_si256().as_i32x8(), k, $imm4)
8658 };
8659 }
8660 let r = constify_imm4_sae!(sae, call);
8661 transmute(r)
8662 }
8663
8664 /// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst.\
8665 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
8666 ///
8667 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvtt_roundpd_epu32&expand=1910)
8668 #[inline]
8669 #[target_feature(enable = "avx512f")]
8670 #[cfg_attr(test, assert_instr(vcvttpd2udq, sae = 8))]
8671 #[rustc_args_required_const(1)]
8672 pub unsafe fn _mm512_cvtt_roundpd_epu32(a: __m512d, sae: i32) -> __m256i {
8673 macro_rules! call {
8674 ($imm4:expr) => {
8675 vcvttpd2udq(
8676 a.as_f64x8(),
8677 _mm256_setzero_si256().as_i32x8(),
8678 0b11111111,
8679 $imm4,
8680 )
8681 };
8682 }
8683 let r = constify_imm4_sae!(sae, call);
8684 transmute(r)
8685 }
8686
8687 /// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).\
8688 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
8689 ///
8690 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvtt_roundpd_epu32&expand=1911)
8691 #[inline]
8692 #[target_feature(enable = "avx512f")]
8693 #[cfg_attr(test, assert_instr(vcvttpd2udq, sae = 8))]
8694 #[rustc_args_required_const(3)]
8695 pub unsafe fn _mm512_mask_cvtt_roundpd_epu32(
8696 src: __m256i,
8697 k: __mmask8,
8698 a: __m512d,
8699 sae: i32,
8700 ) -> __m256i {
8701 macro_rules! call {
8702 ($imm4:expr) => {
8703 vcvttpd2udq(a.as_f64x8(), src.as_i32x8(), k, $imm4)
8704 };
8705 }
8706 let r = constify_imm4_sae!(sae, call);
8707 transmute(r)
8708 }
8709
8710 /// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst.
8711 ///
8712 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvttps_epi32&expand=1984)
8713 #[inline]
8714 #[target_feature(enable = "avx512f")]
8715 #[cfg_attr(test, assert_instr(vcvttps2dq))]
8716 pub unsafe fn _mm512_cvttps_epi32(a: __m512) -> __m512i {
8717 transmute(vcvttps2dq(
8718 a.as_f32x16(),
8719 _mm512_setzero_si512().as_i32x16(),
8720 0b11111111_11111111,
8721 _MM_FROUND_CUR_DIRECTION,
8722 ))
8723 }
8724
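// A minimal sketch, assuming an AVX-512F-capable CPU; the input value and helper name
// are illustrative. The plain form needs no immediate: truncation is implied and the
// current MXCSR state governs exception reporting.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_cvttps_epi32() {
    let a: __m512 = mem::transmute([3.99_f32; 16]);
    let lanes: [i32; 16] = mem::transmute(_mm512_cvttps_epi32(a));
    assert!(lanes.iter().all(|&x| x == 3));
}
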
8725 /// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8726 ///
8727 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvttps_epi32&expand=1985)
8728 #[inline]
8729 #[target_feature(enable = "avx512f")]
8730 #[cfg_attr(test, assert_instr(vcvttps2dq))]
8731 pub unsafe fn _mm512_mask_cvttps_epi32(src: __m512i, k: __mmask16, a: __m512) -> __m512i {
8732 transmute(vcvttps2dq(
8733 a.as_f32x16(),
8734 src.as_i32x16(),
8735 k,
8736 _MM_FROUND_CUR_DIRECTION,
8737 ))
8738 }
8739
8740 /// Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8741 ///
8742 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvttps_epi32&expand=1986)
8743 #[inline]
8744 #[target_feature(enable = "avx512f")]
8745 #[cfg_attr(test, assert_instr(vcvttps2dq))]
8746 pub unsafe fn _mm512_maskz_cvttps_epi32(k: __mmask16, a: __m512) -> __m512i {
8747 transmute(vcvttps2dq(
8748 a.as_f32x16(),
8749 _mm512_setzero_si512().as_i32x16(),
8750 k,
8751 _MM_FROUND_CUR_DIRECTION,
8752 ))
8753 }
8754
8755 /// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst.
8756 ///
8757 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvttps_epu32&expand=2002)
8758 #[inline]
8759 #[target_feature(enable = "avx512f")]
8760 #[cfg_attr(test, assert_instr(vcvttps2udq))]
8761 pub unsafe fn _mm512_cvttps_epu32(a: __m512) -> __m512i {
8762 transmute(vcvttps2udq(
8763 a.as_f32x16(),
8764 _mm512_setzero_si512().as_i32x16(),
8765 0b11111111_11111111,
8766 _MM_FROUND_CUR_DIRECTION,
8767 ))
8768 }
8769
8770 /// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8771 ///
8772 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvttps_epu32&expand=2003)
8773 #[inline]
8774 #[target_feature(enable = "avx512f")]
8775 #[cfg_attr(test, assert_instr(vcvttps2udq))]
8776 pub unsafe fn _mm512_mask_cvttps_epu32(src: __m512i, k: __mmask16, a: __m512) -> __m512i {
8777 transmute(vcvttps2udq(
8778 a.as_f32x16(),
8779 src.as_i32x16(),
8780 k,
8781 _MM_FROUND_CUR_DIRECTION,
8782 ))
8783 }
8784
8785 /// Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8786 ///
8787 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvttps_epu32&expand=2004)
8788 #[inline]
8789 #[target_feature(enable = "avx512f")]
8790 #[cfg_attr(test, assert_instr(vcvttps2udq))]
8791 pub unsafe fn _mm512_maskz_cvttps_epu32(k: __mmask16, a: __m512) -> __m512i {
8792 transmute(vcvttps2udq(
8793 a.as_f32x16(),
8794 _mm512_setzero_si512().as_i32x16(),
8795 k,
8796 _MM_FROUND_CUR_DIRECTION,
8797 ))
8798 }
8799
8800 /// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
8801 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
8802 ///
8803 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvtt_roundpd_epu32&expand=1912)
8804 #[inline]
8805 #[target_feature(enable = "avx512f")]
8806 #[cfg_attr(test, assert_instr(vcvttpd2udq, sae = 8))]
8807 #[rustc_args_required_const(2)]
8808 pub unsafe fn _mm512_maskz_cvtt_roundpd_epu32(k: __mmask8, a: __m512d, sae: i32) -> __m256i {
8809 macro_rules! call {
8810 ($imm4:expr) => {
8811 vcvttpd2udq(a.as_f64x8(), _mm256_setzero_si256().as_i32x8(), k, $imm4)
8812 };
8813 }
8814 let r = constify_imm4_sae!(sae, call);
8815 transmute(r)
8816 }
8817
8818 /// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst.
8819 ///
8820 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvttpd_epi32&expand=1947)
8821 #[inline]
8822 #[target_feature(enable = "avx512f")]
8823 #[cfg_attr(test, assert_instr(vcvttpd2dq))]
8824 pub unsafe fn _mm512_cvttpd_epi32(a: __m512d) -> __m256i {
8825 transmute(vcvttpd2dq(
8826 a.as_f64x8(),
8827 _mm256_setzero_si256().as_i32x8(),
8828 0b11111111,
8829 _MM_FROUND_CUR_DIRECTION,
8830 ))
8831 }
8832
8833 /// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8834 ///
8835 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvttpd_epi32&expand=1948)
8836 #[inline]
8837 #[target_feature(enable = "avx512f")]
8838 #[cfg_attr(test, assert_instr(vcvttpd2dq))]
8839 pub unsafe fn _mm512_mask_cvttpd_epi32(src: __m256i, k: __mmask8, a: __m512d) -> __m256i {
8840 transmute(vcvttpd2dq(
8841 a.as_f64x8(),
8842 src.as_i32x8(),
8843 k,
8844 _MM_FROUND_CUR_DIRECTION,
8845 ))
8846 }
8847
8848 /// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8849 ///
8850 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvttpd_epi32&expand=1949)
8851 #[inline]
8852 #[target_feature(enable = "avx512f")]
8853 #[cfg_attr(test, assert_instr(vcvttpd2dq))]
8854 pub unsafe fn _mm512_maskz_cvttpd_epi32(k: __mmask8, a: __m512d) -> __m256i {
8855 transmute(vcvttpd2dq(
8856 a.as_f64x8(),
8857 _mm256_setzero_si256().as_i32x8(),
8858 k,
8859 _MM_FROUND_CUR_DIRECTION,
8860 ))
8861 }
8862
8863 /// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst.
8864 ///
8865 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cvttpd_epu32&expand=1965)
8866 #[inline]
8867 #[target_feature(enable = "avx512f")]
8868 #[cfg_attr(test, assert_instr(vcvttpd2udq))]
8869 pub unsafe fn _mm512_cvttpd_epu32(a: __m512d) -> __m256i {
8870 transmute(vcvttpd2udq(
8871 a.as_f64x8(),
8872 _mm256_setzero_si256().as_i32x8(),
8873 0b11111111,
8874 _MM_FROUND_CUR_DIRECTION,
8875 ))
8876 }
8877
8878 /// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8879 ///
8880 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cvttpd_epu32&expand=1966)
8881 #[inline]
8882 #[target_feature(enable = "avx512f")]
8883 #[cfg_attr(test, assert_instr(vcvttpd2udq))]
8884 pub unsafe fn _mm512_mask_cvttpd_epu32(src: __m256i, k: __mmask8, a: __m512d) -> __m256i {
8885 transmute(vcvttpd2udq(
8886 a.as_f64x8(),
8887 src.as_i32x8(),
8888 k,
8889 _MM_FROUND_CUR_DIRECTION,
8890 ))
8891 }
8892
8893 /// Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8894 ///
8895 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_cvttpd_epu32&expand=1967)
8896 #[inline]
8897 #[target_feature(enable = "avx512f")]
8898 #[cfg_attr(test, assert_instr(vcvttpd2udq))]
8899 pub unsafe fn _mm512_maskz_cvttpd_epu32(k: __mmask8, a: __m512d) -> __m256i {
8900 transmute(vcvttpd2udq(
8901 a.as_f64x8(),
8902 _mm256_setzero_si256().as_i32x8(),
8903 k,
8904 _MM_FROUND_CUR_DIRECTION,
8905 ))
8906 }
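
// Illustrative sketch (not part of the upstream source): unsigned truncating
// conversion. The inputs are deliberately non-negative and in range for u32;
// the helper name is hypothetical.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_cvttpd_epu32() -> __m256i {
    let a = _mm512_set_pd(7.9, 6.5, 5.1, 4.0, 3.9, 2.5, 1.1, 0.0);
    // Truncates toward zero, giving the unsigned values 0, 1, 2, 3, 4, 5, 6, 7 in lane order.
    _mm512_cvttpd_epu32(a)
}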
8907
8908 /// Returns vector of type `__m512d` with all elements set to zero.
8909 ///
8910 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#avx512techs=AVX512F&expand=33,34,4990&text=_mm512_setzero_pd)
8911 #[inline]
8912 #[target_feature(enable = "avx512f")]
8913 #[cfg_attr(test, assert_instr(vxorps))]
8914 pub unsafe fn _mm512_setzero_pd() -> __m512d {
8915 // All-0 is a properly initialized __m512d
8916 mem::zeroed()
8917 }
8918
8919 /// Returns vector of type `__m512` with all elements set to zero.
8920 ///
8921 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#avx512techs=AVX512F&expand=33,34,4990&text=_mm512_setzero_ps)
8922 #[inline]
8923 #[target_feature(enable = "avx512f")]
8924 #[cfg_attr(test, assert_instr(vxorps))]
8925 pub unsafe fn _mm512_setzero_ps() -> __m512 {
8926 // All-0 is a properly initialized __m512
8927 mem::zeroed()
8928 }
8929
8930 /// Returns vector of type `__m512` with all elements set to zero.
8931 ///
8932 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_setzero&expand=5014)
8933 #[inline]
8934 #[target_feature(enable = "avx512f")]
8935 #[cfg_attr(test, assert_instr(vxorps))]
8936 pub unsafe fn _mm512_setzero() -> __m512 {
8937 // All-0 is a properly initialized __m512
8938 mem::zeroed()
8939 }
8940
8941 /// Returns vector of type `__m512i` with all elements set to zero.
8942 ///
8943 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#avx512techs=AVX512F&expand=33,34,4990&text=_mm512_setzero_si512)
8944 #[inline]
8945 #[target_feature(enable = "avx512f")]
8946 #[cfg_attr(test, assert_instr(vxorps))]
8947 pub unsafe fn _mm512_setzero_si512() -> __m512i {
8948 // All-0 is a properly initialized __m512i
8949 mem::zeroed()
8950 }
8951
8952 /// Returns vector of type `__m512i` with all elements set to zero.
8953 ///
8954 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_setzero_epi32&expand=5015)
8955 #[inline]
8956 #[target_feature(enable = "avx512f")]
8957 #[cfg_attr(test, assert_instr(vxorps))]
8958 pub unsafe fn _mm512_setzero_epi32() -> __m512i {
8959 // All-0 is a properly initialized __m512i
8960 mem::zeroed()
8961 }
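
// Illustrative sketch (not part of the upstream source): the zero constructors
// above only differ in the nominal element type; each yields the same all-zero
// 512-bit register. The helper name is hypothetical.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_zero_init() -> (__m512d, __m512, __m512i) {
    (_mm512_setzero_pd(), _mm512_setzero_ps(), _mm512_setzero_si512())
}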
8962
8963 /// Sets packed 32-bit integers in `dst` with the supplied values in reverse
8964 /// order.
8965 #[inline]
8966 #[target_feature(enable = "avx512f")]
8967 pub unsafe fn _mm512_setr_epi32(
8968 e15: i32,
8969 e14: i32,
8970 e13: i32,
8971 e12: i32,
8972 e11: i32,
8973 e10: i32,
8974 e9: i32,
8975 e8: i32,
8976 e7: i32,
8977 e6: i32,
8978 e5: i32,
8979 e4: i32,
8980 e3: i32,
8981 e2: i32,
8982 e1: i32,
8983 e0: i32,
8984 ) -> __m512i {
8985 let r = i32x16(
8986 e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0,
8987 );
8988 transmute(r)
8989 }
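
// Illustrative sketch (not part of the upstream source): with the "r"
// (reversed) constructor the first argument lands in lane 0, so the vector
// below holds 0, 1, ..., 15 in lane order. The helper name is hypothetical.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_setr_epi32_identity() -> __m512i {
    _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
}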
8990
8991 /// Set packed 8-bit integers in dst with the supplied values.
8992 ///
8993 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_set_epi8&expand=4915)
8994 #[inline]
8995 #[target_feature(enable = "avx512f")]
8996 pub unsafe fn _mm512_set_epi8(
8997 e63: i8,
8998 e62: i8,
8999 e61: i8,
9000 e60: i8,
9001 e59: i8,
9002 e58: i8,
9003 e57: i8,
9004 e56: i8,
9005 e55: i8,
9006 e54: i8,
9007 e53: i8,
9008 e52: i8,
9009 e51: i8,
9010 e50: i8,
9011 e49: i8,
9012 e48: i8,
9013 e47: i8,
9014 e46: i8,
9015 e45: i8,
9016 e44: i8,
9017 e43: i8,
9018 e42: i8,
9019 e41: i8,
9020 e40: i8,
9021 e39: i8,
9022 e38: i8,
9023 e37: i8,
9024 e36: i8,
9025 e35: i8,
9026 e34: i8,
9027 e33: i8,
9028 e32: i8,
9029 e31: i8,
9030 e30: i8,
9031 e29: i8,
9032 e28: i8,
9033 e27: i8,
9034 e26: i8,
9035 e25: i8,
9036 e24: i8,
9037 e23: i8,
9038 e22: i8,
9039 e21: i8,
9040 e20: i8,
9041 e19: i8,
9042 e18: i8,
9043 e17: i8,
9044 e16: i8,
9045 e15: i8,
9046 e14: i8,
9047 e13: i8,
9048 e12: i8,
9049 e11: i8,
9050 e10: i8,
9051 e9: i8,
9052 e8: i8,
9053 e7: i8,
9054 e6: i8,
9055 e5: i8,
9056 e4: i8,
9057 e3: i8,
9058 e2: i8,
9059 e1: i8,
9060 e0: i8,
9061 ) -> __m512i {
9062 let r = i8x64(
9063 e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, e19,
9064 e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31, e32, e33, e34, e35, e36, e37,
9065 e38, e39, e40, e41, e42, e43, e44, e45, e46, e47, e48, e49, e50, e51, e52, e53, e54, e55,
9066 e56, e57, e58, e59, e60, e61, e62, e63,
9067 );
9068 transmute(r)
9069 }
9070
9071 /// Set packed 16-bit integers in dst with the supplied values.
9072 ///
9073 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_set_epi16&expand=4905)
9074 #[inline]
9075 #[target_feature(enable = "avx512f")]
9076 pub unsafe fn _mm512_set_epi16(
9077 e31: i16,
9078 e30: i16,
9079 e29: i16,
9080 e28: i16,
9081 e27: i16,
9082 e26: i16,
9083 e25: i16,
9084 e24: i16,
9085 e23: i16,
9086 e22: i16,
9087 e21: i16,
9088 e20: i16,
9089 e19: i16,
9090 e18: i16,
9091 e17: i16,
9092 e16: i16,
9093 e15: i16,
9094 e14: i16,
9095 e13: i16,
9096 e12: i16,
9097 e11: i16,
9098 e10: i16,
9099 e9: i16,
9100 e8: i16,
9101 e7: i16,
9102 e6: i16,
9103 e5: i16,
9104 e4: i16,
9105 e3: i16,
9106 e2: i16,
9107 e1: i16,
9108 e0: i16,
9109 ) -> __m512i {
9110 let r = i16x32(
9111 e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, e19,
9112 e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31,
9113 );
9114 transmute(r)
9115 }
9116
9117 /// Set packed 32-bit integers in dst with the repeated 4 element sequence.
9118 ///
9119 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_set4_epi32&expand=4982)
9120 #[inline]
9121 #[target_feature(enable = "avx512f")]
9122 pub unsafe fn _mm512_set4_epi32(d: i32, c: i32, b: i32, a: i32) -> __m512i {
9123 _mm512_set_epi32(d, c, b, a, d, c, b, a, d, c, b, a, d, c, b, a)
9124 }
9125
9126 /// Set packed single-precision (32-bit) floating-point elements in dst with the repeated 4 element sequence.
9127 ///
9128 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_set4_ps&expand=4985)
9129 #[inline]
9130 #[target_feature(enable = "avx512f")]
9131 pub unsafe fn _mm512_set4_ps(d: f32, c: f32, b: f32, a: f32) -> __m512 {
9132 _mm512_set_ps(d, c, b, a, d, c, b, a, d, c, b, a, d, c, b, a)
9133 }
9134
9135 /// Set packed double-precision (64-bit) floating-point elements in dst with the repeated 4 element sequence.
9136 ///
9137 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_set4_pd&expand=4984)
9138 #[inline]
9139 #[target_feature(enable = "avx512f")]
9140 pub unsafe fn _mm512_set4_pd(d: f64, c: f64, b: f64, a: f64) -> __m512d {
9141 _mm512_set_pd(d, c, b, a, d, c, b, a)
9142 }
9143
9144 /// Set packed 32-bit integers in dst with the repeated 4 element sequence in reverse order.
9145 ///
9146 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_setr4_epi32&expand=5009)
9147 #[inline]
9148 #[target_feature(enable = "avx512f")]
9149 pub unsafe fn _mm512_setr4_epi32(d: i32, c: i32, b: i32, a: i32) -> __m512i {
9150 _mm512_set_epi32(a, b, c, d, a, b, c, d, a, b, c, d, a, b, c, d)
9151 }
9152
9153 /// Set packed single-precision (32-bit) floating-point elements in dst with the repeated 4 element sequence in reverse order.
9154 ///
9155 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_setr4_ps&expand=5012)
9156 #[inline]
9157 #[target_feature(enable = "avx512f")]
9158 pub unsafe fn _mm512_setr4_ps(d: f32, c: f32, b: f32, a: f32) -> __m512 {
9159 _mm512_set_ps(a, b, c, d, a, b, c, d, a, b, c, d, a, b, c, d)
9160 }
9161
9162 /// Set packed double-precision (64-bit) floating-point elements in dst with the repeated 4 element sequence in reverse order.
9163 ///
9164 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_setr4_pd&expand=5011)
9165 #[inline]
9166 #[target_feature(enable = "avx512f")]
9167 pub unsafe fn _mm512_setr4_pd(d: f64, c: f64, b: f64, a: f64) -> __m512d {
9168 _mm512_set_pd(a, b, c, d, a, b, c, d)
9169 }
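
// Illustrative sketch (not part of the upstream source): `_mm512_set4_epi32`
// places its last argument in lane 0, while the "r" variant places its first
// argument in lane 0; both tile the 4-element pattern across all 16 lanes.
// The helper name is hypothetical.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_set4_patterns() -> (__m512i, __m512i) {
    let hi_to_lo = _mm512_set4_epi32(3, 2, 1, 0); // lanes: 0, 1, 2, 3, 0, 1, 2, 3, ...
    let lo_to_hi = _mm512_setr4_epi32(3, 2, 1, 0); // lanes: 3, 2, 1, 0, 3, 2, 1, 0, ...
    (hi_to_lo, lo_to_hi)
}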
9170
9171 /// Set packed 64-bit integers in dst with the supplied values.
9172 ///
9173 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_set_epi64&expand=4910)
9174 #[inline]
9175 #[target_feature(enable = "avx512f")]
9176 pub unsafe fn _mm512_set_epi64(
9177 e0: i64,
9178 e1: i64,
9179 e2: i64,
9180 e3: i64,
9181 e4: i64,
9182 e5: i64,
9183 e6: i64,
9184 e7: i64,
9185 ) -> __m512i {
9186 _mm512_setr_epi64(e7, e6, e5, e4, e3, e2, e1, e0)
9187 }
9188
9189 /// Set packed 64-bit integers in dst with the supplied values in reverse order.
9190 ///
9191 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_setr_epi64&expand=4993)
9192 #[inline]
9193 #[target_feature(enable = "avx512f")]
9194 pub unsafe fn _mm512_setr_epi64(
9195 e0: i64,
9196 e1: i64,
9197 e2: i64,
9198 e3: i64,
9199 e4: i64,
9200 e5: i64,
9201 e6: i64,
9202 e7: i64,
9203 ) -> __m512i {
9204 let r = i64x8::new(e0, e1, e2, e3, e4, e5, e6, e7);
9205 transmute(r)
9206 }
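
// Illustrative sketch (not part of the upstream source): `_mm512_set_epi64`
// takes the highest lane first, so the call below places 0 in lane 0 and 7 in
// lane 7; `_mm512_setr_epi64` takes the same values in plain lane order. The
// helper name is hypothetical.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_set_epi64_orderings() -> (__m512i, __m512i) {
    let descending_args = _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0);
    let ascending_args = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
    // Both vectors hold 0, 1, ..., 7 in lane order.
    (descending_args, ascending_args)
}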
9207
9208 /// Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
9209 ///
9210 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_i32gather_pd&expand=3002)
9211 #[inline]
9212 #[target_feature(enable = "avx512f")]
9213 #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))]
9214 #[rustc_args_required_const(2)]
9215 pub unsafe fn _mm512_i32gather_pd(offsets: __m256i, slice: *const u8, scale: i32) -> __m512d {
9216 let zero = _mm512_setzero_pd().as_f64x8();
9217 let neg_one = -1;
9218 let slice = slice as *const i8;
9219 let offsets = offsets.as_i32x8();
9220 macro_rules! call {
9221 ($imm8:expr) => {
9222 vgatherdpd(zero, slice, offsets, neg_one, $imm8)
9223 };
9224 }
9225 let r = constify_imm8_gather!(scale, call);
9226 transmute(r)
9227 }
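
// Illustrative sketch (not part of the upstream source): gathers eight f64
// values out of a flat array using 32-bit element indices and a scale of 8
// (the element size in bytes). The table shape and helper name are hypothetical.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_i32gather_pd(table: &[f64; 16]) -> __m512d {
    // Pick every other element: table[0], table[2], ..., table[14].
    let idx = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14);
    _mm512_i32gather_pd(idx, table.as_ptr() as *const u8, 8)
}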
9228
9229 /// Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
9230 ///
9231 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_i32gather_pd&expand=3003)
9232 #[inline]
9233 #[target_feature(enable = "avx512f")]
9234 #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))]
9235 #[rustc_args_required_const(4)]
9236 pub unsafe fn _mm512_mask_i32gather_pd(
9237 src: __m512d,
9238 mask: __mmask8,
9239 offsets: __m256i,
9240 slice: *const u8,
9241 scale: i32,
9242 ) -> __m512d {
9243 let src = src.as_f64x8();
9244 let slice = slice as *const i8;
9245 let offsets = offsets.as_i32x8();
9246 macro_rules! call {
9247 ($imm8:expr) => {
9248 vgatherdpd(src, slice, offsets, mask as i8, $imm8)
9249 };
9250 }
9251 let r = constify_imm8_gather!(scale, call);
9252 transmute(r)
9253 }
9254
9255 /// Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
9256 ///
9257 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_i64gather_pd&expand=3092)
9258 #[inline]
9259 #[target_feature(enable = "avx512f")]
9260 #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))]
9261 #[rustc_args_required_const(2)]
9262 pub unsafe fn _mm512_i64gather_pd(offsets: __m512i, slice: *const u8, scale: i32) -> __m512d {
9263 let zero = _mm512_setzero_pd().as_f64x8();
9264 let neg_one = -1;
9265 let slice = slice as *const i8;
9266 let offsets = offsets.as_i64x8();
9267 macro_rules! call {
9268 ($imm8:expr) => {
9269 vgatherqpd(zero, slice, offsets, neg_one, $imm8)
9270 };
9271 }
9272 let r = constify_imm8_gather!(scale, call);
9273 transmute(r)
9274 }
9275
9276 /// Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
9277 ///
9278 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_i64gather_pd&expand=3093)
9279 #[inline]
9280 #[target_feature(enable = "avx512f")]
9281 #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))]
9282 #[rustc_args_required_const(4)]
9283 pub unsafe fn _mm512_mask_i64gather_pd(
9284 src: __m512d,
9285 mask: __mmask8,
9286 offsets: __m512i,
9287 slice: *const u8,
9288 scale: i32,
9289 ) -> __m512d {
9290 let src = src.as_f64x8();
9291 let slice = slice as *const i8;
9292 let offsets = offsets.as_i64x8();
9293 macro_rules! call {
9294 ($imm8:expr) => {
9295 vgatherqpd(src, slice, offsets, mask as i8, $imm8)
9296 };
9297 }
9298 let r = constify_imm8_gather!(scale, call);
9299 transmute(r)
9300 }
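
// Illustrative sketch (not part of the upstream source): masked gather with
// 64-bit indices. Lanes whose mask bit is clear keep the value from `src`
// instead of touching memory. The helper name is hypothetical.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_mask_i64gather_pd(table: &[f64; 8], src: __m512d) -> __m512d {
    let idx = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
    // Only the low four lanes are loaded from `table`; the rest pass through `src`.
    _mm512_mask_i64gather_pd(src, 0b00001111, idx, table.as_ptr() as *const u8, 8)
}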
9301
9302 /// Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
9303 ///
9304 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_i64gather_ps&expand=3100)
9305 #[inline]
9306 #[target_feature(enable = "avx512f")]
9307 #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))]
9308 #[rustc_args_required_const(2)]
9309 pub unsafe fn _mm512_i64gather_ps(offsets: __m512i, slice: *const u8, scale: i32) -> __m256 {
9310 let zero = _mm256_setzero_ps().as_f32x8();
9311 let neg_one = -1;
9312 let slice = slice as *const i8;
9313 let offsets = offsets.as_i64x8();
9314 macro_rules! call {
9315 ($imm8:expr) => {
9316 vgatherqps(zero, slice, offsets, neg_one, $imm8)
9317 };
9318 }
9319 let r = constify_imm8_gather!(scale, call);
9320 transmute(r)
9321 }
9322
9323 /// Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
9324 ///
9325 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_i64gather_ps&expand=3101)
9326 #[inline]
9327 #[target_feature(enable = "avx512f")]
9328 #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))]
9329 #[rustc_args_required_const(4)]
9330 pub unsafe fn _mm512_mask_i64gather_ps(
9331 src: __m256,
9332 mask: __mmask8,
9333 offsets: __m512i,
9334 slice: *const u8,
9335 scale: i32,
9336 ) -> __m256 {
9337 let src = src.as_f32x8();
9338 let slice = slice as *const i8;
9339 let offsets = offsets.as_i64x8();
9340 macro_rules! call {
9341 ($imm8:expr) => {
9342 vgatherqps(src, slice, offsets, mask as i8, $imm8)
9343 };
9344 }
9345 let r = constify_imm8_gather!(scale, call);
9346 transmute(r)
9347 }
9348
9349 /// Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
9350 ///
9351 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_i32gather_ps&expand=3010)
9352 #[inline]
9353 #[target_feature(enable = "avx512f")]
9354 #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))]
9355 #[rustc_args_required_const(2)]
9356 pub unsafe fn _mm512_i32gather_ps(offsets: __m512i, slice: *const u8, scale: i32) -> __m512 {
9357 let zero = _mm512_setzero_ps().as_f32x16();
9358 let neg_one = -1;
9359 let slice = slice as *const i8;
9360 let offsets = offsets.as_i32x16();
9361 macro_rules! call {
9362 ($imm8:expr) => {
9363 vgatherdps(zero, slice, offsets, neg_one, $imm8)
9364 };
9365 }
9366 let r = constify_imm8_gather!(scale, call);
9367 transmute(r)
9368 }
9369
9370 /// Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
9371 ///
9372 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_i32gather_ps&expand=3011)
9373 #[inline]
9374 #[target_feature(enable = "avx512f")]
9375 #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))]
9376 #[rustc_args_required_const(4)]
9377 pub unsafe fn _mm512_mask_i32gather_ps(
9378 src: __m512,
9379 mask: __mmask16,
9380 offsets: __m512i,
9381 slice: *const u8,
9382 scale: i32,
9383 ) -> __m512 {
9384 let src = src.as_f32x16();
9385 let slice = slice as *const i8;
9386 let offsets = offsets.as_i32x16();
9387 macro_rules! call {
9388 ($imm8:expr) => {
9389 vgatherdps(src, slice, offsets, mask as i16, $imm8)
9390 };
9391 }
9392 let r = constify_imm8_gather!(scale, call);
9393 transmute(r)
9394 }
9395
9396 /// Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
9397 ///
9398 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_i32gather_epi32&expand=2986)
9399 #[inline]
9400 #[target_feature(enable = "avx512f")]
9401 #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))]
9402 #[rustc_args_required_const(2)]
9403 pub unsafe fn _mm512_i32gather_epi32(offsets: __m512i, slice: *const u8, scale: i32) -> __m512i {
9404 let zero = _mm512_setzero_si512().as_i32x16();
9405 let neg_one = -1;
9406 let slice = slice as *const i8;
9407 let offsets = offsets.as_i32x16();
9408 macro_rules! call {
9409 ($imm8:expr) => {
9410 vpgatherdd(zero, slice, offsets, neg_one, $imm8)
9411 };
9412 }
9413 let r = constify_imm8_gather!(scale, call);
9414 transmute(r)
9415 }
9416
9417 /// Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
9418 ///
9419 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_i32gather_epi32&expand=2987)
9420 #[inline]
9421 #[target_feature(enable = "avx512f")]
9422 #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))]
9423 #[rustc_args_required_const(4)]
9424 pub unsafe fn _mm512_mask_i32gather_epi32(
9425 src: __m512i,
9426 mask: __mmask16,
9427 offsets: __m512i,
9428 slice: *const u8,
9429 scale: i32,
9430 ) -> __m512i {
9431 let src = src.as_i32x16();
9432 let mask = mask as i16;
9433 let slice = slice as *const i8;
9434 let offsets = offsets.as_i32x16();
9435 macro_rules! call {
9436 ($imm8:expr) => {
9437 vpgatherdd(src, slice, offsets, mask, $imm8)
9438 };
9439 }
9440 let r = constify_imm8_gather!(scale, call);
9441 transmute(r)
9442 }
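
// Illustrative sketch (not part of the upstream source): masked 32-bit integer
// gather; with scale 4 each 32-bit index is a plain element index into the
// table. The table shape and helper name are hypothetical.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_mask_i32gather_epi32(table: &[i32; 32], src: __m512i) -> __m512i {
    let idx = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
    // Lanes with their mask bit set load from `table`; the odd lanes keep `src`.
    _mm512_mask_i32gather_epi32(src, 0b0101_0101_0101_0101, idx, table.as_ptr() as *const u8, 4)
}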
9443
9444 /// Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
9445 ///
9446 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_i32gather_epi64&expand=2994)
9447 #[inline]
9448 #[target_feature(enable = "avx512f")]
9449 #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))]
9450 #[rustc_args_required_const(2)]
9451 pub unsafe fn _mm512_i32gather_epi64(offsets: __m256i, slice: *const u8, scale: i32) -> __m512i {
9452 let zero = _mm512_setzero_si512().as_i64x8();
9453 let neg_one = -1;
9454 let slice = slice as *const i8;
9455 let offsets = offsets.as_i32x8();
9456 macro_rules! call {
9457 ($imm8:expr) => {
9458 vpgatherdq(zero, slice, offsets, neg_one, $imm8)
9459 };
9460 }
9461 let r = constify_imm8_gather!(scale, call);
9462 transmute(r)
9463 }
9464
9465 /// Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
9466 ///
9467 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_i32gather_epi64&expand=2995)
9468 #[inline]
9469 #[target_feature(enable = "avx512f")]
9470 #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))]
9471 #[rustc_args_required_const(4)]
9472 pub unsafe fn _mm512_mask_i32gather_epi64(
9473 src: __m512i,
9474 mask: __mmask8,
9475 offsets: __m256i,
9476 slice: *const u8,
9477 scale: i32,
9478 ) -> __m512i {
9479 let src = src.as_i64x8();
9480 let mask = mask as i8;
9481 let slice = slice as *const i8;
9482 let offsets = offsets.as_i32x8();
9483 macro_rules! call {
9484 ($imm8:expr) => {
9485 vpgatherdq(src, slice, offsets, mask, $imm8)
9486 };
9487 }
9488 let r = constify_imm8_gather!(scale, call);
9489 transmute(r)
9490 }
9491
9492 /// Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
9493 ///
9494 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_i64gather_epi64&expand=3084)
9495 #[inline]
9496 #[target_feature(enable = "avx512f")]
9497 #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))]
9498 #[rustc_args_required_const(2)]
9499 pub unsafe fn _mm512_i64gather_epi64(offsets: __m512i, slice: *const u8, scale: i32) -> __m512i {
9500 let zero = _mm512_setzero_si512().as_i64x8();
9501 let neg_one = -1;
9502 let slice = slice as *const i8;
9503 let offsets = offsets.as_i64x8();
9504 macro_rules! call {
9505 ($imm8:expr) => {
9506 vpgatherqq(zero, slice, offsets, neg_one, $imm8)
9507 };
9508 }
9509 let r = constify_imm8_gather!(scale, call);
9510 transmute(r)
9511 }
9512
9513 /// Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
9514 ///
9515 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_i64gather_epi64&expand=3085)
9516 #[inline]
9517 #[target_feature(enable = "avx512f")]
9518 #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))]
9519 #[rustc_args_required_const(4)]
9520 pub unsafe fn _mm512_mask_i64gather_epi64(
9521 src: __m512i,
9522 mask: __mmask8,
9523 offsets: __m512i,
9524 slice: *const u8,
9525 scale: i32,
9526 ) -> __m512i {
9527 let src = src.as_i64x8();
9528 let mask = mask as i8;
9529 let slice = slice as *const i8;
9530 let offsets = offsets.as_i64x8();
9531 macro_rules! call {
9532 ($imm8:expr) => {
9533 vpgatherqq(src, slice, offsets, mask, $imm8)
9534 };
9535 }
9536 let r = constify_imm8_gather!(scale, call);
9537 transmute(r)
9538 }
9539
9540 /// Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
9541 ///
9542 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_i64gather_epi32&expand=3074)
9543 #[inline]
9544 #[target_feature(enable = "avx512f")]
9545 #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))]
9546 #[rustc_args_required_const(2)]
9547 pub unsafe fn _mm512_i64gather_epi32(offsets: __m512i, slice: *const u8, scale: i32) -> __m256i {
9548 let zeros = _mm256_setzero_si256().as_i32x8();
9549 let neg_one = -1;
9550 let slice = slice as *const i8;
9551 let offsets = offsets.as_i64x8();
9552 macro_rules! call {
9553 ($imm8:expr) => {
9554 vpgatherqd(zeros, slice, offsets, neg_one, $imm8)
9555 };
9556 }
9557 let r = constify_imm8_gather!(scale, call);
9558 transmute(r)
9559 }
9560
9561 /// Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
9562 ///
9563 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_i64gather_epi32&expand=3075)
9564 #[inline]
9565 #[target_feature(enable = "avx512f")]
9566 #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))]
9567 #[rustc_args_required_const(4)]
9568 pub unsafe fn _mm512_mask_i64gather_epi32(
9569 src: __m256i,
9570 mask: __mmask8,
9571 offsets: __m512i,
9572 slice: *const u8,
9573 scale: i32,
9574 ) -> __m256i {
9575 let src = src.as_i32x8();
9576 let mask = mask as i8;
9577 let slice = slice as *const i8;
9578 let offsets = offsets.as_i64x8();
9579 macro_rules! call {
9580 ($imm8:expr) => {
9581 vpgatherqd(src, slice, offsets, mask, $imm8)
9582 };
9583 }
9584 let r = constify_imm8_gather!(scale, call);
9585 transmute(r)
9586 }
9587
9588 /// Scatter double-precision (64-bit) floating-point elements from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
9589 ///
9590 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_i32scatter_pd&expand=3044)
9591 #[inline]
9592 #[target_feature(enable = "avx512f")]
9593 #[cfg_attr(test, assert_instr(vscatterdpd, scale = 1))]
9594 #[rustc_args_required_const(3)]
9595 pub unsafe fn _mm512_i32scatter_pd(slice: *mut u8, offsets: __m256i, src: __m512d, scale: i32) {
9596 let src = src.as_f64x8();
9597 let neg_one = -1;
9598 let slice = slice as *mut i8;
9599 let offsets = offsets.as_i32x8();
9600 macro_rules! call {
9601 ($imm8:expr) => {
9602 vscatterdpd(slice, neg_one, offsets, src, $imm8)
9603 };
9604 }
9605 constify_imm8_gather!(scale, call);
9606 }
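
// Illustrative sketch (not part of the upstream source): scatters the eight
// f64 lanes of `src` into a flat array at 32-bit element indices, with a scale
// of 8 bytes per element. The buffer shape and helper name are hypothetical.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_i32scatter_pd(out: &mut [f64; 16], src: __m512d) {
    // Lane i of `src` is written to out[2 * i].
    let idx = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14);
    _mm512_i32scatter_pd(out.as_mut_ptr() as *mut u8, idx, src, 8);
}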
9607
9608 /// Scatter double-precision (64-bit) floating-point elements from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
9609 ///
9610 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_i32scatter_pd&expand=3045)
9611 #[inline]
9612 #[target_feature(enable = "avx512f")]
9613 #[cfg_attr(test, assert_instr(vscatterdpd, scale = 1))]
9614 #[rustc_args_required_const(4)]
9615 pub unsafe fn _mm512_mask_i32scatter_pd(
9616 slice: *mut u8,
9617 mask: __mmask8,
9618 offsets: __m256i,
9619 src: __m512d,
9620 scale: i32,
9621 ) {
9622 let src = src.as_f64x8();
9623 let slice = slice as *mut i8;
9624 let offsets = offsets.as_i32x8();
9625 macro_rules! call {
9626 ($imm8:expr) => {
9627 vscatterdpd(slice, mask as i8, offsets, src, $imm8)
9628 };
9629 }
9630 constify_imm8_gather!(scale, call);
9631 }
9632
9633 /// Scatter double-precision (64-bit) floating-point elements from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
9634 ///
9635 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_i64scatter_pd&expand=3122)
9636 #[inline]
9637 #[target_feature(enable = "avx512f")]
9638 #[cfg_attr(test, assert_instr(vscatterqpd, scale = 1))]
9639 #[rustc_args_required_const(3)]
9640 pub unsafe fn _mm512_i64scatter_pd(slice: *mut u8, offsets: __m512i, src: __m512d, scale: i32) {
9641 let src = src.as_f64x8();
9642 let neg_one = -1;
9643 let slice = slice as *mut i8;
9644 let offsets = offsets.as_i64x8();
9645 macro_rules! call {
9646 ($imm8:expr) => {
9647 vscatterqpd(slice, neg_one, offsets, src, $imm8)
9648 };
9649 }
9650 constify_imm8_gather!(scale, call);
9651 }
9652
9653 /// Scatter double-precision (64-bit) floating-point elements from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
9654 ///
9655 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_i64scatter_pd&expand=3123)
9656 #[inline]
9657 #[target_feature(enable = "avx512f")]
9658 #[cfg_attr(test, assert_instr(vscatterqpd, scale = 1))]
9659 #[rustc_args_required_const(4)]
9660 pub unsafe fn _mm512_mask_i64scatter_pd(
9661 slice: *mut u8,
9662 mask: __mmask8,
9663 offsets: __m512i,
9664 src: __m512d,
9665 scale: i32,
9666 ) {
9667 let src = src.as_f64x8();
9668 let slice = slice as *mut i8;
9669 let offsets = offsets.as_i64x8();
9670 macro_rules! call {
9671 ($imm8:expr) => {
9672 vscatterqpd(slice, mask as i8, offsets, src, $imm8)
9673 };
9674 }
9675 constify_imm8_gather!(scale, call);
9676 }
9677
9678 /// Scatter single-precision (32-bit) floating-point elements from a into memory using 32-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
9679 ///
9680 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_i32scatter_ps&expand=3050)
9681 #[inline]
9682 #[target_feature(enable = "avx512f")]
9683 #[cfg_attr(test, assert_instr(vscatterdps, scale = 1))]
9684 #[rustc_args_required_const(3)]
9685 pub unsafe fn _mm512_i32scatter_ps(slice: *mut u8, offsets: __m512i, src: __m512, scale: i32) {
9686 let src = src.as_f32x16();
9687 let neg_one = -1;
9688 let slice = slice as *mut i8;
9689 let offsets = offsets.as_i32x16();
9690 macro_rules! call {
9691 ($imm8:expr) => {
9692 vscatterdps(slice, neg_one, offsets, src, $imm8)
9693 };
9694 }
9695 constify_imm8_gather!(scale, call);
9696 }
9697
9698 /// Scatter single-precision (32-bit) floating-point elements from a into memory using 32-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
9699 ///
9700 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_i32scatter_ps&expand=3051)
9701 #[inline]
9702 #[target_feature(enable = "avx512f")]
9703 #[cfg_attr(test, assert_instr(vscatterdps, scale = 1))]
9704 #[rustc_args_required_const(4)]
9705 pub unsafe fn _mm512_mask_i32scatter_ps(
9706 slice: *mut u8,
9707 mask: __mmask16,
9708 offsets: __m512i,
9709 src: __m512,
9710 scale: i32,
9711 ) {
9712 let src = src.as_f32x16();
9713 let slice = slice as *mut i8;
9714 let offsets = offsets.as_i32x16();
9715 macro_rules! call {
9716 ($imm8:expr) => {
9717 vscatterdps(slice, mask as i16, offsets, src, $imm8)
9718 };
9719 }
9720 constify_imm8_gather!(scale, call);
9721 }
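
// Illustrative sketch (not part of the upstream source): masked scatter of
// single-precision lanes; lanes whose mask bit is clear leave memory untouched.
// The buffer shape and helper name are hypothetical.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_mask_i32scatter_ps(out: &mut [f32; 16], src: __m512) {
    let idx = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    // Only the low eight lanes are stored; out[8..] is left as it was.
    _mm512_mask_i32scatter_ps(out.as_mut_ptr() as *mut u8, 0b0000_0000_1111_1111, idx, src, 4);
}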
9722
9723 /// Scatter single-precision (32-bit) floating-point elements from a into memory using 64-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
9724 ///
9725 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_i64scatter_ps&expand=3128)
9726 #[inline]
9727 #[target_feature(enable = "avx512f")]
9728 #[cfg_attr(test, assert_instr(vscatterqps, scale = 1))]
9729 #[rustc_args_required_const(3)]
9730 pub unsafe fn _mm512_i64scatter_ps(slice: *mut u8, offsets: __m512i, src: __m256, scale: i32) {
9731 let src = src.as_f32x8();
9732 let neg_one = -1;
9733 let slice = slice as *mut i8;
9734 let offsets = offsets.as_i64x8();
9735 macro_rules! call {
9736 ($imm8:expr) => {
9737 vscatterqps(slice, neg_one, offsets, src, $imm8)
9738 };
9739 }
9740 constify_imm8_gather!(scale, call);
9741 }
9742
9743 /// Scatter single-precision (32-bit) floating-point elements from a into memory using 64-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
9744 ///
9745 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_i64scatter_ps&expand=3129)
9746 #[inline]
9747 #[target_feature(enable = "avx512f")]
9748 #[cfg_attr(test, assert_instr(vscatterqps, scale = 1))]
9749 #[rustc_args_required_const(4)]
9750 pub unsafe fn _mm512_mask_i64scatter_ps(
9751 slice: *mut u8,
9752 mask: __mmask8,
9753 offsets: __m512i,
9754 src: __m256,
9755 scale: i32,
9756 ) {
9757 let src = src.as_f32x8();
9758 let slice = slice as *mut i8;
9759 let offsets = offsets.as_i64x8();
9760 macro_rules! call {
9761 ($imm8:expr) => {
9762 vscatterqps(slice, mask as i8, offsets, src, $imm8)
9763 };
9764 }
9765 constify_imm8_gather!(scale, call);
9766 }
9767
9768 /// Scatter 64-bit integers from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
9769 ///
9770 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_i32scatter_epi64&expand=3038)
9771 #[inline]
9772 #[target_feature(enable = "avx512f")]
9773 #[cfg_attr(test, assert_instr(vpscatterdq, scale = 1))]
9774 #[rustc_args_required_const(3)]
9775 pub unsafe fn _mm512_i32scatter_epi64(slice: *mut u8, offsets: __m256i, src: __m512i, scale: i32) {
9776 let src = src.as_i64x8();
9777 let neg_one = -1;
9778 let slice = slice as *mut i8;
9779 let offsets = offsets.as_i32x8();
9780 macro_rules! call {
9781 ($imm8:expr) => {
9782 vpscatterdq(slice, neg_one, offsets, src, $imm8)
9783 };
9784 }
9785 constify_imm8_gather!(scale, call);
9786 }
9787
9788 /// Scatter 64-bit integers from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
9789 ///
9790 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_i32scatter_epi64&expand=3039)
9791 #[inline]
9792 #[target_feature(enable = "avx512f")]
9793 #[cfg_attr(test, assert_instr(vpscatterdq, scale = 1))]
9794 #[rustc_args_required_const(4)]
9795 pub unsafe fn _mm512_mask_i32scatter_epi64(
9796 slice: *mut u8,
9797 mask: __mmask8,
9798 offsets: __m256i,
9799 src: __m512i,
9800 scale: i32,
9801 ) {
9802 let src = src.as_i64x8();
9803 let mask = mask as i8;
9804 let slice = slice as *mut i8;
9805 let offsets = offsets.as_i32x8();
9806 macro_rules! call {
9807 ($imm8:expr) => {
9808 vpscatterdq(slice, mask, offsets, src, $imm8)
9809 };
9810 }
9811 constify_imm8_gather!(scale, call);
9812 }
9813
9814 /// Scatter 64-bit integers from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
9815 ///
9816 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_i64scatter_epi64&expand=3116)
9817 #[inline]
9818 #[target_feature(enable = "avx512f")]
9819 #[cfg_attr(test, assert_instr(vpscatterqq, scale = 1))]
9820 #[rustc_args_required_const(3)]
9821 pub unsafe fn _mm512_i64scatter_epi64(slice: *mut u8, offsets: __m512i, src: __m512i, scale: i32) {
9822 let src = src.as_i64x8();
9823 let neg_one = -1;
9824 let slice = slice as *mut i8;
9825 let offsets = offsets.as_i64x8();
9826 macro_rules! call {
9827 ($imm8:expr) => {
9828 vpscatterqq(slice, neg_one, offsets, src, $imm8)
9829 };
9830 }
9831 constify_imm8_gather!(scale, call);
9832 }
9833
9834 /// Scatter 64-bit integers from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
9835 ///
9836 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_i64scatter_epi64&expand=3117)
9837 #[inline]
9838 #[target_feature(enable = "avx512f")]
9839 #[cfg_attr(test, assert_instr(vpscatterqq, scale = 1))]
9840 #[rustc_args_required_const(4)]
9841 pub unsafe fn _mm512_mask_i64scatter_epi64(
9842 slice: *mut u8,
9843 mask: __mmask8,
9844 offsets: __m512i,
9845 src: __m512i,
9846 scale: i32,
9847 ) {
9848 let src = src.as_i64x8();
9849 let mask = mask as i8;
9850 let slice = slice as *mut i8;
9851 let offsets = offsets.as_i64x8();
9852 macro_rules! call {
9853 ($imm8:expr) => {
9854 vpscatterqq(slice, mask, offsets, src, $imm8)
9855 };
9856 }
9857 constify_imm8_gather!(scale, call);
9858 }
9859
9860 /// Scatter 32-bit integers from a into memory using 32-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
9861 ///
9862 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_i32scatter_epi32&expand=3032)
9863 #[inline]
9864 #[target_feature(enable = "avx512f")]
9865 #[cfg_attr(test, assert_instr(vpscatterdd, scale = 1))]
9866 #[rustc_args_required_const(3)]
9867 pub unsafe fn _mm512_i32scatter_epi32(slice: *mut u8, offsets: __m512i, src: __m512i, scale: i32) {
9868 let src = src.as_i32x16();
9869 let neg_one = -1;
9870 let slice = slice as *mut i8;
9871 let offsets = offsets.as_i32x16();
9872 macro_rules! call {
9873 ($imm8:expr) => {
9874 vpscatterdd(slice, neg_one, offsets, src, $imm8)
9875 };
9876 }
9877 constify_imm8_gather!(scale, call);
9878 }
9879
9880 /// Scatter 32-bit integers from a into memory using 32-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
9881 ///
9882 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_i32scatter_epi32&expand=3033)
9883 #[inline]
9884 #[target_feature(enable = "avx512f")]
9885 #[cfg_attr(test, assert_instr(vpscatterdd, scale = 1))]
9886 #[rustc_args_required_const(4)]
9887 pub unsafe fn _mm512_mask_i32scatter_epi32(
9888 slice: *mut u8,
9889 mask: __mmask16,
9890 offsets: __m512i,
9891 src: __m512i,
9892 scale: i32,
9893 ) {
9894 let src = src.as_i32x16();
9895 let mask = mask as i16;
9896 let slice = slice as *mut i8;
9897 let offsets = offsets.as_i32x16();
9898 macro_rules! call {
9899 ($imm8:expr) => {
9900 vpscatterdd(slice, mask, offsets, src, $imm8)
9901 };
9902 }
9903 constify_imm8_gather!(scale, call);
9904 }
9905
9906 /// Scatter 32-bit integers from a into memory using 64-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
9907 ///
9908 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_i64scatter_epi32&expand=3108)
9909 #[inline]
9910 #[target_feature(enable = "avx512f")]
9911 #[cfg_attr(test, assert_instr(vpscatterqd, scale = 1))]
9912 #[rustc_args_required_const(3)]
9913 pub unsafe fn _mm512_i64scatter_epi32(slice: *mut u8, offsets: __m512i, src: __m256i, scale: i32) {
9914 let src = src.as_i32x8();
9915 let neg_one = -1;
9916 let slice = slice as *mut i8;
9917 let offsets = offsets.as_i64x8();
9918 macro_rules! call {
9919 ($imm8:expr) => {
9920 vpscatterqd(slice, neg_one, offsets, src, $imm8)
9921 };
9922 }
9923 constify_imm8_gather!(scale, call);
9924 }
9925
9926 /// Scatter 32-bit integers from a into memory using 64-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
9927 ///
9928 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_i64scatter_epi32&expand=3109)
9929 #[inline]
9930 #[target_feature(enable = "avx512f")]
9931 #[cfg_attr(test, assert_instr(vpscatterqd, scale = 1))]
9932 #[rustc_args_required_const(4)]
9933 pub unsafe fn _mm512_mask_i64scatter_epi32(
9934 slice: *mut u8,
9935 mask: __mmask8,
9936 offsets: __m512i,
9937 src: __m256i,
9938 scale: i32,
9939 ) {
9940 let src = src.as_i32x8();
9941 let mask = mask as i8;
9942 let slice = slice as *mut i8;
9943 let offsets = offsets.as_i64x8();
9944 macro_rules! call {
9945 ($imm8:expr) => {
9946 vpscatterqd(slice, mask, offsets, src, $imm8)
9947 };
9948 }
9949 constify_imm8_gather!(scale, call);
9950 }
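
// Illustrative sketch (not part of the upstream source): 64-bit indices select
// where each of the eight 32-bit lanes of `src` is stored, and clear mask bits
// skip the store entirely. The buffer shape and helper name are hypothetical.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_mask_i64scatter_epi32(out: &mut [i32; 8], src: __m256i) {
    let idx = _mm512_setr_epi64(7, 6, 5, 4, 3, 2, 1, 0);
    // Stores the low four lanes of `src` to out[7], out[6], out[5], out[4]; the upper lanes are masked off.
    _mm512_mask_i64scatter_epi32(out.as_mut_ptr() as *mut u8, 0b0000_1111, idx, src, 4);
}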
9951
9952 /// Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
9953 ///
9954 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_compress_epi32&expand=1198)
9955 #[inline]
9956 #[target_feature(enable = "avx512f")]
9957 #[cfg_attr(test, assert_instr(vpcompressd))]
9958 pub unsafe fn _mm512_mask_compress_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i {
9959 transmute(vpcompressd(a.as_i32x16(), src.as_i32x16(), k))
9960 }
9961
9962 /// Contiguously store the active 32-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
9963 ///
9964 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_compress_epi32&expand=1199)
9965 #[inline]
9966 #[target_feature(enable = "avx512f")]
9967 #[cfg_attr(test, assert_instr(vpcompressd))]
9968 pub unsafe fn _mm512_maskz_compress_epi32(k: __mmask16, a: __m512i) -> __m512i {
9969 transmute(vpcompressd(
9970 a.as_i32x16(),
9971 _mm512_setzero_si512().as_i32x16(),
9972 k,
9973 ))
9974 }
9975
9976 /// Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
9977 ///
9978 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_compress_epi64&expand=1204)
9979 #[inline]
9980 #[target_feature(enable = "avx512f")]
9981 #[cfg_attr(test, assert_instr(vpcompressq))]
9982 pub unsafe fn _mm512_mask_compress_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i {
9983 transmute(vpcompressq(a.as_i64x8(), src.as_i64x8(), k))
9984 }
9985
9986 /// Contiguously store the active 64-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
9987 ///
9988 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_compress_epi64&expand=1205)
9989 #[inline]
9990 #[target_feature(enable = "avx512f")]
9991 #[cfg_attr(test, assert_instr(vpcompressq))]
9992 pub unsafe fn _mm512_maskz_compress_epi64(k: __mmask8, a: __m512i) -> __m512i {
9993 transmute(vpcompressq(
9994 a.as_i64x8(),
9995 _mm512_setzero_si512().as_i64x8(),
9996 k,
9997 ))
9998 }
9999
10000 /// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
10001 ///
10002 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_compress_ps&expand=1222)
10003 #[inline]
10004 #[target_feature(enable = "avx512f")]
10005 #[cfg_attr(test, assert_instr(vcompressps))]
10006 pub unsafe fn _mm512_mask_compress_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 {
10007 transmute(vcompressps(a.as_f32x16(), src.as_f32x16(), k))
10008 }
10009
10010 /// Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
10011 ///
10012 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_compress_ps&expand=1223)
10013 #[inline]
10014 #[target_feature(enable = "avx512f")]
10015 #[cfg_attr(test, assert_instr(vcompressps))]
10016 pub unsafe fn _mm512_maskz_compress_ps(k: __mmask16, a: __m512) -> __m512 {
10017 transmute(vcompressps(
10018 a.as_f32x16(),
10019 _mm512_setzero_ps().as_f32x16(),
10020 k,
10021 ))
10022 }
10023
10024 /// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
10025 ///
10026 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_compress_pd&expand=1216)
10027 #[inline]
10028 #[target_feature(enable = "avx512f")]
10029 #[cfg_attr(test, assert_instr(vcompresspd))]
10030 pub unsafe fn _mm512_mask_compress_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d {
10031 transmute(vcompresspd(a.as_f64x8(), src.as_f64x8(), k))
10032 }
10033
10034 /// Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
10035 ///
10036 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_compress_pd&expand=1217)
10037 #[inline]
10038 #[target_feature(enable = "avx512f")]
10039 #[cfg_attr(test, assert_instr(vcompresspd))]
10040 pub unsafe fn _mm512_maskz_compress_pd(k: __mmask8, a: __m512d) -> __m512d {
10041 transmute(vcompresspd(a.as_f64x8(), _mm512_setzero_pd().as_f64x8(), k))
10042 }
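
// Illustrative sketch (not part of the upstream source): compress packs the
// lanes selected by the mask into the low end of the result; with the zeroing
// form the remaining lanes become zero. The helper name is hypothetical.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_maskz_compress_epi32(a: __m512i) -> __m512i {
    // Keeps the even-indexed lanes of `a`, packed into lanes 0..8; lanes 8..16 are zero.
    _mm512_maskz_compress_epi32(0b0101_0101_0101_0101, a)
}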
10043
10044 /// Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10045 ///
10046 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_expand_epi32&expand=2316)
10047 #[inline]
10048 #[target_feature(enable = "avx512f")]
10049 #[cfg_attr(test, assert_instr(vpexpandd))]
10050 pub unsafe fn _mm512_mask_expand_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i {
10051 transmute(vpexpandd(a.as_i32x16(), src.as_i32x16(), k))
10052 }
10053
10054 /// Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10055 ///
10056 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_expand_epi32&expand=2317)
10057 #[inline]
10058 #[target_feature(enable = "avx512f")]
10059 #[cfg_attr(test, assert_instr(vpexpandd))]
10060 pub unsafe fn _mm512_maskz_expand_epi32(k: __mmask16, a: __m512i) -> __m512i {
10061 transmute(vpexpandd(
10062 a.as_i32x16(),
10063 _mm512_setzero_si512().as_i32x16(),
10064 k,
10065 ))
10066 }
10067
10068 /// Load contiguous active 64-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10069 ///
10070 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_expand_epi64&expand=2322)
10071 #[inline]
10072 #[target_feature(enable = "avx512f")]
10073 #[cfg_attr(test, assert_instr(vpexpandq))]
10074 pub unsafe fn _mm512_mask_expand_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i {
10075 transmute(vpexpandq(a.as_i64x8(), src.as_i64x8(), k))
10076 }
10077
10078 /// Load contiguous active 64-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10079 ///
10080 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_expand_epi64&expand=2323)
10081 #[inline]
10082 #[target_feature(enable = "avx512f")]
10083 #[cfg_attr(test, assert_instr(vpexpandq))]
10084 pub unsafe fn _mm512_maskz_expand_epi64(k: __mmask8, a: __m512i) -> __m512i {
10085 transmute(vpexpandq(
10086 a.as_i64x8(),
10087 _mm512_setzero_si512().as_i64x8(),
10088 k,
10089 ))
10090 }
10091
10092 /// Load contiguous active single-precision (32-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10093 ///
10094 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_expand_ps&expand=2340)
10095 #[inline]
10096 #[target_feature(enable = "avx512f")]
10097 #[cfg_attr(test, assert_instr(vexpandps))]
10098 pub unsafe fn _mm512_mask_expand_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 {
10099 transmute(vexpandps(a.as_f32x16(), src.as_f32x16(), k))
10100 }
10101
10102 /// Load contiguous active single-precision (32-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10103 ///
10104 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_expand_ps&expand=2341)
10105 #[inline]
10106 #[target_feature(enable = "avx512f")]
10107 #[cfg_attr(test, assert_instr(vexpandps))]
10108 pub unsafe fn _mm512_maskz_expand_ps(k: __mmask16, a: __m512) -> __m512 {
10109 transmute(vexpandps(a.as_f32x16(), _mm512_setzero_ps().as_f32x16(), k))
10110 }
10111
10112 /// Load contiguous active double-precision (64-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10113 ///
10114 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_expand_pd&expand=2334)
10115 #[inline]
10116 #[target_feature(enable = "avx512f")]
10117 #[cfg_attr(test, assert_instr(vexpandpd))]
10118 pub unsafe fn _mm512_mask_expand_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d {
10119 transmute(vexpandpd(a.as_f64x8(), src.as_f64x8(), k))
10120 }
10121
10122 /// Load contiguous active double-precision (64-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10123 ///
10124 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_expand_pd&expand=2335)
10125 #[inline]
10126 #[target_feature(enable = "avx512f")]
10127 #[cfg_attr(test, assert_instr(vexpandpd))]
10128 pub unsafe fn _mm512_maskz_expand_pd(k: __mmask8, a: __m512d) -> __m512d {
10129 transmute(vexpandpd(a.as_f64x8(), _mm512_setzero_pd().as_f64x8(), k))
10130 }
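
// Illustrative sketch (not part of the upstream source): expand is the inverse
// of compress; consecutive low lanes of the source are spread out to the
// positions whose mask bit is set, and the zeroing form clears everything else.
// The helper name is hypothetical.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_maskz_expand_epi32(packed: __m512i) -> __m512i {
    // Lanes 0..8 of `packed` are routed to the even result lanes; the odd lanes are zeroed.
    _mm512_maskz_expand_epi32(0b0101_0101_0101_0101, packed)
}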
10131
10132 /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst.
10133 ///
10134 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_rol_epi32&expand=4685)
10135 #[inline]
10136 #[target_feature(enable = "avx512f")]
10137 #[cfg_attr(test, assert_instr(vprold, imm8 = 1))]
10138 #[rustc_args_required_const(1)]
10139 pub unsafe fn _mm512_rol_epi32(a: __m512i, imm8: i32) -> __m512i {
10140 let a = a.as_i32x16();
10141 macro_rules! call {
10142 ($imm8:expr) => {
10143 vprold(a, $imm8)
10144 };
10145 }
10146 let r = constify_imm8_sae!(imm8, call);
10147 transmute(r)
10148 }
10149
10150 /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10151 ///
10152 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_rol_epi32&expand=4683)
10153 #[inline]
10154 #[target_feature(enable = "avx512f")]
10155 #[cfg_attr(test, assert_instr(vprold, imm8 = 1))]
10156 #[rustc_args_required_const(3)]
10157 pub unsafe fn _mm512_mask_rol_epi32(src: __m512i, k: __mmask16, a: __m512i, imm8: i32) -> __m512i {
10158 let a = a.as_i32x16();
10159 macro_rules! call {
10160 ($imm8:expr) => {
10161 vprold(a, $imm8)
10162 };
10163 }
10164 let rol = constify_imm8_sae!(imm8, call);
10165 transmute(simd_select_bitmask(k, rol, src.as_i32x16()))
10166 }
10167
10168 /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10169 ///
10170 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_rol_epi32&expand=4684)
10171 #[inline]
10172 #[target_feature(enable = "avx512f")]
10173 #[cfg_attr(test, assert_instr(vprold, imm8 = 1))]
10174 #[rustc_args_required_const(2)]
10175 pub unsafe fn _mm512_maskz_rol_epi32(k: __mmask16, a: __m512i, imm8: i32) -> __m512i {
10176 let a = a.as_i32x16();
10177 macro_rules! call {
10178 ($imm8:expr) => {
10179 vprold(a, $imm8)
10180 };
10181 }
10182 let rol = constify_imm8_sae!(imm8, call);
10183 let zero = _mm512_setzero_si512().as_i32x16();
10184 transmute(simd_select_bitmask(k, rol, zero))
10185 }
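// Usage sketch (illustrative, not part of the upstream source): `imm8` is the
// rotate count and must be a compile-time constant; bits rotated out on the
// left re-enter on the right. Assuming a CPU with AVX-512F:
//
//     let a = _mm512_set1_epi32(1 << 31);
//     let rotated = unsafe { _mm512_rol_epi32(a, 1) };               // every lane is 1
//     let merged = unsafe { _mm512_mask_rol_epi32(a, 0x00ff, a, 1) };
//     // low 8 lanes are rotated to 1, high 8 lanes are copied from `src`.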
10186
10187 /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.
10188 ///
10189 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_ror_epi32&expand=4721)
10190 #[inline]
10191 #[target_feature(enable = "avx512f")]
10192 #[cfg_attr(test, assert_instr(vprold, imm8 = 1))]
10193 #[rustc_args_required_const(1)]
10194 pub unsafe fn _mm512_ror_epi32(a: __m512i, imm8: i32) -> __m512i {
10195 let a = a.as_i32x16();
10196 macro_rules! call {
10197 ($imm8:expr) => {
10198 vprord(a, $imm8)
10199 };
10200 }
10201 let r = constify_imm8_sae!(imm8, call);
10202 transmute(r)
10203 }
10204
10205 /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10206 ///
10207 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_ror_epi32&expand=4719)
10208 #[inline]
10209 #[target_feature(enable = "avx512f")]
10210 #[cfg_attr(test, assert_instr(vprold, imm8 = 123))]
10211 #[rustc_args_required_const(3)]
10212 pub unsafe fn _mm512_mask_ror_epi32(src: __m512i, k: __mmask16, a: __m512i, imm8: i32) -> __m512i {
10213 let a = a.as_i32x16();
10214 macro_rules! call {
10215 ($imm8:expr) => {
10216 vprord(a, $imm8)
10217 };
10218 }
10219 let ror = constify_imm8_sae!(imm8, call);
10220 transmute(simd_select_bitmask(k, ror, src.as_i32x16()))
10221 }
10222
10223 /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10224 ///
10225 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_ror_epi32&expand=4720)
10226 #[inline]
10227 #[target_feature(enable = "avx512f")]
10228 #[cfg_attr(test, assert_instr(vprold, imm8 = 123))]
10229 #[rustc_args_required_const(2)]
10230 pub unsafe fn _mm512_maskz_ror_epi32(k: __mmask16, a: __m512i, imm8: i32) -> __m512i {
10231 let a = a.as_i32x16();
10232 macro_rules! call {
10233 ($imm8:expr) => {
10234 vprord(a, $imm8)
10235 };
10236 }
10237 let ror = constify_imm8_sae!(imm8, call);
10238 let zero = _mm512_setzero_si512().as_i32x16();
10239 transmute(simd_select_bitmask(k, ror, zero))
10240 }
10241
10242 /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst.
10243 ///
10244 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_rol_epi64&expand=4694)
10245 #[inline]
10246 #[target_feature(enable = "avx512f")]
10247 #[cfg_attr(test, assert_instr(vprolq, imm8 = 1))]
10248 #[rustc_args_required_const(1)]
10249 pub unsafe fn _mm512_rol_epi64(a: __m512i, imm8: i32) -> __m512i {
10250 let a = a.as_i64x8();
10251 macro_rules! call {
10252 ($imm8:expr) => {
10253 vprolq(a, $imm8)
10254 };
10255 }
10256 let r = constify_imm8_sae!(imm8, call);
10257 transmute(r)
10258 }
10259
10260 /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10261 ///
10262 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_rol_epi64&expand=4692)
10263 #[inline]
10264 #[target_feature(enable = "avx512f")]
10265 #[cfg_attr(test, assert_instr(vprolq, imm8 = 1))]
10266 #[rustc_args_required_const(3)]
10267 pub unsafe fn _mm512_mask_rol_epi64(src: __m512i, k: __mmask8, a: __m512i, imm8: i32) -> __m512i {
10268 let a = a.as_i64x8();
10269 macro_rules! call {
10270 ($imm8:expr) => {
10271 vprolq(a, $imm8)
10272 };
10273 }
10274 let rol = constify_imm8_sae!(imm8, call);
10275 transmute(simd_select_bitmask(k, rol, src.as_i64x8()))
10276 }
10277
10278 /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10279 ///
10280 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_rol_epi64&expand=4693)
10281 #[inline]
10282 #[target_feature(enable = "avx512f")]
10283 #[cfg_attr(test, assert_instr(vprolq, imm8 = 1))]
10284 #[rustc_args_required_const(2)]
10285 pub unsafe fn _mm512_maskz_rol_epi64(k: __mmask8, a: __m512i, imm8: i32) -> __m512i {
10286 let a = a.as_i64x8();
10287 macro_rules! call {
10288 ($imm8:expr) => {
10289 vprolq(a, $imm8)
10290 };
10291 }
10292 let rol = constify_imm8_sae!(imm8, call);
10293 let zero = _mm512_setzero_si512().as_i64x8();
10294 transmute(simd_select_bitmask(k, rol, zero))
10295 }
10296
10297 /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.
10298 ///
10299 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_ror_epi64&expand=4730)
10300 #[inline]
10301 #[target_feature(enable = "avx512f")]
10302 #[cfg_attr(test, assert_instr(vprolq, imm8 = 15))]
10303 #[rustc_args_required_const(1)]
10304 pub unsafe fn _mm512_ror_epi64(a: __m512i, imm8: i32) -> __m512i {
10305 let a = a.as_i64x8();
10306 macro_rules! call {
10307 ($imm8:expr) => {
10308 vprorq(a, $imm8)
10309 };
10310 }
10311 let r = constify_imm8_sae!(imm8, call);
10312 transmute(r)
10313 }
10314
10315 /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10316 ///
10317 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_ror_epi64&expand=4728)
10318 #[inline]
10319 #[target_feature(enable = "avx512f")]
10320 #[cfg_attr(test, assert_instr(vprolq, imm8 = 15))]
10321 #[rustc_args_required_const(3)]
10322 pub unsafe fn _mm512_mask_ror_epi64(src: __m512i, k: __mmask8, a: __m512i, imm8: i32) -> __m512i {
10323 let a = a.as_i64x8();
10324 macro_rules! call {
10325 ($imm8:expr) => {
10326 vprorq(a, $imm8)
10327 };
10328 }
10329 let ror = constify_imm8_sae!(imm8, call);
10330 transmute(simd_select_bitmask(k, ror, src.as_i64x8()))
10331 }
10332
10333 /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10334 ///
10335 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_ror_epi64&expand=4729)
10336 #[inline]
10337 #[target_feature(enable = "avx512f")]
10338 #[cfg_attr(test, assert_instr(vprolq, imm8 = 15))]
10339 #[rustc_args_required_const(2)]
10340 pub unsafe fn _mm512_maskz_ror_epi64(k: __mmask8, a: __m512i, imm8: i32) -> __m512i {
10341 let a = a.as_i64x8();
10342 macro_rules! call {
10343 ($imm8:expr) => {
10344 vprorq(a, $imm8)
10345 };
10346 }
10347 let ror = constify_imm8_sae!(imm8, call);
10348 let zero = _mm512_setzero_si512().as_i64x8();
10349 transmute(simd_select_bitmask(k, ror, zero))
10350 }
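// Note (illustrative, not part of the upstream source): a right rotation by n
// bits equals a left rotation by `width - n` bits, which is presumably why the
// test assertions for the `ror` intrinsics above expect `vprold`/`vprolq`.
//
//     let a = _mm512_set1_epi64(1);
//     let r = unsafe { _mm512_ror_epi64(a, 1) };
//     // every lane is 1 rotated right once, i.e. only bit 63 is set.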
10351
10352 /// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst.
10353 ///
10354 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_slli_epi32&expand=5310)
10355 #[inline]
10356 #[target_feature(enable = "avx512f")]
10357 #[cfg_attr(test, assert_instr(vpslld, imm8 = 5))]
10358 #[rustc_args_required_const(1)]
10359 pub unsafe fn _mm512_slli_epi32(a: __m512i, imm8: u32) -> __m512i {
10360 let a = a.as_i32x16();
10361 macro_rules! call {
10362 ($imm8:expr) => {
10363 vpsllid(a, $imm8)
10364 };
10365 }
10366 let r = constify_imm8_sae!(imm8, call);
10367 transmute(r)
10368 }
10369
10370 /// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10371 ///
10372 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_slli_epi32&expand=5308)
10373 #[inline]
10374 #[target_feature(enable = "avx512f")]
10375 #[cfg_attr(test, assert_instr(vpslld, imm8 = 5))]
10376 #[rustc_args_required_const(3)]
10377 pub unsafe fn _mm512_mask_slli_epi32(src: __m512i, k: __mmask16, a: __m512i, imm8: u32) -> __m512i {
10378 let a = a.as_i32x16();
10379 macro_rules! call {
10380 ($imm8:expr) => {
10381 vpsllid(a, $imm8)
10382 };
10383 }
10384 let shf = constify_imm8_sae!(imm8, call);
10385 transmute(simd_select_bitmask(k, shf, src.as_i32x16()))
10386 }
10387
10388 /// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10389 ///
10390 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_slli_epi32&expand=5309)
10391 #[inline]
10392 #[target_feature(enable = "avx512f")]
10393 #[cfg_attr(test, assert_instr(vpslld, imm8 = 5))]
10394 #[rustc_args_required_const(2)]
10395 pub unsafe fn _mm512_maskz_slli_epi32(k: __mmask16, a: __m512i, imm8: u32) -> __m512i {
10396 let a = a.as_i32x16();
10397 macro_rules! call {
10398 ($imm8:expr) => {
10399 vpsllid(a, $imm8)
10400 };
10401 }
10402 let shf = constify_imm8_sae!(imm8, call);
10403 let zero = _mm512_setzero_si512().as_i32x16();
10404 transmute(simd_select_bitmask(k, shf, zero))
10405 }
10406
10407 /// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst.
10408 ///
10409 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_srli_epi32&expand=5522)
10410 #[inline]
10411 #[target_feature(enable = "avx512f")]
10412 #[cfg_attr(test, assert_instr(vpsrld, imm8 = 1))]
10413 #[rustc_args_required_const(1)]
10414 pub unsafe fn _mm512_srli_epi32(a: __m512i, imm8: u32) -> __m512i {
10415 let a = a.as_i32x16();
10416 macro_rules! call {
10417 ($imm8:expr) => {
10418 vpsrlid(a, $imm8)
10419 };
10420 }
10421 let r = constify_imm8_sae!(imm8, call);
10422 transmute(r)
10423 }
10424
10425 /// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10426 ///
10427 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_srli_epi32&expand=5520)
10428 #[inline]
10429 #[target_feature(enable = "avx512f")]
10430 #[cfg_attr(test, assert_instr(vpsrld, imm8 = 1))]
10431 #[rustc_args_required_const(3)]
10432 pub unsafe fn _mm512_mask_srli_epi32(src: __m512i, k: __mmask16, a: __m512i, imm8: u32) -> __m512i {
10433 let a = a.as_i32x16();
10434 macro_rules! call {
10435 ($imm8:expr) => {
10436 vpsrlid(a, $imm8)
10437 };
10438 }
10439 let shf = constify_imm8_sae!(imm8, call);
10440 transmute(simd_select_bitmask(k, shf, src.as_i32x16()))
10441 }
10442
10443 /// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10444 ///
10445 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_srli_epi32&expand=5521)
10446 #[inline]
10447 #[target_feature(enable = "avx512f")]
10448 #[cfg_attr(test, assert_instr(vpsrld, imm8 = 1))]
10449 #[rustc_args_required_const(2)]
10450 pub unsafe fn _mm512_maskz_srli_epi32(k: __mmask16, a: __m512i, imm8: u32) -> __m512i {
10451 let a = a.as_i32x16();
10452 macro_rules! call {
10453 ($imm8:expr) => {
10454 vpsrlid(a, $imm8)
10455 };
10456 }
10457 let shf = constify_imm8_sae!(imm8, call);
10458 let zero = _mm512_setzero_si512().as_i32x16();
10459 transmute(simd_select_bitmask(k, shf, zero))
10460 }
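// Usage sketch (illustrative, not part of the upstream source): `slli`/`srli`
// are logical shifts by an immediate; per Intel's description, a count of 32
// or more clears the lane. Assuming a CPU with AVX-512F:
//
//     let a = _mm512_set1_epi32(-1);
//     let l = unsafe { _mm512_slli_epi32(a, 4) };  // every lane: 0xffff_fff0
//     let r = unsafe { _mm512_srli_epi32(a, 4) };  // every lane: 0x0fff_ffff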
10461
10462 /// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst.
10463 ///
10464 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_slli_epi64&expand=5319)
10465 #[inline]
10466 #[target_feature(enable = "avx512f")]
10467 #[cfg_attr(test, assert_instr(vpsllq, imm8 = 5))]
10468 #[rustc_args_required_const(1)]
10469 pub unsafe fn _mm512_slli_epi64(a: __m512i, imm8: u32) -> __m512i {
10470 let a = a.as_i64x8();
10471 macro_rules! call {
10472 ($imm8:expr) => {
10473 vpslliq(a, $imm8)
10474 };
10475 }
10476 let r = constify_imm8_sae!(imm8, call);
10477 transmute(r)
10478 }
10479
10480 /// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10481 ///
10482 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_slli_epi64&expand=5317)
10483 #[inline]
10484 #[target_feature(enable = "avx512f")]
10485 #[cfg_attr(test, assert_instr(vpsllq, imm8 = 5))]
10486 #[rustc_args_required_const(3)]
10487 pub unsafe fn _mm512_mask_slli_epi64(src: __m512i, k: __mmask8, a: __m512i, imm8: u32) -> __m512i {
10488 let a = a.as_i64x8();
10489 macro_rules! call {
10490 ($imm8:expr) => {
10491 vpslliq(a, $imm8)
10492 };
10493 }
10494 let shf = constify_imm8_sae!(imm8, call);
10495 transmute(simd_select_bitmask(k, shf, src.as_i64x8()))
10496 }
10497
10498 /// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10499 ///
10500 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_slli_epi64&expand=5318)
10501 #[inline]
10502 #[target_feature(enable = "avx512f")]
10503 #[cfg_attr(test, assert_instr(vpsllq, imm8 = 5))]
10504 #[rustc_args_required_const(2)]
10505 pub unsafe fn _mm512_maskz_slli_epi64(k: __mmask8, a: __m512i, imm8: u32) -> __m512i {
10506 let a = a.as_i64x8();
10507 macro_rules! call {
10508 ($imm8:expr) => {
10509 vpslliq(a, $imm8)
10510 };
10511 }
10512 let shf = constify_imm8_sae!(imm8, call);
10513 let zero = _mm512_setzero_si512().as_i64x8();
10514 transmute(simd_select_bitmask(k, shf, zero))
10515 }
10516
10517 /// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst.
10518 ///
10519 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_srli_epi64&expand=5531)
10520 #[inline]
10521 #[target_feature(enable = "avx512f")]
10522 #[cfg_attr(test, assert_instr(vpsrlq, imm8 = 1))]
10523 #[rustc_args_required_const(1)]
10524 pub unsafe fn _mm512_srli_epi64(a: __m512i, imm8: u32) -> __m512i {
10525 let a = a.as_i64x8();
10526 macro_rules! call {
10527 ($imm8:expr) => {
10528 vpsrliq(a, $imm8)
10529 };
10530 }
10531 let r = constify_imm8_sae!(imm8, call);
10532 transmute(r)
10533 }
10534
10535 /// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10536 ///
10537 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_srli_epi64&expand=5529)
10538 #[inline]
10539 #[target_feature(enable = "avx512f")]
10540 #[cfg_attr(test, assert_instr(vpsrlq, imm8 = 1))]
10541 #[rustc_args_required_const(3)]
10542 pub unsafe fn _mm512_mask_srli_epi64(src: __m512i, k: __mmask8, a: __m512i, imm8: u32) -> __m512i {
10543 let a = a.as_i64x8();
10544 macro_rules! call {
10545 ($imm8:expr) => {
10546 vpsrliq(a, $imm8)
10547 };
10548 }
10549 let shf = constify_imm8_sae!(imm8, call);
10550 transmute(simd_select_bitmask(k, shf, src.as_i64x8()))
10551 }
10552
10553 /// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10554 ///
10555 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_srli_epi64&expand=5530)
10556 #[inline]
10557 #[target_feature(enable = "avx512f")]
10558 #[cfg_attr(test, assert_instr(vpsrlq, imm8 = 1))]
10559 #[rustc_args_required_const(2)]
10560 pub unsafe fn _mm512_maskz_srli_epi64(k: __mmask8, a: __m512i, imm8: u32) -> __m512i {
10561 let a = a.as_i64x8();
10562 macro_rules! call {
10563 ($imm8:expr) => {
10564 vpsrliq(a, $imm8)
10565 };
10566 }
10567 let shf = constify_imm8_sae!(imm8, call);
10568 let zero = _mm512_setzero_si512().as_i64x8();
10569 transmute(simd_select_bitmask(k, shf, zero))
10570 }
10571
10572 /// Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst.
10573 ///
10574 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_sll_epi32&expand=5280)
10575 #[inline]
10576 #[target_feature(enable = "avx512f")]
10577 #[cfg_attr(test, assert_instr(vpslld))]
10578 pub unsafe fn _mm512_sll_epi32(a: __m512i, count: __m128i) -> __m512i {
10579 transmute(vpslld(a.as_i32x16(), count.as_i32x4()))
10580 }
10581
10582 /// Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10583 ///
10584 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_sll_epi32&expand=5278)
10585 #[inline]
10586 #[target_feature(enable = "avx512f")]
10587 #[cfg_attr(test, assert_instr(vpslld))]
10588 pub unsafe fn _mm512_mask_sll_epi32(
10589 src: __m512i,
10590 k: __mmask16,
10591 a: __m512i,
10592 count: __m128i,
10593 ) -> __m512i {
10594 let shf = _mm512_sll_epi32(a, count).as_i32x16();
10595 transmute(simd_select_bitmask(k, shf, src.as_i32x16()))
10596 }
10597
10598 /// Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10599 ///
10600 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_sll_epi32&expand=5279)
10601 #[inline]
10602 #[target_feature(enable = "avx512f")]
10603 #[cfg_attr(test, assert_instr(vpslld))]
10604 pub unsafe fn _mm512_maskz_sll_epi32(k: __mmask16, a: __m512i, count: __m128i) -> __m512i {
10605 let shf = _mm512_sll_epi32(a, count).as_i32x16();
10606 let zero = _mm512_setzero_si512().as_i32x16();
10607 transmute(simd_select_bitmask(k, shf, zero))
10608 }
10609
10610 /// Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst.
10611 ///
10612 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_srl_epi32&expand=5492)
10613 #[inline]
10614 #[target_feature(enable = "avx512f")]
10615 #[cfg_attr(test, assert_instr(vpsrld))]
10616 pub unsafe fn _mm512_srl_epi32(a: __m512i, count: __m128i) -> __m512i {
10617 transmute(vpsrld(a.as_i32x16(), count.as_i32x4()))
10618 }
10619
10620 /// Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10621 ///
10622 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_srl_epi32&expand=5490)
10623 #[inline]
10624 #[target_feature(enable = "avx512f")]
10625 #[cfg_attr(test, assert_instr(vpsrld))]
10626 pub unsafe fn _mm512_mask_srl_epi32(
10627 src: __m512i,
10628 k: __mmask16,
10629 a: __m512i,
10630 count: __m128i,
10631 ) -> __m512i {
10632 let shf = _mm512_srl_epi32(a, count).as_i32x16();
10633 transmute(simd_select_bitmask(k, shf, src.as_i32x16()))
10634 }
10635
10636 /// Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10637 ///
10638 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_srl_epi32&expand=5491)
10639 #[inline]
10640 #[target_feature(enable = "avx512f")]
10641 #[cfg_attr(test, assert_instr(vpsrld))]
10642 pub unsafe fn _mm512_maskz_srl_epi32(k: __mmask16, a: __m512i, count: __m128i) -> __m512i {
10643 let shf = _mm512_srl_epi32(a, count).as_i32x16();
10644 let zero = _mm512_setzero_si512().as_i32x16();
10645 transmute(simd_select_bitmask(k, shf, zero))
10646 }
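// Usage sketch (illustrative, not part of the upstream source): the `sll`/`srl`
// forms take the shift count from the low 64 bits of a 128-bit vector, and the
// same count applies to every lane. Assuming a CPU with AVX-512F and using the
// crate's `_mm_set_epi64x` helper:
//
//     let count = _mm_set_epi64x(0, 4);                  // shift by 4
//     let a = _mm512_set1_epi32(1);
//     let l = unsafe { _mm512_sll_epi32(a, count) };     // every lane: 16
//     let r = unsafe { _mm512_srl_epi32(l, count) };     // back to 1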
10647
10648 /// Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst.
10649 ///
10650 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_sll_epi64&expand=5289)
10651 #[inline]
10652 #[target_feature(enable = "avx512f")]
10653 #[cfg_attr(test, assert_instr(vpsllq))]
10654 pub unsafe fn _mm512_sll_epi64(a: __m512i, count: __m128i) -> __m512i {
10655 transmute(vpsllq(a.as_i64x8(), count.as_i64x2()))
10656 }
10657
10658 /// Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10659 ///
10660 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_sll_epi64&expand=5287)
10661 #[inline]
10662 #[target_feature(enable = "avx512f")]
10663 #[cfg_attr(test, assert_instr(vpsllq))]
10664 pub unsafe fn _mm512_mask_sll_epi64(
10665 src: __m512i,
10666 k: __mmask8,
10667 a: __m512i,
10668 count: __m128i,
10669 ) -> __m512i {
10670 let shf = _mm512_sll_epi64(a, count).as_i64x8();
10671 transmute(simd_select_bitmask(k, shf, src.as_i64x8()))
10672 }
10673
10674 /// Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10675 ///
10676 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_sll_epi64&expand=5288)
10677 #[inline]
10678 #[target_feature(enable = "avx512f")]
10679 #[cfg_attr(test, assert_instr(vpsllq))]
10680 pub unsafe fn _mm512_maskz_sll_epi64(k: __mmask8, a: __m512i, count: __m128i) -> __m512i {
10681 let shf = _mm512_sll_epi64(a, count).as_i64x8();
10682 let zero = _mm512_setzero_si512().as_i64x8();
10683 transmute(simd_select_bitmask(k, shf, zero))
10684 }
10685
10686 /// Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst.
10687 ///
10688 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_srl_epi64&expand=5501)
10689 #[inline]
10690 #[target_feature(enable = "avx512f")]
10691 #[cfg_attr(test, assert_instr(vpsrlq))]
10692 pub unsafe fn _mm512_srl_epi64(a: __m512i, count: __m128i) -> __m512i {
10693 transmute(vpsrlq(a.as_i64x8(), count.as_i64x2()))
10694 }
10695
10696 /// Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10697 ///
10698 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_srl_epi64&expand=5499)
10699 #[inline]
10700 #[target_feature(enable = "avx512f")]
10701 #[cfg_attr(test, assert_instr(vpsrlq))]
10702 pub unsafe fn _mm512_mask_srl_epi64(
10703 src: __m512i,
10704 k: __mmask8,
10705 a: __m512i,
10706 count: __m128i,
10707 ) -> __m512i {
10708 let shf = _mm512_srl_epi64(a, count).as_i64x8();
10709 transmute(simd_select_bitmask(k, shf, src.as_i64x8()))
10710 }
10711
10712 /// Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10713 ///
10714 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_srl_epi64&expand=5500)
10715 #[inline]
10716 #[target_feature(enable = "avx512f")]
10717 #[cfg_attr(test, assert_instr(vpsrlq))]
10718 pub unsafe fn _mm512_maskz_srl_epi64(k: __mmask8, a: __m512i, count: __m128i) -> __m512i {
10719 let shf = _mm512_srl_epi64(a, count).as_i64x8();
10720 let zero = _mm512_setzero_si512().as_i64x8();
10721 transmute(simd_select_bitmask(k, shf, zero))
10722 }
10723
10724 /// Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst.
10725 ///
10726 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_sra_epi32&expand=5407)
10727 #[inline]
10728 #[target_feature(enable = "avx512f")]
10729 #[cfg_attr(test, assert_instr(vpsrad))]
10730 pub unsafe fn _mm512_sra_epi32(a: __m512i, count: __m128i) -> __m512i {
10731 transmute(vpsrad(a.as_i32x16(), count.as_i32x4()))
10732 }
10733
10734 /// Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10735 ///
10736 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_sra_epi32&expand=5405)
10737 #[inline]
10738 #[target_feature(enable = "avx512f")]
10739 #[cfg_attr(test, assert_instr(vpsrad))]
10740 pub unsafe fn _mm512_mask_sra_epi32(
10741 src: __m512i,
10742 k: __mmask16,
10743 a: __m512i,
10744 count: __m128i,
10745 ) -> __m512i {
10746 let shf = _mm512_sra_epi32(a, count).as_i32x16();
10747 transmute(simd_select_bitmask(k, shf, src.as_i32x16()))
10748 }
10749
10750 /// Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10751 ///
10752 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_sra_epi32&expand=5406)
10753 #[inline]
10754 #[target_feature(enable = "avx512f")]
10755 #[cfg_attr(test, assert_instr(vpsrad))]
10756 pub unsafe fn _mm512_maskz_sra_epi32(k: __mmask16, a: __m512i, count: __m128i) -> __m512i {
10757 let shf = _mm512_sra_epi32(a, count).as_i32x16();
10758 let zero = _mm512_setzero_si512().as_i32x16();
10759 transmute(simd_select_bitmask(k, shf, zero))
10760 }
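// Usage sketch (illustrative, not part of the upstream source): the `sra`
// forms shift right arithmetically, replicating the sign bit into the vacated
// positions. Assuming a CPU with AVX-512F:
//
//     let a = _mm512_set1_epi32(-16);
//     let count = _mm_set_epi64x(0, 2);
//     let r = unsafe { _mm512_sra_epi32(a, count) };     // every lane: -4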
10761
10762 /// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst.
10763 ///
10764 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_sra_epi64&expand=5416)
10765 #[inline]
10766 #[target_feature(enable = "avx512f")]
10767 #[cfg_attr(test, assert_instr(vpsraq))]
10768 pub unsafe fn _mm512_sra_epi64(a: __m512i, count: __m128i) -> __m512i {
10769 transmute(vpsraq(a.as_i64x8(), count.as_i64x2()))
10770 }
10771
10772 /// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10773 ///
10774 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_sra_epi64&expand=5414)
10775 #[inline]
10776 #[target_feature(enable = "avx512f")]
10777 #[cfg_attr(test, assert_instr(vpsraq))]
10778 pub unsafe fn _mm512_mask_sra_epi64(
10779 src: __m512i,
10780 k: __mmask8,
10781 a: __m512i,
10782 count: __m128i,
10783 ) -> __m512i {
10784 let shf = _mm512_sra_epi64(a, count).as_i64x8();
10785 transmute(simd_select_bitmask(k, shf, src.as_i64x8()))
10786 }
10787
10788 /// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10789 ///
10790 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_sra_epi64&expand=5415)
10791 #[inline]
10792 #[target_feature(enable = "avx512f")]
10793 #[cfg_attr(test, assert_instr(vpsraq))]
10794 pub unsafe fn _mm512_maskz_sra_epi64(k: __mmask8, a: __m512i, count: __m128i) -> __m512i {
10795 let shf = _mm512_sra_epi64(a, count).as_i64x8();
10796 let zero = _mm512_setzero_si512().as_i64x8();
10797 transmute(simd_select_bitmask(k, shf, zero))
10798 }
10799
10800 /// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst.
10801 ///
10802 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_srai_epi32&expand=5436)
10803 #[inline]
10804 #[target_feature(enable = "avx512f")]
10805 #[cfg_attr(test, assert_instr(vpsrad, imm8 = 1))]
10806 #[rustc_args_required_const(1)]
10807 pub unsafe fn _mm512_srai_epi32(a: __m512i, imm8: u32) -> __m512i {
10808 let a = a.as_i32x16();
10809 macro_rules! call {
10810 ($imm8:expr) => {
10811 vpsraid(a, $imm8)
10812 };
10813 }
10814 let r = constify_imm8_sae!(imm8, call);
10815 transmute(r)
10816 }
10817
10818 /// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10819 ///
10820 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_srai_epi32&expand=5434)
10821 #[inline]
10822 #[target_feature(enable = "avx512f")]
10823 #[cfg_attr(test, assert_instr(vpsrad, imm8 = 1))]
10824 #[rustc_args_required_const(3)]
10825 pub unsafe fn _mm512_mask_srai_epi32(src: __m512i, k: __mmask16, a: __m512i, imm8: u32) -> __m512i {
10826 let a = a.as_i32x16();
10827 macro_rules! call {
10828 ($imm8:expr) => {
10829 vpsraid(a, $imm8)
10830 };
10831 }
10832 let shf = constify_imm8_sae!(imm8, call);
10833 transmute(simd_select_bitmask(k, shf, src.as_i32x16()))
10834 }
10835
10836 /// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10837 ///
10838 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_srai_epi32&expand=5435)
10839 #[inline]
10840 #[target_feature(enable = "avx512f")]
10841 #[cfg_attr(test, assert_instr(vpsrad, imm8 = 1))]
10842 #[rustc_args_required_const(2)]
10843 pub unsafe fn _mm512_maskz_srai_epi32(k: __mmask16, a: __m512i, imm8: u32) -> __m512i {
10844 let a = a.as_i32x16();
10845 macro_rules! call {
10846 ($imm8:expr) => {
10847 vpsraid(a, $imm8)
10848 };
10849 }
10850 let shf = constify_imm8_sae!(imm8, call);
10851 let zero = _mm512_setzero_si512().as_i32x16();
10852 transmute(simd_select_bitmask(k, shf, zero))
10853 }
10854
10855 /// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst.
10856 ///
10857 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_srai_epi64&expand=5445)
10858 #[inline]
10859 #[target_feature(enable = "avx512f")]
10860 #[cfg_attr(test, assert_instr(vpsraq, imm8 = 1))]
10861 #[rustc_args_required_const(1)]
10862 pub unsafe fn _mm512_srai_epi64(a: __m512i, imm8: u32) -> __m512i {
10863 let a = a.as_i64x8();
10864 macro_rules! call {
10865 ($imm8:expr) => {
10866 vpsraiq(a, $imm8)
10867 };
10868 }
10869 let r = constify_imm8_sae!(imm8, call);
10870 transmute(r)
10871 }
10872
10873 /// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10874 ///
10875 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_srai_epi64&expand=5443)
10876 #[inline]
10877 #[target_feature(enable = "avx512f")]
10878 #[cfg_attr(test, assert_instr(vpsraq, imm8 = 1))]
10879 #[rustc_args_required_const(3)]
10880 pub unsafe fn _mm512_mask_srai_epi64(src: __m512i, k: __mmask8, a: __m512i, imm8: u32) -> __m512i {
10881 let a = a.as_i64x8();
10882 macro_rules! call {
10883 ($imm8:expr) => {
10884 vpsraiq(a, $imm8)
10885 };
10886 }
10887 let shf = constify_imm8_sae!(imm8, call);
10888 transmute(simd_select_bitmask(k, shf, src.as_i64x8()))
10889 }
10890
10891 /// Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10892 ///
10893 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_srai_epi64&expand=5444)
10894 #[inline]
10895 #[target_feature(enable = "avx512f")]
10896 #[cfg_attr(test, assert_instr(vpsraq, imm8 = 1))]
10897 #[rustc_args_required_const(2)]
10898 pub unsafe fn _mm512_maskz_srai_epi64(k: __mmask8, a: __m512i, imm8: u32) -> __m512i {
10899 let a = a.as_i64x8();
10900 macro_rules! call {
10901 ($imm8:expr) => {
10902 vpsraiq(a, $imm8)
10903 };
10904 }
10905 let shf = constify_imm8_sae!(imm8, call);
10906 let zero = _mm512_setzero_si512().as_i64x8();
10907 transmute(simd_select_bitmask(k, shf, zero))
10908 }
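// Note (illustrative, not part of the upstream source): a packed 64-bit
// arithmetic right shift (`vpsraq`) has no SSE/AVX2 counterpart; it is new
// with AVX-512F. Sign bits are replicated, so shifting a negative value all
// the way down yields -1:
//
//     let a = _mm512_set1_epi64(i64::MIN);
//     let r = unsafe { _mm512_srai_epi64(a, 63) };       // every lane: -1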
10909
10910 /// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst.
10911 ///
10912 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_srav_epi32&expand=5465)
10913 #[inline]
10914 #[target_feature(enable = "avx512f")]
10915 #[cfg_attr(test, assert_instr(vpsravd))]
10916 pub unsafe fn _mm512_srav_epi32(a: __m512i, count: __m512i) -> __m512i {
10917 transmute(vpsravd(a.as_i32x16(), count.as_i32x16()))
10918 }
10919
10920 /// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10921 ///
10922 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_srav_epi32&expand=5463)
10923 #[inline]
10924 #[target_feature(enable = "avx512f")]
10925 #[cfg_attr(test, assert_instr(vpsravd))]
10926 pub unsafe fn _mm512_mask_srav_epi32(
10927 src: __m512i,
10928 k: __mmask16,
10929 a: __m512i,
10930 count: __m512i,
10931 ) -> __m512i {
10932 let shf = _mm512_srav_epi32(a, count).as_i32x16();
10933 transmute(simd_select_bitmask(k, shf, src.as_i32x16()))
10934 }
10935
10936 /// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10937 ///
10938 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_srav_epi32&expand=5464)
10939 #[inline]
10940 #[target_feature(enable = "avx512f")]
10941 #[cfg_attr(test, assert_instr(vpsravd))]
10942 pub unsafe fn _mm512_maskz_srav_epi32(k: __mmask16, a: __m512i, count: __m512i) -> __m512i {
10943 let shf = _mm512_srav_epi32(a, count).as_i32x16();
10944 let zero = _mm512_setzero_si512().as_i32x16();
10945 transmute(simd_select_bitmask(k, shf, zero))
10946 }
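// Usage sketch (illustrative, not part of the upstream source): the `srav`
// forms take an independent shift count per lane, here built with the crate's
// `_mm512_set_epi32` helper. Assuming a CPU with AVX-512F:
//
//     let a = _mm512_set1_epi32(-64);
//     let counts = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
//     let r = unsafe { _mm512_srav_epi32(a, counts) };
//     // lane 0 keeps -64, lane 1 is -32, ... lanes 6..=15 settle at -1.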
10947
10948 /// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst.
10949 ///
10950 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_srav_epi64&expand=5474)
10951 #[inline]
10952 #[target_feature(enable = "avx512f")]
10953 #[cfg_attr(test, assert_instr(vpsravq))]
10954 pub unsafe fn _mm512_srav_epi64(a: __m512i, count: __m512i) -> __m512i {
10955 transmute(vpsravq(a.as_i64x8(), count.as_i64x8()))
10956 }
10957
10958 /// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10959 ///
10960 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_srav_epi64&expand=5472)
10961 #[inline]
10962 #[target_feature(enable = "avx512f")]
10963 #[cfg_attr(test, assert_instr(vpsravq))]
10964 pub unsafe fn _mm512_mask_srav_epi64(
10965 src: __m512i,
10966 k: __mmask8,
10967 a: __m512i,
10968 count: __m512i,
10969 ) -> __m512i {
10970 let shf = _mm512_srav_epi64(a, count).as_i64x8();
10971 transmute(simd_select_bitmask(k, shf, src.as_i64x8()))
10972 }
10973
10974 /// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10975 ///
10976 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_srav_epi64&expand=5473)
10977 #[inline]
10978 #[target_feature(enable = "avx512f")]
10979 #[cfg_attr(test, assert_instr(vpsravq))]
10980 pub unsafe fn _mm512_maskz_srav_epi64(k: __mmask8, a: __m512i, count: __m512i) -> __m512i {
10981 let shf = _mm512_srav_epi64(a, count).as_i64x8();
10982 let zero = _mm512_setzero_si512().as_i64x8();
10983 transmute(simd_select_bitmask(k, shf, zero))
10984 }
10985
10986 /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst.
10987 ///
10988 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_rolv_epi32&expand=4703)
10989 #[inline]
10990 #[target_feature(enable = "avx512f")]
10991 #[cfg_attr(test, assert_instr(vprolvd))]
10992 pub unsafe fn _mm512_rolv_epi32(a: __m512i, b: __m512i) -> __m512i {
10993 transmute(vprolvd(a.as_i32x16(), b.as_i32x16()))
10994 }
10995
10996 /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10997 ///
10998 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_rolv_epi32&expand=4701)
10999 #[inline]
11000 #[target_feature(enable = "avx512f")]
11001 #[cfg_attr(test, assert_instr(vprolvd))]
11002 pub unsafe fn _mm512_mask_rolv_epi32(
11003 src: __m512i,
11004 k: __mmask16,
11005 a: __m512i,
11006 b: __m512i,
11007 ) -> __m512i {
11008 let rol = _mm512_rolv_epi32(a, b).as_i32x16();
11009 transmute(simd_select_bitmask(k, rol, src.as_i32x16()))
11010 }
11011
11012 /// Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11013 ///
11014 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_rolv_epi32&expand=4702)
11015 #[inline]
11016 #[target_feature(enable = "avx512f")]
11017 #[cfg_attr(test, assert_instr(vprolvd))]
11018 pub unsafe fn _mm512_maskz_rolv_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
11019 let rol = _mm512_rolv_epi32(a, b).as_i32x16();
11020 let zero = _mm512_setzero_si512().as_i32x16();
11021 transmute(simd_select_bitmask(k, rol, zero))
11022 }
11023
11024 /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst.
11025 ///
11026 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_rorv_epi32&expand=4739)
11027 #[inline]
11028 #[target_feature(enable = "avx512f")]
11029 #[cfg_attr(test, assert_instr(vprorvd))]
11030 pub unsafe fn _mm512_rorv_epi32(a: __m512i, b: __m512i) -> __m512i {
11031 transmute(vprorvd(a.as_i32x16(), b.as_i32x16()))
11032 }
11033
11034 /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
11035 ///
11036 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_rorv_epi32&expand=4737)
11037 #[inline]
11038 #[target_feature(enable = "avx512f")]
11039 #[cfg_attr(test, assert_instr(vprorvd))]
11040 pub unsafe fn _mm512_mask_rorv_epi32(
11041 src: __m512i,
11042 k: __mmask16,
11043 a: __m512i,
11044 b: __m512i,
11045 ) -> __m512i {
11046 let ror = _mm512_rorv_epi32(a, b).as_i32x16();
11047 transmute(simd_select_bitmask(k, ror, src.as_i32x16()))
11048 }
11049
11050 /// Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11051 ///
11052 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_rorv_epi32&expand=4738)
11053 #[inline]
11054 #[target_feature(enable = "avx512f")]
11055 #[cfg_attr(test, assert_instr(vprorvd))]
11056 pub unsafe fn _mm512_maskz_rorv_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
11057 let ror = _mm512_rorv_epi32(a, b).as_i32x16();
11058 let zero = _mm512_setzero_si512().as_i32x16();
11059 transmute(simd_select_bitmask(k, ror, zero))
11060 }
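// Usage sketch (illustrative, not part of the upstream source): `rolv`/`rorv`
// rotate each lane by the count held in the corresponding lane of `b`.
// Assuming a CPU with AVX-512F:
//
//     let a = _mm512_set1_epi32(1);
//     let counts = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
//     let r = unsafe { _mm512_rolv_epi32(a, counts) };
//     // lane i holds 1 rotated left by i bits, i.e. 1 << i.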
11061
11062 /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst.
11063 ///
11064 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_rolv_epi64&expand=4712)
11065 #[inline]
11066 #[target_feature(enable = "avx512f")]
11067 #[cfg_attr(test, assert_instr(vprolvq))]
11068 pub unsafe fn _mm512_rolv_epi64(a: __m512i, b: __m512i) -> __m512i {
11069 transmute(vprolvq(a.as_i64x8(), b.as_i64x8()))
11070 }
11071
11072 /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
11073 ///
11074 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_rolv_epi64&expand=4710)
11075 #[inline]
11076 #[target_feature(enable = "avx512f")]
11077 #[cfg_attr(test, assert_instr(vprolvq))]
11078 pub unsafe fn _mm512_mask_rolv_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
11079 let rol = _mm512_rolv_epi64(a, b).as_i64x8();
11080 transmute(simd_select_bitmask(k, rol, src.as_i64x8()))
11081 }
11082
11083 /// Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11084 ///
11085 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_rolv_epi64&expand=4711)
11086 #[inline]
11087 #[target_feature(enable = "avx512f")]
11088 #[cfg_attr(test, assert_instr(vprolvq))]
11089 pub unsafe fn _mm512_maskz_rolv_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
11090 let rol = _mm512_rolv_epi64(a, b).as_i64x8();
11091 let zero = _mm512_setzero_si512().as_i64x8();
11092 transmute(simd_select_bitmask(k, rol, zero))
11093 }
11094
11095 /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst.
11096 ///
11097 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_rorv_epi64&expand=4748)
11098 #[inline]
11099 #[target_feature(enable = "avx512f")]
11100 #[cfg_attr(test, assert_instr(vprorvq))]
11101 pub unsafe fn _mm512_rorv_epi64(a: __m512i, b: __m512i) -> __m512i {
11102 transmute(vprorvq(a.as_i64x8(), b.as_i64x8()))
11103 }
11104
11105 /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
11106 ///
11107 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_rorv_epi64&expand=4746)
11108 #[inline]
11109 #[target_feature(enable = "avx512f")]
11110 #[cfg_attr(test, assert_instr(vprorvq))]
11111 pub unsafe fn _mm512_mask_rorv_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
11112 let ror = _mm512_rorv_epi64(a, b).as_i64x8();
11113 transmute(simd_select_bitmask(k, ror, src.as_i64x8()))
11114 }
11115
11116 /// Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11117 ///
11118 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_rorv_epi64&expand=4747)
11119 #[inline]
11120 #[target_feature(enable = "avx512f")]
11121 #[cfg_attr(test, assert_instr(vprorvq))]
11122 pub unsafe fn _mm512_maskz_rorv_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
11123 let ror = _mm512_rorv_epi64(a, b).as_i64x8();
11124 let zero = _mm512_setzero_si512().as_i64x8();
11125 transmute(simd_select_bitmask(k, ror, zero))
11126 }
11127
11128 /// Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.
11129 ///
11130 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_sllv_epi32&expand=5342)
11131 #[inline]
11132 #[target_feature(enable = "avx512f")]
11133 #[cfg_attr(test, assert_instr(vpsllvd))]
11134 pub unsafe fn _mm512_sllv_epi32(a: __m512i, count: __m512i) -> __m512i {
11135 transmute(vpsllvd(a.as_i32x16(), count.as_i32x16()))
11136 }
11137
11138 /// Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
11139 ///
11140 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_sllv_epi32&expand=5340)
11141 #[inline]
11142 #[target_feature(enable = "avx512f")]
11143 #[cfg_attr(test, assert_instr(vpsllvd))]
11144 pub unsafe fn _mm512_mask_sllv_epi32(
11145 src: __m512i,
11146 k: __mmask16,
11147 a: __m512i,
11148 count: __m512i,
11149 ) -> __m512i {
11150 let shf = _mm512_sllv_epi32(a, count).as_i32x16();
11151 transmute(simd_select_bitmask(k, shf, src.as_i32x16()))
11152 }
11153
11154 /// Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11155 ///
11156 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_sllv_epi32&expand=5341)
11157 #[inline]
11158 #[target_feature(enable = "avx512f")]
11159 #[cfg_attr(test, assert_instr(vpsllvd))]
11160 pub unsafe fn _mm512_maskz_sllv_epi32(k: __mmask16, a: __m512i, count: __m512i) -> __m512i {
11161 let shf = _mm512_sllv_epi32(a, count).as_i32x16();
11162 let zero = _mm512_setzero_si512().as_i32x16();
11163 transmute(simd_select_bitmask(k, shf, zero))
11164 }
11165
11166 /// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.
11167 ///
11168 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_srlv_epi32&expand=5554)
11169 #[inline]
11170 #[target_feature(enable = "avx512f")]
11171 #[cfg_attr(test, assert_instr(vpsrlvd))]
11172 pub unsafe fn _mm512_srlv_epi32(a: __m512i, count: __m512i) -> __m512i {
11173 transmute(vpsrlvd(a.as_i32x16(), count.as_i32x16()))
11174 }
11175
11176 /// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
11177 ///
11178 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_srlv_epi32&expand=5552)
11179 #[inline]
11180 #[target_feature(enable = "avx512f")]
11181 #[cfg_attr(test, assert_instr(vpsrlvd))]
11182 pub unsafe fn _mm512_mask_srlv_epi32(
11183 src: __m512i,
11184 k: __mmask16,
11185 a: __m512i,
11186 count: __m512i,
11187 ) -> __m512i {
11188 let shf = _mm512_srlv_epi32(a, count).as_i32x16();
11189 transmute(simd_select_bitmask(k, shf, src.as_i32x16()))
11190 }
11191
11192 /// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11193 ///
11194 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_srlv_epi32&expand=5553)
11195 #[inline]
11196 #[target_feature(enable = "avx512f")]
11197 #[cfg_attr(test, assert_instr(vpsrlvd))]
11198 pub unsafe fn _mm512_maskz_srlv_epi32(k: __mmask16, a: __m512i, count: __m512i) -> __m512i {
11199 let shf = _mm512_srlv_epi32(a, count).as_i32x16();
11200 let zero = _mm512_setzero_si512().as_i32x16();
11201 transmute(simd_select_bitmask(k, shf, zero))
11202 }
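// Usage sketch (illustrative, not part of the upstream source): `sllv`/`srlv`
// shift each lane by the count in the corresponding lane of `count`; a count
// of 32 or more clears the lane. Assuming a CPU with AVX-512F:
//
//     let a = _mm512_set1_epi32(1);
//     let counts = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 33, 2, 1);
//     let r = unsafe { _mm512_sllv_epi32(a, counts) };
//     // lane 0: 2, lane 1: 4, lane 2: 0 (count >= 32), remaining lanes: 1.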
11203
11204 /// Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.
11205 ///
11206 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_sllv_epi64&expand=5351)
11207 #[inline]
11208 #[target_feature(enable = "avx512f")]
11209 #[cfg_attr(test, assert_instr(vpsllvq))]
11210 pub unsafe fn _mm512_sllv_epi64(a: __m512i, count: __m512i) -> __m512i {
11211 transmute(vpsllvq(a.as_i64x8(), count.as_i64x8()))
11212 }
11213
11214 /// Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
11215 ///
11216 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_sllv_epi64&expand=5349)
11217 #[inline]
11218 #[target_feature(enable = "avx512f")]
11219 #[cfg_attr(test, assert_instr(vpsllvq))]
11220 pub unsafe fn _mm512_mask_sllv_epi64(
11221 src: __m512i,
11222 k: __mmask8,
11223 a: __m512i,
11224 count: __m512i,
11225 ) -> __m512i {
11226 let shf = _mm512_sllv_epi64(a, count).as_i64x8();
11227 transmute(simd_select_bitmask(k, shf, src.as_i64x8()))
11228 }
11229
11230 /// Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11231 ///
11232 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_sllv_epi64&expand=5350)
11233 #[inline]
11234 #[target_feature(enable = "avx512f")]
11235 #[cfg_attr(test, assert_instr(vpsllvq))]
11236 pub unsafe fn _mm512_maskz_sllv_epi64(k: __mmask8, a: __m512i, count: __m512i) -> __m512i {
11237 let shf = _mm512_sllv_epi64(a, count).as_i64x8();
11238 let zero = _mm512_setzero_si512().as_i64x8();
11239 transmute(simd_select_bitmask(k, shf, zero))
11240 }
11241
11242 /// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.
11243 ///
11244 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_srlv_epi64&expand=5563)
11245 #[inline]
11246 #[target_feature(enable = "avx512f")]
11247 #[cfg_attr(test, assert_instr(vpsrlvq))]
11248 pub unsafe fn _mm512_srlv_epi64(a: __m512i, count: __m512i) -> __m512i {
11249 transmute(vpsrlvq(a.as_i64x8(), count.as_i64x8()))
11250 }
11251
11252 /// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
11253 ///
11254 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_srlv_epi64&expand=5561)
11255 #[inline]
11256 #[target_feature(enable = "avx512f")]
11257 #[cfg_attr(test, assert_instr(vpsrlvq))]
11258 pub unsafe fn _mm512_mask_srlv_epi64(
11259 src: __m512i,
11260 k: __mmask8,
11261 a: __m512i,
11262 count: __m512i,
11263 ) -> __m512i {
11264 let shf = _mm512_srlv_epi64(a, count).as_i64x8();
11265 transmute(simd_select_bitmask(k, shf, src.as_i64x8()))
11266 }
11267
11268 /// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11269 ///
11270 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_srlv_epi64&expand=5562)
11271 #[inline]
11272 #[target_feature(enable = "avx512f")]
11273 #[cfg_attr(test, assert_instr(vpsrlvq))]
11274 pub unsafe fn _mm512_maskz_srlv_epi64(k: __mmask8, a: __m512i, count: __m512i) -> __m512i {
11275 let shf = _mm512_srlv_epi64(a, count).as_i64x8();
11276 let zero = _mm512_setzero_si512().as_i64x8();
11277 transmute(simd_select_bitmask(k, shf, zero))
11278 }
11279
11280 /// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst.
11281 ///
11282 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_permute_ps&expand=4170)
11283 #[inline]
11284 #[target_feature(enable = "avx512f")]
11285 #[cfg_attr(test, assert_instr(vpermilps, imm8 = 1))]
11286 #[rustc_args_required_const(1)]
11287 pub unsafe fn _mm512_permute_ps(a: __m512, imm8: i32) -> __m512 {
11288 let a = a.as_f32x16();
11289 macro_rules! call {
11290 ($imm8:expr) => {
11291 vpermilps(a, _mm512_set1_epi32($imm8).as_i32x16())
11292 };
11293 }
11294 let r = constify_imm8_sae!(imm8, call);
11295 transmute(r)
11296 }
11297
11298 /// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
11299 ///
11300 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_permute_ps&expand=4168)
11301 #[inline]
11302 #[target_feature(enable = "avx512f")]
11303 #[cfg_attr(test, assert_instr(vpermilps, imm8 = 1))]
11304 #[rustc_args_required_const(3)]
11305 pub unsafe fn _mm512_mask_permute_ps(src: __m512, k: __mmask16, a: __m512, imm8: i32) -> __m512 {
11306 let a = a.as_f32x16();
11307 macro_rules! call {
11308 ($imm8:expr) => {
11309 vpermilps(a, _mm512_set1_epi32($imm8).as_i32x16())
11310 };
11311 }
11312 let permute = constify_imm8_sae!(imm8, call);
11313 transmute(simd_select_bitmask(k, permute, src.as_f32x16()))
11314 }
11315
11316 /// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11317 ///
11318 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_permute_ps&expand=4169)
11319 #[inline]
11320 #[target_feature(enable = "avx512f")]
11321 #[cfg_attr(test, assert_instr(vpermilps, imm8 = 1))]
11322 #[rustc_args_required_const(2)]
11323 pub unsafe fn _mm512_maskz_permute_ps(k: __mmask16, a: __m512, imm8: i32) -> __m512 {
11324 let a = a.as_f32x16();
11325 macro_rules! call {
11326 ($imm8:expr) => {
11327 vpermilps(a, _mm512_set1_epi32($imm8).as_i32x16())
11328 };
11329 }
11330 let permute = constify_imm8_sae!(imm8, call);
11331 let zero = _mm512_setzero_ps().as_f32x16();
11332 transmute(simd_select_bitmask(k, permute, zero))
11333 }
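
// Illustrative sketch (test-only helper, not part of the public API): imm8 applies the
// same four 2-bit selectors to every 128-bit lane. The pattern 0b10_10_10_10 makes every
// selector equal to 2, i.e. it broadcasts element 2 of each lane; the input values are
// arbitrary example values chosen for this sketch.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_permute_ps() {
    let a: __m512 = transmute([
        0.0f32, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0,
    ]);
    let _r: [f32; 16] = transmute(_mm512_permute_ps(a, 0b10_10_10_10));
    // _r == [2.0, 2.0, 2.0, 2.0, 6.0, 6.0, 6.0, 6.0,
    //        10.0, 10.0, 10.0, 10.0, 14.0, 14.0, 14.0, 14.0]
}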
11334
11335 /// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst.
11336 ///
11337 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_permute_pd&expand=4161)
11338 #[inline]
11339 #[target_feature(enable = "avx512f")]
11340 #[cfg_attr(test, assert_instr(vpermilpd, imm8 = 2))]
11341 #[rustc_args_required_const(1)]
11342 pub unsafe fn _mm512_permute_pd(a: __m512d, imm8: i32) -> __m512d {
11343 let a = a.as_f64x8();
11344 macro_rules! call {
11345 ($imm8:expr) => {
11346 vpermilpd(a, _mm512_set1_epi64($imm8).as_i64x8())
11347 };
11348 }
11349 let r = constify_imm8_sae!(imm8, call);
11350 transmute(r)
11351 }
11352
11353 /// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
11354 ///
11355 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_permute_pd&expand=4159)
11356 #[inline]
11357 #[target_feature(enable = "avx512f")]
11358 #[cfg_attr(test, assert_instr(vpermilpd, imm8 = 2))]
11359 #[rustc_args_required_const(3)]
11360 pub unsafe fn _mm512_mask_permute_pd(src: __m512d, k: __mmask8, a: __m512d, imm8: i32) -> __m512d {
11361 let a = a.as_f64x8();
11362 macro_rules! call {
11363 ($imm8:expr) => {
11364 vpermilpd(a, _mm512_set1_epi64($imm8).as_i64x8())
11365 };
11366 }
11367 let permute = constify_imm8_sae!(imm8, call);
11368 transmute(simd_select_bitmask(k, permute, src.as_f64x8()))
11369 }
11370
11371 /// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11372 ///
11373 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_permute_pd&expand=4160)
11374 #[inline]
11375 #[target_feature(enable = "avx512f")]
11376 #[cfg_attr(test, assert_instr(vpermilpd, imm8 = 2))]
11377 #[rustc_args_required_const(2)]
11378 pub unsafe fn _mm512_maskz_permute_pd(k: __mmask8, a: __m512d, imm8: i32) -> __m512d {
11379 let a = a.as_f64x8();
11380 macro_rules! call {
11381 ($imm8:expr) => {
11382 vpermilpd(a, _mm512_set1_epi64($imm8).as_i64x8())
11383 };
11384 }
11385 let permute = constify_imm8_sae!(imm8, call);
11386 let zero = _mm512_setzero_pd().as_f64x8();
11387 transmute(simd_select_bitmask(k, permute, zero))
11388 }
11389
11390 /// Shuffle 64-bit integers in a within 256-bit lanes using the control in imm8, and store the results in dst.
11391 ///
11392 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_permutex_epi64&expand=4208)
11393 #[inline]
11394 #[target_feature(enable = "avx512f")]
11395 #[cfg_attr(test, assert_instr(vbroadcast, imm8 = 0b11111111))]
11396 //should be vpermq, but generates vpermpd (it generates vpermq with a mask); changed to vbroadcast because of CI on Windows
11397 #[rustc_args_required_const(1)]
11398 pub unsafe fn _mm512_permutex_epi64(a: __m512i, imm8: i32) -> __m512i {
11399 let a = a.as_i64x8();
11400 macro_rules! call {
11401 ($imm8:expr) => {
11402 vpermq(a, _mm512_set1_epi64($imm8).as_i64x8())
11403 };
11404 }
11405 let r = constify_imm8_sae!(imm8, call);
11406 transmute(r)
11407 }
11408
11409 /// Shuffle 64-bit integers in a within 256-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
11410 ///
11411 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_permutex_epi64&expand=4206)
11412 #[inline]
11413 #[target_feature(enable = "avx512f")]
11414 #[cfg_attr(test, assert_instr(vpbroadcast, imm8 = 0b11111111))] //should be vpermq; changed to vpbroadcast because of CI on Windows
11415 #[rustc_args_required_const(3)]
11416 pub unsafe fn _mm512_mask_permutex_epi64(
11417 src: __m512i,
11418 k: __mmask8,
11419 a: __m512i,
11420 imm8: i32,
11421 ) -> __m512i {
11422 let a = a.as_i64x8();
11423 macro_rules! call {
11424 ($imm8:expr) => {
11425 vpermq(a, _mm512_set1_epi64($imm8).as_i64x8())
11426 };
11427 }
11428 let permute = constify_imm8_sae!(imm8, call);
11429 transmute(simd_select_bitmask(k, permute, src.as_i64x8()))
11430 }
11431
11432 /// Shuffle 64-bit integers in a within 256-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11433 ///
11434 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_permutex_epi64&expand=4207)
11435 #[inline]
11436 #[target_feature(enable = "avx512f")]
11437 #[cfg_attr(test, assert_instr(vpbroadcast, imm8 = 0b11111111))] //should be vpermq; changed to vpbroadcast because of CI on Windows
11438 #[rustc_args_required_const(2)]
11439 pub unsafe fn _mm512_maskz_permutex_epi64(k: __mmask8, a: __m512i, imm8: i32) -> __m512i {
11440 let a = a.as_i64x8();
11441 macro_rules! call {
11442 ($imm8:expr) => {
11443 vpermq(a, _mm512_set1_epi64($imm8).as_i64x8())
11444 };
11445 }
11446 let permute = constify_imm8_sae!(imm8, call);
11447 let zero = _mm512_setzero_si512().as_i64x8();
11448 transmute(simd_select_bitmask(k, permute, zero))
11449 }
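
// Illustrative sketch (test-only helper, not part of the public API): each 2-bit field of
// imm8 selects a 64-bit element within a 256-bit half. The input repeats the same values
// in both halves purely to keep the expected output easy to read; all values are arbitrary
// example values chosen for this sketch.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_permutex_epi64() {
    let a: __m512i = transmute([10i64, 11, 12, 13, 10, 11, 12, 13]);
    // Every 2-bit field is 0b10, so element 2 of each half is broadcast.
    let _r: [i64; 8] = transmute(_mm512_permutex_epi64(a, 0b10_10_10_10));
    // _r == [12, 12, 12, 12, 12, 12, 12, 12]
}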
11450
11451 /// Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst.
11452 ///
11453 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_permutex_pd&expand=4214)
11454 #[inline]
11455 #[target_feature(enable = "avx512f")]
11456 #[cfg_attr(test, assert_instr(vbroadcast, imm8 = 0b11111111))] //should be vpermpd; changed to vbroadcast because of CI on Windows
11457 #[rustc_args_required_const(1)]
11458 pub unsafe fn _mm512_permutex_pd(a: __m512d, imm8: i32) -> __m512d {
11459 let a = a.as_f64x8();
11460 macro_rules! call {
11461 ($imm8:expr) => {
11462 vpermpd(a, _mm512_set1_epi64($imm8).as_i64x8())
11463 };
11464 }
11465 let r = constify_imm8_sae!(imm8, call);
11466 transmute(r)
11467 }
11468
11469 /// Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
11470 ///
11471 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_permutex_pd&expand=4212)
11472 #[inline]
11473 #[target_feature(enable = "avx512f")]
11474 #[cfg_attr(test, assert_instr(vbroadcast, imm8 = 0b11111111))] //should be vpermpd; changed to vbroadcast because of CI on Windows
11475 #[rustc_args_required_const(3)]
11476 pub unsafe fn _mm512_mask_permutex_pd(src: __m512d, k: __mmask8, a: __m512d, imm8: i32) -> __m512d {
11477 let a = a.as_f64x8();
11478 macro_rules! call {
11479 ($imm8:expr) => {
11480 vpermpd(a, _mm512_set1_epi64($imm8).as_i64x8())
11481 };
11482 }
11483 let permute = constify_imm8_sae!(imm8, call);
11484 transmute(simd_select_bitmask(k, permute, src.as_f64x8()))
11485 }
11486
11487 /// Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11488 ///
11489 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_permutex_pd&expand=4213)
11490 #[inline]
11491 #[target_feature(enable = "avx512f")]
11492 #[cfg_attr(test, assert_instr(vbroadcast, imm8 = 0b11111111))] //should be vpermpd; changed to vbroadcast because of CI on Windows
11493 #[rustc_args_required_const(2)]
11494 pub unsafe fn _mm512_maskz_permutex_pd(k: __mmask8, a: __m512d, imm8: i32) -> __m512d {
11495 let a = a.as_f64x8();
11496 macro_rules! call {
11497 ($imm8:expr) => {
11498 vpermpd(a, _mm512_set1_epi64($imm8).as_i64x8())
11499 };
11500 }
11501 let permute = constify_imm8_sae!(imm8, call);
11502 let zero = _mm512_setzero_pd().as_f64x8();
11503 transmute(simd_select_bitmask(k, permute, zero))
11504 }
11505
11506 /// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. Note that this intrinsic shuffles across 128-bit lanes, unlike past intrinsics that use the permutevar name. This intrinsic is identical to _mm512_permutexvar_epi32, and it is recommended that you use that intrinsic name.
11507 ///
11508 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_permutevar_epi32&expand=4182)
11509 #[inline]
11510 #[target_feature(enable = "avx512f")]
11511 #[cfg_attr(test, assert_instr(vperm))] //should be vpermd, but generates vpermps (it generates vpermd with a mask)
11512 pub unsafe fn _mm512_permutevar_epi32(idx: __m512i, a: __m512i) -> __m512i {
11513 transmute(vpermd(a.as_i32x16(), idx.as_i32x16()))
11514 }
11515
11516 /// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). Note that this intrinsic shuffles across 128-bit lanes, unlike past intrinsics that use the permutevar name. This intrinsic is identical to _mm512_mask_permutexvar_epi32, and it is recommended that you use that intrinsic name.
11517 ///
11518 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_permutevar_epi32&expand=4181)
11519 #[inline]
11520 #[target_feature(enable = "avx512f")]
11521 #[cfg_attr(test, assert_instr(vpermd))]
11522 pub unsafe fn _mm512_mask_permutevar_epi32(
11523 src: __m512i,
11524 k: __mmask16,
11525 idx: __m512i,
11526 a: __m512i,
11527 ) -> __m512i {
11528 let permute = _mm512_permutevar_epi32(idx, a).as_i32x16();
11529 transmute(simd_select_bitmask(k, permute, src.as_i32x16()))
11530 }
11531
11532 /// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst.
11533 ///
11534 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_permutevar_ps&expand=4200)
11535 #[inline]
11536 #[target_feature(enable = "avx512f")]
11537 #[cfg_attr(test, assert_instr(vpermilps))]
11538 pub unsafe fn _mm512_permutevar_ps(a: __m512, b: __m512i) -> __m512 {
11539 transmute(vpermilps(a.as_f32x16(), b.as_i32x16()))
11540 }
11541
11542 /// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
11543 ///
11544 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_permutevar_ps&expand=4198)
11545 #[inline]
11546 #[target_feature(enable = "avx512f")]
11547 #[cfg_attr(test, assert_instr(vpermilps))]
11548 pub unsafe fn _mm512_mask_permutevar_ps(
11549 src: __m512,
11550 k: __mmask16,
11551 a: __m512,
11552 b: __m512i,
11553 ) -> __m512 {
11554 let permute = _mm512_permutevar_ps(a, b).as_f32x16();
11555 transmute(simd_select_bitmask(k, permute, src.as_f32x16()))
11556 }
11557
11558 /// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11559 ///
11560 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_permutevar_ps&expand=4199)
11561 #[inline]
11562 #[target_feature(enable = "avx512f")]
11563 #[cfg_attr(test, assert_instr(vpermilps))]
11564 pub unsafe fn _mm512_maskz_permutevar_ps(k: __mmask16, a: __m512, b: __m512i) -> __m512 {
11565 let permute = _mm512_permutevar_ps(a, b).as_f32x16();
11566 let zero = _mm512_setzero_ps().as_f32x16();
11567 transmute(simd_select_bitmask(k, permute, zero))
11568 }
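
// Illustrative sketch (test-only helper, not part of the public API): unlike the imm8 forms
// above, `_mm512_permutevar_ps` takes a per-element control vector; only the low two bits of
// each 32-bit control element are used, and selection stays within each 128-bit lane. Inputs
// are arbitrary example values chosen for this sketch.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_permutevar_ps() {
    let a: __m512 = transmute([
        0.0f32, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0,
    ]);
    // The control 3, 2, 1, 0 in every lane reverses the four floats of that lane.
    let b: __m512i = transmute([3i32, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0]);
    let _r: [f32; 16] = transmute(_mm512_permutevar_ps(a, b));
    // _r == [3.0, 2.0, 1.0, 0.0, 7.0, 6.0, 5.0, 4.0,
    //        11.0, 10.0, 9.0, 8.0, 15.0, 14.0, 13.0, 12.0]
}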
11569
11570 /// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst.
11571 ///
11572 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_permutevar_pd&expand=4191)
11573 #[inline]
11574 #[target_feature(enable = "avx512f")]
11575 #[cfg_attr(test, assert_instr(vpermilpd))]
11576 pub unsafe fn _mm512_permutevar_pd(a: __m512d, b: __m512i) -> __m512d {
11577 transmute(vpermilpd(a.as_f64x8(), b.as_i64x8()))
11578 }
11579
11580 /// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
11581 ///
11582 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_permutevar_pd&expand=4189)
11583 #[inline]
11584 #[target_feature(enable = "avx512f")]
11585 #[cfg_attr(test, assert_instr(vpermilpd))]
11586 pub unsafe fn _mm512_mask_permutevar_pd(
11587 src: __m512d,
11588 k: __mmask8,
11589 a: __m512d,
11590 b: __m512i,
11591 ) -> __m512d {
11592 let permute = _mm512_permutevar_pd(a, b).as_f64x8();
11593 transmute(simd_select_bitmask(k, permute, src.as_f64x8()))
11594 }
11595
11596 /// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11597 ///
11598 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_permutevar_pd&expand=4190)
11599 #[inline]
11600 #[target_feature(enable = "avx512f")]
11601 #[cfg_attr(test, assert_instr(vpermilpd))]
11602 pub unsafe fn _mm512_maskz_permutevar_pd(k: __mmask8, a: __m512d, b: __m512i) -> __m512d {
11603 let permute = _mm512_permutevar_pd(a, b).as_f64x8();
11604 let zero = _mm512_setzero_pd().as_f64x8();
11605 transmute(simd_select_bitmask(k, permute, zero))
11606 }
11607
11608 /// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst.
11609 ///
11610 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_permutexvar_epi32&expand=4301)
11611 #[inline]
11612 #[target_feature(enable = "avx512f")]
11613 #[cfg_attr(test, assert_instr(vperm))] //should be vpermd, but generates vpermps (it generates vpermd with a mask)
11614 pub unsafe fn _mm512_permutexvar_epi32(idx: __m512i, a: __m512i) -> __m512i {
11615 transmute(vpermd(a.as_i32x16(), idx.as_i32x16()))
11616 }
11617
11618 /// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
11619 ///
11620 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_permutexvar_epi32&expand=4299)
11621 #[inline]
11622 #[target_feature(enable = "avx512f")]
11623 #[cfg_attr(test, assert_instr(vpermd))]
11624 pub unsafe fn _mm512_mask_permutexvar_epi32(
11625 src: __m512i,
11626 k: __mmask16,
11627 idx: __m512i,
11628 a: __m512i,
11629 ) -> __m512i {
11630 let permute = _mm512_permutexvar_epi32(idx, a).as_i32x16();
11631 transmute(simd_select_bitmask(k, permute, src.as_i32x16()))
11632 }
11633
11634 /// Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11635 ///
11636 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_permutexvar_epi32&expand=4300)
11637 #[inline]
11638 #[target_feature(enable = "avx512f")]
11639 #[cfg_attr(test, assert_instr(vpermd))]
11640 pub unsafe fn _mm512_maskz_permutexvar_epi32(k: __mmask16, idx: __m512i, a: __m512i) -> __m512i {
11641 let permute = _mm512_permutexvar_epi32(idx, a).as_i32x16();
11642 let zero = _mm512_setzero_si512().as_i32x16();
11643 transmute(simd_select_bitmask(k, permute, zero))
11644 }
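
// Illustrative sketch (test-only helper, not part of the public API): `permutexvar` selects
// across the whole 512-bit register, so an index vector of 15..0 reverses all sixteen
// elements. Inputs are arbitrary example values chosen for this sketch.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_permutexvar_epi32() {
    let a: __m512i = transmute([0i32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
    let idx: __m512i = transmute([15i32, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]);
    let _r: [i32; 16] = transmute(_mm512_permutexvar_epi32(idx, a));
    // _r == [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
}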
11645
11646 /// Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst.
11647 ///
11648 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_permutexvar_epi64&expand=4307)
11649 #[inline]
11650 #[target_feature(enable = "avx512f")]
11651 #[cfg_attr(test, assert_instr(vperm))] //should be vpermq, but generates vpermpd (it generates vpermq with a mask)
11652 pub unsafe fn _mm512_permutexvar_epi64(idx: __m512i, a: __m512i) -> __m512i {
11653 transmute(vpermq(a.as_i64x8(), idx.as_i64x8()))
11654 }
11655
11656 /// Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
11657 ///
11658 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_permutexvar_epi64&expand=4305)
11659 #[inline]
11660 #[target_feature(enable = "avx512f")]
11661 #[cfg_attr(test, assert_instr(vpermq))]
11662 pub unsafe fn _mm512_mask_permutexvar_epi64(
11663 src: __m512i,
11664 k: __mmask8,
11665 idx: __m512i,
11666 a: __m512i,
11667 ) -> __m512i {
11668 let permute = _mm512_permutexvar_epi64(idx, a).as_i64x8();
11669 transmute(simd_select_bitmask(k, permute, src.as_i64x8()))
11670 }
11671
11672 /// Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11673 ///
11674 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_permutexvar_epi64&expand=4306)
11675 #[inline]
11676 #[target_feature(enable = "avx512f")]
11677 #[cfg_attr(test, assert_instr(vpermq))]
11678 pub unsafe fn _mm512_maskz_permutexvar_epi64(k: __mmask8, idx: __m512i, a: __m512i) -> __m512i {
11679 let permute = _mm512_permutexvar_epi64(idx, a).as_i64x8();
11680 let zero = _mm512_setzero_si512().as_i64x8();
11681 transmute(simd_select_bitmask(k, permute, zero))
11682 }
11683
11684 /// Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst.
11685 ///
11686 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_permutevar_ps&expand=4200)
11687 #[inline]
11688 #[target_feature(enable = "avx512f")]
11689 #[cfg_attr(test, assert_instr(vpermps))]
11690 pub unsafe fn _mm512_permutexvar_ps(idx: __m512i, a: __m512) -> __m512 {
11691 transmute(vpermps(a.as_f32x16(), idx.as_i32x16()))
11692 }
11693
11694 /// Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
11695 ///
11696 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_permutexvar_ps&expand=4326)
11697 #[inline]
11698 #[target_feature(enable = "avx512f")]
11699 #[cfg_attr(test, assert_instr(vpermps))]
11700 pub unsafe fn _mm512_mask_permutexvar_ps(
11701 src: __m512,
11702 k: __mmask16,
11703 idx: __m512i,
11704 a: __m512,
11705 ) -> __m512 {
11706 let permute = _mm512_permutexvar_ps(idx, a).as_f32x16();
11707 transmute(simd_select_bitmask(k, permute, src.as_f32x16()))
11708 }
11709
11710 /// Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11711 ///
11712 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_permutexvar_ps&expand=4327)
11713 #[inline]
11714 #[target_feature(enable = "avx512f")]
11715 #[cfg_attr(test, assert_instr(vpermps))]
11716 pub unsafe fn _mm512_maskz_permutexvar_ps(k: __mmask16, idx: __m512i, a: __m512) -> __m512 {
11717 let permute = _mm512_permutexvar_ps(idx, a).as_f32x16();
11718 let zero = _mm512_setzero_ps().as_f32x16();
11719 transmute(simd_select_bitmask(k, permute, zero))
11720 }
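
// Illustrative sketch (test-only helper, not part of the public API): the floating-point
// form works the same way as the integer one above, here combined with a zeromask so that
// only the even result lanes are kept. Inputs are arbitrary example values chosen for this
// sketch.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_maskz_permutexvar_ps() {
    let a: __m512 = transmute([
        0.0f32, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0,
    ]);
    // Broadcast element 15 everywhere, then zero the odd lanes via the mask.
    let idx = _mm512_set1_epi32(15);
    let _r: [f32; 16] = transmute(_mm512_maskz_permutexvar_ps(0b0101_0101_0101_0101, idx, a));
    // _r == [15.0, 0.0, 15.0, 0.0, ...] repeated for all eight pairs
}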
11721
11722 /// Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst.
11723 ///
11724 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_permutexvar_pd&expand=4322)
11725 #[inline]
11726 #[target_feature(enable = "avx512f")]
11727 #[cfg_attr(test, assert_instr(vpermpd))]
11728 pub unsafe fn _mm512_permutexvar_pd(idx: __m512i, a: __m512d) -> __m512d {
11729 transmute(vpermpd(a.as_f64x8(), idx.as_i64x8()))
11730 }
11731
11732 /// Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
11733 ///
11734 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_permutexvar_pd&expand=4320)
11735 #[inline]
11736 #[target_feature(enable = "avx512f")]
11737 #[cfg_attr(test, assert_instr(vpermpd))]
11738 pub unsafe fn _mm512_mask_permutexvar_pd(
11739 src: __m512d,
11740 k: __mmask8,
11741 idx: __m512i,
11742 a: __m512d,
11743 ) -> __m512d {
11744 let permute = _mm512_permutexvar_pd(idx, a).as_f64x8();
11745 transmute(simd_select_bitmask(k, permute, src.as_f64x8()))
11746 }
11747
11748 /// Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11749 ///
11750 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_permutexvar_pd&expand=4321)
11751 #[inline]
11752 #[target_feature(enable = "avx512f")]
11753 #[cfg_attr(test, assert_instr(vpermpd))]
11754 pub unsafe fn _mm512_maskz_permutexvar_pd(k: __mmask8, idx: __m512i, a: __m512d) -> __m512d {
11755 let permute = _mm512_permutexvar_pd(idx, a).as_f64x8();
11756 let zero = _mm512_setzero_pd().as_f64x8();
11757 transmute(simd_select_bitmask(k, permute, zero))
11758 }
11759
11760 /// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.
11761 ///
11762 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_permutex2var_epi32&expand=4238)
11763 #[inline]
11764 #[target_feature(enable = "avx512f")]
11765 #[cfg_attr(test, assert_instr(vperm))] //vpermi2d or vpermt2d
11766 pub unsafe fn _mm512_permutex2var_epi32(a: __m512i, idx: __m512i, b: __m512i) -> __m512i {
11767 transmute(vpermi2d(a.as_i32x16(), idx.as_i32x16(), b.as_i32x16()))
11768 }
11769
11770 /// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
11771 ///
11772 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_permutex2var_epi32&expand=4235)
11773 #[inline]
11774 #[target_feature(enable = "avx512f")]
11775 #[cfg_attr(test, assert_instr(vpermt2d))]
11776 pub unsafe fn _mm512_mask_permutex2var_epi32(
11777 a: __m512i,
11778 k: __mmask16,
11779 idx: __m512i,
11780 b: __m512i,
11781 ) -> __m512i {
11782 let permute = _mm512_permutex2var_epi32(a, idx, b).as_i32x16();
11783 transmute(simd_select_bitmask(k, permute, a.as_i32x16()))
11784 }
11785
11786 /// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11787 ///
11788 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_permutex2var_epi32&expand=4237)
11789 #[inline]
11790 #[target_feature(enable = "avx512f")]
11791 #[cfg_attr(test, assert_instr(vperm))] //vpermi2d or vpermt2d
11792 pub unsafe fn _mm512_maskz_permutex2var_epi32(
11793 k: __mmask16,
11794 a: __m512i,
11795 idx: __m512i,
11796 b: __m512i,
11797 ) -> __m512i {
11798 let permute = _mm512_permutex2var_epi32(a, idx, b).as_i32x16();
11799 let zero = _mm512_setzero_si512().as_i32x16();
11800 transmute(simd_select_bitmask(k, permute, zero))
11801 }
11802
11803 /// Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).
11804 ///
11805 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask2_permutex2var_epi32&expand=4236)
11806 #[inline]
11807 #[target_feature(enable = "avx512f")]
11808 #[cfg_attr(test, assert_instr(vpermi2d))]
11809 pub unsafe fn _mm512_mask2_permutex2var_epi32(
11810 a: __m512i,
11811 idx: __m512i,
11812 k: __mmask16,
11813 b: __m512i,
11814 ) -> __m512i {
11815 let permute = _mm512_permutex2var_epi32(a, idx, b).as_i32x16();
11816 transmute(simd_select_bitmask(k, permute, idx.as_i32x16()))
11817 }
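
// Illustrative sketch (test-only helper, not part of the public API): in the two-source
// permutes the low four index bits pick an element and index bit 4 picks the source
// (0 = a, 1 = b), so indices 0..15 read from `a` and 16..31 read from `b`. Inputs are
// arbitrary example values chosen for this sketch.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_permutex2var_epi32() {
    let a: __m512i = transmute([0i32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
    let b: __m512i = transmute([
        100i32, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115,
    ]);
    // Interleave the low halves of a and b.
    let idx: __m512i = transmute([0i32, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23]);
    let _r: [i32; 16] = transmute(_mm512_permutex2var_epi32(a, idx, b));
    // _r == [0, 100, 1, 101, 2, 102, 3, 103, 4, 104, 5, 105, 6, 106, 7, 107]
}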
11818
11819 /// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.
11820 ///
11821 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_permutex2var_epi64&expand=4250)
11822 #[inline]
11823 #[target_feature(enable = "avx512f")]
11824 #[cfg_attr(test, assert_instr(vperm))] //vpermi2q or vpermt2q
11825 pub unsafe fn _mm512_permutex2var_epi64(a: __m512i, idx: __m512i, b: __m512i) -> __m512i {
11826 transmute(vpermi2q(a.as_i64x8(), idx.as_i64x8(), b.as_i64x8()))
11827 }
11828
11829 /// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
11830 ///
11831 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_permutex2var_epi64&expand=4247)
11832 #[inline]
11833 #[target_feature(enable = "avx512f")]
11834 #[cfg_attr(test, assert_instr(vpermt2q))]
11835 pub unsafe fn _mm512_mask_permutex2var_epi64(
11836 a: __m512i,
11837 k: __mmask8,
11838 idx: __m512i,
11839 b: __m512i,
11840 ) -> __m512i {
11841 let permute = _mm512_permutex2var_epi64(a, idx, b).as_i64x8();
11842 transmute(simd_select_bitmask(k, permute, a.as_i64x8()))
11843 }
11844
11845 /// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11846 ///
11847 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_permutex2var_epi64&expand=4249)
11848 #[inline]
11849 #[target_feature(enable = "avx512f")]
11850 #[cfg_attr(test, assert_instr(vperm))] //vpermi2q or vpermt2q
11851 pub unsafe fn _mm512_maskz_permutex2var_epi64(
11852 k: __mmask8,
11853 a: __m512i,
11854 idx: __m512i,
11855 b: __m512i,
11856 ) -> __m512i {
11857 let permute = _mm512_permutex2var_epi64(a, idx, b).as_i64x8();
11858 let zero = _mm512_setzero_si512().as_i64x8();
11859 transmute(simd_select_bitmask(k, permute, zero))
11860 }
11861
11862 /// Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).
11863 ///
11864 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask2_permutex2var_epi64&expand=4248)
11865 #[inline]
11866 #[target_feature(enable = "avx512f")]
11867 #[cfg_attr(test, assert_instr(vpermi2q))]
11868 pub unsafe fn _mm512_mask2_permutex2var_epi64(
11869 a: __m512i,
11870 idx: __m512i,
11871 k: __mmask8,
11872 b: __m512i,
11873 ) -> __m512i {
11874 let permute = _mm512_permutex2var_epi64(a, idx, b).as_i64x8();
11875 transmute(simd_select_bitmask(k, permute, idx.as_i64x8()))
11876 }
11877
11878 /// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.
11879 ///
11880 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_permutex2var_ps&expand=4286)
11881 #[inline]
11882 #[target_feature(enable = "avx512f")]
11883 #[cfg_attr(test, assert_instr(vperm))] //vpermi2ps or vpermt2ps
11884 pub unsafe fn _mm512_permutex2var_ps(a: __m512, idx: __m512i, b: __m512) -> __m512 {
11885 transmute(vpermi2ps(a.as_f32x16(), idx.as_i32x16(), b.as_f32x16()))
11886 }
11887
11888 /// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
11889 ///
11890 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_permutex2var_ps&expand=4283)
11891 #[inline]
11892 #[target_feature(enable = "avx512f")]
11893 #[cfg_attr(test, assert_instr(vpermt2ps))]
11894 pub unsafe fn _mm512_mask_permutex2var_ps(
11895 a: __m512,
11896 k: __mmask16,
11897 idx: __m512i,
11898 b: __m512,
11899 ) -> __m512 {
11900 let permute = _mm512_permutex2var_ps(a, idx, b).as_f32x16();
11901 transmute(simd_select_bitmask(k, permute, a.as_f32x16()))
11902 }
11903
11904 /// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11905 ///
11906 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_permutex2var_ps&expand=4285)
11907 #[inline]
11908 #[target_feature(enable = "avx512f")]
11909 #[cfg_attr(test, assert_instr(vperm))] //vpermi2ps or vpermt2ps
11910 pub unsafe fn _mm512_maskz_permutex2var_ps(
11911 k: __mmask16,
11912 a: __m512,
11913 idx: __m512i,
11914 b: __m512,
11915 ) -> __m512 {
11916 let permute = _mm512_permutex2var_ps(a, idx, b).as_f32x16();
11917 let zero = _mm512_setzero_ps().as_f32x16();
11918 transmute(simd_select_bitmask(k, permute, zero))
11919 }
11920
11921 /// Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).
11922 ///
11923 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask2_permutex2var_ps&expand=4284)
11924 #[inline]
11925 #[target_feature(enable = "avx512f")]
11926 #[cfg_attr(test, assert_instr(vperm))] //should be vpermi2ps, but it shows vpermt2ps
11927 pub unsafe fn _mm512_mask2_permutex2var_ps(
11928 a: __m512,
11929 idx: __m512i,
11930 k: __mmask16,
11931 b: __m512,
11932 ) -> __m512 {
11933 let permute = _mm512_permutex2var_ps(a, idx, b).as_f32x16();
11934 let idx = transmute::<_, f32x16>(idx); // per the documentation above, unselected elements are copied from `idx`, not zeroed
11935 transmute(simd_select_bitmask(k, permute, idx))
11936 }
11937
11938 /// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.
11939 ///
11940 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_permutex2var_pd&expand=4274)
11941 #[inline]
11942 #[target_feature(enable = "avx512f")]
11943 #[cfg_attr(test, assert_instr(vperm))] //vpermi2pd or vpermt2pd
11944 pub unsafe fn _mm512_permutex2var_pd(a: __m512d, idx: __m512i, b: __m512d) -> __m512d {
11945 transmute(vpermi2pd(a.as_f64x8(), idx.as_i64x8(), b.as_f64x8()))
11946 }
11947
11948 /// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
11949 ///
11950 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_permutex2var_pd&expand=4271)
11951 #[inline]
11952 #[target_feature(enable = "avx512f")]
11953 #[cfg_attr(test, assert_instr(vpermt2pd))]
11954 pub unsafe fn _mm512_mask_permutex2var_pd(
11955 a: __m512d,
11956 k: __mmask8,
11957 idx: __m512i,
11958 b: __m512d,
11959 ) -> __m512d {
11960 let permute = _mm512_permutex2var_pd(a, idx, b).as_f64x8();
11961 transmute(simd_select_bitmask(k, permute, a.as_f64x8()))
11962 }
11963
11964 /// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11965 ///
11966 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_permutex2var_pd&expand=4273)
11967 #[inline]
11968 #[target_feature(enable = "avx512f")]
11969 #[cfg_attr(test, assert_instr(vperm))] //vpermi2pd or vpermt2pd
11970 pub unsafe fn _mm512_maskz_permutex2var_pd(
11971 k: __mmask8,
11972 a: __m512d,
11973 idx: __m512i,
11974 b: __m512d,
11975 ) -> __m512d {
11976 let permute = _mm512_permutex2var_pd(a, idx, b).as_f64x8();
11977 let zero = _mm512_setzero_pd().as_f64x8();
11978 transmute(simd_select_bitmask(k, permute, zero))
11979 }
11980
11981 /// Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set)
11982 ///
11983 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask2_permutex2var_pd&expand=4272)
11984 #[inline]
11985 #[target_feature(enable = "avx512f")]
11986 #[cfg_attr(test, assert_instr(vperm))] //should be vpermi2pd, but it shows vpermt2pd
11987 pub unsafe fn _mm512_mask2_permutex2var_pd(
11988 a: __m512d,
11989 idx: __m512i,
11990 k: __mmask8,
11991 b: __m512d,
11992 ) -> __m512d {
11993 let permute = _mm512_permutex2var_pd(a, idx, b).as_f64x8();
11994 let idx = transmute::<_, f64x8>(idx); // per the documentation above, unselected elements are copied from `idx`, not zeroed
11995 transmute(simd_select_bitmask(k, permute, idx))
11996 }
11997
11998 /// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst.
11999 ///
12000 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_shuffle_epi32&expand=5150)
12001 #[inline]
12002 #[target_feature(enable = "avx512f")]
12003 #[cfg_attr(test, assert_instr(vpermilps, imm8 = 9))] //should be vpshufd, but generates vpermilps
12004 #[rustc_args_required_const(1)]
12005 pub unsafe fn _mm512_shuffle_epi32(a: __m512i, imm8: _MM_PERM_ENUM) -> __m512i {
12006 let imm8 = (imm8 & 0xFF) as u8;
12007
12008 let a = a.as_i32x16();
12009 macro_rules! shuffle4 {
12010 (
12011 $a:expr,
12012 $b:expr,
12013 $c:expr,
12014 $d:expr,
12015 $e:expr,
12016 $f:expr,
12017 $g:expr,
12018 $h:expr,
12019 $i:expr,
12020 $j:expr,
12021 $k:expr,
12022 $l:expr,
12023 $m:expr,
12024 $n:expr,
12025 $o:expr,
12026 $p:expr
12027 ) => {
12028 simd_shuffle16(
12029 a,
12030 a,
12031 [
12032 $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p,
12033 ],
12034 )
12035 };
12036 }
12037 macro_rules! shuffle3 {
12038 ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr, $i:expr, $j:expr, $k:expr, $m:expr, $n:expr, $o:expr) => {
12039 match (imm8 >> 6) & 0x3 {
12040 0 => shuffle4!($a, $b, $c, 16, $e, $f, $g, 20, $i, $j, $k, 24, $m, $n, $o, 28),
12041 1 => shuffle4!($a, $b, $c, 17, $e, $f, $g, 21, $i, $j, $k, 25, $m, $n, $o, 29),
12042 2 => shuffle4!($a, $b, $c, 18, $e, $f, $g, 22, $i, $j, $k, 26, $m, $n, $o, 30),
12043 _ => shuffle4!($a, $b, $c, 19, $e, $f, $g, 23, $i, $j, $k, 27, $m, $n, $o, 31),
12044 }
12045 };
12046 }
12047 macro_rules! shuffle2 {
12048 ($a:expr, $b:expr, $e:expr, $f:expr, $i:expr, $j:expr, $m:expr, $n:expr) => {
12049 match (imm8 >> 4) & 0x3 {
12050 0 => shuffle3!($a, $b, 16, $e, $f, 20, $i, $j, 24, $m, $n, 28),
12051 1 => shuffle3!($a, $b, 17, $e, $f, 21, $i, $j, 25, $m, $n, 29),
12052 2 => shuffle3!($a, $b, 18, $e, $f, 22, $i, $j, 26, $m, $n, 30),
12053 _ => shuffle3!($a, $b, 19, $e, $f, 23, $i, $j, 27, $m, $n, 31),
12054 }
12055 };
12056 }
12057 macro_rules! shuffle1 {
12058 ($a:expr, $e:expr, $i: expr, $m: expr) => {
12059 match (imm8 >> 2) & 0x3 {
12060 0 => shuffle2!($a, 0, $e, 4, $i, 8, $m, 12),
12061 1 => shuffle2!($a, 1, $e, 5, $i, 9, $m, 13),
12062 2 => shuffle2!($a, 2, $e, 6, $i, 10, $m, 14),
12063 _ => shuffle2!($a, 3, $e, 7, $i, 11, $m, 15),
12064 }
12065 };
12066 }
12067 let r: i32x16 = match imm8 & 0x3 {
12068 0 => shuffle1!(0, 4, 8, 12),
12069 1 => shuffle1!(1, 5, 9, 13),
12070 2 => shuffle1!(2, 6, 10, 14),
12071 _ => shuffle1!(3, 7, 11, 15),
12072 };
12073 transmute(r)
12074 }
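
// Illustrative sketch (test-only helper, not part of the public API): `_MM_PERM_ENUM` is
// treated here as a plain 8-bit control, like the other imm8 parameters in this module, so
// a binary literal is passed directly; that, and the input values, are assumptions for this
// sketch. Each 2-bit field selects one of the four 32-bit elements of every 128-bit lane,
// so 0b00_01_10_11 reverses each lane.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_shuffle_epi32() {
    let a: __m512i = transmute([0i32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
    let _r: [i32; 16] = transmute(_mm512_shuffle_epi32(a, 0b00_01_10_11));
    // _r == [3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12]
}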
12075
12076 /// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
12077 ///
12078 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_shuffle_epi32&expand=5148)
12079 #[inline]
12080 #[target_feature(enable = "avx512f")]
12081 #[cfg_attr(test, assert_instr(vpshufd, imm8 = 9))] //should be vpshufd; the unmasked form generates vpermilps
12082 #[rustc_args_required_const(3)]
12083 pub unsafe fn _mm512_mask_shuffle_epi32(
12084 src: __m512i,
12085 k: __mmask16,
12086 a: __m512i,
12087 imm8: _MM_PERM_ENUM,
12088 ) -> __m512i {
12089 let imm8 = (imm8 & 0xFF) as u8;
12090
12091 let a = a.as_i32x16();
12092 macro_rules! shuffle4 {
12093 (
12094 $a:expr,
12095 $b:expr,
12096 $c:expr,
12097 $d:expr,
12098 $e:expr,
12099 $f:expr,
12100 $g:expr,
12101 $h:expr,
12102 $i:expr,
12103 $j:expr,
12104 $k:expr,
12105 $l:expr,
12106 $m:expr,
12107 $n:expr,
12108 $o:expr,
12109 $p:expr
12110 ) => {
12111 simd_shuffle16(
12112 a,
12113 a,
12114 [
12115 $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p,
12116 ],
12117 )
12118 };
12119 }
12120 macro_rules! shuffle3 {
12121 ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr, $i:expr, $j:expr, $k:expr, $m:expr, $n:expr, $o:expr) => {
12122 match (imm8 >> 6) & 0x3 {
12123 0 => shuffle4!($a, $b, $c, 16, $e, $f, $g, 20, $i, $j, $k, 24, $m, $n, $o, 28),
12124 1 => shuffle4!($a, $b, $c, 17, $e, $f, $g, 21, $i, $j, $k, 25, $m, $n, $o, 29),
12125 2 => shuffle4!($a, $b, $c, 18, $e, $f, $g, 22, $i, $j, $k, 26, $m, $n, $o, 30),
12126 _ => shuffle4!($a, $b, $c, 19, $e, $f, $g, 23, $i, $j, $k, 27, $m, $n, $o, 31),
12127 }
12128 };
12129 }
12130 macro_rules! shuffle2 {
12131 ($a:expr, $b:expr, $e:expr, $f:expr, $i:expr, $j:expr, $m:expr, $n:expr) => {
12132 match (imm8 >> 4) & 0x3 {
12133 0 => shuffle3!($a, $b, 16, $e, $f, 20, $i, $j, 24, $m, $n, 28),
12134 1 => shuffle3!($a, $b, 17, $e, $f, 21, $i, $j, 25, $m, $n, 29),
12135 2 => shuffle3!($a, $b, 18, $e, $f, 22, $i, $j, 26, $m, $n, 30),
12136 _ => shuffle3!($a, $b, 19, $e, $f, 23, $i, $j, 27, $m, $n, 31),
12137 }
12138 };
12139 }
12140 macro_rules! shuffle1 {
12141 ($a:expr, $e:expr, $i: expr, $m: expr) => {
12142 match (imm8 >> 2) & 0x3 {
12143 0 => shuffle2!($a, 0, $e, 4, $i, 8, $m, 12),
12144 1 => shuffle2!($a, 1, $e, 5, $i, 9, $m, 13),
12145 2 => shuffle2!($a, 2, $e, 6, $i, 10, $m, 14),
12146 _ => shuffle2!($a, 3, $e, 7, $i, 11, $m, 15),
12147 }
12148 };
12149 }
12150 let shuffle: i32x16 = match imm8 & 0x3 {
12151 0 => shuffle1!(0, 4, 8, 12),
12152 1 => shuffle1!(1, 5, 9, 13),
12153 2 => shuffle1!(2, 6, 10, 14),
12154 _ => shuffle1!(3, 7, 11, 15),
12155 };
12156 transmute(simd_select_bitmask(k, shuffle, src.as_i32x16()))
12157 }
12158
12159 /// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12160 ///
12161 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_shuffle_epi32&expand=5149)
12162 #[inline]
12163 #[target_feature(enable = "avx512f")]
12164 #[cfg_attr(test, assert_instr(vpshufd, imm8 = 9))] //should be vpshufd; the unmasked form generates vpermilps
12165 #[rustc_args_required_const(2)]
12166 pub unsafe fn _mm512_maskz_shuffle_epi32(k: __mmask16, a: __m512i, imm8: _MM_PERM_ENUM) -> __m512i {
12167 let imm8 = (imm8 & 0xFF) as u8;
12168
12169 let a = a.as_i32x16();
12170 macro_rules! shuffle4 {
12171 (
12172 $a:expr,
12173 $b:expr,
12174 $c:expr,
12175 $d:expr,
12176 $e:expr,
12177 $f:expr,
12178 $g:expr,
12179 $h:expr,
12180 $i:expr,
12181 $j:expr,
12182 $k:expr,
12183 $l:expr,
12184 $m:expr,
12185 $n:expr,
12186 $o:expr,
12187 $p:expr
12188 ) => {
12189 simd_shuffle16(
12190 a,
12191 a,
12192 [
12193 $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p,
12194 ],
12195 )
12196 };
12197 }
12198 macro_rules! shuffle3 {
12199 ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr, $i:expr, $j:expr, $k:expr, $m:expr, $n:expr, $o:expr) => {
12200 match (imm8 >> 6) & 0x3 {
12201 0 => shuffle4!($a, $b, $c, 16, $e, $f, $g, 20, $i, $j, $k, 24, $m, $n, $o, 28),
12202 1 => shuffle4!($a, $b, $c, 17, $e, $f, $g, 21, $i, $j, $k, 25, $m, $n, $o, 29),
12203 2 => shuffle4!($a, $b, $c, 18, $e, $f, $g, 22, $i, $j, $k, 26, $m, $n, $o, 30),
12204 _ => shuffle4!($a, $b, $c, 19, $e, $f, $g, 23, $i, $j, $k, 27, $m, $n, $o, 31),
12205 }
12206 };
12207 }
12208 macro_rules! shuffle2 {
12209 ($a:expr, $b:expr, $e:expr, $f:expr, $i:expr, $j:expr, $m:expr, $n:expr) => {
12210 match (imm8 >> 4) & 0x3 {
12211 0 => shuffle3!($a, $b, 16, $e, $f, 20, $i, $j, 24, $m, $n, 28),
12212 1 => shuffle3!($a, $b, 17, $e, $f, 21, $i, $j, 25, $m, $n, 29),
12213 2 => shuffle3!($a, $b, 18, $e, $f, 22, $i, $j, 26, $m, $n, 30),
12214 _ => shuffle3!($a, $b, 19, $e, $f, 23, $i, $j, 27, $m, $n, 31),
12215 }
12216 };
12217 }
12218 macro_rules! shuffle1 {
12219 ($a:expr, $e:expr, $i: expr, $m: expr) => {
12220 match (imm8 >> 2) & 0x3 {
12221 0 => shuffle2!($a, 0, $e, 4, $i, 8, $m, 12),
12222 1 => shuffle2!($a, 1, $e, 5, $i, 9, $m, 13),
12223 2 => shuffle2!($a, 2, $e, 6, $i, 10, $m, 14),
12224 _ => shuffle2!($a, 3, $e, 7, $i, 11, $m, 15),
12225 }
12226 };
12227 }
12228 let shuffle: i32x16 = match imm8 & 0x3 {
12229 0 => shuffle1!(0, 4, 8, 12),
12230 1 => shuffle1!(1, 5, 9, 13),
12231 2 => shuffle1!(2, 6, 10, 14),
12232 _ => shuffle1!(3, 7, 11, 15),
12233 };
12234 let zero = _mm512_setzero_si512().as_i32x16();
12235 transmute(simd_select_bitmask(k, shuffle, zero))
12236 }
12237
12238 /// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst.
12239 ///
12240 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_shuffle_ps&expand=5203)
12241 #[inline]
12242 #[target_feature(enable = "avx512f")]
12243 #[cfg_attr(test, assert_instr(vshufps, imm8 = 0))]
12244 #[rustc_args_required_const(2)]
12245 pub unsafe fn _mm512_shuffle_ps(a: __m512, b: __m512, imm8: i32) -> __m512 {
12246 assert!(imm8 >= 0 && imm8 <= 255);
12247 let imm8 = (imm8 & 0xFF) as u8;
12248 macro_rules! shuffle4 {
12249 (
12250 $a:expr,
12251 $b:expr,
12252 $c:expr,
12253 $d:expr,
12254 $e:expr,
12255 $f:expr,
12256 $g:expr,
12257 $h:expr,
12258 $i:expr,
12259 $j:expr,
12260 $k:expr,
12261 $l:expr,
12262 $m:expr,
12263 $n:expr,
12264 $o:expr,
12265 $p:expr
12266 ) => {
12267 simd_shuffle16(
12268 a,
12269 b,
12270 [
12271 $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p,
12272 ],
12273 )
12274 };
12275 }
12276 macro_rules! shuffle3 {
12277 ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr, $i:expr, $j:expr, $k:expr, $m:expr, $n:expr, $o:expr) => {
12278 match (imm8 >> 6) & 0x3 {
12279 0 => shuffle4!($a, $b, $c, 16, $e, $f, $g, 20, $i, $j, $k, 24, $m, $n, $o, 28),
12280 1 => shuffle4!($a, $b, $c, 17, $e, $f, $g, 21, $i, $j, $k, 25, $m, $n, $o, 29),
12281 2 => shuffle4!($a, $b, $c, 18, $e, $f, $g, 22, $i, $j, $k, 26, $m, $n, $o, 30),
12282 _ => shuffle4!($a, $b, $c, 19, $e, $f, $g, 23, $i, $j, $k, 27, $m, $n, $o, 31),
12283 }
12284 };
12285 }
12286 macro_rules! shuffle2 {
12287 ($a:expr, $b:expr, $e:expr, $f:expr, $i:expr, $j:expr, $m:expr, $n:expr) => {
12288 match (imm8 >> 4) & 0x3 {
12289 0 => shuffle3!($a, $b, 16, $e, $f, 20, $i, $j, 24, $m, $n, 28),
12290 1 => shuffle3!($a, $b, 17, $e, $f, 21, $i, $j, 25, $m, $n, 29),
12291 2 => shuffle3!($a, $b, 18, $e, $f, 22, $i, $j, 26, $m, $n, 30),
12292 _ => shuffle3!($a, $b, 19, $e, $f, 23, $i, $j, 27, $m, $n, 31),
12293 }
12294 };
12295 }
12296 macro_rules! shuffle1 {
12297 ($a:expr, $e:expr, $i: expr, $m: expr) => {
12298 match (imm8 >> 2) & 0x3 {
12299 0 => shuffle2!($a, 0, $e, 4, $i, 8, $m, 12),
12300 1 => shuffle2!($a, 1, $e, 5, $i, 9, $m, 13),
12301 2 => shuffle2!($a, 2, $e, 6, $i, 10, $m, 14),
12302 _ => shuffle2!($a, 3, $e, 7, $i, 11, $m, 15),
12303 }
12304 };
12305 }
12306 match imm8 & 0x3 {
12307 0 => shuffle1!(0, 4, 8, 12),
12308 1 => shuffle1!(1, 5, 9, 13),
12309 2 => shuffle1!(2, 6, 10, 14),
12310 _ => shuffle1!(3, 7, 11, 15),
12311 }
12312 }
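
// Illustrative sketch (test-only helper, not part of the public API): per 128-bit lane, the
// two low 2-bit fields of imm8 pick the lane's first two outputs from `a` and the two high
// fields pick the last two outputs from `b`. With 0b11_10_01_00 each lane becomes
// [a0, a1, b2, b3]. Inputs are arbitrary example values chosen for this sketch.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_shuffle_ps() {
    let a: __m512 = transmute([
        0.0f32, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0,
    ]);
    let b: __m512 = transmute([
        100.0f32, 101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0, 109.0, 110.0, 111.0,
        112.0, 113.0, 114.0, 115.0,
    ]);
    let _r: [f32; 16] = transmute(_mm512_shuffle_ps(a, b, 0b11_10_01_00));
    // _r == [0.0, 1.0, 102.0, 103.0, 4.0, 5.0, 106.0, 107.0,
    //        8.0, 9.0, 110.0, 111.0, 12.0, 13.0, 114.0, 115.0]
}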
12313
12314 /// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
12315 ///
12316 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_shuffle_ps&expand=5201)
12317 #[inline]
12318 #[target_feature(enable = "avx512f")]
12319 #[cfg_attr(test, assert_instr(vshufps, imm8 = 0))]
12320 #[rustc_args_required_const(4)]
12321 pub unsafe fn _mm512_mask_shuffle_ps(
12322 src: __m512,
12323 k: __mmask16,
12324 a: __m512,
12325 b: __m512,
12326 imm8: i32,
12327 ) -> __m512 {
12328 assert!(imm8 >= 0 && imm8 <= 255);
12329 let imm8 = (imm8 & 0xFF) as u8;
12330 macro_rules! shuffle4 {
12331 (
12332 $a:expr,
12333 $b:expr,
12334 $c:expr,
12335 $d:expr,
12336 $e:expr,
12337 $f:expr,
12338 $g:expr,
12339 $h:expr,
12340 $i:expr,
12341 $j:expr,
12342 $k:expr,
12343 $l:expr,
12344 $m:expr,
12345 $n:expr,
12346 $o:expr,
12347 $p:expr
12348 ) => {
12349 simd_shuffle16(
12350 a,
12351 b,
12352 [
12353 $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p,
12354 ],
12355 )
12356 };
12357 }
12358 macro_rules! shuffle3 {
12359 ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr, $i:expr, $j:expr, $k:expr, $m:expr, $n:expr, $o:expr) => {
12360 match (imm8 >> 6) & 0x3 {
12361 0 => shuffle4!($a, $b, $c, 16, $e, $f, $g, 20, $i, $j, $k, 24, $m, $n, $o, 28),
12362 1 => shuffle4!($a, $b, $c, 17, $e, $f, $g, 21, $i, $j, $k, 25, $m, $n, $o, 29),
12363 2 => shuffle4!($a, $b, $c, 18, $e, $f, $g, 22, $i, $j, $k, 26, $m, $n, $o, 30),
12364 _ => shuffle4!($a, $b, $c, 19, $e, $f, $g, 23, $i, $j, $k, 27, $m, $n, $o, 31),
12365 }
12366 };
12367 }
12368 macro_rules! shuffle2 {
12369 ($a:expr, $b:expr, $e:expr, $f:expr, $i:expr, $j:expr, $m:expr, $n:expr) => {
12370 match (imm8 >> 4) & 0x3 {
12371 0 => shuffle3!($a, $b, 16, $e, $f, 20, $i, $j, 24, $m, $n, 28),
12372 1 => shuffle3!($a, $b, 17, $e, $f, 21, $i, $j, 25, $m, $n, 29),
12373 2 => shuffle3!($a, $b, 18, $e, $f, 22, $i, $j, 26, $m, $n, 30),
12374 _ => shuffle3!($a, $b, 19, $e, $f, 23, $i, $j, 27, $m, $n, 31),
12375 }
12376 };
12377 }
12378 macro_rules! shuffle1 {
12379 ($a:expr, $e:expr, $i: expr, $m: expr) => {
12380 match (imm8 >> 2) & 0x3 {
12381 0 => shuffle2!($a, 0, $e, 4, $i, 8, $m, 12),
12382 1 => shuffle2!($a, 1, $e, 5, $i, 9, $m, 13),
12383 2 => shuffle2!($a, 2, $e, 6, $i, 10, $m, 14),
12384 _ => shuffle2!($a, 3, $e, 7, $i, 11, $m, 15),
12385 }
12386 };
12387 }
12388 let shuffle = match imm8 & 0x3 {
12389 0 => shuffle1!(0, 4, 8, 12),
12390 1 => shuffle1!(1, 5, 9, 13),
12391 2 => shuffle1!(2, 6, 10, 14),
12392 _ => shuffle1!(3, 7, 11, 15),
12393 };
12394
12395 transmute(simd_select_bitmask(k, shuffle, src.as_f32x16()))
12396 }
12397
12398 /// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12399 ///
12400 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_shuffle_ps&expand=5202)
12401 #[inline]
12402 #[target_feature(enable = "avx512f")]
12403 #[cfg_attr(test, assert_instr(vshufps, imm8 = 0))]
12404 #[rustc_args_required_const(3)]
12405 pub unsafe fn _mm512_maskz_shuffle_ps(k: __mmask16, a: __m512, b: __m512, imm8: i32) -> __m512 {
12406 assert!(imm8 >= 0 && imm8 <= 255);
12407 let imm8 = (imm8 & 0xFF) as u8;
12408 macro_rules! shuffle4 {
12409 (
12410 $a:expr,
12411 $b:expr,
12412 $c:expr,
12413 $d:expr,
12414 $e:expr,
12415 $f:expr,
12416 $g:expr,
12417 $h:expr,
12418 $i:expr,
12419 $j:expr,
12420 $k:expr,
12421 $l:expr,
12422 $m:expr,
12423 $n:expr,
12424 $o:expr,
12425 $p:expr
12426 ) => {
12427 simd_shuffle16(
12428 a,
12429 b,
12430 [
12431 $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p,
12432 ],
12433 )
12434 };
12435 }
12436 macro_rules! shuffle3 {
12437 ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr, $i:expr, $j:expr, $k:expr, $m:expr, $n:expr, $o:expr) => {
12438 match (imm8 >> 6) & 0x3 {
12439 0 => shuffle4!($a, $b, $c, 16, $e, $f, $g, 20, $i, $j, $k, 24, $m, $n, $o, 28),
12440 1 => shuffle4!($a, $b, $c, 17, $e, $f, $g, 21, $i, $j, $k, 25, $m, $n, $o, 29),
12441 2 => shuffle4!($a, $b, $c, 18, $e, $f, $g, 22, $i, $j, $k, 26, $m, $n, $o, 30),
12442 _ => shuffle4!($a, $b, $c, 19, $e, $f, $g, 23, $i, $j, $k, 27, $m, $n, $o, 31),
12443 }
12444 };
12445 }
12446 macro_rules! shuffle2 {
12447 ($a:expr, $b:expr, $e:expr, $f:expr, $i:expr, $j:expr, $m:expr, $n:expr) => {
12448 match (imm8 >> 4) & 0x3 {
12449 0 => shuffle3!($a, $b, 16, $e, $f, 20, $i, $j, 24, $m, $n, 28),
12450 1 => shuffle3!($a, $b, 17, $e, $f, 21, $i, $j, 25, $m, $n, 29),
12451 2 => shuffle3!($a, $b, 18, $e, $f, 22, $i, $j, 26, $m, $n, 30),
12452 _ => shuffle3!($a, $b, 19, $e, $f, 23, $i, $j, 27, $m, $n, 31),
12453 }
12454 };
12455 }
12456 macro_rules! shuffle1 {
12457 ($a:expr, $e:expr, $i: expr, $m: expr) => {
12458 match (imm8 >> 2) & 0x3 {
12459 0 => shuffle2!($a, 0, $e, 4, $i, 8, $m, 12),
12460 1 => shuffle2!($a, 1, $e, 5, $i, 9, $m, 13),
12461 2 => shuffle2!($a, 2, $e, 6, $i, 10, $m, 14),
12462 _ => shuffle2!($a, 3, $e, 7, $i, 11, $m, 15),
12463 }
12464 };
12465 }
12466 let shuffle = match imm8 & 0x3 {
12467 0 => shuffle1!(0, 4, 8, 12),
12468 1 => shuffle1!(1, 5, 9, 13),
12469 2 => shuffle1!(2, 6, 10, 14),
12470 _ => shuffle1!(3, 7, 11, 15),
12471 };
12472
12473 let zero = _mm512_setzero_ps().as_f32x16();
12474 transmute(simd_select_bitmask(k, shuffle, zero))
12475 }
12476
12477 /// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst.
12478 ///
12479 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_shuffle_pd&expand=5192)
12480 #[inline]
12481 #[target_feature(enable = "avx512f")]
12482 #[cfg_attr(test, assert_instr(vshufpd, imm8 = 3))]
12483 #[rustc_args_required_const(2)]
12484 pub unsafe fn _mm512_shuffle_pd(a: __m512d, b: __m512d, imm8: i32) -> __m512d {
12485 assert!(imm8 >= 0 && imm8 <= 255);
12486 let imm8 = (imm8 & 0xFF) as u8;
12487 macro_rules! shuffle8 {
12488 ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr, $g:expr, $h:expr) => {
12489 simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h])
12490 };
12491 }
12492 macro_rules! shuffle7 {
12493 ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr, $g:expr) => {
12494 match (imm8 >> 7) & 0x1 {
12495 0 => shuffle8!($a, $b, $c, $d, $e, $f, $g, 14),
12496 _ => shuffle8!($a, $b, $c, $d, $e, $f, $g, 15),
12497 }
12498 };
12499 }
12500 macro_rules! shuffle6 {
12501 ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr) => {
12502 match (imm8 >> 6) & 0x1 {
12503 0 => shuffle7!($a, $b, $c, $d, $e, $f, 6),
12504 _ => shuffle7!($a, $b, $c, $d, $e, $f, 7),
12505 }
12506 };
12507 }
12508 macro_rules! shuffle5 {
12509 ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr) => {
12510 match (imm8 >> 5) & 0x1 {
12511 0 => shuffle6!($a, $b, $c, $d, $e, 12),
12512 _ => shuffle6!($a, $b, $c, $d, $e, 13),
12513 }
12514 };
12515 }
12516 macro_rules! shuffle4 {
12517 ($a:expr, $b:expr, $c:expr, $d:expr) => {
12518 match (imm8 >> 4) & 0x1 {
12519 0 => shuffle5!($a, $b, $c, $d, 4),
12520 _ => shuffle5!($a, $b, $c, $d, 5),
12521 }
12522 };
12523 }
12524 macro_rules! shuffle3 {
12525 ($a:expr, $b:expr, $c:expr) => {
12526 match (imm8 >> 3) & 0x1 {
12527 0 => shuffle4!($a, $b, $c, 10),
12528 _ => shuffle4!($a, $b, $c, 11),
12529 }
12530 };
12531 }
12532 macro_rules! shuffle2 {
12533 ($a:expr, $b:expr) => {
12534 match (imm8 >> 2) & 0x1 {
12535 0 => shuffle3!($a, $b, 2),
12536 _ => shuffle3!($a, $b, 3),
12537 }
12538 };
12539 }
12540 macro_rules! shuffle1 {
12541 ($a:expr) => {
12542 match (imm8 >> 1) & 0x1 {
12543 0 => shuffle2!($a, 8),
12544 _ => shuffle2!($a, 9),
12545 }
12546 };
12547 }
12548 match imm8 & 0x1 {
12549 0 => shuffle1!(0),
12550 _ => shuffle1!(1),
12551 }
12552 }
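
// Illustrative sketch (test-only helper, not part of the public API): bit j of imm8 controls
// output element j; even outputs are taken from `a`, odd outputs from `b`, and the bit picks
// the low or high element of the corresponding 128-bit pair. With imm8 = 0b0000_0001 only
// output element 0 takes the high element of its pair. Inputs are arbitrary example values
// chosen for this sketch.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_shuffle_pd() {
    let a: __m512d = transmute([0.0f64, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]);
    let b: __m512d = transmute([10.0f64, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0]);
    let _r: [f64; 8] = transmute(_mm512_shuffle_pd(a, b, 0b0000_0001));
    // _r == [1.0, 10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0]
}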
12553
12554 /// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
12555 ///
12556 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_shuffle_pd&expand=5190)
12557 #[inline]
12558 #[target_feature(enable = "avx512f")]
12559 #[cfg_attr(test, assert_instr(vshufpd, imm8 = 3))]
12560 #[rustc_args_required_const(4)]
12561 pub unsafe fn _mm512_mask_shuffle_pd(
12562 src: __m512d,
12563 k: __mmask8,
12564 a: __m512d,
12565 b: __m512d,
12566 imm8: i32,
12567 ) -> __m512d {
12568 assert!(imm8 >= 0 && imm8 <= 255);
12569 let imm8 = (imm8 & 0xFF) as u8;
12570 macro_rules! shuffle8 {
12571 ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr, $g:expr, $h:expr) => {
12572 simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h])
12573 };
12574 }
12575 macro_rules! shuffle7 {
12576 ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr, $g:expr) => {
12577 match (imm8 >> 7) & 0x1 {
12578 0 => shuffle8!($a, $b, $c, $d, $e, $f, $g, 14),
12579 _ => shuffle8!($a, $b, $c, $d, $e, $f, $g, 15),
12580 }
12581 };
12582 }
12583 macro_rules! shuffle6 {
12584 ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr) => {
12585 match (imm8 >> 6) & 0x1 {
12586 0 => shuffle7!($a, $b, $c, $d, $e, $f, 6),
12587 _ => shuffle7!($a, $b, $c, $d, $e, $f, 7),
12588 }
12589 };
12590 }
12591 macro_rules! shuffle5 {
12592 ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr) => {
12593 match (imm8 >> 5) & 0x1 {
12594 0 => shuffle6!($a, $b, $c, $d, $e, 12),
12595 _ => shuffle6!($a, $b, $c, $d, $e, 13),
12596 }
12597 };
12598 }
12599 macro_rules! shuffle4 {
12600 ($a:expr, $b:expr, $c:expr, $d:expr) => {
12601 match (imm8 >> 4) & 0x1 {
12602 0 => shuffle5!($a, $b, $c, $d, 4),
12603 _ => shuffle5!($a, $b, $c, $d, 5),
12604 }
12605 };
12606 }
12607 macro_rules! shuffle3 {
12608 ($a:expr, $b:expr, $c:expr) => {
12609 match (imm8 >> 3) & 0x1 {
12610 0 => shuffle4!($a, $b, $c, 10),
12611 _ => shuffle4!($a, $b, $c, 11),
12612 }
12613 };
12614 }
12615 macro_rules! shuffle2 {
12616 ($a:expr, $b:expr) => {
12617 match (imm8 >> 2) & 0x1 {
12618 0 => shuffle3!($a, $b, 2),
12619 _ => shuffle3!($a, $b, 3),
12620 }
12621 };
12622 }
12623 macro_rules! shuffle1 {
12624 ($a:expr) => {
12625 match (imm8 >> 1) & 0x1 {
12626 0 => shuffle2!($a, 8),
12627 _ => shuffle2!($a, 9),
12628 }
12629 };
12630 }
12631 let shuffle = match imm8 & 0x1 {
12632 0 => shuffle1!(0),
12633 _ => shuffle1!(1),
12634 };
12635
12636 transmute(simd_select_bitmask(k, shuffle, src.as_f64x8()))
12637 }
12638
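// Illustrative writemask sketch (hypothetical helper, not upstream test code): only
// results whose mask bit is set come from the shuffle; the other elements are copied
// from `src`.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn usage_sketch_mm512_mask_shuffle_pd() {
    let src = _mm512_set1_pd(-1.);
    let a = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
    let b = _mm512_setr_pd(8., 9., 10., 11., 12., 13., 14., 15.);
    let r = _mm512_mask_shuffle_pd(src, 0b0000_1111, a, b, 0b1111_1111);
    let e: [f64; 8] = [1., 9., 3., 11., -1., -1., -1., -1.];
    assert_eq!(mem::transmute::<__m512d, [f64; 8]>(r), e);
}
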
12639 /// Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12640 ///
12641 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_shuffle_pd&expand=5191)
12642 #[inline]
12643 #[target_feature(enable = "avx512f")]
12644 #[cfg_attr(test, assert_instr(vshufpd, imm8 = 3))]
12645 #[rustc_args_required_const(3)]
12646 pub unsafe fn _mm512_maskz_shuffle_pd(k: __mmask8, a: __m512d, b: __m512d, imm8: i32) -> __m512d {
12647 assert!(imm8 >= 0 && imm8 <= 255);
12648 let imm8 = (imm8 & 0xFF) as u8;
12649 macro_rules! shuffle8 {
12650 ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr, $g:expr, $h:expr) => {
12651 simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h])
12652 };
12653 }
12654 macro_rules! shuffle7 {
12655 ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr, $g:expr) => {
12656 match (imm8 >> 7) & 0x1 {
12657 0 => shuffle8!($a, $b, $c, $d, $e, $f, $g, 14),
12658 _ => shuffle8!($a, $b, $c, $d, $e, $f, $g, 15),
12659 }
12660 };
12661 }
12662 macro_rules! shuffle6 {
12663 ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr) => {
12664 match (imm8 >> 6) & 0x1 {
12665 0 => shuffle7!($a, $b, $c, $d, $e, $f, 6),
12666 _ => shuffle7!($a, $b, $c, $d, $e, $f, 7),
12667 }
12668 };
12669 }
12670 macro_rules! shuffle5 {
12671 ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr) => {
12672 match (imm8 >> 5) & 0x1 {
12673 0 => shuffle6!($a, $b, $c, $d, $e, 12),
12674 _ => shuffle6!($a, $b, $c, $d, $e, 13),
12675 }
12676 };
12677 }
12678 macro_rules! shuffle4 {
12679 ($a:expr, $b:expr, $c:expr, $d:expr) => {
12680 match (imm8 >> 4) & 0x1 {
12681 0 => shuffle5!($a, $b, $c, $d, 4),
12682 _ => shuffle5!($a, $b, $c, $d, 5),
12683 }
12684 };
12685 }
12686 macro_rules! shuffle3 {
12687 ($a:expr, $b:expr, $c:expr) => {
12688 match (imm8 >> 3) & 0x1 {
12689 0 => shuffle4!($a, $b, $c, 10),
12690 _ => shuffle4!($a, $b, $c, 11),
12691 }
12692 };
12693 }
12694 macro_rules! shuffle2 {
12695 ($a:expr, $b:expr) => {
12696 match (imm8 >> 2) & 0x1 {
12697 0 => shuffle3!($a, $b, 2),
12698 _ => shuffle3!($a, $b, 3),
12699 }
12700 };
12701 }
12702 macro_rules! shuffle1 {
12703 ($a:expr) => {
12704 match (imm8 >> 1) & 0x1 {
12705 0 => shuffle2!($a, 8),
12706 _ => shuffle2!($a, 9),
12707 }
12708 };
12709 }
12710 let shuffle = match imm8 & 0x1 {
12711 0 => shuffle1!(0),
12712 _ => shuffle1!(1),
12713 };
12714
12715 let zero = _mm512_setzero_pd().as_f64x8();
12716 transmute(simd_select_bitmask(k, shuffle, zero))
12717 }
12718
12719 /// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst.
12720 ///
12721 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_shuffle_i32&expand=5177)
12722 #[inline]
12723 #[target_feature(enable = "avx512f")]
12724 #[cfg_attr(test, assert_instr(vshufi64x2, imm8 = 0b10111111))] // should be vshufi32x4, but vshufi64x2 is generated
12725 #[rustc_args_required_const(2)]
12726 pub unsafe fn _mm512_shuffle_i32x4(a: __m512i, b: __m512i, imm8: i32) -> __m512i {
12727 assert!(imm8 >= 0 && imm8 <= 255);
12728 let imm8 = (imm8 & 0xFF) as u8;
12729 let a = a.as_i32x16();
12730 let b = b.as_i32x16();
12731 macro_rules! shuffle4 {
12732 (
12733 $a:expr,
12734 $b:expr,
12735 $c:expr,
12736 $d:expr,
12737 $e:expr,
12738 $f:expr,
12739 $g:expr,
12740 $h:expr,
12741 $i:expr,
12742 $j:expr,
12743 $k:expr,
12744 $l:expr,
12745 $m:expr,
12746 $n:expr,
12747 $o:expr,
12748 $p:expr
12749 ) => {
12750 simd_shuffle16(
12751 a,
12752 b,
12753 [
12754 $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p,
12755 ],
12756 )
12757 };
12758 }
12759 macro_rules! shuffle3 {
12760 ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr, $i:expr, $j:expr, $k:expr, $m:expr, $n:expr, $o:expr) => {
12761 match (imm8 >> 6) & 0x3 {
12762 0 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 16, 17, 18, 19),
12763 1 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 20, 21, 22, 23),
12764 2 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 24, 25, 26, 27),
12765 _ => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 28, 29, 30, 31),
12766 }
12767 };
12768 }
12769 macro_rules! shuffle2 {
12770 ($a:expr, $b:expr, $e:expr, $f:expr, $i:expr, $j:expr, $m:expr, $n:expr) => {
12771 match (imm8 >> 4) & 0x3 {
12772 0 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 16, 17, 18, 19),
12773 1 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 20, 21, 22, 23),
12774 2 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 24, 25, 26, 27),
12775 _ => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 28, 29, 30, 31),
12776 }
12777 };
12778 }
12779 macro_rules! shuffle1 {
12780 ($a:expr, $e:expr, $i: expr, $m: expr) => {
12781 match (imm8 >> 2) & 0x3 {
12782 0 => shuffle2!($a, $e, $i, $m, 0, 1, 2, 3),
12783 1 => shuffle2!($a, $e, $i, $m, 4, 5, 6, 7),
12784 2 => shuffle2!($a, $e, $i, $m, 8, 9, 10, 11),
12785 _ => shuffle2!($a, $e, $i, $m, 12, 13, 14, 15),
12786 }
12787 };
12788 }
12789 let r: i32x16 = match imm8 & 0x3 {
12790 0 => shuffle1!(0, 1, 2, 3),
12791 1 => shuffle1!(4, 5, 6, 7),
12792 2 => shuffle1!(8, 9, 10, 11),
12793 _ => shuffle1!(12, 13, 14, 15),
12794 };
12795
12796 transmute(r)
12797 }
12798
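// Illustrative usage sketch (hypothetical helper, not upstream test code; assumes the
// `_mm512_setr_epi32` constructor defined elsewhere in this file). The four 2-bit
// fields of imm8 each select a 128-bit lane: the low two destination lanes come from
// `a`, the high two from `b`.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn usage_sketch_mm512_shuffle_i32x4() {
    let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    let b = _mm512_setr_epi32(
        16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
    );
    // 0b01_00_11_10: dst lanes are a.lane2, a.lane3, b.lane0, b.lane1.
    let r = _mm512_shuffle_i32x4(a, b, 0b0100_1110);
    let e: [i32; 16] = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23];
    assert_eq!(mem::transmute::<__m512i, [i32; 16]>(r), e);
}
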
12799 /// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
12800 ///
12801 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_shuffle_i32x&expand=5175)
12802 #[inline]
12803 #[target_feature(enable = "avx512f")]
12804 #[cfg_attr(test, assert_instr(vshufi32x4, imm8 = 0b10111111))]
12805 #[rustc_args_required_const(4)]
12806 pub unsafe fn _mm512_mask_shuffle_i32x4(
12807 src: __m512i,
12808 k: __mmask16,
12809 a: __m512i,
12810 b: __m512i,
12811 imm8: i32,
12812 ) -> __m512i {
12813 assert!(imm8 >= 0 && imm8 <= 255);
12814 let imm8 = (imm8 & 0xFF) as u8;
12815 let a = a.as_i32x16();
12816 let b = b.as_i32x16();
12817 macro_rules! shuffle4 {
12818 (
12819 $a:expr,
12820 $b:expr,
12821 $c:expr,
12822 $d:expr,
12823 $e:expr,
12824 $f:expr,
12825 $g:expr,
12826 $h:expr,
12827 $i:expr,
12828 $j:expr,
12829 $k:expr,
12830 $l:expr,
12831 $m:expr,
12832 $n:expr,
12833 $o:expr,
12834 $p:expr
12835 ) => {
12836 simd_shuffle16(
12837 a,
12838 b,
12839 [
12840 $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p,
12841 ],
12842 )
12843 };
12844 }
12845 macro_rules! shuffle3 {
12846 ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr, $i:expr, $j:expr, $k:expr, $m:expr, $n:expr, $o:expr) => {
12847 match (imm8 >> 6) & 0x3 {
12848 0 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 16, 17, 18, 19),
12849 1 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 20, 21, 22, 23),
12850 2 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 24, 25, 26, 27),
12851 _ => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 28, 29, 30, 31),
12852 }
12853 };
12854 }
12855 macro_rules! shuffle2 {
12856 ($a:expr, $b:expr, $e:expr, $f:expr, $i:expr, $j:expr, $m:expr, $n:expr) => {
12857 match (imm8 >> 4) & 0x3 {
12858 0 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 16, 17, 18, 19),
12859 1 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 20, 21, 22, 23),
12860 2 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 24, 25, 26, 27),
12861 _ => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 28, 29, 30, 31),
12862 }
12863 };
12864 }
12865 macro_rules! shuffle1 {
12866 ($a:expr, $e:expr, $i: expr, $m: expr) => {
12867 match (imm8 >> 2) & 0x3 {
12868 0 => shuffle2!($a, $e, $i, $m, 0, 1, 2, 3),
12869 1 => shuffle2!($a, $e, $i, $m, 4, 5, 6, 7),
12870 2 => shuffle2!($a, $e, $i, $m, 8, 9, 10, 11),
12871 _ => shuffle2!($a, $e, $i, $m, 12, 13, 14, 15),
12872 }
12873 };
12874 }
12875 let shuffle = match imm8 & 0x3 {
12876 0 => shuffle1!(0, 1, 2, 3),
12877 1 => shuffle1!(4, 5, 6, 7),
12878 2 => shuffle1!(8, 9, 10, 11),
12879 _ => shuffle1!(12, 13, 14, 15),
12880 };
12881
12882 transmute(simd_select_bitmask(k, shuffle, src.as_i32x16()))
12883 }
12884
12885 /// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12886 ///
12887 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_shuffle_i32&expand=5176)
12888 #[inline]
12889 #[target_feature(enable = "avx512f")]
12890 #[cfg_attr(test, assert_instr(vshufi32x4, imm8 = 0b10111111))]
12891 #[rustc_args_required_const(3)]
12892 pub unsafe fn _mm512_maskz_shuffle_i32x4(
12893 k: __mmask16,
12894 a: __m512i,
12895 b: __m512i,
12896 imm8: i32,
12897 ) -> __m512i {
12898 assert!(imm8 >= 0 && imm8 <= 255);
12899 let imm8 = (imm8 & 0xFF) as u8;
12900 let a = a.as_i32x16();
12901 let b = b.as_i32x16();
12902 macro_rules! shuffle4 {
12903 (
12904 $a:expr,
12905 $b:expr,
12906 $c:expr,
12907 $d:expr,
12908 $e:expr,
12909 $f:expr,
12910 $g:expr,
12911 $h:expr,
12912 $i:expr,
12913 $j:expr,
12914 $k:expr,
12915 $l:expr,
12916 $m:expr,
12917 $n:expr,
12918 $o:expr,
12919 $p:expr
12920 ) => {
12921 simd_shuffle16(
12922 a,
12923 b,
12924 [
12925 $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p,
12926 ],
12927 )
12928 };
12929 }
12930 macro_rules! shuffle3 {
12931 ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr, $i:expr, $j:expr, $k:expr, $m:expr, $n:expr, $o:expr) => {
12932 match (imm8 >> 6) & 0x3 {
12933 0 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 16, 17, 18, 19),
12934 1 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 20, 21, 22, 23),
12935 2 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 24, 25, 26, 27),
12936 _ => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 28, 29, 30, 31),
12937 }
12938 };
12939 }
12940 macro_rules! shuffle2 {
12941 ($a:expr, $b:expr, $e:expr, $f:expr, $i:expr, $j:expr, $m:expr, $n:expr) => {
12942 match (imm8 >> 4) & 0x3 {
12943 0 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 16, 17, 18, 19),
12944 1 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 20, 21, 22, 23),
12945 2 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 24, 25, 26, 27),
12946 _ => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 28, 29, 30, 31),
12947 }
12948 };
12949 }
12950 macro_rules! shuffle1 {
12951 ($a:expr, $e:expr, $i: expr, $m: expr) => {
12952 match (imm8 >> 2) & 0x3 {
12953 0 => shuffle2!($a, $e, $i, $m, 0, 1, 2, 3),
12954 1 => shuffle2!($a, $e, $i, $m, 4, 5, 6, 7),
12955 2 => shuffle2!($a, $e, $i, $m, 8, 9, 10, 11),
12956 _ => shuffle2!($a, $e, $i, $m, 12, 13, 14, 15),
12957 }
12958 };
12959 }
12960 let shuffle = match imm8 & 0x3 {
12961 0 => shuffle1!(0, 1, 2, 3),
12962 1 => shuffle1!(4, 5, 6, 7),
12963 2 => shuffle1!(8, 9, 10, 11),
12964 _ => shuffle1!(12, 13, 14, 15),
12965 };
12966
12967 let zero = _mm512_setzero_si512().as_i32x16();
12968 transmute(simd_select_bitmask(k, shuffle, zero))
12969 }
12970
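// Illustrative zeromask sketch (hypothetical helper, not upstream test code): with
// k = 0b0000_0000_1111_1111 the low eight results come from the shuffle and the high
// eight are zeroed.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn usage_sketch_mm512_maskz_shuffle_i32x4() {
    let a = _mm512_set1_epi32(7);
    let b = _mm512_set1_epi32(9);
    let r = _mm512_maskz_shuffle_i32x4(0b0000_0000_1111_1111, a, b, 0b0100_1110);
    let e: [i32; 16] = [7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0];
    assert_eq!(mem::transmute::<__m512i, [i32; 16]>(r), e);
}
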
12971 /// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst.
12972 ///
12973 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_shuffle_i64x2&expand=5183)
12974 #[inline]
12975 #[target_feature(enable = "avx512f")]
12976 #[cfg_attr(test, assert_instr(vshufi64x2, imm8 = 0b10111111))]
12977 #[rustc_args_required_const(2)]
12978 pub unsafe fn _mm512_shuffle_i64x2(a: __m512i, b: __m512i, imm8: i32) -> __m512i {
12979 assert!(imm8 >= 0 && imm8 <= 255);
12980 let imm8 = (imm8 & 0xFF) as u8;
12981 macro_rules! shuffle4 {
12982 (
12983 $a:expr,
12984 $b:expr,
12985 $c:expr,
12986 $d:expr,
12987 $e:expr,
12988 $f:expr,
12989 $g:expr,
12990 $h:expr
12991 ) => {
12992 simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h])
12993 };
12994 }
12995 macro_rules! shuffle3 {
12996 ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr) => {
12997 match (imm8 >> 6) & 0x3 {
12998 0 => shuffle4!($a, $b, $c, $e, $f, $g, 8, 9),
12999 1 => shuffle4!($a, $b, $c, $e, $f, $g, 10, 11),
13000 2 => shuffle4!($a, $b, $c, $e, $f, $g, 12, 13),
13001 _ => shuffle4!($a, $b, $c, $e, $f, $g, 14, 15),
13002 }
13003 };
13004 }
13005 macro_rules! shuffle2 {
13006 ($a:expr, $b:expr, $e:expr, $f:expr) => {
13007 match (imm8 >> 4) & 0x3 {
13008 0 => shuffle3!($a, $b, $e, $f, 8, 9),
13009 1 => shuffle3!($a, $b, $e, $f, 10, 11),
13010 2 => shuffle3!($a, $b, $e, $f, 12, 13),
13011 _ => shuffle3!($a, $b, $e, $f, 14, 15),
13012 }
13013 };
13014 }
13015 macro_rules! shuffle1 {
13016 ($a:expr, $e:expr) => {
13017 match (imm8 >> 2) & 0x3 {
13018 0 => shuffle2!($a, $e, 0, 1),
13019 1 => shuffle2!($a, $e, 2, 3),
13020 2 => shuffle2!($a, $e, 4, 5),
13021 _ => shuffle2!($a, $e, 6, 7),
13022 }
13023 };
13024 }
13025 match imm8 & 0x3 {
13026 0 => shuffle1!(0, 1),
13027 1 => shuffle1!(2, 3),
13028 2 => shuffle1!(4, 5),
13029 _ => shuffle1!(6, 7),
13030 }
13031 }
13032
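// Illustrative usage sketch (hypothetical helper, not upstream test code; assumes the
// `_mm512_setr_epi64` constructor defined elsewhere in this file). Each 2-bit field of
// imm8 selects a 128-bit lane, i.e. a pair of 64-bit integers.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn usage_sketch_mm512_shuffle_i64x2() {
    let a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
    let b = _mm512_setr_epi64(8, 9, 10, 11, 12, 13, 14, 15);
    // 0b01_00_11_10: dst lanes are a.lane2, a.lane3, b.lane0, b.lane1.
    let r = _mm512_shuffle_i64x2(a, b, 0b0100_1110);
    let e: [i64; 8] = [4, 5, 6, 7, 8, 9, 10, 11];
    assert_eq!(mem::transmute::<__m512i, [i64; 8]>(r), e);
}
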
13033 /// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
13034 ///
13035 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_shuffle_i64x&expand=5181)
13036 #[inline]
13037 #[target_feature(enable = "avx512f")]
13038 #[cfg_attr(test, assert_instr(vshufi64x2, imm8 = 0b10111111))]
13039 #[rustc_args_required_const(4)]
13040 pub unsafe fn _mm512_mask_shuffle_i64x2(
13041 src: __m512i,
13042 k: __mmask8,
13043 a: __m512i,
13044 b: __m512i,
13045 imm8: i32,
13046 ) -> __m512i {
13047 assert!(imm8 >= 0 && imm8 <= 255);
13048 let imm8 = (imm8 & 0xFF) as u8;
13049 macro_rules! shuffle4 {
13050 (
13051 $a:expr,
13052 $b:expr,
13053 $c:expr,
13054 $d:expr,
13055 $e:expr,
13056 $f:expr,
13057 $g:expr,
13058 $h:expr
13059 ) => {
13060 simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h])
13061 };
13062 }
13063 macro_rules! shuffle3 {
13064 ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr) => {
13065 match (imm8 >> 6) & 0x3 {
13066 0 => shuffle4!($a, $b, $c, $e, $f, $g, 8, 9),
13067 1 => shuffle4!($a, $b, $c, $e, $f, $g, 10, 11),
13068 2 => shuffle4!($a, $b, $c, $e, $f, $g, 12, 13),
13069 _ => shuffle4!($a, $b, $c, $e, $f, $g, 14, 15),
13070 }
13071 };
13072 }
13073 macro_rules! shuffle2 {
13074 ($a:expr, $b:expr, $e:expr, $f:expr) => {
13075 match (imm8 >> 4) & 0x3 {
13076 0 => shuffle3!($a, $b, $e, $f, 8, 9),
13077 1 => shuffle3!($a, $b, $e, $f, 10, 11),
13078 2 => shuffle3!($a, $b, $e, $f, 12, 13),
13079 _ => shuffle3!($a, $b, $e, $f, 14, 15),
13080 }
13081 };
13082 }
13083 macro_rules! shuffle1 {
13084 ($a:expr, $e:expr) => {
13085 match (imm8 >> 2) & 0x3 {
13086 0 => shuffle2!($a, $e, 0, 1),
13087 1 => shuffle2!($a, $e, 2, 3),
13088 2 => shuffle2!($a, $e, 4, 5),
13089 _ => shuffle2!($a, $e, 6, 7),
13090 }
13091 };
13092 }
13093 let shuffle = match imm8 & 0x3 {
13094 0 => shuffle1!(0, 1),
13095 1 => shuffle1!(2, 3),
13096 2 => shuffle1!(4, 5),
13097 _ => shuffle1!(6, 7),
13098 };
13099
13100 transmute(simd_select_bitmask(k, shuffle, src.as_i64x8()))
13101 }
13102
13103 /// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13104 ///
13105 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_shuffle_i64&expand=5182)
13106 #[inline]
13107 #[target_feature(enable = "avx512f")]
13108 #[cfg_attr(test, assert_instr(vshufi64x2, imm8 = 0b10111111))]
13109 #[rustc_args_required_const(3)]
13110 pub unsafe fn _mm512_maskz_shuffle_i64x2(
13111 k: __mmask8,
13112 a: __m512i,
13113 b: __m512i,
13114 imm8: i32,
13115 ) -> __m512i {
13116 assert!(imm8 >= 0 && imm8 <= 255);
13117 let imm8 = (imm8 & 0xFF) as u8;
13118 macro_rules! shuffle4 {
13119 (
13120 $a:expr,
13121 $b:expr,
13122 $c:expr,
13123 $d:expr,
13124 $e:expr,
13125 $f:expr,
13126 $g:expr,
13127 $h:expr
13128 ) => {
13129 simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h])
13130 };
13131 }
13132 macro_rules! shuffle3 {
13133 ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr) => {
13134 match (imm8 >> 6) & 0x3 {
13135 0 => shuffle4!($a, $b, $c, $e, $f, $g, 8, 9),
13136 1 => shuffle4!($a, $b, $c, $e, $f, $g, 10, 11),
13137 2 => shuffle4!($a, $b, $c, $e, $f, $g, 12, 13),
13138 _ => shuffle4!($a, $b, $c, $e, $f, $g, 14, 15),
13139 }
13140 };
13141 }
13142 macro_rules! shuffle2 {
13143 ($a:expr, $b:expr, $e:expr, $f:expr) => {
13144 match (imm8 >> 4) & 0x3 {
13145 0 => shuffle3!($a, $b, $e, $f, 8, 9),
13146 1 => shuffle3!($a, $b, $e, $f, 10, 11),
13147 2 => shuffle3!($a, $b, $e, $f, 12, 13),
13148 _ => shuffle3!($a, $b, $e, $f, 14, 15),
13149 }
13150 };
13151 }
13152 macro_rules! shuffle1 {
13153 ($a:expr, $e:expr) => {
13154 match (imm8 >> 2) & 0x3 {
13155 0 => shuffle2!($a, $e, 0, 1),
13156 1 => shuffle2!($a, $e, 2, 3),
13157 2 => shuffle2!($a, $e, 4, 5),
13158 _ => shuffle2!($a, $e, 6, 7),
13159 }
13160 };
13161 }
13162 let shuffle = match imm8 & 0x3 {
13163 0 => shuffle1!(0, 1),
13164 1 => shuffle1!(2, 3),
13165 2 => shuffle1!(4, 5),
13166 _ => shuffle1!(6, 7),
13167 };
13168
13169 let zero = _mm512_setzero_si512().as_i64x8();
13170 transmute(simd_select_bitmask(k, shuffle, zero))
13171 }
13172
13173 /// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst.
13174 ///
13175 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_shuffle_f32x4&expand=5165)
13176 #[inline]
13177 #[target_feature(enable = "avx512f")]
13178 #[cfg_attr(test, assert_instr(vshuff64x2, imm8 = 0b10111111))] // should be vshuff32x4, but vshuff64x2 is generated
13179 #[rustc_args_required_const(2)]
13180 pub unsafe fn _mm512_shuffle_f32x4(a: __m512, b: __m512, imm8: i32) -> __m512 {
13181 assert!(imm8 >= 0 && imm8 <= 255);
13182 let imm8 = (imm8 & 0xFF) as u8;
13183 macro_rules! shuffle4 {
13184 (
13185 $a:expr,
13186 $b:expr,
13187 $c:expr,
13188 $d:expr,
13189 $e:expr,
13190 $f:expr,
13191 $g:expr,
13192 $h:expr,
13193 $i:expr,
13194 $j:expr,
13195 $k:expr,
13196 $l:expr,
13197 $m:expr,
13198 $n:expr,
13199 $o:expr,
13200 $p:expr
13201 ) => {
13202 simd_shuffle16(
13203 a,
13204 b,
13205 [
13206 $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p,
13207 ],
13208 )
13209 };
13210 }
13211 macro_rules! shuffle3 {
13212 ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr, $i:expr, $j:expr, $k:expr, $m:expr, $n:expr, $o:expr) => {
13213 match (imm8 >> 6) & 0x3 {
13214 0 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 16, 17, 18, 19),
13215 1 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 20, 21, 22, 23),
13216 2 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 24, 25, 26, 27),
13217 _ => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 28, 29, 30, 31),
13218 }
13219 };
13220 }
13221 macro_rules! shuffle2 {
13222 ($a:expr, $b:expr, $e:expr, $f:expr, $i:expr, $j:expr, $m:expr, $n:expr) => {
13223 match (imm8 >> 4) & 0x3 {
13224 0 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 16, 17, 18, 19),
13225 1 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 20, 21, 22, 23),
13226 2 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 24, 25, 26, 27),
13227 _ => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 28, 29, 30, 31),
13228 }
13229 };
13230 }
13231 macro_rules! shuffle1 {
13232 ($a:expr, $e:expr, $i: expr, $m: expr) => {
13233 match (imm8 >> 2) & 0x3 {
13234 0 => shuffle2!($a, $e, $i, $m, 0, 1, 2, 3),
13235 1 => shuffle2!($a, $e, $i, $m, 4, 5, 6, 7),
13236 2 => shuffle2!($a, $e, $i, $m, 8, 9, 10, 11),
13237 _ => shuffle2!($a, $e, $i, $m, 12, 13, 14, 15),
13238 }
13239 };
13240 }
13241 match imm8 & 0x3 {
13242 0 => shuffle1!(0, 1, 2, 3),
13243 1 => shuffle1!(4, 5, 6, 7),
13244 2 => shuffle1!(8, 9, 10, 11),
13245 _ => shuffle1!(12, 13, 14, 15),
13246 }
13247 }
13248
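// Illustrative usage sketch (hypothetical helper, not upstream test code; assumes the
// `_mm512_setr_ps` constructor defined elsewhere in this file). imm8 = 0b11_10_01_00
// keeps the low two lanes of `a` and takes the high two lanes of `b`.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn usage_sketch_mm512_shuffle_f32x4() {
    let a = _mm512_setr_ps(
        0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
    );
    let b = _mm512_setr_ps(
        16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31.,
    );
    let r = _mm512_shuffle_f32x4(a, b, 0b1110_0100);
    let e: [f32; 16] = [
        0., 1., 2., 3., 4., 5., 6., 7., 24., 25., 26., 27., 28., 29., 30., 31.,
    ];
    assert_eq!(mem::transmute::<__m512, [f32; 16]>(r), e);
}
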
13249 /// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
13250 ///
13251 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_shuffle_f32&expand=5163)
13252 #[inline]
13253 #[target_feature(enable = "avx512f")]
13254 #[cfg_attr(test, assert_instr(vshuff32x4, imm8 = 0b10111111))]
13255 #[rustc_args_required_const(4)]
13256 pub unsafe fn _mm512_mask_shuffle_f32x4(
13257 src: __m512,
13258 k: __mmask16,
13259 a: __m512,
13260 b: __m512,
13261 imm8: i32,
13262 ) -> __m512 {
13263 assert!(imm8 >= 0 && imm8 <= 255);
13264 let imm8 = (imm8 & 0xFF) as u8;
13265 macro_rules! shuffle4 {
13266 (
13267 $a:expr,
13268 $b:expr,
13269 $c:expr,
13270 $d:expr,
13271 $e:expr,
13272 $f:expr,
13273 $g:expr,
13274 $h:expr,
13275 $i:expr,
13276 $j:expr,
13277 $k:expr,
13278 $l:expr,
13279 $m:expr,
13280 $n:expr,
13281 $o:expr,
13282 $p:expr
13283 ) => {
13284 simd_shuffle16(
13285 a,
13286 b,
13287 [
13288 $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p,
13289 ],
13290 )
13291 };
13292 }
13293 macro_rules! shuffle3 {
13294 ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr, $i:expr, $j:expr, $k:expr, $m:expr, $n:expr, $o:expr) => {
13295 match (imm8 >> 6) & 0x3 {
13296 0 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 16, 17, 18, 19),
13297 1 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 20, 21, 22, 23),
13298 2 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 24, 25, 26, 27),
13299 _ => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 28, 29, 30, 31),
13300 }
13301 };
13302 }
13303 macro_rules! shuffle2 {
13304 ($a:expr, $b:expr, $e:expr, $f:expr, $i:expr, $j:expr, $m:expr, $n:expr) => {
13305 match (imm8 >> 4) & 0x3 {
13306 0 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 16, 17, 18, 19),
13307 1 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 20, 21, 22, 23),
13308 2 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 24, 25, 26, 27),
13309 _ => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 28, 29, 30, 31),
13310 }
13311 };
13312 }
13313 macro_rules! shuffle1 {
13314 ($a:expr, $e:expr, $i: expr, $m: expr) => {
13315 match (imm8 >> 2) & 0x3 {
13316 0 => shuffle2!($a, $e, $i, $m, 0, 1, 2, 3),
13317 1 => shuffle2!($a, $e, $i, $m, 4, 5, 6, 7),
13318 2 => shuffle2!($a, $e, $i, $m, 8, 9, 10, 11),
13319 _ => shuffle2!($a, $e, $i, $m, 12, 13, 14, 15),
13320 }
13321 };
13322 }
13323 let shuffle = match imm8 & 0x3 {
13324 0 => shuffle1!(0, 1, 2, 3),
13325 1 => shuffle1!(4, 5, 6, 7),
13326 2 => shuffle1!(8, 9, 10, 11),
13327 _ => shuffle1!(12, 13, 14, 15),
13328 };
13329
13330 transmute(simd_select_bitmask(k, shuffle, src.as_f32x16()))
13331 }
13332
13333 /// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13334 ///
13335 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_shuffle_f32&expand=5164)
13336 #[inline]
13337 #[target_feature(enable = "avx512f")]
13338 #[cfg_attr(test, assert_instr(vshuff32x4, imm8 = 0b10111111))]
13339 #[rustc_args_required_const(3)]
13340 pub unsafe fn _mm512_maskz_shuffle_f32x4(k: __mmask16, a: __m512, b: __m512, imm8: i32) -> __m512 {
13341 assert!(imm8 >= 0 && imm8 <= 255);
13342 let imm8 = (imm8 & 0xFF) as u8;
13343 macro_rules! shuffle4 {
13344 (
13345 $a:expr,
13346 $b:expr,
13347 $c:expr,
13348 $d:expr,
13349 $e:expr,
13350 $f:expr,
13351 $g:expr,
13352 $h:expr,
13353 $i:expr,
13354 $j:expr,
13355 $k:expr,
13356 $l:expr,
13357 $m:expr,
13358 $n:expr,
13359 $o:expr,
13360 $p:expr
13361 ) => {
13362 simd_shuffle16(
13363 a,
13364 b,
13365 [
13366 $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p,
13367 ],
13368 )
13369 };
13370 }
13371 macro_rules! shuffle3 {
13372 ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr, $i:expr, $j:expr, $k:expr, $m:expr, $n:expr, $o:expr) => {
13373 match (imm8 >> 6) & 0x3 {
13374 0 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 16, 17, 18, 19),
13375 1 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 20, 21, 22, 23),
13376 2 => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 24, 25, 26, 27),
13377 _ => shuffle4!($a, $b, $c, $e, $f, $g, $i, $j, $k, $m, $n, $o, 28, 29, 30, 31),
13378 }
13379 };
13380 }
13381 macro_rules! shuffle2 {
13382 ($a:expr, $b:expr, $e:expr, $f:expr, $i:expr, $j:expr, $m:expr, $n:expr) => {
13383 match (imm8 >> 4) & 0x3 {
13384 0 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 16, 17, 18, 19),
13385 1 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 20, 21, 22, 23),
13386 2 => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 24, 25, 26, 27),
13387 _ => shuffle3!($a, $b, $e, $f, $i, $j, $m, $n, 28, 29, 30, 31),
13388 }
13389 };
13390 }
13391 macro_rules! shuffle1 {
13392 ($a:expr, $e:expr, $i: expr, $m: expr) => {
13393 match (imm8 >> 2) & 0x3 {
13394 0 => shuffle2!($a, $e, $i, $m, 0, 1, 2, 3),
13395 1 => shuffle2!($a, $e, $i, $m, 4, 5, 6, 7),
13396 2 => shuffle2!($a, $e, $i, $m, 8, 9, 10, 11),
13397 _ => shuffle2!($a, $e, $i, $m, 12, 13, 14, 15),
13398 }
13399 };
13400 }
13401 let shuffle = match imm8 & 0x3 {
13402 0 => shuffle1!(0, 1, 2, 3),
13403 1 => shuffle1!(4, 5, 6, 7),
13404 2 => shuffle1!(8, 9, 10, 11),
13405 _ => shuffle1!(12, 13, 14, 15),
13406 };
13407
13408 let zero = _mm512_setzero_ps().as_f32x16();
13409 transmute(simd_select_bitmask(k, shuffle, zero))
13410 }
13411
13412 /// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst.
13413 ///
13414 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_shuffle_f64x2&expand=5171)
13415 #[inline]
13416 #[target_feature(enable = "avx512f")]
13417 #[cfg_attr(test, assert_instr(vshuff64x2, imm8 = 0b10111111))]
13418 #[rustc_args_required_const(2)]
13419 pub unsafe fn _mm512_shuffle_f64x2(a: __m512d, b: __m512d, imm8: i32) -> __m512d {
13420 assert!(imm8 >= 0 && imm8 <= 255);
13421 let imm8 = (imm8 & 0xFF) as u8;
13422 macro_rules! shuffle4 {
13423 (
13424 $a:expr,
13425 $b:expr,
13426 $c:expr,
13427 $d:expr,
13428 $e:expr,
13429 $f:expr,
13430 $g:expr,
13431 $h:expr
13432 ) => {
13433 simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h])
13434 };
13435 }
13436 macro_rules! shuffle3 {
13437 ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr) => {
13438 match (imm8 >> 6) & 0x3 {
13439 0 => shuffle4!($a, $b, $c, $e, $f, $g, 8, 9),
13440 1 => shuffle4!($a, $b, $c, $e, $f, $g, 10, 11),
13441 2 => shuffle4!($a, $b, $c, $e, $f, $g, 12, 13),
13442 _ => shuffle4!($a, $b, $c, $e, $f, $g, 14, 15),
13443 }
13444 };
13445 }
13446 macro_rules! shuffle2 {
13447 ($a:expr, $b:expr, $e:expr, $f:expr) => {
13448 match (imm8 >> 4) & 0x3 {
13449 0 => shuffle3!($a, $b, $e, $f, 8, 9),
13450 1 => shuffle3!($a, $b, $e, $f, 10, 11),
13451 2 => shuffle3!($a, $b, $e, $f, 12, 13),
13452 _ => shuffle3!($a, $b, $e, $f, 14, 15),
13453 }
13454 };
13455 }
13456 macro_rules! shuffle1 {
13457 ($a:expr, $e:expr) => {
13458 match (imm8 >> 2) & 0x3 {
13459 0 => shuffle2!($a, $e, 0, 1),
13460 1 => shuffle2!($a, $e, 2, 3),
13461 2 => shuffle2!($a, $e, 4, 5),
13462 _ => shuffle2!($a, $e, 6, 7),
13463 }
13464 };
13465 }
13466 match imm8 & 0x3 {
13467 0 => shuffle1!(0, 1),
13468 1 => shuffle1!(2, 3),
13469 2 => shuffle1!(4, 5),
13470 _ => shuffle1!(6, 7),
13471 }
13472 }
13473
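// Illustrative usage sketch (hypothetical helper, not upstream test code): the same
// lane-selection rule as `_mm512_shuffle_f32x4`, but over 128-bit lanes of two f64
// elements each.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn usage_sketch_mm512_shuffle_f64x2() {
    let a = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
    let b = _mm512_setr_pd(8., 9., 10., 11., 12., 13., 14., 15.);
    // 0b11_10_01_00: low half from a (lanes 0, 1), high half from b (lanes 2, 3).
    let r = _mm512_shuffle_f64x2(a, b, 0b1110_0100);
    let e: [f64; 8] = [0., 1., 2., 3., 12., 13., 14., 15.];
    assert_eq!(mem::transmute::<__m512d, [f64; 8]>(r), e);
}
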
13474 /// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
13475 ///
13476 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_shuffle_f64x2&expand=5169)
13477 #[inline]
13478 #[target_feature(enable = "avx512f")]
13479 #[cfg_attr(test, assert_instr(vshuff64x2, imm8 = 0b10111111))]
13480 #[rustc_args_required_const(4)]
13481 pub unsafe fn _mm512_mask_shuffle_f64x2(
13482 src: __m512d,
13483 k: __mmask8,
13484 a: __m512d,
13485 b: __m512d,
13486 imm8: i32,
13487 ) -> __m512d {
13488 assert!(imm8 >= 0 && imm8 <= 255);
13489 let imm8 = (imm8 & 0xFF) as u8;
13490 macro_rules! shuffle4 {
13491 (
13492 $a:expr,
13493 $b:expr,
13494 $c:expr,
13495 $d:expr,
13496 $e:expr,
13497 $f:expr,
13498 $g:expr,
13499 $h:expr
13500 ) => {
13501 simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h])
13502 };
13503 }
13504 macro_rules! shuffle3 {
13505 ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr) => {
13506 match (imm8 >> 6) & 0x3 {
13507 0 => shuffle4!($a, $b, $c, $e, $f, $g, 8, 9),
13508 1 => shuffle4!($a, $b, $c, $e, $f, $g, 10, 11),
13509 2 => shuffle4!($a, $b, $c, $e, $f, $g, 12, 13),
13510 _ => shuffle4!($a, $b, $c, $e, $f, $g, 14, 15),
13511 }
13512 };
13513 }
13514 macro_rules! shuffle2 {
13515 ($a:expr, $b:expr, $e:expr, $f:expr) => {
13516 match (imm8 >> 4) & 0x3 {
13517 0 => shuffle3!($a, $b, $e, $f, 8, 9),
13518 1 => shuffle3!($a, $b, $e, $f, 10, 11),
13519 2 => shuffle3!($a, $b, $e, $f, 12, 13),
13520 _ => shuffle3!($a, $b, $e, $f, 14, 15),
13521 }
13522 };
13523 }
13524 macro_rules! shuffle1 {
13525 ($a:expr, $e:expr) => {
13526 match (imm8 >> 2) & 0x3 {
13527 0 => shuffle2!($a, $e, 0, 1),
13528 1 => shuffle2!($a, $e, 2, 3),
13529 2 => shuffle2!($a, $e, 4, 5),
13530 _ => shuffle2!($a, $e, 6, 7),
13531 }
13532 };
13533 }
13534 let shuffle = match imm8 & 0x3 {
13535 0 => shuffle1!(0, 1),
13536 1 => shuffle1!(2, 3),
13537 2 => shuffle1!(4, 5),
13538 _ => shuffle1!(6, 7),
13539 };
13540
13541 transmute(simd_select_bitmask(k, shuffle, src.as_f64x8()))
13542 }
13543
13544 /// Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13545 ///
13546 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_shuffle_f64x2&expand=5170)
13547 #[inline]
13548 #[target_feature(enable = "avx512f")]
13549 #[cfg_attr(test, assert_instr(vshuff64x2, imm8 = 0b10111111))]
13550 #[rustc_args_required_const(3)]
13551 pub unsafe fn _mm512_maskz_shuffle_f64x2(
13552 k: __mmask8,
13553 a: __m512d,
13554 b: __m512d,
13555 imm8: i32,
13556 ) -> __m512d {
13557 assert!(imm8 >= 0 && imm8 <= 255);
13558 let imm8 = (imm8 & 0xFF) as u8;
13559 macro_rules! shuffle4 {
13560 (
13561 $a:expr,
13562 $b:expr,
13563 $c:expr,
13564 $d:expr,
13565 $e:expr,
13566 $f:expr,
13567 $g:expr,
13568 $h:expr
13569 ) => {
13570 simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h])
13571 };
13572 }
13573 macro_rules! shuffle3 {
13574 ($a:expr, $b:expr, $c:expr, $e:expr, $f:expr, $g:expr) => {
13575 match (imm8 >> 6) & 0x3 {
13576 0 => shuffle4!($a, $b, $c, $e, $f, $g, 8, 9),
13577 1 => shuffle4!($a, $b, $c, $e, $f, $g, 10, 11),
13578 2 => shuffle4!($a, $b, $c, $e, $f, $g, 12, 13),
13579 _ => shuffle4!($a, $b, $c, $e, $f, $g, 14, 15),
13580 }
13581 };
13582 }
13583 macro_rules! shuffle2 {
13584 ($a:expr, $b:expr, $e:expr, $f:expr) => {
13585 match (imm8 >> 4) & 0x3 {
13586 0 => shuffle3!($a, $b, $e, $f, 8, 9),
13587 1 => shuffle3!($a, $b, $e, $f, 10, 11),
13588 2 => shuffle3!($a, $b, $e, $f, 12, 13),
13589 _ => shuffle3!($a, $b, $e, $f, 14, 15),
13590 }
13591 };
13592 }
13593 macro_rules! shuffle1 {
13594 ($a:expr, $e:expr) => {
13595 match (imm8 >> 2) & 0x3 {
13596 0 => shuffle2!($a, $e, 0, 1),
13597 1 => shuffle2!($a, $e, 2, 3),
13598 2 => shuffle2!($a, $e, 4, 5),
13599 _ => shuffle2!($a, $e, 6, 7),
13600 }
13601 };
13602 }
13603 let shuffle = match imm8 & 0x3 {
13604 0 => shuffle1!(0, 1),
13605 1 => shuffle1!(2, 3),
13606 2 => shuffle1!(4, 5),
13607 _ => shuffle1!(6, 7),
13608 };
13609
13610 let zero = _mm512_setzero_pd().as_f64x8();
13611 transmute(simd_select_bitmask(k, shuffle, zero))
13612 }
13613
13614 /// Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the result in dst.
13615 ///
13616 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_extractf32x4_ps&expand=2442)
13617 #[inline]
13618 #[target_feature(enable = "avx512f")]
13619 #[cfg_attr(
13620 all(test, not(target_os = "windows")),
13621 assert_instr(vextractf32x4, imm8 = 3)
13622 )]
13623 #[rustc_args_required_const(1)]
13624 pub unsafe fn _mm512_extractf32x4_ps(a: __m512, imm8: i32) -> __m128 {
13625 assert!(imm8 >= 0 && imm8 <= 3);
13626 match imm8 & 0x3 {
13627 0 => simd_shuffle4(a, _mm512_undefined_ps(), [0, 1, 2, 3]),
13628 1 => simd_shuffle4(a, _mm512_undefined_ps(), [4, 5, 6, 7]),
13629 2 => simd_shuffle4(a, _mm512_undefined_ps(), [8, 9, 10, 11]),
13630 _ => simd_shuffle4(a, _mm512_undefined_ps(), [12, 13, 14, 15]),
13631 }
13632 }
13633
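// Illustrative usage sketch (hypothetical helper, not upstream test code): imm8
// selects which of the four 128-bit lanes of `a` becomes the __m128 result.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn usage_sketch_mm512_extractf32x4_ps() {
    let a = _mm512_setr_ps(
        0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
    );
    let r = _mm512_extractf32x4_ps(a, 3);
    let e: [f32; 4] = [12., 13., 14., 15.];
    assert_eq!(mem::transmute::<__m128, [f32; 4]>(r), e);
}
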
13634 /// Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
13635 ///
13636 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_extractf32x4_ps&expand=2443)
13637 #[inline]
13638 #[target_feature(enable = "avx512f")]
13639 #[cfg_attr(
13640 all(test, not(target_os = "windows")),
13641 assert_instr(vextractf32x4, imm8 = 3)
13642 )]
13643 #[rustc_args_required_const(3)]
13644 pub unsafe fn _mm512_mask_extractf32x4_ps(
13645 src: __m128,
13646 k: __mmask8,
13647 a: __m512,
13648 imm8: i32,
13649 ) -> __m128 {
13650 assert!(imm8 >= 0 && imm8 <= 3);
13651 let extract: __m128 = match imm8 & 0x3 {
13652 0 => simd_shuffle4(a, _mm512_undefined_ps(), [0, 1, 2, 3]),
13653 1 => simd_shuffle4(a, _mm512_undefined_ps(), [4, 5, 6, 7]),
13654 2 => simd_shuffle4(a, _mm512_undefined_ps(), [8, 9, 10, 11]),
13655 _ => simd_shuffle4(a, _mm512_undefined_ps(), [12, 13, 14, 15]),
13656 };
13657 transmute(simd_select_bitmask(k, extract.as_f32x4(), src.as_f32x4()))
13658 }
13659
13660 /// Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13661 ///
13662 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_extractf32x4_ps&expand=2444)
13663 #[inline]
13664 #[target_feature(enable = "avx512f")]
13665 #[cfg_attr(
13666 all(test, not(target_os = "windows")),
13667 assert_instr(vextractf32x4, imm8 = 3)
13668 )]
13669 #[rustc_args_required_const(2)]
13670 pub unsafe fn _mm512_maskz_extractf32x4_ps(k: __mmask8, a: __m512, imm8: i32) -> __m128 {
13671 assert!(imm8 >= 0 && imm8 <= 3);
13672 let extract: __m128 = match imm8 & 0x3 {
13673 0 => simd_shuffle4(a, _mm512_undefined_ps(), [0, 1, 2, 3]),
13674 1 => simd_shuffle4(a, _mm512_undefined_ps(), [4, 5, 6, 7]),
13675 2 => simd_shuffle4(a, _mm512_undefined_ps(), [8, 9, 10, 11]),
13676 _ => simd_shuffle4(a, _mm512_undefined_ps(), [12, 13, 14, 15]),
13677 };
13678 let zero = _mm_setzero_ps().as_f32x4();
13679 transmute(simd_select_bitmask(k, extract.as_f32x4(), zero))
13680 }
13681
13682 /// Extract 256 bits (composed of 4 packed 64-bit integers) from a, selected with imm8, and store the result in dst.
13683 ///
13684 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_extracti64x4_epi64&expand=2473)
13685 #[inline]
13686 #[target_feature(enable = "avx512f")]
13687 #[cfg_attr(
13688 all(test, not(target_os = "windows")),
13689     assert_instr(vextractf64x4, imm8 = 1) // should be vextracti64x4, but vextractf64x4 is generated
13690 )]
13691 #[rustc_args_required_const(1)]
13692 pub unsafe fn _mm512_extracti64x4_epi64(a: __m512i, imm8: i32) -> __m256i {
13693 assert!(imm8 >= 0 && imm8 <= 1);
13694 match imm8 & 0x1 {
13695 0 => simd_shuffle4(a, _mm512_set1_epi64(0), [0, 1, 2, 3]),
13696 _ => simd_shuffle4(a, _mm512_set1_epi64(0), [4, 5, 6, 7]),
13697 }
13698 }
13699
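// Illustrative usage sketch (hypothetical helper, not upstream test code; assumes the
// `_mm512_setr_epi64` constructor defined elsewhere in this file): imm8 selects the
// low (0) or high (1) 256-bit half of `a`.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn usage_sketch_mm512_extracti64x4_epi64() {
    let a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
    let r = _mm512_extracti64x4_epi64(a, 1);
    let e: [i64; 4] = [4, 5, 6, 7];
    assert_eq!(mem::transmute::<__m256i, [i64; 4]>(r), e);
}
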
13700 /// Extract 256 bits (composed of 4 packed 64-bit integers) from a, selected with imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
13701 ///
13702 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_extracti64x4_epi64&expand=2474)
13703 #[inline]
13704 #[target_feature(enable = "avx512f")]
13705 #[cfg_attr(
13706 all(test, not(target_os = "windows")),
13707 assert_instr(vextracti64x4, imm8 = 1)
13708 )]
13709 #[rustc_args_required_const(3)]
13710 pub unsafe fn _mm512_mask_extracti64x4_epi64(
13711 src: __m256i,
13712 k: __mmask8,
13713 a: __m512i,
13714 imm8: i32,
13715 ) -> __m256i {
13716 assert!(imm8 >= 0 && imm8 <= 1);
13717 let extract = match imm8 & 0x1 {
13718 0 => simd_shuffle4(a, _mm512_set1_epi64(0), [0, 1, 2, 3]),
13719 _ => simd_shuffle4(a, _mm512_set1_epi64(0), [4, 5, 6, 7]),
13720 };
13721 transmute(simd_select_bitmask(k, extract, src.as_i64x4()))
13722 }
13723
13724 /// Extract 256 bits (composed of 4 packed 64-bit integers) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13725 ///
13726 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_extracti64x4_epi64&expand=2475)
13727 #[inline]
13728 #[target_feature(enable = "avx512f")]
13729 #[cfg_attr(
13730 all(test, not(target_os = "windows")),
13731 assert_instr(vextracti64x4, imm8 = 1)
13732 )]
13733 #[rustc_args_required_const(2)]
13734 pub unsafe fn _mm512_maskz_extracti64x4_epi64(k: __mmask8, a: __m512i, imm8: i32) -> __m256i {
13735 assert!(imm8 >= 0 && imm8 <= 1);
13736 let extract: __m256i = match imm8 & 0x1 {
13737 0 => simd_shuffle4(a, _mm512_set1_epi64(0), [0, 1, 2, 3]),
13738 _ => simd_shuffle4(a, _mm512_set1_epi64(0), [4, 5, 6, 7]),
13739 };
13740 let zero = _mm256_setzero_si256().as_i64x4();
13741 transmute(simd_select_bitmask(k, extract.as_i64x4(), zero))
13742 }
13743
13744 /// Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from a, selected with imm8, and store the result in dst.
13745 ///
13746 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_extractf64x4_pd&expand=2454)
13747 #[inline]
13748 #[target_feature(enable = "avx512f")]
13749 #[cfg_attr(
13750 all(test, not(target_os = "windows")),
13751 assert_instr(vextractf64x4, imm8 = 1)
13752 )]
13753 #[rustc_args_required_const(1)]
13754 pub unsafe fn _mm512_extractf64x4_pd(a: __m512d, imm8: i32) -> __m256d {
13755 assert!(imm8 >= 0 && imm8 <= 1);
13756 match imm8 & 0x1 {
13757 0 => simd_shuffle4(a, _mm512_undefined_pd(), [0, 1, 2, 3]),
13758 _ => simd_shuffle4(a, _mm512_undefined_pd(), [4, 5, 6, 7]),
13759 }
13760 }
13761
13762 /// Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from a, selected with imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
13763 ///
13764 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_extractf64x4_pd&expand=2455)
13765 #[inline]
13766 #[target_feature(enable = "avx512f")]
13767 #[cfg_attr(
13768 all(test, not(target_os = "windows")),
13769 assert_instr(vextractf64x4, imm8 = 1)
13770 )]
13771 #[rustc_args_required_const(3)]
13772 pub unsafe fn _mm512_mask_extractf64x4_pd(
13773 src: __m256d,
13774 k: __mmask8,
13775 a: __m512d,
13776 imm8: i32,
13777 ) -> __m256d {
13778 assert!(imm8 >= 0 && imm8 <= 1);
13779 let extract = match imm8 & 0x1 {
13780 0 => simd_shuffle4(a, _mm512_undefined_pd(), [0, 1, 2, 3]),
13781 _ => simd_shuffle4(a, _mm512_undefined_pd(), [4, 5, 6, 7]),
13782 };
13783 transmute(simd_select_bitmask(k, extract, src))
13784 }
13785
13786 /// Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13787 ///
13788 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_extractf64x4_pd&expand=2456)
13789 #[inline]
13790 #[target_feature(enable = "avx512f")]
13791 #[cfg_attr(
13792 all(test, not(target_os = "windows")),
13793 assert_instr(vextractf64x4, imm8 = 1)
13794 )]
13795 #[rustc_args_required_const(2)]
13796 pub unsafe fn _mm512_maskz_extractf64x4_pd(k: __mmask8, a: __m512d, imm8: i32) -> __m256d {
13797 assert!(imm8 >= 0 && imm8 <= 1);
13798 let extract = match imm8 & 0x1 {
13799 0 => simd_shuffle4(a, _mm512_undefined_pd(), [0, 1, 2, 3]),
13800 _ => simd_shuffle4(a, _mm512_undefined_pd(), [4, 5, 6, 7]),
13801 };
13802 let zero = _mm256_setzero_pd();
13803 transmute(simd_select_bitmask(k, extract, zero))
13804 }
13805
13806 /// Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with imm8, and store the result in dst.
13807 ///
13808 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_extracti32x4_epi32&expand=2461)
13809 #[inline]
13810 #[target_feature(enable = "avx512f")]
13811 #[cfg_attr(
13812 all(test, not(target_os = "windows")),
13813     assert_instr(vextractf32x4, imm8 = 3) // should be vextracti32x4, but vextractf32x4 is generated
13814 )]
13815 #[rustc_args_required_const(1)]
13816 pub unsafe fn _mm512_extracti32x4_epi32(a: __m512i, imm8: i32) -> __m128i {
13817 assert!(imm8 >= 0 && imm8 <= 3);
13818 let a = a.as_i32x16();
13819 let undefined = _mm512_undefined_epi32().as_i32x16();
13820 let extract: i32x4 = match imm8 & 0x3 {
13821 0 => simd_shuffle4(a, undefined, [0, 1, 2, 3]),
13822 1 => simd_shuffle4(a, undefined, [4, 5, 6, 7]),
13823 2 => simd_shuffle4(a, undefined, [8, 9, 10, 11]),
13824 _ => simd_shuffle4(a, undefined, [12, 13, 14, 15]),
13825 };
13826 transmute(extract)
13827 }
13828
13829 /// Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
13830 ///
13831 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_extracti32x4_epi32&expand=2462)
13832 #[inline]
13833 #[target_feature(enable = "avx512f")]
13834 #[cfg_attr(
13835 all(test, not(target_os = "windows")),
13836 assert_instr(vextracti32x4, imm8 = 3)
13837 )]
13838 #[rustc_args_required_const(3)]
13839 pub unsafe fn _mm512_mask_extracti32x4_epi32(
13840 src: __m128i,
13841 k: __mmask8,
13842 a: __m512i,
13843 imm8: i32,
13844 ) -> __m128i {
13845 assert!(imm8 >= 0 && imm8 <= 3);
13846 let a = a.as_i32x16();
13847 let undefined = _mm512_undefined_epi32().as_i32x16();
13848 let extract: i32x4 = match imm8 & 0x3 {
13849 0 => simd_shuffle4(a, undefined, [0, 1, 2, 3]),
13850 1 => simd_shuffle4(a, undefined, [4, 5, 6, 7]),
13851 2 => simd_shuffle4(a, undefined, [8, 9, 10, 11]),
13852 _ => simd_shuffle4(a, undefined, [12, 13, 14, 15]),
13853 };
13854 transmute(simd_select_bitmask(k, extract, src.as_i32x4()))
13855 }
13856
13857 /// Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13858 ///
13859 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_extracti32x4_epi32&expand=2463)
13860 #[inline]
13861 #[target_feature(enable = "avx512f")]
13862 #[cfg_attr(
13863 all(test, not(target_os = "windows")),
13864 assert_instr(vextracti32x4, imm8 = 3)
13865 )]
13866 #[rustc_args_required_const(2)]
13867 pub unsafe fn _mm512_maskz_extracti32x4_epi32(k: __mmask8, a: __m512i, imm8: i32) -> __m128i {
13868 assert!(imm8 >= 0 && imm8 <= 3);
13869 let a = a.as_i32x16();
13870 let undefined = _mm512_undefined_epi32().as_i32x16();
13871 let extract: i32x4 = match imm8 & 0x3 {
13872 0 => simd_shuffle4(a, undefined, [0, 1, 2, 3]),
13873 1 => simd_shuffle4(a, undefined, [4, 5, 6, 7]),
13874 2 => simd_shuffle4(a, undefined, [8, 9, 10, 11]),
13875 _ => simd_shuffle4(a, undefined, [12, 13, 14, 15]),
13876 };
13877 let zero = _mm_setzero_si128().as_i32x4();
13878 transmute(simd_select_bitmask(k, extract, zero))
13879 }
13880
13881 /// Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst.
13882 ///
13883 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_moveldup_ps&expand=3862)
13884 #[inline]
13885 #[target_feature(enable = "avx512f")]
13886 #[cfg_attr(test, assert_instr(vmovsldup))]
13887 pub unsafe fn _mm512_moveldup_ps(a: __m512) -> __m512 {
13888 let r: f32x16 = simd_shuffle16(a, a, [0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14]);
13889 transmute(r)
13890 }
13891
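// Illustrative usage sketch (hypothetical helper, not upstream test code): each
// even-indexed element is duplicated into the odd slot that follows it.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn usage_sketch_mm512_moveldup_ps() {
    let a = _mm512_setr_ps(
        0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
    );
    let r = _mm512_moveldup_ps(a);
    let e: [f32; 16] = [
        0., 0., 2., 2., 4., 4., 6., 6., 8., 8., 10., 10., 12., 12., 14., 14.,
    ];
    assert_eq!(mem::transmute::<__m512, [f32; 16]>(r), e);
}
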
13892 /// Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
13893 ///
13894 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_moveldup_ps&expand=3860)
13895 #[inline]
13896 #[target_feature(enable = "avx512f")]
13897 #[cfg_attr(test, assert_instr(vmovsldup))]
13898 pub unsafe fn _mm512_mask_moveldup_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 {
13899 let mov: f32x16 = simd_shuffle16(a, a, [0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14]);
13900 transmute(simd_select_bitmask(k, mov, src.as_f32x16()))
13901 }
13902
13903 /// Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13904 ///
13905 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_moveldup_ps&expand=3861)
13906 #[inline]
13907 #[target_feature(enable = "avx512f")]
13908 #[cfg_attr(test, assert_instr(vmovsldup))]
13909 pub unsafe fn _mm512_maskz_moveldup_ps(k: __mmask16, a: __m512) -> __m512 {
13910 let mov: f32x16 = simd_shuffle16(a, a, [0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14]);
13911 let zero = _mm512_setzero_ps().as_f32x16();
13912 transmute(simd_select_bitmask(k, mov, zero))
13913 }
13914
13915 /// Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst.
13916 ///
13917 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_movehdup_ps&expand=3852)
13918 #[inline]
13919 #[target_feature(enable = "avx512f")]
13920 #[cfg_attr(test, assert_instr(vmovshdup))]
13921 pub unsafe fn _mm512_movehdup_ps(a: __m512) -> __m512 {
13922 let r: f32x16 = simd_shuffle16(a, a, [1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15]);
13923 transmute(r)
13924 }
13925
13926 /// Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
13927 ///
13928 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_movehdup&expand=3850)
13929 #[inline]
13930 #[target_feature(enable = "avx512f")]
13931 #[cfg_attr(test, assert_instr(vmovshdup))]
13932 pub unsafe fn _mm512_mask_movehdup_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 {
13933 let mov: f32x16 = simd_shuffle16(a, a, [1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15]);
13934 transmute(simd_select_bitmask(k, mov, src.as_f32x16()))
13935 }
13936
13937 /// Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13938 ///
13939 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_moveh&expand=3851)
13940 #[inline]
13941 #[target_feature(enable = "avx512f")]
13942 #[cfg_attr(test, assert_instr(vmovshdup))]
13943 pub unsafe fn _mm512_maskz_movehdup_ps(k: __mmask16, a: __m512) -> __m512 {
13944 let mov: f32x16 = simd_shuffle16(a, a, [1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15]);
13945 let zero = _mm512_setzero_ps().as_f32x16();
13946 transmute(simd_select_bitmask(k, mov, zero))
13947 }
13948
13949 /// Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst.
13950 ///
13951 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_movedup_pd&expand=3843)
13952 #[inline]
13953 #[target_feature(enable = "avx512f")]
13954 #[cfg_attr(test, assert_instr(vmovddup))]
13955 pub unsafe fn _mm512_movedup_pd(a: __m512d) -> __m512d {
13956 let r: f64x8 = simd_shuffle8(a, a, [0, 0, 2, 2, 4, 4, 6, 6]);
13957 transmute(r)
13958 }
13959
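// Illustrative usage sketch (hypothetical helper, not upstream test code): each
// even-indexed double is duplicated into the odd slot that follows it.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn usage_sketch_mm512_movedup_pd() {
    let a = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
    let r = _mm512_movedup_pd(a);
    let e: [f64; 8] = [0., 0., 2., 2., 4., 4., 6., 6.];
    assert_eq!(mem::transmute::<__m512d, [f64; 8]>(r), e);
}
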
13960 /// Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
13961 ///
13962 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_movedup_pd&expand=3841)
13963 #[inline]
13964 #[target_feature(enable = "avx512f")]
13965 #[cfg_attr(test, assert_instr(vmovddup))]
13966 pub unsafe fn _mm512_mask_movedup_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d {
13967 let mov: f64x8 = simd_shuffle8(a, a, [0, 0, 2, 2, 4, 4, 6, 6]);
13968 transmute(simd_select_bitmask(k, mov, src.as_f64x8()))
13969 }
13970
13971 /// Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13972 ///
13973 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_movedup_pd&expand=3842)
13974 #[inline]
13975 #[target_feature(enable = "avx512f")]
13976 #[cfg_attr(test, assert_instr(vmovddup))]
13977 pub unsafe fn _mm512_maskz_movedup_pd(k: __mmask8, a: __m512d) -> __m512d {
13978 let mov: f64x8 = simd_shuffle8(a, a, [0, 0, 2, 2, 4, 4, 6, 6]);
13979 let zero = _mm512_setzero_pd().as_f64x8();
13980 transmute(simd_select_bitmask(k, mov, zero))
13981 }
13982
13983 /// Copy a to dst, then insert 128 bits (composed of 4 packed 32-bit integers) from b into dst at the location specified by imm8.
13984 ///
13985 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_inserti32x4&expand=3174)
13986 #[inline]
13987 #[target_feature(enable = "avx512f")]
13988 #[cfg_attr(test, assert_instr(vinsertf32x4, imm8 = 2))] // should be vinserti32x4, but vinsertf32x4 is generated
13989 #[rustc_args_required_const(2)]
13990 pub unsafe fn _mm512_inserti32x4(a: __m512i, b: __m128i, imm8: i32) -> __m512i {
13991 assert!(imm8 >= 0 && imm8 <= 3);
13992 let a = a.as_i32x16();
13993 let b = _mm512_castsi128_si512(b).as_i32x16();
13994 let ret: i32x16 = match imm8 & 0b11 {
13995 0 => simd_shuffle16(
13996 a,
13997 b,
13998 [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
13999 ),
14000 1 => simd_shuffle16(
14001 a,
14002 b,
14003 [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15],
14004 ),
14005 2 => simd_shuffle16(
14006 a,
14007 b,
14008 [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15],
14009 ),
14010 _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]),
14011 };
14012 transmute(ret)
14013 }
14014
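// Illustrative usage sketch (hypothetical helper, not upstream test code): the 128-bit
// lane of `a` selected by imm8 is replaced with `b`; the other lanes are unchanged.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn usage_sketch_mm512_inserti32x4() {
    let a = _mm512_set1_epi32(1);
    let b = _mm_set1_epi32(9);
    let r = _mm512_inserti32x4(a, b, 2);
    let e: [i32; 16] = [1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 1, 1, 1, 1];
    assert_eq!(mem::transmute::<__m512i, [i32; 16]>(r), e);
}
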
14015 /// Copy a to tmp, then insert 128 bits (composed of 4 packed 32-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14016 ///
14017 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_inserti32x4&expand=3175)
14018 #[inline]
14019 #[target_feature(enable = "avx512f")]
14020 #[cfg_attr(test, assert_instr(vinserti32x4, imm8 = 2))]
14021 #[rustc_args_required_const(4)]
14022 pub unsafe fn _mm512_mask_inserti32x4(
14023 src: __m512i,
14024 k: __mmask16,
14025 a: __m512i,
14026 b: __m128i,
14027 imm8: i32,
14028 ) -> __m512i {
14029 assert!(imm8 >= 0 && imm8 <= 3);
14030 let a = a.as_i32x16();
14031 let b = _mm512_castsi128_si512(b).as_i32x16();
14032 let insert: i32x16 = match imm8 & 0b11 {
14033 0 => simd_shuffle16(
14034 a,
14035 b,
14036 [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
14037 ),
14038 1 => simd_shuffle16(
14039 a,
14040 b,
14041 [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15],
14042 ),
14043 2 => simd_shuffle16(
14044 a,
14045 b,
14046 [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15],
14047 ),
14048 _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]),
14049 };
14050 transmute(simd_select_bitmask(k, insert, src.as_i32x16()))
14051 }
14052
14053 /// Copy a to tmp, then insert 128 bits (composed of 4 packed 32-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14054 ///
14055 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_inserti32x4&expand=3176)
14056 #[inline]
14057 #[target_feature(enable = "avx512f")]
14058 #[cfg_attr(test, assert_instr(vinserti32x4, imm8 = 2))]
14059 #[rustc_args_required_const(3)]
14060 pub unsafe fn _mm512_maskz_inserti32x4(k: __mmask16, a: __m512i, b: __m128i, imm8: i32) -> __m512i {
14061 assert!(imm8 >= 0 && imm8 <= 3);
14062 let a = a.as_i32x16();
14063 let b = _mm512_castsi128_si512(b).as_i32x16();
14064 let insert = match imm8 & 0b11 {
14065 0 => simd_shuffle16(
14066 a,
14067 b,
14068 [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
14069 ),
14070 1 => simd_shuffle16(
14071 a,
14072 b,
14073 [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15],
14074 ),
14075 2 => simd_shuffle16(
14076 a,
14077 b,
14078 [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15],
14079 ),
14080 _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]),
14081 };
14082 let zero = _mm512_setzero_si512().as_i32x16();
14083 transmute(simd_select_bitmask(k, insert, zero))
14084 }
14085
14086 /// Copy a to dst, then insert 256 bits (composed of 4 packed 64-bit integers) from b into dst at the location specified by imm8.
14087 ///
14088 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_inserti64x4&expand=3186)
14089 #[inline]
14090 #[target_feature(enable = "avx512f")]
14091 #[cfg_attr(test, assert_instr(vinsertf64x4, imm8 = 1))] //should be vinserti64x4
14092 #[rustc_args_required_const(2)]
14093 pub unsafe fn _mm512_inserti64x4(a: __m512i, b: __m256i, imm8: i32) -> __m512i {
14094 assert!(imm8 >= 0 && imm8 <= 1);
14095 let b = _mm512_castsi256_si512(b);
14096 match imm8 & 0b1 {
14097 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
14098 _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]),
14099 }
14100 }
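
// Illustrative sketch (not upstream code): imm8 = 0 replaces the low 256 bits
// of `a` with `b`; imm8 = 1 would replace the high 256 bits. Hypothetical
// helper; assumes AVX-512F support.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_inserti64x4() {
    let a: __m512i = transmute([1i64; 8]);
    let b = _mm256_set1_epi64x(7);
    let r: [i64; 8] = transmute(_mm512_inserti64x4(a, b, 0));
    assert_eq!(r, [7, 7, 7, 7, 1, 1, 1, 1]);
}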
14101
14102 /// Copy a to tmp, then insert 256 bits (composed of 4 packed 64-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14103 ///
14104 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_inserti64x4&expand=3187)
14105 #[inline]
14106 #[target_feature(enable = "avx512f")]
14107 #[cfg_attr(test, assert_instr(vinserti64x4, imm8 = 1))]
14108 #[rustc_args_required_const(4)]
14109 pub unsafe fn _mm512_mask_inserti64x4(
14110 src: __m512i,
14111 k: __mmask8,
14112 a: __m512i,
14113 b: __m256i,
14114 imm8: i32,
14115 ) -> __m512i {
14116 assert!(imm8 >= 0 && imm8 <= 1);
14117 let b = _mm512_castsi256_si512(b);
14118 let insert = match imm8 & 0b1 {
14119 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
14120 _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]),
14121 };
14122 transmute(simd_select_bitmask(k, insert, src.as_i64x8()))
14123 }
14124
14125 /// Copy a to tmp, then insert 256 bits (composed of 4 packed 64-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14126 ///
14127 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_inserti64x4&expand=3188)
14128 #[inline]
14129 #[target_feature(enable = "avx512f")]
14130 #[cfg_attr(test, assert_instr(vinserti64x4, imm8 = 1))]
14131 #[rustc_args_required_const(3)]
14132 pub unsafe fn _mm512_maskz_inserti64x4(k: __mmask8, a: __m512i, b: __m256i, imm8: i32) -> __m512i {
14133 assert!(imm8 >= 0 && imm8 <= 1);
14134 let b = _mm512_castsi256_si512(b);
14135 let insert = match imm8 & 0b1 {
14136 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
14137 _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]),
14138 };
14139 let zero = _mm512_setzero_si512().as_i64x8();
14140 transmute(simd_select_bitmask(k, insert, zero))
14141 }
14142
14143 /// Copy a to dst, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into dst at the location specified by imm8.
14144 ///
14145 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_insertf32x4&expand=3155)
14146 #[inline]
14147 #[target_feature(enable = "avx512f")]
14148 #[cfg_attr(test, assert_instr(vinsertf32x4, imm8 = 2))]
14149 #[rustc_args_required_const(2)]
14150 pub unsafe fn _mm512_insertf32x4(a: __m512, b: __m128, imm8: i32) -> __m512 {
14151 assert!(imm8 >= 0 && imm8 <= 3);
14152 let b = _mm512_castps128_ps512(b);
14153 match imm8 & 0b11 {
14154 0 => simd_shuffle16(
14155 a,
14156 b,
14157 [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
14158 ),
14159 1 => simd_shuffle16(
14160 a,
14161 b,
14162 [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15],
14163 ),
14164 2 => simd_shuffle16(
14165 a,
14166 b,
14167 [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15],
14168 ),
14169 _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]),
14170 }
14171 }
14172
14173 /// Copy a to tmp, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14174 ///
14175 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_insertf32x4&expand=3156)
14176 #[inline]
14177 #[target_feature(enable = "avx512f")]
14178 #[cfg_attr(test, assert_instr(vinsertf32x4, imm8 = 2))]
14179 #[rustc_args_required_const(4)]
14180 pub unsafe fn _mm512_mask_insertf32x4(
14181 src: __m512,
14182 k: __mmask16,
14183 a: __m512,
14184 b: __m128,
14185 imm8: i32,
14186 ) -> __m512 {
14187 assert!(imm8 >= 0 && imm8 <= 3);
14188 let b = _mm512_castps128_ps512(b);
14189 let insert = match imm8 & 0b11 {
14190 0 => simd_shuffle16(
14191 a,
14192 b,
14193 [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
14194 ),
14195 1 => simd_shuffle16(
14196 a,
14197 b,
14198 [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15],
14199 ),
14200 2 => simd_shuffle16(
14201 a,
14202 b,
14203 [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15],
14204 ),
14205 _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]),
14206 };
14207 transmute(simd_select_bitmask(k, insert, src.as_f32x16()))
14208 }
14209
14210 /// Copy a to tmp, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14211 ///
14212 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_insertf32x4&expand=3157)
14213 #[inline]
14214 #[target_feature(enable = "avx512f")]
14215 #[cfg_attr(test, assert_instr(vinsertf32x4, imm8 = 2))]
14216 #[rustc_args_required_const(3)]
14217 pub unsafe fn _mm512_maskz_insertf32x4(k: __mmask16, a: __m512, b: __m128, imm8: i32) -> __m512 {
14218 assert!(imm8 >= 0 && imm8 <= 3);
14219 let b = _mm512_castps128_ps512(b);
14220 let insert = match imm8 & 0b11 {
14221 0 => simd_shuffle16(
14222 a,
14223 b,
14224 [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
14225 ),
14226 1 => simd_shuffle16(
14227 a,
14228 b,
14229 [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15],
14230 ),
14231 2 => simd_shuffle16(
14232 a,
14233 b,
14234 [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15],
14235 ),
14236 _ => simd_shuffle16(a, b, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]),
14237 };
14238 let zero = _mm512_setzero_ps().as_f32x16();
14239 transmute(simd_select_bitmask(k, insert, zero))
14240 }
14241
14242 /// Copy a to dst, then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from b into dst at the location specified by imm8.
14243 ///
14244 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_insertf64x4&expand=3167)
14245 #[inline]
14246 #[target_feature(enable = "avx512f")]
14247 #[cfg_attr(test, assert_instr(vinsertf64x4, imm8 = 1))]
14248 #[rustc_args_required_const(2)]
14249 pub unsafe fn _mm512_insertf64x4(a: __m512d, b: __m256d, imm8: i32) -> __m512d {
14250 assert!(imm8 >= 0 && imm8 <= 1);
14251 let b = _mm512_castpd256_pd512(b);
14252 match imm8 & 0b1 {
14253 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
14254 _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]),
14255 }
14256 }
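
// Illustrative sketch (not upstream code): with imm8 = 1 the upper four
// doubles of `a` are overwritten by `b`. Hypothetical helper; assumes
// AVX-512F support.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_insertf64x4() {
    let a = _mm512_set1_pd(1.);
    let b = _mm256_set1_pd(7.);
    let r: [f64; 8] = transmute(_mm512_insertf64x4(a, b, 1));
    assert_eq!(r, [1., 1., 1., 1., 7., 7., 7., 7.]);
}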
14257
14258 /// Copy a to tmp, then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14259 ///
14260 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_insertf64x4&expand=3168)
14261 #[inline]
14262 #[target_feature(enable = "avx512f")]
14263 #[cfg_attr(test, assert_instr(vinsertf64x4, imm8 = 1))]
14264 #[rustc_args_required_const(4)]
14265 pub unsafe fn _mm512_mask_insertf64x4(
14266 src: __m512d,
14267 k: __mmask8,
14268 a: __m512d,
14269 b: __m256d,
14270 imm8: i32,
14271 ) -> __m512d {
14272 assert!(imm8 >= 0 && imm8 <= 1);
14273 let b = _mm512_castpd256_pd512(b);
14274 let insert = match imm8 & 0b1 {
14275 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
14276 _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]),
14277 };
14278 transmute(simd_select_bitmask(k, insert, src.as_f64x8()))
14279 }
14280
14281 /// Copy a to tmp, then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14282 ///
14283 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_insertf64x4&expand=3169)
14284 #[inline]
14285 #[target_feature(enable = "avx512f")]
14286 #[cfg_attr(test, assert_instr(vinsertf64x4, imm8 = 1))]
14287 #[rustc_args_required_const(3)]
14288 pub unsafe fn _mm512_maskz_insertf64x4(k: __mmask8, a: __m512d, b: __m256d, imm8: i32) -> __m512d {
14289 assert!(imm8 >= 0 && imm8 <= 1);
14290 let b = _mm512_castpd256_pd512(b);
14291 let insert = match imm8 & 0b1 {
14292 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 4, 5, 6, 7]),
14293 _ => simd_shuffle8(a, b, [0, 1, 2, 3, 8, 9, 10, 11]),
14294 };
14295 let zero = _mm512_setzero_pd().as_f64x8();
14296 transmute(simd_select_bitmask(k, insert, zero))
14297 }
14298
14299 /// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst.
14300 ///
14301 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_unpackhi_epi32&expand=6021)
14302 #[inline]
14303 #[target_feature(enable = "avx512f")]
14304 #[cfg_attr(test, assert_instr(vunpckhps))] //should be vpunpckhdq
14305 pub unsafe fn _mm512_unpackhi_epi32(a: __m512i, b: __m512i) -> __m512i {
14306 let a = a.as_i32x16();
14307 let b = b.as_i32x16();
14308 let r: i32x16 = simd_shuffle16(
14309 a,
14310 b,
14311 [
14312 2,
14313 18,
14314 3,
14315 19,
14316 2 + 4,
14317 18 + 4,
14318 3 + 4,
14319 19 + 4,
14320 2 + 8,
14321 18 + 8,
14322 3 + 8,
14323 19 + 8,
14324 2 + 12,
14325 18 + 12,
14326 3 + 12,
14327 19 + 12,
14328 ],
14329 );
14330 transmute(r)
14331 }
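
// Illustrative sketch (not upstream code): within each 128-bit lane the two
// high 32-bit elements of `a` and `b` are interleaved. Hypothetical helper;
// assumes AVX-512F support.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_unpackhi_epi32() {
    let a: __m512i = transmute([0i32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
    let b = _mm512_set1_epi32(-1);
    let r: [i32; 16] = transmute(_mm512_unpackhi_epi32(a, b));
    // lane 0 yields a[2], b[2], a[3], b[3]; the other lanes follow suit
    assert_eq!(r, [2, -1, 3, -1, 6, -1, 7, -1, 10, -1, 11, -1, 14, -1, 15, -1]);
}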
14332
14333 /// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14334 ///
14335 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_unpackhi_epi32&expand=6019)
14336 #[inline]
14337 #[target_feature(enable = "avx512f")]
14338 #[cfg_attr(test, assert_instr(vpunpckhdq))]
14339 pub unsafe fn _mm512_mask_unpackhi_epi32(
14340 src: __m512i,
14341 k: __mmask16,
14342 a: __m512i,
14343 b: __m512i,
14344 ) -> __m512i {
14345 let unpackhi = _mm512_unpackhi_epi32(a, b).as_i32x16();
14346 transmute(simd_select_bitmask(k, unpackhi, src.as_i32x16()))
14347 }
14348
14349 /// Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14350 ///
14351 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_unpackhi_epi32&expand=6020)
14352 #[inline]
14353 #[target_feature(enable = "avx512f")]
14354 #[cfg_attr(test, assert_instr(vpunpckhdq))]
14355 pub unsafe fn _mm512_maskz_unpackhi_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
14356 let unpackhi = _mm512_unpackhi_epi32(a, b).as_i32x16();
14357 let zero = _mm512_setzero_si512().as_i32x16();
14358 transmute(simd_select_bitmask(k, unpackhi, zero))
14359 }
14360
14361 /// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and
14362 /// store the results in dst.
14363 ///
14364 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_unpackhi_epi64&expand=6030)
14365 #[inline]
14366 #[target_feature(enable = "avx512f")]
14367 #[cfg_attr(test, assert_instr(vunpckhpd))] //should be vpunpckhqdq
14368 pub unsafe fn _mm512_unpackhi_epi64(a: __m512i, b: __m512i) -> __m512i {
14369 simd_shuffle8(a, b, [1, 9, 1 + 2, 9 + 2, 1 + 4, 9 + 4, 1 + 6, 9 + 6])
14370 }
14371
14372 /// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14373 ///
14374 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_unpackhi_epi64&expand=6028)
14375 #[inline]
14376 #[target_feature(enable = "avx512f")]
14377 #[cfg_attr(test, assert_instr(vpunpckhqdq))]
14378 pub unsafe fn _mm512_mask_unpackhi_epi64(
14379 src: __m512i,
14380 k: __mmask8,
14381 a: __m512i,
14382 b: __m512i,
14383 ) -> __m512i {
14384 let unpackhi = _mm512_unpackhi_epi64(a, b).as_i64x8();
14385 transmute(simd_select_bitmask(k, unpackhi, src.as_i64x8()))
14386 }
14387
14388 /// Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14389 ///
14390 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_unpackhi_epi64&expand=6029)
14391 #[inline]
14392 #[target_feature(enable = "avx512f")]
14393 #[cfg_attr(test, assert_instr(vpunpckhqdq))]
14394 pub unsafe fn _mm512_maskz_unpackhi_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
14395 let unpackhi = _mm512_unpackhi_epi64(a, b).as_i64x8();
14396 let zero = _mm512_setzero_si512().as_i64x8();
14397 transmute(simd_select_bitmask(k, unpackhi, zero))
14398 }
14399
14400 /// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst.
14401 ///
14402 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_unpackhi_ps&expand=6060)
14403 #[inline]
14404 #[target_feature(enable = "avx512f")]
14405 #[cfg_attr(test, assert_instr(vunpckhps))]
14406 pub unsafe fn _mm512_unpackhi_ps(a: __m512, b: __m512) -> __m512 {
14407 simd_shuffle16(
14408 a,
14409 b,
14410 [
14411 2,
14412 18,
14413 3,
14414 19,
14415 2 + 4,
14416 18 + 4,
14417 3 + 4,
14418 19 + 4,
14419 2 + 8,
14420 18 + 8,
14421 3 + 8,
14422 19 + 8,
14423 2 + 12,
14424 18 + 12,
14425 3 + 12,
14426 19 + 12,
14427 ],
14428 )
14429 }
14430
14431 /// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14432 ///
14433 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_unpackhi_ps&expand=6058)
14434 #[inline]
14435 #[target_feature(enable = "avx512f")]
14436 #[cfg_attr(test, assert_instr(vunpckhps))]
14437 pub unsafe fn _mm512_mask_unpackhi_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 {
14438 let unpackhi = _mm512_unpackhi_ps(a, b).as_f32x16();
14439 transmute(simd_select_bitmask(k, unpackhi, src.as_f32x16()))
14440 }
14441
14442 /// Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14443 ///
14444 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_unpackhi_ps&expand=6059)
14445 #[inline]
14446 #[target_feature(enable = "avx512f")]
14447 #[cfg_attr(test, assert_instr(vunpckhps))]
14448 pub unsafe fn _mm512_maskz_unpackhi_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 {
14449 let unpackhi = _mm512_unpackhi_ps(a, b).as_f32x16();
14450 let zero = _mm512_setzero_ps().as_f32x16();
14451 transmute(simd_select_bitmask(k, unpackhi, zero))
14452 }
14453
14454 /// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst.
14455 ///
14456 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_unpackhi_pd&expand=6048)
14457 #[inline]
14458 #[target_feature(enable = "avx512f")]
14459 #[cfg_attr(test, assert_instr(vunpckhpd))]
14460 pub unsafe fn _mm512_unpackhi_pd(a: __m512d, b: __m512d) -> __m512d {
14461 simd_shuffle8(a, b, [1, 9, 1 + 2, 9 + 2, 1 + 4, 9 + 4, 1 + 6, 9 + 6])
14462 }
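
// Illustrative sketch (not upstream code): per 128-bit lane, the high double
// of `a` is paired with the high double of `b`. Hypothetical helper; assumes
// AVX-512F support.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_unpackhi_pd() {
    let a: __m512d = transmute([0.0f64, 1., 2., 3., 4., 5., 6., 7.]);
    let b: __m512d = transmute([10.0f64, 11., 12., 13., 14., 15., 16., 17.]);
    let r: [f64; 8] = transmute(_mm512_unpackhi_pd(a, b));
    assert_eq!(r, [1., 11., 3., 13., 5., 15., 7., 17.]);
}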
14463
14464 /// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14465 ///
14466 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_unpackhi_pd&expand=6046)
14467 #[inline]
14468 #[target_feature(enable = "avx512f")]
14469 #[cfg_attr(test, assert_instr(vunpckhpd))]
14470 pub unsafe fn _mm512_mask_unpackhi_pd(
14471 src: __m512d,
14472 k: __mmask8,
14473 a: __m512d,
14474 b: __m512d,
14475 ) -> __m512d {
14476 let unpackhi = _mm512_unpackhi_pd(a, b).as_f64x8();
14477 transmute(simd_select_bitmask(k, unpackhi, src.as_f64x8()))
14478 }
14479
14480 /// Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14481 ///
14482 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_unpackhi_pd&expand=6047)
14483 #[inline]
14484 #[target_feature(enable = "avx512f")]
14485 #[cfg_attr(test, assert_instr(vunpckhpd))]
14486 pub unsafe fn _mm512_maskz_unpackhi_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
14487 let unpackhi = _mm512_unpackhi_pd(a, b).as_f64x8();
14488 let zero = _mm512_setzero_pd().as_f64x8();
14489 transmute(simd_select_bitmask(k, unpackhi, zero))
14490 }
14491
14492 /// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst.
14493 ///
14494 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_unpacklo_epi32&expand=6078)
14495 #[inline]
14496 #[target_feature(enable = "avx512f")]
14497 #[cfg_attr(test, assert_instr(vunpcklps))] //should be vpunpckldq
14498 pub unsafe fn _mm512_unpacklo_epi32(a: __m512i, b: __m512i) -> __m512i {
14499 let a = a.as_i32x16();
14500 let b = b.as_i32x16();
14501 let r: i32x16 = simd_shuffle16(
14502 a,
14503 b,
14504 [
14505 0,
14506 16,
14507 1,
14508 17,
14509 0 + 4,
14510 16 + 4,
14511 1 + 4,
14512 17 + 4,
14513 0 + 8,
14514 16 + 8,
14515 1 + 8,
14516 17 + 8,
14517 0 + 12,
14518 16 + 12,
14519 1 + 12,
14520 17 + 12,
14521 ],
14522 );
14523 transmute(r)
14524 }
14525
14526 /// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14527 ///
14528 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_unpacklo_epi32&expand=6076)
14529 #[inline]
14530 #[target_feature(enable = "avx512f")]
14531 #[cfg_attr(test, assert_instr(vpunpckldq))]
14532 pub unsafe fn _mm512_mask_unpacklo_epi32(
14533 src: __m512i,
14534 k: __mmask16,
14535 a: __m512i,
14536 b: __m512i,
14537 ) -> __m512i {
14538 let unpacklo = _mm512_unpacklo_epi32(a, b).as_i32x16();
14539 transmute(simd_select_bitmask(k, unpacklo, src.as_i32x16()))
14540 }
14541
14542 /// Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14543 ///
14544 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_unpacklo_epi32&expand=6077)
14545 #[inline]
14546 #[target_feature(enable = "avx512f")]
14547 #[cfg_attr(test, assert_instr(vpunpckldq))]
14548 pub unsafe fn _mm512_maskz_unpacklo_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
14549 let unpacklo = _mm512_unpacklo_epi32(a, b).as_i32x16();
14550 let zero = _mm512_setzero_si512().as_i32x16();
14551 transmute(simd_select_bitmask(k, unpacklo, zero))
14552 }
14553
14554 /// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst.
14555 ///
14556 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_unpacklo_epi64&expand=6087)
14557 #[inline]
14558 #[target_feature(enable = "avx512f")]
14559 #[cfg_attr(test, assert_instr(vunpcklpd))] //should be vpunpcklqdq
14560 pub unsafe fn _mm512_unpacklo_epi64(a: __m512i, b: __m512i) -> __m512i {
14561 simd_shuffle8(a, b, [0, 8, 0 + 2, 8 + 2, 0 + 4, 8 + 4, 0 + 6, 8 + 6])
14562 }
14563
14564 /// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14565 ///
14566 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_unpacklo_epi64&expand=6085)
14567 #[inline]
14568 #[target_feature(enable = "avx512f")]
14569 #[cfg_attr(test, assert_instr(vpunpcklqdq))]
14570 pub unsafe fn _mm512_mask_unpacklo_epi64(
14571 src: __m512i,
14572 k: __mmask8,
14573 a: __m512i,
14574 b: __m512i,
14575 ) -> __m512i {
14576 let unpacklo = _mm512_unpacklo_epi64(a, b).as_i64x8();
14577 transmute(simd_select_bitmask(k, unpacklo, src.as_i64x8()))
14578 }
14579
14580 /// Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14581 ///
14582 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_unpacklo_epi64&expand=6086)
14583 #[inline]
14584 #[target_feature(enable = "avx512f")]
14585 #[cfg_attr(test, assert_instr(vpunpcklqdq))]
14586 pub unsafe fn _mm512_maskz_unpacklo_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
14587 let unpacklo = _mm512_unpacklo_epi64(a, b).as_i64x8();
14588 let zero = _mm512_setzero_si512().as_i64x8();
14589 transmute(simd_select_bitmask(k, unpacklo, zero))
14590 }
14591
14592 /// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst.
14593 ///
14594 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_unpacklo_ps&expand=6117)
14595 #[inline]
14596 #[target_feature(enable = "avx512f")]
14597 #[cfg_attr(test, assert_instr(vunpcklps))]
14598 pub unsafe fn _mm512_unpacklo_ps(a: __m512, b: __m512) -> __m512 {
14599 simd_shuffle16(
14600 a,
14601 b,
14602 [
14603 0,
14604 16,
14605 1,
14606 17,
14607 0 + 4,
14608 16 + 4,
14609 1 + 4,
14610 17 + 4,
14611 0 + 8,
14612 16 + 8,
14613 1 + 8,
14614 17 + 8,
14615 0 + 12,
14616 16 + 12,
14617 1 + 12,
14618 17 + 12,
14619 ],
14620 )
14621 }
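
// Illustrative sketch (not upstream code): per 128-bit lane, the two low
// floats of `a` are interleaved with the two low floats of `b`. Hypothetical
// helper; assumes AVX-512F support.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_unpacklo_ps() {
    let a: __m512 = transmute([0.0f32, 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.]);
    let b = _mm512_set1_ps(-1.);
    let r: [f32; 16] = transmute(_mm512_unpacklo_ps(a, b));
    assert_eq!(r, [0., -1., 1., -1., 4., -1., 5., -1., 8., -1., 9., -1., 12., -1., 13., -1.]);
}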
14622
14623 /// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14624 ///
14625 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_unpacklo_ps&expand=6115)
14626 #[inline]
14627 #[target_feature(enable = "avx512f")]
14628 #[cfg_attr(test, assert_instr(vunpcklps))]
14629 pub unsafe fn _mm512_mask_unpacklo_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 {
14630 let unpacklo = _mm512_unpacklo_ps(a, b).as_f32x16();
14631 transmute(simd_select_bitmask(k, unpacklo, src.as_f32x16()))
14632 }
14633
14634 /// Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14635 ///
14636 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_unpacklo_ps&expand=6116)
14637 #[inline]
14638 #[target_feature(enable = "avx512f")]
14639 #[cfg_attr(test, assert_instr(vunpcklps))]
14640 pub unsafe fn _mm512_maskz_unpacklo_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 {
14641 let unpacklo = _mm512_unpacklo_ps(a, b).as_f32x16();
14642 let zero = _mm512_setzero_ps().as_f32x16();
14643 transmute(simd_select_bitmask(k, unpacklo, zero))
14644 }
14645
14646 /// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst.
14647 ///
14648 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_unpacklo_pd&expand=6105)
14649 #[inline]
14650 #[target_feature(enable = "avx512f")]
14651 #[cfg_attr(test, assert_instr(vunpcklpd))]
14652 pub unsafe fn _mm512_unpacklo_pd(a: __m512d, b: __m512d) -> __m512d {
14653 simd_shuffle8(a, b, [0, 8, 0 + 2, 8 + 2, 0 + 4, 8 + 4, 0 + 6, 8 + 6])
14654 }
14655
14656 /// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14657 ///
14658 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_unpacklo_pd&expand=6103)
14659 #[inline]
14660 #[target_feature(enable = "avx512f")]
14661 #[cfg_attr(test, assert_instr(vunpcklpd))]
14662 pub unsafe fn _mm512_mask_unpacklo_pd(
14663 src: __m512d,
14664 k: __mmask8,
14665 a: __m512d,
14666 b: __m512d,
14667 ) -> __m512d {
14668 let unpacklo = _mm512_unpacklo_pd(a, b).as_f64x8();
14669 transmute(simd_select_bitmask(k, unpacklo, src.as_f64x8()))
14670 }
14671
14672 /// Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14673 ///
14674 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_unpacklo_pd&expand=6104)
14675 #[inline]
14676 #[target_feature(enable = "avx512f")]
14677 #[cfg_attr(test, assert_instr(vunpcklpd))]
14678 pub unsafe fn _mm512_maskz_unpacklo_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
14679 let unpacklo = _mm512_unpacklo_pd(a, b).as_f64x8();
14680 let zero = _mm512_setzero_pd().as_f64x8();
14681 transmute(simd_select_bitmask(k, unpacklo, zero))
14682 }
14683
14684 /// Cast vector of type __m128 to type __m512; the upper 384 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
14685 ///
14686 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castps128_ps512&expand=621)
14687 #[inline]
14688 #[target_feature(enable = "avx512f")]
14689 pub unsafe fn _mm512_castps128_ps512(a: __m128) -> __m512 {
14690 simd_shuffle16(
14691 a,
14692 _mm_set1_ps(-1.),
14693 [0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4],
14694 )
14695 }
14696
14697 /// Cast vector of type __m256 to type __m512; the upper 256 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
14698 ///
14699 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castps256_ps512&expand=623)
14700 #[inline]
14701 #[target_feature(enable = "avx512f")]
14702 pub unsafe fn _mm512_castps256_ps512(a: __m256) -> __m512 {
14703 simd_shuffle16(
14704 a,
14705 _mm256_set1_ps(-1.),
14706 [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8],
14707 )
14708 }
14709
14710 /// Cast vector of type __m128 to type __m512; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
14711 ///
14712 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_zextps128_ps512&expand=6196)
14713 #[inline]
14714 #[target_feature(enable = "avx512f")]
14715 pub unsafe fn _mm512_zextps128_ps512(a: __m128) -> __m512 {
14716 simd_shuffle16(
14717 a,
14718 _mm_set1_ps(0.),
14719 [0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4],
14720 )
14721 }
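
// Illustrative sketch (not upstream code): unlike `_mm512_castps128_ps512`,
// whose upper 384 bits are unspecified, the `zext` variant guarantees zeros
// in the upper lanes. Hypothetical helper; assumes AVX-512F support.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_zextps128_ps512() {
    let a = _mm_set1_ps(3.);
    let r: [f32; 16] = transmute(_mm512_zextps128_ps512(a));
    assert_eq!(r, [3., 3., 3., 3., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]);
}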
14722
14723 /// Cast vector of type __m256 to type __m512; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
14724 ///
14725 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_zextps256_ps512&expand=6197)
14726 #[inline]
14727 #[target_feature(enable = "avx512f")]
14728 pub unsafe fn _mm512_zextps256_ps512(a: __m256) -> __m512 {
14729 simd_shuffle16(
14730 a,
14731 _mm256_set1_ps(0.),
14732 [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8],
14733 )
14734 }
14735
14736 /// Cast vector of type __m512 to type __m128. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
14737 ///
14738 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castps512_ps128&expand=624)
14739 #[inline]
14740 #[target_feature(enable = "avx512f")]
14741 pub unsafe fn _mm512_castps512_ps128(a: __m512) -> __m128 {
14742 simd_shuffle4(a, a, [0, 1, 2, 3])
14743 }
14744
14745 /// Cast vector of type __m512 to type __m256. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
14746 ///
14747 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castps512_ps256&expand=625)
14748 #[inline]
14749 #[target_feature(enable = "avx512f")]
14750 pub unsafe fn _mm512_castps512_ps256(a: __m512) -> __m256 {
14751 simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7])
14752 }
14753
14754 /// Cast vector of type __m512 to type __m512d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
14755 ///
14756 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castps_pd&expand=616)
14757 #[inline]
14758 #[target_feature(enable = "avx512f")]
14759 pub unsafe fn _mm512_castps_pd(a: __m512) -> __m512d {
14760 transmute(a.as_m512())
14761 }
14762
14763 /// Cast vector of type __m512 to type __m512i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
14764 ///
14765 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castps_si512&expand=619)
14766 #[inline]
14767 #[target_feature(enable = "avx512f")]
14768 pub unsafe fn _mm512_castps_si512(a: __m512) -> __m512i {
14769 transmute(a.as_m512())
14770 }
14771
14772 /// Cast vector of type __m128d to type __m512d; the upper 384 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
14773 ///
14774 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castpd128_pd512&expand=609)
14775 #[inline]
14776 #[target_feature(enable = "avx512f")]
14777 pub unsafe fn _mm512_castpd128_pd512(a: __m128d) -> __m512d {
14778 simd_shuffle8(a, _mm_set1_pd(-1.), [0, 1, 2, 2, 2, 2, 2, 2])
14779 }
14780
14781 /// Cast vector of type __m256d to type __m512d; the upper 256 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
14782 ///
14783 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castpd256_pd512&expand=611)
14784 #[inline]
14785 #[target_feature(enable = "avx512f")]
14786 pub unsafe fn _mm512_castpd256_pd512(a: __m256d) -> __m512d {
14787 simd_shuffle8(a, _mm256_set1_pd(-1.), [0, 1, 2, 3, 4, 4, 4, 4])
14788 }
14789
14790 /// Cast vector of type __m128d to type __m512d; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
14791 ///
14792 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_zextpd128_pd512&expand=6193)
14793 #[inline]
14794 #[target_feature(enable = "avx512f")]
14795 pub unsafe fn _mm512_zextpd128_pd512(a: __m128d) -> __m512d {
14796 simd_shuffle8(a, _mm_set1_pd(0.), [0, 1, 2, 2, 2, 2, 2, 2])
14797 }
14798
14799 /// Cast vector of type __m256d to type __m512d; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
14800 ///
14801 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_zextpd256_pd512&expand=6194)
14802 #[inline]
14803 #[target_feature(enable = "avx512f")]
14804 pub unsafe fn _mm512_zextpd256_pd512(a: __m256d) -> __m512d {
14805 simd_shuffle8(a, _mm256_set1_pd(0.), [0, 1, 2, 3, 4, 4, 4, 4])
14806 }
14807
14808 /// Cast vector of type __m512d to type __m128d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
14809 ///
14810 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castpd512_pd128&expand=612)
14811 #[inline]
14812 #[target_feature(enable = "avx512f")]
14813 pub unsafe fn _mm512_castpd512_pd128(a: __m512d) -> __m128d {
14814 simd_shuffle2(a, a, [0, 1])
14815 }
14816
14817 /// Cast vector of type __m512d to type __m256d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
14818 ///
14819 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castpd512_pd256&expand=613)
14820 #[inline]
14821 #[target_feature(enable = "avx512f")]
14822 pub unsafe fn _mm512_castpd512_pd256(a: __m512d) -> __m256d {
14823 simd_shuffle4(a, a, [0, 1, 2, 3])
14824 }
14825
14826 /// Cast vector of type __m512d to type __m512. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
14827 ///
14828 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castpd_ps&expand=604)
14829 #[inline]
14830 #[target_feature(enable = "avx512f")]
14831 pub unsafe fn _mm512_castpd_ps(a: __m512d) -> __m512 {
14832 transmute(a.as_m512d())
14833 }
14834
14835 /// Cast vector of type __m512d to type __m512i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
14836 ///
14837 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castpd_si512&expand=607)
14838 #[inline]
14839 #[target_feature(enable = "avx512f")]
14840 pub unsafe fn _mm512_castpd_si512(a: __m512d) -> __m512i {
14841 transmute(a.as_m512d())
14842 }
14843
14844 /// Cast vector of type __m128i to type __m512i; the upper 384 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
14845 ///
14846 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castsi128_si512&expand=629)
14847 #[inline]
14848 #[target_feature(enable = "avx512f")]
14849 pub unsafe fn _mm512_castsi128_si512(a: __m128i) -> __m512i {
14850 simd_shuffle8(a, _mm_set1_epi64x(-1), [0, 1, 2, 2, 2, 2, 2, 2])
14851 }
14852
14853 /// Cast vector of type __m256i to type __m512i; the upper 256 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
14854 ///
14855 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castsi256_si512&expand=633)
14856 #[inline]
14857 #[target_feature(enable = "avx512f")]
14858 pub unsafe fn _mm512_castsi256_si512(a: __m256i) -> __m512i {
14859 simd_shuffle8(a, _mm256_set1_epi64x(-1), [0, 1, 2, 3, 4, 4, 4, 4])
14860 }
14861
14862 /// Cast vector of type __m128i to type __m512i; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
14863 ///
14864 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_zextsi128_si512&expand=6199)
14865 #[inline]
14866 #[target_feature(enable = "avx512f")]
14867 pub unsafe fn _mm512_zextsi128_si512(a: __m128i) -> __m512i {
14868 simd_shuffle8(a, _mm_set1_epi64x(0), [0, 1, 2, 2, 2, 2, 2, 2])
14869 }
14870
14871 /// Cast vector of type __m256i to type __m512i; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
14872 ///
14873 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_zextsi256_si512&expand=6200)
14874 #[inline]
14875 #[target_feature(enable = "avx512f")]
14876 pub unsafe fn _mm512_zextsi256_si512(a: __m256i) -> __m512i {
14877 simd_shuffle8(a, _mm256_set1_epi64x(0), [0, 1, 2, 3, 4, 4, 4, 4])
14878 }
14879
14880 /// Cast vector of type __m512i to type __m128i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
14881 ///
14882 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castsi512_si128&expand=636)
14883 #[inline]
14884 #[target_feature(enable = "avx512f")]
14885 pub unsafe fn _mm512_castsi512_si128(a: __m512i) -> __m128i {
14886 simd_shuffle2(a, a, [0, 1])
14887 }
14888
14889 /// Cast vector of type __m512i to type __m256i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
14890 ///
14891 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castsi512_si256&expand=637)
14892 #[inline]
14893 #[target_feature(enable = "avx512f")]
14894 pub unsafe fn _mm512_castsi512_si256(a: __m512i) -> __m256i {
14895 simd_shuffle4(a, a, [0, 1, 2, 3])
14896 }
14897
14898 /// Cast vector of type __m512i to type __m512. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
14899 ///
14900 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castsi512_ps&expand=635)
14901 #[inline]
14902 #[target_feature(enable = "avx512f")]
14903 pub unsafe fn _mm512_castsi512_ps(a: __m512i) -> __m512 {
14904 transmute(a)
14905 }
14906
14907 /// Cast vector of type __m512i to type __m512d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
14908 ///
14909 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castsi512_pd&expand=634)
14910 #[inline]
14911 #[target_feature(enable = "avx512f")]
14912 pub unsafe fn _mm512_castsi512_pd(a: __m512i) -> __m512d {
14913 transmute(a)
14914 }
14915
14916 /// Broadcast the low packed 32-bit integer from a to all elements of dst.
14917 ///
14918 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcastd_epi32&expand=545)
14919 #[inline]
14920 #[target_feature(enable = "avx512f")]
14921 #[cfg_attr(test, assert_instr(vbroadcast))] //should be vpbroadcastd
14922 pub unsafe fn _mm512_broadcastd_epi32(a: __m128i) -> __m512i {
14923 let a = _mm512_castsi128_si512(a).as_i32x16();
14924 let ret: i32x16 = simd_shuffle16(a, a, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
14925 transmute(ret)
14926 }
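
// Illustrative sketch (not upstream code): only element 0 of the 128-bit
// source matters; it is replicated into all sixteen lanes. Hypothetical
// helper; assumes AVX-512F support.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_broadcastd_epi32() {
    let a: __m128i = transmute([42i32, 1, 2, 3]);
    let r: [i32; 16] = transmute(_mm512_broadcastd_epi32(a));
    assert_eq!(r, [42; 16]);
}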
14927
14928 /// Broadcast the low packed 32-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14929 ///
14930 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcastd_epi32&expand=546)
14931 #[inline]
14932 #[target_feature(enable = "avx512f")]
14933 #[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastd
14934 pub unsafe fn _mm512_mask_broadcastd_epi32(src: __m512i, k: __mmask16, a: __m128i) -> __m512i {
14935 let broadcast = _mm512_broadcastd_epi32(a).as_i32x16();
14936 transmute(simd_select_bitmask(k, broadcast, src.as_i32x16()))
14937 }
14938
14939 /// Broadcast the low packed 32-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14940 ///
14941 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcastd_epi32&expand=547)
14942 #[inline]
14943 #[target_feature(enable = "avx512f")]
14944 #[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastd
14945 pub unsafe fn _mm512_maskz_broadcastd_epi32(k: __mmask16, a: __m128i) -> __m512i {
14946 let broadcast = _mm512_broadcastd_epi32(a).as_i32x16();
14947 let zero = _mm512_setzero_si512().as_i32x16();
14948 transmute(simd_select_bitmask(k, broadcast, zero))
14949 }
14950
14951 /// Broadcast the low packed 64-bit integer from a to all elements of dst.
14952 ///
14953 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcastq_epi64&expand=560)
14954 #[inline]
14955 #[target_feature(enable = "avx512f")]
14956 #[cfg_attr(test, assert_instr(vbroadcas))] //should be vpbroadcastq
14957 pub unsafe fn _mm512_broadcastq_epi64(a: __m128i) -> __m512i {
14958 simd_shuffle8(a, a, [0, 0, 0, 0, 0, 0, 0, 0])
14959 }
14960
14961 /// Broadcast the low packed 64-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14962 ///
14963 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcastq_epi64&expand=561)
14964 #[inline]
14965 #[target_feature(enable = "avx512f")]
14966 #[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastq
14967 pub unsafe fn _mm512_mask_broadcastq_epi64(src: __m512i, k: __mmask8, a: __m128i) -> __m512i {
14968 let broadcast = _mm512_broadcastq_epi64(a).as_i64x8();
14969 transmute(simd_select_bitmask(k, broadcast, src.as_i64x8()))
14970 }
14971
14972 /// Broadcast the low packed 64-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14973 ///
14974 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcastq_epi64&expand=562)
14975 #[inline]
14976 #[target_feature(enable = "avx512f")]
14977 #[cfg_attr(test, assert_instr(vpbroadcast))] //should be vpbroadcastq
14978 pub unsafe fn _mm512_maskz_broadcastq_epi64(k: __mmask8, a: __m128i) -> __m512i {
14979 let broadcast = _mm512_broadcastq_epi64(a).as_i64x8();
14980 let zero = _mm512_setzero_si512().as_i64x8();
14981 transmute(simd_select_bitmask(k, broadcast, zero))
14982 }
14983
14984 /// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst.
14985 ///
14986 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcastss_ps&expand=578)
14987 #[inline]
14988 #[target_feature(enable = "avx512f")]
14989 #[cfg_attr(test, assert_instr(vbroadcastss))]
14990 pub unsafe fn _mm512_broadcastss_ps(a: __m128) -> __m512 {
14991 simd_shuffle16(a, a, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
14992 }
14993
14994 /// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14995 ///
14996 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcastss_ps&expand=579)
14997 #[inline]
14998 #[target_feature(enable = "avx512f")]
14999 #[cfg_attr(test, assert_instr(vbroadcastss))]
15000 pub unsafe fn _mm512_mask_broadcastss_ps(src: __m512, k: __mmask16, a: __m128) -> __m512 {
15001 let broadcast = _mm512_broadcastss_ps(a).as_f32x16();
15002 transmute(simd_select_bitmask(k, broadcast, src.as_f32x16()))
15003 }
15004
15005 /// Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15006 ///
15007 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcastss_ps&expand=580)
15008 #[inline]
15009 #[target_feature(enable = "avx512f")]
15010 #[cfg_attr(test, assert_instr(vbroadcastss))]
15011 pub unsafe fn _mm512_maskz_broadcastss_ps(k: __mmask16, a: __m128) -> __m512 {
15012 let broadcast = _mm512_broadcastss_ps(a).as_f32x16();
15013 let zero = _mm512_setzero_ps().as_f32x16();
15014 transmute(simd_select_bitmask(k, broadcast, zero))
15015 }
15016
15017 /// Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst.
15018 ///
15019 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcastsd_pd&expand=567)
15020 #[inline]
15021 #[target_feature(enable = "avx512f")]
15022 #[cfg_attr(test, assert_instr(vbroadcastsd))]
15023 pub unsafe fn _mm512_broadcastsd_pd(a: __m128d) -> __m512d {
15024 simd_shuffle8(a, a, [0, 0, 0, 0, 0, 0, 0, 0])
15025 }
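
// Illustrative sketch (not upstream code): the low double of the 128-bit
// source is replicated into all eight lanes. Hypothetical helper; assumes
// AVX-512F support.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_broadcastsd_pd() {
    // _mm_set_pd takes its arguments high element first, low element second
    let a = _mm_set_pd(9., 42.);
    let r: [f64; 8] = transmute(_mm512_broadcastsd_pd(a));
    assert_eq!(r, [42.; 8]);
}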
15026
15027 /// Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15028 ///
15029 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcastsd_pd&expand=568)
15030 #[inline]
15031 #[target_feature(enable = "avx512f")]
15032 #[cfg_attr(test, assert_instr(vbroadcastsd))]
15033 pub unsafe fn _mm512_mask_broadcastsd_pd(src: __m512d, k: __mmask8, a: __m128d) -> __m512d {
15034 let broadcast = _mm512_broadcastsd_pd(a).as_f64x8();
15035 transmute(simd_select_bitmask(k, broadcast, src.as_f64x8()))
15036 }
15037
15038 /// Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15039 ///
15040 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcastsd_pd&expand=569)
15041 #[inline]
15042 #[target_feature(enable = "avx512f")]
15043 #[cfg_attr(test, assert_instr(vbroadcastsd))]
15044 pub unsafe fn _mm512_maskz_broadcastsd_pd(k: __mmask8, a: __m128d) -> __m512d {
15045 let broadcast = _mm512_broadcastsd_pd(a).as_f64x8();
15046 let zero = _mm512_setzero_pd().as_f64x8();
15047 transmute(simd_select_bitmask(k, broadcast, zero))
15048 }
15049
15050 /// Broadcast the 4 packed 32-bit integers from a to all elements of dst.
15051 ///
15052 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_i32x4&expand=510)
15053 #[inline]
15054 #[target_feature(enable = "avx512f")] //msvc: vbroadcasti32x4, linux: vshuf
15055 pub unsafe fn _mm512_broadcast_i32x4(a: __m128i) -> __m512i {
15056 let a = _mm512_castsi128_si512(a).as_i32x16();
15057 let ret: i32x16 = simd_shuffle16(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]);
15058 transmute(ret)
15059 }
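
// Illustrative sketch (not upstream code): the whole 128-bit source block is
// repeated four times across the 512-bit result. Hypothetical helper; assumes
// AVX-512F support.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_broadcast_i32x4() {
    let a: __m128i = transmute([1i32, 2, 3, 4]);
    let r: [i32; 16] = transmute(_mm512_broadcast_i32x4(a));
    assert_eq!(r, [1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4]);
}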
15060
15061 /// Broadcast the 4 packed 32-bit integers from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15062 ///
15063 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_i32x4&expand=511)
15064 #[inline]
15065 #[target_feature(enable = "avx512f")] //msvc: vbroadcasti32x4, linux: vshuf
15066 pub unsafe fn _mm512_mask_broadcast_i32x4(src: __m512i, k: __mmask16, a: __m128i) -> __m512i {
15067 let broadcast = _mm512_broadcast_i32x4(a).as_i32x16();
15068 transmute(simd_select_bitmask(k, broadcast, src.as_i32x16()))
15069 }
15070
15071 /// Broadcast the 4 packed 32-bit integers from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15072 ///
15073 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_i32x4&expand=512)
15074 #[inline]
15075 #[target_feature(enable = "avx512f")] //msvc: vbroadcasti32x4, linux: vshuf
15076 pub unsafe fn _mm512_maskz_broadcast_i32x4(k: __mmask16, a: __m128i) -> __m512i {
15077 let broadcast = _mm512_broadcast_i32x4(a).as_i32x16();
15078 let zero = _mm512_setzero_si512().as_i32x16();
15079 transmute(simd_select_bitmask(k, broadcast, zero))
15080 }
15081
15082 /// Broadcast the 4 packed 64-bit integers from a to all elements of dst.
15083 ///
15084 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_i64x4&expand=522)
15085 #[inline]
15086 #[target_feature(enable = "avx512f")] //msvc: vbroadcasti64x4, linux: vperm
15087 pub unsafe fn _mm512_broadcast_i64x4(a: __m256i) -> __m512i {
15088 simd_shuffle8(a, a, [0, 1, 2, 3, 0, 1, 2, 3])
15089 }
15090
15091 /// Broadcast the 4 packed 64-bit integers from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15092 ///
15093 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_i64x4&expand=523)
15094 #[inline]
15095 #[target_feature(enable = "avx512f")] //msvc: vbroadcasti64x4, linux: vperm
15096 pub unsafe fn _mm512_mask_broadcast_i64x4(src: __m512i, k: __mmask8, a: __m256i) -> __m512i {
15097 let broadcast = _mm512_broadcast_i64x4(a).as_i64x8();
15098 transmute(simd_select_bitmask(k, broadcast, src.as_i64x8()))
15099 }
15100
15101 /// Broadcast the 4 packed 64-bit integers from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15102 ///
15103 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_i64x4&expand=524)
15104 #[inline]
15105 #[target_feature(enable = "avx512f")] //msvc: vbroadcasti64x4, linux: vperm
15106 pub unsafe fn _mm512_maskz_broadcast_i64x4(k: __mmask8, a: __m256i) -> __m512i {
15107 let broadcast = _mm512_broadcast_i64x4(a).as_i64x8();
15108 let zero = _mm512_setzero_si512().as_i64x8();
15109 transmute(simd_select_bitmask(k, broadcast, zero))
15110 }
15111
15112 /// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst.
15113 ///
15114 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_f32x4&expand=483)
15115 #[inline]
15116 #[target_feature(enable = "avx512f")] //msvc: vbroadcastf32x4, linux: vshuf
15117 pub unsafe fn _mm512_broadcast_f32x4(a: __m128) -> __m512 {
15118 simd_shuffle16(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3])
15119 }
15120
15121 /// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15122 ///
15123 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_f32x4&expand=484)
15124 #[inline]
15125 #[target_feature(enable = "avx512f")] //msvc: vbroadcastf32x4, linux: vshuf
15126 pub unsafe fn _mm512_mask_broadcast_f32x4(src: __m512, k: __mmask16, a: __m128) -> __m512 {
15127 let broadcast = _mm512_broadcast_f32x4(a).as_f32x16();
15128 transmute(simd_select_bitmask(k, broadcast, src.as_f32x16()))
15129 }
15130
15131 /// Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15132 ///
15133 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_f32x4&expand=485)
15134 #[inline]
15135 #[target_feature(enable = "avx512f")] //msvc: vbroadcastf32x4, linux: vshuf
15136 pub unsafe fn _mm512_maskz_broadcast_f32x4(k: __mmask16, a: __m128) -> __m512 {
15137 let broadcast = _mm512_broadcast_f32x4(a).as_f32x16();
15138 let zero = _mm512_setzero_ps().as_f32x16();
15139 transmute(simd_select_bitmask(k, broadcast, zero))
15140 }
15141
15142 /// Broadcast the 4 packed double-precision (64-bit) floating-point elements from a to all elements of dst.
15143 ///
15144 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_f64x4&expand=495)
15145 #[inline]
15146 #[target_feature(enable = "avx512f")] //msvc: vbroadcastf64x4, linux: vperm
15147 pub unsafe fn _mm512_broadcast_f64x4(a: __m256d) -> __m512d {
15148 simd_shuffle8(a, a, [0, 1, 2, 3, 0, 1, 2, 3])
15149 }
15150
15151 /// Broadcast the 4 packed double-precision (64-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15152 ///
15153 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_f64x4&expand=496)
15154 #[inline]
15155 #[target_feature(enable = "avx512f")] //msvc: vbroadcastf64x4, linux: vperm
15156 pub unsafe fn _mm512_mask_broadcast_f64x4(src: __m512d, k: __mmask8, a: __m256d) -> __m512d {
15157 let broadcast = _mm512_broadcast_f64x4(a).as_f64x8();
15158 transmute(simd_select_bitmask(k, broadcast, src.as_f64x8()))
15159 }
15160
15161 /// Broadcast the 4 packed double-precision (64-bit) floating-point elements from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15162 ///
15163 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_f64x4&expand=497)
15164 #[inline]
15165 #[target_feature(enable = "avx512f")] //msvc: vbroadcastf64x4, linux: vperm
15166 pub unsafe fn _mm512_maskz_broadcast_f64x4(k: __mmask8, a: __m256d) -> __m512d {
15167 let broadcast = _mm512_broadcast_f64x4(a).as_f64x8();
15168 let zero = _mm512_setzero_pd().as_f64x8();
15169 transmute(simd_select_bitmask(k, broadcast, zero))
15170 }
15171
15172 /// Blend packed 32-bit integers from a and b using control mask k, and store the results in dst.
15173 ///
15174 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_blend_epi32&expand=435)
15175 #[inline]
15176 #[target_feature(enable = "avx512f")]
15177 #[cfg_attr(test, assert_instr(vmovdqa32))] //should be vpblendmd
15178 pub unsafe fn _mm512_mask_blend_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
15179 transmute(simd_select_bitmask(k, b.as_i32x16(), a.as_i32x16()))
15180 }
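
// Illustrative sketch (not upstream code): each mask bit picks between the
// corresponding elements of `b` (bit set) and `a` (bit clear). Hypothetical
// helper; assumes AVX-512F support.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn example_mask_blend_epi32() {
    let a = _mm512_set1_epi32(1);
    let b = _mm512_set1_epi32(2);
    // the low eight mask bits are set, so the low eight elements come from `b`
    let r: [i32; 16] = transmute(_mm512_mask_blend_epi32(0b0000_0000_1111_1111, a, b));
    assert_eq!(r, [2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1]);
}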
15181
15182 /// Blend packed 64-bit integers from a and b using control mask k, and store the results in dst.
15183 ///
15184 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_blend_epi64&expand=438)
15185 #[inline]
15186 #[target_feature(enable = "avx512f")]
15187 #[cfg_attr(test, assert_instr(vmovdqa64))] //should be vpblendmq
15188 pub unsafe fn _mm512_mask_blend_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
15189 transmute(simd_select_bitmask(k, b.as_i64x8(), a.as_i64x8()))
15190 }
15191
15192 /// Blend packed single-precision (32-bit) floating-point elements from a and b using control mask k, and store the results in dst.
15193 ///
15194 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_blend_ps&expand=451)
15195 #[inline]
15196 #[target_feature(enable = "avx512f")]
15197 #[cfg_attr(test, assert_instr(vmovaps))] //should be vpblendmps
15198 pub unsafe fn _mm512_mask_blend_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 {
15199 transmute(simd_select_bitmask(k, b.as_f32x16(), a.as_f32x16()))
15200 }
15201
15202 /// Blend packed double-precision (64-bit) floating-point elements from a and b using control mask k, and store the results in dst.
15203 ///
15204 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_blend_pd&expand=446)
15205 #[inline]
15206 #[target_feature(enable = "avx512f")]
15207 #[cfg_attr(test, assert_instr(vmovapd))] //should be vpblendmpd
15208 pub unsafe fn _mm512_mask_blend_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
15209 transmute(simd_select_bitmask(k, b.as_f64x8(), a.as_f64x8()))
15210 }
15211
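// --- Illustrative usage sketch (not part of the upstream file) ---
// Demonstrates the blend convention used above: where mask bit i is set the
// result lane comes from `b`, otherwise from `a`. The helper name
// `blend_epi32_example` and the constants are invented; AVX512F is assumed.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn blend_epi32_example() {
    let a = _mm512_set1_epi32(1);
    let b = _mm512_set1_epi32(2);
    // Mask 0x00ff selects `b` for lanes 0..8 and keeps `a` for lanes 8..16.
    let r = _mm512_mask_blend_epi32(0x00ff, a, b);
    let e = _mm512_setr_epi32(2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1);
    assert_eq!(_mm512_cmpeq_epi32_mask(r, e), 0xffff);
}
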
15212 /// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 64 bytes (16 elements) in dst.
15213 ///
15214 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_alignr_epi32&expand=245)
15215 #[inline]
15216 #[target_feature(enable = "avx512f")]
15217 #[cfg_attr(test, assert_instr(valignd, imm8 = 1))]
15218 #[rustc_args_required_const(2)]
15219 pub unsafe fn _mm512_alignr_epi32(a: __m512i, b: __m512i, imm8: i32) -> __m512i {
15220 assert!(imm8 >= 0 && imm8 <= 255);
15221 let a = a.as_i32x16();
15222 let b = b.as_i32x16();
15223 let imm8: i32 = imm8 % 16;
15224 let r: i32x16 = match imm8 {
15225 0 => simd_shuffle16(
15226 a,
15227 b,
15228 [
15229 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
15230 ],
15231 ),
15232 1 => simd_shuffle16(
15233 a,
15234 b,
15235 [
15236 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0,
15237 ],
15238 ),
15239 2 => simd_shuffle16(
15240 a,
15241 b,
15242 [18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1],
15243 ),
15244 3 => simd_shuffle16(
15245 a,
15246 b,
15247 [19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2],
15248 ),
15249 4 => simd_shuffle16(
15250 a,
15251 b,
15252 [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3],
15253 ),
15254 5 => simd_shuffle16(
15255 a,
15256 b,
15257 [21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4],
15258 ),
15259 6 => simd_shuffle16(
15260 a,
15261 b,
15262 [22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5],
15263 ),
15264 7 => simd_shuffle16(
15265 a,
15266 b,
15267 [23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6],
15268 ),
15269 8 => simd_shuffle16(
15270 a,
15271 b,
15272 [24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7],
15273 ),
15274 9 => simd_shuffle16(
15275 a,
15276 b,
15277 [25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8],
15278 ),
15279 10 => simd_shuffle16(a, b, [26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
15280 11 => simd_shuffle16(a, b, [27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
15281 12 => simd_shuffle16(a, b, [28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
15282 13 => simd_shuffle16(a, b, [29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]),
15283 14 => simd_shuffle16(a, b, [30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]),
15284 _ => simd_shuffle16(a, b, [31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
15285 };
15286 transmute(r)
15287 }
15288
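// --- Illustrative usage sketch (not part of the upstream file) ---
// With `imm8 = 1` the concatenation a:b is shifted right by one 32-bit
// element, so the result is b[1..16] followed by a[0]. The helper name
// `alignr_epi32_example` is invented; `imm8` is passed as a literal because
// the signature above requires a compile-time constant.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn alignr_epi32_example() {
    let a = _mm512_set1_epi32(100);
    let b = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    let r = _mm512_alignr_epi32(a, b, 1);
    let e = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 100);
    assert_eq!(_mm512_cmpeq_epi32_mask(r, e), 0xffff);
}
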
15289 /// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 64 bytes (16 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15290 ///
15291 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_alignr_epi32&expand=246)
15292 #[inline]
15293 #[target_feature(enable = "avx512f")]
15294 #[cfg_attr(test, assert_instr(valignd, imm8 = 1))]
15295 #[rustc_args_required_const(4)]
15296 pub unsafe fn _mm512_mask_alignr_epi32(
15297 src: __m512i,
15298 k: __mmask16,
15299 a: __m512i,
15300 b: __m512i,
15301 imm8: i32,
15302 ) -> __m512i {
15303 assert!(imm8 >= 0 && imm8 <= 255);
15304 let a = a.as_i32x16();
15305 let b = b.as_i32x16();
15306 let imm8: i32 = imm8 % 16;
15307 let r: i32x16 = match imm8 {
15308 0 => simd_shuffle16(
15309 a,
15310 b,
15311 [
15312 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
15313 ],
15314 ),
15315 1 => simd_shuffle16(
15316 a,
15317 b,
15318 [
15319 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0,
15320 ],
15321 ),
15322 2 => simd_shuffle16(
15323 a,
15324 b,
15325 [18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1],
15326 ),
15327 3 => simd_shuffle16(
15328 a,
15329 b,
15330 [19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2],
15331 ),
15332 4 => simd_shuffle16(
15333 a,
15334 b,
15335 [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3],
15336 ),
15337 5 => simd_shuffle16(
15338 a,
15339 b,
15340 [21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4],
15341 ),
15342 6 => simd_shuffle16(
15343 a,
15344 b,
15345 [22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5],
15346 ),
15347 7 => simd_shuffle16(
15348 a,
15349 b,
15350 [23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6],
15351 ),
15352 8 => simd_shuffle16(
15353 a,
15354 b,
15355 [24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7],
15356 ),
15357 9 => simd_shuffle16(
15358 a,
15359 b,
15360 [25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8],
15361 ),
15362 10 => simd_shuffle16(a, b, [26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
15363 11 => simd_shuffle16(a, b, [27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
15364 12 => simd_shuffle16(a, b, [28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
15365 13 => simd_shuffle16(a, b, [29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]),
15366 14 => simd_shuffle16(a, b, [30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]),
15367 _ => simd_shuffle16(a, b, [31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
15368 };
15369 transmute(simd_select_bitmask(k, r, src.as_i32x16()))
15370 }
15371
15372 /// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 32-bit elements, and stores the low 64 bytes (16 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15373 ///
15374 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_alignr_epi32&expand=247)
15375 #[inline]
15376 #[target_feature(enable = "avx512f")]
15377 #[cfg_attr(test, assert_instr(valignd, imm8 = 1))]
15378 #[rustc_args_required_const(3)]
15379 pub unsafe fn _mm512_maskz_alignr_epi32(
15380 k: __mmask16,
15381 a: __m512i,
15382 b: __m512i,
15383 imm8: i32,
15384 ) -> __m512i {
15385 assert!(imm8 >= 0 && imm8 <= 255);
15386 let a = a.as_i32x16();
15387 let b = b.as_i32x16();
15388 let imm8: i32 = imm8 % 16;
15389 let r: i32x16 = match imm8 {
15390 0 => simd_shuffle16(
15391 a,
15392 b,
15393 [
15394 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
15395 ],
15396 ),
15397 1 => simd_shuffle16(
15398 a,
15399 b,
15400 [
15401 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0,
15402 ],
15403 ),
15404 2 => simd_shuffle16(
15405 a,
15406 b,
15407 [18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1],
15408 ),
15409 3 => simd_shuffle16(
15410 a,
15411 b,
15412 [19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2],
15413 ),
15414 4 => simd_shuffle16(
15415 a,
15416 b,
15417 [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3],
15418 ),
15419 5 => simd_shuffle16(
15420 a,
15421 b,
15422 [21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4],
15423 ),
15424 6 => simd_shuffle16(
15425 a,
15426 b,
15427 [22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5],
15428 ),
15429 7 => simd_shuffle16(
15430 a,
15431 b,
15432 [23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6],
15433 ),
15434 8 => simd_shuffle16(
15435 a,
15436 b,
15437 [24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7],
15438 ),
15439 9 => simd_shuffle16(
15440 a,
15441 b,
15442 [25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8],
15443 ),
15444 10 => simd_shuffle16(a, b, [26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
15445 11 => simd_shuffle16(a, b, [27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
15446 12 => simd_shuffle16(a, b, [28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
15447 13 => simd_shuffle16(a, b, [29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]),
15448 14 => simd_shuffle16(a, b, [30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]),
15449 _ => simd_shuffle16(a, b, [31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
15450 };
15451 let zero = _mm512_setzero_si512().as_i32x16();
15452 transmute(simd_select_bitmask(k, r, zero))
15453 }
15454
15455 /// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 64 bytes (8 elements) in dst.
15456 ///
15457 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_alignr_epi64&expand=254)
15458 #[inline]
15459 #[target_feature(enable = "avx512f")]
15460 #[cfg_attr(test, assert_instr(valignq, imm8 = 1))]
15461 #[rustc_args_required_const(2)]
15462 pub unsafe fn _mm512_alignr_epi64(a: __m512i, b: __m512i, imm8: i32) -> __m512i {
15463 assert!(imm8 >= 0 && imm8 <= 255);
15464 let imm8: i32 = imm8 % 8;
15465 let r: i64x8 = match imm8 {
15466 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 12, 13, 14, 15]),
15467 1 => simd_shuffle8(a, b, [9, 10, 11, 12, 13, 14, 15, 0]),
15468 2 => simd_shuffle8(a, b, [10, 11, 12, 13, 14, 15, 0, 1]),
15469 3 => simd_shuffle8(a, b, [11, 12, 13, 14, 15, 0, 1, 2]),
15470 4 => simd_shuffle8(a, b, [12, 13, 14, 15, 0, 1, 2, 3]),
15471 5 => simd_shuffle8(a, b, [13, 14, 15, 0, 1, 2, 3, 4]),
15472 6 => simd_shuffle8(a, b, [14, 15, 0, 1, 2, 3, 4, 5]),
15473 _ => simd_shuffle8(a, b, [15, 0, 1, 2, 3, 4, 5, 6]),
15474 };
15475 transmute(r)
15476 }
15477
15478 /// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 64 bytes (8 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15479 ///
15480 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_alignr_epi64&expand=255)
15481 #[inline]
15482 #[target_feature(enable = "avx512f")]
15483 #[cfg_attr(test, assert_instr(valignq, imm8 = 1))]
15484 #[rustc_args_required_const(4)]
15485 pub unsafe fn _mm512_mask_alignr_epi64(
15486 src: __m512i,
15487 k: __mmask8,
15488 a: __m512i,
15489 b: __m512i,
15490 imm8: i32,
15491 ) -> __m512i {
15492 assert!(imm8 >= 0 && imm8 <= 255);
15493 let imm8: i32 = imm8 % 8;
15494 let r: i64x8 = match imm8 {
15495 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 12, 13, 14, 15]),
15496 1 => simd_shuffle8(a, b, [9, 10, 11, 12, 13, 14, 15, 0]),
15497 2 => simd_shuffle8(a, b, [10, 11, 12, 13, 14, 15, 0, 1]),
15498 3 => simd_shuffle8(a, b, [11, 12, 13, 14, 15, 0, 1, 2]),
15499 4 => simd_shuffle8(a, b, [12, 13, 14, 15, 0, 1, 2, 3]),
15500 5 => simd_shuffle8(a, b, [13, 14, 15, 0, 1, 2, 3, 4]),
15501 6 => simd_shuffle8(a, b, [14, 15, 0, 1, 2, 3, 4, 5]),
15502 _ => simd_shuffle8(a, b, [15, 0, 1, 2, 3, 4, 5, 6]),
15503 };
15504 transmute(simd_select_bitmask(k, r, src.as_i64x8()))
15505 }
15506
15507 /// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 64-bit elements, and stores the low 64 bytes (8 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15508 ///
15509 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_alignr_epi64&expand=256)
15510 #[inline]
15511 #[target_feature(enable = "avx512f")]
15512 #[cfg_attr(test, assert_instr(valignq, imm8 = 1))]
15513 #[rustc_args_required_const(3)]
15514 pub unsafe fn _mm512_maskz_alignr_epi64(k: __mmask8, a: __m512i, b: __m512i, imm8: i32) -> __m512i {
15515 assert!(imm8 >= 0 && imm8 <= 255);
15516 let imm8: i32 = imm8 % 8;
15517 let r: i64x8 = match imm8 {
15518 0 => simd_shuffle8(a, b, [8, 9, 10, 11, 12, 13, 14, 15]),
15519 1 => simd_shuffle8(a, b, [9, 10, 11, 12, 13, 14, 15, 0]),
15520 2 => simd_shuffle8(a, b, [10, 11, 12, 13, 14, 15, 0, 1]),
15521 3 => simd_shuffle8(a, b, [11, 12, 13, 14, 15, 0, 1, 2]),
15522 4 => simd_shuffle8(a, b, [12, 13, 14, 15, 0, 1, 2, 3]),
15523 5 => simd_shuffle8(a, b, [13, 14, 15, 0, 1, 2, 3, 4]),
15524 6 => simd_shuffle8(a, b, [14, 15, 0, 1, 2, 3, 4, 5]),
15525 _ => simd_shuffle8(a, b, [15, 0, 1, 2, 3, 4, 5, 6]),
15526 };
15527 let zero = _mm512_setzero_si512().as_i64x8();
15528 transmute(simd_select_bitmask(k, r, zero))
15529 }
15530
15531 /// Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst.
15532 ///
15533 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_and_epi32&expand=272)
15534 #[inline]
15535 #[target_feature(enable = "avx512f")]
15536 #[cfg_attr(test, assert_instr(vpandq))] //should be vpandd, but generate vpandq
15537 pub unsafe fn _mm512_and_epi32(a: __m512i, b: __m512i) -> __m512i {
15538 transmute(simd_and(a.as_i32x16(), b.as_i32x16()))
15539 }
15540
15541 /// Performs element-by-element bitwise AND between packed 32-bit integer elements of a and b, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15542 ///
15543 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_and_epi32&expand=273)
15544 #[inline]
15545 #[target_feature(enable = "avx512f")]
15546 #[cfg_attr(test, assert_instr(vpandd))]
15547 pub unsafe fn _mm512_mask_and_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
15548 let and = _mm512_and_epi32(a, b).as_i32x16();
15549 transmute(simd_select_bitmask(k, and, src.as_i32x16()))
15550 }
15551
15552 /// Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15553 ///
15554 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_and_epi32&expand=274)
15555 #[inline]
15556 #[target_feature(enable = "avx512f")]
15557 #[cfg_attr(test, assert_instr(vpandd))]
15558 pub unsafe fn _mm512_maskz_and_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
15559 let and = _mm512_and_epi32(a, b).as_i32x16();
15560 let zero = _mm512_setzero_si512().as_i32x16();
15561 transmute(simd_select_bitmask(k, and, zero))
15562 }
15563
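// --- Illustrative usage sketch (not part of the upstream file) ---
// Contrasts the writemask and zeromask forms of the 32-bit AND: unselected
// lanes keep `src` in the former and become zero in the latter. The helper
// name `masked_and_epi32_example` and the constants are invented here.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn masked_and_epi32_example() {
    let a = _mm512_set1_epi32(0b1100);
    let b = _mm512_set1_epi32(0b1010);
    let src = _mm512_set1_epi32(-1);
    let w = _mm512_mask_and_epi32(src, 0x000f, a, b);
    let z = _mm512_maskz_and_epi32(0x000f, a, b);
    let ew = _mm512_setr_epi32(8, 8, 8, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
    let ez = _mm512_setr_epi32(8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    assert_eq!(_mm512_cmpeq_epi32_mask(w, ew), 0xffff);
    assert_eq!(_mm512_cmpeq_epi32_mask(z, ez), 0xffff);
}
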
15564 /// Compute the bitwise AND of 512 bits (composed of packed 64-bit integers) in a and b, and store the results in dst.
15565 ///
15566 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_and_epi64&expand=279)
15567 #[inline]
15568 #[target_feature(enable = "avx512f")]
15569 #[cfg_attr(test, assert_instr(vpandq))]
15570 pub unsafe fn _mm512_and_epi64(a: __m512i, b: __m512i) -> __m512i {
15571 transmute(simd_and(a.as_i64x8(), b.as_i64x8()))
15572 }
15573
15574 /// Compute the bitwise AND of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15575 ///
15576 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_and_epi64&expand=280)
15577 #[inline]
15578 #[target_feature(enable = "avx512f")]
15579 #[cfg_attr(test, assert_instr(vpandq))]
15580 pub unsafe fn _mm512_mask_and_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
15581 let and = _mm512_and_epi64(a, b).as_i64x8();
15582 transmute(simd_select_bitmask(k, and, src.as_i64x8()))
15583 }
15584
15585 /// Compute the bitwise AND of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15586 ///
15587 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_and_epi64&expand=274)
15588 #[inline]
15589 #[target_feature(enable = "avx512f")]
15590 #[cfg_attr(test, assert_instr(vpandq))]
15591 pub unsafe fn _mm512_maskz_and_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
15592 let and = _mm512_and_epi64(a, b).as_i64x8();
15593 let zero = _mm512_setzero_si512().as_i64x8();
15594 transmute(simd_select_bitmask(k, and, zero))
15595 }
15596
15597 /// Compute the bitwise AND of 512 bits (representing integer data) in a and b, and store the result in dst.
15598 ///
15599 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_and_si512&expand=302)
15600 #[inline]
15601 #[target_feature(enable = "avx512f")]
15602 #[cfg_attr(test, assert_instr(vpandq))]
15603 pub unsafe fn _mm512_and_si512(a: __m512i, b: __m512i) -> __m512i {
15604 transmute(simd_and(a.as_i32x16(), b.as_i32x16()))
15605 }
15606
15607 /// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst.
15608 ///
15609 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_or_epi32&expand=4042)
15610 #[inline]
15611 #[target_feature(enable = "avx512f")]
15612 #[cfg_attr(test, assert_instr(vporq))]
15613 pub unsafe fn _mm512_or_epi32(a: __m512i, b: __m512i) -> __m512i {
15614 transmute(simd_or(a.as_i32x16(), b.as_i32x16()))
15615 }
15616
15617 /// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15618 ///
15619 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_or_epi32&expand=4040)
15620 #[inline]
15621 #[target_feature(enable = "avx512f")]
15622 #[cfg_attr(test, assert_instr(vpord))]
15623 pub unsafe fn _mm512_mask_or_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
15624 let or = _mm512_or_epi32(a, b).as_i32x16();
15625 transmute(simd_select_bitmask(k, or, src.as_i32x16()))
15626 }
15627
15628 /// Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15629 ///
15630 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_or_epi32&expand=4041)
15631 #[inline]
15632 #[target_feature(enable = "avx512f")]
15633 #[cfg_attr(test, assert_instr(vpord))]
15634 pub unsafe fn _mm512_maskz_or_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
15635 let or = _mm512_or_epi32(a, b).as_i32x16();
15636 let zero = _mm512_setzero_si512().as_i32x16();
15637 transmute(simd_select_bitmask(k, or, zero))
15638 }
15639
15640 /// Compute the bitwise OR of packed 64-bit integers in a and b, and store the result in dst.
15641 ///
15642 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_or_epi64&expand=4051)
15643 #[inline]
15644 #[target_feature(enable = "avx512f")]
15645 #[cfg_attr(test, assert_instr(vporq))]
15646 pub unsafe fn _mm512_or_epi64(a: __m512i, b: __m512i) -> __m512i {
15647 transmute(simd_or(a.as_i64x8(), b.as_i64x8()))
15648 }
15649
15650 /// Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15651 ///
15652 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_or_epi64&expand=4049)
15653 #[inline]
15654 #[target_feature(enable = "avx512f")]
15655 #[cfg_attr(test, assert_instr(vporq))]
15656 pub unsafe fn _mm512_mask_or_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
15657 let or = _mm512_or_epi64(a, b).as_i64x8();
15658 transmute(simd_select_bitmask(k, or, src.as_i64x8()))
15659 }
15660
15661 /// Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15662 ///
15663 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_or_epi64&expand=4050)
15664 #[inline]
15665 #[target_feature(enable = "avx512f")]
15666 #[cfg_attr(test, assert_instr(vporq))]
15667 pub unsafe fn _mm512_maskz_or_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
15668 let or = _mm512_or_epi64(a, b).as_i64x8();
15669 let zero = _mm512_setzero_si512().as_i64x8();
15670 transmute(simd_select_bitmask(k, or, zero))
15671 }
15672
15673 /// Compute the bitwise OR of 512 bits (representing integer data) in a and b, and store the result in dst.
15674 ///
15675 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_or_si512&expand=4072)
15676 #[inline]
15677 #[target_feature(enable = "avx512f")]
15678 #[cfg_attr(test, assert_instr(vporq))]
15679 pub unsafe fn _mm512_or_si512(a: __m512i, b: __m512i) -> __m512i {
15680 transmute(simd_or(a.as_i32x16(), b.as_i32x16()))
15681 }
15682
15683 /// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst.
15684 ///
15685 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_xor_epi32&expand=6142)
15686 #[inline]
15687 #[target_feature(enable = "avx512f")]
15688 #[cfg_attr(test, assert_instr(vpxorq))]
15689 pub unsafe fn _mm512_xor_epi32(a: __m512i, b: __m512i) -> __m512i {
15690 transmute(simd_xor(a.as_i32x16(), b.as_i32x16()))
15691 }
15692
15693 /// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15694 ///
15695 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_xor_epi32&expand=6140)
15696 #[inline]
15697 #[target_feature(enable = "avx512f")]
15698 #[cfg_attr(test, assert_instr(vpxord))]
15699 pub unsafe fn _mm512_mask_xor_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
15700 let xor = _mm512_xor_epi32(a, b).as_i32x16();
15701 transmute(simd_select_bitmask(k, xor, src.as_i32x16()))
15702 }
15703
15704 /// Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15705 ///
15706 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_xor_epi32&expand=6141)
15707 #[inline]
15708 #[target_feature(enable = "avx512f")]
15709 #[cfg_attr(test, assert_instr(vpxord))]
15710 pub unsafe fn _mm512_maskz_xor_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
15711 let xor = _mm512_xor_epi32(a, b).as_i32x16();
15712 let zero = _mm512_setzero_si512().as_i32x16();
15713 transmute(simd_select_bitmask(k, xor, zero))
15714 }
15715
15716 /// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst.
15717 ///
15718 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_xor_epi64&expand=6151)
15719 #[inline]
15720 #[target_feature(enable = "avx512f")]
15721 #[cfg_attr(test, assert_instr(vpxorq))]
15722 pub unsafe fn _mm512_xor_epi64(a: __m512i, b: __m512i) -> __m512i {
15723 transmute(simd_xor(a.as_i64x8(), b.as_i64x8()))
15724 }
15725
15726 /// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15727 ///
15728 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_xor_epi64&expand=6149)
15729 #[inline]
15730 #[target_feature(enable = "avx512f")]
15731 #[cfg_attr(test, assert_instr(vpxorq))]
15732 pub unsafe fn _mm512_mask_xor_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
15733 let xor = _mm512_xor_epi64(a, b).as_i64x8();
15734 transmute(simd_select_bitmask(k, xor, src.as_i64x8()))
15735 }
15736
15737 /// Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15738 ///
15739 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_xor_epi64&expand=6150)
15740 #[inline]
15741 #[target_feature(enable = "avx512f")]
15742 #[cfg_attr(test, assert_instr(vpxorq))]
15743 pub unsafe fn _mm512_maskz_xor_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
15744 let xor = _mm512_xor_epi64(a, b).as_i64x8();
15745 let zero = _mm512_setzero_si512().as_i64x8();
15746 transmute(simd_select_bitmask(k, xor, zero))
15747 }
15748
15749 /// Compute the bitwise XOR of 512 bits (representing integer data) in a and b, and store the result in dst.
15750 ///
15751 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_xor_si512&expand=6172)
15752 #[inline]
15753 #[target_feature(enable = "avx512f")]
15754 #[cfg_attr(test, assert_instr(vpxorq))]
15755 pub unsafe fn _mm512_xor_si512(a: __m512i, b: __m512i) -> __m512i {
15756 transmute(simd_xor(a.as_i32x16(), b.as_i32x16()))
15757 }
15758
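// --- Illustrative usage sketch (not part of the upstream file) ---
// XOR-ing a register with itself is the usual zeroing idiom; the 512-bit
// form behaves the same way. The helper name `xor_zeroing_example` is
// invented here.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn xor_zeroing_example() {
    let a = _mm512_set1_epi32(0x5555_5555);
    let r = _mm512_xor_si512(a, a);
    assert_eq!(_mm512_cmpeq_epi32_mask(r, _mm512_setzero_si512()), 0xffff);
}
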
15759 /// Compute the bitwise NOT of packed 32-bit integers in a and then AND with b, and store the results in dst.
15760 ///
15761 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_andnot_epi32&expand=310)
15762 #[inline]
15763 #[target_feature(enable = "avx512f")]
15764 #[cfg_attr(test, assert_instr(vpandnq))] //should be vpandnd
15765 pub unsafe fn _mm512_andnot_epi32(a: __m512i, b: __m512i) -> __m512i {
15766 _mm512_and_epi32(_mm512_xor_epi32(a, _mm512_set1_epi32(u32::MAX as i32)), b)
15767 }
15768
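// --- Illustrative usage sketch (not part of the upstream file) ---
// `andnot` computes `!a & b` per lane, which the implementation above builds
// from an XOR with all-ones followed by an AND. The helper name
// `andnot_epi32_example` is invented here.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn andnot_epi32_example() {
    let a = _mm512_set1_epi32(0b0110);
    let b = _mm512_set1_epi32(0b1100);
    // !0b0110 & 0b1100 == 0b1000 in every lane.
    let r = _mm512_andnot_epi32(a, b);
    assert_eq!(_mm512_cmpeq_epi32_mask(r, _mm512_set1_epi32(0b1000)), 0xffff);
}
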
15769 /// Compute the bitwise NOT of packed 32-bit integers in a and then AND with b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15770 ///
15771 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_andnot_epi32&expand=311)
15772 #[inline]
15773 #[target_feature(enable = "avx512f")]
15774 #[cfg_attr(test, assert_instr(vpandnd))]
15775 pub unsafe fn _mm512_mask_andnot_epi32(
15776 src: __m512i,
15777 k: __mmask16,
15778 a: __m512i,
15779 b: __m512i,
15780 ) -> __m512i {
15781 let andnot = _mm512_andnot_epi32(a, b).as_i32x16();
15782 transmute(simd_select_bitmask(k, andnot, src.as_i32x16()))
15783 }
15784
15785 /// Compute the bitwise NOT of packed 32-bit integers in a and then AND with b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15786 ///
15787 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_andnot_epi32&expand=312)
15788 #[inline]
15789 #[target_feature(enable = "avx512f")]
15790 #[cfg_attr(test, assert_instr(vpandnd))]
15791 pub unsafe fn _mm512_maskz_andnot_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
15792 let andnot = _mm512_andnot_epi32(a, b).as_i32x16();
15793 let zero = _mm512_setzero_si512().as_i32x16();
15794 transmute(simd_select_bitmask(k, andnot, zero))
15795 }
15796
15797 /// Compute the bitwise NOT of 512 bits (composed of packed 64-bit integers) in a and then AND with b, and store the results in dst.
15798 ///
15799 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_andnot_epi64&expand=317)
15800 #[inline]
15801 #[target_feature(enable = "avx512f")]
15802 #[cfg_attr(test, assert_instr(vpandnq))]
15803 pub unsafe fn _mm512_andnot_epi64(a: __m512i, b: __m512i) -> __m512i {
15804 _mm512_and_epi64(_mm512_xor_epi64(a, _mm512_set1_epi64(u64::MAX as i64)), b)
15805 }
15806
15807 /// Compute the bitwise NOT of packed 64-bit integers in a and then AND with b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15808 ///
15809 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_andnot_epi64&expand=318)
15810 #[inline]
15811 #[target_feature(enable = "avx512f")]
15812 #[cfg_attr(test, assert_instr(vpandnq))]
15813 pub unsafe fn _mm512_mask_andnot_epi64(
15814 src: __m512i,
15815 k: __mmask8,
15816 a: __m512i,
15817 b: __m512i,
15818 ) -> __m512i {
15819 let andnot = _mm512_andnot_epi64(a, b).as_i64x8();
15820 transmute(simd_select_bitmask(k, andnot, src.as_i64x8()))
15821 }
15822
15823 /// Compute the bitwise NOT of packed 64-bit integers in a and then AND with b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15824 ///
15825 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_andnot_epi64&expand=319)
15826 #[inline]
15827 #[target_feature(enable = "avx512f")]
15828 #[cfg_attr(test, assert_instr(vpandnq))]
15829 pub unsafe fn _mm512_maskz_andnot_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
15830 let andnot = _mm512_andnot_epi64(a, b).as_i64x8();
15831 let zero = _mm512_setzero_si512().as_i64x8();
15832 transmute(simd_select_bitmask(k, andnot, zero))
15833 }
15834
15835 /// Compute the bitwise NOT of 512 bits (representing integer data) in a and then AND with b, and store the result in dst.
15836 ///
15837 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_andnot_si512&expand=340)
15838 #[inline]
15839 #[target_feature(enable = "avx512f")]
15840 #[cfg_attr(test, assert_instr(vpandnq))]
15841 pub unsafe fn _mm512_andnot_si512(a: __m512i, b: __m512i) -> __m512i {
15842 _mm512_and_epi64(_mm512_xor_epi64(a, _mm512_set1_epi64(u64::MAX as i64)), b)
15843 }
15844
15845 /// Compute the bitwise AND of 16-bit masks a and b, and store the result in k.
15846 ///
15847 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=kand_mask16&expand=3212)
15848 #[inline]
15849 #[target_feature(enable = "avx512f")]
15850 #[cfg_attr(test, assert_instr(and))] // generate normal and code instead of kandw
15851 pub unsafe fn _kand_mask16(a: __mmask16, b: __mmask16) -> __mmask16 {
15852 transmute(a & b)
15853 }
15854
15855 /// Compute the bitwise AND of 16-bit masks a and b, and store the result in k.
15856 ///
15857 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_kand&expand=3210)
15858 #[inline]
15859 #[target_feature(enable = "avx512f")]
15860 #[cfg_attr(test, assert_instr(and))] // generate normal and code instead of kandw
15861 pub unsafe fn _mm512_kand(a: __mmask16, b: __mmask16) -> __mmask16 {
15862 transmute(a & b)
15863 }
15864
15865 /// Compute the bitwise OR of 16-bit masks a and b, and store the result in k.
15866 ///
15867 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=kor_mask16&expand=3239)
15868 #[inline]
15869 #[target_feature(enable = "avx512f")]
15870 #[cfg_attr(test, assert_instr(or))] // generate normal or code instead of korw
15871 pub unsafe fn _kor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 {
15872 transmute(a | b)
15873 }
15874
15875 /// Compute the bitwise OR of 16-bit masks a and b, and store the result in k.
15876 ///
15877 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_kor&expand=3237)
15878 #[inline]
15879 #[target_feature(enable = "avx512f")]
15880 #[cfg_attr(test, assert_instr(or))] // generate normal or code instead of korw
15881 pub unsafe fn _mm512_kor(a: __mmask16, b: __mmask16) -> __mmask16 {
15882 transmute(a | b)
15883 }
15884
15885 /// Compute the bitwise XOR of 16-bit masks a and b, and store the result in k.
15886 ///
15887 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=kxor_mask16&expand=3291)
15888 #[inline]
15889 #[target_feature(enable = "avx512f")]
15890 #[cfg_attr(test, assert_instr(xor))] // generate normal xor code instead of kxorw
15891 pub unsafe fn _kxor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 {
15892 transmute(a ^ b)
15893 }
15894
15895 /// Compute the bitwise XOR of 16-bit masks a and b, and store the result in k.
15896 ///
15897 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_kxor&expand=3289)
15898 #[inline]
15899 #[target_feature(enable = "avx512f")]
15900 #[cfg_attr(test, assert_instr(xor))] // generate normal xor code instead of kxorw
15901 pub unsafe fn _mm512_kxor(a: __mmask16, b: __mmask16) -> __mmask16 {
15902 transmute(a ^ b)
15903 }
15904
15905 /// Compute the bitwise NOT of 16-bit mask a, and store the result in k.
15906 ///
15907 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=knot_mask16&expand=3233)
15908 #[inline]
15909 #[target_feature(enable = "avx512f")]
15910 pub unsafe fn _knot_mask16(a: __mmask16) -> __mmask16 {
15911 transmute(a ^ 0b11111111_11111111)
15912 }
15913
15914 /// Compute the bitwise NOT of 16-bit mask a, and store the result in k.
15915 ///
15916 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_knot&expand=3231)
15917 #[inline]
15918 #[target_feature(enable = "avx512f")]
15919 pub unsafe fn _mm512_knot(a: __mmask16) -> __mmask16 {
15920 transmute(a ^ 0b11111111_11111111)
15921 }
15922
15923 /// Compute the bitwise NOT of 16-bit masks a and then AND with b, and store the result in k.
15924 ///
15925 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=kandn_mask16&expand=3218)
15926 #[inline]
15927 #[target_feature(enable = "avx512f")]
15928 #[cfg_attr(test, assert_instr(not))] // generate normal and, not code instead of kandnw
15929 pub unsafe fn _kandn_mask16(a: __mmask16, b: __mmask16) -> __mmask16 {
15930 _mm512_kand(_mm512_knot(a), b)
15931 }
15932
15933 /// Compute the bitwise NOT of 16-bit masks a and then AND with b, and store the result in k.
15934 ///
15935 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_kandn&expand=3216)
15936 #[inline]
15937 #[target_feature(enable = "avx512f")]
15938 #[cfg_attr(test, assert_instr(not))] // generate normal and, not code instead of kandnw
15939 pub unsafe fn _mm512_kandn(a: __mmask16, b: __mmask16) -> __mmask16 {
15940 _mm512_kand(_mm512_knot(a), b)
15941 }
15942
15943 /// Compute the bitwise XNOR of 16-bit masks a and b, and store the result in k.
15944 ///
15945 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=kxnor_mask16&expand=3285)
15946 #[inline]
15947 #[target_feature(enable = "avx512f")]
15948 #[cfg_attr(test, assert_instr(xor))] // generate normal xor, not code instead of kxnorw
15949 pub unsafe fn _kxnor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 {
15950 _mm512_knot(_mm512_kxor(a, b))
15951 }
15952
15953 /// Compute the bitwise XNOR of 16-bit masks a and b, and store the result in k.
15954 ///
15955 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_kxnor&expand=3283)
15956 #[inline]
15957 #[target_feature(enable = "avx512f")]
15958 #[cfg_attr(test, assert_instr(xor))] // generate normal xor, not code instead of kxnorw
15959 pub unsafe fn _mm512_kxnor(a: __mmask16, b: __mmask16) -> __mmask16 {
15960 _mm512_knot(_mm512_kxor(a, b))
15961 }
15962
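// --- Illustrative usage sketch (not part of the upstream file) ---
// The 16-bit mask operations above are plain integer bit twiddling on
// `__mmask16`; this sketch checks each against the equivalent operator. The
// helper name `mask16_ops_example` and the mask constants are invented here.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn mask16_ops_example() {
    let a: __mmask16 = 0b11110000_00001111;
    let b: __mmask16 = 0b00111100_00111100;
    assert_eq!(_mm512_kand(a, b), a & b);
    assert_eq!(_mm512_kor(a, b), a | b);
    assert_eq!(_mm512_kxor(a, b), a ^ b);
    assert_eq!(_mm512_knot(a), !a);
    assert_eq!(_mm512_kandn(a, b), !a & b);
    assert_eq!(_mm512_kxnor(a, b), !(a ^ b));
}
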
15963 /// Copy 16-bit mask a to k.
15964 ///
15965 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm512_kmov&expand=3228)
15966 #[inline]
15967 #[target_feature(enable = "avx512f")]
15968 #[cfg_attr(test, assert_instr(mov))] // generate normal mov code instead of kmovw
15969 pub unsafe fn _mm512_kmov(a: __mmask16) -> __mmask16 {
15970 let r: u16 = a;
15971 transmute(r)
15972 }
15973
15974 /// Converts integer mask into bitmask, storing the result in dst.
15975 ///
15976 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_int2mask&expand=3189)
15977 #[inline]
15978 #[target_feature(enable = "avx512f")] // generate normal and code instead of kmovw
15979 pub unsafe fn _mm512_int2mask(mask: i32) -> __mmask16 {
15980 let r: u16 = mask as u16;
15981 transmute(r)
15982 }
15983
15984 /// Converts bit mask k1 into an integer value, storing the results in dst.
15985 ///
15986 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask2int&expand=3544)
15987 #[inline]
15988 #[target_feature(enable = "avx512f")]
15989 #[cfg_attr(test, assert_instr(mov))] // generate normal mov code instead of kmovw
15990 pub unsafe fn _mm512_mask2int(k1: __mmask16) -> i32 {
15991 let r: i32 = k1 as i32;
15992 transmute(r)
15993 }
15994
15995 /// Unpack and interleave 8 bits from masks a and b, and store the 16-bit result in k.
15996 ///
15997 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_kunpackb&expand=3280)
15998 #[inline]
15999 #[target_feature(enable = "avx512f")]
16000 #[cfg_attr(test, assert_instr(mov))] // generate normal and code instead of kunpckbw
16001 pub unsafe fn _mm512_kunpackb(a: __mmask16, b: __mmask16) -> __mmask16 {
16002 let a = a & 0b00000000_11111111;
16003 let b = b & 0b00000000_11111111;
16004 transmute((a << 8) | b)
16005 }
16006
16007 /// Performs bitwise OR between k1 and k2, storing the result in dst. CF flag is set if dst consists of all 1's.
16008 ///
16009 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_kortestc&expand=3247)
16010 #[inline]
16011 #[target_feature(enable = "avx512f")]
16012 #[cfg_attr(test, assert_instr(cmp))] // generate normal cmp code instead of kortestw
16013 pub unsafe fn _mm512_kortestc(a: __mmask16, b: __mmask16) -> i32 {
16014 let r = a | b;
16015 if r == 0b11111111_11111111 {
16016 1
16017 } else {
16018 0
16019 }
16020 }
16021
16022 /// Compute the bitwise AND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero.
16023 ///
16024 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_test_epi32_mask&expand=5890)
16025 #[inline]
16026 #[target_feature(enable = "avx512f")]
16027 #[cfg_attr(test, assert_instr(vptestmd))]
16028 pub unsafe fn _mm512_test_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 {
16029 let and = _mm512_and_epi32(a, b);
16030 let zero = _mm512_setzero_si512();
16031 _mm512_cmpneq_epi32_mask(and, zero)
16032 }
16033
16034 /// Compute the bitwise AND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero.
16035 ///
16036 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_test_epi32_mask&expand=5889)
16037 #[inline]
16038 #[target_feature(enable = "avx512f")]
16039 #[cfg_attr(test, assert_instr(vptestmd))]
16040 pub unsafe fn _mm512_mask_test_epi32_mask(k: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
16041 let and = _mm512_and_epi32(a, b);
16042 let zero = _mm512_setzero_si512();
16043 _mm512_mask_cmpneq_epi32_mask(k, and, zero)
16044 }
16045
16046 /// Compute the bitwise AND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero.
16047 ///
16048 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_test_epi64_mask&expand=5896)
16049 #[inline]
16050 #[target_feature(enable = "avx512f")]
16051 #[cfg_attr(test, assert_instr(vptestmq))]
16052 pub unsafe fn _mm512_test_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 {
16053 let and = _mm512_and_epi64(a, b);
16054 let zero = _mm512_setzero_si512();
16055 _mm512_cmpneq_epi64_mask(and, zero)
16056 }
16057
16058 /// Compute the bitwise AND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero.
16059 ///
16060 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_test_epi64_mask&expand=5895)
16061 #[inline]
16062 #[target_feature(enable = "avx512f")]
16063 #[cfg_attr(test, assert_instr(vptestmq))]
16064 pub unsafe fn _mm512_mask_test_epi64_mask(k: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
16065 let and = _mm512_and_epi64(a, b);
16066 let zero = _mm512_setzero_si512();
16067 _mm512_mask_cmpneq_epi64_mask(k, and, zero)
16068 }
16069
16070 /// Compute the bitwise NAND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k if the intermediate value is zero.
16071 ///
16072 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_testn_epi32_mask&expand=5921)
16073 #[inline]
16074 #[target_feature(enable = "avx512f")]
16075 #[cfg_attr(test, assert_instr(vptestnmd))]
16076 pub unsafe fn _mm512_testn_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 {
16077 let and = _mm512_and_epi32(a, b);
16078 let zero = _mm512_setzero_si512();
16079 _mm512_cmpeq_epi32_mask(and, zero)
16080 }
16081
16082 /// Compute the bitwise NAND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero.
16083 ///
16084 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_testn_epi32_mask&expand=5920)
16085 #[inline]
16086 #[target_feature(enable = "avx512f")]
16087 #[cfg_attr(test, assert_instr(vptestnmd))]
16088 pub unsafe fn _mm512_mask_testn_epi32_mask(k: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
16089 let and = _mm512_and_epi32(a, b);
16090 let zero = _mm512_setzero_si512();
16091 _mm512_mask_cmpeq_epi32_mask(k, and, zero)
16092 }
16093
16094 /// Compute the bitwise NAND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k if the intermediate value is zero.
16095 ///
16096 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_testn_epi64_mask&expand=5927)
16097 #[inline]
16098 #[target_feature(enable = "avx512f")]
16099 #[cfg_attr(test, assert_instr(vptestnmq))]
16100 pub unsafe fn _mm512_testn_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 {
16101 let and = _mm512_and_epi64(a, b);
16102 let zero = _mm512_setzero_si512();
16103 _mm512_cmpeq_epi64_mask(and, zero)
16104 }
16105
16106 /// Compute the bitwise NAND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero.
16107 ///
16108 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_testn_epi64_mask&expand=5926)
16109 #[inline]
16110 #[target_feature(enable = "avx512f")]
16111 #[cfg_attr(test, assert_instr(vptestnmq))]
16112 pub unsafe fn _mm512_mask_testn_epi64_mask(k: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
16113 let and = _mm512_and_epi64(a, b);
16114 let zero = _mm512_setzero_si512();
16115 _mm512_mask_cmpeq_epi64_mask(k, and, zero)
16116 }
16117
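// --- Illustrative usage sketch (not part of the upstream file) ---
// `test` sets mask bit i when `a[i] & b[i]` is non-zero, `testn` when it is
// zero, so the two results are complementary. The helper name
// `test_epi32_mask_example` and the constants are invented here.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn test_epi32_mask_example() {
    let a = _mm512_setr_epi32(1, 2, 4, 8, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0);
    let b = _mm512_set1_epi32(1);
    // Only lanes 0 and 8..12 have a non-zero intersection with `b`.
    assert_eq!(_mm512_test_epi32_mask(a, b), 0b0000_1111_0000_0001);
    assert_eq!(_mm512_testn_epi32_mask(a, b), 0b1111_0000_1111_1110);
}
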
16118 /// Store 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
16119 ///
16120 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_stream_ps&expand=5671)
16121 #[inline]
16122 #[target_feature(enable = "avx512f")]
16123 #[cfg_attr(test, assert_instr(vmovntps))]
16124 #[allow(clippy::cast_ptr_alignment)]
16125 pub unsafe fn _mm512_stream_ps(mem_addr: *mut f32, a: __m512) {
16126 intrinsics::nontemporal_store(mem_addr as *mut __m512, a);
16127 }
16128
16129 /// Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
16130 ///
16131 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_stream_pd&expand=5667)
16132 #[inline]
16133 #[target_feature(enable = "avx512f")]
16134 #[cfg_attr(test, assert_instr(vmovntps))] //should be vmovntpd
16135 #[allow(clippy::cast_ptr_alignment)]
16136 pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) {
16137 intrinsics::nontemporal_store(mem_addr as *mut __m512d, a);
16138 }
16139
16140 /// Store 512-bits of integer data from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
16141 ///
16142 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_stream_si512&expand=5675)
16143 #[inline]
16144 #[target_feature(enable = "avx512f")]
16145 #[cfg_attr(test, assert_instr(vmovntps))] //should be vmovntdq
16146 #[allow(clippy::cast_ptr_alignment)]
16147 pub unsafe fn _mm512_stream_si512(mem_addr: *mut i64, a: __m512i) {
16148 intrinsics::nontemporal_store(mem_addr as *mut __m512i, a);
16149 }
16150
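// --- Illustrative usage sketch (not part of the upstream file) ---
// Non-temporal stores require a 64-byte aligned destination; an sfence makes
// the store visible before the buffer is read again. The helper name
// `stream_ps_example` and the `Aligned` wrapper are invented here.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn stream_ps_example() {
    #[repr(align(64))]
    struct Aligned([f32; 16]);
    let mut out = Aligned([0.0; 16]);
    _mm512_stream_ps(out.0.as_mut_ptr(), _mm512_set1_ps(3.0));
    _mm_sfence();
    assert_eq!(out.0, [3.0; 16]);
}
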
16151 /// Sets packed single-precision (32-bit) floating-point elements in `dst` with the supplied values.
16152 ///
16153 /// [Intel's documentation]( https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,4909&text=_mm512_set_ps)
16154 #[inline]
16155 #[target_feature(enable = "avx512f")]
16156 pub unsafe fn _mm512_set_ps(
16157 e0: f32,
16158 e1: f32,
16159 e2: f32,
16160 e3: f32,
16161 e4: f32,
16162 e5: f32,
16163 e6: f32,
16164 e7: f32,
16165 e8: f32,
16166 e9: f32,
16167 e10: f32,
16168 e11: f32,
16169 e12: f32,
16170 e13: f32,
16171 e14: f32,
16172 e15: f32,
16173 ) -> __m512 {
16174 _mm512_setr_ps(
16175 e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0,
16176 )
16177 }
16178
16179 /// Sets packed single-precision (32-bit) floating-point elements in `dst` with the
16180 /// supplied values in reverse order.
16181 ///
16182 /// [Intel's documentation]( https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=727,1063,4909,1062,1062,4909&text=_mm512_set_ps)
16183 #[inline]
16184 #[target_feature(enable = "avx512f")]
16185 pub unsafe fn _mm512_setr_ps(
16186 e0: f32,
16187 e1: f32,
16188 e2: f32,
16189 e3: f32,
16190 e4: f32,
16191 e5: f32,
16192 e6: f32,
16193 e7: f32,
16194 e8: f32,
16195 e9: f32,
16196 e10: f32,
16197 e11: f32,
16198 e12: f32,
16199 e13: f32,
16200 e14: f32,
16201 e15: f32,
16202 ) -> __m512 {
16203 let r = f32x16::new(
16204 e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
16205 );
16206 transmute(r)
16207 }
16208
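// --- Illustrative usage sketch (not part of the upstream file) ---
// `_mm512_set_ps` takes its arguments highest element first while
// `_mm512_setr_ps` takes them lowest element first, so the two calls below
// build the same vector. The helper name `set_ps_order_example` is invented.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn set_ps_order_example() {
    let hi_first = _mm512_set_ps(
        15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
    );
    let lo_first = _mm512_setr_ps(
        0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
    );
    assert_eq!(_mm512_cmpeq_ps_mask(hi_first, lo_first), 0xffff);
}
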
16209 /// Broadcast 64-bit float `a` to all elements of `dst`.
16210 #[inline]
16211 #[target_feature(enable = "avx512f")]
16212 pub unsafe fn _mm512_set1_pd(a: f64) -> __m512d {
16213 transmute(f64x8::splat(a))
16214 }
16215
16216 /// Broadcast 32-bit float `a` to all elements of `dst`.
16217 #[inline]
16218 #[target_feature(enable = "avx512f")]
16219 pub unsafe fn _mm512_set1_ps(a: f32) -> __m512 {
16220 transmute(f32x16::splat(a))
16221 }
16222
16223 /// Sets packed 32-bit integers in `dst` with the supplied values.
16224 #[inline]
16225 #[target_feature(enable = "avx512f")]
16226 pub unsafe fn _mm512_set_epi32(
16227 e15: i32,
16228 e14: i32,
16229 e13: i32,
16230 e12: i32,
16231 e11: i32,
16232 e10: i32,
16233 e9: i32,
16234 e8: i32,
16235 e7: i32,
16236 e6: i32,
16237 e5: i32,
16238 e4: i32,
16239 e3: i32,
16240 e2: i32,
16241 e1: i32,
16242 e0: i32,
16243 ) -> __m512i {
16244 _mm512_setr_epi32(
16245 e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
16246 )
16247 }
16248
16249 /// Broadcast 8-bit integer a to all elements of dst.
16250 ///
16251 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_set1_epi8&expand=4972)
16252 #[inline]
16253 #[target_feature(enable = "avx512f")]
16254 pub unsafe fn _mm512_set1_epi8(a: i8) -> __m512i {
16255 transmute(i8x64::splat(a))
16256 }
16257
16258 /// Broadcast the low packed 16-bit integer from a to all elements of dst.
16259 ///
16260 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_set1_epi16&expand=4944)
16261 #[inline]
16262 #[target_feature(enable = "avx512f")]
16263 pub unsafe fn _mm512_set1_epi16(a: i16) -> __m512i {
16264 transmute(i16x32::splat(a))
16265 }
16266
16267 /// Broadcast 32-bit integer `a` to all elements of `dst`.
16268 #[inline]
16269 #[target_feature(enable = "avx512f")]
16270 pub unsafe fn _mm512_set1_epi32(a: i32) -> __m512i {
16271 transmute(i32x16::splat(a))
16272 }
16273
16274 /// Broadcast 32-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
16275 ///
16276 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_set1_epi32&expand=4951)
16277 #[inline]
16278 #[target_feature(enable = "avx512f")]
16279 #[cfg_attr(test, assert_instr(vpbroadcastd))]
16280 pub unsafe fn _mm512_mask_set1_epi32(src: __m512i, k: __mmask16, a: i32) -> __m512i {
16281 let r = _mm512_set1_epi32(a).as_i32x16();
16282 transmute(simd_select_bitmask(k, r, src.as_i32x16()))
16283 }
16284
16285 /// Broadcast 32-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
16286 ///
16287 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_set1_epi32&expand=4952)
16288 #[inline]
16289 #[target_feature(enable = "avx512f")]
16290 #[cfg_attr(test, assert_instr(vpbroadcastd))]
16291 pub unsafe fn _mm512_maskz_set1_epi32(k: __mmask16, a: i32) -> __m512i {
16292 let r = _mm512_set1_epi32(a).as_i32x16();
16293 let zero = _mm512_setzero_si512().as_i32x16();
16294 transmute(simd_select_bitmask(k, r, zero))
16295 }
16296
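// --- Illustrative usage sketch (not part of the upstream file) ---
// A masked broadcast splats a constant into the selected lanes only, leaving
// the rest to `src` (writemask) or zero (zeromask). The helper name
// `mask_set1_epi32_example` and the constants are invented here.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn mask_set1_epi32_example() {
    let src = _mm512_set1_epi32(7);
    // Odd-numbered lanes receive 9, even-numbered lanes keep 7.
    let r = _mm512_mask_set1_epi32(src, 0b1010_1010_1010_1010, 9);
    let e = _mm512_setr_epi32(7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9);
    assert_eq!(_mm512_cmpeq_epi32_mask(r, e), 0xffff);
}
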
16297 /// Broadcast 64-bit integer `a` to all elements of `dst`.
16298 #[inline]
16299 #[target_feature(enable = "avx512f")]
16300 pub unsafe fn _mm512_set1_epi64(a: i64) -> __m512i {
16301 transmute(i64x8::splat(a))
16302 }
16303
16304 /// Broadcast 64-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
16305 ///
16306 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_set1_epi64&expand=4959)
16307 #[inline]
16308 #[target_feature(enable = "avx512f")]
16309 #[cfg_attr(test, assert_instr(vpbroadcastq))]
16310 pub unsafe fn _mm512_mask_set1_epi64(src: __m512i, k: __mmask8, a: i64) -> __m512i {
16311 let r = _mm512_set1_epi64(a).as_i64x8();
16312 transmute(simd_select_bitmask(k, r, src.as_i64x8()))
16313 }
16314
16315 /// Broadcast 64-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
16316 ///
16317 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_set1_epi64&expand=4960)
16318 #[inline]
16319 #[target_feature(enable = "avx512f")]
16320 #[cfg_attr(test, assert_instr(vpbroadcastq))]
16321 pub unsafe fn _mm512_maskz_set1_epi64(k: __mmask8, a: i64) -> __m512i {
16322 let r = _mm512_set1_epi64(a).as_i64x8();
16323 let zero = _mm512_setzero_si512().as_i64x8();
16324 transmute(simd_select_bitmask(k, r, zero))
16325 }
16326
16327 /// Set packed 64-bit integers in dst with the repeated 4 element sequence.
16328 ///
16329 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_set4_epi64&expand=4983)
16330 #[inline]
16331 #[target_feature(enable = "avx512f")]
16332 pub unsafe fn _mm512_set4_epi64(d: i64, c: i64, b: i64, a: i64) -> __m512i {
16333 let r = i64x8::new(a, b, c, d, a, b, c, d);
16334 transmute(r)
16335 }
16336
16337 /// Set packed 64-bit integers in dst with the repeated 4 element sequence in reverse order.
16338 ///
16339 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_setr4_epi64&expand=5010)
16340 #[inline]
16341 #[target_feature(enable = "avx512f")]
16342 pub unsafe fn _mm512_setr4_epi64(d: i64, c: i64, b: i64, a: i64) -> __m512i {
16343 let r = i64x8::new(d, c, b, a, d, c, b, a);
16344 transmute(r)
16345 }
16346
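// --- Illustrative usage sketch (not part of the upstream file) ---
// `_mm512_set4_epi64` and `_mm512_setr4_epi64` take the same four values in
// opposite orders, so the two calls below build the same repeated pattern.
// The helper name `set4_epi64_example` is invented here.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn set4_epi64_example() {
    let r = _mm512_set4_epi64(3, 2, 1, 0);
    let e = _mm512_setr4_epi64(0, 1, 2, 3);
    assert_eq!(_mm512_cmpeq_epi64_mask(r, e), 0xff);
}
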
16347 /// Compare packed single-precision (32-bit) floating-point elements in a and b for less-than, and store the results in mask vector k.
16348 ///
16349 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cmplt_ps_mask&expand=1074)
16350 #[inline]
16351 #[target_feature(enable = "avx512f")]
16352 #[cfg_attr(test, assert_instr(vcmp))]
16353 pub unsafe fn _mm512_cmplt_ps_mask(a: __m512, b: __m512) -> __mmask16 {
16354 _mm512_cmp_ps_mask(a, b, _CMP_LT_OS)
16355 }
16356
16357 /// Compare packed single-precision (32-bit) floating-point elements in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
16358 ///
16359 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cmplt_ps_mask&expand=1075)
16360 #[inline]
16361 #[target_feature(enable = "avx512f")]
16362 #[cfg_attr(test, assert_instr(vcmp))]
16363 pub unsafe fn _mm512_mask_cmplt_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 {
16364 _mm512_mask_cmp_ps_mask(k1, a, b, _CMP_LT_OS)
16365 }
16366
16367 /// Compare packed single-precision (32-bit) floating-point elements in a and b for not-less-than, and store the results in mask vector k.
16368 ///
16369 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cmpnlt_ps_mask&expand=1154)
16370 #[inline]
16371 #[target_feature(enable = "avx512f")]
16372 #[cfg_attr(test, assert_instr(vcmp))]
16373 pub unsafe fn _mm512_cmpnlt_ps_mask(a: __m512, b: __m512) -> __mmask16 {
16374 _mm512_cmp_ps_mask(a, b, _CMP_NLT_US)
16375 }
16376
16377 /// Compare packed single-precision (32-bit) floating-point elements in a and b for not-less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
16378 ///
16379 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cmpnlt_ps_mask&expand=1155)
16380 #[inline]
16381 #[target_feature(enable = "avx512f")]
16382 #[cfg_attr(test, assert_instr(vcmp))]
16383 pub unsafe fn _mm512_mask_cmpnlt_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 {
16384 _mm512_mask_cmp_ps_mask(k1, a, b, _CMP_NLT_US)
16385 }
16386
16387 /// Compare packed single-precision (32-bit) floating-point elements in a and b for less-than-or-equal, and store the results in mask vector k.
16388 ///
16389 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cmple_ps_mask&expand=1013)
16390 #[inline]
16391 #[target_feature(enable = "avx512f")]
16392 #[cfg_attr(test, assert_instr(vcmp))]
16393 pub unsafe fn _mm512_cmple_ps_mask(a: __m512, b: __m512) -> __mmask16 {
16394 _mm512_cmp_ps_mask(a, b, _CMP_LE_OS)
16395 }
16396
16397 /// Compare packed single-precision (32-bit) floating-point elements in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
16398 ///
16399 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cmple_ps_mask&expand=1014)
16400 #[inline]
16401 #[target_feature(enable = "avx512f")]
16402 #[cfg_attr(test, assert_instr(vcmp))]
16403 pub unsafe fn _mm512_mask_cmple_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 {
16404 _mm512_mask_cmp_ps_mask(k1, a, b, _CMP_LE_OS)
16405 }
16406
16407 /// Compare packed single-precision (32-bit) floating-point elements in a and b for not-less-than-or-equal, and store the results in mask vector k.
16408 ///
16409 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cmpnle_ps_mask&expand=1146)
16410 #[inline]
16411 #[target_feature(enable = "avx512f")]
16412 #[cfg_attr(test, assert_instr(vcmp))]
16413 pub unsafe fn _mm512_cmpnle_ps_mask(a: __m512, b: __m512) -> __mmask16 {
16414 _mm512_cmp_ps_mask(a, b, _CMP_NLE_US)
16415 }
16416
16417 /// Compare packed single-precision (32-bit) floating-point elements in a and b for not-less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
16418 ///
16419 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cmpnle_ps_mask&expand=1147)
16420 #[inline]
16421 #[target_feature(enable = "avx512f")]
16422 #[cfg_attr(test, assert_instr(vcmp))]
16423 pub unsafe fn _mm512_mask_cmpnle_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 {
16424 _mm512_mask_cmp_ps_mask(k1, a, b, _CMP_NLE_US)
16425 }
16426
16427 /// Compare packed single-precision (32-bit) floating-point elements in a and b for equality, and store the results in mask vector k.
16428 ///
16429 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cmpeq_ps_mask&expand=828)
16430 #[inline]
16431 #[target_feature(enable = "avx512f")]
16432 #[cfg_attr(test, assert_instr(vcmp))]
16433 pub unsafe fn _mm512_cmpeq_ps_mask(a: __m512, b: __m512) -> __mmask16 {
16434 _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ)
16435 }
16436
16437 /// Compare packed single-precision (32-bit) floating-point elements in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
16438 ///
16439 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cmpeq_ps_mask&expand=829)
16440 #[inline]
16441 #[target_feature(enable = "avx512f")]
16442 #[cfg_attr(test, assert_instr(vcmp))]
16443 pub unsafe fn _mm512_mask_cmpeq_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 {
16444 _mm512_mask_cmp_ps_mask(k1, a, b, _CMP_EQ_OQ)
16445 }
16446
16447 /// Compare packed single-precision (32-bit) floating-point elements in a and b for not-equal, and store the results in mask vector k.
16448 ///
16449 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cmpneq_ps_mask&expand=1130)
16450 #[inline]
16451 #[target_feature(enable = "avx512f")]
16452 #[cfg_attr(test, assert_instr(vcmp))]
16453 pub unsafe fn _mm512_cmpneq_ps_mask(a: __m512, b: __m512) -> __mmask16 {
16454 _mm512_cmp_ps_mask(a, b, _CMP_NEQ_UQ)
16455 }
16456
16457 /// Compare packed single-precision (32-bit) floating-point elements in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
16458 ///
16459 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cmpneq_ps_mask&expand=1131)
16460 #[inline]
16461 #[target_feature(enable = "avx512f")]
16462 #[cfg_attr(test, assert_instr(vcmp))]
16463 pub unsafe fn _mm512_mask_cmpneq_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 {
16464 _mm512_mask_cmp_ps_mask(k1, a, b, _CMP_NEQ_UQ)
16465 }
16466
16467 /// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
16468 ///
16469 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cmp_ps_mask&expand=749)
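///
/// A usage sketch (not from upstream; assumes `avx512f`) using one of the `_CMP_*`
/// predicate constants from this crate:
///
/// ```ignore
/// let a = _mm512_set1_ps(1.0);
/// let b = _mm512_set1_ps(f32::NAN);
/// // _CMP_UNORD_Q is true whenever at least one operand is NaN
/// let k = _mm512_cmp_ps_mask(a, b, _CMP_UNORD_Q);
/// assert_eq!(k, 0xFFFF);
/// ```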
16470 #[inline]
16471 #[target_feature(enable = "avx512f")]
16472 #[rustc_args_required_const(2)]
16473 #[cfg_attr(test, assert_instr(vcmp, imm8 = 0))]
16474 pub unsafe fn _mm512_cmp_ps_mask(a: __m512, b: __m512, imm8: i32) -> __mmask16 {
16475 let neg_one = -1;
16476 macro_rules! call {
16477 ($imm5:expr) => {
16478 vcmpps(
16479 a.as_f32x16(),
16480 b.as_f32x16(),
16481 $imm5,
16482 neg_one,
16483 _MM_FROUND_CUR_DIRECTION,
16484 )
16485 };
16486 }
16487 let r = constify_imm5!(imm8, call);
16488 transmute(r)
16489 }
16490
16491 /// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
16492 ///
16493 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cmp_ps_mask&expand=750)
16494 #[inline]
16495 #[target_feature(enable = "avx512f")]
16496 #[rustc_args_required_const(3)]
16497 #[cfg_attr(test, assert_instr(vcmp, imm8 = 0))]
16498 pub unsafe fn _mm512_mask_cmp_ps_mask(k1: __mmask16, a: __m512, b: __m512, imm8: i32) -> __mmask16 {
16499 macro_rules! call {
16500 ($imm5:expr) => {
16501 vcmpps(
16502 a.as_f32x16(),
16503 b.as_f32x16(),
16504 $imm5,
16505 k1 as i16,
16506 _MM_FROUND_CUR_DIRECTION,
16507 )
16508 };
16509 }
16510 let r = constify_imm5!(imm8, call);
16511 transmute(r)
16512 }
16513
16514 /// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.\
16515 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16516 ///
16517 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_cmp_round_ps_mask&expand=753)
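///
/// A usage sketch (not from upstream; assumes `avx512f`), passing `_MM_FROUND_NO_EXC`
/// to suppress floating-point exception reporting:
///
/// ```ignore
/// let a = _mm512_set1_ps(1.0);
/// let b = _mm512_set1_ps(2.0);
/// let k = _mm512_cmp_round_ps_mask(a, b, _CMP_LT_OS, _MM_FROUND_NO_EXC);
/// assert_eq!(k, 0xFFFF);
/// ```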
16518 #[inline]
16519 #[target_feature(enable = "avx512f")]
16520 #[rustc_args_required_const(2, 3)]
16521 #[cfg_attr(test, assert_instr(vcmp, imm8 = 0, sae = 4))]
16522 pub unsafe fn _mm512_cmp_round_ps_mask(a: __m512, b: __m512, imm8: i32, sae: i32) -> __mmask16 {
16523 let neg_one = -1;
16524 macro_rules! call {
16525 ($imm5:expr, $imm4:expr) => {
16526 vcmpps(a.as_f32x16(), b.as_f32x16(), $imm5, neg_one, $imm4)
16527 };
16528 }
16529 let r = constify_imm5_sae!(imm8, sae, call);
16530 transmute(r)
16531 }
16532
16533 /// Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).\
16534 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16535 ///
16536 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_cmp_round_ps_mask&expand=754)
16537 #[inline]
16538 #[target_feature(enable = "avx512f")]
16539 #[rustc_args_required_const(3, 4)]
16540 #[cfg_attr(test, assert_instr(vcmp, imm8 = 0, sae = 4))]
16541 pub unsafe fn _mm512_mask_cmp_round_ps_mask(
16542 k1: __mmask16,
16543 a: __m512,
16544 b: __m512,
16545 imm8: i32,
16546 sae: i32,
16547 ) -> __mmask16 {
16548 macro_rules! call {
16549 ($imm5:expr, $imm4:expr) => {
16550 vcmpps(a.as_f32x16(), b.as_f32x16(), $imm5, k1 as i16, $imm4)
16551 };
16552 }
16553 let r = constify_imm5_sae!(imm8, sae, call);
16554 transmute(r)
16555 }
16556
16557 /// Compare packed single-precision (32-bit) floating-point elements in a and b to see if neither is NaN, and store the results in mask vector k.
16558 ///
16559 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpord_ps_mask&expand=1162)
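///
/// A usage sketch (not from upstream; assumes `avx512f`):
///
/// ```ignore
/// let a = _mm512_set1_ps(1.0);
/// let b = _mm512_set1_ps(f32::NAN);
/// // no lane is "ordered" because b is NaN everywhere
/// let k = _mm512_cmpord_ps_mask(a, b);
/// assert_eq!(k, 0);
/// ```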
16560 #[inline]
16561 #[target_feature(enable = "avx512f")]
16562 #[cfg_attr(test, assert_instr(vcmp))]
16563 pub unsafe fn _mm512_cmpord_ps_mask(a: __m512, b: __m512) -> __mmask16 {
16564 _mm512_cmp_ps_mask(a, b, _CMP_ORD_Q)
16565 }
16566
16567 /// Compare packed single-precision (32-bit) floating-point elements in a and b to see if neither is NaN, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
16568 ///
16569 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpord_ps_mask&expand=1163)
16570 #[inline]
16571 #[target_feature(enable = "avx512f")]
16572 #[cfg_attr(test, assert_instr(vcmp))]
16573 pub unsafe fn _mm512_mask_cmpord_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 {
16574 _mm512_mask_cmp_ps_mask(k1, a, b, _CMP_ORD_Q)
16575 }
16576
16577 /// Compare packed single-precision (32-bit) floating-point elements in a and b to see if either is NaN, and store the results in mask vector k.
16578 ///
16579 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpunord_ps_mask&expand=1170)
16580 #[inline]
16581 #[target_feature(enable = "avx512f")]
16582 #[cfg_attr(test, assert_instr(vcmp))]
16583 pub unsafe fn _mm512_cmpunord_ps_mask(a: __m512, b: __m512) -> __mmask16 {
16584 _mm512_cmp_ps_mask(a, b, _CMP_UNORD_Q)
16585 }
16586
16587 /// Compare packed single-precision (32-bit) floating-point elements in a and b to see if either is NaN, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
16588 ///
16589 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpunord_ps_mask&expand=1171)
16590 #[inline]
16591 #[target_feature(enable = "avx512f")]
16592 #[cfg_attr(test, assert_instr(vcmp))]
16593 pub unsafe fn _mm512_mask_cmpunord_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 {
16594 _mm512_mask_cmp_ps_mask(k1, a, b, _CMP_UNORD_Q)
16595 }
16596
16597 /// Compare packed double-precision (64-bit) floating-point elements in a and b for less-than, and store the results in mask vector k.
16598 ///
16599 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmplt_pd_mask&expand=1071)
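///
/// A usage sketch (not from upstream; assumes `avx512f`); the result is an 8-bit
/// mask because `__m512d` holds 8 lanes:
///
/// ```ignore
/// let a = _mm512_set1_pd(1.0);
/// let b = _mm512_set1_pd(2.0);
/// let k = _mm512_cmplt_pd_mask(a, b);
/// assert_eq!(k, 0xFF);
/// ```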
16600 #[inline]
16601 #[target_feature(enable = "avx512f")]
16602 #[cfg_attr(test, assert_instr(vcmp))]
16603 pub unsafe fn _mm512_cmplt_pd_mask(a: __m512d, b: __m512d) -> __mmask8 {
16604 _mm512_cmp_pd_mask(a, b, _CMP_LT_OS)
16605 }
16606
16607 /// Compare packed double-precision (64-bit) floating-point elements in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
16608 ///
16609 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmplt_pd_mask&expand=1072)
16610 #[inline]
16611 #[target_feature(enable = "avx512f")]
16612 #[cfg_attr(test, assert_instr(vcmp))]
16613 pub unsafe fn _mm512_mask_cmplt_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 {
16614 _mm512_mask_cmp_pd_mask(k1, a, b, _CMP_LT_OS)
16615 }
16616
16617 /// Compare packed double-precision (64-bit) floating-point elements in a and b for not-less-than, and store the results in mask vector k.
16618 ///
16619 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpnlt_pd_mask&expand=1151)
16620 #[inline]
16621 #[target_feature(enable = "avx512f")]
16622 #[cfg_attr(test, assert_instr(vcmp))]
16623 pub unsafe fn _mm512_cmpnlt_pd_mask(a: __m512d, b: __m512d) -> __mmask8 {
16624 _mm512_cmp_pd_mask(a, b, _CMP_NLT_US)
16625 }
16626
16627 /// Compare packed double-precision (64-bit) floating-point elements in a and b for not-less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
16628 ///
16629 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpnlt_pd_mask&expand=1152)
16630 #[inline]
16631 #[target_feature(enable = "avx512f")]
16632 #[cfg_attr(test, assert_instr(vcmp))]
16633 pub unsafe fn _mm512_mask_cmpnlt_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 {
16634 _mm512_mask_cmp_pd_mask(k1, a, b, _CMP_NLT_US)
16635 }
16636
16637 /// Compare packed double-precision (64-bit) floating-point elements in a and b for less-than-or-equal, and store the results in mask vector k.
16638 ///
16639 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmple_pd_mask&expand=1010)
16640 #[inline]
16641 #[target_feature(enable = "avx512f")]
16642 #[cfg_attr(test, assert_instr(vcmp))]
16643 pub unsafe fn _mm512_cmple_pd_mask(a: __m512d, b: __m512d) -> __mmask8 {
16644 _mm512_cmp_pd_mask(a, b, _CMP_LE_OS)
16645 }
16646
16647 /// Compare packed double-precision (64-bit) floating-point elements in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
16648 ///
16649 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmple_pd_mask&expand=1011)
16650 #[inline]
16651 #[target_feature(enable = "avx512f")]
16652 #[cfg_attr(test, assert_instr(vcmp))]
16653 pub unsafe fn _mm512_mask_cmple_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 {
16654 _mm512_mask_cmp_pd_mask(k1, a, b, _CMP_LE_OS)
16655 }
16656
16657 /// Compare packed double-precision (64-bit) floating-point elements in a and b for not-less-than-or-equal, and store the results in mask vector k.
16658 ///
16659 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpnle_pd_mask&expand=1143)
16660 #[inline]
16661 #[target_feature(enable = "avx512f")]
16662 #[cfg_attr(test, assert_instr(vcmp))]
16663 pub unsafe fn _mm512_cmpnle_pd_mask(a: __m512d, b: __m512d) -> __mmask8 {
16664 _mm512_cmp_pd_mask(a, b, _CMP_NLE_US)
16665 }
16666
16667 /// Compare packed double-precision (64-bit) floating-point elements in a and b for not-less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
16668 ///
16669 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpnle_pd_mask&expand=1144)
16670 #[inline]
16671 #[target_feature(enable = "avx512f")]
16672 #[cfg_attr(test, assert_instr(vcmp))]
16673 pub unsafe fn _mm512_mask_cmpnle_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 {
16674 _mm512_mask_cmp_pd_mask(k1, a, b, _CMP_NLE_US)
16675 }
16676
16677 /// Compare packed double-precision (64-bit) floating-point elements in a and b for equality, and store the results in mask vector k.
16678 ///
16679 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpeq_pd_mask&expand=822)
16680 #[inline]
16681 #[target_feature(enable = "avx512f")]
16682 #[cfg_attr(test, assert_instr(vcmp))]
16683 pub unsafe fn _mm512_cmpeq_pd_mask(a: __m512d, b: __m512d) -> __mmask8 {
16684 _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ)
16685 }
16686
16687 /// Compare packed double-precision (64-bit) floating-point elements in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
16688 ///
16689 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpeq_pd_mask&expand=823)
16690 #[inline]
16691 #[target_feature(enable = "avx512f")]
16692 #[cfg_attr(test, assert_instr(vcmp))]
16693 pub unsafe fn _mm512_mask_cmpeq_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 {
16694 _mm512_mask_cmp_pd_mask(k1, a, b, _CMP_EQ_OQ)
16695 }
16696
16697 /// Compare packed double-precision (64-bit) floating-point elements in a and b for not-equal, and store the results in mask vector k.
16698 ///
16699 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpneq_pd_mask&expand=1127)
16700 #[inline]
16701 #[target_feature(enable = "avx512f")]
16702 #[cfg_attr(test, assert_instr(vcmp))]
16703 pub unsafe fn _mm512_cmpneq_pd_mask(a: __m512d, b: __m512d) -> __mmask8 {
16704 _mm512_cmp_pd_mask(a, b, _CMP_NEQ_UQ)
16705 }
16706
16707 /// Compare packed double-precision (64-bit) floating-point elements in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
16708 ///
16709 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpneq_pd_mask&expand=1128)
16710 #[inline]
16711 #[target_feature(enable = "avx512f")]
16712 #[cfg_attr(test, assert_instr(vcmp))]
16713 pub unsafe fn _mm512_mask_cmpneq_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 {
16714 _mm512_mask_cmp_pd_mask(k1, a, b, _CMP_NEQ_UQ)
16715 }
16716
16717 /// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
16718 ///
16719 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_pd_mask&expand=741)
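///
/// A usage sketch (not from upstream; assumes `avx512f`):
///
/// ```ignore
/// let a = _mm512_set1_pd(1.0);
/// let b = _mm512_set1_pd(2.0);
/// let k = _mm512_cmp_pd_mask(a, b, _CMP_LE_OS);
/// assert_eq!(k, 0xFF);
/// ```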
16720 #[inline]
16721 #[target_feature(enable = "avx512f")]
16722 #[rustc_args_required_const(2)]
16723 #[cfg_attr(test, assert_instr(vcmp, imm8 = 0))]
16724 pub unsafe fn _mm512_cmp_pd_mask(a: __m512d, b: __m512d, imm8: i32) -> __mmask8 {
16725 let neg_one = -1;
16726 macro_rules! call {
16727 ($imm5:expr) => {
16728 vcmppd(
16729 a.as_f64x8(),
16730 b.as_f64x8(),
16731 $imm5,
16732 neg_one,
16733 _MM_FROUND_CUR_DIRECTION,
16734 )
16735 };
16736 }
16737 let r = constify_imm5!(imm8, call);
16738 transmute(r)
16739 }
16740
16741 /// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
16742 ///
16743 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_pd_mask&expand=742)
16744 #[inline]
16745 #[target_feature(enable = "avx512f")]
16746 #[rustc_args_required_const(3)]
16747 #[cfg_attr(test, assert_instr(vcmp, imm8 = 0))]
16748 pub unsafe fn _mm512_mask_cmp_pd_mask(k1: __mmask8, a: __m512d, b: __m512d, imm8: i32) -> __mmask8 {
16749 macro_rules! call {
16750 ($imm5:expr) => {
16751 vcmppd(
16752 a.as_f64x8(),
16753 b.as_f64x8(),
16754 $imm5,
16755 k1 as i8,
16756 _MM_FROUND_CUR_DIRECTION,
16757 )
16758 };
16759 }
16760 let r = constify_imm5!(imm8, call);
16761 transmute(r)
16762 }
16763
16764 /// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.\
16765 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16766 ///
16767 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_round_pd_mask&expand=751)
16768 #[inline]
16769 #[target_feature(enable = "avx512f")]
16770 #[rustc_args_required_const(2, 3)]
16771 #[cfg_attr(test, assert_instr(vcmp, imm8 = 0, sae = 4))]
16772 pub unsafe fn _mm512_cmp_round_pd_mask(a: __m512d, b: __m512d, imm8: i32, sae: i32) -> __mmask8 {
16773 let neg_one = -1;
16774 macro_rules! call {
16775 ($imm5:expr, $imm4:expr) => {
16776 vcmppd(a.as_f64x8(), b.as_f64x8(), $imm5, neg_one, $imm4)
16777 };
16778 }
16779 let r = constify_imm5_sae!(imm8, sae, call);
16780 transmute(r)
16781 }
16782
16783 /// Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).\
16784 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16785 ///
16786 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_round_pd_mask&expand=752)
16787 #[inline]
16788 #[target_feature(enable = "avx512f")]
16789 #[rustc_args_required_const(3, 4)]
16790 #[cfg_attr(test, assert_instr(vcmp, imm8 = 0, sae = 4))]
16791 pub unsafe fn _mm512_mask_cmp_round_pd_mask(
16792 k1: __mmask8,
16793 a: __m512d,
16794 b: __m512d,
16795 imm8: i32,
16796 sae: i32,
16797 ) -> __mmask8 {
16798 macro_rules! call {
16799 ($imm5:expr, $imm4:expr) => {
16800 vcmppd(a.as_f64x8(), b.as_f64x8(), $imm5, k1 as i8, $imm4)
16801 };
16802 }
16803 let r = constify_imm5_sae!(imm8, sae, call);
16804 transmute(r)
16805 }
16806
16807 /// Compare packed double-precision (64-bit) floating-point elements in a and b to see if neither is NaN, and store the results in mask vector k.
16808 ///
16809 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpord_pd_mask&expand=1159)
16810 #[inline]
16811 #[target_feature(enable = "avx512f")]
16812 #[cfg_attr(test, assert_instr(vcmp))]
16813 pub unsafe fn _mm512_cmpord_pd_mask(a: __m512d, b: __m512d) -> __mmask8 {
16814 _mm512_cmp_pd_mask(a, b, _CMP_ORD_Q)
16815 }
16816
16817 /// Compare packed double-precision (64-bit) floating-point elements in a and b to see if neither is NaN, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
16818 ///
16819 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpord_pd_mask&expand=1160)
16820 #[inline]
16821 #[target_feature(enable = "avx512f")]
16822 #[cfg_attr(test, assert_instr(vcmp))]
16823 pub unsafe fn _mm512_mask_cmpord_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 {
16824 _mm512_mask_cmp_pd_mask(k1, a, b, _CMP_ORD_Q)
16825 }
16826
16827 /// Compare packed double-precision (64-bit) floating-point elements in a and b to see if either is NaN, and store the results in mask vector k.
16828 ///
16829 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpunord_pd_mask&expand=1167)
16830 #[inline]
16831 #[target_feature(enable = "avx512f")]
16832 #[cfg_attr(test, assert_instr(vcmp))]
16833 pub unsafe fn _mm512_cmpunord_pd_mask(a: __m512d, b: __m512d) -> __mmask8 {
16834 _mm512_cmp_pd_mask(a, b, _CMP_UNORD_Q)
16835 }
16836
16837 /// Compare packed double-precision (64-bit) floating-point elements in a and b to see if either is NaN, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
16838 ///
16839 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpunord_pd_mask&expand=1168)
16840 #[inline]
16841 #[target_feature(enable = "avx512f")]
16842 #[cfg_attr(test, assert_instr(vcmp))]
16843 pub unsafe fn _mm512_mask_cmpunord_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 {
16844 _mm512_mask_cmp_pd_mask(k1, a, b, _CMP_UNORD_Q)
16845 }
16846
16847 /// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k.
16848 ///
16849 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_ss_mask&expand=763)
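///
/// A usage sketch (not from upstream; assumes `avx512f` together with `sse`);
/// only bit 0 of the returned mask reflects the comparison of the lowest elements:
///
/// ```ignore
/// let a = _mm_set_ss(1.0);
/// let b = _mm_set_ss(2.0);
/// let k = _mm_cmp_ss_mask(a, b, _CMP_LT_OS);
/// assert_eq!(k & 1, 1);
/// ```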
16850 #[inline]
16851 #[target_feature(enable = "avx512f")]
16852 #[rustc_args_required_const(2)]
16853 #[cfg_attr(test, assert_instr(vcmp, imm8 = 0))]
16854 pub unsafe fn _mm_cmp_ss_mask(a: __m128, b: __m128, imm8: i32) -> __mmask8 {
16855 let neg_one = -1;
16856 macro_rules! call {
16857 ($imm5:expr) => {
16858 vcmpss(a, b, $imm5, neg_one, _MM_FROUND_CUR_DIRECTION)
16859 };
16860 }
16861 let r = constify_imm5!(imm8, call);
16862 transmute(r)
16863 }
16864
16865 /// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k using zeromask k1 (the element is zeroed out when mask bit 0 is not set).
16866 ///
16867 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_ss_mask&expand=764)
16868 #[inline]
16869 #[target_feature(enable = "avx512f")]
16870 #[rustc_args_required_const(3)]
16871 #[cfg_attr(test, assert_instr(vcmp, imm8 = 0))]
16872 pub unsafe fn _mm_mask_cmp_ss_mask(k1: __mmask8, a: __m128, b: __m128, imm8: i32) -> __mmask8 {
16873 macro_rules! call {
16874 ($imm5:expr) => {
16875 vcmpss(a, b, $imm5, k1 as i8, _MM_FROUND_CUR_DIRECTION)
16876 };
16877 }
16878 let r = constify_imm5!(imm8, call);
16879 transmute(r)
16880 }
16881
16882 /// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k.\
16883 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16884 ///
16885 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_round_ss_mask&expand=757)
16886 #[inline]
16887 #[target_feature(enable = "avx512f")]
16888 #[rustc_args_required_const(2, 3)]
16889 #[cfg_attr(test, assert_instr(vcmp, imm8 = 0, sae = 4))]
16890 pub unsafe fn _mm_cmp_round_ss_mask(a: __m128, b: __m128, imm8: i32, sae: i32) -> __mmask8 {
16891 let neg_one = -1;
16892 macro_rules! call {
16893 ($imm5:expr, $imm4:expr) => {
16894 vcmpss(a, b, $imm5, neg_one, $imm4)
16895 };
16896 }
16897 let r = constify_imm5_sae!(imm8, sae, call);
16898 transmute(r)
16899 }
16900
16901 /// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k using zeromask k1 (the element is zeroed out when mask bit 0 is not set).\
16902 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16903 ///
16904 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_round_ss_mask&expand=758)
16905 #[inline]
16906 #[target_feature(enable = "avx512f")]
16907 #[rustc_args_required_const(3, 4)]
16908 #[cfg_attr(test, assert_instr(vcmp, imm8 = 0, sae = 4))]
16909 pub unsafe fn _mm_mask_cmp_round_ss_mask(
16910 k1: __mmask8,
16911 a: __m128,
16912 b: __m128,
16913 imm8: i32,
16914 sae: i32,
16915 ) -> __mmask8 {
16916 macro_rules! call {
16917 ($imm5:expr, $imm4:expr) => {
16918 vcmpss(a, b, $imm5, k1 as i8, $imm4)
16919 };
16920 }
16921 let r = constify_imm5_sae!(imm8, sae, call);
16922 transmute(r)
16923 }
16924
16925 /// Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k.
16926 ///
16927 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_sd_mask&expand=760)
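///
/// A usage sketch (not from upstream; assumes `avx512f` together with `sse2`):
///
/// ```ignore
/// let a = _mm_set_sd(1.0);
/// let b = _mm_set_sd(2.0);
/// let k = _mm_cmp_sd_mask(a, b, _CMP_LE_OS);
/// assert_eq!(k & 1, 1); // bit 0 reflects the lowest element
/// ```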
16928 #[inline]
16929 #[target_feature(enable = "avx512f")]
16930 #[rustc_args_required_const(2)]
16931 #[cfg_attr(test, assert_instr(vcmp, imm8 = 0))]
16932 pub unsafe fn _mm_cmp_sd_mask(a: __m128d, b: __m128d, imm8: i32) -> __mmask8 {
16933 let neg_one = -1;
16934 macro_rules! call {
16935 ($imm5:expr) => {
16936 vcmpsd(a, b, $imm5, neg_one, _MM_FROUND_CUR_DIRECTION)
16937 };
16938 }
16939 let r = constify_imm5!(imm8, call);
16940 transmute(r)
16941 }
16942
16943 /// Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k using zeromask k1 (the element is zeroed out when mask bit 0 is not set).
16944 ///
16945 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_sd_mask&expand=761)
16946 #[inline]
16947 #[target_feature(enable = "avx512f")]
16948 #[rustc_args_required_const(3)]
16949 #[cfg_attr(test, assert_instr(vcmp, imm8 = 0))]
16950 pub unsafe fn _mm_mask_cmp_sd_mask(k1: __mmask8, a: __m128d, b: __m128d, imm8: i32) -> __mmask8 {
16951 macro_rules! call {
16952 ($imm5:expr) => {
16953 vcmpsd(a, b, $imm5, k1 as i8, _MM_FROUND_CUR_DIRECTION)
16954 };
16955 }
16956 let r = constify_imm5!(imm8, call);
16957 transmute(r)
16958 }
16959
16960 /// Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k.\
16961 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16962 ///
16963 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_round_sd_mask&expand=755)
16964 #[inline]
16965 #[target_feature(enable = "avx512f")]
16966 #[rustc_args_required_const(2, 3)]
16967 #[cfg_attr(test, assert_instr(vcmp, imm8 = 0, sae = 4))]
16968 pub unsafe fn _mm_cmp_round_sd_mask(a: __m128d, b: __m128d, imm8: i32, sae: i32) -> __mmask8 {
16969 let neg_one = -1;
16970 macro_rules! call {
16971 ($imm5:expr, $imm4:expr) => {
16972 vcmpsd(a, b, $imm5, neg_one, $imm4)
16973 };
16974 }
16975 let r = constify_imm5_sae!(imm8, sae, call);
16976 transmute(r)
16977 }
16978
16979 /// Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k using zeromask k1 (the element is zeroed out when mask bit 0 is not set).\
16980 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16981 ///
16982 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_round_sd_mask&expand=756)
16983 #[inline]
16984 #[target_feature(enable = "avx512f")]
16985 #[rustc_args_required_const(3, 4)]
16986 #[cfg_attr(test, assert_instr(vcmp, imm8 = 0, sae = 4))]
16987 pub unsafe fn _mm_mask_cmp_round_sd_mask(
16988 k1: __mmask8,
16989 a: __m128d,
16990 b: __m128d,
16991 imm8: i32,
16992 sae: i32,
16993 ) -> __mmask8 {
16994 macro_rules! call {
16995 ($imm5:expr, $imm4:expr) => {
16996 vcmpsd(a, b, $imm5, k1 as i8, $imm4)
16997 };
16998 }
16999 let r = constify_imm5_sae!(imm8, sae, call);
17000 transmute(r)
17001 }
17002
17003 /// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k.
17004 ///
17005 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmplt_epu32_mask&expand=1056)
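///
/// A usage sketch (not from upstream; assumes `avx512f`), showing that the
/// comparison is unsigned even though the vector is built from `i32` values:
///
/// ```ignore
/// let a = _mm512_set1_epi32(-1); // reinterpreted as u32::MAX
/// let b = _mm512_set1_epi32(1);
/// let k = _mm512_cmplt_epu32_mask(a, b);
/// assert_eq!(k, 0); // u32::MAX is not less than 1
/// ```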
17006 #[inline]
17007 #[target_feature(enable = "avx512f")]
17008 #[cfg_attr(test, assert_instr(vpcmp))]
17009 pub unsafe fn _mm512_cmplt_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 {
17010 simd_bitmask::<u32x16, _>(simd_lt(a.as_u32x16(), b.as_u32x16()))
17011 }
17012
17013 /// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
17014 ///
17015 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmplt_epu32_mask&expand=1057)
17016 #[inline]
17017 #[target_feature(enable = "avx512f")]
17018 #[cfg_attr(test, assert_instr(vpcmp))]
17019 pub unsafe fn _mm512_mask_cmplt_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
17020 _mm512_cmplt_epu32_mask(a, b) & k1
17021 }
17022
17023 /// Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in mask vector k.
17024 ///
17025 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpgt_epu32_mask&expand=933)
17026 #[inline]
17027 #[target_feature(enable = "avx512f")]
17028 #[cfg_attr(test, assert_instr(vpcmp))]
17029 pub unsafe fn _mm512_cmpgt_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 {
17030 simd_bitmask::<u32x16, _>(simd_gt(a.as_u32x16(), b.as_u32x16()))
17031 }
17032
17033 /// Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
17034 ///
17035 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpgt_epu32_mask&expand=934)
17036 #[inline]
17037 #[target_feature(enable = "avx512f")]
17038 #[cfg_attr(test, assert_instr(vpcmp))]
17039 pub unsafe fn _mm512_mask_cmpgt_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
17040 _mm512_cmpgt_epu32_mask(a, b) & k1
17041 }
17042
17043 /// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
17044 ///
17045 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmple_epu32_mask&expand=995)
17046 #[inline]
17047 #[target_feature(enable = "avx512f")]
17048 #[cfg_attr(test, assert_instr(vpcmp))]
17049 pub unsafe fn _mm512_cmple_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 {
17050 simd_bitmask::<u32x16, _>(simd_le(a.as_u32x16(), b.as_u32x16()))
17051 }
17052
17053 /// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
17054 ///
17055 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmple_epu32_mask&expand=996)
17056 #[inline]
17057 #[target_feature(enable = "avx512f")]
17058 #[cfg_attr(test, assert_instr(vpcmp))]
17059 pub unsafe fn _mm512_mask_cmple_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
17060 _mm512_cmple_epu32_mask(a, b) & k1
17061 }
17062
17063 /// Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
17064 ///
17065 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpge_epu32_mask&expand=873)
17066 #[inline]
17067 #[target_feature(enable = "avx512f")]
17068 #[cfg_attr(test, assert_instr(vpcmp))]
17069 pub unsafe fn _mm512_cmpge_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 {
17070 simd_bitmask::<u32x16, _>(simd_ge(a.as_u32x16(), b.as_u32x16()))
17071 }
17072
17073 /// Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
17074 ///
17075 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpge_epu32_mask&expand=874)
17076 #[inline]
17077 #[target_feature(enable = "avx512f")]
17078 #[cfg_attr(test, assert_instr(vpcmp))]
17079 pub unsafe fn _mm512_mask_cmpge_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
17080 _mm512_cmpge_epu32_mask(a, b) & k1
17081 }
17082
17083 /// Compare packed unsigned 32-bit integers in a and b for equality, and store the results in mask vector k.
17084 ///
17085 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpeq_epu32_mask&expand=807)
17086 #[inline]
17087 #[target_feature(enable = "avx512f")]
17088 #[cfg_attr(test, assert_instr(vpcmp))]
17089 pub unsafe fn _mm512_cmpeq_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 {
17090 simd_bitmask::<u32x16, _>(simd_eq(a.as_u32x16(), b.as_u32x16()))
17091 }
17092
17093 /// Compare packed unsigned 32-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
17094 ///
17095 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpeq_epu32_mask&expand=808)
17096 #[inline]
17097 #[target_feature(enable = "avx512f")]
17098 #[cfg_attr(test, assert_instr(vpcmp))]
17099 pub unsafe fn _mm512_mask_cmpeq_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
17100 _mm512_cmpeq_epu32_mask(a, b) & k1
17101 }
17102
17103 /// Compare packed unsigned 32-bit integers in a and b for not-equal, and store the results in mask vector k.
17104 ///
17105 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpneq_epu32_mask&expand=1112)
17106 #[inline]
17107 #[target_feature(enable = "avx512f")]
17108 #[cfg_attr(test, assert_instr(vpcmp))]
17109 pub unsafe fn _mm512_cmpneq_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 {
17110 simd_bitmask::<u32x16, _>(simd_ne(a.as_u32x16(), b.as_u32x16()))
17111 }
17112
17113 /// Compare packed unsigned 32-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
17114 ///
17115 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpneq_epu32_mask&expand=1113)
17116 #[inline]
17117 #[target_feature(enable = "avx512f")]
17118 #[cfg_attr(test, assert_instr(vpcmp))]
17119 pub unsafe fn _mm512_mask_cmpneq_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
17120 _mm512_cmpneq_epu32_mask(a, b) & k1
17121 }
17122
17123 /// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
17124 ///
17125 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_epu32_mask&expand=721)
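///
/// A usage sketch (not from upstream; assumes `avx512f` and the `_MM_CMPINT_*`
/// constants defined alongside `_MM_CMPINT_ENUM` in this module):
///
/// ```ignore
/// let a = _mm512_set1_epi32(0);
/// let b = _mm512_set1_epi32(1);
/// let k = _mm512_cmp_epu32_mask(a, b, _MM_CMPINT_LT);
/// assert_eq!(k, 0xFFFF);
/// ```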
17126 #[inline]
17127 #[target_feature(enable = "avx512f")]
17128 #[rustc_args_required_const(2)]
17129 #[cfg_attr(test, assert_instr(vpcmp, imm8 = 0))]
17130 pub unsafe fn _mm512_cmp_epu32_mask(a: __m512i, b: __m512i, imm8: _MM_CMPINT_ENUM) -> __mmask16 {
17131 let neg_one = -1;
17132 macro_rules! call {
17133 ($imm3:expr) => {
17134 vpcmpud(a.as_i32x16(), b.as_i32x16(), $imm3, neg_one)
17135 };
17136 }
17137 let r = constify_imm3!(imm8, call);
17138 transmute(r)
17139 }
17140
17141 /// Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
17142 ///
17143 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_epu32_mask&expand=722)
17144 #[inline]
17145 #[target_feature(enable = "avx512f")]
17146 #[rustc_args_required_const(3)]
17147 #[cfg_attr(test, assert_instr(vpcmp, imm8 = 0))]
17148 pub unsafe fn _mm512_mask_cmp_epu32_mask(
17149 k1: __mmask16,
17150 a: __m512i,
17151 b: __m512i,
17152 imm8: _MM_CMPINT_ENUM,
17153 ) -> __mmask16 {
17154 macro_rules! call {
17155 ($imm3:expr) => {
17156 vpcmpud(a.as_i32x16(), b.as_i32x16(), $imm3, k1 as i16)
17157 };
17158 }
17159 let r = constify_imm3!(imm8, call);
17160 transmute(r)
17161 }
17162
17163 /// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k.
17164 ///
17165 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmplt_epi32_mask&expand=1029)
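///
/// A usage sketch (not from upstream; assumes `avx512f`); contrast with the
/// unsigned `_mm512_cmplt_epu32_mask` above:
///
/// ```ignore
/// let a = _mm512_set1_epi32(-1);
/// let b = _mm512_set1_epi32(1);
/// let k = _mm512_cmplt_epi32_mask(a, b);
/// assert_eq!(k, 0xFFFF); // signed: -1 < 1 in every lane
/// ```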
17166 #[inline]
17167 #[target_feature(enable = "avx512f")]
17168 #[cfg_attr(test, assert_instr(vpcmp))]
17169 pub unsafe fn _mm512_cmplt_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 {
17170 simd_bitmask::<i32x16, _>(simd_lt(a.as_i32x16(), b.as_i32x16()))
17171 }
17172
17173 /// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
17174 ///
17175 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmplt_epi32_mask&expand=1031)
17176 #[inline]
17177 #[target_feature(enable = "avx512f")]
17178 #[cfg_attr(test, assert_instr(vpcmp))]
17179 pub unsafe fn _mm512_mask_cmplt_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
17180 _mm512_cmplt_epi32_mask(a, b) & k1
17181 }
17182
17183 /// Compare packed signed 32-bit integers in a and b for greater-than, and store the results in mask vector k.
17184 ///
17185 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpgt_epi32_mask&expand=905)
17186 #[inline]
17187 #[target_feature(enable = "avx512f")]
17188 #[cfg_attr(test, assert_instr(vpcmp))]
17189 pub unsafe fn _mm512_cmpgt_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 {
17190 simd_bitmask::<i32x16, _>(simd_gt(a.as_i32x16(), b.as_i32x16()))
17191 }
17192
17193 /// Compare packed signed 32-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
17194 ///
17195 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpgt_epi32_mask&expand=906)
17196 #[inline]
17197 #[target_feature(enable = "avx512f")]
17198 #[cfg_attr(test, assert_instr(vpcmp))]
17199 pub unsafe fn _mm512_mask_cmpgt_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
17200 _mm512_cmpgt_epi32_mask(a, b) & k1
17201 }
17202
17203 /// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
17204 ///
17205 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmple_epi32_mask&expand=971)
17206 #[inline]
17207 #[target_feature(enable = "avx512f")]
17208 #[cfg_attr(test, assert_instr(vpcmp))]
17209 pub unsafe fn _mm512_cmple_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 {
17210 simd_bitmask::<i32x16, _>(simd_le(a.as_i32x16(), b.as_i32x16()))
17211 }
17212
17213 /// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
17214 ///
17215 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmple_epi32_mask&expand=972)
17216 #[inline]
17217 #[target_feature(enable = "avx512f")]
17218 #[cfg_attr(test, assert_instr(vpcmp))]
17219 pub unsafe fn _mm512_mask_cmple_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
17220 _mm512_cmple_epi32_mask(a, b) & k1
17221 }
17222
17223 /// Compare packed signed 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
17224 ///
17225 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpge_epi32_mask&expand=849)
17226 #[inline]
17227 #[target_feature(enable = "avx512f")]
17228 #[cfg_attr(test, assert_instr(vpcmp))]
17229 pub unsafe fn _mm512_cmpge_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 {
17230 simd_bitmask::<i32x16, _>(simd_ge(a.as_i32x16(), b.as_i32x16()))
17231 }
17232
17233 /// Compare packed signed 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
17234 ///
17235 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpge_epi32_mask&expand=850)
17236 #[inline]
17237 #[target_feature(enable = "avx512f")]
17238 #[cfg_attr(test, assert_instr(vpcmp))]
17239 pub unsafe fn _mm512_mask_cmpge_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
17240 _mm512_cmpge_epi32_mask(a, b) & k1
17241 }
17242
17243 /// Compare packed 32-bit integers in a and b for equality, and store the results in mask vector k.
17244 ///
17245 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpeq_epi32_mask&expand=779)
17246 #[inline]
17247 #[target_feature(enable = "avx512f")]
17248 #[cfg_attr(test, assert_instr(vpcmp))]
17249 pub unsafe fn _mm512_cmpeq_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 {
17250 simd_bitmask::<i32x16, _>(simd_eq(a.as_i32x16(), b.as_i32x16()))
17251 }
17252
17253 /// Compare packed 32-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
17254 ///
17255 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpeq_epi32_mask&expand=780)
17256 #[inline]
17257 #[target_feature(enable = "avx512f")]
17258 #[cfg_attr(test, assert_instr(vpcmp))]
17259 pub unsafe fn _mm512_mask_cmpeq_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
17260 _mm512_cmpeq_epi32_mask(a, b) & k1
17261 }
17262
17263 /// Compare packed 32-bit integers in a and b for not-equal, and store the results in mask vector k.
17264 ///
17265 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpneq_epi32_mask&expand=1088)
17266 #[inline]
17267 #[target_feature(enable = "avx512f")]
17268 #[cfg_attr(test, assert_instr(vpcmp))]
17269 pub unsafe fn _mm512_cmpneq_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 {
17270 simd_bitmask::<i32x16, _>(simd_ne(a.as_i32x16(), b.as_i32x16()))
17271 }
17272
17273 /// Compare packed 32-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
17274 ///
17275 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpneq_epi32_mask&expand=1089)
17276 #[inline]
17277 #[target_feature(enable = "avx512f")]
17278 #[cfg_attr(test, assert_instr(vpcmp))]
17279 pub unsafe fn _mm512_mask_cmpneq_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
17280 _mm512_cmpneq_epi32_mask(a, b) & k1
17281 }
17282
17283 /// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
17284 ///
17285 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_epi32_mask&expand=697)
17286 #[inline]
17287 #[target_feature(enable = "avx512f")]
17288 #[rustc_args_required_const(2)]
17289 #[cfg_attr(test, assert_instr(vpcmp, imm8 = 0))]
17290 pub unsafe fn _mm512_cmp_epi32_mask(a: __m512i, b: __m512i, imm8: _MM_CMPINT_ENUM) -> __mmask16 {
17291 let neg_one = -1;
17292 macro_rules! call {
17293 ($imm3:expr) => {
17294 vpcmpd(a.as_i32x16(), b.as_i32x16(), $imm3, neg_one)
17295 };
17296 }
17297 let r = constify_imm3!(imm8, call);
17298 transmute(r)
17299 }
17300
17301 /// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
17302 ///
17303 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_epi32_mask&expand=698)
17304 #[inline]
17305 #[target_feature(enable = "avx512f")]
17306 #[rustc_args_required_const(3)]
17307 #[cfg_attr(test, assert_instr(vpcmp, imm8 = 0))]
17308 pub unsafe fn _mm512_mask_cmp_epi32_mask(
17309 k1: __mmask16,
17310 a: __m512i,
17311 b: __m512i,
17312 imm8: _MM_CMPINT_ENUM,
17313 ) -> __mmask16 {
17314 macro_rules! call {
17315 ($imm3:expr) => {
17316 vpcmpd(a.as_i32x16(), b.as_i32x16(), $imm3, k1 as i16)
17317 };
17318 }
17319 let r = constify_imm3!(imm8, call);
17320 transmute(r)
17321 }
17322
17323 /// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k.
17324 ///
17325 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmplt_epu64_mask&expand=1062)
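///
/// A usage sketch (not from upstream; assumes `avx512f`):
///
/// ```ignore
/// let a = _mm512_set1_epi64(2);
/// let b = _mm512_set1_epi64(3);
/// let k = _mm512_cmplt_epu64_mask(a, b);
/// assert_eq!(k, 0xFF); // 8 lanes of u64, all satisfying a < b
/// ```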
17326 #[inline]
17327 #[target_feature(enable = "avx512f")]
17328 #[cfg_attr(test, assert_instr(vpcmp))]
17329 pub unsafe fn _mm512_cmplt_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 {
17330 simd_bitmask::<__m512i, _>(simd_lt(a.as_u64x8(), b.as_u64x8()))
17331 }
17332
17333 /// Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
17334 ///
17335 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmplt_epu64_mask&expand=1063)
17336 #[inline]
17337 #[target_feature(enable = "avx512f")]
17338 #[cfg_attr(test, assert_instr(vpcmp))]
17339 pub unsafe fn _mm512_mask_cmplt_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
17340 _mm512_cmplt_epu64_mask(a, b) & k1
17341 }
17342
17343 /// Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in mask vector k.
17344 ///
17345 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpgt_epu64_mask&expand=939)
17346 #[inline]
17347 #[target_feature(enable = "avx512f")]
17348 #[cfg_attr(test, assert_instr(vpcmp))]
17349 pub unsafe fn _mm512_cmpgt_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 {
17350 simd_bitmask::<__m512i, _>(simd_gt(a.as_u64x8(), b.as_u64x8()))
17351 }
17352
17353 /// Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
17354 ///
17355 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpgt_epu64_mask&expand=940)
17356 #[inline]
17357 #[target_feature(enable = "avx512f")]
17358 #[cfg_attr(test, assert_instr(vpcmp))]
17359 pub unsafe fn _mm512_mask_cmpgt_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
17360 _mm512_cmpgt_epu64_mask(a, b) & k1
17361 }
17362
17363 /// Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
17364 ///
17365 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmple_epu64_mask&expand=1001)
17366 #[inline]
17367 #[target_feature(enable = "avx512f")]
17368 #[cfg_attr(test, assert_instr(vpcmp))]
17369 pub unsafe fn _mm512_cmple_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 {
17370 simd_bitmask::<__m512i, _>(simd_le(a.as_u64x8(), b.as_u64x8()))
17371 }
17372
17373 /// Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
17374 ///
17375 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmple_epu64_mask&expand=1002)
17376 #[inline]
17377 #[target_feature(enable = "avx512f")]
17378 #[cfg_attr(test, assert_instr(vpcmp))]
17379 pub unsafe fn _mm512_mask_cmple_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
17380 _mm512_cmple_epu64_mask(a, b) & k1
17381 }
17382
17383 /// Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
17384 ///
17385 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpge_epu64_mask&expand=879)
17386 #[inline]
17387 #[target_feature(enable = "avx512f")]
17388 #[cfg_attr(test, assert_instr(vpcmp))]
17389 pub unsafe fn _mm512_cmpge_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 {
17390 simd_bitmask::<__m512i, _>(simd_ge(a.as_u64x8(), b.as_u64x8()))
17391 }
17392
17393 /// Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
17394 ///
17395 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpge_epu64_mask&expand=880)
17396 #[inline]
17397 #[target_feature(enable = "avx512f")]
17398 #[cfg_attr(test, assert_instr(vpcmp))]
17399 pub unsafe fn _mm512_mask_cmpge_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
17400 _mm512_cmpge_epu64_mask(a, b) & k1
17401 }
17402
17403 /// Compare packed unsigned 64-bit integers in a and b for equality, and store the results in mask vector k.
17404 ///
17405 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpeq_epu64_mask&expand=813)
17406 #[inline]
17407 #[target_feature(enable = "avx512f")]
17408 #[cfg_attr(test, assert_instr(vpcmp))]
17409 pub unsafe fn _mm512_cmpeq_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 {
17410 simd_bitmask::<__m512i, _>(simd_eq(a.as_u64x8(), b.as_u64x8()))
17411 }
17412
17413 /// Compare packed unsigned 64-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
17414 ///
17415 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpeq_epu64_mask&expand=814)
17416 #[inline]
17417 #[target_feature(enable = "avx512f")]
17418 #[cfg_attr(test, assert_instr(vpcmp))]
17419 pub unsafe fn _mm512_mask_cmpeq_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
17420 _mm512_cmpeq_epu64_mask(a, b) & k1
17421 }
17422
17423 /// Compare packed unsigned 64-bit integers in a and b for not-equal, and store the results in mask vector k.
17424 ///
17425 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpneq_epu64_mask&expand=1118)
17426 #[inline]
17427 #[target_feature(enable = "avx512f")]
17428 #[cfg_attr(test, assert_instr(vpcmp))]
17429 pub unsafe fn _mm512_cmpneq_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 {
17430 simd_bitmask::<__m512i, _>(simd_ne(a.as_u64x8(), b.as_u64x8()))
17431 }
17432
17433 /// Compare packed unsigned 64-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
17434 ///
17435 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpneq_epu64_mask&expand=1119)
17436 #[inline]
17437 #[target_feature(enable = "avx512f")]
17438 #[cfg_attr(test, assert_instr(vpcmp))]
17439 pub unsafe fn _mm512_mask_cmpneq_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
17440 _mm512_cmpneq_epu64_mask(a, b) & k1
17441 }
17442
17443 /// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
17444 ///
17445 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_epu64_mask&expand=727)
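///
/// A usage sketch (not from upstream; assumes `avx512f` and the `_MM_CMPINT_*`
/// constants from this module):
///
/// ```ignore
/// let a = _mm512_set1_epi64(1);
/// let b = _mm512_set1_epi64(1);
/// let k = _mm512_cmp_epu64_mask(a, b, _MM_CMPINT_EQ);
/// assert_eq!(k, 0xFF);
/// ```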
17446 #[inline]
17447 #[target_feature(enable = "avx512f")]
17448 #[rustc_args_required_const(2)]
17449 #[cfg_attr(test, assert_instr(vpcmp, imm8 = 0))]
17450 pub unsafe fn _mm512_cmp_epu64_mask(a: __m512i, b: __m512i, imm8: _MM_CMPINT_ENUM) -> __mmask8 {
17451 let neg_one = -1;
17452 macro_rules! call {
17453 ($imm3:expr) => {
17454 vpcmpuq(a.as_i64x8(), b.as_i64x8(), $imm3, neg_one)
17455 };
17456 }
17457 let r = constify_imm3!(imm8, call);
17458 transmute(r)
17459 }
17460
17461 /// Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
17462 ///
17463 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_epu64_mask&expand=728)
17464 #[inline]
17465 #[target_feature(enable = "avx512f")]
17466 #[rustc_args_required_const(3)]
17467 #[cfg_attr(test, assert_instr(vpcmp, imm8 = 0))]
17468 pub unsafe fn _mm512_mask_cmp_epu64_mask(
17469 k1: __mmask8,
17470 a: __m512i,
17471 b: __m512i,
17472 imm8: _MM_CMPINT_ENUM,
17473 ) -> __mmask8 {
17474 macro_rules! call {
17475 ($imm3:expr) => {
17476 vpcmpuq(a.as_i64x8(), b.as_i64x8(), $imm3, k1 as i8)
17477 };
17478 }
17479 let r = constify_imm3!(imm8, call);
17480 transmute(r)
17481 }
17482
17483 /// Compare packed signed 64-bit integers in a and b for less-than, and store the results in mask vector k.
17484 ///
17485 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmplt_epi64_mask&expand=1037)
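///
/// A usage sketch (not from upstream; assumes `avx512f`):
///
/// ```ignore
/// let a = _mm512_set1_epi64(-1);
/// let b = _mm512_set1_epi64(0);
/// let k = _mm512_cmplt_epi64_mask(a, b);
/// assert_eq!(k, 0xFF); // signed: -1 < 0 in every lane
/// ```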
17486 #[inline]
17487 #[target_feature(enable = "avx512f")]
17488 #[cfg_attr(test, assert_instr(vpcmp))]
17489 pub unsafe fn _mm512_cmplt_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 {
17490 simd_bitmask::<__m512i, _>(simd_lt(a.as_i64x8(), b.as_i64x8()))
17491 }
17492
17493 /// Compare packed signed 64-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
17494 ///
17495 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmplt_epi64_mask&expand=1038)
17496 #[inline]
17497 #[target_feature(enable = "avx512f")]
17498 #[cfg_attr(test, assert_instr(vpcmp))]
17499 pub unsafe fn _mm512_mask_cmplt_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
17500 _mm512_cmplt_epi64_mask(a, b) & k1
17501 }
17502
17503 /// Compare packed signed 64-bit integers in a and b for greater-than, and store the results in mask vector k.
17504 ///
17505 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpgt_epi64_mask&expand=913)
17506 #[inline]
17507 #[target_feature(enable = "avx512f")]
17508 #[cfg_attr(test, assert_instr(vpcmp))]
17509 pub unsafe fn _mm512_cmpgt_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 {
17510 simd_bitmask::<__m512i, _>(simd_gt(a.as_i64x8(), b.as_i64x8()))
17511 }
17512
17513 /// Compare packed signed 64-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
17514 ///
17515 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpgt_epi64_mask&expand=914)
17516 #[inline]
17517 #[target_feature(enable = "avx512f")]
17518 #[cfg_attr(test, assert_instr(vpcmp))]
17519 pub unsafe fn _mm512_mask_cmpgt_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
17520 _mm512_cmpgt_epi64_mask(a, b) & k1
17521 }
17522
17523 /// Compare packed signed 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
17524 ///
17525 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmple_epi64_mask&expand=977)
17526 #[inline]
17527 #[target_feature(enable = "avx512f")]
17528 #[cfg_attr(test, assert_instr(vpcmp))]
17529 pub unsafe fn _mm512_cmple_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 {
17530 simd_bitmask::<__m512i, _>(simd_le(a.as_i64x8(), b.as_i64x8()))
17531 }
17532
17533 /// Compare packed signed 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
17534 ///
17535 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmple_epi64_mask&expand=978)
17536 #[inline]
17537 #[target_feature(enable = "avx512f")]
17538 #[cfg_attr(test, assert_instr(vpcmp))]
17539 pub unsafe fn _mm512_mask_cmple_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
17540 _mm512_cmple_epi64_mask(a, b) & k1
17541 }
17542
17543 /// Compare packed signed 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
17544 ///
17545 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpge_epi64_mask&expand=855)
17546 #[inline]
17547 #[target_feature(enable = "avx512f")]
17548 #[cfg_attr(test, assert_instr(vpcmp))]
17549 pub unsafe fn _mm512_cmpge_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 {
17550 simd_bitmask::<__m512i, _>(simd_ge(a.as_i64x8(), b.as_i64x8()))
17551 }
17552
17553 /// Compare packed signed 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
17554 ///
17555 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpge_epi64_mask&expand=856)
17556 #[inline]
17557 #[target_feature(enable = "avx512f")]
17558 #[cfg_attr(test, assert_instr(vpcmp))]
17559 pub unsafe fn _mm512_mask_cmpge_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
17560 _mm512_cmpge_epi64_mask(a, b) & k1
17561 }
17562
17563 /// Compare packed 64-bit integers in a and b for equality, and store the results in mask vector k.
17564 ///
17565 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpeq_epi64_mask&expand=787)
17566 #[inline]
17567 #[target_feature(enable = "avx512f")]
17568 #[cfg_attr(test, assert_instr(vpcmp))]
17569 pub unsafe fn _mm512_cmpeq_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 {
17570 simd_bitmask::<__m512i, _>(simd_eq(a.as_i64x8(), b.as_i64x8()))
17571 }
17572
17573 /// Compare packed 64-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
17574 ///
17575 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpeq_epi64_mask&expand=788)
17576 #[inline]
17577 #[target_feature(enable = "avx512f")]
17578 #[cfg_attr(test, assert_instr(vpcmp))]
17579 pub unsafe fn _mm512_mask_cmpeq_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
17580 _mm512_cmpeq_epi64_mask(a, b) & k1
17581 }
17582
17583 /// Compare packed signed 64-bit integers in a and b for not-equal, and store the results in mask vector k.
17584 ///
17585 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpneq_epi64_mask&expand=1094)
17586 #[inline]
17587 #[target_feature(enable = "avx512f")]
17588 #[cfg_attr(test, assert_instr(vpcmp))]
17589 pub unsafe fn _mm512_cmpneq_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 {
17590 simd_bitmask::<__m512i, _>(simd_ne(a.as_i64x8(), b.as_i64x8()))
17591 }
17592
17593 /// Compare packed signed 64-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
17594 ///
17595 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpneq_epi64_mask&expand=1095)
17596 #[inline]
17597 #[target_feature(enable = "avx512f")]
17598 #[cfg_attr(test, assert_instr(vpcmp))]
17599 pub unsafe fn _mm512_mask_cmpneq_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
17600 _mm512_cmpneq_epi64_mask(a, b) & k1
17601 }
17602
17603 /// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
17604 ///
17605 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_epi64_mask&expand=703)
17606 #[inline]
17607 #[target_feature(enable = "avx512f")]
17608 #[rustc_args_required_const(2)]
17609 #[cfg_attr(test, assert_instr(vpcmp, imm8 = 0))]
17610 pub unsafe fn _mm512_cmp_epi64_mask(a: __m512i, b: __m512i, imm8: _MM_CMPINT_ENUM) -> __mmask8 {
17611 let neg_one = -1;
17612 macro_rules! call {
17613 ($imm3:expr) => {
17614 vpcmpq(a.as_i64x8(), b.as_i64x8(), $imm3, neg_one)
17615 };
17616 }
17617 let r = constify_imm3!(imm8, call);
17618 transmute(r)
17619 }
17620
17621 /// Compare packed signed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
17622 ///
17623 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_epi64_mask&expand=704)
17624 #[inline]
17625 #[target_feature(enable = "avx512f")]
17626 #[rustc_args_required_const(3)]
17627 #[cfg_attr(test, assert_instr(vpcmp, imm8 = 0))]
17628 pub unsafe fn _mm512_mask_cmp_epi64_mask(
17629 k1: __mmask8,
17630 a: __m512i,
17631 b: __m512i,
17632 imm8: _MM_CMPINT_ENUM,
17633 ) -> __mmask8 {
17634 macro_rules! call {
17635 ($imm3:expr) => {
17636 vpcmpq(a.as_i64x8(), b.as_i64x8(), $imm3, k1 as i8)
17637 };
17638 }
17639 let r = constify_imm3!(imm8, call);
17640 transmute(r)
17641 }
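// Editorial usage sketch (not part of the upstream source): `_mm512_cmp_epi64_mask` with
// `_MM_CMPINT_LT` is equivalent to the dedicated `_mm512_cmplt_epi64_mask`, while the epu64
// variant interprets the same bits as unsigned. Assumes the `_MM_CMPINT_*` constants defined
// elsewhere in this module; the function name is illustrative only.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn cmp_epi64_mask_example() {
    let a = _mm512_set1_epi64(-1);
    let b = _mm512_set1_epi64(0);
    // Signed: -1 < 0 in every lane.
    assert_eq!(
        _mm512_cmp_epi64_mask(a, b, _MM_CMPINT_LT),
        _mm512_cmplt_epi64_mask(a, b)
    );
    assert_eq!(_mm512_cmp_epi64_mask(a, b, _MM_CMPINT_LT), 0b1111_1111);
    // Unsigned: the same bit pattern is u64::MAX, which is never less than 0.
    assert_eq!(_mm512_cmp_epu64_mask(a, b, _MM_CMPINT_LT), 0);
}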
17642
17643 /// Reduce the packed 32-bit integers in a by addition. Returns the sum of all elements in a.
17644 ///
17645 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_add_epi32&expand=4556)
17646 #[inline]
17647 #[target_feature(enable = "avx512f")]
17648 pub unsafe fn _mm512_reduce_add_epi32(a: __m512i) -> i32 {
17649 simd_reduce_add_unordered(a.as_i32x16())
17650 }
17651
17652 /// Reduce the packed 32-bit integers in a by addition using mask k. Returns the sum of all active elements in a.
17653 ///
17654 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_add_epi32&expand=4555)
17655 #[inline]
17656 #[target_feature(enable = "avx512f")]
17657 pub unsafe fn _mm512_mask_reduce_add_epi32(k: __mmask16, a: __m512i) -> i32 {
17658 simd_reduce_add_unordered(simd_select_bitmask(
17659 k,
17660 a.as_i32x16(),
17661 _mm512_setzero_si512().as_i32x16(),
17662 ))
17663 }
17664
17665 /// Reduce the packed 64-bit integers in a by addition. Returns the sum of all elements in a.
17666 ///
17667 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_add_epi64&expand=4558)
17668 #[inline]
17669 #[target_feature(enable = "avx512f")]
17670 pub unsafe fn _mm512_reduce_add_epi64(a: __m512i) -> i64 {
17671 simd_reduce_add_unordered(a.as_i64x8())
17672 }
17673
17674 /// Reduce the packed 64-bit integers in a by addition using mask k. Returns the sum of all active elements in a.
17675 ///
17676 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_add_epi64&expand=4557)
17677 #[inline]
17678 #[target_feature(enable = "avx512f")]
17679 pub unsafe fn _mm512_mask_reduce_add_epi64(k: __mmask8, a: __m512i) -> i64 {
17680 simd_reduce_add_unordered(simd_select_bitmask(
17681 k,
17682 a.as_i64x8(),
17683 _mm512_setzero_si512().as_i64x8(),
17684 ))
17685 }
17686
17687 /// Reduce the packed single-precision (32-bit) floating-point elements in a by addition. Returns the sum of all elements in a.
17688 ///
17689 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_add_ps&expand=4562)
17690 #[inline]
17691 #[target_feature(enable = "avx512f")]
17692 pub unsafe fn _mm512_reduce_add_ps(a: __m512) -> f32 {
17693 simd_reduce_add_unordered(a.as_f32x16())
17694 }
17695
17696 /// Reduce the packed single-precision (32-bit) floating-point elements in a by addition using mask k. Returns the sum of all active elements in a.
17697 ///
17698 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_add_ps&expand=4561)
17699 #[inline]
17700 #[target_feature(enable = "avx512f")]
17701 pub unsafe fn _mm512_mask_reduce_add_ps(k: __mmask16, a: __m512) -> f32 {
17702 simd_reduce_add_unordered(simd_select_bitmask(
17703 k,
17704 a.as_f32x16(),
17705 _mm512_setzero_ps().as_f32x16(),
17706 ))
17707 }
17708
17709 /// Reduce the packed double-precision (64-bit) floating-point elements in a by addition. Returns the sum of all elements in a.
17710 ///
17711 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_add_pd&expand=4560)
17712 #[inline]
17713 #[target_feature(enable = "avx512f")]
17714 pub unsafe fn _mm512_reduce_add_pd(a: __m512d) -> f64 {
17715 simd_reduce_add_unordered(a.as_f64x8())
17716 }
17717
17718 /// Reduce the packed double-precision (64-bit) floating-point elements in a by addition using mask k. Returns the sum of all active elements in a.
17719 ///
17720 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_add_pd&expand=4559)
17721 #[inline]
17722 #[target_feature(enable = "avx512f")]
17723 pub unsafe fn _mm512_mask_reduce_add_pd(k: __mmask8, a: __m512d) -> f64 {
17724 simd_reduce_add_unordered(simd_select_bitmask(
17725 k,
17726 a.as_f64x8(),
17727 _mm512_setzero_pd().as_f64x8(),
17728 ))
17729 }
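// Editorial usage sketch (not part of the upstream source): the masked add-reductions replace
// inactive lanes with 0, the additive identity, before summing. Requires AVX-512F at runtime;
// the function name is illustrative only.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn reduce_add_example() {
    let ones = _mm512_set1_epi32(1);
    assert_eq!(_mm512_reduce_add_epi32(ones), 16);
    // Only the low 8 lanes are active, so the masked sum is 8.
    assert_eq!(_mm512_mask_reduce_add_epi32(0b0000_0000_1111_1111, ones), 8);
    assert_eq!(_mm512_reduce_add_ps(_mm512_set1_ps(0.5)), 8.0);
}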
17730
17731 /// Reduce the packed 32-bit integers in a by multiplication. Returns the product of all elements in a.
17732 ///
17733 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_mul_epi32&expand=4600)
17734 #[inline]
17735 #[target_feature(enable = "avx512f")]
17736 pub unsafe fn _mm512_reduce_mul_epi32(a: __m512i) -> i32 {
17737 simd_reduce_mul_unordered(a.as_i32x16())
17738 }
17739
17740 /// Reduce the packed 32-bit integers in a by multiplication using mask k. Returns the product of all active elements in a.
17741 ///
17742 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_mul_epi32&expand=4599)
17743 #[inline]
17744 #[target_feature(enable = "avx512f")]
17745 pub unsafe fn _mm512_mask_reduce_mul_epi32(k: __mmask16, a: __m512i) -> i32 {
17746 simd_reduce_mul_unordered(simd_select_bitmask(
17747 k,
17748 a.as_i32x16(),
17749 _mm512_set1_epi32(1).as_i32x16(),
17750 ))
17751 }
17752
17753 /// Reduce the packed 64-bit integers in a by multiplication. Returns the product of all elements in a.
17754 ///
17755 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_mul_epi64&expand=4602)
17756 #[inline]
17757 #[target_feature(enable = "avx512f")]
17758 pub unsafe fn _mm512_reduce_mul_epi64(a: __m512i) -> i64 {
17759 simd_reduce_mul_unordered(a.as_i64x8())
17760 }
17761
17762 /// Reduce the packed 64-bit integers in a by multiplication using mask k. Returns the product of all active elements in a.
17763 ///
17764 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_mul_epi64&expand=4601)
17765 #[inline]
17766 #[target_feature(enable = "avx512f")]
17767 pub unsafe fn _mm512_mask_reduce_mul_epi64(k: __mmask8, a: __m512i) -> i64 {
17768 simd_reduce_mul_unordered(simd_select_bitmask(
17769 k,
17770 a.as_i64x8(),
17771 _mm512_set1_epi64(1).as_i64x8(),
17772 ))
17773 }
17774
17775 /// Reduce the packed single-precision (32-bit) floating-point elements in a by multiplication. Returns the product of all elements in a.
17776 ///
17777 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_mul_ps&expand=4606)
17778 #[inline]
17779 #[target_feature(enable = "avx512f")]
17780 pub unsafe fn _mm512_reduce_mul_ps(a: __m512) -> f32 {
17781 simd_reduce_mul_unordered(a.as_f32x16())
17782 }
17783
17784 /// Reduce the packed single-precision (32-bit) floating-point elements in a by multiplication using mask k. Returns the product of all active elements in a.
17785 ///
17786 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_mul_ps&expand=4605)
17787 #[inline]
17788 #[target_feature(enable = "avx512f")]
17789 pub unsafe fn _mm512_mask_reduce_mul_ps(k: __mmask16, a: __m512) -> f32 {
17790 simd_reduce_mul_unordered(simd_select_bitmask(
17791 k,
17792 a.as_f32x16(),
17793 _mm512_set1_ps(1.).as_f32x16(),
17794 ))
17795 }
17796
17797 /// Reduce the packed double-precision (64-bit) floating-point elements in a by multiplication. Returns the product of all elements in a.
17798 ///
17799 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_mul_pd&expand=4604)
17800 #[inline]
17801 #[target_feature(enable = "avx512f")]
17802 pub unsafe fn _mm512_reduce_mul_pd(a: __m512d) -> f64 {
17803 simd_reduce_mul_unordered(a.as_f64x8())
17804 }
17805
17806 /// Reduce the packed double-precision (64-bit) floating-point elements in a by multiplication using mask k. Returns the product of all active elements in a.
17807 ///
17808 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_mul_pd&expand=4603)
17809 #[inline]
17810 #[target_feature(enable = "avx512f")]
17811 pub unsafe fn _mm512_mask_reduce_mul_pd(k: __mmask8, a: __m512d) -> f64 {
17812 simd_reduce_mul_unordered(simd_select_bitmask(
17813 k,
17814 a.as_f64x8(),
17815 _mm512_set1_pd(1.).as_f64x8(),
17816 ))
17817 }
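// Editorial usage sketch (not part of the upstream source): the masked mul-reductions replace
// inactive lanes with 1, the multiplicative identity. Requires AVX-512F at runtime; the function
// name is illustrative only.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn reduce_mul_example() {
    let twos = _mm512_set1_epi32(2);
    assert_eq!(_mm512_reduce_mul_epi32(twos), 1 << 16);
    // Only 4 lanes are active, so the masked product is 2^4.
    assert_eq!(_mm512_mask_reduce_mul_epi32(0b0000_0000_0000_1111, twos), 16);
    assert_eq!(_mm512_reduce_mul_pd(_mm512_set1_pd(2.0)), 256.0);
}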
17818
17819 /// Reduce the packed signed 32-bit integers in a by maximum. Returns the maximum of all elements in a.
17820 ///
17821 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_max_epi32&expand=4576)
17822 #[inline]
17823 #[target_feature(enable = "avx512f")]
17824 pub unsafe fn _mm512_reduce_max_epi32(a: __m512i) -> i32 {
17825 simd_reduce_max(a.as_i32x16())
17826 }
17827
17828 /// Reduce the packed signed 32-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a.
17829 ///
17830 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_max_epi32&expand=4575)
17831 #[inline]
17832 #[target_feature(enable = "avx512f")]
17833 pub unsafe fn _mm512_mask_reduce_max_epi32(k: __mmask16, a: __m512i) -> i32 {
17834 simd_reduce_max(simd_select_bitmask(
17835 k,
17836 a.as_i32x16(),
17837 _mm512_set1_epi32(i32::MIN).as_i32x16(), // identity element for signed max
17838 ))
17839 }
17840
17841 /// Reduce the packed signed 64-bit integers in a by maximum. Returns the maximum of all elements in a.
17842 ///
17843 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_max_epi64&expand=4578)
17844 #[inline]
17845 #[target_feature(enable = "avx512f")]
17846 pub unsafe fn _mm512_reduce_max_epi64(a: __m512i) -> i64 {
17847 simd_reduce_max(a.as_i64x8())
17848 }
17849
17850 /// Reduce the packed signed 64-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a.
17851 ///
17852 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_max_epi64&expand=4577)
17853 #[inline]
17854 #[target_feature(enable = "avx512f")]
17855 pub unsafe fn _mm512_mask_reduce_max_epi64(k: __mmask8, a: __m512i) -> i64 {
17856 simd_reduce_max(simd_select_bitmask(
17857 k,
17858 a.as_i64x8(),
17859 _mm512_set1_epi64(i64::MIN).as_i64x8(), // identity element for signed max
17860 ))
17861 }
17862
17863 /// Reduce the packed unsigned 32-bit integers in a by maximum. Returns the maximum of all elements in a.
17864 ///
17865 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_max_epu32&expand=4580)
17866 #[inline]
17867 #[target_feature(enable = "avx512f")]
17868 pub unsafe fn _mm512_reduce_max_epu32(a: __m512i) -> u32 {
17869 simd_reduce_max(a.as_u32x16())
17870 }
17871
17872 /// Reduce the packed unsigned 32-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a.
17873 ///
17874 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_max_epu32&expand=4579)
17875 #[inline]
17876 #[target_feature(enable = "avx512f")]
17877 pub unsafe fn _mm512_mask_reduce_max_epu32(k: __mmask16, a: __m512i) -> u32 {
17878 simd_reduce_max(simd_select_bitmask(
17879 k,
17880 a.as_u32x16(),
17881 _mm512_setzero_si512().as_u32x16(), // 0 is the identity element for unsigned max
17882 ))
17883 }
17884
17885 /// Reduce the packed unsigned 64-bit integers in a by maximum. Returns the maximum of all elements in a.
17886 ///
17887 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_max_epu64&expand=4582)
17888 #[inline]
17889 #[target_feature(enable = "avx512f")]
17890 pub unsafe fn _mm512_reduce_max_epu64(a: __m512i) -> u64 {
17891 simd_reduce_max(a.as_u64x8())
17892 }
17893
17894 /// Reduce the packed unsigned 64-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a.
17895 ///
17896 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_max_epu64&expand=4581)
17897 #[inline]
17898 #[target_feature(enable = "avx512f")]
17899 pub unsafe fn _mm512_mask_reduce_max_epu64(k: __mmask8, a: __m512i) -> u64 {
17900 simd_reduce_max(simd_select_bitmask(
17901 k,
17902 a.as_u64x8(),
17903 _mm512_set1_epi64(0).as_u64x8(),
17904 ))
17905 }
17906
17907 /// Reduce the packed single-precision (32-bit) floating-point elements in a by maximum. Returns the maximum of all elements in a.
17908 ///
17909 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_max_ps&expand=4586)
17910 #[inline]
17911 #[target_feature(enable = "avx512f")]
17912 pub unsafe fn _mm512_reduce_max_ps(a: __m512) -> f32 {
17913 simd_reduce_max(a.as_f32x16())
17914 }
17915
17916 /// Reduce the packed single-precision (32-bit) floating-point elements in a by maximum using mask k. Returns the maximum of all active elements in a.
17917 ///
17918 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_max_ps&expand=4585)
17919 #[inline]
17920 #[target_feature(enable = "avx512f")]
17921 pub unsafe fn _mm512_mask_reduce_max_ps(k: __mmask16, a: __m512) -> f32 {
17922 simd_reduce_max(simd_select_bitmask(
17923 k,
17924 a.as_f32x16(),
17925 _mm512_set1_ps(f32::MIN).as_f32x16(), // smallest finite f32, identity element for max
17926 ))
17927 }
17928
17929 /// Reduce the packed double-precision (64-bit) floating-point elements in a by maximum. Returns the maximum of all elements in a.
17930 ///
17931 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_max_pd&expand=4584)
17932 #[inline]
17933 #[target_feature(enable = "avx512f")]
17934 pub unsafe fn _mm512_reduce_max_pd(a: __m512d) -> f64 {
17935 simd_reduce_max(a.as_f64x8())
17936 }
17937
17938 /// Reduce the packed double-precision (64-bit) floating-point elements in a by maximum using mask k. Returns the maximum of all active elements in a.
17939 ///
17940 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_max_pd&expand=4583)
17941 #[inline]
17942 #[target_feature(enable = "avx512f")]
17943 pub unsafe fn _mm512_mask_reduce_max_pd(k: __mmask8, a: __m512d) -> f64 {
17944 simd_reduce_max(simd_select_bitmask(
17945 k,
17946 a.as_f64x8(),
17947 _mm512_set1_pd(f64::MIN).as_f64x8(), // smallest finite f64, identity element for max
17948 ))
17949 }
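// Editorial usage sketch (not part of the upstream source): the masked max-reductions fill
// inactive lanes with a value that cannot win the maximum (i32::MIN, i64::MIN, f32::MIN, ...),
// otherwise an all-negative input would wrongly reduce to the fill value. Requires AVX-512F at
// runtime; the function name is illustrative only.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn reduce_max_example() {
    let a = _mm512_set1_epi32(-5);
    assert_eq!(_mm512_reduce_max_epi32(a), -5);
    // Inactive lanes hold i32::MIN, so the masked maximum is still -5.
    assert_eq!(_mm512_mask_reduce_max_epi32(0b0000_0000_0000_0011, a), -5);
    // Unsigned max of an all-ones vector is u32::MAX.
    assert_eq!(_mm512_reduce_max_epu32(_mm512_set1_epi32(-1)), u32::MAX);
}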
17950
17951 /// Reduce the packed signed 32-bit integers in a by minimum. Returns the minimum of all elements in a.
17952 ///
17953 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_min_epi32&expand=4588)
17954 #[inline]
17955 #[target_feature(enable = "avx512f")]
17956 pub unsafe fn _mm512_reduce_min_epi32(a: __m512i) -> i32 {
17957 simd_reduce_min(a.as_i32x16())
17958 }
17959
17960 /// Reduce the packed signed 32-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a.
17961 ///
17962 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_min_epi32&expand=4587)
17963 #[inline]
17964 #[target_feature(enable = "avx512f")]
17965 pub unsafe fn _mm512_mask_reduce_min_epi32(k: __mmask16, a: __m512i) -> i32 {
17966 simd_reduce_min(simd_select_bitmask(
17967 k,
17968 a.as_i32x16(),
17969 _mm512_set1_epi32(i32::MAX).as_i32x16(), // identity element for signed min
17970 ))
17971 }
17972
17973 /// Reduce the packed signed 64-bit integers in a by minimum. Returns the minimum of all elements in a.
17974 ///
17975 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_min_epi64&expand=4590)
17976 #[inline]
17977 #[target_feature(enable = "avx512f")]
17978 pub unsafe fn _mm512_reduce_min_epi64(a: __m512i) -> i64 {
17979 simd_reduce_min(a.as_i64x8())
17980 }
17981
17982 /// Reduce the packed signed 64-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a.
17983 ///
17984 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_min_epi64&expand=4589)
17985 #[inline]
17986 #[target_feature(enable = "avx512f")]
17987 pub unsafe fn _mm512_mask_reduce_min_epi64(k: __mmask8, a: __m512i) -> i64 {
17988 simd_reduce_min(simd_select_bitmask(
17989 k,
17990 a.as_i64x8(),
17991 _mm512_set1_epi64(i64::MAX).as_i64x8(), // identity element for signed min
17992 ))
17993 }
17994
17995 /// Reduce the packed unsigned 32-bit integers in a by minimum. Returns the minimum of all elements in a.
17996 ///
17997 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_min_epu32&expand=4592)
17998 #[inline]
17999 #[target_feature(enable = "avx512f")]
18000 pub unsafe fn _mm512_reduce_min_epu32(a: __m512i) -> u32 {
18001 simd_reduce_min(a.as_u32x16())
18002 }
18003
18004 /// Reduce the packed unsigned 32-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a.
18005 ///
18006 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_min_epu32&expand=4591)
18007 #[inline]
18008 #[target_feature(enable = "avx512f")]
18009 pub unsafe fn _mm512_mask_reduce_min_epu32(k: __mmask16, a: __m512i) -> u32 {
18010 simd_reduce_min(simd_select_bitmask(
18011 k,
18012 a.as_u32x16(),
18013 _mm512_set1_epi32(-1).as_u32x16(), // all-ones (u32::MAX), the identity element for unsigned min
18014 ))
18015 }
18016
18017 /// Reduce the packed unsigned 64-bit integers in a by minimum. Returns the minimum of all elements in a.
18018 ///
18019 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_min_epu64&expand=4594)
18020 #[inline]
18021 #[target_feature(enable = "avx512f")]
18022 pub unsafe fn _mm512_reduce_min_epu64(a: __m512i) -> u64 {
18023 simd_reduce_min(a.as_u64x8())
18024 }
18025
18026 /// Reduce the packed unsigned 64-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a.
18027 ///
18028 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_min_epu64&expand=4593)
18029 #[inline]
18030 #[target_feature(enable = "avx512f")]
18031 pub unsafe fn _mm512_mask_reduce_min_epu64(k: __mmask8, a: __m512i) -> u64 {
18032 simd_reduce_min(simd_select_bitmask(
18033 k,
18034 a.as_u64x8(),
18035 _mm512_set1_epi64(-1).as_u64x8(), // all-ones (u64::MAX), the identity element for unsigned min
18036 ))
18037 }
18038
18039 /// Reduce the packed single-precision (32-bit) floating-point elements in a by minimum. Returns the minimum of all elements in a.
18040 ///
18041 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_min_ps&expand=4598)
18042 #[inline]
18043 #[target_feature(enable = "avx512f")]
18044 pub unsafe fn _mm512_reduce_min_ps(a: __m512) -> f32 {
18045 simd_reduce_min(a.as_f32x16())
18046 }
18047
18048 /// Reduce the packed single-precision (32-bit) floating-point elements in a by minimum using mask k. Returns the minimum of all active elements in a.
18049 ///
18050 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_min_ps&expand=4597)
18051 #[inline]
18052 #[target_feature(enable = "avx512f")]
18053 pub unsafe fn _mm512_mask_reduce_min_ps(k: __mmask16, a: __m512) -> f32 {
18054 simd_reduce_min(simd_select_bitmask(
18055 k,
18056 a.as_f32x16(),
18057 _mm512_set1_ps(f32::MAX).as_f32x16(), // largest finite f32, identity element for min
18058 ))
18059 }
18060
18061 /// Reduce the packed double-precision (64-bit) floating-point elements in a by minimum. Returns the minimum of all elements in a.
18062 ///
18063 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_min_pd&expand=4596)
18064 #[inline]
18065 #[target_feature(enable = "avx512f")]
18066 pub unsafe fn _mm512_reduce_min_pd(a: __m512d) -> f64 {
18067 simd_reduce_min(a.as_f64x8())
18068 }
18069
18070 /// Reduce the packed double-precision (64-bit) floating-point elements in a by minimum using mask k. Returns the minimum of all active elements in a.
18071 ///
18072 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_min_pd&expand=4595)
18073 #[inline]
18074 #[target_feature(enable = "avx512f")]
18075 pub unsafe fn _mm512_mask_reduce_min_pd(k: __mmask8, a: __m512d) -> f64 {
18076 simd_reduce_min(simd_select_bitmask(
18077 k,
18078 a.as_f64x8(),
18079 _mm512_set1_pd(f64::MAX).as_f64x8(), // largest finite f64, identity element for min
18080 ))
18081 }
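// Editorial usage sketch (not part of the upstream source): the masked min-reductions mirror the
// max case, filling inactive lanes with the largest representable value so they cannot win the
// minimum. Requires AVX-512F at runtime; the function name is illustrative only.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn reduce_min_example() {
    let a = _mm512_set1_epi32(7);
    assert_eq!(_mm512_reduce_min_epi32(a), 7);
    // Inactive lanes hold i32::MAX, so the masked minimum is still 7.
    assert_eq!(_mm512_mask_reduce_min_epi32(0b0000_0000_0000_0001, a), 7);
    assert_eq!(_mm512_reduce_min_ps(_mm512_set1_ps(0.5)), 0.5);
}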
18082
18083 /// Reduce the packed 32-bit integers in a by bitwise AND. Returns the bitwise AND of all elements in a.
18084 ///
18085 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_and_epi32&expand=4564)
18086 #[inline]
18087 #[target_feature(enable = "avx512f")]
18088 pub unsafe fn _mm512_reduce_and_epi32(a: __m512i) -> i32 {
18089 simd_reduce_and(a.as_i32x16())
18090 }
18091
18092 /// Reduce the packed 32-bit integers in a by bitwise AND using mask k. Returns the bitwise AND of all active elements in a.
18093 ///
18094 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_and_epi32&expand=4563)
18095 #[inline]
18096 #[target_feature(enable = "avx512f")]
18097 pub unsafe fn _mm512_mask_reduce_and_epi32(k: __mmask16, a: __m512i) -> i32 {
18098 simd_reduce_and(simd_select_bitmask(
18099 k,
18100 a.as_i32x16(),
18101 // All-ones is the identity element for bitwise AND; the previous 16-bit pattern (0xFFFF)
18102 // would have cleared the upper bits of the result whenever any lane was inactive.
18103 _mm512_set1_epi32(-1).as_i32x16(),
18120 ))
18121 }
18122
18123 /// Reduce the packed 64-bit integers in a by bitwise AND. Returns the bitwise AND of all elements in a.
18124 ///
18125 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_and_epi64&expand=4566)
18126 #[inline]
18127 #[target_feature(enable = "avx512f")]
18128 pub unsafe fn _mm512_reduce_and_epi64(a: __m512i) -> i64 {
18129 simd_reduce_and(a.as_i64x8())
18130 }
18131
18132 /// Reduce the packed 64-bit integers in a by bitwise AND using mask k. Returns the bitwise AND of all active elements in a.
18133 ///
18134 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_and_epi64&expand=4565)
18135 #[inline]
18136 #[target_feature(enable = "avx512f")]
18137 pub unsafe fn _mm512_mask_reduce_and_epi64(k: __mmask8, a: __m512i) -> i64 {
18138 simd_reduce_and(simd_select_bitmask(
18139 k,
18140 a.as_i64x8(),
18141 // All-ones is the identity element for bitwise AND (the previous 0xFF fill would clear bits 8..64 of the result).
18142 _mm512_set1_epi64(-1).as_i64x8(),
18143 ))
18144 }
18145
18146 /// Reduce the packed 32-bit integers in a by bitwise OR. Returns the bitwise OR of all elements in a.
18147 ///
18148 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_or_epi32&expand=4608)
18149 #[inline]
18150 #[target_feature(enable = "avx512f")]
18151 pub unsafe fn _mm512_reduce_or_epi32(a: __m512i) -> i32 {
18152 simd_reduce_or(a.as_i32x16())
18153 }
18154
18155 /// Reduce the packed 32-bit integers in a by bitwise OR using mask k. Returns the bitwise OR of all active elements in a.
18156 ///
18157 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_or_epi32&expand=4607)
18158 #[inline]
18159 #[target_feature(enable = "avx512f")]
18160 pub unsafe fn _mm512_mask_reduce_or_epi32(k: __mmask16, a: __m512i) -> i32 {
18161 simd_reduce_or(simd_select_bitmask(
18162 k,
18163 a.as_i32x16(),
18164 _mm512_setzero_si512().as_i32x16(),
18165 ))
18166 }
18167
18168 /// Reduce the packed 64-bit integers in a by bitwise OR. Returns the bitwise OR of all elements in a.
18169 ///
18170 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_reduce_or_epi64&expand=4610)
18171 #[inline]
18172 #[target_feature(enable = "avx512f")]
18173 pub unsafe fn _mm512_reduce_or_epi64(a: __m512i) -> i64 {
18174 simd_reduce_or(a.as_i64x8())
18175 }
18176
18177 /// Reduce the packed 64-bit integers in a by bitwise OR using mask k. Returns the bitwise OR of all active elements in a.
18178 ///
18179 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_reduce_or_epi64&expand=4609)
18180 #[inline]
18181 #[target_feature(enable = "avx512f")]
18182 pub unsafe fn _mm512_mask_reduce_or_epi64(k: __mmask8, a: __m512i) -> i64 {
18183 simd_reduce_or(simd_select_bitmask(
18184 k,
18185 a.as_i64x8(),
18186 _mm512_setzero_si512().as_i64x8(),
18187 ))
18188 }
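// Editorial usage sketch (not part of the upstream source): the identity for a masked AND
// reduction is all-ones, while for OR it is zero, so inactive lanes never affect the result.
// Requires AVX-512F at runtime; the function name is illustrative only.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn reduce_and_or_example() {
    let a = _mm512_set1_epi32(0b1010);
    assert_eq!(_mm512_reduce_and_epi32(a), 0b1010);
    // Inactive lanes hold -1 (all bits set), so they do not clear any bit of the AND.
    assert_eq!(_mm512_mask_reduce_and_epi32(0b0000_0000_0000_1111, a), 0b1010);
    // With no active lanes the OR reduction collapses to its identity, 0.
    assert_eq!(_mm512_mask_reduce_or_epi32(0, a), 0);
    assert_eq!(_mm512_reduce_or_epi64(_mm512_set1_epi64(1 << 40)), 1 << 40);
}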
18189
18190 /// Returns vector of type `__m512d` with undefined elements.
18191 ///
18192 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_undefined_pd)
18193 #[inline]
18194 #[target_feature(enable = "avx512f")]
18195 // This intrinsic has no corresponding instruction.
18196 pub unsafe fn _mm512_undefined_pd() -> __m512d {
18197 _mm512_set1_pd(0.0)
18198 }
18199
18200 /// Returns vector of type `__m512` with undefined elements.
18201 ///
18202 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_undefined_ps)
18203 #[inline]
18204 #[target_feature(enable = "avx512f")]
18205 // This intrinsic has no corresponding instruction.
18206 pub unsafe fn _mm512_undefined_ps() -> __m512 {
18207 _mm512_set1_ps(0.0)
18208 }
18209
18210 /// Returns vector of type `__m512i` with undefined elements.
18211 ///
18212 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_undefined_epi32&expand=5995)
18213 #[inline]
18214 #[target_feature(enable = "avx512f")]
18215 // This intrinsic has no corresponding instruction.
18216 pub unsafe fn _mm512_undefined_epi32() -> __m512i {
18217 _mm512_set1_epi32(0)
18218 }
18219
18220 /// Returns vector of type `__m512` with undefined elements.
18221 ///
18222 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_undefined&expand=5994)
18223 #[inline]
18224 #[target_feature(enable = "avx512f")]
18225 // This intrinsic has no corresponding instruction.
18226 pub unsafe fn _mm512_undefined() -> __m512 {
18227 _mm512_set1_ps(0.0)
18228 }
18229
18230 /// Load 512-bits (composed of 16 packed 32-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary.
18231 ///
18232 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_loadu_epi32&expand=3377)
18233 #[inline]
18234 #[target_feature(enable = "avx512f")]
18235 #[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu32
18236 pub unsafe fn _mm512_loadu_epi32(mem_addr: *const i32) -> __m512i {
18237 ptr::read_unaligned(mem_addr as *const __m512i)
18238 }
18239
18240 /// Store 512-bits (composed of 16 packed 32-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary.
18241 ///
18242 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_storeu_epi32&expand=5628)
18243 #[inline]
18244 #[target_feature(enable = "avx512f")]
18245 #[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu32
18246 pub unsafe fn _mm512_storeu_epi32(mem_addr: *mut i32, a: __m512i) {
18247 ptr::write_unaligned(mem_addr as *mut __m512i, a);
18248 }
18249
18250 /// Load 512-bits (composed of 8 packed 64-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary.
18251 ///
18252 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_loadu_epi64&expand=3386)
18253 #[inline]
18254 #[target_feature(enable = "avx512f")]
18255 #[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu64
18256 pub unsafe fn _mm512_loadu_epi64(mem_addr: *const i64) -> __m512i {
18257 ptr::read_unaligned(mem_addr as *const __m512i)
18258 }
18259
18260 /// Store 512-bits (composed of 8 packed 64-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary.
18261 ///
18262 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_storeu_epi64&expand=5634)
18263 #[inline]
18264 #[target_feature(enable = "avx512f")]
18265 #[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu64
18266 pub unsafe fn _mm512_storeu_epi64(mem_addr: *mut i64, a: __m512i) {
18267 ptr::write_unaligned(mem_addr as *mut __m512i, a);
18268 }
18269
18270 /// Load 512-bits of integer data from memory into dst. mem_addr does not need to be aligned on any particular boundary.
18271 ///
18272 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_loadu_si512&expand=3420)
18273 #[inline]
18274 #[target_feature(enable = "avx512f")]
18275 #[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu32
18276 pub unsafe fn _mm512_loadu_si512(mem_addr: *const i32) -> __m512i {
18277 ptr::read_unaligned(mem_addr as *const __m512i)
18278 }
18279
18280 /// Store 512-bits of integer data from a into memory. mem_addr does not need to be aligned on any particular boundary.
18281 ///
18282 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_storeu_si512&expand=5657)
18283 #[inline]
18284 #[target_feature(enable = "avx512f")]
18285 #[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu32
18286 pub unsafe fn _mm512_storeu_si512(mem_addr: *mut i32, a: __m512i) {
18287 ptr::write_unaligned(mem_addr as *mut __m512i, a);
18288 }
18289
18290 /// Loads 512-bits (composed of 8 packed double-precision (64-bit)
18291 /// floating-point elements) from memory into result.
18292 /// `mem_addr` does not need to be aligned on any particular boundary.
18293 ///
18294 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_loadu_pd)
18295 #[inline]
18296 #[target_feature(enable = "avx512f")]
18297 #[cfg_attr(test, assert_instr(vmovups))]
18298 pub unsafe fn _mm512_loadu_pd(mem_addr: *const f64) -> __m512d {
18299 ptr::read_unaligned(mem_addr as *const __m512d)
18300 }
18301
18302 /// Stores 512-bits (composed of 8 packed double-precision (64-bit)
18303 /// floating-point elements) from `a` into memory.
18304 /// `mem_addr` does not need to be aligned on any particular boundary.
18305 ///
18306 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_storeu_pd)
18307 #[inline]
18308 #[target_feature(enable = "avx512f")]
18309 #[cfg_attr(test, assert_instr(vmovups))]
18310 pub unsafe fn _mm512_storeu_pd(mem_addr: *mut f64, a: __m512d) {
18311 ptr::write_unaligned(mem_addr as *mut __m512d, a);
18312 }
18313
18314 /// Loads 512-bits (composed of 16 packed single-precision (32-bit)
18315 /// floating-point elements) from memory into result.
18316 /// `mem_addr` does not need to be aligned on any particular boundary.
18317 ///
18318 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_loadu_ps)
18319 #[inline]
18320 #[target_feature(enable = "avx512f")]
18321 #[cfg_attr(test, assert_instr(vmovups))]
18322 pub unsafe fn _mm512_loadu_ps(mem_addr: *const f32) -> __m512 {
18323 ptr::read_unaligned(mem_addr as *const __m512)
18324 }
18325
18326 /// Stores 512-bits (composed of 16 packed single-precision (32-bit)
18327 /// floating-point elements) from `a` into memory.
18328 /// `mem_addr` does not need to be aligned on any particular boundary.
18329 ///
18330 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_storeu_ps)
18331 #[inline]
18332 #[target_feature(enable = "avx512f")]
18333 #[cfg_attr(test, assert_instr(vmovups))]
18334 #[stable(feature = "simd_x86", since = "1.27.0")]
18335 pub unsafe fn _mm512_storeu_ps(mem_addr: *mut f32, a: __m512) {
18336 ptr::write_unaligned(mem_addr as *mut __m512, a);
18337 }
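// Editorial usage sketch (not part of the upstream source): a round trip through the unaligned
// load/store intrinsics. Plain arrays and slices need no special alignment here. Requires
// AVX-512F at runtime; the function name is illustrative only.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn loadu_storeu_example() {
    let src = [7i32; 16];
    let v = _mm512_loadu_epi32(src.as_ptr());
    let mut dst = [0i32; 16];
    _mm512_storeu_epi32(dst.as_mut_ptr(), v);
    assert_eq!(dst, src);
}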
18338
18339 /// Load 512-bits of integer data from memory into dst. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
18340 ///
18341 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_load_si512&expand=3345)
18342 #[inline]
18343 #[target_feature(enable = "avx512f")]
18344 #[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa32
18345 pub unsafe fn _mm512_load_si512(mem_addr: *const i32) -> __m512i {
18346 ptr::read(mem_addr as *const __m512i)
18347 }
18348
18349 /// Store 512-bits of integer data from a into memory. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
18350 ///
18351 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_store_si512&expand=5598)
18352 #[inline]
18353 #[target_feature(enable = "avx512f")]
18354 #[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa32
18355 pub unsafe fn _mm512_store_si512(mem_addr: *mut i32, a: __m512i) {
18356 ptr::write(mem_addr as *mut __m512i, a);
18357 }
18358
18359 /// Load 512-bits (composed of 16 packed 32-bit integers) from memory into dst. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
18360 ///
18361 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_load_epi32&expand=3304)
18362 #[inline]
18363 #[target_feature(enable = "avx512f")]
18364 #[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa32
18365 pub unsafe fn _mm512_load_epi32(mem_addr: *const i32) -> __m512i {
18366 ptr::read(mem_addr as *const __m512i)
18367 }
18368
18369 /// Store 512-bits (composed of 16 packed 32-bit integers) from a into memory. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
18370 ///
18371 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_store_epi32&expand=5569)
18372 #[inline]
18373 #[target_feature(enable = "avx512f")]
18374 #[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa32
18375 pub unsafe fn _mm512_store_epi32(mem_addr: *mut i32, a: __m512i) {
18376 ptr::write(mem_addr as *mut __m512i, a);
18377 }
18378
18379 /// Load 512-bits (composed of 8 packed 64-bit integers) from memory into dst. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
18380 ///
18381 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_load_epi64&expand=3313)
18382 #[inline]
18383 #[target_feature(enable = "avx512f")]
18384 #[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa64
18385 pub unsafe fn _mm512_load_epi64(mem_addr: *const i64) -> __m512i {
18386 ptr::read(mem_addr as *const __m512i)
18387 }
18388
18389 /// Store 512-bits (composed of 8 packed 64-bit integers) from a into memory. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
18390 ///
18391 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_store_epi64&expand=5575)
18392 #[inline]
18393 #[target_feature(enable = "avx512f")]
18394 #[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa64
18395 pub unsafe fn _mm512_store_epi64(mem_addr: *mut i64, a: __m512i) {
18396 ptr::write(mem_addr as *mut __m512i, a);
18397 }
18398
18399 /// Load 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from memory into dst. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
18400 ///
18401 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_load_ps&expand=3336)
18402 #[inline]
18403 #[target_feature(enable = "avx512f")]
18404 #[cfg_attr(test, assert_instr(vmovaps))]
18405 pub unsafe fn _mm512_load_ps(mem_addr: *const f32) -> __m512 {
18406 ptr::read(mem_addr as *const __m512)
18407 }
18408
18409 /// Store 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from a into memory. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
18410 ///
18411 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_store_ps&expand=5592)
18412 #[inline]
18413 #[target_feature(enable = "avx512f")]
18414 #[cfg_attr(test, assert_instr(vmovaps))]
18415 pub unsafe fn _mm512_store_ps(mem_addr: *mut f32, a: __m512) {
18416 ptr::write(mem_addr as *mut __m512, a);
18417 }
18418
18419 /// Load 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from memory into dst. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
18420 ///
18421 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_load_pd&expand=3326)
18422 #[inline]
18423 #[target_feature(enable = "avx512f")]
18424 #[cfg_attr(test, assert_instr(vmovaps))] //should be vmovapd
18425 pub unsafe fn _mm512_load_pd(mem_addr: *const f64) -> __m512d {
18426 ptr::read(mem_addr as *const __m512d)
18427 }
18428
18429 /// Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from a into memory. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
18430 ///
18431 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_store_pd&expand=5585)
18432 #[inline]
18433 #[target_feature(enable = "avx512f")]
18434 #[cfg_attr(test, assert_instr(vmovaps))] //should be vmovapd
18435 pub unsafe fn _mm512_store_pd(mem_addr: *mut f64, a: __m512d) {
18436 ptr::write(mem_addr as *mut __m512d, a);
18437 }
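// Editorial usage sketch (not part of the upstream source): the aligned load/store intrinsics
// require 64-byte alignment, which `#[repr(align(64))]` guarantees for the wrapper below.
// Requires AVX-512F at runtime; the type and function names are illustrative only.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn aligned_load_store_example() {
    #[repr(align(64))]
    struct Aligned64([f32; 16]);
    let src = Aligned64([1.0; 16]);
    let v = _mm512_load_ps(src.0.as_ptr());
    let mut dst = Aligned64([0.0; 16]);
    _mm512_store_ps(dst.0.as_mut_ptr(), v);
    assert_eq!(dst.0, src.0);
}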
18438
18439 /// Set packed double-precision (64-bit) floating-point elements in dst with the supplied values in reverse order.
18440 ///
18441 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setr_pd&expand=5002)
18442 #[inline]
18443 #[target_feature(enable = "avx512f")]
18444 pub unsafe fn _mm512_setr_pd(
18445 e0: f64,
18446 e1: f64,
18447 e2: f64,
18448 e3: f64,
18449 e4: f64,
18450 e5: f64,
18451 e6: f64,
18452 e7: f64,
18453 ) -> __m512d {
18454 let r = f64x8::new(e0, e1, e2, e3, e4, e5, e6, e7);
18455 transmute(r)
18456 }
18457
18458 /// Set packed double-precision (64-bit) floating-point elements in dst with the supplied values.
18459 ///
18460 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set_pd&expand=4924)
18461 #[inline]
18462 #[target_feature(enable = "avx512f")]
18463 pub unsafe fn _mm512_set_pd(
18464 e0: f64,
18465 e1: f64,
18466 e2: f64,
18467 e3: f64,
18468 e4: f64,
18469 e5: f64,
18470 e6: f64,
18471 e7: f64,
18472 ) -> __m512d {
18473 _mm512_setr_pd(e7, e6, e5, e4, e3, e2, e1, e0)
18474 }
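// Editorial usage sketch (not part of the upstream source): `_mm512_set_pd` takes its arguments
// from the highest element down to element 0, while `_mm512_setr_pd` takes them in memory order.
// Requires AVX-512F at runtime; the function name is illustrative only.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn set_pd_order_example() {
    let hi_to_lo = _mm512_set_pd(7., 6., 5., 4., 3., 2., 1., 0.);
    let lo_to_hi = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
    let mut a = [0f64; 8];
    let mut b = [0f64; 8];
    _mm512_storeu_pd(a.as_mut_ptr(), hi_to_lo);
    _mm512_storeu_pd(b.as_mut_ptr(), lo_to_hi);
    assert_eq!(a, b);
    assert_eq!(a[0], 0.0); // element 0 is the lowest lane
}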
18475
18476 /// Move the lower single-precision (32-bit) floating-point element from b to the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
18477 ///
18478 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_move_ss&expand=3832)
18479 #[inline]
18480 #[target_feature(enable = "avx512f")]
18481 #[cfg_attr(test, assert_instr(vmovss))]
18482 pub unsafe fn _mm_mask_move_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
18483 let extractsrc: f32 = simd_extract(src, 0);
18484 let mut mov: f32 = extractsrc;
18485 if (k & 0b00000001) != 0 {
18486 mov = simd_extract(b, 0);
18487 }
18488 let r = simd_insert(a, 0, mov);
18489 transmute(r)
18490 }
18491
18492 /// Move the lower single-precision (32-bit) floating-point element from b to the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
18493 ///
18494 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_move_ss&expand=3833)
18495 #[inline]
18496 #[target_feature(enable = "avx512f")]
18497 #[cfg_attr(test, assert_instr(vmovss))]
18498 pub unsafe fn _mm_maskz_move_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
18499 let mut mov: f32 = 0.;
18500 if (k & 0b00000001) != 0 {
18501 mov = simd_extract(b, 0);
18502 }
18503 let r = simd_insert(a, 0, mov);
18504 transmute(r)
18505 }
18506
18507 /// Move the lower double-precision (64-bit) floating-point element from b to the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
18508 ///
18509 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_move_sd&expand=3829)
18510 #[inline]
18511 #[target_feature(enable = "avx512f")]
18512 #[cfg_attr(test, assert_instr(vmovsd))]
18513 pub unsafe fn _mm_mask_move_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
18514 let extractsrc: f64 = simd_extract(src, 0);
18515 let mut mov: f64 = extractsrc;
18516 if (k & 0b00000001) != 0 {
18517 mov = simd_extract(b, 0);
18518 }
18519 let r = simd_insert(a, 0, mov);
18520 transmute(r)
18521 }
18522
18523 /// Move the lower double-precision (64-bit) floating-point element from b to the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
18524 ///
18525 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_move_sd&expand=3830)
18526 #[inline]
18527 #[target_feature(enable = "avx512f")]
18528 #[cfg_attr(test, assert_instr(vmovsd))]
18529 pub unsafe fn _mm_maskz_move_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
18530 let mut mov: f64 = 0.;
18531 if (k & 0b00000001) != 0 {
18532 mov = simd_extract(b, 0);
18533 }
18534 let r = simd_insert(a, 0, mov);
18535 transmute(r)
18536 }
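// Editorial usage sketch (not part of the upstream source): only mask bit 0 matters for the
// scalar moves; the writemask form falls back to src's lower element, the zeromask form to 0.
// Requires AVX-512F at runtime; the function name is illustrative only.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn mask_move_ss_example() {
    let src = _mm_set_ss(9.0);
    let a = _mm_set_ss(1.0);
    let b = _mm_set_ss(2.0);
    // Bit 0 set: the lower lane is taken from b.
    assert_eq!(_mm_cvtss_f32(_mm_mask_move_ss(src, 0b1, a, b)), 2.0);
    // Bit 0 clear: the writemask form keeps src's lower lane, the zeromask form zeroes it.
    assert_eq!(_mm_cvtss_f32(_mm_mask_move_ss(src, 0b0, a, b)), 9.0);
    assert_eq!(_mm_cvtss_f32(_mm_maskz_move_ss(0b0, a, b)), 0.0);
}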
18537
18538 /// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
18539 ///
18540 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_ss&expand=159)
18541 #[inline]
18542 #[target_feature(enable = "avx512f")]
18543 #[cfg_attr(test, assert_instr(vaddss))]
18544 pub unsafe fn _mm_mask_add_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
18545 let extractsrc: f32 = simd_extract(src, 0);
18546 let mut add: f32 = extractsrc;
18547 if (k & 0b00000001) != 0 {
18548 let extracta: f32 = simd_extract(a, 0);
18549 let extractb: f32 = simd_extract(b, 0);
18550 add = extracta + extractb;
18551 }
18552 let r = simd_insert(a, 0, add);
18553 transmute(r)
18554 }
18555
18556 /// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
18557 ///
18558 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_add_ss&expand=160)
18559 #[inline]
18560 #[target_feature(enable = "avx512f")]
18561 #[cfg_attr(test, assert_instr(vaddss))]
18562 pub unsafe fn _mm_maskz_add_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
18563 let mut add: f32 = 0.;
18564 if (k & 0b00000001) != 0 {
18565 let extracta: f32 = simd_extract(a, 0);
18566 let extractb: f32 = simd_extract(b, 0);
18567 add = extracta + extractb;
18568 }
18569 let r = simd_insert(a, 0, add);
18570 transmute(r)
18571 }
18572
18573 /// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
18574 ///
18575 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_sd&expand=155)
18576 #[inline]
18577 #[target_feature(enable = "avx512f")]
18578 #[cfg_attr(test, assert_instr(vaddsd))]
18579 pub unsafe fn _mm_mask_add_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
18580 let extractsrc: f64 = simd_extract(src, 0);
18581 let mut add: f64 = extractsrc;
18582 if (k & 0b00000001) != 0 {
18583 let extracta: f64 = simd_extract(a, 0);
18584 let extractb: f64 = simd_extract(b, 0);
18585 add = extracta + extractb;
18586 }
18587 let r = simd_insert(a, 0, add);
18588 transmute(r)
18589 }
18590
18591 /// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
18592 ///
18593 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_add_sd&expand=156)
18594 #[inline]
18595 #[target_feature(enable = "avx512f")]
18596 #[cfg_attr(test, assert_instr(vaddsd))]
18597 pub unsafe fn _mm_maskz_add_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
18598 let mut add: f64 = 0.;
18599 if (k & 0b00000001) != 0 {
18600 let extracta: f64 = simd_extract(a, 0);
18601 let extractb: f64 = simd_extract(b, 0);
18602 add = extracta + extractb;
18603 }
18604 let r = simd_insert(a, 0, add);
18605 transmute(r)
18606 }
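// Editorial usage sketch (not part of the upstream source): the masked scalar arithmetic follows
// the same pattern as the scalar moves, but computes `a0 + b0` when mask bit 0 is set.
// Requires AVX-512F at runtime; the function name is illustrative only.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn mask_add_ss_example() {
    let src = _mm_set_ss(9.0);
    let a = _mm_set_ss(1.0);
    let b = _mm_set_ss(2.0);
    // Bit 0 set: the lower lane is 1.0 + 2.0.
    assert_eq!(_mm_cvtss_f32(_mm_mask_add_ss(src, 0b1, a, b)), 3.0);
    // Bit 0 clear: copy from src (writemask) or zero (zeromask).
    assert_eq!(_mm_cvtss_f32(_mm_mask_add_ss(src, 0b0, a, b)), 9.0);
    assert_eq!(_mm_cvtss_f32(_mm_maskz_add_ss(0b0, a, b)), 0.0);
}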
18607
18608 /// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
18609 ///
18610 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_ss&expand=5750)
18611 #[inline]
18612 #[target_feature(enable = "avx512f")]
18613 #[cfg_attr(test, assert_instr(vsubss))]
18614 pub unsafe fn _mm_mask_sub_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
18615 let extractsrc: f32 = simd_extract(src, 0);
18616 let mut add: f32 = extractsrc;
18617 if (k & 0b00000001) != 0 {
18618 let extracta: f32 = simd_extract(a, 0);
18619 let extractb: f32 = simd_extract(b, 0);
18620 add = extracta - extractb;
18621 }
18622 let r = simd_insert(a, 0, add);
18623 transmute(r)
18624 }
18625
18626 /// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
18627 ///
18628 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_ss&expand=5751)
18629 #[inline]
18630 #[target_feature(enable = "avx512f")]
18631 #[cfg_attr(test, assert_instr(vsubss))]
18632 pub unsafe fn _mm_maskz_sub_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
18633 let mut add: f32 = 0.;
18634 if (k & 0b00000001) != 0 {
18635 let extracta: f32 = simd_extract(a, 0);
18636 let extractb: f32 = simd_extract(b, 0);
18637 add = extracta - extractb;
18638 }
18639 let r = simd_insert(a, 0, add);
18640 transmute(r)
18641 }
18642
18643 /// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
18644 ///
18645 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_sd&expand=5746)
18646 #[inline]
18647 #[target_feature(enable = "avx512f")]
18648 #[cfg_attr(test, assert_instr(vsubsd))]
18649 pub unsafe fn _mm_mask_sub_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
18650 let extractsrc: f64 = simd_extract(src, 0);
18651 let mut add: f64 = extractsrc;
18652 if (k & 0b00000001) != 0 {
18653 let extracta: f64 = simd_extract(a, 0);
18654 let extractb: f64 = simd_extract(b, 0);
18655 add = extracta - extractb;
18656 }
18657 let r = simd_insert(a, 0, add);
18658 transmute(r)
18659 }
18660
18661 /// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
18662 ///
18663 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_sd&expand=5747)
18664 #[inline]
18665 #[target_feature(enable = "avx512f")]
18666 #[cfg_attr(test, assert_instr(vsubsd))]
18667 pub unsafe fn _mm_maskz_sub_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
18668 let mut sub: f64 = 0.;
18669 if (k & 0b00000001) != 0 {
18670 let extracta: f64 = simd_extract(a, 0);
18671 let extractb: f64 = simd_extract(b, 0);
18672 sub = extracta - extractb;
18673 }
18674 let r = simd_insert(a, 0, sub);
18675 transmute(r)
18676 }
18677
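// Editorial usage sketch (not part of the upstream source): a minimal,
// test-style illustration of the writemask/zeromask behaviour of the masked
// scalar subtractions above, assuming an AVX-512F-capable CPU at test time.
// The module and function names below are hypothetical.
#[cfg(test)]
mod scalar_sub_mask_sketch {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "avx512f")]
    unsafe fn writemask_vs_zeromask() {
        let src = _mm_set_ss(42.);
        let a = _mm_set_ss(7.);
        let b = _mm_set_ss(2.);
        // Mask bit 0 set: lane 0 holds a - b.
        assert_eq!(_mm_cvtss_f32(_mm_mask_sub_ss(src, 0b1, a, b)), 5.);
        // Mask bit 0 clear: the writemask form copies lane 0 from src,
        // while the zeromask form zeroes it.
        assert_eq!(_mm_cvtss_f32(_mm_mask_sub_ss(src, 0b0, a, b)), 42.);
        assert_eq!(_mm_cvtss_f32(_mm_maskz_sub_ss(0b0, a, b)), 0.);
    }
}
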
18678 /// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
18679 ///
18680 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_ss&expand=3950)
18681 #[inline]
18682 #[target_feature(enable = "avx512f")]
18683 #[cfg_attr(test, assert_instr(vmulss))]
18684 pub unsafe fn _mm_mask_mul_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
18685 let extractsrc: f32 = simd_extract(src, 0);
18686 let mut mul: f32 = extractsrc;
18687 if (k & 0b00000001) != 0 {
18688 let extracta: f32 = simd_extract(a, 0);
18689 let extractb: f32 = simd_extract(b, 0);
18690 mul = extracta * extractb;
18691 }
18692 let r = simd_insert(a, 0, mul);
18693 transmute(r)
18694 }
18695
18696 /// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
18697 ///
18698 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_ss&expand=3951)
18699 #[inline]
18700 #[target_feature(enable = "avx512f")]
18701 #[cfg_attr(test, assert_instr(vmulss))]
18702 pub unsafe fn _mm_maskz_mul_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
18703 let mut mul: f32 = 0.;
18704 if (k & 0b00000001) != 0 {
18705 let extracta: f32 = simd_extract(a, 0);
18706 let extractb: f32 = simd_extract(b, 0);
18707 mul = extracta * extractb;
18708 }
18709 let r = simd_insert(a, 0, mul);
18710 transmute(r)
18711 }
18712
18713 /// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
18714 ///
18715 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_sd&expand=3947)
18716 #[inline]
18717 #[target_feature(enable = "avx512f")]
18718 #[cfg_attr(test, assert_instr(vmulsd))]
18719 pub unsafe fn _mm_mask_mul_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
18720 let extractsrc: f64 = simd_extract(src, 0);
18721 let mut mul: f64 = extractsrc;
18722 if (k & 0b00000001) != 0 {
18723 let extracta: f64 = simd_extract(a, 0);
18724 let extractb: f64 = simd_extract(b, 0);
18725 mul = extracta * extractb;
18726 }
18727 let r = simd_insert(a, 0, mul);
18728 transmute(r)
18729 }
18730
18731 /// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
18732 ///
18733 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_sd&expand=3948)
18734 #[inline]
18735 #[target_feature(enable = "avx512f")]
18736 #[cfg_attr(test, assert_instr(vmulsd))]
18737 pub unsafe fn _mm_maskz_mul_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
18738 let mut mul: f64 = 0.;
18739 if (k & 0b00000001) != 0 {
18740 let extracta: f64 = simd_extract(a, 0);
18741 let extractb: f64 = simd_extract(b, 0);
18742 mul = extracta * extractb;
18743 }
18744 let r = simd_insert(a, 0, mul);
18745 transmute(r)
18746 }
18747
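// Editorial usage sketch (not part of the upstream source): the double-precision
// masked multiplies above follow the same lane-0 convention; names below are
// hypothetical and the sketch assumes an AVX-512F-capable CPU at test time.
#[cfg(test)]
mod scalar_mul_mask_sketch {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "avx512f")]
    unsafe fn writemask_vs_zeromask() {
        let src = _mm_set_sd(10.);
        let a = _mm_set_sd(3.);
        let b = _mm_set_sd(4.);
        // Mask bit 0 set: lane 0 holds a * b.
        assert_eq!(_mm_cvtsd_f64(_mm_mask_mul_sd(src, 0b1, a, b)), 12.);
        // Mask bit 0 clear: src is copied through (writemask) or zeroed (zeromask).
        assert_eq!(_mm_cvtsd_f64(_mm_mask_mul_sd(src, 0b0, a, b)), 10.);
        assert_eq!(_mm_cvtsd_f64(_mm_maskz_mul_sd(0b0, a, b)), 0.);
    }
}
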
18748 /// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
18749 ///
18750 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_ss&expand=2181)
18751 #[inline]
18752 #[target_feature(enable = "avx512f")]
18753 #[cfg_attr(test, assert_instr(vdivss))]
18754 pub unsafe fn _mm_mask_div_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
18755 let extractsrc: f32 = simd_extract(src, 0);
18756 let mut div: f32 = extractsrc;
18757 if (k & 0b00000001) != 0 {
18758 let extracta: f32 = simd_extract(a, 0);
18759 let extractb: f32 = simd_extract(b, 0);
18760 div = extracta / extractb;
18761 }
18762 let r = simd_insert(a, 0, div);
18763 transmute(r)
18764 }
18765
18766 /// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
18767 ///
18768 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_ss&expand=2182)
18769 #[inline]
18770 #[target_feature(enable = "avx512f")]
18771 #[cfg_attr(test, assert_instr(vdivss))]
18772 pub unsafe fn _mm_maskz_div_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
18773 let mut div: f32 = 0.;
18774 if (k & 0b00000001) != 0 {
18775 let extracta: f32 = simd_extract(a, 0);
18776 let extractb: f32 = simd_extract(b, 0);
18777 div = extracta / extractb;
18778 }
18779 let r = simd_insert(a, 0, div);
18780 transmute(r)
18781 }
18782
18783 /// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
18784 ///
18785 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_sd&expand=2178)
18786 #[inline]
18787 #[target_feature(enable = "avx512f")]
18788 #[cfg_attr(test, assert_instr(vdivsd))]
18789 pub unsafe fn _mm_mask_div_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
18790 let extractsrc: f64 = simd_extract(src, 0);
18791 let mut div: f64 = extractsrc;
18792 if (k & 0b00000001) != 0 {
18793 let extracta: f64 = simd_extract(a, 0);
18794 let extractb: f64 = simd_extract(b, 0);
18795 div = extracta / extractb;
18796 }
18797 let r = simd_insert(a, 0, div);
18798 transmute(r)
18799 }
18800
18801 /// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
18802 ///
18803 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_sd&expand=2179)
18804 #[inline]
18805 #[target_feature(enable = "avx512f")]
18806 #[cfg_attr(test, assert_instr(vdivsd))]
18807 pub unsafe fn _mm_maskz_div_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
18808 let mut div: f64 = 0.;
18809 if (k & 0b00000001) != 0 {
18810 let extracta: f64 = simd_extract(a, 0);
18811 let extractb: f64 = simd_extract(b, 0);
18812 div = extracta / extractb;
18813 }
18814 let r = simd_insert(a, 0, div);
18815 transmute(r)
18816 }
18817
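// Editorial usage sketch (not part of the upstream source): the dividend comes
// from a and the divisor from b in the masked scalar divisions above; names
// below are hypothetical and an AVX-512F CPU is assumed at test time.
#[cfg(test)]
mod scalar_div_mask_sketch {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "avx512f")]
    unsafe fn lane0_division() {
        let src = _mm_set_ss(1.);
        let a = _mm_set_ss(9.);
        let b = _mm_set_ss(4.);
        // Mask bit 0 set: lane 0 holds a / b = 2.25 in both forms.
        assert_eq!(_mm_cvtss_f32(_mm_mask_div_ss(src, 0b1, a, b)), 2.25);
        assert_eq!(_mm_cvtss_f32(_mm_maskz_div_ss(0b1, a, b)), 2.25);
        // Mask bit 0 clear: the writemask form falls back to src.
        assert_eq!(_mm_cvtss_f32(_mm_mask_div_ss(src, 0b0, a, b)), 1.);
    }
}
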
18818 /// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
18819 ///
18820 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_ss&expand=3672)
18821 #[inline]
18822 #[target_feature(enable = "avx512f")]
18823 #[cfg_attr(test, assert_instr(vmaxss))]
18824 pub unsafe fn _mm_mask_max_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
18825 transmute(vmaxss(
18826 a.as_f32x4(),
18827 b.as_f32x4(),
18828 src.as_f32x4(),
18829 k,
18830 _MM_FROUND_CUR_DIRECTION,
18831 ))
18832 }
18833
18834 /// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
18835 ///
18836 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_ss&expand=3673)
18837 #[inline]
18838 #[target_feature(enable = "avx512f")]
18839 #[cfg_attr(test, assert_instr(vmaxss))]
18840 pub unsafe fn _mm_maskz_max_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
18841 transmute(vmaxss(
18842 a.as_f32x4(),
18843 b.as_f32x4(),
18844 _mm_setzero_ps().as_f32x4(),
18845 k,
18846 _MM_FROUND_CUR_DIRECTION,
18847 ))
18848 }
18849
18850 /// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
18851 ///
18852 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_sd&expand=3669)
18853 #[inline]
18854 #[target_feature(enable = "avx512f")]
18855 #[cfg_attr(test, assert_instr(vmaxsd))]
18856 pub unsafe fn _mm_mask_max_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
18857 transmute(vmaxsd(
18858 a.as_f64x2(),
18859 b.as_f64x2(),
18860 src.as_f64x2(),
18861 k,
18862 _MM_FROUND_CUR_DIRECTION,
18863 ))
18864 }
18865
18866 /// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
18867 ///
18868 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_sd&expand=3670)
18869 #[inline]
18870 #[target_feature(enable = "avx512f")]
18871 #[cfg_attr(test, assert_instr(vmaxsd))]
18872 pub unsafe fn _mm_maskz_max_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
18873 transmute(vmaxsd(
18874 a.as_f64x2(),
18875 b.as_f64x2(),
18876 _mm_setzero_pd().as_f64x2(),
18877 k,
18878 _MM_FROUND_CUR_DIRECTION,
18879 ))
18880 }
18881
18882 /// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
18883 ///
18884 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_ss&expand=3786)
18885 #[inline]
18886 #[target_feature(enable = "avx512f")]
18887 #[cfg_attr(test, assert_instr(vminss))]
18888 pub unsafe fn _mm_mask_min_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
18889 transmute(vminss(
18890 a.as_f32x4(),
18891 b.as_f32x4(),
18892 src.as_f32x4(),
18893 k,
18894 _MM_FROUND_CUR_DIRECTION,
18895 ))
18896 }
18897
18898 /// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
18899 ///
18900 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_ss&expand=3787)
18901 #[inline]
18902 #[target_feature(enable = "avx512f")]
18903 #[cfg_attr(test, assert_instr(vminss))]
18904 pub unsafe fn _mm_maskz_min_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
18905 transmute(vminss(
18906 a.as_f32x4(),
18907 b.as_f32x4(),
18908 _mm_setzero_ps().as_f32x4(),
18909 k,
18910 _MM_FROUND_CUR_DIRECTION,
18911 ))
18912 }
18913
18914 /// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
18915 ///
18916 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_sd&expand=3783)
18917 #[inline]
18918 #[target_feature(enable = "avx512f")]
18919 #[cfg_attr(test, assert_instr(vminsd))]
18920 pub unsafe fn _mm_mask_min_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
18921 transmute(vminsd(
18922 a.as_f64x2(),
18923 b.as_f64x2(),
18924 src.as_f64x2(),
18925 k,
18926 _MM_FROUND_CUR_DIRECTION,
18927 ))
18928 }
18929
18930 /// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
18931 ///
18932 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_sd&expand=3784)
18933 #[inline]
18934 #[target_feature(enable = "avx512f")]
18935 #[cfg_attr(test, assert_instr(vminsd))]
18936 pub unsafe fn _mm_maskz_min_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
18937 transmute(vminsd(
18938 a.as_f64x2(),
18939 b.as_f64x2(),
18940 _mm_setzero_pd().as_f64x2(),
18941 k,
18942 _MM_FROUND_CUR_DIRECTION,
18943 ))
18944 }
18945
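// Editorial usage sketch (not part of the upstream source): the masked scalar
// max/min intrinsics above compare only lane 0 of a and b; names below are
// hypothetical and an AVX-512F CPU is assumed at test time.
#[cfg(test)]
mod scalar_minmax_mask_sketch {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "avx512f")]
    unsafe fn lane0_max_and_min() {
        let src = _mm_set_ss(0.5);
        let a = _mm_set_ss(3.);
        let b = _mm_set_ss(8.);
        // Mask bit 0 set: lane 0 holds max(a, b) or min(a, b).
        assert_eq!(_mm_cvtss_f32(_mm_mask_max_ss(src, 0b1, a, b)), 8.);
        assert_eq!(_mm_cvtss_f32(_mm_mask_min_ss(src, 0b1, a, b)), 3.);
        // Mask bit 0 clear: lane 0 comes from src (writemask) or is zeroed (zeromask).
        assert_eq!(_mm_cvtss_f32(_mm_mask_max_ss(src, 0b0, a, b)), 0.5);
        assert_eq!(_mm_cvtss_f32(_mm_maskz_min_ss(0b0, a, b)), 0.);
    }
}
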
18946 /// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
18947 ///
18948 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_ss&expand=5387)
18949 #[inline]
18950 #[target_feature(enable = "avx512f")]
18951 #[cfg_attr(test, assert_instr(vsqrtss))]
18952 pub unsafe fn _mm_mask_sqrt_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
18953 transmute(vsqrtss(
18954 a.as_f32x4(),
18955 b.as_f32x4(),
18956 src.as_f32x4(),
18957 k,
18958 _MM_FROUND_CUR_DIRECTION,
18959 ))
18960 }
18961
18962 /// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
18963 ///
18964 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_ss&expand=5388)
18965 #[inline]
18966 #[target_feature(enable = "avx512f")]
18967 #[cfg_attr(test, assert_instr(vsqrtss))]
18968 pub unsafe fn _mm_maskz_sqrt_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
18969 transmute(vsqrtss(
18970 a.as_f32x4(),
18971 b.as_f32x4(),
18972 _mm_setzero_ps().as_f32x4(),
18973 k,
18974 _MM_FROUND_CUR_DIRECTION,
18975 ))
18976 }
18977
18978 /// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
18979 ///
18980 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_sd&expand=5384)
18981 #[inline]
18982 #[target_feature(enable = "avx512f")]
18983 #[cfg_attr(test, assert_instr(vsqrtsd))]
18984 pub unsafe fn _mm_mask_sqrt_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
18985 transmute(vsqrtsd(
18986 a.as_f64x2(),
18987 b.as_f64x2(),
18988 src.as_f64x2(),
18989 k,
18990 _MM_FROUND_CUR_DIRECTION,
18991 ))
18992 }
18993
18994 /// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
18995 ///
18996 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_sd&expand=5385)
18997 #[inline]
18998 #[target_feature(enable = "avx512f")]
18999 #[cfg_attr(test, assert_instr(vsqrtsd))]
19000 pub unsafe fn _mm_maskz_sqrt_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
19001 transmute(vsqrtsd(
19002 a.as_f64x2(),
19003 b.as_f64x2(),
19004 _mm_setzero_pd().as_f64x2(),
19005 k,
19006 _MM_FROUND_CUR_DIRECTION,
19007 ))
19008 }
19009
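// Editorial usage sketch (not part of the upstream source): the masked scalar
// square roots above take the operand from lane 0 of b, not a; names below are
// hypothetical and an AVX-512F CPU is assumed at test time.
#[cfg(test)]
mod scalar_sqrt_mask_sketch {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "avx512f")]
    unsafe fn sqrt_of_lane0_of_b() {
        let src = _mm_set_sd(-1.);
        let a = _mm_set_sd(100.);
        let b = _mm_set_sd(9.);
        // Mask bit 0 set: lane 0 holds sqrt(b) = 3, regardless of a's lane 0.
        assert_eq!(_mm_cvtsd_f64(_mm_mask_sqrt_sd(src, 0b1, a, b)), 3.);
        // Mask bit 0 clear: src is copied through (writemask) or zeroed (zeromask).
        assert_eq!(_mm_cvtsd_f64(_mm_mask_sqrt_sd(src, 0b0, a, b)), -1.);
        assert_eq!(_mm_cvtsd_f64(_mm_maskz_sqrt_sd(0b0, a, b)), 0.);
    }
}
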
19010 /// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14.
19011 ///
19012 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_rsqrt14_ss&expand=4825)
19013 #[inline]
19014 #[target_feature(enable = "avx512f")]
19015 #[cfg_attr(test, assert_instr(vrsqrt14ss))]
19016 pub unsafe fn _mm_rsqrt14_ss(a: __m128, b: __m128) -> __m128 {
19017 transmute(vrsqrt14ss(
19018 a.as_f32x4(),
19019 b.as_f32x4(),
19020 _mm_setzero_ps().as_f32x4(),
19021 0b1,
19022 ))
19023 }
19024
19025 /// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14.
19026 ///
19027 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_rsqrt14_ss&expand=4823)
19028 #[inline]
19029 #[target_feature(enable = "avx512f")]
19030 #[cfg_attr(test, assert_instr(vrsqrt14ss))]
19031 pub unsafe fn _mm_mask_rsqrt14_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
19032 transmute(vrsqrt14ss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k))
19033 }
19034
19035 /// Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14.
19036 ///
19037 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_rsqrt14_ss&expand=4824)
19038 #[inline]
19039 #[target_feature(enable = "avx512f")]
19040 #[cfg_attr(test, assert_instr(vrsqrt14ss))]
19041 pub unsafe fn _mm_maskz_rsqrt14_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
19042 transmute(vrsqrt14ss(
19043 a.as_f32x4(),
19044 b.as_f32x4(),
19045 _mm_setzero_ps().as_f32x4(),
19046 k,
19047 ))
19048 }
19049
19050 /// Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14.
19051 ///
19052 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_rsqrt14_sd&expand=4822)
19053 #[inline]
19054 #[target_feature(enable = "avx512f")]
19055 #[cfg_attr(test, assert_instr(vrsqrt14sd))]
19056 pub unsafe fn _mm_rsqrt14_sd(a: __m128d, b: __m128d) -> __m128d {
19057 transmute(vrsqrt14sd(
19058 a.as_f64x2(),
19059 b.as_f64x2(),
19060 _mm_setzero_pd().as_f64x2(),
19061 0b1,
19062 ))
19063 }
19064
19065 /// Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14.
19066 ///
19067 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_rsqrt14_sd&expand=4820)
19068 #[inline]
19069 #[target_feature(enable = "avx512f")]
19070 #[cfg_attr(test, assert_instr(vrsqrt14sd))]
19071 pub unsafe fn _mm_mask_rsqrt14_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
19072 transmute(vrsqrt14sd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k))
19073 }
19074
19075 /// Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14.
19076 ///
19077 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_rsqrt14_sd&expand=4821)
19078 #[inline]
19079 #[target_feature(enable = "avx512f")]
19080 #[cfg_attr(test, assert_instr(vrsqrt14sd))]
19081 pub unsafe fn _mm_maskz_rsqrt14_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
19082 transmute(vrsqrt14sd(
19083 a.as_f64x2(),
19084 b.as_f64x2(),
19085 _mm_setzero_pd().as_f64x2(),
19086 k,
19087 ))
19088 }
19089
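// Editorial usage sketch (not part of the upstream source): vrsqrt14 is an
// approximation, so the sketch only checks a loose bound derived from the
// documented 2^-14 relative error; names below are hypothetical and an
// AVX-512F CPU is assumed at test time.
#[cfg(test)]
mod scalar_rsqrt14_sketch {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "avx512f")]
    unsafe fn approximate_inverse_sqrt() {
        let a = _mm_set_ss(10.);
        let b = _mm_set_ss(4.);
        // Lane 0 approximates 1 / sqrt(4) = 0.5 to within 2^-14 relative error.
        let r = _mm_cvtss_f32(_mm_rsqrt14_ss(a, b));
        assert!(r > 0.4999 && r < 0.5001);
        // The zeromask form with mask bit 0 clear yields exactly 0 in lane 0.
        assert_eq!(_mm_cvtss_f32(_mm_maskz_rsqrt14_ss(0b0, a, b)), 0.);
    }
}
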
19090 /// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14.
19091 ///
19092 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_rcp14_ss&expand=4508)
19093 #[inline]
19094 #[target_feature(enable = "avx512f")]
19095 #[cfg_attr(test, assert_instr(vrcp14ss))]
19096 pub unsafe fn _mm_rcp14_ss(a: __m128, b: __m128) -> __m128 {
19097 transmute(vrcp14ss(
19098 a.as_f32x4(),
19099 b.as_f32x4(),
19100 _mm_setzero_ps().as_f32x4(),
19101 0b1,
19102 ))
19103 }
19104
19105 /// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14.
19106 ///
19107 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_rcp14_ss&expand=4506)
19108 #[inline]
19109 #[target_feature(enable = "avx512f")]
19110 #[cfg_attr(test, assert_instr(vrcp14ss))]
19111 pub unsafe fn _mm_mask_rcp14_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
19112 transmute(vrcp14ss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k))
19113 }
19114
19115 /// Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14.
19116 ///
19117 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_rcp14_ss&expand=4507)
19118 #[inline]
19119 #[target_feature(enable = "avx512f")]
19120 #[cfg_attr(test, assert_instr(vrcp14ss))]
19121 pub unsafe fn _mm_maskz_rcp14_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
19122 transmute(vrcp14ss(
19123 a.as_f32x4(),
19124 b.as_f32x4(),
19125 _mm_setzero_ps().as_f32x4(),
19126 k,
19127 ))
19128 }
19129
19130 /// Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14.
19131 ///
19132 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_rcp14_sd&expand=4505)
19133 #[inline]
19134 #[target_feature(enable = "avx512f")]
19135 #[cfg_attr(test, assert_instr(vrcp14sd))]
19136 pub unsafe fn _mm_rcp14_sd(a: __m128d, b: __m128d) -> __m128d {
19137 transmute(vrcp14sd(
19138 a.as_f64x2(),
19139 b.as_f64x2(),
19140 _mm_setzero_pd().as_f64x2(),
19141 0b1,
19142 ))
19143 }
19144
19145 /// Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14.
19146 ///
19147 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_rcp14_sd&expand=4503)
19148 #[inline]
19149 #[target_feature(enable = "avx512f")]
19150 #[cfg_attr(test, assert_instr(vrcp14sd))]
19151 pub unsafe fn _mm_mask_rcp14_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
19152 transmute(vrcp14sd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k))
19153 }
19154
19155 /// Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14.
19156 ///
19157 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_rcp14_sd&expand=4504)
19158 #[inline]
19159 #[target_feature(enable = "avx512f")]
19160 #[cfg_attr(test, assert_instr(vrcp14sd))]
19161 pub unsafe fn _mm_maskz_rcp14_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
19162 transmute(vrcp14sd(
19163 a.as_f64x2(),
19164 b.as_f64x2(),
19165 _mm_setzero_pd().as_f64x2(),
19166 k,
19167 ))
19168 }
19169
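// Editorial usage sketch (not part of the upstream source): like vrsqrt14,
// vrcp14 is approximate, so only a loose bound is checked; names below are
// hypothetical and an AVX-512F CPU is assumed at test time.
#[cfg(test)]
mod scalar_rcp14_sketch {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "avx512f")]
    unsafe fn approximate_reciprocal() {
        let a = _mm_set_sd(7.);
        let b = _mm_set_sd(8.);
        // Lane 0 approximates 1 / 8 = 0.125 to within 2^-14 relative error.
        let r = _mm_cvtsd_f64(_mm_rcp14_sd(a, b));
        assert!(r > 0.1249 && r < 0.1251);
        // Writemask form with mask bit 0 clear copies lane 0 from src.
        let src = _mm_set_sd(5.);
        assert_eq!(_mm_cvtsd_f64(_mm_mask_rcp14_sd(src, 0b0, a, b)), 5.);
    }
}
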
19170 /// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.
19171 ///
19172 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getexp_ss&expand=2862)
19173 #[inline]
19174 #[target_feature(enable = "avx512f")]
19175 #[cfg_attr(test, assert_instr(vgetexpss))]
19176 pub unsafe fn _mm_getexp_ss(a: __m128, b: __m128) -> __m128 {
19177 transmute(vgetexpss(
19178 a.as_f32x4(),
19179 b.as_f32x4(),
19180 _mm_setzero_ps().as_f32x4(),
19181 0b1,
19182 _MM_FROUND_NO_EXC,
19183 ))
19184 }
19185
19186 /// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.
19187 ///
19188 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getexp_ss&expand=2863)
19189 #[inline]
19190 #[target_feature(enable = "avx512f")]
19191 #[cfg_attr(test, assert_instr(vgetexpss))]
19192 pub unsafe fn _mm_mask_getexp_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
19193 transmute(vgetexpss(
19194 a.as_f32x4(),
19195 b.as_f32x4(),
19196 src.as_f32x4(),
19197 k,
19198 _MM_FROUND_NO_EXC,
19199 ))
19200 }
19201
19202 /// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.
19203 ///
19204 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getexp_ss&expand=2864)
19205 #[inline]
19206 #[target_feature(enable = "avx512f")]
19207 #[cfg_attr(test, assert_instr(vgetexpss))]
19208 pub unsafe fn _mm_maskz_getexp_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
19209 transmute(vgetexpss(
19210 a.as_f32x4(),
19211 b.as_f32x4(),
19212 _mm_setzero_ps().as_f32x4(),
19213 k,
19214 _MM_FROUND_NO_EXC,
19215 ))
19216 }
19217
19218 /// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.
19219 ///
19220 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getexp_sd&expand=2859)
19221 #[inline]
19222 #[target_feature(enable = "avx512f")]
19223 #[cfg_attr(test, assert_instr(vgetexpsd))]
19224 pub unsafe fn _mm_getexp_sd(a: __m128d, b: __m128d) -> __m128d {
19225 transmute(vgetexpsd(
19226 a.as_f64x2(),
19227 b.as_f64x2(),
19228 _mm_setzero_pd().as_f64x2(),
19229 0b1,
19230 _MM_FROUND_NO_EXC,
19231 ))
19232 }
19233
19234 /// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.
19235 ///
19236 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getexp_sd&expand=2860)
19237 #[inline]
19238 #[target_feature(enable = "avx512f")]
19239 #[cfg_attr(test, assert_instr(vgetexpsd))]
19240 pub unsafe fn _mm_mask_getexp_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
19241 transmute(vgetexpsd(
19242 a.as_f64x2(),
19243 b.as_f64x2(),
19244 src.as_f64x2(),
19245 k,
19246 _MM_FROUND_NO_EXC,
19247 ))
19248 }
19249
19250 /// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.
19251 ///
19252 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getexp_sd&expand=2861)
19253 #[inline]
19254 #[target_feature(enable = "avx512f")]
19255 #[cfg_attr(test, assert_instr(vgetexpsd))]
19256 pub unsafe fn _mm_maskz_getexp_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
19257 transmute(vgetexpsd(
19258 a.as_f64x2(),
19259 b.as_f64x2(),
19260 _mm_setzero_pd().as_f64x2(),
19261 k,
19262 _MM_FROUND_NO_EXC,
19263 ))
19264 }
19265
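// Editorial usage sketch (not part of the upstream source): getexp returns
// floor(log2(|x|)) of lane 0 of b as a floating-point value; names below are
// hypothetical and an AVX-512F CPU is assumed at test time.
#[cfg(test)]
mod scalar_getexp_sketch {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "avx512f")]
    unsafe fn exponent_of_lane0_of_b() {
        let a = _mm_set_ss(0.);
        let b = _mm_set_ss(20.);
        // floor(log2(20)) = 4, delivered as 4.0f32 in lane 0.
        assert_eq!(_mm_cvtss_f32(_mm_getexp_ss(a, b)), 4.);
        // Zeromask form with mask bit 0 clear zeroes lane 0.
        assert_eq!(_mm_cvtss_f32(_mm_maskz_getexp_ss(0b0, a, b)), 0.);
    }
}
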
19266 /// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
19267 /// The mantissa is normalized to the interval specified by interv, which can take the following values:\
19268 /// _MM_MANT_NORM_1_2 // interval [1, 2)\
19269 /// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
19270 /// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
19271 /// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
19272 /// The sign is determined by sc which can take the following values:\
19273 /// _MM_MANT_SIGN_src // sign = sign(src)\
19274 /// _MM_MANT_SIGN_zero // sign = 0\
19275 /// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\
19276 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
19277 ///
19278 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getmant_ss&expand=2898)
19279 #[inline]
19280 #[target_feature(enable = "avx512f")]
19281 #[cfg_attr(test, assert_instr(vgetmantss, norm = 0, sign = 0))]
19282 #[rustc_args_required_const(2, 3)]
19283 pub unsafe fn _mm_getmant_ss(
19284 a: __m128,
19285 b: __m128,
19286 norm: _MM_MANTISSA_NORM_ENUM,
19287 sign: _MM_MANTISSA_SIGN_ENUM,
19288 ) -> __m128 {
19289 macro_rules! call {
19290 ($imm4_1:expr, $imm2:expr) => {
19291 vgetmantss(
19292 a.as_f32x4(),
19293 b.as_f32x4(),
19294 $imm2 << 2 | $imm4_1,
19295 _mm_setzero_ps().as_f32x4(),
19296 0b1,
19297 _MM_FROUND_CUR_DIRECTION,
19298 )
19299 };
19300 }
19301 let r = constify_imm4_mantissas!(norm, sign, call);
19302 transmute(r)
19303 }
19304
19305 /// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
19306 /// The mantissa is normalized to the interval specified by interv, which can take the following values:\
19307 /// _MM_MANT_NORM_1_2 // interval [1, 2)\
19308 /// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
19309 /// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
19310 /// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
19311 /// The sign is determined by sc which can take the following values:\
19312 /// _MM_MANT_SIGN_src // sign = sign(src)\
19313 /// _MM_MANT_SIGN_zero // sign = 0\
19314 /// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\
19315 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
19316 ///
19317 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getmant_ss&expand=2899)
19318 #[inline]
19319 #[target_feature(enable = "avx512f")]
19320 #[cfg_attr(test, assert_instr(vgetmantss, norm = 0, sign = 0))]
19321 #[rustc_args_required_const(4, 5)]
19322 pub unsafe fn _mm_mask_getmant_ss(
19323 src: __m128,
19324 k: __mmask8,
19325 a: __m128,
19326 b: __m128,
19327 norm: _MM_MANTISSA_NORM_ENUM,
19328 sign: _MM_MANTISSA_SIGN_ENUM,
19329 ) -> __m128 {
19330 macro_rules! call {
19331 ($imm4_1:expr, $imm2:expr) => {
19332 vgetmantss(
19333 a.as_f32x4(),
19334 b.as_f32x4(),
19335 $imm2 << 2 | $imm4_1,
19336 src.as_f32x4(),
19337 k,
19338 _MM_FROUND_CUR_DIRECTION,
19339 )
19340 };
19341 }
19342 let r = constify_imm4_mantissas!(norm, sign, call);
19343 transmute(r)
19344 }
19345
19346 /// Normalize the mantissas of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
19347 /// The mantissa is normalized to the interval specified by interv, which can take the following values:\
19348 /// _MM_MANT_NORM_1_2 // interval [1, 2)\
19349 /// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
19350 /// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
19351 /// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
19352 /// The sign is determined by sc which can take the following values:\
19353 /// _MM_MANT_SIGN_src // sign = sign(src)\
19354 /// _MM_MANT_SIGN_zero // sign = 0\
19355 /// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\
19356 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
19357 ///
19358 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getmant_ss&expand=2900)
19359 #[inline]
19360 #[target_feature(enable = "avx512f")]
19361 #[cfg_attr(test, assert_instr(vgetmantss, norm = 0, sign = 0))]
19362 #[rustc_args_required_const(3, 4)]
19363 pub unsafe fn _mm_maskz_getmant_ss(
19364 k: __mmask8,
19365 a: __m128,
19366 b: __m128,
19367 norm: _MM_MANTISSA_NORM_ENUM,
19368 sign: _MM_MANTISSA_SIGN_ENUM,
19369 ) -> __m128 {
19370 macro_rules! call {
19371 ($imm4_1:expr, $imm2:expr) => {
19372 vgetmantss(
19373 a.as_f32x4(),
19374 b.as_f32x4(),
19375 $imm2 << 2 | $imm4_1,
19376 _mm_setzero_ps().as_f32x4(),
19377 k,
19378 _MM_FROUND_CUR_DIRECTION,
19379 )
19380 };
19381 }
19382 let r = constify_imm4_mantissas!(norm, sign, call);
19383 transmute(r)
19384 }
19385
19386 /// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
19387 /// The mantissa is normalized to the interval specified by interv, which can take the following values:\
19388 /// _MM_MANT_NORM_1_2 // interval [1, 2)\
19389 /// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
19390 /// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
19391 /// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
19392 /// The sign is determined by sc which can take the following values:\
19393 /// _MM_MANT_SIGN_src // sign = sign(src)\
19394 /// _MM_MANT_SIGN_zero // sign = 0\
19395 /// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\
19396 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
19397 ///
19398 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getmant_sd&expand=2895)
19399 #[inline]
19400 #[target_feature(enable = "avx512f")]
19401 #[cfg_attr(test, assert_instr(vgetmantsd, norm = 0, sign = 0))]
19402 #[rustc_args_required_const(2, 3)]
19403 pub unsafe fn _mm_getmant_sd(
19404 a: __m128d,
19405 b: __m128d,
19406 norm: _MM_MANTISSA_NORM_ENUM,
19407 sign: _MM_MANTISSA_SIGN_ENUM,
19408 ) -> __m128d {
19409 macro_rules! call {
19410 ($imm4_1:expr, $imm2:expr) => {
19411 vgetmantsd(
19412 a.as_f64x2(),
19413 b.as_f64x2(),
19414 $imm2 << 2 | $imm4_1,
19415 _mm_setzero_pd().as_f64x2(),
19416 0b1,
19417 _MM_FROUND_CUR_DIRECTION,
19418 )
19419 };
19420 }
19421 let r = constify_imm4_mantissas!(norm, sign, call);
19422 transmute(r)
19423 }
19424
19425 /// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
19426 /// The mantissa is normalized to the interval specified by interv, which can take the following values:\
19427 /// _MM_MANT_NORM_1_2 // interval [1, 2)\
19428 /// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
19429 /// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
19430 /// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
19431 /// The sign is determined by sc which can take the following values:\
19432 /// _MM_MANT_SIGN_src // sign = sign(src)\
19433 /// _MM_MANT_SIGN_zero // sign = 0\
19434 /// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\
19435 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
19436 ///
19437 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getmant_sd&expand=2896)
19438 #[inline]
19439 #[target_feature(enable = "avx512f")]
19440 #[cfg_attr(test, assert_instr(vgetmantsd, norm = 0, sign = 0))]
19441 #[rustc_args_required_const(4, 5)]
19442 pub unsafe fn _mm_mask_getmant_sd(
19443 src: __m128d,
19444 k: __mmask8,
19445 a: __m128d,
19446 b: __m128d,
19447 norm: _MM_MANTISSA_NORM_ENUM,
19448 sign: _MM_MANTISSA_SIGN_ENUM,
19449 ) -> __m128d {
19450 macro_rules! call {
19451 ($imm4_1:expr, $imm2:expr) => {
19452 vgetmantsd(
19453 a.as_f64x2(),
19454 b.as_f64x2(),
19455 $imm2 << 2 | $imm4_1,
19456 src.as_f64x2(),
19457 k,
19458 _MM_FROUND_CUR_DIRECTION,
19459 )
19460 };
19461 }
19462 let r = constify_imm4_mantissas!(norm, sign, call);
19463 transmute(r)
19464 }
19465
19466 /// Normalize the mantissas of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
19467 /// The mantissa is normalized to the interval specified by interv, which can take the following values:\
19468 /// _MM_MANT_NORM_1_2 // interval [1, 2)\
19469 /// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
19470 /// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
19471 /// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
19472 /// The sign is determined by sc which can take the following values:\
19473 /// _MM_MANT_SIGN_src // sign = sign(src)\
19474 /// _MM_MANT_SIGN_zero // sign = 0\
19475 /// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\
19476 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
19477 ///
19478 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getmant_sd&expand=2897)
19479 #[inline]
19480 #[target_feature(enable = "avx512f")]
19481 #[cfg_attr(test, assert_instr(vgetmantsd, norm = 0, sign = 0))]
19482 #[rustc_args_required_const(3, 4)]
19483 pub unsafe fn _mm_maskz_getmant_sd(
19484 k: __mmask8,
19485 a: __m128d,
19486 b: __m128d,
19487 norm: _MM_MANTISSA_NORM_ENUM,
19488 sign: _MM_MANTISSA_SIGN_ENUM,
19489 ) -> __m128d {
19490 macro_rules! call {
19491 ($imm4_1:expr, $imm2:expr) => {
19492 vgetmantsd(
19493 a.as_f64x2(),
19494 b.as_f64x2(),
19495 $imm2 << 2 | $imm4_1,
19496 _mm_setzero_pd().as_f64x2(),
19497 k,
19498 _MM_FROUND_CUR_DIRECTION,
19499 )
19500 };
19501 }
19502 let r = constify_imm4_mantissas!(norm, sign, call);
19503 transmute(r)
19504 }
19505
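// Editorial usage sketch (not part of the upstream source): getmant splits off
// the normalized mantissa of lane 0 of b. The constant spellings
// _MM_MANT_NORM_1_2 and _MM_MANT_SIGN_SRC are assumed to match this crate's
// _MM_MANTISSA_NORM_ENUM / _MM_MANTISSA_SIGN_ENUM definitions; an AVX-512F
// CPU is assumed at test time.
#[cfg(test)]
mod scalar_getmant_sketch {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "avx512f")]
    unsafe fn mantissa_of_lane0_of_b() {
        let a = _mm_set_ss(0.);
        let b = _mm_set_ss(10.);
        // 10.0 = 1.25 * 2^3, so normalizing to the interval [1, 2) yields 1.25.
        let r = _mm_getmant_ss(a, b, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC);
        assert_eq!(_mm_cvtss_f32(r), 1.25);
    }
}
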
19506 /// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
19507 /// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
19508 /// _MM_FROUND_TO_NEAREST_INT // round to nearest\
19509 /// _MM_FROUND_TO_NEG_INF // round down\
19510 /// _MM_FROUND_TO_POS_INF // round up\
19511 /// _MM_FROUND_TO_ZERO // truncate\
19512 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
19513 ///
19514 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_roundscale_ss&expand=4802)
19515 #[inline]
19516 #[target_feature(enable = "avx512f")]
19517 #[cfg_attr(test, assert_instr(vrndscaless, imm8 = 255))]
19518 #[rustc_args_required_const(2)]
19519 pub unsafe fn _mm_roundscale_ss(a: __m128, b: __m128, imm8: i32) -> __m128 {
19520 let a = a.as_f32x4();
19521 let b = b.as_f32x4();
19522 let zero = _mm_setzero_ps().as_f32x4();
19523 macro_rules! call {
19524 ($imm8:expr) => {
19525 vrndscaless(a, b, zero, 0b11111111, $imm8, _MM_FROUND_CUR_DIRECTION)
19526 };
19527 }
19528 let r = constify_imm8_sae!(imm8, call);
19529 transmute(r)
19530 }
19531
19532 /// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
19533 /// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
19534 /// _MM_FROUND_TO_NEAREST_INT // round to nearest\
19535 /// _MM_FROUND_TO_NEG_INF // round down\
19536 /// _MM_FROUND_TO_POS_INF // round up\
19537 /// _MM_FROUND_TO_ZERO // truncate\
19538 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
19539 ///
19540 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_roundscale_ss&expand=4800)
19541 #[inline]
19542 #[target_feature(enable = "avx512f")]
19543 #[cfg_attr(test, assert_instr(vrndscaless, imm8 = 0))]
19544 #[rustc_args_required_const(4)]
19545 pub unsafe fn _mm_mask_roundscale_ss(
19546 src: __m128,
19547 k: __mmask8,
19548 a: __m128,
19549 b: __m128,
19550 imm8: i32,
19551 ) -> __m128 {
19552 let a = a.as_f32x4();
19553 let b = b.as_f32x4();
19554 let src = src.as_f32x4();
19555 macro_rules! call {
19556 ($imm8:expr) => {
19557 vrndscaless(a, b, src, k, $imm8, _MM_FROUND_CUR_DIRECTION)
19558 };
19559 }
19560 let r = constify_imm8_sae!(imm8, call);
19561 transmute(r)
19562 }
19563
19564 /// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
19565 /// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
19566 /// _MM_FROUND_TO_NEAREST_INT // round to nearest\
19567 /// _MM_FROUND_TO_NEG_INF // round down\
19568 /// _MM_FROUND_TO_POS_INF // round up\
19569 /// _MM_FROUND_TO_ZERO // truncate\
19570 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
19571 ///
19572 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_roundscale_ss&expand=4801)
19573 #[inline]
19574 #[target_feature(enable = "avx512f")]
19575 #[cfg_attr(test, assert_instr(vrndscaless, imm8 = 0))]
19576 #[rustc_args_required_const(3)]
19577 pub unsafe fn _mm_maskz_roundscale_ss(k: __mmask8, a: __m128, b: __m128, imm8: i32) -> __m128 {
19578 let a = a.as_f32x4();
19579 let b = b.as_f32x4();
19580 let zero = _mm_setzero_ps().as_f32x4();
19581 macro_rules! call {
19582 ($imm8:expr) => {
19583 vrndscaless(a, b, zero, k, $imm8, _MM_FROUND_CUR_DIRECTION)
19584 };
19585 }
19586 let r = constify_imm8_sae!(imm8, call);
19587 transmute(r)
19588 }
19589
19590 /// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\
19591 /// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
19592 /// _MM_FROUND_TO_NEAREST_INT // round to nearest\
19593 /// _MM_FROUND_TO_NEG_INF // round down\
19594 /// _MM_FROUND_TO_POS_INF // round up\
19595 /// _MM_FROUND_TO_ZERO // truncate\
19596 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
19597 ///
19598 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_roundscale_sd&expand=4799)
19599 #[inline]
19600 #[target_feature(enable = "avx512f")]
19601 #[cfg_attr(test, assert_instr(vrndscalesd, imm8 = 255))]
19602 #[rustc_args_required_const(2)]
19603 pub unsafe fn _mm_roundscale_sd(a: __m128d, b: __m128d, imm8: i32) -> __m128d {
19604 let a = a.as_f64x2();
19605 let b = b.as_f64x2();
19606 let zero = _mm_setzero_pd().as_f64x2();
19607 macro_rules! call {
19608 ($imm8:expr) => {
19609 vrndscalesd(a, b, zero, 0b11111111, $imm8, _MM_FROUND_CUR_DIRECTION)
19610 };
19611 }
19612 let r = constify_imm8_sae!(imm8, call);
19613 transmute(r)
19614 }
19615
19616 /// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
19617 /// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
19618 /// _MM_FROUND_TO_NEAREST_INT // round to nearest\
19619 /// _MM_FROUND_TO_NEG_INF // round down\
19620 /// _MM_FROUND_TO_POS_INF // round up\
19621 /// _MM_FROUND_TO_ZERO // truncate\
19622 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
19623 ///
19624 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_roundscale_sd&expand=4797)
19625 #[inline]
19626 #[target_feature(enable = "avx512f")]
19627 #[cfg_attr(test, assert_instr(vrndscalesd, imm8 = 0))]
19628 #[rustc_args_required_const(4)]
19629 pub unsafe fn _mm_mask_roundscale_sd(
19630 src: __m128d,
19631 k: __mmask8,
19632 a: __m128d,
19633 b: __m128d,
19634 imm8: i32,
19635 ) -> __m128d {
19636 let a = a.as_f64x2();
19637 let b = b.as_f64x2();
19638 let src = src.as_f64x2();
19639 macro_rules! call {
19640 ($imm8:expr) => {
19641 vrndscalesd(a, b, src, k, $imm8, _MM_FROUND_CUR_DIRECTION)
19642 };
19643 }
19644 let r = constify_imm8_sae!(imm8, call);
19645 transmute(r)
19646 }
19647
19648 /// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
19649 /// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
19650 /// _MM_FROUND_TO_NEAREST_INT // round to nearest\
19651 /// _MM_FROUND_TO_NEG_INF // round down\
19652 /// _MM_FROUND_TO_POS_INF // round up\
19653 /// _MM_FROUND_TO_ZERO // truncate\
19654 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
19655 ///
19656 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_roundscale_sd&expand=4798)
19657 #[inline]
19658 #[target_feature(enable = "avx512f")]
19659 #[cfg_attr(test, assert_instr(vrndscalesd, imm8 = 0))]
19660 #[rustc_args_required_const(3)]
19661 pub unsafe fn _mm_maskz_roundscale_sd(k: __mmask8, a: __m128d, b: __m128d, imm8: i32) -> __m128d {
19662 let a = a.as_f64x2();
19663 let b = b.as_f64x2();
19664 let zero = _mm_setzero_pd().as_f64x2();
19665 macro_rules! call {
19666 ($imm8:expr) => {
19667 vrndscalesd(a, b, zero, k, $imm8, _MM_FROUND_CUR_DIRECTION)
19668 };
19669 }
19670 let r = constify_imm8_sae!(imm8, call);
19671 transmute(r)
19672 }
19673
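// Editorial usage sketch (not part of the upstream source): roundscale keeps
// imm8[7:4] fraction bits, so imm8 = 0 rounds lane 0 of b to a whole number;
// names below are hypothetical and an AVX-512F CPU is assumed at test time.
#[cfg(test)]
mod scalar_roundscale_sketch {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "avx512f")]
    unsafe fn round_lane0_of_b() {
        let a = _mm_set_ss(0.);
        let b = _mm_set_ss(1.1);
        // imm8 = 0: round to nearest with zero fraction bits, so 1.1 -> 1.0.
        assert_eq!(_mm_cvtss_f32(_mm_roundscale_ss(a, b, 0)), 1.0);
        // Writemask form with mask bit 0 clear copies lane 0 from src.
        let src = _mm_set_ss(7.);
        assert_eq!(_mm_cvtss_f32(_mm_mask_roundscale_ss(src, 0b0, a, b, 0)), 7.);
    }
}
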
19674 /// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
19675 ///
19676 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_scalef_ss&expand=4901)
19677 #[inline]
19678 #[target_feature(enable = "avx512f")]
19679 #[cfg_attr(test, assert_instr(vscalefss))]
19680 pub unsafe fn _mm_scalef_ss(a: __m128, b: __m128) -> __m128 {
19681 transmute(vscalefss(
19682 a.as_f32x4(),
19683 b.as_f32x4(),
19684 _mm_setzero_ps().as_f32x4(),
19685 0b11111111,
19686 _MM_FROUND_CUR_DIRECTION,
19687 ))
19688 }
19689
19690 /// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
19691 ///
19692 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_scalef_ss&expand=4899)
19693 #[inline]
19694 #[target_feature(enable = "avx512f")]
19695 #[cfg_attr(test, assert_instr(vscalefss))]
19696 pub unsafe fn _mm_mask_scalef_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
19697 transmute(vscalefss(
19698 a.as_f32x4(),
19699 b.as_f32x4(),
19700 src.as_f32x4(),
19701 k,
19702 _MM_FROUND_CUR_DIRECTION,
19703 ))
19704 }
19705
19706 /// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
19707 ///
19708 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_scalef_ss&expand=4900)
19709 #[inline]
19710 #[target_feature(enable = "avx512f")]
19711 #[cfg_attr(test, assert_instr(vscalefss))]
19712 pub unsafe fn _mm_maskz_scalef_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
19713 transmute(vscalefss(
19714 a.as_f32x4(),
19715 b.as_f32x4(),
19716 _mm_setzero_ps().as_f32x4(),
19717 k,
19718 _MM_FROUND_CUR_DIRECTION,
19719 ))
19720 }
19721
19722 /// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
19723 ///
19724 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_scalef_sd&expand=4898)
19725 #[inline]
19726 #[target_feature(enable = "avx512f")]
19727 #[cfg_attr(test, assert_instr(vscalefsd))]
19728 pub unsafe fn _mm_scalef_sd(a: __m128d, b: __m128d) -> __m128d {
19729 transmute(vscalefsd(
19730 a.as_f64x2(),
19731 b.as_f64x2(),
19732 _mm_setzero_pd().as_f64x2(),
19733 0b11111111,
19734 _MM_FROUND_CUR_DIRECTION,
19735 ))
19736 }
19737
19738 /// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
19739 ///
19740 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_scalef_sd&expand=4896)
19741 #[inline]
19742 #[target_feature(enable = "avx512f")]
19743 #[cfg_attr(test, assert_instr(vscalefsd))]
19744 pub unsafe fn _mm_mask_scalef_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
19745 transmute(vscalefsd(
19746 a.as_f64x2(),
19747 b.as_f64x2(),
19748 src.as_f64x2(),
19749 k,
19750 _MM_FROUND_CUR_DIRECTION,
19751 ))
19752 }
19753
19754 /// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
19755 ///
19756 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_scalef_sd&expand=4897)
19757 #[inline]
19758 #[target_feature(enable = "avx512f")]
19759 #[cfg_attr(test, assert_instr(vscalefsd))]
19760 pub unsafe fn _mm_maskz_scalef_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
19761 transmute(vscalefsd(
19762 a.as_f64x2(),
19763 b.as_f64x2(),
19764 _mm_setzero_pd().as_f64x2(),
19765 k,
19766 _MM_FROUND_CUR_DIRECTION,
19767 ))
19768 }
19769
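// Editorial usage sketch (not part of the upstream source): per Intel's
// operation description, scalef computes a * 2^floor(b) in lane 0; names below
// are hypothetical and an AVX-512F CPU is assumed at test time.
#[cfg(test)]
mod scalar_scalef_sketch {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "avx512f")]
    unsafe fn scale_lane0_by_power_of_two() {
        let a = _mm_set_ss(3.);
        let b = _mm_set_ss(2.);
        // Lane 0: 3 * 2^2 = 12.
        assert_eq!(_mm_cvtss_f32(_mm_scalef_ss(a, b)), 12.);
        // Zeromask form with mask bit 0 clear zeroes lane 0.
        assert_eq!(_mm_cvtss_f32(_mm_maskz_scalef_ss(0b0, a, b)), 0.);
    }
}
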
19770 /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
19771 ///
19772 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmadd_ss&expand=2582)
19773 #[inline]
19774 #[target_feature(enable = "avx512f")]
19775 #[cfg_attr(test, assert_instr(vfmadd213ss))]
19776 pub unsafe fn _mm_mask_fmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 {
19777 let mut fmadd: f32 = simd_extract(a, 0);
19778 if (k & 0b00000001) != 0 {
19779 let extractb: f32 = simd_extract(b, 0);
19780 let extractc: f32 = simd_extract(c, 0);
19781 fmadd = vfmadd132ss(fmadd, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
19782 }
19783 let r = simd_insert(a, 0, fmadd);
19784 transmute(r)
19785 }
19786
19787 /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
19788 ///
19789 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmadd_ss&expand=2584)
19790 #[inline]
19791 #[target_feature(enable = "avx512f")]
19792 #[cfg_attr(test, assert_instr(vfmadd213ss))]
19793 pub unsafe fn _mm_maskz_fmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 {
19794 let mut fmadd: f32 = 0.;
19795 if (k & 0b00000001) != 0 {
19796 let extracta: f32 = simd_extract(a, 0);
19797 let extractb: f32 = simd_extract(b, 0);
19798 let extractc: f32 = simd_extract(c, 0);
19799 fmadd = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
19800 }
19801 let r = simd_insert(a, 0, fmadd);
19802 transmute(r)
19803 }
19804
19805 /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.
19806 ///
19807 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmadd_ss&expand=2583)
19808 #[inline]
19809 #[target_feature(enable = "avx512f")]
19810 #[cfg_attr(test, assert_instr(vfmadd213ss))]
19811 pub unsafe fn _mm_mask3_fmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 {
19812 let mut fmadd: f32 = simd_extract(c, 0);
19813 if (k & 0b00000001) != 0 {
19814 let extracta: f32 = simd_extract(a, 0);
19815 let extractb: f32 = simd_extract(b, 0);
19816 fmadd = vfmadd132ss(extracta, extractb, fmadd, _MM_FROUND_CUR_DIRECTION);
19817 }
19818 let r = simd_insert(c, 0, fmadd);
19819 transmute(r)
19820 }
19821
19822 /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
19823 ///
19824 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmadd_sd&expand=2578)
19825 #[inline]
19826 #[target_feature(enable = "avx512f")]
19827 #[cfg_attr(test, assert_instr(vfmadd213sd))]
19828 pub unsafe fn _mm_mask_fmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d {
19829 let mut fmadd: f64 = simd_extract(a, 0);
19830 if (k & 0b00000001) != 0 {
19831 let extractb: f64 = simd_extract(b, 0);
19832 let extractc: f64 = simd_extract(c, 0);
19833 fmadd = vfmadd132sd(fmadd, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
19834 }
19835 let r = simd_insert(a, 0, fmadd);
19836 transmute(r)
19837 }
19838
19839 /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
19840 ///
19841 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmadd_sd&expand=2580)
19842 #[inline]
19843 #[target_feature(enable = "avx512f")]
19844 #[cfg_attr(test, assert_instr(vfmadd213sd))]
19845 pub unsafe fn _mm_maskz_fmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d {
19846 let mut fmadd: f64 = 0.;
19847 if (k & 0b00000001) != 0 {
19848 let extracta: f64 = simd_extract(a, 0);
19849 let extractb: f64 = simd_extract(b, 0);
19850 let extractc: f64 = simd_extract(c, 0);
19851 fmadd = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
19852 }
19853 let r = simd_insert(a, 0, fmadd);
19854 transmute(r)
19855 }
19856
19857 /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.
19858 ///
19859 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmadd_sd&expand=2579)
19860 #[inline]
19861 #[target_feature(enable = "avx512f")]
19862 #[cfg_attr(test, assert_instr(vfmadd213sd))]
19863 pub unsafe fn _mm_mask3_fmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d {
19864 let mut fmadd: f64 = simd_extract(c, 0);
19865 if (k & 0b00000001) != 0 {
19866 let extracta: f64 = simd_extract(a, 0);
19867 let extractb: f64 = simd_extract(b, 0);
19868 fmadd = vfmadd132sd(extracta, extractb, fmadd, _MM_FROUND_CUR_DIRECTION);
19869 }
19870 let r = simd_insert(c, 0, fmadd);
19871 transmute(r)
19872 }
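
// Illustrative sketch (not upstream code) contrasting the three masked fmadd_sd variants
// documented above; the helper name and values are hypothetical.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn _example_mask_fmadd_sd_sketch() {
    let (a, b, c) = (_mm_set_sd(2.0), _mm_set_sd(3.0), _mm_set_sd(4.0));
    // Mask bit 0 set: lower lane = 2.0 * 3.0 + 4.0 = 10.0 for every variant.
    assert_eq!(_mm_cvtsd_f64(_mm_mask_fmadd_sd(a, 1, b, c)), 10.0);
    // Mask bit 0 clear: `mask` keeps a, `maskz` zeroes, `mask3` keeps c.
    assert_eq!(_mm_cvtsd_f64(_mm_mask_fmadd_sd(a, 0, b, c)), 2.0);
    assert_eq!(_mm_cvtsd_f64(_mm_maskz_fmadd_sd(0, a, b, c)), 0.0);
    assert_eq!(_mm_cvtsd_f64(_mm_mask3_fmadd_sd(a, b, c, 0)), 4.0);
}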
19873
19874 /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
19875 ///
19876 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmsub_ss&expand=2668)
19877 #[inline]
19878 #[target_feature(enable = "avx512f")]
19879 #[cfg_attr(test, assert_instr(vfmsub213ss))]
19880 pub unsafe fn _mm_mask_fmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 {
19881 let mut fmsub: f32 = simd_extract(a, 0);
19882 if (k & 0b00000001) != 0 {
19883 let extractb: f32 = simd_extract(b, 0);
19884 let extractc: f32 = simd_extract(c, 0);
19885 let extractc = -extractc;
19886 fmsub = vfmadd132ss(fmsub, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
19887 }
19888 let r = simd_insert(a, 0, fmsub);
19889 transmute(r)
19890 }
19891
19892 /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
19893 ///
19894 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmsub_ss&expand=2670)
19895 #[inline]
19896 #[target_feature(enable = "avx512f")]
19897 #[cfg_attr(test, assert_instr(vfmsub213ss))]
19898 pub unsafe fn _mm_maskz_fmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 {
19899 let mut fmsub: f32 = 0.;
19900 if (k & 0b00000001) != 0 {
19901 let extracta: f32 = simd_extract(a, 0);
19902 let extractb: f32 = simd_extract(b, 0);
19903 let extractc: f32 = simd_extract(c, 0);
19904 let extractc = -extractc;
19905 fmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
19906 }
19907 let r = simd_insert(a, 0, fmsub);
19908 transmute(r)
19909 }
19910
19911 /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.
19912 ///
19913 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmsub_ss&expand=2669)
19914 #[inline]
19915 #[target_feature(enable = "avx512f")]
19916 #[cfg_attr(test, assert_instr(vfmsub213ss))]
19917 pub unsafe fn _mm_mask3_fmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 {
19918 let mut fmsub: f32 = simd_extract(c, 0);
19919 if (k & 0b00000001) != 0 {
19920 let extracta: f32 = simd_extract(a, 0);
19921 let extractb: f32 = simd_extract(b, 0);
19922 let extractc = -fmsub;
19923 fmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
19924 }
19925 let r = simd_insert(c, 0, fmsub);
19926 transmute(r)
19927 }
19928
19929 /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
19930 ///
19931 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmsub_sd&expand=2664)
19932 #[inline]
19933 #[target_feature(enable = "avx512f")]
19934 #[cfg_attr(test, assert_instr(vfmsub213sd))]
19935 pub unsafe fn _mm_mask_fmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d {
19936 let mut fmsub: f64 = simd_extract(a, 0);
19937 if (k & 0b00000001) != 0 {
19938 let extractb: f64 = simd_extract(b, 0);
19939 let extractc: f64 = simd_extract(c, 0);
19940 let extractc = -extractc;
19941 fmsub = vfmadd132sd(fmsub, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
19942 }
19943 let r = simd_insert(a, 0, fmsub);
19944 transmute(r)
19945 }
19946
19947 /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
19948 ///
19949 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmsub_sd&expand=2666)
19950 #[inline]
19951 #[target_feature(enable = "avx512f")]
19952 #[cfg_attr(test, assert_instr(vfmsub213sd))]
19953 pub unsafe fn _mm_maskz_fmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d {
19954 let mut fmsub: f64 = 0.;
19955 if (k & 0b00000001) != 0 {
19956 let extracta: f64 = simd_extract(a, 0);
19957 let extractb: f64 = simd_extract(b, 0);
19958 let extractc: f64 = simd_extract(c, 0);
19959 let extractc = -extractc;
19960 fmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
19961 }
19962 let r = simd_insert(a, 0, fmsub);
19963 transmute(r)
19964 }
19965
19966 /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.
19967 ///
19968 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmsub_sd&expand=2665)
19969 #[inline]
19970 #[target_feature(enable = "avx512f")]
19971 #[cfg_attr(test, assert_instr(vfmsub213sd))]
19972 pub unsafe fn _mm_mask3_fmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d {
19973 let mut fmsub: f64 = simd_extract(c, 0);
19974 if (k & 0b00000001) != 0 {
19975 let extracta: f64 = simd_extract(a, 0);
19976 let extractb: f64 = simd_extract(b, 0);
19977 let extractc = -fmsub;
19978 fmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
19979 }
19980 let r = simd_insert(c, 0, fmsub);
19981 transmute(r)
19982 }
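
// Illustrative sketch (not upstream code): as the bodies above show, fmsub is expressed as
// a fused multiply-add with a negated c, i.e. a*b - c == fma(a, b, -c). The helper name
// and values are hypothetical.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn _example_mask_fmsub_sd_sketch() {
    let (a, b, c) = (_mm_set_sd(2.0), _mm_set_sd(3.0), _mm_set_sd(4.0));
    // Mask bit 0 set: lower lane = 2.0 * 3.0 - 4.0 = 2.0
    assert_eq!(_mm_cvtsd_f64(_mm_mask_fmsub_sd(a, 1, b, c)), 2.0);
    // Mask bit 0 clear with the mask3 variant: lower lane is copied from c.
    assert_eq!(_mm_cvtsd_f64(_mm_mask3_fmsub_sd(a, b, c, 0)), 4.0);
}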
19983
19984 /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
19985 ///
19986 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmadd_ss&expand=2748)
19987 #[inline]
19988 #[target_feature(enable = "avx512f")]
19989 #[cfg_attr(test, assert_instr(vfnmadd213ss))]
19990 pub unsafe fn _mm_mask_fnmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 {
19991 let mut fnmadd: f32 = simd_extract(a, 0);
19992 if (k & 0b00000001) != 0 {
19993 let extracta = -fnmadd;
19994 let extractb: f32 = simd_extract(b, 0);
19995 let extractc: f32 = simd_extract(c, 0);
19996 fnmadd = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
19997 }
19998 let r = simd_insert(a, 0, fnmadd);
19999 transmute(r)
20000 }
20001
20002 /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
20003 ///
20004 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmadd_ss&expand=2750)
20005 #[inline]
20006 #[target_feature(enable = "avx512f")]
20007 #[cfg_attr(test, assert_instr(vfnmadd213ss))]
20008 pub unsafe fn _mm_maskz_fnmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 {
20009 let mut fnmadd: f32 = 0.;
20010 if (k & 0b00000001) != 0 {
20011 let extracta: f32 = simd_extract(a, 0);
20012 let extracta = -extracta;
20013 let extractb: f32 = simd_extract(b, 0);
20014 let extractc: f32 = simd_extract(c, 0);
20015 fnmadd = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
20016 }
20017 let r = simd_insert(a, 0, fnmadd);
20018 transmute(r)
20019 }
20020
20021 /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.
20022 ///
20023 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmadd_ss&expand=2749)
20024 #[inline]
20025 #[target_feature(enable = "avx512f")]
20026 #[cfg_attr(test, assert_instr(vfnmadd213ss))]
20027 pub unsafe fn _mm_mask3_fnmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 {
20028 let mut fnmadd: f32 = simd_extract(c, 0);
20029 if (k & 0b00000001) != 0 {
20030 let extracta: f32 = simd_extract(a, 0);
20031 let extracta = -extracta;
20032 let extractb: f32 = simd_extract(b, 0);
20033 fnmadd = vfmadd132ss(extracta, extractb, fnmadd, _MM_FROUND_CUR_DIRECTION);
20034 }
20035 let r = simd_insert(c, 0, fnmadd);
20036 transmute(r)
20037 }
20038
20039 /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
20040 ///
20041 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmadd_sd&expand=2744)
20042 #[inline]
20043 #[target_feature(enable = "avx512f")]
20044 #[cfg_attr(test, assert_instr(vfnmadd213sd))]
20045 pub unsafe fn _mm_mask_fnmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d {
20046 let mut fnmadd: f64 = simd_extract(a, 0);
20047 if (k & 0b00000001) != 0 {
20048 let extracta = -fnmadd;
20049 let extractb: f64 = simd_extract(b, 0);
20050 let extractc: f64 = simd_extract(c, 0);
20051 fnmadd = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
20052 }
20053 let r = simd_insert(a, 0, fnmadd);
20054 transmute(r)
20055 }
20056
20057 /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
20058 ///
20059 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmadd_sd&expand=2746)
20060 #[inline]
20061 #[target_feature(enable = "avx512f")]
20062 #[cfg_attr(test, assert_instr(vfnmadd213sd))]
20063 pub unsafe fn _mm_maskz_fnmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d {
20064 let mut fnmadd: f64 = 0.;
20065 if (k & 0b00000001) != 0 {
20066 let extracta: f64 = simd_extract(a, 0);
20067 let extracta = -extracta;
20068 let extractb: f64 = simd_extract(b, 0);
20069 let extractc: f64 = simd_extract(c, 0);
20070 fnmadd = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
20071 }
20072 let r = simd_insert(a, 0, fnmadd);
20073 transmute(r)
20074 }
20075
20076 /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.
20077 ///
20078 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmadd_sd&expand=2745)
20079 #[inline]
20080 #[target_feature(enable = "avx512f")]
20081 #[cfg_attr(test, assert_instr(vfnmadd213sd))]
20082 pub unsafe fn _mm_mask3_fnmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d {
20083 let mut fnmadd: f64 = simd_extract(c, 0);
20084 if (k & 0b00000001) != 0 {
20085 let extracta: f64 = simd_extract(a, 0);
20086 let extracta = -extracta;
20087 let extractb: f64 = simd_extract(b, 0);
20088 fnmadd = vfmadd132sd(extracta, extractb, fnmadd, _MM_FROUND_CUR_DIRECTION);
20089 }
20090 let r = simd_insert(c, 0, fnmadd);
20091 transmute(r)
20092 }
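
// Illustrative sketch (not upstream code): fnmadd negates the product before the add, so
// the lower lane becomes -(a*b) + c. The helper name and values are hypothetical.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn _example_mask_fnmadd_sd_sketch() {
    let (a, b, c) = (_mm_set_sd(2.0), _mm_set_sd(3.0), _mm_set_sd(4.0));
    // Mask bit 0 set: lower lane = -(2.0 * 3.0) + 4.0 = -2.0
    assert_eq!(_mm_cvtsd_f64(_mm_mask_fnmadd_sd(a, 1, b, c)), -2.0);
    // Zeromask with bit 0 clear: lower lane is zeroed.
    assert_eq!(_mm_cvtsd_f64(_mm_maskz_fnmadd_sd(0, a, b, c)), 0.0);
}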
20093
20094 /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
20095 ///
20096 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmsub_ss&expand=2796)
20097 #[inline]
20098 #[target_feature(enable = "avx512f")]
20099 #[cfg_attr(test, assert_instr(vfnmsub213ss))]
20100 pub unsafe fn _mm_mask_fnmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 {
20101 let mut fnmsub: f32 = simd_extract(a, 0);
20102 if (k & 0b00000001) != 0 {
20103 let extracta = -fnmsub;
20104 let extractb: f32 = simd_extract(b, 0);
20105 let extractc: f32 = simd_extract(c, 0);
20106 let extractc = -extractc;
20107 fnmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
20108 }
20109 let r = simd_insert(a, 0, fnmsub);
20110 transmute(r)
20111 }
20112
20113 /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
20114 ///
20115 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmsub_ss&expand=2798)
20116 #[inline]
20117 #[target_feature(enable = "avx512f")]
20118 #[cfg_attr(test, assert_instr(vfnmsub213ss))]
20119 pub unsafe fn _mm_maskz_fnmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 {
20120 let mut fnmsub: f32 = 0.;
20121 if (k & 0b00000001) != 0 {
20122 let extracta: f32 = simd_extract(a, 0);
20123 let extracta = -extracta;
20124 let extractb: f32 = simd_extract(b, 0);
20125 let extractc: f32 = simd_extract(c, 0);
20126 let extractc = -extractc;
20127 fnmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
20128 }
20129 let r = simd_insert(a, 0, fnmsub);
20130 transmute(r)
20131 }
20132
20133 /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.
20134 ///
20135 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmsub_ss&expand=2797)
20136 #[inline]
20137 #[target_feature(enable = "avx512f")]
20138 #[cfg_attr(test, assert_instr(vfnmsub213ss))]
20139 pub unsafe fn _mm_mask3_fnmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 {
20140 let mut fnmsub: f32 = simd_extract(c, 0);
20141 if (k & 0b00000001) != 0 {
20142 let extracta: f32 = simd_extract(a, 0);
20143 let extracta = -extracta;
20144 let extractb: f32 = simd_extract(b, 0);
20145 let extractc = -fnmsub;
20146 fnmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
20147 }
20148 let r = simd_insert(c, 0, fnmsub);
20149 transmute(r)
20150 }
20151
20152 /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
20153 ///
20154 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmsub_sd&expand=2792)
20155 #[inline]
20156 #[target_feature(enable = "avx512f")]
20157 #[cfg_attr(test, assert_instr(vfnmsub213sd))]
20158 pub unsafe fn _mm_mask_fnmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d {
20159 let mut fnmsub: f64 = simd_extract(a, 0);
20160 if (k & 0b00000001) != 0 {
20161 let extracta = -fnmsub;
20162 let extractb: f64 = simd_extract(b, 0);
20163 let extractc: f64 = simd_extract(c, 0);
20164 let extractc = -extractc;
20165 fnmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
20166 }
20167 let r = simd_insert(a, 0, fnmsub);
20168 transmute(r)
20169 }
20170
20171 /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
20172 ///
20173 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmsub_sd&expand=2794)
20174 #[inline]
20175 #[target_feature(enable = "avx512f")]
20176 #[cfg_attr(test, assert_instr(vfnmsub213sd))]
20177 pub unsafe fn _mm_maskz_fnmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d {
20178 let mut fnmsub: f64 = 0.;
20179 if (k & 0b00000001) != 0 {
20180 let extracta: f64 = simd_extract(a, 0);
20181 let extracta = -extracta;
20182 let extractb: f64 = simd_extract(b, 0);
20183 let extractc: f64 = simd_extract(c, 0);
20184 let extractc = -extractc;
20185 fnmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
20186 }
20187 let r = simd_insert(a, 0, fnmsub);
20188 transmute(r)
20189 }
20190
20191 /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.
20192 ///
20193 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmsub_sd&expand=2793)
20194 #[inline]
20195 #[target_feature(enable = "avx512f")]
20196 #[cfg_attr(test, assert_instr(vfnmsub213sd))]
20197 pub unsafe fn _mm_mask3_fnmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d {
20198 let mut fnmsub: f64 = simd_extract(c, 0);
20199 if (k & 0b00000001) != 0 {
20200 let extracta: f64 = simd_extract(a, 0);
20201 let extracta = -extracta;
20202 let extractb: f64 = simd_extract(b, 0);
20203 let extractc = -fnmsub;
20204 fnmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION);
20205 }
20206 let r = simd_insert(c, 0, fnmsub);
20207 transmute(r)
20208 }
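
// Illustrative sketch (not upstream code): fnmsub negates both the product and c, so the
// lower lane becomes -(a*b) - c. The helper name and values are hypothetical.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn _example_mask_fnmsub_sd_sketch() {
    let (a, b, c) = (_mm_set_sd(2.0), _mm_set_sd(3.0), _mm_set_sd(4.0));
    // Mask bit 0 set: lower lane = -(2.0 * 3.0) - 4.0 = -10.0
    assert_eq!(_mm_cvtsd_f64(_mm_mask_fnmsub_sd(a, 1, b, c)), -10.0);
    // Writemask with bit 0 clear: lower lane is copied from a.
    assert_eq!(_mm_cvtsd_f64(_mm_mask_fnmsub_sd(a, 0, b, c)), 2.0);
}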
20209
20210 /// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
20211 ///
20212 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
20213 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
20214 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
20215 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
20216 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
20217 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
20218 ///
20219 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_ss&expand=151)
20220 #[inline]
20221 #[target_feature(enable = "avx512f")]
20222 #[cfg_attr(test, assert_instr(vaddss, rounding = 8))]
20223 #[rustc_args_required_const(2)]
20224 pub unsafe fn _mm_add_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 {
20225 macro_rules! call {
20226 ($imm4:expr) => {
20227 vaddss(
20228 a.as_f32x4(),
20229 b.as_f32x4(),
20230 _mm_setzero_ps().as_f32x4(),
20231 0b1,
20232 $imm4,
20233 )
20234 };
20235 }
20236 transmute(constify_imm4_round!(rounding, call))
20237 }
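
// Illustrative sketch (not upstream code): the `rounding` argument must be a constant
// expression, typically one of the `_MM_FROUND_*` combinations listed above. For exactly
// representable inputs every mode yields the same sum. The helper name and values are
// hypothetical.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn _example_add_round_ss_sketch() {
    let a = _mm_set_ss(1.5);
    let b = _mm_set_ss(2.25);
    let r = _mm_add_round_ss(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    assert_eq!(_mm_cvtss_f32(r), 3.75);
    let r = _mm_add_round_ss(a, b, _MM_FROUND_CUR_DIRECTION);
    assert_eq!(_mm_cvtss_f32(r), 3.75);
}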
20238
20239 /// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
20240 ///
20241 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
20242 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
20243 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
20244 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
20245 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
20246 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
20247 ///
20248 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_round_ss&expand=152)
20249 #[inline]
20250 #[target_feature(enable = "avx512f")]
20251 #[cfg_attr(test, assert_instr(vaddss, rounding = 8))]
20252 #[rustc_args_required_const(4)]
20253 pub unsafe fn _mm_mask_add_round_ss(
20254 src: __m128,
20255 k: __mmask8,
20256 a: __m128,
20257 b: __m128,
20258 rounding: i32,
20259 ) -> __m128 {
20260 macro_rules! call {
20261 ($imm4:expr) => {
20262 vaddss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4)
20263 };
20264 }
20265 transmute(constify_imm4_round!(rounding, call))
20266 }
20267
20268 /// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
20269 ///
20270 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
20271 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
20272 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
20273 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
20274 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
20275 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
20276 ///
20277 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_ss&expand=153)
20278 #[inline]
20279 #[target_feature(enable = "avx512f")]
20280 #[cfg_attr(test, assert_instr(vaddss, rounding = 8))]
20281 #[rustc_args_required_const(3)]
20282 pub unsafe fn _mm_maskz_add_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 {
20283 macro_rules! call {
20284 ($imm4:expr) => {
20285 vaddss(
20286 a.as_f32x4(),
20287 b.as_f32x4(),
20288 _mm_setzero_ps().as_f32x4(),
20289 k,
20290 $imm4,
20291 )
20292 };
20293 }
20294 transmute(constify_imm4_round!(rounding, call))
20295 }
20296
20297 /// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\
20298 ///
20299 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
20300 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
20301 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
20302 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
20303 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
20304 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
20305 ///
20306 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_sd&expand=148)
20307 #[inline]
20308 #[target_feature(enable = "avx512f")]
20309 #[cfg_attr(test, assert_instr(vaddsd, rounding = 8))]
20310 #[rustc_args_required_const(2)]
20311 pub unsafe fn _mm_add_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d {
20312 macro_rules! call {
20313 ($imm4:expr) => {
20314 vaddsd(
20315 a.as_f64x2(),
20316 b.as_f64x2(),
20317 _mm_setzero_pd().as_f64x2(),
20318 0b1,
20319 $imm4,
20320 )
20321 };
20322 }
20323 transmute(constify_imm4_round!(rounding, call))
20324 }
20325
20326 /// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
20327 ///
20328 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
20329 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
20330 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
20331 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
20332 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
20333 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
20334 ///
20335 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_round_Sd&expand=149)
20336 #[inline]
20337 #[target_feature(enable = "avx512f")]
20338 #[cfg_attr(test, assert_instr(vaddsd, rounding = 8))]
20339 #[rustc_args_required_const(4)]
20340 pub unsafe fn _mm_mask_add_round_sd(
20341 src: __m128d,
20342 k: __mmask8,
20343 a: __m128d,
20344 b: __m128d,
20345 rounding: i32,
20346 ) -> __m128d {
20347 macro_rules! call {
20348 ($imm4:expr) => {
20349 vaddsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4)
20350 };
20351 }
20352 transmute(constify_imm4_round!(rounding, call))
20353 }
20354
20355 /// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
20356 ///
20357 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
20358 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
20359 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
20360 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
20361 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
20362 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
20363 ///
20364 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_sd&expand=150)
20365 #[inline]
20366 #[target_feature(enable = "avx512f")]
20367 #[cfg_attr(test, assert_instr(vaddsd, rounding = 8))]
20368 #[rustc_args_required_const(3)]
20369 pub unsafe fn _mm_maskz_add_round_sd(
20370 k: __mmask8,
20371 a: __m128d,
20372 b: __m128d,
20373 rounding: i32,
20374 ) -> __m128d {
20375 macro_rules! call {
20376 ($imm4:expr) => {
20377 vaddsd(
20378 a.as_f64x2(),
20379 b.as_f64x2(),
20380 _mm_setzero_pd().as_f64x2(),
20381 k,
20382 $imm4,
20383 )
20384 };
20385 }
20386 transmute(constify_imm4_round!(rounding, call))
20387 }
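
// Illustrative sketch (not upstream code): the masked rounding variants combine the
// constant rounding mode with the usual writemask/zeromask selection on lane 0. The
// helper name and values are hypothetical.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn _example_mask_add_round_sd_sketch() {
    let src = _mm_set_sd(9.0);
    let (a, b) = (_mm_set_sd(1.0), _mm_set_sd(2.0));
    // Mask bit 0 clear: the writemask variant keeps src, the zeromask variant yields 0.
    let r = _mm_mask_add_round_sd(src, 0, a, b, _MM_FROUND_CUR_DIRECTION);
    assert_eq!(_mm_cvtsd_f64(r), 9.0);
    let r = _mm_maskz_add_round_sd(0, a, b, _MM_FROUND_CUR_DIRECTION);
    assert_eq!(_mm_cvtsd_f64(r), 0.0);
    // Mask bit 0 set: lower lane = 1.0 + 2.0 = 3.0.
    let r = _mm_mask_add_round_sd(src, 1, a, b, _MM_FROUND_CUR_DIRECTION);
    assert_eq!(_mm_cvtsd_f64(r), 3.0);
}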
20388
20389 /// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
20390 ///
20391 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
20392 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
20393 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
20394 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
20395 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
20396 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
20397 ///
20398 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_round_ss&expand=5745)
20399 #[inline]
20400 #[target_feature(enable = "avx512f")]
20401 #[cfg_attr(test, assert_instr(vsubss, rounding = 8))]
20402 #[rustc_args_required_const(2)]
20403 pub unsafe fn _mm_sub_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 {
20404 macro_rules! call {
20405 ($imm4:expr) => {
20406 vsubss(
20407 a.as_f32x4(),
20408 b.as_f32x4(),
20409 _mm_setzero_ps().as_f32x4(),
20410 0b1,
20411 $imm4,
20412 )
20413 };
20414 }
20415 transmute(constify_imm4_round!(rounding, call))
20416 }
20417
20418 /// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
20419 ///
20420 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
20421 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
20422 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
20423 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
20424 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
20425 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
20426 ///
20427 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_round_ss&expand=5743)
20428 #[inline]
20429 #[target_feature(enable = "avx512f")]
20430 #[cfg_attr(test, assert_instr(vsubss, rounding = 8))]
20431 #[rustc_args_required_const(4)]
20432 pub unsafe fn _mm_mask_sub_round_ss(
20433 src: __m128,
20434 k: __mmask8,
20435 a: __m128,
20436 b: __m128,
20437 rounding: i32,
20438 ) -> __m128 {
20439 macro_rules! call {
20440 ($imm4:expr) => {
20441 vsubss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4)
20442 };
20443 }
20444 transmute(constify_imm4_round!(rounding, call))
20445 }
20446
20447 /// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
20448 ///
20449 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
20450 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
20451 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
20452 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
20453 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
20454 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
20455 ///
20456 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_round_ss&expand=5744)
20457 #[inline]
20458 #[target_feature(enable = "avx512f")]
20459 #[cfg_attr(test, assert_instr(vsubss, rounding = 8))]
20460 #[rustc_args_required_const(3)]
20461 pub unsafe fn _mm_maskz_sub_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 {
20462 macro_rules! call {
20463 ($imm4:expr) => {
20464 vsubss(
20465 a.as_f32x4(),
20466 b.as_f32x4(),
20467 _mm_setzero_ps().as_f32x4(),
20468 k,
20469 $imm4,
20470 )
20471 };
20472 }
20473 transmute(constify_imm4_round!(rounding, call))
20474 }
20475
20476 /// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\
20477 ///
20478 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
20479 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
20480 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
20481 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
20482 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
20483 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
20484 ///
20485 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_round_sd&expand=5742)
20486 #[inline]
20487 #[target_feature(enable = "avx512f")]
20488 #[cfg_attr(test, assert_instr(vsubsd, rounding = 8))]
20489 #[rustc_args_required_const(2)]
20490 pub unsafe fn _mm_sub_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d {
20491 macro_rules! call {
20492 ($imm4:expr) => {
20493 vsubsd(
20494 a.as_f64x2(),
20495 b.as_f64x2(),
20496 _mm_setzero_pd().as_f64x2(),
20497 0b1,
20498 $imm4,
20499 )
20500 };
20501 }
20502 transmute(constify_imm4_round!(rounding, call))
20503 }
20504
20505 /// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
20506 ///
20507 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
20508 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
20509 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
20510 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
20511 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
20512 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
20513 ///
20514 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_round_sd&expand=5740)
20515 #[inline]
20516 #[target_feature(enable = "avx512f")]
20517 #[cfg_attr(test, assert_instr(vsubsd, rounding = 8))]
20518 #[rustc_args_required_const(4)]
20519 pub unsafe fn _mm_mask_sub_round_sd(
20520 src: __m128d,
20521 k: __mmask8,
20522 a: __m128d,
20523 b: __m128d,
20524 rounding: i32,
20525 ) -> __m128d {
20526 macro_rules! call {
20527 ($imm4:expr) => {
20528 vsubsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4)
20529 };
20530 }
20531 transmute(constify_imm4_round!(rounding, call))
20532 }
20533
20534 /// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
20535 ///
20536 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
20537 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
20538 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
20539 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
20540 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
20541 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
20542 ///
20543 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_round_sd&expand=5741)
20544 #[inline]
20545 #[target_feature(enable = "avx512f")]
20546 #[cfg_attr(test, assert_instr(vsubsd, rounding = 8))]
20547 #[rustc_args_required_const(3)]
20548 pub unsafe fn _mm_maskz_sub_round_sd(
20549 k: __mmask8,
20550 a: __m128d,
20551 b: __m128d,
20552 rounding: i32,
20553 ) -> __m128d {
20554 macro_rules! call {
20555 ($imm4:expr) => {
20556 vsubsd(
20557 a.as_f64x2(),
20558 b.as_f64x2(),
20559 _mm_setzero_pd().as_f64x2(),
20560 k,
20561 $imm4,
20562 )
20563 };
20564 }
20565 transmute(constify_imm4_round!(rounding, call))
20566 }
20567
20568 /// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
20569 ///
20570 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
20571 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
20572 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
20573 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
20574 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
20575 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
20576 ///
20577 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_round_ss&expand=3946)
20578 #[inline]
20579 #[target_feature(enable = "avx512f")]
20580 #[cfg_attr(test, assert_instr(vmulss, rounding = 8))]
20581 #[rustc_args_required_const(2)]
20582 pub unsafe fn _mm_mul_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 {
20583 macro_rules! call {
20584 ($imm4:expr) => {
20585 vmulss(
20586 a.as_f32x4(),
20587 b.as_f32x4(),
20588 _mm_setzero_ps().as_f32x4(),
20589 0b1,
20590 $imm4,
20591 )
20592 };
20593 }
20594 transmute(constify_imm4_round!(rounding, call))
20595 }
20596
20597 /// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
20598 ///
20599 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
20600 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
20601 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
20602 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
20603 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
20604 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
20605 ///
20606 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_round_ss&expand=3944)
20607 #[inline]
20608 #[target_feature(enable = "avx512f")]
20609 #[cfg_attr(test, assert_instr(vmulss, rounding = 8))]
20610 #[rustc_args_required_const(4)]
20611 pub unsafe fn _mm_mask_mul_round_ss(
20612 src: __m128,
20613 k: __mmask8,
20614 a: __m128,
20615 b: __m128,
20616 rounding: i32,
20617 ) -> __m128 {
20618 macro_rules! call {
20619 ($imm4:expr) => {
20620 vmulss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4)
20621 };
20622 }
20623 transmute(constify_imm4_round!(rounding, call))
20624 }
20625
20626 /// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
20627 ///
20628 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
20629 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
20630 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
20631 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
20632 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
20633 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
20634 ///
20635 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_round_ss&expand=3945)
20636 #[inline]
20637 #[target_feature(enable = "avx512f")]
20638 #[cfg_attr(test, assert_instr(vmulss, rounding = 8))]
20639 #[rustc_args_required_const(3)]
20640 pub unsafe fn _mm_maskz_mul_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 {
20641 macro_rules! call {
20642 ($imm4:expr) => {
20643 vmulss(
20644 a.as_f32x4(),
20645 b.as_f32x4(),
20646 _mm_setzero_ps().as_f32x4(),
20647 k,
20648 $imm4,
20649 )
20650 };
20651 }
20652 transmute(constify_imm4_round!(rounding, call))
20653 }
20654
20655 /// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\
20656 ///
20657 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
20658 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
20659 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
20660 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
20661 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
20662 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
20663 ///
20664 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_round_sd&expand=3943)
20665 #[inline]
20666 #[target_feature(enable = "avx512f")]
20667 #[cfg_attr(test, assert_instr(vmulsd, rounding = 8))]
20668 #[rustc_args_required_const(2)]
20669 pub unsafe fn _mm_mul_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d {
20670 macro_rules! call {
20671 ($imm4:expr) => {
20672 vmulsd(
20673 a.as_f64x2(),
20674 b.as_f64x2(),
20675 _mm_setzero_pd().as_f64x2(),
20676 0b1,
20677 $imm4,
20678 )
20679 };
20680 }
20681 transmute(constify_imm4_round!(rounding, call))
20682 }
20683
20684 /// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
20685 ///
20686 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
20687 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
20688 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
20689 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
20690 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
20691 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
20692 ///
20693 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_round_sd&expand=3941)
20694 #[inline]
20695 #[target_feature(enable = "avx512f")]
20696 #[cfg_attr(test, assert_instr(vmulsd, rounding = 8))]
20697 #[rustc_args_required_const(4)]
20698 pub unsafe fn _mm_mask_mul_round_sd(
20699 src: __m128d,
20700 k: __mmask8,
20701 a: __m128d,
20702 b: __m128d,
20703 rounding: i32,
20704 ) -> __m128d {
20705 macro_rules! call {
20706 ($imm4:expr) => {
20707 vmulsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4)
20708 };
20709 }
20710 transmute(constify_imm4_round!(rounding, call))
20711 }
20712
20713 /// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
20714 ///
20715 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
20716 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
20717 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
20718 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
20719 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
20720 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
20721 ///
20722 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_round_sd&expand=3942)
20723 #[inline]
20724 #[target_feature(enable = "avx512f")]
20725 #[cfg_attr(test, assert_instr(vmulsd, rounding = 8))]
20726 #[rustc_args_required_const(3)]
20727 pub unsafe fn _mm_maskz_mul_round_sd(
20728 k: __mmask8,
20729 a: __m128d,
20730 b: __m128d,
20731 rounding: i32,
20732 ) -> __m128d {
20733 macro_rules! call {
20734 ($imm4:expr) => {
20735 vmulsd(
20736 a.as_f64x2(),
20737 b.as_f64x2(),
20738 _mm_setzero_pd().as_f64x2(),
20739 k,
20740 $imm4,
20741 )
20742 };
20743 }
20744 transmute(constify_imm4_round!(rounding, call))
20745 }
20746
20747 /// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
20748 ///
20749 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
20750 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
20751 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
20752 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
20753 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
20754 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
20755 ///
20756 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_div_round_ss&expand=2174)
20757 #[inline]
20758 #[target_feature(enable = "avx512f")]
20759 #[cfg_attr(test, assert_instr(vdivss, rounding = 8))]
20760 #[rustc_args_required_const(2)]
20761 pub unsafe fn _mm_div_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 {
20762 macro_rules! call {
20763 ($imm4:expr) => {
20764 vdivss(
20765 a.as_f32x4(),
20766 b.as_f32x4(),
20767 _mm_setzero_ps().as_f32x4(),
20768 0b1,
20769 $imm4,
20770 )
20771 };
20772 }
20773 transmute(constify_imm4_round!(rounding, call))
20774 }
20775
20776 /// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
20777 ///
20778 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
20779 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
20780 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
20781 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
20782 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
20783 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
20784 ///
20785 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_round_ss&expand=2175)
20786 #[inline]
20787 #[target_feature(enable = "avx512f")]
20788 #[cfg_attr(test, assert_instr(vdivss, rounding = 8))]
20789 #[rustc_args_required_const(4)]
20790 pub unsafe fn _mm_mask_div_round_ss(
20791 src: __m128,
20792 k: __mmask8,
20793 a: __m128,
20794 b: __m128,
20795 rounding: i32,
20796 ) -> __m128 {
20797 macro_rules! call {
20798 ($imm4:expr) => {
20799 vdivss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4)
20800 };
20801 }
20802 transmute(constify_imm4_round!(rounding, call))
20803 }
20804
20805 /// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
20806 ///
20807 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
20808 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
20809 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
20810 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
20811 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
20812 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
20813 ///
20814 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_round_ss&expand=2176)
20815 #[inline]
20816 #[target_feature(enable = "avx512f")]
20817 #[cfg_attr(test, assert_instr(vdivss, rounding = 8))]
20818 #[rustc_args_required_const(3)]
20819 pub unsafe fn _mm_maskz_div_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 {
20820 macro_rules! call {
20821 ($imm4:expr) => {
20822 vdivss(
20823 a.as_f32x4(),
20824 b.as_f32x4(),
20825 _mm_setzero_ps().as_f32x4(),
20826 k,
20827 $imm4,
20828 )
20829 };
20830 }
20831 transmute(constify_imm4_round!(rounding, call))
20832 }
20833
20834 /// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\
20835 ///
20836 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
20837 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
20838 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
20839 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
20840 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
20841 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
20842 ///
20843 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_div_round_sd&expand=2171)
20844 #[inline]
20845 #[target_feature(enable = "avx512f")]
20846 #[cfg_attr(test, assert_instr(vdivsd, rounding = 8))]
20847 #[rustc_args_required_const(2)]
20848 pub unsafe fn _mm_div_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d {
20849 macro_rules! call {
20850 ($imm4:expr) => {
20851 vdivsd(
20852 a.as_f64x2(),
20853 b.as_f64x2(),
20854 _mm_setzero_pd().as_f64x2(),
20855 0b1,
20856 $imm4,
20857 )
20858 };
20859 }
20860 transmute(constify_imm4_round!(rounding, call))
20861 }
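// Illustrative usage sketch (editorial addition, not part of the upstream source),
// assuming an AVX512F-capable CPU. With an exact quotient the four directed
// rounding modes all give the same value, so the rounding argument here only
// selects the mode and suppresses exceptions.
//
// #[target_feature(enable = "avx512f")]
// unsafe fn demo_div_round_sd() {
//     let a = _mm_set_pd(10.0, 9.0); // lane 0 = 9.0, lane 1 = 10.0
//     let b = _mm_set_pd(1.0, 4.0);  // lane 0 = 4.0
//     // lane 0 = 9.0 / 4.0 = 2.25, lane 1 copied from `a` (10.0)
//     let r = _mm_div_round_sd(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
//     assert_eq!(_mm_cvtsd_f64(r), 2.25);
// }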
20862
20863 /// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
20864 ///
20865 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
20866 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
20867 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
20868 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
20869 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
20870 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
20871 ///
20872 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_round_sd&expand=2172)
20873 #[inline]
20874 #[target_feature(enable = "avx512f")]
20875 #[cfg_attr(test, assert_instr(vdivsd, rounding = 8))]
20876 #[rustc_args_required_const(4)]
20877 pub unsafe fn _mm_mask_div_round_sd(
20878 src: __m128d,
20879 k: __mmask8,
20880 a: __m128d,
20881 b: __m128d,
20882 rounding: i32,
20883 ) -> __m128d {
20884 macro_rules! call {
20885 ($imm4:expr) => {
20886 vdivsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4)
20887 };
20888 }
20889 transmute(constify_imm4_round!(rounding, call))
20890 }
20891
20892 /// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
20893 ///
20894 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
20895 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
20896 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
20897 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
20898 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
20899 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
20900 ///
20901 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_round_sd&expand=2173)
20902 #[inline]
20903 #[target_feature(enable = "avx512f")]
20904 #[cfg_attr(test, assert_instr(vdivsd, rounding = 8))]
20905 #[rustc_args_required_const(3)]
20906 pub unsafe fn _mm_maskz_div_round_sd(
20907 k: __mmask8,
20908 a: __m128d,
20909 b: __m128d,
20910 rounding: i32,
20911 ) -> __m128d {
20912 macro_rules! call {
20913 ($imm4:expr) => {
20914 vdivsd(
20915 a.as_f64x2(),
20916 b.as_f64x2(),
20917 _mm_setzero_pd().as_f64x2(),
20918 k,
20919 $imm4,
20920 )
20921 };
20922 }
20923 transmute(constify_imm4_round!(rounding, call))
20924 }
20925
20926 /// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
20927 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
20928 ///
20929 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_max_round_ss&expand=3668)
20930 #[inline]
20931 #[target_feature(enable = "avx512f")]
20932 #[cfg_attr(test, assert_instr(vmaxss, sae = 8))]
20933 #[rustc_args_required_const(2)]
20934 pub unsafe fn _mm_max_round_ss(a: __m128, b: __m128, sae: i32) -> __m128 {
20935 macro_rules! call {
20936 ($imm4:expr) => {
20937 vmaxss(
20938 a.as_f32x4(),
20939 b.as_f32x4(),
20940 _mm_setzero_ps().as_f32x4(),
20941 0b1,
20942 $imm4,
20943 )
20944 };
20945 }
20946 transmute(constify_imm4_sae!(sae, call))
20947 }
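// Illustrative usage sketch (editorial addition, not part of the upstream source),
// assuming an AVX512F-capable CPU. For max/min the last argument is `sae`, not a
// rounding mode: _MM_FROUND_CUR_DIRECTION keeps normal exception reporting and
// _MM_FROUND_NO_EXC suppresses it; the numeric result is the same either way.
//
// #[target_feature(enable = "avx512f")]
// unsafe fn demo_max_round_ss() {
//     let a = _mm_set_ps(4.0, 3.0, 2.0, 1.0); // lane 0 = 1.0
//     let b = _mm_set_ps(0.0, 0.0, 0.0, 7.0); // lane 0 = 7.0
//     // lane 0 = max(1.0, 7.0) = 7.0, lanes 1..3 copied from `a`
//     let r = _mm_max_round_ss(a, b, _MM_FROUND_CUR_DIRECTION);
//     assert_eq!(_mm_cvtss_f32(r), 7.0);
// }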
20948
20949 /// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
20950 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
20951 ///
20952 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_round_ss&expand=3672)
20953 #[inline]
20954 #[target_feature(enable = "avx512f")]
20955 #[cfg_attr(test, assert_instr(vmaxss, sae = 8))]
20956 #[rustc_args_required_const(4)]
20957 pub unsafe fn _mm_mask_max_round_ss(
20958 src: __m128,
20959 k: __mmask8,
20960 a: __m128,
20961 b: __m128,
20962 sae: i32,
20963 ) -> __m128 {
20964 macro_rules! call {
20965 ($imm4:expr) => {
20966 vmaxss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4)
20967 };
20968 }
20969 transmute(constify_imm4_sae!(sae, call))
20970 }
20971
20972 /// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
20973 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
20974 ///
20975 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_round_ss&expand=3667)
20976 #[inline]
20977 #[target_feature(enable = "avx512f")]
20978 #[cfg_attr(test, assert_instr(vmaxss, sae = 8))]
20979 #[rustc_args_required_const(3)]
20980 pub unsafe fn _mm_maskz_max_round_ss(k: __mmask8, a: __m128, b: __m128, sae: i32) -> __m128 {
20981 macro_rules! call {
20982 ($imm4:expr) => {
20983 vmaxss(
20984 a.as_f32x4(),
20985 b.as_f32x4(),
20986 _mm_setzero_ps().as_f32x4(),
20987 k,
20988 $imm4,
20989 )
20990 };
20991 }
20992 transmute(constify_imm4_sae!(sae, call))
20993 }
20994
20995 /// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper element from a to the upper element of dst.\
20996 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
20997 ///
20998 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_max_round_sd&expand=3665)
20999 #[inline]
21000 #[target_feature(enable = "avx512f")]
21001 #[cfg_attr(test, assert_instr(vmaxsd, sae = 8))]
21002 #[rustc_args_required_const(2)]
21003 pub unsafe fn _mm_max_round_sd(a: __m128d, b: __m128d, sae: i32) -> __m128d {
21004 macro_rules! call {
21005 ($imm4:expr) => {
21006 vmaxsd(
21007 a.as_f64x2(),
21008 b.as_f64x2(),
21009 _mm_setzero_pd().as_f64x2(),
21010 0b1,
21011 $imm4,
21012 )
21013 };
21014 }
21015 transmute(constify_imm4_sae!(sae, call))
21016 }
21017
21018 /// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
21019 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
21020 ///
21021 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_round_sd&expand=3663)
21022 #[inline]
21023 #[target_feature(enable = "avx512f")]
21024 #[cfg_attr(test, assert_instr(vmaxsd, sae = 8))]
21025 #[rustc_args_required_const(4)]
21026 pub unsafe fn _mm_mask_max_round_sd(
21027 src: __m128d,
21028 k: __mmask8,
21029 a: __m128d,
21030 b: __m128d,
21031 sae: i32,
21032 ) -> __m128d {
21033 macro_rules! call {
21034 ($imm4:expr) => {
21035 vmaxsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4)
21036 };
21037 }
21038 transmute(constify_imm4_sae!(sae, call))
21039 }
21040
21041 /// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
21042 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
21043 ///
21044 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_round_sd&expand=3670)
21045 #[inline]
21046 #[target_feature(enable = "avx512f")]
21047 #[cfg_attr(test, assert_instr(vmaxsd, sae = 8))]
21048 #[rustc_args_required_const(3)]
21049 pub unsafe fn _mm_maskz_max_round_sd(k: __mmask8, a: __m128d, b: __m128d, sae: i32) -> __m128d {
21050 macro_rules! call {
21051 ($imm4:expr) => {
21052 vmaxsd(
21053 a.as_f64x2(),
21054 b.as_f64x2(),
21055 _mm_setzero_pd().as_f64x2(),
21056 k,
21057 $imm4,
21058 )
21059 };
21060 }
21061 transmute(constify_imm4_sae!(sae, call))
21062 }
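// Illustrative usage sketch (editorial addition, not part of the upstream source),
// assuming an AVX512F-capable CPU. Demonstrates the zeromask: when bit 0 of `k`
// is clear the lower lane becomes 0.0 instead of max(a0, b0).
//
// #[target_feature(enable = "avx512f")]
// unsafe fn demo_maskz_max_round_sd() {
//     let a = _mm_set_pd(5.0, -1.0); // lane 0 = -1.0
//     let b = _mm_set_pd(0.0, 3.0);  // lane 0 = 3.0
//     let r = _mm_maskz_max_round_sd(0b1, a, b, _MM_FROUND_CUR_DIRECTION);
//     assert_eq!(_mm_cvtsd_f64(r), 3.0); // max(-1.0, 3.0)
//     let r = _mm_maskz_max_round_sd(0b0, a, b, _MM_FROUND_CUR_DIRECTION);
//     assert_eq!(_mm_cvtsd_f64(r), 0.0); // zeroed out
// }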
21063
21064 /// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
21065 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
21066 ///
21067 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_min_round_ss&expand=3782)
21068 #[inline]
21069 #[target_feature(enable = "avx512f")]
21070 #[cfg_attr(test, assert_instr(vminss, sae = 8))]
21071 #[rustc_args_required_const(2)]
21072 pub unsafe fn _mm_min_round_ss(a: __m128, b: __m128, sae: i32) -> __m128 {
21073 macro_rules! call {
21074 ($imm4:expr) => {
21075 vminss(
21076 a.as_f32x4(),
21077 b.as_f32x4(),
21078 _mm_setzero_ps().as_f32x4(),
21079 0b1,
21080 $imm4,
21081 )
21082 };
21083 }
21084 transmute(constify_imm4_sae!(sae, call))
21085 }
21086
21087 /// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
21088 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
21089 ///
21090 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_round_ss&expand=3780)
21091 #[inline]
21092 #[target_feature(enable = "avx512f")]
21093 #[cfg_attr(test, assert_instr(vminss, sae = 8))]
21094 #[rustc_args_required_const(4)]
21095 pub unsafe fn _mm_mask_min_round_ss(
21096 src: __m128,
21097 k: __mmask8,
21098 a: __m128,
21099 b: __m128,
21100 sae: i32,
21101 ) -> __m128 {
21102 macro_rules! call {
21103 ($imm4:expr) => {
21104 vminss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4)
21105 };
21106 }
21107 transmute(constify_imm4_sae!(sae, call))
21108 }
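// Illustrative usage sketch (editorial addition, not part of the upstream source),
// assuming an AVX512F-capable CPU. Demonstrates the writemask: with bit 0 clear
// the lower lane is taken from `src` rather than from min(a0, b0).
//
// #[target_feature(enable = "avx512f")]
// unsafe fn demo_mask_min_round_ss() {
//     let src = _mm_set_ps(0.0, 0.0, 0.0, 42.0); // lane 0 fallback = 42.0
//     let a = _mm_set_ps(9.0, 8.0, 7.0, 2.0);    // lane 0 = 2.0
//     let b = _mm_set_ps(0.0, 0.0, 0.0, 5.0);    // lane 0 = 5.0
//     let r = _mm_mask_min_round_ss(src, 0b1, a, b, _MM_FROUND_CUR_DIRECTION);
//     assert_eq!(_mm_cvtss_f32(r), 2.0); // min(2.0, 5.0)
//     let r = _mm_mask_min_round_ss(src, 0b0, a, b, _MM_FROUND_CUR_DIRECTION);
//     assert_eq!(_mm_cvtss_f32(r), 42.0); // copied from src
// }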
21109
21110 /// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
21111 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
21112 ///
21113 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_round_ss&expand=3781)
21114 #[inline]
21115 #[target_feature(enable = "avx512f")]
21116 #[cfg_attr(test, assert_instr(vminss, sae = 8))]
21117 #[rustc_args_required_const(3)]
21118 pub unsafe fn _mm_maskz_min_round_ss(k: __mmask8, a: __m128, b: __m128, sae: i32) -> __m128 {
21119 macro_rules! call {
21120 ($imm4:expr) => {
21121 vminss(
21122 a.as_f32x4(),
21123 b.as_f32x4(),
21124 _mm_setzero_ps().as_f32x4(),
21125 k,
21126 $imm4,
21127 )
21128 };
21129 }
21130 transmute(constify_imm4_sae!(sae, call))
21131 }
21132
21133 /// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst, and copy the upper element from a to the upper element of dst.\
21134 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
21135 ///
21136 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_min_round_sd&expand=3779)
21137 #[inline]
21138 #[target_feature(enable = "avx512f")]
21139 #[cfg_attr(test, assert_instr(vminsd, sae = 8))]
21140 #[rustc_args_required_const(2)]
21141 pub unsafe fn _mm_min_round_sd(a: __m128d, b: __m128d, sae: i32) -> __m128d {
21142 macro_rules! call {
21143 ($imm4:expr) => {
21144 vminsd(
21145 a.as_f64x2(),
21146 b.as_f64x2(),
21147 _mm_setzero_pd().as_f64x2(),
21148 0b1,
21149 $imm4,
21150 )
21151 };
21152 }
21153 transmute(constify_imm4_sae!(sae, call))
21154 }
21155
21156 /// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
21157 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
21158 ///
21159 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_round_sd&expand=3777)
21160 #[inline]
21161 #[target_feature(enable = "avx512f")]
21162 #[cfg_attr(test, assert_instr(vminsd, sae = 8))]
21163 #[rustc_args_required_const(4)]
21164 pub unsafe fn _mm_mask_min_round_sd(
21165 src: __m128d,
21166 k: __mmask8,
21167 a: __m128d,
21168 b: __m128d,
21169 sae: i32,
21170 ) -> __m128d {
21171 macro_rules! call {
21172 ($imm4:expr) => {
21173 vminsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4)
21174 };
21175 }
21176 transmute(constify_imm4_sae!(sae, call))
21177 }
21178
21179 /// Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
21180 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
21181 ///
21182 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_round_sd&expand=3778)
21183 #[inline]
21184 #[target_feature(enable = "avx512f")]
21185 #[cfg_attr(test, assert_instr(vminsd, sae = 8))]
21186 #[rustc_args_required_const(3)]
21187 pub unsafe fn _mm_maskz_min_round_sd(k: __mmask8, a: __m128d, b: __m128d, sae: i32) -> __m128d {
21188 macro_rules! call {
21189 ($imm4:expr) => {
21190 vminsd(
21191 a.as_f64x2(),
21192 b.as_f64x2(),
21193 _mm_setzero_pd().as_f64x2(),
21194 k,
21195 $imm4,
21196 )
21197 };
21198 }
21199 transmute(constify_imm4_sae!(sae, call))
21200 }
21201
21202 /// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
21203 ///
21204 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
21205 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
21206 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
21207 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
21208 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
21209 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
21210 ///
21211 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sqrt_round_ss&expand=5383)
21212 #[inline]
21213 #[target_feature(enable = "avx512f")]
21214 #[cfg_attr(test, assert_instr(vsqrtss, rounding = 8))]
21215 #[rustc_args_required_const(2)]
21216 pub unsafe fn _mm_sqrt_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 {
21217 macro_rules! call {
21218 ($imm4:expr) => {
21219 vsqrtss(
21220 a.as_f32x4(),
21221 b.as_f32x4(),
21222 _mm_setzero_ps().as_f32x4(),
21223 0b1,
21224 $imm4,
21225 )
21226 };
21227 }
21228 transmute(constify_imm4_round!(rounding, call))
21229 }
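// Illustrative usage sketch (editorial addition, not part of the upstream source),
// assuming an AVX512F-capable CPU. Note that the square root is taken from the
// lower element of `b`, while the upper three lanes are copied from `a`.
//
// #[target_feature(enable = "avx512f")]
// unsafe fn demo_sqrt_round_ss() {
//     let a = _mm_set_ps(3.0, 2.0, 1.0, 100.0); // lane 0 of `a` is not used
//     let b = _mm_set_ps(0.0, 0.0, 0.0, 9.0);   // lane 0 = 9.0
//     let r = _mm_sqrt_round_ss(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
//     assert_eq!(_mm_cvtss_f32(r), 3.0); // sqrt(9.0)
// }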
21230
21231 /// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
21232 ///
21233 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
21234 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
21235 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
21236 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
21237 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
21238 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
21239 ///
21240 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_round_ss&expand=5381)
21241 #[inline]
21242 #[target_feature(enable = "avx512f")]
21243 #[cfg_attr(test, assert_instr(vsqrtss, rounding = 8))]
21244 #[rustc_args_required_const(4)]
21245 pub unsafe fn _mm_mask_sqrt_round_ss(
21246 src: __m128,
21247 k: __mmask8,
21248 a: __m128,
21249 b: __m128,
21250 rounding: i32,
21251 ) -> __m128 {
21252 macro_rules! call {
21253 ($imm4:expr) => {
21254 vsqrtss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4)
21255 };
21256 }
21257 transmute(constify_imm4_round!(rounding, call))
21258 }
21259
21260 /// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
21261 ///
21262 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
21263 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
21264 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
21265 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
21266 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
21267 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
21268 ///
21269 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_round_ss&expand=5382)
21270 #[inline]
21271 #[target_feature(enable = "avx512f")]
21272 #[cfg_attr(test, assert_instr(vsqrtss, rounding = 8))]
21273 #[rustc_args_required_const(3)]
21274 pub unsafe fn _mm_maskz_sqrt_round_ss(k: __mmask8, a: __m128, b: __m128, rounding: i32) -> __m128 {
21275 macro_rules! call {
21276 ($imm4:expr) => {
21277 vsqrtss(
21278 a.as_f32x4(),
21279 b.as_f32x4(),
21280 _mm_setzero_ps().as_f32x4(),
21281 k,
21282 $imm4,
21283 )
21284 };
21285 }
21286 transmute(constify_imm4_round!(rounding, call))
21287 }
21288
21289 /// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\
21290 ///
21291 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
21292 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
21293 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
21294 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
21295 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
21296 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
21297 ///
21298 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sqrt_round_sd&expand=5380)
21299 #[inline]
21300 #[target_feature(enable = "avx512f")]
21301 #[cfg_attr(test, assert_instr(vsqrtsd, rounding = 8))]
21302 #[rustc_args_required_const(2)]
21303 pub unsafe fn _mm_sqrt_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d {
21304 macro_rules! call {
21305 ($imm4:expr) => {
21306 vsqrtsd(
21307 a.as_f64x2(),
21308 b.as_f64x2(),
21309 _mm_setzero_pd().as_f64x2(),
21310 0b1,
21311 $imm4,
21312 )
21313 };
21314 }
21315 transmute(constify_imm4_round!(rounding, call))
21316 }
21317
21318 /// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
21319 ///
21320 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
21321 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
21322 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
21323 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
21324 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
21325 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
21326 ///
21327 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_round_sd&expand=5378)
21328 #[inline]
21329 #[target_feature(enable = "avx512f")]
21330 #[cfg_attr(test, assert_instr(vsqrtsd, rounding = 8))]
21331 #[rustc_args_required_const(4)]
21332 pub unsafe fn _mm_mask_sqrt_round_sd(
21333 src: __m128d,
21334 k: __mmask8,
21335 a: __m128d,
21336 b: __m128d,
21337 rounding: i32,
21338 ) -> __m128d {
21339 macro_rules! call {
21340 ($imm4:expr) => {
21341 vsqrtsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4)
21342 };
21343 }
21344 transmute(constify_imm4_round!(rounding, call))
21345 }
21346
21347 /// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
21348 ///
21349 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
21350 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
21351 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
21352 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
21353 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
21354 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
21355 ///
21356 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_round_sd&expand=5379)
21357 #[inline]
21358 #[target_feature(enable = "avx512f")]
21359 #[cfg_attr(test, assert_instr(vsqrtsd, rounding = 8))]
21360 #[rustc_args_required_const(3)]
21361 pub unsafe fn _mm_maskz_sqrt_round_sd(
21362 k: __mmask8,
21363 a: __m128d,
21364 b: __m128d,
21365 rounding: i32,
21366 ) -> __m128d {
21367 macro_rules! call {
21368 ($imm4:expr) => {
21369 vsqrtsd(
21370 a.as_f64x2(),
21371 b.as_f64x2(),
21372 _mm_setzero_pd().as_f64x2(),
21373 k,
21374 $imm4,
21375 )
21376 };
21377 }
21378 transmute(constify_imm4_round!(rounding, call))
21379 }
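// Illustrative usage sketch (editorial addition, not part of the upstream source),
// assuming an AVX512F-capable CPU. Same zeromask pattern as the other scalar
// operations: a clear bit 0 zeroes the lower lane instead of writing sqrt(b0).
//
// #[target_feature(enable = "avx512f")]
// unsafe fn demo_maskz_sqrt_round_sd() {
//     let a = _mm_set_pd(7.0, 0.0);
//     let b = _mm_set_pd(0.0, 16.0); // lane 0 = 16.0
//     let r = _mm_maskz_sqrt_round_sd(0b1, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
//     assert_eq!(_mm_cvtsd_f64(r), 4.0); // sqrt(16.0)
//     let r = _mm_maskz_sqrt_round_sd(0b0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
//     assert_eq!(_mm_cvtsd_f64(r), 0.0);
// }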
21380
21381 /// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\
21382 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
21383 ///
21384 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getexp_round_ss&expand=2856)
21385 #[inline]
21386 #[target_feature(enable = "avx512f")]
21387 #[cfg_attr(test, assert_instr(vgetexpss, sae = 8))]
21388 #[rustc_args_required_const(2)]
21389 pub unsafe fn _mm_getexp_round_ss(a: __m128, b: __m128, sae: i32) -> __m128 {
21390 macro_rules! call {
21391 ($imm4:expr) => {
21392 vgetexpss(
21393 a.as_f32x4(),
21394 b.as_f32x4(),
21395 _mm_setzero_ps().as_f32x4(),
21396 0b1,
21397 $imm4,
21398 )
21399 };
21400 }
21401 let r = constify_imm4_sae!(sae, call);
21402 transmute(r)
21403 }
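// Illustrative usage sketch (editorial addition, not part of the upstream source),
// assuming an AVX512F-capable CPU. getexp returns floor(log2(|b0|)) as a float,
// so 8.0 (= 2^3) yields 3.0 and 0.5 (= 2^-1) would yield -1.0.
//
// #[target_feature(enable = "avx512f")]
// unsafe fn demo_getexp_round_ss() {
//     let a = _mm_set_ps(0.0, 0.0, 0.0, 0.0);
//     let b = _mm_set_ps(0.0, 0.0, 0.0, 8.0); // lane 0 = 8.0
//     let r = _mm_getexp_round_ss(a, b, _MM_FROUND_CUR_DIRECTION);
//     assert_eq!(_mm_cvtss_f32(r), 3.0); // floor(log2(8.0))
// }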
21404
21405 /// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\
21406 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
21407 ///
21408 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getexp_round_ss&expand=2857)
21409 #[inline]
21410 #[target_feature(enable = "avx512f")]
21411 #[cfg_attr(test, assert_instr(vgetexpss, sae = 8))]
21412 #[rustc_args_required_const(4)]
21413 pub unsafe fn _mm_mask_getexp_round_ss(
21414 src: __m128,
21415 k: __mmask8,
21416 a: __m128,
21417 b: __m128,
21418 sae: i32,
21419 ) -> __m128 {
21420 macro_rules! call {
21421 ($imm4:expr) => {
21422 vgetexpss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4)
21423 };
21424 }
21425 let r = constify_imm4_sae!(sae, call);
21426 transmute(r)
21427 }
21428
21429 /// Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\
21430 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
21431 ///
21432 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getexp_round_ss&expand=2858)
21433 #[inline]
21434 #[target_feature(enable = "avx512f")]
21435 #[cfg_attr(test, assert_instr(vgetexpss, sae = 8))]
21436 #[rustc_args_required_const(3)]
21437 pub unsafe fn _mm_maskz_getexp_round_ss(k: __mmask8, a: __m128, b: __m128, sae: i32) -> __m128 {
21438 macro_rules! call {
21439 ($imm4:expr) => {
21440 vgetexpss(
21441 a.as_f32x4(),
21442 b.as_f32x4(),
21443 _mm_setzero_ps().as_f32x4(),
21444 k,
21445 $imm4,
21446 )
21447 };
21448 }
21449 let r = constify_imm4_sae!(sae, call);
21450 transmute(r)
21451 }
21452
21453 /// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\
21454 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
21455 ///
21456 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getexp_round_sd&expand=2853)
21457 #[inline]
21458 #[target_feature(enable = "avx512f")]
21459 #[cfg_attr(test, assert_instr(vgetexpsd, sae = 8))]
21460 #[rustc_args_required_const(2)]
21461 pub unsafe fn _mm_getexp_round_sd(a: __m128d, b: __m128d, sae: i32) -> __m128d {
21462 macro_rules! call {
21463 ($imm4:expr) => {
21464 vgetexpsd(
21465 a.as_f64x2(),
21466 b.as_f64x2(),
21467 _mm_setzero_pd().as_f64x2(),
21468 0b1,
21469 $imm4,
21470 )
21471 };
21472 }
21473 let r = constify_imm4_sae!(sae, call);
21474 transmute(r)
21475 }
21476
21477 /// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\
21478 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
21479 ///
21480 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getexp_round_sd&expand=2854)
21481 #[inline]
21482 #[target_feature(enable = "avx512f")]
21483 #[cfg_attr(test, assert_instr(vgetexpsd, sae = 8))]
21484 #[rustc_args_required_const(4)]
21485 pub unsafe fn _mm_mask_getexp_round_sd(
21486 src: __m128d,
21487 k: __mmask8,
21488 a: __m128d,
21489 b: __m128d,
21490 sae: i32,
21491 ) -> __m128d {
21492 macro_rules! call {
21493 ($imm4:expr) => {
21494 vgetexpsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4)
21495 };
21496 }
21497 let r = constify_imm4_sae!(sae, call);
21498 transmute(r)
21499 }
21500
21501 /// Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.\
21502 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
21503 ///
21504 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getexp_round_sd&expand=2855)
21505 #[inline]
21506 #[target_feature(enable = "avx512f")]
21507 #[cfg_attr(test, assert_instr(vgetexpsd, sae = 8))]
21508 #[rustc_args_required_const(3)]
21509 pub unsafe fn _mm_maskz_getexp_round_sd(k: __mmask8, a: __m128d, b: __m128d, sae: i32) -> __m128d {
21510 macro_rules! call {
21511 ($imm4:expr) => {
21512 vgetexpsd(
21513 a.as_f64x2(),
21514 b.as_f64x2(),
21515 _mm_setzero_pd().as_f64x2(),
21516 k,
21517 $imm4,
21518 )
21519 };
21520 }
21521 let r = constify_imm4_sae!(sae, call);
21522 transmute(r)
21523 }
21524
21525 /// Normalize the mantissa of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
21526 /// The mantissa is normalized to the interval specified by interv, which can take the following values:\
21527 /// _MM_MANT_NORM_1_2 // interval [1, 2)\
21528 /// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
21529 /// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
21530 /// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
21531 /// The sign is determined by sc which can take the following values:\
21532 /// _MM_MANT_SIGN_src // sign = sign(src)\
21533 /// _MM_MANT_SIGN_zero // sign = 0\
21534 /// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\
21535 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
21536 ///
21537 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getmant_round_ss&expand=2892)
21538 #[inline]
21539 #[target_feature(enable = "avx512f")]
21540 #[cfg_attr(test, assert_instr(vgetmantss, norm = 0, sign = 0, sae = 4))]
21541 #[rustc_args_required_const(2, 3, 4)]
21542 pub unsafe fn _mm_getmant_round_ss(
21543 a: __m128,
21544 b: __m128,
21545 norm: _MM_MANTISSA_NORM_ENUM,
21546 sign: _MM_MANTISSA_SIGN_ENUM,
21547 sae: i32,
21548 ) -> __m128 {
21549 macro_rules! call {
21550 ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => {
21551 vgetmantss(
21552 a.as_f32x4(),
21553 b.as_f32x4(),
21554 $imm2 << 2 | $imm4_1,
21555 _mm_setzero_ps().as_f32x4(),
21556 0b1,
21557 $imm4_2,
21558 )
21559 };
21560 }
21561 let r = constify_imm4_mantissas_sae!(norm, sign, sae, call);
21562 transmute(r)
21563 }
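// Illustrative usage sketch (editorial addition, not part of the upstream source),
// assuming an AVX512F-capable CPU and using the `_MM_MANT_*` constants listed in
// the doc comment above. With the [1, 2) interval, 12.0 = 1.5 * 2^3 normalizes
// to 1.5.
//
// #[target_feature(enable = "avx512f")]
// unsafe fn demo_getmant_round_ss() {
//     let a = _mm_set_ps(0.0, 0.0, 0.0, 0.0);
//     let b = _mm_set_ps(0.0, 0.0, 0.0, 12.0); // lane 0 = 12.0
//     let r = _mm_getmant_round_ss(
//         a,
//         b,
//         _MM_MANT_NORM_1_2,
//         _MM_MANT_SIGN_src,
//         _MM_FROUND_CUR_DIRECTION,
//     );
//     assert_eq!(_mm_cvtss_f32(r), 1.5); // mantissa of 12.0 normalized to [1, 2)
// }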
21564
21565 /// Normalize the mantissa of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
21566 /// The mantissa is normalized to the interval specified by interv, which can take the following values:\
21567 /// _MM_MANT_NORM_1_2 // interval [1, 2)\
21568 /// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
21569 /// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
21570 /// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
21571 /// The sign is determined by sc which can take the following values:\
21572 /// _MM_MANT_SIGN_src // sign = sign(src)\
21573 /// _MM_MANT_SIGN_zero // sign = 0\
21574 /// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\
21575 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
21576 ///
21577 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getmant_round_ss&expand=2893)
21578 #[inline]
21579 #[target_feature(enable = "avx512f")]
21580 #[cfg_attr(test, assert_instr(vgetmantss, norm = 0, sign = 0, sae = 4))]
21581 #[rustc_args_required_const(4, 5, 6)]
21582 pub unsafe fn _mm_mask_getmant_round_ss(
21583 src: __m128,
21584 k: __mmask8,
21585 a: __m128,
21586 b: __m128,
21587 norm: _MM_MANTISSA_NORM_ENUM,
21588 sign: _MM_MANTISSA_SIGN_ENUM,
21589 sae: i32,
21590 ) -> __m128 {
21591 macro_rules! call {
21592 ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => {
21593 vgetmantss(
21594 a.as_f32x4(),
21595 b.as_f32x4(),
21596 $imm2 << 2 | $imm4_1,
21597 src.as_f32x4(),
21598 k,
21599 $imm4_2,
21600 )
21601 };
21602 }
21603 let r = constify_imm4_mantissas_sae!(norm, sign, sae, call);
21604 transmute(r)
21605 }
21606
21607 /// Normalize the mantissa of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
21608 /// The mantissa is normalized to the interval specified by interv, which can take the following values:\
21609 /// _MM_MANT_NORM_1_2 // interval [1, 2)\
21610 /// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
21611 /// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
21612 /// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
21613 /// The sign is determined by sc which can take the following values:\
21614 /// _MM_MANT_SIGN_src // sign = sign(src)\
21615 /// _MM_MANT_SIGN_zero // sign = 0\
21616 /// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\
21617 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
21618 ///
21619 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getmant_round_ss&expand=2894)
21620 #[inline]
21621 #[target_feature(enable = "avx512f")]
21622 #[cfg_attr(test, assert_instr(vgetmantss, norm = 0, sign = 0, sae = 4))]
21623 #[rustc_args_required_const(3, 4, 5)]
21624 pub unsafe fn _mm_maskz_getmant_round_ss(
21625 k: __mmask8,
21626 a: __m128,
21627 b: __m128,
21628 norm: _MM_MANTISSA_NORM_ENUM,
21629 sign: _MM_MANTISSA_SIGN_ENUM,
21630 sae: i32,
21631 ) -> __m128 {
21632 macro_rules! call {
21633 ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => {
21634 vgetmantss(
21635 a.as_f32x4(),
21636 b.as_f32x4(),
21637 $imm2 << 2 | $imm4_1,
21638 _mm_setzero_ps().as_f32x4(),
21639 k,
21640 $imm4_2,
21641 )
21642 };
21643 }
21644 let r = constify_imm4_mantissas_sae!(norm, sign, sae, call);
21645 transmute(r)
21646 }
21647
21648 /// Normalize the mantissa of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
21649 /// The mantissa is normalized to the interval specified by interv, which can take the following values:\
21650 /// _MM_MANT_NORM_1_2 // interval [1, 2)\
21651 /// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
21652 /// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
21653 /// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
21654 /// The sign is determined by sc which can take the following values:\
21655 /// _MM_MANT_SIGN_src // sign = sign(src)\
21656 /// _MM_MANT_SIGN_zero // sign = 0\
21657 /// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\
21658 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
21659 ///
21660 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getmant_round_sd&expand=2889)
21661 #[inline]
21662 #[target_feature(enable = "avx512f")]
21663 #[cfg_attr(test, assert_instr(vgetmantsd, norm = 0, sign = 0, sae = 4))]
21664 #[rustc_args_required_const(2, 3, 4)]
21665 pub unsafe fn _mm_getmant_round_sd(
21666 a: __m128d,
21667 b: __m128d,
21668 norm: _MM_MANTISSA_NORM_ENUM,
21669 sign: _MM_MANTISSA_SIGN_ENUM,
21670 sae: i32,
21671 ) -> __m128d {
21672 macro_rules! call {
21673 ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => {
21674 vgetmantsd(
21675 a.as_f64x2(),
21676 b.as_f64x2(),
21677 $imm2 << 2 | $imm4_1,
21678 _mm_setzero_pd().as_f64x2(),
21679 0b1,
21680 $imm4_2,
21681 )
21682 };
21683 }
21684 let r = constify_imm4_mantissas_sae!(norm, sign, sae, call);
21685 transmute(r)
21686 }
21687
21688 /// Normalize the mantissa of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
21689 /// The mantissa is normalized to the interval specified by interv, which can take the following values:\
21690 /// _MM_MANT_NORM_1_2 // interval [1, 2)\
21691 /// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
21692 /// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
21693 /// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
21694 /// The sign is determined by sc which can take the following values:\
21695 /// _MM_MANT_SIGN_src // sign = sign(src)\
21696 /// _MM_MANT_SIGN_zero // sign = 0\
21697 /// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\
21698 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
21699 ///
21700 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getmant_round_sd&expand=2890)
21701 #[inline]
21702 #[target_feature(enable = "avx512f")]
21703 #[cfg_attr(test, assert_instr(vgetmantsd, norm = 0, sign = 0, sae = 4))]
21704 #[rustc_args_required_const(4, 5, 6)]
21705 pub unsafe fn _mm_mask_getmant_round_sd(
21706 src: __m128d,
21707 k: __mmask8,
21708 a: __m128d,
21709 b: __m128d,
21710 norm: _MM_MANTISSA_NORM_ENUM,
21711 sign: _MM_MANTISSA_SIGN_ENUM,
21712 sae: i32,
21713 ) -> __m128d {
21714 macro_rules! call {
21715 ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => {
21716 vgetmantsd(
21717 a.as_f64x2(),
21718 b.as_f64x2(),
21719 $imm2 << 2 | $imm4_1,
21720 src.as_f64x2(),
21721 k,
21722 $imm4_2,
21723 )
21724 };
21725 }
21726 let r = constify_imm4_mantissas_sae!(norm, sign, sae, call);
21727 transmute(r)
21728 }
21729
21730 /// Normalize the mantissa of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by interv and the sign depends on sc and the source sign.\
21731 /// The mantissa is normalized to the interval specified by interv, which can take the following values:\
21732 /// _MM_MANT_NORM_1_2 // interval [1, 2)\
21733 /// _MM_MANT_NORM_p5_2 // interval [0.5, 2)\
21734 /// _MM_MANT_NORM_p5_1 // interval [0.5, 1)\
21735 /// _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
21736 /// The sign is determined by sc which can take the following values:\
21737 /// _MM_MANT_SIGN_src // sign = sign(src)\
21738 /// _MM_MANT_SIGN_zero // sign = 0\
21739 /// _MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1\
21740 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
21741 ///
21742 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getmant_round_sd&expand=2891)
21743 #[inline]
21744 #[target_feature(enable = "avx512f")]
21745 #[cfg_attr(test, assert_instr(vgetmantsd, norm = 0, sign = 0, sae = 4))]
21746 #[rustc_args_required_const(3, 4, 5)]
21747 pub unsafe fn _mm_maskz_getmant_round_sd(
21748 k: __mmask8,
21749 a: __m128d,
21750 b: __m128d,
21751 norm: _MM_MANTISSA_NORM_ENUM,
21752 sign: _MM_MANTISSA_SIGN_ENUM,
21753 sae: i32,
21754 ) -> __m128d {
21755 macro_rules! call {
21756 ($imm4_1:expr, $imm2:expr, $imm4_2:expr) => {
21757 vgetmantsd(
21758 a.as_f64x2(),
21759 b.as_f64x2(),
21760 $imm2 << 2 | $imm4_1,
21761 _mm_setzero_pd().as_f64x2(),
21762 k,
21763 $imm4_2,
21764 )
21765 };
21766 }
21767 let r = constify_imm4_mantissas_sae!(norm, sign, sae, call);
21768 transmute(r)
21769 }
21770
21771 /// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
21772 /// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
21773 /// _MM_FROUND_TO_NEAREST_INT // round to nearest\
21774 /// _MM_FROUND_TO_NEG_INF // round down\
21775 /// _MM_FROUND_TO_POS_INF // round up\
21776 /// _MM_FROUND_TO_ZERO // truncate\
21777 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE\
21778 ///
21779 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
21780 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_roundscale_round_ss&expand=4796)
21781 #[inline]
21782 #[target_feature(enable = "avx512f")]
21783 #[cfg_attr(test, assert_instr(vrndscaless, imm8 = 0, sae = 8))]
21784 #[rustc_args_required_const(2, 3)]
21785 pub unsafe fn _mm_roundscale_round_ss(a: __m128, b: __m128, imm8: i32, sae: i32) -> __m128 {
21786 let a = a.as_f32x4();
21787 let b = b.as_f32x4();
21788 let zero = _mm_setzero_ps().as_f32x4();
21789 macro_rules! call {
21790 ($imm8:expr, $imm4:expr) => {
21791 vrndscaless(a, b, zero, 0b11111111, $imm8, $imm4)
21792 };
21793 }
21794 let r = constify_imm8_roundscale!(imm8, sae, call);
21795 transmute(r)
21796 }
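// Illustrative usage sketch (editorial addition, not part of the upstream source),
// assuming an AVX512F-capable CPU. With imm8 = 0 the value is rounded to 0
// fraction bits, i.e. to the nearest integer under the selected rounding
// direction.
//
// #[target_feature(enable = "avx512f")]
// unsafe fn demo_roundscale_round_ss() {
//     let a = _mm_set_ps(0.0, 0.0, 0.0, 0.0);
//     let b = _mm_set_ps(0.0, 0.0, 0.0, 2.25); // lane 0 = 2.25
//     let r = _mm_roundscale_round_ss(a, b, 0, _MM_FROUND_CUR_DIRECTION);
//     assert_eq!(_mm_cvtss_f32(r), 2.0); // 2.25 rounded to the nearest integer
// }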
21797
21798 /// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
21799 /// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
21800 /// _MM_FROUND_TO_NEAREST_INT // round to nearest\
21801 /// _MM_FROUND_TO_NEG_INF // round down\
21802 /// _MM_FROUND_TO_POS_INF // round up\
21803 /// _MM_FROUND_TO_ZERO // truncate\
21804 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE\
21805 ///
21806 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
21807 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_roundscale_round_ss&expand=4794)
21808 #[inline]
21809 #[target_feature(enable = "avx512f")]
21810 #[cfg_attr(test, assert_instr(vrndscaless, imm8 = 0, sae = 8))]
21811 #[rustc_args_required_const(4, 5)]
21812 pub unsafe fn _mm_mask_roundscale_round_ss(
21813 src: __m128,
21814 k: __mmask8,
21815 a: __m128,
21816 b: __m128,
21817 imm8: i32,
21818 sae: i32,
21819 ) -> __m128 {
21820 let a = a.as_f32x4();
21821 let b = b.as_f32x4();
21822 let src = src.as_f32x4();
21823 macro_rules! call {
21824 ($imm8:expr, $imm4:expr) => {
21825 vrndscaless(a, b, src, k, $imm8, $imm4)
21826 };
21827 }
21828 let r = constify_imm8_roundscale!(imm8, sae, call);
21829 transmute(r)
21830 }
21831
21832 /// Round the lower single-precision (32-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
21833 /// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
21834 /// _MM_FROUND_TO_NEAREST_INT // round to nearest\
21835 /// _MM_FROUND_TO_NEG_INF // round down\
21836 /// _MM_FROUND_TO_POS_INF // round up\
21837 /// _MM_FROUND_TO_ZERO // truncate\
21838 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE\
21839 ///
21840 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
21841 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_roundscale_round_ss&expand=4795)
21842 #[inline]
21843 #[target_feature(enable = "avx512f")]
21844 #[cfg_attr(test, assert_instr(vrndscaless, imm8 = 0, sae = 8))]
21845 #[rustc_args_required_const(3, 4)]
21846 pub unsafe fn _mm_maskz_roundscale_round_ss(
21847 k: __mmask8,
21848 a: __m128,
21849 b: __m128,
21850 imm8: i32,
21851 sae: i32,
21852 ) -> __m128 {
21853 let a = a.as_f32x4();
21854 let b = b.as_f32x4();
21855 let zero = _mm_setzero_ps().as_f32x4();
21856 macro_rules! call {
21857 ($imm8:expr, $imm4:expr) => {
21858 vrndscaless(a, b, zero, k, $imm8, $imm4)
21859 };
21860 }
21861 let r = constify_imm8_roundscale!(imm8, sae, call);
21862 transmute(r)
21863 }
21864
21865 /// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\
21866 /// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
21867 /// _MM_FROUND_TO_NEAREST_INT // round to nearest\
21868 /// _MM_FROUND_TO_NEG_INF // round down\
21869 /// _MM_FROUND_TO_POS_INF // round up\
21870 /// _MM_FROUND_TO_ZERO // truncate\
21871 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE\
21872 ///
21873 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
21874 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_roundscale_round_sd&expand=4793)
21875 #[inline]
21876 #[target_feature(enable = "avx512f")]
21877 #[cfg_attr(test, assert_instr(vrndscalesd, imm8 = 0, sae = 8))]
21878 #[rustc_args_required_const(2, 3)]
21879 pub unsafe fn _mm_roundscale_round_sd(a: __m128d, b: __m128d, imm8: i32, sae: i32) -> __m128d {
21880 let a = a.as_f64x2();
21881 let b = b.as_f64x2();
21882 let zero = _mm_setzero_pd().as_f64x2();
21883 macro_rules! call {
21884 ($imm8:expr, $imm4:expr) => {
21885 vrndscalesd(a, b, zero, 0b11111111, $imm8, $imm4)
21886 };
21887 }
21888 let r = constify_imm8_roundscale!(imm8, sae, call);
21889 transmute(r)
21890 }
21891
21892 /// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
21893 /// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
21894 /// _MM_FROUND_TO_NEAREST_INT // round to nearest\
21895 /// _MM_FROUND_TO_NEG_INF // round down\
21896 /// _MM_FROUND_TO_POS_INF // round up\
21897 /// _MM_FROUND_TO_ZERO // truncate\
21898 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE\
21899 ///
21900 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
21901 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_roundscale_round_sd&expand=4791)
21902 #[inline]
21903 #[target_feature(enable = "avx512f")]
21904 #[cfg_attr(test, assert_instr(vrndscalesd, imm8 = 0, sae = 8))]
21905 #[rustc_args_required_const(4, 5)]
21906 pub unsafe fn _mm_mask_roundscale_round_sd(
21907 src: __m128d,
21908 k: __mmask8,
21909 a: __m128d,
21910 b: __m128d,
21911 imm8: i32,
21912 sae: i32,
21913 ) -> __m128d {
21914 let a = a.as_f64x2();
21915 let b = b.as_f64x2();
21916 let src = src.as_f64x2();
21917 macro_rules! call {
21918 ($imm8:expr, $imm4:expr) => {
21919 vrndscalesd(a, b, src, k, $imm8, $imm4)
21920 };
21921 }
21922 let r = constify_imm8_roundscale!(imm8, sae, call);
21923 transmute(r)
21924 }
21925
21926 /// Round the lower double-precision (64-bit) floating-point element in b to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
21927 /// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
21928 /// _MM_FROUND_TO_NEAREST_INT // round to nearest\
21929 /// _MM_FROUND_TO_NEG_INF // round down\
21930 /// _MM_FROUND_TO_POS_INF // round up\
21931 /// _MM_FROUND_TO_ZERO // truncate\
21932 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE\
21933 ///
21934 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
21935 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_roundscale_round_sd&expand=4792)
21936 #[inline]
21937 #[target_feature(enable = "avx512f")]
21938 #[cfg_attr(test, assert_instr(vrndscalesd, imm8 = 0, sae = 8))]
21939 #[rustc_args_required_const(3, 4)]
21940 pub unsafe fn _mm_maskz_roundscale_round_sd(
21941 k: __mmask8,
21942 a: __m128d,
21943 b: __m128d,
21944 imm8: i32,
21945 sae: i32,
21946 ) -> __m128d {
21947 let a = a.as_f64x2();
21948 let b = b.as_f64x2();
21949 let zero = _mm_setzero_pd().as_f64x2();
21950 macro_rules! call {
21951 ($imm8:expr, $imm4:expr) => {
21952 vrndscalesd(a, b, zero, k, $imm8, $imm4)
21953 };
21954 }
21955 let r = constify_imm8_roundscale!(imm8, sae, call);
21956 transmute(r)
21957 }
21958
21959 /// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
21960 ///
21961 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
21962 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
21963 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
21964 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
21965 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
21966 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
21967 ///
21968 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_scalef_round_ss&expand=4895)
21969 #[inline]
21970 #[target_feature(enable = "avx512f")]
21971 #[cfg_attr(test, assert_instr(vscalefss, rounding = 8))]
21972 #[rustc_args_required_const(2)]
21973 pub unsafe fn _mm_scalef_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 {
21974 let a = a.as_f32x4();
21975 let b = b.as_f32x4();
21976 let zero = _mm_setzero_ps().as_f32x4();
21977 macro_rules! call {
21978 ($imm4:expr) => {
21979 vscalefss(a, b, zero, 0b11111111, $imm4)
21980 };
21981 }
21982 let r = constify_imm4_round!(rounding, call);
21983 transmute(r)
21984 }
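// Illustrative usage sketch (editorial addition, not part of the upstream source),
// assuming an AVX512F-capable CPU. scalef computes a0 * 2^floor(b0) in the lower
// lane, so 1.5 scaled by 2^3 gives 12.0.
//
// #[target_feature(enable = "avx512f")]
// unsafe fn demo_scalef_round_ss() {
//     let a = _mm_set_ps(0.0, 0.0, 0.0, 1.5); // lane 0 = 1.5
//     let b = _mm_set_ps(0.0, 0.0, 0.0, 3.0); // lane 0 = 3.0
//     let r = _mm_scalef_round_ss(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
//     assert_eq!(_mm_cvtss_f32(r), 12.0); // 1.5 * 2^3
// }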
21985
21986 /// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
21987 ///
21988 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
21989 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
21990 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
21991 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
21992 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
21993 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
21994 ///
21995 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_scalef_round_ss&expand=4893)
21996 #[inline]
21997 #[target_feature(enable = "avx512f")]
21998 #[cfg_attr(test, assert_instr(vscalefss, rounding = 8))]
21999 #[rustc_args_required_const(4)]
22000 pub unsafe fn _mm_mask_scalef_round_ss(
22001 src: __m128,
22002 k: __mmask8,
22003 a: __m128,
22004 b: __m128,
22005 rounding: i32,
22006 ) -> __m128 {
22007 macro_rules! call {
22008 ($imm4:expr) => {
22009 vscalefss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, $imm4)
22010 };
22011 }
22012 let r = constify_imm4_round!(rounding, call);
22013 transmute(r)
22014 }
22015
22016 /// Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
22017 ///
22018 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
22019 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
22020 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
22021 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
22022 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
22023 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
22024 ///
22025 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_scalef_round_ss&expand=4894)
22026 #[inline]
22027 #[target_feature(enable = "avx512f")]
22028 #[cfg_attr(test, assert_instr(vscalefss, rounding = 8))]
22029 #[rustc_args_required_const(3)]
22030 pub unsafe fn _mm_maskz_scalef_round_ss(
22031 k: __mmask8,
22032 a: __m128,
22033 b: __m128,
22034 rounding: i32,
22035 ) -> __m128 {
22036 macro_rules! call {
22037 ($imm4:expr) => {
22038 vscalefss(
22039 a.as_f32x4(),
22040 b.as_f32x4(),
22041 _mm_setzero_ps().as_f32x4(),
22042 k,
22043 $imm4,
22044 )
22045 };
22046 }
22047 let r = constify_imm4_round!(rounding, call);
22048 transmute(r)
22049 }
22050
22051 /// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\
22052 ///
22053 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
22054 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
22055 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
22056 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
22057 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
22058 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
22059 ///
22060 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_scalef_round_sd&expand=4892)
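///
/// A minimal illustrative sketch (not part of Intel's documentation), using assumed
/// values; the lower result is `a[0] * 2^floor(b[0])`:
///
/// ```ignore
/// unsafe {
///     let a = _mm_set_sd(1.5);
///     let b = _mm_set_sd(3.0);
///     // lower lane: 1.5 * 2^3 = 12.0; the upper lane is copied from `a`
///     let r = _mm_scalef_round_sd(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
///     assert_eq!(_mm_cvtsd_f64(r), 12.0);
/// }
/// ```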
22061 #[inline]
22062 #[target_feature(enable = "avx512f")]
22063 #[cfg_attr(test, assert_instr(vscalefsd, rounding = 8))]
22064 #[rustc_args_required_const(2)]
22065 pub unsafe fn _mm_scalef_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d {
22066 macro_rules! call {
22067 ($imm4:expr) => {
22068 vscalefsd(
22069 a.as_f64x2(),
22070 b.as_f64x2(),
22071 _mm_setzero_pd().as_f64x2(),
22072 0b11111111,
22073 $imm4,
22074 )
22075 };
22076 }
22077 let r = constify_imm4_round!(rounding, call);
22078 transmute(r)
22079 }
22080
22081 /// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
22082 ///
22083 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
22084 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
22085 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
22086 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
22087 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
22088 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
22089 ///
22090 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_scalef_round_sd&expand=4890)
22091 #[inline]
22092 #[target_feature(enable = "avx512f")]
22093 #[cfg_attr(test, assert_instr(vscalefsd, rounding = 8))]
22094 #[rustc_args_required_const(4)]
22095 pub unsafe fn _mm_mask_scalef_round_sd(
22096 src: __m128d,
22097 k: __mmask8,
22098 a: __m128d,
22099 b: __m128d,
22100 rounding: i32,
22101 ) -> __m128d {
22102 macro_rules! call {
22103 ($imm4:expr) => {
22104 vscalefsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, $imm4)
22105 };
22106 }
22107 let r = constify_imm4_round!(rounding, call);
22108 transmute(r)
22109 }
22110
22111 /// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
22112 ///
22113 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
22114 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
22115 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
22116 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
22117 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
22118 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
22119 ///
22120 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_scalef_round_sd&expand=4891)
22121 #[inline]
22122 #[target_feature(enable = "avx512f")]
22123 #[cfg_attr(test, assert_instr(vscalefsd, rounding = 8))]
22124 #[rustc_args_required_const(3)]
22125 pub unsafe fn _mm_maskz_scalef_round_sd(
22126 k: __mmask8,
22127 a: __m128d,
22128 b: __m128d,
22129 rounding: i32,
22130 ) -> __m128d {
22131 macro_rules! call {
22132 ($imm4:expr) => {
22133 vscalefsd(
22134 a.as_f64x2(),
22135 b.as_f64x2(),
22136 _mm_setzero_pd().as_f64x2(),
22137 k,
22138 $imm4,
22139 )
22140 };
22141 }
22142 let r = constify_imm4_round!(rounding, call);
22143 transmute(r)
22144 }
22145
22146 /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
22147 ///
22148 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
22149 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
22150 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
22151 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
22152 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
22153 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
22154 ///
22155 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fmadd_round_ss&expand=2573)
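///
/// A minimal illustrative sketch (not part of Intel's documentation), using assumed
/// values; the lower result is `a[0] * b[0] + c[0]`:
///
/// ```ignore
/// unsafe {
///     let a = _mm_set_ss(2.0);
///     let b = _mm_set_ss(3.0);
///     let c = _mm_set_ss(1.0);
///     // lower lane: 2.0 * 3.0 + 1.0 = 7.0; the upper 3 lanes come from `a`
///     let r = _mm_fmadd_round_ss(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
///     assert_eq!(_mm_cvtss_f32(r), 7.0);
/// }
/// ```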
22156 #[inline]
22157 #[target_feature(enable = "avx512f")]
22158 #[cfg_attr(test, assert_instr(vfmadd213ss, rounding = 8))]
22159 #[rustc_args_required_const(3)]
22160 pub unsafe fn _mm_fmadd_round_ss(a: __m128, b: __m128, c: __m128, rounding: i32) -> __m128 {
22161 let extracta: f32 = simd_extract(a, 0);
22162 let extractb: f32 = simd_extract(b, 0);
22163 let extractc: f32 = simd_extract(c, 0);
22164 macro_rules! call {
22165 ($imm4:expr) => {
22166 vfmadd132ss(extracta, extractb, extractc, $imm4)
22167 };
22168 }
22169 let fmadd = constify_imm4_round!(rounding, call);
22170 let r = simd_insert(a, 0, fmadd);
22171 transmute(r)
22172 }
22173
22174 /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
22175 ///
22176 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
22177 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
22178 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
22179 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
22180 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
22181 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
22182 ///
22183 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmadd_round_ss&expand=2574)
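///
/// A minimal illustrative sketch (not part of Intel's documentation) of the writemask
/// behaviour, with assumed values:
///
/// ```ignore
/// unsafe {
///     let a = _mm_set_ss(2.0);
///     let b = _mm_set_ss(3.0);
///     let c = _mm_set_ss(1.0);
///     // mask bit 0 clear: the lower lane is copied from `a`
///     let r0 = _mm_mask_fmadd_round_ss(a, 0, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
///     assert_eq!(_mm_cvtss_f32(r0), 2.0);
///     // mask bit 0 set: 2.0 * 3.0 + 1.0 = 7.0
///     let r1 = _mm_mask_fmadd_round_ss(a, 1, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
///     assert_eq!(_mm_cvtss_f32(r1), 7.0);
/// }
/// ```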
22184 #[inline]
22185 #[target_feature(enable = "avx512f")]
22186 #[cfg_attr(test, assert_instr(vfmadd213ss, rounding = 8))]
22187 #[rustc_args_required_const(4)]
22188 pub unsafe fn _mm_mask_fmadd_round_ss(
22189 a: __m128,
22190 k: __mmask8,
22191 b: __m128,
22192 c: __m128,
22193 rounding: i32,
22194 ) -> __m128 {
22195 let mut fmadd: f32 = simd_extract(a, 0);
22196 if (k & 0b00000001) != 0 {
22197 let extractb: f32 = simd_extract(b, 0);
22198 let extractc: f32 = simd_extract(c, 0);
22199 macro_rules! call {
22200 ($imm4:expr) => {
22201 vfmadd132ss(fmadd, extractb, extractc, $imm4)
22202 };
22203 }
22204 fmadd = constify_imm4_round!(rounding, call);
22205 }
22206 let r = simd_insert(a, 0, fmadd);
22207 transmute(r)
22208 }
22209
22210 /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
22211 ///
22212 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
22213 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
22214 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
22215 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
22216 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
22217 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
22218 ///
22219 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmadd_round_ss&expand=2576)
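///
/// A minimal illustrative sketch (not part of Intel's documentation) of the zeromask
/// behaviour, with assumed values:
///
/// ```ignore
/// unsafe {
///     let a = _mm_set_ss(2.0);
///     let b = _mm_set_ss(3.0);
///     let c = _mm_set_ss(1.0);
///     // mask bit 0 clear: the lower lane is zeroed
///     let r0 = _mm_maskz_fmadd_round_ss(0, a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
///     assert_eq!(_mm_cvtss_f32(r0), 0.0);
///     // mask bit 0 set: 2.0 * 3.0 + 1.0 = 7.0
///     let r1 = _mm_maskz_fmadd_round_ss(1, a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
///     assert_eq!(_mm_cvtss_f32(r1), 7.0);
/// }
/// ```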
22220 #[inline]
22221 #[target_feature(enable = "avx512f")]
22222 #[cfg_attr(test, assert_instr(vfmadd213ss, rounding = 8))]
22223 #[rustc_args_required_const(4)]
22224 pub unsafe fn _mm_maskz_fmadd_round_ss(
22225 k: __mmask8,
22226 a: __m128,
22227 b: __m128,
22228 c: __m128,
22229 rounding: i32,
22230 ) -> __m128 {
22231 let mut fmadd: f32 = 0.;
22232 if (k & 0b00000001) != 0 {
22233 let extracta: f32 = simd_extract(a, 0);
22234 let extractb: f32 = simd_extract(b, 0);
22235 let extractc: f32 = simd_extract(c, 0);
22236 macro_rules! call {
22237 ($imm4:expr) => {
22238 vfmadd132ss(extracta, extractb, extractc, $imm4)
22239 };
22240 }
22241 fmadd = constify_imm4_round!(rounding, call);
22242 }
22243 let r = simd_insert(a, 0, fmadd);
22244 transmute(r)
22245 }
22246
22247 /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\
22248 ///
22249 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
22250 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
22251 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
22252 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
22253 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
22254 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
22255 ///
22256 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmadd_round_ss&expand=2575)
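///
/// A minimal illustrative sketch (not part of Intel's documentation), with assumed
/// values; note that the fallback element and the upper lanes come from `c`:
///
/// ```ignore
/// unsafe {
///     let a = _mm_set_ss(2.0);
///     let b = _mm_set_ss(3.0);
///     let c = _mm_set_ss(1.0);
///     // mask bit 0 clear: the lower lane is copied from `c`
///     let r0 = _mm_mask3_fmadd_round_ss(a, b, c, 0, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
///     assert_eq!(_mm_cvtss_f32(r0), 1.0);
///     // mask bit 0 set: 2.0 * 3.0 + 1.0 = 7.0
///     let r1 = _mm_mask3_fmadd_round_ss(a, b, c, 1, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
///     assert_eq!(_mm_cvtss_f32(r1), 7.0);
/// }
/// ```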
22257 #[inline]
22258 #[target_feature(enable = "avx512f")]
22259 #[cfg_attr(test, assert_instr(vfmadd213ss, rounding = 8))]
22260 #[rustc_args_required_const(4)]
22261 pub unsafe fn _mm_mask3_fmadd_round_ss(
22262 a: __m128,
22263 b: __m128,
22264 c: __m128,
22265 k: __mmask8,
22266 rounding: i32,
22267 ) -> __m128 {
22268 let mut fmadd: f32 = simd_extract(c, 0);
22269 if (k & 0b00000001) != 0 {
22270 let extracta: f32 = simd_extract(a, 0);
22271 let extractb: f32 = simd_extract(b, 0);
22272 macro_rules! call {
22273 ($imm4:expr) => {
22274 vfmadd132ss(extracta, extractb, fmadd, $imm4)
22275 };
22276 }
22277 fmadd = constify_imm4_round!(rounding, call);
22278 }
22279 let r = simd_insert(c, 0, fmadd);
22280 transmute(r)
22281 }
22282
22283 /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\
22284 ///
22285 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
22286 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
22287 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
22288 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
22289 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
22290 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
22291 ///
22292 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fmadd_round_sd&expand=2569)
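///
/// A minimal illustrative sketch (not part of Intel's documentation), using assumed
/// values; the lower result is `a[0] * b[0] + c[0]`:
///
/// ```ignore
/// unsafe {
///     let a = _mm_set_sd(2.0);
///     let b = _mm_set_sd(4.0);
///     let c = _mm_set_sd(1.0);
///     // lower lane: 2.0 * 4.0 + 1.0 = 9.0; the upper lane comes from `a`
///     let r = _mm_fmadd_round_sd(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
///     assert_eq!(_mm_cvtsd_f64(r), 9.0);
/// }
/// ```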
22293 #[inline]
22294 #[target_feature(enable = "avx512f")]
22295 #[cfg_attr(test, assert_instr(vfmadd213sd, rounding = 8))]
22296 #[rustc_args_required_const(3)]
22297 pub unsafe fn _mm_fmadd_round_sd(a: __m128d, b: __m128d, c: __m128d, rounding: i32) -> __m128d {
22298 let extracta: f64 = simd_extract(a, 0);
22299 let extractb: f64 = simd_extract(b, 0);
22300 let extractc: f64 = simd_extract(c, 0);
22301 macro_rules! call {
22302 ($imm4:expr) => {
22303 vfmadd132sd(extracta, extractb, extractc, $imm4)
22304 };
22305 }
22306 let fmadd = constify_imm4_round!(rounding, call);
22307 let r = simd_insert(a, 0, fmadd);
22308 transmute(r)
22309 }
22310
22311 /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
22312 ///
22313 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
22314 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
22315 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
22316 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
22317 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
22318 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
22319 ///
22320 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmadd_round_sd&expand=2570)
22321 #[inline]
22322 #[target_feature(enable = "avx512f")]
22323 #[cfg_attr(test, assert_instr(vfmadd213sd, rounding = 8))]
22324 #[rustc_args_required_const(4)]
22325 pub unsafe fn _mm_mask_fmadd_round_sd(
22326 a: __m128d,
22327 k: __mmask8,
22328 b: __m128d,
22329 c: __m128d,
22330 rounding: i32,
22331 ) -> __m128d {
22332 let mut fmadd: f64 = simd_extract(a, 0);
22333 if (k & 0b00000001) != 0 {
22334 let extractb: f64 = simd_extract(b, 0);
22335 let extractc: f64 = simd_extract(c, 0);
22336 macro_rules! call {
22337 ($imm4:expr) => {
22338 vfmadd132sd(fmadd, extractb, extractc, $imm4)
22339 };
22340 }
22341 fmadd = constify_imm4_round!(rounding, call);
22342 }
22343 let r = simd_insert(a, 0, fmadd);
22344 transmute(r)
22345 }
22346
22347 /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
22348 ///
22349 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
22350 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
22351 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
22352 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
22353 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
22354 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
22355 ///
22356 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmadd_round_sd&expand=2572)
22357 #[inline]
22358 #[target_feature(enable = "avx512f")]
22359 #[cfg_attr(test, assert_instr(vfmadd213sd, rounding = 8))]
22360 #[rustc_args_required_const(4)]
22361 pub unsafe fn _mm_maskz_fmadd_round_sd(
22362 k: __mmask8,
22363 a: __m128d,
22364 b: __m128d,
22365 c: __m128d,
22366 rounding: i32,
22367 ) -> __m128d {
22368 let mut fmadd: f64 = 0.;
22369 if (k & 0b00000001) != 0 {
22370 let extracta: f64 = simd_extract(a, 0);
22371 let extractb: f64 = simd_extract(b, 0);
22372 let extractc: f64 = simd_extract(c, 0);
22373 macro_rules! call {
22374 ($imm4:expr) => {
22375 vfmadd132sd(extracta, extractb, extractc, $imm4)
22376 };
22377 }
22378 fmadd = constify_imm4_round!(rounding, call);
22379 }
22380 let r = simd_insert(a, 0, fmadd);
22381 transmute(r)
22382 }
22383
22384 /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\
22385 ///
22386 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
22387 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
22388 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
22389 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
22390 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
22391 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
22392 ///
22393 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmadd_round_Sd&expand=2571)
22394 #[inline]
22395 #[target_feature(enable = "avx512f")]
22396 #[cfg_attr(test, assert_instr(vfmadd213sd, rounding = 8))]
22397 #[rustc_args_required_const(4)]
22398 pub unsafe fn _mm_mask3_fmadd_round_sd(
22399 a: __m128d,
22400 b: __m128d,
22401 c: __m128d,
22402 k: __mmask8,
22403 rounding: i32,
22404 ) -> __m128d {
22405 let mut fmadd: f64 = simd_extract(c, 0);
22406 if (k & 0b00000001) != 0 {
22407 let extracta: f64 = simd_extract(a, 0);
22408 let extractb: f64 = simd_extract(b, 0);
22409 macro_rules! call {
22410 ($imm4:expr) => {
22411 vfmadd132sd(extracta, extractb, fmadd, $imm4)
22412 };
22413 }
22414 fmadd = constify_imm4_round!(rounding, call);
22415 }
22416 let r = simd_insert(c, 0, fmadd);
22417 transmute(r)
22418 }
22419
22420 /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
22421 ///
22422 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
22423 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
22424 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
22425 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
22426 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
22427 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
22428 ///
22429 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fmsub_round_ss&expand=2659)
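///
/// A minimal illustrative sketch (not part of Intel's documentation), using assumed
/// values; the lower result is `a[0] * b[0] - c[0]`:
///
/// ```ignore
/// unsafe {
///     let a = _mm_set_ss(2.0);
///     let b = _mm_set_ss(3.0);
///     let c = _mm_set_ss(1.0);
///     // lower lane: 2.0 * 3.0 - 1.0 = 5.0; the upper 3 lanes come from `a`
///     let r = _mm_fmsub_round_ss(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
///     assert_eq!(_mm_cvtss_f32(r), 5.0);
/// }
/// ```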
22430 #[inline]
22431 #[target_feature(enable = "avx512f")]
22432 #[cfg_attr(test, assert_instr(vfmsub213ss, rounding = 8))]
22433 #[rustc_args_required_const(3)]
22434 pub unsafe fn _mm_fmsub_round_ss(a: __m128, b: __m128, c: __m128, rounding: i32) -> __m128 {
22435 let extracta: f32 = simd_extract(a, 0);
22436 let extractb: f32 = simd_extract(b, 0);
22437 let extractc: f32 = simd_extract(c, 0);
22438 let extractc = -extractc;
22439 macro_rules! call {
22440 ($imm4:expr) => {
22441 vfmadd132ss(extracta, extractb, extractc, $imm4)
22442 };
22443 }
22444 let fmsub = constify_imm4_round!(rounding, call);
22445 let r = simd_insert(a, 0, fmsub);
22446 transmute(r)
22447 }
22448
22449 /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
22450 ///
22451 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
22452 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
22453 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
22454 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
22455 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
22456 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
22457 ///
22458 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmsub_round_ss&expand=2660)
22459 #[inline]
22460 #[target_feature(enable = "avx512f")]
22461 #[cfg_attr(test, assert_instr(vfmsub213ss, rounding = 8))]
22462 #[rustc_args_required_const(4)]
22463 pub unsafe fn _mm_mask_fmsub_round_ss(
22464 a: __m128,
22465 k: __mmask8,
22466 b: __m128,
22467 c: __m128,
22468 rounding: i32,
22469 ) -> __m128 {
22470 let mut fmsub: f32 = simd_extract(a, 0);
22471 if (k & 0b00000001) != 0 {
22472 let extractb: f32 = simd_extract(b, 0);
22473 let extractc: f32 = simd_extract(c, 0);
22474 let extractc = -extractc;
22475 macro_rules! call {
22476 ($imm4:expr) => {
22477 vfmadd132ss(fmsub, extractb, extractc, $imm4)
22478 };
22479 }
22480 fmsub = constify_imm4_round!(rounding, call);
22481 }
22482 let r = simd_insert(a, 0, fmsub);
22483 transmute(r)
22484 }
22485
22486 /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
22487 ///
22488 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
22489 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
22490 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
22491 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
22492 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
22493 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
22494 ///
22495 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmsub_round_ss&expand=2662)
22496 #[inline]
22497 #[target_feature(enable = "avx512f")]
22498 #[cfg_attr(test, assert_instr(vfmsub213ss, rounding = 8))]
22499 #[rustc_args_required_const(4)]
22500 pub unsafe fn _mm_maskz_fmsub_round_ss(
22501 k: __mmask8,
22502 a: __m128,
22503 b: __m128,
22504 c: __m128,
22505 rounding: i32,
22506 ) -> __m128 {
22507 let mut fmsub: f32 = 0.;
22508 if (k & 0b00000001) != 0 {
22509 let extracta: f32 = simd_extract(a, 0);
22510 let extractb: f32 = simd_extract(b, 0);
22511 let extractc: f32 = simd_extract(c, 0);
22512 let extractc = -extractc;
22513 macro_rules! call {
22514 ($imm4:expr) => {
22515 vfmadd132ss(extracta, extractb, extractc, $imm4)
22516 };
22517 }
22518 fmsub = constify_imm4_round!(rounding, call);
22519 }
22520 let r = simd_insert(a, 0, fmsub);
22521 transmute(r)
22522 }
22523
22524 /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\
22525 ///
22526 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
22527 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
22528 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
22529 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
22530 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
22531 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
22532 ///
22533 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmsub_round_ss&expand=2661)
22534 #[inline]
22535 #[target_feature(enable = "avx512f")]
22536 #[cfg_attr(test, assert_instr(vfmsub213ss, rounding = 8))]
22537 #[rustc_args_required_const(4)]
22538 pub unsafe fn _mm_mask3_fmsub_round_ss(
22539 a: __m128,
22540 b: __m128,
22541 c: __m128,
22542 k: __mmask8,
22543 rounding: i32,
22544 ) -> __m128 {
22545 let mut fmsub: f32 = simd_extract(c, 0);
22546 if (k & 0b00000001) != 0 {
22547 let extracta: f32 = simd_extract(a, 0);
22548 let extractb: f32 = simd_extract(b, 0);
22549 let extractc = -fmsub;
22550 macro_rules! call {
22551 ($imm4:expr) => {
22552 vfmadd132ss(extracta, extractb, extractc, $imm4)
22553 };
22554 }
22555 fmsub = constify_imm4_round!(rounding, call);
22556 }
22557 let r = simd_insert(c, 0, fmsub);
22558 transmute(r)
22559 }
22560
22561 /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\
22562 ///
22563 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
22564 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
22565 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
22566 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
22567 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
22568 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
22569 ///
22570 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fmsub_round_sd&expand=2655)
22571 #[inline]
22572 #[target_feature(enable = "avx512f")]
22573 #[cfg_attr(test, assert_instr(vfmsub213sd, rounding = 8))]
22574 #[rustc_args_required_const(3)]
22575 pub unsafe fn _mm_fmsub_round_sd(a: __m128d, b: __m128d, c: __m128d, rounding: i32) -> __m128d {
22576 let extracta: f64 = simd_extract(a, 0);
22577 let extractb: f64 = simd_extract(b, 0);
22578 let extractc: f64 = simd_extract(c, 0);
22579 let extractc = -extractc;
22580 macro_rules! call {
22581 ($imm4:expr) => {
22582 vfmadd132sd(extracta, extractb, extractc, $imm4)
22583 };
22584 }
22585 let fmsub = constify_imm4_round!(rounding, call);
22586 let r = simd_insert(a, 0, fmsub);
22587 transmute(r)
22588 }
22589
22590 /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
22591 ///
22592 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
22593 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
22594 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
22595 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
22596 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
22597 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
22598 ///
22599 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmsub_round_sd&expand=2656)
22600 #[inline]
22601 #[target_feature(enable = "avx512f")]
22602 #[cfg_attr(test, assert_instr(vfmsub213sd, rounding = 8))]
22603 #[rustc_args_required_const(4)]
22604 pub unsafe fn _mm_mask_fmsub_round_sd(
22605 a: __m128d,
22606 k: __mmask8,
22607 b: __m128d,
22608 c: __m128d,
22609 rounding: i32,
22610 ) -> __m128d {
22611 let mut fmsub: f64 = simd_extract(a, 0);
22612 if (k & 0b00000001) != 0 {
22613 let extractb: f64 = simd_extract(b, 0);
22614 let extractc: f64 = simd_extract(c, 0);
22615 let extractc = -extractc;
22616 macro_rules! call {
22617 ($imm4:expr) => {
22618 vfmadd132sd(fmsub, extractb, extractc, $imm4)
22619 };
22620 }
22621 fmsub = constify_imm4_round!(rounding, call);
22622 }
22623 let r = simd_insert(a, 0, fmsub);
22624 transmute(r)
22625 }
22626
22627 /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
22628 ///
22629 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
22630 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
22631 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
22632 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
22633 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
22634 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
22635 ///
22636 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmsub_round_sd&expand=2658)
22637 #[inline]
22638 #[target_feature(enable = "avx512f")]
22639 #[cfg_attr(test, assert_instr(vfmsub213sd, rounding = 8))]
22640 #[rustc_args_required_const(4)]
22641 pub unsafe fn _mm_maskz_fmsub_round_sd(
22642 k: __mmask8,
22643 a: __m128d,
22644 b: __m128d,
22645 c: __m128d,
22646 rounding: i32,
22647 ) -> __m128d {
22648 let mut fmsub: f64 = 0.;
22649 if (k & 0b00000001) != 0 {
22650 let extracta: f64 = simd_extract(a, 0);
22651 let extractb: f64 = simd_extract(b, 0);
22652 let extractc: f64 = simd_extract(c, 0);
22653 let extractc = -extractc;
22654 macro_rules! call {
22655 ($imm4:expr) => {
22656 vfmadd132sd(extracta, extractb, extractc, $imm4)
22657 };
22658 }
22659 fmsub = constify_imm4_round!(rounding, call);
22660 }
22661 let r = simd_insert(a, 0, fmsub);
22662 transmute(r)
22663 }
22664
22665 /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\
22666 ///
22667 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
22668 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
22669 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
22670 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
22671 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
22672 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
22673 ///
22674 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmsub_round_sd&expand=2657)
22675 #[inline]
22676 #[target_feature(enable = "avx512f")]
22677 #[cfg_attr(test, assert_instr(vfmsub213sd, rounding = 8))]
22678 #[rustc_args_required_const(4)]
22679 pub unsafe fn _mm_mask3_fmsub_round_sd(
22680 a: __m128d,
22681 b: __m128d,
22682 c: __m128d,
22683 k: __mmask8,
22684 rounding: i32,
22685 ) -> __m128d {
22686 let mut fmsub: f64 = simd_extract(c, 0);
22687 if (k & 0b00000001) != 0 {
22688 let extracta: f64 = simd_extract(a, 0);
22689 let extractb: f64 = simd_extract(b, 0);
22690 let extractc = -fmsub;
22691 macro_rules! call {
22692 ($imm4:expr) => {
22693 vfmadd132sd(extracta, extractb, extractc, $imm4)
22694 };
22695 }
22696 fmsub = constify_imm4_round!(rounding, call);
22697 }
22698 let r = simd_insert(c, 0, fmsub);
22699 transmute(r)
22700 }
22701
22702 /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
22703 ///
22704 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
22705 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
22706 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
22707 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
22708 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
22709 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
22710 ///
22711 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fnmadd_round_ss&expand=2739)
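///
/// A minimal illustrative sketch (not part of Intel's documentation), using assumed
/// values; the lower result is `-(a[0] * b[0]) + c[0]`:
///
/// ```ignore
/// unsafe {
///     let a = _mm_set_ss(2.0);
///     let b = _mm_set_ss(3.0);
///     let c = _mm_set_ss(1.0);
///     // lower lane: -(2.0 * 3.0) + 1.0 = -5.0; the upper 3 lanes come from `a`
///     let r = _mm_fnmadd_round_ss(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
///     assert_eq!(_mm_cvtss_f32(r), -5.0);
/// }
/// ```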
22712 #[inline]
22713 #[target_feature(enable = "avx512f")]
22714 #[cfg_attr(test, assert_instr(vfnmadd213ss, rounding = 8))]
22715 #[rustc_args_required_const(3)]
22716 pub unsafe fn _mm_fnmadd_round_ss(a: __m128, b: __m128, c: __m128, rounding: i32) -> __m128 {
22717 let extracta: f32 = simd_extract(a, 0);
22718 let extracta = -extracta;
22719 let extractb: f32 = simd_extract(b, 0);
22720 let extractc: f32 = simd_extract(c, 0);
22721 macro_rules! call {
22722 ($imm4:expr) => {
22723 vfmadd132ss(extracta, extractb, extractc, $imm4)
22724 };
22725 }
22726 let fnmadd = constify_imm4_round!(rounding, call);
22727 let r = simd_insert(a, 0, fnmadd);
22728 transmute(r)
22729 }
22730
22731 /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
22732 ///
22733 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
22734 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
22735 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
22736 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
22737 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
22738 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
22739 ///
22740 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmadd_round_ss&expand=2740)
22741 #[inline]
22742 #[target_feature(enable = "avx512f")]
22743 #[cfg_attr(test, assert_instr(vfnmadd213ss, rounding = 8))]
22744 #[rustc_args_required_const(4)]
22745 pub unsafe fn _mm_mask_fnmadd_round_ss(
22746 a: __m128,
22747 k: __mmask8,
22748 b: __m128,
22749 c: __m128,
22750 rounding: i32,
22751 ) -> __m128 {
22752 let mut fnmadd: f32 = simd_extract(a, 0);
22753 if (k & 0b00000001) != 0 {
22754 let extracta = -fnmadd;
22755 let extractb: f32 = simd_extract(b, 0);
22756 let extractc: f32 = simd_extract(c, 0);
22757 macro_rules! call {
22758 ($imm4:expr) => {
22759 vfmadd132ss(extracta, extractb, extractc, $imm4)
22760 };
22761 }
22762 fnmadd = constify_imm4_round!(rounding, call);
22763 }
22764 let r = simd_insert(a, 0, fnmadd);
22765 transmute(r)
22766 }
22767
22768 /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
22769 ///
22770 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
22771 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
22772 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
22773 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
22774 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
22775 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
22776 ///
22777 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmadd_round_ss&expand=2742)
22778 #[inline]
22779 #[target_feature(enable = "avx512f")]
22780 #[cfg_attr(test, assert_instr(vfnmadd213ss, rounding = 8))]
22781 #[rustc_args_required_const(4)]
22782 pub unsafe fn _mm_maskz_fnmadd_round_ss(
22783 k: __mmask8,
22784 a: __m128,
22785 b: __m128,
22786 c: __m128,
22787 rounding: i32,
22788 ) -> __m128 {
22789 let mut fnmadd: f32 = 0.;
22790 if (k & 0b00000001) != 0 {
22791 let extracta: f32 = simd_extract(a, 0);
22792 let extracta = -extracta;
22793 let extractb: f32 = simd_extract(b, 0);
22794 let extractc: f32 = simd_extract(c, 0);
22795 macro_rules! call {
22796 ($imm4:expr) => {
22797 vfmadd132ss(extracta, extractb, extractc, $imm4)
22798 };
22799 }
22800 fnmadd = constify_imm4_round!(rounding, call);
22801 }
22802 let r = simd_insert(a, 0, fnmadd);
22803 transmute(r)
22804 }
22805
22806 /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\
22807 ///
22808 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
22809 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
22810 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
22811 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
22812 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
22813 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
22814 ///
22815 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmadd_round_ss&expand=2741)
22816 #[inline]
22817 #[target_feature(enable = "avx512f")]
22818 #[cfg_attr(test, assert_instr(vfnmadd213ss, rounding = 8))]
22819 #[rustc_args_required_const(4)]
22820 pub unsafe fn _mm_mask3_fnmadd_round_ss(
22821 a: __m128,
22822 b: __m128,
22823 c: __m128,
22824 k: __mmask8,
22825 rounding: i32,
22826 ) -> __m128 {
22827 let mut fnmadd: f32 = simd_extract(c, 0);
22828 if (k & 0b00000001) != 0 {
22829 let extracta: f32 = simd_extract(a, 0);
22830 let extracta = -extracta;
22831 let extractb: f32 = simd_extract(b, 0);
22832 macro_rules! call {
22833 ($imm4:expr) => {
22834 vfmadd132ss(extracta, extractb, fnmadd, $imm4)
22835 };
22836 }
22837 fnmadd = constify_imm4_round!(rounding, call);
22838 }
22839 let r = simd_insert(c, 0, fnmadd);
22840 transmute(r)
22841 }
22842
22843 /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\
22844 ///
22845 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
22846 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
22847 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
22848 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
22849 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
22850 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
22851 ///
22852 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fnmadd_round_sd&expand=2735)
22853 #[inline]
22854 #[target_feature(enable = "avx512f")]
22855 #[cfg_attr(test, assert_instr(vfnmadd213sd, rounding = 8))]
22856 #[rustc_args_required_const(3)]
22857 pub unsafe fn _mm_fnmadd_round_sd(a: __m128d, b: __m128d, c: __m128d, rounding: i32) -> __m128d {
22858 let extracta: f64 = simd_extract(a, 0);
22859 let extracta = -extracta;
22860 let extractb: f64 = simd_extract(b, 0);
22861 let extractc: f64 = simd_extract(c, 0);
22862 macro_rules! call {
22863 ($imm4:expr) => {
22864 vfmadd132sd(extracta, extractb, extractc, $imm4)
22865 };
22866 }
22867 let fnmadd = constify_imm4_round!(rounding, call);
22868 let r = simd_insert(a, 0, fnmadd);
22869 transmute(r)
22870 }
22871
22872 /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
22873 ///
22874 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
22875 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
22876 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
22877 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
22878 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
22879 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
22880 ///
22881 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmadd_round_sd&expand=2736)
22882 #[inline]
22883 #[target_feature(enable = "avx512f")]
22884 #[cfg_attr(test, assert_instr(vfnmadd213sd, rounding = 8))]
22885 #[rustc_args_required_const(4)]
22886 pub unsafe fn _mm_mask_fnmadd_round_sd(
22887 a: __m128d,
22888 k: __mmask8,
22889 b: __m128d,
22890 c: __m128d,
22891 rounding: i32,
22892 ) -> __m128d {
22893 let mut fnmadd: f64 = simd_extract(a, 0);
22894 if (k & 0b00000001) != 0 {
22895 let extracta = -fnmadd;
22896 let extractb: f64 = simd_extract(b, 0);
22897 let extractc: f64 = simd_extract(c, 0);
22898 macro_rules! call {
22899 ($imm4:expr) => {
22900 vfmadd132sd(extracta, extractb, extractc, $imm4)
22901 };
22902 }
22903 fnmadd = constify_imm4_round!(rounding, call);
22904 }
22905 let r = simd_insert(a, 0, fnmadd);
22906 transmute(r)
22907 }
22908
22909 /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
22910 ///
22911 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
22912 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
22913 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
22914 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
22915 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
22916 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
22917 ///
22918 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmadd_round_sd&expand=2738)
22919 #[inline]
22920 #[target_feature(enable = "avx512f")]
22921 #[cfg_attr(test, assert_instr(vfnmadd213sd, rounding = 8))]
22922 #[rustc_args_required_const(4)]
22923 pub unsafe fn _mm_maskz_fnmadd_round_sd(
22924 k: __mmask8,
22925 a: __m128d,
22926 b: __m128d,
22927 c: __m128d,
22928 rounding: i32,
22929 ) -> __m128d {
22930 let mut fnmadd: f64 = 0.;
22931 if (k & 0b00000001) != 0 {
22932 let extracta: f64 = simd_extract(a, 0);
22933 let extracta = -extracta;
22934 let extractb: f64 = simd_extract(b, 0);
22935 let extractc: f64 = simd_extract(c, 0);
22936 macro_rules! call {
22937 ($imm4:expr) => {
22938 vfmadd132sd(extracta, extractb, extractc, $imm4)
22939 };
22940 }
22941 fnmadd = constify_imm4_round!(rounding, call);
22942 }
22943 let r = simd_insert(a, 0, fnmadd);
22944 transmute(r)
22945 }
22946
22947 /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\
22948 ///
22949 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
22950 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
22951 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
22952 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
22953 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
22954 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
22955 ///
22956 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmadd_round_Sd&expand=2737)
22957 #[inline]
22958 #[target_feature(enable = "avx512f")]
22959 #[cfg_attr(test, assert_instr(vfnmadd213sd, rounding = 8))]
22960 #[rustc_args_required_const(4)]
22961 pub unsafe fn _mm_mask3_fnmadd_round_sd(
22962 a: __m128d,
22963 b: __m128d,
22964 c: __m128d,
22965 k: __mmask8,
22966 rounding: i32,
22967 ) -> __m128d {
22968 let mut fnmadd: f64 = simd_extract(c, 0);
22969 if (k & 0b00000001) != 0 {
22970 let extracta: f64 = simd_extract(a, 0);
22971 let extracta = -extracta;
22972 let extractb: f64 = simd_extract(b, 0);
22973 macro_rules! call {
22974 ($imm4:expr) => {
22975 vfmadd132sd(extracta, extractb, fnmadd, $imm4)
22976 };
22977 }
22978 fnmadd = constify_imm4_round!(rounding, call);
22979 }
22980 let r = simd_insert(c, 0, fnmadd);
22981 transmute(r)
22982 }
22983
22984 /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, subtract the lower element in c from the negated intermediate result, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
22985 ///
22986 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
22987 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
22988 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
22989 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
22990 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
22991 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
22992 ///
22993 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fnmsub_round_ss&expand=2787)
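///
/// A minimal illustrative sketch (not part of Intel's documentation), using assumed
/// values; the lower result is `-(a[0] * b[0]) - c[0]`:
///
/// ```ignore
/// unsafe {
///     let a = _mm_set_ss(2.0);
///     let b = _mm_set_ss(3.0);
///     let c = _mm_set_ss(1.0);
///     // lower lane: -(2.0 * 3.0) - 1.0 = -7.0; the upper 3 lanes come from `a`
///     let r = _mm_fnmsub_round_ss(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
///     assert_eq!(_mm_cvtss_f32(r), -7.0);
/// }
/// ```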
22994 #[inline]
22995 #[target_feature(enable = "avx512f")]
22996 #[cfg_attr(test, assert_instr(vfnmsub213ss, rounding = 8))]
22997 #[rustc_args_required_const(3)]
22998 pub unsafe fn _mm_fnmsub_round_ss(a: __m128, b: __m128, c: __m128, rounding: i32) -> __m128 {
22999 let extracta: f32 = simd_extract(a, 0);
23000 let extracta = -extracta;
23001 let extractb: f32 = simd_extract(b, 0);
23002 let extractc: f32 = simd_extract(c, 0);
23003 let extractc = -extractc;
23004 macro_rules! call {
23005 ($imm4:expr) => {
23006 vfmadd132ss(extracta, extractb, extractc, $imm4)
23007 };
23008 }
23009 let fnmsub = constify_imm4_round!(rounding, call);
23010 let r = simd_insert(a, 0, fnmsub);
23011 transmute(r)
23012 }
23013
23014 /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
23015 ///
23016 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
23017 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
23018 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
23019 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
23020 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
23021 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
23022 ///
23023 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmsub_round_ss&expand=2788)
23024 #[inline]
23025 #[target_feature(enable = "avx512f")]
23026 #[cfg_attr(test, assert_instr(vfnmsub213ss, rounding = 8))]
23027 #[rustc_args_required_const(4)]
23028 pub unsafe fn _mm_mask_fnmsub_round_ss(
23029 a: __m128,
23030 k: __mmask8,
23031 b: __m128,
23032 c: __m128,
23033 rounding: i32,
23034 ) -> __m128 {
23035 let mut fnmsub: f32 = simd_extract(a, 0);
23036 if (k & 0b00000001) != 0 {
23037 let extracta = -fnmsub;
23038 let extractb: f32 = simd_extract(b, 0);
23039 let extractc: f32 = simd_extract(c, 0);
23040 let extractc = -extractc;
23041 macro_rules! call {
23042 ($imm4:expr) => {
23043 vfmadd132ss(extracta, extractb, extractc, $imm4)
23044 };
23045 }
23046 fnmsub = constify_imm4_round!(rounding, call);
23047 }
23048 let r = simd_insert(a, 0, fnmsub);
23049 transmute(r)
23050 }
23051
23052 /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
23053 ///
23054 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
23055 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
23056 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
23057 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
23058 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
23059 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
23060 ///
23061 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmsub_round_ss&expand=2790)
23062 #[inline]
23063 #[target_feature(enable = "avx512f")]
23064 #[cfg_attr(test, assert_instr(vfnmsub213ss, rounding = 8))]
23065 #[rustc_args_required_const(4)]
23066 pub unsafe fn _mm_maskz_fnmsub_round_ss(
23067 k: __mmask8,
23068 a: __m128,
23069 b: __m128,
23070 c: __m128,
23071 rounding: i32,
23072 ) -> __m128 {
23073 let mut fnmsub: f32 = 0.;
23074 if (k & 0b00000001) != 0 {
23075 let extracta: f32 = simd_extract(a, 0);
23076 let extracta = -extracta;
23077 let extractb: f32 = simd_extract(b, 0);
23078 let extractc: f32 = simd_extract(c, 0);
23079 let extractc = -extractc;
23080 macro_rules! call {
23081 ($imm4:expr) => {
23082 vfmadd132ss(extracta, extractb, extractc, $imm4)
23083 };
23084 }
23085 fnmsub = constify_imm4_round!(rounding, call);
23086 }
23087 let r = simd_insert(a, 0, fnmsub);
23088 transmute(r)
23089 }
23090
23091 /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\
23092 ///
23093 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
23094 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
23095 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
23096 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
23097 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
23098 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
23099 ///
23100 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmsub_round_ss&expand=2789)
23101 #[inline]
23102 #[target_feature(enable = "avx512f")]
23103 #[cfg_attr(test, assert_instr(vfnmsub213ss, rounding = 8))]
23104 #[rustc_args_required_const(4)]
23105 pub unsafe fn _mm_mask3_fnmsub_round_ss(
23106 a: __m128,
23107 b: __m128,
23108 c: __m128,
23109 k: __mmask8,
23110 rounding: i32,
23111 ) -> __m128 {
23112 let mut fnmsub: f32 = simd_extract(c, 0);
23113 if (k & 0b00000001) != 0 {
23114 let extracta: f32 = simd_extract(a, 0);
23115 let extracta = -extracta;
23116 let extractb: f32 = simd_extract(b, 0);
23117 let extractc = -fnmsub;
23118 macro_rules! call {
23119 ($imm4:expr) => {
23120 vfmadd132ss(extracta, extractb, extractc, $imm4)
23121 };
23122 }
23123 fnmsub = constify_imm4_round!(rounding, call);
23124 }
23125 let r = simd_insert(c, 0, fnmsub);
23126 transmute(r)
23127 }
23128
23129 /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\
23130 ///
23131 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
23132 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
23133 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
23134 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
23135 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
23136 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
23137 ///
23138 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fnmsub_round_sd&expand=2783)
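///
/// A minimal illustrative sketch (not part of Intel's documentation), using assumed
/// values; the lower result is `-(a[0] * b[0]) - c[0]`:
///
/// ```ignore
/// unsafe {
///     let a = _mm_set_sd(2.0);
///     let b = _mm_set_sd(4.0);
///     let c = _mm_set_sd(1.0);
///     // lower lane: -(2.0 * 4.0) - 1.0 = -9.0; the upper lane comes from `a`
///     let r = _mm_fnmsub_round_sd(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
///     assert_eq!(_mm_cvtsd_f64(r), -9.0);
/// }
/// ```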
23139 #[inline]
23140 #[target_feature(enable = "avx512f")]
23141 #[cfg_attr(test, assert_instr(vfnmsub213sd, rounding = 8))]
23142 #[rustc_args_required_const(3)]
23143 pub unsafe fn _mm_fnmsub_round_sd(a: __m128d, b: __m128d, c: __m128d, rounding: i32) -> __m128d {
23144 let extracta: f64 = simd_extract(a, 0);
23145 let extracta = -extracta;
23146 let extractb: f64 = simd_extract(b, 0);
23147 let extractc: f64 = simd_extract(c, 0);
23148 let extractc = -extractc;
23149 macro_rules! call {
23150 ($imm4:expr) => {
23151 vfmadd132sd(extracta, extractb, extractc, $imm4)
23152 };
23153 }
23154 let fnmsub = constify_imm4_round!(rounding, call);
23155 let r = simd_insert(a, 0, fnmsub);
23156 transmute(r)
23157 }
23158
23159 /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
23160 ///
23161 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
23162 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
23163 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
23164 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
23165 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
23166 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
23167 ///
23168 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmsub_round_sd&expand=2784)
23169 #[inline]
23170 #[target_feature(enable = "avx512f")]
23171 #[cfg_attr(test, assert_instr(vfnmsub213sd, rounding = 8))]
23172 #[rustc_args_required_const(4)]
23173 pub unsafe fn _mm_mask_fnmsub_round_sd(
23174 a: __m128d,
23175 k: __mmask8,
23176 b: __m128d,
23177 c: __m128d,
23178 rounding: i32,
23179 ) -> __m128d {
23180 let mut fnmsub: f64 = simd_extract(a, 0);
23181 if (k & 0b00000001) != 0 {
23182 let extracta = -fnmsub;
23183 let extractb: f64 = simd_extract(b, 0);
23184 let extractc: f64 = simd_extract(c, 0);
23185 let extractc = -extractc;
23186 macro_rules! call {
23187 ($imm4:expr) => {
23188 vfmadd132sd(extracta, extractb, extractc, $imm4)
23189 };
23190 }
23191 fnmsub = constify_imm4_round!(rounding, call);
23192 }
23193 let r = simd_insert(a, 0, fnmsub);
23194 transmute(r)
23195 }
23196
23197 /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
23198 ///
23199 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
23200 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
23201 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
23202 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
23203 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
23204 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
23205 ///
23206 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmsub_round_sd&expand=2786)
23207 #[inline]
23208 #[target_feature(enable = "avx512f")]
23209 #[cfg_attr(test, assert_instr(vfnmsub213sd, rounding = 8))]
23210 #[rustc_args_required_const(4)]
23211 pub unsafe fn _mm_maskz_fnmsub_round_sd(
23212 k: __mmask8,
23213 a: __m128d,
23214 b: __m128d,
23215 c: __m128d,
23216 rounding: i32,
23217 ) -> __m128d {
23218 let mut fnmsub: f64 = 0.;
23219 if (k & 0b00000001) != 0 {
23220 let extracta: f64 = simd_extract(a, 0);
23221 let extracta = -extracta;
23222 let extractb: f64 = simd_extract(b, 0);
23223 let extractc: f64 = simd_extract(c, 0);
23224 let extractc = -extractc;
23225 macro_rules! call {
23226 ($imm4:expr) => {
23227 vfmadd132sd(extracta, extractb, extractc, $imm4)
23228 };
23229 }
23230 fnmsub = constify_imm4_round!(rounding, call);
23231 }
23232 let r = simd_insert(a, 0, fnmsub);
23233 transmute(r)
23234 }
23235
23236 /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\
23237 ///
23238 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
23239 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
23240 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
23241 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
23242 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
23243 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
23244 ///
23245 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmsub_round_sd&expand=2785)
23246 #[inline]
23247 #[target_feature(enable = "avx512f")]
23248 #[cfg_attr(test, assert_instr(vfnmsub213sd, rounding = 8))]
23249 #[rustc_args_required_const(4)]
23250 pub unsafe fn _mm_mask3_fnmsub_round_sd(
23251 a: __m128d,
23252 b: __m128d,
23253 c: __m128d,
23254 k: __mmask8,
23255 rounding: i32,
23256 ) -> __m128d {
23257 let mut fnmsub: f64 = simd_extract(c, 0);
23258 if (k & 0b00000001) != 0 {
23259 let extracta: f64 = simd_extract(a, 0);
23260 let extracta = -extracta;
23261 let extractb: f64 = simd_extract(b, 0);
23262 let extractc = -fnmsub;
23263 macro_rules! call {
23264 ($imm4:expr) => {
23265 vfmadd132sd(extracta, extractb, extractc, $imm4)
23266 };
23267 }
23268 fnmsub = constify_imm4_round!(rounding, call);
23269 }
23270 let r = simd_insert(c, 0, fnmsub);
23271 transmute(r)
23272 }
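
// Editorial usage sketch (not part of the upstream source; the function name is
// hypothetical): the low lane of the masked scalar FNMSUB intrinsics above computes
// -(a0 * b0) - c0 when mask bit 0 is set and is passed through otherwise.
#[cfg(test)]
#[target_feature(enable = "avx512f")]
unsafe fn example_mask_fnmsub_round_sd_sketch() {
    let a = _mm_set_pd(2.0, 1.0); // low lane 1.0, high lane 2.0
    let b = _mm_set_pd(0.0, 3.0); // low lane 3.0
    let c = _mm_set_pd(0.0, 4.0); // low lane 4.0
    // Mask bit 0 set: low lane = -(1.0 * 3.0) - 4.0 = -7.0, high lane copied from `a`.
    let r = _mm_mask_fnmsub_round_sd(a, 0b00000001, b, c, _MM_FROUND_CUR_DIRECTION);
    assert_eq!(_mm_cvtsd_f64(r), -7.0);
    // Mask bit 0 clear: the low lane of `a` is kept unchanged.
    let r = _mm_mask_fnmsub_round_sd(a, 0b00000000, b, c, _MM_FROUND_CUR_DIRECTION);
    assert_eq!(_mm_cvtsd_f64(r), 1.0);
}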
23273
23274 /// Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting.
23275 ///
23276 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fixupimm_ss&expand=2517)
23277 #[inline]
23278 #[target_feature(enable = "avx512f")]
23279 #[cfg_attr(test, assert_instr(vfixupimmss, imm8 = 0))]
23280 #[rustc_args_required_const(3)]
23281 pub unsafe fn _mm_fixupimm_ss(a: __m128, b: __m128, c: __m128i, imm8: i32) -> __m128 {
23282 let a = a.as_f32x4();
23283 let b = b.as_f32x4();
23284 let c = c.as_i32x4();
23285 macro_rules! call {
23286 ($imm8:expr) => {
23287 vfixupimmss(a, b, c, $imm8, 0b11111111, _MM_FROUND_CUR_DIRECTION)
23288 };
23289 }
23290 let fixupimm = constify_imm8_sae!(imm8, call);
23291 let fixupimm: f32 = simd_extract(fixupimm, 0);
23292 let r = simd_insert(a, 0, fixupimm);
23293 transmute(r)
23294 }
23295
23296 /// Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting.
23297 ///
23298 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fixupimm_ss&expand=2518)
23299 #[inline]
23300 #[target_feature(enable = "avx512f")]
23301 #[cfg_attr(test, assert_instr(vfixupimmss, imm8 = 0))]
23302 #[rustc_args_required_const(4)]
23303 pub unsafe fn _mm_mask_fixupimm_ss(
23304 a: __m128,
23305 k: __mmask8,
23306 b: __m128,
23307 c: __m128i,
23308 imm8: i32,
23309 ) -> __m128 {
23310 let a = a.as_f32x4();
23311 let b = b.as_f32x4();
23312 let c = c.as_i32x4();
23313 macro_rules! call {
23314 ($imm8:expr) => {
23315 vfixupimmss(a, b, c, $imm8, k, _MM_FROUND_CUR_DIRECTION)
23316 };
23317 }
23318 let fixupimm = constify_imm8_sae!(imm8, call);
23319 let fixupimm: f32 = simd_extract(fixupimm, 0);
23320 let r = simd_insert(a, 0, fixupimm);
23321 transmute(r)
23322 }
23323
23324 /// Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting.
23325 ///
23326 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fixupimm_ss&expand=2519)
23327 #[inline]
23328 #[target_feature(enable = "avx512f")]
23329 #[cfg_attr(test, assert_instr(vfixupimmss, imm8 = 0))]
23330 #[rustc_args_required_const(4)]
23331 pub unsafe fn _mm_maskz_fixupimm_ss(
23332 k: __mmask8,
23333 a: __m128,
23334 b: __m128,
23335 c: __m128i,
23336 imm8: i32,
23337 ) -> __m128 {
23338 let a = a.as_f32x4();
23339 let b = b.as_f32x4();
23340 let c = c.as_i32x4();
23341 macro_rules! call {
23342 ($imm8:expr) => {
23343 vfixupimmssz(a, b, c, $imm8, k, _MM_FROUND_CUR_DIRECTION)
23344 };
23345 }
23346 let fixupimm = constify_imm8_sae!(imm8, call);
23347 let fixupimm: f32 = simd_extract(fixupimm, 0);
23348 let r = simd_insert(a, 0, fixupimm);
23349 transmute(r)
23350 }
23351
23352 /// Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. imm8 is used to set the required flags reporting.
23353 ///
23354 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fixupimm_sd&expand=2514)
23355 #[inline]
23356 #[target_feature(enable = "avx512f")]
23357 #[cfg_attr(test, assert_instr(vfixupimmsd, imm8 = 0))]
23358 #[rustc_args_required_const(3)]
23359 pub unsafe fn _mm_fixupimm_sd(a: __m128d, b: __m128d, c: __m128i, imm8: i32) -> __m128d {
23360 let a = a.as_f64x2();
23361 let b = b.as_f64x2();
23362 let c = c.as_i64x2();
23363 macro_rules! call {
23364 ($imm8:expr) => {
23365 vfixupimmsd(a, b, c, $imm8, 0b11111111, _MM_FROUND_CUR_DIRECTION)
23366 };
23367 }
23368 let fixupimm = constify_imm8_sae!(imm8, call);
23369 let fixupimm: f64 = simd_extract(fixupimm, 0);
23370 let r = simd_insert(a, 0, fixupimm);
23371 transmute(r)
23372 }
23373
23374 /// Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. imm8 is used to set the required flags reporting.
23375 ///
23376 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fixupimm_sd&expand=2515)
23377 #[inline]
23378 #[target_feature(enable = "avx512f")]
23379 #[cfg_attr(test, assert_instr(vfixupimmsd, imm8 = 0))]
23380 #[rustc_args_required_const(4)]
23381 pub unsafe fn _mm_mask_fixupimm_sd(
23382 a: __m128d,
23383 k: __mmask8,
23384 b: __m128d,
23385 c: __m128i,
23386 imm8: i32,
23387 ) -> __m128d {
23388 let a = a.as_f64x2();
23389 let b = b.as_f64x2();
23390 let c = c.as_i64x2();
23391 macro_rules! call {
23392 ($imm8:expr) => {
23393 vfixupimmsd(a, b, c, $imm8, k, _MM_FROUND_CUR_DIRECTION)
23394 };
23395 }
23396 let fixupimm = constify_imm8_sae!(imm8, call);
23397 let fixupimm: f64 = simd_extract(fixupimm, 0);
23398 let r = simd_insert(a, 0, fixupimm);
23399 transmute(r)
23400 }
23401
23402 /// Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. imm8 is used to set the required flags reporting.
23403 ///
23404 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fixupimm_sd&expand=2516)
23405 #[inline]
23406 #[target_feature(enable = "avx512f")]
23407 #[cfg_attr(test, assert_instr(vfixupimmsd, imm8 = 0))]
23408 #[rustc_args_required_const(4)]
23409 pub unsafe fn _mm_maskz_fixupimm_sd(
23410 k: __mmask8,
23411 a: __m128d,
23412 b: __m128d,
23413 c: __m128i,
23414 imm8: i32,
23415 ) -> __m128d {
23416 let a = a.as_f64x2();
23417 let b = b.as_f64x2();
23418 let c = c.as_i64x2();
23419 macro_rules! call {
23420 ($imm8:expr) => {
23421 vfixupimmsdz(a, b, c, $imm8, k, _MM_FROUND_CUR_DIRECTION)
23422 };
23423 }
23424 let fixupimm = constify_imm8_sae!(imm8, call);
23425 let fixupimm: f64 = simd_extract(fixupimm, 0);
23426 let r = simd_insert(a, 0, fixupimm);
23427 transmute(r)
23428 }
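
// Editorial call-shape sketch for the scalar FIXUPIMM intrinsics above (not part of
// the upstream source; the function name is hypothetical). The low lane of `c` is a
// table of 4-bit fix-up tokens indexed by the classification of b's low element;
// per Intel's token table, an all-zero table selects "keep the destination
// element", so the low lane of the result is expected to stay a's low element.
#[cfg(test)]
#[target_feature(enable = "avx512f")]
unsafe fn example_fixupimm_sd_sketch() {
    let a = _mm_set_pd(8.0, 2.5); // low lane 2.5
    let b = _mm_set_pd(0.0, -1.0); // low lane is classified as a negative finite value
    let c = _mm_setzero_si128(); // all-zero token table: every class maps to token 0
    let r = _mm_fixupimm_sd(a, b, c, 0); // imm8 = 0: no extra exception reporting
    assert_eq!(_mm_cvtsd_f64(r), 2.5);
}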
23429
23430 /// Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting.\
23431 ///
23432 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
23433 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fixupimm_round_ss&expand=2511)
23434 #[inline]
23435 #[target_feature(enable = "avx512f")]
23436 #[cfg_attr(test, assert_instr(vfixupimmss, imm8 = 0, sae = 8))]
23437 #[rustc_args_required_const(3, 4)]
23438 pub unsafe fn _mm_fixupimm_round_ss(
23439 a: __m128,
23440 b: __m128,
23441 c: __m128i,
23442 imm8: i32,
23443 sae: i32,
23444 ) -> __m128 {
23445 let a = a.as_f32x4();
23446 let b = b.as_f32x4();
23447 let c = c.as_i32x4();
23448 macro_rules! call {
23449 ($imm8:expr, $imm4:expr) => {
23450 vfixupimmss(a, b, c, $imm8, 0b11111111, $imm4)
23451 };
23452 }
23453 let fixupimm = constify_imm8_roundscale!(imm8, sae, call);
23454 let fixupimm: f32 = simd_extract(fixupimm, 0);
23455 let r = simd_insert(a, 0, fixupimm);
23456 transmute(r)
23457 }
23458
23459 /// Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting.\
23460 ///
23461 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
23462 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fixupimm_round_ss&expand=2512)
23463 #[inline]
23464 #[target_feature(enable = "avx512f")]
23465 #[cfg_attr(test, assert_instr(vfixupimmss, imm8 = 0, sae = 8))]
23466 #[rustc_args_required_const(4, 5)]
23467 pub unsafe fn _mm_mask_fixupimm_round_ss(
23468 a: __m128,
23469 k: __mmask8,
23470 b: __m128,
23471 c: __m128i,
23472 imm8: i32,
23473 sae: i32,
23474 ) -> __m128 {
23475 let a = a.as_f32x4();
23476 let b = b.as_f32x4();
23477 let c = c.as_i32x4();
23478 macro_rules! call {
23479 ($imm8:expr, $imm4:expr) => {
23480 vfixupimmss(a, b, c, $imm8, k, $imm4)
23481 };
23482 }
23483 let fixupimm = constify_imm8_roundscale!(imm8, sae, call);
23484 let fixupimm: f32 = simd_extract(fixupimm, 0);
23485 let r = simd_insert(a, 0, fixupimm);
23486 transmute(r)
23487 }
23488
23489 /// Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting.\
23490 ///
23491 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
23492 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fixupimm_round_ss&expand=2513)
23493 #[inline]
23494 #[target_feature(enable = "avx512f")]
23495 #[cfg_attr(test, assert_instr(vfixupimmss, imm8 = 0, sae = 8))]
23496 #[rustc_args_required_const(4, 5)]
23497 pub unsafe fn _mm_maskz_fixupimm_round_ss(
23498 k: __mmask8,
23499 a: __m128,
23500 b: __m128,
23501 c: __m128i,
23502 imm8: i32,
23503 sae: i32,
23504 ) -> __m128 {
23505 let a = a.as_f32x4();
23506 let b = b.as_f32x4();
23507 let c = c.as_i32x4();
23508 macro_rules! call {
23509 ($imm8:expr, $imm4:expr) => {
23510 vfixupimmssz(a, b, c, $imm8, k, $imm4)
23511 };
23512 }
23513 let fixupimm = constify_imm8_roundscale!(imm8, sae, call);
23514 let fixupimm: f32 = simd_extract(fixupimm, 0);
23515 let r = simd_insert(a, 0, fixupimm);
23516 transmute(r)
23517 }
23518
23519 /// Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. imm8 is used to set the required flags reporting.\
23520 ///
23521 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
23522 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fixupimm_round_sd&expand=2508)
23523 #[inline]
23524 #[target_feature(enable = "avx512f")]
23525 #[cfg_attr(test, assert_instr(vfixupimmsd, imm8 = 0, sae = 8))]
23526 #[rustc_args_required_const(3, 4)]
23527 pub unsafe fn _mm_fixupimm_round_sd(
23528 a: __m128d,
23529 b: __m128d,
23530 c: __m128i,
23531 imm8: i32,
23532 sae: i32,
23533 ) -> __m128d {
23534 let a = a.as_f64x2();
23535 let b = b.as_f64x2();
23536 let c = c.as_i64x2();
23537 macro_rules! call {
23538 ($imm8:expr, $imm4:expr) => {
23539 vfixupimmsd(a, b, c, $imm8, 0b11111111, $imm4)
23540 };
23541 }
23542 let fixupimm = constify_imm8_roundscale!(imm8, sae, call);
23543 let fixupimm: f64 = simd_extract(fixupimm, 0);
23544 let r = simd_insert(a, 0, fixupimm);
23545 transmute(r)
23546 }
23547
23548 /// Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. imm8 is used to set the required flags reporting.\
23549 ///
23550 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
23551 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fixupimm_round_sd&expand=2509)
23552 #[inline]
23553 #[target_feature(enable = "avx512f")]
23554 #[cfg_attr(test, assert_instr(vfixupimmsd, imm8 = 0, sae = 8))]
23555 #[rustc_args_required_const(4, 5)]
23556 pub unsafe fn _mm_mask_fixupimm_round_sd(
23557 a: __m128d,
23558 k: __mmask8,
23559 b: __m128d,
23560 c: __m128i,
23561 imm8: i32,
23562 sae: i32,
23563 ) -> __m128d {
23564 let a = a.as_f64x2();
23565 let b = b.as_f64x2();
23566 let c = c.as_i64x2();
23567 macro_rules! call {
23568 ($imm8:expr, $imm4:expr) => {
23569 vfixupimmsd(a, b, c, $imm8, k, $imm4)
23570 };
23571 }
23572 let fixupimm = constify_imm8_roundscale!(imm8, sae, call);
23573 let fixupimm: f64 = simd_extract(fixupimm, 0);
23574 let r = simd_insert(a, 0, fixupimm);
23575 transmute(r)
23576 }
23577
23578 /// Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. imm8 is used to set the required flags reporting.\
23579 ///
23580 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
23581 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fixupimm_round_sd&expand=2510)
23582 #[inline]
23583 #[target_feature(enable = "avx512f")]
23584 #[cfg_attr(test, assert_instr(vfixupimmsd, imm8 = 0, sae = 8))]
23585 #[rustc_args_required_const(4, 5)]
23586 pub unsafe fn _mm_maskz_fixupimm_round_sd(
23587 k: __mmask8,
23588 a: __m128d,
23589 b: __m128d,
23590 c: __m128i,
23591 imm8: i32,
23592 sae: i32,
23593 ) -> __m128d {
23594 let a = a.as_f64x2();
23595 let b = b.as_f64x2();
23596 let c = c.as_i64x2();
23597 macro_rules! call {
23598 ($imm8:expr, $imm4:expr) => {
23599 vfixupimmsdz(a, b, c, $imm8, k, $imm4)
23600 };
23601 }
23602 let fixupimm = constify_imm8_roundscale!(imm8, sae, call);
23603 let fixupimm: f64 = simd_extract(fixupimm, 0);
23604 let r = simd_insert(a, 0, fixupimm);
23605 transmute(r)
23606 }
23607
23608 /// Convert the lower single-precision (32-bit) floating-point element in b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
23609 ///
23610 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_cvtss_sd&expand=1896)
23611 #[inline]
23612 #[target_feature(enable = "avx512f")]
23613 #[cfg_attr(test, assert_instr(vcvtss2sd))]
23614 pub unsafe fn _mm_mask_cvtss_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128) -> __m128d {
23615 transmute(vcvtss2sd(
23616 a.as_f64x2(),
23617 b.as_f32x4(),
23618 src.as_f64x2(),
23619 k,
23620 _MM_FROUND_CUR_DIRECTION,
23621 ))
23622 }
23623
23624 /// Convert the lower single-precision (32-bit) floating-point element in b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
23625 ///
23626 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_cvtss_sd&expand=1897)
23627 #[inline]
23628 #[target_feature(enable = "avx512f")]
23629 #[cfg_attr(test, assert_instr(vcvtss2sd))]
23630 pub unsafe fn _mm_maskz_cvtss_sd(k: __mmask8, a: __m128d, b: __m128) -> __m128d {
23631 transmute(vcvtss2sd(
23632 a.as_f64x2(),
23633 b.as_f32x4(),
23634 _mm_setzero_pd().as_f64x2(),
23635 k,
23636 _MM_FROUND_CUR_DIRECTION,
23637 ))
23638 }
23639
23640 /// Convert the lower double-precision (64-bit) floating-point element in b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
23641 ///
23642 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_cvtsd_ss&expand=1797)
23643 #[inline]
23644 #[target_feature(enable = "avx512f")]
23645 #[cfg_attr(test, assert_instr(vcvtsd2ss))]
23646 pub unsafe fn _mm_mask_cvtsd_ss(src: __m128, k: __mmask8, a: __m128, b: __m128d) -> __m128 {
23647 transmute(vcvtsd2ss(
23648 a.as_f32x4(),
23649 b.as_f64x2(),
23650 src.as_f32x4(),
23651 k,
23652 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC,
23653 ))
23654 }
23655
23656 /// Convert the lower double-precision (64-bit) floating-point element in b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
23657 ///
23658 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_cvtsd_ss&expand=1798)
23659 #[inline]
23660 #[target_feature(enable = "avx512f")]
23661 #[cfg_attr(test, assert_instr(vcvtsd2ss))]
23662 pub unsafe fn _mm_maskz_cvtsd_ss(k: __mmask8, a: __m128, b: __m128d) -> __m128 {
23663 transmute(vcvtsd2ss(
23664 a.as_f32x4(),
23665 b.as_f64x2(),
23666 _mm_setzero_ps().as_f32x4(),
23667 k,
23668 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC,
23669 ))
23670 }
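
// Editorial usage sketch for the masked scalar conversions above (not part of the
// upstream source; the function name is hypothetical): with mask bit 0 set the low
// lane holds the converted value, with it clear the writemask form keeps src's low
// lane and the zeromask form zeroes it; the upper lanes always come from `a`.
#[cfg(test)]
#[target_feature(enable = "avx512f")]
unsafe fn example_masked_scalar_convert_sketch() {
    let src = _mm_set_pd(0.0, -9.0); // low lane -9.0
    let a = _mm_set_pd(7.0, 7.0);
    let b = _mm_set_ps(0.0, 0.0, 0.0, 1.5); // low lane 1.5f32
    // Mask bit 0 set: the low lane becomes 1.5f64.
    let r = _mm_mask_cvtss_sd(src, 0b00000001, a, b);
    assert_eq!(_mm_cvtsd_f64(r), 1.5);
    // Mask bit 0 clear: the low lane is taken from `src`.
    let r = _mm_mask_cvtss_sd(src, 0b00000000, a, b);
    assert_eq!(_mm_cvtsd_f64(r), -9.0);
    // Zero-masked double-to-single conversion: the low lane is zeroed instead.
    let r = _mm_maskz_cvtsd_ss(0b00000000, _mm_set1_ps(3.0), _mm_set1_pd(2.0));
    assert_eq!(_mm_cvtss_f32(r), 0.0);
}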
23671
23672 /// Convert the lower single-precision (32-bit) floating-point element in b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\
23673 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
23674 ///
23675 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundss_sd&expand=1371)
23676 #[inline]
23677 #[target_feature(enable = "avx512f")]
23678 #[cfg_attr(test, assert_instr(vcvtss2sd, sae = 8))]
23679 #[rustc_args_required_const(2)]
23680 pub unsafe fn _mm_cvt_roundss_sd(a: __m128d, b: __m128, sae: i32) -> __m128d {
23681 macro_rules! call {
23682 ($imm4:expr) => {
23683 vcvtss2sd(
23684 a.as_f64x2(),
23685 b.as_f32x4(),
23686 _mm_setzero_pd().as_f64x2(),
23687 0b11111111,
23688 $imm4,
23689 )
23690 };
23691 }
23692 let r = constify_imm4_sae!(sae, call);
23693 transmute(r)
23694 }
23695
23696 /// Convert the lower single-precision (32-bit) floating-point element in b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
23697 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
23698 ///
23699 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_cvt_roundss_sd&expand=1372)
23700 #[inline]
23701 #[target_feature(enable = "avx512f")]
23702 #[cfg_attr(test, assert_instr(vcvtss2sd, sae = 8))]
23703 #[rustc_args_required_const(4)]
23704 pub unsafe fn _mm_mask_cvt_roundss_sd(
23705 src: __m128d,
23706 k: __mmask8,
23707 a: __m128d,
23708 b: __m128,
23709 sae: i32,
23710 ) -> __m128d {
23711 macro_rules! call {
23712 ($imm4:expr) => {
23713 vcvtss2sd(a.as_f64x2(), b.as_f32x4(), src.as_f64x2(), k, $imm4)
23714 };
23715 }
23716 let r = constify_imm4_sae!(sae, call);
23717 transmute(r)
23718 }
23719
23720 /// Convert the lower single-precision (32-bit) floating-point element in b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
23721 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
23722 ///
23723 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_cvt_roundss_sd&expand=1373)
23724 #[inline]
23725 #[target_feature(enable = "avx512f")]
23726 #[cfg_attr(test, assert_instr(vcvtss2sd, sae = 8))]
23727 #[rustc_args_required_const(3)]
23728 pub unsafe fn _mm_maskz_cvt_roundss_sd(k: __mmask8, a: __m128d, b: __m128, sae: i32) -> __m128d {
23729 macro_rules! call {
23730 ($imm4:expr) => {
23731 vcvtss2sd(
23732 a.as_f64x2(),
23733 b.as_f32x4(),
23734 _mm_setzero_pd().as_f64x2(),
23735 k,
23736 $imm4,
23737 )
23738 };
23739 }
23740 let r = constify_imm4_sae!(sae, call);
23741 transmute(r)
23742 }
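
// Editorial sketch for the SAE-taking conversion above (not part of the upstream
// source; the function name is hypothetical). Widening f32 to f64 is exact, so the
// `sae` argument only controls exception reporting; `_MM_FROUND_NO_EXC` would
// suppress exceptions entirely.
#[cfg(test)]
#[target_feature(enable = "avx512f")]
unsafe fn example_cvt_roundss_sd_sketch() {
    let a = _mm_set_pd(6.0, 0.0); // high lane 6.0 is copied to the result
    let b = _mm_set_ps(0.0, 0.0, 0.0, 1.5);
    let r = _mm_cvt_roundss_sd(a, b, _MM_FROUND_CUR_DIRECTION);
    assert_eq!(_mm_cvtsd_f64(r), 1.5);
}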
23743
23744 /// Convert the lower double-precision (64-bit) floating-point element in b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
23745 ///
23746 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
23747 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
23748 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
23749 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
23750 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
23751 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
23752 ///
23753 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundsd_ss&expand=1361)
23754 #[inline]
23755 #[target_feature(enable = "avx512f")]
23756 #[cfg_attr(test, assert_instr(vcvtsd2ss, rounding = 8))]
23757 #[rustc_args_required_const(2)]
23758 pub unsafe fn _mm_cvt_roundsd_ss(a: __m128, b: __m128d, rounding: i32) -> __m128 {
23759 macro_rules! call {
23760 ($imm4:expr) => {
23761 vcvtsd2ss(
23762 a.as_f32x4(),
23763 b.as_f64x2(),
23764 _mm_setzero_ps().as_f32x4(),
23765 0b11111111,
23766 $imm4,
23767 )
23768 };
23769 }
23770 let r = constify_imm4_round!(rounding, call);
23771 transmute(r)
23772 }
23773
23774 /// Convert the lower double-precision (64-bit) floating-point element in b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
23775 ///
23776 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
23777 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
23778 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
23779 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
23780 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
23781 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
23782 ///
23783 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_cvt_roundsd_ss&expand=1362)
23784 #[inline]
23785 #[target_feature(enable = "avx512f")]
23786 #[cfg_attr(test, assert_instr(vcvtsd2ss, rounding = 8))]
23787 #[rustc_args_required_const(4)]
23788 pub unsafe fn _mm_mask_cvt_roundsd_ss(
23789 src: __m128,
23790 k: __mmask8,
23791 a: __m128,
23792 b: __m128d,
23793 rounding: i32,
23794 ) -> __m128 {
23795 macro_rules! call {
23796 ($imm4:expr) => {
23797 vcvtsd2ss(a.as_f32x4(), b.as_f64x2(), src.as_f32x4(), k, $imm4)
23798 };
23799 }
23800 let r = constify_imm4_round!(rounding, call);
23801 transmute(r)
23802 }
23803
23804 /// Convert the lower double-precision (64-bit) floating-point element in b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\
23805 ///
23806 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
23807 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
23808 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
23809 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
23810 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
23811 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
23812 ///
23813 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_cvt_roundsd_ss&expand=1363)
23814 #[inline]
23815 #[target_feature(enable = "avx512f")]
23816 #[cfg_attr(test, assert_instr(vcvtsd2ss, rounding = 8))]
23817 #[rustc_args_required_const(3)]
23818 pub unsafe fn _mm_maskz_cvt_roundsd_ss(
23819 k: __mmask8,
23820 a: __m128,
23821 b: __m128d,
23822 rounding: i32,
23823 ) -> __m128 {
23824 macro_rules! call {
23825 ($imm4:expr) => {
23826 vcvtsd2ss(
23827 a.as_f32x4(),
23828 b.as_f64x2(),
23829 _mm_setzero_ps().as_f32x4(),
23830 k,
23831 $imm4,
23832 )
23833 };
23834 }
23835 let r = constify_imm4_round!(rounding, call);
23836 transmute(r)
23837 }
23838
23839 /// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer, and store the result in dst.\
23840 ///
23841 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
23842 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
23843 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
23844 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
23845 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
23846 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
23847 ///
23848 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundss_si32&expand=1374)
23849 #[inline]
23850 #[target_feature(enable = "avx512f")]
23851 #[cfg_attr(test, assert_instr(vcvtss2si, rounding = 8))]
23852 #[rustc_args_required_const(1)]
23853 pub unsafe fn _mm_cvt_roundss_si32(a: __m128, rounding: i32) -> i32 {
23854 macro_rules! call {
23855 ($imm4:expr) => {
23856 vcvtss2si(a.as_f32x4(), $imm4)
23857 };
23858 }
23859 let r = constify_imm4_round!(rounding, call);
23860 transmute(r)
23861 }
23862
23863 /// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer, and store the result in dst.\
23864 ///
23865 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
23866 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
23867 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
23868 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
23869 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
23870 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
23871 ///
23872 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundss_i32&expand=1369)
23873 #[inline]
23874 #[target_feature(enable = "avx512f")]
23875 #[cfg_attr(test, assert_instr(vcvtss2si, rounding = 8))]
23876 #[rustc_args_required_const(1)]
23877 pub unsafe fn _mm_cvt_roundss_i32(a: __m128, rounding: i32) -> i32 {
23878 macro_rules! call {
23879 ($imm4:expr) => {
23880 vcvtss2si(a.as_f32x4(), $imm4)
23881 };
23882 }
23883 let r = constify_imm4_round!(rounding, call);
23884 transmute(r)
23885 }
23886
23887 /// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst.\
23888 ///
23889 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
23890 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
23891 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
23892 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
23893 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
23894 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
23895 ///
23896 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundss_u32&expand=1376)
23897 #[inline]
23898 #[target_feature(enable = "avx512f")]
23899 #[cfg_attr(test, assert_instr(vcvtss2usi, rounding = 8))]
23900 #[rustc_args_required_const(1)]
23901 pub unsafe fn _mm_cvt_roundss_u32(a: __m128, rounding: i32) -> u32 {
23902 macro_rules! call {
23903 ($imm4:expr) => {
23904 vcvtss2usi(a.as_f32x4(), $imm4)
23905 };
23906 }
23907 let r = constify_imm4_round!(rounding, call);
23908 transmute(r)
23909 }
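
// Editorial sketch showing how the explicit rounding mode changes the scalar
// float-to-integer conversions above (not part of the upstream source; the function
// name is hypothetical): 2.5 rounds to 2 under round-to-nearest-even and to 3 when
// rounding toward +infinity.
#[cfg(test)]
#[target_feature(enable = "avx512f")]
unsafe fn example_cvt_roundss_si32_sketch() {
    let a = _mm_set_ps(0.0, 0.0, 0.0, 2.5); // low lane 2.5
    let nearest = _mm_cvt_roundss_si32(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    let up = _mm_cvt_roundss_si32(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
    assert_eq!(nearest, 2); // ties round to even
    assert_eq!(up, 3);
    // The unsigned variant follows the same rounding rules.
    let u = _mm_cvt_roundss_u32(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
    assert_eq!(u, 2);
}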
23910
23911 /// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer, and store the result in dst.
23912 ///
23913 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtss_i32&expand=1893)
23914 #[inline]
23915 #[target_feature(enable = "avx512f")]
23916 #[cfg_attr(test, assert_instr(vcvtss2si))]
23917 pub unsafe fn _mm_cvtss_i32(a: __m128) -> i32 {
23918 transmute(vcvtss2si(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION))
23919 }
23920
23921 /// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst.
23922 ///
23923 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtss_u32&expand=1901)
23924 #[inline]
23925 #[target_feature(enable = "avx512f")]
23926 #[cfg_attr(test, assert_instr(vcvtss2usi))]
23927 pub unsafe fn _mm_cvtss_u32(a: __m128) -> u32 {
23928 transmute(vcvtss2usi(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION))
23929 }
23930
23931 /// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer, and store the result in dst.\
23932 ///
23933 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
23934 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
23935 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
23936 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
23937 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
23938 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
23939 ///
23940 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundsd_si32&expand=1359)
23941 #[inline]
23942 #[target_feature(enable = "avx512f")]
23943 #[cfg_attr(test, assert_instr(vcvtsd2si, rounding = 8))]
23944 #[rustc_args_required_const(1)]
23945 pub unsafe fn _mm_cvt_roundsd_si32(a: __m128d, rounding: i32) -> i32 {
23946 macro_rules! call {
23947 ($imm4:expr) => {
23948 vcvtsd2si(a.as_f64x2(), $imm4)
23949 };
23950 }
23951 let r = constify_imm4_round!(rounding, call);
23952 transmute(r)
23953 }
23954
23955 /// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer, and store the result in dst.\
23956 ///
23957 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
23958 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
23959 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
23960 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
23961 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
23962 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
23963 ///
23964 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundsd_i32&expand=1357)
23965 #[inline]
23966 #[target_feature(enable = "avx512f")]
23967 #[cfg_attr(test, assert_instr(vcvtsd2si, rounding = 8))]
23968 #[rustc_args_required_const(1)]
23969 pub unsafe fn _mm_cvt_roundsd_i32(a: __m128d, rounding: i32) -> i32 {
23970 macro_rules! call {
23971 ($imm4:expr) => {
23972 vcvtsd2si(a.as_f64x2(), $imm4)
23973 };
23974 }
23975 let r = constify_imm4_round!(rounding, call);
23976 transmute(r)
23977 }
23978
23979 /// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst.\
23980 ///
23981 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
23982 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
23983 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
23984 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
23985 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
23986 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
23987 ///
23988 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=cvt_roundsd_u32&expand=1364)
23989 #[inline]
23990 #[target_feature(enable = "avx512f")]
23991 #[cfg_attr(test, assert_instr(vcvtsd2usi, rounding = 8))]
23992 #[rustc_args_required_const(1)]
23993 pub unsafe fn _mm_cvt_roundsd_u32(a: __m128d, rounding: i32) -> u32 {
23994 macro_rules! call {
23995 ($imm4:expr) => {
23996 vcvtsd2usi(a.as_f64x2(), $imm4)
23997 };
23998 }
23999 let r = constify_imm4_round!(rounding, call);
24000 transmute(r)
24001 }
24002
24003 /// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer, and store the result in dst.
24004 ///
24005 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtsd_i32&expand=1791)
24006 #[inline]
24007 #[target_feature(enable = "avx512f")]
24008 #[cfg_attr(test, assert_instr(vcvtsd2si))]
24009 pub unsafe fn _mm_cvtsd_i32(a: __m128d) -> i32 {
24010 transmute(vcvtsd2si(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION))
24011 }
24012
24013 /// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst.
24014 ///
24015 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtsd_u32&expand=1799)
24016 #[inline]
24017 #[target_feature(enable = "avx512f")]
24018 #[cfg_attr(test, assert_instr(vcvtsd2usi))]
24019 pub unsafe fn _mm_cvtsd_u32(a: __m128d) -> u32 {
24020 transmute(vcvtsd2usi(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION))
24021 }
24022
24023 /// Convert the signed 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
24024 ///
24025 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
24026 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
24027 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
24028 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
24029 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
24030 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
24031 ///
24032 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundi32_ss&expand=1312)
24033 #[inline]
24034 #[target_feature(enable = "avx512f")]
24035 #[cfg_attr(test, assert_instr(vcvtsi2ss, rounding = 8))]
24036 #[rustc_args_required_const(2)]
24037 pub unsafe fn _mm_cvt_roundi32_ss(a: __m128, b: i32, rounding: i32) -> __m128 {
24038 macro_rules! call {
24039 ($imm4:expr) => {
24040 vcvtsi2ss(a.as_f32x4(), b, $imm4)
24041 };
24042 }
24043 let r = constify_imm4_round!(rounding, call);
24044 transmute(r)
24045 }
24046
24047 /// Convert the signed 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
24048 ///
24049 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
24050 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
24051 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
24052 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
24053 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
24054 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
24055 ///
24056 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundsi32_ss&expand=1366)
24057 #[inline]
24058 #[target_feature(enable = "avx512f")]
24059 #[cfg_attr(test, assert_instr(vcvtsi2ss, rounding = 8))]
24060 #[rustc_args_required_const(2)]
24061 pub unsafe fn _mm_cvt_roundsi32_ss(a: __m128, b: i32, rounding: i32) -> __m128 {
24062 macro_rules! call {
24063 ($imm4:expr) => {
24064 vcvtsi2ss(a.as_f32x4(), b, $imm4)
24065 };
24066 }
24067 let r = constify_imm4_round!(rounding, call);
24068 transmute(r)
24069 }
24070
24071 /// Convert the unsigned 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\
24072 ///
24073 /// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
24074 /// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
24075 /// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions\
24076 /// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions\
24077 /// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions\
24078 /// _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
24079 ///
24080 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundu32_ss&expand=1378)
24081 #[inline]
24082 #[target_feature(enable = "avx512f")]
24083 #[cfg_attr(test, assert_instr(vcvtusi2ss, rounding = 8))]
24084 #[rustc_args_required_const(2)]
24085 pub unsafe fn _mm_cvt_roundu32_ss(a: __m128, b: u32, rounding: i32) -> __m128 {
24086 macro_rules! call {
24087 ($imm4:expr) => {
24088 vcvtusi2ss(a.as_f32x4(), b, $imm4)
24089 };
24090 }
24091 let r = constify_imm4_round!(rounding, call);
24092 transmute(r)
24093 }
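
// Editorial sketch of why the rounding mode matters for the integer-to-float
// conversions above (not part of the upstream source; the function name is
// hypothetical): 16_777_217 = 2^24 + 1 is not representable in f32, so the embedded
// rounding mode decides which neighbouring value is produced.
#[cfg(test)]
#[target_feature(enable = "avx512f")]
unsafe fn example_cvt_roundi32_ss_sketch() {
    let a = _mm_set1_ps(0.0);
    let down = _mm_cvt_roundi32_ss(a, 16_777_217, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
    let up = _mm_cvt_roundi32_ss(a, 16_777_217, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
    assert_eq!(_mm_cvtss_f32(down), 16_777_216.0);
    assert_eq!(_mm_cvtss_f32(up), 16_777_218.0);
}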
24094
24095 /// Convert the signed 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
24096 ///
24097 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvti32_ss&expand=1643)
24098 #[inline]
24099 #[target_feature(enable = "avx512f")]
24100 #[cfg_attr(test, assert_instr(vcvtsi2ss))]
24101 pub unsafe fn _mm_cvti32_ss(a: __m128, b: i32) -> __m128 {
24102 let b = b as f32;
24103 let r = simd_insert(a, 0, b);
24104 transmute(r)
24105 }
24106
24107 /// Convert the signed 32-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
24108 ///
24109 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvti32_sd&expand=1642)
24110 #[inline]
24111 #[target_feature(enable = "avx512f")]
24112 #[cfg_attr(test, assert_instr(vcvtsi2sd))]
24113 pub unsafe fn _mm_cvti32_sd(a: __m128d, b: i32) -> __m128d {
24114 let b = b as f64;
24115 let r = simd_insert(a, 0, b);
24116 transmute(r)
24117 }
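
// Editorial sketch for the signed-integer insertions above (not part of the
// upstream source; the function name is hypothetical): the integer is converted and
// placed in the low lane, the remaining lanes are copied from `a`.
#[cfg(test)]
#[target_feature(enable = "avx512f")]
unsafe fn example_cvti32_sketch() {
    let r = _mm_cvti32_ss(_mm_set_ps(4.0, 3.0, 2.0, 1.0), -3);
    assert_eq!(_mm_cvtss_f32(r), -3.0);
    let d = _mm_cvti32_sd(_mm_set_pd(9.0, 1.0), 7);
    assert_eq!(_mm_cvtsd_f64(d), 7.0);
}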
24118
24119 /// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\
24120 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
24121 ///
24122 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtt_roundss_Si32&expand=1936)
24123 #[inline]
24124 #[target_feature(enable = "avx512f")]
24125 #[cfg_attr(test, assert_instr(vcvtss2si, sae = 8))]
24126 #[rustc_args_required_const(1)]
24127 pub unsafe fn _mm_cvtt_roundss_si32(a: __m128, sae: i32) -> i32 {
24128 macro_rules! call {
24129 ($imm4:expr) => {
24130 vcvtss2si(a.as_f32x4(), $imm4)
24131 };
24132 }
24133 let r = constify_imm4_sae!(sae, call);
24134 transmute(r)
24135 }
24136
24137 /// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\
24138 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
24139 ///
24140 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtt_roundss_i32&expand=1934)
24141 #[inline]
24142 #[target_feature(enable = "avx512f")]
24143 #[cfg_attr(test, assert_instr(vcvtss2si, sae = 8))]
24144 #[rustc_args_required_const(1)]
24145 pub unsafe fn _mm_cvtt_roundss_i32(a: __m128, sae: i32) -> i32 {
24146 macro_rules! call {
24147 ($imm4:expr) => {
24148 vcvtss2si(a.as_f32x4(), $imm4)
24149 };
24150 }
24151 let r = constify_imm4_sae!(sae, call);
24152 transmute(r)
24153 }
24154
24155 /// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst.\
24156 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
24157 ///
24158 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtt_roundss_u32&expand=1938)
24159 #[inline]
24160 #[target_feature(enable = "avx512f")]
24161 #[cfg_attr(test, assert_instr(vcvtss2usi, sae = 8))]
24162 #[rustc_args_required_const(1)]
24163 pub unsafe fn _mm_cvtt_roundss_u32(a: __m128, sae: i32) -> u32 {
24164 macro_rules! call {
24165 ($imm4:expr) => {
24166 vcvtss2usi(a.as_f32x4(), $imm4)
24167 };
24168 }
24169 let r = constify_imm4_sae!(sae, call);
24170 transmute(r)
24171 }
24172
24173 /// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.
24174 ///
24175 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvttss_i32&expand=2022)
24176 #[inline]
24177 #[target_feature(enable = "avx512f")]
24178 #[cfg_attr(test, assert_instr(vcvtss2si))]
24179 pub unsafe fn _mm_cvttss_i32(a: __m128) -> i32 {
24180 transmute(vcvtss2si(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION))
24181 }
24182
24183 /// Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst.
24184 ///
24185 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvttss_u32&expand=2026)
24186 #[inline]
24187 #[target_feature(enable = "avx512f")]
24188 #[cfg_attr(test, assert_instr(vcvtss2usi))]
24189 pub unsafe fn _mm_cvttss_u32(a: __m128) -> u32 {
24190 transmute(vcvtss2usi(a.as_f32x4(), _MM_FROUND_CUR_DIRECTION))
24191 }
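
// Editorial sketch for the truncating scalar conversions above (not part of the
// upstream source; the function name is hypothetical): the fractional part is
// discarded, so 3.25 becomes 3 in both the signed and unsigned forms.
#[cfg(test)]
#[target_feature(enable = "avx512f")]
unsafe fn example_cvttss_sketch() {
    let a = _mm_set_ps(0.0, 0.0, 0.0, 3.25);
    assert_eq!(_mm_cvttss_i32(a), 3);
    assert_eq!(_mm_cvttss_u32(a), 3);
}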
24192
24193 /// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\
24194 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
24195 ///
24196 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtt_roundsd_si32&expand=1930)
24197 #[inline]
24198 #[target_feature(enable = "avx512f")]
24199 #[cfg_attr(test, assert_instr(vcvtsd2si, sae = 8))]
24200 #[rustc_args_required_const(1)]
24201 pub unsafe fn _mm_cvtt_roundsd_si32(a: __m128d, sae: i32) -> i32 {
24202 macro_rules! call {
24203 ($imm4:expr) => {
24204 vcvtsd2si(a.as_f64x2(), $imm4)
24205 };
24206 }
24207 let r = constify_imm4_sae!(sae, call);
24208 transmute(r)
24209 }
24210
24211 /// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\
24212 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
24213 ///
24214 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtt_roundsd_i32&expand=1928)
24215 #[inline]
24216 #[target_feature(enable = "avx512f")]
24217 #[cfg_attr(test, assert_instr(vcvtsd2si, sae = 8))]
24218 #[rustc_args_required_const(1)]
24219 pub unsafe fn _mm_cvtt_roundsd_i32(a: __m128d, sae: i32) -> i32 {
24220 macro_rules! call {
24221 ($imm4:expr) => {
24222 vcvtsd2si(a.as_f64x2(), $imm4)
24223 };
24224 }
24225 let r = constify_imm4_sae!(sae, call);
24226 transmute(r)
24227 }
24228
24229 /// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst.\
24230 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
24231 ///
24232 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtt_roundsd_u32&expand=1932)
24233 #[inline]
24234 #[target_feature(enable = "avx512f")]
24235 #[cfg_attr(test, assert_instr(vcvtsd2usi, sae = 8))]
24236 #[rustc_args_required_const(1)]
24237 pub unsafe fn _mm_cvtt_roundsd_u32(a: __m128d, sae: i32) -> u32 {
24238 macro_rules! call {
24239 ($imm4:expr) => {
24240 vcvtsd2usi(a.as_f64x2(), $imm4)
24241 };
24242 }
24243 let r = constify_imm4_sae!(sae, call);
24244 transmute(r)
24245 }
24246
24247 /// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.
24248 ///
24249 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvttsd_i32&expand=2015)
24250 #[inline]
24251 #[target_feature(enable = "avx512f")]
24252 #[cfg_attr(test, assert_instr(vcvtsd2si))]
24253 pub unsafe fn _mm_cvttsd_i32(a: __m128d) -> i32 {
24254 transmute(vcvtsd2si(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION))
24255 }
24256
24257 /// Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst.
24258 ///
24259 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvttsd_u32&expand=2020)
24260 #[inline]
24261 #[target_feature(enable = "avx512f")]
24262 #[cfg_attr(test, assert_instr(vcvtsd2usi))]
24263 pub unsafe fn _mm_cvttsd_u32(a: __m128d) -> u32 {
24264 transmute(vcvtsd2usi(a.as_f64x2(), _MM_FROUND_CUR_DIRECTION))
24265 }
24266
24267 /// Convert the unsigned 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
24268 ///
24269 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtu32_ss&expand=2032)
24270 #[inline]
24271 #[target_feature(enable = "avx512f")]
24272 #[cfg_attr(test, assert_instr(vcvtusi2ss))]
24273 pub unsafe fn _mm_cvtu32_ss(a: __m128, b: u32) -> __m128 {
24274 let b = b as f32;
24275 let r = simd_insert(a, 0, b);
24276 transmute(r)
24277 }
24278
24279 /// Convert the unsigned 32-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
24280 ///
24281 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtu32_sd&expand=2031)
24282 #[inline]
24283 #[target_feature(enable = "avx512f")]
24284 #[cfg_attr(test, assert_instr(vcvtusi2sd))]
24285 pub unsafe fn _mm_cvtu32_sd(a: __m128d, b: u32) -> __m128d {
24286 let b = b as f64;
24287 let r = simd_insert(a, 0, b);
24288 transmute(r)
24289 }
24290
24291 /// Convert the unsigned 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
24292 ///
24293 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtu64_ss&expand=2035)
24294 #[inline]
24295 #[target_feature(enable = "avx512f")]
24296 #[cfg_attr(test, assert_instr(mov))] // should be vcvtusi2ss
24297 pub unsafe fn _mm_cvtu64_ss(a: __m128, b: u64) -> __m128 {
24298 let b = b as f32;
24299 let r = simd_insert(a, 0, b);
24300 transmute(r)
24301 }
24302
24303 /// Convert the unsigned 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
24304 ///
24305 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtu64_sd&expand=2034)
24306 #[inline]
24307 #[target_feature(enable = "avx512f")]
24308 #[cfg_attr(test, assert_instr(mov))] // should be vcvtusi2sd
24309 pub unsafe fn _mm_cvtu64_sd(a: __m128d, b: u64) -> __m128d {
24310 let b = b as f64;
24311 let r = simd_insert(a, 0, b);
24312 transmute(r)
24313 }
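
// Editorial sketch for the unsigned-integer insertions above (not part of the
// upstream source; the function name is hypothetical): exactly representable values
// convert losslessly into the low lane.
#[cfg(test)]
#[target_feature(enable = "avx512f")]
unsafe fn example_cvtu_sketch() {
    let r = _mm_cvtu32_ss(_mm_set1_ps(0.0), 9);
    assert_eq!(_mm_cvtss_f32(r), 9.0);
    let d = _mm_cvtu64_sd(_mm_set1_pd(0.0), 1 << 40);
    assert_eq!(_mm_cvtsd_f64(d), (1u64 << 40) as f64);
}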
24314
24315 /// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and return the boolean result (0 or 1).\
24316 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
24317 ///
24318 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_comi_round_ss&expand=1175)
24319 #[inline]
24320 #[target_feature(enable = "avx512f")]
24321 #[cfg_attr(test, assert_instr(vcmp, imm8 = 5, sae = 4))] // should be vcomiss
24322 #[rustc_args_required_const(2, 3)]
24323 pub unsafe fn _mm_comi_round_ss(a: __m128, b: __m128, imm8: i32, sae: i32) -> i32 {
24324 macro_rules! call {
24325 ($imm8:expr, $imm4:expr) => {
24326 vcomiss(a.as_f32x4(), b.as_f32x4(), $imm8, $imm4)
24327 };
24328 }
24329 let r = constify_imm5_sae!(imm8, sae, call);
24330 transmute(r)
24331 }
24332
24333 /// Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and return the boolean result (0 or 1).\
24334 /// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
24335 ///
24336 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_comi_round_sd&expand=1174)
24337 #[inline]
24338 #[target_feature(enable = "avx512f")]
24339 #[cfg_attr(test, assert_instr(vcmp, imm8 = 5, sae = 4))] // should be vcomisd
24340 #[rustc_args_required_const(2, 3)]
24341 pub unsafe fn _mm_comi_round_sd(a: __m128d, b: __m128d, imm8: i32, sae: i32) -> i32 {
24342 macro_rules! call {
24343 ($imm8:expr, $imm4:expr) => {
24344 vcomisd(a.as_f64x2(), b.as_f64x2(), $imm8, $imm4)
24345 };
24346 }
24347 let r = constify_imm5_sae!(imm8, sae, call);
24348 transmute(r)
24349 }
24350
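// Illustrative usage sketch: both immediates must be compile-time constants,
// so the predicate is written as a literal here (0 = equal, ordered,
// non-signaling; 1 = less-than, ordered, signaling). The helper name is
// hypothetical.
#[allow(dead_code)]
#[target_feature(enable = "avx512f")]
unsafe fn _comi_round_sketch() {
    let a = _mm_set1_ps(2.5);
    let b = _mm_set1_ps(2.5);
    // Equal lower lanes compare as equal, so the result is 1.
    assert_eq!(_mm_comi_round_ss(a, b, 0, _MM_FROUND_CUR_DIRECTION), 1);

    let a = _mm_set1_pd(1.0);
    let b = _mm_set1_pd(2.0);
    // 1.0 < 2.0, so the less-than predicate also returns 1.
    assert_eq!(_mm_comi_round_sd(a, b, 1, _MM_FROUND_CUR_DIRECTION), 1);
}
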
24351 /// Equal
24352 pub const _MM_CMPINT_EQ: _MM_CMPINT_ENUM = 0x00;
24353 /// Less-than
24354 pub const _MM_CMPINT_LT: _MM_CMPINT_ENUM = 0x01;
24355 /// Less-than-or-equal
24356 pub const _MM_CMPINT_LE: _MM_CMPINT_ENUM = 0x02;
24357 /// False
24358 pub const _MM_CMPINT_FALSE: _MM_CMPINT_ENUM = 0x03;
24359 /// Not-equal
24360 pub const _MM_CMPINT_NE: _MM_CMPINT_ENUM = 0x04;
24361 /// Not less-than
24362 pub const _MM_CMPINT_NLT: _MM_CMPINT_ENUM = 0x05;
24363 /// Not less-than-or-equal
24364 pub const _MM_CMPINT_NLE: _MM_CMPINT_ENUM = 0x06;
24365 /// True
24366 pub const _MM_CMPINT_TRUE: _MM_CMPINT_ENUM = 0x07;
24367
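// The constants above are the three-bit predicates passed as `imm8` to the
// integer mask comparisons backed by `vpcmpd`/`vpcmpud`/`vpcmpq`/`vpcmpuq`
// declared further down. A quick sanity check of the encoding (hypothetical
// helper, kept as plain integer arithmetic):
#[allow(dead_code)]
fn _mm_cmpint_encoding_sketch() {
    // NE/NLT/NLE are the negations of EQ/LT/LE, i.e. the predicate with
    // bit 2 set.
    assert_eq!(_MM_CMPINT_NE, _MM_CMPINT_EQ | 0x04);
    assert_eq!(_MM_CMPINT_NLT, _MM_CMPINT_LT | 0x04);
    assert_eq!(_MM_CMPINT_NLE, _MM_CMPINT_LE | 0x04);
}
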
24368 /// interval [1, 2)
24369 pub const _MM_MANT_NORM_1_2: _MM_MANTISSA_NORM_ENUM = 0x00;
24370 /// interval [0.5, 2)
24371 pub const _MM_MANT_NORM_P5_2: _MM_MANTISSA_NORM_ENUM = 0x01;
24372 /// interval [0.5, 1)
24373 pub const _MM_MANT_NORM_P5_1: _MM_MANTISSA_NORM_ENUM = 0x02;
24374 /// interval [0.75, 1.5)
24375 pub const _MM_MANT_NORM_P75_1P5: _MM_MANTISSA_NORM_ENUM = 0x03;
24376
24377 /// sign = sign(SRC)
24378 pub const _MM_MANT_SIGN_SRC: _MM_MANTISSA_SIGN_ENUM = 0x00;
24379 /// sign = 0
24380 pub const _MM_MANT_SIGN_ZERO: _MM_MANTISSA_SIGN_ENUM = 0x01;
24381 /// DEST = NaN if sign(SRC) = 1
24382 pub const _MM_MANT_SIGN_NAN: _MM_MANTISSA_SIGN_ENUM = 0x02;
24383
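// The constants above select how the getmant family (backed by `vgetmantps`
// and `vgetmantpd` below) normalizes the mantissa and treats the sign. A
// worked example of the [1, 2) normalization, kept as plain arithmetic in a
// hypothetical helper:
#[allow(dead_code)]
fn _mm_mant_norm_sketch() {
    // 10.0 = 1.25 * 2^3, so with _MM_MANT_NORM_1_2 and _MM_MANT_SIGN_SRC a
    // getmant-style call is expected to yield 1.25 for an input of 10.0;
    // _MM_MANT_SIGN_ZERO would instead drop the sign of a negative input,
    // and _MM_MANT_SIGN_NAN would map a negative input to NaN.
    assert_eq!(10.0_f32, 1.25 * (1 << 3) as f32);
}
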
24384 pub const _MM_PERM_AAAA: _MM_PERM_ENUM = 0x00;
24385 pub const _MM_PERM_AAAB: _MM_PERM_ENUM = 0x01;
24386 pub const _MM_PERM_AAAC: _MM_PERM_ENUM = 0x02;
24387 pub const _MM_PERM_AAAD: _MM_PERM_ENUM = 0x03;
24388 pub const _MM_PERM_AABA: _MM_PERM_ENUM = 0x04;
24389 pub const _MM_PERM_AABB: _MM_PERM_ENUM = 0x05;
24390 pub const _MM_PERM_AABC: _MM_PERM_ENUM = 0x06;
24391 pub const _MM_PERM_AABD: _MM_PERM_ENUM = 0x07;
24392 pub const _MM_PERM_AACA: _MM_PERM_ENUM = 0x08;
24393 pub const _MM_PERM_AACB: _MM_PERM_ENUM = 0x09;
24394 pub const _MM_PERM_AACC: _MM_PERM_ENUM = 0x0A;
24395 pub const _MM_PERM_AACD: _MM_PERM_ENUM = 0x0B;
24396 pub const _MM_PERM_AADA: _MM_PERM_ENUM = 0x0C;
24397 pub const _MM_PERM_AADB: _MM_PERM_ENUM = 0x0D;
24398 pub const _MM_PERM_AADC: _MM_PERM_ENUM = 0x0E;
24399 pub const _MM_PERM_AADD: _MM_PERM_ENUM = 0x0F;
24400 pub const _MM_PERM_ABAA: _MM_PERM_ENUM = 0x10;
24401 pub const _MM_PERM_ABAB: _MM_PERM_ENUM = 0x11;
24402 pub const _MM_PERM_ABAC: _MM_PERM_ENUM = 0x12;
24403 pub const _MM_PERM_ABAD: _MM_PERM_ENUM = 0x13;
24404 pub const _MM_PERM_ABBA: _MM_PERM_ENUM = 0x14;
24405 pub const _MM_PERM_ABBB: _MM_PERM_ENUM = 0x15;
24406 pub const _MM_PERM_ABBC: _MM_PERM_ENUM = 0x16;
24407 pub const _MM_PERM_ABBD: _MM_PERM_ENUM = 0x17;
24408 pub const _MM_PERM_ABCA: _MM_PERM_ENUM = 0x18;
24409 pub const _MM_PERM_ABCB: _MM_PERM_ENUM = 0x19;
24410 pub const _MM_PERM_ABCC: _MM_PERM_ENUM = 0x1A;
24411 pub const _MM_PERM_ABCD: _MM_PERM_ENUM = 0x1B;
24412 pub const _MM_PERM_ABDA: _MM_PERM_ENUM = 0x1C;
24413 pub const _MM_PERM_ABDB: _MM_PERM_ENUM = 0x1D;
24414 pub const _MM_PERM_ABDC: _MM_PERM_ENUM = 0x1E;
24415 pub const _MM_PERM_ABDD: _MM_PERM_ENUM = 0x1F;
24416 pub const _MM_PERM_ACAA: _MM_PERM_ENUM = 0x20;
24417 pub const _MM_PERM_ACAB: _MM_PERM_ENUM = 0x21;
24418 pub const _MM_PERM_ACAC: _MM_PERM_ENUM = 0x22;
24419 pub const _MM_PERM_ACAD: _MM_PERM_ENUM = 0x23;
24420 pub const _MM_PERM_ACBA: _MM_PERM_ENUM = 0x24;
24421 pub const _MM_PERM_ACBB: _MM_PERM_ENUM = 0x25;
24422 pub const _MM_PERM_ACBC: _MM_PERM_ENUM = 0x26;
24423 pub const _MM_PERM_ACBD: _MM_PERM_ENUM = 0x27;
24424 pub const _MM_PERM_ACCA: _MM_PERM_ENUM = 0x28;
24425 pub const _MM_PERM_ACCB: _MM_PERM_ENUM = 0x29;
24426 pub const _MM_PERM_ACCC: _MM_PERM_ENUM = 0x2A;
24427 pub const _MM_PERM_ACCD: _MM_PERM_ENUM = 0x2B;
24428 pub const _MM_PERM_ACDA: _MM_PERM_ENUM = 0x2C;
24429 pub const _MM_PERM_ACDB: _MM_PERM_ENUM = 0x2D;
24430 pub const _MM_PERM_ACDC: _MM_PERM_ENUM = 0x2E;
24431 pub const _MM_PERM_ACDD: _MM_PERM_ENUM = 0x2F;
24432 pub const _MM_PERM_ADAA: _MM_PERM_ENUM = 0x30;
24433 pub const _MM_PERM_ADAB: _MM_PERM_ENUM = 0x31;
24434 pub const _MM_PERM_ADAC: _MM_PERM_ENUM = 0x32;
24435 pub const _MM_PERM_ADAD: _MM_PERM_ENUM = 0x33;
24436 pub const _MM_PERM_ADBA: _MM_PERM_ENUM = 0x34;
24437 pub const _MM_PERM_ADBB: _MM_PERM_ENUM = 0x35;
24438 pub const _MM_PERM_ADBC: _MM_PERM_ENUM = 0x36;
24439 pub const _MM_PERM_ADBD: _MM_PERM_ENUM = 0x37;
24440 pub const _MM_PERM_ADCA: _MM_PERM_ENUM = 0x38;
24441 pub const _MM_PERM_ADCB: _MM_PERM_ENUM = 0x39;
24442 pub const _MM_PERM_ADCC: _MM_PERM_ENUM = 0x3A;
24443 pub const _MM_PERM_ADCD: _MM_PERM_ENUM = 0x3B;
24444 pub const _MM_PERM_ADDA: _MM_PERM_ENUM = 0x3C;
24445 pub const _MM_PERM_ADDB: _MM_PERM_ENUM = 0x3D;
24446 pub const _MM_PERM_ADDC: _MM_PERM_ENUM = 0x3E;
24447 pub const _MM_PERM_ADDD: _MM_PERM_ENUM = 0x3F;
24448 pub const _MM_PERM_BAAA: _MM_PERM_ENUM = 0x40;
24449 pub const _MM_PERM_BAAB: _MM_PERM_ENUM = 0x41;
24450 pub const _MM_PERM_BAAC: _MM_PERM_ENUM = 0x42;
24451 pub const _MM_PERM_BAAD: _MM_PERM_ENUM = 0x43;
24452 pub const _MM_PERM_BABA: _MM_PERM_ENUM = 0x44;
24453 pub const _MM_PERM_BABB: _MM_PERM_ENUM = 0x45;
24454 pub const _MM_PERM_BABC: _MM_PERM_ENUM = 0x46;
24455 pub const _MM_PERM_BABD: _MM_PERM_ENUM = 0x47;
24456 pub const _MM_PERM_BACA: _MM_PERM_ENUM = 0x48;
24457 pub const _MM_PERM_BACB: _MM_PERM_ENUM = 0x49;
24458 pub const _MM_PERM_BACC: _MM_PERM_ENUM = 0x4A;
24459 pub const _MM_PERM_BACD: _MM_PERM_ENUM = 0x4B;
24460 pub const _MM_PERM_BADA: _MM_PERM_ENUM = 0x4C;
24461 pub const _MM_PERM_BADB: _MM_PERM_ENUM = 0x4D;
24462 pub const _MM_PERM_BADC: _MM_PERM_ENUM = 0x4E;
24463 pub const _MM_PERM_BADD: _MM_PERM_ENUM = 0x4F;
24464 pub const _MM_PERM_BBAA: _MM_PERM_ENUM = 0x50;
24465 pub const _MM_PERM_BBAB: _MM_PERM_ENUM = 0x51;
24466 pub const _MM_PERM_BBAC: _MM_PERM_ENUM = 0x52;
24467 pub const _MM_PERM_BBAD: _MM_PERM_ENUM = 0x53;
24468 pub const _MM_PERM_BBBA: _MM_PERM_ENUM = 0x54;
24469 pub const _MM_PERM_BBBB: _MM_PERM_ENUM = 0x55;
24470 pub const _MM_PERM_BBBC: _MM_PERM_ENUM = 0x56;
24471 pub const _MM_PERM_BBBD: _MM_PERM_ENUM = 0x57;
24472 pub const _MM_PERM_BBCA: _MM_PERM_ENUM = 0x58;
24473 pub const _MM_PERM_BBCB: _MM_PERM_ENUM = 0x59;
24474 pub const _MM_PERM_BBCC: _MM_PERM_ENUM = 0x5A;
24475 pub const _MM_PERM_BBCD: _MM_PERM_ENUM = 0x5B;
24476 pub const _MM_PERM_BBDA: _MM_PERM_ENUM = 0x5C;
24477 pub const _MM_PERM_BBDB: _MM_PERM_ENUM = 0x5D;
24478 pub const _MM_PERM_BBDC: _MM_PERM_ENUM = 0x5E;
24479 pub const _MM_PERM_BBDD: _MM_PERM_ENUM = 0x5F;
24480 pub const _MM_PERM_BCAA: _MM_PERM_ENUM = 0x60;
24481 pub const _MM_PERM_BCAB: _MM_PERM_ENUM = 0x61;
24482 pub const _MM_PERM_BCAC: _MM_PERM_ENUM = 0x62;
24483 pub const _MM_PERM_BCAD: _MM_PERM_ENUM = 0x63;
24484 pub const _MM_PERM_BCBA: _MM_PERM_ENUM = 0x64;
24485 pub const _MM_PERM_BCBB: _MM_PERM_ENUM = 0x65;
24486 pub const _MM_PERM_BCBC: _MM_PERM_ENUM = 0x66;
24487 pub const _MM_PERM_BCBD: _MM_PERM_ENUM = 0x67;
24488 pub const _MM_PERM_BCCA: _MM_PERM_ENUM = 0x68;
24489 pub const _MM_PERM_BCCB: _MM_PERM_ENUM = 0x69;
24490 pub const _MM_PERM_BCCC: _MM_PERM_ENUM = 0x6A;
24491 pub const _MM_PERM_BCCD: _MM_PERM_ENUM = 0x6B;
24492 pub const _MM_PERM_BCDA: _MM_PERM_ENUM = 0x6C;
24493 pub const _MM_PERM_BCDB: _MM_PERM_ENUM = 0x6D;
24494 pub const _MM_PERM_BCDC: _MM_PERM_ENUM = 0x6E;
24495 pub const _MM_PERM_BCDD: _MM_PERM_ENUM = 0x6F;
24496 pub const _MM_PERM_BDAA: _MM_PERM_ENUM = 0x70;
24497 pub const _MM_PERM_BDAB: _MM_PERM_ENUM = 0x71;
24498 pub const _MM_PERM_BDAC: _MM_PERM_ENUM = 0x72;
24499 pub const _MM_PERM_BDAD: _MM_PERM_ENUM = 0x73;
24500 pub const _MM_PERM_BDBA: _MM_PERM_ENUM = 0x74;
24501 pub const _MM_PERM_BDBB: _MM_PERM_ENUM = 0x75;
24502 pub const _MM_PERM_BDBC: _MM_PERM_ENUM = 0x76;
24503 pub const _MM_PERM_BDBD: _MM_PERM_ENUM = 0x77;
24504 pub const _MM_PERM_BDCA: _MM_PERM_ENUM = 0x78;
24505 pub const _MM_PERM_BDCB: _MM_PERM_ENUM = 0x79;
24506 pub const _MM_PERM_BDCC: _MM_PERM_ENUM = 0x7A;
24507 pub const _MM_PERM_BDCD: _MM_PERM_ENUM = 0x7B;
24508 pub const _MM_PERM_BDDA: _MM_PERM_ENUM = 0x7C;
24509 pub const _MM_PERM_BDDB: _MM_PERM_ENUM = 0x7D;
24510 pub const _MM_PERM_BDDC: _MM_PERM_ENUM = 0x7E;
24511 pub const _MM_PERM_BDDD: _MM_PERM_ENUM = 0x7F;
24512 pub const _MM_PERM_CAAA: _MM_PERM_ENUM = 0x80;
24513 pub const _MM_PERM_CAAB: _MM_PERM_ENUM = 0x81;
24514 pub const _MM_PERM_CAAC: _MM_PERM_ENUM = 0x82;
24515 pub const _MM_PERM_CAAD: _MM_PERM_ENUM = 0x83;
24516 pub const _MM_PERM_CABA: _MM_PERM_ENUM = 0x84;
24517 pub const _MM_PERM_CABB: _MM_PERM_ENUM = 0x85;
24518 pub const _MM_PERM_CABC: _MM_PERM_ENUM = 0x86;
24519 pub const _MM_PERM_CABD: _MM_PERM_ENUM = 0x87;
24520 pub const _MM_PERM_CACA: _MM_PERM_ENUM = 0x88;
24521 pub const _MM_PERM_CACB: _MM_PERM_ENUM = 0x89;
24522 pub const _MM_PERM_CACC: _MM_PERM_ENUM = 0x8A;
24523 pub const _MM_PERM_CACD: _MM_PERM_ENUM = 0x8B;
24524 pub const _MM_PERM_CADA: _MM_PERM_ENUM = 0x8C;
24525 pub const _MM_PERM_CADB: _MM_PERM_ENUM = 0x8D;
24526 pub const _MM_PERM_CADC: _MM_PERM_ENUM = 0x8E;
24527 pub const _MM_PERM_CADD: _MM_PERM_ENUM = 0x8F;
24528 pub const _MM_PERM_CBAA: _MM_PERM_ENUM = 0x90;
24529 pub const _MM_PERM_CBAB: _MM_PERM_ENUM = 0x91;
24530 pub const _MM_PERM_CBAC: _MM_PERM_ENUM = 0x92;
24531 pub const _MM_PERM_CBAD: _MM_PERM_ENUM = 0x93;
24532 pub const _MM_PERM_CBBA: _MM_PERM_ENUM = 0x94;
24533 pub const _MM_PERM_CBBB: _MM_PERM_ENUM = 0x95;
24534 pub const _MM_PERM_CBBC: _MM_PERM_ENUM = 0x96;
24535 pub const _MM_PERM_CBBD: _MM_PERM_ENUM = 0x97;
24536 pub const _MM_PERM_CBCA: _MM_PERM_ENUM = 0x98;
24537 pub const _MM_PERM_CBCB: _MM_PERM_ENUM = 0x99;
24538 pub const _MM_PERM_CBCC: _MM_PERM_ENUM = 0x9A;
24539 pub const _MM_PERM_CBCD: _MM_PERM_ENUM = 0x9B;
24540 pub const _MM_PERM_CBDA: _MM_PERM_ENUM = 0x9C;
24541 pub const _MM_PERM_CBDB: _MM_PERM_ENUM = 0x9D;
24542 pub const _MM_PERM_CBDC: _MM_PERM_ENUM = 0x9E;
24543 pub const _MM_PERM_CBDD: _MM_PERM_ENUM = 0x9F;
24544 pub const _MM_PERM_CCAA: _MM_PERM_ENUM = 0xA0;
24545 pub const _MM_PERM_CCAB: _MM_PERM_ENUM = 0xA1;
24546 pub const _MM_PERM_CCAC: _MM_PERM_ENUM = 0xA2;
24547 pub const _MM_PERM_CCAD: _MM_PERM_ENUM = 0xA3;
24548 pub const _MM_PERM_CCBA: _MM_PERM_ENUM = 0xA4;
24549 pub const _MM_PERM_CCBB: _MM_PERM_ENUM = 0xA5;
24550 pub const _MM_PERM_CCBC: _MM_PERM_ENUM = 0xA6;
24551 pub const _MM_PERM_CCBD: _MM_PERM_ENUM = 0xA7;
24552 pub const _MM_PERM_CCCA: _MM_PERM_ENUM = 0xA8;
24553 pub const _MM_PERM_CCCB: _MM_PERM_ENUM = 0xA9;
24554 pub const _MM_PERM_CCCC: _MM_PERM_ENUM = 0xAA;
24555 pub const _MM_PERM_CCCD: _MM_PERM_ENUM = 0xAB;
24556 pub const _MM_PERM_CCDA: _MM_PERM_ENUM = 0xAC;
24557 pub const _MM_PERM_CCDB: _MM_PERM_ENUM = 0xAD;
24558 pub const _MM_PERM_CCDC: _MM_PERM_ENUM = 0xAE;
24559 pub const _MM_PERM_CCDD: _MM_PERM_ENUM = 0xAF;
24560 pub const _MM_PERM_CDAA: _MM_PERM_ENUM = 0xB0;
24561 pub const _MM_PERM_CDAB: _MM_PERM_ENUM = 0xB1;
24562 pub const _MM_PERM_CDAC: _MM_PERM_ENUM = 0xB2;
24563 pub const _MM_PERM_CDAD: _MM_PERM_ENUM = 0xB3;
24564 pub const _MM_PERM_CDBA: _MM_PERM_ENUM = 0xB4;
24565 pub const _MM_PERM_CDBB: _MM_PERM_ENUM = 0xB5;
24566 pub const _MM_PERM_CDBC: _MM_PERM_ENUM = 0xB6;
24567 pub const _MM_PERM_CDBD: _MM_PERM_ENUM = 0xB7;
24568 pub const _MM_PERM_CDCA: _MM_PERM_ENUM = 0xB8;
24569 pub const _MM_PERM_CDCB: _MM_PERM_ENUM = 0xB9;
24570 pub const _MM_PERM_CDCC: _MM_PERM_ENUM = 0xBA;
24571 pub const _MM_PERM_CDCD: _MM_PERM_ENUM = 0xBB;
24572 pub const _MM_PERM_CDDA: _MM_PERM_ENUM = 0xBC;
24573 pub const _MM_PERM_CDDB: _MM_PERM_ENUM = 0xBD;
24574 pub const _MM_PERM_CDDC: _MM_PERM_ENUM = 0xBE;
24575 pub const _MM_PERM_CDDD: _MM_PERM_ENUM = 0xBF;
24576 pub const _MM_PERM_DAAA: _MM_PERM_ENUM = 0xC0;
24577 pub const _MM_PERM_DAAB: _MM_PERM_ENUM = 0xC1;
24578 pub const _MM_PERM_DAAC: _MM_PERM_ENUM = 0xC2;
24579 pub const _MM_PERM_DAAD: _MM_PERM_ENUM = 0xC3;
24580 pub const _MM_PERM_DABA: _MM_PERM_ENUM = 0xC4;
24581 pub const _MM_PERM_DABB: _MM_PERM_ENUM = 0xC5;
24582 pub const _MM_PERM_DABC: _MM_PERM_ENUM = 0xC6;
24583 pub const _MM_PERM_DABD: _MM_PERM_ENUM = 0xC7;
24584 pub const _MM_PERM_DACA: _MM_PERM_ENUM = 0xC8;
24585 pub const _MM_PERM_DACB: _MM_PERM_ENUM = 0xC9;
24586 pub const _MM_PERM_DACC: _MM_PERM_ENUM = 0xCA;
24587 pub const _MM_PERM_DACD: _MM_PERM_ENUM = 0xCB;
24588 pub const _MM_PERM_DADA: _MM_PERM_ENUM = 0xCC;
24589 pub const _MM_PERM_DADB: _MM_PERM_ENUM = 0xCD;
24590 pub const _MM_PERM_DADC: _MM_PERM_ENUM = 0xCE;
24591 pub const _MM_PERM_DADD: _MM_PERM_ENUM = 0xCF;
24592 pub const _MM_PERM_DBAA: _MM_PERM_ENUM = 0xD0;
24593 pub const _MM_PERM_DBAB: _MM_PERM_ENUM = 0xD1;
24594 pub const _MM_PERM_DBAC: _MM_PERM_ENUM = 0xD2;
24595 pub const _MM_PERM_DBAD: _MM_PERM_ENUM = 0xD3;
24596 pub const _MM_PERM_DBBA: _MM_PERM_ENUM = 0xD4;
24597 pub const _MM_PERM_DBBB: _MM_PERM_ENUM = 0xD5;
24598 pub const _MM_PERM_DBBC: _MM_PERM_ENUM = 0xD6;
24599 pub const _MM_PERM_DBBD: _MM_PERM_ENUM = 0xD7;
24600 pub const _MM_PERM_DBCA: _MM_PERM_ENUM = 0xD8;
24601 pub const _MM_PERM_DBCB: _MM_PERM_ENUM = 0xD9;
24602 pub const _MM_PERM_DBCC: _MM_PERM_ENUM = 0xDA;
24603 pub const _MM_PERM_DBCD: _MM_PERM_ENUM = 0xDB;
24604 pub const _MM_PERM_DBDA: _MM_PERM_ENUM = 0xDC;
24605 pub const _MM_PERM_DBDB: _MM_PERM_ENUM = 0xDD;
24606 pub const _MM_PERM_DBDC: _MM_PERM_ENUM = 0xDE;
24607 pub const _MM_PERM_DBDD: _MM_PERM_ENUM = 0xDF;
24608 pub const _MM_PERM_DCAA: _MM_PERM_ENUM = 0xE0;
24609 pub const _MM_PERM_DCAB: _MM_PERM_ENUM = 0xE1;
24610 pub const _MM_PERM_DCAC: _MM_PERM_ENUM = 0xE2;
24611 pub const _MM_PERM_DCAD: _MM_PERM_ENUM = 0xE3;
24612 pub const _MM_PERM_DCBA: _MM_PERM_ENUM = 0xE4;
24613 pub const _MM_PERM_DCBB: _MM_PERM_ENUM = 0xE5;
24614 pub const _MM_PERM_DCBC: _MM_PERM_ENUM = 0xE6;
24615 pub const _MM_PERM_DCBD: _MM_PERM_ENUM = 0xE7;
24616 pub const _MM_PERM_DCCA: _MM_PERM_ENUM = 0xE8;
24617 pub const _MM_PERM_DCCB: _MM_PERM_ENUM = 0xE9;
24618 pub const _MM_PERM_DCCC: _MM_PERM_ENUM = 0xEA;
24619 pub const _MM_PERM_DCCD: _MM_PERM_ENUM = 0xEB;
24620 pub const _MM_PERM_DCDA: _MM_PERM_ENUM = 0xEC;
24621 pub const _MM_PERM_DCDB: _MM_PERM_ENUM = 0xED;
24622 pub const _MM_PERM_DCDC: _MM_PERM_ENUM = 0xEE;
24623 pub const _MM_PERM_DCDD: _MM_PERM_ENUM = 0xEF;
24624 pub const _MM_PERM_DDAA: _MM_PERM_ENUM = 0xF0;
24625 pub const _MM_PERM_DDAB: _MM_PERM_ENUM = 0xF1;
24626 pub const _MM_PERM_DDAC: _MM_PERM_ENUM = 0xF2;
24627 pub const _MM_PERM_DDAD: _MM_PERM_ENUM = 0xF3;
24628 pub const _MM_PERM_DDBA: _MM_PERM_ENUM = 0xF4;
24629 pub const _MM_PERM_DDBB: _MM_PERM_ENUM = 0xF5;
24630 pub const _MM_PERM_DDBC: _MM_PERM_ENUM = 0xF6;
24631 pub const _MM_PERM_DDBD: _MM_PERM_ENUM = 0xF7;
24632 pub const _MM_PERM_DDCA: _MM_PERM_ENUM = 0xF8;
24633 pub const _MM_PERM_DDCB: _MM_PERM_ENUM = 0xF9;
24634 pub const _MM_PERM_DDCC: _MM_PERM_ENUM = 0xFA;
24635 pub const _MM_PERM_DDCD: _MM_PERM_ENUM = 0xFB;
24636 pub const _MM_PERM_DDDA: _MM_PERM_ENUM = 0xFC;
24637 pub const _MM_PERM_DDDB: _MM_PERM_ENUM = 0xFD;
24638 pub const _MM_PERM_DDDC: _MM_PERM_ENUM = 0xFE;
24639 pub const _MM_PERM_DDDD: _MM_PERM_ENUM = 0xFF;
24640
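// Each letter in the constants above is a two-bit element selector (A = 0b00,
// B = 0b01, C = 0b10, D = 0b11) written from the most significant field down,
// so _MM_PERM_DCBA (0xE4) is the identity control for per-lane shuffles such
// as `_mm512_shuffle_epi32`. A quick check of the encoding in a hypothetical
// helper:
#[allow(dead_code)]
fn _mm_perm_encoding_sketch() {
    assert_eq!(_MM_PERM_ABCD, 0b00_01_10_11);
    assert_eq!(_MM_PERM_DCBA, 0b11_10_01_00);
    assert_eq!(_MM_PERM_DDDD, 0xFF);
}
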
24641 #[allow(improper_ctypes)]
24642 extern "C" {
24643 #[link_name = "llvm.x86.avx512.pmul.dq.512"]
24644 fn vpmuldq(a: i32x16, b: i32x16) -> i64x8;
24645 #[link_name = "llvm.x86.avx512.pmulu.dq.512"]
24646 fn vpmuludq(a: u32x16, b: u32x16) -> u64x8;
24647
24648 #[link_name = "llvm.x86.avx512.mask.pmaxs.d.512"]
24649 fn vpmaxsd(a: i32x16, b: i32x16) -> i32x16;
24650 #[link_name = "llvm.x86.avx512.mask.pmaxs.q.512"]
24651 fn vpmaxsq(a: i64x8, b: i64x8) -> i64x8;
24652 #[link_name = "llvm.x86.avx512.mask.pmins.d.512"]
24653 fn vpminsd(a: i32x16, b: i32x16) -> i32x16;
24654 #[link_name = "llvm.x86.avx512.mask.pmins.q.512"]
24655 fn vpminsq(a: i64x8, b: i64x8) -> i64x8;
24656
24657 #[link_name = "llvm.x86.avx512.mask.pmaxu.d.512"]
24658 fn vpmaxud(a: u32x16, b: u32x16) -> u32x16;
24659 #[link_name = "llvm.x86.avx512.mask.pmaxu.q.512"]
24660 fn vpmaxuq(a: u64x8, b: u64x8) -> i64x8;
24661 #[link_name = "llvm.x86.avx512.mask.pminu.d.512"]
24662 fn vpminud(a: u32x16, b: u32x16) -> u32x16;
24663 #[link_name = "llvm.x86.avx512.mask.pminu.q.512"]
24664 fn vpminuq(a: u64x8, b: u64x8) -> i64x8;
24665
24666 #[link_name = "llvm.x86.avx512.sqrt.ps.512"]
24667 fn vsqrtps(a: f32x16, rounding: i32) -> f32x16;
24668 #[link_name = "llvm.x86.avx512.sqrt.pd.512"]
24669 fn vsqrtpd(a: f64x8, rounding: i32) -> f64x8;
24670
24671 #[link_name = "llvm.x86.avx512.vfmadd.ps.512"]
24672 fn vfmadd132ps(a: f32x16, b: f32x16, c: f32x16, rounding: i32) -> f32x16;
24673 #[link_name = "llvm.x86.avx512.vfmadd.pd.512"]
24674 fn vfmadd132pd(a: f64x8, b: f64x8, c: f64x8, rounding: i32) -> f64x8;
24675
24676 #[link_name = "llvm.x86.avx512.vfmaddsub.ps.512"]
24677 fn vfmaddsub213ps(a: f32x16, b: f32x16, c: f32x16, d: i32) -> f32x16; // from clang
24678 #[link_name = "llvm.x86.avx512.vfmaddsub.pd.512"]
24679 fn vfmaddsub213pd(a: f64x8, b: f64x8, c: f64x8, d: i32) -> f64x8; // from clang
24680
24681 #[link_name = "llvm.x86.avx512.add.ps.512"]
24682 fn vaddps(a: f32x16, b: f32x16, rounding: i32) -> f32x16;
24683 #[link_name = "llvm.x86.avx512.add.pd.512"]
24684 fn vaddpd(a: f64x8, b: f64x8, rounding: i32) -> f64x8;
24685 #[link_name = "llvm.x86.avx512.sub.ps.512"]
24686 fn vsubps(a: f32x16, b: f32x16, rounding: i32) -> f32x16;
24687 #[link_name = "llvm.x86.avx512.sub.pd.512"]
24688 fn vsubpd(a: f64x8, b: f64x8, rounding: i32) -> f64x8;
24689 #[link_name = "llvm.x86.avx512.mul.ps.512"]
24690 fn vmulps(a: f32x16, b: f32x16, rounding: i32) -> f32x16;
24691 #[link_name = "llvm.x86.avx512.mul.pd.512"]
24692 fn vmulpd(a: f64x8, b: f64x8, rounding: i32) -> f64x8;
24693 #[link_name = "llvm.x86.avx512.div.ps.512"]
24694 fn vdivps(a: f32x16, b: f32x16, rounding: i32) -> f32x16;
24695 #[link_name = "llvm.x86.avx512.div.pd.512"]
24696 fn vdivpd(a: f64x8, b: f64x8, rounding: i32) -> f64x8;
24697
24698 #[link_name = "llvm.x86.avx512.max.ps.512"]
24699 fn vmaxps(a: f32x16, b: f32x16, sae: i32) -> f32x16;
24700 #[link_name = "llvm.x86.avx512.max.pd.512"]
24701 fn vmaxpd(a: f64x8, b: f64x8, sae: i32) -> f64x8;
24702 #[link_name = "llvm.x86.avx512.min.ps.512"]
24703 fn vminps(a: f32x16, b: f32x16, sae: i32) -> f32x16;
24704 #[link_name = "llvm.x86.avx512.min.pd.512"]
24705 fn vminpd(a: f64x8, b: f64x8, sae: i32) -> f64x8;
24706
24707 #[link_name = "llvm.x86.avx512.mask.getexp.ps.512"]
24708 fn vgetexpps(a: f32x16, src: f32x16, m: u16, sae: i32) -> f32x16;
24709 #[link_name = "llvm.x86.avx512.mask.getexp.pd.512"]
24710 fn vgetexppd(a: f64x8, src: f64x8, m: u8, sae: i32) -> f64x8;
24711
24712 #[link_name = "llvm.x86.avx512.mask.rndscale.ps.512"]
24713 fn vrndscaleps(a: f32x16, imm8: i32, src: f32x16, mask: u16, sae: i32) -> f32x16;
24714 #[link_name = "llvm.x86.avx512.mask.rndscale.pd.512"]
24715 fn vrndscalepd(a: f64x8, imm8: i32, src: f64x8, mask: u8, sae: i32) -> f64x8;
24716 #[link_name = "llvm.x86.avx512.mask.scalef.ps.512"]
24717 fn vscalefps(a: f32x16, b: f32x16, src: f32x16, mask: u16, rounding: i32) -> f32x16;
24718 #[link_name = "llvm.x86.avx512.mask.scalef.pd.512"]
24719 fn vscalefpd(a: f64x8, b: f64x8, src: f64x8, mask: u8, rounding: i32) -> f64x8;
24720
24721 #[link_name = "llvm.x86.avx512.mask.fixupimm.ps.512"]
24722 fn vfixupimmps(a: f32x16, b: f32x16, c: i32x16, imm8: i32, mask: u16, sae: i32) -> f32x16;
24723 #[link_name = "llvm.x86.avx512.mask.fixupimm.pd.512"]
24724 fn vfixupimmpd(a: f64x8, b: f64x8, c: i64x8, imm8: i32, mask: u8, sae: i32) -> f64x8;
24725 #[link_name = "llvm.x86.avx512.maskz.fixupimm.ps.512"]
24726 fn vfixupimmpsz(a: f32x16, b: f32x16, c: i32x16, imm8: i32, mask: u16, sae: i32) -> f32x16;
24727 #[link_name = "llvm.x86.avx512.maskz.fixupimm.pd.512"]
24728 fn vfixupimmpdz(a: f64x8, b: f64x8, c: i64x8, imm8: i32, mask: u8, sae: i32) -> f64x8;
24729
24730 #[link_name = "llvm.x86.avx512.pternlog.d.512"]
24731 fn vpternlogd(a: i32x16, b: i32x16, c: i32x16, sae: i32) -> i32x16;
24732 #[link_name = "llvm.x86.avx512.pternlog.q.512"]
24733 fn vpternlogq(a: i64x8, b: i64x8, c: i64x8, sae: i32) -> i64x8;
24734
24735 #[link_name = "llvm.x86.avx512.mask.getmant.ps.512"]
24736 fn vgetmantps(a: f32x16, mantissas: i32, src: f32x16, m: u16, sae: i32) -> f32x16;
24737 #[link_name = "llvm.x86.avx512.mask.getmant.pd.512"]
24738 fn vgetmantpd(a: f64x8, mantissas: i32, src: f64x8, m: u8, sae: i32) -> f64x8;
24739
24740 #[link_name = "llvm.x86.avx512.rcp14.ps.512"]
24741 fn vrcp14ps(a: f32x16, src: f32x16, m: u16) -> f32x16;
24742 #[link_name = "llvm.x86.avx512.rcp14.pd.512"]
24743 fn vrcp14pd(a: f64x8, src: f64x8, m: u8) -> f64x8;
24744 #[link_name = "llvm.x86.avx512.rsqrt14.ps.512"]
24745 fn vrsqrt14ps(a: f32x16, src: f32x16, m: u16) -> f32x16;
24746 #[link_name = "llvm.x86.avx512.rsqrt14.pd.512"]
24747 fn vrsqrt14pd(a: f64x8, src: f64x8, m: u8) -> f64x8;
24748
24749 #[link_name = "llvm.x86.avx512.mask.cvtps2dq.512"]
24750 fn vcvtps2dq(a: f32x16, src: i32x16, mask: u16, rounding: i32) -> i32x16;
24751 #[link_name = "llvm.x86.avx512.mask.cvtps2udq.512"]
24752 fn vcvtps2udq(a: f32x16, src: u32x16, mask: u16, rounding: i32) -> u32x16;
24753 #[link_name = "llvm.x86.avx512.mask.cvtps2pd.512"]
24754 fn vcvtps2pd(a: f32x8, src: f64x8, mask: u8, sae: i32) -> f64x8;
24755 #[link_name = "llvm.x86.avx512.mask.cvtpd2ps.512"]
24756 fn vcvtpd2ps(a: f64x8, src: f32x8, mask: u8, rounding: i32) -> f32x8;
24757 #[link_name = "llvm.x86.avx512.mask.cvtpd2dq.512"]
24758 fn vcvtpd2dq(a: f64x8, src: i32x8, mask: u8, rounding: i32) -> i32x8;
24759 #[link_name = "llvm.x86.avx512.mask.cvtpd2udq.512"]
24760 fn vcvtpd2udq(a: f64x8, src: u32x8, mask: u8, rounding: i32) -> u32x8;
24761 #[link_name = "llvm.x86.avx512.sitofp.round.v16f32.v16i32"]
24762 fn vcvtdq2ps(a: i32x16, rounding: i32) -> f32x16;
24763 #[link_name = "llvm.x86.avx512.uitofp.round.v16f32.v16i32"]
24764 fn vcvtudq2ps(a: u32x16, rounding: i32) -> f32x16;
24765
24766 #[link_name = "llvm.x86.avx512.mask.vcvtps2ph.512"]
24767 fn vcvtps2ph(a: f32x16, sae: i32, src: i16x16, mask: u16) -> i16x16;
24768 #[link_name = "llvm.x86.avx512.mask.vcvtph2ps.512"]
24769 fn vcvtph2ps(a: i16x16, src: f32x16, mask: u16, sae: i32) -> f32x16;
24770
24771 #[link_name = "llvm.x86.avx512.mask.cvttps2dq.512"]
24772 fn vcvttps2dq(a: f32x16, src: i32x16, mask: u16, rounding: i32) -> i32x16;
24773 #[link_name = "llvm.x86.avx512.mask.cvttps2udq.512"]
24774 fn vcvttps2udq(a: f32x16, src: i32x16, mask: u16, rounding: i32) -> u32x16;
24775 #[link_name = "llvm.x86.avx512.mask.cvttpd2dq.512"]
24776 fn vcvttpd2dq(a: f64x8, src: i32x8, mask: u8, rounding: i32) -> i32x8;
24777 #[link_name = "llvm.x86.avx512.mask.cvttpd2udq.512"]
24778 fn vcvttpd2udq(a: f64x8, src: i32x8, mask: u8, rounding: i32) -> u32x8;
24779
24780 #[link_name = "llvm.x86.avx512.mask.pmov.qb.512"]
24781 fn vpmovqb(a: i64x8, src: i8x16, mask: u8) -> i8x16;
24782 #[link_name = "llvm.x86.avx512.mask.pmovs.dw.512"]
24783 fn vpmovsdw(a: i32x16, src: i16x16, mask: u16) -> i16x16;
24784 #[link_name = "llvm.x86.avx512.mask.pmovs.db.512"]
24785 fn vpmovsdb(a: i32x16, src: i8x16, mask: u16) -> i8x16;
24786 #[link_name = "llvm.x86.avx512.mask.pmovs.qd.512"]
24787 fn vpmovsqd(a: i64x8, src: i32x8, mask: u8) -> i32x8;
24788 #[link_name = "llvm.x86.avx512.mask.pmovs.qw.512"]
24789 fn vpmovsqw(a: i64x8, src: i16x8, mask: u8) -> i16x8;
24790 #[link_name = "llvm.x86.avx512.mask.pmovs.qb.512"]
24791 fn vpmovsqb(a: i64x8, src: i8x16, mask: u8) -> i8x16;
24792 #[link_name = "llvm.x86.avx512.mask.pmovus.dw.512"]
24793 fn vpmovusdw(a: u32x16, src: u16x16, mask: u16) -> u16x16;
24794 #[link_name = "llvm.x86.avx512.mask.pmovus.db.512"]
24795 fn vpmovusdb(a: u32x16, src: u8x16, mask: u16) -> u8x16;
24796 #[link_name = "llvm.x86.avx512.mask.pmovus.qd.512"]
24797 fn vpmovusqd(a: u64x8, src: u32x8, mask: u8) -> u32x8;
24798 #[link_name = "llvm.x86.avx512.mask.pmovus.qw.512"]
24799 fn vpmovusqw(a: u64x8, src: u16x8, mask: u8) -> u16x8;
24800 #[link_name = "llvm.x86.avx512.mask.pmovus.qb.512"]
24801 fn vpmovusqb(a: u64x8, src: u8x16, mask: u8) -> u8x16;
24802
24803 #[link_name = "llvm.x86.avx512.gather.dpd.512"]
24804 fn vgatherdpd(src: f64x8, slice: *const i8, offsets: i32x8, mask: i8, scale: i32) -> f64x8;
24805 #[link_name = "llvm.x86.avx512.gather.dps.512"]
24806 fn vgatherdps(src: f32x16, slice: *const i8, offsets: i32x16, mask: i16, scale: i32) -> f32x16;
24807 #[link_name = "llvm.x86.avx512.gather.qpd.512"]
24808 fn vgatherqpd(src: f64x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> f64x8;
24809 #[link_name = "llvm.x86.avx512.gather.qps.512"]
24810 fn vgatherqps(src: f32x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> f32x8;
24811 #[link_name = "llvm.x86.avx512.gather.dpq.512"]
24812 fn vpgatherdq(src: i64x8, slice: *const i8, offsets: i32x8, mask: i8, scale: i32) -> i64x8;
24813 #[link_name = "llvm.x86.avx512.gather.dpi.512"]
24814 fn vpgatherdd(src: i32x16, slice: *const i8, offsets: i32x16, mask: i16, scale: i32) -> i32x16;
24815 #[link_name = "llvm.x86.avx512.gather.qpq.512"]
24816 fn vpgatherqq(src: i64x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> i64x8;
24817 #[link_name = "llvm.x86.avx512.gather.qpi.512"]
24818 fn vpgatherqd(src: i32x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> i32x8;
24819
24820 #[link_name = "llvm.x86.avx512.scatter.dpd.512"]
24821 fn vscatterdpd(slice: *mut i8, mask: i8, offsets: i32x8, src: f64x8, scale: i32);
24822 #[link_name = "llvm.x86.avx512.scatter.dps.512"]
24823 fn vscatterdps(slice: *mut i8, mask: i16, offsets: i32x16, src: f32x16, scale: i32);
24824 #[link_name = "llvm.x86.avx512.scatter.qpd.512"]
24825 fn vscatterqpd(slice: *mut i8, mask: i8, offsets: i64x8, src: f64x8, scale: i32);
24826 #[link_name = "llvm.x86.avx512.scatter.qps.512"]
24827 fn vscatterqps(slice: *mut i8, mask: i8, offsets: i64x8, src: f32x8, scale: i32);
24828 #[link_name = "llvm.x86.avx512.scatter.dpq.512"]
24829 fn vpscatterdq(slice: *mut i8, mask: i8, offsets: i32x8, src: i64x8, scale: i32);
24830 #[link_name = "llvm.x86.avx512.scatter.dpi.512"]
24831 fn vpscatterdd(slice: *mut i8, mask: i16, offsets: i32x16, src: i32x16, scale: i32);
24832 #[link_name = "llvm.x86.avx512.scatter.qpq.512"]
24833 fn vpscatterqq(slice: *mut i8, mask: i8, offsets: i64x8, src: i64x8, scale: i32);
24834 #[link_name = "llvm.x86.avx512.scatter.qpi.512"]
24835 fn vpscatterqd(slice: *mut i8, mask: i8, offsets: i64x8, src: i32x8, scale: i32);
24836
24837 #[link_name = "llvm.x86.avx512.mask.cmp.ss"]
24838 fn vcmpss(a: __m128, b: __m128, op: i32, m: i8, sae: i32) -> i8;
24839 #[link_name = "llvm.x86.avx512.mask.cmp.sd"]
24840 fn vcmpsd(a: __m128d, b: __m128d, op: i32, m: i8, sae: i32) -> i8;
24841 #[link_name = "llvm.x86.avx512.mask.cmp.ps.512"]
24842 fn vcmpps(a: f32x16, b: f32x16, op: i32, m: i16, sae: i32) -> i16;
24843 #[link_name = "llvm.x86.avx512.mask.cmp.pd.512"]
24844 fn vcmppd(a: f64x8, b: f64x8, op: i32, m: i8, sae: i32) -> i8;
24845 #[link_name = "llvm.x86.avx512.mask.ucmp.q.512"]
24846 fn vpcmpuq(a: i64x8, b: i64x8, op: i32, m: i8) -> i8;
24847 #[link_name = "llvm.x86.avx512.mask.cmp.q.512"]
24848 fn vpcmpq(a: i64x8, b: i64x8, op: i32, m: i8) -> i8;
24849 #[link_name = "llvm.x86.avx512.mask.ucmp.d.512"]
24850 fn vpcmpud(a: i32x16, b: i32x16, op: i32, m: i16) -> i16;
24851 #[link_name = "llvm.x86.avx512.mask.cmp.d.512"]
24852 fn vpcmpd(a: i32x16, b: i32x16, op: i32, m: i16) -> i16;
24853
24854 #[link_name = "llvm.x86.avx512.mask.prol.d.512"]
24855 fn vprold(a: i32x16, imm8: i32) -> i32x16;
24856 #[link_name = "llvm.x86.avx512.mask.pror.d.512"]
24857 fn vprord(a: i32x16, imm8: i32) -> i32x16;
24858 #[link_name = "llvm.x86.avx512.mask.prol.q.512"]
24859 fn vprolq(a: i64x8, imm8: i32) -> i64x8;
24860 #[link_name = "llvm.x86.avx512.mask.pror.q.512"]
24861 fn vprorq(a: i64x8, imm8: i32) -> i64x8;
24862
24863 #[link_name = "llvm.x86.avx512.mask.prolv.d.512"]
24864 fn vprolvd(a: i32x16, b: i32x16) -> i32x16;
24865 #[link_name = "llvm.x86.avx512.mask.prorv.d.512"]
24866 fn vprorvd(a: i32x16, b: i32x16) -> i32x16;
24867 #[link_name = "llvm.x86.avx512.mask.prolv.q.512"]
24868 fn vprolvq(a: i64x8, b: i64x8) -> i64x8;
24869 #[link_name = "llvm.x86.avx512.mask.prorv.q.512"]
24870 fn vprorvq(a: i64x8, b: i64x8) -> i64x8;
24871
24872 #[link_name = "llvm.x86.avx512.psllv.d.512"]
24873 fn vpsllvd(a: i32x16, b: i32x16) -> i32x16;
24874 #[link_name = "llvm.x86.avx512.psrlv.d.512"]
24875 fn vpsrlvd(a: i32x16, b: i32x16) -> i32x16;
24876 #[link_name = "llvm.x86.avx512.psllv.q.512"]
24877 fn vpsllvq(a: i64x8, b: i64x8) -> i64x8;
24878 #[link_name = "llvm.x86.avx512.psrlv.q.512"]
24879 fn vpsrlvq(a: i64x8, b: i64x8) -> i64x8;
24880
24881 #[link_name = "llvm.x86.avx512.pslli.d.512"]
24882 fn vpsllid(a: i32x16, imm8: u32) -> i32x16;
24883 #[link_name = "llvm.x86.avx512.psrli.d.512"]
24884 fn vpsrlid(a: i32x16, imm8: u32) -> i32x16;
24885 #[link_name = "llvm.x86.avx512.pslli.q.512"]
24886 fn vpslliq(a: i64x8, imm8: u32) -> i64x8;
24887 #[link_name = "llvm.x86.avx512.psrli.q.512"]
24888 fn vpsrliq(a: i64x8, imm8: u32) -> i64x8;
24889
24890 #[link_name = "llvm.x86.avx512.psll.d.512"]
24891 fn vpslld(a: i32x16, count: i32x4) -> i32x16;
24892 #[link_name = "llvm.x86.avx512.psrl.d.512"]
24893 fn vpsrld(a: i32x16, count: i32x4) -> i32x16;
24894 #[link_name = "llvm.x86.avx512.psll.q.512"]
24895 fn vpsllq(a: i64x8, count: i64x2) -> i64x8;
24896 #[link_name = "llvm.x86.avx512.psrl.q.512"]
24897 fn vpsrlq(a: i64x8, count: i64x2) -> i64x8;
24898
24899 #[link_name = "llvm.x86.avx512.psra.d.512"]
24900 fn vpsrad(a: i32x16, count: i32x4) -> i32x16;
24901 #[link_name = "llvm.x86.avx512.psra.q.512"]
24902 fn vpsraq(a: i64x8, count: i64x2) -> i64x8;
24903
24904 #[link_name = "llvm.x86.avx512.psrai.d.512"]
24905 fn vpsraid(a: i32x16, imm8: u32) -> i32x16;
24906 #[link_name = "llvm.x86.avx512.psrai.q.512"]
24907 fn vpsraiq(a: i64x8, imm8: u32) -> i64x8;
24908
24909 #[link_name = "llvm.x86.avx512.psrav.d.512"]
24910 fn vpsravd(a: i32x16, count: i32x16) -> i32x16;
24911 #[link_name = "llvm.x86.avx512.psrav.q.512"]
24912 fn vpsravq(a: i64x8, count: i64x8) -> i64x8;
24913
24914 #[link_name = "llvm.x86.avx512.vpermilvar.ps.512"]
24915 fn vpermilps(a: f32x16, b: i32x16) -> f32x16;
24916 #[link_name = "llvm.x86.avx512.vpermilvar.pd.512"]
24917 fn vpermilpd(a: f64x8, b: i64x8) -> f64x8;
24918
24919 #[link_name = "llvm.x86.avx512.permvar.si.512"]
24920 fn vpermd(a: i32x16, idx: i32x16) -> i32x16;
24921 #[link_name = "llvm.x86.avx512.permvar.di.512"]
24922 fn vpermq(a: i64x8, idx: i64x8) -> i64x8;
24923 #[link_name = "llvm.x86.avx512.permvar.sf.512"]
24924 fn vpermps(a: f32x16, idx: i32x16) -> f32x16;
24925 #[link_name = "llvm.x86.avx512.permvar.df.512"]
24926 fn vpermpd(a: f64x8, idx: i64x8) -> f64x8;
24927
24928 #[link_name = "llvm.x86.avx512.vpermi2var.d.512"]
24929 fn vpermi2d(a: i32x16, idx: i32x16, b: i32x16) -> i32x16;
24930 #[link_name = "llvm.x86.avx512.vpermi2var.q.512"]
24931 fn vpermi2q(a: i64x8, idx: i64x8, b: i64x8) -> i64x8;
24932 #[link_name = "llvm.x86.avx512.vpermi2var.ps.512"]
24933 fn vpermi2ps(a: f32x16, idx: i32x16, b: f32x16) -> f32x16;
24934 #[link_name = "llvm.x86.avx512.vpermi2var.pd.512"]
24935 fn vpermi2pd(a: f64x8, idx: i64x8, b: f64x8) -> f64x8;
24936
24937 #[link_name = "llvm.x86.avx512.mask.compress.d.512"]
24938 fn vpcompressd(a: i32x16, src: i32x16, mask: u16) -> i32x16;
24939 #[link_name = "llvm.x86.avx512.mask.compress.q.512"]
24940 fn vpcompressq(a: i64x8, src: i64x8, mask: u8) -> i64x8;
24941 #[link_name = "llvm.x86.avx512.mask.compress.ps.512"]
24942 fn vcompressps(a: f32x16, src: f32x16, mask: u16) -> f32x16;
24943 #[link_name = "llvm.x86.avx512.mask.compress.pd.512"]
24944 fn vcompresspd(a: f64x8, src: f64x8, mask: u8) -> f64x8;
24945 #[link_name = "llvm.x86.avx512.mask.expand.d.512"]
24946 fn vpexpandd(a: i32x16, src: i32x16, mask: u16) -> i32x16;
24947 #[link_name = "llvm.x86.avx512.mask.expand.q.512"]
24948 fn vpexpandq(a: i64x8, src: i64x8, mask: u8) -> i64x8;
24949 #[link_name = "llvm.x86.avx512.mask.expand.ps.512"]
24950 fn vexpandps(a: f32x16, src: f32x16, mask: u16) -> f32x16;
24951 #[link_name = "llvm.x86.avx512.mask.expand.pd.512"]
24952 fn vexpandpd(a: f64x8, src: f64x8, mask: u8) -> f64x8;
24953
24954 #[link_name = "llvm.x86.avx512.mask.add.ss.round"]
24955 fn vaddss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4;
24956 #[link_name = "llvm.x86.avx512.mask.add.sd.round"]
24957 fn vaddsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2;
24958 #[link_name = "llvm.x86.avx512.mask.sub.ss.round"]
24959 fn vsubss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4;
24960 #[link_name = "llvm.x86.avx512.mask.sub.sd.round"]
24961 fn vsubsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2;
24962 #[link_name = "llvm.x86.avx512.mask.mul.ss.round"]
24963 fn vmulss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4;
24964 #[link_name = "llvm.x86.avx512.mask.mul.sd.round"]
24965 fn vmulsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2;
24966 #[link_name = "llvm.x86.avx512.mask.div.ss.round"]
24967 fn vdivss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4;
24968 #[link_name = "llvm.x86.avx512.mask.div.sd.round"]
24969 fn vdivsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2;
24970 #[link_name = "llvm.x86.avx512.mask.max.ss.round"]
24971 fn vmaxss(a: f32x4, b: f32x4, src: f32x4, mask: u8, sae: i32) -> f32x4;
24972 #[link_name = "llvm.x86.avx512.mask.max.sd.round"]
24973 fn vmaxsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, sae: i32) -> f64x2;
24974 #[link_name = "llvm.x86.avx512.mask.min.ss.round"]
24975 fn vminss(a: f32x4, b: f32x4, src: f32x4, mask: u8, sae: i32) -> f32x4;
24976 #[link_name = "llvm.x86.avx512.mask.min.sd.round"]
24977 fn vminsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, sae: i32) -> f64x2;
24978 #[link_name = "llvm.x86.avx512.mask.sqrt.ss"]
24979 fn vsqrtss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4;
24980 #[link_name = "llvm.x86.avx512.mask.sqrt.sd"]
24981 fn vsqrtsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2;
24982 #[link_name = "llvm.x86.avx512.mask.getexp.ss"]
24983 fn vgetexpss(a: f32x4, b: f32x4, src: f32x4, mask: u8, sae: i32) -> f32x4;
24984 #[link_name = "llvm.x86.avx512.mask.getexp.sd"]
24985 fn vgetexpsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, sae: i32) -> f64x2;
24986 #[link_name = "llvm.x86.avx512.mask.getmant.ss"]
24987 fn vgetmantss(a: f32x4, b: f32x4, mantissas: i32, src: f32x4, m: u8, sae: i32) -> f32x4;
24988 #[link_name = "llvm.x86.avx512.mask.getmant.sd"]
24989 fn vgetmantsd(a: f64x2, b: f64x2, mantissas: i32, src: f64x2, m: u8, sae: i32) -> f64x2;
24990
24991 #[link_name = "llvm.x86.avx512.rsqrt14.ss"]
24992 fn vrsqrt14ss(a: f32x4, b: f32x4, src: f32x4, mask: u8) -> f32x4;
24993 #[link_name = "llvm.x86.avx512.rsqrt14.sd"]
24994 fn vrsqrt14sd(a: f64x2, b: f64x2, src: f64x2, mask: u8) -> f64x2;
24995 #[link_name = "llvm.x86.avx512.rcp14.ss"]
24996 fn vrcp14ss(a: f32x4, b: f32x4, src: f32x4, mask: u8) -> f32x4;
24997 #[link_name = "llvm.x86.avx512.rcp14.sd"]
24998 fn vrcp14sd(a: f64x2, b: f64x2, src: f64x2, mask: u8) -> f64x2;
24999
25000 #[link_name = "llvm.x86.avx512.mask.rndscale.ss"]
25001 fn vrndscaless(a: f32x4, b: f32x4, src: f32x4, mask: u8, imm8: i32, sae: i32) -> f32x4;
25002 #[link_name = "llvm.x86.avx512.mask.rndscale.sd"]
25003 fn vrndscalesd(a: f64x2, b: f64x2, src: f64x2, mask: u8, imm8: i32, sae: i32) -> f64x2;
25004 #[link_name = "llvm.x86.avx512.mask.scalef.ss"]
25005 fn vscalefss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4;
25006 #[link_name = "llvm.x86.avx512.mask.scalef.sd"]
25007 fn vscalefsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2;
25008
25009 #[link_name = "llvm.x86.avx512.vfmadd.f32"]
25010 fn vfmadd132ss(a: f32, b: f32, c: f32, rounding: i32) -> f32;
25011 #[link_name = "llvm.x86.avx512.vfmadd.f64"]
25012 fn vfmadd132sd(a: f64, b: f64, c: f64, rounding: i32) -> f64;
25013
25014 #[link_name = "llvm.x86.avx512.mask.fixupimm.ss"]
25015 fn vfixupimmss(a: f32x4, b: f32x4, c: i32x4, imm8: i32, mask: u8, sae: i32) -> f32x4;
25016 #[link_name = "llvm.x86.avx512.mask.fixupimm.sd"]
25017 fn vfixupimmsd(a: f64x2, b: f64x2, c: i64x2, imm8: i32, mask: u8, sae: i32) -> f64x2;
25018 #[link_name = "llvm.x86.avx512.maskz.fixupimm.ss"]
25019 fn vfixupimmssz(a: f32x4, b: f32x4, c: i32x4, imm8: i32, mask: u8, sae: i32) -> f32x4;
25020 #[link_name = "llvm.x86.avx512.maskz.fixupimm.sd"]
25021 fn vfixupimmsdz(a: f64x2, b: f64x2, c: i64x2, imm8: i32, mask: u8, sae: i32) -> f64x2;
25022
25023 #[link_name = "llvm.x86.avx512.mask.cvtss2sd.round"]
25024 fn vcvtss2sd(a: f64x2, b: f32x4, src: f64x2, mask: u8, sae: i32) -> f64x2;
25025 #[link_name = "llvm.x86.avx512.mask.cvtsd2ss.round"]
25026 fn vcvtsd2ss(a: f32x4, b: f64x2, src: f32x4, mask: u8, rounding: i32) -> f32x4;
25027
25028 #[link_name = "llvm.x86.avx512.vcvtss2si32"]
25029 fn vcvtss2si(a: f32x4, rounding: i32) -> i32;
25030 #[link_name = "llvm.x86.avx512.vcvtss2si64"]
25031 fn vcvtss2si64(a: f32x4, rounding: i32) -> i64;
25032 #[link_name = "llvm.x86.avx512.vcvtss2usi32"]
25033 fn vcvtss2usi(a: f32x4, rounding: i32) -> u32;
25034 #[link_name = "llvm.x86.avx512.vcvtss2usi64"]
25035 fn vcvtss2usi64(a: f32x4, rounding: i32) -> u64;
25036 #[link_name = "llvm.x86.avx512.vcvtsd2si32"]
25037 fn vcvtsd2si(a: f64x2, rounding: i32) -> i32;
25038 #[link_name = "llvm.x86.avx512.vcvtsd2si64"]
25039 fn vcvtsd2si64(a: f64x2, rounding: i32) -> i64;
25040 #[link_name = "llvm.x86.avx512.vcvtsd2usi32"]
25041 fn vcvtsd2usi(a: f64x2, rounding: i32) -> u32;
25042 #[link_name = "llvm.x86.avx512.vcvtsd2usi64"]
25043 fn vcvtsd2usi64(a: f64x2, rounding: i32) -> u64;
25044
25045 #[link_name = "llvm.x86.avx512.cvtsi2ss32"]
25046 fn vcvtsi2ss(a: f32x4, b: i32, rounding: i32) -> f32x4;
25047 #[link_name = "llvm.x86.avx512.cvtsi2ss64"]
25048 fn vcvtsi2ss64(a: f32x4, b: i64, rounding: i32) -> f32x4;
25049 #[link_name = "llvm.x86.avx512.cvtsi2sd64"]
25050 fn vcvtsi2sd(a: f64x2, b: i64, rounding: i32) -> f64x2;
25051 #[link_name = "llvm.x86.avx512.cvtusi2ss"]
25052 fn vcvtusi2ss(a: f32x4, b: u32, rounding: i32) -> f32x4;
25053 #[link_name = "llvm.x86.avx512.cvtusi642ss"]
25054 fn vcvtusi2ss64(a: f32x4, b: u64, rounding: i32) -> f32x4;
25055 #[link_name = "llvm.x86.avx512.cvtusi642sd"]
25056 fn vcvtusi2sd(a: f64x2, b: u64, rounding: i32) -> f64x2;
25057
25058 #[link_name = "llvm.x86.avx512.vcomi.ss"]
25059 fn vcomiss(a: f32x4, b: f32x4, imm8: i32, sae: i32) -> i32;
25060 #[link_name = "llvm.x86.avx512.vcomi.sd"]
25061 fn vcomisd(a: f64x2, b: f64x2, imm8: i32, sae: i32) -> i32;
25062 }
25063
25064 #[cfg(test)]
25065 mod tests {
25066
25067 use stdarch_test::simd_test;
25068
25069 use crate::core_arch::x86::*;
25070 use crate::hint::black_box;
25071 use crate::mem;
25072
25073 #[simd_test(enable = "avx512f")]
25074 unsafe fn test_mm512_abs_epi32() {
25075 #[rustfmt::skip]
25076 let a = _mm512_setr_epi32(
25077 0, 1, -1, i32::MAX,
25078 i32::MIN, 100, -100, -32,
25079 0, 1, -1, i32::MAX,
25080 i32::MIN, 100, -100, -32,
25081 );
25082 let r = _mm512_abs_epi32(a);
25083 let e = _mm512_setr_epi32(
25084 0,
25085 1,
25086 1,
25087 i32::MAX,
25088 i32::MAX.wrapping_add(1),
25089 100,
25090 100,
25091 32,
25092 0,
25093 1,
25094 1,
25095 i32::MAX,
25096 i32::MAX.wrapping_add(1),
25097 100,
25098 100,
25099 32,
25100 );
25101 assert_eq_m512i(r, e);
25102 }
25103
25104 #[simd_test(enable = "avx512f")]
25105 unsafe fn test_mm512_mask_abs_epi32() {
25106 #[rustfmt::skip]
25107 let a = _mm512_setr_epi32(
25108 0, 1, -1, i32::MAX,
25109 i32::MIN, 100, -100, -32,
25110 0, 1, -1, i32::MAX,
25111 i32::MIN, 100, -100, -32,
25112 );
25113 let r = _mm512_mask_abs_epi32(a, 0, a);
25114 assert_eq_m512i(r, a);
25115 let r = _mm512_mask_abs_epi32(a, 0b00000000_11111111, a);
25116 let e = _mm512_setr_epi32(
25117 0,
25118 1,
25119 1,
25120 i32::MAX,
25121 i32::MAX.wrapping_add(1),
25122 100,
25123 100,
25124 32,
25125 0,
25126 1,
25127 -1,
25128 i32::MAX,
25129 i32::MIN,
25130 100,
25131 -100,
25132 -32,
25133 );
25134 assert_eq_m512i(r, e);
25135 }
25136
25137 #[simd_test(enable = "avx512f")]
25138 unsafe fn test_mm512_maskz_abs_epi32() {
25139 #[rustfmt::skip]
25140 let a = _mm512_setr_epi32(
25141 0, 1, -1, i32::MAX,
25142 i32::MIN, 100, -100, -32,
25143 0, 1, -1, i32::MAX,
25144 i32::MIN, 100, -100, -32,
25145 );
25146 let r = _mm512_maskz_abs_epi32(0, a);
25147 assert_eq_m512i(r, _mm512_setzero_si512());
25148 let r = _mm512_maskz_abs_epi32(0b00000000_11111111, a);
25149 let e = _mm512_setr_epi32(
25150 0,
25151 1,
25152 1,
25153 i32::MAX,
25154 i32::MAX.wrapping_add(1),
25155 100,
25156 100,
25157 32,
25158 0,
25159 0,
25160 0,
25161 0,
25162 0,
25163 0,
25164 0,
25165 0,
25166 );
25167 assert_eq_m512i(r, e);
25168 }
25169
25170 #[simd_test(enable = "avx512f")]
25171 unsafe fn test_mm512_abs_ps() {
25172 #[rustfmt::skip]
25173 let a = _mm512_setr_ps(
25174 0., 1., -1., f32::MAX,
25175 f32::MIN, 100., -100., -32.,
25176 0., 1., -1., f32::MAX,
25177 f32::MIN, 100., -100., -32.,
25178 );
25179 let r = _mm512_abs_ps(a);
25180 let e = _mm512_setr_ps(
25181 0.,
25182 1.,
25183 1.,
25184 f32::MAX,
25185 f32::MAX,
25186 100.,
25187 100.,
25188 32.,
25189 0.,
25190 1.,
25191 1.,
25192 f32::MAX,
25193 f32::MAX,
25194 100.,
25195 100.,
25196 32.,
25197 );
25198 assert_eq_m512(r, e);
25199 }
25200
25201 #[simd_test(enable = "avx512f")]
25202 unsafe fn test_mm512_mask_abs_ps() {
25203 let a = _mm512_setr_ps(
25204 0.,
25205 1.,
25206 -1.,
25207 f32::MAX,
25208 f32::MIN,
25209 100.,
25210 -100.,
25211 -32.,
25212 0.,
25213 1.,
25214 -1.,
25215 f32::MAX,
25216 f32::MIN,
25217 100.,
25218 -100.,
25219 -32.,
25220 );
25221 let r = _mm512_mask_abs_ps(a, 0, a);
25222 assert_eq_m512(r, a);
25223 let r = _mm512_mask_abs_ps(a, 0b00000000_11111111, a);
25224 let e = _mm512_setr_ps(
25225 0.,
25226 1.,
25227 1.,
25228 f32::MAX,
25229 f32::MAX,
25230 100.,
25231 100.,
25232 32.,
25233 0.,
25234 1.,
25235 -1.,
25236 f32::MAX,
25237 f32::MIN,
25238 100.,
25239 -100.,
25240 -32.,
25241 );
25242 assert_eq_m512(r, e);
25243 }
25244
25245 #[simd_test(enable = "avx512f")]
25246 unsafe fn test_mm512_mask_mov_epi32() {
25247 let src = _mm512_set1_epi32(1);
25248 let a = _mm512_set1_epi32(2);
25249 let r = _mm512_mask_mov_epi32(src, 0, a);
25250 assert_eq_m512i(r, src);
25251 let r = _mm512_mask_mov_epi32(src, 0b11111111_11111111, a);
25252 assert_eq_m512i(r, a);
25253 }
25254
25255 #[simd_test(enable = "avx512f")]
25256 unsafe fn test_mm512_maskz_mov_epi32() {
25257 let a = _mm512_set1_epi32(2);
25258 let r = _mm512_maskz_mov_epi32(0, a);
25259 assert_eq_m512i(r, _mm512_setzero_si512());
25260 let r = _mm512_maskz_mov_epi32(0b11111111_11111111, a);
25261 assert_eq_m512i(r, a);
25262 }
25263
25264 #[simd_test(enable = "avx512f")]
25265 unsafe fn test_mm512_mask_mov_ps() {
25266 let src = _mm512_set1_ps(1.);
25267 let a = _mm512_set1_ps(2.);
25268 let r = _mm512_mask_mov_ps(src, 0, a);
25269 assert_eq_m512(r, src);
25270 let r = _mm512_mask_mov_ps(src, 0b11111111_11111111, a);
25271 assert_eq_m512(r, a);
25272 }
25273
25274 #[simd_test(enable = "avx512f")]
25275 unsafe fn test_mm512_maskz_mov_ps() {
25276 let a = _mm512_set1_ps(2.);
25277 let r = _mm512_maskz_mov_ps(0, a);
25278 assert_eq_m512(r, _mm512_setzero_ps());
25279 let r = _mm512_maskz_mov_ps(0b11111111_11111111, a);
25280 assert_eq_m512(r, a);
25281 }
25282
25283 #[simd_test(enable = "avx512f")]
25284 unsafe fn test_mm512_add_epi32() {
25285 let a = _mm512_setr_epi32(
25286 0,
25287 1,
25288 -1,
25289 i32::MAX,
25290 i32::MIN,
25291 100,
25292 -100,
25293 -32,
25294 0,
25295 1,
25296 -1,
25297 i32::MAX,
25298 i32::MIN,
25299 100,
25300 -100,
25301 -32,
25302 );
25303 let b = _mm512_set1_epi32(1);
25304 let r = _mm512_add_epi32(a, b);
25305 let e = _mm512_setr_epi32(
25306 1,
25307 2,
25308 0,
25309 i32::MIN,
25310 i32::MIN + 1,
25311 101,
25312 -99,
25313 -31,
25314 1,
25315 2,
25316 0,
25317 i32::MIN,
25318 i32::MIN + 1,
25319 101,
25320 -99,
25321 -31,
25322 );
25323 assert_eq_m512i(r, e);
25324 }
25325
25326 #[simd_test(enable = "avx512f")]
25327 unsafe fn test_mm512_mask_add_epi32() {
25328 #[rustfmt::skip]
25329 let a = _mm512_setr_epi32(
25330 0, 1, -1, i32::MAX,
25331 i32::MIN, 100, -100, -32,
25332 0, 1, -1, i32::MAX,
25333 i32::MIN, 100, -100, -32,
25334 );
25335 let b = _mm512_set1_epi32(1);
25336 let r = _mm512_mask_add_epi32(a, 0, a, b);
25337 assert_eq_m512i(r, a);
25338 let r = _mm512_mask_add_epi32(a, 0b00000000_11111111, a, b);
25339 let e = _mm512_setr_epi32(
25340 1,
25341 2,
25342 0,
25343 i32::MIN,
25344 i32::MIN + 1,
25345 101,
25346 -99,
25347 -31,
25348 0,
25349 1,
25350 -1,
25351 i32::MAX,
25352 i32::MIN,
25353 100,
25354 -100,
25355 -32,
25356 );
25357 assert_eq_m512i(r, e);
25358 }
25359
25360 #[simd_test(enable = "avx512f")]
25361 unsafe fn test_mm512_maskz_add_epi32() {
25362 #[rustfmt::skip]
25363 let a = _mm512_setr_epi32(
25364 0, 1, -1, i32::MAX,
25365 i32::MIN, 100, -100, -32,
25366 0, 1, -1, i32::MAX,
25367 i32::MIN, 100, -100, -32,
25368 );
25369 let b = _mm512_set1_epi32(1);
25370 let r = _mm512_maskz_add_epi32(0, a, b);
25371 assert_eq_m512i(r, _mm512_setzero_si512());
25372 let r = _mm512_maskz_add_epi32(0b00000000_11111111, a, b);
25373 let e = _mm512_setr_epi32(
25374 1,
25375 2,
25376 0,
25377 i32::MIN,
25378 i32::MIN + 1,
25379 101,
25380 -99,
25381 -31,
25382 0,
25383 0,
25384 0,
25385 0,
25386 0,
25387 0,
25388 0,
25389 0,
25390 );
25391 assert_eq_m512i(r, e);
25392 }
25393
25394 #[simd_test(enable = "avx512f")]
25395 unsafe fn test_mm512_add_ps() {
25396 let a = _mm512_setr_ps(
25397 0.,
25398 1.,
25399 -1.,
25400 f32::MAX,
25401 f32::MIN,
25402 100.,
25403 -100.,
25404 -32.,
25405 0.,
25406 1.,
25407 -1.,
25408 f32::MAX,
25409 f32::MIN,
25410 100.,
25411 -100.,
25412 -32.,
25413 );
25414 let b = _mm512_set1_ps(1.);
25415 let r = _mm512_add_ps(a, b);
25416 let e = _mm512_setr_ps(
25417 1.,
25418 2.,
25419 0.,
25420 f32::MAX,
25421 f32::MIN + 1.,
25422 101.,
25423 -99.,
25424 -31.,
25425 1.,
25426 2.,
25427 0.,
25428 f32::MAX,
25429 f32::MIN + 1.,
25430 101.,
25431 -99.,
25432 -31.,
25433 );
25434 assert_eq_m512(r, e);
25435 }
25436
25437 #[simd_test(enable = "avx512f")]
25438 unsafe fn test_mm512_mask_add_ps() {
25439 let a = _mm512_setr_ps(
25440 0.,
25441 1.,
25442 -1.,
25443 f32::MAX,
25444 f32::MIN,
25445 100.,
25446 -100.,
25447 -32.,
25448 0.,
25449 1.,
25450 -1.,
25451 f32::MAX,
25452 f32::MIN,
25453 100.,
25454 -100.,
25455 -32.,
25456 );
25457 let b = _mm512_set1_ps(1.);
25458 let r = _mm512_mask_add_ps(a, 0, a, b);
25459 assert_eq_m512(r, a);
25460 let r = _mm512_mask_add_ps(a, 0b00000000_11111111, a, b);
25461 let e = _mm512_setr_ps(
25462 1.,
25463 2.,
25464 0.,
25465 f32::MAX,
25466 f32::MIN + 1.,
25467 101.,
25468 -99.,
25469 -31.,
25470 0.,
25471 1.,
25472 -1.,
25473 f32::MAX,
25474 f32::MIN,
25475 100.,
25476 -100.,
25477 -32.,
25478 );
25479 assert_eq_m512(r, e);
25480 }
25481
25482 #[simd_test(enable = "avx512f")]
25483 unsafe fn test_mm512_maskz_add_ps() {
25484 let a = _mm512_setr_ps(
25485 0.,
25486 1.,
25487 -1.,
25488 f32::MAX,
25489 f32::MIN,
25490 100.,
25491 -100.,
25492 -32.,
25493 0.,
25494 1.,
25495 -1.,
25496 f32::MAX,
25497 f32::MIN,
25498 100.,
25499 -100.,
25500 -32.,
25501 );
25502 let b = _mm512_set1_ps(1.);
25503 let r = _mm512_maskz_add_ps(0, a, b);
25504 assert_eq_m512(r, _mm512_setzero_ps());
25505 let r = _mm512_maskz_add_ps(0b00000000_11111111, a, b);
25506 let e = _mm512_setr_ps(
25507 1.,
25508 2.,
25509 0.,
25510 f32::MAX,
25511 f32::MIN + 1.,
25512 101.,
25513 -99.,
25514 -31.,
25515 0.,
25516 0.,
25517 0.,
25518 0.,
25519 0.,
25520 0.,
25521 0.,
25522 0.,
25523 );
25524 assert_eq_m512(r, e);
25525 }
25526
25527 #[simd_test(enable = "avx512f")]
25528 unsafe fn test_mm512_sub_epi32() {
25529 let a = _mm512_setr_epi32(
25530 0,
25531 1,
25532 -1,
25533 i32::MAX,
25534 i32::MIN,
25535 100,
25536 -100,
25537 -32,
25538 0,
25539 1,
25540 -1,
25541 i32::MAX,
25542 i32::MIN,
25543 100,
25544 -100,
25545 -32,
25546 );
25547 let b = _mm512_set1_epi32(1);
25548 let r = _mm512_sub_epi32(a, b);
25549 let e = _mm512_setr_epi32(
25550 -1,
25551 0,
25552 -2,
25553 i32::MAX - 1,
25554 i32::MAX,
25555 99,
25556 -101,
25557 -33,
25558 -1,
25559 0,
25560 -2,
25561 i32::MAX - 1,
25562 i32::MAX,
25563 99,
25564 -101,
25565 -33,
25566 );
25567 assert_eq_m512i(r, e);
25568 }
25569
25570 #[simd_test(enable = "avx512f")]
25571 unsafe fn test_mm512_mask_sub_epi32() {
25572 let a = _mm512_setr_epi32(
25573 0,
25574 1,
25575 -1,
25576 i32::MAX,
25577 i32::MIN,
25578 100,
25579 -100,
25580 -32,
25581 0,
25582 1,
25583 -1,
25584 i32::MAX,
25585 i32::MIN,
25586 100,
25587 -100,
25588 -32,
25589 );
25590 let b = _mm512_set1_epi32(1);
25591 let r = _mm512_mask_sub_epi32(a, 0, a, b);
25592 assert_eq_m512i(r, a);
25593 let r = _mm512_mask_sub_epi32(a, 0b00000000_11111111, a, b);
25594 let e = _mm512_setr_epi32(
25595 -1,
25596 0,
25597 -2,
25598 i32::MAX - 1,
25599 i32::MAX,
25600 99,
25601 -101,
25602 -33,
25603 0,
25604 1,
25605 -1,
25606 i32::MAX,
25607 i32::MIN,
25608 100,
25609 -100,
25610 -32,
25611 );
25612 assert_eq_m512i(r, e);
25613 }
25614
25615 #[simd_test(enable = "avx512f")]
25616 unsafe fn test_mm512_maskz_sub_epi32() {
25617 let a = _mm512_setr_epi32(
25618 0,
25619 1,
25620 -1,
25621 i32::MAX,
25622 i32::MIN,
25623 100,
25624 -100,
25625 -32,
25626 0,
25627 1,
25628 -1,
25629 i32::MAX,
25630 i32::MIN,
25631 100,
25632 -100,
25633 -32,
25634 );
25635 let b = _mm512_set1_epi32(1);
25636 let r = _mm512_maskz_sub_epi32(0, a, b);
25637 assert_eq_m512i(r, _mm512_setzero_si512());
25638 let r = _mm512_maskz_sub_epi32(0b00000000_11111111, a, b);
25639 let e = _mm512_setr_epi32(
25640 -1,
25641 0,
25642 -2,
25643 i32::MAX - 1,
25644 i32::MAX,
25645 99,
25646 -101,
25647 -33,
25648 0,
25649 0,
25650 0,
25651 0,
25652 0,
25653 0,
25654 0,
25655 0,
25656 );
25657 assert_eq_m512i(r, e);
25658 }
25659
25660 #[simd_test(enable = "avx512f")]
25661 unsafe fn test_mm512_sub_ps() {
25662 let a = _mm512_setr_ps(
25663 0.,
25664 1.,
25665 -1.,
25666 f32::MAX,
25667 f32::MIN,
25668 100.,
25669 -100.,
25670 -32.,
25671 0.,
25672 1.,
25673 -1.,
25674 f32::MAX,
25675 f32::MIN,
25676 100.,
25677 -100.,
25678 -32.,
25679 );
25680 let b = _mm512_set1_ps(1.);
25681 let r = _mm512_sub_ps(a, b);
25682 let e = _mm512_setr_ps(
25683 -1.,
25684 0.,
25685 -2.,
25686 f32::MAX - 1.,
25687 f32::MIN,
25688 99.,
25689 -101.,
25690 -33.,
25691 -1.,
25692 0.,
25693 -2.,
25694 f32::MAX - 1.,
25695 f32::MIN,
25696 99.,
25697 -101.,
25698 -33.,
25699 );
25700 assert_eq_m512(r, e);
25701 }
25702
25703 #[simd_test(enable = "avx512f")]
25704 unsafe fn test_mm512_mask_sub_ps() {
25705 let a = _mm512_setr_ps(
25706 0.,
25707 1.,
25708 -1.,
25709 f32::MAX,
25710 f32::MIN,
25711 100.,
25712 -100.,
25713 -32.,
25714 0.,
25715 1.,
25716 -1.,
25717 f32::MAX,
25718 f32::MIN,
25719 100.,
25720 -100.,
25721 -32.,
25722 );
25723 let b = _mm512_set1_ps(1.);
25724 let r = _mm512_mask_sub_ps(a, 0, a, b);
25725 assert_eq_m512(r, a);
25726 let r = _mm512_mask_sub_ps(a, 0b00000000_11111111, a, b);
25727 let e = _mm512_setr_ps(
25728 -1.,
25729 0.,
25730 -2.,
25731 f32::MAX - 1.,
25732 f32::MIN,
25733 99.,
25734 -101.,
25735 -33.,
25736 0.,
25737 1.,
25738 -1.,
25739 f32::MAX,
25740 f32::MIN,
25741 100.,
25742 -100.,
25743 -32.,
25744 );
25745 assert_eq_m512(r, e);
25746 }
25747
25748 #[simd_test(enable = "avx512f")]
25749 unsafe fn test_mm512_maskz_sub_ps() {
25750 let a = _mm512_setr_ps(
25751 0.,
25752 1.,
25753 -1.,
25754 f32::MAX,
25755 f32::MIN,
25756 100.,
25757 -100.,
25758 -32.,
25759 0.,
25760 1.,
25761 -1.,
25762 f32::MAX,
25763 f32::MIN,
25764 100.,
25765 -100.,
25766 -32.,
25767 );
25768 let b = _mm512_set1_ps(1.);
25769 let r = _mm512_maskz_sub_ps(0, a, b);
25770 assert_eq_m512(r, _mm512_setzero_ps());
25771 let r = _mm512_maskz_sub_ps(0b00000000_11111111, a, b);
25772 let e = _mm512_setr_ps(
25773 -1.,
25774 0.,
25775 -2.,
25776 f32::MAX - 1.,
25777 f32::MIN,
25778 99.,
25779 -101.,
25780 -33.,
25781 0.,
25782 0.,
25783 0.,
25784 0.,
25785 0.,
25786 0.,
25787 0.,
25788 0.,
25789 );
25790 assert_eq_m512(r, e);
25791 }
25792
25793 #[simd_test(enable = "avx512f")]
25794 unsafe fn test_mm512_mullo_epi32() {
25795 let a = _mm512_setr_epi32(
25796 0,
25797 1,
25798 -1,
25799 i32::MAX,
25800 i32::MIN,
25801 100,
25802 -100,
25803 -32,
25804 0,
25805 1,
25806 -1,
25807 i32::MAX,
25808 i32::MIN,
25809 100,
25810 -100,
25811 -32,
25812 );
25813 let b = _mm512_set1_epi32(2);
25814 let r = _mm512_mullo_epi32(a, b);
25815 let e = _mm512_setr_epi32(
25816 0, 2, -2, -2, 0, 200, -200, -64, 0, 2, -2, -2, 0, 200, -200, -64,
25817 );
25818 assert_eq_m512i(r, e);
25819 }
25820
25821 #[simd_test(enable = "avx512f")]
25822 unsafe fn test_mm512_mask_mullo_epi32() {
25823 let a = _mm512_setr_epi32(
25824 0,
25825 1,
25826 -1,
25827 i32::MAX,
25828 i32::MIN,
25829 100,
25830 -100,
25831 -32,
25832 0,
25833 1,
25834 -1,
25835 i32::MAX,
25836 i32::MIN,
25837 100,
25838 -100,
25839 -32,
25840 );
25841 let b = _mm512_set1_epi32(2);
25842 let r = _mm512_mask_mullo_epi32(a, 0, a, b);
25843 assert_eq_m512i(r, a);
25844 let r = _mm512_mask_mullo_epi32(a, 0b00000000_11111111, a, b);
25845 let e = _mm512_setr_epi32(
25846 0,
25847 2,
25848 -2,
25849 -2,
25850 0,
25851 200,
25852 -200,
25853 -64,
25854 0,
25855 1,
25856 -1,
25857 i32::MAX,
25858 i32::MIN,
25859 100,
25860 -100,
25861 -32,
25862 );
25863 assert_eq_m512i(r, e);
25864 }
25865
25866 #[simd_test(enable = "avx512f")]
25867 unsafe fn test_mm512_maskz_mullo_epi32() {
25868 let a = _mm512_setr_epi32(
25869 0,
25870 1,
25871 -1,
25872 i32::MAX,
25873 i32::MIN,
25874 100,
25875 -100,
25876 -32,
25877 0,
25878 1,
25879 -1,
25880 i32::MAX,
25881 i32::MIN,
25882 100,
25883 -100,
25884 -32,
25885 );
25886 let b = _mm512_set1_epi32(2);
25887 let r = _mm512_maskz_mullo_epi32(0, a, b);
25888 assert_eq_m512i(r, _mm512_setzero_si512());
25889 let r = _mm512_maskz_mullo_epi32(0b00000000_11111111, a, b);
25890 let e = _mm512_setr_epi32(0, 2, -2, -2, 0, 200, -200, -64, 0, 0, 0, 0, 0, 0, 0, 0);
25891 assert_eq_m512i(r, e);
25892 }
25893
25894 #[simd_test(enable = "avx512f")]
25895 unsafe fn test_mm512_mul_ps() {
25896 let a = _mm512_setr_ps(
25897 0.,
25898 1.,
25899 -1.,
25900 f32::MAX,
25901 f32::MIN,
25902 100.,
25903 -100.,
25904 -32.,
25905 0.,
25906 1.,
25907 -1.,
25908 f32::MAX,
25909 f32::MIN,
25910 100.,
25911 -100.,
25912 -32.,
25913 );
25914 let b = _mm512_set1_ps(2.);
25915 let r = _mm512_mul_ps(a, b);
25916 let e = _mm512_setr_ps(
25917 0.,
25918 2.,
25919 -2.,
25920 f32::INFINITY,
25921 f32::NEG_INFINITY,
25922 200.,
25923 -200.,
25924 -64.,
25925 0.,
25926 2.,
25927 -2.,
25928 f32::INFINITY,
25929 f32::NEG_INFINITY,
25930 200.,
25931 -200.,
25932 -64.,
25933 );
25934 assert_eq_m512(r, e);
25935 }
25936
25937 #[simd_test(enable = "avx512f")]
25938 unsafe fn test_mm512_mask_mul_ps() {
25939 let a = _mm512_setr_ps(
25940 0.,
25941 1.,
25942 -1.,
25943 f32::MAX,
25944 f32::MIN,
25945 100.,
25946 -100.,
25947 -32.,
25948 0.,
25949 1.,
25950 -1.,
25951 f32::MAX,
25952 f32::MIN,
25953 100.,
25954 -100.,
25955 -32.,
25956 );
25957 let b = _mm512_set1_ps(2.);
25958 let r = _mm512_mask_mul_ps(a, 0, a, b);
25959 assert_eq_m512(r, a);
25960 let r = _mm512_mask_mul_ps(a, 0b00000000_11111111, a, b);
25961 let e = _mm512_setr_ps(
25962 0.,
25963 2.,
25964 -2.,
25965 f32::INFINITY,
25966 f32::NEG_INFINITY,
25967 200.,
25968 -200.,
25969 -64.,
25970 0.,
25971 1.,
25972 -1.,
25973 f32::MAX,
25974 f32::MIN,
25975 100.,
25976 -100.,
25977 -32.,
25978 );
25979 assert_eq_m512(r, e);
25980 }
25981
25982 #[simd_test(enable = "avx512f")]
25983 unsafe fn test_mm512_maskz_mul_ps() {
25984 let a = _mm512_setr_ps(
25985 0.,
25986 1.,
25987 -1.,
25988 f32::MAX,
25989 f32::MIN,
25990 100.,
25991 -100.,
25992 -32.,
25993 0.,
25994 1.,
25995 -1.,
25996 f32::MAX,
25997 f32::MIN,
25998 100.,
25999 -100.,
26000 -32.,
26001 );
26002 let b = _mm512_set1_ps(2.);
26003 let r = _mm512_maskz_mul_ps(0, a, b);
26004 assert_eq_m512(r, _mm512_setzero_ps());
26005 let r = _mm512_maskz_mul_ps(0b00000000_11111111, a, b);
26006 let e = _mm512_setr_ps(
26007 0.,
26008 2.,
26009 -2.,
26010 f32::INFINITY,
26011 f32::NEG_INFINITY,
26012 200.,
26013 -200.,
26014 -64.,
26015 0.,
26016 0.,
26017 0.,
26018 0.,
26019 0.,
26020 0.,
26021 0.,
26022 0.,
26023 );
26024 assert_eq_m512(r, e);
26025 }
26026
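// Division by zero with a nonzero numerator produces +/-INFINITY (100. / 0. and -131. / 0. below).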
26027 #[simd_test(enable = "avx512f")]
26028 unsafe fn test_mm512_div_ps() {
26029 let a = _mm512_setr_ps(
26030 0., 1., -1., -2., 100., 100., -100., -32., 0., 1., -1., 1000., -131., 100., -100., -32.,
26031 );
26032 let b = _mm512_setr_ps(
26033 2., 2., 2., 2., 2., 0., 2., 2., 2., 2., 2., 2., 0., 2., 2., 2.,
26034 );
26035 let r = _mm512_div_ps(a, b);
26036 let e = _mm512_setr_ps(
26037 0.,
26038 0.5,
26039 -0.5,
26040 -1.,
26041 50.,
26042 f32::INFINITY,
26043 -50.,
26044 -16.,
26045 0.,
26046 0.5,
26047 -0.5,
26048 500.,
26049 f32::NEG_INFINITY,
26050 50.,
26051 -50.,
26052 -16.,
26053 );
26054 assert_eq_m512(r, e); // 100. / 0. = INFINITY and -131. / 0. = NEG_INFINITY; no 0/0 (NaN) lane is exercised here
26055 }
26056
26057 #[simd_test(enable = "avx512f")]
26058 unsafe fn test_mm512_mask_div_ps() {
26059 let a = _mm512_setr_ps(
26060 0., 1., -1., -2., 100., 100., -100., -32., 0., 1., -1., 1000., -131., 100., -100., -32.,
26061 );
26062 let b = _mm512_setr_ps(
26063 2., 2., 2., 2., 2., 0., 2., 2., 2., 2., 2., 2., 0., 2., 2., 2.,
26064 );
26065 let r = _mm512_mask_div_ps(a, 0, a, b);
26066 assert_eq_m512(r, a);
26067 let r = _mm512_mask_div_ps(a, 0b00000000_11111111, a, b);
26068 let e = _mm512_setr_ps(
26069 0.,
26070 0.5,
26071 -0.5,
26072 -1.,
26073 50.,
26074 f32::INFINITY,
26075 -50.,
26076 -16.,
26077 0.,
26078 1.,
26079 -1.,
26080 1000.,
26081 -131.,
26082 100.,
26083 -100.,
26084 -32.,
26085 );
26086 assert_eq_m512(r, e);
26087 }
26088
26089 #[simd_test(enable = "avx512f")]
26090 unsafe fn test_mm512_maskz_div_ps() {
26091 let a = _mm512_setr_ps(
26092 0., 1., -1., -2., 100., 100., -100., -32., 0., 1., -1., 1000., -131., 100., -100., -32.,
26093 );
26094 let b = _mm512_setr_ps(
26095 2., 2., 2., 2., 2., 0., 2., 2., 2., 2., 2., 2., 0., 2., 2., 2.,
26096 );
26097 let r = _mm512_maskz_div_ps(0, a, b);
26098 assert_eq_m512(r, _mm512_setzero_ps());
26099 let r = _mm512_maskz_div_ps(0b00000000_11111111, a, b);
26100 let e = _mm512_setr_ps(
26101 0.,
26102 0.5,
26103 -0.5,
26104 -1.,
26105 50.,
26106 f32::INFINITY,
26107 -50.,
26108 -16.,
26109 0.,
26110 0.,
26111 0.,
26112 0.,
26113 0.,
26114 0.,
26115 0.,
26116 0.,
26117 );
26118 assert_eq_m512(r, e);
26119 }
26120
26121 #[simd_test(enable = "avx512f")]
26122 unsafe fn test_mm512_max_epi32() {
26123 let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
26124 let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
26125 let r = _mm512_max_epi32(a, b);
26126 let e = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15);
26127 assert_eq_m512i(r, e);
26128 }
26129
26130 #[simd_test(enable = "avx512f")]
26131 unsafe fn test_mm512_mask_max_epi32() {
26132 let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
26133 let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
26134 let r = _mm512_mask_max_epi32(a, 0, a, b);
26135 assert_eq_m512i(r, a);
26136 let r = _mm512_mask_max_epi32(a, 0b00000000_11111111, a, b);
26137 let e = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15);
26138 assert_eq_m512i(r, e);
26139 }
26140
26141 #[simd_test(enable = "avx512f")]
26142 unsafe fn test_mm512_maskz_max_epi32() {
26143 let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
26144 let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
26145 let r = _mm512_maskz_max_epi32(0, a, b);
26146 assert_eq_m512i(r, _mm512_setzero_si512());
26147 let r = _mm512_maskz_max_epi32(0b00000000_11111111, a, b);
26148 let e = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 0, 0, 0, 0, 0, 0, 0, 0);
26149 assert_eq_m512i(r, e);
26150 }
26151
26152 #[simd_test(enable = "avx512f")]
26153 unsafe fn test_mm512_max_ps() {
26154 let a = _mm512_setr_ps(
26155 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
26156 );
26157 let b = _mm512_setr_ps(
26158 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
26159 );
26160 let r = _mm512_max_ps(a, b);
26161 let e = _mm512_setr_ps(
26162 15., 14., 13., 12., 11., 10., 9., 8., 8., 9., 10., 11., 12., 13., 14., 15.,
26163 );
26164 assert_eq_m512(r, e);
26165 }
26166
26167 #[simd_test(enable = "avx512f")]
26168 unsafe fn test_mm512_mask_max_ps() {
26169 let a = _mm512_setr_ps(
26170 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
26171 );
26172 let b = _mm512_setr_ps(
26173 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
26174 );
26175 let r = _mm512_mask_max_ps(a, 0, a, b);
26176 assert_eq_m512(r, a);
26177 let r = _mm512_mask_max_ps(a, 0b00000000_11111111, a, b);
26178 let e = _mm512_setr_ps(
26179 15., 14., 13., 12., 11., 10., 9., 8., 8., 9., 10., 11., 12., 13., 14., 15.,
26180 );
26181 assert_eq_m512(r, e);
26182 }
26183
26184 #[simd_test(enable = "avx512f")]
26185 unsafe fn test_mm512_maskz_max_ps() {
26186 let a = _mm512_setr_ps(
26187 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
26188 );
26189 let b = _mm512_setr_ps(
26190 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
26191 );
26192 let r = _mm512_maskz_max_ps(0, a, b);
26193 assert_eq_m512(r, _mm512_setzero_ps());
26194 let r = _mm512_maskz_max_ps(0b00000000_11111111, a, b);
26195 let e = _mm512_setr_ps(
26196 15., 14., 13., 12., 11., 10., 9., 8., 0., 0., 0., 0., 0., 0., 0., 0.,
26197 );
26198 assert_eq_m512(r, e);
26199 }
26200
26201 #[simd_test(enable = "avx512f")]
26202 unsafe fn test_mm512_max_epu32() {
26203 let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
26204 let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
26205 let r = _mm512_max_epu32(a, b);
26206 let e = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15);
26207 assert_eq_m512i(r, e);
26208 }
26209
26210 #[simd_test(enable = "avx512f")]
26211 unsafe fn test_mm512_mask_max_epu32() {
26212 let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
26213 let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
26214 let r = _mm512_mask_max_epu32(a, 0, a, b);
26215 assert_eq_m512i(r, a);
26216 let r = _mm512_mask_max_epu32(a, 0b00000000_11111111, a, b);
26217 let e = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15);
26218 assert_eq_m512i(r, e);
26219 }
26220
26221 #[simd_test(enable = "avx512f")]
26222 unsafe fn test_mm512_maskz_max_epu32() {
26223 let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
26224 let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
26225 let r = _mm512_maskz_max_epu32(0, a, b);
26226 assert_eq_m512i(r, _mm512_setzero_si512());
26227 let r = _mm512_maskz_max_epu32(0b00000000_11111111, a, b);
26228 let e = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 0, 0, 0, 0, 0, 0, 0, 0);
26229 assert_eq_m512i(r, e);
26230 }
26231
26232 #[simd_test(enable = "avx512f")]
26233 unsafe fn test_mm512_min_epi32() {
26234 let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
26235 let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
26236 let r = _mm512_min_epi32(a, b);
26237 let e = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0);
26238 assert_eq_m512i(r, e);
26239 }
26240
26241 #[simd_test(enable = "avx512f")]
26242 unsafe fn test_mm512_mask_min_epi32() {
26243 let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
26244 let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
26245 let r = _mm512_mask_min_epi32(a, 0, a, b);
26246 assert_eq_m512i(r, a);
26247 let r = _mm512_mask_min_epi32(a, 0b00000000_11111111, a, b);
26248 let e = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
26249 assert_eq_m512i(r, e);
26250 }
26251
26252 #[simd_test(enable = "avx512f")]
26253 unsafe fn test_mm512_maskz_min_epi32() {
26254 let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
26255 let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
26256 let r = _mm512_maskz_min_epi32(0, a, b);
26257 assert_eq_m512i(r, _mm512_setzero_si512());
26258 let r = _mm512_maskz_min_epi32(0b00000000_11111111, a, b);
26259 let e = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0);
26260 assert_eq_m512i(r, e);
26261 }
26262
26263 #[simd_test(enable = "avx512f")]
26264 unsafe fn test_mm512_min_ps() {
26265 let a = _mm512_setr_ps(
26266 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
26267 );
26268 let b = _mm512_setr_ps(
26269 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
26270 );
26271 let r = _mm512_min_ps(a, b);
26272 let e = _mm512_setr_ps(
26273 0., 1., 2., 3., 4., 5., 6., 7., 7., 6., 5., 4., 3., 2., 1., 0.,
26274 );
26275 assert_eq_m512(r, e);
26276 }
26277
26278 #[simd_test(enable = "avx512f")]
26279 unsafe fn test_mm512_mask_min_ps() {
26280 let a = _mm512_setr_ps(
26281 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
26282 );
26283 let b = _mm512_setr_ps(
26284 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
26285 );
26286 let r = _mm512_mask_min_ps(a, 0, a, b);
26287 assert_eq_m512(r, a);
26288 let r = _mm512_mask_min_ps(a, 0b00000000_11111111, a, b);
26289 let e = _mm512_setr_ps(
26290 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
26291 );
26292 assert_eq_m512(r, e);
26293 }
26294
26295 #[simd_test(enable = "avx512f")]
26296 unsafe fn test_mm512_maskz_min_ps() {
26297 let a = _mm512_setr_ps(
26298 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
26299 );
26300 let b = _mm512_setr_ps(
26301 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
26302 );
26303 let r = _mm512_maskz_min_ps(0, a, b);
26304 assert_eq_m512(r, _mm512_setzero_ps());
26305 let r = _mm512_maskz_min_ps(0b00000000_11111111, a, b);
26306 let e = _mm512_setr_ps(
26307 0., 1., 2., 3., 4., 5., 6., 7., 0., 0., 0., 0., 0., 0., 0., 0.,
26308 );
26309 assert_eq_m512(r, e);
26310 }
26311
26312 #[simd_test(enable = "avx512f")]
26313 unsafe fn test_mm512_min_epu32() {
26314 let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
26315 let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
26316 let r = _mm512_min_epu32(a, b);
26317 let e = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0);
26318 assert_eq_m512i(r, e);
26319 }
26320
26321 #[simd_test(enable = "avx512f")]
26322 unsafe fn test_mm512_mask_min_epu32() {
26323 let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
26324 let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
26325 let r = _mm512_mask_min_epu32(a, 0, a, b);
26326 assert_eq_m512i(r, a);
26327 let r = _mm512_mask_min_epu32(a, 0b00000000_11111111, a, b);
26328 let e = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
26329 assert_eq_m512i(r, e);
26330 }
26331
26332 #[simd_test(enable = "avx512f")]
26333 unsafe fn test_mm512_maskz_min_epu32() {
26334 let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
26335 let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
26336 let r = _mm512_maskz_min_epu32(0, a, b);
26337 assert_eq_m512i(r, _mm512_setzero_si512());
26338 let r = _mm512_maskz_min_epu32(0b00000000_11111111, a, b);
26339 let e = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0);
26340 assert_eq_m512i(r, e);
26341 }
26342
26343 #[simd_test(enable = "avx512f")]
26344 unsafe fn test_mm512_sqrt_ps() {
26345 let a = _mm512_setr_ps(
26346 0., 1., 4., 9., 16., 25., 36., 49., 64., 81., 100., 121., 144., 169., 196., 225.,
26347 );
26348 let r = _mm512_sqrt_ps(a);
26349 let e = _mm512_setr_ps(
26350 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
26351 );
26352 assert_eq_m512(r, e);
26353 }
26354
26355 #[simd_test(enable = "avx512f")]
26356 unsafe fn test_mm512_mask_sqrt_ps() {
26357 let a = _mm512_setr_ps(
26358 0., 1., 4., 9., 16., 25., 36., 49., 64., 81., 100., 121., 144., 169., 196., 225.,
26359 );
26360 let r = _mm512_mask_sqrt_ps(a, 0, a);
26361 assert_eq_m512(r, a);
26362 let r = _mm512_mask_sqrt_ps(a, 0b00000000_11111111, a);
26363 let e = _mm512_setr_ps(
26364 0., 1., 2., 3., 4., 5., 6., 7., 64., 81., 100., 121., 144., 169., 196., 225.,
26365 );
26366 assert_eq_m512(r, e);
26367 }
26368
26369 #[simd_test(enable = "avx512f")]
26370 unsafe fn test_mm512_maskz_sqrt_ps() {
26371 let a = _mm512_setr_ps(
26372 0., 1., 4., 9., 16., 25., 36., 49., 64., 81., 100., 121., 144., 169., 196., 225.,
26373 );
26374 let r = _mm512_maskz_sqrt_ps(0, a);
26375 assert_eq_m512(r, _mm512_setzero_ps());
26376 let r = _mm512_maskz_sqrt_ps(0b00000000_11111111, a);
26377 let e = _mm512_setr_ps(
26378 0., 1., 2., 3., 4., 5., 6., 7., 0., 0., 0., 0., 0., 0., 0., 0.,
26379 );
26380 assert_eq_m512(r, e);
26381 }
26382
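// Fused multiply-add: each lane computes a * b + c with a single rounding step.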
26383 #[simd_test(enable = "avx512f")]
26384 unsafe fn test_mm512_fmadd_ps() {
26385 let a = _mm512_setr_ps(
26386 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
26387 );
26388 let b = _mm512_setr_ps(
26389 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
26390 );
26391 let c = _mm512_setr_ps(
26392 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
26393 );
26394 let r = _mm512_fmadd_ps(a, b, c);
26395 let e = _mm512_setr_ps(
26396 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
26397 );
26398 assert_eq_m512(r, e);
26399 }
26400
26401 #[simd_test(enable = "avx512f")]
26402 unsafe fn test_mm512_mask_fmadd_ps() {
26403 let a = _mm512_setr_ps(
26404 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
26405 );
26406 let b = _mm512_setr_ps(
26407 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
26408 );
26409 let c = _mm512_setr_ps(
26410 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
26411 );
26412 let r = _mm512_mask_fmadd_ps(a, 0, b, c);
26413 assert_eq_m512(r, a);
26414 let r = _mm512_mask_fmadd_ps(a, 0b00000000_11111111, b, c);
26415 let e = _mm512_setr_ps(
26416 1., 2., 3., 4., 5., 6., 7., 8., 1., 1., 1., 1., 1., 1., 1., 1.,
26417 );
26418 assert_eq_m512(r, e);
26419 }
26420
26421 #[simd_test(enable = "avx512f")]
26422 unsafe fn test_mm512_maskz_fmadd_ps() {
26423 let a = _mm512_setr_ps(
26424 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
26425 );
26426 let b = _mm512_setr_ps(
26427 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
26428 );
26429 let c = _mm512_setr_ps(
26430 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
26431 );
26432 let r = _mm512_maskz_fmadd_ps(0, a, b, c);
26433 assert_eq_m512(r, _mm512_setzero_ps());
26434 let r = _mm512_maskz_fmadd_ps(0b00000000_11111111, a, b, c);
26435 let e = _mm512_setr_ps(
26436 1., 2., 3., 4., 5., 6., 7., 8., 0., 0., 0., 0., 0., 0., 0., 0.,
26437 );
26438 assert_eq_m512(r, e);
26439 }
26440
26441 #[simd_test(enable = "avx512f")]
26442 unsafe fn test_mm512_mask3_fmadd_ps() {
26443 let a = _mm512_setr_ps(
26444 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
26445 );
26446 let b = _mm512_setr_ps(
26447 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
26448 );
26449 let c = _mm512_setr_ps(
26450 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2.,
26451 );
26452 let r = _mm512_mask3_fmadd_ps(a, b, c, 0);
26453 assert_eq_m512(r, c);
26454 let r = _mm512_mask3_fmadd_ps(a, b, c, 0b00000000_11111111);
26455 let e = _mm512_setr_ps(
26456 1., 2., 3., 4., 5., 6., 7., 8., 2., 2., 2., 2., 2., 2., 2., 2.,
26457 );
26458 assert_eq_m512(r, e);
26459 }
26460
26461 #[simd_test(enable = "avx512f")]
26462 unsafe fn test_mm512_fmsub_ps() {
26463 let a = _mm512_setr_ps(
26464 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
26465 );
26466 let b = _mm512_setr_ps(
26467 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
26468 );
26469 let c = _mm512_setr_ps(
26470 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
26471 );
26472 let r = _mm512_fmsub_ps(a, b, c);
26473 let e = _mm512_setr_ps(
26474 -1., 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14.,
26475 );
26476 assert_eq_m512(r, e);
26477 }
26478
26479 #[simd_test(enable = "avx512f")]
26480 unsafe fn test_mm512_mask_fmsub_ps() {
26481 let a = _mm512_setr_ps(
26482 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
26483 );
26484 let b = _mm512_setr_ps(
26485 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
26486 );
26487 let c = _mm512_setr_ps(
26488 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
26489 );
26490 let r = _mm512_mask_fmsub_ps(a, 0, b, c);
26491 assert_eq_m512(r, a);
26492 let r = _mm512_mask_fmsub_ps(a, 0b00000000_11111111, b, c);
26493 let e = _mm512_setr_ps(
26494 -1., 0., 1., 2., 3., 4., 5., 6., 1., 1., 1., 1., 1., 1., 1., 1.,
26495 );
26496 assert_eq_m512(r, e);
26497 }
26498
26499 #[simd_test(enable = "avx512f")]
26500 unsafe fn test_mm512_maskz_fmsub_ps() {
26501 let a = _mm512_setr_ps(
26502 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
26503 );
26504 let b = _mm512_setr_ps(
26505 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
26506 );
26507 let c = _mm512_setr_ps(
26508 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
26509 );
26510 let r = _mm512_maskz_fmsub_ps(0, a, b, c);
26511 assert_eq_m512(r, _mm512_setzero_ps());
26512 let r = _mm512_maskz_fmsub_ps(0b00000000_11111111, a, b, c);
26513 let e = _mm512_setr_ps(
26514 -1., 0., 1., 2., 3., 4., 5., 6., 0., 0., 0., 0., 0., 0., 0., 0.,
26515 );
26516 assert_eq_m512(r, e);
26517 }
26518
26519 #[simd_test(enable = "avx512f")]
26520 unsafe fn test_mm512_mask3_fmsub_ps() {
26521 let a = _mm512_setr_ps(
26522 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
26523 );
26524 let b = _mm512_setr_ps(
26525 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
26526 );
26527 let c = _mm512_setr_ps(
26528 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2.,
26529 );
26530 let r = _mm512_mask3_fmsub_ps(a, b, c, 0);
26531 assert_eq_m512(r, c);
26532 let r = _mm512_mask3_fmsub_ps(a, b, c, 0b00000000_11111111);
26533 let e = _mm512_setr_ps(
26534 -1., 0., 1., 2., 3., 4., 5., 6., 2., 2., 2., 2., 2., 2., 2., 2.,
26535 );
26536 assert_eq_m512(r, e);
26537 }
26538
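// fmaddsub alternates per lane: even lanes compute a * b - c, odd lanes a * b + c, as the expected vector shows.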
26539 #[simd_test(enable = "avx512f")]
26540 unsafe fn test_mm512_fmaddsub_ps() {
26541 let a = _mm512_setr_ps(
26542 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
26543 );
26544 let b = _mm512_setr_ps(
26545 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
26546 );
26547 let c = _mm512_setr_ps(
26548 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
26549 );
26550 let r = _mm512_fmaddsub_ps(a, b, c);
26551 let e = _mm512_setr_ps(
26552 -1., 2., 1., 4., 3., 6., 5., 8., 7., 10., 9., 12., 11., 14., 13., 16.,
26553 );
26554 assert_eq_m512(r, e);
26555 }
26556
26557 #[simd_test(enable = "avx512f")]
26558 unsafe fn test_mm512_mask_fmaddsub_ps() {
26559 let a = _mm512_setr_ps(
26560 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
26561 );
26562 let b = _mm512_setr_ps(
26563 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
26564 );
26565 let c = _mm512_setr_ps(
26566 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
26567 );
26568 let r = _mm512_mask_fmaddsub_ps(a, 0, b, c);
26569 assert_eq_m512(r, a);
26570 let r = _mm512_mask_fmaddsub_ps(a, 0b00000000_11111111, b, c);
26571 let e = _mm512_setr_ps(
26572 -1., 2., 1., 4., 3., 6., 5., 8., 1., 1., 1., 1., 1., 1., 1., 1.,
26573 );
26574 assert_eq_m512(r, e);
26575 }
26576
26577 #[simd_test(enable = "avx512f")]
26578 unsafe fn test_mm512_maskz_fmaddsub_ps() {
26579 let a = _mm512_setr_ps(
26580 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
26581 );
26582 let b = _mm512_setr_ps(
26583 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
26584 );
26585 let c = _mm512_setr_ps(
26586 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
26587 );
26588 let r = _mm512_maskz_fmaddsub_ps(0, a, b, c);
26589 assert_eq_m512(r, _mm512_setzero_ps());
26590 let r = _mm512_maskz_fmaddsub_ps(0b00000000_11111111, a, b, c);
26591 let e = _mm512_setr_ps(
26592 -1., 2., 1., 4., 3., 6., 5., 8., 0., 0., 0., 0., 0., 0., 0., 0.,
26593 );
26594 assert_eq_m512(r, e);
26595 }
26596
26597 #[simd_test(enable = "avx512f")]
26598 unsafe fn test_mm512_mask3_fmaddsub_ps() {
26599 let a = _mm512_setr_ps(
26600 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
26601 );
26602 let b = _mm512_setr_ps(
26603 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
26604 );
26605 let c = _mm512_setr_ps(
26606 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2.,
26607 );
26608 let r = _mm512_mask3_fmaddsub_ps(a, b, c, 0);
26609 assert_eq_m512(r, c);
26610 let r = _mm512_mask3_fmaddsub_ps(a, b, c, 0b00000000_11111111);
26611 let e = _mm512_setr_ps(
26612 -1., 2., 1., 4., 3., 6., 5., 8., 2., 2., 2., 2., 2., 2., 2., 2.,
26613 );
26614 assert_eq_m512(r, e);
26615 }
26616
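// fmsubadd is the opposite interleave: even lanes compute a * b + c, odd lanes a * b - c.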
26617 #[simd_test(enable = "avx512f")]
26618 unsafe fn test_mm512_fmsubadd_ps() {
26619 let a = _mm512_setr_ps(
26620 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
26621 );
26622 let b = _mm512_setr_ps(
26623 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
26624 );
26625 let c = _mm512_setr_ps(
26626 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
26627 );
26628 let r = _mm512_fmsubadd_ps(a, b, c);
26629 let e = _mm512_setr_ps(
26630 1., 0., 3., 2., 5., 4., 7., 6., 9., 8., 11., 10., 13., 12., 15., 14.,
26631 );
26632 assert_eq_m512(r, e);
26633 }
26634
26635 #[simd_test(enable = "avx512f")]
26636 unsafe fn test_mm512_mask_fmsubadd_ps() {
26637 let a = _mm512_setr_ps(
26638 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
26639 );
26640 let b = _mm512_setr_ps(
26641 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
26642 );
26643 let c = _mm512_setr_ps(
26644 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
26645 );
26646 let r = _mm512_mask_fmsubadd_ps(a, 0, b, c);
26647 assert_eq_m512(r, a);
26648 let r = _mm512_mask_fmsubadd_ps(a, 0b00000000_11111111, b, c);
26649 let e = _mm512_setr_ps(
26650 1., 0., 3., 2., 5., 4., 7., 6., 1., 1., 1., 1., 1., 1., 1., 1.,
26651 );
26652 assert_eq_m512(r, e);
26653 }
26654
26655 #[simd_test(enable = "avx512f")]
26656 unsafe fn test_mm512_maskz_fmsubadd_ps() {
26657 let a = _mm512_setr_ps(
26658 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
26659 );
26660 let b = _mm512_setr_ps(
26661 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
26662 );
26663 let c = _mm512_setr_ps(
26664 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
26665 );
26666 let r = _mm512_maskz_fmsubadd_ps(0, a, b, c);
26667 assert_eq_m512(r, _mm512_setzero_ps());
26668 let r = _mm512_maskz_fmsubadd_ps(0b00000000_11111111, a, b, c);
26669 let e = _mm512_setr_ps(
26670 1., 0., 3., 2., 5., 4., 7., 6., 0., 0., 0., 0., 0., 0., 0., 0.,
26671 );
26672 assert_eq_m512(r, e);
26673 }
26674
26675 #[simd_test(enable = "avx512f")]
26676 unsafe fn test_mm512_mask3_fmsubadd_ps() {
26677 let a = _mm512_setr_ps(
26678 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
26679 );
26680 let b = _mm512_setr_ps(
26681 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
26682 );
26683 let c = _mm512_setr_ps(
26684 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2.,
26685 );
26686 let r = _mm512_mask3_fmsubadd_ps(a, b, c, 0);
26687 assert_eq_m512(r, c);
26688 let r = _mm512_mask3_fmsubadd_ps(a, b, c, 0b00000000_11111111);
26689 let e = _mm512_setr_ps(
26690 1., 0., 3., 2., 5., 4., 7., 6., 2., 2., 2., 2., 2., 2., 2., 2.,
26691 );
26692 assert_eq_m512(r, e);
26693 }
26694
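// fnmadd negates the product: each lane computes -(a * b) + c.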
26695 #[simd_test(enable = "avx512f")]
26696 unsafe fn test_mm512_fnmadd_ps() {
26697 let a = _mm512_setr_ps(
26698 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
26699 );
26700 let b = _mm512_setr_ps(
26701 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
26702 );
26703 let c = _mm512_setr_ps(
26704 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
26705 );
26706 let r = _mm512_fnmadd_ps(a, b, c);
26707 let e = _mm512_setr_ps(
26708 1., 0., -1., -2., -3., -4., -5., -6., -7., -8., -9., -10., -11., -12., -13., -14.,
26709 );
26710 assert_eq_m512(r, e);
26711 }
26712
26713 #[simd_test(enable = "avx512f")]
26714 unsafe fn test_mm512_mask_fnmadd_ps() {
26715 let a = _mm512_setr_ps(
26716 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
26717 );
26718 let b = _mm512_setr_ps(
26719 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
26720 );
26721 let c = _mm512_setr_ps(
26722 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
26723 );
26724 let r = _mm512_mask_fnmadd_ps(a, 0, b, c);
26725 assert_eq_m512(r, a);
26726 let r = _mm512_mask_fnmadd_ps(a, 0b00000000_11111111, b, c);
26727 let e = _mm512_setr_ps(
26728 1., 0., -1., -2., -3., -4., -5., -6., 1., 1., 1., 1., 1., 1., 1., 1.,
26729 );
26730 assert_eq_m512(r, e);
26731 }
26732
26733 #[simd_test(enable = "avx512f")]
26734 unsafe fn test_mm512_maskz_fnmadd_ps() {
26735 let a = _mm512_setr_ps(
26736 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
26737 );
26738 let b = _mm512_setr_ps(
26739 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
26740 );
26741 let c = _mm512_setr_ps(
26742 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
26743 );
26744 let r = _mm512_maskz_fnmadd_ps(0, a, b, c);
26745 assert_eq_m512(r, _mm512_setzero_ps());
26746 let r = _mm512_maskz_fnmadd_ps(0b00000000_11111111, a, b, c);
26747 let e = _mm512_setr_ps(
26748 1., 0., -1., -2., -3., -4., -5., -6., 0., 0., 0., 0., 0., 0., 0., 0.,
26749 );
26750 assert_eq_m512(r, e);
26751 }
26752
26753 #[simd_test(enable = "avx512f")]
26754 unsafe fn test_mm512_mask3_fnmadd_ps() {
26755 let a = _mm512_setr_ps(
26756 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
26757 );
26758 let b = _mm512_setr_ps(
26759 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
26760 );
26761 let c = _mm512_setr_ps(
26762 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2.,
26763 );
26764 let r = _mm512_mask3_fnmadd_ps(a, b, c, 0);
26765 assert_eq_m512(r, c);
26766 let r = _mm512_mask3_fnmadd_ps(a, b, c, 0b00000000_11111111);
26767 let e = _mm512_setr_ps(
26768 1., 0., -1., -2., -3., -4., -5., -6., 2., 2., 2., 2., 2., 2., 2., 2.,
26769 );
26770 assert_eq_m512(r, e);
26771 }
26772
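// fnmsub negates both terms: each lane computes -(a * b) - c.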
26773 #[simd_test(enable = "avx512f")]
26774 unsafe fn test_mm512_fnmsub_ps() {
26775 let a = _mm512_setr_ps(
26776 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
26777 );
26778 let b = _mm512_setr_ps(
26779 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
26780 );
26781 let c = _mm512_setr_ps(
26782 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
26783 );
26784 let r = _mm512_fnmsub_ps(a, b, c);
26785 let e = _mm512_setr_ps(
26786 -1., -2., -3., -4., -5., -6., -7., -8., -9., -10., -11., -12., -13., -14., -15., -16.,
26787 );
26788 assert_eq_m512(r, e);
26789 }
26790
26791 #[simd_test(enable = "avx512f")]
26792 unsafe fn test_mm512_mask_fnmsub_ps() {
26793 let a = _mm512_setr_ps(
26794 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
26795 );
26796 let b = _mm512_setr_ps(
26797 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
26798 );
26799 let c = _mm512_setr_ps(
26800 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
26801 );
26802 let r = _mm512_mask_fnmsub_ps(a, 0, b, c);
26803 assert_eq_m512(r, a);
26804 let r = _mm512_mask_fnmsub_ps(a, 0b00000000_11111111, b, c);
26805 let e = _mm512_setr_ps(
26806 -1., -2., -3., -4., -5., -6., -7., -8., 1., 1., 1., 1., 1., 1., 1., 1.,
26807 );
26808 assert_eq_m512(r, e);
26809 }
26810
26811 #[simd_test(enable = "avx512f")]
26812 unsafe fn test_mm512_maskz_fnmsub_ps() {
26813 let a = _mm512_setr_ps(
26814 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
26815 );
26816 let b = _mm512_setr_ps(
26817 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
26818 );
26819 let c = _mm512_setr_ps(
26820 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
26821 );
26822 let r = _mm512_maskz_fnmsub_ps(0, a, b, c);
26823 assert_eq_m512(r, _mm512_setzero_ps());
26824 let r = _mm512_maskz_fnmsub_ps(0b00000000_11111111, a, b, c);
26825 let e = _mm512_setr_ps(
26826 -1., -2., -3., -4., -5., -6., -7., -8., 0., 0., 0., 0., 0., 0., 0., 0.,
26827 );
26828 assert_eq_m512(r, e);
26829 }
26830
26831 #[simd_test(enable = "avx512f")]
26832 unsafe fn test_mm512_mask3_fnmsub_ps() {
26833 let a = _mm512_setr_ps(
26834 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
26835 );
26836 let b = _mm512_setr_ps(
26837 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
26838 );
26839 let c = _mm512_setr_ps(
26840 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2.,
26841 );
26842 let r = _mm512_mask3_fnmsub_ps(a, b, c, 0);
26843 assert_eq_m512(r, c);
26844 let r = _mm512_mask3_fnmsub_ps(a, b, c, 0b00000000_11111111);
26845 let e = _mm512_setr_ps(
26846 -1., -2., -3., -4., -5., -6., -7., -8., 2., 2., 2., 2., 2., 2., 2., 2.,
26847 );
26848 assert_eq_m512(r, e);
26849 }
26850
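// rcp14 is an approximate reciprocal (about 14 bits of precision), hence 0.33333206 rather than the correctly rounded 0.33333334.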
26851 #[simd_test(enable = "avx512f")]
26852 unsafe fn test_mm512_rcp14_ps() {
26853 let a = _mm512_set1_ps(3.);
26854 let r = _mm512_rcp14_ps(a);
26855 let e = _mm512_set1_ps(0.33333206);
26856 assert_eq_m512(r, e);
26857 }
26858
26859 #[simd_test(enable = "avx512f")]
26860 unsafe fn test_mm512_mask_rcp14_ps() {
26861 let a = _mm512_set1_ps(3.);
26862 let r = _mm512_mask_rcp14_ps(a, 0, a);
26863 assert_eq_m512(r, a);
26864 let r = _mm512_mask_rcp14_ps(a, 0b11111111_00000000, a);
26865 let e = _mm512_setr_ps(
26866 3., 3., 3., 3., 3., 3., 3., 3., 0.33333206, 0.33333206, 0.33333206, 0.33333206,
26867 0.33333206, 0.33333206, 0.33333206, 0.33333206,
26868 );
26869 assert_eq_m512(r, e);
26870 }
26871
26872 #[simd_test(enable = "avx512f")]
26873 unsafe fn test_mm512_maskz_rcp14_ps() {
26874 let a = _mm512_set1_ps(3.);
26875 let r = _mm512_maskz_rcp14_ps(0, a);
26876 assert_eq_m512(r, _mm512_setzero_ps());
26877 let r = _mm512_maskz_rcp14_ps(0b11111111_00000000, a);
26878 let e = _mm512_setr_ps(
26879 0., 0., 0., 0., 0., 0., 0., 0., 0.33333206, 0.33333206, 0.33333206, 0.33333206,
26880 0.33333206, 0.33333206, 0.33333206, 0.33333206,
26881 );
26882 assert_eq_m512(r, e);
26883 }
26884
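// rsqrt14 is the matching approximate reciprocal square root: 1/sqrt(3) is roughly 0.57735, returned here as 0.5773392.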
26885 #[simd_test(enable = "avx512f")]
26886 unsafe fn test_mm512_rsqrt14_ps() {
26887 let a = _mm512_set1_ps(3.);
26888 let r = _mm512_rsqrt14_ps(a);
26889 let e = _mm512_set1_ps(0.5773392);
26890 assert_eq_m512(r, e);
26891 }
26892
26893 #[simd_test(enable = "avx512f")]
26894 unsafe fn test_mm512_mask_rsqrt14_ps() {
26895 let a = _mm512_set1_ps(3.);
26896 let r = _mm512_mask_rsqrt14_ps(a, 0, a);
26897 assert_eq_m512(r, a);
26898 let r = _mm512_mask_rsqrt14_ps(a, 0b11111111_00000000, a);
26899 let e = _mm512_setr_ps(
26900 3., 3., 3., 3., 3., 3., 3., 3., 0.5773392, 0.5773392, 0.5773392, 0.5773392, 0.5773392,
26901 0.5773392, 0.5773392, 0.5773392,
26902 );
26903 assert_eq_m512(r, e);
26904 }
26905
26906 #[simd_test(enable = "avx512f")]
26907 unsafe fn test_mm512_maskz_rsqrt14_ps() {
26908 let a = _mm512_set1_ps(3.);
26909 let r = _mm512_maskz_rsqrt14_ps(0, a);
26910 assert_eq_m512(r, _mm512_setzero_ps());
26911 let r = _mm512_maskz_rsqrt14_ps(0b11111111_00000000, a);
26912 let e = _mm512_setr_ps(
26913 0., 0., 0., 0., 0., 0., 0., 0., 0.5773392, 0.5773392, 0.5773392, 0.5773392, 0.5773392,
26914 0.5773392, 0.5773392, 0.5773392,
26915 );
26916 assert_eq_m512(r, e);
26917 }
26918
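// getexp extracts floor(log2(|a|)) as a float: getexp(3.0) = 1.0.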
26919 #[simd_test(enable = "avx512f")]
26920 unsafe fn test_mm512_getexp_ps() {
26921 let a = _mm512_set1_ps(3.);
26922 let r = _mm512_getexp_ps(a);
26923 let e = _mm512_set1_ps(1.);
26924 assert_eq_m512(r, e);
26925 }
26926
26927 #[simd_test(enable = "avx512f")]
26928 unsafe fn test_mm512_mask_getexp_ps() {
26929 let a = _mm512_set1_ps(3.);
26930 let r = _mm512_mask_getexp_ps(a, 0, a);
26931 assert_eq_m512(r, a);
26932 let r = _mm512_mask_getexp_ps(a, 0b11111111_00000000, a);
26933 let e = _mm512_setr_ps(
26934 3., 3., 3., 3., 3., 3., 3., 3., 1., 1., 1., 1., 1., 1., 1., 1.,
26935 );
26936 assert_eq_m512(r, e);
26937 }
26938
26939 #[simd_test(enable = "avx512f")]
26940 unsafe fn test_mm512_maskz_getexp_ps() {
26941 let a = _mm512_set1_ps(3.);
26942 let r = _mm512_maskz_getexp_ps(0, a);
26943 assert_eq_m512(r, _mm512_setzero_ps());
26944 let r = _mm512_maskz_getexp_ps(0b11111111_00000000, a);
26945 let e = _mm512_setr_ps(
26946 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
26947 );
26948 assert_eq_m512(r, e);
26949 }
26950
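// roundscale with imm8 = 0 rounds to the nearest integer with no scaling, so 1.1 becomes 1.0.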
26951 #[simd_test(enable = "avx512f")]
26952 unsafe fn test_mm512_roundscale_ps() {
26953 let a = _mm512_set1_ps(1.1);
26954 let r = _mm512_roundscale_ps(a, 0);
26955 let e = _mm512_set1_ps(1.0);
26956 assert_eq_m512(r, e);
26957 }
26958
26959 #[simd_test(enable = "avx512f")]
26960 unsafe fn test_mm512_mask_roundscale_ps() {
26961 let a = _mm512_set1_ps(1.1);
26962 let r = _mm512_mask_roundscale_ps(a, 0, a, 0);
26963 let e = _mm512_set1_ps(1.1);
26964 assert_eq_m512(r, e);
26965 let r = _mm512_mask_roundscale_ps(a, 0b11111111_11111111, a, 0);
26966 let e = _mm512_set1_ps(1.0);
26967 assert_eq_m512(r, e);
26968 }
26969
26970 #[simd_test(enable = "avx512f")]
26971 unsafe fn test_mm512_maskz_roundscale_ps() {
26972 let a = _mm512_set1_ps(1.1);
26973 let r = _mm512_maskz_roundscale_ps(0, a, 0);
26974 assert_eq_m512(r, _mm512_setzero_ps());
26975 let r = _mm512_maskz_roundscale_ps(0b11111111_11111111, a, 0);
26976 let e = _mm512_set1_ps(1.0);
26977 assert_eq_m512(r, e);
26978 }
26979
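// scalef scales a by two raised to the power of b: 1.0 * 2^3.0 = 8.0.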
26980 #[simd_test(enable = "avx512f")]
26981 unsafe fn test_mm512_scalef_ps() {
26982 let a = _mm512_set1_ps(1.);
26983 let b = _mm512_set1_ps(3.);
26984 let r = _mm512_scalef_ps(a, b);
26985 let e = _mm512_set1_ps(8.);
26986 assert_eq_m512(r, e);
26987 }
26988
26989 #[simd_test(enable = "avx512f")]
26990 unsafe fn test_mm512_mask_scalef_ps() {
26991 let a = _mm512_set1_ps(1.);
26992 let b = _mm512_set1_ps(3.);
26993 let r = _mm512_mask_scalef_ps(a, 0, a, b);
26994 assert_eq_m512(r, a);
26995 let r = _mm512_mask_scalef_ps(a, 0b11111111_00000000, a, b);
26996 let e = _mm512_set_ps(
26997 8., 8., 8., 8., 8., 8., 8., 8., 1., 1., 1., 1., 1., 1., 1., 1.,
26998 );
26999 assert_eq_m512(r, e);
27000 }
27001
27002 #[simd_test(enable = "avx512f")]
27003 unsafe fn test_mm512_maskz_scalef_ps() {
27004 let a = _mm512_set1_ps(1.);
27005 let b = _mm512_set1_ps(3.);
27006 let r = _mm512_maskz_scalef_ps(0, a, b);
27007 assert_eq_m512(r, _mm512_setzero_ps());
27008 let r = _mm512_maskz_scalef_ps(0b11111111_00000000, a, b);
27009 let e = _mm512_set_ps(
27010 8., 8., 8., 8., 8., 8., 8., 8., 0., 0., 0., 0., 0., 0., 0., 0.,
27011 );
27012 assert_eq_m512(r, e);
27013 }
27014
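// fixupimm classifies each input and replaces it according to the token table passed in c;
// with the table used here the NaN inputs are fixed up to 0.0, as the assertions show.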
27015 #[simd_test(enable = "avx512f")]
27016 unsafe fn test_mm512_fixupimm_ps() {
27017 let a = _mm512_set1_ps(f32::NAN);
27018 let b = _mm512_set1_ps(f32::MAX);
27019 let c = _mm512_set1_epi32(i32::MAX);
27020 let r = _mm512_fixupimm_ps(a, b, c, 5);
27021 let e = _mm512_set1_ps(0.0);
27022 assert_eq_m512(r, e);
27023 }
27024
27025 #[simd_test(enable = "avx512f")]
27026 unsafe fn test_mm512_mask_fixupimm_ps() {
27027 let a = _mm512_set_ps(
27028 f32::NAN,
27029 f32::NAN,
27030 f32::NAN,
27031 f32::NAN,
27032 f32::NAN,
27033 f32::NAN,
27034 f32::NAN,
27035 f32::NAN,
27036 1.,
27037 1.,
27038 1.,
27039 1.,
27040 1.,
27041 1.,
27042 1.,
27043 1.,
27044 );
27045 let b = _mm512_set1_ps(f32::MAX);
27046 let c = _mm512_set1_epi32(i32::MAX);
27047 let r = _mm512_mask_fixupimm_ps(a, 0b11111111_00000000, b, c, 5);
27048 let e = _mm512_set_ps(
27049 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
27050 );
27051 assert_eq_m512(r, e);
27052 }
27053
27054 #[simd_test(enable = "avx512f")]
27055 unsafe fn test_mm512_maskz_fixupimm_ps() {
27056 let a = _mm512_set_ps(
27057 f32::NAN,
27058 f32::NAN,
27059 f32::NAN,
27060 f32::NAN,
27061 f32::NAN,
27062 f32::NAN,
27063 f32::NAN,
27064 f32::NAN,
27065 1.,
27066 1.,
27067 1.,
27068 1.,
27069 1.,
27070 1.,
27071 1.,
27072 1.,
27073 );
27074 let b = _mm512_set1_ps(f32::MAX);
27075 let c = _mm512_set1_epi32(i32::MAX);
27076 let r = _mm512_maskz_fixupimm_ps(0b11111111_00000000, a, b, c, 5);
27077 let e = _mm512_set_ps(
27078 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
27079 );
27080 assert_eq_m512(r, e);
27081 }
27082
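// ternarylogic evaluates an arbitrary three-input boolean function bitwise: imm8 is its 8-entry
// truth table. The table used here (8) evaluates to 0 for these operands.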
27083 #[simd_test(enable = "avx512f")]
27084 unsafe fn test_mm512_ternarylogic_epi32() {
27085 let a = _mm512_set1_epi32(1 << 2);
27086 let b = _mm512_set1_epi32(1 << 1);
27087 let c = _mm512_set1_epi32(1 << 0);
27088 let r = _mm512_ternarylogic_epi32(a, b, c, 8);
27089 let e = _mm512_set1_epi32(0);
27090 assert_eq_m512i(r, e);
27091 }
27092
27093 #[simd_test(enable = "avx512f")]
27094 unsafe fn test_mm512_mask_ternarylogic_epi32() {
27095 let src = _mm512_set1_epi32(1 << 2);
27096 let a = _mm512_set1_epi32(1 << 1);
27097 let b = _mm512_set1_epi32(1 << 0);
27098 let r = _mm512_mask_ternarylogic_epi32(src, 0, a, b, 8);
27099 assert_eq_m512i(r, src);
27100 let r = _mm512_mask_ternarylogic_epi32(src, 0b11111111_11111111, a, b, 8);
27101 let e = _mm512_set1_epi32(0);
27102 assert_eq_m512i(r, e);
27103 }
27104
27105 #[simd_test(enable = "avx512f")]
27106 unsafe fn test_mm512_maskz_ternarylogic_epi32() {
27107 let a = _mm512_set1_epi32(1 << 2);
27108 let b = _mm512_set1_epi32(1 << 1);
27109 let c = _mm512_set1_epi32(1 << 0);
27110 let r = _mm512_maskz_ternarylogic_epi32(0, a, b, c, 9);
27111 assert_eq_m512i(r, _mm512_setzero_si512());
27112 let r = _mm512_maskz_ternarylogic_epi32(0b11111111_11111111, a, b, c, 8);
27113 let e = _mm512_set1_epi32(0);
27114 assert_eq_m512i(r, e);
27115 }
27116
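// getmant normalizes the mantissa into the requested interval: 10.0 = 1.25 * 2^3,
// and 1.25 lies in both [0.75, 1.5) (_MM_MANT_NORM_P75_1P5) and [1.0, 2.0) (_MM_MANT_NORM_1_2).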
27117 #[simd_test(enable = "avx512f")]
27118 unsafe fn test_mm512_getmant_ps() {
27119 let a = _mm512_set1_ps(10.);
27120 let r = _mm512_getmant_ps(a, _MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN);
27121 let e = _mm512_set1_ps(1.25);
27122 assert_eq_m512(r, e);
27123 }
27124
27125 #[simd_test(enable = "avx512f")]
27126 unsafe fn test_mm512_mask_getmant_ps() {
27127 let a = _mm512_set1_ps(10.);
27128 let r = _mm512_mask_getmant_ps(a, 0, a, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC);
27129 assert_eq_m512(r, a);
27130 let r = _mm512_mask_getmant_ps(
27131 a,
27132 0b11111111_00000000,
27133 a,
27134 _MM_MANT_NORM_1_2,
27135 _MM_MANT_SIGN_SRC,
27136 );
27137 let e = _mm512_setr_ps(
27138 10., 10., 10., 10., 10., 10., 10., 10., 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 1.25,
27139 );
27140 assert_eq_m512(r, e);
27141 }
27142
27143 #[simd_test(enable = "avx512f")]
27144 unsafe fn test_mm512_maskz_getmant_ps() {
27145 let a = _mm512_set1_ps(10.);
27146 let r = _mm512_maskz_getmant_ps(0, a, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC);
27147 assert_eq_m512(r, _mm512_setzero_ps());
27148 let r =
27149 _mm512_maskz_getmant_ps(0b11111111_00000000, a, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC);
27150 let e = _mm512_setr_ps(
27151 0., 0., 0., 0., 0., 0., 0., 0., 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 1.25,
27152 );
27153 assert_eq_m512(r, e);
27154 }
27155
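// The *_round_* variants take an explicit rounding mode, and _MM_FROUND_NO_EXC suppresses
// floating-point exceptions. Rounding to nearest versus toward zero differs only in the
// inexact 0.00000007 + (-1.0) lane: -0.99999994 versus -0.9999999.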
27156 #[simd_test(enable = "avx512f")]
27157 unsafe fn test_mm512_add_round_ps() {
27158 let a = _mm512_setr_ps(
27159 0., 1.5, 2., 3.5, 4., 5.5, 6., 7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 0.00000007,
27160 );
27161 let b = _mm512_set1_ps(-1.);
27162 let r = _mm512_add_round_ps(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
27163 let e = _mm512_setr_ps(
27164 -1.,
27165 0.5,
27166 1.,
27167 2.5,
27168 3.,
27169 4.5,
27170 5.,
27171 6.5,
27172 7.,
27173 8.5,
27174 9.,
27175 10.5,
27176 11.,
27177 12.5,
27178 13.,
27179 -0.99999994,
27180 );
27181 assert_eq_m512(r, e);
27182 let r = _mm512_add_round_ps(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
27183 let e = _mm512_setr_ps(
27184 -1., 0.5, 1., 2.5, 3., 4.5, 5., 6.5, 7., 8.5, 9., 10.5, 11., 12.5, 13., -0.9999999,
27185 );
27186 assert_eq_m512(r, e);
27187 }
27188
27189 #[simd_test(enable = "avx512f")]
27190 unsafe fn test_mm512_mask_add_round_ps() {
27191 let a = _mm512_setr_ps(
27192 0., 1.5, 2., 3.5, 4., 5.5, 6., 7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 0.00000007,
27193 );
27194 let b = _mm512_set1_ps(-1.);
27195 let r = _mm512_mask_add_round_ps(a, 0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
27196 assert_eq_m512(r, a);
27197 let r = _mm512_mask_add_round_ps(
27198 a,
27199 0b11111111_00000000,
27200 a,
27201 b,
27202 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
27203 );
27204 let e = _mm512_setr_ps(
27205 0.,
27206 1.5,
27207 2.,
27208 3.5,
27209 4.,
27210 5.5,
27211 6.,
27212 7.5,
27213 7.,
27214 8.5,
27215 9.,
27216 10.5,
27217 11.,
27218 12.5,
27219 13.,
27220 -0.99999994,
27221 );
27222 assert_eq_m512(r, e);
27223 }
27224
27225 #[simd_test(enable = "avx512f")]
27226 unsafe fn test_mm512_maskz_add_round_ps() {
27227 let a = _mm512_setr_ps(
27228 0., 1.5, 2., 3.5, 4., 5.5, 6., 7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 0.00000007,
27229 );
27230 let b = _mm512_set1_ps(-1.);
27231 let r = _mm512_maskz_add_round_ps(0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
27232 assert_eq_m512(r, _mm512_setzero_ps());
27233 let r = _mm512_maskz_add_round_ps(
27234 0b11111111_00000000,
27235 a,
27236 b,
27237 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
27238 );
27239 let e = _mm512_setr_ps(
27240 0.,
27241 0.,
27242 0.,
27243 0.,
27244 0.,
27245 0.,
27246 0.,
27247 0.,
27248 7.,
27249 8.5,
27250 9.,
27251 10.5,
27252 11.,
27253 12.5,
27254 13.,
27255 -0.99999994,
27256 );
27257 assert_eq_m512(r, e);
27258 }
27259
27260 #[simd_test(enable = "avx512f")]
27261 unsafe fn test_mm512_sub_round_ps() {
27262 let a = _mm512_setr_ps(
27263 0., 1.5, 2., 3.5, 4., 5.5, 6., 7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 0.00000007,
27264 );
27265 let b = _mm512_set1_ps(1.);
27266 let r = _mm512_sub_round_ps(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
27267 let e = _mm512_setr_ps(
27268 -1.,
27269 0.5,
27270 1.,
27271 2.5,
27272 3.,
27273 4.5,
27274 5.,
27275 6.5,
27276 7.,
27277 8.5,
27278 9.,
27279 10.5,
27280 11.,
27281 12.5,
27282 13.,
27283 -0.99999994,
27284 );
27285 assert_eq_m512(r, e);
27286 let r = _mm512_sub_round_ps(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
27287 let e = _mm512_setr_ps(
27288 -1., 0.5, 1., 2.5, 3., 4.5, 5., 6.5, 7., 8.5, 9., 10.5, 11., 12.5, 13., -0.9999999,
27289 );
27290 assert_eq_m512(r, e);
27291 }
27292
27293 #[simd_test(enable = "avx512f")]
27294 unsafe fn test_mm512_mask_sub_round_ps() {
27295 let a = _mm512_setr_ps(
27296 0., 1.5, 2., 3.5, 4., 5.5, 6., 7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 0.00000007,
27297 );
27298 let b = _mm512_set1_ps(1.);
27299 let r = _mm512_mask_sub_round_ps(a, 0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
27300 assert_eq_m512(r, a);
27301 let r = _mm512_mask_sub_round_ps(
27302 a,
27303 0b11111111_00000000,
27304 a,
27305 b,
27306 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
27307 );
27308 let e = _mm512_setr_ps(
27309 0.,
27310 1.5,
27311 2.,
27312 3.5,
27313 4.,
27314 5.5,
27315 6.,
27316 7.5,
27317 7.,
27318 8.5,
27319 9.,
27320 10.5,
27321 11.,
27322 12.5,
27323 13.,
27324 -0.99999994,
27325 );
27326 assert_eq_m512(r, e);
27327 }
27328
27329 #[simd_test(enable = "avx512f")]
27330 unsafe fn test_mm512_maskz_sub_round_ps() {
27331 let a = _mm512_setr_ps(
27332 0., 1.5, 2., 3.5, 4., 5.5, 6., 7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 0.00000007,
27333 );
27334 let b = _mm512_set1_ps(1.);
27335 let r = _mm512_maskz_sub_round_ps(0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
27336 assert_eq_m512(r, _mm512_setzero_ps());
27337 let r = _mm512_maskz_sub_round_ps(
27338 0b11111111_00000000,
27339 a,
27340 b,
27341 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
27342 );
27343 let e = _mm512_setr_ps(
27344 0.,
27345 0.,
27346 0.,
27347 0.,
27348 0.,
27349 0.,
27350 0.,
27351 0.,
27352 7.,
27353 8.5,
27354 9.,
27355 10.5,
27356 11.,
27357 12.5,
27358 13.,
27359 -0.99999994,
27360 );
27361 assert_eq_m512(r, e);
27362 }
27363
27364 #[simd_test(enable = "avx512f")]
27365 unsafe fn test_mm512_mul_round_ps() {
27366 let a = _mm512_setr_ps(
27367 0.,
27368 1.5,
27369 2.,
27370 3.5,
27371 4.,
27372 5.5,
27373 6.,
27374 7.5,
27375 8.,
27376 9.5,
27377 10.,
27378 11.5,
27379 12.,
27380 13.5,
27381 14.,
27382 0.00000000000000000000007,
27383 );
27384 let b = _mm512_set1_ps(0.1);
27385 let r = _mm512_mul_round_ps(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
27386 let e = _mm512_setr_ps(
27387 0.,
27388 0.15,
27389 0.2,
27390 0.35,
27391 0.4,
27392 0.55,
27393 0.6,
27394 0.75,
27395 0.8,
27396 0.95,
27397 1.0,
27398 1.15,
27399 1.2,
27400 1.35,
27401 1.4,
27402 0.000000000000000000000007000001,
27403 );
27404 assert_eq_m512(r, e);
27405 let r = _mm512_mul_round_ps(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
27406 let e = _mm512_setr_ps(
27407 0.,
27408 0.14999999,
27409 0.2,
27410 0.35,
27411 0.4,
27412 0.54999995,
27413 0.59999996,
27414 0.75,
27415 0.8,
27416 0.95,
27417 1.0,
27418 1.15,
27419 1.1999999,
27420 1.3499999,
27421 1.4,
27422 0.000000000000000000000007,
27423 );
27424 assert_eq_m512(r, e);
27425 }
27426
27427 #[simd_test(enable = "avx512f")]
27428 unsafe fn test_mm512_mask_mul_round_ps() {
27429 let a = _mm512_setr_ps(
27430 0.,
27431 1.5,
27432 2.,
27433 3.5,
27434 4.,
27435 5.5,
27436 6.,
27437 7.5,
27438 8.,
27439 9.5,
27440 10.,
27441 11.5,
27442 12.,
27443 13.5,
27444 14.,
27445 0.00000000000000000000007,
27446 );
27447 let b = _mm512_set1_ps(0.1);
27448 let r = _mm512_mask_mul_round_ps(a, 0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
27449 assert_eq_m512(r, a);
27450 let r = _mm512_mask_mul_round_ps(
27451 a,
27452 0b11111111_00000000,
27453 a,
27454 b,
27455 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
27456 );
27457 let e = _mm512_setr_ps(
27458 0.,
27459 1.5,
27460 2.,
27461 3.5,
27462 4.,
27463 5.5,
27464 6.,
27465 7.5,
27466 0.8,
27467 0.95,
27468 1.0,
27469 1.15,
27470 1.2,
27471 1.35,
27472 1.4,
27473 0.000000000000000000000007000001,
27474 );
27475 assert_eq_m512(r, e);
27476 }
27477
27478 #[simd_test(enable = "avx512f")]
27479 unsafe fn test_mm512_maskz_mul_round_ps() {
27480 let a = _mm512_setr_ps(
27481 0.,
27482 1.5,
27483 2.,
27484 3.5,
27485 4.,
27486 5.5,
27487 6.,
27488 7.5,
27489 8.,
27490 9.5,
27491 10.,
27492 11.5,
27493 12.,
27494 13.5,
27495 14.,
27496 0.00000000000000000000007,
27497 );
27498 let b = _mm512_set1_ps(0.1);
27499 let r = _mm512_maskz_mul_round_ps(0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
27500 assert_eq_m512(r, _mm512_setzero_ps());
27501 let r = _mm512_maskz_mul_round_ps(
27502 0b11111111_00000000,
27503 a,
27504 b,
27505 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
27506 );
27507 let e = _mm512_setr_ps(
27508 0.,
27509 0.,
27510 0.,
27511 0.,
27512 0.,
27513 0.,
27514 0.,
27515 0.,
27516 0.8,
27517 0.95,
27518 1.0,
27519 1.15,
27520 1.2,
27521 1.35,
27522 1.4,
27523 0.000000000000000000000007000001,
27524 );
27525 assert_eq_m512(r, e);
27526 }
27527
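// 1.0 / 3.0 rounds up to 0.33333334 under round-to-nearest and truncates to 0.3333333 under round-toward-zero.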
27528 #[simd_test(enable = "avx512f")]
27529 unsafe fn test_mm512_div_round_ps() {
27530 let a = _mm512_set1_ps(1.);
27531 let b = _mm512_set1_ps(3.);
27532 let r = _mm512_div_round_ps(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
27533 let e = _mm512_set1_ps(0.33333334);
27534 assert_eq_m512(r, e);
27535 let r = _mm512_div_round_ps(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
27536 let e = _mm512_set1_ps(0.3333333);
27537 assert_eq_m512(r, e);
27538 }
27539
27540 #[simd_test(enable = "avx512f")]
27541 unsafe fn test_mm512_mask_div_round_ps() {
27542 let a = _mm512_set1_ps(1.);
27543 let b = _mm512_set1_ps(3.);
27544 let r = _mm512_mask_div_round_ps(a, 0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
27545 assert_eq_m512(r, a);
27546 let r = _mm512_mask_div_round_ps(
27547 a,
27548 0b11111111_00000000,
27549 a,
27550 b,
27551 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
27552 );
27553 let e = _mm512_setr_ps(
27554 1., 1., 1., 1., 1., 1., 1., 1., 0.33333334, 0.33333334, 0.33333334, 0.33333334,
27555 0.33333334, 0.33333334, 0.33333334, 0.33333334,
27556 );
27557 assert_eq_m512(r, e);
27558 }
27559
27560 #[simd_test(enable = "avx512f")]
27561 unsafe fn test_mm512_maskz_div_round_ps() {
27562 let a = _mm512_set1_ps(1.);
27563 let b = _mm512_set1_ps(3.);
27564 let r = _mm512_maskz_div_round_ps(0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
27565 assert_eq_m512(r, _mm512_setzero_ps());
27566 let r = _mm512_maskz_div_round_ps(
27567 0b11111111_00000000,
27568 a,
27569 b,
27570 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
27571 );
27572 let e = _mm512_setr_ps(
27573 0., 0., 0., 0., 0., 0., 0., 0., 0.33333334, 0.33333334, 0.33333334, 0.33333334,
27574 0.33333334, 0.33333334, 0.33333334, 0.33333334,
27575 );
27576 assert_eq_m512(r, e);
27577 }
27578
27579 #[simd_test(enable = "avx512f")]
27580 unsafe fn test_mm512_sqrt_round_ps() {
27581 let a = _mm512_set1_ps(3.);
27582 let r = _mm512_sqrt_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
27583 let e = _mm512_set1_ps(1.7320508);
27584 assert_eq_m512(r, e);
27585 let r = _mm512_sqrt_round_ps(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
27586 let e = _mm512_set1_ps(1.7320509);
27587 assert_eq_m512(r, e);
27588 }
27589
27590 #[simd_test(enable = "avx512f")]
27591 unsafe fn test_mm512_mask_sqrt_round_ps() {
27592 let a = _mm512_set1_ps(3.);
27593 let r = _mm512_mask_sqrt_round_ps(a, 0, a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
27594 assert_eq_m512(r, a);
27595 let r = _mm512_mask_sqrt_round_ps(
27596 a,
27597 0b11111111_00000000,
27598 a,
27599 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
27600 );
27601 let e = _mm512_setr_ps(
27602 3., 3., 3., 3., 3., 3., 3., 3., 1.7320508, 1.7320508, 1.7320508, 1.7320508, 1.7320508,
27603 1.7320508, 1.7320508, 1.7320508,
27604 );
27605 assert_eq_m512(r, e);
27606 }
27607
27608 #[simd_test(enable = "avx512f")]
27609 unsafe fn test_mm512_maskz_sqrt_round_ps() {
27610 let a = _mm512_set1_ps(3.);
27611 let r = _mm512_maskz_sqrt_round_ps(0, a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
27612 assert_eq_m512(r, _mm512_setzero_ps());
27613 let r = _mm512_maskz_sqrt_round_ps(
27614 0b11111111_00000000,
27615 a,
27616 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
27617 );
27618 let e = _mm512_setr_ps(
27619 0., 0., 0., 0., 0., 0., 0., 0., 1.7320508, 1.7320508, 1.7320508, 1.7320508, 1.7320508,
27620 1.7320508, 1.7320508, 1.7320508,
27621 );
27622 assert_eq_m512(r, e);
27623 }
27624
27625 #[simd_test(enable = "avx512f")]
27626 unsafe fn test_mm512_fmadd_round_ps() {
27627 let a = _mm512_set1_ps(0.00000007);
27628 let b = _mm512_set1_ps(1.);
27629 let c = _mm512_set1_ps(-1.);
27630 let r = _mm512_fmadd_round_ps(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
27631 let e = _mm512_set1_ps(-0.99999994);
27632 assert_eq_m512(r, e);
27633 let r = _mm512_fmadd_round_ps(a, b, c, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
27634 let e = _mm512_set1_ps(-0.9999999);
27635 assert_eq_m512(r, e);
27636 }
27637
27638 #[simd_test(enable = "avx512f")]
27639 unsafe fn test_mm512_mask_fmadd_round_ps() {
27640 let a = _mm512_set1_ps(0.00000007);
27641 let b = _mm512_set1_ps(1.);
27642 let c = _mm512_set1_ps(-1.);
27643 let r =
27644 _mm512_mask_fmadd_round_ps(a, 0, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
27645 assert_eq_m512(r, a);
27646 let r = _mm512_mask_fmadd_round_ps(
27647 a,
27648 0b00000000_11111111,
27649 b,
27650 c,
27651 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
27652 );
27653 let e = _mm512_setr_ps(
27654 -0.99999994,
27655 -0.99999994,
27656 -0.99999994,
27657 -0.99999994,
27658 -0.99999994,
27659 -0.99999994,
27660 -0.99999994,
27661 -0.99999994,
27662 0.00000007,
27663 0.00000007,
27664 0.00000007,
27665 0.00000007,
27666 0.00000007,
27667 0.00000007,
27668 0.00000007,
27669 0.00000007,
27670 );
27671 assert_eq_m512(r, e);
27672 }
27673
27674 #[simd_test(enable = "avx512f")]
27675 unsafe fn test_mm512_maskz_fmadd_round_ps() {
27676 let a = _mm512_set1_ps(0.00000007);
27677 let b = _mm512_set1_ps(1.);
27678 let c = _mm512_set1_ps(-1.);
27679 let r =
27680 _mm512_maskz_fmadd_round_ps(0, a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
27681 assert_eq_m512(r, _mm512_setzero_ps());
27682 let r = _mm512_maskz_fmadd_round_ps(
27683 0b00000000_11111111,
27684 a,
27685 b,
27686 c,
27687 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
27688 );
27689 let e = _mm512_setr_ps(
27690 -0.99999994,
27691 -0.99999994,
27692 -0.99999994,
27693 -0.99999994,
27694 -0.99999994,
27695 -0.99999994,
27696 -0.99999994,
27697 -0.99999994,
27698 0.,
27699 0.,
27700 0.,
27701 0.,
27702 0.,
27703 0.,
27704 0.,
27705 0.,
27706 );
27707 assert_eq_m512(r, e);
27708 }
27709
27710 #[simd_test(enable = "avx512f")]
27711 unsafe fn test_mm512_mask3_fmadd_round_ps() {
27712 let a = _mm512_set1_ps(0.00000007);
27713 let b = _mm512_set1_ps(1.);
27714 let c = _mm512_set1_ps(-1.);
27715 let r =
27716 _mm512_mask3_fmadd_round_ps(a, b, c, 0, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
27717 assert_eq_m512(r, c);
27718 let r = _mm512_mask3_fmadd_round_ps(
27719 a,
27720 b,
27721 c,
27722 0b00000000_11111111,
27723 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
27724 );
27725 let e = _mm512_setr_ps(
27726 -0.99999994,
27727 -0.99999994,
27728 -0.99999994,
27729 -0.99999994,
27730 -0.99999994,
27731 -0.99999994,
27732 -0.99999994,
27733 -0.99999994,
27734 -1.,
27735 -1.,
27736 -1.,
27737 -1.,
27738 -1.,
27739 -1.,
27740 -1.,
27741 -1.,
27742 );
27743 assert_eq_m512(r, e);
27744 }
27745
27746 #[simd_test(enable = "avx512f")]
27747 unsafe fn test_mm512_fmsub_round_ps() {
27748 let a = _mm512_set1_ps(0.00000007);
27749 let b = _mm512_set1_ps(1.);
27750 let c = _mm512_set1_ps(1.);
27751 let r = _mm512_fmsub_round_ps(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
27752 let e = _mm512_set1_ps(-0.99999994);
27753 assert_eq_m512(r, e);
27754 let r = _mm512_fmsub_round_ps(a, b, c, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
27755 let e = _mm512_set1_ps(-0.9999999);
27756 assert_eq_m512(r, e);
27757 }
27758
27759 #[simd_test(enable = "avx512f")]
27760 unsafe fn test_mm512_mask_fmsub_round_ps() {
27761 let a = _mm512_set1_ps(0.00000007);
27762 let b = _mm512_set1_ps(1.);
27763 let c = _mm512_set1_ps(1.);
27764 let r =
27765 _mm512_mask_fmsub_round_ps(a, 0, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
27766 assert_eq_m512(r, a);
27767 let r = _mm512_mask_fmsub_round_ps(
27768 a,
27769 0b00000000_11111111,
27770 b,
27771 c,
27772 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
27773 );
27774 let e = _mm512_setr_ps(
27775 -0.99999994,
27776 -0.99999994,
27777 -0.99999994,
27778 -0.99999994,
27779 -0.99999994,
27780 -0.99999994,
27781 -0.99999994,
27782 -0.99999994,
27783 0.00000007,
27784 0.00000007,
27785 0.00000007,
27786 0.00000007,
27787 0.00000007,
27788 0.00000007,
27789 0.00000007,
27790 0.00000007,
27791 );
27792 assert_eq_m512(r, e);
27793 }
27794
27795 #[simd_test(enable = "avx512f")]
27796 unsafe fn test_mm512_maskz_fmsub_round_ps() {
27797 let a = _mm512_set1_ps(0.00000007);
27798 let b = _mm512_set1_ps(1.);
27799 let c = _mm512_set1_ps(1.);
27800 let r =
27801 _mm512_maskz_fmsub_round_ps(0, a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
27802 assert_eq_m512(r, _mm512_setzero_ps());
27803 let r = _mm512_maskz_fmsub_round_ps(
27804 0b00000000_11111111,
27805 a,
27806 b,
27807 c,
27808 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
27809 );
27810 let e = _mm512_setr_ps(
27811 -0.99999994,
27812 -0.99999994,
27813 -0.99999994,
27814 -0.99999994,
27815 -0.99999994,
27816 -0.99999994,
27817 -0.99999994,
27818 -0.99999994,
27819 0.,
27820 0.,
27821 0.,
27822 0.,
27823 0.,
27824 0.,
27825 0.,
27826 0.,
27827 );
27828 assert_eq_m512(r, e);
27829 }
27830
27831 #[simd_test(enable = "avx512f")]
27832 unsafe fn test_mm512_mask3_fmsub_round_ps() {
27833 let a = _mm512_set1_ps(0.00000007);
27834 let b = _mm512_set1_ps(1.);
27835 let c = _mm512_set1_ps(1.);
27836 let r =
27837 _mm512_mask3_fmsub_round_ps(a, b, c, 0, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
27838 assert_eq_m512(r, c);
27839 let r = _mm512_mask3_fmsub_round_ps(
27840 a,
27841 b,
27842 c,
27843 0b00000000_11111111,
27844 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
27845 );
27846 let e = _mm512_setr_ps(
27847 -0.99999994,
27848 -0.99999994,
27849 -0.99999994,
27850 -0.99999994,
27851 -0.99999994,
27852 -0.99999994,
27853 -0.99999994,
27854 -0.99999994,
27855 1.,
27856 1.,
27857 1.,
27858 1.,
27859 1.,
27860 1.,
27861 1.,
27862 1.,
27863 );
27864 assert_eq_m512(r, e);
27865 }
27866
27867 #[simd_test(enable = "avx512f")]
27868 unsafe fn test_mm512_fmaddsub_round_ps() {
27869 let a = _mm512_set1_ps(0.00000007);
27870 let b = _mm512_set1_ps(1.);
27871 let c = _mm512_set1_ps(-1.);
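// fmaddsub subtracts `c` at even lane indices and adds it at odd ones, which is why the
// expected vector alternates 0.00000007 - (-1.) (rounded to 1.0000001) and
// 0.00000007 + (-1.) (rounded to -0.99999994).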
27872 let r = _mm512_fmaddsub_round_ps(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
27873 let e = _mm512_setr_ps(
27874 1.0000001,
27875 -0.99999994,
27876 1.0000001,
27877 -0.99999994,
27878 1.0000001,
27879 -0.99999994,
27880 1.0000001,
27881 -0.99999994,
27882 1.0000001,
27883 -0.99999994,
27884 1.0000001,
27885 -0.99999994,
27886 1.0000001,
27887 -0.99999994,
27888 1.0000001,
27889 -0.99999994,
27890 );
27891 assert_eq_m512(r, e);
27892 let r = _mm512_fmaddsub_round_ps(a, b, c, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
27893 let e = _mm512_setr_ps(
27894 1., -0.9999999, 1., -0.9999999, 1., -0.9999999, 1., -0.9999999, 1., -0.9999999, 1.,
27895 -0.9999999, 1., -0.9999999, 1., -0.9999999,
27896 );
27897 assert_eq_m512(r, e);
27898 }
27899
27900 #[simd_test(enable = "avx512f")]
27901 unsafe fn test_mm512_mask_fmaddsub_round_ps() {
27902 let a = _mm512_set1_ps(0.00000007);
27903 let b = _mm512_set1_ps(1.);
27904 let c = _mm512_set1_ps(-1.);
27905 let r = _mm512_mask_fmaddsub_round_ps(
27906 a,
27907 0,
27908 b,
27909 c,
27910 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
27911 );
27912 assert_eq_m512(r, a);
27913 let r = _mm512_mask_fmaddsub_round_ps(
27914 a,
27915 0b00000000_11111111,
27916 b,
27917 c,
27918 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
27919 );
27920 let e = _mm512_setr_ps(
27921 1.0000001,
27922 -0.99999994,
27923 1.0000001,
27924 -0.99999994,
27925 1.0000001,
27926 -0.99999994,
27927 1.0000001,
27928 -0.99999994,
27929 0.00000007,
27930 0.00000007,
27931 0.00000007,
27932 0.00000007,
27933 0.00000007,
27934 0.00000007,
27935 0.00000007,
27936 0.00000007,
27937 );
27938 assert_eq_m512(r, e);
27939 }
27940
27941 #[simd_test(enable = "avx512f")]
27942 unsafe fn test_mm512_maskz_fmaddsub_round_ps() {
27943 let a = _mm512_set1_ps(0.00000007);
27944 let b = _mm512_set1_ps(1.);
27945 let c = _mm512_set1_ps(-1.);
27946 let r = _mm512_maskz_fmaddsub_round_ps(
27947 0,
27948 a,
27949 b,
27950 c,
27951 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
27952 );
27953 assert_eq_m512(r, _mm512_setzero_ps());
27954 let r = _mm512_maskz_fmaddsub_round_ps(
27955 0b00000000_11111111,
27956 a,
27957 b,
27958 c,
27959 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
27960 );
27961 let e = _mm512_setr_ps(
27962 1.0000001,
27963 -0.99999994,
27964 1.0000001,
27965 -0.99999994,
27966 1.0000001,
27967 -0.99999994,
27968 1.0000001,
27969 -0.99999994,
27970 0.,
27971 0.,
27972 0.,
27973 0.,
27974 0.,
27975 0.,
27976 0.,
27977 0.,
27978 );
27979 assert_eq_m512(r, e);
27980 }
27981
27982 #[simd_test(enable = "avx512f")]
27983 unsafe fn test_mm512_mask3_fmaddsub_round_ps() {
27984 let a = _mm512_set1_ps(0.00000007);
27985 let b = _mm512_set1_ps(1.);
27986 let c = _mm512_set1_ps(-1.);
27987 let r = _mm512_mask3_fmaddsub_round_ps(
27988 a,
27989 b,
27990 c,
27991 0,
27992 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
27993 );
27994 assert_eq_m512(r, c);
27995 let r = _mm512_mask3_fmaddsub_round_ps(
27996 a,
27997 b,
27998 c,
27999 0b00000000_11111111,
28000 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
28001 );
28002 let e = _mm512_setr_ps(
28003 1.0000001,
28004 -0.99999994,
28005 1.0000001,
28006 -0.99999994,
28007 1.0000001,
28008 -0.99999994,
28009 1.0000001,
28010 -0.99999994,
28011 -1.,
28012 -1.,
28013 -1.,
28014 -1.,
28015 -1.,
28016 -1.,
28017 -1.,
28018 -1.,
28019 );
28020 assert_eq_m512(r, e);
28021 }
28022
28023 #[simd_test(enable = "avx512f")]
28024 unsafe fn test_mm512_fmsubadd_round_ps() {
28025 let a = _mm512_set1_ps(0.00000007);
28026 let b = _mm512_set1_ps(1.);
28027 let c = _mm512_set1_ps(-1.);
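// fmsubadd mirrors fmaddsub: `c` is added at even lane indices and subtracted at odd ones,
// so the alternating pattern starts with -0.99999994 instead of 1.0000001.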
28028 let r = _mm512_fmsubadd_round_ps(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
28029 let e = _mm512_setr_ps(
28030 -0.99999994,
28031 1.0000001,
28032 -0.99999994,
28033 1.0000001,
28034 -0.99999994,
28035 1.0000001,
28036 -0.99999994,
28037 1.0000001,
28038 -0.99999994,
28039 1.0000001,
28040 -0.99999994,
28041 1.0000001,
28042 -0.99999994,
28043 1.0000001,
28044 -0.99999994,
28045 1.0000001,
28046 );
28047 assert_eq_m512(r, e);
28048 let r = _mm512_fmsubadd_round_ps(a, b, c, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
28049 let e = _mm512_setr_ps(
28050 -0.9999999, 1., -0.9999999, 1., -0.9999999, 1., -0.9999999, 1., -0.9999999, 1.,
28051 -0.9999999, 1., -0.9999999, 1., -0.9999999, 1.,
28052 );
28053 assert_eq_m512(r, e);
28054 }
28055
28056 #[simd_test(enable = "avx512f")]
28057 unsafe fn test_mm512_mask_fmsubadd_round_ps() {
28058 let a = _mm512_set1_ps(0.00000007);
28059 let b = _mm512_set1_ps(1.);
28060 let c = _mm512_set1_ps(-1.);
28061 let r = _mm512_mask_fmsubadd_round_ps(
28062 a,
28063 0,
28064 b,
28065 c,
28066 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
28067 );
28068 assert_eq_m512(r, a);
28069 let r = _mm512_mask_fmsubadd_round_ps(
28070 a,
28071 0b00000000_11111111,
28072 b,
28073 c,
28074 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
28075 );
28076 let e = _mm512_setr_ps(
28077 -0.99999994,
28078 1.0000001,
28079 -0.99999994,
28080 1.0000001,
28081 -0.99999994,
28082 1.0000001,
28083 -0.99999994,
28084 1.0000001,
28085 0.00000007,
28086 0.00000007,
28087 0.00000007,
28088 0.00000007,
28089 0.00000007,
28090 0.00000007,
28091 0.00000007,
28092 0.00000007,
28093 );
28094 assert_eq_m512(r, e);
28095 }
28096
28097 #[simd_test(enable = "avx512f")]
28098 unsafe fn test_mm512_maskz_fmsubadd_round_ps() {
28099 let a = _mm512_set1_ps(0.00000007);
28100 let b = _mm512_set1_ps(1.);
28101 let c = _mm512_set1_ps(-1.);
28102 let r = _mm512_maskz_fmsubadd_round_ps(
28103 0,
28104 a,
28105 b,
28106 c,
28107 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
28108 );
28109 assert_eq_m512(r, _mm512_setzero_ps());
28110 let r = _mm512_maskz_fmsubadd_round_ps(
28111 0b00000000_11111111,
28112 a,
28113 b,
28114 c,
28115 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
28116 );
28117 let e = _mm512_setr_ps(
28118 -0.99999994,
28119 1.0000001,
28120 -0.99999994,
28121 1.0000001,
28122 -0.99999994,
28123 1.0000001,
28124 -0.99999994,
28125 1.0000001,
28126 0.,
28127 0.,
28128 0.,
28129 0.,
28130 0.,
28131 0.,
28132 0.,
28133 0.,
28134 );
28135 assert_eq_m512(r, e);
28136 }
28137
28138 #[simd_test(enable = "avx512f")]
28139 unsafe fn test_mm512_mask3_fmsubadd_round_ps() {
28140 let a = _mm512_set1_ps(0.00000007);
28141 let b = _mm512_set1_ps(1.);
28142 let c = _mm512_set1_ps(-1.);
28143 let r = _mm512_mask3_fmsubadd_round_ps(
28144 a,
28145 b,
28146 c,
28147 0,
28148 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
28149 );
28150 assert_eq_m512(r, c);
28151 let r = _mm512_mask3_fmsubadd_round_ps(
28152 a,
28153 b,
28154 c,
28155 0b00000000_11111111,
28156 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
28157 );
28158 let e = _mm512_setr_ps(
28159 -0.99999994,
28160 1.0000001,
28161 -0.99999994,
28162 1.0000001,
28163 -0.99999994,
28164 1.0000001,
28165 -0.99999994,
28166 1.0000001,
28167 -1.,
28168 -1.,
28169 -1.,
28170 -1.,
28171 -1.,
28172 -1.,
28173 -1.,
28174 -1.,
28175 );
28176 assert_eq_m512(r, e);
28177 }
28178
28179 #[simd_test(enable = "avx512f")]
28180 unsafe fn test_mm512_fnmadd_round_ps() {
28181 let a = _mm512_set1_ps(0.00000007);
28182 let b = _mm512_set1_ps(1.);
28183 let c = _mm512_set1_ps(1.);
28184 let r = _mm512_fnmadd_round_ps(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
28185 let e = _mm512_set1_ps(0.99999994);
28186 assert_eq_m512(r, e);
28187 let r = _mm512_fnmadd_round_ps(a, b, c, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
28188 let e = _mm512_set1_ps(0.9999999);
28189 assert_eq_m512(r, e);
28190 }
28191
28192 #[simd_test(enable = "avx512f")]
28193 unsafe fn test_mm512_mask_fnmadd_round_ps() {
28194 let a = _mm512_set1_ps(0.00000007);
28195 let b = _mm512_set1_ps(1.);
28196 let c = _mm512_set1_ps(1.);
28197 let r =
28198 _mm512_mask_fnmadd_round_ps(a, 0, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
28199 assert_eq_m512(r, a);
28200 let r = _mm512_mask_fnmadd_round_ps(
28201 a,
28202 0b00000000_11111111,
28203 b,
28204 c,
28205 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
28206 );
28207 let e = _mm512_setr_ps(
28208 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994,
28209 0.99999994, 0.00000007, 0.00000007, 0.00000007, 0.00000007, 0.00000007, 0.00000007,
28210 0.00000007, 0.00000007,
28211 );
28212 assert_eq_m512(r, e);
28213 }
28214
28215 #[simd_test(enable = "avx512f")]
28216 unsafe fn test_mm512_maskz_fnmadd_round_ps() {
28217 let a = _mm512_set1_ps(0.00000007);
28218 let b = _mm512_set1_ps(1.);
28219 let c = _mm512_set1_ps(1.);
28220 let r =
28221 _mm512_maskz_fnmadd_round_ps(0, a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
28222 assert_eq_m512(r, _mm512_setzero_ps());
28223 let r = _mm512_maskz_fnmadd_round_ps(
28224 0b00000000_11111111,
28225 a,
28226 b,
28227 c,
28228 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
28229 );
28230 let e = _mm512_setr_ps(
28231 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994,
28232 0.99999994, 0., 0., 0., 0., 0., 0., 0., 0.,
28233 );
28234 assert_eq_m512(r, e);
28235 }
28236
28237 #[simd_test(enable = "avx512f")]
28238 unsafe fn test_mm512_mask3_fnmadd_round_ps() {
28239 let a = _mm512_set1_ps(0.00000007);
28240 let b = _mm512_set1_ps(1.);
28241 let c = _mm512_set1_ps(1.);
28242 let r =
28243 _mm512_mask3_fnmadd_round_ps(a, b, c, 0, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
28244 assert_eq_m512(r, c);
28245 let r = _mm512_mask3_fnmadd_round_ps(
28246 a,
28247 b,
28248 c,
28249 0b00000000_11111111,
28250 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
28251 );
28252 let e = _mm512_setr_ps(
28253 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994,
28254 0.99999994, 1., 1., 1., 1., 1., 1., 1., 1.,
28255 );
28256 assert_eq_m512(r, e);
28257 }
28258
28259 #[simd_test(enable = "avx512f")]
28260 unsafe fn test_mm512_fnmsub_round_ps() {
28261 let a = _mm512_set1_ps(0.00000007);
28262 let b = _mm512_set1_ps(1.);
28263 let c = _mm512_set1_ps(-1.);
28264 let r = _mm512_fnmsub_round_ps(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
28265 let e = _mm512_set1_ps(0.99999994);
28266 assert_eq_m512(r, e);
28267 let r = _mm512_fnmsub_round_ps(a, b, c, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
28268 let e = _mm512_set1_ps(0.9999999);
28269 assert_eq_m512(r, e);
28270 }
28271
28272 #[simd_test(enable = "avx512f")]
28273 unsafe fn test_mm512_mask_fnmsub_round_ps() {
28274 let a = _mm512_set1_ps(0.00000007);
28275 let b = _mm512_set1_ps(1.);
28276 let c = _mm512_set1_ps(-1.);
28277 let r =
28278 _mm512_mask_fnmsub_round_ps(a, 0, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
28279 assert_eq_m512(r, a);
28280 let r = _mm512_mask_fnmsub_round_ps(
28281 a,
28282 0b00000000_11111111,
28283 b,
28284 c,
28285 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
28286 );
28287 let e = _mm512_setr_ps(
28288 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994,
28289 0.99999994, 0.00000007, 0.00000007, 0.00000007, 0.00000007, 0.00000007, 0.00000007,
28290 0.00000007, 0.00000007,
28291 );
28292 assert_eq_m512(r, e);
28293 }
28294
28295 #[simd_test(enable = "avx512f")]
28296 unsafe fn test_mm512_maskz_fnmsub_round_ps() {
28297 let a = _mm512_set1_ps(0.00000007);
28298 let b = _mm512_set1_ps(1.);
28299 let c = _mm512_set1_ps(-1.);
28300 let r =
28301 _mm512_maskz_fnmsub_round_ps(0, a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
28302 assert_eq_m512(r, _mm512_setzero_ps());
28303 let r = _mm512_maskz_fnmsub_round_ps(
28304 0b00000000_11111111,
28305 a,
28306 b,
28307 c,
28308 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
28309 );
28310 let e = _mm512_setr_ps(
28311 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994,
28312 0.99999994, 0., 0., 0., 0., 0., 0., 0., 0.,
28313 );
28314 assert_eq_m512(r, e);
28315 }
28316
28317 #[simd_test(enable = "avx512f")]
28318 unsafe fn test_mm512_mask3_fnmsub_round_ps() {
28319 let a = _mm512_set1_ps(0.00000007);
28320 let b = _mm512_set1_ps(1.);
28321 let c = _mm512_set1_ps(-1.);
28322 let r =
28323 _mm512_mask3_fnmsub_round_ps(a, b, c, 0, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
28324 assert_eq_m512(r, c);
28325 let r = _mm512_mask3_fnmsub_round_ps(
28326 a,
28327 b,
28328 c,
28329 0b00000000_11111111,
28330 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
28331 );
28332 let e = _mm512_setr_ps(
28333 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994, 0.99999994,
28334 0.99999994, -1., -1., -1., -1., -1., -1., -1., -1.,
28335 );
28336 assert_eq_m512(r, e);
28337 }
28338
28339 #[simd_test(enable = "avx512f")]
28340 unsafe fn test_mm512_max_round_ps() {
28341 let a = _mm512_setr_ps(
28342 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
28343 );
28344 let b = _mm512_setr_ps(
28345 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
28346 );
28347 let r = _mm512_max_round_ps(a, b, _MM_FROUND_CUR_DIRECTION);
28348 let e = _mm512_setr_ps(
28349 15., 14., 13., 12., 11., 10., 9., 8., 8., 9., 10., 11., 12., 13., 14., 15.,
28350 );
28351 assert_eq_m512(r, e);
28352 }
28353
28354 #[simd_test(enable = "avx512f")]
28355 unsafe fn test_mm512_mask_max_round_ps() {
28356 let a = _mm512_setr_ps(
28357 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
28358 );
28359 let b = _mm512_setr_ps(
28360 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
28361 );
28362 let r = _mm512_mask_max_round_ps(a, 0, a, b, _MM_FROUND_CUR_DIRECTION);
28363 assert_eq_m512(r, a);
28364 let r = _mm512_mask_max_round_ps(a, 0b00000000_11111111, a, b, _MM_FROUND_CUR_DIRECTION);
28365 let e = _mm512_setr_ps(
28366 15., 14., 13., 12., 11., 10., 9., 8., 8., 9., 10., 11., 12., 13., 14., 15.,
28367 );
28368 assert_eq_m512(r, e);
28369 }
28370
28371 #[simd_test(enable = "avx512f")]
28372 unsafe fn test_mm512_maskz_max_round_ps() {
28373 let a = _mm512_setr_ps(
28374 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
28375 );
28376 let b = _mm512_setr_ps(
28377 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
28378 );
28379 let r = _mm512_maskz_max_round_ps(0, a, b, _MM_FROUND_CUR_DIRECTION);
28380 assert_eq_m512(r, _mm512_setzero_ps());
28381 let r = _mm512_maskz_max_round_ps(0b00000000_11111111, a, b, _MM_FROUND_CUR_DIRECTION);
28382 let e = _mm512_setr_ps(
28383 15., 14., 13., 12., 11., 10., 9., 8., 0., 0., 0., 0., 0., 0., 0., 0.,
28384 );
28385 assert_eq_m512(r, e);
28386 }
28387
28388 #[simd_test(enable = "avx512f")]
28389 unsafe fn test_mm512_min_round_ps() {
28390 let a = _mm512_setr_ps(
28391 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
28392 );
28393 let b = _mm512_setr_ps(
28394 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
28395 );
28396 let r = _mm512_min_round_ps(a, b, _MM_FROUND_CUR_DIRECTION);
28397 let e = _mm512_setr_ps(
28398 0., 1., 2., 3., 4., 5., 6., 7., 7., 6., 5., 4., 3., 2., 1., 0.,
28399 );
28400 assert_eq_m512(r, e);
28401 }
28402
28403 #[simd_test(enable = "avx512f")]
28404 unsafe fn test_mm512_mask_min_round_ps() {
28405 let a = _mm512_setr_ps(
28406 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
28407 );
28408 let b = _mm512_setr_ps(
28409 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
28410 );
28411 let r = _mm512_mask_min_round_ps(a, 0, a, b, _MM_FROUND_CUR_DIRECTION);
28412 assert_eq_m512(r, a);
28413 let r = _mm512_mask_min_round_ps(a, 0b00000000_11111111, a, b, _MM_FROUND_CUR_DIRECTION);
28414 let e = _mm512_setr_ps(
28415 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
28416 );
28417 assert_eq_m512(r, e);
28418 }
28419
28420 #[simd_test(enable = "avx512f")]
28421 unsafe fn test_mm512_maskz_min_round_ps() {
28422 let a = _mm512_setr_ps(
28423 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
28424 );
28425 let b = _mm512_setr_ps(
28426 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
28427 );
28428 let r = _mm512_maskz_min_round_ps(0, a, b, _MM_FROUND_CUR_DIRECTION);
28429 assert_eq_m512(r, _mm512_setzero_ps());
28430 let r = _mm512_maskz_min_round_ps(0b00000000_11111111, a, b, _MM_FROUND_CUR_DIRECTION);
28431 let e = _mm512_setr_ps(
28432 0., 1., 2., 3., 4., 5., 6., 7., 0., 0., 0., 0., 0., 0., 0., 0.,
28433 );
28434 assert_eq_m512(r, e);
28435 }
28436
28437 #[simd_test(enable = "avx512f")]
28438 unsafe fn test_mm512_getexp_round_ps() {
28439 let a = _mm512_set1_ps(3.);
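// getexp extracts the unbiased base-2 exponent as a float: 3.0 = 1.5 * 2^1, hence 1.0.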
28440 let r = _mm512_getexp_round_ps(a, _MM_FROUND_CUR_DIRECTION);
28441 let e = _mm512_set1_ps(1.);
28442 assert_eq_m512(r, e);
28443 let r = _mm512_getexp_round_ps(a, _MM_FROUND_NO_EXC);
28444 let e = _mm512_set1_ps(1.);
28445 assert_eq_m512(r, e);
28446 }
28447
28448 #[simd_test(enable = "avx512f")]
28449 unsafe fn test_mm512_mask_getexp_round_ps() {
28450 let a = _mm512_set1_ps(3.);
28451 let r = _mm512_mask_getexp_round_ps(a, 0, a, _MM_FROUND_CUR_DIRECTION);
28452 assert_eq_m512(r, a);
28453 let r = _mm512_mask_getexp_round_ps(a, 0b11111111_00000000, a, _MM_FROUND_CUR_DIRECTION);
28454 let e = _mm512_setr_ps(
28455 3., 3., 3., 3., 3., 3., 3., 3., 1., 1., 1., 1., 1., 1., 1., 1.,
28456 );
28457 assert_eq_m512(r, e);
28458 }
28459
28460 #[simd_test(enable = "avx512f")]
28461 unsafe fn test_mm512_maskz_getexp_round_ps() {
28462 let a = _mm512_set1_ps(3.);
28463 let r = _mm512_maskz_getexp_round_ps(0, a, _MM_FROUND_CUR_DIRECTION);
28464 assert_eq_m512(r, _mm512_setzero_ps());
28465 let r = _mm512_maskz_getexp_round_ps(0b11111111_00000000, a, _MM_FROUND_CUR_DIRECTION);
28466 let e = _mm512_setr_ps(
28467 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
28468 );
28469 assert_eq_m512(r, e);
28470 }
28471
28472 #[simd_test(enable = "avx512f")]
28473 unsafe fn test_mm512_roundscale_round_ps() {
28474 let a = _mm512_set1_ps(1.1);
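// An imm8 of 0 requests rounding to whole numbers (granularity 2^0), so 1.1 rounds to 1.0.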
28475 let r = _mm512_roundscale_round_ps(a, 0, _MM_FROUND_CUR_DIRECTION);
28476 let e = _mm512_set1_ps(1.0);
28477 assert_eq_m512(r, e);
28478 }
28479
28480 #[simd_test(enable = "avx512f")]
28481 unsafe fn test_mm512_mask_roundscale_round_ps() {
28482 let a = _mm512_set1_ps(1.1);
28483 let r = _mm512_mask_roundscale_round_ps(a, 0, a, 0, _MM_FROUND_CUR_DIRECTION);
28484 let e = _mm512_set1_ps(1.1);
28485 assert_eq_m512(r, e);
28486 let r =
28487 _mm512_mask_roundscale_round_ps(a, 0b11111111_11111111, a, 0, _MM_FROUND_CUR_DIRECTION);
28488 let e = _mm512_set1_ps(1.0);
28489 assert_eq_m512(r, e);
28490 }
28491
28492 #[simd_test(enable = "avx512f")]
28493 unsafe fn test_mm512_maskz_roundscale_round_ps() {
28494 let a = _mm512_set1_ps(1.1);
28495 let r = _mm512_maskz_roundscale_round_ps(0, a, 0, _MM_FROUND_CUR_DIRECTION);
28496 assert_eq_m512(r, _mm512_setzero_ps());
28497 let r =
28498 _mm512_maskz_roundscale_round_ps(0b11111111_11111111, a, 0, _MM_FROUND_CUR_DIRECTION);
28499 let e = _mm512_set1_ps(1.0);
28500 assert_eq_m512(r, e);
28501 }
28502
28503 #[simd_test(enable = "avx512f")]
28504 unsafe fn test_mm512_scalef_round_ps() {
28505 let a = _mm512_set1_ps(1.);
28506 let b = _mm512_set1_ps(3.);
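// scalef multiplies each lane of `a` by 2 raised to the floor of `b`: 1.0 * 2^3 = 8.0.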
28507 let r = _mm512_scalef_round_ps(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
28508 let e = _mm512_set1_ps(8.);
28509 assert_eq_m512(r, e);
28510 }
28511
28512 #[simd_test(enable = "avx512f")]
28513 unsafe fn test_mm512_mask_scalef_round_ps() {
28514 let a = _mm512_set1_ps(1.);
28515 let b = _mm512_set1_ps(3.);
28516 let r =
28517 _mm512_mask_scalef_round_ps(a, 0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
28518 assert_eq_m512(r, a);
28519 let r = _mm512_mask_scalef_round_ps(
28520 a,
28521 0b11111111_00000000,
28522 a,
28523 b,
28524 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
28525 );
28526 let e = _mm512_set_ps(
28527 8., 8., 8., 8., 8., 8., 8., 8., 1., 1., 1., 1., 1., 1., 1., 1.,
28528 );
28529 assert_eq_m512(r, e);
28530 }
28531
28532 #[simd_test(enable = "avx512f")]
28533 unsafe fn test_mm512_maskz_scalef_round_ps() {
28534 let a = _mm512_set1_ps(1.);
28535 let b = _mm512_set1_ps(3.);
28536 let r =
28537 _mm512_maskz_scalef_round_ps(0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
28538 assert_eq_m512(r, _mm512_setzero_ps());
28539 let r = _mm512_maskz_scalef_round_ps(
28540 0b11111111_00000000,
28541 a,
28542 b,
28543 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
28544 );
28545 let e = _mm512_set_ps(
28546 8., 8., 8., 8., 8., 8., 8., 8., 0., 0., 0., 0., 0., 0., 0., 0.,
28547 );
28548 assert_eq_m512(r, e);
28549 }
28550
28551 #[simd_test(enable = "avx512f")]
28552 unsafe fn test_mm512_fixupimm_round_ps() {
28553 let a = _mm512_set1_ps(f32::NAN);
28554 let b = _mm512_set1_ps(f32::MAX);
28555 let c = _mm512_set1_epi32(i32::MAX);
28556 let r = _mm512_fixupimm_round_ps(a, b, c, 5, _MM_FROUND_CUR_DIRECTION);
28557 let e = _mm512_set1_ps(0.0);
28558 assert_eq_m512(r, e);
28559 }
28560
28561 #[simd_test(enable = "avx512f")]
28562 unsafe fn test_mm512_mask_fixupimm_round_ps() {
28563 let a = _mm512_set_ps(
28564 f32::NAN,
28565 f32::NAN,
28566 f32::NAN,
28567 f32::NAN,
28568 f32::NAN,
28569 f32::NAN,
28570 f32::NAN,
28571 f32::NAN,
28572 1.,
28573 1.,
28574 1.,
28575 1.,
28576 1.,
28577 1.,
28578 1.,
28579 1.,
28580 );
28581 let b = _mm512_set1_ps(f32::MAX);
28582 let c = _mm512_set1_epi32(i32::MAX);
28583 let r = _mm512_mask_fixupimm_round_ps(
28584 a,
28585 0b11111111_00000000,
28586 b,
28587 c,
28588 5,
28589 _MM_FROUND_CUR_DIRECTION,
28590 );
28591 let e = _mm512_set_ps(
28592 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
28593 );
28594 assert_eq_m512(r, e);
28595 }
28596
28597 #[simd_test(enable = "avx512f")]
28598 unsafe fn test_mm512_maskz_fixupimm_round_ps() {
28599 let a = _mm512_set_ps(
28600 f32::NAN,
28601 f32::NAN,
28602 f32::NAN,
28603 f32::NAN,
28604 f32::NAN,
28605 f32::NAN,
28606 f32::NAN,
28607 f32::NAN,
28608 1.,
28609 1.,
28610 1.,
28611 1.,
28612 1.,
28613 1.,
28614 1.,
28615 1.,
28616 );
28617 let b = _mm512_set1_ps(f32::MAX);
28618 let c = _mm512_set1_epi32(i32::MAX);
28619 let r = _mm512_maskz_fixupimm_round_ps(
28620 0b11111111_00000000,
28621 a,
28622 b,
28623 c,
28624 5,
28625 _MM_FROUND_CUR_DIRECTION,
28626 );
28627 let e = _mm512_set_ps(
28628 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
28629 );
28630 assert_eq_m512(r, e);
28631 }
28632
28633 #[simd_test(enable = "avx512f")]
28634 unsafe fn test_mm512_getmant_round_ps() {
28635 let a = _mm512_set1_ps(10.);
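// With _MM_MANT_NORM_1_2 the mantissa is normalized into [1, 2): 10.0 = 1.25 * 2^3,
// so every lane yields 1.25.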
28636 let r = _mm512_getmant_round_ps(
28637 a,
28638 _MM_MANT_NORM_1_2,
28639 _MM_MANT_SIGN_SRC,
28640 _MM_FROUND_CUR_DIRECTION,
28641 );
28642 let e = _mm512_set1_ps(1.25);
28643 assert_eq_m512(r, e);
28644 }
28645
28646 #[simd_test(enable = "avx512f")]
28647 unsafe fn test_mm512_mask_getmant_round_ps() {
28648 let a = _mm512_set1_ps(10.);
28649 let r = _mm512_mask_getmant_round_ps(
28650 a,
28651 0,
28652 a,
28653 _MM_MANT_NORM_1_2,
28654 _MM_MANT_SIGN_SRC,
28655 _MM_FROUND_CUR_DIRECTION,
28656 );
28657 assert_eq_m512(r, a);
28658 let r = _mm512_mask_getmant_round_ps(
28659 a,
28660 0b11111111_00000000,
28661 a,
28662 _MM_MANT_NORM_1_2,
28663 _MM_MANT_SIGN_SRC,
28664 _MM_FROUND_CUR_DIRECTION,
28665 );
28666 let e = _mm512_setr_ps(
28667 10., 10., 10., 10., 10., 10., 10., 10., 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 1.25,
28668 );
28669 assert_eq_m512(r, e);
28670 }
28671
28672 #[simd_test(enable = "avx512f")]
28673 unsafe fn test_mm512_maskz_getmant_round_ps() {
28674 let a = _mm512_set1_ps(10.);
28675 let r = _mm512_maskz_getmant_round_ps(
28676 0,
28677 a,
28678 _MM_MANT_NORM_1_2,
28679 _MM_MANT_SIGN_SRC,
28680 _MM_FROUND_CUR_DIRECTION,
28681 );
28682 assert_eq_m512(r, _mm512_setzero_ps());
28683 let r = _mm512_maskz_getmant_round_ps(
28684 0b11111111_00000000,
28685 a,
28686 _MM_MANT_NORM_1_2,
28687 _MM_MANT_SIGN_SRC,
28688 _MM_FROUND_CUR_DIRECTION,
28689 );
28690 let e = _mm512_setr_ps(
28691 0., 0., 0., 0., 0., 0., 0., 0., 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 1.25,
28692 );
28693 assert_eq_m512(r, e);
28694 }
28695
28696 #[simd_test(enable = "avx512f")]
28697 unsafe fn test_mm512_cvtps_epi32() {
28698 let a = _mm512_setr_ps(
28699 0., -1.4, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
28700 );
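// The default rounding mode is nearest with ties to even, so the .5 inputs land on the even
// neighbour: -3.5 -> -4, 9.5 -> 10, 11.5 -> 12, and so on.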
28701 let r = _mm512_cvtps_epi32(a);
28702 let e = _mm512_setr_epi32(0, -1, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16);
28703 assert_eq_m512i(r, e);
28704 }
28705
28706 #[simd_test(enable = "avx512f")]
28707 unsafe fn test_mm512_mask_cvtps_epi32() {
28708 let a = _mm512_setr_ps(
28709 0., -1.4, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
28710 );
28711 let src = _mm512_set1_epi32(0);
28712 let r = _mm512_mask_cvtps_epi32(src, 0, a);
28713 assert_eq_m512i(r, src);
28714 let r = _mm512_mask_cvtps_epi32(src, 0b00000000_11111111, a);
28715 let e = _mm512_setr_epi32(0, -1, 2, -4, 4, -6, 6, -8, 0, 0, 0, 0, 0, 0, 0, 0);
28716 assert_eq_m512i(r, e);
28717 }
28718
28719 #[simd_test(enable = "avx512f")]
28720 unsafe fn test_mm512_maskz_cvtps_epi32() {
28721 let a = _mm512_setr_ps(
28722 0., -1.4, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
28723 );
28724 let r = _mm512_maskz_cvtps_epi32(0, a);
28725 assert_eq_m512i(r, _mm512_setzero_si512());
28726 let r = _mm512_maskz_cvtps_epi32(0b00000000_11111111, a);
28727 let e = _mm512_setr_epi32(0, -1, 2, -4, 4, -6, 6, -8, 0, 0, 0, 0, 0, 0, 0, 0);
28728 assert_eq_m512i(r, e);
28729 }
28730
28731 #[simd_test(enable = "avx512f")]
28732 unsafe fn test_mm512_cvtps_epu32() {
28733 let a = _mm512_setr_ps(
28734 0., -1.4, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
28735 );
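// Negative inputs are out of range for an unsigned conversion and come back as u32::MAX
// (0xFFFF_FFFF), which shows up as -1 through the signed `_mm512_setr_epi32` expectation.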
28736 let r = _mm512_cvtps_epu32(a);
28737 let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 10, 10, 12, 12, 14, 14, 16);
28738 assert_eq_m512i(r, e);
28739 }
28740
28741 #[simd_test(enable = "avx512f")]
28742 unsafe fn test_mm512_mask_cvtps_epu32() {
28743 let a = _mm512_setr_ps(
28744 0., -1.4, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
28745 );
28746 let src = _mm512_set1_epi32(0);
28747 let r = _mm512_mask_cvtps_epu32(src, 0, a);
28748 assert_eq_m512i(r, src);
28749 let r = _mm512_mask_cvtps_epu32(src, 0b00000000_11111111, a);
28750 let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0);
28751 assert_eq_m512i(r, e);
28752 }
28753
28754 #[simd_test(enable = "avx512f")]
28755 unsafe fn test_mm512_maskz_cvtps_epu32() {
28756 let a = _mm512_setr_ps(
28757 0., -1.4, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
28758 );
28759 let r = _mm512_maskz_cvtps_epu32(0, a);
28760 assert_eq_m512i(r, _mm512_setzero_si512());
28761 let r = _mm512_maskz_cvtps_epu32(0b00000000_11111111, a);
28762 let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0);
28763 assert_eq_m512i(r, e);
28764 }
28765
28766 #[simd_test(enable = "avx512f")]
28767 unsafe fn test_mm512_cvtepi8_epi32() {
28768 let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
28769 let r = _mm512_cvtepi8_epi32(a);
28770 let e = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
28771 assert_eq_m512i(r, e);
28772 }
28773
28774 #[simd_test(enable = "avx512f")]
28775 unsafe fn test_mm512_mask_cvtepi8_epi32() {
28776 let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
28777 let src = _mm512_set1_epi32(-1);
28778 let r = _mm512_mask_cvtepi8_epi32(src, 0, a);
28779 assert_eq_m512i(r, src);
28780 let r = _mm512_mask_cvtepi8_epi32(src, 0b00000000_11111111, a);
28781 let e = _mm512_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15);
28782 assert_eq_m512i(r, e);
28783 }
28784
28785 #[simd_test(enable = "avx512f")]
28786 unsafe fn test_mm512_maskz_cvtepi8_epi32() {
28787 let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
28788 let r = _mm512_maskz_cvtepi8_epi32(0, a);
28789 assert_eq_m512i(r, _mm512_setzero_si512());
28790 let r = _mm512_maskz_cvtepi8_epi32(0b00000000_11111111, a);
28791 let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15);
28792 assert_eq_m512i(r, e);
28793 }
28794
28795 #[simd_test(enable = "avx512f")]
28796 unsafe fn test_mm512_cvtepu8_epi32() {
28797 let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
28798 let r = _mm512_cvtepu8_epi32(a);
28799 let e = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
28800 assert_eq_m512i(r, e);
28801 }
28802
28803 #[simd_test(enable = "avx512f")]
28804 unsafe fn test_mm512_mask_cvtepu8_epi32() {
28805 let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
28806 let src = _mm512_set1_epi32(-1);
28807 let r = _mm512_mask_cvtepu8_epi32(src, 0, a);
28808 assert_eq_m512i(r, src);
28809 let r = _mm512_mask_cvtepu8_epi32(src, 0b00000000_11111111, a);
28810 let e = _mm512_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15);
28811 assert_eq_m512i(r, e);
28812 }
28813
28814 #[simd_test(enable = "avx512f")]
28815 unsafe fn test_mm512_maskz_cvtepu8_epi32() {
28816 let a = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
28817 let r = _mm512_maskz_cvtepu8_epi32(0, a);
28818 assert_eq_m512i(r, _mm512_setzero_si512());
28819 let r = _mm512_maskz_cvtepu8_epi32(0b00000000_11111111, a);
28820 let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15);
28821 assert_eq_m512i(r, e);
28822 }
28823
28824 #[simd_test(enable = "avx512f")]
28825 unsafe fn test_mm512_cvtepi16_epi32() {
28826 let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
28827 let r = _mm512_cvtepi16_epi32(a);
28828 let e = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
28829 assert_eq_m512i(r, e);
28830 }
28831
28832 #[simd_test(enable = "avx512f")]
28833 unsafe fn test_mm512_mask_cvtepi16_epi32() {
28834 let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
28835 let src = _mm512_set1_epi32(-1);
28836 let r = _mm512_mask_cvtepi16_epi32(src, 0, a);
28837 assert_eq_m512i(r, src);
28838 let r = _mm512_mask_cvtepi16_epi32(src, 0b00000000_11111111, a);
28839 let e = _mm512_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15);
28840 assert_eq_m512i(r, e);
28841 }
28842
28843 #[simd_test(enable = "avx512f")]
28844 unsafe fn test_mm512_maskz_cvtepi16_epi32() {
28845 let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
28846 let r = _mm512_maskz_cvtepi16_epi32(0, a);
28847 assert_eq_m512i(r, _mm512_setzero_si512());
28848 let r = _mm512_maskz_cvtepi16_epi32(0b00000000_11111111, a);
28849 let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15);
28850 assert_eq_m512i(r, e);
28851 }
28852
28853 #[simd_test(enable = "avx512f")]
28854 unsafe fn test_mm512_cvtepu16_epi32() {
28855 let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
28856 let r = _mm512_cvtepu16_epi32(a);
28857 let e = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
28858 assert_eq_m512i(r, e);
28859 }
28860
28861 #[simd_test(enable = "avx512f")]
28862 unsafe fn test_mm512_mask_cvtepu16_epi32() {
28863 let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
28864 let src = _mm512_set1_epi32(-1);
28865 let r = _mm512_mask_cvtepu16_epi32(src, 0, a);
28866 assert_eq_m512i(r, src);
28867 let r = _mm512_mask_cvtepu16_epi32(src, 0b00000000_11111111, a);
28868 let e = _mm512_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15);
28869 assert_eq_m512i(r, e);
28870 }
28871
28872 #[simd_test(enable = "avx512f")]
28873 unsafe fn test_mm512_maskz_cvtepu16_epi32() {
28874 let a = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
28875 let r = _mm512_maskz_cvtepu16_epi32(0, a);
28876 assert_eq_m512i(r, _mm512_setzero_si512());
28877 let r = _mm512_maskz_cvtepu16_epi32(0b00000000_11111111, a);
28878 let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15);
28879 assert_eq_m512i(r, e);
28880 }
28881
28882 #[simd_test(enable = "avx512f")]
28883 unsafe fn test_mm512_cvtepi32_ps() {
28884 let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
28885 let r = _mm512_cvtepi32_ps(a);
28886 let e = _mm512_set_ps(
28887 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
28888 );
28889 assert_eq_m512(r, e);
28890 }
28891
28892 #[simd_test(enable = "avx512f")]
28893 unsafe fn test_mm512_mask_cvtepi32_ps() {
28894 let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
28895 let src = _mm512_set1_ps(-1.);
28896 let r = _mm512_mask_cvtepi32_ps(src, 0, a);
28897 assert_eq_m512(r, src);
28898 let r = _mm512_mask_cvtepi32_ps(src, 0b00000000_11111111, a);
28899 let e = _mm512_set_ps(
28900 -1., -1., -1., -1., -1., -1., -1., -1., 8., 9., 10., 11., 12., 13., 14., 15.,
28901 );
28902 assert_eq_m512(r, e);
28903 }
28904
28905 #[simd_test(enable = "avx512f")]
28906 unsafe fn test_mm512_maskz_cvtepi32_ps() {
28907 let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
28908 let r = _mm512_maskz_cvtepi32_ps(0, a);
28909 assert_eq_m512(r, _mm512_setzero_ps());
28910 let r = _mm512_maskz_cvtepi32_ps(0b00000000_11111111, a);
28911 let e = _mm512_set_ps(
28912 0., 0., 0., 0., 0., 0., 0., 0., 8., 9., 10., 11., 12., 13., 14., 15.,
28913 );
28914 assert_eq_m512(r, e);
28915 }
28916
28917 #[simd_test(enable = "avx512f")]
28918 unsafe fn test_mm512_cvtepu32_ps() {
28919 let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
28920 let r = _mm512_cvtepu32_ps(a);
28921 let e = _mm512_set_ps(
28922 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
28923 );
28924 assert_eq_m512(r, e);
28925 }
28926
28927 #[simd_test(enable = "avx512f")]
28928 unsafe fn test_mm512_mask_cvtepu32_ps() {
28929 let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
28930 let src = _mm512_set1_ps(-1.);
28931 let r = _mm512_mask_cvtepu32_ps(src, 0, a);
28932 assert_eq_m512(r, src);
28933 let r = _mm512_mask_cvtepu32_ps(src, 0b00000000_11111111, a);
28934 let e = _mm512_set_ps(
28935 -1., -1., -1., -1., -1., -1., -1., -1., 8., 9., 10., 11., 12., 13., 14., 15.,
28936 );
28937 assert_eq_m512(r, e);
28938 }
28939
28940 #[simd_test(enable = "avx512f")]
28941 unsafe fn test_mm512_maskz_cvtepu32_ps() {
28942 let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
28943 let r = _mm512_maskz_cvtepu32_ps(0, a);
28944 assert_eq_m512(r, _mm512_setzero_ps());
28945 let r = _mm512_maskz_cvtepu32_ps(0b00000000_11111111, a);
28946 let e = _mm512_set_ps(
28947 0., 0., 0., 0., 0., 0., 0., 0., 8., 9., 10., 11., 12., 13., 14., 15.,
28948 );
28949 assert_eq_m512(r, e);
28950 }
28951
28952 #[simd_test(enable = "avx512f")]
28953 unsafe fn test_mm512_cvtepi32_epi16() {
28954 let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
28955 let r = _mm512_cvtepi32_epi16(a);
28956 let e = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
28957 assert_eq_m256i(r, e);
28958 }
28959
28960 #[simd_test(enable = "avx512f")]
28961 unsafe fn test_mm512_mask_cvtepi32_epi16() {
28962 let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
28963 let src = _mm256_set1_epi16(-1);
28964 let r = _mm512_mask_cvtepi32_epi16(src, 0, a);
28965 assert_eq_m256i(r, src);
28966 let r = _mm512_mask_cvtepi32_epi16(src, 0b00000000_11111111, a);
28967 let e = _mm256_set_epi16(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15);
28968 assert_eq_m256i(r, e);
28969 }
28970
28971 #[simd_test(enable = "avx512f")]
28972 unsafe fn test_mm512_maskz_cvtepi32_epi16() {
28973 let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
28974 let r = _mm512_maskz_cvtepi32_epi16(0, a);
28975 assert_eq_m256i(r, _mm256_setzero_si256());
28976 let r = _mm512_maskz_cvtepi32_epi16(0b00000000_11111111, a);
28977 let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15);
28978 assert_eq_m256i(r, e);
28979 }
28980
28981 #[simd_test(enable = "avx512f")]
28982 unsafe fn test_mm512_cvtepi32_epi8() {
28983 let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
28984 let r = _mm512_cvtepi32_epi8(a);
28985 let e = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
28986 assert_eq_m128i(r, e);
28987 }
28988
28989 #[simd_test(enable = "avx512f")]
28990 unsafe fn test_mm512_mask_cvtepi32_epi8() {
28991 let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
28992 let src = _mm_set1_epi8(-1);
28993 let r = _mm512_mask_cvtepi32_epi8(src, 0, a);
28994 assert_eq_m128i(r, src);
28995 let r = _mm512_mask_cvtepi32_epi8(src, 0b00000000_11111111, a);
28996 let e = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15);
28997 assert_eq_m128i(r, e);
28998 }
28999
29000 #[simd_test(enable = "avx512f")]
29001 unsafe fn test_mm512_maskz_cvtepi32_epi8() {
29002 let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
29003 let r = _mm512_maskz_cvtepi32_epi8(0, a);
29004 assert_eq_m128i(r, _mm_setzero_si128());
29005 let r = _mm512_maskz_cvtepi32_epi8(0b00000000_11111111, a);
29006 let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15);
29007 assert_eq_m128i(r, e);
29008 }
29009
29010 #[simd_test(enable = "avx512f")]
29011 unsafe fn test_mm512_cvtsepi32_epi16() {
29012 let a = _mm512_set_epi32(
29013 0,
29014 1,
29015 2,
29016 3,
29017 4,
29018 5,
29019 6,
29020 7,
29021 8,
29022 9,
29023 10,
29024 11,
29025 12,
29026 13,
29027 i32::MIN,
29028 i32::MAX,
29029 );
29030 let r = _mm512_cvtsepi32_epi16(a);
29031 let e = _mm256_set_epi16(
29032 0,
29033 1,
29034 2,
29035 3,
29036 4,
29037 5,
29038 6,
29039 7,
29040 8,
29041 9,
29042 10,
29043 11,
29044 12,
29045 13,
29046 i16::MIN,
29047 i16::MAX,
29048 );
29049 assert_eq_m256i(r, e);
29050 }
29051
29052 #[simd_test(enable = "avx512f")]
29053 unsafe fn test_mm512_mask_cvtsepi32_epi16() {
29054 let a = _mm512_set_epi32(
29055 0,
29056 1,
29057 2,
29058 3,
29059 4,
29060 5,
29061 6,
29062 7,
29063 8,
29064 9,
29065 10,
29066 11,
29067 12,
29068 13,
29069 i32::MIN,
29070 i32::MAX,
29071 );
29072 let src = _mm256_set1_epi16(-1);
29073 let r = _mm512_mask_cvtsepi32_epi16(src, 0, a);
29074 assert_eq_m256i(r, src);
29075 let r = _mm512_mask_cvtsepi32_epi16(src, 0b00000000_11111111, a);
29076 let e = _mm256_set_epi16(
29077 -1,
29078 -1,
29079 -1,
29080 -1,
29081 -1,
29082 -1,
29083 -1,
29084 -1,
29085 8,
29086 9,
29087 10,
29088 11,
29089 12,
29090 13,
29091 i16::MIN,
29092 i16::MAX,
29093 );
29094 assert_eq_m256i(r, e);
29095 }
29096
29097 #[simd_test(enable = "avx512f")]
29098 unsafe fn test_mm512_maskz_cvtsepi32_epi16() {
29099 let a = _mm512_set_epi32(
29100 0,
29101 1,
29102 2,
29103 3,
29104 4,
29105 5,
29106 6,
29107 7,
29108 8,
29109 9,
29110 10,
29111 11,
29112 12,
29113 13,
29114 i32::MIN,
29115 i32::MAX,
29116 );
29117 let r = _mm512_maskz_cvtsepi32_epi16(0, a);
29118 assert_eq_m256i(r, _mm256_setzero_si256());
29119 let r = _mm512_maskz_cvtsepi32_epi16(0b00000000_11111111, a);
29120 let e = _mm256_set_epi16(
29121 0,
29122 0,
29123 0,
29124 0,
29125 0,
29126 0,
29127 0,
29128 0,
29129 8,
29130 9,
29131 10,
29132 11,
29133 12,
29134 13,
29135 i16::MIN,
29136 i16::MAX,
29137 );
29138 assert_eq_m256i(r, e);
29139 }
29140
29141 #[simd_test(enable = "avx512f")]
29142 unsafe fn test_mm512_cvtsepi32_epi8() {
29143 let a = _mm512_set_epi32(
29144 0,
29145 1,
29146 2,
29147 3,
29148 4,
29149 5,
29150 6,
29151 7,
29152 8,
29153 9,
29154 10,
29155 11,
29156 12,
29157 13,
29158 i32::MIN,
29159 i32::MAX,
29160 );
29161 let r = _mm512_cvtsepi32_epi8(a);
29162 let e = _mm_set_epi8(
29163 0,
29164 1,
29165 2,
29166 3,
29167 4,
29168 5,
29169 6,
29170 7,
29171 8,
29172 9,
29173 10,
29174 11,
29175 12,
29176 13,
29177 i8::MIN,
29178 i8::MAX,
29179 );
29180 assert_eq_m128i(r, e);
29181 }
29182
29183 #[simd_test(enable = "avx512f")]
29184 unsafe fn test_mm512_mask_cvtsepi32_epi8() {
29185 let a = _mm512_set_epi32(
29186 0,
29187 1,
29188 2,
29189 3,
29190 4,
29191 5,
29192 6,
29193 7,
29194 8,
29195 9,
29196 10,
29197 11,
29198 12,
29199 13,
29200 i32::MIN,
29201 i32::MAX,
29202 );
29203 let src = _mm_set1_epi8(-1);
29204 let r = _mm512_mask_cvtsepi32_epi8(src, 0, a);
29205 assert_eq_m128i(r, src);
29206 let r = _mm512_mask_cvtsepi32_epi8(src, 0b00000000_11111111, a);
29207 let e = _mm_set_epi8(
29208 -1,
29209 -1,
29210 -1,
29211 -1,
29212 -1,
29213 -1,
29214 -1,
29215 -1,
29216 8,
29217 9,
29218 10,
29219 11,
29220 12,
29221 13,
29222 i8::MIN,
29223 i8::MAX,
29224 );
29225 assert_eq_m128i(r, e);
29226 }
29227
29228 #[simd_test(enable = "avx512f")]
29229 unsafe fn test_mm512_maskz_cvtsepi32_epi8() {
29230 let a = _mm512_set_epi32(
29231 0,
29232 1,
29233 2,
29234 3,
29235 4,
29236 5,
29237 6,
29238 7,
29239 8,
29240 9,
29241 10,
29242 11,
29243 12,
29244 13,
29245 i32::MIN,
29246 i32::MAX,
29247 );
29248 let r = _mm512_maskz_cvtsepi32_epi8(0, a);
29249 assert_eq_m128i(r, _mm_setzero_si128());
29250 let r = _mm512_maskz_cvtsepi32_epi8(0b00000000_11111111, a);
29251 let e = _mm_set_epi8(
29252 0,
29253 0,
29254 0,
29255 0,
29256 0,
29257 0,
29258 0,
29259 0,
29260 8,
29261 9,
29262 10,
29263 11,
29264 12,
29265 13,
29266 i8::MIN,
29267 i8::MAX,
29268 );
29269 assert_eq_m128i(r, e);
29270 }
29271
29272 #[simd_test(enable = "avx512f")]
29273 unsafe fn test_mm512_cvtusepi32_epi16() {
29274 let a = _mm512_set_epi32(
29275 0,
29276 1,
29277 2,
29278 3,
29279 4,
29280 5,
29281 6,
29282 7,
29283 8,
29284 9,
29285 10,
29286 11,
29287 12,
29288 13,
29289 i32::MIN,
29290 i32::MIN,
29291 );
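// Unsigned saturation treats i32::MIN as the u32 value 2_147_483_648, which saturates to
// u16::MAX; the trailing -1 values in the expectation are that all-ones bit pattern.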
29292 let r = _mm512_cvtusepi32_epi16(a);
29293 let e = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1);
29294 assert_eq_m256i(r, e);
29295 }
29296
29297 #[simd_test(enable = "avx512f")]
29298 unsafe fn test_mm512_mask_cvtusepi32_epi16() {
29299 let a = _mm512_set_epi32(
29300 0,
29301 1,
29302 2,
29303 3,
29304 4,
29305 5,
29306 6,
29307 7,
29308 8,
29309 9,
29310 10,
29311 11,
29312 12,
29313 13,
29314 i32::MIN,
29315 i32::MIN,
29316 );
29317 let src = _mm256_set1_epi16(-1);
29318 let r = _mm512_mask_cvtusepi32_epi16(src, 0, a);
29319 assert_eq_m256i(r, src);
29320 let r = _mm512_mask_cvtusepi32_epi16(src, 0b00000000_11111111, a);
29321 let e = _mm256_set_epi16(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, -1, -1);
29322 assert_eq_m256i(r, e);
29323 }
29324
29325 #[simd_test(enable = "avx512f")]
29326 unsafe fn test_mm512_maskz_cvtusepi32_epi16() {
29327 let a = _mm512_set_epi32(
29328 0,
29329 1,
29330 2,
29331 3,
29332 4,
29333 5,
29334 6,
29335 7,
29336 8,
29337 9,
29338 10,
29339 11,
29340 12,
29341 13,
29342 i32::MIN,
29343 i32::MIN,
29344 );
29345 let r = _mm512_maskz_cvtusepi32_epi16(0, a);
29346 assert_eq_m256i(r, _mm256_setzero_si256());
29347 let r = _mm512_maskz_cvtusepi32_epi16(0b00000000_11111111, a);
29348 let e = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, -1, -1);
29349 assert_eq_m256i(r, e);
29350 }
29351
29352 #[simd_test(enable = "avx512f")]
29353 unsafe fn test_mm512_cvtusepi32_epi8() {
29354 let a = _mm512_set_epi32(
29355 0,
29356 1,
29357 2,
29358 3,
29359 4,
29360 5,
29361 6,
29362 7,
29363 8,
29364 9,
29365 10,
29366 11,
29367 12,
29368 13,
29369 i32::MIN,
29370 i32::MIN,
29371 );
29372 let r = _mm512_cvtusepi32_epi8(a);
29373 let e = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1);
29374 assert_eq_m128i(r, e);
29375 }
29376
29377 #[simd_test(enable = "avx512f")]
29378 unsafe fn test_mm512_mask_cvtusepi32_epi8() {
29379 let a = _mm512_set_epi32(
29380 0,
29381 1,
29382 2,
29383 3,
29384 4,
29385 5,
29386 6,
29387 7,
29388 8,
29389 9,
29390 10,
29391 11,
29392 12,
29393 13,
29394 i32::MIN,
29395 i32::MIN,
29396 );
29397 let src = _mm_set1_epi8(-1);
29398 let r = _mm512_mask_cvtusepi32_epi8(src, 0, a);
29399 assert_eq_m128i(r, src);
29400 let r = _mm512_mask_cvtusepi32_epi8(src, 0b00000000_11111111, a);
29401 let e = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, -1, -1);
29402 assert_eq_m128i(r, e);
29403 }
29404
29405 #[simd_test(enable = "avx512f")]
29406 unsafe fn test_mm512_maskz_cvtusepi32_epi8() {
29407 let a = _mm512_set_epi32(
29408 0,
29409 1,
29410 2,
29411 3,
29412 4,
29413 5,
29414 6,
29415 7,
29416 8,
29417 9,
29418 10,
29419 11,
29420 12,
29421 13,
29422 i32::MIN,
29423 i32::MIN,
29424 );
29425 let r = _mm512_maskz_cvtusepi32_epi8(0, a);
29426 assert_eq_m128i(r, _mm_setzero_si128());
29427 let r = _mm512_maskz_cvtusepi32_epi8(0b00000000_11111111, a);
29428 let e = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, -1, -1);
29429 assert_eq_m128i(r, e);
29430 }
29431
29432 #[simd_test(enable = "avx512f")]
29433 unsafe fn test_mm512_cvt_roundps_epi32() {
29434 let a = _mm512_setr_ps(
29435 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
29436 );
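// The same input is converted twice to contrast the rounding operands: nearest-even turns
// 9.5 into 10, while round-toward-negative-infinity turns it into 9 (and -1.5 into -2 in both).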
29437 let r = _mm512_cvt_roundps_epi32(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
29438 let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16);
29439 assert_eq_m512i(r, e);
29440 let r = _mm512_cvt_roundps_epi32(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
29441 let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 9, 10, 11, 12, 13, 14, 15);
29442 assert_eq_m512i(r, e);
29443 }
29444
29445 #[simd_test(enable = "avx512f")]
29446 unsafe fn test_mm512_mask_cvt_roundps_epi32() {
29447 let a = _mm512_setr_ps(
29448 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
29449 );
29450 let src = _mm512_set1_epi32(0);
29451 let r =
29452 _mm512_mask_cvt_roundps_epi32(src, 0, a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
29453 assert_eq_m512i(r, src);
29454 let r = _mm512_mask_cvt_roundps_epi32(
29455 src,
29456 0b00000000_11111111,
29457 a,
29458 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
29459 );
29460 let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 0, 0, 0, 0, 0, 0, 0, 0);
29461 assert_eq_m512i(r, e);
29462 }
29463
29464 #[simd_test(enable = "avx512f")]
29465 unsafe fn test_mm512_maskz_cvt_roundps_epi32() {
29466 let a = _mm512_setr_ps(
29467 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
29468 );
29469 let r = _mm512_maskz_cvt_roundps_epi32(0, a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
29470 assert_eq_m512i(r, _mm512_setzero_si512());
29471 let r = _mm512_maskz_cvt_roundps_epi32(
29472 0b00000000_11111111,
29473 a,
29474 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
29475 );
29476 let e = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 0, 0, 0, 0, 0, 0, 0, 0);
29477 assert_eq_m512i(r, e);
29478 }
29479
29480 #[simd_test(enable = "avx512f")]
29481 unsafe fn test_mm512_cvt_roundps_epu32() {
29482 let a = _mm512_setr_ps(
29483 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
29484 );
29485 let r = _mm512_cvt_roundps_epu32(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
29486 let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 10, 10, 12, 12, 14, 14, 16);
29487 assert_eq_m512i(r, e);
29488 let r = _mm512_cvt_roundps_epu32(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
29489 let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 9, 10, 11, 12, 13, 14, 15);
29490 assert_eq_m512i(r, e);
29491 }
29492
29493 #[simd_test(enable = "avx512f")]
29494 unsafe fn test_mm512_mask_cvt_roundps_epu32() {
29495 let a = _mm512_setr_ps(
29496 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
29497 );
29498 let src = _mm512_set1_epi32(0);
29499 let r =
29500 _mm512_mask_cvt_roundps_epu32(src, 0, a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
29501 assert_eq_m512i(r, src);
29502 let r = _mm512_mask_cvt_roundps_epu32(
29503 src,
29504 0b00000000_11111111,
29505 a,
29506 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
29507 );
29508 let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0);
29509 assert_eq_m512i(r, e);
29510 }
29511
29512 #[simd_test(enable = "avx512f")]
29513 unsafe fn test_mm512_maskz_cvt_roundps_epu32() {
29514 let a = _mm512_setr_ps(
29515 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
29516 );
29517 let r = _mm512_maskz_cvt_roundps_epu32(0, a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
29518 assert_eq_m512i(r, _mm512_setzero_si512());
29519 let r = _mm512_maskz_cvt_roundps_epu32(
29520 0b00000000_11111111,
29521 a,
29522 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
29523 );
29524 let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0);
29525 assert_eq_m512i(r, e);
29526 }
29527
29528 #[simd_test(enable = "avx512f")]
29529 unsafe fn test_mm512_cvt_roundepi32_ps() {
29530 let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16);
29531 let r = _mm512_cvt_roundepi32_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
29532 let e = _mm512_setr_ps(
29533 0., -2., 2., -4., 4., -6., 6., -8., 8., 10., 10., 12., 12., 14., 14., 16.,
29534 );
29535 assert_eq_m512(r, e);
29536 }
29537
29538 #[simd_test(enable = "avx512f")]
29539 unsafe fn test_mm512_mask_cvt_roundepi32_ps() {
29540 let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16);
29541 let src = _mm512_set1_ps(0.);
29542 let r =
29543 _mm512_mask_cvt_roundepi32_ps(src, 0, a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
29544 assert_eq_m512(r, src);
29545 let r = _mm512_mask_cvt_roundepi32_ps(
29546 src,
29547 0b00000000_11111111,
29548 a,
29549 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
29550 );
29551 let e = _mm512_setr_ps(
29552 0., -2., 2., -4., 4., -6., 6., -8., 0., 0., 0., 0., 0., 0., 0., 0.,
29553 );
29554 assert_eq_m512(r, e);
29555 }
29556
29557 #[simd_test(enable = "avx512f")]
29558 unsafe fn test_mm512_maskz_cvt_roundepi32_ps() {
29559 let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16);
29560 let r = _mm512_maskz_cvt_roundepi32_ps(0, a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
29561 assert_eq_m512(r, _mm512_setzero_ps());
29562 let r = _mm512_maskz_cvt_roundepi32_ps(
29563 0b00000000_11111111,
29564 a,
29565 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
29566 );
29567 let e = _mm512_setr_ps(
29568 0., -2., 2., -4., 4., -6., 6., -8., 0., 0., 0., 0., 0., 0., 0., 0.,
29569 );
29570 assert_eq_m512(r, e);
29571 }
29572
29573 #[simd_test(enable = "avx512f")]
29574 unsafe fn test_mm512_cvt_roundepu32_ps() {
29575 let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16);
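// -2 reinterpreted as u32 is 4_294_967_294; the nearest f32 is 2^32, and the literal
// 4294967300. in the expectation parses to that same value.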
29576 let r = _mm512_cvt_roundepu32_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
29577 let e = _mm512_setr_ps(
29578 0.,
29579 4294967300.,
29580 2.,
29581 4294967300.,
29582 4.,
29583 4294967300.,
29584 6.,
29585 4294967300.,
29586 8.,
29587 10.,
29588 10.,
29589 12.,
29590 12.,
29591 14.,
29592 14.,
29593 16.,
29594 );
29595 assert_eq_m512(r, e);
29596 }
29597
29598 #[simd_test(enable = "avx512f")]
29599 unsafe fn test_mm512_mask_cvt_roundepu32_ps() {
29600 let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16);
29601 let src = _mm512_set1_ps(0.);
29602 let r =
29603 _mm512_mask_cvt_roundepu32_ps(src, 0, a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
29604 assert_eq_m512(r, src);
29605 let r = _mm512_mask_cvt_roundepu32_ps(
29606 src,
29607 0b00000000_11111111,
29608 a,
29609 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
29610 );
29611 let e = _mm512_setr_ps(
29612 0.,
29613 4294967300.,
29614 2.,
29615 4294967300.,
29616 4.,
29617 4294967300.,
29618 6.,
29619 4294967300.,
29620 0.,
29621 0.,
29622 0.,
29623 0.,
29624 0.,
29625 0.,
29626 0.,
29627 0.,
29628 );
29629 assert_eq_m512(r, e);
29630 }
29631
29632 #[simd_test(enable = "avx512f")]
29633 unsafe fn test_mm512_maskz_cvt_roundepu32_ps() {
29634 let a = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16);
29635 let r = _mm512_maskz_cvt_roundepu32_ps(0, a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
29636 assert_eq_m512(r, _mm512_setzero_ps());
29637 let r = _mm512_maskz_cvt_roundepu32_ps(
29638 0b00000000_11111111,
29639 a,
29640 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
29641 );
29642 let e = _mm512_setr_ps(
29643 0.,
29644 4294967300.,
29645 2.,
29646 4294967300.,
29647 4.,
29648 4294967300.,
29649 6.,
29650 4294967300.,
29651 0.,
29652 0.,
29653 0.,
29654 0.,
29655 0.,
29656 0.,
29657 0.,
29658 0.,
29659 );
29660 assert_eq_m512(r, e);
29661 }
29662
29663 #[simd_test(enable = "avx512f")]
29664 unsafe fn test_mm512_cvt_roundps_ph() {
29665 let a = _mm512_set1_ps(1.);
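// 4323521613979991040 is 0x3C00_3C00_3C00_3C00: four packed IEEE 754 half-precision
// encodings of 1.0 per 64-bit lane.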
29666 let r = _mm512_cvt_roundps_ph(a, _MM_FROUND_NO_EXC);
29667 let e = _mm256_setr_epi64x(
29668 4323521613979991040,
29669 4323521613979991040,
29670 4323521613979991040,
29671 4323521613979991040,
29672 );
29673 assert_eq_m256i(r, e);
29674 }
29675
29676 #[simd_test(enable = "avx512f")]
29677 unsafe fn test_mm512_mask_cvt_roundps_ph() {
29678 let a = _mm512_set1_ps(1.);
29679 let src = _mm256_set1_epi16(0);
29680 let r = _mm512_mask_cvt_roundps_ph(src, 0, a, _MM_FROUND_NO_EXC);
29681 assert_eq_m256i(r, src);
29682 let r = _mm512_mask_cvt_roundps_ph(src, 0b00000000_11111111, a, _MM_FROUND_NO_EXC);
29683 let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0);
29684 assert_eq_m256i(r, e);
29685 }
29686
29687 #[simd_test(enable = "avx512f")]
29688 unsafe fn test_mm512_maskz_cvt_roundps_ph() {
29689 let a = _mm512_set1_ps(1.);
29690 let r = _mm512_maskz_cvt_roundps_ph(0, a, _MM_FROUND_NO_EXC);
29691 assert_eq_m256i(r, _mm256_setzero_si256());
29692 let r = _mm512_maskz_cvt_roundps_ph(0b00000000_11111111, a, _MM_FROUND_NO_EXC);
29693 let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0);
29694 assert_eq_m256i(r, e);
29695 }
29696
29697 #[simd_test(enable = "avx512f")]
29698 unsafe fn test_mm512_cvtps_ph() {
29699 let a = _mm512_set1_ps(1.);
29700 let r = _mm512_cvtps_ph(a, _MM_FROUND_NO_EXC);
29701 let e = _mm256_setr_epi64x(
29702 4323521613979991040,
29703 4323521613979991040,
29704 4323521613979991040,
29705 4323521613979991040,
29706 );
29707 assert_eq_m256i(r, e);
29708 }
29709
29710 #[simd_test(enable = "avx512f")]
29711 unsafe fn test_mm512_mask_cvtps_ph() {
29712 let a = _mm512_set1_ps(1.);
29713 let src = _mm256_set1_epi16(0);
29714 let r = _mm512_mask_cvtps_ph(src, 0, a, _MM_FROUND_NO_EXC);
29715 assert_eq_m256i(r, src);
29716 let r = _mm512_mask_cvtps_ph(src, 0b00000000_11111111, a, _MM_FROUND_NO_EXC);
29717 let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0);
29718 assert_eq_m256i(r, e);
29719 }
29720
29721 #[simd_test(enable = "avx512f")]
29722 unsafe fn test_mm512_maskz_cvtps_ph() {
29723 let a = _mm512_set1_ps(1.);
29724 let r = _mm512_maskz_cvtps_ph(0, a, _MM_FROUND_NO_EXC);
29725 assert_eq_m256i(r, _mm256_setzero_si256());
29726 let r = _mm512_maskz_cvtps_ph(0b00000000_11111111, a, _MM_FROUND_NO_EXC);
29727 let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0);
29728 assert_eq_m256i(r, e);
29729 }
29730
29731 #[simd_test(enable = "avx512f")]
29732 unsafe fn test_mm512_cvt_roundph_ps() {
29733 let a = _mm256_setr_epi64x(
29734 4323521613979991040,
29735 4323521613979991040,
29736 4323521613979991040,
29737 4323521613979991040,
29738 );
29739 let r = _mm512_cvt_roundph_ps(a, _MM_FROUND_NO_EXC);
29740 let e = _mm512_set1_ps(1.);
29741 assert_eq_m512(r, e);
29742 }
29743
29744 #[simd_test(enable = "avx512f")]
29745 unsafe fn test_mm512_mask_cvt_roundph_ps() {
29746 let a = _mm256_setr_epi64x(
29747 4323521613979991040,
29748 4323521613979991040,
29749 4323521613979991040,
29750 4323521613979991040,
29751 );
29752 let src = _mm512_set1_ps(0.);
29753 let r = _mm512_mask_cvt_roundph_ps(src, 0, a, _MM_FROUND_NO_EXC);
29754 assert_eq_m512(r, src);
29755 let r = _mm512_mask_cvt_roundph_ps(src, 0b00000000_11111111, a, _MM_FROUND_NO_EXC);
29756 let e = _mm512_setr_ps(
29757 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
29758 );
29759 assert_eq_m512(r, e);
29760 }
29761
29762 #[simd_test(enable = "avx512f")]
29763 unsafe fn test_mm512_maskz_cvt_roundph_ps() {
29764 let a = _mm256_setr_epi64x(
29765 4323521613979991040,
29766 4323521613979991040,
29767 4323521613979991040,
29768 4323521613979991040,
29769 );
29770 let r = _mm512_maskz_cvt_roundph_ps(0, a, _MM_FROUND_NO_EXC);
29771 assert_eq_m512(r, _mm512_setzero_ps());
29772 let r = _mm512_maskz_cvt_roundph_ps(0b00000000_11111111, a, _MM_FROUND_NO_EXC);
29773 let e = _mm512_setr_ps(
29774 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
29775 );
29776 assert_eq_m512(r, e);
29777 }
29778
29779 #[simd_test(enable = "avx512f")]
29780 unsafe fn test_mm512_cvtph_ps() {
29781 let a = _mm256_setr_epi64x(
29782 4323521613979991040,
29783 4323521613979991040,
29784 4323521613979991040,
29785 4323521613979991040,
29786 );
29787 let r = _mm512_cvtph_ps(a);
29788 let e = _mm512_set1_ps(1.);
29789 assert_eq_m512(r, e);
29790 }
29791
29792 #[simd_test(enable = "avx512f")]
29793 unsafe fn test_mm512_mask_cvtph_ps() {
29794 let a = _mm256_setr_epi64x(
29795 4323521613979991040,
29796 4323521613979991040,
29797 4323521613979991040,
29798 4323521613979991040,
29799 );
29800 let src = _mm512_set1_ps(0.);
29801 let r = _mm512_mask_cvtph_ps(src, 0, a);
29802 assert_eq_m512(r, src);
29803 let r = _mm512_mask_cvtph_ps(src, 0b00000000_11111111, a);
29804 let e = _mm512_setr_ps(
29805 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
29806 );
29807 assert_eq_m512(r, e);
29808 }
29809
29810 #[simd_test(enable = "avx512f")]
29811 unsafe fn test_mm512_maskz_cvtph_ps() {
29812 let a = _mm256_setr_epi64x(
29813 4323521613979991040,
29814 4323521613979991040,
29815 4323521613979991040,
29816 4323521613979991040,
29817 );
29818 let r = _mm512_maskz_cvtph_ps(0, a);
29819 assert_eq_m512(r, _mm512_setzero_ps());
29820 let r = _mm512_maskz_cvtph_ps(0b00000000_11111111, a);
29821 let e = _mm512_setr_ps(
29822 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
29823 );
29824 assert_eq_m512(r, e);
29825 }
29826
29827 #[simd_test(enable = "avx512f")]
29828 unsafe fn test_mm512_cvtt_roundps_epi32() {
29829 let a = _mm512_setr_ps(
29830 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
29831 );
29832 let r = _mm512_cvtt_roundps_epi32(a, _MM_FROUND_NO_EXC);
29833 let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 8, 9, 10, 11, 12, 13, 14, 15);
29834 assert_eq_m512i(r, e);
29835 }
29836
29837 #[simd_test(enable = "avx512f")]
29838 unsafe fn test_mm512_mask_cvtt_roundps_epi32() {
29839 let a = _mm512_setr_ps(
29840 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
29841 );
29842 let src = _mm512_set1_epi32(0);
29843 let r = _mm512_mask_cvtt_roundps_epi32(src, 0, a, _MM_FROUND_NO_EXC);
29844 assert_eq_m512i(r, src);
29845 let r = _mm512_mask_cvtt_roundps_epi32(src, 0b00000000_11111111, a, _MM_FROUND_NO_EXC);
29846 let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 0, 0, 0, 0);
29847 assert_eq_m512i(r, e);
29848 }
29849
29850 #[simd_test(enable = "avx512f")]
29851 unsafe fn test_mm512_maskz_cvtt_roundps_epi32() {
29852 let a = _mm512_setr_ps(
29853 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
29854 );
29855 let r = _mm512_maskz_cvtt_roundps_epi32(0, a, _MM_FROUND_NO_EXC);
29856 assert_eq_m512i(r, _mm512_setzero_si512());
29857 let r = _mm512_maskz_cvtt_roundps_epi32(0b00000000_11111111, a, _MM_FROUND_NO_EXC);
29858 let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 0, 0, 0, 0);
29859 assert_eq_m512i(r, e);
29860 }
29861
29862 #[simd_test(enable = "avx512f")]
29863 unsafe fn test_mm512_cvtt_roundps_epu32() {
29864 let a = _mm512_setr_ps(
29865 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
29866 );
29867 let r = _mm512_cvtt_roundps_epu32(a, _MM_FROUND_NO_EXC);
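// Negative inputs are out of range for u32, so truncation yields 0xFFFF_FFFF, which reads as -1 through the i32 view used by _mm512_setr_epi32.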
29868 let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 9, 10, 11, 12, 13, 14, 15);
29869 assert_eq_m512i(r, e);
29870 }
29871
29872 #[simd_test(enable = "avx512f")]
29873 unsafe fn test_mm512_mask_cvtt_roundps_epu32() {
29874 let a = _mm512_setr_ps(
29875 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
29876 );
29877 let src = _mm512_set1_epi32(0);
29878 let r = _mm512_mask_cvtt_roundps_epu32(src, 0, a, _MM_FROUND_NO_EXC);
29879 assert_eq_m512i(r, src);
29880 let r = _mm512_mask_cvtt_roundps_epu32(src, 0b00000000_11111111, a, _MM_FROUND_NO_EXC);
29881 let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0);
29882 assert_eq_m512i(r, e);
29883 }
29884
29885 #[simd_test(enable = "avx512f")]
29886 unsafe fn test_mm512_maskz_cvtt_roundps_epu32() {
29887 let a = _mm512_setr_ps(
29888 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
29889 );
29890 let r = _mm512_maskz_cvtt_roundps_epu32(0, a, _MM_FROUND_NO_EXC);
29891 assert_eq_m512i(r, _mm512_setzero_si512());
29892 let r = _mm512_maskz_cvtt_roundps_epu32(0b00000000_11111111, a, _MM_FROUND_NO_EXC);
29893 let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0);
29894 assert_eq_m512i(r, e);
29895 }
29896
29897 #[simd_test(enable = "avx512f")]
29898 unsafe fn test_mm512_cvttps_epi32() {
29899 let a = _mm512_setr_ps(
29900 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
29901 );
29902 let r = _mm512_cvttps_epi32(a);
29903 let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 8, 9, 10, 11, 12, 13, 14, 15);
29904 assert_eq_m512i(r, e);
29905 }
29906
29907 #[simd_test(enable = "avx512f")]
29908 unsafe fn test_mm512_mask_cvttps_epi32() {
29909 let a = _mm512_setr_ps(
29910 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
29911 );
29912 let src = _mm512_set1_epi32(0);
29913 let r = _mm512_mask_cvttps_epi32(src, 0, a);
29914 assert_eq_m512i(r, src);
29915 let r = _mm512_mask_cvttps_epi32(src, 0b00000000_11111111, a);
29916 let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 0, 0, 0, 0);
29917 assert_eq_m512i(r, e);
29918 }
29919
29920 #[simd_test(enable = "avx512f")]
29921 unsafe fn test_mm512_maskz_cvttps_epi32() {
29922 let a = _mm512_setr_ps(
29923 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
29924 );
29925 let r = _mm512_maskz_cvttps_epi32(0, a);
29926 assert_eq_m512i(r, _mm512_setzero_si512());
29927 let r = _mm512_maskz_cvttps_epi32(0b00000000_11111111, a);
29928 let e = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 0, 0, 0, 0);
29929 assert_eq_m512i(r, e);
29930 }
29931
29932 #[simd_test(enable = "avx512f")]
29933 unsafe fn test_mm512_cvttps_epu32() {
29934 let a = _mm512_setr_ps(
29935 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
29936 );
29937 let r = _mm512_cvttps_epu32(a);
29938 let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 9, 10, 11, 12, 13, 14, 15);
29939 assert_eq_m512i(r, e);
29940 }
29941
29942 #[simd_test(enable = "avx512f")]
29943 unsafe fn test_mm512_mask_cvttps_epu32() {
29944 let a = _mm512_setr_ps(
29945 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
29946 );
29947 let src = _mm512_set1_epi32(0);
29948 let r = _mm512_mask_cvttps_epu32(src, 0, a);
29949 assert_eq_m512i(r, src);
29950 let r = _mm512_mask_cvttps_epu32(src, 0b00000000_11111111, a);
29951 let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0);
29952 assert_eq_m512i(r, e);
29953 }
29954
29955 #[simd_test(enable = "avx512f")]
29956 unsafe fn test_mm512_maskz_cvttps_epu32() {
29957 let a = _mm512_setr_ps(
29958 0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
29959 );
29960 let r = _mm512_maskz_cvttps_epu32(0, a);
29961 assert_eq_m512i(r, _mm512_setzero_si512());
29962 let r = _mm512_maskz_cvttps_epu32(0b00000000_11111111, a);
29963 let e = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0);
29964 assert_eq_m512i(r, e);
29965 }
29966
29967 #[simd_test(enable = "avx512f")]
29968 unsafe fn test_mm512_i32gather_ps() {
29969 let mut arr = [0f32; 256];
29970 for i in 0..256 {
29971 arr[i] = i as f32;
29972 }
29973 // A scale of 4 addresses 32-bit (dword) elements
29974 #[rustfmt::skip]
29975 let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112,
29976 120, 128, 136, 144, 152, 160, 168, 176);
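// arr[i] == i as f32, so each gathered lane equals its index value.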
29977 let r = _mm512_i32gather_ps(index, arr.as_ptr() as *const u8, 4);
29978 #[rustfmt::skip]
29979 assert_eq_m512(r, _mm512_setr_ps(0., 16., 32., 48., 64., 80., 96., 112.,
29980 120., 128., 136., 144., 152., 160., 168., 176.));
29981 }
29982
29983 #[simd_test(enable = "avx512f")]
29984 unsafe fn test_mm512_mask_i32gather_ps() {
29985 let mut arr = [0f32; 256];
29986 for i in 0..256 {
29987 arr[i] = i as f32;
29988 }
29989 let src = _mm512_set1_ps(2.);
29990 let mask = 0b10101010_10101010;
29991 #[rustfmt::skip]
29992 let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112,
29993 120, 128, 136, 144, 152, 160, 168, 176);
29994 // A scale of 4 addresses 32-bit (dword) elements
29995 let r = _mm512_mask_i32gather_ps(src, mask, index, arr.as_ptr() as *const u8, 4);
29996 #[rustfmt::skip]
29997 assert_eq_m512(r, _mm512_setr_ps(2., 16., 2., 48., 2., 80., 2., 112.,
29998 2., 128., 2., 144., 2., 160., 2., 176.));
29999 }
30000
30001 #[simd_test(enable = "avx512f")]
30002 unsafe fn test_mm512_i32gather_epi32() {
30003 let mut arr = [0i32; 256];
30004 for i in 0..256 {
30005 arr[i] = i as i32;
30006 }
30007 // A scale of 4 addresses 32-bit (dword) elements
30008 #[rustfmt::skip]
30009 let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112,
30010 120, 128, 136, 144, 152, 160, 168, 176);
30011 let r = _mm512_i32gather_epi32(index, arr.as_ptr() as *const u8, 4);
30012 #[rustfmt::skip]
30013 assert_eq_m512i(r, _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112,
30014 120, 128, 136, 144, 152, 160, 168, 176));
30015 }
30016
30017 #[simd_test(enable = "avx512f")]
30018 unsafe fn test_mm512_mask_i32gather_epi32() {
30019 let mut arr = [0i32; 256];
30020 for i in 0..256 {
30021 arr[i] = i as i32;
30022 }
30023 let src = _mm512_set1_epi32(2);
30024 let mask = 0b10101010_10101010;
30025 #[rustfmt::skip]
30026 let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112,
30027 128, 144, 160, 176, 192, 208, 224, 240);
30028 // A scale of 4 addresses 32-bit (dword) elements
30029 let r = _mm512_mask_i32gather_epi32(src, mask, index, arr.as_ptr() as *const u8, 4);
30030 #[rustfmt::skip]
30031 assert_eq_m512i(r, _mm512_setr_epi32(2, 16, 2, 48, 2, 80, 2, 112,
30032 2, 144, 2, 176, 2, 208, 2, 240));
30033 }
30034
30035 #[simd_test(enable = "avx512f")]
30036 unsafe fn test_mm512_i32scatter_ps() {
30037 let mut arr = [0f32; 256];
30038 #[rustfmt::skip]
30039 let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112,
30040 128, 144, 160, 176, 192, 208, 224, 240);
30041 let src = _mm512_setr_ps(
30042 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
30043 );
30044 // A scale of 4 addresses 32-bit (dword) elements
30045 _mm512_i32scatter_ps(arr.as_mut_ptr() as *mut u8, index, src, 4);
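// src lane i holds (i + 1) and is stored at byte offset index[i] * 4, i.e. arr[i * 16].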
30046 let mut expected = [0f32; 256];
30047 for i in 0..16 {
30048 expected[i * 16] = (i + 1) as f32;
30049 }
30050 assert_eq!(&arr[..], &expected[..]);
30051 }
30052
30053 #[simd_test(enable = "avx512f")]
30054 unsafe fn test_mm512_mask_i32scatter_ps() {
30055 let mut arr = [0f32; 256];
30056 let mask = 0b10101010_10101010;
30057 #[rustfmt::skip]
30058 let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112,
30059 128, 144, 160, 176, 192, 208, 224, 240);
30060 let src = _mm512_setr_ps(
30061 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
30062 );
30063 // A scale of 4 addresses 32-bit (dword) elements
30064 _mm512_mask_i32scatter_ps(arr.as_mut_ptr() as *mut u8, mask, index, src, 4);
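// Only the odd lanes are written (mask 0b10101010_10101010): values 2, 4, ..., 16 land at arr[16], arr[48], ..., arr[240].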
30065 let mut expected = [0f32; 256];
30066 for i in 0..8 {
30067 expected[i * 32 + 16] = 2. * (i + 1) as f32;
30068 }
30069 assert_eq!(&arr[..], &expected[..]);
30070 }
30071
30072 #[simd_test(enable = "avx512f")]
30073 unsafe fn test_mm512_i32scatter_epi32() {
30074 let mut arr = [0i32; 256];
30075
30076 #[rustfmt::skip]
30077 let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112,
30078 128, 144, 160, 176, 192, 208, 224, 240);
30079 let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
30080 // A scale of 4 addresses 32-bit (dword) elements
30081 _mm512_i32scatter_epi32(arr.as_mut_ptr() as *mut u8, index, src, 4);
30082 let mut expected = [0i32; 256];
30083 for i in 0..16 {
30084 expected[i * 16] = (i + 1) as i32;
30085 }
30086 assert_eq!(&arr[..], &expected[..]);
30087 }
30088
30089 #[simd_test(enable = "avx512f")]
30090 unsafe fn test_mm512_mask_i32scatter_epi32() {
30091 let mut arr = [0i32; 256];
30092 let mask = 0b10101010_10101010;
30093 #[rustfmt::skip]
30094 let index = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112,
30095 128, 144, 160, 176, 192, 208, 224, 240);
30096 let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
30097 // A scale of 4 addresses 32-bit (dword) elements
30098 _mm512_mask_i32scatter_epi32(arr.as_mut_ptr() as *mut u8, mask, index, src, 4);
30099 let mut expected = [0i32; 256];
30100 for i in 0..8 {
30101 expected[i * 32 + 16] = 2 * (i + 1) as i32;
30102 }
30103 assert_eq!(&arr[..], &expected[..]);
30104 }
30105
30106 #[simd_test(enable = "avx512f")]
30107 unsafe fn test_mm512_cmplt_ps_mask() {
30108 #[rustfmt::skip]
30109 let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.,
30110 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.);
30111 let b = _mm512_set1_ps(-1.);
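// NaN lanes compare false under the ordered less-than predicate, so only the f32::MIN and -100. lanes are below -1.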
30112 let m = _mm512_cmplt_ps_mask(a, b);
30113 assert_eq!(m, 0b00000101_00000101);
30114 }
30115
30116 #[simd_test(enable = "avx512f")]
30117 unsafe fn test_mm512_mask_cmplt_ps_mask() {
30118 #[rustfmt::skip]
30119 let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.,
30120 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.);
30121 let b = _mm512_set1_ps(-1.);
30122 let mask = 0b01100110_01100110;
30123 let r = _mm512_mask_cmplt_ps_mask(mask, a, b);
30124 assert_eq!(r, 0b00000100_00000100);
30125 }
30126
30127 #[simd_test(enable = "avx512f")]
30128 unsafe fn test_mm512_cmpnlt_ps_mask() {
30129 #[rustfmt::skip]
30130 let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.,
30131 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.);
30132 let b = _mm512_set1_ps(-1.);
30133 assert_eq!(_mm512_cmpnlt_ps_mask(a, b), !_mm512_cmplt_ps_mask(a, b));
30134 }
30135
30136 #[simd_test(enable = "avx512f")]
30137 unsafe fn test_mm512_mask_cmpnlt_ps_mask() {
30138 #[rustfmt::skip]
30139 let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.,
30140 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.);
30141 let b = _mm512_set1_ps(-1.);
30142 let mask = 0b01111010_01111010;
30143 assert_eq!(_mm512_mask_cmpnlt_ps_mask(mask, a, b), 0b01111010_01111010);
30144 }
30145
30146 #[simd_test(enable = "avx512f")]
30147 unsafe fn test_mm512_cmpnle_ps_mask() {
30148 #[rustfmt::skip]
30149 let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.,
30150 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.);
30151 let b = _mm512_set1_ps(-1.);
30152 let m = _mm512_cmpnle_ps_mask(b, a);
30153 assert_eq!(m, 0b00001101_00001101);
30154 }
30155
30156 #[simd_test(enable = "avx512f")]
30157 unsafe fn test_mm512_mask_cmpnle_ps_mask() {
30158 #[rustfmt::skip]
30159 let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.,
30160 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.);
30161 let b = _mm512_set1_ps(-1.);
30162 let mask = 0b01100110_01100110;
30163 let r = _mm512_mask_cmpnle_ps_mask(mask, b, a);
30164 assert_eq!(r, 0b00000100_00000100);
30165 }
30166
30167 #[simd_test(enable = "avx512f")]
30168 unsafe fn test_mm512_cmple_ps_mask() {
30169 #[rustfmt::skip]
30170 let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.,
30171 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.);
30172 let b = _mm512_set1_ps(-1.);
30173 assert_eq!(_mm512_cmple_ps_mask(a, b), 0b00100101_00100101);
30174 }
30175
30176 #[simd_test(enable = "avx512f")]
30177 unsafe fn test_mm512_mask_cmple_ps_mask() {
30178 #[rustfmt::skip]
30179 let a = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.,
30180 0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.);
30181 let b = _mm512_set1_ps(-1.);
30182 let mask = 0b01111010_01111010;
30183 assert_eq!(_mm512_mask_cmple_ps_mask(mask, a, b), 0b00100000_00100000);
30184 }
30185
30186 #[simd_test(enable = "avx512f")]
30187 unsafe fn test_mm512_cmpeq_ps_mask() {
30188 #[rustfmt::skip]
30189 let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.,
30190 0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.);
30191 #[rustfmt::skip]
30192 let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.,
30193 0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.);
30194 let m = _mm512_cmpeq_ps_mask(b, a);
30195 assert_eq!(m, 0b11001101_11001101);
30196 }
30197
30198 #[simd_test(enable = "avx512f")]
30199 unsafe fn test_mm512_mask_cmpeq_ps_mask() {
30200 #[rustfmt::skip]
30201 let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.,
30202 0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.);
30203 #[rustfmt::skip]
30204 let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.,
30205 0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.);
30206 let mask = 0b01111010_01111010;
30207 let r = _mm512_mask_cmpeq_ps_mask(mask, b, a);
30208 assert_eq!(r, 0b01001000_01001000);
30209 }
30210
30211 #[simd_test(enable = "avx512f")]
30212 unsafe fn test_mm512_cmpneq_ps_mask() {
30213 #[rustfmt::skip]
30214 let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.,
30215 0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.);
30216 #[rustfmt::skip]
30217 let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.,
30218 0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.);
30219 let m = _mm512_cmpneq_ps_mask(b, a);
30220 assert_eq!(m, 0b00110010_00110010);
30221 }
30222
30223 #[simd_test(enable = "avx512f")]
30224 unsafe fn test_mm512_mask_cmpneq_ps_mask() {
30225 #[rustfmt::skip]
30226 let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.,
30227 0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.);
30228 #[rustfmt::skip]
30229 let b = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.,
30230 0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.);
30231 let mask = 0b01111010_01111010;
30232 let r = _mm512_mask_cmpneq_ps_mask(mask, b, a);
30233 assert_eq!(r, 0b00110010_00110010)
30234 }
30235
30236 #[simd_test(enable = "avx512f")]
30237 unsafe fn test_mm512_cmp_ps_mask() {
30238 #[rustfmt::skip]
30239 let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.,
30240 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.);
30241 let b = _mm512_set1_ps(-1.);
30242 let m = _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ);
30243 assert_eq!(m, 0b00000101_00000101);
30244 }
30245
30246 #[simd_test(enable = "avx512f")]
30247 unsafe fn test_mm512_mask_cmp_ps_mask() {
30248 #[rustfmt::skip]
30249 let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.,
30250 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.);
30251 let b = _mm512_set1_ps(-1.);
30252 let mask = 0b01100110_01100110;
30253 let r = _mm512_mask_cmp_ps_mask(mask, a, b, _CMP_LT_OQ);
30254 assert_eq!(r, 0b00000100_00000100);
30255 }
30256
30257 #[simd_test(enable = "avx512f")]
30258 unsafe fn test_mm512_cmp_round_ps_mask() {
30259 #[rustfmt::skip]
30260 let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.,
30261 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.);
30262 let b = _mm512_set1_ps(-1.);
30263 let m = _mm512_cmp_round_ps_mask(a, b, _CMP_LT_OQ, _MM_FROUND_CUR_DIRECTION);
30264 assert_eq!(m, 0b00000101_00000101);
30265 }
30266
30267 #[simd_test(enable = "avx512f")]
30268 unsafe fn test_mm512_mask_cmp_round_ps_mask() {
30269 #[rustfmt::skip]
30270 let a = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.,
30271 0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.);
30272 let b = _mm512_set1_ps(-1.);
30273 let mask = 0b01100110_01100110;
30274 let r = _mm512_mask_cmp_round_ps_mask(mask, a, b, _CMP_LT_OQ, _MM_FROUND_CUR_DIRECTION);
30275 assert_eq!(r, 0b00000100_00000100);
30276 }
30277
30278 #[simd_test(enable = "avx512f")]
30279 unsafe fn test_mm512_cmpord_ps_mask() {
30280 #[rustfmt::skip]
30281 let a = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0.,
30282 f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.);
30283 #[rustfmt::skip]
30284 let b = _mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 0.,
30285 f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.);
30286 let m = _mm512_cmpord_ps_mask(a, b);
30287 assert_eq!(m, 0b00000101_00000101);
30288 }
30289
30290 #[simd_test(enable = "avx512f")]
30291 unsafe fn test_mm512_mask_cmpord_ps_mask() {
30292 #[rustfmt::skip]
30293 let a = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0.,
30294 f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.);
30295 #[rustfmt::skip]
30296 let b = _mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 0.,
30297 f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.);
30298 let mask = 0b11000011_11000011;
30299 let m = _mm512_mask_cmpord_ps_mask(mask, a, b);
30300 assert_eq!(m, 0b00000001_00000001);
30301 }
30302
30303 #[simd_test(enable = "avx512f")]
30304 unsafe fn test_mm512_cmpunord_ps_mask() {
30305 #[rustfmt::skip]
30306 let a = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0.,
30307 f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.);
30308 #[rustfmt::skip]
30309 let b = _mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 0.,
30310 f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.);
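// An unordered comparison is true whenever either operand is NaN.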
30311 let m = _mm512_cmpunord_ps_mask(a, b);
30312
30313 assert_eq!(m, 0b11111010_11111010);
30314 }
30315
30316 #[simd_test(enable = "avx512f")]
30317 unsafe fn test_mm512_mask_cmpunord_ps_mask() {
30318 #[rustfmt::skip]
30319 let a = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0.,
30320 f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.);
30321 #[rustfmt::skip]
30322 let b = _mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 0.,
30323 f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.);
30324 let mask = 0b00001111_00001111;
30325 let m = _mm512_mask_cmpunord_ps_mask(mask, a, b);
30326 assert_eq!(m, 0b00001010_00001010);
30327 }
30328
30329 #[simd_test(enable = "avx512f")]
30330 unsafe fn test_mm_cmp_ss_mask() {
30331 let a = _mm_setr_ps(2., 1., 1., 1.);
30332 let b = _mm_setr_ps(1., 2., 2., 2.);
30333 let m = _mm_cmp_ss_mask(a, b, _CMP_GE_OS);
30334 assert_eq!(m, 1);
30335 }
30336
30337 #[simd_test(enable = "avx512f")]
30338 unsafe fn test_mm_mask_cmp_ss_mask() {
30339 let a = _mm_setr_ps(2., 1., 1., 1.);
30340 let b = _mm_setr_ps(1., 2., 2., 2.);
30341 let m = _mm_mask_cmp_ss_mask(0b10, a, b, _CMP_GE_OS);
30342 assert_eq!(m, 0);
30343 let m = _mm_mask_cmp_ss_mask(0b1, a, b, _CMP_GE_OS);
30344 assert_eq!(m, 1);
30345 }
30346
30347 #[simd_test(enable = "avx512f")]
30348 unsafe fn test_mm_cmp_round_ss_mask() {
30349 let a = _mm_setr_ps(2., 1., 1., 1.);
30350 let b = _mm_setr_ps(1., 2., 2., 2.);
30351 let m = _mm_cmp_round_ss_mask(a, b, _CMP_GE_OS, _MM_FROUND_CUR_DIRECTION);
30352 assert_eq!(m, 1);
30353 }
30354
30355 #[simd_test(enable = "avx512f")]
30356 unsafe fn test_mm_mask_cmp_round_ss_mask() {
30357 let a = _mm_setr_ps(2., 1., 1., 1.);
30358 let b = _mm_setr_ps(1., 2., 2., 2.);
30359 let m = _mm_mask_cmp_round_ss_mask(0b10, a, b, _CMP_GE_OS, _MM_FROUND_CUR_DIRECTION);
30360 assert_eq!(m, 0);
30361 let m = _mm_mask_cmp_round_ss_mask(0b1, a, b, _CMP_GE_OS, _MM_FROUND_CUR_DIRECTION);
30362 assert_eq!(m, 1);
30363 }
30364
30365 #[simd_test(enable = "avx512f")]
30366 unsafe fn test_mm_cmp_sd_mask() {
30367 let a = _mm_setr_pd(2., 1.);
30368 let b = _mm_setr_pd(1., 2.);
30369 let m = _mm_cmp_sd_mask(a, b, _CMP_GE_OS);
30370 assert_eq!(m, 1);
30371 }
30372
30373 #[simd_test(enable = "avx512f")]
30374 unsafe fn test_mm_mask_cmp_sd_mask() {
30375 let a = _mm_setr_pd(2., 1.);
30376 let b = _mm_setr_pd(1., 2.);
30377 let m = _mm_mask_cmp_sd_mask(0b10, a, b, _CMP_GE_OS);
30378 assert_eq!(m, 0);
30379 let m = _mm_mask_cmp_sd_mask(0b1, a, b, _CMP_GE_OS);
30380 assert_eq!(m, 1);
30381 }
30382
30383 #[simd_test(enable = "avx512f")]
30384 unsafe fn test_mm_cmp_round_sd_mask() {
30385 let a = _mm_setr_pd(2., 1.);
30386 let b = _mm_setr_pd(1., 2.);
30387 let m = _mm_cmp_round_sd_mask(a, b, _CMP_GE_OS, _MM_FROUND_CUR_DIRECTION);
30388 assert_eq!(m, 1);
30389 }
30390
30391 #[simd_test(enable = "avx512f")]
30392 unsafe fn test_mm_mask_cmp_round_sd_mask() {
30393 let a = _mm_setr_pd(2., 1.);
30394 let b = _mm_setr_pd(1., 2.);
30395 let m = _mm_mask_cmp_round_sd_mask(0b10, a, b, _CMP_GE_OS, _MM_FROUND_CUR_DIRECTION);
30396 assert_eq!(m, 0);
30397 let m = _mm_mask_cmp_round_sd_mask(0b1, a, b, _CMP_GE_OS, _MM_FROUND_CUR_DIRECTION);
30398 assert_eq!(m, 1);
30399 }
30400
30401 #[simd_test(enable = "avx512f")]
30402 unsafe fn test_mm512_cmplt_epu32_mask() {
30403 #[rustfmt::skip]
30404 let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
30405 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
30406 let b = _mm512_set1_epi32(-1);
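// Compared as unsigned, the -1 and u32::MAX lanes equal u32::MAX (= b), so only those lanes fail the less-than test.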
30407 let m = _mm512_cmplt_epu32_mask(a, b);
30408 assert_eq!(m, 0b11001111_11001111);
30409 }
30410
30411 #[simd_test(enable = "avx512f")]
30412 unsafe fn test_mm512_mask_cmplt_epu32_mask() {
30413 #[rustfmt::skip]
30414 let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
30415 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
30416 let b = _mm512_set1_epi32(-1);
30417 let mask = 0b01111010_01111010;
30418 let r = _mm512_mask_cmplt_epu32_mask(mask, a, b);
30419 assert_eq!(r, 0b01001010_01001010);
30420 }
30421
30422 #[simd_test(enable = "avx512f")]
30423 unsafe fn test_mm512_cmpgt_epu32_mask() {
30424 #[rustfmt::skip]
30425 let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
30426 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
30427 let b = _mm512_set1_epi32(-1);
30428 let m = _mm512_cmpgt_epu32_mask(b, a);
30429 assert_eq!(m, 0b11001111_11001111);
30430 }
30431
30432 #[simd_test(enable = "avx512f")]
30433 unsafe fn test_mm512_mask_cmpgt_epu32_mask() {
30434 #[rustfmt::skip]
30435 let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
30436 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
30437 let b = _mm512_set1_epi32(-1);
30438 let mask = 0b01111010_01111010;
30439 let r = _mm512_mask_cmpgt_epu32_mask(mask, b, a);
30440 assert_eq!(r, 0b01001010_01001010);
30441 }
30442
30443 #[simd_test(enable = "avx512f")]
30444 unsafe fn test_mm512_cmple_epu32_mask() {
30445 #[rustfmt::skip]
30446 let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
30447 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
30448 let b = _mm512_set1_epi32(-1);
30449 assert_eq!(
30450 _mm512_cmple_epu32_mask(a, b),
30451 !_mm512_cmpgt_epu32_mask(a, b)
30452 )
30453 }
30454
30455 #[simd_test(enable = "avx512f")]
30456 unsafe fn test_mm512_mask_cmple_epu32_mask() {
30457 #[rustfmt::skip]
30458 let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
30459 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
30460 let b = _mm512_set1_epi32(-1);
30461 let mask = 0b01111010_01111010;
30462 assert_eq!(
30463 _mm512_mask_cmple_epu32_mask(mask, a, b),
30464 0b01111010_01111010
30465 );
30466 }
30467
30468 #[simd_test(enable = "avx512f")]
30469 unsafe fn test_mm512_cmpge_epu32_mask() {
30470 #[rustfmt::skip]
30471 let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
30472 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
30473 let b = _mm512_set1_epi32(-1);
30474 assert_eq!(
30475 _mm512_cmpge_epu32_mask(a, b),
30476 !_mm512_cmplt_epu32_mask(a, b)
30477 )
30478 }
30479
30480 #[simd_test(enable = "avx512f")]
30481 unsafe fn test_mm512_mask_cmpge_epu32_mask() {
30482 #[rustfmt::skip]
30483 let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
30484 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
30485 let b = _mm512_set1_epi32(-1);
30486 let mask = 0b01111010_01111010;
30487 assert_eq!(_mm512_mask_cmpge_epu32_mask(mask, a, b), 0b00110000_00110000);
30488 }
30489
30490 #[simd_test(enable = "avx512f")]
30491 unsafe fn test_mm512_cmpeq_epu32_mask() {
30492 #[rustfmt::skip]
30493 let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
30494 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
30495 #[rustfmt::skip]
30496 let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100,
30497 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100);
30498 let m = _mm512_cmpeq_epu32_mask(b, a);
30499 assert_eq!(m, 0b11001111_11001111);
30500 }
30501
30502 #[simd_test(enable = "avx512f")]
30503 unsafe fn test_mm512_mask_cmpeq_epu32_mask() {
30504 #[rustfmt::skip]
30505 let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
30506 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
30507 #[rustfmt::skip]
30508 let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100,
30509 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100);
30510 let mask = 0b01111010_01111010;
30511 let r = _mm512_mask_cmpeq_epu32_mask(mask, b, a);
30512 assert_eq!(r, 0b01001010_01001010);
30513 }
30514
30515 #[simd_test(enable = "avx512f")]
30516 unsafe fn test_mm512_cmpneq_epu32_mask() {
30517 #[rustfmt::skip]
30518 let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
30519 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
30520 #[rustfmt::skip]
30521 let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100,
30522 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100);
30523 let m = _mm512_cmpneq_epu32_mask(b, a);
30524 assert_eq!(m, !_mm512_cmpeq_epu32_mask(b, a));
30525 }
30526
30527 #[simd_test(enable = "avx512f")]
30528 unsafe fn test_mm512_mask_cmpneq_epu32_mask() {
30529 #[rustfmt::skip]
30530 let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, -100, 100,
30531 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, -100, 100);
30532 #[rustfmt::skip]
30533 let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100,
30534 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100);
30535 let mask = 0b01111010_01111010;
30536 let r = _mm512_mask_cmpneq_epu32_mask(mask, b, a);
30537 assert_eq!(r, 0b00110010_00110010);
30538 }
30539
30540 #[simd_test(enable = "avx512f")]
30541 unsafe fn test_mm512_cmp_epu32_mask() {
30542 #[rustfmt::skip]
30543 let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
30544 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
30545 let b = _mm512_set1_epi32(-1);
30546 let m = _mm512_cmp_epu32_mask(a, b, _MM_CMPINT_LT);
30547 assert_eq!(m, 0b11001111_11001111);
30548 }
30549
30550 #[simd_test(enable = "avx512f")]
30551 unsafe fn test_mm512_mask_cmp_epu32_mask() {
30552 #[rustfmt::skip]
30553 let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
30554 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
30555 let b = _mm512_set1_epi32(-1);
30556 let mask = 0b01111010_01111010;
30557 let r = _mm512_mask_cmp_epu32_mask(mask, a, b, _MM_CMPINT_LT);
30558 assert_eq!(r, 0b01001010_01001010);
30559 }
30560
30561 #[simd_test(enable = "avx512f")]
30562 unsafe fn test_mm512_cmplt_epi32_mask() {
30563 #[rustfmt::skip]
30564 let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
30565 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
30566 let b = _mm512_set1_epi32(-1);
30567 let m = _mm512_cmplt_epi32_mask(a, b);
30568 assert_eq!(m, 0b00000101_00000101);
30569 }
30570
30571 #[simd_test(enable = "avx512f")]
30572 unsafe fn test_mm512_mask_cmplt_epi32_mask() {
30573 #[rustfmt::skip]
30574 let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
30575 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
30576 let b = _mm512_set1_epi32(-1);
30577 let mask = 0b01100110_01100110;
30578 let r = _mm512_mask_cmplt_epi32_mask(mask, a, b);
30579 assert_eq!(r, 0b00000100_00000100);
30580 }
30581
30582 #[simd_test(enable = "avx512f")]
30583 unsafe fn test_mm512_cmpgt_epi32_mask() {
30584 #[rustfmt::skip]
30585 let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100,
30586 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100);
30587 let b = _mm512_set1_epi32(-1);
30588 let m = _mm512_cmpgt_epi32_mask(b, a);
30589 assert_eq!(m, 0b00000101_00000101);
30590 }
30591
30592 #[simd_test(enable = "avx512f")]
30593 unsafe fn test_mm512_mask_cmpgt_epi32_mask() {
30594 #[rustfmt::skip]
30595 let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100,
30596 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100);
30597 let b = _mm512_set1_epi32(-1);
30598 let mask = 0b01100110_01100110;
30599 let r = _mm512_mask_cmpgt_epi32_mask(mask, b, a);
30600 assert_eq!(r, 0b00000100_00000100);
30601 }
30602
30603 #[simd_test(enable = "avx512f")]
30604 unsafe fn test_mm512_cmple_epi32_mask() {
30605 #[rustfmt::skip]
30606 let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
30607 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
30608 let b = _mm512_set1_epi32(-1);
30609 assert_eq!(
30610 _mm512_cmple_epi32_mask(a, b),
30611 !_mm512_cmpgt_epi32_mask(a, b)
30612 )
30613 }
30614
30615 #[simd_test(enable = "avx512f")]
30616 unsafe fn test_mm512_mask_cmple_epi32_mask() {
30617 #[rustfmt::skip]
30618 let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
30619 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
30620 let b = _mm512_set1_epi32(-1);
30621 let mask = 0b01111010_01111010;
30622 assert_eq!(_mm512_mask_cmple_epi32_mask(mask, a, b), 0b00110000_00110000);
30623 }
30624
30625 #[simd_test(enable = "avx512f")]
30626 unsafe fn test_mm512_cmpge_epi32_mask() {
30627 #[rustfmt::skip]
30628 let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
30629 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
30630 let b = _mm512_set1_epi32(-1);
30631 assert_eq!(
30632 _mm512_cmpge_epi32_mask(a, b),
30633 !_mm512_cmplt_epi32_mask(a, b)
30634 )
30635 }
30636
30637 #[simd_test(enable = "avx512f")]
30638 unsafe fn test_mm512_mask_cmpge_epi32_mask() {
30639 #[rustfmt::skip]
30640 let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
30641 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
30642 let b = _mm512_set1_epi32(-1);
30643 let mask = 0b01111010_01111010;
30644 assert_eq!(
30645 _mm512_mask_cmpge_epi32_mask(mask, a, b),
30646 0b01111010_01111010
30647 );
30648 }
30649
30650 #[simd_test(enable = "avx512f")]
30651 unsafe fn test_mm512_cmpeq_epi32_mask() {
30652 #[rustfmt::skip]
30653 let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100,
30654 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100);
30655 #[rustfmt::skip]
30656 let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100,
30657 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100);
30658 let m = _mm512_cmpeq_epi32_mask(b, a);
30659 assert_eq!(m, 0b11001111_11001111);
30660 }
30661
30662 #[simd_test(enable = "avx512f")]
30663 unsafe fn test_mm512_mask_cmpeq_epi32_mask() {
30664 #[rustfmt::skip]
30665 let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100,
30666 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100);
30667 #[rustfmt::skip]
30668 let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100,
30669 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100);
30670 let mask = 0b01111010_01111010;
30671 let r = _mm512_mask_cmpeq_epi32_mask(mask, b, a);
30672 assert_eq!(r, 0b01001010_01001010);
30673 }
30674
30675 #[simd_test(enable = "avx512f")]
30676 unsafe fn test_mm512_cmpneq_epi32_mask() {
30677 #[rustfmt::skip]
30678 let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100,
30679 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100);
30680 #[rustfmt::skip]
30681 let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100,
30682 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100);
30683 let m = _mm512_cmpneq_epi32_mask(b, a);
30684 assert_eq!(m, !_mm512_cmpeq_epi32_mask(b, a));
30685 }
30686
30687 #[simd_test(enable = "avx512f")]
30688 unsafe fn test_mm512_mask_cmpneq_epi32_mask() {
30689 #[rustfmt::skip]
30690 let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, -100, 100,
30691 0, 1, -1, 13, i32::MAX, i32::MIN, -100, 100);
30692 #[rustfmt::skip]
30693 let b = _mm512_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100,
30694 0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100);
30695 let mask = 0b01111010_01111010;
30696 let r = _mm512_mask_cmpneq_epi32_mask(mask, b, a);
30697 assert_eq!(r, 0b00110010_00110010)
30698 }
30699
30700 #[simd_test(enable = "avx512f")]
30701 unsafe fn test_mm512_cmp_epi32_mask() {
30702 #[rustfmt::skip]
30703 let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100,
30704 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100);
30705 let b = _mm512_set1_epi32(-1);
30706 let m = _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_LT);
30707 assert_eq!(m, 0b00000101_00000101);
30708 }
30709
30710 #[simd_test(enable = "avx512f")]
30711 unsafe fn test_mm512_mask_cmp_epi32_mask() {
30712 #[rustfmt::skip]
30713 let a = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100,
30714 0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100);
30715 let b = _mm512_set1_epi32(-1);
30716 let mask = 0b01100110_01100110;
30717 let r = _mm512_mask_cmp_epi32_mask(mask, a, b, _MM_CMPINT_LT);
30718 assert_eq!(r, 0b00000100_00000100);
30719 }
30720
30721 #[simd_test(enable = "avx512f")]
30722 unsafe fn test_mm512_set_epi8() {
30723 let r = _mm512_set1_epi8(2);
30724 assert_eq_m512i(
30725 r,
30726 _mm512_set_epi8(
30727 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
30728 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
30729 2, 2, 2, 2, 2, 2, 2, 2,
30730 ),
30731 )
30732 }
30733
30734 #[simd_test(enable = "avx512f")]
30735 unsafe fn test_mm512_set_epi16() {
30736 let r = _mm512_set1_epi16(2);
30737 assert_eq_m512i(
30738 r,
30739 _mm512_set_epi16(
30740 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
30741 2, 2, 2, 2,
30742 ),
30743 )
30744 }
30745
30746 #[simd_test(enable = "avx512f")]
30747 unsafe fn test_mm512_set_epi32() {
30748 let r = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
30749 assert_eq_m512i(
30750 r,
30751 _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
30752 )
30753 }
30754
30755 #[simd_test(enable = "avx512f")]
30756 unsafe fn test_mm512_setr_epi32() {
30757 let r = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
30758 assert_eq_m512i(
30759 r,
30760 _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
30761 )
30762 }
30763
30764 #[simd_test(enable = "avx512f")]
30765 unsafe fn test_mm512_set1_epi8() {
30766 let r = _mm512_set_epi8(
30767 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
30768 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
30769 2, 2, 2, 2, 2, 2,
30770 );
30771 assert_eq_m512i(r, _mm512_set1_epi8(2));
30772 }
30773
30774 #[simd_test(enable = "avx512f")]
30775 unsafe fn test_mm512_set1_epi16() {
30776 let r = _mm512_set_epi16(
30777 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
30778 2, 2, 2,
30779 );
30780 assert_eq_m512i(r, _mm512_set1_epi16(2));
30781 }
30782
30783 #[simd_test(enable = "avx512f")]
30784 unsafe fn test_mm512_set1_epi32() {
30785 let r = _mm512_set_epi32(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
30786 assert_eq_m512i(r, _mm512_set1_epi32(2));
30787 }
30788
30789 #[simd_test(enable = "avx512f")]
30790 unsafe fn test_mm512_setzero_si512() {
30791 assert_eq_m512i(_mm512_set1_epi32(0), _mm512_setzero_si512());
30792 }
30793
30794 #[simd_test(enable = "avx512f")]
30795 unsafe fn test_mm512_setzero_epi32() {
30796 assert_eq_m512i(_mm512_set1_epi32(0), _mm512_setzero_epi32());
30797 }
30798
30799 #[simd_test(enable = "avx512f")]
30800 unsafe fn test_mm512_set_ps() {
30801 let r = _mm512_setr_ps(
30802 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
30803 );
30804 assert_eq_m512(
30805 r,
30806 _mm512_set_ps(
30807 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
30808 ),
30809 )
30810 }
30811
30812 #[simd_test(enable = "avx512f")]
30813 unsafe fn test_mm512_setr_ps() {
30814 let r = _mm512_set_ps(
30815 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
30816 );
30817 assert_eq_m512(
30818 r,
30819 _mm512_setr_ps(
30820 15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
30821 ),
30822 )
30823 }
30824
30825 #[simd_test(enable = "avx512f")]
30826 unsafe fn test_mm512_set1_ps() {
30827 #[rustfmt::skip]
30828 let expected = _mm512_set_ps(2., 2., 2., 2., 2., 2., 2., 2.,
30829 2., 2., 2., 2., 2., 2., 2., 2.);
30830 assert_eq_m512(expected, _mm512_set1_ps(2.));
30831 }
30832
30833 #[simd_test(enable = "avx512f")]
30834 unsafe fn test_mm512_set4_epi32() {
30835 let r = _mm512_set_epi32(4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1);
30836 assert_eq_m512i(r, _mm512_set4_epi32(4, 3, 2, 1));
30837 }
30838
30839 #[simd_test(enable = "avx512f")]
30840 unsafe fn test_mm512_set4_ps() {
30841 let r = _mm512_set_ps(
30842 4., 3., 2., 1., 4., 3., 2., 1., 4., 3., 2., 1., 4., 3., 2., 1.,
30843 );
30844 assert_eq_m512(r, _mm512_set4_ps(4., 3., 2., 1.));
30845 }
30846
30847 #[simd_test(enable = "avx512f")]
30848 unsafe fn test_mm512_setr4_epi32() {
30849 let r = _mm512_set_epi32(4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1);
30850 assert_eq_m512i(r, _mm512_setr4_epi32(1, 2, 3, 4));
30851 }
30852
30853 #[simd_test(enable = "avx512f")]
30854 unsafe fn test_mm512_setr4_ps() {
30855 let r = _mm512_set_ps(
30856 4., 3., 2., 1., 4., 3., 2., 1., 4., 3., 2., 1., 4., 3., 2., 1.,
30857 );
30858 assert_eq_m512(r, _mm512_setr4_ps(1., 2., 3., 4.));
30859 }
30860
30861 #[simd_test(enable = "avx512f")]
30862 unsafe fn test_mm512_setzero_ps() {
30863 assert_eq_m512(_mm512_setzero_ps(), _mm512_set1_ps(0.));
30864 }
30865
30866 #[simd_test(enable = "avx512f")]
30867 unsafe fn test_mm512_setzero() {
30868 assert_eq_m512(_mm512_setzero(), _mm512_set1_ps(0.));
30869 }
30870
30871 #[simd_test(enable = "avx512f")]
30872 unsafe fn test_mm512_loadu_pd() {
30873 let a = &[4., 3., 2., 5., 8., 9., 64., 50.];
30874 let p = a.as_ptr();
30875 let r = _mm512_loadu_pd(black_box(p));
30876 let e = _mm512_setr_pd(4., 3., 2., 5., 8., 9., 64., 50.);
30877 assert_eq_m512d(r, e);
30878 }
30879
30880 #[simd_test(enable = "avx512f")]
30881 unsafe fn test_mm512_storeu_pd() {
30882 let a = _mm512_set1_pd(9.);
30883 let mut r = _mm512_undefined_pd();
30884 _mm512_storeu_pd(&mut r as *mut _ as *mut f64, a);
30885 assert_eq_m512d(r, a);
30886 }
30887
30888 #[simd_test(enable = "avx512f")]
30889 unsafe fn test_mm512_loadu_ps() {
30890 let a = &[
30891 4., 3., 2., 5., 8., 9., 64., 50., -4., -3., -2., -5., -8., -9., -64., -50.,
30892 ];
30893 let p = a.as_ptr();
30894 let r = _mm512_loadu_ps(black_box(p));
30895 let e = _mm512_setr_ps(
30896 4., 3., 2., 5., 8., 9., 64., 50., -4., -3., -2., -5., -8., -9., -64., -50.,
30897 );
30898 assert_eq_m512(r, e);
30899 }
30900
30901 #[simd_test(enable = "avx512f")]
30902 unsafe fn test_mm512_storeu_ps() {
30903 let a = _mm512_set1_ps(9.);
30904 let mut r = _mm512_undefined_ps();
30905 _mm512_storeu_ps(&mut r as *mut _ as *mut f32, a);
30906 assert_eq_m512(r, a);
30907 }
30908
30909 #[simd_test(enable = "avx512f")]
30910 unsafe fn test_mm512_setr_pd() {
30911 let r = _mm512_set_pd(0., 1., 2., 3., 4., 5., 6., 7.);
30912 assert_eq_m512d(r, _mm512_setr_pd(7., 6., 5., 4., 3., 2., 1., 0.));
30913 }
30914
30915 #[simd_test(enable = "avx512f")]
30916 unsafe fn test_mm512_set_pd() {
30917 let r = _mm512_setr_pd(0., 1., 2., 3., 4., 5., 6., 7.);
30918 assert_eq_m512d(r, _mm512_set_pd(7., 6., 5., 4., 3., 2., 1., 0.));
30919 }
30920
30921 #[simd_test(enable = "avx512f")]
30922 unsafe fn test_mm512_rol_epi32() {
30923 let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
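// Rotating left by 1 wraps bit 31 of the first lane back around to bit 0.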
30924 let r = _mm512_rol_epi32(a, 1);
30925 let e = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
30926 assert_eq_m512i(r, e);
30927 }
30928
30929 #[simd_test(enable = "avx512f")]
30930 unsafe fn test_mm512_mask_rol_epi32() {
30931 let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
30932 let r = _mm512_mask_rol_epi32(a, 0, a, 1);
30933 assert_eq_m512i(r, a);
30934
30935 let r = _mm512_mask_rol_epi32(a, 0b11111111_11111111, a, 1);
30936 let e = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
30937 assert_eq_m512i(r, e);
30938 }
30939
30940 #[simd_test(enable = "avx512f")]
30941 unsafe fn test_mm512_maskz_rol_epi32() {
30942 let a = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 << 31);
30943 let r = _mm512_maskz_rol_epi32(0, a, 1);
30944 assert_eq_m512i(r, _mm512_setzero_si512());
30945
30946 let r = _mm512_maskz_rol_epi32(0b00000000_11111111, a, 1);
30947 let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 1 << 0);
30948 assert_eq_m512i(r, e);
30949 }
30950
30951 #[simd_test(enable = "avx512f")]
30952 unsafe fn test_mm512_ror_epi32() {
30953 let a = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
30954 let r = _mm512_ror_epi32(a, 1);
30955 let e = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
30956 assert_eq_m512i(r, e);
30957 }
30958
30959 #[simd_test(enable = "avx512f")]
30960 unsafe fn test_mm512_mask_ror_epi32() {
30961 let a = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
30962 let r = _mm512_mask_ror_epi32(a, 0, a, 1);
30963 assert_eq_m512i(r, a);
30964
30965 let r = _mm512_mask_ror_epi32(a, 0b11111111_11111111, a, 1);
30966 let e = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
30967 assert_eq_m512i(r, e);
30968 }
30969
30970 #[simd_test(enable = "avx512f")]
30971 unsafe fn test_mm512_maskz_ror_epi32() {
30972 let a = _mm512_set_epi32(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1 << 0);
30973 let r = _mm512_maskz_ror_epi32(0, a, 1);
30974 assert_eq_m512i(r, _mm512_setzero_si512());
30975
30976 let r = _mm512_maskz_ror_epi32(0b00000000_11111111, a, 1);
30977 let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 << 31);
30978 assert_eq_m512i(r, e);
30979 }
30980
30981 #[simd_test(enable = "avx512f")]
30982 unsafe fn test_mm512_slli_epi32() {
30983 let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
30984 let r = _mm512_slli_epi32(a, 1);
30985 let e = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
30986 assert_eq_m512i(r, e);
30987 }
30988
30989 #[simd_test(enable = "avx512f")]
30990 unsafe fn test_mm512_mask_slli_epi32() {
30991 let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
30992 let r = _mm512_mask_slli_epi32(a, 0, a, 1);
30993 assert_eq_m512i(r, a);
30994
30995 let r = _mm512_mask_slli_epi32(a, 0b11111111_11111111, a, 1);
30996 let e = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
30997 assert_eq_m512i(r, e);
30998 }
30999
31000 #[simd_test(enable = "avx512f")]
31001 unsafe fn test_mm512_maskz_slli_epi32() {
31002 let a = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 << 31);
31003 let r = _mm512_maskz_slli_epi32(0, a, 1);
31004 assert_eq_m512i(r, _mm512_setzero_si512());
31005
31006 let r = _mm512_maskz_slli_epi32(0b00000000_11111111, a, 1);
31007 let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 0);
31008 assert_eq_m512i(r, e);
31009 }
31010
31011 #[simd_test(enable = "avx512f")]
31012 unsafe fn test_mm512_srli_epi32() {
31013 let a = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
31014 let r = _mm512_srli_epi32(a, 1);
31015 let e = _mm512_set_epi32(0 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
31016 assert_eq_m512i(r, e);
31017 }
31018
31019 #[simd_test(enable = "avx512f")]
31020 unsafe fn test_mm512_mask_srli_epi32() {
31021 let a = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
31022 let r = _mm512_mask_srli_epi32(a, 0, a, 1);
31023 assert_eq_m512i(r, a);
31024
31025 let r = _mm512_mask_srli_epi32(a, 0b11111111_11111111, a, 1);
31026 let e = _mm512_set_epi32(0 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
31027 assert_eq_m512i(r, e);
31028 }
31029
31030 #[simd_test(enable = "avx512f")]
31031 unsafe fn test_mm512_maskz_srli_epi32() {
31032 let a = _mm512_set_epi32(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0);
31033 let r = _mm512_maskz_srli_epi32(0, a, 1);
31034 assert_eq_m512i(r, _mm512_setzero_si512());
31035
31036 let r = _mm512_maskz_srli_epi32(0b00000000_11111111, a, 1);
31037 let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0 << 31);
31038 assert_eq_m512i(r, e);
31039 }
31040
31041 #[simd_test(enable = "avx512f")]
31042 unsafe fn test_mm512_rolv_epi32() {
31043 let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
31044 let b = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
31045
31046 let r = _mm512_rolv_epi32(a, b);
31047
31048 let e = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
31049 assert_eq_m512i(r, e);
31050 }
31051
31052 #[simd_test(enable = "avx512f")]
31053 unsafe fn test_mm512_mask_rolv_epi32() {
31054 let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
31055 let b = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
31056
31057 let r = _mm512_mask_rolv_epi32(a, 0, a, b);
31058 assert_eq_m512i(r, a);
31059
31060 let r = _mm512_mask_rolv_epi32(a, 0b11111111_11111111, a, b);
31061
31062 let e = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
31063 assert_eq_m512i(r, e);
31064 }
31065
31066 #[simd_test(enable = "avx512f")]
31067 unsafe fn test_mm512_maskz_rolv_epi32() {
31068 let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 << 31);
31069 let b = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
31070
31071 let r = _mm512_maskz_rolv_epi32(0, a, b);
31072 assert_eq_m512i(r, _mm512_setzero_si512());
31073
31074 let r = _mm512_maskz_rolv_epi32(0b00000000_11111111, a, b);
31075
31076 let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 1 << 0);
31077 assert_eq_m512i(r, e);
31078 }
31079
31080 #[simd_test(enable = "avx512f")]
31081 unsafe fn test_mm512_rorv_epi32() {
31082 let a = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
31083 let b = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
31084
31085 let r = _mm512_rorv_epi32(a, b);
31086
31087 let e = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
31088 assert_eq_m512i(r, e);
31089 }
31090
31091 #[simd_test(enable = "avx512f")]
31092 unsafe fn test_mm512_mask_rorv_epi32() {
31093 let a = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
31094 let b = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
31095
31096 let r = _mm512_mask_rorv_epi32(a, 0, a, b);
31097 assert_eq_m512i(r, a);
31098
31099 let r = _mm512_mask_rorv_epi32(a, 0b11111111_11111111, a, b);
31100
31101 let e = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
31102 assert_eq_m512i(r, e);
31103 }
31104
31105 #[simd_test(enable = "avx512f")]
31106 unsafe fn test_mm512_maskz_rorv_epi32() {
31107 let a = _mm512_set_epi32(3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1 << 0);
31108 let b = _mm512_set_epi32(2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
31109
31110 let r = _mm512_maskz_rorv_epi32(0, a, b);
31111 assert_eq_m512i(r, _mm512_setzero_si512());
31112
31113 let r = _mm512_maskz_rorv_epi32(0b00000000_11111111, a, b);
31114
31115 let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 << 31);
31116 assert_eq_m512i(r, e);
31117 }
31118
31119 #[simd_test(enable = "avx512f")]
31120 unsafe fn test_mm512_sllv_epi32() {
31121 let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
31122 let count = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
31123
31124 let r = _mm512_sllv_epi32(a, count);
31125
31126 let e = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
31127 assert_eq_m512i(r, e);
31128 }
31129
31130 #[simd_test(enable = "avx512f")]
31131 unsafe fn test_mm512_mask_sllv_epi32() {
31132 let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
31133 let count = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
31134
31135 let r = _mm512_mask_sllv_epi32(a, 0, a, count);
31136 assert_eq_m512i(r, a);
31137
31138 let r = _mm512_mask_sllv_epi32(a, 0b11111111_11111111, a, count);
31139
31140 let e = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
31141 assert_eq_m512i(r, e);
31142 }
31143
31144 #[simd_test(enable = "avx512f")]
31145 unsafe fn test_mm512_maskz_sllv_epi32() {
31146 let a = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 << 31);
31147 let count = _mm512_set_epi32(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
31148
31149 let r = _mm512_maskz_sllv_epi32(0, a, count);
31150 assert_eq_m512i(r, _mm512_setzero_si512());
31151
31152 let r = _mm512_maskz_sllv_epi32(0b00000000_11111111, a, count);
31153
31154 let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 0);
31155 assert_eq_m512i(r, e);
31156 }
31157
31158 #[simd_test(enable = "avx512f")]
31159 unsafe fn test_mm512_srlv_epi32() {
31160 let a = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
31161 let count = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
31162
31163 let r = _mm512_srlv_epi32(a, count);
31164
31165 let e = _mm512_set_epi32(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
31166 assert_eq_m512i(r, e);
31167 }
31168
31169 #[simd_test(enable = "avx512f")]
31170 unsafe fn test_mm512_mask_srlv_epi32() {
31171 let a = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
31172 let count = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
31173
31174 let r = _mm512_mask_srlv_epi32(a, 0, a, count);
31175 assert_eq_m512i(r, a);
31176
31177 let r = _mm512_mask_srlv_epi32(a, 0b11111111_11111111, a, count);
31178
31179 let e = _mm512_set_epi32(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
31180 assert_eq_m512i(r, e);
31181 }
31182
31183 #[simd_test(enable = "avx512f")]
31184 unsafe fn test_mm512_maskz_srlv_epi32() {
31185 let a = _mm512_set_epi32(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0);
31186 let count = _mm512_set_epi32(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
31187
31188 let r = _mm512_maskz_srlv_epi32(0, a, count);
31189 assert_eq_m512i(r, _mm512_setzero_si512());
31190
31191 let r = _mm512_maskz_srlv_epi32(0b00000000_11111111, a, count);
31192
31193 let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0);
31194 assert_eq_m512i(r, e);
31195 }
31196
31197 #[simd_test(enable = "avx512f")]
31198 unsafe fn test_mm512_sll_epi32() {
31199 let a = _mm512_set_epi32(
31200 1 << 31,
31201 1 << 0,
31202 1 << 1,
31203 1 << 2,
31204 0,
31205 0,
31206 0,
31207 0,
31208 0,
31209 0,
31210 0,
31211 0,
31212 0,
31213 0,
31214 0,
31215 0,
31216 );
31217 let count = _mm_set_epi32(0, 0, 0, 2);
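// The shift amount is the low 64 bits of `count`, i.e. 2.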
31218 let r = _mm512_sll_epi32(a, count);
31219 let e = _mm512_set_epi32(
31220 0,
31221 1 << 2,
31222 1 << 3,
31223 1 << 4,
31224 0,
31225 0,
31226 0,
31227 0,
31228 0,
31229 0,
31230 0,
31231 0,
31232 0,
31233 0,
31234 0,
31235 0,
31236 );
31237 assert_eq_m512i(r, e);
31238 }
31239
31240 #[simd_test(enable = "avx512f")]
31241 unsafe fn test_mm512_mask_sll_epi32() {
31242 let a = _mm512_set_epi32(
31243 1 << 31,
31244 1 << 0,
31245 1 << 1,
31246 1 << 2,
31247 0,
31248 0,
31249 0,
31250 0,
31251 0,
31252 0,
31253 0,
31254 0,
31255 0,
31256 0,
31257 0,
31258 0,
31259 );
31260 let count = _mm_set_epi32(0, 0, 0, 2);
31261 let r = _mm512_mask_sll_epi32(a, 0, a, count);
31262 assert_eq_m512i(r, a);
31263
31264 let r = _mm512_mask_sll_epi32(a, 0b11111111_11111111, a, count);
31265 let e = _mm512_set_epi32(
31266 0,
31267 1 << 2,
31268 1 << 3,
31269 1 << 4,
31270 0,
31271 0,
31272 0,
31273 0,
31274 0,
31275 0,
31276 0,
31277 0,
31278 0,
31279 0,
31280 0,
31281 0,
31282 );
31283 assert_eq_m512i(r, e);
31284 }
31285
31286 #[simd_test(enable = "avx512f")]
31287 unsafe fn test_mm512_maskz_sll_epi32() {
31288 let a = _mm512_set_epi32(
31289 1 << 31,
31290 1 << 0,
31291 1 << 1,
31292 1 << 2,
31293 0,
31294 0,
31295 0,
31296 0,
31297 0,
31298 0,
31299 0,
31300 0,
31301 0,
31302 0,
31303 0,
31304 1 << 31,
31305 );
31306 let count = _mm_set_epi32(2, 0, 0, 2);
31307 let r = _mm512_maskz_sll_epi32(0, a, count);
31308 assert_eq_m512i(r, _mm512_setzero_si512());
31309
31310 let r = _mm512_maskz_sll_epi32(0b00000000_11111111, a, count);
31311 let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
31312 assert_eq_m512i(r, e);
31313 }
31314
31315 #[simd_test(enable = "avx512f")]
31316 unsafe fn test_mm512_srl_epi32() {
31317 let a = _mm512_set_epi32(
31318 1 << 31,
31319 1 << 0,
31320 1 << 1,
31321 1 << 2,
31322 0,
31323 0,
31324 0,
31325 0,
31326 0,
31327 0,
31328 0,
31329 0,
31330 0,
31331 0,
31332 0,
31333 0,
31334 );
31335 let count = _mm_set_epi32(0, 0, 0, 2);
31336 let r = _mm512_srl_epi32(a, count);
31337 let e = _mm512_set_epi32(1 << 29, 0, 0, 1 << 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
31338 assert_eq_m512i(r, e);
31339 }
31340
31341 #[simd_test(enable = "avx512f")]
31342 unsafe fn test_mm512_mask_srl_epi32() {
31343 let a = _mm512_set_epi32(
31344 1 << 31,
31345 1 << 0,
31346 1 << 1,
31347 1 << 2,
31348 0,
31349 0,
31350 0,
31351 0,
31352 0,
31353 0,
31354 0,
31355 0,
31356 0,
31357 0,
31358 0,
31359 0,
31360 );
31361 let count = _mm_set_epi32(0, 0, 0, 2);
31362 let r = _mm512_mask_srl_epi32(a, 0, a, count);
31363 assert_eq_m512i(r, a);
31364
31365 let r = _mm512_mask_srl_epi32(a, 0b11111111_11111111, a, count);
31366 let e = _mm512_set_epi32(1 << 29, 0, 0, 1 << 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
31367 assert_eq_m512i(r, e);
31368 }
31369
31370 #[simd_test(enable = "avx512f")]
31371 unsafe fn test_mm512_maskz_srl_epi32() {
31372 let a = _mm512_set_epi32(
31373 1 << 31,
31374 1 << 0,
31375 1 << 1,
31376 1 << 2,
31377 0,
31378 0,
31379 0,
31380 0,
31381 0,
31382 0,
31383 0,
31384 0,
31385 0,
31386 0,
31387 0,
31388 1 << 31,
31389 );
31390 let count = _mm_set_epi32(2, 0, 0, 2);
31391 let r = _mm512_maskz_srl_epi32(0, a, count);
31392 assert_eq_m512i(r, _mm512_setzero_si512());
31393
31394 let r = _mm512_maskz_srl_epi32(0b00000000_11111111, a, count);
31395 let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 << 29);
31396 assert_eq_m512i(r, e);
31397 }
31398
31399 #[simd_test(enable = "avx512f")]
31400 unsafe fn test_mm512_sra_epi32() {
31401 let a = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1);
31402 let count = _mm_set_epi32(1, 0, 0, 2);
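// Only the low 64 bits of `count` matter, so every lane is shifted right arithmetically by 2;
// the sign is kept and rounding is toward negative infinity, e.g. -15 >> 2 == -4.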
31403 let r = _mm512_sra_epi32(a, count);
31404 let e = _mm512_set_epi32(2, -2, 4, -4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
31405 assert_eq_m512i(r, e);
31406 }
31407
31408 #[simd_test(enable = "avx512f")]
31409 unsafe fn test_mm512_mask_sra_epi32() {
31410 let a = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16);
31411 let count = _mm_set_epi32(0, 0, 0, 2);
31412 let r = _mm512_mask_sra_epi32(a, 0, a, count);
31413 assert_eq_m512i(r, a);
31414
31415 let r = _mm512_mask_sra_epi32(a, 0b11111111_11111111, a, count);
31416 let e = _mm512_set_epi32(2, -2, 4, -4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4);
31417 assert_eq_m512i(r, e);
31418 }
31419
31420 #[simd_test(enable = "avx512f")]
31421 unsafe fn test_mm512_maskz_sra_epi32() {
31422 let a = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -15, -14);
31423 let count = _mm_set_epi32(2, 0, 0, 2);
31424 let r = _mm512_maskz_sra_epi32(0, a, count);
31425 assert_eq_m512i(r, _mm512_setzero_si512());
31426
31427 let r = _mm512_maskz_sra_epi32(0b00000000_11111111, a, count);
31428 let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4, -4);
31429 assert_eq_m512i(r, e);
31430 }
31431
31432 #[simd_test(enable = "avx512f")]
31433 unsafe fn test_mm512_srav_epi32() {
31434 let a = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1);
31435 let count = _mm512_set_epi32(2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
31436 let r = _mm512_srav_epi32(a, count);
31437 let e = _mm512_set_epi32(2, -2, 4, -4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1);
31438 assert_eq_m512i(r, e);
31439 }
31440
31441 #[simd_test(enable = "avx512f")]
31442 unsafe fn test_mm512_mask_srav_epi32() {
31443 let a = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16);
31444 let count = _mm512_set_epi32(2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1);
31445 let r = _mm512_mask_srav_epi32(a, 0, a, count);
31446 assert_eq_m512i(r, a);
31447
31448 let r = _mm512_mask_srav_epi32(a, 0b11111111_11111111, a, count);
31449 let e = _mm512_set_epi32(2, -2, 4, -4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8);
31450 assert_eq_m512i(r, e);
31451 }
31452
31453 #[simd_test(enable = "avx512f")]
31454 unsafe fn test_mm512_maskz_srav_epi32() {
31455 let a = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -15, -14);
31456 let count = _mm512_set_epi32(2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2);
31457 let r = _mm512_maskz_srav_epi32(0, a, count);
31458 assert_eq_m512i(r, _mm512_setzero_si512());
31459
31460 let r = _mm512_maskz_srav_epi32(0b00000000_11111111, a, count);
31461 let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4, -4);
31462 assert_eq_m512i(r, e);
31463 }
31464
31465 #[simd_test(enable = "avx512f")]
31466 unsafe fn test_mm512_srai_epi32() {
31467 let a = _mm512_set_epi32(8, -8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, -15);
31468 let r = _mm512_srai_epi32(a, 2);
31469 let e = _mm512_set_epi32(2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, -4);
31470 assert_eq_m512i(r, e);
31471 }
31472
31473 #[simd_test(enable = "avx512f")]
31474 unsafe fn test_mm512_mask_srai_epi32() {
31475 let a = _mm512_set_epi32(8, -8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, -15);
31476 let r = _mm512_mask_srai_epi32(a, 0, a, 2);
31477 assert_eq_m512i(r, a);
31478
31479 let r = _mm512_mask_srai_epi32(a, 0b11111111_11111111, a, 2);
31480 let e = _mm512_set_epi32(2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, -4);
31481 assert_eq_m512i(r, e);
31482 }
31483
31484 #[simd_test(enable = "avx512f")]
31485 unsafe fn test_mm512_maskz_srai_epi32() {
31486 let a = _mm512_set_epi32(8, -8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, -15);
31487 let r = _mm512_maskz_srai_epi32(0, a, 2);
31488 assert_eq_m512i(r, _mm512_setzero_si512());
31489
31490 let r = _mm512_maskz_srai_epi32(0b00000000_11111111, a, 2);
31491 let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, -4);
31492 assert_eq_m512i(r, e);
31493 }
31494
31495 #[simd_test(enable = "avx512f")]
31496 unsafe fn test_mm512_permute_ps() {
31497 let a = _mm512_set_ps(
31498 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
31499 );
31500 let r = _mm512_permute_ps(a, 1);
31501 let e = _mm512_set_ps(
31502 2., 2., 2., 2., 6., 6., 6., 6., 10., 10., 10., 10., 14., 14., 14., 14.,
31503 );
31504 assert_eq_m512(r, e);
31505 }
31506
31507 #[simd_test(enable = "avx512f")]
31508 unsafe fn test_mm512_mask_permute_ps() {
31509 let a = _mm512_set_ps(
31510 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
31511 );
31512 let r = _mm512_mask_permute_ps(a, 0b00000000_00000000, a, 1);
31513 assert_eq_m512(r, a);
31514 let r = _mm512_mask_permute_ps(a, 0b11111111_11111111, a, 1);
31515 let e = _mm512_set_ps(
31516 2., 2., 2., 2., 6., 6., 6., 6., 10., 10., 10., 10., 14., 14., 14., 14.,
31517 );
31518 assert_eq_m512(r, e);
31519 }
31520
31521 #[simd_test(enable = "avx512f")]
31522 unsafe fn test_mm512_maskz_permute_ps() {
31523 let a = _mm512_set_ps(
31524 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
31525 );
31526 let r = _mm512_maskz_permute_ps(0, a, 1);
31527 assert_eq_m512(r, _mm512_setzero_ps());
31528 let r = _mm512_maskz_permute_ps(0b00000000_11111111, a, 1);
31529 let e = _mm512_set_ps(
31530 0., 0., 0., 0., 0., 0., 0., 0., 10., 10., 10., 10., 14., 14., 14., 14.,
31531 );
31532 assert_eq_m512(r, e);
31533 }
31534
31535 #[simd_test(enable = "avx512f")]
31536 unsafe fn test_mm512_permutevar_epi32() {
31537 let idx = _mm512_set1_epi32(1);
31538 let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
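// _mm512_set_epi32 lists lanes from high to low, so lane 1 of `a` holds 14; an all-ones
// index therefore broadcasts 14 to every lane.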
31539 let r = _mm512_permutevar_epi32(idx, a);
31540 let e = _mm512_set1_epi32(14);
31541 assert_eq_m512i(r, e);
31542 }
31543
31544 #[simd_test(enable = "avx512f")]
31545 unsafe fn test_mm512_mask_permutevar_epi32() {
31546 let idx = _mm512_set1_epi32(1);
31547 let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
31548 let r = _mm512_mask_permutevar_epi32(a, 0, idx, a);
31549 assert_eq_m512i(r, a);
31550 let r = _mm512_mask_permutevar_epi32(a, 0b11111111_11111111, idx, a);
31551 let e = _mm512_set1_epi32(14);
31552 assert_eq_m512i(r, e);
31553 }
31554
31555 #[simd_test(enable = "avx512f")]
31556 unsafe fn test_mm512_permutevar_ps() {
31557 let a = _mm512_set_ps(
31558 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
31559 );
31560 let b = _mm512_set1_epi32(1);
31561 let r = _mm512_permutevar_ps(a, b);
31562 let e = _mm512_set_ps(
31563 2., 2., 2., 2., 6., 6., 6., 6., 10., 10., 10., 10., 14., 14., 14., 14.,
31564 );
31565 assert_eq_m512(r, e);
31566 }
31567
31568 #[simd_test(enable = "avx512f")]
31569 unsafe fn test_mm512_mask_permutevar_ps() {
31570 let a = _mm512_set_ps(
31571 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
31572 );
31573 let b = _mm512_set1_epi32(1);
31574 let r = _mm512_mask_permutevar_ps(a, 0, a, b);
31575 assert_eq_m512(r, a);
31576 let r = _mm512_mask_permutevar_ps(a, 0b11111111_11111111, a, b);
31577 let e = _mm512_set_ps(
31578 2., 2., 2., 2., 6., 6., 6., 6., 10., 10., 10., 10., 14., 14., 14., 14.,
31579 );
31580 assert_eq_m512(r, e);
31581 }
31582
31583 #[simd_test(enable = "avx512f")]
31584 unsafe fn test_mm512_maskz_permutevar_ps() {
31585 let a = _mm512_set_ps(
31586 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
31587 );
31588 let b = _mm512_set1_epi32(1);
31589 let r = _mm512_maskz_permutevar_ps(0, a, b);
31590 assert_eq_m512(r, _mm512_setzero_ps());
31591 let r = _mm512_maskz_permutevar_ps(0b00000000_11111111, a, b);
31592 let e = _mm512_set_ps(
31593 0., 0., 0., 0., 0., 0., 0., 0., 10., 10., 10., 10., 14., 14., 14., 14.,
31594 );
31595 assert_eq_m512(r, e);
31596 }
31597
31598 #[simd_test(enable = "avx512f")]
31599 unsafe fn test_mm512_permutexvar_epi32() {
31600 let idx = _mm512_set1_epi32(1);
31601 let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
31602 let r = _mm512_permutexvar_epi32(idx, a);
31603 let e = _mm512_set1_epi32(14);
31604 assert_eq_m512i(r, e);
31605 }
31606
31607 #[simd_test(enable = "avx512f")]
31608 unsafe fn test_mm512_mask_permutexvar_epi32() {
31609 let idx = _mm512_set1_epi32(1);
31610 let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
31611 let r = _mm512_mask_permutexvar_epi32(a, 0, idx, a);
31612 assert_eq_m512i(r, a);
31613 let r = _mm512_mask_permutexvar_epi32(a, 0b11111111_11111111, idx, a);
31614 let e = _mm512_set1_epi32(14);
31615 assert_eq_m512i(r, e);
31616 }
31617
31618 #[simd_test(enable = "avx512f")]
31619 unsafe fn test_mm512_maskz_permutexvar_epi32() {
31620 let idx = _mm512_set1_epi32(1);
31621 let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
31622 let r = _mm512_maskz_permutexvar_epi32(0, idx, a);
31623 assert_eq_m512i(r, _mm512_setzero_si512());
31624 let r = _mm512_maskz_permutexvar_epi32(0b00000000_11111111, idx, a);
31625 let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 14, 14, 14, 14, 14, 14, 14, 14);
31626 assert_eq_m512i(r, e);
31627 }
31628
31629 #[simd_test(enable = "avx512f")]
31630 unsafe fn test_mm512_permutexvar_ps() {
31631 let idx = _mm512_set1_epi32(1);
31632 let a = _mm512_set_ps(
31633 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
31634 );
31635 let r = _mm512_permutexvar_ps(idx, a);
31636 let e = _mm512_set1_ps(14.);
31637 assert_eq_m512(r, e);
31638 }
31639
31640 #[simd_test(enable = "avx512f")]
31641 unsafe fn test_mm512_mask_permutexvar_ps() {
31642 let idx = _mm512_set1_epi32(1);
31643 let a = _mm512_set_ps(
31644 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
31645 );
31646 let r = _mm512_mask_permutexvar_ps(a, 0, idx, a);
31647 assert_eq_m512(r, a);
31648 let r = _mm512_mask_permutexvar_ps(a, 0b11111111_11111111, idx, a);
31649 let e = _mm512_set1_ps(14.);
31650 assert_eq_m512(r, e);
31651 }
31652
31653 #[simd_test(enable = "avx512f")]
31654 unsafe fn test_mm512_maskz_permutexvar_ps() {
31655 let idx = _mm512_set1_epi32(1);
31656 let a = _mm512_set_ps(
31657 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
31658 );
31659 let r = _mm512_maskz_permutexvar_ps(0, idx, a);
31660 assert_eq_m512(r, _mm512_setzero_ps());
31661 let r = _mm512_maskz_permutexvar_ps(0b00000000_11111111, idx, a);
31662 let e = _mm512_set_ps(
31663 0., 0., 0., 0., 0., 0., 0., 0., 14., 14., 14., 14., 14., 14., 14., 14.,
31664 );
31665 assert_eq_m512(r, e);
31666 }
31667
31668 #[simd_test(enable = "avx512f")]
31669 unsafe fn test_mm512_permutex2var_epi32() {
31670 let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
31671 let idx = _mm512_set_epi32(
31672 1,
31673 1 << 4,
31674 2,
31675 1 << 4,
31676 3,
31677 1 << 4,
31678 4,
31679 1 << 4,
31680 5,
31681 1 << 4,
31682 6,
31683 1 << 4,
31684 7,
31685 1 << 4,
31686 8,
31687 1 << 4,
31688 );
31689 let b = _mm512_set1_epi32(100);
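// Index bit 4 (1 << 4) selects the lane from `b`; indices without it pick from `a` using
// their low four bits.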
31690 let r = _mm512_permutex2var_epi32(a, idx, b);
31691 let e = _mm512_set_epi32(
31692 14, 100, 13, 100, 12, 100, 11, 100, 10, 100, 9, 100, 8, 100, 7, 100,
31693 );
31694 assert_eq_m512i(r, e);
31695 }
31696
31697 #[simd_test(enable = "avx512f")]
31698 unsafe fn test_mm512_mask_permutex2var_epi32() {
31699 let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
31700 let idx = _mm512_set_epi32(
31701 1,
31702 1 << 4,
31703 2,
31704 1 << 4,
31705 3,
31706 1 << 4,
31707 4,
31708 1 << 4,
31709 5,
31710 1 << 4,
31711 6,
31712 1 << 4,
31713 7,
31714 1 << 4,
31715 8,
31716 1 << 4,
31717 );
31718 let b = _mm512_set1_epi32(100);
31719 let r = _mm512_mask_permutex2var_epi32(a, 0, idx, b);
31720 assert_eq_m512i(r, a);
31721 let r = _mm512_mask_permutex2var_epi32(a, 0b11111111_11111111, idx, b);
31722 let e = _mm512_set_epi32(
31723 14, 100, 13, 100, 12, 100, 11, 100, 10, 100, 9, 100, 8, 100, 7, 100,
31724 );
31725 assert_eq_m512i(r, e);
31726 }
31727
31728 #[simd_test(enable = "avx512f")]
31729 unsafe fn test_mm512_maskz_permutex2var_epi32() {
31730 let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
31731 let idx = _mm512_set_epi32(
31732 1,
31733 1 << 4,
31734 2,
31735 1 << 4,
31736 3,
31737 1 << 4,
31738 4,
31739 1 << 4,
31740 5,
31741 1 << 4,
31742 6,
31743 1 << 4,
31744 7,
31745 1 << 4,
31746 8,
31747 1 << 4,
31748 );
31749 let b = _mm512_set1_epi32(100);
31750 let r = _mm512_maskz_permutex2var_epi32(0, a, idx, b);
31751 assert_eq_m512i(r, _mm512_setzero_si512());
31752 let r = _mm512_maskz_permutex2var_epi32(0b00000000_11111111, a, idx, b);
31753 let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 10, 100, 9, 100, 8, 100, 7, 100);
31754 assert_eq_m512i(r, e);
31755 }
31756
31757 #[simd_test(enable = "avx512f")]
31758 unsafe fn test_mm512_mask2_permutex2var_epi32() {
31759 let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
31760 let idx = _mm512_set_epi32(
31761 1000,
31762 1 << 4,
31763 2000,
31764 1 << 4,
31765 3000,
31766 1 << 4,
31767 4000,
31768 1 << 4,
31769 5,
31770 1 << 4,
31771 6,
31772 1 << 4,
31773 7,
31774 1 << 4,
31775 8,
31776 1 << 4,
31777 );
31778 let b = _mm512_set1_epi32(100);
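// The mask2 variant copies unselected lanes from `idx` itself, so an all-zero mask returns
// `idx` and the 1000/2000/... sentinels survive in the upper half below.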
31779 let r = _mm512_mask2_permutex2var_epi32(a, idx, 0, b);
31780 assert_eq_m512i(r, idx);
31781 let r = _mm512_mask2_permutex2var_epi32(a, idx, 0b00000000_11111111, b);
31782 let e = _mm512_set_epi32(
31783 1000,
31784 1 << 4,
31785 2000,
31786 1 << 4,
31787 3000,
31788 1 << 4,
31789 4000,
31790 1 << 4,
31791 10,
31792 100,
31793 9,
31794 100,
31795 8,
31796 100,
31797 7,
31798 100,
31799 );
31800 assert_eq_m512i(r, e);
31801 }
31802
31803 #[simd_test(enable = "avx512f")]
31804 unsafe fn test_mm512_permutex2var_ps() {
31805 let a = _mm512_set_ps(
31806 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
31807 );
31808 let idx = _mm512_set_epi32(
31809 1,
31810 1 << 4,
31811 2,
31812 1 << 4,
31813 3,
31814 1 << 4,
31815 4,
31816 1 << 4,
31817 5,
31818 1 << 4,
31819 6,
31820 1 << 4,
31821 7,
31822 1 << 4,
31823 8,
31824 1 << 4,
31825 );
31826 let b = _mm512_set1_ps(100.);
31827 let r = _mm512_permutex2var_ps(a, idx, b);
31828 let e = _mm512_set_ps(
31829 14., 100., 13., 100., 12., 100., 11., 100., 10., 100., 9., 100., 8., 100., 7., 100.,
31830 );
31831 assert_eq_m512(r, e);
31832 }
31833
31834 #[simd_test(enable = "avx512f")]
31835 unsafe fn test_mm512_mask_permutex2var_ps() {
31836 let a = _mm512_set_ps(
31837 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
31838 );
31839 let idx = _mm512_set_epi32(
31840 1,
31841 1 << 4,
31842 2,
31843 1 << 4,
31844 3,
31845 1 << 4,
31846 4,
31847 1 << 4,
31848 5,
31849 1 << 4,
31850 6,
31851 1 << 4,
31852 7,
31853 1 << 4,
31854 8,
31855 1 << 4,
31856 );
31857 let b = _mm512_set1_ps(100.);
31858 let r = _mm512_mask_permutex2var_ps(a, 0, idx, b);
31859 assert_eq_m512(r, a);
31860 let r = _mm512_mask_permutex2var_ps(a, 0b11111111_11111111, idx, b);
31861 let e = _mm512_set_ps(
31862 14., 100., 13., 100., 12., 100., 11., 100., 10., 100., 9., 100., 8., 100., 7., 100.,
31863 );
31864 assert_eq_m512(r, e);
31865 }
31866
31867 #[simd_test(enable = "avx512f")]
31868 unsafe fn test_mm512_maskz_permutex2var_ps() {
31869 let a = _mm512_set_ps(
31870 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
31871 );
31872 let idx = _mm512_set_epi32(
31873 1,
31874 1 << 4,
31875 2,
31876 1 << 4,
31877 3,
31878 1 << 4,
31879 4,
31880 1 << 4,
31881 5,
31882 1 << 4,
31883 6,
31884 1 << 4,
31885 7,
31886 1 << 4,
31887 8,
31888 1 << 4,
31889 );
31890 let b = _mm512_set1_ps(100.);
31891 let r = _mm512_maskz_permutex2var_ps(0, a, idx, b);
31892 assert_eq_m512(r, _mm512_setzero_ps());
31893 let r = _mm512_maskz_permutex2var_ps(0b00000000_11111111, a, idx, b);
31894 let e = _mm512_set_ps(
31895 0., 0., 0., 0., 0., 0., 0., 0., 10., 100., 9., 100., 8., 100., 7., 100.,
31896 );
31897 assert_eq_m512(r, e);
31898 }
31899
31900 #[simd_test(enable = "avx512f")]
31901 unsafe fn test_mm512_mask2_permutex2var_ps() {
31902 let a = _mm512_set_ps(
31903 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
31904 );
31905 let idx = _mm512_set_epi32(
31906 1,
31907 1 << 4,
31908 2,
31909 1 << 4,
31910 3,
31911 1 << 4,
31912 4,
31913 1 << 4,
31914 5,
31915 1 << 4,
31916 6,
31917 1 << 4,
31918 7,
31919 1 << 4,
31920 8,
31921 1 << 4,
31922 );
31923 let b = _mm512_set1_ps(100.);
31924 let r = _mm512_mask2_permutex2var_ps(a, idx, 0, b);
31925 assert_eq_m512(r, _mm512_setzero_ps());
31926 let r = _mm512_mask2_permutex2var_ps(a, idx, 0b00000000_11111111, b);
31927 let e = _mm512_set_ps(
31928 0., 0., 0., 0., 0., 0., 0., 0., 10., 100., 9., 100., 8., 100., 7., 100.,
31929 );
31930 assert_eq_m512(r, e);
31931 }
31932
31933 #[simd_test(enable = "avx512f")]
31934 unsafe fn test_mm512_shuffle_epi32() {
31935 let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16);
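// _MM_PERM_AADD encodes the selector 0b00_00_11_11: within each 128-bit lane, result
// positions 0 and 1 take element 3 and positions 2 and 3 take element 0.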
31936 let r = _mm512_shuffle_epi32(a, _MM_PERM_AADD);
31937 let e = _mm512_setr_epi32(8, 8, 1, 1, 16, 16, 9, 9, 8, 8, 1, 1, 16, 16, 9, 9);
31938 assert_eq_m512i(r, e);
31939 }
31940
31941 #[simd_test(enable = "avx512f")]
31942 unsafe fn test_mm512_mask_shuffle_epi32() {
31943 let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16);
31944 let r = _mm512_mask_shuffle_epi32(a, 0, a, _MM_PERM_AADD);
31945 assert_eq_m512i(r, a);
31946 let r = _mm512_mask_shuffle_epi32(a, 0b11111111_11111111, a, _MM_PERM_AADD);
31947 let e = _mm512_setr_epi32(8, 8, 1, 1, 16, 16, 9, 9, 8, 8, 1, 1, 16, 16, 9, 9);
31948 assert_eq_m512i(r, e);
31949 }
31950
31951 #[simd_test(enable = "avx512f")]
31952 unsafe fn test_mm512_maskz_shuffle_epi32() {
31953 let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16);
31954 let r = _mm512_maskz_shuffle_epi32(0, a, _MM_PERM_AADD);
31955 assert_eq_m512i(r, _mm512_setzero_si512());
31956 let r = _mm512_maskz_shuffle_epi32(0b00000000_11111111, a, _MM_PERM_AADD);
31957 let e = _mm512_setr_epi32(8, 8, 1, 1, 16, 16, 9, 9, 0, 0, 0, 0, 0, 0, 0, 0);
31958 assert_eq_m512i(r, e);
31959 }
31960
31961 #[simd_test(enable = "avx512f")]
31962 unsafe fn test_mm512_shuffle_ps() {
31963 let a = _mm512_setr_ps(
31964 1., 4., 5., 8., 9., 12., 13., 16., 1., 4., 5., 8., 9., 12., 13., 16.,
31965 );
31966 let b = _mm512_setr_ps(
31967 2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15.,
31968 );
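// With control 0x0F each 128-bit lane of the result is { a[3], a[3], b[0], b[0] }: the low
// two elements come from `a`, the high two from `b`.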
31969 let r = _mm512_shuffle_ps(a, b, 0x0F);
31970 let e = _mm512_setr_ps(
31971 8., 8., 2., 2., 16., 16., 10., 10., 8., 8., 2., 2., 16., 16., 10., 10.,
31972 );
31973 assert_eq_m512(r, e);
31974 }
31975
31976 #[simd_test(enable = "avx512f")]
31977 unsafe fn test_mm512_mask_shuffle_ps() {
31978 let a = _mm512_setr_ps(
31979 1., 4., 5., 8., 9., 12., 13., 16., 1., 4., 5., 8., 9., 12., 13., 16.,
31980 );
31981 let b = _mm512_setr_ps(
31982 2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15.,
31983 );
31984 let r = _mm512_mask_shuffle_ps(a, 0, a, b, 0x0F);
31985 assert_eq_m512(r, a);
31986 let r = _mm512_mask_shuffle_ps(a, 0b11111111_11111111, a, b, 0x0F);
31987 let e = _mm512_setr_ps(
31988 8., 8., 2., 2., 16., 16., 10., 10., 8., 8., 2., 2., 16., 16., 10., 10.,
31989 );
31990 assert_eq_m512(r, e);
31991 }
31992
31993 #[simd_test(enable = "avx512f")]
31994 unsafe fn test_mm512_maskz_shuffle_ps() {
31995 let a = _mm512_setr_ps(
31996 1., 4., 5., 8., 9., 12., 13., 16., 1., 4., 5., 8., 9., 12., 13., 16.,
31997 );
31998 let b = _mm512_setr_ps(
31999 2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15.,
32000 );
32001 let r = _mm512_maskz_shuffle_ps(0, a, b, 0x0F);
32002 assert_eq_m512(r, _mm512_setzero_ps());
32003 let r = _mm512_maskz_shuffle_ps(0b00000000_11111111, a, b, 0x0F);
32004 let e = _mm512_setr_ps(
32005 8., 8., 2., 2., 16., 16., 10., 10., 0., 0., 0., 0., 0., 0., 0., 0.,
32006 );
32007 assert_eq_m512(r, e);
32008 }
32009
32010 #[simd_test(enable = "avx512f")]
32011 unsafe fn test_mm512_shuffle_i32x4() {
32012 let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16);
32013 let b = _mm512_setr_epi32(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15);
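// The immediate selects whole 128-bit lanes: its low two fields pick lanes of `a` for the
// lower half of the result, the high two pick lanes of `b` for the upper half, so
// 0b00000000 repeats lane 0 of each source.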
32014 let r = _mm512_shuffle_i32x4(a, b, 0b00000000);
32015 let e = _mm512_setr_epi32(1, 4, 5, 8, 1, 4, 5, 8, 2, 3, 6, 7, 2, 3, 6, 7);
32016 assert_eq_m512i(r, e);
32017 }
32018
32019 #[simd_test(enable = "avx512f")]
32020 unsafe fn test_mm512_mask_shuffle_i32x4() {
32021 let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16);
32022 let b = _mm512_setr_epi32(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15);
32023 let r = _mm512_mask_shuffle_i32x4(a, 0, a, b, 0b00000000);
32024 assert_eq_m512i(r, a);
32025 let r = _mm512_mask_shuffle_i32x4(a, 0b11111111_11111111, a, b, 0b00000000);
32026 let e = _mm512_setr_epi32(1, 4, 5, 8, 1, 4, 5, 8, 2, 3, 6, 7, 2, 3, 6, 7);
32027 assert_eq_m512i(r, e);
32028 }
32029
32030 #[simd_test(enable = "avx512f")]
32031 unsafe fn test_mm512_maskz_shuffle_i32x4() {
32032 let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16);
32033 let b = _mm512_setr_epi32(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15);
32034 let r = _mm512_maskz_shuffle_i32x4(0, a, b, 0b00000000);
32035 assert_eq_m512i(r, _mm512_setzero_si512());
32036 let r = _mm512_maskz_shuffle_i32x4(0b00000000_11111111, a, b, 0b00000000);
32037 let e = _mm512_setr_epi32(1, 4, 5, 8, 1, 4, 5, 8, 0, 0, 0, 0, 0, 0, 0, 0);
32038 assert_eq_m512i(r, e);
32039 }
32040
32041 #[simd_test(enable = "avx512f")]
32042 unsafe fn test_mm512_shuffle_f32x4() {
32043 let a = _mm512_setr_ps(
32044 1., 4., 5., 8., 9., 12., 13., 16., 1., 4., 5., 8., 9., 12., 13., 16.,
32045 );
32046 let b = _mm512_setr_ps(
32047 2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15.,
32048 );
32049 let r = _mm512_shuffle_f32x4(a, b, 0b00000000);
32050 let e = _mm512_setr_ps(
32051 1., 4., 5., 8., 1., 4., 5., 8., 2., 3., 6., 7., 2., 3., 6., 7.,
32052 );
32053 assert_eq_m512(r, e);
32054 }
32055
32056 #[simd_test(enable = "avx512f")]
32057 unsafe fn test_mm512_mask_shuffle_f32x4() {
32058 let a = _mm512_setr_ps(
32059 1., 4., 5., 8., 9., 12., 13., 16., 1., 4., 5., 8., 9., 12., 13., 16.,
32060 );
32061 let b = _mm512_setr_ps(
32062 2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15.,
32063 );
32064 let r = _mm512_mask_shuffle_f32x4(a, 0, a, b, 0b00000000);
32065 assert_eq_m512(r, a);
32066 let r = _mm512_mask_shuffle_f32x4(a, 0b11111111_11111111, a, b, 0b00000000);
32067 let e = _mm512_setr_ps(
32068 1., 4., 5., 8., 1., 4., 5., 8., 2., 3., 6., 7., 2., 3., 6., 7.,
32069 );
32070 assert_eq_m512(r, e);
32071 }
32072
32073 #[simd_test(enable = "avx512f")]
32074 unsafe fn test_mm512_maskz_shuffle_f32x4() {
32075 let a = _mm512_setr_ps(
32076 1., 4., 5., 8., 9., 12., 13., 16., 1., 4., 5., 8., 9., 12., 13., 16.,
32077 );
32078 let b = _mm512_setr_ps(
32079 2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15.,
32080 );
32081 let r = _mm512_maskz_shuffle_f32x4(0, a, b, 0b00000000);
32082 assert_eq_m512(r, _mm512_setzero_ps());
32083 let r = _mm512_maskz_shuffle_f32x4(0b00000000_11111111, a, b, 0b00000000);
32084 let e = _mm512_setr_ps(
32085 1., 4., 5., 8., 1., 4., 5., 8., 0., 0., 0., 0., 0., 0., 0., 0.,
32086 );
32087 assert_eq_m512(r, e);
32088 }
32089
32090 #[simd_test(enable = "avx512f")]
32091 unsafe fn test_mm512_extractf32x4_ps() {
32092 let a = _mm512_setr_ps(
32093 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
32094 );
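// Index 0x1 extracts the second 128-bit lane, i.e. elements 4..7 in _mm512_setr_ps order.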
32095 let r = _mm512_extractf32x4_ps(a, 0x1);
32096 let e = _mm_setr_ps(5., 6., 7., 8.);
32097 assert_eq_m128(r, e);
32098 }
32099
32100 #[simd_test(enable = "avx512f")]
32101 unsafe fn test_mm512_mask_extractf32x4_ps() {
32102 let a = _mm512_setr_ps(
32103 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
32104 );
32105 let src = _mm_set1_ps(100.);
32106 let r = _mm512_mask_extractf32x4_ps(src, 0, a, 0x1);
32107 assert_eq_m128(r, src);
32108 let r = _mm512_mask_extractf32x4_ps(src, 0b11111111, a, 0x1);
32109 let e = _mm_setr_ps(5., 6., 7., 8.);
32110 assert_eq_m128(r, e);
32111 }
32112
32113 #[simd_test(enable = "avx512f")]
32114 unsafe fn test_mm512_maskz_extractf32x4_ps() {
32115 let a = _mm512_setr_ps(
32116 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
32117 );
32118 let r = _mm512_maskz_extractf32x4_ps(0, a, 0x1);
32119 assert_eq_m128(r, _mm_setzero_ps());
32120 let r = _mm512_maskz_extractf32x4_ps(0b00000001, a, 0x1);
32121 let e = _mm_setr_ps(5., 0., 0., 0.);
32122 assert_eq_m128(r, e);
32123 }
32124
32125 #[simd_test(enable = "avx512f")]
32126 unsafe fn test_mm512_extracti32x4_epi32() {
32127 let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
32128 let r = _mm512_extracti32x4_epi32(a, 0x1);
32129 let e = _mm_setr_epi32(5, 6, 7, 8);
32130 assert_eq_m128i(r, e);
32131 }
32132
32133 #[simd_test(enable = "avx512f")]
32134 unsafe fn test_mm512_mask_extracti32x4_epi32() {
32135 let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
32136 let src = _mm_set1_epi32(100);
32137 let r = _mm512_mask_extracti32x4_epi32(src, 0, a, 0x1);
32138 assert_eq_m128i(r, src);
32139 let r = _mm512_mask_extracti32x4_epi32(src, 0b11111111, a, 0x1);
32140 let e = _mm_setr_epi32(5, 6, 7, 8);
32141 assert_eq_m128i(r, e);
32142 }
32143
32144 #[simd_test(enable = "avx512f")]
32145 unsafe fn test_mm512_maskz_extracti32x4_epi32() {
32146 let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
32147 let r = _mm512_maskz_extracti32x4_epi32(0, a, 0x1);
32148 assert_eq_m128i(r, _mm_setzero_si128());
32149 let r = _mm512_maskz_extracti32x4_epi32(0b00000001, a, 0x1);
32150 let e = _mm_setr_epi32(5, 0, 0, 0);
32151 assert_eq_m128i(r, e);
32152 }
32153
32154 #[simd_test(enable = "avx512f")]
32155 unsafe fn test_mm512_moveldup_ps() {
32156 let a = _mm512_setr_ps(
32157 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
32158 );
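// moveldup copies each even-indexed element into the odd position above it (movehdup below
// does the opposite, duplicating the odd-indexed elements).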
32159 let r = _mm512_moveldup_ps(a);
32160 let e = _mm512_setr_ps(
32161 1., 1., 3., 3., 5., 5., 7., 7., 9., 9., 11., 11., 13., 13., 15., 15.,
32162 );
32163 assert_eq_m512(r, e);
32164 }
32165
32166 #[simd_test(enable = "avx512f")]
32167 unsafe fn test_mm512_mask_moveldup_ps() {
32168 let a = _mm512_setr_ps(
32169 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
32170 );
32171 let r = _mm512_mask_moveldup_ps(a, 0, a);
32172 assert_eq_m512(r, a);
32173 let r = _mm512_mask_moveldup_ps(a, 0b11111111_11111111, a);
32174 let e = _mm512_setr_ps(
32175 1., 1., 3., 3., 5., 5., 7., 7., 9., 9., 11., 11., 13., 13., 15., 15.,
32176 );
32177 assert_eq_m512(r, e);
32178 }
32179
32180 #[simd_test(enable = "avx512f")]
32181 unsafe fn test_mm512_maskz_moveldup_ps() {
32182 let a = _mm512_setr_ps(
32183 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
32184 );
32185 let r = _mm512_maskz_moveldup_ps(0, a);
32186 assert_eq_m512(r, _mm512_setzero_ps());
32187 let r = _mm512_maskz_moveldup_ps(0b00000000_11111111, a);
32188 let e = _mm512_setr_ps(
32189 1., 1., 3., 3., 5., 5., 7., 7., 0., 0., 0., 0., 0., 0., 0., 0.,
32190 );
32191 assert_eq_m512(r, e);
32192 }
32193
32194 #[simd_test(enable = "avx512f")]
32195 unsafe fn test_mm512_movehdup_ps() {
32196 let a = _mm512_setr_ps(
32197 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
32198 );
32199 let r = _mm512_movehdup_ps(a);
32200 let e = _mm512_setr_ps(
32201 2., 2., 4., 4., 6., 6., 8., 8., 10., 10., 12., 12., 14., 14., 16., 16.,
32202 );
32203 assert_eq_m512(r, e);
32204 }
32205
32206 #[simd_test(enable = "avx512f")]
32207 unsafe fn test_mm512_mask_movehdup_ps() {
32208 let a = _mm512_setr_ps(
32209 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
32210 );
32211 let r = _mm512_mask_movehdup_ps(a, 0, a);
32212 assert_eq_m512(r, a);
32213 let r = _mm512_mask_movehdup_ps(a, 0b11111111_11111111, a);
32214 let e = _mm512_setr_ps(
32215 2., 2., 4., 4., 6., 6., 8., 8., 10., 10., 12., 12., 14., 14., 16., 16.,
32216 );
32217 assert_eq_m512(r, e);
32218 }
32219
32220 #[simd_test(enable = "avx512f")]
32221 unsafe fn test_mm512_maskz_movehdup_ps() {
32222 let a = _mm512_setr_ps(
32223 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
32224 );
32225 let r = _mm512_maskz_movehdup_ps(0, a);
32226 assert_eq_m512(r, _mm512_setzero_ps());
32227 let r = _mm512_maskz_movehdup_ps(0b00000000_11111111, a);
32228 let e = _mm512_setr_ps(
32229 2., 2., 4., 4., 6., 6., 8., 8., 0., 0., 0., 0., 0., 0., 0., 0.,
32230 );
32231 assert_eq_m512(r, e);
32232 }
32233
32234 #[simd_test(enable = "avx512f")]
32235 unsafe fn test_mm512_inserti32x4() {
32236 let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
32237 let b = _mm_setr_epi32(17, 18, 19, 20);
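// Index 0 replaces the lowest 128-bit lane of `a` with `b`; the remaining lanes are untouched.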
32238 let r = _mm512_inserti32x4(a, b, 0);
32239 let e = _mm512_setr_epi32(17, 18, 19, 20, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
32240 assert_eq_m512i(r, e);
32241 }
32242
32243 #[simd_test(enable = "avx512f")]
32244 unsafe fn test_mm512_mask_inserti32x4() {
32245 let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
32246 let b = _mm_setr_epi32(17, 18, 19, 20);
32247 let r = _mm512_mask_inserti32x4(a, 0, a, b, 0);
32248 assert_eq_m512i(r, a);
32249 let r = _mm512_mask_inserti32x4(a, 0b11111111_11111111, a, b, 0);
32250 let e = _mm512_setr_epi32(17, 18, 19, 20, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
32251 assert_eq_m512i(r, e);
32252 }
32253
32254 #[simd_test(enable = "avx512f")]
32255 unsafe fn test_mm512_maskz_inserti32x4() {
32256 let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
32257 let b = _mm_setr_epi32(17, 18, 19, 20);
32258 let r = _mm512_maskz_inserti32x4(0, a, b, 0);
32259 assert_eq_m512i(r, _mm512_setzero_si512());
32260 let r = _mm512_maskz_inserti32x4(0b00000000_11111111, a, b, 0);
32261 let e = _mm512_setr_epi32(17, 18, 19, 20, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0);
32262 assert_eq_m512i(r, e);
32263 }
32264
32265 #[simd_test(enable = "avx512f")]
32266 unsafe fn test_mm512_insertf32x4() {
32267 let a = _mm512_setr_ps(
32268 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
32269 );
32270 let b = _mm_setr_ps(17., 18., 19., 20.);
32271 let r = _mm512_insertf32x4(a, b, 0);
32272 let e = _mm512_setr_ps(
32273 17., 18., 19., 20., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
32274 );
32275 assert_eq_m512(r, e);
32276 }
32277
32278 #[simd_test(enable = "avx512f")]
32279 unsafe fn test_mm512_mask_insertf32x4() {
32280 let a = _mm512_setr_ps(
32281 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
32282 );
32283 let b = _mm_setr_ps(17., 18., 19., 20.);
32284 let r = _mm512_mask_insertf32x4(a, 0, a, b, 0);
32285 assert_eq_m512(r, a);
32286 let r = _mm512_mask_insertf32x4(a, 0b11111111_11111111, a, b, 0);
32287 let e = _mm512_setr_ps(
32288 17., 18., 19., 20., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
32289 );
32290 assert_eq_m512(r, e);
32291 }
32292
32293 #[simd_test(enable = "avx512f")]
32294 unsafe fn test_mm512_maskz_insertf32x4() {
32295 let a = _mm512_setr_ps(
32296 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
32297 );
32298 let b = _mm_setr_ps(17., 18., 19., 20.);
32299 let r = _mm512_maskz_insertf32x4(0, a, b, 0);
32300 assert_eq_m512(r, _mm512_setzero_ps());
32301 let r = _mm512_maskz_insertf32x4(0b00000000_11111111, a, b, 0);
32302 let e = _mm512_setr_ps(
32303 17., 18., 19., 20., 5., 6., 7., 8., 0., 0., 0., 0., 0., 0., 0., 0.,
32304 );
32305 assert_eq_m512(r, e);
32306 }
32307
32308 #[simd_test(enable = "avx512f")]
32309 unsafe fn test_mm512_castps128_ps512() {
32310 let a = _mm_setr_ps(17., 18., 19., 20.);
32311 let r = _mm512_castps128_ps512(a);
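// Intel leaves the upper 384 bits of this cast undefined; the -1.0 values expected below
// simply pin down what this implementation happens to fill them with.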
32312 let e = _mm512_setr_ps(
32313 17., 18., 19., 20., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
32314 );
32315 assert_eq_m512(r, e);
32316 }
32317
32318 #[simd_test(enable = "avx512f")]
32319 unsafe fn test_mm512_castps256_ps512() {
32320 let a = _mm256_setr_ps(17., 18., 19., 20., 21., 22., 23., 24.);
32321 let r = _mm512_castps256_ps512(a);
32322 let e = _mm512_setr_ps(
32323 17., 18., 19., 20., 21., 22., 23., 24., -1., -1., -1., -1., -1., -1., -1., -1.,
32324 );
32325 assert_eq_m512(r, e);
32326 }
32327
32328 #[simd_test(enable = "avx512f")]
32329 unsafe fn test_mm512_zextps128_ps512() {
32330 let a = _mm_setr_ps(17., 18., 19., 20.);
32331 let r = _mm512_zextps128_ps512(a);
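// Unlike the plain cast above, zext guarantees zeroed upper lanes, so the whole result is
// well defined.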
32332 let e = _mm512_setr_ps(
32333 17., 18., 19., 20., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
32334 );
32335 assert_eq_m512(r, e);
32336 }
32337
32338 #[simd_test(enable = "avx512f")]
32339 unsafe fn test_mm512_zextps256_ps512() {
32340 let a = _mm256_setr_ps(17., 18., 19., 20., 21., 22., 23., 24.);
32341 let r = _mm512_zextps256_ps512(a);
32342 let e = _mm512_setr_ps(
32343 17., 18., 19., 20., 21., 22., 23., 24., 0., 0., 0., 0., 0., 0., 0., 0.,
32344 );
32345 assert_eq_m512(r, e);
32346 }
32347
32348 #[simd_test(enable = "avx512f")]
32349 unsafe fn test_mm512_castps512_ps128() {
32350 let a = _mm512_setr_ps(
32351 17., 18., 19., 20., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
32352 );
32353 let r = _mm512_castps512_ps128(a);
32354 let e = _mm_setr_ps(17., 18., 19., 20.);
32355 assert_eq_m128(r, e);
32356 }
32357
32358 #[simd_test(enable = "avx512f")]
32359 unsafe fn test_mm512_castps512_ps256() {
32360 let a = _mm512_setr_ps(
32361 17., 18., 19., 20., 21., 22., 23., 24., -1., -1., -1., -1., -1., -1., -1., -1.,
32362 );
32363 let r = _mm512_castps512_ps256(a);
32364 let e = _mm256_setr_ps(17., 18., 19., 20., 21., 22., 23., 24.);
32365 assert_eq_m256(r, e);
32366 }
32367
32368 #[simd_test(enable = "avx512f")]
32369 unsafe fn test_mm512_castps_pd() {
32370 let a = _mm512_set1_ps(1.);
32371 let r = _mm512_castps_pd(a);
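// Pure bit reinterpretation: each pair of 1.0f32 values (0x3F80_0000) forms the f64 bit
// pattern 0x3F80_0000_3F80_0000, which is about 0.0078125018.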
32372 let e = _mm512_set1_pd(0.007812501848093234);
32373 assert_eq_m512d(r, e);
32374 }
32375
32376 #[simd_test(enable = "avx512f")]
32377 unsafe fn test_mm512_castps_si512() {
32378 let a = _mm512_set1_ps(1.);
32379 let r = _mm512_castps_si512(a);
32380 let e = _mm512_set1_epi32(1065353216);
32381 assert_eq_m512i(r, e);
32382 }
32383
32384 #[simd_test(enable = "avx512f")]
32385 unsafe fn test_mm512_broadcastd_epi32() {
32386 let a = _mm_set_epi32(17, 18, 19, 20);
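// Lane 0 of `a` (the last argument, 20) is broadcast to all sixteen lanes.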
32387 let r = _mm512_broadcastd_epi32(a);
32388 let e = _mm512_set1_epi32(20);
32389 assert_eq_m512i(r, e);
32390 }
32391
32392 #[simd_test(enable = "avx512f")]
32393 unsafe fn test_mm512_mask_broadcastd_epi32() {
32394 let src = _mm512_set1_epi32(20);
32395 let a = _mm_set_epi32(17, 18, 19, 20);
32396 let r = _mm512_mask_broadcastd_epi32(src, 0, a);
32397 assert_eq_m512i(r, src);
32398 let r = _mm512_mask_broadcastd_epi32(src, 0b11111111_11111111, a);
32399 let e = _mm512_set1_epi32(20);
32400 assert_eq_m512i(r, e);
32401 }
32402
32403 #[simd_test(enable = "avx512f")]
32404 unsafe fn test_mm512_maskz_broadcastd_epi32() {
32405 let a = _mm_set_epi32(17, 18, 19, 20);
32406 let r = _mm512_maskz_broadcastd_epi32(0, a);
32407 assert_eq_m512i(r, _mm512_setzero_si512());
32408 let r = _mm512_maskz_broadcastd_epi32(0b00000000_11111111, a);
32409 let e = _mm512_setr_epi32(20, 20, 20, 20, 20, 20, 20, 20, 0, 0, 0, 0, 0, 0, 0, 0);
32410 assert_eq_m512i(r, e);
32411 }
32412
32413 #[simd_test(enable = "avx512f")]
32414 unsafe fn test_mm512_broadcastss_ps() {
32415 let a = _mm_set_ps(17., 18., 19., 20.);
32416 let r = _mm512_broadcastss_ps(a);
32417 let e = _mm512_set1_ps(20.);
32418 assert_eq_m512(r, e);
32419 }
32420
32421 #[simd_test(enable = "avx512f")]
32422 unsafe fn test_mm512_mask_broadcastss_ps() {
32423 let src = _mm512_set1_ps(20.);
32424 let a = _mm_set_ps(17., 18., 19., 20.);
32425 let r = _mm512_mask_broadcastss_ps(src, 0, a);
32426 assert_eq_m512(r, src);
32427 let r = _mm512_mask_broadcastss_ps(src, 0b11111111_11111111, a);
32428 let e = _mm512_set1_ps(20.);
32429 assert_eq_m512(r, e);
32430 }
32431
32432 #[simd_test(enable = "avx512f")]
32433 unsafe fn test_mm512_maskz_broadcastss_ps() {
32434 let a = _mm_set_ps(17., 18., 19., 20.);
32435 let r = _mm512_maskz_broadcastss_ps(0, a);
32436 assert_eq_m512(r, _mm512_setzero_ps());
32437 let r = _mm512_maskz_broadcastss_ps(0b00000000_11111111, a);
32438 let e = _mm512_setr_ps(
32439 20., 20., 20., 20., 20., 20., 20., 20., 0., 0., 0., 0., 0., 0., 0., 0.,
32440 );
32441 assert_eq_m512(r, e);
32442 }
32443
32444 #[simd_test(enable = "avx512f")]
32445 unsafe fn test_mm512_broadcast_i32x4() {
32446 let a = _mm_set_epi32(17, 18, 19, 20);
32447 let r = _mm512_broadcast_i32x4(a);
32448 let e = _mm512_set_epi32(
32449 17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20,
32450 );
32451 assert_eq_m512i(r, e);
32452 }
32453
32454 #[simd_test(enable = "avx512f")]
32455 unsafe fn test_mm512_mask_broadcast_i32x4() {
32456 let src = _mm512_set1_epi32(20);
32457 let a = _mm_set_epi32(17, 18, 19, 20);
32458 let r = _mm512_mask_broadcast_i32x4(src, 0, a);
32459 assert_eq_m512i(r, src);
32460 let r = _mm512_mask_broadcast_i32x4(src, 0b11111111_11111111, a);
32461 let e = _mm512_set_epi32(
32462 17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20,
32463 );
32464 assert_eq_m512i(r, e);
32465 }
32466
32467 #[simd_test(enable = "avx512f")]
32468 unsafe fn test_mm512_maskz_broadcast_i32x4() {
32469 let a = _mm_set_epi32(17, 18, 19, 20);
32470 let r = _mm512_maskz_broadcast_i32x4(0, a);
32471 assert_eq_m512i(r, _mm512_setzero_si512());
32472 let r = _mm512_maskz_broadcast_i32x4(0b00000000_11111111, a);
32473 let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 17, 18, 19, 20, 17, 18, 19, 20);
32474 assert_eq_m512i(r, e);
32475 }
32476
32477 #[simd_test(enable = "avx512f")]
32478 unsafe fn test_mm512_broadcast_f32x4() {
32479 let a = _mm_set_ps(17., 18., 19., 20.);
32480 let r = _mm512_broadcast_f32x4(a);
32481 let e = _mm512_set_ps(
32482 17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20.,
32483 );
32484 assert_eq_m512(r, e);
32485 }
32486
32487 #[simd_test(enable = "avx512f")]
32488 unsafe fn test_mm512_mask_broadcast_f32x4() {
32489 let src = _mm512_set1_ps(20.);
32490 let a = _mm_set_ps(17., 18., 19., 20.);
32491 let r = _mm512_mask_broadcast_f32x4(src, 0, a);
32492 assert_eq_m512(r, src);
32493 let r = _mm512_mask_broadcast_f32x4(src, 0b11111111_11111111, a);
32494 let e = _mm512_set_ps(
32495 17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20.,
32496 );
32497 assert_eq_m512(r, e);
32498 }
32499
32500 #[simd_test(enable = "avx512f")]
32501 unsafe fn test_mm512_maskz_broadcast_f32x4() {
32502 let a = _mm_set_ps(17., 18., 19., 20.);
32503 let r = _mm512_maskz_broadcast_f32x4(0, a);
32504 assert_eq_m512(r, _mm512_setzero_ps());
32505 let r = _mm512_maskz_broadcast_f32x4(0b00000000_11111111, a);
32506 let e = _mm512_set_ps(
32507 0., 0., 0., 0., 0., 0., 0., 0., 17., 18., 19., 20., 17., 18., 19., 20.,
32508 );
32509 assert_eq_m512(r, e);
32510 }
32511
32512 #[simd_test(enable = "avx512f")]
32513 unsafe fn test_mm512_mask_blend_epi32() {
32514 let a = _mm512_set1_epi32(1);
32515 let b = _mm512_set1_epi32(2);
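// blend takes lanes from `b` where the mask bit is set, so 0b11111111_00000000 fills the
// upper eight lanes with 2 and keeps 1 in the lower eight.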
32516 let r = _mm512_mask_blend_epi32(0b11111111_00000000, a, b);
32517 let e = _mm512_set_epi32(2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1);
32518 assert_eq_m512i(r, e);
32519 }
32520
32521 #[simd_test(enable = "avx512f")]
32522 unsafe fn test_mm512_mask_blend_ps() {
32523 let a = _mm512_set1_ps(1.);
32524 let b = _mm512_set1_ps(2.);
32525 let r = _mm512_mask_blend_ps(0b11111111_00000000, a, b);
32526 let e = _mm512_set_ps(
32527 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1.,
32528 );
32529 assert_eq_m512(r, e);
32530 }
32531
32532 #[simd_test(enable = "avx512f")]
32533 unsafe fn test_mm512_unpackhi_epi32() {
32534 let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
32535 let b = _mm512_set_epi32(
32536 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
32537 );
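// unpackhi interleaves the upper two 32-bit elements of each 128-bit lane, alternating a
// then b (unpacklo further down does the same with the lower two elements).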
32538 let r = _mm512_unpackhi_epi32(a, b);
32539 let e = _mm512_set_epi32(17, 1, 18, 2, 21, 5, 22, 6, 25, 9, 26, 10, 29, 13, 30, 14);
32540 assert_eq_m512i(r, e);
32541 }
32542
32543 #[simd_test(enable = "avx512f")]
32544 unsafe fn test_mm512_mask_unpackhi_epi32() {
32545 let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
32546 let b = _mm512_set_epi32(
32547 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
32548 );
32549 let r = _mm512_mask_unpackhi_epi32(a, 0, a, b);
32550 assert_eq_m512i(r, a);
32551 let r = _mm512_mask_unpackhi_epi32(a, 0b11111111_11111111, a, b);
32552 let e = _mm512_set_epi32(17, 1, 18, 2, 21, 5, 22, 6, 25, 9, 26, 10, 29, 13, 30, 14);
32553 assert_eq_m512i(r, e);
32554 }
32555
32556 #[simd_test(enable = "avx512f")]
32557 unsafe fn test_mm512_maskz_unpackhi_epi32() {
32558 let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
32559 let b = _mm512_set_epi32(
32560 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
32561 );
32562 let r = _mm512_maskz_unpackhi_epi32(0, a, b);
32563 assert_eq_m512i(r, _mm512_setzero_si512());
32564 let r = _mm512_maskz_unpackhi_epi32(0b00000000_11111111, a, b);
32565 let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 25, 9, 26, 10, 29, 13, 30, 14);
32566 assert_eq_m512i(r, e);
32567 }
32568
32569 #[simd_test(enable = "avx512f")]
32570 unsafe fn test_mm512_unpackhi_ps() {
32571 let a = _mm512_set_ps(
32572 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
32573 );
32574 let b = _mm512_set_ps(
32575 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32.,
32576 );
32577 let r = _mm512_unpackhi_ps(a, b);
32578 let e = _mm512_set_ps(
32579 17., 1., 18., 2., 21., 5., 22., 6., 25., 9., 26., 10., 29., 13., 30., 14.,
32580 );
32581 assert_eq_m512(r, e);
32582 }
32583
32584 #[simd_test(enable = "avx512f")]
32585 unsafe fn test_mm512_mask_unpackhi_ps() {
32586 let a = _mm512_set_ps(
32587 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
32588 );
32589 let b = _mm512_set_ps(
32590 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32.,
32591 );
32592 let r = _mm512_mask_unpackhi_ps(a, 0, a, b);
32593 assert_eq_m512(r, a);
32594 let r = _mm512_mask_unpackhi_ps(a, 0b11111111_11111111, a, b);
32595 let e = _mm512_set_ps(
32596 17., 1., 18., 2., 21., 5., 22., 6., 25., 9., 26., 10., 29., 13., 30., 14.,
32597 );
32598 assert_eq_m512(r, e);
32599 }
32600
32601 #[simd_test(enable = "avx512f")]
32602 unsafe fn test_mm512_maskz_unpackhi_ps() {
32603 let a = _mm512_set_ps(
32604 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
32605 );
32606 let b = _mm512_set_ps(
32607 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32.,
32608 );
32609 let r = _mm512_maskz_unpackhi_ps(0, a, b);
32610 assert_eq_m512(r, _mm512_setzero_ps());
32611 let r = _mm512_maskz_unpackhi_ps(0b00000000_11111111, a, b);
32612 let e = _mm512_set_ps(
32613 0., 0., 0., 0., 0., 0., 0., 0., 25., 9., 26., 10., 29., 13., 30., 14.,
32614 );
32615 assert_eq_m512(r, e);
32616 }
32617
32618 #[simd_test(enable = "avx512f")]
32619 unsafe fn test_mm512_unpacklo_epi32() {
32620 let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
32621 let b = _mm512_set_epi32(
32622 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
32623 );
32624 let r = _mm512_unpacklo_epi32(a, b);
32625 let e = _mm512_set_epi32(19, 3, 20, 4, 23, 7, 24, 8, 27, 11, 28, 12, 31, 15, 32, 16);
32626 assert_eq_m512i(r, e);
32627 }
32628
32629 #[simd_test(enable = "avx512f")]
32630 unsafe fn test_mm512_mask_unpacklo_epi32() {
32631 let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
32632 let b = _mm512_set_epi32(
32633 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
32634 );
32635 let r = _mm512_mask_unpacklo_epi32(a, 0, a, b);
32636 assert_eq_m512i(r, a);
32637 let r = _mm512_mask_unpacklo_epi32(a, 0b11111111_11111111, a, b);
32638 let e = _mm512_set_epi32(19, 3, 20, 4, 23, 7, 24, 8, 27, 11, 28, 12, 31, 15, 32, 16);
32639 assert_eq_m512i(r, e);
32640 }
32641
32642 #[simd_test(enable = "avx512f")]
32643 unsafe fn test_mm512_maskz_unpacklo_epi32() {
32644 let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
32645 let b = _mm512_set_epi32(
32646 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
32647 );
32648 let r = _mm512_maskz_unpacklo_epi32(0, a, b);
32649 assert_eq_m512i(r, _mm512_setzero_si512());
32650 let r = _mm512_maskz_unpacklo_epi32(0b00000000_11111111, a, b);
32651 let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 27, 11, 28, 12, 31, 15, 32, 16);
32652 assert_eq_m512i(r, e);
32653 }
32654
32655 #[simd_test(enable = "avx512f")]
32656 unsafe fn test_mm512_unpacklo_ps() {
32657 let a = _mm512_set_ps(
32658 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
32659 );
32660 let b = _mm512_set_ps(
32661 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32.,
32662 );
32663 let r = _mm512_unpacklo_ps(a, b);
32664 let e = _mm512_set_ps(
32665 19., 3., 20., 4., 23., 7., 24., 8., 27., 11., 28., 12., 31., 15., 32., 16.,
32666 );
32667 assert_eq_m512(r, e);
32668 }
32669
32670 #[simd_test(enable = "avx512f")]
32671 unsafe fn test_mm512_mask_unpacklo_ps() {
32672 let a = _mm512_set_ps(
32673 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
32674 );
32675 let b = _mm512_set_ps(
32676 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32.,
32677 );
32678 let r = _mm512_mask_unpacklo_ps(a, 0, a, b);
32679 assert_eq_m512(r, a);
32680 let r = _mm512_mask_unpacklo_ps(a, 0b11111111_11111111, a, b);
32681 let e = _mm512_set_ps(
32682 19., 3., 20., 4., 23., 7., 24., 8., 27., 11., 28., 12., 31., 15., 32., 16.,
32683 );
32684 assert_eq_m512(r, e);
32685 }
32686
32687 #[simd_test(enable = "avx512f")]
32688 unsafe fn test_mm512_maskz_unpacklo_ps() {
32689 let a = _mm512_set_ps(
32690 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
32691 );
32692 let b = _mm512_set_ps(
32693 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32.,
32694 );
32695 let r = _mm512_maskz_unpacklo_ps(0, a, b);
32696 assert_eq_m512(r, _mm512_setzero_ps());
32697 let r = _mm512_maskz_unpacklo_ps(0b00000000_11111111, a, b);
32698 let e = _mm512_set_ps(
32699 0., 0., 0., 0., 0., 0., 0., 0., 27., 11., 28., 12., 31., 15., 32., 16.,
32700 );
32701 assert_eq_m512(r, e);
32702 }
32703
32704 #[simd_test(enable = "avx512f")]
32705 unsafe fn test_mm512_alignr_epi32() {
32706 let a = _mm512_set_epi32(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
32707 let b = _mm512_set_epi32(
32708 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
32709 );
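// alignr concatenates `a` above `b` and shifts right by whole 32-bit lanes; only the low
// four bits of the count are used, which is why both 0 and 16 return `b` unchanged here.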
32710 let r = _mm512_alignr_epi32(a, b, 0);
32711 assert_eq_m512i(r, b);
32712 let r = _mm512_alignr_epi32(a, b, 16);
32713 assert_eq_m512i(r, b);
32714 let r = _mm512_alignr_epi32(a, b, 1);
32715 let e = _mm512_set_epi32(
32716 1, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18,
32717 );
32718 assert_eq_m512i(r, e);
32719 }
32720
32721 #[simd_test(enable = "avx512f")]
32722 unsafe fn test_mm512_mask_alignr_epi32() {
32723 let a = _mm512_set_epi32(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
32724 let b = _mm512_set_epi32(
32725 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
32726 );
32727 let r = _mm512_mask_alignr_epi32(a, 0, a, b, 1);
32728 assert_eq_m512i(r, a);
32729 let r = _mm512_mask_alignr_epi32(a, 0b11111111_11111111, a, b, 1);
32730 let e = _mm512_set_epi32(
32731 1, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18,
32732 );
32733 assert_eq_m512i(r, e);
32734 }
32735
32736 #[simd_test(enable = "avx512f")]
32737 unsafe fn test_mm512_maskz_alignr_epi32() {
32738 let a = _mm512_set_epi32(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
32739 let b = _mm512_set_epi32(
32740 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
32741 );
32742 let r = _mm512_maskz_alignr_epi32(0, a, b, 1);
32743 assert_eq_m512i(r, _mm512_setzero_si512());
32744 let r = _mm512_maskz_alignr_epi32(0b00000000_11111111, a, b, 1);
32745 let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 25, 24, 23, 22, 21, 20, 19, 18);
32746 assert_eq_m512i(r, e);
32747 }
32748
32749 #[simd_test(enable = "avx512f")]
32750 unsafe fn test_mm512_and_epi32() {
32751 let a = _mm512_set_epi32(
32752 1 << 1 | 1 << 2,
32753 0,
32754 0,
32755 0,
32756 0,
32757 0,
32758 0,
32759 0,
32760 0,
32761 0,
32762 0,
32763 0,
32764 0,
32765 0,
32766 0,
32767 1 << 1 | 1 << 3,
32768 );
32769 let b = _mm512_set_epi32(
32770 1 << 1,
32771 0,
32772 0,
32773 0,
32774 0,
32775 0,
32776 0,
32777 0,
32778 0,
32779 0,
32780 0,
32781 0,
32782 0,
32783 0,
32784 0,
32785 1 << 3 | 1 << 4,
32786 );
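// Only bit 1 is shared by both top lanes and only bit 3 by both bottom lanes, so those are
// the only bits that survive the AND.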
32787 let r = _mm512_and_epi32(a, b);
32788 let e = _mm512_set_epi32(1 << 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 << 3);
32789 assert_eq_m512i(r, e);
32790 }
32791
32792 #[simd_test(enable = "avx512f")]
32793 unsafe fn test_mm512_mask_and_epi32() {
32794 let a = _mm512_set_epi32(
32795 1 << 1 | 1 << 2,
32796 0,
32797 0,
32798 0,
32799 0,
32800 0,
32801 0,
32802 0,
32803 0,
32804 0,
32805 0,
32806 0,
32807 0,
32808 0,
32809 0,
32810 1 << 1 | 1 << 3,
32811 );
32812 let b = _mm512_set_epi32(
32813 1 << 1,
32814 0,
32815 0,
32816 0,
32817 0,
32818 0,
32819 0,
32820 0,
32821 0,
32822 0,
32823 0,
32824 0,
32825 0,
32826 0,
32827 0,
32828 1 << 3 | 1 << 4,
32829 );
32830 let r = _mm512_mask_and_epi32(a, 0, a, b);
32831 assert_eq_m512i(r, a);
32832
32833 let r = _mm512_mask_and_epi32(a, 0b01111111_11111111, a, b);
32834 let e = _mm512_set_epi32(
32835 1 << 1 | 1 << 2,
32836 0,
32837 0,
32838 0,
32839 0,
32840 0,
32841 0,
32842 0,
32843 0,
32844 0,
32845 0,
32846 0,
32847 0,
32848 0,
32849 0,
32850 1 << 3,
32851 );
32852 assert_eq_m512i(r, e);
32853 }
32854
32855 #[simd_test(enable = "avx512f")]
32856 unsafe fn test_mm512_maskz_and_epi32() {
32857 let a = _mm512_set_epi32(
32858 1 << 1 | 1 << 2,
32859 0,
32860 0,
32861 0,
32862 0,
32863 0,
32864 0,
32865 0,
32866 0,
32867 0,
32868 0,
32869 0,
32870 0,
32871 0,
32872 0,
32873 1 << 1 | 1 << 3,
32874 );
32875 let b = _mm512_set_epi32(
32876 1 << 1,
32877 0,
32878 0,
32879 0,
32880 0,
32881 0,
32882 0,
32883 0,
32884 0,
32885 0,
32886 0,
32887 0,
32888 0,
32889 0,
32890 0,
32891 1 << 3 | 1 << 4,
32892 );
32893 let r = _mm512_maskz_and_epi32(0, a, b);
32894 assert_eq_m512i(r, _mm512_setzero_si512());
32895
32896 let r = _mm512_maskz_and_epi32(0b00000000_11111111, a, b);
32897 let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 << 3);
32898 assert_eq_m512i(r, e);
32899 }
32900
32901 #[simd_test(enable = "avx512f")]
32902 unsafe fn test_mm512_and_si512() {
32903 let a = _mm512_set_epi32(
32904 1 << 1 | 1 << 2,
32905 0,
32906 0,
32907 0,
32908 0,
32909 0,
32910 0,
32911 0,
32912 0,
32913 0,
32914 0,
32915 0,
32916 0,
32917 0,
32918 0,
32919 1 << 1 | 1 << 3,
32920 );
32921 let b = _mm512_set_epi32(
32922 1 << 1,
32923 0,
32924 0,
32925 0,
32926 0,
32927 0,
32928 0,
32929 0,
32930 0,
32931 0,
32932 0,
32933 0,
32934 0,
32935 0,
32936 0,
32937 1 << 3 | 1 << 4,
32938 );
32939 let r = _mm512_and_epi32(a, b);
32940 let e = _mm512_set_epi32(1 << 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 << 3);
32941 assert_eq_m512i(r, e);
32942 }
32943
32944 #[simd_test(enable = "avx512f")]
32945 unsafe fn test_mm512_or_epi32() {
32946 let a = _mm512_set_epi32(
32947 1 << 1 | 1 << 2,
32948 0,
32949 0,
32950 0,
32951 0,
32952 0,
32953 0,
32954 0,
32955 0,
32956 0,
32957 0,
32958 0,
32959 0,
32960 0,
32961 0,
32962 1 << 1 | 1 << 3,
32963 );
32964 let b = _mm512_set_epi32(
32965 1 << 1,
32966 0,
32967 0,
32968 0,
32969 0,
32970 0,
32971 0,
32972 0,
32973 0,
32974 0,
32975 0,
32976 0,
32977 0,
32978 0,
32979 0,
32980 1 << 3 | 1 << 4,
32981 );
32982 let r = _mm512_or_epi32(a, b);
32983 let e = _mm512_set_epi32(
32984 1 << 1 | 1 << 2,
32985 0,
32986 0,
32987 0,
32988 0,
32989 0,
32990 0,
32991 0,
32992 0,
32993 0,
32994 0,
32995 0,
32996 0,
32997 0,
32998 0,
32999 1 << 1 | 1 << 3 | 1 << 4,
33000 );
33001 assert_eq_m512i(r, e);
33002 }
33003
33004 #[simd_test(enable = "avx512f")]
33005 unsafe fn test_mm512_mask_or_epi32() {
33006 let a = _mm512_set_epi32(
33007 1 << 1 | 1 << 2,
33008 0,
33009 0,
33010 0,
33011 0,
33012 0,
33013 0,
33014 0,
33015 0,
33016 0,
33017 0,
33018 0,
33019 0,
33020 0,
33021 0,
33022 1 << 1 | 1 << 3,
33023 );
33024 let b = _mm512_set_epi32(
33025 1 << 1,
33026 0,
33027 0,
33028 0,
33029 0,
33030 0,
33031 0,
33032 0,
33033 0,
33034 0,
33035 0,
33036 0,
33037 0,
33038 0,
33039 0,
33040 1 << 3 | 1 << 4,
33041 );
33042 let r = _mm512_mask_or_epi32(a, 0, a, b);
33043 assert_eq_m512i(r, a);
33044
33045 let r = _mm512_mask_or_epi32(a, 0b11111111_11111111, a, b);
33046 let e = _mm512_set_epi32(
33047 1 << 1 | 1 << 2,
33048 0,
33049 0,
33050 0,
33051 0,
33052 0,
33053 0,
33054 0,
33055 0,
33056 0,
33057 0,
33058 0,
33059 0,
33060 0,
33061 0,
33062 1 << 1 | 1 << 3 | 1 << 4,
33063 );
33064 assert_eq_m512i(r, e);
33065 }
33066
33067 #[simd_test(enable = "avx512f")]
33068 unsafe fn test_mm512_maskz_or_epi32() {
33069 let a = _mm512_set_epi32(
33070 1 << 1 | 1 << 2,
33071 0,
33072 0,
33073 0,
33074 0,
33075 0,
33076 0,
33077 0,
33078 0,
33079 0,
33080 0,
33081 0,
33082 0,
33083 0,
33084 0,
33085 1 << 1 | 1 << 3,
33086 );
33087 let b = _mm512_set_epi32(
33088 1 << 1,
33089 0,
33090 0,
33091 0,
33092 0,
33093 0,
33094 0,
33095 0,
33096 0,
33097 0,
33098 0,
33099 0,
33100 0,
33101 0,
33102 0,
33103 1 << 3 | 1 << 4,
33104 );
33105 let r = _mm512_maskz_or_epi32(0, a, b);
33106 assert_eq_m512i(r, _mm512_setzero_si512());
33107
33108 let r = _mm512_maskz_or_epi32(0b00000000_11111111, a, b);
33109 let e = _mm512_set_epi32(
33110 0,
33111 0,
33112 0,
33113 0,
33114 0,
33115 0,
33116 0,
33117 0,
33118 0,
33119 0,
33120 0,
33121 0,
33122 0,
33123 0,
33124 0,
33125 1 << 1 | 1 << 3 | 1 << 4,
33126 );
33127 assert_eq_m512i(r, e);
33128 }
33129
33130 #[simd_test(enable = "avx512f")]
33131 unsafe fn test_mm512_or_si512() {
33132 let a = _mm512_set_epi32(
33133 1 << 1 | 1 << 2,
33134 0,
33135 0,
33136 0,
33137 0,
33138 0,
33139 0,
33140 0,
33141 0,
33142 0,
33143 0,
33144 0,
33145 0,
33146 0,
33147 0,
33148 1 << 1 | 1 << 3,
33149 );
33150 let b = _mm512_set_epi32(
33151 1 << 1,
33152 0,
33153 0,
33154 0,
33155 0,
33156 0,
33157 0,
33158 0,
33159 0,
33160 0,
33161 0,
33162 0,
33163 0,
33164 0,
33165 0,
33166 1 << 3 | 1 << 4,
33167 );
33168 let r = _mm512_or_si512(a, b);
33169 let e = _mm512_set_epi32(
33170 1 << 1 | 1 << 2,
33171 0,
33172 0,
33173 0,
33174 0,
33175 0,
33176 0,
33177 0,
33178 0,
33179 0,
33180 0,
33181 0,
33182 0,
33183 0,
33184 0,
33185 1 << 1 | 1 << 3 | 1 << 4,
33186 );
33187 assert_eq_m512i(r, e);
33188 }
33189
33190 #[simd_test(enable = "avx512f")]
33191 unsafe fn test_mm512_xor_epi32() {
33192 let a = _mm512_set_epi32(
33193 1 << 1 | 1 << 2,
33194 0,
33195 0,
33196 0,
33197 0,
33198 0,
33199 0,
33200 0,
33201 0,
33202 0,
33203 0,
33204 0,
33205 0,
33206 0,
33207 0,
33208 1 << 1 | 1 << 3,
33209 );
33210 let b = _mm512_set_epi32(
33211 1 << 1,
33212 0,
33213 0,
33214 0,
33215 0,
33216 0,
33217 0,
33218 0,
33219 0,
33220 0,
33221 0,
33222 0,
33223 0,
33224 0,
33225 0,
33226 1 << 3 | 1 << 4,
33227 );
33228 let r = _mm512_xor_epi32(a, b);
33229 let e = _mm512_set_epi32(
33230 1 << 2,
33231 0,
33232 0,
33233 0,
33234 0,
33235 0,
33236 0,
33237 0,
33238 0,
33239 0,
33240 0,
33241 0,
33242 0,
33243 0,
33244 0,
33245 1 << 1 | 1 << 4,
33246 );
33247 assert_eq_m512i(r, e);
33248 }
33249
33250 #[simd_test(enable = "avx512f")]
33251 unsafe fn test_mm512_mask_xor_epi32() {
33252 let a = _mm512_set_epi32(
33253 1 << 1 | 1 << 2,
33254 0,
33255 0,
33256 0,
33257 0,
33258 0,
33259 0,
33260 0,
33261 0,
33262 0,
33263 0,
33264 0,
33265 0,
33266 0,
33267 0,
33268 1 << 1 | 1 << 3,
33269 );
33270 let b = _mm512_set_epi32(
33271 1 << 1,
33272 0,
33273 0,
33274 0,
33275 0,
33276 0,
33277 0,
33278 0,
33279 0,
33280 0,
33281 0,
33282 0,
33283 0,
33284 0,
33285 0,
33286 1 << 3 | 1 << 4,
33287 );
33288 let r = _mm512_mask_xor_epi32(a, 0, a, b);
33289 assert_eq_m512i(r, a);
33290
33291 let r = _mm512_mask_xor_epi32(a, 0b01111111_11111111, a, b);
33292 let e = _mm512_set_epi32(
33293 1 << 1 | 1 << 2,
33294 0,
33295 0,
33296 0,
33297 0,
33298 0,
33299 0,
33300 0,
33301 0,
33302 0,
33303 0,
33304 0,
33305 0,
33306 0,
33307 0,
33308 1 << 1 | 1 << 4,
33309 );
33310 assert_eq_m512i(r, e);
33311 }
33312
33313 #[simd_test(enable = "avx512f")]
33314 unsafe fn test_mm512_maskz_xor_epi32() {
33315 let a = _mm512_set_epi32(
33316 1 << 1 | 1 << 2,
33317 0,
33318 0,
33319 0,
33320 0,
33321 0,
33322 0,
33323 0,
33324 0,
33325 0,
33326 0,
33327 0,
33328 0,
33329 0,
33330 0,
33331 1 << 1 | 1 << 3,
33332 );
33333 let b = _mm512_set_epi32(
33334 1 << 1,
33335 0,
33336 0,
33337 0,
33338 0,
33339 0,
33340 0,
33341 0,
33342 0,
33343 0,
33344 0,
33345 0,
33346 0,
33347 0,
33348 0,
33349 1 << 3 | 1 << 4,
33350 );
33351 let r = _mm512_maskz_xor_epi32(0, a, b);
33352 assert_eq_m512i(r, _mm512_setzero_si512());
33353
33354 let r = _mm512_maskz_xor_epi32(0b00000000_11111111, a, b);
33355 let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 << 1 | 1 << 4);
33356 assert_eq_m512i(r, e);
33357 }
33358
33359 #[simd_test(enable = "avx512f")]
33360 unsafe fn test_mm512_xor_si512() {
33361 let a = _mm512_set_epi32(
33362 1 << 1 | 1 << 2,
33363 0,
33364 0,
33365 0,
33366 0,
33367 0,
33368 0,
33369 0,
33370 0,
33371 0,
33372 0,
33373 0,
33374 0,
33375 0,
33376 0,
33377 1 << 1 | 1 << 3,
33378 );
33379 let b = _mm512_set_epi32(
33380 1 << 1,
33381 0,
33382 0,
33383 0,
33384 0,
33385 0,
33386 0,
33387 0,
33388 0,
33389 0,
33390 0,
33391 0,
33392 0,
33393 0,
33394 0,
33395 1 << 3 | 1 << 4,
33396 );
33397 let r = _mm512_xor_si512(a, b);
33398 let e = _mm512_set_epi32(
33399 1 << 2,
33400 0,
33401 0,
33402 0,
33403 0,
33404 0,
33405 0,
33406 0,
33407 0,
33408 0,
33409 0,
33410 0,
33411 0,
33412 0,
33413 0,
33414 1 << 1 | 1 << 4,
33415 );
33416 assert_eq_m512i(r, e);
33417 }
33418
33419 #[simd_test(enable = "avx512f")]
33420 unsafe fn test_mm512_andnot_epi32() {
33421 let a = _mm512_set1_epi32(1 << 1 | 1 << 2);
33422 let b = _mm512_set1_epi32(1 << 3 | 1 << 4);
33423 let r = _mm512_andnot_epi32(a, b);
33424 let e = _mm512_set1_epi32(1 << 3 | 1 << 4);
33425 assert_eq_m512i(r, e);
33426 }
33427
33428 #[simd_test(enable = "avx512f")]
33429 unsafe fn test_mm512_mask_andnot_epi32() {
33430 let a = _mm512_set1_epi32(1 << 1 | 1 << 2);
33431 let b = _mm512_set1_epi32(1 << 3 | 1 << 4);
33432 let r = _mm512_mask_andnot_epi32(a, 0, a, b);
33433 assert_eq_m512i(r, a);
33434
33435 let r = _mm512_mask_andnot_epi32(a, 0b11111111_11111111, a, b);
33436 let e = _mm512_set1_epi32(1 << 3 | 1 << 4);
33437 assert_eq_m512i(r, e);
33438 }
33439
33440 #[simd_test(enable = "avx512f")]
33441 unsafe fn test_mm512_maskz_andnot_epi32() {
33442 let a = _mm512_set1_epi32(1 << 1 | 1 << 2);
33443 let b = _mm512_set1_epi32(1 << 3 | 1 << 4);
33444 let r = _mm512_maskz_andnot_epi32(0, a, b);
33445 assert_eq_m512i(r, _mm512_setzero_si512());
33446
33447 let r = _mm512_maskz_andnot_epi32(0b00000000_11111111, a, b);
33448 let e = _mm512_set_epi32(
33449 0,
33450 0,
33451 0,
33452 0,
33453 0,
33454 0,
33455 0,
33456 0,
33457 1 << 3 | 1 << 4,
33458 1 << 3 | 1 << 4,
33459 1 << 3 | 1 << 4,
33460 1 << 3 | 1 << 4,
33461 1 << 3 | 1 << 4,
33462 1 << 3 | 1 << 4,
33463 1 << 3 | 1 << 4,
33464 1 << 3 | 1 << 4,
33465 );
33466 assert_eq_m512i(r, e);
33467 }
33468
33469 #[simd_test(enable = "avx512f")]
33470 unsafe fn test_mm512_kand() {
33471 let a: u16 = 0b11001100_00110011;
33472 let b: u16 = 0b11001100_00110011;
33473 let r = _mm512_kand(a, b);
33474 let e: u16 = 0b11001100_00110011;
33475 assert_eq!(r, e);
33476 }
33477
33478 #[simd_test(enable = "avx512f")]
33479 unsafe fn test_kand_mask16() {
33480 let a: u16 = 0b11001100_00110011;
33481 let b: u16 = 0b11001100_00110011;
33482 let r = _kand_mask16(a, b);
33483 let e: u16 = 0b11001100_00110011;
33484 assert_eq!(r, e);
33485 }
33486
33487 #[simd_test(enable = "avx512f")]
33488 unsafe fn test_mm512_kor() {
33489 let a: u16 = 0b11001100_00110011;
33490 let b: u16 = 0b00101110_00001011;
33491 let r = _mm512_kor(a, b);
33492 let e: u16 = 0b11101110_00111011;
33493 assert_eq!(r, e);
33494 }
33495
33496 #[simd_test(enable = "avx512f")]
33497 unsafe fn test_kor_mask16() {
33498 let a: u16 = 0b11001100_00110011;
33499 let b: u16 = 0b00101110_00001011;
33500 let r = _kor_mask16(a, b);
33501 let e: u16 = 0b11101110_00111011;
33502 assert_eq!(r, e);
33503 }
33504
33505 #[simd_test(enable = "avx512f")]
33506 unsafe fn test_mm512_kxor() {
33507 let a: u16 = 0b11001100_00110011;
33508 let b: u16 = 0b00101110_00001011;
33509 let r = _mm512_kxor(a, b);
33510 let e: u16 = 0b11100010_00111000;
33511 assert_eq!(r, e);
33512 }
33513
33514 #[simd_test(enable = "avx512f")]
33515 unsafe fn test_kxor_mask16() {
33516 let a: u16 = 0b11001100_00110011;
33517 let b: u16 = 0b00101110_00001011;
33518 let r = _kxor_mask16(a, b);
33519 let e: u16 = 0b11100010_00111000;
33520 assert_eq!(r, e);
33521 }
33522
33523 #[simd_test(enable = "avx512f")]
33524 unsafe fn test_mm512_knot() {
33525 let a: u16 = 0b11001100_00110011;
33526 let r = _mm512_knot(a);
33527 let e: u16 = 0b00110011_11001100;
33528 assert_eq!(r, e);
33529 }
33530
33531 #[simd_test(enable = "avx512f")]
33532 unsafe fn test_knot_mask16() {
33533 let a: u16 = 0b11001100_00110011;
33534 let r = _knot_mask16(a);
33535 let e: u16 = 0b00110011_11001100;
33536 assert_eq!(r, e);
33537 }
33538
33539 #[simd_test(enable = "avx512f")]
33540 unsafe fn test_mm512_kandn() {
33541 let a: u16 = 0b11001100_00110011;
33542 let b: u16 = 0b00101110_00001011;
33543 let r = _mm512_kandn(a, b);
33544 let e: u16 = 0b00100010_00001000;
33545 assert_eq!(r, e);
33546 }
33547
33548 #[simd_test(enable = "avx512f")]
33549 unsafe fn test_kandn_mask16() {
33550 let a: u16 = 0b11001100_00110011;
33551 let b: u16 = 0b00101110_00001011;
33552 let r = _kandn_mask16(a, b);
33553 let e: u16 = 0b00100010_00001000;
33554 assert_eq!(r, e);
33555 }
33556
33557 #[simd_test(enable = "avx512f")]
33558 unsafe fn test_mm512_kxnor() {
33559 let a: u16 = 0b11001100_00110011;
33560 let b: u16 = 0b00101110_00001011;
33561 let r = _mm512_kxnor(a, b);
33562 let e: u16 = 0b00011101_11000111;
33563 assert_eq!(r, e);
33564 }
33565
33566 #[simd_test(enable = "avx512f")]
33567 unsafe fn test_kxnor_mask16() {
33568 let a: u16 = 0b11001100_00110011;
33569 let b: u16 = 0b00101110_00001011;
33570 let r = _kxnor_mask16(a, b);
33571 let e: u16 = 0b00011101_11000111;
33572 assert_eq!(r, e);
33573 }
33574
33575 #[simd_test(enable = "avx512f")]
33576 unsafe fn test_mm512_kmov() {
33577 let a: u16 = 0b11001100_00110011;
33578 let r = _mm512_kmov(a);
33579 let e: u16 = 0b11001100_00110011;
33580 assert_eq!(r, e);
33581 }
33582
33583 #[simd_test(enable = "avx512f")]
33584 unsafe fn test_mm512_int2mask() {
33585 let a: i32 = 0b11001100_00110011;
33586 let r = _mm512_int2mask(a);
33587 let e: u16 = 0b11001100_00110011;
33588 assert_eq!(r, e);
33589 }
33590
33591 #[simd_test(enable = "avx512f")]
33592 unsafe fn test_mm512_mask2int() {
33593 let k1: __mmask16 = 0b11001100_00110011;
33594 let r = _mm512_mask2int(k1);
33595 let e: i32 = 0b11001100_00110011;
33596 assert_eq!(r, e);
33597 }
33598
33599 #[simd_test(enable = "avx512f")]
33600 unsafe fn test_mm512_kunpackb() {
33601 let a: u16 = 0b11001100_00110011;
33602 let b: u16 = 0b00101110_00001011;
33603 let r = _mm512_kunpackb(a, b);
33604 let e: u16 = 0b00101110_00110011;
33605 assert_eq!(r, e);
33606 }
33607
33608 #[simd_test(enable = "avx512f")]
33609 unsafe fn test_mm512_kortestc() {
33610 let a: u16 = 0b11001100_00110011;
33611 let b: u16 = 0b00101110_00001011;
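// kortestc reports the carry flag: 1 only when the OR of the two masks is all ones.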
33612 let r = _mm512_kortestc(a, b);
33613 assert_eq!(r, 0);
33614 let b: u16 = 0b11111111_11111111;
33615 let r = _mm512_kortestc(a, b);
33616 assert_eq!(r, 1);
33617 }
33618
33619 #[simd_test(enable = "avx512f")]
33620 unsafe fn test_mm512_test_epi32_mask() {
33621 let a = _mm512_set1_epi32(1 << 0);
33622 let b = _mm512_set1_epi32(1 << 0 | 1 << 1);
33623 let r = _mm512_test_epi32_mask(a, b);
33624 let e: __mmask16 = 0b11111111_11111111;
33625 assert_eq!(r, e);
33626 }
33627
33628 #[simd_test(enable = "avx512f")]
33629 unsafe fn test_mm512_mask_test_epi32_mask() {
33630 let a = _mm512_set1_epi32(1 << 0);
33631 let b = _mm512_set1_epi32(1 << 0 | 1 << 1);
33632 let r = _mm512_mask_test_epi32_mask(0, a, b);
33633 assert_eq!(r, 0);
33634 let r = _mm512_mask_test_epi32_mask(0b11111111_11111111, a, b);
33635 let e: __mmask16 = 0b11111111_11111111;
33636 assert_eq!(r, e);
33637 }
33638
33639 #[simd_test(enable = "avx512f")]
33640 unsafe fn test_mm512_testn_epi32_mask() {
33641 let a = _mm512_set1_epi32(1 << 0);
33642 let b = _mm512_set1_epi32(1 << 0 | 1 << 1);
33643 let r = _mm512_testn_epi32_mask(a, b);
33644 let e: __mmask16 = 0b00000000_00000000;
33645 assert_eq!(r, e);
33646 }
33647
33648 #[simd_test(enable = "avx512f")]
33649 unsafe fn test_mm512_mask_testn_epi32_mask() {
33650 let a = _mm512_set1_epi32(1 << 0);
33651 let b = _mm512_set1_epi32(1 << 1);
33652 let r = _mm512_mask_testn_epi32_mask(0, a, b);
33653 assert_eq!(r, 0);
33654 let r = _mm512_mask_testn_epi32_mask(0b11111111_11111111, a, b);
33655 let e: __mmask16 = 0b11111111_11111111;
33656 assert_eq!(r, e);
33657 }
33658
33659 #[simd_test(enable = "avx512f")]
33660 unsafe fn test_mm512_stream_ps() {
33661 #[repr(align(64))]
33662 struct Memory {
33663 pub data: [f32; 16],
33664 }
33665 let a = _mm512_set1_ps(7.0);
33666 let mut mem = Memory { data: [-1.0; 16] };
33667
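// vmovntps is a non-temporal store; the 512-bit destination must be 64-byte aligned.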
33668 _mm512_stream_ps(&mut mem.data[0] as *mut f32, a);
33669 for i in 0..16 {
33670 assert_eq!(mem.data[i], get_m512(a, i));
33671 }
33672 }
33673
33674 #[simd_test(enable = "avx512f")]
33675 unsafe fn test_mm512_reduce_add_epi32() {
33676 let a = _mm512_set1_epi32(1);
33677 let e: i32 = _mm512_reduce_add_epi32(a);
33678 assert_eq!(16, e);
33679 }
33680
33681 #[simd_test(enable = "avx512f")]
33682 unsafe fn test_mm512_mask_reduce_add_epi32() {
33683 let a = _mm512_set1_epi32(1);
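// Only the upper eight lanes (mask bits 8..15) take part in the masked reduction.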
33684 let e: i32 = _mm512_mask_reduce_add_epi32(0b11111111_00000000, a);
33685 assert_eq!(8, e);
33686 }
33687
33688 #[simd_test(enable = "avx512f")]
33689 unsafe fn test_mm512_reduce_add_ps() {
33690 let a = _mm512_set1_ps(1.);
33691 let e: f32 = _mm512_reduce_add_ps(a);
33692 assert_eq!(16., e);
33693 }
33694
33695 #[simd_test(enable = "avx512f")]
33696 unsafe fn test_mm512_mask_reduce_add_ps() {
33697 let a = _mm512_set1_ps(1.);
33698 let e: f32 = _mm512_mask_reduce_add_ps(0b11111111_00000000, a);
33699 assert_eq!(8., e);
33700 }
33701
33702 #[simd_test(enable = "avx512f")]
33703 unsafe fn test_mm512_reduce_mul_epi32() {
33704 let a = _mm512_set1_epi32(2);
33705 let e: i32 = _mm512_reduce_mul_epi32(a);
33706 assert_eq!(65536, e);
33707 }
33708
33709 #[simd_test(enable = "avx512f")]
33710 unsafe fn test_mm512_mask_reduce_mul_epi32() {
33711 let a = _mm512_set1_epi32(2);
33712 let e: i32 = _mm512_mask_reduce_mul_epi32(0b11111111_00000000, a);
33713 assert_eq!(256, e);
33714 }
33715
33716 #[simd_test(enable = "avx512f")]
33717 unsafe fn test_mm512_reduce_mul_ps() {
33718 let a = _mm512_set1_ps(2.);
33719 let e: f32 = _mm512_reduce_mul_ps(a);
33720 assert_eq!(65536., e);
33721 }
33722
33723 #[simd_test(enable = "avx512f")]
33724 unsafe fn test_mm512_mask_reduce_mul_ps() {
33725 let a = _mm512_set1_ps(2.);
33726 let e: f32 = _mm512_mask_reduce_mul_ps(0b11111111_00000000, a);
33727 assert_eq!(256., e);
33728 }
33729
33730 #[simd_test(enable = "avx512f")]
33731 unsafe fn test_mm512_reduce_max_epi32() {
33732 let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
33733 let e: i32 = _mm512_reduce_max_epi32(a);
33734 assert_eq!(15, e);
33735 }
33736
33737 #[simd_test(enable = "avx512f")]
33738 unsafe fn test_mm512_mask_reduce_max_epi32() {
33739 let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
33740 let e: i32 = _mm512_mask_reduce_max_epi32(0b11111111_00000000, a);
33741 assert_eq!(7, e);
33742 }
33743
33744 #[simd_test(enable = "avx512f")]
33745 unsafe fn test_mm512_reduce_max_epu32() {
33746 let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
33747 let e: u32 = _mm512_reduce_max_epu32(a);
33748 assert_eq!(15, e);
33749 }
33750
33751 #[simd_test(enable = "avx512f")]
33752 unsafe fn test_mm512_mask_reduce_max_epu32() {
33753 let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
33754 let e: u32 = _mm512_mask_reduce_max_epu32(0b11111111_00000000, a);
33755 assert_eq!(7, e);
33756 }
33757
33758 #[simd_test(enable = "avx512f")]
33759 unsafe fn test_mm512_reduce_max_ps() {
33760 let a = _mm512_set_ps(
33761 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
33762 );
33763 let e: f32 = _mm512_reduce_max_ps(a);
33764 assert_eq!(15., e);
33765 }
33766
33767 #[simd_test(enable = "avx512f")]
33768 unsafe fn test_mm512_mask_reduce_max_ps() {
33769 let a = _mm512_set_ps(
33770 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
33771 );
33772 let e: f32 = _mm512_mask_reduce_max_ps(0b11111111_00000000, a);
33773 assert_eq!(7., e);
33774 }
33775
33776 #[simd_test(enable = "avx512f")]
33777 unsafe fn test_mm512_reduce_min_epi32() {
33778 let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
33779 let e: i32 = _mm512_reduce_min_epi32(a);
33780 assert_eq!(0, e);
33781 }
33782
33783 #[simd_test(enable = "avx512f")]
33784 unsafe fn test_mm512_mask_reduce_min_epi32() {
33785 let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
33786 let e: i32 = _mm512_mask_reduce_min_epi32(0b11111111_00000000, a);
33787 assert_eq!(0, e);
33788 }
33789
33790 #[simd_test(enable = "avx512f")]
33791 unsafe fn test_mm512_reduce_min_epu32() {
33792 let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
33793 let e: u32 = _mm512_reduce_min_epu32(a);
33794 assert_eq!(0, e);
33795 }
33796
33797 #[simd_test(enable = "avx512f")]
33798 unsafe fn test_mm512_mask_reduce_min_epu32() {
33799 let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
33800 let e: u32 = _mm512_mask_reduce_min_epu32(0b11111111_00000000, a);
33801 assert_eq!(0, e);
33802 }
33803
33804 #[simd_test(enable = "avx512f")]
33805 unsafe fn test_mm512_reduce_min_ps() {
33806 let a = _mm512_set_ps(
33807 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
33808 );
33809 let e: f32 = _mm512_reduce_min_ps(a);
33810 assert_eq!(0., e);
33811 }
33812
33813 #[simd_test(enable = "avx512f")]
33814 unsafe fn test_mm512_mask_reduce_min_ps() {
33815 let a = _mm512_set_ps(
33816 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
33817 );
33818 let e: f32 = _mm512_mask_reduce_min_ps(0b11111111_00000000, a);
33819 assert_eq!(0., e);
33820 }
33821
33822 #[simd_test(enable = "avx512f")]
33823 unsafe fn test_mm512_reduce_and_epi32() {
33824 let a = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2);
33825 let e: i32 = _mm512_reduce_and_epi32(a);
33826 assert_eq!(0, e);
33827 }
33828
33829 #[simd_test(enable = "avx512f")]
33830 unsafe fn test_mm512_mask_reduce_and_epi32() {
33831 let a = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2);
33832 let e: i32 = _mm512_mask_reduce_and_epi32(0b11111111_00000000, a);
33833 assert_eq!(1, e);
33834 }
33835
33836 #[simd_test(enable = "avx512f")]
33837 unsafe fn test_mm512_reduce_or_epi32() {
33838 let a = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2);
33839 let e: i32 = _mm512_reduce_or_epi32(a);
33840 assert_eq!(3, e);
33841 }
33842
33843 #[simd_test(enable = "avx512f")]
33844 unsafe fn test_mm512_mask_reduce_or_epi32() {
33845 let a = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2);
33846 let e: i32 = _mm512_mask_reduce_or_epi32(0b11111111_00000000, a);
33847 assert_eq!(1, e);
33848 }
33849
33850 #[simd_test(enable = "avx512f")]
33851 unsafe fn test_mm512_mask_compress_epi32() {
33852 let src = _mm512_set1_epi32(200);
33853 let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
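// Compress packs the mask-selected elements contiguously into the low lanes
// and passes the remaining lanes through from src.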
33854 let r = _mm512_mask_compress_epi32(src, 0b01010101_01010101, a);
33855 let e = _mm512_set_epi32(
33856 200, 200, 200, 200, 200, 200, 200, 200, 1, 3, 5, 7, 9, 11, 13, 15,
33857 );
33858 assert_eq_m512i(r, e);
33859 }
33860
33861 #[simd_test(enable = "avx512f")]
33862 unsafe fn test_mm512_maskz_compress_epi32() {
33863 let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
33864 let r = _mm512_maskz_compress_epi32(0b01010101_01010101, a);
33865 let e = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 5, 7, 9, 11, 13, 15);
33866 assert_eq_m512i(r, e);
33867 }
33868
33869 #[simd_test(enable = "avx512f")]
33870 unsafe fn test_mm512_mask_compress_ps() {
33871 let src = _mm512_set1_ps(200.);
33872 let a = _mm512_set_ps(
33873 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
33874 );
33875 let r = _mm512_mask_compress_ps(src, 0b01010101_01010101, a);
33876 let e = _mm512_set_ps(
33877 200., 200., 200., 200., 200., 200., 200., 200., 1., 3., 5., 7., 9., 11., 13., 15.,
33878 );
33879 assert_eq_m512(r, e);
33880 }
33881
33882 #[simd_test(enable = "avx512f")]
33883 unsafe fn test_mm512_maskz_compress_ps() {
33884 let a = _mm512_set_ps(
33885 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
33886 );
33887 let r = _mm512_maskz_compress_ps(0b01010101_01010101, a);
33888 let e = _mm512_set_ps(
33889 0., 0., 0., 0., 0., 0., 0., 0., 1., 3., 5., 7., 9., 11., 13., 15.,
33890 );
33891 assert_eq_m512(r, e);
33892 }
33893
33894 #[simd_test(enable = "avx512f")]
33895 unsafe fn test_mm512_mask_expand_epi32() {
33896 let src = _mm512_set1_epi32(200);
33897 let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
33898 let r = _mm512_mask_expand_epi32(src, 0b01010101_01010101, a);
33899 let e = _mm512_set_epi32(
33900 200, 8, 200, 9, 200, 10, 200, 11, 200, 12, 200, 13, 200, 14, 200, 15,
33901 );
33902 assert_eq_m512i(r, e);
33903 }
33904
33905 #[simd_test(enable = "avx512f")]
33906 unsafe fn test_mm512_maskz_expand_epi32() {
33907 let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
33908 let r = _mm512_maskz_expand_epi32(0b01010101_01010101, a);
33909 let e = _mm512_set_epi32(0, 8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15);
33910 assert_eq_m512i(r, e);
33911 }
33912
33913 #[simd_test(enable = "avx512f")]
33914 unsafe fn test_mm512_mask_expand_ps() {
33915 let src = _mm512_set1_ps(200.);
33916 let a = _mm512_set_ps(
33917 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
33918 );
33919 let r = _mm512_mask_expand_ps(src, 0b01010101_01010101, a);
33920 let e = _mm512_set_ps(
33921 200., 8., 200., 9., 200., 10., 200., 11., 200., 12., 200., 13., 200., 14., 200., 15.,
33922 );
33923 assert_eq_m512(r, e);
33924 }
33925
33926 #[simd_test(enable = "avx512f")]
33927 unsafe fn test_mm512_maskz_expand_ps() {
33928 let a = _mm512_set_ps(
33929 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
33930 );
33931 let r = _mm512_maskz_expand_ps(0b01010101_01010101, a);
33932 let e = _mm512_set_ps(
33933 0., 8., 0., 9., 0., 10., 0., 11., 0., 12., 0., 13., 0., 14., 0., 15.,
33934 );
33935 assert_eq_m512(r, e);
33936 }
33937
33938 #[simd_test(enable = "avx512f")]
33939 unsafe fn test_mm512_loadu_epi32() {
33940 let a = &[4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50];
33941 let p = a.as_ptr();
33942 let r = _mm512_loadu_epi32(black_box(p));
33943 let e = _mm512_setr_epi32(4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50);
33944 assert_eq_m512i(r, e);
33945 }
33946
33947 #[simd_test(enable = "avx512f")]
33948 unsafe fn test_mm512_storeu_epi32() {
33949 let a = _mm512_set1_epi32(9);
33950 let mut r = _mm512_undefined_epi32();
33951 _mm512_storeu_epi32(&mut r as *mut _ as *mut i32, a);
33952 assert_eq_m512i(r, a);
33953 }
33954
33955 #[simd_test(enable = "avx512f")]
33956 unsafe fn test_mm512_loadu_si512() {
33957 let a = &[4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50];
33958 let p = a.as_ptr();
33959 let r = _mm512_loadu_si512(black_box(p));
33960 let e = _mm512_setr_epi32(4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50);
33961 assert_eq_m512i(r, e);
33962 }
33963
33964 #[simd_test(enable = "avx512f")]
33965 unsafe fn test_mm512_storeu_si512() {
33966 let a = _mm512_set1_epi32(9);
33967 let mut r = _mm512_undefined_epi32();
33968 _mm512_storeu_si512(&mut r as *mut _ as *mut i32, a);
33969 assert_eq_m512i(r, a);
33970 }
33971
33972 #[simd_test(enable = "avx512f")]
33973 unsafe fn test_mm512_load_si512() {
33974 #[repr(align(64))]
33975 struct Align {
33976 data: [i32; 16], // 64 bytes
33977 }
33978 let a = Align {
33979 data: [4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50],
33980 };
33981 let p = (a.data).as_ptr();
33982 let r = _mm512_load_si512(black_box(p));
33983 let e = _mm512_setr_epi32(4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50);
33984 assert_eq_m512i(r, e);
33985 }
33986
33987 #[simd_test(enable = "avx512f")]
33988 unsafe fn test_mm512_store_si512() {
33989 let a = _mm512_set1_epi32(9);
33990 let mut r = _mm512_undefined_epi32();
33991 _mm512_store_si512(&mut r as *mut _ as *mut i32, a);
33992 assert_eq_m512i(r, a);
33993 }
33994
33995 #[simd_test(enable = "avx512f")]
33996 unsafe fn test_mm512_load_epi32() {
33997 #[repr(align(64))]
33998 struct Align {
33999 data: [i32; 16], // 64 bytes
34000 }
34001 let a = Align {
34002 data: [4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50],
34003 };
34004 let p = (a.data).as_ptr();
34005 let r = _mm512_load_epi32(black_box(p));
34006 let e = _mm512_setr_epi32(4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50);
34007 assert_eq_m512i(r, e);
34008 }
34009
34010 #[simd_test(enable = "avx512f")]
34011 unsafe fn test_mm512_store_epi32() {
34012 let a = _mm512_set1_epi32(9);
34013 let mut r = _mm512_undefined_epi32();
34014 _mm512_store_epi32(&mut r as *mut _ as *mut i32, a);
34015 assert_eq_m512i(r, a);
34016 }
34017
34018 #[simd_test(enable = "avx512f")]
34019 unsafe fn test_mm512_load_ps() {
34020 #[repr(align(64))]
34021 struct Align {
34022 data: [f32; 16], // 64 bytes
34023 }
34024 let a = Align {
34025 data: [
34026 4., 3., 2., 5., 8., 9., 64., 50., -4., -3., -2., -5., -8., -9., -64., -50.,
34027 ],
34028 };
34029 let p = (a.data).as_ptr();
34030 let r = _mm512_load_ps(black_box(p));
34031 let e = _mm512_setr_ps(
34032 4., 3., 2., 5., 8., 9., 64., 50., -4., -3., -2., -5., -8., -9., -64., -50.,
34033 );
34034 assert_eq_m512(r, e);
34035 }
34036
34037 #[simd_test(enable = "avx512f")]
34038 unsafe fn test_mm512_store_ps() {
34039 let a = _mm512_set1_ps(9.);
34040 let mut r = _mm512_undefined_ps();
34041 _mm512_store_ps(&mut r as *mut _ as *mut f32, a);
34042 assert_eq_m512(r, a);
34043 }
34044
34045 #[simd_test(enable = "avx512f")]
34046 unsafe fn test_mm512_mask_set1_epi32() {
34047 let src = _mm512_set1_epi32(2);
34048 let a: i32 = 11;
34049 let r = _mm512_mask_set1_epi32(src, 0, a);
34050 assert_eq_m512i(r, src);
34051 let r = _mm512_mask_set1_epi32(src, 0b11111111_11111111, a);
34052 let e = _mm512_set1_epi32(11);
34053 assert_eq_m512i(r, e);
34054 }
34055
34056 #[simd_test(enable = "avx512f")]
34057 unsafe fn test_mm512_maskz_set1_epi32() {
34058 let a: i32 = 11;
34059 let r = _mm512_maskz_set1_epi32(0, a);
34060 assert_eq_m512i(r, _mm512_setzero_si512());
34061 let r = _mm512_maskz_set1_epi32(0b11111111_11111111, a);
34062 let e = _mm512_set1_epi32(11);
34063 assert_eq_m512i(r, e);
34064 }
34065
34066 #[simd_test(enable = "avx512f")]
34067 unsafe fn test_mm_mask_move_ss() {
34068 let src = _mm_set_ps(10., 11., 100., 110.);
34069 let a = _mm_set_ps(1., 2., 10., 20.);
34070 let b = _mm_set_ps(3., 4., 30., 40.);
34071 let r = _mm_mask_move_ss(src, 0, a, b);
34072 let e = _mm_set_ps(1., 2., 10., 110.);
34073 assert_eq_m128(r, e);
34074 let r = _mm_mask_move_ss(src, 0b11111111, a, b);
34075 let e = _mm_set_ps(1., 2., 10., 40.);
34076 assert_eq_m128(r, e);
34077 }
34078
34079 #[simd_test(enable = "avx512f")]
34080 unsafe fn test_mm_maskz_move_ss() {
34081 let a = _mm_set_ps(1., 2., 10., 20.);
34082 let b = _mm_set_ps(3., 4., 30., 40.);
34083 let r = _mm_maskz_move_ss(0, a, b);
34084 let e = _mm_set_ps(1., 2., 10., 0.);
34085 assert_eq_m128(r, e);
34086 let r = _mm_maskz_move_ss(0b11111111, a, b);
34087 let e = _mm_set_ps(1., 2., 10., 40.);
34088 assert_eq_m128(r, e);
34089 }
34090
34091 #[simd_test(enable = "avx512f")]
34092 unsafe fn test_mm_mask_move_sd() {
34093 let src = _mm_set_pd(10., 11.);
34094 let a = _mm_set_pd(1., 2.);
34095 let b = _mm_set_pd(3., 4.);
34096 let r = _mm_mask_move_sd(src, 0, a, b);
34097 let e = _mm_set_pd(1., 11.);
34098 assert_eq_m128d(r, e);
34099 let r = _mm_mask_move_sd(src, 0b11111111, a, b);
34100 let e = _mm_set_pd(1., 4.);
34101 assert_eq_m128d(r, e);
34102 }
34103
34104 #[simd_test(enable = "avx512f")]
34105 unsafe fn test_mm_maskz_move_sd() {
34106 let a = _mm_set_pd(1., 2.);
34107 let b = _mm_set_pd(3., 4.);
34108 let r = _mm_maskz_move_sd(0, a, b);
34109 let e = _mm_set_pd(1., 0.);
34110 assert_eq_m128d(r, e);
34111 let r = _mm_maskz_move_sd(0b11111111, a, b);
34112 let e = _mm_set_pd(1., 4.);
34113 assert_eq_m128d(r, e);
34114 }
34115
34116 #[simd_test(enable = "avx512f")]
34117 unsafe fn test_mm_mask_add_ss() {
34118 let src = _mm_set_ps(10., 11., 100., 110.);
34119 let a = _mm_set_ps(1., 2., 10., 20.);
34120 let b = _mm_set_ps(3., 4., 30., 40.);
34121 let r = _mm_mask_add_ss(src, 0, a, b);
34122 let e = _mm_set_ps(1., 2., 10., 110.);
34123 assert_eq_m128(r, e);
34124 let r = _mm_mask_add_ss(src, 0b11111111, a, b);
34125 let e = _mm_set_ps(1., 2., 10., 60.);
34126 assert_eq_m128(r, e);
34127 }
34128
34129 #[simd_test(enable = "avx512f")]
34130 unsafe fn test_mm_maskz_add_ss() {
34131 let a = _mm_set_ps(1., 2., 10., 20.);
34132 let b = _mm_set_ps(3., 4., 30., 40.);
34133 let r = _mm_maskz_add_ss(0, a, b);
34134 let e = _mm_set_ps(1., 2., 10., 0.);
34135 assert_eq_m128(r, e);
34136 let r = _mm_maskz_add_ss(0b11111111, a, b);
34137 let e = _mm_set_ps(1., 2., 10., 60.);
34138 assert_eq_m128(r, e);
34139 }
34140
34141 #[simd_test(enable = "avx512f")]
34142 unsafe fn test_mm_mask_add_sd() {
34143 let src = _mm_set_pd(10., 11.);
34144 let a = _mm_set_pd(1., 2.);
34145 let b = _mm_set_pd(3., 4.);
34146 let r = _mm_mask_add_sd(src, 0, a, b);
34147 let e = _mm_set_pd(1., 11.);
34148 assert_eq_m128d(r, e);
34149 let r = _mm_mask_add_sd(src, 0b11111111, a, b);
34150 let e = _mm_set_pd(1., 6.);
34151 assert_eq_m128d(r, e);
34152 }
34153
34154 #[simd_test(enable = "avx512f")]
34155 unsafe fn test_mm_maskz_add_sd() {
34156 let a = _mm_set_pd(1., 2.);
34157 let b = _mm_set_pd(3., 4.);
34158 let r = _mm_maskz_add_sd(0, a, b);
34159 let e = _mm_set_pd(1., 0.);
34160 assert_eq_m128d(r, e);
34161 let r = _mm_maskz_add_sd(0b11111111, a, b);
34162 let e = _mm_set_pd(1., 6.);
34163 assert_eq_m128d(r, e);
34164 }
34165
34166 #[simd_test(enable = "avx512f")]
34167 unsafe fn test_mm_mask_sub_ss() {
34168 let src = _mm_set_ps(10., 11., 100., 110.);
34169 let a = _mm_set_ps(1., 2., 10., 20.);
34170 let b = _mm_set_ps(3., 4., 30., 40.);
34171 let r = _mm_mask_sub_ss(src, 0, a, b);
34172 let e = _mm_set_ps(1., 2., 10., 110.);
34173 assert_eq_m128(r, e);
34174 let r = _mm_mask_sub_ss(src, 0b11111111, a, b);
34175 let e = _mm_set_ps(1., 2., 10., -20.);
34176 assert_eq_m128(r, e);
34177 }
34178
34179 #[simd_test(enable = "avx512f")]
34180 unsafe fn test_mm_maskz_sub_ss() {
34181 let a = _mm_set_ps(1., 2., 10., 20.);
34182 let b = _mm_set_ps(3., 4., 30., 40.);
34183 let r = _mm_maskz_sub_ss(0, a, b);
34184 let e = _mm_set_ps(1., 2., 10., 0.);
34185 assert_eq_m128(r, e);
34186 let r = _mm_maskz_sub_ss(0b11111111, a, b);
34187 let e = _mm_set_ps(1., 2., 10., -20.);
34188 assert_eq_m128(r, e);
34189 }
34190
34191 #[simd_test(enable = "avx512f")]
34192 unsafe fn test_mm_mask_sub_sd() {
34193 let src = _mm_set_pd(10., 11.);
34194 let a = _mm_set_pd(1., 2.);
34195 let b = _mm_set_pd(3., 4.);
34196 let r = _mm_mask_sub_sd(src, 0, a, b);
34197 let e = _mm_set_pd(1., 11.);
34198 assert_eq_m128d(r, e);
34199 let r = _mm_mask_sub_sd(src, 0b11111111, a, b);
34200 let e = _mm_set_pd(1., -2.);
34201 assert_eq_m128d(r, e);
34202 }
34203
34204 #[simd_test(enable = "avx512f")]
34205 unsafe fn test_mm_maskz_sub_sd() {
34206 let a = _mm_set_pd(1., 2.);
34207 let b = _mm_set_pd(3., 4.);
34208 let r = _mm_maskz_sub_sd(0, a, b);
34209 let e = _mm_set_pd(1., 0.);
34210 assert_eq_m128d(r, e);
34211 let r = _mm_maskz_sub_sd(0b11111111, a, b);
34212 let e = _mm_set_pd(1., -2.);
34213 assert_eq_m128d(r, e);
34214 }
34215
34216 #[simd_test(enable = "avx512f")]
34217 unsafe fn test_mm_mask_mul_ss() {
34218 let src = _mm_set_ps(10., 11., 100., 110.);
34219 let a = _mm_set_ps(1., 2., 10., 20.);
34220 let b = _mm_set_ps(3., 4., 30., 40.);
34221 let r = _mm_mask_mul_ss(src, 0, a, b);
34222 let e = _mm_set_ps(1., 2., 10., 110.);
34223 assert_eq_m128(r, e);
34224 let r = _mm_mask_mul_ss(src, 0b11111111, a, b);
34225 let e = _mm_set_ps(1., 2., 10., 800.);
34226 assert_eq_m128(r, e);
34227 }
34228
34229 #[simd_test(enable = "avx512f")]
34230 unsafe fn test_mm_maskz_mul_ss() {
34231 let a = _mm_set_ps(1., 2., 10., 20.);
34232 let b = _mm_set_ps(3., 4., 30., 40.);
34233 let r = _mm_maskz_mul_ss(0, a, b);
34234 let e = _mm_set_ps(1., 2., 10., 0.);
34235 assert_eq_m128(r, e);
34236 let r = _mm_maskz_mul_ss(0b11111111, a, b);
34237 let e = _mm_set_ps(1., 2., 10., 800.);
34238 assert_eq_m128(r, e);
34239 }
34240
34241 #[simd_test(enable = "avx512f")]
34242 unsafe fn test_mm_mask_mul_sd() {
34243 let src = _mm_set_pd(10., 11.);
34244 let a = _mm_set_pd(1., 2.);
34245 let b = _mm_set_pd(3., 4.);
34246 let r = _mm_mask_mul_sd(src, 0, a, b);
34247 let e = _mm_set_pd(1., 11.);
34248 assert_eq_m128d(r, e);
34249 let r = _mm_mask_mul_sd(src, 0b11111111, a, b);
34250 let e = _mm_set_pd(1., 8.);
34251 assert_eq_m128d(r, e);
34252 }
34253
34254 #[simd_test(enable = "avx512f")]
34255 unsafe fn test_mm_maskz_mul_sd() {
34256 let a = _mm_set_pd(1., 2.);
34257 let b = _mm_set_pd(3., 4.);
34258 let r = _mm_maskz_mul_sd(0, a, b);
34259 let e = _mm_set_pd(1., 0.);
34260 assert_eq_m128d(r, e);
34261 let r = _mm_maskz_mul_sd(0b11111111, a, b);
34262 let e = _mm_set_pd(1., 8.);
34263 assert_eq_m128d(r, e);
34264 }
34265
34266 #[simd_test(enable = "avx512f")]
34267 unsafe fn test_mm_mask_div_ss() {
34268 let src = _mm_set_ps(10., 11., 100., 110.);
34269 let a = _mm_set_ps(1., 2., 10., 20.);
34270 let b = _mm_set_ps(3., 4., 30., 40.);
34271 let r = _mm_mask_div_ss(src, 0, a, b);
34272 let e = _mm_set_ps(1., 2., 10., 110.);
34273 assert_eq_m128(r, e);
34274 let r = _mm_mask_div_ss(src, 0b11111111, a, b);
34275 let e = _mm_set_ps(1., 2., 10., 0.5);
34276 assert_eq_m128(r, e);
34277 }
34278
34279 #[simd_test(enable = "avx512f")]
34280 unsafe fn test_mm_maskz_div_ss() {
34281 let a = _mm_set_ps(1., 2., 10., 20.);
34282 let b = _mm_set_ps(3., 4., 30., 40.);
34283 let r = _mm_maskz_div_ss(0, a, b);
34284 let e = _mm_set_ps(1., 2., 10., 0.);
34285 assert_eq_m128(r, e);
34286 let r = _mm_maskz_div_ss(0b11111111, a, b);
34287 let e = _mm_set_ps(1., 2., 10., 0.5);
34288 assert_eq_m128(r, e);
34289 }
34290
34291 #[simd_test(enable = "avx512f")]
34292 unsafe fn test_mm_mask_div_sd() {
34293 let src = _mm_set_pd(10., 11.);
34294 let a = _mm_set_pd(1., 2.);
34295 let b = _mm_set_pd(3., 4.);
34296 let r = _mm_mask_div_sd(src, 0, a, b);
34297 let e = _mm_set_pd(1., 11.);
34298 assert_eq_m128d(r, e);
34299 let r = _mm_mask_div_sd(src, 0b11111111, a, b);
34300 let e = _mm_set_pd(1., 0.5);
34301 assert_eq_m128d(r, e);
34302 }
34303
34304 #[simd_test(enable = "avx512f")]
34305 unsafe fn test_mm_maskz_div_sd() {
34306 let a = _mm_set_pd(1., 2.);
34307 let b = _mm_set_pd(3., 4.);
34308 let r = _mm_maskz_div_sd(0, a, b);
34309 let e = _mm_set_pd(1., 0.);
34310 assert_eq_m128d(r, e);
34311 let r = _mm_maskz_div_sd(0b11111111, a, b);
34312 let e = _mm_set_pd(1., 0.5);
34313 assert_eq_m128d(r, e);
34314 }
34315
34316 #[simd_test(enable = "avx512f")]
34317 unsafe fn test_mm_mask_max_ss() {
34318 let a = _mm_set_ps(0., 1., 2., 3.);
34319 let b = _mm_set_ps(4., 5., 6., 7.);
34320 let r = _mm_mask_max_ss(a, 0, a, b);
34321 let e = _mm_set_ps(0., 1., 2., 3.);
34322 assert_eq_m128(r, e);
34323 let r = _mm_mask_max_ss(a, 0b11111111, a, b);
34324 let e = _mm_set_ps(0., 1., 2., 7.);
34325 assert_eq_m128(r, e);
34326 }
34327
34328 #[simd_test(enable = "avx512f")]
34329 unsafe fn test_mm_maskz_max_ss() {
34330 let a = _mm_set_ps(0., 1., 2., 3.);
34331 let b = _mm_set_ps(4., 5., 6., 7.);
34332 let r = _mm_maskz_max_ss(0, a, b);
34333 let e = _mm_set_ps(0., 1., 2., 0.);
34334 assert_eq_m128(r, e);
34335 let r = _mm_maskz_max_ss(0b11111111, a, b);
34336 let e = _mm_set_ps(0., 1., 2., 7.);
34337 assert_eq_m128(r, e);
34338 }
34339
34340 #[simd_test(enable = "avx512f")]
34341 unsafe fn test_mm_mask_max_sd() {
34342 let a = _mm_set_pd(0., 1.);
34343 let b = _mm_set_pd(2., 3.);
34344 let r = _mm_mask_max_sd(a, 0, a, b);
34345 let e = _mm_set_pd(0., 1.);
34346 assert_eq_m128d(r, e);
34347 let r = _mm_mask_max_sd(a, 0b11111111, a, b);
34348 let e = _mm_set_pd(0., 3.);
34349 assert_eq_m128d(r, e);
34350 }
34351
34352 #[simd_test(enable = "avx512f")]
34353 unsafe fn test_mm_maskz_max_sd() {
34354 let a = _mm_set_pd(0., 1.);
34355 let b = _mm_set_pd(2., 3.);
34356 let r = _mm_maskz_max_sd(0, a, b);
34357 let e = _mm_set_pd(0., 0.);
34358 assert_eq_m128d(r, e);
34359 let r = _mm_maskz_max_sd(0b11111111, a, b);
34360 let e = _mm_set_pd(0., 3.);
34361 assert_eq_m128d(r, e);
34362 }
34363
34364 #[simd_test(enable = "avx512f")]
34365 unsafe fn test_mm_mask_min_ss() {
34366 let a = _mm_set_ps(0., 1., 2., 3.);
34367 let b = _mm_set_ps(4., 5., 6., 7.);
34368 let r = _mm_mask_min_ss(a, 0, a, b);
34369 let e = _mm_set_ps(0., 1., 2., 3.);
34370 assert_eq_m128(r, e);
34371 let r = _mm_mask_min_ss(a, 0b11111111, a, b);
34372 let e = _mm_set_ps(0., 1., 2., 3.);
34373 assert_eq_m128(r, e);
34374 }
34375
34376 #[simd_test(enable = "avx512f")]
34377 unsafe fn test_mm_maskz_min_ss() {
34378 let a = _mm_set_ps(0., 1., 2., 3.);
34379 let b = _mm_set_ps(4., 5., 6., 7.);
34380 let r = _mm_maskz_min_ss(0, a, b);
34381 let e = _mm_set_ps(0., 1., 2., 0.);
34382 assert_eq_m128(r, e);
34383 let r = _mm_maskz_min_ss(0b11111111, a, b);
34384 let e = _mm_set_ps(0., 1., 2., 3.);
34385 assert_eq_m128(r, e);
34386 }
34387
34388 #[simd_test(enable = "avx512f")]
34389 unsafe fn test_mm_mask_min_sd() {
34390 let a = _mm_set_pd(0., 1.);
34391 let b = _mm_set_pd(2., 3.);
34392 let r = _mm_mask_min_sd(a, 0, a, b);
34393 let e = _mm_set_pd(0., 1.);
34394 assert_eq_m128d(r, e);
34395 let r = _mm_mask_min_sd(a, 0b11111111, a, b);
34396 let e = _mm_set_pd(0., 1.);
34397 assert_eq_m128d(r, e);
34398 }
34399
34400 #[simd_test(enable = "avx512f")]
34401 unsafe fn test_mm_maskz_min_sd() {
34402 let a = _mm_set_pd(0., 1.);
34403 let b = _mm_set_pd(2., 3.);
34404 let r = _mm_maskz_min_sd(0, a, b);
34405 let e = _mm_set_pd(0., 0.);
34406 assert_eq_m128d(r, e);
34407 let r = _mm_maskz_min_sd(0b11111111, a, b);
34408 let e = _mm_set_pd(0., 1.);
34409 assert_eq_m128d(r, e);
34410 }
34411
34412 #[simd_test(enable = "avx512f")]
34413 unsafe fn test_mm_mask_sqrt_ss() {
34414 let src = _mm_set_ps(10., 11., 100., 110.);
34415 let a = _mm_set_ps(1., 2., 10., 20.);
34416 let b = _mm_set_ps(3., 4., 30., 4.);
34417 let r = _mm_mask_sqrt_ss(src, 0, a, b);
34418 let e = _mm_set_ps(1., 2., 10., 110.);
34419 assert_eq_m128(r, e);
34420 let r = _mm_mask_sqrt_ss(src, 0b11111111, a, b);
34421 let e = _mm_set_ps(1., 2., 10., 2.);
34422 assert_eq_m128(r, e);
34423 }
34424
34425 #[simd_test(enable = "avx512f")]
34426 unsafe fn test_mm_maskz_sqrt_ss() {
34427 let a = _mm_set_ps(1., 2., 10., 20.);
34428 let b = _mm_set_ps(3., 4., 30., 4.);
34429 let r = _mm_maskz_sqrt_ss(0, a, b);
34430 let e = _mm_set_ps(1., 2., 10., 0.);
34431 assert_eq_m128(r, e);
34432 let r = _mm_maskz_sqrt_ss(0b11111111, a, b);
34433 let e = _mm_set_ps(1., 2., 10., 2.);
34434 assert_eq_m128(r, e);
34435 }
34436
34437 #[simd_test(enable = "avx512f")]
34438 unsafe fn test_mm_mask_sqrt_sd() {
34439 let src = _mm_set_pd(10., 11.);
34440 let a = _mm_set_pd(1., 2.);
34441 let b = _mm_set_pd(3., 4.);
34442 let r = _mm_mask_sqrt_sd(src, 0, a, b);
34443 let e = _mm_set_pd(1., 11.);
34444 assert_eq_m128d(r, e);
34445 let r = _mm_mask_sqrt_sd(src, 0b11111111, a, b);
34446 let e = _mm_set_pd(1., 2.);
34447 assert_eq_m128d(r, e);
34448 }
34449
34450 #[simd_test(enable = "avx512f")]
34451 unsafe fn test_mm_maskz_sqrt_sd() {
34452 let a = _mm_set_pd(1., 2.);
34453 let b = _mm_set_pd(3., 4.);
34454 let r = _mm_maskz_sqrt_sd(0, a, b);
34455 let e = _mm_set_pd(1., 0.);
34456 assert_eq_m128d(r, e);
34457 let r = _mm_maskz_sqrt_sd(0b11111111, a, b);
34458 let e = _mm_set_pd(1., 2.);
34459 assert_eq_m128d(r, e);
34460 }
34461
34462 #[simd_test(enable = "avx512f")]
34463 unsafe fn test_mm_rsqrt14_ss() {
34464 let a = _mm_set_ps(1., 2., 10., 20.);
34465 let b = _mm_set_ps(3., 4., 30., 4.);
34466 let r = _mm_rsqrt14_ss(a, b);
34467 let e = _mm_set_ps(1., 2., 10., 0.5);
34468 assert_eq_m128(r, e);
34469 }
34470
34471 #[simd_test(enable = "avx512f")]
34472 unsafe fn test_mm_mask_rsqrt14_ss() {
34473 let src = _mm_set_ps(10., 11., 100., 110.);
34474 let a = _mm_set_ps(1., 2., 10., 20.);
34475 let b = _mm_set_ps(3., 4., 30., 4.);
34476 let r = _mm_mask_rsqrt14_ss(src, 0, a, b);
34477 let e = _mm_set_ps(1., 2., 10., 110.);
34478 assert_eq_m128(r, e);
34479 let r = _mm_mask_rsqrt14_ss(src, 0b11111111, a, b);
34480 let e = _mm_set_ps(1., 2., 10., 0.5);
34481 assert_eq_m128(r, e);
34482 }
34483
34484 #[simd_test(enable = "avx512f")]
34485 unsafe fn test_mm_maskz_rsqrt14_ss() {
34486 let a = _mm_set_ps(1., 2., 10., 20.);
34487 let b = _mm_set_ps(3., 4., 30., 4.);
34488 let r = _mm_maskz_rsqrt14_ss(0, a, b);
34489 let e = _mm_set_ps(1., 2., 10., 0.);
34490 assert_eq_m128(r, e);
34491 let r = _mm_maskz_rsqrt14_ss(0b11111111, a, b);
34492 let e = _mm_set_ps(1., 2., 10., 0.5);
34493 assert_eq_m128(r, e);
34494 }
34495
34496 #[simd_test(enable = "avx512f")]
34497 unsafe fn test_mm_rsqrt14_sd() {
34498 let a = _mm_set_pd(1., 2.);
34499 let b = _mm_set_pd(3., 4.);
34500 let r = _mm_rsqrt14_sd(a, b);
34501 let e = _mm_set_pd(1., 0.5);
34502 assert_eq_m128d(r, e);
34503 }
34504
34505 #[simd_test(enable = "avx512f")]
34506 unsafe fn test_mm_mask_rsqrt14_sd() {
34507 let src = _mm_set_pd(10., 11.);
34508 let a = _mm_set_pd(1., 2.);
34509 let b = _mm_set_pd(3., 4.);
34510 let r = _mm_mask_rsqrt14_sd(src, 0, a, b);
34511 let e = _mm_set_pd(1., 11.);
34512 assert_eq_m128d(r, e);
34513 let r = _mm_mask_rsqrt14_sd(src, 0b11111111, a, b);
34514 let e = _mm_set_pd(1., 0.5);
34515 assert_eq_m128d(r, e);
34516 }
34517
34518 #[simd_test(enable = "avx512f")]
34519 unsafe fn test_mm_maskz_rsqrt14_sd() {
34520 let a = _mm_set_pd(1., 2.);
34521 let b = _mm_set_pd(3., 4.);
34522 let r = _mm_maskz_rsqrt14_sd(0, a, b);
34523 let e = _mm_set_pd(1., 0.);
34524 assert_eq_m128d(r, e);
34525 let r = _mm_maskz_rsqrt14_sd(0b11111111, a, b);
34526 let e = _mm_set_pd(1., 0.5);
34527 assert_eq_m128d(r, e);
34528 }
34529
34530 #[simd_test(enable = "avx512f")]
34531 unsafe fn test_mm_rcp14_ss() {
34532 let a = _mm_set_ps(1., 2., 10., 20.);
34533 let b = _mm_set_ps(3., 4., 30., 4.);
34534 let r = _mm_rcp14_ss(a, b);
34535 let e = _mm_set_ps(1., 2., 10., 0.25);
34536 assert_eq_m128(r, e);
34537 }
34538
34539 #[simd_test(enable = "avx512f")]
34540 unsafe fn test_mm_mask_rcp14_ss() {
34541 let src = _mm_set_ps(10., 11., 100., 110.);
34542 let a = _mm_set_ps(1., 2., 10., 20.);
34543 let b = _mm_set_ps(3., 4., 30., 4.);
34544 let r = _mm_mask_rcp14_ss(src, 0, a, b);
34545 let e = _mm_set_ps(1., 2., 10., 110.);
34546 assert_eq_m128(r, e);
34547 let r = _mm_mask_rcp14_ss(src, 0b11111111, a, b);
34548 let e = _mm_set_ps(1., 2., 10., 0.25);
34549 assert_eq_m128(r, e);
34550 }
34551
34552 #[simd_test(enable = "avx512f")]
34553 unsafe fn test_mm_maskz_rcp14_ss() {
34554 let a = _mm_set_ps(1., 2., 10., 20.);
34555 let b = _mm_set_ps(3., 4., 30., 4.);
34556 let r = _mm_maskz_rcp14_ss(0, a, b);
34557 let e = _mm_set_ps(1., 2., 10., 0.);
34558 assert_eq_m128(r, e);
34559 let r = _mm_maskz_rcp14_ss(0b11111111, a, b);
34560 let e = _mm_set_ps(1., 2., 10., 0.25);
34561 assert_eq_m128(r, e);
34562 }
34563
34564 #[simd_test(enable = "avx512f")]
34565 unsafe fn test_mm_rcp14_sd() {
34566 let a = _mm_set_pd(1., 2.);
34567 let b = _mm_set_pd(3., 4.);
34568 let r = _mm_rcp14_sd(a, b);
34569 let e = _mm_set_pd(1., 0.25);
34570 assert_eq_m128d(r, e);
34571 }
34572
34573 #[simd_test(enable = "avx512f")]
34574 unsafe fn test_mm_mask_rcp14_sd() {
34575 let src = _mm_set_pd(10., 11.);
34576 let a = _mm_set_pd(1., 2.);
34577 let b = _mm_set_pd(3., 4.);
34578 let r = _mm_mask_rcp14_sd(src, 0, a, b);
34579 let e = _mm_set_pd(1., 11.);
34580 assert_eq_m128d(r, e);
34581 let r = _mm_mask_rcp14_sd(src, 0b11111111, a, b);
34582 let e = _mm_set_pd(1., 0.25);
34583 assert_eq_m128d(r, e);
34584 }
34585
34586 #[simd_test(enable = "avx512f")]
34587 unsafe fn test_mm_maskz_rcp14_sd() {
34588 let a = _mm_set_pd(1., 2.);
34589 let b = _mm_set_pd(3., 4.);
34590 let r = _mm_maskz_rcp14_sd(0, a, b);
34591 let e = _mm_set_pd(1., 0.);
34592 assert_eq_m128d(r, e);
34593 let r = _mm_maskz_rcp14_sd(0b11111111, a, b);
34594 let e = _mm_set_pd(1., 0.25);
34595 assert_eq_m128d(r, e);
34596 }
34597
34598 #[simd_test(enable = "avx512f")]
34599 unsafe fn test_mm_getexp_ss() {
34600 let a = _mm_set1_ps(2.);
34601 let b = _mm_set1_ps(3.);
34602 let r = _mm_getexp_ss(a, b);
34603 let e = _mm_set_ps(2., 2., 2., 1.);
34604 assert_eq_m128(r, e);
34605 }
34606
34607 #[simd_test(enable = "avx512f")]
34608 unsafe fn test_mm_mask_getexp_ss() {
34609 let a = _mm_set1_ps(2.);
34610 let b = _mm_set1_ps(3.);
34611 let r = _mm_mask_getexp_ss(a, 0, a, b);
34612 let e = _mm_set_ps(2., 2., 2., 2.);
34613 assert_eq_m128(r, e);
34614 let r = _mm_mask_getexp_ss(a, 0b11111111, a, b);
34615 let e = _mm_set_ps(2., 2., 2., 1.);
34616 assert_eq_m128(r, e);
34617 }
34618
34619 #[simd_test(enable = "avx512f")]
34620 unsafe fn test_mm_maskz_getexp_ss() {
34621 let a = _mm_set1_ps(2.);
34622 let b = _mm_set1_ps(3.);
34623 let r = _mm_maskz_getexp_ss(0, a, b);
34624 let e = _mm_set_ps(2., 2., 2., 0.);
34625 assert_eq_m128(r, e);
34626 let r = _mm_maskz_getexp_ss(0b11111111, a, b);
34627 let e = _mm_set_ps(2., 2., 2., 1.);
34628 assert_eq_m128(r, e);
34629 }
34630
34631 #[simd_test(enable = "avx512f")]
34632 unsafe fn test_mm_getexp_sd() {
34633 let a = _mm_set1_pd(2.);
34634 let b = _mm_set1_pd(3.);
34635 let r = _mm_getexp_sd(a, b);
34636 let e = _mm_set_pd(2., 1.);
34637 assert_eq_m128d(r, e);
34638 }
34639
34640 #[simd_test(enable = "avx512f")]
34641 unsafe fn test_mm_mask_getexp_sd() {
34642 let a = _mm_set1_pd(2.);
34643 let b = _mm_set1_pd(3.);
34644 let r = _mm_mask_getexp_sd(a, 0, a, b);
34645 let e = _mm_set_pd(2., 2.);
34646 assert_eq_m128d(r, e);
34647 let r = _mm_mask_getexp_sd(a, 0b11111111, a, b);
34648 let e = _mm_set_pd(2., 1.);
34649 assert_eq_m128d(r, e);
34650 }
34651
34652 #[simd_test(enable = "avx512f")]
34653 unsafe fn test_mm_maskz_getexp_sd() {
34654 let a = _mm_set1_pd(2.);
34655 let b = _mm_set1_pd(3.);
34656 let r = _mm_maskz_getexp_sd(0, a, b);
34657 let e = _mm_set_pd(2., 0.);
34658 assert_eq_m128d(r, e);
34659 let r = _mm_maskz_getexp_sd(0b11111111, a, b);
34660 let e = _mm_set_pd(2., 1.);
34661 assert_eq_m128d(r, e);
34662 }
34663
34664 #[simd_test(enable = "avx512f")]
34665 unsafe fn test_mm_getmant_ss() {
34666 let a = _mm_set1_ps(20.);
34667 let b = _mm_set1_ps(10.);
34668 let r = _mm_getmant_ss(a, b, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC);
34669 let e = _mm_set_ps(20., 20., 20., 1.25);
34670 assert_eq_m128(r, e);
34671 }
34672
34673 #[simd_test(enable = "avx512f")]
34674 unsafe fn test_mm_mask_getmant_ss() {
34675 let a = _mm_set1_ps(20.);
34676 let b = _mm_set1_ps(10.);
34677 let r = _mm_mask_getmant_ss(a, 0, a, b, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC);
34678 let e = _mm_set_ps(20., 20., 20., 20.);
34679 assert_eq_m128(r, e);
34680 let r = _mm_mask_getmant_ss(a, 0b11111111, a, b, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC);
34681 let e = _mm_set_ps(20., 20., 20., 1.25);
34682 assert_eq_m128(r, e);
34683 }
34684
34685 #[simd_test(enable = "avx512f")]
34686 unsafe fn test_mm_maskz_getmant_ss() {
34687 let a = _mm_set1_ps(20.);
34688 let b = _mm_set1_ps(10.);
34689 let r = _mm_maskz_getmant_ss(0, a, b, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC);
34690 let e = _mm_set_ps(20., 20., 20., 0.);
34691 assert_eq_m128(r, e);
34692 let r = _mm_maskz_getmant_ss(0b11111111, a, b, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC);
34693 let e = _mm_set_ps(20., 20., 20., 1.25);
34694 assert_eq_m128(r, e);
34695 }
34696
34697 #[simd_test(enable = "avx512f")]
34698 unsafe fn test_mm_getmant_sd() {
34699 let a = _mm_set1_pd(20.);
34700 let b = _mm_set1_pd(10.);
34701 let r = _mm_getmant_sd(a, b, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC);
34702 let e = _mm_set_pd(20., 1.25);
34703 assert_eq_m128d(r, e);
34704 }
34705
34706 #[simd_test(enable = "avx512f")]
34707 unsafe fn test_mm_mask_getmant_sd() {
34708 let a = _mm_set1_pd(20.);
34709 let b = _mm_set1_pd(10.);
34710 let r = _mm_mask_getmant_sd(a, 0, a, b, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC);
34711 let e = _mm_set_pd(20., 20.);
34712 assert_eq_m128d(r, e);
34713 let r = _mm_mask_getmant_sd(a, 0b11111111, a, b, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC);
34714 let e = _mm_set_pd(20., 1.25);
34715 assert_eq_m128d(r, e);
34716 }
34717
34718 #[simd_test(enable = "avx512f")]
34719 unsafe fn test_mm_maskz_getmant_sd() {
34720 let a = _mm_set1_pd(20.);
34721 let b = _mm_set1_pd(10.);
34722 let r = _mm_maskz_getmant_sd(0, a, b, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC);
34723 let e = _mm_set_pd(20., 0.);
34724 assert_eq_m128d(r, e);
34725 let r = _mm_maskz_getmant_sd(0b11111111, a, b, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC);
34726 let e = _mm_set_pd(20., 1.25);
34727 assert_eq_m128d(r, e);
34728 }
34729
34730 #[simd_test(enable = "avx512f")]
34731 unsafe fn test_mm_roundscale_ss() {
34732 let a = _mm_set1_ps(2.2);
34733 let b = _mm_set1_ps(1.1);
34734 let r = _mm_roundscale_ss(a, b, 0);
34735 let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0);
34736 assert_eq_m128(r, e);
34737 }
34738
34739 #[simd_test(enable = "avx512f")]
34740 unsafe fn test_mm_mask_roundscale_ss() {
34741 let a = _mm_set1_ps(2.2);
34742 let b = _mm_set1_ps(1.1);
34743 let r = _mm_mask_roundscale_ss(a, 0, a, b, 0);
34744 let e = _mm_set_ps(2.2, 2.2, 2.2, 2.2);
34745 assert_eq_m128(r, e);
34746 let r = _mm_mask_roundscale_ss(a, 0b11111111, a, b, 0);
34747 let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0);
34748 assert_eq_m128(r, e);
34749 }
34750
34751 #[simd_test(enable = "avx512f")]
34752 unsafe fn test_mm_maskz_roundscale_ss() {
34753 let a = _mm_set1_ps(2.2);
34754 let b = _mm_set1_ps(1.1);
34755 let r = _mm_maskz_roundscale_ss(0, a, b, 0);
34756 let e = _mm_set_ps(2.2, 2.2, 2.2, 0.0);
34757 assert_eq_m128(r, e);
34758 let r = _mm_maskz_roundscale_ss(0b11111111, a, b, 0);
34759 let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0);
34760 assert_eq_m128(r, e);
34761 }
34762
34763 #[simd_test(enable = "avx512f")]
34764 unsafe fn test_mm_roundscale_sd() {
34765 let a = _mm_set1_pd(2.2);
34766 let b = _mm_set1_pd(1.1);
34767 let r = _mm_roundscale_sd(a, b, 0);
34768 let e = _mm_set_pd(2.2, 1.0);
34769 assert_eq_m128d(r, e);
34770 }
34771
34772 #[simd_test(enable = "avx512f")]
34773 unsafe fn test_mm_mask_roundscale_sd() {
34774 let a = _mm_set1_pd(2.2);
34775 let b = _mm_set1_pd(1.1);
34776 let r = _mm_mask_roundscale_sd(a, 0, a, b, 0);
34777 let e = _mm_set_pd(2.2, 2.2);
34778 assert_eq_m128d(r, e);
34779 let r = _mm_mask_roundscale_sd(a, 0b11111111, a, b, 0);
34780 let e = _mm_set_pd(2.2, 1.0);
34781 assert_eq_m128d(r, e);
34782 }
34783
34784 #[simd_test(enable = "avx512f")]
34785 unsafe fn test_mm_maskz_roundscale_sd() {
34786 let a = _mm_set1_pd(2.2);
34787 let b = _mm_set1_pd(1.1);
34788 let r = _mm_maskz_roundscale_sd(0, a, b, 0);
34789 let e = _mm_set_pd(2.2, 0.0);
34790 assert_eq_m128d(r, e);
34791 let r = _mm_maskz_roundscale_sd(0b11111111, a, b, 0);
34792 let e = _mm_set_pd(2.2, 1.0);
34793 assert_eq_m128d(r, e);
34794 }
34795
34796 #[simd_test(enable = "avx512f")]
34797 unsafe fn test_mm_scalef_ss() {
34798 let a = _mm_set1_ps(1.);
34799 let b = _mm_set1_ps(3.);
34800 let r = _mm_scalef_ss(a, b);
34801 let e = _mm_set_ps(1., 1., 1., 8.);
34802 assert_eq_m128(r, e);
34803 }
34804
34805 #[simd_test(enable = "avx512f")]
34806 unsafe fn test_mm_mask_scalef_ss() {
34807 let a = _mm_set1_ps(1.);
34808 let b = _mm_set1_ps(3.);
34809 let r = _mm_mask_scalef_ss(a, 0, a, b);
34810 let e = _mm_set_ps(1., 1., 1., 1.);
34811 assert_eq_m128(r, e);
34812 let r = _mm_mask_scalef_ss(a, 0b11111111, a, b);
34813 let e = _mm_set_ps(1., 1., 1., 8.);
34814 assert_eq_m128(r, e);
34815 }
34816
34817 #[simd_test(enable = "avx512f")]
34818 unsafe fn test_mm_maskz_scalef_ss() {
34819 let a = _mm_set1_ps(1.);
34820 let b = _mm_set1_ps(3.);
34821 let r = _mm_maskz_scalef_ss(0, a, b);
34822 let e = _mm_set_ps(1., 1., 1., 0.);
34823 assert_eq_m128(r, e);
34824 let r = _mm_maskz_scalef_ss(0b11111111, a, b);
34825 let e = _mm_set_ps(1., 1., 1., 8.);
34826 assert_eq_m128(r, e);
34827 }
34828
34829 #[simd_test(enable = "avx512f")]
34830 unsafe fn test_mm_scalef_sd() {
34831 let a = _mm_set1_pd(1.);
34832 let b = _mm_set1_pd(3.);
34833 let r = _mm_scalef_sd(a, b);
34834 let e = _mm_set_pd(1., 8.);
34835 assert_eq_m128d(r, e);
34836 }
34837
34838 #[simd_test(enable = "avx512f")]
34839 unsafe fn test_mm_mask_scalef_sd() {
34840 let a = _mm_set1_pd(1.);
34841 let b = _mm_set1_pd(3.);
34842 let r = _mm_mask_scalef_sd(a, 0, a, b);
34843 let e = _mm_set_pd(1., 1.);
34844 assert_eq_m128d(r, e);
34845 let r = _mm_mask_scalef_sd(a, 0b11111111, a, b);
34846 let e = _mm_set_pd(1., 8.);
34847 assert_eq_m128d(r, e);
34848 }
34849
34850 #[simd_test(enable = "avx512f")]
34851 unsafe fn test_mm_maskz_scalef_sd() {
34852 let a = _mm_set1_pd(1.);
34853 let b = _mm_set1_pd(3.);
34854 let r = _mm_maskz_scalef_sd(0, a, b);
34855 let e = _mm_set_pd(1., 0.);
34856 assert_eq_m128d(r, e);
34857 let r = _mm_maskz_scalef_sd(0b11111111, a, b);
34858 let e = _mm_set_pd(1., 8.);
34859 assert_eq_m128d(r, e);
34860 }
34861
34862 #[simd_test(enable = "avx512f")]
34863 unsafe fn test_mm_mask_fmadd_ss() {
34864 let a = _mm_set1_ps(1.);
34865 let b = _mm_set1_ps(2.);
34866 let c = _mm_set1_ps(3.);
34867 let r = _mm_mask_fmadd_ss(a, 0, b, c);
34868 assert_eq_m128(r, a);
34869 let r = _mm_mask_fmadd_ss(a, 0b11111111, b, c);
34870 let e = _mm_set_ps(1., 1., 1., 5.);
34871 assert_eq_m128(r, e);
34872 }
34873
34874 #[simd_test(enable = "avx512f")]
34875 unsafe fn test_mm_maskz_fmadd_ss() {
34876 let a = _mm_set1_ps(1.);
34877 let b = _mm_set1_ps(2.);
34878 let c = _mm_set1_ps(3.);
34879 let r = _mm_maskz_fmadd_ss(0, a, b, c);
34880 let e = _mm_set_ps(1., 1., 1., 0.);
34881 assert_eq_m128(r, e);
34882 let r = _mm_maskz_fmadd_ss(0b11111111, a, b, c);
34883 let e = _mm_set_ps(1., 1., 1., 5.);
34884 assert_eq_m128(r, e);
34885 }
34886
34887 #[simd_test(enable = "avx512f")]
34888 unsafe fn test_mm_mask3_fmadd_ss() {
34889 let a = _mm_set1_ps(1.);
34890 let b = _mm_set1_ps(2.);
34891 let c = _mm_set1_ps(3.);
34892 let r = _mm_mask3_fmadd_ss(a, b, c, 0);
34893 assert_eq_m128(r, c);
34894 let r = _mm_mask3_fmadd_ss(a, b, c, 0b11111111);
34895 let e = _mm_set_ps(3., 3., 3., 5.);
34896 assert_eq_m128(r, e);
34897 }
34898
34899 #[simd_test(enable = "avx512f")]
34900 unsafe fn test_mm_mask_fmadd_sd() {
34901 let a = _mm_set1_pd(1.);
34902 let b = _mm_set1_pd(2.);
34903 let c = _mm_set1_pd(3.);
34904 let r = _mm_mask_fmadd_sd(a, 0, b, c);
34905 assert_eq_m128d(r, a);
34906 let r = _mm_mask_fmadd_sd(a, 0b11111111, b, c);
34907 let e = _mm_set_pd(1., 5.);
34908 assert_eq_m128d(r, e);
34909 }
34910
34911 #[simd_test(enable = "avx512f")]
34912 unsafe fn test_mm_maskz_fmadd_sd() {
34913 let a = _mm_set1_pd(1.);
34914 let b = _mm_set1_pd(2.);
34915 let c = _mm_set1_pd(3.);
34916 let r = _mm_maskz_fmadd_sd(0, a, b, c);
34917 let e = _mm_set_pd(1., 0.);
34918 assert_eq_m128d(r, e);
34919 let r = _mm_maskz_fmadd_sd(0b11111111, a, b, c);
34920 let e = _mm_set_pd(1., 5.);
34921 assert_eq_m128d(r, e);
34922 }
34923
34924 #[simd_test(enable = "avx512f")]
34925 unsafe fn test_mm_mask3_fmadd_sd() {
34926 let a = _mm_set1_pd(1.);
34927 let b = _mm_set1_pd(2.);
34928 let c = _mm_set1_pd(3.);
34929 let r = _mm_mask3_fmadd_sd(a, b, c, 0);
34930 assert_eq_m128d(r, c);
34931 let r = _mm_mask3_fmadd_sd(a, b, c, 0b11111111);
34932 let e = _mm_set_pd(3., 5.);
34933 assert_eq_m128d(r, e);
34934 }
34935
34936 #[simd_test(enable = "avx512f")]
34937 unsafe fn test_mm_mask_fmsub_ss() {
34938 let a = _mm_set1_ps(1.);
34939 let b = _mm_set1_ps(2.);
34940 let c = _mm_set1_ps(3.);
34941 let r = _mm_mask_fmsub_ss(a, 0, b, c);
34942 assert_eq_m128(r, a);
34943 let r = _mm_mask_fmsub_ss(a, 0b11111111, b, c);
34944 let e = _mm_set_ps(1., 1., 1., -1.);
34945 assert_eq_m128(r, e);
34946 }
34947
34948 #[simd_test(enable = "avx512f")]
34949 unsafe fn test_mm_maskz_fmsub_ss() {
34950 let a = _mm_set1_ps(1.);
34951 let b = _mm_set1_ps(2.);
34952 let c = _mm_set1_ps(3.);
34953 let r = _mm_maskz_fmsub_ss(0, a, b, c);
34954 let e = _mm_set_ps(1., 1., 1., 0.);
34955 assert_eq_m128(r, e);
34956 let r = _mm_maskz_fmsub_ss(0b11111111, a, b, c);
34957 let e = _mm_set_ps(1., 1., 1., -1.);
34958 assert_eq_m128(r, e);
34959 }
34960
34961 #[simd_test(enable = "avx512f")]
34962 unsafe fn test_mm_mask3_fmsub_ss() {
34963 let a = _mm_set1_ps(1.);
34964 let b = _mm_set1_ps(2.);
34965 let c = _mm_set1_ps(3.);
34966 let r = _mm_mask3_fmsub_ss(a, b, c, 0);
34967 assert_eq_m128(r, c);
34968 let r = _mm_mask3_fmsub_ss(a, b, c, 0b11111111);
34969 let e = _mm_set_ps(3., 3., 3., -1.);
34970 assert_eq_m128(r, e);
34971 }
34972
34973 #[simd_test(enable = "avx512f")]
34974 unsafe fn test_mm_mask_fmsub_sd() {
34975 let a = _mm_set1_pd(1.);
34976 let b = _mm_set1_pd(2.);
34977 let c = _mm_set1_pd(3.);
34978 let r = _mm_mask_fmsub_sd(a, 0, b, c);
34979 assert_eq_m128d(r, a);
34980 let r = _mm_mask_fmsub_sd(a, 0b11111111, b, c);
34981 let e = _mm_set_pd(1., -1.);
34982 assert_eq_m128d(r, e);
34983 }
34984
34985 #[simd_test(enable = "avx512f")]
34986 unsafe fn test_mm_maskz_fmsub_sd() {
34987 let a = _mm_set1_pd(1.);
34988 let b = _mm_set1_pd(2.);
34989 let c = _mm_set1_pd(3.);
34990 let r = _mm_maskz_fmsub_sd(0, a, b, c);
34991 let e = _mm_set_pd(1., 0.);
34992 assert_eq_m128d(r, e);
34993 let r = _mm_maskz_fmsub_sd(0b11111111, a, b, c);
34994 let e = _mm_set_pd(1., -1.);
34995 assert_eq_m128d(r, e);
34996 }
34997
34998 #[simd_test(enable = "avx512f")]
34999 unsafe fn test_mm_mask3_fmsub_sd() {
35000 let a = _mm_set1_pd(1.);
35001 let b = _mm_set1_pd(2.);
35002 let c = _mm_set1_pd(3.);
35003 let r = _mm_mask3_fmsub_sd(a, b, c, 0);
35004 assert_eq_m128d(r, c);
35005 let r = _mm_mask3_fmsub_sd(a, b, c, 0b11111111);
35006 let e = _mm_set_pd(3., -1.);
35007 assert_eq_m128d(r, e);
35008 }
35009
35010 #[simd_test(enable = "avx512f")]
35011 unsafe fn test_mm_mask_fnmadd_ss() {
35012 let a = _mm_set1_ps(1.);
35013 let b = _mm_set1_ps(2.);
35014 let c = _mm_set1_ps(3.);
35015 let r = _mm_mask_fnmadd_ss(a, 0, b, c);
35016 assert_eq_m128(r, a);
35017 let r = _mm_mask_fnmadd_ss(a, 0b11111111, b, c);
35018 let e = _mm_set_ps(1., 1., 1., 1.);
35019 assert_eq_m128(r, e);
35020 }
35021
35022 #[simd_test(enable = "avx512f")]
35023 unsafe fn test_mm_maskz_fnmadd_ss() {
35024 let a = _mm_set1_ps(1.);
35025 let b = _mm_set1_ps(2.);
35026 let c = _mm_set1_ps(3.);
35027 let r = _mm_maskz_fnmadd_ss(0, a, b, c);
35028 let e = _mm_set_ps(1., 1., 1., 0.);
35029 assert_eq_m128(r, e);
35030 let r = _mm_maskz_fnmadd_ss(0b11111111, a, b, c);
35031 let e = _mm_set_ps(1., 1., 1., 1.);
35032 assert_eq_m128(r, e);
35033 }
35034
35035 #[simd_test(enable = "avx512f")]
35036 unsafe fn test_mm_mask3_fnmadd_ss() {
35037 let a = _mm_set1_ps(1.);
35038 let b = _mm_set1_ps(2.);
35039 let c = _mm_set1_ps(3.);
35040 let r = _mm_mask3_fnmadd_ss(a, b, c, 0);
35041 assert_eq_m128(r, c);
35042 let r = _mm_mask3_fnmadd_ss(a, b, c, 0b11111111);
35043 let e = _mm_set_ps(3., 3., 3., 1.);
35044 assert_eq_m128(r, e);
35045 }
35046
35047 #[simd_test(enable = "avx512f")]
35048 unsafe fn test_mm_mask_fnmadd_sd() {
35049 let a = _mm_set1_pd(1.);
35050 let b = _mm_set1_pd(2.);
35051 let c = _mm_set1_pd(3.);
35052 let r = _mm_mask_fnmadd_sd(a, 0, b, c);
35053 assert_eq_m128d(r, a);
35054 let r = _mm_mask_fnmadd_sd(a, 0b11111111, b, c);
35055 let e = _mm_set_pd(1., 1.);
35056 assert_eq_m128d(r, e);
35057 }
35058
35059 #[simd_test(enable = "avx512f")]
35060 unsafe fn test_mm_maskz_fnmadd_sd() {
35061 let a = _mm_set1_pd(1.);
35062 let b = _mm_set1_pd(2.);
35063 let c = _mm_set1_pd(3.);
35064 let r = _mm_maskz_fnmadd_sd(0, a, b, c);
35065 let e = _mm_set_pd(1., 0.);
35066 assert_eq_m128d(r, e);
35067 let r = _mm_maskz_fnmadd_sd(0b11111111, a, b, c);
35068 let e = _mm_set_pd(1., 1.);
35069 assert_eq_m128d(r, e);
35070 }
35071
35072 #[simd_test(enable = "avx512f")]
35073 unsafe fn test_mm_mask3_fnmadd_sd() {
35074 let a = _mm_set1_pd(1.);
35075 let b = _mm_set1_pd(2.);
35076 let c = _mm_set1_pd(3.);
35077 let r = _mm_mask3_fnmadd_sd(a, b, c, 0);
35078 assert_eq_m128d(r, c);
35079 let r = _mm_mask3_fnmadd_sd(a, b, c, 0b11111111);
35080 let e = _mm_set_pd(3., 1.);
35081 assert_eq_m128d(r, e);
35082 }
35083
35084 #[simd_test(enable = "avx512f")]
35085 unsafe fn test_mm_mask_fnmsub_ss() {
35086 let a = _mm_set1_ps(1.);
35087 let b = _mm_set1_ps(2.);
35088 let c = _mm_set1_ps(3.);
35089 let r = _mm_mask_fnmsub_ss(a, 0, b, c);
35090 assert_eq_m128(r, a);
35091 let r = _mm_mask_fnmsub_ss(a, 0b11111111, b, c);
35092 let e = _mm_set_ps(1., 1., 1., -5.);
35093 assert_eq_m128(r, e);
35094 }
35095
35096 #[simd_test(enable = "avx512f")]
35097 unsafe fn test_mm_maskz_fnmsub_ss() {
35098 let a = _mm_set1_ps(1.);
35099 let b = _mm_set1_ps(2.);
35100 let c = _mm_set1_ps(3.);
35101 let r = _mm_maskz_fnmsub_ss(0, a, b, c);
35102 let e = _mm_set_ps(1., 1., 1., 0.);
35103 assert_eq_m128(r, e);
35104 let r = _mm_maskz_fnmsub_ss(0b11111111, a, b, c);
35105 let e = _mm_set_ps(1., 1., 1., -5.);
35106 assert_eq_m128(r, e);
35107 }
35108
35109 #[simd_test(enable = "avx512f")]
35110 unsafe fn test_mm_mask3_fnmsub_ss() {
35111 let a = _mm_set1_ps(1.);
35112 let b = _mm_set1_ps(2.);
35113 let c = _mm_set1_ps(3.);
35114 let r = _mm_mask3_fnmsub_ss(a, b, c, 0);
35115 assert_eq_m128(r, c);
35116 let r = _mm_mask3_fnmsub_ss(a, b, c, 0b11111111);
35117 let e = _mm_set_ps(3., 3., 3., -5.);
35118 assert_eq_m128(r, e);
35119 }
35120
35121 #[simd_test(enable = "avx512f")]
35122 unsafe fn test_mm_mask_fnmsub_sd() {
35123 let a = _mm_set1_pd(1.);
35124 let b = _mm_set1_pd(2.);
35125 let c = _mm_set1_pd(3.);
35126 let r = _mm_mask_fnmsub_sd(a, 0, b, c);
35127 assert_eq_m128d(r, a);
35128 let r = _mm_mask_fnmsub_sd(a, 0b11111111, b, c);
35129 let e = _mm_set_pd(1., -5.);
35130 assert_eq_m128d(r, e);
35131 }
35132
35133 #[simd_test(enable = "avx512f")]
35134 unsafe fn test_mm_maskz_fnmsub_sd() {
35135 let a = _mm_set1_pd(1.);
35136 let b = _mm_set1_pd(2.);
35137 let c = _mm_set1_pd(3.);
35138 let r = _mm_maskz_fnmsub_sd(0, a, b, c);
35139 let e = _mm_set_pd(1., 0.);
35140 assert_eq_m128d(r, e);
35141 let r = _mm_maskz_fnmsub_sd(0b11111111, a, b, c);
35142 let e = _mm_set_pd(1., -5.);
35143 assert_eq_m128d(r, e);
35144 }
35145
35146 #[simd_test(enable = "avx512f")]
35147 unsafe fn test_mm_mask3_fnmsub_sd() {
35148 let a = _mm_set1_pd(1.);
35149 let b = _mm_set1_pd(2.);
35150 let c = _mm_set1_pd(3.);
35151 let r = _mm_mask3_fnmsub_sd(a, b, c, 0);
35152 assert_eq_m128d(r, c);
35153 let r = _mm_mask3_fnmsub_sd(a, b, c, 0b11111111);
35154 let e = _mm_set_pd(3., -5.);
35155 assert_eq_m128d(r, e);
35156 }
35157
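// Scalar add/sub/mul/div with explicit rounding control: only the low lane is computed,
// the upper lanes always come from `a`, and a clear mask bit makes the low lane fall back
// to `src` (mask) or to zero (maskz). _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC requests
// round-toward-zero and suppresses floating-point exceptions (SAE).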
35158 #[simd_test(enable = "avx512f")]
35159 unsafe fn test_mm_add_round_ss() {
35160 let a = _mm_set_ps(1., 2., 10., 20.);
35161 let b = _mm_set_ps(3., 4., 30., 40.);
35162 let r = _mm_add_round_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
35163 let e = _mm_set_ps(1., 2., 10., 60.);
35164 assert_eq_m128(r, e);
35165 }
35166
35167 #[simd_test(enable = "avx512f")]
35168 unsafe fn test_mm_mask_add_round_ss() {
35169 let src = _mm_set_ps(10., 11., 100., 110.);
35170 let a = _mm_set_ps(1., 2., 10., 20.);
35171 let b = _mm_set_ps(3., 4., 30., 40.);
35172 let r = _mm_mask_add_round_ss(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
35173 let e = _mm_set_ps(1., 2., 10., 110.);
35174 assert_eq_m128(r, e);
35175 let r = _mm_mask_add_round_ss(
35176 src,
35177 0b11111111,
35178 a,
35179 b,
35180 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC,
35181 );
35182 let e = _mm_set_ps(1., 2., 10., 60.);
35183 assert_eq_m128(r, e);
35184 }
35185
35186 #[simd_test(enable = "avx512f")]
35187 unsafe fn test_mm_maskz_add_round_ss() {
35188 let a = _mm_set_ps(1., 2., 10., 20.);
35189 let b = _mm_set_ps(3., 4., 30., 40.);
35190 let r = _mm_maskz_add_round_ss(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
35191 let e = _mm_set_ps(1., 2., 10., 0.);
35192 assert_eq_m128(r, e);
35193 let r = _mm_maskz_add_round_ss(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
35194 let e = _mm_set_ps(1., 2., 10., 60.);
35195 assert_eq_m128(r, e);
35196 }
35197
35198 #[simd_test(enable = "avx512f")]
35199 unsafe fn test_mm_add_round_sd() {
35200 let a = _mm_set_pd(1., 2.);
35201 let b = _mm_set_pd(3., 4.);
35202 let r = _mm_add_round_sd(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
35203 let e = _mm_set_pd(1., 6.);
35204 assert_eq_m128d(r, e);
35205 }
35206
35207 #[simd_test(enable = "avx512f")]
35208 unsafe fn test_mm_mask_add_round_sd() {
35209 let src = _mm_set_pd(10., 11.);
35210 let a = _mm_set_pd(1., 2.);
35211 let b = _mm_set_pd(3., 4.);
35212 let r = _mm_mask_add_round_sd(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
35213 let e = _mm_set_pd(1., 11.);
35214 assert_eq_m128d(r, e);
35215 let r = _mm_mask_add_round_sd(
35216 src,
35217 0b11111111,
35218 a,
35219 b,
35220 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC,
35221 );
35222 let e = _mm_set_pd(1., 6.);
35223 assert_eq_m128d(r, e);
35224 }
35225
35226 #[simd_test(enable = "avx512f")]
35227 unsafe fn test_mm_maskz_add_round_sd() {
35228 let a = _mm_set_pd(1., 2.);
35229 let b = _mm_set_pd(3., 4.);
35230 let r = _mm_maskz_add_round_sd(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
35231 let e = _mm_set_pd(1., 0.);
35232 assert_eq_m128d(r, e);
35233 let r = _mm_maskz_add_round_sd(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
35234 let e = _mm_set_pd(1., 6.);
35235 assert_eq_m128d(r, e);
35236 }
35237
35238 #[simd_test(enable = "avx512f")]
35239 unsafe fn test_mm_sub_round_ss() {
35240 let a = _mm_set_ps(1., 2., 10., 20.);
35241 let b = _mm_set_ps(3., 4., 30., 40.);
35242 let r = _mm_sub_round_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
35243 let e = _mm_set_ps(1., 2., 10., -20.);
35244 assert_eq_m128(r, e);
35245 }
35246
35247 #[simd_test(enable = "avx512f")]
35248 unsafe fn test_mm_mask_sub_round_ss() {
35249 let src = _mm_set_ps(10., 11., 100., 110.);
35250 let a = _mm_set_ps(1., 2., 10., 20.);
35251 let b = _mm_set_ps(3., 4., 30., 40.);
35252 let r = _mm_mask_sub_round_ss(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
35253 let e = _mm_set_ps(1., 2., 10., 110.);
35254 assert_eq_m128(r, e);
35255 let r = _mm_mask_sub_round_ss(
35256 src,
35257 0b11111111,
35258 a,
35259 b,
35260 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC,
35261 );
35262 let e = _mm_set_ps(1., 2., 10., -20.);
35263 assert_eq_m128(r, e);
35264 }
35265
35266 #[simd_test(enable = "avx512f")]
35267 unsafe fn test_mm_maskz_sub_round_ss() {
35268 let a = _mm_set_ps(1., 2., 10., 20.);
35269 let b = _mm_set_ps(3., 4., 30., 40.);
35270 let r = _mm_maskz_sub_round_ss(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
35271 let e = _mm_set_ps(1., 2., 10., 0.);
35272 assert_eq_m128(r, e);
35273 let r = _mm_maskz_sub_round_ss(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
35274 let e = _mm_set_ps(1., 2., 10., -20.);
35275 assert_eq_m128(r, e);
35276 }
35277
35278 #[simd_test(enable = "avx512f")]
35279 unsafe fn test_mm_sub_round_sd() {
35280 let a = _mm_set_pd(1., 2.);
35281 let b = _mm_set_pd(3., 4.);
35282 let r = _mm_sub_round_sd(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
35283 let e = _mm_set_pd(1., -2.);
35284 assert_eq_m128d(r, e);
35285 }
35286
35287 #[simd_test(enable = "avx512f")]
35288 unsafe fn test_mm_mask_sub_round_sd() {
35289 let src = _mm_set_pd(10., 11.);
35290 let a = _mm_set_pd(1., 2.);
35291 let b = _mm_set_pd(3., 4.);
35292 let r = _mm_mask_sub_round_sd(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
35293 let e = _mm_set_pd(1., 11.);
35294 assert_eq_m128d(r, e);
35295 let r = _mm_mask_sub_round_sd(
35296 src,
35297 0b11111111,
35298 a,
35299 b,
35300 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC,
35301 );
35302 let e = _mm_set_pd(1., -2.);
35303 assert_eq_m128d(r, e);
35304 }
35305
35306 #[simd_test(enable = "avx512f")]
35307 unsafe fn test_mm_maskz_sub_round_sd() {
35308 let a = _mm_set_pd(1., 2.);
35309 let b = _mm_set_pd(3., 4.);
35310 let r = _mm_maskz_sub_round_sd(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
35311 let e = _mm_set_pd(1., 0.);
35312 assert_eq_m128d(r, e);
35313 let r = _mm_maskz_sub_round_sd(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
35314 let e = _mm_set_pd(1., -2.);
35315 assert_eq_m128d(r, e);
35316 }
35317
35318 #[simd_test(enable = "avx512f")]
35319 unsafe fn test_mm_mul_round_ss() {
35320 let a = _mm_set_ps(1., 2., 10., 20.);
35321 let b = _mm_set_ps(3., 4., 30., 40.);
35322 let r = _mm_mul_round_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
35323 let e = _mm_set_ps(1., 2., 10., 800.);
35324 assert_eq_m128(r, e);
35325 }
35326
35327 #[simd_test(enable = "avx512f")]
35328 unsafe fn test_mm_mask_mul_round_ss() {
35329 let src = _mm_set_ps(10., 11., 100., 110.);
35330 let a = _mm_set_ps(1., 2., 10., 20.);
35331 let b = _mm_set_ps(3., 4., 30., 40.);
35332 let r = _mm_mask_mul_round_ss(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
35333 let e = _mm_set_ps(1., 2., 10., 110.);
35334 assert_eq_m128(r, e);
35335 let r = _mm_mask_mul_round_ss(
35336 src,
35337 0b11111111,
35338 a,
35339 b,
35340 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC,
35341 );
35342 let e = _mm_set_ps(1., 2., 10., 800.);
35343 assert_eq_m128(r, e);
35344 }
35345
35346 #[simd_test(enable = "avx512f")]
35347 unsafe fn test_mm_maskz_mul_round_ss() {
35348 let a = _mm_set_ps(1., 2., 10., 20.);
35349 let b = _mm_set_ps(3., 4., 30., 40.);
35350 let r = _mm_maskz_mul_round_ss(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
35351 let e = _mm_set_ps(1., 2., 10., 0.);
35352 assert_eq_m128(r, e);
35353 let r = _mm_maskz_mul_round_ss(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
35354 let e = _mm_set_ps(1., 2., 10., 800.);
35355 assert_eq_m128(r, e);
35356 }
35357
35358 #[simd_test(enable = "avx512f")]
35359 unsafe fn test_mm_mul_round_sd() {
35360 let a = _mm_set_pd(1., 2.);
35361 let b = _mm_set_pd(3., 4.);
35362 let r = _mm_mul_round_sd(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
35363 let e = _mm_set_pd(1., 8.);
35364 assert_eq_m128d(r, e);
35365 }
35366
35367 #[simd_test(enable = "avx512f")]
35368 unsafe fn test_mm_mask_mul_round_sd() {
35369 let src = _mm_set_pd(10., 11.);
35370 let a = _mm_set_pd(1., 2.);
35371 let b = _mm_set_pd(3., 4.);
35372 let r = _mm_mask_mul_round_sd(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
35373 let e = _mm_set_pd(1., 11.);
35374 assert_eq_m128d(r, e);
35375 let r = _mm_mask_mul_round_sd(
35376 src,
35377 0b11111111,
35378 a,
35379 b,
35380 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC,
35381 );
35382 let e = _mm_set_pd(1., 8.);
35383 assert_eq_m128d(r, e);
35384 }
35385
35386 #[simd_test(enable = "avx512f")]
35387 unsafe fn test_mm_maskz_mul_round_sd() {
35388 let a = _mm_set_pd(1., 2.);
35389 let b = _mm_set_pd(3., 4.);
35390 let r = _mm_maskz_mul_round_sd(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
35391 let e = _mm_set_pd(1., 0.);
35392 assert_eq_m128d(r, e);
35393 let r = _mm_maskz_mul_round_sd(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
35394 let e = _mm_set_pd(1., 8.);
35395 assert_eq_m128d(r, e);
35396 }
35397
35398 #[simd_test(enable = "avx512f")]
35399 unsafe fn test_mm_div_round_ss() {
35400 let a = _mm_set_ps(1., 2., 10., 20.);
35401 let b = _mm_set_ps(3., 4., 30., 40.);
35402 let r = _mm_div_round_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
35403 let e = _mm_set_ps(1., 2., 10., 0.5);
35404 assert_eq_m128(r, e);
35405 }
35406
35407 #[simd_test(enable = "avx512f")]
35408 unsafe fn test_mm_mask_div_round_ss() {
35409 let src = _mm_set_ps(10., 11., 100., 110.);
35410 let a = _mm_set_ps(1., 2., 10., 20.);
35411 let b = _mm_set_ps(3., 4., 30., 40.);
35412 let r = _mm_mask_div_round_ss(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
35413 let e = _mm_set_ps(1., 2., 10., 110.);
35414 assert_eq_m128(r, e);
35415 let r = _mm_mask_div_round_ss(
35416 src,
35417 0b11111111,
35418 a,
35419 b,
35420 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC,
35421 );
35422 let e = _mm_set_ps(1., 2., 10., 0.5);
35423 assert_eq_m128(r, e);
35424 }
35425
35426 #[simd_test(enable = "avx512f")]
35427 unsafe fn test_mm_maskz_div_round_ss() {
35428 let a = _mm_set_ps(1., 2., 10., 20.);
35429 let b = _mm_set_ps(3., 4., 30., 40.);
35430 let r = _mm_maskz_div_round_ss(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
35431 let e = _mm_set_ps(1., 2., 10., 0.);
35432 assert_eq_m128(r, e);
35433 let r = _mm_maskz_div_round_ss(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
35434 let e = _mm_set_ps(1., 2., 10., 0.5);
35435 assert_eq_m128(r, e);
35436 }
35437
35438 #[simd_test(enable = "avx512f")]
35439 unsafe fn test_mm_div_round_sd() {
35440 let a = _mm_set_pd(1., 2.);
35441 let b = _mm_set_pd(3., 4.);
35442 let r = _mm_div_round_sd(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
35443 let e = _mm_set_pd(1., 0.5);
35444 assert_eq_m128d(r, e);
35445 }
35446
35447 #[simd_test(enable = "avx512f")]
35448 unsafe fn test_mm_mask_div_round_sd() {
35449 let src = _mm_set_pd(10., 11.);
35450 let a = _mm_set_pd(1., 2.);
35451 let b = _mm_set_pd(3., 4.);
35452 let r = _mm_mask_div_round_sd(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
35453 let e = _mm_set_pd(1., 11.);
35454 assert_eq_m128d(r, e);
35455 let r = _mm_mask_div_round_sd(
35456 src,
35457 0b11111111,
35458 a,
35459 b,
35460 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC,
35461 );
35462 let e = _mm_set_pd(1., 0.5);
35463 assert_eq_m128d(r, e);
35464 }
35465
35466 #[simd_test(enable = "avx512f")]
35467 unsafe fn test_mm_maskz_div_round_sd() {
35468 let a = _mm_set_pd(1., 2.);
35469 let b = _mm_set_pd(3., 4.);
35470 let r = _mm_maskz_div_round_sd(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
35471 let e = _mm_set_pd(1., 0.);
35472 assert_eq_m128d(r, e);
35473 let r = _mm_maskz_div_round_sd(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
35474 let e = _mm_set_pd(1., 0.5);
35475 assert_eq_m128d(r, e);
35476 }
35477
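// The scalar max/min tests below compare only the low lanes and copy the upper lanes
// from `a`; _MM_FROUND_CUR_DIRECTION means no SAE, i.e. the current MXCSR settings apply.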
35478 #[simd_test(enable = "avx512f")]
35479 unsafe fn test_mm_max_round_ss() {
35480 let a = _mm_set_ps(0., 1., 2., 3.);
35481 let b = _mm_set_ps(4., 5., 6., 7.);
35482 let r = _mm_max_round_ss(a, b, _MM_FROUND_CUR_DIRECTION);
35483 let e = _mm_set_ps(0., 1., 2., 7.);
35484 assert_eq_m128(r, e);
35485 }
35486
35487 #[simd_test(enable = "avx512f")]
35488 unsafe fn test_mm_mask_max_round_ss() {
35489 let a = _mm_set_ps(0., 1., 2., 3.);
35490 let b = _mm_set_ps(4., 5., 6., 7.);
35491 let r = _mm_mask_max_round_ss(a, 0, a, b, _MM_FROUND_CUR_DIRECTION);
35492 let e = _mm_set_ps(0., 1., 2., 3.);
35493 assert_eq_m128(r, e);
35494 let r = _mm_mask_max_round_ss(a, 0b11111111, a, b, _MM_FROUND_CUR_DIRECTION);
35495 let e = _mm_set_ps(0., 1., 2., 7.);
35496 assert_eq_m128(r, e);
35497 }
35498
35499 #[simd_test(enable = "avx512f")]
35500 unsafe fn test_mm_maskz_max_round_ss() {
35501 let a = _mm_set_ps(0., 1., 2., 3.);
35502 let b = _mm_set_ps(4., 5., 6., 7.);
35503 let r = _mm_maskz_max_round_ss(0, a, b, _MM_FROUND_CUR_DIRECTION);
35504 let e = _mm_set_ps(0., 1., 2., 0.);
35505 assert_eq_m128(r, e);
35506 let r = _mm_maskz_max_round_ss(0b11111111, a, b, _MM_FROUND_CUR_DIRECTION);
35507 let e = _mm_set_ps(0., 1., 2., 7.);
35508 assert_eq_m128(r, e);
35509 }
35510
35511 #[simd_test(enable = "avx512f")]
35512 unsafe fn test_mm_max_round_sd() {
35513 let a = _mm_set_pd(0., 1.);
35514 let b = _mm_set_pd(2., 3.);
35515 let r = _mm_max_round_sd(a, b, _MM_FROUND_CUR_DIRECTION);
35516 let e = _mm_set_pd(0., 3.);
35517 assert_eq_m128d(r, e);
35518 }
35519
35520 #[simd_test(enable = "avx512f")]
35521 unsafe fn test_mm_mask_max_round_sd() {
35522 let a = _mm_set_pd(0., 1.);
35523 let b = _mm_set_pd(2., 3.);
35524 let r = _mm_mask_max_round_sd(a, 0, a, b, _MM_FROUND_CUR_DIRECTION);
35525 let e = _mm_set_pd(0., 1.);
35526 assert_eq_m128d(r, e);
35527 let r = _mm_mask_max_round_sd(a, 0b11111111, a, b, _MM_FROUND_CUR_DIRECTION);
35528 let e = _mm_set_pd(0., 3.);
35529 assert_eq_m128d(r, e);
35530 }
35531
35532 #[simd_test(enable = "avx512f")]
35533 unsafe fn test_mm_maskz_max_round_sd() {
35534 let a = _mm_set_pd(0., 1.);
35535 let b = _mm_set_pd(2., 3.);
35536 let r = _mm_maskz_max_round_sd(0, a, b, _MM_FROUND_CUR_DIRECTION);
35537 let e = _mm_set_pd(0., 0.);
35538 assert_eq_m128d(r, e);
35539 let r = _mm_maskz_max_round_sd(0b11111111, a, b, _MM_FROUND_CUR_DIRECTION);
35540 let e = _mm_set_pd(0., 3.);
35541 assert_eq_m128d(r, e);
35542 }
35543
35544 #[simd_test(enable = "avx512f")]
35545 unsafe fn test_mm_min_round_ss() {
35546 let a = _mm_set_ps(0., 1., 2., 3.);
35547 let b = _mm_set_ps(4., 5., 6., 7.);
35548 let r = _mm_min_round_ss(a, b, _MM_FROUND_CUR_DIRECTION);
35549 let e = _mm_set_ps(0., 1., 2., 3.);
35550 assert_eq_m128(r, e);
35551 }
35552
35553 #[simd_test(enable = "avx512f")]
35554 unsafe fn test_mm_mask_min_round_ss() {
35555 let a = _mm_set_ps(0., 1., 2., 3.);
35556 let b = _mm_set_ps(4., 5., 6., 7.);
35557 let r = _mm_mask_min_round_ss(a, 0, a, b, _MM_FROUND_CUR_DIRECTION);
35558 let e = _mm_set_ps(0., 1., 2., 3.);
35559 assert_eq_m128(r, e);
35560 let r = _mm_mask_min_round_ss(a, 0b11111111, a, b, _MM_FROUND_CUR_DIRECTION);
35561 let e = _mm_set_ps(0., 1., 2., 3.);
35562 assert_eq_m128(r, e);
35563 }
35564
35565 #[simd_test(enable = "avx512f")]
35566 unsafe fn test_mm_maskz_min_round_ss() {
35567 let a = _mm_set_ps(0., 1., 2., 3.);
35568 let b = _mm_set_ps(4., 5., 6., 7.);
35569 let r = _mm_maskz_min_round_ss(0, a, b, _MM_FROUND_CUR_DIRECTION);
35570 let e = _mm_set_ps(0., 1., 2., 0.);
35571 assert_eq_m128(r, e);
35572 let r = _mm_maskz_min_round_ss(0b11111111, a, b, _MM_FROUND_CUR_DIRECTION);
35573 let e = _mm_set_ps(0., 1., 2., 3.);
35574 assert_eq_m128(r, e);
35575 }
35576
35577 #[simd_test(enable = "avx512f")]
35578 unsafe fn test_mm_min_round_sd() {
35579 let a = _mm_set_pd(0., 1.);
35580 let b = _mm_set_pd(2., 3.);
35581 let r = _mm_min_round_sd(a, b, _MM_FROUND_CUR_DIRECTION);
35582 let e = _mm_set_pd(0., 1.);
35583 assert_eq_m128d(r, e);
35584 }
35585
35586 #[simd_test(enable = "avx512f")]
35587 unsafe fn test_mm_mask_min_round_sd() {
35588 let a = _mm_set_pd(0., 1.);
35589 let b = _mm_set_pd(2., 3.);
35590 let r = _mm_mask_min_round_sd(a, 0, a, b, _MM_FROUND_CUR_DIRECTION);
35591 let e = _mm_set_pd(0., 1.);
35592 assert_eq_m128d(r, e);
35593 let r = _mm_mask_min_round_sd(a, 0b11111111, a, b, _MM_FROUND_CUR_DIRECTION);
35594 let e = _mm_set_pd(0., 1.);
35595 assert_eq_m128d(r, e);
35596 }
35597
35598 #[simd_test(enable = "avx512f")]
35599 unsafe fn test_mm_maskz_min_round_sd() {
35600 let a = _mm_set_pd(0., 1.);
35601 let b = _mm_set_pd(2., 3.);
35602 let r = _mm_maskz_min_round_sd(0, a, b, _MM_FROUND_CUR_DIRECTION);
35603 let e = _mm_set_pd(0., 0.);
35604 assert_eq_m128d(r, e);
35605 let r = _mm_maskz_min_round_sd(0b11111111, a, b, _MM_FROUND_CUR_DIRECTION);
35606 let e = _mm_set_pd(0., 1.);
35607 assert_eq_m128d(r, e);
35608 }
35609
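// _mm_sqrt_round_ss/sd take the square root of the low lane of `b` (here 4. -> 2.) and
// copy the upper lanes from `a`; the mask/maskz variants blend the low lane with
// `src`/zero as above.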
35610 #[simd_test(enable = "avx512f")]
35611 unsafe fn test_mm_sqrt_round_ss() {
35612 let a = _mm_set_ps(1., 2., 10., 20.);
35613 let b = _mm_set_ps(3., 4., 30., 4.);
35614 let r = _mm_sqrt_round_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
35615 let e = _mm_set_ps(1., 2., 10., 2.);
35616 assert_eq_m128(r, e);
35617 }
35618
35619 #[simd_test(enable = "avx512f")]
35620 unsafe fn test_mm_mask_sqrt_round_ss() {
35621 let src = _mm_set_ps(10., 11., 100., 110.);
35622 let a = _mm_set_ps(1., 2., 10., 20.);
35623 let b = _mm_set_ps(3., 4., 30., 4.);
35624 let r = _mm_mask_sqrt_round_ss(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
35625 let e = _mm_set_ps(1., 2., 10., 110.);
35626 assert_eq_m128(r, e);
35627 let r = _mm_mask_sqrt_round_ss(
35628 src,
35629 0b11111111,
35630 a,
35631 b,
35632 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC,
35633 );
35634 let e = _mm_set_ps(1., 2., 10., 2.);
35635 assert_eq_m128(r, e);
35636 }
35637
35638 #[simd_test(enable = "avx512f")]
35639 unsafe fn test_mm_maskz_sqrt_round_ss() {
35640 let a = _mm_set_ps(1., 2., 10., 20.);
35641 let b = _mm_set_ps(3., 4., 30., 4.);
35642 let r = _mm_maskz_sqrt_round_ss(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
35643 let e = _mm_set_ps(1., 2., 10., 0.);
35644 assert_eq_m128(r, e);
35645 let r = _mm_maskz_sqrt_round_ss(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
35646 let e = _mm_set_ps(1., 2., 10., 2.);
35647 assert_eq_m128(r, e);
35648 }
35649
35650 #[simd_test(enable = "avx512f")]
35651 unsafe fn test_mm_sqrt_round_sd() {
35652 let a = _mm_set_pd(1., 2.);
35653 let b = _mm_set_pd(3., 4.);
35654 let r = _mm_sqrt_round_sd(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
35655 let e = _mm_set_pd(1., 2.);
35656 assert_eq_m128d(r, e);
35657 }
35658
35659 #[simd_test(enable = "avx512f")]
35660 unsafe fn test_mm_mask_sqrt_round_sd() {
35661 let src = _mm_set_pd(10., 11.);
35662 let a = _mm_set_pd(1., 2.);
35663 let b = _mm_set_pd(3., 4.);
35664 let r = _mm_mask_sqrt_round_sd(src, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
35665 let e = _mm_set_pd(1., 11.);
35666 assert_eq_m128d(r, e);
35667 let r = _mm_mask_sqrt_round_sd(
35668 src,
35669 0b11111111,
35670 a,
35671 b,
35672 _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC,
35673 );
35674 let e = _mm_set_pd(1., 2.);
35675 assert_eq_m128d(r, e);
35676 }
35677
35678 #[simd_test(enable = "avx512f")]
35679 unsafe fn test_mm_maskz_sqrt_round_sd() {
35680 let a = _mm_set_pd(1., 2.);
35681 let b = _mm_set_pd(3., 4.);
35682 let r = _mm_maskz_sqrt_round_sd(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
35683 let e = _mm_set_pd(1., 0.);
35684 assert_eq_m128d(r, e);
35685 let r = _mm_maskz_sqrt_round_sd(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
35686 let e = _mm_set_pd(1., 2.);
35687 assert_eq_m128d(r, e);
35688 }
35689
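// getexp returns floor(log2(|x|)) of the low lane of `b`: getexp(3.) == 1.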
35690 #[simd_test(enable = "avx512f")]
35691 unsafe fn test_mm_getexp_round_ss() {
35692 let a = _mm_set1_ps(2.);
35693 let b = _mm_set1_ps(3.);
35694 let r = _mm_getexp_round_ss(a, b, _MM_FROUND_CUR_DIRECTION);
35695 let e = _mm_set_ps(2., 2., 2., 1.);
35696 assert_eq_m128(r, e);
35697 }
35698
35699 #[simd_test(enable = "avx512f")]
35700 unsafe fn test_mm_mask_getexp_round_ss() {
35701 let a = _mm_set1_ps(2.);
35702 let b = _mm_set1_ps(3.);
35703 let r = _mm_mask_getexp_round_ss(a, 0, a, b, _MM_FROUND_CUR_DIRECTION);
35704 let e = _mm_set_ps(2., 2., 2., 2.);
35705 assert_eq_m128(r, e);
35706 let r = _mm_mask_getexp_round_ss(a, 0b11111111, a, b, _MM_FROUND_CUR_DIRECTION);
35707 let e = _mm_set_ps(2., 2., 2., 1.);
35708 assert_eq_m128(r, e);
35709 }
35710
35711 #[simd_test(enable = "avx512f")]
35712 unsafe fn test_mm_maskz_getexp_round_ss() {
35713 let a = _mm_set1_ps(2.);
35714 let b = _mm_set1_ps(3.);
35715 let r = _mm_maskz_getexp_round_ss(0, a, b, _MM_FROUND_CUR_DIRECTION);
35716 let e = _mm_set_ps(2., 2., 2., 0.);
35717 assert_eq_m128(r, e);
35718 let r = _mm_maskz_getexp_round_ss(0b11111111, a, b, _MM_FROUND_CUR_DIRECTION);
35719 let e = _mm_set_ps(2., 2., 2., 1.);
35720 assert_eq_m128(r, e);
35721 }
35722
35723 #[simd_test(enable = "avx512f")]
35724 unsafe fn test_mm_getexp_round_sd() {
35725 let a = _mm_set1_pd(2.);
35726 let b = _mm_set1_pd(3.);
35727 let r = _mm_getexp_round_sd(a, b, _MM_FROUND_CUR_DIRECTION);
35728 let e = _mm_set_pd(2., 1.);
35729 assert_eq_m128d(r, e);
35730 }
35731
35732 #[simd_test(enable = "avx512f")]
35733 unsafe fn test_mm_mask_getexp_round_sd() {
35734 let a = _mm_set1_pd(2.);
35735 let b = _mm_set1_pd(3.);
35736 let r = _mm_mask_getexp_round_sd(a, 0, a, b, _MM_FROUND_CUR_DIRECTION);
35737 let e = _mm_set_pd(2., 2.);
35738 assert_eq_m128d(r, e);
35739 let r = _mm_mask_getexp_round_sd(a, 0b11111111, a, b, _MM_FROUND_CUR_DIRECTION);
35740 let e = _mm_set_pd(2., 1.);
35741 assert_eq_m128d(r, e);
35742 }
35743
35744 #[simd_test(enable = "avx512f")]
35745 unsafe fn test_mm_maskz_getexp_round_sd() {
35746 let a = _mm_set1_pd(2.);
35747 let b = _mm_set1_pd(3.);
35748 let r = _mm_maskz_getexp_round_sd(0, a, b, _MM_FROUND_CUR_DIRECTION);
35749 let e = _mm_set_pd(2., 0.);
35750 assert_eq_m128d(r, e);
35751 let r = _mm_maskz_getexp_round_sd(0b11111111, a, b, _MM_FROUND_CUR_DIRECTION);
35752 let e = _mm_set_pd(2., 1.);
35753 assert_eq_m128d(r, e);
35754 }
35755
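// getmant extracts the mantissa of the low lane of `b`, normalized to [1, 2)
// (_MM_MANT_NORM_1_2) with the sign taken from the source (_MM_MANT_SIGN_SRC):
// 10. == 1.25 * 2^3, so the expected low lane is 1.25.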
35756 #[simd_test(enable = "avx512f")]
35757 unsafe fn test_mm_getmant_round_ss() {
35758 let a = _mm_set1_ps(20.);
35759 let b = _mm_set1_ps(10.);
35760 let r = _mm_getmant_round_ss(
35761 a,
35762 b,
35763 _MM_MANT_NORM_1_2,
35764 _MM_MANT_SIGN_SRC,
35765 _MM_FROUND_CUR_DIRECTION,
35766 );
35767 let e = _mm_set_ps(20., 20., 20., 1.25);
35768 assert_eq_m128(r, e);
35769 }
35770
35771 #[simd_test(enable = "avx512f")]
35772 unsafe fn test_mm_mask_getmant_round_ss() {
35773 let a = _mm_set1_ps(20.);
35774 let b = _mm_set1_ps(10.);
35775 let r = _mm_mask_getmant_round_ss(
35776 a,
35777 0,
35778 a,
35779 b,
35780 _MM_MANT_NORM_1_2,
35781 _MM_MANT_SIGN_SRC,
35782 _MM_FROUND_CUR_DIRECTION,
35783 );
35784 let e = _mm_set_ps(20., 20., 20., 20.);
35785 assert_eq_m128(r, e);
35786 let r = _mm_mask_getmant_round_ss(
35787 a,
35788 0b11111111,
35789 a,
35790 b,
35791 _MM_MANT_NORM_1_2,
35792 _MM_MANT_SIGN_SRC,
35793 _MM_FROUND_CUR_DIRECTION,
35794 );
35795 let e = _mm_set_ps(20., 20., 20., 1.25);
35796 assert_eq_m128(r, e);
35797 }
35798
35799 #[simd_test(enable = "avx512f")]
35800 unsafe fn test_mm_maskz_getmant_round_ss() {
35801 let a = _mm_set1_ps(20.);
35802 let b = _mm_set1_ps(10.);
35803 let r = _mm_maskz_getmant_round_ss(
35804 0,
35805 a,
35806 b,
35807 _MM_MANT_NORM_1_2,
35808 _MM_MANT_SIGN_SRC,
35809 _MM_FROUND_CUR_DIRECTION,
35810 );
35811 let e = _mm_set_ps(20., 20., 20., 0.);
35812 assert_eq_m128(r, e);
35813 let r = _mm_maskz_getmant_round_ss(
35814 0b11111111,
35815 a,
35816 b,
35817 _MM_MANT_NORM_1_2,
35818 _MM_MANT_SIGN_SRC,
35819 _MM_FROUND_CUR_DIRECTION,
35820 );
35821 let e = _mm_set_ps(20., 20., 20., 1.25);
35822 assert_eq_m128(r, e);
35823 }
35824
35825 #[simd_test(enable = "avx512f")]
35826 unsafe fn test_mm_getmant_round_sd() {
35827 let a = _mm_set1_pd(20.);
35828 let b = _mm_set1_pd(10.);
35829 let r = _mm_getmant_round_sd(
35830 a,
35831 b,
35832 _MM_MANT_NORM_1_2,
35833 _MM_MANT_SIGN_SRC,
35834 _MM_FROUND_CUR_DIRECTION,
35835 );
35836 let e = _mm_set_pd(20., 1.25);
35837 assert_eq_m128d(r, e);
35838 }
35839
35840 #[simd_test(enable = "avx512f")]
35841 unsafe fn test_mm_mask_getmant_round_sd() {
35842 let a = _mm_set1_pd(20.);
35843 let b = _mm_set1_pd(10.);
35844 let r = _mm_mask_getmant_round_sd(
35845 a,
35846 0,
35847 a,
35848 b,
35849 _MM_MANT_NORM_1_2,
35850 _MM_MANT_SIGN_SRC,
35851 _MM_FROUND_CUR_DIRECTION,
35852 );
35853 let e = _mm_set_pd(20., 20.);
35854 assert_eq_m128d(r, e);
35855 let r = _mm_mask_getmant_round_sd(
35856 a,
35857 0b11111111,
35858 a,
35859 b,
35860 _MM_MANT_NORM_1_2,
35861 _MM_MANT_SIGN_SRC,
35862 _MM_FROUND_CUR_DIRECTION,
35863 );
35864 let e = _mm_set_pd(20., 1.25);
35865 assert_eq_m128d(r, e);
35866 }
35867
35868 #[simd_test(enable = "avx512f")]
35869 unsafe fn test_mm_maskz_getmant_round_sd() {
35870 let a = _mm_set1_pd(20.);
35871 let b = _mm_set1_pd(10.);
35872 let r = _mm_maskz_getmant_round_sd(
35873 0,
35874 a,
35875 b,
35876 _MM_MANT_NORM_1_2,
35877 _MM_MANT_SIGN_SRC,
35878 _MM_FROUND_CUR_DIRECTION,
35879 );
35880 let e = _mm_set_pd(20., 0.);
35881 assert_eq_m128d(r, e);
35882 let r = _mm_maskz_getmant_round_sd(
35883 0b11111111,
35884 a,
35885 b,
35886 _MM_MANT_NORM_1_2,
35887 _MM_MANT_SIGN_SRC,
35888 _MM_FROUND_CUR_DIRECTION,
35889 );
35890 let e = _mm_set_pd(20., 1.25);
35891 assert_eq_m128d(r, e);
35892 }
35893
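// roundscale rounds the low lane of `b`, keeping imm8[7:4] fraction bits (imm8[3:0]
// selects the rounding mode); with imm8 == 0 this rounds to the nearest integer,
// so 1.1 becomes 1.0.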
35894 #[simd_test(enable = "avx512f")]
35895 unsafe fn test_mm_roundscale_round_ss() {
35896 let a = _mm_set1_ps(2.2);
35897 let b = _mm_set1_ps(1.1);
35898 let r = _mm_roundscale_round_ss(a, b, 0, _MM_FROUND_CUR_DIRECTION);
35899 let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0);
35900 assert_eq_m128(r, e);
35901 }
35902
35903 #[simd_test(enable = "avx512f")]
35904 unsafe fn test_mm_mask_roundscale_round_ss() {
35905 let a = _mm_set1_ps(2.2);
35906 let b = _mm_set1_ps(1.1);
35907 let r = _mm_mask_roundscale_round_ss(a, 0, a, b, 0, _MM_FROUND_CUR_DIRECTION);
35908 let e = _mm_set_ps(2.2, 2.2, 2.2, 2.2);
35909 assert_eq_m128(r, e);
35910 let r = _mm_mask_roundscale_round_ss(a, 0b11111111, a, b, 0, _MM_FROUND_CUR_DIRECTION);
35911 let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0);
35912 assert_eq_m128(r, e);
35913 }
35914
35915 #[simd_test(enable = "avx512f")]
35916 unsafe fn test_mm_maskz_roundscale_round_ss() {
35917 let a = _mm_set1_ps(2.2);
35918 let b = _mm_set1_ps(1.1);
35919 let r = _mm_maskz_roundscale_round_ss(0, a, b, 0, _MM_FROUND_CUR_DIRECTION);
35920 let e = _mm_set_ps(2.2, 2.2, 2.2, 0.0);
35921 assert_eq_m128(r, e);
35922 let r = _mm_maskz_roundscale_round_ss(0b11111111, a, b, 0, _MM_FROUND_CUR_DIRECTION);
35923 let e = _mm_set_ps(2.2, 2.2, 2.2, 1.0);
35924 assert_eq_m128(r, e);
35925 }
35926
35927 #[simd_test(enable = "avx512f")]
35928 unsafe fn test_mm_roundscale_round_sd() {
35929 let a = _mm_set1_pd(2.2);
35930 let b = _mm_set1_pd(1.1);
35931 let r = _mm_roundscale_round_sd(a, b, 0, _MM_FROUND_CUR_DIRECTION);
35932 let e = _mm_set_pd(2.2, 1.0);
35933 assert_eq_m128d(r, e);
35934 }
35935
35936 #[simd_test(enable = "avx512f")]
35937 unsafe fn test_mm_mask_roundscale_round_sd() {
35938 let a = _mm_set1_pd(2.2);
35939 let b = _mm_set1_pd(1.1);
35940 let r = _mm_mask_roundscale_round_sd(a, 0, a, b, 0, _MM_FROUND_CUR_DIRECTION);
35941 let e = _mm_set_pd(2.2, 2.2);
35942 assert_eq_m128d(r, e);
35943 let r = _mm_mask_roundscale_round_sd(a, 0b11111111, a, b, 0, _MM_FROUND_CUR_DIRECTION);
35944 let e = _mm_set_pd(2.2, 1.0);
35945 assert_eq_m128d(r, e);
35946 }
35947
35948 #[simd_test(enable = "avx512f")]
35949 unsafe fn test_mm_maskz_roundscale_round_sd() {
35950 let a = _mm_set1_pd(2.2);
35951 let b = _mm_set1_pd(1.1);
35952 let r = _mm_maskz_roundscale_round_sd(0, a, b, 0, _MM_FROUND_CUR_DIRECTION);
35953 let e = _mm_set_pd(2.2, 0.0);
35954 assert_eq_m128d(r, e);
35955 let r = _mm_maskz_roundscale_round_sd(0b11111111, a, b, 0, _MM_FROUND_CUR_DIRECTION);
35956 let e = _mm_set_pd(2.2, 1.0);
35957 assert_eq_m128d(r, e);
35958 }
35959
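// scalef computes a * 2^floor(b) on the low lanes: 1. * 2^3 == 8.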
35960 #[simd_test(enable = "avx512f")]
35961 unsafe fn test_mm_scalef_round_ss() {
35962 let a = _mm_set1_ps(1.);
35963 let b = _mm_set1_ps(3.);
35964 let r = _mm_scalef_round_ss(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
35965 let e = _mm_set_ps(1., 1., 1., 8.);
35966 assert_eq_m128(r, e);
35967 }
35968
35969 #[simd_test(enable = "avx512f")]
35970 unsafe fn test_mm_mask_scalef_round_ss() {
35971 let a = _mm_set1_ps(1.);
35972 let b = _mm_set1_ps(3.);
35973 let r = _mm_mask_scalef_round_ss(a, 0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
35974 let e = _mm_set_ps(1., 1., 1., 1.);
35975 assert_eq_m128(r, e);
35976 let r = _mm_mask_scalef_round_ss(
35977 a,
35978 0b11111111,
35979 a,
35980 b,
35981 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
35982 );
35983 let e = _mm_set_ps(1., 1., 1., 8.);
35984 assert_eq_m128(r, e);
35985 }
35986
35987 #[simd_test(enable = "avx512f")]
35988 unsafe fn test_mm_maskz_scalef_round_ss() {
35989 let a = _mm_set1_ps(1.);
35990 let b = _mm_set1_ps(3.);
35991 let r = _mm_maskz_scalef_round_ss(0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
35992 let e = _mm_set_ps(1., 1., 1., 0.);
35993 assert_eq_m128(r, e);
35994 let r = _mm_maskz_scalef_round_ss(
35995 0b11111111,
35996 a,
35997 b,
35998 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
35999 );
36000 let e = _mm_set_ps(1., 1., 1., 8.);
36001 assert_eq_m128(r, e);
36002 }
36003
36004 #[simd_test(enable = "avx512f")]
36005 unsafe fn test_mm_scalef_round_sd() {
36006 let a = _mm_set1_pd(1.);
36007 let b = _mm_set1_pd(3.);
36008 let r = _mm_scalef_round_sd(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
36009 let e = _mm_set_pd(1., 8.);
36010 assert_eq_m128d(r, e);
36011 }
36012
36013 #[simd_test(enable = "avx512f")]
36014 unsafe fn test_mm_mask_scalef_round_sd() {
36015 let a = _mm_set1_pd(1.);
36016 let b = _mm_set1_pd(3.);
36017 let r = _mm_mask_scalef_round_sd(a, 0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
36018 let e = _mm_set_pd(1., 1.);
36019 assert_eq_m128d(r, e);
36020 let r = _mm_mask_scalef_round_sd(
36021 a,
36022 0b11111111,
36023 a,
36024 b,
36025 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
36026 );
36027 let e = _mm_set_pd(1., 8.);
36028 assert_eq_m128d(r, e);
36029 }
36030
36031 #[simd_test(enable = "avx512f")]
36032 unsafe fn test_mm_maskz_scalef_round_sd() {
36033 let a = _mm_set1_pd(1.);
36034 let b = _mm_set1_pd(3.);
36035 let r = _mm_maskz_scalef_round_sd(0, a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
36036 let e = _mm_set_pd(1., 0.);
36037 assert_eq_m128d(r, e);
36038 let r = _mm_maskz_scalef_round_sd(
36039 0b11111111,
36040 a,
36041 b,
36042 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
36043 );
36044 let e = _mm_set_pd(1., 8.);
36045 assert_eq_m128d(r, e);
36046 }
36047
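// The *_round FMA variants repeat the masked scalar FMA tests above, but with an explicit
// rounding mode: _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC selects round-to-nearest-even
// and suppresses exceptions, so the expected values match the non-rounding versions.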
36048 #[simd_test(enable = "avx512f")]
36049 unsafe fn test_mm_fmadd_round_ss() {
36050 let a = _mm_set1_ps(1.);
36051 let b = _mm_set1_ps(2.);
36052 let c = _mm_set1_ps(3.);
36053 let r = _mm_fmadd_round_ss(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
36054 let e = _mm_set_ps(1., 1., 1., 5.);
36055 assert_eq_m128(r, e);
36056 }
36057
36058 #[simd_test(enable = "avx512f")]
36059 unsafe fn test_mm_mask_fmadd_round_ss() {
36060 let a = _mm_set1_ps(1.);
36061 let b = _mm_set1_ps(2.);
36062 let c = _mm_set1_ps(3.);
36063 let r = _mm_mask_fmadd_round_ss(a, 0, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
36064 assert_eq_m128(r, a);
36065 let r = _mm_mask_fmadd_round_ss(
36066 a,
36067 0b11111111,
36068 b,
36069 c,
36070 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
36071 );
36072 let e = _mm_set_ps(1., 1., 1., 5.);
36073 assert_eq_m128(r, e);
36074 }
36075
36076 #[simd_test(enable = "avx512f")]
36077 unsafe fn test_mm_maskz_fmadd_round_ss() {
36078 let a = _mm_set1_ps(1.);
36079 let b = _mm_set1_ps(2.);
36080 let c = _mm_set1_ps(3.);
36081 let r = _mm_maskz_fmadd_round_ss(0, a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
36082 let e = _mm_set_ps(1., 1., 1., 0.);
36083 assert_eq_m128(r, e);
36084 let r = _mm_maskz_fmadd_round_ss(
36085 0b11111111,
36086 a,
36087 b,
36088 c,
36089 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
36090 );
36091 let e = _mm_set_ps(1., 1., 1., 5.);
36092 assert_eq_m128(r, e);
36093 }
36094
36095 #[simd_test(enable = "avx512f")]
36096 unsafe fn test_mm_mask3_fmadd_round_ss() {
36097 let a = _mm_set1_ps(1.);
36098 let b = _mm_set1_ps(2.);
36099 let c = _mm_set1_ps(3.);
36100 let r = _mm_mask3_fmadd_round_ss(a, b, c, 0, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
36101 assert_eq_m128(r, c);
36102 let r = _mm_mask3_fmadd_round_ss(
36103 a,
36104 b,
36105 c,
36106 0b11111111,
36107 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
36108 );
36109 let e = _mm_set_ps(3., 3., 3., 5.);
36110 assert_eq_m128(r, e);
36111 }
36112
36113 #[simd_test(enable = "avx512f")]
36114 unsafe fn test_mm_fmadd_round_sd() {
36115 let a = _mm_set1_pd(1.);
36116 let b = _mm_set1_pd(2.);
36117 let c = _mm_set1_pd(3.);
36118 let r = _mm_fmadd_round_sd(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
36119 let e = _mm_set_pd(1., 5.);
36120 assert_eq_m128d(r, e);
36121 }
36122
36123 #[simd_test(enable = "avx512f")]
36124 unsafe fn test_mm_mask_fmadd_round_sd() {
36125 let a = _mm_set1_pd(1.);
36126 let b = _mm_set1_pd(2.);
36127 let c = _mm_set1_pd(3.);
36128 let r = _mm_mask_fmadd_round_sd(a, 0, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
36129 assert_eq_m128d(r, a);
36130 let r = _mm_mask_fmadd_round_sd(
36131 a,
36132 0b11111111,
36133 b,
36134 c,
36135 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
36136 );
36137 let e = _mm_set_pd(1., 5.);
36138 assert_eq_m128d(r, e);
36139 }
36140
36141 #[simd_test(enable = "avx512f")]
36142 unsafe fn test_mm_maskz_fmadd_round_sd() {
36143 let a = _mm_set1_pd(1.);
36144 let b = _mm_set1_pd(2.);
36145 let c = _mm_set1_pd(3.);
36146 let r = _mm_maskz_fmadd_round_sd(0, a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
36147 let e = _mm_set_pd(1., 0.);
36148 assert_eq_m128d(r, e);
36149 let r = _mm_maskz_fmadd_round_sd(
36150 0b11111111,
36151 a,
36152 b,
36153 c,
36154 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
36155 );
36156 let e = _mm_set_pd(1., 5.);
36157 assert_eq_m128d(r, e);
36158 }
36159
36160 #[simd_test(enable = "avx512f")]
36161 unsafe fn test_mm_mask3_fmadd_round_sd() {
36162 let a = _mm_set1_pd(1.);
36163 let b = _mm_set1_pd(2.);
36164 let c = _mm_set1_pd(3.);
36165 let r = _mm_mask3_fmadd_round_sd(a, b, c, 0, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
36166 assert_eq_m128d(r, c);
36167 let r = _mm_mask3_fmadd_round_sd(
36168 a,
36169 b,
36170 c,
36171 0b11111111,
36172 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
36173 );
36174 let e = _mm_set_pd(3., 5.);
36175 assert_eq_m128d(r, e);
36176 }
36177
36178 #[simd_test(enable = "avx512f")]
36179 unsafe fn test_mm_fmsub_round_ss() {
36180 let a = _mm_set1_ps(1.);
36181 let b = _mm_set1_ps(2.);
36182 let c = _mm_set1_ps(3.);
36183 let r = _mm_fmsub_round_ss(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
36184 let e = _mm_set_ps(1., 1., 1., -1.);
36185 assert_eq_m128(r, e);
36186 }
36187
36188 #[simd_test(enable = "avx512f")]
36189 unsafe fn test_mm_mask_fmsub_round_ss() {
36190 let a = _mm_set1_ps(1.);
36191 let b = _mm_set1_ps(2.);
36192 let c = _mm_set1_ps(3.);
36193 let r = _mm_mask_fmsub_round_ss(a, 0, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
36194 assert_eq_m128(r, a);
36195 let r = _mm_mask_fmsub_round_ss(
36196 a,
36197 0b11111111,
36198 b,
36199 c,
36200 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
36201 );
36202 let e = _mm_set_ps(1., 1., 1., -1.);
36203 assert_eq_m128(r, e);
36204 }
36205
36206 #[simd_test(enable = "avx512f")]
36207 unsafe fn test_mm_maskz_fmsub_round_ss() {
36208 let a = _mm_set1_ps(1.);
36209 let b = _mm_set1_ps(2.);
36210 let c = _mm_set1_ps(3.);
36211 let r = _mm_maskz_fmsub_round_ss(0, a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
36212 let e = _mm_set_ps(1., 1., 1., 0.);
36213 assert_eq_m128(r, e);
36214 let r = _mm_maskz_fmsub_round_ss(
36215 0b11111111,
36216 a,
36217 b,
36218 c,
36219 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
36220 );
36221 let e = _mm_set_ps(1., 1., 1., -1.);
36222 assert_eq_m128(r, e);
36223 }
36224
36225 #[simd_test(enable = "avx512f")]
36226 unsafe fn test_mm_mask3_fmsub_round_ss() {
36227 let a = _mm_set1_ps(1.);
36228 let b = _mm_set1_ps(2.);
36229 let c = _mm_set1_ps(3.);
36230 let r = _mm_mask3_fmsub_round_ss(a, b, c, 0, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
36231 assert_eq_m128(r, c);
36232 let r = _mm_mask3_fmsub_round_ss(
36233 a,
36234 b,
36235 c,
36236 0b11111111,
36237 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
36238 );
36239 let e = _mm_set_ps(3., 3., 3., -1.);
36240 assert_eq_m128(r, e);
36241 }
36242
36243 #[simd_test(enable = "avx512f")]
36244 unsafe fn test_mm_fmsub_round_sd() {
36245 let a = _mm_set1_pd(1.);
36246 let b = _mm_set1_pd(2.);
36247 let c = _mm_set1_pd(3.);
36248 let r = _mm_fmsub_round_sd(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
36249 let e = _mm_set_pd(1., -1.);
36250 assert_eq_m128d(r, e);
36251 }
36252
36253 #[simd_test(enable = "avx512f")]
36254 unsafe fn test_mm_mask_fmsub_round_sd() {
36255 let a = _mm_set1_pd(1.);
36256 let b = _mm_set1_pd(2.);
36257 let c = _mm_set1_pd(3.);
36258 let r = _mm_mask_fmsub_round_sd(a, 0, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
36259 assert_eq_m128d(r, a);
36260 let r = _mm_mask_fmsub_round_sd(
36261 a,
36262 0b11111111,
36263 b,
36264 c,
36265 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
36266 );
36267 let e = _mm_set_pd(1., -1.);
36268 assert_eq_m128d(r, e);
36269 }
36270
36271 #[simd_test(enable = "avx512f")]
36272 unsafe fn test_mm_maskz_fmsub_round_sd() {
36273 let a = _mm_set1_pd(1.);
36274 let b = _mm_set1_pd(2.);
36275 let c = _mm_set1_pd(3.);
36276 let r = _mm_maskz_fmsub_round_sd(0, a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
36277 let e = _mm_set_pd(1., 0.);
36278 assert_eq_m128d(r, e);
36279 let r = _mm_maskz_fmsub_round_sd(
36280 0b11111111,
36281 a,
36282 b,
36283 c,
36284 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
36285 );
36286 let e = _mm_set_pd(1., -1.);
36287 assert_eq_m128d(r, e);
36288 }
36289
36290 #[simd_test(enable = "avx512f")]
36291 unsafe fn test_mm_mask3_fmsub_round_sd() {
36292 let a = _mm_set1_pd(1.);
36293 let b = _mm_set1_pd(2.);
36294 let c = _mm_set1_pd(3.);
36295 let r = _mm_mask3_fmsub_round_sd(a, b, c, 0, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
36296 assert_eq_m128d(r, c);
36297 let r = _mm_mask3_fmsub_round_sd(
36298 a,
36299 b,
36300 c,
36301 0b11111111,
36302 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
36303 );
36304 let e = _mm_set_pd(3., -1.);
36305 assert_eq_m128d(r, e);
36306 }
36307
36308 #[simd_test(enable = "avx512f")]
36309 unsafe fn test_mm_fnmadd_round_ss() {
36310 let a = _mm_set1_ps(1.);
36311 let b = _mm_set1_ps(2.);
36312 let c = _mm_set1_ps(3.);
36313 let r = _mm_fnmadd_round_ss(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
36314 let e = _mm_set_ps(1., 1., 1., 1.);
36315 assert_eq_m128(r, e);
36316 }
36317
36318 #[simd_test(enable = "avx512f")]
36319 unsafe fn test_mm_mask_fnmadd_round_ss() {
36320 let a = _mm_set1_ps(1.);
36321 let b = _mm_set1_ps(2.);
36322 let c = _mm_set1_ps(3.);
36323 let r = _mm_mask_fnmadd_round_ss(a, 0, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
36324 assert_eq_m128(r, a);
36325 let r = _mm_mask_fnmadd_round_ss(
36326 a,
36327 0b11111111,
36328 b,
36329 c,
36330 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
36331 );
36332 let e = _mm_set_ps(1., 1., 1., 1.);
36333 assert_eq_m128(r, e);
36334 }
36335
36336 #[simd_test(enable = "avx512f")]
36337 unsafe fn test_mm_maskz_fnmadd_round_ss() {
36338 let a = _mm_set1_ps(1.);
36339 let b = _mm_set1_ps(2.);
36340 let c = _mm_set1_ps(3.);
36341 let r =
36342 _mm_maskz_fnmadd_round_ss(0, a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
36343 let e = _mm_set_ps(1., 1., 1., 0.);
36344 assert_eq_m128(r, e);
36345 let r = _mm_maskz_fnmadd_round_ss(
36346 0b11111111,
36347 a,
36348 b,
36349 c,
36350 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
36351 );
36352 let e = _mm_set_ps(1., 1., 1., 1.);
36353 assert_eq_m128(r, e);
36354 }
36355
36356 #[simd_test(enable = "avx512f")]
36357 unsafe fn test_mm_mask3_fnmadd_round_ss() {
36358 let a = _mm_set1_ps(1.);
36359 let b = _mm_set1_ps(2.);
36360 let c = _mm_set1_ps(3.);
36361 let r =
36362 _mm_mask3_fnmadd_round_ss(a, b, c, 0, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
36363 assert_eq_m128(r, c);
36364 let r = _mm_mask3_fnmadd_round_ss(
36365 a,
36366 b,
36367 c,
36368 0b11111111,
36369 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
36370 );
36371 let e = _mm_set_ps(3., 3., 3., 1.);
36372 assert_eq_m128(r, e);
36373 }
36374
36375 #[simd_test(enable = "avx512f")]
36376 unsafe fn test_mm_fnmadd_round_sd() {
36377 let a = _mm_set1_pd(1.);
36378 let b = _mm_set1_pd(2.);
36379 let c = _mm_set1_pd(3.);
36380 let r = _mm_fnmadd_round_sd(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
36381 let e = _mm_set_pd(1., 1.);
36382 assert_eq_m128d(r, e);
36383 }
36384
36385 #[simd_test(enable = "avx512f")]
36386 unsafe fn test_mm_mask_fnmadd_round_sd() {
36387 let a = _mm_set1_pd(1.);
36388 let b = _mm_set1_pd(2.);
36389 let c = _mm_set1_pd(3.);
36390 let r = _mm_mask_fnmadd_round_sd(a, 0, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
36391 assert_eq_m128d(r, a);
36392 let r = _mm_mask_fnmadd_round_sd(
36393 a,
36394 0b11111111,
36395 b,
36396 c,
36397 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
36398 );
36399 let e = _mm_set_pd(1., 1.);
36400 assert_eq_m128d(r, e);
36401 }
36402
36403 #[simd_test(enable = "avx512f")]
36404 unsafe fn test_mm_maskz_fnmadd_round_sd() {
36405 let a = _mm_set1_pd(1.);
36406 let b = _mm_set1_pd(2.);
36407 let c = _mm_set1_pd(3.);
36408 let r =
36409 _mm_maskz_fnmadd_round_sd(0, a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
36410 let e = _mm_set_pd(1., 0.);
36411 assert_eq_m128d(r, e);
36412 let r = _mm_maskz_fnmadd_round_sd(
36413 0b11111111,
36414 a,
36415 b,
36416 c,
36417 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
36418 );
36419 let e = _mm_set_pd(1., 1.);
36420 assert_eq_m128d(r, e);
36421 }
36422
36423 #[simd_test(enable = "avx512f")]
36424 unsafe fn test_mm_mask3_fnmadd_round_sd() {
36425 let a = _mm_set1_pd(1.);
36426 let b = _mm_set1_pd(2.);
36427 let c = _mm_set1_pd(3.);
36428 let r =
36429 _mm_mask3_fnmadd_round_sd(a, b, c, 0, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
36430 assert_eq_m128d(r, c);
36431 let r = _mm_mask3_fnmadd_round_sd(
36432 a,
36433 b,
36434 c,
36435 0b11111111,
36436 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
36437 );
36438 let e = _mm_set_pd(3., 1.);
36439 assert_eq_m128d(r, e);
36440 }
36441
36442 #[simd_test(enable = "avx512f")]
36443 unsafe fn test_mm_fnmsub_round_ss() {
36444 let a = _mm_set1_ps(1.);
36445 let b = _mm_set1_ps(2.);
36446 let c = _mm_set1_ps(3.);
36447 let r = _mm_fnmsub_round_ss(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
36448 let e = _mm_set_ps(1., 1., 1., -5.);
36449 assert_eq_m128(r, e);
36450 }
36451
36452 #[simd_test(enable = "avx512f")]
36453 unsafe fn test_mm_mask_fnmsub_round_ss() {
36454 let a = _mm_set1_ps(1.);
36455 let b = _mm_set1_ps(2.);
36456 let c = _mm_set1_ps(3.);
36457 let r = _mm_mask_fnmsub_round_ss(a, 0, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
36458 assert_eq_m128(r, a);
36459 let r = _mm_mask_fnmsub_round_ss(
36460 a,
36461 0b11111111,
36462 b,
36463 c,
36464 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
36465 );
36466 let e = _mm_set_ps(1., 1., 1., -5.);
36467 assert_eq_m128(r, e);
36468 }
36469
36470 #[simd_test(enable = "avx512f")]
36471 unsafe fn test_mm_maskz_fnmsub_round_ss() {
36472 let a = _mm_set1_ps(1.);
36473 let b = _mm_set1_ps(2.);
36474 let c = _mm_set1_ps(3.);
36475 let r =
36476 _mm_maskz_fnmsub_round_ss(0, a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
36477 let e = _mm_set_ps(1., 1., 1., 0.);
36478 assert_eq_m128(r, e);
36479 let r = _mm_maskz_fnmsub_round_ss(
36480 0b11111111,
36481 a,
36482 b,
36483 c,
36484 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
36485 );
36486 let e = _mm_set_ps(1., 1., 1., -5.);
36487 assert_eq_m128(r, e);
36488 }
36489
36490 #[simd_test(enable = "avx512f")]
36491 unsafe fn test_mm_mask3_fnmsub_round_ss() {
36492 let a = _mm_set1_ps(1.);
36493 let b = _mm_set1_ps(2.);
36494 let c = _mm_set1_ps(3.);
36495 let r =
36496 _mm_mask3_fnmsub_round_ss(a, b, c, 0, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
36497 assert_eq_m128(r, c);
36498 let r = _mm_mask3_fnmsub_round_ss(
36499 a,
36500 b,
36501 c,
36502 0b11111111,
36503 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
36504 );
36505 let e = _mm_set_ps(3., 3., 3., -5.);
36506 assert_eq_m128(r, e);
36507 }
36508
36509 #[simd_test(enable = "avx512f")]
36510 unsafe fn test_mm_fnmsub_round_sd() {
36511 let a = _mm_set1_pd(1.);
36512 let b = _mm_set1_pd(2.);
36513 let c = _mm_set1_pd(3.);
36514 let r = _mm_fnmsub_round_sd(a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
36515 let e = _mm_set_pd(1., -5.);
36516 assert_eq_m128d(r, e);
36517 }
36518
36519 #[simd_test(enable = "avx512f")]
36520 unsafe fn test_mm_mask_fnmsub_round_sd() {
36521 let a = _mm_set1_pd(1.);
36522 let b = _mm_set1_pd(2.);
36523 let c = _mm_set1_pd(3.);
36524 let r = _mm_mask_fnmsub_round_sd(a, 0, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
36525 assert_eq_m128d(r, a);
36526 let r = _mm_mask_fnmsub_round_sd(
36527 a,
36528 0b11111111,
36529 b,
36530 c,
36531 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
36532 );
36533 let e = _mm_set_pd(1., -5.);
36534 assert_eq_m128d(r, e);
36535 }
36536
36537 #[simd_test(enable = "avx512f")]
36538 unsafe fn test_mm_maskz_fnmsub_round_sd() {
36539 let a = _mm_set1_pd(1.);
36540 let b = _mm_set1_pd(2.);
36541 let c = _mm_set1_pd(3.);
36542 let r =
36543 _mm_maskz_fnmsub_round_sd(0, a, b, c, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
36544 let e = _mm_set_pd(1., 0.);
36545 assert_eq_m128d(r, e);
36546 let r = _mm_maskz_fnmsub_round_sd(
36547 0b11111111,
36548 a,
36549 b,
36550 c,
36551 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
36552 );
36553 let e = _mm_set_pd(1., -5.);
36554 assert_eq_m128d(r, e);
36555 }
36556
36557 #[simd_test(enable = "avx512f")]
36558 unsafe fn test_mm_mask3_fnmsub_round_sd() {
36559 let a = _mm_set1_pd(1.);
36560 let b = _mm_set1_pd(2.);
36561 let c = _mm_set1_pd(3.);
36562 let r =
36563 _mm_mask3_fnmsub_round_sd(a, b, c, 0, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
36564 assert_eq_m128d(r, c);
36565 let r = _mm_mask3_fnmsub_round_sd(
36566 a,
36567 b,
36568 c,
36569 0b11111111,
36570 _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
36571 );
36572 let e = _mm_set_pd(3., -5.);
36573 assert_eq_m128d(r, e);
36574 }
36575
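// fixupimm fixes up special values in the low lane according to the lookup table encoded
// in `c` and imm8; with a NaN input and imm8 == 5 these tests expect the low lane to become
// -0.0, while the maskz variant with an all-zero mask zeroes the low lane instead.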
36576 #[simd_test(enable = "avx512f")]
36577 unsafe fn test_mm_fixupimm_ss() {
36578 let a = _mm_set_ps(0., 0., 0., f32::NAN);
36579 let b = _mm_set1_ps(f32::MAX);
36580 let c = _mm_set1_epi32(i32::MAX);
36581 let r = _mm_fixupimm_ss(a, b, c, 5);
36582 let e = _mm_set_ps(0., 0., 0., -0.0);
36583 assert_eq_m128(r, e);
36584 }
36585
36586 #[simd_test(enable = "avx512f")]
36587 unsafe fn test_mm_mask_fixupimm_ss() {
36588 let a = _mm_set_ps(0., 0., 0., f32::NAN);
36589 let b = _mm_set1_ps(f32::MAX);
36590 let c = _mm_set1_epi32(i32::MAX);
36591 let r = _mm_mask_fixupimm_ss(a, 0b11111111, b, c, 5);
36592 let e = _mm_set_ps(0., 0., 0., -0.0);
36593 assert_eq_m128(r, e);
36594 }
36595
36596 #[simd_test(enable = "avx512f")]
36597 unsafe fn test_mm_maskz_fixupimm_ss() {
36598 let a = _mm_set_ps(0., 0., 0., f32::NAN);
36599 let b = _mm_set1_ps(f32::MAX);
36600 let c = _mm_set1_epi32(i32::MAX);
36601 let r = _mm_maskz_fixupimm_ss(0b00000000, a, b, c, 5);
36602 let e = _mm_set_ps(0., 0., 0., 0.0);
36603 assert_eq_m128(r, e);
36604 let r = _mm_maskz_fixupimm_ss(0b11111111, a, b, c, 5);
36605 let e = _mm_set_ps(0., 0., 0., -0.0);
36606 assert_eq_m128(r, e);
36607 }
36608
36609 #[simd_test(enable = "avx512f")]
36610 unsafe fn test_mm_fixupimm_sd() {
36611 let a = _mm_set_pd(0., f64::NAN);
36612 let b = _mm_set1_pd(f64::MAX);
36613 let c = _mm_set1_epi64x(i32::MAX as i64);
36614 let r = _mm_fixupimm_sd(a, b, c, 5);
36615 let e = _mm_set_pd(0., -0.0);
36616 assert_eq_m128d(r, e);
36617 }
36618
36619 #[simd_test(enable = "avx512f")]
36620 unsafe fn test_mm_mask_fixupimm_sd() {
36621 let a = _mm_set_pd(0., f64::NAN);
36622 let b = _mm_set1_pd(f64::MAX);
36623 let c = _mm_set1_epi64x(i32::MAX as i64);
36624 let r = _mm_mask_fixupimm_sd(a, 0b11111111, b, c, 5);
36625 let e = _mm_set_pd(0., -0.0);
36626 assert_eq_m128d(r, e);
36627 }
36628
36629 #[simd_test(enable = "avx512f")]
36630 unsafe fn test_mm_maskz_fixupimm_sd() {
36631 let a = _mm_set_pd(0., f64::NAN);
36632 let b = _mm_set1_pd(f64::MAX);
36633 let c = _mm_set1_epi64x(i32::MAX as i64);
36634 let r = _mm_maskz_fixupimm_sd(0b00000000, a, b, c, 5);
36635 let e = _mm_set_pd(0., 0.0);
36636 assert_eq_m128d(r, e);
36637 let r = _mm_maskz_fixupimm_sd(0b11111111, a, b, c, 5);
36638 let e = _mm_set_pd(0., -0.0);
36639 assert_eq_m128d(r, e);
36640 }
36641
36642 #[simd_test(enable = "avx512f")]
36643 unsafe fn test_mm_fixupimm_round_ss() {
36644 let a = _mm_set_ps(0., 0., 0., f32::NAN);
36645 let b = _mm_set1_ps(f32::MAX);
36646 let c = _mm_set1_epi32(i32::MAX);
36647 let r = _mm_fixupimm_round_ss(a, b, c, 5, _MM_FROUND_CUR_DIRECTION);
36648 let e = _mm_set_ps(0., 0., 0., -0.0);
36649 assert_eq_m128(r, e);
36650 }
36651
36652 #[simd_test(enable = "avx512f")]
36653 unsafe fn test_mm_mask_fixupimm_round_ss() {
36654 let a = _mm_set_ps(0., 0., 0., f32::NAN);
36655 let b = _mm_set1_ps(f32::MAX);
36656 let c = _mm_set1_epi32(i32::MAX);
36657 let r = _mm_mask_fixupimm_round_ss(a, 0b11111111, b, c, 5, _MM_FROUND_CUR_DIRECTION);
36658 let e = _mm_set_ps(0., 0., 0., -0.0);
36659 assert_eq_m128(r, e);
36660 }
36661
36662 #[simd_test(enable = "avx512f")]
36663 unsafe fn test_mm_maskz_fixupimm_round_ss() {
36664 let a = _mm_set_ps(0., 0., 0., f32::NAN);
36665 let b = _mm_set1_ps(f32::MAX);
36666 let c = _mm_set1_epi32(i32::MAX);
36667 let r = _mm_maskz_fixupimm_round_ss(0b00000000, a, b, c, 5, _MM_FROUND_CUR_DIRECTION);
36668 let e = _mm_set_ps(0., 0., 0., 0.0);
36669 assert_eq_m128(r, e);
36670 let r = _mm_maskz_fixupimm_round_ss(0b11111111, a, b, c, 5, _MM_FROUND_CUR_DIRECTION);
36671 let e = _mm_set_ps(0., 0., 0., -0.0);
36672 assert_eq_m128(r, e);
36673 }
36674
36675 #[simd_test(enable = "avx512f")]
36676 unsafe fn test_mm_fixupimm_round_sd() {
36677 let a = _mm_set_pd(0., f64::NAN);
36678 let b = _mm_set1_pd(f64::MAX);
36679 let c = _mm_set1_epi64x(i32::MAX as i64);
36680 let r = _mm_fixupimm_round_sd(a, b, c, 5, _MM_FROUND_CUR_DIRECTION);
36681 let e = _mm_set_pd(0., -0.0);
36682 assert_eq_m128d(r, e);
36683 }
36684
36685 #[simd_test(enable = "avx512f")]
36686 unsafe fn test_mm_mask_fixupimm_round_sd() {
36687 let a = _mm_set_pd(0., f64::NAN);
36688 let b = _mm_set1_pd(f64::MAX);
36689 let c = _mm_set1_epi64x(i32::MAX as i64);
36690 let r = _mm_mask_fixupimm_round_sd(a, 0b11111111, b, c, 5, _MM_FROUND_CUR_DIRECTION);
36691 let e = _mm_set_pd(0., -0.0);
36692 assert_eq_m128d(r, e);
36693 }
36694
36695 #[simd_test(enable = "avx512f")]
36696 unsafe fn test_mm_maskz_fixupimm_round_sd() {
36697 let a = _mm_set_pd(0., f64::NAN);
36698 let b = _mm_set1_pd(f64::MAX);
36699 let c = _mm_set1_epi64x(i32::MAX as i64);
36700 let r = _mm_maskz_fixupimm_round_sd(0b00000000, a, b, c, 5, _MM_FROUND_CUR_DIRECTION);
36701 let e = _mm_set_pd(0., 0.0);
36702 assert_eq_m128d(r, e);
36703 let r = _mm_maskz_fixupimm_round_sd(0b11111111, a, b, c, 5, _MM_FROUND_CUR_DIRECTION);
36704 let e = _mm_set_pd(0., -0.0);
36705 assert_eq_m128d(r, e);
36706 }
36707
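// Masked scalar conversions: cvtss_sd converts the low f32 of `b` to f64 (and cvtsd_ss the
// other way), the upper lane(s) coming from `a`; a clear mask bit keeps `src` (mask) or
// zero (maskz) in the low lane.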
36708 #[simd_test(enable = "avx512f")]
36709 unsafe fn test_mm_mask_cvtss_sd() {
36710 let a = _mm_set_pd(6., -7.5);
36711 let b = _mm_set_ps(0., -0.5, 1., -1.5);
36712 let r = _mm_mask_cvtss_sd(a, 0, a, b);
36713 assert_eq_m128d(r, a);
36714 let r = _mm_mask_cvtss_sd(a, 0b11111111, a, b);
36715 let e = _mm_set_pd(6., -1.5);
36716 assert_eq_m128d(r, e);
36717 }
36718
36719 #[simd_test(enable = "avx512f")]
36720 unsafe fn test_mm_maskz_cvtss_sd() {
36721 let a = _mm_set_pd(6., -7.5);
36722 let b = _mm_set_ps(0., -0.5, 1., -1.5);
36723 let r = _mm_maskz_cvtss_sd(0, a, b);
36724 let e = _mm_set_pd(6., 0.);
36725 assert_eq_m128d(r, e);
36726 let r = _mm_maskz_cvtss_sd(0b11111111, a, b);
36727 let e = _mm_set_pd(6., -1.5);
36728 assert_eq_m128d(r, e);
36729 }
36730
36731 #[simd_test(enable = "avx512f")]
36732 unsafe fn test_mm_mask_cvtsd_ss() {
36733 let a = _mm_set_ps(0., -0.5, 1., -1.5);
36734 let b = _mm_set_pd(6., -7.5);
36735 let r = _mm_mask_cvtsd_ss(a, 0, a, b);
36736 assert_eq_m128(r, a);
36737 let r = _mm_mask_cvtsd_ss(a, 0b11111111, a, b);
36738 let e = _mm_set_ps(0., -0.5, 1., -7.5);
36739 assert_eq_m128(r, e);
36740 }
36741
36742 #[simd_test(enable = "avx512f")]
36743 unsafe fn test_mm_maskz_cvtsd_ss() {
36744 let a = _mm_set_ps(0., -0.5, 1., -1.5);
36745 let b = _mm_set_pd(6., -7.5);
36746 let r = _mm_maskz_cvtsd_ss(0, a, b);
36747 let e = _mm_set_ps(0., -0.5, 1., 0.);
36748 assert_eq_m128(r, e);
36749 let r = _mm_maskz_cvtsd_ss(0b11111111, a, b);
36750 let e = _mm_set_ps(0., -0.5, 1., -7.5);
36751 assert_eq_m128(r, e);
36752 }
36753
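// The same conversions with an explicit rounding argument. _MM_FROUND_CUR_DIRECTION
// defers to the rounding mode currently in MXCSR; widening f32 -> f64 is exact
// anyway, so the expected values do not depend on the rounding mode.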
36754 #[simd_test(enable = "avx512f")]
36755 unsafe fn test_mm_cvt_roundss_sd() {
36756 let a = _mm_set_pd(6., -7.5);
36757 let b = _mm_set_ps(0., -0.5, 1., -1.5);
36758 let r = _mm_cvt_roundss_sd(a, b, _MM_FROUND_CUR_DIRECTION);
36759 let e = _mm_set_pd(6., -1.5);
36760 assert_eq_m128d(r, e);
36761 }
36762
36763 #[simd_test(enable = "avx512f")]
36764 unsafe fn test_mm_mask_cvt_roundss_sd() {
36765 let a = _mm_set_pd(6., -7.5);
36766 let b = _mm_set_ps(0., -0.5, 1., -1.5);
36767 let r = _mm_mask_cvt_roundss_sd(a, 0, a, b, _MM_FROUND_CUR_DIRECTION);
36768 assert_eq_m128d(r, a);
36769 let r = _mm_mask_cvt_roundss_sd(a, 0b11111111, a, b, _MM_FROUND_CUR_DIRECTION);
36770 let e = _mm_set_pd(6., -1.5);
36771 assert_eq_m128d(r, e);
36772 }
36773
36774 #[simd_test(enable = "avx512f")]
36775 unsafe fn test_mm_maskz_cvt_roundss_sd() {
36776 let a = _mm_set_pd(6., -7.5);
36777 let b = _mm_set_ps(0., -0.5, 1., -1.5);
36778 let r = _mm_maskz_cvt_roundss_sd(0, a, b, _MM_FROUND_CUR_DIRECTION);
36779 let e = _mm_set_pd(6., 0.);
36780 assert_eq_m128d(r, e);
36781 let r = _mm_maskz_cvt_roundss_sd(0b11111111, a, b, _MM_FROUND_CUR_DIRECTION);
36782 let e = _mm_set_pd(6., -1.5);
36783 assert_eq_m128d(r, e);
36784 }
36785
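// Narrowing f64 -> f32 with _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC. -7.5 is
// exactly representable as f32, so the rounding control does not change the
// expected value either.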
36786 #[simd_test(enable = "avx512f")]
36787 unsafe fn test_mm_cvt_roundsd_ss() {
36788 let a = _mm_set_ps(0., -0.5, 1., -1.5);
36789 let b = _mm_set_pd(6., -7.5);
36790 let r = _mm_cvt_roundsd_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
36791 let e = _mm_set_ps(0., -0.5, 1., -7.5);
36792 assert_eq_m128(r, e);
36793 }
36794
36795 #[simd_test(enable = "avx512f")]
36796 unsafe fn test_mm_mask_cvt_roundsd_ss() {
36797 let a = _mm_set_ps(0., -0.5, 1., -1.5);
36798 let b = _mm_set_pd(6., -7.5);
36799 let r = _mm_mask_cvt_roundsd_ss(a, 0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
36800 assert_eq_m128(r, a);
36801 let r =
36802 _mm_mask_cvt_roundsd_ss(a, 0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
36803 let e = _mm_set_ps(0., -0.5, 1., -7.5);
36804 assert_eq_m128(r, e);
36805 }
36806
36807 #[simd_test(enable = "avx512f")]
36808 unsafe fn test_mm_maskz_cvt_roundsd_ss() {
36809 let a = _mm_set_ps(0., -0.5, 1., -1.5);
36810 let b = _mm_set_pd(6., -7.5);
36811 let r = _mm_maskz_cvt_roundsd_ss(0, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
36812 let e = _mm_set_ps(0., -0.5, 1., 0.);
36813 assert_eq_m128(r, e);
36814 let r = _mm_maskz_cvt_roundsd_ss(0b11111111, a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
36815 let e = _mm_set_ps(0., -0.5, 1., -7.5);
36816 assert_eq_m128(r, e);
36817 }
36818
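// Scalar float -> integer conversions. Expected values used below:
//   _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC : round toward zero, so -1.5 -> -1
//   _MM_FROUND_CUR_DIRECTION               : use MXCSR (nearest-even by default), so -1.5 -> -2
//   negative input converted to u32        : out of range; the instruction returns
//                                            the all-ones pattern, i.e. u32::MAX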
36819 #[simd_test(enable = "avx512f")]
36820 unsafe fn test_mm_cvt_roundss_si32() {
36821 let a = _mm_set_ps(0., -0.5, 1., -1.5);
36822 let r = _mm_cvt_roundss_si32(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
36823 let e: i32 = -1;
36824 assert_eq!(r, e);
36825 }
36826
36827 #[simd_test(enable = "avx512f")]
36828 unsafe fn test_mm_cvt_roundss_i32() {
36829 let a = _mm_set_ps(0., -0.5, 1., -1.5);
36830 let r = _mm_cvt_roundss_i32(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
36831 let e: i32 = -1;
36832 assert_eq!(r, e);
36833 }
36834
36835 #[simd_test(enable = "avx512f")]
36836 unsafe fn test_mm_cvt_roundss_u32() {
36837 let a = _mm_set_ps(0., -0.5, 1., -1.5);
36838 let r = _mm_cvt_roundss_u32(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
36839 let e: u32 = u32::MAX;
36840 assert_eq!(r, e);
36841 }
36842
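// No rounding argument here, so the MXCSR default (round to nearest even)
// applies and -1.5 converts to -2.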
36843 #[simd_test(enable = "avx512f")]
36844 unsafe fn test_mm_cvtss_i32() {
36845 let a = _mm_set_ps(0., -0.5, 1., -1.5);
36846 let r = _mm_cvtss_i32(a);
36847 let e: i32 = -2;
36848 assert_eq!(r, e);
36849 }
36850
36851 #[simd_test(enable = "avx512f")]
36852 unsafe fn test_mm_cvtss_u32() {
36853 let a = _mm_set_ps(0., -0.5, 1., -1.5);
36854 let r = _mm_cvtss_u32(a);
36855 let e: u32 = u32::MAX;
36856 assert_eq!(r, e);
36857 }
36858
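// The f64 -> i32/u32 cases below mirror the f32 cases above: same rounding
// behaviour and the same out-of-range handling for the unsigned conversions.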
36859 #[simd_test(enable = "avx512f")]
36860 unsafe fn test_mm_cvt_roundsd_si32() {
36861 let a = _mm_set_pd(1., -1.5);
36862 let r = _mm_cvt_roundsd_si32(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
36863 let e: i32 = -1;
36864 assert_eq!(r, e);
36865 }
36866
36867 #[simd_test(enable = "avx512f")]
36868 unsafe fn test_mm_cvt_roundsd_i32() {
36869 let a = _mm_set_pd(1., -1.5);
36870 let r = _mm_cvt_roundsd_i32(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
36871 let e: i32 = -1;
36872 assert_eq!(r, e);
36873 }
36874
36875 #[simd_test(enable = "avx512f")]
36876 unsafe fn test_mm_cvt_roundsd_u32() {
36877 let a = _mm_set_pd(1., -1.5);
36878 let r = _mm_cvt_roundsd_u32(a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
36879 let e: u32 = u32::MAX;
36880 assert_eq!(r, e);
36881 }
36882
36883 #[simd_test(enable = "avx512f")]
36884 unsafe fn test_mm_cvtsd_i32() {
36885 let a = _mm_set_pd(1., -1.5);
36886 let r = _mm_cvtsd_i32(a);
36887 let e: i32 = -2;
36888 assert_eq!(r, e);
36889 }
36890
36891 #[simd_test(enable = "avx512f")]
36892 unsafe fn test_mm_cvtsd_u32() {
36893 let a = _mm_set_pd(1., -1.5);
36894 let r = _mm_cvtsd_u32(a);
36895 let e: u32 = u32::MAX;
36896 assert_eq!(r, e);
36897 }
36898
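// Integer -> float conversions into the low lane. 9 is exactly representable in
// both f32 and f64, so the rounding argument is irrelevant for these inputs;
// _mm_cvt_roundi32_ss and _mm_cvt_roundsi32_ss are the same operation under two
// names.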
36899 #[simd_test(enable = "avx512f")]
36900 unsafe fn test_mm_cvt_roundi32_ss() {
36901 let a = _mm_set_ps(0., -0.5, 1., -1.5);
36902 let b: i32 = 9;
36903 let r = _mm_cvt_roundi32_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
36904 let e = _mm_set_ps(0., -0.5, 1., 9.);
36905 assert_eq_m128(r, e);
36906 }
36907
36908 #[simd_test(enable = "avx512f")]
36909 unsafe fn test_mm_cvt_roundsi32_ss() {
36910 let a = _mm_set_ps(0., -0.5, 1., -1.5);
36911 let b: i32 = 9;
36912 let r = _mm_cvt_roundsi32_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
36913 let e = _mm_set_ps(0., -0.5, 1., 9.);
36914 assert_eq_m128(r, e);
36915 }
36916
36917 #[simd_test(enable = "avx512f")]
36918 unsafe fn test_mm_cvt_roundu32_ss() {
36919 let a = _mm_set_ps(0., -0.5, 1., -1.5);
36920 let b: u32 = 9;
36921 let r = _mm_cvt_roundu32_ss(a, b, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
36922 let e = _mm_set_ps(0., -0.5, 1., 9.);
36923 assert_eq_m128(r, e);
36924 }
36925
36926 #[simd_test(enable = "avx512f")]
36927 unsafe fn test_mm_cvti32_ss() {
36928 let a = _mm_set_ps(0., -0.5, 1., -1.5);
36929 let b: i32 = 9;
36930 let r = _mm_cvti32_ss(a, b);
36931 let e = _mm_set_ps(0., -0.5, 1., 9.);
36932 assert_eq_m128(r, e);
36933 }
36934
36935 #[simd_test(enable = "avx512f")]
36936 unsafe fn test_mm_cvti32_sd() {
36937 let a = _mm_set_pd(1., -1.5);
36938 let b: i32 = 9;
36939 let r = _mm_cvti32_sd(a, b);
36940 let e = _mm_set_pd(1., 9.);
36941 assert_eq_m128d(r, e);
36942 }
36943
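// The cvtt* intrinsics convert with truncation (round toward zero) regardless of
// MXCSR, so -1.5 is expected to yield -1, and a negative input converted to u32
// is still out of range and yields u32::MAX. (Compare: `(-1.5f32) as i32` in
// Rust is also -1, since `as` truncates toward zero.)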
36944 #[simd_test(enable = "avx512f")]
36945 unsafe fn test_mm_cvtt_roundss_si32() {
36946 let a = _mm_set_ps(0., -0.5, 1., -1.5);
36947 let r = _mm_cvtt_roundss_si32(a, _MM_FROUND_CUR_DIRECTION);
36948 let e: i32 = -1; // truncation toward zero of -1.5
36949 assert_eq!(r, e);
36950 }
36951
36952 #[simd_test(enable = "avx512f")]
36953 unsafe fn test_mm_cvtt_roundss_i32() {
36954 let a = _mm_set_ps(0., -0.5, 1., -1.5);
36955 let r = _mm_cvtt_roundss_i32(a, _MM_FROUND_CUR_DIRECTION);
36956 let e: i32 = -1;
36957 assert_eq!(r, e);
36958 }
36959
36960 #[simd_test(enable = "avx512f")]
36961 unsafe fn test_mm_cvtt_roundss_u32() {
36962 let a = _mm_set_ps(0., -0.5, 1., -1.5);
36963 let r = _mm_cvtt_roundss_u32(a, _MM_FROUND_CUR_DIRECTION);
36964 let e: u32 = u32::MAX;
36965 assert_eq!(r, e);
36966 }
36967
36968 #[simd_test(enable = "avx512f")]
36969 unsafe fn test_mm_cvttss_i32() {
36970 let a = _mm_set_ps(0., -0.5, 1., -1.5);
36971 let r = _mm_cvttss_i32(a);
36972 let e: i32 = -1; // truncation: -1.5 -> -1
36973 assert_eq!(r, e);
36974 }
36975
36976 #[simd_test(enable = "avx512f")]
36977 unsafe fn test_mm_cvttss_u32() {
36978 let a = _mm_set_ps(0., -0.5, 1., -1.5);
36979 let r = _mm_cvttss_u32(a);
36980 let e: u32 = u32::MAX;
36981 assert_eq!(r, e);
36982 }
36983
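// Same truncation semantics for the f64 source.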
36984 #[simd_test(enable = "avx512f")]
36985 unsafe fn test_mm_cvtt_roundsd_si32() {
36986 let a = _mm_set_pd(1., -1.5);
36987 let r = _mm_cvtt_roundsd_si32(a, _MM_FROUND_CUR_DIRECTION);
36988 let e: i32 = -1; // truncation toward zero of -1.5
36989 assert_eq!(r, e);
36990 }
36991
36992 #[simd_test(enable = "avx512f")]
36993 unsafe fn test_mm_cvtt_roundsd_i32() {
36994 let a = _mm_set_pd(1., -1.5);
36995 let r = _mm_cvtt_roundsd_i32(a, _MM_FROUND_CUR_DIRECTION);
36996 let e: i32 = -1;
36997 assert_eq!(r, e);
36998 }
36999
37000 #[simd_test(enable = "avx512f")]
37001 unsafe fn test_mm_cvtt_roundsd_u32() {
37002 let a = _mm_set_pd(1., -1.5);
37003 let r = _mm_cvtt_roundsd_u32(a, _MM_FROUND_CUR_DIRECTION);
37004 let e: u32 = u32::MAX;
37005 assert_eq!(r, e);
37006 }
37007
37008 #[simd_test(enable = "avx512f")]
37009 unsafe fn test_mm_cvttsd_i32() {
37010 let a = _mm_set_pd(1., -1.5);
37011 let r = _mm_cvttsd_i32(a);
37012 let e: i32 = -1; // truncation: -1.5 -> -1
37013 assert_eq!(r, e);
37014 }
37015
37016 #[simd_test(enable = "avx512f")]
37017 unsafe fn test_mm_cvttsd_u32() {
37018 let a = _mm_set_pd(1., -1.5);
37019 let r = _mm_cvttsd_u32(a);
37020 let e: u32 = u32::MAX;
37021 assert_eq!(r, e);
37022 }
37023
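// Unsigned 32-bit integer -> float conversions into the low lane; 9 is exact in
// both f32 and f64, and the upper lanes come from `a`.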
37024 #[simd_test(enable = "avx512f")]
37025 unsafe fn test_mm_cvtu32_ss() {
37026 let a = _mm_set_ps(0., -0.5, 1., -1.5);
37027 let b: u32 = 9;
37028 let r = _mm_cvtu32_ss(a, b);
37029 let e = _mm_set_ps(0., -0.5, 1., 9.);
37030 assert_eq_m128(r, e);
37031 }
37032
37033 #[simd_test(enable = "avx512f")]
37034 unsafe fn test_mm_cvtu32_sd() {
37035 let a = _mm_set_pd(1., -1.5);
37036 let b: u32 = 9;
37037 let r = _mm_cvtu32_sd(a, b);
37038 let e = _mm_set_pd(1., 9.);
37039 assert_eq_m128d(r, e);
37040 }
37041
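// Unsigned 64-bit sources converted into the low lane; 9 is again exactly
// representable, so the expected vectors match the 32-bit cases above.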
37042 #[simd_test(enable = "avx512f")]
37043 unsafe fn test_mm_cvtu64_ss() {
37044 let a = _mm_set_ps(0., -0.5, 1., -1.5);
37045 let b: u64 = 9;
37046 let r = _mm_cvtu64_ss(a, b);
37047 let e = _mm_set_ps(0., -0.5, 1., 9.);
37048 assert_eq_m128(r, e);
37049 }
37050
37051 #[simd_test(enable = "avx512f")]
37052 unsafe fn test_mm_cvtu64_sd() {
37053 let a = _mm_set_pd(1., -1.5);
37054 let b: u64 = 9;
37055 let r = _mm_cvtu64_sd(a, b);
37056 let e = _mm_set_pd(1., 9.);
37057 assert_eq_m128d(r, e);
37058 }
37059
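// _mm_comi_round_ss/_sd compare the low lanes under the predicate given by the
// third argument and return 0 or 1. Predicate 0 is _CMP_EQ_OQ, so 2.2 == 1.1 is
// false and 0 is expected. A hedged usage sketch (not part of this suite): with
// the ordered greater-equal predicate the same inputs would compare 2.2 >= 1.1
// and return 1, e.g.
//     let ge = _mm_comi_round_ss(a, b, _CMP_GE_OQ, _MM_FROUND_CUR_DIRECTION);
//     assert_eq!(ge, 1);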
37060 #[simd_test(enable = "avx512f")]
37061 unsafe fn test_mm_comi_round_ss() {
37062 let a = _mm_set1_ps(2.2);
37063 let b = _mm_set1_ps(1.1);
37064 let r = _mm_comi_round_ss(a, b, 0, _MM_FROUND_CUR_DIRECTION);
37065 let e: i32 = 0;
37066 assert_eq!(r, e);
37067 }
37068
37069 #[simd_test(enable = "avx512f")]
37070 unsafe fn test_mm_comi_round_sd() {
37071 let a = _mm_set1_pd(2.2);
37072 let b = _mm_set1_pd(1.1);
37073 let r = _mm_comi_round_sd(a, b, 0, _MM_FROUND_CUR_DIRECTION);
37074 let e: i32 = 0;
37075 assert_eq!(r, e);
37076 }
37077 }