//! Streaming SIMD Extensions 4.1 (SSE4.1)

use crate::{
    core_arch::{simd::*, simd_llvm::*, x86::*},
    mem::transmute,
};

#[cfg(test)]
use stdarch_test::assert_instr;

// SSE4 rounding constants
/// round to nearest
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_NEAREST_INT: i32 = 0x00;
/// round down
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_NEG_INF: i32 = 0x01;
/// round up
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_POS_INF: i32 = 0x02;
/// truncate
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_ZERO: i32 = 0x03;
/// use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_CUR_DIRECTION: i32 = 0x04;
/// do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_RAISE_EXC: i32 = 0x00;
/// suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NO_EXC: i32 = 0x08;
/// round to nearest and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NINT: i32 = 0x00;
/// round down and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_FLOOR: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF;
/// round up and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_CEIL: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF;
/// truncate and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TRUNC: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO;
/// use MXCSR.RC and do not suppress exceptions; see
/// `_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_RINT: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION;
/// use MXCSR.RC and suppress exceptions; see `_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NEARBYINT: i32 = _MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION;

/// Blend packed 8-bit integers from `a` and `b` using `mask`
///
/// The high bit of each corresponding mask byte determines the selection.
/// If the high bit is set the element of `b` is selected. The element
/// of `a` is selected otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_epi8)
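///
/// An illustrative sketch (values mirror this file's unit test; assumes the
/// `sse4.1` target feature and an `unsafe` context):
///
/// ```ignore
/// let a = _mm_set1_epi8(0);
/// let b = _mm_set1_epi8(1);
/// // Only bytes whose mask byte has its high bit set (-1 here) take `b`:
/// let mask = _mm_setr_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
/// let r = _mm_blendv_epi8(a, b, mask); // 0, 1, 0, 1, ...
/// ```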
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pblendvb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i {
    transmute(pblendvb(a.as_i8x16(), b.as_i8x16(), mask.as_i8x16()))
}

/// Blend packed 16-bit integers from `a` and `b` using the mask `IMM8`.
///
/// The mask bits determine the selection. A clear bit selects the
/// corresponding element of `a`, and a set bit the corresponding
/// element of `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_epi16)
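///
/// An illustrative sketch (values from this file's unit test; assumes
/// `sse4.1` and an `unsafe` context):
///
/// ```ignore
/// let a = _mm_set1_epi16(0);
/// let b = _mm_set1_epi16(1);
/// // Bit k of the immediate picks element k from `b`; clear bits keep `a`:
/// let r = _mm_blend_epi16::<0b1010_1100>(a, b); // 0, 0, 1, 1, 0, 1, 0, 1
/// ```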
#[inline]
#[target_feature(enable = "sse4.1")]
// Note: LLVM7 prefers the single-precision floating-point domain when possible
// see https://bugs.llvm.org/show_bug.cgi?id=38195
// #[cfg_attr(test, assert_instr(pblendw, IMM8 = 0xF0))]
#[cfg_attr(test, assert_instr(blendps, IMM8 = 0xF0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blend_epi16<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
    static_assert_imm8!(IMM8);
    transmute(pblendw(a.as_i16x8(), b.as_i16x8(), IMM8 as u8))
}

/// Blend packed double-precision (64-bit) floating-point elements from `a`
/// and `b` using `mask`. The high bit of each corresponding 64-bit mask
/// element selects the element of `b`; the element of `a` is used otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendvpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d {
    blendvpd(a, b, mask)
}

/// Blend packed single-precision (32-bit) floating-point elements from `a`
/// and `b` using `mask`. The high bit of each corresponding 32-bit mask
/// element selects the element of `b`; the element of `a` is used otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendvps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 {
    blendvps(a, b, mask)
}

/// Blend packed double-precision (64-bit) floating-point elements from `a`
/// and `b` using control mask `IMM2`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
// Note: LLVM7 prefers the single-precision floating-point domain when possible
// see https://bugs.llvm.org/show_bug.cgi?id=38195
// #[cfg_attr(test, assert_instr(blendpd, IMM2 = 0b10))]
#[cfg_attr(test, assert_instr(blendps, IMM2 = 0b10))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blend_pd<const IMM2: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_imm2!(IMM2);
    blendpd(a, b, IMM2 as u8)
}

/// Blend packed single-precision (32-bit) floating-point elements from `a`
/// and `b` using mask `IMM4`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendps, IMM4 = 0b0101))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blend_ps<const IMM4: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_imm4!(IMM4);
    blendps(a, b, IMM4 as u8)
}

/// Extracts a single-precision (32-bit) floating-point element from `a`,
/// selected with `IMM8`. The result is returned as an `i32` containing the
/// bit pattern of the selected element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_ps)
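///
/// A brief sketch of recovering the float from the returned bits (values
/// from this file's unit test; assumes `sse4.1` and an `unsafe` context):
///
/// ```ignore
/// let a = _mm_setr_ps(0.0, 1.0, 2.0, 3.0);
/// let bits = _mm_extract_ps::<1>(a);
/// assert_eq!(f32::from_bits(bits as u32), 1.0);
/// ```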
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(extractps, IMM8 = 0)
)]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_extract_ps<const IMM8: i32>(a: __m128) -> i32 {
    static_assert_imm2!(IMM8);
    transmute(simd_extract::<_, f32>(a, IMM8 as u32))
}

/// Extracts an 8-bit integer from `a`, selected with `IMM8`. Returns a 32-bit
/// integer containing the zero-extended integer data.
///
/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi8)
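///
/// A quick sketch of the zero extension (values from this file's unit test;
/// assumes `sse4.1` and an `unsafe` context):
///
/// ```ignore
/// let a = _mm_setr_epi8(-1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
/// // Byte 0 is -1 (0xFF); zero extension yields 255 rather than -1:
/// assert_eq!(_mm_extract_epi8::<0>(a), 0xFF);
/// ```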
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pextrb, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_extract_epi8<const IMM8: i32>(a: __m128i) -> i32 {
    static_assert_imm4!(IMM8);
    simd_extract::<_, u8>(a.as_u8x16(), IMM8 as u32) as i32
}

/// Extracts a 32-bit integer from `a`, selected with `IMM8`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(extractps, IMM8 = 1)
)]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_extract_epi32<const IMM8: i32>(a: __m128i) -> i32 {
    static_assert_imm2!(IMM8);
    simd_extract::<_, i32>(a.as_i32x4(), IMM8 as u32)
}

/// Select a single value in `b` to store at some position in `a`,
/// then zero elements according to `IMM8`.
///
/// `IMM8` specifies which bits from operand `b` will be copied, which bits in
/// the result they will be copied to, and which bits in the result will be
/// cleared. The following assignments are made:
///
/// * Bits `[7:6]` specify the bits to copy from operand `b`:
///     - `00`: Selects bits `[31:0]` from operand `b`.
///     - `01`: Selects bits `[63:32]` from operand `b`.
///     - `10`: Selects bits `[95:64]` from operand `b`.
///     - `11`: Selects bits `[127:96]` from operand `b`.
///
/// * Bits `[5:4]` specify the bits in the result to which the selected bits
/// from operand `b` are copied:
///     - `00`: Copies the selected bits from `b` to result bits `[31:0]`.
///     - `01`: Copies the selected bits from `b` to result bits `[63:32]`.
///     - `10`: Copies the selected bits from `b` to result bits `[95:64]`.
///     - `11`: Copies the selected bits from `b` to result bits `[127:96]`.
///
/// * Bits `[3:0]`: If any of these bits are set, the corresponding result
/// element is cleared.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_ps)
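///
/// An illustrative sketch (values mirror this file's unit test; assumes
/// `sse4.1` and an `unsafe` context):
///
/// ```ignore
/// let a = _mm_set1_ps(1.0);
/// let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
/// // IMM8 = 0b11_00_1100: select b[3] (4.0), copy it to result[0],
/// // then clear result elements 2 and 3:
/// let r = _mm_insert_ps::<0b11_00_1100>(a, b); // 4.0, 1.0, 0.0, 0.0
/// ```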
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(insertps, IMM8 = 0b1010))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_insert_ps<const IMM8: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_imm8!(IMM8);
    insertps(a, b, IMM8 as u8)
}

/// Returns a copy of `a` with the 8-bit integer from `i` inserted at a
/// location specified by `IMM8`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pinsrb, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_insert_epi8<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
    static_assert_imm4!(IMM8);
    transmute(simd_insert(a.as_i8x16(), IMM8 as u32, i as i8))
}

/// Returns a copy of `a` with the 32-bit integer from `i` inserted at a
/// location specified by `IMM8`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pinsrd, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_insert_epi32<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
    static_assert_imm2!(IMM8);
    transmute(simd_insert(a.as_i32x4(), IMM8 as u32, i))
}

/// Compares packed 8-bit integers in `a` and `b` and returns the packed
/// maximum values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epi8(a: __m128i, b: __m128i) -> __m128i {
    transmute(pmaxsb(a.as_i8x16(), b.as_i8x16()))
}

/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed
/// maximum.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epu16(a: __m128i, b: __m128i) -> __m128i {
    transmute(pmaxuw(a.as_u16x8(), b.as_u16x8()))
}

/// Compares packed 32-bit integers in `a` and `b`, and returns packed maximum
/// values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epi32(a: __m128i, b: __m128i) -> __m128i {
    transmute(pmaxsd(a.as_i32x4(), b.as_i32x4()))
}

/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed
/// maximum values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxud))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epu32(a: __m128i, b: __m128i) -> __m128i {
    transmute(pmaxud(a.as_u32x4(), b.as_u32x4()))
}

/// Compares packed 8-bit integers in `a` and `b` and returns the packed
/// minimum values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epi8(a: __m128i, b: __m128i) -> __m128i {
    transmute(pminsb(a.as_i8x16(), b.as_i8x16()))
}

/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed
/// minimum.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epu16(a: __m128i, b: __m128i) -> __m128i {
    transmute(pminuw(a.as_u16x8(), b.as_u16x8()))
}

/// Compares packed 32-bit integers in `a` and `b`, and returns packed minimum
/// values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epi32(a: __m128i, b: __m128i) -> __m128i {
    transmute(pminsd(a.as_i32x4(), b.as_i32x4()))
}

/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed
/// minimum values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminud))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i {
    transmute(pminud(a.as_u32x4(), b.as_u32x4()))
}

/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
/// using unsigned saturation
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_packus_epi32)
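///
/// A short sketch of the saturation (hypothetical values; assumes `sse4.1`
/// and an `unsafe` context):
///
/// ```ignore
/// let a = _mm_setr_epi32(1, 2, 3, 4);
/// let b = _mm_setr_epi32(-1, 70000, 0, 65535);
/// // Negative inputs saturate to 0; values above 65535 saturate to 65535:
/// let r = _mm_packus_epi32(a, b); // u16 lanes: 1, 2, 3, 4, 0, 65535, 0, 65535
/// ```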
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(packusdw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_packus_epi32(a: __m128i, b: __m128i) -> __m128i {
    transmute(packusdw(a.as_i32x4(), b.as_i32x4()))
}

/// Compares packed 64-bit integers in `a` and `b` for equality
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pcmpeqq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpeq_epi64(a: __m128i, b: __m128i) -> __m128i {
    transmute(simd_eq::<_, i64x2>(a.as_i64x2(), b.as_i64x2()))
}

/// Sign extend packed 8-bit integers in `a` to packed 16-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi8_epi16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi8_epi16(a: __m128i) -> __m128i {
    let a = a.as_i8x16();
    let a: i8x8 = simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
    transmute(simd_cast::<_, i16x8>(a))
}

/// Sign extend packed 8-bit integers in `a` to packed 32-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi8_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi8_epi32(a: __m128i) -> __m128i {
    let a = a.as_i8x16();
    let a: i8x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}

/// Sign extend packed 8-bit integers in the low 2 bytes of `a` to packed
/// 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi8_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi8_epi64(a: __m128i) -> __m128i {
    let a = a.as_i8x16();
    let a: i8x2 = simd_shuffle2!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Sign extend packed 16-bit integers in `a` to packed 32-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi16_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi16_epi32(a: __m128i) -> __m128i {
    let a = a.as_i16x8();
    let a: i16x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}

/// Sign extend packed 16-bit integers in `a` to packed 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi16_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxwq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi16_epi64(a: __m128i) -> __m128i {
    let a = a.as_i16x8();
    let a: i16x2 = simd_shuffle2!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Sign extend packed 32-bit integers in `a` to packed 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi32_epi64(a: __m128i) -> __m128i {
    let a = a.as_i32x4();
    let a: i32x2 = simd_shuffle2!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Zero extend packed unsigned 8-bit integers in `a` to packed 16-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu8_epi16(a: __m128i) -> __m128i {
    let a = a.as_u8x16();
    let a: u8x8 = simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
    transmute(simd_cast::<_, i16x8>(a))
}

/// Zero extend packed unsigned 8-bit integers in `a` to packed 32-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu8_epi32(a: __m128i) -> __m128i {
    let a = a.as_u8x16();
    let a: u8x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}

/// Zero extend packed unsigned 8-bit integers in `a` to packed 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu8_epi64(a: __m128i) -> __m128i {
    let a = a.as_u8x16();
    let a: u8x2 = simd_shuffle2!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Zero extend packed unsigned 16-bit integers in `a`
/// to packed 32-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu16_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu16_epi32(a: __m128i) -> __m128i {
    let a = a.as_u16x8();
    let a: u16x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}

/// Zero extend packed unsigned 16-bit integers in `a`
/// to packed 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu16_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxwq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu16_epi64(a: __m128i) -> __m128i {
    let a = a.as_u16x8();
    let a: u16x2 = simd_shuffle2!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Zero extend packed unsigned 32-bit integers in `a`
/// to packed 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu32_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu32_epi64(a: __m128i) -> __m128i {
    let a = a.as_u32x4();
    let a: u32x2 = simd_shuffle2!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Returns the dot product of two `__m128d` vectors.
///
/// `IMM8[1:0]` is the broadcast mask, and `IMM8[5:4]` is the condition mask.
/// If a condition mask bit is zero, the corresponding multiplication is
/// replaced by a value of `0.0`. If a broadcast mask bit is one, the result
/// of the dot product is stored in the corresponding component of the return
/// value; otherwise that component is set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_pd)
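///
/// A worked sketch (values from this file's unit test; assumes `sse4.1`
/// and an `unsafe` context):
///
/// ```ignore
/// let a = _mm_setr_pd(2.0, 3.0);
/// let b = _mm_setr_pd(1.0, 4.0);
/// // Condition mask 0b11 keeps both products (2*1 + 3*4 = 14.0);
/// // broadcast mask 0b01 stores the sum only in element 0:
/// let r = _mm_dp_pd::<0b0011_0001>(a, b); // 14.0, 0.0
/// ```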
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(dppd, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_dp_pd<const IMM8: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_imm8!(IMM8);
    dppd(a, b, IMM8 as u8)
}

/// Returns the dot product of two `__m128` vectors.
///
/// `IMM8[3:0]` is the broadcast mask, and `IMM8[7:4]` is the condition mask.
/// If a condition mask bit is zero, the corresponding multiplication is
/// replaced by a value of `0.0`. If a broadcast mask bit is one, the result
/// of the dot product is stored in the corresponding component of the return
/// value; otherwise that component is set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps)
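///
/// A worked sketch (hypothetical values; assumes `sse4.1` and an `unsafe`
/// context):
///
/// ```ignore
/// let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
/// let b = _mm_setr_ps(4.0, 3.0, 2.0, 1.0);
/// // Condition mask 0b1111 keeps all products (1*4 + 2*3 + 3*2 + 4*1 = 20.0);
/// // broadcast mask 0b1111 stores the sum in every element:
/// let r = _mm_dp_ps::<0b1111_1111>(a, b); // 20.0 in all four lanes
/// ```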
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(dpps, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_dp_ps<const IMM8: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_imm8!(IMM8);
    dpps(a, b, IMM8 as u8)
}

/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// down to an integer value, and stores the results as packed double-precision
/// floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_pd(a: __m128d) -> __m128d {
    simd_floor(a)
}

/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// down to an integer value, and stores the results as packed single-precision
/// floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_ps(a: __m128) -> __m128 {
    simd_floor(a)
}

/// Round the lower double-precision (64-bit) floating-point element in `b`
/// down to an integer value, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element of the intrinsic
/// result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_sd(a: __m128d, b: __m128d) -> __m128d {
    roundsd(a, b, _MM_FROUND_FLOOR)
}

/// Round the lower single-precision (32-bit) floating-point element in `b`
/// down to an integer value, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_ss(a: __m128, b: __m128) -> __m128 {
    roundss(a, b, _MM_FROUND_FLOOR)
}

/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// up to an integer value, and stores the results as packed double-precision
/// floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_pd(a: __m128d) -> __m128d {
    simd_ceil(a)
}

/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// up to an integer value, and stores the results as packed single-precision
/// floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_ps(a: __m128) -> __m128 {
    simd_ceil(a)
}

/// Round the lower double-precision (64-bit) floating-point element in `b`
/// up to an integer value, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element
/// of the intrinsic result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_sd(a: __m128d, b: __m128d) -> __m128d {
    roundsd(a, b, _MM_FROUND_CEIL)
}

/// Round the lower single-precision (32-bit) floating-point element in `b`
/// up to an integer value, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_ss(a: __m128, b: __m128) -> __m128 {
    roundss(a, b, _MM_FROUND_CEIL)
}

/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// using the `ROUNDING` parameter, and stores the results as packed
/// double-precision floating-point elements.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd, ROUNDING = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_pd<const ROUNDING: i32>(a: __m128d) -> __m128d {
    static_assert_imm4!(ROUNDING);
    roundpd(a, ROUNDING)
}

/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// using the `ROUNDING` parameter, and stores the results as packed
/// single-precision floating-point elements.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps, ROUNDING = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_ps<const ROUNDING: i32>(a: __m128) -> __m128 {
    static_assert_imm4!(ROUNDING);
    roundps(a, ROUNDING)
}

/// Round the lower double-precision (64-bit) floating-point element in `b`
/// using the `ROUNDING` parameter, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element of the intrinsic
/// result.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd, ROUNDING = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_sd<const ROUNDING: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_imm4!(ROUNDING);
    roundsd(a, b, ROUNDING)
}

/// Round the lower single-precision (32-bit) floating-point element in `b`
/// using the `ROUNDING` parameter, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss, ROUNDING = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_ss<const ROUNDING: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_imm4!(ROUNDING);
    roundss(a, b, ROUNDING)
}

/// Finds the minimum unsigned 16-bit element in the 128-bit `__m128i`
/// vector, returning a vector containing its value in its first position,
/// and its index in its second position; all other elements are set to
/// zero.
///
/// This intrinsic corresponds to the `VPHMINPOSUW` / `PHMINPOSUW`
/// instruction.
///
/// Arguments:
///
/// * `a` - A 128-bit vector of type `__m128i`.
///
/// Returns:
///
/// A 128-bit value where:
///
/// * bits `[15:0]` - contain the minimum value found in parameter `a`,
/// * bits `[18:16]` - contain the index of the minimum value
/// * remaining bits are set to `0`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16)
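///
/// An illustrative sketch (hypothetical values; assumes `sse4.1` and an
/// `unsafe` context):
///
/// ```ignore
/// let a = _mm_setr_epi16(23, 18, 44, 97, 50, 13, 67, 66);
/// // The minimum is 13 at index 5; the result packs the value in lane 0
/// // and the index in lane 1:
/// let r = _mm_minpos_epu16(a); // 13, 5, 0, 0, 0, 0, 0, 0
/// ```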
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(phminposuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_minpos_epu16(a: __m128i) -> __m128i {
    transmute(phminposuw(a.as_u16x8()))
}

/// Multiplies the low 32-bit integers from each packed 64-bit
/// element in `a` and `b`, and returns the signed 64-bit result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_epi32)
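///
/// A quick sketch (hypothetical values; assumes `sse4.1` and an `unsafe`
/// context): only the low 32-bit integer of each 64-bit element
/// participates.
///
/// ```ignore
/// let a = _mm_setr_epi32(1, 999, 2, 999); // lanes 1 and 3 are ignored
/// let b = _mm_setr_epi32(3, 999, 4, 999);
/// let r = _mm_mul_epi32(a, b); // two i64 lanes: 3, 8
/// ```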
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmuldq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mul_epi32(a: __m128i, b: __m128i) -> __m128i {
    transmute(pmuldq(a.as_i32x4(), b.as_i32x4()))
}

/// Multiplies the packed 32-bit integers in `a` and `b`, producing
/// intermediate 64-bit integers, and returns the low 32 bits of each
/// intermediate result, reinterpreted as a signed integer. While
/// `pmulld(__m128i::splat(2), __m128i::splat(2))` returns the obvious
/// `__m128i::splat(4)`, due to wrapping arithmetic
/// `pmulld(__m128i::splat(i32::MAX), __m128i::splat(2))` returns a
/// negative number.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mullo_epi32)
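///
/// A short sketch of the wrapping behavior (hypothetical values; assumes
/// `sse4.1` and an `unsafe` context):
///
/// ```ignore
/// let a = _mm_set1_epi32(i32::MAX);
/// let b = _mm_set1_epi32(2);
/// // i32::MAX * 2 = 0x1_FFFF_FFFE; keeping the low 32 bits wraps to -2:
/// let r = _mm_mullo_epi32(a, b); // -2 in every lane
/// ```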
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmulld))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mullo_epi32(a: __m128i, b: __m128i) -> __m128i {
    transmute(simd_mul(a.as_i32x4(), b.as_i32x4()))
}

/// Subtracts 8-bit unsigned integer values and computes the absolute
/// values of the differences. The sums of those absolute differences are
/// then returned according to the bit fields in the immediate operand.
///
/// The following algorithm is performed:
///
/// ```ignore
/// i = IMM8[2] * 4
/// j = IMM8[1:0] * 4
/// for k := 0 to 7
///     d0 = abs(a[i + k + 0] - b[j + 0])
///     d1 = abs(a[i + k + 1] - b[j + 1])
///     d2 = abs(a[i + k + 2] - b[j + 2])
///     d3 = abs(a[i + k + 3] - b[j + 3])
///     r[k] = d0 + d1 + d2 + d3
/// ```
///
/// Arguments:
///
/// * `a` - A 128-bit vector of type `__m128i`.
/// * `b` - A 128-bit vector of type `__m128i`.
/// * `IMM8` - An 8-bit immediate operand specifying how the absolute
///   differences are to be calculated
///     * Bit `[2]` specifies the offset for operand `a`
///     * Bits `[1:0]` specify the offset for operand `b`
///
/// Returns:
///
/// * A `__m128i` vector containing the sums of the sets of absolute
///   differences between both operands.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mpsadbw_epu8)
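///
/// A worked sketch (hypothetical values; assumes `sse4.1` and an `unsafe`
/// context). With `IMM8 = 0` both offsets are zero, so for the input below
/// `r[k] = |k - 0| + |k + 1 - 1| + |k + 2 - 2| + |k + 3 - 3| = 4 * k`:
///
/// ```ignore
/// let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
/// let b = a;
/// let r = _mm_mpsadbw_epu8::<0>(a, b); // 0, 4, 8, 12, 16, 20, 24, 28
/// ```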
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(mpsadbw, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mpsadbw_epu8<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
    static_assert_imm3!(IMM8);
    transmute(mpsadbw(a.as_u8x16(), b.as_u8x16(), IMM8 as u8))
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// zeros.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all zeros,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128)
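///
/// A small sketch (hypothetical values; assumes `sse4.1` and an `unsafe`
/// context): the test is `(a & mask) == 0`.
///
/// ```ignore
/// let a = _mm_set1_epi8(0b0101_0101u8 as i8);
/// let mask = _mm_set1_epi8(0b1010_1010u8 as i8);
/// // `a` and `mask` share no set bits, so the selected bits are all zero:
/// assert_eq!(_mm_testz_si128(a, mask), 1);
/// ```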
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 {
    ptestz(a.as_i64x2(), mask.as_i64x2())
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128)
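///
/// A small sketch (hypothetical values; assumes `sse4.1` and an `unsafe`
/// context): the test is `(!a & mask) == 0`, i.e. every bit selected by
/// `mask` is set in `a`.
///
/// ```ignore
/// let a = _mm_set1_epi8(0b1111_0000u8 as i8);
/// let mask = _mm_set1_epi8(0b1100_0000u8 as i8);
/// // All bits selected by `mask` are ones in `a`:
/// assert_eq!(_mm_testc_si128(a, mask), 1);
/// ```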
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_testc_si128(a: __m128i, mask: __m128i) -> i32 {
    ptestc(a.as_i64x2(), mask.as_i64x2())
}

/// Tests whether the specified bits in a 128-bit integer vector are
/// neither all zeros nor all ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are neither all zeros nor all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testnzc_si128)
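///
/// A small sketch (hypothetical values; assumes `sse4.1` and an `unsafe`
/// context): returns `1` when the bits selected by `mask` are a mix of
/// ones and zeros in `a`.
///
/// ```ignore
/// let a = _mm_set1_epi8(0b0101_0101u8 as i8);
/// let mask = _mm_set1_epi8(0b0000_1111u8 as i8);
/// // Within the mask, `a` has both set and clear bits:
/// assert_eq!(_mm_testnzc_si128(a, mask), 1);
/// ```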
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_testnzc_si128(a: __m128i, mask: __m128i) -> i32 {
    ptestnzc(a.as_i64x2(), mask.as_i64x2())
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// zeros.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all zeros,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_test_all_zeros(a: __m128i, mask: __m128i) -> i32 {
    _mm_testz_si128(a, mask)
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// ones.
///
/// Argument:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
///
/// Returns:
///
/// * `1` - if the bits specified in the operand are all set to 1,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pcmpeqd))]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_test_all_ones(a: __m128i) -> i32 {
    _mm_testc_si128(a, _mm_cmpeq_epi32(a, a))
}

/// Tests whether the specified bits in a 128-bit integer vector are
/// neither all zeros nor all ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are neither all zeros nor all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_mix_ones_zeros)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 {
    _mm_testnzc_si128(a, mask)
}

#[allow(improper_ctypes)]
extern "C" {
    #[link_name = "llvm.x86.sse41.pblendvb"]
    fn pblendvb(a: i8x16, b: i8x16, mask: i8x16) -> i8x16;
    #[link_name = "llvm.x86.sse41.blendvpd"]
    fn blendvpd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d;
    #[link_name = "llvm.x86.sse41.blendvps"]
    fn blendvps(a: __m128, b: __m128, mask: __m128) -> __m128;
    #[link_name = "llvm.x86.sse41.blendpd"]
    fn blendpd(a: __m128d, b: __m128d, imm2: u8) -> __m128d;
    #[link_name = "llvm.x86.sse41.blendps"]
    fn blendps(a: __m128, b: __m128, imm4: u8) -> __m128;
    #[link_name = "llvm.x86.sse41.pblendw"]
    fn pblendw(a: i16x8, b: i16x8, imm8: u8) -> i16x8;
    #[link_name = "llvm.x86.sse41.insertps"]
    fn insertps(a: __m128, b: __m128, imm8: u8) -> __m128;
    #[link_name = "llvm.x86.sse41.pmaxsb"]
    fn pmaxsb(a: i8x16, b: i8x16) -> i8x16;
    #[link_name = "llvm.x86.sse41.pmaxuw"]
    fn pmaxuw(a: u16x8, b: u16x8) -> u16x8;
    #[link_name = "llvm.x86.sse41.pmaxsd"]
    fn pmaxsd(a: i32x4, b: i32x4) -> i32x4;
    #[link_name = "llvm.x86.sse41.pmaxud"]
    fn pmaxud(a: u32x4, b: u32x4) -> u32x4;
    #[link_name = "llvm.x86.sse41.pminsb"]
    fn pminsb(a: i8x16, b: i8x16) -> i8x16;
    #[link_name = "llvm.x86.sse41.pminuw"]
    fn pminuw(a: u16x8, b: u16x8) -> u16x8;
    #[link_name = "llvm.x86.sse41.pminsd"]
    fn pminsd(a: i32x4, b: i32x4) -> i32x4;
    #[link_name = "llvm.x86.sse41.pminud"]
    fn pminud(a: u32x4, b: u32x4) -> u32x4;
    #[link_name = "llvm.x86.sse41.packusdw"]
    fn packusdw(a: i32x4, b: i32x4) -> u16x8;
    #[link_name = "llvm.x86.sse41.dppd"]
    fn dppd(a: __m128d, b: __m128d, imm8: u8) -> __m128d;
    #[link_name = "llvm.x86.sse41.dpps"]
    fn dpps(a: __m128, b: __m128, imm8: u8) -> __m128;
    #[link_name = "llvm.x86.sse41.round.pd"]
    fn roundpd(a: __m128d, rounding: i32) -> __m128d;
    #[link_name = "llvm.x86.sse41.round.ps"]
    fn roundps(a: __m128, rounding: i32) -> __m128;
    #[link_name = "llvm.x86.sse41.round.sd"]
    fn roundsd(a: __m128d, b: __m128d, rounding: i32) -> __m128d;
    #[link_name = "llvm.x86.sse41.round.ss"]
    fn roundss(a: __m128, b: __m128, rounding: i32) -> __m128;
    #[link_name = "llvm.x86.sse41.phminposuw"]
    fn phminposuw(a: u16x8) -> u16x8;
    #[link_name = "llvm.x86.sse41.pmuldq"]
    fn pmuldq(a: i32x4, b: i32x4) -> i64x2;
    #[link_name = "llvm.x86.sse41.mpsadbw"]
    fn mpsadbw(a: u8x16, b: u8x16, imm8: u8) -> u16x8;
    #[link_name = "llvm.x86.sse41.ptestz"]
    fn ptestz(a: i64x2, mask: i64x2) -> i32;
    #[link_name = "llvm.x86.sse41.ptestc"]
    fn ptestc(a: i64x2, mask: i64x2) -> i32;
    #[link_name = "llvm.x86.sse41.ptestnzc"]
    fn ptestnzc(a: i64x2, mask: i64x2) -> i32;
}

1148 #[cfg(test)]
1149 mod tests {
1150 use crate::core_arch::x86::*;
1151 use std::mem;
1152 use stdarch_test::simd_test;
1153
1154 #[simd_test(enable = "sse4.1")]
1155 unsafe fn test_mm_blendv_epi8() {
1156 #[rustfmt::skip]
1157 let a = _mm_setr_epi8(
1158 0, 1, 2, 3, 4, 5, 6, 7,
1159 8, 9, 10, 11, 12, 13, 14, 15,
1160 );
1161 #[rustfmt::skip]
1162 let b = _mm_setr_epi8(
1163 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
1164 );
1165 #[rustfmt::skip]
1166 let mask = _mm_setr_epi8(
1167 0, -1, 0, -1, 0, -1, 0, -1,
1168 0, -1, 0, -1, 0, -1, 0, -1,
1169 );
1170 #[rustfmt::skip]
1171 let e = _mm_setr_epi8(
1172 0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31,
1173 );
1174 assert_eq_m128i(_mm_blendv_epi8(a, b, mask), e);
1175 }
1176
1177 #[simd_test(enable = "sse4.1")]
1178 unsafe fn test_mm_blendv_pd() {
1179 let a = _mm_set1_pd(0.0);
1180 let b = _mm_set1_pd(1.0);
1181 let mask = transmute(_mm_setr_epi64x(0, -1));
1182 let r = _mm_blendv_pd(a, b, mask);
1183 let e = _mm_setr_pd(0.0, 1.0);
1184 assert_eq_m128d(r, e);
1185 }
1186
1187 #[simd_test(enable = "sse4.1")]
1188 unsafe fn test_mm_blendv_ps() {
1189 let a = _mm_set1_ps(0.0);
1190 let b = _mm_set1_ps(1.0);
1191 let mask = transmute(_mm_setr_epi32(0, -1, 0, -1));
1192 let r = _mm_blendv_ps(a, b, mask);
1193 let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0);
1194 assert_eq_m128(r, e);
1195 }
1196
1197 #[simd_test(enable = "sse4.1")]
1198 unsafe fn test_mm_blend_pd() {
1199 let a = _mm_set1_pd(0.0);
1200 let b = _mm_set1_pd(1.0);
1201 let r = _mm_blend_pd::<0b10>(a, b);
1202 let e = _mm_setr_pd(0.0, 1.0);
1203 assert_eq_m128d(r, e);
1204 }
1205
1206 #[simd_test(enable = "sse4.1")]
1207 unsafe fn test_mm_blend_ps() {
1208 let a = _mm_set1_ps(0.0);
1209 let b = _mm_set1_ps(1.0);
1210 let r = _mm_blend_ps::<0b1010>(a, b);
1211 let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0);
1212 assert_eq_m128(r, e);
1213 }
1214
1215 #[simd_test(enable = "sse4.1")]
1216 unsafe fn test_mm_blend_epi16() {
1217 let a = _mm_set1_epi16(0);
1218 let b = _mm_set1_epi16(1);
1219 let r = _mm_blend_epi16::<0b1010_1100>(a, b);
1220 let e = _mm_setr_epi16(0, 0, 1, 1, 0, 1, 0, 1);
1221 assert_eq_m128i(r, e);
1222 }
1223
1224 #[simd_test(enable = "sse4.1")]
1225 unsafe fn test_mm_extract_ps() {
1226 let a = _mm_setr_ps(0.0, 1.0, 2.0, 3.0);
1227 let r: f32 = transmute(_mm_extract_ps::<1>(a));
1228 assert_eq!(r, 1.0);
1229 let r: f32 = transmute(_mm_extract_ps::<3>(a));
1230 assert_eq!(r, 3.0);
1231 }
1232
1233 #[simd_test(enable = "sse4.1")]
1234 unsafe fn test_mm_extract_epi8() {
1235 #[rustfmt::skip]
1236 let a = _mm_setr_epi8(
1237 -1, 1, 2, 3, 4, 5, 6, 7,
1238 8, 9, 10, 11, 12, 13, 14, 15
1239 );
1240 let r1 = _mm_extract_epi8::<0>(a);
1241 let r2 = _mm_extract_epi8::<3>(a);
1242 assert_eq!(r1, 0xFF);
1243 assert_eq!(r2, 3);
1244 }
1245
1246 #[simd_test(enable = "sse4.1")]
1247 unsafe fn test_mm_extract_epi32() {
1248 let a = _mm_setr_epi32(0, 1, 2, 3);
1249 let r = _mm_extract_epi32::<1>(a);
1250 assert_eq!(r, 1);
1251 let r = _mm_extract_epi32::<3>(a);
1252 assert_eq!(r, 3);
1253 }
1254
1255 #[simd_test(enable = "sse4.1")]
1256 unsafe fn test_mm_insert_ps() {
1257 let a = _mm_set1_ps(1.0);
1258 let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
1259 let r = _mm_insert_ps::<0b11_00_1100>(a, b);
1260 let e = _mm_setr_ps(4.0, 1.0, 0.0, 0.0);
1261 assert_eq_m128(r, e);
1262 }
1263
1264 #[simd_test(enable = "sse4.1")]
1265 unsafe fn test_mm_insert_epi8() {
1266 let a = _mm_set1_epi8(0);
1267 let e = _mm_setr_epi8(0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
1268 let r = _mm_insert_epi8::<1>(a, 32);
1269 assert_eq_m128i(r, e);
1270 let e = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0);
1271 let r = _mm_insert_epi8::<14>(a, 32);
1272 assert_eq_m128i(r, e);
1273 }
1274
1275 #[simd_test(enable = "sse4.1")]
1276 unsafe fn test_mm_insert_epi32() {
1277 let a = _mm_set1_epi32(0);
1278 let e = _mm_setr_epi32(0, 32, 0, 0);
1279 let r = _mm_insert_epi32::<1>(a, 32);
1280 assert_eq_m128i(r, e);
1281 let e = _mm_setr_epi32(0, 0, 0, 32);
1282 let r = _mm_insert_epi32::<3>(a, 32);
1283 assert_eq_m128i(r, e);
1284 }
1285
1286 #[simd_test(enable = "sse4.1")]
1287 unsafe fn test_mm_max_epi8() {
1288 #[rustfmt::skip]
1289 let a = _mm_setr_epi8(
1290 1, 4, 5, 8, 9, 12, 13, 16,
1291 17, 20, 21, 24, 25, 28, 29, 32,
1292 );
1293 #[rustfmt::skip]
1294 let b = _mm_setr_epi8(
1295 2, 3, 6, 7, 10, 11, 14, 15,
1296 18, 19, 22, 23, 26, 27, 30, 31,
1297 );
1298 let r = _mm_max_epi8(a, b);
1299 #[rustfmt::skip]
1300 let e = _mm_setr_epi8(
1301 2, 4, 6, 8, 10, 12, 14, 16,
1302 18, 20, 22, 24, 26, 28, 30, 32,
1303 );
1304 assert_eq_m128i(r, e);
1305 }
1306
1307 #[simd_test(enable = "sse4.1")]
1308 unsafe fn test_mm_max_epu16() {
1309 let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16);
1310 let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15);
1311 let r = _mm_max_epu16(a, b);
1312 let e = _mm_setr_epi16(2, 4, 6, 8, 10, 12, 14, 16);
1313 assert_eq_m128i(r, e);
1314 }
1315
1316 #[simd_test(enable = "sse4.1")]
1317 unsafe fn test_mm_max_epi32() {
1318 let a = _mm_setr_epi32(1, 4, 5, 8);
1319 let b = _mm_setr_epi32(2, 3, 6, 7);
1320 let r = _mm_max_epi32(a, b);
1321 let e = _mm_setr_epi32(2, 4, 6, 8);
1322 assert_eq_m128i(r, e);
1323 }
1324
1325 #[simd_test(enable = "sse4.1")]
1326 unsafe fn test_mm_max_epu32() {
1327 let a = _mm_setr_epi32(1, 4, 5, 8);
1328 let b = _mm_setr_epi32(2, 3, 6, 7);
1329 let r = _mm_max_epu32(a, b);
1330 let e = _mm_setr_epi32(2, 4, 6, 8);
1331 assert_eq_m128i(r, e);
1332 }
1333
1334 #[simd_test(enable = "sse4.1")]
1335 unsafe fn test_mm_min_epi8_1() {
1336 #[rustfmt::skip]
1337 let a = _mm_setr_epi8(
1338 1, 4, 5, 8, 9, 12, 13, 16,
1339 17, 20, 21, 24, 25, 28, 29, 32,
1340 );
1341 #[rustfmt::skip]
1342 let b = _mm_setr_epi8(
1343 2, 3, 6, 7, 10, 11, 14, 15,
1344 18, 19, 22, 23, 26, 27, 30, 31,
1345 );
1346 let r = _mm_min_epi8(a, b);
1347 #[rustfmt::skip]
1348 let e = _mm_setr_epi8(
1349 1, 3, 5, 7, 9, 11, 13, 15,
1350 17, 19, 21, 23, 25, 27, 29, 31,
1351 );
1352 assert_eq_m128i(r, e);
1353 }
1354
1355 #[simd_test(enable = "sse4.1")]
1356 unsafe fn test_mm_min_epi8_2() {
1357 #[rustfmt::skip]
1358 let a = _mm_setr_epi8(
1359 1, -4, -5, 8, -9, -12, 13, -16,
1360 17, 20, 21, 24, 25, 28, 29, 32,
1361 );
1362 #[rustfmt::skip]
1363 let b = _mm_setr_epi8(
1364 2, -3, -6, 7, -10, -11, 14, -15,
1365 18, 19, 22, 23, 26, 27, 30, 31,
1366 );
1367 let r = _mm_min_epi8(a, b);
1368 #[rustfmt::skip]
1369 let e = _mm_setr_epi8(
1370 1, -4, -6, 7, -10, -12, 13, -16,
1371 17, 19, 21, 23, 25, 27, 29, 31,
1372 );
1373 assert_eq_m128i(r, e);
1374 }
1375
1376 #[simd_test(enable = "sse4.1")]
1377 unsafe fn test_mm_min_epu16() {
1378 let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16);
1379 let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15);
1380 let r = _mm_min_epu16(a, b);
1381 let e = _mm_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15);
1382 assert_eq_m128i(r, e);
1383 }
1384
1385 #[simd_test(enable = "sse4.1")]
1386 unsafe fn test_mm_min_epi32_1() {
1387 let a = _mm_setr_epi32(1, 4, 5, 8);
1388 let b = _mm_setr_epi32(2, 3, 6, 7);
1389 let r = _mm_min_epi32(a, b);
1390 let e = _mm_setr_epi32(1, 3, 5, 7);
1391 assert_eq_m128i(r, e);
1392 }
1393
1394 #[simd_test(enable = "sse4.1")]
1395 unsafe fn test_mm_min_epi32_2() {
1396 let a = _mm_setr_epi32(-1, 4, 5, -7);
1397 let b = _mm_setr_epi32(-2, 3, -6, 8);
1398 let r = _mm_min_epi32(a, b);
1399 let e = _mm_setr_epi32(-2, 3, -6, -7);
1400 assert_eq_m128i(r, e);
1401 }
1402
1403 #[simd_test(enable = "sse4.1")]
1404 unsafe fn test_mm_min_epu32() {
1405 let a = _mm_setr_epi32(1, 4, 5, 8);
1406 let b = _mm_setr_epi32(2, 3, 6, 7);
1407 let r = _mm_min_epu32(a, b);
1408 let e = _mm_setr_epi32(1, 3, 5, 7);
1409 assert_eq_m128i(r, e);
1410 }
1411
1412 #[simd_test(enable = "sse4.1")]
1413 unsafe fn test_mm_packus_epi32() {
1414 let a = _mm_setr_epi32(1, 2, 3, 4);
1415 let b = _mm_setr_epi32(-1, -2, -3, -4);
1416 let r = _mm_packus_epi32(a, b);
1417 let e = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0);
1418 assert_eq_m128i(r, e);
1419 }
1420
1421 #[simd_test(enable = "sse4.1")]
1422 unsafe fn test_mm_cmpeq_epi64() {
1423 let a = _mm_setr_epi64x(0, 1);
1424 let b = _mm_setr_epi64x(0, 0);
1425 let r = _mm_cmpeq_epi64(a, b);
1426 let e = _mm_setr_epi64x(-1, 0);
1427 assert_eq_m128i(r, e);
1428 }
1429
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi8_epi16() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepi8_epi16(a);
        let e = _mm_set1_epi16(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi8(-10);
        let r = _mm_cvtepi8_epi16(a);
        let e = _mm_set1_epi16(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi8_epi32() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepi8_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi8(-10);
        let r = _mm_cvtepi8_epi32(a);
        let e = _mm_set1_epi32(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi8_epi64() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepi8_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi8(-10);
        let r = _mm_cvtepi8_epi64(a);
        let e = _mm_set1_epi64x(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi16_epi32() {
        let a = _mm_set1_epi16(10);
        let r = _mm_cvtepi16_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi16(-10);
        let r = _mm_cvtepi16_epi32(a);
        let e = _mm_set1_epi32(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi16_epi64() {
        let a = _mm_set1_epi16(10);
        let r = _mm_cvtepi16_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi16(-10);
        let r = _mm_cvtepi16_epi64(a);
        let e = _mm_set1_epi64x(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi32_epi64() {
        let a = _mm_set1_epi32(10);
        let r = _mm_cvtepi32_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi32(-10);
        let r = _mm_cvtepi32_epi64(a);
        let e = _mm_set1_epi64x(-10);
        assert_eq_m128i(r, e);
    }

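    // The `_mm_cvtepu*` conversions zero-extend instead; an i8 of -10 would
    // widen to 246, which is why these tests only use positive values.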
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu8_epi16() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepu8_epi16(a);
        let e = _mm_set1_epi16(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu8_epi32() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepu8_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu8_epi64() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepu8_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu16_epi32() {
        let a = _mm_set1_epi16(10);
        let r = _mm_cvtepu16_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu16_epi64() {
        let a = _mm_set1_epi16(10);
        let r = _mm_cvtepu16_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu32_epi64() {
        let a = _mm_set1_epi32(10);
        let r = _mm_cvtepu32_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
    }

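    // For `_mm_dp_pd`, the high nibble of the immediate selects which lanes
    // are multiplied and the low nibble selects which result lanes receive
    // the sum: 0b0011_0001 multiplies both lanes (2*1 + 3*4 = 14) and writes
    // the sum to lane 0 only.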
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_dp_pd() {
        let a = _mm_setr_pd(2.0, 3.0);
        let b = _mm_setr_pd(1.0, 4.0);
        let e = _mm_setr_pd(14.0, 0.0);
        assert_eq_m128d(_mm_dp_pd::<0b00110001>(a, b), e);
    }

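    // Same immediate encoding for `_mm_dp_ps`: 0b0111_0101 multiplies lanes
    // 0..=2 (2*1 + 3*4 + 1*0.5 = 14.5) and broadcasts the sum to lanes 0 and 2.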
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_dp_ps() {
        let a = _mm_setr_ps(2.0, 3.0, 1.0, 10.0);
        let b = _mm_setr_ps(1.0, 4.0, 0.5, 10.0);
        let e = _mm_setr_ps(14.5, 0.0, 14.5, 0.0);
        assert_eq_m128(_mm_dp_ps::<0b01110101>(a, b), e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_floor_pd() {
        let a = _mm_setr_pd(2.5, 4.5);
        let r = _mm_floor_pd(a);
        let e = _mm_setr_pd(2.0, 4.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_floor_ps() {
        let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5);
        let r = _mm_floor_ps(a);
        let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0);
        assert_eq_m128(r, e);
    }

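    // The scalar variants (`_sd`/`_ss`) round only lane 0 of `b` and copy
    // the remaining lanes from `a`.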
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_floor_sd() {
        let a = _mm_setr_pd(2.5, 4.5);
        let b = _mm_setr_pd(-1.5, -3.5);
        let r = _mm_floor_sd(a, b);
        let e = _mm_setr_pd(-2.0, 4.5);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_floor_ss() {
        let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5);
        let b = _mm_setr_ps(-1.5, -3.5, -7.5, -15.5);
        let r = _mm_floor_ss(a, b);
        let e = _mm_setr_ps(-2.0, 4.5, 8.5, 16.5);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_ceil_pd() {
        let a = _mm_setr_pd(1.5, 3.5);
        let r = _mm_ceil_pd(a);
        let e = _mm_setr_pd(2.0, 4.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_ceil_ps() {
        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
        let r = _mm_ceil_ps(a);
        let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0);
        assert_eq_m128(r, e);
    }

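    // ceil rounds toward +infinity, so ceil(-2.5) = -2.0 in lane 0.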
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_ceil_sd() {
        let a = _mm_setr_pd(1.5, 3.5);
        let b = _mm_setr_pd(-2.5, -4.5);
        let r = _mm_ceil_sd(a, b);
        let e = _mm_setr_pd(-2.0, 3.5);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_ceil_ss() {
        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
        let b = _mm_setr_ps(-2.5, -4.5, -8.5, -16.5);
        let r = _mm_ceil_ss(a, b);
        let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_round_pd() {
        let a = _mm_setr_pd(1.25, 3.75);
        let r = _mm_round_pd::<_MM_FROUND_TO_NEAREST_INT>(a);
        let e = _mm_setr_pd(1.0, 4.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_round_ps() {
        let a = _mm_setr_ps(2.25, 4.75, -1.75, -4.25);
        let r = _mm_round_ps::<_MM_FROUND_TO_ZERO>(a);
        let e = _mm_setr_ps(2.0, 4.0, -1.0, -4.0);
        assert_eq_m128(r, e);
    }

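    // `_MM_FROUND_CUR_DIRECTION` defers to MXCSR.RC, so these tests pin the
    // rounding mode explicitly and restore the previous mode afterwards.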
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_round_sd() {
        let a = _mm_setr_pd(1.5, 3.5);
        let b = _mm_setr_pd(-2.5, -4.5);
        let old_mode = _MM_GET_ROUNDING_MODE();
        _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
        let r = _mm_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, b);
        _MM_SET_ROUNDING_MODE(old_mode);
        let e = _mm_setr_pd(-2.0, 3.5);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_round_ss() {
        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
        let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5);
        let old_mode = _MM_GET_ROUNDING_MODE();
        _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
        let r = _mm_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, b);
        _MM_SET_ROUNDING_MODE(old_mode);
        let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5);
        assert_eq_m128(r, e);
    }

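    // `_mm_minpos_epu16` returns the smallest u16 in lane 0, its index in
    // lane 1, and zeroes the remaining lanes.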
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_minpos_epu16_1() {
        let a = _mm_setr_epi16(23, 18, 44, 97, 50, 13, 67, 66);
        let r = _mm_minpos_epu16(a);
        let e = _mm_setr_epi16(13, 5, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_minpos_epu16_2() {
        let a = _mm_setr_epi16(0, 18, 44, 97, 50, 13, 67, 66);
        let r = _mm_minpos_epu16(a);
        let e = _mm_setr_epi16(0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

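    // `_mm_mul_epi32` multiplies only lanes 0 and 2, sign-extending each to
    // i64; lanes 1 and 3 are ignored.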
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_mul_epi32() {
        {
            let a = _mm_setr_epi32(1, 1, 1, 1);
            let b = _mm_setr_epi32(1, 2, 3, 4);
            let r = _mm_mul_epi32(a, b);
            let e = _mm_setr_epi64x(1, 3);
            assert_eq_m128i(r, e);
        }
        {
            let a = _mm_setr_epi32(15, 2 /* ignored */, 1234567, 4 /* ignored */);
            let b = _mm_setr_epi32(
                -20, -256, /* ignored */
                666666, 666666, /* ignored */
            );
            let r = _mm_mul_epi32(a, b);
            let e = _mm_setr_epi64x(-300, 823043843622);
            assert_eq_m128i(r, e);
        }
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_mullo_epi32() {
        {
            let a = _mm_setr_epi32(1, 1, 1, 1);
            let b = _mm_setr_epi32(1, 2, 3, 4);
            let r = _mm_mullo_epi32(a, b);
            let e = _mm_setr_epi32(1, 2, 3, 4);
            assert_eq_m128i(r, e);
        }
        {
            let a = _mm_setr_epi32(15, -2, 1234567, 99999);
            let b = _mm_setr_epi32(-20, -256, 666666, -99999);
            let r = _mm_mullo_epi32(a, b);
            // Note: only the low 32 bits of each product are kept, and the
            // most significant bit of r[2] ends up set, so the lane reads
            // as a negative i32: 1234567 * 666666 (mod 2^32) = -1589877210.
            let e = _mm_setr_epi32(-300, 512, -1589877210, -1409865409);
            assert_eq_m128i(r, e);
        }
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_minpos_epu16() {
        let a = _mm_setr_epi16(8, 7, 6, 5, 4, 1, 2, 3);
        let r = _mm_minpos_epu16(a);
        let e = _mm_setr_epi16(1, 5, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

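    // `_mm_mpsadbw_epu8` immediate: bit 2 selects a 4-byte offset into `a`
    // (0 or 4) and bits 1:0 select one aligned 4-byte block of `b`; each of
    // the eight results is the sum of absolute differences between that
    // block and a 4-byte window of `a` sliding one byte at a time.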
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_mpsadbw_epu8() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );

        let r = _mm_mpsadbw_epu8::<0b000>(a, a);
        let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28);
        assert_eq_m128i(r, e);

        let r = _mm_mpsadbw_epu8::<0b001>(a, a);
        let e = _mm_setr_epi16(16, 12, 8, 4, 0, 4, 8, 12);
        assert_eq_m128i(r, e);

        let r = _mm_mpsadbw_epu8::<0b100>(a, a);
        let e = _mm_setr_epi16(16, 20, 24, 28, 32, 36, 40, 44);
        assert_eq_m128i(r, e);

        let r = _mm_mpsadbw_epu8::<0b101>(a, a);
        let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28);
        assert_eq_m128i(r, e);

        let r = _mm_mpsadbw_epu8::<0b111>(a, a);
        let e = _mm_setr_epi16(32, 28, 24, 20, 16, 12, 8, 4);
        assert_eq_m128i(r, e);
    }

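    // `_mm_testz_si128` returns 1 iff `a & mask` is all zeros (the ZF
    // result of PTEST).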
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_testz_si128() {
        let a = _mm_set1_epi8(1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_testz_si128(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_testz_si128(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b011);
        let mask = _mm_set1_epi8(0b100);
        let r = _mm_testz_si128(a, mask);
        assert_eq!(r, 1);
    }

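    // `_mm_testc_si128` returns 1 iff `!a & mask` is all zeros (the CF
    // result of PTEST), i.e. every bit set in `mask` is also set in `a`.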
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_testc_si128() {
        let a = _mm_set1_epi8(-1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_testc_si128(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_testc_si128(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b100);
        let r = _mm_testc_si128(a, mask);
        assert_eq!(r, 1);
    }

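    // `_mm_testnzc_si128` returns 1 iff both `a & mask` and `!a & mask` are
    // nonzero, i.e. `mask` selects a mix of ones and zeros from `a`.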
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_testnzc_si128() {
        let a = _mm_set1_epi8(0);
        let mask = _mm_set1_epi8(1);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(-1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b101);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 0);
    }

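    // `_mm_test_all_zeros(a, mask)` behaves like `_mm_testz_si128(a, mask)`.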
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_test_all_zeros() {
        let a = _mm_set1_epi8(1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_test_all_zeros(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_test_all_zeros(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b011);
        let mask = _mm_set1_epi8(0b100);
        let r = _mm_test_all_zeros(a, mask);
        assert_eq!(r, 1);
    }

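    // `_mm_test_all_ones` returns 1 iff all 128 bits of `a` are set.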
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_test_all_ones() {
        let a = _mm_set1_epi8(-1);
        let r = _mm_test_all_ones(a);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let r = _mm_test_all_ones(a);
        assert_eq!(r, 0);
    }

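    // `_mm_test_mix_ones_zeros(a, mask)` behaves like
    // `_mm_testnzc_si128(a, mask)`.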
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_test_mix_ones_zeros() {
        let a = _mm_set1_epi8(0);
        let mask = _mm_set1_epi8(1);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(-1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b101);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 0);
    }
}