//! Streaming SIMD Extensions 4.1 (SSE4.1)

use crate::{
    core_arch::{simd::*, simd_llvm::*, x86::*},
    mem::transmute,
};

#[cfg(test)]
use stdsimd_test::assert_instr;

// SSE4 rounding constants
/// round to nearest
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_NEAREST_INT: i32 = 0x00;
/// round down
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_NEG_INF: i32 = 0x01;
/// round up
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_POS_INF: i32 = 0x02;
/// truncate
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_ZERO: i32 = 0x03;
/// use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_CUR_DIRECTION: i32 = 0x04;
/// do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_RAISE_EXC: i32 = 0x00;
/// suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NO_EXC: i32 = 0x08;
/// round to nearest and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NINT: i32 = 0x00;
/// round down and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_FLOOR: i32 = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF);
/// round up and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_CEIL: i32 = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF);
/// truncate and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TRUNC: i32 = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO);
/// use MXCSR.RC and do not suppress exceptions; see
/// `_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_RINT: i32 = (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION);
/// use MXCSR.RC and suppress exceptions; see `_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NEARBYINT: i32 = (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION);
/// Blend packed 8-bit integers from `a` and `b` using `mask`
///
/// The high bit of each corresponding mask byte determines the selection.
/// If the high bit is set, the element of `b` is selected; otherwise, the
/// element of `a` is selected.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_epi8)
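///
/// A brief usage sketch (an editorial addition, not part of Intel's
/// documentation; it assumes a CPU with SSE4.1, hence the `ignore` fence):
///
/// ```ignore
/// use std::arch::x86_64::*; // or std::arch::x86 on 32-bit targets
/// unsafe {
///     let a = _mm_set1_epi8(1);
///     let b = _mm_set1_epi8(2);
///     // Mask bytes with the high bit set pick the element from `b`.
///     let mask = _mm_setr_epi8(0, -1, 0, -1, 0, -1, 0, -1,
///                              0, -1, 0, -1, 0, -1, 0, -1);
///     let r = _mm_blendv_epi8(a, b, mask); // [1, 2, 1, 2, ...]
/// }
/// ```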
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pblendvb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i {
    transmute(pblendvb(a.as_i8x16(), b.as_i8x16(), mask.as_i8x16()))
}

/// Blend packed 16-bit integers from `a` and `b` using the mask `imm8`.
///
/// The mask bits determine the selection. A clear bit selects the
/// corresponding element of `a`, and a set bit the corresponding
/// element of `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_epi16)
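///
/// A brief usage sketch (an editorial addition; values mirror this module's
/// own test, and SSE4.1 support is assumed):
///
/// ```ignore
/// use std::arch::x86_64::*;
/// unsafe {
///     let a = _mm_set1_epi16(0);
///     let b = _mm_set1_epi16(1);
///     // Bit k of the immediate picks element k from `b`.
///     let r = _mm_blend_epi16(a, b, 0b1010_1100);
///     // r = [0, 0, 1, 1, 0, 1, 0, 1]
/// }
/// ```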
#[inline]
#[target_feature(enable = "sse4.1")]
// Note: LLVM7 prefers the single-precision floating-point domain when possible
// see https://bugs.llvm.org/show_bug.cgi?id=38195
// #[cfg_attr(test, assert_instr(pblendw, imm8 = 0xF0))]
#[cfg_attr(test, assert_instr(blendps, imm8 = 0xF0))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blend_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
    let a = a.as_i16x8();
    let b = b.as_i16x8();
    macro_rules! call {
        ($imm8:expr) => {
            pblendw(a, b, $imm8)
        };
    }
    transmute(constify_imm8!(imm8, call))
}

/// Blend packed double-precision (64-bit) floating-point elements from `a`
/// and `b` using `mask`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendvpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d {
    blendvpd(a, b, mask)
}

/// Blend packed single-precision (32-bit) floating-point elements from `a`
/// and `b` using `mask`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendvps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 {
    blendvps(a, b, mask)
}

/// Blend packed double-precision (64-bit) floating-point elements from `a`
/// and `b` using control mask `imm2`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
// Note: LLVM7 prefers the single-precision floating-point domain when possible
// see https://bugs.llvm.org/show_bug.cgi?id=38195
// #[cfg_attr(test, assert_instr(blendpd, imm2 = 0b10))]
#[cfg_attr(test, assert_instr(blendps, imm2 = 0b10))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blend_pd(a: __m128d, b: __m128d, imm2: i32) -> __m128d {
    macro_rules! call {
        ($imm2:expr) => {
            blendpd(a, b, $imm2)
        };
    }
    constify_imm2!(imm2, call)
}

/// Blend packed single-precision (32-bit) floating-point elements from `a`
/// and `b` using mask `imm4`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendps, imm4 = 0b0101))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blend_ps(a: __m128, b: __m128, imm4: i32) -> __m128 {
    macro_rules! call {
        ($imm4:expr) => {
            blendps(a, b, $imm4)
        };
    }
    constify_imm4!(imm4, call)
}

/// Extracts a single-precision (32-bit) floating-point element from `a`,
/// selected with `imm8`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_ps)
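///
/// Note that the result is the raw bit pattern of the selected `f32` lane,
/// returned as an `i32`. A brief sketch (an editorial addition; values follow
/// this module's own test, SSE4.1 assumed):
///
/// ```ignore
/// use std::arch::x86_64::*;
/// unsafe {
///     let a = _mm_setr_ps(0.0, 1.0, 2.0, 3.0);
///     let bits = _mm_extract_ps(a, 1);
///     assert_eq!(f32::from_bits(bits as u32), 1.0);
/// }
/// ```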
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(extractps, imm8 = 0)
)]
#[rustc_args_required_const(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_extract_ps(a: __m128, imm8: i32) -> i32 {
    transmute(simd_extract::<_, f32>(a, imm8 as u32 & 0b11))
}

/// Extracts an 8-bit integer from `a`, selected with `imm8`. Returns a 32-bit
/// integer containing the zero-extended integer data.
///
/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pextrb, imm8 = 0))]
#[rustc_args_required_const(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_extract_epi8(a: __m128i, imm8: i32) -> i32 {
    let imm8 = (imm8 & 15) as u32;
    simd_extract::<_, u8>(a.as_u8x16(), imm8) as i32
}

/// Extracts a 32-bit integer from `a`, selected with `imm8`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(extractps, imm8 = 1)
)]
#[rustc_args_required_const(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_extract_epi32(a: __m128i, imm8: i32) -> i32 {
    let imm8 = (imm8 & 3) as u32;
    simd_extract::<_, i32>(a.as_i32x4(), imm8)
}

/// Select a single value in `b` to store at some position in `a`,
/// then zero elements according to `imm8`.
///
/// `imm8` specifies which bits from operand `b` will be copied, which bits in
/// the result they will be copied to, and which bits in the result will be
/// cleared. The following assignments are made:
///
/// * Bits `[7:6]` specify the bits to copy from operand `b`:
///     - `00`: Selects bits `[31:0]` from operand `b`.
///     - `01`: Selects bits `[63:32]` from operand `b`.
///     - `10`: Selects bits `[95:64]` from operand `b`.
///     - `11`: Selects bits `[127:96]` from operand `b`.
///
/// * Bits `[5:4]` specify the bits in the result to which the selected bits
/// from operand `b` are copied:
///     - `00`: Copies the selected bits from `b` to result bits `[31:0]`.
///     - `01`: Copies the selected bits from `b` to result bits `[63:32]`.
///     - `10`: Copies the selected bits from `b` to result bits `[95:64]`.
///     - `11`: Copies the selected bits from `b` to result bits `[127:96]`.
///
/// * Bits `[3:0]`: If any of these bits are set, the corresponding result
/// element is cleared.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_ps)
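///
/// A brief sketch of the immediate encoding (an editorial addition; values
/// follow this module's own test, SSE4.1 assumed):
///
/// ```ignore
/// use std::arch::x86_64::*;
/// unsafe {
///     let a = _mm_set1_ps(1.0);
///     let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
///     // 0b11_00_1100: take b[3], write it to result[0], clear elements 2 and 3.
///     let r = _mm_insert_ps(a, b, 0b11_00_1100);
///     // r = [4.0, 1.0, 0.0, 0.0]
/// }
/// ```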
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(insertps, imm8 = 0b1010))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_insert_ps(a: __m128, b: __m128, imm8: i32) -> __m128 {
    macro_rules! call {
        ($imm8:expr) => {
            insertps(a, b, $imm8)
        };
    }
    constify_imm8!(imm8, call)
}

/// Returns a copy of `a` with the 8-bit integer from `i` inserted at a
/// location specified by `imm8`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pinsrb, imm8 = 0))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_insert_epi8(a: __m128i, i: i32, imm8: i32) -> __m128i {
    transmute(simd_insert(a.as_i8x16(), (imm8 & 0b1111) as u32, i as i8))
}

/// Returns a copy of `a` with the 32-bit integer from `i` inserted at a
/// location specified by `imm8`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pinsrd, imm8 = 0))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_insert_epi32(a: __m128i, i: i32, imm8: i32) -> __m128i {
    transmute(simd_insert(a.as_i32x4(), (imm8 & 0b11) as u32, i))
}

/// Compares packed 8-bit integers in `a` and `b` and returns packed maximum
/// values in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epi8(a: __m128i, b: __m128i) -> __m128i {
    transmute(pmaxsb(a.as_i8x16(), b.as_i8x16()))
}

/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed
/// maximum.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epu16(a: __m128i, b: __m128i) -> __m128i {
    transmute(pmaxuw(a.as_u16x8(), b.as_u16x8()))
}

/// Compares packed 32-bit integers in `a` and `b`, and returns packed maximum
/// values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epi32(a: __m128i, b: __m128i) -> __m128i {
    transmute(pmaxsd(a.as_i32x4(), b.as_i32x4()))
}

/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed
/// maximum values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxud))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epu32(a: __m128i, b: __m128i) -> __m128i {
    transmute(pmaxud(a.as_u32x4(), b.as_u32x4()))
}

/// Compares packed 8-bit integers in `a` and `b` and returns packed minimum
/// values in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epi8(a: __m128i, b: __m128i) -> __m128i {
    transmute(pminsb(a.as_i8x16(), b.as_i8x16()))
}

/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed
/// minimum.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epu16(a: __m128i, b: __m128i) -> __m128i {
    transmute(pminuw(a.as_u16x8(), b.as_u16x8()))
}

/// Compares packed 32-bit integers in `a` and `b`, and returns packed minimum
/// values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epi32(a: __m128i, b: __m128i) -> __m128i {
    transmute(pminsd(a.as_i32x4(), b.as_i32x4()))
}

/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed
/// minimum values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminud))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i {
    transmute(pminud(a.as_u32x4(), b.as_u32x4()))
}

/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
/// using unsigned saturation
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_packus_epi32)
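///
/// A brief sketch of the saturating behavior (an editorial addition; values
/// mirror this module's own test, SSE4.1 assumed):
///
/// ```ignore
/// use std::arch::x86_64::*;
/// unsafe {
///     let a = _mm_setr_epi32(1, 2, 3, 4);
///     let b = _mm_setr_epi32(-1, -2, -3, -4);
///     // Negative inputs saturate to 0 in the unsigned 16-bit result.
///     let r = _mm_packus_epi32(a, b);
///     // r = [1, 2, 3, 4, 0, 0, 0, 0]
/// }
/// ```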
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(packusdw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_packus_epi32(a: __m128i, b: __m128i) -> __m128i {
    transmute(packusdw(a.as_i32x4(), b.as_i32x4()))
}

/// Compares packed 64-bit integers in `a` and `b` for equality
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pcmpeqq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpeq_epi64(a: __m128i, b: __m128i) -> __m128i {
    transmute(simd_eq::<_, i64x2>(a.as_i64x2(), b.as_i64x2()))
}

/// Sign extend packed 8-bit integers in `a` to packed 16-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi8_epi16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi8_epi16(a: __m128i) -> __m128i {
    let a = a.as_i8x16();
    let a = simd_shuffle8::<_, i8x8>(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
    transmute(simd_cast::<_, i16x8>(a))
}

/// Sign extend packed 8-bit integers in `a` to packed 32-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi8_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi8_epi32(a: __m128i) -> __m128i {
    let a = a.as_i8x16();
    let a = simd_shuffle4::<_, i8x4>(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}

/// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed
/// 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi8_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi8_epi64(a: __m128i) -> __m128i {
    let a = a.as_i8x16();
    let a = simd_shuffle2::<_, i8x2>(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Sign extend packed 16-bit integers in `a` to packed 32-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi16_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi16_epi32(a: __m128i) -> __m128i {
    let a = a.as_i16x8();
    let a = simd_shuffle4::<_, i16x4>(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}

/// Sign extend packed 16-bit integers in `a` to packed 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi16_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxwq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi16_epi64(a: __m128i) -> __m128i {
    let a = a.as_i16x8();
    let a = simd_shuffle2::<_, i16x2>(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Sign extend packed 32-bit integers in `a` to packed 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi32_epi64(a: __m128i) -> __m128i {
    let a = a.as_i32x4();
    let a = simd_shuffle2::<_, i32x2>(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Zero extend packed unsigned 8-bit integers in `a` to packed 16-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu8_epi16(a: __m128i) -> __m128i {
    let a = a.as_u8x16();
    let a = simd_shuffle8::<_, u8x8>(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
    transmute(simd_cast::<_, i16x8>(a))
}

/// Zero extend packed unsigned 8-bit integers in `a` to packed 32-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu8_epi32(a: __m128i) -> __m128i {
    let a = a.as_u8x16();
    let a = simd_shuffle4::<_, u8x4>(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}

/// Zero extend packed unsigned 8-bit integers in `a` to packed 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu8_epi64(a: __m128i) -> __m128i {
    let a = a.as_u8x16();
    let a = simd_shuffle2::<_, u8x2>(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Zero extend packed unsigned 16-bit integers in `a`
/// to packed 32-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu16_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu16_epi32(a: __m128i) -> __m128i {
    let a = a.as_u16x8();
    let a = simd_shuffle4::<_, u16x4>(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}

/// Zero extend packed unsigned 16-bit integers in `a`
/// to packed 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu16_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxwq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu16_epi64(a: __m128i) -> __m128i {
    let a = a.as_u16x8();
    let a = simd_shuffle2::<_, u16x2>(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Zero extend packed unsigned 32-bit integers in `a`
/// to packed 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu32_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu32_epi64(a: __m128i) -> __m128i {
    let a = a.as_u32x4();
    let a = simd_shuffle2::<_, u32x2>(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Returns the dot product of two `__m128d` vectors.
///
/// `imm8[1:0]` is the broadcast mask, and `imm8[5:4]` is the condition mask.
/// If a condition mask bit is zero, the corresponding multiplication is
/// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of
/// the dot product will be stored in the return value component. Otherwise if
/// the broadcast mask bit is zero then the return component will be zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(dppd, imm8 = 0))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_dp_pd(a: __m128d, b: __m128d, imm8: i32) -> __m128d {
    macro_rules! call {
        ($imm8:expr) => {
            dppd(a, b, $imm8)
        };
    }
    constify_imm8!(imm8, call)
}

/// Returns the dot product of two `__m128` vectors.
///
/// `imm8[3:0]` is the broadcast mask, and `imm8[7:4]` is the condition mask.
/// If a condition mask bit is zero, the corresponding multiplication is
/// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of
/// the dot product will be stored in the return value component. Otherwise if
/// the broadcast mask bit is zero then the return component will be zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps)
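///
/// A brief sketch of the two masks (an editorial addition; the arithmetic was
/// worked out by hand and SSE4.1 is assumed):
///
/// ```ignore
/// use std::arch::x86_64::*;
/// unsafe {
///     let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
///     let b = _mm_setr_ps(4.0, 3.0, 2.0, 1.0);
///     // Condition mask 0b0111: multiply lanes 0..3 (lane 3 contributes 0.0).
///     // Broadcast mask 0b0001: store the sum (4 + 6 + 6 = 16) only in lane 0.
///     let r = _mm_dp_ps(a, b, 0b0111_0001);
///     // r = [16.0, 0.0, 0.0, 0.0]
/// }
/// ```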
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(dpps, imm8 = 0))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_dp_ps(a: __m128, b: __m128, imm8: i32) -> __m128 {
    macro_rules! call {
        ($imm8:expr) => {
            dpps(a, b, $imm8)
        };
    }
    constify_imm8!(imm8, call)
}

/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// down to an integer value, and stores the results as packed double-precision
/// floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_pd(a: __m128d) -> __m128d {
    roundpd(a, _MM_FROUND_FLOOR)
}

/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// down to an integer value, and stores the results as packed single-precision
/// floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_ps(a: __m128) -> __m128 {
    roundps(a, _MM_FROUND_FLOOR)
}

/// Round the lower double-precision (64-bit) floating-point element in `b`
/// down to an integer value, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element of the intrinsic
/// result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_sd(a: __m128d, b: __m128d) -> __m128d {
    roundsd(a, b, _MM_FROUND_FLOOR)
}

/// Round the lower single-precision (32-bit) floating-point element in `b`
/// down to an integer value, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_ss(a: __m128, b: __m128) -> __m128 {
    roundss(a, b, _MM_FROUND_FLOOR)
}

/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// up to an integer value, and stores the results as packed double-precision
/// floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_pd(a: __m128d) -> __m128d {
    roundpd(a, _MM_FROUND_CEIL)
}

/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// up to an integer value, and stores the results as packed single-precision
/// floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_ps(a: __m128) -> __m128 {
    roundps(a, _MM_FROUND_CEIL)
}

/// Round the lower double-precision (64-bit) floating-point element in `b`
/// up to an integer value, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element
/// of the intrinsic result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_sd(a: __m128d, b: __m128d) -> __m128d {
    roundsd(a, b, _MM_FROUND_CEIL)
}

/// Round the lower single-precision (32-bit) floating-point element in `b`
/// up to an integer value, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_ss(a: __m128, b: __m128) -> __m128 {
    roundss(a, b, _MM_FROUND_CEIL)
}

/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// using the `rounding` parameter, and stores the results as packed
/// double-precision floating-point elements.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd, rounding = 0))]
#[rustc_args_required_const(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_pd(a: __m128d, rounding: i32) -> __m128d {
    macro_rules! call {
        ($imm4:expr) => {
            roundpd(a, $imm4)
        };
    }
    constify_imm4!(rounding, call)
}

/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// using the `rounding` parameter, and stores the results as packed
/// single-precision floating-point elements.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps, rounding = 0))]
#[rustc_args_required_const(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_ps(a: __m128, rounding: i32) -> __m128 {
    macro_rules! call {
        ($imm4:expr) => {
            roundps(a, $imm4)
        };
    }
    constify_imm4!(rounding, call)
}

/// Round the lower double-precision (64-bit) floating-point element in `b`
/// using the `rounding` parameter, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element of the intrinsic
/// result.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd, rounding = 0))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d {
    macro_rules! call {
        ($imm4:expr) => {
            roundsd(a, b, $imm4)
        };
    }
    constify_imm4!(rounding, call)
}

/// Round the lower single-precision (32-bit) floating-point element in `b`
/// using the `rounding` parameter, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss, rounding = 0))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 {
    macro_rules! call {
        ($imm4:expr) => {
            roundss(a, b, $imm4)
        };
    }
    constify_imm4!(rounding, call)
}

/// Finds the minimum unsigned 16-bit element in the 128-bit `__m128i` vector,
/// returning a vector containing its value in its first position, and its
/// index in its second position; all other elements are set to zero.
///
/// This intrinsic corresponds to the `VPHMINPOSUW` / `PHMINPOSUW`
/// instruction.
///
/// Arguments:
///
/// * `a` - A 128-bit vector of type `__m128i`.
///
/// Returns:
///
/// A 128-bit value where:
///
/// * bits `[15:0]` - contain the minimum value found in parameter `a`,
/// * bits `[18:16]` - contain the index of the minimum value
/// * remaining bits are set to `0`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16)
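///
/// A brief usage sketch (an editorial addition with hand-picked values;
/// SSE4.1 assumed):
///
/// ```ignore
/// use std::arch::x86_64::*;
/// unsafe {
///     let a = _mm_setr_epi16(23, 18, 44, 97, 50, 13, 67, 66);
///     let r = _mm_minpos_epu16(a);
///     // r = [13, 5, 0, 0, 0, 0, 0, 0]: minimum value 13, found at index 5
/// }
/// ```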
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(phminposuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_minpos_epu16(a: __m128i) -> __m128i {
    transmute(phminposuw(a.as_u16x8()))
}

/// Multiplies the low 32-bit integers from each packed 64-bit
/// element in `a` and `b`, and returns the signed 64-bit result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmuldq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mul_epi32(a: __m128i, b: __m128i) -> __m128i {
    transmute(pmuldq(a.as_i32x4(), b.as_i32x4()))
}

/// Multiplies the packed 32-bit integers in `a` and `b`, producing
/// intermediate 64-bit integers, and returns the low 32 bits of each
/// intermediate result, reinterpreted as a signed integer. While
/// `pmulld __m128i::splat(2), __m128i::splat(2)` returns the obvious
/// `__m128i::splat(4)`, due to wrapping arithmetic `pmulld
/// __m128i::splat(i32::MAX), __m128i::splat(2)` would return a negative number.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mullo_epi32)
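///
/// A brief sketch of the wrapping behavior described above (an editorial
/// addition; SSE4.1 assumed):
///
/// ```ignore
/// use std::arch::x86_64::*;
/// unsafe {
///     let a = _mm_set1_epi32(i32::max_value());
///     let b = _mm_set1_epi32(2);
///     // 2 * i32::MAX overflows; only the low 32 bits are kept.
///     let r = _mm_mullo_epi32(a, b); // each lane is -2
/// }
/// ```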
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmulld))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mullo_epi32(a: __m128i, b: __m128i) -> __m128i {
    transmute(simd_mul(a.as_i32x4(), b.as_i32x4()))
}

/// Subtracts 8-bit unsigned integer values and computes the absolute
/// values of the differences. The sums of those absolute differences are
/// then returned according to the bit fields in the immediate operand.
///
/// The following algorithm is performed:
///
/// ```ignore
/// i = imm8[2] * 4
/// j = imm8[1:0] * 4
/// for k := 0 to 7
///     d0 = abs(a[i + k + 0] - b[j + 0])
///     d1 = abs(a[i + k + 1] - b[j + 1])
///     d2 = abs(a[i + k + 2] - b[j + 2])
///     d3 = abs(a[i + k + 3] - b[j + 3])
///     r[k] = d0 + d1 + d2 + d3
/// ```
///
/// Arguments:
///
/// * `a` - A 128-bit vector of type `__m128i`.
/// * `b` - A 128-bit vector of type `__m128i`.
/// * `imm8` - An 8-bit immediate operand specifying how the absolute
///   differences are to be calculated
///     * Bit `[2]` specifies the offset for operand `a`
///     * Bits `[1:0]` specify the offset for operand `b`
///
/// Returns:
///
/// * A `__m128i` vector containing the sums of the sets of absolute
///   differences between both operands.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mpsadbw_epu8)
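///
/// A brief sketch tracing the algorithm above (an editorial addition; the
/// expected values were worked out by hand from the pseudocode, SSE4.1
/// assumed):
///
/// ```ignore
/// use std::arch::x86_64::*;
/// unsafe {
///     let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7,
///                           8, 9, 10, 11, 12, 13, 14, 15);
///     // With imm8 = 0 both offsets are 0, so
///     // r[k] = sum of |a[k + l] - a[l]| for l in 0..4, i.e. 4 * k.
///     let r = _mm_mpsadbw_epu8(a, a, 0);
///     // r = [0, 4, 8, 12, 16, 20, 24, 28]
/// }
/// ```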
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(mpsadbw, imm8 = 0))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mpsadbw_epu8(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
    let a = a.as_u8x16();
    let b = b.as_u8x16();
    macro_rules! call {
        ($imm8:expr) => {
            mpsadbw(a, b, $imm8)
        };
    }
    transmute(constify_imm3!(imm8, call))
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// zeros.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
/// operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all zeros,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128)
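///
/// A brief usage sketch (an editorial addition; SSE4.1 assumed):
///
/// ```ignore
/// use std::arch::x86_64::*;
/// unsafe {
///     let a = _mm_set1_epi32(0b0101);
///     let mask = _mm_set1_epi32(0b1010);
///     // `a & mask` is all zeros, so the result is 1.
///     assert_eq!(_mm_testz_si128(a, mask), 1);
/// }
/// ```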
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 {
    ptestz(a.as_i64x2(), mask.as_i64x2())
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
/// operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_testc_si128(a: __m128i, mask: __m128i) -> i32 {
    ptestc(a.as_i64x2(), mask.as_i64x2())
}

/// Tests whether the specified bits in a 128-bit integer vector are
/// neither all zeros nor all ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
/// operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are neither all zeros nor all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testnzc_si128)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_testnzc_si128(a: __m128i, mask: __m128i) -> i32 {
    ptestnzc(a.as_i64x2(), mask.as_i64x2())
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// zeros.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
/// operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all zeros,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_test_all_zeros(a: __m128i, mask: __m128i) -> i32 {
    _mm_testz_si128(a, mask)
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// ones.
///
/// Argument:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
///
/// Returns:
///
/// * `1` - if the bits specified in the operand are all set to 1,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones)
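///
/// A brief usage sketch (an editorial addition; SSE4.1 assumed):
///
/// ```ignore
/// use std::arch::x86_64::*;
/// unsafe {
///     assert_eq!(_mm_test_all_ones(_mm_set1_epi32(-1)), 1);
///     assert_eq!(_mm_test_all_ones(_mm_setr_epi32(-1, 0, -1, -1)), 0);
/// }
/// ```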
1101 #[inline]
1102 #[target_feature(enable = "sse4.1")]
1103 #[cfg_attr(test, assert_instr(pcmpeqd))]
1104 #[cfg_attr(test, assert_instr(ptest))]
1105 #[stable(feature = "simd_x86", since = "1.27.0")]
1106 pub unsafe fn _mm_test_all_ones(a: __m128i) -> i32 {
1107 _mm_testc_si128(a, _mm_cmpeq_epi32(a, a))
1108 }
1109
1110 /// Tests whether the specified bits in a 128-bit integer vector are
1111 /// neither all zeros nor all ones.
1112 ///
1113 /// Arguments:
1114 ///
1115 /// * `a` - A 128-bit integer vector containing the bits to be tested.
1116 /// * `mask` - A 128-bit integer vector selecting which bits to test in
1117 /// operand `a`.
1118 ///
1119 /// Returns:
1120 ///
1121 /// * `1` - if the specified bits are neither all zeros nor all ones,
1122 /// * `0` - otherwise.
1123 ///
1124 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_mix_ones_zeros)
1125 #[inline]
1126 #[target_feature(enable = "sse4.1")]
1127 #[cfg_attr(test, assert_instr(ptest))]
1128 #[stable(feature = "simd_x86", since = "1.27.0")]
1129 pub unsafe fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 {
1130 _mm_testnzc_si128(a, mask)
1131 }
1132
1133 #[allow(improper_ctypes)]
1134 extern "C" {
1135 #[link_name = "llvm.x86.sse41.pblendvb"]
1136 fn pblendvb(a: i8x16, b: i8x16, mask: i8x16) -> i8x16;
1137 #[link_name = "llvm.x86.sse41.blendvpd"]
1138 fn blendvpd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d;
1139 #[link_name = "llvm.x86.sse41.blendvps"]
1140 fn blendvps(a: __m128, b: __m128, mask: __m128) -> __m128;
1141 #[link_name = "llvm.x86.sse41.blendpd"]
1142 fn blendpd(a: __m128d, b: __m128d, imm2: u8) -> __m128d;
1143 #[link_name = "llvm.x86.sse41.blendps"]
1144 fn blendps(a: __m128, b: __m128, imm4: u8) -> __m128;
1145 #[link_name = "llvm.x86.sse41.pblendw"]
1146 fn pblendw(a: i16x8, b: i16x8, imm8: u8) -> i16x8;
1147 #[link_name = "llvm.x86.sse41.insertps"]
1148 fn insertps(a: __m128, b: __m128, imm8: u8) -> __m128;
1149 #[link_name = "llvm.x86.sse41.pmaxsb"]
1150 fn pmaxsb(a: i8x16, b: i8x16) -> i8x16;
1151 #[link_name = "llvm.x86.sse41.pmaxuw"]
1152 fn pmaxuw(a: u16x8, b: u16x8) -> u16x8;
1153 #[link_name = "llvm.x86.sse41.pmaxsd"]
1154 fn pmaxsd(a: i32x4, b: i32x4) -> i32x4;
1155 #[link_name = "llvm.x86.sse41.pmaxud"]
1156 fn pmaxud(a: u32x4, b: u32x4) -> u32x4;
1157 #[link_name = "llvm.x86.sse41.pminsb"]
1158 fn pminsb(a: i8x16, b: i8x16) -> i8x16;
1159 #[link_name = "llvm.x86.sse41.pminuw"]
1160 fn pminuw(a: u16x8, b: u16x8) -> u16x8;
1161 #[link_name = "llvm.x86.sse41.pminsd"]
1162 fn pminsd(a: i32x4, b: i32x4) -> i32x4;
1163 #[link_name = "llvm.x86.sse41.pminud"]
1164 fn pminud(a: u32x4, b: u32x4) -> u32x4;
1165 #[link_name = "llvm.x86.sse41.packusdw"]
1166 fn packusdw(a: i32x4, b: i32x4) -> u16x8;
1167 #[link_name = "llvm.x86.sse41.dppd"]
1168 fn dppd(a: __m128d, b: __m128d, imm8: u8) -> __m128d;
1169 #[link_name = "llvm.x86.sse41.dpps"]
1170 fn dpps(a: __m128, b: __m128, imm8: u8) -> __m128;
1171 #[link_name = "llvm.x86.sse41.round.pd"]
1172 fn roundpd(a: __m128d, rounding: i32) -> __m128d;
1173 #[link_name = "llvm.x86.sse41.round.ps"]
1174 fn roundps(a: __m128, rounding: i32) -> __m128;
1175 #[link_name = "llvm.x86.sse41.round.sd"]
1176 fn roundsd(a: __m128d, b: __m128d, rounding: i32) -> __m128d;
1177 #[link_name = "llvm.x86.sse41.round.ss"]
1178 fn roundss(a: __m128, b: __m128, rounding: i32) -> __m128;
1179 #[link_name = "llvm.x86.sse41.phminposuw"]
1180 fn phminposuw(a: u16x8) -> u16x8;
1181 #[link_name = "llvm.x86.sse41.pmuldq"]
1182 fn pmuldq(a: i32x4, b: i32x4) -> i64x2;
1183 #[link_name = "llvm.x86.sse41.mpsadbw"]
1184 fn mpsadbw(a: u8x16, b: u8x16, imm8: u8) -> u16x8;
1185 #[link_name = "llvm.x86.sse41.ptestz"]
1186 fn ptestz(a: i64x2, mask: i64x2) -> i32;
1187 #[link_name = "llvm.x86.sse41.ptestc"]
1188 fn ptestc(a: i64x2, mask: i64x2) -> i32;
1189 #[link_name = "llvm.x86.sse41.ptestnzc"]
1190 fn ptestnzc(a: i64x2, mask: i64x2) -> i32;
1191 }
1192
1193 #[cfg(test)]
1194 mod tests {
1195 use crate::core_arch::x86::*;
1196 use std::mem;
1197 use stdsimd_test::simd_test;
1198
1199 #[simd_test(enable = "sse4.1")]
1200 unsafe fn test_mm_blendv_epi8() {
1201 #[rustfmt::skip]
1202 let a = _mm_setr_epi8(
1203 0, 1, 2, 3, 4, 5, 6, 7,
1204 8, 9, 10, 11, 12, 13, 14, 15,
1205 );
1206 #[rustfmt::skip]
1207 let b = _mm_setr_epi8(
1208 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
1209 );
1210 #[rustfmt::skip]
1211 let mask = _mm_setr_epi8(
1212 0, -1, 0, -1, 0, -1, 0, -1,
1213 0, -1, 0, -1, 0, -1, 0, -1,
1214 );
1215 #[rustfmt::skip]
1216 let e = _mm_setr_epi8(
1217 0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31,
1218 );
1219 assert_eq_m128i(_mm_blendv_epi8(a, b, mask), e);
1220 }
1221
1222 #[simd_test(enable = "sse4.1")]
1223 unsafe fn test_mm_blendv_pd() {
1224 let a = _mm_set1_pd(0.0);
1225 let b = _mm_set1_pd(1.0);
1226 let mask = transmute(_mm_setr_epi64x(0, -1));
1227 let r = _mm_blendv_pd(a, b, mask);
1228 let e = _mm_setr_pd(0.0, 1.0);
1229 assert_eq_m128d(r, e);
1230 }
1231
1232 #[simd_test(enable = "sse4.1")]
1233 unsafe fn test_mm_blendv_ps() {
1234 let a = _mm_set1_ps(0.0);
1235 let b = _mm_set1_ps(1.0);
1236 let mask = transmute(_mm_setr_epi32(0, -1, 0, -1));
1237 let r = _mm_blendv_ps(a, b, mask);
1238 let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0);
1239 assert_eq_m128(r, e);
1240 }
1241
1242 #[simd_test(enable = "sse4.1")]
1243 unsafe fn test_mm_blend_pd() {
1244 let a = _mm_set1_pd(0.0);
1245 let b = _mm_set1_pd(1.0);
1246 let r = _mm_blend_pd(a, b, 0b10);
1247 let e = _mm_setr_pd(0.0, 1.0);
1248 assert_eq_m128d(r, e);
1249 }
1250
1251 #[simd_test(enable = "sse4.1")]
1252 unsafe fn test_mm_blend_ps() {
1253 let a = _mm_set1_ps(0.0);
1254 let b = _mm_set1_ps(1.0);
1255 let r = _mm_blend_ps(a, b, 0b1010);
1256 let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0);
1257 assert_eq_m128(r, e);
1258 }
1259
1260 #[simd_test(enable = "sse4.1")]
1261 unsafe fn test_mm_blend_epi16() {
1262 let a = _mm_set1_epi16(0);
1263 let b = _mm_set1_epi16(1);
1264 let r = _mm_blend_epi16(a, b, 0b1010_1100);
1265 let e = _mm_setr_epi16(0, 0, 1, 1, 0, 1, 0, 1);
1266 assert_eq_m128i(r, e);
1267 }
1268
1269 #[simd_test(enable = "sse4.1")]
1270 unsafe fn test_mm_extract_ps() {
1271 let a = _mm_setr_ps(0.0, 1.0, 2.0, 3.0);
1272 let r: f32 = transmute(_mm_extract_ps(a, 1));
1273 assert_eq!(r, 1.0);
1274 let r: f32 = transmute(_mm_extract_ps(a, 5));
1275 assert_eq!(r, 1.0);
1276 }
1277
1278 #[simd_test(enable = "sse4.1")]
1279 unsafe fn test_mm_extract_epi8() {
1280 #[rustfmt::skip]
1281 let a = _mm_setr_epi8(
1282 -1, 1, 2, 3, 4, 5, 6, 7,
1283 8, 9, 10, 11, 12, 13, 14, 15
1284 );
1285 let r1 = _mm_extract_epi8(a, 0);
1286 let r2 = _mm_extract_epi8(a, 19);
1287 assert_eq!(r1, 0xFF);
1288 assert_eq!(r2, 3);
1289 }
1290
1291 #[simd_test(enable = "sse4.1")]
1292 unsafe fn test_mm_extract_epi32() {
1293 let a = _mm_setr_epi32(0, 1, 2, 3);
1294 let r = _mm_extract_epi32(a, 1);
1295 assert_eq!(r, 1);
1296 let r = _mm_extract_epi32(a, 5);
1297 assert_eq!(r, 1);
1298 }
1299
1300 #[simd_test(enable = "sse4.1")]
1301 unsafe fn test_mm_insert_ps() {
1302 let a = _mm_set1_ps(1.0);
1303 let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
1304 let r = _mm_insert_ps(a, b, 0b11_00_1100);
1305 let e = _mm_setr_ps(4.0, 1.0, 0.0, 0.0);
1306 assert_eq_m128(r, e);
1307 }
1308
1309 #[simd_test(enable = "sse4.1")]
1310 unsafe fn test_mm_insert_epi8() {
1311 let a = _mm_set1_epi8(0);
1312 let e = _mm_setr_epi8(0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
1313 let r = _mm_insert_epi8(a, 32, 1);
1314 assert_eq_m128i(r, e);
1315 let r = _mm_insert_epi8(a, 32, 17);
1316 assert_eq_m128i(r, e);
1317 }
1318
1319 #[simd_test(enable = "sse4.1")]
1320 unsafe fn test_mm_insert_epi32() {
1321 let a = _mm_set1_epi32(0);
1322 let e = _mm_setr_epi32(0, 32, 0, 0);
1323 let r = _mm_insert_epi32(a, 32, 1);
1324 assert_eq_m128i(r, e);
1325 let r = _mm_insert_epi32(a, 32, 5);
1326 assert_eq_m128i(r, e);
1327 }
1328
1329 #[simd_test(enable = "sse4.1")]
1330 unsafe fn test_mm_max_epi8() {
1331 #[rustfmt::skip]
1332 let a = _mm_setr_epi8(
1333 1, 4, 5, 8, 9, 12, 13, 16,
1334 17, 20, 21, 24, 25, 28, 29, 32,
1335 );
1336 #[rustfmt::skip]
1337 let b = _mm_setr_epi8(
1338 2, 3, 6, 7, 10, 11, 14, 15,
1339 18, 19, 22, 23, 26, 27, 30, 31,
1340 );
1341 let r = _mm_max_epi8(a, b);
1342 #[rustfmt::skip]
1343 let e = _mm_setr_epi8(
1344 2, 4, 6, 8, 10, 12, 14, 16,
1345 18, 20, 22, 24, 26, 28, 30, 32,
1346 );
1347 assert_eq_m128i(r, e);
1348 }
1349
1350 #[simd_test(enable = "sse4.1")]
1351 unsafe fn test_mm_max_epu16() {
1352 let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16);
1353 let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15);
1354 let r = _mm_max_epu16(a, b);
1355 let e = _mm_setr_epi16(2, 4, 6, 8, 10, 12, 14, 16);
1356 assert_eq_m128i(r, e);
1357 }
1358
1359 #[simd_test(enable = "sse4.1")]
1360 unsafe fn test_mm_max_epi32() {
1361 let a = _mm_setr_epi32(1, 4, 5, 8);
1362 let b = _mm_setr_epi32(2, 3, 6, 7);
1363 let r = _mm_max_epi32(a, b);
1364 let e = _mm_setr_epi32(2, 4, 6, 8);
1365 assert_eq_m128i(r, e);
1366 }
1367
1368 #[simd_test(enable = "sse4.1")]
1369 unsafe fn test_mm_max_epu32() {
1370 let a = _mm_setr_epi32(1, 4, 5, 8);
1371 let b = _mm_setr_epi32(2, 3, 6, 7);
1372 let r = _mm_max_epu32(a, b);
1373 let e = _mm_setr_epi32(2, 4, 6, 8);
1374 assert_eq_m128i(r, e);
1375 }
1376
1377 #[simd_test(enable = "sse4.1")]
1378 unsafe fn test_mm_min_epi8_1() {
1379 #[rustfmt::skip]
1380 let a = _mm_setr_epi8(
1381 1, 4, 5, 8, 9, 12, 13, 16,
1382 17, 20, 21, 24, 25, 28, 29, 32,
1383 );
1384 #[rustfmt::skip]
1385 let b = _mm_setr_epi8(
1386 2, 3, 6, 7, 10, 11, 14, 15,
1387 18, 19, 22, 23, 26, 27, 30, 31,
1388 );
1389 let r = _mm_min_epi8(a, b);
1390 #[rustfmt::skip]
1391 let e = _mm_setr_epi8(
1392 1, 3, 5, 7, 9, 11, 13, 15,
1393 17, 19, 21, 23, 25, 27, 29, 31,
1394 );
1395 assert_eq_m128i(r, e);
1396 }
1397
1398 #[simd_test(enable = "sse4.1")]
1399 unsafe fn test_mm_min_epi8_2() {
1400 #[rustfmt::skip]
1401 let a = _mm_setr_epi8(
1402 1, -4, -5, 8, -9, -12, 13, -16,
1403 17, 20, 21, 24, 25, 28, 29, 32,
1404 );
1405 #[rustfmt::skip]
1406 let b = _mm_setr_epi8(
1407 2, -3, -6, 7, -10, -11, 14, -15,
1408 18, 19, 22, 23, 26, 27, 30, 31,
1409 );
1410 let r = _mm_min_epi8(a, b);
1411 #[rustfmt::skip]
1412 let e = _mm_setr_epi8(
1413 1, -4, -6, 7, -10, -12, 13, -16,
1414 17, 19, 21, 23, 25, 27, 29, 31,
1415 );
1416 assert_eq_m128i(r, e);
1417 }
1418
1419 #[simd_test(enable = "sse4.1")]
1420 unsafe fn test_mm_min_epu16() {
1421 let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16);
1422 let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15);
1423 let r = _mm_min_epu16(a, b);
1424 let e = _mm_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15);
1425 assert_eq_m128i(r, e);
1426 }
1427
1428 #[simd_test(enable = "sse4.1")]
1429 unsafe fn test_mm_min_epi32_1() {
1430 let a = _mm_setr_epi32(1, 4, 5, 8);
1431 let b = _mm_setr_epi32(2, 3, 6, 7);
1432 let r = _mm_min_epi32(a, b);
1433 let e = _mm_setr_epi32(1, 3, 5, 7);
1434 assert_eq_m128i(r, e);
1435 }
1436
1437 #[simd_test(enable = "sse4.1")]
1438 unsafe fn test_mm_min_epi32_2() {
1439 let a = _mm_setr_epi32(-1, 4, 5, -7);
1440 let b = _mm_setr_epi32(-2, 3, -6, 8);
1441 let r = _mm_min_epi32(a, b);
1442 let e = _mm_setr_epi32(-2, 3, -6, -7);
1443 assert_eq_m128i(r, e);
1444 }
1445
1446 #[simd_test(enable = "sse4.1")]
1447 unsafe fn test_mm_min_epu32() {
1448 let a = _mm_setr_epi32(1, 4, 5, 8);
1449 let b = _mm_setr_epi32(2, 3, 6, 7);
1450 let r = _mm_min_epu32(a, b);
1451 let e = _mm_setr_epi32(1, 3, 5, 7);
1452 assert_eq_m128i(r, e);
1453 }
1454
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_packus_epi32() {
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let b = _mm_setr_epi32(-1, -2, -3, -4);
        let r = _mm_packus_epi32(a, b);
        let e = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

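    // `_mm_cmpeq_epi64` compares 64-bit lanes for equality, producing
    // all-ones (-1 when read as a signed integer) for equal lanes and 0
    // otherwise.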
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cmpeq_epi64() {
        let a = _mm_setr_epi64x(0, 1);
        let b = _mm_setr_epi64x(0, 0);
        let r = _mm_cmpeq_epi64(a, b);
        let e = _mm_setr_epi64x(-1, 0);
        assert_eq_m128i(r, e);
    }

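    // The `_mm_cvtepi*` widening conversions sign-extend the lowest
    // lanes of the source, so each test below checks a positive and a
    // negative input.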
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi8_epi16() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepi8_epi16(a);
        let e = _mm_set1_epi16(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi8(-10);
        let r = _mm_cvtepi8_epi16(a);
        let e = _mm_set1_epi16(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi8_epi32() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepi8_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi8(-10);
        let r = _mm_cvtepi8_epi32(a);
        let e = _mm_set1_epi32(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi8_epi64() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepi8_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi8(-10);
        let r = _mm_cvtepi8_epi64(a);
        let e = _mm_set1_epi64x(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi16_epi32() {
        let a = _mm_set1_epi16(10);
        let r = _mm_cvtepi16_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi16(-10);
        let r = _mm_cvtepi16_epi32(a);
        let e = _mm_set1_epi32(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi16_epi64() {
        let a = _mm_set1_epi16(10);
        let r = _mm_cvtepi16_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi16(-10);
        let r = _mm_cvtepi16_epi64(a);
        let e = _mm_set1_epi64x(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi32_epi64() {
        let a = _mm_set1_epi32(10);
        let r = _mm_cvtepi32_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi32(-10);
        let r = _mm_cvtepi32_epi64(a);
        let e = _mm_set1_epi64x(-10);
        assert_eq_m128i(r, e);
    }

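    // The `_mm_cvtepu*` conversions zero-extend instead, treating the
    // source lanes as unsigned, so a single non-negative case suffices
    // for each.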
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu8_epi16() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepu8_epi16(a);
        let e = _mm_set1_epi16(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu8_epi32() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepu8_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu8_epi64() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepu8_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu16_epi32() {
        let a = _mm_set1_epi16(10);
        let r = _mm_cvtepu16_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu16_epi64() {
        let a = _mm_set1_epi16(10);
        let r = _mm_cvtepu16_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu32_epi64() {
        let a = _mm_set1_epi32(10);
        let r = _mm_cvtepu32_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
    }

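    // `_mm_dp_pd`/`_mm_dp_ps` use the immediate as two masks: the high
    // nibble selects which lane products enter the sum, and the low
    // nibble selects which result lanes receive the sum (the rest are
    // zeroed). E.g. for `_mm_dp_pd(a, b, 0b00110001)` below:
    //   sum = a[0]*b[0] + a[1]*b[1] = 2.0*1.0 + 3.0*4.0 = 14.0,
    // stored only in lane 0.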
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_dp_pd() {
        let a = _mm_setr_pd(2.0, 3.0);
        let b = _mm_setr_pd(1.0, 4.0);
        let e = _mm_setr_pd(14.0, 0.0);
        assert_eq_m128d(_mm_dp_pd(a, b, 0b00110001), e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_dp_ps() {
        let a = _mm_setr_ps(2.0, 3.0, 1.0, 10.0);
        let b = _mm_setr_ps(1.0, 4.0, 0.5, 10.0);
        let e = _mm_setr_ps(14.5, 0.0, 14.5, 0.0);
        assert_eq_m128(_mm_dp_ps(a, b, 0b01110101), e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_floor_pd() {
        let a = _mm_setr_pd(2.5, 4.5);
        let r = _mm_floor_pd(a);
        let e = _mm_setr_pd(2.0, 4.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_floor_ps() {
        let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5);
        let r = _mm_floor_ps(a);
        let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0);
        assert_eq_m128(r, e);
    }

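    // The `_sd`/`_ss` scalar variants that follow round only `b[0]`,
    // copying the remaining lanes from `a`; the same pattern applies to
    // the `_mm_ceil_*` and `_mm_round_*` scalar tests further down.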
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_floor_sd() {
        let a = _mm_setr_pd(2.5, 4.5);
        let b = _mm_setr_pd(-1.5, -3.5);
        let r = _mm_floor_sd(a, b);
        let e = _mm_setr_pd(-2.0, 4.5);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_floor_ss() {
        let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5);
        let b = _mm_setr_ps(-1.5, -3.5, -7.5, -15.5);
        let r = _mm_floor_ss(a, b);
        let e = _mm_setr_ps(-2.0, 4.5, 8.5, 16.5);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_ceil_pd() {
        let a = _mm_setr_pd(1.5, 3.5);
        let r = _mm_ceil_pd(a);
        let e = _mm_setr_pd(2.0, 4.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_ceil_ps() {
        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
        let r = _mm_ceil_ps(a);
        let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_ceil_sd() {
        let a = _mm_setr_pd(1.5, 3.5);
        let b = _mm_setr_pd(-2.5, -4.5);
        let r = _mm_ceil_sd(a, b);
        let e = _mm_setr_pd(-2.0, 3.5);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_ceil_ss() {
        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
        let b = _mm_setr_ps(-2.5, -4.5, -8.5, -16.5);
        let r = _mm_ceil_ss(a, b);
        let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5);
        assert_eq_m128(r, e);
    }

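    // `_mm_round_pd`/`_mm_round_ps` take the rounding mode from the
    // immediate (`_MM_FROUND_TO_NEAREST_INT`, `_MM_FROUND_TO_ZERO`,
    // ...), independent of the current MXCSR setting.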
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_round_pd() {
        let a = _mm_setr_pd(1.25, 3.75);
        let r = _mm_round_pd(a, _MM_FROUND_TO_NEAREST_INT);
        let e = _mm_setr_pd(1.0, 4.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_round_ps() {
        let a = _mm_setr_ps(2.25, 4.75, -1.75, -4.25);
        let r = _mm_round_ps(a, _MM_FROUND_TO_ZERO);
        let e = _mm_setr_ps(2.0, 4.0, -1.0, -4.0);
        assert_eq_m128(r, e);
    }

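    // With `_MM_FROUND_CUR_DIRECTION` the rounding mode comes from
    // MXCSR.RC instead, so the next two tests set it explicitly via
    // `_MM_SET_ROUNDING_MODE` and restore the previous mode afterwards
    // to avoid leaking state into other tests.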
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_round_sd() {
        let a = _mm_setr_pd(1.5, 3.5);
        let b = _mm_setr_pd(-2.5, -4.5);
        let old_mode = _MM_GET_ROUNDING_MODE();
        _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
        let r = _mm_round_sd(a, b, _MM_FROUND_CUR_DIRECTION);
        _MM_SET_ROUNDING_MODE(old_mode);
        let e = _mm_setr_pd(-2.0, 3.5);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_round_ss() {
        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
        let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5);
        let old_mode = _MM_GET_ROUNDING_MODE();
        _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
        let r = _mm_round_ss(a, b, _MM_FROUND_CUR_DIRECTION);
        _MM_SET_ROUNDING_MODE(old_mode);
        let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5);
        assert_eq_m128(r, e);
    }

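    // `_mm_minpos_epu16` returns the minimum unsigned 16-bit lane in
    // result lane 0 and the index of its first occurrence in lane 1;
    // the remaining lanes are zeroed.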
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_minpos_epu16_1() {
        let a = _mm_setr_epi16(23, 18, 44, 97, 50, 13, 67, 66);
        let r = _mm_minpos_epu16(a);
        let e = _mm_setr_epi16(13, 5, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_minpos_epu16_2() {
        let a = _mm_setr_epi16(0, 18, 44, 97, 50, 13, 67, 66);
        let r = _mm_minpos_epu16(a);
        let e = _mm_setr_epi16(0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

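    // `_mm_mul_epi32` is a widening multiply: it multiplies only the
    // even-indexed 32-bit lanes (0 and 2) and produces two full 64-bit
    // products, whereas `_mm_mullo_epi32` multiplies all four lanes and
    // keeps only the low 32 bits of each product.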
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_mul_epi32() {
        {
            let a = _mm_setr_epi32(1, 1, 1, 1);
            let b = _mm_setr_epi32(1, 2, 3, 4);
            let r = _mm_mul_epi32(a, b);
            let e = _mm_setr_epi64x(1, 3);
            assert_eq_m128i(r, e);
        }
        {
            let a = _mm_setr_epi32(15, 2 /* ignored */, 1234567, 4 /* ignored */);
            let b = _mm_setr_epi32(
                -20, -256 /* ignored */,
                666666, 666666 /* ignored */,
            );
            let r = _mm_mul_epi32(a, b);
            let e = _mm_setr_epi64x(-300, 823043843622);
            assert_eq_m128i(r, e);
        }
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_mullo_epi32() {
        {
            let a = _mm_setr_epi32(1, 1, 1, 1);
            let b = _mm_setr_epi32(1, 2, 3, 4);
            let r = _mm_mullo_epi32(a, b);
            let e = _mm_setr_epi32(1, 2, 3, 4);
            assert_eq_m128i(r, e);
        }
        {
            let a = _mm_setr_epi32(15, -2, 1234567, 99999);
            let b = _mm_setr_epi32(-20, -256, 666666, -99999);
            let r = _mm_mullo_epi32(a, b);
            // Note: the full product 1234567 * 666666 = 823043843622 does
            // not fit in 32 bits; its low 32 bits have the most significant
            // bit set, so r[2] reads back as the negative value -1589877210.
            let e = _mm_setr_epi32(-300, 512, -1589877210, -1409865409);
            assert_eq_m128i(r, e);
        }
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_minpos_epu16() {
        let a = _mm_setr_epi16(8, 7, 6, 5, 4, 1, 2, 3);
        let r = _mm_minpos_epu16(a);
        let e = _mm_setr_epi16(1, 5, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

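    // `_mm_mpsadbw_epu8(a, b, imm8)` computes eight 16-bit sums of
    // absolute differences. Bits 1:0 of `imm8` pick a 4-byte block of
    // `b` (offset 4 * (imm8 & 3)) and bit 2 picks the starting offset
    // in `a` (offset 4 * ((imm8 >> 2) & 1)); result lane k is
    //   sum over j in 0..4 of |a[aoff + k + j] - b[boff + j]|.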
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_mpsadbw_epu8() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );

        let r = _mm_mpsadbw_epu8(a, a, 0b000);
        let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28);
        assert_eq_m128i(r, e);

        let r = _mm_mpsadbw_epu8(a, a, 0b001);
        let e = _mm_setr_epi16(16, 12, 8, 4, 0, 4, 8, 12);
        assert_eq_m128i(r, e);

        let r = _mm_mpsadbw_epu8(a, a, 0b100);
        let e = _mm_setr_epi16(16, 20, 24, 28, 32, 36, 40, 44);
        assert_eq_m128i(r, e);

        let r = _mm_mpsadbw_epu8(a, a, 0b101);
        let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28);
        assert_eq_m128i(r, e);

        let r = _mm_mpsadbw_epu8(a, a, 0b111);
        let e = _mm_setr_epi16(32, 28, 24, 20, 16, 12, 8, 4);
        assert_eq_m128i(r, e);
    }

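    // The PTEST family reports flag results as integers:
    // `_mm_testz_si128` returns 1 iff (a AND mask) is all zeros (ZF),
    // `_mm_testc_si128` returns 1 iff (NOT a AND mask) is all zeros
    // (CF), and `_mm_testnzc_si128` returns 1 iff both of those are
    // nonzero, i.e. `mask` selects a mix of set and clear bits in `a`.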
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_testz_si128() {
        let a = _mm_set1_epi8(1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_testz_si128(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_testz_si128(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b011);
        let mask = _mm_set1_epi8(0b100);
        let r = _mm_testz_si128(a, mask);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_testc_si128() {
        let a = _mm_set1_epi8(-1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_testc_si128(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_testc_si128(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b100);
        let r = _mm_testc_si128(a, mask);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_testnzc_si128() {
        let a = _mm_set1_epi8(0);
        let mask = _mm_set1_epi8(1);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(-1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b101);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 0);
    }

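    // `_mm_test_all_zeros`, `_mm_test_all_ones` and
    // `_mm_test_mix_ones_zeros` are convenience wrappers over the same
    // PTEST flags: all-zeros matches `_mm_testz_si128`, mix matches
    // `_mm_testnzc_si128`, and all-ones tests `a` against an all-ones
    // mask, returning 1 only when every bit of `a` is set.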
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_test_all_zeros() {
        let a = _mm_set1_epi8(1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_test_all_zeros(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_test_all_zeros(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b011);
        let mask = _mm_set1_epi8(0b100);
        let r = _mm_test_all_zeros(a, mask);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_test_all_ones() {
        let a = _mm_set1_epi8(-1);
        let r = _mm_test_all_ones(a);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let r = _mm_test_all_ones(a);
        assert_eq!(r, 0);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_test_mix_ones_zeros() {
        let a = _mm_set1_epi8(0);
        let mask = _mm_set1_epi8(1);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(-1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b101);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 0);
    }
}
1909 }