//! Streaming SIMD Extensions 4.1 (SSE4.1)

use crate::{
    core_arch::{simd::*, simd_llvm::*, x86::*},
    mem::transmute,
};

#[cfg(test)]
use stdarch_test::assert_instr;

// SSE4 rounding constants
/// round to nearest
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_NEAREST_INT: i32 = 0x00;
/// round down
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_NEG_INF: i32 = 0x01;
/// round up
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_POS_INF: i32 = 0x02;
/// truncate
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_ZERO: i32 = 0x03;
/// use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_CUR_DIRECTION: i32 = 0x04;
/// do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_RAISE_EXC: i32 = 0x00;
/// suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NO_EXC: i32 = 0x08;
/// round to nearest and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NINT: i32 = 0x00;
/// round down and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_FLOOR: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF;
/// round up and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_CEIL: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF;
/// truncate and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TRUNC: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO;
/// use MXCSR.RC and do not suppress exceptions; see
/// `_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_RINT: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION;
/// use MXCSR.RC and suppress exceptions; see `_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NEARBYINT: i32 = _MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION;

/// Blend packed 8-bit integers from `a` and `b` using `mask`
///
/// The high bit of each corresponding mask byte determines the selection.
/// If the high bit is set the element of `b` is selected. The element
/// of `a` is selected otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_epi8)
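///
/// A minimal usage sketch (values chosen only for illustration): with the
/// high bit set in every mask byte, each lane comes from `b`:
///
/// ```
/// #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
/// {
///     #[cfg(target_arch = "x86")]
///     use std::arch::x86::*;
///     #[cfg(target_arch = "x86_64")]
///     use std::arch::x86_64::*;
///
///     if is_x86_feature_detected!("sse4.1") {
///         unsafe {
///             let a = _mm_set1_epi8(1);
///             let b = _mm_set1_epi8(2);
///             let mask = _mm_set1_epi8(-1); // high bit set in every byte
///             let r = _mm_blendv_epi8(a, b, mask);
///             assert_eq!(_mm_extract_epi8(r, 0), 2);
///         }
///     }
/// }
/// ```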
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pblendvb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i {
    transmute(pblendvb(a.as_i8x16(), b.as_i8x16(), mask.as_i8x16()))
}

/// Blend packed 16-bit integers from `a` and `b` using the mask `imm8`.
///
/// The mask bits determine the selection. A clear bit selects the
/// corresponding element of `a`, and a set bit the corresponding
/// element of `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_epi16)
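///
/// A minimal sketch (mask chosen only for illustration): `0b0000_0001` takes
/// just the lowest 16-bit lane from `b`:
///
/// ```
/// #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
/// {
///     #[cfg(target_arch = "x86")]
///     use std::arch::x86::*;
///     #[cfg(target_arch = "x86_64")]
///     use std::arch::x86_64::*;
///
///     if is_x86_feature_detected!("sse4.1") {
///         unsafe {
///             let a = _mm_set1_epi16(0);
///             let b = _mm_set1_epi16(7);
///             let r = _mm_blend_epi16(a, b, 0b0000_0001);
///             assert_eq!(_mm_extract_epi16(r, 0), 7); // lane 0 from `b`
///             assert_eq!(_mm_extract_epi16(r, 1), 0); // lane 1 from `a`
///         }
///     }
/// }
/// ```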
#[inline]
#[target_feature(enable = "sse4.1")]
// Note: LLVM7 prefers the single-precision floating-point domain when possible
// see https://bugs.llvm.org/show_bug.cgi?id=38195
// #[cfg_attr(test, assert_instr(pblendw, imm8 = 0xF0))]
#[cfg_attr(test, assert_instr(blendps, imm8 = 0xF0))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blend_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
    let a = a.as_i16x8();
    let b = b.as_i16x8();
    macro_rules! call {
        ($imm8:expr) => {
            pblendw(a, b, $imm8)
        };
    }
    transmute(constify_imm8!(imm8, call))
}

/// Blend packed double-precision (64-bit) floating-point elements from `a`
/// and `b` using `mask`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendvpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d {
    blendvpd(a, b, mask)
}

/// Blend packed single-precision (32-bit) floating-point elements from `a`
/// and `b` using `mask`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendvps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 {
    blendvps(a, b, mask)
}

/// Blend packed double-precision (64-bit) floating-point elements from `a`
/// and `b` using control mask `imm2`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
// Note: LLVM7 prefers the single-precision floating-point domain when possible
// see https://bugs.llvm.org/show_bug.cgi?id=38195
// #[cfg_attr(test, assert_instr(blendpd, imm2 = 0b10))]
#[cfg_attr(test, assert_instr(blendps, imm2 = 0b10))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blend_pd(a: __m128d, b: __m128d, imm2: i32) -> __m128d {
    macro_rules! call {
        ($imm2:expr) => {
            blendpd(a, b, $imm2)
        };
    }
    constify_imm2!(imm2, call)
}

/// Blend packed single-precision (32-bit) floating-point elements from `a`
/// and `b` using mask `imm4`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendps, imm4 = 0b0101))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blend_ps(a: __m128, b: __m128, imm4: i32) -> __m128 {
    macro_rules! call {
        ($imm4:expr) => {
            blendps(a, b, $imm4)
        };
    }
    constify_imm4!(imm4, call)
}

/// Extracts a single-precision (32-bit) floating-point element from `a`,
/// selected with `imm8`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_ps)
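///
/// A minimal sketch (values chosen only for illustration): the lane comes back
/// as its raw bit pattern in an `i32`, so `f32::from_bits` recovers the value:
///
/// ```
/// #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
/// {
///     #[cfg(target_arch = "x86")]
///     use std::arch::x86::*;
///     #[cfg(target_arch = "x86_64")]
///     use std::arch::x86_64::*;
///
///     if is_x86_feature_detected!("sse4.1") {
///         unsafe {
///             let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
///             let bits = _mm_extract_ps(a, 2) as u32;
///             assert_eq!(f32::from_bits(bits), 3.0);
///         }
///     }
/// }
/// ```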
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(extractps, imm8 = 0)
)]
#[rustc_args_required_const(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_extract_ps(a: __m128, imm8: i32) -> i32 {
    macro_rules! call {
        ($imm2:expr) => {
            transmute(simd_extract::<_, f32>(a, $imm2))
        };
    }
    constify_imm2!(imm8, call)
}

/// Extracts an 8-bit integer from `a`, selected with `imm8`. Returns a 32-bit
/// integer containing the zero-extended integer data.
///
/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pextrb, imm8 = 0))]
#[rustc_args_required_const(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_extract_epi8(a: __m128i, imm8: i32) -> i32 {
    let a = a.as_u8x16();
    macro_rules! call {
        ($imm4:expr) => {
            simd_extract::<_, u8>(a, $imm4) as i32
        };
    }
    constify_imm4!(imm8, call)
}

/// Extracts a 32-bit integer from `a` selected with `imm8`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(extractps, imm8 = 1)
)]
#[rustc_args_required_const(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_extract_epi32(a: __m128i, imm8: i32) -> i32 {
    let a = a.as_i32x4();
    macro_rules! call {
        ($imm2:expr) => {
            simd_extract::<_, i32>(a, $imm2)
        };
    }
    constify_imm2!(imm8, call)
}

/// Selects a single value in `b` to store at some position in `a`,
/// then zeroes elements according to `imm8`.
///
/// `imm8` specifies which bits from operand `b` will be copied, which bits in
/// the result they will be copied to, and which bits in the result will be
/// cleared. The following assignments are made:
///
/// * Bits `[7:6]` specify the bits to copy from operand `b`:
///     - `00`: Selects bits `[31:0]` from operand `b`.
///     - `01`: Selects bits `[63:32]` from operand `b`.
///     - `10`: Selects bits `[95:64]` from operand `b`.
///     - `11`: Selects bits `[127:96]` from operand `b`.
///
/// * Bits `[5:4]` specify the bits in the result to which the selected bits
///   from operand `b` are copied:
///     - `00`: Copies the selected bits from `b` to result bits `[31:0]`.
///     - `01`: Copies the selected bits from `b` to result bits `[63:32]`.
///     - `10`: Copies the selected bits from `b` to result bits `[95:64]`.
///     - `11`: Copies the selected bits from `b` to result bits `[127:96]`.
///
/// * Bits `[3:0]`: If any of these bits are set, the corresponding result
///   element is cleared.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_ps)
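///
/// A minimal sketch of the bit fields (mask chosen only for illustration):
/// `0b11_00_0000` copies lane 3 of `b` into lane 0 of the result and clears
/// no lanes:
///
/// ```
/// #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
/// {
///     #[cfg(target_arch = "x86")]
///     use std::arch::x86::*;
///     #[cfg(target_arch = "x86_64")]
///     use std::arch::x86_64::*;
///
///     if is_x86_feature_detected!("sse4.1") {
///         unsafe {
///             let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
///             let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
///             let r = _mm_insert_ps(a, b, 0b11_00_0000);
///             // lane 0 now holds b's lane 3; the other lanes keep a's values
///             assert_eq!(_mm_cvtss_f32(r), 8.0);
///         }
///     }
/// }
/// ```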
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(insertps, imm8 = 0b1010))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_insert_ps(a: __m128, b: __m128, imm8: i32) -> __m128 {
    macro_rules! call {
        ($imm8:expr) => {
            insertps(a, b, $imm8)
        };
    }
    constify_imm8!(imm8, call)
}

/// Returns a copy of `a` with the 8-bit integer from `i` inserted at a
/// location specified by `imm8`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pinsrb, imm8 = 0))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_insert_epi8(a: __m128i, i: i32, imm8: i32) -> __m128i {
    let a = a.as_i8x16();
    macro_rules! call {
        ($imm4:expr) => {
            transmute(simd_insert(a, $imm4, i as i8))
        };
    }
    constify_imm4!(imm8, call)
}

/// Returns a copy of `a` with the 32-bit integer from `i` inserted at a
/// location specified by `imm8`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pinsrd, imm8 = 0))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_insert_epi32(a: __m128i, i: i32, imm8: i32) -> __m128i {
    let a = a.as_i32x4();
    macro_rules! call {
        ($imm2:expr) => {
            transmute(simd_insert(a, $imm2, i))
        };
    }
    constify_imm2!(imm8, call)
}

/// Compares packed 8-bit integers in `a` and `b` and returns packed maximum
/// values in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epi8(a: __m128i, b: __m128i) -> __m128i {
    transmute(pmaxsb(a.as_i8x16(), b.as_i8x16()))
}

/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed
/// maximum.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epu16(a: __m128i, b: __m128i) -> __m128i {
    transmute(pmaxuw(a.as_u16x8(), b.as_u16x8()))
}

/// Compares packed 32-bit integers in `a` and `b`, and returns packed maximum
/// values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epi32(a: __m128i, b: __m128i) -> __m128i {
    transmute(pmaxsd(a.as_i32x4(), b.as_i32x4()))
}

/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed
/// maximum values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxud))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epu32(a: __m128i, b: __m128i) -> __m128i {
    transmute(pmaxud(a.as_u32x4(), b.as_u32x4()))
}

/// Compares packed 8-bit integers in `a` and `b` and returns packed minimum
/// values in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epi8(a: __m128i, b: __m128i) -> __m128i {
    transmute(pminsb(a.as_i8x16(), b.as_i8x16()))
}

/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed
/// minimum.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epu16(a: __m128i, b: __m128i) -> __m128i {
    transmute(pminuw(a.as_u16x8(), b.as_u16x8()))
}

/// Compares packed 32-bit integers in `a` and `b`, and returns packed minimum
/// values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epi32(a: __m128i, b: __m128i) -> __m128i {
    transmute(pminsd(a.as_i32x4(), b.as_i32x4()))
}

/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed
/// minimum values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminud))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i {
    transmute(pminud(a.as_u32x4(), b.as_u32x4()))
}

/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
/// using unsigned saturation
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_packus_epi32)
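///
/// A minimal sketch of the saturation (values chosen only for illustration):
/// negative inputs clamp to `0` and values above `65535` clamp to `65535`:
///
/// ```
/// #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
/// {
///     #[cfg(target_arch = "x86")]
///     use std::arch::x86::*;
///     #[cfg(target_arch = "x86_64")]
///     use std::arch::x86_64::*;
///
///     if is_x86_feature_detected!("sse4.1") {
///         unsafe {
///             let a = _mm_setr_epi32(-1, 70_000, 1, 2);
///             let r = _mm_packus_epi32(a, a);
///             assert_eq!(_mm_extract_epi16(r, 0), 0); // -1 saturates to 0
///             assert_eq!(_mm_extract_epi16(r, 1), 65_535); // 70_000 saturates
///         }
///     }
/// }
/// ```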
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(packusdw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_packus_epi32(a: __m128i, b: __m128i) -> __m128i {
    transmute(packusdw(a.as_i32x4(), b.as_i32x4()))
}

/// Compares packed 64-bit integers in `a` and `b` for equality
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pcmpeqq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpeq_epi64(a: __m128i, b: __m128i) -> __m128i {
    transmute(simd_eq::<_, i64x2>(a.as_i64x2(), b.as_i64x2()))
}

/// Sign extend packed 8-bit integers in `a` to packed 16-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi8_epi16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi8_epi16(a: __m128i) -> __m128i {
    let a = a.as_i8x16();
    let a = simd_shuffle8::<_, i8x8>(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
    transmute(simd_cast::<_, i16x8>(a))
}

/// Sign extend packed 8-bit integers in `a` to packed 32-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi8_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi8_epi32(a: __m128i) -> __m128i {
    let a = a.as_i8x16();
    let a = simd_shuffle4::<_, i8x4>(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}

/// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed
/// 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi8_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi8_epi64(a: __m128i) -> __m128i {
    let a = a.as_i8x16();
    let a = simd_shuffle2::<_, i8x2>(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Sign extend packed 16-bit integers in `a` to packed 32-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi16_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi16_epi32(a: __m128i) -> __m128i {
    let a = a.as_i16x8();
    let a = simd_shuffle4::<_, i16x4>(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}

/// Sign extend packed 16-bit integers in `a` to packed 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi16_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxwq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi16_epi64(a: __m128i) -> __m128i {
    let a = a.as_i16x8();
    let a = simd_shuffle2::<_, i16x2>(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Sign extend packed 32-bit integers in `a` to packed 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi32_epi64(a: __m128i) -> __m128i {
    let a = a.as_i32x4();
    let a = simd_shuffle2::<_, i32x2>(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Zero extend packed unsigned 8-bit integers in `a` to packed 16-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu8_epi16(a: __m128i) -> __m128i {
    let a = a.as_u8x16();
    let a = simd_shuffle8::<_, u8x8>(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
    transmute(simd_cast::<_, i16x8>(a))
}

/// Zero extend packed unsigned 8-bit integers in `a` to packed 32-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu8_epi32(a: __m128i) -> __m128i {
    let a = a.as_u8x16();
    let a = simd_shuffle4::<_, u8x4>(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}

/// Zero extend packed unsigned 8-bit integers in `a` to packed 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu8_epi64(a: __m128i) -> __m128i {
    let a = a.as_u8x16();
    let a = simd_shuffle2::<_, u8x2>(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Zero extend packed unsigned 16-bit integers in `a`
/// to packed 32-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu16_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu16_epi32(a: __m128i) -> __m128i {
    let a = a.as_u16x8();
    let a = simd_shuffle4::<_, u16x4>(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}

/// Zero extend packed unsigned 16-bit integers in `a`
/// to packed 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu16_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxwq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu16_epi64(a: __m128i) -> __m128i {
    let a = a.as_u16x8();
    let a = simd_shuffle2::<_, u16x2>(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Zero extend packed unsigned 32-bit integers in `a`
/// to packed 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu32_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu32_epi64(a: __m128i) -> __m128i {
    let a = a.as_u32x4();
    let a = simd_shuffle2::<_, u32x2>(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Returns the dot product of two `__m128d` vectors.
///
/// `imm8[1:0]` is the broadcast mask, and `imm8[5:4]` is the condition mask.
/// If a condition mask bit is zero, the corresponding multiplication is
/// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of
/// the dot product will be stored in the return value component. Otherwise if
/// the broadcast mask bit is zero then the return component will be zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(dppd, imm8 = 0))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_dp_pd(a: __m128d, b: __m128d, imm8: i32) -> __m128d {
    macro_rules! call {
        ($imm8:expr) => {
            dppd(a, b, $imm8)
        };
    }
    constify_imm8!(imm8, call)
}

/// Returns the dot product of two `__m128` vectors.
///
/// `imm8[3:0]` is the broadcast mask, and `imm8[7:4]` is the condition mask.
/// If a condition mask bit is zero, the corresponding multiplication is
/// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of
/// the dot product will be stored in the return value component. Otherwise if
/// the broadcast mask bit is zero then the return component will be zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps)
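///
/// A minimal sketch (mask chosen only for illustration): `0xFF` multiplies all
/// four lane pairs, sums them, and broadcasts the sum to every result lane:
///
/// ```
/// #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
/// {
///     #[cfg(target_arch = "x86")]
///     use std::arch::x86::*;
///     #[cfg(target_arch = "x86_64")]
///     use std::arch::x86_64::*;
///
///     if is_x86_feature_detected!("sse4.1") {
///         unsafe {
///             let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
///             let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
///             let r = _mm_dp_ps(a, b, 0xFF);
///             // 1*5 + 2*6 + 3*7 + 4*8 = 70
///             assert_eq!(_mm_cvtss_f32(r), 70.0);
///         }
///     }
/// }
/// ```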
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(dpps, imm8 = 0))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_dp_ps(a: __m128, b: __m128, imm8: i32) -> __m128 {
    macro_rules! call {
        ($imm8:expr) => {
            dpps(a, b, $imm8)
        };
    }
    constify_imm8!(imm8, call)
}

/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// down to an integer value, and stores the results as packed double-precision
/// floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_pd(a: __m128d) -> __m128d {
    simd_floor(a)
}

/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// down to an integer value, and stores the results as packed single-precision
/// floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_ps(a: __m128) -> __m128 {
    simd_floor(a)
}

/// Round the lower double-precision (64-bit) floating-point element in `b`
/// down to an integer value, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element of the intrinsic
/// result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_sd(a: __m128d, b: __m128d) -> __m128d {
    roundsd(a, b, _MM_FROUND_FLOOR)
}

/// Round the lower single-precision (32-bit) floating-point element in `b`
/// down to an integer value, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_ss(a: __m128, b: __m128) -> __m128 {
    roundss(a, b, _MM_FROUND_FLOOR)
}

/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// up to an integer value, and stores the results as packed double-precision
/// floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_pd(a: __m128d) -> __m128d {
    simd_ceil(a)
}

/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// up to an integer value, and stores the results as packed single-precision
/// floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_ps(a: __m128) -> __m128 {
    simd_ceil(a)
}

/// Round the lower double-precision (64-bit) floating-point element in `b`
/// up to an integer value, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element
/// of the intrinsic result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_sd(a: __m128d, b: __m128d) -> __m128d {
    roundsd(a, b, _MM_FROUND_CEIL)
}

/// Round the lower single-precision (32-bit) floating-point element in `b`
/// up to an integer value, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_ss(a: __m128, b: __m128) -> __m128 {
    roundss(a, b, _MM_FROUND_CEIL)
}

/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// using the `rounding` parameter, and stores the results as packed
/// double-precision floating-point elements.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd, rounding = 0))]
#[rustc_args_required_const(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_pd(a: __m128d, rounding: i32) -> __m128d {
    macro_rules! call {
        ($imm4:expr) => {
            roundpd(a, $imm4)
        };
    }
    constify_imm4!(rounding, call)
}

/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// using the `rounding` parameter, and stores the results as packed
/// single-precision floating-point elements.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps, rounding = 0))]
#[rustc_args_required_const(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_ps(a: __m128, rounding: i32) -> __m128 {
    macro_rules! call {
        ($imm4:expr) => {
            roundps(a, $imm4)
        };
    }
    constify_imm4!(rounding, call)
}

/// Round the lower double-precision (64-bit) floating-point element in `b`
/// using the `rounding` parameter, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element of the intrinsic
/// result.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd, rounding = 0))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d {
    macro_rules! call {
        ($imm4:expr) => {
            roundsd(a, b, $imm4)
        };
    }
    constify_imm4!(rounding, call)
}

/// Round the lower single-precision (32-bit) floating-point element in `b`
/// using the `rounding` parameter, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss, rounding = 0))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 {
    macro_rules! call {
        ($imm4:expr) => {
            roundss(a, b, $imm4)
        };
    }
    constify_imm4!(rounding, call)
}

/// Finds the minimum unsigned 16-bit element in the 128-bit `__m128i` vector,
/// returning a vector containing its value in its first position, and its
/// index in its second position; all other elements are set to zero.
///
/// This intrinsic corresponds to the `VPHMINPOSUW` / `PHMINPOSUW`
/// instruction.
///
/// Arguments:
///
/// * `a` - A 128-bit vector of type `__m128i`.
///
/// Returns:
///
/// A 128-bit value where:
///
/// * bits `[15:0]` - contain the minimum value found in parameter `a`,
/// * bits `[18:16]` - contain the index of the minimum value
/// * remaining bits are set to `0`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16)
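///
/// A minimal sketch (values chosen only for illustration): the minimum `9`
/// sits at index `3`, so lane 0 of the result is `9` and lane 1 is `3`:
///
/// ```
/// #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
/// {
///     #[cfg(target_arch = "x86")]
///     use std::arch::x86::*;
///     #[cfg(target_arch = "x86_64")]
///     use std::arch::x86_64::*;
///
///     if is_x86_feature_detected!("sse4.1") {
///         unsafe {
///             let a = _mm_setr_epi16(23, 18, 44, 9, 12, 77, 51, 30);
///             let r = _mm_minpos_epu16(a);
///             assert_eq!(_mm_extract_epi16(r, 0), 9); // minimum value
///             assert_eq!(_mm_extract_epi16(r, 1), 3); // its index
///         }
///     }
/// }
/// ```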
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(phminposuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_minpos_epu16(a: __m128i) -> __m128i {
    transmute(phminposuw(a.as_u16x8()))
}

/// Multiplies the low 32-bit integers from each packed 64-bit
/// element in `a` and `b`, and returns the signed 64-bit result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmuldq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mul_epi32(a: __m128i, b: __m128i) -> __m128i {
    transmute(pmuldq(a.as_i32x4(), b.as_i32x4()))
}

/// Multiplies the packed 32-bit integers in `a` and `b`, producing
/// intermediate 64-bit integers, and returns the low 32 bits of each
/// intermediate result, reinterpreted as a signed integer. The arithmetic
/// wraps: while `pmulld __m128i::splat(2), __m128i::splat(2)` returns the
/// obvious `__m128i::splat(4)`, `pmulld __m128i::splat(i32::MAX),
/// __m128i::splat(2)` returns a negative number.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mullo_epi32)
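///
/// A minimal sketch of the wrapping behavior (values chosen only for
/// illustration):
///
/// ```
/// #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
/// {
///     #[cfg(target_arch = "x86")]
///     use std::arch::x86::*;
///     #[cfg(target_arch = "x86_64")]
///     use std::arch::x86_64::*;
///
///     if is_x86_feature_detected!("sse4.1") {
///         unsafe {
///             let a = _mm_set1_epi32(i32::MAX);
///             let r = _mm_mullo_epi32(a, _mm_set1_epi32(2));
///             // the low 32 bits of i32::MAX * 2 reinterpret as -2
///             assert_eq!(_mm_extract_epi32(r, 0), -2);
///         }
///     }
/// }
/// ```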
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmulld))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mullo_epi32(a: __m128i, b: __m128i) -> __m128i {
    transmute(simd_mul(a.as_i32x4(), b.as_i32x4()))
}

/// Subtracts 8-bit unsigned integer values in `b` from the corresponding
/// values in `a` and computes the absolute values of the differences.
/// Sums of four absolute differences at a time are then written to the
/// destination, with the operand offsets controlled by the bit fields in
/// the immediate operand.
///
/// The following algorithm is performed:
///
/// ```ignore
/// i = imm8[2] * 4
/// j = imm8[1:0] * 4
/// for k := 0 to 7
///     d0 = abs(a[i + k + 0] - b[j + 0])
///     d1 = abs(a[i + k + 1] - b[j + 1])
///     d2 = abs(a[i + k + 2] - b[j + 2])
///     d3 = abs(a[i + k + 3] - b[j + 3])
///     r[k] = d0 + d1 + d2 + d3
/// ```
///
/// Arguments:
///
/// * `a` - A 128-bit vector of type `__m128i`.
/// * `b` - A 128-bit vector of type `__m128i`.
/// * `imm8` - An 8-bit immediate operand specifying how the absolute
///   differences are to be calculated
///     * Bit `[2]` specifies the offset for operand `a`
///     * Bits `[1:0]` specify the offset for operand `b`
///
/// Returns:
///
/// * A `__m128i` vector containing the sums of the sets of absolute
///   differences between both operands.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mpsadbw_epu8)
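///
/// A minimal sketch (values chosen only for illustration): with `imm8 = 0`
/// both offsets are zero, so `r[0]` compares `a[0..4]` against `b[0..4]`:
///
/// ```
/// #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
/// {
///     #[cfg(target_arch = "x86")]
///     use std::arch::x86::*;
///     #[cfg(target_arch = "x86_64")]
///     use std::arch::x86_64::*;
///
///     if is_x86_feature_detected!("sse4.1") {
///         unsafe {
///             let a = _mm_setr_epi8(1, 2, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
///             let b = _mm_setr_epi8(5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
///             let r = _mm_mpsadbw_epu8(a, b, 0);
///             // |1-5| + |2-5| + |3-5| + |4-5| = 10
///             assert_eq!(_mm_extract_epi16(r, 0), 10);
///         }
///     }
/// }
/// ```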
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(mpsadbw, imm8 = 0))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mpsadbw_epu8(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
    let a = a.as_u8x16();
    let b = b.as_u8x16();
    macro_rules! call {
        ($imm8:expr) => {
            mpsadbw(a, b, $imm8)
        };
    }
    transmute(constify_imm3!(imm8, call))
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// zeros.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all zeros,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128)
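///
/// A minimal sketch (values chosen only for illustration): `a & mask` is zero
/// here, so the result is `1`:
///
/// ```
/// #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
/// {
///     #[cfg(target_arch = "x86")]
///     use std::arch::x86::*;
///     #[cfg(target_arch = "x86_64")]
///     use std::arch::x86_64::*;
///
///     if is_x86_feature_detected!("sse4.1") {
///         unsafe {
///             let a = _mm_setr_epi32(0b0101, 0, 0, 0);
///             let mask = _mm_setr_epi32(0b1010, 0, 0, 0);
///             assert_eq!(_mm_testz_si128(a, mask), 1);
///         }
///     }
/// }
/// ```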
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 {
    ptestz(a.as_i64x2(), mask.as_i64x2())
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_testc_si128(a: __m128i, mask: __m128i) -> i32 {
    ptestc(a.as_i64x2(), mask.as_i64x2())
}

/// Tests whether the specified bits in a 128-bit integer vector are
/// neither all zeros nor all ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are neither all zeros nor all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testnzc_si128)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_testnzc_si128(a: __m128i, mask: __m128i) -> i32 {
    ptestnzc(a.as_i64x2(), mask.as_i64x2())
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// zeros.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all zeros,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_test_all_zeros(a: __m128i, mask: __m128i) -> i32 {
    _mm_testz_si128(a, mask)
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// ones.
///
/// Argument:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
///
/// Returns:
///
/// * `1` - if the bits specified in the operand are all set to 1,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pcmpeqd))]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_test_all_ones(a: __m128i) -> i32 {
    _mm_testc_si128(a, _mm_cmpeq_epi32(a, a))
}

/// Tests whether the specified bits in a 128-bit integer vector are
/// neither all zeros nor all ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are neither all zeros nor all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_mix_ones_zeros)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 {
    _mm_testnzc_si128(a, mask)
}

1160 #[allow(improper_ctypes)]
1161 extern "C" {
1162 #[link_name = "llvm.x86.sse41.pblendvb"]
1163 fn pblendvb(a: i8x16, b: i8x16, mask: i8x16) -> i8x16;
1164 #[link_name = "llvm.x86.sse41.blendvpd"]
1165 fn blendvpd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d;
1166 #[link_name = "llvm.x86.sse41.blendvps"]
1167 fn blendvps(a: __m128, b: __m128, mask: __m128) -> __m128;
1168 #[link_name = "llvm.x86.sse41.blendpd"]
1169 fn blendpd(a: __m128d, b: __m128d, imm2: u8) -> __m128d;
1170 #[link_name = "llvm.x86.sse41.blendps"]
1171 fn blendps(a: __m128, b: __m128, imm4: u8) -> __m128;
1172 #[link_name = "llvm.x86.sse41.pblendw"]
1173 fn pblendw(a: i16x8, b: i16x8, imm8: u8) -> i16x8;
1174 #[link_name = "llvm.x86.sse41.insertps"]
1175 fn insertps(a: __m128, b: __m128, imm8: u8) -> __m128;
1176 #[link_name = "llvm.x86.sse41.pmaxsb"]
1177 fn pmaxsb(a: i8x16, b: i8x16) -> i8x16;
1178 #[link_name = "llvm.x86.sse41.pmaxuw"]
1179 fn pmaxuw(a: u16x8, b: u16x8) -> u16x8;
1180 #[link_name = "llvm.x86.sse41.pmaxsd"]
1181 fn pmaxsd(a: i32x4, b: i32x4) -> i32x4;
1182 #[link_name = "llvm.x86.sse41.pmaxud"]
1183 fn pmaxud(a: u32x4, b: u32x4) -> u32x4;
1184 #[link_name = "llvm.x86.sse41.pminsb"]
1185 fn pminsb(a: i8x16, b: i8x16) -> i8x16;
1186 #[link_name = "llvm.x86.sse41.pminuw"]
1187 fn pminuw(a: u16x8, b: u16x8) -> u16x8;
1188 #[link_name = "llvm.x86.sse41.pminsd"]
1189 fn pminsd(a: i32x4, b: i32x4) -> i32x4;
1190 #[link_name = "llvm.x86.sse41.pminud"]
1191 fn pminud(a: u32x4, b: u32x4) -> u32x4;
1192 #[link_name = "llvm.x86.sse41.packusdw"]
1193 fn packusdw(a: i32x4, b: i32x4) -> u16x8;
1194 #[link_name = "llvm.x86.sse41.dppd"]
1195 fn dppd(a: __m128d, b: __m128d, imm8: u8) -> __m128d;
1196 #[link_name = "llvm.x86.sse41.dpps"]
1197 fn dpps(a: __m128, b: __m128, imm8: u8) -> __m128;
1198 #[link_name = "llvm.x86.sse41.round.pd"]
1199 fn roundpd(a: __m128d, rounding: i32) -> __m128d;
1200 #[link_name = "llvm.x86.sse41.round.ps"]
1201 fn roundps(a: __m128, rounding: i32) -> __m128;
1202 #[link_name = "llvm.x86.sse41.round.sd"]
1203 fn roundsd(a: __m128d, b: __m128d, rounding: i32) -> __m128d;
1204 #[link_name = "llvm.x86.sse41.round.ss"]
1205 fn roundss(a: __m128, b: __m128, rounding: i32) -> __m128;
1206 #[link_name = "llvm.x86.sse41.phminposuw"]
1207 fn phminposuw(a: u16x8) -> u16x8;
1208 #[link_name = "llvm.x86.sse41.pmuldq"]
1209 fn pmuldq(a: i32x4, b: i32x4) -> i64x2;
1210 #[link_name = "llvm.x86.sse41.mpsadbw"]
1211 fn mpsadbw(a: u8x16, b: u8x16, imm8: u8) -> u16x8;
1212 #[link_name = "llvm.x86.sse41.ptestz"]
1213 fn ptestz(a: i64x2, mask: i64x2) -> i32;
1214 #[link_name = "llvm.x86.sse41.ptestc"]
1215 fn ptestc(a: i64x2, mask: i64x2) -> i32;
1216 #[link_name = "llvm.x86.sse41.ptestnzc"]
1217 fn ptestnzc(a: i64x2, mask: i64x2) -> i32;
1218 }
1219
1220 #[cfg(test)]
1221 mod tests {
1222 use crate::core_arch::x86::*;
1223 use std::mem;
1224 use stdarch_test::simd_test;
1225
1226 #[simd_test(enable = "sse4.1")]
1227 unsafe fn test_mm_blendv_epi8() {
1228 #[rustfmt::skip]
1229 let a = _mm_setr_epi8(
1230 0, 1, 2, 3, 4, 5, 6, 7,
1231 8, 9, 10, 11, 12, 13, 14, 15,
1232 );
1233 #[rustfmt::skip]
1234 let b = _mm_setr_epi8(
1235 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
1236 );
1237 #[rustfmt::skip]
1238 let mask = _mm_setr_epi8(
1239 0, -1, 0, -1, 0, -1, 0, -1,
1240 0, -1, 0, -1, 0, -1, 0, -1,
1241 );
1242 #[rustfmt::skip]
1243 let e = _mm_setr_epi8(
1244 0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31,
1245 );
1246 assert_eq_m128i(_mm_blendv_epi8(a, b, mask), e);
1247 }
1248
1249 #[simd_test(enable = "sse4.1")]
1250 unsafe fn test_mm_blendv_pd() {
1251 let a = _mm_set1_pd(0.0);
1252 let b = _mm_set1_pd(1.0);
1253 let mask = transmute(_mm_setr_epi64x(0, -1));
1254 let r = _mm_blendv_pd(a, b, mask);
1255 let e = _mm_setr_pd(0.0, 1.0);
1256 assert_eq_m128d(r, e);
1257 }
1258
1259 #[simd_test(enable = "sse4.1")]
1260 unsafe fn test_mm_blendv_ps() {
1261 let a = _mm_set1_ps(0.0);
1262 let b = _mm_set1_ps(1.0);
1263 let mask = transmute(_mm_setr_epi32(0, -1, 0, -1));
1264 let r = _mm_blendv_ps(a, b, mask);
1265 let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0);
1266 assert_eq_m128(r, e);
1267 }
1268
1269 #[simd_test(enable = "sse4.1")]
1270 unsafe fn test_mm_blend_pd() {
1271 let a = _mm_set1_pd(0.0);
1272 let b = _mm_set1_pd(1.0);
1273 let r = _mm_blend_pd(a, b, 0b10);
1274 let e = _mm_setr_pd(0.0, 1.0);
1275 assert_eq_m128d(r, e);
1276 }
1277
1278 #[simd_test(enable = "sse4.1")]
1279 unsafe fn test_mm_blend_ps() {
1280 let a = _mm_set1_ps(0.0);
1281 let b = _mm_set1_ps(1.0);
1282 let r = _mm_blend_ps(a, b, 0b1010);
1283 let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0);
1284 assert_eq_m128(r, e);
1285 }
1286
1287 #[simd_test(enable = "sse4.1")]
1288 unsafe fn test_mm_blend_epi16() {
1289 let a = _mm_set1_epi16(0);
1290 let b = _mm_set1_epi16(1);
1291 let r = _mm_blend_epi16(a, b, 0b1010_1100);
1292 let e = _mm_setr_epi16(0, 0, 1, 1, 0, 1, 0, 1);
1293 assert_eq_m128i(r, e);
1294 }
1295
1296 #[simd_test(enable = "sse4.1")]
1297 unsafe fn test_mm_extract_ps() {
1298 let a = _mm_setr_ps(0.0, 1.0, 2.0, 3.0);
1299 let r: f32 = transmute(_mm_extract_ps(a, 1));
1300 assert_eq!(r, 1.0);
1301 let r: f32 = transmute(_mm_extract_ps(a, 5));
1302 assert_eq!(r, 1.0);
1303 }
1304
1305 #[simd_test(enable = "sse4.1")]
1306 unsafe fn test_mm_extract_epi8() {
1307 #[rustfmt::skip]
1308 let a = _mm_setr_epi8(
1309 -1, 1, 2, 3, 4, 5, 6, 7,
1310 8, 9, 10, 11, 12, 13, 14, 15
1311 );
1312 let r1 = _mm_extract_epi8(a, 0);
1313 let r2 = _mm_extract_epi8(a, 19);
1314 assert_eq!(r1, 0xFF);
1315 assert_eq!(r2, 3);
1316 }
1317
1318 #[simd_test(enable = "sse4.1")]
1319 unsafe fn test_mm_extract_epi32() {
1320 let a = _mm_setr_epi32(0, 1, 2, 3);
1321 let r = _mm_extract_epi32(a, 1);
1322 assert_eq!(r, 1);
1323 let r = _mm_extract_epi32(a, 5);
1324 assert_eq!(r, 1);
1325 }
1326
1327 #[simd_test(enable = "sse4.1")]
1328 unsafe fn test_mm_insert_ps() {
1329 let a = _mm_set1_ps(1.0);
1330 let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
1331 let r = _mm_insert_ps(a, b, 0b11_00_1100);
1332 let e = _mm_setr_ps(4.0, 1.0, 0.0, 0.0);
1333 assert_eq_m128(r, e);
1334 }
1335
1336 #[simd_test(enable = "sse4.1")]
1337 unsafe fn test_mm_insert_epi8() {
1338 let a = _mm_set1_epi8(0);
1339 let e = _mm_setr_epi8(0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
1340 let r = _mm_insert_epi8(a, 32, 1);
1341 assert_eq_m128i(r, e);
1342 let r = _mm_insert_epi8(a, 32, 17);
1343 assert_eq_m128i(r, e);
1344 }
1345
1346 #[simd_test(enable = "sse4.1")]
1347 unsafe fn test_mm_insert_epi32() {
1348 let a = _mm_set1_epi32(0);
1349 let e = _mm_setr_epi32(0, 32, 0, 0);
1350 let r = _mm_insert_epi32(a, 32, 1);
1351 assert_eq_m128i(r, e);
1352 let r = _mm_insert_epi32(a, 32, 5);
1353 assert_eq_m128i(r, e);
1354 }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_max_epi8() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, 4, 5, 8, 9, 12, 13, 16,
            17, 20, 21, 24, 25, 28, 29, 32,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            2, 3, 6, 7, 10, 11, 14, 15,
            18, 19, 22, 23, 26, 27, 30, 31,
        );
        let r = _mm_max_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            2, 4, 6, 8, 10, 12, 14, 16,
            18, 20, 22, 24, 26, 28, 30, 32,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_max_epu16() {
        let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16);
        let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15);
        let r = _mm_max_epu16(a, b);
        let e = _mm_setr_epi16(2, 4, 6, 8, 10, 12, 14, 16);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_max_epi32() {
        let a = _mm_setr_epi32(1, 4, 5, 8);
        let b = _mm_setr_epi32(2, 3, 6, 7);
        let r = _mm_max_epi32(a, b);
        let e = _mm_setr_epi32(2, 4, 6, 8);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_max_epu32() {
        let a = _mm_setr_epi32(1, 4, 5, 8);
        let b = _mm_setr_epi32(2, 3, 6, 7);
        let r = _mm_max_epu32(a, b);
        let e = _mm_setr_epi32(2, 4, 6, 8);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_min_epi8_1() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, 4, 5, 8, 9, 12, 13, 16,
            17, 20, 21, 24, 25, 28, 29, 32,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            2, 3, 6, 7, 10, 11, 14, 15,
            18, 19, 22, 23, 26, 27, 30, 31,
        );
        let r = _mm_min_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            1, 3, 5, 7, 9, 11, 13, 15,
            17, 19, 21, 23, 25, 27, 29, 31,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_min_epi8_2() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, -4, -5, 8, -9, -12, 13, -16,
            17, 20, 21, 24, 25, 28, 29, 32,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            2, -3, -6, 7, -10, -11, 14, -15,
            18, 19, 22, 23, 26, 27, 30, 31,
        );
        let r = _mm_min_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            1, -4, -6, 7, -10, -12, 13, -16,
            17, 19, 21, 23, 25, 27, 29, 31,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_min_epu16() {
        let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16);
        let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15);
        let r = _mm_min_epu16(a, b);
        let e = _mm_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_min_epi32_1() {
        let a = _mm_setr_epi32(1, 4, 5, 8);
        let b = _mm_setr_epi32(2, 3, 6, 7);
        let r = _mm_min_epi32(a, b);
        let e = _mm_setr_epi32(1, 3, 5, 7);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_min_epi32_2() {
        let a = _mm_setr_epi32(-1, 4, 5, -7);
        let b = _mm_setr_epi32(-2, 3, -6, 8);
        let r = _mm_min_epi32(a, b);
        let e = _mm_setr_epi32(-2, 3, -6, -7);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_min_epu32() {
        let a = _mm_setr_epi32(1, 4, 5, 8);
        let b = _mm_setr_epi32(2, 3, 6, 7);
        let r = _mm_min_epu32(a, b);
        let e = _mm_setr_epi32(1, 3, 5, 7);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_packus_epi32() {
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let b = _mm_setr_epi32(-1, -2, -3, -4);
        let r = _mm_packus_epi32(a, b);
        // Unsigned saturation clamps the negative lanes of `b` to 0.
        let e = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }
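
    // Supplementary sketch (hypothetical test, not from the upstream suite):
    // packus also saturates at the top of the unsigned 16-bit range, so
    // values above u16::MAX clamp to 0xFFFF, which reads back as -1 through
    // _mm_setr_epi16.
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_packus_epi32_saturates_high() {
        let a = _mm_setr_epi32(70000, 65535, 65536, 0);
        let b = _mm_set1_epi32(0);
        let r = _mm_packus_epi32(a, b);
        let e = _mm_setr_epi16(-1, -1, -1, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }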

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cmpeq_epi64() {
        let a = _mm_setr_epi64x(0, 1);
        let b = _mm_setr_epi64x(0, 0);
        let r = _mm_cmpeq_epi64(a, b);
        // An equal lane compares to all ones (-1), an unequal lane to 0.
        let e = _mm_setr_epi64x(-1, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi8_epi16() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepi8_epi16(a);
        let e = _mm_set1_epi16(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi8(-10);
        let r = _mm_cvtepi8_epi16(a);
        let e = _mm_set1_epi16(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi8_epi32() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepi8_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi8(-10);
        let r = _mm_cvtepi8_epi32(a);
        let e = _mm_set1_epi32(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi8_epi64() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepi8_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi8(-10);
        let r = _mm_cvtepi8_epi64(a);
        let e = _mm_set1_epi64x(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi16_epi32() {
        let a = _mm_set1_epi16(10);
        let r = _mm_cvtepi16_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi16(-10);
        let r = _mm_cvtepi16_epi32(a);
        let e = _mm_set1_epi32(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi16_epi64() {
        let a = _mm_set1_epi16(10);
        let r = _mm_cvtepi16_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi16(-10);
        let r = _mm_cvtepi16_epi64(a);
        let e = _mm_set1_epi64x(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi32_epi64() {
        let a = _mm_set1_epi32(10);
        let r = _mm_cvtepi32_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi32(-10);
        let r = _mm_cvtepi32_epi64(a);
        let e = _mm_set1_epi64x(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu8_epi16() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepu8_epi16(a);
        let e = _mm_set1_epi16(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu8_epi32() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepu8_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu8_epi64() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepu8_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu16_epi32() {
        let a = _mm_set1_epi16(10);
        let r = _mm_cvtepu16_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu16_epi64() {
        let a = _mm_set1_epi16(10);
        let r = _mm_cvtepu16_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu32_epi64() {
        let a = _mm_set1_epi32(10);
        let r = _mm_cvtepu32_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
    }
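
    // Supplementary sketch (hypothetical test, not from the upstream suite):
    // the epi conversions above sign-extend while the epu conversions
    // zero-extend; the two only differ for negative inputs, e.g. the byte
    // -10 (0xF6) widens to 246 when treated as unsigned.
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_cvtepi8_vs_cvtepu8_on_negatives() {
        let a = _mm_set1_epi8(-10);
        assert_eq_m128i(_mm_cvtepi8_epi16(a), _mm_set1_epi16(-10));
        assert_eq_m128i(_mm_cvtepu8_epi16(a), _mm_set1_epi16(246));
    }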

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_dp_pd() {
        let a = _mm_setr_pd(2.0, 3.0);
        let b = _mm_setr_pd(1.0, 4.0);
        // Bits 4-5 select both products (2*1 + 3*4 = 14); bit 0 stores the
        // sum in lane 0 only.
        let e = _mm_setr_pd(14.0, 0.0);
        assert_eq_m128d(_mm_dp_pd(a, b, 0b00110001), e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_dp_ps() {
        let a = _mm_setr_ps(2.0, 3.0, 1.0, 10.0);
        let b = _mm_setr_ps(1.0, 4.0, 0.5, 10.0);
        // Bits 4-6 select the first three products (2*1 + 3*4 + 1*0.5 = 14.5);
        // bits 0 and 2 store the sum in lanes 0 and 2.
        let e = _mm_setr_ps(14.5, 0.0, 14.5, 0.0);
        assert_eq_m128(_mm_dp_ps(a, b, 0b01110101), e);
    }
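
    // Supplementary sketch (hypothetical test, not from the upstream suite):
    // a scalar model of the dpps immediate. The high nibble gates which lane
    // products enter the dot product and the low nibble gates which result
    // lanes receive the sum; every other result lane is zeroed. This mirrors
    // the 0b01110101 case above.
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_dp_ps_scalar_model() {
        let a = [2.0f32, 3.0, 1.0, 10.0];
        let b = [1.0f32, 4.0, 0.5, 10.0];
        let imm8 = 0b0111_0101;
        // High nibble: accumulate the selected products.
        let mut sum = 0.0;
        for i in 0..4 {
            if imm8 & (1 << (4 + i)) != 0 {
                sum += a[i] * b[i];
            }
        }
        // Low nibble: broadcast the sum into the selected result lanes.
        let mut e = [0.0f32; 4];
        for i in 0..4 {
            if imm8 & (1 << i) != 0 {
                e[i] = sum;
            }
        }
        let r = _mm_dp_ps(
            _mm_setr_ps(a[0], a[1], a[2], a[3]),
            _mm_setr_ps(b[0], b[1], b[2], b[3]),
            0b0111_0101,
        );
        assert_eq_m128(r, _mm_setr_ps(e[0], e[1], e[2], e[3]));
    }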

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_floor_pd() {
        let a = _mm_setr_pd(2.5, 4.5);
        let r = _mm_floor_pd(a);
        let e = _mm_setr_pd(2.0, 4.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_floor_ps() {
        let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5);
        let r = _mm_floor_ps(a);
        let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_floor_sd() {
        let a = _mm_setr_pd(2.5, 4.5);
        let b = _mm_setr_pd(-1.5, -3.5);
        let r = _mm_floor_sd(a, b);
        // Lane 0 is floor(b[0]); the upper lane is copied from `a`.
        let e = _mm_setr_pd(-2.0, 4.5);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_floor_ss() {
        let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5);
        let b = _mm_setr_ps(-1.5, -3.5, -7.5, -15.5);
        let r = _mm_floor_ss(a, b);
        // Lane 0 is floor(b[0]); the upper lanes are copied from `a`.
        let e = _mm_setr_ps(-2.0, 4.5, 8.5, 16.5);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_ceil_pd() {
        let a = _mm_setr_pd(1.5, 3.5);
        let r = _mm_ceil_pd(a);
        let e = _mm_setr_pd(2.0, 4.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_ceil_ps() {
        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
        let r = _mm_ceil_ps(a);
        let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_ceil_sd() {
        let a = _mm_setr_pd(1.5, 3.5);
        let b = _mm_setr_pd(-2.5, -4.5);
        let r = _mm_ceil_sd(a, b);
        let e = _mm_setr_pd(-2.0, 3.5);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_ceil_ss() {
        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
        let b = _mm_setr_ps(-2.5, -4.5, -8.5, -16.5);
        let r = _mm_ceil_ss(a, b);
        let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_round_pd() {
        let a = _mm_setr_pd(1.25, 3.75);
        let r = _mm_round_pd(a, _MM_FROUND_TO_NEAREST_INT);
        let e = _mm_setr_pd(1.0, 4.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_round_ps() {
        let a = _mm_setr_ps(2.25, 4.75, -1.75, -4.25);
        let r = _mm_round_ps(a, _MM_FROUND_TO_ZERO);
        let e = _mm_setr_ps(2.0, 4.0, -1.0, -4.0);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_round_sd() {
        let a = _mm_setr_pd(1.5, 3.5);
        let b = _mm_setr_pd(-2.5, -4.5);
        let old_mode = _MM_GET_ROUNDING_MODE();
        _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
        // _MM_FROUND_CUR_DIRECTION rounds according to MXCSR.RC, set here to
        // truncation, so b[0] = -2.5 rounds to -2.0; the upper lane comes
        // from `a`.
        let r = _mm_round_sd(a, b, _MM_FROUND_CUR_DIRECTION);
        _MM_SET_ROUNDING_MODE(old_mode);
        let e = _mm_setr_pd(-2.0, 3.5);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_round_ss() {
        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
        let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5);
        let old_mode = _MM_GET_ROUNDING_MODE();
        _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
        // Round-to-nearest-even sends b[0] = -1.75 to -2.0; the upper lanes
        // come from `a`.
        let r = _mm_round_ss(a, b, _MM_FROUND_CUR_DIRECTION);
        _MM_SET_ROUNDING_MODE(old_mode);
        let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5);
        assert_eq_m128(r, e);
    }
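
    // Supplementary sketch (hypothetical test, not from the upstream suite):
    // unlike the two tests above, which defer to MXCSR.RC via
    // _MM_FROUND_CUR_DIRECTION, an explicit _MM_FROUND_TO_* constant
    // overrides whatever rounding mode is currently in effect.
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_round_pd_explicit_modes() {
        let a = _mm_setr_pd(-2.5, 2.5);
        let r = _mm_round_pd(a, _MM_FROUND_TO_NEG_INF);
        assert_eq_m128d(r, _mm_setr_pd(-3.0, 2.0));
        let r = _mm_round_pd(a, _MM_FROUND_TO_POS_INF);
        assert_eq_m128d(r, _mm_setr_pd(-2.0, 3.0));
        let r = _mm_round_pd(a, _MM_FROUND_TO_ZERO);
        assert_eq_m128d(r, _mm_setr_pd(-2.0, 2.0));
    }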

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_minpos_epu16_1() {
        let a = _mm_setr_epi16(23, 18, 44, 97, 50, 13, 67, 66);
        let r = _mm_minpos_epu16(a);
        // The minimum value (13) lands in lane 0 and its index (5) in
        // lane 1; the remaining lanes are zeroed.
        let e = _mm_setr_epi16(13, 5, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_minpos_epu16_2() {
        let a = _mm_setr_epi16(0, 18, 44, 97, 50, 13, 67, 66);
        let r = _mm_minpos_epu16(a);
        let e = _mm_setr_epi16(0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_mul_epi32() {
        {
            let a = _mm_setr_epi32(1, 1, 1, 1);
            let b = _mm_setr_epi32(1, 2, 3, 4);
            let r = _mm_mul_epi32(a, b);
            // Only lanes 0 and 2 are multiplied, as widened 64-bit values.
            let e = _mm_setr_epi64x(1, 3);
            assert_eq_m128i(r, e);
        }
        {
            let a = _mm_setr_epi32(15, 2 /* ignored */, 1234567, 4 /* ignored */);
            let b = _mm_setr_epi32(
                -20, -256, /* ignored */
                666666, 666666, /* ignored */
            );
            let r = _mm_mul_epi32(a, b);
            let e = _mm_setr_epi64x(-300, 823043843622);
            assert_eq_m128i(r, e);
        }
    }
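
    // Supplementary sketch (hypothetical test, not from the upstream suite):
    // pmuldq sign-extends lanes 0 and 2 to 64 bits before multiplying, so
    // the packed products match plain i64 arithmetic.
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_mul_epi32_scalar_model() {
        assert_eq!(15i64 * -20, -300);
        assert_eq!(1_234_567i64 * 666_666, 823_043_843_622);
        let a = _mm_setr_epi32(15, 0, 1234567, 0);
        let b = _mm_setr_epi32(-20, 0, 666666, 0);
        let r = _mm_mul_epi32(a, b);
        assert_eq_m128i(r, _mm_setr_epi64x(15i64 * -20, 1234567i64 * 666666));
    }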

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_mullo_epi32() {
        {
            let a = _mm_setr_epi32(1, 1, 1, 1);
            let b = _mm_setr_epi32(1, 2, 3, 4);
            let r = _mm_mullo_epi32(a, b);
            let e = _mm_setr_epi32(1, 2, 3, 4);
            assert_eq_m128i(r, e);
        }
        {
            let a = _mm_setr_epi32(15, -2, 1234567, 99999);
            let b = _mm_setr_epi32(-20, -256, 666666, -99999);
            let r = _mm_mullo_epi32(a, b);
            // Note that the most significant bit of the truncated product in
            // lane 2 is set, so the lane reads back as a negative i32: the
            // low 32 bits of 1234567 * 666666 give -1589877210.
            let e = _mm_setr_epi32(-300, 512, -1589877210, -1409865409);
            assert_eq_m128i(r, e);
        }
    }
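
    // Supplementary sketch (hypothetical test, not from the upstream suite):
    // pmulld keeps only the low 32 bits of each 64-bit product, which is
    // exactly i32 wrapping multiplication and is where the negative lane
    // values noted above come from.
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_mullo_epi32_matches_wrapping_mul() {
        assert_eq!(1234567i32.wrapping_mul(666666), -1589877210);
        let a = _mm_setr_epi32(15, -2, 1234567, 99999);
        let b = _mm_setr_epi32(-20, -256, 666666, -99999);
        let r = _mm_mullo_epi32(a, b);
        let e = _mm_setr_epi32(
            15i32.wrapping_mul(-20),
            (-2i32).wrapping_mul(-256),
            1234567i32.wrapping_mul(666666),
            99999i32.wrapping_mul(-99999),
        );
        assert_eq_m128i(r, e);
    }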

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_minpos_epu16() {
        let a = _mm_setr_epi16(8, 7, 6, 5, 4, 1, 2, 3);
        let r = _mm_minpos_epu16(a);
        let e = _mm_setr_epi16(1, 5, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }
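
    // Supplementary sketch (hypothetical test, not from the upstream suite):
    // phminposuw is a horizontal fold to the minimum unsigned 16-bit lane,
    // returning the value in lane 0 and its (lowest) index in lane 1.
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_minpos_epu16_scalar_model() {
        let vals: [u16; 8] = [8, 7, 6, 5, 4, 1, 2, 3];
        let mut min = vals[0];
        let mut idx = 0;
        for (i, &v) in vals.iter().enumerate() {
            // Strict `<` keeps the first occurrence on ties.
            if v < min {
                min = v;
                idx = i;
            }
        }
        let a = _mm_setr_epi16(8, 7, 6, 5, 4, 1, 2, 3);
        let r = _mm_minpos_epu16(a);
        let e = _mm_setr_epi16(min as i16, idx as i16, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }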

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_mpsadbw_epu8() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );

        // Bit 2 of the immediate selects a 4-byte offset into `a` (0 or 4)
        // and bits 1:0 select the 4-byte block of `b` to compare against.
        let r = _mm_mpsadbw_epu8(a, a, 0b000);
        let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28);
        assert_eq_m128i(r, e);

        let r = _mm_mpsadbw_epu8(a, a, 0b001);
        let e = _mm_setr_epi16(16, 12, 8, 4, 0, 4, 8, 12);
        assert_eq_m128i(r, e);

        let r = _mm_mpsadbw_epu8(a, a, 0b100);
        let e = _mm_setr_epi16(16, 20, 24, 28, 32, 36, 40, 44);
        assert_eq_m128i(r, e);

        let r = _mm_mpsadbw_epu8(a, a, 0b101);
        let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28);
        assert_eq_m128i(r, e);

        let r = _mm_mpsadbw_epu8(a, a, 0b111);
        let e = _mm_setr_epi16(32, 28, 24, 20, 16, 12, 8, 4);
        assert_eq_m128i(r, e);
    }
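
    // Supplementary sketch (hypothetical test, not from the upstream suite):
    // a scalar model of mpsadbw for the 0b101 case above. Result lane j is
    // the sum of absolute differences between the sliding 4-byte window
    // a[a_off + j..a_off + j + 4] and the fixed block b[b_off..b_off + 4].
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_mpsadbw_epu8_scalar_model() {
        let bytes: [u8; 16] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
        let imm8 = 0b101usize;
        let a_off = ((imm8 >> 2) & 1) * 4; // bit 2: offset into `a`
        let b_off = (imm8 & 0b11) * 4; // bits 1:0: block of `b`
        let mut sums = [0i16; 8];
        for j in 0..8 {
            for i in 0..4 {
                let d = bytes[a_off + j + i] as i16 - bytes[b_off + i] as i16;
                sums[j] += d.abs();
            }
        }
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        let r = _mm_mpsadbw_epu8(a, a, 0b101);
        #[rustfmt::skip]
        let e = _mm_setr_epi16(
            sums[0], sums[1], sums[2], sums[3],
            sums[4], sums[5], sums[6], sums[7],
        );
        assert_eq_m128i(r, e);
    }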

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_testz_si128() {
        // testz returns 1 if `a & mask` is all zeros.
        let a = _mm_set1_epi8(1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_testz_si128(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_testz_si128(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b011);
        let mask = _mm_set1_epi8(0b100);
        let r = _mm_testz_si128(a, mask);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_testc_si128() {
        // testc returns 1 if `!a & mask` is all zeros, i.e. `a` covers every
        // bit set in `mask`.
        let a = _mm_set1_epi8(-1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_testc_si128(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_testc_si128(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b100);
        let r = _mm_testc_si128(a, mask);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_testnzc_si128() {
        // testnzc returns 1 if `a & mask` and `!a & mask` are both non-zero.
        let a = _mm_set1_epi8(0);
        let mask = _mm_set1_epi8(1);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(-1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b101);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 0);
    }
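
    // Supplementary sketch (hypothetical test, not from the upstream suite):
    // PTEST in scalar form. ZF (testz) is set when `a & mask` is all zero,
    // CF (testc) when `!a & mask` is all zero, and testnzc reports that both
    // flags are clear, i.e. the mask selects a mix of ones and zeros in `a`.
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_ptest_scalar_model() {
        // The vectors below broadcast one byte, so a per-byte model suffices.
        let (a, mask) = (0b101u8, 0b110u8);
        let zf = (a & mask) == 0;
        let cf = (!a & mask) == 0;
        let va = _mm_set1_epi8(0b101);
        let vmask = _mm_set1_epi8(0b110);
        assert_eq!(_mm_testz_si128(va, vmask), zf as i32);
        assert_eq!(_mm_testc_si128(va, vmask), cf as i32);
        assert_eq!(_mm_testnzc_si128(va, vmask), (!zf && !cf) as i32);
    }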

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_test_all_zeros() {
        // Identical semantics to _mm_testz_si128.
        let a = _mm_set1_epi8(1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_test_all_zeros(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_test_all_zeros(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b011);
        let mask = _mm_set1_epi8(0b100);
        let r = _mm_test_all_zeros(a, mask);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_test_all_ones() {
        // Returns 1 only when every bit of `a` is set.
        let a = _mm_set1_epi8(-1);
        let r = _mm_test_all_ones(a);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let r = _mm_test_all_ones(a);
        assert_eq!(r, 0);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_test_mix_ones_zeros() {
        // Identical semantics to _mm_testnzc_si128.
        let a = _mm_set1_epi8(0);
        let mask = _mm_set1_epi8(1);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(-1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b101);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 0);
    }
}