library/stdarch/crates/core_arch/src/x86/sse.rs

   1 //! Streaming SIMD Extensions (SSE)
   2
   3 use crate::{
   4     core_arch::{simd::*, simd_llvm::*, x86::*},
   5     intrinsics, mem, ptr,
   6 };
   7
   8 #[cfg(test)]
   9 use stdarch_test::assert_instr;
  10
  11 /// Adds the first component of `a` and `b`, the other components are copied
  12 /// from `a`.
  13 ///
  14 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ss)
  15 #[inline]
  16 #[target_feature(enable = "sse")]
  17 #[cfg_attr(test, assert_instr(addss))]
  18 #[stable(feature = "simd_x86", since = "1.27.0")]
  19 pub unsafe fn _mm_add_ss(a: __m128, b: __m128) -> __m128 {
  20     addss(a, b)
  21 }
  22
  23 /// Adds __m128 vectors.
  24 ///
  25 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps)
  26 #[inline]
  27 #[target_feature(enable = "sse")]
  28 #[cfg_attr(test, assert_instr(addps))]
  29 #[stable(feature = "simd_x86", since = "1.27.0")]
  30 pub unsafe fn _mm_add_ps(a: __m128, b: __m128) -> __m128 {
  31     simd_add(a, b)
  32 }
  33
  34 /// Subtracts the first component of `b` from `a`, the other components are
  35 /// copied from `a`.
  36 ///
  37 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss)
  38 #[inline]
  39 #[target_feature(enable = "sse")]
  40 #[cfg_attr(test, assert_instr(subss))]
  41 #[stable(feature = "simd_x86", since = "1.27.0")]
  42 pub unsafe fn _mm_sub_ss(a: __m128, b: __m128) -> __m128 {
  43     subss(a, b)
  44 }
  45
  46 /// Subtracts __m128 vectors.
  47 ///
  48 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ps)
  49 #[inline]
  50 #[target_feature(enable = "sse")]
  51 #[cfg_attr(test, assert_instr(subps))]
  52 #[stable(feature = "simd_x86", since = "1.27.0")]
  53 pub unsafe fn _mm_sub_ps(a: __m128, b: __m128) -> __m128 {
  54     simd_sub(a, b)
  55 }
  56
  57 /// Multiplies the first component of `a` and `b`, the other components are
  58 /// copied from `a`.
  59 ///
  60 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss)
  61 #[inline]
  62 #[target_feature(enable = "sse")]
  63 #[cfg_attr(test, assert_instr(mulss))]
  64 #[stable(feature = "simd_x86", since = "1.27.0")]
  65 pub unsafe fn _mm_mul_ss(a: __m128, b: __m128) -> __m128 {
  66     mulss(a, b)
  67 }
  68
  69 /// Multiplies __m128 vectors.
  70 ///
  71 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps)
  72 #[inline]
  73 #[target_feature(enable = "sse")]
  74 #[cfg_attr(test, assert_instr(mulps))]
  75 #[stable(feature = "simd_x86", since = "1.27.0")]
  76 pub unsafe fn _mm_mul_ps(a: __m128, b: __m128) -> __m128 {
  77     simd_mul(a, b)
  78 }
  79
  80 /// Divides the first component of `b` by `a`, the other components are
  81 /// copied from `a`.
  82 ///
  83 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ss)
  84 #[inline]
  85 #[target_feature(enable = "sse")]
  86 #[cfg_attr(test, assert_instr(divss))]
  87 #[stable(feature = "simd_x86", since = "1.27.0")]
  88 pub unsafe fn _mm_div_ss(a: __m128, b: __m128) -> __m128 {
  89     divss(a, b)
  90 }
  91
  92 /// Divides __m128 vectors.
  93 ///
  94 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps)
  95 #[inline]
  96 #[target_feature(enable = "sse")]
  97 #[cfg_attr(test, assert_instr(divps))]
  98 #[stable(feature = "simd_x86", since = "1.27.0")]
  99 pub unsafe fn _mm_div_ps(a: __m128, b: __m128) -> __m128 {
 100     simd_div(a, b)
 101 }
 102
 103 /// Returns the square root of the first single-precision (32-bit)
 104 /// floating-point element in `a`, the other elements are unchanged.
 105 ///
 106 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ss)
 107 #[inline]
 108 #[target_feature(enable = "sse")]
 109 #[cfg_attr(test, assert_instr(sqrtss))]
 110 #[stable(feature = "simd_x86", since = "1.27.0")]
 111 pub unsafe fn _mm_sqrt_ss(a: __m128) -> __m128 {
 112     sqrtss(a)
 113 }
 114
 115 /// Returns the square root of packed single-precision (32-bit) floating-point
 116 /// elements in `a`.
 117 ///
 118 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps)
 119 #[inline]
 120 #[target_feature(enable = "sse")]
 121 #[cfg_attr(test, assert_instr(sqrtps))]
 122 #[stable(feature = "simd_x86", since = "1.27.0")]
 123 pub unsafe fn _mm_sqrt_ps(a: __m128) -> __m128 {
 124     sqrtps(a)
 125 }
 126
 127 /// Returns the approximate reciprocal of the first single-precision
 128 /// (32-bit) floating-point element in `a`, the other elements are unchanged.
 129 ///
 130 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss)
 131 #[inline]
 132 #[target_feature(enable = "sse")]
 133 #[cfg_attr(test, assert_instr(rcpss))]
 134 #[stable(feature = "simd_x86", since = "1.27.0")]
 135 pub unsafe fn _mm_rcp_ss(a: __m128) -> __m128 {
 136     rcpss(a)
 137 }
 138
 139 /// Returns the approximate reciprocal of packed single-precision (32-bit)
 140 /// floating-point elements in `a`.
 141 ///
 142 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps)
 143 #[inline]
 144 #[target_feature(enable = "sse")]
 145 #[cfg_attr(test, assert_instr(rcpps))]
 146 #[stable(feature = "simd_x86", since = "1.27.0")]
 147 pub unsafe fn _mm_rcp_ps(a: __m128) -> __m128 {
 148     rcpps(a)
 149 }
 150
 151 /// Returns the approximate reciprocal square root of the first single-precision
 152 /// (32-bit) floating-point element in `a`, the other elements are unchanged.
 153 ///
 154 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss)
 155 #[inline]
 156 #[target_feature(enable = "sse")]
 157 #[cfg_attr(test, assert_instr(rsqrtss))]
 158 #[stable(feature = "simd_x86", since = "1.27.0")]
 159 pub unsafe fn _mm_rsqrt_ss(a: __m128) -> __m128 {
 160     rsqrtss(a)
 161 }
 162
 163 /// Returns the approximate reciprocal square root of packed single-precision
 164 /// (32-bit) floating-point elements in `a`.
 165 ///
 166 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps)
 167 #[inline]
 168 #[target_feature(enable = "sse")]
 169 #[cfg_attr(test, assert_instr(rsqrtps))]
 170 #[stable(feature = "simd_x86", since = "1.27.0")]
 171 pub unsafe fn _mm_rsqrt_ps(a: __m128) -> __m128 {
 172     rsqrtps(a)
 173 }
 174
 175 /// Compares the first single-precision (32-bit) floating-point element of `a`
 176 /// and `b`, and return the minimum value in the first element of the return
 177 /// value, the other elements are copied from `a`.
 178 ///
 179 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ss)
 180 #[inline]
 181 #[target_feature(enable = "sse")]
 182 #[cfg_attr(test, assert_instr(minss))]
 183 #[stable(feature = "simd_x86", since = "1.27.0")]
 184 pub unsafe fn _mm_min_ss(a: __m128, b: __m128) -> __m128 {
 185     minss(a, b)
 186 }
 187
 188 /// Compares packed single-precision (32-bit) floating-point elements in `a` and
 189 /// `b`, and return the corresponding minimum values.
 190 ///
 191 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps)
 192 #[inline]
 193 #[target_feature(enable = "sse")]
 194 #[cfg_attr(test, assert_instr(minps))]
 195 #[stable(feature = "simd_x86", since = "1.27.0")]
 196 pub unsafe fn _mm_min_ps(a: __m128, b: __m128) -> __m128 {
 197     // See the `test_mm_min_ps` test why this can't be implemented using `simd_fmin`.
 198     minps(a, b)
 199 }
 200
 201 /// Compares the first single-precision (32-bit) floating-point element of `a`
 202 /// and `b`, and return the maximum value in the first element of the return
 203 /// value, the other elements are copied from `a`.
 204 ///
 205 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ss)
 206 #[inline]
 207 #[target_feature(enable = "sse")]
 208 #[cfg_attr(test, assert_instr(maxss))]
 209 #[stable(feature = "simd_x86", since = "1.27.0")]
 210 pub unsafe fn _mm_max_ss(a: __m128, b: __m128) -> __m128 {
 211     maxss(a, b)
 212 }
 213
 214 /// Compares packed single-precision (32-bit) floating-point elements in `a` and
 215 /// `b`, and return the corresponding maximum values.
 216 ///
 217 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps)
 218 #[inline]
 219 #[target_feature(enable = "sse")]
 220 #[cfg_attr(test, assert_instr(maxps))]
 221 #[stable(feature = "simd_x86", since = "1.27.0")]
 222 pub unsafe fn _mm_max_ps(a: __m128, b: __m128) -> __m128 {
 223     // See the `test_mm_min_ps` test why this can't be implemented using `simd_fmax`.
 224     maxps(a, b)
 225 }
 226
 227 /// Bitwise AND of packed single-precision (32-bit) floating-point elements.
 228 ///
 229 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps)
 230 #[inline]
 231 #[target_feature(enable = "sse")]
 232 // i586 only seems to generate plain `and` instructions, so ignore it.
 233 #[cfg_attr(
 234     all(test, any(target_arch = "x86_64", target_feature = "sse2")),
 235     assert_instr(andps)
 236 )]
 237 #[stable(feature = "simd_x86", since = "1.27.0")]
 238 pub unsafe fn _mm_and_ps(a: __m128, b: __m128) -> __m128 {
 239     let a: __m128i = mem::transmute(a);
 240     let b: __m128i = mem::transmute(b);
 241     mem::transmute(simd_and(a, b))
 242 }
 243
 244 /// Bitwise AND-NOT of packed single-precision (32-bit) floating-point
 245 /// elements.
 246 ///
 247 /// Computes `!a & b` for each bit in `a` and `b`.
 248 ///
 249 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps)
 250 #[inline]
 251 #[target_feature(enable = "sse")]
 252 // i586 only seems to generate plain `not` and `and` instructions, so ignore
 253 // it.
 254 #[cfg_attr(
 255     all(test, any(target_arch = "x86_64", target_feature = "sse2")),
 256     assert_instr(andnps)
 257 )]
 258 #[stable(feature = "simd_x86", since = "1.27.0")]
 259 pub unsafe fn _mm_andnot_ps(a: __m128, b: __m128) -> __m128 {
 260     let a: __m128i = mem::transmute(a);
 261     let b: __m128i = mem::transmute(b);
 262     let mask: __m128i = mem::transmute(i32x4::splat(-1));
 263     mem::transmute(simd_and(simd_xor(mask, a), b))
 264 }
 265
 266 /// Bitwise OR of packed single-precision (32-bit) floating-point elements.
 267 ///
 268 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_ps)
 269 #[inline]
 270 #[target_feature(enable = "sse")]
 271 // i586 only seems to generate plain `or` instructions, so we ignore it.
 272 #[cfg_attr(
 273     all(test, any(target_arch = "x86_64", target_feature = "sse2")),
 274     assert_instr(orps)
 275 )]
 276 #[stable(feature = "simd_x86", since = "1.27.0")]
 277 pub unsafe fn _mm_or_ps(a: __m128, b: __m128) -> __m128 {
 278     let a: __m128i = mem::transmute(a);
 279     let b: __m128i = mem::transmute(b);
 280     mem::transmute(simd_or(a, b))
 281 }
 282
 283 /// Bitwise exclusive OR of packed single-precision (32-bit) floating-point
 284 /// elements.
 285 ///
 286 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_ps)
 287 #[inline]
 288 #[target_feature(enable = "sse")]
 289 // i586 only seems to generate plain `xor` instructions, so we ignore it.
 290 #[cfg_attr(
 291     all(test, any(target_arch = "x86_64", target_feature = "sse2")),
 292     assert_instr(xorps)
 293 )]
 294 #[stable(feature = "simd_x86", since = "1.27.0")]
 295 pub unsafe fn _mm_xor_ps(a: __m128, b: __m128) -> __m128 {
 296     let a: __m128i = mem::transmute(a);
 297     let b: __m128i = mem::transmute(b);
 298     mem::transmute(simd_xor(a, b))
 299 }
 300
 301 /// Compares the lowest `f32` of both inputs for equality. The lowest 32 bits of
 302 /// the result will be `0xffffffff` if the two inputs are equal, or `0`
 303 /// otherwise. The upper 96 bits of the result are the upper 96 bits of `a`.
 304 ///
 305 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ss)
 306 #[inline]
 307 #[target_feature(enable = "sse")]
 308 #[cfg_attr(test, assert_instr(cmpeqss))]
 309 #[stable(feature = "simd_x86", since = "1.27.0")]
 310 pub unsafe fn _mm_cmpeq_ss(a: __m128, b: __m128) -> __m128 {
 311     cmpss(a, b, 0)
 312 }
 313
 314 /// Compares the lowest `f32` of both inputs for less than. The lowest 32 bits
 315 /// of the result will be `0xffffffff` if `a.extract(0)` is less than
 316 /// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
 317 /// upper 96 bits of `a`.
 318 ///
 319 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ss)
 320 #[inline]
 321 #[target_feature(enable = "sse")]
 322 #[cfg_attr(test, assert_instr(cmpltss))]
 323 #[stable(feature = "simd_x86", since = "1.27.0")]
 324 pub unsafe fn _mm_cmplt_ss(a: __m128, b: __m128) -> __m128 {
 325     cmpss(a, b, 1)
 326 }
 327
 328 /// Compares the lowest `f32` of both inputs for less than or equal. The lowest
 329 /// 32 bits of the result will be `0xffffffff` if `a.extract(0)` is less than
 330 /// or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result
 331 /// are the upper 96 bits of `a`.
 332 ///
 333 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ss)
 334 #[inline]
 335 #[target_feature(enable = "sse")]
 336 #[cfg_attr(test, assert_instr(cmpless))]
 337 #[stable(feature = "simd_x86", since = "1.27.0")]
 338 pub unsafe fn _mm_cmple_ss(a: __m128, b: __m128) -> __m128 {
 339     cmpss(a, b, 2)
 340 }
 341
 342 /// Compares the lowest `f32` of both inputs for greater than. The lowest 32
 343 /// bits of the result will be `0xffffffff` if `a.extract(0)` is greater
 344 /// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result
 345 /// are the upper 96 bits of `a`.
 346 ///
 347 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ss)
 348 #[inline]
 349 #[target_feature(enable = "sse")]
 350 #[cfg_attr(test, assert_instr(cmpltss))]
 351 #[stable(feature = "simd_x86", since = "1.27.0")]
 352 pub unsafe fn _mm_cmpgt_ss(a: __m128, b: __m128) -> __m128 {
 353     simd_shuffle!(a, cmpss(b, a, 1), [4, 1, 2, 3])
 354 }
 355
 356 /// Compares the lowest `f32` of both inputs for greater than or equal. The
 357 /// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is
 358 /// greater than or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits
 359 /// of the result are the upper 96 bits of `a`.
 360 ///
 361 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ss)
 362 #[inline]
 363 #[target_feature(enable = "sse")]
 364 #[cfg_attr(test, assert_instr(cmpless))]
 365 #[stable(feature = "simd_x86", since = "1.27.0")]
 366 pub unsafe fn _mm_cmpge_ss(a: __m128, b: __m128) -> __m128 {
 367     simd_shuffle!(a, cmpss(b, a, 2), [4, 1, 2, 3])
 368 }
 369
 370 /// Compares the lowest `f32` of both inputs for inequality. The lowest 32 bits
 371 /// of the result will be `0xffffffff` if `a.extract(0)` is not equal to
 372 /// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
 373 /// upper 96 bits of `a`.
 374 ///
 375 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ss)
 376 #[inline]
 377 #[target_feature(enable = "sse")]
 378 #[cfg_attr(test, assert_instr(cmpneqss))]
 379 #[stable(feature = "simd_x86", since = "1.27.0")]
 380 pub unsafe fn _mm_cmpneq_ss(a: __m128, b: __m128) -> __m128 {
 381     cmpss(a, b, 4)
 382 }
 383
 384 /// Compares the lowest `f32` of both inputs for not-less-than. The lowest 32
 385 /// bits of the result will be `0xffffffff` if `a.extract(0)` is not less than
 386 /// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
 387 /// upper 96 bits of `a`.
 388 ///
 389 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ss)
 390 #[inline]
 391 #[target_feature(enable = "sse")]
 392 #[cfg_attr(test, assert_instr(cmpnltss))]
 393 #[stable(feature = "simd_x86", since = "1.27.0")]
 394 pub unsafe fn _mm_cmpnlt_ss(a: __m128, b: __m128) -> __m128 {
 395     cmpss(a, b, 5)
 396 }
 397
 398 /// Compares the lowest `f32` of both inputs for not-less-than-or-equal. The
 399 /// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not
 400 /// less than or equal to `b.extract(0)`, or `0` otherwise. The upper 96 bits
 401 /// of the result are the upper 96 bits of `a`.
 402 ///
 403 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ss)
 404 #[inline]
 405 #[target_feature(enable = "sse")]
 406 #[cfg_attr(test, assert_instr(cmpnless))]
 407 #[stable(feature = "simd_x86", since = "1.27.0")]
 408 pub unsafe fn _mm_cmpnle_ss(a: __m128, b: __m128) -> __m128 {
 409     cmpss(a, b, 6)
 410 }
 411
 412 /// Compares the lowest `f32` of both inputs for not-greater-than. The lowest 32
 413 /// bits of the result will be `0xffffffff` if `a.extract(0)` is not greater
 414 /// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are
 415 /// the upper 96 bits of `a`.
 416 ///
 417 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ss)
 418 #[inline]
 419 #[target_feature(enable = "sse")]
 420 #[cfg_attr(test, assert_instr(cmpnltss))]
 421 #[stable(feature = "simd_x86", since = "1.27.0")]
 422 pub unsafe fn _mm_cmpngt_ss(a: __m128, b: __m128) -> __m128 {
 423     simd_shuffle!(a, cmpss(b, a, 5), [4, 1, 2, 3])
 424 }
 425
 426 /// Compares the lowest `f32` of both inputs for not-greater-than-or-equal. The
 427 /// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not
 428 /// greater than or equal to `b.extract(0)`, or `0` otherwise. The upper 96
 429 /// bits of the result are the upper 96 bits of `a`.
 430 ///
 431 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ss)
 432 #[inline]
 433 #[target_feature(enable = "sse")]
 434 #[cfg_attr(test, assert_instr(cmpnless))]
 435 #[stable(feature = "simd_x86", since = "1.27.0")]
 436 pub unsafe fn _mm_cmpnge_ss(a: __m128, b: __m128) -> __m128 {
 437     simd_shuffle!(a, cmpss(b, a, 6), [4, 1, 2, 3])
 438 }
 439
 440 /// Checks if the lowest `f32` of both inputs are ordered. The lowest 32 bits of
 441 /// the result will be `0xffffffff` if neither of `a.extract(0)` or
 442 /// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result
 443 /// are the upper 96 bits of `a`.
 444 ///
 445 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ss)
 446 #[inline]
 447 #[target_feature(enable = "sse")]
 448 #[cfg_attr(test, assert_instr(cmpordss))]
 449 #[stable(feature = "simd_x86", since = "1.27.0")]
 450 pub unsafe fn _mm_cmpord_ss(a: __m128, b: __m128) -> __m128 {
 451     cmpss(a, b, 7)
 452 }
 453
 454 /// Checks if the lowest `f32` of both inputs are unordered. The lowest 32 bits
 455 /// of the result will be `0xffffffff` if any of `a.extract(0)` or
 456 /// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result
 457 /// are the upper 96 bits of `a`.
 458 ///
 459 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ss)
 460 #[inline]
 461 #[target_feature(enable = "sse")]
 462 #[cfg_attr(test, assert_instr(cmpunordss))]
 463 #[stable(feature = "simd_x86", since = "1.27.0")]
 464 pub unsafe fn _mm_cmpunord_ss(a: __m128, b: __m128) -> __m128 {
 465     cmpss(a, b, 3)
 466 }
 467
 468 /// Compares each of the four floats in `a` to the corresponding element in `b`.
 469 /// The result in the output vector will be `0xffffffff` if the input elements
 470 /// were equal, or `0` otherwise.
 471 ///
 472 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ps)
 473 #[inline]
 474 #[target_feature(enable = "sse")]
 475 #[cfg_attr(test, assert_instr(cmpeqps))]
 476 #[stable(feature = "simd_x86", since = "1.27.0")]
 477 pub unsafe fn _mm_cmpeq_ps(a: __m128, b: __m128) -> __m128 {
 478     cmpps(a, b, 0)
 479 }
 480
 481 /// Compares each of the four floats in `a` to the corresponding element in `b`.
 482 /// The result in the output vector will be `0xffffffff` if the input element
 483 /// in `a` is less than the corresponding element in `b`, or `0` otherwise.
 484 ///
 485 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ps)
 486 #[inline]
 487 #[target_feature(enable = "sse")]
 488 #[cfg_attr(test, assert_instr(cmpltps))]
 489 #[stable(feature = "simd_x86", since = "1.27.0")]
 490 pub unsafe fn _mm_cmplt_ps(a: __m128, b: __m128) -> __m128 {
 491     cmpps(a, b, 1)
 492 }
 493
 494 /// Compares each of the four floats in `a` to the corresponding element in `b`.
 495 /// The result in the output vector will be `0xffffffff` if the input element
 496 /// in `a` is less than or equal to the corresponding element in `b`, or `0`
 497 /// otherwise.
 498 ///
 499 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ps)
 500 #[inline]
 501 #[target_feature(enable = "sse")]
 502 #[cfg_attr(test, assert_instr(cmpleps))]
 503 #[stable(feature = "simd_x86", since = "1.27.0")]
 504 pub unsafe fn _mm_cmple_ps(a: __m128, b: __m128) -> __m128 {
 505     cmpps(a, b, 2)
 506 }
 507
 508 /// Compares each of the four floats in `a` to the corresponding element in `b`.
 509 /// The result in the output vector will be `0xffffffff` if the input element
 510 /// in `a` is greater than the corresponding element in `b`, or `0` otherwise.
 511 ///
 512 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ps)
 513 #[inline]
 514 #[target_feature(enable = "sse")]
 515 #[cfg_attr(test, assert_instr(cmpltps))]
 516 #[stable(feature = "simd_x86", since = "1.27.0")]
 517 pub unsafe fn _mm_cmpgt_ps(a: __m128, b: __m128) -> __m128 {
 518     cmpps(b, a, 1)
 519 }
 520
 521 /// Compares each of the four floats in `a` to the corresponding element in `b`.
 522 /// The result in the output vector will be `0xffffffff` if the input element
 523 /// in `a` is greater than or equal to the corresponding element in `b`, or `0`
 524 /// otherwise.
 525 ///
 526 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ps)
 527 #[inline]
 528 #[target_feature(enable = "sse")]
 529 #[cfg_attr(test, assert_instr(cmpleps))]
 530 #[stable(feature = "simd_x86", since = "1.27.0")]
 531 pub unsafe fn _mm_cmpge_ps(a: __m128, b: __m128) -> __m128 {
 532     cmpps(b, a, 2)
 533 }
 534
 535 /// Compares each of the four floats in `a` to the corresponding element in `b`.
 536 /// The result in the output vector will be `0xffffffff` if the input elements
 537 /// are **not** equal, or `0` otherwise.
 538 ///
 539 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ps)
 540 #[inline]
 541 #[target_feature(enable = "sse")]
 542 #[cfg_attr(test, assert_instr(cmpneqps))]
 543 #[stable(feature = "simd_x86", since = "1.27.0")]
 544 pub unsafe fn _mm_cmpneq_ps(a: __m128, b: __m128) -> __m128 {
 545     cmpps(a, b, 4)
 546 }
 547
 548 /// Compares each of the four floats in `a` to the corresponding element in `b`.
 549 /// The result in the output vector will be `0xffffffff` if the input element
 550 /// in `a` is **not** less than the corresponding element in `b`, or `0`
 551 /// otherwise.
 552 ///
 553 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ps)
 554 #[inline]
 555 #[target_feature(enable = "sse")]
 556 #[cfg_attr(test, assert_instr(cmpnltps))]
 557 #[stable(feature = "simd_x86", since = "1.27.0")]
 558 pub unsafe fn _mm_cmpnlt_ps(a: __m128, b: __m128) -> __m128 {
 559     cmpps(a, b, 5)
 560 }
 561
 562 /// Compares each of the four floats in `a` to the corresponding element in `b`.
 563 /// The result in the output vector will be `0xffffffff` if the input element
 564 /// in `a` is **not** less than or equal to the corresponding element in `b`, or
 565 /// `0` otherwise.
 566 ///
 567 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ps)
 568 #[inline]
 569 #[target_feature(enable = "sse")]
 570 #[cfg_attr(test, assert_instr(cmpnleps))]
 571 #[stable(feature = "simd_x86", since = "1.27.0")]
 572 pub unsafe fn _mm_cmpnle_ps(a: __m128, b: __m128) -> __m128 {
 573     cmpps(a, b, 6)
 574 }
 575
 576 /// Compares each of the four floats in `a` to the corresponding element in `b`.
 577 /// The result in the output vector will be `0xffffffff` if the input element
 578 /// in `a` is **not** greater than the corresponding element in `b`, or `0`
 579 /// otherwise.
 580 ///
 581 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ps)
 582 #[inline]
 583 #[target_feature(enable = "sse")]
 584 #[cfg_attr(test, assert_instr(cmpnltps))]
 585 #[stable(feature = "simd_x86", since = "1.27.0")]
 586 pub unsafe fn _mm_cmpngt_ps(a: __m128, b: __m128) -> __m128 {
 587     cmpps(b, a, 5)
 588 }
 589
 590 /// Compares each of the four floats in `a` to the corresponding element in `b`.
 591 /// The result in the output vector will be `0xffffffff` if the input element
 592 /// in `a` is **not** greater than or equal to the corresponding element in `b`,
 593 /// or `0` otherwise.
 594 ///
 595 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ps)
 596 #[inline]
 597 #[target_feature(enable = "sse")]
 598 #[cfg_attr(test, assert_instr(cmpnleps))]
 599 #[stable(feature = "simd_x86", since = "1.27.0")]
 600 pub unsafe fn _mm_cmpnge_ps(a: __m128, b: __m128) -> __m128 {
 601     cmpps(b, a, 6)
 602 }
 603
 604 /// Compares each of the four floats in `a` to the corresponding element in `b`.
 605 /// Returns four floats that have one of two possible bit patterns. The element
 606 /// in the output vector will be `0xffffffff` if the input elements in `a` and
 607 /// `b` are ordered (i.e., neither of them is a NaN), or 0 otherwise.
 608 ///
 609 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ps)
 610 #[inline]
 611 #[target_feature(enable = "sse")]
 612 #[cfg_attr(test, assert_instr(cmpordps))]
 613 #[stable(feature = "simd_x86", since = "1.27.0")]
 614 pub unsafe fn _mm_cmpord_ps(a: __m128, b: __m128) -> __m128 {
 615     cmpps(b, a, 7)
 616 }
 617
 618 /// Compares each of the four floats in `a` to the corresponding element in `b`.
 619 /// Returns four floats that have one of two possible bit patterns. The element
 620 /// in the output vector will be `0xffffffff` if the input elements in `a` and
 621 /// `b` are unordered (i.e., at least on of them is a NaN), or 0 otherwise.
 622 ///
 623 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ps)
 624 #[inline]
 625 #[target_feature(enable = "sse")]
 626 #[cfg_attr(test, assert_instr(cmpunordps))]
 627 #[stable(feature = "simd_x86", since = "1.27.0")]
 628 pub unsafe fn _mm_cmpunord_ps(a: __m128, b: __m128) -> __m128 {
 629     cmpps(b, a, 3)
 630 }
 631
 632 /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
 633 /// `1` if they are equal, or `0` otherwise.
 634 ///
 635 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_ss)
 636 #[inline]
 637 #[target_feature(enable = "sse")]
 638 #[cfg_attr(test, assert_instr(comiss))]
 639 #[stable(feature = "simd_x86", since = "1.27.0")]
 640 pub unsafe fn _mm_comieq_ss(a: __m128, b: __m128) -> i32 {
 641     comieq_ss(a, b)
 642 }
 643
 644 /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
 645 /// `1` if the value from `a` is less than the one from `b`, or `0` otherwise.
 646 ///
 647 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_ss)
 648 #[inline]
 649 #[target_feature(enable = "sse")]
 650 #[cfg_attr(test, assert_instr(comiss))]
 651 #[stable(feature = "simd_x86", since = "1.27.0")]
 652 pub unsafe fn _mm_comilt_ss(a: __m128, b: __m128) -> i32 {
 653     comilt_ss(a, b)
 654 }
 655
 656 /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
 657 /// `1` if the value from `a` is less than or equal to the one from `b`, or `0`
 658 /// otherwise.
 659 ///
 660 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_ss)
 661 #[inline]
 662 #[target_feature(enable = "sse")]
 663 #[cfg_attr(test, assert_instr(comiss))]
 664 #[stable(feature = "simd_x86", since = "1.27.0")]
 665 pub unsafe fn _mm_comile_ss(a: __m128, b: __m128) -> i32 {
 666     comile_ss(a, b)
 667 }
 668
 669 /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
 670 /// `1` if the value from `a` is greater than the one from `b`, or `0`
 671 /// otherwise.
 672 ///
 673 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_ss)
 674 #[inline]
 675 #[target_feature(enable = "sse")]
 676 #[cfg_attr(test, assert_instr(comiss))]
 677 #[stable(feature = "simd_x86", since = "1.27.0")]
 678 pub unsafe fn _mm_comigt_ss(a: __m128, b: __m128) -> i32 {
 679     comigt_ss(a, b)
 680 }
 681
 682 /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
 683 /// `1` if the value from `a` is greater than or equal to the one from `b`, or
 684 /// `0` otherwise.
 685 ///
 686 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_ss)
 687 #[inline]
 688 #[target_feature(enable = "sse")]
 689 #[cfg_attr(test, assert_instr(comiss))]
 690 #[stable(feature = "simd_x86", since = "1.27.0")]
 691 pub unsafe fn _mm_comige_ss(a: __m128, b: __m128) -> i32 {
 692     comige_ss(a, b)
 693 }
 694
 695 /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
 696 /// `1` if they are **not** equal, or `0` otherwise.
 697 ///
 698 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_ss)
 699 #[inline]
 700 #[target_feature(enable = "sse")]
 701 #[cfg_attr(test, assert_instr(comiss))]
 702 #[stable(feature = "simd_x86", since = "1.27.0")]
 703 pub unsafe fn _mm_comineq_ss(a: __m128, b: __m128) -> i32 {
 704     comineq_ss(a, b)
 705 }
 706
 707 /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
 708 /// `1` if they are equal, or `0` otherwise. This instruction will not signal
 709 /// an exception if either argument is a quiet NaN.
 710 ///
 711 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomieq_ss)
 712 #[inline]
 713 #[target_feature(enable = "sse")]
 714 #[cfg_attr(test, assert_instr(ucomiss))]
 715 #[stable(feature = "simd_x86", since = "1.27.0")]
 716 pub unsafe fn _mm_ucomieq_ss(a: __m128, b: __m128) -> i32 {
 717     ucomieq_ss(a, b)
 718 }
 719
 720 /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
 721 /// `1` if the value from `a` is less than the one from `b`, or `0` otherwise.
 722 /// This instruction will not signal an exception if either argument is a quiet
 723 /// NaN.
 724 ///
 725 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomilt_ss)
 726 #[inline]
 727 #[target_feature(enable = "sse")]
 728 #[cfg_attr(test, assert_instr(ucomiss))]
 729 #[stable(feature = "simd_x86", since = "1.27.0")]
 730 pub unsafe fn _mm_ucomilt_ss(a: __m128, b: __m128) -> i32 {
 731     ucomilt_ss(a, b)
 732 }
 733
 734 /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
 735 /// `1` if the value from `a` is less than or equal to the one from `b`, or `0`
 736 /// otherwise. This instruction will not signal an exception if either argument
 737 /// is a quiet NaN.
 738 ///
 739 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomile_ss)
 740 #[inline]
 741 #[target_feature(enable = "sse")]
 742 #[cfg_attr(test, assert_instr(ucomiss))]
 743 #[stable(feature = "simd_x86", since = "1.27.0")]
 744 pub unsafe fn _mm_ucomile_ss(a: __m128, b: __m128) -> i32 {
 745     ucomile_ss(a, b)
 746 }
 747
 748 /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
 749 /// `1` if the value from `a` is greater than the one from `b`, or `0`
 750 /// otherwise. This instruction will not signal an exception if either argument
 751 /// is a quiet NaN.
 752 ///
 753 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomigt_ss)
 754 #[inline]
 755 #[target_feature(enable = "sse")]
 756 #[cfg_attr(test, assert_instr(ucomiss))]
 757 #[stable(feature = "simd_x86", since = "1.27.0")]
 758 pub unsafe fn _mm_ucomigt_ss(a: __m128, b: __m128) -> i32 {
 759     ucomigt_ss(a, b)
 760 }
 761
 762 /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
 763 /// `1` if the value from `a` is greater than or equal to the one from `b`, or
 764 /// `0` otherwise. This instruction will not signal an exception if either
 765 /// argument is a quiet NaN.
 766 ///
 767 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomige_ss)
 768 #[inline]
 769 #[target_feature(enable = "sse")]
 770 #[cfg_attr(test, assert_instr(ucomiss))]
 771 #[stable(feature = "simd_x86", since = "1.27.0")]
 772 pub unsafe fn _mm_ucomige_ss(a: __m128, b: __m128) -> i32 {
 773     ucomige_ss(a, b)
 774 }
 775
 776 /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
 777 /// `1` if they are **not** equal, or `0` otherwise. This instruction will not
 778 /// signal an exception if either argument is a quiet NaN.
 779 ///
 780 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomineq_ss)
 781 #[inline]
 782 #[target_feature(enable = "sse")]
 783 #[cfg_attr(test, assert_instr(ucomiss))]
 784 #[stable(feature = "simd_x86", since = "1.27.0")]
 785 pub unsafe fn _mm_ucomineq_ss(a: __m128, b: __m128) -> i32 {
 786     ucomineq_ss(a, b)
 787 }
 788
 789 /// Converts the lowest 32 bit float in the input vector to a 32 bit integer.
 790 ///
 791 /// The result is rounded according to the current rounding mode. If the result
 792 /// cannot be represented as a 32 bit integer the result will be `0x8000_0000`
 793 /// (`i32::MIN`) or an invalid operation floating point exception if
 794 /// unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)).
 795 ///
 796 /// This corresponds to the `CVTSS2SI` instruction (with 32 bit output).
 797 ///
 798 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si32)
 799 #[inline]
 800 #[target_feature(enable = "sse")]
 801 #[cfg_attr(test, assert_instr(cvtss2si))]
 802 #[stable(feature = "simd_x86", since = "1.27.0")]
 803 pub unsafe fn _mm_cvtss_si32(a: __m128) -> i32 {
 804     cvtss2si(a)
 805 }
 806
 807 /// Alias for [`_mm_cvtss_si32`](fn._mm_cvtss_si32.html).
 808 ///
 809 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si)
 810 #[inline]
 811 #[target_feature(enable = "sse")]
 812 #[cfg_attr(test, assert_instr(cvtss2si))]
 813 #[stable(feature = "simd_x86", since = "1.27.0")]
 814 pub unsafe fn _mm_cvt_ss2si(a: __m128) -> i32 {
 815     _mm_cvtss_si32(a)
 816 }
 817
 818 /// Converts the lowest 32 bit float in the input vector to a 32 bit integer
 819 /// with
 820 /// truncation.
 821 ///
 822 /// The result is rounded always using truncation (round towards zero). If the
 823 /// result cannot be represented as a 32 bit integer the result will be
 824 /// `0x8000_0000` (`i32::MIN`) or an invalid operation floating point
 825 /// exception if unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)).
 826 ///
 827 /// This corresponds to the `CVTTSS2SI` instruction (with 32 bit output).
 828 ///
 829 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si32)
 830 #[inline]
 831 #[target_feature(enable = "sse")]
 832 #[cfg_attr(test, assert_instr(cvttss2si))]
 833 #[stable(feature = "simd_x86", since = "1.27.0")]
 834 pub unsafe fn _mm_cvttss_si32(a: __m128) -> i32 {
 835     cvttss2si(a)
 836 }
 837
 838 /// Alias for [`_mm_cvttss_si32`](fn._mm_cvttss_si32.html).
 839 ///
 840 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si)
 841 #[inline]
 842 #[target_feature(enable = "sse")]
 843 #[cfg_attr(test, assert_instr(cvttss2si))]
 844 #[stable(feature = "simd_x86", since = "1.27.0")]
 845 pub unsafe fn _mm_cvtt_ss2si(a: __m128) -> i32 {
 846     _mm_cvttss_si32(a)
 847 }
 848
 849 /// Extracts the lowest 32 bit float from the input vector.
 850 ///
 851 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32)
 852 #[inline]
 853 #[target_feature(enable = "sse")]
 854 // No point in using assert_instrs. In Unix x86_64 calling convention this is a
 855 // no-op, and on Windows it's just a `mov`.
 856 #[stable(feature = "simd_x86", since = "1.27.0")]
 857 pub unsafe fn _mm_cvtss_f32(a: __m128) -> f32 {
 858     simd_extract(a, 0)
 859 }
 860
 861 /// Converts a 32 bit integer to a 32 bit float. The result vector is the input
 862 /// vector `a` with the lowest 32 bit float replaced by the converted integer.
 863 ///
 864 /// This intrinsic corresponds to the `CVTSI2SS` instruction (with 32 bit
 865 /// input).
 866 ///
 867 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss)
 868 #[inline]
 869 #[target_feature(enable = "sse")]
 870 #[cfg_attr(test, assert_instr(cvtsi2ss))]
 871 #[stable(feature = "simd_x86", since = "1.27.0")]
 872 pub unsafe fn _mm_cvtsi32_ss(a: __m128, b: i32) -> __m128 {
 873     cvtsi2ss(a, b)
 874 }
 875
 876 /// Alias for [`_mm_cvtsi32_ss`](fn._mm_cvtsi32_ss.html).
 877 ///
 878 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss)
 879 #[inline]
 880 #[target_feature(enable = "sse")]
 881 #[cfg_attr(test, assert_instr(cvtsi2ss))]
 882 #[stable(feature = "simd_x86", since = "1.27.0")]
 883 pub unsafe fn _mm_cvt_si2ss(a: __m128, b: i32) -> __m128 {
 884     _mm_cvtsi32_ss(a, b)
 885 }
 886
 887 /// Construct a `__m128` with the lowest element set to `a` and the rest set to
 888 /// zero.
 889 ///
 890 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss)
 891 #[inline]
 892 #[target_feature(enable = "sse")]
 893 #[cfg_attr(test, assert_instr(movss))]
 894 #[stable(feature = "simd_x86", since = "1.27.0")]
 895 pub unsafe fn _mm_set_ss(a: f32) -> __m128 {
 896     __m128(a, 0.0, 0.0, 0.0)
 897 }
 898
 899 /// Construct a `__m128` with all element set to `a`.
 900 ///
 901 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_ps)
 902 #[inline]
 903 #[target_feature(enable = "sse")]
 904 #[cfg_attr(test, assert_instr(shufps))]
 905 #[stable(feature = "simd_x86", since = "1.27.0")]
 906 pub unsafe fn _mm_set1_ps(a: f32) -> __m128 {
 907     __m128(a, a, a, a)
 908 }
 909
 910 /// Alias for [`_mm_set1_ps`](fn._mm_set1_ps.html)
 911 ///
 912 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps1)
 913 #[inline]
 914 #[target_feature(enable = "sse")]
 915 #[cfg_attr(test, assert_instr(shufps))]
 916 #[stable(feature = "simd_x86", since = "1.27.0")]
 917 pub unsafe fn _mm_set_ps1(a: f32) -> __m128 {
 918     _mm_set1_ps(a)
 919 }
 920
 921 /// Construct a `__m128` from four floating point values highest to lowest.
 922 ///
 923 /// Note that `a` will be the highest 32 bits of the result, and `d` the
 924 /// lowest. This matches the standard way of writing bit patterns on x86:
 925 ///
 926 /// ```text
 927 ///  bit    127 .. 96  95 .. 64  63 .. 32  31 .. 0
 928 ///        +---------+---------+---------+---------+
 929 ///        |    a    |    b    |    c    |    d    |   result
 930 ///        +---------+---------+---------+---------+
 931 /// ```
 932 ///
 933 /// Alternatively:
 934 ///
 935 /// ```text
 936 /// let v = _mm_set_ps(d, c, b, a);
 937 /// ```
 938 ///
 939 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps)
 940 #[inline]
 941 #[target_feature(enable = "sse")]
 942 #[cfg_attr(test, assert_instr(unpcklps))]
 943 #[stable(feature = "simd_x86", since = "1.27.0")]
 944 pub unsafe fn _mm_set_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 {
 945     __m128(d, c, b, a)
 946 }
 947
 948 /// Construct a `__m128` from four floating point values lowest to highest.
 949 ///
 950 /// This matches the memory order of `__m128`, i.e., `a` will be the lowest 32
 951 /// bits of the result, and `d` the highest.
 952 ///
 953 /// ```text
 954 /// assert_eq!(__m128::new(a, b, c, d), _mm_setr_ps(a, b, c, d));
 955 /// ```
 956 ///
 957 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_ps)
 958 #[inline]
 959 #[target_feature(enable = "sse")]
 960 #[cfg_attr(
 961     all(test, any(target_os = "windows", target_arch = "x86_64")),
 962     assert_instr(unpcklps)
 963 )]
 964 // On a 32-bit architecture on non-Windows it just copies the operands from the stack.
 965 #[cfg_attr(
 966     all(test, all(not(target_os = "windows"), target_arch = "x86")),
 967     assert_instr(movaps)
 968 )]
 969 #[stable(feature = "simd_x86", since = "1.27.0")]
 970 pub unsafe fn _mm_setr_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 {
 971     __m128(a, b, c, d)
 972 }
 973
 974 /// Construct a `__m128` with all elements initialized to zero.
 975 ///
 976 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps)
 977 #[inline]
 978 #[target_feature(enable = "sse")]
 979 #[cfg_attr(test, assert_instr(xorps))]
 980 #[stable(feature = "simd_x86", since = "1.27.0")]
 981 pub unsafe fn _mm_setzero_ps() -> __m128 {
 982     __m128(0.0, 0.0, 0.0, 0.0)
 983 }
 984
 985 /// A utility function for creating masks to use with Intel shuffle and
 986 /// permute intrinsics.
 987 #[inline]
 988 #[allow(non_snake_case)]
 989 #[unstable(feature = "stdarch", issue = "27731")]
 990 pub const fn _MM_SHUFFLE(z: u32, y: u32, x: u32, w: u32) -> i32 {
 991     ((z << 6) | (y << 4) | (x << 2) | w) as i32
 992 }
 993
 994 /// Shuffles packed single-precision (32-bit) floating-point elements in `a` and
 995 /// `b` using `MASK`.
 996 ///
 997 /// The lower half of result takes values from `a` and the higher half from
 998 /// `b`. Mask is split to 2 control bits each to index the element from inputs.
 999 ///
1000 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps)
1001 ///
1002 /// Note that there appears to be a mistake within Intel's Intrinsics Guide.
1003 /// `_mm_shuffle_ps` is supposed to take an `i32` instead of a `u32`
1004 /// as is the case for [other shuffle intrinsics](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_).
1005 /// Performing an implicit type conversion between an unsigned integer and a signed integer
1006 /// does not cause a problem in C, however Rust's commitment to strong typing does not allow this.
1007 #[inline]
1008 #[target_feature(enable = "sse")]
1009 #[cfg_attr(test, assert_instr(shufps, MASK = 3))]
1010 #[rustc_legacy_const_generics(2)]
1011 #[stable(feature = "simd_x86", since = "1.27.0")]
1012 pub unsafe fn _mm_shuffle_ps<const MASK: i32>(a: __m128, b: __m128) -> __m128 {
1013     static_assert_uimm_bits!(MASK, 8);
1014     simd_shuffle!(
1015         a,
1016         b,
1017         [
1018             MASK as u32 & 0b11,
1019             (MASK as u32 >> 2) & 0b11,
1020             ((MASK as u32 >> 4) & 0b11) + 4,
1021             ((MASK as u32 >> 6) & 0b11) + 4,
1022         ],
1023     )
1024 }
1025
1026 /// Unpacks and interleave single-precision (32-bit) floating-point elements
1027 /// from the higher half of `a` and `b`.
1028 ///
1029 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps)
1030 #[inline]
1031 #[target_feature(enable = "sse")]
1032 #[cfg_attr(test, assert_instr(unpckhps))]
1033 #[stable(feature = "simd_x86", since = "1.27.0")]
1034 pub unsafe fn _mm_unpackhi_ps(a: __m128, b: __m128) -> __m128 {
1035     simd_shuffle!(a, b, [2, 6, 3, 7])
1036 }
1037
1038 /// Unpacks and interleave single-precision (32-bit) floating-point elements
1039 /// from the lower half of `a` and `b`.
1040 ///
1041 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps)
1042 #[inline]
1043 #[target_feature(enable = "sse")]
1044 #[cfg_attr(test, assert_instr(unpcklps))]
1045 #[stable(feature = "simd_x86", since = "1.27.0")]
1046 pub unsafe fn _mm_unpacklo_ps(a: __m128, b: __m128) -> __m128 {
1047     simd_shuffle!(a, b, [0, 4, 1, 5])
1048 }
1049
1050 /// Combine higher half of `a` and `b`. The higher half of `b` occupies the
1051 /// lower half of result.
1052 ///
1053 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehl_ps)
1054 #[inline]
1055 #[target_feature(enable = "sse")]
1056 #[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movhlps))]
1057 #[stable(feature = "simd_x86", since = "1.27.0")]
1058 pub unsafe fn _mm_movehl_ps(a: __m128, b: __m128) -> __m128 {
1059     // TODO; figure why this is a different instruction on Windows?
1060     simd_shuffle!(a, b, [6, 7, 2, 3])
1061 }
1062
1063 /// Combine lower half of `a` and `b`. The lower half of `b` occupies the
1064 /// higher half of result.
1065 ///
1066 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movelh_ps)
1067 #[inline]
1068 #[target_feature(enable = "sse")]
1069 #[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movlhps))]
1070 #[stable(feature = "simd_x86", since = "1.27.0")]
1071 pub unsafe fn _mm_movelh_ps(a: __m128, b: __m128) -> __m128 {
1072     simd_shuffle!(a, b, [0, 1, 4, 5])
1073 }
1074
1075 /// Returns a mask of the most significant bit of each element in `a`.
1076 ///
1077 /// The mask is stored in the 4 least significant bits of the return value.
1078 /// All other bits are set to `0`.
1079 ///
1080 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_ps)
1081 #[inline]
1082 #[target_feature(enable = "sse")]
1083 #[cfg_attr(test, assert_instr(movmskps))]
1084 #[stable(feature = "simd_x86", since = "1.27.0")]
1085 pub unsafe fn _mm_movemask_ps(a: __m128) -> i32 {
1086     movmskps(a)
1087 }
1088
1089 /// Construct a `__m128` with the lowest element read from `p` and the other
1090 /// elements set to zero.
1091 ///
1092 /// This corresponds to instructions `VMOVSS` / `MOVSS`.
1093 ///
1094 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ss)
1095 #[inline]
1096 #[target_feature(enable = "sse")]
1097 #[cfg_attr(test, assert_instr(movss))]
1098 #[stable(feature = "simd_x86", since = "1.27.0")]
1099 pub unsafe fn _mm_load_ss(p: *const f32) -> __m128 {
1100     __m128(*p, 0.0, 0.0, 0.0)
1101 }
1102
1103 /// Construct a `__m128` by duplicating the value read from `p` into all
1104 /// elements.
1105 ///
1106 /// This corresponds to instructions `VMOVSS` / `MOVSS` followed by some
1107 /// shuffling.
1108 ///
1109 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps)
1110 #[inline]
1111 #[target_feature(enable = "sse")]
1112 #[cfg_attr(test, assert_instr(movss))]
1113 #[stable(feature = "simd_x86", since = "1.27.0")]
1114 pub unsafe fn _mm_load1_ps(p: *const f32) -> __m128 {
1115     let a = *p;
1116     __m128(a, a, a, a)
1117 }
1118
1119 /// Alias for [`_mm_load1_ps`](fn._mm_load1_ps.html)
1120 ///
1121 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1)
1122 #[inline]
1123 #[target_feature(enable = "sse")]
1124 #[cfg_attr(test, assert_instr(movss))]
1125 #[stable(feature = "simd_x86", since = "1.27.0")]
1126 pub unsafe fn _mm_load_ps1(p: *const f32) -> __m128 {
1127     _mm_load1_ps(p)
1128 }
1129
1130 /// Loads four `f32` values from *aligned* memory into a `__m128`. If the
1131 /// pointer is not aligned to a 128-bit boundary (16 bytes) a general
1132 /// protection fault will be triggered (fatal program crash).
1133 ///
1134 /// Use [`_mm_loadu_ps`](fn._mm_loadu_ps.html) for potentially unaligned
1135 /// memory.
1136 ///
1137 /// This corresponds to instructions `VMOVAPS` / `MOVAPS`.
1138 ///
1139 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps)
1140 #[inline]
1141 #[target_feature(enable = "sse")]
1142 #[cfg_attr(test, assert_instr(movaps))]
1143 #[stable(feature = "simd_x86", since = "1.27.0")]
1144 #[allow(clippy::cast_ptr_alignment)]
1145 pub unsafe fn _mm_load_ps(p: *const f32) -> __m128 {
1146     *(p as *const __m128)
1147 }
1148
1149 /// Loads four `f32` values from memory into a `__m128`. There are no
1150 /// restrictions
1151 /// on memory alignment. For aligned memory
1152 /// [`_mm_load_ps`](fn._mm_load_ps.html)
1153 /// may be faster.
1154 ///
1155 /// This corresponds to instructions `VMOVUPS` / `MOVUPS`.
1156 ///
1157 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_ps)
1158 #[inline]
1159 #[target_feature(enable = "sse")]
1160 #[cfg_attr(test, assert_instr(movups))]
1161 #[stable(feature = "simd_x86", since = "1.27.0")]
1162 pub unsafe fn _mm_loadu_ps(p: *const f32) -> __m128 {
1163     // Note: Using `*p` would require `f32` alignment, but `movups` has no
1164     // alignment restrictions.
1165     let mut dst = _mm_undefined_ps();
1166     ptr::copy_nonoverlapping(
1167         p as *const u8,
1168         &mut dst as *mut __m128 as *mut u8,
1169         mem::size_of::<__m128>(),
1170     );
1171     dst
1172 }
1173
1174 /// Loads four `f32` values from aligned memory into a `__m128` in reverse
1175 /// order.
1176 ///
1177 /// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
1178 /// protection fault will be triggered (fatal program crash).
1179 ///
1180 /// Functionally equivalent to the following code sequence (assuming `p`
1181 /// satisfies the alignment restrictions):
1182 ///
1183 /// ```text
1184 /// let a0 = *p;
1185 /// let a1 = *p.add(1);
1186 /// let a2 = *p.add(2);
1187 /// let a3 = *p.add(3);
1188 /// __m128::new(a3, a2, a1, a0)
1189 /// ```
1190 ///
1191 /// This corresponds to instructions `VMOVAPS` / `MOVAPS` followed by some
1192 /// shuffling.
1193 ///
1194 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps)
1195 #[inline]
1196 #[target_feature(enable = "sse")]
1197 #[cfg_attr(test, assert_instr(movaps))]
1198 #[stable(feature = "simd_x86", since = "1.27.0")]
1199 pub unsafe fn _mm_loadr_ps(p: *const f32) -> __m128 {
1200     let a = _mm_load_ps(p);
1201     simd_shuffle!(a, a, [3, 2, 1, 0])
1202 }
1203
1204 /// Loads unaligned 64-bits of integer data from memory into new vector.
1205 ///
1206 /// `mem_addr` does not need to be aligned on any particular boundary.
1207 ///
1208 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64)
1209 #[inline]
1210 #[target_feature(enable = "sse")]
1211 #[stable(feature = "simd_x86_mm_loadu_si64", since = "1.46.0")]
1212 pub unsafe fn _mm_loadu_si64(mem_addr: *const u8) -> __m128i {
1213     transmute(i64x2(ptr::read_unaligned(mem_addr as *const i64), 0))
1214 }
1215
1216 /// Stores the lowest 32 bit float of `a` into memory.
1217 ///
1218 /// This intrinsic corresponds to the `MOVSS` instruction.
1219 ///
1220 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ss)
1221 #[inline]
1222 #[target_feature(enable = "sse")]
1223 #[cfg_attr(test, assert_instr(movss))]
1224 #[stable(feature = "simd_x86", since = "1.27.0")]
1225 pub unsafe fn _mm_store_ss(p: *mut f32, a: __m128) {
1226     *p = simd_extract(a, 0);
1227 }
1228
1229 /// Stores the lowest 32 bit float of `a` repeated four times into *aligned*
1230 /// memory.
1231 ///
1232 /// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
1233 /// protection fault will be triggered (fatal program crash).
1234 ///
1235 /// Functionally equivalent to the following code sequence (assuming `p`
1236 /// satisfies the alignment restrictions):
1237 ///
1238 /// ```text
1239 /// let x = a.extract(0);
1240 /// *p = x;
1241 /// *p.add(1) = x;
1242 /// *p.add(2) = x;
1243 /// *p.add(3) = x;
1244 /// ```
1245 ///
1246 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps)
1247 #[inline]
1248 #[target_feature(enable = "sse")]
1249 #[cfg_attr(test, assert_instr(movaps))]
1250 #[stable(feature = "simd_x86", since = "1.27.0")]
1251 #[allow(clippy::cast_ptr_alignment)]
1252 pub unsafe fn _mm_store1_ps(p: *mut f32, a: __m128) {
1253     let b: __m128 = simd_shuffle!(a, a, [0, 0, 0, 0]);
1254     *(p as *mut __m128) = b;
1255 }
1256
1257 /// Alias for [`_mm_store1_ps`](fn._mm_store1_ps.html)
1258 ///
1259 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1)
1260 #[inline]
1261 #[target_feature(enable = "sse")]
1262 #[cfg_attr(test, assert_instr(movaps))]
1263 #[stable(feature = "simd_x86", since = "1.27.0")]
1264 pub unsafe fn _mm_store_ps1(p: *mut f32, a: __m128) {
1265     _mm_store1_ps(p, a);
1266 }
1267
1268 /// Stores four 32-bit floats into *aligned* memory.
1269 ///
1270 /// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
1271 /// protection fault will be triggered (fatal program crash).
1272 ///
1273 /// Use [`_mm_storeu_ps`](fn._mm_storeu_ps.html) for potentially unaligned
1274 /// memory.
1275 ///
1276 /// This corresponds to instructions `VMOVAPS` / `MOVAPS`.
1277 ///
1278 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps)
1279 #[inline]
1280 #[target_feature(enable = "sse")]
1281 #[cfg_attr(test, assert_instr(movaps))]
1282 #[stable(feature = "simd_x86", since = "1.27.0")]
1283 #[allow(clippy::cast_ptr_alignment)]
1284 pub unsafe fn _mm_store_ps(p: *mut f32, a: __m128) {
1285     *(p as *mut __m128) = a;
1286 }
1287
1288 /// Stores four 32-bit floats into memory. There are no restrictions on memory
1289 /// alignment. For aligned memory [`_mm_store_ps`](fn._mm_store_ps.html) may be
1290 /// faster.
1291 ///
1292 /// This corresponds to instructions `VMOVUPS` / `MOVUPS`.
1293 ///
1294 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_ps)
1295 #[inline]
1296 #[target_feature(enable = "sse")]
1297 #[cfg_attr(test, assert_instr(movups))]
1298 #[stable(feature = "simd_x86", since = "1.27.0")]
1299 pub unsafe fn _mm_storeu_ps(p: *mut f32, a: __m128) {
1300     ptr::copy_nonoverlapping(
1301         &a as *const __m128 as *const u8,
1302         p as *mut u8,
1303         mem::size_of::<__m128>(),
1304     );
1305 }
1306
1307 /// Stores four 32-bit floats into *aligned* memory in reverse order.
1308 ///
1309 /// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
1310 /// protection fault will be triggered (fatal program crash).
1311 ///
1312 /// Functionally equivalent to the following code sequence (assuming `p`
1313 /// satisfies the alignment restrictions):
1314 ///
1315 /// ```text
1316 /// *p = a.extract(3);
1317 /// *p.add(1) = a.extract(2);
1318 /// *p.add(2) = a.extract(1);
1319 /// *p.add(3) = a.extract(0);
1320 /// ```
1321 ///
1322 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps)
1323 #[inline]
1324 #[target_feature(enable = "sse")]
1325 #[cfg_attr(test, assert_instr(movaps))]
1326 #[stable(feature = "simd_x86", since = "1.27.0")]
1327 #[allow(clippy::cast_ptr_alignment)]
1328 pub unsafe fn _mm_storer_ps(p: *mut f32, a: __m128) {
1329     let b: __m128 = simd_shuffle!(a, a, [3, 2, 1, 0]);
1330     *(p as *mut __m128) = b;
1331 }
1332
1333 /// Returns a `__m128` with the first component from `b` and the remaining
1334 /// components from `a`.
1335 ///
1336 /// In other words for any `a` and `b`:
1337 /// ```text
1338 /// _mm_move_ss(a, b) == a.replace(0, b.extract(0))
1339 /// ```
1340 ///
1341 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_ss)
1342 #[inline]
1343 #[target_feature(enable = "sse")]
1344 #[cfg_attr(test, assert_instr(movss))]
1345 #[stable(feature = "simd_x86", since = "1.27.0")]
1346 pub unsafe fn _mm_move_ss(a: __m128, b: __m128) -> __m128 {
1347     simd_shuffle!(a, b, [4, 1, 2, 3])
1348 }
1349
1350 /// Performs a serializing operation on all store-to-memory instructions that
1351 /// were issued prior to this instruction.
1352 ///
1353 /// Guarantees that every store instruction that precedes, in program order, is
1354 /// globally visible before any store instruction which follows the fence in
1355 /// program order.
1356 ///
1357 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence)
1358 #[inline]
1359 #[target_feature(enable = "sse")]
1360 #[cfg_attr(test, assert_instr(sfence))]
1361 #[stable(feature = "simd_x86", since = "1.27.0")]
1362 pub unsafe fn _mm_sfence() {
1363     sfence()
1364 }
1365
1366 /// Gets the unsigned 32-bit value of the MXCSR control and status register.
1367 ///
1368 /// For more info see [`_mm_setcsr`](fn._mm_setcsr.html)
1369 ///
1370 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr)
1371 #[inline]
1372 #[target_feature(enable = "sse")]
1373 #[cfg_attr(test, assert_instr(stmxcsr))]
1374 #[stable(feature = "simd_x86", since = "1.27.0")]
1375 pub unsafe fn _mm_getcsr() -> u32 {
1376     let mut result = 0_i32;
1377     stmxcsr((&mut result) as *mut _ as *mut i8);
1378     result as u32
1379 }
1380
1381 /// Sets the MXCSR register with the 32-bit unsigned integer value.
1382 ///
1383 /// This register controls how SIMD instructions handle floating point
1384 /// operations. Modifying this register only affects the current thread.
1385 ///
1386 /// It contains several groups of flags:
1387 ///
1388 /// * *Exception flags* report which exceptions occurred since last they were
1389 /// reset.
1390 ///
1391 /// * *Masking flags* can be used to mask (ignore) certain exceptions. By
1392 /// default
1393 /// these flags are all set to 1, so all exceptions are masked. When an
1394 /// an exception is masked, the processor simply sets the exception flag and
1395 /// continues the operation. If the exception is unmasked, the flag is also set
1396 /// but additionally an exception handler is invoked.
1397 ///
1398 /// * *Rounding mode flags* control the rounding mode of floating point
1399 /// instructions.
1400 ///
1401 /// * The *denormals-are-zero mode flag* turns all numbers which would be
1402 /// denormalized (exponent bits are all zeros) into zeros.
1403 ///
1404 /// ## Exception Flags
1405 ///
1406 /// * `_MM_EXCEPT_INVALID`: An invalid operation was performed (e.g., dividing
1407 ///   Infinity by Infinity).
1408 ///
1409 /// * `_MM_EXCEPT_DENORM`: An operation attempted to operate on a denormalized
1410 ///   number. Mainly this can cause loss of precision.
1411 ///
1412 /// * `_MM_EXCEPT_DIV_ZERO`: Division by zero occurred.
1413 ///
1414 /// * `_MM_EXCEPT_OVERFLOW`: A numeric overflow exception occurred, i.e., a
1415 /// result was too large to be represented (e.g., an `f32` with absolute
1416 /// value
1417 ///   greater than `2^128`).
1418 ///
1419 /// * `_MM_EXCEPT_UNDERFLOW`: A numeric underflow exception occurred, i.e., a
1420 /// result was too small to be represented in a normalized way (e.g., an
1421 /// `f32`
1422 ///   with absulte value smaller than `2^-126`.)
1423 ///
1424 /// * `_MM_EXCEPT_INEXACT`: An inexact-result exception occurred (a.k.a.
1425 ///   precision exception). This means some precision was lost due to rounding.
1426 ///   For example, the fraction `1/3` cannot be represented accurately in a
1427 ///   32 or 64 bit float and computing it would cause this exception to be
1428 ///   raised. Precision exceptions are very common, so they are usually masked.
1429 ///
1430 /// Exception flags can be read and set using the convenience functions
1431 /// `_MM_GET_EXCEPTION_STATE` and `_MM_SET_EXCEPTION_STATE`. For example, to
1432 /// check if an operation caused some overflow:
1433 ///
1434 /// ```rust,ignore
1435 /// _MM_SET_EXCEPTION_STATE(0); // clear all exception flags
1436 ///                             // perform calculations
1437 /// if _MM_GET_EXCEPTION_STATE() & _MM_EXCEPT_OVERFLOW != 0 {
1438 ///     // handle overflow
1439 /// }
1440 /// ```
1441 ///
1442 /// ## Masking Flags
1443 ///
1444 /// There is one masking flag for each exception flag: `_MM_MASK_INVALID`,
1445 /// `_MM_MASK_DENORM`, `_MM_MASK_DIV_ZERO`, `_MM_MASK_OVERFLOW`,
1446 /// `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`.
1447 ///
1448 /// A single masking bit can be set via
1449 ///
1450 /// ```rust,ignore
1451 /// _MM_SET_EXCEPTION_MASK(_MM_MASK_UNDERFLOW);
1452 /// ```
1453 ///
1454 /// However, since mask bits are by default all set to 1, it is more common to
1455 /// want to *disable* certain bits. For example, to unmask the underflow
1456 /// exception, use:
1457 ///
1458 /// ```rust,ignore
1459 /// _mm_setcsr(_mm_getcsr() & !_MM_MASK_UNDERFLOW); // unmask underflow
1460 /// exception
1461 /// ```
1462 ///
1463 /// Warning: an unmasked exception will cause an exception handler to be
1464 /// called.
1465 /// The standard handler will simply terminate the process. So, in this case
1466 /// any underflow exception would terminate the current process with something
1467 /// like `signal: 8, SIGFPE: erroneous arithmetic operation`.
1468 ///
1469 /// ## Rounding Mode
1470 ///
1471 /// The rounding mode is describe using two bits. It can be read and set using
1472 /// the convenience wrappers `_MM_GET_ROUNDING_MODE()` and
1473 /// `_MM_SET_ROUNDING_MODE(mode)`.
1474 ///
1475 /// The rounding modes are:
1476 ///
1477 /// * `_MM_ROUND_NEAREST`: (default) Round to closest to the infinite precision
1478 ///   value. If two values are equally close, round to even (i.e., least
1479 ///   significant bit will be zero).
1480 ///
1481 /// * `_MM_ROUND_DOWN`: Round toward negative Infinity.
1482 ///
1483 /// * `_MM_ROUND_UP`: Round toward positive Infinity.
1484 ///
1485 /// * `_MM_ROUND_TOWARD_ZERO`: Round towards zero (truncate).
1486 ///
1487 /// Example:
1488 ///
1489 /// ```rust,ignore
1490 /// _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN)
1491 /// ```
1492 ///
1493 /// ## Denormals-are-zero/Flush-to-zero Mode
1494 ///
1495 /// If this bit is set, values that would be denormalized will be set to zero
1496 /// instead. This is turned off by default.
1497 ///
1498 /// You can read and enable/disable this mode via the helper functions
1499 /// `_MM_GET_FLUSH_ZERO_MODE()` and `_MM_SET_FLUSH_ZERO_MODE()`:
1500 ///
1501 /// ```rust,ignore
1502 /// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF); // turn off (default)
1503 /// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // turn on
1504 /// ```
1505 ///
1506 ///
1507 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setcsr)
1508 #[inline]
1509 #[target_feature(enable = "sse")]
1510 #[cfg_attr(test, assert_instr(ldmxcsr))]
1511 #[stable(feature = "simd_x86", since = "1.27.0")]
1512 pub unsafe fn _mm_setcsr(val: u32) {
1513     ldmxcsr(&val as *const _ as *const i8);
1514 }
1515
1516 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1517 #[stable(feature = "simd_x86", since = "1.27.0")]
1518 pub const _MM_EXCEPT_INVALID: u32 = 0x0001;
1519 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1520 #[stable(feature = "simd_x86", since = "1.27.0")]
1521 pub const _MM_EXCEPT_DENORM: u32 = 0x0002;
1522 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1523 #[stable(feature = "simd_x86", since = "1.27.0")]
1524 pub const _MM_EXCEPT_DIV_ZERO: u32 = 0x0004;
1525 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1526 #[stable(feature = "simd_x86", since = "1.27.0")]
1527 pub const _MM_EXCEPT_OVERFLOW: u32 = 0x0008;
1528 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1529 #[stable(feature = "simd_x86", since = "1.27.0")]
1530 pub const _MM_EXCEPT_UNDERFLOW: u32 = 0x0010;
1531 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1532 #[stable(feature = "simd_x86", since = "1.27.0")]
1533 pub const _MM_EXCEPT_INEXACT: u32 = 0x0020;
1534 /// See [`_MM_GET_EXCEPTION_STATE`](fn._MM_GET_EXCEPTION_STATE.html)
1535 #[stable(feature = "simd_x86", since = "1.27.0")]
1536 pub const _MM_EXCEPT_MASK: u32 = 0x003f;
1537
1538 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1539 #[stable(feature = "simd_x86", since = "1.27.0")]
1540 pub const _MM_MASK_INVALID: u32 = 0x0080;
1541 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1542 #[stable(feature = "simd_x86", since = "1.27.0")]
1543 pub const _MM_MASK_DENORM: u32 = 0x0100;
1544 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1545 #[stable(feature = "simd_x86", since = "1.27.0")]
1546 pub const _MM_MASK_DIV_ZERO: u32 = 0x0200;
1547 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1548 #[stable(feature = "simd_x86", since = "1.27.0")]
1549 pub const _MM_MASK_OVERFLOW: u32 = 0x0400;
1550 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1551 #[stable(feature = "simd_x86", since = "1.27.0")]
1552 pub const _MM_MASK_UNDERFLOW: u32 = 0x0800;
1553 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1554 #[stable(feature = "simd_x86", since = "1.27.0")]
1555 pub const _MM_MASK_INEXACT: u32 = 0x1000;
1556 /// See [`_MM_GET_EXCEPTION_MASK`](fn._MM_GET_EXCEPTION_MASK.html)
1557 #[stable(feature = "simd_x86", since = "1.27.0")]
1558 pub const _MM_MASK_MASK: u32 = 0x1f80;
1559
1560 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1561 #[stable(feature = "simd_x86", since = "1.27.0")]
1562 pub const _MM_ROUND_NEAREST: u32 = 0x0000;
1563 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1564 #[stable(feature = "simd_x86", since = "1.27.0")]
1565 pub const _MM_ROUND_DOWN: u32 = 0x2000;
1566 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1567 #[stable(feature = "simd_x86", since = "1.27.0")]
1568 pub const _MM_ROUND_UP: u32 = 0x4000;
1569 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1570 #[stable(feature = "simd_x86", since = "1.27.0")]
1571 pub const _MM_ROUND_TOWARD_ZERO: u32 = 0x6000;
1572
1573 /// See [`_MM_GET_ROUNDING_MODE`](fn._MM_GET_ROUNDING_MODE.html)
1574 #[stable(feature = "simd_x86", since = "1.27.0")]
1575 pub const _MM_ROUND_MASK: u32 = 0x6000;
1576
1577 /// See [`_MM_GET_FLUSH_ZERO_MODE`](fn._MM_GET_FLUSH_ZERO_MODE.html)
1578 #[stable(feature = "simd_x86", since = "1.27.0")]
1579 pub const _MM_FLUSH_ZERO_MASK: u32 = 0x8000;
1580 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1581 #[stable(feature = "simd_x86", since = "1.27.0")]
1582 pub const _MM_FLUSH_ZERO_ON: u32 = 0x8000;
1583 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1584 #[stable(feature = "simd_x86", since = "1.27.0")]
1585 pub const _MM_FLUSH_ZERO_OFF: u32 = 0x0000;
1586
1587 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1588 ///
1589 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_EXCEPTION_MASK)
1590 #[inline]
1591 #[allow(non_snake_case)]
1592 #[target_feature(enable = "sse")]
1593 #[stable(feature = "simd_x86", since = "1.27.0")]
1594 pub unsafe fn _MM_GET_EXCEPTION_MASK() -> u32 {
1595     _mm_getcsr() & _MM_MASK_MASK
1596 }
1597
1598 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1599 ///
1600 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_EXCEPTION_STATE)
1601 #[inline]
1602 #[allow(non_snake_case)]
1603 #[target_feature(enable = "sse")]
1604 #[stable(feature = "simd_x86", since = "1.27.0")]
1605 pub unsafe fn _MM_GET_EXCEPTION_STATE() -> u32 {
1606     _mm_getcsr() & _MM_EXCEPT_MASK
1607 }
1608
1609 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1610 ///
1611 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE)
1612 #[inline]
1613 #[allow(non_snake_case)]
1614 #[target_feature(enable = "sse")]
1615 #[stable(feature = "simd_x86", since = "1.27.0")]
1616 pub unsafe fn _MM_GET_FLUSH_ZERO_MODE() -> u32 {
1617     _mm_getcsr() & _MM_FLUSH_ZERO_MASK
1618 }
1619
1620 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1621 ///
1622 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE)
1623 #[inline]
1624 #[allow(non_snake_case)]
1625 #[target_feature(enable = "sse")]
1626 #[stable(feature = "simd_x86", since = "1.27.0")]
1627 pub unsafe fn _MM_GET_ROUNDING_MODE() -> u32 {
1628     _mm_getcsr() & _MM_ROUND_MASK
1629 }
1630
1631 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1632 ///
1633 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_EXCEPTION_MASK)
1634 #[inline]
1635 #[allow(non_snake_case)]
1636 #[target_feature(enable = "sse")]
1637 #[stable(feature = "simd_x86", since = "1.27.0")]
1638 pub unsafe fn _MM_SET_EXCEPTION_MASK(x: u32) {
1639     _mm_setcsr((_mm_getcsr() & !_MM_MASK_MASK) | x)
1640 }
1641
1642 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1643 ///
1644 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_EXCEPTION_STATE)
1645 #[inline]
1646 #[allow(non_snake_case)]
1647 #[target_feature(enable = "sse")]
1648 #[stable(feature = "simd_x86", since = "1.27.0")]
1649 pub unsafe fn _MM_SET_EXCEPTION_STATE(x: u32) {
1650     _mm_setcsr((_mm_getcsr() & !_MM_EXCEPT_MASK) | x)
1651 }
1652
1653 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1654 ///
1655 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE)
1656 #[inline]
1657 #[allow(non_snake_case)]
1658 #[target_feature(enable = "sse")]
1659 #[stable(feature = "simd_x86", since = "1.27.0")]
1660 pub unsafe fn _MM_SET_FLUSH_ZERO_MODE(x: u32) {
1661     let val = (_mm_getcsr() & !_MM_FLUSH_ZERO_MASK) | x;
1662     // println!("setting csr={:x}", val);
1663     _mm_setcsr(val)
1664 }
1665
1666 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1667 ///
1668 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE)
1669 #[inline]
1670 #[allow(non_snake_case)]
1671 #[target_feature(enable = "sse")]
1672 #[stable(feature = "simd_x86", since = "1.27.0")]
1673 pub unsafe fn _MM_SET_ROUNDING_MODE(x: u32) {
1674     _mm_setcsr((_mm_getcsr() & !_MM_ROUND_MASK) | x)
1675 }
1676
1677 /// See [`_mm_prefetch`](fn._mm_prefetch.html).
1678 #[stable(feature = "simd_x86", since = "1.27.0")]
1679 pub const _MM_HINT_T0: i32 = 3;
1680
1681 /// See [`_mm_prefetch`](fn._mm_prefetch.html).
1682 #[stable(feature = "simd_x86", since = "1.27.0")]
1683 pub const _MM_HINT_T1: i32 = 2;
1684
1685 /// See [`_mm_prefetch`](fn._mm_prefetch.html).
1686 #[stable(feature = "simd_x86", since = "1.27.0")]
1687 pub const _MM_HINT_T2: i32 = 1;
1688
1689 /// See [`_mm_prefetch`](fn._mm_prefetch.html).
1690 #[stable(feature = "simd_x86", since = "1.27.0")]
1691 pub const _MM_HINT_NTA: i32 = 0;
1692
1693 /// See [`_mm_prefetch`](fn._mm_prefetch.html).
1694 #[stable(feature = "simd_x86", since = "1.27.0")]
1695 pub const _MM_HINT_ET0: i32 = 7;
1696
1697 /// See [`_mm_prefetch`](fn._mm_prefetch.html).
1698 #[stable(feature = "simd_x86", since = "1.27.0")]
1699 pub const _MM_HINT_ET1: i32 = 6;
1700
1701 /// Fetch the cache line that contains address `p` using the given `STRATEGY`.
1702 ///
1703 /// The `STRATEGY` must be one of:
1704 ///
1705 /// * [`_MM_HINT_T0`](constant._MM_HINT_T0.html): Fetch into all levels of the
1706 ///   cache hierarchy.
1707 ///
1708 /// * [`_MM_HINT_T1`](constant._MM_HINT_T1.html): Fetch into L2 and higher.
1709 ///
1710 /// * [`_MM_HINT_T2`](constant._MM_HINT_T2.html): Fetch into L3 and higher or
1711 /// an   implementation-specific choice (e.g., L2 if there is no L3).
1712 ///
1713 /// * [`_MM_HINT_NTA`](constant._MM_HINT_NTA.html): Fetch data using the
1714 ///   non-temporal access (NTA) hint. It may be a place closer than main memory
1715 ///   but outside of the cache hierarchy. This is used to reduce access latency
1716 ///   without polluting the cache.
1717 ///
1718 /// * [`_MM_HINT_ET0`](constant._MM_HINT_ET0.html) and
1719 ///   [`_MM_HINT_ET1`](constant._MM_HINT_ET1.html) are similar to `_MM_HINT_T0`
1720 ///   and `_MM_HINT_T1` but indicate an anticipation to write to the address.
1721 ///
1722 /// The actual implementation depends on the particular CPU. This instruction
1723 /// is considered a hint, so the CPU is also free to simply ignore the request.
1724 ///
1725 /// The amount of prefetched data depends on the cache line size of the
1726 /// specific CPU, but it will be at least 32 bytes.
1727 ///
1728 /// Common caveats:
1729 ///
1730 /// * Most modern CPUs already automatically prefetch data based on predicted
1731 ///   access patterns.
1732 ///
1733 /// * Data is usually not fetched if this would cause a TLB miss or a page
1734 ///   fault.
1735 ///
1736 /// * Too much prefetching can cause unnecessary cache evictions.
1737 ///
1738 /// * Prefetching may also fail if there are not enough memory-subsystem
1739 ///   resources (e.g., request buffers).
1740 ///
1741 ///
1742 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch)
1743 #[inline]
1744 #[target_feature(enable = "sse")]
1745 #[cfg_attr(test, assert_instr(prefetcht0, STRATEGY = _MM_HINT_T0))]
1746 #[cfg_attr(test, assert_instr(prefetcht1, STRATEGY = _MM_HINT_T1))]
1747 #[cfg_attr(test, assert_instr(prefetcht2, STRATEGY = _MM_HINT_T2))]
1748 #[cfg_attr(test, assert_instr(prefetchnta, STRATEGY = _MM_HINT_NTA))]
1749 #[rustc_legacy_const_generics(1)]
1750 #[stable(feature = "simd_x86", since = "1.27.0")]
1751 pub unsafe fn _mm_prefetch<const STRATEGY: i32>(p: *const i8) {
1752     // We use the `llvm.prefetch` intrinsic with `cache type` = 1 (data cache).
1753     // `locality` and `rw` are based on our `STRATEGY`.
1754     prefetch(p, (STRATEGY >> 2) & 1, STRATEGY & 3, 1);
1755 }
1756
1757 /// Returns vector of type __m128 with undefined elements.
1758 ///
1759 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps)
1760 #[inline]
1761 #[target_feature(enable = "sse")]
1762 #[stable(feature = "simd_x86", since = "1.27.0")]
1763 pub unsafe fn _mm_undefined_ps() -> __m128 {
1764     _mm_set1_ps(0.0)
1765 }
1766
1767 /// Transpose the 4x4 matrix formed by 4 rows of __m128 in place.
1768 ///
1769 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_TRANSPOSE4_PS)
1770 #[inline]
1771 #[allow(non_snake_case)]
1772 #[target_feature(enable = "sse")]
1773 #[stable(feature = "simd_x86", since = "1.27.0")]
1774 pub unsafe fn _MM_TRANSPOSE4_PS(
1775     row0: &mut __m128,
1776     row1: &mut __m128,
1777     row2: &mut __m128,
1778     row3: &mut __m128,
1779 ) {
1780     let tmp0 = _mm_unpacklo_ps(*row0, *row1);
1781     let tmp2 = _mm_unpacklo_ps(*row2, *row3);
1782     let tmp1 = _mm_unpackhi_ps(*row0, *row1);
1783     let tmp3 = _mm_unpackhi_ps(*row2, *row3);
1784
1785     *row0 = _mm_movelh_ps(tmp0, tmp2);
1786     *row1 = _mm_movehl_ps(tmp2, tmp0);
1787     *row2 = _mm_movelh_ps(tmp1, tmp3);
1788     *row3 = _mm_movehl_ps(tmp3, tmp1);
1789 }
1790
1791 #[allow(improper_ctypes)]
1792 extern "C" {
1793     #[link_name = "llvm.x86.sse.add.ss"]
1794     fn addss(a: __m128, b: __m128) -> __m128;
1795     #[link_name = "llvm.x86.sse.sub.ss"]
1796     fn subss(a: __m128, b: __m128) -> __m128;
1797     #[link_name = "llvm.x86.sse.mul.ss"]
1798     fn mulss(a: __m128, b: __m128) -> __m128;
1799     #[link_name = "llvm.x86.sse.div.ss"]
1800     fn divss(a: __m128, b: __m128) -> __m128;
1801     #[link_name = "llvm.x86.sse.sqrt.ss"]
1802     fn sqrtss(a: __m128) -> __m128;
1803     #[link_name = "llvm.x86.sse.sqrt.ps"]
1804     fn sqrtps(a: __m128) -> __m128;
1805     #[link_name = "llvm.x86.sse.rcp.ss"]
1806     fn rcpss(a: __m128) -> __m128;
1807     #[link_name = "llvm.x86.sse.rcp.ps"]
1808     fn rcpps(a: __m128) -> __m128;
1809     #[link_name = "llvm.x86.sse.rsqrt.ss"]
1810     fn rsqrtss(a: __m128) -> __m128;
1811     #[link_name = "llvm.x86.sse.rsqrt.ps"]
1812     fn rsqrtps(a: __m128) -> __m128;
1813     #[link_name = "llvm.x86.sse.min.ss"]
1814     fn minss(a: __m128, b: __m128) -> __m128;
1815     #[link_name = "llvm.x86.sse.min.ps"]
1816     fn minps(a: __m128, b: __m128) -> __m128;
1817     #[link_name = "llvm.x86.sse.max.ss"]
1818     fn maxss(a: __m128, b: __m128) -> __m128;
1819     #[link_name = "llvm.x86.sse.max.ps"]
1820     fn maxps(a: __m128, b: __m128) -> __m128;
1821     #[link_name = "llvm.x86.sse.movmsk.ps"]
1822     fn movmskps(a: __m128) -> i32;
1823     #[link_name = "llvm.x86.sse.cmp.ps"]
1824     fn cmpps(a: __m128, b: __m128, imm8: i8) -> __m128;
1825     #[link_name = "llvm.x86.sse.comieq.ss"]
1826     fn comieq_ss(a: __m128, b: __m128) -> i32;
1827     #[link_name = "llvm.x86.sse.comilt.ss"]
1828     fn comilt_ss(a: __m128, b: __m128) -> i32;
1829     #[link_name = "llvm.x86.sse.comile.ss"]
1830     fn comile_ss(a: __m128, b: __m128) -> i32;
1831     #[link_name = "llvm.x86.sse.comigt.ss"]
1832     fn comigt_ss(a: __m128, b: __m128) -> i32;
1833     #[link_name = "llvm.x86.sse.comige.ss"]
1834     fn comige_ss(a: __m128, b: __m128) -> i32;
1835     #[link_name = "llvm.x86.sse.comineq.ss"]
1836     fn comineq_ss(a: __m128, b: __m128) -> i32;
1837     #[link_name = "llvm.x86.sse.ucomieq.ss"]
1838     fn ucomieq_ss(a: __m128, b: __m128) -> i32;
1839     #[link_name = "llvm.x86.sse.ucomilt.ss"]
1840     fn ucomilt_ss(a: __m128, b: __m128) -> i32;
1841     #[link_name = "llvm.x86.sse.ucomile.ss"]
1842     fn ucomile_ss(a: __m128, b: __m128) -> i32;
1843     #[link_name = "llvm.x86.sse.ucomigt.ss"]
1844     fn ucomigt_ss(a: __m128, b: __m128) -> i32;
1845     #[link_name = "llvm.x86.sse.ucomige.ss"]
1846     fn ucomige_ss(a: __m128, b: __m128) -> i32;
1847     #[link_name = "llvm.x86.sse.ucomineq.ss"]
1848     fn ucomineq_ss(a: __m128, b: __m128) -> i32;
1849     #[link_name = "llvm.x86.sse.cvtss2si"]
1850     fn cvtss2si(a: __m128) -> i32;
1851     #[link_name = "llvm.x86.sse.cvttss2si"]
1852     fn cvttss2si(a: __m128) -> i32;
1853     #[link_name = "llvm.x86.sse.cvtsi2ss"]
1854     fn cvtsi2ss(a: __m128, b: i32) -> __m128;
1855     #[link_name = "llvm.x86.sse.sfence"]
1856     fn sfence();
1857     #[link_name = "llvm.x86.sse.stmxcsr"]
1858     fn stmxcsr(p: *mut i8);
1859     #[link_name = "llvm.x86.sse.ldmxcsr"]
1860     fn ldmxcsr(p: *const i8);
1861     #[link_name = "llvm.prefetch"]
1862     fn prefetch(p: *const i8, rw: i32, loc: i32, ty: i32);
1863     #[link_name = "llvm.x86.sse.cmp.ss"]
1864     fn cmpss(a: __m128, b: __m128, imm8: i8) -> __m128;
1865 }
1866
1867 /// Stores `a` into the memory at `mem_addr` using a non-temporal memory hint.
1868 ///
1869 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
1870 /// exception _may_ be generated.
1871 ///
1872 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps)
1873 #[inline]
1874 #[target_feature(enable = "sse")]
1875 #[cfg_attr(test, assert_instr(movntps))]
1876 #[stable(feature = "simd_x86", since = "1.27.0")]
1877 #[allow(clippy::cast_ptr_alignment)]
1878 pub unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: __m128) {
1879     intrinsics::nontemporal_store(mem_addr as *mut __m128, a);
1880 }
1881
1882 #[cfg(test)]
1883 mod tests {
1884     use crate::{hint::black_box, mem::transmute};
1885     use std::{boxed, f32::NAN};
1886     use stdarch_test::simd_test;
1887
1888     use crate::core_arch::{simd::*, x86::*};
1889
1890     #[simd_test(enable = "sse")]
1891     unsafe fn test_mm_add_ps() {
1892         let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
1893         let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
1894         let r = _mm_add_ps(a, b);
1895         assert_eq_m128(r, _mm_setr_ps(-101.0, 25.0, 0.0, -15.0));
1896     }
1897
1898     #[simd_test(enable = "sse")]
1899     unsafe fn test_mm_add_ss() {
1900         let a = _mm_set_ps(-1.0, 5.0, 0.0, -10.0);
1901         let b = _mm_set_ps(-100.0, 20.0, 0.0, -5.0);
1902         let r = _mm_add_ss(a, b);
1903         assert_eq_m128(r, _mm_set_ps(-1.0, 5.0, 0.0, -15.0));
1904     }
1905
1906     #[simd_test(enable = "sse")]
1907     unsafe fn test_mm_sub_ps() {
1908         let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
1909         let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
1910         let r = _mm_sub_ps(a, b);
1911         assert_eq_m128(r, _mm_setr_ps(99.0, -15.0, 0.0, -5.0));
1912     }
1913
1914     #[simd_test(enable = "sse")]
1915     unsafe fn test_mm_sub_ss() {
1916         let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
1917         let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
1918         let r = _mm_sub_ss(a, b);
1919         assert_eq_m128(r, _mm_setr_ps(99.0, 5.0, 0.0, -10.0));
1920     }
1921
1922     #[simd_test(enable = "sse")]
1923     unsafe fn test_mm_mul_ps() {
1924         let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
1925         let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
1926         let r = _mm_mul_ps(a, b);
1927         assert_eq_m128(r, _mm_setr_ps(100.0, 100.0, 0.0, 50.0));
1928     }
1929
1930     #[simd_test(enable = "sse")]
1931     unsafe fn test_mm_mul_ss() {
1932         let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
1933         let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
1934         let r = _mm_mul_ss(a, b);
1935         assert_eq_m128(r, _mm_setr_ps(100.0, 5.0, 0.0, -10.0));
1936     }
1937
1938     #[simd_test(enable = "sse")]
1939     unsafe fn test_mm_div_ps() {
1940         let a = _mm_setr_ps(-1.0, 5.0, 2.0, -10.0);
1941         let b = _mm_setr_ps(-100.0, 20.0, 0.2, -5.0);
1942         let r = _mm_div_ps(a, b);
1943         assert_eq_m128(r, _mm_setr_ps(0.01, 0.25, 10.0, 2.0));
1944     }
1945
1946     #[simd_test(enable = "sse")]
1947     unsafe fn test_mm_div_ss() {
1948         let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
1949         let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
1950         let r = _mm_div_ss(a, b);
1951         assert_eq_m128(r, _mm_setr_ps(0.01, 5.0, 0.0, -10.0));
1952     }
1953
1954     #[simd_test(enable = "sse")]
1955     unsafe fn test_mm_sqrt_ss() {
1956         let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
1957         let r = _mm_sqrt_ss(a);
1958         let e = _mm_setr_ps(2.0, 13.0, 16.0, 100.0);
1959         assert_eq_m128(r, e);
1960     }
1961
1962     #[simd_test(enable = "sse")]
1963     unsafe fn test_mm_sqrt_ps() {
1964         let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
1965         let r = _mm_sqrt_ps(a);
1966         let e = _mm_setr_ps(2.0, 3.6055512, 4.0, 10.0);
1967         assert_eq_m128(r, e);
1968     }
1969
1970     #[simd_test(enable = "sse")]
1971     unsafe fn test_mm_rcp_ss() {
1972         let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
1973         let r = _mm_rcp_ss(a);
1974         let e = _mm_setr_ps(0.24993896, 13.0, 16.0, 100.0);
1975         assert_eq_m128(r, e);
1976     }
1977
1978     #[simd_test(enable = "sse")]
1979     unsafe fn test_mm_rcp_ps() {
1980         let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
1981         let r = _mm_rcp_ps(a);
1982         let e = _mm_setr_ps(0.24993896, 0.0769043, 0.06248474, 0.0099983215);
1983         let rel_err = 0.00048828125;
1984         for i in 0..4 {
1985             assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err);
1986         }
1987     }
1988
1989     #[simd_test(enable = "sse")]
1990     unsafe fn test_mm_rsqrt_ss() {
1991         let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
1992         let r = _mm_rsqrt_ss(a);
1993         let e = _mm_setr_ps(0.49987793, 13.0, 16.0, 100.0);
1994         let rel_err = 0.00048828125;
1995         for i in 0..4 {
1996             assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err);
1997         }
1998     }
1999
2000     #[simd_test(enable = "sse")]
2001     unsafe fn test_mm_rsqrt_ps() {
2002         let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
2003         let r = _mm_rsqrt_ps(a);
2004         let e = _mm_setr_ps(0.49987793, 0.2772827, 0.24993896, 0.099990845);
2005         let rel_err = 0.00048828125;
2006         for i in 0..4 {
2007             assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err);
2008         }
2009     }
2010
2011     #[simd_test(enable = "sse")]
2012     unsafe fn test_mm_min_ss() {
2013         let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2014         let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2015         let r = _mm_min_ss(a, b);
2016         assert_eq_m128(r, _mm_setr_ps(-100.0, 5.0, 0.0, -10.0));
2017     }
2018
2019     #[simd_test(enable = "sse")]
2020     unsafe fn test_mm_min_ps() {
2021         let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2022         let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2023         let r = _mm_min_ps(a, b);
2024         assert_eq_m128(r, _mm_setr_ps(-100.0, 5.0, 0.0, -10.0));
2025
2026         // `_mm_min_ps` can **not** be implemented using the `simd_min` rust intrinsic. `simd_min`
2027         // is lowered by the llvm codegen backend to `llvm.minnum.v*` llvm intrinsic. This intrinsic
2028         // doesn't specify how -0.0 is handled. Unfortunately it happens to behave different from
2029         // the `minps` x86 instruction on x86. The `llvm.minnum.v*` llvm intrinsic equals
2030         // `r1` to `a` and `r2` to `b`.
2031         let a = _mm_setr_ps(-0.0, 0.0, 0.0, 0.0);
2032         let b = _mm_setr_ps(0.0, 0.0, 0.0, 0.0);
2033         let r1: [u8; 16] = transmute(_mm_min_ps(a, b));
2034         let r2: [u8; 16] = transmute(_mm_min_ps(b, a));
2035         let a: [u8; 16] = transmute(a);
2036         let b: [u8; 16] = transmute(b);
2037         assert_eq!(r1, b);
2038         assert_eq!(r2, a);
2039         assert_ne!(a, b); // sanity check that -0.0 is actually present
2040     }
2041
2042     #[simd_test(enable = "sse")]
2043     unsafe fn test_mm_max_ss() {
2044         let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2045         let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2046         let r = _mm_max_ss(a, b);
2047         assert_eq_m128(r, _mm_setr_ps(-1.0, 5.0, 0.0, -10.0));
2048     }
2049
2050     #[simd_test(enable = "sse")]
2051     unsafe fn test_mm_max_ps() {
2052         let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2053         let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2054         let r = _mm_max_ps(a, b);
2055         assert_eq_m128(r, _mm_setr_ps(-1.0, 20.0, 0.0, -5.0));
2056     }
2057
2058     #[simd_test(enable = "sse")]
2059     unsafe fn test_mm_and_ps() {
2060         let a = transmute(u32x4::splat(0b0011));
2061         let b = transmute(u32x4::splat(0b0101));
2062         let r = _mm_and_ps(*black_box(&a), *black_box(&b));
2063         let e = transmute(u32x4::splat(0b0001));
2064         assert_eq_m128(r, e);
2065     }
2066
2067     #[simd_test(enable = "sse")]
2068     unsafe fn test_mm_andnot_ps() {
2069         let a = transmute(u32x4::splat(0b0011));
2070         let b = transmute(u32x4::splat(0b0101));
2071         let r = _mm_andnot_ps(*black_box(&a), *black_box(&b));
2072         let e = transmute(u32x4::splat(0b0100));
2073         assert_eq_m128(r, e);
2074     }
2075
2076     #[simd_test(enable = "sse")]
2077     unsafe fn test_mm_or_ps() {
2078         let a = transmute(u32x4::splat(0b0011));
2079         let b = transmute(u32x4::splat(0b0101));
2080         let r = _mm_or_ps(*black_box(&a), *black_box(&b));
2081         let e = transmute(u32x4::splat(0b0111));
2082         assert_eq_m128(r, e);
2083     }
2084
2085     #[simd_test(enable = "sse")]
2086     unsafe fn test_mm_xor_ps() {
2087         let a = transmute(u32x4::splat(0b0011));
2088         let b = transmute(u32x4::splat(0b0101));
2089         let r = _mm_xor_ps(*black_box(&a), *black_box(&b));
2090         let e = transmute(u32x4::splat(0b0110));
2091         assert_eq_m128(r, e);
2092     }
2093
2094     #[simd_test(enable = "sse")]
2095     unsafe fn test_mm_cmpeq_ss() {
2096         let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2097         let b = _mm_setr_ps(-1.0, 5.0, 6.0, 7.0);
2098         let r: u32x4 = transmute(_mm_cmpeq_ss(a, b));
2099         let e: u32x4 = transmute(_mm_setr_ps(transmute(0u32), 2.0, 3.0, 4.0));
2100         assert_eq!(r, e);
2101
2102         let b2 = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2103         let r2: u32x4 = transmute(_mm_cmpeq_ss(a, b2));
2104         let e2: u32x4 = transmute(_mm_setr_ps(transmute(0xffffffffu32), 2.0, 3.0, 4.0));
2105         assert_eq!(r2, e2);
2106     }
2107
2108     #[simd_test(enable = "sse")]
2109     unsafe fn test_mm_cmplt_ss() {
2110         let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2111         let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2112         let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2113         let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2114
2115         let b1 = 0u32; // a.extract(0) < b.extract(0)
2116         let c1 = 0u32; // a.extract(0) < c.extract(0)
2117         let d1 = !0u32; // a.extract(0) < d.extract(0)
2118
2119         let rb: u32x4 = transmute(_mm_cmplt_ss(a, b));
2120         let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
2121         assert_eq!(rb, eb);
2122
2123         let rc: u32x4 = transmute(_mm_cmplt_ss(a, c));
2124         let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
2125         assert_eq!(rc, ec);
2126
2127         let rd: u32x4 = transmute(_mm_cmplt_ss(a, d));
2128         let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
2129         assert_eq!(rd, ed);
2130     }
2131
2132     #[simd_test(enable = "sse")]
2133     unsafe fn test_mm_cmple_ss() {
2134         let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2135         let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2136         let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2137         let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2138
2139         let b1 = 0u32; // a.extract(0) <= b.extract(0)
2140         let c1 = !0u32; // a.extract(0) <= c.extract(0)
2141         let d1 = !0u32; // a.extract(0) <= d.extract(0)
2142
2143         let rb: u32x4 = transmute(_mm_cmple_ss(a, b));
2144         let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
2145         assert_eq!(rb, eb);
2146
2147         let rc: u32x4 = transmute(_mm_cmple_ss(a, c));
2148         let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
2149         assert_eq!(rc, ec);
2150
2151         let rd: u32x4 = transmute(_mm_cmple_ss(a, d));
2152         let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
2153         assert_eq!(rd, ed);
2154     }
2155
2156     #[simd_test(enable = "sse")]
2157     unsafe fn test_mm_cmpgt_ss() {
2158         let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2159         let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2160         let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2161         let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2162
2163         let b1 = !0u32; // a.extract(0) > b.extract(0)
2164         let c1 = 0u32; // a.extract(0) > c.extract(0)
2165         let d1 = 0u32; // a.extract(0) > d.extract(0)
2166
2167         let rb: u32x4 = transmute(_mm_cmpgt_ss(a, b));
2168         let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
2169         assert_eq!(rb, eb);
2170
2171         let rc: u32x4 = transmute(_mm_cmpgt_ss(a, c));
2172         let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
2173         assert_eq!(rc, ec);
2174
2175         let rd: u32x4 = transmute(_mm_cmpgt_ss(a, d));
2176         let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
2177         assert_eq!(rd, ed);
2178     }
2179
2180     #[simd_test(enable = "sse")]
2181     unsafe fn test_mm_cmpge_ss() {
2182         let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2183         let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2184         let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2185         let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2186
2187         let b1 = !0u32; // a.extract(0) >= b.extract(0)
2188         let c1 = !0u32; // a.extract(0) >= c.extract(0)
2189         let d1 = 0u32; // a.extract(0) >= d.extract(0)
2190
2191         let rb: u32x4 = transmute(_mm_cmpge_ss(a, b));
2192         let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
2193         assert_eq!(rb, eb);
2194
2195         let rc: u32x4 = transmute(_mm_cmpge_ss(a, c));
2196         let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
2197         assert_eq!(rc, ec);
2198
2199         let rd: u32x4 = transmute(_mm_cmpge_ss(a, d));
2200         let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
2201         assert_eq!(rd, ed);
2202     }
2203
2204     #[simd_test(enable = "sse")]
2205     unsafe fn test_mm_cmpneq_ss() {
2206         let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2207         let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2208         let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2209         let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2210
2211         let b1 = !0u32; // a.extract(0) != b.extract(0)
2212         let c1 = 0u32; // a.extract(0) != c.extract(0)
2213         let d1 = !0u32; // a.extract(0) != d.extract(0)
2214
2215         let rb: u32x4 = transmute(_mm_cmpneq_ss(a, b));
2216         let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
2217         assert_eq!(rb, eb);
2218
2219         let rc: u32x4 = transmute(_mm_cmpneq_ss(a, c));
2220         let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
2221         assert_eq!(rc, ec);
2222
2223         let rd: u32x4 = transmute(_mm_cmpneq_ss(a, d));
2224         let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
2225         assert_eq!(rd, ed);
2226     }
2227
2228     #[simd_test(enable = "sse")]
2229     unsafe fn test_mm_cmpnlt_ss() {
2230         // TODO: this test is exactly the same as for `_mm_cmpge_ss`, but there
2231         // must be a difference. It may have to do with behavior in the
2232         // presence of NaNs (signaling or quiet). If so, we should add tests
2233         // for those.
2234
2235         let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2236         let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2237         let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2238         let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2239
2240         let b1 = !0u32; // a.extract(0) >= b.extract(0)
2241         let c1 = !0u32; // a.extract(0) >= c.extract(0)
2242         let d1 = 0u32; // a.extract(0) >= d.extract(0)
2243
2244         let rb: u32x4 = transmute(_mm_cmpnlt_ss(a, b));
2245         let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
2246         assert_eq!(rb, eb);
2247
2248         let rc: u32x4 = transmute(_mm_cmpnlt_ss(a, c));
2249         let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
2250         assert_eq!(rc, ec);
2251
2252         let rd: u32x4 = transmute(_mm_cmpnlt_ss(a, d));
2253         let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
2254         assert_eq!(rd, ed);
2255     }
2256
2257     #[simd_test(enable = "sse")]
2258     unsafe fn test_mm_cmpnle_ss() {
2259         // TODO: this test is exactly the same as for `_mm_cmpgt_ss`, but there
2260         // must be a difference. It may have to do with behavior in the
2261         // presence
2262         // of NaNs (signaling or quiet). If so, we should add tests for those.
2263
2264         let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2265         let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2266         let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2267         let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2268
2269         let b1 = !0u32; // a.extract(0) > b.extract(0)
2270         let c1 = 0u32; // a.extract(0) > c.extract(0)
2271         let d1 = 0u32; // a.extract(0) > d.extract(0)
2272
2273         let rb: u32x4 = transmute(_mm_cmpnle_ss(a, b));
2274         let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
2275         assert_eq!(rb, eb);
2276
2277         let rc: u32x4 = transmute(_mm_cmpnle_ss(a, c));
2278         let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
2279         assert_eq!(rc, ec);
2280
2281         let rd: u32x4 = transmute(_mm_cmpnle_ss(a, d));
2282         let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
2283         assert_eq!(rd, ed);
2284     }
2285
2286     #[simd_test(enable = "sse")]
2287     unsafe fn test_mm_cmpngt_ss() {
2288         // TODO: this test is exactly the same as for `_mm_cmple_ss`, but there
2289         // must be a difference. It may have to do with behavior in the
2290         // presence of NaNs (signaling or quiet). If so, we should add tests
2291         // for those.
2292
2293         let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2294         let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2295         let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2296         let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2297
2298         let b1 = 0u32; // a.extract(0) <= b.extract(0)
2299         let c1 = !0u32; // a.extract(0) <= c.extract(0)
2300         let d1 = !0u32; // a.extract(0) <= d.extract(0)
2301
2302         let rb: u32x4 = transmute(_mm_cmpngt_ss(a, b));
2303         let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
2304         assert_eq!(rb, eb);
2305
2306         let rc: u32x4 = transmute(_mm_cmpngt_ss(a, c));
2307         let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
2308         assert_eq!(rc, ec);
2309
2310         let rd: u32x4 = transmute(_mm_cmpngt_ss(a, d));
2311         let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
2312         assert_eq!(rd, ed);
2313     }
2314
2315     #[simd_test(enable = "sse")]
2316     unsafe fn test_mm_cmpnge_ss() {
2317         // TODO: this test is exactly the same as for `_mm_cmplt_ss`, but there
2318         // must be a difference. It may have to do with behavior in the
2319         // presence of NaNs (signaling or quiet). If so, we should add tests
2320         // for those.
2321
2322         let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2323         let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2324         let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2325         let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2326
2327         let b1 = 0u32; // a.extract(0) < b.extract(0)
2328         let c1 = 0u32; // a.extract(0) < c.extract(0)
2329         let d1 = !0u32; // a.extract(0) < d.extract(0)
2330
2331         let rb: u32x4 = transmute(_mm_cmpnge_ss(a, b));
2332         let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
2333         assert_eq!(rb, eb);
2334
2335         let rc: u32x4 = transmute(_mm_cmpnge_ss(a, c));
2336         let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
2337         assert_eq!(rc, ec);
2338
2339         let rd: u32x4 = transmute(_mm_cmpnge_ss(a, d));
2340         let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
2341         assert_eq!(rd, ed);
2342     }
2343
2344     #[simd_test(enable = "sse")]
2345     unsafe fn test_mm_cmpord_ss() {
2346         let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2347         let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2348         let c = _mm_setr_ps(NAN, 5.0, 6.0, 7.0);
2349         let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2350
2351         let b1 = !0u32; // a.extract(0) ord b.extract(0)
2352         let c1 = 0u32; // a.extract(0) ord c.extract(0)
2353         let d1 = !0u32; // a.extract(0) ord d.extract(0)
2354
2355         let rb: u32x4 = transmute(_mm_cmpord_ss(a, b));
2356         let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
2357         assert_eq!(rb, eb);
2358
2359         let rc: u32x4 = transmute(_mm_cmpord_ss(a, c));
2360         let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
2361         assert_eq!(rc, ec);
2362
2363         let rd: u32x4 = transmute(_mm_cmpord_ss(a, d));
2364         let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
2365         assert_eq!(rd, ed);
2366     }
2367
2368     #[simd_test(enable = "sse")]
2369     unsafe fn test_mm_cmpunord_ss() {
2370         let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2371         let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2372         let c = _mm_setr_ps(NAN, 5.0, 6.0, 7.0);
2373         let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2374
2375         let b1 = 0u32; // a.extract(0) unord b.extract(0)
2376         let c1 = !0u32; // a.extract(0) unord c.extract(0)
2377         let d1 = 0u32; // a.extract(0) unord d.extract(0)
2378
2379         let rb: u32x4 = transmute(_mm_cmpunord_ss(a, b));
2380         let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
2381         assert_eq!(rb, eb);
2382
2383         let rc: u32x4 = transmute(_mm_cmpunord_ss(a, c));
2384         let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
2385         assert_eq!(rc, ec);
2386
2387         let rd: u32x4 = transmute(_mm_cmpunord_ss(a, d));
2388         let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
2389         assert_eq!(rd, ed);
2390     }
2391
2392     #[simd_test(enable = "sse")]
2393     unsafe fn test_mm_cmpeq_ps() {
2394         let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2395         let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
2396         let tru = !0u32;
2397         let fls = 0u32;
2398
2399         let e = u32x4::new(fls, fls, tru, fls);
2400         let r: u32x4 = transmute(_mm_cmpeq_ps(a, b));
2401         assert_eq!(r, e);
2402     }
2403
2404     #[simd_test(enable = "sse")]
2405     unsafe fn test_mm_cmplt_ps() {
2406         let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2407         let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
2408         let tru = !0u32;
2409         let fls = 0u32;
2410
2411         let e = u32x4::new(tru, fls, fls, fls);
2412         let r: u32x4 = transmute(_mm_cmplt_ps(a, b));
2413         assert_eq!(r, e);
2414     }
2415
2416     #[simd_test(enable = "sse")]
2417     unsafe fn test_mm_cmple_ps() {
2418         let a = _mm_setr_ps(10.0, 50.0, 1.0, 4.0);
2419         let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
2420         let tru = !0u32;
2421         let fls = 0u32;
2422
2423         let e = u32x4::new(tru, fls, tru, fls);
2424         let r: u32x4 = transmute(_mm_cmple_ps(a, b));
2425         assert_eq!(r, e);
2426     }
2427
2428     #[simd_test(enable = "sse")]
2429     unsafe fn test_mm_cmpgt_ps() {
2430         let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2431         let b = _mm_setr_ps(15.0, 20.0, 1.0, 42.0);
2432         let tru = !0u32;
2433         let fls = 0u32;
2434
2435         let e = u32x4::new(fls, tru, fls, fls);
2436         let r: u32x4 = transmute(_mm_cmpgt_ps(a, b));
2437         assert_eq!(r, e);
2438     }
2439
2440     #[simd_test(enable = "sse")]
2441     unsafe fn test_mm_cmpge_ps() {
2442         let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2443         let b = _mm_setr_ps(15.0, 20.0, 1.0, 42.0);
2444         let tru = !0u32;
2445         let fls = 0u32;
2446
2447         let e = u32x4::new(fls, tru, tru, fls);
2448         let r: u32x4 = transmute(_mm_cmpge_ps(a, b));
2449         assert_eq!(r, e);
2450     }
2451
2452     #[simd_test(enable = "sse")]
2453     unsafe fn test_mm_cmpneq_ps() {
2454         let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2455         let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
2456         let tru = !0u32;
2457         let fls = 0u32;
2458
2459         let e = u32x4::new(tru, tru, fls, tru);
2460         let r: u32x4 = transmute(_mm_cmpneq_ps(a, b));
2461         assert_eq!(r, e);
2462     }
2463
2464     #[simd_test(enable = "sse")]
2465     unsafe fn test_mm_cmpnlt_ps() {
2466         let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2467         let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
2468         let tru = !0u32;
2469         let fls = 0u32;
2470
2471         let e = u32x4::new(fls, tru, tru, tru);
2472         let r: u32x4 = transmute(_mm_cmpnlt_ps(a, b));
2473         assert_eq!(r, e);
2474     }
2475
2476     #[simd_test(enable = "sse")]
2477     unsafe fn test_mm_cmpnle_ps() {
2478         let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2479         let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
2480         let tru = !0u32;
2481         let fls = 0u32;
2482
2483         let e = u32x4::new(fls, tru, fls, tru);
2484         let r: u32x4 = transmute(_mm_cmpnle_ps(a, b));
2485         assert_eq!(r, e);
2486     }
2487
2488     #[simd_test(enable = "sse")]
2489     unsafe fn test_mm_cmpngt_ps() {
2490         let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2491         let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
2492         let tru = !0u32;
2493         let fls = 0u32;
2494
2495         let e = u32x4::new(tru, fls, tru, tru);
2496         let r: u32x4 = transmute(_mm_cmpngt_ps(a, b));
2497         assert_eq!(r, e);
2498     }
2499
2500     #[simd_test(enable = "sse")]
2501     unsafe fn test_mm_cmpnge_ps() {
2502         let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2503         let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
2504         let tru = !0u32;
2505         let fls = 0u32;
2506
2507         let e = u32x4::new(tru, fls, fls, tru);
2508         let r: u32x4 = transmute(_mm_cmpnge_ps(a, b));
2509         assert_eq!(r, e);
2510     }
2511
2512     #[simd_test(enable = "sse")]
2513     unsafe fn test_mm_cmpord_ps() {
2514         let a = _mm_setr_ps(10.0, 50.0, NAN, NAN);
2515         let b = _mm_setr_ps(15.0, NAN, 1.0, NAN);
2516         let tru = !0u32;
2517         let fls = 0u32;
2518
2519         let e = u32x4::new(tru, fls, fls, fls);
2520         let r: u32x4 = transmute(_mm_cmpord_ps(a, b));
2521         assert_eq!(r, e);
2522     }
2523
2524     #[simd_test(enable = "sse")]
2525     unsafe fn test_mm_cmpunord_ps() {
2526         let a = _mm_setr_ps(10.0, 50.0, NAN, NAN);
2527         let b = _mm_setr_ps(15.0, NAN, 1.0, NAN);
2528         let tru = !0u32;
2529         let fls = 0u32;
2530
2531         let e = u32x4::new(fls, tru, tru, tru);
2532         let r: u32x4 = transmute(_mm_cmpunord_ps(a, b));
2533         assert_eq!(r, e);
2534     }
2535
2536     #[simd_test(enable = "sse")]
2537     unsafe fn test_mm_comieq_ss() {
2538         let aa = &[3.0f32, 12.0, 23.0, NAN];
2539         let bb = &[3.0f32, 47.5, 1.5, NAN];
2540
2541         let ee = &[1i32, 0, 0, 0];
2542
2543         for i in 0..4 {
2544             let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2545             let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2546
2547             let r = _mm_comieq_ss(a, b);
2548
2549             assert_eq!(
2550                 ee[i], r,
2551                 "_mm_comieq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2552                 a, b, r, ee[i], i
2553             );
2554         }
2555     }
2556
2557     #[simd_test(enable = "sse")]
2558     unsafe fn test_mm_comilt_ss() {
2559         let aa = &[3.0f32, 12.0, 23.0, NAN];
2560         let bb = &[3.0f32, 47.5, 1.5, NAN];
2561
2562         let ee = &[0i32, 1, 0, 0];
2563
2564         for i in 0..4 {
2565             let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2566             let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2567
2568             let r = _mm_comilt_ss(a, b);
2569
2570             assert_eq!(
2571                 ee[i], r,
2572                 "_mm_comilt_ss({:?}, {:?}) = {}, expected: {} (i={})",
2573                 a, b, r, ee[i], i
2574             );
2575         }
2576     }
2577
2578     #[simd_test(enable = "sse")]
2579     unsafe fn test_mm_comile_ss() {
2580         let aa = &[3.0f32, 12.0, 23.0, NAN];
2581         let bb = &[3.0f32, 47.5, 1.5, NAN];
2582
2583         let ee = &[1i32, 1, 0, 0];
2584
2585         for i in 0..4 {
2586             let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2587             let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2588
2589             let r = _mm_comile_ss(a, b);
2590
2591             assert_eq!(
2592                 ee[i], r,
2593                 "_mm_comile_ss({:?}, {:?}) = {}, expected: {} (i={})",
2594                 a, b, r, ee[i], i
2595             );
2596         }
2597     }
2598
2599     #[simd_test(enable = "sse")]
2600     unsafe fn test_mm_comigt_ss() {
2601         let aa = &[3.0f32, 12.0, 23.0, NAN];
2602         let bb = &[3.0f32, 47.5, 1.5, NAN];
2603
2604         let ee = &[1i32, 0, 1, 0];
2605
2606         for i in 0..4 {
2607             let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2608             let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2609
2610             let r = _mm_comige_ss(a, b);
2611
2612             assert_eq!(
2613                 ee[i], r,
2614                 "_mm_comige_ss({:?}, {:?}) = {}, expected: {} (i={})",
2615                 a, b, r, ee[i], i
2616             );
2617         }
2618     }
2619
2620     #[simd_test(enable = "sse")]
2621     unsafe fn test_mm_comineq_ss() {
2622         let aa = &[3.0f32, 12.0, 23.0, NAN];
2623         let bb = &[3.0f32, 47.5, 1.5, NAN];
2624
2625         let ee = &[0i32, 1, 1, 1];
2626
2627         for i in 0..4 {
2628             let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2629             let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2630
2631             let r = _mm_comineq_ss(a, b);
2632
2633             assert_eq!(
2634                 ee[i], r,
2635                 "_mm_comineq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2636                 a, b, r, ee[i], i
2637             );
2638         }
2639     }
2640
2641     #[simd_test(enable = "sse")]
2642     unsafe fn test_mm_ucomieq_ss() {
2643         let aa = &[3.0f32, 12.0, 23.0, NAN];
2644         let bb = &[3.0f32, 47.5, 1.5, NAN];
2645
2646         let ee = &[1i32, 0, 0, 0];
2647
2648         for i in 0..4 {
2649             let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2650             let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2651
2652             let r = _mm_ucomieq_ss(a, b);
2653
2654             assert_eq!(
2655                 ee[i], r,
2656                 "_mm_ucomieq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2657                 a, b, r, ee[i], i
2658             );
2659         }
2660     }
2661
2662     #[simd_test(enable = "sse")]
2663     unsafe fn test_mm_ucomilt_ss() {
2664         let aa = &[3.0f32, 12.0, 23.0, NAN];
2665         let bb = &[3.0f32, 47.5, 1.5, NAN];
2666
2667         let ee = &[0i32, 1, 0, 0];
2668
2669         for i in 0..4 {
2670             let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2671             let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2672
2673             let r = _mm_ucomilt_ss(a, b);
2674
2675             assert_eq!(
2676                 ee[i], r,
2677                 "_mm_ucomilt_ss({:?}, {:?}) = {}, expected: {} (i={})",
2678                 a, b, r, ee[i], i
2679             );
2680         }
2681     }
2682
2683     #[simd_test(enable = "sse")]
2684     unsafe fn test_mm_ucomile_ss() {
2685         let aa = &[3.0f32, 12.0, 23.0, NAN];
2686         let bb = &[3.0f32, 47.5, 1.5, NAN];
2687
2688         let ee = &[1i32, 1, 0, 0];
2689
2690         for i in 0..4 {
2691             let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2692             let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2693
2694             let r = _mm_ucomile_ss(a, b);
2695
2696             assert_eq!(
2697                 ee[i], r,
2698                 "_mm_ucomile_ss({:?}, {:?}) = {}, expected: {} (i={})",
2699                 a, b, r, ee[i], i
2700             );
2701         }
2702     }
2703
2704     #[simd_test(enable = "sse")]
2705     unsafe fn test_mm_ucomigt_ss() {
2706         let aa = &[3.0f32, 12.0, 23.0, NAN];
2707         let bb = &[3.0f32, 47.5, 1.5, NAN];
2708
2709         let ee = &[0i32, 0, 1, 0];
2710
2711         for i in 0..4 {
2712             let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2713             let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2714
2715             let r = _mm_ucomigt_ss(a, b);
2716
2717             assert_eq!(
2718                 ee[i], r,
2719                 "_mm_ucomigt_ss({:?}, {:?}) = {}, expected: {} (i={})",
2720                 a, b, r, ee[i], i
2721             );
2722         }
2723     }
2724
2725     #[simd_test(enable = "sse")]
2726     unsafe fn test_mm_ucomige_ss() {
2727         let aa = &[3.0f32, 12.0, 23.0, NAN];
2728         let bb = &[3.0f32, 47.5, 1.5, NAN];
2729
2730         let ee = &[1i32, 0, 1, 0];
2731
2732         for i in 0..4 {
2733             let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2734             let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2735
2736             let r = _mm_ucomige_ss(a, b);
2737
2738             assert_eq!(
2739                 ee[i], r,
2740                 "_mm_ucomige_ss({:?}, {:?}) = {}, expected: {} (i={})",
2741                 a, b, r, ee[i], i
2742             );
2743         }
2744     }
2745
2746     #[simd_test(enable = "sse")]
2747     unsafe fn test_mm_ucomineq_ss() {
2748         let aa = &[3.0f32, 12.0, 23.0, NAN];
2749         let bb = &[3.0f32, 47.5, 1.5, NAN];
2750
2751         let ee = &[0i32, 1, 1, 1];
2752
2753         for i in 0..4 {
2754             let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2755             let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2756
2757             let r = _mm_ucomineq_ss(a, b);
2758
2759             assert_eq!(
2760                 ee[i], r,
2761                 "_mm_ucomineq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2762                 a, b, r, ee[i], i
2763             );
2764         }
2765     }
2766
2767     #[simd_test(enable = "sse")]
2768     unsafe fn test_mm_comieq_ss_vs_ucomieq_ss() {
2769         // If one of the arguments is a quiet NaN `comieq_ss` should signal an
2770         // Invalid Operation Exception while `ucomieq_ss` should not.
2771         let aa = &[3.0f32, NAN, 23.0, NAN];
2772         let bb = &[3.0f32, 47.5, NAN, NAN];
2773
2774         let ee = &[1i32, 0, 0, 0];
2775         let exc = &[0u32, 1, 1, 1]; // Should comieq_ss signal an exception?
2776
2777         for i in 0..4 {
2778             let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2779             let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2780
2781             _MM_SET_EXCEPTION_STATE(0);
2782             let r1 = _mm_comieq_ss(*black_box(&a), b);
2783             let s1 = _MM_GET_EXCEPTION_STATE();
2784
2785             _MM_SET_EXCEPTION_STATE(0);
2786             let r2 = _mm_ucomieq_ss(*black_box(&a), b);
2787             let s2 = _MM_GET_EXCEPTION_STATE();
2788
2789             assert_eq!(
2790                 ee[i], r1,
2791                 "_mm_comeq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2792                 a, b, r1, ee[i], i
2793             );
2794             assert_eq!(
2795                 ee[i], r2,
2796                 "_mm_ucomeq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2797                 a, b, r2, ee[i], i
2798             );
2799             assert_eq!(
2800                 s1,
2801                 exc[i] * _MM_EXCEPT_INVALID,
2802                 "_mm_comieq_ss() set exception flags: {} (i={})",
2803                 s1,
2804                 i
2805             );
2806             assert_eq!(
2807                 s2,
2808                 0, // ucomieq_ss should not signal an exception
2809                 "_mm_ucomieq_ss() set exception flags: {} (i={})",
2810                 s2,
2811                 i
2812             );
2813         }
2814     }
2815
2816     #[simd_test(enable = "sse")]
2817     unsafe fn test_mm_cvtss_si32() {
2818         let inputs = &[42.0f32, -3.1, 4.0e10, 4.0e-20, NAN, 2147483500.1];
2819         let result = &[42i32, -3, i32::MIN, 0, i32::MIN, 2147483520];
2820         for i in 0..inputs.len() {
2821             let x = _mm_setr_ps(inputs[i], 1.0, 3.0, 4.0);
2822             let e = result[i];
2823             let r = _mm_cvtss_si32(x);
2824             assert_eq!(
2825                 e, r,
2826                 "TestCase #{} _mm_cvtss_si32({:?}) = {}, expected: {}",
2827                 i, x, r, e
2828             );
2829         }
2830     }
2831
2832     #[simd_test(enable = "sse")]
2833     unsafe fn test_mm_cvttss_si32() {
2834         let inputs = &[
2835             (42.0f32, 42i32),
2836             (-31.4, -31),
2837             (-33.5, -33),
2838             (-34.5, -34),
2839             (10.999, 10),
2840             (-5.99, -5),
2841             (4.0e10, i32::MIN),
2842             (4.0e-10, 0),
2843             (NAN, i32::MIN),
2844             (2147483500.1, 2147483520),
2845         ];
2846         for i in 0..inputs.len() {
2847             let (xi, e) = inputs[i];
2848             let x = _mm_setr_ps(xi, 1.0, 3.0, 4.0);
2849             let r = _mm_cvttss_si32(x);
2850             assert_eq!(
2851                 e, r,
2852                 "TestCase #{} _mm_cvttss_si32({:?}) = {}, expected: {}",
2853                 i, x, r, e
2854             );
2855         }
2856     }
2857
2858     #[simd_test(enable = "sse")]
2859     unsafe fn test_mm_cvtsi32_ss() {
2860         let inputs = &[
2861             (4555i32, 4555.0f32),
2862             (322223333, 322223330.0),
2863             (-432, -432.0),
2864             (-322223333, -322223330.0),
2865         ];
2866
2867         for i in 0..inputs.len() {
2868             let (x, f) = inputs[i];
2869             let a = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
2870             let r = _mm_cvtsi32_ss(a, x);
2871             let e = _mm_setr_ps(f, 6.0, 7.0, 8.0);
2872             assert_eq_m128(e, r);
2873         }
2874     }
2875
2876     #[simd_test(enable = "sse")]
2877     unsafe fn test_mm_cvtss_f32() {
2878         let a = _mm_setr_ps(312.0134, 5.0, 6.0, 7.0);
2879         assert_eq!(_mm_cvtss_f32(a), 312.0134);
2880     }
2881
2882     #[simd_test(enable = "sse")]
2883     unsafe fn test_mm_set_ss() {
2884         let r = _mm_set_ss(black_box(4.25));
2885         assert_eq_m128(r, _mm_setr_ps(4.25, 0.0, 0.0, 0.0));
2886     }
2887
2888     #[simd_test(enable = "sse")]
2889     unsafe fn test_mm_set1_ps() {
2890         let r1 = _mm_set1_ps(black_box(4.25));
2891         let r2 = _mm_set_ps1(black_box(4.25));
2892         assert_eq!(get_m128(r1, 0), 4.25);
2893         assert_eq!(get_m128(r1, 1), 4.25);
2894         assert_eq!(get_m128(r1, 2), 4.25);
2895         assert_eq!(get_m128(r1, 3), 4.25);
2896         assert_eq!(get_m128(r2, 0), 4.25);
2897         assert_eq!(get_m128(r2, 1), 4.25);
2898         assert_eq!(get_m128(r2, 2), 4.25);
2899         assert_eq!(get_m128(r2, 3), 4.25);
2900     }
2901
2902     #[simd_test(enable = "sse")]
2903     unsafe fn test_mm_set_ps() {
2904         let r = _mm_set_ps(
2905             black_box(1.0),
2906             black_box(2.0),
2907             black_box(3.0),
2908             black_box(4.0),
2909         );
2910         assert_eq!(get_m128(r, 0), 4.0);
2911         assert_eq!(get_m128(r, 1), 3.0);
2912         assert_eq!(get_m128(r, 2), 2.0);
2913         assert_eq!(get_m128(r, 3), 1.0);
2914     }
2915
2916     #[simd_test(enable = "sse")]
2917     unsafe fn test_mm_setr_ps() {
2918         let r = _mm_setr_ps(
2919             black_box(1.0),
2920             black_box(2.0),
2921             black_box(3.0),
2922             black_box(4.0),
2923         );
2924         assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 3.0, 4.0));
2925     }
2926
2927     #[simd_test(enable = "sse")]
2928     unsafe fn test_mm_setzero_ps() {
2929         let r = *black_box(&_mm_setzero_ps());
2930         assert_eq_m128(r, _mm_set1_ps(0.0));
2931     }
2932
2933     #[simd_test(enable = "sse")]
2934     unsafe fn test_mm_shuffle() {
2935         assert_eq!(_MM_SHUFFLE(0, 1, 1, 3), 0b00_01_01_11);
2936         assert_eq!(_MM_SHUFFLE(3, 1, 1, 0), 0b11_01_01_00);
2937         assert_eq!(_MM_SHUFFLE(1, 2, 2, 1), 0b01_10_10_01);
2938     }
2939
2940     #[simd_test(enable = "sse")]
2941     unsafe fn test_mm_shuffle_ps() {
2942         let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2943         let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
2944         let r = _mm_shuffle_ps::<0b00_01_01_11>(a, b);
2945         assert_eq_m128(r, _mm_setr_ps(4.0, 2.0, 6.0, 5.0));
2946     }
2947
2948     #[simd_test(enable = "sse")]
2949     unsafe fn test_mm_unpackhi_ps() {
2950         let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2951         let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
2952         let r = _mm_unpackhi_ps(a, b);
2953         assert_eq_m128(r, _mm_setr_ps(3.0, 7.0, 4.0, 8.0));
2954     }
2955
2956     #[simd_test(enable = "sse")]
2957     unsafe fn test_mm_unpacklo_ps() {
2958         let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2959         let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
2960         let r = _mm_unpacklo_ps(a, b);
2961         assert_eq_m128(r, _mm_setr_ps(1.0, 5.0, 2.0, 6.0));
2962     }
2963
2964     #[simd_test(enable = "sse")]
2965     unsafe fn test_mm_movehl_ps() {
2966         let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2967         let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
2968         let r = _mm_movehl_ps(a, b);
2969         assert_eq_m128(r, _mm_setr_ps(7.0, 8.0, 3.0, 4.0));
2970     }
2971
2972     #[simd_test(enable = "sse")]
2973     unsafe fn test_mm_movelh_ps() {
2974         let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2975         let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
2976         let r = _mm_movelh_ps(a, b);
2977         assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 5.0, 6.0));
2978     }
2979
2980     #[simd_test(enable = "sse")]
2981     unsafe fn test_mm_load_ss() {
2982         let a = 42.0f32;
2983         let r = _mm_load_ss(&a as *const f32);
2984         assert_eq_m128(r, _mm_setr_ps(42.0, 0.0, 0.0, 0.0));
2985     }
2986
2987     #[simd_test(enable = "sse")]
2988     unsafe fn test_mm_load1_ps() {
2989         let a = 42.0f32;
2990         let r = _mm_load1_ps(&a as *const f32);
2991         assert_eq_m128(r, _mm_setr_ps(42.0, 42.0, 42.0, 42.0));
2992     }
2993
2994     #[simd_test(enable = "sse")]
2995     unsafe fn test_mm_load_ps() {
2996         let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
2997
2998         let mut p = vals.as_ptr();
2999         let mut fixup = 0.0f32;
3000
3001         // Make sure p is aligned, otherwise we might get a
3002         // (signal: 11, SIGSEGV: invalid memory reference)
3003
3004         let unalignment = (p as usize) & 0xf;
3005         if unalignment != 0 {
3006             let delta = (16 - unalignment) >> 2;
3007             fixup = delta as f32;
3008             p = p.add(delta);
3009         }
3010
3011         let r = _mm_load_ps(p);
3012         let e = _mm_add_ps(_mm_setr_ps(1.0, 2.0, 3.0, 4.0), _mm_set1_ps(fixup));
3013         assert_eq_m128(r, e);
3014     }
3015
3016     #[simd_test(enable = "sse")]
3017     unsafe fn test_mm_loadu_ps() {
3018         let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
3019         let p = vals.as_ptr().add(3);
3020         let r = _mm_loadu_ps(black_box(p));
3021         assert_eq_m128(r, _mm_setr_ps(4.0, 5.0, 6.0, 7.0));
3022     }
3023
3024     #[simd_test(enable = "sse")]
3025     unsafe fn test_mm_loadr_ps() {
3026         let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
3027
3028         let mut p = vals.as_ptr();
3029         let mut fixup = 0.0f32;
3030
3031         // Make sure p is aligned, otherwise we might get a
3032         // (signal: 11, SIGSEGV: invalid memory reference)
3033
3034         let unalignment = (p as usize) & 0xf;
3035         if unalignment != 0 {
3036             let delta = (16 - unalignment) >> 2;
3037             fixup = delta as f32;
3038             p = p.add(delta);
3039         }
3040
3041         let r = _mm_loadr_ps(p);
3042         let e = _mm_add_ps(_mm_setr_ps(4.0, 3.0, 2.0, 1.0), _mm_set1_ps(fixup));
3043         assert_eq_m128(r, e);
3044     }
3045
3046     #[simd_test(enable = "sse2")]
3047     unsafe fn test_mm_loadu_si64() {
3048         let a = _mm_setr_epi64x(5, 6);
3049         let r = _mm_loadu_si64(&a as *const _ as *const _);
3050         assert_eq_m128i(r, _mm_setr_epi64x(5, 0));
3051     }
3052
3053     #[simd_test(enable = "sse")]
3054     unsafe fn test_mm_store_ss() {
3055         let mut vals = [0.0f32; 8];
3056         let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3057         _mm_store_ss(vals.as_mut_ptr().add(1), a);
3058
3059         assert_eq!(vals[0], 0.0);
3060         assert_eq!(vals[1], 1.0);
3061         assert_eq!(vals[2], 0.0);
3062     }
3063
3064     #[simd_test(enable = "sse")]
3065     unsafe fn test_mm_store1_ps() {
3066         let mut vals = [0.0f32; 8];
3067         let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3068
3069         let mut ofs = 0;
3070         let mut p = vals.as_mut_ptr();
3071
3072         if (p as usize) & 0xf != 0 {
3073             ofs = ((16 - (p as usize)) & 0xf) >> 2;
3074             p = p.add(ofs);
3075         }
3076
3077         _mm_store1_ps(p, *black_box(&a));
3078
3079         if ofs > 0 {
3080             assert_eq!(vals[ofs - 1], 0.0);
3081         }
3082         assert_eq!(vals[ofs + 0], 1.0);
3083         assert_eq!(vals[ofs + 1], 1.0);
3084         assert_eq!(vals[ofs + 2], 1.0);
3085         assert_eq!(vals[ofs + 3], 1.0);
3086         assert_eq!(vals[ofs + 4], 0.0);
3087     }
3088
3089     #[simd_test(enable = "sse")]
3090     unsafe fn test_mm_store_ps() {
3091         let mut vals = [0.0f32; 8];
3092         let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3093
3094         let mut ofs = 0;
3095         let mut p = vals.as_mut_ptr();
3096
3097         // Align p to 16-byte boundary
3098         if (p as usize) & 0xf != 0 {
3099             ofs = ((16 - (p as usize)) & 0xf) >> 2;
3100             p = p.add(ofs);
3101         }
3102
3103         _mm_store_ps(p, *black_box(&a));
3104
3105         if ofs > 0 {
3106             assert_eq!(vals[ofs - 1], 0.0);
3107         }
3108         assert_eq!(vals[ofs + 0], 1.0);
3109         assert_eq!(vals[ofs + 1], 2.0);
3110         assert_eq!(vals[ofs + 2], 3.0);
3111         assert_eq!(vals[ofs + 3], 4.0);
3112         assert_eq!(vals[ofs + 4], 0.0);
3113     }
3114
3115     #[simd_test(enable = "sse")]
3116     unsafe fn test_mm_storer_ps() {
3117         let mut vals = [0.0f32; 8];
3118         let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3119
3120         let mut ofs = 0;
3121         let mut p = vals.as_mut_ptr();
3122
3123         // Align p to 16-byte boundary
3124         if (p as usize) & 0xf != 0 {
3125             ofs = ((16 - (p as usize)) & 0xf) >> 2;
3126             p = p.add(ofs);
3127         }
3128
3129         _mm_storer_ps(p, *black_box(&a));
3130
3131         if ofs > 0 {
3132             assert_eq!(vals[ofs - 1], 0.0);
3133         }
3134         assert_eq!(vals[ofs + 0], 4.0);
3135         assert_eq!(vals[ofs + 1], 3.0);
3136         assert_eq!(vals[ofs + 2], 2.0);
3137         assert_eq!(vals[ofs + 3], 1.0);
3138         assert_eq!(vals[ofs + 4], 0.0);
3139     }
3140
3141     #[simd_test(enable = "sse")]
3142     unsafe fn test_mm_storeu_ps() {
3143         let mut vals = [0.0f32; 8];
3144         let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3145
3146         let mut ofs = 0;
3147         let mut p = vals.as_mut_ptr();
3148
3149         // Make sure p is **not** aligned to 16-byte boundary
3150         if (p as usize) & 0xf == 0 {
3151             ofs = 1;
3152             p = p.add(1);
3153         }
3154
3155         _mm_storeu_ps(p, *black_box(&a));
3156
3157         if ofs > 0 {
3158             assert_eq!(vals[ofs - 1], 0.0);
3159         }
3160         assert_eq!(vals[ofs + 0], 1.0);
3161         assert_eq!(vals[ofs + 1], 2.0);
3162         assert_eq!(vals[ofs + 2], 3.0);
3163         assert_eq!(vals[ofs + 3], 4.0);
3164         assert_eq!(vals[ofs + 4], 0.0);
3165     }
3166
3167     #[simd_test(enable = "sse")]
3168     unsafe fn test_mm_move_ss() {
3169         let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3170         let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3171
3172         let r = _mm_move_ss(a, b);
3173         let e = _mm_setr_ps(5.0, 2.0, 3.0, 4.0);
3174         assert_eq_m128(e, r);
3175     }
3176
3177     #[simd_test(enable = "sse")]
3178     unsafe fn test_mm_movemask_ps() {
3179         let r = _mm_movemask_ps(_mm_setr_ps(-1.0, 5.0, -5.0, 0.0));
3180         assert_eq!(r, 0b0101);
3181
3182         let r = _mm_movemask_ps(_mm_setr_ps(-1.0, -5.0, -5.0, 0.0));
3183         assert_eq!(r, 0b0111);
3184     }
3185
3186     #[simd_test(enable = "sse")]
3187     unsafe fn test_mm_sfence() {
3188         _mm_sfence();
3189     }
3190
3191     #[simd_test(enable = "sse")]
3192     unsafe fn test_mm_getcsr_setcsr_1() {
3193         let saved_csr = _mm_getcsr();
3194
3195         let a = _mm_setr_ps(1.1e-36, 0.0, 0.0, 1.0);
3196         let b = _mm_setr_ps(0.001, 0.0, 0.0, 1.0);
3197
3198         _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
3199         let r = _mm_mul_ps(*black_box(&a), *black_box(&b));
3200
3201         _mm_setcsr(saved_csr);
3202
3203         let exp = _mm_setr_ps(0.0, 0.0, 0.0, 1.0);
3204         assert_eq_m128(r, exp); // first component is a denormalized f32
3205     }
3206
3207     #[simd_test(enable = "sse")]
3208     unsafe fn test_mm_getcsr_setcsr_2() {
3209         // Same as _mm_setcsr_1 test, but with opposite flag value.
3210
3211         let saved_csr = _mm_getcsr();
3212
3213         let a = _mm_setr_ps(1.1e-36, 0.0, 0.0, 1.0);
3214         let b = _mm_setr_ps(0.001, 0.0, 0.0, 1.0);
3215
3216         _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF);
3217         let r = _mm_mul_ps(*black_box(&a), *black_box(&b));
3218
3219         _mm_setcsr(saved_csr);
3220
3221         let exp = _mm_setr_ps(1.1e-39, 0.0, 0.0, 1.0);
3222         assert_eq_m128(r, exp); // first component is a denormalized f32
3223     }
3224
3225     #[simd_test(enable = "sse")]
3226     unsafe fn test_mm_getcsr_setcsr_underflow() {
3227         _MM_SET_EXCEPTION_STATE(0);
3228
3229         let a = _mm_setr_ps(1.1e-36, 0.0, 0.0, 1.0);
3230         let b = _mm_setr_ps(1e-5, 0.0, 0.0, 1.0);
3231
3232         assert_eq!(_MM_GET_EXCEPTION_STATE(), 0); // just to be sure
3233
3234         let r = _mm_mul_ps(*black_box(&a), *black_box(&b));
3235
3236         let exp = _mm_setr_ps(1.1e-41, 0.0, 0.0, 1.0);
3237         assert_eq_m128(r, exp);
3238
3239         let underflow = _MM_GET_EXCEPTION_STATE() & _MM_EXCEPT_UNDERFLOW != 0;
3240         assert_eq!(underflow, true);
3241     }
3242
3243     #[simd_test(enable = "sse")]
3244     unsafe fn test_MM_TRANSPOSE4_PS() {
3245         let mut a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3246         let mut b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3247         let mut c = _mm_setr_ps(9.0, 10.0, 11.0, 12.0);
3248         let mut d = _mm_setr_ps(13.0, 14.0, 15.0, 16.0);
3249
3250         _MM_TRANSPOSE4_PS(&mut a, &mut b, &mut c, &mut d);
3251
3252         assert_eq_m128(a, _mm_setr_ps(1.0, 5.0, 9.0, 13.0));
3253         assert_eq_m128(b, _mm_setr_ps(2.0, 6.0, 10.0, 14.0));
3254         assert_eq_m128(c, _mm_setr_ps(3.0, 7.0, 11.0, 15.0));
3255         assert_eq_m128(d, _mm_setr_ps(4.0, 8.0, 12.0, 16.0));
3256     }
3257
3258     #[repr(align(16))]
3259     struct Memory {
3260         pub data: [f32; 4],
3261     }
3262
3263     #[simd_test(enable = "sse")]
3264     unsafe fn test_mm_stream_ps() {
3265         let a = _mm_set1_ps(7.0);
3266         let mut mem = Memory { data: [-1.0; 4] };
3267
3268         _mm_stream_ps(&mut mem.data[0] as *mut f32, a);
3269         for i in 0..4 {
3270             assert_eq!(mem.data[i], get_m128(a, i));
3271         }
3272     }
3273 }