1 //! Streaming SIMD Extensions (SSE)
4 core_arch
::{simd::*, simd_llvm::*, x86::*}
,
9 use stdarch_test
::assert_instr
;
11 /// Adds the first component of `a` and `b`, the other components are copied
12 /// from `a`.
14 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ss)
16 #[target_feature(enable = "sse")]
17 #[cfg_attr(test, assert_instr(addss))]
18 #[stable(feature = "simd_x86", since = "1.27.0")]
19 pub unsafe fn _mm_add_ss(a
: __m128
, b
: __m128
) -> __m128
{
23 /// Adds __m128 vectors.
25 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps)
27 #[target_feature(enable = "sse")]
28 #[cfg_attr(test, assert_instr(addps))]
29 #[stable(feature = "simd_x86", since = "1.27.0")]
30 pub unsafe fn _mm_add_ps(a
: __m128
, b
: __m128
) -> __m128
{
34 /// Subtracts the first component of `b` from `a`, the other components are
35 /// copied from `a`.
37 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss)
39 #[target_feature(enable = "sse")]
40 #[cfg_attr(test, assert_instr(subss))]
41 #[stable(feature = "simd_x86", since = "1.27.0")]
42 pub unsafe fn _mm_sub_ss(a
: __m128
, b
: __m128
) -> __m128
{
46 /// Subtracts __m128 vectors.
48 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ps)
50 #[target_feature(enable = "sse")]
51 #[cfg_attr(test, assert_instr(subps))]
52 #[stable(feature = "simd_x86", since = "1.27.0")]
53 pub unsafe fn _mm_sub_ps(a
: __m128
, b
: __m128
) -> __m128
{
57 /// Multiplies the first component of `a` and `b`, the other components are
58 /// copied from `a`.
60 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss)
62 #[target_feature(enable = "sse")]
63 #[cfg_attr(test, assert_instr(mulss))]
64 #[stable(feature = "simd_x86", since = "1.27.0")]
65 pub unsafe fn _mm_mul_ss(a
: __m128
, b
: __m128
) -> __m128
{
69 /// Multiplies __m128 vectors.
71 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps)
73 #[target_feature(enable = "sse")]
74 #[cfg_attr(test, assert_instr(mulps))]
75 #[stable(feature = "simd_x86", since = "1.27.0")]
76 pub unsafe fn _mm_mul_ps(a
: __m128
, b
: __m128
) -> __m128
{
80 /// Divides the first component of `a` by `b`, the other components are
81 /// copied from `a`.
83 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ss)
85 #[target_feature(enable = "sse")]
86 #[cfg_attr(test, assert_instr(divss))]
87 #[stable(feature = "simd_x86", since = "1.27.0")]
88 pub unsafe fn _mm_div_ss(a
: __m128
, b
: __m128
) -> __m128
{
92 /// Divides __m128 vectors.
94 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps)
96 #[target_feature(enable = "sse")]
97 #[cfg_attr(test, assert_instr(divps))]
98 #[stable(feature = "simd_x86", since = "1.27.0")]
99 pub unsafe fn _mm_div_ps(a
: __m128
, b
: __m128
) -> __m128
{
103 /// Returns the square root of the first single-precision (32-bit)
104 /// floating-point element in `a`, the other elements are unchanged.
106 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ss)
108 #[target_feature(enable = "sse")]
109 #[cfg_attr(test, assert_instr(sqrtss))]
110 #[stable(feature = "simd_x86", since = "1.27.0")]
111 pub unsafe fn _mm_sqrt_ss(a
: __m128
) -> __m128
{
115 /// Returns the square root of packed single-precision (32-bit) floating-point
116 /// elements in `a`.
118 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps)
120 #[target_feature(enable = "sse")]
121 #[cfg_attr(test, assert_instr(sqrtps))]
122 #[stable(feature = "simd_x86", since = "1.27.0")]
123 pub unsafe fn _mm_sqrt_ps(a
: __m128
) -> __m128
{
127 /// Returns the approximate reciprocal of the first single-precision
128 /// (32-bit) floating-point element in `a`, the other elements are unchanged.
130 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss)
132 #[target_feature(enable = "sse")]
133 #[cfg_attr(test, assert_instr(rcpss))]
134 #[stable(feature = "simd_x86", since = "1.27.0")]
135 pub unsafe fn _mm_rcp_ss(a
: __m128
) -> __m128
{
139 /// Returns the approximate reciprocal of packed single-precision (32-bit)
140 /// floating-point elements in `a`.
142 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps)
144 #[target_feature(enable = "sse")]
145 #[cfg_attr(test, assert_instr(rcpps))]
146 #[stable(feature = "simd_x86", since = "1.27.0")]
147 pub unsafe fn _mm_rcp_ps(a
: __m128
) -> __m128
{
151 /// Returns the approximate reciprocal square root of the first single-precision
152 /// (32-bit) floating-point element in `a`, the other elements are unchanged.
154 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss)
156 #[target_feature(enable = "sse")]
157 #[cfg_attr(test, assert_instr(rsqrtss))]
158 #[stable(feature = "simd_x86", since = "1.27.0")]
159 pub unsafe fn _mm_rsqrt_ss(a
: __m128
) -> __m128
{
163 /// Returns the approximate reciprocal square root of packed single-precision
164 /// (32-bit) floating-point elements in `a`.
166 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps)
168 #[target_feature(enable = "sse")]
169 #[cfg_attr(test, assert_instr(rsqrtps))]
170 #[stable(feature = "simd_x86", since = "1.27.0")]
171 pub unsafe fn _mm_rsqrt_ps(a
: __m128
) -> __m128
{
175 /// Compares the first single-precision (32-bit) floating-point element of `a`
176 /// and `b`, and returns the minimum value in the first element of the return
177 /// value, the other elements are copied from `a`.
179 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ss)
181 #[target_feature(enable = "sse")]
182 #[cfg_attr(test, assert_instr(minss))]
183 #[stable(feature = "simd_x86", since = "1.27.0")]
184 pub unsafe fn _mm_min_ss(a
: __m128
, b
: __m128
) -> __m128
{
188 /// Compares packed single-precision (32-bit) floating-point elements in `a` and
189 /// `b`, and returns the corresponding minimum values.
191 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps)
193 #[target_feature(enable = "sse")]
194 #[cfg_attr(test, assert_instr(minps))]
195 #[stable(feature = "simd_x86", since = "1.27.0")]
196 pub unsafe fn _mm_min_ps(a
: __m128
, b
: __m128
) -> __m128
{
197 // See the `test_mm_min_ps` test for why this can't be implemented using `simd_fmin`.
201 /// Compares the first single-precision (32-bit) floating-point element of `a`
202 /// and `b`, and returns the maximum value in the first element of the return
203 /// value, the other elements are copied from `a`.
205 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ss)
207 #[target_feature(enable = "sse")]
208 #[cfg_attr(test, assert_instr(maxss))]
209 #[stable(feature = "simd_x86", since = "1.27.0")]
210 pub unsafe fn _mm_max_ss(a
: __m128
, b
: __m128
) -> __m128
{
214 /// Compares packed single-precision (32-bit) floating-point elements in `a` and
215 /// `b`, and returns the corresponding maximum values.
217 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps)
219 #[target_feature(enable = "sse")]
220 #[cfg_attr(test, assert_instr(maxps))]
221 #[stable(feature = "simd_x86", since = "1.27.0")]
222 pub unsafe fn _mm_max_ps(a
: __m128
, b
: __m128
) -> __m128
{
223 // See the `test_mm_min_ps` test for why this can't be implemented using `simd_fmax`.
227 /// Bitwise AND of packed single-precision (32-bit) floating-point elements.
229 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps)
231 #[target_feature(enable = "sse")]
232 // i586 only seems to generate plain `and` instructions, so ignore it.
234 all(test
, any(target_arch
= "x86_64", target_feature
= "sse2")),
237 #[stable(feature = "simd_x86", since = "1.27.0")]
238 pub unsafe fn _mm_and_ps(a
: __m128
, b
: __m128
) -> __m128
{
239 let a
: __m128i
= mem
::transmute(a
);
240 let b
: __m128i
= mem
::transmute(b
);
241 mem
::transmute(simd_and(a
, b
))
244 /// Bitwise AND-NOT of packed single-precision (32-bit) floating-point
245 /// elements.
247 /// Computes `!a & b` for each bit in `a` and `b`.
249 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps)
251 #[target_feature(enable = "sse")]
252 // i586 only seems to generate plain `not` and `and` instructions, so ignore
253 // it.
255 all(test
, any(target_arch
= "x86_64", target_feature
= "sse2")),
258 #[stable(feature = "simd_x86", since = "1.27.0")]
259 pub unsafe fn _mm_andnot_ps(a
: __m128
, b
: __m128
) -> __m128
{
260 let a
: __m128i
= mem
::transmute(a
);
261 let b
: __m128i
= mem
::transmute(b
);
262 let mask
: __m128i
= mem
::transmute(i32x4
::splat(-1));
263 mem
::transmute(simd_and(simd_xor(mask
, a
), b
))
266 /// Bitwise OR of packed single-precision (32-bit) floating-point elements.
268 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_ps)
270 #[target_feature(enable = "sse")]
271 // i586 only seems to generate plain `or` instructions, so we ignore it.
273 all(test
, any(target_arch
= "x86_64", target_feature
= "sse2")),
276 #[stable(feature = "simd_x86", since = "1.27.0")]
277 pub unsafe fn _mm_or_ps(a
: __m128
, b
: __m128
) -> __m128
{
278 let a
: __m128i
= mem
::transmute(a
);
279 let b
: __m128i
= mem
::transmute(b
);
280 mem
::transmute(simd_or(a
, b
))
283 /// Bitwise exclusive OR of packed single-precision (32-bit) floating-point
284 /// elements.
286 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_ps)
288 #[target_feature(enable = "sse")]
289 // i586 only seems to generate plain `xor` instructions, so we ignore it.
291 all(test
, any(target_arch
= "x86_64", target_feature
= "sse2")),
294 #[stable(feature = "simd_x86", since = "1.27.0")]
295 pub unsafe fn _mm_xor_ps(a
: __m128
, b
: __m128
) -> __m128
{
296 let a
: __m128i
= mem
::transmute(a
);
297 let b
: __m128i
= mem
::transmute(b
);
298 mem
::transmute(simd_xor(a
, b
))
301 /// Compares the lowest `f32` of both inputs for equality. The lowest 32 bits of
302 /// the result will be `0xffffffff` if the two inputs are equal, or `0`
303 /// otherwise. The upper 96 bits of the result are the upper 96 bits of `a`.
305 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ss)
307 #[target_feature(enable = "sse")]
308 #[cfg_attr(test, assert_instr(cmpeqss))]
309 #[stable(feature = "simd_x86", since = "1.27.0")]
310 pub unsafe fn _mm_cmpeq_ss(a
: __m128
, b
: __m128
) -> __m128
{
314 /// Compares the lowest `f32` of both inputs for less than. The lowest 32 bits
315 /// of the result will be `0xffffffff` if `a.extract(0)` is less than
316 /// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
317 /// upper 96 bits of `a`.
319 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ss)
321 #[target_feature(enable = "sse")]
322 #[cfg_attr(test, assert_instr(cmpltss))]
323 #[stable(feature = "simd_x86", since = "1.27.0")]
324 pub unsafe fn _mm_cmplt_ss(a
: __m128
, b
: __m128
) -> __m128
{
328 /// Compares the lowest `f32` of both inputs for less than or equal. The lowest
329 /// 32 bits of the result will be `0xffffffff` if `a.extract(0)` is less than
330 /// or equal to `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result
331 /// are the upper 96 bits of `a`.
333 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ss)
335 #[target_feature(enable = "sse")]
336 #[cfg_attr(test, assert_instr(cmpless))]
337 #[stable(feature = "simd_x86", since = "1.27.0")]
338 pub unsafe fn _mm_cmple_ss(a
: __m128
, b
: __m128
) -> __m128
{
342 /// Compares the lowest `f32` of both inputs for greater than. The lowest 32
343 /// bits of the result will be `0xffffffff` if `a.extract(0)` is greater
344 /// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result
345 /// are the upper 96 bits of `a`.
347 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ss)
349 #[target_feature(enable = "sse")]
350 #[cfg_attr(test, assert_instr(cmpltss))]
351 #[stable(feature = "simd_x86", since = "1.27.0")]
352 pub unsafe fn _mm_cmpgt_ss(a
: __m128
, b
: __m128
) -> __m128
{
353 simd_shuffle
!(a
, cmpss(b
, a
, 1), [4, 1, 2, 3])
356 /// Compares the lowest `f32` of both inputs for greater than or equal. The
357 /// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is
358 /// greater than or equal to `b.extract(0)`, or `0` otherwise. The upper 96 bits
359 /// of the result are the upper 96 bits of `a`.
361 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ss)
363 #[target_feature(enable = "sse")]
364 #[cfg_attr(test, assert_instr(cmpless))]
365 #[stable(feature = "simd_x86", since = "1.27.0")]
366 pub unsafe fn _mm_cmpge_ss(a
: __m128
, b
: __m128
) -> __m128
{
367 simd_shuffle
!(a
, cmpss(b
, a
, 2), [4, 1, 2, 3])
370 /// Compares the lowest `f32` of both inputs for inequality. The lowest 32 bits
371 /// of the result will be `0xffffffff` if `a.extract(0)` is not equal to
372 /// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
373 /// upper 96 bits of `a`.
375 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ss)
377 #[target_feature(enable = "sse")]
378 #[cfg_attr(test, assert_instr(cmpneqss))]
379 #[stable(feature = "simd_x86", since = "1.27.0")]
380 pub unsafe fn _mm_cmpneq_ss(a
: __m128
, b
: __m128
) -> __m128
{
384 /// Compares the lowest `f32` of both inputs for not-less-than. The lowest 32
385 /// bits of the result will be `0xffffffff` if `a.extract(0)` is not less than
386 /// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
387 /// upper 96 bits of `a`.
389 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ss)
391 #[target_feature(enable = "sse")]
392 #[cfg_attr(test, assert_instr(cmpnltss))]
393 #[stable(feature = "simd_x86", since = "1.27.0")]
394 pub unsafe fn _mm_cmpnlt_ss(a
: __m128
, b
: __m128
) -> __m128
{
398 /// Compares the lowest `f32` of both inputs for not-less-than-or-equal. The
399 /// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not
400 /// less than or equal to `b.extract(0)`, or `0` otherwise. The upper 96 bits
401 /// of the result are the upper 96 bits of `a`.
403 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ss)
405 #[target_feature(enable = "sse")]
406 #[cfg_attr(test, assert_instr(cmpnless))]
407 #[stable(feature = "simd_x86", since = "1.27.0")]
408 pub unsafe fn _mm_cmpnle_ss(a
: __m128
, b
: __m128
) -> __m128
{
412 /// Compares the lowest `f32` of both inputs for not-greater-than. The lowest 32
413 /// bits of the result will be `0xffffffff` if `a.extract(0)` is not greater
414 /// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are
415 /// the upper 96 bits of `a`.
417 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ss)
419 #[target_feature(enable = "sse")]
420 #[cfg_attr(test, assert_instr(cmpnltss))]
421 #[stable(feature = "simd_x86", since = "1.27.0")]
422 pub unsafe fn _mm_cmpngt_ss(a
: __m128
, b
: __m128
) -> __m128
{
423 simd_shuffle
!(a
, cmpss(b
, a
, 5), [4, 1, 2, 3])
426 /// Compares the lowest `f32` of both inputs for not-greater-than-or-equal. The
427 /// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not
428 /// greater than or equal to `b.extract(0)`, or `0` otherwise. The upper 96
429 /// bits of the result are the upper 96 bits of `a`.
431 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ss)
433 #[target_feature(enable = "sse")]
434 #[cfg_attr(test, assert_instr(cmpnless))]
435 #[stable(feature = "simd_x86", since = "1.27.0")]
436 pub unsafe fn _mm_cmpnge_ss(a
: __m128
, b
: __m128
) -> __m128
{
437 simd_shuffle
!(a
, cmpss(b
, a
, 6), [4, 1, 2, 3])
440 /// Checks if the lowest `f32` of both inputs are ordered. The lowest 32 bits of
441 /// the result will be `0xffffffff` if neither `a.extract(0)` nor
442 /// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result
443 /// are the upper 96 bits of `a`.
445 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ss)
447 #[target_feature(enable = "sse")]
448 #[cfg_attr(test, assert_instr(cmpordss))]
449 #[stable(feature = "simd_x86", since = "1.27.0")]
450 pub unsafe fn _mm_cmpord_ss(a
: __m128
, b
: __m128
) -> __m128
{
454 /// Checks if the lowest `f32` of both inputs are unordered. The lowest 32 bits
455 /// of the result will be `0xffffffff` if either `a.extract(0)` or
456 /// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result
457 /// are the upper 96 bits of `a`.
459 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ss)
461 #[target_feature(enable = "sse")]
462 #[cfg_attr(test, assert_instr(cmpunordss))]
463 #[stable(feature = "simd_x86", since = "1.27.0")]
464 pub unsafe fn _mm_cmpunord_ss(a
: __m128
, b
: __m128
) -> __m128
{
468 /// Compares each of the four floats in `a` to the corresponding element in `b`.
469 /// The result in the output vector will be `0xffffffff` if the input elements
470 /// were equal, or `0` otherwise.
472 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ps)
474 #[target_feature(enable = "sse")]
475 #[cfg_attr(test, assert_instr(cmpeqps))]
476 #[stable(feature = "simd_x86", since = "1.27.0")]
477 pub unsafe fn _mm_cmpeq_ps(a
: __m128
, b
: __m128
) -> __m128
{
481 /// Compares each of the four floats in `a` to the corresponding element in `b`.
482 /// The result in the output vector will be `0xffffffff` if the input element
483 /// in `a` is less than the corresponding element in `b`, or `0` otherwise.
485 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ps)
487 #[target_feature(enable = "sse")]
488 #[cfg_attr(test, assert_instr(cmpltps))]
489 #[stable(feature = "simd_x86", since = "1.27.0")]
490 pub unsafe fn _mm_cmplt_ps(a
: __m128
, b
: __m128
) -> __m128
{
494 /// Compares each of the four floats in `a` to the corresponding element in `b`.
495 /// The result in the output vector will be `0xffffffff` if the input element
496 /// in `a` is less than or equal to the corresponding element in `b`, or `0`
497 /// otherwise.
499 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ps)
501 #[target_feature(enable = "sse")]
502 #[cfg_attr(test, assert_instr(cmpleps))]
503 #[stable(feature = "simd_x86", since = "1.27.0")]
504 pub unsafe fn _mm_cmple_ps(a
: __m128
, b
: __m128
) -> __m128
{
508 /// Compares each of the four floats in `a` to the corresponding element in `b`.
509 /// The result in the output vector will be `0xffffffff` if the input element
510 /// in `a` is greater than the corresponding element in `b`, or `0` otherwise.
512 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ps)
514 #[target_feature(enable = "sse")]
515 #[cfg_attr(test, assert_instr(cmpltps))]
516 #[stable(feature = "simd_x86", since = "1.27.0")]
517 pub unsafe fn _mm_cmpgt_ps(a
: __m128
, b
: __m128
) -> __m128
{
521 /// Compares each of the four floats in `a` to the corresponding element in `b`.
522 /// The result in the output vector will be `0xffffffff` if the input element
523 /// in `a` is greater than or equal to the corresponding element in `b`, or `0`
524 /// otherwise.
526 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ps)
528 #[target_feature(enable = "sse")]
529 #[cfg_attr(test, assert_instr(cmpleps))]
530 #[stable(feature = "simd_x86", since = "1.27.0")]
531 pub unsafe fn _mm_cmpge_ps(a
: __m128
, b
: __m128
) -> __m128
{
535 /// Compares each of the four floats in `a` to the corresponding element in `b`.
536 /// The result in the output vector will be `0xffffffff` if the input elements
537 /// are **not** equal, or `0` otherwise.
539 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ps)
541 #[target_feature(enable = "sse")]
542 #[cfg_attr(test, assert_instr(cmpneqps))]
543 #[stable(feature = "simd_x86", since = "1.27.0")]
544 pub unsafe fn _mm_cmpneq_ps(a
: __m128
, b
: __m128
) -> __m128
{
548 /// Compares each of the four floats in `a` to the corresponding element in `b`.
549 /// The result in the output vector will be `0xffffffff` if the input element
550 /// in `a` is **not** less than the corresponding element in `b`, or `0`
551 /// otherwise.
553 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ps)
555 #[target_feature(enable = "sse")]
556 #[cfg_attr(test, assert_instr(cmpnltps))]
557 #[stable(feature = "simd_x86", since = "1.27.0")]
558 pub unsafe fn _mm_cmpnlt_ps(a
: __m128
, b
: __m128
) -> __m128
{
562 /// Compares each of the four floats in `a` to the corresponding element in `b`.
563 /// The result in the output vector will be `0xffffffff` if the input element
564 /// in `a` is **not** less than or equal to the corresponding element in `b`, or
565 /// `0` otherwise.
567 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ps)
569 #[target_feature(enable = "sse")]
570 #[cfg_attr(test, assert_instr(cmpnleps))]
571 #[stable(feature = "simd_x86", since = "1.27.0")]
572 pub unsafe fn _mm_cmpnle_ps(a
: __m128
, b
: __m128
) -> __m128
{
576 /// Compares each of the four floats in `a` to the corresponding element in `b`.
577 /// The result in the output vector will be `0xffffffff` if the input element
578 /// in `a` is **not** greater than the corresponding element in `b`, or `0`
579 /// otherwise.
581 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ps)
583 #[target_feature(enable = "sse")]
584 #[cfg_attr(test, assert_instr(cmpnltps))]
585 #[stable(feature = "simd_x86", since = "1.27.0")]
586 pub unsafe fn _mm_cmpngt_ps(a
: __m128
, b
: __m128
) -> __m128
{
590 /// Compares each of the four floats in `a` to the corresponding element in `b`.
591 /// The result in the output vector will be `0xffffffff` if the input element
592 /// in `a` is **not** greater than or equal to the corresponding element in `b`,
593 /// or `0` otherwise.
595 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ps)
597 #[target_feature(enable = "sse")]
598 #[cfg_attr(test, assert_instr(cmpnleps))]
599 #[stable(feature = "simd_x86", since = "1.27.0")]
600 pub unsafe fn _mm_cmpnge_ps(a
: __m128
, b
: __m128
) -> __m128
{
604 /// Compares each of the four floats in `a` to the corresponding element in `b`.
605 /// Returns four floats that have one of two possible bit patterns. The element
606 /// in the output vector will be `0xffffffff` if the input elements in `a` and
607 /// `b` are ordered (i.e., neither of them is a NaN), or 0 otherwise.
609 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ps)
611 #[target_feature(enable = "sse")]
612 #[cfg_attr(test, assert_instr(cmpordps))]
613 #[stable(feature = "simd_x86", since = "1.27.0")]
614 pub unsafe fn _mm_cmpord_ps(a
: __m128
, b
: __m128
) -> __m128
{
618 /// Compares each of the four floats in `a` to the corresponding element in `b`.
619 /// Returns four floats that have one of two possible bit patterns. The element
620 /// in the output vector will be `0xffffffff` if the input elements in `a` and
621 /// `b` are unordered (i.e., at least one of them is a NaN), or 0 otherwise.
623 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ps)
625 #[target_feature(enable = "sse")]
626 #[cfg_attr(test, assert_instr(cmpunordps))]
627 #[stable(feature = "simd_x86", since = "1.27.0")]
628 pub unsafe fn _mm_cmpunord_ps(a
: __m128
, b
: __m128
) -> __m128
{
632 /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
633 /// `1` if they are equal, or `0` otherwise.
635 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_ss)
637 #[target_feature(enable = "sse")]
638 #[cfg_attr(test, assert_instr(comiss))]
639 #[stable(feature = "simd_x86", since = "1.27.0")]
640 pub unsafe fn _mm_comieq_ss(a
: __m128
, b
: __m128
) -> i32 {
644 /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
645 /// `1` if the value from `a` is less than the one from `b`, or `0` otherwise.
647 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_ss)
649 #[target_feature(enable = "sse")]
650 #[cfg_attr(test, assert_instr(comiss))]
651 #[stable(feature = "simd_x86", since = "1.27.0")]
652 pub unsafe fn _mm_comilt_ss(a
: __m128
, b
: __m128
) -> i32 {
656 /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
657 /// `1` if the value from `a` is less than or equal to the one from `b`, or `0`
658 /// otherwise.
660 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_ss)
662 #[target_feature(enable = "sse")]
663 #[cfg_attr(test, assert_instr(comiss))]
664 #[stable(feature = "simd_x86", since = "1.27.0")]
665 pub unsafe fn _mm_comile_ss(a
: __m128
, b
: __m128
) -> i32 {
669 /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
670 /// `1` if the value from `a` is greater than the one from `b`, or `0`
671 /// otherwise.
673 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_ss)
675 #[target_feature(enable = "sse")]
676 #[cfg_attr(test, assert_instr(comiss))]
677 #[stable(feature = "simd_x86", since = "1.27.0")]
678 pub unsafe fn _mm_comigt_ss(a
: __m128
, b
: __m128
) -> i32 {
682 /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
683 /// `1` if the value from `a` is greater than or equal to the one from `b`, or
684 /// `0` otherwise.
686 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_ss)
688 #[target_feature(enable = "sse")]
689 #[cfg_attr(test, assert_instr(comiss))]
690 #[stable(feature = "simd_x86", since = "1.27.0")]
691 pub unsafe fn _mm_comige_ss(a
: __m128
, b
: __m128
) -> i32 {
695 /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
696 /// `1` if they are **not** equal, or `0` otherwise.
698 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_ss)
700 #[target_feature(enable = "sse")]
701 #[cfg_attr(test, assert_instr(comiss))]
702 #[stable(feature = "simd_x86", since = "1.27.0")]
703 pub unsafe fn _mm_comineq_ss(a
: __m128
, b
: __m128
) -> i32 {
707 /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
708 /// `1` if they are equal, or `0` otherwise. This instruction will not signal
709 /// an exception if either argument is a quiet NaN.
711 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomieq_ss)
713 #[target_feature(enable = "sse")]
714 #[cfg_attr(test, assert_instr(ucomiss))]
715 #[stable(feature = "simd_x86", since = "1.27.0")]
716 pub unsafe fn _mm_ucomieq_ss(a
: __m128
, b
: __m128
) -> i32 {
720 /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
721 /// `1` if the value from `a` is less than the one from `b`, or `0` otherwise.
722 /// This instruction will not signal an exception if either argument is a quiet
723 /// NaN.
725 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomilt_ss)
727 #[target_feature(enable = "sse")]
728 #[cfg_attr(test, assert_instr(ucomiss))]
729 #[stable(feature = "simd_x86", since = "1.27.0")]
730 pub unsafe fn _mm_ucomilt_ss(a
: __m128
, b
: __m128
) -> i32 {
734 /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
735 /// `1` if the value from `a` is less than or equal to the one from `b`, or `0`
736 /// otherwise. This instruction will not signal an exception if either argument
737 /// is a quiet NaN.
739 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomile_ss)
741 #[target_feature(enable = "sse")]
742 #[cfg_attr(test, assert_instr(ucomiss))]
743 #[stable(feature = "simd_x86", since = "1.27.0")]
744 pub unsafe fn _mm_ucomile_ss(a
: __m128
, b
: __m128
) -> i32 {
748 /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
749 /// `1` if the value from `a` is greater than the one from `b`, or `0`
750 /// otherwise. This instruction will not signal an exception if either argument
751 /// is a quiet NaN.
753 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomigt_ss)
755 #[target_feature(enable = "sse")]
756 #[cfg_attr(test, assert_instr(ucomiss))]
757 #[stable(feature = "simd_x86", since = "1.27.0")]
758 pub unsafe fn _mm_ucomigt_ss(a
: __m128
, b
: __m128
) -> i32 {
762 /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
763 /// `1` if the value from `a` is greater than or equal to the one from `b`, or
764 /// `0` otherwise. This instruction will not signal an exception if either
765 /// argument is a quiet NaN.
767 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomige_ss)
769 #[target_feature(enable = "sse")]
770 #[cfg_attr(test, assert_instr(ucomiss))]
771 #[stable(feature = "simd_x86", since = "1.27.0")]
772 pub unsafe fn _mm_ucomige_ss(a
: __m128
, b
: __m128
) -> i32 {
776 /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
777 /// `1` if they are **not** equal, or `0` otherwise. This instruction will not
778 /// signal an exception if either argument is a quiet NaN.
780 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomineq_ss)
782 #[target_feature(enable = "sse")]
783 #[cfg_attr(test, assert_instr(ucomiss))]
784 #[stable(feature = "simd_x86", since = "1.27.0")]
785 pub unsafe fn _mm_ucomineq_ss(a
: __m128
, b
: __m128
) -> i32 {
789 /// Converts the lowest 32 bit float in the input vector to a 32 bit integer.
791 /// The result is rounded according to the current rounding mode. If the result
792 /// cannot be represented as a 32 bit integer the result will be `0x8000_0000`
793 /// (`i32::MIN`) or an invalid operation floating point exception if
794 /// unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)).
796 /// This corresponds to the `CVTSS2SI` instruction (with 32 bit output).
798 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si32)
800 #[target_feature(enable = "sse")]
801 #[cfg_attr(test, assert_instr(cvtss2si))]
802 #[stable(feature = "simd_x86", since = "1.27.0")]
803 pub unsafe fn _mm_cvtss_si32(a
: __m128
) -> i32 {
807 /// Alias for [`_mm_cvtss_si32`](fn._mm_cvtss_si32.html).
809 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si)
811 #[target_feature(enable = "sse")]
812 #[cfg_attr(test, assert_instr(cvtss2si))]
813 #[stable(feature = "simd_x86", since = "1.27.0")]
814 pub unsafe fn _mm_cvt_ss2si(a
: __m128
) -> i32 {
818 /// Converts the lowest 32 bit float in the input vector to a 32 bit integer
822 /// The result is rounded always using truncation (round towards zero). If the
823 /// result cannot be represented as a 32 bit integer the result will be
824 /// `0x8000_0000` (`i32::MIN`) or an invalid operation floating point
825 /// exception if unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)).
827 /// This corresponds to the `CVTTSS2SI` instruction (with 32 bit output).
829 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si32)
831 #[target_feature(enable = "sse")]
832 #[cfg_attr(test, assert_instr(cvttss2si))]
833 #[stable(feature = "simd_x86", since = "1.27.0")]
834 pub unsafe fn _mm_cvttss_si32(a
: __m128
) -> i32 {
838 /// Alias for [`_mm_cvttss_si32`](fn._mm_cvttss_si32.html).
840 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si)
842 #[target_feature(enable = "sse")]
843 #[cfg_attr(test, assert_instr(cvttss2si))]
844 #[stable(feature = "simd_x86", since = "1.27.0")]
845 pub unsafe fn _mm_cvtt_ss2si(a
: __m128
) -> i32 {
849 /// Extracts the lowest 32 bit float from the input vector.
851 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32)
853 #[target_feature(enable = "sse")]
854 // No point in using assert_instrs. In Unix x86_64 calling convention this is a
855 // no-op, and on Windows it's just a `mov`.
856 #[stable(feature = "simd_x86", since = "1.27.0")]
857 pub unsafe fn _mm_cvtss_f32(a
: __m128
) -> f32 {
861 /// Converts a 32 bit integer to a 32 bit float. The result vector is the input
862 /// vector `a` with the lowest 32 bit float replaced by the converted integer.
864 /// This intrinsic corresponds to the `CVTSI2SS` instruction (with 32 bit
867 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss)
869 #[target_feature(enable = "sse")]
870 #[cfg_attr(test, assert_instr(cvtsi2ss))]
871 #[stable(feature = "simd_x86", since = "1.27.0")]
872 pub unsafe fn _mm_cvtsi32_ss(a
: __m128
, b
: i32) -> __m128
{
876 /// Alias for [`_mm_cvtsi32_ss`](fn._mm_cvtsi32_ss.html).
878 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss)
880 #[target_feature(enable = "sse")]
881 #[cfg_attr(test, assert_instr(cvtsi2ss))]
882 #[stable(feature = "simd_x86", since = "1.27.0")]
883 pub unsafe fn _mm_cvt_si2ss(a
: __m128
, b
: i32) -> __m128
{
887 /// Construct a `__m128` with the lowest element set to `a` and the rest set to
890 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss)
892 #[target_feature(enable = "sse")]
893 #[cfg_attr(test, assert_instr(movss))]
894 #[stable(feature = "simd_x86", since = "1.27.0")]
895 pub unsafe fn _mm_set_ss(a
: f32) -> __m128
{
896 __m128(a
, 0.0, 0.0, 0.0)
899 /// Construct a `__m128` with all elements set to `a`.
901 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_ps)
903 #[target_feature(enable = "sse")]
904 #[cfg_attr(test, assert_instr(shufps))]
905 #[stable(feature = "simd_x86", since = "1.27.0")]
906 pub unsafe fn _mm_set1_ps(a
: f32) -> __m128
{
910 /// Alias for [`_mm_set1_ps`](fn._mm_set1_ps.html)
912 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps1)
914 #[target_feature(enable = "sse")]
915 #[cfg_attr(test, assert_instr(shufps))]
916 #[stable(feature = "simd_x86", since = "1.27.0")]
917 pub unsafe fn _mm_set_ps1(a
: f32) -> __m128
{
921 /// Construct a `__m128` from four floating point values highest to lowest.
923 /// Note that `a` will be the highest 32 bits of the result, and `d` the
924 /// lowest. This matches the standard way of writing bit patterns on x86:
927 /// bit 127 .. 96 95 .. 64 63 .. 32 31 .. 0
928 /// +---------+---------+---------+---------+
929 /// | a | b | c | d | result
930 /// +---------+---------+---------+---------+
936 /// let v = _mm_set_ps(d, c, b, a);
939 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps)
941 #[target_feature(enable = "sse")]
942 #[cfg_attr(test, assert_instr(unpcklps))]
943 #[stable(feature = "simd_x86", since = "1.27.0")]
944 pub unsafe fn _mm_set_ps(a
: f32, b
: f32, c
: f32, d
: f32) -> __m128
{
948 /// Construct a `__m128` from four floating point values lowest to highest.
950 /// This matches the memory order of `__m128`, i.e., `a` will be the lowest 32
951 /// bits of the result, and `d` the highest.
954 /// assert_eq!(__m128::new(a, b, c, d), _mm_setr_ps(a, b, c, d));
957 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_ps)
959 #[target_feature(enable = "sse")]
961 all(test
, any(target_os
= "windows", target_arch
= "x86_64")),
962 assert_instr(unpcklps
)
964 // On a 32-bit architecture on non-Windows it just copies the operands from the stack.
966 all(test
, all(not(target_os
= "windows"), target_arch
= "x86")),
969 #[stable(feature = "simd_x86", since = "1.27.0")]
970 pub unsafe fn _mm_setr_ps(a
: f32, b
: f32, c
: f32, d
: f32) -> __m128
{
974 /// Construct a `__m128` with all elements initialized to zero.
976 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps)
978 #[target_feature(enable = "sse")]
979 #[cfg_attr(test, assert_instr(xorps))]
980 #[stable(feature = "simd_x86", since = "1.27.0")]
981 pub unsafe fn _mm_setzero_ps() -> __m128
{
982 __m128(0.0, 0.0, 0.0, 0.0)
985 /// A utility function for creating masks to use with Intel shuffle and
986 /// permute intrinsics.
988 #[allow(non_snake_case)]
989 #[unstable(feature = "stdarch", issue = "27731")]
990 pub const fn _MM_SHUFFLE(z
: u32, y
: u32, x
: u32, w
: u32) -> i32 {
991 ((z
<< 6) | (y
<< 4) | (x
<< 2) | w
) as i32
994 /// Shuffles packed single-precision (32-bit) floating-point elements in `a` and
995 /// `b` using `MASK`.
997 /// The lower half of result takes values from `a` and the higher half from
998 /// `b`. Mask is split to 2 control bits each to index the element from inputs.
1000 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps)
1002 /// Note that there appears to be a mistake within Intel's Intrinsics Guide.
1003 /// `_mm_shuffle_ps` is supposed to take an `i32` instead of a `u32`
1004 /// as is the case for [other shuffle intrinsics](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_).
1005 /// Performing an implicit type conversion between an unsigned integer and a signed integer
1006 /// does not cause a problem in C, however Rust's commitment to strong typing does not allow this.
1008 #[target_feature(enable = "sse")]
1009 #[cfg_attr(test, assert_instr(shufps, MASK = 3))]
1010 #[rustc_legacy_const_generics(2)]
1011 #[stable(feature = "simd_x86", since = "1.27.0")]
1012 pub unsafe fn _mm_shuffle_ps
<const MASK
: i32>(a
: __m128
, b
: __m128
) -> __m128
{
1013 static_assert_uimm_bits
!(MASK
, 8);
1019 (MASK
as u32 >> 2) & 0b11,
1020 ((MASK
as u32 >> 4) & 0b11) + 4,
1021 ((MASK
as u32 >> 6) & 0b11) + 4,
/// Unpacks and interleaves single-precision (32-bit) floating-point elements
/// from the higher half of `a` and `b`.
///
/// From lowest to highest, the result holds `a[2]`, `b[2]`, `a[3]`, `b[3]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps)
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(unpckhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpackhi_ps(a: __m128, b: __m128) -> __m128 {
    // Shuffle indices 0..=3 select elements of `a`, 4..=7 select elements of `b`.
    simd_shuffle!(a, b, [2, 6, 3, 7])
}
/// Unpacks and interleaves single-precision (32-bit) floating-point elements
/// from the lower half of `a` and `b`.
///
/// From lowest to highest, the result holds `a[0]`, `b[0]`, `a[1]`, `b[1]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps)
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(unpcklps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpacklo_ps(a: __m128, b: __m128) -> __m128 {
    // Shuffle indices 0..=3 select elements of `a`, 4..=7 select elements of `b`.
    simd_shuffle!(a, b, [0, 4, 1, 5])
}
/// Combines the higher halves of `a` and `b`. The higher half of `b` occupies
/// the lower half of the result, and the higher half of `a` stays in the
/// higher half.
///
/// From lowest to highest, the result holds `b[2]`, `b[3]`, `a[2]`, `a[3]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehl_ps)
#[target_feature(enable = "sse")]
#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movhlps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_movehl_ps(a: __m128, b: __m128) -> __m128 {
    // TODO: figure out why this compiles to a different instruction on
    // Windows (the `assert_instr` check above is disabled there).
    simd_shuffle!(a, b, [6, 7, 2, 3])
}
/// Combines the lower halves of `a` and `b`. The lower half of `b` occupies
/// the higher half of the result, and the lower half of `a` stays in the
/// lower half.
///
/// From lowest to highest, the result holds `a[0]`, `a[1]`, `b[0]`, `b[1]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movelh_ps)
#[target_feature(enable = "sse")]
#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movlhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_movelh_ps(a: __m128, b: __m128) -> __m128 {
    // The instruction check is skipped on Windows, as for `_mm_movehl_ps`.
    simd_shuffle!(a, b, [0, 1, 4, 5])
}
1075 /// Returns a mask of the most significant bit of each element in `a`.
1077 /// The mask is stored in the 4 least significant bits of the return value.
1078 /// All other bits are set to `0`.
1080 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_ps)
1082 #[target_feature(enable = "sse")]
1083 #[cfg_attr(test, assert_instr(movmskps))]
1084 #[stable(feature = "simd_x86", since = "1.27.0")]
1085 pub unsafe fn _mm_movemask_ps(a
: __m128
) -> i32 {
1089 /// Construct a `__m128` with the lowest element read from `p` and the other
1090 /// elements set to zero.
1092 /// This corresponds to instructions `VMOVSS` / `MOVSS`.
1094 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ss)
1096 #[target_feature(enable = "sse")]
1097 #[cfg_attr(test, assert_instr(movss))]
1098 #[stable(feature = "simd_x86", since = "1.27.0")]
1099 pub unsafe fn _mm_load_ss(p
: *const f32) -> __m128
{
1100 __m128(*p
, 0.0, 0.0, 0.0)
1103 /// Construct a `__m128` by duplicating the value read from `p` into all
1106 /// This corresponds to instructions `VMOVSS` / `MOVSS` followed by some
1109 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps)
1111 #[target_feature(enable = "sse")]
1112 #[cfg_attr(test, assert_instr(movss))]
1113 #[stable(feature = "simd_x86", since = "1.27.0")]
1114 pub unsafe fn _mm_load1_ps(p
: *const f32) -> __m128
{
1119 /// Alias for [`_mm_load1_ps`](fn._mm_load1_ps.html)
1121 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1)
1123 #[target_feature(enable = "sse")]
1124 #[cfg_attr(test, assert_instr(movss))]
1125 #[stable(feature = "simd_x86", since = "1.27.0")]
1126 pub unsafe fn _mm_load_ps1(p
: *const f32) -> __m128
{
1130 /// Loads four `f32` values from *aligned* memory into a `__m128`. If the
1131 /// pointer is not aligned to a 128-bit boundary (16 bytes) a general
1132 /// protection fault will be triggered (fatal program crash).
1134 /// Use [`_mm_loadu_ps`](fn._mm_loadu_ps.html) for potentially unaligned
1137 /// This corresponds to instructions `VMOVAPS` / `MOVAPS`.
1139 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps)
1141 #[target_feature(enable = "sse")]
1142 #[cfg_attr(test, assert_instr(movaps))]
1143 #[stable(feature = "simd_x86", since = "1.27.0")]
1144 #[allow(clippy::cast_ptr_alignment)]
1145 pub unsafe fn _mm_load_ps(p
: *const f32) -> __m128
{
1146 *(p
as *const __m128
)
1149 /// Loads four `f32` values from memory into a `__m128`. There are no
1151 /// on memory alignment. For aligned memory
1152 /// [`_mm_load_ps`](fn._mm_load_ps.html)
1155 /// This corresponds to instructions `VMOVUPS` / `MOVUPS`.
1157 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_ps)
1159 #[target_feature(enable = "sse")]
1160 #[cfg_attr(test, assert_instr(movups))]
1161 #[stable(feature = "simd_x86", since = "1.27.0")]
1162 pub unsafe fn _mm_loadu_ps(p
: *const f32) -> __m128
{
1163 // Note: Using `*p` would require `f32` alignment, but `movups` has no
1164 // alignment restrictions.
1165 let mut dst
= _mm_undefined_ps();
1166 ptr
::copy_nonoverlapping(
1168 &mut dst
as *mut __m128
as *mut u8,
1169 mem
::size_of
::<__m128
>(),
1174 /// Loads four `f32` values from aligned memory into a `__m128` in reverse
1177 /// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
1178 /// protection fault will be triggered (fatal program crash).
1180 /// Functionally equivalent to the following code sequence (assuming `p`
1181 /// satisfies the alignment restrictions):
1185 /// let a1 = *p.add(1);
1186 /// let a2 = *p.add(2);
1187 /// let a3 = *p.add(3);
1188 /// __m128::new(a3, a2, a1, a0)
1191 /// This corresponds to instructions `VMOVAPS` / `MOVAPS` followed by some
1194 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps)
1196 #[target_feature(enable = "sse")]
1197 #[cfg_attr(test, assert_instr(movaps))]
1198 #[stable(feature = "simd_x86", since = "1.27.0")]
1199 pub unsafe fn _mm_loadr_ps(p
: *const f32) -> __m128
{
1200 let a
= _mm_load_ps(p
);
1201 simd_shuffle
!(a
, a
, [3, 2, 1, 0])
1204 /// Loads unaligned 64-bits of integer data from memory into new vector.
1206 /// `mem_addr` does not need to be aligned on any particular boundary.
1208 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64)
1210 #[target_feature(enable = "sse")]
1211 #[stable(feature = "simd_x86_mm_loadu_si64", since = "1.46.0")]
1212 pub unsafe fn _mm_loadu_si64(mem_addr
: *const u8) -> __m128i
{
1213 transmute(i64x2(ptr
::read_unaligned(mem_addr
as *const i64), 0))
1216 /// Stores the lowest 32 bit float of `a` into memory.
1218 /// This intrinsic corresponds to the `MOVSS` instruction.
1220 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ss)
1222 #[target_feature(enable = "sse")]
1223 #[cfg_attr(test, assert_instr(movss))]
1224 #[stable(feature = "simd_x86", since = "1.27.0")]
1225 pub unsafe fn _mm_store_ss(p
: *mut f32, a
: __m128
) {
1226 *p
= simd_extract(a
, 0);
1229 /// Stores the lowest 32 bit float of `a` repeated four times into *aligned*
1232 /// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
1233 /// protection fault will be triggered (fatal program crash).
1235 /// Functionally equivalent to the following code sequence (assuming `p`
1236 /// satisfies the alignment restrictions):
1239 /// let x = a.extract(0);
1246 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps)
1248 #[target_feature(enable = "sse")]
1249 #[cfg_attr(test, assert_instr(movaps))]
1250 #[stable(feature = "simd_x86", since = "1.27.0")]
1251 #[allow(clippy::cast_ptr_alignment)]
1252 pub unsafe fn _mm_store1_ps(p
: *mut f32, a
: __m128
) {
1253 let b
: __m128
= simd_shuffle
!(a
, a
, [0, 0, 0, 0]);
1254 *(p
as *mut __m128
) = b
;
/// Alias for [`_mm_store1_ps`](fn._mm_store1_ps.html): forwards `p` and `a`
/// to it unchanged, so the same alignment requirement on `p` applies.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1)
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_store_ps1(p: *mut f32, a: __m128) {
    _mm_store1_ps(p, a);
}
1268 /// Stores four 32-bit floats into *aligned* memory.
1270 /// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
1271 /// protection fault will be triggered (fatal program crash).
1273 /// Use [`_mm_storeu_ps`](fn._mm_storeu_ps.html) for potentially unaligned
1276 /// This corresponds to instructions `VMOVAPS` / `MOVAPS`.
1278 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps)
1280 #[target_feature(enable = "sse")]
1281 #[cfg_attr(test, assert_instr(movaps))]
1282 #[stable(feature = "simd_x86", since = "1.27.0")]
1283 #[allow(clippy::cast_ptr_alignment)]
1284 pub unsafe fn _mm_store_ps(p
: *mut f32, a
: __m128
) {
1285 *(p
as *mut __m128
) = a
;
1288 /// Stores four 32-bit floats into memory. There are no restrictions on memory
1289 /// alignment. For aligned memory [`_mm_store_ps`](fn._mm_store_ps.html) may be
1292 /// This corresponds to instructions `VMOVUPS` / `MOVUPS`.
1294 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_ps)
1296 #[target_feature(enable = "sse")]
1297 #[cfg_attr(test, assert_instr(movups))]
1298 #[stable(feature = "simd_x86", since = "1.27.0")]
1299 pub unsafe fn _mm_storeu_ps(p
: *mut f32, a
: __m128
) {
1300 ptr
::copy_nonoverlapping(
1301 &a
as *const __m128
as *const u8,
1303 mem
::size_of
::<__m128
>(),
1307 /// Stores four 32-bit floats into *aligned* memory in reverse order.
1309 /// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
1310 /// protection fault will be triggered (fatal program crash).
1312 /// Functionally equivalent to the following code sequence (assuming `p`
1313 /// satisfies the alignment restrictions):
1316 /// *p = a.extract(3);
1317 /// *p.add(1) = a.extract(2);
1318 /// *p.add(2) = a.extract(1);
1319 /// *p.add(3) = a.extract(0);
1322 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps)
1324 #[target_feature(enable = "sse")]
1325 #[cfg_attr(test, assert_instr(movaps))]
1326 #[stable(feature = "simd_x86", since = "1.27.0")]
1327 #[allow(clippy::cast_ptr_alignment)]
1328 pub unsafe fn _mm_storer_ps(p
: *mut f32, a
: __m128
) {
1329 let b
: __m128
= simd_shuffle
!(a
, a
, [3, 2, 1, 0]);
1330 *(p
as *mut __m128
) = b
;
/// Returns a `__m128` with the first component from `b` and the remaining
/// components from `a`.
///
/// In other words for any `a` and `b`:
///
/// ```text
/// _mm_move_ss(a, b) == a.replace(0, b.extract(0))
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_ss)
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_move_ss(a: __m128, b: __m128) -> __m128 {
    // Index 4 is element 0 of `b`; indices 1..=3 keep the elements of `a`.
    simd_shuffle!(a, b, [4, 1, 2, 3])
}
1350 /// Performs a serializing operation on all store-to-memory instructions that
1351 /// were issued prior to this instruction.
1353 /// Guarantees that every store instruction that precedes, in program order, is
1354 /// globally visible before any store instruction which follows the fence in
1357 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence)
1359 #[target_feature(enable = "sse")]
1360 #[cfg_attr(test, assert_instr(sfence))]
1361 #[stable(feature = "simd_x86", since = "1.27.0")]
1362 pub unsafe fn _mm_sfence() {
1366 /// Gets the unsigned 32-bit value of the MXCSR control and status register.
1368 /// For more info see [`_mm_setcsr`](fn._mm_setcsr.html)
1370 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr)
1372 #[target_feature(enable = "sse")]
1373 #[cfg_attr(test, assert_instr(stmxcsr))]
1374 #[stable(feature = "simd_x86", since = "1.27.0")]
1375 pub unsafe fn _mm_getcsr() -> u32 {
1376 let mut result
= 0_i32;
1377 stmxcsr((&mut result
) as *mut _
as *mut i8);
1381 /// Sets the MXCSR register with the 32-bit unsigned integer value.
1383 /// This register controls how SIMD instructions handle floating point
1384 /// operations. Modifying this register only affects the current thread.
1386 /// It contains several groups of flags:
1388 /// * *Exception flags* report which exceptions occurred since last they were
1391 /// * *Masking flags* can be used to mask (ignore) certain exceptions. By
1393 /// these flags are all set to 1, so all exceptions are masked. When an
1394 /// exception is masked, the processor simply sets the exception flag and
1395 /// continues the operation. If the exception is unmasked, the flag is also set
1396 /// but additionally an exception handler is invoked.
1398 /// * *Rounding mode flags* control the rounding mode of floating point
1401 /// * The *denormals-are-zero mode flag* turns all numbers which would be
1402 /// denormalized (exponent bits are all zeros) into zeros.
1404 /// ## Exception Flags
1406 /// * `_MM_EXCEPT_INVALID`: An invalid operation was performed (e.g., dividing
1407 /// Infinity by Infinity).
1409 /// * `_MM_EXCEPT_DENORM`: An operation attempted to operate on a denormalized
1410 /// number. Mainly this can cause loss of precision.
1412 /// * `_MM_EXCEPT_DIV_ZERO`: Division by zero occurred.
1414 /// * `_MM_EXCEPT_OVERFLOW`: A numeric overflow exception occurred, i.e., a
1415 /// result was too large to be represented (e.g., an `f32` with absolute
1417 /// greater than `2^128`).
1419 /// * `_MM_EXCEPT_UNDERFLOW`: A numeric underflow exception occurred, i.e., a
1420 /// result was too small to be represented in a normalized way (e.g., an
1422 /// with absolute value smaller than `2^-126`).
1424 /// * `_MM_EXCEPT_INEXACT`: An inexact-result exception occurred (a.k.a.
1425 /// precision exception). This means some precision was lost due to rounding.
1426 /// For example, the fraction `1/3` cannot be represented accurately in a
1427 /// 32 or 64 bit float and computing it would cause this exception to be
1428 /// raised. Precision exceptions are very common, so they are usually masked.
1430 /// Exception flags can be read and set using the convenience functions
1431 /// `_MM_GET_EXCEPTION_STATE` and `_MM_SET_EXCEPTION_STATE`. For example, to
1432 /// check if an operation caused some overflow:
1435 /// _MM_SET_EXCEPTION_STATE(0); // clear all exception flags
1436 /// // perform calculations
1437 /// if _MM_GET_EXCEPTION_STATE() & _MM_EXCEPT_OVERFLOW != 0 {
1438 /// // handle overflow
1442 /// ## Masking Flags
1444 /// There is one masking flag for each exception flag: `_MM_MASK_INVALID`,
1445 /// `_MM_MASK_DENORM`, `_MM_MASK_DIV_ZERO`, `_MM_MASK_OVERFLOW`,
1446 /// `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`.
1448 /// A single masking bit can be set via
1451 /// _MM_SET_EXCEPTION_MASK(_MM_MASK_UNDERFLOW);
1454 /// However, since mask bits are by default all set to 1, it is more common to
1455 /// want to *disable* certain bits. For example, to unmask the underflow
1459 /// _mm_setcsr(_mm_getcsr() & !_MM_MASK_UNDERFLOW); // unmask underflow
1463 /// Warning: an unmasked exception will cause an exception handler to be
1465 /// The standard handler will simply terminate the process. So, in this case
1466 /// any underflow exception would terminate the current process with something
1467 /// like `signal: 8, SIGFPE: erroneous arithmetic operation`.
1469 /// ## Rounding Mode
1471 /// The rounding mode is described using two bits. It can be read and set using
1472 /// the convenience wrappers `_MM_GET_ROUNDING_MODE()` and
1473 /// `_MM_SET_ROUNDING_MODE(mode)`.
1475 /// The rounding modes are:
1477 /// * `_MM_ROUND_NEAREST`: (default) Round to closest to the infinite precision
1478 /// value. If two values are equally close, round to even (i.e., least
1479 /// significant bit will be zero).
1481 /// * `_MM_ROUND_DOWN`: Round toward negative Infinity.
1483 /// * `_MM_ROUND_UP`: Round toward positive Infinity.
1485 /// * `_MM_ROUND_TOWARD_ZERO`: Round towards zero (truncate).
1490 /// _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN)
1493 /// ## Denormals-are-zero/Flush-to-zero Mode
1495 /// If this bit is set, values that would be denormalized will be set to zero
1496 /// instead. This is turned off by default.
1498 /// You can read and enable/disable this mode via the helper functions
1499 /// `_MM_GET_FLUSH_ZERO_MODE()` and `_MM_SET_FLUSH_ZERO_MODE()`:
1502 /// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF); // turn off (default)
1503 /// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // turn on
1507 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setcsr)
1509 #[target_feature(enable = "sse")]
1510 #[cfg_attr(test, assert_instr(ldmxcsr))]
1511 #[stable(feature = "simd_x86", since = "1.27.0")]
1512 pub unsafe fn _mm_setcsr(val
: u32) {
1513 ldmxcsr(&val
as *const _
as *const i8);
1516 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1517 #[stable(feature = "simd_x86", since = "1.27.0")]
1518 pub const _MM_EXCEPT_INVALID
: u32 = 0x0001;
1519 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1520 #[stable(feature = "simd_x86", since = "1.27.0")]
1521 pub const _MM_EXCEPT_DENORM
: u32 = 0x0002;
1522 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1523 #[stable(feature = "simd_x86", since = "1.27.0")]
1524 pub const _MM_EXCEPT_DIV_ZERO
: u32 = 0x0004;
1525 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1526 #[stable(feature = "simd_x86", since = "1.27.0")]
1527 pub const _MM_EXCEPT_OVERFLOW
: u32 = 0x0008;
1528 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1529 #[stable(feature = "simd_x86", since = "1.27.0")]
1530 pub const _MM_EXCEPT_UNDERFLOW
: u32 = 0x0010;
1531 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1532 #[stable(feature = "simd_x86", since = "1.27.0")]
1533 pub const _MM_EXCEPT_INEXACT
: u32 = 0x0020;
1534 /// See [`_MM_GET_EXCEPTION_STATE`](fn._MM_GET_EXCEPTION_STATE.html)
1535 #[stable(feature = "simd_x86", since = "1.27.0")]
1536 pub const _MM_EXCEPT_MASK
: u32 = 0x003f;
1538 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1539 #[stable(feature = "simd_x86", since = "1.27.0")]
1540 pub const _MM_MASK_INVALID
: u32 = 0x0080;
1541 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1542 #[stable(feature = "simd_x86", since = "1.27.0")]
1543 pub const _MM_MASK_DENORM
: u32 = 0x0100;
1544 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1545 #[stable(feature = "simd_x86", since = "1.27.0")]
1546 pub const _MM_MASK_DIV_ZERO
: u32 = 0x0200;
1547 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1548 #[stable(feature = "simd_x86", since = "1.27.0")]
1549 pub const _MM_MASK_OVERFLOW
: u32 = 0x0400;
1550 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1551 #[stable(feature = "simd_x86", since = "1.27.0")]
1552 pub const _MM_MASK_UNDERFLOW
: u32 = 0x0800;
1553 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1554 #[stable(feature = "simd_x86", since = "1.27.0")]
1555 pub const _MM_MASK_INEXACT
: u32 = 0x1000;
1556 /// See [`_MM_GET_EXCEPTION_MASK`](fn._MM_GET_EXCEPTION_MASK.html)
1557 #[stable(feature = "simd_x86", since = "1.27.0")]
1558 pub const _MM_MASK_MASK
: u32 = 0x1f80;
1560 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1561 #[stable(feature = "simd_x86", since = "1.27.0")]
1562 pub const _MM_ROUND_NEAREST
: u32 = 0x0000;
1563 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1564 #[stable(feature = "simd_x86", since = "1.27.0")]
1565 pub const _MM_ROUND_DOWN
: u32 = 0x2000;
1566 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1567 #[stable(feature = "simd_x86", since = "1.27.0")]
1568 pub const _MM_ROUND_UP
: u32 = 0x4000;
1569 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1570 #[stable(feature = "simd_x86", since = "1.27.0")]
1571 pub const _MM_ROUND_TOWARD_ZERO
: u32 = 0x6000;
1573 /// See [`_MM_GET_ROUNDING_MODE`](fn._MM_GET_ROUNDING_MODE.html)
1574 #[stable(feature = "simd_x86", since = "1.27.0")]
1575 pub const _MM_ROUND_MASK
: u32 = 0x6000;
1577 /// See [`_MM_GET_FLUSH_ZERO_MODE`](fn._MM_GET_FLUSH_ZERO_MODE.html)
1578 #[stable(feature = "simd_x86", since = "1.27.0")]
1579 pub const _MM_FLUSH_ZERO_MASK
: u32 = 0x8000;
1580 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1581 #[stable(feature = "simd_x86", since = "1.27.0")]
1582 pub const _MM_FLUSH_ZERO_ON
: u32 = 0x8000;
1583 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1584 #[stable(feature = "simd_x86", since = "1.27.0")]
1585 pub const _MM_FLUSH_ZERO_OFF
: u32 = 0x0000;
1587 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1589 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_EXCEPTION_MASK)
1591 #[allow(non_snake_case)]
1592 #[target_feature(enable = "sse")]
1593 #[stable(feature = "simd_x86", since = "1.27.0")]
1594 pub unsafe fn _MM_GET_EXCEPTION_MASK() -> u32 {
1595 _mm_getcsr() & _MM_MASK_MASK
1598 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1600 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_EXCEPTION_STATE)
1602 #[allow(non_snake_case)]
1603 #[target_feature(enable = "sse")]
1604 #[stable(feature = "simd_x86", since = "1.27.0")]
1605 pub unsafe fn _MM_GET_EXCEPTION_STATE() -> u32 {
1606 _mm_getcsr() & _MM_EXCEPT_MASK
1609 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1611 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE)
1613 #[allow(non_snake_case)]
1614 #[target_feature(enable = "sse")]
1615 #[stable(feature = "simd_x86", since = "1.27.0")]
1616 pub unsafe fn _MM_GET_FLUSH_ZERO_MODE() -> u32 {
1617 _mm_getcsr() & _MM_FLUSH_ZERO_MASK
1620 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1622 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE)
1624 #[allow(non_snake_case)]
1625 #[target_feature(enable = "sse")]
1626 #[stable(feature = "simd_x86", since = "1.27.0")]
1627 pub unsafe fn _MM_GET_ROUNDING_MODE() -> u32 {
1628 _mm_getcsr() & _MM_ROUND_MASK
1631 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1633 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_EXCEPTION_MASK)
1635 #[allow(non_snake_case)]
1636 #[target_feature(enable = "sse")]
1637 #[stable(feature = "simd_x86", since = "1.27.0")]
1638 pub unsafe fn _MM_SET_EXCEPTION_MASK(x
: u32) {
1639 _mm_setcsr((_mm_getcsr() & !_MM_MASK_MASK
) | x
)
1642 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1644 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_EXCEPTION_STATE)
1646 #[allow(non_snake_case)]
1647 #[target_feature(enable = "sse")]
1648 #[stable(feature = "simd_x86", since = "1.27.0")]
1649 pub unsafe fn _MM_SET_EXCEPTION_STATE(x
: u32) {
1650 _mm_setcsr((_mm_getcsr() & !_MM_EXCEPT_MASK
) | x
)
1653 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1655 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE)
1657 #[allow(non_snake_case)]
1658 #[target_feature(enable = "sse")]
1659 #[stable(feature = "simd_x86", since = "1.27.0")]
1660 pub unsafe fn _MM_SET_FLUSH_ZERO_MODE(x
: u32) {
1661 let val
= (_mm_getcsr() & !_MM_FLUSH_ZERO_MASK
) | x
;
1662 // println!("setting csr={:x}", val);
1666 /// See [`_mm_setcsr`](fn._mm_setcsr.html)
1668 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE)
1670 #[allow(non_snake_case)]
1671 #[target_feature(enable = "sse")]
1672 #[stable(feature = "simd_x86", since = "1.27.0")]
1673 pub unsafe fn _MM_SET_ROUNDING_MODE(x
: u32) {
1674 _mm_setcsr((_mm_getcsr() & !_MM_ROUND_MASK
) | x
)
1677 /// See [`_mm_prefetch`](fn._mm_prefetch.html).
1678 #[stable(feature = "simd_x86", since = "1.27.0")]
1679 pub const _MM_HINT_T0
: i32 = 3;
1681 /// See [`_mm_prefetch`](fn._mm_prefetch.html).
1682 #[stable(feature = "simd_x86", since = "1.27.0")]
1683 pub const _MM_HINT_T1
: i32 = 2;
1685 /// See [`_mm_prefetch`](fn._mm_prefetch.html).
1686 #[stable(feature = "simd_x86", since = "1.27.0")]
1687 pub const _MM_HINT_T2
: i32 = 1;
1689 /// See [`_mm_prefetch`](fn._mm_prefetch.html).
1690 #[stable(feature = "simd_x86", since = "1.27.0")]
1691 pub const _MM_HINT_NTA
: i32 = 0;
1693 /// See [`_mm_prefetch`](fn._mm_prefetch.html).
1694 #[stable(feature = "simd_x86", since = "1.27.0")]
1695 pub const _MM_HINT_ET0
: i32 = 7;
1697 /// See [`_mm_prefetch`](fn._mm_prefetch.html).
1698 #[stable(feature = "simd_x86", since = "1.27.0")]
1699 pub const _MM_HINT_ET1
: i32 = 6;
1701 /// Fetch the cache line that contains address `p` using the given `STRATEGY`.
1703 /// The `STRATEGY` must be one of:
1705 /// * [`_MM_HINT_T0`](constant._MM_HINT_T0.html): Fetch into all levels of the
1706 /// cache hierarchy.
1708 /// * [`_MM_HINT_T1`](constant._MM_HINT_T1.html): Fetch into L2 and higher.
1710 /// * [`_MM_HINT_T2`](constant._MM_HINT_T2.html): Fetch into L3 and higher or
1711 /// an implementation-specific choice (e.g., L2 if there is no L3).
1713 /// * [`_MM_HINT_NTA`](constant._MM_HINT_NTA.html): Fetch data using the
1714 /// non-temporal access (NTA) hint. It may be a place closer than main memory
1715 /// but outside of the cache hierarchy. This is used to reduce access latency
1716 /// without polluting the cache.
1718 /// * [`_MM_HINT_ET0`](constant._MM_HINT_ET0.html) and
1719 /// [`_MM_HINT_ET1`](constant._MM_HINT_ET1.html) are similar to `_MM_HINT_T0`
1720 /// and `_MM_HINT_T1` but indicate an anticipation to write to the address.
1722 /// The actual implementation depends on the particular CPU. This instruction
1723 /// is considered a hint, so the CPU is also free to simply ignore the request.
1725 /// The amount of prefetched data depends on the cache line size of the
1726 /// specific CPU, but it will be at least 32 bytes.
1730 /// * Most modern CPUs already automatically prefetch data based on predicted
1731 /// access patterns.
1733 /// * Data is usually not fetched if this would cause a TLB miss or a page
1736 /// * Too much prefetching can cause unnecessary cache evictions.
1738 /// * Prefetching may also fail if there are not enough memory-subsystem
1739 /// resources (e.g., request buffers).
1742 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch)
1744 #[target_feature(enable = "sse")]
1745 #[cfg_attr(test, assert_instr(prefetcht0, STRATEGY = _MM_HINT_T0))]
1746 #[cfg_attr(test, assert_instr(prefetcht1, STRATEGY = _MM_HINT_T1))]
1747 #[cfg_attr(test, assert_instr(prefetcht2, STRATEGY = _MM_HINT_T2))]
1748 #[cfg_attr(test, assert_instr(prefetchnta, STRATEGY = _MM_HINT_NTA))]
1749 #[rustc_legacy_const_generics(1)]
1750 #[stable(feature = "simd_x86", since = "1.27.0")]
1751 pub unsafe fn _mm_prefetch
<const STRATEGY
: i32>(p
: *const i8) {
1752 // We use the `llvm.prefetch` intrinsic with `cache type` = 1 (data cache).
1753 // `locality` and `rw` are based on our `STRATEGY`.
1754 prefetch(p
, (STRATEGY
>> 2) & 1, STRATEGY
& 3, 1);
1757 /// Returns vector of type __m128 with undefined elements.
1759 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps)
1761 #[target_feature(enable = "sse")]
1762 #[stable(feature = "simd_x86", since = "1.27.0")]
1763 pub unsafe fn _mm_undefined_ps() -> __m128
{
1767 /// Transpose the 4x4 matrix formed by 4 rows of __m128 in place.
1769 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_TRANSPOSE4_PS)
1771 #[allow(non_snake_case)]
1772 #[target_feature(enable = "sse")]
1773 #[stable(feature = "simd_x86", since = "1.27.0")]
1774 pub unsafe fn _MM_TRANSPOSE4_PS(
1780 let tmp0
= _mm_unpacklo_ps(*row0
, *row1
);
1781 let tmp2
= _mm_unpacklo_ps(*row2
, *row3
);
1782 let tmp1
= _mm_unpackhi_ps(*row0
, *row1
);
1783 let tmp3
= _mm_unpackhi_ps(*row2
, *row3
);
1785 *row0
= _mm_movelh_ps(tmp0
, tmp2
);
1786 *row1
= _mm_movehl_ps(tmp2
, tmp0
);
1787 *row2
= _mm_movelh_ps(tmp1
, tmp3
);
1788 *row3
= _mm_movehl_ps(tmp3
, tmp1
);
1791 #[allow(improper_ctypes)]
1793 #[link_name = "llvm.x86.sse.add.ss"]
1794 fn addss(a
: __m128
, b
: __m128
) -> __m128
;
1795 #[link_name = "llvm.x86.sse.sub.ss"]
1796 fn subss(a
: __m128
, b
: __m128
) -> __m128
;
1797 #[link_name = "llvm.x86.sse.mul.ss"]
1798 fn mulss(a
: __m128
, b
: __m128
) -> __m128
;
1799 #[link_name = "llvm.x86.sse.div.ss"]
1800 fn divss(a
: __m128
, b
: __m128
) -> __m128
;
1801 #[link_name = "llvm.x86.sse.sqrt.ss"]
1802 fn sqrtss(a
: __m128
) -> __m128
;
1803 #[link_name = "llvm.x86.sse.sqrt.ps"]
1804 fn sqrtps(a
: __m128
) -> __m128
;
1805 #[link_name = "llvm.x86.sse.rcp.ss"]
1806 fn rcpss(a
: __m128
) -> __m128
;
1807 #[link_name = "llvm.x86.sse.rcp.ps"]
1808 fn rcpps(a
: __m128
) -> __m128
;
1809 #[link_name = "llvm.x86.sse.rsqrt.ss"]
1810 fn rsqrtss(a
: __m128
) -> __m128
;
1811 #[link_name = "llvm.x86.sse.rsqrt.ps"]
1812 fn rsqrtps(a
: __m128
) -> __m128
;
1813 #[link_name = "llvm.x86.sse.min.ss"]
1814 fn minss(a
: __m128
, b
: __m128
) -> __m128
;
1815 #[link_name = "llvm.x86.sse.min.ps"]
1816 fn minps(a
: __m128
, b
: __m128
) -> __m128
;
1817 #[link_name = "llvm.x86.sse.max.ss"]
1818 fn maxss(a
: __m128
, b
: __m128
) -> __m128
;
1819 #[link_name = "llvm.x86.sse.max.ps"]
1820 fn maxps(a
: __m128
, b
: __m128
) -> __m128
;
1821 #[link_name = "llvm.x86.sse.movmsk.ps"]
1822 fn movmskps(a
: __m128
) -> i32;
1823 #[link_name = "llvm.x86.sse.cmp.ps"]
1824 fn cmpps(a
: __m128
, b
: __m128
, imm8
: i8) -> __m128
;
1825 #[link_name = "llvm.x86.sse.comieq.ss"]
1826 fn comieq_ss(a
: __m128
, b
: __m128
) -> i32;
1827 #[link_name = "llvm.x86.sse.comilt.ss"]
1828 fn comilt_ss(a
: __m128
, b
: __m128
) -> i32;
1829 #[link_name = "llvm.x86.sse.comile.ss"]
1830 fn comile_ss(a
: __m128
, b
: __m128
) -> i32;
1831 #[link_name = "llvm.x86.sse.comigt.ss"]
1832 fn comigt_ss(a
: __m128
, b
: __m128
) -> i32;
1833 #[link_name = "llvm.x86.sse.comige.ss"]
1834 fn comige_ss(a
: __m128
, b
: __m128
) -> i32;
1835 #[link_name = "llvm.x86.sse.comineq.ss"]
1836 fn comineq_ss(a
: __m128
, b
: __m128
) -> i32;
1837 #[link_name = "llvm.x86.sse.ucomieq.ss"]
1838 fn ucomieq_ss(a
: __m128
, b
: __m128
) -> i32;
1839 #[link_name = "llvm.x86.sse.ucomilt.ss"]
1840 fn ucomilt_ss(a
: __m128
, b
: __m128
) -> i32;
1841 #[link_name = "llvm.x86.sse.ucomile.ss"]
1842 fn ucomile_ss(a
: __m128
, b
: __m128
) -> i32;
1843 #[link_name = "llvm.x86.sse.ucomigt.ss"]
1844 fn ucomigt_ss(a
: __m128
, b
: __m128
) -> i32;
1845 #[link_name = "llvm.x86.sse.ucomige.ss"]
1846 fn ucomige_ss(a
: __m128
, b
: __m128
) -> i32;
1847 #[link_name = "llvm.x86.sse.ucomineq.ss"]
1848 fn ucomineq_ss(a
: __m128
, b
: __m128
) -> i32;
1849 #[link_name = "llvm.x86.sse.cvtss2si"]
1850 fn cvtss2si(a
: __m128
) -> i32;
1851 #[link_name = "llvm.x86.sse.cvttss2si"]
1852 fn cvttss2si(a
: __m128
) -> i32;
1853 #[link_name = "llvm.x86.sse.cvtsi2ss"]
1854 fn cvtsi2ss(a
: __m128
, b
: i32) -> __m128
;
1855 #[link_name = "llvm.x86.sse.sfence"]
1857 #[link_name = "llvm.x86.sse.stmxcsr"]
1858 fn stmxcsr(p
: *mut i8);
1859 #[link_name = "llvm.x86.sse.ldmxcsr"]
1860 fn ldmxcsr(p
: *const i8);
1861 #[link_name = "llvm.prefetch"]
1862 fn prefetch(p
: *const i8, rw
: i32, loc
: i32, ty
: i32);
1863 #[link_name = "llvm.x86.sse.cmp.ss"]
1864 fn cmpss(a
: __m128
, b
: __m128
, imm8
: i8) -> __m128
;
1867 /// Stores `a` into the memory at `mem_addr` using a non-temporal memory hint.
1869 /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
1870 /// exception _may_ be generated.
1872 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps)
1874 #[target_feature(enable = "sse")]
1875 #[cfg_attr(test, assert_instr(movntps))]
1876 #[stable(feature = "simd_x86", since = "1.27.0")]
1877 #[allow(clippy::cast_ptr_alignment)]
1878 pub unsafe fn _mm_stream_ps(mem_addr
: *mut f32, a
: __m128
) {
1879 intrinsics
::nontemporal_store(mem_addr
as *mut __m128
, a
);
1884 use crate::{hint::black_box, mem::transmute}
;
1885 use std
::{boxed, f32::NAN}
;
1886 use stdarch_test
::simd_test
;
1888 use crate::core_arch
::{simd::*, x86::*}
;
1890 #[simd_test(enable = "sse")]
1891 unsafe fn test_mm_add_ps() {
1892 let a
= _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
1893 let b
= _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
1894 let r
= _mm_add_ps(a
, b
);
1895 assert_eq_m128(r
, _mm_setr_ps(-101.0, 25.0, 0.0, -15.0));
1898 #[simd_test(enable = "sse")]
1899 unsafe fn test_mm_add_ss() {
1900 let a
= _mm_set_ps(-1.0, 5.0, 0.0, -10.0);
1901 let b
= _mm_set_ps(-100.0, 20.0, 0.0, -5.0);
1902 let r
= _mm_add_ss(a
, b
);
1903 assert_eq_m128(r
, _mm_set_ps(-1.0, 5.0, 0.0, -15.0));
1906 #[simd_test(enable = "sse")]
1907 unsafe fn test_mm_sub_ps() {
1908 let a
= _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
1909 let b
= _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
1910 let r
= _mm_sub_ps(a
, b
);
1911 assert_eq_m128(r
, _mm_setr_ps(99.0, -15.0, 0.0, -5.0));
1914 #[simd_test(enable = "sse")]
1915 unsafe fn test_mm_sub_ss() {
1916 let a
= _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
1917 let b
= _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
1918 let r
= _mm_sub_ss(a
, b
);
1919 assert_eq_m128(r
, _mm_setr_ps(99.0, 5.0, 0.0, -10.0));
1922 #[simd_test(enable = "sse")]
1923 unsafe fn test_mm_mul_ps() {
1924 let a
= _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
1925 let b
= _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
1926 let r
= _mm_mul_ps(a
, b
);
1927 assert_eq_m128(r
, _mm_setr_ps(100.0, 100.0, 0.0, 50.0));
1930 #[simd_test(enable = "sse")]
1931 unsafe fn test_mm_mul_ss() {
1932 let a
= _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
1933 let b
= _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
1934 let r
= _mm_mul_ss(a
, b
);
1935 assert_eq_m128(r
, _mm_setr_ps(100.0, 5.0, 0.0, -10.0));
1938 #[simd_test(enable = "sse")]
1939 unsafe fn test_mm_div_ps() {
1940 let a
= _mm_setr_ps(-1.0, 5.0, 2.0, -10.0);
1941 let b
= _mm_setr_ps(-100.0, 20.0, 0.2, -5.0);
1942 let r
= _mm_div_ps(a
, b
);
1943 assert_eq_m128(r
, _mm_setr_ps(0.01, 0.25, 10.0, 2.0));
1946 #[simd_test(enable = "sse")]
1947 unsafe fn test_mm_div_ss() {
1948 let a
= _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
1949 let b
= _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
1950 let r
= _mm_div_ss(a
, b
);
1951 assert_eq_m128(r
, _mm_setr_ps(0.01, 5.0, 0.0, -10.0));
1954 #[simd_test(enable = "sse")]
1955 unsafe fn test_mm_sqrt_ss() {
1956 let a
= _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
1957 let r
= _mm_sqrt_ss(a
);
1958 let e
= _mm_setr_ps(2.0, 13.0, 16.0, 100.0);
1959 assert_eq_m128(r
, e
);
1962 #[simd_test(enable = "sse")]
1963 unsafe fn test_mm_sqrt_ps() {
1964 let a
= _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
1965 let r
= _mm_sqrt_ps(a
);
1966 let e
= _mm_setr_ps(2.0, 3.6055512, 4.0, 10.0);
1967 assert_eq_m128(r
, e
);
1970 #[simd_test(enable = "sse")]
1971 unsafe fn test_mm_rcp_ss() {
1972 let a
= _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
1973 let r
= _mm_rcp_ss(a
);
1974 let e
= _mm_setr_ps(0.24993896, 13.0, 16.0, 100.0);
1975 assert_eq_m128(r
, e
);
1978 #[simd_test(enable = "sse")]
1979 unsafe fn test_mm_rcp_ps() {
1980 let a
= _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
1981 let r
= _mm_rcp_ps(a
);
1982 let e
= _mm_setr_ps(0.24993896, 0.0769043, 0.06248474, 0.0099983215);
1983 let rel_err
= 0.00048828125;
1985 assert_approx_eq
!(get_m128(r
, i
), get_m128(e
, i
), 2. * rel_err
);
1989 #[simd_test(enable = "sse")]
1990 unsafe fn test_mm_rsqrt_ss() {
1991 let a
= _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
1992 let r
= _mm_rsqrt_ss(a
);
1993 let e
= _mm_setr_ps(0.49987793, 13.0, 16.0, 100.0);
1994 let rel_err
= 0.00048828125;
1996 assert_approx_eq
!(get_m128(r
, i
), get_m128(e
, i
), 2. * rel_err
);
2000 #[simd_test(enable = "sse")]
2001 unsafe fn test_mm_rsqrt_ps() {
2002 let a
= _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
2003 let r
= _mm_rsqrt_ps(a
);
2004 let e
= _mm_setr_ps(0.49987793, 0.2772827, 0.24993896, 0.099990845);
2005 let rel_err
= 0.00048828125;
2007 assert_approx_eq
!(get_m128(r
, i
), get_m128(e
, i
), 2. * rel_err
);
2011 #[simd_test(enable = "sse")]
2012 unsafe fn test_mm_min_ss() {
2013 let a
= _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2014 let b
= _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2015 let r
= _mm_min_ss(a
, b
);
2016 assert_eq_m128(r
, _mm_setr_ps(-100.0, 5.0, 0.0, -10.0));
2019 #[simd_test(enable = "sse")]
2020 unsafe fn test_mm_min_ps() {
2021 let a
= _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2022 let b
= _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2023 let r
= _mm_min_ps(a
, b
);
2024 assert_eq_m128(r
, _mm_setr_ps(-100.0, 5.0, 0.0, -10.0));
2026 // `_mm_min_ps` can **not** be implemented using the `simd_min` rust intrinsic. `simd_min`
2027 // is lowered by the llvm codegen backend to `llvm.minnum.v*` llvm intrinsic. This intrinsic
2028 // doesn't specify how -0.0 is handled. Unfortunately it happens to behave different from
2029 // the `minps` x86 instruction on x86. The `llvm.minnum.v*` llvm intrinsic equals
2030 // `r1` to `a` and `r2` to `b`.
2031 let a
= _mm_setr_ps(-0.0, 0.0, 0.0, 0.0);
2032 let b
= _mm_setr_ps(0.0, 0.0, 0.0, 0.0);
2033 let r1
: [u8; 16] = transmute(_mm_min_ps(a
, b
));
2034 let r2
: [u8; 16] = transmute(_mm_min_ps(b
, a
));
2035 let a
: [u8; 16] = transmute(a
);
2036 let b
: [u8; 16] = transmute(b
);
2039 assert_ne
!(a
, b
); // sanity check that -0.0 is actually present
2042 #[simd_test(enable = "sse")]
2043 unsafe fn test_mm_max_ss() {
2044 let a
= _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2045 let b
= _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2046 let r
= _mm_max_ss(a
, b
);
2047 assert_eq_m128(r
, _mm_setr_ps(-1.0, 5.0, 0.0, -10.0));
2050 #[simd_test(enable = "sse")]
2051 unsafe fn test_mm_max_ps() {
2052 let a
= _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2053 let b
= _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2054 let r
= _mm_max_ps(a
, b
);
2055 assert_eq_m128(r
, _mm_setr_ps(-1.0, 20.0, 0.0, -5.0));
2058 #[simd_test(enable = "sse")]
2059 unsafe fn test_mm_and_ps() {
2060 let a
= transmute(u32x4
::splat(0b0011));
2061 let b
= transmute(u32x4
::splat(0b0101));
2062 let r
= _mm_and_ps(*black_box(&a
), *black_box(&b
));
2063 let e
= transmute(u32x4
::splat(0b0001));
2064 assert_eq_m128(r
, e
);
2067 #[simd_test(enable = "sse")]
2068 unsafe fn test_mm_andnot_ps() {
2069 let a
= transmute(u32x4
::splat(0b0011));
2070 let b
= transmute(u32x4
::splat(0b0101));
2071 let r
= _mm_andnot_ps(*black_box(&a
), *black_box(&b
));
2072 let e
= transmute(u32x4
::splat(0b0100));
2073 assert_eq_m128(r
, e
);
2076 #[simd_test(enable = "sse")]
2077 unsafe fn test_mm_or_ps() {
2078 let a
= transmute(u32x4
::splat(0b0011));
2079 let b
= transmute(u32x4
::splat(0b0101));
2080 let r
= _mm_or_ps(*black_box(&a
), *black_box(&b
));
2081 let e
= transmute(u32x4
::splat(0b0111));
2082 assert_eq_m128(r
, e
);
2085 #[simd_test(enable = "sse")]
2086 unsafe fn test_mm_xor_ps() {
2087 let a
= transmute(u32x4
::splat(0b0011));
2088 let b
= transmute(u32x4
::splat(0b0101));
2089 let r
= _mm_xor_ps(*black_box(&a
), *black_box(&b
));
2090 let e
= transmute(u32x4
::splat(0b0110));
2091 assert_eq_m128(r
, e
);
2094 #[simd_test(enable = "sse")]
2095 unsafe fn test_mm_cmpeq_ss() {
2096 let a
= _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2097 let b
= _mm_setr_ps(-1.0, 5.0, 6.0, 7.0);
2098 let r
: u32x4
= transmute(_mm_cmpeq_ss(a
, b
));
2099 let e
: u32x4
= transmute(_mm_setr_ps(transmute(0u32), 2.0, 3.0, 4.0));
2102 let b2
= _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2103 let r2
: u32x4
= transmute(_mm_cmpeq_ss(a
, b2
));
2104 let e2
: u32x4
= transmute(_mm_setr_ps(transmute(0xffffffffu32), 2.0, 3.0, 4.0));
2108 #[simd_test(enable = "sse")]
2109 unsafe fn test_mm_cmplt_ss() {
2110 let a
= _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2111 let b
= _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2112 let c
= _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2113 let d
= _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2115 let b1
= 0u32; // a.extract(0) < b.extract(0)
2116 let c1
= 0u32; // a.extract(0) < c.extract(0)
2117 let d1
= !0u32; // a.extract(0) < d.extract(0)
2119 let rb
: u32x4
= transmute(_mm_cmplt_ss(a
, b
));
2120 let eb
: u32x4
= transmute(_mm_setr_ps(transmute(b1
), 2.0, 3.0, 4.0));
2123 let rc
: u32x4
= transmute(_mm_cmplt_ss(a
, c
));
2124 let ec
: u32x4
= transmute(_mm_setr_ps(transmute(c1
), 2.0, 3.0, 4.0));
2127 let rd
: u32x4
= transmute(_mm_cmplt_ss(a
, d
));
2128 let ed
: u32x4
= transmute(_mm_setr_ps(transmute(d1
), 2.0, 3.0, 4.0));
2132 #[simd_test(enable = "sse")]
2133 unsafe fn test_mm_cmple_ss() {
2134 let a
= _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2135 let b
= _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2136 let c
= _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2137 let d
= _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2139 let b1
= 0u32; // a.extract(0) <= b.extract(0)
2140 let c1
= !0u32; // a.extract(0) <= c.extract(0)
2141 let d1
= !0u32; // a.extract(0) <= d.extract(0)
2143 let rb
: u32x4
= transmute(_mm_cmple_ss(a
, b
));
2144 let eb
: u32x4
= transmute(_mm_setr_ps(transmute(b1
), 2.0, 3.0, 4.0));
2147 let rc
: u32x4
= transmute(_mm_cmple_ss(a
, c
));
2148 let ec
: u32x4
= transmute(_mm_setr_ps(transmute(c1
), 2.0, 3.0, 4.0));
2151 let rd
: u32x4
= transmute(_mm_cmple_ss(a
, d
));
2152 let ed
: u32x4
= transmute(_mm_setr_ps(transmute(d1
), 2.0, 3.0, 4.0));
2156 #[simd_test(enable = "sse")]
2157 unsafe fn test_mm_cmpgt_ss() {
2158 let a
= _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2159 let b
= _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2160 let c
= _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2161 let d
= _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2163 let b1
= !0u32; // a.extract(0) > b.extract(0)
2164 let c1
= 0u32; // a.extract(0) > c.extract(0)
2165 let d1
= 0u32; // a.extract(0) > d.extract(0)
2167 let rb
: u32x4
= transmute(_mm_cmpgt_ss(a
, b
));
2168 let eb
: u32x4
= transmute(_mm_setr_ps(transmute(b1
), 2.0, 3.0, 4.0));
2171 let rc
: u32x4
= transmute(_mm_cmpgt_ss(a
, c
));
2172 let ec
: u32x4
= transmute(_mm_setr_ps(transmute(c1
), 2.0, 3.0, 4.0));
2175 let rd
: u32x4
= transmute(_mm_cmpgt_ss(a
, d
));
2176 let ed
: u32x4
= transmute(_mm_setr_ps(transmute(d1
), 2.0, 3.0, 4.0));
2180 #[simd_test(enable = "sse")]
2181 unsafe fn test_mm_cmpge_ss() {
2182 let a
= _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2183 let b
= _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2184 let c
= _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2185 let d
= _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2187 let b1
= !0u32; // a.extract(0) >= b.extract(0)
2188 let c1
= !0u32; // a.extract(0) >= c.extract(0)
2189 let d1
= 0u32; // a.extract(0) >= d.extract(0)
2191 let rb
: u32x4
= transmute(_mm_cmpge_ss(a
, b
));
2192 let eb
: u32x4
= transmute(_mm_setr_ps(transmute(b1
), 2.0, 3.0, 4.0));
2195 let rc
: u32x4
= transmute(_mm_cmpge_ss(a
, c
));
2196 let ec
: u32x4
= transmute(_mm_setr_ps(transmute(c1
), 2.0, 3.0, 4.0));
2199 let rd
: u32x4
= transmute(_mm_cmpge_ss(a
, d
));
2200 let ed
: u32x4
= transmute(_mm_setr_ps(transmute(d1
), 2.0, 3.0, 4.0));
2204 #[simd_test(enable = "sse")]
2205 unsafe fn test_mm_cmpneq_ss() {
2206 let a
= _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2207 let b
= _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2208 let c
= _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2209 let d
= _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2211 let b1
= !0u32; // a.extract(0) != b.extract(0)
2212 let c1
= 0u32; // a.extract(0) != c.extract(0)
2213 let d1
= !0u32; // a.extract(0) != d.extract(0)
2215 let rb
: u32x4
= transmute(_mm_cmpneq_ss(a
, b
));
2216 let eb
: u32x4
= transmute(_mm_setr_ps(transmute(b1
), 2.0, 3.0, 4.0));
2219 let rc
: u32x4
= transmute(_mm_cmpneq_ss(a
, c
));
2220 let ec
: u32x4
= transmute(_mm_setr_ps(transmute(c1
), 2.0, 3.0, 4.0));
2223 let rd
: u32x4
= transmute(_mm_cmpneq_ss(a
, d
));
2224 let ed
: u32x4
= transmute(_mm_setr_ps(transmute(d1
), 2.0, 3.0, 4.0));
2228 #[simd_test(enable = "sse")]
2229 unsafe fn test_mm_cmpnlt_ss() {
2230 // TODO: this test is exactly the same as for `_mm_cmpge_ss`, but there
2231 // must be a difference. It may have to do with behavior in the
2232 // presence of NaNs (signaling or quiet). If so, we should add tests
2235 let a
= _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2236 let b
= _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2237 let c
= _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2238 let d
= _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2240 let b1
= !0u32; // a.extract(0) >= b.extract(0)
2241 let c1
= !0u32; // a.extract(0) >= c.extract(0)
2242 let d1
= 0u32; // a.extract(0) >= d.extract(0)
2244 let rb
: u32x4
= transmute(_mm_cmpnlt_ss(a
, b
));
2245 let eb
: u32x4
= transmute(_mm_setr_ps(transmute(b1
), 2.0, 3.0, 4.0));
2248 let rc
: u32x4
= transmute(_mm_cmpnlt_ss(a
, c
));
2249 let ec
: u32x4
= transmute(_mm_setr_ps(transmute(c1
), 2.0, 3.0, 4.0));
2252 let rd
: u32x4
= transmute(_mm_cmpnlt_ss(a
, d
));
2253 let ed
: u32x4
= transmute(_mm_setr_ps(transmute(d1
), 2.0, 3.0, 4.0));
2257 #[simd_test(enable = "sse")]
2258 unsafe fn test_mm_cmpnle_ss() {
2259 // TODO: this test is exactly the same as for `_mm_cmpgt_ss`, but there
2260 // must be a difference. It may have to do with behavior in the
2262 // of NaNs (signaling or quiet). If so, we should add tests for those.
2264 let a
= _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2265 let b
= _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2266 let c
= _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2267 let d
= _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2269 let b1
= !0u32; // a.extract(0) > b.extract(0)
2270 let c1
= 0u32; // a.extract(0) > c.extract(0)
2271 let d1
= 0u32; // a.extract(0) > d.extract(0)
2273 let rb
: u32x4
= transmute(_mm_cmpnle_ss(a
, b
));
2274 let eb
: u32x4
= transmute(_mm_setr_ps(transmute(b1
), 2.0, 3.0, 4.0));
2277 let rc
: u32x4
= transmute(_mm_cmpnle_ss(a
, c
));
2278 let ec
: u32x4
= transmute(_mm_setr_ps(transmute(c1
), 2.0, 3.0, 4.0));
2281 let rd
: u32x4
= transmute(_mm_cmpnle_ss(a
, d
));
2282 let ed
: u32x4
= transmute(_mm_setr_ps(transmute(d1
), 2.0, 3.0, 4.0));
2286 #[simd_test(enable = "sse")]
2287 unsafe fn test_mm_cmpngt_ss() {
2288 // TODO: this test is exactly the same as for `_mm_cmple_ss`, but there
2289 // must be a difference. It may have to do with behavior in the
2290 // presence of NaNs (signaling or quiet). If so, we should add tests
2293 let a
= _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2294 let b
= _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2295 let c
= _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2296 let d
= _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2298 let b1
= 0u32; // a.extract(0) <= b.extract(0)
2299 let c1
= !0u32; // a.extract(0) <= c.extract(0)
2300 let d1
= !0u32; // a.extract(0) <= d.extract(0)
2302 let rb
: u32x4
= transmute(_mm_cmpngt_ss(a
, b
));
2303 let eb
: u32x4
= transmute(_mm_setr_ps(transmute(b1
), 2.0, 3.0, 4.0));
2306 let rc
: u32x4
= transmute(_mm_cmpngt_ss(a
, c
));
2307 let ec
: u32x4
= transmute(_mm_setr_ps(transmute(c1
), 2.0, 3.0, 4.0));
2310 let rd
: u32x4
= transmute(_mm_cmpngt_ss(a
, d
));
2311 let ed
: u32x4
= transmute(_mm_setr_ps(transmute(d1
), 2.0, 3.0, 4.0));
2315 #[simd_test(enable = "sse")]
2316 unsafe fn test_mm_cmpnge_ss() {
2317 // TODO: this test is exactly the same as for `_mm_cmplt_ss`, but there
2318 // must be a difference. It may have to do with behavior in the
2319 // presence of NaNs (signaling or quiet). If so, we should add tests
2322 let a
= _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2323 let b
= _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2324 let c
= _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2325 let d
= _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2327 let b1
= 0u32; // a.extract(0) < b.extract(0)
2328 let c1
= 0u32; // a.extract(0) < c.extract(0)
2329 let d1
= !0u32; // a.extract(0) < d.extract(0)
2331 let rb
: u32x4
= transmute(_mm_cmpnge_ss(a
, b
));
2332 let eb
: u32x4
= transmute(_mm_setr_ps(transmute(b1
), 2.0, 3.0, 4.0));
2335 let rc
: u32x4
= transmute(_mm_cmpnge_ss(a
, c
));
2336 let ec
: u32x4
= transmute(_mm_setr_ps(transmute(c1
), 2.0, 3.0, 4.0));
2339 let rd
: u32x4
= transmute(_mm_cmpnge_ss(a
, d
));
2340 let ed
: u32x4
= transmute(_mm_setr_ps(transmute(d1
), 2.0, 3.0, 4.0));
2344 #[simd_test(enable = "sse")]
2345 unsafe fn test_mm_cmpord_ss() {
2346 let a
= _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2347 let b
= _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2348 let c
= _mm_setr_ps(NAN
, 5.0, 6.0, 7.0);
2349 let d
= _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2351 let b1
= !0u32; // a.extract(0) ord b.extract(0)
2352 let c1
= 0u32; // a.extract(0) ord c.extract(0)
2353 let d1
= !0u32; // a.extract(0) ord d.extract(0)
2355 let rb
: u32x4
= transmute(_mm_cmpord_ss(a
, b
));
2356 let eb
: u32x4
= transmute(_mm_setr_ps(transmute(b1
), 2.0, 3.0, 4.0));
2359 let rc
: u32x4
= transmute(_mm_cmpord_ss(a
, c
));
2360 let ec
: u32x4
= transmute(_mm_setr_ps(transmute(c1
), 2.0, 3.0, 4.0));
2363 let rd
: u32x4
= transmute(_mm_cmpord_ss(a
, d
));
2364 let ed
: u32x4
= transmute(_mm_setr_ps(transmute(d1
), 2.0, 3.0, 4.0));
2368 #[simd_test(enable = "sse")]
2369 unsafe fn test_mm_cmpunord_ss() {
2370 let a
= _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2371 let b
= _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2372 let c
= _mm_setr_ps(NAN
, 5.0, 6.0, 7.0);
2373 let d
= _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2375 let b1
= 0u32; // a.extract(0) unord b.extract(0)
2376 let c1
= !0u32; // a.extract(0) unord c.extract(0)
2377 let d1
= 0u32; // a.extract(0) unord d.extract(0)
2379 let rb
: u32x4
= transmute(_mm_cmpunord_ss(a
, b
));
2380 let eb
: u32x4
= transmute(_mm_setr_ps(transmute(b1
), 2.0, 3.0, 4.0));
2383 let rc
: u32x4
= transmute(_mm_cmpunord_ss(a
, c
));
2384 let ec
: u32x4
= transmute(_mm_setr_ps(transmute(c1
), 2.0, 3.0, 4.0));
2387 let rd
: u32x4
= transmute(_mm_cmpunord_ss(a
, d
));
2388 let ed
: u32x4
= transmute(_mm_setr_ps(transmute(d1
), 2.0, 3.0, 4.0));
2392 #[simd_test(enable = "sse")]
2393 unsafe fn test_mm_cmpeq_ps() {
2394 let a
= _mm_setr_ps(10.0, 50.0, 1.0, NAN
);
2395 let b
= _mm_setr_ps(15.0, 20.0, 1.0, NAN
);
2399 let e
= u32x4
::new(fls
, fls
, tru
, fls
);
2400 let r
: u32x4
= transmute(_mm_cmpeq_ps(a
, b
));
2404 #[simd_test(enable = "sse")]
2405 unsafe fn test_mm_cmplt_ps() {
2406 let a
= _mm_setr_ps(10.0, 50.0, 1.0, NAN
);
2407 let b
= _mm_setr_ps(15.0, 20.0, 1.0, NAN
);
2411 let e
= u32x4
::new(tru
, fls
, fls
, fls
);
2412 let r
: u32x4
= transmute(_mm_cmplt_ps(a
, b
));
2416 #[simd_test(enable = "sse")]
2417 unsafe fn test_mm_cmple_ps() {
2418 let a
= _mm_setr_ps(10.0, 50.0, 1.0, 4.0);
2419 let b
= _mm_setr_ps(15.0, 20.0, 1.0, NAN
);
2423 let e
= u32x4
::new(tru
, fls
, tru
, fls
);
2424 let r
: u32x4
= transmute(_mm_cmple_ps(a
, b
));
2428 #[simd_test(enable = "sse")]
2429 unsafe fn test_mm_cmpgt_ps() {
2430 let a
= _mm_setr_ps(10.0, 50.0, 1.0, NAN
);
2431 let b
= _mm_setr_ps(15.0, 20.0, 1.0, 42.0);
2435 let e
= u32x4
::new(fls
, tru
, fls
, fls
);
2436 let r
: u32x4
= transmute(_mm_cmpgt_ps(a
, b
));
2440 #[simd_test(enable = "sse")]
2441 unsafe fn test_mm_cmpge_ps() {
2442 let a
= _mm_setr_ps(10.0, 50.0, 1.0, NAN
);
2443 let b
= _mm_setr_ps(15.0, 20.0, 1.0, 42.0);
2447 let e
= u32x4
::new(fls
, tru
, tru
, fls
);
2448 let r
: u32x4
= transmute(_mm_cmpge_ps(a
, b
));
2452 #[simd_test(enable = "sse")]
2453 unsafe fn test_mm_cmpneq_ps() {
2454 let a
= _mm_setr_ps(10.0, 50.0, 1.0, NAN
);
2455 let b
= _mm_setr_ps(15.0, 20.0, 1.0, NAN
);
2459 let e
= u32x4
::new(tru
, tru
, fls
, tru
);
2460 let r
: u32x4
= transmute(_mm_cmpneq_ps(a
, b
));
2464 #[simd_test(enable = "sse")]
2465 unsafe fn test_mm_cmpnlt_ps() {
2466 let a
= _mm_setr_ps(10.0, 50.0, 1.0, NAN
);
2467 let b
= _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
2471 let e
= u32x4
::new(fls
, tru
, tru
, tru
);
2472 let r
: u32x4
= transmute(_mm_cmpnlt_ps(a
, b
));
2476 #[simd_test(enable = "sse")]
2477 unsafe fn test_mm_cmpnle_ps() {
2478 let a
= _mm_setr_ps(10.0, 50.0, 1.0, NAN
);
2479 let b
= _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
2483 let e
= u32x4
::new(fls
, tru
, fls
, tru
);
2484 let r
: u32x4
= transmute(_mm_cmpnle_ps(a
, b
));
2488 #[simd_test(enable = "sse")]
2489 unsafe fn test_mm_cmpngt_ps() {
2490 let a
= _mm_setr_ps(10.0, 50.0, 1.0, NAN
);
2491 let b
= _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
2495 let e
= u32x4
::new(tru
, fls
, tru
, tru
);
2496 let r
: u32x4
= transmute(_mm_cmpngt_ps(a
, b
));
2500 #[simd_test(enable = "sse")]
2501 unsafe fn test_mm_cmpnge_ps() {
2502 let a
= _mm_setr_ps(10.0, 50.0, 1.0, NAN
);
2503 let b
= _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
2507 let e
= u32x4
::new(tru
, fls
, fls
, tru
);
2508 let r
: u32x4
= transmute(_mm_cmpnge_ps(a
, b
));
2512 #[simd_test(enable = "sse")]
2513 unsafe fn test_mm_cmpord_ps() {
2514 let a
= _mm_setr_ps(10.0, 50.0, NAN
, NAN
);
2515 let b
= _mm_setr_ps(15.0, NAN
, 1.0, NAN
);
2519 let e
= u32x4
::new(tru
, fls
, fls
, fls
);
2520 let r
: u32x4
= transmute(_mm_cmpord_ps(a
, b
));
2524 #[simd_test(enable = "sse")]
2525 unsafe fn test_mm_cmpunord_ps() {
2526 let a
= _mm_setr_ps(10.0, 50.0, NAN
, NAN
);
2527 let b
= _mm_setr_ps(15.0, NAN
, 1.0, NAN
);
2531 let e
= u32x4
::new(fls
, tru
, tru
, tru
);
2532 let r
: u32x4
= transmute(_mm_cmpunord_ps(a
, b
));
2536 #[simd_test(enable = "sse")]
2537 unsafe fn test_mm_comieq_ss() {
2538 let aa
= &[3.0f32, 12.0, 23.0, NAN
];
2539 let bb
= &[3.0f32, 47.5, 1.5, NAN
];
2541 let ee
= &[1i32, 0, 0, 0];
2544 let a
= _mm_setr_ps(aa
[i
], 1.0, 2.0, 3.0);
2545 let b
= _mm_setr_ps(bb
[i
], 0.0, 2.0, 4.0);
2547 let r
= _mm_comieq_ss(a
, b
);
2551 "_mm_comieq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2557 #[simd_test(enable = "sse")]
2558 unsafe fn test_mm_comilt_ss() {
2559 let aa
= &[3.0f32, 12.0, 23.0, NAN
];
2560 let bb
= &[3.0f32, 47.5, 1.5, NAN
];
2562 let ee
= &[0i32, 1, 0, 0];
2565 let a
= _mm_setr_ps(aa
[i
], 1.0, 2.0, 3.0);
2566 let b
= _mm_setr_ps(bb
[i
], 0.0, 2.0, 4.0);
2568 let r
= _mm_comilt_ss(a
, b
);
2572 "_mm_comilt_ss({:?}, {:?}) = {}, expected: {} (i={})",
2578 #[simd_test(enable = "sse")]
2579 unsafe fn test_mm_comile_ss() {
2580 let aa
= &[3.0f32, 12.0, 23.0, NAN
];
2581 let bb
= &[3.0f32, 47.5, 1.5, NAN
];
2583 let ee
= &[1i32, 1, 0, 0];
2586 let a
= _mm_setr_ps(aa
[i
], 1.0, 2.0, 3.0);
2587 let b
= _mm_setr_ps(bb
[i
], 0.0, 2.0, 4.0);
2589 let r
= _mm_comile_ss(a
, b
);
2593 "_mm_comile_ss({:?}, {:?}) = {}, expected: {} (i={})",
2599 #[simd_test(enable = "sse")]
2600 unsafe fn test_mm_comigt_ss() {
2601 let aa
= &[3.0f32, 12.0, 23.0, NAN
];
2602 let bb
= &[3.0f32, 47.5, 1.5, NAN
];
2604 let ee
= &[1i32, 0, 1, 0];
2607 let a
= _mm_setr_ps(aa
[i
], 1.0, 2.0, 3.0);
2608 let b
= _mm_setr_ps(bb
[i
], 0.0, 2.0, 4.0);
2610 let r
= _mm_comige_ss(a
, b
);
2614 "_mm_comige_ss({:?}, {:?}) = {}, expected: {} (i={})",
2620 #[simd_test(enable = "sse")]
2621 unsafe fn test_mm_comineq_ss() {
2622 let aa
= &[3.0f32, 12.0, 23.0, NAN
];
2623 let bb
= &[3.0f32, 47.5, 1.5, NAN
];
2625 let ee
= &[0i32, 1, 1, 1];
2628 let a
= _mm_setr_ps(aa
[i
], 1.0, 2.0, 3.0);
2629 let b
= _mm_setr_ps(bb
[i
], 0.0, 2.0, 4.0);
2631 let r
= _mm_comineq_ss(a
, b
);
2635 "_mm_comineq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2641 #[simd_test(enable = "sse")]
2642 unsafe fn test_mm_ucomieq_ss() {
2643 let aa
= &[3.0f32, 12.0, 23.0, NAN
];
2644 let bb
= &[3.0f32, 47.5, 1.5, NAN
];
2646 let ee
= &[1i32, 0, 0, 0];
2649 let a
= _mm_setr_ps(aa
[i
], 1.0, 2.0, 3.0);
2650 let b
= _mm_setr_ps(bb
[i
], 0.0, 2.0, 4.0);
2652 let r
= _mm_ucomieq_ss(a
, b
);
2656 "_mm_ucomieq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2662 #[simd_test(enable = "sse")]
2663 unsafe fn test_mm_ucomilt_ss() {
2664 let aa
= &[3.0f32, 12.0, 23.0, NAN
];
2665 let bb
= &[3.0f32, 47.5, 1.5, NAN
];
2667 let ee
= &[0i32, 1, 0, 0];
2670 let a
= _mm_setr_ps(aa
[i
], 1.0, 2.0, 3.0);
2671 let b
= _mm_setr_ps(bb
[i
], 0.0, 2.0, 4.0);
2673 let r
= _mm_ucomilt_ss(a
, b
);
2677 "_mm_ucomilt_ss({:?}, {:?}) = {}, expected: {} (i={})",
2683 #[simd_test(enable = "sse")]
2684 unsafe fn test_mm_ucomile_ss() {
2685 let aa
= &[3.0f32, 12.0, 23.0, NAN
];
2686 let bb
= &[3.0f32, 47.5, 1.5, NAN
];
2688 let ee
= &[1i32, 1, 0, 0];
2691 let a
= _mm_setr_ps(aa
[i
], 1.0, 2.0, 3.0);
2692 let b
= _mm_setr_ps(bb
[i
], 0.0, 2.0, 4.0);
2694 let r
= _mm_ucomile_ss(a
, b
);
2698 "_mm_ucomile_ss({:?}, {:?}) = {}, expected: {} (i={})",
2704 #[simd_test(enable = "sse")]
2705 unsafe fn test_mm_ucomigt_ss() {
2706 let aa
= &[3.0f32, 12.0, 23.0, NAN
];
2707 let bb
= &[3.0f32, 47.5, 1.5, NAN
];
2709 let ee
= &[0i32, 0, 1, 0];
2712 let a
= _mm_setr_ps(aa
[i
], 1.0, 2.0, 3.0);
2713 let b
= _mm_setr_ps(bb
[i
], 0.0, 2.0, 4.0);
2715 let r
= _mm_ucomigt_ss(a
, b
);
2719 "_mm_ucomigt_ss({:?}, {:?}) = {}, expected: {} (i={})",
2725 #[simd_test(enable = "sse")]
2726 unsafe fn test_mm_ucomige_ss() {
2727 let aa
= &[3.0f32, 12.0, 23.0, NAN
];
2728 let bb
= &[3.0f32, 47.5, 1.5, NAN
];
2730 let ee
= &[1i32, 0, 1, 0];
2733 let a
= _mm_setr_ps(aa
[i
], 1.0, 2.0, 3.0);
2734 let b
= _mm_setr_ps(bb
[i
], 0.0, 2.0, 4.0);
2736 let r
= _mm_ucomige_ss(a
, b
);
2740 "_mm_ucomige_ss({:?}, {:?}) = {}, expected: {} (i={})",
2746 #[simd_test(enable = "sse")]
2747 unsafe fn test_mm_ucomineq_ss() {
2748 let aa
= &[3.0f32, 12.0, 23.0, NAN
];
2749 let bb
= &[3.0f32, 47.5, 1.5, NAN
];
2751 let ee
= &[0i32, 1, 1, 1];
2754 let a
= _mm_setr_ps(aa
[i
], 1.0, 2.0, 3.0);
2755 let b
= _mm_setr_ps(bb
[i
], 0.0, 2.0, 4.0);
2757 let r
= _mm_ucomineq_ss(a
, b
);
2761 "_mm_ucomineq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2767 #[simd_test(enable = "sse")]
2768 unsafe fn test_mm_comieq_ss_vs_ucomieq_ss() {
2769 // If one of the arguments is a quiet NaN `comieq_ss` should signal an
2770 // Invalid Operation Exception while `ucomieq_ss` should not.
2771 let aa
= &[3.0f32, NAN
, 23.0, NAN
];
2772 let bb
= &[3.0f32, 47.5, NAN
, NAN
];
2774 let ee
= &[1i32, 0, 0, 0];
2775 let exc
= &[0u32, 1, 1, 1]; // Should comieq_ss signal an exception?
2778 let a
= _mm_setr_ps(aa
[i
], 1.0, 2.0, 3.0);
2779 let b
= _mm_setr_ps(bb
[i
], 0.0, 2.0, 4.0);
2781 _MM_SET_EXCEPTION_STATE(0);
2782 let r1
= _mm_comieq_ss(*black_box(&a
), b
);
2783 let s1
= _MM_GET_EXCEPTION_STATE();
2785 _MM_SET_EXCEPTION_STATE(0);
2786 let r2
= _mm_ucomieq_ss(*black_box(&a
), b
);
2787 let s2
= _MM_GET_EXCEPTION_STATE();
2791 "_mm_comeq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2796 "_mm_ucomeq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2801 exc
[i
] * _MM_EXCEPT_INVALID
,
2802 "_mm_comieq_ss() set exception flags: {} (i={})",
2808 0, // ucomieq_ss should not signal an exception
2809 "_mm_ucomieq_ss() set exception flags: {} (i={})",
2816 #[simd_test(enable = "sse")]
2817 unsafe fn test_mm_cvtss_si32() {
2818 let inputs
= &[42.0f32, -3.1, 4.0e10
, 4.0e-20, NAN
, 2147483500.1];
2819 let result
= &[42i32, -3, i32::MIN
, 0, i32::MIN
, 2147483520];
2820 for i
in 0..inputs
.len() {
2821 let x
= _mm_setr_ps(inputs
[i
], 1.0, 3.0, 4.0);
2823 let r
= _mm_cvtss_si32(x
);
2826 "TestCase #{} _mm_cvtss_si32({:?}) = {}, expected: {}",
2832 #[simd_test(enable = "sse")]
2833 unsafe fn test_mm_cvttss_si32() {
2844 (2147483500.1, 2147483520),
2846 for i
in 0..inputs
.len() {
2847 let (xi
, e
) = inputs
[i
];
2848 let x
= _mm_setr_ps(xi
, 1.0, 3.0, 4.0);
2849 let r
= _mm_cvttss_si32(x
);
2852 "TestCase #{} _mm_cvttss_si32({:?}) = {}, expected: {}",
2858 #[simd_test(enable = "sse")]
2859 unsafe fn test_mm_cvtsi32_ss() {
2861 (4555i32, 4555.0f32),
2862 (322223333, 322223330.0),
2864 (-322223333, -322223330.0),
2867 for i
in 0..inputs
.len() {
2868 let (x
, f
) = inputs
[i
];
2869 let a
= _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
2870 let r
= _mm_cvtsi32_ss(a
, x
);
2871 let e
= _mm_setr_ps(f
, 6.0, 7.0, 8.0);
2872 assert_eq_m128(e
, r
);
2876 #[simd_test(enable = "sse")]
2877 unsafe fn test_mm_cvtss_f32() {
2878 let a
= _mm_setr_ps(312.0134, 5.0, 6.0, 7.0);
2879 assert_eq
!(_mm_cvtss_f32(a
), 312.0134);
2882 #[simd_test(enable = "sse")]
2883 unsafe fn test_mm_set_ss() {
2884 let r
= _mm_set_ss(black_box(4.25));
2885 assert_eq_m128(r
, _mm_setr_ps(4.25, 0.0, 0.0, 0.0));
2888 #[simd_test(enable = "sse")]
2889 unsafe fn test_mm_set1_ps() {
2890 let r1
= _mm_set1_ps(black_box(4.25));
2891 let r2
= _mm_set_ps1(black_box(4.25));
2892 assert_eq
!(get_m128(r1
, 0), 4.25);
2893 assert_eq
!(get_m128(r1
, 1), 4.25);
2894 assert_eq
!(get_m128(r1
, 2), 4.25);
2895 assert_eq
!(get_m128(r1
, 3), 4.25);
2896 assert_eq
!(get_m128(r2
, 0), 4.25);
2897 assert_eq
!(get_m128(r2
, 1), 4.25);
2898 assert_eq
!(get_m128(r2
, 2), 4.25);
2899 assert_eq
!(get_m128(r2
, 3), 4.25);
2902 #[simd_test(enable = "sse")]
2903 unsafe fn test_mm_set_ps() {
2910 assert_eq
!(get_m128(r
, 0), 4.0);
2911 assert_eq
!(get_m128(r
, 1), 3.0);
2912 assert_eq
!(get_m128(r
, 2), 2.0);
2913 assert_eq
!(get_m128(r
, 3), 1.0);
2916 #[simd_test(enable = "sse")]
2917 unsafe fn test_mm_setr_ps() {
2918 let r
= _mm_setr_ps(
2924 assert_eq_m128(r
, _mm_setr_ps(1.0, 2.0, 3.0, 4.0));
2927 #[simd_test(enable = "sse")]
2928 unsafe fn test_mm_setzero_ps() {
2929 let r
= *black_box(&_mm_setzero_ps());
2930 assert_eq_m128(r
, _mm_set1_ps(0.0));
2933 #[simd_test(enable = "sse")]
2934 unsafe fn test_mm_shuffle() {
2935 assert_eq
!(_MM_SHUFFLE(0, 1, 1, 3), 0b00_01_01_11);
2936 assert_eq
!(_MM_SHUFFLE(3, 1, 1, 0), 0b11_01_01_00);
2937 assert_eq
!(_MM_SHUFFLE(1, 2, 2, 1), 0b01_10_10_01);
2940 #[simd_test(enable = "sse")]
2941 unsafe fn test_mm_shuffle_ps() {
2942 let a
= _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2943 let b
= _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
2944 let r
= _mm_shuffle_ps
::<0b00_01_01_11>(a
, b
);
2945 assert_eq_m128(r
, _mm_setr_ps(4.0, 2.0, 6.0, 5.0));
2948 #[simd_test(enable = "sse")]
2949 unsafe fn test_mm_unpackhi_ps() {
2950 let a
= _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2951 let b
= _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
2952 let r
= _mm_unpackhi_ps(a
, b
);
2953 assert_eq_m128(r
, _mm_setr_ps(3.0, 7.0, 4.0, 8.0));
2956 #[simd_test(enable = "sse")]
2957 unsafe fn test_mm_unpacklo_ps() {
2958 let a
= _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2959 let b
= _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
2960 let r
= _mm_unpacklo_ps(a
, b
);
2961 assert_eq_m128(r
, _mm_setr_ps(1.0, 5.0, 2.0, 6.0));
2964 #[simd_test(enable = "sse")]
2965 unsafe fn test_mm_movehl_ps() {
2966 let a
= _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2967 let b
= _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
2968 let r
= _mm_movehl_ps(a
, b
);
2969 assert_eq_m128(r
, _mm_setr_ps(7.0, 8.0, 3.0, 4.0));
2972 #[simd_test(enable = "sse")]
2973 unsafe fn test_mm_movelh_ps() {
2974 let a
= _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2975 let b
= _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
2976 let r
= _mm_movelh_ps(a
, b
);
2977 assert_eq_m128(r
, _mm_setr_ps(1.0, 2.0, 5.0, 6.0));
2980 #[simd_test(enable = "sse")]
2981 unsafe fn test_mm_load_ss() {
2983 let r
= _mm_load_ss(&a
as *const f32);
2984 assert_eq_m128(r
, _mm_setr_ps(42.0, 0.0, 0.0, 0.0));
2987 #[simd_test(enable = "sse")]
2988 unsafe fn test_mm_load1_ps() {
2990 let r
= _mm_load1_ps(&a
as *const f32);
2991 assert_eq_m128(r
, _mm_setr_ps(42.0, 42.0, 42.0, 42.0));
2994 #[simd_test(enable = "sse")]
2995 unsafe fn test_mm_load_ps() {
2996 let vals
= &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
2998 let mut p
= vals
.as_ptr();
2999 let mut fixup
= 0.0f32;
3001 // Make sure p is aligned, otherwise we might get a
3002 // (signal: 11, SIGSEGV: invalid memory reference)
3004 let unalignment
= (p
as usize) & 0xf;
3005 if unalignment
!= 0 {
3006 let delta
= (16 - unalignment
) >> 2;
3007 fixup
= delta
as f32;
3011 let r
= _mm_load_ps(p
);
3012 let e
= _mm_add_ps(_mm_setr_ps(1.0, 2.0, 3.0, 4.0), _mm_set1_ps(fixup
));
3013 assert_eq_m128(r
, e
);
3016 #[simd_test(enable = "sse")]
3017 unsafe fn test_mm_loadu_ps() {
3018 let vals
= &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
3019 let p
= vals
.as_ptr().add(3);
3020 let r
= _mm_loadu_ps(black_box(p
));
3021 assert_eq_m128(r
, _mm_setr_ps(4.0, 5.0, 6.0, 7.0));
3024 #[simd_test(enable = "sse")]
3025 unsafe fn test_mm_loadr_ps() {
3026 let vals
= &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
3028 let mut p
= vals
.as_ptr();
3029 let mut fixup
= 0.0f32;
3031 // Make sure p is aligned, otherwise we might get a
3032 // (signal: 11, SIGSEGV: invalid memory reference)
3034 let unalignment
= (p
as usize) & 0xf;
3035 if unalignment
!= 0 {
3036 let delta
= (16 - unalignment
) >> 2;
3037 fixup
= delta
as f32;
3041 let r
= _mm_loadr_ps(p
);
3042 let e
= _mm_add_ps(_mm_setr_ps(4.0, 3.0, 2.0, 1.0), _mm_set1_ps(fixup
));
3043 assert_eq_m128(r
, e
);
3046 #[simd_test(enable = "sse2")]
3047 unsafe fn test_mm_loadu_si64() {
3048 let a
= _mm_setr_epi64x(5, 6);
3049 let r
= _mm_loadu_si64(&a
as *const _
as *const _
);
3050 assert_eq_m128i(r
, _mm_setr_epi64x(5, 0));
3053 #[simd_test(enable = "sse")]
3054 unsafe fn test_mm_store_ss() {
3055 let mut vals
= [0.0f32; 8];
3056 let a
= _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3057 _mm_store_ss(vals
.as_mut_ptr().add(1), a
);
3059 assert_eq
!(vals
[0], 0.0);
3060 assert_eq
!(vals
[1], 1.0);
3061 assert_eq
!(vals
[2], 0.0);
3064 #[simd_test(enable = "sse")]
3065 unsafe fn test_mm_store1_ps() {
3066 let mut vals
= [0.0f32; 8];
3067 let a
= _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3070 let mut p
= vals
.as_mut_ptr();
3072 if (p
as usize) & 0xf != 0 {
3073 ofs
= ((16 - (p
as usize)) & 0xf) >> 2;
3077 _mm_store1_ps(p
, *black_box(&a
));
3080 assert_eq
!(vals
[ofs
- 1], 0.0);
3082 assert_eq
!(vals
[ofs
+ 0], 1.0);
3083 assert_eq
!(vals
[ofs
+ 1], 1.0);
3084 assert_eq
!(vals
[ofs
+ 2], 1.0);
3085 assert_eq
!(vals
[ofs
+ 3], 1.0);
3086 assert_eq
!(vals
[ofs
+ 4], 0.0);
3089 #[simd_test(enable = "sse")]
3090 unsafe fn test_mm_store_ps() {
3091 let mut vals
= [0.0f32; 8];
3092 let a
= _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3095 let mut p
= vals
.as_mut_ptr();
3097 // Align p to 16-byte boundary
3098 if (p
as usize) & 0xf != 0 {
3099 ofs
= ((16 - (p
as usize)) & 0xf) >> 2;
3103 _mm_store_ps(p
, *black_box(&a
));
3106 assert_eq
!(vals
[ofs
- 1], 0.0);
3108 assert_eq
!(vals
[ofs
+ 0], 1.0);
3109 assert_eq
!(vals
[ofs
+ 1], 2.0);
3110 assert_eq
!(vals
[ofs
+ 2], 3.0);
3111 assert_eq
!(vals
[ofs
+ 3], 4.0);
3112 assert_eq
!(vals
[ofs
+ 4], 0.0);
3115 #[simd_test(enable = "sse")]
3116 unsafe fn test_mm_storer_ps() {
3117 let mut vals
= [0.0f32; 8];
3118 let a
= _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3121 let mut p
= vals
.as_mut_ptr();
3123 // Align p to 16-byte boundary
3124 if (p
as usize) & 0xf != 0 {
3125 ofs
= ((16 - (p
as usize)) & 0xf) >> 2;
3129 _mm_storer_ps(p
, *black_box(&a
));
3132 assert_eq
!(vals
[ofs
- 1], 0.0);
3134 assert_eq
!(vals
[ofs
+ 0], 4.0);
3135 assert_eq
!(vals
[ofs
+ 1], 3.0);
3136 assert_eq
!(vals
[ofs
+ 2], 2.0);
3137 assert_eq
!(vals
[ofs
+ 3], 1.0);
3138 assert_eq
!(vals
[ofs
+ 4], 0.0);
3141 #[simd_test(enable = "sse")]
3142 unsafe fn test_mm_storeu_ps() {
3143 let mut vals
= [0.0f32; 8];
3144 let a
= _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3147 let mut p
= vals
.as_mut_ptr();
3149 // Make sure p is **not** aligned to 16-byte boundary
3150 if (p
as usize) & 0xf == 0 {
3155 _mm_storeu_ps(p
, *black_box(&a
));
3158 assert_eq
!(vals
[ofs
- 1], 0.0);
3160 assert_eq
!(vals
[ofs
+ 0], 1.0);
3161 assert_eq
!(vals
[ofs
+ 1], 2.0);
3162 assert_eq
!(vals
[ofs
+ 2], 3.0);
3163 assert_eq
!(vals
[ofs
+ 3], 4.0);
3164 assert_eq
!(vals
[ofs
+ 4], 0.0);
3167 #[simd_test(enable = "sse")]
3168 unsafe fn test_mm_move_ss() {
3169 let a
= _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3170 let b
= _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3172 let r
= _mm_move_ss(a
, b
);
3173 let e
= _mm_setr_ps(5.0, 2.0, 3.0, 4.0);
3174 assert_eq_m128(e
, r
);
3177 #[simd_test(enable = "sse")]
3178 unsafe fn test_mm_movemask_ps() {
3179 let r
= _mm_movemask_ps(_mm_setr_ps(-1.0, 5.0, -5.0, 0.0));
3180 assert_eq
!(r
, 0b0101);
3182 let r
= _mm_movemask_ps(_mm_setr_ps(-1.0, -5.0, -5.0, 0.0));
3183 assert_eq
!(r
, 0b0111);
3186 #[simd_test(enable = "sse")]
3187 unsafe fn test_mm_sfence() {
3191 #[simd_test(enable = "sse")]
3192 unsafe fn test_mm_getcsr_setcsr_1() {
3193 let saved_csr
= _mm_getcsr();
3195 let a
= _mm_setr_ps(1.1e-36, 0.0, 0.0, 1.0);
3196 let b
= _mm_setr_ps(0.001, 0.0, 0.0, 1.0);
3198 _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON
);
3199 let r
= _mm_mul_ps(*black_box(&a
), *black_box(&b
));
3201 _mm_setcsr(saved_csr
);
3203 let exp
= _mm_setr_ps(0.0, 0.0, 0.0, 1.0);
3204 assert_eq_m128(r
, exp
); // first component is a denormalized f32
3207 #[simd_test(enable = "sse")]
3208 unsafe fn test_mm_getcsr_setcsr_2() {
3209 // Same as _mm_setcsr_1 test, but with opposite flag value.
3211 let saved_csr
= _mm_getcsr();
3213 let a
= _mm_setr_ps(1.1e-36, 0.0, 0.0, 1.0);
3214 let b
= _mm_setr_ps(0.001, 0.0, 0.0, 1.0);
3216 _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF
);
3217 let r
= _mm_mul_ps(*black_box(&a
), *black_box(&b
));
3219 _mm_setcsr(saved_csr
);
3221 let exp
= _mm_setr_ps(1.1e-39, 0.0, 0.0, 1.0);
3222 assert_eq_m128(r
, exp
); // first component is a denormalized f32
3225 #[simd_test(enable = "sse")]
3226 unsafe fn test_mm_getcsr_setcsr_underflow() {
3227 _MM_SET_EXCEPTION_STATE(0);
3229 let a
= _mm_setr_ps(1.1e-36, 0.0, 0.0, 1.0);
3230 let b
= _mm_setr_ps(1e
-5, 0.0, 0.0, 1.0);
3232 assert_eq
!(_MM_GET_EXCEPTION_STATE(), 0); // just to be sure
3234 let r
= _mm_mul_ps(*black_box(&a
), *black_box(&b
));
3236 let exp
= _mm_setr_ps(1.1e-41, 0.0, 0.0, 1.0);
3237 assert_eq_m128(r
, exp
);
3239 let underflow
= _MM_GET_EXCEPTION_STATE() & _MM_EXCEPT_UNDERFLOW
!= 0;
3240 assert_eq
!(underflow
, true);
3243 #[simd_test(enable = "sse")]
3244 unsafe fn test_MM_TRANSPOSE4_PS() {
3245 let mut a
= _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3246 let mut b
= _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3247 let mut c
= _mm_setr_ps(9.0, 10.0, 11.0, 12.0);
3248 let mut d
= _mm_setr_ps(13.0, 14.0, 15.0, 16.0);
3250 _MM_TRANSPOSE4_PS(&mut a
, &mut b
, &mut c
, &mut d
);
3252 assert_eq_m128(a
, _mm_setr_ps(1.0, 5.0, 9.0, 13.0));
3253 assert_eq_m128(b
, _mm_setr_ps(2.0, 6.0, 10.0, 14.0));
3254 assert_eq_m128(c
, _mm_setr_ps(3.0, 7.0, 11.0, 15.0));
3255 assert_eq_m128(d
, _mm_setr_ps(4.0, 8.0, 12.0, 16.0));
3263 #[simd_test(enable = "sse")]
3264 unsafe fn test_mm_stream_ps() {
3265 let a
= _mm_set1_ps(7.0);
3266 let mut mem
= Memory { data: [-1.0; 4] }
;
3268 _mm_stream_ps(&mut mem
.data
[0] as *mut f32, a
);
3270 assert_eq
!(mem
.data
[i
], get_m128(a
, i
));