]> git.proxmox.com Git - rustc.git/blame - library/stdarch/crates/core_arch/src/x86/sse.rs
New upstream version 1.49.0+dfsg1
[rustc.git] / library / stdarch / crates / core_arch / src / x86 / sse.rs
CommitLineData
0531ce1d
XL
1//! Streaming SIMD Extensions (SSE)
2
532ac7d7
XL
3use crate::{
4 core_arch::{simd::*, simd_llvm::*, x86::*},
5 intrinsics, mem, ptr,
6};
0531ce1d
XL
7
8#[cfg(test)]
416331ca 9use stdarch_test::assert_instr;
0531ce1d
XL
10
11/// Adds the first component of `a` and `b`, the other components are copied
12/// from `a`.
83c7162d
XL
13///
14/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_ss)
0531ce1d
XL
15#[inline]
16#[target_feature(enable = "sse")]
17#[cfg_attr(test, assert_instr(addss))]
83c7162d 18#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
19pub unsafe fn _mm_add_ss(a: __m128, b: __m128) -> __m128 {
20 addss(a, b)
21}
22
23/// Adds __m128 vectors.
83c7162d
XL
24///
25/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_ps)
0531ce1d
XL
26#[inline]
27#[target_feature(enable = "sse")]
28#[cfg_attr(test, assert_instr(addps))]
83c7162d 29#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
30pub unsafe fn _mm_add_ps(a: __m128, b: __m128) -> __m128 {
31 simd_add(a, b)
32}
33
34/// Subtracts the first component of `b` from `a`, the other components are
35/// copied from `a`.
83c7162d
XL
36///
37/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss)
0531ce1d
XL
38#[inline]
39#[target_feature(enable = "sse")]
40#[cfg_attr(test, assert_instr(subss))]
83c7162d 41#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
42pub unsafe fn _mm_sub_ss(a: __m128, b: __m128) -> __m128 {
43 subss(a, b)
44}
45
46/// Subtracts __m128 vectors.
83c7162d
XL
47///
48/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ps)
0531ce1d
XL
49#[inline]
50#[target_feature(enable = "sse")]
51#[cfg_attr(test, assert_instr(subps))]
83c7162d 52#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
53pub unsafe fn _mm_sub_ps(a: __m128, b: __m128) -> __m128 {
54 simd_sub(a, b)
55}
56
57/// Multiplies the first component of `a` and `b`, the other components are
58/// copied from `a`.
83c7162d
XL
59///
60/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss)
0531ce1d
XL
61#[inline]
62#[target_feature(enable = "sse")]
63#[cfg_attr(test, assert_instr(mulss))]
83c7162d 64#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
65pub unsafe fn _mm_mul_ss(a: __m128, b: __m128) -> __m128 {
66 mulss(a, b)
67}
68
69/// Multiplies __m128 vectors.
83c7162d
XL
70///
71/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ps)
0531ce1d
XL
72#[inline]
73#[target_feature(enable = "sse")]
74#[cfg_attr(test, assert_instr(mulps))]
83c7162d 75#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
76pub unsafe fn _mm_mul_ps(a: __m128, b: __m128) -> __m128 {
77 simd_mul(a, b)
78}
79
80/// Divides the first component of `b` by `a`, the other components are
81/// copied from `a`.
83c7162d
XL
82///
83/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_ss)
0531ce1d
XL
84#[inline]
85#[target_feature(enable = "sse")]
86#[cfg_attr(test, assert_instr(divss))]
83c7162d 87#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
88pub unsafe fn _mm_div_ss(a: __m128, b: __m128) -> __m128 {
89 divss(a, b)
90}
91
92/// Divides __m128 vectors.
83c7162d
XL
93///
94/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_ps)
0531ce1d
XL
95#[inline]
96#[target_feature(enable = "sse")]
97#[cfg_attr(test, assert_instr(divps))]
83c7162d 98#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
99pub unsafe fn _mm_div_ps(a: __m128, b: __m128) -> __m128 {
100 simd_div(a, b)
101}
102
532ac7d7 103/// Returns the square root of the first single-precision (32-bit)
0531ce1d 104/// floating-point element in `a`, the other elements are unchanged.
83c7162d
XL
105///
106/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_ss)
0531ce1d
XL
107#[inline]
108#[target_feature(enable = "sse")]
109#[cfg_attr(test, assert_instr(sqrtss))]
83c7162d 110#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
111pub unsafe fn _mm_sqrt_ss(a: __m128) -> __m128 {
112 sqrtss(a)
113}
114
532ac7d7 115/// Returns the square root of packed single-precision (32-bit) floating-point
0531ce1d 116/// elements in `a`.
83c7162d
XL
117///
118/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_ps)
0531ce1d
XL
119#[inline]
120#[target_feature(enable = "sse")]
121#[cfg_attr(test, assert_instr(sqrtps))]
83c7162d 122#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
123pub unsafe fn _mm_sqrt_ps(a: __m128) -> __m128 {
124 sqrtps(a)
125}
126
532ac7d7 127/// Returns the approximate reciprocal of the first single-precision
0531ce1d 128/// (32-bit) floating-point element in `a`, the other elements are unchanged.
83c7162d
XL
129///
130/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss)
0531ce1d
XL
131#[inline]
132#[target_feature(enable = "sse")]
133#[cfg_attr(test, assert_instr(rcpss))]
83c7162d 134#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
135pub unsafe fn _mm_rcp_ss(a: __m128) -> __m128 {
136 rcpss(a)
137}
138
532ac7d7 139/// Returns the approximate reciprocal of packed single-precision (32-bit)
0531ce1d 140/// floating-point elements in `a`.
83c7162d
XL
141///
142/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ps)
0531ce1d
XL
143#[inline]
144#[target_feature(enable = "sse")]
145#[cfg_attr(test, assert_instr(rcpps))]
83c7162d 146#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
147pub unsafe fn _mm_rcp_ps(a: __m128) -> __m128 {
148 rcpps(a)
149}
150
532ac7d7 151/// Returns the approximate reciprocal square root of the fist single-precision
0531ce1d 152/// (32-bit) floating-point elements in `a`, the other elements are unchanged.
83c7162d
XL
153///
154/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss)
0531ce1d
XL
155#[inline]
156#[target_feature(enable = "sse")]
157#[cfg_attr(test, assert_instr(rsqrtss))]
83c7162d 158#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
159pub unsafe fn _mm_rsqrt_ss(a: __m128) -> __m128 {
160 rsqrtss(a)
161}
162
532ac7d7 163/// Returns the approximate reciprocal square root of packed single-precision
0531ce1d 164/// (32-bit) floating-point elements in `a`.
83c7162d
XL
165///
166/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ps)
0531ce1d
XL
167#[inline]
168#[target_feature(enable = "sse")]
169#[cfg_attr(test, assert_instr(rsqrtps))]
83c7162d 170#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
171pub unsafe fn _mm_rsqrt_ps(a: __m128) -> __m128 {
172 rsqrtps(a)
173}
174
532ac7d7 175/// Compares the first single-precision (32-bit) floating-point element of `a`
0531ce1d
XL
176/// and `b`, and return the minimum value in the first element of the return
177/// value, the other elements are copied from `a`.
83c7162d
XL
178///
179/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_ss)
0531ce1d
XL
180#[inline]
181#[target_feature(enable = "sse")]
182#[cfg_attr(test, assert_instr(minss))]
83c7162d 183#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
184pub unsafe fn _mm_min_ss(a: __m128, b: __m128) -> __m128 {
185 minss(a, b)
186}
187
532ac7d7 188/// Compares packed single-precision (32-bit) floating-point elements in `a` and
0531ce1d 189/// `b`, and return the corresponding minimum values.
83c7162d
XL
190///
191/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_ps)
0531ce1d
XL
192#[inline]
193#[target_feature(enable = "sse")]
194#[cfg_attr(test, assert_instr(minps))]
83c7162d 195#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 196pub unsafe fn _mm_min_ps(a: __m128, b: __m128) -> __m128 {
74b04a01 197 // See the `test_mm_min_ps` test why this can't be implemented using `simd_fmin`.
0531ce1d
XL
198 minps(a, b)
199}
200
532ac7d7 201/// Compares the first single-precision (32-bit) floating-point element of `a`
0531ce1d
XL
202/// and `b`, and return the maximum value in the first element of the return
203/// value, the other elements are copied from `a`.
83c7162d
XL
204///
205/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_ss)
0531ce1d
XL
206#[inline]
207#[target_feature(enable = "sse")]
208#[cfg_attr(test, assert_instr(maxss))]
83c7162d 209#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
210pub unsafe fn _mm_max_ss(a: __m128, b: __m128) -> __m128 {
211 maxss(a, b)
212}
213
532ac7d7 214/// Compares packed single-precision (32-bit) floating-point elements in `a` and
0531ce1d 215/// `b`, and return the corresponding maximum values.
83c7162d
XL
216///
217/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_ps)
0531ce1d
XL
218#[inline]
219#[target_feature(enable = "sse")]
220#[cfg_attr(test, assert_instr(maxps))]
83c7162d 221#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 222pub unsafe fn _mm_max_ps(a: __m128, b: __m128) -> __m128 {
74b04a01 223 // See the `test_mm_min_ps` test why this can't be implemented using `simd_fmax`.
0531ce1d
XL
224 maxps(a, b)
225}
226
227/// Bitwise AND of packed single-precision (32-bit) floating-point elements.
83c7162d
XL
228///
229/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_ps)
0531ce1d
XL
230#[inline]
231#[target_feature(enable = "sse")]
232// i586 only seems to generate plain `and` instructions, so ignore it.
8faf50e0
XL
233#[cfg_attr(
234 all(test, any(target_arch = "x86_64", target_feature = "sse2")),
235 assert_instr(andps)
236)]
83c7162d 237#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
238pub unsafe fn _mm_and_ps(a: __m128, b: __m128) -> __m128 {
239 let a: __m128i = mem::transmute(a);
240 let b: __m128i = mem::transmute(b);
241 mem::transmute(simd_and(a, b))
242}
243
244/// Bitwise AND-NOT of packed single-precision (32-bit) floating-point
245/// elements.
246///
247/// Computes `!a & b` for each bit in `a` and `b`.
83c7162d
XL
248///
249/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_ps)
0531ce1d
XL
250#[inline]
251#[target_feature(enable = "sse")]
252// i586 only seems to generate plain `not` and `and` instructions, so ignore
253// it.
8faf50e0
XL
254#[cfg_attr(
255 all(test, any(target_arch = "x86_64", target_feature = "sse2")),
256 assert_instr(andnps)
257)]
83c7162d 258#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
259pub unsafe fn _mm_andnot_ps(a: __m128, b: __m128) -> __m128 {
260 let a: __m128i = mem::transmute(a);
261 let b: __m128i = mem::transmute(b);
262 let mask: __m128i = mem::transmute(i32x4::splat(-1));
263 mem::transmute(simd_and(simd_xor(mask, a), b))
264}
265
266/// Bitwise OR of packed single-precision (32-bit) floating-point elements.
83c7162d
XL
267///
268/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_or_ps)
0531ce1d
XL
269#[inline]
270#[target_feature(enable = "sse")]
271// i586 only seems to generate plain `or` instructions, so we ignore it.
8faf50e0
XL
272#[cfg_attr(
273 all(test, any(target_arch = "x86_64", target_feature = "sse2")),
274 assert_instr(orps)
275)]
83c7162d 276#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
277pub unsafe fn _mm_or_ps(a: __m128, b: __m128) -> __m128 {
278 let a: __m128i = mem::transmute(a);
279 let b: __m128i = mem::transmute(b);
280 mem::transmute(simd_or(a, b))
281}
282
283/// Bitwise exclusive OR of packed single-precision (32-bit) floating-point
284/// elements.
83c7162d
XL
285///
286/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_ps)
0531ce1d
XL
287#[inline]
288#[target_feature(enable = "sse")]
289// i586 only seems to generate plain `xor` instructions, so we ignore it.
8faf50e0
XL
290#[cfg_attr(
291 all(test, any(target_arch = "x86_64", target_feature = "sse2")),
292 assert_instr(xorps)
293)]
83c7162d 294#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
295pub unsafe fn _mm_xor_ps(a: __m128, b: __m128) -> __m128 {
296 let a: __m128i = mem::transmute(a);
297 let b: __m128i = mem::transmute(b);
298 mem::transmute(simd_xor(a, b))
299}
300
532ac7d7 301/// Compares the lowest `f32` of both inputs for equality. The lowest 32 bits of
0531ce1d
XL
302/// the result will be `0xffffffff` if the two inputs are equal, or `0`
303/// otherwise. The upper 96 bits of the result are the upper 96 bits of `a`.
83c7162d
XL
304///
305/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_ss)
0531ce1d
XL
306#[inline]
307#[target_feature(enable = "sse")]
308#[cfg_attr(test, assert_instr(cmpeqss))]
83c7162d 309#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
310pub unsafe fn _mm_cmpeq_ss(a: __m128, b: __m128) -> __m128 {
311 cmpss(a, b, 0)
312}
313
532ac7d7 314/// Compares the lowest `f32` of both inputs for less than. The lowest 32 bits
0531ce1d
XL
315/// of the result will be `0xffffffff` if `a.extract(0)` is less than
316/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
317/// upper 96 bits of `a`.
83c7162d
XL
318///
319/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_ss)
0531ce1d
XL
320#[inline]
321#[target_feature(enable = "sse")]
322#[cfg_attr(test, assert_instr(cmpltss))]
83c7162d 323#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
324pub unsafe fn _mm_cmplt_ss(a: __m128, b: __m128) -> __m128 {
325 cmpss(a, b, 1)
326}
327
532ac7d7 328/// Compares the lowest `f32` of both inputs for less than or equal. The lowest
0531ce1d
XL
329/// 32 bits of the result will be `0xffffffff` if `a.extract(0)` is less than
330/// or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result
331/// are the upper 96 bits of `a`.
83c7162d
XL
332///
333/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_ss)
0531ce1d
XL
334#[inline]
335#[target_feature(enable = "sse")]
336#[cfg_attr(test, assert_instr(cmpless))]
83c7162d 337#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
338pub unsafe fn _mm_cmple_ss(a: __m128, b: __m128) -> __m128 {
339 cmpss(a, b, 2)
340}
341
532ac7d7 342/// Compares the lowest `f32` of both inputs for greater than. The lowest 32
0531ce1d
XL
343/// bits of the result will be `0xffffffff` if `a.extract(0)` is greater
344/// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result
345/// are the upper 96 bits of `a`.
83c7162d
XL
346///
347/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_ss)
0531ce1d
XL
348#[inline]
349#[target_feature(enable = "sse")]
350#[cfg_attr(test, assert_instr(cmpltss))]
83c7162d 351#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
352pub unsafe fn _mm_cmpgt_ss(a: __m128, b: __m128) -> __m128 {
353 simd_shuffle4(a, cmpss(b, a, 1), [4, 1, 2, 3])
354}
355
532ac7d7 356/// Compares the lowest `f32` of both inputs for greater than or equal. The
0531ce1d
XL
357/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is
358/// greater than or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits
359/// of the result are the upper 96 bits of `a`.
83c7162d
XL
360///
361/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_ss)
0531ce1d
XL
362#[inline]
363#[target_feature(enable = "sse")]
364#[cfg_attr(test, assert_instr(cmpless))]
83c7162d 365#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
366pub unsafe fn _mm_cmpge_ss(a: __m128, b: __m128) -> __m128 {
367 simd_shuffle4(a, cmpss(b, a, 2), [4, 1, 2, 3])
368}
369
532ac7d7 370/// Compares the lowest `f32` of both inputs for inequality. The lowest 32 bits
0531ce1d
XL
371/// of the result will be `0xffffffff` if `a.extract(0)` is not equal to
372/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
373/// upper 96 bits of `a`.
83c7162d
XL
374///
375/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_ss)
0531ce1d
XL
376#[inline]
377#[target_feature(enable = "sse")]
378#[cfg_attr(test, assert_instr(cmpneqss))]
83c7162d 379#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
380pub unsafe fn _mm_cmpneq_ss(a: __m128, b: __m128) -> __m128 {
381 cmpss(a, b, 4)
382}
383
532ac7d7 384/// Compares the lowest `f32` of both inputs for not-less-than. The lowest 32
0531ce1d
XL
385/// bits of the result will be `0xffffffff` if `a.extract(0)` is not less than
386/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
387/// upper 96 bits of `a`.
83c7162d
XL
388///
389/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_ss)
0531ce1d
XL
390#[inline]
391#[target_feature(enable = "sse")]
392#[cfg_attr(test, assert_instr(cmpnltss))]
83c7162d 393#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
394pub unsafe fn _mm_cmpnlt_ss(a: __m128, b: __m128) -> __m128 {
395 cmpss(a, b, 5)
396}
397
532ac7d7 398/// Compares the lowest `f32` of both inputs for not-less-than-or-equal. The
0531ce1d
XL
399/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not
400/// less than or equal to `b.extract(0)`, or `0` otherwise. The upper 96 bits
401/// of the result are the upper 96 bits of `a`.
83c7162d
XL
402///
403/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_ss)
0531ce1d
XL
404#[inline]
405#[target_feature(enable = "sse")]
406#[cfg_attr(test, assert_instr(cmpnless))]
83c7162d 407#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
408pub unsafe fn _mm_cmpnle_ss(a: __m128, b: __m128) -> __m128 {
409 cmpss(a, b, 6)
410}
411
532ac7d7 412/// Compares the lowest `f32` of both inputs for not-greater-than. The lowest 32
0531ce1d
XL
413/// bits of the result will be `0xffffffff` if `a.extract(0)` is not greater
414/// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are
415/// the upper 96 bits of `a`.
83c7162d
XL
416///
417/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpngt_ss)
0531ce1d
XL
418#[inline]
419#[target_feature(enable = "sse")]
420#[cfg_attr(test, assert_instr(cmpnltss))]
83c7162d 421#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
422pub unsafe fn _mm_cmpngt_ss(a: __m128, b: __m128) -> __m128 {
423 simd_shuffle4(a, cmpss(b, a, 5), [4, 1, 2, 3])
424}
425
532ac7d7 426/// Compares the lowest `f32` of both inputs for not-greater-than-or-equal. The
0531ce1d
XL
427/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not
428/// greater than or equal to `b.extract(0)`, or `0` otherwise. The upper 96
429/// bits of the result are the upper 96 bits of `a`.
83c7162d
XL
430///
431/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_ss)
0531ce1d
XL
432#[inline]
433#[target_feature(enable = "sse")]
434#[cfg_attr(test, assert_instr(cmpnless))]
83c7162d 435#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
436pub unsafe fn _mm_cmpnge_ss(a: __m128, b: __m128) -> __m128 {
437 simd_shuffle4(a, cmpss(b, a, 6), [4, 1, 2, 3])
438}
439
532ac7d7 440/// Checks if the lowest `f32` of both inputs are ordered. The lowest 32 bits of
0531ce1d
XL
441/// the result will be `0xffffffff` if neither of `a.extract(0)` or
442/// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result
443/// are the upper 96 bits of `a`.
83c7162d
XL
444///
445/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_ss)
0531ce1d
XL
446#[inline]
447#[target_feature(enable = "sse")]
448#[cfg_attr(test, assert_instr(cmpordss))]
83c7162d 449#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
450pub unsafe fn _mm_cmpord_ss(a: __m128, b: __m128) -> __m128 {
451 cmpss(a, b, 7)
452}
453
532ac7d7 454/// Checks if the lowest `f32` of both inputs are unordered. The lowest 32 bits
0531ce1d
XL
455/// of the result will be `0xffffffff` if any of `a.extract(0)` or
456/// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result
457/// are the upper 96 bits of `a`.
83c7162d
XL
458///
459/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_ss)
0531ce1d
XL
460#[inline]
461#[target_feature(enable = "sse")]
462#[cfg_attr(test, assert_instr(cmpunordss))]
83c7162d 463#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
464pub unsafe fn _mm_cmpunord_ss(a: __m128, b: __m128) -> __m128 {
465 cmpss(a, b, 3)
466}
467
532ac7d7 468/// Compares each of the four floats in `a` to the corresponding element in `b`.
0531ce1d
XL
469/// The result in the output vector will be `0xffffffff` if the input elements
470/// were equal, or `0` otherwise.
83c7162d
XL
471///
472/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_ps)
0531ce1d
XL
473#[inline]
474#[target_feature(enable = "sse")]
475#[cfg_attr(test, assert_instr(cmpeqps))]
83c7162d 476#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
477pub unsafe fn _mm_cmpeq_ps(a: __m128, b: __m128) -> __m128 {
478 cmpps(a, b, 0)
479}
480
532ac7d7 481/// Compares each of the four floats in `a` to the corresponding element in `b`.
0531ce1d
XL
482/// The result in the output vector will be `0xffffffff` if the input element
483/// in `a` is less than the corresponding element in `b`, or `0` otherwise.
83c7162d
XL
484///
485/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_ps)
0531ce1d
XL
486#[inline]
487#[target_feature(enable = "sse")]
488#[cfg_attr(test, assert_instr(cmpltps))]
83c7162d 489#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
490pub unsafe fn _mm_cmplt_ps(a: __m128, b: __m128) -> __m128 {
491 cmpps(a, b, 1)
492}
493
532ac7d7 494/// Compares each of the four floats in `a` to the corresponding element in `b`.
0531ce1d
XL
495/// The result in the output vector will be `0xffffffff` if the input element
496/// in `a` is less than or equal to the corresponding element in `b`, or `0`
497/// otherwise.
83c7162d
XL
498///
499/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_ps)
0531ce1d
XL
500#[inline]
501#[target_feature(enable = "sse")]
502#[cfg_attr(test, assert_instr(cmpleps))]
83c7162d 503#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
504pub unsafe fn _mm_cmple_ps(a: __m128, b: __m128) -> __m128 {
505 cmpps(a, b, 2)
506}
507
532ac7d7 508/// Compares each of the four floats in `a` to the corresponding element in `b`.
0531ce1d
XL
509/// The result in the output vector will be `0xffffffff` if the input element
510/// in `a` is greater than the corresponding element in `b`, or `0` otherwise.
83c7162d
XL
511///
512/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_ps)
0531ce1d
XL
513#[inline]
514#[target_feature(enable = "sse")]
515#[cfg_attr(test, assert_instr(cmpltps))]
83c7162d 516#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
517pub unsafe fn _mm_cmpgt_ps(a: __m128, b: __m128) -> __m128 {
518 cmpps(b, a, 1)
519}
520
532ac7d7 521/// Compares each of the four floats in `a` to the corresponding element in `b`.
0531ce1d
XL
522/// The result in the output vector will be `0xffffffff` if the input element
523/// in `a` is greater than or equal to the corresponding element in `b`, or `0`
524/// otherwise.
83c7162d
XL
525///
526/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_ps)
0531ce1d
XL
527#[inline]
528#[target_feature(enable = "sse")]
529#[cfg_attr(test, assert_instr(cmpleps))]
83c7162d 530#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
531pub unsafe fn _mm_cmpge_ps(a: __m128, b: __m128) -> __m128 {
532 cmpps(b, a, 2)
533}
534
532ac7d7 535/// Compares each of the four floats in `a` to the corresponding element in `b`.
0531ce1d 536/// The result in the output vector will be `0xffffffff` if the input elements
532ac7d7 537/// are **not** equal, or `0` otherwise.
83c7162d
XL
538///
539/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_ps)
0531ce1d
XL
540#[inline]
541#[target_feature(enable = "sse")]
542#[cfg_attr(test, assert_instr(cmpneqps))]
83c7162d 543#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
544pub unsafe fn _mm_cmpneq_ps(a: __m128, b: __m128) -> __m128 {
545 cmpps(a, b, 4)
546}
547
532ac7d7 548/// Compares each of the four floats in `a` to the corresponding element in `b`.
0531ce1d 549/// The result in the output vector will be `0xffffffff` if the input element
532ac7d7 550/// in `a` is **not** less than the corresponding element in `b`, or `0`
0531ce1d 551/// otherwise.
83c7162d
XL
552///
553/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_ps)
0531ce1d
XL
554#[inline]
555#[target_feature(enable = "sse")]
556#[cfg_attr(test, assert_instr(cmpnltps))]
83c7162d 557#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
558pub unsafe fn _mm_cmpnlt_ps(a: __m128, b: __m128) -> __m128 {
559 cmpps(a, b, 5)
560}
561
532ac7d7 562/// Compares each of the four floats in `a` to the corresponding element in `b`.
0531ce1d 563/// The result in the output vector will be `0xffffffff` if the input element
532ac7d7 564/// in `a` is **not** less than or equal to the corresponding element in `b`, or
0531ce1d 565/// `0` otherwise.
83c7162d
XL
566///
567/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_ps)
0531ce1d
XL
568#[inline]
569#[target_feature(enable = "sse")]
570#[cfg_attr(test, assert_instr(cmpnleps))]
83c7162d 571#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
572pub unsafe fn _mm_cmpnle_ps(a: __m128, b: __m128) -> __m128 {
573 cmpps(a, b, 6)
574}
575
532ac7d7 576/// Compares each of the four floats in `a` to the corresponding element in `b`.
0531ce1d 577/// The result in the output vector will be `0xffffffff` if the input element
532ac7d7 578/// in `a` is **not** greater than the corresponding element in `b`, or `0`
0531ce1d 579/// otherwise.
83c7162d
XL
580///
581/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpngt_ps)
0531ce1d
XL
582#[inline]
583#[target_feature(enable = "sse")]
584#[cfg_attr(test, assert_instr(cmpnltps))]
83c7162d 585#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
586pub unsafe fn _mm_cmpngt_ps(a: __m128, b: __m128) -> __m128 {
587 cmpps(b, a, 5)
588}
589
532ac7d7 590/// Compares each of the four floats in `a` to the corresponding element in `b`.
0531ce1d 591/// The result in the output vector will be `0xffffffff` if the input element
532ac7d7 592/// in `a` is **not** greater than or equal to the corresponding element in `b`,
0531ce1d 593/// or `0` otherwise.
83c7162d
XL
594///
595/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_ps)
0531ce1d
XL
596#[inline]
597#[target_feature(enable = "sse")]
598#[cfg_attr(test, assert_instr(cmpnleps))]
83c7162d 599#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
600pub unsafe fn _mm_cmpnge_ps(a: __m128, b: __m128) -> __m128 {
601 cmpps(b, a, 6)
602}
603
532ac7d7 604/// Compares each of the four floats in `a` to the corresponding element in `b`.
0531ce1d
XL
605/// Returns four floats that have one of two possible bit patterns. The element
606/// in the output vector will be `0xffffffff` if the input elements in `a` and
607/// `b` are ordered (i.e., neither of them is a NaN), or 0 otherwise.
83c7162d
XL
608///
609/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_ps)
0531ce1d
XL
610#[inline]
611#[target_feature(enable = "sse")]
612#[cfg_attr(test, assert_instr(cmpordps))]
83c7162d 613#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
614pub unsafe fn _mm_cmpord_ps(a: __m128, b: __m128) -> __m128 {
615 cmpps(b, a, 7)
616}
617
532ac7d7 618/// Compares each of the four floats in `a` to the corresponding element in `b`.
0531ce1d
XL
619/// Returns four floats that have one of two possible bit patterns. The element
620/// in the output vector will be `0xffffffff` if the input elements in `a` and
621/// `b` are unordered (i.e., at least on of them is a NaN), or 0 otherwise.
83c7162d
XL
622///
623/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_ps)
0531ce1d
XL
624#[inline]
625#[target_feature(enable = "sse")]
626#[cfg_attr(test, assert_instr(cmpunordps))]
83c7162d 627#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
628pub unsafe fn _mm_cmpunord_ps(a: __m128, b: __m128) -> __m128 {
629 cmpps(b, a, 3)
630}
631
532ac7d7 632/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
0531ce1d 633/// `1` if they are equal, or `0` otherwise.
83c7162d
XL
634///
635/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comieq_ss)
0531ce1d
XL
636#[inline]
637#[target_feature(enable = "sse")]
638#[cfg_attr(test, assert_instr(comiss))]
83c7162d 639#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
640pub unsafe fn _mm_comieq_ss(a: __m128, b: __m128) -> i32 {
641 comieq_ss(a, b)
642}
643
532ac7d7 644/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
0531ce1d 645/// `1` if the value from `a` is less than the one from `b`, or `0` otherwise.
83c7162d
XL
646///
647/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comilt_ss)
0531ce1d
XL
648#[inline]
649#[target_feature(enable = "sse")]
650#[cfg_attr(test, assert_instr(comiss))]
83c7162d 651#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
652pub unsafe fn _mm_comilt_ss(a: __m128, b: __m128) -> i32 {
653 comilt_ss(a, b)
654}
655
532ac7d7 656/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
0531ce1d
XL
657/// `1` if the value from `a` is less than or equal to the one from `b`, or `0`
658/// otherwise.
83c7162d
XL
659///
660/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comile_ss)
0531ce1d
XL
661#[inline]
662#[target_feature(enable = "sse")]
663#[cfg_attr(test, assert_instr(comiss))]
83c7162d 664#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
665pub unsafe fn _mm_comile_ss(a: __m128, b: __m128) -> i32 {
666 comile_ss(a, b)
667}
668
532ac7d7 669/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
0531ce1d
XL
670/// `1` if the value from `a` is greater than the one from `b`, or `0`
671/// otherwise.
83c7162d
XL
672///
673/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comigt_ss)
0531ce1d
XL
674#[inline]
675#[target_feature(enable = "sse")]
676#[cfg_attr(test, assert_instr(comiss))]
83c7162d 677#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
678pub unsafe fn _mm_comigt_ss(a: __m128, b: __m128) -> i32 {
679 comigt_ss(a, b)
680}
681
532ac7d7 682/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
0531ce1d
XL
683/// `1` if the value from `a` is greater than or equal to the one from `b`, or
684/// `0` otherwise.
83c7162d
XL
685///
686/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comige_ss)
0531ce1d
XL
687#[inline]
688#[target_feature(enable = "sse")]
689#[cfg_attr(test, assert_instr(comiss))]
83c7162d 690#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
691pub unsafe fn _mm_comige_ss(a: __m128, b: __m128) -> i32 {
692 comige_ss(a, b)
693}
694
532ac7d7
XL
695/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
696/// `1` if they are **not** equal, or `0` otherwise.
83c7162d
XL
697///
698/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comineq_ss)
0531ce1d
XL
699#[inline]
700#[target_feature(enable = "sse")]
701#[cfg_attr(test, assert_instr(comiss))]
83c7162d 702#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
703pub unsafe fn _mm_comineq_ss(a: __m128, b: __m128) -> i32 {
704 comineq_ss(a, b)
705}
706
532ac7d7 707/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
0531ce1d
XL
708/// `1` if they are equal, or `0` otherwise. This instruction will not signal
709/// an exception if either argument is a quiet NaN.
83c7162d
XL
710///
711/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomieq_ss)
0531ce1d
XL
712#[inline]
713#[target_feature(enable = "sse")]
714#[cfg_attr(test, assert_instr(ucomiss))]
83c7162d 715#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
716pub unsafe fn _mm_ucomieq_ss(a: __m128, b: __m128) -> i32 {
717 ucomieq_ss(a, b)
718}
719
532ac7d7 720/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
0531ce1d
XL
721/// `1` if the value from `a` is less than the one from `b`, or `0` otherwise.
722/// This instruction will not signal an exception if either argument is a quiet
723/// NaN.
83c7162d
XL
724///
725/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomilt_ss)
0531ce1d
XL
726#[inline]
727#[target_feature(enable = "sse")]
728#[cfg_attr(test, assert_instr(ucomiss))]
83c7162d 729#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
730pub unsafe fn _mm_ucomilt_ss(a: __m128, b: __m128) -> i32 {
731 ucomilt_ss(a, b)
732}
733
532ac7d7 734/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
0531ce1d
XL
735/// `1` if the value from `a` is less than or equal to the one from `b`, or `0`
736/// otherwise. This instruction will not signal an exception if either argument
737/// is a quiet NaN.
83c7162d
XL
738///
739/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomile_ss)
0531ce1d
XL
740#[inline]
741#[target_feature(enable = "sse")]
742#[cfg_attr(test, assert_instr(ucomiss))]
83c7162d 743#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
744pub unsafe fn _mm_ucomile_ss(a: __m128, b: __m128) -> i32 {
745 ucomile_ss(a, b)
746}
747
532ac7d7 748/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
0531ce1d
XL
749/// `1` if the value from `a` is greater than the one from `b`, or `0`
750/// otherwise. This instruction will not signal an exception if either argument
751/// is a quiet NaN.
83c7162d
XL
752///
753/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomigt_ss)
0531ce1d
XL
754#[inline]
755#[target_feature(enable = "sse")]
756#[cfg_attr(test, assert_instr(ucomiss))]
83c7162d 757#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
758pub unsafe fn _mm_ucomigt_ss(a: __m128, b: __m128) -> i32 {
759 ucomigt_ss(a, b)
760}
761
532ac7d7 762/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
0531ce1d
XL
763/// `1` if the value from `a` is greater than or equal to the one from `b`, or
764/// `0` otherwise. This instruction will not signal an exception if either
765/// argument is a quiet NaN.
83c7162d
XL
766///
767/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomige_ss)
0531ce1d
XL
768#[inline]
769#[target_feature(enable = "sse")]
770#[cfg_attr(test, assert_instr(ucomiss))]
83c7162d 771#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
772pub unsafe fn _mm_ucomige_ss(a: __m128, b: __m128) -> i32 {
773 ucomige_ss(a, b)
774}
775
532ac7d7
XL
776/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
777/// `1` if they are **not** equal, or `0` otherwise. This instruction will not
0531ce1d 778/// signal an exception if either argument is a quiet NaN.
83c7162d
XL
779///
780/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomineq_ss)
0531ce1d
XL
781#[inline]
782#[target_feature(enable = "sse")]
783#[cfg_attr(test, assert_instr(ucomiss))]
83c7162d 784#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
785pub unsafe fn _mm_ucomineq_ss(a: __m128, b: __m128) -> i32 {
786 ucomineq_ss(a, b)
787}
788
532ac7d7 789/// Converts the lowest 32 bit float in the input vector to a 32 bit integer.
0531ce1d
XL
790///
791/// The result is rounded according to the current rounding mode. If the result
792/// cannot be represented as a 32 bit integer the result will be `0x8000_0000`
ba9703b0 793/// (`i32::MIN`) or an invalid operation floating point exception if
0531ce1d
XL
794/// unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)).
795///
796/// This corresponds to the `CVTSS2SI` instruction (with 32 bit output).
83c7162d
XL
797///
798/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si32)
0531ce1d
XL
799#[inline]
800#[target_feature(enable = "sse")]
801#[cfg_attr(test, assert_instr(cvtss2si))]
83c7162d 802#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
803pub unsafe fn _mm_cvtss_si32(a: __m128) -> i32 {
804 cvtss2si(a)
805}
806
807/// Alias for [`_mm_cvtss_si32`](fn._mm_cvtss_si32.html).
83c7162d
XL
808///
809/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si)
0531ce1d
XL
810#[inline]
811#[target_feature(enable = "sse")]
812#[cfg_attr(test, assert_instr(cvtss2si))]
83c7162d 813#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
814pub unsafe fn _mm_cvt_ss2si(a: __m128) -> i32 {
815 _mm_cvtss_si32(a)
816}
817
532ac7d7 818/// Converts the lowest 32 bit float in the input vector to a 32 bit integer
0531ce1d
XL
819/// with
820/// truncation.
821///
822/// The result is rounded always using truncation (round towards zero). If the
823/// result cannot be represented as a 32 bit integer the result will be
ba9703b0 824/// `0x8000_0000` (`i32::MIN`) or an invalid operation floating point
0531ce1d
XL
825/// exception if unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)).
826///
827/// This corresponds to the `CVTTSS2SI` instruction (with 32 bit output).
83c7162d
XL
828///
829/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si32)
0531ce1d
XL
830#[inline]
831#[target_feature(enable = "sse")]
832#[cfg_attr(test, assert_instr(cvttss2si))]
83c7162d 833#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
834pub unsafe fn _mm_cvttss_si32(a: __m128) -> i32 {
835 cvttss2si(a)
836}
837
838/// Alias for [`_mm_cvttss_si32`](fn._mm_cvttss_si32.html).
83c7162d
XL
839///
840/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ss2si)
0531ce1d
XL
841#[inline]
842#[target_feature(enable = "sse")]
843#[cfg_attr(test, assert_instr(cvttss2si))]
83c7162d 844#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
845pub unsafe fn _mm_cvtt_ss2si(a: __m128) -> i32 {
846 _mm_cvttss_si32(a)
847}
848
532ac7d7 849/// Extracts the lowest 32 bit float from the input vector.
83c7162d
XL
850///
851/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32)
0531ce1d
XL
852#[inline]
853#[target_feature(enable = "sse")]
854// No point in using assert_instrs. In Unix x86_64 calling convention this is a
855// no-op, and on Windows it's just a `mov`.
83c7162d 856#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
857pub unsafe fn _mm_cvtss_f32(a: __m128) -> f32 {
858 simd_extract(a, 0)
859}
860
532ac7d7 861/// Converts a 32 bit integer to a 32 bit float. The result vector is the input
0531ce1d
XL
862/// vector `a` with the lowest 32 bit float replaced by the converted integer.
863///
864/// This intrinsic corresponds to the `CVTSI2SS` instruction (with 32 bit
865/// input).
83c7162d
XL
866///
867/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_ss)
0531ce1d
XL
868#[inline]
869#[target_feature(enable = "sse")]
870#[cfg_attr(test, assert_instr(cvtsi2ss))]
83c7162d 871#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
872pub unsafe fn _mm_cvtsi32_ss(a: __m128, b: i32) -> __m128 {
873 cvtsi2ss(a, b)
874}
875
876/// Alias for [`_mm_cvtsi32_ss`](fn._mm_cvtsi32_ss.html).
83c7162d
XL
877///
878/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss)
0531ce1d
XL
879#[inline]
880#[target_feature(enable = "sse")]
881#[cfg_attr(test, assert_instr(cvtsi2ss))]
83c7162d 882#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
883pub unsafe fn _mm_cvt_si2ss(a: __m128, b: i32) -> __m128 {
884 _mm_cvtsi32_ss(a, b)
885}
886
887/// Construct a `__m128` with the lowest element set to `a` and the rest set to
888/// zero.
83c7162d
XL
889///
890/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss)
0531ce1d
XL
891#[inline]
892#[target_feature(enable = "sse")]
893#[cfg_attr(test, assert_instr(movss))]
83c7162d 894#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
895pub unsafe fn _mm_set_ss(a: f32) -> __m128 {
896 __m128(a, 0.0, 0.0, 0.0)
897}
898
899/// Construct a `__m128` with all element set to `a`.
83c7162d
XL
900///
901/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_ps)
0531ce1d
XL
902#[inline]
903#[target_feature(enable = "sse")]
904#[cfg_attr(test, assert_instr(shufps))]
83c7162d 905#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
906pub unsafe fn _mm_set1_ps(a: f32) -> __m128 {
907 __m128(a, a, a, a)
908}
909
910/// Alias for [`_mm_set1_ps`](fn._mm_set1_ps.html)
83c7162d
XL
911///
912/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ps1)
0531ce1d
XL
913#[inline]
914#[target_feature(enable = "sse")]
915#[cfg_attr(test, assert_instr(shufps))]
83c7162d 916#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
917pub unsafe fn _mm_set_ps1(a: f32) -> __m128 {
918 _mm_set1_ps(a)
919}
920
921/// Construct a `__m128` from four floating point values highest to lowest.
922///
923/// Note that `a` will be the highest 32 bits of the result, and `d` the
924/// lowest. This matches the standard way of writing bit patterns on x86:
925///
926/// ```text
927/// bit 127 .. 96 95 .. 64 63 .. 32 31 .. 0
928/// +---------+---------+---------+---------+
929/// | a | b | c | d | result
930/// +---------+---------+---------+---------+
931/// ```
932///
933/// Alternatively:
934///
935/// ```text
936/// let v = _mm_set_ps(d, c, b, a);
937/// ```
83c7162d
XL
938///
939/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ps)
0531ce1d
XL
940#[inline]
941#[target_feature(enable = "sse")]
942#[cfg_attr(test, assert_instr(unpcklps))]
83c7162d 943#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
944pub unsafe fn _mm_set_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 {
945 __m128(d, c, b, a)
946}
947
948/// Construct a `__m128` from four floating point values lowest to highest.
949///
950/// This matches the memory order of `__m128`, i.e., `a` will be the lowest 32
951/// bits of the result, and `d` the highest.
952///
953/// ```text
954/// assert_eq!(__m128::new(a, b, c, d), _mm_setr_ps(a, b, c, d));
955/// ```
83c7162d
XL
956///
957/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_ps)
0531ce1d
XL
958#[inline]
959#[target_feature(enable = "sse")]
960#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(unpcklps))]
961// On a 32-bit architecture it just copies the operands from the stack.
962#[cfg_attr(all(test, target_arch = "x86"), assert_instr(movaps))]
83c7162d 963#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
964pub unsafe fn _mm_setr_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 {
965 __m128(a, b, c, d)
966}
967
968/// Construct a `__m128` with all elements initialized to zero.
83c7162d
XL
969///
970/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_ps)
0531ce1d
XL
971#[inline]
972#[target_feature(enable = "sse")]
973#[cfg_attr(test, assert_instr(xorps))]
83c7162d 974#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
975pub unsafe fn _mm_setzero_ps() -> __m128 {
976 __m128(0.0, 0.0, 0.0, 0.0)
977}
978
0bf4aa26
XL
979/// A utility function for creating masks to use with Intel shuffle and
980/// permute intrinsics.
8faf50e0
XL
981#[inline]
982#[allow(non_snake_case)]
416331ca 983#[unstable(feature = "stdarch", issue = "27731")]
0731742a
XL
984pub const fn _MM_SHUFFLE(z: u32, y: u32, x: u32, w: u32) -> i32 {
985 ((z << 6) | (y << 4) | (x << 2) | w) as i32
8faf50e0
XL
986}
987
532ac7d7 988/// Shuffles packed single-precision (32-bit) floating-point elements in `a` and
0531ce1d
XL
989/// `b` using `mask`.
990///
991/// The lower half of result takes values from `a` and the higher half from
992/// `b`. Mask is split to 2 control bits each to index the element from inputs.
83c7162d
XL
993///
994/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_ps)
3dfed10e
XL
995///
996/// Note that there appears to be a mistake within Intel's Intrinsics Guide.
997/// `_mm_shuffle_ps` is supposed to take an `i32` instead of an `u32`
998/// as is the case for [other shuffle intrinsics](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_).
999/// Performing an implicit type conversion between an unsigned integer and a signed integer
1000/// does not cause a problem in C, however Rust's commitment to strong typing does not allow this.
0531ce1d
XL
1001#[inline]
1002#[target_feature(enable = "sse")]
1003#[cfg_attr(test, assert_instr(shufps, mask = 3))]
1004#[rustc_args_required_const(2)]
83c7162d 1005#[stable(feature = "simd_x86", since = "1.27.0")]
48663c56 1006pub unsafe fn _mm_shuffle_ps(a: __m128, b: __m128, mask: i32) -> __m128 {
0531ce1d
XL
1007 let mask = (mask & 0xFF) as u8;
1008
1009 macro_rules! shuffle_done {
1010 ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
1011 simd_shuffle4(a, b, [$x01, $x23, $x45, $x67])
83c7162d 1012 };
0531ce1d
XL
1013 }
1014 macro_rules! shuffle_x67 {
1015 ($x01:expr, $x23:expr, $x45:expr) => {
1016 match (mask >> 6) & 0b11 {
1017 0b00 => shuffle_done!($x01, $x23, $x45, 4),
1018 0b01 => shuffle_done!($x01, $x23, $x45, 5),
1019 0b10 => shuffle_done!($x01, $x23, $x45, 6),
1020 _ => shuffle_done!($x01, $x23, $x45, 7),
1021 }
83c7162d 1022 };
0531ce1d
XL
1023 }
1024 macro_rules! shuffle_x45 {
1025 ($x01:expr, $x23:expr) => {
1026 match (mask >> 4) & 0b11 {
1027 0b00 => shuffle_x67!($x01, $x23, 4),
1028 0b01 => shuffle_x67!($x01, $x23, 5),
1029 0b10 => shuffle_x67!($x01, $x23, 6),
1030 _ => shuffle_x67!($x01, $x23, 7),
1031 }
83c7162d 1032 };
0531ce1d
XL
1033 }
1034 macro_rules! shuffle_x23 {
1035 ($x01:expr) => {
1036 match (mask >> 2) & 0b11 {
1037 0b00 => shuffle_x45!($x01, 0),
1038 0b01 => shuffle_x45!($x01, 1),
1039 0b10 => shuffle_x45!($x01, 2),
1040 _ => shuffle_x45!($x01, 3),
1041 }
83c7162d 1042 };
0531ce1d
XL
1043 }
1044 match mask & 0b11 {
1045 0b00 => shuffle_x23!(0),
1046 0b01 => shuffle_x23!(1),
1047 0b10 => shuffle_x23!(2),
1048 _ => shuffle_x23!(3),
1049 }
1050}
1051
532ac7d7 1052/// Unpacks and interleave single-precision (32-bit) floating-point elements
0531ce1d 1053/// from the higher half of `a` and `b`.
83c7162d
XL
1054///
1055/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_ps)
0531ce1d
XL
1056#[inline]
1057#[target_feature(enable = "sse")]
1058#[cfg_attr(test, assert_instr(unpckhps))]
83c7162d 1059#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1060pub unsafe fn _mm_unpackhi_ps(a: __m128, b: __m128) -> __m128 {
1061 simd_shuffle4(a, b, [2, 6, 3, 7])
1062}
1063
532ac7d7 1064/// Unpacks and interleave single-precision (32-bit) floating-point elements
0531ce1d 1065/// from the lower half of `a` and `b`.
83c7162d
XL
1066///
1067/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_ps)
0531ce1d
XL
1068#[inline]
1069#[target_feature(enable = "sse")]
1070#[cfg_attr(test, assert_instr(unpcklps))]
83c7162d 1071#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1072pub unsafe fn _mm_unpacklo_ps(a: __m128, b: __m128) -> __m128 {
1073 simd_shuffle4(a, b, [0, 4, 1, 5])
1074}
1075
1076/// Combine higher half of `a` and `b`. The highwe half of `b` occupies the
1077/// lower half of result.
83c7162d
XL
1078///
1079/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehl_ps)
0531ce1d
XL
1080#[inline]
1081#[target_feature(enable = "sse")]
0731742a 1082#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movhlps))]
83c7162d 1083#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1084pub unsafe fn _mm_movehl_ps(a: __m128, b: __m128) -> __m128 {
1085 // TODO; figure why this is a different instruction on Windows?
1086 simd_shuffle4(a, b, [6, 7, 2, 3])
1087}
1088
1089/// Combine lower half of `a` and `b`. The lower half of `b` occupies the
1090/// higher half of result.
83c7162d
XL
1091///
1092/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movelh_ps)
0531ce1d
XL
1093#[inline]
1094#[target_feature(enable = "sse")]
0731742a 1095#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movlhps))]
83c7162d 1096#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1097pub unsafe fn _mm_movelh_ps(a: __m128, b: __m128) -> __m128 {
1098 simd_shuffle4(a, b, [0, 1, 4, 5])
1099}
1100
532ac7d7 1101/// Returns a mask of the most significant bit of each element in `a`.
0531ce1d
XL
1102///
1103/// The mask is stored in the 4 least significant bits of the return value.
1104/// All other bits are set to `0`.
83c7162d
XL
1105///
1106/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_ps)
0531ce1d
XL
1107#[inline]
1108#[target_feature(enable = "sse")]
e1599b0c
XL
1109// FIXME: LLVM9 trunk has the following bug:
1110// https://github.com/rust-lang/stdarch/issues/794
1111// so we only temporarily test this on i686 and x86_64 but not on i586:
1112#[cfg_attr(all(test, target_feature = "sse2"), assert_instr(movmskps))]
83c7162d 1113#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1114pub unsafe fn _mm_movemask_ps(a: __m128) -> i32 {
1115 movmskps(a)
1116}
1117
0531ce1d
XL
1118/// Construct a `__m128` with the lowest element read from `p` and the other
1119/// elements set to zero.
1120///
1121/// This corresponds to instructions `VMOVSS` / `MOVSS`.
83c7162d
XL
1122///
1123/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ss)
0531ce1d
XL
1124#[inline]
1125#[target_feature(enable = "sse")]
1126#[cfg_attr(test, assert_instr(movss))]
83c7162d 1127#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1128pub unsafe fn _mm_load_ss(p: *const f32) -> __m128 {
1129 __m128(*p, 0.0, 0.0, 0.0)
1130}
1131
1132/// Construct a `__m128` by duplicating the value read from `p` into all
1133/// elements.
1134///
1135/// This corresponds to instructions `VMOVSS` / `MOVSS` followed by some
1136/// shuffling.
83c7162d
XL
1137///
1138/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_ps)
0531ce1d
XL
1139#[inline]
1140#[target_feature(enable = "sse")]
1141#[cfg_attr(test, assert_instr(movss))]
83c7162d 1142#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1143pub unsafe fn _mm_load1_ps(p: *const f32) -> __m128 {
1144 let a = *p;
1145 __m128(a, a, a, a)
1146}
1147
1148/// Alias for [`_mm_load1_ps`](fn._mm_load1_ps.html)
83c7162d
XL
1149///
1150/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1)
0531ce1d
XL
1151#[inline]
1152#[target_feature(enable = "sse")]
1153#[cfg_attr(test, assert_instr(movss))]
83c7162d 1154#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1155pub unsafe fn _mm_load_ps1(p: *const f32) -> __m128 {
1156 _mm_load1_ps(p)
1157}
1158
532ac7d7 1159/// Loads four `f32` values from *aligned* memory into a `__m128`. If the
0531ce1d
XL
1160/// pointer is not aligned to a 128-bit boundary (16 bytes) a general
1161/// protection fault will be triggered (fatal program crash).
1162///
1163/// Use [`_mm_loadu_ps`](fn._mm_loadu_ps.html) for potentially unaligned
1164/// memory.
1165///
1166/// This corresponds to instructions `VMOVAPS` / `MOVAPS`.
83c7162d
XL
1167///
1168/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps)
0531ce1d
XL
1169#[inline]
1170#[target_feature(enable = "sse")]
1171#[cfg_attr(test, assert_instr(movaps))]
83c7162d 1172#[stable(feature = "simd_x86", since = "1.27.0")]
48663c56 1173#[allow(clippy::cast_ptr_alignment)]
0531ce1d
XL
1174pub unsafe fn _mm_load_ps(p: *const f32) -> __m128 {
1175 *(p as *const __m128)
1176}
1177
532ac7d7 1178/// Loads four `f32` values from memory into a `__m128`. There are no
0531ce1d
XL
1179/// restrictions
1180/// on memory alignment. For aligned memory
1181/// [`_mm_load_ps`](fn._mm_load_ps.html)
1182/// may be faster.
1183///
1184/// This corresponds to instructions `VMOVUPS` / `MOVUPS`.
83c7162d
XL
1185///
1186/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_ps)
0531ce1d
XL
1187#[inline]
1188#[target_feature(enable = "sse")]
1189#[cfg_attr(test, assert_instr(movups))]
83c7162d 1190#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1191pub unsafe fn _mm_loadu_ps(p: *const f32) -> __m128 {
1192 // Note: Using `*p` would require `f32` alignment, but `movups` has no
1193 // alignment restrictions.
1194 let mut dst = _mm_undefined_ps();
1195 ptr::copy_nonoverlapping(
1196 p as *const u8,
1197 &mut dst as *mut __m128 as *mut u8,
1198 mem::size_of::<__m128>(),
1199 );
1200 dst
1201}
1202
532ac7d7 1203/// Loads four `f32` values from aligned memory into a `__m128` in reverse
0531ce1d
XL
1204/// order.
1205///
1206/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
1207/// protection fault will be triggered (fatal program crash).
1208///
1209/// Functionally equivalent to the following code sequence (assuming `p`
1210/// satisfies the alignment restrictions):
1211///
1212/// ```text
1213/// let a0 = *p;
1214/// let a1 = *p.offset(1);
1215/// let a2 = *p.offset(2);
1216/// let a3 = *p.offset(3);
1217/// __m128::new(a3, a2, a1, a0)
1218/// ```
1219///
1220/// This corresponds to instructions `VMOVAPS` / `MOVAPS` followed by some
1221/// shuffling.
83c7162d
XL
1222///
1223/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps)
0531ce1d
XL
1224#[inline]
1225#[target_feature(enable = "sse")]
1226#[cfg_attr(test, assert_instr(movaps))]
83c7162d 1227#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1228pub unsafe fn _mm_loadr_ps(p: *const f32) -> __m128 {
1229 let a = _mm_load_ps(p);
1230 simd_shuffle4(a, a, [3, 2, 1, 0])
1231}
1232
3dfed10e
XL
1233/// Loads unaligned 64-bits of integer data from memory into new vector.
1234///
1235/// `mem_addr` does not need to be aligned on any particular boundary.
1236///
1237/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64)
1238#[inline]
1239#[target_feature(enable = "sse")]
1240#[cfg_attr(all(test, not(target_arch = "x86")), assert_instr(movq))]
1241#[stable(feature = "simd_x86_mm_loadu_si64", since = "1.46.0")]
1242pub unsafe fn _mm_loadu_si64(mem_addr: *const u8) -> __m128i {
1243 transmute(i64x2(0, ptr::read_unaligned(mem_addr as *const i64)))
1244}
1245
532ac7d7 1246/// Stores the lowest 32 bit float of `a` into memory.
0531ce1d
XL
1247///
1248/// This intrinsic corresponds to the `MOVSS` instruction.
83c7162d
XL
1249///
1250/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ss)
0531ce1d
XL
1251#[inline]
1252#[target_feature(enable = "sse")]
1253#[cfg_attr(test, assert_instr(movss))]
83c7162d 1254#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1255pub unsafe fn _mm_store_ss(p: *mut f32, a: __m128) {
1256 *p = simd_extract(a, 0);
1257}
1258
532ac7d7 1259/// Stores the lowest 32 bit float of `a` repeated four times into *aligned*
0531ce1d
XL
1260/// memory.
1261///
1262/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
1263/// protection fault will be triggered (fatal program crash).
1264///
1265/// Functionally equivalent to the following code sequence (assuming `p`
1266/// satisfies the alignment restrictions):
1267///
1268/// ```text
1269/// let x = a.extract(0);
1270/// *p = x;
1271/// *p.offset(1) = x;
1272/// *p.offset(2) = x;
1273/// *p.offset(3) = x;
1274/// ```
83c7162d
XL
1275///
1276/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store1_ps)
0531ce1d
XL
1277#[inline]
1278#[target_feature(enable = "sse")]
1279#[cfg_attr(test, assert_instr(movaps))]
83c7162d 1280#[stable(feature = "simd_x86", since = "1.27.0")]
48663c56 1281#[allow(clippy::cast_ptr_alignment)]
0531ce1d
XL
1282pub unsafe fn _mm_store1_ps(p: *mut f32, a: __m128) {
1283 let b: __m128 = simd_shuffle4(a, a, [0, 0, 0, 0]);
1284 *(p as *mut __m128) = b;
1285}
1286
1287/// Alias for [`_mm_store1_ps`](fn._mm_store1_ps.html)
83c7162d
XL
1288///
1289/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps1)
0531ce1d
XL
1290#[inline]
1291#[target_feature(enable = "sse")]
1292#[cfg_attr(test, assert_instr(movaps))]
83c7162d 1293#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1294pub unsafe fn _mm_store_ps1(p: *mut f32, a: __m128) {
1295 _mm_store1_ps(p, a);
1296}
1297
532ac7d7 1298/// Stores four 32-bit floats into *aligned* memory.
0531ce1d
XL
1299///
1300/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
1301/// protection fault will be triggered (fatal program crash).
1302///
1303/// Use [`_mm_storeu_ps`](fn._mm_storeu_ps.html) for potentially unaligned
1304/// memory.
1305///
1306/// This corresponds to instructions `VMOVAPS` / `MOVAPS`.
83c7162d
XL
1307///
1308/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps)
0531ce1d
XL
1309#[inline]
1310#[target_feature(enable = "sse")]
1311#[cfg_attr(test, assert_instr(movaps))]
83c7162d 1312#[stable(feature = "simd_x86", since = "1.27.0")]
48663c56 1313#[allow(clippy::cast_ptr_alignment)]
0531ce1d
XL
1314pub unsafe fn _mm_store_ps(p: *mut f32, a: __m128) {
1315 *(p as *mut __m128) = a;
1316}
1317
532ac7d7 1318/// Stores four 32-bit floats into memory. There are no restrictions on memory
0531ce1d
XL
1319/// alignment. For aligned memory [`_mm_store_ps`](fn._mm_store_ps.html) may be
1320/// faster.
1321///
1322/// This corresponds to instructions `VMOVUPS` / `MOVUPS`.
83c7162d
XL
1323///
1324/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_ps)
0531ce1d
XL
1325#[inline]
1326#[target_feature(enable = "sse")]
1327#[cfg_attr(test, assert_instr(movups))]
83c7162d 1328#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1329pub unsafe fn _mm_storeu_ps(p: *mut f32, a: __m128) {
1330 ptr::copy_nonoverlapping(
1331 &a as *const __m128 as *const u8,
1332 p as *mut u8,
1333 mem::size_of::<__m128>(),
1334 );
1335}
1336
532ac7d7 1337/// Stores four 32-bit floats into *aligned* memory in reverse order.
0531ce1d
XL
1338///
1339/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
1340/// protection fault will be triggered (fatal program crash).
1341///
1342/// Functionally equivalent to the following code sequence (assuming `p`
1343/// satisfies the alignment restrictions):
1344///
1345/// ```text
1346/// *p = a.extract(3);
1347/// *p.offset(1) = a.extract(2);
1348/// *p.offset(2) = a.extract(1);
1349/// *p.offset(3) = a.extract(0);
1350/// ```
83c7162d
XL
1351///
1352/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_ps)
0531ce1d
XL
1353#[inline]
1354#[target_feature(enable = "sse")]
1355#[cfg_attr(test, assert_instr(movaps))]
83c7162d 1356#[stable(feature = "simd_x86", since = "1.27.0")]
48663c56 1357#[allow(clippy::cast_ptr_alignment)]
0531ce1d
XL
1358pub unsafe fn _mm_storer_ps(p: *mut f32, a: __m128) {
1359 let b: __m128 = simd_shuffle4(a, a, [3, 2, 1, 0]);
1360 *(p as *mut __m128) = b;
1361}
1362
532ac7d7 1363/// Returns a `__m128` with the first component from `b` and the remaining
0531ce1d
XL
1364/// components from `a`.
1365///
1366/// In other words for any `a` and `b`:
1367/// ```text
1368/// _mm_move_ss(a, b) == a.replace(0, b.extract(0))
1369/// ```
83c7162d
XL
1370///
1371/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_ss)
0531ce1d
XL
1372#[inline]
1373#[target_feature(enable = "sse")]
1374#[cfg_attr(test, assert_instr(movss))]
83c7162d 1375#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1376pub unsafe fn _mm_move_ss(a: __m128, b: __m128) -> __m128 {
1377 simd_shuffle4(a, b, [4, 1, 2, 3])
1378}
1379
532ac7d7 1380/// Performs a serializing operation on all store-to-memory instructions that
0531ce1d
XL
1381/// were issued prior to this instruction.
1382///
1383/// Guarantees that every store instruction that precedes, in program order, is
1384/// globally visible before any store instruction which follows the fence in
1385/// program order.
83c7162d
XL
1386///
1387/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sfence)
0531ce1d
XL
1388#[inline]
1389#[target_feature(enable = "sse")]
1390#[cfg_attr(test, assert_instr(sfence))]
83c7162d 1391#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1392pub unsafe fn _mm_sfence() {
1393 sfence()
1394}
1395
532ac7d7 1396/// Gets the unsigned 32-bit value of the MXCSR control and status register.
0531ce1d
XL
1397///
1398/// For more info see [`_mm_setcsr`](fn._mm_setcsr.html)
83c7162d
XL
1399///
1400/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getcsr)
0531ce1d
XL
1401#[inline]
1402#[target_feature(enable = "sse")]
1403#[cfg_attr(test, assert_instr(stmxcsr))]
83c7162d 1404#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1405pub unsafe fn _mm_getcsr() -> u32 {
1406 let mut result = 0_i32;
1407 stmxcsr((&mut result) as *mut _ as *mut i8);
1408 result as u32
1409}
1410
532ac7d7 1411/// Sets the MXCSR register with the 32-bit unsigned integer value.
0531ce1d
XL
1412///
/// This register controls how SIMD instructions handle floating point
1414/// operations. Modifying this register only affects the current thread.
1415///
1416/// It contains several groups of flags:
1417///
1418/// * *Exception flags* report which exceptions occurred since last they were
1419/// reset.
1420///
1421/// * *Masking flags* can be used to mask (ignore) certain exceptions. By
1422/// default
/// these flags are all set to 1, so all exceptions are masked. When
/// an exception is masked, the processor simply sets the exception flag and
1425/// continues the operation. If the exception is unmasked, the flag is also set
1426/// but additionally an exception handler is invoked.
1427///
1428/// * *Rounding mode flags* control the rounding mode of floating point
1429/// instructions.
1430///
1431/// * The *denormals-are-zero mode flag* turns all numbers which would be
1432/// denormalized (exponent bits are all zeros) into zeros.
1433///
1434/// ## Exception Flags
1435///
1436/// * `_MM_EXCEPT_INVALID`: An invalid operation was performed (e.g., dividing
1437/// Infinity by Infinity).
1438///
1439/// * `_MM_EXCEPT_DENORM`: An operation attempted to operate on a denormalized
1440/// number. Mainly this can cause loss of precision.
1441///
/// * `_MM_EXCEPT_DIV_ZERO`: Division by zero occurred.
1443///
/// * `_MM_EXCEPT_OVERFLOW`: A numeric overflow exception occurred, i.e., a
1445/// result was too large to be represented (e.g., an `f32` with absolute
1446/// value
1447/// greater than `2^128`).
1448///
/// * `_MM_EXCEPT_UNDERFLOW`: A numeric underflow exception occurred, i.e., a
1450/// result was too small to be represented in a normalized way (e.g., an
1451/// `f32`
/// with absolute value smaller than `2^-126`.)
1453///
/// * `_MM_EXCEPT_INEXACT`: An inexact-result exception occurred (a.k.a.
1455/// precision exception). This means some precision was lost due to rounding.
1456/// For example, the fraction `1/3` cannot be represented accurately in a
1457/// 32 or 64 bit float and computing it would cause this exception to be
1458/// raised. Precision exceptions are very common, so they are usually masked.
1459///
1460/// Exception flags can be read and set using the convenience functions
1461/// `_MM_GET_EXCEPTION_STATE` and `_MM_SET_EXCEPTION_STATE`. For example, to
1462/// check if an operation caused some overflow:
1463///
1464/// ```rust,ignore
1465/// _MM_SET_EXCEPTION_STATE(0); // clear all exception flags
1466/// // perform calculations
1467/// if _MM_GET_EXCEPTION_STATE() & _MM_EXCEPT_OVERFLOW != 0 {
1468/// // handle overflow
1469/// }
1470/// ```
1471///
1472/// ## Masking Flags
1473///
1474/// There is one masking flag for each exception flag: `_MM_MASK_INVALID`,
1475/// `_MM_MASK_DENORM`, `_MM_MASK_DIV_ZERO`, `_MM_MASK_OVERFLOW`,
1476/// `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`.
1477///
1478/// A single masking bit can be set via
1479///
1480/// ```rust,ignore
1481/// _MM_SET_EXCEPTION_MASK(_MM_MASK_UNDERFLOW);
1482/// ```
1483///
1484/// However, since mask bits are by default all set to 1, it is more common to
1485/// want to *disable* certain bits. For example, to unmask the underflow
1486/// exception, use:
1487///
1488/// ```rust,ignore
1489/// _mm_setcsr(_mm_getcsr() & !_MM_MASK_UNDERFLOW); // unmask underflow
1490/// exception
1491/// ```
1492///
1493/// Warning: an unmasked exception will cause an exception handler to be
1494/// called.
1495/// The standard handler will simply terminate the process. So, in this case
1496/// any underflow exception would terminate the current process with something
1497/// like `signal: 8, SIGFPE: erroneous arithmetic operation`.
1498///
1499/// ## Rounding Mode
1500///
/// The rounding mode is described using two bits. It can be read and set using
1502/// the convenience wrappers `_MM_GET_ROUNDING_MODE()` and
1503/// `_MM_SET_ROUNDING_MODE(mode)`.
1504///
1505/// The rounding modes are:
1506///
1507/// * `_MM_ROUND_NEAREST`: (default) Round to closest to the infinite precision
1508/// value. If two values are equally close, round to even (i.e., least
1509/// significant bit will be zero).
1510///
1511/// * `_MM_ROUND_DOWN`: Round toward negative Infinity.
1512///
1513/// * `_MM_ROUND_UP`: Round toward positive Infinity.
1514///
1515/// * `_MM_ROUND_TOWARD_ZERO`: Round towards zero (truncate).
1516///
1517/// Example:
1518///
1519/// ```rust,ignore
1520/// _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN)
1521/// ```
1522///
1523/// ## Denormals-are-zero/Flush-to-zero Mode
1524///
1525/// If this bit is set, values that would be denormalized will be set to zero
1526/// instead. This is turned off by default.
1527///
1528/// You can read and enable/disable this mode via the helper functions
1529/// `_MM_GET_FLUSH_ZERO_MODE()` and `_MM_SET_FLUSH_ZERO_MODE()`:
1530///
1531/// ```rust,ignore
1532/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF); // turn off (default)
1533/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // turn on
1534/// ```
1535///
83c7162d
XL
1536///
1537/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setcsr)
0531ce1d
XL
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ldmxcsr))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_setcsr(val: u32) {
    // `ldmxcsr` loads MXCSR from memory, so the value is passed by pointer.
    ldmxcsr(&val as *const _ as *const i8);
}
1545
// MXCSR bit-field constants. From the values below: exception *status*
// flags occupy bits 0..=5 (`_MM_EXCEPT_MASK` = 0x003f), exception *mask*
// bits occupy bits 7..=12 (`_MM_MASK_MASK` = 0x1f80), the rounding mode
// occupies bits 13..=14 (`_MM_ROUND_MASK` = 0x6000) and flush-to-zero is
// bit 15 (`_MM_FLUSH_ZERO_MASK` = 0x8000).

/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_INVALID: u32 = 0x0001;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_DENORM: u32 = 0x0002;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_DIV_ZERO: u32 = 0x0004;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_OVERFLOW: u32 = 0x0008;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_UNDERFLOW: u32 = 0x0010;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_INEXACT: u32 = 0x0020;
/// See [`_MM_GET_EXCEPTION_STATE`](fn._MM_GET_EXCEPTION_STATE.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_MASK: u32 = 0x003f;

/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_INVALID: u32 = 0x0080;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_DENORM: u32 = 0x0100;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_DIV_ZERO: u32 = 0x0200;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_OVERFLOW: u32 = 0x0400;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_UNDERFLOW: u32 = 0x0800;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_INEXACT: u32 = 0x1000;
/// See [`_MM_GET_EXCEPTION_MASK`](fn._MM_GET_EXCEPTION_MASK.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_MASK: u32 = 0x1f80;

/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_NEAREST: u32 = 0x0000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_DOWN: u32 = 0x2000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_UP: u32 = 0x4000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_TOWARD_ZERO: u32 = 0x6000;

/// See [`_MM_GET_ROUNDING_MODE`](fn._MM_GET_ROUNDING_MODE.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_MASK: u32 = 0x6000;

/// See [`_MM_GET_FLUSH_ZERO_MODE`](fn._MM_GET_FLUSH_ZERO_MODE.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FLUSH_ZERO_MASK: u32 = 0x8000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FLUSH_ZERO_ON: u32 = 0x8000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FLUSH_ZERO_OFF: u32 = 0x0000;
1616
1617/// See [`_mm_setcsr`](fn._mm_setcsr.html)
83c7162d
XL
1618///
1619/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_EXCEPTION_MASK)
0531ce1d
XL
#[inline]
#[allow(non_snake_case)]
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _MM_GET_EXCEPTION_MASK() -> u32 {
    // Isolate the exception-mask bits of MXCSR (see `_MM_MASK_MASK`).
    _mm_getcsr() & _MM_MASK_MASK
}
1627
1628/// See [`_mm_setcsr`](fn._mm_setcsr.html)
83c7162d
XL
1629///
1630/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_EXCEPTION_STATE)
0531ce1d
XL
#[inline]
#[allow(non_snake_case)]
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _MM_GET_EXCEPTION_STATE() -> u32 {
    // Isolate the exception-status flags of MXCSR (see `_MM_EXCEPT_MASK`).
    _mm_getcsr() & _MM_EXCEPT_MASK
}
1638
1639/// See [`_mm_setcsr`](fn._mm_setcsr.html)
83c7162d
XL
1640///
1641/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_FLUSH_ZERO_MODE)
0531ce1d
XL
#[inline]
#[allow(non_snake_case)]
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _MM_GET_FLUSH_ZERO_MODE() -> u32 {
    // Isolate the flush-to-zero bit of MXCSR (see `_MM_FLUSH_ZERO_MASK`).
    _mm_getcsr() & _MM_FLUSH_ZERO_MASK
}
1649
1650/// See [`_mm_setcsr`](fn._mm_setcsr.html)
83c7162d
XL
1651///
1652/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_ROUNDING_MODE)
0531ce1d
XL
#[inline]
#[allow(non_snake_case)]
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _MM_GET_ROUNDING_MODE() -> u32 {
    // Isolate the rounding-mode bits of MXCSR (see `_MM_ROUND_MASK`).
    _mm_getcsr() & _MM_ROUND_MASK
}
1660
1661/// See [`_mm_setcsr`](fn._mm_setcsr.html)
83c7162d
XL
1662///
1663/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_EXCEPTION_MASK)
0531ce1d
XL
#[inline]
#[allow(non_snake_case)]
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _MM_SET_EXCEPTION_MASK(x: u32) {
    // Read-modify-write MXCSR: clear the current mask bits, then install `x`.
    _mm_setcsr((_mm_getcsr() & !_MM_MASK_MASK) | x)
}
1671
1672/// See [`_mm_setcsr`](fn._mm_setcsr.html)
83c7162d
XL
1673///
1674/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_EXCEPTION_STATE)
0531ce1d
XL
#[inline]
#[allow(non_snake_case)]
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _MM_SET_EXCEPTION_STATE(x: u32) {
    // Read-modify-write MXCSR: clear the status flags, then install `x`.
    _mm_setcsr((_mm_getcsr() & !_MM_EXCEPT_MASK) | x)
}
1682
1683/// See [`_mm_setcsr`](fn._mm_setcsr.html)
83c7162d
XL
1684///
1685/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_FLUSH_ZERO_MODE)
0531ce1d
XL
1686#[inline]
1687#[allow(non_snake_case)]
1688#[target_feature(enable = "sse")]
83c7162d 1689#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1690pub unsafe fn _MM_SET_FLUSH_ZERO_MODE(x: u32) {
1691 let val = (_mm_getcsr() & !_MM_FLUSH_ZERO_MASK) | x;
1692 // println!("setting csr={:x}", val);
1693 _mm_setcsr(val)
1694}
1695
1696/// See [`_mm_setcsr`](fn._mm_setcsr.html)
83c7162d
XL
1697///
1698/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_ROUNDING_MODE)
0531ce1d
XL
1699#[inline]
1700#[allow(non_snake_case)]
1701#[target_feature(enable = "sse")]
83c7162d 1702#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1703pub unsafe fn _MM_SET_ROUNDING_MODE(x: u32) {
1704 _mm_setcsr((_mm_getcsr() & !_MM_ROUND_MASK) | x)
1705}
1706
// Prefetch hints. The value is the `locality` argument ultimately passed to
// `llvm.prefetch` by `_mm_prefetch` (3 = keep in all cache levels,
// 0 = non-temporal).

/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_T0: i32 = 3;

/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_T1: i32 = 2;

/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_T2: i32 = 1;

/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_NTA: i32 = 0;
1722
1723/// Fetch the cache line that contains address `p` using the given `strategy`.
1724///
1725/// The `strategy` must be one of:
1726///
1727/// * [`_MM_HINT_T0`](constant._MM_HINT_T0.html): Fetch into all levels of the
416331ca 1728/// cache hierarchy.
0531ce1d
XL
1729///
1730/// * [`_MM_HINT_T1`](constant._MM_HINT_T1.html): Fetch into L2 and higher.
1731///
83c7162d
XL
1732/// * [`_MM_HINT_T2`](constant._MM_HINT_T2.html): Fetch into L3 and higher or
1733/// an implementation-specific choice (e.g., L2 if there is no L3).
0531ce1d
XL
1734///
1735/// * [`_MM_HINT_NTA`](constant._MM_HINT_NTA.html): Fetch data using the
1736/// non-temporal access (NTA) hint. It may be a place closer than main memory
1737/// but outside of the cache hierarchy. This is used to reduce access latency
1738/// without polluting the cache.
1739///
1740/// The actual implementation depends on the particular CPU. This instruction
1741/// is considered a hint, so the CPU is also free to simply ignore the request.
1742///
83c7162d
XL
1743/// The amount of prefetched data depends on the cache line size of the
1744/// specific CPU, but it will be at least 32 bytes.
0531ce1d
XL
1745///
1746/// Common caveats:
1747///
1748/// * Most modern CPUs already automatically prefetch data based on predicted
1749/// access patterns.
1750///
1751/// * Data is usually not fetched if this would cause a TLB miss or a page
1752/// fault.
1753///
1754/// * Too much prefetching can cause unnecessary cache evictions.
1755///
1756/// * Prefetching may also fail if there are not enough memory-subsystem
1757/// resources (e.g., request buffers).
1758///
83c7162d
XL
1759///
1760/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_prefetch)
0531ce1d
XL
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(prefetcht0, strategy = _MM_HINT_T0))]
#[cfg_attr(test, assert_instr(prefetcht1, strategy = _MM_HINT_T1))]
#[cfg_attr(test, assert_instr(prefetcht2, strategy = _MM_HINT_T2))]
#[cfg_attr(test, assert_instr(prefetchnta, strategy = _MM_HINT_NTA))]
#[rustc_args_required_const(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_prefetch(p: *const i8, strategy: i32) {
    // The `strategy` must be a compile-time constant, so we use a short form
    // of `constify_imm8!` for now.
    // We use the `llvm.prefetch` intrinsic with `rw` = 0 (read), and
    // `cache type` = 1 (data cache). `locality` is based on our `strategy`,
    // which conveniently uses the same numeric encoding (see `_MM_HINT_*`).
    macro_rules! pref {
        ($imm8:expr) => {
            match $imm8 {
                0 => prefetch(p, 0, 0, 1),
                1 => prefetch(p, 0, 1, 1),
                2 => prefetch(p, 0, 2, 1),
                _ => prefetch(p, 0, 3, 1),
            }
        };
    }
    pref!(strategy)
}
1786
532ac7d7 1787/// Returns vector of type __m128 with undefined elements.
83c7162d
XL
1788///
1789/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps)
0531ce1d
XL
#[inline]
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_undefined_ps() -> __m128 {
    // "Undefined" only promises *some* valid vector; returning all zeros
    // avoids ever exposing uninitialized memory to safe code.
    _mm_set1_ps(0.0)
}
1796
1797/// Transpose the 4x4 matrix formed by 4 rows of __m128 in place.
83c7162d
XL
1798///
1799/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_TRANSPOSE4_PS)
0531ce1d
XL
1800#[inline]
1801#[allow(non_snake_case)]
1802#[target_feature(enable = "sse")]
83c7162d 1803#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 1804pub unsafe fn _MM_TRANSPOSE4_PS(
0731742a
XL
1805 row0: &mut __m128,
1806 row1: &mut __m128,
1807 row2: &mut __m128,
1808 row3: &mut __m128,
0531ce1d
XL
1809) {
1810 let tmp0 = _mm_unpacklo_ps(*row0, *row1);
1811 let tmp2 = _mm_unpacklo_ps(*row2, *row3);
1812 let tmp1 = _mm_unpackhi_ps(*row0, *row1);
1813 let tmp3 = _mm_unpackhi_ps(*row2, *row3);
1814
1815 *row0 = _mm_movelh_ps(tmp0, tmp2);
1816 *row1 = _mm_movehl_ps(tmp2, tmp0);
1817 *row2 = _mm_movelh_ps(tmp1, tmp3);
1818 *row3 = _mm_movehl_ps(tmp3, tmp1);
1819}
1820
// Raw LLVM intrinsic bindings that back the safe-ish wrappers above.
// Link names and signatures must match LLVM exactly; do not edit casually.
#[allow(improper_ctypes)]
extern "C" {
    #[link_name = "llvm.x86.sse.add.ss"]
    fn addss(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.sub.ss"]
    fn subss(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.mul.ss"]
    fn mulss(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.div.ss"]
    fn divss(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.sqrt.ss"]
    fn sqrtss(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.sqrt.ps"]
    fn sqrtps(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.rcp.ss"]
    fn rcpss(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.rcp.ps"]
    fn rcpps(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.rsqrt.ss"]
    fn rsqrtss(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.rsqrt.ps"]
    fn rsqrtps(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.min.ss"]
    fn minss(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.min.ps"]
    fn minps(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.max.ss"]
    fn maxss(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.max.ps"]
    fn maxps(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.movmsk.ps"]
    fn movmskps(a: __m128) -> i32;
    #[link_name = "llvm.x86.sse.cmp.ps"]
    fn cmpps(a: __m128, b: __m128, imm8: i8) -> __m128;
    #[link_name = "llvm.x86.sse.comieq.ss"]
    fn comieq_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.comilt.ss"]
    fn comilt_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.comile.ss"]
    fn comile_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.comigt.ss"]
    fn comigt_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.comige.ss"]
    fn comige_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.comineq.ss"]
    fn comineq_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomieq.ss"]
    fn ucomieq_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomilt.ss"]
    fn ucomilt_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomile.ss"]
    fn ucomile_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomigt.ss"]
    fn ucomigt_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomige.ss"]
    fn ucomige_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomineq.ss"]
    fn ucomineq_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.cvtss2si"]
    fn cvtss2si(a: __m128) -> i32;
    #[link_name = "llvm.x86.sse.cvttss2si"]
    fn cvttss2si(a: __m128) -> i32;
    #[link_name = "llvm.x86.sse.cvtsi2ss"]
    fn cvtsi2ss(a: __m128, b: i32) -> __m128;
    #[link_name = "llvm.x86.sse.sfence"]
    fn sfence();
    #[link_name = "llvm.x86.sse.stmxcsr"]
    fn stmxcsr(p: *mut i8);
    #[link_name = "llvm.x86.sse.ldmxcsr"]
    fn ldmxcsr(p: *const i8);
    #[link_name = "llvm.prefetch"]
    fn prefetch(p: *const i8, rw: i32, loc: i32, ty: i32);
    #[link_name = "llvm.x86.sse.cmp.ss"]
    fn cmpss(a: __m128, b: __m128, imm8: i8) -> __m128;
}
1896
1897/// Stores `a` into the memory at `mem_addr` using a non-temporal memory hint.
1898///
1899/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
1900/// exception _may_ be generated.
83c7162d
XL
1901///
1902/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps)
0531ce1d
XL
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movntps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: __m128) {
    // Non-temporal store; the cast to `*mut __m128` relies on the caller
    // providing a 16-byte-aligned `mem_addr` as documented above.
    intrinsics::nontemporal_store(mem_addr as *mut __m128, a);
}
1911
0531ce1d
XL
1912#[cfg(test)]
1913mod tests {
48663c56
XL
1914 use crate::{hint::black_box, mem::transmute};
1915 use std::{boxed, f32::NAN};
416331ca 1916 use stdarch_test::simd_test;
0531ce1d 1917
532ac7d7 1918 use crate::core_arch::{simd::*, x86::*};
0531ce1d 1919
    // Arithmetic intrinsics: the `_ps` forms operate lanewise, the `_ss`
    // forms only touch lane 0 and copy lanes 1..=3 from `a`.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_add_ps() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_add_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(-101.0, 25.0, 0.0, -15.0));
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_add_ss() {
        // Note: `_mm_set_ps` takes lanes in reverse order, so lane 0 here is
        // -10.0 (from `a`) and -5.0 (from `b`); only that lane is summed.
        let a = _mm_set_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_set_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_add_ss(a, b);
        assert_eq_m128(r, _mm_set_ps(-1.0, 5.0, 0.0, -15.0));
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_sub_ps() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_sub_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(99.0, -15.0, 0.0, -5.0));
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_sub_ss() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_sub_ss(a, b);
        assert_eq_m128(r, _mm_setr_ps(99.0, 5.0, 0.0, -10.0));
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_mul_ps() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_mul_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(100.0, 100.0, 0.0, 50.0));
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_mul_ss() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_mul_ss(a, b);
        assert_eq_m128(r, _mm_setr_ps(100.0, 5.0, 0.0, -10.0));
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_div_ps() {
        let a = _mm_setr_ps(-1.0, 5.0, 2.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.2, -5.0);
        let r = _mm_div_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(0.01, 0.25, 10.0, 2.0));
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_div_ss() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_div_ss(a, b);
        assert_eq_m128(r, _mm_setr_ps(0.01, 5.0, 0.0, -10.0));
    }
1983
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_sqrt_ss() {
        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
        let r = _mm_sqrt_ss(a);
        let e = _mm_setr_ps(2.0, 13.0, 16.0, 100.0);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_sqrt_ps() {
        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
        let r = _mm_sqrt_ps(a);
        let e = _mm_setr_ps(2.0, 3.6055512, 4.0, 10.0);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_rcp_ss() {
        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
        let r = _mm_rcp_ss(a);
        // `rcp`/`rsqrt` are approximations; the expected lane values below
        // are what the hardware approximation produces for these inputs.
        let e = _mm_setr_ps(0.24993896, 13.0, 16.0, 100.0);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_rcp_ps() {
        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
        let r = _mm_rcp_ps(a);
        let e = _mm_setr_ps(0.24993896, 0.0769043, 0.06248474, 0.0099983215);
        // Compare with a relative-error tolerance rather than exactly.
        let rel_err = 0.00048828125;
        for i in 0..4 {
            assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err);
        }
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_rsqrt_ss() {
        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
        let r = _mm_rsqrt_ss(a);
        let e = _mm_setr_ps(0.49987793, 13.0, 16.0, 100.0);
        let rel_err = 0.00048828125;
        for i in 0..4 {
            assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err);
        }
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_rsqrt_ps() {
        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
        let r = _mm_rsqrt_ps(a);
        let e = _mm_setr_ps(0.49987793, 0.2772827, 0.24993896, 0.099990845);
        let rel_err = 0.00048828125;
        for i in 0..4 {
            assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err);
        }
    }
2040
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_min_ss() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_min_ss(a, b);
        assert_eq_m128(r, _mm_setr_ps(-100.0, 5.0, 0.0, -10.0));
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_min_ps() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_min_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(-100.0, 5.0, 0.0, -10.0));

        // `_mm_min_ps` can **not** be implemented using the `simd_min` rust
        // intrinsic. `simd_min` is lowered by the llvm codegen backend to the
        // `llvm.minnum.v*` llvm intrinsic. This intrinsic doesn't specify how
        // -0.0 is handled. Unfortunately it happens to behave differently
        // from the `minps` x86 instruction on x86: with `llvm.minnum.v*`,
        // `r1` would equal `a` and `r2` would equal `b`.
        let a = _mm_setr_ps(-0.0, 0.0, 0.0, 0.0);
        let b = _mm_setr_ps(0.0, 0.0, 0.0, 0.0);
        let r1: [u8; 16] = transmute(_mm_min_ps(a, b));
        let r2: [u8; 16] = transmute(_mm_min_ps(b, a));
        let a: [u8; 16] = transmute(a);
        let b: [u8; 16] = transmute(b);
        assert_eq!(r1, b);
        assert_eq!(r2, a);
        assert_ne!(a, b); // sanity check that -0.0 is actually present
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_max_ss() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_max_ss(a, b);
        assert_eq_m128(r, _mm_setr_ps(-1.0, 5.0, 0.0, -10.0));
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_max_ps() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_max_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(-1.0, 20.0, 0.0, -5.0));
    }
2087
    // Bitwise ops are tested through `u32x4` bit patterns since the float
    // lanes are only reinterpreted, not computed with.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_and_ps() {
        let a = transmute(u32x4::splat(0b0011));
        let b = transmute(u32x4::splat(0b0101));
        let r = _mm_and_ps(*black_box(&a), *black_box(&b));
        let e = transmute(u32x4::splat(0b0001));
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_andnot_ps() {
        let a = transmute(u32x4::splat(0b0011));
        let b = transmute(u32x4::splat(0b0101));
        let r = _mm_andnot_ps(*black_box(&a), *black_box(&b));
        let e = transmute(u32x4::splat(0b0100));
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_or_ps() {
        let a = transmute(u32x4::splat(0b0011));
        let b = transmute(u32x4::splat(0b0101));
        let r = _mm_or_ps(*black_box(&a), *black_box(&b));
        let e = transmute(u32x4::splat(0b0111));
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_xor_ps() {
        let a = transmute(u32x4::splat(0b0011));
        let b = transmute(u32x4::splat(0b0101));
        let r = _mm_xor_ps(*black_box(&a), *black_box(&b));
        let e = transmute(u32x4::splat(0b0110));
        assert_eq_m128(r, e);
    }
2123
    // `_cmp*_ss` comparisons: lane 0 becomes an all-ones (`!0u32`) or
    // all-zeros mask; lanes 1..=3 are copied from `a` unchanged.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpeq_ss() {
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(-1.0, 5.0, 6.0, 7.0);
        let r: u32x4 = transmute(_mm_cmpeq_ss(a, b));
        let e: u32x4 = transmute(_mm_setr_ps(transmute(0u32), 2.0, 3.0, 4.0));
        assert_eq!(r, e);

        let b2 = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
        let r2: u32x4 = transmute(_mm_cmpeq_ss(a, b2));
        let e2: u32x4 = transmute(_mm_setr_ps(transmute(0xffffffffu32), 2.0, 3.0, 4.0));
        assert_eq!(r2, e2);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmplt_ss() {
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);

        let b1 = 0u32; // a.extract(0) < b.extract(0)
        let c1 = 0u32; // a.extract(0) < c.extract(0)
        let d1 = !0u32; // a.extract(0) < d.extract(0)

        let rb: u32x4 = transmute(_mm_cmplt_ss(a, b));
        let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
        assert_eq!(rb, eb);

        let rc: u32x4 = transmute(_mm_cmplt_ss(a, c));
        let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
        assert_eq!(rc, ec);

        let rd: u32x4 = transmute(_mm_cmplt_ss(a, d));
        let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
        assert_eq!(rd, ed);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmple_ss() {
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);

        let b1 = 0u32; // a.extract(0) <= b.extract(0)
        let c1 = !0u32; // a.extract(0) <= c.extract(0)
        let d1 = !0u32; // a.extract(0) <= d.extract(0)

        let rb: u32x4 = transmute(_mm_cmple_ss(a, b));
        let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
        assert_eq!(rb, eb);

        let rc: u32x4 = transmute(_mm_cmple_ss(a, c));
        let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
        assert_eq!(rc, ec);

        let rd: u32x4 = transmute(_mm_cmple_ss(a, d));
        let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
        assert_eq!(rd, ed);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpgt_ss() {
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);

        let b1 = !0u32; // a.extract(0) > b.extract(0)
        let c1 = 0u32; // a.extract(0) > c.extract(0)
        let d1 = 0u32; // a.extract(0) > d.extract(0)

        let rb: u32x4 = transmute(_mm_cmpgt_ss(a, b));
        let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
        assert_eq!(rb, eb);

        let rc: u32x4 = transmute(_mm_cmpgt_ss(a, c));
        let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
        assert_eq!(rc, ec);

        let rd: u32x4 = transmute(_mm_cmpgt_ss(a, d));
        let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
        assert_eq!(rd, ed);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpge_ss() {
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);

        let b1 = !0u32; // a.extract(0) >= b.extract(0)
        let c1 = !0u32; // a.extract(0) >= c.extract(0)
        let d1 = 0u32; // a.extract(0) >= d.extract(0)

        let rb: u32x4 = transmute(_mm_cmpge_ss(a, b));
        let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
        assert_eq!(rb, eb);

        let rc: u32x4 = transmute(_mm_cmpge_ss(a, c));
        let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
        assert_eq!(rc, ec);

        let rd: u32x4 = transmute(_mm_cmpge_ss(a, d));
        let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
        assert_eq!(rd, ed);
    }
2233
83c7162d 2234 #[simd_test(enable = "sse")]
0531ce1d
XL
2235 unsafe fn test_mm_cmpneq_ss() {
2236 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2237 let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2238 let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2239 let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2240
2241 let b1 = !0u32; // a.extract(0) != b.extract(0)
2242 let c1 = 0u32; // a.extract(0) != c.extract(0)
2243 let d1 = !0u32; // a.extract(0) != d.extract(0)
2244
2245 let rb: u32x4 = transmute(_mm_cmpneq_ss(a, b));
2246 let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
2247 assert_eq!(rb, eb);
2248
2249 let rc: u32x4 = transmute(_mm_cmpneq_ss(a, c));
2250 let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
2251 assert_eq!(rc, ec);
2252
2253 let rd: u32x4 = transmute(_mm_cmpneq_ss(a, d));
2254 let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
2255 assert_eq!(rd, ed);
2256 }
2257
83c7162d 2258 #[simd_test(enable = "sse")]
0531ce1d 2259 unsafe fn test_mm_cmpnlt_ss() {
532ac7d7 2260 // TODO: this test is exactly the same as for `_mm_cmpge_ss`, but there
0531ce1d
XL
2261 // must be a difference. It may have to do with behavior in the
2262 // presence of NaNs (signaling or quiet). If so, we should add tests
2263 // for those.
2264
2265 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2266 let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2267 let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2268 let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2269
2270 let b1 = !0u32; // a.extract(0) >= b.extract(0)
2271 let c1 = !0u32; // a.extract(0) >= c.extract(0)
2272 let d1 = 0u32; // a.extract(0) >= d.extract(0)
2273
2274 let rb: u32x4 = transmute(_mm_cmpnlt_ss(a, b));
2275 let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
2276 assert_eq!(rb, eb);
2277
2278 let rc: u32x4 = transmute(_mm_cmpnlt_ss(a, c));
2279 let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
2280 assert_eq!(rc, ec);
2281
2282 let rd: u32x4 = transmute(_mm_cmpnlt_ss(a, d));
2283 let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
2284 assert_eq!(rd, ed);
2285 }
2286
83c7162d 2287 #[simd_test(enable = "sse")]
0531ce1d 2288 unsafe fn test_mm_cmpnle_ss() {
532ac7d7 2289 // TODO: this test is exactly the same as for `_mm_cmpgt_ss`, but there
0531ce1d
XL
2290 // must be a difference. It may have to do with behavior in the
2291 // presence
2292 // of NaNs (signaling or quiet). If so, we should add tests for those.
2293
2294 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2295 let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2296 let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2297 let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2298
2299 let b1 = !0u32; // a.extract(0) > b.extract(0)
2300 let c1 = 0u32; // a.extract(0) > c.extract(0)
2301 let d1 = 0u32; // a.extract(0) > d.extract(0)
2302
2303 let rb: u32x4 = transmute(_mm_cmpnle_ss(a, b));
2304 let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
2305 assert_eq!(rb, eb);
2306
2307 let rc: u32x4 = transmute(_mm_cmpnle_ss(a, c));
2308 let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
2309 assert_eq!(rc, ec);
2310
2311 let rd: u32x4 = transmute(_mm_cmpnle_ss(a, d));
2312 let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
2313 assert_eq!(rd, ed);
2314 }
2315
83c7162d 2316 #[simd_test(enable = "sse")]
0531ce1d 2317 unsafe fn test_mm_cmpngt_ss() {
532ac7d7 2318 // TODO: this test is exactly the same as for `_mm_cmple_ss`, but there
0531ce1d
XL
2319 // must be a difference. It may have to do with behavior in the
2320 // presence of NaNs (signaling or quiet). If so, we should add tests
2321 // for those.
2322
2323 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2324 let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2325 let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2326 let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2327
2328 let b1 = 0u32; // a.extract(0) <= b.extract(0)
2329 let c1 = !0u32; // a.extract(0) <= c.extract(0)
2330 let d1 = !0u32; // a.extract(0) <= d.extract(0)
2331
2332 let rb: u32x4 = transmute(_mm_cmpngt_ss(a, b));
2333 let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
2334 assert_eq!(rb, eb);
2335
2336 let rc: u32x4 = transmute(_mm_cmpngt_ss(a, c));
2337 let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
2338 assert_eq!(rc, ec);
2339
2340 let rd: u32x4 = transmute(_mm_cmpngt_ss(a, d));
2341 let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
2342 assert_eq!(rd, ed);
2343 }
2344
83c7162d 2345 #[simd_test(enable = "sse")]
0531ce1d 2346 unsafe fn test_mm_cmpnge_ss() {
532ac7d7 2347 // TODO: this test is exactly the same as for `_mm_cmplt_ss`, but there
0531ce1d
XL
2348 // must be a difference. It may have to do with behavior in the
2349 // presence of NaNs (signaling or quiet). If so, we should add tests
2350 // for those.
2351
2352 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2353 let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2354 let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2355 let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2356
2357 let b1 = 0u32; // a.extract(0) < b.extract(0)
2358 let c1 = 0u32; // a.extract(0) < c.extract(0)
2359 let d1 = !0u32; // a.extract(0) < d.extract(0)
2360
2361 let rb: u32x4 = transmute(_mm_cmpnge_ss(a, b));
2362 let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
2363 assert_eq!(rb, eb);
2364
2365 let rc: u32x4 = transmute(_mm_cmpnge_ss(a, c));
2366 let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
2367 assert_eq!(rc, ec);
2368
2369 let rd: u32x4 = transmute(_mm_cmpnge_ss(a, d));
2370 let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
2371 assert_eq!(rd, ed);
2372 }
2373
83c7162d 2374 #[simd_test(enable = "sse")]
0531ce1d
XL
2375 unsafe fn test_mm_cmpord_ss() {
2376 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2377 let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2378 let c = _mm_setr_ps(NAN, 5.0, 6.0, 7.0);
2379 let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2380
2381 let b1 = !0u32; // a.extract(0) ord b.extract(0)
2382 let c1 = 0u32; // a.extract(0) ord c.extract(0)
2383 let d1 = !0u32; // a.extract(0) ord d.extract(0)
2384
2385 let rb: u32x4 = transmute(_mm_cmpord_ss(a, b));
2386 let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
2387 assert_eq!(rb, eb);
2388
2389 let rc: u32x4 = transmute(_mm_cmpord_ss(a, c));
2390 let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
2391 assert_eq!(rc, ec);
2392
2393 let rd: u32x4 = transmute(_mm_cmpord_ss(a, d));
2394 let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
2395 assert_eq!(rd, ed);
2396 }
2397
83c7162d 2398 #[simd_test(enable = "sse")]
0531ce1d
XL
2399 unsafe fn test_mm_cmpunord_ss() {
2400 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2401 let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2402 let c = _mm_setr_ps(NAN, 5.0, 6.0, 7.0);
2403 let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2404
2405 let b1 = 0u32; // a.extract(0) unord b.extract(0)
2406 let c1 = !0u32; // a.extract(0) unord c.extract(0)
2407 let d1 = 0u32; // a.extract(0) unord d.extract(0)
2408
2409 let rb: u32x4 = transmute(_mm_cmpunord_ss(a, b));
2410 let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
2411 assert_eq!(rb, eb);
2412
2413 let rc: u32x4 = transmute(_mm_cmpunord_ss(a, c));
2414 let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
2415 assert_eq!(rc, ec);
2416
2417 let rd: u32x4 = transmute(_mm_cmpunord_ss(a, d));
2418 let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
2419 assert_eq!(rd, ed);
2420 }
2421
83c7162d 2422 #[simd_test(enable = "sse")]
0531ce1d
XL
2423 unsafe fn test_mm_cmpeq_ps() {
2424 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2425 let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
2426 let tru = !0u32;
2427 let fls = 0u32;
2428
2429 let e = u32x4::new(fls, fls, tru, fls);
2430 let r: u32x4 = transmute(_mm_cmpeq_ps(a, b));
2431 assert_eq!(r, e);
2432 }
2433
83c7162d 2434 #[simd_test(enable = "sse")]
0531ce1d
XL
2435 unsafe fn test_mm_cmplt_ps() {
2436 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2437 let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
2438 let tru = !0u32;
2439 let fls = 0u32;
2440
2441 let e = u32x4::new(tru, fls, fls, fls);
2442 let r: u32x4 = transmute(_mm_cmplt_ps(a, b));
2443 assert_eq!(r, e);
2444 }
2445
83c7162d 2446 #[simd_test(enable = "sse")]
0531ce1d
XL
2447 unsafe fn test_mm_cmple_ps() {
2448 let a = _mm_setr_ps(10.0, 50.0, 1.0, 4.0);
2449 let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
2450 let tru = !0u32;
2451 let fls = 0u32;
2452
2453 let e = u32x4::new(tru, fls, tru, fls);
2454 let r: u32x4 = transmute(_mm_cmple_ps(a, b));
2455 assert_eq!(r, e);
2456 }
2457
83c7162d 2458 #[simd_test(enable = "sse")]
0531ce1d
XL
2459 unsafe fn test_mm_cmpgt_ps() {
2460 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2461 let b = _mm_setr_ps(15.0, 20.0, 1.0, 42.0);
2462 let tru = !0u32;
2463 let fls = 0u32;
2464
2465 let e = u32x4::new(fls, tru, fls, fls);
2466 let r: u32x4 = transmute(_mm_cmpgt_ps(a, b));
2467 assert_eq!(r, e);
2468 }
2469
83c7162d 2470 #[simd_test(enable = "sse")]
0531ce1d
XL
2471 unsafe fn test_mm_cmpge_ps() {
2472 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2473 let b = _mm_setr_ps(15.0, 20.0, 1.0, 42.0);
2474 let tru = !0u32;
2475 let fls = 0u32;
2476
2477 let e = u32x4::new(fls, tru, tru, fls);
2478 let r: u32x4 = transmute(_mm_cmpge_ps(a, b));
2479 assert_eq!(r, e);
2480 }
2481
83c7162d 2482 #[simd_test(enable = "sse")]
0531ce1d
XL
2483 unsafe fn test_mm_cmpneq_ps() {
2484 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2485 let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
2486 let tru = !0u32;
2487 let fls = 0u32;
2488
2489 let e = u32x4::new(tru, tru, fls, tru);
2490 let r: u32x4 = transmute(_mm_cmpneq_ps(a, b));
2491 assert_eq!(r, e);
2492 }
2493
83c7162d 2494 #[simd_test(enable = "sse")]
0531ce1d
XL
2495 unsafe fn test_mm_cmpnlt_ps() {
2496 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2497 let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
2498 let tru = !0u32;
2499 let fls = 0u32;
2500
2501 let e = u32x4::new(fls, tru, tru, tru);
2502 let r: u32x4 = transmute(_mm_cmpnlt_ps(a, b));
2503 assert_eq!(r, e);
2504 }
2505
83c7162d 2506 #[simd_test(enable = "sse")]
0531ce1d
XL
2507 unsafe fn test_mm_cmpnle_ps() {
2508 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2509 let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
2510 let tru = !0u32;
2511 let fls = 0u32;
2512
2513 let e = u32x4::new(fls, tru, fls, tru);
2514 let r: u32x4 = transmute(_mm_cmpnle_ps(a, b));
2515 assert_eq!(r, e);
2516 }
2517
83c7162d 2518 #[simd_test(enable = "sse")]
0531ce1d
XL
2519 unsafe fn test_mm_cmpngt_ps() {
2520 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2521 let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
2522 let tru = !0u32;
2523 let fls = 0u32;
2524
2525 let e = u32x4::new(tru, fls, tru, tru);
2526 let r: u32x4 = transmute(_mm_cmpngt_ps(a, b));
2527 assert_eq!(r, e);
2528 }
2529
83c7162d 2530 #[simd_test(enable = "sse")]
0531ce1d
XL
2531 unsafe fn test_mm_cmpnge_ps() {
2532 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2533 let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
2534 let tru = !0u32;
2535 let fls = 0u32;
2536
2537 let e = u32x4::new(tru, fls, fls, tru);
2538 let r: u32x4 = transmute(_mm_cmpnge_ps(a, b));
2539 assert_eq!(r, e);
2540 }
2541
83c7162d 2542 #[simd_test(enable = "sse")]
0531ce1d
XL
2543 unsafe fn test_mm_cmpord_ps() {
2544 let a = _mm_setr_ps(10.0, 50.0, NAN, NAN);
2545 let b = _mm_setr_ps(15.0, NAN, 1.0, NAN);
2546 let tru = !0u32;
2547 let fls = 0u32;
2548
2549 let e = u32x4::new(tru, fls, fls, fls);
2550 let r: u32x4 = transmute(_mm_cmpord_ps(a, b));
2551 assert_eq!(r, e);
2552 }
2553
83c7162d 2554 #[simd_test(enable = "sse")]
0531ce1d
XL
2555 unsafe fn test_mm_cmpunord_ps() {
2556 let a = _mm_setr_ps(10.0, 50.0, NAN, NAN);
2557 let b = _mm_setr_ps(15.0, NAN, 1.0, NAN);
2558 let tru = !0u32;
2559 let fls = 0u32;
2560
2561 let e = u32x4::new(fls, tru, tru, tru);
2562 let r: u32x4 = transmute(_mm_cmpunord_ps(a, b));
2563 assert_eq!(r, e);
2564 }
2565
83c7162d 2566 #[simd_test(enable = "sse")]
0531ce1d
XL
2567 unsafe fn test_mm_comieq_ss() {
2568 let aa = &[3.0f32, 12.0, 23.0, NAN];
2569 let bb = &[3.0f32, 47.5, 1.5, NAN];
2570
2571 let ee = &[1i32, 0, 0, 0];
2572
2573 for i in 0..4 {
2574 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2575 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2576
2577 let r = _mm_comieq_ss(a, b);
2578
2579 assert_eq!(
2580 ee[i], r,
2581 "_mm_comieq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2582 a, b, r, ee[i], i
2583 );
2584 }
2585 }
2586
83c7162d 2587 #[simd_test(enable = "sse")]
0531ce1d
XL
2588 unsafe fn test_mm_comilt_ss() {
2589 let aa = &[3.0f32, 12.0, 23.0, NAN];
2590 let bb = &[3.0f32, 47.5, 1.5, NAN];
2591
2592 let ee = &[0i32, 1, 0, 0];
2593
2594 for i in 0..4 {
2595 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2596 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2597
2598 let r = _mm_comilt_ss(a, b);
2599
2600 assert_eq!(
2601 ee[i], r,
2602 "_mm_comilt_ss({:?}, {:?}) = {}, expected: {} (i={})",
2603 a, b, r, ee[i], i
2604 );
2605 }
2606 }
2607
83c7162d 2608 #[simd_test(enable = "sse")]
0531ce1d
XL
2609 unsafe fn test_mm_comile_ss() {
2610 let aa = &[3.0f32, 12.0, 23.0, NAN];
2611 let bb = &[3.0f32, 47.5, 1.5, NAN];
2612
2613 let ee = &[1i32, 1, 0, 0];
2614
2615 for i in 0..4 {
2616 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2617 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2618
2619 let r = _mm_comile_ss(a, b);
2620
2621 assert_eq!(
2622 ee[i], r,
2623 "_mm_comile_ss({:?}, {:?}) = {}, expected: {} (i={})",
2624 a, b, r, ee[i], i
2625 );
2626 }
2627 }
2628
83c7162d 2629 #[simd_test(enable = "sse")]
0531ce1d
XL
2630 unsafe fn test_mm_comigt_ss() {
2631 let aa = &[3.0f32, 12.0, 23.0, NAN];
2632 let bb = &[3.0f32, 47.5, 1.5, NAN];
2633
2634 let ee = &[1i32, 0, 1, 0];
2635
2636 for i in 0..4 {
2637 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2638 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2639
2640 let r = _mm_comige_ss(a, b);
2641
2642 assert_eq!(
2643 ee[i], r,
2644 "_mm_comige_ss({:?}, {:?}) = {}, expected: {} (i={})",
2645 a, b, r, ee[i], i
2646 );
2647 }
2648 }
2649
83c7162d 2650 #[simd_test(enable = "sse")]
0531ce1d
XL
2651 unsafe fn test_mm_comineq_ss() {
2652 let aa = &[3.0f32, 12.0, 23.0, NAN];
2653 let bb = &[3.0f32, 47.5, 1.5, NAN];
2654
2655 let ee = &[0i32, 1, 1, 1];
2656
2657 for i in 0..4 {
2658 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2659 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2660
2661 let r = _mm_comineq_ss(a, b);
2662
2663 assert_eq!(
2664 ee[i], r,
2665 "_mm_comineq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2666 a, b, r, ee[i], i
2667 );
2668 }
2669 }
2670
83c7162d 2671 #[simd_test(enable = "sse")]
0531ce1d
XL
2672 unsafe fn test_mm_ucomieq_ss() {
2673 let aa = &[3.0f32, 12.0, 23.0, NAN];
2674 let bb = &[3.0f32, 47.5, 1.5, NAN];
2675
2676 let ee = &[1i32, 0, 0, 0];
2677
2678 for i in 0..4 {
2679 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2680 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2681
2682 let r = _mm_ucomieq_ss(a, b);
2683
2684 assert_eq!(
2685 ee[i], r,
2686 "_mm_ucomieq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2687 a, b, r, ee[i], i
2688 );
2689 }
2690 }
2691
83c7162d 2692 #[simd_test(enable = "sse")]
0531ce1d
XL
2693 unsafe fn test_mm_ucomilt_ss() {
2694 let aa = &[3.0f32, 12.0, 23.0, NAN];
2695 let bb = &[3.0f32, 47.5, 1.5, NAN];
2696
2697 let ee = &[0i32, 1, 0, 0];
2698
2699 for i in 0..4 {
2700 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2701 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2702
2703 let r = _mm_ucomilt_ss(a, b);
2704
2705 assert_eq!(
2706 ee[i], r,
2707 "_mm_ucomilt_ss({:?}, {:?}) = {}, expected: {} (i={})",
2708 a, b, r, ee[i], i
2709 );
2710 }
2711 }
2712
83c7162d 2713 #[simd_test(enable = "sse")]
0531ce1d
XL
2714 unsafe fn test_mm_ucomile_ss() {
2715 let aa = &[3.0f32, 12.0, 23.0, NAN];
2716 let bb = &[3.0f32, 47.5, 1.5, NAN];
2717
2718 let ee = &[1i32, 1, 0, 0];
2719
2720 for i in 0..4 {
2721 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2722 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2723
2724 let r = _mm_ucomile_ss(a, b);
2725
2726 assert_eq!(
2727 ee[i], r,
2728 "_mm_ucomile_ss({:?}, {:?}) = {}, expected: {} (i={})",
2729 a, b, r, ee[i], i
2730 );
2731 }
2732 }
2733
83c7162d 2734 #[simd_test(enable = "sse")]
0531ce1d
XL
2735 unsafe fn test_mm_ucomigt_ss() {
2736 let aa = &[3.0f32, 12.0, 23.0, NAN];
2737 let bb = &[3.0f32, 47.5, 1.5, NAN];
2738
2739 let ee = &[0i32, 0, 1, 0];
2740
2741 for i in 0..4 {
2742 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2743 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2744
2745 let r = _mm_ucomigt_ss(a, b);
2746
2747 assert_eq!(
2748 ee[i], r,
2749 "_mm_ucomigt_ss({:?}, {:?}) = {}, expected: {} (i={})",
2750 a, b, r, ee[i], i
2751 );
2752 }
2753 }
2754
83c7162d 2755 #[simd_test(enable = "sse")]
0531ce1d
XL
2756 unsafe fn test_mm_ucomige_ss() {
2757 let aa = &[3.0f32, 12.0, 23.0, NAN];
2758 let bb = &[3.0f32, 47.5, 1.5, NAN];
2759
2760 let ee = &[1i32, 0, 1, 0];
2761
2762 for i in 0..4 {
2763 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2764 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2765
2766 let r = _mm_ucomige_ss(a, b);
2767
2768 assert_eq!(
2769 ee[i], r,
2770 "_mm_ucomige_ss({:?}, {:?}) = {}, expected: {} (i={})",
2771 a, b, r, ee[i], i
2772 );
2773 }
2774 }
2775
83c7162d 2776 #[simd_test(enable = "sse")]
0531ce1d
XL
2777 unsafe fn test_mm_ucomineq_ss() {
2778 let aa = &[3.0f32, 12.0, 23.0, NAN];
2779 let bb = &[3.0f32, 47.5, 1.5, NAN];
2780
2781 let ee = &[0i32, 1, 1, 1];
2782
2783 for i in 0..4 {
2784 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2785 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2786
2787 let r = _mm_ucomineq_ss(a, b);
2788
2789 assert_eq!(
2790 ee[i], r,
2791 "_mm_ucomineq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2792 a, b, r, ee[i], i
2793 );
2794 }
2795 }
2796
83c7162d 2797 #[simd_test(enable = "sse")]
0531ce1d
XL
2798 unsafe fn test_mm_comieq_ss_vs_ucomieq_ss() {
2799 // If one of the arguments is a quiet NaN `comieq_ss` should signal an
2800 // Invalid Operation Exception while `ucomieq_ss` should not.
2801 let aa = &[3.0f32, NAN, 23.0, NAN];
2802 let bb = &[3.0f32, 47.5, NAN, NAN];
2803
2804 let ee = &[1i32, 0, 0, 0];
2805 let exc = &[0u32, 1, 1, 1]; // Should comieq_ss signal an exception?
2806
2807 for i in 0..4 {
2808 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2809 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2810
2811 _MM_SET_EXCEPTION_STATE(0);
2812 let r1 = _mm_comieq_ss(*black_box(&a), b);
2813 let s1 = _MM_GET_EXCEPTION_STATE();
2814
2815 _MM_SET_EXCEPTION_STATE(0);
2816 let r2 = _mm_ucomieq_ss(*black_box(&a), b);
2817 let s2 = _MM_GET_EXCEPTION_STATE();
2818
2819 assert_eq!(
2820 ee[i], r1,
2821 "_mm_comeq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2822 a, b, r1, ee[i], i
2823 );
2824 assert_eq!(
2825 ee[i], r2,
2826 "_mm_ucomeq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2827 a, b, r2, ee[i], i
2828 );
2829 assert_eq!(
2830 s1,
2831 exc[i] * _MM_EXCEPT_INVALID,
2832 "_mm_comieq_ss() set exception flags: {} (i={})",
2833 s1,
2834 i
2835 );
2836 assert_eq!(
2837 s2,
2838 0, // ucomieq_ss should not signal an exception
2839 "_mm_ucomieq_ss() set exception flags: {} (i={})",
2840 s2,
2841 i
2842 );
2843 }
2844 }
2845
83c7162d 2846 #[simd_test(enable = "sse")]
0531ce1d 2847 unsafe fn test_mm_cvtss_si32() {
8faf50e0 2848 let inputs = &[42.0f32, -3.1, 4.0e10, 4.0e-20, NAN, 2147483500.1];
ba9703b0 2849 let result = &[42i32, -3, i32::MIN, 0, i32::MIN, 2147483520];
0531ce1d
XL
2850 for i in 0..inputs.len() {
2851 let x = _mm_setr_ps(inputs[i], 1.0, 3.0, 4.0);
2852 let e = result[i];
2853 let r = _mm_cvtss_si32(x);
2854 assert_eq!(
2855 e, r,
2856 "TestCase #{} _mm_cvtss_si32({:?}) = {}, expected: {}",
2857 i, x, r, e
2858 );
2859 }
2860 }
2861
83c7162d 2862 #[simd_test(enable = "sse")]
0531ce1d
XL
2863 unsafe fn test_mm_cvttss_si32() {
2864 let inputs = &[
2865 (42.0f32, 42i32),
2866 (-31.4, -31),
2867 (-33.5, -33),
2868 (-34.5, -34),
2869 (10.999, 10),
2870 (-5.99, -5),
ba9703b0 2871 (4.0e10, i32::MIN),
0531ce1d 2872 (4.0e-10, 0),
ba9703b0 2873 (NAN, i32::MIN),
0531ce1d
XL
2874 (2147483500.1, 2147483520),
2875 ];
2876 for i in 0..inputs.len() {
2877 let (xi, e) = inputs[i];
2878 let x = _mm_setr_ps(xi, 1.0, 3.0, 4.0);
2879 let r = _mm_cvttss_si32(x);
2880 assert_eq!(
2881 e, r,
2882 "TestCase #{} _mm_cvttss_si32({:?}) = {}, expected: {}",
2883 i, x, r, e
2884 );
2885 }
2886 }
2887
83c7162d 2888 #[simd_test(enable = "sse")]
e1599b0c 2889 unsafe fn test_mm_cvtsi32_ss() {
0531ce1d
XL
2890 let inputs = &[
2891 (4555i32, 4555.0f32),
2892 (322223333, 322223330.0),
2893 (-432, -432.0),
2894 (-322223333, -322223330.0),
2895 ];
2896
2897 for i in 0..inputs.len() {
2898 let (x, f) = inputs[i];
2899 let a = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
2900 let r = _mm_cvtsi32_ss(a, x);
2901 let e = _mm_setr_ps(f, 6.0, 7.0, 8.0);
2902 assert_eq_m128(e, r);
2903 }
2904 }
2905
83c7162d 2906 #[simd_test(enable = "sse")]
e1599b0c 2907 unsafe fn test_mm_cvtss_f32() {
0531ce1d
XL
2908 let a = _mm_setr_ps(312.0134, 5.0, 6.0, 7.0);
2909 assert_eq!(_mm_cvtss_f32(a), 312.0134);
2910 }
2911
83c7162d 2912 #[simd_test(enable = "sse")]
0531ce1d
XL
2913 unsafe fn test_mm_set_ss() {
2914 let r = _mm_set_ss(black_box(4.25));
2915 assert_eq_m128(r, _mm_setr_ps(4.25, 0.0, 0.0, 0.0));
2916 }
2917
83c7162d 2918 #[simd_test(enable = "sse")]
0531ce1d
XL
2919 unsafe fn test_mm_set1_ps() {
2920 let r1 = _mm_set1_ps(black_box(4.25));
2921 let r2 = _mm_set_ps1(black_box(4.25));
2922 assert_eq!(get_m128(r1, 0), 4.25);
2923 assert_eq!(get_m128(r1, 1), 4.25);
2924 assert_eq!(get_m128(r1, 2), 4.25);
2925 assert_eq!(get_m128(r1, 3), 4.25);
2926 assert_eq!(get_m128(r2, 0), 4.25);
2927 assert_eq!(get_m128(r2, 1), 4.25);
2928 assert_eq!(get_m128(r2, 2), 4.25);
2929 assert_eq!(get_m128(r2, 3), 4.25);
2930 }
2931
83c7162d 2932 #[simd_test(enable = "sse")]
0531ce1d
XL
2933 unsafe fn test_mm_set_ps() {
2934 let r = _mm_set_ps(
2935 black_box(1.0),
2936 black_box(2.0),
2937 black_box(3.0),
2938 black_box(4.0),
2939 );
2940 assert_eq!(get_m128(r, 0), 4.0);
2941 assert_eq!(get_m128(r, 1), 3.0);
2942 assert_eq!(get_m128(r, 2), 2.0);
2943 assert_eq!(get_m128(r, 3), 1.0);
2944 }
2945
83c7162d 2946 #[simd_test(enable = "sse")]
0531ce1d
XL
2947 unsafe fn test_mm_setr_ps() {
2948 let r = _mm_setr_ps(
2949 black_box(1.0),
2950 black_box(2.0),
2951 black_box(3.0),
2952 black_box(4.0),
2953 );
2954 assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 3.0, 4.0));
2955 }
2956
83c7162d 2957 #[simd_test(enable = "sse")]
0531ce1d
XL
2958 unsafe fn test_mm_setzero_ps() {
2959 let r = *black_box(&_mm_setzero_ps());
2960 assert_eq_m128(r, _mm_set1_ps(0.0));
2961 }
2962
8faf50e0
XL
2963 #[simd_test(enable = "sse")]
2964 unsafe fn test_mm_shuffle() {
2965 assert_eq!(_MM_SHUFFLE(0, 1, 1, 3), 0b00_01_01_11);
2966 assert_eq!(_MM_SHUFFLE(3, 1, 1, 0), 0b11_01_01_00);
2967 assert_eq!(_MM_SHUFFLE(1, 2, 2, 1), 0b01_10_10_01);
2968 }
2969
83c7162d 2970 #[simd_test(enable = "sse")]
0531ce1d
XL
2971 unsafe fn test_mm_shuffle_ps() {
2972 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2973 let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
2974 let r = _mm_shuffle_ps(a, b, 0b00_01_01_11);
2975 assert_eq_m128(r, _mm_setr_ps(4.0, 2.0, 6.0, 5.0));
2976 }
2977
83c7162d 2978 #[simd_test(enable = "sse")]
0531ce1d
XL
2979 unsafe fn test_mm_unpackhi_ps() {
2980 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2981 let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
2982 let r = _mm_unpackhi_ps(a, b);
2983 assert_eq_m128(r, _mm_setr_ps(3.0, 7.0, 4.0, 8.0));
2984 }
2985
83c7162d 2986 #[simd_test(enable = "sse")]
0531ce1d
XL
2987 unsafe fn test_mm_unpacklo_ps() {
2988 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2989 let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
2990 let r = _mm_unpacklo_ps(a, b);
2991 assert_eq_m128(r, _mm_setr_ps(1.0, 5.0, 2.0, 6.0));
2992 }
2993
83c7162d 2994 #[simd_test(enable = "sse")]
0531ce1d
XL
2995 unsafe fn test_mm_movehl_ps() {
2996 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2997 let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
2998 let r = _mm_movehl_ps(a, b);
2999 assert_eq_m128(r, _mm_setr_ps(7.0, 8.0, 3.0, 4.0));
3000 }
3001
83c7162d 3002 #[simd_test(enable = "sse")]
0531ce1d
XL
3003 unsafe fn test_mm_movelh_ps() {
3004 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3005 let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3006 let r = _mm_movelh_ps(a, b);
3007 assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 5.0, 6.0));
3008 }
3009
83c7162d 3010 #[simd_test(enable = "sse")]
0531ce1d
XL
3011 unsafe fn test_mm_load_ss() {
3012 let a = 42.0f32;
3013 let r = _mm_load_ss(&a as *const f32);
3014 assert_eq_m128(r, _mm_setr_ps(42.0, 0.0, 0.0, 0.0));
3015 }
3016
83c7162d 3017 #[simd_test(enable = "sse")]
0531ce1d
XL
3018 unsafe fn test_mm_load1_ps() {
3019 let a = 42.0f32;
3020 let r = _mm_load1_ps(&a as *const f32);
3021 assert_eq_m128(r, _mm_setr_ps(42.0, 42.0, 42.0, 42.0));
3022 }
3023
83c7162d 3024 #[simd_test(enable = "sse")]
0531ce1d
XL
3025 unsafe fn test_mm_load_ps() {
3026 let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
3027
3028 let mut p = vals.as_ptr();
3029 let mut fixup = 0.0f32;
3030
3031 // Make sure p is aligned, otherwise we might get a
3032 // (signal: 11, SIGSEGV: invalid memory reference)
3033
3034 let unalignment = (p as usize) & 0xf;
3035 if unalignment != 0 {
3036 let delta = ((16 - unalignment) >> 2) as isize;
3037 fixup = delta as f32;
3038 p = p.offset(delta);
3039 }
3040
3041 let r = _mm_load_ps(p);
0731742a 3042 let e = _mm_add_ps(_mm_setr_ps(1.0, 2.0, 3.0, 4.0), _mm_set1_ps(fixup));
0531ce1d
XL
3043 assert_eq_m128(r, e);
3044 }
3045
83c7162d 3046 #[simd_test(enable = "sse")]
0531ce1d
XL
3047 unsafe fn test_mm_loadu_ps() {
3048 let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
3049 let p = vals.as_ptr().offset(3);
3050 let r = _mm_loadu_ps(black_box(p));
3051 assert_eq_m128(r, _mm_setr_ps(4.0, 5.0, 6.0, 7.0));
3052 }
3053
83c7162d 3054 #[simd_test(enable = "sse")]
0531ce1d
XL
3055 unsafe fn test_mm_loadr_ps() {
3056 let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
3057
3058 let mut p = vals.as_ptr();
3059 let mut fixup = 0.0f32;
3060
3061 // Make sure p is aligned, otherwise we might get a
3062 // (signal: 11, SIGSEGV: invalid memory reference)
3063
3064 let unalignment = (p as usize) & 0xf;
3065 if unalignment != 0 {
3066 let delta = ((16 - unalignment) >> 2) as isize;
3067 fixup = delta as f32;
3068 p = p.offset(delta);
3069 }
3070
3071 let r = _mm_loadr_ps(p);
0731742a 3072 let e = _mm_add_ps(_mm_setr_ps(4.0, 3.0, 2.0, 1.0), _mm_set1_ps(fixup));
0531ce1d
XL
3073 assert_eq_m128(r, e);
3074 }
3075
3dfed10e
XL
3076 #[simd_test(enable = "sse2")]
3077 unsafe fn test_mm_loadu_si64() {
3078 let a = _mm_setr_epi64x(5, 6);
3079 let r = _mm_loadu_si64(&a as *const _ as *const _);
3080 assert_eq_m128i(r, _mm_set_epi64x(5, 0));
3081 }
3082
83c7162d 3083 #[simd_test(enable = "sse")]
0531ce1d
XL
3084 unsafe fn test_mm_store_ss() {
3085 let mut vals = [0.0f32; 8];
3086 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3087 _mm_store_ss(vals.as_mut_ptr().offset(1), a);
3088
3089 assert_eq!(vals[0], 0.0);
3090 assert_eq!(vals[1], 1.0);
3091 assert_eq!(vals[2], 0.0);
3092 }
3093
83c7162d 3094 #[simd_test(enable = "sse")]
0531ce1d
XL
3095 unsafe fn test_mm_store1_ps() {
3096 let mut vals = [0.0f32; 8];
3097 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3098
3099 let mut ofs = 0;
3100 let mut p = vals.as_mut_ptr();
3101
3102 if (p as usize) & 0xf != 0 {
3103 ofs = (16 - (p as usize) & 0xf) >> 2;
3104 p = p.offset(ofs as isize);
3105 }
3106
3107 _mm_store1_ps(p, *black_box(&a));
3108
3109 if ofs > 0 {
3110 assert_eq!(vals[ofs - 1], 0.0);
3111 }
3112 assert_eq!(vals[ofs + 0], 1.0);
3113 assert_eq!(vals[ofs + 1], 1.0);
3114 assert_eq!(vals[ofs + 2], 1.0);
3115 assert_eq!(vals[ofs + 3], 1.0);
3116 assert_eq!(vals[ofs + 4], 0.0);
3117 }
3118
83c7162d 3119 #[simd_test(enable = "sse")]
0531ce1d
XL
3120 unsafe fn test_mm_store_ps() {
3121 let mut vals = [0.0f32; 8];
3122 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3123
3124 let mut ofs = 0;
3125 let mut p = vals.as_mut_ptr();
3126
3127 // Align p to 16-byte boundary
3128 if (p as usize) & 0xf != 0 {
3129 ofs = (16 - (p as usize) & 0xf) >> 2;
3130 p = p.offset(ofs as isize);
3131 }
3132
3133 _mm_store_ps(p, *black_box(&a));
3134
3135 if ofs > 0 {
3136 assert_eq!(vals[ofs - 1], 0.0);
3137 }
3138 assert_eq!(vals[ofs + 0], 1.0);
3139 assert_eq!(vals[ofs + 1], 2.0);
3140 assert_eq!(vals[ofs + 2], 3.0);
3141 assert_eq!(vals[ofs + 3], 4.0);
3142 assert_eq!(vals[ofs + 4], 0.0);
3143 }
3144
83c7162d 3145 #[simd_test(enable = "sse")]
0531ce1d
XL
3146 unsafe fn test_mm_storer_ps() {
3147 let mut vals = [0.0f32; 8];
3148 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3149
3150 let mut ofs = 0;
3151 let mut p = vals.as_mut_ptr();
3152
3153 // Align p to 16-byte boundary
3154 if (p as usize) & 0xf != 0 {
3155 ofs = (16 - (p as usize) & 0xf) >> 2;
3156 p = p.offset(ofs as isize);
3157 }
3158
3159 _mm_storer_ps(p, *black_box(&a));
3160
3161 if ofs > 0 {
3162 assert_eq!(vals[ofs - 1], 0.0);
3163 }
3164 assert_eq!(vals[ofs + 0], 4.0);
3165 assert_eq!(vals[ofs + 1], 3.0);
3166 assert_eq!(vals[ofs + 2], 2.0);
3167 assert_eq!(vals[ofs + 3], 1.0);
3168 assert_eq!(vals[ofs + 4], 0.0);
3169 }
3170
83c7162d 3171 #[simd_test(enable = "sse")]
0531ce1d
XL
3172 unsafe fn test_mm_storeu_ps() {
3173 let mut vals = [0.0f32; 8];
3174 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3175
3176 let mut ofs = 0;
3177 let mut p = vals.as_mut_ptr();
3178
532ac7d7 3179 // Make sure p is **not** aligned to 16-byte boundary
0531ce1d
XL
3180 if (p as usize) & 0xf == 0 {
3181 ofs = 1;
3182 p = p.offset(1);
3183 }
3184
3185 _mm_storeu_ps(p, *black_box(&a));
3186
3187 if ofs > 0 {
3188 assert_eq!(vals[ofs - 1], 0.0);
3189 }
3190 assert_eq!(vals[ofs + 0], 1.0);
3191 assert_eq!(vals[ofs + 1], 2.0);
3192 assert_eq!(vals[ofs + 2], 3.0);
3193 assert_eq!(vals[ofs + 3], 4.0);
3194 assert_eq!(vals[ofs + 4], 0.0);
3195 }
3196
83c7162d 3197 #[simd_test(enable = "sse")]
0531ce1d
XL
3198 unsafe fn test_mm_move_ss() {
3199 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3200 let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3201
3202 let r = _mm_move_ss(a, b);
3203 let e = _mm_setr_ps(5.0, 2.0, 3.0, 4.0);
3204 assert_eq_m128(e, r);
3205 }
3206
83c7162d 3207 #[simd_test(enable = "sse")]
0531ce1d
XL
3208 unsafe fn test_mm_movemask_ps() {
3209 let r = _mm_movemask_ps(_mm_setr_ps(-1.0, 5.0, -5.0, 0.0));
3210 assert_eq!(r, 0b0101);
3211
3212 let r = _mm_movemask_ps(_mm_setr_ps(-1.0, -5.0, -5.0, 0.0));
3213 assert_eq!(r, 0b0111);
3214 }
3215
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_sfence() {
        // `_mm_sfence` produces no value to assert on; this is a smoke test
        // that the fence instruction can be emitted and executed without
        // faulting on the target.
        _mm_sfence();
    }
3220
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_getcsr_setcsr_1() {
        // With flush-to-zero enabled in MXCSR, a multiplication whose exact
        // result is a denormal f32 must be flushed to 0.0.
        let saved_csr = _mm_getcsr();

        let a = _mm_setr_ps(1.1e-36, 0.0, 0.0, 1.0);
        let b = _mm_setr_ps(0.001, 0.0, 0.0, 1.0);

        _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
        let r = _mm_mul_ps(*black_box(&a), *black_box(&b));

        // Restore the saved MXCSR before asserting so the mode change
        // cannot leak into other tests.
        _mm_setcsr(saved_csr);

        let exp = _mm_setr_ps(0.0, 0.0, 0.0, 1.0);
        assert_eq_m128(r, exp); // first component is a denormalized f32
    }
3236
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_getcsr_setcsr_2() {
        // Same as _mm_setcsr_1 test, but with opposite flag value.
        // With flush-to-zero disabled, the denormal product must be kept
        // (1.1e-39) instead of flushed to 0.0.

        let saved_csr = _mm_getcsr();

        let a = _mm_setr_ps(1.1e-36, 0.0, 0.0, 1.0);
        let b = _mm_setr_ps(0.001, 0.0, 0.0, 1.0);

        _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF);
        let r = _mm_mul_ps(*black_box(&a), *black_box(&b));

        // Restore the saved MXCSR before asserting so the mode change
        // cannot leak into other tests.
        _mm_setcsr(saved_csr);

        let exp = _mm_setr_ps(1.1e-39, 0.0, 0.0, 1.0);
        assert_eq_m128(r, exp); // first component is a denormalized f32
    }
3254
83c7162d 3255 #[simd_test(enable = "sse")]
0531ce1d
XL
3256 unsafe fn test_mm_getcsr_setcsr_underflow() {
3257 _MM_SET_EXCEPTION_STATE(0);
3258
3259 let a = _mm_setr_ps(1.1e-36, 0.0, 0.0, 1.0);
3260 let b = _mm_setr_ps(1e-5, 0.0, 0.0, 1.0);
3261
3262 assert_eq!(_MM_GET_EXCEPTION_STATE(), 0); // just to be sure
3263
3264 let r = _mm_mul_ps(*black_box(&a), *black_box(&b));
3265
3266 let exp = _mm_setr_ps(1.1e-41, 0.0, 0.0, 1.0);
3267 assert_eq_m128(r, exp);
3268
3269 let underflow = _MM_GET_EXCEPTION_STATE() & _MM_EXCEPT_UNDERFLOW != 0;
3270 assert_eq!(underflow, true);
3271 }
3272
83c7162d 3273 #[simd_test(enable = "sse")]
0531ce1d
XL
3274 unsafe fn test_MM_TRANSPOSE4_PS() {
3275 let mut a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3276 let mut b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3277 let mut c = _mm_setr_ps(9.0, 10.0, 11.0, 12.0);
3278 let mut d = _mm_setr_ps(13.0, 14.0, 15.0, 16.0);
3279
3280 _MM_TRANSPOSE4_PS(&mut a, &mut b, &mut c, &mut d);
3281
3282 assert_eq_m128(a, _mm_setr_ps(1.0, 5.0, 9.0, 13.0));
3283 assert_eq_m128(b, _mm_setr_ps(2.0, 6.0, 10.0, 14.0));
3284 assert_eq_m128(c, _mm_setr_ps(3.0, 7.0, 11.0, 15.0));
3285 assert_eq_m128(d, _mm_setr_ps(4.0, 8.0, 12.0, 16.0));
3286 }
3287
    // 16-byte-aligned backing store for the `_mm_stream_ps` test below; the
    // non-temporal store requires an aligned destination, and `repr(align)`
    // guarantees it without runtime pointer fix-ups.
    #[repr(align(16))]
    struct Memory {
        pub data: [f32; 4],
    }
3292
83c7162d 3293 #[simd_test(enable = "sse")]
0531ce1d
XL
3294 unsafe fn test_mm_stream_ps() {
3295 let a = _mm_set1_ps(7.0);
8faf50e0 3296 let mut mem = Memory { data: [-1.0; 4] };
0531ce1d
XL
3297
3298 _mm_stream_ps(&mut mem.data[0] as *mut f32, a);
3299 for i in 0..4 {
3300 assert_eq!(mem.data[i], get_m128(a, i));
3301 }
3302 }
0531ce1d 3303}