]> git.proxmox.com Git - rustc.git/blame - library/stdarch/crates/core_arch/src/x86/sse.rs
New upstream version 1.59.0+dfsg1
[rustc.git] / library / stdarch / crates / core_arch / src / x86 / sse.rs
CommitLineData
0531ce1d
XL
1//! Streaming SIMD Extensions (SSE)
2
532ac7d7
XL
3use crate::{
4 core_arch::{simd::*, simd_llvm::*, x86::*},
5 intrinsics, mem, ptr,
6};
0531ce1d
XL
7
8#[cfg(test)]
416331ca 9use stdarch_test::assert_instr;
0531ce1d
XL
/// Adds the first component of `a` and `b`, the other components are copied
/// from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(addss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_add_ss(a: __m128, b: __m128) -> __m128 {
    addss(a, b)
}

/// Adds packed single-precision (32-bit) floating-point elements in `a` and
/// `b`, lane-wise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(addps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_add_ps(a: __m128, b: __m128) -> __m128 {
    simd_add(a, b)
}

/// Subtracts the first component of `b` from `a`, the other components are
/// copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(subss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sub_ss(a: __m128, b: __m128) -> __m128 {
    subss(a, b)
}

/// Subtracts packed single-precision (32-bit) floating-point elements in `b`
/// from `a`, lane-wise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(subps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sub_ps(a: __m128, b: __m128) -> __m128 {
    simd_sub(a, b)
}

/// Multiplies the first component of `a` and `b`, the other components are
/// copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(mulss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mul_ss(a: __m128, b: __m128) -> __m128 {
    mulss(a, b)
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, lane-wise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(mulps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mul_ps(a: __m128, b: __m128) -> __m128 {
    simd_mul(a, b)
}

/// Divides the first component of `a` by `b`, the other components are
/// copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(divss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_div_ss(a: __m128, b: __m128) -> __m128 {
    divss(a, b)
}

/// Divides packed single-precision (32-bit) floating-point elements in `a` by
/// the corresponding elements in `b`, lane-wise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(divps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_div_ps(a: __m128, b: __m128) -> __m128 {
    simd_div(a, b)
}
/// Returns the square root of the first single-precision (32-bit)
/// floating-point element in `a`, the other elements are unchanged.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(sqrtss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sqrt_ss(a: __m128) -> __m128 {
    sqrtss(a)
}

/// Returns the square root of packed single-precision (32-bit) floating-point
/// elements in `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(sqrtps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sqrt_ps(a: __m128) -> __m128 {
    sqrtps(a)
}

/// Returns the approximate reciprocal of the first single-precision
/// (32-bit) floating-point element in `a`, the other elements are unchanged.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rcpss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_rcp_ss(a: __m128) -> __m128 {
    rcpss(a)
}

/// Returns the approximate reciprocal of packed single-precision (32-bit)
/// floating-point elements in `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rcpps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_rcp_ps(a: __m128) -> __m128 {
    rcpps(a)
}

/// Returns the approximate reciprocal square root of the first single-precision
/// (32-bit) floating-point element in `a`, the other elements are unchanged.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rsqrtss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_rsqrt_ss(a: __m128) -> __m128 {
    rsqrtss(a)
}

/// Returns the approximate reciprocal square root of packed single-precision
/// (32-bit) floating-point elements in `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rsqrtps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_rsqrt_ps(a: __m128) -> __m128 {
    rsqrtps(a)
}
/// Compares the first single-precision (32-bit) floating-point element of `a`
/// and `b`, and return the minimum value in the first element of the return
/// value, the other elements are copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(minss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_ss(a: __m128, b: __m128) -> __m128 {
    minss(a, b)
}

/// Compares packed single-precision (32-bit) floating-point elements in `a` and
/// `b`, and return the corresponding minimum values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(minps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_ps(a: __m128, b: __m128) -> __m128 {
    // The x86 `minps` instruction is not commutative: when an operand is NaN
    // it returns the second source operand, which differs from `simd_fmin`'s
    // NaN semantics, so we must lower to the raw intrinsic.
    // See the `test_mm_min_ps` test why this can't be implemented using `simd_fmin`.
    minps(a, b)
}

/// Compares the first single-precision (32-bit) floating-point element of `a`
/// and `b`, and return the maximum value in the first element of the return
/// value, the other elements are copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(maxss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_ss(a: __m128, b: __m128) -> __m128 {
    maxss(a, b)
}

/// Compares packed single-precision (32-bit) floating-point elements in `a` and
/// `b`, and return the corresponding maximum values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(maxps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_ps(a: __m128, b: __m128) -> __m128 {
    // Like `minps` above, `maxps` returns its second operand on NaN input,
    // so `simd_fmax` cannot be used here.
    // See the `test_mm_min_ps` test why this can't be implemented using `simd_fmax`.
    maxps(a, b)
}
/// Bitwise AND of packed single-precision (32-bit) floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_ps)
#[inline]
#[target_feature(enable = "sse")]
// i586 only seems to generate plain `and` instructions, so ignore it.
#[cfg_attr(
    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
    assert_instr(andps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_and_ps(a: __m128, b: __m128) -> __m128 {
    // Bitwise ops are only defined on the integer vector type, so round-trip
    // through `__m128i`; the transmutes are bit-pattern-preserving.
    let a: __m128i = mem::transmute(a);
    let b: __m128i = mem::transmute(b);
    mem::transmute(simd_and(a, b))
}

/// Bitwise AND-NOT of packed single-precision (32-bit) floating-point
/// elements.
///
/// Computes `!a & b` for each bit in `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_ps)
#[inline]
#[target_feature(enable = "sse")]
// i586 only seems to generate plain `not` and `and` instructions, so ignore
// it.
#[cfg_attr(
    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
    assert_instr(andnps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_andnot_ps(a: __m128, b: __m128) -> __m128 {
    let a: __m128i = mem::transmute(a);
    let b: __m128i = mem::transmute(b);
    // `!a` is expressed as `a ^ !0` since there is no vector NOT primitive.
    let mask: __m128i = mem::transmute(i32x4::splat(-1));
    mem::transmute(simd_and(simd_xor(mask, a), b))
}

/// Bitwise OR of packed single-precision (32-bit) floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_or_ps)
#[inline]
#[target_feature(enable = "sse")]
// i586 only seems to generate plain `or` instructions, so we ignore it.
#[cfg_attr(
    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
    assert_instr(orps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_or_ps(a: __m128, b: __m128) -> __m128 {
    let a: __m128i = mem::transmute(a);
    let b: __m128i = mem::transmute(b);
    mem::transmute(simd_or(a, b))
}

/// Bitwise exclusive OR of packed single-precision (32-bit) floating-point
/// elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_ps)
#[inline]
#[target_feature(enable = "sse")]
// i586 only seems to generate plain `xor` instructions, so we ignore it.
#[cfg_attr(
    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
    assert_instr(xorps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_xor_ps(a: __m128, b: __m128) -> __m128 {
    let a: __m128i = mem::transmute(a);
    let b: __m128i = mem::transmute(b);
    mem::transmute(simd_xor(a, b))
}
/// Compares the lowest `f32` of both inputs for equality. The lowest 32 bits of
/// the result will be `0xffffffff` if the two inputs are equal, or `0`
/// otherwise. The upper 96 bits of the result are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpeqss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpeq_ss(a: __m128, b: __m128) -> __m128 {
    // Predicate 0 = EQ (see the CMPSS immediate encoding).
    cmpss(a, b, 0)
}

/// Compares the lowest `f32` of both inputs for less than. The lowest 32 bits
/// of the result will be `0xffffffff` if `a.extract(0)` is less than
/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
/// upper 96 bits of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpltss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmplt_ss(a: __m128, b: __m128) -> __m128 {
    // Predicate 1 = LT.
    cmpss(a, b, 1)
}

/// Compares the lowest `f32` of both inputs for less than or equal. The lowest
/// 32 bits of the result will be `0xffffffff` if `a.extract(0)` is less than
/// or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result
/// are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpless))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmple_ss(a: __m128, b: __m128) -> __m128 {
    // Predicate 2 = LE.
    cmpss(a, b, 2)
}

/// Compares the lowest `f32` of both inputs for greater than. The lowest 32
/// bits of the result will be `0xffffffff` if `a.extract(0)` is greater
/// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result
/// are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_ss)
#[inline]
#[target_feature(enable = "sse")]
// CMPSS has no GT predicate; the compiler emits a reversed CMPLTSS instead.
#[cfg_attr(test, assert_instr(cmpltss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpgt_ss(a: __m128, b: __m128) -> __m128 {
    // `a > b` is computed as `b < a` (swapped operands, predicate 1 = LT);
    // the shuffle takes the comparison result's lane 0 (index 4 selects lane
    // 0 of the second vector) and keeps lanes 1..3 of `a`.
    simd_shuffle4!(a, cmpss(b, a, 1), [4, 1, 2, 3])
}

/// Compares the lowest `f32` of both inputs for greater than or equal. The
/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is
/// greater than or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits
/// of the result are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_ss)
#[inline]
#[target_feature(enable = "sse")]
// CMPSS has no GE predicate; the compiler emits a reversed CMPLESS instead.
#[cfg_attr(test, assert_instr(cmpless))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpge_ss(a: __m128, b: __m128) -> __m128 {
    // `a >= b` is computed as `b <= a` (swapped operands, predicate 2 = LE),
    // then the low lane is merged back into `a`'s upper lanes.
    simd_shuffle4!(a, cmpss(b, a, 2), [4, 1, 2, 3])
}

/// Compares the lowest `f32` of both inputs for inequality. The lowest 32 bits
/// of the result will be `0xffffffff` if `a.extract(0)` is not equal to
/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
/// upper 96 bits of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpneqss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpneq_ss(a: __m128, b: __m128) -> __m128 {
    // Predicate 4 = NEQ.
    cmpss(a, b, 4)
}

/// Compares the lowest `f32` of both inputs for not-less-than. The lowest 32
/// bits of the result will be `0xffffffff` if `a.extract(0)` is not less than
/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
/// upper 96 bits of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnltss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpnlt_ss(a: __m128, b: __m128) -> __m128 {
    // Predicate 5 = NLT.
    cmpss(a, b, 5)
}

/// Compares the lowest `f32` of both inputs for not-less-than-or-equal. The
/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not
/// less than or equal to `b.extract(0)`, or `0` otherwise. The upper 96 bits
/// of the result are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnless))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpnle_ss(a: __m128, b: __m128) -> __m128 {
    // Predicate 6 = NLE.
    cmpss(a, b, 6)
}

/// Compares the lowest `f32` of both inputs for not-greater-than. The lowest 32
/// bits of the result will be `0xffffffff` if `a.extract(0)` is not greater
/// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are
/// the upper 96 bits of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpngt_ss)
#[inline]
#[target_feature(enable = "sse")]
// Implemented as a reversed CMPNLTSS (see the operand swap below).
#[cfg_attr(test, assert_instr(cmpnltss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpngt_ss(a: __m128, b: __m128) -> __m128 {
    // `!(a > b)` is `!(b < a)` (swapped operands, predicate 5 = NLT); the
    // shuffle merges the comparison's lane 0 with `a`'s upper lanes.
    simd_shuffle4!(a, cmpss(b, a, 5), [4, 1, 2, 3])
}

/// Compares the lowest `f32` of both inputs for not-greater-than-or-equal. The
/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not
/// greater than or equal to `b.extract(0)`, or `0` otherwise. The upper 96
/// bits of the result are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_ss)
#[inline]
#[target_feature(enable = "sse")]
// Implemented as a reversed CMPNLESS (see the operand swap below).
#[cfg_attr(test, assert_instr(cmpnless))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpnge_ss(a: __m128, b: __m128) -> __m128 {
    // `!(a >= b)` is `!(b <= a)` (swapped operands, predicate 6 = NLE).
    simd_shuffle4!(a, cmpss(b, a, 6), [4, 1, 2, 3])
}

/// Checks if the lowest `f32` of both inputs are ordered. The lowest 32 bits of
/// the result will be `0xffffffff` if neither of `a.extract(0)` or
/// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result
/// are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpordss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpord_ss(a: __m128, b: __m128) -> __m128 {
    // Predicate 7 = ORD.
    cmpss(a, b, 7)
}

/// Checks if the lowest `f32` of both inputs are unordered. The lowest 32 bits
/// of the result will be `0xffffffff` if any of `a.extract(0)` or
/// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result
/// are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpunordss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpunord_ss(a: __m128, b: __m128) -> __m128 {
    // Predicate 3 = UNORD.
    cmpss(a, b, 3)
}
/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input elements
/// were equal, or `0` otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpeqps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpeq_ps(a: __m128, b: __m128) -> __m128 {
    // Predicate 0 = EQ (see the CMPPS immediate encoding).
    cmpps(a, b, 0)
}

/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is less than the corresponding element in `b`, or `0` otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpltps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmplt_ps(a: __m128, b: __m128) -> __m128 {
    // Predicate 1 = LT.
    cmpps(a, b, 1)
}

/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is less than or equal to the corresponding element in `b`, or `0`
/// otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpleps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmple_ps(a: __m128, b: __m128) -> __m128 {
    // Predicate 2 = LE.
    cmpps(a, b, 2)
}

/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is greater than the corresponding element in `b`, or `0` otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_ps)
#[inline]
#[target_feature(enable = "sse")]
// CMPPS has no GT predicate; a reversed CMPLTPS is emitted instead.
#[cfg_attr(test, assert_instr(cmpltps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpgt_ps(a: __m128, b: __m128) -> __m128 {
    // `a > b` is computed as `b < a` (swapped operands, predicate 1 = LT).
    cmpps(b, a, 1)
}

/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is greater than or equal to the corresponding element in `b`, or `0`
/// otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_ps)
#[inline]
#[target_feature(enable = "sse")]
// CMPPS has no GE predicate; a reversed CMPLEPS is emitted instead.
#[cfg_attr(test, assert_instr(cmpleps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpge_ps(a: __m128, b: __m128) -> __m128 {
    // `a >= b` is computed as `b <= a` (swapped operands, predicate 2 = LE).
    cmpps(b, a, 2)
}

/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input elements
/// are **not** equal, or `0` otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpneqps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpneq_ps(a: __m128, b: __m128) -> __m128 {
    // Predicate 4 = NEQ.
    cmpps(a, b, 4)
}

/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is **not** less than the corresponding element in `b`, or `0`
/// otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnltps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpnlt_ps(a: __m128, b: __m128) -> __m128 {
    // Predicate 5 = NLT.
    cmpps(a, b, 5)
}

/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is **not** less than or equal to the corresponding element in `b`, or
/// `0` otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnleps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpnle_ps(a: __m128, b: __m128) -> __m128 {
    // Predicate 6 = NLE.
    cmpps(a, b, 6)
}

/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is **not** greater than the corresponding element in `b`, or `0`
/// otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpngt_ps)
#[inline]
#[target_feature(enable = "sse")]
// Implemented as a reversed CMPNLTPS (see the operand swap below).
#[cfg_attr(test, assert_instr(cmpnltps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpngt_ps(a: __m128, b: __m128) -> __m128 {
    // `!(a > b)` is `!(b < a)` (swapped operands, predicate 5 = NLT).
    cmpps(b, a, 5)
}

/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is **not** greater than or equal to the corresponding element in `b`,
/// or `0` otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_ps)
#[inline]
#[target_feature(enable = "sse")]
// Implemented as a reversed CMPNLEPS (see the operand swap below).
#[cfg_attr(test, assert_instr(cmpnleps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpnge_ps(a: __m128, b: __m128) -> __m128 {
    // `!(a >= b)` is `!(b <= a)` (swapped operands, predicate 6 = NLE).
    cmpps(b, a, 6)
}

/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// Returns four floats that have one of two possible bit patterns. The element
/// in the output vector will be `0xffffffff` if the input elements in `a` and
/// `b` are ordered (i.e., neither of them is a NaN), or 0 otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpordps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpord_ps(a: __m128, b: __m128) -> __m128 {
    // Predicate 7 = ORD; the ordered test is symmetric, so the operand order
    // does not affect the result.
    cmpps(b, a, 7)
}

/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// Returns four floats that have one of two possible bit patterns. The element
/// in the output vector will be `0xffffffff` if the input elements in `a` and
/// `b` are unordered (i.e., at least one of them is a NaN), or 0 otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpunordps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpunord_ps(a: __m128, b: __m128) -> __m128 {
    // Predicate 3 = UNORD; symmetric, so the operand order is irrelevant.
    cmpps(b, a, 3)
}
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if they are equal, or `0` otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comieq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_comieq_ss(a: __m128, b: __m128) -> i32 {
    comieq_ss(a, b)
}

/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is less than the one from `b`, or `0` otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comilt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_comilt_ss(a: __m128, b: __m128) -> i32 {
    comilt_ss(a, b)
}

/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is less than or equal to the one from `b`, or `0`
/// otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comile_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_comile_ss(a: __m128, b: __m128) -> i32 {
    comile_ss(a, b)
}

/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is greater than the one from `b`, or `0`
/// otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comigt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_comigt_ss(a: __m128, b: __m128) -> i32 {
    comigt_ss(a, b)
}

/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is greater than or equal to the one from `b`, or
/// `0` otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comige_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_comige_ss(a: __m128, b: __m128) -> i32 {
    comige_ss(a, b)
}

/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if they are **not** equal, or `0` otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comineq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_comineq_ss(a: __m128, b: __m128) -> i32 {
    comineq_ss(a, b)
}
706
532ac7d7 707/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
0531ce1d
XL
708/// `1` if they are equal, or `0` otherwise. This instruction will not signal
709/// an exception if either argument is a quiet NaN.
83c7162d
XL
710///
711/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomieq_ss)
0531ce1d
XL
712#[inline]
713#[target_feature(enable = "sse")]
714#[cfg_attr(test, assert_instr(ucomiss))]
83c7162d 715#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
716pub unsafe fn _mm_ucomieq_ss(a: __m128, b: __m128) -> i32 {
717 ucomieq_ss(a, b)
718}
719
532ac7d7 720/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
0531ce1d
XL
721/// `1` if the value from `a` is less than the one from `b`, or `0` otherwise.
722/// This instruction will not signal an exception if either argument is a quiet
723/// NaN.
83c7162d
XL
724///
725/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomilt_ss)
0531ce1d
XL
726#[inline]
727#[target_feature(enable = "sse")]
728#[cfg_attr(test, assert_instr(ucomiss))]
83c7162d 729#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
730pub unsafe fn _mm_ucomilt_ss(a: __m128, b: __m128) -> i32 {
731 ucomilt_ss(a, b)
732}
733
532ac7d7 734/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
0531ce1d
XL
735/// `1` if the value from `a` is less than or equal to the one from `b`, or `0`
736/// otherwise. This instruction will not signal an exception if either argument
737/// is a quiet NaN.
83c7162d
XL
738///
739/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomile_ss)
0531ce1d
XL
740#[inline]
741#[target_feature(enable = "sse")]
742#[cfg_attr(test, assert_instr(ucomiss))]
83c7162d 743#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
744pub unsafe fn _mm_ucomile_ss(a: __m128, b: __m128) -> i32 {
745 ucomile_ss(a, b)
746}
747
532ac7d7 748/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
0531ce1d
XL
749/// `1` if the value from `a` is greater than the one from `b`, or `0`
750/// otherwise. This instruction will not signal an exception if either argument
751/// is a quiet NaN.
83c7162d
XL
752///
753/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomigt_ss)
0531ce1d
XL
754#[inline]
755#[target_feature(enable = "sse")]
756#[cfg_attr(test, assert_instr(ucomiss))]
83c7162d 757#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
758pub unsafe fn _mm_ucomigt_ss(a: __m128, b: __m128) -> i32 {
759 ucomigt_ss(a, b)
760}
761
532ac7d7 762/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
0531ce1d
XL
763/// `1` if the value from `a` is greater than or equal to the one from `b`, or
764/// `0` otherwise. This instruction will not signal an exception if either
765/// argument is a quiet NaN.
83c7162d
XL
766///
767/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomige_ss)
0531ce1d
XL
768#[inline]
769#[target_feature(enable = "sse")]
770#[cfg_attr(test, assert_instr(ucomiss))]
83c7162d 771#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
772pub unsafe fn _mm_ucomige_ss(a: __m128, b: __m128) -> i32 {
773 ucomige_ss(a, b)
774}
775
532ac7d7
XL
776/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
777/// `1` if they are **not** equal, or `0` otherwise. This instruction will not
0531ce1d 778/// signal an exception if either argument is a quiet NaN.
83c7162d
XL
779///
780/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomineq_ss)
0531ce1d
XL
781#[inline]
782#[target_feature(enable = "sse")]
783#[cfg_attr(test, assert_instr(ucomiss))]
83c7162d 784#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
785pub unsafe fn _mm_ucomineq_ss(a: __m128, b: __m128) -> i32 {
786 ucomineq_ss(a, b)
787}
788
532ac7d7 789/// Converts the lowest 32 bit float in the input vector to a 32 bit integer.
0531ce1d
XL
790///
791/// The result is rounded according to the current rounding mode. If the result
792/// cannot be represented as a 32 bit integer the result will be `0x8000_0000`
ba9703b0 793/// (`i32::MIN`) or an invalid operation floating point exception if
0531ce1d
XL
794/// unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)).
795///
796/// This corresponds to the `CVTSS2SI` instruction (with 32 bit output).
83c7162d
XL
797///
798/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si32)
0531ce1d
XL
799#[inline]
800#[target_feature(enable = "sse")]
801#[cfg_attr(test, assert_instr(cvtss2si))]
83c7162d 802#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
803pub unsafe fn _mm_cvtss_si32(a: __m128) -> i32 {
804 cvtss2si(a)
805}
806
807/// Alias for [`_mm_cvtss_si32`](fn._mm_cvtss_si32.html).
83c7162d
XL
808///
809/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si)
0531ce1d
XL
810#[inline]
811#[target_feature(enable = "sse")]
812#[cfg_attr(test, assert_instr(cvtss2si))]
83c7162d 813#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
814pub unsafe fn _mm_cvt_ss2si(a: __m128) -> i32 {
815 _mm_cvtss_si32(a)
816}
817
532ac7d7 818/// Converts the lowest 32 bit float in the input vector to a 32 bit integer
0531ce1d
XL
819/// with
820/// truncation.
821///
822/// The result is rounded always using truncation (round towards zero). If the
823/// result cannot be represented as a 32 bit integer the result will be
ba9703b0 824/// `0x8000_0000` (`i32::MIN`) or an invalid operation floating point
0531ce1d
XL
825/// exception if unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)).
826///
827/// This corresponds to the `CVTTSS2SI` instruction (with 32 bit output).
83c7162d
XL
828///
829/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si32)
0531ce1d
XL
830#[inline]
831#[target_feature(enable = "sse")]
832#[cfg_attr(test, assert_instr(cvttss2si))]
83c7162d 833#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
834pub unsafe fn _mm_cvttss_si32(a: __m128) -> i32 {
835 cvttss2si(a)
836}
837
838/// Alias for [`_mm_cvttss_si32`](fn._mm_cvttss_si32.html).
83c7162d
XL
839///
840/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ss2si)
0531ce1d
XL
841#[inline]
842#[target_feature(enable = "sse")]
843#[cfg_attr(test, assert_instr(cvttss2si))]
83c7162d 844#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
845pub unsafe fn _mm_cvtt_ss2si(a: __m128) -> i32 {
846 _mm_cvttss_si32(a)
847}
848
532ac7d7 849/// Extracts the lowest 32 bit float from the input vector.
83c7162d
XL
850///
851/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32)
0531ce1d
XL
852#[inline]
853#[target_feature(enable = "sse")]
854// No point in using assert_instrs. In Unix x86_64 calling convention this is a
855// no-op, and on Windows it's just a `mov`.
83c7162d 856#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
857pub unsafe fn _mm_cvtss_f32(a: __m128) -> f32 {
858 simd_extract(a, 0)
859}
860
532ac7d7 861/// Converts a 32 bit integer to a 32 bit float. The result vector is the input
0531ce1d
XL
862/// vector `a` with the lowest 32 bit float replaced by the converted integer.
863///
864/// This intrinsic corresponds to the `CVTSI2SS` instruction (with 32 bit
865/// input).
83c7162d
XL
866///
867/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_ss)
0531ce1d
XL
868#[inline]
869#[target_feature(enable = "sse")]
870#[cfg_attr(test, assert_instr(cvtsi2ss))]
83c7162d 871#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
872pub unsafe fn _mm_cvtsi32_ss(a: __m128, b: i32) -> __m128 {
873 cvtsi2ss(a, b)
874}
875
876/// Alias for [`_mm_cvtsi32_ss`](fn._mm_cvtsi32_ss.html).
83c7162d
XL
877///
878/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss)
0531ce1d
XL
879#[inline]
880#[target_feature(enable = "sse")]
881#[cfg_attr(test, assert_instr(cvtsi2ss))]
83c7162d 882#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
883pub unsafe fn _mm_cvt_si2ss(a: __m128, b: i32) -> __m128 {
884 _mm_cvtsi32_ss(a, b)
885}
886
887/// Construct a `__m128` with the lowest element set to `a` and the rest set to
888/// zero.
83c7162d
XL
889///
890/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss)
0531ce1d
XL
891#[inline]
892#[target_feature(enable = "sse")]
893#[cfg_attr(test, assert_instr(movss))]
83c7162d 894#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
895pub unsafe fn _mm_set_ss(a: f32) -> __m128 {
896 __m128(a, 0.0, 0.0, 0.0)
897}
898
899/// Construct a `__m128` with all element set to `a`.
83c7162d
XL
900///
901/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_ps)
0531ce1d
XL
902#[inline]
903#[target_feature(enable = "sse")]
904#[cfg_attr(test, assert_instr(shufps))]
83c7162d 905#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
906pub unsafe fn _mm_set1_ps(a: f32) -> __m128 {
907 __m128(a, a, a, a)
908}
909
910/// Alias for [`_mm_set1_ps`](fn._mm_set1_ps.html)
83c7162d
XL
911///
912/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ps1)
0531ce1d
XL
913#[inline]
914#[target_feature(enable = "sse")]
915#[cfg_attr(test, assert_instr(shufps))]
83c7162d 916#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
917pub unsafe fn _mm_set_ps1(a: f32) -> __m128 {
918 _mm_set1_ps(a)
919}
920
921/// Construct a `__m128` from four floating point values highest to lowest.
922///
923/// Note that `a` will be the highest 32 bits of the result, and `d` the
924/// lowest. This matches the standard way of writing bit patterns on x86:
925///
926/// ```text
927/// bit 127 .. 96 95 .. 64 63 .. 32 31 .. 0
928/// +---------+---------+---------+---------+
929/// | a | b | c | d | result
930/// +---------+---------+---------+---------+
931/// ```
932///
933/// Alternatively:
934///
935/// ```text
936/// let v = _mm_set_ps(d, c, b, a);
937/// ```
83c7162d
XL
938///
939/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ps)
0531ce1d
XL
940#[inline]
941#[target_feature(enable = "sse")]
942#[cfg_attr(test, assert_instr(unpcklps))]
83c7162d 943#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
944pub unsafe fn _mm_set_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 {
945 __m128(d, c, b, a)
946}
947
948/// Construct a `__m128` from four floating point values lowest to highest.
949///
950/// This matches the memory order of `__m128`, i.e., `a` will be the lowest 32
951/// bits of the result, and `d` the highest.
952///
953/// ```text
954/// assert_eq!(__m128::new(a, b, c, d), _mm_setr_ps(a, b, c, d));
955/// ```
83c7162d
XL
956///
957/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_ps)
0531ce1d
XL
958#[inline]
959#[target_feature(enable = "sse")]
fc512014
XL
960#[cfg_attr(
961 all(test, any(target_os = "windows", target_arch = "x86_64")),
962 assert_instr(unpcklps)
963)]
964// On a 32-bit architecture on non-Windows it just copies the operands from the stack.
965#[cfg_attr(
966 all(test, all(not(target_os = "windows"), target_arch = "x86")),
967 assert_instr(movaps)
968)]
83c7162d 969#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
970pub unsafe fn _mm_setr_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 {
971 __m128(a, b, c, d)
972}
973
974/// Construct a `__m128` with all elements initialized to zero.
83c7162d
XL
975///
976/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_ps)
0531ce1d
XL
977#[inline]
978#[target_feature(enable = "sse")]
979#[cfg_attr(test, assert_instr(xorps))]
83c7162d 980#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
981pub unsafe fn _mm_setzero_ps() -> __m128 {
982 __m128(0.0, 0.0, 0.0, 0.0)
983}
984
0bf4aa26
XL
985/// A utility function for creating masks to use with Intel shuffle and
986/// permute intrinsics.
8faf50e0
XL
987#[inline]
988#[allow(non_snake_case)]
416331ca 989#[unstable(feature = "stdarch", issue = "27731")]
0731742a
XL
990pub const fn _MM_SHUFFLE(z: u32, y: u32, x: u32, w: u32) -> i32 {
991 ((z << 6) | (y << 4) | (x << 2) | w) as i32
8faf50e0
XL
992}
993
532ac7d7 994/// Shuffles packed single-precision (32-bit) floating-point elements in `a` and
17df50a5 995/// `b` using `MASK`.
0531ce1d
XL
996///
997/// The lower half of result takes values from `a` and the higher half from
998/// `b`. Mask is split to 2 control bits each to index the element from inputs.
83c7162d
XL
999///
1000/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_ps)
3dfed10e
XL
1001///
1002/// Note that there appears to be a mistake within Intel's Intrinsics Guide.
94222f64 1003/// `_mm_shuffle_ps` is supposed to take an `i32` instead of a `u32`
3dfed10e
XL
1004/// as is the case for [other shuffle intrinsics](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_).
1005/// Performing an implicit type conversion between an unsigned integer and a signed integer
1006/// does not cause a problem in C, however Rust's commitment to strong typing does not allow this.
0531ce1d
XL
1007#[inline]
1008#[target_feature(enable = "sse")]
17df50a5
XL
1009#[cfg_attr(test, assert_instr(shufps, MASK = 3))]
1010#[rustc_legacy_const_generics(2)]
1011#[stable(feature = "simd_x86", since = "1.27.0")]
1012pub unsafe fn _mm_shuffle_ps<const MASK: i32>(a: __m128, b: __m128) -> __m128 {
1013 static_assert_imm8!(MASK);
1014 simd_shuffle4!(
1015 a,
1016 b,
1017 <const MASK: i32> [
1018 MASK as u32 & 0b11,
1019 (MASK as u32 >> 2) & 0b11,
1020 ((MASK as u32 >> 4) & 0b11) + 4,
1021 ((MASK as u32 >> 6) & 0b11) + 4,
1022 ],
1023 )
0531ce1d
XL
1024}
1025
532ac7d7 1026/// Unpacks and interleave single-precision (32-bit) floating-point elements
0531ce1d 1027/// from the higher half of `a` and `b`.
83c7162d
XL
1028///
1029/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_ps)
0531ce1d
XL
1030#[inline]
1031#[target_feature(enable = "sse")]
1032#[cfg_attr(test, assert_instr(unpckhps))]
83c7162d 1033#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 1034pub unsafe fn _mm_unpackhi_ps(a: __m128, b: __m128) -> __m128 {
17df50a5 1035 simd_shuffle4!(a, b, [2, 6, 3, 7])
0531ce1d
XL
1036}
1037
532ac7d7 1038/// Unpacks and interleave single-precision (32-bit) floating-point elements
0531ce1d 1039/// from the lower half of `a` and `b`.
83c7162d
XL
1040///
1041/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_ps)
0531ce1d
XL
1042#[inline]
1043#[target_feature(enable = "sse")]
1044#[cfg_attr(test, assert_instr(unpcklps))]
83c7162d 1045#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 1046pub unsafe fn _mm_unpacklo_ps(a: __m128, b: __m128) -> __m128 {
17df50a5 1047 simd_shuffle4!(a, b, [0, 4, 1, 5])
0531ce1d
XL
1048}
1049
1050/// Combine higher half of `a` and `b`. The highwe half of `b` occupies the
1051/// lower half of result.
83c7162d
XL
1052///
1053/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehl_ps)
0531ce1d
XL
1054#[inline]
1055#[target_feature(enable = "sse")]
0731742a 1056#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movhlps))]
83c7162d 1057#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1058pub unsafe fn _mm_movehl_ps(a: __m128, b: __m128) -> __m128 {
1059 // TODO; figure why this is a different instruction on Windows?
17df50a5 1060 simd_shuffle4!(a, b, [6, 7, 2, 3])
0531ce1d
XL
1061}
1062
1063/// Combine lower half of `a` and `b`. The lower half of `b` occupies the
1064/// higher half of result.
83c7162d
XL
1065///
1066/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movelh_ps)
0531ce1d
XL
1067#[inline]
1068#[target_feature(enable = "sse")]
0731742a 1069#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movlhps))]
83c7162d 1070#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 1071pub unsafe fn _mm_movelh_ps(a: __m128, b: __m128) -> __m128 {
17df50a5 1072 simd_shuffle4!(a, b, [0, 1, 4, 5])
0531ce1d
XL
1073}
1074
532ac7d7 1075/// Returns a mask of the most significant bit of each element in `a`.
0531ce1d
XL
1076///
1077/// The mask is stored in the 4 least significant bits of the return value.
1078/// All other bits are set to `0`.
83c7162d
XL
1079///
1080/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_ps)
0531ce1d
XL
1081#[inline]
1082#[target_feature(enable = "sse")]
e1599b0c
XL
1083// FIXME: LLVM9 trunk has the following bug:
1084// https://github.com/rust-lang/stdarch/issues/794
1085// so we only temporarily test this on i686 and x86_64 but not on i586:
1086#[cfg_attr(all(test, target_feature = "sse2"), assert_instr(movmskps))]
83c7162d 1087#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1088pub unsafe fn _mm_movemask_ps(a: __m128) -> i32 {
1089 movmskps(a)
1090}
1091
0531ce1d
XL
1092/// Construct a `__m128` with the lowest element read from `p` and the other
1093/// elements set to zero.
1094///
1095/// This corresponds to instructions `VMOVSS` / `MOVSS`.
83c7162d
XL
1096///
1097/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ss)
0531ce1d
XL
1098#[inline]
1099#[target_feature(enable = "sse")]
1100#[cfg_attr(test, assert_instr(movss))]
83c7162d 1101#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1102pub unsafe fn _mm_load_ss(p: *const f32) -> __m128 {
1103 __m128(*p, 0.0, 0.0, 0.0)
1104}
1105
1106/// Construct a `__m128` by duplicating the value read from `p` into all
1107/// elements.
1108///
1109/// This corresponds to instructions `VMOVSS` / `MOVSS` followed by some
1110/// shuffling.
83c7162d
XL
1111///
1112/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_ps)
0531ce1d
XL
1113#[inline]
1114#[target_feature(enable = "sse")]
1115#[cfg_attr(test, assert_instr(movss))]
83c7162d 1116#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1117pub unsafe fn _mm_load1_ps(p: *const f32) -> __m128 {
1118 let a = *p;
1119 __m128(a, a, a, a)
1120}
1121
1122/// Alias for [`_mm_load1_ps`](fn._mm_load1_ps.html)
83c7162d
XL
1123///
1124/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1)
0531ce1d
XL
1125#[inline]
1126#[target_feature(enable = "sse")]
1127#[cfg_attr(test, assert_instr(movss))]
83c7162d 1128#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1129pub unsafe fn _mm_load_ps1(p: *const f32) -> __m128 {
1130 _mm_load1_ps(p)
1131}
1132
532ac7d7 1133/// Loads four `f32` values from *aligned* memory into a `__m128`. If the
0531ce1d
XL
1134/// pointer is not aligned to a 128-bit boundary (16 bytes) a general
1135/// protection fault will be triggered (fatal program crash).
1136///
1137/// Use [`_mm_loadu_ps`](fn._mm_loadu_ps.html) for potentially unaligned
1138/// memory.
1139///
1140/// This corresponds to instructions `VMOVAPS` / `MOVAPS`.
83c7162d
XL
1141///
1142/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps)
0531ce1d
XL
1143#[inline]
1144#[target_feature(enable = "sse")]
1145#[cfg_attr(test, assert_instr(movaps))]
83c7162d 1146#[stable(feature = "simd_x86", since = "1.27.0")]
48663c56 1147#[allow(clippy::cast_ptr_alignment)]
0531ce1d
XL
1148pub unsafe fn _mm_load_ps(p: *const f32) -> __m128 {
1149 *(p as *const __m128)
1150}
1151
532ac7d7 1152/// Loads four `f32` values from memory into a `__m128`. There are no
0531ce1d
XL
1153/// restrictions
1154/// on memory alignment. For aligned memory
1155/// [`_mm_load_ps`](fn._mm_load_ps.html)
1156/// may be faster.
1157///
1158/// This corresponds to instructions `VMOVUPS` / `MOVUPS`.
83c7162d
XL
1159///
1160/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_ps)
0531ce1d
XL
1161#[inline]
1162#[target_feature(enable = "sse")]
1163#[cfg_attr(test, assert_instr(movups))]
83c7162d 1164#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1165pub unsafe fn _mm_loadu_ps(p: *const f32) -> __m128 {
1166 // Note: Using `*p` would require `f32` alignment, but `movups` has no
1167 // alignment restrictions.
1168 let mut dst = _mm_undefined_ps();
1169 ptr::copy_nonoverlapping(
1170 p as *const u8,
1171 &mut dst as *mut __m128 as *mut u8,
1172 mem::size_of::<__m128>(),
1173 );
1174 dst
1175}
1176
532ac7d7 1177/// Loads four `f32` values from aligned memory into a `__m128` in reverse
0531ce1d
XL
1178/// order.
1179///
1180/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
1181/// protection fault will be triggered (fatal program crash).
1182///
1183/// Functionally equivalent to the following code sequence (assuming `p`
1184/// satisfies the alignment restrictions):
1185///
1186/// ```text
1187/// let a0 = *p;
1188/// let a1 = *p.offset(1);
1189/// let a2 = *p.offset(2);
1190/// let a3 = *p.offset(3);
1191/// __m128::new(a3, a2, a1, a0)
1192/// ```
1193///
1194/// This corresponds to instructions `VMOVAPS` / `MOVAPS` followed by some
1195/// shuffling.
83c7162d
XL
1196///
1197/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps)
0531ce1d
XL
1198#[inline]
1199#[target_feature(enable = "sse")]
1200#[cfg_attr(test, assert_instr(movaps))]
83c7162d 1201#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1202pub unsafe fn _mm_loadr_ps(p: *const f32) -> __m128 {
1203 let a = _mm_load_ps(p);
17df50a5 1204 simd_shuffle4!(a, a, [3, 2, 1, 0])
0531ce1d
XL
1205}
1206
3dfed10e
XL
1207/// Loads unaligned 64-bits of integer data from memory into new vector.
1208///
1209/// `mem_addr` does not need to be aligned on any particular boundary.
1210///
1211/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64)
1212#[inline]
1213#[target_feature(enable = "sse")]
3dfed10e
XL
1214#[stable(feature = "simd_x86_mm_loadu_si64", since = "1.46.0")]
1215pub unsafe fn _mm_loadu_si64(mem_addr: *const u8) -> __m128i {
cdc7bbd5 1216 transmute(i64x2(ptr::read_unaligned(mem_addr as *const i64), 0))
3dfed10e
XL
1217}
1218
532ac7d7 1219/// Stores the lowest 32 bit float of `a` into memory.
0531ce1d
XL
1220///
1221/// This intrinsic corresponds to the `MOVSS` instruction.
83c7162d
XL
1222///
1223/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ss)
0531ce1d
XL
1224#[inline]
1225#[target_feature(enable = "sse")]
1226#[cfg_attr(test, assert_instr(movss))]
83c7162d 1227#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1228pub unsafe fn _mm_store_ss(p: *mut f32, a: __m128) {
1229 *p = simd_extract(a, 0);
1230}
1231
532ac7d7 1232/// Stores the lowest 32 bit float of `a` repeated four times into *aligned*
0531ce1d
XL
1233/// memory.
1234///
1235/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
1236/// protection fault will be triggered (fatal program crash).
1237///
1238/// Functionally equivalent to the following code sequence (assuming `p`
1239/// satisfies the alignment restrictions):
1240///
1241/// ```text
1242/// let x = a.extract(0);
1243/// *p = x;
1244/// *p.offset(1) = x;
1245/// *p.offset(2) = x;
1246/// *p.offset(3) = x;
1247/// ```
83c7162d
XL
1248///
1249/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store1_ps)
0531ce1d
XL
1250#[inline]
1251#[target_feature(enable = "sse")]
1252#[cfg_attr(test, assert_instr(movaps))]
83c7162d 1253#[stable(feature = "simd_x86", since = "1.27.0")]
48663c56 1254#[allow(clippy::cast_ptr_alignment)]
0531ce1d 1255pub unsafe fn _mm_store1_ps(p: *mut f32, a: __m128) {
17df50a5 1256 let b: __m128 = simd_shuffle4!(a, a, [0, 0, 0, 0]);
0531ce1d
XL
1257 *(p as *mut __m128) = b;
1258}
1259
1260/// Alias for [`_mm_store1_ps`](fn._mm_store1_ps.html)
83c7162d
XL
1261///
1262/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps1)
0531ce1d
XL
1263#[inline]
1264#[target_feature(enable = "sse")]
1265#[cfg_attr(test, assert_instr(movaps))]
83c7162d 1266#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1267pub unsafe fn _mm_store_ps1(p: *mut f32, a: __m128) {
1268 _mm_store1_ps(p, a);
1269}
1270
532ac7d7 1271/// Stores four 32-bit floats into *aligned* memory.
0531ce1d
XL
1272///
1273/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
1274/// protection fault will be triggered (fatal program crash).
1275///
1276/// Use [`_mm_storeu_ps`](fn._mm_storeu_ps.html) for potentially unaligned
1277/// memory.
1278///
1279/// This corresponds to instructions `VMOVAPS` / `MOVAPS`.
83c7162d
XL
1280///
1281/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps)
0531ce1d
XL
1282#[inline]
1283#[target_feature(enable = "sse")]
1284#[cfg_attr(test, assert_instr(movaps))]
83c7162d 1285#[stable(feature = "simd_x86", since = "1.27.0")]
48663c56 1286#[allow(clippy::cast_ptr_alignment)]
0531ce1d
XL
1287pub unsafe fn _mm_store_ps(p: *mut f32, a: __m128) {
1288 *(p as *mut __m128) = a;
1289}
1290
532ac7d7 1291/// Stores four 32-bit floats into memory. There are no restrictions on memory
0531ce1d
XL
1292/// alignment. For aligned memory [`_mm_store_ps`](fn._mm_store_ps.html) may be
1293/// faster.
1294///
1295/// This corresponds to instructions `VMOVUPS` / `MOVUPS`.
83c7162d
XL
1296///
1297/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_ps)
0531ce1d
XL
1298#[inline]
1299#[target_feature(enable = "sse")]
1300#[cfg_attr(test, assert_instr(movups))]
83c7162d 1301#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1302pub unsafe fn _mm_storeu_ps(p: *mut f32, a: __m128) {
1303 ptr::copy_nonoverlapping(
1304 &a as *const __m128 as *const u8,
1305 p as *mut u8,
1306 mem::size_of::<__m128>(),
1307 );
1308}
1309
532ac7d7 1310/// Stores four 32-bit floats into *aligned* memory in reverse order.
0531ce1d
XL
1311///
1312/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
1313/// protection fault will be triggered (fatal program crash).
1314///
1315/// Functionally equivalent to the following code sequence (assuming `p`
1316/// satisfies the alignment restrictions):
1317///
1318/// ```text
1319/// *p = a.extract(3);
1320/// *p.offset(1) = a.extract(2);
1321/// *p.offset(2) = a.extract(1);
1322/// *p.offset(3) = a.extract(0);
1323/// ```
83c7162d
XL
1324///
1325/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_ps)
0531ce1d
XL
1326#[inline]
1327#[target_feature(enable = "sse")]
1328#[cfg_attr(test, assert_instr(movaps))]
83c7162d 1329#[stable(feature = "simd_x86", since = "1.27.0")]
48663c56 1330#[allow(clippy::cast_ptr_alignment)]
0531ce1d 1331pub unsafe fn _mm_storer_ps(p: *mut f32, a: __m128) {
17df50a5 1332 let b: __m128 = simd_shuffle4!(a, a, [3, 2, 1, 0]);
0531ce1d
XL
1333 *(p as *mut __m128) = b;
1334}
1335
532ac7d7 1336/// Returns a `__m128` with the first component from `b` and the remaining
0531ce1d
XL
1337/// components from `a`.
1338///
1339/// In other words for any `a` and `b`:
1340/// ```text
1341/// _mm_move_ss(a, b) == a.replace(0, b.extract(0))
1342/// ```
83c7162d
XL
1343///
1344/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_ss)
0531ce1d
XL
1345#[inline]
1346#[target_feature(enable = "sse")]
1347#[cfg_attr(test, assert_instr(movss))]
83c7162d 1348#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 1349pub unsafe fn _mm_move_ss(a: __m128, b: __m128) -> __m128 {
17df50a5 1350 simd_shuffle4!(a, b, [4, 1, 2, 3])
0531ce1d
XL
1351}
1352
532ac7d7 1353/// Performs a serializing operation on all store-to-memory instructions that
0531ce1d
XL
1354/// were issued prior to this instruction.
1355///
1356/// Guarantees that every store instruction that precedes, in program order, is
1357/// globally visible before any store instruction which follows the fence in
1358/// program order.
83c7162d
XL
1359///
1360/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sfence)
0531ce1d
XL
1361#[inline]
1362#[target_feature(enable = "sse")]
1363#[cfg_attr(test, assert_instr(sfence))]
83c7162d 1364#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1365pub unsafe fn _mm_sfence() {
1366 sfence()
1367}
1368
532ac7d7 1369/// Gets the unsigned 32-bit value of the MXCSR control and status register.
0531ce1d
XL
1370///
1371/// For more info see [`_mm_setcsr`](fn._mm_setcsr.html)
83c7162d
XL
1372///
1373/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getcsr)
0531ce1d
XL
1374#[inline]
1375#[target_feature(enable = "sse")]
1376#[cfg_attr(test, assert_instr(stmxcsr))]
83c7162d 1377#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1378pub unsafe fn _mm_getcsr() -> u32 {
1379 let mut result = 0_i32;
1380 stmxcsr((&mut result) as *mut _ as *mut i8);
1381 result as u32
1382}
1383
532ac7d7 1384/// Sets the MXCSR register with the 32-bit unsigned integer value.
0531ce1d
XL
1385///
1386/// This register controls how SIMD instructions handle floating point
1387/// operations. Modifying this register only affects the current thread.
1388///
1389/// It contains several groups of flags:
1390///
1391/// * *Exception flags* report which exceptions occurred since last they were
1392/// reset.
1393///
1394/// * *Masking flags* can be used to mask (ignore) certain exceptions. By
1395/// default
1396/// these flags are all set to 1, so all exceptions are masked. When an
1397/// an exception is masked, the processor simply sets the exception flag and
1398/// continues the operation. If the exception is unmasked, the flag is also set
1399/// but additionally an exception handler is invoked.
1400///
1401/// * *Rounding mode flags* control the rounding mode of floating point
1402/// instructions.
1403///
1404/// * The *denormals-are-zero mode flag* turns all numbers which would be
1405/// denormalized (exponent bits are all zeros) into zeros.
1406///
1407/// ## Exception Flags
1408///
1409/// * `_MM_EXCEPT_INVALID`: An invalid operation was performed (e.g., dividing
1410/// Infinity by Infinity).
1411///
1412/// * `_MM_EXCEPT_DENORM`: An operation attempted to operate on a denormalized
1413/// number. Mainly this can cause loss of precision.
1414///
a2a8927a 1415/// * `_MM_EXCEPT_DIV_ZERO`: Division by zero occurred.
0531ce1d 1416///
a2a8927a 1417/// * `_MM_EXCEPT_OVERFLOW`: A numeric overflow exception occurred, i.e., a
0531ce1d
XL
1418/// result was too large to be represented (e.g., an `f32` with absolute
1419/// value
1420/// greater than `2^128`).
1421///
a2a8927a 1422/// * `_MM_EXCEPT_UNDERFLOW`: A numeric underflow exception occurred, i.e., a
0531ce1d
XL
1423/// result was too small to be represented in a normalized way (e.g., an
1424/// `f32`
1425/// with absulte value smaller than `2^-126`.)
1426///
a2a8927a 1427/// * `_MM_EXCEPT_INEXACT`: An inexact-result exception occurred (a.k.a.
0531ce1d
XL
1428/// precision exception). This means some precision was lost due to rounding.
1429/// For example, the fraction `1/3` cannot be represented accurately in a
1430/// 32 or 64 bit float and computing it would cause this exception to be
1431/// raised. Precision exceptions are very common, so they are usually masked.
1432///
1433/// Exception flags can be read and set using the convenience functions
1434/// `_MM_GET_EXCEPTION_STATE` and `_MM_SET_EXCEPTION_STATE`. For example, to
1435/// check if an operation caused some overflow:
1436///
1437/// ```rust,ignore
1438/// _MM_SET_EXCEPTION_STATE(0); // clear all exception flags
1439/// // perform calculations
1440/// if _MM_GET_EXCEPTION_STATE() & _MM_EXCEPT_OVERFLOW != 0 {
1441/// // handle overflow
1442/// }
1443/// ```
1444///
1445/// ## Masking Flags
1446///
1447/// There is one masking flag for each exception flag: `_MM_MASK_INVALID`,
1448/// `_MM_MASK_DENORM`, `_MM_MASK_DIV_ZERO`, `_MM_MASK_OVERFLOW`,
1449/// `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`.
1450///
1451/// A single masking bit can be set via
1452///
1453/// ```rust,ignore
1454/// _MM_SET_EXCEPTION_MASK(_MM_MASK_UNDERFLOW);
1455/// ```
1456///
1457/// However, since mask bits are by default all set to 1, it is more common to
1458/// want to *disable* certain bits. For example, to unmask the underflow
1459/// exception, use:
1460///
1461/// ```rust,ignore
1462/// _mm_setcsr(_mm_getcsr() & !_MM_MASK_UNDERFLOW); // unmask underflow
1463/// exception
1464/// ```
1465///
1466/// Warning: an unmasked exception will cause an exception handler to be
1467/// called.
1468/// The standard handler will simply terminate the process. So, in this case
1469/// any underflow exception would terminate the current process with something
1470/// like `signal: 8, SIGFPE: erroneous arithmetic operation`.
1471///
1472/// ## Rounding Mode
1473///
1474/// The rounding mode is describe using two bits. It can be read and set using
1475/// the convenience wrappers `_MM_GET_ROUNDING_MODE()` and
1476/// `_MM_SET_ROUNDING_MODE(mode)`.
1477///
1478/// The rounding modes are:
1479///
1480/// * `_MM_ROUND_NEAREST`: (default) Round to closest to the infinite precision
1481/// value. If two values are equally close, round to even (i.e., least
1482/// significant bit will be zero).
1483///
1484/// * `_MM_ROUND_DOWN`: Round toward negative Infinity.
1485///
1486/// * `_MM_ROUND_UP`: Round toward positive Infinity.
1487///
1488/// * `_MM_ROUND_TOWARD_ZERO`: Round towards zero (truncate).
1489///
1490/// Example:
1491///
1492/// ```rust,ignore
1493/// _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN)
1494/// ```
1495///
1496/// ## Denormals-are-zero/Flush-to-zero Mode
1497///
1498/// If this bit is set, values that would be denormalized will be set to zero
1499/// instead. This is turned off by default.
1500///
1501/// You can read and enable/disable this mode via the helper functions
1502/// `_MM_GET_FLUSH_ZERO_MODE()` and `_MM_SET_FLUSH_ZERO_MODE()`:
1503///
1504/// ```rust,ignore
1505/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF); // turn off (default)
1506/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // turn on
1507/// ```
1508///
83c7162d
XL
1509///
1510/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setcsr)
0531ce1d
XL
1511#[inline]
1512#[target_feature(enable = "sse")]
1513#[cfg_attr(test, assert_instr(ldmxcsr))]
83c7162d 1514#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1515pub unsafe fn _mm_setcsr(val: u32) {
1516 ldmxcsr(&val as *const _ as *const i8);
1517}
1518
// MXCSR exception status bits: sticky flags recording which exceptions have
// been raised since they were last cleared.

/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_INVALID: u32 = 0x0001;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_DENORM: u32 = 0x0002;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_DIV_ZERO: u32 = 0x0004;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_OVERFLOW: u32 = 0x0008;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_UNDERFLOW: u32 = 0x0010;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_INEXACT: u32 = 0x0020;
// Bitwise OR of all six exception status bits above.
/// See [`_MM_GET_EXCEPTION_STATE`](fn._MM_GET_EXCEPTION_STATE.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_MASK: u32 = 0x003f;

// MXCSR exception mask bits: a set bit means the corresponding exception is
// masked (the CPU records it and continues instead of trapping).

/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_INVALID: u32 = 0x0080;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_DENORM: u32 = 0x0100;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_DIV_ZERO: u32 = 0x0200;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_OVERFLOW: u32 = 0x0400;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_UNDERFLOW: u32 = 0x0800;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_INEXACT: u32 = 0x1000;
// Bitwise OR of all six exception mask bits above.
/// See [`_MM_GET_EXCEPTION_MASK`](fn._MM_GET_EXCEPTION_MASK.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_MASK: u32 = 0x1f80;

// MXCSR rounding-control field (a two-bit field at bits 13-14).

/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_NEAREST: u32 = 0x0000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_DOWN: u32 = 0x2000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_UP: u32 = 0x4000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_TOWARD_ZERO: u32 = 0x6000;

/// See [`_MM_GET_ROUNDING_MODE`](fn._MM_GET_ROUNDING_MODE.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_MASK: u32 = 0x6000;

// MXCSR flush-to-zero control bit.

/// See [`_MM_GET_FLUSH_ZERO_MODE`](fn._MM_GET_FLUSH_ZERO_MODE.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FLUSH_ZERO_MASK: u32 = 0x8000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FLUSH_ZERO_ON: u32 = 0x8000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FLUSH_ZERO_OFF: u32 = 0x0000;
1589
1590/// See [`_mm_setcsr`](fn._mm_setcsr.html)
83c7162d
XL
1591///
1592/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_EXCEPTION_MASK)
0531ce1d
XL
1593#[inline]
1594#[allow(non_snake_case)]
1595#[target_feature(enable = "sse")]
83c7162d 1596#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1597pub unsafe fn _MM_GET_EXCEPTION_MASK() -> u32 {
1598 _mm_getcsr() & _MM_MASK_MASK
1599}
1600
1601/// See [`_mm_setcsr`](fn._mm_setcsr.html)
83c7162d
XL
1602///
1603/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_EXCEPTION_STATE)
0531ce1d
XL
1604#[inline]
1605#[allow(non_snake_case)]
1606#[target_feature(enable = "sse")]
83c7162d 1607#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1608pub unsafe fn _MM_GET_EXCEPTION_STATE() -> u32 {
1609 _mm_getcsr() & _MM_EXCEPT_MASK
1610}
1611
1612/// See [`_mm_setcsr`](fn._mm_setcsr.html)
83c7162d
XL
1613///
1614/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_FLUSH_ZERO_MODE)
0531ce1d
XL
1615#[inline]
1616#[allow(non_snake_case)]
1617#[target_feature(enable = "sse")]
83c7162d 1618#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1619pub unsafe fn _MM_GET_FLUSH_ZERO_MODE() -> u32 {
1620 _mm_getcsr() & _MM_FLUSH_ZERO_MASK
1621}
1622
1623/// See [`_mm_setcsr`](fn._mm_setcsr.html)
83c7162d
XL
1624///
1625/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_ROUNDING_MODE)
0531ce1d
XL
1626#[inline]
1627#[allow(non_snake_case)]
1628#[target_feature(enable = "sse")]
83c7162d 1629#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1630pub unsafe fn _MM_GET_ROUNDING_MODE() -> u32 {
1631 _mm_getcsr() & _MM_ROUND_MASK
1632}
1633
1634/// See [`_mm_setcsr`](fn._mm_setcsr.html)
83c7162d
XL
1635///
1636/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_EXCEPTION_MASK)
0531ce1d
XL
1637#[inline]
1638#[allow(non_snake_case)]
1639#[target_feature(enable = "sse")]
83c7162d 1640#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1641pub unsafe fn _MM_SET_EXCEPTION_MASK(x: u32) {
1642 _mm_setcsr((_mm_getcsr() & !_MM_MASK_MASK) | x)
1643}
1644
1645/// See [`_mm_setcsr`](fn._mm_setcsr.html)
83c7162d
XL
1646///
1647/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_EXCEPTION_STATE)
0531ce1d
XL
1648#[inline]
1649#[allow(non_snake_case)]
1650#[target_feature(enable = "sse")]
83c7162d 1651#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1652pub unsafe fn _MM_SET_EXCEPTION_STATE(x: u32) {
1653 _mm_setcsr((_mm_getcsr() & !_MM_EXCEPT_MASK) | x)
1654}
1655
1656/// See [`_mm_setcsr`](fn._mm_setcsr.html)
83c7162d
XL
1657///
1658/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_FLUSH_ZERO_MODE)
0531ce1d
XL
1659#[inline]
1660#[allow(non_snake_case)]
1661#[target_feature(enable = "sse")]
83c7162d 1662#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1663pub unsafe fn _MM_SET_FLUSH_ZERO_MODE(x: u32) {
1664 let val = (_mm_getcsr() & !_MM_FLUSH_ZERO_MASK) | x;
1665 // println!("setting csr={:x}", val);
1666 _mm_setcsr(val)
1667}
1668
1669/// See [`_mm_setcsr`](fn._mm_setcsr.html)
83c7162d
XL
1670///
1671/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_ROUNDING_MODE)
0531ce1d
XL
1672#[inline]
1673#[allow(non_snake_case)]
1674#[target_feature(enable = "sse")]
83c7162d 1675#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1676pub unsafe fn _MM_SET_ROUNDING_MODE(x: u32) {
1677 _mm_setcsr((_mm_getcsr() & !_MM_ROUND_MASK) | x)
1678}
1679
// Prefetch hint encodings consumed by `_mm_prefetch`: bits 0-1 select the
// locality level and bit 2 selects write intent (the `ET` variants).

/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_T0: i32 = 3;

/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_T1: i32 = 2;

/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_T2: i32 = 1;

/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_NTA: i32 = 0;

/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_ET0: i32 = 7;

/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_ET1: i32 = 6;
1703
/// Fetch the cache line that contains address `p` using the given `STRATEGY`.
///
/// The `STRATEGY` must be one of:
///
/// * [`_MM_HINT_T0`](constant._MM_HINT_T0.html): Fetch into all levels of the
///   cache hierarchy.
///
/// * [`_MM_HINT_T1`](constant._MM_HINT_T1.html): Fetch into L2 and higher.
///
/// * [`_MM_HINT_T2`](constant._MM_HINT_T2.html): Fetch into L3 and higher or
///   an implementation-specific choice (e.g., L2 if there is no L3).
///
/// * [`_MM_HINT_NTA`](constant._MM_HINT_NTA.html): Fetch data using the
///   non-temporal access (NTA) hint. It may be a place closer than main memory
///   but outside of the cache hierarchy. This is used to reduce access latency
///   without polluting the cache.
///
/// * [`_MM_HINT_ET0`](constant._MM_HINT_ET0.html) and
///   [`_MM_HINT_ET1`](constant._MM_HINT_ET1.html) are similar to `_MM_HINT_T0`
///   and `_MM_HINT_T1` but indicate an anticipation to write to the address.
///
/// The actual implementation depends on the particular CPU. This instruction
/// is considered a hint, so the CPU is also free to simply ignore the request.
///
/// The amount of prefetched data depends on the cache line size of the
/// specific CPU, but it will be at least 32 bytes.
///
/// Common caveats:
///
/// * Most modern CPUs already automatically prefetch data based on predicted
///   access patterns.
///
/// * Data is usually not fetched if this would cause a TLB miss or a page
///   fault.
///
/// * Too much prefetching can cause unnecessary cache evictions.
///
/// * Prefetching may also fail if there are not enough memory-subsystem
///   resources (e.g., request buffers).
///
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_prefetch)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(prefetcht0, STRATEGY = _MM_HINT_T0))]
#[cfg_attr(test, assert_instr(prefetcht1, STRATEGY = _MM_HINT_T1))]
#[cfg_attr(test, assert_instr(prefetcht2, STRATEGY = _MM_HINT_T2))]
#[cfg_attr(test, assert_instr(prefetchnta, STRATEGY = _MM_HINT_NTA))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_prefetch<const STRATEGY: i32>(p: *const i8) {
    // We use the `llvm.prefetch` intrinsic with `cache type` = 1 (data cache).
    // `locality` and `rw` are based on our `STRATEGY`:
    // bit 2 of `STRATEGY` encodes write intent (`rw`), bits 0-1 the locality.
    prefetch(p, (STRATEGY >> 2) & 1, STRATEGY & 3, 1);
}
1759
/// Returns vector of type __m128 with undefined elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps)
#[inline]
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_undefined_ps() -> __m128 {
    // Despite the name, a concrete all-zeros vector is returned: handing
    // callers genuinely uninitialized memory would be undefined behavior
    // in Rust, and the intrinsic's contract allows any value.
    _mm_set1_ps(0.0)
}
1769
1770/// Transpose the 4x4 matrix formed by 4 rows of __m128 in place.
83c7162d
XL
1771///
1772/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_TRANSPOSE4_PS)
0531ce1d
XL
1773#[inline]
1774#[allow(non_snake_case)]
1775#[target_feature(enable = "sse")]
83c7162d 1776#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 1777pub unsafe fn _MM_TRANSPOSE4_PS(
0731742a
XL
1778 row0: &mut __m128,
1779 row1: &mut __m128,
1780 row2: &mut __m128,
1781 row3: &mut __m128,
0531ce1d
XL
1782) {
1783 let tmp0 = _mm_unpacklo_ps(*row0, *row1);
1784 let tmp2 = _mm_unpacklo_ps(*row2, *row3);
1785 let tmp1 = _mm_unpackhi_ps(*row0, *row1);
1786 let tmp3 = _mm_unpackhi_ps(*row2, *row3);
1787
1788 *row0 = _mm_movelh_ps(tmp0, tmp2);
1789 *row1 = _mm_movehl_ps(tmp2, tmp0);
1790 *row2 = _mm_movelh_ps(tmp1, tmp3);
1791 *row3 = _mm_movehl_ps(tmp3, tmp1);
1792}
1793
// Raw LLVM intrinsic declarations backing the safe(-ish) wrappers above.
// The `link_name`s are resolved by LLVM itself, not by the linker.
#[allow(improper_ctypes)]
extern "C" {
    #[link_name = "llvm.x86.sse.add.ss"]
    fn addss(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.sub.ss"]
    fn subss(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.mul.ss"]
    fn mulss(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.div.ss"]
    fn divss(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.sqrt.ss"]
    fn sqrtss(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.sqrt.ps"]
    fn sqrtps(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.rcp.ss"]
    fn rcpss(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.rcp.ps"]
    fn rcpps(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.rsqrt.ss"]
    fn rsqrtss(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.rsqrt.ps"]
    fn rsqrtps(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.min.ss"]
    fn minss(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.min.ps"]
    fn minps(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.max.ss"]
    fn maxss(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.max.ps"]
    fn maxps(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.movmsk.ps"]
    fn movmskps(a: __m128) -> i32;
    #[link_name = "llvm.x86.sse.cmp.ps"]
    fn cmpps(a: __m128, b: __m128, imm8: i8) -> __m128;
    #[link_name = "llvm.x86.sse.comieq.ss"]
    fn comieq_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.comilt.ss"]
    fn comilt_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.comile.ss"]
    fn comile_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.comigt.ss"]
    fn comigt_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.comige.ss"]
    fn comige_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.comineq.ss"]
    fn comineq_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomieq.ss"]
    fn ucomieq_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomilt.ss"]
    fn ucomilt_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomile.ss"]
    fn ucomile_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomigt.ss"]
    fn ucomigt_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomige.ss"]
    fn ucomige_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomineq.ss"]
    fn ucomineq_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.cvtss2si"]
    fn cvtss2si(a: __m128) -> i32;
    #[link_name = "llvm.x86.sse.cvttss2si"]
    fn cvttss2si(a: __m128) -> i32;
    #[link_name = "llvm.x86.sse.cvtsi2ss"]
    fn cvtsi2ss(a: __m128, b: i32) -> __m128;
    #[link_name = "llvm.x86.sse.sfence"]
    fn sfence();
    #[link_name = "llvm.x86.sse.stmxcsr"]
    fn stmxcsr(p: *mut i8);
    #[link_name = "llvm.x86.sse.ldmxcsr"]
    fn ldmxcsr(p: *const i8);
    #[link_name = "llvm.prefetch"]
    fn prefetch(p: *const i8, rw: i32, loc: i32, ty: i32);
    #[link_name = "llvm.x86.sse.cmp.ss"]
    fn cmpss(a: __m128, b: __m128, imm8: i8) -> __m128;
}
1869
/// Stores `a` into the memory at `mem_addr` using a non-temporal memory hint.
///
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
/// exception _may_ be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movntps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: __m128) {
    // The `*mut f32` -> `*mut __m128` cast is why `cast_ptr_alignment` is
    // allowed above; the caller must uphold the 16-byte alignment contract.
    intrinsics::nontemporal_store(mem_addr as *mut __m128, a);
}
1884
0531ce1d
XL
1885#[cfg(test)]
1886mod tests {
48663c56
XL
1887 use crate::{hint::black_box, mem::transmute};
1888 use std::{boxed, f32::NAN};
416331ca 1889 use stdarch_test::simd_test;
0531ce1d 1890
532ac7d7 1891 use crate::core_arch::{simd::*, x86::*};
0531ce1d 1892
    // Lane-wise arithmetic: `_ps` variants operate on all four lanes; `_ss`
    // variants operate only on lane 0 and copy the remaining lanes from `a`.

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_add_ps() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_add_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(-101.0, 25.0, 0.0, -15.0));
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_add_ss() {
        // NB: `_mm_set_ps` takes lanes in reverse order, so lane 0 is -10.0.
        let a = _mm_set_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_set_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_add_ss(a, b);
        assert_eq_m128(r, _mm_set_ps(-1.0, 5.0, 0.0, -15.0));
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_sub_ps() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_sub_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(99.0, -15.0, 0.0, -5.0));
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_sub_ss() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_sub_ss(a, b);
        assert_eq_m128(r, _mm_setr_ps(99.0, 5.0, 0.0, -10.0));
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_mul_ps() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_mul_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(100.0, 100.0, 0.0, 50.0));
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_mul_ss() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_mul_ss(a, b);
        assert_eq_m128(r, _mm_setr_ps(100.0, 5.0, 0.0, -10.0));
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_div_ps() {
        let a = _mm_setr_ps(-1.0, 5.0, 2.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.2, -5.0);
        let r = _mm_div_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(0.01, 0.25, 10.0, 2.0));
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_div_ss() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_div_ss(a, b);
        assert_eq_m128(r, _mm_setr_ps(0.01, 5.0, 0.0, -10.0));
    }
1956
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_sqrt_ss() {
        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
        let r = _mm_sqrt_ss(a);
        let e = _mm_setr_ps(2.0, 13.0, 16.0, 100.0);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_sqrt_ps() {
        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
        let r = _mm_sqrt_ps(a);
        let e = _mm_setr_ps(2.0, 3.6055512, 4.0, 10.0);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_rcp_ss() {
        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
        let r = _mm_rcp_ss(a);
        let e = _mm_setr_ps(0.24993896, 13.0, 16.0, 100.0);
        assert_eq_m128(r, e);
    }

    // `rcp`/`rsqrt` are hardware approximations, so the remaining tests
    // compare lane-by-lane against a relative-error tolerance (2^-11).

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_rcp_ps() {
        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
        let r = _mm_rcp_ps(a);
        let e = _mm_setr_ps(0.24993896, 0.0769043, 0.06248474, 0.0099983215);
        let rel_err = 0.00048828125;
        for i in 0..4 {
            assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err);
        }
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_rsqrt_ss() {
        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
        let r = _mm_rsqrt_ss(a);
        let e = _mm_setr_ps(0.49987793, 13.0, 16.0, 100.0);
        let rel_err = 0.00048828125;
        for i in 0..4 {
            assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err);
        }
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_rsqrt_ps() {
        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
        let r = _mm_rsqrt_ps(a);
        let e = _mm_setr_ps(0.49987793, 0.2772827, 0.24993896, 0.099990845);
        let rel_err = 0.00048828125;
        for i in 0..4 {
            assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err);
        }
    }
2013
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_min_ss() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_min_ss(a, b);
        assert_eq_m128(r, _mm_setr_ps(-100.0, 5.0, 0.0, -10.0));
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_min_ps() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_min_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(-100.0, 5.0, 0.0, -10.0));

        // `_mm_min_ps` can **not** be implemented using the `simd_min` rust intrinsic. `simd_min`
        // is lowered by the llvm codegen backend to `llvm.minnum.v*` llvm intrinsic. This intrinsic
        // doesn't specify how -0.0 is handled. Unfortunately it happens to behave different from
        // the `minps` x86 instruction on x86. The `llvm.minnum.v*` llvm intrinsic equals
        // `r1` to `a` and `r2` to `b`.
        let a = _mm_setr_ps(-0.0, 0.0, 0.0, 0.0);
        let b = _mm_setr_ps(0.0, 0.0, 0.0, 0.0);
        let r1: [u8; 16] = transmute(_mm_min_ps(a, b));
        let r2: [u8; 16] = transmute(_mm_min_ps(b, a));
        let a: [u8; 16] = transmute(a);
        let b: [u8; 16] = transmute(b);
        assert_eq!(r1, b);
        assert_eq!(r2, a);
        assert_ne!(a, b); // sanity check that -0.0 is actually present
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_max_ss() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_max_ss(a, b);
        assert_eq_m128(r, _mm_setr_ps(-1.0, 5.0, 0.0, -10.0));
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_max_ps() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_max_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(-1.0, 20.0, 0.0, -5.0));
    }
2060
    // Bitwise ops are checked via `u32x4` bit patterns; `black_box` prevents
    // the operands from being constant-folded away.

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_and_ps() {
        let a = transmute(u32x4::splat(0b0011));
        let b = transmute(u32x4::splat(0b0101));
        let r = _mm_and_ps(*black_box(&a), *black_box(&b));
        let e = transmute(u32x4::splat(0b0001));
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_andnot_ps() {
        let a = transmute(u32x4::splat(0b0011));
        let b = transmute(u32x4::splat(0b0101));
        let r = _mm_andnot_ps(*black_box(&a), *black_box(&b));
        let e = transmute(u32x4::splat(0b0100));
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_or_ps() {
        let a = transmute(u32x4::splat(0b0011));
        let b = transmute(u32x4::splat(0b0101));
        let r = _mm_or_ps(*black_box(&a), *black_box(&b));
        let e = transmute(u32x4::splat(0b0111));
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_xor_ps() {
        let a = transmute(u32x4::splat(0b0011));
        let b = transmute(u32x4::splat(0b0101));
        let r = _mm_xor_ps(*black_box(&a), *black_box(&b));
        let e = transmute(u32x4::splat(0b0110));
        assert_eq_m128(r, e);
    }
2096
    // Scalar comparisons produce an all-ones (0xffffffff) or all-zeros mask
    // in lane 0 and copy lanes 1-3 from `a`; results are inspected as `u32x4`.

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpeq_ss() {
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(-1.0, 5.0, 6.0, 7.0);
        let r: u32x4 = transmute(_mm_cmpeq_ss(a, b));
        let e: u32x4 = transmute(_mm_setr_ps(transmute(0u32), 2.0, 3.0, 4.0));
        assert_eq!(r, e);

        let b2 = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
        let r2: u32x4 = transmute(_mm_cmpeq_ss(a, b2));
        let e2: u32x4 = transmute(_mm_setr_ps(transmute(0xffffffffu32), 2.0, 3.0, 4.0));
        assert_eq!(r2, e2);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmplt_ss() {
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);

        let b1 = 0u32; // a.extract(0) < b.extract(0)
        let c1 = 0u32; // a.extract(0) < c.extract(0)
        let d1 = !0u32; // a.extract(0) < d.extract(0)

        let rb: u32x4 = transmute(_mm_cmplt_ss(a, b));
        let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
        assert_eq!(rb, eb);

        let rc: u32x4 = transmute(_mm_cmplt_ss(a, c));
        let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
        assert_eq!(rc, ec);

        let rd: u32x4 = transmute(_mm_cmplt_ss(a, d));
        let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
        assert_eq!(rd, ed);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmple_ss() {
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);

        let b1 = 0u32; // a.extract(0) <= b.extract(0)
        let c1 = !0u32; // a.extract(0) <= c.extract(0)
        let d1 = !0u32; // a.extract(0) <= d.extract(0)

        let rb: u32x4 = transmute(_mm_cmple_ss(a, b));
        let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
        assert_eq!(rb, eb);

        let rc: u32x4 = transmute(_mm_cmple_ss(a, c));
        let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
        assert_eq!(rc, ec);

        let rd: u32x4 = transmute(_mm_cmple_ss(a, d));
        let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
        assert_eq!(rd, ed);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpgt_ss() {
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);

        let b1 = !0u32; // a.extract(0) > b.extract(0)
        let c1 = 0u32; // a.extract(0) > c.extract(0)
        let d1 = 0u32; // a.extract(0) > d.extract(0)

        let rb: u32x4 = transmute(_mm_cmpgt_ss(a, b));
        let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
        assert_eq!(rb, eb);

        let rc: u32x4 = transmute(_mm_cmpgt_ss(a, c));
        let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
        assert_eq!(rc, ec);

        let rd: u32x4 = transmute(_mm_cmpgt_ss(a, d));
        let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
        assert_eq!(rd, ed);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpge_ss() {
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);

        let b1 = !0u32; // a.extract(0) >= b.extract(0)
        let c1 = !0u32; // a.extract(0) >= c.extract(0)
        let d1 = 0u32; // a.extract(0) >= d.extract(0)

        let rb: u32x4 = transmute(_mm_cmpge_ss(a, b));
        let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
        assert_eq!(rb, eb);

        let rc: u32x4 = transmute(_mm_cmpge_ss(a, c));
        let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
        assert_eq!(rc, ec);

        let rd: u32x4 = transmute(_mm_cmpge_ss(a, d));
        let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
        assert_eq!(rd, ed);
    }
2206
83c7162d 2207 #[simd_test(enable = "sse")]
0531ce1d
XL
2208 unsafe fn test_mm_cmpneq_ss() {
2209 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2210 let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2211 let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2212 let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2213
2214 let b1 = !0u32; // a.extract(0) != b.extract(0)
2215 let c1 = 0u32; // a.extract(0) != c.extract(0)
2216 let d1 = !0u32; // a.extract(0) != d.extract(0)
2217
2218 let rb: u32x4 = transmute(_mm_cmpneq_ss(a, b));
2219 let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
2220 assert_eq!(rb, eb);
2221
2222 let rc: u32x4 = transmute(_mm_cmpneq_ss(a, c));
2223 let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
2224 assert_eq!(rc, ec);
2225
2226 let rd: u32x4 = transmute(_mm_cmpneq_ss(a, d));
2227 let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
2228 assert_eq!(rd, ed);
2229 }
2230
83c7162d 2231 #[simd_test(enable = "sse")]
0531ce1d 2232 unsafe fn test_mm_cmpnlt_ss() {
532ac7d7 2233 // TODO: this test is exactly the same as for `_mm_cmpge_ss`, but there
0531ce1d
XL
2234 // must be a difference. It may have to do with behavior in the
2235 // presence of NaNs (signaling or quiet). If so, we should add tests
2236 // for those.
2237
2238 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2239 let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2240 let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2241 let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2242
2243 let b1 = !0u32; // a.extract(0) >= b.extract(0)
2244 let c1 = !0u32; // a.extract(0) >= c.extract(0)
2245 let d1 = 0u32; // a.extract(0) >= d.extract(0)
2246
2247 let rb: u32x4 = transmute(_mm_cmpnlt_ss(a, b));
2248 let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
2249 assert_eq!(rb, eb);
2250
2251 let rc: u32x4 = transmute(_mm_cmpnlt_ss(a, c));
2252 let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
2253 assert_eq!(rc, ec);
2254
2255 let rd: u32x4 = transmute(_mm_cmpnlt_ss(a, d));
2256 let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
2257 assert_eq!(rd, ed);
2258 }
2259
83c7162d 2260 #[simd_test(enable = "sse")]
0531ce1d 2261 unsafe fn test_mm_cmpnle_ss() {
532ac7d7 2262 // TODO: this test is exactly the same as for `_mm_cmpgt_ss`, but there
0531ce1d
XL
2263 // must be a difference. It may have to do with behavior in the
2264 // presence
2265 // of NaNs (signaling or quiet). If so, we should add tests for those.
2266
2267 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2268 let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2269 let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2270 let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2271
2272 let b1 = !0u32; // a.extract(0) > b.extract(0)
2273 let c1 = 0u32; // a.extract(0) > c.extract(0)
2274 let d1 = 0u32; // a.extract(0) > d.extract(0)
2275
2276 let rb: u32x4 = transmute(_mm_cmpnle_ss(a, b));
2277 let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
2278 assert_eq!(rb, eb);
2279
2280 let rc: u32x4 = transmute(_mm_cmpnle_ss(a, c));
2281 let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
2282 assert_eq!(rc, ec);
2283
2284 let rd: u32x4 = transmute(_mm_cmpnle_ss(a, d));
2285 let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
2286 assert_eq!(rd, ed);
2287 }
2288
83c7162d 2289 #[simd_test(enable = "sse")]
0531ce1d 2290 unsafe fn test_mm_cmpngt_ss() {
532ac7d7 2291 // TODO: this test is exactly the same as for `_mm_cmple_ss`, but there
0531ce1d
XL
2292 // must be a difference. It may have to do with behavior in the
2293 // presence of NaNs (signaling or quiet). If so, we should add tests
2294 // for those.
2295
2296 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2297 let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2298 let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2299 let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2300
2301 let b1 = 0u32; // a.extract(0) <= b.extract(0)
2302 let c1 = !0u32; // a.extract(0) <= c.extract(0)
2303 let d1 = !0u32; // a.extract(0) <= d.extract(0)
2304
2305 let rb: u32x4 = transmute(_mm_cmpngt_ss(a, b));
2306 let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
2307 assert_eq!(rb, eb);
2308
2309 let rc: u32x4 = transmute(_mm_cmpngt_ss(a, c));
2310 let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
2311 assert_eq!(rc, ec);
2312
2313 let rd: u32x4 = transmute(_mm_cmpngt_ss(a, d));
2314 let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
2315 assert_eq!(rd, ed);
2316 }
2317
83c7162d 2318 #[simd_test(enable = "sse")]
0531ce1d 2319 unsafe fn test_mm_cmpnge_ss() {
532ac7d7 2320 // TODO: this test is exactly the same as for `_mm_cmplt_ss`, but there
0531ce1d
XL
2321 // must be a difference. It may have to do with behavior in the
2322 // presence of NaNs (signaling or quiet). If so, we should add tests
2323 // for those.
2324
2325 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2326 let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2327 let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2328 let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2329
2330 let b1 = 0u32; // a.extract(0) < b.extract(0)
2331 let c1 = 0u32; // a.extract(0) < c.extract(0)
2332 let d1 = !0u32; // a.extract(0) < d.extract(0)
2333
2334 let rb: u32x4 = transmute(_mm_cmpnge_ss(a, b));
2335 let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
2336 assert_eq!(rb, eb);
2337
2338 let rc: u32x4 = transmute(_mm_cmpnge_ss(a, c));
2339 let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
2340 assert_eq!(rc, ec);
2341
2342 let rd: u32x4 = transmute(_mm_cmpnge_ss(a, d));
2343 let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
2344 assert_eq!(rd, ed);
2345 }
2346
83c7162d 2347 #[simd_test(enable = "sse")]
0531ce1d
XL
2348 unsafe fn test_mm_cmpord_ss() {
2349 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2350 let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2351 let c = _mm_setr_ps(NAN, 5.0, 6.0, 7.0);
2352 let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2353
2354 let b1 = !0u32; // a.extract(0) ord b.extract(0)
2355 let c1 = 0u32; // a.extract(0) ord c.extract(0)
2356 let d1 = !0u32; // a.extract(0) ord d.extract(0)
2357
2358 let rb: u32x4 = transmute(_mm_cmpord_ss(a, b));
2359 let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
2360 assert_eq!(rb, eb);
2361
2362 let rc: u32x4 = transmute(_mm_cmpord_ss(a, c));
2363 let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
2364 assert_eq!(rc, ec);
2365
2366 let rd: u32x4 = transmute(_mm_cmpord_ss(a, d));
2367 let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
2368 assert_eq!(rd, ed);
2369 }
2370
83c7162d 2371 #[simd_test(enable = "sse")]
0531ce1d
XL
2372 unsafe fn test_mm_cmpunord_ss() {
2373 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2374 let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2375 let c = _mm_setr_ps(NAN, 5.0, 6.0, 7.0);
2376 let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2377
2378 let b1 = 0u32; // a.extract(0) unord b.extract(0)
2379 let c1 = !0u32; // a.extract(0) unord c.extract(0)
2380 let d1 = 0u32; // a.extract(0) unord d.extract(0)
2381
2382 let rb: u32x4 = transmute(_mm_cmpunord_ss(a, b));
2383 let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
2384 assert_eq!(rb, eb);
2385
2386 let rc: u32x4 = transmute(_mm_cmpunord_ss(a, c));
2387 let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
2388 assert_eq!(rc, ec);
2389
2390 let rd: u32x4 = transmute(_mm_cmpunord_ss(a, d));
2391 let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
2392 assert_eq!(rd, ed);
2393 }
2394
83c7162d 2395 #[simd_test(enable = "sse")]
0531ce1d
XL
2396 unsafe fn test_mm_cmpeq_ps() {
2397 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2398 let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
2399 let tru = !0u32;
2400 let fls = 0u32;
2401
2402 let e = u32x4::new(fls, fls, tru, fls);
2403 let r: u32x4 = transmute(_mm_cmpeq_ps(a, b));
2404 assert_eq!(r, e);
2405 }
2406
83c7162d 2407 #[simd_test(enable = "sse")]
0531ce1d
XL
2408 unsafe fn test_mm_cmplt_ps() {
2409 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2410 let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
2411 let tru = !0u32;
2412 let fls = 0u32;
2413
2414 let e = u32x4::new(tru, fls, fls, fls);
2415 let r: u32x4 = transmute(_mm_cmplt_ps(a, b));
2416 assert_eq!(r, e);
2417 }
2418
83c7162d 2419 #[simd_test(enable = "sse")]
0531ce1d
XL
2420 unsafe fn test_mm_cmple_ps() {
2421 let a = _mm_setr_ps(10.0, 50.0, 1.0, 4.0);
2422 let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
2423 let tru = !0u32;
2424 let fls = 0u32;
2425
2426 let e = u32x4::new(tru, fls, tru, fls);
2427 let r: u32x4 = transmute(_mm_cmple_ps(a, b));
2428 assert_eq!(r, e);
2429 }
2430
83c7162d 2431 #[simd_test(enable = "sse")]
0531ce1d
XL
2432 unsafe fn test_mm_cmpgt_ps() {
2433 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2434 let b = _mm_setr_ps(15.0, 20.0, 1.0, 42.0);
2435 let tru = !0u32;
2436 let fls = 0u32;
2437
2438 let e = u32x4::new(fls, tru, fls, fls);
2439 let r: u32x4 = transmute(_mm_cmpgt_ps(a, b));
2440 assert_eq!(r, e);
2441 }
2442
83c7162d 2443 #[simd_test(enable = "sse")]
0531ce1d
XL
2444 unsafe fn test_mm_cmpge_ps() {
2445 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2446 let b = _mm_setr_ps(15.0, 20.0, 1.0, 42.0);
2447 let tru = !0u32;
2448 let fls = 0u32;
2449
2450 let e = u32x4::new(fls, tru, tru, fls);
2451 let r: u32x4 = transmute(_mm_cmpge_ps(a, b));
2452 assert_eq!(r, e);
2453 }
2454
83c7162d 2455 #[simd_test(enable = "sse")]
0531ce1d
XL
2456 unsafe fn test_mm_cmpneq_ps() {
2457 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2458 let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
2459 let tru = !0u32;
2460 let fls = 0u32;
2461
2462 let e = u32x4::new(tru, tru, fls, tru);
2463 let r: u32x4 = transmute(_mm_cmpneq_ps(a, b));
2464 assert_eq!(r, e);
2465 }
2466
83c7162d 2467 #[simd_test(enable = "sse")]
0531ce1d
XL
2468 unsafe fn test_mm_cmpnlt_ps() {
2469 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2470 let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
2471 let tru = !0u32;
2472 let fls = 0u32;
2473
2474 let e = u32x4::new(fls, tru, tru, tru);
2475 let r: u32x4 = transmute(_mm_cmpnlt_ps(a, b));
2476 assert_eq!(r, e);
2477 }
2478
83c7162d 2479 #[simd_test(enable = "sse")]
0531ce1d
XL
2480 unsafe fn test_mm_cmpnle_ps() {
2481 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2482 let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
2483 let tru = !0u32;
2484 let fls = 0u32;
2485
2486 let e = u32x4::new(fls, tru, fls, tru);
2487 let r: u32x4 = transmute(_mm_cmpnle_ps(a, b));
2488 assert_eq!(r, e);
2489 }
2490
83c7162d 2491 #[simd_test(enable = "sse")]
0531ce1d
XL
2492 unsafe fn test_mm_cmpngt_ps() {
2493 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2494 let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
2495 let tru = !0u32;
2496 let fls = 0u32;
2497
2498 let e = u32x4::new(tru, fls, tru, tru);
2499 let r: u32x4 = transmute(_mm_cmpngt_ps(a, b));
2500 assert_eq!(r, e);
2501 }
2502
83c7162d 2503 #[simd_test(enable = "sse")]
0531ce1d
XL
2504 unsafe fn test_mm_cmpnge_ps() {
2505 let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2506 let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
2507 let tru = !0u32;
2508 let fls = 0u32;
2509
2510 let e = u32x4::new(tru, fls, fls, tru);
2511 let r: u32x4 = transmute(_mm_cmpnge_ps(a, b));
2512 assert_eq!(r, e);
2513 }
2514
83c7162d 2515 #[simd_test(enable = "sse")]
0531ce1d
XL
2516 unsafe fn test_mm_cmpord_ps() {
2517 let a = _mm_setr_ps(10.0, 50.0, NAN, NAN);
2518 let b = _mm_setr_ps(15.0, NAN, 1.0, NAN);
2519 let tru = !0u32;
2520 let fls = 0u32;
2521
2522 let e = u32x4::new(tru, fls, fls, fls);
2523 let r: u32x4 = transmute(_mm_cmpord_ps(a, b));
2524 assert_eq!(r, e);
2525 }
2526
83c7162d 2527 #[simd_test(enable = "sse")]
0531ce1d
XL
2528 unsafe fn test_mm_cmpunord_ps() {
2529 let a = _mm_setr_ps(10.0, 50.0, NAN, NAN);
2530 let b = _mm_setr_ps(15.0, NAN, 1.0, NAN);
2531 let tru = !0u32;
2532 let fls = 0u32;
2533
2534 let e = u32x4::new(fls, tru, tru, tru);
2535 let r: u32x4 = transmute(_mm_cmpunord_ps(a, b));
2536 assert_eq!(r, e);
2537 }
2538
83c7162d 2539 #[simd_test(enable = "sse")]
0531ce1d
XL
2540 unsafe fn test_mm_comieq_ss() {
2541 let aa = &[3.0f32, 12.0, 23.0, NAN];
2542 let bb = &[3.0f32, 47.5, 1.5, NAN];
2543
2544 let ee = &[1i32, 0, 0, 0];
2545
2546 for i in 0..4 {
2547 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2548 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2549
2550 let r = _mm_comieq_ss(a, b);
2551
2552 assert_eq!(
2553 ee[i], r,
2554 "_mm_comieq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2555 a, b, r, ee[i], i
2556 );
2557 }
2558 }
2559
83c7162d 2560 #[simd_test(enable = "sse")]
0531ce1d
XL
2561 unsafe fn test_mm_comilt_ss() {
2562 let aa = &[3.0f32, 12.0, 23.0, NAN];
2563 let bb = &[3.0f32, 47.5, 1.5, NAN];
2564
2565 let ee = &[0i32, 1, 0, 0];
2566
2567 for i in 0..4 {
2568 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2569 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2570
2571 let r = _mm_comilt_ss(a, b);
2572
2573 assert_eq!(
2574 ee[i], r,
2575 "_mm_comilt_ss({:?}, {:?}) = {}, expected: {} (i={})",
2576 a, b, r, ee[i], i
2577 );
2578 }
2579 }
2580
83c7162d 2581 #[simd_test(enable = "sse")]
0531ce1d
XL
2582 unsafe fn test_mm_comile_ss() {
2583 let aa = &[3.0f32, 12.0, 23.0, NAN];
2584 let bb = &[3.0f32, 47.5, 1.5, NAN];
2585
2586 let ee = &[1i32, 1, 0, 0];
2587
2588 for i in 0..4 {
2589 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2590 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2591
2592 let r = _mm_comile_ss(a, b);
2593
2594 assert_eq!(
2595 ee[i], r,
2596 "_mm_comile_ss({:?}, {:?}) = {}, expected: {} (i={})",
2597 a, b, r, ee[i], i
2598 );
2599 }
2600 }
2601
83c7162d 2602 #[simd_test(enable = "sse")]
0531ce1d
XL
2603 unsafe fn test_mm_comigt_ss() {
2604 let aa = &[3.0f32, 12.0, 23.0, NAN];
2605 let bb = &[3.0f32, 47.5, 1.5, NAN];
2606
2607 let ee = &[1i32, 0, 1, 0];
2608
2609 for i in 0..4 {
2610 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2611 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2612
2613 let r = _mm_comige_ss(a, b);
2614
2615 assert_eq!(
2616 ee[i], r,
2617 "_mm_comige_ss({:?}, {:?}) = {}, expected: {} (i={})",
2618 a, b, r, ee[i], i
2619 );
2620 }
2621 }
2622
83c7162d 2623 #[simd_test(enable = "sse")]
0531ce1d
XL
2624 unsafe fn test_mm_comineq_ss() {
2625 let aa = &[3.0f32, 12.0, 23.0, NAN];
2626 let bb = &[3.0f32, 47.5, 1.5, NAN];
2627
2628 let ee = &[0i32, 1, 1, 1];
2629
2630 for i in 0..4 {
2631 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2632 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2633
2634 let r = _mm_comineq_ss(a, b);
2635
2636 assert_eq!(
2637 ee[i], r,
2638 "_mm_comineq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2639 a, b, r, ee[i], i
2640 );
2641 }
2642 }
2643
83c7162d 2644 #[simd_test(enable = "sse")]
0531ce1d
XL
2645 unsafe fn test_mm_ucomieq_ss() {
2646 let aa = &[3.0f32, 12.0, 23.0, NAN];
2647 let bb = &[3.0f32, 47.5, 1.5, NAN];
2648
2649 let ee = &[1i32, 0, 0, 0];
2650
2651 for i in 0..4 {
2652 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2653 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2654
2655 let r = _mm_ucomieq_ss(a, b);
2656
2657 assert_eq!(
2658 ee[i], r,
2659 "_mm_ucomieq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2660 a, b, r, ee[i], i
2661 );
2662 }
2663 }
2664
83c7162d 2665 #[simd_test(enable = "sse")]
0531ce1d
XL
2666 unsafe fn test_mm_ucomilt_ss() {
2667 let aa = &[3.0f32, 12.0, 23.0, NAN];
2668 let bb = &[3.0f32, 47.5, 1.5, NAN];
2669
2670 let ee = &[0i32, 1, 0, 0];
2671
2672 for i in 0..4 {
2673 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2674 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2675
2676 let r = _mm_ucomilt_ss(a, b);
2677
2678 assert_eq!(
2679 ee[i], r,
2680 "_mm_ucomilt_ss({:?}, {:?}) = {}, expected: {} (i={})",
2681 a, b, r, ee[i], i
2682 );
2683 }
2684 }
2685
83c7162d 2686 #[simd_test(enable = "sse")]
0531ce1d
XL
2687 unsafe fn test_mm_ucomile_ss() {
2688 let aa = &[3.0f32, 12.0, 23.0, NAN];
2689 let bb = &[3.0f32, 47.5, 1.5, NAN];
2690
2691 let ee = &[1i32, 1, 0, 0];
2692
2693 for i in 0..4 {
2694 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2695 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2696
2697 let r = _mm_ucomile_ss(a, b);
2698
2699 assert_eq!(
2700 ee[i], r,
2701 "_mm_ucomile_ss({:?}, {:?}) = {}, expected: {} (i={})",
2702 a, b, r, ee[i], i
2703 );
2704 }
2705 }
2706
83c7162d 2707 #[simd_test(enable = "sse")]
0531ce1d
XL
2708 unsafe fn test_mm_ucomigt_ss() {
2709 let aa = &[3.0f32, 12.0, 23.0, NAN];
2710 let bb = &[3.0f32, 47.5, 1.5, NAN];
2711
2712 let ee = &[0i32, 0, 1, 0];
2713
2714 for i in 0..4 {
2715 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2716 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2717
2718 let r = _mm_ucomigt_ss(a, b);
2719
2720 assert_eq!(
2721 ee[i], r,
2722 "_mm_ucomigt_ss({:?}, {:?}) = {}, expected: {} (i={})",
2723 a, b, r, ee[i], i
2724 );
2725 }
2726 }
2727
83c7162d 2728 #[simd_test(enable = "sse")]
0531ce1d
XL
2729 unsafe fn test_mm_ucomige_ss() {
2730 let aa = &[3.0f32, 12.0, 23.0, NAN];
2731 let bb = &[3.0f32, 47.5, 1.5, NAN];
2732
2733 let ee = &[1i32, 0, 1, 0];
2734
2735 for i in 0..4 {
2736 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2737 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2738
2739 let r = _mm_ucomige_ss(a, b);
2740
2741 assert_eq!(
2742 ee[i], r,
2743 "_mm_ucomige_ss({:?}, {:?}) = {}, expected: {} (i={})",
2744 a, b, r, ee[i], i
2745 );
2746 }
2747 }
2748
83c7162d 2749 #[simd_test(enable = "sse")]
0531ce1d
XL
2750 unsafe fn test_mm_ucomineq_ss() {
2751 let aa = &[3.0f32, 12.0, 23.0, NAN];
2752 let bb = &[3.0f32, 47.5, 1.5, NAN];
2753
2754 let ee = &[0i32, 1, 1, 1];
2755
2756 for i in 0..4 {
2757 let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2758 let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2759
2760 let r = _mm_ucomineq_ss(a, b);
2761
2762 assert_eq!(
2763 ee[i], r,
2764 "_mm_ucomineq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2765 a, b, r, ee[i], i
2766 );
2767 }
2768 }
2769
#[simd_test(enable = "sse")]
unsafe fn test_mm_comieq_ss_vs_ucomieq_ss() {
    // If one of the arguments is a quiet NaN `comieq_ss` should signal an
    // Invalid Operation Exception while `ucomieq_ss` should not.
    let aa = &[3.0f32, NAN, 23.0, NAN];
    let bb = &[3.0f32, 47.5, NAN, NAN];

    let ee = &[1i32, 0, 0, 0];
    let exc = &[0u32, 1, 1, 1]; // Should comieq_ss signal an exception?

    for i in 0..4 {
        let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
        let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);

        // Clear MXCSR's exception flags before each call so that `s1`/`s2`
        // reflect only the flags raised by that one comparison.
        _MM_SET_EXCEPTION_STATE(0);
        // `*black_box(&a)` stops the compiler from const-folding the
        // comparison away, which would suppress the flag side effect.
        let r1 = _mm_comieq_ss(*black_box(&a), b);
        let s1 = _MM_GET_EXCEPTION_STATE();

        _MM_SET_EXCEPTION_STATE(0);
        let r2 = _mm_ucomieq_ss(*black_box(&a), b);
        let s2 = _MM_GET_EXCEPTION_STATE();

        assert_eq!(
            ee[i], r1,
            "_mm_comeq_ss({:?}, {:?}) = {}, expected: {} (i={})",
            a, b, r1, ee[i], i
        );
        assert_eq!(
            ee[i], r2,
            "_mm_ucomeq_ss({:?}, {:?}) = {}, expected: {} (i={})",
            a, b, r2, ee[i], i
        );
        // `exc[i]` is 0 or 1, so this multiplication selects either no flag
        // or exactly the INVALID flag as the expected exception state.
        assert_eq!(
            s1,
            exc[i] * _MM_EXCEPT_INVALID,
            "_mm_comieq_ss() set exception flags: {} (i={})",
            s1,
            i
        );
        assert_eq!(
            s2,
            0, // ucomieq_ss should not signal an exception
            "_mm_ucomieq_ss() set exception flags: {} (i={})",
            s2,
            i
        );
    }
}
2818
83c7162d 2819 #[simd_test(enable = "sse")]
0531ce1d 2820 unsafe fn test_mm_cvtss_si32() {
8faf50e0 2821 let inputs = &[42.0f32, -3.1, 4.0e10, 4.0e-20, NAN, 2147483500.1];
ba9703b0 2822 let result = &[42i32, -3, i32::MIN, 0, i32::MIN, 2147483520];
0531ce1d
XL
2823 for i in 0..inputs.len() {
2824 let x = _mm_setr_ps(inputs[i], 1.0, 3.0, 4.0);
2825 let e = result[i];
2826 let r = _mm_cvtss_si32(x);
2827 assert_eq!(
2828 e, r,
2829 "TestCase #{} _mm_cvtss_si32({:?}) = {}, expected: {}",
2830 i, x, r, e
2831 );
2832 }
2833 }
2834
83c7162d 2835 #[simd_test(enable = "sse")]
0531ce1d
XL
2836 unsafe fn test_mm_cvttss_si32() {
2837 let inputs = &[
2838 (42.0f32, 42i32),
2839 (-31.4, -31),
2840 (-33.5, -33),
2841 (-34.5, -34),
2842 (10.999, 10),
2843 (-5.99, -5),
ba9703b0 2844 (4.0e10, i32::MIN),
0531ce1d 2845 (4.0e-10, 0),
ba9703b0 2846 (NAN, i32::MIN),
0531ce1d
XL
2847 (2147483500.1, 2147483520),
2848 ];
2849 for i in 0..inputs.len() {
2850 let (xi, e) = inputs[i];
2851 let x = _mm_setr_ps(xi, 1.0, 3.0, 4.0);
2852 let r = _mm_cvttss_si32(x);
2853 assert_eq!(
2854 e, r,
2855 "TestCase #{} _mm_cvttss_si32({:?}) = {}, expected: {}",
2856 i, x, r, e
2857 );
2858 }
2859 }
2860
83c7162d 2861 #[simd_test(enable = "sse")]
e1599b0c 2862 unsafe fn test_mm_cvtsi32_ss() {
0531ce1d
XL
2863 let inputs = &[
2864 (4555i32, 4555.0f32),
2865 (322223333, 322223330.0),
2866 (-432, -432.0),
2867 (-322223333, -322223330.0),
2868 ];
2869
2870 for i in 0..inputs.len() {
2871 let (x, f) = inputs[i];
2872 let a = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
2873 let r = _mm_cvtsi32_ss(a, x);
2874 let e = _mm_setr_ps(f, 6.0, 7.0, 8.0);
2875 assert_eq_m128(e, r);
2876 }
2877 }
2878
83c7162d 2879 #[simd_test(enable = "sse")]
e1599b0c 2880 unsafe fn test_mm_cvtss_f32() {
0531ce1d
XL
2881 let a = _mm_setr_ps(312.0134, 5.0, 6.0, 7.0);
2882 assert_eq!(_mm_cvtss_f32(a), 312.0134);
2883 }
2884
83c7162d 2885 #[simd_test(enable = "sse")]
0531ce1d
XL
2886 unsafe fn test_mm_set_ss() {
2887 let r = _mm_set_ss(black_box(4.25));
2888 assert_eq_m128(r, _mm_setr_ps(4.25, 0.0, 0.0, 0.0));
2889 }
2890
83c7162d 2891 #[simd_test(enable = "sse")]
0531ce1d
XL
2892 unsafe fn test_mm_set1_ps() {
2893 let r1 = _mm_set1_ps(black_box(4.25));
2894 let r2 = _mm_set_ps1(black_box(4.25));
2895 assert_eq!(get_m128(r1, 0), 4.25);
2896 assert_eq!(get_m128(r1, 1), 4.25);
2897 assert_eq!(get_m128(r1, 2), 4.25);
2898 assert_eq!(get_m128(r1, 3), 4.25);
2899 assert_eq!(get_m128(r2, 0), 4.25);
2900 assert_eq!(get_m128(r2, 1), 4.25);
2901 assert_eq!(get_m128(r2, 2), 4.25);
2902 assert_eq!(get_m128(r2, 3), 4.25);
2903 }
2904
83c7162d 2905 #[simd_test(enable = "sse")]
0531ce1d
XL
2906 unsafe fn test_mm_set_ps() {
2907 let r = _mm_set_ps(
2908 black_box(1.0),
2909 black_box(2.0),
2910 black_box(3.0),
2911 black_box(4.0),
2912 );
2913 assert_eq!(get_m128(r, 0), 4.0);
2914 assert_eq!(get_m128(r, 1), 3.0);
2915 assert_eq!(get_m128(r, 2), 2.0);
2916 assert_eq!(get_m128(r, 3), 1.0);
2917 }
2918
83c7162d 2919 #[simd_test(enable = "sse")]
0531ce1d
XL
2920 unsafe fn test_mm_setr_ps() {
2921 let r = _mm_setr_ps(
2922 black_box(1.0),
2923 black_box(2.0),
2924 black_box(3.0),
2925 black_box(4.0),
2926 );
2927 assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 3.0, 4.0));
2928 }
2929
83c7162d 2930 #[simd_test(enable = "sse")]
0531ce1d
XL
2931 unsafe fn test_mm_setzero_ps() {
2932 let r = *black_box(&_mm_setzero_ps());
2933 assert_eq_m128(r, _mm_set1_ps(0.0));
2934 }
2935
8faf50e0
XL
2936 #[simd_test(enable = "sse")]
2937 unsafe fn test_mm_shuffle() {
2938 assert_eq!(_MM_SHUFFLE(0, 1, 1, 3), 0b00_01_01_11);
2939 assert_eq!(_MM_SHUFFLE(3, 1, 1, 0), 0b11_01_01_00);
2940 assert_eq!(_MM_SHUFFLE(1, 2, 2, 1), 0b01_10_10_01);
2941 }
2942
83c7162d 2943 #[simd_test(enable = "sse")]
0531ce1d
XL
2944 unsafe fn test_mm_shuffle_ps() {
2945 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2946 let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
17df50a5 2947 let r = _mm_shuffle_ps::<0b00_01_01_11>(a, b);
0531ce1d
XL
2948 assert_eq_m128(r, _mm_setr_ps(4.0, 2.0, 6.0, 5.0));
2949 }
2950
83c7162d 2951 #[simd_test(enable = "sse")]
0531ce1d
XL
2952 unsafe fn test_mm_unpackhi_ps() {
2953 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2954 let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
2955 let r = _mm_unpackhi_ps(a, b);
2956 assert_eq_m128(r, _mm_setr_ps(3.0, 7.0, 4.0, 8.0));
2957 }
2958
83c7162d 2959 #[simd_test(enable = "sse")]
0531ce1d
XL
2960 unsafe fn test_mm_unpacklo_ps() {
2961 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2962 let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
2963 let r = _mm_unpacklo_ps(a, b);
2964 assert_eq_m128(r, _mm_setr_ps(1.0, 5.0, 2.0, 6.0));
2965 }
2966
83c7162d 2967 #[simd_test(enable = "sse")]
0531ce1d
XL
2968 unsafe fn test_mm_movehl_ps() {
2969 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2970 let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
2971 let r = _mm_movehl_ps(a, b);
2972 assert_eq_m128(r, _mm_setr_ps(7.0, 8.0, 3.0, 4.0));
2973 }
2974
83c7162d 2975 #[simd_test(enable = "sse")]
0531ce1d
XL
2976 unsafe fn test_mm_movelh_ps() {
2977 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2978 let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
2979 let r = _mm_movelh_ps(a, b);
2980 assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 5.0, 6.0));
2981 }
2982
83c7162d 2983 #[simd_test(enable = "sse")]
0531ce1d
XL
2984 unsafe fn test_mm_load_ss() {
2985 let a = 42.0f32;
2986 let r = _mm_load_ss(&a as *const f32);
2987 assert_eq_m128(r, _mm_setr_ps(42.0, 0.0, 0.0, 0.0));
2988 }
2989
83c7162d 2990 #[simd_test(enable = "sse")]
0531ce1d
XL
2991 unsafe fn test_mm_load1_ps() {
2992 let a = 42.0f32;
2993 let r = _mm_load1_ps(&a as *const f32);
2994 assert_eq_m128(r, _mm_setr_ps(42.0, 42.0, 42.0, 42.0));
2995 }
2996
#[simd_test(enable = "sse")]
unsafe fn test_mm_load_ps() {
    // `_mm_load_ps` requires a 16-byte-aligned pointer. The array holds 8
    // values so that after sliding forward to an aligned element there are
    // still 4 values left to load.
    let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];

    let mut p = vals.as_ptr();
    let mut fixup = 0.0f32;

    // Make sure p is aligned, otherwise we might get a
    // (signal: 11, SIGSEGV: invalid memory reference)

    let unalignment = (p as usize) & 0xf;
    if unalignment != 0 {
        // Advance whole f32 elements (4 bytes each) to the next 16-byte
        // boundary; `fixup` shifts the expected values by the same amount.
        let delta = ((16 - unalignment) >> 2) as isize;
        fixup = delta as f32;
        p = p.offset(delta);
    }

    let r = _mm_load_ps(p);
    let e = _mm_add_ps(_mm_setr_ps(1.0, 2.0, 3.0, 4.0), _mm_set1_ps(fixup));
    assert_eq_m128(r, e);
}
3018
83c7162d 3019 #[simd_test(enable = "sse")]
0531ce1d
XL
3020 unsafe fn test_mm_loadu_ps() {
3021 let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
3022 let p = vals.as_ptr().offset(3);
3023 let r = _mm_loadu_ps(black_box(p));
3024 assert_eq_m128(r, _mm_setr_ps(4.0, 5.0, 6.0, 7.0));
3025 }
3026
#[simd_test(enable = "sse")]
unsafe fn test_mm_loadr_ps() {
    // `_mm_loadr_ps` is an aligned load that reverses the four elements.
    // The array holds 8 values so that after sliding forward to an aligned
    // element there are still 4 values left to load.
    let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];

    let mut p = vals.as_ptr();
    let mut fixup = 0.0f32;

    // Make sure p is aligned, otherwise we might get a
    // (signal: 11, SIGSEGV: invalid memory reference)

    let unalignment = (p as usize) & 0xf;
    if unalignment != 0 {
        // Advance whole f32 elements (4 bytes each) to the next 16-byte
        // boundary; `fixup` shifts the expected values by the same amount.
        let delta = ((16 - unalignment) >> 2) as isize;
        fixup = delta as f32;
        p = p.offset(delta);
    }

    let r = _mm_loadr_ps(p);
    let e = _mm_add_ps(_mm_setr_ps(4.0, 3.0, 2.0, 1.0), _mm_set1_ps(fixup));
    assert_eq_m128(r, e);
}
3048
3dfed10e
XL
3049 #[simd_test(enable = "sse2")]
3050 unsafe fn test_mm_loadu_si64() {
3051 let a = _mm_setr_epi64x(5, 6);
3052 let r = _mm_loadu_si64(&a as *const _ as *const _);
cdc7bbd5 3053 assert_eq_m128i(r, _mm_setr_epi64x(5, 0));
3dfed10e
XL
3054 }
3055
83c7162d 3056 #[simd_test(enable = "sse")]
0531ce1d
XL
3057 unsafe fn test_mm_store_ss() {
3058 let mut vals = [0.0f32; 8];
3059 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3060 _mm_store_ss(vals.as_mut_ptr().offset(1), a);
3061
3062 assert_eq!(vals[0], 0.0);
3063 assert_eq!(vals[1], 1.0);
3064 assert_eq!(vals[2], 0.0);
3065 }
3066
#[simd_test(enable = "sse")]
unsafe fn test_mm_store1_ps() {
    // `_mm_store1_ps` splats lane 0 into four consecutive aligned floats.
    let mut vals = [0.0f32; 8];
    let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);

    let mut ofs = 0;
    let mut p = vals.as_mut_ptr();

    // Align p to a 16-byte boundary; `ofs` counts the f32 slots skipped so
    // the asserts below can index relative to the actual store location.
    if (p as usize) & 0xf != 0 {
        ofs = ((16 - (p as usize)) & 0xf) >> 2;
        p = p.add(ofs);
    }

    _mm_store1_ps(p, *black_box(&a));

    // The slot just before the destination must be untouched.
    if ofs > 0 {
        assert_eq!(vals[ofs - 1], 0.0);
    }
    assert_eq!(vals[ofs + 0], 1.0);
    assert_eq!(vals[ofs + 1], 1.0);
    assert_eq!(vals[ofs + 2], 1.0);
    assert_eq!(vals[ofs + 3], 1.0);
    // ... and so must the slot just after the 16 written bytes.
    assert_eq!(vals[ofs + 4], 0.0);
}
3091
#[simd_test(enable = "sse")]
unsafe fn test_mm_store_ps() {
    // Aligned store of all four lanes in memory order.
    let mut vals = [0.0f32; 8];
    let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);

    let mut ofs = 0;
    let mut p = vals.as_mut_ptr();

    // Align p to 16-byte boundary
    if (p as usize) & 0xf != 0 {
        // `ofs` counts skipped f32 slots so the asserts can index relative
        // to the actual store location.
        ofs = ((16 - (p as usize)) & 0xf) >> 2;
        p = p.add(ofs);
    }

    _mm_store_ps(p, *black_box(&a));

    // The slot just before the destination must be untouched.
    if ofs > 0 {
        assert_eq!(vals[ofs - 1], 0.0);
    }
    assert_eq!(vals[ofs + 0], 1.0);
    assert_eq!(vals[ofs + 1], 2.0);
    assert_eq!(vals[ofs + 2], 3.0);
    assert_eq!(vals[ofs + 3], 4.0);
    // ... and so must the slot just after the 16 written bytes.
    assert_eq!(vals[ofs + 4], 0.0);
}
3117
#[simd_test(enable = "sse")]
unsafe fn test_mm_storer_ps() {
    // Aligned store with the four lanes written in reversed order.
    let mut vals = [0.0f32; 8];
    let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);

    let mut ofs = 0;
    let mut p = vals.as_mut_ptr();

    // Align p to 16-byte boundary
    if (p as usize) & 0xf != 0 {
        // `ofs` counts skipped f32 slots so the asserts can index relative
        // to the actual store location.
        ofs = ((16 - (p as usize)) & 0xf) >> 2;
        p = p.add(ofs);
    }

    _mm_storer_ps(p, *black_box(&a));

    // The slot just before the destination must be untouched.
    if ofs > 0 {
        assert_eq!(vals[ofs - 1], 0.0);
    }
    assert_eq!(vals[ofs + 0], 4.0);
    assert_eq!(vals[ofs + 1], 3.0);
    assert_eq!(vals[ofs + 2], 2.0);
    assert_eq!(vals[ofs + 3], 1.0);
    // ... and so must the slot just after the 16 written bytes.
    assert_eq!(vals[ofs + 4], 0.0);
}
3143
#[simd_test(enable = "sse")]
unsafe fn test_mm_storeu_ps() {
    // `_mm_storeu_ps` has no alignment requirement, so deliberately force
    // a misaligned destination to prove unaligned stores work.
    let mut vals = [0.0f32; 8];
    let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);

    let mut ofs = 0;
    let mut p = vals.as_mut_ptr();

    // Make sure p is **not** aligned to 16-byte boundary
    if (p as usize) & 0xf == 0 {
        ofs = 1;
        p = p.offset(1);
    }

    _mm_storeu_ps(p, *black_box(&a));

    // The slot just before the destination must be untouched.
    if ofs > 0 {
        assert_eq!(vals[ofs - 1], 0.0);
    }
    assert_eq!(vals[ofs + 0], 1.0);
    assert_eq!(vals[ofs + 1], 2.0);
    assert_eq!(vals[ofs + 2], 3.0);
    assert_eq!(vals[ofs + 3], 4.0);
    // ... and so must the slot just after the 16 written bytes.
    assert_eq!(vals[ofs + 4], 0.0);
}
3169
83c7162d 3170 #[simd_test(enable = "sse")]
0531ce1d
XL
3171 unsafe fn test_mm_move_ss() {
3172 let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3173 let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3174
3175 let r = _mm_move_ss(a, b);
3176 let e = _mm_setr_ps(5.0, 2.0, 3.0, 4.0);
3177 assert_eq_m128(e, r);
3178 }
3179
83c7162d 3180 #[simd_test(enable = "sse")]
0531ce1d
XL
3181 unsafe fn test_mm_movemask_ps() {
3182 let r = _mm_movemask_ps(_mm_setr_ps(-1.0, 5.0, -5.0, 0.0));
3183 assert_eq!(r, 0b0101);
3184
3185 let r = _mm_movemask_ps(_mm_setr_ps(-1.0, -5.0, -5.0, 0.0));
3186 assert_eq!(r, 0b0111);
3187 }
3188
83c7162d 3189 #[simd_test(enable = "sse")]
0531ce1d
XL
3190 unsafe fn test_mm_sfence() {
3191 _mm_sfence();
3192 }
3193
83c7162d 3194 #[simd_test(enable = "sse")]
0531ce1d
XL
3195 unsafe fn test_mm_getcsr_setcsr_1() {
3196 let saved_csr = _mm_getcsr();
3197
3198 let a = _mm_setr_ps(1.1e-36, 0.0, 0.0, 1.0);
3199 let b = _mm_setr_ps(0.001, 0.0, 0.0, 1.0);
3200
3201 _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
3202 let r = _mm_mul_ps(*black_box(&a), *black_box(&b));
3203
3204 _mm_setcsr(saved_csr);
3205
3206 let exp = _mm_setr_ps(0.0, 0.0, 0.0, 1.0);
3207 assert_eq_m128(r, exp); // first component is a denormalized f32
3208 }
3209
83c7162d 3210 #[simd_test(enable = "sse")]
0531ce1d
XL
3211 unsafe fn test_mm_getcsr_setcsr_2() {
3212 // Same as _mm_setcsr_1 test, but with opposite flag value.
3213
3214 let saved_csr = _mm_getcsr();
3215
3216 let a = _mm_setr_ps(1.1e-36, 0.0, 0.0, 1.0);
3217 let b = _mm_setr_ps(0.001, 0.0, 0.0, 1.0);
3218
3219 _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF);
3220 let r = _mm_mul_ps(*black_box(&a), *black_box(&b));
3221
3222 _mm_setcsr(saved_csr);
3223
3224 let exp = _mm_setr_ps(1.1e-39, 0.0, 0.0, 1.0);
3225 assert_eq_m128(r, exp); // first component is a denormalized f32
3226 }
3227
83c7162d 3228 #[simd_test(enable = "sse")]
0531ce1d
XL
3229 unsafe fn test_mm_getcsr_setcsr_underflow() {
3230 _MM_SET_EXCEPTION_STATE(0);
3231
3232 let a = _mm_setr_ps(1.1e-36, 0.0, 0.0, 1.0);
3233 let b = _mm_setr_ps(1e-5, 0.0, 0.0, 1.0);
3234
3235 assert_eq!(_MM_GET_EXCEPTION_STATE(), 0); // just to be sure
3236
3237 let r = _mm_mul_ps(*black_box(&a), *black_box(&b));
3238
3239 let exp = _mm_setr_ps(1.1e-41, 0.0, 0.0, 1.0);
3240 assert_eq_m128(r, exp);
3241
3242 let underflow = _MM_GET_EXCEPTION_STATE() & _MM_EXCEPT_UNDERFLOW != 0;
3243 assert_eq!(underflow, true);
3244 }
3245
83c7162d 3246 #[simd_test(enable = "sse")]
0531ce1d
XL
3247 unsafe fn test_MM_TRANSPOSE4_PS() {
3248 let mut a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3249 let mut b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3250 let mut c = _mm_setr_ps(9.0, 10.0, 11.0, 12.0);
3251 let mut d = _mm_setr_ps(13.0, 14.0, 15.0, 16.0);
3252
3253 _MM_TRANSPOSE4_PS(&mut a, &mut b, &mut c, &mut d);
3254
3255 assert_eq_m128(a, _mm_setr_ps(1.0, 5.0, 9.0, 13.0));
3256 assert_eq_m128(b, _mm_setr_ps(2.0, 6.0, 10.0, 14.0));
3257 assert_eq_m128(c, _mm_setr_ps(3.0, 7.0, 11.0, 15.0));
3258 assert_eq_m128(d, _mm_setr_ps(4.0, 8.0, 12.0, 16.0));
3259 }
3260
3261 #[repr(align(16))]
3262 struct Memory {
3263 pub data: [f32; 4],
3264 }
3265
83c7162d 3266 #[simd_test(enable = "sse")]
0531ce1d
XL
3267 unsafe fn test_mm_stream_ps() {
3268 let a = _mm_set1_ps(7.0);
8faf50e0 3269 let mut mem = Memory { data: [-1.0; 4] };
0531ce1d
XL
3270
3271 _mm_stream_ps(&mut mem.data[0] as *mut f32, a);
3272 for i in 0..4 {
3273 assert_eq!(mem.data[i], get_m128(a, i));
3274 }
3275 }
0531ce1d 3276}