]>
Commit | Line | Data |
---|---|---|
0531ce1d XL |
1 | //! Streaming SIMD Extensions (SSE) |
2 | ||
532ac7d7 XL |
3 | use crate::{ |
4 | core_arch::{simd::*, simd_llvm::*, x86::*}, | |
5 | intrinsics, mem, ptr, | |
6 | }; | |
0531ce1d XL |
7 | |
8 | #[cfg(test)] | |
416331ca | 9 | use stdarch_test::assert_instr; |
0531ce1d XL |
10 | |
11 | /// Adds the first component of `a` and `b`, the other components are copied | |
12 | /// from `a`. | |
83c7162d XL |
13 | /// |
14 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_ss) | |
0531ce1d XL |
15 | #[inline] |
16 | #[target_feature(enable = "sse")] | |
17 | #[cfg_attr(test, assert_instr(addss))] | |
83c7162d | 18 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
19 | pub unsafe fn _mm_add_ss(a: __m128, b: __m128) -> __m128 { |
20 | addss(a, b) | |
21 | } | |
22 | ||
23 | /// Adds __m128 vectors. | |
83c7162d XL |
24 | /// |
25 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_ps) | |
0531ce1d XL |
26 | #[inline] |
27 | #[target_feature(enable = "sse")] | |
28 | #[cfg_attr(test, assert_instr(addps))] | |
83c7162d | 29 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
30 | pub unsafe fn _mm_add_ps(a: __m128, b: __m128) -> __m128 { |
31 | simd_add(a, b) | |
32 | } | |
33 | ||
34 | /// Subtracts the first component of `b` from `a`, the other components are | |
35 | /// copied from `a`. | |
83c7162d XL |
36 | /// |
37 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss) | |
0531ce1d XL |
38 | #[inline] |
39 | #[target_feature(enable = "sse")] | |
40 | #[cfg_attr(test, assert_instr(subss))] | |
83c7162d | 41 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
42 | pub unsafe fn _mm_sub_ss(a: __m128, b: __m128) -> __m128 { |
43 | subss(a, b) | |
44 | } | |
45 | ||
46 | /// Subtracts __m128 vectors. | |
83c7162d XL |
47 | /// |
48 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ps) | |
0531ce1d XL |
49 | #[inline] |
50 | #[target_feature(enable = "sse")] | |
51 | #[cfg_attr(test, assert_instr(subps))] | |
83c7162d | 52 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
53 | pub unsafe fn _mm_sub_ps(a: __m128, b: __m128) -> __m128 { |
54 | simd_sub(a, b) | |
55 | } | |
56 | ||
57 | /// Multiplies the first component of `a` and `b`, the other components are | |
58 | /// copied from `a`. | |
83c7162d XL |
59 | /// |
60 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss) | |
0531ce1d XL |
61 | #[inline] |
62 | #[target_feature(enable = "sse")] | |
63 | #[cfg_attr(test, assert_instr(mulss))] | |
83c7162d | 64 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
65 | pub unsafe fn _mm_mul_ss(a: __m128, b: __m128) -> __m128 { |
66 | mulss(a, b) | |
67 | } | |
68 | ||
69 | /// Multiplies __m128 vectors. | |
83c7162d XL |
70 | /// |
71 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ps) | |
0531ce1d XL |
72 | #[inline] |
73 | #[target_feature(enable = "sse")] | |
74 | #[cfg_attr(test, assert_instr(mulps))] | |
83c7162d | 75 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
76 | pub unsafe fn _mm_mul_ps(a: __m128, b: __m128) -> __m128 { |
77 | simd_mul(a, b) | |
78 | } | |
79 | ||
80 | /// Divides the first component of `b` by `a`, the other components are | |
81 | /// copied from `a`. | |
83c7162d XL |
82 | /// |
83 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_ss) | |
0531ce1d XL |
84 | #[inline] |
85 | #[target_feature(enable = "sse")] | |
86 | #[cfg_attr(test, assert_instr(divss))] | |
83c7162d | 87 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
88 | pub unsafe fn _mm_div_ss(a: __m128, b: __m128) -> __m128 { |
89 | divss(a, b) | |
90 | } | |
91 | ||
92 | /// Divides __m128 vectors. | |
83c7162d XL |
93 | /// |
94 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_ps) | |
0531ce1d XL |
95 | #[inline] |
96 | #[target_feature(enable = "sse")] | |
97 | #[cfg_attr(test, assert_instr(divps))] | |
83c7162d | 98 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
99 | pub unsafe fn _mm_div_ps(a: __m128, b: __m128) -> __m128 { |
100 | simd_div(a, b) | |
101 | } | |
102 | ||
532ac7d7 | 103 | /// Returns the square root of the first single-precision (32-bit) |
0531ce1d | 104 | /// floating-point element in `a`, the other elements are unchanged. |
83c7162d XL |
105 | /// |
106 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_ss) | |
0531ce1d XL |
107 | #[inline] |
108 | #[target_feature(enable = "sse")] | |
109 | #[cfg_attr(test, assert_instr(sqrtss))] | |
83c7162d | 110 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
111 | pub unsafe fn _mm_sqrt_ss(a: __m128) -> __m128 { |
112 | sqrtss(a) | |
113 | } | |
114 | ||
532ac7d7 | 115 | /// Returns the square root of packed single-precision (32-bit) floating-point |
0531ce1d | 116 | /// elements in `a`. |
83c7162d XL |
117 | /// |
118 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_ps) | |
0531ce1d XL |
119 | #[inline] |
120 | #[target_feature(enable = "sse")] | |
121 | #[cfg_attr(test, assert_instr(sqrtps))] | |
83c7162d | 122 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
123 | pub unsafe fn _mm_sqrt_ps(a: __m128) -> __m128 { |
124 | sqrtps(a) | |
125 | } | |
126 | ||
532ac7d7 | 127 | /// Returns the approximate reciprocal of the first single-precision |
0531ce1d | 128 | /// (32-bit) floating-point element in `a`, the other elements are unchanged. |
83c7162d XL |
129 | /// |
130 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss) | |
0531ce1d XL |
131 | #[inline] |
132 | #[target_feature(enable = "sse")] | |
133 | #[cfg_attr(test, assert_instr(rcpss))] | |
83c7162d | 134 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
135 | pub unsafe fn _mm_rcp_ss(a: __m128) -> __m128 { |
136 | rcpss(a) | |
137 | } | |
138 | ||
532ac7d7 | 139 | /// Returns the approximate reciprocal of packed single-precision (32-bit) |
0531ce1d | 140 | /// floating-point elements in `a`. |
83c7162d XL |
141 | /// |
142 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ps) | |
0531ce1d XL |
143 | #[inline] |
144 | #[target_feature(enable = "sse")] | |
145 | #[cfg_attr(test, assert_instr(rcpps))] | |
83c7162d | 146 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
147 | pub unsafe fn _mm_rcp_ps(a: __m128) -> __m128 { |
148 | rcpps(a) | |
149 | } | |
150 | ||
17df50a5 XL |
151 | /// Returns the approximate reciprocal square root of the first single-precision |
152 | /// (32-bit) floating-point element in `a`, the other elements are unchanged. | |
83c7162d XL |
153 | /// |
154 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss) | |
0531ce1d XL |
155 | #[inline] |
156 | #[target_feature(enable = "sse")] | |
157 | #[cfg_attr(test, assert_instr(rsqrtss))] | |
83c7162d | 158 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
159 | pub unsafe fn _mm_rsqrt_ss(a: __m128) -> __m128 { |
160 | rsqrtss(a) | |
161 | } | |
162 | ||
532ac7d7 | 163 | /// Returns the approximate reciprocal square root of packed single-precision |
0531ce1d | 164 | /// (32-bit) floating-point elements in `a`. |
83c7162d XL |
165 | /// |
166 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ps) | |
0531ce1d XL |
167 | #[inline] |
168 | #[target_feature(enable = "sse")] | |
169 | #[cfg_attr(test, assert_instr(rsqrtps))] | |
83c7162d | 170 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
171 | pub unsafe fn _mm_rsqrt_ps(a: __m128) -> __m128 { |
172 | rsqrtps(a) | |
173 | } | |
174 | ||
532ac7d7 | 175 | /// Compares the first single-precision (32-bit) floating-point element of `a` |
0531ce1d XL |
176 | /// and `b`, and return the minimum value in the first element of the return |
177 | /// value, the other elements are copied from `a`. | |
83c7162d XL |
178 | /// |
179 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_ss) | |
0531ce1d XL |
180 | #[inline] |
181 | #[target_feature(enable = "sse")] | |
182 | #[cfg_attr(test, assert_instr(minss))] | |
83c7162d | 183 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
184 | pub unsafe fn _mm_min_ss(a: __m128, b: __m128) -> __m128 { |
185 | minss(a, b) | |
186 | } | |
187 | ||
532ac7d7 | 188 | /// Compares packed single-precision (32-bit) floating-point elements in `a` and |
0531ce1d | 189 | /// `b`, and return the corresponding minimum values. |
83c7162d XL |
190 | /// |
191 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_ps) | |
0531ce1d XL |
192 | #[inline] |
193 | #[target_feature(enable = "sse")] | |
194 | #[cfg_attr(test, assert_instr(minps))] | |
83c7162d | 195 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 196 | pub unsafe fn _mm_min_ps(a: __m128, b: __m128) -> __m128 { |
74b04a01 | 197 | // See the `test_mm_min_ps` test why this can't be implemented using `simd_fmin`. |
0531ce1d XL |
198 | minps(a, b) |
199 | } | |
200 | ||
532ac7d7 | 201 | /// Compares the first single-precision (32-bit) floating-point element of `a` |
0531ce1d XL |
202 | /// and `b`, and return the maximum value in the first element of the return |
203 | /// value, the other elements are copied from `a`. | |
83c7162d XL |
204 | /// |
205 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_ss) | |
0531ce1d XL |
206 | #[inline] |
207 | #[target_feature(enable = "sse")] | |
208 | #[cfg_attr(test, assert_instr(maxss))] | |
83c7162d | 209 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
210 | pub unsafe fn _mm_max_ss(a: __m128, b: __m128) -> __m128 { |
211 | maxss(a, b) | |
212 | } | |
213 | ||
532ac7d7 | 214 | /// Compares packed single-precision (32-bit) floating-point elements in `a` and |
0531ce1d | 215 | /// `b`, and return the corresponding maximum values. |
83c7162d XL |
216 | /// |
217 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_ps) | |
0531ce1d XL |
218 | #[inline] |
219 | #[target_feature(enable = "sse")] | |
220 | #[cfg_attr(test, assert_instr(maxps))] | |
83c7162d | 221 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 222 | pub unsafe fn _mm_max_ps(a: __m128, b: __m128) -> __m128 { |
74b04a01 | 223 | // See the `test_mm_min_ps` test why this can't be implemented using `simd_fmax`. |
0531ce1d XL |
224 | maxps(a, b) |
225 | } | |
226 | ||
227 | /// Bitwise AND of packed single-precision (32-bit) floating-point elements. | |
83c7162d XL |
228 | /// |
229 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_ps) | |
0531ce1d XL |
230 | #[inline] |
231 | #[target_feature(enable = "sse")] | |
232 | // i586 only seems to generate plain `and` instructions, so ignore it. | |
8faf50e0 XL |
233 | #[cfg_attr( |
234 | all(test, any(target_arch = "x86_64", target_feature = "sse2")), | |
235 | assert_instr(andps) | |
236 | )] | |
83c7162d | 237 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
238 | pub unsafe fn _mm_and_ps(a: __m128, b: __m128) -> __m128 { |
239 | let a: __m128i = mem::transmute(a); | |
240 | let b: __m128i = mem::transmute(b); | |
241 | mem::transmute(simd_and(a, b)) | |
242 | } | |
243 | ||
244 | /// Bitwise AND-NOT of packed single-precision (32-bit) floating-point | |
245 | /// elements. | |
246 | /// | |
247 | /// Computes `!a & b` for each bit in `a` and `b`. | |
83c7162d XL |
248 | /// |
249 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_ps) | |
0531ce1d XL |
250 | #[inline] |
251 | #[target_feature(enable = "sse")] | |
252 | // i586 only seems to generate plain `not` and `and` instructions, so ignore | |
253 | // it. | |
8faf50e0 XL |
254 | #[cfg_attr( |
255 | all(test, any(target_arch = "x86_64", target_feature = "sse2")), | |
256 | assert_instr(andnps) | |
257 | )] | |
83c7162d | 258 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
259 | pub unsafe fn _mm_andnot_ps(a: __m128, b: __m128) -> __m128 { |
260 | let a: __m128i = mem::transmute(a); | |
261 | let b: __m128i = mem::transmute(b); | |
262 | let mask: __m128i = mem::transmute(i32x4::splat(-1)); | |
263 | mem::transmute(simd_and(simd_xor(mask, a), b)) | |
264 | } | |
265 | ||
266 | /// Bitwise OR of packed single-precision (32-bit) floating-point elements. | |
83c7162d XL |
267 | /// |
268 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_or_ps) | |
0531ce1d XL |
269 | #[inline] |
270 | #[target_feature(enable = "sse")] | |
271 | // i586 only seems to generate plain `or` instructions, so we ignore it. | |
8faf50e0 XL |
272 | #[cfg_attr( |
273 | all(test, any(target_arch = "x86_64", target_feature = "sse2")), | |
274 | assert_instr(orps) | |
275 | )] | |
83c7162d | 276 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
277 | pub unsafe fn _mm_or_ps(a: __m128, b: __m128) -> __m128 { |
278 | let a: __m128i = mem::transmute(a); | |
279 | let b: __m128i = mem::transmute(b); | |
280 | mem::transmute(simd_or(a, b)) | |
281 | } | |
282 | ||
283 | /// Bitwise exclusive OR of packed single-precision (32-bit) floating-point | |
284 | /// elements. | |
83c7162d XL |
285 | /// |
286 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_ps) | |
0531ce1d XL |
287 | #[inline] |
288 | #[target_feature(enable = "sse")] | |
289 | // i586 only seems to generate plain `xor` instructions, so we ignore it. | |
8faf50e0 XL |
290 | #[cfg_attr( |
291 | all(test, any(target_arch = "x86_64", target_feature = "sse2")), | |
292 | assert_instr(xorps) | |
293 | )] | |
83c7162d | 294 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
295 | pub unsafe fn _mm_xor_ps(a: __m128, b: __m128) -> __m128 { |
296 | let a: __m128i = mem::transmute(a); | |
297 | let b: __m128i = mem::transmute(b); | |
298 | mem::transmute(simd_xor(a, b)) | |
299 | } | |
300 | ||
532ac7d7 | 301 | /// Compares the lowest `f32` of both inputs for equality. The lowest 32 bits of |
0531ce1d XL |
302 | /// the result will be `0xffffffff` if the two inputs are equal, or `0` |
303 | /// otherwise. The upper 96 bits of the result are the upper 96 bits of `a`. | |
83c7162d XL |
304 | /// |
305 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_ss) | |
0531ce1d XL |
306 | #[inline] |
307 | #[target_feature(enable = "sse")] | |
308 | #[cfg_attr(test, assert_instr(cmpeqss))] | |
83c7162d | 309 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
310 | pub unsafe fn _mm_cmpeq_ss(a: __m128, b: __m128) -> __m128 { |
311 | cmpss(a, b, 0) | |
312 | } | |
313 | ||
532ac7d7 | 314 | /// Compares the lowest `f32` of both inputs for less than. The lowest 32 bits |
0531ce1d XL |
315 | /// of the result will be `0xffffffff` if `a.extract(0)` is less than |
316 | /// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the | |
317 | /// upper 96 bits of `a`. | |
83c7162d XL |
318 | /// |
319 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_ss) | |
0531ce1d XL |
320 | #[inline] |
321 | #[target_feature(enable = "sse")] | |
322 | #[cfg_attr(test, assert_instr(cmpltss))] | |
83c7162d | 323 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
324 | pub unsafe fn _mm_cmplt_ss(a: __m128, b: __m128) -> __m128 { |
325 | cmpss(a, b, 1) | |
326 | } | |
327 | ||
532ac7d7 | 328 | /// Compares the lowest `f32` of both inputs for less than or equal. The lowest |
0531ce1d XL |
329 | /// 32 bits of the result will be `0xffffffff` if `a.extract(0)` is less than |
330 | /// or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result | |
331 | /// are the upper 96 bits of `a`. | |
83c7162d XL |
332 | /// |
333 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_ss) | |
0531ce1d XL |
334 | #[inline] |
335 | #[target_feature(enable = "sse")] | |
336 | #[cfg_attr(test, assert_instr(cmpless))] | |
83c7162d | 337 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
338 | pub unsafe fn _mm_cmple_ss(a: __m128, b: __m128) -> __m128 { |
339 | cmpss(a, b, 2) | |
340 | } | |
341 | ||
532ac7d7 | 342 | /// Compares the lowest `f32` of both inputs for greater than. The lowest 32 |
0531ce1d XL |
343 | /// bits of the result will be `0xffffffff` if `a.extract(0)` is greater |
344 | /// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result | |
345 | /// are the upper 96 bits of `a`. | |
83c7162d XL |
346 | /// |
347 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_ss) | |
0531ce1d XL |
348 | #[inline] |
349 | #[target_feature(enable = "sse")] | |
350 | #[cfg_attr(test, assert_instr(cmpltss))] | |
83c7162d | 351 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 352 | pub unsafe fn _mm_cmpgt_ss(a: __m128, b: __m128) -> __m128 { |
17df50a5 | 353 | simd_shuffle4!(a, cmpss(b, a, 1), [4, 1, 2, 3]) |
0531ce1d XL |
354 | } |
355 | ||
532ac7d7 | 356 | /// Compares the lowest `f32` of both inputs for greater than or equal. The |
0531ce1d XL |
357 | /// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is |
358 | /// greater than or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits | |
359 | /// of the result are the upper 96 bits of `a`. | |
83c7162d XL |
360 | /// |
361 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_ss) | |
0531ce1d XL |
362 | #[inline] |
363 | #[target_feature(enable = "sse")] | |
364 | #[cfg_attr(test, assert_instr(cmpless))] | |
83c7162d | 365 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 366 | pub unsafe fn _mm_cmpge_ss(a: __m128, b: __m128) -> __m128 { |
17df50a5 | 367 | simd_shuffle4!(a, cmpss(b, a, 2), [4, 1, 2, 3]) |
0531ce1d XL |
368 | } |
369 | ||
532ac7d7 | 370 | /// Compares the lowest `f32` of both inputs for inequality. The lowest 32 bits |
0531ce1d XL |
371 | /// of the result will be `0xffffffff` if `a.extract(0)` is not equal to |
372 | /// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the | |
373 | /// upper 96 bits of `a`. | |
83c7162d XL |
374 | /// |
375 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_ss) | |
0531ce1d XL |
376 | #[inline] |
377 | #[target_feature(enable = "sse")] | |
378 | #[cfg_attr(test, assert_instr(cmpneqss))] | |
83c7162d | 379 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
380 | pub unsafe fn _mm_cmpneq_ss(a: __m128, b: __m128) -> __m128 { |
381 | cmpss(a, b, 4) | |
382 | } | |
383 | ||
532ac7d7 | 384 | /// Compares the lowest `f32` of both inputs for not-less-than. The lowest 32 |
0531ce1d XL |
385 | /// bits of the result will be `0xffffffff` if `a.extract(0)` is not less than |
386 | /// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the | |
387 | /// upper 96 bits of `a`. | |
83c7162d XL |
388 | /// |
389 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_ss) | |
0531ce1d XL |
390 | #[inline] |
391 | #[target_feature(enable = "sse")] | |
392 | #[cfg_attr(test, assert_instr(cmpnltss))] | |
83c7162d | 393 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
394 | pub unsafe fn _mm_cmpnlt_ss(a: __m128, b: __m128) -> __m128 { |
395 | cmpss(a, b, 5) | |
396 | } | |
397 | ||
532ac7d7 | 398 | /// Compares the lowest `f32` of both inputs for not-less-than-or-equal. The |
0531ce1d XL |
399 | /// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not |
400 | /// less than or equal to `b.extract(0)`, or `0` otherwise. The upper 96 bits | |
401 | /// of the result are the upper 96 bits of `a`. | |
83c7162d XL |
402 | /// |
403 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_ss) | |
0531ce1d XL |
404 | #[inline] |
405 | #[target_feature(enable = "sse")] | |
406 | #[cfg_attr(test, assert_instr(cmpnless))] | |
83c7162d | 407 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
408 | pub unsafe fn _mm_cmpnle_ss(a: __m128, b: __m128) -> __m128 { |
409 | cmpss(a, b, 6) | |
410 | } | |
411 | ||
532ac7d7 | 412 | /// Compares the lowest `f32` of both inputs for not-greater-than. The lowest 32 |
0531ce1d XL |
413 | /// bits of the result will be `0xffffffff` if `a.extract(0)` is not greater |
414 | /// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are | |
415 | /// the upper 96 bits of `a`. | |
83c7162d XL |
416 | /// |
417 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpngt_ss) | |
0531ce1d XL |
418 | #[inline] |
419 | #[target_feature(enable = "sse")] | |
420 | #[cfg_attr(test, assert_instr(cmpnltss))] | |
83c7162d | 421 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 422 | pub unsafe fn _mm_cmpngt_ss(a: __m128, b: __m128) -> __m128 { |
17df50a5 | 423 | simd_shuffle4!(a, cmpss(b, a, 5), [4, 1, 2, 3]) |
0531ce1d XL |
424 | } |
425 | ||
532ac7d7 | 426 | /// Compares the lowest `f32` of both inputs for not-greater-than-or-equal. The |
0531ce1d XL |
427 | /// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not |
428 | /// greater than or equal to `b.extract(0)`, or `0` otherwise. The upper 96 | |
429 | /// bits of the result are the upper 96 bits of `a`. | |
83c7162d XL |
430 | /// |
431 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_ss) | |
0531ce1d XL |
432 | #[inline] |
433 | #[target_feature(enable = "sse")] | |
434 | #[cfg_attr(test, assert_instr(cmpnless))] | |
83c7162d | 435 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 436 | pub unsafe fn _mm_cmpnge_ss(a: __m128, b: __m128) -> __m128 { |
17df50a5 | 437 | simd_shuffle4!(a, cmpss(b, a, 6), [4, 1, 2, 3]) |
0531ce1d XL |
438 | } |
439 | ||
532ac7d7 | 440 | /// Checks if the lowest `f32` of both inputs are ordered. The lowest 32 bits of |
0531ce1d XL |
441 | /// the result will be `0xffffffff` if neither of `a.extract(0)` or |
442 | /// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result | |
443 | /// are the upper 96 bits of `a`. | |
83c7162d XL |
444 | /// |
445 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_ss) | |
0531ce1d XL |
446 | #[inline] |
447 | #[target_feature(enable = "sse")] | |
448 | #[cfg_attr(test, assert_instr(cmpordss))] | |
83c7162d | 449 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
450 | pub unsafe fn _mm_cmpord_ss(a: __m128, b: __m128) -> __m128 { |
451 | cmpss(a, b, 7) | |
452 | } | |
453 | ||
532ac7d7 | 454 | /// Checks if the lowest `f32` of both inputs are unordered. The lowest 32 bits |
0531ce1d XL |
455 | /// of the result will be `0xffffffff` if any of `a.extract(0)` or |
456 | /// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result | |
457 | /// are the upper 96 bits of `a`. | |
83c7162d XL |
458 | /// |
459 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_ss) | |
0531ce1d XL |
460 | #[inline] |
461 | #[target_feature(enable = "sse")] | |
462 | #[cfg_attr(test, assert_instr(cmpunordss))] | |
83c7162d | 463 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
464 | pub unsafe fn _mm_cmpunord_ss(a: __m128, b: __m128) -> __m128 { |
465 | cmpss(a, b, 3) | |
466 | } | |
467 | ||
532ac7d7 | 468 | /// Compares each of the four floats in `a` to the corresponding element in `b`. |
0531ce1d XL |
469 | /// The result in the output vector will be `0xffffffff` if the input elements |
470 | /// were equal, or `0` otherwise. | |
83c7162d XL |
471 | /// |
472 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_ps) | |
0531ce1d XL |
473 | #[inline] |
474 | #[target_feature(enable = "sse")] | |
475 | #[cfg_attr(test, assert_instr(cmpeqps))] | |
83c7162d | 476 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
477 | pub unsafe fn _mm_cmpeq_ps(a: __m128, b: __m128) -> __m128 { |
478 | cmpps(a, b, 0) | |
479 | } | |
480 | ||
532ac7d7 | 481 | /// Compares each of the four floats in `a` to the corresponding element in `b`. |
0531ce1d XL |
482 | /// The result in the output vector will be `0xffffffff` if the input element |
483 | /// in `a` is less than the corresponding element in `b`, or `0` otherwise. | |
83c7162d XL |
484 | /// |
485 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_ps) | |
0531ce1d XL |
486 | #[inline] |
487 | #[target_feature(enable = "sse")] | |
488 | #[cfg_attr(test, assert_instr(cmpltps))] | |
83c7162d | 489 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
490 | pub unsafe fn _mm_cmplt_ps(a: __m128, b: __m128) -> __m128 { |
491 | cmpps(a, b, 1) | |
492 | } | |
493 | ||
532ac7d7 | 494 | /// Compares each of the four floats in `a` to the corresponding element in `b`. |
0531ce1d XL |
495 | /// The result in the output vector will be `0xffffffff` if the input element |
496 | /// in `a` is less than or equal to the corresponding element in `b`, or `0` | |
497 | /// otherwise. | |
83c7162d XL |
498 | /// |
499 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_ps) | |
0531ce1d XL |
500 | #[inline] |
501 | #[target_feature(enable = "sse")] | |
502 | #[cfg_attr(test, assert_instr(cmpleps))] | |
83c7162d | 503 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
504 | pub unsafe fn _mm_cmple_ps(a: __m128, b: __m128) -> __m128 { |
505 | cmpps(a, b, 2) | |
506 | } | |
507 | ||
532ac7d7 | 508 | /// Compares each of the four floats in `a` to the corresponding element in `b`. |
0531ce1d XL |
509 | /// The result in the output vector will be `0xffffffff` if the input element |
510 | /// in `a` is greater than the corresponding element in `b`, or `0` otherwise. | |
83c7162d XL |
511 | /// |
512 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_ps) | |
0531ce1d XL |
513 | #[inline] |
514 | #[target_feature(enable = "sse")] | |
515 | #[cfg_attr(test, assert_instr(cmpltps))] | |
83c7162d | 516 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
517 | pub unsafe fn _mm_cmpgt_ps(a: __m128, b: __m128) -> __m128 { |
518 | cmpps(b, a, 1) | |
519 | } | |
520 | ||
532ac7d7 | 521 | /// Compares each of the four floats in `a` to the corresponding element in `b`. |
0531ce1d XL |
522 | /// The result in the output vector will be `0xffffffff` if the input element |
523 | /// in `a` is greater than or equal to the corresponding element in `b`, or `0` | |
524 | /// otherwise. | |
83c7162d XL |
525 | /// |
526 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_ps) | |
0531ce1d XL |
527 | #[inline] |
528 | #[target_feature(enable = "sse")] | |
529 | #[cfg_attr(test, assert_instr(cmpleps))] | |
83c7162d | 530 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
531 | pub unsafe fn _mm_cmpge_ps(a: __m128, b: __m128) -> __m128 { |
532 | cmpps(b, a, 2) | |
533 | } | |
534 | ||
532ac7d7 | 535 | /// Compares each of the four floats in `a` to the corresponding element in `b`. |
0531ce1d | 536 | /// The result in the output vector will be `0xffffffff` if the input elements |
532ac7d7 | 537 | /// are **not** equal, or `0` otherwise. |
83c7162d XL |
538 | /// |
539 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_ps) | |
0531ce1d XL |
540 | #[inline] |
541 | #[target_feature(enable = "sse")] | |
542 | #[cfg_attr(test, assert_instr(cmpneqps))] | |
83c7162d | 543 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
544 | pub unsafe fn _mm_cmpneq_ps(a: __m128, b: __m128) -> __m128 { |
545 | cmpps(a, b, 4) | |
546 | } | |
547 | ||
532ac7d7 | 548 | /// Compares each of the four floats in `a` to the corresponding element in `b`. |
0531ce1d | 549 | /// The result in the output vector will be `0xffffffff` if the input element |
532ac7d7 | 550 | /// in `a` is **not** less than the corresponding element in `b`, or `0` |
0531ce1d | 551 | /// otherwise. |
83c7162d XL |
552 | /// |
553 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_ps) | |
0531ce1d XL |
554 | #[inline] |
555 | #[target_feature(enable = "sse")] | |
556 | #[cfg_attr(test, assert_instr(cmpnltps))] | |
83c7162d | 557 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
558 | pub unsafe fn _mm_cmpnlt_ps(a: __m128, b: __m128) -> __m128 { |
559 | cmpps(a, b, 5) | |
560 | } | |
561 | ||
532ac7d7 | 562 | /// Compares each of the four floats in `a` to the corresponding element in `b`. |
0531ce1d | 563 | /// The result in the output vector will be `0xffffffff` if the input element |
532ac7d7 | 564 | /// in `a` is **not** less than or equal to the corresponding element in `b`, or |
0531ce1d | 565 | /// `0` otherwise. |
83c7162d XL |
566 | /// |
567 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_ps) | |
0531ce1d XL |
568 | #[inline] |
569 | #[target_feature(enable = "sse")] | |
570 | #[cfg_attr(test, assert_instr(cmpnleps))] | |
83c7162d | 571 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
572 | pub unsafe fn _mm_cmpnle_ps(a: __m128, b: __m128) -> __m128 { |
573 | cmpps(a, b, 6) | |
574 | } | |
575 | ||
532ac7d7 | 576 | /// Compares each of the four floats in `a` to the corresponding element in `b`. |
0531ce1d | 577 | /// The result in the output vector will be `0xffffffff` if the input element |
532ac7d7 | 578 | /// in `a` is **not** greater than the corresponding element in `b`, or `0` |
0531ce1d | 579 | /// otherwise. |
83c7162d XL |
580 | /// |
581 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpngt_ps) | |
0531ce1d XL |
582 | #[inline] |
583 | #[target_feature(enable = "sse")] | |
584 | #[cfg_attr(test, assert_instr(cmpnltps))] | |
83c7162d | 585 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
586 | pub unsafe fn _mm_cmpngt_ps(a: __m128, b: __m128) -> __m128 { |
587 | cmpps(b, a, 5) | |
588 | } | |
589 | ||
532ac7d7 | 590 | /// Compares each of the four floats in `a` to the corresponding element in `b`. |
0531ce1d | 591 | /// The result in the output vector will be `0xffffffff` if the input element |
532ac7d7 | 592 | /// in `a` is **not** greater than or equal to the corresponding element in `b`, |
0531ce1d | 593 | /// or `0` otherwise. |
83c7162d XL |
594 | /// |
595 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_ps) | |
0531ce1d XL |
596 | #[inline] |
597 | #[target_feature(enable = "sse")] | |
598 | #[cfg_attr(test, assert_instr(cmpnleps))] | |
83c7162d | 599 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
600 | pub unsafe fn _mm_cmpnge_ps(a: __m128, b: __m128) -> __m128 { |
601 | cmpps(b, a, 6) | |
602 | } | |
603 | ||
532ac7d7 | 604 | /// Compares each of the four floats in `a` to the corresponding element in `b`. |
0531ce1d XL |
605 | /// Returns four floats that have one of two possible bit patterns. The element |
606 | /// in the output vector will be `0xffffffff` if the input elements in `a` and | |
607 | /// `b` are ordered (i.e., neither of them is a NaN), or 0 otherwise. | |
83c7162d XL |
608 | /// |
609 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_ps) | |
0531ce1d XL |
610 | #[inline] |
611 | #[target_feature(enable = "sse")] | |
612 | #[cfg_attr(test, assert_instr(cmpordps))] | |
83c7162d | 613 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
614 | pub unsafe fn _mm_cmpord_ps(a: __m128, b: __m128) -> __m128 { |
615 | cmpps(b, a, 7) | |
616 | } | |
617 | ||
532ac7d7 | 618 | /// Compares each of the four floats in `a` to the corresponding element in `b`. |
0531ce1d XL |
619 | /// Returns four floats that have one of two possible bit patterns. The element |
620 | /// in the output vector will be `0xffffffff` if the input elements in `a` and | |
621 | /// `b` are unordered (i.e., at least on of them is a NaN), or 0 otherwise. | |
83c7162d XL |
622 | /// |
623 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_ps) | |
0531ce1d XL |
624 | #[inline] |
625 | #[target_feature(enable = "sse")] | |
626 | #[cfg_attr(test, assert_instr(cmpunordps))] | |
83c7162d | 627 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
628 | pub unsafe fn _mm_cmpunord_ps(a: __m128, b: __m128) -> __m128 { |
629 | cmpps(b, a, 3) | |
630 | } | |
631 | ||
532ac7d7 | 632 | /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns |
0531ce1d | 633 | /// `1` if they are equal, or `0` otherwise. |
83c7162d XL |
634 | /// |
635 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comieq_ss) | |
0531ce1d XL |
636 | #[inline] |
637 | #[target_feature(enable = "sse")] | |
638 | #[cfg_attr(test, assert_instr(comiss))] | |
83c7162d | 639 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
640 | pub unsafe fn _mm_comieq_ss(a: __m128, b: __m128) -> i32 { |
641 | comieq_ss(a, b) | |
642 | } | |
643 | ||
532ac7d7 | 644 | /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns |
0531ce1d | 645 | /// `1` if the value from `a` is less than the one from `b`, or `0` otherwise. |
83c7162d XL |
646 | /// |
647 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comilt_ss) | |
0531ce1d XL |
648 | #[inline] |
649 | #[target_feature(enable = "sse")] | |
650 | #[cfg_attr(test, assert_instr(comiss))] | |
83c7162d | 651 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
652 | pub unsafe fn _mm_comilt_ss(a: __m128, b: __m128) -> i32 { |
653 | comilt_ss(a, b) | |
654 | } | |
655 | ||
532ac7d7 | 656 | /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns |
0531ce1d XL |
657 | /// `1` if the value from `a` is less than or equal to the one from `b`, or `0` |
658 | /// otherwise. | |
83c7162d XL |
659 | /// |
660 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comile_ss) | |
0531ce1d XL |
661 | #[inline] |
662 | #[target_feature(enable = "sse")] | |
663 | #[cfg_attr(test, assert_instr(comiss))] | |
83c7162d | 664 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
665 | pub unsafe fn _mm_comile_ss(a: __m128, b: __m128) -> i32 { |
666 | comile_ss(a, b) | |
667 | } | |
668 | ||
532ac7d7 | 669 | /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns |
0531ce1d XL |
670 | /// `1` if the value from `a` is greater than the one from `b`, or `0` |
671 | /// otherwise. | |
83c7162d XL |
672 | /// |
673 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comigt_ss) | |
0531ce1d XL |
674 | #[inline] |
675 | #[target_feature(enable = "sse")] | |
676 | #[cfg_attr(test, assert_instr(comiss))] | |
83c7162d | 677 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
678 | pub unsafe fn _mm_comigt_ss(a: __m128, b: __m128) -> i32 { |
679 | comigt_ss(a, b) | |
680 | } | |
681 | ||
532ac7d7 | 682 | /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns |
0531ce1d XL |
683 | /// `1` if the value from `a` is greater than or equal to the one from `b`, or |
684 | /// `0` otherwise. | |
83c7162d XL |
685 | /// |
686 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comige_ss) | |
0531ce1d XL |
687 | #[inline] |
688 | #[target_feature(enable = "sse")] | |
689 | #[cfg_attr(test, assert_instr(comiss))] | |
83c7162d | 690 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
691 | pub unsafe fn _mm_comige_ss(a: __m128, b: __m128) -> i32 { |
692 | comige_ss(a, b) | |
693 | } | |
694 | ||
532ac7d7 XL |
695 | /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns |
696 | /// `1` if they are **not** equal, or `0` otherwise. | |
83c7162d XL |
697 | /// |
698 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comineq_ss) | |
0531ce1d XL |
699 | #[inline] |
700 | #[target_feature(enable = "sse")] | |
701 | #[cfg_attr(test, assert_instr(comiss))] | |
83c7162d | 702 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
703 | pub unsafe fn _mm_comineq_ss(a: __m128, b: __m128) -> i32 { |
704 | comineq_ss(a, b) | |
705 | } | |
706 | ||
532ac7d7 | 707 | /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns |
0531ce1d XL |
708 | /// `1` if they are equal, or `0` otherwise. This instruction will not signal |
709 | /// an exception if either argument is a quiet NaN. | |
83c7162d XL |
710 | /// |
711 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomieq_ss) | |
0531ce1d XL |
712 | #[inline] |
713 | #[target_feature(enable = "sse")] | |
714 | #[cfg_attr(test, assert_instr(ucomiss))] | |
83c7162d | 715 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
716 | pub unsafe fn _mm_ucomieq_ss(a: __m128, b: __m128) -> i32 { |
717 | ucomieq_ss(a, b) | |
718 | } | |
719 | ||
532ac7d7 | 720 | /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns |
0531ce1d XL |
721 | /// `1` if the value from `a` is less than the one from `b`, or `0` otherwise. |
722 | /// This instruction will not signal an exception if either argument is a quiet | |
723 | /// NaN. | |
83c7162d XL |
724 | /// |
725 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomilt_ss) | |
0531ce1d XL |
726 | #[inline] |
727 | #[target_feature(enable = "sse")] | |
728 | #[cfg_attr(test, assert_instr(ucomiss))] | |
83c7162d | 729 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
730 | pub unsafe fn _mm_ucomilt_ss(a: __m128, b: __m128) -> i32 { |
731 | ucomilt_ss(a, b) | |
732 | } | |
733 | ||
532ac7d7 | 734 | /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns |
0531ce1d XL |
735 | /// `1` if the value from `a` is less than or equal to the one from `b`, or `0` |
736 | /// otherwise. This instruction will not signal an exception if either argument | |
737 | /// is a quiet NaN. | |
83c7162d XL |
738 | /// |
739 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomile_ss) | |
0531ce1d XL |
740 | #[inline] |
741 | #[target_feature(enable = "sse")] | |
742 | #[cfg_attr(test, assert_instr(ucomiss))] | |
83c7162d | 743 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
744 | pub unsafe fn _mm_ucomile_ss(a: __m128, b: __m128) -> i32 { |
745 | ucomile_ss(a, b) | |
746 | } | |
747 | ||
532ac7d7 | 748 | /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns |
0531ce1d XL |
749 | /// `1` if the value from `a` is greater than the one from `b`, or `0` |
750 | /// otherwise. This instruction will not signal an exception if either argument | |
751 | /// is a quiet NaN. | |
83c7162d XL |
752 | /// |
753 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomigt_ss) | |
0531ce1d XL |
754 | #[inline] |
755 | #[target_feature(enable = "sse")] | |
756 | #[cfg_attr(test, assert_instr(ucomiss))] | |
83c7162d | 757 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
758 | pub unsafe fn _mm_ucomigt_ss(a: __m128, b: __m128) -> i32 { |
759 | ucomigt_ss(a, b) | |
760 | } | |
761 | ||
532ac7d7 | 762 | /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns |
0531ce1d XL |
763 | /// `1` if the value from `a` is greater than or equal to the one from `b`, or |
764 | /// `0` otherwise. This instruction will not signal an exception if either | |
765 | /// argument is a quiet NaN. | |
83c7162d XL |
766 | /// |
767 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomige_ss) | |
0531ce1d XL |
768 | #[inline] |
769 | #[target_feature(enable = "sse")] | |
770 | #[cfg_attr(test, assert_instr(ucomiss))] | |
83c7162d | 771 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
772 | pub unsafe fn _mm_ucomige_ss(a: __m128, b: __m128) -> i32 { |
773 | ucomige_ss(a, b) | |
774 | } | |
775 | ||
532ac7d7 XL |
776 | /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns |
777 | /// `1` if they are **not** equal, or `0` otherwise. This instruction will not | |
0531ce1d | 778 | /// signal an exception if either argument is a quiet NaN. |
83c7162d XL |
779 | /// |
780 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomineq_ss) | |
0531ce1d XL |
781 | #[inline] |
782 | #[target_feature(enable = "sse")] | |
783 | #[cfg_attr(test, assert_instr(ucomiss))] | |
83c7162d | 784 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
785 | pub unsafe fn _mm_ucomineq_ss(a: __m128, b: __m128) -> i32 { |
786 | ucomineq_ss(a, b) | |
787 | } | |
788 | ||
532ac7d7 | 789 | /// Converts the lowest 32 bit float in the input vector to a 32 bit integer. |
0531ce1d XL |
790 | /// |
791 | /// The result is rounded according to the current rounding mode. If the result | |
792 | /// cannot be represented as a 32 bit integer the result will be `0x8000_0000` | |
ba9703b0 | 793 | /// (`i32::MIN`) or an invalid operation floating point exception if |
0531ce1d XL |
794 | /// unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)). |
795 | /// | |
796 | /// This corresponds to the `CVTSS2SI` instruction (with 32 bit output). | |
83c7162d XL |
797 | /// |
798 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si32) | |
0531ce1d XL |
799 | #[inline] |
800 | #[target_feature(enable = "sse")] | |
801 | #[cfg_attr(test, assert_instr(cvtss2si))] | |
83c7162d | 802 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
803 | pub unsafe fn _mm_cvtss_si32(a: __m128) -> i32 { |
804 | cvtss2si(a) | |
805 | } | |
806 | ||
807 | /// Alias for [`_mm_cvtss_si32`](fn._mm_cvtss_si32.html). | |
83c7162d XL |
808 | /// |
809 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si) | |
0531ce1d XL |
810 | #[inline] |
811 | #[target_feature(enable = "sse")] | |
812 | #[cfg_attr(test, assert_instr(cvtss2si))] | |
83c7162d | 813 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
814 | pub unsafe fn _mm_cvt_ss2si(a: __m128) -> i32 { |
815 | _mm_cvtss_si32(a) | |
816 | } | |
817 | ||
532ac7d7 | 818 | /// Converts the lowest 32 bit float in the input vector to a 32 bit integer |
0531ce1d XL |
819 | /// with |
820 | /// truncation. | |
821 | /// | |
822 | /// The result is rounded always using truncation (round towards zero). If the | |
823 | /// result cannot be represented as a 32 bit integer the result will be | |
ba9703b0 | 824 | /// `0x8000_0000` (`i32::MIN`) or an invalid operation floating point |
0531ce1d XL |
825 | /// exception if unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)). |
826 | /// | |
827 | /// This corresponds to the `CVTTSS2SI` instruction (with 32 bit output). | |
83c7162d XL |
828 | /// |
829 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si32) | |
0531ce1d XL |
830 | #[inline] |
831 | #[target_feature(enable = "sse")] | |
832 | #[cfg_attr(test, assert_instr(cvttss2si))] | |
83c7162d | 833 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
834 | pub unsafe fn _mm_cvttss_si32(a: __m128) -> i32 { |
835 | cvttss2si(a) | |
836 | } | |
837 | ||
838 | /// Alias for [`_mm_cvttss_si32`](fn._mm_cvttss_si32.html). | |
83c7162d XL |
839 | /// |
840 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ss2si) | |
0531ce1d XL |
841 | #[inline] |
842 | #[target_feature(enable = "sse")] | |
843 | #[cfg_attr(test, assert_instr(cvttss2si))] | |
83c7162d | 844 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
845 | pub unsafe fn _mm_cvtt_ss2si(a: __m128) -> i32 { |
846 | _mm_cvttss_si32(a) | |
847 | } | |
848 | ||
532ac7d7 | 849 | /// Extracts the lowest 32 bit float from the input vector. |
83c7162d XL |
850 | /// |
851 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32) | |
0531ce1d XL |
852 | #[inline] |
853 | #[target_feature(enable = "sse")] | |
854 | // No point in using assert_instrs. In Unix x86_64 calling convention this is a | |
855 | // no-op, and on Windows it's just a `mov`. | |
83c7162d | 856 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
857 | pub unsafe fn _mm_cvtss_f32(a: __m128) -> f32 { |
858 | simd_extract(a, 0) | |
859 | } | |
860 | ||
532ac7d7 | 861 | /// Converts a 32 bit integer to a 32 bit float. The result vector is the input |
0531ce1d XL |
862 | /// vector `a` with the lowest 32 bit float replaced by the converted integer. |
863 | /// | |
864 | /// This intrinsic corresponds to the `CVTSI2SS` instruction (with 32 bit | |
865 | /// input). | |
83c7162d XL |
866 | /// |
867 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_ss) | |
0531ce1d XL |
868 | #[inline] |
869 | #[target_feature(enable = "sse")] | |
870 | #[cfg_attr(test, assert_instr(cvtsi2ss))] | |
83c7162d | 871 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
872 | pub unsafe fn _mm_cvtsi32_ss(a: __m128, b: i32) -> __m128 { |
873 | cvtsi2ss(a, b) | |
874 | } | |
875 | ||
876 | /// Alias for [`_mm_cvtsi32_ss`](fn._mm_cvtsi32_ss.html). | |
83c7162d XL |
877 | /// |
878 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss) | |
0531ce1d XL |
879 | #[inline] |
880 | #[target_feature(enable = "sse")] | |
881 | #[cfg_attr(test, assert_instr(cvtsi2ss))] | |
83c7162d | 882 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
883 | pub unsafe fn _mm_cvt_si2ss(a: __m128, b: i32) -> __m128 { |
884 | _mm_cvtsi32_ss(a, b) | |
885 | } | |
886 | ||
887 | /// Construct a `__m128` with the lowest element set to `a` and the rest set to | |
888 | /// zero. | |
83c7162d XL |
889 | /// |
890 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss) | |
0531ce1d XL |
891 | #[inline] |
892 | #[target_feature(enable = "sse")] | |
893 | #[cfg_attr(test, assert_instr(movss))] | |
83c7162d | 894 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
895 | pub unsafe fn _mm_set_ss(a: f32) -> __m128 { |
896 | __m128(a, 0.0, 0.0, 0.0) | |
897 | } | |
898 | ||
899 | /// Construct a `__m128` with all element set to `a`. | |
83c7162d XL |
900 | /// |
901 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_ps) | |
0531ce1d XL |
902 | #[inline] |
903 | #[target_feature(enable = "sse")] | |
904 | #[cfg_attr(test, assert_instr(shufps))] | |
83c7162d | 905 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
906 | pub unsafe fn _mm_set1_ps(a: f32) -> __m128 { |
907 | __m128(a, a, a, a) | |
908 | } | |
909 | ||
910 | /// Alias for [`_mm_set1_ps`](fn._mm_set1_ps.html) | |
83c7162d XL |
911 | /// |
912 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ps1) | |
0531ce1d XL |
913 | #[inline] |
914 | #[target_feature(enable = "sse")] | |
915 | #[cfg_attr(test, assert_instr(shufps))] | |
83c7162d | 916 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
917 | pub unsafe fn _mm_set_ps1(a: f32) -> __m128 { |
918 | _mm_set1_ps(a) | |
919 | } | |
920 | ||
921 | /// Construct a `__m128` from four floating point values highest to lowest. | |
922 | /// | |
923 | /// Note that `a` will be the highest 32 bits of the result, and `d` the | |
924 | /// lowest. This matches the standard way of writing bit patterns on x86: | |
925 | /// | |
926 | /// ```text | |
927 | /// bit 127 .. 96 95 .. 64 63 .. 32 31 .. 0 | |
928 | /// +---------+---------+---------+---------+ | |
929 | /// | a | b | c | d | result | |
930 | /// +---------+---------+---------+---------+ | |
931 | /// ``` | |
932 | /// | |
933 | /// Alternatively: | |
934 | /// | |
935 | /// ```text | |
936 | /// let v = _mm_set_ps(d, c, b, a); | |
937 | /// ``` | |
83c7162d XL |
938 | /// |
939 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ps) | |
0531ce1d XL |
940 | #[inline] |
941 | #[target_feature(enable = "sse")] | |
942 | #[cfg_attr(test, assert_instr(unpcklps))] | |
83c7162d | 943 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
944 | pub unsafe fn _mm_set_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 { |
945 | __m128(d, c, b, a) | |
946 | } | |
947 | ||
948 | /// Construct a `__m128` from four floating point values lowest to highest. | |
949 | /// | |
950 | /// This matches the memory order of `__m128`, i.e., `a` will be the lowest 32 | |
951 | /// bits of the result, and `d` the highest. | |
952 | /// | |
953 | /// ```text | |
954 | /// assert_eq!(__m128::new(a, b, c, d), _mm_setr_ps(a, b, c, d)); | |
955 | /// ``` | |
83c7162d XL |
956 | /// |
957 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_ps) | |
0531ce1d XL |
958 | #[inline] |
959 | #[target_feature(enable = "sse")] | |
fc512014 XL |
960 | #[cfg_attr( |
961 | all(test, any(target_os = "windows", target_arch = "x86_64")), | |
962 | assert_instr(unpcklps) | |
963 | )] | |
964 | // On a 32-bit architecture on non-Windows it just copies the operands from the stack. | |
965 | #[cfg_attr( | |
966 | all(test, all(not(target_os = "windows"), target_arch = "x86")), | |
967 | assert_instr(movaps) | |
968 | )] | |
83c7162d | 969 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
970 | pub unsafe fn _mm_setr_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 { |
971 | __m128(a, b, c, d) | |
972 | } | |
973 | ||
974 | /// Construct a `__m128` with all elements initialized to zero. | |
83c7162d XL |
975 | /// |
976 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_ps) | |
0531ce1d XL |
977 | #[inline] |
978 | #[target_feature(enable = "sse")] | |
979 | #[cfg_attr(test, assert_instr(xorps))] | |
83c7162d | 980 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
981 | pub unsafe fn _mm_setzero_ps() -> __m128 { |
982 | __m128(0.0, 0.0, 0.0, 0.0) | |
983 | } | |
984 | ||
0bf4aa26 XL |
985 | /// A utility function for creating masks to use with Intel shuffle and |
986 | /// permute intrinsics. | |
8faf50e0 XL |
987 | #[inline] |
988 | #[allow(non_snake_case)] | |
416331ca | 989 | #[unstable(feature = "stdarch", issue = "27731")] |
0731742a XL |
990 | pub const fn _MM_SHUFFLE(z: u32, y: u32, x: u32, w: u32) -> i32 { |
991 | ((z << 6) | (y << 4) | (x << 2) | w) as i32 | |
8faf50e0 XL |
992 | } |
993 | ||
532ac7d7 | 994 | /// Shuffles packed single-precision (32-bit) floating-point elements in `a` and |
17df50a5 | 995 | /// `b` using `MASK`. |
0531ce1d XL |
996 | /// |
997 | /// The lower half of result takes values from `a` and the higher half from | |
998 | /// `b`. Mask is split to 2 control bits each to index the element from inputs. | |
83c7162d XL |
999 | /// |
1000 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_ps) | |
3dfed10e XL |
1001 | /// |
1002 | /// Note that there appears to be a mistake within Intel's Intrinsics Guide. | |
94222f64 | 1003 | /// `_mm_shuffle_ps` is supposed to take an `i32` instead of a `u32` |
3dfed10e XL |
1004 | /// as is the case for [other shuffle intrinsics](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_). |
1005 | /// Performing an implicit type conversion between an unsigned integer and a signed integer | |
1006 | /// does not cause a problem in C, however Rust's commitment to strong typing does not allow this. | |
0531ce1d XL |
1007 | #[inline] |
1008 | #[target_feature(enable = "sse")] | |
17df50a5 XL |
1009 | #[cfg_attr(test, assert_instr(shufps, MASK = 3))] |
1010 | #[rustc_legacy_const_generics(2)] | |
1011 | #[stable(feature = "simd_x86", since = "1.27.0")] | |
1012 | pub unsafe fn _mm_shuffle_ps<const MASK: i32>(a: __m128, b: __m128) -> __m128 { | |
1013 | static_assert_imm8!(MASK); | |
1014 | simd_shuffle4!( | |
1015 | a, | |
1016 | b, | |
1017 | <const MASK: i32> [ | |
1018 | MASK as u32 & 0b11, | |
1019 | (MASK as u32 >> 2) & 0b11, | |
1020 | ((MASK as u32 >> 4) & 0b11) + 4, | |
1021 | ((MASK as u32 >> 6) & 0b11) + 4, | |
1022 | ], | |
1023 | ) | |
0531ce1d XL |
1024 | } |
1025 | ||
532ac7d7 | 1026 | /// Unpacks and interleave single-precision (32-bit) floating-point elements |
0531ce1d | 1027 | /// from the higher half of `a` and `b`. |
83c7162d XL |
1028 | /// |
1029 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_ps) | |
0531ce1d XL |
1030 | #[inline] |
1031 | #[target_feature(enable = "sse")] | |
1032 | #[cfg_attr(test, assert_instr(unpckhps))] | |
83c7162d | 1033 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 1034 | pub unsafe fn _mm_unpackhi_ps(a: __m128, b: __m128) -> __m128 { |
17df50a5 | 1035 | simd_shuffle4!(a, b, [2, 6, 3, 7]) |
0531ce1d XL |
1036 | } |
1037 | ||
532ac7d7 | 1038 | /// Unpacks and interleave single-precision (32-bit) floating-point elements |
0531ce1d | 1039 | /// from the lower half of `a` and `b`. |
83c7162d XL |
1040 | /// |
1041 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_ps) | |
0531ce1d XL |
1042 | #[inline] |
1043 | #[target_feature(enable = "sse")] | |
1044 | #[cfg_attr(test, assert_instr(unpcklps))] | |
83c7162d | 1045 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 1046 | pub unsafe fn _mm_unpacklo_ps(a: __m128, b: __m128) -> __m128 { |
17df50a5 | 1047 | simd_shuffle4!(a, b, [0, 4, 1, 5]) |
0531ce1d XL |
1048 | } |
1049 | ||
1050 | /// Combine higher half of `a` and `b`. The highwe half of `b` occupies the | |
1051 | /// lower half of result. | |
83c7162d XL |
1052 | /// |
1053 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehl_ps) | |
0531ce1d XL |
1054 | #[inline] |
1055 | #[target_feature(enable = "sse")] | |
0731742a | 1056 | #[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movhlps))] |
83c7162d | 1057 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1058 | pub unsafe fn _mm_movehl_ps(a: __m128, b: __m128) -> __m128 { |
1059 | // TODO; figure why this is a different instruction on Windows? | |
17df50a5 | 1060 | simd_shuffle4!(a, b, [6, 7, 2, 3]) |
0531ce1d XL |
1061 | } |
1062 | ||
1063 | /// Combine lower half of `a` and `b`. The lower half of `b` occupies the | |
1064 | /// higher half of result. | |
83c7162d XL |
1065 | /// |
1066 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movelh_ps) | |
0531ce1d XL |
1067 | #[inline] |
1068 | #[target_feature(enable = "sse")] | |
0731742a | 1069 | #[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movlhps))] |
83c7162d | 1070 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 1071 | pub unsafe fn _mm_movelh_ps(a: __m128, b: __m128) -> __m128 { |
17df50a5 | 1072 | simd_shuffle4!(a, b, [0, 1, 4, 5]) |
0531ce1d XL |
1073 | } |
1074 | ||
532ac7d7 | 1075 | /// Returns a mask of the most significant bit of each element in `a`. |
0531ce1d XL |
1076 | /// |
1077 | /// The mask is stored in the 4 least significant bits of the return value. | |
1078 | /// All other bits are set to `0`. | |
83c7162d XL |
1079 | /// |
1080 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_ps) | |
0531ce1d XL |
1081 | #[inline] |
1082 | #[target_feature(enable = "sse")] | |
e1599b0c XL |
1083 | // FIXME: LLVM9 trunk has the following bug: |
1084 | // https://github.com/rust-lang/stdarch/issues/794 | |
1085 | // so we only temporarily test this on i686 and x86_64 but not on i586: | |
1086 | #[cfg_attr(all(test, target_feature = "sse2"), assert_instr(movmskps))] | |
83c7162d | 1087 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1088 | pub unsafe fn _mm_movemask_ps(a: __m128) -> i32 { |
1089 | movmskps(a) | |
1090 | } | |
1091 | ||
0531ce1d XL |
1092 | /// Construct a `__m128` with the lowest element read from `p` and the other |
1093 | /// elements set to zero. | |
1094 | /// | |
1095 | /// This corresponds to instructions `VMOVSS` / `MOVSS`. | |
83c7162d XL |
1096 | /// |
1097 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ss) | |
0531ce1d XL |
1098 | #[inline] |
1099 | #[target_feature(enable = "sse")] | |
1100 | #[cfg_attr(test, assert_instr(movss))] | |
83c7162d | 1101 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1102 | pub unsafe fn _mm_load_ss(p: *const f32) -> __m128 { |
1103 | __m128(*p, 0.0, 0.0, 0.0) | |
1104 | } | |
1105 | ||
1106 | /// Construct a `__m128` by duplicating the value read from `p` into all | |
1107 | /// elements. | |
1108 | /// | |
1109 | /// This corresponds to instructions `VMOVSS` / `MOVSS` followed by some | |
1110 | /// shuffling. | |
83c7162d XL |
1111 | /// |
1112 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_ps) | |
0531ce1d XL |
1113 | #[inline] |
1114 | #[target_feature(enable = "sse")] | |
1115 | #[cfg_attr(test, assert_instr(movss))] | |
83c7162d | 1116 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1117 | pub unsafe fn _mm_load1_ps(p: *const f32) -> __m128 { |
1118 | let a = *p; | |
1119 | __m128(a, a, a, a) | |
1120 | } | |
1121 | ||
1122 | /// Alias for [`_mm_load1_ps`](fn._mm_load1_ps.html) | |
83c7162d XL |
1123 | /// |
1124 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1) | |
0531ce1d XL |
1125 | #[inline] |
1126 | #[target_feature(enable = "sse")] | |
1127 | #[cfg_attr(test, assert_instr(movss))] | |
83c7162d | 1128 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1129 | pub unsafe fn _mm_load_ps1(p: *const f32) -> __m128 { |
1130 | _mm_load1_ps(p) | |
1131 | } | |
1132 | ||
532ac7d7 | 1133 | /// Loads four `f32` values from *aligned* memory into a `__m128`. If the |
0531ce1d XL |
1134 | /// pointer is not aligned to a 128-bit boundary (16 bytes) a general |
1135 | /// protection fault will be triggered (fatal program crash). | |
1136 | /// | |
1137 | /// Use [`_mm_loadu_ps`](fn._mm_loadu_ps.html) for potentially unaligned | |
1138 | /// memory. | |
1139 | /// | |
1140 | /// This corresponds to instructions `VMOVAPS` / `MOVAPS`. | |
83c7162d XL |
1141 | /// |
1142 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps) | |
0531ce1d XL |
1143 | #[inline] |
1144 | #[target_feature(enable = "sse")] | |
1145 | #[cfg_attr(test, assert_instr(movaps))] | |
83c7162d | 1146 | #[stable(feature = "simd_x86", since = "1.27.0")] |
48663c56 | 1147 | #[allow(clippy::cast_ptr_alignment)] |
0531ce1d XL |
1148 | pub unsafe fn _mm_load_ps(p: *const f32) -> __m128 { |
1149 | *(p as *const __m128) | |
1150 | } | |
1151 | ||
532ac7d7 | 1152 | /// Loads four `f32` values from memory into a `__m128`. There are no |
0531ce1d XL |
1153 | /// restrictions |
1154 | /// on memory alignment. For aligned memory | |
1155 | /// [`_mm_load_ps`](fn._mm_load_ps.html) | |
1156 | /// may be faster. | |
1157 | /// | |
1158 | /// This corresponds to instructions `VMOVUPS` / `MOVUPS`. | |
83c7162d XL |
1159 | /// |
1160 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_ps) | |
0531ce1d XL |
1161 | #[inline] |
1162 | #[target_feature(enable = "sse")] | |
1163 | #[cfg_attr(test, assert_instr(movups))] | |
83c7162d | 1164 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1165 | pub unsafe fn _mm_loadu_ps(p: *const f32) -> __m128 { |
1166 | // Note: Using `*p` would require `f32` alignment, but `movups` has no | |
1167 | // alignment restrictions. | |
1168 | let mut dst = _mm_undefined_ps(); | |
1169 | ptr::copy_nonoverlapping( | |
1170 | p as *const u8, | |
1171 | &mut dst as *mut __m128 as *mut u8, | |
1172 | mem::size_of::<__m128>(), | |
1173 | ); | |
1174 | dst | |
1175 | } | |
1176 | ||
532ac7d7 | 1177 | /// Loads four `f32` values from aligned memory into a `__m128` in reverse |
0531ce1d XL |
1178 | /// order. |
1179 | /// | |
1180 | /// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general | |
1181 | /// protection fault will be triggered (fatal program crash). | |
1182 | /// | |
1183 | /// Functionally equivalent to the following code sequence (assuming `p` | |
1184 | /// satisfies the alignment restrictions): | |
1185 | /// | |
1186 | /// ```text | |
1187 | /// let a0 = *p; | |
1188 | /// let a1 = *p.offset(1); | |
1189 | /// let a2 = *p.offset(2); | |
1190 | /// let a3 = *p.offset(3); | |
1191 | /// __m128::new(a3, a2, a1, a0) | |
1192 | /// ``` | |
1193 | /// | |
1194 | /// This corresponds to instructions `VMOVAPS` / `MOVAPS` followed by some | |
1195 | /// shuffling. | |
83c7162d XL |
1196 | /// |
1197 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps) | |
0531ce1d XL |
1198 | #[inline] |
1199 | #[target_feature(enable = "sse")] | |
1200 | #[cfg_attr(test, assert_instr(movaps))] | |
83c7162d | 1201 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1202 | pub unsafe fn _mm_loadr_ps(p: *const f32) -> __m128 { |
1203 | let a = _mm_load_ps(p); | |
17df50a5 | 1204 | simd_shuffle4!(a, a, [3, 2, 1, 0]) |
0531ce1d XL |
1205 | } |
1206 | ||
3dfed10e XL |
/// Loads unaligned 64-bits of integer data from memory into new vector.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64)
#[inline]
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86_mm_loadu_si64", since = "1.46.0")]
pub unsafe fn _mm_loadu_si64(mem_addr: *const u8) -> __m128i {
    // Unaligned 64-bit read into the low lane; the high lane is zeroed.
    transmute(i64x2(ptr::read_unaligned(mem_addr as *const i64), 0))
}
1218 | ||
/// Stores the lowest 32 bit float of `a` into memory.
///
/// This intrinsic corresponds to the `MOVSS` instruction.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_store_ss(p: *mut f32, a: __m128) {
    // Extract lane 0 and write it through the (possibly unaligned) pointer.
    *p = simd_extract(a, 0);
}
1231 | ||
/// Stores the lowest 32 bit float of `a` repeated four times into *aligned*
/// memory.
///
/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
/// protection fault will be triggered (fatal program crash).
///
/// Functionally equivalent to the following code sequence (assuming `p`
/// satisfies the alignment restrictions):
///
/// ```text
/// let x = a.extract(0);
/// *p = x;
/// *p.offset(1) = x;
/// *p.offset(2) = x;
/// *p.offset(3) = x;
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store1_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
// The pointer cast below is sound because the intrinsic's contract requires
// 16-byte alignment of `p`.
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_store1_ps(p: *mut f32, a: __m128) {
    // Broadcast lane 0 into all four lanes, then do one aligned 128-bit store.
    let b: __m128 = simd_shuffle4!(a, a, [0, 0, 0, 0]);
    *(p as *mut __m128) = b;
}
1259 | ||
/// Alias for [`_mm_store1_ps`](fn._mm_store1_ps.html)
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps1)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_store_ps1(p: *mut f32, a: __m128) {
    // Intel defines both spellings; simply forward to the canonical one.
    _mm_store1_ps(p, a);
}
1270 | ||
/// Stores four 32-bit floats into *aligned* memory.
///
/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
/// protection fault will be triggered (fatal program crash).
///
/// Use [`_mm_storeu_ps`](fn._mm_storeu_ps.html) for potentially unaligned
/// memory.
///
/// This corresponds to instructions `VMOVAPS` / `MOVAPS`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
// Cast is sound: the intrinsic's contract requires 16-byte alignment of `p`.
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_store_ps(p: *mut f32, a: __m128) {
    *(p as *mut __m128) = a;
}
1290 | ||
/// Stores four 32-bit floats into memory. There are no restrictions on memory
/// alignment. For aligned memory [`_mm_store_ps`](fn._mm_store_ps.html) may be
/// faster.
///
/// This corresponds to instructions `VMOVUPS` / `MOVUPS`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movups))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_storeu_ps(p: *mut f32, a: __m128) {
    // A byte-wise copy sidesteps any alignment requirement on `p`.
    ptr::copy_nonoverlapping(
        &a as *const __m128 as *const u8,
        p as *mut u8,
        mem::size_of::<__m128>(),
    );
}
1309 | ||
/// Stores four 32-bit floats into *aligned* memory in reverse order.
///
/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
/// protection fault will be triggered (fatal program crash).
///
/// Functionally equivalent to the following code sequence (assuming `p`
/// satisfies the alignment restrictions):
///
/// ```text
/// *p = a.extract(3);
/// *p.offset(1) = a.extract(2);
/// *p.offset(2) = a.extract(1);
/// *p.offset(3) = a.extract(0);
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
// Cast is sound: the intrinsic's contract requires 16-byte alignment of `p`.
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_storer_ps(p: *mut f32, a: __m128) {
    // Reverse the lanes with a shuffle, then do one aligned 128-bit store.
    let b: __m128 = simd_shuffle4!(a, a, [3, 2, 1, 0]);
    *(p as *mut __m128) = b;
}
1335 | ||
/// Returns a `__m128` with the first component from `b` and the remaining
/// components from `a`.
///
/// In other words for any `a` and `b`:
/// ```text
/// _mm_move_ss(a, b) == a.replace(0, b.extract(0))
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_move_ss(a: __m128, b: __m128) -> __m128 {
    // Shuffle indices 0-3 select lanes of `a`, 4-7 select lanes of `b`, so
    // this picks `b[0]` followed by `a[1..4]`.
    simd_shuffle4!(a, b, [4, 1, 2, 3])
}
1352 | ||
/// Performs a serializing operation on all store-to-memory instructions that
/// were issued prior to this instruction.
///
/// Guarantees that every store instruction that precedes, in program order, is
/// globally visible before any store instruction which follows the fence in
/// program order.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sfence)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(sfence))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sfence() {
    // Thin wrapper over the `llvm.x86.sse.sfence` intrinsic declared below.
    sfence()
}
1368 | ||
/// Gets the unsigned 32-bit value of the MXCSR control and status register.
///
/// For more info see [`_mm_setcsr`](fn._mm_setcsr.html)
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getcsr)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(stmxcsr))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_getcsr() -> u32 {
    // `STMXCSR` writes MXCSR to memory, so read it through a stack slot.
    let mut result = 0_i32;
    stmxcsr((&mut result) as *mut _ as *mut i8);
    result as u32
}
1383 | ||
/// Sets the MXCSR register with the 32-bit unsigned integer value.
///
/// This register controls how SIMD instructions handle floating point
/// operations. Modifying this register only affects the current thread.
///
/// It contains several groups of flags:
///
/// * *Exception flags* report which exceptions occurred since last they were
/// reset.
///
/// * *Masking flags* can be used to mask (ignore) certain exceptions. By
/// default
/// these flags are all set to 1, so all exceptions are masked. When an
/// exception is masked, the processor simply sets the exception flag and
/// continues the operation. If the exception is unmasked, the flag is also set
/// but additionally an exception handler is invoked.
///
/// * *Rounding mode flags* control the rounding mode of floating point
/// instructions.
///
/// * The *denormals-are-zero mode flag* turns all numbers which would be
/// denormalized (exponent bits are all zeros) into zeros.
///
/// ## Exception Flags
///
/// * `_MM_EXCEPT_INVALID`: An invalid operation was performed (e.g., dividing
/// Infinity by Infinity).
///
/// * `_MM_EXCEPT_DENORM`: An operation attempted to operate on a denormalized
/// number. Mainly this can cause loss of precision.
///
/// * `_MM_EXCEPT_DIV_ZERO`: Division by zero occurred.
///
/// * `_MM_EXCEPT_OVERFLOW`: A numeric overflow exception occurred, i.e., a
/// result was too large to be represented (e.g., an `f32` with absolute
/// value
/// greater than `2^128`).
///
/// * `_MM_EXCEPT_UNDERFLOW`: A numeric underflow exception occurred, i.e., a
/// result was too small to be represented in a normalized way (e.g., an
/// `f32`
/// with absolute value smaller than `2^-126`.)
///
/// * `_MM_EXCEPT_INEXACT`: An inexact-result exception occurred (a.k.a.
/// precision exception). This means some precision was lost due to rounding.
/// For example, the fraction `1/3` cannot be represented accurately in a
/// 32 or 64 bit float and computing it would cause this exception to be
/// raised. Precision exceptions are very common, so they are usually masked.
///
/// Exception flags can be read and set using the convenience functions
/// `_MM_GET_EXCEPTION_STATE` and `_MM_SET_EXCEPTION_STATE`. For example, to
/// check if an operation caused some overflow:
///
/// ```rust,ignore
/// _MM_SET_EXCEPTION_STATE(0); // clear all exception flags
/// // perform calculations
/// if _MM_GET_EXCEPTION_STATE() & _MM_EXCEPT_OVERFLOW != 0 {
///     // handle overflow
/// }
/// ```
///
/// ## Masking Flags
///
/// There is one masking flag for each exception flag: `_MM_MASK_INVALID`,
/// `_MM_MASK_DENORM`, `_MM_MASK_DIV_ZERO`, `_MM_MASK_OVERFLOW`,
/// `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`.
///
/// A single masking bit can be set via
///
/// ```rust,ignore
/// _MM_SET_EXCEPTION_MASK(_MM_MASK_UNDERFLOW);
/// ```
///
/// However, since mask bits are by default all set to 1, it is more common to
/// want to *disable* certain bits. For example, to unmask the underflow
/// exception, use:
///
/// ```rust,ignore
/// _mm_setcsr(_mm_getcsr() & !_MM_MASK_UNDERFLOW); // unmask underflow
/// // exception
/// ```
///
/// Warning: an unmasked exception will cause an exception handler to be
/// called.
/// The standard handler will simply terminate the process. So, in this case
/// any underflow exception would terminate the current process with something
/// like `signal: 8, SIGFPE: erroneous arithmetic operation`.
///
/// ## Rounding Mode
///
/// The rounding mode is described using two bits. It can be read and set
/// using the convenience wrappers `_MM_GET_ROUNDING_MODE()` and
/// `_MM_SET_ROUNDING_MODE(mode)`.
///
/// The rounding modes are:
///
/// * `_MM_ROUND_NEAREST`: (default) Round to closest to the infinite precision
/// value. If two values are equally close, round to even (i.e., least
/// significant bit will be zero).
///
/// * `_MM_ROUND_DOWN`: Round toward negative Infinity.
///
/// * `_MM_ROUND_UP`: Round toward positive Infinity.
///
/// * `_MM_ROUND_TOWARD_ZERO`: Round towards zero (truncate).
///
/// Example:
///
/// ```rust,ignore
/// _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN)
/// ```
///
/// ## Denormals-are-zero/Flush-to-zero Mode
///
/// If this bit is set, values that would be denormalized will be set to zero
/// instead. This is turned off by default.
///
/// You can read and enable/disable this mode via the helper functions
/// `_MM_GET_FLUSH_ZERO_MODE()` and `_MM_SET_FLUSH_ZERO_MODE()`:
///
/// ```rust,ignore
/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF); // turn off (default)
/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // turn on
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setcsr)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ldmxcsr))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_setcsr(val: u32) {
    // `LDMXCSR` loads the 32-bit MXCSR value from memory, so pass the value
    // by address.
    ldmxcsr(&val as *const _ as *const i8);
}
1518 | ||
// MXCSR exception *status* flag bits (sticky; set by the hardware when the
// corresponding exception occurs).
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_INVALID: u32 = 0x0001;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_DENORM: u32 = 0x0002;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_DIV_ZERO: u32 = 0x0004;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_OVERFLOW: u32 = 0x0008;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_UNDERFLOW: u32 = 0x0010;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_INEXACT: u32 = 0x0020;
/// See [`_MM_GET_EXCEPTION_STATE`](fn._MM_GET_EXCEPTION_STATE.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_MASK: u32 = 0x003f;

// MXCSR exception *mask* bits (a set bit suppresses the exception handler
// for the matching exception).
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_INVALID: u32 = 0x0080;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_DENORM: u32 = 0x0100;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_DIV_ZERO: u32 = 0x0200;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_OVERFLOW: u32 = 0x0400;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_UNDERFLOW: u32 = 0x0800;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_INEXACT: u32 = 0x1000;
/// See [`_MM_GET_EXCEPTION_MASK`](fn._MM_GET_EXCEPTION_MASK.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_MASK: u32 = 0x1f80;

// MXCSR rounding-mode field (two bits).
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_NEAREST: u32 = 0x0000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_DOWN: u32 = 0x2000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_UP: u32 = 0x4000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_TOWARD_ZERO: u32 = 0x6000;

/// See [`_MM_GET_ROUNDING_MODE`](fn._MM_GET_ROUNDING_MODE.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_MASK: u32 = 0x6000;

// MXCSR flush-to-zero bit.
/// See [`_MM_GET_FLUSH_ZERO_MODE`](fn._MM_GET_FLUSH_ZERO_MODE.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FLUSH_ZERO_MASK: u32 = 0x8000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FLUSH_ZERO_ON: u32 = 0x8000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FLUSH_ZERO_OFF: u32 = 0x0000;
1589 | ||
1590 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) | |
83c7162d XL |
1591 | /// |
1592 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_EXCEPTION_MASK) | |
0531ce1d XL |
1593 | #[inline] |
1594 | #[allow(non_snake_case)] | |
1595 | #[target_feature(enable = "sse")] | |
83c7162d | 1596 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1597 | pub unsafe fn _MM_GET_EXCEPTION_MASK() -> u32 { |
1598 | _mm_getcsr() & _MM_MASK_MASK | |
1599 | } | |
1600 | ||
1601 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) | |
83c7162d XL |
1602 | /// |
1603 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_EXCEPTION_STATE) | |
0531ce1d XL |
1604 | #[inline] |
1605 | #[allow(non_snake_case)] | |
1606 | #[target_feature(enable = "sse")] | |
83c7162d | 1607 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1608 | pub unsafe fn _MM_GET_EXCEPTION_STATE() -> u32 { |
1609 | _mm_getcsr() & _MM_EXCEPT_MASK | |
1610 | } | |
1611 | ||
1612 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) | |
83c7162d XL |
1613 | /// |
1614 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_FLUSH_ZERO_MODE) | |
0531ce1d XL |
1615 | #[inline] |
1616 | #[allow(non_snake_case)] | |
1617 | #[target_feature(enable = "sse")] | |
83c7162d | 1618 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1619 | pub unsafe fn _MM_GET_FLUSH_ZERO_MODE() -> u32 { |
1620 | _mm_getcsr() & _MM_FLUSH_ZERO_MASK | |
1621 | } | |
1622 | ||
1623 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) | |
83c7162d XL |
1624 | /// |
1625 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_ROUNDING_MODE) | |
0531ce1d XL |
1626 | #[inline] |
1627 | #[allow(non_snake_case)] | |
1628 | #[target_feature(enable = "sse")] | |
83c7162d | 1629 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1630 | pub unsafe fn _MM_GET_ROUNDING_MODE() -> u32 { |
1631 | _mm_getcsr() & _MM_ROUND_MASK | |
1632 | } | |
1633 | ||
1634 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) | |
83c7162d XL |
1635 | /// |
1636 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_EXCEPTION_MASK) | |
0531ce1d XL |
1637 | #[inline] |
1638 | #[allow(non_snake_case)] | |
1639 | #[target_feature(enable = "sse")] | |
83c7162d | 1640 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1641 | pub unsafe fn _MM_SET_EXCEPTION_MASK(x: u32) { |
1642 | _mm_setcsr((_mm_getcsr() & !_MM_MASK_MASK) | x) | |
1643 | } | |
1644 | ||
1645 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) | |
83c7162d XL |
1646 | /// |
1647 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_EXCEPTION_STATE) | |
0531ce1d XL |
1648 | #[inline] |
1649 | #[allow(non_snake_case)] | |
1650 | #[target_feature(enable = "sse")] | |
83c7162d | 1651 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1652 | pub unsafe fn _MM_SET_EXCEPTION_STATE(x: u32) { |
1653 | _mm_setcsr((_mm_getcsr() & !_MM_EXCEPT_MASK) | x) | |
1654 | } | |
1655 | ||
1656 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) | |
83c7162d XL |
1657 | /// |
1658 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_FLUSH_ZERO_MODE) | |
0531ce1d XL |
1659 | #[inline] |
1660 | #[allow(non_snake_case)] | |
1661 | #[target_feature(enable = "sse")] | |
83c7162d | 1662 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1663 | pub unsafe fn _MM_SET_FLUSH_ZERO_MODE(x: u32) { |
1664 | let val = (_mm_getcsr() & !_MM_FLUSH_ZERO_MASK) | x; | |
1665 | // println!("setting csr={:x}", val); | |
1666 | _mm_setcsr(val) | |
1667 | } | |
1668 | ||
1669 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) | |
83c7162d XL |
1670 | /// |
1671 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_ROUNDING_MODE) | |
0531ce1d XL |
1672 | #[inline] |
1673 | #[allow(non_snake_case)] | |
1674 | #[target_feature(enable = "sse")] | |
83c7162d | 1675 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1676 | pub unsafe fn _MM_SET_ROUNDING_MODE(x: u32) { |
1677 | _mm_setcsr((_mm_getcsr() & !_MM_ROUND_MASK) | x) | |
1678 | } | |
1679 | ||
// Prefetch locality hints for `_mm_prefetch`. The encoding packs write
// intent into bit 2 and locality into the low two bits (see `_mm_prefetch`).
/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_T0: i32 = 3;

/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_T1: i32 = 2;

/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_T2: i32 = 1;

/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_NTA: i32 = 0;

/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_ET0: i32 = 7;

/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_ET1: i32 = 6;
1703 | ||
/// Fetch the cache line that contains address `p` using the given `STRATEGY`.
///
/// The `STRATEGY` must be one of:
///
/// * [`_MM_HINT_T0`](constant._MM_HINT_T0.html): Fetch into all levels of the
///   cache hierarchy.
///
/// * [`_MM_HINT_T1`](constant._MM_HINT_T1.html): Fetch into L2 and higher.
///
/// * [`_MM_HINT_T2`](constant._MM_HINT_T2.html): Fetch into L3 and higher or
///   an implementation-specific choice (e.g., L2 if there is no L3).
///
/// * [`_MM_HINT_NTA`](constant._MM_HINT_NTA.html): Fetch data using the
///   non-temporal access (NTA) hint. It may be a place closer than main memory
///   but outside of the cache hierarchy. This is used to reduce access latency
///   without polluting the cache.
///
/// * [`_MM_HINT_ET0`](constant._MM_HINT_ET0.html) and
///   [`_MM_HINT_ET1`](constant._MM_HINT_ET1.html) are similar to `_MM_HINT_T0`
///   and `_MM_HINT_T1` but indicate an anticipation to write to the address.
///
/// The actual implementation depends on the particular CPU. This instruction
/// is considered a hint, so the CPU is also free to simply ignore the request.
///
/// The amount of prefetched data depends on the cache line size of the
/// specific CPU, but it will be at least 32 bytes.
///
/// Common caveats:
///
/// * Most modern CPUs already automatically prefetch data based on predicted
///   access patterns.
///
/// * Data is usually not fetched if this would cause a TLB miss or a page
///   fault.
///
/// * Too much prefetching can cause unnecessary cache evictions.
///
/// * Prefetching may also fail if there are not enough memory-subsystem
///   resources (e.g., request buffers).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_prefetch)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(prefetcht0, STRATEGY = _MM_HINT_T0))]
#[cfg_attr(test, assert_instr(prefetcht1, STRATEGY = _MM_HINT_T1))]
#[cfg_attr(test, assert_instr(prefetcht2, STRATEGY = _MM_HINT_T2))]
#[cfg_attr(test, assert_instr(prefetchnta, STRATEGY = _MM_HINT_NTA))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_prefetch<const STRATEGY: i32>(p: *const i8) {
    // We use the `llvm.prefetch` intrinsic with `cache type` = 1 (data cache).
    // `locality` and `rw` are based on our `STRATEGY`: bit 2 of `STRATEGY`
    // carries the write-intent flag (the `ET*` hints) and the low two bits
    // carry the locality level.
    prefetch(p, (STRATEGY >> 2) & 1, STRATEGY & 3, 1);
}
1759 | ||
/// Returns vector of type __m128 with undefined elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps)
#[inline]
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_undefined_ps() -> __m128 {
    // Although the elements are nominally "undefined", this implementation
    // deliberately returns all zeros rather than uninitialized memory.
    _mm_set1_ps(0.0)
}
1769 | ||
/// Transpose the 4x4 matrix formed by 4 rows of __m128 in place.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_TRANSPOSE4_PS)
#[inline]
#[allow(non_snake_case)]
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _MM_TRANSPOSE4_PS(
    row0: &mut __m128,
    row1: &mut __m128,
    row2: &mut __m128,
    row3: &mut __m128,
) {
    // Standard unpack/move transpose sequence: first interleave the low and
    // high halves of each row pair...
    let tmp0 = _mm_unpacklo_ps(*row0, *row1);
    let tmp2 = _mm_unpacklo_ps(*row2, *row3);
    let tmp1 = _mm_unpackhi_ps(*row0, *row1);
    let tmp3 = _mm_unpackhi_ps(*row2, *row3);

    // ...then recombine the 64-bit halves into the transposed rows. All
    // temporaries are computed before any row is overwritten, so the updates
    // below cannot clobber inputs still in use.
    *row0 = _mm_movelh_ps(tmp0, tmp2);
    *row1 = _mm_movehl_ps(tmp2, tmp0);
    *row2 = _mm_movelh_ps(tmp1, tmp3);
    *row3 = _mm_movehl_ps(tmp3, tmp1);
}
1793 | ||
// Declarations of the LLVM intrinsics that back the SSE wrappers in this
// module. The `link_name` strings are LLVM intrinsic names and must not be
// altered.
#[allow(improper_ctypes)]
extern "C" {
    #[link_name = "llvm.x86.sse.add.ss"]
    fn addss(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.sub.ss"]
    fn subss(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.mul.ss"]
    fn mulss(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.div.ss"]
    fn divss(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.sqrt.ss"]
    fn sqrtss(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.sqrt.ps"]
    fn sqrtps(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.rcp.ss"]
    fn rcpss(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.rcp.ps"]
    fn rcpps(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.rsqrt.ss"]
    fn rsqrtss(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.rsqrt.ps"]
    fn rsqrtps(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.min.ss"]
    fn minss(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.min.ps"]
    fn minps(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.max.ss"]
    fn maxss(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.max.ps"]
    fn maxps(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.movmsk.ps"]
    fn movmskps(a: __m128) -> i32;
    #[link_name = "llvm.x86.sse.cmp.ps"]
    fn cmpps(a: __m128, b: __m128, imm8: i8) -> __m128;
    #[link_name = "llvm.x86.sse.comieq.ss"]
    fn comieq_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.comilt.ss"]
    fn comilt_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.comile.ss"]
    fn comile_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.comigt.ss"]
    fn comigt_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.comige.ss"]
    fn comige_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.comineq.ss"]
    fn comineq_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomieq.ss"]
    fn ucomieq_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomilt.ss"]
    fn ucomilt_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomile.ss"]
    fn ucomile_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomigt.ss"]
    fn ucomigt_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomige.ss"]
    fn ucomige_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomineq.ss"]
    fn ucomineq_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.cvtss2si"]
    fn cvtss2si(a: __m128) -> i32;
    #[link_name = "llvm.x86.sse.cvttss2si"]
    fn cvttss2si(a: __m128) -> i32;
    #[link_name = "llvm.x86.sse.cvtsi2ss"]
    fn cvtsi2ss(a: __m128, b: i32) -> __m128;
    #[link_name = "llvm.x86.sse.sfence"]
    fn sfence();
    #[link_name = "llvm.x86.sse.stmxcsr"]
    fn stmxcsr(p: *mut i8);
    #[link_name = "llvm.x86.sse.ldmxcsr"]
    fn ldmxcsr(p: *const i8);
    #[link_name = "llvm.prefetch"]
    fn prefetch(p: *const i8, rw: i32, loc: i32, ty: i32);
    #[link_name = "llvm.x86.sse.cmp.ss"]
    fn cmpss(a: __m128, b: __m128, imm8: i8) -> __m128;
}
1869 | ||
/// Stores `a` into the memory at `mem_addr` using a non-temporal memory hint.
///
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
/// exception _may_ be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movntps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: __m128) {
    // `nontemporal_store` lowers to `movntps`: the 16-byte vector is written
    // around the cache hierarchy, which is why the 16-byte alignment
    // requirement above must be honored by the caller.
    intrinsics::nontemporal_store(mem_addr as *mut __m128, a);
}
1884 | ||
0531ce1d XL |
1885 | #[cfg(test)] |
1886 | mod tests { | |
48663c56 XL |
1887 | use crate::{hint::black_box, mem::transmute}; |
1888 | use std::{boxed, f32::NAN}; | |
416331ca | 1889 | use stdarch_test::simd_test; |
0531ce1d | 1890 | |
532ac7d7 | 1891 | use crate::core_arch::{simd::*, x86::*}; |
0531ce1d | 1892 | |
    // Packed add: all four lanes are summed.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_add_ps() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_add_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(-101.0, 25.0, 0.0, -15.0));
    }

    // Scalar add: only the lowest lane is summed, upper lanes copied from `a`.
    // Note this test uses `_mm_set_ps` (reversed argument order), so the
    // lowest lane holds -10.0 and -5.0.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_add_ss() {
        let a = _mm_set_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_set_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_add_ss(a, b);
        assert_eq_m128(r, _mm_set_ps(-1.0, 5.0, 0.0, -15.0));
    }

    // Packed subtract: all four lanes compute `a - b`.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_sub_ps() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_sub_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(99.0, -15.0, 0.0, -5.0));
    }

    // Scalar subtract: lowest lane is `a - b`, upper lanes copied from `a`.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_sub_ss() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_sub_ss(a, b);
        assert_eq_m128(r, _mm_setr_ps(99.0, 5.0, 0.0, -10.0));
    }

    // Packed multiply: all four lanes compute `a * b`.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_mul_ps() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_mul_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(100.0, 100.0, 0.0, 50.0));
    }

    // Scalar multiply: lowest lane is `a * b`, upper lanes copied from `a`.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_mul_ss() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_mul_ss(a, b);
        assert_eq_m128(r, _mm_setr_ps(100.0, 5.0, 0.0, -10.0));
    }

    // Packed divide: all four lanes compute `a / b`; values chosen so every
    // quotient is exactly representable.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_div_ps() {
        let a = _mm_setr_ps(-1.0, 5.0, 2.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.2, -5.0);
        let r = _mm_div_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(0.01, 0.25, 10.0, 2.0));
    }

    // Scalar divide: lowest lane is `a / b`, upper lanes copied from `a`.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_div_ss() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_div_ss(a, b);
        assert_eq_m128(r, _mm_setr_ps(0.01, 5.0, 0.0, -10.0));
    }
1956 | ||
    // Scalar square root: lowest lane only, upper lanes copied from `a`.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_sqrt_ss() {
        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
        let r = _mm_sqrt_ss(a);
        let e = _mm_setr_ps(2.0, 13.0, 16.0, 100.0);
        assert_eq_m128(r, e);
    }

    // Packed square root of all four lanes.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_sqrt_ps() {
        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
        let r = _mm_sqrt_ps(a);
        let e = _mm_setr_ps(2.0, 3.6055512, 4.0, 10.0);
        assert_eq_m128(r, e);
    }

    // Scalar reciprocal approximation (RCPSS).
    // NOTE(review): this compares *exactly* against one approximation value;
    // the rcp result is an ISA-level approximation, so this expectation may
    // be specific to the hardware the test was written on — confirm.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_rcp_ss() {
        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
        let r = _mm_rcp_ss(a);
        let e = _mm_setr_ps(0.24993896, 13.0, 16.0, 100.0);
        assert_eq_m128(r, e);
    }

    // Packed reciprocal approximation, compared with a relative tolerance
    // since RCPPS only guarantees an approximate result.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_rcp_ps() {
        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
        let r = _mm_rcp_ps(a);
        let e = _mm_setr_ps(0.24993896, 0.0769043, 0.06248474, 0.0099983215);
        let rel_err = 0.00048828125; // 2^-11
        for i in 0..4 {
            assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err);
        }
    }

    // Scalar reciprocal-square-root approximation, lowest lane only.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_rsqrt_ss() {
        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
        let r = _mm_rsqrt_ss(a);
        let e = _mm_setr_ps(0.49987793, 13.0, 16.0, 100.0);
        let rel_err = 0.00048828125; // 2^-11
        for i in 0..4 {
            assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err);
        }
    }

    // Packed reciprocal-square-root approximation with relative tolerance.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_rsqrt_ps() {
        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
        let r = _mm_rsqrt_ps(a);
        let e = _mm_setr_ps(0.49987793, 0.2772827, 0.24993896, 0.099990845);
        let rel_err = 0.00048828125; // 2^-11
        for i in 0..4 {
            assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err);
        }
    }
2013 | ||
    // Scalar min: lowest lane is min(a0, b0), upper lanes copied from `a`.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_min_ss() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_min_ss(a, b);
        assert_eq_m128(r, _mm_setr_ps(-100.0, 5.0, 0.0, -10.0));
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_min_ps() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_min_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(-100.0, 5.0, 0.0, -10.0));

        // `_mm_min_ps` can **not** be implemented with the `simd_min` rust
        // intrinsic: that lowers to `llvm.minnum.v*`, which does not specify
        // how -0.0 is handled and behaves differently from the x86 `minps`
        // instruction. `minps` returns its *second* operand when the inputs
        // compare equal, so min(-0.0, 0.0) must be bit-identical to the
        // second argument — which is what the two calls below check.
        let a = _mm_setr_ps(-0.0, 0.0, 0.0, 0.0);
        let b = _mm_setr_ps(0.0, 0.0, 0.0, 0.0);
        let r1: [u8; 16] = transmute(_mm_min_ps(a, b));
        let r2: [u8; 16] = transmute(_mm_min_ps(b, a));
        let a: [u8; 16] = transmute(a);
        let b: [u8; 16] = transmute(b);
        assert_eq!(r1, b);
        assert_eq!(r2, a);
        assert_ne!(a, b); // sanity check that -0.0 is actually present
    }

    // Scalar max: lowest lane is max(a0, b0), upper lanes copied from `a`.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_max_ss() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_max_ss(a, b);
        assert_eq_m128(r, _mm_setr_ps(-1.0, 5.0, 0.0, -10.0));
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_max_ps() {
        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
        let r = _mm_max_ps(a, b);
        assert_eq_m128(r, _mm_setr_ps(-1.0, 20.0, 0.0, -5.0));
    }
2060 | ||
    // Bitwise AND on the raw 128-bit pattern; `black_box` keeps the constant
    // inputs from being folded away so the instruction itself is exercised.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_and_ps() {
        let a = transmute(u32x4::splat(0b0011));
        let b = transmute(u32x4::splat(0b0101));
        let r = _mm_and_ps(*black_box(&a), *black_box(&b));
        let e = transmute(u32x4::splat(0b0001));
        assert_eq_m128(r, e);
    }

    // Bitwise AND-NOT: `(!a) & b` per the `andnps` semantics.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_andnot_ps() {
        let a = transmute(u32x4::splat(0b0011));
        let b = transmute(u32x4::splat(0b0101));
        let r = _mm_andnot_ps(*black_box(&a), *black_box(&b));
        let e = transmute(u32x4::splat(0b0100));
        assert_eq_m128(r, e);
    }

    // Bitwise OR on the raw 128-bit pattern.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_or_ps() {
        let a = transmute(u32x4::splat(0b0011));
        let b = transmute(u32x4::splat(0b0101));
        let r = _mm_or_ps(*black_box(&a), *black_box(&b));
        let e = transmute(u32x4::splat(0b0111));
        assert_eq_m128(r, e);
    }

    // Bitwise XOR on the raw 128-bit pattern.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_xor_ps() {
        let a = transmute(u32x4::splat(0b0011));
        let b = transmute(u32x4::splat(0b0101));
        let r = _mm_xor_ps(*black_box(&a), *black_box(&b));
        let e = transmute(u32x4::splat(0b0110));
        assert_eq_m128(r, e);
    }
2096 | ||
    // Scalar compares produce an all-ones (true) or all-zeros (false) mask in
    // the lowest lane and copy the upper lanes from `a`; the results are
    // inspected as `u32x4` bit patterns.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpeq_ss() {
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(-1.0, 5.0, 6.0, 7.0);
        let r: u32x4 = transmute(_mm_cmpeq_ss(a, b));
        let e: u32x4 = transmute(_mm_setr_ps(transmute(0u32), 2.0, 3.0, 4.0));
        assert_eq!(r, e);

        let b2 = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
        let r2: u32x4 = transmute(_mm_cmpeq_ss(a, b2));
        let e2: u32x4 = transmute(_mm_setr_ps(transmute(0xffffffffu32), 2.0, 3.0, 4.0));
        assert_eq!(r2, e2);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmplt_ss() {
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);

        let b1 = 0u32; // a.extract(0) < b.extract(0)
        let c1 = 0u32; // a.extract(0) < c.extract(0)
        let d1 = !0u32; // a.extract(0) < d.extract(0)

        let rb: u32x4 = transmute(_mm_cmplt_ss(a, b));
        let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
        assert_eq!(rb, eb);

        let rc: u32x4 = transmute(_mm_cmplt_ss(a, c));
        let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
        assert_eq!(rc, ec);

        let rd: u32x4 = transmute(_mm_cmplt_ss(a, d));
        let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
        assert_eq!(rd, ed);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmple_ss() {
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);

        let b1 = 0u32; // a.extract(0) <= b.extract(0)
        let c1 = !0u32; // a.extract(0) <= c.extract(0)
        let d1 = !0u32; // a.extract(0) <= d.extract(0)

        let rb: u32x4 = transmute(_mm_cmple_ss(a, b));
        let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
        assert_eq!(rb, eb);

        let rc: u32x4 = transmute(_mm_cmple_ss(a, c));
        let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
        assert_eq!(rc, ec);

        let rd: u32x4 = transmute(_mm_cmple_ss(a, d));
        let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
        assert_eq!(rd, ed);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpgt_ss() {
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);

        let b1 = !0u32; // a.extract(0) > b.extract(0)
        let c1 = 0u32; // a.extract(0) > c.extract(0)
        let d1 = 0u32; // a.extract(0) > d.extract(0)

        let rb: u32x4 = transmute(_mm_cmpgt_ss(a, b));
        let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
        assert_eq!(rb, eb);

        let rc: u32x4 = transmute(_mm_cmpgt_ss(a, c));
        let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
        assert_eq!(rc, ec);

        let rd: u32x4 = transmute(_mm_cmpgt_ss(a, d));
        let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
        assert_eq!(rd, ed);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpge_ss() {
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);

        let b1 = !0u32; // a.extract(0) >= b.extract(0)
        let c1 = !0u32; // a.extract(0) >= c.extract(0)
        let d1 = 0u32; // a.extract(0) >= d.extract(0)

        let rb: u32x4 = transmute(_mm_cmpge_ss(a, b));
        let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
        assert_eq!(rb, eb);

        let rc: u32x4 = transmute(_mm_cmpge_ss(a, c));
        let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
        assert_eq!(rc, ec);

        let rd: u32x4 = transmute(_mm_cmpge_ss(a, d));
        let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
        assert_eq!(rd, ed);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpneq_ss() {
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);

        let b1 = !0u32; // a.extract(0) != b.extract(0)
        let c1 = 0u32; // a.extract(0) != c.extract(0)
        let d1 = !0u32; // a.extract(0) != d.extract(0)

        let rb: u32x4 = transmute(_mm_cmpneq_ss(a, b));
        let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
        assert_eq!(rb, eb);

        let rc: u32x4 = transmute(_mm_cmpneq_ss(a, c));
        let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
        assert_eq!(rc, ec);

        let rd: u32x4 = transmute(_mm_cmpneq_ss(a, d));
        let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
        assert_eq!(rd, ed);
    }
2230 | ||
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpnlt_ss() {
        // Same inputs and expectations as `test_mm_cmpge_ss`: NLT ("not less
        // than") and GE agree on ordered operands and differ only when one
        // operand is NaN (NLT is true for unordered, GE is false).
        // TODO: add NaN cases to distinguish the two predicates.

        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);

        let b1 = !0u32; // a.extract(0) >= b.extract(0)
        let c1 = !0u32; // a.extract(0) >= c.extract(0)
        let d1 = 0u32; // a.extract(0) >= d.extract(0)

        let rb: u32x4 = transmute(_mm_cmpnlt_ss(a, b));
        let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
        assert_eq!(rb, eb);

        let rc: u32x4 = transmute(_mm_cmpnlt_ss(a, c));
        let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
        assert_eq!(rc, ec);

        let rd: u32x4 = transmute(_mm_cmpnlt_ss(a, d));
        let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
        assert_eq!(rd, ed);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpnle_ss() {
        // Same inputs and expectations as `test_mm_cmpgt_ss`: NLE and GT agree
        // on ordered operands and differ only for NaN (NLE is true for
        // unordered, GT is false). TODO: add NaN cases.

        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);

        let b1 = !0u32; // a.extract(0) > b.extract(0)
        let c1 = 0u32; // a.extract(0) > c.extract(0)
        let d1 = 0u32; // a.extract(0) > d.extract(0)

        let rb: u32x4 = transmute(_mm_cmpnle_ss(a, b));
        let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
        assert_eq!(rb, eb);

        let rc: u32x4 = transmute(_mm_cmpnle_ss(a, c));
        let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
        assert_eq!(rc, ec);

        let rd: u32x4 = transmute(_mm_cmpnle_ss(a, d));
        let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
        assert_eq!(rd, ed);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpngt_ss() {
        // Same inputs and expectations as `test_mm_cmple_ss`: NGT and LE agree
        // on ordered operands and differ only for NaN (NGT is true for
        // unordered, LE is false). TODO: add NaN cases.

        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);

        let b1 = 0u32; // a.extract(0) <= b.extract(0)
        let c1 = !0u32; // a.extract(0) <= c.extract(0)
        let d1 = !0u32; // a.extract(0) <= d.extract(0)

        let rb: u32x4 = transmute(_mm_cmpngt_ss(a, b));
        let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
        assert_eq!(rb, eb);

        let rc: u32x4 = transmute(_mm_cmpngt_ss(a, c));
        let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
        assert_eq!(rc, ec);

        let rd: u32x4 = transmute(_mm_cmpngt_ss(a, d));
        let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
        assert_eq!(rd, ed);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpnge_ss() {
        // Same inputs and expectations as `test_mm_cmplt_ss`: NGE and LT agree
        // on ordered operands and differ only for NaN (NGE is true for
        // unordered, LT is false). TODO: add NaN cases.

        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);

        let b1 = 0u32; // a.extract(0) < b.extract(0)
        let c1 = 0u32; // a.extract(0) < c.extract(0)
        let d1 = !0u32; // a.extract(0) < d.extract(0)

        let rb: u32x4 = transmute(_mm_cmpnge_ss(a, b));
        let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
        assert_eq!(rb, eb);

        let rc: u32x4 = transmute(_mm_cmpnge_ss(a, c));
        let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
        assert_eq!(rc, ec);

        let rd: u32x4 = transmute(_mm_cmpnge_ss(a, d));
        let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
        assert_eq!(rd, ed);
    }

    // "Ordered": true iff neither operand is NaN.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpord_ss() {
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
        let c = _mm_setr_ps(NAN, 5.0, 6.0, 7.0);
        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);

        let b1 = !0u32; // a.extract(0) ord b.extract(0)
        let c1 = 0u32; // a.extract(0) ord c.extract(0)
        let d1 = !0u32; // a.extract(0) ord d.extract(0)

        let rb: u32x4 = transmute(_mm_cmpord_ss(a, b));
        let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
        assert_eq!(rb, eb);

        let rc: u32x4 = transmute(_mm_cmpord_ss(a, c));
        let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
        assert_eq!(rc, ec);

        let rd: u32x4 = transmute(_mm_cmpord_ss(a, d));
        let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
        assert_eq!(rd, ed);
    }

    // "Unordered": true iff at least one operand is NaN.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpunord_ss() {
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
        let c = _mm_setr_ps(NAN, 5.0, 6.0, 7.0);
        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);

        let b1 = 0u32; // a.extract(0) unord b.extract(0)
        let c1 = !0u32; // a.extract(0) unord c.extract(0)
        let d1 = 0u32; // a.extract(0) unord d.extract(0)

        let rb: u32x4 = transmute(_mm_cmpunord_ss(a, b));
        let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0));
        assert_eq!(rb, eb);

        let rc: u32x4 = transmute(_mm_cmpunord_ss(a, c));
        let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0));
        assert_eq!(rc, ec);

        let rd: u32x4 = transmute(_mm_cmpunord_ss(a, d));
        let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0));
        assert_eq!(rd, ed);
    }
2394 | ||
    // Packed compares: each lane yields an all-ones (tru) or all-zeros (fls)
    // mask. Any lane with a NaN operand is false for the ordered predicates
    // and true for NEQ / the N* (negated) predicates / UNORD.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpeq_ps() {
        let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
        let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
        let tru = !0u32;
        let fls = 0u32;

        let e = u32x4::new(fls, fls, tru, fls);
        let r: u32x4 = transmute(_mm_cmpeq_ps(a, b));
        assert_eq!(r, e);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmplt_ps() {
        let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
        let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
        let tru = !0u32;
        let fls = 0u32;

        let e = u32x4::new(tru, fls, fls, fls);
        let r: u32x4 = transmute(_mm_cmplt_ps(a, b));
        assert_eq!(r, e);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmple_ps() {
        let a = _mm_setr_ps(10.0, 50.0, 1.0, 4.0);
        let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
        let tru = !0u32;
        let fls = 0u32;

        let e = u32x4::new(tru, fls, tru, fls);
        let r: u32x4 = transmute(_mm_cmple_ps(a, b));
        assert_eq!(r, e);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpgt_ps() {
        let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
        let b = _mm_setr_ps(15.0, 20.0, 1.0, 42.0);
        let tru = !0u32;
        let fls = 0u32;

        let e = u32x4::new(fls, tru, fls, fls);
        let r: u32x4 = transmute(_mm_cmpgt_ps(a, b));
        assert_eq!(r, e);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpge_ps() {
        let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
        let b = _mm_setr_ps(15.0, 20.0, 1.0, 42.0);
        let tru = !0u32;
        let fls = 0u32;

        let e = u32x4::new(fls, tru, tru, fls);
        let r: u32x4 = transmute(_mm_cmpge_ps(a, b));
        assert_eq!(r, e);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpneq_ps() {
        let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
        let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
        let tru = !0u32;
        let fls = 0u32;

        let e = u32x4::new(tru, tru, fls, tru);
        let r: u32x4 = transmute(_mm_cmpneq_ps(a, b));
        assert_eq!(r, e);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpnlt_ps() {
        let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
        let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
        let tru = !0u32;
        let fls = 0u32;

        let e = u32x4::new(fls, tru, tru, tru);
        let r: u32x4 = transmute(_mm_cmpnlt_ps(a, b));
        assert_eq!(r, e);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpnle_ps() {
        let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
        let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
        let tru = !0u32;
        let fls = 0u32;

        let e = u32x4::new(fls, tru, fls, tru);
        let r: u32x4 = transmute(_mm_cmpnle_ps(a, b));
        assert_eq!(r, e);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpngt_ps() {
        let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
        let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
        let tru = !0u32;
        let fls = 0u32;

        let e = u32x4::new(tru, fls, tru, tru);
        let r: u32x4 = transmute(_mm_cmpngt_ps(a, b));
        assert_eq!(r, e);
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpnge_ps() {
        let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
        let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
        let tru = !0u32;
        let fls = 0u32;

        let e = u32x4::new(tru, fls, fls, tru);
        let r: u32x4 = transmute(_mm_cmpnge_ps(a, b));
        assert_eq!(r, e);
    }

    // Per-lane "ordered": true iff neither operand in the lane is NaN.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpord_ps() {
        let a = _mm_setr_ps(10.0, 50.0, NAN, NAN);
        let b = _mm_setr_ps(15.0, NAN, 1.0, NAN);
        let tru = !0u32;
        let fls = 0u32;

        let e = u32x4::new(tru, fls, fls, fls);
        let r: u32x4 = transmute(_mm_cmpord_ps(a, b));
        assert_eq!(r, e);
    }

    // Per-lane "unordered": true iff at least one operand in the lane is NaN.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_cmpunord_ps() {
        let a = _mm_setr_ps(10.0, 50.0, NAN, NAN);
        let b = _mm_setr_ps(15.0, NAN, 1.0, NAN);
        let tru = !0u32;
        let fls = 0u32;

        let e = u32x4::new(fls, tru, tru, tru);
        let r: u32x4 = transmute(_mm_cmpunord_ps(a, b));
        assert_eq!(r, e);
    }
2538 | ||
    // COMISS-family tests: each intrinsic compares only the lowest lane and
    // returns 0 or 1. The fourth pair (NAN vs NAN) checks the unordered case,
    // which yields 0 for these predicates.
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_comieq_ss() {
        let aa = &[3.0f32, 12.0, 23.0, NAN];
        let bb = &[3.0f32, 47.5, 1.5, NAN];

        let ee = &[1i32, 0, 0, 0];

        for i in 0..4 {
            let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
            let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);

            let r = _mm_comieq_ss(a, b);

            assert_eq!(
                ee[i], r,
                "_mm_comieq_ss({:?}, {:?}) = {}, expected: {} (i={})",
                a, b, r, ee[i], i
            );
        }
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_comilt_ss() {
        let aa = &[3.0f32, 12.0, 23.0, NAN];
        let bb = &[3.0f32, 47.5, 1.5, NAN];

        let ee = &[0i32, 1, 0, 0];

        for i in 0..4 {
            let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
            let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);

            let r = _mm_comilt_ss(a, b);

            assert_eq!(
                ee[i], r,
                "_mm_comilt_ss({:?}, {:?}) = {}, expected: {} (i={})",
                a, b, r, ee[i], i
            );
        }
    }

    #[simd_test(enable = "sse")]
    unsafe fn test_mm_comile_ss() {
        let aa = &[3.0f32, 12.0, 23.0, NAN];
        let bb = &[3.0f32, 47.5, 1.5, NAN];

        let ee = &[1i32, 1, 0, 0];

        for i in 0..4 {
            let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
            let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);

            let r = _mm_comile_ss(a, b);

            assert_eq!(
                ee[i], r,
                "_mm_comile_ss({:?}, {:?}) = {}, expected: {} (i={})",
                a, b, r, ee[i], i
            );
        }
    }
2601 | ||
83c7162d | 2602 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2603 | unsafe fn test_mm_comigt_ss() { |
2604 | let aa = &[3.0f32, 12.0, 23.0, NAN]; | |
2605 | let bb = &[3.0f32, 47.5, 1.5, NAN]; | |
2606 | ||
2607 | let ee = &[1i32, 0, 1, 0]; | |
2608 | ||
2609 | for i in 0..4 { | |
2610 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); | |
2611 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); | |
2612 | ||
2613 | let r = _mm_comige_ss(a, b); | |
2614 | ||
2615 | assert_eq!( | |
2616 | ee[i], r, | |
2617 | "_mm_comige_ss({:?}, {:?}) = {}, expected: {} (i={})", | |
2618 | a, b, r, ee[i], i | |
2619 | ); | |
2620 | } | |
2621 | } | |
2622 | ||
83c7162d | 2623 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2624 | unsafe fn test_mm_comineq_ss() { |
2625 | let aa = &[3.0f32, 12.0, 23.0, NAN]; | |
2626 | let bb = &[3.0f32, 47.5, 1.5, NAN]; | |
2627 | ||
2628 | let ee = &[0i32, 1, 1, 1]; | |
2629 | ||
2630 | for i in 0..4 { | |
2631 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); | |
2632 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); | |
2633 | ||
2634 | let r = _mm_comineq_ss(a, b); | |
2635 | ||
2636 | assert_eq!( | |
2637 | ee[i], r, | |
2638 | "_mm_comineq_ss({:?}, {:?}) = {}, expected: {} (i={})", | |
2639 | a, b, r, ee[i], i | |
2640 | ); | |
2641 | } | |
2642 | } | |
2643 | ||
83c7162d | 2644 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2645 | unsafe fn test_mm_ucomieq_ss() { |
2646 | let aa = &[3.0f32, 12.0, 23.0, NAN]; | |
2647 | let bb = &[3.0f32, 47.5, 1.5, NAN]; | |
2648 | ||
2649 | let ee = &[1i32, 0, 0, 0]; | |
2650 | ||
2651 | for i in 0..4 { | |
2652 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); | |
2653 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); | |
2654 | ||
2655 | let r = _mm_ucomieq_ss(a, b); | |
2656 | ||
2657 | assert_eq!( | |
2658 | ee[i], r, | |
2659 | "_mm_ucomieq_ss({:?}, {:?}) = {}, expected: {} (i={})", | |
2660 | a, b, r, ee[i], i | |
2661 | ); | |
2662 | } | |
2663 | } | |
2664 | ||
83c7162d | 2665 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2666 | unsafe fn test_mm_ucomilt_ss() { |
2667 | let aa = &[3.0f32, 12.0, 23.0, NAN]; | |
2668 | let bb = &[3.0f32, 47.5, 1.5, NAN]; | |
2669 | ||
2670 | let ee = &[0i32, 1, 0, 0]; | |
2671 | ||
2672 | for i in 0..4 { | |
2673 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); | |
2674 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); | |
2675 | ||
2676 | let r = _mm_ucomilt_ss(a, b); | |
2677 | ||
2678 | assert_eq!( | |
2679 | ee[i], r, | |
2680 | "_mm_ucomilt_ss({:?}, {:?}) = {}, expected: {} (i={})", | |
2681 | a, b, r, ee[i], i | |
2682 | ); | |
2683 | } | |
2684 | } | |
2685 | ||
83c7162d | 2686 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2687 | unsafe fn test_mm_ucomile_ss() { |
2688 | let aa = &[3.0f32, 12.0, 23.0, NAN]; | |
2689 | let bb = &[3.0f32, 47.5, 1.5, NAN]; | |
2690 | ||
2691 | let ee = &[1i32, 1, 0, 0]; | |
2692 | ||
2693 | for i in 0..4 { | |
2694 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); | |
2695 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); | |
2696 | ||
2697 | let r = _mm_ucomile_ss(a, b); | |
2698 | ||
2699 | assert_eq!( | |
2700 | ee[i], r, | |
2701 | "_mm_ucomile_ss({:?}, {:?}) = {}, expected: {} (i={})", | |
2702 | a, b, r, ee[i], i | |
2703 | ); | |
2704 | } | |
2705 | } | |
2706 | ||
83c7162d | 2707 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2708 | unsafe fn test_mm_ucomigt_ss() { |
2709 | let aa = &[3.0f32, 12.0, 23.0, NAN]; | |
2710 | let bb = &[3.0f32, 47.5, 1.5, NAN]; | |
2711 | ||
2712 | let ee = &[0i32, 0, 1, 0]; | |
2713 | ||
2714 | for i in 0..4 { | |
2715 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); | |
2716 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); | |
2717 | ||
2718 | let r = _mm_ucomigt_ss(a, b); | |
2719 | ||
2720 | assert_eq!( | |
2721 | ee[i], r, | |
2722 | "_mm_ucomigt_ss({:?}, {:?}) = {}, expected: {} (i={})", | |
2723 | a, b, r, ee[i], i | |
2724 | ); | |
2725 | } | |
2726 | } | |
2727 | ||
83c7162d | 2728 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2729 | unsafe fn test_mm_ucomige_ss() { |
2730 | let aa = &[3.0f32, 12.0, 23.0, NAN]; | |
2731 | let bb = &[3.0f32, 47.5, 1.5, NAN]; | |
2732 | ||
2733 | let ee = &[1i32, 0, 1, 0]; | |
2734 | ||
2735 | for i in 0..4 { | |
2736 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); | |
2737 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); | |
2738 | ||
2739 | let r = _mm_ucomige_ss(a, b); | |
2740 | ||
2741 | assert_eq!( | |
2742 | ee[i], r, | |
2743 | "_mm_ucomige_ss({:?}, {:?}) = {}, expected: {} (i={})", | |
2744 | a, b, r, ee[i], i | |
2745 | ); | |
2746 | } | |
2747 | } | |
2748 | ||
83c7162d | 2749 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2750 | unsafe fn test_mm_ucomineq_ss() { |
2751 | let aa = &[3.0f32, 12.0, 23.0, NAN]; | |
2752 | let bb = &[3.0f32, 47.5, 1.5, NAN]; | |
2753 | ||
2754 | let ee = &[0i32, 1, 1, 1]; | |
2755 | ||
2756 | for i in 0..4 { | |
2757 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); | |
2758 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); | |
2759 | ||
2760 | let r = _mm_ucomineq_ss(a, b); | |
2761 | ||
2762 | assert_eq!( | |
2763 | ee[i], r, | |
2764 | "_mm_ucomineq_ss({:?}, {:?}) = {}, expected: {} (i={})", | |
2765 | a, b, r, ee[i], i | |
2766 | ); | |
2767 | } | |
2768 | } | |
2769 | ||
83c7162d | 2770 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2771 | unsafe fn test_mm_comieq_ss_vs_ucomieq_ss() { |
2772 | // If one of the arguments is a quiet NaN `comieq_ss` should signal an | |
2773 | // Invalid Operation Exception while `ucomieq_ss` should not. | |
2774 | let aa = &[3.0f32, NAN, 23.0, NAN]; | |
2775 | let bb = &[3.0f32, 47.5, NAN, NAN]; | |
2776 | ||
2777 | let ee = &[1i32, 0, 0, 0]; | |
2778 | let exc = &[0u32, 1, 1, 1]; // Should comieq_ss signal an exception? | |
2779 | ||
2780 | for i in 0..4 { | |
2781 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); | |
2782 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); | |
2783 | ||
2784 | _MM_SET_EXCEPTION_STATE(0); | |
2785 | let r1 = _mm_comieq_ss(*black_box(&a), b); | |
2786 | let s1 = _MM_GET_EXCEPTION_STATE(); | |
2787 | ||
2788 | _MM_SET_EXCEPTION_STATE(0); | |
2789 | let r2 = _mm_ucomieq_ss(*black_box(&a), b); | |
2790 | let s2 = _MM_GET_EXCEPTION_STATE(); | |
2791 | ||
2792 | assert_eq!( | |
2793 | ee[i], r1, | |
2794 | "_mm_comeq_ss({:?}, {:?}) = {}, expected: {} (i={})", | |
2795 | a, b, r1, ee[i], i | |
2796 | ); | |
2797 | assert_eq!( | |
2798 | ee[i], r2, | |
2799 | "_mm_ucomeq_ss({:?}, {:?}) = {}, expected: {} (i={})", | |
2800 | a, b, r2, ee[i], i | |
2801 | ); | |
2802 | assert_eq!( | |
2803 | s1, | |
2804 | exc[i] * _MM_EXCEPT_INVALID, | |
2805 | "_mm_comieq_ss() set exception flags: {} (i={})", | |
2806 | s1, | |
2807 | i | |
2808 | ); | |
2809 | assert_eq!( | |
2810 | s2, | |
2811 | 0, // ucomieq_ss should not signal an exception | |
2812 | "_mm_ucomieq_ss() set exception flags: {} (i={})", | |
2813 | s2, | |
2814 | i | |
2815 | ); | |
2816 | } | |
2817 | } | |
2818 | ||
83c7162d | 2819 | #[simd_test(enable = "sse")] |
0531ce1d | 2820 | unsafe fn test_mm_cvtss_si32() { |
8faf50e0 | 2821 | let inputs = &[42.0f32, -3.1, 4.0e10, 4.0e-20, NAN, 2147483500.1]; |
ba9703b0 | 2822 | let result = &[42i32, -3, i32::MIN, 0, i32::MIN, 2147483520]; |
0531ce1d XL |
2823 | for i in 0..inputs.len() { |
2824 | let x = _mm_setr_ps(inputs[i], 1.0, 3.0, 4.0); | |
2825 | let e = result[i]; | |
2826 | let r = _mm_cvtss_si32(x); | |
2827 | assert_eq!( | |
2828 | e, r, | |
2829 | "TestCase #{} _mm_cvtss_si32({:?}) = {}, expected: {}", | |
2830 | i, x, r, e | |
2831 | ); | |
2832 | } | |
2833 | } | |
2834 | ||
83c7162d | 2835 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2836 | unsafe fn test_mm_cvttss_si32() { |
2837 | let inputs = &[ | |
2838 | (42.0f32, 42i32), | |
2839 | (-31.4, -31), | |
2840 | (-33.5, -33), | |
2841 | (-34.5, -34), | |
2842 | (10.999, 10), | |
2843 | (-5.99, -5), | |
ba9703b0 | 2844 | (4.0e10, i32::MIN), |
0531ce1d | 2845 | (4.0e-10, 0), |
ba9703b0 | 2846 | (NAN, i32::MIN), |
0531ce1d XL |
2847 | (2147483500.1, 2147483520), |
2848 | ]; | |
2849 | for i in 0..inputs.len() { | |
2850 | let (xi, e) = inputs[i]; | |
2851 | let x = _mm_setr_ps(xi, 1.0, 3.0, 4.0); | |
2852 | let r = _mm_cvttss_si32(x); | |
2853 | assert_eq!( | |
2854 | e, r, | |
2855 | "TestCase #{} _mm_cvttss_si32({:?}) = {}, expected: {}", | |
2856 | i, x, r, e | |
2857 | ); | |
2858 | } | |
2859 | } | |
2860 | ||
83c7162d | 2861 | #[simd_test(enable = "sse")] |
e1599b0c | 2862 | unsafe fn test_mm_cvtsi32_ss() { |
0531ce1d XL |
2863 | let inputs = &[ |
2864 | (4555i32, 4555.0f32), | |
2865 | (322223333, 322223330.0), | |
2866 | (-432, -432.0), | |
2867 | (-322223333, -322223330.0), | |
2868 | ]; | |
2869 | ||
2870 | for i in 0..inputs.len() { | |
2871 | let (x, f) = inputs[i]; | |
2872 | let a = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); | |
2873 | let r = _mm_cvtsi32_ss(a, x); | |
2874 | let e = _mm_setr_ps(f, 6.0, 7.0, 8.0); | |
2875 | assert_eq_m128(e, r); | |
2876 | } | |
2877 | } | |
2878 | ||
83c7162d | 2879 | #[simd_test(enable = "sse")] |
e1599b0c | 2880 | unsafe fn test_mm_cvtss_f32() { |
0531ce1d XL |
2881 | let a = _mm_setr_ps(312.0134, 5.0, 6.0, 7.0); |
2882 | assert_eq!(_mm_cvtss_f32(a), 312.0134); | |
2883 | } | |
2884 | ||
83c7162d | 2885 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2886 | unsafe fn test_mm_set_ss() { |
2887 | let r = _mm_set_ss(black_box(4.25)); | |
2888 | assert_eq_m128(r, _mm_setr_ps(4.25, 0.0, 0.0, 0.0)); | |
2889 | } | |
2890 | ||
83c7162d | 2891 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2892 | unsafe fn test_mm_set1_ps() { |
2893 | let r1 = _mm_set1_ps(black_box(4.25)); | |
2894 | let r2 = _mm_set_ps1(black_box(4.25)); | |
2895 | assert_eq!(get_m128(r1, 0), 4.25); | |
2896 | assert_eq!(get_m128(r1, 1), 4.25); | |
2897 | assert_eq!(get_m128(r1, 2), 4.25); | |
2898 | assert_eq!(get_m128(r1, 3), 4.25); | |
2899 | assert_eq!(get_m128(r2, 0), 4.25); | |
2900 | assert_eq!(get_m128(r2, 1), 4.25); | |
2901 | assert_eq!(get_m128(r2, 2), 4.25); | |
2902 | assert_eq!(get_m128(r2, 3), 4.25); | |
2903 | } | |
2904 | ||
83c7162d | 2905 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2906 | unsafe fn test_mm_set_ps() { |
2907 | let r = _mm_set_ps( | |
2908 | black_box(1.0), | |
2909 | black_box(2.0), | |
2910 | black_box(3.0), | |
2911 | black_box(4.0), | |
2912 | ); | |
2913 | assert_eq!(get_m128(r, 0), 4.0); | |
2914 | assert_eq!(get_m128(r, 1), 3.0); | |
2915 | assert_eq!(get_m128(r, 2), 2.0); | |
2916 | assert_eq!(get_m128(r, 3), 1.0); | |
2917 | } | |
2918 | ||
83c7162d | 2919 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2920 | unsafe fn test_mm_setr_ps() { |
2921 | let r = _mm_setr_ps( | |
2922 | black_box(1.0), | |
2923 | black_box(2.0), | |
2924 | black_box(3.0), | |
2925 | black_box(4.0), | |
2926 | ); | |
2927 | assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 3.0, 4.0)); | |
2928 | } | |
2929 | ||
83c7162d | 2930 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2931 | unsafe fn test_mm_setzero_ps() { |
2932 | let r = *black_box(&_mm_setzero_ps()); | |
2933 | assert_eq_m128(r, _mm_set1_ps(0.0)); | |
2934 | } | |
2935 | ||
8faf50e0 XL |
2936 | #[simd_test(enable = "sse")] |
2937 | unsafe fn test_mm_shuffle() { | |
2938 | assert_eq!(_MM_SHUFFLE(0, 1, 1, 3), 0b00_01_01_11); | |
2939 | assert_eq!(_MM_SHUFFLE(3, 1, 1, 0), 0b11_01_01_00); | |
2940 | assert_eq!(_MM_SHUFFLE(1, 2, 2, 1), 0b01_10_10_01); | |
2941 | } | |
2942 | ||
83c7162d | 2943 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2944 | unsafe fn test_mm_shuffle_ps() { |
2945 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); | |
2946 | let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); | |
17df50a5 | 2947 | let r = _mm_shuffle_ps::<0b00_01_01_11>(a, b); |
0531ce1d XL |
2948 | assert_eq_m128(r, _mm_setr_ps(4.0, 2.0, 6.0, 5.0)); |
2949 | } | |
2950 | ||
83c7162d | 2951 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2952 | unsafe fn test_mm_unpackhi_ps() { |
2953 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); | |
2954 | let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); | |
2955 | let r = _mm_unpackhi_ps(a, b); | |
2956 | assert_eq_m128(r, _mm_setr_ps(3.0, 7.0, 4.0, 8.0)); | |
2957 | } | |
2958 | ||
83c7162d | 2959 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2960 | unsafe fn test_mm_unpacklo_ps() { |
2961 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); | |
2962 | let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); | |
2963 | let r = _mm_unpacklo_ps(a, b); | |
2964 | assert_eq_m128(r, _mm_setr_ps(1.0, 5.0, 2.0, 6.0)); | |
2965 | } | |
2966 | ||
83c7162d | 2967 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2968 | unsafe fn test_mm_movehl_ps() { |
2969 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); | |
2970 | let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); | |
2971 | let r = _mm_movehl_ps(a, b); | |
2972 | assert_eq_m128(r, _mm_setr_ps(7.0, 8.0, 3.0, 4.0)); | |
2973 | } | |
2974 | ||
83c7162d | 2975 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2976 | unsafe fn test_mm_movelh_ps() { |
2977 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); | |
2978 | let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); | |
2979 | let r = _mm_movelh_ps(a, b); | |
2980 | assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 5.0, 6.0)); | |
2981 | } | |
2982 | ||
83c7162d | 2983 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2984 | unsafe fn test_mm_load_ss() { |
2985 | let a = 42.0f32; | |
2986 | let r = _mm_load_ss(&a as *const f32); | |
2987 | assert_eq_m128(r, _mm_setr_ps(42.0, 0.0, 0.0, 0.0)); | |
2988 | } | |
2989 | ||
83c7162d | 2990 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2991 | unsafe fn test_mm_load1_ps() { |
2992 | let a = 42.0f32; | |
2993 | let r = _mm_load1_ps(&a as *const f32); | |
2994 | assert_eq_m128(r, _mm_setr_ps(42.0, 42.0, 42.0, 42.0)); | |
2995 | } | |
2996 | ||
83c7162d | 2997 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2998 | unsafe fn test_mm_load_ps() { |
2999 | let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; | |
3000 | ||
3001 | let mut p = vals.as_ptr(); | |
3002 | let mut fixup = 0.0f32; | |
3003 | ||
3004 | // Make sure p is aligned, otherwise we might get a | |
3005 | // (signal: 11, SIGSEGV: invalid memory reference) | |
3006 | ||
3007 | let unalignment = (p as usize) & 0xf; | |
3008 | if unalignment != 0 { | |
3009 | let delta = ((16 - unalignment) >> 2) as isize; | |
3010 | fixup = delta as f32; | |
3011 | p = p.offset(delta); | |
3012 | } | |
3013 | ||
3014 | let r = _mm_load_ps(p); | |
0731742a | 3015 | let e = _mm_add_ps(_mm_setr_ps(1.0, 2.0, 3.0, 4.0), _mm_set1_ps(fixup)); |
0531ce1d XL |
3016 | assert_eq_m128(r, e); |
3017 | } | |
3018 | ||
83c7162d | 3019 | #[simd_test(enable = "sse")] |
0531ce1d XL |
3020 | unsafe fn test_mm_loadu_ps() { |
3021 | let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; | |
3022 | let p = vals.as_ptr().offset(3); | |
3023 | let r = _mm_loadu_ps(black_box(p)); | |
3024 | assert_eq_m128(r, _mm_setr_ps(4.0, 5.0, 6.0, 7.0)); | |
3025 | } | |
3026 | ||
83c7162d | 3027 | #[simd_test(enable = "sse")] |
0531ce1d XL |
3028 | unsafe fn test_mm_loadr_ps() { |
3029 | let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; | |
3030 | ||
3031 | let mut p = vals.as_ptr(); | |
3032 | let mut fixup = 0.0f32; | |
3033 | ||
3034 | // Make sure p is aligned, otherwise we might get a | |
3035 | // (signal: 11, SIGSEGV: invalid memory reference) | |
3036 | ||
3037 | let unalignment = (p as usize) & 0xf; | |
3038 | if unalignment != 0 { | |
3039 | let delta = ((16 - unalignment) >> 2) as isize; | |
3040 | fixup = delta as f32; | |
3041 | p = p.offset(delta); | |
3042 | } | |
3043 | ||
3044 | let r = _mm_loadr_ps(p); | |
0731742a | 3045 | let e = _mm_add_ps(_mm_setr_ps(4.0, 3.0, 2.0, 1.0), _mm_set1_ps(fixup)); |
0531ce1d XL |
3046 | assert_eq_m128(r, e); |
3047 | } | |
3048 | ||
3dfed10e XL |
3049 | #[simd_test(enable = "sse2")] |
3050 | unsafe fn test_mm_loadu_si64() { | |
3051 | let a = _mm_setr_epi64x(5, 6); | |
3052 | let r = _mm_loadu_si64(&a as *const _ as *const _); | |
cdc7bbd5 | 3053 | assert_eq_m128i(r, _mm_setr_epi64x(5, 0)); |
3dfed10e XL |
3054 | } |
3055 | ||
83c7162d | 3056 | #[simd_test(enable = "sse")] |
0531ce1d XL |
3057 | unsafe fn test_mm_store_ss() { |
3058 | let mut vals = [0.0f32; 8]; | |
3059 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); | |
3060 | _mm_store_ss(vals.as_mut_ptr().offset(1), a); | |
3061 | ||
3062 | assert_eq!(vals[0], 0.0); | |
3063 | assert_eq!(vals[1], 1.0); | |
3064 | assert_eq!(vals[2], 0.0); | |
3065 | } | |
3066 | ||
83c7162d | 3067 | #[simd_test(enable = "sse")] |
0531ce1d XL |
3068 | unsafe fn test_mm_store1_ps() { |
3069 | let mut vals = [0.0f32; 8]; | |
3070 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); | |
3071 | ||
3072 | let mut ofs = 0; | |
3073 | let mut p = vals.as_mut_ptr(); | |
3074 | ||
3075 | if (p as usize) & 0xf != 0 { | |
fc512014 XL |
3076 | ofs = ((16 - (p as usize)) & 0xf) >> 2; |
3077 | p = p.add(ofs); | |
0531ce1d XL |
3078 | } |
3079 | ||
3080 | _mm_store1_ps(p, *black_box(&a)); | |
3081 | ||
3082 | if ofs > 0 { | |
3083 | assert_eq!(vals[ofs - 1], 0.0); | |
3084 | } | |
3085 | assert_eq!(vals[ofs + 0], 1.0); | |
3086 | assert_eq!(vals[ofs + 1], 1.0); | |
3087 | assert_eq!(vals[ofs + 2], 1.0); | |
3088 | assert_eq!(vals[ofs + 3], 1.0); | |
3089 | assert_eq!(vals[ofs + 4], 0.0); | |
3090 | } | |
3091 | ||
83c7162d | 3092 | #[simd_test(enable = "sse")] |
0531ce1d XL |
3093 | unsafe fn test_mm_store_ps() { |
3094 | let mut vals = [0.0f32; 8]; | |
3095 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); | |
3096 | ||
3097 | let mut ofs = 0; | |
3098 | let mut p = vals.as_mut_ptr(); | |
3099 | ||
3100 | // Align p to 16-byte boundary | |
3101 | if (p as usize) & 0xf != 0 { | |
fc512014 XL |
3102 | ofs = ((16 - (p as usize)) & 0xf) >> 2; |
3103 | p = p.add(ofs); | |
0531ce1d XL |
3104 | } |
3105 | ||
3106 | _mm_store_ps(p, *black_box(&a)); | |
3107 | ||
3108 | if ofs > 0 { | |
3109 | assert_eq!(vals[ofs - 1], 0.0); | |
3110 | } | |
3111 | assert_eq!(vals[ofs + 0], 1.0); | |
3112 | assert_eq!(vals[ofs + 1], 2.0); | |
3113 | assert_eq!(vals[ofs + 2], 3.0); | |
3114 | assert_eq!(vals[ofs + 3], 4.0); | |
3115 | assert_eq!(vals[ofs + 4], 0.0); | |
3116 | } | |
3117 | ||
83c7162d | 3118 | #[simd_test(enable = "sse")] |
0531ce1d XL |
3119 | unsafe fn test_mm_storer_ps() { |
3120 | let mut vals = [0.0f32; 8]; | |
3121 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); | |
3122 | ||
3123 | let mut ofs = 0; | |
3124 | let mut p = vals.as_mut_ptr(); | |
3125 | ||
3126 | // Align p to 16-byte boundary | |
3127 | if (p as usize) & 0xf != 0 { | |
fc512014 XL |
3128 | ofs = ((16 - (p as usize)) & 0xf) >> 2; |
3129 | p = p.add(ofs); | |
0531ce1d XL |
3130 | } |
3131 | ||
3132 | _mm_storer_ps(p, *black_box(&a)); | |
3133 | ||
3134 | if ofs > 0 { | |
3135 | assert_eq!(vals[ofs - 1], 0.0); | |
3136 | } | |
3137 | assert_eq!(vals[ofs + 0], 4.0); | |
3138 | assert_eq!(vals[ofs + 1], 3.0); | |
3139 | assert_eq!(vals[ofs + 2], 2.0); | |
3140 | assert_eq!(vals[ofs + 3], 1.0); | |
3141 | assert_eq!(vals[ofs + 4], 0.0); | |
3142 | } | |
3143 | ||
83c7162d | 3144 | #[simd_test(enable = "sse")] |
0531ce1d XL |
3145 | unsafe fn test_mm_storeu_ps() { |
3146 | let mut vals = [0.0f32; 8]; | |
3147 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); | |
3148 | ||
3149 | let mut ofs = 0; | |
3150 | let mut p = vals.as_mut_ptr(); | |
3151 | ||
532ac7d7 | 3152 | // Make sure p is **not** aligned to 16-byte boundary |
0531ce1d XL |
3153 | if (p as usize) & 0xf == 0 { |
3154 | ofs = 1; | |
3155 | p = p.offset(1); | |
3156 | } | |
3157 | ||
3158 | _mm_storeu_ps(p, *black_box(&a)); | |
3159 | ||
3160 | if ofs > 0 { | |
3161 | assert_eq!(vals[ofs - 1], 0.0); | |
3162 | } | |
3163 | assert_eq!(vals[ofs + 0], 1.0); | |
3164 | assert_eq!(vals[ofs + 1], 2.0); | |
3165 | assert_eq!(vals[ofs + 2], 3.0); | |
3166 | assert_eq!(vals[ofs + 3], 4.0); | |
3167 | assert_eq!(vals[ofs + 4], 0.0); | |
3168 | } | |
3169 | ||
83c7162d | 3170 | #[simd_test(enable = "sse")] |
0531ce1d XL |
3171 | unsafe fn test_mm_move_ss() { |
3172 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); | |
3173 | let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); | |
3174 | ||
3175 | let r = _mm_move_ss(a, b); | |
3176 | let e = _mm_setr_ps(5.0, 2.0, 3.0, 4.0); | |
3177 | assert_eq_m128(e, r); | |
3178 | } | |
3179 | ||
83c7162d | 3180 | #[simd_test(enable = "sse")] |
0531ce1d XL |
3181 | unsafe fn test_mm_movemask_ps() { |
3182 | let r = _mm_movemask_ps(_mm_setr_ps(-1.0, 5.0, -5.0, 0.0)); | |
3183 | assert_eq!(r, 0b0101); | |
3184 | ||
3185 | let r = _mm_movemask_ps(_mm_setr_ps(-1.0, -5.0, -5.0, 0.0)); | |
3186 | assert_eq!(r, 0b0111); | |
3187 | } | |
3188 | ||
83c7162d | 3189 | #[simd_test(enable = "sse")] |
0531ce1d XL |
3190 | unsafe fn test_mm_sfence() { |
3191 | _mm_sfence(); | |
3192 | } | |
3193 | ||
83c7162d | 3194 | #[simd_test(enable = "sse")] |
0531ce1d XL |
3195 | unsafe fn test_mm_getcsr_setcsr_1() { |
3196 | let saved_csr = _mm_getcsr(); | |
3197 | ||
3198 | let a = _mm_setr_ps(1.1e-36, 0.0, 0.0, 1.0); | |
3199 | let b = _mm_setr_ps(0.001, 0.0, 0.0, 1.0); | |
3200 | ||
3201 | _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); | |
3202 | let r = _mm_mul_ps(*black_box(&a), *black_box(&b)); | |
3203 | ||
3204 | _mm_setcsr(saved_csr); | |
3205 | ||
3206 | let exp = _mm_setr_ps(0.0, 0.0, 0.0, 1.0); | |
3207 | assert_eq_m128(r, exp); // first component is a denormalized f32 | |
3208 | } | |
3209 | ||
83c7162d | 3210 | #[simd_test(enable = "sse")] |
0531ce1d XL |
3211 | unsafe fn test_mm_getcsr_setcsr_2() { |
3212 | // Same as _mm_setcsr_1 test, but with opposite flag value. | |
3213 | ||
3214 | let saved_csr = _mm_getcsr(); | |
3215 | ||
3216 | let a = _mm_setr_ps(1.1e-36, 0.0, 0.0, 1.0); | |
3217 | let b = _mm_setr_ps(0.001, 0.0, 0.0, 1.0); | |
3218 | ||
3219 | _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF); | |
3220 | let r = _mm_mul_ps(*black_box(&a), *black_box(&b)); | |
3221 | ||
3222 | _mm_setcsr(saved_csr); | |
3223 | ||
3224 | let exp = _mm_setr_ps(1.1e-39, 0.0, 0.0, 1.0); | |
3225 | assert_eq_m128(r, exp); // first component is a denormalized f32 | |
3226 | } | |
3227 | ||
83c7162d | 3228 | #[simd_test(enable = "sse")] |
0531ce1d XL |
3229 | unsafe fn test_mm_getcsr_setcsr_underflow() { |
3230 | _MM_SET_EXCEPTION_STATE(0); | |
3231 | ||
3232 | let a = _mm_setr_ps(1.1e-36, 0.0, 0.0, 1.0); | |
3233 | let b = _mm_setr_ps(1e-5, 0.0, 0.0, 1.0); | |
3234 | ||
3235 | assert_eq!(_MM_GET_EXCEPTION_STATE(), 0); // just to be sure | |
3236 | ||
3237 | let r = _mm_mul_ps(*black_box(&a), *black_box(&b)); | |
3238 | ||
3239 | let exp = _mm_setr_ps(1.1e-41, 0.0, 0.0, 1.0); | |
3240 | assert_eq_m128(r, exp); | |
3241 | ||
3242 | let underflow = _MM_GET_EXCEPTION_STATE() & _MM_EXCEPT_UNDERFLOW != 0; | |
3243 | assert_eq!(underflow, true); | |
3244 | } | |
3245 | ||
83c7162d | 3246 | #[simd_test(enable = "sse")] |
0531ce1d XL |
3247 | unsafe fn test_MM_TRANSPOSE4_PS() { |
3248 | let mut a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); | |
3249 | let mut b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); | |
3250 | let mut c = _mm_setr_ps(9.0, 10.0, 11.0, 12.0); | |
3251 | let mut d = _mm_setr_ps(13.0, 14.0, 15.0, 16.0); | |
3252 | ||
3253 | _MM_TRANSPOSE4_PS(&mut a, &mut b, &mut c, &mut d); | |
3254 | ||
3255 | assert_eq_m128(a, _mm_setr_ps(1.0, 5.0, 9.0, 13.0)); | |
3256 | assert_eq_m128(b, _mm_setr_ps(2.0, 6.0, 10.0, 14.0)); | |
3257 | assert_eq_m128(c, _mm_setr_ps(3.0, 7.0, 11.0, 15.0)); | |
3258 | assert_eq_m128(d, _mm_setr_ps(4.0, 8.0, 12.0, 16.0)); | |
3259 | } | |
3260 | ||
3261 | #[repr(align(16))] | |
3262 | struct Memory { | |
3263 | pub data: [f32; 4], | |
3264 | } | |
3265 | ||
83c7162d | 3266 | #[simd_test(enable = "sse")] |
0531ce1d XL |
3267 | unsafe fn test_mm_stream_ps() { |
3268 | let a = _mm_set1_ps(7.0); | |
8faf50e0 | 3269 | let mut mem = Memory { data: [-1.0; 4] }; |
0531ce1d XL |
3270 | |
3271 | _mm_stream_ps(&mut mem.data[0] as *mut f32, a); | |
3272 | for i in 0..4 { | |
3273 | assert_eq!(mem.data[i], get_m128(a, i)); | |
3274 | } | |
3275 | } | |
0531ce1d | 3276 | } |