]>
Commit | Line | Data |
---|---|---|
0531ce1d XL |
1 | //! Streaming SIMD Extensions (SSE) |
2 | ||
532ac7d7 XL |
3 | use crate::{ |
4 | core_arch::{simd::*, simd_llvm::*, x86::*}, | |
5 | intrinsics, mem, ptr, | |
6 | }; | |
0531ce1d XL |
7 | |
8 | #[cfg(test)] | |
416331ca | 9 | use stdarch_test::assert_instr; |
0531ce1d XL |
10 | |
/// Adds the first component of `a` and `b`, the other components are copied
/// from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(addss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_add_ss(a: __m128, b: __m128) -> __m128 {
    addss(a, b)
}
22 | ||
/// Adds __m128 vectors (element-wise addition of all four lanes).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(addps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_add_ps(a: __m128, b: __m128) -> __m128 {
    simd_add(a, b)
}
33 | ||
/// Subtracts the first component of `b` from `a`, the other components are
/// copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(subss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sub_ss(a: __m128, b: __m128) -> __m128 {
    subss(a, b)
}
45 | ||
/// Subtracts __m128 vectors (element-wise; each lane computes `a - b`).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(subps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sub_ps(a: __m128, b: __m128) -> __m128 {
    simd_sub(a, b)
}
56 | ||
/// Multiplies the first component of `a` and `b`, the other components are
/// copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(mulss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mul_ss(a: __m128, b: __m128) -> __m128 {
    mulss(a, b)
}
68 | ||
/// Multiplies __m128 vectors (element-wise multiplication of all four lanes).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(mulps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mul_ps(a: __m128, b: __m128) -> __m128 {
    simd_mul(a, b)
}
79 | ||
/// Divides the first component of `a` by `b`, the other components are
/// copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(divss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_div_ss(a: __m128, b: __m128) -> __m128 {
    divss(a, b)
}
91 | ||
/// Divides __m128 vectors (element-wise; each lane computes `a / b`).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(divps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_div_ps(a: __m128, b: __m128) -> __m128 {
    simd_div(a, b)
}
102 | ||
/// Returns the square root of the first single-precision (32-bit)
/// floating-point element in `a`, the other elements are unchanged.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(sqrtss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sqrt_ss(a: __m128) -> __m128 {
    sqrtss(a)
}
114 | ||
/// Returns the square root of packed single-precision (32-bit) floating-point
/// elements in `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(sqrtps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sqrt_ps(a: __m128) -> __m128 {
    sqrtps(a)
}
126 | ||
/// Returns the approximate reciprocal of the first single-precision
/// (32-bit) floating-point element in `a`, the other elements are unchanged.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rcpss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_rcp_ss(a: __m128) -> __m128 {
    rcpss(a)
}
138 | ||
/// Returns the approximate reciprocal of packed single-precision (32-bit)
/// floating-point elements in `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rcpps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_rcp_ps(a: __m128) -> __m128 {
    rcpps(a)
}
150 | ||
/// Returns the approximate reciprocal square root of the first single-precision
/// (32-bit) floating-point element in `a`, the other elements are unchanged.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rsqrtss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_rsqrt_ss(a: __m128) -> __m128 {
    rsqrtss(a)
}
162 | ||
/// Returns the approximate reciprocal square root of packed single-precision
/// (32-bit) floating-point elements in `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rsqrtps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_rsqrt_ps(a: __m128) -> __m128 {
    rsqrtps(a)
}
174 | ||
/// Compares the first single-precision (32-bit) floating-point element of `a`
/// and `b`, and return the minimum value in the first element of the return
/// value, the other elements are copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(minss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_ss(a: __m128, b: __m128) -> __m128 {
    minss(a, b)
}
187 | ||
/// Compares packed single-precision (32-bit) floating-point elements in `a` and
/// `b`, and return the corresponding minimum values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(minps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_ps(a: __m128, b: __m128) -> __m128 {
    // Deliberately uses the hardware intrinsic rather than `simd_fmin`.
    // See the `test_mm_min_ps` test why this can't be implemented using `simd_fmin`.
    minps(a, b)
}
200 | ||
/// Compares the first single-precision (32-bit) floating-point element of `a`
/// and `b`, and return the maximum value in the first element of the return
/// value, the other elements are copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(maxss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_ss(a: __m128, b: __m128) -> __m128 {
    maxss(a, b)
}
213 | ||
/// Compares packed single-precision (32-bit) floating-point elements in `a` and
/// `b`, and return the corresponding maximum values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(maxps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_ps(a: __m128, b: __m128) -> __m128 {
    // Deliberately uses the hardware intrinsic rather than `simd_fmax`.
    // See the `test_mm_min_ps` test why this can't be implemented using `simd_fmax`.
    maxps(a, b)
}
226 | ||
/// Bitwise AND of packed single-precision (32-bit) floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_ps)
#[inline]
#[target_feature(enable = "sse")]
// i586 only seems to generate plain `and` instructions, so ignore it.
#[cfg_attr(
    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
    assert_instr(andps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_and_ps(a: __m128, b: __m128) -> __m128 {
    // Round-trip through the integer vector type: the bit operation is
    // performed on the raw 128-bit value, not on float lanes.
    let a: __m128i = mem::transmute(a);
    let b: __m128i = mem::transmute(b);
    mem::transmute(simd_and(a, b))
}
243 | ||
/// Bitwise AND-NOT of packed single-precision (32-bit) floating-point
/// elements.
///
/// Computes `!a & b` for each bit in `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_ps)
#[inline]
#[target_feature(enable = "sse")]
// i586 only seems to generate plain `not` and `and` instructions, so ignore
// it.
#[cfg_attr(
    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
    assert_instr(andnps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_andnot_ps(a: __m128, b: __m128) -> __m128 {
    let a: __m128i = mem::transmute(a);
    let b: __m128i = mem::transmute(b);
    // `!a` is expressed as `a XOR all-ones` since there is no vector NOT.
    let mask: __m128i = mem::transmute(i32x4::splat(-1));
    mem::transmute(simd_and(simd_xor(mask, a), b))
}
265 | ||
/// Bitwise OR of packed single-precision (32-bit) floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_or_ps)
#[inline]
#[target_feature(enable = "sse")]
// i586 only seems to generate plain `or` instructions, so we ignore it.
#[cfg_attr(
    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
    assert_instr(orps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_or_ps(a: __m128, b: __m128) -> __m128 {
    // Bit operation is done on the raw 128-bit value via the integer type.
    let a: __m128i = mem::transmute(a);
    let b: __m128i = mem::transmute(b);
    mem::transmute(simd_or(a, b))
}
282 | ||
/// Bitwise exclusive OR of packed single-precision (32-bit) floating-point
/// elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_ps)
#[inline]
#[target_feature(enable = "sse")]
// i586 only seems to generate plain `xor` instructions, so we ignore it.
#[cfg_attr(
    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
    assert_instr(xorps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_xor_ps(a: __m128, b: __m128) -> __m128 {
    // Bit operation is done on the raw 128-bit value via the integer type.
    let a: __m128i = mem::transmute(a);
    let b: __m128i = mem::transmute(b);
    mem::transmute(simd_xor(a, b))
}
300 | ||
/// Compares the lowest `f32` of both inputs for equality. The lowest 32 bits of
/// the result will be `0xffffffff` if the two inputs are equal, or `0`
/// otherwise. The upper 96 bits of the result are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpeqss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpeq_ss(a: __m128, b: __m128) -> __m128 {
    // CMPSS immediate predicate 0 = equal (see doc comment above).
    cmpss(a, b, 0)
}
313 | ||
/// Compares the lowest `f32` of both inputs for less than. The lowest 32 bits
/// of the result will be `0xffffffff` if `a.extract(0)` is less than
/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
/// upper 96 bits of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpltss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmplt_ss(a: __m128, b: __m128) -> __m128 {
    // CMPSS immediate predicate 1 = less-than (see doc comment above).
    cmpss(a, b, 1)
}
327 | ||
/// Compares the lowest `f32` of both inputs for less than or equal. The lowest
/// 32 bits of the result will be `0xffffffff` if `a.extract(0)` is less than
/// or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result
/// are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpless))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmple_ss(a: __m128, b: __m128) -> __m128 {
    // CMPSS immediate predicate 2 = less-than-or-equal (see doc comment above).
    cmpss(a, b, 2)
}
341 | ||
/// Compares the lowest `f32` of both inputs for greater than. The lowest 32
/// bits of the result will be `0xffffffff` if `a.extract(0)` is greater
/// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result
/// are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_ss)
#[inline]
#[target_feature(enable = "sse")]
// There is no greater-than predicate: this compiles to a reversed CMPLTSS,
// hence the assertion checks for `cmpltss` rather than a `cmpgtss`.
#[cfg_attr(test, assert_instr(cmpltss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpgt_ss(a: __m128, b: __m128) -> __m128 {
    // `b < a` gives `a > b` in lane 0 of the comparison result; the shuffle
    // takes that lane (index 4 = lane 0 of the second vector) and lanes 1-3
    // of `a` to restore the upper 96 bits of `a`.
    simd_shuffle4(a, cmpss(b, a, 1), [4, 1, 2, 3])
}
355 | ||
/// Compares the lowest `f32` of both inputs for greater than or equal. The
/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is
/// greater than or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits
/// of the result are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_ss)
#[inline]
#[target_feature(enable = "sse")]
// No greater-or-equal predicate exists: this compiles to a reversed CMPLESS.
#[cfg_attr(test, assert_instr(cmpless))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpge_ss(a: __m128, b: __m128) -> __m128 {
    // `b <= a` gives `a >= b` in lane 0; the shuffle merges that lane
    // (index 4) with lanes 1-3 of `a`.
    simd_shuffle4(a, cmpss(b, a, 2), [4, 1, 2, 3])
}
369 | ||
/// Compares the lowest `f32` of both inputs for inequality. The lowest 32 bits
/// of the result will be `0xffffffff` if `a.extract(0)` is not equal to
/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
/// upper 96 bits of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpneqss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpneq_ss(a: __m128, b: __m128) -> __m128 {
    // CMPSS immediate predicate 4 = not-equal (see doc comment above).
    cmpss(a, b, 4)
}
383 | ||
/// Compares the lowest `f32` of both inputs for not-less-than. The lowest 32
/// bits of the result will be `0xffffffff` if `a.extract(0)` is not less than
/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
/// upper 96 bits of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnltss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpnlt_ss(a: __m128, b: __m128) -> __m128 {
    // CMPSS immediate predicate 5 = not-less-than (see doc comment above).
    cmpss(a, b, 5)
}
397 | ||
/// Compares the lowest `f32` of both inputs for not-less-than-or-equal. The
/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not
/// less than or equal to `b.extract(0)`, or `0` otherwise. The upper 96 bits
/// of the result are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnless))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpnle_ss(a: __m128, b: __m128) -> __m128 {
    // CMPSS immediate predicate 6 = not-less-than-or-equal (see doc above).
    cmpss(a, b, 6)
}
411 | ||
/// Compares the lowest `f32` of both inputs for not-greater-than. The lowest 32
/// bits of the result will be `0xffffffff` if `a.extract(0)` is not greater
/// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are
/// the upper 96 bits of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpngt_ss)
#[inline]
#[target_feature(enable = "sse")]
// No not-greater-than predicate: compiles to a reversed CMPNLTSS.
#[cfg_attr(test, assert_instr(cmpnltss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpngt_ss(a: __m128, b: __m128) -> __m128 {
    // `!(b < a)` gives `!(a > b)` in lane 0; the shuffle merges that lane
    // (index 4) with lanes 1-3 of `a`.
    simd_shuffle4(a, cmpss(b, a, 5), [4, 1, 2, 3])
}
425 | ||
/// Compares the lowest `f32` of both inputs for not-greater-than-or-equal. The
/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not
/// greater than or equal to `b.extract(0)`, or `0` otherwise. The upper 96
/// bits of the result are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_ss)
#[inline]
#[target_feature(enable = "sse")]
// No not-greater-or-equal predicate: compiles to a reversed CMPNLESS.
#[cfg_attr(test, assert_instr(cmpnless))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpnge_ss(a: __m128, b: __m128) -> __m128 {
    // `!(b <= a)` gives `!(a >= b)` in lane 0; the shuffle merges that lane
    // (index 4) with lanes 1-3 of `a`.
    simd_shuffle4(a, cmpss(b, a, 6), [4, 1, 2, 3])
}
439 | ||
/// Checks if the lowest `f32` of both inputs are ordered. The lowest 32 bits of
/// the result will be `0xffffffff` if neither of `a.extract(0)` or
/// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result
/// are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpordss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpord_ss(a: __m128, b: __m128) -> __m128 {
    // CMPSS immediate predicate 7 = ordered (see doc comment above).
    cmpss(a, b, 7)
}
453 | ||
/// Checks if the lowest `f32` of both inputs are unordered. The lowest 32 bits
/// of the result will be `0xffffffff` if any of `a.extract(0)` or
/// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result
/// are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpunordss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpunord_ss(a: __m128, b: __m128) -> __m128 {
    // CMPSS immediate predicate 3 = unordered (see doc comment above).
    cmpss(a, b, 3)
}
467 | ||
/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input elements
/// were equal, or `0` otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpeqps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpeq_ps(a: __m128, b: __m128) -> __m128 {
    // CMPPS immediate predicate 0 = equal (see doc comment above).
    cmpps(a, b, 0)
}
480 | ||
/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is less than the corresponding element in `b`, or `0` otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpltps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmplt_ps(a: __m128, b: __m128) -> __m128 {
    // CMPPS immediate predicate 1 = less-than (see doc comment above).
    cmpps(a, b, 1)
}
493 | ||
/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is less than or equal to the corresponding element in `b`, or `0`
/// otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpleps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmple_ps(a: __m128, b: __m128) -> __m128 {
    // CMPPS immediate predicate 2 = less-than-or-equal (see doc above).
    cmpps(a, b, 2)
}
507 | ||
/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is greater than the corresponding element in `b`, or `0` otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_ps)
#[inline]
#[target_feature(enable = "sse")]
// No greater-than predicate: implemented as `b < a`, hence the instruction
// check is for `cmpltps`.
#[cfg_attr(test, assert_instr(cmpltps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpgt_ps(a: __m128, b: __m128) -> __m128 {
    // Operands swapped: `b < a` is equivalent to `a > b` for every lane.
    cmpps(b, a, 1)
}
520 | ||
/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is greater than or equal to the corresponding element in `b`, or `0`
/// otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_ps)
#[inline]
#[target_feature(enable = "sse")]
// No greater-or-equal predicate: implemented as `b <= a` (cmpleps).
#[cfg_attr(test, assert_instr(cmpleps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpge_ps(a: __m128, b: __m128) -> __m128 {
    // Operands swapped: `b <= a` is equivalent to `a >= b` for every lane.
    cmpps(b, a, 2)
}
534 | ||
/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input elements
/// are **not** equal, or `0` otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpneqps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpneq_ps(a: __m128, b: __m128) -> __m128 {
    // CMPPS immediate predicate 4 = not-equal (see doc comment above).
    cmpps(a, b, 4)
}
547 | ||
/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is **not** less than the corresponding element in `b`, or `0`
/// otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnltps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpnlt_ps(a: __m128, b: __m128) -> __m128 {
    // CMPPS immediate predicate 5 = not-less-than (see doc comment above).
    cmpps(a, b, 5)
}
561 | ||
/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is **not** less than or equal to the corresponding element in `b`, or
/// `0` otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnleps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpnle_ps(a: __m128, b: __m128) -> __m128 {
    // CMPPS immediate predicate 6 = not-less-than-or-equal (see doc above).
    cmpps(a, b, 6)
}
575 | ||
/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is **not** greater than the corresponding element in `b`, or `0`
/// otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpngt_ps)
#[inline]
#[target_feature(enable = "sse")]
// No not-greater-than predicate: implemented as `!(b < a)` (cmpnltps).
#[cfg_attr(test, assert_instr(cmpnltps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpngt_ps(a: __m128, b: __m128) -> __m128 {
    // Operands swapped: not-less-than of (b, a) equals not-greater-than of (a, b).
    cmpps(b, a, 5)
}
589 | ||
/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is **not** greater than or equal to the corresponding element in `b`,
/// or `0` otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_ps)
#[inline]
#[target_feature(enable = "sse")]
// No not-greater-or-equal predicate: implemented as `!(b <= a)` (cmpnleps).
#[cfg_attr(test, assert_instr(cmpnleps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpnge_ps(a: __m128, b: __m128) -> __m128 {
    // Operands swapped: not-less-or-equal of (b, a) equals not-greater-or-equal
    // of (a, b).
    cmpps(b, a, 6)
}
603 | ||
532ac7d7 | 604 | /// Compares each of the four floats in `a` to the corresponding element in `b`. |
0531ce1d XL |
605 | /// Returns four floats that have one of two possible bit patterns. The element |
606 | /// in the output vector will be `0xffffffff` if the input elements in `a` and | |
607 | /// `b` are ordered (i.e., neither of them is a NaN), or 0 otherwise. | |
83c7162d XL |
608 | /// |
609 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_ps) | |
0531ce1d XL |
610 | #[inline] |
611 | #[target_feature(enable = "sse")] | |
612 | #[cfg_attr(test, assert_instr(cmpordps))] | |
83c7162d | 613 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
614 | pub unsafe fn _mm_cmpord_ps(a: __m128, b: __m128) -> __m128 { |
615 | cmpps(b, a, 7) | |
616 | } | |
617 | ||
532ac7d7 | 618 | /// Compares each of the four floats in `a` to the corresponding element in `b`. |
0531ce1d XL |
619 | /// Returns four floats that have one of two possible bit patterns. The element |
620 | /// in the output vector will be `0xffffffff` if the input elements in `a` and | |
621 | /// `b` are unordered (i.e., at least one of them is a NaN), or 0 otherwise. | |
83c7162d XL |
622 | /// |
623 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_ps) | |
0531ce1d XL |
624 | #[inline] |
625 | #[target_feature(enable = "sse")] | |
626 | #[cfg_attr(test, assert_instr(cmpunordps))] | |
83c7162d | 627 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
628 | pub unsafe fn _mm_cmpunord_ps(a: __m128, b: __m128) -> __m128 { |
629 | cmpps(b, a, 3) | |
630 | } | |
631 | ||
532ac7d7 | 632 | /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns |
0531ce1d | 633 | /// `1` if they are equal, or `0` otherwise. |
83c7162d XL |
634 | /// |
635 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comieq_ss) | |
0531ce1d XL |
636 | #[inline] |
637 | #[target_feature(enable = "sse")] | |
638 | #[cfg_attr(test, assert_instr(comiss))] | |
83c7162d | 639 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
640 | pub unsafe fn _mm_comieq_ss(a: __m128, b: __m128) -> i32 { |
641 | comieq_ss(a, b) | |
642 | } | |
643 | ||
532ac7d7 | 644 | /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns |
0531ce1d | 645 | /// `1` if the value from `a` is less than the one from `b`, or `0` otherwise. |
83c7162d XL |
646 | /// |
647 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comilt_ss) | |
0531ce1d XL |
648 | #[inline] |
649 | #[target_feature(enable = "sse")] | |
650 | #[cfg_attr(test, assert_instr(comiss))] | |
83c7162d | 651 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
652 | pub unsafe fn _mm_comilt_ss(a: __m128, b: __m128) -> i32 { |
653 | comilt_ss(a, b) | |
654 | } | |
655 | ||
532ac7d7 | 656 | /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns |
0531ce1d XL |
657 | /// `1` if the value from `a` is less than or equal to the one from `b`, or `0` |
658 | /// otherwise. | |
83c7162d XL |
659 | /// |
660 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comile_ss) | |
0531ce1d XL |
661 | #[inline] |
662 | #[target_feature(enable = "sse")] | |
663 | #[cfg_attr(test, assert_instr(comiss))] | |
83c7162d | 664 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
665 | pub unsafe fn _mm_comile_ss(a: __m128, b: __m128) -> i32 { |
666 | comile_ss(a, b) | |
667 | } | |
668 | ||
532ac7d7 | 669 | /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns |
0531ce1d XL |
670 | /// `1` if the value from `a` is greater than the one from `b`, or `0` |
671 | /// otherwise. | |
83c7162d XL |
672 | /// |
673 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comigt_ss) | |
0531ce1d XL |
674 | #[inline] |
675 | #[target_feature(enable = "sse")] | |
676 | #[cfg_attr(test, assert_instr(comiss))] | |
83c7162d | 677 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
678 | pub unsafe fn _mm_comigt_ss(a: __m128, b: __m128) -> i32 { |
679 | comigt_ss(a, b) | |
680 | } | |
681 | ||
532ac7d7 | 682 | /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns |
0531ce1d XL |
683 | /// `1` if the value from `a` is greater than or equal to the one from `b`, or |
684 | /// `0` otherwise. | |
83c7162d XL |
685 | /// |
686 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comige_ss) | |
0531ce1d XL |
687 | #[inline] |
688 | #[target_feature(enable = "sse")] | |
689 | #[cfg_attr(test, assert_instr(comiss))] | |
83c7162d | 690 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
691 | pub unsafe fn _mm_comige_ss(a: __m128, b: __m128) -> i32 { |
692 | comige_ss(a, b) | |
693 | } | |
694 | ||
532ac7d7 XL |
695 | /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns |
696 | /// `1` if they are **not** equal, or `0` otherwise. | |
83c7162d XL |
697 | /// |
698 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comineq_ss) | |
0531ce1d XL |
699 | #[inline] |
700 | #[target_feature(enable = "sse")] | |
701 | #[cfg_attr(test, assert_instr(comiss))] | |
83c7162d | 702 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
703 | pub unsafe fn _mm_comineq_ss(a: __m128, b: __m128) -> i32 { |
704 | comineq_ss(a, b) | |
705 | } | |
706 | ||
532ac7d7 | 707 | /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns |
0531ce1d XL |
708 | /// `1` if they are equal, or `0` otherwise. This instruction will not signal |
709 | /// an exception if either argument is a quiet NaN. | |
83c7162d XL |
710 | /// |
711 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomieq_ss) | |
0531ce1d XL |
712 | #[inline] |
713 | #[target_feature(enable = "sse")] | |
714 | #[cfg_attr(test, assert_instr(ucomiss))] | |
83c7162d | 715 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
716 | pub unsafe fn _mm_ucomieq_ss(a: __m128, b: __m128) -> i32 { |
717 | ucomieq_ss(a, b) | |
718 | } | |
719 | ||
532ac7d7 | 720 | /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns |
0531ce1d XL |
721 | /// `1` if the value from `a` is less than the one from `b`, or `0` otherwise. |
722 | /// This instruction will not signal an exception if either argument is a quiet | |
723 | /// NaN. | |
83c7162d XL |
724 | /// |
725 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomilt_ss) | |
0531ce1d XL |
726 | #[inline] |
727 | #[target_feature(enable = "sse")] | |
728 | #[cfg_attr(test, assert_instr(ucomiss))] | |
83c7162d | 729 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
730 | pub unsafe fn _mm_ucomilt_ss(a: __m128, b: __m128) -> i32 { |
731 | ucomilt_ss(a, b) | |
732 | } | |
733 | ||
532ac7d7 | 734 | /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns |
0531ce1d XL |
735 | /// `1` if the value from `a` is less than or equal to the one from `b`, or `0` |
736 | /// otherwise. This instruction will not signal an exception if either argument | |
737 | /// is a quiet NaN. | |
83c7162d XL |
738 | /// |
739 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomile_ss) | |
0531ce1d XL |
740 | #[inline] |
741 | #[target_feature(enable = "sse")] | |
742 | #[cfg_attr(test, assert_instr(ucomiss))] | |
83c7162d | 743 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
744 | pub unsafe fn _mm_ucomile_ss(a: __m128, b: __m128) -> i32 { |
745 | ucomile_ss(a, b) | |
746 | } | |
747 | ||
532ac7d7 | 748 | /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns |
0531ce1d XL |
749 | /// `1` if the value from `a` is greater than the one from `b`, or `0` |
750 | /// otherwise. This instruction will not signal an exception if either argument | |
751 | /// is a quiet NaN. | |
83c7162d XL |
752 | /// |
753 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomigt_ss) | |
0531ce1d XL |
754 | #[inline] |
755 | #[target_feature(enable = "sse")] | |
756 | #[cfg_attr(test, assert_instr(ucomiss))] | |
83c7162d | 757 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
758 | pub unsafe fn _mm_ucomigt_ss(a: __m128, b: __m128) -> i32 { |
759 | ucomigt_ss(a, b) | |
760 | } | |
761 | ||
532ac7d7 | 762 | /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns |
0531ce1d XL |
763 | /// `1` if the value from `a` is greater than or equal to the one from `b`, or |
764 | /// `0` otherwise. This instruction will not signal an exception if either | |
765 | /// argument is a quiet NaN. | |
83c7162d XL |
766 | /// |
767 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomige_ss) | |
0531ce1d XL |
768 | #[inline] |
769 | #[target_feature(enable = "sse")] | |
770 | #[cfg_attr(test, assert_instr(ucomiss))] | |
83c7162d | 771 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
772 | pub unsafe fn _mm_ucomige_ss(a: __m128, b: __m128) -> i32 { |
773 | ucomige_ss(a, b) | |
774 | } | |
775 | ||
532ac7d7 XL |
776 | /// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns |
777 | /// `1` if they are **not** equal, or `0` otherwise. This instruction will not | |
0531ce1d | 778 | /// signal an exception if either argument is a quiet NaN. |
83c7162d XL |
779 | /// |
780 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomineq_ss) | |
0531ce1d XL |
781 | #[inline] |
782 | #[target_feature(enable = "sse")] | |
783 | #[cfg_attr(test, assert_instr(ucomiss))] | |
83c7162d | 784 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
785 | pub unsafe fn _mm_ucomineq_ss(a: __m128, b: __m128) -> i32 { |
786 | ucomineq_ss(a, b) | |
787 | } | |
788 | ||
532ac7d7 | 789 | /// Converts the lowest 32 bit float in the input vector to a 32 bit integer. |
0531ce1d XL |
790 | /// |
791 | /// The result is rounded according to the current rounding mode. If the result | |
792 | /// cannot be represented as a 32 bit integer the result will be `0x8000_0000` | |
ba9703b0 | 793 | /// (`i32::MIN`) or an invalid operation floating point exception if |
0531ce1d XL |
794 | /// unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)). |
795 | /// | |
796 | /// This corresponds to the `CVTSS2SI` instruction (with 32 bit output). | |
83c7162d XL |
797 | /// |
798 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si32) | |
0531ce1d XL |
799 | #[inline] |
800 | #[target_feature(enable = "sse")] | |
801 | #[cfg_attr(test, assert_instr(cvtss2si))] | |
83c7162d | 802 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
803 | pub unsafe fn _mm_cvtss_si32(a: __m128) -> i32 { |
804 | cvtss2si(a) | |
805 | } | |
806 | ||
807 | /// Alias for [`_mm_cvtss_si32`](fn._mm_cvtss_si32.html). | |
83c7162d XL |
808 | /// |
809 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si) | |
0531ce1d XL |
810 | #[inline] |
811 | #[target_feature(enable = "sse")] | |
812 | #[cfg_attr(test, assert_instr(cvtss2si))] | |
83c7162d | 813 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
814 | pub unsafe fn _mm_cvt_ss2si(a: __m128) -> i32 { |
815 | _mm_cvtss_si32(a) | |
816 | } | |
817 | ||
532ac7d7 | 818 | /// Converts the lowest 32 bit float in the input vector to a 32 bit integer |
0531ce1d XL |
819 | /// with |
820 | /// truncation. | |
821 | /// | |
822 | /// The result is rounded always using truncation (round towards zero). If the | |
823 | /// result cannot be represented as a 32 bit integer the result will be | |
ba9703b0 | 824 | /// `0x8000_0000` (`i32::MIN`) or an invalid operation floating point |
0531ce1d XL |
825 | /// exception if unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)). |
826 | /// | |
827 | /// This corresponds to the `CVTTSS2SI` instruction (with 32 bit output). | |
83c7162d XL |
828 | /// |
829 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si32) | |
0531ce1d XL |
830 | #[inline] |
831 | #[target_feature(enable = "sse")] | |
832 | #[cfg_attr(test, assert_instr(cvttss2si))] | |
83c7162d | 833 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
834 | pub unsafe fn _mm_cvttss_si32(a: __m128) -> i32 { |
835 | cvttss2si(a) | |
836 | } | |
837 | ||
838 | /// Alias for [`_mm_cvttss_si32`](fn._mm_cvttss_si32.html). | |
83c7162d XL |
839 | /// |
840 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ss2si) | |
0531ce1d XL |
841 | #[inline] |
842 | #[target_feature(enable = "sse")] | |
843 | #[cfg_attr(test, assert_instr(cvttss2si))] | |
83c7162d | 844 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
845 | pub unsafe fn _mm_cvtt_ss2si(a: __m128) -> i32 { |
846 | _mm_cvttss_si32(a) | |
847 | } | |
848 | ||
532ac7d7 | 849 | /// Extracts the lowest 32 bit float from the input vector. |
83c7162d XL |
850 | /// |
851 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32) | |
0531ce1d XL |
852 | #[inline] |
853 | #[target_feature(enable = "sse")] | |
854 | // No point in using assert_instrs. In Unix x86_64 calling convention this is a | |
855 | // no-op, and on Windows it's just a `mov`. | |
83c7162d | 856 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
857 | pub unsafe fn _mm_cvtss_f32(a: __m128) -> f32 { |
858 | simd_extract(a, 0) | |
859 | } | |
860 | ||
532ac7d7 | 861 | /// Converts a 32 bit integer to a 32 bit float. The result vector is the input |
0531ce1d XL |
862 | /// vector `a` with the lowest 32 bit float replaced by the converted integer. |
863 | /// | |
864 | /// This intrinsic corresponds to the `CVTSI2SS` instruction (with 32 bit | |
865 | /// input). | |
83c7162d XL |
866 | /// |
867 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_ss) | |
0531ce1d XL |
868 | #[inline] |
869 | #[target_feature(enable = "sse")] | |
870 | #[cfg_attr(test, assert_instr(cvtsi2ss))] | |
83c7162d | 871 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
872 | pub unsafe fn _mm_cvtsi32_ss(a: __m128, b: i32) -> __m128 { |
873 | cvtsi2ss(a, b) | |
874 | } | |
875 | ||
876 | /// Alias for [`_mm_cvtsi32_ss`](fn._mm_cvtsi32_ss.html). | |
83c7162d XL |
877 | /// |
878 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss) | |
0531ce1d XL |
879 | #[inline] |
880 | #[target_feature(enable = "sse")] | |
881 | #[cfg_attr(test, assert_instr(cvtsi2ss))] | |
83c7162d | 882 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
883 | pub unsafe fn _mm_cvt_si2ss(a: __m128, b: i32) -> __m128 { |
884 | _mm_cvtsi32_ss(a, b) | |
885 | } | |
886 | ||
887 | /// Construct a `__m128` with the lowest element set to `a` and the rest set to | |
888 | /// zero. | |
83c7162d XL |
889 | /// |
890 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss) | |
0531ce1d XL |
891 | #[inline] |
892 | #[target_feature(enable = "sse")] | |
893 | #[cfg_attr(test, assert_instr(movss))] | |
83c7162d | 894 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
895 | pub unsafe fn _mm_set_ss(a: f32) -> __m128 { |
896 | __m128(a, 0.0, 0.0, 0.0) | |
897 | } | |
898 | ||
899 | /// Construct a `__m128` with all elements set to `a`. | |
83c7162d XL |
900 | /// |
901 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_ps) | |
0531ce1d XL |
902 | #[inline] |
903 | #[target_feature(enable = "sse")] | |
904 | #[cfg_attr(test, assert_instr(shufps))] | |
83c7162d | 905 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
906 | pub unsafe fn _mm_set1_ps(a: f32) -> __m128 { |
907 | __m128(a, a, a, a) | |
908 | } | |
909 | ||
910 | /// Alias for [`_mm_set1_ps`](fn._mm_set1_ps.html) | |
83c7162d XL |
911 | /// |
912 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ps1) | |
0531ce1d XL |
913 | #[inline] |
914 | #[target_feature(enable = "sse")] | |
915 | #[cfg_attr(test, assert_instr(shufps))] | |
83c7162d | 916 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
917 | pub unsafe fn _mm_set_ps1(a: f32) -> __m128 { |
918 | _mm_set1_ps(a) | |
919 | } | |
920 | ||
921 | /// Construct a `__m128` from four floating point values highest to lowest. | |
922 | /// | |
923 | /// Note that `a` will be the highest 32 bits of the result, and `d` the | |
924 | /// lowest. This matches the standard way of writing bit patterns on x86: | |
925 | /// | |
926 | /// ```text | |
927 | /// bit 127 .. 96 95 .. 64 63 .. 32 31 .. 0 | |
928 | /// +---------+---------+---------+---------+ | |
929 | /// | a | b | c | d | result | |
930 | /// +---------+---------+---------+---------+ | |
931 | /// ``` | |
932 | /// | |
933 | /// Alternatively: | |
934 | /// | |
935 | /// ```text | |
936 | /// let v = _mm_set_ps(d, c, b, a); | |
937 | /// ``` | |
83c7162d XL |
938 | /// |
939 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ps) | |
0531ce1d XL |
940 | #[inline] |
941 | #[target_feature(enable = "sse")] | |
942 | #[cfg_attr(test, assert_instr(unpcklps))] | |
83c7162d | 943 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
944 | pub unsafe fn _mm_set_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 { |
945 | __m128(d, c, b, a) | |
946 | } | |
947 | ||
948 | /// Construct a `__m128` from four floating point values lowest to highest. | |
949 | /// | |
950 | /// This matches the memory order of `__m128`, i.e., `a` will be the lowest 32 | |
951 | /// bits of the result, and `d` the highest. | |
952 | /// | |
953 | /// ```text | |
954 | /// assert_eq!(__m128::new(a, b, c, d), _mm_setr_ps(a, b, c, d)); | |
955 | /// ``` | |
83c7162d XL |
956 | /// |
957 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_ps) | |
0531ce1d XL |
958 | #[inline] |
959 | #[target_feature(enable = "sse")] | |
960 | #[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(unpcklps))] | |
961 | // On a 32-bit architecture it just copies the operands from the stack. | |
962 | #[cfg_attr(all(test, target_arch = "x86"), assert_instr(movaps))] | |
83c7162d | 963 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
964 | pub unsafe fn _mm_setr_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 { |
965 | __m128(a, b, c, d) | |
966 | } | |
967 | ||
968 | /// Construct a `__m128` with all elements initialized to zero. | |
83c7162d XL |
969 | /// |
970 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_ps) | |
0531ce1d XL |
971 | #[inline] |
972 | #[target_feature(enable = "sse")] | |
973 | #[cfg_attr(test, assert_instr(xorps))] | |
83c7162d | 974 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
975 | pub unsafe fn _mm_setzero_ps() -> __m128 { |
976 | __m128(0.0, 0.0, 0.0, 0.0) | |
977 | } | |
978 | ||
0bf4aa26 XL |
979 | /// A utility function for creating masks to use with Intel shuffle and |
980 | /// permute intrinsics. | |
8faf50e0 XL |
981 | #[inline] |
982 | #[allow(non_snake_case)] | |
416331ca | 983 | #[unstable(feature = "stdarch", issue = "27731")] |
0731742a XL |
984 | pub const fn _MM_SHUFFLE(z: u32, y: u32, x: u32, w: u32) -> i32 { |
985 | ((z << 6) | (y << 4) | (x << 2) | w) as i32 | |
8faf50e0 XL |
986 | } |
987 | ||
532ac7d7 | 988 | /// Shuffles packed single-precision (32-bit) floating-point elements in `a` and |
0531ce1d XL |
989 | /// `b` using `mask`. |
990 | /// | |
991 | /// The lower half of result takes values from `a` and the higher half from | |
992 | /// `b`. Mask is split to 2 control bits each to index the element from inputs. | |
83c7162d XL |
993 | /// |
994 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_ps) | |
3dfed10e XL |
995 | /// |
996 | /// Note that there appears to be a mistake within Intel's Intrinsics Guide. | |
997 | /// `_mm_shuffle_ps` is supposed to take an `i32` instead of an `u32` | |
998 | /// as is the case for [other shuffle intrinsics](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_). | |
999 | /// Performing an implicit type conversion between an unsigned integer and a signed integer | |
1000 | /// does not cause a problem in C, however Rust's commitment to strong typing does not allow this. | |
0531ce1d XL |
1001 | #[inline] |
1002 | #[target_feature(enable = "sse")] | |
1003 | #[cfg_attr(test, assert_instr(shufps, mask = 3))] | |
1004 | #[rustc_args_required_const(2)] | |
83c7162d | 1005 | #[stable(feature = "simd_x86", since = "1.27.0")] |
48663c56 | 1006 | pub unsafe fn _mm_shuffle_ps(a: __m128, b: __m128, mask: i32) -> __m128 { |
0531ce1d XL |
1007 | let mask = (mask & 0xFF) as u8; |
1008 | ||
1009 | macro_rules! shuffle_done { | |
1010 | ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => { | |
1011 | simd_shuffle4(a, b, [$x01, $x23, $x45, $x67]) | |
83c7162d | 1012 | }; |
0531ce1d XL |
1013 | } |
1014 | macro_rules! shuffle_x67 { | |
1015 | ($x01:expr, $x23:expr, $x45:expr) => { | |
1016 | match (mask >> 6) & 0b11 { | |
1017 | 0b00 => shuffle_done!($x01, $x23, $x45, 4), | |
1018 | 0b01 => shuffle_done!($x01, $x23, $x45, 5), | |
1019 | 0b10 => shuffle_done!($x01, $x23, $x45, 6), | |
1020 | _ => shuffle_done!($x01, $x23, $x45, 7), | |
1021 | } | |
83c7162d | 1022 | }; |
0531ce1d XL |
1023 | } |
1024 | macro_rules! shuffle_x45 { | |
1025 | ($x01:expr, $x23:expr) => { | |
1026 | match (mask >> 4) & 0b11 { | |
1027 | 0b00 => shuffle_x67!($x01, $x23, 4), | |
1028 | 0b01 => shuffle_x67!($x01, $x23, 5), | |
1029 | 0b10 => shuffle_x67!($x01, $x23, 6), | |
1030 | _ => shuffle_x67!($x01, $x23, 7), | |
1031 | } | |
83c7162d | 1032 | }; |
0531ce1d XL |
1033 | } |
1034 | macro_rules! shuffle_x23 { | |
1035 | ($x01:expr) => { | |
1036 | match (mask >> 2) & 0b11 { | |
1037 | 0b00 => shuffle_x45!($x01, 0), | |
1038 | 0b01 => shuffle_x45!($x01, 1), | |
1039 | 0b10 => shuffle_x45!($x01, 2), | |
1040 | _ => shuffle_x45!($x01, 3), | |
1041 | } | |
83c7162d | 1042 | }; |
0531ce1d XL |
1043 | } |
1044 | match mask & 0b11 { | |
1045 | 0b00 => shuffle_x23!(0), | |
1046 | 0b01 => shuffle_x23!(1), | |
1047 | 0b10 => shuffle_x23!(2), | |
1048 | _ => shuffle_x23!(3), | |
1049 | } | |
1050 | } | |
1051 | ||
532ac7d7 | 1052 | /// Unpacks and interleaves single-precision (32-bit) floating-point elements |
0531ce1d | 1053 | /// from the higher half of `a` and `b`. |
83c7162d XL |
1054 | /// |
1055 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_ps) | |
0531ce1d XL |
1056 | #[inline] |
1057 | #[target_feature(enable = "sse")] | |
1058 | #[cfg_attr(test, assert_instr(unpckhps))] | |
83c7162d | 1059 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1060 | pub unsafe fn _mm_unpackhi_ps(a: __m128, b: __m128) -> __m128 { |
1061 | simd_shuffle4(a, b, [2, 6, 3, 7]) | |
1062 | } | |
1063 | ||
532ac7d7 | 1064 | /// Unpacks and interleaves single-precision (32-bit) floating-point elements |
0531ce1d | 1065 | /// from the lower half of `a` and `b`. |
83c7162d XL |
1066 | /// |
1067 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_ps) | |
0531ce1d XL |
1068 | #[inline] |
1069 | #[target_feature(enable = "sse")] | |
1070 | #[cfg_attr(test, assert_instr(unpcklps))] | |
83c7162d | 1071 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1072 | pub unsafe fn _mm_unpacklo_ps(a: __m128, b: __m128) -> __m128 { |
1073 | simd_shuffle4(a, b, [0, 4, 1, 5]) | |
1074 | } | |
1075 | ||
1076 | /// Combine higher half of `a` and `b`. The higher half of `b` occupies the | |
1077 | /// lower half of result. | |
83c7162d XL |
1078 | /// |
1079 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehl_ps) | |
0531ce1d XL |
1080 | #[inline] |
1081 | #[target_feature(enable = "sse")] | |
0731742a | 1082 | #[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movhlps))] |
83c7162d | 1083 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1084 | pub unsafe fn _mm_movehl_ps(a: __m128, b: __m128) -> __m128 { |
1085 | // TODO: figure out why this is a different instruction on Windows? | |
1086 | simd_shuffle4(a, b, [6, 7, 2, 3]) | |
1087 | } | |
1088 | ||
1089 | /// Combine lower half of `a` and `b`. The lower half of `b` occupies the | |
1090 | /// higher half of result. | |
83c7162d XL |
1091 | /// |
1092 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movelh_ps) | |
0531ce1d XL |
1093 | #[inline] |
1094 | #[target_feature(enable = "sse")] | |
0731742a | 1095 | #[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movlhps))] |
83c7162d | 1096 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1097 | pub unsafe fn _mm_movelh_ps(a: __m128, b: __m128) -> __m128 { |
1098 | simd_shuffle4(a, b, [0, 1, 4, 5]) | |
1099 | } | |
1100 | ||
532ac7d7 | 1101 | /// Returns a mask of the most significant bit of each element in `a`. |
0531ce1d XL |
1102 | /// |
1103 | /// The mask is stored in the 4 least significant bits of the return value. | |
1104 | /// All other bits are set to `0`. | |
83c7162d XL |
1105 | /// |
1106 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_ps) | |
0531ce1d XL |
1107 | #[inline] |
1108 | #[target_feature(enable = "sse")] | |
e1599b0c XL |
1109 | // FIXME: LLVM9 trunk has the following bug: |
1110 | // https://github.com/rust-lang/stdarch/issues/794 | |
1111 | // so we only temporarily test this on i686 and x86_64 but not on i586: | |
1112 | #[cfg_attr(all(test, target_feature = "sse2"), assert_instr(movmskps))] | |
83c7162d | 1113 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1114 | pub unsafe fn _mm_movemask_ps(a: __m128) -> i32 { |
1115 | movmskps(a) | |
1116 | } | |
1117 | ||
0531ce1d XL |
1118 | /// Construct a `__m128` with the lowest element read from `p` and the other |
1119 | /// elements set to zero. | |
1120 | /// | |
1121 | /// This corresponds to instructions `VMOVSS` / `MOVSS`. | |
83c7162d XL |
1122 | /// |
1123 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ss) | |
0531ce1d XL |
1124 | #[inline] |
1125 | #[target_feature(enable = "sse")] | |
1126 | #[cfg_attr(test, assert_instr(movss))] | |
83c7162d | 1127 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1128 | pub unsafe fn _mm_load_ss(p: *const f32) -> __m128 { |
1129 | __m128(*p, 0.0, 0.0, 0.0) | |
1130 | } | |
1131 | ||
1132 | /// Construct a `__m128` by duplicating the value read from `p` into all | |
1133 | /// elements. | |
1134 | /// | |
1135 | /// This corresponds to instructions `VMOVSS` / `MOVSS` followed by some | |
1136 | /// shuffling. | |
83c7162d XL |
1137 | /// |
1138 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_ps) | |
0531ce1d XL |
1139 | #[inline] |
1140 | #[target_feature(enable = "sse")] | |
1141 | #[cfg_attr(test, assert_instr(movss))] | |
83c7162d | 1142 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1143 | pub unsafe fn _mm_load1_ps(p: *const f32) -> __m128 { |
1144 | let a = *p; | |
1145 | __m128(a, a, a, a) | |
1146 | } | |
1147 | ||
1148 | /// Alias for [`_mm_load1_ps`](fn._mm_load1_ps.html) | |
83c7162d XL |
1149 | /// |
1150 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1) | |
0531ce1d XL |
1151 | #[inline] |
1152 | #[target_feature(enable = "sse")] | |
1153 | #[cfg_attr(test, assert_instr(movss))] | |
83c7162d | 1154 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1155 | pub unsafe fn _mm_load_ps1(p: *const f32) -> __m128 { |
1156 | _mm_load1_ps(p) | |
1157 | } | |
1158 | ||
532ac7d7 | 1159 | /// Loads four `f32` values from *aligned* memory into a `__m128`. If the |
0531ce1d XL |
1160 | /// pointer is not aligned to a 128-bit boundary (16 bytes) a general |
1161 | /// protection fault will be triggered (fatal program crash). | |
1162 | /// | |
1163 | /// Use [`_mm_loadu_ps`](fn._mm_loadu_ps.html) for potentially unaligned | |
1164 | /// memory. | |
1165 | /// | |
1166 | /// This corresponds to instructions `VMOVAPS` / `MOVAPS`. | |
83c7162d XL |
1167 | /// |
1168 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps) | |
0531ce1d XL |
1169 | #[inline] |
1170 | #[target_feature(enable = "sse")] | |
1171 | #[cfg_attr(test, assert_instr(movaps))] | |
83c7162d | 1172 | #[stable(feature = "simd_x86", since = "1.27.0")] |
48663c56 | 1173 | #[allow(clippy::cast_ptr_alignment)] |
0531ce1d XL |
1174 | pub unsafe fn _mm_load_ps(p: *const f32) -> __m128 { |
1175 | *(p as *const __m128) | |
1176 | } | |
1177 | ||
532ac7d7 | 1178 | /// Loads four `f32` values from memory into a `__m128`. There are no |
0531ce1d XL |
1179 | /// restrictions |
1180 | /// on memory alignment. For aligned memory | |
1181 | /// [`_mm_load_ps`](fn._mm_load_ps.html) | |
1182 | /// may be faster. | |
1183 | /// | |
1184 | /// This corresponds to instructions `VMOVUPS` / `MOVUPS`. | |
83c7162d XL |
1185 | /// |
1186 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_ps) | |
0531ce1d XL |
1187 | #[inline] |
1188 | #[target_feature(enable = "sse")] | |
1189 | #[cfg_attr(test, assert_instr(movups))] | |
83c7162d | 1190 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1191 | pub unsafe fn _mm_loadu_ps(p: *const f32) -> __m128 { |
1192 | // Note: Using `*p` would require `f32` alignment, but `movups` has no | |
1193 | // alignment restrictions. | |
1194 | let mut dst = _mm_undefined_ps(); | |
1195 | ptr::copy_nonoverlapping( | |
1196 | p as *const u8, | |
1197 | &mut dst as *mut __m128 as *mut u8, | |
1198 | mem::size_of::<__m128>(), | |
1199 | ); | |
1200 | dst | |
1201 | } | |
1202 | ||
/// Loads four `f32` values from aligned memory into a `__m128` in reverse
/// order.
///
/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
/// protection fault will be triggered (fatal program crash).
///
/// Functionally equivalent to the following code sequence (assuming `p`
/// satisfies the alignment restrictions):
///
/// ```text
/// let a0 = *p;
/// let a1 = *p.offset(1);
/// let a2 = *p.offset(2);
/// let a3 = *p.offset(3);
/// __m128::new(a3, a2, a1, a0)
/// ```
///
/// This corresponds to instructions `VMOVAPS` / `MOVAPS` followed by some
/// shuffling.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_loadr_ps(p: *const f32) -> __m128 {
    // Aligned load, then reverse the four lanes with a shuffle.
    let a = _mm_load_ps(p);
    simd_shuffle4(a, a, [3, 2, 1, 0])
}
1232 | ||
3dfed10e XL |
1233 | /// Loads unaligned 64-bits of integer data from memory into new vector. |
1234 | /// | |
1235 | /// `mem_addr` does not need to be aligned on any particular boundary. | |
1236 | /// | |
1237 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64) | |
1238 | #[inline] | |
1239 | #[target_feature(enable = "sse")] | |
1240 | #[cfg_attr(all(test, not(target_arch = "x86")), assert_instr(movq))] | |
1241 | #[stable(feature = "simd_x86_mm_loadu_si64", since = "1.46.0")] | |
1242 | pub unsafe fn _mm_loadu_si64(mem_addr: *const u8) -> __m128i { | |
1243 | transmute(i64x2(0, ptr::read_unaligned(mem_addr as *const i64))) | |
1244 | } | |
1245 | ||
/// Stores the lowest 32 bit float of `a` into memory.
///
/// This intrinsic corresponds to the `MOVSS` instruction.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_store_ss(p: *mut f32, a: __m128) {
    // Write only lane 0; the remaining three lanes of `a` are ignored.
    *p = simd_extract(a, 0);
}
1258 | ||
/// Stores the lowest 32 bit float of `a` repeated four times into *aligned*
/// memory.
///
/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
/// protection fault will be triggered (fatal program crash).
///
/// Functionally equivalent to the following code sequence (assuming `p`
/// satisfies the alignment restrictions):
///
/// ```text
/// let x = a.extract(0);
/// *p = x;
/// *p.offset(1) = x;
/// *p.offset(2) = x;
/// *p.offset(3) = x;
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store1_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
// The `*mut f32 -> *mut __m128` cast is intentional: the caller guarantees
// 16-byte alignment (see doc above), so the lint is suppressed.
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_store1_ps(p: *mut f32, a: __m128) {
    // Broadcast lane 0 into all four lanes, then do one aligned 128-bit store.
    let b: __m128 = simd_shuffle4(a, a, [0, 0, 0, 0]);
    *(p as *mut __m128) = b;
}
1286 | ||
/// Alias for [`_mm_store1_ps`](fn._mm_store1_ps.html)
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps1)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_store_ps1(p: *mut f32, a: __m128) {
    // Intel defines both spellings; forward to the canonical one.
    _mm_store1_ps(p, a);
}
1297 | ||
/// Stores four 32-bit floats into *aligned* memory.
///
/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
/// protection fault will be triggered (fatal program crash).
///
/// Use [`_mm_storeu_ps`](fn._mm_storeu_ps.html) for potentially unaligned
/// memory.
///
/// This corresponds to instructions `VMOVAPS` / `MOVAPS`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
// Cast to `*mut __m128` is sound only because the caller guarantees 16-byte
// alignment (documented above); hence the lint suppression.
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_store_ps(p: *mut f32, a: __m128) {
    *(p as *mut __m128) = a;
}
1317 | ||
/// Stores four 32-bit floats into memory. There are no restrictions on memory
/// alignment. For aligned memory [`_mm_store_ps`](fn._mm_store_ps.html) may be
/// faster.
///
/// This corresponds to instructions `VMOVUPS` / `MOVUPS`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movups))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_storeu_ps(p: *mut f32, a: __m128) {
    // Byte-wise copy avoids any alignment requirement on `p`.
    ptr::copy_nonoverlapping(
        &a as *const __m128 as *const u8,
        p as *mut u8,
        mem::size_of::<__m128>(),
    );
}
1336 | ||
/// Stores four 32-bit floats into *aligned* memory in reverse order.
///
/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
/// protection fault will be triggered (fatal program crash).
///
/// Functionally equivalent to the following code sequence (assuming `p`
/// satisfies the alignment restrictions):
///
/// ```text
/// *p = a.extract(3);
/// *p.offset(1) = a.extract(2);
/// *p.offset(2) = a.extract(1);
/// *p.offset(3) = a.extract(0);
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
// Caller guarantees 16-byte alignment (see doc above), so the pointer cast
// below is sound and the lint is suppressed.
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_storer_ps(p: *mut f32, a: __m128) {
    // Reverse the lanes with a shuffle, then one aligned 128-bit store.
    let b: __m128 = simd_shuffle4(a, a, [3, 2, 1, 0]);
    *(p as *mut __m128) = b;
}
1362 | ||
/// Returns a `__m128` with the first component from `b` and the remaining
/// components from `a`.
///
/// In other words for any `a` and `b`:
/// ```text
/// _mm_move_ss(a, b) == a.replace(0, b.extract(0))
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_move_ss(a: __m128, b: __m128) -> __m128 {
    // Shuffle indices 0..=3 select from `a`, 4..=7 select from `b`;
    // index 4 here is lane 0 of `b`.
    simd_shuffle4(a, b, [4, 1, 2, 3])
}
1379 | ||
/// Performs a serializing operation on all store-to-memory instructions that
/// were issued prior to this instruction.
///
/// Guarantees that every store instruction that precedes, in program order, is
/// globally visible before any store instruction which follows the fence in
/// program order.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sfence)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(sfence))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sfence() {
    sfence()
}
1395 | ||
/// Gets the unsigned 32-bit value of the MXCSR control and status register.
///
/// For more info see [`_mm_setcsr`](fn._mm_setcsr.html)
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getcsr)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(stmxcsr))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_getcsr() -> u32 {
    // `stmxcsr` stores the 32-bit MXCSR into the pointed-to location; the
    // intrinsic declaration takes a raw `*mut i8`, hence the casts.
    let mut result = 0_i32;
    stmxcsr((&mut result) as *mut _ as *mut i8);
    result as u32
}
1410 | ||
/// Sets the MXCSR register with the 32-bit unsigned integer value.
///
/// This register controls how SIMD instructions handle floating point
/// operations. Modifying this register only affects the current thread.
///
/// It contains several groups of flags:
///
/// * *Exception flags* report which exceptions occurred since last they were
///   reset.
///
/// * *Masking flags* can be used to mask (ignore) certain exceptions. By
///   default these flags are all set to 1, so all exceptions are masked. When
///   an exception is masked, the processor simply sets the exception flag and
///   continues the operation. If the exception is unmasked, the flag is also
///   set but additionally an exception handler is invoked.
///
/// * *Rounding mode flags* control the rounding mode of floating point
///   instructions.
///
/// * The *denormals-are-zero mode flag* turns all numbers which would be
///   denormalized (exponent bits are all zeros) into zeros.
///
/// ## Exception Flags
///
/// * `_MM_EXCEPT_INVALID`: An invalid operation was performed (e.g., dividing
///   Infinity by Infinity).
///
/// * `_MM_EXCEPT_DENORM`: An operation attempted to operate on a denormalized
///   number. Mainly this can cause loss of precision.
///
/// * `_MM_EXCEPT_DIV_ZERO`: Division by zero occurred.
///
/// * `_MM_EXCEPT_OVERFLOW`: A numeric overflow exception occurred, i.e., a
///   result was too large to be represented (e.g., an `f32` with absolute
///   value greater than `2^128`).
///
/// * `_MM_EXCEPT_UNDERFLOW`: A numeric underflow exception occurred, i.e., a
///   result was too small to be represented in a normalized way (e.g., an
///   `f32` with absolute value smaller than `2^-126`.)
///
/// * `_MM_EXCEPT_INEXACT`: An inexact-result exception occurred (a.k.a.
///   precision exception). This means some precision was lost due to rounding.
///   For example, the fraction `1/3` cannot be represented accurately in a
///   32 or 64 bit float and computing it would cause this exception to be
///   raised. Precision exceptions are very common, so they are usually masked.
///
/// Exception flags can be read and set using the convenience functions
/// `_MM_GET_EXCEPTION_STATE` and `_MM_SET_EXCEPTION_STATE`. For example, to
/// check if an operation caused some overflow:
///
/// ```rust,ignore
/// _MM_SET_EXCEPTION_STATE(0); // clear all exception flags
/// // perform calculations
/// if _MM_GET_EXCEPTION_STATE() & _MM_EXCEPT_OVERFLOW != 0 {
///     // handle overflow
/// }
/// ```
///
/// ## Masking Flags
///
/// There is one masking flag for each exception flag: `_MM_MASK_INVALID`,
/// `_MM_MASK_DENORM`, `_MM_MASK_DIV_ZERO`, `_MM_MASK_OVERFLOW`,
/// `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`.
///
/// A single masking bit can be set via
///
/// ```rust,ignore
/// _MM_SET_EXCEPTION_MASK(_MM_MASK_UNDERFLOW);
/// ```
///
/// However, since mask bits are by default all set to 1, it is more common to
/// want to *disable* certain bits. For example, to unmask the underflow
/// exception, use:
///
/// ```rust,ignore
/// _mm_setcsr(_mm_getcsr() & !_MM_MASK_UNDERFLOW); // unmask underflow
/// // exception
/// ```
///
/// Warning: an unmasked exception will cause an exception handler to be
/// called. The standard handler will simply terminate the process. So, in
/// this case any underflow exception would terminate the current process with
/// something like `signal: 8, SIGFPE: erroneous arithmetic operation`.
///
/// ## Rounding Mode
///
/// The rounding mode is described using two bits. It can be read and set
/// using the convenience wrappers `_MM_GET_ROUNDING_MODE()` and
/// `_MM_SET_ROUNDING_MODE(mode)`.
///
/// The rounding modes are:
///
/// * `_MM_ROUND_NEAREST`: (default) Round to closest to the infinite
///   precision value. If two values are equally close, round to even (i.e.,
///   least significant bit will be zero).
///
/// * `_MM_ROUND_DOWN`: Round toward negative Infinity.
///
/// * `_MM_ROUND_UP`: Round toward positive Infinity.
///
/// * `_MM_ROUND_TOWARD_ZERO`: Round towards zero (truncate).
///
/// Example:
///
/// ```rust,ignore
/// _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN)
/// ```
///
/// ## Denormals-are-zero/Flush-to-zero Mode
///
/// If this bit is set, values that would be denormalized will be set to zero
/// instead. This is turned off by default.
///
/// You can read and enable/disable this mode via the helper functions
/// `_MM_GET_FLUSH_ZERO_MODE()` and `_MM_SET_FLUSH_ZERO_MODE()`:
///
/// ```rust,ignore
/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF); // turn off (default)
/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // turn on
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setcsr)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ldmxcsr))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_setcsr(val: u32) {
    // `ldmxcsr` loads MXCSR from memory; the intrinsic takes a raw `*const i8`.
    ldmxcsr(&val as *const _ as *const i8);
}
1545 | ||
// MXCSR bit masks. The groups below mirror the register layout documented in
// `_mm_setcsr`: exception status bits (0..=5), exception mask bits (7..=12),
// rounding-control bits (13..=14) and the flush-to-zero bit (15).

/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_INVALID: u32 = 0x0001;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_DENORM: u32 = 0x0002;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_DIV_ZERO: u32 = 0x0004;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_OVERFLOW: u32 = 0x0008;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_UNDERFLOW: u32 = 0x0010;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_INEXACT: u32 = 0x0020;
/// See [`_MM_GET_EXCEPTION_STATE`](fn._MM_GET_EXCEPTION_STATE.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_MASK: u32 = 0x003f;

/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_INVALID: u32 = 0x0080;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_DENORM: u32 = 0x0100;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_DIV_ZERO: u32 = 0x0200;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_OVERFLOW: u32 = 0x0400;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_UNDERFLOW: u32 = 0x0800;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_INEXACT: u32 = 0x1000;
/// See [`_MM_GET_EXCEPTION_MASK`](fn._MM_GET_EXCEPTION_MASK.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_MASK: u32 = 0x1f80;

/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_NEAREST: u32 = 0x0000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_DOWN: u32 = 0x2000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_UP: u32 = 0x4000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_TOWARD_ZERO: u32 = 0x6000;

/// See [`_MM_GET_ROUNDING_MODE`](fn._MM_GET_ROUNDING_MODE.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_MASK: u32 = 0x6000;

/// See [`_MM_GET_FLUSH_ZERO_MODE`](fn._MM_GET_FLUSH_ZERO_MODE.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FLUSH_ZERO_MASK: u32 = 0x8000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FLUSH_ZERO_ON: u32 = 0x8000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FLUSH_ZERO_OFF: u32 = 0x0000;
1616 | ||
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_EXCEPTION_MASK)
#[inline]
#[allow(non_snake_case)]
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _MM_GET_EXCEPTION_MASK() -> u32 {
    // Isolate the exception-mask bits (7..=12) of MXCSR.
    _mm_getcsr() & _MM_MASK_MASK
}
1627 | ||
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_EXCEPTION_STATE)
#[inline]
#[allow(non_snake_case)]
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _MM_GET_EXCEPTION_STATE() -> u32 {
    // Isolate the exception-status bits (0..=5) of MXCSR.
    _mm_getcsr() & _MM_EXCEPT_MASK
}
1638 | ||
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_FLUSH_ZERO_MODE)
#[inline]
#[allow(non_snake_case)]
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _MM_GET_FLUSH_ZERO_MODE() -> u32 {
    // Isolate the flush-to-zero bit (15) of MXCSR.
    _mm_getcsr() & _MM_FLUSH_ZERO_MASK
}
1649 | ||
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_ROUNDING_MODE)
#[inline]
#[allow(non_snake_case)]
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _MM_GET_ROUNDING_MODE() -> u32 {
    // Isolate the rounding-control bits (13..=14) of MXCSR.
    _mm_getcsr() & _MM_ROUND_MASK
}
1660 | ||
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_EXCEPTION_MASK)
#[inline]
#[allow(non_snake_case)]
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _MM_SET_EXCEPTION_MASK(x: u32) {
    // Clear the exception-mask bits, then merge in the caller's bits.
    _mm_setcsr((_mm_getcsr() & !_MM_MASK_MASK) | x)
}
1671 | ||
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_EXCEPTION_STATE)
#[inline]
#[allow(non_snake_case)]
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _MM_SET_EXCEPTION_STATE(x: u32) {
    // Clear the exception-status bits, then merge in the caller's bits.
    _mm_setcsr((_mm_getcsr() & !_MM_EXCEPT_MASK) | x)
}
1682 | ||
1683 | /// See [`_mm_setcsr`](fn._mm_setcsr.html) | |
83c7162d XL |
1684 | /// |
1685 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_FLUSH_ZERO_MODE) | |
0531ce1d XL |
1686 | #[inline] |
1687 | #[allow(non_snake_case)] | |
1688 | #[target_feature(enable = "sse")] | |
83c7162d | 1689 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1690 | pub unsafe fn _MM_SET_FLUSH_ZERO_MODE(x: u32) { |
1691 | let val = (_mm_getcsr() & !_MM_FLUSH_ZERO_MASK) | x; | |
1692 | // println!("setting csr={:x}", val); | |
1693 | _mm_setcsr(val) | |
1694 | } | |
1695 | ||
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_ROUNDING_MODE)
#[inline]
#[allow(non_snake_case)]
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _MM_SET_ROUNDING_MODE(x: u32) {
    // Clear the rounding-control bits, then merge in the caller's mode.
    _mm_setcsr((_mm_getcsr() & !_MM_ROUND_MASK) | x)
}
1706 | ||
// Prefetch locality hints for `_mm_prefetch`. Higher value = fetch into more
// cache levels; `NTA` requests a non-temporal (cache-bypassing) fetch.

/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_T0: i32 = 3;

/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_T1: i32 = 2;

/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_T2: i32 = 1;

/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_NTA: i32 = 0;
1722 | ||
/// Fetch the cache line that contains address `p` using the given `strategy`.
///
/// The `strategy` must be one of:
///
/// * [`_MM_HINT_T0`](constant._MM_HINT_T0.html): Fetch into all levels of the
///   cache hierarchy.
///
/// * [`_MM_HINT_T1`](constant._MM_HINT_T1.html): Fetch into L2 and higher.
///
/// * [`_MM_HINT_T2`](constant._MM_HINT_T2.html): Fetch into L3 and higher or
///   an implementation-specific choice (e.g., L2 if there is no L3).
///
/// * [`_MM_HINT_NTA`](constant._MM_HINT_NTA.html): Fetch data using the
///   non-temporal access (NTA) hint. It may be a place closer than main memory
///   but outside of the cache hierarchy. This is used to reduce access latency
///   without polluting the cache.
///
/// The actual implementation depends on the particular CPU. This instruction
/// is considered a hint, so the CPU is also free to simply ignore the request.
///
/// The amount of prefetched data depends on the cache line size of the
/// specific CPU, but it will be at least 32 bytes.
///
/// Common caveats:
///
/// * Most modern CPUs already automatically prefetch data based on predicted
///   access patterns.
///
/// * Data is usually not fetched if this would cause a TLB miss or a page
///   fault.
///
/// * Too much prefetching can cause unnecessary cache evictions.
///
/// * Prefetching may also fail if there are not enough memory-subsystem
///   resources (e.g., request buffers).
///
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_prefetch)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(prefetcht0, strategy = _MM_HINT_T0))]
#[cfg_attr(test, assert_instr(prefetcht1, strategy = _MM_HINT_T1))]
#[cfg_attr(test, assert_instr(prefetcht2, strategy = _MM_HINT_T2))]
#[cfg_attr(test, assert_instr(prefetchnta, strategy = _MM_HINT_NTA))]
#[rustc_args_required_const(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_prefetch(p: *const i8, strategy: i32) {
    // The `strategy` must be a compile-time constant, so we use a short form
    // of `constify_imm8!` for now.
    // We use the `llvm.prefetch` intrinsic with `rw` = 0 (read), and
    // `cache type` = 1 (data cache). `locality` is based on our `strategy`.
    macro_rules! pref {
        ($imm8:expr) => {
            match $imm8 {
                0 => prefetch(p, 0, 0, 1),
                1 => prefetch(p, 0, 1, 1),
                2 => prefetch(p, 0, 2, 1),
                _ => prefetch(p, 0, 3, 1),
            }
        };
    }
    pref!(strategy)
}
1786 | ||
/// Returns vector of type __m128 with undefined elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps)
#[inline]
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_undefined_ps() -> __m128 {
    // The contract only promises "undefined" contents; returning zeros is a
    // valid (and deterministic) choice. Callers must not rely on it.
    _mm_set1_ps(0.0)
}
1796 | ||
/// Transpose the 4x4 matrix formed by 4 rows of __m128 in place.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_TRANSPOSE4_PS)
#[inline]
#[allow(non_snake_case)]
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _MM_TRANSPOSE4_PS(
    row0: &mut __m128,
    row1: &mut __m128,
    row2: &mut __m128,
    row3: &mut __m128,
) {
    // Interleave low halves (tmp0, tmp2) and high halves (tmp1, tmp3) of the
    // two row pairs...
    let tmp0 = _mm_unpacklo_ps(*row0, *row1);
    let tmp2 = _mm_unpacklo_ps(*row2, *row3);
    let tmp1 = _mm_unpackhi_ps(*row0, *row1);
    let tmp3 = _mm_unpackhi_ps(*row2, *row3);

    // ...then recombine the 64-bit halves to produce the transposed rows.
    *row0 = _mm_movelh_ps(tmp0, tmp2);
    *row1 = _mm_movehl_ps(tmp2, tmp0);
    *row2 = _mm_movelh_ps(tmp1, tmp3);
    *row3 = _mm_movehl_ps(tmp3, tmp1);
}
1820 | ||
// Declarations of the LLVM intrinsics that back the SSE wrappers above.
// These are resolved by the compiler via `link_name`; they are not real
// C functions.
#[allow(improper_ctypes)]
extern "C" {
    #[link_name = "llvm.x86.sse.add.ss"]
    fn addss(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.sub.ss"]
    fn subss(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.mul.ss"]
    fn mulss(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.div.ss"]
    fn divss(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.sqrt.ss"]
    fn sqrtss(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.sqrt.ps"]
    fn sqrtps(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.rcp.ss"]
    fn rcpss(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.rcp.ps"]
    fn rcpps(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.rsqrt.ss"]
    fn rsqrtss(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.rsqrt.ps"]
    fn rsqrtps(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.min.ss"]
    fn minss(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.min.ps"]
    fn minps(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.max.ss"]
    fn maxss(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.max.ps"]
    fn maxps(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.movmsk.ps"]
    fn movmskps(a: __m128) -> i32;
    #[link_name = "llvm.x86.sse.cmp.ps"]
    fn cmpps(a: __m128, b: __m128, imm8: i8) -> __m128;
    #[link_name = "llvm.x86.sse.comieq.ss"]
    fn comieq_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.comilt.ss"]
    fn comilt_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.comile.ss"]
    fn comile_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.comigt.ss"]
    fn comigt_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.comige.ss"]
    fn comige_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.comineq.ss"]
    fn comineq_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomieq.ss"]
    fn ucomieq_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomilt.ss"]
    fn ucomilt_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomile.ss"]
    fn ucomile_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomigt.ss"]
    fn ucomigt_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomige.ss"]
    fn ucomige_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomineq.ss"]
    fn ucomineq_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.cvtss2si"]
    fn cvtss2si(a: __m128) -> i32;
    #[link_name = "llvm.x86.sse.cvttss2si"]
    fn cvttss2si(a: __m128) -> i32;
    #[link_name = "llvm.x86.sse.cvtsi2ss"]
    fn cvtsi2ss(a: __m128, b: i32) -> __m128;
    #[link_name = "llvm.x86.sse.sfence"]
    fn sfence();
    #[link_name = "llvm.x86.sse.stmxcsr"]
    fn stmxcsr(p: *mut i8);
    #[link_name = "llvm.x86.sse.ldmxcsr"]
    fn ldmxcsr(p: *const i8);
    #[link_name = "llvm.prefetch"]
    fn prefetch(p: *const i8, rw: i32, loc: i32, ty: i32);
    #[link_name = "llvm.x86.sse.cmp.ss"]
    fn cmpss(a: __m128, b: __m128, imm8: i8) -> __m128;
}
1896 | ||
/// Stores `a` into the memory at `mem_addr` using a non-temporal memory hint.
///
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
/// exception _may_ be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movntps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
// The `*mut f32 -> *mut __m128` cast below raises this clippy lint; it is
// fine here because the 16-byte alignment requirement is part of this
// intrinsic's documented contract (see above).
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: __m128) {
    // `nontemporal_store` emits `movntps`: a streaming store with a hint
    // that the data will not be re-read soon.
    intrinsics::nontemporal_store(mem_addr as *mut __m128, a);
}
1911 | ||
0531ce1d XL |
1912 | #[cfg(test)] |
1913 | mod tests { | |
48663c56 XL |
1914 | use crate::{hint::black_box, mem::transmute}; |
1915 | use std::{boxed, f32::NAN}; | |
416331ca | 1916 | use stdarch_test::simd_test; |
0531ce1d | 1917 | |
532ac7d7 | 1918 | use crate::core_arch::{simd::*, x86::*}; |
0531ce1d | 1919 | |
83c7162d | 1920 | #[simd_test(enable = "sse")] |
0531ce1d XL |
1921 | unsafe fn test_mm_add_ps() { |
1922 | let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); | |
1923 | let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); | |
1924 | let r = _mm_add_ps(a, b); | |
1925 | assert_eq_m128(r, _mm_setr_ps(-101.0, 25.0, 0.0, -15.0)); | |
1926 | } | |
1927 | ||
83c7162d | 1928 | #[simd_test(enable = "sse")] |
0531ce1d XL |
1929 | unsafe fn test_mm_add_ss() { |
1930 | let a = _mm_set_ps(-1.0, 5.0, 0.0, -10.0); | |
1931 | let b = _mm_set_ps(-100.0, 20.0, 0.0, -5.0); | |
1932 | let r = _mm_add_ss(a, b); | |
1933 | assert_eq_m128(r, _mm_set_ps(-1.0, 5.0, 0.0, -15.0)); | |
1934 | } | |
1935 | ||
83c7162d | 1936 | #[simd_test(enable = "sse")] |
0531ce1d XL |
1937 | unsafe fn test_mm_sub_ps() { |
1938 | let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); | |
1939 | let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); | |
1940 | let r = _mm_sub_ps(a, b); | |
1941 | assert_eq_m128(r, _mm_setr_ps(99.0, -15.0, 0.0, -5.0)); | |
1942 | } | |
1943 | ||
83c7162d | 1944 | #[simd_test(enable = "sse")] |
0531ce1d XL |
1945 | unsafe fn test_mm_sub_ss() { |
1946 | let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); | |
1947 | let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); | |
1948 | let r = _mm_sub_ss(a, b); | |
1949 | assert_eq_m128(r, _mm_setr_ps(99.0, 5.0, 0.0, -10.0)); | |
1950 | } | |
1951 | ||
83c7162d | 1952 | #[simd_test(enable = "sse")] |
0531ce1d XL |
1953 | unsafe fn test_mm_mul_ps() { |
1954 | let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); | |
1955 | let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); | |
1956 | let r = _mm_mul_ps(a, b); | |
1957 | assert_eq_m128(r, _mm_setr_ps(100.0, 100.0, 0.0, 50.0)); | |
1958 | } | |
1959 | ||
83c7162d | 1960 | #[simd_test(enable = "sse")] |
0531ce1d XL |
1961 | unsafe fn test_mm_mul_ss() { |
1962 | let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); | |
1963 | let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); | |
1964 | let r = _mm_mul_ss(a, b); | |
1965 | assert_eq_m128(r, _mm_setr_ps(100.0, 5.0, 0.0, -10.0)); | |
1966 | } | |
1967 | ||
83c7162d | 1968 | #[simd_test(enable = "sse")] |
0531ce1d XL |
1969 | unsafe fn test_mm_div_ps() { |
1970 | let a = _mm_setr_ps(-1.0, 5.0, 2.0, -10.0); | |
1971 | let b = _mm_setr_ps(-100.0, 20.0, 0.2, -5.0); | |
1972 | let r = _mm_div_ps(a, b); | |
1973 | assert_eq_m128(r, _mm_setr_ps(0.01, 0.25, 10.0, 2.0)); | |
1974 | } | |
1975 | ||
83c7162d | 1976 | #[simd_test(enable = "sse")] |
0531ce1d XL |
1977 | unsafe fn test_mm_div_ss() { |
1978 | let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); | |
1979 | let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); | |
1980 | let r = _mm_div_ss(a, b); | |
1981 | assert_eq_m128(r, _mm_setr_ps(0.01, 5.0, 0.0, -10.0)); | |
1982 | } | |
1983 | ||
83c7162d | 1984 | #[simd_test(enable = "sse")] |
0531ce1d XL |
1985 | unsafe fn test_mm_sqrt_ss() { |
1986 | let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); | |
1987 | let r = _mm_sqrt_ss(a); | |
1988 | let e = _mm_setr_ps(2.0, 13.0, 16.0, 100.0); | |
1989 | assert_eq_m128(r, e); | |
1990 | } | |
1991 | ||
83c7162d | 1992 | #[simd_test(enable = "sse")] |
0531ce1d XL |
1993 | unsafe fn test_mm_sqrt_ps() { |
1994 | let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); | |
1995 | let r = _mm_sqrt_ps(a); | |
1996 | let e = _mm_setr_ps(2.0, 3.6055512, 4.0, 10.0); | |
1997 | assert_eq_m128(r, e); | |
1998 | } | |
1999 | ||
83c7162d | 2000 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2001 | unsafe fn test_mm_rcp_ss() { |
2002 | let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); | |
2003 | let r = _mm_rcp_ss(a); | |
2004 | let e = _mm_setr_ps(0.24993896, 13.0, 16.0, 100.0); | |
2005 | assert_eq_m128(r, e); | |
2006 | } | |
2007 | ||
83c7162d | 2008 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2009 | unsafe fn test_mm_rcp_ps() { |
2010 | let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); | |
2011 | let r = _mm_rcp_ps(a); | |
2012 | let e = _mm_setr_ps(0.24993896, 0.0769043, 0.06248474, 0.0099983215); | |
2013 | let rel_err = 0.00048828125; | |
2014 | for i in 0..4 { | |
2015 | assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err); | |
2016 | } | |
2017 | } | |
2018 | ||
83c7162d | 2019 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2020 | unsafe fn test_mm_rsqrt_ss() { |
2021 | let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); | |
2022 | let r = _mm_rsqrt_ss(a); | |
2023 | let e = _mm_setr_ps(0.49987793, 13.0, 16.0, 100.0); | |
2024 | let rel_err = 0.00048828125; | |
2025 | for i in 0..4 { | |
2026 | assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err); | |
2027 | } | |
2028 | } | |
2029 | ||
83c7162d | 2030 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2031 | unsafe fn test_mm_rsqrt_ps() { |
2032 | let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0); | |
2033 | let r = _mm_rsqrt_ps(a); | |
2034 | let e = _mm_setr_ps(0.49987793, 0.2772827, 0.24993896, 0.099990845); | |
2035 | let rel_err = 0.00048828125; | |
2036 | for i in 0..4 { | |
2037 | assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err); | |
2038 | } | |
2039 | } | |
2040 | ||
83c7162d | 2041 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2042 | unsafe fn test_mm_min_ss() { |
2043 | let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); | |
2044 | let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); | |
2045 | let r = _mm_min_ss(a, b); | |
2046 | assert_eq_m128(r, _mm_setr_ps(-100.0, 5.0, 0.0, -10.0)); | |
2047 | } | |
2048 | ||
83c7162d | 2049 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2050 | unsafe fn test_mm_min_ps() { |
2051 | let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); | |
2052 | let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); | |
2053 | let r = _mm_min_ps(a, b); | |
2054 | assert_eq_m128(r, _mm_setr_ps(-100.0, 5.0, 0.0, -10.0)); | |
74b04a01 XL |
2055 | |
2056 | // `_mm_min_ps` can **not** be implemented using the `simd_min` rust intrinsic. `simd_min` | |
2057 | // is lowered by the llvm codegen backend to `llvm.minnum.v*` llvm intrinsic. This intrinsic | |
2058 | // doesn't specify how -0.0 is handled. Unfortunately it happens to behave different from | |
2059 | // the `minps` x86 instruction on x86. The `llvm.minnum.v*` llvm intrinsic equals | |
2060 | // `r1` to `a` and `r2` to `b`. | |
2061 | let a = _mm_setr_ps(-0.0, 0.0, 0.0, 0.0); | |
2062 | let b = _mm_setr_ps(0.0, 0.0, 0.0, 0.0); | |
2063 | let r1: [u8; 16] = transmute(_mm_min_ps(a, b)); | |
2064 | let r2: [u8; 16] = transmute(_mm_min_ps(b, a)); | |
2065 | let a: [u8; 16] = transmute(a); | |
2066 | let b: [u8; 16] = transmute(b); | |
2067 | assert_eq!(r1, b); | |
2068 | assert_eq!(r2, a); | |
2069 | assert_ne!(a, b); // sanity check that -0.0 is actually present | |
0531ce1d XL |
2070 | } |
2071 | ||
83c7162d | 2072 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2073 | unsafe fn test_mm_max_ss() { |
2074 | let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); | |
2075 | let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); | |
2076 | let r = _mm_max_ss(a, b); | |
2077 | assert_eq_m128(r, _mm_setr_ps(-1.0, 5.0, 0.0, -10.0)); | |
2078 | } | |
2079 | ||
83c7162d | 2080 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2081 | unsafe fn test_mm_max_ps() { |
2082 | let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0); | |
2083 | let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0); | |
2084 | let r = _mm_max_ps(a, b); | |
2085 | assert_eq_m128(r, _mm_setr_ps(-1.0, 20.0, 0.0, -5.0)); | |
2086 | } | |
2087 | ||
83c7162d | 2088 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2089 | unsafe fn test_mm_and_ps() { |
2090 | let a = transmute(u32x4::splat(0b0011)); | |
2091 | let b = transmute(u32x4::splat(0b0101)); | |
2092 | let r = _mm_and_ps(*black_box(&a), *black_box(&b)); | |
2093 | let e = transmute(u32x4::splat(0b0001)); | |
2094 | assert_eq_m128(r, e); | |
2095 | } | |
2096 | ||
83c7162d | 2097 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2098 | unsafe fn test_mm_andnot_ps() { |
2099 | let a = transmute(u32x4::splat(0b0011)); | |
2100 | let b = transmute(u32x4::splat(0b0101)); | |
2101 | let r = _mm_andnot_ps(*black_box(&a), *black_box(&b)); | |
2102 | let e = transmute(u32x4::splat(0b0100)); | |
2103 | assert_eq_m128(r, e); | |
2104 | } | |
2105 | ||
83c7162d | 2106 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2107 | unsafe fn test_mm_or_ps() { |
2108 | let a = transmute(u32x4::splat(0b0011)); | |
2109 | let b = transmute(u32x4::splat(0b0101)); | |
2110 | let r = _mm_or_ps(*black_box(&a), *black_box(&b)); | |
2111 | let e = transmute(u32x4::splat(0b0111)); | |
2112 | assert_eq_m128(r, e); | |
2113 | } | |
2114 | ||
83c7162d | 2115 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2116 | unsafe fn test_mm_xor_ps() { |
2117 | let a = transmute(u32x4::splat(0b0011)); | |
2118 | let b = transmute(u32x4::splat(0b0101)); | |
2119 | let r = _mm_xor_ps(*black_box(&a), *black_box(&b)); | |
2120 | let e = transmute(u32x4::splat(0b0110)); | |
2121 | assert_eq_m128(r, e); | |
2122 | } | |
2123 | ||
83c7162d | 2124 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2125 | unsafe fn test_mm_cmpeq_ss() { |
2126 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); | |
2127 | let b = _mm_setr_ps(-1.0, 5.0, 6.0, 7.0); | |
2128 | let r: u32x4 = transmute(_mm_cmpeq_ss(a, b)); | |
2129 | let e: u32x4 = transmute(_mm_setr_ps(transmute(0u32), 2.0, 3.0, 4.0)); | |
2130 | assert_eq!(r, e); | |
2131 | ||
2132 | let b2 = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); | |
2133 | let r2: u32x4 = transmute(_mm_cmpeq_ss(a, b2)); | |
0731742a | 2134 | let e2: u32x4 = transmute(_mm_setr_ps(transmute(0xffffffffu32), 2.0, 3.0, 4.0)); |
0531ce1d XL |
2135 | assert_eq!(r2, e2); |
2136 | } | |
2137 | ||
83c7162d | 2138 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2139 | unsafe fn test_mm_cmplt_ss() { |
2140 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); | |
2141 | let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); | |
2142 | let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); | |
2143 | let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); | |
2144 | ||
2145 | let b1 = 0u32; // a.extract(0) < b.extract(0) | |
2146 | let c1 = 0u32; // a.extract(0) < c.extract(0) | |
2147 | let d1 = !0u32; // a.extract(0) < d.extract(0) | |
2148 | ||
2149 | let rb: u32x4 = transmute(_mm_cmplt_ss(a, b)); | |
2150 | let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0)); | |
2151 | assert_eq!(rb, eb); | |
2152 | ||
2153 | let rc: u32x4 = transmute(_mm_cmplt_ss(a, c)); | |
2154 | let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0)); | |
2155 | assert_eq!(rc, ec); | |
2156 | ||
2157 | let rd: u32x4 = transmute(_mm_cmplt_ss(a, d)); | |
2158 | let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0)); | |
2159 | assert_eq!(rd, ed); | |
2160 | } | |
2161 | ||
83c7162d | 2162 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2163 | unsafe fn test_mm_cmple_ss() { |
2164 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); | |
2165 | let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); | |
2166 | let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); | |
2167 | let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); | |
2168 | ||
2169 | let b1 = 0u32; // a.extract(0) <= b.extract(0) | |
2170 | let c1 = !0u32; // a.extract(0) <= c.extract(0) | |
2171 | let d1 = !0u32; // a.extract(0) <= d.extract(0) | |
2172 | ||
2173 | let rb: u32x4 = transmute(_mm_cmple_ss(a, b)); | |
2174 | let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0)); | |
2175 | assert_eq!(rb, eb); | |
2176 | ||
2177 | let rc: u32x4 = transmute(_mm_cmple_ss(a, c)); | |
2178 | let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0)); | |
2179 | assert_eq!(rc, ec); | |
2180 | ||
2181 | let rd: u32x4 = transmute(_mm_cmple_ss(a, d)); | |
2182 | let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0)); | |
2183 | assert_eq!(rd, ed); | |
2184 | } | |
2185 | ||
83c7162d | 2186 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2187 | unsafe fn test_mm_cmpgt_ss() { |
2188 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); | |
2189 | let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); | |
2190 | let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); | |
2191 | let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); | |
2192 | ||
2193 | let b1 = !0u32; // a.extract(0) > b.extract(0) | |
2194 | let c1 = 0u32; // a.extract(0) > c.extract(0) | |
2195 | let d1 = 0u32; // a.extract(0) > d.extract(0) | |
2196 | ||
2197 | let rb: u32x4 = transmute(_mm_cmpgt_ss(a, b)); | |
2198 | let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0)); | |
2199 | assert_eq!(rb, eb); | |
2200 | ||
2201 | let rc: u32x4 = transmute(_mm_cmpgt_ss(a, c)); | |
2202 | let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0)); | |
2203 | assert_eq!(rc, ec); | |
2204 | ||
2205 | let rd: u32x4 = transmute(_mm_cmpgt_ss(a, d)); | |
2206 | let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0)); | |
2207 | assert_eq!(rd, ed); | |
2208 | } | |
2209 | ||
83c7162d | 2210 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2211 | unsafe fn test_mm_cmpge_ss() { |
2212 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); | |
2213 | let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); | |
2214 | let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); | |
2215 | let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); | |
2216 | ||
2217 | let b1 = !0u32; // a.extract(0) >= b.extract(0) | |
2218 | let c1 = !0u32; // a.extract(0) >= c.extract(0) | |
2219 | let d1 = 0u32; // a.extract(0) >= d.extract(0) | |
2220 | ||
2221 | let rb: u32x4 = transmute(_mm_cmpge_ss(a, b)); | |
2222 | let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0)); | |
2223 | assert_eq!(rb, eb); | |
2224 | ||
2225 | let rc: u32x4 = transmute(_mm_cmpge_ss(a, c)); | |
2226 | let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0)); | |
2227 | assert_eq!(rc, ec); | |
2228 | ||
2229 | let rd: u32x4 = transmute(_mm_cmpge_ss(a, d)); | |
2230 | let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0)); | |
2231 | assert_eq!(rd, ed); | |
2232 | } | |
2233 | ||
83c7162d | 2234 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2235 | unsafe fn test_mm_cmpneq_ss() { |
2236 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); | |
2237 | let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); | |
2238 | let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); | |
2239 | let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); | |
2240 | ||
2241 | let b1 = !0u32; // a.extract(0) != b.extract(0) | |
2242 | let c1 = 0u32; // a.extract(0) != c.extract(0) | |
2243 | let d1 = !0u32; // a.extract(0) != d.extract(0) | |
2244 | ||
2245 | let rb: u32x4 = transmute(_mm_cmpneq_ss(a, b)); | |
2246 | let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0)); | |
2247 | assert_eq!(rb, eb); | |
2248 | ||
2249 | let rc: u32x4 = transmute(_mm_cmpneq_ss(a, c)); | |
2250 | let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0)); | |
2251 | assert_eq!(rc, ec); | |
2252 | ||
2253 | let rd: u32x4 = transmute(_mm_cmpneq_ss(a, d)); | |
2254 | let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0)); | |
2255 | assert_eq!(rd, ed); | |
2256 | } | |
2257 | ||
83c7162d | 2258 | #[simd_test(enable = "sse")] |
0531ce1d | 2259 | unsafe fn test_mm_cmpnlt_ss() { |
532ac7d7 | 2260 | // TODO: this test is exactly the same as for `_mm_cmpge_ss`, but there |
0531ce1d XL |
2261 | // must be a difference. It may have to do with behavior in the |
2262 | // presence of NaNs (signaling or quiet). If so, we should add tests | |
2263 | // for those. | |
2264 | ||
2265 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); | |
2266 | let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); | |
2267 | let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); | |
2268 | let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); | |
2269 | ||
2270 | let b1 = !0u32; // a.extract(0) >= b.extract(0) | |
2271 | let c1 = !0u32; // a.extract(0) >= c.extract(0) | |
2272 | let d1 = 0u32; // a.extract(0) >= d.extract(0) | |
2273 | ||
2274 | let rb: u32x4 = transmute(_mm_cmpnlt_ss(a, b)); | |
2275 | let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0)); | |
2276 | assert_eq!(rb, eb); | |
2277 | ||
2278 | let rc: u32x4 = transmute(_mm_cmpnlt_ss(a, c)); | |
2279 | let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0)); | |
2280 | assert_eq!(rc, ec); | |
2281 | ||
2282 | let rd: u32x4 = transmute(_mm_cmpnlt_ss(a, d)); | |
2283 | let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0)); | |
2284 | assert_eq!(rd, ed); | |
2285 | } | |
2286 | ||
83c7162d | 2287 | #[simd_test(enable = "sse")] |
0531ce1d | 2288 | unsafe fn test_mm_cmpnle_ss() { |
532ac7d7 | 2289 | // TODO: this test is exactly the same as for `_mm_cmpgt_ss`, but there |
0531ce1d XL |
2290 | // must be a difference. It may have to do with behavior in the |
2291 | // presence | |
2292 | // of NaNs (signaling or quiet). If so, we should add tests for those. | |
2293 | ||
2294 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); | |
2295 | let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); | |
2296 | let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); | |
2297 | let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); | |
2298 | ||
2299 | let b1 = !0u32; // a.extract(0) > b.extract(0) | |
2300 | let c1 = 0u32; // a.extract(0) > c.extract(0) | |
2301 | let d1 = 0u32; // a.extract(0) > d.extract(0) | |
2302 | ||
2303 | let rb: u32x4 = transmute(_mm_cmpnle_ss(a, b)); | |
2304 | let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0)); | |
2305 | assert_eq!(rb, eb); | |
2306 | ||
2307 | let rc: u32x4 = transmute(_mm_cmpnle_ss(a, c)); | |
2308 | let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0)); | |
2309 | assert_eq!(rc, ec); | |
2310 | ||
2311 | let rd: u32x4 = transmute(_mm_cmpnle_ss(a, d)); | |
2312 | let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0)); | |
2313 | assert_eq!(rd, ed); | |
2314 | } | |
2315 | ||
83c7162d | 2316 | #[simd_test(enable = "sse")] |
0531ce1d | 2317 | unsafe fn test_mm_cmpngt_ss() { |
532ac7d7 | 2318 | // TODO: this test is exactly the same as for `_mm_cmple_ss`, but there |
0531ce1d XL |
2319 | // must be a difference. It may have to do with behavior in the |
2320 | // presence of NaNs (signaling or quiet). If so, we should add tests | |
2321 | // for those. | |
2322 | ||
2323 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); | |
2324 | let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); | |
2325 | let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); | |
2326 | let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); | |
2327 | ||
2328 | let b1 = 0u32; // a.extract(0) <= b.extract(0) | |
2329 | let c1 = !0u32; // a.extract(0) <= c.extract(0) | |
2330 | let d1 = !0u32; // a.extract(0) <= d.extract(0) | |
2331 | ||
2332 | let rb: u32x4 = transmute(_mm_cmpngt_ss(a, b)); | |
2333 | let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0)); | |
2334 | assert_eq!(rb, eb); | |
2335 | ||
2336 | let rc: u32x4 = transmute(_mm_cmpngt_ss(a, c)); | |
2337 | let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0)); | |
2338 | assert_eq!(rc, ec); | |
2339 | ||
2340 | let rd: u32x4 = transmute(_mm_cmpngt_ss(a, d)); | |
2341 | let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0)); | |
2342 | assert_eq!(rd, ed); | |
2343 | } | |
2344 | ||
83c7162d | 2345 | #[simd_test(enable = "sse")] |
0531ce1d | 2346 | unsafe fn test_mm_cmpnge_ss() { |
532ac7d7 | 2347 | // TODO: this test is exactly the same as for `_mm_cmplt_ss`, but there |
0531ce1d XL |
2348 | // must be a difference. It may have to do with behavior in the |
2349 | // presence of NaNs (signaling or quiet). If so, we should add tests | |
2350 | // for those. | |
2351 | ||
2352 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); | |
2353 | let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); | |
2354 | let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0); | |
2355 | let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); | |
2356 | ||
2357 | let b1 = 0u32; // a.extract(0) < b.extract(0) | |
2358 | let c1 = 0u32; // a.extract(0) < c.extract(0) | |
2359 | let d1 = !0u32; // a.extract(0) < d.extract(0) | |
2360 | ||
2361 | let rb: u32x4 = transmute(_mm_cmpnge_ss(a, b)); | |
2362 | let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0)); | |
2363 | assert_eq!(rb, eb); | |
2364 | ||
2365 | let rc: u32x4 = transmute(_mm_cmpnge_ss(a, c)); | |
2366 | let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0)); | |
2367 | assert_eq!(rc, ec); | |
2368 | ||
2369 | let rd: u32x4 = transmute(_mm_cmpnge_ss(a, d)); | |
2370 | let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0)); | |
2371 | assert_eq!(rd, ed); | |
2372 | } | |
2373 | ||
83c7162d | 2374 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2375 | unsafe fn test_mm_cmpord_ss() { |
2376 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); | |
2377 | let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); | |
2378 | let c = _mm_setr_ps(NAN, 5.0, 6.0, 7.0); | |
2379 | let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); | |
2380 | ||
2381 | let b1 = !0u32; // a.extract(0) ord b.extract(0) | |
2382 | let c1 = 0u32; // a.extract(0) ord c.extract(0) | |
2383 | let d1 = !0u32; // a.extract(0) ord d.extract(0) | |
2384 | ||
2385 | let rb: u32x4 = transmute(_mm_cmpord_ss(a, b)); | |
2386 | let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0)); | |
2387 | assert_eq!(rb, eb); | |
2388 | ||
2389 | let rc: u32x4 = transmute(_mm_cmpord_ss(a, c)); | |
2390 | let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0)); | |
2391 | assert_eq!(rc, ec); | |
2392 | ||
2393 | let rd: u32x4 = transmute(_mm_cmpord_ss(a, d)); | |
2394 | let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0)); | |
2395 | assert_eq!(rd, ed); | |
2396 | } | |
2397 | ||
83c7162d | 2398 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2399 | unsafe fn test_mm_cmpunord_ss() { |
2400 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); | |
2401 | let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0); | |
2402 | let c = _mm_setr_ps(NAN, 5.0, 6.0, 7.0); | |
2403 | let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0); | |
2404 | ||
2405 | let b1 = 0u32; // a.extract(0) unord b.extract(0) | |
2406 | let c1 = !0u32; // a.extract(0) unord c.extract(0) | |
2407 | let d1 = 0u32; // a.extract(0) unord d.extract(0) | |
2408 | ||
2409 | let rb: u32x4 = transmute(_mm_cmpunord_ss(a, b)); | |
2410 | let eb: u32x4 = transmute(_mm_setr_ps(transmute(b1), 2.0, 3.0, 4.0)); | |
2411 | assert_eq!(rb, eb); | |
2412 | ||
2413 | let rc: u32x4 = transmute(_mm_cmpunord_ss(a, c)); | |
2414 | let ec: u32x4 = transmute(_mm_setr_ps(transmute(c1), 2.0, 3.0, 4.0)); | |
2415 | assert_eq!(rc, ec); | |
2416 | ||
2417 | let rd: u32x4 = transmute(_mm_cmpunord_ss(a, d)); | |
2418 | let ed: u32x4 = transmute(_mm_setr_ps(transmute(d1), 2.0, 3.0, 4.0)); | |
2419 | assert_eq!(rd, ed); | |
2420 | } | |
2421 | ||
83c7162d | 2422 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2423 | unsafe fn test_mm_cmpeq_ps() { |
2424 | let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); | |
2425 | let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN); | |
2426 | let tru = !0u32; | |
2427 | let fls = 0u32; | |
2428 | ||
2429 | let e = u32x4::new(fls, fls, tru, fls); | |
2430 | let r: u32x4 = transmute(_mm_cmpeq_ps(a, b)); | |
2431 | assert_eq!(r, e); | |
2432 | } | |
2433 | ||
83c7162d | 2434 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2435 | unsafe fn test_mm_cmplt_ps() { |
2436 | let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); | |
2437 | let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN); | |
2438 | let tru = !0u32; | |
2439 | let fls = 0u32; | |
2440 | ||
2441 | let e = u32x4::new(tru, fls, fls, fls); | |
2442 | let r: u32x4 = transmute(_mm_cmplt_ps(a, b)); | |
2443 | assert_eq!(r, e); | |
2444 | } | |
2445 | ||
83c7162d | 2446 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2447 | unsafe fn test_mm_cmple_ps() { |
2448 | let a = _mm_setr_ps(10.0, 50.0, 1.0, 4.0); | |
2449 | let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN); | |
2450 | let tru = !0u32; | |
2451 | let fls = 0u32; | |
2452 | ||
2453 | let e = u32x4::new(tru, fls, tru, fls); | |
2454 | let r: u32x4 = transmute(_mm_cmple_ps(a, b)); | |
2455 | assert_eq!(r, e); | |
2456 | } | |
2457 | ||
83c7162d | 2458 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2459 | unsafe fn test_mm_cmpgt_ps() { |
2460 | let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); | |
2461 | let b = _mm_setr_ps(15.0, 20.0, 1.0, 42.0); | |
2462 | let tru = !0u32; | |
2463 | let fls = 0u32; | |
2464 | ||
2465 | let e = u32x4::new(fls, tru, fls, fls); | |
2466 | let r: u32x4 = transmute(_mm_cmpgt_ps(a, b)); | |
2467 | assert_eq!(r, e); | |
2468 | } | |
2469 | ||
83c7162d | 2470 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2471 | unsafe fn test_mm_cmpge_ps() { |
2472 | let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); | |
2473 | let b = _mm_setr_ps(15.0, 20.0, 1.0, 42.0); | |
2474 | let tru = !0u32; | |
2475 | let fls = 0u32; | |
2476 | ||
2477 | let e = u32x4::new(fls, tru, tru, fls); | |
2478 | let r: u32x4 = transmute(_mm_cmpge_ps(a, b)); | |
2479 | assert_eq!(r, e); | |
2480 | } | |
2481 | ||
83c7162d | 2482 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2483 | unsafe fn test_mm_cmpneq_ps() { |
2484 | let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); | |
2485 | let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN); | |
2486 | let tru = !0u32; | |
2487 | let fls = 0u32; | |
2488 | ||
2489 | let e = u32x4::new(tru, tru, fls, tru); | |
2490 | let r: u32x4 = transmute(_mm_cmpneq_ps(a, b)); | |
2491 | assert_eq!(r, e); | |
2492 | } | |
2493 | ||
83c7162d | 2494 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2495 | unsafe fn test_mm_cmpnlt_ps() { |
2496 | let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); | |
2497 | let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0); | |
2498 | let tru = !0u32; | |
2499 | let fls = 0u32; | |
2500 | ||
2501 | let e = u32x4::new(fls, tru, tru, tru); | |
2502 | let r: u32x4 = transmute(_mm_cmpnlt_ps(a, b)); | |
2503 | assert_eq!(r, e); | |
2504 | } | |
2505 | ||
83c7162d | 2506 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2507 | unsafe fn test_mm_cmpnle_ps() { |
2508 | let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); | |
2509 | let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0); | |
2510 | let tru = !0u32; | |
2511 | let fls = 0u32; | |
2512 | ||
2513 | let e = u32x4::new(fls, tru, fls, tru); | |
2514 | let r: u32x4 = transmute(_mm_cmpnle_ps(a, b)); | |
2515 | assert_eq!(r, e); | |
2516 | } | |
2517 | ||
83c7162d | 2518 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2519 | unsafe fn test_mm_cmpngt_ps() { |
2520 | let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); | |
2521 | let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0); | |
2522 | let tru = !0u32; | |
2523 | let fls = 0u32; | |
2524 | ||
2525 | let e = u32x4::new(tru, fls, tru, tru); | |
2526 | let r: u32x4 = transmute(_mm_cmpngt_ps(a, b)); | |
2527 | assert_eq!(r, e); | |
2528 | } | |
2529 | ||
83c7162d | 2530 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2531 | unsafe fn test_mm_cmpnge_ps() { |
2532 | let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN); | |
2533 | let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0); | |
2534 | let tru = !0u32; | |
2535 | let fls = 0u32; | |
2536 | ||
2537 | let e = u32x4::new(tru, fls, fls, tru); | |
2538 | let r: u32x4 = transmute(_mm_cmpnge_ps(a, b)); | |
2539 | assert_eq!(r, e); | |
2540 | } | |
2541 | ||
83c7162d | 2542 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2543 | unsafe fn test_mm_cmpord_ps() { |
2544 | let a = _mm_setr_ps(10.0, 50.0, NAN, NAN); | |
2545 | let b = _mm_setr_ps(15.0, NAN, 1.0, NAN); | |
2546 | let tru = !0u32; | |
2547 | let fls = 0u32; | |
2548 | ||
2549 | let e = u32x4::new(tru, fls, fls, fls); | |
2550 | let r: u32x4 = transmute(_mm_cmpord_ps(a, b)); | |
2551 | assert_eq!(r, e); | |
2552 | } | |
2553 | ||
83c7162d | 2554 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2555 | unsafe fn test_mm_cmpunord_ps() { |
2556 | let a = _mm_setr_ps(10.0, 50.0, NAN, NAN); | |
2557 | let b = _mm_setr_ps(15.0, NAN, 1.0, NAN); | |
2558 | let tru = !0u32; | |
2559 | let fls = 0u32; | |
2560 | ||
2561 | let e = u32x4::new(fls, tru, tru, tru); | |
2562 | let r: u32x4 = transmute(_mm_cmpunord_ps(a, b)); | |
2563 | assert_eq!(r, e); | |
2564 | } | |
2565 | ||
83c7162d | 2566 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2567 | unsafe fn test_mm_comieq_ss() { |
2568 | let aa = &[3.0f32, 12.0, 23.0, NAN]; | |
2569 | let bb = &[3.0f32, 47.5, 1.5, NAN]; | |
2570 | ||
2571 | let ee = &[1i32, 0, 0, 0]; | |
2572 | ||
2573 | for i in 0..4 { | |
2574 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); | |
2575 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); | |
2576 | ||
2577 | let r = _mm_comieq_ss(a, b); | |
2578 | ||
2579 | assert_eq!( | |
2580 | ee[i], r, | |
2581 | "_mm_comieq_ss({:?}, {:?}) = {}, expected: {} (i={})", | |
2582 | a, b, r, ee[i], i | |
2583 | ); | |
2584 | } | |
2585 | } | |
2586 | ||
83c7162d | 2587 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2588 | unsafe fn test_mm_comilt_ss() { |
2589 | let aa = &[3.0f32, 12.0, 23.0, NAN]; | |
2590 | let bb = &[3.0f32, 47.5, 1.5, NAN]; | |
2591 | ||
2592 | let ee = &[0i32, 1, 0, 0]; | |
2593 | ||
2594 | for i in 0..4 { | |
2595 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); | |
2596 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); | |
2597 | ||
2598 | let r = _mm_comilt_ss(a, b); | |
2599 | ||
2600 | assert_eq!( | |
2601 | ee[i], r, | |
2602 | "_mm_comilt_ss({:?}, {:?}) = {}, expected: {} (i={})", | |
2603 | a, b, r, ee[i], i | |
2604 | ); | |
2605 | } | |
2606 | } | |
2607 | ||
83c7162d | 2608 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2609 | unsafe fn test_mm_comile_ss() { |
2610 | let aa = &[3.0f32, 12.0, 23.0, NAN]; | |
2611 | let bb = &[3.0f32, 47.5, 1.5, NAN]; | |
2612 | ||
2613 | let ee = &[1i32, 1, 0, 0]; | |
2614 | ||
2615 | for i in 0..4 { | |
2616 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); | |
2617 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); | |
2618 | ||
2619 | let r = _mm_comile_ss(a, b); | |
2620 | ||
2621 | assert_eq!( | |
2622 | ee[i], r, | |
2623 | "_mm_comile_ss({:?}, {:?}) = {}, expected: {} (i={})", | |
2624 | a, b, r, ee[i], i | |
2625 | ); | |
2626 | } | |
2627 | } | |
2628 | ||
83c7162d | 2629 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2630 | unsafe fn test_mm_comigt_ss() { |
2631 | let aa = &[3.0f32, 12.0, 23.0, NAN]; | |
2632 | let bb = &[3.0f32, 47.5, 1.5, NAN]; | |
2633 | ||
2634 | let ee = &[1i32, 0, 1, 0]; | |
2635 | ||
2636 | for i in 0..4 { | |
2637 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); | |
2638 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); | |
2639 | ||
2640 | let r = _mm_comige_ss(a, b); | |
2641 | ||
2642 | assert_eq!( | |
2643 | ee[i], r, | |
2644 | "_mm_comige_ss({:?}, {:?}) = {}, expected: {} (i={})", | |
2645 | a, b, r, ee[i], i | |
2646 | ); | |
2647 | } | |
2648 | } | |
2649 | ||
83c7162d | 2650 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2651 | unsafe fn test_mm_comineq_ss() { |
2652 | let aa = &[3.0f32, 12.0, 23.0, NAN]; | |
2653 | let bb = &[3.0f32, 47.5, 1.5, NAN]; | |
2654 | ||
2655 | let ee = &[0i32, 1, 1, 1]; | |
2656 | ||
2657 | for i in 0..4 { | |
2658 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); | |
2659 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); | |
2660 | ||
2661 | let r = _mm_comineq_ss(a, b); | |
2662 | ||
2663 | assert_eq!( | |
2664 | ee[i], r, | |
2665 | "_mm_comineq_ss({:?}, {:?}) = {}, expected: {} (i={})", | |
2666 | a, b, r, ee[i], i | |
2667 | ); | |
2668 | } | |
2669 | } | |
2670 | ||
83c7162d | 2671 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2672 | unsafe fn test_mm_ucomieq_ss() { |
2673 | let aa = &[3.0f32, 12.0, 23.0, NAN]; | |
2674 | let bb = &[3.0f32, 47.5, 1.5, NAN]; | |
2675 | ||
2676 | let ee = &[1i32, 0, 0, 0]; | |
2677 | ||
2678 | for i in 0..4 { | |
2679 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); | |
2680 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); | |
2681 | ||
2682 | let r = _mm_ucomieq_ss(a, b); | |
2683 | ||
2684 | assert_eq!( | |
2685 | ee[i], r, | |
2686 | "_mm_ucomieq_ss({:?}, {:?}) = {}, expected: {} (i={})", | |
2687 | a, b, r, ee[i], i | |
2688 | ); | |
2689 | } | |
2690 | } | |
2691 | ||
83c7162d | 2692 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2693 | unsafe fn test_mm_ucomilt_ss() { |
2694 | let aa = &[3.0f32, 12.0, 23.0, NAN]; | |
2695 | let bb = &[3.0f32, 47.5, 1.5, NAN]; | |
2696 | ||
2697 | let ee = &[0i32, 1, 0, 0]; | |
2698 | ||
2699 | for i in 0..4 { | |
2700 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); | |
2701 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); | |
2702 | ||
2703 | let r = _mm_ucomilt_ss(a, b); | |
2704 | ||
2705 | assert_eq!( | |
2706 | ee[i], r, | |
2707 | "_mm_ucomilt_ss({:?}, {:?}) = {}, expected: {} (i={})", | |
2708 | a, b, r, ee[i], i | |
2709 | ); | |
2710 | } | |
2711 | } | |
2712 | ||
83c7162d | 2713 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2714 | unsafe fn test_mm_ucomile_ss() { |
2715 | let aa = &[3.0f32, 12.0, 23.0, NAN]; | |
2716 | let bb = &[3.0f32, 47.5, 1.5, NAN]; | |
2717 | ||
2718 | let ee = &[1i32, 1, 0, 0]; | |
2719 | ||
2720 | for i in 0..4 { | |
2721 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); | |
2722 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); | |
2723 | ||
2724 | let r = _mm_ucomile_ss(a, b); | |
2725 | ||
2726 | assert_eq!( | |
2727 | ee[i], r, | |
2728 | "_mm_ucomile_ss({:?}, {:?}) = {}, expected: {} (i={})", | |
2729 | a, b, r, ee[i], i | |
2730 | ); | |
2731 | } | |
2732 | } | |
2733 | ||
83c7162d | 2734 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2735 | unsafe fn test_mm_ucomigt_ss() { |
2736 | let aa = &[3.0f32, 12.0, 23.0, NAN]; | |
2737 | let bb = &[3.0f32, 47.5, 1.5, NAN]; | |
2738 | ||
2739 | let ee = &[0i32, 0, 1, 0]; | |
2740 | ||
2741 | for i in 0..4 { | |
2742 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); | |
2743 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); | |
2744 | ||
2745 | let r = _mm_ucomigt_ss(a, b); | |
2746 | ||
2747 | assert_eq!( | |
2748 | ee[i], r, | |
2749 | "_mm_ucomigt_ss({:?}, {:?}) = {}, expected: {} (i={})", | |
2750 | a, b, r, ee[i], i | |
2751 | ); | |
2752 | } | |
2753 | } | |
2754 | ||
83c7162d | 2755 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2756 | unsafe fn test_mm_ucomige_ss() { |
2757 | let aa = &[3.0f32, 12.0, 23.0, NAN]; | |
2758 | let bb = &[3.0f32, 47.5, 1.5, NAN]; | |
2759 | ||
2760 | let ee = &[1i32, 0, 1, 0]; | |
2761 | ||
2762 | for i in 0..4 { | |
2763 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); | |
2764 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); | |
2765 | ||
2766 | let r = _mm_ucomige_ss(a, b); | |
2767 | ||
2768 | assert_eq!( | |
2769 | ee[i], r, | |
2770 | "_mm_ucomige_ss({:?}, {:?}) = {}, expected: {} (i={})", | |
2771 | a, b, r, ee[i], i | |
2772 | ); | |
2773 | } | |
2774 | } | |
2775 | ||
83c7162d | 2776 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2777 | unsafe fn test_mm_ucomineq_ss() { |
2778 | let aa = &[3.0f32, 12.0, 23.0, NAN]; | |
2779 | let bb = &[3.0f32, 47.5, 1.5, NAN]; | |
2780 | ||
2781 | let ee = &[0i32, 1, 1, 1]; | |
2782 | ||
2783 | for i in 0..4 { | |
2784 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); | |
2785 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); | |
2786 | ||
2787 | let r = _mm_ucomineq_ss(a, b); | |
2788 | ||
2789 | assert_eq!( | |
2790 | ee[i], r, | |
2791 | "_mm_ucomineq_ss({:?}, {:?}) = {}, expected: {} (i={})", | |
2792 | a, b, r, ee[i], i | |
2793 | ); | |
2794 | } | |
2795 | } | |
2796 | ||
83c7162d | 2797 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2798 | unsafe fn test_mm_comieq_ss_vs_ucomieq_ss() { |
2799 | // If one of the arguments is a quiet NaN `comieq_ss` should signal an | |
2800 | // Invalid Operation Exception while `ucomieq_ss` should not. | |
2801 | let aa = &[3.0f32, NAN, 23.0, NAN]; | |
2802 | let bb = &[3.0f32, 47.5, NAN, NAN]; | |
2803 | ||
2804 | let ee = &[1i32, 0, 0, 0]; | |
2805 | let exc = &[0u32, 1, 1, 1]; // Should comieq_ss signal an exception? | |
2806 | ||
2807 | for i in 0..4 { | |
2808 | let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0); | |
2809 | let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0); | |
2810 | ||
2811 | _MM_SET_EXCEPTION_STATE(0); | |
2812 | let r1 = _mm_comieq_ss(*black_box(&a), b); | |
2813 | let s1 = _MM_GET_EXCEPTION_STATE(); | |
2814 | ||
2815 | _MM_SET_EXCEPTION_STATE(0); | |
2816 | let r2 = _mm_ucomieq_ss(*black_box(&a), b); | |
2817 | let s2 = _MM_GET_EXCEPTION_STATE(); | |
2818 | ||
2819 | assert_eq!( | |
2820 | ee[i], r1, | |
2821 | "_mm_comeq_ss({:?}, {:?}) = {}, expected: {} (i={})", | |
2822 | a, b, r1, ee[i], i | |
2823 | ); | |
2824 | assert_eq!( | |
2825 | ee[i], r2, | |
2826 | "_mm_ucomeq_ss({:?}, {:?}) = {}, expected: {} (i={})", | |
2827 | a, b, r2, ee[i], i | |
2828 | ); | |
2829 | assert_eq!( | |
2830 | s1, | |
2831 | exc[i] * _MM_EXCEPT_INVALID, | |
2832 | "_mm_comieq_ss() set exception flags: {} (i={})", | |
2833 | s1, | |
2834 | i | |
2835 | ); | |
2836 | assert_eq!( | |
2837 | s2, | |
2838 | 0, // ucomieq_ss should not signal an exception | |
2839 | "_mm_ucomieq_ss() set exception flags: {} (i={})", | |
2840 | s2, | |
2841 | i | |
2842 | ); | |
2843 | } | |
2844 | } | |
2845 | ||
83c7162d | 2846 | #[simd_test(enable = "sse")] |
0531ce1d | 2847 | unsafe fn test_mm_cvtss_si32() { |
8faf50e0 | 2848 | let inputs = &[42.0f32, -3.1, 4.0e10, 4.0e-20, NAN, 2147483500.1]; |
ba9703b0 | 2849 | let result = &[42i32, -3, i32::MIN, 0, i32::MIN, 2147483520]; |
0531ce1d XL |
2850 | for i in 0..inputs.len() { |
2851 | let x = _mm_setr_ps(inputs[i], 1.0, 3.0, 4.0); | |
2852 | let e = result[i]; | |
2853 | let r = _mm_cvtss_si32(x); | |
2854 | assert_eq!( | |
2855 | e, r, | |
2856 | "TestCase #{} _mm_cvtss_si32({:?}) = {}, expected: {}", | |
2857 | i, x, r, e | |
2858 | ); | |
2859 | } | |
2860 | } | |
2861 | ||
83c7162d | 2862 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2863 | unsafe fn test_mm_cvttss_si32() { |
2864 | let inputs = &[ | |
2865 | (42.0f32, 42i32), | |
2866 | (-31.4, -31), | |
2867 | (-33.5, -33), | |
2868 | (-34.5, -34), | |
2869 | (10.999, 10), | |
2870 | (-5.99, -5), | |
ba9703b0 | 2871 | (4.0e10, i32::MIN), |
0531ce1d | 2872 | (4.0e-10, 0), |
ba9703b0 | 2873 | (NAN, i32::MIN), |
0531ce1d XL |
2874 | (2147483500.1, 2147483520), |
2875 | ]; | |
2876 | for i in 0..inputs.len() { | |
2877 | let (xi, e) = inputs[i]; | |
2878 | let x = _mm_setr_ps(xi, 1.0, 3.0, 4.0); | |
2879 | let r = _mm_cvttss_si32(x); | |
2880 | assert_eq!( | |
2881 | e, r, | |
2882 | "TestCase #{} _mm_cvttss_si32({:?}) = {}, expected: {}", | |
2883 | i, x, r, e | |
2884 | ); | |
2885 | } | |
2886 | } | |
2887 | ||
83c7162d | 2888 | #[simd_test(enable = "sse")] |
e1599b0c | 2889 | unsafe fn test_mm_cvtsi32_ss() { |
0531ce1d XL |
2890 | let inputs = &[ |
2891 | (4555i32, 4555.0f32), | |
2892 | (322223333, 322223330.0), | |
2893 | (-432, -432.0), | |
2894 | (-322223333, -322223330.0), | |
2895 | ]; | |
2896 | ||
2897 | for i in 0..inputs.len() { | |
2898 | let (x, f) = inputs[i]; | |
2899 | let a = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); | |
2900 | let r = _mm_cvtsi32_ss(a, x); | |
2901 | let e = _mm_setr_ps(f, 6.0, 7.0, 8.0); | |
2902 | assert_eq_m128(e, r); | |
2903 | } | |
2904 | } | |
2905 | ||
83c7162d | 2906 | #[simd_test(enable = "sse")] |
e1599b0c | 2907 | unsafe fn test_mm_cvtss_f32() { |
0531ce1d XL |
2908 | let a = _mm_setr_ps(312.0134, 5.0, 6.0, 7.0); |
2909 | assert_eq!(_mm_cvtss_f32(a), 312.0134); | |
2910 | } | |
2911 | ||
83c7162d | 2912 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2913 | unsafe fn test_mm_set_ss() { |
2914 | let r = _mm_set_ss(black_box(4.25)); | |
2915 | assert_eq_m128(r, _mm_setr_ps(4.25, 0.0, 0.0, 0.0)); | |
2916 | } | |
2917 | ||
83c7162d | 2918 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2919 | unsafe fn test_mm_set1_ps() { |
2920 | let r1 = _mm_set1_ps(black_box(4.25)); | |
2921 | let r2 = _mm_set_ps1(black_box(4.25)); | |
2922 | assert_eq!(get_m128(r1, 0), 4.25); | |
2923 | assert_eq!(get_m128(r1, 1), 4.25); | |
2924 | assert_eq!(get_m128(r1, 2), 4.25); | |
2925 | assert_eq!(get_m128(r1, 3), 4.25); | |
2926 | assert_eq!(get_m128(r2, 0), 4.25); | |
2927 | assert_eq!(get_m128(r2, 1), 4.25); | |
2928 | assert_eq!(get_m128(r2, 2), 4.25); | |
2929 | assert_eq!(get_m128(r2, 3), 4.25); | |
2930 | } | |
2931 | ||
83c7162d | 2932 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2933 | unsafe fn test_mm_set_ps() { |
2934 | let r = _mm_set_ps( | |
2935 | black_box(1.0), | |
2936 | black_box(2.0), | |
2937 | black_box(3.0), | |
2938 | black_box(4.0), | |
2939 | ); | |
2940 | assert_eq!(get_m128(r, 0), 4.0); | |
2941 | assert_eq!(get_m128(r, 1), 3.0); | |
2942 | assert_eq!(get_m128(r, 2), 2.0); | |
2943 | assert_eq!(get_m128(r, 3), 1.0); | |
2944 | } | |
2945 | ||
83c7162d | 2946 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2947 | unsafe fn test_mm_setr_ps() { |
2948 | let r = _mm_setr_ps( | |
2949 | black_box(1.0), | |
2950 | black_box(2.0), | |
2951 | black_box(3.0), | |
2952 | black_box(4.0), | |
2953 | ); | |
2954 | assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 3.0, 4.0)); | |
2955 | } | |
2956 | ||
83c7162d | 2957 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2958 | unsafe fn test_mm_setzero_ps() { |
2959 | let r = *black_box(&_mm_setzero_ps()); | |
2960 | assert_eq_m128(r, _mm_set1_ps(0.0)); | |
2961 | } | |
2962 | ||
8faf50e0 XL |
2963 | #[simd_test(enable = "sse")] |
2964 | unsafe fn test_mm_shuffle() { | |
2965 | assert_eq!(_MM_SHUFFLE(0, 1, 1, 3), 0b00_01_01_11); | |
2966 | assert_eq!(_MM_SHUFFLE(3, 1, 1, 0), 0b11_01_01_00); | |
2967 | assert_eq!(_MM_SHUFFLE(1, 2, 2, 1), 0b01_10_10_01); | |
2968 | } | |
2969 | ||
83c7162d | 2970 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2971 | unsafe fn test_mm_shuffle_ps() { |
2972 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); | |
2973 | let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); | |
2974 | let r = _mm_shuffle_ps(a, b, 0b00_01_01_11); | |
2975 | assert_eq_m128(r, _mm_setr_ps(4.0, 2.0, 6.0, 5.0)); | |
2976 | } | |
2977 | ||
83c7162d | 2978 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2979 | unsafe fn test_mm_unpackhi_ps() { |
2980 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); | |
2981 | let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); | |
2982 | let r = _mm_unpackhi_ps(a, b); | |
2983 | assert_eq_m128(r, _mm_setr_ps(3.0, 7.0, 4.0, 8.0)); | |
2984 | } | |
2985 | ||
83c7162d | 2986 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2987 | unsafe fn test_mm_unpacklo_ps() { |
2988 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); | |
2989 | let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); | |
2990 | let r = _mm_unpacklo_ps(a, b); | |
2991 | assert_eq_m128(r, _mm_setr_ps(1.0, 5.0, 2.0, 6.0)); | |
2992 | } | |
2993 | ||
83c7162d | 2994 | #[simd_test(enable = "sse")] |
0531ce1d XL |
2995 | unsafe fn test_mm_movehl_ps() { |
2996 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); | |
2997 | let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); | |
2998 | let r = _mm_movehl_ps(a, b); | |
2999 | assert_eq_m128(r, _mm_setr_ps(7.0, 8.0, 3.0, 4.0)); | |
3000 | } | |
3001 | ||
83c7162d | 3002 | #[simd_test(enable = "sse")] |
0531ce1d XL |
3003 | unsafe fn test_mm_movelh_ps() { |
3004 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); | |
3005 | let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); | |
3006 | let r = _mm_movelh_ps(a, b); | |
3007 | assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 5.0, 6.0)); | |
3008 | } | |
3009 | ||
83c7162d | 3010 | #[simd_test(enable = "sse")] |
0531ce1d XL |
3011 | unsafe fn test_mm_load_ss() { |
3012 | let a = 42.0f32; | |
3013 | let r = _mm_load_ss(&a as *const f32); | |
3014 | assert_eq_m128(r, _mm_setr_ps(42.0, 0.0, 0.0, 0.0)); | |
3015 | } | |
3016 | ||
83c7162d | 3017 | #[simd_test(enable = "sse")] |
0531ce1d XL |
3018 | unsafe fn test_mm_load1_ps() { |
3019 | let a = 42.0f32; | |
3020 | let r = _mm_load1_ps(&a as *const f32); | |
3021 | assert_eq_m128(r, _mm_setr_ps(42.0, 42.0, 42.0, 42.0)); | |
3022 | } | |
3023 | ||
83c7162d | 3024 | #[simd_test(enable = "sse")] |
0531ce1d XL |
3025 | unsafe fn test_mm_load_ps() { |
3026 | let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; | |
3027 | ||
3028 | let mut p = vals.as_ptr(); | |
3029 | let mut fixup = 0.0f32; | |
3030 | ||
3031 | // Make sure p is aligned, otherwise we might get a | |
3032 | // (signal: 11, SIGSEGV: invalid memory reference) | |
3033 | ||
3034 | let unalignment = (p as usize) & 0xf; | |
3035 | if unalignment != 0 { | |
3036 | let delta = ((16 - unalignment) >> 2) as isize; | |
3037 | fixup = delta as f32; | |
3038 | p = p.offset(delta); | |
3039 | } | |
3040 | ||
3041 | let r = _mm_load_ps(p); | |
0731742a | 3042 | let e = _mm_add_ps(_mm_setr_ps(1.0, 2.0, 3.0, 4.0), _mm_set1_ps(fixup)); |
0531ce1d XL |
3043 | assert_eq_m128(r, e); |
3044 | } | |
3045 | ||
83c7162d | 3046 | #[simd_test(enable = "sse")] |
0531ce1d XL |
3047 | unsafe fn test_mm_loadu_ps() { |
3048 | let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; | |
3049 | let p = vals.as_ptr().offset(3); | |
3050 | let r = _mm_loadu_ps(black_box(p)); | |
3051 | assert_eq_m128(r, _mm_setr_ps(4.0, 5.0, 6.0, 7.0)); | |
3052 | } | |
3053 | ||
83c7162d | 3054 | #[simd_test(enable = "sse")] |
0531ce1d XL |
3055 | unsafe fn test_mm_loadr_ps() { |
3056 | let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; | |
3057 | ||
3058 | let mut p = vals.as_ptr(); | |
3059 | let mut fixup = 0.0f32; | |
3060 | ||
3061 | // Make sure p is aligned, otherwise we might get a | |
3062 | // (signal: 11, SIGSEGV: invalid memory reference) | |
3063 | ||
3064 | let unalignment = (p as usize) & 0xf; | |
3065 | if unalignment != 0 { | |
3066 | let delta = ((16 - unalignment) >> 2) as isize; | |
3067 | fixup = delta as f32; | |
3068 | p = p.offset(delta); | |
3069 | } | |
3070 | ||
3071 | let r = _mm_loadr_ps(p); | |
0731742a | 3072 | let e = _mm_add_ps(_mm_setr_ps(4.0, 3.0, 2.0, 1.0), _mm_set1_ps(fixup)); |
0531ce1d XL |
3073 | assert_eq_m128(r, e); |
3074 | } | |
3075 | ||
3dfed10e XL |
3076 | #[simd_test(enable = "sse2")] |
3077 | unsafe fn test_mm_loadu_si64() { | |
3078 | let a = _mm_setr_epi64x(5, 6); | |
3079 | let r = _mm_loadu_si64(&a as *const _ as *const _); | |
3080 | assert_eq_m128i(r, _mm_set_epi64x(5, 0)); | |
3081 | } | |
3082 | ||
83c7162d | 3083 | #[simd_test(enable = "sse")] |
0531ce1d XL |
3084 | unsafe fn test_mm_store_ss() { |
3085 | let mut vals = [0.0f32; 8]; | |
3086 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); | |
3087 | _mm_store_ss(vals.as_mut_ptr().offset(1), a); | |
3088 | ||
3089 | assert_eq!(vals[0], 0.0); | |
3090 | assert_eq!(vals[1], 1.0); | |
3091 | assert_eq!(vals[2], 0.0); | |
3092 | } | |
3093 | ||
83c7162d | 3094 | #[simd_test(enable = "sse")] |
0531ce1d XL |
3095 | unsafe fn test_mm_store1_ps() { |
3096 | let mut vals = [0.0f32; 8]; | |
3097 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); | |
3098 | ||
3099 | let mut ofs = 0; | |
3100 | let mut p = vals.as_mut_ptr(); | |
3101 | ||
3102 | if (p as usize) & 0xf != 0 { | |
3103 | ofs = (16 - (p as usize) & 0xf) >> 2; | |
3104 | p = p.offset(ofs as isize); | |
3105 | } | |
3106 | ||
3107 | _mm_store1_ps(p, *black_box(&a)); | |
3108 | ||
3109 | if ofs > 0 { | |
3110 | assert_eq!(vals[ofs - 1], 0.0); | |
3111 | } | |
3112 | assert_eq!(vals[ofs + 0], 1.0); | |
3113 | assert_eq!(vals[ofs + 1], 1.0); | |
3114 | assert_eq!(vals[ofs + 2], 1.0); | |
3115 | assert_eq!(vals[ofs + 3], 1.0); | |
3116 | assert_eq!(vals[ofs + 4], 0.0); | |
3117 | } | |
3118 | ||
83c7162d | 3119 | #[simd_test(enable = "sse")] |
0531ce1d XL |
3120 | unsafe fn test_mm_store_ps() { |
3121 | let mut vals = [0.0f32; 8]; | |
3122 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); | |
3123 | ||
3124 | let mut ofs = 0; | |
3125 | let mut p = vals.as_mut_ptr(); | |
3126 | ||
3127 | // Align p to 16-byte boundary | |
3128 | if (p as usize) & 0xf != 0 { | |
3129 | ofs = (16 - (p as usize) & 0xf) >> 2; | |
3130 | p = p.offset(ofs as isize); | |
3131 | } | |
3132 | ||
3133 | _mm_store_ps(p, *black_box(&a)); | |
3134 | ||
3135 | if ofs > 0 { | |
3136 | assert_eq!(vals[ofs - 1], 0.0); | |
3137 | } | |
3138 | assert_eq!(vals[ofs + 0], 1.0); | |
3139 | assert_eq!(vals[ofs + 1], 2.0); | |
3140 | assert_eq!(vals[ofs + 2], 3.0); | |
3141 | assert_eq!(vals[ofs + 3], 4.0); | |
3142 | assert_eq!(vals[ofs + 4], 0.0); | |
3143 | } | |
3144 | ||
83c7162d | 3145 | #[simd_test(enable = "sse")] |
0531ce1d XL |
3146 | unsafe fn test_mm_storer_ps() { |
3147 | let mut vals = [0.0f32; 8]; | |
3148 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); | |
3149 | ||
3150 | let mut ofs = 0; | |
3151 | let mut p = vals.as_mut_ptr(); | |
3152 | ||
3153 | // Align p to 16-byte boundary | |
3154 | if (p as usize) & 0xf != 0 { | |
3155 | ofs = (16 - (p as usize) & 0xf) >> 2; | |
3156 | p = p.offset(ofs as isize); | |
3157 | } | |
3158 | ||
3159 | _mm_storer_ps(p, *black_box(&a)); | |
3160 | ||
3161 | if ofs > 0 { | |
3162 | assert_eq!(vals[ofs - 1], 0.0); | |
3163 | } | |
3164 | assert_eq!(vals[ofs + 0], 4.0); | |
3165 | assert_eq!(vals[ofs + 1], 3.0); | |
3166 | assert_eq!(vals[ofs + 2], 2.0); | |
3167 | assert_eq!(vals[ofs + 3], 1.0); | |
3168 | assert_eq!(vals[ofs + 4], 0.0); | |
3169 | } | |
3170 | ||
83c7162d | 3171 | #[simd_test(enable = "sse")] |
0531ce1d XL |
3172 | unsafe fn test_mm_storeu_ps() { |
3173 | let mut vals = [0.0f32; 8]; | |
3174 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); | |
3175 | ||
3176 | let mut ofs = 0; | |
3177 | let mut p = vals.as_mut_ptr(); | |
3178 | ||
532ac7d7 | 3179 | // Make sure p is **not** aligned to 16-byte boundary |
0531ce1d XL |
3180 | if (p as usize) & 0xf == 0 { |
3181 | ofs = 1; | |
3182 | p = p.offset(1); | |
3183 | } | |
3184 | ||
3185 | _mm_storeu_ps(p, *black_box(&a)); | |
3186 | ||
3187 | if ofs > 0 { | |
3188 | assert_eq!(vals[ofs - 1], 0.0); | |
3189 | } | |
3190 | assert_eq!(vals[ofs + 0], 1.0); | |
3191 | assert_eq!(vals[ofs + 1], 2.0); | |
3192 | assert_eq!(vals[ofs + 2], 3.0); | |
3193 | assert_eq!(vals[ofs + 3], 4.0); | |
3194 | assert_eq!(vals[ofs + 4], 0.0); | |
3195 | } | |
3196 | ||
83c7162d | 3197 | #[simd_test(enable = "sse")] |
0531ce1d XL |
3198 | unsafe fn test_mm_move_ss() { |
3199 | let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); | |
3200 | let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); | |
3201 | ||
3202 | let r = _mm_move_ss(a, b); | |
3203 | let e = _mm_setr_ps(5.0, 2.0, 3.0, 4.0); | |
3204 | assert_eq_m128(e, r); | |
3205 | } | |
3206 | ||
83c7162d | 3207 | #[simd_test(enable = "sse")] |
0531ce1d XL |
3208 | unsafe fn test_mm_movemask_ps() { |
3209 | let r = _mm_movemask_ps(_mm_setr_ps(-1.0, 5.0, -5.0, 0.0)); | |
3210 | assert_eq!(r, 0b0101); | |
3211 | ||
3212 | let r = _mm_movemask_ps(_mm_setr_ps(-1.0, -5.0, -5.0, 0.0)); | |
3213 | assert_eq!(r, 0b0111); | |
3214 | } | |
3215 | ||
83c7162d | 3216 | #[simd_test(enable = "sse")] |
0531ce1d XL |
3217 | unsafe fn test_mm_sfence() { |
3218 | _mm_sfence(); | |
3219 | } | |
3220 | ||
83c7162d | 3221 | #[simd_test(enable = "sse")] |
0531ce1d XL |
3222 | unsafe fn test_mm_getcsr_setcsr_1() { |
3223 | let saved_csr = _mm_getcsr(); | |
3224 | ||
3225 | let a = _mm_setr_ps(1.1e-36, 0.0, 0.0, 1.0); | |
3226 | let b = _mm_setr_ps(0.001, 0.0, 0.0, 1.0); | |
3227 | ||
3228 | _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); | |
3229 | let r = _mm_mul_ps(*black_box(&a), *black_box(&b)); | |
3230 | ||
3231 | _mm_setcsr(saved_csr); | |
3232 | ||
3233 | let exp = _mm_setr_ps(0.0, 0.0, 0.0, 1.0); | |
3234 | assert_eq_m128(r, exp); // first component is a denormalized f32 | |
3235 | } | |
3236 | ||
83c7162d | 3237 | #[simd_test(enable = "sse")] |
0531ce1d XL |
3238 | unsafe fn test_mm_getcsr_setcsr_2() { |
3239 | // Same as _mm_setcsr_1 test, but with opposite flag value. | |
3240 | ||
3241 | let saved_csr = _mm_getcsr(); | |
3242 | ||
3243 | let a = _mm_setr_ps(1.1e-36, 0.0, 0.0, 1.0); | |
3244 | let b = _mm_setr_ps(0.001, 0.0, 0.0, 1.0); | |
3245 | ||
3246 | _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF); | |
3247 | let r = _mm_mul_ps(*black_box(&a), *black_box(&b)); | |
3248 | ||
3249 | _mm_setcsr(saved_csr); | |
3250 | ||
3251 | let exp = _mm_setr_ps(1.1e-39, 0.0, 0.0, 1.0); | |
3252 | assert_eq_m128(r, exp); // first component is a denormalized f32 | |
3253 | } | |
3254 | ||
83c7162d | 3255 | #[simd_test(enable = "sse")] |
0531ce1d XL |
3256 | unsafe fn test_mm_getcsr_setcsr_underflow() { |
3257 | _MM_SET_EXCEPTION_STATE(0); | |
3258 | ||
3259 | let a = _mm_setr_ps(1.1e-36, 0.0, 0.0, 1.0); | |
3260 | let b = _mm_setr_ps(1e-5, 0.0, 0.0, 1.0); | |
3261 | ||
3262 | assert_eq!(_MM_GET_EXCEPTION_STATE(), 0); // just to be sure | |
3263 | ||
3264 | let r = _mm_mul_ps(*black_box(&a), *black_box(&b)); | |
3265 | ||
3266 | let exp = _mm_setr_ps(1.1e-41, 0.0, 0.0, 1.0); | |
3267 | assert_eq_m128(r, exp); | |
3268 | ||
3269 | let underflow = _MM_GET_EXCEPTION_STATE() & _MM_EXCEPT_UNDERFLOW != 0; | |
3270 | assert_eq!(underflow, true); | |
3271 | } | |
3272 | ||
83c7162d | 3273 | #[simd_test(enable = "sse")] |
0531ce1d XL |
3274 | unsafe fn test_MM_TRANSPOSE4_PS() { |
3275 | let mut a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); | |
3276 | let mut b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0); | |
3277 | let mut c = _mm_setr_ps(9.0, 10.0, 11.0, 12.0); | |
3278 | let mut d = _mm_setr_ps(13.0, 14.0, 15.0, 16.0); | |
3279 | ||
3280 | _MM_TRANSPOSE4_PS(&mut a, &mut b, &mut c, &mut d); | |
3281 | ||
3282 | assert_eq_m128(a, _mm_setr_ps(1.0, 5.0, 9.0, 13.0)); | |
3283 | assert_eq_m128(b, _mm_setr_ps(2.0, 6.0, 10.0, 14.0)); | |
3284 | assert_eq_m128(c, _mm_setr_ps(3.0, 7.0, 11.0, 15.0)); | |
3285 | assert_eq_m128(d, _mm_setr_ps(4.0, 8.0, 12.0, 16.0)); | |
3286 | } | |
3287 | ||
3288 | #[repr(align(16))] | |
3289 | struct Memory { | |
3290 | pub data: [f32; 4], | |
3291 | } | |
3292 | ||
83c7162d | 3293 | #[simd_test(enable = "sse")] |
0531ce1d XL |
3294 | unsafe fn test_mm_stream_ps() { |
3295 | let a = _mm_set1_ps(7.0); | |
8faf50e0 | 3296 | let mut mem = Memory { data: [-1.0; 4] }; |
0531ce1d XL |
3297 | |
3298 | _mm_stream_ps(&mut mem.data[0] as *mut f32, a); | |
3299 | for i in 0..4 { | |
3300 | assert_eq!(mem.data[i], get_m128(a, i)); | |
3301 | } | |
3302 | } | |
0531ce1d | 3303 | } |