//! Advanced Vector Extensions (AVX)
//!
//! The references are:
//!
//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
//!   Instruction Set Reference, A-Z][intel64_ref].
//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
//!   System Instructions][amd64_ref].
//!
//! [Wikipedia][wiki] provides a quick overview of the instructions available.
//!
//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
//! [wiki]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions

use crate::{
    core_arch::{simd::*, simd_llvm::*, x86::*},
    intrinsics,
    mem::{self, transmute},
    ptr,
};

#[cfg(test)]
use stdarch_test::assert_instr;

/// Adds packed double-precision (64-bit) floating-point elements
/// in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_add_pd(a: __m256d, b: __m256d) -> __m256d {
    simd_add(a, b)
}
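
// NOTE: The module below is an illustrative usage sketch added to this
// listing; it is not part of the upstream crate. It mirrors the crate's
// `simd_test`-based test style and assumes an AVX-capable test machine.
#[cfg(test)]
mod usage_example_add_pd {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "avx")]
    unsafe fn add_pd_usage() {
        // Lane-wise sum of four f64 values.
        let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
        let b = _mm256_setr_pd(10.0, 20.0, 30.0, 40.0);
        let r = _mm256_add_pd(a, b);
        let mut out = [0.0f64; 4];
        _mm256_storeu_pd(out.as_mut_ptr(), r);
        assert_eq!(out, [11.0, 22.0, 33.0, 44.0]);
    }
}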

/// Adds packed single-precision (32-bit) floating-point elements in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_add_ps(a: __m256, b: __m256) -> __m256 {
    simd_add(a, b)
}

/// Computes the bitwise AND of packed double-precision (64-bit)
/// floating-point elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_pd)
#[inline]
#[target_feature(enable = "avx")]
// FIXME: Should be 'vandpd' instruction.
// See https://github.com/rust-lang/stdarch/issues/71
#[cfg_attr(test, assert_instr(vandps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_and_pd(a: __m256d, b: __m256d) -> __m256d {
    let a: u64x4 = transmute(a);
    let b: u64x4 = transmute(b);
    transmute(simd_and(a, b))
}

/// Computes the bitwise AND of packed single-precision (32-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vandps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_and_ps(a: __m256, b: __m256) -> __m256 {
    let a: u32x8 = transmute(a);
    let b: u32x8 = transmute(b);
    transmute(simd_and(a, b))
}

/// Computes the bitwise OR of packed double-precision (64-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_pd)
#[inline]
#[target_feature(enable = "avx")]
// FIXME: should be `vorpd` instruction.
// See <https://github.com/rust-lang/stdarch/issues/71>.
#[cfg_attr(test, assert_instr(vorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_or_pd(a: __m256d, b: __m256d) -> __m256d {
    let a: u64x4 = transmute(a);
    let b: u64x4 = transmute(b);
    transmute(simd_or(a, b))
}

/// Computes the bitwise OR of packed single-precision (32-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_or_ps(a: __m256, b: __m256) -> __m256 {
    let a: u32x8 = transmute(a);
    let b: u32x8 = transmute(b);
    transmute(simd_or(a, b))
}

/// Shuffles double-precision (64-bit) floating-point elements within 128-bit
/// lanes using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufpd, MASK = 3))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_shuffle_pd<const MASK: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_uimm_bits!(MASK, 8);
    simd_shuffle!(
        a,
        b,
        [
            MASK as u32 & 0b1,
            ((MASK as u32 >> 1) & 0b1) + 4,
            ((MASK as u32 >> 2) & 0b1) + 2,
            ((MASK as u32 >> 3) & 0b1) + 6,
        ],
    )
}
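
// NOTE: Illustrative usage sketch (not part of the upstream crate), mirroring
// the crate's `simd_test` style. Bits 0..=3 of `MASK` pick, in order,
// a-low, b-low, a-high, b-high within the two 128-bit lanes.
#[cfg(test)]
mod usage_example_shuffle_pd {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "avx")]
    unsafe fn shuffle_pd_usage() {
        let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
        let b = _mm256_setr_pd(10.0, 20.0, 30.0, 40.0);
        // MASK = 0b0101: dst = [a[1], b[0], a[3], b[2]].
        let r = _mm256_shuffle_pd::<0b0101>(a, b);
        let mut out = [0.0f64; 4];
        _mm256_storeu_pd(out.as_mut_ptr(), r);
        assert_eq!(out, [2.0, 10.0, 4.0, 30.0]);
    }
}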

/// Shuffles single-precision (32-bit) floating-point elements in `a` within
/// 128-bit lanes using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_shuffle_ps<const MASK: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(MASK, 8);
    simd_shuffle!(
        a,
        b,
        [
            MASK as u32 & 0b11,
            (MASK as u32 >> 2) & 0b11,
            ((MASK as u32 >> 4) & 0b11) + 8,
            ((MASK as u32 >> 6) & 0b11) + 8,
            (MASK as u32 & 0b11) + 4,
            ((MASK as u32 >> 2) & 0b11) + 4,
            ((MASK as u32 >> 4) & 0b11) + 12,
            ((MASK as u32 >> 6) & 0b11) + 12,
        ],
    )
}

/// Computes the bitwise NOT of packed double-precision (64-bit) floating-point
/// elements in `a`, and then AND with `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_pd)
#[inline]
#[target_feature(enable = "avx")]
// FIXME: should be `vandnpd` instruction.
#[cfg_attr(test, assert_instr(vandnps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_andnot_pd(a: __m256d, b: __m256d) -> __m256d {
    let a: u64x4 = transmute(a);
    let b: u64x4 = transmute(b);
    transmute(simd_and(simd_xor(u64x4::splat(!(0_u64)), a), b))
}

/// Computes the bitwise NOT of packed single-precision (32-bit) floating-point
/// elements in `a`, and then AND with `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vandnps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 {
    let a: u32x8 = transmute(a);
    let b: u32x8 = transmute(b);
    transmute(simd_and(simd_xor(u32x8::splat(!(0_u32)), a), b))
}
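
// NOTE: Illustrative usage sketch (not part of the upstream crate). A common
// andnot idiom: clear the sign bit of every lane to get absolute values.
#[cfg(test)]
mod usage_example_andnot_ps {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "avx")]
    unsafe fn andnot_ps_usage() {
        // `-0.0` has only the sign bit set, so `(!mask) & x` clears the sign.
        let sign_mask = _mm256_set1_ps(-0.0);
        let x = _mm256_setr_ps(-1.0, 2.0, -3.0, 4.0, -5.0, 6.0, -7.0, 8.0);
        let r = _mm256_andnot_ps(sign_mask, x);
        let mut out = [0.0f32; 8];
        _mm256_storeu_ps(out.as_mut_ptr(), r);
        assert_eq!(out, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
    }
}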

/// Compares packed double-precision (64-bit) floating-point elements
/// in `a` and `b`, and returns packed maximum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaxpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_max_pd(a: __m256d, b: __m256d) -> __m256d {
    vmaxpd(a, b)
}

/// Compares packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and returns packed maximum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaxps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_max_ps(a: __m256, b: __m256) -> __m256 {
    vmaxps(a, b)
}

/// Compares packed double-precision (64-bit) floating-point elements
/// in `a` and `b`, and returns packed minimum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vminpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_min_pd(a: __m256d, b: __m256d) -> __m256d {
    vminpd(a, b)
}

/// Compares packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and returns packed minimum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vminps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_min_ps(a: __m256, b: __m256) -> __m256 {
    vminps(a, b)
}

/// Multiplies packed double-precision (64-bit) floating-point elements
/// in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmulpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mul_pd(a: __m256d, b: __m256d) -> __m256d {
    simd_mul(a, b)
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmulps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mul_ps(a: __m256, b: __m256) -> __m256 {
    simd_mul(a, b)
}

/// Alternatively adds and subtracts packed double-precision (64-bit)
/// floating-point elements in `a` to/from packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_addsub_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d {
    addsubpd256(a, b)
}

/// Alternatively adds and subtracts packed single-precision (32-bit)
/// floating-point elements in `a` to/from packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_addsub_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_addsub_ps(a: __m256, b: __m256) -> __m256 {
    addsubps256(a, b)
}

/// Subtracts packed double-precision (64-bit) floating-point elements in `b`
/// from packed elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_sub_pd(a: __m256d, b: __m256d) -> __m256d {
    simd_sub(a, b)
}

/// Subtracts packed single-precision (32-bit) floating-point elements in `b`
/// from packed elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_sub_ps(a: __m256, b: __m256) -> __m256 {
    simd_sub(a, b)
}

/// Computes the division of each of the 8 packed 32-bit floating-point elements
/// in `a` by the corresponding packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_div_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vdivps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_div_ps(a: __m256, b: __m256) -> __m256 {
    simd_div(a, b)
}

/// Computes the division of each of the 4 packed 64-bit floating-point elements
/// in `a` by the corresponding packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_div_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vdivpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_div_pd(a: __m256d, b: __m256d) -> __m256d {
    simd_div(a, b)
}

/// Rounds packed double-precision (64-bit) floating point elements in `a`
/// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows:
///
/// - `0x00`: Round to the nearest whole number.
/// - `0x01`: Round down, toward negative infinity.
/// - `0x02`: Round up, toward positive infinity.
/// - `0x03`: Truncate the values.
///
/// For a complete list of options, check [the LLVM docs][llvm_docs].
///
/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_round_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundpd, ROUNDING = 0x3))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_round_pd<const ROUNDING: i32>(a: __m256d) -> __m256d {
    static_assert_uimm_bits!(ROUNDING, 4);
    roundpd256(a, ROUNDING)
}
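
// NOTE: Illustrative usage sketch (not part of the upstream crate), showing
// two of the `ROUNDING` flags documented above.
#[cfg(test)]
mod usage_example_round_pd {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "avx")]
    unsafe fn round_pd_usage() {
        let a = _mm256_setr_pd(1.55, 2.5, -1.55, -2.5);
        let mut out = [0.0f64; 4];

        // 0x03: truncate toward zero.
        _mm256_storeu_pd(out.as_mut_ptr(), _mm256_round_pd::<0x03>(a));
        assert_eq!(out, [1.0, 2.0, -1.0, -2.0]);

        // 0x00: round to the nearest whole number (ties to even).
        _mm256_storeu_pd(out.as_mut_ptr(), _mm256_round_pd::<0x00>(a));
        assert_eq!(out, [2.0, 2.0, -2.0, -2.0]);
    }
}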

/// Rounds packed double-precision (64-bit) floating point elements in `a`
/// toward positive infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_ceil_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_ceil_pd(a: __m256d) -> __m256d {
    simd_ceil(a)
}

/// Rounds packed double-precision (64-bit) floating point elements in `a`
/// toward negative infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_floor_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_floor_pd(a: __m256d) -> __m256d {
    simd_floor(a)
}

/// Rounds packed single-precision (32-bit) floating point elements in `a`
/// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows:
///
/// - `0x00`: Round to the nearest whole number.
/// - `0x01`: Round down, toward negative infinity.
/// - `0x02`: Round up, toward positive infinity.
/// - `0x03`: Truncate the values.
///
/// For a complete list of options, check [the LLVM docs][llvm_docs].
///
/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_round_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundps, ROUNDING = 0x00))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_round_ps<const ROUNDING: i32>(a: __m256) -> __m256 {
    static_assert_uimm_bits!(ROUNDING, 4);
    roundps256(a, ROUNDING)
}

/// Rounds packed single-precision (32-bit) floating point elements in `a`
/// toward positive infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_ceil_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_ceil_ps(a: __m256) -> __m256 {
    simd_ceil(a)
}

/// Rounds packed single-precision (32-bit) floating point elements in `a`
/// toward negative infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_floor_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_floor_ps(a: __m256) -> __m256 {
    simd_floor(a)
}

/// Returns the square root of packed single-precision (32-bit) floating point
/// elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sqrt_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsqrtps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_sqrt_ps(a: __m256) -> __m256 {
    sqrtps256(a)
}

/// Returns the square root of packed double-precision (64-bit) floating point
/// elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sqrt_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsqrtpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_sqrt_pd(a: __m256d) -> __m256d {
    simd_fsqrt(a)
}

/// Blends packed double-precision (64-bit) floating-point elements from
/// `a` and `b` using control mask `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_pd)
#[inline]
#[target_feature(enable = "avx")]
// Note: LLVM7 prefers single-precision blend instructions when
// possible, see: https://bugs.llvm.org/show_bug.cgi?id=38194
// #[cfg_attr(test, assert_instr(vblendpd, imm8 = 9))]
#[cfg_attr(test, assert_instr(vblendps, IMM4 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_blend_pd<const IMM4: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_uimm_bits!(IMM4, 4);
    simd_shuffle!(
        a,
        b,
        [
            ((IMM4 as u32 >> 0) & 1) * 4 + 0,
            ((IMM4 as u32 >> 1) & 1) * 4 + 1,
            ((IMM4 as u32 >> 2) & 1) * 4 + 2,
            ((IMM4 as u32 >> 3) & 1) * 4 + 3,
        ],
    )
}

/// Blends packed single-precision (32-bit) floating-point elements from
/// `a` and `b` using control mask `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vblendps, IMM8 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_blend_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM8, 8);
    simd_shuffle!(
        a,
        b,
        [
            ((IMM8 as u32 >> 0) & 1) * 8 + 0,
            ((IMM8 as u32 >> 1) & 1) * 8 + 1,
            ((IMM8 as u32 >> 2) & 1) * 8 + 2,
            ((IMM8 as u32 >> 3) & 1) * 8 + 3,
            ((IMM8 as u32 >> 4) & 1) * 8 + 4,
            ((IMM8 as u32 >> 5) & 1) * 8 + 5,
            ((IMM8 as u32 >> 6) & 1) * 8 + 6,
            ((IMM8 as u32 >> 7) & 1) * 8 + 7,
        ],
    )
}
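
// NOTE: Illustrative usage sketch (not part of the upstream crate). Bit `i`
// of the mask selects element `i` from `b` when set, from `a` when clear.
#[cfg(test)]
mod usage_example_blend_ps {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "avx")]
    unsafe fn blend_ps_usage() {
        let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm256_setr_ps(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        // 0b0101_0101: take even-indexed elements from `b`, odd-indexed from `a`.
        let r = _mm256_blend_ps::<0b0101_0101>(a, b);
        let mut out = [0.0f32; 8];
        _mm256_storeu_ps(out.as_mut_ptr(), r);
        assert_eq!(out, [9.0, 2.0, 11.0, 4.0, 13.0, 6.0, 15.0, 8.0]);
    }
}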

/// Blends packed double-precision (64-bit) floating-point elements from
/// `a` and `b` using `c` as a mask.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vblendvpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
    vblendvpd(a, b, c)
}

/// Blends packed single-precision (32-bit) floating-point elements from
/// `a` and `b` using `c` as a mask.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vblendvps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
    vblendvps(a, b, c)
}

/// Conditionally multiplies the packed single-precision (32-bit) floating-point
/// elements in `a` and `b` using the high 4 bits in `imm8`,
/// sums the four products, and conditionally returns the sum
/// using the low 4 bits of `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dp_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vdpps, IMM8 = 0x0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_dp_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM8, 8);
    vdpps(a, b, IMM8)
}
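
// NOTE: Illustrative usage sketch (not part of the upstream crate). The dot
// product is computed independently within each 128-bit lane: the high nibble
// of `IMM8` selects which products are summed, the low nibble selects which
// destination elements receive the sum (the rest are zeroed).
#[cfg(test)]
mod usage_example_dp_ps {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "avx")]
    unsafe fn dp_ps_usage() {
        let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm256_set1_ps(1.0);
        // 0xF1: multiply all four elements per lane, write the sum to element 0.
        let r = _mm256_dp_ps::<0xF1>(a, b);
        let mut out = [0.0f32; 8];
        _mm256_storeu_ps(out.as_mut_ptr(), r);
        assert_eq!(out, [10.0, 0.0, 0.0, 0.0, 26.0, 0.0, 0.0, 0.0]);
    }
}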

/// Horizontal addition of adjacent pairs in the two packed vectors
/// of 4 64-bit floating-point elements `a` and `b`.
/// In the result, sums of elements from `a` are returned in even locations,
/// while sums of elements from `b` are returned in odd locations.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhaddpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d {
    vhaddpd(a, b)
}

/// Horizontal addition of adjacent pairs in the two packed vectors
/// of 8 32-bit floating-point elements `a` and `b`.
/// In the result, sums of elements from `a` are returned in locations of
/// indices 0, 1, 4, 5; while sums of elements from `b` are returned in
/// locations 2, 3, 6, 7.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhaddps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 {
    vhaddps(a, b)
}
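
// NOTE: Illustrative usage sketch (not part of the upstream crate), showing
// the interleaved result layout described above.
#[cfg(test)]
mod usage_example_hadd_ps {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "avx")]
    unsafe fn hadd_ps_usage() {
        let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm256_setr_ps(10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0);
        let r = _mm256_hadd_ps(a, b);
        let mut out = [0.0f32; 8];
        _mm256_storeu_ps(out.as_mut_ptr(), r);
        // Sums from `a` land at indices 0, 1, 4, 5; sums from `b` at 2, 3, 6, 7.
        assert_eq!(out, [3.0, 7.0, 30.0, 70.0, 11.0, 15.0, 110.0, 150.0]);
    }
}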

/// Horizontal subtraction of adjacent pairs in the two packed vectors
/// of 4 64-bit floating-point elements `a` and `b`.
/// In the result, differences of elements from `a` are returned in even
/// locations, while differences of elements from `b` are returned in odd
/// locations.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d {
    vhsubpd(a, b)
}

/// Horizontal subtraction of adjacent pairs in the two packed vectors
/// of 8 32-bit floating-point elements `a` and `b`.
/// In the result, differences of elements from `a` are returned in locations
/// of indices 0, 1, 4, 5; while differences of elements from `b` are returned
/// in locations 2, 3, 6, 7.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_hsub_ps(a: __m256, b: __m256) -> __m256 {
    vhsubps(a, b)
}

/// Computes the bitwise XOR of packed double-precision (64-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_pd)
#[inline]
#[target_feature(enable = "avx")]
// FIXME: Should be 'vxorpd' instruction.
#[cfg_attr(test, assert_instr(vxorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_xor_pd(a: __m256d, b: __m256d) -> __m256d {
    let a: u64x4 = transmute(a);
    let b: u64x4 = transmute(b);
    transmute(simd_xor(a, b))
}

/// Computes the bitwise XOR of packed single-precision (32-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vxorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_xor_ps(a: __m256, b: __m256) -> __m256 {
    let a: u32x8 = transmute(a);
    let b: u32x8 = transmute(b);
    transmute(simd_xor(a, b))
}

/// Equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_OQ: i32 = 0x00;
/// Less-than (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LT_OS: i32 = 0x01;
/// Less-than-or-equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LE_OS: i32 = 0x02;
/// Unordered (non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_UNORD_Q: i32 = 0x03;
/// Not-equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_UQ: i32 = 0x04;
/// Not-less-than (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLT_US: i32 = 0x05;
/// Not-less-than-or-equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLE_US: i32 = 0x06;
/// Ordered (non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_ORD_Q: i32 = 0x07;
/// Equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_UQ: i32 = 0x08;
/// Not-greater-than-or-equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGE_US: i32 = 0x09;
/// Not-greater-than (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGT_US: i32 = 0x0a;
/// False (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_FALSE_OQ: i32 = 0x0b;
/// Not-equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_OQ: i32 = 0x0c;
/// Greater-than-or-equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GE_OS: i32 = 0x0d;
/// Greater-than (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GT_OS: i32 = 0x0e;
/// True (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_TRUE_UQ: i32 = 0x0f;
/// Equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_OS: i32 = 0x10;
/// Less-than (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LT_OQ: i32 = 0x11;
/// Less-than-or-equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LE_OQ: i32 = 0x12;
/// Unordered (signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_UNORD_S: i32 = 0x13;
/// Not-equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_US: i32 = 0x14;
/// Not-less-than (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLT_UQ: i32 = 0x15;
/// Not-less-than-or-equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLE_UQ: i32 = 0x16;
/// Ordered (signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_ORD_S: i32 = 0x17;
/// Equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_US: i32 = 0x18;
/// Not-greater-than-or-equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGE_UQ: i32 = 0x19;
/// Not-greater-than (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGT_UQ: i32 = 0x1a;
/// False (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_FALSE_OS: i32 = 0x1b;
/// Not-equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_OS: i32 = 0x1c;
/// Greater-than-or-equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GE_OQ: i32 = 0x1d;
/// Greater-than (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GT_OQ: i32 = 0x1e;
/// True (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_TRUE_US: i32 = 0x1f;

/// Compares packed double-precision (64-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `IMM5`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_pd)
#[inline]
#[target_feature(enable = "avx,sse2")]
#[cfg_attr(test, assert_instr(vcmpeqpd, IMM5 = 0))] // TODO Validate vcmppd
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmp_pd<const IMM5: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_uimm_bits!(IMM5, 5);
    vcmppd(a, b, IMM5 as i8)
}

/// Compares packed double-precision (64-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `IMM5`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqpd, IMM5 = 0))] // TODO Validate vcmppd
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cmp_pd<const IMM5: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_uimm_bits!(IMM5, 5);
    vcmppd256(a, b, IMM5 as u8)
}
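
// NOTE: Illustrative usage sketch (not part of the upstream crate). Matching
// lanes are set to all-ones, so `_mm256_movemask_pd` turns the comparison
// result into a compact bitmask.
#[cfg(test)]
mod usage_example_cmp_pd {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "avx")]
    unsafe fn cmp_pd_usage() {
        let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
        let b = _mm256_set1_pd(2.5);
        // Ordered, signaling less-than: lanes 0 and 1 satisfy `a < b`.
        let r = _mm256_cmp_pd::<_CMP_LT_OS>(a, b);
        assert_eq!(_mm256_movemask_pd(r), 0b0011);
    }
}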

/// Compares packed single-precision (32-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `IMM5`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ps)
#[inline]
#[target_feature(enable = "avx,sse")]
#[cfg_attr(test, assert_instr(vcmpeqps, IMM5 = 0))] // TODO Validate vcmpps
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmp_ps<const IMM5: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM5, 5);
    vcmpps(a, b, IMM5 as i8)
}

/// Compares packed single-precision (32-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `IMM5`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqps, IMM5 = 0))] // TODO Validate vcmpps
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cmp_ps<const IMM5: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM5, 5);
    vcmpps256(a, b, IMM5 as u8)
}

/// Compares the lower double-precision (64-bit) floating-point element in
/// `a` and `b` based on the comparison operand specified by `IMM5`,
/// stores the result in the lower element of the returned vector,
/// and copies the upper element from `a` to the upper element of the returned
/// vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_sd)
#[inline]
#[target_feature(enable = "avx,sse2")]
#[cfg_attr(test, assert_instr(vcmpeqsd, IMM5 = 0))] // TODO Validate vcmpsd
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmp_sd<const IMM5: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_uimm_bits!(IMM5, 5);
    vcmpsd(a, b, IMM5 as i8)
}

/// Compares the lower single-precision (32-bit) floating-point element in
/// `a` and `b` based on the comparison operand specified by `IMM5`,
/// stores the result in the lower element of the returned vector,
/// and copies the upper 3 packed elements from `a` to the upper elements of
/// the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ss)
#[inline]
#[target_feature(enable = "avx,sse")]
#[cfg_attr(test, assert_instr(vcmpeqss, IMM5 = 0))] // TODO Validate vcmpss
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmp_ss<const IMM5: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM5, 5);
    vcmpss(a, b, IMM5 as i8)
}

/// Converts packed 32-bit integers in `a` to packed double-precision (64-bit)
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtdq2pd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cvtepi32_pd(a: __m128i) -> __m256d {
    simd_cast(a.as_i32x4())
}

/// Converts packed 32-bit integers in `a` to packed single-precision (32-bit)
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtdq2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cvtepi32_ps(a: __m256i) -> __m256 {
    vcvtdq2ps(a.as_i32x8())
}

/// Converts packed double-precision (64-bit) floating-point elements in `a`
/// to packed single-precision (32-bit) floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtpd_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtpd2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cvtpd_ps(a: __m256d) -> __m128 {
    vcvtpd2ps(a)
}

/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtps_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtps2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cvtps_epi32(a: __m256) -> __m256i {
    transmute(vcvtps2dq(a))
}

/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed double-precision (64-bit) floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtps_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtps2pd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cvtps_pd(a: __m128) -> __m256d {
    simd_cast(a)
}

/// Converts packed double-precision (64-bit) floating-point elements in `a`
/// to packed 32-bit integers with truncation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttpd_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvttpd2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cvttpd_epi32(a: __m256d) -> __m128i {
    transmute(vcvttpd2dq(a))
}

/// Converts packed double-precision (64-bit) floating-point elements in `a`
/// to packed 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtpd_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtpd2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cvtpd_epi32(a: __m256d) -> __m128i {
    transmute(vcvtpd2dq(a))
}

/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed 32-bit integers with truncation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttps_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvttps2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cvttps_epi32(a: __m256) -> __m256i {
    transmute(vcvttps2dq(a))
}
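
// NOTE: Illustrative usage sketch (not part of the upstream crate),
// contrasting truncation with the default rounding conversion
// (`_mm256_cvtps_epi32`; round-to-nearest-even assuming the default MXCSR).
#[cfg(test)]
mod usage_example_cvttps_epi32 {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "avx")]
    unsafe fn cvttps_epi32_usage() {
        let a = _mm256_setr_ps(1.9, -1.9, 2.5, 3.5, 0.4, -0.4, 7.7, -7.7);
        let mut out = [0i32; 8];

        // Truncation drops the fractional part (rounds toward zero).
        let t = _mm256_cvttps_epi32(a);
        _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, t);
        assert_eq!(out, [1, -1, 2, 3, 0, 0, 7, -7]);

        // The rounding conversion rounds to the nearest integer (ties to even).
        let r = _mm256_cvtps_epi32(a);
        _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
        assert_eq!(out, [2, -2, 2, 4, 0, 0, 8, -8]);
    }
}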

/// Extracts 128 bits (composed of 4 packed single-precision (32-bit)
/// floating-point elements) from `a`, selected with `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(vextractf128, IMM1 = 1)
)]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_extractf128_ps<const IMM1: i32>(a: __m256) -> __m128 {
    static_assert_uimm_bits!(IMM1, 1);
    simd_shuffle!(
        a,
        _mm256_undefined_ps(),
        [[0, 1, 2, 3], [4, 5, 6, 7]][IMM1 as usize],
    )
}
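
// NOTE: Illustrative usage sketch (not part of the upstream crate): `IMM1`
// selects the low (0) or high (1) 128-bit half of the 256-bit vector.
#[cfg(test)]
mod usage_example_extractf128_ps {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "avx")]
    unsafe fn extractf128_ps_usage() {
        let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let hi = _mm256_extractf128_ps::<1>(a);
        let mut out = [0.0f32; 4];
        _mm_storeu_ps(out.as_mut_ptr(), hi);
        assert_eq!(out, [5.0, 6.0, 7.0, 8.0]);
    }
}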

/// Extracts 128 bits (composed of 2 packed double-precision (64-bit)
/// floating-point elements) from `a`, selected with `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(vextractf128, IMM1 = 1)
)]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_extractf128_pd<const IMM1: i32>(a: __m256d) -> __m128d {
    static_assert_uimm_bits!(IMM1, 1);
    simd_shuffle!(a, _mm256_undefined_pd(), [[0, 1], [2, 3]][IMM1 as usize])
}

/// Extracts 128 bits (composed of integer data) from `a`, selected with `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(vextractf128, IMM1 = 1)
)]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_extractf128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
    static_assert_uimm_bits!(IMM1, 1);
    let dst: i64x2 = simd_shuffle!(
        a.as_i64x4(),
        _mm256_undefined_si256().as_i64x4(),
        [[0, 1], [2, 3]][IMM1 as usize],
    );
    transmute(dst)
}

/// Zeroes the contents of all XMM or YMM registers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zeroall)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vzeroall))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_zeroall() {
    vzeroall()
}

/// Zeroes the upper 128 bits of all YMM registers;
/// the lower 128-bits of the registers are unmodified.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zeroupper)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vzeroupper))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_zeroupper() {
    vzeroupper()
}

/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_permutevar_ps(a: __m256, b: __m256i) -> __m256 {
    vpermilps256(a, b.as_i32x8())
}

/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// using the control in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutevar_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_permutevar_ps(a: __m128, b: __m128i) -> __m128 {
    vpermilps(a, b.as_i32x4())
}

/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilps, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_permute_ps<const IMM8: i32>(a: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM8, 8);
    simd_shuffle!(
        a,
        _mm256_undefined_ps(),
        [
            (IMM8 as u32 >> 0) & 0b11,
            (IMM8 as u32 >> 2) & 0b11,
            (IMM8 as u32 >> 4) & 0b11,
            (IMM8 as u32 >> 6) & 0b11,
            ((IMM8 as u32 >> 0) & 0b11) + 4,
            ((IMM8 as u32 >> 2) & 0b11) + 4,
            ((IMM8 as u32 >> 4) & 0b11) + 4,
            ((IMM8 as u32 >> 6) & 0b11) + 4,
        ],
    )
}
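
// NOTE: Illustrative usage sketch (not part of the upstream crate). Each pair
// of bits in `IMM8` picks a source element, applied to both 128-bit lanes.
#[cfg(test)]
mod usage_example_permute_ps {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    #[simd_test(enable = "avx")]
    unsafe fn permute_ps_usage() {
        let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        // 0b00_01_10_11 reverses the four elements within each 128-bit lane.
        let r = _mm256_permute_ps::<0b00_01_10_11>(a);
        let mut out = [0.0f32; 8];
        _mm256_storeu_ps(out.as_mut_ptr(), r);
        assert_eq!(out, [4.0, 3.0, 2.0, 1.0, 8.0, 7.0, 6.0, 5.0]);
    }
}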
1051 | ||
532ac7d7 | 1052 | /// Shuffles single-precision (32-bit) floating-point elements in `a` |
0531ce1d | 1053 | /// using the control in `imm8`. |
83c7162d | 1054 | /// |
353b0b11 | 1055 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permute_ps) |
0531ce1d XL |
1056 | #[inline] |
1057 | #[target_feature(enable = "avx,sse")] | |
17df50a5 XL |
1058 | #[cfg_attr(test, assert_instr(vpermilps, IMM8 = 9))] |
1059 | #[rustc_legacy_const_generics(1)] | |
1060 | #[stable(feature = "simd_x86", since = "1.27.0")] | |
1061 | pub unsafe fn _mm_permute_ps<const IMM8: i32>(a: __m128) -> __m128 { | |
353b0b11 FG |
1062 | static_assert_uimm_bits!(IMM8, 8); |
1063 | simd_shuffle!( | |
17df50a5 XL |
1064 | a, |
1065 | _mm_undefined_ps(), | |
353b0b11 | 1066 | [ |
17df50a5 XL |
1067 | (IMM8 as u32 >> 0) & 0b11, |
1068 | (IMM8 as u32 >> 2) & 0b11, | |
1069 | (IMM8 as u32 >> 4) & 0b11, | |
1070 | (IMM8 as u32 >> 6) & 0b11, | |
1071 | ], | |
1072 | ) | |
0531ce1d XL |
1073 | } |
1074 | ||
532ac7d7 | 1075 | /// Shuffles double-precision (64-bit) floating-point elements in `a` |
0531ce1d | 1076 | /// within 128-bit lanes using the control in `b`.
83c7162d | 1077 | /// |
353b0b11 | 1078 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar_pd) |
0531ce1d XL |
1079 | #[inline] |
1080 | #[target_feature(enable = "avx")] | |
1081 | #[cfg_attr(test, assert_instr(vpermilpd))] | |
83c7162d | 1082 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1083 | pub unsafe fn _mm256_permutevar_pd(a: __m256d, b: __m256i) -> __m256d { |
1084 | vpermilpd256(a, b.as_i64x4()) | |
1085 | } | |
1086 | ||
532ac7d7 | 1087 | /// Shuffles double-precision (64-bit) floating-point elements in `a` |
0531ce1d | 1088 | /// using the control in `b`. |
83c7162d | 1089 | /// |
353b0b11 | 1090 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutevar_pd) |
0531ce1d XL |
1091 | #[inline] |
1092 | #[target_feature(enable = "avx")] | |
1093 | #[cfg_attr(test, assert_instr(vpermilpd))] | |
83c7162d | 1094 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1095 | pub unsafe fn _mm_permutevar_pd(a: __m128d, b: __m128i) -> __m128d { |
1096 | vpermilpd(a, b.as_i64x2()) | |
1097 | } | |
1098 | ||
532ac7d7 | 1099 | /// Shuffles double-precision (64-bit) floating-point elements in `a` |
0531ce1d | 1100 | /// within 128-bit lanes using the control in `imm8`. |
83c7162d | 1101 | /// |
353b0b11 | 1102 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute_pd) |
0531ce1d XL |
1103 | #[inline] |
1104 | #[target_feature(enable = "avx")] | |
17df50a5 XL |
1105 | #[cfg_attr(test, assert_instr(vpermilpd, IMM4 = 0x1))] |
1106 | #[rustc_legacy_const_generics(1)] | |
1107 | #[stable(feature = "simd_x86", since = "1.27.0")] | |
1108 | pub unsafe fn _mm256_permute_pd<const IMM4: i32>(a: __m256d) -> __m256d { | |
353b0b11 FG |
1109 | static_assert_uimm_bits!(IMM4, 4); |
1110 | simd_shuffle!( | |
17df50a5 XL |
1111 | a, |
1112 | _mm256_undefined_pd(), | |
353b0b11 | 1113 | [ |
17df50a5 XL |
1114 | ((IMM4 as u32 >> 0) & 1), |
1115 | ((IMM4 as u32 >> 1) & 1), | |
1116 | ((IMM4 as u32 >> 2) & 1) + 2, | |
1117 | ((IMM4 as u32 >> 3) & 1) + 2, | |
1118 | ], | |
1119 | ) | |
0531ce1d XL |
1120 | } |
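// A minimal usage sketch (hypothetical test helper, not part of the original
// file): each bit of `IMM4` picks the lower (0) or upper (1) element of its
// 128-bit lane, bits 0-1 for the low lane and bits 2-3 for the high lane.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn permute_pd_example() {
    let a = _mm256_setr_pd(0., 1., 2., 3.);
    // 0b0101 swaps the two elements within each 128-bit lane.
    let r = _mm256_permute_pd::<0b0101>(a);
    let r: [f64; 4] = transmute(r);
    assert_eq!(r, [1., 0., 3., 2.]);
}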
1121 | ||
532ac7d7 | 1122 | /// Shuffles double-precision (64-bit) floating-point elements in `a` |
0531ce1d | 1123 | /// using the control in `imm8`. |
83c7162d | 1124 | /// |
353b0b11 | 1125 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permute_pd) |
0531ce1d XL |
1126 | #[inline] |
1127 | #[target_feature(enable = "avx,sse2")] | |
17df50a5 XL |
1128 | #[cfg_attr(test, assert_instr(vpermilpd, IMM2 = 0x1))] |
1129 | #[rustc_legacy_const_generics(1)] | |
1130 | #[stable(feature = "simd_x86", since = "1.27.0")] | |
1131 | pub unsafe fn _mm_permute_pd<const IMM2: i32>(a: __m128d) -> __m128d { | |
353b0b11 FG |
1132 | static_assert_uimm_bits!(IMM2, 2); |
1133 | simd_shuffle!( | |
17df50a5 XL |
1134 | a, |
1135 | _mm_undefined_pd(), | |
353b0b11 | 1136 | [(IMM2 as u32) & 1, (IMM2 as u32 >> 1) & 1], |
17df50a5 | 1137 | ) |
0531ce1d XL |
1138 | } |
1139 | ||
532ac7d7 | 1140 | /// Shuffles 256 bits (composed of 8 packed single-precision (32-bit) |
0531ce1d | 1141 | /// floating-point elements) selected by `imm8` from `a` and `b`. |
83c7162d | 1142 | /// |
353b0b11 | 1143 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_ps) |
0531ce1d XL |
1144 | #[inline] |
1145 | #[target_feature(enable = "avx")] | |
17df50a5 XL |
1146 | #[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x5))] |
1147 | #[rustc_legacy_const_generics(2)] | |
83c7162d | 1148 | #[stable(feature = "simd_x86", since = "1.27.0")] |
17df50a5 | 1149 | pub unsafe fn _mm256_permute2f128_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 { |
353b0b11 | 1150 | static_assert_uimm_bits!(IMM8, 8); |
17df50a5 | 1151 | vperm2f128ps256(a, b, IMM8 as i8) |
0531ce1d XL |
1152 | } |
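// A minimal usage sketch (hypothetical test helper; the bit layout described
// here follows Intel's documentation for `vperm2f128` and is not restated in
// this file): bits 1:0 of `IMM8` pick the 128-bit half written to the low half
// of the result (0/1 = low/high of `a`, 2/3 = low/high of `b`), bits 5:4 do the
// same for the high half, and bits 3 and 7 zero the corresponding half instead.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn permute2f128_ps_example() {
    let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
    let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
    // 0x20: low half of `a` into the low half, low half of `b` into the high half.
    let r = _mm256_permute2f128_ps::<0x20>(a, b);
    let r: [f32; 8] = transmute(r);
    assert_eq!(r, [1., 2., 3., 4., 9., 10., 11., 12.]);
}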
1153 | ||
532ac7d7 | 1154 | /// Shuffles 256 bits (composed of 4 packed double-precision (64-bit) |
0531ce1d | 1155 | /// floating-point elements) selected by `imm8` from `a` and `b`. |
83c7162d | 1156 | /// |
353b0b11 | 1157 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_pd) |
0531ce1d XL |
1158 | #[inline] |
1159 | #[target_feature(enable = "avx")] | |
17df50a5 XL |
1160 | #[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x31))] |
1161 | #[rustc_legacy_const_generics(2)] | |
83c7162d | 1162 | #[stable(feature = "simd_x86", since = "1.27.0")] |
17df50a5 | 1163 | pub unsafe fn _mm256_permute2f128_pd<const IMM8: i32>(a: __m256d, b: __m256d) -> __m256d { |
353b0b11 | 1164 | static_assert_uimm_bits!(IMM8, 8); |
17df50a5 | 1165 | vperm2f128pd256(a, b, IMM8 as i8) |
0531ce1d XL |
1166 | } |
1167 | ||
1b1a35ee | 1168 | /// Shuffles 128-bits (composed of integer data) selected by `imm8` |
0531ce1d | 1169 | /// from `a` and `b`. |
83c7162d | 1170 | /// |
353b0b11 | 1171 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_si256) |
0531ce1d XL |
1172 | #[inline] |
1173 | #[target_feature(enable = "avx")] | |
17df50a5 XL |
1174 | #[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x31))] |
1175 | #[rustc_legacy_const_generics(2)] | |
83c7162d | 1176 | #[stable(feature = "simd_x86", since = "1.27.0")] |
17df50a5 | 1177 | pub unsafe fn _mm256_permute2f128_si256<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i { |
353b0b11 | 1178 | static_assert_uimm_bits!(IMM8, 8); |
17df50a5 | 1179 | transmute(vperm2f128si256(a.as_i32x8(), b.as_i32x8(), IMM8 as i8)) |
0531ce1d XL |
1180 | } |
1181 | ||
532ac7d7 | 1182 | /// Broadcasts a single-precision (32-bit) floating-point element from memory |
0531ce1d | 1183 | /// to all elements of the returned vector. |
83c7162d | 1184 | /// |
353b0b11 | 1185 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_ss) |
0531ce1d XL |
1186 | #[inline] |
1187 | #[target_feature(enable = "avx")] | |
1188 | #[cfg_attr(test, assert_instr(vbroadcastss))] | |
83c7162d | 1189 | #[stable(feature = "simd_x86", since = "1.27.0")] |
48663c56 | 1190 | #[allow(clippy::trivially_copy_pass_by_ref)] |
0531ce1d XL |
1191 | pub unsafe fn _mm256_broadcast_ss(f: &f32) -> __m256 { |
1192 | _mm256_set1_ps(*f) | |
1193 | } | |
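// A minimal usage sketch (hypothetical test helper, not part of the original
// file): the single scalar behind the reference is replicated into all eight lanes.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn broadcast_ss_example() {
    let x = 3.5_f32;
    let r = _mm256_broadcast_ss(&x);
    let r: [f32; 8] = transmute(r);
    assert_eq!(r, [3.5; 8]);
}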
1194 | ||
532ac7d7 | 1195 | /// Broadcasts a single-precision (32-bit) floating-point element from memory |
0531ce1d | 1196 | /// to all elements of the returned vector. |
83c7162d | 1197 | /// |
353b0b11 | 1198 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcast_ss) |
0531ce1d XL |
1199 | #[inline] |
1200 | #[target_feature(enable = "avx")] | |
1201 | #[cfg_attr(test, assert_instr(vbroadcastss))] | |
83c7162d | 1202 | #[stable(feature = "simd_x86", since = "1.27.0")] |
48663c56 | 1203 | #[allow(clippy::trivially_copy_pass_by_ref)] |
0531ce1d XL |
1204 | pub unsafe fn _mm_broadcast_ss(f: &f32) -> __m128 { |
1205 | _mm_set1_ps(*f) | |
1206 | } | |
1207 | ||
532ac7d7 | 1208 | /// Broadcasts a double-precision (64-bit) floating-point element from memory |
0531ce1d | 1209 | /// to all elements of the returned vector. |
83c7162d | 1210 | /// |
353b0b11 | 1211 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_sd) |
0531ce1d XL |
1212 | #[inline] |
1213 | #[target_feature(enable = "avx")] | |
1214 | #[cfg_attr(test, assert_instr(vbroadcastsd))] | |
83c7162d | 1215 | #[stable(feature = "simd_x86", since = "1.27.0")] |
48663c56 | 1216 | #[allow(clippy::trivially_copy_pass_by_ref)] |
0531ce1d XL |
1217 | pub unsafe fn _mm256_broadcast_sd(f: &f64) -> __m256d { |
1218 | _mm256_set1_pd(*f) | |
1219 | } | |
1220 | ||
532ac7d7 | 1221 | /// Broadcasts 128 bits from memory (composed of 4 packed single-precision |
0531ce1d | 1222 | /// (32-bit) floating-point elements) to all elements of the returned vector. |
83c7162d | 1223 | /// |
353b0b11 | 1224 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_ps) |
0531ce1d XL |
1225 | #[inline] |
1226 | #[target_feature(enable = "avx")] | |
1227 | #[cfg_attr(test, assert_instr(vbroadcastf128))] | |
83c7162d | 1228 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1229 | pub unsafe fn _mm256_broadcast_ps(a: &__m128) -> __m256 { |
1230 | vbroadcastf128ps256(a) | |
1231 | } | |
1232 | ||
532ac7d7 | 1233 | /// Broadcasts 128 bits from memory (composed of 2 packed double-precision |
0531ce1d | 1234 | /// (64-bit) floating-point elements) to all elements of the returned vector. |
83c7162d | 1235 | /// |
353b0b11 | 1236 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_pd) |
0531ce1d XL |
1237 | #[inline] |
1238 | #[target_feature(enable = "avx")] | |
1239 | #[cfg_attr(test, assert_instr(vbroadcastf128))] | |
83c7162d | 1240 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1241 | pub unsafe fn _mm256_broadcast_pd(a: &__m128d) -> __m256d { |
1242 | vbroadcastf128pd256(a) | |
1243 | } | |
1244 | ||
532ac7d7 | 1245 | /// Copies `a` to result, then inserts 128 bits (composed of 4 packed |
0531ce1d XL |
1246 | /// single-precision (32-bit) floating-point elements) from `b` into result |
1247 | /// at the location specified by `imm8`. | |
83c7162d | 1248 | /// |
353b0b11 | 1249 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_ps) |
0531ce1d XL |
1250 | #[inline] |
1251 | #[target_feature(enable = "avx")] | |
0731742a XL |
1252 | #[cfg_attr( |
1253 | all(test, not(target_os = "windows")), | |
17df50a5 | 1254 | assert_instr(vinsertf128, IMM1 = 1) |
0731742a | 1255 | )] |
17df50a5 XL |
1256 | #[rustc_legacy_const_generics(2)] |
1257 | #[stable(feature = "simd_x86", since = "1.27.0")] | |
1258 | pub unsafe fn _mm256_insertf128_ps<const IMM1: i32>(a: __m256, b: __m128) -> __m256 { | |
353b0b11 FG |
1259 | static_assert_uimm_bits!(IMM1, 1); |
1260 | simd_shuffle!( | |
17df50a5 XL |
1261 | a, |
1262 | _mm256_castps128_ps256(b), | |
353b0b11 | 1263 | [[8, 9, 10, 11, 4, 5, 6, 7], [0, 1, 2, 3, 8, 9, 10, 11]][IMM1 as usize], |
17df50a5 | 1264 | ) |
0531ce1d XL |
1265 | } |
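// A minimal usage sketch (hypothetical test helper, not part of the original
// file): `IMM1` selects which 128-bit half of `a` is replaced by `b`
// (0 = low, 1 = high); the other half is copied through unchanged.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn insertf128_ps_example() {
    let a = _mm256_setr_ps(0., 1., 2., 3., 4., 5., 6., 7.);
    let b = _mm_setr_ps(10., 11., 12., 13.);
    let r = _mm256_insertf128_ps::<1>(a, b);
    let r: [f32; 8] = transmute(r);
    assert_eq!(r, [0., 1., 2., 3., 10., 11., 12., 13.]);
}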
1266 | ||
532ac7d7 | 1267 | /// Copies `a` to result, then inserts 128 bits (composed of 2 packed |
0531ce1d XL |
1268 | /// double-precision (64-bit) floating-point elements) from `b` into result |
1269 | /// at the location specified by `imm8`. | |
83c7162d | 1270 | /// |
353b0b11 | 1271 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_pd) |
0531ce1d XL |
1272 | #[inline] |
1273 | #[target_feature(enable = "avx")] | |
0731742a XL |
1274 | #[cfg_attr( |
1275 | all(test, not(target_os = "windows")), | |
17df50a5 | 1276 | assert_instr(vinsertf128, IMM1 = 1) |
0731742a | 1277 | )] |
17df50a5 XL |
1278 | #[rustc_legacy_const_generics(2)] |
1279 | #[stable(feature = "simd_x86", since = "1.27.0")] | |
1280 | pub unsafe fn _mm256_insertf128_pd<const IMM1: i32>(a: __m256d, b: __m128d) -> __m256d { | |
353b0b11 FG |
1281 | static_assert_uimm_bits!(IMM1, 1); |
1282 | simd_shuffle!( | |
17df50a5 XL |
1283 | a, |
1284 | _mm256_castpd128_pd256(b), | |
353b0b11 | 1285 | [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize], |
17df50a5 | 1286 | ) |
0531ce1d XL |
1287 | } |
1288 | ||
532ac7d7 | 1289 | /// Copies `a` to result, then inserts 128 bits from `b` into result |
0531ce1d | 1290 | /// at the location specified by `imm8`. |
83c7162d | 1291 | /// |
353b0b11 | 1292 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_si256) |
0531ce1d XL |
1293 | #[inline] |
1294 | #[target_feature(enable = "avx")] | |
0731742a XL |
1295 | #[cfg_attr( |
1296 | all(test, not(target_os = "windows")), | |
17df50a5 | 1297 | assert_instr(vinsertf128, IMM1 = 1) |
0731742a | 1298 | )] |
17df50a5 XL |
1299 | #[rustc_legacy_const_generics(2)] |
1300 | #[stable(feature = "simd_x86", since = "1.27.0")] | |
1301 | pub unsafe fn _mm256_insertf128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m256i { | |
353b0b11 FG |
1302 | static_assert_uimm_bits!(IMM1, 1); |
1303 | let dst: i64x4 = simd_shuffle!( | |
17df50a5 XL |
1304 | a.as_i64x4(), |
1305 | _mm256_castsi128_si256(b).as_i64x4(), | |
353b0b11 | 1306 | [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize], |
17df50a5 | 1307 | ); |
532ac7d7 | 1308 | transmute(dst) |
0531ce1d XL |
1309 | } |
1310 | ||
532ac7d7 | 1311 | /// Copies `a` to result, and inserts the 8-bit integer `i` into result |
0531ce1d | 1312 | /// at the location specified by `index`. |
83c7162d | 1313 | /// |
353b0b11 | 1314 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi8) |
0531ce1d XL |
1315 | #[inline] |
1316 | #[target_feature(enable = "avx")] | |
1317 | // This intrinsic has no corresponding instruction. | |
17df50a5 | 1318 | #[rustc_legacy_const_generics(2)] |
83c7162d | 1319 | #[stable(feature = "simd_x86", since = "1.27.0")] |
17df50a5 | 1320 | pub unsafe fn _mm256_insert_epi8<const INDEX: i32>(a: __m256i, i: i8) -> __m256i { |
353b0b11 | 1321 | static_assert_uimm_bits!(INDEX, 5); |
17df50a5 | 1322 | transmute(simd_insert(a.as_i8x32(), INDEX as u32, i)) |
0531ce1d XL |
1323 | } |
1324 | ||
532ac7d7 | 1325 | /// Copies `a` to result, and inserts the 16-bit integer `i` into result |
0531ce1d | 1326 | /// at the location specified by `index`. |
83c7162d | 1327 | /// |
353b0b11 | 1328 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi16) |
0531ce1d XL |
1329 | #[inline] |
1330 | #[target_feature(enable = "avx")] | |
1331 | // This intrinsic has no corresponding instruction. | |
17df50a5 | 1332 | #[rustc_legacy_const_generics(2)] |
83c7162d | 1333 | #[stable(feature = "simd_x86", since = "1.27.0")] |
17df50a5 | 1334 | pub unsafe fn _mm256_insert_epi16<const INDEX: i32>(a: __m256i, i: i16) -> __m256i { |
353b0b11 | 1335 | static_assert_uimm_bits!(INDEX, 4); |
17df50a5 | 1336 | transmute(simd_insert(a.as_i16x16(), INDEX as u32, i)) |
0531ce1d XL |
1337 | } |
1338 | ||
532ac7d7 | 1339 | /// Copies `a` to result, and inserts the 32-bit integer `i` into result |
0531ce1d | 1340 | /// at the location specified by `index`. |
83c7162d | 1341 | /// |
353b0b11 | 1342 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi32) |
0531ce1d XL |
1343 | #[inline] |
1344 | #[target_feature(enable = "avx")] | |
1345 | // This intrinsic has no corresponding instruction. | |
17df50a5 | 1346 | #[rustc_legacy_const_generics(2)] |
83c7162d | 1347 | #[stable(feature = "simd_x86", since = "1.27.0")] |
17df50a5 | 1348 | pub unsafe fn _mm256_insert_epi32<const INDEX: i32>(a: __m256i, i: i32) -> __m256i { |
353b0b11 | 1349 | static_assert_uimm_bits!(INDEX, 3); |
17df50a5 | 1350 | transmute(simd_insert(a.as_i32x8(), INDEX as u32, i)) |
0531ce1d XL |
1351 | } |
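// A minimal usage sketch (hypothetical test helper, not part of the original
// file): only the element at `INDEX` changes; everything else is copied from `a`.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn insert_epi32_example() {
    let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    let r = _mm256_insert_epi32::<7>(a, 99);
    let r: [i32; 8] = transmute(r);
    assert_eq!(r, [0, 1, 2, 3, 4, 5, 6, 99]);
}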
1352 | ||
532ac7d7 | 1353 | /// Loads 256-bits (composed of 4 packed double-precision (64-bit) |
0531ce1d XL |
1354 | /// floating-point elements) from memory into result. |
1355 | /// `mem_addr` must be aligned on a 32-byte boundary or a | |
1356 | /// general-protection exception may be generated. | |
83c7162d | 1357 | /// |
353b0b11 | 1358 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_pd) |
0531ce1d XL |
1359 | #[inline] |
1360 | #[target_feature(enable = "avx")] | |
1361 | #[cfg_attr(test, assert_instr(vmovaps))] // FIXME vmovapd expected | |
83c7162d | 1362 | #[stable(feature = "simd_x86", since = "1.27.0")] |
48663c56 | 1363 | #[allow(clippy::cast_ptr_alignment)] |
0531ce1d XL |
1364 | pub unsafe fn _mm256_load_pd(mem_addr: *const f64) -> __m256d { |
1365 | *(mem_addr as *const __m256d) | |
1366 | } | |
1367 | ||
532ac7d7 | 1368 | /// Stores 256-bits (composed of 4 packed double-precision (64-bit) |
0531ce1d XL |
1369 | /// floating-point elements) from `a` into memory. |
1370 | /// `mem_addr` must be aligned on a 32-byte boundary or a | |
1371 | /// general-protection exception may be generated. | |
83c7162d | 1372 | /// |
353b0b11 | 1373 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_pd) |
0531ce1d XL |
1374 | #[inline] |
1375 | #[target_feature(enable = "avx")] | |
1376 | #[cfg_attr(test, assert_instr(vmovaps))] // FIXME vmovapd expected | |
83c7162d | 1377 | #[stable(feature = "simd_x86", since = "1.27.0")] |
48663c56 | 1378 | #[allow(clippy::cast_ptr_alignment)] |
416331ca | 1379 | pub unsafe fn _mm256_store_pd(mem_addr: *mut f64, a: __m256d) { |
0531ce1d XL |
1380 | *(mem_addr as *mut __m256d) = a; |
1381 | } | |
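// A minimal sketch of the aligned load/store pair (hypothetical test helper,
// not part of the original file): the pointer must be 32-byte aligned, which
// `#[repr(align(32))]` on a local buffer guarantees.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn load_store_pd_example() {
    #[repr(align(32))]
    struct Aligned([f64; 4]);
    let mut buf = Aligned([1.0, 2.0, 3.0, 4.0]);
    let v = _mm256_load_pd(buf.0.as_ptr());
    let doubled = _mm256_add_pd(v, v);
    _mm256_store_pd(buf.0.as_mut_ptr(), doubled);
    assert_eq!(buf.0, [2.0, 4.0, 6.0, 8.0]);
}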
1382 | ||
532ac7d7 | 1383 | /// Loads 256-bits (composed of 8 packed single-precision (32-bit) |
0531ce1d XL |
1384 | /// floating-point elements) from memory into result. |
1385 | /// `mem_addr` must be aligned on a 32-byte boundary or a | |
1386 | /// general-protection exception may be generated. | |
83c7162d | 1387 | /// |
353b0b11 | 1388 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_ps) |
0531ce1d XL |
1389 | #[inline] |
1390 | #[target_feature(enable = "avx")] | |
1391 | #[cfg_attr(test, assert_instr(vmovaps))] | |
83c7162d | 1392 | #[stable(feature = "simd_x86", since = "1.27.0")] |
48663c56 | 1393 | #[allow(clippy::cast_ptr_alignment)] |
0531ce1d XL |
1394 | pub unsafe fn _mm256_load_ps(mem_addr: *const f32) -> __m256 { |
1395 | *(mem_addr as *const __m256) | |
1396 | } | |
1397 | ||
532ac7d7 | 1398 | /// Stores 256-bits (composed of 8 packed single-precision (32-bit) |
0531ce1d XL |
1399 | /// floating-point elements) from `a` into memory. |
1400 | /// `mem_addr` must be aligned on a 32-byte boundary or a | |
1401 | /// general-protection exception may be generated. | |
83c7162d | 1402 | /// |
353b0b11 | 1403 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_ps) |
0531ce1d XL |
1404 | #[inline] |
1405 | #[target_feature(enable = "avx")] | |
1406 | #[cfg_attr(test, assert_instr(vmovaps))] | |
83c7162d | 1407 | #[stable(feature = "simd_x86", since = "1.27.0")] |
48663c56 | 1408 | #[allow(clippy::cast_ptr_alignment)] |
416331ca | 1409 | pub unsafe fn _mm256_store_ps(mem_addr: *mut f32, a: __m256) { |
0531ce1d XL |
1410 | *(mem_addr as *mut __m256) = a; |
1411 | } | |
1412 | ||
532ac7d7 | 1413 | /// Loads 256-bits (composed of 4 packed double-precision (64-bit) |
0531ce1d XL |
1414 | /// floating-point elements) from memory into result. |
1415 | /// `mem_addr` does not need to be aligned on any particular boundary. | |
83c7162d | 1416 | /// |
353b0b11 | 1417 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_pd) |
0531ce1d XL |
1418 | #[inline] |
1419 | #[target_feature(enable = "avx")] | |
1420 | #[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovupd expected | |
83c7162d | 1421 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1422 | pub unsafe fn _mm256_loadu_pd(mem_addr: *const f64) -> __m256d { |
1423 | let mut dst = _mm256_undefined_pd(); | |
1424 | ptr::copy_nonoverlapping( | |
1425 | mem_addr as *const u8, | |
1426 | &mut dst as *mut __m256d as *mut u8, | |
1427 | mem::size_of::<__m256d>(), | |
1428 | ); | |
1429 | dst | |
1430 | } | |
1431 | ||
532ac7d7 | 1432 | /// Stores 256-bits (composed of 4 packed double-precision (64-bit) |
0531ce1d XL |
1433 | /// floating-point elements) from `a` into memory. |
1434 | /// `mem_addr` does not need to be aligned on any particular boundary. | |
83c7162d | 1435 | /// |
353b0b11 | 1436 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_pd) |
0531ce1d XL |
1437 | #[inline] |
1438 | #[target_feature(enable = "avx")] | |
1439 | #[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovupd expected | |
83c7162d | 1440 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1441 | pub unsafe fn _mm256_storeu_pd(mem_addr: *mut f64, a: __m256d) { |
1442 | storeupd256(mem_addr, a); | |
1443 | } | |
1444 | ||
532ac7d7 | 1445 | /// Loads 256-bits (composed of 8 packed single-precision (32-bit) |
0531ce1d XL |
1446 | /// floating-point elements) from memory into result. |
1447 | /// `mem_addr` does not need to be aligned on any particular boundary. | |
83c7162d | 1448 | /// |
353b0b11 | 1449 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_ps) |
0531ce1d XL |
1450 | #[inline] |
1451 | #[target_feature(enable = "avx")] | |
1452 | #[cfg_attr(test, assert_instr(vmovups))] | |
83c7162d | 1453 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1454 | pub unsafe fn _mm256_loadu_ps(mem_addr: *const f32) -> __m256 { |
1455 | let mut dst = _mm256_undefined_ps(); | |
1456 | ptr::copy_nonoverlapping( | |
1457 | mem_addr as *const u8, | |
1458 | &mut dst as *mut __m256 as *mut u8, | |
1459 | mem::size_of::<__m256>(), | |
1460 | ); | |
1461 | dst | |
1462 | } | |
1463 | ||
532ac7d7 | 1464 | /// Stores 256-bits (composed of 8 packed single-precision (32-bit) |
0531ce1d XL |
1465 | /// floating-point elements) from `a` into memory. |
1466 | /// `mem_addr` does not need to be aligned on any particular boundary. | |
83c7162d | 1467 | /// |
353b0b11 | 1468 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_ps) |
0531ce1d XL |
1469 | #[inline] |
1470 | #[target_feature(enable = "avx")] | |
1471 | #[cfg_attr(test, assert_instr(vmovups))] | |
83c7162d | 1472 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1473 | pub unsafe fn _mm256_storeu_ps(mem_addr: *mut f32, a: __m256) { |
1474 | storeups256(mem_addr, a); | |
1475 | } | |
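// A minimal usage sketch (hypothetical test helper, not part of the original
// file): the unaligned variants accept any address, so a round trip through a
// plain `[f32; 8]` needs no alignment attribute.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn loadu_storeu_ps_example() {
    let src = [0.5_f32, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5];
    let v = _mm256_loadu_ps(src.as_ptr());
    let mut dst = [0.0_f32; 8];
    _mm256_storeu_ps(dst.as_mut_ptr(), v);
    assert_eq!(src, dst);
}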
1476 | ||
532ac7d7 | 1477 | /// Loads 256-bits of integer data from memory into result. |
0531ce1d XL |
1478 | /// `mem_addr` must be aligned on a 32-byte boundary or a |
1479 | /// general-protection exception may be generated. | |
83c7162d | 1480 | /// |
353b0b11 | 1481 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_si256) |
0531ce1d XL |
1482 | #[inline] |
1483 | #[target_feature(enable = "avx")] | |
1484 | #[cfg_attr(test, assert_instr(vmovaps))] // FIXME vmovdqa expected | |
83c7162d | 1485 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1486 | pub unsafe fn _mm256_load_si256(mem_addr: *const __m256i) -> __m256i { |
1487 | *mem_addr | |
1488 | } | |
1489 | ||
532ac7d7 | 1490 | /// Stores 256-bits of integer data from `a` into memory. |
0531ce1d XL |
1491 | /// `mem_addr` must be aligned on a 32-byte boundary or a |
1492 | /// general-protection exception may be generated. | |
83c7162d | 1493 | /// |
353b0b11 | 1494 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_si256) |
0531ce1d XL |
1495 | #[inline] |
1496 | #[target_feature(enable = "avx")] | |
1497 | #[cfg_attr(test, assert_instr(vmovaps))] // FIXME vmovdqa expected | |
83c7162d | 1498 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1499 | pub unsafe fn _mm256_store_si256(mem_addr: *mut __m256i, a: __m256i) { |
1500 | *mem_addr = a; | |
1501 | } | |
1502 | ||
532ac7d7 | 1503 | /// Loads 256-bits of integer data from memory into result. |
0531ce1d | 1504 | /// `mem_addr` does not need to be aligned on any particular boundary. |
83c7162d | 1505 | /// |
353b0b11 | 1506 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_si256) |
0531ce1d XL |
1507 | #[inline] |
1508 | #[target_feature(enable = "avx")] | |
1509 | #[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected | |
83c7162d | 1510 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1511 | pub unsafe fn _mm256_loadu_si256(mem_addr: *const __m256i) -> __m256i { |
1512 | let mut dst = _mm256_undefined_si256(); | |
1513 | ptr::copy_nonoverlapping( | |
1514 | mem_addr as *const u8, | |
1515 | &mut dst as *mut __m256i as *mut u8, | |
1516 | mem::size_of::<__m256i>(), | |
1517 | ); | |
1518 | dst | |
1519 | } | |
1520 | ||
532ac7d7 | 1521 | /// Stores 256-bits of integer data from `a` into memory. |
0531ce1d | 1522 | /// `mem_addr` does not need to be aligned on any particular boundary. |
83c7162d | 1523 | /// |
353b0b11 | 1524 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_si256) |
0531ce1d XL |
1525 | #[inline] |
1526 | #[target_feature(enable = "avx")] | |
1527 | #[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected | |
83c7162d | 1528 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1529 | pub unsafe fn _mm256_storeu_si256(mem_addr: *mut __m256i, a: __m256i) { |
1530 | storeudq256(mem_addr as *mut i8, a.as_i8x32()); | |
1531 | } | |
1532 | ||
532ac7d7 | 1533 | /// Loads packed double-precision (64-bit) floating-point elements from memory |
0531ce1d XL |
1534 | /// into result using `mask` (elements are zeroed out when the high bit of the |
1535 | /// corresponding element is not set). | |
83c7162d | 1536 | /// |
353b0b11 | 1537 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_pd) |
0531ce1d XL |
1538 | #[inline] |
1539 | #[target_feature(enable = "avx")] | |
1540 | #[cfg_attr(test, assert_instr(vmaskmovpd))] | |
83c7162d | 1541 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0731742a | 1542 | pub unsafe fn _mm256_maskload_pd(mem_addr: *const f64, mask: __m256i) -> __m256d { |
0531ce1d XL |
1543 | maskloadpd256(mem_addr as *const i8, mask.as_i64x4()) |
1544 | } | |
1545 | ||
532ac7d7 | 1546 | /// Stores packed double-precision (64-bit) floating-point elements from `a` |
0531ce1d | 1547 | /// into memory using `mask`. |
83c7162d | 1548 | /// |
353b0b11 | 1549 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_pd) |
0531ce1d XL |
1550 | #[inline] |
1551 | #[target_feature(enable = "avx")] | |
1552 | #[cfg_attr(test, assert_instr(vmaskmovpd))] | |
83c7162d | 1553 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0731742a | 1554 | pub unsafe fn _mm256_maskstore_pd(mem_addr: *mut f64, mask: __m256i, a: __m256d) { |
0531ce1d XL |
1555 | maskstorepd256(mem_addr as *mut i8, mask.as_i64x4(), a); |
1556 | } | |
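// A minimal usage sketch (hypothetical test helper, not part of the original
// file): only elements whose mask value has the high (sign) bit set are loaded;
// the remaining lanes come back as zero.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn maskload_pd_example() {
    let mem = [1.0_f64, 2.0, 3.0, 4.0];
    let mask = _mm256_setr_epi64x(-1, 0, -1, 0);
    let r = _mm256_maskload_pd(mem.as_ptr(), mask);
    let r: [f64; 4] = transmute(r);
    assert_eq!(r, [1.0, 0.0, 3.0, 0.0]);
}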
1557 | ||
532ac7d7 | 1558 | /// Loads packed double-precision (64-bit) floating-point elements from memory |
0531ce1d XL |
1559 | /// into result using `mask` (elements are zeroed out when the high bit of the |
1560 | /// corresponding element is not set). | |
83c7162d | 1561 | /// |
353b0b11 | 1562 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_pd) |
0531ce1d XL |
1563 | #[inline] |
1564 | #[target_feature(enable = "avx")] | |
1565 | #[cfg_attr(test, assert_instr(vmaskmovpd))] | |
83c7162d | 1566 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1567 | pub unsafe fn _mm_maskload_pd(mem_addr: *const f64, mask: __m128i) -> __m128d { |
1568 | maskloadpd(mem_addr as *const i8, mask.as_i64x2()) | |
1569 | } | |
1570 | ||
532ac7d7 | 1571 | /// Stores packed double-precision (64-bit) floating-point elements from `a` |
0531ce1d | 1572 | /// into memory using `mask`. |
83c7162d | 1573 | /// |
353b0b11 | 1574 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_pd) |
0531ce1d XL |
1575 | #[inline] |
1576 | #[target_feature(enable = "avx")] | |
1577 | #[cfg_attr(test, assert_instr(vmaskmovpd))] | |
83c7162d | 1578 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1579 | pub unsafe fn _mm_maskstore_pd(mem_addr: *mut f64, mask: __m128i, a: __m128d) { |
1580 | maskstorepd(mem_addr as *mut i8, mask.as_i64x2(), a); | |
1581 | } | |
1582 | ||
532ac7d7 | 1583 | /// Loads packed single-precision (32-bit) floating-point elements from memory |
0531ce1d XL |
1584 | /// into result using `mask` (elements are zeroed out when the high bit of the |
1585 | /// corresponding element is not set). | |
83c7162d | 1586 | /// |
353b0b11 | 1587 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_ps) |
0531ce1d XL |
1588 | #[inline] |
1589 | #[target_feature(enable = "avx")] | |
1590 | #[cfg_attr(test, assert_instr(vmaskmovps))] | |
83c7162d | 1591 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0731742a | 1592 | pub unsafe fn _mm256_maskload_ps(mem_addr: *const f32, mask: __m256i) -> __m256 { |
0531ce1d XL |
1593 | maskloadps256(mem_addr as *const i8, mask.as_i32x8()) |
1594 | } | |
1595 | ||
532ac7d7 | 1596 | /// Stores packed single-precision (32-bit) floating-point elements from `a` |
0531ce1d | 1597 | /// into memory using `mask`. |
83c7162d | 1598 | /// |
353b0b11 | 1599 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_ps) |
0531ce1d XL |
1600 | #[inline] |
1601 | #[target_feature(enable = "avx")] | |
1602 | #[cfg_attr(test, assert_instr(vmaskmovps))] | |
83c7162d | 1603 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0731742a | 1604 | pub unsafe fn _mm256_maskstore_ps(mem_addr: *mut f32, mask: __m256i, a: __m256) { |
0531ce1d XL |
1605 | maskstoreps256(mem_addr as *mut i8, mask.as_i32x8(), a); |
1606 | } | |
1607 | ||
532ac7d7 | 1608 | /// Loads packed single-precision (32-bit) floating-point elements from memory |
0531ce1d XL |
1609 | /// into result using `mask` (elements are zeroed out when the high bit of the |
1610 | /// corresponding element is not set). | |
83c7162d | 1611 | /// |
353b0b11 | 1612 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_ps) |
0531ce1d XL |
1613 | #[inline] |
1614 | #[target_feature(enable = "avx")] | |
1615 | #[cfg_attr(test, assert_instr(vmaskmovps))] | |
83c7162d | 1616 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1617 | pub unsafe fn _mm_maskload_ps(mem_addr: *const f32, mask: __m128i) -> __m128 { |
1618 | maskloadps(mem_addr as *const i8, mask.as_i32x4()) | |
1619 | } | |
1620 | ||
532ac7d7 | 1621 | /// Stores packed single-precision (32-bit) floating-point elements from `a` |
0531ce1d | 1622 | /// into memory using `mask`. |
83c7162d | 1623 | /// |
353b0b11 | 1624 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_ps) |
0531ce1d XL |
1625 | #[inline] |
1626 | #[target_feature(enable = "avx")] | |
1627 | #[cfg_attr(test, assert_instr(vmaskmovps))] | |
83c7162d | 1628 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1629 | pub unsafe fn _mm_maskstore_ps(mem_addr: *mut f32, mask: __m128i, a: __m128) { |
1630 | maskstoreps(mem_addr as *mut i8, mask.as_i32x4(), a); | |
1631 | } | |
1632 | ||
1633 | /// Duplicates odd-indexed single-precision (32-bit) floating-point elements
532ac7d7 | 1634 | /// from `a`, and returns the results. |
83c7162d | 1635 | /// |
353b0b11 | 1636 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movehdup_ps) |
0531ce1d XL |
1637 | #[inline] |
1638 | #[target_feature(enable = "avx")] | |
1639 | #[cfg_attr(test, assert_instr(vmovshdup))] | |
83c7162d | 1640 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 1641 | pub unsafe fn _mm256_movehdup_ps(a: __m256) -> __m256 { |
353b0b11 | 1642 | simd_shuffle!(a, a, [1, 1, 3, 3, 5, 5, 7, 7]) |
0531ce1d XL |
1643 | } |
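// A minimal usage sketch (hypothetical test helper, not part of the original
// file): each odd-indexed element is copied into the even slot below it.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn movehdup_ps_example() {
    let a = _mm256_setr_ps(0., 1., 2., 3., 4., 5., 6., 7.);
    let r = _mm256_movehdup_ps(a);
    let r: [f32; 8] = transmute(r);
    assert_eq!(r, [1., 1., 3., 3., 5., 5., 7., 7.]);
}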
1644 | ||
1645 | /// Duplicates even-indexed single-precision (32-bit) floating-point elements
532ac7d7 | 1646 | /// from `a`, and returns the results. |
83c7162d | 1647 | /// |
353b0b11 | 1648 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_moveldup_ps) |
0531ce1d XL |
1649 | #[inline] |
1650 | #[target_feature(enable = "avx")] | |
1651 | #[cfg_attr(test, assert_instr(vmovsldup))] | |
83c7162d | 1652 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 1653 | pub unsafe fn _mm256_moveldup_ps(a: __m256) -> __m256 { |
353b0b11 | 1654 | simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6]) |
0531ce1d XL |
1655 | } |
1656 | ||
1657 | /// Duplicates even-indexed double-precision (64-bit) floating-point elements
e1599b0c | 1658 | /// from `a`, and returns the results. |
83c7162d | 1659 | /// |
353b0b11 | 1660 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movedup_pd) |
0531ce1d XL |
1661 | #[inline] |
1662 | #[target_feature(enable = "avx")] | |
1663 | #[cfg_attr(test, assert_instr(vmovddup))] | |
83c7162d | 1664 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 1665 | pub unsafe fn _mm256_movedup_pd(a: __m256d) -> __m256d { |
353b0b11 | 1666 | simd_shuffle!(a, a, [0, 0, 2, 2]) |
0531ce1d XL |
1667 | } |
1668 | ||
532ac7d7 | 1669 | /// Loads 256-bits of integer data from unaligned memory into result. |
0531ce1d XL |
1670 | /// This intrinsic may perform better than `_mm256_loadu_si256` when the |
1671 | /// data crosses a cache line boundary. | |
83c7162d | 1672 | /// |
353b0b11 | 1673 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_lddqu_si256) |
0531ce1d XL |
1674 | #[inline] |
1675 | #[target_feature(enable = "avx")] | |
1676 | #[cfg_attr(test, assert_instr(vlddqu))] | |
83c7162d | 1677 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 1678 | pub unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i { |
532ac7d7 | 1679 | transmute(vlddqu(mem_addr as *const i8)) |
0531ce1d XL |
1680 | } |
1681 | ||
1682 | /// Moves integer data from a 256-bit integer vector to a 32-byte | |
1683 | /// aligned memory location. To minimize caching, the data is flagged as | |
1684 | /// non-temporal (unlikely to be used again soon).
83c7162d | 1685 | /// |
353b0b11 | 1686 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_si256) |
0531ce1d XL |
1687 | #[inline] |
1688 | #[target_feature(enable = "avx")] | |
1689 | #[cfg_attr(test, assert_instr(vmovntps))] // FIXME vmovntdq | |
83c7162d | 1690 | #[stable(feature = "simd_x86", since = "1.27.0")] |
a1dfa0c6 XL |
1691 | pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) { |
1692 | intrinsics::nontemporal_store(mem_addr, a); | |
0531ce1d XL |
1693 | } |
1694 | ||
83c7162d | 1695 | /// Moves double-precision values from a 256-bit vector of `[4 x double]` |
0531ce1d XL |
1696 | /// to a 32-byte aligned memory location. To minimize caching, the data is |
1697 | /// flagged as non-temporal (unlikely to be used again soon). | |
83c7162d | 1698 | /// |
353b0b11 | 1699 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_pd) |
0531ce1d XL |
1700 | #[inline] |
1701 | #[target_feature(enable = "avx")] | |
1702 | #[cfg_attr(test, assert_instr(vmovntps))] // FIXME vmovntpd | |
83c7162d | 1703 | #[stable(feature = "simd_x86", since = "1.27.0")] |
48663c56 | 1704 | #[allow(clippy::cast_ptr_alignment)] |
a1dfa0c6 XL |
1705 | pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) { |
1706 | intrinsics::nontemporal_store(mem_addr as *mut __m256d, a); | |
0531ce1d XL |
1707 | } |
1708 | ||
1709 | /// Moves single-precision floating-point values from a 256-bit vector
83c7162d | 1710 | /// of `[8 x float]` to a 32-byte aligned memory location. To minimize |
0531ce1d XL |
1711 | /// caching, the data is flagged as non-temporal (unlikely to be used again |
1712 | /// soon). | |
83c7162d | 1713 | /// |
353b0b11 | 1714 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_ps) |
0531ce1d XL |
1715 | #[inline] |
1716 | #[target_feature(enable = "avx")] | |
1717 | #[cfg_attr(test, assert_instr(vmovntps))] | |
83c7162d | 1718 | #[stable(feature = "simd_x86", since = "1.27.0")] |
48663c56 | 1719 | #[allow(clippy::cast_ptr_alignment)] |
a1dfa0c6 XL |
1720 | pub unsafe fn _mm256_stream_ps(mem_addr: *mut f32, a: __m256) { |
1721 | intrinsics::nontemporal_store(mem_addr as *mut __m256, a); | |
0531ce1d XL |
1722 | } |
1723 | ||
532ac7d7 XL |
1724 | /// Computes the approximate reciprocal of packed single-precision (32-bit) |
1725 | /// floating-point elements in `a`, and returns the results. The maximum | |
0531ce1d | 1726 | /// relative error for this approximation is less than 1.5*2^-12. |
83c7162d | 1727 | /// |
353b0b11 | 1728 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rcp_ps) |
0531ce1d XL |
1729 | #[inline] |
1730 | #[target_feature(enable = "avx")] | |
1731 | #[cfg_attr(test, assert_instr(vrcpps))] | |
83c7162d | 1732 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1733 | pub unsafe fn _mm256_rcp_ps(a: __m256) -> __m256 { |
1734 | vrcpps(a) | |
1735 | } | |
1736 | ||
532ac7d7 XL |
1737 | /// Computes the approximate reciprocal square root of packed single-precision |
1738 | /// (32-bit) floating-point elements in `a`, and returns the results. | |
0531ce1d | 1739 | /// The maximum relative error for this approximation is less than 1.5*2^-12. |
83c7162d | 1740 | /// |
353b0b11 | 1741 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rsqrt_ps) |
0531ce1d XL |
1742 | #[inline] |
1743 | #[target_feature(enable = "avx")] | |
1744 | #[cfg_attr(test, assert_instr(vrsqrtps))] | |
83c7162d | 1745 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1746 | pub unsafe fn _mm256_rsqrt_ps(a: __m256) -> __m256 { |
1747 | vrsqrtps(a) | |
1748 | } | |
1749 | ||
532ac7d7 | 1750 | /// Unpacks and interleaves double-precision (64-bit) floating-point elements
0531ce1d | 1751 | /// from the high half of each 128-bit lane in `a` and `b`. |
83c7162d | 1752 | /// |
353b0b11 | 1753 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_pd) |
0531ce1d XL |
1754 | #[inline] |
1755 | #[target_feature(enable = "avx")] | |
1756 | #[cfg_attr(test, assert_instr(vunpckhpd))] | |
83c7162d | 1757 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 1758 | pub unsafe fn _mm256_unpackhi_pd(a: __m256d, b: __m256d) -> __m256d { |
353b0b11 | 1759 | simd_shuffle!(a, b, [1, 5, 3, 7]) |
0531ce1d XL |
1760 | } |
1761 | ||
532ac7d7 | 1762 | /// Unpacks and interleaves single-precision (32-bit) floating-point elements
0531ce1d | 1763 | /// from the high half of each 128-bit lane in `a` and `b`. |
83c7162d | 1764 | /// |
353b0b11 | 1765 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_ps) |
0531ce1d XL |
1766 | #[inline] |
1767 | #[target_feature(enable = "avx")] | |
1768 | #[cfg_attr(test, assert_instr(vunpckhps))] | |
83c7162d | 1769 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 1770 | pub unsafe fn _mm256_unpackhi_ps(a: __m256, b: __m256) -> __m256 { |
353b0b11 | 1771 | simd_shuffle!(a, b, [2, 10, 3, 11, 6, 14, 7, 15]) |
0531ce1d XL |
1772 | } |
1773 | ||
532ac7d7 | 1774 | /// Unpacks and interleaves double-precision (64-bit) floating-point elements
0531ce1d | 1775 | /// from the low half of each 128-bit lane in `a` and `b`. |
83c7162d | 1776 | /// |
353b0b11 | 1777 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_pd) |
0531ce1d XL |
1778 | #[inline] |
1779 | #[target_feature(enable = "avx")] | |
1780 | #[cfg_attr(test, assert_instr(vunpcklpd))] | |
83c7162d | 1781 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 1782 | pub unsafe fn _mm256_unpacklo_pd(a: __m256d, b: __m256d) -> __m256d { |
353b0b11 | 1783 | simd_shuffle!(a, b, [0, 4, 2, 6]) |
0531ce1d XL |
1784 | } |
1785 | ||
532ac7d7 | 1786 | /// Unpacks and interleaves single-precision (32-bit) floating-point elements
0531ce1d | 1787 | /// from the low half of each 128-bit lane in `a` and `b`. |
83c7162d | 1788 | /// |
353b0b11 | 1789 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_ps) |
0531ce1d XL |
1790 | #[inline] |
1791 | #[target_feature(enable = "avx")] | |
1792 | #[cfg_attr(test, assert_instr(vunpcklps))] | |
83c7162d | 1793 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 1794 | pub unsafe fn _mm256_unpacklo_ps(a: __m256, b: __m256) -> __m256 { |
353b0b11 | 1795 | simd_shuffle!(a, b, [0, 8, 1, 9, 4, 12, 5, 13]) |
0531ce1d XL |
1796 | } |
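// A minimal usage sketch (hypothetical test helper, not part of the original
// file): the interleave happens independently in each 128-bit lane, which is
// why elements 2-3 of `a` and 10-11 of `b` do not appear in the result.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn unpacklo_ps_example() {
    let a = _mm256_setr_ps(0., 1., 2., 3., 4., 5., 6., 7.);
    let b = _mm256_setr_ps(8., 9., 10., 11., 12., 13., 14., 15.);
    let r = _mm256_unpacklo_ps(a, b);
    let r: [f32; 8] = transmute(r);
    assert_eq!(r, [0., 8., 1., 9., 4., 12., 5., 13.]);
}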
1797 | ||
532ac7d7 | 1798 | /// Computes the bitwise AND of 256 bits (representing integer data) in `a` and |
0531ce1d | 1799 | /// `b`, and sets `ZF` to 1 if the result is zero, otherwise sets `ZF` to 0.
532ac7d7 | 1800 | /// Computes the bitwise NOT of `a` and then AND with `b`, and sets `CF` to 1 if
0531ce1d | 1801 | /// the result is zero, otherwise sets `CF` to 0. Returns the `ZF` value.
83c7162d | 1802 | /// |
353b0b11 | 1803 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_si256) |
0531ce1d XL |
1804 | #[inline] |
1805 | #[target_feature(enable = "avx")] | |
1806 | #[cfg_attr(test, assert_instr(vptest))] | |
83c7162d | 1807 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1808 | pub unsafe fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 { |
1809 | ptestz256(a.as_i64x4(), b.as_i64x4()) | |
1810 | } | |
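// A minimal usage sketch (hypothetical test helper, not part of the original
// file): the return value is 1 exactly when `a & b` has no bits set.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn testz_si256_example() {
    let a = _mm256_setr_epi64x(1, 2, 4, 8);
    let b = _mm256_setr_epi64x(16, 32, 64, 128);
    assert_eq!(_mm256_testz_si256(a, b), 1); // disjoint bit patterns
    assert_eq!(_mm256_testz_si256(a, a), 0); // a & a is non-zero
}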
1811 | ||
532ac7d7 | 1812 | /// Computes the bitwise AND of 256 bits (representing integer data) in `a` and |
0531ce1d | 1813 | /// `b`, and sets `ZF` to 1 if the result is zero, otherwise sets `ZF` to 0.
532ac7d7 | 1814 | /// Computes the bitwise NOT of `a` and then AND with `b`, and sets `CF` to 1 if
0531ce1d | 1815 | /// the result is zero, otherwise sets `CF` to 0. Returns the `CF` value.
83c7162d | 1816 | /// |
353b0b11 | 1817 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_si256) |
0531ce1d XL |
1818 | #[inline] |
1819 | #[target_feature(enable = "avx")] | |
1820 | #[cfg_attr(test, assert_instr(vptest))] | |
83c7162d | 1821 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1822 | pub unsafe fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 { |
1823 | ptestc256(a.as_i64x4(), b.as_i64x4()) | |
1824 | } | |
1825 | ||
532ac7d7 | 1826 | /// Computes the bitwise AND of 256 bits (representing integer data) in `a` and |
0531ce1d | 1827 | /// `b`, and sets `ZF` to 1 if the result is zero, otherwise sets `ZF` to 0.
532ac7d7 | 1828 | /// Computes the bitwise NOT of `a` and then AND with `b`, and sets `CF` to 1 if
0531ce1d XL |
1829 | /// the result is zero, otherwise sets `CF` to 0. Returns 1 if both the `ZF` and
1830 | /// `CF` values are zero, otherwise returns 0.
83c7162d | 1831 | /// |
353b0b11 | 1832 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_si256) |
0531ce1d XL |
1833 | #[inline] |
1834 | #[target_feature(enable = "avx")] | |
1835 | #[cfg_attr(test, assert_instr(vptest))] | |
83c7162d | 1836 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1837 | pub unsafe fn _mm256_testnzc_si256(a: __m256i, b: __m256i) -> i32 { |
1838 | ptestnzc256(a.as_i64x4(), b.as_i64x4()) | |
1839 | } | |
1840 | ||
532ac7d7 | 1841 | /// Computes the bitwise AND of 256 bits (representing double-precision (64-bit) |
0531ce1d XL |
1842 | /// floating-point elements) in `a` and `b`, producing an intermediate 256-bit |
1843 | /// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
1844 | /// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
1845 | /// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
1846 | /// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
1847 | /// is zero, otherwise sets `CF` to 0. Returns the `ZF` value.
83c7162d | 1848 | /// |
353b0b11 | 1849 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_pd) |
0531ce1d XL |
1850 | #[inline] |
1851 | #[target_feature(enable = "avx")] | |
1852 | #[cfg_attr(test, assert_instr(vtestpd))] | |
83c7162d | 1853 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1854 | pub unsafe fn _mm256_testz_pd(a: __m256d, b: __m256d) -> i32 { |
1855 | vtestzpd256(a, b) | |
1856 | } | |
1857 | ||
532ac7d7 | 1858 | /// Computes the bitwise AND of 256 bits (representing double-precision (64-bit) |
0531ce1d XL |
1859 | /// floating-point elements) in `a` and `b`, producing an intermediate 256-bit |
1860 | /// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
1861 | /// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
1862 | /// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
1863 | /// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
1864 | /// is zero, otherwise sets `CF` to 0. Returns the `CF` value.
83c7162d | 1865 | /// |
353b0b11 | 1866 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_pd) |
0531ce1d XL |
1867 | #[inline] |
1868 | #[target_feature(enable = "avx")] | |
1869 | #[cfg_attr(test, assert_instr(vtestpd))] | |
83c7162d | 1870 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1871 | pub unsafe fn _mm256_testc_pd(a: __m256d, b: __m256d) -> i32 { |
1872 | vtestcpd256(a, b) | |
1873 | } | |
1874 | ||
532ac7d7 | 1875 | /// Computes the bitwise AND of 256 bits (representing double-precision (64-bit) |
0531ce1d XL |
1876 | /// floating-point elements) in `a` and `b`, producing an intermediate 256-bit |
1877 | /// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
1878 | /// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
1879 | /// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
1880 | /// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
1881 | /// is zero, otherwise sets `CF` to 0. Returns 1 if both the `ZF` and `CF` values
1882 | /// are zero, otherwise returns 0.
83c7162d | 1883 | /// |
353b0b11 | 1884 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_pd) |
0531ce1d XL |
1885 | #[inline] |
1886 | #[target_feature(enable = "avx")] | |
1887 | #[cfg_attr(test, assert_instr(vtestpd))] | |
83c7162d | 1888 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1889 | pub unsafe fn _mm256_testnzc_pd(a: __m256d, b: __m256d) -> i32 { |
1890 | vtestnzcpd256(a, b) | |
1891 | } | |
1892 | ||
532ac7d7 | 1893 | /// Computes the bitwise AND of 128 bits (representing double-precision (64-bit) |
0531ce1d XL |
1894 | /// floating-point elements) in `a` and `b`, producing an intermediate 128-bit |
1895 | /// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
1896 | /// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
1897 | /// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
1898 | /// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
1899 | /// is zero, otherwise sets `CF` to 0. Returns the `ZF` value.
83c7162d | 1900 | /// |
353b0b11 | 1901 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_pd) |
0531ce1d XL |
1902 | #[inline] |
1903 | #[target_feature(enable = "avx")] | |
1904 | #[cfg_attr(test, assert_instr(vtestpd))] | |
83c7162d | 1905 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1906 | pub unsafe fn _mm_testz_pd(a: __m128d, b: __m128d) -> i32 { |
1907 | vtestzpd(a, b) | |
1908 | } | |
1909 | ||
532ac7d7 | 1910 | /// Computes the bitwise AND of 128 bits (representing double-precision (64-bit) |
0531ce1d XL |
1911 | /// floating-point elements) in `a` and `b`, producing an intermediate 128-bit |
1912 | /// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
1913 | /// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
1914 | /// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
1915 | /// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
1916 | /// is zero, otherwise sets `CF` to 0. Returns the `CF` value.
83c7162d | 1917 | /// |
353b0b11 | 1918 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_pd) |
0531ce1d XL |
1919 | #[inline] |
1920 | #[target_feature(enable = "avx")] | |
1921 | #[cfg_attr(test, assert_instr(vtestpd))] | |
83c7162d | 1922 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1923 | pub unsafe fn _mm_testc_pd(a: __m128d, b: __m128d) -> i32 { |
1924 | vtestcpd(a, b) | |
1925 | } | |
1926 | ||
532ac7d7 | 1927 | /// Computes the bitwise AND of 128 bits (representing double-precision (64-bit) |
0531ce1d XL |
1928 | /// floating-point elements) in `a` and `b`, producing an intermediate 128-bit |
1929 | /// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
1930 | /// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
1931 | /// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
1932 | /// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
1933 | /// is zero, otherwise sets `CF` to 0. Returns 1 if both the `ZF` and `CF` values
1934 | /// are zero, otherwise returns 0.
83c7162d | 1935 | /// |
353b0b11 | 1936 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_pd) |
0531ce1d XL |
1937 | #[inline] |
1938 | #[target_feature(enable = "avx")] | |
1939 | #[cfg_attr(test, assert_instr(vtestpd))] | |
83c7162d | 1940 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1941 | pub unsafe fn _mm_testnzc_pd(a: __m128d, b: __m128d) -> i32 { |
1942 | vtestnzcpd(a, b) | |
1943 | } | |
1944 | ||
532ac7d7 | 1945 | /// Computes the bitwise AND of 256 bits (representing single-precision (32-bit) |
0531ce1d XL |
1946 | /// floating-point elements) in `a` and `b`, producing an intermediate 256-bit |
1947 | /// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the
1948 | /// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
1949 | /// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
1950 | /// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
1951 | /// is zero, otherwise sets `CF` to 0. Returns the `ZF` value.
83c7162d | 1952 | /// |
353b0b11 | 1953 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_ps) |
0531ce1d XL |
1954 | #[inline] |
1955 | #[target_feature(enable = "avx")] | |
1956 | #[cfg_attr(test, assert_instr(vtestps))] | |
83c7162d | 1957 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1958 | pub unsafe fn _mm256_testz_ps(a: __m256, b: __m256) -> i32 { |
1959 | vtestzps256(a, b) | |
1960 | } | |
1961 | ||
532ac7d7 | 1962 | /// Computes the bitwise AND of 256 bits (representing single-precision (32-bit) |
0531ce1d XL |
1963 | /// floating-point elements) in `a` and `b`, producing an intermediate 256-bit |
1964 | /// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the
1965 | /// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
1966 | /// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
1967 | /// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
1968 | /// is zero, otherwise sets `CF` to 0. Returns the `CF` value.
83c7162d | 1969 | /// |
353b0b11 | 1970 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_ps) |
0531ce1d XL |
1971 | #[inline] |
1972 | #[target_feature(enable = "avx")] | |
1973 | #[cfg_attr(test, assert_instr(vtestps))] | |
83c7162d | 1974 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1975 | pub unsafe fn _mm256_testc_ps(a: __m256, b: __m256) -> i32 { |
1976 | vtestcps256(a, b) | |
1977 | } | |
1978 | ||
532ac7d7 | 1979 | /// Computes the bitwise AND of 256 bits (representing single-precision (32-bit) |
0531ce1d XL |
1980 | /// floating-point elements) in `a` and `b`, producing an intermediate 256-bit |
1981 | /// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the
1982 | /// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
1983 | /// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
1984 | /// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
1985 | /// is zero, otherwise sets `CF` to 0. Returns 1 if both the `ZF` and `CF` values
1986 | /// are zero, otherwise returns 0.
83c7162d | 1987 | /// |
353b0b11 | 1988 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_ps) |
0531ce1d XL |
1989 | #[inline] |
1990 | #[target_feature(enable = "avx")] | |
1991 | #[cfg_attr(test, assert_instr(vtestps))] | |
83c7162d | 1992 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
1993 | pub unsafe fn _mm256_testnzc_ps(a: __m256, b: __m256) -> i32 { |
1994 | vtestnzcps256(a, b) | |
1995 | } | |
1996 | ||
532ac7d7 | 1997 | /// Computes the bitwise AND of 128 bits (representing single-precision (32-bit) |
0531ce1d XL |
1998 | /// floating-point elements) in `a` and `b`, producing an intermediate 128-bit |
1999 | /// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the
2000 | /// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2001 | /// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
2002 | /// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2003 | /// is zero, otherwise sets `CF` to 0. Returns the `ZF` value.
83c7162d | 2004 | /// |
353b0b11 | 2005 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_ps) |
0531ce1d XL |
2006 | #[inline] |
2007 | #[target_feature(enable = "avx")] | |
2008 | #[cfg_attr(test, assert_instr(vtestps))] | |
83c7162d | 2009 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
2010 | pub unsafe fn _mm_testz_ps(a: __m128, b: __m128) -> i32 { |
2011 | vtestzps(a, b) | |
2012 | } | |
2013 | ||
532ac7d7 | 2014 | /// Computes the bitwise AND of 128 bits (representing single-precision (32-bit) |
0531ce1d XL |
2015 | /// floating-point elements) in `a` and `b`, producing an intermediate 128-bit |
2016 | /// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the | |
2017 | /// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise | |
2018 | /// NOT of `a` and then ANDs it with `b`, producing an intermediate value, and sets | |
2019 | /// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value | |
2020 | /// is zero, otherwise sets `CF` to 0. Returns the `CF` value. | |
83c7162d | 2021 | /// |
353b0b11 | 2022 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_ps) |
0531ce1d XL |
2023 | #[inline] |
2024 | #[target_feature(enable = "avx")] | |
2025 | #[cfg_attr(test, assert_instr(vtestps))] | |
83c7162d | 2026 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
2027 | pub unsafe fn _mm_testc_ps(a: __m128, b: __m128) -> i32 { |
2028 | vtestcps(a, b) | |
2029 | } | |
2030 | ||
532ac7d7 | 2031 | /// Computes the bitwise AND of 128 bits (representing single-precision (32-bit) |
0531ce1d XL |
2032 | /// floating-point elements) in `a` and `b`, producing an intermediate 128-bit |
2033 | /// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the | |
2034 | /// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise | |
2035 | /// NOT of `a` and then ANDs it with `b`, producing an intermediate value, and sets | |
2036 | /// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value | |
2037 | /// is zero, otherwise sets `CF` to 0. Returns 1 if both the `ZF` and `CF` values | |
2038 | /// are zero, otherwise returns 0. | |
83c7162d | 2039 | /// |
353b0b11 | 2040 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_ps) |
0531ce1d XL |
2041 | #[inline] |
2042 | #[target_feature(enable = "avx")] | |
2043 | #[cfg_attr(test, assert_instr(vtestps))] | |
83c7162d | 2044 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
2045 | pub unsafe fn _mm_testnzc_ps(a: __m128, b: __m128) -> i32 { |
2046 | vtestnzcps(a, b) | |
2047 | } | |
2048 | ||
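// A minimal usage sketch (assuming AVX is available at runtime, e.g. checked
// with `is_x86_feature_detected!("avx")`; the `example_*` helper name below is
// illustrative only): `_mm256_testz_ps(m, m)` is a cheap "no sign bit set in
// any lane" check for comparison-style masks.
#[target_feature(enable = "avx")]
unsafe fn example_testz_ps() {
    let none_set = _mm256_setzero_ps();
    // No sign bit is set in `none_set & none_set`, so ZF = 1 and 1 is returned.
    assert_eq!(_mm256_testz_ps(none_set, none_set), 1);
    let all_set = _mm256_set1_ps(-0.0); // only the sign bit is set in each lane
    // Every lane of `all_set & all_set` has its sign bit set, so 0 is returned.
    assert_eq!(_mm256_testz_ps(all_set, all_set), 0);
}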
532ac7d7 | 2049 | /// Sets each bit of the returned mask based on the most significant bit of the |
0531ce1d XL |
2050 | /// corresponding packed double-precision (64-bit) floating-point element in |
2051 | /// `a`. | |
83c7162d | 2052 | /// |
353b0b11 | 2053 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_pd) |
0531ce1d XL |
2054 | #[inline] |
2055 | #[target_feature(enable = "avx")] | |
2056 | #[cfg_attr(test, assert_instr(vmovmskpd))] | |
83c7162d | 2057 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
2058 | pub unsafe fn _mm256_movemask_pd(a: __m256d) -> i32 { |
2059 | movmskpd256(a) | |
2060 | } | |
2061 | ||
532ac7d7 | 2062 | /// Sets each bit of the returned mask based on the most significant bit of the |
0531ce1d XL |
2063 | /// corresponding packed single-precision (32-bit) floating-point element in |
2064 | /// `a`. | |
83c7162d | 2065 | /// |
353b0b11 | 2066 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_ps) |
0531ce1d XL |
2067 | #[inline] |
2068 | #[target_feature(enable = "avx")] | |
2069 | #[cfg_attr(test, assert_instr(vmovmskps))] | |
83c7162d | 2070 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
2071 | pub unsafe fn _mm256_movemask_ps(a: __m256) -> i32 { |
2072 | movmskps256(a) | |
2073 | } | |
2074 | ||
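// A short sketch of `_mm256_movemask_pd` (assuming AVX; the helper name is
// illustrative only): each result bit mirrors the sign bit of one lane.
#[target_feature(enable = "avx")]
unsafe fn example_movemask_pd() {
    let v = _mm256_setr_pd(-1.0, 2.0, -3.0, 4.0);
    // Lanes 0 and 2 are negative, so bits 0 and 2 of the mask are set.
    assert_eq!(_mm256_movemask_pd(v), 0b0101);
}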
532ac7d7 | 2075 | /// Returns vector of type __m256d with all elements set to zero. |
83c7162d | 2076 | /// |
353b0b11 | 2077 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_pd) |
0531ce1d XL |
2078 | #[inline] |
2079 | #[target_feature(enable = "avx")] | |
2080 | #[cfg_attr(test, assert_instr(vxorps))] // FIXME vxorpd expected | |
83c7162d | 2081 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
2082 | pub unsafe fn _mm256_setzero_pd() -> __m256d { |
2083 | _mm256_set1_pd(0.0) | |
2084 | } | |
2085 | ||
532ac7d7 | 2086 | /// Returns vector of type __m256 with all elements set to zero. |
83c7162d | 2087 | /// |
353b0b11 | 2088 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_ps) |
0531ce1d XL |
2089 | #[inline] |
2090 | #[target_feature(enable = "avx")] | |
2091 | #[cfg_attr(test, assert_instr(vxorps))] | |
83c7162d | 2092 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
2093 | pub unsafe fn _mm256_setzero_ps() -> __m256 { |
2094 | _mm256_set1_ps(0.0) | |
2095 | } | |
2096 | ||
532ac7d7 | 2097 | /// Returns vector of type __m256i with all elements set to zero. |
83c7162d | 2098 | /// |
353b0b11 | 2099 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_si256) |
0531ce1d XL |
2100 | #[inline] |
2101 | #[target_feature(enable = "avx")] | |
2102 | #[cfg_attr(test, assert_instr(vxor))] | |
83c7162d | 2103 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
2104 | pub unsafe fn _mm256_setzero_si256() -> __m256i { |
2105 | _mm256_set1_epi8(0) | |
2106 | } | |
2107 | ||
532ac7d7 | 2108 | /// Sets packed double-precision (64-bit) floating-point elements in returned |
0531ce1d | 2109 | /// vector with the supplied values. |
83c7162d | 2110 | /// |
353b0b11 | 2111 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_pd) |
0531ce1d XL |
2112 | #[inline] |
2113 | #[target_feature(enable = "avx")] | |
2114 | // This intrinsic has no corresponding instruction. | |
2115 | #[cfg_attr(test, assert_instr(vinsertf128))] | |
83c7162d | 2116 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
2117 | pub unsafe fn _mm256_set_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d { |
2118 | _mm256_setr_pd(d, c, b, a) | |
2119 | } | |
2120 | ||
532ac7d7 | 2121 | /// Sets packed single-precision (32-bit) floating-point elements in returned |
0531ce1d | 2122 | /// vector with the supplied values. |
83c7162d | 2123 | /// |
353b0b11 | 2124 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_ps) |
0531ce1d XL |
2125 | #[inline] |
2126 | #[target_feature(enable = "avx")] | |
2127 | // This intrinsic has no corresponding instruction. | |
83c7162d | 2128 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 2129 | pub unsafe fn _mm256_set_ps( |
0731742a XL |
2130 | a: f32, |
2131 | b: f32, | |
2132 | c: f32, | |
2133 | d: f32, | |
2134 | e: f32, | |
2135 | f: f32, | |
2136 | g: f32, | |
2137 | h: f32, | |
0531ce1d XL |
2138 | ) -> __m256 { |
2139 | _mm256_setr_ps(h, g, f, e, d, c, b, a) | |
2140 | } | |
2141 | ||
3c0e092e | 2142 | /// Sets packed 8-bit integers in returned vector with the supplied values. |
83c7162d | 2143 | /// |
353b0b11 | 2144 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi8) |
0531ce1d XL |
2145 | #[inline] |
2146 | #[target_feature(enable = "avx")] | |
2147 | // This intrinsic has no corresponding instruction. | |
83c7162d | 2148 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 2149 | pub unsafe fn _mm256_set_epi8( |
0731742a XL |
2150 | e00: i8, |
2151 | e01: i8, | |
2152 | e02: i8, | |
2153 | e03: i8, | |
2154 | e04: i8, | |
2155 | e05: i8, | |
2156 | e06: i8, | |
2157 | e07: i8, | |
2158 | e08: i8, | |
2159 | e09: i8, | |
2160 | e10: i8, | |
2161 | e11: i8, | |
2162 | e12: i8, | |
2163 | e13: i8, | |
2164 | e14: i8, | |
2165 | e15: i8, | |
2166 | e16: i8, | |
2167 | e17: i8, | |
2168 | e18: i8, | |
2169 | e19: i8, | |
2170 | e20: i8, | |
2171 | e21: i8, | |
2172 | e22: i8, | |
2173 | e23: i8, | |
2174 | e24: i8, | |
2175 | e25: i8, | |
2176 | e26: i8, | |
2177 | e27: i8, | |
2178 | e28: i8, | |
2179 | e29: i8, | |
2180 | e30: i8, | |
2181 | e31: i8, | |
0531ce1d | 2182 | ) -> __m256i { |
0731742a | 2183 | #[rustfmt::skip] |
0531ce1d XL |
2184 | _mm256_setr_epi8( |
2185 | e31, e30, e29, e28, e27, e26, e25, e24, | |
2186 | e23, e22, e21, e20, e19, e18, e17, e16, | |
2187 | e15, e14, e13, e12, e11, e10, e09, e08, | |
2188 | e07, e06, e05, e04, e03, e02, e01, e00, | |
2189 | ) | |
2190 | } | |
2191 | ||
532ac7d7 | 2192 | /// Sets packed 16-bit integers in returned vector with the supplied values. |
83c7162d | 2193 | /// |
353b0b11 | 2194 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi16) |
0531ce1d XL |
2195 | #[inline] |
2196 | #[target_feature(enable = "avx")] | |
2197 | // This intrinsic has no corresponding instruction. | |
83c7162d | 2198 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 2199 | pub unsafe fn _mm256_set_epi16( |
0731742a XL |
2200 | e00: i16, |
2201 | e01: i16, | |
2202 | e02: i16, | |
2203 | e03: i16, | |
2204 | e04: i16, | |
2205 | e05: i16, | |
2206 | e06: i16, | |
2207 | e07: i16, | |
2208 | e08: i16, | |
2209 | e09: i16, | |
2210 | e10: i16, | |
2211 | e11: i16, | |
2212 | e12: i16, | |
2213 | e13: i16, | |
2214 | e14: i16, | |
2215 | e15: i16, | |
0531ce1d | 2216 | ) -> __m256i { |
0731742a | 2217 | #[rustfmt::skip] |
0531ce1d XL |
2218 | _mm256_setr_epi16( |
2219 | e15, e14, e13, e12, | |
2220 | e11, e10, e09, e08, | |
2221 | e07, e06, e05, e04, | |
2222 | e03, e02, e01, e00, | |
2223 | ) | |
2224 | } | |
2225 | ||
532ac7d7 | 2226 | /// Sets packed 32-bit integers in returned vector with the supplied values. |
83c7162d | 2227 | /// |
353b0b11 | 2228 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi32) |
0531ce1d XL |
2229 | #[inline] |
2230 | #[target_feature(enable = "avx")] | |
2231 | // This intrinsic has no corresponding instruction. | |
83c7162d | 2232 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 2233 | pub unsafe fn _mm256_set_epi32( |
0731742a XL |
2234 | e0: i32, |
2235 | e1: i32, | |
2236 | e2: i32, | |
2237 | e3: i32, | |
2238 | e4: i32, | |
2239 | e5: i32, | |
2240 | e6: i32, | |
2241 | e7: i32, | |
0531ce1d XL |
2242 | ) -> __m256i { |
2243 | _mm256_setr_epi32(e7, e6, e5, e4, e3, e2, e1, e0) | |
2244 | } | |
2245 | ||
532ac7d7 | 2246 | /// Sets packed 64-bit integers in returned vector with the supplied values. |
83c7162d | 2247 | /// |
353b0b11 | 2248 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi64x) |
0531ce1d XL |
2249 | #[inline] |
2250 | #[target_feature(enable = "avx")] | |
2251 | // This intrinsic has no corresponding instruction. | |
83c7162d | 2252 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
2253 | pub unsafe fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i { |
2254 | _mm256_setr_epi64x(d, c, b, a) | |
2255 | } | |
2256 | ||
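// A small sketch of the argument order (assuming AVX; the helper name is
// illustrative only): `_mm256_set_epi32` lists its arguments from the highest
// element down to element 0, while the `setr` variant lists element 0 first.
#[target_feature(enable = "avx")]
unsafe fn example_set_vs_setr_epi32() {
    let a = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
    let b = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    let a: [i32; 8] = transmute(a);
    let b: [i32; 8] = transmute(b);
    // Both vectors hold 0, 1, ..., 7 in elements 0 through 7.
    assert_eq!(a, b);
}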
532ac7d7 | 2257 | /// Sets packed double-precision (64-bit) floating-point elements in returned |
0531ce1d | 2258 | /// vector with the supplied values in reverse order. |
83c7162d | 2259 | /// |
353b0b11 | 2260 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_pd) |
0531ce1d XL |
2261 | #[inline] |
2262 | #[target_feature(enable = "avx")] | |
2263 | // This intrinsic has no corresponding instruction. | |
83c7162d | 2264 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
2265 | pub unsafe fn _mm256_setr_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d { |
2266 | __m256d(a, b, c, d) | |
2267 | } | |
2268 | ||
532ac7d7 | 2269 | /// Sets packed single-precision (32-bit) floating-point elements in returned |
0531ce1d | 2270 | /// vector with the supplied values in reverse order. |
83c7162d | 2271 | /// |
353b0b11 | 2272 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_ps) |
0531ce1d XL |
2273 | #[inline] |
2274 | #[target_feature(enable = "avx")] | |
2275 | // This intrinsic has no corresponding instruction. | |
83c7162d | 2276 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 2277 | pub unsafe fn _mm256_setr_ps( |
0731742a XL |
2278 | a: f32, |
2279 | b: f32, | |
2280 | c: f32, | |
2281 | d: f32, | |
2282 | e: f32, | |
2283 | f: f32, | |
2284 | g: f32, | |
2285 | h: f32, | |
0531ce1d XL |
2286 | ) -> __m256 { |
2287 | __m256(a, b, c, d, e, f, g, h) | |
2288 | } | |
2289 | ||
532ac7d7 | 2290 | /// Sets packed 8-bit integers in returned vector with the supplied values in |
0531ce1d | 2291 | /// reverse order. |
83c7162d | 2292 | /// |
353b0b11 | 2293 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi8) |
0531ce1d XL |
2294 | #[inline] |
2295 | #[target_feature(enable = "avx")] | |
2296 | // This intrinsic has no corresponding instruction. | |
83c7162d | 2297 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 2298 | pub unsafe fn _mm256_setr_epi8( |
0731742a XL |
2299 | e00: i8, |
2300 | e01: i8, | |
2301 | e02: i8, | |
2302 | e03: i8, | |
2303 | e04: i8, | |
2304 | e05: i8, | |
2305 | e06: i8, | |
2306 | e07: i8, | |
2307 | e08: i8, | |
2308 | e09: i8, | |
2309 | e10: i8, | |
2310 | e11: i8, | |
2311 | e12: i8, | |
2312 | e13: i8, | |
2313 | e14: i8, | |
2314 | e15: i8, | |
2315 | e16: i8, | |
2316 | e17: i8, | |
2317 | e18: i8, | |
2318 | e19: i8, | |
2319 | e20: i8, | |
2320 | e21: i8, | |
2321 | e22: i8, | |
2322 | e23: i8, | |
2323 | e24: i8, | |
2324 | e25: i8, | |
2325 | e26: i8, | |
2326 | e27: i8, | |
2327 | e28: i8, | |
2328 | e29: i8, | |
2329 | e30: i8, | |
2330 | e31: i8, | |
0531ce1d | 2331 | ) -> __m256i { |
0731742a | 2332 | #[rustfmt::skip] |
532ac7d7 | 2333 | transmute(i8x32::new( |
0531ce1d XL |
2334 | e00, e01, e02, e03, e04, e05, e06, e07, |
2335 | e08, e09, e10, e11, e12, e13, e14, e15, | |
2336 | e16, e17, e18, e19, e20, e21, e22, e23, | |
2337 | e24, e25, e26, e27, e28, e29, e30, e31, | |
2338 | )) | |
2339 | } | |
2340 | ||
532ac7d7 | 2341 | /// Sets packed 16-bit integers in returned vector with the supplied values in |
0531ce1d | 2342 | /// reverse order. |
83c7162d | 2343 | /// |
353b0b11 | 2344 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi16) |
0531ce1d XL |
2345 | #[inline] |
2346 | #[target_feature(enable = "avx")] | |
2347 | // This intrinsic has no corresponding instruction. | |
83c7162d | 2348 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 2349 | pub unsafe fn _mm256_setr_epi16( |
0731742a XL |
2350 | e00: i16, |
2351 | e01: i16, | |
2352 | e02: i16, | |
2353 | e03: i16, | |
2354 | e04: i16, | |
2355 | e05: i16, | |
2356 | e06: i16, | |
2357 | e07: i16, | |
2358 | e08: i16, | |
2359 | e09: i16, | |
2360 | e10: i16, | |
2361 | e11: i16, | |
2362 | e12: i16, | |
2363 | e13: i16, | |
2364 | e14: i16, | |
2365 | e15: i16, | |
0531ce1d | 2366 | ) -> __m256i { |
0731742a | 2367 | #[rustfmt::skip] |
532ac7d7 | 2368 | transmute(i16x16::new( |
0531ce1d XL |
2369 | e00, e01, e02, e03, |
2370 | e04, e05, e06, e07, | |
2371 | e08, e09, e10, e11, | |
2372 | e12, e13, e14, e15, | |
2373 | )) | |
2374 | } | |
2375 | ||
532ac7d7 | 2376 | /// Sets packed 32-bit integers in returned vector with the supplied values in |
0531ce1d | 2377 | /// reverse order. |
83c7162d | 2378 | /// |
353b0b11 | 2379 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi32) |
0531ce1d XL |
2380 | #[inline] |
2381 | #[target_feature(enable = "avx")] | |
2382 | // This intrinsic has no corresponding instruction. | |
83c7162d | 2383 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 2384 | pub unsafe fn _mm256_setr_epi32( |
0731742a XL |
2385 | e0: i32, |
2386 | e1: i32, | |
2387 | e2: i32, | |
2388 | e3: i32, | |
2389 | e4: i32, | |
2390 | e5: i32, | |
2391 | e6: i32, | |
2392 | e7: i32, | |
0531ce1d | 2393 | ) -> __m256i { |
532ac7d7 | 2394 | transmute(i32x8::new(e0, e1, e2, e3, e4, e5, e6, e7)) |
0531ce1d XL |
2395 | } |
2396 | ||
532ac7d7 | 2397 | /// Sets packed 64-bit integers in returned vector with the supplied values in |
0531ce1d | 2398 | /// reverse order. |
83c7162d | 2399 | /// |
353b0b11 | 2400 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi64x) |
0531ce1d XL |
2401 | #[inline] |
2402 | #[target_feature(enable = "avx")] | |
2403 | // This intrinsic has no corresponding instruction. | |
83c7162d | 2404 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 2405 | pub unsafe fn _mm256_setr_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i { |
532ac7d7 | 2406 | transmute(i64x4::new(a, b, c, d)) |
0531ce1d XL |
2407 | } |
2408 | ||
532ac7d7 | 2409 | /// Broadcasts double-precision (64-bit) floating-point value `a` to all |
0531ce1d | 2410 | /// elements of returned vector. |
83c7162d | 2411 | /// |
353b0b11 | 2412 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_pd) |
0531ce1d XL |
2413 | #[inline] |
2414 | #[target_feature(enable = "avx")] | |
2415 | // This intrinsic has no corresponding instruction. | |
83c7162d | 2416 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
2417 | pub unsafe fn _mm256_set1_pd(a: f64) -> __m256d { |
2418 | _mm256_setr_pd(a, a, a, a) | |
2419 | } | |
2420 | ||
532ac7d7 | 2421 | /// Broadcasts single-precision (32-bit) floating-point value `a` to all |
0531ce1d | 2422 | /// elements of returned vector. |
83c7162d | 2423 | /// |
353b0b11 | 2424 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_ps) |
0531ce1d XL |
2425 | #[inline] |
2426 | #[target_feature(enable = "avx")] | |
2427 | // This intrinsic has no corresponding instruction. | |
83c7162d | 2428 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
2429 | pub unsafe fn _mm256_set1_ps(a: f32) -> __m256 { |
2430 | _mm256_setr_ps(a, a, a, a, a, a, a, a) | |
2431 | } | |
2432 | ||
532ac7d7 | 2433 | /// Broadcasts 8-bit integer `a` to all elements of returned vector. |
0531ce1d | 2434 | /// This intrinsic may generate the `vpbroadcastb` instruction. |
83c7162d | 2435 | /// |
353b0b11 | 2436 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi8) |
0531ce1d XL |
2437 | #[inline] |
2438 | #[target_feature(enable = "avx")] | |
0531ce1d | 2439 | // This intrinsic has no corresponding instruction. |
83c7162d | 2440 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 2441 | pub unsafe fn _mm256_set1_epi8(a: i8) -> __m256i { |
0731742a | 2442 | #[rustfmt::skip] |
0531ce1d XL |
2443 | _mm256_setr_epi8( |
2444 | a, a, a, a, a, a, a, a, | |
2445 | a, a, a, a, a, a, a, a, | |
2446 | a, a, a, a, a, a, a, a, | |
2447 | a, a, a, a, a, a, a, a, | |
2448 | ) | |
2449 | } | |
2450 | ||
9ffffee4 | 2451 | /// Broadcasts 16-bit integer `a` to all elements of returned vector. |
0531ce1d | 2452 | /// This intrinsic may generate the `vpbroadcastw` instruction. |
83c7162d | 2453 | /// |
353b0b11 | 2454 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi16) |
0531ce1d XL |
2455 | #[inline] |
2456 | #[target_feature(enable = "avx")] | |
2457 | //#[cfg_attr(test, assert_instr(vpshufb))] | |
2458 | #[cfg_attr(test, assert_instr(vinsertf128))] | |
2459 | // This intrinsic has no corresponding instruction. | |
83c7162d | 2460 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
2461 | pub unsafe fn _mm256_set1_epi16(a: i16) -> __m256i { |
2462 | _mm256_setr_epi16(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a) | |
2463 | } | |
2464 | ||
532ac7d7 | 2465 | /// Broadcasts 32-bit integer `a` to all elements of returned vector. |
0531ce1d | 2466 | /// This intrinsic may generate the `vpbroadcastd` instruction. |
83c7162d | 2467 | /// |
353b0b11 | 2468 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi32) |
0531ce1d XL |
2469 | #[inline] |
2470 | #[target_feature(enable = "avx")] | |
2471 | // This intrinsic has no corresponding instruction. | |
83c7162d | 2472 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
2473 | pub unsafe fn _mm256_set1_epi32(a: i32) -> __m256i { |
2474 | _mm256_setr_epi32(a, a, a, a, a, a, a, a) | |
2475 | } | |
2476 | ||
532ac7d7 | 2477 | /// Broadcasts 64-bit integer `a` to all elements of returned vector. |
0531ce1d | 2478 | /// This intrinsic may generate the `vpbroadcastq` instruction. |
83c7162d | 2479 | /// |
353b0b11 | 2480 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi64x) |
0531ce1d XL |
2481 | #[inline] |
2482 | #[target_feature(enable = "avx")] | |
e1599b0c XL |
2483 | #[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(vinsertf128))] |
2484 | #[cfg_attr(all(test, target_arch = "x86"), assert_instr(vbroadcastsd))] | |
0531ce1d | 2485 | // This intrinsic has no corresponding instruction. |
83c7162d | 2486 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
2487 | pub unsafe fn _mm256_set1_epi64x(a: i64) -> __m256i { |
2488 | _mm256_setr_epi64x(a, a, a, a) | |
2489 | } | |
2490 | ||
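// A tiny sketch of the broadcast helpers (assuming AVX; the helper name is
// illustrative only): `set1` replicates one scalar across every element.
#[target_feature(enable = "avx")]
unsafe fn example_set1_epi32() {
    let v: [i32; 8] = transmute(_mm256_set1_epi32(7));
    assert_eq!(v, [7; 8]);
}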
2491 | /// Casts vector of type __m256d to type __m256. | |
83c7162d | 2492 | /// |
353b0b11 | 2493 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd_ps) |
0531ce1d XL |
2494 | #[inline] |
2495 | #[target_feature(enable = "avx")] | |
2496 | // This intrinsic is only used for compilation and does not generate any | |
2497 | // instructions, thus it has zero latency. | |
83c7162d | 2498 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 2499 | pub unsafe fn _mm256_castpd_ps(a: __m256d) -> __m256 { |
532ac7d7 | 2500 | transmute(a) |
0531ce1d XL |
2501 | } |
2502 | ||
2503 | /// Casts vector of type __m256 to type __m256d. | |
83c7162d | 2504 | /// |
353b0b11 | 2505 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps_pd) |
0531ce1d XL |
2506 | #[inline] |
2507 | #[target_feature(enable = "avx")] | |
2508 | // This intrinsic is only used for compilation and does not generate any | |
2509 | // instructions, thus it has zero latency. | |
83c7162d | 2510 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 2511 | pub unsafe fn _mm256_castps_pd(a: __m256) -> __m256d { |
532ac7d7 | 2512 | transmute(a) |
0531ce1d XL |
2513 | } |
2514 | ||
2515 | /// Casts vector of type __m256 to type __m256i. | |
83c7162d | 2516 | /// |
353b0b11 | 2517 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps_si256) |
0531ce1d XL |
2518 | #[inline] |
2519 | #[target_feature(enable = "avx")] | |
2520 | // This intrinsic is only used for compilation and does not generate any | |
2521 | // instructions, thus it has zero latency. | |
83c7162d | 2522 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 2523 | pub unsafe fn _mm256_castps_si256(a: __m256) -> __m256i { |
532ac7d7 | 2524 | transmute(a) |
0531ce1d XL |
2525 | } |
2526 | ||
2527 | /// Casts vector of type __m256i to type __m256. | |
83c7162d | 2528 | /// |
353b0b11 | 2529 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_ps) |
0531ce1d XL |
2530 | #[inline] |
2531 | #[target_feature(enable = "avx")] | |
2532 | // This intrinsic is only used for compilation and does not generate any | |
2533 | // instructions, thus it has zero latency. | |
83c7162d | 2534 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 2535 | pub unsafe fn _mm256_castsi256_ps(a: __m256i) -> __m256 { |
532ac7d7 | 2536 | transmute(a) |
0531ce1d XL |
2537 | } |
2538 | ||
2539 | /// Casts vector of type __m256d to type __m256i. | |
83c7162d | 2540 | /// |
353b0b11 | 2541 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd_si256) |
0531ce1d XL |
2542 | #[inline] |
2543 | #[target_feature(enable = "avx")] | |
2544 | // This intrinsic is only used for compilation and does not generate any | |
2545 | // instructions, thus it has zero latency. | |
83c7162d | 2546 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 2547 | pub unsafe fn _mm256_castpd_si256(a: __m256d) -> __m256i { |
532ac7d7 | 2548 | transmute(a) |
0531ce1d XL |
2549 | } |
2550 | ||
2551 | /// Casts vector of type __m256i to type __m256d. | |
83c7162d | 2552 | /// |
353b0b11 | 2553 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_pd) |
0531ce1d XL |
2554 | #[inline] |
2555 | #[target_feature(enable = "avx")] | |
2556 | // This intrinsic is only used for compilation and does not generate any | |
2557 | // instructions, thus it has zero latency. | |
83c7162d | 2558 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 2559 | pub unsafe fn _mm256_castsi256_pd(a: __m256i) -> __m256d { |
532ac7d7 | 2560 | transmute(a) |
0531ce1d XL |
2561 | } |
2562 | ||
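// A sketch of the reinterpreting casts (assuming AVX; the helper name is
// illustrative only): the bits are untouched, so a round trip through
// `__m256i` returns the original lanes unchanged.
#[target_feature(enable = "avx")]
unsafe fn example_cast_round_trip() {
    let v = _mm256_set1_ps(1.0);
    let r: [f32; 8] = transmute(_mm256_castsi256_ps(_mm256_castps_si256(v)));
    assert_eq!(r, [1.0; 8]);
}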
2563 | /// Casts vector of type __m256 to type __m128. | |
83c7162d | 2564 | /// |
353b0b11 | 2565 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps256_ps128) |
0531ce1d XL |
2566 | #[inline] |
2567 | #[target_feature(enable = "avx")] | |
2568 | // This intrinsic is only used for compilation and does not generate any | |
2569 | // instructions, thus it has zero latency. | |
83c7162d | 2570 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 2571 | pub unsafe fn _mm256_castps256_ps128(a: __m256) -> __m128 { |
353b0b11 | 2572 | simd_shuffle!(a, a, [0, 1, 2, 3]) |
0531ce1d XL |
2573 | } |
2574 | ||
2575 | /// Casts vector of type __m256d to type __m128d. | |
83c7162d | 2576 | /// |
353b0b11 | 2577 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd256_pd128) |
0531ce1d XL |
2578 | #[inline] |
2579 | #[target_feature(enable = "avx")] | |
2580 | // This intrinsic is only used for compilation and does not generate any | |
2581 | // instructions, thus it has zero latency. | |
83c7162d | 2582 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 2583 | pub unsafe fn _mm256_castpd256_pd128(a: __m256d) -> __m128d { |
353b0b11 | 2584 | simd_shuffle!(a, a, [0, 1]) |
0531ce1d XL |
2585 | } |
2586 | ||
2587 | /// Casts vector of type __m256i to type __m128i. | |
83c7162d | 2588 | /// |
353b0b11 | 2589 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_si128) |
0531ce1d XL |
2590 | #[inline] |
2591 | #[target_feature(enable = "avx")] | |
2592 | // This intrinsic is only used for compilation and does not generate any | |
2593 | // instructions, thus it has zero latency. | |
83c7162d | 2594 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
2595 | pub unsafe fn _mm256_castsi256_si128(a: __m256i) -> __m128i { |
2596 | let a = a.as_i64x4(); | |
353b0b11 | 2597 | let dst: i64x2 = simd_shuffle!(a, a, [0, 1]); |
532ac7d7 | 2598 | transmute(dst) |
0531ce1d XL |
2599 | } |
2600 | ||
2601 | /// Casts vector of type __m128 to type __m256; | |
2602 | /// the upper 128 bits of the result are undefined. | |
83c7162d | 2603 | /// |
353b0b11 | 2604 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps128_ps256) |
0531ce1d XL |
2605 | #[inline] |
2606 | #[target_feature(enable = "avx")] | |
2607 | // This intrinsic is only used for compilation and does not generate any | |
2608 | // instructions, thus it has zero latency. | |
83c7162d | 2609 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 2610 | pub unsafe fn _mm256_castps128_ps256(a: __m128) -> __m256 { |
353b0b11 FG |
2611 | // FIXME simd_shuffle!(a, a, [0, 1, 2, 3, -1, -1, -1, -1]) |
2612 | simd_shuffle!(a, a, [0, 1, 2, 3, 0, 0, 0, 0]) | |
0531ce1d XL |
2613 | } |
2614 | ||
2615 | /// Casts vector of type __m128d to type __m256d; | |
2616 | /// the upper 128 bits of the result are undefined. | |
83c7162d | 2617 | /// |
353b0b11 | 2618 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd128_pd256) |
0531ce1d XL |
2619 | #[inline] |
2620 | #[target_feature(enable = "avx")] | |
2621 | // This intrinsic is only used for compilation and does not generate any | |
2622 | // instructions, thus it has zero latency. | |
83c7162d | 2623 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 2624 | pub unsafe fn _mm256_castpd128_pd256(a: __m128d) -> __m256d { |
353b0b11 FG |
2625 | // FIXME simd_shuffle!(a, a, [0, 1, -1, -1]) |
2626 | simd_shuffle!(a, a, [0, 1, 0, 0]) | |
0531ce1d XL |
2627 | } |
2628 | ||
2629 | /// Casts vector of type __m128i to type __m256i; | |
2630 | /// the upper 128 bits of the result are undefined. | |
83c7162d | 2631 | /// |
353b0b11 | 2632 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi128_si256) |
0531ce1d XL |
2633 | #[inline] |
2634 | #[target_feature(enable = "avx")] | |
2635 | // This intrinsic is only used for compilation and does not generate any | |
2636 | // instructions, thus it has zero latency. | |
83c7162d | 2637 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
2638 | pub unsafe fn _mm256_castsi128_si256(a: __m128i) -> __m256i { |
2639 | let a = a.as_i64x2(); | |
353b0b11 FG |
2640 | // FIXME simd_shuffle!(a, a, [0, 1, -1, -1]) |
2641 | let dst: i64x4 = simd_shuffle!(a, a, [0, 1, 0, 0]); | |
532ac7d7 | 2642 | transmute(dst) |
0531ce1d XL |
2643 | } |
2644 | ||
83c7162d XL |
2645 | /// Constructs a 256-bit floating-point vector of `[8 x float]` from a |
2646 | /// 128-bit floating-point vector of `[4 x float]`. The lower 128 bits contain | |
0531ce1d | 2647 | /// the value of the source vector. The upper 128 bits are set to zero. |
83c7162d | 2648 | /// |
353b0b11 | 2649 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextps128_ps256) |
0531ce1d XL |
2650 | #[inline] |
2651 | #[target_feature(enable = "avx,sse")] | |
2652 | // This intrinsic is only used for compilation and does not generate any | |
2653 | // instructions, thus it has zero latency. | |
83c7162d | 2654 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 2655 | pub unsafe fn _mm256_zextps128_ps256(a: __m128) -> __m256 { |
353b0b11 | 2656 | simd_shuffle!(a, _mm_setzero_ps(), [0, 1, 2, 3, 4, 5, 6, 7]) |
0531ce1d XL |
2657 | } |
2658 | ||
2659 | /// Constructs a 256-bit integer vector from a 128-bit integer vector. | |
2660 | /// The lower 128 bits contain the value of the source vector. The upper | |
2661 | /// 128 bits are set to zero. | |
83c7162d | 2662 | /// |
353b0b11 | 2663 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextsi128_si256) |
0531ce1d XL |
2664 | #[inline] |
2665 | #[target_feature(enable = "avx,sse2")] | |
2666 | // This intrinsic is only used for compilation and does not generate any | |
2667 | // instructions, thus it has zero latency. | |
83c7162d | 2668 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
2669 | pub unsafe fn _mm256_zextsi128_si256(a: __m128i) -> __m256i { |
2670 | let b = _mm_setzero_si128().as_i64x2(); | |
353b0b11 | 2671 | let dst: i64x4 = simd_shuffle!(a.as_i64x2(), b, [0, 1, 2, 3]); |
532ac7d7 | 2672 | transmute(dst) |
0531ce1d XL |
2673 | } |
2674 | ||
83c7162d XL |
2675 | /// Constructs a 256-bit floating-point vector of `[4 x double]` from a |
2676 | /// 128-bit floating-point vector of `[2 x double]`. The lower 128 bits | |
0531ce1d XL |
2677 | /// contain the value of the source vector. The upper 128 bits are set |
2678 | /// to zero. | |
83c7162d | 2679 | /// |
353b0b11 | 2680 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextpd128_pd256) |
0531ce1d XL |
2681 | #[inline] |
2682 | #[target_feature(enable = "avx,sse2")] | |
2683 | // This intrinsic is only used for compilation and does not generate any | |
2684 | // instructions, thus it has zero latency. | |
83c7162d | 2685 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 2686 | pub unsafe fn _mm256_zextpd128_pd256(a: __m128d) -> __m256d { |
353b0b11 | 2687 | simd_shuffle!(a, _mm_setzero_pd(), [0, 1, 2, 3]) |
0531ce1d XL |
2688 | } |
2689 | ||
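// A sketch contrasting the widening cast with the zero-extending helpers
// (assuming AVX; the helper name is illustrative only): only the `zext`
// variants guarantee anything about the upper 128 bits.
#[target_feature(enable = "avx")]
unsafe fn example_zext_vs_cast() {
    let x = _mm_set1_ps(3.0);
    let z: [f32; 8] = transmute(_mm256_zextps128_ps256(x));
    // The low half carries `x`; the high half is guaranteed to be zero.
    assert_eq!(z, [3.0, 3.0, 3.0, 3.0, 0.0, 0.0, 0.0, 0.0]);
    // `_mm256_castps128_ps256(x)` also carries `x` in its low half, but its
    // upper four lanes are unspecified and must not be inspected.
    let low: [f32; 4] = transmute(_mm256_castps256_ps128(_mm256_castps128_ps256(x)));
    assert_eq!(low, [3.0; 4]);
}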
49aad941 FG |
2690 | /// Returns vector of type `__m256` with indeterminate elements. |
2691 | /// Despite being "undefined", this is some valid value and not equivalent to [`mem::MaybeUninit`]. | |
2692 | /// In practice, this is equivalent to [`mem::zeroed`]. | |
83c7162d | 2693 | /// |
353b0b11 | 2694 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_ps) |
0531ce1d XL |
2695 | #[inline] |
2696 | #[target_feature(enable = "avx")] | |
2697 | // This intrinsic has no corresponding instruction. | |
83c7162d | 2698 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 2699 | pub unsafe fn _mm256_undefined_ps() -> __m256 { |
3dfed10e | 2700 | _mm256_set1_ps(0.0) |
0531ce1d XL |
2701 | } |
2702 | ||
49aad941 FG |
2703 | /// Returns vector of type `__m256d` with indeterminate elements. |
2704 | /// Despite being "undefined", this is some valid value and not equivalent to [`mem::MaybeUninit`]. | |
2705 | /// In practice, this is equivalent to [`mem::zeroed`]. | |
83c7162d | 2706 | /// |
353b0b11 | 2707 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_pd) |
0531ce1d XL |
2708 | #[inline] |
2709 | #[target_feature(enable = "avx")] | |
2710 | // This intrinsic has no corresponding instruction. | |
83c7162d | 2711 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 2712 | pub unsafe fn _mm256_undefined_pd() -> __m256d { |
3dfed10e | 2713 | _mm256_set1_pd(0.0) |
0531ce1d XL |
2714 | } |
2715 | ||
49aad941 FG |
2716 | /// Returns vector of type `__m256i` with indeterminate elements. |
2717 | /// Despite being "undefined", this is some valid value and not equivalent to [`mem::MaybeUninit`]. | |
2718 | /// In practice, this is equivalent to [`mem::zeroed`]. | |
83c7162d | 2719 | /// |
353b0b11 | 2720 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_si256) |
0531ce1d XL |
2721 | #[inline] |
2722 | #[target_feature(enable = "avx")] | |
2723 | // This intrinsic has no corresponding instruction. | |
83c7162d | 2724 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 2725 | pub unsafe fn _mm256_undefined_si256() -> __m256i { |
04454e1e | 2726 | __m256i(0, 0, 0, 0) |
0531ce1d XL |
2727 | } |
2728 | ||
532ac7d7 | 2729 | /// Sets packed __m256 returned vector with the supplied values. |
83c7162d | 2730 | /// |
353b0b11 | 2731 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128) |
0531ce1d XL |
2732 | #[inline] |
2733 | #[target_feature(enable = "avx")] | |
2734 | #[cfg_attr(test, assert_instr(vinsertf128))] | |
83c7162d | 2735 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 2736 | pub unsafe fn _mm256_set_m128(hi: __m128, lo: __m128) -> __m256 { |
353b0b11 | 2737 | simd_shuffle!(lo, hi, [0, 1, 2, 3, 4, 5, 6, 7]) |
0531ce1d XL |
2738 | } |
2739 | ||
532ac7d7 | 2740 | /// Sets packed __m256d returned vector with the supplied values. |
83c7162d | 2741 | /// |
353b0b11 | 2742 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128d) |
0531ce1d XL |
2743 | #[inline] |
2744 | #[target_feature(enable = "avx")] | |
2745 | #[cfg_attr(test, assert_instr(vinsertf128))] | |
83c7162d | 2746 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 2747 | pub unsafe fn _mm256_set_m128d(hi: __m128d, lo: __m128d) -> __m256d { |
532ac7d7 XL |
2748 | let hi: __m128 = transmute(hi); |
2749 | let lo: __m128 = transmute(lo); | |
2750 | transmute(_mm256_set_m128(hi, lo)) | |
0531ce1d XL |
2751 | } |
2752 | ||
532ac7d7 | 2753 | /// Sets packed __m256i returned vector with the supplied values. |
83c7162d | 2754 | /// |
353b0b11 | 2755 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128i) |
0531ce1d XL |
2756 | #[inline] |
2757 | #[target_feature(enable = "avx")] | |
2758 | #[cfg_attr(test, assert_instr(vinsertf128))] | |
83c7162d | 2759 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 2760 | pub unsafe fn _mm256_set_m128i(hi: __m128i, lo: __m128i) -> __m256i { |
532ac7d7 XL |
2761 | let hi: __m128 = transmute(hi); |
2762 | let lo: __m128 = transmute(lo); | |
2763 | transmute(_mm256_set_m128(hi, lo)) | |
0531ce1d XL |
2764 | } |
2765 | ||
532ac7d7 | 2766 | /// Sets packed __m256 returned vector with the supplied values. |
83c7162d | 2767 | /// |
353b0b11 | 2768 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128) |
0531ce1d XL |
2769 | #[inline] |
2770 | #[target_feature(enable = "avx")] | |
2771 | #[cfg_attr(test, assert_instr(vinsertf128))] | |
83c7162d | 2772 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
2773 | pub unsafe fn _mm256_setr_m128(lo: __m128, hi: __m128) -> __m256 { |
2774 | _mm256_set_m128(hi, lo) | |
2775 | } | |
2776 | ||
532ac7d7 | 2777 | /// Sets packed __m256d returned vector with the supplied values. |
83c7162d | 2778 | /// |
353b0b11 | 2779 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128d) |
0531ce1d XL |
2780 | #[inline] |
2781 | #[target_feature(enable = "avx")] | |
2782 | #[cfg_attr(test, assert_instr(vinsertf128))] | |
83c7162d | 2783 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
2784 | pub unsafe fn _mm256_setr_m128d(lo: __m128d, hi: __m128d) -> __m256d { |
2785 | _mm256_set_m128d(hi, lo) | |
2786 | } | |
2787 | ||
532ac7d7 | 2788 | /// Sets packed __m256i returned vector with the supplied values. |
83c7162d | 2789 | /// |
353b0b11 | 2790 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128i) |
0531ce1d XL |
2791 | #[inline] |
2792 | #[target_feature(enable = "avx")] | |
2793 | #[cfg_attr(test, assert_instr(vinsertf128))] | |
83c7162d | 2794 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
2795 | pub unsafe fn _mm256_setr_m128i(lo: __m128i, hi: __m128i) -> __m256i { |
2796 | _mm256_set_m128i(hi, lo) | |
2797 | } | |
2798 | ||
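// A minimal sketch of combining two 128-bit halves (assuming AVX; the helper
// name is illustrative only): `set_m128` takes the high half first, while
// `setr_m128` takes the low half first.
#[target_feature(enable = "avx")]
unsafe fn example_set_m128() {
    let lo = _mm_set1_ps(1.0);
    let hi = _mm_set1_ps(2.0);
    let v: [f32; 8] = transmute(_mm256_set_m128(hi, lo));
    assert_eq!(v, [1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0]);
}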
532ac7d7 | 2799 | /// Loads two 128-bit values (composed of 4 packed single-precision (32-bit) |
0531ce1d XL |
2800 | /// floating-point elements) from memory, and combine them into a 256-bit |
2801 | /// value. | |
2802 | /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. | |
83c7162d | 2803 | /// |
353b0b11 | 2804 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128) |
0531ce1d XL |
2805 | #[inline] |
2806 | #[target_feature(enable = "avx,sse")] | |
2807 | // This intrinsic has no corresponding instruction. | |
83c7162d | 2808 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0731742a | 2809 | pub unsafe fn _mm256_loadu2_m128(hiaddr: *const f32, loaddr: *const f32) -> __m256 { |
0531ce1d | 2810 | let a = _mm256_castps128_ps256(_mm_loadu_ps(loaddr)); |
17df50a5 | 2811 | _mm256_insertf128_ps::<1>(a, _mm_loadu_ps(hiaddr)) |
0531ce1d XL |
2812 | } |
2813 | ||
532ac7d7 | 2814 | /// Loads two 128-bit values (composed of 2 packed double-precision (64-bit) |
0531ce1d XL |
2815 | /// floating-point elements) from memory, and combine them into a 256-bit |
2816 | /// value. | |
2817 | /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. | |
83c7162d | 2818 | /// |
353b0b11 | 2819 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128d) |
0531ce1d XL |
2820 | #[inline] |
2821 | #[target_feature(enable = "avx,sse2")] | |
2822 | // This intrinsic has no corresponding instruction. | |
83c7162d | 2823 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0731742a | 2824 | pub unsafe fn _mm256_loadu2_m128d(hiaddr: *const f64, loaddr: *const f64) -> __m256d { |
0531ce1d | 2825 | let a = _mm256_castpd128_pd256(_mm_loadu_pd(loaddr)); |
17df50a5 | 2826 | _mm256_insertf128_pd::<1>(a, _mm_loadu_pd(hiaddr)) |
0531ce1d XL |
2827 | } |
2828 | ||
532ac7d7 | 2829 | /// Loads two 128-bit values (composed of integer data) from memory, and combine |
0531ce1d XL |
2830 | /// them into a 256-bit value. |
2831 | /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. | |
83c7162d | 2832 | /// |
353b0b11 | 2833 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128i) |
0531ce1d XL |
2834 | #[inline] |
2835 | #[target_feature(enable = "avx,sse2")] | |
2836 | // This intrinsic has no corresponding instruction. | |
83c7162d | 2837 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0731742a | 2838 | pub unsafe fn _mm256_loadu2_m128i(hiaddr: *const __m128i, loaddr: *const __m128i) -> __m256i { |
0531ce1d | 2839 | let a = _mm256_castsi128_si256(_mm_loadu_si128(loaddr)); |
17df50a5 | 2840 | _mm256_insertf128_si256::<1>(a, _mm_loadu_si128(hiaddr)) |
0531ce1d XL |
2841 | } |
2842 | ||
532ac7d7 | 2843 | /// Stores the high and low 128-bit halves (each composed of 4 packed |
0531ce1d XL |
2844 | /// single-precision (32-bit) floating-point elements) from `a` into memory two |
2845 | /// different 128-bit locations. | |
2846 | /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. | |
83c7162d | 2847 | /// |
353b0b11 | 2848 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128) |
0531ce1d XL |
2849 | #[inline] |
2850 | #[target_feature(enable = "avx,sse")] | |
2851 | // This intrinsic has no corresponding instruction. | |
83c7162d | 2852 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0731742a | 2853 | pub unsafe fn _mm256_storeu2_m128(hiaddr: *mut f32, loaddr: *mut f32, a: __m256) { |
0531ce1d XL |
2854 | let lo = _mm256_castps256_ps128(a); |
2855 | _mm_storeu_ps(loaddr, lo); | |
17df50a5 | 2856 | let hi = _mm256_extractf128_ps::<1>(a); |
0531ce1d XL |
2857 | _mm_storeu_ps(hiaddr, hi); |
2858 | } | |
2859 | ||
532ac7d7 | 2860 | /// Stores the high and low 128-bit halves (each composed of 2 packed |
0531ce1d XL |
2861 | /// double-precision (64-bit) floating-point elements) from `a` into memory two |
2862 | /// different 128-bit locations. | |
2863 | /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. | |
83c7162d | 2864 | /// |
353b0b11 | 2865 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128d) |
0531ce1d XL |
2866 | #[inline] |
2867 | #[target_feature(enable = "avx,sse2")] | |
2868 | // This intrinsic has no corresponding instruction. | |
83c7162d | 2869 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0731742a | 2870 | pub unsafe fn _mm256_storeu2_m128d(hiaddr: *mut f64, loaddr: *mut f64, a: __m256d) { |
0531ce1d XL |
2871 | let lo = _mm256_castpd256_pd128(a); |
2872 | _mm_storeu_pd(loaddr, lo); | |
17df50a5 | 2873 | let hi = _mm256_extractf128_pd::<1>(a); |
0531ce1d XL |
2874 | _mm_storeu_pd(hiaddr, hi); |
2875 | } | |
2876 | ||
532ac7d7 | 2877 | /// Stores the high and low 128-bit halves (each composed of integer data) from |
0531ce1d XL |
2878 | /// `a` into memory two different 128-bit locations. |
2879 | /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. | |
83c7162d | 2880 | /// |
353b0b11 | 2881 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128i) |
0531ce1d XL |
2882 | #[inline] |
2883 | #[target_feature(enable = "avx,sse2")] | |
2884 | // This intrinsic has no corresponding instruction. | |
83c7162d | 2885 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0731742a | 2886 | pub unsafe fn _mm256_storeu2_m128i(hiaddr: *mut __m128i, loaddr: *mut __m128i, a: __m256i) { |
0531ce1d XL |
2887 | let lo = _mm256_castsi256_si128(a); |
2888 | _mm_storeu_si128(loaddr, lo); | |
17df50a5 | 2889 | let hi = _mm256_extractf128_si256::<1>(a); |
0531ce1d XL |
2890 | _mm_storeu_si128(hiaddr, hi); |
2891 | } | |
2892 | ||
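// A sketch of the split load/store pair (assuming AVX; the helper name is
// illustrative only): the two halves round-trip through two independent,
// unaligned 128-bit memory locations.
#[target_feature(enable = "avx")]
unsafe fn example_loadu2_storeu2() {
    let lo = [1.0f32, 2.0, 3.0, 4.0];
    let hi = [5.0f32, 6.0, 7.0, 8.0];
    let v = _mm256_loadu2_m128(hi.as_ptr(), lo.as_ptr());
    let mut lo_out = [0.0f32; 4];
    let mut hi_out = [0.0f32; 4];
    _mm256_storeu2_m128(hi_out.as_mut_ptr(), lo_out.as_mut_ptr(), v);
    assert_eq!(lo_out, lo);
    assert_eq!(hi_out, hi);
}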
83c7162d XL |
2893 | /// Returns the first element of the input vector of `[8 x float]`. |
2894 | /// | |
353b0b11 | 2895 | /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtss_f32) |
0531ce1d XL |
2896 | #[inline] |
2897 | #[target_feature(enable = "avx")] | |
2898 | //#[cfg_attr(test, assert_instr(movss))] FIXME | |
83c7162d | 2899 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
2900 | pub unsafe fn _mm256_cvtss_f32(a: __m256) -> f32 { |
2901 | simd_extract(a, 0) | |
2902 | } | |
2903 | ||
5e7ed085 | 2904 | // LLVM intrinsics used in the above functions |
0531ce1d XL |
2905 | #[allow(improper_ctypes)] |
2906 | extern "C" { | |
2907 | #[link_name = "llvm.x86.avx.addsub.pd.256"] | |
2908 | fn addsubpd256(a: __m256d, b: __m256d) -> __m256d; | |
2909 | #[link_name = "llvm.x86.avx.addsub.ps.256"] | |
2910 | fn addsubps256(a: __m256, b: __m256) -> __m256; | |
0531ce1d XL |
2911 | #[link_name = "llvm.x86.avx.round.pd.256"] |
2912 | fn roundpd256(a: __m256d, b: i32) -> __m256d; | |
2913 | #[link_name = "llvm.x86.avx.round.ps.256"] | |
2914 | fn roundps256(a: __m256, b: i32) -> __m256; | |
0531ce1d XL |
2915 | #[link_name = "llvm.x86.avx.sqrt.ps.256"] |
2916 | fn sqrtps256(a: __m256) -> __m256; | |
2917 | #[link_name = "llvm.x86.avx.blendv.pd.256"] | |
2918 | fn vblendvpd(a: __m256d, b: __m256d, c: __m256d) -> __m256d; | |
2919 | #[link_name = "llvm.x86.avx.blendv.ps.256"] | |
2920 | fn vblendvps(a: __m256, b: __m256, c: __m256) -> __m256; | |
2921 | #[link_name = "llvm.x86.avx.dp.ps.256"] | |
2922 | fn vdpps(a: __m256, b: __m256, imm8: i32) -> __m256; | |
2923 | #[link_name = "llvm.x86.avx.hadd.pd.256"] | |
2924 | fn vhaddpd(a: __m256d, b: __m256d) -> __m256d; | |
2925 | #[link_name = "llvm.x86.avx.hadd.ps.256"] | |
2926 | fn vhaddps(a: __m256, b: __m256) -> __m256; | |
2927 | #[link_name = "llvm.x86.avx.hsub.pd.256"] | |
2928 | fn vhsubpd(a: __m256d, b: __m256d) -> __m256d; | |
2929 | #[link_name = "llvm.x86.avx.hsub.ps.256"] | |
2930 | fn vhsubps(a: __m256, b: __m256) -> __m256; | |
2931 | #[link_name = "llvm.x86.sse2.cmp.pd"] | |
3dfed10e | 2932 | fn vcmppd(a: __m128d, b: __m128d, imm8: i8) -> __m128d; |
0531ce1d XL |
2933 | #[link_name = "llvm.x86.avx.cmp.pd.256"] |
2934 | fn vcmppd256(a: __m256d, b: __m256d, imm8: u8) -> __m256d; | |
2935 | #[link_name = "llvm.x86.sse.cmp.ps"] | |
3dfed10e | 2936 | fn vcmpps(a: __m128, b: __m128, imm8: i8) -> __m128; |
0531ce1d XL |
2937 | #[link_name = "llvm.x86.avx.cmp.ps.256"] |
2938 | fn vcmpps256(a: __m256, b: __m256, imm8: u8) -> __m256; | |
2939 | #[link_name = "llvm.x86.sse2.cmp.sd"] | |
3dfed10e | 2940 | fn vcmpsd(a: __m128d, b: __m128d, imm8: i8) -> __m128d; |
0531ce1d | 2941 | #[link_name = "llvm.x86.sse.cmp.ss"] |
3dfed10e | 2942 | fn vcmpss(a: __m128, b: __m128, imm8: i8) -> __m128; |
0531ce1d XL |
2943 | #[link_name = "llvm.x86.avx.cvtdq2.ps.256"] |
2944 | fn vcvtdq2ps(a: i32x8) -> __m256; | |
2945 | #[link_name = "llvm.x86.avx.cvt.pd2.ps.256"] | |
2946 | fn vcvtpd2ps(a: __m256d) -> __m128; | |
2947 | #[link_name = "llvm.x86.avx.cvt.ps2dq.256"] | |
2948 | fn vcvtps2dq(a: __m256) -> i32x8; | |
2949 | #[link_name = "llvm.x86.avx.cvtt.pd2dq.256"] | |
2950 | fn vcvttpd2dq(a: __m256d) -> i32x4; | |
2951 | #[link_name = "llvm.x86.avx.cvt.pd2dq.256"] | |
2952 | fn vcvtpd2dq(a: __m256d) -> i32x4; | |
2953 | #[link_name = "llvm.x86.avx.cvtt.ps2dq.256"] | |
2954 | fn vcvttps2dq(a: __m256) -> i32x8; | |
2955 | #[link_name = "llvm.x86.avx.vzeroall"] | |
2956 | fn vzeroall(); | |
2957 | #[link_name = "llvm.x86.avx.vzeroupper"] | |
2958 | fn vzeroupper(); | |
2959 | #[link_name = "llvm.x86.avx.vpermilvar.ps.256"] | |
2960 | fn vpermilps256(a: __m256, b: i32x8) -> __m256; | |
2961 | #[link_name = "llvm.x86.avx.vpermilvar.ps"] | |
2962 | fn vpermilps(a: __m128, b: i32x4) -> __m128; | |
2963 | #[link_name = "llvm.x86.avx.vpermilvar.pd.256"] | |
2964 | fn vpermilpd256(a: __m256d, b: i64x4) -> __m256d; | |
2965 | #[link_name = "llvm.x86.avx.vpermilvar.pd"] | |
2966 | fn vpermilpd(a: __m128d, b: i64x2) -> __m128d; | |
2967 | #[link_name = "llvm.x86.avx.vperm2f128.ps.256"] | |
2968 | fn vperm2f128ps256(a: __m256, b: __m256, imm8: i8) -> __m256; | |
2969 | #[link_name = "llvm.x86.avx.vperm2f128.pd.256"] | |
2970 | fn vperm2f128pd256(a: __m256d, b: __m256d, imm8: i8) -> __m256d; | |
2971 | #[link_name = "llvm.x86.avx.vperm2f128.si.256"] | |
2972 | fn vperm2f128si256(a: i32x8, b: i32x8, imm8: i8) -> i32x8; | |
2973 | #[link_name = "llvm.x86.avx.vbroadcastf128.ps.256"] | |
2974 | fn vbroadcastf128ps256(a: &__m128) -> __m256; | |
2975 | #[link_name = "llvm.x86.avx.vbroadcastf128.pd.256"] | |
2976 | fn vbroadcastf128pd256(a: &__m128d) -> __m256d; | |
2977 | #[link_name = "llvm.x86.avx.storeu.pd.256"] | |
2978 | fn storeupd256(mem_addr: *mut f64, a: __m256d); | |
2979 | #[link_name = "llvm.x86.avx.storeu.ps.256"] | |
2980 | fn storeups256(mem_addr: *mut f32, a: __m256); | |
2981 | #[link_name = "llvm.x86.avx.storeu.dq.256"] | |
2982 | fn storeudq256(mem_addr: *mut i8, a: i8x32); | |
2983 | #[link_name = "llvm.x86.avx.maskload.pd.256"] | |
2984 | fn maskloadpd256(mem_addr: *const i8, mask: i64x4) -> __m256d; | |
2985 | #[link_name = "llvm.x86.avx.maskstore.pd.256"] | |
2986 | fn maskstorepd256(mem_addr: *mut i8, mask: i64x4, a: __m256d); | |
2987 | #[link_name = "llvm.x86.avx.maskload.pd"] | |
2988 | fn maskloadpd(mem_addr: *const i8, mask: i64x2) -> __m128d; | |
2989 | #[link_name = "llvm.x86.avx.maskstore.pd"] | |
2990 | fn maskstorepd(mem_addr: *mut i8, mask: i64x2, a: __m128d); | |
2991 | #[link_name = "llvm.x86.avx.maskload.ps.256"] | |
2992 | fn maskloadps256(mem_addr: *const i8, mask: i32x8) -> __m256; | |
2993 | #[link_name = "llvm.x86.avx.maskstore.ps.256"] | |
2994 | fn maskstoreps256(mem_addr: *mut i8, mask: i32x8, a: __m256); | |
2995 | #[link_name = "llvm.x86.avx.maskload.ps"] | |
2996 | fn maskloadps(mem_addr: *const i8, mask: i32x4) -> __m128; | |
2997 | #[link_name = "llvm.x86.avx.maskstore.ps"] | |
2998 | fn maskstoreps(mem_addr: *mut i8, mask: i32x4, a: __m128); | |
2999 | #[link_name = "llvm.x86.avx.ldu.dq.256"] | |
3000 | fn vlddqu(mem_addr: *const i8) -> i8x32; | |
3001 | #[link_name = "llvm.x86.avx.rcp.ps.256"] | |
3002 | fn vrcpps(a: __m256) -> __m256; | |
3003 | #[link_name = "llvm.x86.avx.rsqrt.ps.256"] | |
3004 | fn vrsqrtps(a: __m256) -> __m256; | |
3005 | #[link_name = "llvm.x86.avx.ptestz.256"] | |
3006 | fn ptestz256(a: i64x4, b: i64x4) -> i32; | |
3007 | #[link_name = "llvm.x86.avx.ptestc.256"] | |
3008 | fn ptestc256(a: i64x4, b: i64x4) -> i32; | |
3009 | #[link_name = "llvm.x86.avx.ptestnzc.256"] | |
3010 | fn ptestnzc256(a: i64x4, b: i64x4) -> i32; | |
3011 | #[link_name = "llvm.x86.avx.vtestz.pd.256"] | |
3012 | fn vtestzpd256(a: __m256d, b: __m256d) -> i32; | |
3013 | #[link_name = "llvm.x86.avx.vtestc.pd.256"] | |
3014 | fn vtestcpd256(a: __m256d, b: __m256d) -> i32; | |
3015 | #[link_name = "llvm.x86.avx.vtestnzc.pd.256"] | |
3016 | fn vtestnzcpd256(a: __m256d, b: __m256d) -> i32; | |
3017 | #[link_name = "llvm.x86.avx.vtestz.pd"] | |
3018 | fn vtestzpd(a: __m128d, b: __m128d) -> i32; | |
3019 | #[link_name = "llvm.x86.avx.vtestc.pd"] | |
3020 | fn vtestcpd(a: __m128d, b: __m128d) -> i32; | |
3021 | #[link_name = "llvm.x86.avx.vtestnzc.pd"] | |
3022 | fn vtestnzcpd(a: __m128d, b: __m128d) -> i32; | |
3023 | #[link_name = "llvm.x86.avx.vtestz.ps.256"] | |
3024 | fn vtestzps256(a: __m256, b: __m256) -> i32; | |
3025 | #[link_name = "llvm.x86.avx.vtestc.ps.256"] | |
3026 | fn vtestcps256(a: __m256, b: __m256) -> i32; | |
3027 | #[link_name = "llvm.x86.avx.vtestnzc.ps.256"] | |
3028 | fn vtestnzcps256(a: __m256, b: __m256) -> i32; | |
3029 | #[link_name = "llvm.x86.avx.vtestz.ps"] | |
3030 | fn vtestzps(a: __m128, b: __m128) -> i32; | |
3031 | #[link_name = "llvm.x86.avx.vtestc.ps"] | |
3032 | fn vtestcps(a: __m128, b: __m128) -> i32; | |
3033 | #[link_name = "llvm.x86.avx.vtestnzc.ps"] | |
3034 | fn vtestnzcps(a: __m128, b: __m128) -> i32; | |
3035 | #[link_name = "llvm.x86.avx.movmsk.pd.256"] | |
3036 | fn movmskpd256(a: __m256d) -> i32; | |
3037 | #[link_name = "llvm.x86.avx.movmsk.ps.256"] | |
3038 | fn movmskps256(a: __m256) -> i32; | |
17df50a5 XL |
3039 | #[link_name = "llvm.x86.avx.min.ps.256"] |
3040 | fn vminps(a: __m256, b: __m256) -> __m256; | |
3041 | #[link_name = "llvm.x86.avx.max.ps.256"] | |
3042 | fn vmaxps(a: __m256, b: __m256) -> __m256; | |
3043 | #[link_name = "llvm.x86.avx.min.pd.256"] | |
3044 | fn vminpd(a: __m256d, b: __m256d) -> __m256d; | |
3045 | #[link_name = "llvm.x86.avx.max.pd.256"] | |
3046 | fn vmaxpd(a: __m256d, b: __m256d) -> __m256d; | |
0531ce1d XL |
3047 | } |
3048 | ||
3049 | #[cfg(test)] | |
3050 | mod tests { | |
48663c56 | 3051 | use crate::hint::black_box; |
416331ca | 3052 | use stdarch_test::simd_test; |
0531ce1d | 3053 | |
532ac7d7 | 3054 | use crate::core_arch::x86::*; |
0531ce1d | 3055 | |
83c7162d | 3056 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3057 | unsafe fn test_mm256_add_pd() { |
3058 | let a = _mm256_setr_pd(1., 2., 3., 4.); | |
3059 | let b = _mm256_setr_pd(5., 6., 7., 8.); | |
3060 | let r = _mm256_add_pd(a, b); | |
3061 | let e = _mm256_setr_pd(6., 8., 10., 12.); | |
3062 | assert_eq_m256d(r, e); | |
3063 | } | |
3064 | ||
83c7162d | 3065 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3066 | unsafe fn test_mm256_add_ps() { |
3067 | let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); | |
3068 | let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.); | |
3069 | let r = _mm256_add_ps(a, b); | |
3070 | let e = _mm256_setr_ps(10., 12., 14., 16., 18., 20., 22., 24.); | |
3071 | assert_eq_m256(r, e); | |
3072 | } | |
3073 | ||
83c7162d | 3074 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3075 | unsafe fn test_mm256_and_pd() { |
3076 | let a = _mm256_set1_pd(1.); | |
3077 | let b = _mm256_set1_pd(0.6); | |
3078 | let r = _mm256_and_pd(a, b); | |
3079 | let e = _mm256_set1_pd(0.5); | |
3080 | assert_eq_m256d(r, e); | |
3081 | } | |
3082 | ||
83c7162d | 3083 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3084 | unsafe fn test_mm256_and_ps() { |
3085 | let a = _mm256_set1_ps(1.); | |
3086 | let b = _mm256_set1_ps(0.6); | |
3087 | let r = _mm256_and_ps(a, b); | |
3088 | let e = _mm256_set1_ps(0.5); | |
3089 | assert_eq_m256(r, e); | |
3090 | } | |
3091 | ||
83c7162d | 3092 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3093 | unsafe fn test_mm256_or_pd() { |
3094 | let a = _mm256_set1_pd(1.); | |
3095 | let b = _mm256_set1_pd(0.6); | |
3096 | let r = _mm256_or_pd(a, b); | |
3097 | let e = _mm256_set1_pd(1.2); | |
3098 | assert_eq_m256d(r, e); | |
3099 | } | |
3100 | ||
83c7162d | 3101 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3102 | unsafe fn test_mm256_or_ps() { |
3103 | let a = _mm256_set1_ps(1.); | |
3104 | let b = _mm256_set1_ps(0.6); | |
3105 | let r = _mm256_or_ps(a, b); | |
3106 | let e = _mm256_set1_ps(1.2); | |
3107 | assert_eq_m256(r, e); | |
3108 | } | |
3109 | ||
83c7162d | 3110 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3111 | unsafe fn test_mm256_shuffle_pd() { |
3112 | let a = _mm256_setr_pd(1., 4., 5., 8.); | |
3113 | let b = _mm256_setr_pd(2., 3., 6., 7.); | |
17df50a5 | 3114 | let r = _mm256_shuffle_pd::<0b11_11_11_11>(a, b); |
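| // Each immediate bit picks the high (1) or low (0) element of its source, | |
| // alternating a/b within each 128-bit lane, so 0b1111 yields (a1, b1, a3, b3). | |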
0531ce1d XL |
3115 | let e = _mm256_setr_pd(4., 3., 8., 7.); |
3116 | assert_eq_m256d(r, e); | |
3117 | } | |
3118 | ||
83c7162d | 3119 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3120 | unsafe fn test_mm256_shuffle_ps() { |
3121 | let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.); | |
3122 | let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.); | |
17df50a5 | 3123 | let r = _mm256_shuffle_ps::<0b00_00_11_11>(a, b); |
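| // Per 128-bit lane the low two fields index into `a` and the high two into `b`, | |
| // so 0b00_00_11_11 yields (a[3], a[3], b[0], b[0]) in each lane. | |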
0531ce1d XL |
3124 | let e = _mm256_setr_ps(8., 8., 2., 2., 16., 16., 10., 10.); |
3125 | assert_eq_m256(r, e); | |
3126 | } | |
3127 | ||
83c7162d | 3128 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3129 | unsafe fn test_mm256_andnot_pd() { |
3130 | let a = _mm256_set1_pd(0.); | |
3131 | let b = _mm256_set1_pd(0.6); | |
3132 | let r = _mm256_andnot_pd(a, b); | |
3133 | assert_eq_m256d(r, b); | |
3134 | } | |
3135 | ||
83c7162d | 3136 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3137 | unsafe fn test_mm256_andnot_ps() { |
3138 | let a = _mm256_set1_ps(0.); | |
3139 | let b = _mm256_set1_ps(0.6); | |
3140 | let r = _mm256_andnot_ps(a, b); | |
3141 | assert_eq_m256(r, b); | |
3142 | } | |
3143 | ||
83c7162d | 3144 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3145 | unsafe fn test_mm256_max_pd() { |
3146 | let a = _mm256_setr_pd(1., 4., 5., 8.); | |
3147 | let b = _mm256_setr_pd(2., 3., 6., 7.); | |
3148 | let r = _mm256_max_pd(a, b); | |
3149 | let e = _mm256_setr_pd(2., 4., 6., 8.); | |
3150 | assert_eq_m256d(r, e); | |
17df50a5 XL |
3151 | // > If the values being compared are both 0.0s (of either sign), the |
3152 | // > value in the second operand (source operand) is returned. | |
3153 | let w = _mm256_max_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(-0.0)); | |
3154 | let x = _mm256_max_pd(_mm256_set1_pd(-0.0), _mm256_set1_pd(0.0)); | |
3155 | let wu: [u64; 4] = transmute(w); | |
3156 | let xu: [u64; 4] = transmute(x); | |
3157 | assert_eq!(wu, [0x8000_0000_0000_0000u64; 4]); | |
3158 | assert_eq!(xu, [0u64; 4]); | |
3159 | // > If only one value is a NaN (SNaN or QNaN) for this instruction, the | |
3160 | // > second operand (source operand), either a NaN or a valid | |
3161 | // > floating-point value, is written to the result. | |
3162 | let y = _mm256_max_pd(_mm256_set1_pd(f64::NAN), _mm256_set1_pd(0.0)); | |
3163 | let z = _mm256_max_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(f64::NAN)); | |
3164 | let yf: [f64; 4] = transmute(y); | |
3165 | let zf: [f64; 4] = transmute(z); | |
3166 | assert_eq!(yf, [0.0; 4]); | |
3167 | assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf); | |
0531ce1d XL |
3168 | } |
3169 | ||
83c7162d | 3170 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3171 | unsafe fn test_mm256_max_ps() { |
3172 | let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.); | |
3173 | let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.); | |
3174 | let r = _mm256_max_ps(a, b); | |
3175 | let e = _mm256_setr_ps(2., 4., 6., 8., 10., 12., 14., 16.); | |
3176 | assert_eq_m256(r, e); | |
17df50a5 XL |
3177 | // > If the values being compared are both 0.0s (of either sign), the |
3178 | // > value in the second operand (source operand) is returned. | |
3179 | let w = _mm256_max_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(-0.0)); | |
3180 | let x = _mm256_max_ps(_mm256_set1_ps(-0.0), _mm256_set1_ps(0.0)); | |
3181 | let wu: [u32; 8] = transmute(w); | |
3182 | let xu: [u32; 8] = transmute(x); | |
3183 | assert_eq!(wu, [0x8000_0000u32; 8]); | |
3184 | assert_eq!(xu, [0u32; 8]); | |
3185 | // > If only one value is a NaN (SNaN or QNaN) for this instruction, the | |
3186 | // > second operand (source operand), either a NaN or a valid | |
3187 | // > floating-point value, is written to the result. | |
3188 | let y = _mm256_max_ps(_mm256_set1_ps(f32::NAN), _mm256_set1_ps(0.0)); | |
3189 | let z = _mm256_max_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(f32::NAN)); | |
3190 | let yf: [f32; 8] = transmute(y); | |
3191 | let zf: [f32; 8] = transmute(z); | |
3192 | assert_eq!(yf, [0.0; 8]); | |
3193 | assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf); | |
0531ce1d XL |
3194 | } |
3195 | ||
83c7162d | 3196 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3197 | unsafe fn test_mm256_min_pd() { |
3198 | let a = _mm256_setr_pd(1., 4., 5., 8.); | |
3199 | let b = _mm256_setr_pd(2., 3., 6., 7.); | |
3200 | let r = _mm256_min_pd(a, b); | |
3201 | let e = _mm256_setr_pd(1., 3., 5., 7.); | |
3202 | assert_eq_m256d(r, e); | |
17df50a5 XL |
3203 | // > If the values being compared are both 0.0s (of either sign), the |
3204 | // > value in the second operand (source operand) is returned. | |
3205 | let w = _mm256_min_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(-0.0)); | |
3206 | let x = _mm256_min_pd(_mm256_set1_pd(-0.0), _mm256_set1_pd(0.0)); | |
3207 | let wu: [u64; 4] = transmute(w); | |
3208 | let xu: [u64; 4] = transmute(x); | |
3209 | assert_eq!(wu, [0x8000_0000_0000_0000u64; 4]); | |
3210 | assert_eq!(xu, [0u64; 4]); | |
3211 | // > If only one value is a NaN (SNaN or QNaN) for this instruction, the | |
3212 | // > second operand (source operand), either a NaN or a valid | |
3213 | // > floating-point value, is written to the result. | |
3214 | let y = _mm256_min_pd(_mm256_set1_pd(f64::NAN), _mm256_set1_pd(0.0)); | |
3215 | let z = _mm256_min_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(f64::NAN)); | |
3216 | let yf: [f64; 4] = transmute(y); | |
3217 | let zf: [f64; 4] = transmute(z); | |
3218 | assert_eq!(yf, [0.0; 4]); | |
3219 | assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf); | |
0531ce1d XL |
3220 | } |
3221 | ||
83c7162d | 3222 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3223 | unsafe fn test_mm256_min_ps() { |
3224 | let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.); | |
3225 | let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.); | |
3226 | let r = _mm256_min_ps(a, b); | |
3227 | let e = _mm256_setr_ps(1., 3., 5., 7., 9., 11., 13., 15.); | |
3228 | assert_eq_m256(r, e); | |
17df50a5 XL |
3229 | // > If the values being compared are both 0.0s (of either sign), the |
3230 | // > value in the second operand (source operand) is returned. | |
3231 | let w = _mm256_min_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(-0.0)); | |
3232 | let x = _mm256_min_ps(_mm256_set1_ps(-0.0), _mm256_set1_ps(0.0)); | |
3233 | let wu: [u32; 8] = transmute(w); | |
3234 | let xu: [u32; 8] = transmute(x); | |
3235 | assert_eq!(wu, [0x8000_0000u32; 8]); | |
3236 | assert_eq!(xu, [0u32; 8]); | |
3237 | // > If only one value is a NaN (SNaN or QNaN) for this instruction, the | |
3238 | // > second operand (source operand), either a NaN or a valid | |
3239 | // > floating-point value, is written to the result. | |
3240 | let y = _mm256_min_ps(_mm256_set1_ps(f32::NAN), _mm256_set1_ps(0.0)); | |
3241 | let z = _mm256_min_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(f32::NAN)); | |
3242 | let yf: [f32; 8] = transmute(y); | |
3243 | let zf: [f32; 8] = transmute(z); | |
3244 | assert_eq!(yf, [0.0; 8]); | |
3245 | assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf); | |
0531ce1d XL |
3246 | } |
3247 | ||
83c7162d | 3248 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3249 | unsafe fn test_mm256_mul_pd() { |
3250 | let a = _mm256_setr_pd(1., 2., 3., 4.); | |
3251 | let b = _mm256_setr_pd(5., 6., 7., 8.); | |
3252 | let r = _mm256_mul_pd(a, b); | |
3253 | let e = _mm256_setr_pd(5., 12., 21., 32.); | |
3254 | assert_eq_m256d(r, e); | |
3255 | } | |
3256 | ||
83c7162d | 3257 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3258 | unsafe fn test_mm256_mul_ps() { |
3259 | let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); | |
3260 | let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.); | |
3261 | let r = _mm256_mul_ps(a, b); | |
3262 | let e = _mm256_setr_ps(9., 20., 33., 48., 65., 84., 105., 128.); | |
3263 | assert_eq_m256(r, e); | |
3264 | } | |
3265 | ||
83c7162d | 3266 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3267 | unsafe fn test_mm256_addsub_pd() { |
3268 | let a = _mm256_setr_pd(1., 2., 3., 4.); | |
3269 | let b = _mm256_setr_pd(5., 6., 7., 8.); | |
3270 | let r = _mm256_addsub_pd(a, b); | |
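| // Even-indexed elements are subtracted, odd-indexed ones added: | |
| // (1-5, 2+6, 3-7, 4+8) = (-4, 8, -4, 12). | |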
3271 | let e = _mm256_setr_pd(-4., 8., -4., 12.); | |
3272 | assert_eq_m256d(r, e); | |
3273 | } | |
3274 | ||
83c7162d | 3275 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3276 | unsafe fn test_mm256_addsub_ps() { |
3277 | let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.); | |
3278 | let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.); | |
3279 | let r = _mm256_addsub_ps(a, b); | |
3280 | let e = _mm256_setr_ps(-4., 8., -4., 12., -4., 8., -4., 12.); | |
3281 | assert_eq_m256(r, e); | |
3282 | } | |
3283 | ||
83c7162d | 3284 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3285 | unsafe fn test_mm256_sub_pd() { |
3286 | let a = _mm256_setr_pd(1., 2., 3., 4.); | |
3287 | let b = _mm256_setr_pd(5., 6., 7., 8.); | |
3288 | let r = _mm256_sub_pd(a, b); | |
3289 | let e = _mm256_setr_pd(-4., -4., -4., -4.); | |
3290 | assert_eq_m256d(r, e); | |
3291 | } | |
3292 | ||
83c7162d | 3293 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3294 | unsafe fn test_mm256_sub_ps() { |
3295 | let a = _mm256_setr_ps(1., 2., 3., 4., -1., -2., -3., -4.); | |
3296 | let b = _mm256_setr_ps(5., 6., 7., 8., 3., 2., 1., 0.); | |
3297 | let r = _mm256_sub_ps(a, b); | |
3298 | let e = _mm256_setr_ps(-4., -4., -4., -4., -4., -4., -4., -4.); | |
3299 | assert_eq_m256(r, e); | |
3300 | } | |
3301 | ||
83c7162d | 3302 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3303 | unsafe fn test_mm256_round_pd() { |
3304 | let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2); | |
17df50a5 XL |
3305 | let result_closest = _mm256_round_pd::<0b0000>(a); |
3306 | let result_down = _mm256_round_pd::<0b0001>(a); | |
3307 | let result_up = _mm256_round_pd::<0b0010>(a); | |
0531ce1d XL |
3308 | let expected_closest = _mm256_setr_pd(2., 2., 4., -1.); |
3309 | let expected_down = _mm256_setr_pd(1., 2., 3., -2.); | |
3310 | let expected_up = _mm256_setr_pd(2., 3., 4., -1.); | |
3311 | assert_eq_m256d(result_closest, expected_closest); | |
3312 | assert_eq_m256d(result_down, expected_down); | |
3313 | assert_eq_m256d(result_up, expected_up); | |
3314 | } | |
3315 | ||
83c7162d | 3316 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3317 | unsafe fn test_mm256_floor_pd() { |
3318 | let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2); | |
3319 | let result_down = _mm256_floor_pd(a); | |
3320 | let expected_down = _mm256_setr_pd(1., 2., 3., -2.); | |
3321 | assert_eq_m256d(result_down, expected_down); | |
3322 | } | |
3323 | ||
83c7162d | 3324 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3325 | unsafe fn test_mm256_ceil_pd() { |
3326 | let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2); | |
3327 | let result_up = _mm256_ceil_pd(a); | |
3328 | let expected_up = _mm256_setr_pd(2., 3., 4., -1.); | |
3329 | assert_eq_m256d(result_up, expected_up); | |
3330 | } | |
3331 | ||
83c7162d | 3332 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3333 | unsafe fn test_mm256_round_ps() { |
3334 | let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2); | |
17df50a5 XL |
3335 | let result_closest = _mm256_round_ps::<0b0000>(a); |
3336 | let result_down = _mm256_round_ps::<0b0001>(a); | |
3337 | let result_up = _mm256_round_ps::<0b0010>(a); | |
0731742a | 3338 | let expected_closest = _mm256_setr_ps(2., 2., 4., -1., 2., 2., 4., -1.); |
0531ce1d XL |
3339 | let expected_down = _mm256_setr_ps(1., 2., 3., -2., 1., 2., 3., -2.); |
3340 | let expected_up = _mm256_setr_ps(2., 3., 4., -1., 2., 3., 4., -1.); | |
3341 | assert_eq_m256(result_closest, expected_closest); | |
3342 | assert_eq_m256(result_down, expected_down); | |
3343 | assert_eq_m256(result_up, expected_up); | |
3344 | } | |
3345 | ||
83c7162d | 3346 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3347 | unsafe fn test_mm256_floor_ps() { |
3348 | let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2); | |
3349 | let result_down = _mm256_floor_ps(a); | |
3350 | let expected_down = _mm256_setr_ps(1., 2., 3., -2., 1., 2., 3., -2.); | |
3351 | assert_eq_m256(result_down, expected_down); | |
3352 | } | |
3353 | ||
83c7162d | 3354 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3355 | unsafe fn test_mm256_ceil_ps() { |
3356 | let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2); | |
3357 | let result_up = _mm256_ceil_ps(a); | |
3358 | let expected_up = _mm256_setr_ps(2., 3., 4., -1., 2., 3., 4., -1.); | |
3359 | assert_eq_m256(result_up, expected_up); | |
3360 | } | |
3361 | ||
83c7162d | 3362 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3363 | unsafe fn test_mm256_sqrt_pd() { |
3364 | let a = _mm256_setr_pd(4., 9., 16., 25.); | |
3365 | let r = _mm256_sqrt_pd(a); | |
3366 | let e = _mm256_setr_pd(2., 3., 4., 5.); | |
3367 | assert_eq_m256d(r, e); | |
3368 | } | |
3369 | ||
83c7162d | 3370 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3371 | unsafe fn test_mm256_sqrt_ps() { |
3372 | let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.); | |
3373 | let r = _mm256_sqrt_ps(a); | |
3374 | let e = _mm256_setr_ps(2., 3., 4., 5., 2., 3., 4., 5.); | |
3375 | assert_eq_m256(r, e); | |
3376 | } | |
3377 | ||
83c7162d | 3378 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3379 | unsafe fn test_mm256_div_ps() { |
3380 | let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.); | |
3381 | let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.); | |
3382 | let r = _mm256_div_ps(a, b); | |
3383 | let e = _mm256_setr_ps(1., 3., 8., 5., 0.5, 1., 0.25, 0.5); | |
3384 | assert_eq_m256(r, e); | |
3385 | } | |
3386 | ||
83c7162d | 3387 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3388 | unsafe fn test_mm256_div_pd() { |
3389 | let a = _mm256_setr_pd(4., 9., 16., 25.); | |
3390 | let b = _mm256_setr_pd(4., 3., 2., 5.); | |
3391 | let r = _mm256_div_pd(a, b); | |
3392 | let e = _mm256_setr_pd(1., 3., 8., 5.); | |
3393 | assert_eq_m256d(r, e); | |
3394 | } | |
3395 | ||
83c7162d | 3396 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3397 | unsafe fn test_mm256_blend_pd() { |
3398 | let a = _mm256_setr_pd(4., 9., 16., 25.); | |
3399 | let b = _mm256_setr_pd(4., 3., 2., 5.); | |
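| // Each immediate bit selects b (1) or a (0) for that element, so 0x3 takes | |
| // the two low elements from b and the rest from a. | |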
17df50a5 | 3400 | let r = _mm256_blend_pd::<0x0>(a, b); |
0531ce1d | 3401 | assert_eq_m256d(r, _mm256_setr_pd(4., 9., 16., 25.)); |
17df50a5 | 3402 | let r = _mm256_blend_pd::<0x3>(a, b); |
0531ce1d | 3403 | assert_eq_m256d(r, _mm256_setr_pd(4., 3., 16., 25.)); |
17df50a5 | 3404 | let r = _mm256_blend_pd::<0xF>(a, b); |
0531ce1d XL |
3405 | assert_eq_m256d(r, _mm256_setr_pd(4., 3., 2., 5.)); |
3406 | } | |
3407 | ||
83c7162d | 3408 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3409 | unsafe fn test_mm256_blend_ps() { |
3410 | let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.); | |
3411 | let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.); | |
17df50a5 | 3412 | let r = _mm256_blend_ps::<0x0>(a, b); |
8faf50e0 | 3413 | assert_eq_m256(r, _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.)); |
17df50a5 | 3414 | let r = _mm256_blend_ps::<0x3>(a, b); |
8faf50e0 | 3415 | assert_eq_m256(r, _mm256_setr_ps(2., 3., 5., 8., 9., 12., 13., 16.)); |
17df50a5 | 3416 | let r = _mm256_blend_ps::<0xF>(a, b); |
8faf50e0 | 3417 | assert_eq_m256(r, _mm256_setr_ps(2., 3., 6., 7., 9., 12., 13., 16.)); |
0531ce1d XL |
3418 | } |
3419 | ||
83c7162d | 3420 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3421 | unsafe fn test_mm256_blendv_pd() { |
3422 | let a = _mm256_setr_pd(4., 9., 16., 25.); | |
3423 | let b = _mm256_setr_pd(4., 3., 2., 5.); | |
3424 | let c = _mm256_setr_pd(0., 0., !0 as f64, !0 as f64); | |
3425 | let r = _mm256_blendv_pd(a, b, c); | |
3426 | let e = _mm256_setr_pd(4., 9., 2., 5.); | |
3427 | assert_eq_m256d(r, e); | |
3428 | } | |
3429 | ||
83c7162d | 3430 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3431 | unsafe fn test_mm256_blendv_ps() { |
3432 | let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.); | |
3433 | let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.); | |
0731742a | 3434 | #[rustfmt::skip] |
0531ce1d XL |
3435 | let c = _mm256_setr_ps( |
3436 | 0., 0., 0., 0., !0 as f32, !0 as f32, !0 as f32, !0 as f32, | |
3437 | ); | |
3438 | let r = _mm256_blendv_ps(a, b, c); | |
3439 | let e = _mm256_setr_ps(4., 9., 16., 25., 8., 9., 64., 50.); | |
3440 | assert_eq_m256(r, e); | |
3441 | } | |
3442 | ||
83c7162d | 3443 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3444 | unsafe fn test_mm256_dp_ps() { |
3445 | let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.); | |
3446 | let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.); | |
17df50a5 | 3447 | let r = _mm256_dp_ps::<0xFF>(a, b); |
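| // With mask 0xFF every product is summed and broadcast per 128-bit lane: | |
| // 4*4 + 9*3 + 16*2 + 25*5 = 200 and 4*8 + 9*9 + 16*64 + 25*50 = 2387. | |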
0731742a | 3448 | let e = _mm256_setr_ps(200., 200., 200., 200., 2387., 2387., 2387., 2387.); |
0531ce1d XL |
3449 | assert_eq_m256(r, e); |
3450 | } | |
3451 | ||
83c7162d | 3452 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3453 | unsafe fn test_mm256_hadd_pd() { |
3454 | let a = _mm256_setr_pd(4., 9., 16., 25.); | |
3455 | let b = _mm256_setr_pd(4., 3., 2., 5.); | |
3456 | let r = _mm256_hadd_pd(a, b); | |
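| // Horizontal add interleaves pairwise sums of a and b within each lane: | |
| // (a0+a1, b0+b1, a2+a3, b2+b3) = (13, 7, 41, 7). | |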
3457 | let e = _mm256_setr_pd(13., 7., 41., 7.); | |
3458 | assert_eq_m256d(r, e); | |
3459 | ||
3460 | let a = _mm256_setr_pd(1., 2., 3., 4.); | |
3461 | let b = _mm256_setr_pd(5., 6., 7., 8.); | |
3462 | let r = _mm256_hadd_pd(a, b); | |
3463 | let e = _mm256_setr_pd(3., 11., 7., 15.); | |
3464 | assert_eq_m256d(r, e); | |
3465 | } | |
3466 | ||
83c7162d | 3467 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3468 | unsafe fn test_mm256_hadd_ps() { |
3469 | let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.); | |
3470 | let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.); | |
3471 | let r = _mm256_hadd_ps(a, b); | |
3472 | let e = _mm256_setr_ps(13., 41., 7., 7., 13., 41., 17., 114.); | |
3473 | assert_eq_m256(r, e); | |
3474 | ||
3475 | let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.); | |
3476 | let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.); | |
3477 | let r = _mm256_hadd_ps(a, b); | |
3478 | let e = _mm256_setr_ps(3., 7., 11., 15., 3., 7., 11., 15.); | |
3479 | assert_eq_m256(r, e); | |
3480 | } | |
3481 | ||
83c7162d | 3482 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3483 | unsafe fn test_mm256_hsub_pd() { |
3484 | let a = _mm256_setr_pd(4., 9., 16., 25.); | |
3485 | let b = _mm256_setr_pd(4., 3., 2., 5.); | |
3486 | let r = _mm256_hsub_pd(a, b); | |
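| // Horizontal subtract works like hadd but with differences: | |
| // (a0-a1, b0-b1, a2-a3, b2-b3) = (-5, 1, -9, -3). | |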
3487 | let e = _mm256_setr_pd(-5., 1., -9., -3.); | |
3488 | assert_eq_m256d(r, e); | |
3489 | ||
3490 | let a = _mm256_setr_pd(1., 2., 3., 4.); | |
3491 | let b = _mm256_setr_pd(5., 6., 7., 8.); | |
3492 | let r = _mm256_hsub_pd(a, b); | |
3493 | let e = _mm256_setr_pd(-1., -1., -1., -1.); | |
3494 | assert_eq_m256d(r, e); | |
3495 | } | |
3496 | ||
83c7162d | 3497 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3498 | unsafe fn test_mm256_hsub_ps() { |
3499 | let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.); | |
3500 | let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.); | |
3501 | let r = _mm256_hsub_ps(a, b); | |
3502 | let e = _mm256_setr_ps(-5., -9., 1., -3., -5., -9., -1., 14.); | |
3503 | assert_eq_m256(r, e); | |
3504 | ||
3505 | let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.); | |
3506 | let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.); | |
3507 | let r = _mm256_hsub_ps(a, b); | |
3508 | let e = _mm256_setr_ps(-1., -1., -1., -1., -1., -1., -1., -1.); | |
3509 | assert_eq_m256(r, e); | |
3510 | } | |
3511 | ||
83c7162d | 3512 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3513 | unsafe fn test_mm256_xor_pd() { |
3514 | let a = _mm256_setr_pd(4., 9., 16., 25.); | |
3515 | let b = _mm256_set1_pd(0.); | |
3516 | let r = _mm256_xor_pd(a, b); | |
3517 | assert_eq_m256d(r, a); | |
3518 | } | |
3519 | ||
83c7162d | 3520 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3521 | unsafe fn test_mm256_xor_ps() { |
3522 | let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.); | |
3523 | let b = _mm256_set1_ps(0.); | |
3524 | let r = _mm256_xor_ps(a, b); | |
3525 | assert_eq_m256(r, a); | |
3526 | } | |
3527 | ||
83c7162d | 3528 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3529 | unsafe fn test_mm_cmp_pd() { |
3530 | let a = _mm_setr_pd(4., 9.); | |
3531 | let b = _mm_setr_pd(4., 3.); | |
17df50a5 | 3532 | let r = _mm_cmp_pd::<_CMP_GE_OS>(a, b); |
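| // A "true" comparison sets all 64 bits of the element; that all-ones pattern | |
| // reinterpreted as f64 is a NaN, hence the is_nan checks below. | |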
0531ce1d XL |
3533 | assert!(get_m128d(r, 0).is_nan()); |
3534 | assert!(get_m128d(r, 1).is_nan()); | |
3535 | } | |
3536 | ||
83c7162d | 3537 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3538 | unsafe fn test_mm256_cmp_pd() { |
3539 | let a = _mm256_setr_pd(1., 2., 3., 4.); | |
3540 | let b = _mm256_setr_pd(5., 6., 7., 8.); | |
17df50a5 | 3541 | let r = _mm256_cmp_pd::<_CMP_GE_OS>(a, b); |
0531ce1d XL |
3542 | let e = _mm256_set1_pd(0.); |
3543 | assert_eq_m256d(r, e); | |
3544 | } | |
3545 | ||
83c7162d | 3546 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3547 | unsafe fn test_mm_cmp_ps() { |
3548 | let a = _mm_setr_ps(4., 3., 2., 5.); | |
3549 | let b = _mm_setr_ps(4., 9., 16., 25.); | |
17df50a5 | 3550 | let r = _mm_cmp_ps::<_CMP_GE_OS>(a, b); |
0531ce1d XL |
3551 | assert!(get_m128(r, 0).is_nan()); |
3552 | assert_eq!(get_m128(r, 1), 0.); | |
3553 | assert_eq!(get_m128(r, 2), 0.); | |
3554 | assert_eq!(get_m128(r, 3), 0.); | |
3555 | } | |
3556 | ||
83c7162d | 3557 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3558 | unsafe fn test_mm256_cmp_ps() { |
3559 | let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.); | |
3560 | let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.); | |
17df50a5 | 3561 | let r = _mm256_cmp_ps::<_CMP_GE_OS>(a, b); |
0531ce1d XL |
3562 | let e = _mm256_set1_ps(0.); |
3563 | assert_eq_m256(r, e); | |
3564 | } | |
3565 | ||
83c7162d | 3566 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3567 | unsafe fn test_mm_cmp_sd() { |
3568 | let a = _mm_setr_pd(4., 9.); | |
3569 | let b = _mm_setr_pd(4., 3.); | |
17df50a5 | 3570 | let r = _mm_cmp_sd::<_CMP_GE_OS>(a, b); |
0531ce1d XL |
3571 | assert!(get_m128d(r, 0).is_nan()); |
3572 | assert_eq!(get_m128d(r, 1), 9.); | |
3573 | } | |
3574 | ||
83c7162d | 3575 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3576 | unsafe fn test_mm_cmp_ss() { |
3577 | let a = _mm_setr_ps(4., 3., 2., 5.); | |
3578 | let b = _mm_setr_ps(4., 9., 16., 25.); | |
17df50a5 | 3579 | let r = _mm_cmp_ss::<_CMP_GE_OS>(a, b); |
0531ce1d XL |
3580 | assert!(get_m128(r, 0).is_nan()); |
3581 | assert_eq!(get_m128(r, 1), 3.); | |
3582 | assert_eq!(get_m128(r, 2), 2.); | |
3583 | assert_eq!(get_m128(r, 3), 5.); | |
3584 | } | |
3585 | ||
83c7162d | 3586 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3587 | unsafe fn test_mm256_cvtepi32_pd() { |
3588 | let a = _mm_setr_epi32(4, 9, 16, 25); | |
3589 | let r = _mm256_cvtepi32_pd(a); | |
3590 | let e = _mm256_setr_pd(4., 9., 16., 25.); | |
3591 | assert_eq_m256d(r, e); | |
3592 | } | |
3593 | ||
83c7162d | 3594 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3595 | unsafe fn test_mm256_cvtepi32_ps() { |
3596 | let a = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25); | |
3597 | let r = _mm256_cvtepi32_ps(a); | |
3598 | let e = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.); | |
3599 | assert_eq_m256(r, e); | |
3600 | } | |
3601 | ||
83c7162d | 3602 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3603 | unsafe fn test_mm256_cvtpd_ps() { |
3604 | let a = _mm256_setr_pd(4., 9., 16., 25.); | |
3605 | let r = _mm256_cvtpd_ps(a); | |
3606 | let e = _mm_setr_ps(4., 9., 16., 25.); | |
3607 | assert_eq_m128(r, e); | |
3608 | } | |
3609 | ||
83c7162d | 3610 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3611 | unsafe fn test_mm256_cvtps_epi32() { |
3612 | let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.); | |
3613 | let r = _mm256_cvtps_epi32(a); | |
3614 | let e = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25); | |
3615 | assert_eq_m256i(r, e); | |
3616 | } | |
3617 | ||
83c7162d | 3618 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3619 | unsafe fn test_mm256_cvtps_pd() { |
3620 | let a = _mm_setr_ps(4., 9., 16., 25.); | |
3621 | let r = _mm256_cvtps_pd(a); | |
3622 | let e = _mm256_setr_pd(4., 9., 16., 25.); | |
3623 | assert_eq_m256d(r, e); | |
3624 | } | |
3625 | ||
83c7162d | 3626 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3627 | unsafe fn test_mm256_cvttpd_epi32() { |
3628 | let a = _mm256_setr_pd(4., 9., 16., 25.); | |
3629 | let r = _mm256_cvttpd_epi32(a); | |
3630 | let e = _mm_setr_epi32(4, 9, 16, 25); | |
3631 | assert_eq_m128i(r, e); | |
3632 | } | |
3633 | ||
83c7162d | 3634 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3635 | unsafe fn test_mm256_cvtpd_epi32() { |
3636 | let a = _mm256_setr_pd(4., 9., 16., 25.); | |
3637 | let r = _mm256_cvtpd_epi32(a); | |
3638 | let e = _mm_setr_epi32(4, 9, 16, 25); | |
3639 | assert_eq_m128i(r, e); | |
3640 | } | |
3641 | ||
83c7162d | 3642 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3643 | unsafe fn test_mm256_cvttps_epi32() { |
3644 | let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.); | |
3645 | let r = _mm256_cvttps_epi32(a); | |
3646 | let e = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25); | |
3647 | assert_eq_m256i(r, e); | |
3648 | } | |
3649 | ||
83c7162d | 3650 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3651 | unsafe fn test_mm256_extractf128_ps() { |
3652 | let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.); | |
17df50a5 | 3653 | let r = _mm256_extractf128_ps::<0>(a); |
0531ce1d XL |
3654 | let e = _mm_setr_ps(4., 3., 2., 5.); |
3655 | assert_eq_m128(r, e); | |
3656 | } | |
3657 | ||
83c7162d | 3658 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3659 | unsafe fn test_mm256_extractf128_pd() { |
3660 | let a = _mm256_setr_pd(4., 3., 2., 5.); | |
17df50a5 | 3661 | let r = _mm256_extractf128_pd::<0>(a); |
0531ce1d XL |
3662 | let e = _mm_setr_pd(4., 3.); |
3663 | assert_eq_m128d(r, e); | |
3664 | } | |
3665 | ||
83c7162d | 3666 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3667 | unsafe fn test_mm256_extractf128_si256() { |
3668 | let a = _mm256_setr_epi64x(4, 3, 2, 5); | |
17df50a5 | 3669 | let r = _mm256_extractf128_si256::<0>(a); |
0531ce1d XL |
3670 | let e = _mm_setr_epi64x(4, 3); |
3671 | assert_eq_m128i(r, e); | |
3672 | } | |
3673 | ||
83c7162d | 3674 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3675 | unsafe fn test_mm256_zeroall() { |
3676 | _mm256_zeroall(); | |
3677 | } | |
3678 | ||
83c7162d | 3679 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3680 | unsafe fn test_mm256_zeroupper() { |
3681 | _mm256_zeroupper(); | |
3682 | } | |
3683 | ||
83c7162d | 3684 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3685 | unsafe fn test_mm256_permutevar_ps() { |
3686 | let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.); | |
3687 | let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); | |
3688 | let r = _mm256_permutevar_ps(a, b); | |
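| // The low two bits of each 32-bit index select an element from the same | |
| // 128-bit lane of `a`; indices 4 and 8 therefore wrap around to index 0. | |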
3689 | let e = _mm256_setr_ps(3., 2., 5., 4., 9., 64., 50., 8.); | |
3690 | assert_eq_m256(r, e); | |
3691 | } | |
3692 | ||
83c7162d | 3693 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3694 | unsafe fn test_mm_permutevar_ps() { |
3695 | let a = _mm_setr_ps(4., 3., 2., 5.); | |
3696 | let b = _mm_setr_epi32(1, 2, 3, 4); | |
3697 | let r = _mm_permutevar_ps(a, b); | |
3698 | let e = _mm_setr_ps(3., 2., 5., 4.); | |
3699 | assert_eq_m128(r, e); | |
3700 | } | |
3701 | ||
83c7162d | 3702 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3703 | unsafe fn test_mm256_permute_ps() { |
3704 | let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.); | |
17df50a5 | 3705 | let r = _mm256_permute_ps::<0x1b>(a); |
0531ce1d XL |
3706 | let e = _mm256_setr_ps(5., 2., 3., 4., 50., 64., 9., 8.); |
3707 | assert_eq_m256(r, e); | |
3708 | } | |
3709 | ||
83c7162d | 3710 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3711 | unsafe fn test_mm_permute_ps() { |
3712 | let a = _mm_setr_ps(4., 3., 2., 5.); | |
17df50a5 | 3713 | let r = _mm_permute_ps::<0x1b>(a); |
0531ce1d XL |
3714 | let e = _mm_setr_ps(5., 2., 3., 4.); |
3715 | assert_eq_m128(r, e); | |
3716 | } | |
3717 | ||
83c7162d | 3718 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3719 | unsafe fn test_mm256_permutevar_pd() { |
3720 | let a = _mm256_setr_pd(4., 3., 2., 5.); | |
3721 | let b = _mm256_setr_epi64x(1, 2, 3, 4); | |
3722 | let r = _mm256_permutevar_pd(a, b); | |
3723 | let e = _mm256_setr_pd(4., 3., 5., 2.); | |
3724 | assert_eq_m256d(r, e); | |
3725 | } | |
3726 | ||
83c7162d | 3727 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3728 | unsafe fn test_mm_permutevar_pd() { |
3729 | let a = _mm_setr_pd(4., 3.); | |
3730 | let b = _mm_setr_epi64x(3, 0); | |
3731 | let r = _mm_permutevar_pd(a, b); | |
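| // For the f64 variant the selector is bit 1 of each index, so 3 picks a[1] | |
| // and 0 picks a[0], giving (3., 4.). | |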
3732 | let e = _mm_setr_pd(3., 4.); | |
3733 | assert_eq_m128d(r, e); | |
3734 | } | |
3735 | ||
83c7162d | 3736 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3737 | unsafe fn test_mm256_permute_pd() { |
3738 | let a = _mm256_setr_pd(4., 3., 2., 5.); | |
17df50a5 | 3739 | let r = _mm256_permute_pd::<5>(a); |
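| // Immediate 0b0101: each bit selects within its 128-bit lane, swapping the | |
| // pair in both lanes: (a1, a0, a3, a2) = (3., 4., 5., 2.). | |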
0531ce1d XL |
3740 | let e = _mm256_setr_pd(3., 4., 5., 2.); |
3741 | assert_eq_m256d(r, e); | |
3742 | } | |
3743 | ||
83c7162d | 3744 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3745 | unsafe fn test_mm_permute_pd() { |
3746 | let a = _mm_setr_pd(4., 3.); | |
17df50a5 | 3747 | let r = _mm_permute_pd::<1>(a); |
0531ce1d XL |
3748 | let e = _mm_setr_pd(3., 4.); |
3749 | assert_eq_m128d(r, e); | |
3750 | } | |
3751 | ||
83c7162d | 3752 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3753 | unsafe fn test_mm256_permute2f128_ps() { |
3754 | let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.); | |
3755 | let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.); | |
17df50a5 | 3756 | let r = _mm256_permute2f128_ps::<0x13>(a, b); |
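| // 0x13: the low nibble (3) routes the high half of `b` to the result's low | |
| // half, and the high nibble (1) routes the high half of `a` to the top. | |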
0531ce1d XL |
3757 | let e = _mm256_setr_ps(5., 6., 7., 8., 1., 2., 3., 4.); |
3758 | assert_eq_m256(r, e); | |
3759 | } | |
3760 | ||
83c7162d | 3761 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3762 | unsafe fn test_mm256_permute2f128_pd() { |
3763 | let a = _mm256_setr_pd(1., 2., 3., 4.); | |
3764 | let b = _mm256_setr_pd(5., 6., 7., 8.); | |
17df50a5 | 3765 | let r = _mm256_permute2f128_pd::<0x31>(a, b); |
0531ce1d XL |
3766 | let e = _mm256_setr_pd(3., 4., 7., 8.); |
3767 | assert_eq_m256d(r, e); | |
3768 | } | |
3769 | ||
83c7162d | 3770 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3771 | unsafe fn test_mm256_permute2f128_si256() { |
3772 | let a = _mm256_setr_epi32(1, 2, 3, 4, 1, 2, 3, 4); | |
3773 | let b = _mm256_setr_epi32(5, 6, 7, 8, 5, 6, 7, 8); | |
17df50a5 | 3774 | let r = _mm256_permute2f128_si256::<0x20>(a, b); |
0531ce1d XL |
3775 | let e = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); |
3776 | assert_eq_m256i(r, e); | |
3777 | } | |
3778 | ||
83c7162d | 3779 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3780 | unsafe fn test_mm256_broadcast_ss() { |
3781 | let r = _mm256_broadcast_ss(&3.); | |
3782 | let e = _mm256_set1_ps(3.); | |
3783 | assert_eq_m256(r, e); | |
3784 | } | |
3785 | ||
83c7162d | 3786 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3787 | unsafe fn test_mm_broadcast_ss() { |
3788 | let r = _mm_broadcast_ss(&3.); | |
3789 | let e = _mm_set1_ps(3.); | |
3790 | assert_eq_m128(r, e); | |
3791 | } | |
3792 | ||
83c7162d | 3793 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3794 | unsafe fn test_mm256_broadcast_sd() { |
3795 | let r = _mm256_broadcast_sd(&3.); | |
3796 | let e = _mm256_set1_pd(3.); | |
3797 | assert_eq_m256d(r, e); | |
3798 | } | |
3799 | ||
83c7162d | 3800 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3801 | unsafe fn test_mm256_broadcast_ps() { |
3802 | let a = _mm_setr_ps(4., 3., 2., 5.); | |
3803 | let r = _mm256_broadcast_ps(&a); | |
3804 | let e = _mm256_setr_ps(4., 3., 2., 5., 4., 3., 2., 5.); | |
3805 | assert_eq_m256(r, e); | |
3806 | } | |
3807 | ||
83c7162d | 3808 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3809 | unsafe fn test_mm256_broadcast_pd() { |
3810 | let a = _mm_setr_pd(4., 3.); | |
3811 | let r = _mm256_broadcast_pd(&a); | |
3812 | let e = _mm256_setr_pd(4., 3., 4., 3.); | |
3813 | assert_eq_m256d(r, e); | |
3814 | } | |
3815 | ||
83c7162d | 3816 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3817 | unsafe fn test_mm256_insertf128_ps() { |
3818 | let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.); | |
3819 | let b = _mm_setr_ps(4., 9., 16., 25.); | |
17df50a5 | 3820 | let r = _mm256_insertf128_ps::<0>(a, b); |
0531ce1d XL |
3821 | let e = _mm256_setr_ps(4., 9., 16., 25., 8., 9., 64., 50.); |
3822 | assert_eq_m256(r, e); | |
3823 | } | |
3824 | ||
83c7162d | 3825 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3826 | unsafe fn test_mm256_insertf128_pd() { |
3827 | let a = _mm256_setr_pd(1., 2., 3., 4.); | |
3828 | let b = _mm_setr_pd(5., 6.); | |
17df50a5 | 3829 | let r = _mm256_insertf128_pd::<0>(a, b); |
0531ce1d XL |
3830 | let e = _mm256_setr_pd(5., 6., 3., 4.); |
3831 | assert_eq_m256d(r, e); | |
3832 | } | |
3833 | ||
83c7162d | 3834 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3835 | unsafe fn test_mm256_insertf128_si256() { |
3836 | let a = _mm256_setr_epi64x(1, 2, 3, 4); | |
3837 | let b = _mm_setr_epi64x(5, 6); | |
17df50a5 | 3838 | let r = _mm256_insertf128_si256::<0>(a, b); |
0531ce1d XL |
3839 | let e = _mm256_setr_epi64x(5, 6, 3, 4); |
3840 | assert_eq_m256i(r, e); | |
3841 | } | |
3842 | ||
83c7162d | 3843 | #[simd_test(enable = "avx")] |
0531ce1d | 3844 | unsafe fn test_mm256_insert_epi8() { |
0731742a | 3845 | #[rustfmt::skip] |
0531ce1d XL |
3846 | let a = _mm256_setr_epi8( |
3847 | 1, 2, 3, 4, 5, 6, 7, 8, | |
3848 | 9, 10, 11, 12, 13, 14, 15, 16, | |
3849 | 17, 18, 19, 20, 21, 22, 23, 24, | |
3850 | 25, 26, 27, 28, 29, 30, 31, 32, | |
3851 | ); | |
17df50a5 | 3852 | let r = _mm256_insert_epi8::<31>(a, 0); |
0731742a | 3853 | #[rustfmt::skip] |
0531ce1d XL |
3854 | let e = _mm256_setr_epi8( |
3855 | 1, 2, 3, 4, 5, 6, 7, 8, | |
3856 | 9, 10, 11, 12, 13, 14, 15, 16, | |
3857 | 17, 18, 19, 20, 21, 22, 23, 24, | |
3858 | 25, 26, 27, 28, 29, 30, 31, 0, | |
3859 | ); | |
3860 | assert_eq_m256i(r, e); | |
3861 | } | |
3862 | ||
83c7162d | 3863 | #[simd_test(enable = "avx")] |
0531ce1d | 3864 | unsafe fn test_mm256_insert_epi16() { |
0731742a | 3865 | #[rustfmt::skip] |
0531ce1d XL |
3866 | let a = _mm256_setr_epi16( |
3867 | 0, 1, 2, 3, 4, 5, 6, 7, | |
3868 | 8, 9, 10, 11, 12, 13, 14, 15, | |
3869 | ); | |
17df50a5 | 3870 | let r = _mm256_insert_epi16::<15>(a, 0); |
0731742a | 3871 | #[rustfmt::skip] |
0531ce1d XL |
3872 | let e = _mm256_setr_epi16( |
3873 | 0, 1, 2, 3, 4, 5, 6, 7, | |
3874 | 8, 9, 10, 11, 12, 13, 14, 0, | |
3875 | ); | |
3876 | assert_eq_m256i(r, e); | |
3877 | } | |
3878 | ||
83c7162d | 3879 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3880 | unsafe fn test_mm256_insert_epi32() { |
3881 | let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); | |
17df50a5 | 3882 | let r = _mm256_insert_epi32::<7>(a, 0); |
0531ce1d XL |
3883 | let e = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0); |
3884 | assert_eq_m256i(r, e); | |
3885 | } | |
3886 | ||
83c7162d | 3887 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3888 | unsafe fn test_mm256_load_pd() { |
3889 | let a = _mm256_setr_pd(1., 2., 3., 4.); | |
3890 | let p = &a as *const _ as *const f64; | |
3891 | let r = _mm256_load_pd(p); | |
3892 | let e = _mm256_setr_pd(1., 2., 3., 4.); | |
3893 | assert_eq_m256d(r, e); | |
3894 | } | |
3895 | ||
83c7162d | 3896 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3897 | unsafe fn test_mm256_store_pd() { |
3898 | let a = _mm256_setr_pd(1., 2., 3., 4.); | |
3899 | let mut r = _mm256_undefined_pd(); | |
3900 | _mm256_store_pd(&mut r as *mut _ as *mut f64, a); | |
3901 | assert_eq_m256d(r, a); | |
3902 | } | |
3903 | ||
83c7162d | 3904 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3905 | unsafe fn test_mm256_load_ps() { |
3906 | let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.); | |
3907 | let p = &a as *const _ as *const f32; | |
3908 | let r = _mm256_load_ps(p); | |
3909 | let e = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.); | |
3910 | assert_eq_m256(r, e); | |
3911 | } | |
3912 | ||
83c7162d | 3913 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3914 | unsafe fn test_mm256_store_ps() { |
3915 | let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.); | |
3916 | let mut r = _mm256_undefined_ps(); | |
3917 | _mm256_store_ps(&mut r as *mut _ as *mut f32, a); | |
3918 | assert_eq_m256(r, a); | |
3919 | } | |
3920 | ||
83c7162d | 3921 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3922 | unsafe fn test_mm256_loadu_pd() { |
3923 | let a = &[1.0f64, 2., 3., 4.]; | |
3924 | let p = a.as_ptr(); | |
3925 | let r = _mm256_loadu_pd(black_box(p)); | |
3926 | let e = _mm256_setr_pd(1., 2., 3., 4.); | |
3927 | assert_eq_m256d(r, e); | |
3928 | } | |
3929 | ||
83c7162d | 3930 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3931 | unsafe fn test_mm256_storeu_pd() { |
3932 | let a = _mm256_set1_pd(9.); | |
3933 | let mut r = _mm256_undefined_pd(); | |
3934 | _mm256_storeu_pd(&mut r as *mut _ as *mut f64, a); | |
3935 | assert_eq_m256d(r, a); | |
3936 | } | |
3937 | ||
83c7162d | 3938 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3939 | unsafe fn test_mm256_loadu_ps() { |
3940 | let a = &[4., 3., 2., 5., 8., 9., 64., 50.]; | |
3941 | let p = a.as_ptr(); | |
3942 | let r = _mm256_loadu_ps(black_box(p)); | |
3943 | let e = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.); | |
3944 | assert_eq_m256(r, e); | |
3945 | } | |
3946 | ||
83c7162d | 3947 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3948 | unsafe fn test_mm256_storeu_ps() { |
3949 | let a = _mm256_set1_ps(9.); | |
3950 | let mut r = _mm256_undefined_ps(); | |
3951 | _mm256_storeu_ps(&mut r as *mut _ as *mut f32, a); | |
3952 | assert_eq_m256(r, a); | |
3953 | } | |
3954 | ||
83c7162d | 3955 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3956 | unsafe fn test_mm256_load_si256() { |
3957 | let a = _mm256_setr_epi64x(1, 2, 3, 4); | |
3958 | let p = &a as *const _; | |
3959 | let r = _mm256_load_si256(p); | |
3960 | let e = _mm256_setr_epi64x(1, 2, 3, 4); | |
3961 | assert_eq_m256i(r, e); | |
3962 | } | |
3963 | ||
83c7162d | 3964 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3965 | unsafe fn test_mm256_store_si256() { |
3966 | let a = _mm256_setr_epi64x(1, 2, 3, 4); | |
3967 | let mut r = _mm256_undefined_si256(); | |
3968 | _mm256_store_si256(&mut r as *mut _, a); | |
3969 | assert_eq_m256i(r, a); | |
3970 | } | |
3971 | ||
83c7162d | 3972 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3973 | unsafe fn test_mm256_loadu_si256() { |
3974 | let a = _mm256_setr_epi64x(1, 2, 3, 4); | |
3975 | let p = &a as *const _; | |
3976 | let r = _mm256_loadu_si256(black_box(p)); | |
3977 | let e = _mm256_setr_epi64x(1, 2, 3, 4); | |
3978 | assert_eq_m256i(r, e); | |
3979 | } | |
3980 | ||
83c7162d | 3981 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3982 | unsafe fn test_mm256_storeu_si256() { |
3983 | let a = _mm256_set1_epi8(9); | |
3984 | let mut r = _mm256_undefined_si256(); | |
3985 | _mm256_storeu_si256(&mut r as *mut _, a); | |
3986 | assert_eq_m256i(r, a); | |
3987 | } | |
3988 | ||
83c7162d | 3989 | #[simd_test(enable = "avx")] |
0531ce1d XL |
3990 | unsafe fn test_mm256_maskload_pd() { |
3991 | let a = &[1.0f64, 2., 3., 4.]; | |
3992 | let p = a.as_ptr(); | |
3993 | let mask = _mm256_setr_epi64x(0, !0, 0, !0); | |
3994 | let r = _mm256_maskload_pd(black_box(p), mask); | |
3995 | let e = _mm256_setr_pd(0., 2., 0., 4.); | |
3996 | assert_eq_m256d(r, e); | |
3997 | } | |
3998 | ||
83c7162d | 3999 | #[simd_test(enable = "avx")] |
0531ce1d XL |
4000 | unsafe fn test_mm256_maskstore_pd() { |
4001 | let mut r = _mm256_set1_pd(0.); | |
4002 | let mask = _mm256_setr_epi64x(0, !0, 0, !0); | |
4003 | let a = _mm256_setr_pd(1., 2., 3., 4.); | |
4004 | _mm256_maskstore_pd(&mut r as *mut _ as *mut f64, mask, a); | |
4005 | let e = _mm256_setr_pd(0., 2., 0., 4.); | |
4006 | assert_eq_m256d(r, e); | |
4007 | } | |
4008 | ||
83c7162d | 4009 | #[simd_test(enable = "avx")] |
0531ce1d XL |
4010 | unsafe fn test_mm_maskload_pd() { |
4011 | let a = &[1.0f64, 2.]; | |
4012 | let p = a.as_ptr(); | |
4013 | let mask = _mm_setr_epi64x(0, !0); | |
4014 | let r = _mm_maskload_pd(black_box(p), mask); | |
4015 | let e = _mm_setr_pd(0., 2.); | |
4016 | assert_eq_m128d(r, e); | |
4017 | } | |
4018 | ||
83c7162d | 4019 | #[simd_test(enable = "avx")] |
0531ce1d XL |
4020 | unsafe fn test_mm_maskstore_pd() { |
4021 | let mut r = _mm_set1_pd(0.); | |
4022 | let mask = _mm_setr_epi64x(0, !0); | |
4023 | let a = _mm_setr_pd(1., 2.); | |
4024 | _mm_maskstore_pd(&mut r as *mut _ as *mut f64, mask, a); | |
4025 | let e = _mm_setr_pd(0., 2.); | |
4026 | assert_eq_m128d(r, e); | |
4027 | } | |
4028 | ||
83c7162d | 4029 | #[simd_test(enable = "avx")] |
0531ce1d XL |
4030 | unsafe fn test_mm256_maskload_ps() { |
4031 | let a = &[1.0f32, 2., 3., 4., 5., 6., 7., 8.]; | |
4032 | let p = a.as_ptr(); | |
4033 | let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0); | |
4034 | let r = _mm256_maskload_ps(black_box(p), mask); | |
4035 | let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.); | |
4036 | assert_eq_m256(r, e); | |
4037 | } | |
4038 | ||
83c7162d | 4039 | #[simd_test(enable = "avx")] |
0531ce1d XL |
4040 | unsafe fn test_mm256_maskstore_ps() { |
4041 | let mut r = _mm256_set1_ps(0.); | |
4042 | let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0); | |
4043 | let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); | |
4044 | _mm256_maskstore_ps(&mut r as *mut _ as *mut f32, mask, a); | |
4045 | let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.); | |
4046 | assert_eq_m256(r, e); | |
4047 | } | |
4048 | ||
83c7162d | 4049 | #[simd_test(enable = "avx")] |
0531ce1d XL |
4050 | unsafe fn test_mm_maskload_ps() { |
4051 | let a = &[1.0f32, 2., 3., 4.]; | |
4052 | let p = a.as_ptr(); | |
4053 | let mask = _mm_setr_epi32(0, !0, 0, !0); | |
4054 | let r = _mm_maskload_ps(black_box(p), mask); | |
4055 | let e = _mm_setr_ps(0., 2., 0., 4.); | |
4056 | assert_eq_m128(r, e); | |
4057 | } | |
4058 | ||
83c7162d | 4059 | #[simd_test(enable = "avx")] |
0531ce1d XL |
4060 | unsafe fn test_mm_maskstore_ps() { |
4061 | let mut r = _mm_set1_ps(0.); | |
4062 | let mask = _mm_setr_epi32(0, !0, 0, !0); | |
4063 | let a = _mm_setr_ps(1., 2., 3., 4.); | |
4064 | _mm_maskstore_ps(&mut r as *mut _ as *mut f32, mask, a); | |
4065 | let e = _mm_setr_ps(0., 2., 0., 4.); | |
4066 | assert_eq_m128(r, e); | |
4067 | } | |
4068 | ||
83c7162d | 4069 | #[simd_test(enable = "avx")] |
0531ce1d XL |
4070 | unsafe fn test_mm256_movehdup_ps() { |
4071 | let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); | |
4072 | let r = _mm256_movehdup_ps(a); | |
4073 | let e = _mm256_setr_ps(2., 2., 4., 4., 6., 6., 8., 8.); | |
4074 | assert_eq_m256(r, e); | |
4075 | } | |
4076 | ||
83c7162d | 4077 | #[simd_test(enable = "avx")] |
0531ce1d XL |
4078 | unsafe fn test_mm256_moveldup_ps() { |
4079 | let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); | |
4080 | let r = _mm256_moveldup_ps(a); | |
4081 | let e = _mm256_setr_ps(1., 1., 3., 3., 5., 5., 7., 7.); | |
4082 | assert_eq_m256(r, e); | |
4083 | } | |
4084 | ||
83c7162d | 4085 | #[simd_test(enable = "avx")] |
0531ce1d XL |
4086 | unsafe fn test_mm256_movedup_pd() { |
4087 | let a = _mm256_setr_pd(1., 2., 3., 4.); | |
4088 | let r = _mm256_movedup_pd(a); | |
4089 | let e = _mm256_setr_pd(1., 1., 3., 3.); | |
4090 | assert_eq_m256d(r, e); | |
4091 | } | |
4092 | ||
83c7162d | 4093 | #[simd_test(enable = "avx")] |
0531ce1d | 4094 | unsafe fn test_mm256_lddqu_si256() { |
0731742a | 4095 | #[rustfmt::skip] |
0531ce1d XL |
4096 | let a = _mm256_setr_epi8( |
4097 | 1, 2, 3, 4, 5, 6, 7, 8, | |
4098 | 9, 10, 11, 12, 13, 14, 15, 16, | |
4099 | 17, 18, 19, 20, 21, 22, 23, 24, | |
4100 | 25, 26, 27, 28, 29, 30, 31, 32, | |
4101 | ); | |
4102 | let p = &a as *const _; | |
4103 | let r = _mm256_lddqu_si256(black_box(p)); | |
0731742a | 4104 | #[rustfmt::skip] |
0531ce1d XL |
4105 | let e = _mm256_setr_epi8( |
4106 | 1, 2, 3, 4, 5, 6, 7, 8, | |
4107 | 9, 10, 11, 12, 13, 14, 15, 16, | |
4108 | 17, 18, 19, 20, 21, 22, 23, 24, | |
4109 | 25, 26, 27, 28, 29, 30, 31, 32, | |
4110 | ); | |
4111 | assert_eq_m256i(r, e); | |
4112 | } | |
4113 | ||
83c7162d | 4114 | #[simd_test(enable = "avx")] |
0531ce1d XL |
4115 | unsafe fn test_mm256_stream_si256() { |
4116 | let a = _mm256_setr_epi64x(1, 2, 3, 4); | |
4117 | let mut r = _mm256_undefined_si256(); | |
4118 | _mm256_stream_si256(&mut r as *mut _, a); | |
4119 | assert_eq_m256i(r, a); | |
4120 | } | |
4121 | ||
83c7162d | 4122 | #[simd_test(enable = "avx")] |
0531ce1d XL |
4123 | unsafe fn test_mm256_stream_pd() { |
4124 | #[repr(align(32))] | |
4125 | struct Memory { | |
4126 | pub data: [f64; 4], | |
4127 | } | |
4128 | let a = _mm256_set1_pd(7.0); | |
8faf50e0 | 4129 | let mut mem = Memory { data: [-1.0; 4] }; |
0531ce1d XL |
4130 | |
4131 | _mm256_stream_pd(&mut mem.data[0] as *mut f64, a); | |
4132 | for i in 0..4 { | |
4133 | assert_eq!(mem.data[i], get_m256d(a, i)); | |
4134 | } | |
4135 | } | |
4136 | ||
83c7162d | 4137 | #[simd_test(enable = "avx")] |
0531ce1d XL |
4138 | unsafe fn test_mm256_stream_ps() { |
4139 | #[repr(align(32))] | |
4140 | struct Memory { | |
4141 | pub data: [f32; 8], | |
4142 | } | |
4143 | let a = _mm256_set1_ps(7.0); | |
8faf50e0 | 4144 | let mut mem = Memory { data: [-1.0; 8] }; |
0531ce1d XL |
4145 | |
4146 | _mm256_stream_ps(&mut mem.data[0] as *mut f32, a); | |
4147 | for i in 0..8 { | |
4148 | assert_eq!(mem.data[i], get_m256(a, i)); | |
4149 | } | |
4150 | } | |
4151 | ||
83c7162d | 4152 | #[simd_test(enable = "avx")] |
0531ce1d XL |
4153 | unsafe fn test_mm256_rcp_ps() { |
4154 | let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); | |
4155 | let r = _mm256_rcp_ps(a); | |
0731742a | 4156 | #[rustfmt::skip] |
0531ce1d XL |
4157 | let e = _mm256_setr_ps( |
4158 | 0.99975586, 0.49987793, 0.33325195, 0.24993896, | |
4159 | 0.19995117, 0.16662598, 0.14282227, 0.12496948, | |
4160 | ); | |
4161 | let rel_err = 0.00048828125; | |
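| // vrcpps is only an approximation (roughly 12 bits of relative precision), | |
| // so exact bits vary by CPU; 2^-11 = 0.00048828125 is used as the tolerance. | |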
4162 | for i in 0..8 { | |
4163 | assert_approx_eq!(get_m256(r, i), get_m256(e, i), 2. * rel_err); | |
4164 | } | |
4165 | } | |
4166 | ||
83c7162d | 4167 | #[simd_test(enable = "avx")] |
0531ce1d XL |
4168 | unsafe fn test_mm256_rsqrt_ps() { |
4169 | let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); | |
4170 | let r = _mm256_rsqrt_ps(a); | |
0731742a | 4171 | #[rustfmt::skip] |
0531ce1d XL |
4172 | let e = _mm256_setr_ps( |
4173 | 0.99975586, 0.7069092, 0.5772705, 0.49987793, | |
4174 | 0.44714355, 0.40820313, 0.3779297, 0.3534546, | |
4175 | ); | |
4176 | let rel_err = 0.00048828125; | |
4177 | for i in 0..8 { | |
4178 | assert_approx_eq!(get_m256(r, i), get_m256(e, i), 2. * rel_err); | |
4179 | } | |
4180 | } | |
4181 | ||
83c7162d | 4182 | #[simd_test(enable = "avx")] |
0531ce1d XL |
4183 | unsafe fn test_mm256_unpackhi_pd() { |
4184 | let a = _mm256_setr_pd(1., 2., 3., 4.); | |
4185 | let b = _mm256_setr_pd(5., 6., 7., 8.); | |
4186 | let r = _mm256_unpackhi_pd(a, b); | |
4187 | let e = _mm256_setr_pd(2., 6., 4., 8.); | |
4188 | assert_eq_m256d(r, e); | |
4189 | } | |
4190 | ||
83c7162d | 4191 | #[simd_test(enable = "avx")] |
0531ce1d XL |
4192 | unsafe fn test_mm256_unpackhi_ps() { |
4193 | let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); | |
4194 | let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.); | |
4195 | let r = _mm256_unpackhi_ps(a, b); | |
4196 | let e = _mm256_setr_ps(3., 11., 4., 12., 7., 15., 8., 16.); | |
4197 | assert_eq_m256(r, e); | |
4198 | } | |
4199 | ||
83c7162d | 4200 | #[simd_test(enable = "avx")] |
0531ce1d XL |
4201 | unsafe fn test_mm256_unpacklo_pd() { |
4202 | let a = _mm256_setr_pd(1., 2., 3., 4.); | |
4203 | let b = _mm256_setr_pd(5., 6., 7., 8.); | |
4204 | let r = _mm256_unpacklo_pd(a, b); | |
4205 | let e = _mm256_setr_pd(1., 5., 3., 7.); | |
4206 | assert_eq_m256d(r, e); | |
4207 | } | |
4208 | ||
83c7162d | 4209 | #[simd_test(enable = "avx")] |
0531ce1d XL |
4210 | unsafe fn test_mm256_unpacklo_ps() { |
4211 | let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); | |
4212 | let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.); | |
4213 | let r = _mm256_unpacklo_ps(a, b); | |
4214 | let e = _mm256_setr_ps(1., 9., 2., 10., 5., 13., 6., 14.); | |
4215 | assert_eq_m256(r, e); | |
4216 | } | |
4217 | ||
83c7162d | 4218 | #[simd_test(enable = "avx")] |
0531ce1d XL |
4219 | unsafe fn test_mm256_testz_si256() { |
4220 | let a = _mm256_setr_epi64x(1, 2, 3, 4); | |
4221 | let b = _mm256_setr_epi64x(5, 6, 7, 8); | |
4222 | let r = _mm256_testz_si256(a, b); | |
4223 | assert_eq!(r, 0); | |
4224 | let b = _mm256_set1_epi64x(0); | |
4225 | let r = _mm256_testz_si256(a, b); | |
4226 | assert_eq!(r, 1); | |
4227 | } | |
4228 | ||
83c7162d | 4229 | #[simd_test(enable = "avx")] |
0531ce1d XL |
4230 | unsafe fn test_mm256_testc_si256() { |
4231 | let a = _mm256_setr_epi64x(1, 2, 3, 4); | |
4232 | let b = _mm256_setr_epi64x(5, 6, 7, 8); | |
4233 | let r = _mm256_testc_si256(a, b); | |
4234 | assert_eq!(r, 0); | |
4235 | let b = _mm256_set1_epi64x(0); | |
4236 | let r = _mm256_testc_si256(a, b); | |
4237 | assert_eq!(r, 1); | |
4238 | } | |
4239 | ||
83c7162d | 4240 | #[simd_test(enable = "avx")] |
0531ce1d XL |
4241 | unsafe fn test_mm256_testnzc_si256() { |
4242 | let a = _mm256_setr_epi64x(1, 2, 3, 4); | |
4243 | let b = _mm256_setr_epi64x(5, 6, 7, 8); | |
4244 | let r = _mm256_testnzc_si256(a, b); | |
4245 | assert_eq!(r, 1); | |
4246 | let a = _mm256_setr_epi64x(0, 0, 0, 0); | |
4247 | let b = _mm256_setr_epi64x(0, 0, 0, 0); | |
4248 | let r = _mm256_testnzc_si256(a, b); | |
4249 | assert_eq!(r, 0); | |
4250 | } | |
4251 | ||
83c7162d | 4252 | #[simd_test(enable = "avx")] |
0531ce1d XL |
4253 | unsafe fn test_mm256_testz_pd() { |
4254 | let a = _mm256_setr_pd(1., 2., 3., 4.); | |
4255 | let b = _mm256_setr_pd(5., 6., 7., 8.); | |
4256 | let r = _mm256_testz_pd(a, b); | |
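| // vtestpd only inspects sign bits: the return value (ZF) is 1 when the sign | |
| // bits of `a AND b` are all clear, as with these all-positive inputs. | |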
4257 | assert_eq!(r, 1); | |
4258 | let a = _mm256_set1_pd(-1.); | |
4259 | let r = _mm256_testz_pd(a, a); | |
4260 | assert_eq!(r, 0); | |
4261 | } | |
4262 | ||
83c7162d | 4263 | #[simd_test(enable = "avx")] |
0531ce1d XL |
4264 | unsafe fn test_mm256_testc_pd() { |
4265 | let a = _mm256_setr_pd(1., 2., 3., 4.); | |
4266 | let b = _mm256_setr_pd(5., 6., 7., 8.); | |
4267 | let r = _mm256_testc_pd(a, b); | |
4268 | assert_eq!(r, 1); | |
4269 | let a = _mm256_set1_pd(1.); | |
4270 | let b = _mm256_set1_pd(-1.); | |
4271 | let r = _mm256_testc_pd(a, b); | |
4272 | assert_eq!(r, 0); | |
4273 | } | |
4274 | ||
83c7162d | 4275 | #[simd_test(enable = "avx")] |
0531ce1d XL |
4276 | unsafe fn test_mm256_testnzc_pd() { |
4277 | let a = _mm256_setr_pd(1., 2., 3., 4.); | |
4278 | let b = _mm256_setr_pd(5., 6., 7., 8.); | |
4279 | let r = _mm256_testnzc_pd(a, b); | |
4280 | assert_eq!(r, 0); | |
4281 | let a = _mm256_setr_pd(1., -1., -1., -1.); | |
4282 | let b = _mm256_setr_pd(-1., -1., 1., 1.); | |
4283 | let r = _mm256_testnzc_pd(a, b); | |
4284 | assert_eq!(r, 1); | |
4285 | } | |
4286 | ||
83c7162d | 4287 | #[simd_test(enable = "avx")] |
0531ce1d XL |
4288 | unsafe fn test_mm_testz_pd() { |
4289 | let a = _mm_setr_pd(1., 2.); | |
4290 | let b = _mm_setr_pd(5., 6.); | |
4291 | let r = _mm_testz_pd(a, b); | |
4292 | assert_eq!(r, 1); | |
4293 | let a = _mm_set1_pd(-1.); | |
4294 | let r = _mm_testz_pd(a, a); | |
4295 | assert_eq!(r, 0); | |
4296 | } | |
4297 | ||
83c7162d | 4298 | #[simd_test(enable = "avx")] |
0531ce1d XL |
4299 | unsafe fn test_mm_testc_pd() { |
4300 | let a = _mm_setr_pd(1., 2.); | |
4301 | let b = _mm_setr_pd(5., 6.); | |
4302 | let r = _mm_testc_pd(a, b); | |
4303 | assert_eq!(r, 1); | |
4304 | let a = _mm_set1_pd(1.); | |
4305 | let b = _mm_set1_pd(-1.); | |
4306 | let r = _mm_testc_pd(a, b); | |
4307 | assert_eq!(r, 0); | |
4308 | } | |
4309 | ||
83c7162d | 4310 | #[simd_test(enable = "avx")] |
0531ce1d XL |
4311 | unsafe fn test_mm_testnzc_pd() { |
4312 | let a = _mm_setr_pd(1., 2.); | |
4313 | let b = _mm_setr_pd(5., 6.); | |
4314 | let r = _mm_testnzc_pd(a, b); | |
4315 | assert_eq!(r, 0); | |
4316 | let a = _mm_setr_pd(1., -1.); | |
4317 | let b = _mm_setr_pd(-1., -1.); | |
4318 | let r = _mm_testnzc_pd(a, b); | |
4319 | assert_eq!(r, 1); | |
4320 | } | |
4321 | ||
83c7162d | 4322 | #[simd_test(enable = "avx")] |
0531ce1d XL |
4323 | unsafe fn test_mm256_testz_ps() { |
4324 | let a = _mm256_set1_ps(1.); | |
4325 | let r = _mm256_testz_ps(a, a); | |
4326 | assert_eq!(r, 1); | |
4327 | let a = _mm256_set1_ps(-1.); | |
4328 | let r = _mm256_testz_ps(a, a); | |
4329 | assert_eq!(r, 0); | |
4330 | } | |
4331 | ||
83c7162d | 4332 | #[simd_test(enable = "avx")] |
0531ce1d XL |
4333 | unsafe fn test_mm256_testc_ps() { |
4334 | let a = _mm256_set1_ps(1.); | |
4335 | let r = _mm256_testc_ps(a, a); | |
4336 | assert_eq!(r, 1); | |
4337 | let b = _mm256_set1_ps(-1.); | |
4338 | let r = _mm256_testc_ps(a, b); | |
4339 | assert_eq!(r, 0); | |
4340 | } | |
4341 | ||
83c7162d | 4342 | #[simd_test(enable = "avx")] |
0531ce1d XL |
4343 | unsafe fn test_mm256_testnzc_ps() { |
4344 | let a = _mm256_set1_ps(1.); | |
4345 | let r = _mm256_testnzc_ps(a, a); | |
4346 | assert_eq!(r, 0); | |
4347 | let a = _mm256_setr_ps(1., -1., -1., -1., -1., -1., -1., -1.); | |
4348 | let b = _mm256_setr_ps(-1., -1., 1., 1., 1., 1., 1., 1.); | |
4349 | let r = _mm256_testnzc_ps(a, b); | |
4350 | assert_eq!(r, 1); | |
4351 | } | |
4352 | ||
83c7162d | 4353 | #[simd_test(enable = "avx")] |
0531ce1d XL |
4354 | unsafe fn test_mm_testz_ps() { |
4355 | let a = _mm_set1_ps(1.); | |
4356 | let r = _mm_testz_ps(a, a); | |
4357 | assert_eq!(r, 1); | |
4358 | let a = _mm_set1_ps(-1.); | |
4359 | let r = _mm_testz_ps(a, a); | |
4360 | assert_eq!(r, 0); | |
4361 | } | |
4362 | ||
83c7162d | 4363 | #[simd_test(enable = "avx")] |
0531ce1d XL |
4364 | unsafe fn test_mm_testc_ps() { |
4365 | let a = _mm_set1_ps(1.); | |
4366 | let r = _mm_testc_ps(a, a); | |
4367 | assert_eq!(r, 1); | |
4368 | let b = _mm_set1_ps(-1.); | |
4369 | let r = _mm_testc_ps(a, b); | |
4370 | assert_eq!(r, 0); | |
4371 | } | |
4372 | ||
83c7162d | 4373 | #[simd_test(enable = "avx")] |
0531ce1d XL |
4374 | unsafe fn test_mm_testnzc_ps() { |
4375 | let a = _mm_set1_ps(1.); | |
4376 | let r = _mm_testnzc_ps(a, a); | |
4377 | assert_eq!(r, 0); | |
4378 | let a = _mm_setr_ps(1., -1., -1., -1.); | |
4379 | let b = _mm_setr_ps(-1., -1., 1., 1.); | |
4380 | let r = _mm_testnzc_ps(a, b); | |
4381 | assert_eq!(r, 1); | |
4382 | } | |
4383 | ||
83c7162d | 4384 | #[simd_test(enable = "avx")] |
0531ce1d XL |
4385 | unsafe fn test_mm256_movemask_pd() { |
4386 | let a = _mm256_setr_pd(1., -2., 3., -4.); | |
4387 | let r = _mm256_movemask_pd(a); | |
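| // The mask collects the sign bits: elements 1 and 3 are negative, | |
| // giving 0b1010 = 0xA. | |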
4388 | assert_eq!(r, 0xA); | |
4389 | } | |
4390 | ||
83c7162d | 4391 | #[simd_test(enable = "avx")] |
0531ce1d XL |
4392 | unsafe fn test_mm256_movemask_ps() { |
4393 | let a = _mm256_setr_ps(1., -2., 3., -4., 1., -2., 3., -4.); | |
4394 | let r = _mm256_movemask_ps(a); | |
4395 | assert_eq!(r, 0xAA); | |
4396 | } | |
4397 | ||
83c7162d | 4398 | #[simd_test(enable = "avx")] |
0531ce1d XL |
4399 | unsafe fn test_mm256_setzero_pd() { |
4400 | let r = _mm256_setzero_pd(); | |
4401 | assert_eq_m256d(r, _mm256_set1_pd(0.)); | |
4402 | } | |
4403 | ||
83c7162d | 4404 | #[simd_test(enable = "avx")] |
0531ce1d XL |
4405 | unsafe fn test_mm256_setzero_ps() { |
4406 | let r = _mm256_setzero_ps(); | |
4407 | assert_eq_m256(r, _mm256_set1_ps(0.)); | |
4408 | } | |
4409 | ||
83c7162d | 4410 | #[simd_test(enable = "avx")] |
0531ce1d XL |
4411 | unsafe fn test_mm256_setzero_si256() { |
4412 | let r = _mm256_setzero_si256(); | |
4413 | assert_eq_m256i(r, _mm256_set1_epi8(0)); | |
4414 | } | |
4415 | ||
83c7162d | 4416 | #[simd_test(enable = "avx")] |
0531ce1d XL |
4417 | unsafe fn test_mm256_set_pd() { |
4418 | let r = _mm256_set_pd(1., 2., 3., 4.); | |
4419 | assert_eq_m256d(r, _mm256_setr_pd(4., 3., 2., 1.)); | |
4420 | } | |

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_set_ps() {
        let r = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        assert_eq_m256(r, _mm256_setr_ps(8., 7., 6., 5., 4., 3., 2., 1.));
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_set_epi8() {
        #[rustfmt::skip]
        let r = _mm256_set_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            32, 31, 30, 29, 28, 27, 26, 25,
            24, 23, 22, 21, 20, 19, 18, 17,
            16, 15, 14, 13, 12, 11, 10, 9,
            8, 7, 6, 5, 4, 3, 2, 1,
        );
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_set_epi16() {
        #[rustfmt::skip]
        let r = _mm256_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
        );
        #[rustfmt::skip]
        let e = _mm256_setr_epi16(
            16, 15, 14, 13, 12, 11, 10, 9,
            8, 7, 6, 5, 4, 3, 2, 1,
        );
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_set_epi32() {
        let r = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m256i(r, _mm256_setr_epi32(8, 7, 6, 5, 4, 3, 2, 1));
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_set_epi64x() {
        let r = _mm256_set_epi64x(1, 2, 3, 4);
        assert_eq_m256i(r, _mm256_setr_epi64x(4, 3, 2, 1));
    }
4473 | ||
83c7162d | 4474 | #[simd_test(enable = "avx")] |
0531ce1d XL |
4475 | unsafe fn test_mm256_setr_pd() { |
4476 | let r = _mm256_setr_pd(1., 2., 3., 4.); | |
4477 | assert_eq_m256d(r, _mm256_setr_pd(1., 2., 3., 4.)); | |
4478 | } | |
4479 | ||
83c7162d | 4480 | #[simd_test(enable = "avx")] |
0531ce1d XL |
4481 | unsafe fn test_mm256_setr_ps() { |
4482 | let r = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.); | |
8faf50e0 | 4483 | assert_eq_m256(r, _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.)); |
0531ce1d XL |
4484 | } |
4485 | ||
83c7162d | 4486 | #[simd_test(enable = "avx")] |
0531ce1d | 4487 | unsafe fn test_mm256_setr_epi8() { |
0731742a | 4488 | #[rustfmt::skip] |
0531ce1d XL |
4489 | let r = _mm256_setr_epi8( |
4490 | 1, 2, 3, 4, 5, 6, 7, 8, | |
4491 | 9, 10, 11, 12, 13, 14, 15, 16, | |
4492 | 17, 18, 19, 20, 21, 22, 23, 24, | |
4493 | 25, 26, 27, 28, 29, 30, 31, 32, | |
4494 | ); | |
0731742a | 4495 | #[rustfmt::skip] |
0531ce1d XL |
4496 | let e = _mm256_setr_epi8( |
4497 | 1, 2, 3, 4, 5, 6, 7, 8, | |
4498 | 9, 10, 11, 12, 13, 14, 15, 16, | |
4499 | 17, 18, 19, 20, 21, 22, 23, 24, | |
4500 | 25, 26, 27, 28, 29, 30, 31, 32 | |
4501 | ); | |
4502 | ||
4503 | assert_eq_m256i(r, e); | |
4504 | } | |
4505 | ||
83c7162d | 4506 | #[simd_test(enable = "avx")] |
0531ce1d | 4507 | unsafe fn test_mm256_setr_epi16() { |
0731742a | 4508 | #[rustfmt::skip] |
0531ce1d XL |
4509 | let r = _mm256_setr_epi16( |
4510 | 1, 2, 3, 4, 5, 6, 7, 8, | |
4511 | 9, 10, 11, 12, 13, 14, 15, 16, | |
4512 | ); | |
0731742a | 4513 | #[rustfmt::skip] |
0531ce1d XL |
4514 | let e = _mm256_setr_epi16( |
4515 | 1, 2, 3, 4, 5, 6, 7, 8, | |
4516 | 9, 10, 11, 12, 13, 14, 15, 16, | |
4517 | ); | |
4518 | assert_eq_m256i(r, e); | |
4519 | } | |
4520 | ||
83c7162d | 4521 | #[simd_test(enable = "avx")] |
0531ce1d XL |
4522 | unsafe fn test_mm256_setr_epi32() { |
4523 | let r = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); | |
4524 | assert_eq_m256i(r, _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8)); | |
4525 | } | |
4526 | ||
83c7162d | 4527 | #[simd_test(enable = "avx")] |
0531ce1d XL |
4528 | unsafe fn test_mm256_setr_epi64x() { |
4529 | let r = _mm256_setr_epi64x(1, 2, 3, 4); | |
4530 | assert_eq_m256i(r, _mm256_setr_epi64x(1, 2, 3, 4)); | |
4531 | } | |

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_set1_pd() {
        let r = _mm256_set1_pd(1.);
        assert_eq_m256d(r, _mm256_set1_pd(1.));
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_set1_ps() {
        let r = _mm256_set1_ps(1.);
        assert_eq_m256(r, _mm256_set1_ps(1.));
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_set1_epi8() {
        let r = _mm256_set1_epi8(1);
        assert_eq_m256i(r, _mm256_set1_epi8(1));
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_set1_epi16() {
        let r = _mm256_set1_epi16(1);
        assert_eq_m256i(r, _mm256_set1_epi16(1));
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_set1_epi32() {
        let r = _mm256_set1_epi32(1);
        assert_eq_m256i(r, _mm256_set1_epi32(1));
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_set1_epi64x() {
        let r = _mm256_set1_epi64x(1);
        assert_eq_m256i(r, _mm256_set1_epi64x(1));
    }
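
    // The `_mm256_set1_*` constructors broadcast one scalar to every lane of
    // the vector; the checks above only verify that the splat round-trips
    // through the same constructor.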

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_castpd_ps() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let r = _mm256_castpd_ps(a);
        let e = _mm256_setr_ps(0., 1.875, 0., 2., 0., 2.125, 0., 2.25);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_castps_pd() {
        let a = _mm256_setr_ps(0., 1.875, 0., 2., 0., 2.125, 0., 2.25);
        let r = _mm256_castps_pd(a);
        let e = _mm256_setr_pd(1., 2., 3., 4.);
        assert_eq_m256d(r, e);
    }
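
    // The cast intrinsics reinterpret bits without converting values: the
    // f64 1.0 (0x3FF0_0000_0000_0000) reads back as the two f32 lanes 0.0
    // (low word 0x0000_0000) and 1.875 (high word 0x3FF0_0000), which is why
    // the expected vectors above look unrelated to the inputs.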

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_castps_si256() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm256_castps_si256(a);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            0, 0, -128, 63, 0, 0, 0, 64,
            0, 0, 64, 64, 0, 0, -128, 64,
            0, 0, -96, 64, 0, 0, -64, 64,
            0, 0, -32, 64, 0, 0, 0, 65,
        );
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_castsi256_ps() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            0, 0, -128, 63, 0, 0, 0, 64,
            0, 0, 64, 64, 0, 0, -128, 64,
            0, 0, -96, 64, 0, 0, -64, 64,
            0, 0, -32, 64, 0, 0, 0, 65,
        );
        let r = _mm256_castsi256_ps(a);
        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        assert_eq_m256(r, e);
    }
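
    // Likewise, the epi8 patterns above are just the little-endian byte
    // encodings of 1.0f32 through 8.0f32; for example 1.0f32 is 0x3F80_0000,
    // i.e. the signed bytes (0, 0, -128, 63).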

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_castpd_si256() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let r = _mm256_castpd_si256(a);
        assert_eq_m256d(transmute(r), a);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_castsi256_pd() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let r = _mm256_castsi256_pd(a);
        assert_eq_m256d(r, transmute(a));
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_castps256_ps128() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm256_castps256_ps128(a);
        assert_eq_m128(r, _mm_setr_ps(1., 2., 3., 4.));
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_castpd256_pd128() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let r = _mm256_castpd256_pd128(a);
        assert_eq_m128d(r, _mm_setr_pd(1., 2.));
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_castsi256_si128() {
        let a = _mm256_setr_epi64x(1, 2, 3, 4);
        let r = _mm256_castsi256_si128(a);
        assert_eq_m128i(r, _mm_setr_epi64x(1, 2));
    }
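
    // The 256-to-128-bit casts above simply return the lower 128 bits of the
    // source; the upper half is discarded.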

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_zextps128_ps256() {
        let a = _mm_setr_ps(1., 2., 3., 4.);
        let r = _mm256_zextps128_ps256(a);
        let e = _mm256_setr_ps(1., 2., 3., 4., 0., 0., 0., 0.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_zextsi128_si256() {
        let a = _mm_setr_epi64x(1, 2);
        let r = _mm256_zextsi128_si256(a);
        let e = _mm256_setr_epi64x(1, 2, 0, 0);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_zextpd128_pd256() {
        let a = _mm_setr_pd(1., 2.);
        let r = _mm256_zextpd128_pd256(a);
        let e = _mm256_setr_pd(1., 2., 0., 0.);
        assert_eq_m256d(r, e);
    }
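
    // The zext casts go the other way: the source fills the lower 128 bits
    // and the upper 128 bits are zeroed, unlike the plain 128-to-256 casts,
    // whose upper half is left undefined.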

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_set_m128() {
        let hi = _mm_setr_ps(5., 6., 7., 8.);
        let lo = _mm_setr_ps(1., 2., 3., 4.);
        let r = _mm256_set_m128(hi, lo);
        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_set_m128d() {
        let hi = _mm_setr_pd(3., 4.);
        let lo = _mm_setr_pd(1., 2.);
        let r = _mm256_set_m128d(hi, lo);
        let e = _mm256_setr_pd(1., 2., 3., 4.);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_set_m128i() {
        #[rustfmt::skip]
        let hi = _mm_setr_epi8(
            17, 18, 19, 20,
            21, 22, 23, 24,
            25, 26, 27, 28,
            29, 30, 31, 32,
        );
        #[rustfmt::skip]
        let lo = _mm_setr_epi8(
            1, 2, 3, 4,
            5, 6, 7, 8,
            9, 10, 11, 12,
            13, 14, 15, 16,
        );
        let r = _mm256_set_m128i(hi, lo);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        assert_eq_m256i(r, e);
    }
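
    // `_mm256_set_m128*` concatenates two 128-bit vectors with the first
    // argument becoming the upper half of the result; `_mm256_setr_m128*`
    // below takes the halves in (low, high) order instead.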

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_setr_m128() {
        let lo = _mm_setr_ps(1., 2., 3., 4.);
        let hi = _mm_setr_ps(5., 6., 7., 8.);
        let r = _mm256_setr_m128(lo, hi);
        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_setr_m128d() {
        let lo = _mm_setr_pd(1., 2.);
        let hi = _mm_setr_pd(3., 4.);
        let r = _mm256_setr_m128d(lo, hi);
        let e = _mm256_setr_pd(1., 2., 3., 4.);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_setr_m128i() {
        #[rustfmt::skip]
        let lo = _mm_setr_epi8(
            1, 2, 3, 4,
            5, 6, 7, 8,
            9, 10, 11, 12,
            13, 14, 15, 16,
        );
        #[rustfmt::skip]
        let hi = _mm_setr_epi8(
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let r = _mm256_setr_m128i(lo, hi);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_loadu2_m128() {
        let hi = &[5., 6., 7., 8.];
        let hiaddr = hi.as_ptr();
        let lo = &[1., 2., 3., 4.];
        let loaddr = lo.as_ptr();
        let r = _mm256_loadu2_m128(hiaddr, loaddr);
        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_loadu2_m128d() {
        let hi = &[3., 4.];
        let hiaddr = hi.as_ptr();
        let lo = &[1., 2.];
        let loaddr = lo.as_ptr();
        let r = _mm256_loadu2_m128d(hiaddr, loaddr);
        let e = _mm256_setr_pd(1., 2., 3., 4.);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_loadu2_m128i() {
        #[rustfmt::skip]
        let hi = _mm_setr_epi8(
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        #[rustfmt::skip]
        let lo = _mm_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
        );
        let r = _mm256_loadu2_m128i(&hi as *const _ as *const _, &lo as *const _ as *const _);
        #[rustfmt::skip]
        let e = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        assert_eq_m256i(r, e);
    }
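
    // `_mm256_loadu2_m128*` combines two unaligned 128-bit loads from two
    // independent addresses: the second pointer fills the lower half of the
    // result and the first pointer fills the upper half.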

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_storeu2_m128() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let mut hi = _mm_undefined_ps();
        let mut lo = _mm_undefined_ps();
        _mm256_storeu2_m128(
            &mut hi as *mut _ as *mut f32,
            &mut lo as *mut _ as *mut f32,
            a,
        );
        assert_eq_m128(hi, _mm_setr_ps(5., 6., 7., 8.));
        assert_eq_m128(lo, _mm_setr_ps(1., 2., 3., 4.));
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_storeu2_m128d() {
        let a = _mm256_setr_pd(1., 2., 3., 4.);
        let mut hi = _mm_undefined_pd();
        let mut lo = _mm_undefined_pd();
        _mm256_storeu2_m128d(
            &mut hi as *mut _ as *mut f64,
            &mut lo as *mut _ as *mut f64,
            a,
        );
        assert_eq_m128d(hi, _mm_setr_pd(3., 4.));
        assert_eq_m128d(lo, _mm_setr_pd(1., 2.));
    }

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_storeu2_m128i() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let mut hi = _mm_undefined_si128();
        let mut lo = _mm_undefined_si128();
        _mm256_storeu2_m128i(&mut hi as *mut _, &mut lo as *mut _, a);
        #[rustfmt::skip]
        let e_hi = _mm_setr_epi8(
            17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32
        );
        #[rustfmt::skip]
        let e_lo = _mm_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8,
            9, 10, 11, 12, 13, 14, 15, 16
        );

        assert_eq_m128i(hi, e_hi);
        assert_eq_m128i(lo, e_lo);
    }
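
    // `_mm256_storeu2_m128*` is the inverse of the loadu2 tests above: the
    // lower 128 bits of `a` go to the second pointer and the upper 128 bits
    // to the first, with no alignment requirement on either address.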

    #[simd_test(enable = "avx")]
    unsafe fn test_mm256_cvtss_f32() {
        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let r = _mm256_cvtss_f32(a);
        assert_eq!(r, 1.);
    }
}