//! Advanced Vector Extensions (AVX)
//!
//! The references are:
//!
//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
//!   Instruction Set Reference, A-Z][intel64_ref].
//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
//!   System Instructions][amd64_ref].
//!
//! [Wikipedia][wiki] provides a quick overview of the instructions available.
//!
//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
//! [wiki]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions

use crate::{
    core_arch::{simd::*, simd_llvm::*, x86::*},
    intrinsics,
    mem::{self, transmute},
    ptr,
};

#[cfg(test)]
use stdarch_test::assert_instr;

/// Adds packed double-precision (64-bit) floating-point elements
/// in `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_add_pd(a: __m256d, b: __m256d) -> __m256d {
    simd_add(a, b)
}

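// An illustrative usage sketch (not part of upstream stdarch): element-wise
// addition of two `__m256d` vectors. It assumes the caller has already
// confirmed AVX support at runtime (e.g. with `is_x86_feature_detected!("avx")`
// in `std`) before calling into this `unsafe` intrinsic.
//
//     let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
//     let b = _mm256_setr_pd(10.0, 20.0, 30.0, 40.0);
//     let sum = _mm256_add_pd(a, b); // (11.0, 22.0, 33.0, 44.0)
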
/// Adds packed single-precision (32-bit) floating-point elements in `a` and
/// `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_add_ps(a: __m256, b: __m256) -> __m256 {
    simd_add(a, b)
}

/// Computes the bitwise AND of packed double-precision (64-bit)
/// floating-point elements in `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_and_pd)
#[inline]
#[target_feature(enable = "avx")]
// FIXME: Should be 'vandpd' instruction.
// See https://github.com/rust-lang/stdarch/issues/71
#[cfg_attr(test, assert_instr(vandps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_and_pd(a: __m256d, b: __m256d) -> __m256d {
    let a: u64x4 = transmute(a);
    let b: u64x4 = transmute(b);
    transmute(simd_and(a, b))
}

/// Computes the bitwise AND of packed single-precision (32-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_and_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vandps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_and_ps(a: __m256, b: __m256) -> __m256 {
    let a: u32x8 = transmute(a);
    let b: u32x8 = transmute(b);
    transmute(simd_and(a, b))
}

/// Computes the bitwise OR of packed double-precision (64-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_or_pd)
#[inline]
#[target_feature(enable = "avx")]
// FIXME: should be `vorpd` instruction.
// See <https://github.com/rust-lang/stdarch/issues/71>.
#[cfg_attr(test, assert_instr(vorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_or_pd(a: __m256d, b: __m256d) -> __m256d {
    let a: u64x4 = transmute(a);
    let b: u64x4 = transmute(b);
    transmute(simd_or(a, b))
}

/// Computes the bitwise OR of packed single-precision (32-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_or_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_or_ps(a: __m256, b: __m256) -> __m256 {
    let a: u32x8 = transmute(a);
    let b: u32x8 = transmute(b);
    transmute(simd_or(a, b))
}

/// Shuffles double-precision (64-bit) floating-point elements within 128-bit
/// lanes using the control in `imm8`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufpd, MASK = 3))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_shuffle_pd<const MASK: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_imm8!(MASK);
    simd_shuffle4!(
        a,
        b,
        <const MASK: i32> [
            MASK as u32 & 0b1,
            ((MASK as u32 >> 1) & 0b1) + 4,
            ((MASK as u32 >> 2) & 0b1) + 2,
            ((MASK as u32 >> 3) & 0b1) + 6,
        ],
    )
}

/// Shuffles single-precision (32-bit) floating-point elements in `a` within
/// 128-bit lanes using the control in `imm8`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_shuffle_ps<const MASK: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_imm8!(MASK);
    simd_shuffle8!(
        a,
        b,
        <const MASK: i32> [
            MASK as u32 & 0b11,
            (MASK as u32 >> 2) & 0b11,
            ((MASK as u32 >> 4) & 0b11) + 8,
            ((MASK as u32 >> 6) & 0b11) + 8,
            (MASK as u32 & 0b11) + 4,
            ((MASK as u32 >> 2) & 0b11) + 4,
            ((MASK as u32 >> 4) & 0b11) + 12,
            ((MASK as u32 >> 6) & 0b11) + 12,
        ],
    )
}

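// A worked example of the `MASK` encoding (illustrative sketch, not upstream
// text): within each 128-bit lane, the two low result elements come from `a`
// and the two high result elements from `b`, each selected by a 2-bit field.
//
//     let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
//     let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
//     // MASK = 0b00_00_11_11: pick a[3], a[3], b[0], b[0] in each lane.
//     let r = _mm256_shuffle_ps::<0b0000_1111>(a, b);
//     // r == (4., 4., 9., 9., 8., 8., 13., 13.)
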
/// Computes the bitwise NOT of packed double-precision (64-bit) floating-point
/// elements in `a`, and then ANDs the result with `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_andnot_pd)
#[inline]
#[target_feature(enable = "avx")]
// FIXME: should be `vandnpd` instruction.
#[cfg_attr(test, assert_instr(vandnps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_andnot_pd(a: __m256d, b: __m256d) -> __m256d {
    let a: u64x4 = transmute(a);
    let b: u64x4 = transmute(b);
    transmute(simd_and(simd_xor(u64x4::splat(!(0_u64)), a), b))
}

/// Computes the bitwise NOT of packed single-precision (32-bit) floating-point
/// elements in `a`, and then ANDs the result with `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_andnot_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vandnps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 {
    let a: u32x8 = transmute(a);
    let b: u32x8 = transmute(b);
    transmute(simd_and(simd_xor(u32x8::splat(!(0_u32)), a), b))
}

/// Compares packed double-precision (64-bit) floating-point elements
/// in `a` and `b`, and returns packed maximum values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaxpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_max_pd(a: __m256d, b: __m256d) -> __m256d {
    vmaxpd(a, b)
}

/// Compares packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and returns packed maximum values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaxps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_max_ps(a: __m256, b: __m256) -> __m256 {
    vmaxps(a, b)
}

/// Compares packed double-precision (64-bit) floating-point elements
/// in `a` and `b`, and returns packed minimum values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vminpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_min_pd(a: __m256d, b: __m256d) -> __m256d {
    vminpd(a, b)
}

/// Compares packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and returns packed minimum values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vminps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_min_ps(a: __m256, b: __m256) -> __m256 {
    vminps(a, b)
}

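// Illustrative sketch (not upstream text): element-wise maximum and minimum.
// Note that, like the underlying `vmaxps`/`vminps` instructions, these are not
// fully symmetric for NaN or signed-zero inputs (the second operand is
// generally the one returned in those cases).
//
//     let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
//     let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
//     let hi = _mm256_max_ps(a, b); // (2., 4., 6., 8., 10., 12., 14., 16.)
//     let lo = _mm256_min_ps(a, b); // (1., 3., 5., 7., 9., 11., 13., 15.)
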
/// Multiplies packed double-precision (64-bit) floating-point elements
/// in `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmulpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mul_pd(a: __m256d, b: __m256d) -> __m256d {
    simd_mul(a, b)
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a` and
/// `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmulps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mul_ps(a: __m256, b: __m256) -> __m256 {
    simd_mul(a, b)
}

/// Alternatively adds and subtracts packed double-precision (64-bit)
/// floating-point elements in `a` to/from packed elements in `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_addsub_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d {
    addsubpd256(a, b)
}

/// Alternatively adds and subtracts packed single-precision (32-bit)
/// floating-point elements in `a` to/from packed elements in `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_addsub_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_addsub_ps(a: __m256, b: __m256) -> __m256 {
    addsubps256(a, b)
}

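// Illustrative sketch (not upstream text): `addsub` subtracts in the
// even-indexed elements and adds in the odd-indexed ones, which is handy for
// interleaved complex arithmetic.
//
//     let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
//     let b = _mm256_set1_ps(0.5);
//     let r = _mm256_addsub_ps(a, b);
//     // r == (0.5, 2.5, 2.5, 4.5, 4.5, 6.5, 6.5, 8.5)
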
/// Subtracts packed double-precision (64-bit) floating-point elements in `b`
/// from packed elements in `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_sub_pd(a: __m256d, b: __m256d) -> __m256d {
    simd_sub(a, b)
}

/// Subtracts packed single-precision (32-bit) floating-point elements in `b`
/// from packed elements in `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_sub_ps(a: __m256, b: __m256) -> __m256 {
    simd_sub(a, b)
}

/// Computes the division of each of the 8 packed 32-bit floating-point elements
/// in `a` by the corresponding packed elements in `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_div_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vdivps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_div_ps(a: __m256, b: __m256) -> __m256 {
    simd_div(a, b)
}

/// Computes the division of each of the 4 packed 64-bit floating-point elements
/// in `a` by the corresponding packed elements in `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_div_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vdivpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_div_pd(a: __m256d, b: __m256d) -> __m256d {
    simd_div(a, b)
}

/// Rounds packed double-precision (64-bit) floating-point elements in `a`
/// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows:
///
/// - `0x00`: Round to the nearest whole number.
/// - `0x01`: Round down, toward negative infinity.
/// - `0x02`: Round up, toward positive infinity.
/// - `0x03`: Truncate the values.
///
/// For a complete list of options, check [the LLVM docs][llvm_docs].
///
/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_round_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundpd, ROUNDING = 0x3))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_round_pd<const ROUNDING: i32>(a: __m256d) -> __m256d {
    static_assert_imm4!(ROUNDING);
    roundpd256(a, ROUNDING)
}

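// Illustrative sketch (not upstream text) using the encodings listed above:
// `0b00` rounds to nearest (ties to even) and `0b11` truncates toward zero.
//
//     let a = _mm256_setr_pd(1.7, -1.7, 2.5, -2.5);
//     let nearest = _mm256_round_pd::<0b00>(a); // (2.0, -2.0, 2.0, -2.0)
//     let truncated = _mm256_round_pd::<0b11>(a); // (1.0, -1.0, 2.0, -2.0)
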
/// Rounds packed double-precision (64-bit) floating-point elements in `a`
/// toward positive infinity.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_ceil_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_ceil_pd(a: __m256d) -> __m256d {
    simd_ceil(a)
}

/// Rounds packed double-precision (64-bit) floating-point elements in `a`
/// toward negative infinity.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_floor_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_floor_pd(a: __m256d) -> __m256d {
    simd_floor(a)
}

/// Rounds packed single-precision (32-bit) floating-point elements in `a`
/// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows:
///
/// - `0x00`: Round to the nearest whole number.
/// - `0x01`: Round down, toward negative infinity.
/// - `0x02`: Round up, toward positive infinity.
/// - `0x03`: Truncate the values.
///
/// For a complete list of options, check [the LLVM docs][llvm_docs].
///
/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_round_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundps, ROUNDING = 0x00))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_round_ps<const ROUNDING: i32>(a: __m256) -> __m256 {
    static_assert_imm4!(ROUNDING);
    roundps256(a, ROUNDING)
}

/// Rounds packed single-precision (32-bit) floating-point elements in `a`
/// toward positive infinity.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_ceil_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_ceil_ps(a: __m256) -> __m256 {
    simd_ceil(a)
}

/// Rounds packed single-precision (32-bit) floating-point elements in `a`
/// toward negative infinity.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_floor_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_floor_ps(a: __m256) -> __m256 {
    simd_floor(a)
}

/// Returns the square root of packed single-precision (32-bit) floating-point
/// elements in `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sqrt_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsqrtps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_sqrt_ps(a: __m256) -> __m256 {
    sqrtps256(a)
}

/// Returns the square root of packed double-precision (64-bit) floating-point
/// elements in `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sqrt_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsqrtpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_sqrt_pd(a: __m256d) -> __m256d {
    simd_fsqrt(a)
}

/// Blends packed double-precision (64-bit) floating-point elements from
/// `a` and `b` using control mask `imm8`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_blend_pd)
#[inline]
#[target_feature(enable = "avx")]
// Note: LLVM7 prefers single-precision blend instructions when
// possible, see: https://bugs.llvm.org/show_bug.cgi?id=38194
// #[cfg_attr(test, assert_instr(vblendpd, imm8 = 9))]
#[cfg_attr(test, assert_instr(vblendps, IMM4 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_blend_pd<const IMM4: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_imm4!(IMM4);
    simd_shuffle4!(
        a,
        b,
        <const IMM4: i32> [
            ((IMM4 as u32 >> 0) & 1) * 4 + 0,
            ((IMM4 as u32 >> 1) & 1) * 4 + 1,
            ((IMM4 as u32 >> 2) & 1) * 4 + 2,
            ((IMM4 as u32 >> 3) & 1) * 4 + 3,
        ],
    )
}

/// Blends packed single-precision (32-bit) floating-point elements from
/// `a` and `b` using control mask `imm8`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_blend_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vblendps, IMM8 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_blend_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_imm8!(IMM8);
    simd_shuffle8!(
        a,
        b,
        <const IMM8: i32> [
            ((IMM8 as u32 >> 0) & 1) * 8 + 0,
            ((IMM8 as u32 >> 1) & 1) * 8 + 1,
            ((IMM8 as u32 >> 2) & 1) * 8 + 2,
            ((IMM8 as u32 >> 3) & 1) * 8 + 3,
            ((IMM8 as u32 >> 4) & 1) * 8 + 4,
            ((IMM8 as u32 >> 5) & 1) * 8 + 5,
            ((IMM8 as u32 >> 6) & 1) * 8 + 6,
            ((IMM8 as u32 >> 7) & 1) * 8 + 7,
        ],
    )
}

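// Illustrative sketch (not upstream text): bit `i` of `IMM8` selects element
// `i` from `b` when set, and from `a` when clear.
//
//     let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
//     let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
//     let r = _mm256_blend_ps::<0b1100_0011>(a, b);
//     // r == (9., 10., 3., 4., 5., 6., 15., 16.)
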
/// Blends packed double-precision (64-bit) floating-point elements from
/// `a` and `b` using `c` as a mask.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_blendv_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vblendvpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
    vblendvpd(a, b, c)
}

/// Blends packed single-precision (32-bit) floating-point elements from
/// `a` and `b` using `c` as a mask.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_blendv_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vblendvps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
    vblendvps(a, b, c)
}

/// Conditionally multiplies the packed single-precision (32-bit) floating-point
/// elements in `a` and `b` using the high 4 bits in `imm8`,
/// sums the four products, and conditionally returns the sum
/// using the low 4 bits of `imm8`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dp_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vdpps, IMM8 = 0x0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_dp_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_imm8!(IMM8);
    vdpps(a, b, IMM8)
}

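// A worked example of the `IMM8` encoding (illustrative sketch, not upstream
// text): the high nibble selects which elements take part in each 128-bit
// lane's dot product, the low nibble selects which result elements receive it.
//
//     let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
//     let b = _mm256_set1_ps(1.0);
//     // 0x71: multiply elements 0..=2 of each lane, write the sum to element 0.
//     let r = _mm256_dp_ps::<0x71>(a, b);
//     // r == (6., 0., 0., 0., 18., 0., 0., 0.)
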
/// Horizontal addition of adjacent pairs in the two packed vectors
/// of 4 64-bit floating-point elements `a` and `b`.
/// In the result, sums of elements from `a` are returned in even locations,
/// while sums of elements from `b` are returned in odd locations.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hadd_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhaddpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d {
    vhaddpd(a, b)
}

/// Horizontal addition of adjacent pairs in the two packed vectors
/// of 8 32-bit floating-point elements `a` and `b`.
/// In the result, sums of elements from `a` are returned in locations of
/// indices 0, 1, 4, 5; sums of elements from `b` are returned in locations
/// 2, 3, 6, 7.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hadd_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhaddps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 {
    vhaddps(a, b)
}

/// Horizontal subtraction of adjacent pairs in the two packed vectors
/// of 4 64-bit floating-point elements `a` and `b`.
/// In the result, differences of elements from `a` are returned in even
/// locations, while differences of elements from `b` are returned in odd
/// locations.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hsub_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d {
    vhsubpd(a, b)
}

/// Horizontal subtraction of adjacent pairs in the two packed vectors
/// of 8 32-bit floating-point elements `a` and `b`.
/// In the result, differences of elements from `a` are returned in locations
/// of indices 0, 1, 4, 5; differences of elements from `b` are returned in
/// locations 2, 3, 6, 7.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hsub_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_hsub_ps(a: __m256, b: __m256) -> __m256 {
    vhsubps(a, b)
}

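// Illustrative sketch (not upstream text) of the result layout described
// above, with pairwise sums from `a` and `b` interleaved per 128-bit lane:
//
//     let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
//     let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
//     let r = _mm256_hadd_ps(a, b);
//     // r == (3., 7., 19., 23., 11., 15., 27., 31.)
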
/// Computes the bitwise XOR of packed double-precision (64-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_xor_pd)
#[inline]
#[target_feature(enable = "avx")]
// FIXME Should be 'vxorpd' instruction.
#[cfg_attr(test, assert_instr(vxorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_xor_pd(a: __m256d, b: __m256d) -> __m256d {
    let a: u64x4 = transmute(a);
    let b: u64x4 = transmute(b);
    transmute(simd_xor(a, b))
}

/// Computes the bitwise XOR of packed single-precision (32-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_xor_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vxorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_xor_ps(a: __m256, b: __m256) -> __m256 {
    let a: u32x8 = transmute(a);
    let b: u32x8 = transmute(b);
    transmute(simd_xor(a, b))
}

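// Illustrative sketch (not upstream text): because these are plain bitwise
// operations on the float bit patterns, XOR-ing with `-0.0` flips every sign
// bit, i.e. negates all elements.
//
//     let a = _mm256_setr_ps(1., -2., 3., -4., 5., -6., 7., -8.);
//     let neg = _mm256_xor_ps(a, _mm256_set1_ps(-0.0));
//     // neg == (-1., 2., -3., 4., -5., 6., -7., 8.)
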
/// Equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_OQ: i32 = 0x00;
/// Less-than (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LT_OS: i32 = 0x01;
/// Less-than-or-equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LE_OS: i32 = 0x02;
/// Unordered (non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_UNORD_Q: i32 = 0x03;
/// Not-equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_UQ: i32 = 0x04;
/// Not-less-than (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLT_US: i32 = 0x05;
/// Not-less-than-or-equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLE_US: i32 = 0x06;
/// Ordered (non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_ORD_Q: i32 = 0x07;
/// Equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_UQ: i32 = 0x08;
/// Not-greater-than-or-equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGE_US: i32 = 0x09;
/// Not-greater-than (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGT_US: i32 = 0x0a;
/// False (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_FALSE_OQ: i32 = 0x0b;
/// Not-equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_OQ: i32 = 0x0c;
/// Greater-than-or-equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GE_OS: i32 = 0x0d;
/// Greater-than (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GT_OS: i32 = 0x0e;
/// True (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_TRUE_UQ: i32 = 0x0f;
/// Equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_OS: i32 = 0x10;
/// Less-than (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LT_OQ: i32 = 0x11;
/// Less-than-or-equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LE_OQ: i32 = 0x12;
/// Unordered (signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_UNORD_S: i32 = 0x13;
/// Not-equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_US: i32 = 0x14;
/// Not-less-than (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLT_UQ: i32 = 0x15;
/// Not-less-than-or-equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLE_UQ: i32 = 0x16;
/// Ordered (signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_ORD_S: i32 = 0x17;
/// Equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_US: i32 = 0x18;
/// Not-greater-than-or-equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGE_UQ: i32 = 0x19;
/// Not-greater-than (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGT_UQ: i32 = 0x1a;
/// False (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_FALSE_OS: i32 = 0x1b;
/// Not-equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_OS: i32 = 0x1c;
/// Greater-than-or-equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GE_OQ: i32 = 0x1d;
/// Greater-than (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GT_OQ: i32 = 0x1e;
/// True (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_TRUE_US: i32 = 0x1f;

/// Compares packed double-precision (64-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `IMM5`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_pd)
#[inline]
#[target_feature(enable = "avx,sse2")]
#[cfg_attr(test, assert_instr(vcmpeqpd, IMM5 = 0))] // TODO Validate vcmppd
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmp_pd<const IMM5: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_imm5!(IMM5);
    vcmppd(a, b, IMM5 as i8)
}

/// Compares packed double-precision (64-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `IMM5`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmp_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqpd, IMM5 = 0))] // TODO Validate vcmppd
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cmp_pd<const IMM5: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_imm5!(IMM5);
    vcmppd256(a, b, IMM5 as u8)
}

/// Compares packed single-precision (32-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `IMM5`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_ps)
#[inline]
#[target_feature(enable = "avx,sse")]
#[cfg_attr(test, assert_instr(vcmpeqps, IMM5 = 0))] // TODO Validate vcmpps
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmp_ps<const IMM5: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_imm5!(IMM5);
    vcmpps(a, b, IMM5 as i8)
}

/// Compares packed single-precision (32-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `IMM5`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmp_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqps, IMM5 = 0))] // TODO Validate vcmpps
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cmp_ps<const IMM5: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_imm5!(IMM5);
    vcmpps256(a, b, IMM5 as u8)
}

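// Illustrative sketch (not upstream text): for two `__m256` values `a` and
// `b`, a comparison produces an all-ones mask (sign bit set) in each element
// where the predicate holds, which can then drive `_mm256_blendv_ps`.
//
//     let mask = _mm256_cmp_ps::<{ _CMP_LT_OQ }>(a, b); // all-ones where a < b
//     let min = _mm256_blendv_ps(b, a, mask);           // picks a where a < b, else b
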
/// Compares the lower double-precision (64-bit) floating-point element in
/// `a` and `b` based on the comparison operand specified by `IMM5`,
/// stores the result in the lower element of the returned vector,
/// and copies the upper element from `a` to the upper element of the returned
/// vector.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_sd)
#[inline]
#[target_feature(enable = "avx,sse2")]
#[cfg_attr(test, assert_instr(vcmpeqsd, IMM5 = 0))] // TODO Validate vcmpsd
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmp_sd<const IMM5: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_imm5!(IMM5);
    vcmpsd(a, b, IMM5 as i8)
}

/// Compares the lower single-precision (32-bit) floating-point element in
/// `a` and `b` based on the comparison operand specified by `IMM5`,
/// stores the result in the lower element of the returned vector,
/// and copies the upper 3 packed elements from `a` to the upper elements of
/// the returned vector.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_ss)
#[inline]
#[target_feature(enable = "avx,sse")]
#[cfg_attr(test, assert_instr(vcmpeqss, IMM5 = 0))] // TODO Validate vcmpss
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmp_ss<const IMM5: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_imm5!(IMM5);
    vcmpss(a, b, IMM5 as i8)
}

/// Converts packed 32-bit integers in `a` to packed double-precision (64-bit)
/// floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi32_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtdq2pd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cvtepi32_pd(a: __m128i) -> __m256d {
    simd_cast(a.as_i32x4())
}

/// Converts packed 32-bit integers in `a` to packed single-precision (32-bit)
/// floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi32_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtdq2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cvtepi32_ps(a: __m256i) -> __m256 {
    vcvtdq2ps(a.as_i32x8())
}

/// Converts packed double-precision (64-bit) floating-point elements in `a`
/// to packed single-precision (32-bit) floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtpd_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtpd2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cvtpd_ps(a: __m256d) -> __m128 {
    vcvtpd2ps(a)
}

/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed 32-bit integers.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtps_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtps2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cvtps_epi32(a: __m256) -> __m256i {
    transmute(vcvtps2dq(a))
}

/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed double-precision (64-bit) floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtps_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtps2pd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cvtps_pd(a: __m128) -> __m256d {
    simd_cast(a)
}

/// Converts packed double-precision (64-bit) floating-point elements in `a`
/// to packed 32-bit integers with truncation.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttpd_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvttpd2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cvttpd_epi32(a: __m256d) -> __m128i {
    transmute(vcvttpd2dq(a))
}

/// Converts packed double-precision (64-bit) floating-point elements in `a`
/// to packed 32-bit integers.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtpd_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtpd2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cvtpd_epi32(a: __m256d) -> __m128i {
    transmute(vcvtpd2dq(a))
}

/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed 32-bit integers with truncation.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttps_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvttps2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cvttps_epi32(a: __m256) -> __m256i {
    transmute(vcvttps2dq(a))
}

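// Illustrative sketch (not upstream text): the `cvt` conversions round
// according to the current MXCSR rounding mode (round-to-nearest-even by
// default), while the `cvtt` variants always truncate toward zero.
//
//     let a = _mm256_setr_ps(1.6, -1.6, 2.5, -2.5, 0.4, -0.4, 3.0, -3.0);
//     let rounded = _mm256_cvtps_epi32(a);    // (2, -2, 2, -2, 0, 0, 3, -3) by default
//     let truncated = _mm256_cvttps_epi32(a); // (1, -1, 2, -2, 0, 0, 3, -3)
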
/// Extracts 128 bits (composed of 4 packed single-precision (32-bit)
/// floating-point elements) from `a`, selected with `imm8`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extractf128_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(vextractf128, IMM1 = 1)
)]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_extractf128_ps<const IMM1: i32>(a: __m256) -> __m128 {
    static_assert_imm1!(IMM1);
    simd_shuffle4!(
        a,
        _mm256_undefined_ps(),
        <const IMM1: i32> [[0, 1, 2, 3], [4, 5, 6, 7]][IMM1 as usize],
    )
}

/// Extracts 128 bits (composed of 2 packed double-precision (64-bit)
/// floating-point elements) from `a`, selected with `imm8`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extractf128_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(vextractf128, IMM1 = 1)
)]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_extractf128_pd<const IMM1: i32>(a: __m256d) -> __m128d {
    static_assert_imm1!(IMM1);
    simd_shuffle2!(a, _mm256_undefined_pd(), <const IMM1: i32> [[0, 1], [2, 3]][IMM1 as usize])
}

/// Extracts 128 bits (composed of integer data) from `a`, selected with `imm8`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extractf128_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(vextractf128, IMM1 = 1)
)]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_extractf128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
    static_assert_imm1!(IMM1);
    let dst: i64x2 = simd_shuffle2!(
        a.as_i64x4(),
        _mm256_undefined_si256().as_i64x4(),
        <const IMM1: i32> [[0, 1], [2, 3]][IMM1 as usize],
    );
    transmute(dst)
}

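// Illustrative sketch (not upstream text): `IMM1` selects which 128-bit half
// of the 256-bit vector is returned.
//
//     let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
//     let lo = _mm256_extractf128_ps::<0>(a); // (1., 2., 3., 4.)
//     let hi = _mm256_extractf128_ps::<1>(a); // (5., 6., 7., 8.)
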
/// Zeroes the contents of all XMM or YMM registers.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_zeroall)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vzeroall))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_zeroall() {
    vzeroall()
}

/// Zeroes the upper 128 bits of all YMM registers;
/// the lower 128 bits of the registers are unmodified.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_zeroupper)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vzeroupper))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_zeroupper() {
    vzeroupper()
}

/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutevar_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_permutevar_ps(a: __m256, b: __m256i) -> __m256 {
    vpermilps256(a, b.as_i32x8())
}

/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// using the control in `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutevar_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_permutevar_ps(a: __m128, b: __m128i) -> __m128 {
    vpermilps(a, b.as_i32x4())
}

/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `imm8`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilps, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_permute_ps<const IMM8: i32>(a: __m256) -> __m256 {
    static_assert_imm8!(IMM8);
    simd_shuffle8!(
        a,
        _mm256_undefined_ps(),
        <const IMM8: i32> [
            (IMM8 as u32 >> 0) & 0b11,
            (IMM8 as u32 >> 2) & 0b11,
            (IMM8 as u32 >> 4) & 0b11,
            (IMM8 as u32 >> 6) & 0b11,
            ((IMM8 as u32 >> 0) & 0b11) + 4,
            ((IMM8 as u32 >> 2) & 0b11) + 4,
            ((IMM8 as u32 >> 4) & 0b11) + 4,
            ((IMM8 as u32 >> 6) & 0b11) + 4,
        ],
    )
}

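// Illustrative sketch (not upstream text): `_mm256_permute_ps` applies the
// same 8-bit control to both 128-bit lanes, each 2-bit field selecting a
// source element within the lane.
//
//     let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
//     // 0b00_01_10_11 reverses the elements of each lane.
//     let r = _mm256_permute_ps::<0b00_01_10_11>(a);
//     // r == (4., 3., 2., 1., 8., 7., 6., 5.)
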
/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// using the control in `imm8`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permute_ps)
#[inline]
#[target_feature(enable = "avx,sse")]
#[cfg_attr(test, assert_instr(vpermilps, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_permute_ps<const IMM8: i32>(a: __m128) -> __m128 {
    static_assert_imm8!(IMM8);
    simd_shuffle4!(
        a,
        _mm_undefined_ps(),
        <const IMM8: i32> [
            (IMM8 as u32 >> 0) & 0b11,
            (IMM8 as u32 >> 2) & 0b11,
            (IMM8 as u32 >> 4) & 0b11,
            (IMM8 as u32 >> 6) & 0b11,
        ],
    )
}

/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutevar_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_permutevar_pd(a: __m256d, b: __m256i) -> __m256d {
    vpermilpd256(a, b.as_i64x4())
}

/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// using the control in `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutevar_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_permutevar_pd(a: __m128d, b: __m128i) -> __m128d {
    vpermilpd(a, b.as_i64x2())
}

/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `imm8`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilpd, IMM4 = 0x1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_permute_pd<const IMM4: i32>(a: __m256d) -> __m256d {
    static_assert_imm4!(IMM4);
    simd_shuffle4!(
        a,
        _mm256_undefined_pd(),
        <const IMM4: i32> [
            ((IMM4 as u32 >> 0) & 1),
            ((IMM4 as u32 >> 1) & 1),
            ((IMM4 as u32 >> 2) & 1) + 2,
            ((IMM4 as u32 >> 3) & 1) + 2,
        ],
    )
}

/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// using the control in `imm8`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permute_pd)
#[inline]
#[target_feature(enable = "avx,sse2")]
#[cfg_attr(test, assert_instr(vpermilpd, IMM2 = 0x1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_permute_pd<const IMM2: i32>(a: __m128d) -> __m128d {
    static_assert_imm2!(IMM2);
    simd_shuffle2!(
        a,
        _mm_undefined_pd(),
        <const IMM2: i32> [(IMM2 as u32) & 1, (IMM2 as u32 >> 1) & 1],
    )
}

/// Shuffles 256 bits (composed of 8 packed single-precision (32-bit)
/// floating-point elements) selected by `imm8` from `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute2f128_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x5))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_permute2f128_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_imm8!(IMM8);
    vperm2f128ps256(a, b, IMM8 as i8)
}

/// Shuffles 256 bits (composed of 4 packed double-precision (64-bit)
/// floating-point elements) selected by `imm8` from `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute2f128_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x31))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_permute2f128_pd<const IMM8: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_imm8!(IMM8);
    vperm2f128pd256(a, b, IMM8 as i8)
}

/// Shuffles 128 bits (composed of integer data) selected by `imm8`
/// from `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute2f128_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x31))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_permute2f128_si256<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_imm8!(IMM8);
    transmute(vperm2f128si256(a.as_i32x8(), b.as_i32x8(), IMM8 as i8))
}

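// Illustrative sketch (not upstream text): each 128-bit half of the result is
// picked by a 4-bit field of `IMM8` (0 = low half of `a`, 1 = high half of
// `a`, 2 = low half of `b`, 3 = high half of `b`; setting bit 3 of a field
// zeroes that half instead).
//
//     let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
//     let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
//     let r = _mm256_permute2f128_ps::<0x20>(a, b);
//     // r == (1., 2., 3., 4., 9., 10., 11., 12.)
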
/// Broadcasts a single-precision (32-bit) floating-point element from memory
/// to all elements of the returned vector.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcast_ss)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::trivially_copy_pass_by_ref)]
pub unsafe fn _mm256_broadcast_ss(f: &f32) -> __m256 {
    _mm256_set1_ps(*f)
}

/// Broadcasts a single-precision (32-bit) floating-point element from memory
/// to all elements of the returned vector.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcast_ss)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::trivially_copy_pass_by_ref)]
pub unsafe fn _mm_broadcast_ss(f: &f32) -> __m128 {
    _mm_set1_ps(*f)
}

/// Broadcasts a double-precision (64-bit) floating-point element from memory
/// to all elements of the returned vector.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcast_sd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::trivially_copy_pass_by_ref)]
pub unsafe fn _mm256_broadcast_sd(f: &f64) -> __m256d {
    _mm256_set1_pd(*f)
}

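// Illustrative sketch (not upstream text): broadcasting a scalar from memory
// is equivalent to splatting it with the `set1` helpers, as the bodies above
// show.
//
//     let x: f32 = 3.5;
//     let v = _mm256_broadcast_ss(&x); // (3.5, 3.5, 3.5, 3.5, 3.5, 3.5, 3.5, 3.5)
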
/// Broadcasts 128 bits from memory (composed of 4 packed single-precision
/// (32-bit) floating-point elements) to all elements of the returned vector.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcast_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_broadcast_ps(a: &__m128) -> __m256 {
    vbroadcastf128ps256(a)
}

/// Broadcasts 128 bits from memory (composed of 2 packed double-precision
/// (64-bit) floating-point elements) to all elements of the returned vector.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcast_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_broadcast_pd(a: &__m128d) -> __m256d {
    vbroadcastf128pd256(a)
}

532ac7d7 1245/// Copies `a` to result, then inserts 128 bits (composed of 4 packed
0531ce1d
XL
1246/// single-precision (32-bit) floating-point elements) from `b` into result
1247/// at the location specified by `imm8`.
83c7162d
XL
1248///
1249/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_insertf128_ps)
0531ce1d
XL
1250#[inline]
1251#[target_feature(enable = "avx")]
0731742a
XL
1252#[cfg_attr(
1253 all(test, not(target_os = "windows")),
17df50a5 1254 assert_instr(vinsertf128, IMM1 = 1)
0731742a 1255)]
17df50a5
XL
1256#[rustc_legacy_const_generics(2)]
1257#[stable(feature = "simd_x86", since = "1.27.0")]
1258pub unsafe fn _mm256_insertf128_ps<const IMM1: i32>(a: __m256, b: __m128) -> __m256 {
1259 static_assert_imm1!(IMM1);
1260 simd_shuffle8!(
1261 a,
1262 _mm256_castps128_ps256(b),
1263 <const IMM1: i32> [[8, 9, 10, 11, 4, 5, 6, 7], [0, 1, 2, 3, 8, 9, 10, 11]][IMM1 as usize],
1264 )
0531ce1d
XL
1265}
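// Editor's sketch (not upstream code): `IMM1` picks which 128-bit half of
// the destination is replaced, here the upper half. The helper name is ours.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn insertf128_ps_sketch() {
    let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
    let b = _mm_setr_ps(10.0, 20.0, 30.0, 40.0);
    let r = _mm256_insertf128_ps::<1>(a, b);
    let mut out = [0.0_f32; 8];
    _mm256_storeu_ps(out.as_mut_ptr(), r);
    assert_eq!(out, [1.0, 2.0, 3.0, 4.0, 10.0, 20.0, 30.0, 40.0]);
}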
1266
532ac7d7 1267/// Copies `a` to result, then inserts 128 bits (composed of 2 packed
0531ce1d
XL
1268/// double-precision (64-bit) floating-point elements) from `b` into result
1269/// at the location specified by `imm8`.
83c7162d
XL
1270///
1271/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_insertf128_pd)
0531ce1d
XL
1272#[inline]
1273#[target_feature(enable = "avx")]
0731742a
XL
1274#[cfg_attr(
1275 all(test, not(target_os = "windows")),
17df50a5 1276 assert_instr(vinsertf128, IMM1 = 1)
0731742a 1277)]
17df50a5
XL
1278#[rustc_legacy_const_generics(2)]
1279#[stable(feature = "simd_x86", since = "1.27.0")]
1280pub unsafe fn _mm256_insertf128_pd<const IMM1: i32>(a: __m256d, b: __m128d) -> __m256d {
1281 static_assert_imm1!(IMM1);
1282 simd_shuffle4!(
1283 a,
1284 _mm256_castpd128_pd256(b),
1285 <const IMM1: i32> [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
1286 )
0531ce1d
XL
1287}
1288
532ac7d7 1289/// Copies `a` to result, then inserts 128 bits from `b` into result
0531ce1d 1290/// at the location specified by `imm8`.
83c7162d
XL
1291///
1292/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_insertf128_si256)
0531ce1d
XL
1293#[inline]
1294#[target_feature(enable = "avx")]
0731742a
XL
1295#[cfg_attr(
1296 all(test, not(target_os = "windows")),
17df50a5 1297 assert_instr(vinsertf128, IMM1 = 1)
0731742a 1298)]
17df50a5
XL
1299#[rustc_legacy_const_generics(2)]
1300#[stable(feature = "simd_x86", since = "1.27.0")]
1301pub unsafe fn _mm256_insertf128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m256i {
1302 static_assert_imm1!(IMM1);
1303 let dst: i64x4 = simd_shuffle4!(
1304 a.as_i64x4(),
1305 _mm256_castsi128_si256(b).as_i64x4(),
1306 <const IMM1: i32> [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
1307 );
532ac7d7 1308 transmute(dst)
0531ce1d
XL
1309}
1310
532ac7d7 1311/// Copies `a` to result, and inserts the 8-bit integer `i` into result
0531ce1d 1312/// at the location specified by `index`.
83c7162d
XL
1313///
1314/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_insert_epi8)
0531ce1d
XL
1315#[inline]
1316#[target_feature(enable = "avx")]
1317// This intrinsic has no corresponding instruction.
17df50a5 1318#[rustc_legacy_const_generics(2)]
83c7162d 1319#[stable(feature = "simd_x86", since = "1.27.0")]
17df50a5
XL
1320pub unsafe fn _mm256_insert_epi8<const INDEX: i32>(a: __m256i, i: i8) -> __m256i {
1321 static_assert_imm5!(INDEX);
1322 transmute(simd_insert(a.as_i8x32(), INDEX as u32, i))
0531ce1d
XL
1323}
1324
532ac7d7 1325/// Copies `a` to result, and inserts the 16-bit integer `i` into result
0531ce1d 1326/// at the location specified by `index`.
83c7162d
XL
1327///
1328/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_insert_epi16)
0531ce1d
XL
1329#[inline]
1330#[target_feature(enable = "avx")]
1331// This intrinsic has no corresponding instruction.
17df50a5 1332#[rustc_legacy_const_generics(2)]
83c7162d 1333#[stable(feature = "simd_x86", since = "1.27.0")]
17df50a5
XL
1334pub unsafe fn _mm256_insert_epi16<const INDEX: i32>(a: __m256i, i: i16) -> __m256i {
1335 static_assert_imm4!(INDEX);
1336 transmute(simd_insert(a.as_i16x16(), INDEX as u32, i))
0531ce1d
XL
1337}
1338
532ac7d7 1339/// Copies `a` to result, and inserts the 32-bit integer `i` into result
0531ce1d 1340/// at the location specified by `index`.
83c7162d
XL
1341///
1342/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_insert_epi32)
0531ce1d
XL
1343#[inline]
1344#[target_feature(enable = "avx")]
1345// This intrinsic has no corresponding instruction.
17df50a5 1346#[rustc_legacy_const_generics(2)]
83c7162d 1347#[stable(feature = "simd_x86", since = "1.27.0")]
17df50a5
XL
1348pub unsafe fn _mm256_insert_epi32<const INDEX: i32>(a: __m256i, i: i32) -> __m256i {
1349 static_assert_imm3!(INDEX);
1350 transmute(simd_insert(a.as_i32x8(), INDEX as u32, i))
0531ce1d
XL
1351}
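// Editor's sketch (not upstream code) for the scalar inserts above:
// replacing a single 32-bit lane leaves the other lanes untouched.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn insert_epi32_sketch() {
    let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    let r = _mm256_insert_epi32::<7>(a, 99);
    let mut out = [0i32; 8];
    _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
    assert_eq!(out, [0, 1, 2, 3, 4, 5, 6, 99]);
}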
1352
532ac7d7 1353/// Loads 256-bits (composed of 4 packed double-precision (64-bit)
0531ce1d
XL
1354/// floating-point elements) from memory into result.
1355/// `mem_addr` must be aligned on a 32-byte boundary or a
1356/// general-protection exception may be generated.
83c7162d
XL
1357///
1358/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_load_pd)
0531ce1d
XL
1359#[inline]
1360#[target_feature(enable = "avx")]
1361#[cfg_attr(test, assert_instr(vmovaps))] // FIXME vmovapd expected
83c7162d 1362#[stable(feature = "simd_x86", since = "1.27.0")]
48663c56 1363#[allow(clippy::cast_ptr_alignment)]
0531ce1d
XL
1364pub unsafe fn _mm256_load_pd(mem_addr: *const f64) -> __m256d {
1365 *(mem_addr as *const __m256d)
1366}
1367
532ac7d7 1368/// Stores 256-bits (composed of 4 packed double-precision (64-bit)
0531ce1d
XL
1369/// floating-point elements) from `a` into memory.
1370/// `mem_addr` must be aligned on a 32-byte boundary or a
1371/// general-protection exception may be generated.
83c7162d
XL
1372///
1373/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_store_pd)
0531ce1d
XL
1374#[inline]
1375#[target_feature(enable = "avx")]
1376#[cfg_attr(test, assert_instr(vmovaps))] // FIXME vmovapd expected
83c7162d 1377#[stable(feature = "simd_x86", since = "1.27.0")]
48663c56 1378#[allow(clippy::cast_ptr_alignment)]
416331ca 1379pub unsafe fn _mm256_store_pd(mem_addr: *mut f64, a: __m256d) {
0531ce1d
XL
1380 *(mem_addr as *mut __m256d) = a;
1381}
1382
532ac7d7 1383/// Loads 256-bits (composed of 8 packed single-precision (32-bit)
0531ce1d
XL
1384/// floating-point elements) from memory into result.
1385/// `mem_addr` must be aligned on a 32-byte boundary or a
1386/// general-protection exception may be generated.
83c7162d
XL
1387///
1388/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_load_ps)
0531ce1d
XL
1389#[inline]
1390#[target_feature(enable = "avx")]
1391#[cfg_attr(test, assert_instr(vmovaps))]
83c7162d 1392#[stable(feature = "simd_x86", since = "1.27.0")]
48663c56 1393#[allow(clippy::cast_ptr_alignment)]
0531ce1d
XL
1394pub unsafe fn _mm256_load_ps(mem_addr: *const f32) -> __m256 {
1395 *(mem_addr as *const __m256)
1396}
1397
532ac7d7 1398/// Stores 256-bits (composed of 8 packed single-precision (32-bit)
0531ce1d
XL
1399/// floating-point elements) from `a` into memory.
1400/// `mem_addr` must be aligned on a 32-byte boundary or a
1401/// general-protection exception may be generated.
83c7162d
XL
1402///
1403/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_store_ps)
0531ce1d
XL
1404#[inline]
1405#[target_feature(enable = "avx")]
1406#[cfg_attr(test, assert_instr(vmovaps))]
83c7162d 1407#[stable(feature = "simd_x86", since = "1.27.0")]
48663c56 1408#[allow(clippy::cast_ptr_alignment)]
416331ca 1409pub unsafe fn _mm256_store_ps(mem_addr: *mut f32, a: __m256) {
0531ce1d
XL
1410 *(mem_addr as *mut __m256) = a;
1411}
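// Editor's sketch (not upstream code): the aligned forms require a 32-byte
// boundary, which a `#[repr(align(32))]` wrapper provides.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn aligned_load_store_sketch() {
    #[repr(align(32))]
    struct Aligned([f32; 8]);

    let src = Aligned([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
    let mut dst = Aligned([0.0; 8]);
    let v = _mm256_load_ps(src.0.as_ptr());
    _mm256_store_ps(dst.0.as_mut_ptr(), v);
    assert_eq!(dst.0, src.0);
}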
1412
532ac7d7 1413/// Loads 256-bits (composed of 4 packed double-precision (64-bit)
0531ce1d
XL
1414/// floating-point elements) from memory into result.
1415/// `mem_addr` does not need to be aligned on any particular boundary.
83c7162d
XL
1416///
1417/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_loadu_pd)
0531ce1d
XL
1418#[inline]
1419#[target_feature(enable = "avx")]
1420#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovupd expected
83c7162d 1421#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1422pub unsafe fn _mm256_loadu_pd(mem_addr: *const f64) -> __m256d {
1423 let mut dst = _mm256_undefined_pd();
1424 ptr::copy_nonoverlapping(
1425 mem_addr as *const u8,
1426 &mut dst as *mut __m256d as *mut u8,
1427 mem::size_of::<__m256d>(),
1428 );
1429 dst
1430}
1431
532ac7d7 1432/// Stores 256-bits (composed of 4 packed double-precision (64-bit)
0531ce1d
XL
1433/// floating-point elements) from `a` into memory.
1434/// `mem_addr` does not need to be aligned on any particular boundary.
83c7162d
XL
1435///
1436/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_storeu_pd)
0531ce1d
XL
1437#[inline]
1438#[target_feature(enable = "avx")]
1439#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovupd expected
83c7162d 1440#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1441pub unsafe fn _mm256_storeu_pd(mem_addr: *mut f64, a: __m256d) {
1442 storeupd256(mem_addr, a);
1443}
1444
532ac7d7 1445/// Loads 256-bits (composed of 8 packed single-precision (32-bit)
0531ce1d
XL
1446/// floating-point elements) from memory into result.
1447/// `mem_addr` does not need to be aligned on any particular boundary.
83c7162d
XL
1448///
1449/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_loadu_ps)
0531ce1d
XL
1450#[inline]
1451#[target_feature(enable = "avx")]
1452#[cfg_attr(test, assert_instr(vmovups))]
83c7162d 1453#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1454pub unsafe fn _mm256_loadu_ps(mem_addr: *const f32) -> __m256 {
1455 let mut dst = _mm256_undefined_ps();
1456 ptr::copy_nonoverlapping(
1457 mem_addr as *const u8,
1458 &mut dst as *mut __m256 as *mut u8,
1459 mem::size_of::<__m256>(),
1460 );
1461 dst
1462}
1463
532ac7d7 1464/// Stores 256-bits (composed of 8 packed single-precision (32-bit)
0531ce1d
XL
1465/// floating-point elements) from `a` into memory.
1466/// `mem_addr` does not need to be aligned on any particular boundary.
83c7162d
XL
1467///
1468/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_storeu_ps)
0531ce1d
XL
1469#[inline]
1470#[target_feature(enable = "avx")]
1471#[cfg_attr(test, assert_instr(vmovups))]
83c7162d 1472#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1473pub unsafe fn _mm256_storeu_ps(mem_addr: *mut f32, a: __m256) {
1474 storeups256(mem_addr, a);
1475}
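// Editor's sketch (not upstream code): the unaligned forms accept any
// address, here a pointer that is only 4-byte aligned.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn unaligned_loadu_sketch() {
    let data = [0.0_f32, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
    let v = _mm256_loadu_ps(data.as_ptr().add(1));
    let mut out = [0.0_f32; 8];
    _mm256_storeu_ps(out.as_mut_ptr(), v);
    assert_eq!(out, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
}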
1476
532ac7d7 1477/// Loads 256-bits of integer data from memory into result.
0531ce1d
XL
1478/// `mem_addr` must be aligned on a 32-byte boundary or a
1479/// general-protection exception may be generated.
83c7162d
XL
1480///
1481/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_load_si256)
0531ce1d
XL
1482#[inline]
1483#[target_feature(enable = "avx")]
1484#[cfg_attr(test, assert_instr(vmovaps))] // FIXME vmovdqa expected
83c7162d 1485#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1486pub unsafe fn _mm256_load_si256(mem_addr: *const __m256i) -> __m256i {
1487 *mem_addr
1488}
1489
532ac7d7 1490/// Stores 256-bits of integer data from `a` into memory.
0531ce1d
XL
1491/// `mem_addr` must be aligned on a 32-byte boundary or a
1492/// general-protection exception may be generated.
83c7162d
XL
1493///
1494/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_store_si256)
0531ce1d
XL
1495#[inline]
1496#[target_feature(enable = "avx")]
1497#[cfg_attr(test, assert_instr(vmovaps))] // FIXME vmovdqa expected
83c7162d 1498#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1499pub unsafe fn _mm256_store_si256(mem_addr: *mut __m256i, a: __m256i) {
1500 *mem_addr = a;
1501}
1502
532ac7d7 1503/// Loads 256-bits of integer data from memory into result.
0531ce1d 1504/// `mem_addr` does not need to be aligned on any particular boundary.
83c7162d
XL
1505///
1506/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_loadu_si256)
0531ce1d
XL
1507#[inline]
1508#[target_feature(enable = "avx")]
1509#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected
83c7162d 1510#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1511pub unsafe fn _mm256_loadu_si256(mem_addr: *const __m256i) -> __m256i {
1512 let mut dst = _mm256_undefined_si256();
1513 ptr::copy_nonoverlapping(
1514 mem_addr as *const u8,
1515 &mut dst as *mut __m256i as *mut u8,
1516 mem::size_of::<__m256i>(),
1517 );
1518 dst
1519}
1520
532ac7d7 1521/// Stores 256-bits of integer data from `a` into memory.
0531ce1d 1522/// `mem_addr` does not need to be aligned on any particular boundary.
83c7162d
XL
1523///
1524/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_storeu_si256)
0531ce1d
XL
1525#[inline]
1526#[target_feature(enable = "avx")]
1527#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected
83c7162d 1528#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1529pub unsafe fn _mm256_storeu_si256(mem_addr: *mut __m256i, a: __m256i) {
1530 storeudq256(mem_addr as *mut i8, a.as_i8x32());
1531}
1532
532ac7d7 1533/// Loads packed double-precision (64-bit) floating-point elements from memory
0531ce1d
XL
1534/// into result using `mask` (elements are zeroed out when the high bit of the
1535/// corresponding element is not set).
83c7162d
XL
1536///
1537/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskload_pd)
0531ce1d
XL
1538#[inline]
1539#[target_feature(enable = "avx")]
1540#[cfg_attr(test, assert_instr(vmaskmovpd))]
83c7162d 1541#[stable(feature = "simd_x86", since = "1.27.0")]
0731742a 1542pub unsafe fn _mm256_maskload_pd(mem_addr: *const f64, mask: __m256i) -> __m256d {
0531ce1d
XL
1543 maskloadpd256(mem_addr as *const i8, mask.as_i64x4())
1544}
1545
532ac7d7 1546/// Stores packed double-precision (64-bit) floating-point elements from `a`
0531ce1d 1547/// into memory using `mask`.
83c7162d
XL
1548///
1549/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskstore_pd)
0531ce1d
XL
1550#[inline]
1551#[target_feature(enable = "avx")]
1552#[cfg_attr(test, assert_instr(vmaskmovpd))]
83c7162d 1553#[stable(feature = "simd_x86", since = "1.27.0")]
0731742a 1554pub unsafe fn _mm256_maskstore_pd(mem_addr: *mut f64, mask: __m256i, a: __m256d) {
0531ce1d
XL
1555 maskstorepd256(mem_addr as *mut i8, mask.as_i64x4(), a);
1556}
1557
532ac7d7 1558/// Loads packed double-precision (64-bit) floating-point elements from memory
0531ce1d
XL
1559/// into result using `mask` (elements are zeroed out when the high bit of the
1560/// corresponding element is not set).
83c7162d
XL
1561///
1562/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskload_pd)
0531ce1d
XL
1563#[inline]
1564#[target_feature(enable = "avx")]
1565#[cfg_attr(test, assert_instr(vmaskmovpd))]
83c7162d 1566#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1567pub unsafe fn _mm_maskload_pd(mem_addr: *const f64, mask: __m128i) -> __m128d {
1568 maskloadpd(mem_addr as *const i8, mask.as_i64x2())
1569}
1570
532ac7d7 1571/// Stores packed double-precision (64-bit) floating-point elements from `a`
0531ce1d 1572/// into memory using `mask`.
83c7162d
XL
1573///
1574/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskstore_pd)
0531ce1d
XL
1575#[inline]
1576#[target_feature(enable = "avx")]
1577#[cfg_attr(test, assert_instr(vmaskmovpd))]
83c7162d 1578#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1579pub unsafe fn _mm_maskstore_pd(mem_addr: *mut f64, mask: __m128i, a: __m128d) {
1580 maskstorepd(mem_addr as *mut i8, mask.as_i64x2(), a);
1581}
1582
532ac7d7 1583/// Loads packed single-precision (32-bit) floating-point elements from memory
0531ce1d
XL
1584/// into result using `mask` (elements are zeroed out when the high bit of the
1585/// corresponding element is not set).
83c7162d
XL
1586///
1587/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskload_ps)
0531ce1d
XL
1588#[inline]
1589#[target_feature(enable = "avx")]
1590#[cfg_attr(test, assert_instr(vmaskmovps))]
83c7162d 1591#[stable(feature = "simd_x86", since = "1.27.0")]
0731742a 1592pub unsafe fn _mm256_maskload_ps(mem_addr: *const f32, mask: __m256i) -> __m256 {
0531ce1d
XL
1593 maskloadps256(mem_addr as *const i8, mask.as_i32x8())
1594}
1595
532ac7d7 1596/// Stores packed single-precision (32-bit) floating-point elements from `a`
0531ce1d 1597/// into memory using `mask`.
83c7162d
XL
1598///
1599/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskstore_ps)
0531ce1d
XL
1600#[inline]
1601#[target_feature(enable = "avx")]
1602#[cfg_attr(test, assert_instr(vmaskmovps))]
83c7162d 1603#[stable(feature = "simd_x86", since = "1.27.0")]
0731742a 1604pub unsafe fn _mm256_maskstore_ps(mem_addr: *mut f32, mask: __m256i, a: __m256) {
0531ce1d
XL
1605 maskstoreps256(mem_addr as *mut i8, mask.as_i32x8(), a);
1606}
1607
532ac7d7 1608/// Loads packed single-precision (32-bit) floating-point elements from memory
0531ce1d
XL
1609/// into result using `mask` (elements are zeroed out when the high bit of the
1610/// corresponding element is not set).
83c7162d
XL
1611///
1612/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskload_ps)
0531ce1d
XL
1613#[inline]
1614#[target_feature(enable = "avx")]
1615#[cfg_attr(test, assert_instr(vmaskmovps))]
83c7162d 1616#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1617pub unsafe fn _mm_maskload_ps(mem_addr: *const f32, mask: __m128i) -> __m128 {
1618 maskloadps(mem_addr as *const i8, mask.as_i32x4())
1619}
1620
532ac7d7 1621/// Stores packed single-precision (32-bit) floating-point elements from `a`
0531ce1d 1622/// into memory using `mask`.
83c7162d
XL
1623///
1624/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskstore_ps)
0531ce1d
XL
1625#[inline]
1626#[target_feature(enable = "avx")]
1627#[cfg_attr(test, assert_instr(vmaskmovps))]
83c7162d 1628#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1629pub unsafe fn _mm_maskstore_ps(mem_addr: *mut f32, mask: __m128i, a: __m128) {
1630 maskstoreps(mem_addr as *mut i8, mask.as_i32x4(), a);
1631}
1632
1633/// Duplicates odd-indexed single-precision (32-bit) floating-point elements
532ac7d7 1634/// from `a`, and returns the results.
83c7162d
XL
1635///
1636/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_movehdup_ps)
0531ce1d
XL
1637#[inline]
1638#[target_feature(enable = "avx")]
1639#[cfg_attr(test, assert_instr(vmovshdup))]
83c7162d 1640#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 1641pub unsafe fn _mm256_movehdup_ps(a: __m256) -> __m256 {
17df50a5 1642 simd_shuffle8!(a, a, [1, 1, 3, 3, 5, 5, 7, 7])
0531ce1d
XL
1643}
1644
1645/// Duplicates even-indexed single-precision (32-bit) floating-point elements
532ac7d7 1646/// from `a`, and returns the results.
83c7162d
XL
1647///
1648/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_moveldup_ps)
0531ce1d
XL
1649#[inline]
1650#[target_feature(enable = "avx")]
1651#[cfg_attr(test, assert_instr(vmovsldup))]
83c7162d 1652#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 1653pub unsafe fn _mm256_moveldup_ps(a: __m256) -> __m256 {
17df50a5 1654 simd_shuffle8!(a, a, [0, 0, 2, 2, 4, 4, 6, 6])
0531ce1d
XL
1655}
1656
1657/// Duplicates even-indexed double-precision (64-bit) floating-point elements
e1599b0c 1658/// from `a`, and returns the results.
83c7162d
XL
1659///
1660/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_movedup_pd)
0531ce1d
XL
1661#[inline]
1662#[target_feature(enable = "avx")]
1663#[cfg_attr(test, assert_instr(vmovddup))]
83c7162d 1664#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 1665pub unsafe fn _mm256_movedup_pd(a: __m256d) -> __m256d {
17df50a5 1666 simd_shuffle4!(a, a, [0, 0, 2, 2])
0531ce1d
XL
1667}
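// Editor's sketch (not upstream code): `movehdup` copies each odd-indexed
// lane over its even-indexed neighbour.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn movehdup_ps_sketch() {
    let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
    let r = _mm256_movehdup_ps(a);
    let mut out = [0.0_f32; 8];
    _mm256_storeu_ps(out.as_mut_ptr(), r);
    assert_eq!(out, [2.0, 2.0, 4.0, 4.0, 6.0, 6.0, 8.0, 8.0]);
}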
1668
532ac7d7 1669/// Loads 256-bits of integer data from unaligned memory into result.
0531ce1d
XL
1670/// This intrinsic may perform better than `_mm256_loadu_si256` when the
1671/// data crosses a cache line boundary.
83c7162d
XL
1672///
1673/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_lddqu_si256)
0531ce1d
XL
1674#[inline]
1675#[target_feature(enable = "avx")]
1676#[cfg_attr(test, assert_instr(vlddqu))]
83c7162d 1677#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 1678pub unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i {
532ac7d7 1679 transmute(vlddqu(mem_addr as *const i8))
0531ce1d
XL
1680}
1681
1682/// Moves integer data from a 256-bit integer vector to a 32-byte
1683/// aligned memory location. To minimize caching, the data is flagged as
1684/// non-temporal (unlikely to be used again soon).
83c7162d
XL
1685///
1686/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_stream_si256)
0531ce1d
XL
1687#[inline]
1688#[target_feature(enable = "avx")]
1689#[cfg_attr(test, assert_instr(vmovntps))] // FIXME vmovntdq
83c7162d 1690#[stable(feature = "simd_x86", since = "1.27.0")]
a1dfa0c6
XL
1691pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) {
1692 intrinsics::nontemporal_store(mem_addr, a);
0531ce1d
XL
1693}
1694
83c7162d 1695/// Moves double-precision values from a 256-bit vector of `[4 x double]`
0531ce1d
XL
1696/// to a 32-byte aligned memory location. To minimize caching, the data is
1697/// flagged as non-temporal (unlikely to be used again soon).
83c7162d
XL
1698///
1699/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_stream_pd)
0531ce1d
XL
1700#[inline]
1701#[target_feature(enable = "avx")]
1702#[cfg_attr(test, assert_instr(vmovntps))] // FIXME vmovntpd
83c7162d 1703#[stable(feature = "simd_x86", since = "1.27.0")]
48663c56 1704#[allow(clippy::cast_ptr_alignment)]
a1dfa0c6
XL
1705pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) {
1706 intrinsics::nontemporal_store(mem_addr as *mut __m256d, a);
0531ce1d
XL
1707}
1708
1709/// Moves single-precision floating point values from a 256-bit vector
83c7162d 1710/// of `[8 x float]` to a 32-byte aligned memory location. To minimize
0531ce1d
XL
1711/// caching, the data is flagged as non-temporal (unlikely to be used again
1712/// soon).
83c7162d
XL
1713///
1714/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_stream_ps)
0531ce1d
XL
1715#[inline]
1716#[target_feature(enable = "avx")]
1717#[cfg_attr(test, assert_instr(vmovntps))]
83c7162d 1718#[stable(feature = "simd_x86", since = "1.27.0")]
48663c56 1719#[allow(clippy::cast_ptr_alignment)]
a1dfa0c6
XL
1720pub unsafe fn _mm256_stream_ps(mem_addr: *mut f32, a: __m256) {
1721 intrinsics::nontemporal_store(mem_addr as *mut __m256, a);
0531ce1d
XL
1722}
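// Editor's sketch (not upstream code): non-temporal stores still need a
// 32-byte aligned destination; a fence is issued before the data is reread.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn stream_ps_sketch() {
    #[repr(align(32))]
    struct Aligned([f32; 8]);

    let mut dst = Aligned([0.0; 8]);
    _mm256_stream_ps(dst.0.as_mut_ptr(), _mm256_set1_ps(9.0));
    _mm_sfence();
    assert!(dst.0.iter().all(|&x| x == 9.0));
}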
1723
532ac7d7
XL
1724/// Computes the approximate reciprocal of packed single-precision (32-bit)
1725/// floating-point elements in `a`, and returns the results. The maximum
0531ce1d 1726/// relative error for this approximation is less than 1.5*2^-12.
83c7162d
XL
1727///
1728/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rcp_ps)
0531ce1d
XL
1729#[inline]
1730#[target_feature(enable = "avx")]
1731#[cfg_attr(test, assert_instr(vrcpps))]
83c7162d 1732#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1733pub unsafe fn _mm256_rcp_ps(a: __m256) -> __m256 {
1734 vrcpps(a)
1735}
1736
532ac7d7
XL
1737/// Computes the approximate reciprocal square root of packed single-precision
1738/// (32-bit) floating-point elements in `a`, and returns the results.
0531ce1d 1739/// The maximum relative error for this approximation is less than 1.5*2^-12.
83c7162d
XL
1740///
1741/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rsqrt_ps)
0531ce1d
XL
1742#[inline]
1743#[target_feature(enable = "avx")]
1744#[cfg_attr(test, assert_instr(vrsqrtps))]
83c7162d 1745#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1746pub unsafe fn _mm256_rsqrt_ps(a: __m256) -> __m256 {
1747 vrsqrtps(a)
1748}
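// Editor's sketch (not upstream code): the estimate is only approximate, so
// the check allows slack well above the documented 1.5*2^-12 relative error.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn rcp_ps_sketch() {
    let r = _mm256_rcp_ps(_mm256_set1_ps(4.0));
    let mut out = [0.0_f32; 8];
    _mm256_storeu_ps(out.as_mut_ptr(), r);
    assert!(out.iter().all(|&x| x > 0.249 && x < 0.251));
}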
1749
532ac7d7 1750/// Unpacks and interleaves double-precision (64-bit) floating-point elements
0531ce1d 1751/// from the high half of each 128-bit lane in `a` and `b`.
83c7162d
XL
1752///
1753/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_pd)
0531ce1d
XL
1754#[inline]
1755#[target_feature(enable = "avx")]
1756#[cfg_attr(test, assert_instr(vunpckhpd))]
83c7162d 1757#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 1758pub unsafe fn _mm256_unpackhi_pd(a: __m256d, b: __m256d) -> __m256d {
17df50a5 1759 simd_shuffle4!(a, b, [1, 5, 3, 7])
0531ce1d
XL
1760}
1761
532ac7d7 1762/// Unpacks and interleaves single-precision (32-bit) floating-point elements
0531ce1d 1763/// from the high half of each 128-bit lane in `a` and `b`.
83c7162d
XL
1764///
1765/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_ps)
0531ce1d
XL
1766#[inline]
1767#[target_feature(enable = "avx")]
1768#[cfg_attr(test, assert_instr(vunpckhps))]
83c7162d 1769#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 1770pub unsafe fn _mm256_unpackhi_ps(a: __m256, b: __m256) -> __m256 {
17df50a5 1771 simd_shuffle8!(a, b, [2, 10, 3, 11, 6, 14, 7, 15])
0531ce1d
XL
1772}
1773
532ac7d7 1774/// Unpacks and interleaves double-precision (64-bit) floating-point elements
0531ce1d 1775/// from the low half of each 128-bit lane in `a` and `b`.
83c7162d
XL
1776///
1777/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_pd)
0531ce1d
XL
1778#[inline]
1779#[target_feature(enable = "avx")]
1780#[cfg_attr(test, assert_instr(vunpcklpd))]
83c7162d 1781#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 1782pub unsafe fn _mm256_unpacklo_pd(a: __m256d, b: __m256d) -> __m256d {
17df50a5 1783 simd_shuffle4!(a, b, [0, 4, 2, 6])
0531ce1d
XL
1784}
1785
532ac7d7 1786/// Unpacks and interleaves single-precision (32-bit) floating-point elements
0531ce1d 1787/// from the low half of each 128-bit lane in `a` and `b`.
83c7162d
XL
1788///
1789/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_ps)
0531ce1d
XL
1790#[inline]
1791#[target_feature(enable = "avx")]
1792#[cfg_attr(test, assert_instr(vunpcklps))]
83c7162d 1793#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 1794pub unsafe fn _mm256_unpacklo_ps(a: __m256, b: __m256) -> __m256 {
17df50a5 1795 simd_shuffle8!(a, b, [0, 8, 1, 9, 4, 12, 5, 13])
0531ce1d
XL
1796}
1797
532ac7d7 1798/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
0531ce1d 1799/// `b`, and sets `ZF` to 1 if the result is zero, otherwise sets `ZF` to 0.
532ac7d7 1800/// Computes the bitwise NOT of `a` and then AND with `b`, and sets `CF` to 1 if
0531ce1d 1801/// the result is zero, otherwise sets `CF` to 0. Returns the `ZF` value.
83c7162d
XL
1802///
1803/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_testz_si256)
0531ce1d
XL
1804#[inline]
1805#[target_feature(enable = "avx")]
1806#[cfg_attr(test, assert_instr(vptest))]
83c7162d 1807#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1808pub unsafe fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 {
1809 ptestz256(a.as_i64x4(), b.as_i64x4())
1810}
1811
532ac7d7 1812/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
0531ce1d 1813/// `b`, and sets `ZF` to 1 if the result is zero, otherwise sets `ZF` to 0.
532ac7d7 1814/// Computes the bitwise NOT of `a` and then AND with `b`, and sets `CF` to 1 if
0531ce1d 1815/// the result is zero, otherwise sets `CF` to 0. Returns the `CF` value.
83c7162d
XL
1816///
1817/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_testc_si256)
0531ce1d
XL
1818#[inline]
1819#[target_feature(enable = "avx")]
1820#[cfg_attr(test, assert_instr(vptest))]
83c7162d 1821#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1822pub unsafe fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 {
1823 ptestc256(a.as_i64x4(), b.as_i64x4())
1824}
1825
532ac7d7 1826/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
0531ce1d 1827/// `b`, and sets `ZF` to 1 if the result is zero, otherwise sets `ZF` to 0.
532ac7d7 1828/// Computes the bitwise NOT of `a` and then AND with `b`, and sets `CF` to 1 if
0531ce1d
XL
1829/// the result is zero, otherwise sets `CF` to 0. Returns 1 if both the `ZF` and
1830/// `CF` values are zero, otherwise returns 0.
83c7162d
XL
1831///
1832/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_testnzc_si256)
0531ce1d
XL
1833#[inline]
1834#[target_feature(enable = "avx")]
1835#[cfg_attr(test, assert_instr(vptest))]
83c7162d 1836#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1837pub unsafe fn _mm256_testnzc_si256(a: __m256i, b: __m256i) -> i32 {
1838 ptestnzc256(a.as_i64x4(), b.as_i64x4())
1839}
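// Editor's sketch (not upstream code): `ZF` reports whether `a & b` is all
// zero and `CF` whether `!a & b` is all zero.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn ptest_si256_sketch() {
    let a = _mm256_setr_epi32(1, 0, 0, 0, 0, 0, 0, 0);
    let b = _mm256_setr_epi32(0, 2, 0, 0, 0, 0, 0, 0);
    assert_eq!(_mm256_testz_si256(a, b), 1); // a & b == 0
    assert_eq!(_mm256_testc_si256(a, b), 0); // !a & b != 0
    assert_eq!(_mm256_testc_si256(b, b), 1); // !b & b == 0
}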
1840
532ac7d7 1841/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
0531ce1d
XL
1842/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
1843/// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
1844/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
1845/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
1846/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
1847/// is zero, otherwise sets `CF` to 0. Returns the `ZF` value.
83c7162d
XL
1848///
1849/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_testz_pd)
0531ce1d
XL
1850#[inline]
1851#[target_feature(enable = "avx")]
1852#[cfg_attr(test, assert_instr(vtestpd))]
83c7162d 1853#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1854pub unsafe fn _mm256_testz_pd(a: __m256d, b: __m256d) -> i32 {
1855 vtestzpd256(a, b)
1856}
1857
532ac7d7 1858/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
0531ce1d
XL
1859/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
1860/// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
1861/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
1862/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
1863/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
1864/// is zero, otherwise sets `CF` to 0. Returns the `CF` value.
83c7162d
XL
1865///
1866/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_testc_pd)
0531ce1d
XL
1867#[inline]
1868#[target_feature(enable = "avx")]
1869#[cfg_attr(test, assert_instr(vtestpd))]
83c7162d 1870#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1871pub unsafe fn _mm256_testc_pd(a: __m256d, b: __m256d) -> i32 {
1872 vtestcpd256(a, b)
1873}
1874
532ac7d7 1875/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
0531ce1d
XL
1876/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
1877/// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
1878/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
1879/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
1880/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
1881/// is zero, otherwise sets `CF` to 0. Returns 1 if both the `ZF` and `CF` values
1882/// are zero, otherwise returns 0.
83c7162d
XL
1883///
1884/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_testnzc_pd)
0531ce1d
XL
1885#[inline]
1886#[target_feature(enable = "avx")]
1887#[cfg_attr(test, assert_instr(vtestpd))]
83c7162d 1888#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1889pub unsafe fn _mm256_testnzc_pd(a: __m256d, b: __m256d) -> i32 {
1890 vtestnzcpd256(a, b)
1891}
1892
532ac7d7 1893/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
0531ce1d
XL
1894/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
1895/// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
1896/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
1897/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
1898/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
1899/// is zero, otherwise sets `CF` to 0. Returns the `ZF` value.
83c7162d
XL
1900///
1901/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_pd)
0531ce1d
XL
1902#[inline]
1903#[target_feature(enable = "avx")]
1904#[cfg_attr(test, assert_instr(vtestpd))]
83c7162d 1905#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1906pub unsafe fn _mm_testz_pd(a: __m128d, b: __m128d) -> i32 {
1907 vtestzpd(a, b)
1908}
1909
532ac7d7 1910/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
0531ce1d
XL
1911/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
1912/// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
1913/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
1914/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
1915/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
1916/// is zero, otherwise sets `CF` to 0. Returns the `CF` value.
83c7162d
XL
1917///
1918/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_pd)
0531ce1d
XL
1919#[inline]
1920#[target_feature(enable = "avx")]
1921#[cfg_attr(test, assert_instr(vtestpd))]
83c7162d 1922#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1923pub unsafe fn _mm_testc_pd(a: __m128d, b: __m128d) -> i32 {
1924 vtestcpd(a, b)
1925}
1926
532ac7d7 1927/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
0531ce1d
XL
1928/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
1929/// value, and sets `ZF` to 1 if the sign bit of each 64-bit element in the
1930/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
1931/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
1932/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
1933/// is zero, otherwise sets `CF` to 0. Returns 1 if both the `ZF` and `CF` values
1934/// are zero, otherwise returns 0.
83c7162d
XL
1935///
1936/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testnzc_pd)
0531ce1d
XL
1937#[inline]
1938#[target_feature(enable = "avx")]
1939#[cfg_attr(test, assert_instr(vtestpd))]
83c7162d 1940#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1941pub unsafe fn _mm_testnzc_pd(a: __m128d, b: __m128d) -> i32 {
1942 vtestnzcpd(a, b)
1943}
1944
532ac7d7 1945/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
0531ce1d
XL
1946/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
1947/// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the
1948/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
1949/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
1950/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
1951/// is zero, otherwise sets `CF` to 0. Returns the `ZF` value.
83c7162d
XL
1952///
1953/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_testz_ps)
0531ce1d
XL
1954#[inline]
1955#[target_feature(enable = "avx")]
1956#[cfg_attr(test, assert_instr(vtestps))]
83c7162d 1957#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1958pub unsafe fn _mm256_testz_ps(a: __m256, b: __m256) -> i32 {
1959 vtestzps256(a, b)
1960}
1961
532ac7d7 1962/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
0531ce1d
XL
1963/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
1964/// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the
1965/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
1966/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
1967/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
1968/// is zero, otherwise sets `CF` to 0. Returns the `CF` value.
83c7162d
XL
1969///
1970/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_testc_ps)
0531ce1d
XL
1971#[inline]
1972#[target_feature(enable = "avx")]
1973#[cfg_attr(test, assert_instr(vtestps))]
83c7162d 1974#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1975pub unsafe fn _mm256_testc_ps(a: __m256, b: __m256) -> i32 {
1976 vtestcps256(a, b)
1977}
1978
532ac7d7 1979/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
0531ce1d
XL
1980/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
1981/// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the
1982/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
1983/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
1984/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
1985/// is zero, otherwise sets `CF` to 0. Returns 1 if both the `ZF` and `CF` values
1986/// are zero, otherwise returns 0.
83c7162d
XL
1987///
1988/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_testnzc_ps)
0531ce1d
XL
1989#[inline]
1990#[target_feature(enable = "avx")]
1991#[cfg_attr(test, assert_instr(vtestps))]
83c7162d 1992#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1993pub unsafe fn _mm256_testnzc_ps(a: __m256, b: __m256) -> i32 {
1994 vtestnzcps256(a, b)
1995}
1996
532ac7d7 1997/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
0531ce1d
XL
1998/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
1999/// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the
2000/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2001/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
2002/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2003/// is zero, otherwise sets `CF` to 0. Returns the `ZF` value.
83c7162d
XL
2004///
2005/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_ps)
0531ce1d
XL
2006#[inline]
2007#[target_feature(enable = "avx")]
2008#[cfg_attr(test, assert_instr(vtestps))]
83c7162d 2009#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
2010pub unsafe fn _mm_testz_ps(a: __m128, b: __m128) -> i32 {
2011 vtestzps(a, b)
2012}
2013
532ac7d7 2014/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
0531ce1d
XL
2015/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2016/// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the
2017/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2018/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
2019/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2020/// is zero, otherwise sets `CF` to 0. Returns the `CF` value.
83c7162d
XL
2021///
2022/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_ps)
0531ce1d
XL
2023#[inline]
2024#[target_feature(enable = "avx")]
2025#[cfg_attr(test, assert_instr(vtestps))]
83c7162d 2026#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
2027pub unsafe fn _mm_testc_ps(a: __m128, b: __m128) -> i32 {
2028 vtestcps(a, b)
2029}
2030
532ac7d7 2031/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
0531ce1d
XL
2032/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2033/// value, and sets `ZF` to 1 if the sign bit of each 32-bit element in the
2034/// intermediate value is zero, otherwise sets `ZF` to 0. Computes the bitwise
2035/// NOT of `a` and then AND with `b`, producing an intermediate value, and sets
2036/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2037/// is zero, otherwise sets `CF` to 0. Returns 1 if both the `ZF` and `CF` values
2038/// are zero, otherwise returns 0.
83c7162d
XL
2039///
2040/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testnzc_ps)
0531ce1d
XL
2041#[inline]
2042#[target_feature(enable = "avx")]
2043#[cfg_attr(test, assert_instr(vtestps))]
83c7162d 2044#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
2045pub unsafe fn _mm_testnzc_ps(a: __m128, b: __m128) -> i32 {
2046 vtestnzcps(a, b)
2047}
2048
532ac7d7 2049/// Sets each bit of the returned mask based on the most significant bit of the
0531ce1d
XL
2050/// corresponding packed double-precision (64-bit) floating-point element in
2051/// `a`.
83c7162d
XL
2052///
2053/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_movemask_pd)
0531ce1d
XL
2054#[inline]
2055#[target_feature(enable = "avx")]
2056#[cfg_attr(test, assert_instr(vmovmskpd))]
83c7162d 2057#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
2058pub unsafe fn _mm256_movemask_pd(a: __m256d) -> i32 {
2059 movmskpd256(a)
2060}
2061
532ac7d7 2062/// Sets each bit of the returned mask based on the most significant bit of the
0531ce1d
XL
2063/// corresponding packed single-precision (32-bit) floating-point element in
2064/// `a`.
83c7162d
XL
2065///
2066/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_movemask_ps)
0531ce1d
XL
2067#[inline]
2068#[target_feature(enable = "avx")]
2069#[cfg_attr(test, assert_instr(vmovmskps))]
83c7162d 2070#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
2071pub unsafe fn _mm256_movemask_ps(a: __m256) -> i32 {
2072 movmskps256(a)
2073}
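// Editor's sketch (not upstream code): bit i of the result is the sign bit
// of lane i, so lanes 0, 2 and 7 below set bits 0, 2 and 7.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn movemask_ps_sketch() {
    let a = _mm256_setr_ps(-1.0, 1.0, -1.0, 1.0, 1.0, 1.0, 1.0, -1.0);
    assert_eq!(_mm256_movemask_ps(a), 0b1000_0101);
}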
2074
532ac7d7 2075/// Returns a vector of type __m256d with all elements set to zero.
83c7162d
XL
2076///
2077/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setzero_pd)
0531ce1d
XL
2078#[inline]
2079#[target_feature(enable = "avx")]
2080#[cfg_attr(test, assert_instr(vxorps))] // FIXME vxorpd expected
83c7162d 2081#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
2082pub unsafe fn _mm256_setzero_pd() -> __m256d {
2083 _mm256_set1_pd(0.0)
2084}
2085
532ac7d7 2086/// Returns a vector of type __m256 with all elements set to zero.
83c7162d
XL
2087///
2088/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setzero_ps)
0531ce1d
XL
2089#[inline]
2090#[target_feature(enable = "avx")]
2091#[cfg_attr(test, assert_instr(vxorps))]
83c7162d 2092#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
2093pub unsafe fn _mm256_setzero_ps() -> __m256 {
2094 _mm256_set1_ps(0.0)
2095}
2096
532ac7d7 2097/// Returns a vector of type __m256i with all elements set to zero.
83c7162d
XL
2098///
2099/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setzero_si256)
0531ce1d
XL
2100#[inline]
2101#[target_feature(enable = "avx")]
2102#[cfg_attr(test, assert_instr(vxor))]
83c7162d 2103#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
2104pub unsafe fn _mm256_setzero_si256() -> __m256i {
2105 _mm256_set1_epi8(0)
2106}
2107
532ac7d7 2108/// Sets packed double-precision (64-bit) floating-point elements in returned
0531ce1d 2109/// vector with the supplied values.
83c7162d
XL
2110///
2111/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_pd)
0531ce1d
XL
2112#[inline]
2113#[target_feature(enable = "avx")]
2114// This intrinsic has no corresponding instruction.
2115#[cfg_attr(test, assert_instr(vinsertf128))]
83c7162d 2116#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
2117pub unsafe fn _mm256_set_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
2118 _mm256_setr_pd(d, c, b, a)
2119}
2120
532ac7d7 2121/// Sets packed single-precision (32-bit) floating-point elements in returned
0531ce1d 2122/// vector with the supplied values.
83c7162d
XL
2123///
2124/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_ps)
0531ce1d
XL
2125#[inline]
2126#[target_feature(enable = "avx")]
2127// This intrinsic has no corresponding instruction.
83c7162d 2128#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2129pub unsafe fn _mm256_set_ps(
0731742a
XL
2130 a: f32,
2131 b: f32,
2132 c: f32,
2133 d: f32,
2134 e: f32,
2135 f: f32,
2136 g: f32,
2137 h: f32,
0531ce1d
XL
2138) -> __m256 {
2139 _mm256_setr_ps(h, g, f, e, d, c, b, a)
2140}
2141
3c0e092e 2142/// Sets packed 8-bit integers in returned vector with the supplied values.
83c7162d
XL
2143///
2144/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_epi8)
0531ce1d
XL
2145#[inline]
2146#[target_feature(enable = "avx")]
2147// This intrinsic has no corresponding instruction.
83c7162d 2148#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2149pub unsafe fn _mm256_set_epi8(
0731742a
XL
2150 e00: i8,
2151 e01: i8,
2152 e02: i8,
2153 e03: i8,
2154 e04: i8,
2155 e05: i8,
2156 e06: i8,
2157 e07: i8,
2158 e08: i8,
2159 e09: i8,
2160 e10: i8,
2161 e11: i8,
2162 e12: i8,
2163 e13: i8,
2164 e14: i8,
2165 e15: i8,
2166 e16: i8,
2167 e17: i8,
2168 e18: i8,
2169 e19: i8,
2170 e20: i8,
2171 e21: i8,
2172 e22: i8,
2173 e23: i8,
2174 e24: i8,
2175 e25: i8,
2176 e26: i8,
2177 e27: i8,
2178 e28: i8,
2179 e29: i8,
2180 e30: i8,
2181 e31: i8,
0531ce1d 2182) -> __m256i {
0731742a 2183 #[rustfmt::skip]
0531ce1d
XL
2184 _mm256_setr_epi8(
2185 e31, e30, e29, e28, e27, e26, e25, e24,
2186 e23, e22, e21, e20, e19, e18, e17, e16,
2187 e15, e14, e13, e12, e11, e10, e09, e08,
2188 e07, e06, e05, e04, e03, e02, e01, e00,
2189 )
2190}
2191
532ac7d7 2192/// Sets packed 16-bit integers in returned vector with the supplied values.
83c7162d
XL
2193///
2194/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_epi16)
0531ce1d
XL
2195#[inline]
2196#[target_feature(enable = "avx")]
2197// This intrinsic has no corresponding instruction.
83c7162d 2198#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2199pub unsafe fn _mm256_set_epi16(
0731742a
XL
2200 e00: i16,
2201 e01: i16,
2202 e02: i16,
2203 e03: i16,
2204 e04: i16,
2205 e05: i16,
2206 e06: i16,
2207 e07: i16,
2208 e08: i16,
2209 e09: i16,
2210 e10: i16,
2211 e11: i16,
2212 e12: i16,
2213 e13: i16,
2214 e14: i16,
2215 e15: i16,
0531ce1d 2216) -> __m256i {
0731742a 2217 #[rustfmt::skip]
0531ce1d
XL
2218 _mm256_setr_epi16(
2219 e15, e14, e13, e12,
2220 e11, e10, e09, e08,
2221 e07, e06, e05, e04,
2222 e03, e02, e01, e00,
2223 )
2224}
2225
532ac7d7 2226/// Sets packed 32-bit integers in returned vector with the supplied values.
83c7162d
XL
2227///
2228/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_epi32)
0531ce1d
XL
2229#[inline]
2230#[target_feature(enable = "avx")]
2231// This intrinsic has no corresponding instruction.
83c7162d 2232#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2233pub unsafe fn _mm256_set_epi32(
0731742a
XL
2234 e0: i32,
2235 e1: i32,
2236 e2: i32,
2237 e3: i32,
2238 e4: i32,
2239 e5: i32,
2240 e6: i32,
2241 e7: i32,
0531ce1d
XL
2242) -> __m256i {
2243 _mm256_setr_epi32(e7, e6, e5, e4, e3, e2, e1, e0)
2244}
2245
532ac7d7 2246/// Sets packed 64-bit integers in returned vector with the supplied values.
83c7162d
XL
2247///
2248/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_epi64x)
0531ce1d
XL
2249#[inline]
2250#[target_feature(enable = "avx")]
2251// This intrinsic has no corresponding instruction.
83c7162d 2252#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
2253pub unsafe fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i {
2254 _mm256_setr_epi64x(d, c, b, a)
2255}
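// Editor's sketch (not upstream code): `_mm256_set_*` takes the highest lane
// first while `_mm256_setr_*` takes the lowest lane first, so these two
// calls build the same vector.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn set_vs_setr_sketch() {
    let a = _mm256_set_epi64x(3, 2, 1, 0);
    let b = _mm256_setr_epi64x(0, 1, 2, 3);
    let mut out_a = [0i64; 4];
    let mut out_b = [0i64; 4];
    _mm256_storeu_si256(out_a.as_mut_ptr() as *mut __m256i, a);
    _mm256_storeu_si256(out_b.as_mut_ptr() as *mut __m256i, b);
    assert_eq!(out_a, [0, 1, 2, 3]);
    assert_eq!(out_a, out_b);
}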
2256
532ac7d7 2257/// Sets packed double-precision (64-bit) floating-point elements in returned
0531ce1d 2258/// vector with the supplied values in reverse order.
83c7162d
XL
2259///
2260/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_pd)
0531ce1d
XL
2261#[inline]
2262#[target_feature(enable = "avx")]
2263// This intrinsic has no corresponding instruction.
83c7162d 2264#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
2265pub unsafe fn _mm256_setr_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
2266 __m256d(a, b, c, d)
2267}
2268
532ac7d7 2269/// Sets packed single-precision (32-bit) floating-point elements in returned
0531ce1d 2270/// vector with the supplied values in reverse order.
83c7162d
XL
2271///
2272/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_ps)
0531ce1d
XL
2273#[inline]
2274#[target_feature(enable = "avx")]
2275// This intrinsic has no corresponding instruction.
83c7162d 2276#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2277pub unsafe fn _mm256_setr_ps(
0731742a
XL
2278 a: f32,
2279 b: f32,
2280 c: f32,
2281 d: f32,
2282 e: f32,
2283 f: f32,
2284 g: f32,
2285 h: f32,
0531ce1d
XL
2286) -> __m256 {
2287 __m256(a, b, c, d, e, f, g, h)
2288}
2289
532ac7d7 2290/// Sets packed 8-bit integers in returned vector with the supplied values in
0531ce1d 2291/// reverse order.
83c7162d
XL
2292///
2293/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_epi8)
0531ce1d
XL
2294#[inline]
2295#[target_feature(enable = "avx")]
2296// This intrinsic has no corresponding instruction.
83c7162d 2297#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2298pub unsafe fn _mm256_setr_epi8(
0731742a
XL
2299 e00: i8,
2300 e01: i8,
2301 e02: i8,
2302 e03: i8,
2303 e04: i8,
2304 e05: i8,
2305 e06: i8,
2306 e07: i8,
2307 e08: i8,
2308 e09: i8,
2309 e10: i8,
2310 e11: i8,
2311 e12: i8,
2312 e13: i8,
2313 e14: i8,
2314 e15: i8,
2315 e16: i8,
2316 e17: i8,
2317 e18: i8,
2318 e19: i8,
2319 e20: i8,
2320 e21: i8,
2321 e22: i8,
2322 e23: i8,
2323 e24: i8,
2324 e25: i8,
2325 e26: i8,
2326 e27: i8,
2327 e28: i8,
2328 e29: i8,
2329 e30: i8,
2330 e31: i8,
0531ce1d 2331) -> __m256i {
0731742a 2332 #[rustfmt::skip]
532ac7d7 2333 transmute(i8x32::new(
0531ce1d
XL
2334 e00, e01, e02, e03, e04, e05, e06, e07,
2335 e08, e09, e10, e11, e12, e13, e14, e15,
2336 e16, e17, e18, e19, e20, e21, e22, e23,
2337 e24, e25, e26, e27, e28, e29, e30, e31,
2338 ))
2339}
2340
532ac7d7 2341/// Sets packed 16-bit integers in returned vector with the supplied values in
0531ce1d 2342/// reverse order.
83c7162d
XL
2343///
2344/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_epi16)
0531ce1d
XL
2345#[inline]
2346#[target_feature(enable = "avx")]
2347// This intrinsic has no corresponding instruction.
83c7162d 2348#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2349pub unsafe fn _mm256_setr_epi16(
0731742a
XL
2350 e00: i16,
2351 e01: i16,
2352 e02: i16,
2353 e03: i16,
2354 e04: i16,
2355 e05: i16,
2356 e06: i16,
2357 e07: i16,
2358 e08: i16,
2359 e09: i16,
2360 e10: i16,
2361 e11: i16,
2362 e12: i16,
2363 e13: i16,
2364 e14: i16,
2365 e15: i16,
0531ce1d 2366) -> __m256i {
0731742a 2367 #[rustfmt::skip]
532ac7d7 2368 transmute(i16x16::new(
0531ce1d
XL
2369 e00, e01, e02, e03,
2370 e04, e05, e06, e07,
2371 e08, e09, e10, e11,
2372 e12, e13, e14, e15,
2373 ))
2374}
2375
532ac7d7 2376/// Sets packed 32-bit integers in returned vector with the supplied values in
0531ce1d 2377/// reverse order.
83c7162d
XL
2378///
2379/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_epi32)
0531ce1d
XL
2380#[inline]
2381#[target_feature(enable = "avx")]
2382// This intrinsic has no corresponding instruction.
83c7162d 2383#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2384pub unsafe fn _mm256_setr_epi32(
0731742a
XL
2385 e0: i32,
2386 e1: i32,
2387 e2: i32,
2388 e3: i32,
2389 e4: i32,
2390 e5: i32,
2391 e6: i32,
2392 e7: i32,
0531ce1d 2393) -> __m256i {
532ac7d7 2394 transmute(i32x8::new(e0, e1, e2, e3, e4, e5, e6, e7))
0531ce1d
XL
2395}
2396
532ac7d7 2397/// Sets packed 64-bit integers in returned vector with the supplied values in
0531ce1d 2398/// reverse order.
83c7162d
XL
2399///
2400/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_epi64x)
0531ce1d
XL
2401#[inline]
2402#[target_feature(enable = "avx")]
2403// This intrinsic has no corresponding instruction.
83c7162d 2404#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2405pub unsafe fn _mm256_setr_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i {
532ac7d7 2406 transmute(i64x4::new(a, b, c, d))
0531ce1d
XL
2407}
2408
532ac7d7 2409/// Broadcasts double-precision (64-bit) floating-point value `a` to all
0531ce1d 2410/// elements of returned vector.
83c7162d
XL
2411///
2412/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set1_pd)
0531ce1d
XL
2413#[inline]
2414#[target_feature(enable = "avx")]
2415// This intrinsic has no corresponding instruction.
83c7162d 2416#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
2417pub unsafe fn _mm256_set1_pd(a: f64) -> __m256d {
2418 _mm256_setr_pd(a, a, a, a)
2419}
2420
532ac7d7 2421/// Broadcasts single-precision (32-bit) floating-point value `a` to all
0531ce1d 2422/// elements of returned vector.
83c7162d
XL
2423///
2424/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set1_ps)
0531ce1d
XL
2425#[inline]
2426#[target_feature(enable = "avx")]
2427// This intrinsic has no corresponding instruction.
83c7162d 2428#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
2429pub unsafe fn _mm256_set1_ps(a: f32) -> __m256 {
2430 _mm256_setr_ps(a, a, a, a, a, a, a, a)
2431}
2432
532ac7d7 2433/// Broadcasts 8-bit integer `a` to all elements of returned vector.
0531ce1d 2434/// This intrinsic may generate `vpbroadcastb`.
83c7162d
XL
2435///
2436/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set1_epi8)
0531ce1d
XL
2437#[inline]
2438#[target_feature(enable = "avx")]
2439#[cfg_attr(test, assert_instr(vpshufb))]
2440#[cfg_attr(test, assert_instr(vinsertf128))]
2441// This intrinsic has no corresponding instruction.
83c7162d 2442#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2443pub unsafe fn _mm256_set1_epi8(a: i8) -> __m256i {
0731742a 2444 #[rustfmt::skip]
0531ce1d
XL
2445 _mm256_setr_epi8(
2446 a, a, a, a, a, a, a, a,
2447 a, a, a, a, a, a, a, a,
2448 a, a, a, a, a, a, a, a,
2449 a, a, a, a, a, a, a, a,
2450 )
2451}
2452
532ac7d7 2453/// Broadcasts 16-bit integer `a` to all elements of returned vector.
0531ce1d 2454/// This intrinsic may generate the `vpbroadcastw` instruction.
83c7162d
XL
2455///
2456/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set1_epi16)
0531ce1d
XL
2457#[inline]
2458#[target_feature(enable = "avx")]
2459//#[cfg_attr(test, assert_instr(vpshufb))]
2460#[cfg_attr(test, assert_instr(vinsertf128))]
2461// This intrinsic has no corresponding instruction.
83c7162d 2462#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
2463pub unsafe fn _mm256_set1_epi16(a: i16) -> __m256i {
2464 _mm256_setr_epi16(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a)
2465}
2466
532ac7d7 2467/// Broadcasts 32-bit integer `a` to all elements of returned vector.
0531ce1d 2468/// This intrinsic may generate the `vpbroadcastd` instruction.
83c7162d
XL
2469///
2470/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set1_epi32)
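///
/// # Examples
///
/// A minimal usage sketch (illustrative, not part of the upstream
/// documentation):
///
/// ```ignore
/// // Requires the `avx` target feature at run time.
/// let v = unsafe { _mm256_set1_epi32(7) };
/// // Equivalent to `_mm256_setr_epi32(7, 7, 7, 7, 7, 7, 7, 7)`.
/// ```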
0531ce1d
XL
2471#[inline]
2472#[target_feature(enable = "avx")]
2473// This intrinsic has no corresponding instruction.
83c7162d 2474#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
2475pub unsafe fn _mm256_set1_epi32(a: i32) -> __m256i {
2476 _mm256_setr_epi32(a, a, a, a, a, a, a, a)
2477}
2478
532ac7d7 2479/// Broadcasts 64-bit integer `a` to all elements of returned vector.
0531ce1d 2480/// This intrinsic may generate the `vpbroadcastq` instruction.
83c7162d
XL
2481///
2482/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set1_epi64x)
0531ce1d
XL
2483#[inline]
2484#[target_feature(enable = "avx")]
e1599b0c
XL
2485#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(vinsertf128))]
2486#[cfg_attr(all(test, target_arch = "x86"), assert_instr(vbroadcastsd))]
0531ce1d 2487// This intrinsic has no corresponding instruction.
83c7162d 2488#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
2489pub unsafe fn _mm256_set1_epi64x(a: i64) -> __m256i {
2490 _mm256_setr_epi64x(a, a, a, a)
2491}
2492
2493/// Casts vector of type __m256d to type __m256.
83c7162d
XL
2494///
2495/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castpd_ps)
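///
/// # Examples
///
/// A minimal sketch (illustrative, not part of the upstream documentation).
/// The cast reinterprets the bit pattern, it does not convert values, and it
/// compiles to no instructions:
///
/// ```ignore
/// // Requires the `avx` target feature at run time.
/// let d = unsafe { _mm256_set1_pd(1.0) };
/// let s: __m256 = unsafe { _mm256_castpd_ps(d) };
/// // `s` reinterprets the 256 bits of `d`; use `_mm256_cvtpd_ps` for a
/// // numeric conversion instead.
/// ```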
0531ce1d
XL
2496#[inline]
2497#[target_feature(enable = "avx")]
2498// This intrinsic is only used for compilation and does not generate any
2499// instructions, thus it has zero latency.
83c7162d 2500#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2501pub unsafe fn _mm256_castpd_ps(a: __m256d) -> __m256 {
532ac7d7 2502 transmute(a)
0531ce1d
XL
2503}
2504
2505/// Casts vector of type __m256 to type __m256d.
83c7162d
XL
2506///
2507/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castps_pd)
0531ce1d
XL
2508#[inline]
2509#[target_feature(enable = "avx")]
2510// This intrinsic is only used for compilation and does not generate any
2511// instructions, thus it has zero latency.
83c7162d 2512#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2513pub unsafe fn _mm256_castps_pd(a: __m256) -> __m256d {
532ac7d7 2514 transmute(a)
0531ce1d
XL
2515}
2516
2517/// Casts vector of type __m256 to type __m256i.
83c7162d
XL
2518///
2519/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castps_si256)
0531ce1d
XL
2520#[inline]
2521#[target_feature(enable = "avx")]
2522// This intrinsic is only used for compilation and does not generate any
2523// instructions, thus it has zero latency.
83c7162d 2524#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2525pub unsafe fn _mm256_castps_si256(a: __m256) -> __m256i {
532ac7d7 2526 transmute(a)
0531ce1d
XL
2527}
2528
2529/// Casts vector of type __m256i to type __m256.
83c7162d
XL
2530///
2531/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castsi256_ps)
0531ce1d
XL
2532#[inline]
2533#[target_feature(enable = "avx")]
2534// This intrinsic is only used for compilation and does not generate any
2535// instructions, thus it has zero latency.
83c7162d 2536#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2537pub unsafe fn _mm256_castsi256_ps(a: __m256i) -> __m256 {
532ac7d7 2538 transmute(a)
0531ce1d
XL
2539}
2540
2541/// Casts vector of type __m256d to type __m256i.
83c7162d
XL
2542///
2543/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castpd_si256)
0531ce1d
XL
2544#[inline]
2545#[target_feature(enable = "avx")]
2546// This intrinsic is only used for compilation and does not generate any
2547// instructions, thus it has zero latency.
83c7162d 2548#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2549pub unsafe fn _mm256_castpd_si256(a: __m256d) -> __m256i {
532ac7d7 2550 transmute(a)
0531ce1d
XL
2551}
2552
2553/// Casts vector of type __m256i to type __m256d.
83c7162d
XL
2554///
2555/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castsi256_pd)
0531ce1d
XL
2556#[inline]
2557#[target_feature(enable = "avx")]
2558// This intrinsic is only used for compilation and does not generate any
2559// instructions, thus it has zero latency.
83c7162d 2560#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2561pub unsafe fn _mm256_castsi256_pd(a: __m256i) -> __m256d {
532ac7d7 2562 transmute(a)
0531ce1d
XL
2563}
2564
2565/// Casts vector of type __m256 to type __m128.
83c7162d
XL
2566///
2567/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castps256_ps128)
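///
/// # Examples
///
/// A minimal sketch (illustrative, not part of the upstream documentation);
/// only the low 128 bits are kept:
///
/// ```ignore
/// // Requires the `avx` target feature at run time.
/// let v = unsafe { _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.) };
/// let lo = unsafe { _mm256_castps256_ps128(v) };
/// // `lo` contains [1.0, 2.0, 3.0, 4.0].
/// ```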
0531ce1d
XL
2568#[inline]
2569#[target_feature(enable = "avx")]
2570// This intrinsic is only used for compilation and does not generate any
2571// instructions, thus it has zero latency.
83c7162d 2572#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2573pub unsafe fn _mm256_castps256_ps128(a: __m256) -> __m128 {
17df50a5 2574 simd_shuffle4!(a, a, [0, 1, 2, 3])
0531ce1d
XL
2575}
2576
2577/// Casts vector of type __m256d to type __m128d.
83c7162d
XL
2578///
2579/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castpd256_pd128)
0531ce1d
XL
2580#[inline]
2581#[target_feature(enable = "avx")]
2582// This intrinsic is only used for compilation and does not generate any
2583// instructions, thus it has zero latency.
83c7162d 2584#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2585pub unsafe fn _mm256_castpd256_pd128(a: __m256d) -> __m128d {
17df50a5 2586 simd_shuffle2!(a, a, [0, 1])
0531ce1d
XL
2587}
2588
2589/// Casts vector of type __m256i to type __m128i.
83c7162d
XL
2590///
2591/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castsi256_si128)
0531ce1d
XL
2592#[inline]
2593#[target_feature(enable = "avx")]
2594// This intrinsic is only used for compilation and does not generate any
2595// instructions, thus it has zero latency.
83c7162d 2596#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
2597pub unsafe fn _mm256_castsi256_si128(a: __m256i) -> __m128i {
2598 let a = a.as_i64x4();
17df50a5 2599 let dst: i64x2 = simd_shuffle2!(a, a, [0, 1]);
532ac7d7 2600 transmute(dst)
0531ce1d
XL
2601}
2602
2603/// Casts vector of type __m128 to type __m256;
2604/// the upper 128 bits of the result are undefined.
83c7162d
XL
2605///
2606/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castps128_ps256)
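///
/// # Examples
///
/// A minimal sketch (illustrative, not part of the upstream documentation);
/// contrast with `_mm256_zextps128_ps256`, which zeroes the upper half instead
/// of leaving it undefined:
///
/// ```ignore
/// // Requires the `avx` target feature at run time.
/// let lo = unsafe { _mm_setr_ps(1., 2., 3., 4.) };
/// let v = unsafe { _mm256_castps128_ps256(lo) };
/// // The low 128 bits of `v` are [1.0, 2.0, 3.0, 4.0]; the upper 128 bits
/// // must not be relied upon.
/// ```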
0531ce1d
XL
2607#[inline]
2608#[target_feature(enable = "avx")]
2609// This intrinsic is only used for compilation and does not generate any
2610// instructions, thus it has zero latency.
83c7162d 2611#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2612pub unsafe fn _mm256_castps128_ps256(a: __m128) -> __m256 {
17df50a5
XL
2613 // FIXME simd_shuffle8!(a, a, [0, 1, 2, 3, -1, -1, -1, -1])
2614 simd_shuffle8!(a, a, [0, 1, 2, 3, 0, 0, 0, 0])
0531ce1d
XL
2615}
2616
2617/// Casts vector of type __m128d to type __m256d;
2618/// the upper 128 bits of the result are undefined.
83c7162d
XL
2619///
2620/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castpd128_pd256)
0531ce1d
XL
2621#[inline]
2622#[target_feature(enable = "avx")]
2623// This intrinsic is only used for compilation and does not generate any
2624// instructions, thus it has zero latency.
83c7162d 2625#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2626pub unsafe fn _mm256_castpd128_pd256(a: __m128d) -> __m256d {
17df50a5
XL
2627 // FIXME simd_shuffle4!(a, a, [0, 1, -1, -1])
2628 simd_shuffle4!(a, a, [0, 1, 0, 0])
0531ce1d
XL
2629}
2630
2631/// Casts vector of type __m128i to type __m256i;
2632/// the upper 128 bits of the result are undefined.
83c7162d
XL
2633///
2634/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castsi128_si256)
0531ce1d
XL
2635#[inline]
2636#[target_feature(enable = "avx")]
2637// This intrinsic is only used for compilation and does not generate any
2638// instructions, thus it has zero latency.
83c7162d 2639#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
2640pub unsafe fn _mm256_castsi128_si256(a: __m128i) -> __m256i {
2641 let a = a.as_i64x2();
17df50a5
XL
2642 // FIXME simd_shuffle4!(a, a, [0, 1, -1, -1])
2643 let dst: i64x4 = simd_shuffle4!(a, a, [0, 1, 0, 0]);
532ac7d7 2644 transmute(dst)
0531ce1d
XL
2645}
2646
83c7162d
XL
2647/// Constructs a 256-bit floating-point vector of `[8 x float]` from a
2648/// 128-bit floating-point vector of `[4 x float]`. The lower 128 bits contain
0531ce1d 2649/// the value of the source vector. The upper 128 bits are set to zero.
83c7162d
XL
2650///
2651/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_zextps128_ps256)
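///
/// # Examples
///
/// A minimal sketch (illustrative, not part of the upstream documentation):
///
/// ```ignore
/// // Requires the `avx` target feature at run time.
/// let lo = unsafe { _mm_setr_ps(1., 2., 3., 4.) };
/// let v = unsafe { _mm256_zextps128_ps256(lo) };
/// // `v` contains [1.0, 2.0, 3.0, 4.0, 0.0, 0.0, 0.0, 0.0].
/// ```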
0531ce1d
XL
2652#[inline]
2653#[target_feature(enable = "avx,sse")]
2654// This intrinsic is only used for compilation and does not generate any
2655// instructions, thus it has zero latency.
83c7162d 2656#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2657pub unsafe fn _mm256_zextps128_ps256(a: __m128) -> __m256 {
17df50a5 2658 simd_shuffle8!(a, _mm_setzero_ps(), [0, 1, 2, 3, 4, 5, 6, 7])
0531ce1d
XL
2659}
2660
2661/// Constructs a 256-bit integer vector from a 128-bit integer vector.
2662/// The lower 128 bits contain the value of the source vector. The upper
2663/// 128 bits are set to zero.
83c7162d
XL
2664///
2665/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_zextsi128_si256)
0531ce1d
XL
2666#[inline]
2667#[target_feature(enable = "avx,sse2")]
2668// This intrinsic is only used for compilation and does not generate any
2669// instructions, thus it has zero latency.
83c7162d 2670#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
2671pub unsafe fn _mm256_zextsi128_si256(a: __m128i) -> __m256i {
2672 let b = _mm_setzero_si128().as_i64x2();
17df50a5 2673 let dst: i64x4 = simd_shuffle4!(a.as_i64x2(), b, [0, 1, 2, 3]);
532ac7d7 2674 transmute(dst)
0531ce1d
XL
2675}
2676
83c7162d
XL
2677/// Constructs a 256-bit floating-point vector of `[4 x double]` from a
2678/// 128-bit floating-point vector of `[2 x double]`. The lower 128 bits
0531ce1d
XL
2679/// contain the value of the source vector. The upper 128 bits are set
2680/// to zero.
83c7162d
XL
2681///
2682/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_zextpd128_pd256)
0531ce1d
XL
2683#[inline]
2684#[target_feature(enable = "avx,sse2")]
2685// This intrinsic is only used for compilation and does not generate any
2686// instructions, thus it has zero latency.
83c7162d 2687#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2688pub unsafe fn _mm256_zextpd128_pd256(a: __m128d) -> __m256d {
17df50a5 2689 simd_shuffle4!(a, _mm_setzero_pd(), [0, 1, 2, 3])
0531ce1d
XL
2690}
2691
532ac7d7 2692/// Returns vector of type `__m256` with undefined elements.
83c7162d
XL
2693///
2694/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_undefined_ps)
0531ce1d
XL
2695#[inline]
2696#[target_feature(enable = "avx")]
2697// This intrinsic has no corresponding instruction.
83c7162d 2698#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2699pub unsafe fn _mm256_undefined_ps() -> __m256 {
3dfed10e 2700 _mm256_set1_ps(0.0)
0531ce1d
XL
2701}
2702
532ac7d7 2703/// Returns vector of type `__m256d` with undefined elements.
83c7162d
XL
2704///
2705/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_undefined_pd)
0531ce1d
XL
2706#[inline]
2707#[target_feature(enable = "avx")]
2708// This intrinsic has no corresponding instruction.
83c7162d 2709#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2710pub unsafe fn _mm256_undefined_pd() -> __m256d {
3dfed10e 2711 _mm256_set1_pd(0.0)
0531ce1d
XL
2712}
2713
532ac7d7 2714/// Returns vector of type `__m256i` with undefined elements.
83c7162d
XL
2715///
2716/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_undefined_si256)
0531ce1d
XL
2717#[inline]
2718#[target_feature(enable = "avx")]
2719// This intrinsic has no corresponding instruction.
83c7162d 2720#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2721pub unsafe fn _mm256_undefined_si256() -> __m256i {
0731742a 2722 // FIXME: this function should return MaybeUninit<__m256i>
532ac7d7 2723 mem::MaybeUninit::<__m256i>::uninit().assume_init()
0531ce1d
XL
2724}
2725
532ac7d7 2726/// Sets packed __m256 returned vector with the supplied values.
83c7162d
XL
2727///
2728/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_m128)
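///
/// # Examples
///
/// A minimal sketch (illustrative, not part of the upstream documentation);
/// `hi` becomes the upper 128 bits and `lo` the lower 128 bits:
///
/// ```ignore
/// // Requires the `avx` target feature at run time.
/// let hi = unsafe { _mm_setr_ps(5., 6., 7., 8.) };
/// let lo = unsafe { _mm_setr_ps(1., 2., 3., 4.) };
/// let v = unsafe { _mm256_set_m128(hi, lo) };
/// // `v` contains [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0].
/// ```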
0531ce1d
XL
2729#[inline]
2730#[target_feature(enable = "avx")]
2731#[cfg_attr(test, assert_instr(vinsertf128))]
83c7162d 2732#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2733pub unsafe fn _mm256_set_m128(hi: __m128, lo: __m128) -> __m256 {
17df50a5 2734 simd_shuffle8!(lo, hi, [0, 1, 2, 3, 4, 5, 6, 7])
0531ce1d
XL
2735}
2736
532ac7d7 2737/// Sets packed __m256d returned vector with the supplied values.
83c7162d
XL
2738///
2739/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_m128d)
0531ce1d
XL
2740#[inline]
2741#[target_feature(enable = "avx")]
2742#[cfg_attr(test, assert_instr(vinsertf128))]
83c7162d 2743#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2744pub unsafe fn _mm256_set_m128d(hi: __m128d, lo: __m128d) -> __m256d {
532ac7d7
XL
2745 let hi: __m128 = transmute(hi);
2746 let lo: __m128 = transmute(lo);
2747 transmute(_mm256_set_m128(hi, lo))
0531ce1d
XL
2748}
2749
532ac7d7 2750/// Sets packed __m256i returned vector with the supplied values.
83c7162d
XL
2751///
2752/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_m128i)
0531ce1d
XL
2753#[inline]
2754#[target_feature(enable = "avx")]
2755#[cfg_attr(test, assert_instr(vinsertf128))]
83c7162d 2756#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2757pub unsafe fn _mm256_set_m128i(hi: __m128i, lo: __m128i) -> __m256i {
532ac7d7
XL
2758 let hi: __m128 = transmute(hi);
2759 let lo: __m128 = transmute(lo);
2760 transmute(_mm256_set_m128(hi, lo))
0531ce1d
XL
2761}
2762
532ac7d7 2763/// Sets packed __m256 returned vector with the supplied values.
83c7162d
XL
2764///
2765/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_m128)
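///
/// # Examples
///
/// A minimal sketch (illustrative, not part of the upstream documentation);
/// this is `_mm256_set_m128` with the arguments in the opposite order:
///
/// ```ignore
/// // Requires the `avx` target feature at run time.
/// let lo = unsafe { _mm_setr_ps(1., 2., 3., 4.) };
/// let hi = unsafe { _mm_setr_ps(5., 6., 7., 8.) };
/// let v = unsafe { _mm256_setr_m128(lo, hi) };
/// // Same result as `_mm256_set_m128(hi, lo)`.
/// ```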
0531ce1d
XL
2766#[inline]
2767#[target_feature(enable = "avx")]
2768#[cfg_attr(test, assert_instr(vinsertf128))]
83c7162d 2769#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
2770pub unsafe fn _mm256_setr_m128(lo: __m128, hi: __m128) -> __m256 {
2771 _mm256_set_m128(hi, lo)
2772}
2773
532ac7d7 2774/// Sets packed __m256d returned vector with the supplied values.
83c7162d
XL
2775///
2776/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_m128d)
0531ce1d
XL
2777#[inline]
2778#[target_feature(enable = "avx")]
2779#[cfg_attr(test, assert_instr(vinsertf128))]
83c7162d 2780#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
2781pub unsafe fn _mm256_setr_m128d(lo: __m128d, hi: __m128d) -> __m256d {
2782 _mm256_set_m128d(hi, lo)
2783}
2784
532ac7d7 2785/// Sets packed __m256i returned vector with the supplied values.
83c7162d
XL
2786///
2787/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_m128i)
0531ce1d
XL
2788#[inline]
2789#[target_feature(enable = "avx")]
2790#[cfg_attr(test, assert_instr(vinsertf128))]
83c7162d 2791#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
2792pub unsafe fn _mm256_setr_m128i(lo: __m128i, hi: __m128i) -> __m256i {
2793 _mm256_set_m128i(hi, lo)
2794}
2795
532ac7d7 2796/// Loads two 128-bit values (composed of 4 packed single-precision (32-bit)
0531ce1d
XL
2797/// floating-point elements) from memory, and combines them into a 256-bit
2798/// value.
2799/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
83c7162d
XL
2800///
2801/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_loadu2_m128)
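///
/// # Examples
///
/// A minimal sketch (illustrative, not part of the upstream documentation);
/// the two pointers may be unrelated and unaligned:
///
/// ```ignore
/// // Requires the `avx` target feature at run time.
/// let lo = [1.0f32, 2.0, 3.0, 4.0];
/// let hi = [5.0f32, 6.0, 7.0, 8.0];
/// let v = unsafe { _mm256_loadu2_m128(hi.as_ptr(), lo.as_ptr()) };
/// // `v` contains [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0].
/// ```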
0531ce1d
XL
2802#[inline]
2803#[target_feature(enable = "avx,sse")]
2804// This intrinsic has no corresponding instruction.
83c7162d 2805#[stable(feature = "simd_x86", since = "1.27.0")]
0731742a 2806pub unsafe fn _mm256_loadu2_m128(hiaddr: *const f32, loaddr: *const f32) -> __m256 {
0531ce1d 2807 let a = _mm256_castps128_ps256(_mm_loadu_ps(loaddr));
17df50a5 2808 _mm256_insertf128_ps::<1>(a, _mm_loadu_ps(hiaddr))
0531ce1d
XL
2809}
2810
532ac7d7 2811/// Loads two 128-bit values (composed of 2 packed double-precision (64-bit)
0531ce1d
XL
2812/// floating-point elements) from memory, and combines them into a 256-bit
2813/// value.
2814/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
83c7162d
XL
2815///
2816/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_loadu2_m128d)
0531ce1d
XL
2817#[inline]
2818#[target_feature(enable = "avx,sse2")]
2819// This intrinsic has no corresponding instruction.
83c7162d 2820#[stable(feature = "simd_x86", since = "1.27.0")]
0731742a 2821pub unsafe fn _mm256_loadu2_m128d(hiaddr: *const f64, loaddr: *const f64) -> __m256d {
0531ce1d 2822 let a = _mm256_castpd128_pd256(_mm_loadu_pd(loaddr));
17df50a5 2823 _mm256_insertf128_pd::<1>(a, _mm_loadu_pd(hiaddr))
0531ce1d
XL
2824}
2825
532ac7d7 2826/// Loads two 128-bit values (composed of integer data) from memory, and combines
0531ce1d
XL
2827/// them into a 256-bit value.
2828/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
83c7162d
XL
2829///
2830/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_loadu2_m128i)
0531ce1d
XL
2831#[inline]
2832#[target_feature(enable = "avx,sse2")]
2833// This intrinsic has no corresponding instruction.
83c7162d 2834#[stable(feature = "simd_x86", since = "1.27.0")]
0731742a 2835pub unsafe fn _mm256_loadu2_m128i(hiaddr: *const __m128i, loaddr: *const __m128i) -> __m256i {
0531ce1d 2836 let a = _mm256_castsi128_si256(_mm_loadu_si128(loaddr));
17df50a5 2837 _mm256_insertf128_si256::<1>(a, _mm_loadu_si128(hiaddr))
0531ce1d
XL
2838}
2839
532ac7d7 2840/// Stores the high and low 128-bit halves (each composed of 4 packed
0531ce1d
XL
2841/// single-precision (32-bit) floating-point elements) from `a` into two
2842/// different 128-bit memory locations.
2843/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
83c7162d
XL
2844///
2845/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_storeu2_m128)
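///
/// # Examples
///
/// A minimal sketch (illustrative, not part of the upstream documentation):
///
/// ```ignore
/// // Requires the `avx` target feature at run time.
/// let v = unsafe { _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.) };
/// let mut lo = [0.0f32; 4];
/// let mut hi = [0.0f32; 4];
/// unsafe { _mm256_storeu2_m128(hi.as_mut_ptr(), lo.as_mut_ptr(), v) };
/// // `lo` is now [1.0, 2.0, 3.0, 4.0] and `hi` is [5.0, 6.0, 7.0, 8.0].
/// ```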
0531ce1d
XL
2846#[inline]
2847#[target_feature(enable = "avx,sse")]
2848// This intrinsic has no corresponding instruction.
83c7162d 2849#[stable(feature = "simd_x86", since = "1.27.0")]
0731742a 2850pub unsafe fn _mm256_storeu2_m128(hiaddr: *mut f32, loaddr: *mut f32, a: __m256) {
0531ce1d
XL
2851 let lo = _mm256_castps256_ps128(a);
2852 _mm_storeu_ps(loaddr, lo);
17df50a5 2853 let hi = _mm256_extractf128_ps::<1>(a);
0531ce1d
XL
2854 _mm_storeu_ps(hiaddr, hi);
2855}
2856
532ac7d7 2857/// Stores the high and low 128-bit halves (each composed of 2 packed
0531ce1d
XL
2858/// double-precision (64-bit) floating-point elements) from `a` into two
2859/// different 128-bit memory locations.
2860/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
83c7162d
XL
2861///
2862/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_storeu2_m128d)
0531ce1d
XL
2863#[inline]
2864#[target_feature(enable = "avx,sse2")]
2865// This intrinsic has no corresponding instruction.
83c7162d 2866#[stable(feature = "simd_x86", since = "1.27.0")]
0731742a 2867pub unsafe fn _mm256_storeu2_m128d(hiaddr: *mut f64, loaddr: *mut f64, a: __m256d) {
0531ce1d
XL
2868 let lo = _mm256_castpd256_pd128(a);
2869 _mm_storeu_pd(loaddr, lo);
17df50a5 2870 let hi = _mm256_extractf128_pd::<1>(a);
0531ce1d
XL
2871 _mm_storeu_pd(hiaddr, hi);
2872}
2873
532ac7d7 2874/// Stores the high and low 128-bit halves (each composed of integer data) from
0531ce1d
XL
2875/// `a` into two different 128-bit memory locations.
2876/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
83c7162d
XL
2877///
2878/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_storeu2_m128i)
0531ce1d
XL
2879#[inline]
2880#[target_feature(enable = "avx,sse2")]
2881// This intrinsic has no corresponding instruction.
83c7162d 2882#[stable(feature = "simd_x86", since = "1.27.0")]
0731742a 2883pub unsafe fn _mm256_storeu2_m128i(hiaddr: *mut __m128i, loaddr: *mut __m128i, a: __m256i) {
0531ce1d
XL
2884 let lo = _mm256_castsi256_si128(a);
2885 _mm_storeu_si128(loaddr, lo);
17df50a5 2886 let hi = _mm256_extractf128_si256::<1>(a);
0531ce1d
XL
2887 _mm_storeu_si128(hiaddr, hi);
2888}
2889
83c7162d
XL
2890/// Returns the first element of the input vector of `[8 x float]`.
2891///
2892/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtss_f32)
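///
/// # Examples
///
/// A minimal sketch (illustrative, not part of the upstream documentation):
///
/// ```ignore
/// // Requires the `avx` target feature at run time.
/// let v = unsafe { _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.) };
/// assert_eq!(unsafe { _mm256_cvtss_f32(v) }, 1.0);
/// ```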
0531ce1d
XL
2893#[inline]
2894#[target_feature(enable = "avx")]
2895//#[cfg_attr(test, assert_instr(movss))] FIXME
83c7162d 2896#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
2897pub unsafe fn _mm256_cvtss_f32(a: __m256) -> f32 {
2898 simd_extract(a, 0)
2899}
2900
ee023bcb 2901// LLVM intrinsics used in the above functions
0531ce1d
XL
2902#[allow(improper_ctypes)]
2903extern "C" {
2904 #[link_name = "llvm.x86.avx.addsub.pd.256"]
2905 fn addsubpd256(a: __m256d, b: __m256d) -> __m256d;
2906 #[link_name = "llvm.x86.avx.addsub.ps.256"]
2907 fn addsubps256(a: __m256, b: __m256) -> __m256;
0531ce1d
XL
2908 #[link_name = "llvm.x86.avx.round.pd.256"]
2909 fn roundpd256(a: __m256d, b: i32) -> __m256d;
2910 #[link_name = "llvm.x86.avx.round.ps.256"]
2911 fn roundps256(a: __m256, b: i32) -> __m256;
0531ce1d
XL
2912 #[link_name = "llvm.x86.avx.sqrt.ps.256"]
2913 fn sqrtps256(a: __m256) -> __m256;
2914 #[link_name = "llvm.x86.avx.blendv.pd.256"]
2915 fn vblendvpd(a: __m256d, b: __m256d, c: __m256d) -> __m256d;
2916 #[link_name = "llvm.x86.avx.blendv.ps.256"]
2917 fn vblendvps(a: __m256, b: __m256, c: __m256) -> __m256;
2918 #[link_name = "llvm.x86.avx.dp.ps.256"]
2919 fn vdpps(a: __m256, b: __m256, imm8: i32) -> __m256;
2920 #[link_name = "llvm.x86.avx.hadd.pd.256"]
2921 fn vhaddpd(a: __m256d, b: __m256d) -> __m256d;
2922 #[link_name = "llvm.x86.avx.hadd.ps.256"]
2923 fn vhaddps(a: __m256, b: __m256) -> __m256;
2924 #[link_name = "llvm.x86.avx.hsub.pd.256"]
2925 fn vhsubpd(a: __m256d, b: __m256d) -> __m256d;
2926 #[link_name = "llvm.x86.avx.hsub.ps.256"]
2927 fn vhsubps(a: __m256, b: __m256) -> __m256;
2928 #[link_name = "llvm.x86.sse2.cmp.pd"]
3dfed10e 2929 fn vcmppd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
0531ce1d
XL
2930 #[link_name = "llvm.x86.avx.cmp.pd.256"]
2931 fn vcmppd256(a: __m256d, b: __m256d, imm8: u8) -> __m256d;
2932 #[link_name = "llvm.x86.sse.cmp.ps"]
3dfed10e 2933 fn vcmpps(a: __m128, b: __m128, imm8: i8) -> __m128;
0531ce1d
XL
2934 #[link_name = "llvm.x86.avx.cmp.ps.256"]
2935 fn vcmpps256(a: __m256, b: __m256, imm8: u8) -> __m256;
2936 #[link_name = "llvm.x86.sse2.cmp.sd"]
3dfed10e 2937 fn vcmpsd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
0531ce1d 2938 #[link_name = "llvm.x86.sse.cmp.ss"]
3dfed10e 2939 fn vcmpss(a: __m128, b: __m128, imm8: i8) -> __m128;
0531ce1d
XL
2940 #[link_name = "llvm.x86.avx.cvtdq2.ps.256"]
2941 fn vcvtdq2ps(a: i32x8) -> __m256;
2942 #[link_name = "llvm.x86.avx.cvt.pd2.ps.256"]
2943 fn vcvtpd2ps(a: __m256d) -> __m128;
2944 #[link_name = "llvm.x86.avx.cvt.ps2dq.256"]
2945 fn vcvtps2dq(a: __m256) -> i32x8;
2946 #[link_name = "llvm.x86.avx.cvtt.pd2dq.256"]
2947 fn vcvttpd2dq(a: __m256d) -> i32x4;
2948 #[link_name = "llvm.x86.avx.cvt.pd2dq.256"]
2949 fn vcvtpd2dq(a: __m256d) -> i32x4;
2950 #[link_name = "llvm.x86.avx.cvtt.ps2dq.256"]
2951 fn vcvttps2dq(a: __m256) -> i32x8;
2952 #[link_name = "llvm.x86.avx.vzeroall"]
2953 fn vzeroall();
2954 #[link_name = "llvm.x86.avx.vzeroupper"]
2955 fn vzeroupper();
2956 #[link_name = "llvm.x86.avx.vpermilvar.ps.256"]
2957 fn vpermilps256(a: __m256, b: i32x8) -> __m256;
2958 #[link_name = "llvm.x86.avx.vpermilvar.ps"]
2959 fn vpermilps(a: __m128, b: i32x4) -> __m128;
2960 #[link_name = "llvm.x86.avx.vpermilvar.pd.256"]
2961 fn vpermilpd256(a: __m256d, b: i64x4) -> __m256d;
2962 #[link_name = "llvm.x86.avx.vpermilvar.pd"]
2963 fn vpermilpd(a: __m128d, b: i64x2) -> __m128d;
2964 #[link_name = "llvm.x86.avx.vperm2f128.ps.256"]
2965 fn vperm2f128ps256(a: __m256, b: __m256, imm8: i8) -> __m256;
2966 #[link_name = "llvm.x86.avx.vperm2f128.pd.256"]
2967 fn vperm2f128pd256(a: __m256d, b: __m256d, imm8: i8) -> __m256d;
2968 #[link_name = "llvm.x86.avx.vperm2f128.si.256"]
2969 fn vperm2f128si256(a: i32x8, b: i32x8, imm8: i8) -> i32x8;
2970 #[link_name = "llvm.x86.avx.vbroadcastf128.ps.256"]
2971 fn vbroadcastf128ps256(a: &__m128) -> __m256;
2972 #[link_name = "llvm.x86.avx.vbroadcastf128.pd.256"]
2973 fn vbroadcastf128pd256(a: &__m128d) -> __m256d;
2974 #[link_name = "llvm.x86.avx.storeu.pd.256"]
2975 fn storeupd256(mem_addr: *mut f64, a: __m256d);
2976 #[link_name = "llvm.x86.avx.storeu.ps.256"]
2977 fn storeups256(mem_addr: *mut f32, a: __m256);
2978 #[link_name = "llvm.x86.avx.storeu.dq.256"]
2979 fn storeudq256(mem_addr: *mut i8, a: i8x32);
2980 #[link_name = "llvm.x86.avx.maskload.pd.256"]
2981 fn maskloadpd256(mem_addr: *const i8, mask: i64x4) -> __m256d;
2982 #[link_name = "llvm.x86.avx.maskstore.pd.256"]
2983 fn maskstorepd256(mem_addr: *mut i8, mask: i64x4, a: __m256d);
2984 #[link_name = "llvm.x86.avx.maskload.pd"]
2985 fn maskloadpd(mem_addr: *const i8, mask: i64x2) -> __m128d;
2986 #[link_name = "llvm.x86.avx.maskstore.pd"]
2987 fn maskstorepd(mem_addr: *mut i8, mask: i64x2, a: __m128d);
2988 #[link_name = "llvm.x86.avx.maskload.ps.256"]
2989 fn maskloadps256(mem_addr: *const i8, mask: i32x8) -> __m256;
2990 #[link_name = "llvm.x86.avx.maskstore.ps.256"]
2991 fn maskstoreps256(mem_addr: *mut i8, mask: i32x8, a: __m256);
2992 #[link_name = "llvm.x86.avx.maskload.ps"]
2993 fn maskloadps(mem_addr: *const i8, mask: i32x4) -> __m128;
2994 #[link_name = "llvm.x86.avx.maskstore.ps"]
2995 fn maskstoreps(mem_addr: *mut i8, mask: i32x4, a: __m128);
2996 #[link_name = "llvm.x86.avx.ldu.dq.256"]
2997 fn vlddqu(mem_addr: *const i8) -> i8x32;
2998 #[link_name = "llvm.x86.avx.rcp.ps.256"]
2999 fn vrcpps(a: __m256) -> __m256;
3000 #[link_name = "llvm.x86.avx.rsqrt.ps.256"]
3001 fn vrsqrtps(a: __m256) -> __m256;
3002 #[link_name = "llvm.x86.avx.ptestz.256"]
3003 fn ptestz256(a: i64x4, b: i64x4) -> i32;
3004 #[link_name = "llvm.x86.avx.ptestc.256"]
3005 fn ptestc256(a: i64x4, b: i64x4) -> i32;
3006 #[link_name = "llvm.x86.avx.ptestnzc.256"]
3007 fn ptestnzc256(a: i64x4, b: i64x4) -> i32;
3008 #[link_name = "llvm.x86.avx.vtestz.pd.256"]
3009 fn vtestzpd256(a: __m256d, b: __m256d) -> i32;
3010 #[link_name = "llvm.x86.avx.vtestc.pd.256"]
3011 fn vtestcpd256(a: __m256d, b: __m256d) -> i32;
3012 #[link_name = "llvm.x86.avx.vtestnzc.pd.256"]
3013 fn vtestnzcpd256(a: __m256d, b: __m256d) -> i32;
3014 #[link_name = "llvm.x86.avx.vtestz.pd"]
3015 fn vtestzpd(a: __m128d, b: __m128d) -> i32;
3016 #[link_name = "llvm.x86.avx.vtestc.pd"]
3017 fn vtestcpd(a: __m128d, b: __m128d) -> i32;
3018 #[link_name = "llvm.x86.avx.vtestnzc.pd"]
3019 fn vtestnzcpd(a: __m128d, b: __m128d) -> i32;
3020 #[link_name = "llvm.x86.avx.vtestz.ps.256"]
3021 fn vtestzps256(a: __m256, b: __m256) -> i32;
3022 #[link_name = "llvm.x86.avx.vtestc.ps.256"]
3023 fn vtestcps256(a: __m256, b: __m256) -> i32;
3024 #[link_name = "llvm.x86.avx.vtestnzc.ps.256"]
3025 fn vtestnzcps256(a: __m256, b: __m256) -> i32;
3026 #[link_name = "llvm.x86.avx.vtestz.ps"]
3027 fn vtestzps(a: __m128, b: __m128) -> i32;
3028 #[link_name = "llvm.x86.avx.vtestc.ps"]
3029 fn vtestcps(a: __m128, b: __m128) -> i32;
3030 #[link_name = "llvm.x86.avx.vtestnzc.ps"]
3031 fn vtestnzcps(a: __m128, b: __m128) -> i32;
3032 #[link_name = "llvm.x86.avx.movmsk.pd.256"]
3033 fn movmskpd256(a: __m256d) -> i32;
3034 #[link_name = "llvm.x86.avx.movmsk.ps.256"]
3035 fn movmskps256(a: __m256) -> i32;
17df50a5
XL
3036 #[link_name = "llvm.x86.avx.min.ps.256"]
3037 fn vminps(a: __m256, b: __m256) -> __m256;
3038 #[link_name = "llvm.x86.avx.max.ps.256"]
3039 fn vmaxps(a: __m256, b: __m256) -> __m256;
3040 #[link_name = "llvm.x86.avx.min.pd.256"]
3041 fn vminpd(a: __m256d, b: __m256d) -> __m256d;
3042 #[link_name = "llvm.x86.avx.max.pd.256"]
3043 fn vmaxpd(a: __m256d, b: __m256d) -> __m256d;
0531ce1d
XL
3044}
3045
3046#[cfg(test)]
3047mod tests {
48663c56 3048 use crate::hint::black_box;
416331ca 3049 use stdarch_test::simd_test;
0531ce1d 3050
532ac7d7 3051 use crate::core_arch::x86::*;
0531ce1d 3052
83c7162d 3053 #[simd_test(enable = "avx")]
0531ce1d
XL
3054 unsafe fn test_mm256_add_pd() {
3055 let a = _mm256_setr_pd(1., 2., 3., 4.);
3056 let b = _mm256_setr_pd(5., 6., 7., 8.);
3057 let r = _mm256_add_pd(a, b);
3058 let e = _mm256_setr_pd(6., 8., 10., 12.);
3059 assert_eq_m256d(r, e);
3060 }
3061
83c7162d 3062 #[simd_test(enable = "avx")]
0531ce1d
XL
3063 unsafe fn test_mm256_add_ps() {
3064 let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
3065 let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
3066 let r = _mm256_add_ps(a, b);
3067 let e = _mm256_setr_ps(10., 12., 14., 16., 18., 20., 22., 24.);
3068 assert_eq_m256(r, e);
3069 }
3070
83c7162d 3071 #[simd_test(enable = "avx")]
0531ce1d
XL
3072 unsafe fn test_mm256_and_pd() {
3073 let a = _mm256_set1_pd(1.);
3074 let b = _mm256_set1_pd(0.6);
3075 let r = _mm256_and_pd(a, b);
3076 let e = _mm256_set1_pd(0.5);
3077 assert_eq_m256d(r, e);
3078 }
3079
83c7162d 3080 #[simd_test(enable = "avx")]
0531ce1d
XL
3081 unsafe fn test_mm256_and_ps() {
3082 let a = _mm256_set1_ps(1.);
3083 let b = _mm256_set1_ps(0.6);
3084 let r = _mm256_and_ps(a, b);
3085 let e = _mm256_set1_ps(0.5);
3086 assert_eq_m256(r, e);
3087 }
3088
83c7162d 3089 #[simd_test(enable = "avx")]
0531ce1d
XL
3090 unsafe fn test_mm256_or_pd() {
3091 let a = _mm256_set1_pd(1.);
3092 let b = _mm256_set1_pd(0.6);
3093 let r = _mm256_or_pd(a, b);
3094 let e = _mm256_set1_pd(1.2);
3095 assert_eq_m256d(r, e);
3096 }
3097
83c7162d 3098 #[simd_test(enable = "avx")]
0531ce1d
XL
3099 unsafe fn test_mm256_or_ps() {
3100 let a = _mm256_set1_ps(1.);
3101 let b = _mm256_set1_ps(0.6);
3102 let r = _mm256_or_ps(a, b);
3103 let e = _mm256_set1_ps(1.2);
3104 assert_eq_m256(r, e);
3105 }
3106
83c7162d 3107 #[simd_test(enable = "avx")]
0531ce1d
XL
3108 unsafe fn test_mm256_shuffle_pd() {
3109 let a = _mm256_setr_pd(1., 4., 5., 8.);
3110 let b = _mm256_setr_pd(2., 3., 6., 7.);
17df50a5 3111 let r = _mm256_shuffle_pd::<0b11_11_11_11>(a, b);
0531ce1d
XL
3112 let e = _mm256_setr_pd(4., 3., 8., 7.);
3113 assert_eq_m256d(r, e);
3114 }
3115
83c7162d 3116 #[simd_test(enable = "avx")]
0531ce1d
XL
3117 unsafe fn test_mm256_shuffle_ps() {
3118 let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
3119 let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
17df50a5 3120 let r = _mm256_shuffle_ps::<0b00_00_11_11>(a, b);
0531ce1d
XL
3121 let e = _mm256_setr_ps(8., 8., 2., 2., 16., 16., 10., 10.);
3122 assert_eq_m256(r, e);
3123 }
3124
83c7162d 3125 #[simd_test(enable = "avx")]
0531ce1d
XL
3126 unsafe fn test_mm256_andnot_pd() {
3127 let a = _mm256_set1_pd(0.);
3128 let b = _mm256_set1_pd(0.6);
3129 let r = _mm256_andnot_pd(a, b);
3130 assert_eq_m256d(r, b);
3131 }
3132
83c7162d 3133 #[simd_test(enable = "avx")]
0531ce1d
XL
3134 unsafe fn test_mm256_andnot_ps() {
3135 let a = _mm256_set1_ps(0.);
3136 let b = _mm256_set1_ps(0.6);
3137 let r = _mm256_andnot_ps(a, b);
3138 assert_eq_m256(r, b);
3139 }
3140
83c7162d 3141 #[simd_test(enable = "avx")]
0531ce1d
XL
3142 unsafe fn test_mm256_max_pd() {
3143 let a = _mm256_setr_pd(1., 4., 5., 8.);
3144 let b = _mm256_setr_pd(2., 3., 6., 7.);
3145 let r = _mm256_max_pd(a, b);
3146 let e = _mm256_setr_pd(2., 4., 6., 8.);
3147 assert_eq_m256d(r, e);
17df50a5
XL
3148 // > If the values being compared are both 0.0s (of either sign), the
3149 // > value in the second operand (source operand) is returned.
3150 let w = _mm256_max_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(-0.0));
3151 let x = _mm256_max_pd(_mm256_set1_pd(-0.0), _mm256_set1_pd(0.0));
3152 let wu: [u64; 4] = transmute(w);
3153 let xu: [u64; 4] = transmute(x);
3154 assert_eq!(wu, [0x8000_0000_0000_0000u64; 4]);
3155 assert_eq!(xu, [0u64; 4]);
3156 // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
3157 // > second operand (source operand), either a NaN or a valid
3158 // > floating-point value, is written to the result.
3159 let y = _mm256_max_pd(_mm256_set1_pd(f64::NAN), _mm256_set1_pd(0.0));
3160 let z = _mm256_max_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(f64::NAN));
3161 let yf: [f64; 4] = transmute(y);
3162 let zf: [f64; 4] = transmute(z);
3163 assert_eq!(yf, [0.0; 4]);
3164 assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
0531ce1d
XL
3165 }
3166
83c7162d 3167 #[simd_test(enable = "avx")]
0531ce1d
XL
3168 unsafe fn test_mm256_max_ps() {
3169 let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
3170 let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
3171 let r = _mm256_max_ps(a, b);
3172 let e = _mm256_setr_ps(2., 4., 6., 8., 10., 12., 14., 16.);
3173 assert_eq_m256(r, e);
17df50a5
XL
3174 // > If the values being compared are both 0.0s (of either sign), the
3175 // > value in the second operand (source operand) is returned.
3176 let w = _mm256_max_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(-0.0));
3177 let x = _mm256_max_ps(_mm256_set1_ps(-0.0), _mm256_set1_ps(0.0));
3178 let wu: [u32; 8] = transmute(w);
3179 let xu: [u32; 8] = transmute(x);
3180 assert_eq!(wu, [0x8000_0000u32; 8]);
3181 assert_eq!(xu, [0u32; 8]);
3182 // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
3183 // > second operand (source operand), either a NaN or a valid
3184 // > floating-point value, is written to the result.
3185 let y = _mm256_max_ps(_mm256_set1_ps(f32::NAN), _mm256_set1_ps(0.0));
3186 let z = _mm256_max_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(f32::NAN));
3187 let yf: [f32; 8] = transmute(y);
3188 let zf: [f32; 8] = transmute(z);
3189 assert_eq!(yf, [0.0; 8]);
3190 assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
0531ce1d
XL
3191 }
3192
83c7162d 3193 #[simd_test(enable = "avx")]
0531ce1d
XL
3194 unsafe fn test_mm256_min_pd() {
3195 let a = _mm256_setr_pd(1., 4., 5., 8.);
3196 let b = _mm256_setr_pd(2., 3., 6., 7.);
3197 let r = _mm256_min_pd(a, b);
3198 let e = _mm256_setr_pd(1., 3., 5., 7.);
3199 assert_eq_m256d(r, e);
17df50a5
XL
3200 // > If the values being compared are both 0.0s (of either sign), the
3201 // > value in the second operand (source operand) is returned.
3202 let w = _mm256_min_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(-0.0));
3203 let x = _mm256_min_pd(_mm256_set1_pd(-0.0), _mm256_set1_pd(0.0));
3204 let wu: [u64; 4] = transmute(w);
3205 let xu: [u64; 4] = transmute(x);
3206 assert_eq!(wu, [0x8000_0000_0000_0000u64; 4]);
3207 assert_eq!(xu, [0u64; 4]);
3208 // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
3209 // > second operand (source operand), either a NaN or a valid
3210 // > floating-point value, is written to the result.
3211 let y = _mm256_min_pd(_mm256_set1_pd(f64::NAN), _mm256_set1_pd(0.0));
3212 let z = _mm256_min_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(f64::NAN));
3213 let yf: [f64; 4] = transmute(y);
3214 let zf: [f64; 4] = transmute(z);
3215 assert_eq!(yf, [0.0; 4]);
3216 assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
0531ce1d
XL
3217 }
3218
83c7162d 3219 #[simd_test(enable = "avx")]
0531ce1d
XL
3220 unsafe fn test_mm256_min_ps() {
3221 let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
3222 let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
3223 let r = _mm256_min_ps(a, b);
3224 let e = _mm256_setr_ps(1., 3., 5., 7., 9., 11., 13., 15.);
3225 assert_eq_m256(r, e);
17df50a5
XL
3226 // > If the values being compared are both 0.0s (of either sign), the
3227 // > value in the second operand (source operand) is returned.
3228 let w = _mm256_min_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(-0.0));
3229 let x = _mm256_min_ps(_mm256_set1_ps(-0.0), _mm256_set1_ps(0.0));
3230 let wu: [u32; 8] = transmute(w);
3231 let xu: [u32; 8] = transmute(x);
3232 assert_eq!(wu, [0x8000_0000u32; 8]);
3233 assert_eq!(xu, [0u32; 8]);
3234 // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
3235 // > second operand (source operand), either a NaN or a valid
3236 // > floating-point value, is written to the result.
3237 let y = _mm256_min_ps(_mm256_set1_ps(f32::NAN), _mm256_set1_ps(0.0));
3238 let z = _mm256_min_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(f32::NAN));
3239 let yf: [f32; 8] = transmute(y);
3240 let zf: [f32; 8] = transmute(z);
3241 assert_eq!(yf, [0.0; 8]);
3242 assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
0531ce1d
XL
3243 }
3244
83c7162d 3245 #[simd_test(enable = "avx")]
0531ce1d
XL
3246 unsafe fn test_mm256_mul_pd() {
3247 let a = _mm256_setr_pd(1., 2., 3., 4.);
3248 let b = _mm256_setr_pd(5., 6., 7., 8.);
3249 let r = _mm256_mul_pd(a, b);
3250 let e = _mm256_setr_pd(5., 12., 21., 32.);
3251 assert_eq_m256d(r, e);
3252 }
3253
83c7162d 3254 #[simd_test(enable = "avx")]
0531ce1d
XL
3255 unsafe fn test_mm256_mul_ps() {
3256 let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
3257 let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
3258 let r = _mm256_mul_ps(a, b);
3259 let e = _mm256_setr_ps(9., 20., 33., 48., 65., 84., 105., 128.);
3260 assert_eq_m256(r, e);
3261 }
3262
83c7162d 3263 #[simd_test(enable = "avx")]
0531ce1d
XL
3264 unsafe fn test_mm256_addsub_pd() {
3265 let a = _mm256_setr_pd(1., 2., 3., 4.);
3266 let b = _mm256_setr_pd(5., 6., 7., 8.);
3267 let r = _mm256_addsub_pd(a, b);
3268 let e = _mm256_setr_pd(-4., 8., -4., 12.);
3269 assert_eq_m256d(r, e);
3270 }
3271
83c7162d 3272 #[simd_test(enable = "avx")]
0531ce1d
XL
3273 unsafe fn test_mm256_addsub_ps() {
3274 let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3275 let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3276 let r = _mm256_addsub_ps(a, b);
3277 let e = _mm256_setr_ps(-4., 8., -4., 12., -4., 8., -4., 12.);
3278 assert_eq_m256(r, e);
3279 }
3280
83c7162d 3281 #[simd_test(enable = "avx")]
0531ce1d
XL
3282 unsafe fn test_mm256_sub_pd() {
3283 let a = _mm256_setr_pd(1., 2., 3., 4.);
3284 let b = _mm256_setr_pd(5., 6., 7., 8.);
3285 let r = _mm256_sub_pd(a, b);
3286 let e = _mm256_setr_pd(-4., -4., -4., -4.);
3287 assert_eq_m256d(r, e);
3288 }
3289
83c7162d 3290 #[simd_test(enable = "avx")]
0531ce1d
XL
3291 unsafe fn test_mm256_sub_ps() {
3292 let a = _mm256_setr_ps(1., 2., 3., 4., -1., -2., -3., -4.);
3293 let b = _mm256_setr_ps(5., 6., 7., 8., 3., 2., 1., 0.);
3294 let r = _mm256_sub_ps(a, b);
3295 let e = _mm256_setr_ps(-4., -4., -4., -4., -4., -4., -4., -4.);
3296 assert_eq_m256(r, e);
3297 }
3298
83c7162d 3299 #[simd_test(enable = "avx")]
0531ce1d
XL
3300 unsafe fn test_mm256_round_pd() {
3301 let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
17df50a5
XL
3302 let result_closest = _mm256_round_pd::<0b0000>(a);
3303 let result_down = _mm256_round_pd::<0b0001>(a);
3304 let result_up = _mm256_round_pd::<0b0010>(a);
0531ce1d
XL
3305 let expected_closest = _mm256_setr_pd(2., 2., 4., -1.);
3306 let expected_down = _mm256_setr_pd(1., 2., 3., -2.);
3307 let expected_up = _mm256_setr_pd(2., 3., 4., -1.);
3308 assert_eq_m256d(result_closest, expected_closest);
3309 assert_eq_m256d(result_down, expected_down);
3310 assert_eq_m256d(result_up, expected_up);
3311 }
3312
83c7162d 3313 #[simd_test(enable = "avx")]
0531ce1d
XL
3314 unsafe fn test_mm256_floor_pd() {
3315 let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
3316 let result_down = _mm256_floor_pd(a);
3317 let expected_down = _mm256_setr_pd(1., 2., 3., -2.);
3318 assert_eq_m256d(result_down, expected_down);
3319 }
3320
83c7162d 3321 #[simd_test(enable = "avx")]
0531ce1d
XL
3322 unsafe fn test_mm256_ceil_pd() {
3323 let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
3324 let result_up = _mm256_ceil_pd(a);
3325 let expected_up = _mm256_setr_pd(2., 3., 4., -1.);
3326 assert_eq_m256d(result_up, expected_up);
3327 }
3328
83c7162d 3329 #[simd_test(enable = "avx")]
0531ce1d
XL
3330 unsafe fn test_mm256_round_ps() {
3331 let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
17df50a5
XL
3332 let result_closest = _mm256_round_ps::<0b0000>(a);
3333 let result_down = _mm256_round_ps::<0b0001>(a);
3334 let result_up = _mm256_round_ps::<0b0010>(a);
0731742a 3335 let expected_closest = _mm256_setr_ps(2., 2., 4., -1., 2., 2., 4., -1.);
0531ce1d
XL
3336 let expected_down = _mm256_setr_ps(1., 2., 3., -2., 1., 2., 3., -2.);
3337 let expected_up = _mm256_setr_ps(2., 3., 4., -1., 2., 3., 4., -1.);
3338 assert_eq_m256(result_closest, expected_closest);
3339 assert_eq_m256(result_down, expected_down);
3340 assert_eq_m256(result_up, expected_up);
3341 }
3342
83c7162d 3343 #[simd_test(enable = "avx")]
0531ce1d
XL
3344 unsafe fn test_mm256_floor_ps() {
3345 let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
3346 let result_down = _mm256_floor_ps(a);
3347 let expected_down = _mm256_setr_ps(1., 2., 3., -2., 1., 2., 3., -2.);
3348 assert_eq_m256(result_down, expected_down);
3349 }
3350
83c7162d 3351 #[simd_test(enable = "avx")]
0531ce1d
XL
3352 unsafe fn test_mm256_ceil_ps() {
3353 let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
3354 let result_up = _mm256_ceil_ps(a);
3355 let expected_up = _mm256_setr_ps(2., 3., 4., -1., 2., 3., 4., -1.);
3356 assert_eq_m256(result_up, expected_up);
3357 }
3358
83c7162d 3359 #[simd_test(enable = "avx")]
0531ce1d
XL
3360 unsafe fn test_mm256_sqrt_pd() {
3361 let a = _mm256_setr_pd(4., 9., 16., 25.);
3362 let r = _mm256_sqrt_pd(a);
3363 let e = _mm256_setr_pd(2., 3., 4., 5.);
3364 assert_eq_m256d(r, e);
3365 }
3366
83c7162d 3367 #[simd_test(enable = "avx")]
0531ce1d
XL
3368 unsafe fn test_mm256_sqrt_ps() {
3369 let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3370 let r = _mm256_sqrt_ps(a);
3371 let e = _mm256_setr_ps(2., 3., 4., 5., 2., 3., 4., 5.);
3372 assert_eq_m256(r, e);
3373 }
3374
83c7162d 3375 #[simd_test(enable = "avx")]
0531ce1d
XL
3376 unsafe fn test_mm256_div_ps() {
3377 let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3378 let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3379 let r = _mm256_div_ps(a, b);
3380 let e = _mm256_setr_ps(1., 3., 8., 5., 0.5, 1., 0.25, 0.5);
3381 assert_eq_m256(r, e);
3382 }
3383
83c7162d 3384 #[simd_test(enable = "avx")]
0531ce1d
XL
3385 unsafe fn test_mm256_div_pd() {
3386 let a = _mm256_setr_pd(4., 9., 16., 25.);
3387 let b = _mm256_setr_pd(4., 3., 2., 5.);
3388 let r = _mm256_div_pd(a, b);
3389 let e = _mm256_setr_pd(1., 3., 8., 5.);
3390 assert_eq_m256d(r, e);
3391 }
3392
83c7162d 3393 #[simd_test(enable = "avx")]
0531ce1d
XL
3394 unsafe fn test_mm256_blend_pd() {
3395 let a = _mm256_setr_pd(4., 9., 16., 25.);
3396 let b = _mm256_setr_pd(4., 3., 2., 5.);
17df50a5 3397 let r = _mm256_blend_pd::<0x0>(a, b);
0531ce1d 3398 assert_eq_m256d(r, _mm256_setr_pd(4., 9., 16., 25.));
17df50a5 3399 let r = _mm256_blend_pd::<0x3>(a, b);
0531ce1d 3400 assert_eq_m256d(r, _mm256_setr_pd(4., 3., 16., 25.));
17df50a5 3401 let r = _mm256_blend_pd::<0xF>(a, b);
0531ce1d
XL
3402 assert_eq_m256d(r, _mm256_setr_pd(4., 3., 2., 5.));
3403 }
3404
83c7162d 3405 #[simd_test(enable = "avx")]
0531ce1d
XL
3406 unsafe fn test_mm256_blend_ps() {
3407 let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
3408 let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
17df50a5 3409 let r = _mm256_blend_ps::<0x0>(a, b);
8faf50e0 3410 assert_eq_m256(r, _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.));
17df50a5 3411 let r = _mm256_blend_ps::<0x3>(a, b);
8faf50e0 3412 assert_eq_m256(r, _mm256_setr_ps(2., 3., 5., 8., 9., 12., 13., 16.));
17df50a5 3413 let r = _mm256_blend_ps::<0xF>(a, b);
8faf50e0 3414 assert_eq_m256(r, _mm256_setr_ps(2., 3., 6., 7., 9., 12., 13., 16.));
0531ce1d
XL
3415 }
3416
83c7162d 3417 #[simd_test(enable = "avx")]
0531ce1d
XL
3418 unsafe fn test_mm256_blendv_pd() {
3419 let a = _mm256_setr_pd(4., 9., 16., 25.);
3420 let b = _mm256_setr_pd(4., 3., 2., 5.);
3421 let c = _mm256_setr_pd(0., 0., !0 as f64, !0 as f64);
3422 let r = _mm256_blendv_pd(a, b, c);
3423 let e = _mm256_setr_pd(4., 9., 2., 5.);
3424 assert_eq_m256d(r, e);
3425 }
3426
83c7162d 3427 #[simd_test(enable = "avx")]
0531ce1d
XL
3428 unsafe fn test_mm256_blendv_ps() {
3429 let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3430 let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
0731742a 3431 #[rustfmt::skip]
0531ce1d
XL
3432 let c = _mm256_setr_ps(
3433 0., 0., 0., 0., !0 as f32, !0 as f32, !0 as f32, !0 as f32,
3434 );
3435 let r = _mm256_blendv_ps(a, b, c);
3436 let e = _mm256_setr_ps(4., 9., 16., 25., 8., 9., 64., 50.);
3437 assert_eq_m256(r, e);
3438 }
3439
83c7162d 3440 #[simd_test(enable = "avx")]
0531ce1d
XL
3441 unsafe fn test_mm256_dp_ps() {
3442 let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3443 let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
17df50a5 3444 let r = _mm256_dp_ps::<0xFF>(a, b);
0731742a 3445 let e = _mm256_setr_ps(200., 200., 200., 200., 2387., 2387., 2387., 2387.);
0531ce1d
XL
3446 assert_eq_m256(r, e);
3447 }
3448
83c7162d 3449 #[simd_test(enable = "avx")]
0531ce1d
XL
3450 unsafe fn test_mm256_hadd_pd() {
3451 let a = _mm256_setr_pd(4., 9., 16., 25.);
3452 let b = _mm256_setr_pd(4., 3., 2., 5.);
3453 let r = _mm256_hadd_pd(a, b);
3454 let e = _mm256_setr_pd(13., 7., 41., 7.);
3455 assert_eq_m256d(r, e);
3456
3457 let a = _mm256_setr_pd(1., 2., 3., 4.);
3458 let b = _mm256_setr_pd(5., 6., 7., 8.);
3459 let r = _mm256_hadd_pd(a, b);
3460 let e = _mm256_setr_pd(3., 11., 7., 15.);
3461 assert_eq_m256d(r, e);
3462 }
3463
83c7162d 3464 #[simd_test(enable = "avx")]
0531ce1d
XL
3465 unsafe fn test_mm256_hadd_ps() {
3466 let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3467 let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3468 let r = _mm256_hadd_ps(a, b);
3469 let e = _mm256_setr_ps(13., 41., 7., 7., 13., 41., 17., 114.);
3470 assert_eq_m256(r, e);
3471
3472 let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3473 let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3474 let r = _mm256_hadd_ps(a, b);
3475 let e = _mm256_setr_ps(3., 7., 11., 15., 3., 7., 11., 15.);
3476 assert_eq_m256(r, e);
3477 }
3478
83c7162d 3479 #[simd_test(enable = "avx")]
0531ce1d
XL
3480 unsafe fn test_mm256_hsub_pd() {
3481 let a = _mm256_setr_pd(4., 9., 16., 25.);
3482 let b = _mm256_setr_pd(4., 3., 2., 5.);
3483 let r = _mm256_hsub_pd(a, b);
3484 let e = _mm256_setr_pd(-5., 1., -9., -3.);
3485 assert_eq_m256d(r, e);
3486
3487 let a = _mm256_setr_pd(1., 2., 3., 4.);
3488 let b = _mm256_setr_pd(5., 6., 7., 8.);
3489 let r = _mm256_hsub_pd(a, b);
3490 let e = _mm256_setr_pd(-1., -1., -1., -1.);
3491 assert_eq_m256d(r, e);
3492 }
3493
83c7162d 3494 #[simd_test(enable = "avx")]
0531ce1d
XL
3495 unsafe fn test_mm256_hsub_ps() {
3496 let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3497 let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3498 let r = _mm256_hsub_ps(a, b);
3499 let e = _mm256_setr_ps(-5., -9., 1., -3., -5., -9., -1., 14.);
3500 assert_eq_m256(r, e);
3501
3502 let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3503 let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3504 let r = _mm256_hsub_ps(a, b);
3505 let e = _mm256_setr_ps(-1., -1., -1., -1., -1., -1., -1., -1.);
3506 assert_eq_m256(r, e);
3507 }
3508
83c7162d 3509 #[simd_test(enable = "avx")]
0531ce1d
XL
3510 unsafe fn test_mm256_xor_pd() {
3511 let a = _mm256_setr_pd(4., 9., 16., 25.);
3512 let b = _mm256_set1_pd(0.);
3513 let r = _mm256_xor_pd(a, b);
3514 assert_eq_m256d(r, a);
3515 }
3516
83c7162d 3517 #[simd_test(enable = "avx")]
0531ce1d
XL
3518 unsafe fn test_mm256_xor_ps() {
3519 let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3520 let b = _mm256_set1_ps(0.);
3521 let r = _mm256_xor_ps(a, b);
3522 assert_eq_m256(r, a);
3523 }
3524
83c7162d 3525 #[simd_test(enable = "avx")]
0531ce1d
XL
3526 unsafe fn test_mm_cmp_pd() {
3527 let a = _mm_setr_pd(4., 9.);
3528 let b = _mm_setr_pd(4., 3.);
17df50a5 3529 let r = _mm_cmp_pd::<_CMP_GE_OS>(a, b);
0531ce1d
XL
3530 assert!(get_m128d(r, 0).is_nan());
3531 assert!(get_m128d(r, 1).is_nan());
3532 }
3533
83c7162d 3534 #[simd_test(enable = "avx")]
0531ce1d
XL
3535 unsafe fn test_mm256_cmp_pd() {
3536 let a = _mm256_setr_pd(1., 2., 3., 4.);
3537 let b = _mm256_setr_pd(5., 6., 7., 8.);
17df50a5 3538 let r = _mm256_cmp_pd::<_CMP_GE_OS>(a, b);
0531ce1d
XL
3539 let e = _mm256_set1_pd(0.);
3540 assert_eq_m256d(r, e);
3541 }
3542
83c7162d 3543 #[simd_test(enable = "avx")]
0531ce1d
XL
3544 unsafe fn test_mm_cmp_ps() {
3545 let a = _mm_setr_ps(4., 3., 2., 5.);
3546 let b = _mm_setr_ps(4., 9., 16., 25.);
17df50a5 3547 let r = _mm_cmp_ps::<_CMP_GE_OS>(a, b);
0531ce1d
XL
3548 assert!(get_m128(r, 0).is_nan());
3549 assert_eq!(get_m128(r, 1), 0.);
3550 assert_eq!(get_m128(r, 2), 0.);
3551 assert_eq!(get_m128(r, 3), 0.);
3552 }
3553
83c7162d 3554 #[simd_test(enable = "avx")]
0531ce1d
XL
3555 unsafe fn test_mm256_cmp_ps() {
3556 let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3557 let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
17df50a5 3558 let r = _mm256_cmp_ps::<_CMP_GE_OS>(a, b);
0531ce1d
XL
3559 let e = _mm256_set1_ps(0.);
3560 assert_eq_m256(r, e);
3561 }
3562
83c7162d 3563 #[simd_test(enable = "avx")]
0531ce1d
XL
3564 unsafe fn test_mm_cmp_sd() {
3565 let a = _mm_setr_pd(4., 9.);
3566 let b = _mm_setr_pd(4., 3.);
17df50a5 3567 let r = _mm_cmp_sd::<_CMP_GE_OS>(a, b);
0531ce1d
XL
3568 assert!(get_m128d(r, 0).is_nan());
3569 assert_eq!(get_m128d(r, 1), 9.);
3570 }
3571
83c7162d 3572 #[simd_test(enable = "avx")]
0531ce1d
XL
3573 unsafe fn test_mm_cmp_ss() {
3574 let a = _mm_setr_ps(4., 3., 2., 5.);
3575 let b = _mm_setr_ps(4., 9., 16., 25.);
17df50a5 3576 let r = _mm_cmp_ss::<_CMP_GE_OS>(a, b);
0531ce1d
XL
3577 assert!(get_m128(r, 0).is_nan());
3578 assert_eq!(get_m128(r, 1), 3.);
3579 assert_eq!(get_m128(r, 2), 2.);
3580 assert_eq!(get_m128(r, 3), 5.);
3581 }
3582
83c7162d 3583 #[simd_test(enable = "avx")]
0531ce1d
XL
3584 unsafe fn test_mm256_cvtepi32_pd() {
3585 let a = _mm_setr_epi32(4, 9, 16, 25);
3586 let r = _mm256_cvtepi32_pd(a);
3587 let e = _mm256_setr_pd(4., 9., 16., 25.);
3588 assert_eq_m256d(r, e);
3589 }
3590
83c7162d 3591 #[simd_test(enable = "avx")]
0531ce1d
XL
3592 unsafe fn test_mm256_cvtepi32_ps() {
3593 let a = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
3594 let r = _mm256_cvtepi32_ps(a);
3595 let e = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3596 assert_eq_m256(r, e);
3597 }
3598
83c7162d 3599 #[simd_test(enable = "avx")]
0531ce1d
XL
3600 unsafe fn test_mm256_cvtpd_ps() {
3601 let a = _mm256_setr_pd(4., 9., 16., 25.);
3602 let r = _mm256_cvtpd_ps(a);
3603 let e = _mm_setr_ps(4., 9., 16., 25.);
3604 assert_eq_m128(r, e);
3605 }
3606
83c7162d 3607 #[simd_test(enable = "avx")]
0531ce1d
XL
3608 unsafe fn test_mm256_cvtps_epi32() {
3609 let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3610 let r = _mm256_cvtps_epi32(a);
3611 let e = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
3612 assert_eq_m256i(r, e);
3613 }
3614
83c7162d 3615 #[simd_test(enable = "avx")]
0531ce1d
XL
3616 unsafe fn test_mm256_cvtps_pd() {
3617 let a = _mm_setr_ps(4., 9., 16., 25.);
3618 let r = _mm256_cvtps_pd(a);
3619 let e = _mm256_setr_pd(4., 9., 16., 25.);
3620 assert_eq_m256d(r, e);
3621 }
3622
83c7162d 3623 #[simd_test(enable = "avx")]
0531ce1d
XL
3624 unsafe fn test_mm256_cvttpd_epi32() {
3625 let a = _mm256_setr_pd(4., 9., 16., 25.);
3626 let r = _mm256_cvttpd_epi32(a);
3627 let e = _mm_setr_epi32(4, 9, 16, 25);
3628 assert_eq_m128i(r, e);
3629 }
3630
83c7162d 3631 #[simd_test(enable = "avx")]
0531ce1d
XL
3632 unsafe fn test_mm256_cvtpd_epi32() {
3633 let a = _mm256_setr_pd(4., 9., 16., 25.);
3634 let r = _mm256_cvtpd_epi32(a);
3635 let e = _mm_setr_epi32(4, 9, 16, 25);
3636 assert_eq_m128i(r, e);
3637 }
3638
83c7162d 3639 #[simd_test(enable = "avx")]
0531ce1d
XL
3640 unsafe fn test_mm256_cvttps_epi32() {
3641 let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3642 let r = _mm256_cvttps_epi32(a);
3643 let e = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
3644 assert_eq_m256i(r, e);
3645 }
3646
83c7162d 3647 #[simd_test(enable = "avx")]
0531ce1d
XL
3648 unsafe fn test_mm256_extractf128_ps() {
3649 let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
17df50a5 3650 let r = _mm256_extractf128_ps::<0>(a);
0531ce1d
XL
3651 let e = _mm_setr_ps(4., 3., 2., 5.);
3652 assert_eq_m128(r, e);
3653 }
3654
83c7162d 3655 #[simd_test(enable = "avx")]
0531ce1d
XL
3656 unsafe fn test_mm256_extractf128_pd() {
3657 let a = _mm256_setr_pd(4., 3., 2., 5.);
17df50a5 3658 let r = _mm256_extractf128_pd::<0>(a);
0531ce1d
XL
3659 let e = _mm_setr_pd(4., 3.);
3660 assert_eq_m128d(r, e);
3661 }
3662
83c7162d 3663 #[simd_test(enable = "avx")]
0531ce1d
XL
3664 unsafe fn test_mm256_extractf128_si256() {
3665 let a = _mm256_setr_epi64x(4, 3, 2, 5);
17df50a5 3666 let r = _mm256_extractf128_si256::<0>(a);
0531ce1d
XL
3667 let e = _mm_setr_epi64x(4, 3);
3668 assert_eq_m128i(r, e);
3669 }
3670
83c7162d 3671 #[simd_test(enable = "avx")]
0531ce1d
XL
3672 unsafe fn test_mm256_zeroall() {
3673 _mm256_zeroall();
3674 }
3675
83c7162d 3676 #[simd_test(enable = "avx")]
0531ce1d
XL
3677 unsafe fn test_mm256_zeroupper() {
3678 _mm256_zeroupper();
3679 }
3680
83c7162d 3681 #[simd_test(enable = "avx")]
0531ce1d
XL
3682 unsafe fn test_mm256_permutevar_ps() {
3683 let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3684 let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
3685 let r = _mm256_permutevar_ps(a, b);
3686 let e = _mm256_setr_ps(3., 2., 5., 4., 9., 64., 50., 8.);
3687 assert_eq_m256(r, e);
3688 }
3689
83c7162d 3690 #[simd_test(enable = "avx")]
0531ce1d
XL
3691 unsafe fn test_mm_permutevar_ps() {
3692 let a = _mm_setr_ps(4., 3., 2., 5.);
3693 let b = _mm_setr_epi32(1, 2, 3, 4);
3694 let r = _mm_permutevar_ps(a, b);
3695 let e = _mm_setr_ps(3., 2., 5., 4.);
3696 assert_eq_m128(r, e);
3697 }
3698
83c7162d 3699 #[simd_test(enable = "avx")]
0531ce1d
XL
3700 unsafe fn test_mm256_permute_ps() {
3701 let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
17df50a5 3702 let r = _mm256_permute_ps::<0x1b>(a);
0531ce1d
XL
3703 let e = _mm256_setr_ps(5., 2., 3., 4., 50., 64., 9., 8.);
3704 assert_eq_m256(r, e);
3705 }
3706
83c7162d 3707 #[simd_test(enable = "avx")]
0531ce1d
XL
3708 unsafe fn test_mm_permute_ps() {
3709 let a = _mm_setr_ps(4., 3., 2., 5.);
17df50a5 3710 let r = _mm_permute_ps::<0x1b>(a);
0531ce1d
XL
3711 let e = _mm_setr_ps(5., 2., 3., 4.);
3712 assert_eq_m128(r, e);
3713 }
3714
83c7162d 3715 #[simd_test(enable = "avx")]
0531ce1d
XL
3716 unsafe fn test_mm256_permutevar_pd() {
3717 let a = _mm256_setr_pd(4., 3., 2., 5.);
3718 let b = _mm256_setr_epi64x(1, 2, 3, 4);
3719 let r = _mm256_permutevar_pd(a, b);
3720 let e = _mm256_setr_pd(4., 3., 5., 2.);
3721 assert_eq_m256d(r, e);
3722 }
3723
83c7162d 3724 #[simd_test(enable = "avx")]
0531ce1d
XL
3725 unsafe fn test_mm_permutevar_pd() {
3726 let a = _mm_setr_pd(4., 3.);
3727 let b = _mm_setr_epi64x(3, 0);
3728 let r = _mm_permutevar_pd(a, b);
3729 let e = _mm_setr_pd(3., 4.);
3730 assert_eq_m128d(r, e);
3731 }
3732
83c7162d 3733 #[simd_test(enable = "avx")]
0531ce1d
XL
3734 unsafe fn test_mm256_permute_pd() {
3735 let a = _mm256_setr_pd(4., 3., 2., 5.);
17df50a5 3736 let r = _mm256_permute_pd::<5>(a);
0531ce1d
XL
3737 let e = _mm256_setr_pd(3., 4., 5., 2.);
3738 assert_eq_m256d(r, e);
3739 }
3740
83c7162d 3741 #[simd_test(enable = "avx")]
0531ce1d
XL
3742 unsafe fn test_mm_permute_pd() {
3743 let a = _mm_setr_pd(4., 3.);
17df50a5 3744 let r = _mm_permute_pd::<1>(a);
0531ce1d
XL
3745 let e = _mm_setr_pd(3., 4.);
3746 assert_eq_m128d(r, e);
3747 }
3748
83c7162d 3749 #[simd_test(enable = "avx")]
0531ce1d
XL
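    // Immediate 0x13: bits 1:0 = 3 place `b`'s upper 128 bits in the result's
    // lower half, and bits 5:4 = 1 place `a`'s upper 128 bits in the upper half.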
3750 unsafe fn test_mm256_permute2f128_ps() {
3751 let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3752 let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
17df50a5 3753 let r = _mm256_permute2f128_ps::<0x13>(a, b);
0531ce1d
XL
3754 let e = _mm256_setr_ps(5., 6., 7., 8., 1., 2., 3., 4.);
3755 assert_eq_m256(r, e);
3756 }
3757
83c7162d 3758 #[simd_test(enable = "avx")]
0531ce1d
XL
3759 unsafe fn test_mm256_permute2f128_pd() {
3760 let a = _mm256_setr_pd(1., 2., 3., 4.);
3761 let b = _mm256_setr_pd(5., 6., 7., 8.);
17df50a5 3762 let r = _mm256_permute2f128_pd::<0x31>(a, b);
0531ce1d
XL
3763 let e = _mm256_setr_pd(3., 4., 7., 8.);
3764 assert_eq_m256d(r, e);
3765 }
3766
83c7162d 3767 #[simd_test(enable = "avx")]
0531ce1d
XL
3768 unsafe fn test_mm256_permute2f128_si256() {
3769 let a = _mm256_setr_epi32(1, 2, 3, 4, 1, 2, 3, 4);
3770 let b = _mm256_setr_epi32(5, 6, 7, 8, 5, 6, 7, 8);
17df50a5 3771 let r = _mm256_permute2f128_si256::<0x20>(a, b);
0531ce1d
XL
3772 let e = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
3773 assert_eq_m256i(r, e);
3774 }
3775
83c7162d 3776 #[simd_test(enable = "avx")]
0531ce1d
XL
3777 unsafe fn test_mm256_broadcast_ss() {
3778 let r = _mm256_broadcast_ss(&3.);
3779 let e = _mm256_set1_ps(3.);
3780 assert_eq_m256(r, e);
3781 }
3782
83c7162d 3783 #[simd_test(enable = "avx")]
0531ce1d
XL
3784 unsafe fn test_mm_broadcast_ss() {
3785 let r = _mm_broadcast_ss(&3.);
3786 let e = _mm_set1_ps(3.);
3787 assert_eq_m128(r, e);
3788 }
3789
83c7162d 3790 #[simd_test(enable = "avx")]
0531ce1d
XL
3791 unsafe fn test_mm256_broadcast_sd() {
3792 let r = _mm256_broadcast_sd(&3.);
3793 let e = _mm256_set1_pd(3.);
3794 assert_eq_m256d(r, e);
3795 }
3796
83c7162d 3797 #[simd_test(enable = "avx")]
0531ce1d
XL
3798 unsafe fn test_mm256_broadcast_ps() {
3799 let a = _mm_setr_ps(4., 3., 2., 5.);
3800 let r = _mm256_broadcast_ps(&a);
3801 let e = _mm256_setr_ps(4., 3., 2., 5., 4., 3., 2., 5.);
3802 assert_eq_m256(r, e);
3803 }
3804
83c7162d 3805 #[simd_test(enable = "avx")]
0531ce1d
XL
3806 unsafe fn test_mm256_broadcast_pd() {
3807 let a = _mm_setr_pd(4., 3.);
3808 let r = _mm256_broadcast_pd(&a);
3809 let e = _mm256_setr_pd(4., 3., 4., 3.);
3810 assert_eq_m256d(r, e);
3811 }
3812
83c7162d 3813 #[simd_test(enable = "avx")]
0531ce1d
XL
3814 unsafe fn test_mm256_insertf128_ps() {
3815 let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3816 let b = _mm_setr_ps(4., 9., 16., 25.);
17df50a5 3817 let r = _mm256_insertf128_ps::<0>(a, b);
0531ce1d
XL
3818 let e = _mm256_setr_ps(4., 9., 16., 25., 8., 9., 64., 50.);
3819 assert_eq_m256(r, e);
3820 }
3821
83c7162d 3822 #[simd_test(enable = "avx")]
0531ce1d
XL
3823 unsafe fn test_mm256_insertf128_pd() {
3824 let a = _mm256_setr_pd(1., 2., 3., 4.);
3825 let b = _mm_setr_pd(5., 6.);
17df50a5 3826 let r = _mm256_insertf128_pd::<0>(a, b);
0531ce1d
XL
3827 let e = _mm256_setr_pd(5., 6., 3., 4.);
3828 assert_eq_m256d(r, e);
3829 }
3830
83c7162d 3831 #[simd_test(enable = "avx")]
0531ce1d
XL
3832 unsafe fn test_mm256_insertf128_si256() {
3833 let a = _mm256_setr_epi64x(1, 2, 3, 4);
3834 let b = _mm_setr_epi64x(5, 6);
17df50a5 3835 let r = _mm256_insertf128_si256::<0>(a, b);
0531ce1d
XL
3836 let e = _mm256_setr_epi64x(5, 6, 3, 4);
3837 assert_eq_m256i(r, e);
3838 }
3839
83c7162d 3840 #[simd_test(enable = "avx")]
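    // `_mm256_insert_epi8::<31>` returns a copy of `a` with the byte at index 31
    // (the last element) replaced by the given value.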
0531ce1d 3841 unsafe fn test_mm256_insert_epi8() {
0731742a 3842 #[rustfmt::skip]
0531ce1d
XL
3843 let a = _mm256_setr_epi8(
3844 1, 2, 3, 4, 5, 6, 7, 8,
3845 9, 10, 11, 12, 13, 14, 15, 16,
3846 17, 18, 19, 20, 21, 22, 23, 24,
3847 25, 26, 27, 28, 29, 30, 31, 32,
3848 );
17df50a5 3849 let r = _mm256_insert_epi8::<31>(a, 0);
0731742a 3850 #[rustfmt::skip]
0531ce1d
XL
3851 let e = _mm256_setr_epi8(
3852 1, 2, 3, 4, 5, 6, 7, 8,
3853 9, 10, 11, 12, 13, 14, 15, 16,
3854 17, 18, 19, 20, 21, 22, 23, 24,
3855 25, 26, 27, 28, 29, 30, 31, 0,
3856 );
3857 assert_eq_m256i(r, e);
3858 }
3859
83c7162d 3860 #[simd_test(enable = "avx")]
0531ce1d 3861 unsafe fn test_mm256_insert_epi16() {
0731742a 3862 #[rustfmt::skip]
0531ce1d
XL
3863 let a = _mm256_setr_epi16(
3864 0, 1, 2, 3, 4, 5, 6, 7,
3865 8, 9, 10, 11, 12, 13, 14, 15,
3866 );
17df50a5 3867 let r = _mm256_insert_epi16::<15>(a, 0);
0731742a 3868 #[rustfmt::skip]
0531ce1d
XL
3869 let e = _mm256_setr_epi16(
3870 0, 1, 2, 3, 4, 5, 6, 7,
3871 8, 9, 10, 11, 12, 13, 14, 0,
3872 );
3873 assert_eq_m256i(r, e);
3874 }
3875
83c7162d 3876 #[simd_test(enable = "avx")]
0531ce1d
XL
3877 unsafe fn test_mm256_insert_epi32() {
3878 let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
17df50a5 3879 let r = _mm256_insert_epi32::<7>(a, 0);
0531ce1d
XL
3880 let e = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
3881 assert_eq_m256i(r, e);
3882 }
3883
83c7162d 3884 #[simd_test(enable = "avx")]
0531ce1d
XL
3885 unsafe fn test_mm256_load_pd() {
3886 let a = _mm256_setr_pd(1., 2., 3., 4.);
3887 let p = &a as *const _ as *const f64;
3888 let r = _mm256_load_pd(p);
3889 let e = _mm256_setr_pd(1., 2., 3., 4.);
3890 assert_eq_m256d(r, e);
3891 }
3892
83c7162d 3893 #[simd_test(enable = "avx")]
0531ce1d
XL
3894 unsafe fn test_mm256_store_pd() {
3895 let a = _mm256_setr_pd(1., 2., 3., 4.);
3896 let mut r = _mm256_undefined_pd();
3897 _mm256_store_pd(&mut r as *mut _ as *mut f64, a);
3898 assert_eq_m256d(r, a);
3899 }
3900
83c7162d 3901 #[simd_test(enable = "avx")]
0531ce1d
XL
3902 unsafe fn test_mm256_load_ps() {
3903 let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3904 let p = &a as *const _ as *const f32;
3905 let r = _mm256_load_ps(p);
3906 let e = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3907 assert_eq_m256(r, e);
3908 }
3909
83c7162d 3910 #[simd_test(enable = "avx")]
0531ce1d
XL
3911 unsafe fn test_mm256_store_ps() {
3912 let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3913 let mut r = _mm256_undefined_ps();
3914 _mm256_store_ps(&mut r as *mut _ as *mut f32, a);
3915 assert_eq_m256(r, a);
3916 }
3917
83c7162d 3918 #[simd_test(enable = "avx")]
0531ce1d
XL
3919 unsafe fn test_mm256_loadu_pd() {
3920 let a = &[1.0f64, 2., 3., 4.];
3921 let p = a.as_ptr();
3922 let r = _mm256_loadu_pd(black_box(p));
3923 let e = _mm256_setr_pd(1., 2., 3., 4.);
3924 assert_eq_m256d(r, e);
3925 }
3926
83c7162d 3927 #[simd_test(enable = "avx")]
0531ce1d
XL
3928 unsafe fn test_mm256_storeu_pd() {
3929 let a = _mm256_set1_pd(9.);
3930 let mut r = _mm256_undefined_pd();
3931 _mm256_storeu_pd(&mut r as *mut _ as *mut f64, a);
3932 assert_eq_m256d(r, a);
3933 }
3934
83c7162d 3935 #[simd_test(enable = "avx")]
0531ce1d
XL
3936 unsafe fn test_mm256_loadu_ps() {
3937 let a = &[4., 3., 2., 5., 8., 9., 64., 50.];
3938 let p = a.as_ptr();
3939 let r = _mm256_loadu_ps(black_box(p));
3940 let e = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3941 assert_eq_m256(r, e);
3942 }
3943
83c7162d 3944 #[simd_test(enable = "avx")]
0531ce1d
XL
3945 unsafe fn test_mm256_storeu_ps() {
3946 let a = _mm256_set1_ps(9.);
3947 let mut r = _mm256_undefined_ps();
3948 _mm256_storeu_ps(&mut r as *mut _ as *mut f32, a);
3949 assert_eq_m256(r, a);
3950 }
3951
83c7162d 3952 #[simd_test(enable = "avx")]
0531ce1d
XL
3953 unsafe fn test_mm256_load_si256() {
3954 let a = _mm256_setr_epi64x(1, 2, 3, 4);
3955 let p = &a as *const _;
3956 let r = _mm256_load_si256(p);
3957 let e = _mm256_setr_epi64x(1, 2, 3, 4);
3958 assert_eq_m256i(r, e);
3959 }
3960
83c7162d 3961 #[simd_test(enable = "avx")]
0531ce1d
XL
3962 unsafe fn test_mm256_store_si256() {
3963 let a = _mm256_setr_epi64x(1, 2, 3, 4);
3964 let mut r = _mm256_undefined_si256();
3965 _mm256_store_si256(&mut r as *mut _, a);
3966 assert_eq_m256i(r, a);
3967 }
3968
83c7162d 3969 #[simd_test(enable = "avx")]
0531ce1d
XL
3970 unsafe fn test_mm256_loadu_si256() {
3971 let a = _mm256_setr_epi64x(1, 2, 3, 4);
3972 let p = &a as *const _;
3973 let r = _mm256_loadu_si256(black_box(p));
3974 let e = _mm256_setr_epi64x(1, 2, 3, 4);
3975 assert_eq_m256i(r, e);
3976 }
3977
83c7162d 3978 #[simd_test(enable = "avx")]
0531ce1d
XL
3979 unsafe fn test_mm256_storeu_si256() {
3980 let a = _mm256_set1_epi8(9);
3981 let mut r = _mm256_undefined_si256();
3982 _mm256_storeu_si256(&mut r as *mut _, a);
3983 assert_eq_m256i(r, a);
3984 }
3985
83c7162d 3986 #[simd_test(enable = "avx")]
0531ce1d
XL
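    // Masked loads read only the elements whose mask lane has its sign bit set
    // (here `!0`); the remaining elements are zeroed. The masked stores below
    // write only those elements and leave the rest of memory untouched.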
3987 unsafe fn test_mm256_maskload_pd() {
3988 let a = &[1.0f64, 2., 3., 4.];
3989 let p = a.as_ptr();
3990 let mask = _mm256_setr_epi64x(0, !0, 0, !0);
3991 let r = _mm256_maskload_pd(black_box(p), mask);
3992 let e = _mm256_setr_pd(0., 2., 0., 4.);
3993 assert_eq_m256d(r, e);
3994 }
3995
83c7162d 3996 #[simd_test(enable = "avx")]
0531ce1d
XL
3997 unsafe fn test_mm256_maskstore_pd() {
3998 let mut r = _mm256_set1_pd(0.);
3999 let mask = _mm256_setr_epi64x(0, !0, 0, !0);
4000 let a = _mm256_setr_pd(1., 2., 3., 4.);
4001 _mm256_maskstore_pd(&mut r as *mut _ as *mut f64, mask, a);
4002 let e = _mm256_setr_pd(0., 2., 0., 4.);
4003 assert_eq_m256d(r, e);
4004 }
4005
83c7162d 4006 #[simd_test(enable = "avx")]
0531ce1d
XL
4007 unsafe fn test_mm_maskload_pd() {
4008 let a = &[1.0f64, 2.];
4009 let p = a.as_ptr();
4010 let mask = _mm_setr_epi64x(0, !0);
4011 let r = _mm_maskload_pd(black_box(p), mask);
4012 let e = _mm_setr_pd(0., 2.);
4013 assert_eq_m128d(r, e);
4014 }
4015
83c7162d 4016 #[simd_test(enable = "avx")]
0531ce1d
XL
4017 unsafe fn test_mm_maskstore_pd() {
4018 let mut r = _mm_set1_pd(0.);
4019 let mask = _mm_setr_epi64x(0, !0);
4020 let a = _mm_setr_pd(1., 2.);
4021 _mm_maskstore_pd(&mut r as *mut _ as *mut f64, mask, a);
4022 let e = _mm_setr_pd(0., 2.);
4023 assert_eq_m128d(r, e);
4024 }
4025
83c7162d 4026 #[simd_test(enable = "avx")]
0531ce1d
XL
4027 unsafe fn test_mm256_maskload_ps() {
4028 let a = &[1.0f32, 2., 3., 4., 5., 6., 7., 8.];
4029 let p = a.as_ptr();
4030 let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0);
4031 let r = _mm256_maskload_ps(black_box(p), mask);
4032 let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.);
4033 assert_eq_m256(r, e);
4034 }
4035
83c7162d 4036 #[simd_test(enable = "avx")]
0531ce1d
XL
4037 unsafe fn test_mm256_maskstore_ps() {
4038 let mut r = _mm256_set1_ps(0.);
4039 let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0);
4040 let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4041 _mm256_maskstore_ps(&mut r as *mut _ as *mut f32, mask, a);
4042 let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.);
4043 assert_eq_m256(r, e);
4044 }
4045
83c7162d 4046 #[simd_test(enable = "avx")]
0531ce1d
XL
4047 unsafe fn test_mm_maskload_ps() {
4048 let a = &[1.0f32, 2., 3., 4.];
4049 let p = a.as_ptr();
4050 let mask = _mm_setr_epi32(0, !0, 0, !0);
4051 let r = _mm_maskload_ps(black_box(p), mask);
4052 let e = _mm_setr_ps(0., 2., 0., 4.);
4053 assert_eq_m128(r, e);
4054 }
4055
83c7162d 4056 #[simd_test(enable = "avx")]
0531ce1d
XL
4057 unsafe fn test_mm_maskstore_ps() {
4058 let mut r = _mm_set1_ps(0.);
4059 let mask = _mm_setr_epi32(0, !0, 0, !0);
4060 let a = _mm_setr_ps(1., 2., 3., 4.);
4061 _mm_maskstore_ps(&mut r as *mut _ as *mut f32, mask, a);
4062 let e = _mm_setr_ps(0., 2., 0., 4.);
4063 assert_eq_m128(r, e);
4064 }
4065
83c7162d 4066 #[simd_test(enable = "avx")]
0531ce1d
XL
4067 unsafe fn test_mm256_movehdup_ps() {
4068 let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4069 let r = _mm256_movehdup_ps(a);
4070 let e = _mm256_setr_ps(2., 2., 4., 4., 6., 6., 8., 8.);
4071 assert_eq_m256(r, e);
4072 }
4073
83c7162d 4074 #[simd_test(enable = "avx")]
0531ce1d
XL
4075 unsafe fn test_mm256_moveldup_ps() {
4076 let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4077 let r = _mm256_moveldup_ps(a);
4078 let e = _mm256_setr_ps(1., 1., 3., 3., 5., 5., 7., 7.);
4079 assert_eq_m256(r, e);
4080 }
4081
83c7162d 4082 #[simd_test(enable = "avx")]
0531ce1d
XL
4083 unsafe fn test_mm256_movedup_pd() {
4084 let a = _mm256_setr_pd(1., 2., 3., 4.);
4085 let r = _mm256_movedup_pd(a);
4086 let e = _mm256_setr_pd(1., 1., 3., 3.);
4087 assert_eq_m256d(r, e);
4088 }
4089
83c7162d 4090 #[simd_test(enable = "avx")]
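    // `_mm256_lddqu_si256` is an unaligned load; its result is identical to
    // `_mm256_loadu_si256`.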
0531ce1d 4091 unsafe fn test_mm256_lddqu_si256() {
0731742a 4092 #[rustfmt::skip]
0531ce1d
XL
4093 let a = _mm256_setr_epi8(
4094 1, 2, 3, 4, 5, 6, 7, 8,
4095 9, 10, 11, 12, 13, 14, 15, 16,
4096 17, 18, 19, 20, 21, 22, 23, 24,
4097 25, 26, 27, 28, 29, 30, 31, 32,
4098 );
4099 let p = &a as *const _;
4100 let r = _mm256_lddqu_si256(black_box(p));
0731742a 4101 #[rustfmt::skip]
0531ce1d
XL
4102 let e = _mm256_setr_epi8(
4103 1, 2, 3, 4, 5, 6, 7, 8,
4104 9, 10, 11, 12, 13, 14, 15, 16,
4105 17, 18, 19, 20, 21, 22, 23, 24,
4106 25, 26, 27, 28, 29, 30, 31, 32,
4107 );
4108 assert_eq_m256i(r, e);
4109 }
4110
83c7162d 4111 #[simd_test(enable = "avx")]
0531ce1d
XL
4112 unsafe fn test_mm256_stream_si256() {
4113 let a = _mm256_setr_epi64x(1, 2, 3, 4);
4114 let mut r = _mm256_undefined_si256();
4115 _mm256_stream_si256(&mut r as *mut _, a);
4116 assert_eq_m256i(r, a);
4117 }
4118
83c7162d 4119 #[simd_test(enable = "avx")]
0531ce1d
XL
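    // Non-temporal (streaming) stores require a 32-byte-aligned destination,
    // hence the #[repr(align(32))] buffers used in these tests.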
4120 unsafe fn test_mm256_stream_pd() {
4121 #[repr(align(32))]
4122 struct Memory {
4123 pub data: [f64; 4],
4124 }
4125 let a = _mm256_set1_pd(7.0);
8faf50e0 4126 let mut mem = Memory { data: [-1.0; 4] };
0531ce1d
XL
4127
4128 _mm256_stream_pd(&mut mem.data[0] as *mut f64, a);
4129 for i in 0..4 {
4130 assert_eq!(mem.data[i], get_m256d(a, i));
4131 }
4132 }
4133
83c7162d 4134 #[simd_test(enable = "avx")]
0531ce1d
XL
4135 unsafe fn test_mm256_stream_ps() {
4136 #[repr(align(32))]
4137 struct Memory {
4138 pub data: [f32; 8],
4139 }
4140 let a = _mm256_set1_ps(7.0);
8faf50e0 4141 let mut mem = Memory { data: [-1.0; 8] };
0531ce1d
XL
4142
4143 _mm256_stream_ps(&mut mem.data[0] as *mut f32, a);
4144 for i in 0..8 {
4145 assert_eq!(mem.data[i], get_m256(a, i));
4146 }
4147 }
4148
83c7162d 4149 #[simd_test(enable = "avx")]
0531ce1d
XL
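    // vrcpps/vrsqrtps return approximations (relative error at most 1.5 * 2^-12),
    // so these tests compare against reference values with a tolerance.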
4150 unsafe fn test_mm256_rcp_ps() {
4151 let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4152 let r = _mm256_rcp_ps(a);
0731742a 4153 #[rustfmt::skip]
0531ce1d
XL
4154 let e = _mm256_setr_ps(
4155 0.99975586, 0.49987793, 0.33325195, 0.24993896,
4156 0.19995117, 0.16662598, 0.14282227, 0.12496948,
4157 );
4158 let rel_err = 0.00048828125;
4159 for i in 0..8 {
4160 assert_approx_eq!(get_m256(r, i), get_m256(e, i), 2. * rel_err);
4161 }
4162 }
4163
83c7162d 4164 #[simd_test(enable = "avx")]
0531ce1d
XL
4165 unsafe fn test_mm256_rsqrt_ps() {
4166 let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4167 let r = _mm256_rsqrt_ps(a);
0731742a 4168 #[rustfmt::skip]
0531ce1d
XL
4169 let e = _mm256_setr_ps(
4170 0.99975586, 0.7069092, 0.5772705, 0.49987793,
4171 0.44714355, 0.40820313, 0.3779297, 0.3534546,
4172 );
4173 let rel_err = 0.00048828125;
4174 for i in 0..8 {
4175 assert_approx_eq!(get_m256(r, i), get_m256(e, i), 2. * rel_err);
4176 }
4177 }
4178
83c7162d 4179 #[simd_test(enable = "avx")]
0531ce1d
XL
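    // The 256-bit unpack intrinsics interleave within each 128-bit lane
    // separately, not across the whole vector.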
4180 unsafe fn test_mm256_unpackhi_pd() {
4181 let a = _mm256_setr_pd(1., 2., 3., 4.);
4182 let b = _mm256_setr_pd(5., 6., 7., 8.);
4183 let r = _mm256_unpackhi_pd(a, b);
4184 let e = _mm256_setr_pd(2., 6., 4., 8.);
4185 assert_eq_m256d(r, e);
4186 }
4187
83c7162d 4188 #[simd_test(enable = "avx")]
0531ce1d
XL
4189 unsafe fn test_mm256_unpackhi_ps() {
4190 let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4191 let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
4192 let r = _mm256_unpackhi_ps(a, b);
4193 let e = _mm256_setr_ps(3., 11., 4., 12., 7., 15., 8., 16.);
4194 assert_eq_m256(r, e);
4195 }
4196
83c7162d 4197 #[simd_test(enable = "avx")]
0531ce1d
XL
4198 unsafe fn test_mm256_unpacklo_pd() {
4199 let a = _mm256_setr_pd(1., 2., 3., 4.);
4200 let b = _mm256_setr_pd(5., 6., 7., 8.);
4201 let r = _mm256_unpacklo_pd(a, b);
4202 let e = _mm256_setr_pd(1., 5., 3., 7.);
4203 assert_eq_m256d(r, e);
4204 }
4205
83c7162d 4206 #[simd_test(enable = "avx")]
0531ce1d
XL
4207 unsafe fn test_mm256_unpacklo_ps() {
4208 let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4209 let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
4210 let r = _mm256_unpacklo_ps(a, b);
4211 let e = _mm256_setr_ps(1., 9., 2., 10., 5., 13., 6., 14.);
4212 assert_eq_m256(r, e);
4213 }
4214
83c7162d 4215 #[simd_test(enable = "avx")]
0531ce1d
XL
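    // vptest: testz returns 1 iff (a AND b) is all zeros (ZF), testc returns 1 iff
    // ((NOT a) AND b) is all zeros (CF), and testnzc returns 1 iff neither of
    // those is all zeros.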
4216 unsafe fn test_mm256_testz_si256() {
4217 let a = _mm256_setr_epi64x(1, 2, 3, 4);
4218 let b = _mm256_setr_epi64x(5, 6, 7, 8);
4219 let r = _mm256_testz_si256(a, b);
4220 assert_eq!(r, 0);
4221 let b = _mm256_set1_epi64x(0);
4222 let r = _mm256_testz_si256(a, b);
4223 assert_eq!(r, 1);
4224 }
4225
83c7162d 4226 #[simd_test(enable = "avx")]
0531ce1d
XL
4227 unsafe fn test_mm256_testc_si256() {
4228 let a = _mm256_setr_epi64x(1, 2, 3, 4);
4229 let b = _mm256_setr_epi64x(5, 6, 7, 8);
4230 let r = _mm256_testc_si256(a, b);
4231 assert_eq!(r, 0);
4232 let b = _mm256_set1_epi64x(0);
4233 let r = _mm256_testc_si256(a, b);
4234 assert_eq!(r, 1);
4235 }
4236
83c7162d 4237 #[simd_test(enable = "avx")]
0531ce1d
XL
4238 unsafe fn test_mm256_testnzc_si256() {
4239 let a = _mm256_setr_epi64x(1, 2, 3, 4);
4240 let b = _mm256_setr_epi64x(5, 6, 7, 8);
4241 let r = _mm256_testnzc_si256(a, b);
4242 assert_eq!(r, 1);
4243 let a = _mm256_setr_epi64x(0, 0, 0, 0);
4244 let b = _mm256_setr_epi64x(0, 0, 0, 0);
4245 let r = _mm256_testnzc_si256(a, b);
4246 assert_eq!(r, 0);
4247 }
4248
83c7162d 4249 #[simd_test(enable = "avx")]
0531ce1d
XL
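    // The vtestpd/vtestps variants perform the same ZF/CF tests but look only at
    // the sign bit of each element.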
4250 unsafe fn test_mm256_testz_pd() {
4251 let a = _mm256_setr_pd(1., 2., 3., 4.);
4252 let b = _mm256_setr_pd(5., 6., 7., 8.);
4253 let r = _mm256_testz_pd(a, b);
4254 assert_eq!(r, 1);
4255 let a = _mm256_set1_pd(-1.);
4256 let r = _mm256_testz_pd(a, a);
4257 assert_eq!(r, 0);
4258 }
4259
83c7162d 4260 #[simd_test(enable = "avx")]
0531ce1d
XL
4261 unsafe fn test_mm256_testc_pd() {
4262 let a = _mm256_setr_pd(1., 2., 3., 4.);
4263 let b = _mm256_setr_pd(5., 6., 7., 8.);
4264 let r = _mm256_testc_pd(a, b);
4265 assert_eq!(r, 1);
4266 let a = _mm256_set1_pd(1.);
4267 let b = _mm256_set1_pd(-1.);
4268 let r = _mm256_testc_pd(a, b);
4269 assert_eq!(r, 0);
4270 }
4271
83c7162d 4272 #[simd_test(enable = "avx")]
0531ce1d
XL
4273 unsafe fn test_mm256_testnzc_pd() {
4274 let a = _mm256_setr_pd(1., 2., 3., 4.);
4275 let b = _mm256_setr_pd(5., 6., 7., 8.);
4276 let r = _mm256_testnzc_pd(a, b);
4277 assert_eq!(r, 0);
4278 let a = _mm256_setr_pd(1., -1., -1., -1.);
4279 let b = _mm256_setr_pd(-1., -1., 1., 1.);
4280 let r = _mm256_testnzc_pd(a, b);
4281 assert_eq!(r, 1);
4282 }
4283
83c7162d 4284 #[simd_test(enable = "avx")]
0531ce1d
XL
4285 unsafe fn test_mm_testz_pd() {
4286 let a = _mm_setr_pd(1., 2.);
4287 let b = _mm_setr_pd(5., 6.);
4288 let r = _mm_testz_pd(a, b);
4289 assert_eq!(r, 1);
4290 let a = _mm_set1_pd(-1.);
4291 let r = _mm_testz_pd(a, a);
4292 assert_eq!(r, 0);
4293 }
4294
83c7162d 4295 #[simd_test(enable = "avx")]
0531ce1d
XL
4296 unsafe fn test_mm_testc_pd() {
4297 let a = _mm_setr_pd(1., 2.);
4298 let b = _mm_setr_pd(5., 6.);
4299 let r = _mm_testc_pd(a, b);
4300 assert_eq!(r, 1);
4301 let a = _mm_set1_pd(1.);
4302 let b = _mm_set1_pd(-1.);
4303 let r = _mm_testc_pd(a, b);
4304 assert_eq!(r, 0);
4305 }
4306
83c7162d 4307 #[simd_test(enable = "avx")]
0531ce1d
XL
4308 unsafe fn test_mm_testnzc_pd() {
4309 let a = _mm_setr_pd(1., 2.);
4310 let b = _mm_setr_pd(5., 6.);
4311 let r = _mm_testnzc_pd(a, b);
4312 assert_eq!(r, 0);
4313 let a = _mm_setr_pd(1., -1.);
4314 let b = _mm_setr_pd(-1., -1.);
4315 let r = _mm_testnzc_pd(a, b);
4316 assert_eq!(r, 1);
4317 }
4318
83c7162d 4319 #[simd_test(enable = "avx")]
0531ce1d
XL
4320 unsafe fn test_mm256_testz_ps() {
4321 let a = _mm256_set1_ps(1.);
4322 let r = _mm256_testz_ps(a, a);
4323 assert_eq!(r, 1);
4324 let a = _mm256_set1_ps(-1.);
4325 let r = _mm256_testz_ps(a, a);
4326 assert_eq!(r, 0);
4327 }
4328
83c7162d 4329 #[simd_test(enable = "avx")]
0531ce1d
XL
4330 unsafe fn test_mm256_testc_ps() {
4331 let a = _mm256_set1_ps(1.);
4332 let r = _mm256_testc_ps(a, a);
4333 assert_eq!(r, 1);
4334 let b = _mm256_set1_ps(-1.);
4335 let r = _mm256_testc_ps(a, b);
4336 assert_eq!(r, 0);
4337 }
4338
83c7162d 4339 #[simd_test(enable = "avx")]
0531ce1d
XL
4340 unsafe fn test_mm256_testnzc_ps() {
4341 let a = _mm256_set1_ps(1.);
4342 let r = _mm256_testnzc_ps(a, a);
4343 assert_eq!(r, 0);
4344 let a = _mm256_setr_ps(1., -1., -1., -1., -1., -1., -1., -1.);
4345 let b = _mm256_setr_ps(-1., -1., 1., 1., 1., 1., 1., 1.);
4346 let r = _mm256_testnzc_ps(a, b);
4347 assert_eq!(r, 1);
4348 }
4349
83c7162d 4350 #[simd_test(enable = "avx")]
0531ce1d
XL
4351 unsafe fn test_mm_testz_ps() {
4352 let a = _mm_set1_ps(1.);
4353 let r = _mm_testz_ps(a, a);
4354 assert_eq!(r, 1);
4355 let a = _mm_set1_ps(-1.);
4356 let r = _mm_testz_ps(a, a);
4357 assert_eq!(r, 0);
4358 }
4359
83c7162d 4360 #[simd_test(enable = "avx")]
0531ce1d
XL
4361 unsafe fn test_mm_testc_ps() {
4362 let a = _mm_set1_ps(1.);
4363 let r = _mm_testc_ps(a, a);
4364 assert_eq!(r, 1);
4365 let b = _mm_set1_ps(-1.);
4366 let r = _mm_testc_ps(a, b);
4367 assert_eq!(r, 0);
4368 }
4369
83c7162d 4370 #[simd_test(enable = "avx")]
0531ce1d
XL
4371 unsafe fn test_mm_testnzc_ps() {
4372 let a = _mm_set1_ps(1.);
4373 let r = _mm_testnzc_ps(a, a);
4374 assert_eq!(r, 0);
4375 let a = _mm_setr_ps(1., -1., -1., -1.);
4376 let b = _mm_setr_ps(-1., -1., 1., 1.);
4377 let r = _mm_testnzc_ps(a, b);
4378 assert_eq!(r, 1);
4379 }
4380
83c7162d 4381 #[simd_test(enable = "avx")]
0531ce1d
XL
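    // vmovmskpd collects the sign bit of each element into the low bits of the
    // result; elements 1 and 3 are negative here, giving 0b1010 = 0xA.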
4382 unsafe fn test_mm256_movemask_pd() {
4383 let a = _mm256_setr_pd(1., -2., 3., -4.);
4384 let r = _mm256_movemask_pd(a);
4385 assert_eq!(r, 0xA);
4386 }
4387
83c7162d 4388 #[simd_test(enable = "avx")]
0531ce1d
XL
4389 unsafe fn test_mm256_movemask_ps() {
4390 let a = _mm256_setr_ps(1., -2., 3., -4., 1., -2., 3., -4.);
4391 let r = _mm256_movemask_ps(a);
4392 assert_eq!(r, 0xAA);
4393 }
4394
83c7162d 4395 #[simd_test(enable = "avx")]
0531ce1d
XL
4396 unsafe fn test_mm256_setzero_pd() {
4397 let r = _mm256_setzero_pd();
4398 assert_eq_m256d(r, _mm256_set1_pd(0.));
4399 }
4400
83c7162d 4401 #[simd_test(enable = "avx")]
0531ce1d
XL
4402 unsafe fn test_mm256_setzero_ps() {
4403 let r = _mm256_setzero_ps();
4404 assert_eq_m256(r, _mm256_set1_ps(0.));
4405 }
4406
83c7162d 4407 #[simd_test(enable = "avx")]
0531ce1d
XL
4408 unsafe fn test_mm256_setzero_si256() {
4409 let r = _mm256_setzero_si256();
4410 assert_eq_m256i(r, _mm256_set1_epi8(0));
4411 }
4412
83c7162d 4413 #[simd_test(enable = "avx")]
0531ce1d
XL
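    // _mm256_set_* takes arguments from the highest element down to the lowest,
    // while _mm256_setr_* ("reversed") takes them from lowest to highest.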
4414 unsafe fn test_mm256_set_pd() {
4415 let r = _mm256_set_pd(1., 2., 3., 4.);
4416 assert_eq_m256d(r, _mm256_setr_pd(4., 3., 2., 1.));
4417 }
4418
83c7162d 4419 #[simd_test(enable = "avx")]
0531ce1d
XL
4420 unsafe fn test_mm256_set_ps() {
4421 let r = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
8faf50e0 4422 assert_eq_m256(r, _mm256_setr_ps(8., 7., 6., 5., 4., 3., 2., 1.));
0531ce1d
XL
4423 }
4424
83c7162d 4425 #[simd_test(enable = "avx")]
0531ce1d 4426 unsafe fn test_mm256_set_epi8() {
0731742a 4427 #[rustfmt::skip]
0531ce1d
XL
4428 let r = _mm256_set_epi8(
4429 1, 2, 3, 4, 5, 6, 7, 8,
4430 9, 10, 11, 12, 13, 14, 15, 16,
4431 17, 18, 19, 20, 21, 22, 23, 24,
4432 25, 26, 27, 28, 29, 30, 31, 32,
4433 );
0731742a 4434 #[rustfmt::skip]
0531ce1d
XL
4435 let e = _mm256_setr_epi8(
4436 32, 31, 30, 29, 28, 27, 26, 25,
4437 24, 23, 22, 21, 20, 19, 18, 17,
4438 16, 15, 14, 13, 12, 11, 10, 9,
4439 8, 7, 6, 5, 4, 3, 2, 1
4440 );
4441 assert_eq_m256i(r, e);
4442 }
4443
83c7162d 4444 #[simd_test(enable = "avx")]
0531ce1d 4445 unsafe fn test_mm256_set_epi16() {
0731742a 4446 #[rustfmt::skip]
0531ce1d
XL
4447 let r = _mm256_set_epi16(
4448 1, 2, 3, 4, 5, 6, 7, 8,
4449 9, 10, 11, 12, 13, 14, 15, 16,
4450 );
0731742a 4451 #[rustfmt::skip]
0531ce1d
XL
4452 let e = _mm256_setr_epi16(
4453 16, 15, 14, 13, 12, 11, 10, 9, 8,
4454 7, 6, 5, 4, 3, 2, 1,
4455 );
4456 assert_eq_m256i(r, e);
4457 }
4458
83c7162d 4459 #[simd_test(enable = "avx")]
0531ce1d
XL
4460 unsafe fn test_mm256_set_epi32() {
4461 let r = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4462 assert_eq_m256i(r, _mm256_setr_epi32(8, 7, 6, 5, 4, 3, 2, 1));
4463 }
4464
83c7162d 4465 #[simd_test(enable = "avx")]
0531ce1d
XL
4466 unsafe fn test_mm256_set_epi64x() {
4467 let r = _mm256_set_epi64x(1, 2, 3, 4);
4468 assert_eq_m256i(r, _mm256_setr_epi64x(4, 3, 2, 1));
4469 }
4470
83c7162d 4471 #[simd_test(enable = "avx")]
0531ce1d
XL
4472 unsafe fn test_mm256_setr_pd() {
4473 let r = _mm256_setr_pd(1., 2., 3., 4.);
4474 assert_eq_m256d(r, _mm256_setr_pd(1., 2., 3., 4.));
4475 }
4476
83c7162d 4477 #[simd_test(enable = "avx")]
0531ce1d
XL
4478 unsafe fn test_mm256_setr_ps() {
4479 let r = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
8faf50e0 4480 assert_eq_m256(r, _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.));
0531ce1d
XL
4481 }
4482
83c7162d 4483 #[simd_test(enable = "avx")]
0531ce1d 4484 unsafe fn test_mm256_setr_epi8() {
0731742a 4485 #[rustfmt::skip]
0531ce1d
XL
4486 let r = _mm256_setr_epi8(
4487 1, 2, 3, 4, 5, 6, 7, 8,
4488 9, 10, 11, 12, 13, 14, 15, 16,
4489 17, 18, 19, 20, 21, 22, 23, 24,
4490 25, 26, 27, 28, 29, 30, 31, 32,
4491 );
0731742a 4492 #[rustfmt::skip]
0531ce1d
XL
4493 let e = _mm256_setr_epi8(
4494 1, 2, 3, 4, 5, 6, 7, 8,
4495 9, 10, 11, 12, 13, 14, 15, 16,
4496 17, 18, 19, 20, 21, 22, 23, 24,
4497 25, 26, 27, 28, 29, 30, 31, 32
4498 );
4499
4500 assert_eq_m256i(r, e);
4501 }
4502
83c7162d 4503 #[simd_test(enable = "avx")]
0531ce1d 4504 unsafe fn test_mm256_setr_epi16() {
0731742a 4505 #[rustfmt::skip]
0531ce1d
XL
4506 let r = _mm256_setr_epi16(
4507 1, 2, 3, 4, 5, 6, 7, 8,
4508 9, 10, 11, 12, 13, 14, 15, 16,
4509 );
0731742a 4510 #[rustfmt::skip]
0531ce1d
XL
4511 let e = _mm256_setr_epi16(
4512 1, 2, 3, 4, 5, 6, 7, 8,
4513 9, 10, 11, 12, 13, 14, 15, 16,
4514 );
4515 assert_eq_m256i(r, e);
4516 }
4517
83c7162d 4518 #[simd_test(enable = "avx")]
0531ce1d
XL
4519 unsafe fn test_mm256_setr_epi32() {
4520 let r = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4521 assert_eq_m256i(r, _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8));
4522 }
4523
83c7162d 4524 #[simd_test(enable = "avx")]
0531ce1d
XL
4525 unsafe fn test_mm256_setr_epi64x() {
4526 let r = _mm256_setr_epi64x(1, 2, 3, 4);
4527 assert_eq_m256i(r, _mm256_setr_epi64x(1, 2, 3, 4));
4528 }
4529
83c7162d 4530 #[simd_test(enable = "avx")]
0531ce1d
XL
4531 unsafe fn test_mm256_set1_pd() {
4532 let r = _mm256_set1_pd(1.);
4533 assert_eq_m256d(r, _mm256_set1_pd(1.));
4534 }
4535
83c7162d 4536 #[simd_test(enable = "avx")]
0531ce1d
XL
4537 unsafe fn test_mm256_set1_ps() {
4538 let r = _mm256_set1_ps(1.);
4539 assert_eq_m256(r, _mm256_set1_ps(1.));
4540 }
4541
83c7162d 4542 #[simd_test(enable = "avx")]
0531ce1d
XL
4543 unsafe fn test_mm256_set1_epi8() {
4544 let r = _mm256_set1_epi8(1);
4545 assert_eq_m256i(r, _mm256_set1_epi8(1));
4546 }
4547
83c7162d 4548 #[simd_test(enable = "avx")]
0531ce1d
XL
4549 unsafe fn test_mm256_set1_epi16() {
4550 let r = _mm256_set1_epi16(1);
4551 assert_eq_m256i(r, _mm256_set1_epi16(1));
4552 }
4553
83c7162d 4554 #[simd_test(enable = "avx")]
0531ce1d
XL
4555 unsafe fn test_mm256_set1_epi32() {
4556 let r = _mm256_set1_epi32(1);
4557 assert_eq_m256i(r, _mm256_set1_epi32(1));
4558 }
4559
83c7162d 4560 #[simd_test(enable = "avx")]
0531ce1d
XL
4561 unsafe fn test_mm256_set1_epi64x() {
4562 let r = _mm256_set1_epi64x(1);
4563 assert_eq_m256i(r, _mm256_set1_epi64x(1));
4564 }
4565
83c7162d 4566 #[simd_test(enable = "avx")]
0531ce1d
XL
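    // The cast intrinsics reinterpret bits without any conversion: 1.0f64 is
    // 0x3FF0_0000_0000_0000, whose two 32-bit halves read as 0.0f32 and 1.875f32.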
4567 unsafe fn test_mm256_castpd_ps() {
4568 let a = _mm256_setr_pd(1., 2., 3., 4.);
4569 let r = _mm256_castpd_ps(a);
4570 let e = _mm256_setr_ps(0., 1.875, 0., 2., 0., 2.125, 0., 2.25);
4571 assert_eq_m256(r, e);
4572 }
4573
83c7162d 4574 #[simd_test(enable = "avx")]
0531ce1d
XL
4575 unsafe fn test_mm256_castps_pd() {
4576 let a = _mm256_setr_ps(0., 1.875, 0., 2., 0., 2.125, 0., 2.25);
4577 let r = _mm256_castps_pd(a);
4578 let e = _mm256_setr_pd(1., 2., 3., 4.);
4579 assert_eq_m256d(r, e);
4580 }
4581
83c7162d 4582 #[simd_test(enable = "avx")]
0531ce1d
XL
4583 unsafe fn test_mm256_castps_si256() {
4584 let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4585 let r = _mm256_castps_si256(a);
0731742a 4586 #[rustfmt::skip]
0531ce1d
XL
4587 let e = _mm256_setr_epi8(
4588 0, 0, -128, 63, 0, 0, 0, 64,
4589 0, 0, 64, 64, 0, 0, -128, 64,
4590 0, 0, -96, 64, 0, 0, -64, 64,
4591 0, 0, -32, 64, 0, 0, 0, 65,
4592 );
4593 assert_eq_m256i(r, e);
4594 }
4595
83c7162d 4596 #[simd_test(enable = "avx")]
0531ce1d 4597 unsafe fn test_mm256_castsi256_ps() {
0731742a 4598 #[rustfmt::skip]
0531ce1d
XL
4599 let a = _mm256_setr_epi8(
4600 0, 0, -128, 63, 0, 0, 0, 64,
4601 0, 0, 64, 64, 0, 0, -128, 64,
4602 0, 0, -96, 64, 0, 0, -64, 64,
4603 0, 0, -32, 64, 0, 0, 0, 65,
4604 );
4605 let r = _mm256_castsi256_ps(a);
4606 let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4607 assert_eq_m256(r, e);
4608 }
4609
83c7162d 4610 #[simd_test(enable = "avx")]
0531ce1d
XL
4611 unsafe fn test_mm256_castpd_si256() {
4612 let a = _mm256_setr_pd(1., 2., 3., 4.);
4613 let r = _mm256_castpd_si256(a);
532ac7d7 4614 assert_eq_m256d(transmute(r), a);
0531ce1d
XL
4615 }
4616
83c7162d 4617 #[simd_test(enable = "avx")]
0531ce1d
XL
4618 unsafe fn test_mm256_castsi256_pd() {
4619 let a = _mm256_setr_epi64x(1, 2, 3, 4);
4620 let r = _mm256_castsi256_pd(a);
532ac7d7 4621 assert_eq_m256d(r, transmute(a));
0531ce1d
XL
4622 }
4623
83c7162d 4624 #[simd_test(enable = "avx")]
0531ce1d
XL
4625 unsafe fn test_mm256_castps256_ps128() {
4626 let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4627 let r = _mm256_castps256_ps128(a);
4628 assert_eq_m128(r, _mm_setr_ps(1., 2., 3., 4.));
4629 }
4630
83c7162d 4631 #[simd_test(enable = "avx")]
0531ce1d
XL
4632 unsafe fn test_mm256_castpd256_pd128() {
4633 let a = _mm256_setr_pd(1., 2., 3., 4.);
4634 let r = _mm256_castpd256_pd128(a);
4635 assert_eq_m128d(r, _mm_setr_pd(1., 2.));
4636 }
4637
83c7162d 4638 #[simd_test(enable = "avx")]
0531ce1d
XL
4639 unsafe fn test_mm256_castsi256_si128() {
4640 let a = _mm256_setr_epi64x(1, 2, 3, 4);
4641 let r = _mm256_castsi256_si128(a);
4642 assert_eq_m128i(r, _mm_setr_epi64x(1, 2));
4643 }
4644
83c7162d 4645 #[simd_test(enable = "avx")]
0531ce1d
XL
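    // The zext intrinsics widen a 128-bit vector to 256 bits with the upper lane
    // zeroed (unlike the cast intrinsics, which leave it undefined).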
4646 unsafe fn test_mm256_zextps128_ps256() {
4647 let a = _mm_setr_ps(1., 2., 3., 4.);
4648 let r = _mm256_zextps128_ps256(a);
4649 let e = _mm256_setr_ps(1., 2., 3., 4., 0., 0., 0., 0.);
4650 assert_eq_m256(r, e);
4651 }
4652
83c7162d 4653 #[simd_test(enable = "avx")]
0531ce1d
XL
4654 unsafe fn test_mm256_zextsi128_si256() {
4655 let a = _mm_setr_epi64x(1, 2);
4656 let r = _mm256_zextsi128_si256(a);
4657 let e = _mm256_setr_epi64x(1, 2, 0, 0);
4658 assert_eq_m256i(r, e);
4659 }
4660
83c7162d 4661 #[simd_test(enable = "avx")]
0531ce1d
XL
4662 unsafe fn test_mm256_zextpd128_pd256() {
4663 let a = _mm_setr_pd(1., 2.);
4664 let r = _mm256_zextpd128_pd256(a);
4665 let e = _mm256_setr_pd(1., 2., 0., 0.);
4666 assert_eq_m256d(r, e);
4667 }
4668
83c7162d 4669 #[simd_test(enable = "avx")]
0531ce1d
XL
4670 unsafe fn test_mm256_set_m128() {
4671 let hi = _mm_setr_ps(5., 6., 7., 8.);
4672 let lo = _mm_setr_ps(1., 2., 3., 4.);
4673 let r = _mm256_set_m128(hi, lo);
4674 let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4675 assert_eq_m256(r, e);
4676 }
4677
83c7162d 4678 #[simd_test(enable = "avx")]
0531ce1d
XL
4679 unsafe fn test_mm256_set_m128d() {
4680 let hi = _mm_setr_pd(3., 4.);
4681 let lo = _mm_setr_pd(1., 2.);
4682 let r = _mm256_set_m128d(hi, lo);
4683 let e = _mm256_setr_pd(1., 2., 3., 4.);
4684 assert_eq_m256d(r, e);
4685 }
4686
83c7162d 4687 #[simd_test(enable = "avx")]
0531ce1d 4688 unsafe fn test_mm256_set_m128i() {
0731742a 4689 #[rustfmt::skip]
0531ce1d
XL
4690 let hi = _mm_setr_epi8(
4691 17, 18, 19, 20,
4692 21, 22, 23, 24,
4693 25, 26, 27, 28,
4694 29, 30, 31, 32,
4695 );
0731742a 4696 #[rustfmt::skip]
0531ce1d
XL
4697 let lo = _mm_setr_epi8(
4698 1, 2, 3, 4,
4699 5, 6, 7, 8,
4700 9, 10, 11, 12,
4701 13, 14, 15, 16,
4702 );
4703 let r = _mm256_set_m128i(hi, lo);
0731742a 4704 #[rustfmt::skip]
0531ce1d
XL
4705 let e = _mm256_setr_epi8(
4706 1, 2, 3, 4, 5, 6, 7, 8,
4707 9, 10, 11, 12, 13, 14, 15, 16,
4708 17, 18, 19, 20, 21, 22, 23, 24,
4709 25, 26, 27, 28, 29, 30, 31, 32,
4710 );
4711 assert_eq_m256i(r, e);
4712 }
4713
83c7162d 4714 #[simd_test(enable = "avx")]
0531ce1d
XL
4715 unsafe fn test_mm256_setr_m128() {
4716 let lo = _mm_setr_ps(1., 2., 3., 4.);
4717 let hi = _mm_setr_ps(5., 6., 7., 8.);
4718 let r = _mm256_setr_m128(lo, hi);
4719 let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4720 assert_eq_m256(r, e);
4721 }
4722
83c7162d 4723 #[simd_test(enable = "avx")]
0531ce1d
XL
4724 unsafe fn test_mm256_setr_m128d() {
4725 let lo = _mm_setr_pd(1., 2.);
4726 let hi = _mm_setr_pd(3., 4.);
4727 let r = _mm256_setr_m128d(lo, hi);
4728 let e = _mm256_setr_pd(1., 2., 3., 4.);
4729 assert_eq_m256d(r, e);
4730 }
4731
83c7162d 4732 #[simd_test(enable = "avx")]
0531ce1d 4733 unsafe fn test_mm256_setr_m128i() {
0731742a 4734 #[rustfmt::skip]
0531ce1d
XL
4735 let lo = _mm_setr_epi8(
4736 1, 2, 3, 4,
4737 5, 6, 7, 8,
4738 9, 10, 11, 12,
4739 13, 14, 15, 16,
4740 );
0731742a 4741 #[rustfmt::skip]
0531ce1d
XL
4742 let hi = _mm_setr_epi8(
4743 17, 18, 19, 20, 21, 22, 23, 24,
4744 25, 26, 27, 28, 29, 30, 31, 32,
4745 );
4746 let r = _mm256_setr_m128i(lo, hi);
0731742a 4747 #[rustfmt::skip]
0531ce1d
XL
4748 let e = _mm256_setr_epi8(
4749 1, 2, 3, 4, 5, 6, 7, 8,
4750 9, 10, 11, 12, 13, 14, 15, 16,
4751 17, 18, 19, 20, 21, 22, 23, 24,
4752 25, 26, 27, 28, 29, 30, 31, 32,
4753 );
4754 assert_eq_m256i(r, e);
4755 }
4756
83c7162d 4757 #[simd_test(enable = "avx")]
0531ce1d
XL
4758 unsafe fn test_mm256_loadu2_m128() {
4759 let hi = &[5., 6., 7., 8.];
4760 let hiaddr = hi.as_ptr();
4761 let lo = &[1., 2., 3., 4.];
4762 let loaddr = lo.as_ptr();
4763 let r = _mm256_loadu2_m128(hiaddr, loaddr);
4764 let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4765 assert_eq_m256(r, e);
4766 }
4767
83c7162d 4768 #[simd_test(enable = "avx")]
0531ce1d
XL
4769 unsafe fn test_mm256_loadu2_m128d() {
4770 let hi = &[3., 4.];
4771 let hiaddr = hi.as_ptr();
4772 let lo = &[1., 2.];
4773 let loaddr = lo.as_ptr();
4774 let r = _mm256_loadu2_m128d(hiaddr, loaddr);
4775 let e = _mm256_setr_pd(1., 2., 3., 4.);
4776 assert_eq_m256d(r, e);
4777 }
4778
83c7162d 4779 #[simd_test(enable = "avx")]
0531ce1d 4780 unsafe fn test_mm256_loadu2_m128i() {
0731742a 4781 #[rustfmt::skip]
0531ce1d
XL
4782 let hi = _mm_setr_epi8(
4783 17, 18, 19, 20, 21, 22, 23, 24,
4784 25, 26, 27, 28, 29, 30, 31, 32,
4785 );
0731742a 4786 #[rustfmt::skip]
0531ce1d
XL
4787 let lo = _mm_setr_epi8(
4788 1, 2, 3, 4, 5, 6, 7, 8,
4789 9, 10, 11, 12, 13, 14, 15, 16,
4790 );
0731742a
XL
4791 let r = _mm256_loadu2_m128i(&hi as *const _ as *const _, &lo as *const _ as *const _);
4792 #[rustfmt::skip]
0531ce1d
XL
4793 let e = _mm256_setr_epi8(
4794 1, 2, 3, 4, 5, 6, 7, 8,
4795 9, 10, 11, 12, 13, 14, 15, 16,
4796 17, 18, 19, 20, 21, 22, 23, 24,
4797 25, 26, 27, 28, 29, 30, 31, 32,
4798 );
4799 assert_eq_m256i(r, e);
4800 }
4801
83c7162d 4802 #[simd_test(enable = "avx")]
0531ce1d
XL
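    // storeu2 writes the lower 128 bits to `loaddr` and the upper 128 bits to
    // `hiaddr`; neither pointer needs to be aligned.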
4803 unsafe fn test_mm256_storeu2_m128() {
4804 let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4805 let mut hi = _mm_undefined_ps();
4806 let mut lo = _mm_undefined_ps();
4807 _mm256_storeu2_m128(
4808 &mut hi as *mut _ as *mut f32,
4809 &mut lo as *mut _ as *mut f32,
4810 a,
4811 );
4812 assert_eq_m128(hi, _mm_setr_ps(5., 6., 7., 8.));
4813 assert_eq_m128(lo, _mm_setr_ps(1., 2., 3., 4.));
4814 }
4815
83c7162d 4816 #[simd_test(enable = "avx")]
0531ce1d
XL
4817 unsafe fn test_mm256_storeu2_m128d() {
4818 let a = _mm256_setr_pd(1., 2., 3., 4.);
4819 let mut hi = _mm_undefined_pd();
4820 let mut lo = _mm_undefined_pd();
4821 _mm256_storeu2_m128d(
4822 &mut hi as *mut _ as *mut f64,
4823 &mut lo as *mut _ as *mut f64,
4824 a,
4825 );
4826 assert_eq_m128d(hi, _mm_setr_pd(3., 4.));
4827 assert_eq_m128d(lo, _mm_setr_pd(1., 2.));
4828 }
4829
83c7162d 4830 #[simd_test(enable = "avx")]
0531ce1d 4831 unsafe fn test_mm256_storeu2_m128i() {
0731742a 4832 #[rustfmt::skip]
0531ce1d
XL
4833 let a = _mm256_setr_epi8(
4834 1, 2, 3, 4, 5, 6, 7, 8,
4835 9, 10, 11, 12, 13, 14, 15, 16,
4836 17, 18, 19, 20, 21, 22, 23, 24,
4837 25, 26, 27, 28, 29, 30, 31, 32,
4838 );
4839 let mut hi = _mm_undefined_si128();
4840 let mut lo = _mm_undefined_si128();
4841 _mm256_storeu2_m128i(&mut hi as *mut _, &mut lo as *mut _, a);
0731742a 4842 #[rustfmt::skip]
0531ce1d
XL
4843 let e_hi = _mm_setr_epi8(
4844 17, 18, 19, 20, 21, 22, 23, 24,
4845 25, 26, 27, 28, 29, 30, 31, 32
4846 );
0731742a 4847 #[rustfmt::skip]
0531ce1d
XL
4848 let e_lo = _mm_setr_epi8(
4849 1, 2, 3, 4, 5, 6, 7, 8,
4850 9, 10, 11, 12, 13, 14, 15, 16
4851 );
4852
4853 assert_eq_m128i(hi, e_hi);
4854 assert_eq_m128i(lo, e_lo);
4855 }
4856
83c7162d 4857 #[simd_test(enable = "avx")]
0531ce1d
XL
4858 unsafe fn test_mm256_cvtss_f32() {
4859 let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4860 let r = _mm256_cvtss_f32(a);
4861 assert_eq!(r, 1.);
4862 }
4863 }