//! Advanced Vector Extensions (AVX)
//!
//! The references are:
//!
//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
//!   Instruction Set Reference, A-Z][intel64_ref].
//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
//!   System Instructions][amd64_ref].
//!
//! [Wikipedia][wiki] provides a quick overview of the instructions available.
//!
//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
//! [wiki]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions

use crate::{
    core_arch::{simd::*, simd_llvm::*, x86::*},
    intrinsics,
    mem::{self, transmute},
    ptr,
};

#[cfg(test)]
use stdarch_test::assert_instr;

/// Adds packed double-precision (64-bit) floating-point elements
/// in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_add_pd(a: __m256d, b: __m256d) -> __m256d {
    simd_add(a, b)
}

/// Adds packed single-precision (32-bit) floating-point elements in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_add_ps(a: __m256, b: __m256) -> __m256 {
    simd_add(a, b)
}

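// Illustrative sketch (not part of the upstream file): element-wise addition of
// two 256-bit vectors of `f64`. The function name is made up for the example;
// it only compiles in test builds and assumes an AVX-capable target.
#[cfg(test)]
#[target_feature(enable = "avx")]
unsafe fn _example_mm256_add_pd() {
    let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
    let b = _mm256_setr_pd(10.0, 20.0, 30.0, 40.0);
    // With `setr`, element 0 is the lowest lane.
    let sum: [f64; 4] = transmute(_mm256_add_pd(a, b));
    assert_eq!(sum, [11.0, 22.0, 33.0, 44.0]);
}
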
/// Computes the bitwise AND of packed double-precision (64-bit)
/// floating-point elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_pd)
#[inline]
#[target_feature(enable = "avx")]
// FIXME: Should be 'vandpd' instruction.
// See https://github.com/rust-lang/stdarch/issues/71
#[cfg_attr(test, assert_instr(vandps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_and_pd(a: __m256d, b: __m256d) -> __m256d {
    let a: u64x4 = transmute(a);
    let b: u64x4 = transmute(b);
    transmute(simd_and(a, b))
}

/// Computes the bitwise AND of packed single-precision (32-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vandps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_and_ps(a: __m256, b: __m256) -> __m256 {
    let a: u32x8 = transmute(a);
    let b: u32x8 = transmute(b);
    transmute(simd_and(a, b))
}

/// Computes the bitwise OR of packed double-precision (64-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_pd)
#[inline]
#[target_feature(enable = "avx")]
// FIXME: should be `vorpd` instruction.
// See <https://github.com/rust-lang/stdarch/issues/71>.
#[cfg_attr(test, assert_instr(vorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_or_pd(a: __m256d, b: __m256d) -> __m256d {
    let a: u64x4 = transmute(a);
    let b: u64x4 = transmute(b);
    transmute(simd_or(a, b))
}

/// Computes the bitwise OR of packed single-precision (32-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_or_ps(a: __m256, b: __m256) -> __m256 {
    let a: u32x8 = transmute(a);
    let b: u32x8 = transmute(b);
    transmute(simd_or(a, b))
}

/// Shuffles double-precision (64-bit) floating-point elements within 128-bit
/// lanes using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufpd, MASK = 3))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_shuffle_pd<const MASK: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_uimm_bits!(MASK, 8);
    simd_shuffle!(
        a,
        b,
        [
            MASK as u32 & 0b1,
            ((MASK as u32 >> 1) & 0b1) + 4,
            ((MASK as u32 >> 2) & 0b1) + 2,
            ((MASK as u32 >> 3) & 0b1) + 6,
        ],
    )
}

/// Shuffles single-precision (32-bit) floating-point elements in `a` within
/// 128-bit lanes using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_shuffle_ps<const MASK: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(MASK, 8);
    simd_shuffle!(
        a,
        b,
        [
            MASK as u32 & 0b11,
            (MASK as u32 >> 2) & 0b11,
            ((MASK as u32 >> 4) & 0b11) + 8,
            ((MASK as u32 >> 6) & 0b11) + 8,
            (MASK as u32 & 0b11) + 4,
            ((MASK as u32 >> 2) & 0b11) + 4,
            ((MASK as u32 >> 4) & 0b11) + 12,
            ((MASK as u32 >> 6) & 0b11) + 12,
        ],
    )
}

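// Illustrative sketch (not part of the upstream file): how the `MASK` bits of
// `_mm256_shuffle_pd` pick elements per 128-bit lane. Bits 0/2 select from `a`,
// bits 1/3 select from `b`; the function name below is made up for the example.
#[cfg(test)]
#[target_feature(enable = "avx")]
unsafe fn _example_mm256_shuffle_pd() {
    let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
    let b = _mm256_setr_pd(5.0, 6.0, 7.0, 8.0);
    // MASK = 0b0101: low lane -> [a1, b0], high lane -> [a3, b2].
    let r: [f64; 4] = transmute(_mm256_shuffle_pd::<0b0101>(a, b));
    assert_eq!(r, [2.0, 5.0, 4.0, 7.0]);
}
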
/// Computes the bitwise NOT of packed double-precision (64-bit) floating-point
/// elements in `a`, and then AND with `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_pd)
#[inline]
#[target_feature(enable = "avx")]
// FIXME: should be `vandnpd` instruction.
#[cfg_attr(test, assert_instr(vandnps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_andnot_pd(a: __m256d, b: __m256d) -> __m256d {
    let a: u64x4 = transmute(a);
    let b: u64x4 = transmute(b);
    transmute(simd_and(simd_xor(u64x4::splat(!(0_u64)), a), b))
}

/// Computes the bitwise NOT of packed single-precision (32-bit) floating-point
/// elements in `a`, and then AND with `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vandnps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 {
    let a: u32x8 = transmute(a);
    let b: u32x8 = transmute(b);
    transmute(simd_and(simd_xor(u32x8::splat(!(0_u32)), a), b))
}

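// Illustrative sketch (not part of the upstream file): a common use of
// `_mm256_andnot_pd` is clearing the sign bit, i.e. a vectorized `abs`,
// by AND-NOT-ing with `-0.0` (only the sign bit set). Name is illustrative.
#[cfg(test)]
#[target_feature(enable = "avx")]
unsafe fn _example_mm256_andnot_pd_abs() {
    let sign_mask = _mm256_set1_pd(-0.0);
    let a = _mm256_setr_pd(-1.0, 2.0, -3.5, 4.5);
    let abs: [f64; 4] = transmute(_mm256_andnot_pd(sign_mask, a));
    assert_eq!(abs, [1.0, 2.0, 3.5, 4.5]);
}
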
/// Compares packed double-precision (64-bit) floating-point elements
/// in `a` and `b`, and returns packed maximum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaxpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_max_pd(a: __m256d, b: __m256d) -> __m256d {
    vmaxpd(a, b)
}

/// Compares packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and returns packed maximum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaxps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_max_ps(a: __m256, b: __m256) -> __m256 {
    vmaxps(a, b)
}

/// Compares packed double-precision (64-bit) floating-point elements
/// in `a` and `b`, and returns packed minimum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vminpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_min_pd(a: __m256d, b: __m256d) -> __m256d {
    vminpd(a, b)
}

/// Compares packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and returns packed minimum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vminps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_min_ps(a: __m256, b: __m256) -> __m256 {
    vminps(a, b)
}

/// Multiplies packed double-precision (64-bit) floating-point elements
/// in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmulpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mul_pd(a: __m256d, b: __m256d) -> __m256d {
    simd_mul(a, b)
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a`
/// and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmulps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mul_ps(a: __m256, b: __m256) -> __m256 {
    simd_mul(a, b)
}

/// Alternately adds and subtracts packed double-precision (64-bit)
/// floating-point elements in `a` to/from packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_addsub_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d {
    addsubpd256(a, b)
}

/// Alternately adds and subtracts packed single-precision (32-bit)
/// floating-point elements in `a` to/from packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_addsub_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_addsub_ps(a: __m256, b: __m256) -> __m256 {
    addsubps256(a, b)
}

/// Subtracts packed double-precision (64-bit) floating-point elements in `b`
/// from packed elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_sub_pd(a: __m256d, b: __m256d) -> __m256d {
    simd_sub(a, b)
}

/// Subtracts packed single-precision (32-bit) floating-point elements in `b`
/// from packed elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_sub_ps(a: __m256, b: __m256) -> __m256 {
    simd_sub(a, b)
}

/// Computes the division of each of the 8 packed 32-bit floating-point elements
/// in `a` by the corresponding packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_div_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vdivps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_div_ps(a: __m256, b: __m256) -> __m256 {
    simd_div(a, b)
}

/// Computes the division of each of the 4 packed 64-bit floating-point elements
/// in `a` by the corresponding packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_div_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vdivpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_div_pd(a: __m256d, b: __m256d) -> __m256d {
    simd_div(a, b)
}

/// Rounds packed double-precision (64-bit) floating point elements in `a`
/// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows:
///
/// - `0x00`: Round to the nearest whole number.
/// - `0x01`: Round down, toward negative infinity.
/// - `0x02`: Round up, toward positive infinity.
/// - `0x03`: Truncate the values.
///
/// For a complete list of options, check [the LLVM docs][llvm_docs].
///
/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_round_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundpd, ROUNDING = 0x3))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_round_pd<const ROUNDING: i32>(a: __m256d) -> __m256d {
    static_assert_uimm_bits!(ROUNDING, 4);
    roundpd256(a, ROUNDING)
}

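// Illustrative sketch (not part of the upstream file): `ROUNDING = 0x01`
// rounds every element toward negative infinity (a vector `floor`).
// The function name is made up for the example.
#[cfg(test)]
#[target_feature(enable = "avx")]
unsafe fn _example_mm256_round_pd_down() {
    let a = _mm256_setr_pd(1.5, -1.5, 2.7, -2.7);
    let r: [f64; 4] = transmute(_mm256_round_pd::<0x01>(a));
    assert_eq!(r, [1.0, -2.0, 2.0, -3.0]);
}
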
/// Rounds packed double-precision (64-bit) floating point elements in `a`
/// toward positive infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_ceil_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_ceil_pd(a: __m256d) -> __m256d {
    simd_ceil(a)
}

/// Rounds packed double-precision (64-bit) floating point elements in `a`
/// toward negative infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_floor_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_floor_pd(a: __m256d) -> __m256d {
    simd_floor(a)
}

/// Rounds packed single-precision (32-bit) floating point elements in `a`
/// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows:
///
/// - `0x00`: Round to the nearest whole number.
/// - `0x01`: Round down, toward negative infinity.
/// - `0x02`: Round up, toward positive infinity.
/// - `0x03`: Truncate the values.
///
/// For a complete list of options, check [the LLVM docs][llvm_docs].
///
/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_round_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundps, ROUNDING = 0x00))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_round_ps<const ROUNDING: i32>(a: __m256) -> __m256 {
    static_assert_uimm_bits!(ROUNDING, 4);
    roundps256(a, ROUNDING)
}

/// Rounds packed single-precision (32-bit) floating point elements in `a`
/// toward positive infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_ceil_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_ceil_ps(a: __m256) -> __m256 {
    simd_ceil(a)
}

/// Rounds packed single-precision (32-bit) floating point elements in `a`
/// toward negative infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_floor_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_floor_ps(a: __m256) -> __m256 {
    simd_floor(a)
}

/// Returns the square root of packed single-precision (32-bit) floating point
/// elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sqrt_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsqrtps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_sqrt_ps(a: __m256) -> __m256 {
    sqrtps256(a)
}

/// Returns the square root of packed double-precision (64-bit) floating point
/// elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sqrt_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsqrtpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_sqrt_pd(a: __m256d) -> __m256d {
    simd_fsqrt(a)
}

/// Blends packed double-precision (64-bit) floating-point elements from
/// `a` and `b` using control mask `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_pd)
#[inline]
#[target_feature(enable = "avx")]
// Note: LLVM7 prefers single-precision blend instructions when
// possible, see: https://bugs.llvm.org/show_bug.cgi?id=38194
// #[cfg_attr(test, assert_instr(vblendpd, imm8 = 9))]
#[cfg_attr(test, assert_instr(vblendps, IMM4 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_blend_pd<const IMM4: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_uimm_bits!(IMM4, 4);
    simd_shuffle!(
        a,
        b,
        [
            ((IMM4 as u32 >> 0) & 1) * 4 + 0,
            ((IMM4 as u32 >> 1) & 1) * 4 + 1,
            ((IMM4 as u32 >> 2) & 1) * 4 + 2,
            ((IMM4 as u32 >> 3) & 1) * 4 + 3,
        ],
    )
}

/// Blends packed single-precision (32-bit) floating-point elements from
/// `a` and `b` using control mask `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vblendps, IMM8 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_blend_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM8, 8);
    simd_shuffle!(
        a,
        b,
        [
            ((IMM8 as u32 >> 0) & 1) * 8 + 0,
            ((IMM8 as u32 >> 1) & 1) * 8 + 1,
            ((IMM8 as u32 >> 2) & 1) * 8 + 2,
            ((IMM8 as u32 >> 3) & 1) * 8 + 3,
            ((IMM8 as u32 >> 4) & 1) * 8 + 4,
            ((IMM8 as u32 >> 5) & 1) * 8 + 5,
            ((IMM8 as u32 >> 6) & 1) * 8 + 6,
            ((IMM8 as u32 >> 7) & 1) * 8 + 7,
        ],
    )
}

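// Illustrative sketch (not part of the upstream file): each bit of `IMM4`
// chooses, per lane, the element from `b` (bit set) or `a` (bit clear).
// The function name is made up for the example.
#[cfg(test)]
#[target_feature(enable = "avx")]
unsafe fn _example_mm256_blend_pd() {
    let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
    let b = _mm256_setr_pd(5.0, 6.0, 7.0, 8.0);
    let r: [f64; 4] = transmute(_mm256_blend_pd::<0b0101>(a, b));
    assert_eq!(r, [5.0, 2.0, 7.0, 4.0]);
}
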
/// Blends packed double-precision (64-bit) floating-point elements from
/// `a` and `b` using `c` as a mask.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vblendvpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
    vblendvpd(a, b, c)
}

/// Blends packed single-precision (32-bit) floating-point elements from
/// `a` and `b` using `c` as a mask.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vblendvps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
    vblendvps(a, b, c)
}

/// Conditionally multiplies the packed single-precision (32-bit) floating-point
/// elements in `a` and `b` using the high 4 bits in `imm8`,
/// sums the four products, and conditionally returns the sum
/// using the low 4 bits of `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dp_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vdpps, IMM8 = 0x0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_dp_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM8, 8);
    vdpps(a, b, IMM8)
}

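// Illustrative sketch (not part of the upstream file): with `IMM8 = 0xFF` the
// dot product of each 128-bit half is broadcast to all four lanes of that half.
// The function name is made up for the example.
#[cfg(test)]
#[target_feature(enable = "avx")]
unsafe fn _example_mm256_dp_ps() {
    let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
    let b = _mm256_set1_ps(1.0);
    let r: [f32; 8] = transmute(_mm256_dp_ps::<0xFF>(a, b));
    // Low half: 1 + 2 + 3 + 4 = 10; high half: 5 + 6 + 7 + 8 = 26.
    assert_eq!(r, [10.0, 10.0, 10.0, 10.0, 26.0, 26.0, 26.0, 26.0]);
}
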
/// Horizontal addition of adjacent pairs in the two packed vectors
/// of 4 64-bit floating-point elements `a` and `b`.
/// In the result, sums of elements from `a` are returned in even locations,
/// while sums of elements from `b` are returned in odd locations.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhaddpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d {
    vhaddpd(a, b)
}

/// Horizontal addition of adjacent pairs in the two packed vectors
/// of 8 32-bit floating-point elements `a` and `b`.
/// In the result, sums of elements from `a` are returned in locations of
/// indices 0, 1, 4, 5; while sums of elements from `b` are returned in
/// locations 2, 3, 6, 7.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhaddps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 {
    vhaddps(a, b)
}

/// Horizontal subtraction of adjacent pairs in the two packed vectors
/// of 4 64-bit floating-point elements `a` and `b`.
/// In the result, differences of elements from `a` are returned in even
/// locations, while differences of elements from `b` are returned in odd
/// locations.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d {
    vhsubpd(a, b)
}

/// Horizontal subtraction of adjacent pairs in the two packed vectors
/// of 8 32-bit floating-point elements `a` and `b`.
/// In the result, differences of elements from `a` are returned in locations
/// of indices 0, 1, 4, 5; while differences of elements from `b` are returned
/// in locations 2, 3, 6, 7.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_hsub_ps(a: __m256, b: __m256) -> __m256 {
    vhsubps(a, b)
}

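// Illustrative sketch (not part of the upstream file): horizontal add/sub pair
// up adjacent elements, interleaving results from `a` and `b` per 128-bit lane.
// The function name is made up for the example.
#[cfg(test)]
#[target_feature(enable = "avx")]
unsafe fn _example_mm256_hadd_hsub_pd() {
    let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
    let b = _mm256_setr_pd(5.0, 6.0, 7.0, 8.0);
    let sums: [f64; 4] = transmute(_mm256_hadd_pd(a, b));
    assert_eq!(sums, [1.0 + 2.0, 5.0 + 6.0, 3.0 + 4.0, 7.0 + 8.0]);
    let diffs: [f64; 4] = transmute(_mm256_hsub_pd(a, b));
    assert_eq!(diffs, [1.0 - 2.0, 5.0 - 6.0, 3.0 - 4.0, 7.0 - 8.0]);
}
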
/// Computes the bitwise XOR of packed double-precision (64-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_pd)
#[inline]
#[target_feature(enable = "avx")]
// FIXME Should be 'vxorpd' instruction.
#[cfg_attr(test, assert_instr(vxorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_xor_pd(a: __m256d, b: __m256d) -> __m256d {
    let a: u64x4 = transmute(a);
    let b: u64x4 = transmute(b);
    transmute(simd_xor(a, b))
}

/// Computes the bitwise XOR of packed single-precision (32-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vxorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_xor_ps(a: __m256, b: __m256) -> __m256 {
    let a: u32x8 = transmute(a);
    let b: u32x8 = transmute(b);
    transmute(simd_xor(a, b))
}

/// Equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_OQ: i32 = 0x00;
/// Less-than (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LT_OS: i32 = 0x01;
/// Less-than-or-equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LE_OS: i32 = 0x02;
/// Unordered (non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_UNORD_Q: i32 = 0x03;
/// Not-equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_UQ: i32 = 0x04;
/// Not-less-than (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLT_US: i32 = 0x05;
/// Not-less-than-or-equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLE_US: i32 = 0x06;
/// Ordered (non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_ORD_Q: i32 = 0x07;
/// Equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_UQ: i32 = 0x08;
/// Not-greater-than-or-equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGE_US: i32 = 0x09;
/// Not-greater-than (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGT_US: i32 = 0x0a;
/// False (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_FALSE_OQ: i32 = 0x0b;
/// Not-equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_OQ: i32 = 0x0c;
/// Greater-than-or-equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GE_OS: i32 = 0x0d;
/// Greater-than (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GT_OS: i32 = 0x0e;
/// True (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_TRUE_UQ: i32 = 0x0f;
/// Equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_OS: i32 = 0x10;
/// Less-than (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LT_OQ: i32 = 0x11;
/// Less-than-or-equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LE_OQ: i32 = 0x12;
/// Unordered (signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_UNORD_S: i32 = 0x13;
/// Not-equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_US: i32 = 0x14;
/// Not-less-than (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLT_UQ: i32 = 0x15;
/// Not-less-than-or-equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLE_UQ: i32 = 0x16;
/// Ordered (signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_ORD_S: i32 = 0x17;
/// Equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_US: i32 = 0x18;
/// Not-greater-than-or-equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGE_UQ: i32 = 0x19;
/// Not-greater-than (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGT_UQ: i32 = 0x1a;
/// False (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_FALSE_OS: i32 = 0x1b;
/// Not-equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_OS: i32 = 0x1c;
/// Greater-than-or-equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GE_OQ: i32 = 0x1d;
/// Greater-than (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GT_OQ: i32 = 0x1e;
/// True (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_TRUE_US: i32 = 0x1f;

/// Compares packed double-precision (64-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `IMM5`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_pd)
#[inline]
#[target_feature(enable = "avx,sse2")]
#[cfg_attr(test, assert_instr(vcmpeqpd, IMM5 = 0))] // TODO Validate vcmppd
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmp_pd<const IMM5: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_uimm_bits!(IMM5, 5);
    vcmppd(a, b, IMM5 as i8)
}

/// Compares packed double-precision (64-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `IMM5`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqpd, IMM5 = 0))] // TODO Validate vcmppd
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cmp_pd<const IMM5: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_uimm_bits!(IMM5, 5);
    vcmppd256(a, b, IMM5 as u8)
}

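// Illustrative sketch (not part of the upstream file): a comparison produces a
// per-element mask of all-ones (true) or all-zeros (false), which can be turned
// into a bitmask with `_mm256_movemask_pd`. The function name is made up.
#[cfg(test)]
#[target_feature(enable = "avx")]
unsafe fn _example_mm256_cmp_pd() {
    let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
    let b = _mm256_set1_pd(2.0);
    // Only element 0 satisfies `a < b` (ordered, non-signaling).
    let lt = _mm256_cmp_pd::<_CMP_LT_OQ>(a, b);
    assert_eq!(_mm256_movemask_pd(lt), 0b0001);
}
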
/// Compares packed single-precision (32-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `IMM5`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ps)
#[inline]
#[target_feature(enable = "avx,sse")]
#[cfg_attr(test, assert_instr(vcmpeqps, IMM5 = 0))] // TODO Validate vcmpps
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmp_ps<const IMM5: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM5, 5);
    vcmpps(a, b, IMM5 as i8)
}

/// Compares packed single-precision (32-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `IMM5`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqps, IMM5 = 0))] // TODO Validate vcmpps
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cmp_ps<const IMM5: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM5, 5);
    vcmpps256(a, b, IMM5 as u8)
}

/// Compares the lower double-precision (64-bit) floating-point element in
/// `a` and `b` based on the comparison operand specified by `IMM5`,
/// stores the result in the lower element of the returned vector,
/// and copies the upper element from `a` to the upper element of the returned
/// vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_sd)
#[inline]
#[target_feature(enable = "avx,sse2")]
#[cfg_attr(test, assert_instr(vcmpeqsd, IMM5 = 0))] // TODO Validate vcmpsd
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmp_sd<const IMM5: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_uimm_bits!(IMM5, 5);
    vcmpsd(a, b, IMM5 as i8)
}

/// Compares the lower single-precision (32-bit) floating-point element in
/// `a` and `b` based on the comparison operand specified by `IMM5`,
/// stores the result in the lower element of the returned vector,
/// and copies the upper 3 packed elements from `a` to the upper elements of
/// the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ss)
#[inline]
#[target_feature(enable = "avx,sse")]
#[cfg_attr(test, assert_instr(vcmpeqss, IMM5 = 0))] // TODO Validate vcmpss
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmp_ss<const IMM5: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM5, 5);
    vcmpss(a, b, IMM5 as i8)
}

/// Converts packed 32-bit integers in `a` to packed double-precision (64-bit)
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtdq2pd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cvtepi32_pd(a: __m128i) -> __m256d {
    simd_cast(a.as_i32x4())
}

/// Converts packed 32-bit integers in `a` to packed single-precision (32-bit)
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtdq2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cvtepi32_ps(a: __m256i) -> __m256 {
    vcvtdq2ps(a.as_i32x8())
}

/// Converts packed double-precision (64-bit) floating-point elements in `a`
/// to packed single-precision (32-bit) floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtpd_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtpd2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cvtpd_ps(a: __m256d) -> __m128 {
    vcvtpd2ps(a)
}

/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtps_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtps2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cvtps_epi32(a: __m256) -> __m256i {
    transmute(vcvtps2dq(a))
}

/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed double-precision (64-bit) floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtps_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtps2pd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cvtps_pd(a: __m128) -> __m256d {
    simd_cast(a)
}

/// Converts packed double-precision (64-bit) floating-point elements in `a`
/// to packed 32-bit integers with truncation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttpd_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvttpd2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cvttpd_epi32(a: __m256d) -> __m128i {
    transmute(vcvttpd2dq(a))
}

/// Converts packed double-precision (64-bit) floating-point elements in `a`
/// to packed 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtpd_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtpd2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cvtpd_epi32(a: __m256d) -> __m128i {
    transmute(vcvtpd2dq(a))
}

/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed 32-bit integers with truncation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttps_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvttps2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cvttps_epi32(a: __m256) -> __m256i {
    transmute(vcvttps2dq(a))
}

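// Illustrative sketch (not part of the upstream file): the `t` variants
// truncate toward zero, while the plain conversion uses the current rounding
// mode (round-to-nearest-even by default). The function name is made up.
#[cfg(test)]
#[target_feature(enable = "avx")]
unsafe fn _example_mm256_cvttpd_epi32() {
    let a = _mm256_setr_pd(1.7, -1.7, 2.9, -2.9);
    let truncated: [i32; 4] = transmute(_mm256_cvttpd_epi32(a));
    assert_eq!(truncated, [1, -1, 2, -2]);
    // Assumes the default MXCSR rounding mode (round to nearest).
    let rounded: [i32; 4] = transmute(_mm256_cvtpd_epi32(a));
    assert_eq!(rounded, [2, -2, 3, -3]);
}
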
/// Extracts 128 bits (composed of 4 packed single-precision (32-bit)
/// floating-point elements) from `a`, selected with `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(vextractf128, IMM1 = 1)
)]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_extractf128_ps<const IMM1: i32>(a: __m256) -> __m128 {
    static_assert_uimm_bits!(IMM1, 1);
    simd_shuffle!(
        a,
        _mm256_undefined_ps(),
        [[0, 1, 2, 3], [4, 5, 6, 7]][IMM1 as usize],
    )
}

/// Extracts 128 bits (composed of 2 packed double-precision (64-bit)
/// floating-point elements) from `a`, selected with `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(vextractf128, IMM1 = 1)
)]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_extractf128_pd<const IMM1: i32>(a: __m256d) -> __m128d {
    static_assert_uimm_bits!(IMM1, 1);
    simd_shuffle!(a, _mm256_undefined_pd(), [[0, 1], [2, 3]][IMM1 as usize])
}

/// Extracts 128 bits (composed of integer data) from `a`, selected with `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(vextractf128, IMM1 = 1)
)]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_extractf128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
    static_assert_uimm_bits!(IMM1, 1);
    let dst: i64x2 = simd_shuffle!(
        a.as_i64x4(),
        _mm256_undefined_si256().as_i64x4(),
        [[0, 1], [2, 3]][IMM1 as usize],
    );
    transmute(dst)
}

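// Illustrative sketch (not part of the upstream file): `IMM1` selects the low
// (0) or high (1) 128-bit half. The function name is made up for the example.
#[cfg(test)]
#[target_feature(enable = "avx")]
unsafe fn _example_mm256_extractf128_pd() {
    let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
    let hi: [f64; 2] = transmute(_mm256_extractf128_pd::<1>(a));
    assert_eq!(hi, [3.0, 4.0]);
}
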
/// Zeroes the contents of all XMM or YMM registers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zeroall)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vzeroall))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_zeroall() {
    vzeroall()
}

/// Zeroes the upper 128 bits of all YMM registers;
/// the lower 128 bits of the registers are unmodified.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zeroupper)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vzeroupper))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_zeroupper() {
    vzeroupper()
}

/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_permutevar_ps(a: __m256, b: __m256i) -> __m256 {
    vpermilps256(a, b.as_i32x8())
}

/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// using the control in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutevar_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_permutevar_ps(a: __m128, b: __m128i) -> __m128 {
    vpermilps(a, b.as_i32x4())
}

/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilps, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_permute_ps<const IMM8: i32>(a: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM8, 8);
    simd_shuffle!(
        a,
        _mm256_undefined_ps(),
        [
            (IMM8 as u32 >> 0) & 0b11,
            (IMM8 as u32 >> 2) & 0b11,
            (IMM8 as u32 >> 4) & 0b11,
            (IMM8 as u32 >> 6) & 0b11,
            ((IMM8 as u32 >> 0) & 0b11) + 4,
            ((IMM8 as u32 >> 2) & 0b11) + 4,
            ((IMM8 as u32 >> 4) & 0b11) + 4,
            ((IMM8 as u32 >> 6) & 0b11) + 4,
        ],
    )
}

/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permute_ps)
#[inline]
#[target_feature(enable = "avx,sse")]
#[cfg_attr(test, assert_instr(vpermilps, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_permute_ps<const IMM8: i32>(a: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM8, 8);
    simd_shuffle!(
        a,
        _mm_undefined_ps(),
        [
            (IMM8 as u32 >> 0) & 0b11,
            (IMM8 as u32 >> 2) & 0b11,
            (IMM8 as u32 >> 4) & 0b11,
            (IMM8 as u32 >> 6) & 0b11,
        ],
    )
}

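// Illustrative sketch (not part of the upstream file): `IMM8 = 0x1B`
// (0b00_01_10_11) selects elements 3, 2, 1, 0, i.e. reverses a 128-bit vector.
// The function name is made up for the example.
#[cfg(test)]
#[target_feature(enable = "avx,sse")]
unsafe fn _example_mm_permute_ps_reverse() {
    let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
    let r: [f32; 4] = transmute(_mm_permute_ps::<0x1B>(a));
    assert_eq!(r, [4.0, 3.0, 2.0, 1.0]);
}
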
/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_permutevar_pd(a: __m256d, b: __m256i) -> __m256d {
    vpermilpd256(a, b.as_i64x4())
}

/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// using the control in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutevar_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_permutevar_pd(a: __m128d, b: __m128i) -> __m128d {
    vpermilpd(a, b.as_i64x2())
}

/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilpd, IMM4 = 0x1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_permute_pd<const IMM4: i32>(a: __m256d) -> __m256d {
    static_assert_uimm_bits!(IMM4, 4);
    simd_shuffle!(
        a,
        _mm256_undefined_pd(),
        [
            ((IMM4 as u32 >> 0) & 1),
            ((IMM4 as u32 >> 1) & 1),
            ((IMM4 as u32 >> 2) & 1) + 2,
            ((IMM4 as u32 >> 3) & 1) + 2,
        ],
    )
}

/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permute_pd)
#[inline]
#[target_feature(enable = "avx,sse2")]
#[cfg_attr(test, assert_instr(vpermilpd, IMM2 = 0x1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_permute_pd<const IMM2: i32>(a: __m128d) -> __m128d {
    static_assert_uimm_bits!(IMM2, 2);
    simd_shuffle!(
        a,
        _mm_undefined_pd(),
        [(IMM2 as u32) & 1, (IMM2 as u32 >> 1) & 1],
    )
}

/// Shuffles 256 bits (composed of 8 packed single-precision (32-bit)
/// floating-point elements) selected by `imm8` from `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x5))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_permute2f128_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM8, 8);
    vperm2f128ps256(a, b, IMM8 as i8)
}

/// Shuffles 256 bits (composed of 4 packed double-precision (64-bit)
/// floating-point elements) selected by `imm8` from `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x31))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_permute2f128_pd<const IMM8: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_uimm_bits!(IMM8, 8);
    vperm2f128pd256(a, b, IMM8 as i8)
}

/// Shuffles 128 bits (composed of integer data) selected by `imm8`
/// from `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x31))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_permute2f128_si256<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    transmute(vperm2f128si256(a.as_i32x8(), b.as_i32x8(), IMM8 as i8))
}

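// Illustrative sketch (not part of the upstream file): each nibble of `IMM8`
// picks a whole 128-bit half from {a.low, a.high, b.low, b.high} (0..=3).
// With 0x20 the result is [a.low, b.low]. The function name is made up.
#[cfg(test)]
#[target_feature(enable = "avx")]
unsafe fn _example_mm256_permute2f128_pd() {
    let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
    let b = _mm256_setr_pd(5.0, 6.0, 7.0, 8.0);
    let r: [f64; 4] = transmute(_mm256_permute2f128_pd::<0x20>(a, b));
    assert_eq!(r, [1.0, 2.0, 5.0, 6.0]);
}
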
/// Broadcasts a single-precision (32-bit) floating-point element from memory
/// to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_ss)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::trivially_copy_pass_by_ref)]
pub unsafe fn _mm256_broadcast_ss(f: &f32) -> __m256 {
    _mm256_set1_ps(*f)
}

/// Broadcasts a single-precision (32-bit) floating-point element from memory
/// to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcast_ss)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::trivially_copy_pass_by_ref)]
pub unsafe fn _mm_broadcast_ss(f: &f32) -> __m128 {
    _mm_set1_ps(*f)
}

/// Broadcasts a double-precision (64-bit) floating-point element from memory
/// to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_sd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::trivially_copy_pass_by_ref)]
pub unsafe fn _mm256_broadcast_sd(f: &f64) -> __m256d {
    _mm256_set1_pd(*f)
}

/// Broadcasts 128 bits from memory (composed of 4 packed single-precision
/// (32-bit) floating-point elements) to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_broadcast_ps(a: &__m128) -> __m256 {
    vbroadcastf128ps256(a)
}

/// Broadcasts 128 bits from memory (composed of 2 packed double-precision
/// (64-bit) floating-point elements) to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_broadcast_pd(a: &__m128d) -> __m256d {
    vbroadcastf128pd256(a)
}

/// Copies `a` to result, then inserts 128 bits (composed of 4 packed
/// single-precision (32-bit) floating-point elements) from `b` into result
/// at the location specified by `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(vinsertf128, IMM1 = 1)
)]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_insertf128_ps<const IMM1: i32>(a: __m256, b: __m128) -> __m256 {
    static_assert_uimm_bits!(IMM1, 1);
    simd_shuffle!(
        a,
        _mm256_castps128_ps256(b),
        [[8, 9, 10, 11, 4, 5, 6, 7], [0, 1, 2, 3, 8, 9, 10, 11]][IMM1 as usize],
    )
}
1266
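// Usage sketch added for illustration (not in the original source): a common
// way to build a 256-bit vector from two 128-bit halves is to widen the low
// half with a cast and then insert the high half at position 1.
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn example_join_halves(lo: __m128, hi: __m128) -> __m256 {
    _mm256_insertf128_ps::<1>(_mm256_castps128_ps256(lo), hi)
}
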
/// Copies `a` to result, then inserts 128 bits (composed of 2 packed
/// double-precision (64-bit) floating-point elements) from `b` into result
/// at the location specified by `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(vinsertf128, IMM1 = 1)
)]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_insertf128_pd<const IMM1: i32>(a: __m256d, b: __m128d) -> __m256d {
    static_assert_uimm_bits!(IMM1, 1);
    simd_shuffle!(
        a,
        _mm256_castpd128_pd256(b),
        [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
    )
}

/// Copies `a` to result, then inserts 128 bits from `b` into result
/// at the location specified by `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(vinsertf128, IMM1 = 1)
)]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_insertf128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m256i {
    static_assert_uimm_bits!(IMM1, 1);
    let dst: i64x4 = simd_shuffle!(
        a.as_i64x4(),
        _mm256_castsi128_si256(b).as_i64x4(),
        [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
    );
    transmute(dst)
}
1310
/// Copies `a` to result, and inserts the 8-bit integer `i` into result
/// at the location specified by `index`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi8)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_insert_epi8<const INDEX: i32>(a: __m256i, i: i8) -> __m256i {
    static_assert_uimm_bits!(INDEX, 5);
    transmute(simd_insert(a.as_i8x32(), INDEX as u32, i))
}

/// Copies `a` to result, and inserts the 16-bit integer `i` into result
/// at the location specified by `index`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi16)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_insert_epi16<const INDEX: i32>(a: __m256i, i: i16) -> __m256i {
    static_assert_uimm_bits!(INDEX, 4);
    transmute(simd_insert(a.as_i16x16(), INDEX as u32, i))
}

/// Copies `a` to result, and inserts the 32-bit integer `i` into result
/// at the location specified by `index`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi32)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_insert_epi32<const INDEX: i32>(a: __m256i, i: i32) -> __m256i {
    static_assert_uimm_bits!(INDEX, 3);
    transmute(simd_insert(a.as_i32x8(), INDEX as u32, i))
}
1352
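// Usage sketch added for illustration (not in the original source): overwrite
// the 32-bit lane at index 3 of a vector. The lane index is a compile-time
// constant; the helper name is hypothetical.
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn example_set_lane_3(v: __m256i, x: i32) -> __m256i {
    _mm256_insert_epi32::<3>(v, x)
}
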
532ac7d7 1353/// Loads 256-bits (composed of 4 packed double-precision (64-bit)
0531ce1d
XL
1354/// floating-point elements) from memory into result.
1355/// `mem_addr` must be aligned on a 32-byte boundary or a
1356/// general-protection exception may be generated.
83c7162d 1357///
353b0b11 1358/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_pd)
0531ce1d
XL
1359#[inline]
1360#[target_feature(enable = "avx")]
1361#[cfg_attr(test, assert_instr(vmovaps))] // FIXME vmovapd expected
83c7162d 1362#[stable(feature = "simd_x86", since = "1.27.0")]
48663c56 1363#[allow(clippy::cast_ptr_alignment)]
0531ce1d
XL
1364pub unsafe fn _mm256_load_pd(mem_addr: *const f64) -> __m256d {
1365 *(mem_addr as *const __m256d)
1366}
1367
532ac7d7 1368/// Stores 256-bits (composed of 4 packed double-precision (64-bit)
0531ce1d
XL
1369/// floating-point elements) from `a` into memory.
1370/// `mem_addr` must be aligned on a 32-byte boundary or a
1371/// general-protection exception may be generated.
83c7162d 1372///
353b0b11 1373/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_pd)
0531ce1d
XL
1374#[inline]
1375#[target_feature(enable = "avx")]
1376#[cfg_attr(test, assert_instr(vmovaps))] // FIXME vmovapd expected
83c7162d 1377#[stable(feature = "simd_x86", since = "1.27.0")]
48663c56 1378#[allow(clippy::cast_ptr_alignment)]
416331ca 1379pub unsafe fn _mm256_store_pd(mem_addr: *mut f64, a: __m256d) {
1380 *(mem_addr as *mut __m256d) = a;
1381}
1382
532ac7d7 1383/// Loads 256-bits (composed of 8 packed single-precision (32-bit)
0531ce1d
XL
1384/// floating-point elements) from memory into result.
1385/// `mem_addr` must be aligned on a 32-byte boundary or a
1386/// general-protection exception may be generated.
83c7162d 1387///
353b0b11 1388/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_ps)
0531ce1d
XL
1389#[inline]
1390#[target_feature(enable = "avx")]
1391#[cfg_attr(test, assert_instr(vmovaps))]
83c7162d 1392#[stable(feature = "simd_x86", since = "1.27.0")]
48663c56 1393#[allow(clippy::cast_ptr_alignment)]
0531ce1d
XL
1394pub unsafe fn _mm256_load_ps(mem_addr: *const f32) -> __m256 {
1395 *(mem_addr as *const __m256)
1396}
1397
532ac7d7 1398/// Stores 256-bits (composed of 8 packed single-precision (32-bit)
0531ce1d
XL
1399/// floating-point elements) from `a` into memory.
1400/// `mem_addr` must be aligned on a 32-byte boundary or a
1401/// general-protection exception may be generated.
83c7162d 1402///
353b0b11 1403/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_ps)
0531ce1d
XL
1404#[inline]
1405#[target_feature(enable = "avx")]
1406#[cfg_attr(test, assert_instr(vmovaps))]
83c7162d 1407#[stable(feature = "simd_x86", since = "1.27.0")]
48663c56 1408#[allow(clippy::cast_ptr_alignment)]
416331ca 1409pub unsafe fn _mm256_store_ps(mem_addr: *mut f32, a: __m256) {
1410 *(mem_addr as *mut __m256) = a;
1411}
1412
532ac7d7 1413/// Loads 256-bits (composed of 4 packed double-precision (64-bit)
0531ce1d
XL
1414/// floating-point elements) from memory into result.
1415/// `mem_addr` does not need to be aligned on any particular boundary.
83c7162d 1416///
353b0b11 1417/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_pd)
0531ce1d
XL
1418#[inline]
1419#[target_feature(enable = "avx")]
1420#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovupd expected
83c7162d 1421#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1422pub unsafe fn _mm256_loadu_pd(mem_addr: *const f64) -> __m256d {
1423 let mut dst = _mm256_undefined_pd();
1424 ptr::copy_nonoverlapping(
1425 mem_addr as *const u8,
1426 &mut dst as *mut __m256d as *mut u8,
1427 mem::size_of::<__m256d>(),
1428 );
1429 dst
1430}
1431
532ac7d7 1432/// Stores 256-bits (composed of 4 packed double-precision (64-bit)
0531ce1d
XL
1433/// floating-point elements) from `a` into memory.
1434/// `mem_addr` does not need to be aligned on any particular boundary.
83c7162d 1435///
353b0b11 1436/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_pd)
0531ce1d
XL
1437#[inline]
1438#[target_feature(enable = "avx")]
1439#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovupd expected
83c7162d 1440#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1441pub unsafe fn _mm256_storeu_pd(mem_addr: *mut f64, a: __m256d) {
1442 storeupd256(mem_addr, a);
1443}
1444
532ac7d7 1445/// Loads 256-bits (composed of 8 packed single-precision (32-bit)
0531ce1d
XL
1446/// floating-point elements) from memory into result.
1447/// `mem_addr` does not need to be aligned on any particular boundary.
83c7162d 1448///
353b0b11 1449/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_ps)
0531ce1d
XL
1450#[inline]
1451#[target_feature(enable = "avx")]
1452#[cfg_attr(test, assert_instr(vmovups))]
83c7162d 1453#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1454pub unsafe fn _mm256_loadu_ps(mem_addr: *const f32) -> __m256 {
1455 let mut dst = _mm256_undefined_ps();
1456 ptr::copy_nonoverlapping(
1457 mem_addr as *const u8,
1458 &mut dst as *mut __m256 as *mut u8,
1459 mem::size_of::<__m256>(),
1460 );
1461 dst
1462}
1463
532ac7d7 1464/// Stores 256-bits (composed of 8 packed single-precision (32-bit)
0531ce1d
XL
1465/// floating-point elements) from `a` into memory.
1466/// `mem_addr` does not need to be aligned on any particular boundary.
83c7162d 1467///
353b0b11 1468/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_ps)
0531ce1d
XL
1469#[inline]
1470#[target_feature(enable = "avx")]
1471#[cfg_attr(test, assert_instr(vmovups))]
83c7162d 1472#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1473pub unsafe fn _mm256_storeu_ps(mem_addr: *mut f32, a: __m256) {
1474 storeups256(mem_addr, a);
1475}
1476
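// Usage sketch added for illustration (not in the original source): the `u`
// (unaligned) load/store pair round-trips eight `f32` values through a vector
// register without any alignment requirement on the arrays involved.
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn example_copy_f32x8(src: &[f32; 8], dst: &mut [f32; 8]) {
    let v = _mm256_loadu_ps(src.as_ptr());
    _mm256_storeu_ps(dst.as_mut_ptr(), v);
}
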
532ac7d7 1477/// Loads 256-bits of integer data from memory into result.
0531ce1d
XL
1478/// `mem_addr` must be aligned on a 32-byte boundary or a
1479/// general-protection exception may be generated.
83c7162d 1480///
353b0b11 1481/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_si256)
0531ce1d
XL
1482#[inline]
1483#[target_feature(enable = "avx")]
1484#[cfg_attr(test, assert_instr(vmovaps))] // FIXME vmovdqa expected
83c7162d 1485#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1486pub unsafe fn _mm256_load_si256(mem_addr: *const __m256i) -> __m256i {
1487 *mem_addr
1488}
1489
532ac7d7 1490/// Stores 256-bits of integer data from `a` into memory.
0531ce1d
XL
1491/// `mem_addr` must be aligned on a 32-byte boundary or a
1492/// general-protection exception may be generated.
83c7162d 1493///
353b0b11 1494/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_si256)
0531ce1d
XL
1495#[inline]
1496#[target_feature(enable = "avx")]
1497#[cfg_attr(test, assert_instr(vmovaps))] // FIXME vmovdqa expected
83c7162d 1498#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1499pub unsafe fn _mm256_store_si256(mem_addr: *mut __m256i, a: __m256i) {
1500 *mem_addr = a;
1501}
1502
532ac7d7 1503/// Loads 256-bits of integer data from memory into result.
0531ce1d 1504/// `mem_addr` does not need to be aligned on any particular boundary.
83c7162d 1505///
353b0b11 1506/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_si256)
0531ce1d
XL
1507#[inline]
1508#[target_feature(enable = "avx")]
1509#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected
83c7162d 1510#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1511pub unsafe fn _mm256_loadu_si256(mem_addr: *const __m256i) -> __m256i {
1512 let mut dst = _mm256_undefined_si256();
1513 ptr::copy_nonoverlapping(
1514 mem_addr as *const u8,
1515 &mut dst as *mut __m256i as *mut u8,
1516 mem::size_of::<__m256i>(),
1517 );
1518 dst
1519}
1520
532ac7d7 1521/// Stores 256-bits of integer data from `a` into memory.
0531ce1d 1522/// `mem_addr` does not need to be aligned on any particular boundary.
83c7162d 1523///
353b0b11 1524/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_si256)
0531ce1d
XL
1525#[inline]
1526#[target_feature(enable = "avx")]
1527#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected
83c7162d 1528#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1529pub unsafe fn _mm256_storeu_si256(mem_addr: *mut __m256i, a: __m256i) {
1530 storeudq256(mem_addr as *mut i8, a.as_i8x32());
1531}
1532
532ac7d7 1533/// Loads packed double-precision (64-bit) floating-point elements from memory
0531ce1d
XL
1534/// into result using `mask` (elements are zeroed out when the high bit of the
1535/// corresponding element is not set).
83c7162d 1536///
353b0b11 1537/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_pd)
0531ce1d
XL
1538#[inline]
1539#[target_feature(enable = "avx")]
1540#[cfg_attr(test, assert_instr(vmaskmovpd))]
83c7162d 1541#[stable(feature = "simd_x86", since = "1.27.0")]
0731742a 1542pub unsafe fn _mm256_maskload_pd(mem_addr: *const f64, mask: __m256i) -> __m256d {
1543 maskloadpd256(mem_addr as *const i8, mask.as_i64x4())
1544}
1545
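// Usage sketch added for illustration (not in the original source): load only
// the first three doubles of a buffer; the fourth result element is zeroed and
// the masked-off address is not accessed. `_mm256_setr_epi64x` is defined
// later in this module; the helper name is hypothetical.
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn example_load_first3(mem_addr: *const f64) -> __m256d {
    let mask = _mm256_setr_epi64x(-1, -1, -1, 0);
    _mm256_maskload_pd(mem_addr, mask)
}
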
532ac7d7 1546/// Stores packed double-precision (64-bit) floating-point elements from `a`
0531ce1d 1547/// into memory using `mask`.
83c7162d 1548///
353b0b11 1549/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_pd)
0531ce1d
XL
1550#[inline]
1551#[target_feature(enable = "avx")]
1552#[cfg_attr(test, assert_instr(vmaskmovpd))]
83c7162d 1553#[stable(feature = "simd_x86", since = "1.27.0")]
0731742a 1554pub unsafe fn _mm256_maskstore_pd(mem_addr: *mut f64, mask: __m256i, a: __m256d) {
1555 maskstorepd256(mem_addr as *mut i8, mask.as_i64x4(), a);
1556}
1557
532ac7d7 1558/// Loads packed double-precision (64-bit) floating-point elements from memory
0531ce1d
XL
1559/// into result using `mask` (elements are zeroed out when the high bit of the
1560/// corresponding element is not set).
83c7162d 1561///
353b0b11 1562/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_pd)
0531ce1d
XL
1563#[inline]
1564#[target_feature(enable = "avx")]
1565#[cfg_attr(test, assert_instr(vmaskmovpd))]
83c7162d 1566#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1567pub unsafe fn _mm_maskload_pd(mem_addr: *const f64, mask: __m128i) -> __m128d {
1568 maskloadpd(mem_addr as *const i8, mask.as_i64x2())
1569}
1570
532ac7d7 1571/// Stores packed double-precision (64-bit) floating-point elements from `a`
0531ce1d 1572/// into memory using `mask`.
83c7162d 1573///
353b0b11 1574/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_pd)
0531ce1d
XL
1575#[inline]
1576#[target_feature(enable = "avx")]
1577#[cfg_attr(test, assert_instr(vmaskmovpd))]
83c7162d 1578#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1579pub unsafe fn _mm_maskstore_pd(mem_addr: *mut f64, mask: __m128i, a: __m128d) {
1580 maskstorepd(mem_addr as *mut i8, mask.as_i64x2(), a);
1581}
1582
532ac7d7 1583/// Loads packed single-precision (32-bit) floating-point elements from memory
0531ce1d
XL
1584/// into result using `mask` (elements are zeroed out when the high bit of the
1585/// corresponding element is not set).
83c7162d 1586///
353b0b11 1587/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_ps)
0531ce1d
XL
1588#[inline]
1589#[target_feature(enable = "avx")]
1590#[cfg_attr(test, assert_instr(vmaskmovps))]
83c7162d 1591#[stable(feature = "simd_x86", since = "1.27.0")]
0731742a 1592pub unsafe fn _mm256_maskload_ps(mem_addr: *const f32, mask: __m256i) -> __m256 {
1593 maskloadps256(mem_addr as *const i8, mask.as_i32x8())
1594}
1595
532ac7d7 1596/// Stores packed single-precision (32-bit) floating-point elements from `a`
0531ce1d 1597/// into memory using `mask`.
83c7162d 1598///
353b0b11 1599/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_ps)
0531ce1d
XL
1600#[inline]
1601#[target_feature(enable = "avx")]
1602#[cfg_attr(test, assert_instr(vmaskmovps))]
83c7162d 1603#[stable(feature = "simd_x86", since = "1.27.0")]
0731742a 1604pub unsafe fn _mm256_maskstore_ps(mem_addr: *mut f32, mask: __m256i, a: __m256) {
1605 maskstoreps256(mem_addr as *mut i8, mask.as_i32x8(), a);
1606}
1607
532ac7d7 1608/// Loads packed single-precision (32-bit) floating-point elements from memory
0531ce1d
XL
1609/// into result using `mask` (elements are zeroed out when the high bit of the
1610/// corresponding element is not set).
83c7162d 1611///
353b0b11 1612/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_ps)
0531ce1d
XL
1613#[inline]
1614#[target_feature(enable = "avx")]
1615#[cfg_attr(test, assert_instr(vmaskmovps))]
83c7162d 1616#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1617pub unsafe fn _mm_maskload_ps(mem_addr: *const f32, mask: __m128i) -> __m128 {
1618 maskloadps(mem_addr as *const i8, mask.as_i32x4())
1619}
1620
532ac7d7 1621/// Stores packed single-precision (32-bit) floating-point elements from `a`
0531ce1d 1622/// into memory using `mask`.
83c7162d 1623///
353b0b11 1624/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_ps)
0531ce1d
XL
1625#[inline]
1626#[target_feature(enable = "avx")]
1627#[cfg_attr(test, assert_instr(vmaskmovps))]
83c7162d 1628#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1629pub unsafe fn _mm_maskstore_ps(mem_addr: *mut f32, mask: __m128i, a: __m128) {
1630 maskstoreps(mem_addr as *mut i8, mask.as_i32x4(), a);
1631}
1632
/// Duplicates odd-indexed single-precision (32-bit) floating-point elements
/// from `a`, and returns the results.
83c7162d 1635///
353b0b11 1636/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movehdup_ps)
0531ce1d
XL
1637#[inline]
1638#[target_feature(enable = "avx")]
1639#[cfg_attr(test, assert_instr(vmovshdup))]
83c7162d 1640#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 1641pub unsafe fn _mm256_movehdup_ps(a: __m256) -> __m256 {
353b0b11 1642 simd_shuffle!(a, a, [1, 1, 3, 3, 5, 5, 7, 7])
1643}
1644
/// Duplicates even-indexed single-precision (32-bit) floating-point elements
/// from `a`, and returns the results.
83c7162d 1647///
353b0b11 1648/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_moveldup_ps)
0531ce1d
XL
1649#[inline]
1650#[target_feature(enable = "avx")]
1651#[cfg_attr(test, assert_instr(vmovsldup))]
83c7162d 1652#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 1653pub unsafe fn _mm256_moveldup_ps(a: __m256) -> __m256 {
353b0b11 1654 simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6])
1655}
1656
/// Duplicates even-indexed double-precision (64-bit) floating-point elements
/// from `a`, and returns the results.
83c7162d 1659///
353b0b11 1660/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movedup_pd)
0531ce1d
XL
1661#[inline]
1662#[target_feature(enable = "avx")]
1663#[cfg_attr(test, assert_instr(vmovddup))]
83c7162d 1664#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 1665pub unsafe fn _mm256_movedup_pd(a: __m256d) -> __m256d {
353b0b11 1666 simd_shuffle!(a, a, [0, 0, 2, 2])
1667}
1668
532ac7d7 1669/// Loads 256-bits of integer data from unaligned memory into result.
0531ce1d
XL
1670/// This intrinsic may perform better than `_mm256_loadu_si256` when the
1671/// data crosses a cache line boundary.
83c7162d 1672///
353b0b11 1673/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_lddqu_si256)
0531ce1d
XL
1674#[inline]
1675#[target_feature(enable = "avx")]
1676#[cfg_attr(test, assert_instr(vlddqu))]
83c7162d 1677#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 1678pub unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i {
532ac7d7 1679 transmute(vlddqu(mem_addr as *const i8))
1680}
1681
/// Moves integer data from a 256-bit integer vector to a 32-byte
/// aligned memory location. To minimize caching, the data is flagged as
/// non-temporal (unlikely to be used again soon).
83c7162d 1685///
353b0b11 1686/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_si256)
0531ce1d
XL
1687#[inline]
1688#[target_feature(enable = "avx")]
1689#[cfg_attr(test, assert_instr(vmovntps))] // FIXME vmovntdq
83c7162d 1690#[stable(feature = "simd_x86", since = "1.27.0")]
a1dfa0c6
XL
1691pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) {
1692 intrinsics::nontemporal_store(mem_addr, a);
1693}
1694
83c7162d 1695/// Moves double-precision values from a 256-bit vector of `[4 x double]`
0531ce1d
XL
1696/// to a 32-byte aligned memory location. To minimize caching, the data is
1697/// flagged as non-temporal (unlikely to be used again soon).
83c7162d 1698///
353b0b11 1699/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_pd)
0531ce1d
XL
1700#[inline]
1701#[target_feature(enable = "avx")]
1702#[cfg_attr(test, assert_instr(vmovntps))] // FIXME vmovntpd
83c7162d 1703#[stable(feature = "simd_x86", since = "1.27.0")]
48663c56 1704#[allow(clippy::cast_ptr_alignment)]
a1dfa0c6
XL
1705pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) {
1706 intrinsics::nontemporal_store(mem_addr as *mut __m256d, a);
1707}
1708
1709/// Moves single-precision floating point values from a 256-bit vector
83c7162d 1710/// of `[8 x float]` to a 32-byte aligned memory location. To minimize
0531ce1d
XL
1711/// caching, the data is flagged as non-temporal (unlikely to be used again
1712/// soon).
83c7162d 1713///
353b0b11 1714/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_ps)
0531ce1d
XL
1715#[inline]
1716#[target_feature(enable = "avx")]
1717#[cfg_attr(test, assert_instr(vmovntps))]
83c7162d 1718#[stable(feature = "simd_x86", since = "1.27.0")]
48663c56 1719#[allow(clippy::cast_ptr_alignment)]
a1dfa0c6
XL
1720pub unsafe fn _mm256_stream_ps(mem_addr: *mut f32, a: __m256) {
1721 intrinsics::nontemporal_store(mem_addr as *mut __m256, a);
1722}
1723
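// Usage sketch added for illustration (not in the original source): streaming
// stores bypass the cache hierarchy, so the destination must be 32-byte
// aligned and a store fence should be issued before other code relies on the
// data. `_mm_sfence` is the SSE fence intrinsic re-exported alongside these
// functions; the helper name is hypothetical.
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn example_stream_then_fence(dst: *mut f32, v: __m256) {
    _mm256_stream_ps(dst, v); // `dst` must be 32-byte aligned
    _mm_sfence();
}
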
532ac7d7
XL
1724/// Computes the approximate reciprocal of packed single-precision (32-bit)
1725/// floating-point elements in `a`, and returns the results. The maximum
0531ce1d 1726/// relative error for this approximation is less than 1.5*2^-12.
83c7162d 1727///
353b0b11 1728/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rcp_ps)
0531ce1d
XL
1729#[inline]
1730#[target_feature(enable = "avx")]
1731#[cfg_attr(test, assert_instr(vrcpps))]
83c7162d 1732#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1733pub unsafe fn _mm256_rcp_ps(a: __m256) -> __m256 {
1734 vrcpps(a)
1735}
1736
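// Usage sketch added for illustration (not in the original source): the
// roughly 12-bit approximation from `vrcpps` can be refined with one
// Newton-Raphson step, x1 = x0 * (2 - a * x0), when more precision is needed.
// `_mm256_sub_ps` and `_mm256_mul_ps` are defined elsewhere in this module.
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn example_refined_recip(a: __m256) -> __m256 {
    let x0 = _mm256_rcp_ps(a);
    _mm256_mul_ps(x0, _mm256_sub_ps(_mm256_set1_ps(2.0), _mm256_mul_ps(a, x0)))
}
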
532ac7d7
XL
1737/// Computes the approximate reciprocal square root of packed single-precision
1738/// (32-bit) floating-point elements in `a`, and returns the results.
0531ce1d 1739/// The maximum relative error for this approximation is less than 1.5*2^-12.
83c7162d 1740///
353b0b11 1741/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rsqrt_ps)
0531ce1d
XL
1742#[inline]
1743#[target_feature(enable = "avx")]
1744#[cfg_attr(test, assert_instr(vrsqrtps))]
83c7162d 1745#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1746pub unsafe fn _mm256_rsqrt_ps(a: __m256) -> __m256 {
1747 vrsqrtps(a)
1748}
1749
/// Unpacks and interleaves double-precision (64-bit) floating-point elements
0531ce1d 1751/// from the high half of each 128-bit lane in `a` and `b`.
83c7162d 1752///
353b0b11 1753/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_pd)
0531ce1d
XL
1754#[inline]
1755#[target_feature(enable = "avx")]
1756#[cfg_attr(test, assert_instr(vunpckhpd))]
83c7162d 1757#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 1758pub unsafe fn _mm256_unpackhi_pd(a: __m256d, b: __m256d) -> __m256d {
353b0b11 1759 simd_shuffle!(a, b, [1, 5, 3, 7])
1760}
1761
/// Unpacks and interleaves single-precision (32-bit) floating-point elements
0531ce1d 1763/// from the high half of each 128-bit lane in `a` and `b`.
83c7162d 1764///
353b0b11 1765/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_ps)
0531ce1d
XL
1766#[inline]
1767#[target_feature(enable = "avx")]
1768#[cfg_attr(test, assert_instr(vunpckhps))]
83c7162d 1769#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 1770pub unsafe fn _mm256_unpackhi_ps(a: __m256, b: __m256) -> __m256 {
353b0b11 1771 simd_shuffle!(a, b, [2, 10, 3, 11, 6, 14, 7, 15])
1772}
1773
/// Unpacks and interleaves double-precision (64-bit) floating-point elements
0531ce1d 1775/// from the low half of each 128-bit lane in `a` and `b`.
83c7162d 1776///
353b0b11 1777/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_pd)
0531ce1d
XL
1778#[inline]
1779#[target_feature(enable = "avx")]
1780#[cfg_attr(test, assert_instr(vunpcklpd))]
83c7162d 1781#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 1782pub unsafe fn _mm256_unpacklo_pd(a: __m256d, b: __m256d) -> __m256d {
353b0b11 1783 simd_shuffle!(a, b, [0, 4, 2, 6])
1784}
1785
/// Unpacks and interleaves single-precision (32-bit) floating-point elements
0531ce1d 1787/// from the low half of each 128-bit lane in `a` and `b`.
83c7162d 1788///
353b0b11 1789/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_ps)
0531ce1d
XL
1790#[inline]
1791#[target_feature(enable = "avx")]
1792#[cfg_attr(test, assert_instr(vunpcklps))]
83c7162d 1793#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 1794pub unsafe fn _mm256_unpacklo_ps(a: __m256, b: __m256) -> __m256 {
353b0b11 1795 simd_shuffle!(a, b, [0, 8, 1, 9, 4, 12, 5, 13])
1796}
1797
532ac7d7 1798/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
0531ce1d 1799/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0.
532ac7d7 1800/// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if
0531ce1d 1801/// the result is zero, otherwise set `CF` to 0. Return the `ZF` value.
83c7162d 1802///
353b0b11 1803/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_si256)
0531ce1d
XL
1804#[inline]
1805#[target_feature(enable = "avx")]
1806#[cfg_attr(test, assert_instr(vptest))]
83c7162d 1807#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1808pub unsafe fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 {
1809 ptestz256(a.as_i64x4(), b.as_i64x4())
1810}
1811
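// Usage sketch added for illustration (not in the original source): testing a
// vector against itself sets `ZF` exactly when every bit is zero, which gives
// a cheap "is this vector all zeros?" check.
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn example_is_all_zero(v: __m256i) -> bool {
    _mm256_testz_si256(v, v) == 1
}
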
532ac7d7 1812/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
0531ce1d 1813/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0.
532ac7d7 1814/// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if
0531ce1d 1815/// the result is zero, otherwise set `CF` to 0. Return the `CF` value.
83c7162d 1816///
353b0b11 1817/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_si256)
0531ce1d
XL
1818#[inline]
1819#[target_feature(enable = "avx")]
1820#[cfg_attr(test, assert_instr(vptest))]
83c7162d 1821#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1822pub unsafe fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 {
1823 ptestc256(a.as_i64x4(), b.as_i64x4())
1824}
1825
532ac7d7 1826/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
0531ce1d 1827/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0.
532ac7d7 1828/// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if
0531ce1d
XL
1829/// the result is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and
1830/// `CF` values are zero, otherwise return 0.
83c7162d 1831///
353b0b11 1832/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_si256)
0531ce1d
XL
1833#[inline]
1834#[target_feature(enable = "avx")]
1835#[cfg_attr(test, assert_instr(vptest))]
83c7162d 1836#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1837pub unsafe fn _mm256_testnzc_si256(a: __m256i, b: __m256i) -> i32 {
1838 ptestnzc256(a.as_i64x4(), b.as_i64x4())
1839}
1840
532ac7d7 1841/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
0531ce1d
XL
1842/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
1843/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
1844/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
1845/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
1846/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
1847/// is zero, otherwise set `CF` to 0. Return the `ZF` value.
83c7162d 1848///
353b0b11 1849/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_pd)
0531ce1d
XL
1850#[inline]
1851#[target_feature(enable = "avx")]
1852#[cfg_attr(test, assert_instr(vtestpd))]
83c7162d 1853#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1854pub unsafe fn _mm256_testz_pd(a: __m256d, b: __m256d) -> i32 {
1855 vtestzpd256(a, b)
1856}
1857
532ac7d7 1858/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
0531ce1d
XL
1859/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
1860/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
1861/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
1862/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
1863/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
1864/// is zero, otherwise set `CF` to 0. Return the `CF` value.
83c7162d 1865///
353b0b11 1866/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_pd)
0531ce1d
XL
1867#[inline]
1868#[target_feature(enable = "avx")]
1869#[cfg_attr(test, assert_instr(vtestpd))]
83c7162d 1870#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1871pub unsafe fn _mm256_testc_pd(a: __m256d, b: __m256d) -> i32 {
1872 vtestcpd256(a, b)
1873}
1874
532ac7d7 1875/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
0531ce1d
XL
1876/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
1877/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
1878/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
1879/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
1880/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
1881/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
1882/// are zero, otherwise return 0.
83c7162d 1883///
353b0b11 1884/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_pd)
0531ce1d
XL
1885#[inline]
1886#[target_feature(enable = "avx")]
1887#[cfg_attr(test, assert_instr(vtestpd))]
83c7162d 1888#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1889pub unsafe fn _mm256_testnzc_pd(a: __m256d, b: __m256d) -> i32 {
1890 vtestnzcpd256(a, b)
1891}
1892
532ac7d7 1893/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
0531ce1d
XL
1894/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
1895/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
1896/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
1897/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
1898/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
1899/// is zero, otherwise set `CF` to 0. Return the `ZF` value.
83c7162d 1900///
353b0b11 1901/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_pd)
0531ce1d
XL
1902#[inline]
1903#[target_feature(enable = "avx")]
1904#[cfg_attr(test, assert_instr(vtestpd))]
83c7162d 1905#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1906pub unsafe fn _mm_testz_pd(a: __m128d, b: __m128d) -> i32 {
1907 vtestzpd(a, b)
1908}
1909
532ac7d7 1910/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
0531ce1d
XL
1911/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
1912/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
1913/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
1914/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
1915/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
1916/// is zero, otherwise set `CF` to 0. Return the `CF` value.
83c7162d 1917///
353b0b11 1918/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_pd)
0531ce1d
XL
1919#[inline]
1920#[target_feature(enable = "avx")]
1921#[cfg_attr(test, assert_instr(vtestpd))]
83c7162d 1922#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1923pub unsafe fn _mm_testc_pd(a: __m128d, b: __m128d) -> i32 {
1924 vtestcpd(a, b)
1925}
1926
532ac7d7 1927/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
0531ce1d
XL
1928/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
1929/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
1930/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
1931/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
1932/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
1933/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
1934/// are zero, otherwise return 0.
83c7162d 1935///
353b0b11 1936/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_pd)
0531ce1d
XL
1937#[inline]
1938#[target_feature(enable = "avx")]
1939#[cfg_attr(test, assert_instr(vtestpd))]
83c7162d 1940#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1941pub unsafe fn _mm_testnzc_pd(a: __m128d, b: __m128d) -> i32 {
1942 vtestnzcpd(a, b)
1943}
1944
532ac7d7 1945/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
0531ce1d
XL
1946/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
1947/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
1948/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
1949/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
1950/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
1951/// is zero, otherwise set `CF` to 0. Return the `ZF` value.
83c7162d 1952///
353b0b11 1953/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_ps)
0531ce1d
XL
1954#[inline]
1955#[target_feature(enable = "avx")]
1956#[cfg_attr(test, assert_instr(vtestps))]
83c7162d 1957#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1958pub unsafe fn _mm256_testz_ps(a: __m256, b: __m256) -> i32 {
1959 vtestzps256(a, b)
1960}
1961
532ac7d7 1962/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
0531ce1d
XL
1963/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
1964/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
1965/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
1966/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
1967/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
1968/// is zero, otherwise set `CF` to 0. Return the `CF` value.
83c7162d 1969///
353b0b11 1970/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_ps)
0531ce1d
XL
1971#[inline]
1972#[target_feature(enable = "avx")]
1973#[cfg_attr(test, assert_instr(vtestps))]
83c7162d 1974#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1975pub unsafe fn _mm256_testc_ps(a: __m256, b: __m256) -> i32 {
1976 vtestcps256(a, b)
1977}
1978
532ac7d7 1979/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
0531ce1d
XL
1980/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
1981/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
1982/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
1983/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
1984/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
1985/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
1986/// are zero, otherwise return 0.
83c7162d 1987///
353b0b11 1988/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_ps)
0531ce1d
XL
1989#[inline]
1990#[target_feature(enable = "avx")]
1991#[cfg_attr(test, assert_instr(vtestps))]
83c7162d 1992#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1993pub unsafe fn _mm256_testnzc_ps(a: __m256, b: __m256) -> i32 {
1994 vtestnzcps256(a, b)
1995}
1996
532ac7d7 1997/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
0531ce1d
XL
1998/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
1999/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
2000/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
2001/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
2002/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2003/// is zero, otherwise set `CF` to 0. Return the `ZF` value.
83c7162d 2004///
353b0b11 2005/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_ps)
0531ce1d
XL
2006#[inline]
2007#[target_feature(enable = "avx")]
2008#[cfg_attr(test, assert_instr(vtestps))]
83c7162d 2009#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
2010pub unsafe fn _mm_testz_ps(a: __m128, b: __m128) -> i32 {
2011 vtestzps(a, b)
2012}
2013
532ac7d7 2014/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
0531ce1d
XL
2015/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2016/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
2017/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
2018/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
2019/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2020/// is zero, otherwise set `CF` to 0. Return the `CF` value.
83c7162d 2021///
353b0b11 2022/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_ps)
0531ce1d
XL
2023#[inline]
2024#[target_feature(enable = "avx")]
2025#[cfg_attr(test, assert_instr(vtestps))]
83c7162d 2026#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
2027pub unsafe fn _mm_testc_ps(a: __m128, b: __m128) -> i32 {
2028 vtestcps(a, b)
2029}
2030
532ac7d7 2031/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
0531ce1d
XL
2032/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2033/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
2034/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
2035/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
2036/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2037/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
2038/// are zero, otherwise return 0.
83c7162d 2039///
353b0b11 2040/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_ps)
0531ce1d
XL
2041#[inline]
2042#[target_feature(enable = "avx")]
2043#[cfg_attr(test, assert_instr(vtestps))]
83c7162d 2044#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
2045pub unsafe fn _mm_testnzc_ps(a: __m128, b: __m128) -> i32 {
2046 vtestnzcps(a, b)
2047}
2048
532ac7d7 2049/// Sets each bit of the returned mask based on the most significant bit of the
0531ce1d
XL
2050/// corresponding packed double-precision (64-bit) floating-point element in
2051/// `a`.
83c7162d 2052///
353b0b11 2053/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_pd)
0531ce1d
XL
2054#[inline]
2055#[target_feature(enable = "avx")]
2056#[cfg_attr(test, assert_instr(vmovmskpd))]
83c7162d 2057#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
2058pub unsafe fn _mm256_movemask_pd(a: __m256d) -> i32 {
2059 movmskpd256(a)
2060}
2061
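// Usage sketch added for illustration (not in the original source): the mask
// collects the sign bits of the four doubles, so counting its set bits counts
// the elements whose sign bit is set (including -0.0 and negative NaNs).
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn example_count_sign_set(v: __m256d) -> u32 {
    (_mm256_movemask_pd(v) as u32).count_ones()
}
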
532ac7d7 2062/// Sets each bit of the returned mask based on the most significant bit of the
0531ce1d
XL
2063/// corresponding packed single-precision (32-bit) floating-point element in
2064/// `a`.
83c7162d 2065///
353b0b11 2066/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_ps)
0531ce1d
XL
2067#[inline]
2068#[target_feature(enable = "avx")]
2069#[cfg_attr(test, assert_instr(vmovmskps))]
83c7162d 2070#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
2071pub unsafe fn _mm256_movemask_ps(a: __m256) -> i32 {
2072 movmskps256(a)
2073}
2074
/// Returns a vector of type `__m256d` with all elements set to zero.
83c7162d 2076///
353b0b11 2077/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_pd)
0531ce1d
XL
2078#[inline]
2079#[target_feature(enable = "avx")]
2080#[cfg_attr(test, assert_instr(vxorps))] // FIXME vxorpd expected
83c7162d 2081#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
2082pub unsafe fn _mm256_setzero_pd() -> __m256d {
2083 _mm256_set1_pd(0.0)
2084}
2085
/// Returns a vector of type `__m256` with all elements set to zero.
83c7162d 2087///
353b0b11 2088/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_ps)
0531ce1d
XL
2089#[inline]
2090#[target_feature(enable = "avx")]
2091#[cfg_attr(test, assert_instr(vxorps))]
83c7162d 2092#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
2093pub unsafe fn _mm256_setzero_ps() -> __m256 {
2094 _mm256_set1_ps(0.0)
2095}
2096
/// Returns a vector of type `__m256i` with all elements set to zero.
83c7162d 2098///
353b0b11 2099/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_si256)
0531ce1d
XL
2100#[inline]
2101#[target_feature(enable = "avx")]
2102#[cfg_attr(test, assert_instr(vxor))]
83c7162d 2103#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
2104pub unsafe fn _mm256_setzero_si256() -> __m256i {
2105 _mm256_set1_epi8(0)
2106}
2107
532ac7d7 2108/// Sets packed double-precision (64-bit) floating-point elements in returned
0531ce1d 2109/// vector with the supplied values.
83c7162d 2110///
353b0b11 2111/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_pd)
0531ce1d
XL
2112#[inline]
2113#[target_feature(enable = "avx")]
2114// This intrinsic has no corresponding instruction.
2115#[cfg_attr(test, assert_instr(vinsertf128))]
83c7162d 2116#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
2117pub unsafe fn _mm256_set_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
2118 _mm256_setr_pd(d, c, b, a)
2119}
2120
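// Usage sketch added for illustration (not in the original source): `set`
// takes its arguments from the highest element down while `setr` takes them
// from the lowest element up, so these two calls build the same vector
// [0.0, 1.0, 2.0, 3.0].
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn example_set_vs_setr() -> (__m256d, __m256d) {
    (_mm256_set_pd(3.0, 2.0, 1.0, 0.0), _mm256_setr_pd(0.0, 1.0, 2.0, 3.0))
}
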
532ac7d7 2121/// Sets packed single-precision (32-bit) floating-point elements in returned
0531ce1d 2122/// vector with the supplied values.
83c7162d 2123///
353b0b11 2124/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_ps)
0531ce1d
XL
2125#[inline]
2126#[target_feature(enable = "avx")]
2127// This intrinsic has no corresponding instruction.
83c7162d 2128#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2129pub unsafe fn _mm256_set_ps(
2130 a: f32,
2131 b: f32,
2132 c: f32,
2133 d: f32,
2134 e: f32,
2135 f: f32,
2136 g: f32,
2137 h: f32,
2138) -> __m256 {
2139 _mm256_setr_ps(h, g, f, e, d, c, b, a)
2140}
2141
3c0e092e 2142/// Sets packed 8-bit integers in returned vector with the supplied values.
83c7162d 2143///
353b0b11 2144/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi8)
0531ce1d
XL
2145#[inline]
2146#[target_feature(enable = "avx")]
2147// This intrinsic has no corresponding instruction.
83c7162d 2148#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2149pub unsafe fn _mm256_set_epi8(
2150 e00: i8,
2151 e01: i8,
2152 e02: i8,
2153 e03: i8,
2154 e04: i8,
2155 e05: i8,
2156 e06: i8,
2157 e07: i8,
2158 e08: i8,
2159 e09: i8,
2160 e10: i8,
2161 e11: i8,
2162 e12: i8,
2163 e13: i8,
2164 e14: i8,
2165 e15: i8,
2166 e16: i8,
2167 e17: i8,
2168 e18: i8,
2169 e19: i8,
2170 e20: i8,
2171 e21: i8,
2172 e22: i8,
2173 e23: i8,
2174 e24: i8,
2175 e25: i8,
2176 e26: i8,
2177 e27: i8,
2178 e28: i8,
2179 e29: i8,
2180 e30: i8,
2181 e31: i8,
0531ce1d 2182) -> __m256i {
0731742a 2183 #[rustfmt::skip]
2184 _mm256_setr_epi8(
2185 e31, e30, e29, e28, e27, e26, e25, e24,
2186 e23, e22, e21, e20, e19, e18, e17, e16,
2187 e15, e14, e13, e12, e11, e10, e09, e08,
2188 e07, e06, e05, e04, e03, e02, e01, e00,
2189 )
2190}
2191
532ac7d7 2192/// Sets packed 16-bit integers in returned vector with the supplied values.
83c7162d 2193///
353b0b11 2194/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi16)
0531ce1d
XL
2195#[inline]
2196#[target_feature(enable = "avx")]
2197// This intrinsic has no corresponding instruction.
83c7162d 2198#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2199pub unsafe fn _mm256_set_epi16(
2200 e00: i16,
2201 e01: i16,
2202 e02: i16,
2203 e03: i16,
2204 e04: i16,
2205 e05: i16,
2206 e06: i16,
2207 e07: i16,
2208 e08: i16,
2209 e09: i16,
2210 e10: i16,
2211 e11: i16,
2212 e12: i16,
2213 e13: i16,
2214 e14: i16,
2215 e15: i16,
0531ce1d 2216) -> __m256i {
0731742a 2217 #[rustfmt::skip]
2218 _mm256_setr_epi16(
2219 e15, e14, e13, e12,
2220 e11, e10, e09, e08,
2221 e07, e06, e05, e04,
2222 e03, e02, e01, e00,
2223 )
2224}
2225
532ac7d7 2226/// Sets packed 32-bit integers in returned vector with the supplied values.
83c7162d 2227///
353b0b11 2228/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi32)
0531ce1d
XL
2229#[inline]
2230#[target_feature(enable = "avx")]
2231// This intrinsic has no corresponding instruction.
83c7162d 2232#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2233pub unsafe fn _mm256_set_epi32(
2234 e0: i32,
2235 e1: i32,
2236 e2: i32,
2237 e3: i32,
2238 e4: i32,
2239 e5: i32,
2240 e6: i32,
2241 e7: i32,
2242) -> __m256i {
2243 _mm256_setr_epi32(e7, e6, e5, e4, e3, e2, e1, e0)
2244}
2245
532ac7d7 2246/// Sets packed 64-bit integers in returned vector with the supplied values.
83c7162d 2247///
353b0b11 2248/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi64x)
0531ce1d
XL
2249#[inline]
2250#[target_feature(enable = "avx")]
2251// This intrinsic has no corresponding instruction.
83c7162d 2252#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
2253pub unsafe fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i {
2254 _mm256_setr_epi64x(d, c, b, a)
2255}
2256
532ac7d7 2257/// Sets packed double-precision (64-bit) floating-point elements in returned
0531ce1d 2258/// vector with the supplied values in reverse order.
83c7162d 2259///
353b0b11 2260/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_pd)
0531ce1d
XL
2261#[inline]
2262#[target_feature(enable = "avx")]
2263// This intrinsic has no corresponding instruction.
83c7162d 2264#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
2265pub unsafe fn _mm256_setr_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
2266 __m256d(a, b, c, d)
2267}
2268
532ac7d7 2269/// Sets packed single-precision (32-bit) floating-point elements in returned
0531ce1d 2270/// vector with the supplied values in reverse order.
83c7162d 2271///
353b0b11 2272/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_ps)
0531ce1d
XL
2273#[inline]
2274#[target_feature(enable = "avx")]
2275// This intrinsic has no corresponding instruction.
83c7162d 2276#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2277pub unsafe fn _mm256_setr_ps(
2278 a: f32,
2279 b: f32,
2280 c: f32,
2281 d: f32,
2282 e: f32,
2283 f: f32,
2284 g: f32,
2285 h: f32,
2286) -> __m256 {
2287 __m256(a, b, c, d, e, f, g, h)
2288}
2289
532ac7d7 2290/// Sets packed 8-bit integers in returned vector with the supplied values in
0531ce1d 2291/// reverse order.
83c7162d 2292///
353b0b11 2293/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi8)
0531ce1d
XL
2294#[inline]
2295#[target_feature(enable = "avx")]
2296// This intrinsic has no corresponding instruction.
83c7162d 2297#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2298pub unsafe fn _mm256_setr_epi8(
2299 e00: i8,
2300 e01: i8,
2301 e02: i8,
2302 e03: i8,
2303 e04: i8,
2304 e05: i8,
2305 e06: i8,
2306 e07: i8,
2307 e08: i8,
2308 e09: i8,
2309 e10: i8,
2310 e11: i8,
2311 e12: i8,
2312 e13: i8,
2313 e14: i8,
2314 e15: i8,
2315 e16: i8,
2316 e17: i8,
2317 e18: i8,
2318 e19: i8,
2319 e20: i8,
2320 e21: i8,
2321 e22: i8,
2322 e23: i8,
2323 e24: i8,
2324 e25: i8,
2325 e26: i8,
2326 e27: i8,
2327 e28: i8,
2328 e29: i8,
2329 e30: i8,
2330 e31: i8,
0531ce1d 2331) -> __m256i {
0731742a 2332 #[rustfmt::skip]
532ac7d7 2333 transmute(i8x32::new(
2334 e00, e01, e02, e03, e04, e05, e06, e07,
2335 e08, e09, e10, e11, e12, e13, e14, e15,
2336 e16, e17, e18, e19, e20, e21, e22, e23,
2337 e24, e25, e26, e27, e28, e29, e30, e31,
2338 ))
2339}
2340
532ac7d7 2341/// Sets packed 16-bit integers in returned vector with the supplied values in
0531ce1d 2342/// reverse order.
83c7162d 2343///
353b0b11 2344/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi16)
0531ce1d
XL
2345#[inline]
2346#[target_feature(enable = "avx")]
2347// This intrinsic has no corresponding instruction.
83c7162d 2348#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2349pub unsafe fn _mm256_setr_epi16(
2350 e00: i16,
2351 e01: i16,
2352 e02: i16,
2353 e03: i16,
2354 e04: i16,
2355 e05: i16,
2356 e06: i16,
2357 e07: i16,
2358 e08: i16,
2359 e09: i16,
2360 e10: i16,
2361 e11: i16,
2362 e12: i16,
2363 e13: i16,
2364 e14: i16,
2365 e15: i16,
0531ce1d 2366) -> __m256i {
0731742a 2367 #[rustfmt::skip]
532ac7d7 2368 transmute(i16x16::new(
0531ce1d
XL
2369 e00, e01, e02, e03,
2370 e04, e05, e06, e07,
2371 e08, e09, e10, e11,
2372 e12, e13, e14, e15,
2373 ))
2374}
2375
532ac7d7 2376/// Sets packed 32-bit integers in returned vector with the supplied values in
0531ce1d 2377/// reverse order.
83c7162d 2378///
353b0b11 2379/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi32)
0531ce1d
XL
2380#[inline]
2381#[target_feature(enable = "avx")]
2382// This intrinsic has no corresponding instruction.
83c7162d 2383#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2384pub unsafe fn _mm256_setr_epi32(
0731742a
XL
2385 e0: i32,
2386 e1: i32,
2387 e2: i32,
2388 e3: i32,
2389 e4: i32,
2390 e5: i32,
2391 e6: i32,
2392 e7: i32,
0531ce1d 2393) -> __m256i {
532ac7d7 2394 transmute(i32x8::new(e0, e1, e2, e3, e4, e5, e6, e7))
0531ce1d
XL
2395}
2396
532ac7d7 2397/// Sets packed 64-bit integers in returned vector with the supplied values in
0531ce1d 2398/// reverse order.
83c7162d 2399///
353b0b11 2400/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi64x)
0531ce1d
XL
2401#[inline]
2402#[target_feature(enable = "avx")]
2403// This intrinsic has no corresponding instruction.
83c7162d 2404#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2405pub unsafe fn _mm256_setr_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i {
532ac7d7 2406 transmute(i64x4::new(a, b, c, d))
0531ce1d
XL
2407}
2408
532ac7d7 2409/// Broadcasts double-precision (64-bit) floating-point value `a` to all
0531ce1d 2410/// elements of the returned vector.
83c7162d 2411///
353b0b11 2412/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_pd)
0531ce1d
XL
2413#[inline]
2414#[target_feature(enable = "avx")]
2415// This intrinsic has no corresponding instruction.
83c7162d 2416#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
2417pub unsafe fn _mm256_set1_pd(a: f64) -> __m256d {
2418 _mm256_setr_pd(a, a, a, a)
2419}
2420
532ac7d7 2421/// Broadcasts single-precision (32-bit) floating-point value `a` to all
0531ce1d 2422/// elements of the returned vector.
83c7162d 2423///
353b0b11 2424/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_ps)
0531ce1d
XL
2425#[inline]
2426#[target_feature(enable = "avx")]
2427// This intrinsic has no corresponding instruction.
83c7162d 2428#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
2429pub unsafe fn _mm256_set1_ps(a: f32) -> __m256 {
2430 _mm256_setr_ps(a, a, a, a, a, a, a, a)
2431}
2432
532ac7d7 2433/// Broadcasts 8-bit integer `a` to all elements of the returned vector.
0531ce1d 2434/// This intrinsic may generate the `vpbroadcastb` instruction.
83c7162d 2435///
353b0b11 2436/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi8)
0531ce1d
XL
2437#[inline]
2438#[target_feature(enable = "avx")]
0531ce1d 2439// This intrinsic has no corresponding instruction.
83c7162d 2440#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2441pub unsafe fn _mm256_set1_epi8(a: i8) -> __m256i {
0731742a 2442 #[rustfmt::skip]
0531ce1d
XL
2443 _mm256_setr_epi8(
2444 a, a, a, a, a, a, a, a,
2445 a, a, a, a, a, a, a, a,
2446 a, a, a, a, a, a, a, a,
2447 a, a, a, a, a, a, a, a,
2448 )
2449}
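
// Illustrative sketch, not one of the Intel intrinsics above (assumes an
// AVX-capable CPU at runtime): `set1` broadcasts one byte into all 32 lanes.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn set1_epi8_broadcast_sketch() {
    let v = _mm256_set1_epi8(0x5A);
    let bytes: [i8; 32] = transmute(v);
    assert_eq!(bytes, [0x5A; 32]);
}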
2450
9ffffee4 2451/// Broadcasts 16-bit integer `a` to all elements of the returned vector.
0531ce1d 2452/// This intrinsic may generate the `vpbroadcastw` instruction.
83c7162d 2453///
353b0b11 2454/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi16)
0531ce1d
XL
2455#[inline]
2456#[target_feature(enable = "avx")]
2457//#[cfg_attr(test, assert_instr(vpshufb))]
2458#[cfg_attr(test, assert_instr(vinsertf128))]
2459// This intrinsic has no corresponding instruction.
83c7162d 2460#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
2461pub unsafe fn _mm256_set1_epi16(a: i16) -> __m256i {
2462 _mm256_setr_epi16(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a)
2463}
2464
532ac7d7 2465/// Broadcasts 32-bit integer `a` to all elements of the returned vector.
0531ce1d 2466/// This intrinsic may generate the `vpbroadcastd` instruction.
83c7162d 2467///
353b0b11 2468/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi32)
0531ce1d
XL
2469#[inline]
2470#[target_feature(enable = "avx")]
2471// This intrinsic has no corresponding instruction.
83c7162d 2472#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
2473pub unsafe fn _mm256_set1_epi32(a: i32) -> __m256i {
2474 _mm256_setr_epi32(a, a, a, a, a, a, a, a)
2475}
2476
532ac7d7 2477/// Broadcasts 64-bit integer `a` to all elements of the returned vector.
0531ce1d 2478/// This intrinsic may generate the `vpbroadcastq` instruction.
83c7162d 2479///
353b0b11 2480/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi64x)
0531ce1d
XL
2481#[inline]
2482#[target_feature(enable = "avx")]
e1599b0c
XL
2483#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(vinsertf128))]
2484#[cfg_attr(all(test, target_arch = "x86"), assert_instr(vbroadcastsd))]
0531ce1d 2485// This intrinsic has no corresponding instruction.
83c7162d 2486#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
2487pub unsafe fn _mm256_set1_epi64x(a: i64) -> __m256i {
2488 _mm256_setr_epi64x(a, a, a, a)
2489}
2490
2491/// Casts vector of type __m256d to type __m256.
83c7162d 2492///
353b0b11 2493/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd_ps)
0531ce1d
XL
2494#[inline]
2495#[target_feature(enable = "avx")]
2496// This intrinsic is only used for compilation and does not generate any
2497// instructions, thus it has zero latency.
83c7162d 2498#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2499pub unsafe fn _mm256_castpd_ps(a: __m256d) -> __m256 {
532ac7d7 2500 transmute(a)
0531ce1d
XL
2501}
2502
2503/// Casts vector of type __m256 to type __m256d.
83c7162d 2504///
353b0b11 2505/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps_pd)
0531ce1d
XL
2506#[inline]
2507#[target_feature(enable = "avx")]
2508// This intrinsic is only used for compilation and does not generate any
2509// instructions, thus it has zero latency.
83c7162d 2510#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2511pub unsafe fn _mm256_castps_pd(a: __m256) -> __m256d {
532ac7d7 2512 transmute(a)
0531ce1d
XL
2513}
2514
2515/// Casts vector of type __m256 to type __m256i.
83c7162d 2516///
353b0b11 2517/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps_si256)
0531ce1d
XL
2518#[inline]
2519#[target_feature(enable = "avx")]
2520// This intrinsic is only used for compilation and does not generate any
2521// instructions, thus it has zero latency.
83c7162d 2522#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2523pub unsafe fn _mm256_castps_si256(a: __m256) -> __m256i {
532ac7d7 2524 transmute(a)
0531ce1d
XL
2525}
2526
2527/// Casts vector of type __m256i to type __m256.
83c7162d 2528///
353b0b11 2529/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_ps)
0531ce1d
XL
2530#[inline]
2531#[target_feature(enable = "avx")]
2532// This intrinsic is only used for compilation and does not generate any
2533// instructions, thus it has zero latency.
83c7162d 2534#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2535pub unsafe fn _mm256_castsi256_ps(a: __m256i) -> __m256 {
532ac7d7 2536 transmute(a)
0531ce1d
XL
2537}
2538
2539/// Casts vector of type __m256d to type __m256i.
83c7162d 2540///
353b0b11 2541/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd_si256)
0531ce1d
XL
2542#[inline]
2543#[target_feature(enable = "avx")]
2544// This intrinsic is only used for compilation and does not generate any
2545// instructions, thus it has zero latency.
83c7162d 2546#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2547pub unsafe fn _mm256_castpd_si256(a: __m256d) -> __m256i {
532ac7d7 2548 transmute(a)
0531ce1d
XL
2549}
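
// Illustrative sketch, not one of the Intel intrinsics above (assumes an
// AVX-capable CPU at runtime): the `cast*` intrinsics only reinterpret bits,
// so casting a vector of `1.0` doubles to integers yields the IEEE-754 bit
// pattern of 1.0 in each 64-bit lane, not the integer 1.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn castpd_si256_is_a_bitcast_sketch() {
    let pd = _mm256_set1_pd(1.0);
    let bits = _mm256_castpd_si256(pd);
    let lanes: [u64; 4] = transmute(bits);
    assert_eq!(lanes, [1.0f64.to_bits(); 4]);
}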
2550
2551/// Casts vector of type __m256i to type __m256d.
83c7162d 2552///
353b0b11 2553/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_pd)
0531ce1d
XL
2554#[inline]
2555#[target_feature(enable = "avx")]
2556// This intrinsic is only used for compilation and does not generate any
2557// instructions, thus it has zero latency.
83c7162d 2558#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2559pub unsafe fn _mm256_castsi256_pd(a: __m256i) -> __m256d {
532ac7d7 2560 transmute(a)
0531ce1d
XL
2561}
2562
2563/// Casts vector of type __m256 to type __m128.
83c7162d 2564///
353b0b11 2565/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps256_ps128)
0531ce1d
XL
2566#[inline]
2567#[target_feature(enable = "avx")]
2568// This intrinsic is only used for compilation and does not generate any
2569// instructions, thus it has zero latency.
83c7162d 2570#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2571pub unsafe fn _mm256_castps256_ps128(a: __m256) -> __m128 {
353b0b11 2572 simd_shuffle!(a, a, [0, 1, 2, 3])
0531ce1d
XL
2573}
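
// Illustrative sketch, not one of the Intel intrinsics above (assumes an
// AVX-capable CPU at runtime): the narrowing cast keeps the low 128 bits and
// drops the upper half.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn castps256_ps128_keeps_low_half_sketch() {
    let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
    let lo = _mm256_castps256_ps128(a);
    let lanes: [f32; 4] = transmute(lo);
    assert_eq!(lanes, [1.0, 2.0, 3.0, 4.0]);
}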
2574
2575/// Casts vector of type __m256d to type __m128d.
83c7162d 2576///
353b0b11 2577/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd256_pd128)
0531ce1d
XL
2578#[inline]
2579#[target_feature(enable = "avx")]
2580// This intrinsic is only used for compilation and does not generate any
2581// instructions, thus it has zero latency.
83c7162d 2582#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2583pub unsafe fn _mm256_castpd256_pd128(a: __m256d) -> __m128d {
353b0b11 2584 simd_shuffle!(a, a, [0, 1])
0531ce1d
XL
2585}
2586
2587/// Casts vector of type __m256i to type __m128i.
83c7162d 2588///
353b0b11 2589/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_si128)
0531ce1d
XL
2590#[inline]
2591#[target_feature(enable = "avx")]
2592// This intrinsic is only used for compilation and does not generate any
2593// instructions, thus it has zero latency.
83c7162d 2594#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
2595pub unsafe fn _mm256_castsi256_si128(a: __m256i) -> __m128i {
2596 let a = a.as_i64x4();
353b0b11 2597 let dst: i64x2 = simd_shuffle!(a, a, [0, 1]);
532ac7d7 2598 transmute(dst)
0531ce1d
XL
2599}
2600
2601/// Casts vector of type __m128 to type __m256;
2602/// the upper 128 bits of the result are undefined.
83c7162d 2603///
353b0b11 2604/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps128_ps256)
0531ce1d
XL
2605#[inline]
2606#[target_feature(enable = "avx")]
2607// This intrinsic is only used for compilation and does not generate any
2608// instructions, thus it has zero latency.
83c7162d 2609#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2610pub unsafe fn _mm256_castps128_ps256(a: __m128) -> __m256 {
353b0b11
FG
2611 // FIXME simd_shuffle!(a, a, [0, 1, 2, 3, -1, -1, -1, -1])
2612 simd_shuffle!(a, a, [0, 1, 2, 3, 0, 0, 0, 0])
0531ce1d
XL
2613}
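
// Illustrative sketch, not one of the Intel intrinsics above (assumes an
// AVX-capable CPU at runtime): the widening cast preserves the low 128 bits;
// the upper four lanes are unspecified, so only the low lanes are checked.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn castps128_ps256_low_half_sketch() {
    let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
    let wide = _mm256_castps128_ps256(a);
    let lanes: [f32; 8] = transmute(wide);
    assert_eq!([lanes[0], lanes[1], lanes[2], lanes[3]], [1.0, 2.0, 3.0, 4.0]);
}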
2614
2615/// Casts vector of type __m128d to type __m256d;
2616/// the upper 128 bits of the result are undefined.
83c7162d 2617///
353b0b11 2618/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd128_pd256)
0531ce1d
XL
2619#[inline]
2620#[target_feature(enable = "avx")]
2621// This intrinsic is only used for compilation and does not generate any
2622// instructions, thus it has zero latency.
83c7162d 2623#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2624pub unsafe fn _mm256_castpd128_pd256(a: __m128d) -> __m256d {
353b0b11
FG
2625 // FIXME simd_shuffle!(a, a, [0, 1, -1, -1])
2626 simd_shuffle!(a, a, [0, 1, 0, 0])
0531ce1d
XL
2627}
2628
2629/// Casts vector of type __m128i to type __m256i;
2630/// the upper 128 bits of the result are undefined.
83c7162d 2631///
353b0b11 2632/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi128_si256)
0531ce1d
XL
2633#[inline]
2634#[target_feature(enable = "avx")]
2635// This intrinsic is only used for compilation and does not generate any
2636// instructions, thus it has zero latency.
83c7162d 2637#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
2638pub unsafe fn _mm256_castsi128_si256(a: __m128i) -> __m256i {
2639 let a = a.as_i64x2();
353b0b11
FG
2640 // FIXME simd_shuffle!(a, a, [0, 1, -1, -1])
2641 let dst: i64x4 = simd_shuffle!(a, a, [0, 1, 0, 0]);
532ac7d7 2642 transmute(dst)
0531ce1d
XL
2643}
2644
83c7162d
XL
2645/// Constructs a 256-bit floating-point vector of `[8 x float]` from a
2646/// 128-bit floating-point vector of `[4 x float]`. The lower 128 bits contain
0531ce1d 2647/// the value of the source vector. The upper 128 bits are set to zero.
83c7162d 2648///
353b0b11 2649/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextps128_ps256)
0531ce1d
XL
2650#[inline]
2651#[target_feature(enable = "avx,sse")]
2652// This intrinsic is only used for compilation and does not generate any
2653// instructions, thus it has zero latency.
83c7162d 2654#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2655pub unsafe fn _mm256_zextps128_ps256(a: __m128) -> __m256 {
353b0b11 2656 simd_shuffle!(a, _mm_setzero_ps(), [0, 1, 2, 3, 4, 5, 6, 7])
0531ce1d
XL
2657}
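
// Illustrative sketch, not one of the Intel intrinsics above (assumes an
// AVX-capable CPU at runtime): unlike the plain cast, `zext` guarantees the
// upper 128 bits of the result are zero.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn zextps128_ps256_zeroes_high_half_sketch() {
    let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
    let wide = _mm256_zextps128_ps256(a);
    let lanes: [f32; 8] = transmute(wide);
    assert_eq!(lanes, [1.0, 2.0, 3.0, 4.0, 0.0, 0.0, 0.0, 0.0]);
}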
2658
2659/// Constructs a 256-bit integer vector from a 128-bit integer vector.
2660/// The lower 128 bits contain the value of the source vector. The upper
2661/// 128 bits are set to zero.
83c7162d 2662///
353b0b11 2663/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextsi128_si256)
0531ce1d
XL
2664#[inline]
2665#[target_feature(enable = "avx,sse2")]
2666// This intrinsic is only used for compilation and does not generate any
2667// instructions, thus it has zero latency.
83c7162d 2668#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
2669pub unsafe fn _mm256_zextsi128_si256(a: __m128i) -> __m256i {
2670 let b = _mm_setzero_si128().as_i64x2();
353b0b11 2671 let dst: i64x4 = simd_shuffle!(a.as_i64x2(), b, [0, 1, 2, 3]);
532ac7d7 2672 transmute(dst)
0531ce1d
XL
2673}
2674
83c7162d
XL
2675/// Constructs a 256-bit floating-point vector of `[4 x double]` from a
2676/// 128-bit floating-point vector of `[2 x double]`. The lower 128 bits
0531ce1d
XL
2677/// contain the value of the source vector. The upper 128 bits are set
2678/// to zero.
83c7162d 2679///
353b0b11 2680/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextpd128_pd256)
0531ce1d
XL
2681#[inline]
2682#[target_feature(enable = "avx,sse2")]
2683// This intrinsic is only used for compilation and does not generate any
2684// instructions, thus it has zero latency.
83c7162d 2685#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2686pub unsafe fn _mm256_zextpd128_pd256(a: __m128d) -> __m256d {
353b0b11 2687 simd_shuffle!(a, _mm_setzero_pd(), [0, 1, 2, 3])
0531ce1d
XL
2688}
2689
49aad941
FG
2690/// Returns vector of type `__m256` with indeterminate elements.
2691/// Despite being "undefined", this is some valid value and not equivalent to [`mem::MaybeUninit`].
2692/// In practice, this is equivalent to [`mem::zeroed`].
83c7162d 2693///
353b0b11 2694/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_ps)
0531ce1d
XL
2695#[inline]
2696#[target_feature(enable = "avx")]
2697// This intrinsic has no corresponding instruction.
83c7162d 2698#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2699pub unsafe fn _mm256_undefined_ps() -> __m256 {
3dfed10e 2700 _mm256_set1_ps(0.0)
0531ce1d
XL
2701}
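
// Illustrative sketch, not one of the Intel intrinsics above (assumes an
// AVX-capable CPU at runtime): `_mm256_undefined_ps` is a cheap "don't care"
// starting value; here both 128-bit halves are overwritten before the vector
// is read.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn undefined_ps_as_placeholder_sketch() {
    let lo = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
    let hi = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
    let v = _mm256_insertf128_ps::<0>(_mm256_undefined_ps(), lo);
    let v = _mm256_insertf128_ps::<1>(v, hi);
    let lanes: [f32; 8] = transmute(v);
    assert_eq!(lanes, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
}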
2702
49aad941
FG
2703/// Returns vector of type `__m256d` with indeterminate elements.
2704/// Despite being "undefined", this is some valid value and not equivalent to [`mem::MaybeUninit`].
2705/// In practice, this is equivalent to [`mem::zeroed`].
83c7162d 2706///
353b0b11 2707/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_pd)
0531ce1d
XL
2708#[inline]
2709#[target_feature(enable = "avx")]
2710// This intrinsic has no corresponding instruction.
83c7162d 2711#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2712pub unsafe fn _mm256_undefined_pd() -> __m256d {
3dfed10e 2713 _mm256_set1_pd(0.0)
0531ce1d
XL
2714}
2715
49aad941
FG
2716/// Returns vector of type `__m256i` with indeterminate elements.
2717/// Despite being "undefined", this is some valid value and not equivalent to [`mem::MaybeUninit`].
2718/// In practice, this is equivalent to [`mem::zeroed`].
83c7162d 2719///
353b0b11 2720/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_si256)
0531ce1d
XL
2721#[inline]
2722#[target_feature(enable = "avx")]
2723// This intrinsic has no corresponding instruction.
83c7162d 2724#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2725pub unsafe fn _mm256_undefined_si256() -> __m256i {
04454e1e 2726 __m256i(0, 0, 0, 0)
0531ce1d
XL
2727}
2728
532ac7d7 2729/// Sets packed __m256 returned vector with the supplied values: `hi` becomes the upper 128 bits and `lo` the lower 128 bits.
83c7162d 2730///
353b0b11 2731/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128)
0531ce1d
XL
2732#[inline]
2733#[target_feature(enable = "avx")]
2734#[cfg_attr(test, assert_instr(vinsertf128))]
83c7162d 2735#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2736pub unsafe fn _mm256_set_m128(hi: __m128, lo: __m128) -> __m256 {
353b0b11 2737 simd_shuffle!(lo, hi, [0, 1, 2, 3, 4, 5, 6, 7])
0531ce1d
XL
2738}
2739
532ac7d7 2740/// Sets packed __m256d returned vector with the supplied values: `hi` becomes the upper 128 bits and `lo` the lower 128 bits.
83c7162d 2741///
353b0b11 2742/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128d)
0531ce1d
XL
2743#[inline]
2744#[target_feature(enable = "avx")]
2745#[cfg_attr(test, assert_instr(vinsertf128))]
83c7162d 2746#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2747pub unsafe fn _mm256_set_m128d(hi: __m128d, lo: __m128d) -> __m256d {
532ac7d7
XL
2748 let hi: __m128 = transmute(hi);
2749 let lo: __m128 = transmute(lo);
2750 transmute(_mm256_set_m128(hi, lo))
0531ce1d
XL
2751}
2752
532ac7d7 2753/// Sets packed __m256i returned vector with the supplied values: `hi` becomes the upper 128 bits and `lo` the lower 128 bits.
83c7162d 2754///
353b0b11 2755/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128i)
0531ce1d
XL
2756#[inline]
2757#[target_feature(enable = "avx")]
2758#[cfg_attr(test, assert_instr(vinsertf128))]
83c7162d 2759#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 2760pub unsafe fn _mm256_set_m128i(hi: __m128i, lo: __m128i) -> __m256i {
532ac7d7
XL
2761 let hi: __m128 = transmute(hi);
2762 let lo: __m128 = transmute(lo);
2763 transmute(_mm256_set_m128(hi, lo))
0531ce1d
XL
2764}
2765
532ac7d7 2766/// Sets packed __m256 returned vector with the supplied values: `lo` becomes the lower 128 bits and `hi` the upper 128 bits.
83c7162d 2767///
353b0b11 2768/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128)
0531ce1d
XL
2769#[inline]
2770#[target_feature(enable = "avx")]
2771#[cfg_attr(test, assert_instr(vinsertf128))]
83c7162d 2772#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
2773pub unsafe fn _mm256_setr_m128(lo: __m128, hi: __m128) -> __m256 {
2774 _mm256_set_m128(hi, lo)
2775}
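
// Illustrative sketch, not one of the Intel intrinsics above (assumes an
// AVX-capable CPU at runtime): `set_m128(hi, lo)` and `setr_m128(lo, hi)`
// build the same vector; only the argument order differs.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn set_m128_vs_setr_m128_sketch() {
    let lo = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
    let hi = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
    let a: [f32; 8] = transmute(_mm256_set_m128(hi, lo));
    let b: [f32; 8] = transmute(_mm256_setr_m128(lo, hi));
    assert_eq!(a, b);
    assert_eq!(a, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
}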
2776
532ac7d7 2777/// Sets packed __m256d returned vector with the supplied values: `lo` becomes the lower 128 bits and `hi` the upper 128 bits.
83c7162d 2778///
353b0b11 2779/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128d)
0531ce1d
XL
2780#[inline]
2781#[target_feature(enable = "avx")]
2782#[cfg_attr(test, assert_instr(vinsertf128))]
83c7162d 2783#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
2784pub unsafe fn _mm256_setr_m128d(lo: __m128d, hi: __m128d) -> __m256d {
2785 _mm256_set_m128d(hi, lo)
2786}
2787
532ac7d7 2788/// Sets packed __m256i returned vector with the supplied values: `lo` becomes the lower 128 bits and `hi` the upper 128 bits.
83c7162d 2789///
353b0b11 2790/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128i)
0531ce1d
XL
2791#[inline]
2792#[target_feature(enable = "avx")]
2793#[cfg_attr(test, assert_instr(vinsertf128))]
83c7162d 2794#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
2795pub unsafe fn _mm256_setr_m128i(lo: __m128i, hi: __m128i) -> __m256i {
2796 _mm256_set_m128i(hi, lo)
2797}
2798
532ac7d7 2799/// Loads two 128-bit values (composed of 4 packed single-precision (32-bit)
0531ce1d
XL
2800/// floating-point elements) from memory, and combines them into a 256-bit
2801/// value.
2802/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
83c7162d 2803///
353b0b11 2804/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128)
0531ce1d
XL
2805#[inline]
2806#[target_feature(enable = "avx,sse")]
2807// This intrinsic has no corresponding instruction.
83c7162d 2808#[stable(feature = "simd_x86", since = "1.27.0")]
0731742a 2809pub unsafe fn _mm256_loadu2_m128(hiaddr: *const f32, loaddr: *const f32) -> __m256 {
0531ce1d 2810 let a = _mm256_castps128_ps256(_mm_loadu_ps(loaddr));
17df50a5 2811 _mm256_insertf128_ps::<1>(a, _mm_loadu_ps(hiaddr))
0531ce1d
XL
2812}
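
// Illustrative sketch, not one of the Intel intrinsics above (assumes an
// AVX-capable CPU at runtime): the two unaligned 128-bit halves come from
// separate locations, and the `loaddr` half ends up in the low lanes.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn loadu2_m128_sketch() {
    let lo = [1.0f32, 2.0, 3.0, 4.0];
    let hi = [5.0f32, 6.0, 7.0, 8.0];
    let v = _mm256_loadu2_m128(hi.as_ptr(), lo.as_ptr());
    let lanes: [f32; 8] = transmute(v);
    assert_eq!(lanes, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
}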
2813
532ac7d7 2814/// Loads two 128-bit values (composed of 2 packed double-precision (64-bit)
0531ce1d
XL
2815/// floating-point elements) from memory, and combines them into a 256-bit
2816/// value.
2817/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
83c7162d 2818///
353b0b11 2819/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128d)
0531ce1d
XL
2820#[inline]
2821#[target_feature(enable = "avx,sse2")]
2822// This intrinsic has no corresponding instruction.
83c7162d 2823#[stable(feature = "simd_x86", since = "1.27.0")]
0731742a 2824pub unsafe fn _mm256_loadu2_m128d(hiaddr: *const f64, loaddr: *const f64) -> __m256d {
0531ce1d 2825 let a = _mm256_castpd128_pd256(_mm_loadu_pd(loaddr));
17df50a5 2826 _mm256_insertf128_pd::<1>(a, _mm_loadu_pd(hiaddr))
0531ce1d
XL
2827}
2828
532ac7d7 2829/// Loads two 128-bit values (composed of integer data) from memory, and combines
0531ce1d
XL
2830/// them into a 256-bit value.
2831/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
83c7162d 2832///
353b0b11 2833/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128i)
0531ce1d
XL
2834#[inline]
2835#[target_feature(enable = "avx,sse2")]
2836// This intrinsic has no corresponding instruction.
83c7162d 2837#[stable(feature = "simd_x86", since = "1.27.0")]
0731742a 2838pub unsafe fn _mm256_loadu2_m128i(hiaddr: *const __m128i, loaddr: *const __m128i) -> __m256i {
0531ce1d 2839 let a = _mm256_castsi128_si256(_mm_loadu_si128(loaddr));
17df50a5 2840 _mm256_insertf128_si256::<1>(a, _mm_loadu_si128(hiaddr))
0531ce1d
XL
2841}
2842
532ac7d7 2843/// Stores the high and low 128-bit halves (each composed of 4 packed
0531ce1d
XL
2844/// single-precision (32-bit) floating-point elements) from `a` into memory at two
2845/// different 128-bit locations.
2846/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
83c7162d 2847///
353b0b11 2848/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128)
0531ce1d
XL
2849#[inline]
2850#[target_feature(enable = "avx,sse")]
2851// This intrinsic has no corresponding instruction.
83c7162d 2852#[stable(feature = "simd_x86", since = "1.27.0")]
0731742a 2853pub unsafe fn _mm256_storeu2_m128(hiaddr: *mut f32, loaddr: *mut f32, a: __m256) {
0531ce1d
XL
2854 let lo = _mm256_castps256_ps128(a);
2855 _mm_storeu_ps(loaddr, lo);
17df50a5 2856 let hi = _mm256_extractf128_ps::<1>(a);
0531ce1d
XL
2857 _mm_storeu_ps(hiaddr, hi);
2858}
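
// Illustrative sketch, not one of the Intel intrinsics above (assumes an
// AVX-capable CPU at runtime): the inverse of the two-pointer load; the low
// half of `a` goes to `loaddr` and the high half to `hiaddr`.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn storeu2_m128_sketch() {
    let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
    let mut lo = [0.0f32; 4];
    let mut hi = [0.0f32; 4];
    _mm256_storeu2_m128(hi.as_mut_ptr(), lo.as_mut_ptr(), a);
    assert_eq!(lo, [1.0, 2.0, 3.0, 4.0]);
    assert_eq!(hi, [5.0, 6.0, 7.0, 8.0]);
}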
2859
532ac7d7 2860/// Stores the high and low 128-bit halves (each composed of 2 packed
0531ce1d
XL
2861/// double-precision (64-bit) floating-point elements) from `a` into memory at two
2862/// different 128-bit locations.
2863/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
83c7162d 2864///
353b0b11 2865/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128d)
0531ce1d
XL
2866#[inline]
2867#[target_feature(enable = "avx,sse2")]
2868// This intrinsic has no corresponding instruction.
83c7162d 2869#[stable(feature = "simd_x86", since = "1.27.0")]
0731742a 2870pub unsafe fn _mm256_storeu2_m128d(hiaddr: *mut f64, loaddr: *mut f64, a: __m256d) {
0531ce1d
XL
2871 let lo = _mm256_castpd256_pd128(a);
2872 _mm_storeu_pd(loaddr, lo);
17df50a5 2873 let hi = _mm256_extractf128_pd::<1>(a);
0531ce1d
XL
2874 _mm_storeu_pd(hiaddr, hi);
2875}
2876
532ac7d7 2877/// Stores the high and low 128-bit halves (each composed of integer data) from
0531ce1d
XL
2878/// `a` into memory at two different 128-bit locations.
2879/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
83c7162d 2880///
353b0b11 2881/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128i)
0531ce1d
XL
2882#[inline]
2883#[target_feature(enable = "avx,sse2")]
2884// This intrinsic has no corresponding instruction.
83c7162d 2885#[stable(feature = "simd_x86", since = "1.27.0")]
0731742a 2886pub unsafe fn _mm256_storeu2_m128i(hiaddr: *mut __m128i, loaddr: *mut __m128i, a: __m256i) {
0531ce1d
XL
2887 let lo = _mm256_castsi256_si128(a);
2888 _mm_storeu_si128(loaddr, lo);
17df50a5 2889 let hi = _mm256_extractf128_si256::<1>(a);
0531ce1d
XL
2890 _mm_storeu_si128(hiaddr, hi);
2891}
2892
83c7162d
XL
2893/// Returns the first element of the input vector of `[8 x float]`.
2894///
353b0b11 2895/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtss_f32)
0531ce1d
XL
2896#[inline]
2897#[target_feature(enable = "avx")]
2898//#[cfg_attr(test, assert_instr(movss))] FIXME
83c7162d 2899#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
2900pub unsafe fn _mm256_cvtss_f32(a: __m256) -> f32 {
2901 simd_extract(a, 0)
2902}
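
// Illustrative sketch, not one of the Intel intrinsics above (assumes an
// AVX-capable CPU at runtime): `cvtss_f32` is a scalar read of lane 0, i.e.
// the first `setr` argument.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx")]
unsafe fn cvtss_f32_sketch() {
    let a = _mm256_setr_ps(3.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0);
    assert_eq!(_mm256_cvtss_f32(a), 3.5);
}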
2903
5e7ed085 2904// LLVM intrinsics used in the above functions
0531ce1d
XL
2905#[allow(improper_ctypes)]
2906extern "C" {
2907 #[link_name = "llvm.x86.avx.addsub.pd.256"]
2908 fn addsubpd256(a: __m256d, b: __m256d) -> __m256d;
2909 #[link_name = "llvm.x86.avx.addsub.ps.256"]
2910 fn addsubps256(a: __m256, b: __m256) -> __m256;
0531ce1d
XL
2911 #[link_name = "llvm.x86.avx.round.pd.256"]
2912 fn roundpd256(a: __m256d, b: i32) -> __m256d;
2913 #[link_name = "llvm.x86.avx.round.ps.256"]
2914 fn roundps256(a: __m256, b: i32) -> __m256;
0531ce1d
XL
2915 #[link_name = "llvm.x86.avx.sqrt.ps.256"]
2916 fn sqrtps256(a: __m256) -> __m256;
2917 #[link_name = "llvm.x86.avx.blendv.pd.256"]
2918 fn vblendvpd(a: __m256d, b: __m256d, c: __m256d) -> __m256d;
2919 #[link_name = "llvm.x86.avx.blendv.ps.256"]
2920 fn vblendvps(a: __m256, b: __m256, c: __m256) -> __m256;
2921 #[link_name = "llvm.x86.avx.dp.ps.256"]
2922 fn vdpps(a: __m256, b: __m256, imm8: i32) -> __m256;
2923 #[link_name = "llvm.x86.avx.hadd.pd.256"]
2924 fn vhaddpd(a: __m256d, b: __m256d) -> __m256d;
2925 #[link_name = "llvm.x86.avx.hadd.ps.256"]
2926 fn vhaddps(a: __m256, b: __m256) -> __m256;
2927 #[link_name = "llvm.x86.avx.hsub.pd.256"]
2928 fn vhsubpd(a: __m256d, b: __m256d) -> __m256d;
2929 #[link_name = "llvm.x86.avx.hsub.ps.256"]
2930 fn vhsubps(a: __m256, b: __m256) -> __m256;
2931 #[link_name = "llvm.x86.sse2.cmp.pd"]
3dfed10e 2932 fn vcmppd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
0531ce1d
XL
2933 #[link_name = "llvm.x86.avx.cmp.pd.256"]
2934 fn vcmppd256(a: __m256d, b: __m256d, imm8: u8) -> __m256d;
2935 #[link_name = "llvm.x86.sse.cmp.ps"]
3dfed10e 2936 fn vcmpps(a: __m128, b: __m128, imm8: i8) -> __m128;
0531ce1d
XL
2937 #[link_name = "llvm.x86.avx.cmp.ps.256"]
2938 fn vcmpps256(a: __m256, b: __m256, imm8: u8) -> __m256;
2939 #[link_name = "llvm.x86.sse2.cmp.sd"]
3dfed10e 2940 fn vcmpsd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
0531ce1d 2941 #[link_name = "llvm.x86.sse.cmp.ss"]
3dfed10e 2942 fn vcmpss(a: __m128, b: __m128, imm8: i8) -> __m128;
0531ce1d
XL
2943 #[link_name = "llvm.x86.avx.cvtdq2.ps.256"]
2944 fn vcvtdq2ps(a: i32x8) -> __m256;
2945 #[link_name = "llvm.x86.avx.cvt.pd2.ps.256"]
2946 fn vcvtpd2ps(a: __m256d) -> __m128;
2947 #[link_name = "llvm.x86.avx.cvt.ps2dq.256"]
2948 fn vcvtps2dq(a: __m256) -> i32x8;
2949 #[link_name = "llvm.x86.avx.cvtt.pd2dq.256"]
2950 fn vcvttpd2dq(a: __m256d) -> i32x4;
2951 #[link_name = "llvm.x86.avx.cvt.pd2dq.256"]
2952 fn vcvtpd2dq(a: __m256d) -> i32x4;
2953 #[link_name = "llvm.x86.avx.cvtt.ps2dq.256"]
2954 fn vcvttps2dq(a: __m256) -> i32x8;
2955 #[link_name = "llvm.x86.avx.vzeroall"]
2956 fn vzeroall();
2957 #[link_name = "llvm.x86.avx.vzeroupper"]
2958 fn vzeroupper();
2959 #[link_name = "llvm.x86.avx.vpermilvar.ps.256"]
2960 fn vpermilps256(a: __m256, b: i32x8) -> __m256;
2961 #[link_name = "llvm.x86.avx.vpermilvar.ps"]
2962 fn vpermilps(a: __m128, b: i32x4) -> __m128;
2963 #[link_name = "llvm.x86.avx.vpermilvar.pd.256"]
2964 fn vpermilpd256(a: __m256d, b: i64x4) -> __m256d;
2965 #[link_name = "llvm.x86.avx.vpermilvar.pd"]
2966 fn vpermilpd(a: __m128d, b: i64x2) -> __m128d;
2967 #[link_name = "llvm.x86.avx.vperm2f128.ps.256"]
2968 fn vperm2f128ps256(a: __m256, b: __m256, imm8: i8) -> __m256;
2969 #[link_name = "llvm.x86.avx.vperm2f128.pd.256"]
2970 fn vperm2f128pd256(a: __m256d, b: __m256d, imm8: i8) -> __m256d;
2971 #[link_name = "llvm.x86.avx.vperm2f128.si.256"]
2972 fn vperm2f128si256(a: i32x8, b: i32x8, imm8: i8) -> i32x8;
2973 #[link_name = "llvm.x86.avx.vbroadcastf128.ps.256"]
2974 fn vbroadcastf128ps256(a: &__m128) -> __m256;
2975 #[link_name = "llvm.x86.avx.vbroadcastf128.pd.256"]
2976 fn vbroadcastf128pd256(a: &__m128d) -> __m256d;
2977 #[link_name = "llvm.x86.avx.storeu.pd.256"]
2978 fn storeupd256(mem_addr: *mut f64, a: __m256d);
2979 #[link_name = "llvm.x86.avx.storeu.ps.256"]
2980 fn storeups256(mem_addr: *mut f32, a: __m256);
2981 #[link_name = "llvm.x86.avx.storeu.dq.256"]
2982 fn storeudq256(mem_addr: *mut i8, a: i8x32);
2983 #[link_name = "llvm.x86.avx.maskload.pd.256"]
2984 fn maskloadpd256(mem_addr: *const i8, mask: i64x4) -> __m256d;
2985 #[link_name = "llvm.x86.avx.maskstore.pd.256"]
2986 fn maskstorepd256(mem_addr: *mut i8, mask: i64x4, a: __m256d);
2987 #[link_name = "llvm.x86.avx.maskload.pd"]
2988 fn maskloadpd(mem_addr: *const i8, mask: i64x2) -> __m128d;
2989 #[link_name = "llvm.x86.avx.maskstore.pd"]
2990 fn maskstorepd(mem_addr: *mut i8, mask: i64x2, a: __m128d);
2991 #[link_name = "llvm.x86.avx.maskload.ps.256"]
2992 fn maskloadps256(mem_addr: *const i8, mask: i32x8) -> __m256;
2993 #[link_name = "llvm.x86.avx.maskstore.ps.256"]
2994 fn maskstoreps256(mem_addr: *mut i8, mask: i32x8, a: __m256);
2995 #[link_name = "llvm.x86.avx.maskload.ps"]
2996 fn maskloadps(mem_addr: *const i8, mask: i32x4) -> __m128;
2997 #[link_name = "llvm.x86.avx.maskstore.ps"]
2998 fn maskstoreps(mem_addr: *mut i8, mask: i32x4, a: __m128);
2999 #[link_name = "llvm.x86.avx.ldu.dq.256"]
3000 fn vlddqu(mem_addr: *const i8) -> i8x32;
3001 #[link_name = "llvm.x86.avx.rcp.ps.256"]
3002 fn vrcpps(a: __m256) -> __m256;
3003 #[link_name = "llvm.x86.avx.rsqrt.ps.256"]
3004 fn vrsqrtps(a: __m256) -> __m256;
3005 #[link_name = "llvm.x86.avx.ptestz.256"]
3006 fn ptestz256(a: i64x4, b: i64x4) -> i32;
3007 #[link_name = "llvm.x86.avx.ptestc.256"]
3008 fn ptestc256(a: i64x4, b: i64x4) -> i32;
3009 #[link_name = "llvm.x86.avx.ptestnzc.256"]
3010 fn ptestnzc256(a: i64x4, b: i64x4) -> i32;
3011 #[link_name = "llvm.x86.avx.vtestz.pd.256"]
3012 fn vtestzpd256(a: __m256d, b: __m256d) -> i32;
3013 #[link_name = "llvm.x86.avx.vtestc.pd.256"]
3014 fn vtestcpd256(a: __m256d, b: __m256d) -> i32;
3015 #[link_name = "llvm.x86.avx.vtestnzc.pd.256"]
3016 fn vtestnzcpd256(a: __m256d, b: __m256d) -> i32;
3017 #[link_name = "llvm.x86.avx.vtestz.pd"]
3018 fn vtestzpd(a: __m128d, b: __m128d) -> i32;
3019 #[link_name = "llvm.x86.avx.vtestc.pd"]
3020 fn vtestcpd(a: __m128d, b: __m128d) -> i32;
3021 #[link_name = "llvm.x86.avx.vtestnzc.pd"]
3022 fn vtestnzcpd(a: __m128d, b: __m128d) -> i32;
3023 #[link_name = "llvm.x86.avx.vtestz.ps.256"]
3024 fn vtestzps256(a: __m256, b: __m256) -> i32;
3025 #[link_name = "llvm.x86.avx.vtestc.ps.256"]
3026 fn vtestcps256(a: __m256, b: __m256) -> i32;
3027 #[link_name = "llvm.x86.avx.vtestnzc.ps.256"]
3028 fn vtestnzcps256(a: __m256, b: __m256) -> i32;
3029 #[link_name = "llvm.x86.avx.vtestz.ps"]
3030 fn vtestzps(a: __m128, b: __m128) -> i32;
3031 #[link_name = "llvm.x86.avx.vtestc.ps"]
3032 fn vtestcps(a: __m128, b: __m128) -> i32;
3033 #[link_name = "llvm.x86.avx.vtestnzc.ps"]
3034 fn vtestnzcps(a: __m128, b: __m128) -> i32;
3035 #[link_name = "llvm.x86.avx.movmsk.pd.256"]
3036 fn movmskpd256(a: __m256d) -> i32;
3037 #[link_name = "llvm.x86.avx.movmsk.ps.256"]
3038 fn movmskps256(a: __m256) -> i32;
17df50a5
XL
3039 #[link_name = "llvm.x86.avx.min.ps.256"]
3040 fn vminps(a: __m256, b: __m256) -> __m256;
3041 #[link_name = "llvm.x86.avx.max.ps.256"]
3042 fn vmaxps(a: __m256, b: __m256) -> __m256;
3043 #[link_name = "llvm.x86.avx.min.pd.256"]
3044 fn vminpd(a: __m256d, b: __m256d) -> __m256d;
3045 #[link_name = "llvm.x86.avx.max.pd.256"]
3046 fn vmaxpd(a: __m256d, b: __m256d) -> __m256d;
0531ce1d
XL
3047}
3048
3049#[cfg(test)]
3050mod tests {
48663c56 3051 use crate::hint::black_box;
416331ca 3052 use stdarch_test::simd_test;
0531ce1d 3053
532ac7d7 3054 use crate::core_arch::x86::*;
0531ce1d 3055
83c7162d 3056 #[simd_test(enable = "avx")]
0531ce1d
XL
3057 unsafe fn test_mm256_add_pd() {
3058 let a = _mm256_setr_pd(1., 2., 3., 4.);
3059 let b = _mm256_setr_pd(5., 6., 7., 8.);
3060 let r = _mm256_add_pd(a, b);
3061 let e = _mm256_setr_pd(6., 8., 10., 12.);
3062 assert_eq_m256d(r, e);
3063 }
3064
83c7162d 3065 #[simd_test(enable = "avx")]
0531ce1d
XL
3066 unsafe fn test_mm256_add_ps() {
3067 let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
3068 let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
3069 let r = _mm256_add_ps(a, b);
3070 let e = _mm256_setr_ps(10., 12., 14., 16., 18., 20., 22., 24.);
3071 assert_eq_m256(r, e);
3072 }
3073
83c7162d 3074 #[simd_test(enable = "avx")]
0531ce1d
XL
3075 unsafe fn test_mm256_and_pd() {
3076 let a = _mm256_set1_pd(1.);
3077 let b = _mm256_set1_pd(0.6);
3078 let r = _mm256_and_pd(a, b);
3079 let e = _mm256_set1_pd(0.5);
3080 assert_eq_m256d(r, e);
3081 }
3082
83c7162d 3083 #[simd_test(enable = "avx")]
0531ce1d
XL
3084 unsafe fn test_mm256_and_ps() {
3085 let a = _mm256_set1_ps(1.);
3086 let b = _mm256_set1_ps(0.6);
3087 let r = _mm256_and_ps(a, b);
3088 let e = _mm256_set1_ps(0.5);
3089 assert_eq_m256(r, e);
3090 }
3091
83c7162d 3092 #[simd_test(enable = "avx")]
0531ce1d
XL
3093 unsafe fn test_mm256_or_pd() {
3094 let a = _mm256_set1_pd(1.);
3095 let b = _mm256_set1_pd(0.6);
3096 let r = _mm256_or_pd(a, b);
3097 let e = _mm256_set1_pd(1.2);
3098 assert_eq_m256d(r, e);
3099 }
3100
83c7162d 3101 #[simd_test(enable = "avx")]
0531ce1d
XL
3102 unsafe fn test_mm256_or_ps() {
3103 let a = _mm256_set1_ps(1.);
3104 let b = _mm256_set1_ps(0.6);
3105 let r = _mm256_or_ps(a, b);
3106 let e = _mm256_set1_ps(1.2);
3107 assert_eq_m256(r, e);
3108 }
3109
83c7162d 3110 #[simd_test(enable = "avx")]
0531ce1d
XL
3111 unsafe fn test_mm256_shuffle_pd() {
3112 let a = _mm256_setr_pd(1., 4., 5., 8.);
3113 let b = _mm256_setr_pd(2., 3., 6., 7.);
17df50a5 3114 let r = _mm256_shuffle_pd::<0b11_11_11_11>(a, b);
0531ce1d
XL
3115 let e = _mm256_setr_pd(4., 3., 8., 7.);
3116 assert_eq_m256d(r, e);
3117 }
3118
83c7162d 3119 #[simd_test(enable = "avx")]
0531ce1d
XL
3120 unsafe fn test_mm256_shuffle_ps() {
3121 let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
3122 let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
17df50a5 3123 let r = _mm256_shuffle_ps::<0b00_00_11_11>(a, b);
0531ce1d
XL
3124 let e = _mm256_setr_ps(8., 8., 2., 2., 16., 16., 10., 10.);
3125 assert_eq_m256(r, e);
3126 }
3127
83c7162d 3128 #[simd_test(enable = "avx")]
0531ce1d
XL
3129 unsafe fn test_mm256_andnot_pd() {
3130 let a = _mm256_set1_pd(0.);
3131 let b = _mm256_set1_pd(0.6);
3132 let r = _mm256_andnot_pd(a, b);
3133 assert_eq_m256d(r, b);
3134 }
3135
83c7162d 3136 #[simd_test(enable = "avx")]
0531ce1d
XL
3137 unsafe fn test_mm256_andnot_ps() {
3138 let a = _mm256_set1_ps(0.);
3139 let b = _mm256_set1_ps(0.6);
3140 let r = _mm256_andnot_ps(a, b);
3141 assert_eq_m256(r, b);
3142 }
3143
83c7162d 3144 #[simd_test(enable = "avx")]
0531ce1d
XL
3145 unsafe fn test_mm256_max_pd() {
3146 let a = _mm256_setr_pd(1., 4., 5., 8.);
3147 let b = _mm256_setr_pd(2., 3., 6., 7.);
3148 let r = _mm256_max_pd(a, b);
3149 let e = _mm256_setr_pd(2., 4., 6., 8.);
3150 assert_eq_m256d(r, e);
17df50a5
XL
3151 // > If the values being compared are both 0.0s (of either sign), the
3152 // > value in the second operand (source operand) is returned.
3153 let w = _mm256_max_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(-0.0));
3154 let x = _mm256_max_pd(_mm256_set1_pd(-0.0), _mm256_set1_pd(0.0));
3155 let wu: [u64; 4] = transmute(w);
3156 let xu: [u64; 4] = transmute(x);
3157 assert_eq!(wu, [0x8000_0000_0000_0000u64; 4]);
3158 assert_eq!(xu, [0u64; 4]);
3159 // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
3160 // > second operand (source operand), either a NaN or a valid
3161 // > floating-point value, is written to the result.
3162 let y = _mm256_max_pd(_mm256_set1_pd(f64::NAN), _mm256_set1_pd(0.0));
3163 let z = _mm256_max_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(f64::NAN));
3164 let yf: [f64; 4] = transmute(y);
3165 let zf: [f64; 4] = transmute(z);
3166 assert_eq!(yf, [0.0; 4]);
3167 assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
0531ce1d
XL
3168 }
3169
83c7162d 3170 #[simd_test(enable = "avx")]
0531ce1d
XL
3171 unsafe fn test_mm256_max_ps() {
3172 let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
3173 let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
3174 let r = _mm256_max_ps(a, b);
3175 let e = _mm256_setr_ps(2., 4., 6., 8., 10., 12., 14., 16.);
3176 assert_eq_m256(r, e);
17df50a5
XL
3177 // > If the values being compared are both 0.0s (of either sign), the
3178 // > value in the second operand (source operand) is returned.
3179 let w = _mm256_max_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(-0.0));
3180 let x = _mm256_max_ps(_mm256_set1_ps(-0.0), _mm256_set1_ps(0.0));
3181 let wu: [u32; 8] = transmute(w);
3182 let xu: [u32; 8] = transmute(x);
3183 assert_eq!(wu, [0x8000_0000u32; 8]);
3184 assert_eq!(xu, [0u32; 8]);
3185 // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
3186 // > second operand (source operand), either a NaN or a valid
3187 // > floating-point value, is written to the result.
3188 let y = _mm256_max_ps(_mm256_set1_ps(f32::NAN), _mm256_set1_ps(0.0));
3189 let z = _mm256_max_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(f32::NAN));
3190 let yf: [f32; 8] = transmute(y);
3191 let zf: [f32; 8] = transmute(z);
3192 assert_eq!(yf, [0.0; 8]);
3193 assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
0531ce1d
XL
3194 }
3195
83c7162d 3196 #[simd_test(enable = "avx")]
0531ce1d
XL
3197 unsafe fn test_mm256_min_pd() {
3198 let a = _mm256_setr_pd(1., 4., 5., 8.);
3199 let b = _mm256_setr_pd(2., 3., 6., 7.);
3200 let r = _mm256_min_pd(a, b);
3201 let e = _mm256_setr_pd(1., 3., 5., 7.);
3202 assert_eq_m256d(r, e);
17df50a5
XL
3203 // > If the values being compared are both 0.0s (of either sign), the
3204 // > value in the second operand (source operand) is returned.
3205 let w = _mm256_min_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(-0.0));
3206 let x = _mm256_min_pd(_mm256_set1_pd(-0.0), _mm256_set1_pd(0.0));
3207 let wu: [u64; 4] = transmute(w);
3208 let xu: [u64; 4] = transmute(x);
3209 assert_eq!(wu, [0x8000_0000_0000_0000u64; 4]);
3210 assert_eq!(xu, [0u64; 4]);
3211 // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
3212 // > second operand (source operand), either a NaN or a valid
3213 // > floating-point value, is written to the result.
3214 let y = _mm256_min_pd(_mm256_set1_pd(f64::NAN), _mm256_set1_pd(0.0));
3215 let z = _mm256_min_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(f64::NAN));
3216 let yf: [f64; 4] = transmute(y);
3217 let zf: [f64; 4] = transmute(z);
3218 assert_eq!(yf, [0.0; 4]);
3219 assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
0531ce1d
XL
3220 }
3221
83c7162d 3222 #[simd_test(enable = "avx")]
0531ce1d
XL
3223 unsafe fn test_mm256_min_ps() {
3224 let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
3225 let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
3226 let r = _mm256_min_ps(a, b);
3227 let e = _mm256_setr_ps(1., 3., 5., 7., 9., 11., 13., 15.);
3228 assert_eq_m256(r, e);
17df50a5
XL
3229 // > If the values being compared are both 0.0s (of either sign), the
3230 // > value in the second operand (source operand) is returned.
3231 let w = _mm256_min_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(-0.0));
3232 let x = _mm256_min_ps(_mm256_set1_ps(-0.0), _mm256_set1_ps(0.0));
3233 let wu: [u32; 8] = transmute(w);
3234 let xu: [u32; 8] = transmute(x);
3235 assert_eq!(wu, [0x8000_0000u32; 8]);
3236 assert_eq!(xu, [0u32; 8]);
3237 // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
3238 // > second operand (source operand), either a NaN or a valid
3239 // > floating-point value, is written to the result.
3240 let y = _mm256_min_ps(_mm256_set1_ps(f32::NAN), _mm256_set1_ps(0.0));
3241 let z = _mm256_min_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(f32::NAN));
3242 let yf: [f32; 8] = transmute(y);
3243 let zf: [f32; 8] = transmute(z);
3244 assert_eq!(yf, [0.0; 8]);
3245 assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
0531ce1d
XL
3246 }
3247
83c7162d 3248 #[simd_test(enable = "avx")]
0531ce1d
XL
3249 unsafe fn test_mm256_mul_pd() {
3250 let a = _mm256_setr_pd(1., 2., 3., 4.);
3251 let b = _mm256_setr_pd(5., 6., 7., 8.);
3252 let r = _mm256_mul_pd(a, b);
3253 let e = _mm256_setr_pd(5., 12., 21., 32.);
3254 assert_eq_m256d(r, e);
3255 }
3256
83c7162d 3257 #[simd_test(enable = "avx")]
0531ce1d
XL
3258 unsafe fn test_mm256_mul_ps() {
3259 let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
3260 let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
3261 let r = _mm256_mul_ps(a, b);
3262 let e = _mm256_setr_ps(9., 20., 33., 48., 65., 84., 105., 128.);
3263 assert_eq_m256(r, e);
3264 }
3265
83c7162d 3266 #[simd_test(enable = "avx")]
0531ce1d
XL
3267 unsafe fn test_mm256_addsub_pd() {
3268 let a = _mm256_setr_pd(1., 2., 3., 4.);
3269 let b = _mm256_setr_pd(5., 6., 7., 8.);
3270 let r = _mm256_addsub_pd(a, b);
3271 let e = _mm256_setr_pd(-4., 8., -4., 12.);
3272 assert_eq_m256d(r, e);
3273 }
3274
83c7162d 3275 #[simd_test(enable = "avx")]
0531ce1d
XL
3276 unsafe fn test_mm256_addsub_ps() {
3277 let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3278 let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3279 let r = _mm256_addsub_ps(a, b);
3280 let e = _mm256_setr_ps(-4., 8., -4., 12., -4., 8., -4., 12.);
3281 assert_eq_m256(r, e);
3282 }
3283
83c7162d 3284 #[simd_test(enable = "avx")]
0531ce1d
XL
3285 unsafe fn test_mm256_sub_pd() {
3286 let a = _mm256_setr_pd(1., 2., 3., 4.);
3287 let b = _mm256_setr_pd(5., 6., 7., 8.);
3288 let r = _mm256_sub_pd(a, b);
3289 let e = _mm256_setr_pd(-4., -4., -4., -4.);
3290 assert_eq_m256d(r, e);
3291 }
3292
83c7162d 3293 #[simd_test(enable = "avx")]
0531ce1d
XL
3294 unsafe fn test_mm256_sub_ps() {
3295 let a = _mm256_setr_ps(1., 2., 3., 4., -1., -2., -3., -4.);
3296 let b = _mm256_setr_ps(5., 6., 7., 8., 3., 2., 1., 0.);
3297 let r = _mm256_sub_ps(a, b);
3298 let e = _mm256_setr_ps(-4., -4., -4., -4., -4., -4., -4., -4.);
3299 assert_eq_m256(r, e);
3300 }
3301
83c7162d 3302 #[simd_test(enable = "avx")]
0531ce1d
XL
3303 unsafe fn test_mm256_round_pd() {
3304 let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
17df50a5
XL
3305 let result_closest = _mm256_round_pd::<0b0000>(a);
3306 let result_down = _mm256_round_pd::<0b0001>(a);
3307 let result_up = _mm256_round_pd::<0b0010>(a);
0531ce1d
XL
3308 let expected_closest = _mm256_setr_pd(2., 2., 4., -1.);
3309 let expected_down = _mm256_setr_pd(1., 2., 3., -2.);
3310 let expected_up = _mm256_setr_pd(2., 3., 4., -1.);
3311 assert_eq_m256d(result_closest, expected_closest);
3312 assert_eq_m256d(result_down, expected_down);
3313 assert_eq_m256d(result_up, expected_up);
3314 }
3315
83c7162d 3316 #[simd_test(enable = "avx")]
0531ce1d
XL
3317 unsafe fn test_mm256_floor_pd() {
3318 let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
3319 let result_down = _mm256_floor_pd(a);
3320 let expected_down = _mm256_setr_pd(1., 2., 3., -2.);
3321 assert_eq_m256d(result_down, expected_down);
3322 }
3323
83c7162d 3324 #[simd_test(enable = "avx")]
0531ce1d
XL
3325 unsafe fn test_mm256_ceil_pd() {
3326 let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
3327 let result_up = _mm256_ceil_pd(a);
3328 let expected_up = _mm256_setr_pd(2., 3., 4., -1.);
3329 assert_eq_m256d(result_up, expected_up);
3330 }
3331
83c7162d 3332 #[simd_test(enable = "avx")]
0531ce1d
XL
3333 unsafe fn test_mm256_round_ps() {
3334 let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
17df50a5
XL
3335 let result_closest = _mm256_round_ps::<0b0000>(a);
3336 let result_down = _mm256_round_ps::<0b0001>(a);
3337 let result_up = _mm256_round_ps::<0b0010>(a);
0731742a 3338 let expected_closest = _mm256_setr_ps(2., 2., 4., -1., 2., 2., 4., -1.);
0531ce1d
XL
3339 let expected_down = _mm256_setr_ps(1., 2., 3., -2., 1., 2., 3., -2.);
3340 let expected_up = _mm256_setr_ps(2., 3., 4., -1., 2., 3., 4., -1.);
3341 assert_eq_m256(result_closest, expected_closest);
3342 assert_eq_m256(result_down, expected_down);
3343 assert_eq_m256(result_up, expected_up);
3344 }
3345
83c7162d 3346 #[simd_test(enable = "avx")]
0531ce1d
XL
3347 unsafe fn test_mm256_floor_ps() {
3348 let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
3349 let result_down = _mm256_floor_ps(a);
3350 let expected_down = _mm256_setr_ps(1., 2., 3., -2., 1., 2., 3., -2.);
3351 assert_eq_m256(result_down, expected_down);
3352 }
3353
83c7162d 3354 #[simd_test(enable = "avx")]
0531ce1d
XL
3355 unsafe fn test_mm256_ceil_ps() {
3356 let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
3357 let result_up = _mm256_ceil_ps(a);
3358 let expected_up = _mm256_setr_ps(2., 3., 4., -1., 2., 3., 4., -1.);
3359 assert_eq_m256(result_up, expected_up);
3360 }
3361
83c7162d 3362 #[simd_test(enable = "avx")]
0531ce1d
XL
3363 unsafe fn test_mm256_sqrt_pd() {
3364 let a = _mm256_setr_pd(4., 9., 16., 25.);
3365 let r = _mm256_sqrt_pd(a);
3366 let e = _mm256_setr_pd(2., 3., 4., 5.);
3367 assert_eq_m256d(r, e);
3368 }
3369
83c7162d 3370 #[simd_test(enable = "avx")]
0531ce1d
XL
3371 unsafe fn test_mm256_sqrt_ps() {
3372 let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3373 let r = _mm256_sqrt_ps(a);
3374 let e = _mm256_setr_ps(2., 3., 4., 5., 2., 3., 4., 5.);
3375 assert_eq_m256(r, e);
3376 }
3377
83c7162d 3378 #[simd_test(enable = "avx")]
0531ce1d
XL
3379 unsafe fn test_mm256_div_ps() {
3380 let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3381 let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3382 let r = _mm256_div_ps(a, b);
3383 let e = _mm256_setr_ps(1., 3., 8., 5., 0.5, 1., 0.25, 0.5);
3384 assert_eq_m256(r, e);
3385 }
3386
83c7162d 3387 #[simd_test(enable = "avx")]
0531ce1d
XL
3388 unsafe fn test_mm256_div_pd() {
3389 let a = _mm256_setr_pd(4., 9., 16., 25.);
3390 let b = _mm256_setr_pd(4., 3., 2., 5.);
3391 let r = _mm256_div_pd(a, b);
3392 let e = _mm256_setr_pd(1., 3., 8., 5.);
3393 assert_eq_m256d(r, e);
3394 }
3395
83c7162d 3396 #[simd_test(enable = "avx")]
0531ce1d
XL
3397 unsafe fn test_mm256_blend_pd() {
3398 let a = _mm256_setr_pd(4., 9., 16., 25.);
3399 let b = _mm256_setr_pd(4., 3., 2., 5.);
17df50a5 3400 let r = _mm256_blend_pd::<0x0>(a, b);
0531ce1d 3401 assert_eq_m256d(r, _mm256_setr_pd(4., 9., 16., 25.));
17df50a5 3402 let r = _mm256_blend_pd::<0x3>(a, b);
0531ce1d 3403 assert_eq_m256d(r, _mm256_setr_pd(4., 3., 16., 25.));
17df50a5 3404 let r = _mm256_blend_pd::<0xF>(a, b);
0531ce1d
XL
3405 assert_eq_m256d(r, _mm256_setr_pd(4., 3., 2., 5.));
3406 }
3407
83c7162d 3408 #[simd_test(enable = "avx")]
0531ce1d
XL
3409 unsafe fn test_mm256_blend_ps() {
3410 let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
3411 let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
17df50a5 3412 let r = _mm256_blend_ps::<0x0>(a, b);
8faf50e0 3413 assert_eq_m256(r, _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.));
17df50a5 3414 let r = _mm256_blend_ps::<0x3>(a, b);
8faf50e0 3415 assert_eq_m256(r, _mm256_setr_ps(2., 3., 5., 8., 9., 12., 13., 16.));
17df50a5 3416 let r = _mm256_blend_ps::<0xF>(a, b);
8faf50e0 3417 assert_eq_m256(r, _mm256_setr_ps(2., 3., 6., 7., 9., 12., 13., 16.));
0531ce1d
XL
3418 }
3419
83c7162d 3420 #[simd_test(enable = "avx")]
0531ce1d
XL
3421 unsafe fn test_mm256_blendv_pd() {
3422 let a = _mm256_setr_pd(4., 9., 16., 25.);
3423 let b = _mm256_setr_pd(4., 3., 2., 5.);
3424 let c = _mm256_setr_pd(0., 0., !0 as f64, !0 as f64);
3425 let r = _mm256_blendv_pd(a, b, c);
3426 let e = _mm256_setr_pd(4., 9., 2., 5.);
3427 assert_eq_m256d(r, e);
3428 }
3429
83c7162d 3430 #[simd_test(enable = "avx")]
0531ce1d
XL
3431 unsafe fn test_mm256_blendv_ps() {
3432 let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3433 let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
0731742a 3434 #[rustfmt::skip]
0531ce1d
XL
3435 let c = _mm256_setr_ps(
3436 0., 0., 0., 0., !0 as f32, !0 as f32, !0 as f32, !0 as f32,
3437 );
3438 let r = _mm256_blendv_ps(a, b, c);
3439 let e = _mm256_setr_ps(4., 9., 16., 25., 8., 9., 64., 50.);
3440 assert_eq_m256(r, e);
3441 }
3442
83c7162d 3443 #[simd_test(enable = "avx")]
0531ce1d
XL
3444 unsafe fn test_mm256_dp_ps() {
3445 let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3446 let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
17df50a5 3447 let r = _mm256_dp_ps::<0xFF>(a, b);
0731742a 3448 let e = _mm256_setr_ps(200., 200., 200., 200., 2387., 2387., 2387., 2387.);
0531ce1d
XL
3449 assert_eq_m256(r, e);
3450 }
3451
83c7162d 3452 #[simd_test(enable = "avx")]
0531ce1d
XL
3453 unsafe fn test_mm256_hadd_pd() {
3454 let a = _mm256_setr_pd(4., 9., 16., 25.);
3455 let b = _mm256_setr_pd(4., 3., 2., 5.);
3456 let r = _mm256_hadd_pd(a, b);
3457 let e = _mm256_setr_pd(13., 7., 41., 7.);
3458 assert_eq_m256d(r, e);
3459
3460 let a = _mm256_setr_pd(1., 2., 3., 4.);
3461 let b = _mm256_setr_pd(5., 6., 7., 8.);
3462 let r = _mm256_hadd_pd(a, b);
3463 let e = _mm256_setr_pd(3., 11., 7., 15.);
3464 assert_eq_m256d(r, e);
3465 }
3466
83c7162d 3467 #[simd_test(enable = "avx")]
0531ce1d
XL
3468 unsafe fn test_mm256_hadd_ps() {
3469 let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3470 let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3471 let r = _mm256_hadd_ps(a, b);
3472 let e = _mm256_setr_ps(13., 41., 7., 7., 13., 41., 17., 114.);
3473 assert_eq_m256(r, e);
3474
3475 let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3476 let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3477 let r = _mm256_hadd_ps(a, b);
3478 let e = _mm256_setr_ps(3., 7., 11., 15., 3., 7., 11., 15.);
3479 assert_eq_m256(r, e);
3480 }
3481
83c7162d 3482 #[simd_test(enable = "avx")]
0531ce1d
XL
3483 unsafe fn test_mm256_hsub_pd() {
3484 let a = _mm256_setr_pd(4., 9., 16., 25.);
3485 let b = _mm256_setr_pd(4., 3., 2., 5.);
3486 let r = _mm256_hsub_pd(a, b);
3487 let e = _mm256_setr_pd(-5., 1., -9., -3.);
3488 assert_eq_m256d(r, e);
3489
3490 let a = _mm256_setr_pd(1., 2., 3., 4.);
3491 let b = _mm256_setr_pd(5., 6., 7., 8.);
3492 let r = _mm256_hsub_pd(a, b);
3493 let e = _mm256_setr_pd(-1., -1., -1., -1.);
3494 assert_eq_m256d(r, e);
3495 }
3496
83c7162d 3497 #[simd_test(enable = "avx")]
0531ce1d
XL
3498 unsafe fn test_mm256_hsub_ps() {
3499 let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3500 let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3501 let r = _mm256_hsub_ps(a, b);
3502 let e = _mm256_setr_ps(-5., -9., 1., -3., -5., -9., -1., 14.);
3503 assert_eq_m256(r, e);
3504
3505 let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3506 let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3507 let r = _mm256_hsub_ps(a, b);
3508 let e = _mm256_setr_ps(-1., -1., -1., -1., -1., -1., -1., -1.);
3509 assert_eq_m256(r, e);
3510 }
3511
83c7162d 3512 #[simd_test(enable = "avx")]
0531ce1d
XL
3513 unsafe fn test_mm256_xor_pd() {
3514 let a = _mm256_setr_pd(4., 9., 16., 25.);
3515 let b = _mm256_set1_pd(0.);
3516 let r = _mm256_xor_pd(a, b);
3517 assert_eq_m256d(r, a);
3518 }
3519
83c7162d 3520 #[simd_test(enable = "avx")]
0531ce1d
XL
3521 unsafe fn test_mm256_xor_ps() {
3522 let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3523 let b = _mm256_set1_ps(0.);
3524 let r = _mm256_xor_ps(a, b);
3525 assert_eq_m256(r, a);
3526 }
3527
83c7162d 3528 #[simd_test(enable = "avx")]
0531ce1d
XL
3529 unsafe fn test_mm_cmp_pd() {
3530 let a = _mm_setr_pd(4., 9.);
3531 let b = _mm_setr_pd(4., 3.);
17df50a5 3532 let r = _mm_cmp_pd::<_CMP_GE_OS>(a, b);
0531ce1d
XL
3533 assert!(get_m128d(r, 0).is_nan());
3534 assert!(get_m128d(r, 1).is_nan());
3535 }
3536
83c7162d 3537 #[simd_test(enable = "avx")]
0531ce1d
XL
3538 unsafe fn test_mm256_cmp_pd() {
3539 let a = _mm256_setr_pd(1., 2., 3., 4.);
3540 let b = _mm256_setr_pd(5., 6., 7., 8.);
17df50a5 3541 let r = _mm256_cmp_pd::<_CMP_GE_OS>(a, b);
0531ce1d
XL
3542 let e = _mm256_set1_pd(0.);
3543 assert_eq_m256d(r, e);
3544 }
3545
83c7162d 3546 #[simd_test(enable = "avx")]
0531ce1d
XL
3547 unsafe fn test_mm_cmp_ps() {
3548 let a = _mm_setr_ps(4., 3., 2., 5.);
3549 let b = _mm_setr_ps(4., 9., 16., 25.);
17df50a5 3550 let r = _mm_cmp_ps::<_CMP_GE_OS>(a, b);
0531ce1d
XL
3551 assert!(get_m128(r, 0).is_nan());
3552 assert_eq!(get_m128(r, 1), 0.);
3553 assert_eq!(get_m128(r, 2), 0.);
3554 assert_eq!(get_m128(r, 3), 0.);
3555 }
3556
83c7162d 3557 #[simd_test(enable = "avx")]
0531ce1d
XL
3558 unsafe fn test_mm256_cmp_ps() {
3559 let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3560 let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
17df50a5 3561 let r = _mm256_cmp_ps::<_CMP_GE_OS>(a, b);
0531ce1d
XL
3562 let e = _mm256_set1_ps(0.);
3563 assert_eq_m256(r, e);
3564 }
3565
83c7162d 3566 #[simd_test(enable = "avx")]
0531ce1d
XL
3567 unsafe fn test_mm_cmp_sd() {
3568 let a = _mm_setr_pd(4., 9.);
3569 let b = _mm_setr_pd(4., 3.);
17df50a5 3570 let r = _mm_cmp_sd::<_CMP_GE_OS>(a, b);
0531ce1d
XL
3571 assert!(get_m128d(r, 0).is_nan());
3572 assert_eq!(get_m128d(r, 1), 9.);
3573 }
3574
83c7162d 3575 #[simd_test(enable = "avx")]
0531ce1d
XL
3576 unsafe fn test_mm_cmp_ss() {
3577 let a = _mm_setr_ps(4., 3., 2., 5.);
3578 let b = _mm_setr_ps(4., 9., 16., 25.);
17df50a5 3579 let r = _mm_cmp_ss::<_CMP_GE_OS>(a, b);
0531ce1d
XL
3580 assert!(get_m128(r, 0).is_nan());
3581 assert_eq!(get_m128(r, 1), 3.);
3582 assert_eq!(get_m128(r, 2), 2.);
3583 assert_eq!(get_m128(r, 3), 5.);
3584 }
3585
83c7162d 3586 #[simd_test(enable = "avx")]
0531ce1d
XL
3587 unsafe fn test_mm256_cvtepi32_pd() {
3588 let a = _mm_setr_epi32(4, 9, 16, 25);
3589 let r = _mm256_cvtepi32_pd(a);
3590 let e = _mm256_setr_pd(4., 9., 16., 25.);
3591 assert_eq_m256d(r, e);
3592 }
3593
83c7162d 3594 #[simd_test(enable = "avx")]
0531ce1d
XL
3595 unsafe fn test_mm256_cvtepi32_ps() {
3596 let a = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
3597 let r = _mm256_cvtepi32_ps(a);
3598 let e = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3599 assert_eq_m256(r, e);
3600 }
3601
83c7162d 3602 #[simd_test(enable = "avx")]
0531ce1d
XL
3603 unsafe fn test_mm256_cvtpd_ps() {
3604 let a = _mm256_setr_pd(4., 9., 16., 25.);
3605 let r = _mm256_cvtpd_ps(a);
3606 let e = _mm_setr_ps(4., 9., 16., 25.);
3607 assert_eq_m128(r, e);
3608 }
3609
83c7162d 3610 #[simd_test(enable = "avx")]
0531ce1d
XL
3611 unsafe fn test_mm256_cvtps_epi32() {
3612 let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3613 let r = _mm256_cvtps_epi32(a);
3614 let e = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
3615 assert_eq_m256i(r, e);
3616 }
3617
83c7162d 3618 #[simd_test(enable = "avx")]
0531ce1d
XL
3619 unsafe fn test_mm256_cvtps_pd() {
3620 let a = _mm_setr_ps(4., 9., 16., 25.);
3621 let r = _mm256_cvtps_pd(a);
3622 let e = _mm256_setr_pd(4., 9., 16., 25.);
3623 assert_eq_m256d(r, e);
3624 }
3625
83c7162d 3626 #[simd_test(enable = "avx")]
0531ce1d
XL
3627 unsafe fn test_mm256_cvttpd_epi32() {
3628 let a = _mm256_setr_pd(4., 9., 16., 25.);
3629 let r = _mm256_cvttpd_epi32(a);
3630 let e = _mm_setr_epi32(4, 9, 16, 25);
3631 assert_eq_m128i(r, e);
3632 }
3633
83c7162d 3634 #[simd_test(enable = "avx")]
0531ce1d
XL
3635 unsafe fn test_mm256_cvtpd_epi32() {
3636 let a = _mm256_setr_pd(4., 9., 16., 25.);
3637 let r = _mm256_cvtpd_epi32(a);
3638 let e = _mm_setr_epi32(4, 9, 16, 25);
3639 assert_eq_m128i(r, e);
3640 }
3641
83c7162d 3642 #[simd_test(enable = "avx")]
0531ce1d
XL
3643 unsafe fn test_mm256_cvttps_epi32() {
3644 let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3645 let r = _mm256_cvttps_epi32(a);
3646 let e = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
3647 assert_eq_m256i(r, e);
3648 }
3649
83c7162d 3650 #[simd_test(enable = "avx")]
0531ce1d
XL
3651 unsafe fn test_mm256_extractf128_ps() {
3652 let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
17df50a5 3653 let r = _mm256_extractf128_ps::<0>(a);
0531ce1d
XL
3654 let e = _mm_setr_ps(4., 3., 2., 5.);
3655 assert_eq_m128(r, e);
3656 }
3657
83c7162d 3658 #[simd_test(enable = "avx")]
0531ce1d
XL
3659 unsafe fn test_mm256_extractf128_pd() {
3660 let a = _mm256_setr_pd(4., 3., 2., 5.);
17df50a5 3661 let r = _mm256_extractf128_pd::<0>(a);
0531ce1d
XL
3662 let e = _mm_setr_pd(4., 3.);
3663 assert_eq_m128d(r, e);
3664 }
3665
83c7162d 3666 #[simd_test(enable = "avx")]
0531ce1d
XL
3667 unsafe fn test_mm256_extractf128_si256() {
3668 let a = _mm256_setr_epi64x(4, 3, 2, 5);
17df50a5 3669 let r = _mm256_extractf128_si256::<0>(a);
0531ce1d
XL
3670 let e = _mm_setr_epi64x(4, 3);
3671 assert_eq_m128i(r, e);
3672 }
3673
83c7162d 3674 #[simd_test(enable = "avx")]
0531ce1d
XL
3675 unsafe fn test_mm256_zeroall() {
3676 _mm256_zeroall();
3677 }
3678
83c7162d 3679 #[simd_test(enable = "avx")]
0531ce1d
XL
3680 unsafe fn test_mm256_zeroupper() {
3681 _mm256_zeroupper();
3682 }
3683
83c7162d 3684 #[simd_test(enable = "avx")]
0531ce1d
XL
3685 unsafe fn test_mm256_permutevar_ps() {
3686 let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3687 let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
3688 let r = _mm256_permutevar_ps(a, b);
3689 let e = _mm256_setr_ps(3., 2., 5., 4., 9., 64., 50., 8.);
3690 assert_eq_m256(r, e);
3691 }
3692
83c7162d 3693 #[simd_test(enable = "avx")]
0531ce1d
XL
3694 unsafe fn test_mm_permutevar_ps() {
3695 let a = _mm_setr_ps(4., 3., 2., 5.);
3696 let b = _mm_setr_epi32(1, 2, 3, 4);
3697 let r = _mm_permutevar_ps(a, b);
3698 let e = _mm_setr_ps(3., 2., 5., 4.);
3699 assert_eq_m128(r, e);
3700 }
3701
83c7162d 3702 #[simd_test(enable = "avx")]
0531ce1d
XL
3703 unsafe fn test_mm256_permute_ps() {
3704 let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
17df50a5 3705 let r = _mm256_permute_ps::<0x1b>(a);
0531ce1d
XL
3706 let e = _mm256_setr_ps(5., 2., 3., 4., 50., 64., 9., 8.);
3707 assert_eq_m256(r, e);
3708 }
3709
83c7162d 3710 #[simd_test(enable = "avx")]
0531ce1d
XL
3711 unsafe fn test_mm_permute_ps() {
3712 let a = _mm_setr_ps(4., 3., 2., 5.);
17df50a5 3713 let r = _mm_permute_ps::<0x1b>(a);
0531ce1d
XL
3714 let e = _mm_setr_ps(5., 2., 3., 4.);
3715 assert_eq_m128(r, e);
3716 }
3717
83c7162d 3718 #[simd_test(enable = "avx")]
0531ce1d
XL
3719 unsafe fn test_mm256_permutevar_pd() {
3720 let a = _mm256_setr_pd(4., 3., 2., 5.);
3721 let b = _mm256_setr_epi64x(1, 2, 3, 4);
3722 let r = _mm256_permutevar_pd(a, b);
3723 let e = _mm256_setr_pd(4., 3., 5., 2.);
3724 assert_eq_m256d(r, e);
3725 }
3726
83c7162d 3727 #[simd_test(enable = "avx")]
0531ce1d
XL
3728 unsafe fn test_mm_permutevar_pd() {
3729 let a = _mm_setr_pd(4., 3.);
3730 let b = _mm_setr_epi64x(3, 0);
3731 let r = _mm_permutevar_pd(a, b);
3732 let e = _mm_setr_pd(3., 4.);
3733 assert_eq_m128d(r, e);
3734 }
3735
83c7162d 3736 #[simd_test(enable = "avx")]
0531ce1d
XL
3737 unsafe fn test_mm256_permute_pd() {
3738 let a = _mm256_setr_pd(4., 3., 2., 5.);
17df50a5 3739 let r = _mm256_permute_pd::<5>(a);
0531ce1d
XL
3740 let e = _mm256_setr_pd(3., 4., 5., 2.);
3741 assert_eq_m256d(r, e);
3742 }
3743
83c7162d 3744 #[simd_test(enable = "avx")]
0531ce1d
XL
3745 unsafe fn test_mm_permute_pd() {
3746 let a = _mm_setr_pd(4., 3.);
17df50a5 3747 let r = _mm_permute_pd::<1>(a);
0531ce1d
XL
3748 let e = _mm_setr_pd(3., 4.);
3749 assert_eq_m128d(r, e);
3750 }
3751
83c7162d 3752 #[simd_test(enable = "avx")]
0531ce1d
XL
3753 unsafe fn test_mm256_permute2f128_ps() {
3754 let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3755 let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
17df50a5 3756 let r = _mm256_permute2f128_ps::<0x13>(a, b);
0531ce1d
XL
3757 let e = _mm256_setr_ps(5., 6., 7., 8., 1., 2., 3., 4.);
3758 assert_eq_m256(r, e);
3759 }
3760
83c7162d 3761 #[simd_test(enable = "avx")]
0531ce1d
XL
3762 unsafe fn test_mm256_permute2f128_pd() {
3763 let a = _mm256_setr_pd(1., 2., 3., 4.);
3764 let b = _mm256_setr_pd(5., 6., 7., 8.);
17df50a5 3765 let r = _mm256_permute2f128_pd::<0x31>(a, b);
0531ce1d
XL
3766 let e = _mm256_setr_pd(3., 4., 7., 8.);
3767 assert_eq_m256d(r, e);
3768 }
3769
83c7162d 3770 #[simd_test(enable = "avx")]
0531ce1d
XL
3771 unsafe fn test_mm256_permute2f128_si256() {
3772 let a = _mm256_setr_epi32(1, 2, 3, 4, 1, 2, 3, 4);
3773 let b = _mm256_setr_epi32(5, 6, 7, 8, 5, 6, 7, 8);
17df50a5 3774 let r = _mm256_permute2f128_si256::<0x20>(a, b);
0531ce1d
XL
3775 let e = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
3776 assert_eq_m256i(r, e);
3777 }
3778
83c7162d 3779 #[simd_test(enable = "avx")]
0531ce1d
XL
3780 unsafe fn test_mm256_broadcast_ss() {
3781 let r = _mm256_broadcast_ss(&3.);
3782 let e = _mm256_set1_ps(3.);
3783 assert_eq_m256(r, e);
3784 }
3785
83c7162d 3786 #[simd_test(enable = "avx")]
0531ce1d
XL
3787 unsafe fn test_mm_broadcast_ss() {
3788 let r = _mm_broadcast_ss(&3.);
3789 let e = _mm_set1_ps(3.);
3790 assert_eq_m128(r, e);
3791 }
3792
83c7162d 3793 #[simd_test(enable = "avx")]
0531ce1d
XL
3794 unsafe fn test_mm256_broadcast_sd() {
3795 let r = _mm256_broadcast_sd(&3.);
3796 let e = _mm256_set1_pd(3.);
3797 assert_eq_m256d(r, e);
3798 }
3799
83c7162d 3800 #[simd_test(enable = "avx")]
0531ce1d
XL
3801 unsafe fn test_mm256_broadcast_ps() {
3802 let a = _mm_setr_ps(4., 3., 2., 5.);
3803 let r = _mm256_broadcast_ps(&a);
3804 let e = _mm256_setr_ps(4., 3., 2., 5., 4., 3., 2., 5.);
3805 assert_eq_m256(r, e);
3806 }
3807
83c7162d 3808 #[simd_test(enable = "avx")]
0531ce1d
XL
3809 unsafe fn test_mm256_broadcast_pd() {
3810 let a = _mm_setr_pd(4., 3.);
3811 let r = _mm256_broadcast_pd(&a);
3812 let e = _mm256_setr_pd(4., 3., 4., 3.);
3813 assert_eq_m256d(r, e);
3814 }
3815
83c7162d 3816 #[simd_test(enable = "avx")]
0531ce1d
XL
3817 unsafe fn test_mm256_insertf128_ps() {
3818 let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3819 let b = _mm_setr_ps(4., 9., 16., 25.);
17df50a5 3820 let r = _mm256_insertf128_ps::<0>(a, b);
0531ce1d
XL
3821 let e = _mm256_setr_ps(4., 9., 16., 25., 8., 9., 64., 50.);
3822 assert_eq_m256(r, e);
3823 }
3824
83c7162d 3825 #[simd_test(enable = "avx")]
0531ce1d
XL
3826 unsafe fn test_mm256_insertf128_pd() {
3827 let a = _mm256_setr_pd(1., 2., 3., 4.);
3828 let b = _mm_setr_pd(5., 6.);
17df50a5 3829 let r = _mm256_insertf128_pd::<0>(a, b);
0531ce1d
XL
3830 let e = _mm256_setr_pd(5., 6., 3., 4.);
3831 assert_eq_m256d(r, e);
3832 }
3833
83c7162d 3834 #[simd_test(enable = "avx")]
0531ce1d
XL
3835 unsafe fn test_mm256_insertf128_si256() {
3836 let a = _mm256_setr_epi64x(1, 2, 3, 4);
3837 let b = _mm_setr_epi64x(5, 6);
17df50a5 3838 let r = _mm256_insertf128_si256::<0>(a, b);
0531ce1d
XL
3839 let e = _mm256_setr_epi64x(5, 6, 3, 4);
3840 assert_eq_m256i(r, e);
3841 }
3842
83c7162d 3843 #[simd_test(enable = "avx")]
0531ce1d 3844 unsafe fn test_mm256_insert_epi8() {
0731742a 3845 #[rustfmt::skip]
0531ce1d
XL
3846 let a = _mm256_setr_epi8(
3847 1, 2, 3, 4, 5, 6, 7, 8,
3848 9, 10, 11, 12, 13, 14, 15, 16,
3849 17, 18, 19, 20, 21, 22, 23, 24,
3850 25, 26, 27, 28, 29, 30, 31, 32,
3851 );
17df50a5 3852 let r = _mm256_insert_epi8::<31>(a, 0);
0731742a 3853 #[rustfmt::skip]
0531ce1d
XL
3854 let e = _mm256_setr_epi8(
3855 1, 2, 3, 4, 5, 6, 7, 8,
3856 9, 10, 11, 12, 13, 14, 15, 16,
3857 17, 18, 19, 20, 21, 22, 23, 24,
3858 25, 26, 27, 28, 29, 30, 31, 0,
3859 );
3860 assert_eq_m256i(r, e);
3861 }
3862
83c7162d 3863 #[simd_test(enable = "avx")]
0531ce1d 3864 unsafe fn test_mm256_insert_epi16() {
0731742a 3865 #[rustfmt::skip]
0531ce1d
XL
3866 let a = _mm256_setr_epi16(
3867 0, 1, 2, 3, 4, 5, 6, 7,
3868 8, 9, 10, 11, 12, 13, 14, 15,
3869 );
17df50a5 3870 let r = _mm256_insert_epi16::<15>(a, 0);
0731742a 3871 #[rustfmt::skip]
0531ce1d
XL
3872 let e = _mm256_setr_epi16(
3873 0, 1, 2, 3, 4, 5, 6, 7,
3874 8, 9, 10, 11, 12, 13, 14, 0,
3875 );
3876 assert_eq_m256i(r, e);
3877 }
3878
83c7162d 3879 #[simd_test(enable = "avx")]
0531ce1d
XL
3880 unsafe fn test_mm256_insert_epi32() {
3881 let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
17df50a5 3882 let r = _mm256_insert_epi32::<7>(a, 0);
0531ce1d
XL
3883 let e = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
3884 assert_eq_m256i(r, e);
3885 }
3886
83c7162d 3887 #[simd_test(enable = "avx")]
0531ce1d
XL
3888 unsafe fn test_mm256_load_pd() {
3889 let a = _mm256_setr_pd(1., 2., 3., 4.);
3890 let p = &a as *const _ as *const f64;
3891 let r = _mm256_load_pd(p);
3892 let e = _mm256_setr_pd(1., 2., 3., 4.);
3893 assert_eq_m256d(r, e);
3894 }
3895
83c7162d 3896 #[simd_test(enable = "avx")]
0531ce1d
XL
3897 unsafe fn test_mm256_store_pd() {
3898 let a = _mm256_setr_pd(1., 2., 3., 4.);
3899 let mut r = _mm256_undefined_pd();
3900 _mm256_store_pd(&mut r as *mut _ as *mut f64, a);
3901 assert_eq_m256d(r, a);
3902 }
3903
83c7162d 3904 #[simd_test(enable = "avx")]
0531ce1d
XL
3905 unsafe fn test_mm256_load_ps() {
3906 let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3907 let p = &a as *const _ as *const f32;
3908 let r = _mm256_load_ps(p);
3909 let e = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3910 assert_eq_m256(r, e);
3911 }
3912
83c7162d 3913 #[simd_test(enable = "avx")]
0531ce1d
XL
3914 unsafe fn test_mm256_store_ps() {
3915 let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3916 let mut r = _mm256_undefined_ps();
3917 _mm256_store_ps(&mut r as *mut _ as *mut f32, a);
3918 assert_eq_m256(r, a);
3919 }
3920
83c7162d 3921 #[simd_test(enable = "avx")]
0531ce1d
XL
3922 unsafe fn test_mm256_loadu_pd() {
3923 let a = &[1.0f64, 2., 3., 4.];
3924 let p = a.as_ptr();
3925 let r = _mm256_loadu_pd(black_box(p));
3926 let e = _mm256_setr_pd(1., 2., 3., 4.);
3927 assert_eq_m256d(r, e);
3928 }
3929
83c7162d 3930 #[simd_test(enable = "avx")]
0531ce1d
XL
3931 unsafe fn test_mm256_storeu_pd() {
3932 let a = _mm256_set1_pd(9.);
3933 let mut r = _mm256_undefined_pd();
3934 _mm256_storeu_pd(&mut r as *mut _ as *mut f64, a);
3935 assert_eq_m256d(r, a);
3936 }
3937
83c7162d 3938 #[simd_test(enable = "avx")]
0531ce1d
XL
3939 unsafe fn test_mm256_loadu_ps() {
3940 let a = &[4., 3., 2., 5., 8., 9., 64., 50.];
3941 let p = a.as_ptr();
3942 let r = _mm256_loadu_ps(black_box(p));
3943 let e = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3944 assert_eq_m256(r, e);
3945 }
3946
83c7162d 3947 #[simd_test(enable = "avx")]
0531ce1d
XL
3948 unsafe fn test_mm256_storeu_ps() {
3949 let a = _mm256_set1_ps(9.);
3950 let mut r = _mm256_undefined_ps();
3951 _mm256_storeu_ps(&mut r as *mut _ as *mut f32, a);
3952 assert_eq_m256(r, a);
3953 }
3954
83c7162d 3955 #[simd_test(enable = "avx")]
0531ce1d
XL
3956 unsafe fn test_mm256_load_si256() {
3957 let a = _mm256_setr_epi64x(1, 2, 3, 4);
3958 let p = &a as *const _;
3959 let r = _mm256_load_si256(p);
3960 let e = _mm256_setr_epi64x(1, 2, 3, 4);
3961 assert_eq_m256i(r, e);
3962 }
3963
83c7162d 3964 #[simd_test(enable = "avx")]
0531ce1d
XL
3965 unsafe fn test_mm256_store_si256() {
3966 let a = _mm256_setr_epi64x(1, 2, 3, 4);
3967 let mut r = _mm256_undefined_si256();
3968 _mm256_store_si256(&mut r as *mut _, a);
3969 assert_eq_m256i(r, a);
3970 }
3971
83c7162d 3972 #[simd_test(enable = "avx")]
0531ce1d
XL
3973 unsafe fn test_mm256_loadu_si256() {
3974 let a = _mm256_setr_epi64x(1, 2, 3, 4);
3975 let p = &a as *const _;
3976 let r = _mm256_loadu_si256(black_box(p));
3977 let e = _mm256_setr_epi64x(1, 2, 3, 4);
3978 assert_eq_m256i(r, e);
3979 }
3980
83c7162d 3981 #[simd_test(enable = "avx")]
0531ce1d
XL
3982 unsafe fn test_mm256_storeu_si256() {
3983 let a = _mm256_set1_epi8(9);
3984 let mut r = _mm256_undefined_si256();
3985 _mm256_storeu_si256(&mut r as *mut _, a);
3986 assert_eq_m256i(r, a);
3987 }
3988
83c7162d 3989 #[simd_test(enable = "avx")]
0531ce1d
XL
3990 unsafe fn test_mm256_maskload_pd() {
3991 let a = &[1.0f64, 2., 3., 4.];
3992 let p = a.as_ptr();
3993 let mask = _mm256_setr_epi64x(0, !0, 0, !0);
3994 let r = _mm256_maskload_pd(black_box(p), mask);
3995 let e = _mm256_setr_pd(0., 2., 0., 4.);
3996 assert_eq_m256d(r, e);
3997 }
3998
83c7162d 3999 #[simd_test(enable = "avx")]
0531ce1d
XL
4000 unsafe fn test_mm256_maskstore_pd() {
4001 let mut r = _mm256_set1_pd(0.);
4002 let mask = _mm256_setr_epi64x(0, !0, 0, !0);
4003 let a = _mm256_setr_pd(1., 2., 3., 4.);
4004 _mm256_maskstore_pd(&mut r as *mut _ as *mut f64, mask, a);
4005 let e = _mm256_setr_pd(0., 2., 0., 4.);
4006 assert_eq_m256d(r, e);
4007 }
4008
83c7162d 4009 #[simd_test(enable = "avx")]
0531ce1d
XL
4010 unsafe fn test_mm_maskload_pd() {
4011 let a = &[1.0f64, 2.];
4012 let p = a.as_ptr();
4013 let mask = _mm_setr_epi64x(0, !0);
4014 let r = _mm_maskload_pd(black_box(p), mask);
4015 let e = _mm_setr_pd(0., 2.);
4016 assert_eq_m128d(r, e);
4017 }
4018
83c7162d 4019 #[simd_test(enable = "avx")]
0531ce1d
XL
4020 unsafe fn test_mm_maskstore_pd() {
4021 let mut r = _mm_set1_pd(0.);
4022 let mask = _mm_setr_epi64x(0, !0);
4023 let a = _mm_setr_pd(1., 2.);
4024 _mm_maskstore_pd(&mut r as *mut _ as *mut f64, mask, a);
4025 let e = _mm_setr_pd(0., 2.);
4026 assert_eq_m128d(r, e);
4027 }
4028
83c7162d 4029 #[simd_test(enable = "avx")]
0531ce1d
XL
4030 unsafe fn test_mm256_maskload_ps() {
4031 let a = &[1.0f32, 2., 3., 4., 5., 6., 7., 8.];
4032 let p = a.as_ptr();
4033 let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0);
4034 let r = _mm256_maskload_ps(black_box(p), mask);
4035 let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.);
4036 assert_eq_m256(r, e);
4037 }
4038
83c7162d 4039 #[simd_test(enable = "avx")]
0531ce1d
XL
4040 unsafe fn test_mm256_maskstore_ps() {
4041 let mut r = _mm256_set1_ps(0.);
4042 let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0);
4043 let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4044 _mm256_maskstore_ps(&mut r as *mut _ as *mut f32, mask, a);
4045 let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.);
4046 assert_eq_m256(r, e);
4047 }
4048
83c7162d 4049 #[simd_test(enable = "avx")]
0531ce1d
XL
4050 unsafe fn test_mm_maskload_ps() {
4051 let a = &[1.0f32, 2., 3., 4.];
4052 let p = a.as_ptr();
4053 let mask = _mm_setr_epi32(0, !0, 0, !0);
4054 let r = _mm_maskload_ps(black_box(p), mask);
4055 let e = _mm_setr_ps(0., 2., 0., 4.);
4056 assert_eq_m128(r, e);
4057 }
4058
83c7162d 4059 #[simd_test(enable = "avx")]
0531ce1d
XL
4060 unsafe fn test_mm_maskstore_ps() {
4061 let mut r = _mm_set1_ps(0.);
4062 let mask = _mm_setr_epi32(0, !0, 0, !0);
4063 let a = _mm_setr_ps(1., 2., 3., 4.);
4064 _mm_maskstore_ps(&mut r as *mut _ as *mut f32, mask, a);
4065 let e = _mm_setr_ps(0., 2., 0., 4.);
4066 assert_eq_m128(r, e);
4067 }
4068
83c7162d 4069 #[simd_test(enable = "avx")]
0531ce1d
XL
4070 unsafe fn test_mm256_movehdup_ps() {
4071 let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4072 let r = _mm256_movehdup_ps(a);
4073 let e = _mm256_setr_ps(2., 2., 4., 4., 6., 6., 8., 8.);
4074 assert_eq_m256(r, e);
4075 }
4076
83c7162d 4077 #[simd_test(enable = "avx")]
0531ce1d
XL
4078 unsafe fn test_mm256_moveldup_ps() {
4079 let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4080 let r = _mm256_moveldup_ps(a);
4081 let e = _mm256_setr_ps(1., 1., 3., 3., 5., 5., 7., 7.);
4082 assert_eq_m256(r, e);
4083 }
4084
83c7162d 4085 #[simd_test(enable = "avx")]
0531ce1d
XL
4086 unsafe fn test_mm256_movedup_pd() {
4087 let a = _mm256_setr_pd(1., 2., 3., 4.);
4088 let r = _mm256_movedup_pd(a);
4089 let e = _mm256_setr_pd(1., 1., 3., 3.);
4090 assert_eq_m256d(r, e);
4091 }
4092
83c7162d 4093 #[simd_test(enable = "avx")]
0531ce1d 4094 unsafe fn test_mm256_lddqu_si256() {
0731742a 4095 #[rustfmt::skip]
0531ce1d
XL
4096 let a = _mm256_setr_epi8(
4097 1, 2, 3, 4, 5, 6, 7, 8,
4098 9, 10, 11, 12, 13, 14, 15, 16,
4099 17, 18, 19, 20, 21, 22, 23, 24,
4100 25, 26, 27, 28, 29, 30, 31, 32,
4101 );
4102 let p = &a as *const _;
4103 let r = _mm256_lddqu_si256(black_box(p));
0731742a 4104 #[rustfmt::skip]
0531ce1d
XL
4105 let e = _mm256_setr_epi8(
4106 1, 2, 3, 4, 5, 6, 7, 8,
4107 9, 10, 11, 12, 13, 14, 15, 16,
4108 17, 18, 19, 20, 21, 22, 23, 24,
4109 25, 26, 27, 28, 29, 30, 31, 32,
4110 );
4111 assert_eq_m256i(r, e);
4112 }
4113
83c7162d 4114 #[simd_test(enable = "avx")]
0531ce1d
XL
4115 unsafe fn test_mm256_stream_si256() {
4116 let a = _mm256_setr_epi64x(1, 2, 3, 4);
4117 let mut r = _mm256_undefined_si256();
4118 _mm256_stream_si256(&mut r as *mut _, a);
4119 assert_eq_m256i(r, a);
4120 }
4121
83c7162d 4122 #[simd_test(enable = "avx")]
0531ce1d
XL
4123 unsafe fn test_mm256_stream_pd() {
4124 #[repr(align(32))]
4125 struct Memory {
4126 pub data: [f64; 4],
4127 }
4128 let a = _mm256_set1_pd(7.0);
8faf50e0 4129 let mut mem = Memory { data: [-1.0; 4] };
0531ce1d
XL
4130
4131 _mm256_stream_pd(&mut mem.data[0] as *mut f64, a);
4132 for i in 0..4 {
4133 assert_eq!(mem.data[i], get_m256d(a, i));
4134 }
4135 }
4136
83c7162d 4137 #[simd_test(enable = "avx")]
0531ce1d
XL
4138 unsafe fn test_mm256_stream_ps() {
4139 #[repr(align(32))]
4140 struct Memory {
4141 pub data: [f32; 8],
4142 }
4143 let a = _mm256_set1_ps(7.0);
8faf50e0 4144 let mut mem = Memory { data: [-1.0; 8] };
0531ce1d
XL
4145
4146 _mm256_stream_ps(&mut mem.data[0] as *mut f32, a);
4147 for i in 0..8 {
4148 assert_eq!(mem.data[i], get_m256(a, i));
4149 }
4150 }
4151
83c7162d 4152 #[simd_test(enable = "avx")]
0531ce1d
XL
4153 unsafe fn test_mm256_rcp_ps() {
4154 let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4155 let r = _mm256_rcp_ps(a);
0731742a 4156 #[rustfmt::skip]
0531ce1d
XL
4157 let e = _mm256_setr_ps(
4158 0.99975586, 0.49987793, 0.33325195, 0.24993896,
4159 0.19995117, 0.16662598, 0.14282227, 0.12496948,
4160 );
4161 let rel_err = 0.00048828125;
4162 for i in 0..8 {
4163 assert_approx_eq!(get_m256(r, i), get_m256(e, i), 2. * rel_err);
4164 }
4165 }
4166
83c7162d 4167 #[simd_test(enable = "avx")]
0531ce1d
XL
4168 unsafe fn test_mm256_rsqrt_ps() {
4169 let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4170 let r = _mm256_rsqrt_ps(a);
0731742a 4171 #[rustfmt::skip]
0531ce1d
XL
4172 let e = _mm256_setr_ps(
4173 0.99975586, 0.7069092, 0.5772705, 0.49987793,
4174 0.44714355, 0.40820313, 0.3779297, 0.3534546,
4175 );
4176 let rel_err = 0.00048828125;
4177 for i in 0..8 {
4178 assert_approx_eq!(get_m256(r, i), get_m256(e, i), 2. * rel_err);
4179 }
4180 }
4181
83c7162d 4182 #[simd_test(enable = "avx")]
0531ce1d
XL
4183 unsafe fn test_mm256_unpackhi_pd() {
4184 let a = _mm256_setr_pd(1., 2., 3., 4.);
4185 let b = _mm256_setr_pd(5., 6., 7., 8.);
4186 let r = _mm256_unpackhi_pd(a, b);
4187 let e = _mm256_setr_pd(2., 6., 4., 8.);
4188 assert_eq_m256d(r, e);
4189 }
4190
83c7162d 4191 #[simd_test(enable = "avx")]
0531ce1d
XL
4192 unsafe fn test_mm256_unpackhi_ps() {
4193 let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4194 let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
4195 let r = _mm256_unpackhi_ps(a, b);
4196 let e = _mm256_setr_ps(3., 11., 4., 12., 7., 15., 8., 16.);
4197 assert_eq_m256(r, e);
4198 }
4199
83c7162d 4200 #[simd_test(enable = "avx")]
0531ce1d
XL
4201 unsafe fn test_mm256_unpacklo_pd() {
4202 let a = _mm256_setr_pd(1., 2., 3., 4.);
4203 let b = _mm256_setr_pd(5., 6., 7., 8.);
4204 let r = _mm256_unpacklo_pd(a, b);
4205 let e = _mm256_setr_pd(1., 5., 3., 7.);
4206 assert_eq_m256d(r, e);
4207 }
4208
83c7162d 4209 #[simd_test(enable = "avx")]
0531ce1d
XL
4210 unsafe fn test_mm256_unpacklo_ps() {
4211 let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4212 let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
4213 let r = _mm256_unpacklo_ps(a, b);
4214 let e = _mm256_setr_ps(1., 9., 2., 10., 5., 13., 6., 14.);
4215 assert_eq_m256(r, e);
4216 }
4217
83c7162d 4218 #[simd_test(enable = "avx")]
0531ce1d
XL
4219 unsafe fn test_mm256_testz_si256() {
4220 let a = _mm256_setr_epi64x(1, 2, 3, 4);
4221 let b = _mm256_setr_epi64x(5, 6, 7, 8);
4222 let r = _mm256_testz_si256(a, b);
4223 assert_eq!(r, 0);
4224 let b = _mm256_set1_epi64x(0);
4225 let r = _mm256_testz_si256(a, b);
4226 assert_eq!(r, 1);
4227 }
4228
83c7162d 4229 #[simd_test(enable = "avx")]
0531ce1d
XL
4230 unsafe fn test_mm256_testc_si256() {
4231 let a = _mm256_setr_epi64x(1, 2, 3, 4);
4232 let b = _mm256_setr_epi64x(5, 6, 7, 8);
4233 let r = _mm256_testc_si256(a, b);
4234 assert_eq!(r, 0);
4235 let b = _mm256_set1_epi64x(0);
4236 let r = _mm256_testc_si256(a, b);
4237 assert_eq!(r, 1);
4238 }
4239
83c7162d 4240 #[simd_test(enable = "avx")]
0531ce1d
XL
4241 unsafe fn test_mm256_testnzc_si256() {
4242 let a = _mm256_setr_epi64x(1, 2, 3, 4);
4243 let b = _mm256_setr_epi64x(5, 6, 7, 8);
4244 let r = _mm256_testnzc_si256(a, b);
4245 assert_eq!(r, 1);
4246 let a = _mm256_setr_epi64x(0, 0, 0, 0);
4247 let b = _mm256_setr_epi64x(0, 0, 0, 0);
4248 let r = _mm256_testnzc_si256(a, b);
4249 assert_eq!(r, 0);
4250 }
4251
83c7162d 4252 #[simd_test(enable = "avx")]
0531ce1d
XL
4253 unsafe fn test_mm256_testz_pd() {
4254 let a = _mm256_setr_pd(1., 2., 3., 4.);
4255 let b = _mm256_setr_pd(5., 6., 7., 8.);
4256 let r = _mm256_testz_pd(a, b);
4257 assert_eq!(r, 1);
4258 let a = _mm256_set1_pd(-1.);
4259 let r = _mm256_testz_pd(a, a);
4260 assert_eq!(r, 0);
4261 }
4262
83c7162d 4263 #[simd_test(enable = "avx")]
0531ce1d
XL
4264 unsafe fn test_mm256_testc_pd() {
4265 let a = _mm256_setr_pd(1., 2., 3., 4.);
4266 let b = _mm256_setr_pd(5., 6., 7., 8.);
4267 let r = _mm256_testc_pd(a, b);
4268 assert_eq!(r, 1);
4269 let a = _mm256_set1_pd(1.);
4270 let b = _mm256_set1_pd(-1.);
4271 let r = _mm256_testc_pd(a, b);
4272 assert_eq!(r, 0);
4273 }
4274
83c7162d 4275 #[simd_test(enable = "avx")]
0531ce1d
XL
4276 unsafe fn test_mm256_testnzc_pd() {
4277 let a = _mm256_setr_pd(1., 2., 3., 4.);
4278 let b = _mm256_setr_pd(5., 6., 7., 8.);
4279 let r = _mm256_testnzc_pd(a, b);
4280 assert_eq!(r, 0);
4281 let a = _mm256_setr_pd(1., -1., -1., -1.);
4282 let b = _mm256_setr_pd(-1., -1., 1., 1.);
4283 let r = _mm256_testnzc_pd(a, b);
4284 assert_eq!(r, 1);
4285 }
4286
83c7162d 4287 #[simd_test(enable = "avx")]
0531ce1d
XL
4288 unsafe fn test_mm_testz_pd() {
4289 let a = _mm_setr_pd(1., 2.);
4290 let b = _mm_setr_pd(5., 6.);
4291 let r = _mm_testz_pd(a, b);
4292 assert_eq!(r, 1);
4293 let a = _mm_set1_pd(-1.);
4294 let r = _mm_testz_pd(a, a);
4295 assert_eq!(r, 0);
4296 }
4297
83c7162d 4298 #[simd_test(enable = "avx")]
0531ce1d
XL
4299 unsafe fn test_mm_testc_pd() {
4300 let a = _mm_setr_pd(1., 2.);
4301 let b = _mm_setr_pd(5., 6.);
4302 let r = _mm_testc_pd(a, b);
4303 assert_eq!(r, 1);
4304 let a = _mm_set1_pd(1.);
4305 let b = _mm_set1_pd(-1.);
4306 let r = _mm_testc_pd(a, b);
4307 assert_eq!(r, 0);
4308 }
4309
83c7162d 4310 #[simd_test(enable = "avx")]
0531ce1d
XL
4311 unsafe fn test_mm_testnzc_pd() {
4312 let a = _mm_setr_pd(1., 2.);
4313 let b = _mm_setr_pd(5., 6.);
4314 let r = _mm_testnzc_pd(a, b);
4315 assert_eq!(r, 0);
4316 let a = _mm_setr_pd(1., -1.);
4317 let b = _mm_setr_pd(-1., -1.);
4318 let r = _mm_testnzc_pd(a, b);
4319 assert_eq!(r, 1);
4320 }
4321
83c7162d 4322 #[simd_test(enable = "avx")]
0531ce1d
XL
4323 unsafe fn test_mm256_testz_ps() {
4324 let a = _mm256_set1_ps(1.);
4325 let r = _mm256_testz_ps(a, a);
4326 assert_eq!(r, 1);
4327 let a = _mm256_set1_ps(-1.);
4328 let r = _mm256_testz_ps(a, a);
4329 assert_eq!(r, 0);
4330 }
4331
83c7162d 4332 #[simd_test(enable = "avx")]
0531ce1d
XL
4333 unsafe fn test_mm256_testc_ps() {
4334 let a = _mm256_set1_ps(1.);
4335 let r = _mm256_testc_ps(a, a);
4336 assert_eq!(r, 1);
4337 let b = _mm256_set1_ps(-1.);
4338 let r = _mm256_testc_ps(a, b);
4339 assert_eq!(r, 0);
4340 }
4341
83c7162d 4342 #[simd_test(enable = "avx")]
0531ce1d
XL
4343 unsafe fn test_mm256_testnzc_ps() {
4344 let a = _mm256_set1_ps(1.);
4345 let r = _mm256_testnzc_ps(a, a);
4346 assert_eq!(r, 0);
4347 let a = _mm256_setr_ps(1., -1., -1., -1., -1., -1., -1., -1.);
4348 let b = _mm256_setr_ps(-1., -1., 1., 1., 1., 1., 1., 1.);
4349 let r = _mm256_testnzc_ps(a, b);
4350 assert_eq!(r, 1);
4351 }
4352
83c7162d 4353 #[simd_test(enable = "avx")]
0531ce1d
XL
4354 unsafe fn test_mm_testz_ps() {
4355 let a = _mm_set1_ps(1.);
4356 let r = _mm_testz_ps(a, a);
4357 assert_eq!(r, 1);
4358 let a = _mm_set1_ps(-1.);
4359 let r = _mm_testz_ps(a, a);
4360 assert_eq!(r, 0);
4361 }
4362
83c7162d 4363 #[simd_test(enable = "avx")]
0531ce1d
XL
4364 unsafe fn test_mm_testc_ps() {
4365 let a = _mm_set1_ps(1.);
4366 let r = _mm_testc_ps(a, a);
4367 assert_eq!(r, 1);
4368 let b = _mm_set1_ps(-1.);
4369 let r = _mm_testc_ps(a, b);
4370 assert_eq!(r, 0);
4371 }
4372
83c7162d 4373 #[simd_test(enable = "avx")]
0531ce1d
XL
4374 unsafe fn test_mm_testnzc_ps() {
4375 let a = _mm_set1_ps(1.);
4376 let r = _mm_testnzc_ps(a, a);
4377 assert_eq!(r, 0);
4378 let a = _mm_setr_ps(1., -1., -1., -1.);
4379 let b = _mm_setr_ps(-1., -1., 1., 1.);
4380 let r = _mm_testnzc_ps(a, b);
4381 assert_eq!(r, 1);
4382 }
4383
83c7162d 4384 #[simd_test(enable = "avx")]
0531ce1d
XL
4385 unsafe fn test_mm256_movemask_pd() {
4386 let a = _mm256_setr_pd(1., -2., 3., -4.);
4387 let r = _mm256_movemask_pd(a);
4388 assert_eq!(r, 0xA);
4389 }
4390
83c7162d 4391 #[simd_test(enable = "avx")]
0531ce1d
XL
4392 unsafe fn test_mm256_movemask_ps() {
4393 let a = _mm256_setr_ps(1., -2., 3., -4., 1., -2., 3., -4.);
4394 let r = _mm256_movemask_ps(a);
4395 assert_eq!(r, 0xAA);
4396 }
4397
83c7162d 4398 #[simd_test(enable = "avx")]
0531ce1d
XL
4399 unsafe fn test_mm256_setzero_pd() {
4400 let r = _mm256_setzero_pd();
4401 assert_eq_m256d(r, _mm256_set1_pd(0.));
4402 }
4403
83c7162d 4404 #[simd_test(enable = "avx")]
0531ce1d
XL
4405 unsafe fn test_mm256_setzero_ps() {
4406 let r = _mm256_setzero_ps();
4407 assert_eq_m256(r, _mm256_set1_ps(0.));
4408 }
4409
83c7162d 4410 #[simd_test(enable = "avx")]
0531ce1d
XL
4411 unsafe fn test_mm256_setzero_si256() {
4412 let r = _mm256_setzero_si256();
4413 assert_eq_m256i(r, _mm256_set1_epi8(0));
4414 }
4415
83c7162d 4416 #[simd_test(enable = "avx")]
0531ce1d
XL
4417 unsafe fn test_mm256_set_pd() {
4418 let r = _mm256_set_pd(1., 2., 3., 4.);
4419 assert_eq_m256d(r, _mm256_setr_pd(4., 3., 2., 1.));
4420 }
4421
83c7162d 4422 #[simd_test(enable = "avx")]
0531ce1d
XL
4423 unsafe fn test_mm256_set_ps() {
4424 let r = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
8faf50e0 4425 assert_eq_m256(r, _mm256_setr_ps(8., 7., 6., 5., 4., 3., 2., 1.));
0531ce1d
XL
4426 }
4427
83c7162d 4428 #[simd_test(enable = "avx")]
0531ce1d 4429 unsafe fn test_mm256_set_epi8() {
0731742a 4430 #[rustfmt::skip]
0531ce1d
XL
4431 let r = _mm256_set_epi8(
4432 1, 2, 3, 4, 5, 6, 7, 8,
4433 9, 10, 11, 12, 13, 14, 15, 16,
4434 17, 18, 19, 20, 21, 22, 23, 24,
4435 25, 26, 27, 28, 29, 30, 31, 32,
4436 );
0731742a 4437 #[rustfmt::skip]
0531ce1d
XL
4438 let e = _mm256_setr_epi8(
4439 32, 31, 30, 29, 28, 27, 26, 25,
4440 24, 23, 22, 21, 20, 19, 18, 17,
4441 16, 15, 14, 13, 12, 11, 10, 9,
4442 8, 7, 6, 5, 4, 3, 2, 1
4443 );
4444 assert_eq_m256i(r, e);
4445 }
4446
83c7162d 4447 #[simd_test(enable = "avx")]
0531ce1d 4448 unsafe fn test_mm256_set_epi16() {
0731742a 4449 #[rustfmt::skip]
0531ce1d
XL
4450 let r = _mm256_set_epi16(
4451 1, 2, 3, 4, 5, 6, 7, 8,
4452 9, 10, 11, 12, 13, 14, 15, 16,
4453 );
0731742a 4454 #[rustfmt::skip]
0531ce1d
XL
4455 let e = _mm256_setr_epi16(
4456 16, 15, 14, 13, 12, 11, 10, 9, 8,
4457 7, 6, 5, 4, 3, 2, 1,
4458 );
4459 assert_eq_m256i(r, e);
4460 }
4461
83c7162d 4462 #[simd_test(enable = "avx")]
0531ce1d
XL
4463 unsafe fn test_mm256_set_epi32() {
4464 let r = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4465 assert_eq_m256i(r, _mm256_setr_epi32(8, 7, 6, 5, 4, 3, 2, 1));
4466 }
4467
83c7162d 4468 #[simd_test(enable = "avx")]
0531ce1d
XL
4469 unsafe fn test_mm256_set_epi64x() {
4470 let r = _mm256_set_epi64x(1, 2, 3, 4);
4471 assert_eq_m256i(r, _mm256_setr_epi64x(4, 3, 2, 1));
4472 }
4473
83c7162d 4474 #[simd_test(enable = "avx")]
0531ce1d
XL
4475 unsafe fn test_mm256_setr_pd() {
4476 let r = _mm256_setr_pd(1., 2., 3., 4.);
4477 assert_eq_m256d(r, _mm256_setr_pd(1., 2., 3., 4.));
4478 }
4479
83c7162d 4480 #[simd_test(enable = "avx")]
0531ce1d
XL
4481 unsafe fn test_mm256_setr_ps() {
4482 let r = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
8faf50e0 4483 assert_eq_m256(r, _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.));
0531ce1d
XL
4484 }
4485
83c7162d 4486 #[simd_test(enable = "avx")]
0531ce1d 4487 unsafe fn test_mm256_setr_epi8() {
0731742a 4488 #[rustfmt::skip]
0531ce1d
XL
4489 let r = _mm256_setr_epi8(
4490 1, 2, 3, 4, 5, 6, 7, 8,
4491 9, 10, 11, 12, 13, 14, 15, 16,
4492 17, 18, 19, 20, 21, 22, 23, 24,
4493 25, 26, 27, 28, 29, 30, 31, 32,
4494 );
0731742a 4495 #[rustfmt::skip]
0531ce1d
XL
4496 let e = _mm256_setr_epi8(
4497 1, 2, 3, 4, 5, 6, 7, 8,
4498 9, 10, 11, 12, 13, 14, 15, 16,
4499 17, 18, 19, 20, 21, 22, 23, 24,
4500 25, 26, 27, 28, 29, 30, 31, 32
4501 );
4502
4503 assert_eq_m256i(r, e);
4504 }
4505
83c7162d 4506 #[simd_test(enable = "avx")]
0531ce1d 4507 unsafe fn test_mm256_setr_epi16() {
0731742a 4508 #[rustfmt::skip]
0531ce1d
XL
4509 let r = _mm256_setr_epi16(
4510 1, 2, 3, 4, 5, 6, 7, 8,
4511 9, 10, 11, 12, 13, 14, 15, 16,
4512 );
0731742a 4513 #[rustfmt::skip]
0531ce1d
XL
4514 let e = _mm256_setr_epi16(
4515 1, 2, 3, 4, 5, 6, 7, 8,
4516 9, 10, 11, 12, 13, 14, 15, 16,
4517 );
4518 assert_eq_m256i(r, e);
4519 }
4520
83c7162d 4521 #[simd_test(enable = "avx")]
0531ce1d
XL
4522 unsafe fn test_mm256_setr_epi32() {
4523 let r = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4524 assert_eq_m256i(r, _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8));
4525 }
4526
83c7162d 4527 #[simd_test(enable = "avx")]
0531ce1d
XL
4528 unsafe fn test_mm256_setr_epi64x() {
4529 let r = _mm256_setr_epi64x(1, 2, 3, 4);
4530 assert_eq_m256i(r, _mm256_setr_epi64x(1, 2, 3, 4));
4531 }
4532
83c7162d 4533 #[simd_test(enable = "avx")]
0531ce1d
XL
4534 unsafe fn test_mm256_set1_pd() {
4535 let r = _mm256_set1_pd(1.);
4536 assert_eq_m256d(r, _mm256_set1_pd(1.));
4537 }
4538
83c7162d 4539 #[simd_test(enable = "avx")]
0531ce1d
XL
4540 unsafe fn test_mm256_set1_ps() {
4541 let r = _mm256_set1_ps(1.);
4542 assert_eq_m256(r, _mm256_set1_ps(1.));
4543 }
4544
83c7162d 4545 #[simd_test(enable = "avx")]
0531ce1d
XL
4546 unsafe fn test_mm256_set1_epi8() {
4547 let r = _mm256_set1_epi8(1);
4548 assert_eq_m256i(r, _mm256_set1_epi8(1));
4549 }
4550
83c7162d 4551 #[simd_test(enable = "avx")]
0531ce1d
XL
4552 unsafe fn test_mm256_set1_epi16() {
4553 let r = _mm256_set1_epi16(1);
4554 assert_eq_m256i(r, _mm256_set1_epi16(1));
4555 }
4556
83c7162d 4557 #[simd_test(enable = "avx")]
0531ce1d
XL
4558 unsafe fn test_mm256_set1_epi32() {
4559 let r = _mm256_set1_epi32(1);
4560 assert_eq_m256i(r, _mm256_set1_epi32(1));
4561 }
4562
83c7162d 4563 #[simd_test(enable = "avx")]
0531ce1d
XL
4564 unsafe fn test_mm256_set1_epi64x() {
4565 let r = _mm256_set1_epi64x(1);
4566 assert_eq_m256i(r, _mm256_set1_epi64x(1));
4567 }
4568
83c7162d 4569 #[simd_test(enable = "avx")]
0531ce1d
XL
4570 unsafe fn test_mm256_castpd_ps() {
4571 let a = _mm256_setr_pd(1., 2., 3., 4.);
4572 let r = _mm256_castpd_ps(a);
4573 let e = _mm256_setr_ps(0., 1.875, 0., 2., 0., 2.125, 0., 2.25);
4574 assert_eq_m256(r, e);
4575 }
4576
83c7162d 4577 #[simd_test(enable = "avx")]
0531ce1d
XL
4578 unsafe fn test_mm256_castps_pd() {
4579 let a = _mm256_setr_ps(0., 1.875, 0., 2., 0., 2.125, 0., 2.25);
4580 let r = _mm256_castps_pd(a);
4581 let e = _mm256_setr_pd(1., 2., 3., 4.);
4582 assert_eq_m256d(r, e);
4583 }
4584
83c7162d 4585 #[simd_test(enable = "avx")]
0531ce1d
XL
4586 unsafe fn test_mm256_castps_si256() {
4587 let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4588 let r = _mm256_castps_si256(a);
0731742a 4589 #[rustfmt::skip]
0531ce1d
XL
4590 let e = _mm256_setr_epi8(
4591 0, 0, -128, 63, 0, 0, 0, 64,
4592 0, 0, 64, 64, 0, 0, -128, 64,
4593 0, 0, -96, 64, 0, 0, -64, 64,
4594 0, 0, -32, 64, 0, 0, 0, 65,
4595 );
4596 assert_eq_m256i(r, e);
4597 }
4598
83c7162d 4599 #[simd_test(enable = "avx")]
0531ce1d 4600 unsafe fn test_mm256_castsi256_ps() {
0731742a 4601 #[rustfmt::skip]
0531ce1d
XL
4602 let a = _mm256_setr_epi8(
4603 0, 0, -128, 63, 0, 0, 0, 64,
4604 0, 0, 64, 64, 0, 0, -128, 64,
4605 0, 0, -96, 64, 0, 0, -64, 64,
4606 0, 0, -32, 64, 0, 0, 0, 65,
4607 );
4608 let r = _mm256_castsi256_ps(a);
4609 let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4610 assert_eq_m256(r, e);
4611 }
4612
83c7162d 4613 #[simd_test(enable = "avx")]
0531ce1d
XL
4614 unsafe fn test_mm256_castpd_si256() {
4615 let a = _mm256_setr_pd(1., 2., 3., 4.);
4616 let r = _mm256_castpd_si256(a);
532ac7d7 4617 assert_eq_m256d(transmute(r), a);
0531ce1d
XL
4618 }
4619
83c7162d 4620 #[simd_test(enable = "avx")]
0531ce1d
XL
4621 unsafe fn test_mm256_castsi256_pd() {
4622 let a = _mm256_setr_epi64x(1, 2, 3, 4);
4623 let r = _mm256_castsi256_pd(a);
532ac7d7 4624 assert_eq_m256d(r, transmute(a));
0531ce1d
XL
4625 }
4626
83c7162d 4627 #[simd_test(enable = "avx")]
0531ce1d
XL
4628 unsafe fn test_mm256_castps256_ps128() {
4629 let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4630 let r = _mm256_castps256_ps128(a);
4631 assert_eq_m128(r, _mm_setr_ps(1., 2., 3., 4.));
4632 }
4633
83c7162d 4634 #[simd_test(enable = "avx")]
0531ce1d
XL
4635 unsafe fn test_mm256_castpd256_pd128() {
4636 let a = _mm256_setr_pd(1., 2., 3., 4.);
4637 let r = _mm256_castpd256_pd128(a);
4638 assert_eq_m128d(r, _mm_setr_pd(1., 2.));
4639 }
4640
83c7162d 4641 #[simd_test(enable = "avx")]
0531ce1d
XL
4642 unsafe fn test_mm256_castsi256_si128() {
4643 let a = _mm256_setr_epi64x(1, 2, 3, 4);
4644 let r = _mm256_castsi256_si128(a);
4645 assert_eq_m128i(r, _mm_setr_epi64x(1, 2));
4646 }
4647
83c7162d 4648 #[simd_test(enable = "avx")]
0531ce1d
XL
4649 unsafe fn test_mm256_zextps128_ps256() {
4650 let a = _mm_setr_ps(1., 2., 3., 4.);
4651 let r = _mm256_zextps128_ps256(a);
4652 let e = _mm256_setr_ps(1., 2., 3., 4., 0., 0., 0., 0.);
4653 assert_eq_m256(r, e);
4654 }
4655
83c7162d 4656 #[simd_test(enable = "avx")]
0531ce1d
XL
4657 unsafe fn test_mm256_zextsi128_si256() {
4658 let a = _mm_setr_epi64x(1, 2);
4659 let r = _mm256_zextsi128_si256(a);
4660 let e = _mm256_setr_epi64x(1, 2, 0, 0);
4661 assert_eq_m256i(r, e);
4662 }
4663
83c7162d 4664 #[simd_test(enable = "avx")]
0531ce1d
XL
4665 unsafe fn test_mm256_zextpd128_pd256() {
4666 let a = _mm_setr_pd(1., 2.);
4667 let r = _mm256_zextpd128_pd256(a);
4668 let e = _mm256_setr_pd(1., 2., 0., 0.);
4669 assert_eq_m256d(r, e);
4670 }
4671
83c7162d 4672 #[simd_test(enable = "avx")]
0531ce1d
XL
4673 unsafe fn test_mm256_set_m128() {
4674 let hi = _mm_setr_ps(5., 6., 7., 8.);
4675 let lo = _mm_setr_ps(1., 2., 3., 4.);
4676 let r = _mm256_set_m128(hi, lo);
4677 let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4678 assert_eq_m256(r, e);
4679 }
4680
83c7162d 4681 #[simd_test(enable = "avx")]
0531ce1d
XL
4682 unsafe fn test_mm256_set_m128d() {
4683 let hi = _mm_setr_pd(3., 4.);
4684 let lo = _mm_setr_pd(1., 2.);
4685 let r = _mm256_set_m128d(hi, lo);
4686 let e = _mm256_setr_pd(1., 2., 3., 4.);
4687 assert_eq_m256d(r, e);
4688 }
4689
83c7162d 4690 #[simd_test(enable = "avx")]
0531ce1d 4691 unsafe fn test_mm256_set_m128i() {
0731742a 4692 #[rustfmt::skip]
0531ce1d
XL
4693 let hi = _mm_setr_epi8(
4694 17, 18, 19, 20,
4695 21, 22, 23, 24,
4696 25, 26, 27, 28,
4697 29, 30, 31, 32,
4698 );
0731742a 4699 #[rustfmt::skip]
0531ce1d
XL
4700 let lo = _mm_setr_epi8(
4701 1, 2, 3, 4,
4702 5, 6, 7, 8,
4703 9, 10, 11, 12,
4704 13, 14, 15, 16,
4705 );
4706 let r = _mm256_set_m128i(hi, lo);
0731742a 4707 #[rustfmt::skip]
0531ce1d
XL
4708 let e = _mm256_setr_epi8(
4709 1, 2, 3, 4, 5, 6, 7, 8,
4710 9, 10, 11, 12, 13, 14, 15, 16,
4711 17, 18, 19, 20, 21, 22, 23, 24,
4712 25, 26, 27, 28, 29, 30, 31, 32,
4713 );
4714 assert_eq_m256i(r, e);
4715 }
4716
83c7162d 4717 #[simd_test(enable = "avx")]
0531ce1d
XL
4718 unsafe fn test_mm256_setr_m128() {
4719 let lo = _mm_setr_ps(1., 2., 3., 4.);
4720 let hi = _mm_setr_ps(5., 6., 7., 8.);
4721 let r = _mm256_setr_m128(lo, hi);
4722 let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4723 assert_eq_m256(r, e);
4724 }
4725
83c7162d 4726 #[simd_test(enable = "avx")]
0531ce1d
XL
4727 unsafe fn test_mm256_setr_m128d() {
4728 let lo = _mm_setr_pd(1., 2.);
4729 let hi = _mm_setr_pd(3., 4.);
4730 let r = _mm256_setr_m128d(lo, hi);
4731 let e = _mm256_setr_pd(1., 2., 3., 4.);
4732 assert_eq_m256d(r, e);
4733 }
4734
83c7162d 4735 #[simd_test(enable = "avx")]
0531ce1d 4736 unsafe fn test_mm256_setr_m128i() {
0731742a 4737 #[rustfmt::skip]
0531ce1d
XL
4738 let lo = _mm_setr_epi8(
4739 1, 2, 3, 4,
4740 5, 6, 7, 8,
4741 9, 10, 11, 12,
4742 13, 14, 15, 16,
4743 );
0731742a 4744 #[rustfmt::skip]
0531ce1d
XL
4745 let hi = _mm_setr_epi8(
4746 17, 18, 19, 20, 21, 22, 23, 24,
4747 25, 26, 27, 28, 29, 30, 31, 32,
4748 );
4749 let r = _mm256_setr_m128i(lo, hi);
0731742a 4750 #[rustfmt::skip]
0531ce1d
XL
4751 let e = _mm256_setr_epi8(
4752 1, 2, 3, 4, 5, 6, 7, 8,
4753 9, 10, 11, 12, 13, 14, 15, 16,
4754 17, 18, 19, 20, 21, 22, 23, 24,
4755 25, 26, 27, 28, 29, 30, 31, 32,
4756 );
4757 assert_eq_m256i(r, e);
4758 }
4759
83c7162d 4760 #[simd_test(enable = "avx")]
0531ce1d
XL
4761 unsafe fn test_mm256_loadu2_m128() {
4762 let hi = &[5., 6., 7., 8.];
4763 let hiaddr = hi.as_ptr();
4764 let lo = &[1., 2., 3., 4.];
4765 let loaddr = lo.as_ptr();
4766 let r = _mm256_loadu2_m128(hiaddr, loaddr);
4767 let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4768 assert_eq_m256(r, e);
4769 }
4770
83c7162d 4771 #[simd_test(enable = "avx")]
0531ce1d
XL
4772 unsafe fn test_mm256_loadu2_m128d() {
4773 let hi = &[3., 4.];
4774 let hiaddr = hi.as_ptr();
4775 let lo = &[1., 2.];
4776 let loaddr = lo.as_ptr();
4777 let r = _mm256_loadu2_m128d(hiaddr, loaddr);
4778 let e = _mm256_setr_pd(1., 2., 3., 4.);
4779 assert_eq_m256d(r, e);
4780 }
4781
83c7162d 4782 #[simd_test(enable = "avx")]
0531ce1d 4783 unsafe fn test_mm256_loadu2_m128i() {
0731742a 4784 #[rustfmt::skip]
0531ce1d
XL
4785 let hi = _mm_setr_epi8(
4786 17, 18, 19, 20, 21, 22, 23, 24,
4787 25, 26, 27, 28, 29, 30, 31, 32,
4788 );
0731742a 4789 #[rustfmt::skip]
0531ce1d
XL
4790 let lo = _mm_setr_epi8(
4791 1, 2, 3, 4, 5, 6, 7, 8,
4792 9, 10, 11, 12, 13, 14, 15, 16,
4793 );
0731742a
XL
4794 let r = _mm256_loadu2_m128i(&hi as *const _ as *const _, &lo as *const _ as *const _);
4795 #[rustfmt::skip]
0531ce1d
XL
4796 let e = _mm256_setr_epi8(
4797 1, 2, 3, 4, 5, 6, 7, 8,
4798 9, 10, 11, 12, 13, 14, 15, 16,
4799 17, 18, 19, 20, 21, 22, 23, 24,
4800 25, 26, 27, 28, 29, 30, 31, 32,
4801 );
4802 assert_eq_m256i(r, e);
4803 }
4804
83c7162d 4805 #[simd_test(enable = "avx")]
0531ce1d
XL
4806 unsafe fn test_mm256_storeu2_m128() {
4807 let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4808 let mut hi = _mm_undefined_ps();
4809 let mut lo = _mm_undefined_ps();
4810 _mm256_storeu2_m128(
4811 &mut hi as *mut _ as *mut f32,
4812 &mut lo as *mut _ as *mut f32,
4813 a,
4814 );
4815 assert_eq_m128(hi, _mm_setr_ps(5., 6., 7., 8.));
4816 assert_eq_m128(lo, _mm_setr_ps(1., 2., 3., 4.));
4817 }
4818
83c7162d 4819 #[simd_test(enable = "avx")]
0531ce1d
XL
4820 unsafe fn test_mm256_storeu2_m128d() {
4821 let a = _mm256_setr_pd(1., 2., 3., 4.);
4822 let mut hi = _mm_undefined_pd();
4823 let mut lo = _mm_undefined_pd();
4824 _mm256_storeu2_m128d(
4825 &mut hi as *mut _ as *mut f64,
4826 &mut lo as *mut _ as *mut f64,
4827 a,
4828 );
4829 assert_eq_m128d(hi, _mm_setr_pd(3., 4.));
4830 assert_eq_m128d(lo, _mm_setr_pd(1., 2.));
4831 }
4832
83c7162d 4833 #[simd_test(enable = "avx")]
0531ce1d 4834 unsafe fn test_mm256_storeu2_m128i() {
0731742a 4835 #[rustfmt::skip]
0531ce1d
XL
4836 let a = _mm256_setr_epi8(
4837 1, 2, 3, 4, 5, 6, 7, 8,
4838 9, 10, 11, 12, 13, 14, 15, 16,
4839 17, 18, 19, 20, 21, 22, 23, 24,
4840 25, 26, 27, 28, 29, 30, 31, 32,
4841 );
4842 let mut hi = _mm_undefined_si128();
4843 let mut lo = _mm_undefined_si128();
4844 _mm256_storeu2_m128i(&mut hi as *mut _, &mut lo as *mut _, a);
0731742a 4845 #[rustfmt::skip]
0531ce1d
XL
4846 let e_hi = _mm_setr_epi8(
4847 17, 18, 19, 20, 21, 22, 23, 24,
4848 25, 26, 27, 28, 29, 30, 31, 32
4849 );
0731742a 4850 #[rustfmt::skip]
0531ce1d
XL
4851 let e_lo = _mm_setr_epi8(
4852 1, 2, 3, 4, 5, 6, 7, 8,
4853 9, 10, 11, 12, 13, 14, 15, 16
4854 );
4855
4856 assert_eq_m128i(hi, e_hi);
4857 assert_eq_m128i(lo, e_lo);
4858 }
4859
83c7162d 4860 #[simd_test(enable = "avx")]
0531ce1d
XL
4861 unsafe fn test_mm256_cvtss_f32() {
4862 let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4863 let r = _mm256_cvtss_f32(a);
4864 assert_eq!(r, 1.);
4865 }
4866}