src/stdarch/crates/core_arch/src/x86/avx2.rs (rustc.git, upstream version 1.44.1)
1 //! Advanced Vector Extensions 2 (AVX2)
2 //!
3 //! AVX2 expands most AVX commands to 256-bit wide vector registers and
4 //! adds [FMA](https://en.wikipedia.org/wiki/Fused_multiply-accumulate).
5 //!
6 //! The references are:
7 //!
8 //! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
9 //! Instruction Set Reference, A-Z][intel64_ref].
10 //! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
11 //! System Instructions][amd64_ref].
12 //!
13 //! Wikipedia's [AVX][wiki_avx] and [FMA][wiki_fma] pages provide a quick
14 //! overview of the instructions available.
15 //!
16 //! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
17 //! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
18 //! [wiki_avx]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions
19 //! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate
20
21 use crate::{
22 core_arch::{simd::*, simd_llvm::*, x86::*},
23 mem::transmute,
24 };
25
26 #[cfg(test)]
27 use stdarch_test::assert_instr;
28
29 /// Computes the absolute values of packed 32-bit integers in `a`.
30 ///
31 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_abs_epi32)
32 #[inline]
33 #[target_feature(enable = "avx2")]
34 #[cfg_attr(test, assert_instr(vpabsd))]
35 #[stable(feature = "simd_x86", since = "1.27.0")]
36 pub unsafe fn _mm256_abs_epi32(a: __m256i) -> __m256i {
37 transmute(pabsd(a.as_i32x8()))
38 }
39
40 /// Computes the absolute values of packed 16-bit integers in `a`.
41 ///
42 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_abs_epi16)
43 #[inline]
44 #[target_feature(enable = "avx2")]
45 #[cfg_attr(test, assert_instr(vpabsw))]
46 #[stable(feature = "simd_x86", since = "1.27.0")]
47 pub unsafe fn _mm256_abs_epi16(a: __m256i) -> __m256i {
48 transmute(pabsw(a.as_i16x16()))
49 }
50
51 /// Computes the absolute values of packed 8-bit integers in `a`.
52 ///
53 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_abs_epi8)
54 #[inline]
55 #[target_feature(enable = "avx2")]
56 #[cfg_attr(test, assert_instr(vpabsb))]
57 #[stable(feature = "simd_x86", since = "1.27.0")]
58 pub unsafe fn _mm256_abs_epi8(a: __m256i) -> __m256i {
59 transmute(pabsb(a.as_i8x32()))
60 }
61
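// Illustrative sketch (not part of the upstream file): a minimal caller for the
// absolute-value intrinsics above. The helper name `abs_all_i32_example` is
// invented here and assumes the caller has already verified AVX2 support.
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn abs_all_i32_example(v: __m256i) -> __m256i {
    // Each of the eight packed 32-bit lanes becomes its absolute value,
    // e.g. [-1, 2, -3, ...] -> [1, 2, 3, ...].
    _mm256_abs_epi32(v)
}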
62 /// Adds packed 64-bit integers in `a` and `b`.
63 ///
64 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_epi64)
65 #[inline]
66 #[target_feature(enable = "avx2")]
67 #[cfg_attr(test, assert_instr(vpaddq))]
68 #[stable(feature = "simd_x86", since = "1.27.0")]
69 pub unsafe fn _mm256_add_epi64(a: __m256i, b: __m256i) -> __m256i {
70 transmute(simd_add(a.as_i64x4(), b.as_i64x4()))
71 }
72
73 /// Adds packed 32-bit integers in `a` and `b`.
74 ///
75 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_epi32)
76 #[inline]
77 #[target_feature(enable = "avx2")]
78 #[cfg_attr(test, assert_instr(vpaddd))]
79 #[stable(feature = "simd_x86", since = "1.27.0")]
80 pub unsafe fn _mm256_add_epi32(a: __m256i, b: __m256i) -> __m256i {
81 transmute(simd_add(a.as_i32x8(), b.as_i32x8()))
82 }
83
84 /// Adds packed 16-bit integers in `a` and `b`.
85 ///
86 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_epi16)
87 #[inline]
88 #[target_feature(enable = "avx2")]
89 #[cfg_attr(test, assert_instr(vpaddw))]
90 #[stable(feature = "simd_x86", since = "1.27.0")]
91 pub unsafe fn _mm256_add_epi16(a: __m256i, b: __m256i) -> __m256i {
92 transmute(simd_add(a.as_i16x16(), b.as_i16x16()))
93 }
94
95 /// Adds packed 8-bit integers in `a` and `b`.
96 ///
97 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_epi8)
98 #[inline]
99 #[target_feature(enable = "avx2")]
100 #[cfg_attr(test, assert_instr(vpaddb))]
101 #[stable(feature = "simd_x86", since = "1.27.0")]
102 pub unsafe fn _mm256_add_epi8(a: __m256i, b: __m256i) -> __m256i {
103 transmute(simd_add(a.as_i8x32(), b.as_i8x32()))
104 }
105
106 /// Adds packed 8-bit integers in `a` and `b` using saturation.
107 ///
108 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_adds_epi8)
109 #[inline]
110 #[target_feature(enable = "avx2")]
111 #[cfg_attr(test, assert_instr(vpaddsb))]
112 #[stable(feature = "simd_x86", since = "1.27.0")]
113 pub unsafe fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i {
114 transmute(simd_saturating_add(a.as_i8x32(), b.as_i8x32()))
115 }
116
117 /// Adds packed 16-bit integers in `a` and `b` using saturation.
118 ///
119 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_adds_epi16)
120 #[inline]
121 #[target_feature(enable = "avx2")]
122 #[cfg_attr(test, assert_instr(vpaddsw))]
123 #[stable(feature = "simd_x86", since = "1.27.0")]
124 pub unsafe fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i {
125 transmute(simd_saturating_add(a.as_i16x16(), b.as_i16x16()))
126 }
127
128 /// Adds packed unsigned 8-bit integers in `a` and `b` using saturation.
129 ///
130 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_adds_epu8)
131 #[inline]
132 #[target_feature(enable = "avx2")]
133 #[cfg_attr(test, assert_instr(vpaddusb))]
134 #[stable(feature = "simd_x86", since = "1.27.0")]
135 pub unsafe fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i {
136 transmute(simd_saturating_add(a.as_u8x32(), b.as_u8x32()))
137 }
138
139 /// Adds packed unsigned 16-bit integers in `a` and `b` using saturation.
140 ///
141 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_adds_epu16)
142 #[inline]
143 #[target_feature(enable = "avx2")]
144 #[cfg_attr(test, assert_instr(vpaddusw))]
145 #[stable(feature = "simd_x86", since = "1.27.0")]
146 pub unsafe fn _mm256_adds_epu16(a: __m256i, b: __m256i) -> __m256i {
147 transmute(simd_saturating_add(a.as_u16x16(), b.as_u16x16()))
148 }
149
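// Illustrative sketch (not part of the upstream file): contrasts wrapping and
// saturating byte addition. The helper name `adds_epu8_example` is invented here.
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn adds_epu8_example() -> (__m256i, __m256i) {
    let a = _mm256_set1_epi8(-56); // 0xC8, i.e. 200 when viewed as an unsigned byte
    let b = _mm256_set1_epi8(100);
    // Wrapping add: 200 + 100 = 300 wraps to 44 in every unsigned byte lane.
    let wrapped = _mm256_add_epi8(a, b);
    // Saturating unsigned add: 200 + 100 clamps to 255 in every lane.
    let saturated = _mm256_adds_epu8(a, b);
    (wrapped, saturated)
}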
150 /// Concatenates pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary
151 /// result, shifts the result right by `n` bytes, and returns the low 16 bytes.
152 ///
153 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_alignr_epi8)
154 #[inline]
155 #[target_feature(enable = "avx2")]
156 #[cfg_attr(test, assert_instr(vpalignr, n = 7))]
157 #[rustc_args_required_const(2)]
158 #[stable(feature = "simd_x86", since = "1.27.0")]
159 pub unsafe fn _mm256_alignr_epi8(a: __m256i, b: __m256i, n: i32) -> __m256i {
160 let n = n as u32;
161 // If `palignr` is shifting the pair of vectors by two lanes (32 bytes) or
162 // more, emit zero.
163 if n >= 32 {
164 return _mm256_set1_epi8(0);
165 }
166 // If `palignr` is shifting the pair of input vectors by at least one lane
167 // (16 bytes) but less than two, convert to shifting zeroes in from `a`.
168 let (a, b, n) = if n >= 16 {
169 (_mm256_set1_epi8(0), a, n - 16)
170 } else {
171 (a, b, n)
172 };
173
174 let a = a.as_i8x32();
175 let b = b.as_i8x32();
176
177 let r: i8x32 = match n {
178 0 => simd_shuffle32(
179 b,
180 a,
181 [
182 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
183 23, 24, 25, 26, 27, 28, 29, 30, 31,
184 ],
185 ),
186 1 => simd_shuffle32(
187 b,
188 a,
189 [
190 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 17, 18, 19, 20, 21, 22, 23,
191 24, 25, 26, 27, 28, 29, 30, 31, 48,
192 ],
193 ),
194 2 => simd_shuffle32(
195 b,
196 a,
197 [
198 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 18, 19, 20, 21, 22, 23, 24,
199 25, 26, 27, 28, 29, 30, 31, 48, 49,
200 ],
201 ),
202 3 => simd_shuffle32(
203 b,
204 a,
205 [
206 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 19, 20, 21, 22, 23, 24,
207 25, 26, 27, 28, 29, 30, 31, 48, 49, 50,
208 ],
209 ),
210 4 => simd_shuffle32(
211 b,
212 a,
213 [
214 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 20, 21, 22, 23, 24, 25,
215 26, 27, 28, 29, 30, 31, 48, 49, 50, 51,
216 ],
217 ),
218 5 => simd_shuffle32(
219 b,
220 a,
221 [
222 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 21, 22, 23, 24, 25, 26,
223 27, 28, 29, 30, 31, 48, 49, 50, 51, 52,
224 ],
225 ),
226 6 => simd_shuffle32(
227 b,
228 a,
229 [
230 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 22, 23, 24, 25, 26, 27,
231 28, 29, 30, 31, 48, 49, 50, 51, 52, 53,
232 ],
233 ),
234 7 => simd_shuffle32(
235 b,
236 a,
237 [
238 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 23, 24, 25, 26, 27,
239 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54,
240 ],
241 ),
242 8 => simd_shuffle32(
243 b,
244 a,
245 [
246 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 24, 25, 26, 27, 28,
247 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55,
248 ],
249 ),
250 9 => simd_shuffle32(
251 b,
252 a,
253 [
254 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 25, 26, 27, 28, 29,
255 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56,
256 ],
257 ),
258 10 => simd_shuffle32(
259 b,
260 a,
261 [
262 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 26, 27, 28, 29, 30,
263 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
264 ],
265 ),
266 11 => simd_shuffle32(
267 b,
268 a,
269 [
270 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 27, 28, 29, 30, 31,
271 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58,
272 ],
273 ),
274 12 => simd_shuffle32(
275 b,
276 a,
277 [
278 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 28, 29, 30, 31, 48,
279 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
280 ],
281 ),
282 13 => simd_shuffle32(
283 b,
284 a,
285 [
286 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 29, 30, 31, 48, 49,
287 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
288 ],
289 ),
290 14 => simd_shuffle32(
291 b,
292 a,
293 [
294 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 30, 31, 48, 49, 50,
295 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
296 ],
297 ),
298 15 => simd_shuffle32(
299 b,
300 a,
301 [
302 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 31, 48, 49, 50, 51,
303 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
304 ],
305 ),
306 _ => b,
307 };
308 transmute(r)
309 }
310
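// Illustrative sketch (not part of the upstream file): `_mm256_alignr_epi8`
// works per 128-bit lane, so a shift of 1 yields, for each lane,
// [b[1..16], a[0]] rather than a full 256-bit byte rotation. The helper name
// `alignr_by_one_example` is invented here.
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn alignr_by_one_example(a: __m256i, b: __m256i) -> __m256i {
    // Concatenate a:b within each 128-bit lane and shift right by one byte.
    _mm256_alignr_epi8(a, b, 1)
}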
311 /// Computes the bitwise AND of 256 bits (representing integer data)
312 /// in `a` and `b`.
313 ///
314 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_and_si256)
315 #[inline]
316 #[target_feature(enable = "avx2")]
317 #[cfg_attr(test, assert_instr(vandps))]
318 #[stable(feature = "simd_x86", since = "1.27.0")]
319 pub unsafe fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i {
320 transmute(simd_and(a.as_i64x4(), b.as_i64x4()))
321 }
322
323 /// Computes the bitwise NOT of 256 bits (representing integer data)
324 /// in `a` and then AND with `b`.
325 ///
326 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_andnot_si256)
327 #[inline]
328 #[target_feature(enable = "avx2")]
329 #[cfg_attr(test, assert_instr(vandnps))]
330 #[stable(feature = "simd_x86", since = "1.27.0")]
331 pub unsafe fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i {
332 let all_ones = _mm256_set1_epi8(-1);
333 transmute(simd_and(
334 simd_xor(a.as_i64x4(), all_ones.as_i64x4()),
335 b.as_i64x4(),
336 ))
337 }
338
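// Illustrative sketch (not part of the upstream file): the classic
// `mask ? x : y` bit-select built from the AND/ANDNOT pair above. The helper
// name `bitselect_example` is invented here; `mask` lanes are expected to be
// all-ones or all-zeros (e.g. the output of a compare intrinsic).
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn bitselect_example(mask: __m256i, x: __m256i, y: __m256i) -> __m256i {
    // (mask AND x) OR (NOT mask AND y)
    _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, y))
}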
339 /// Averages packed unsigned 16-bit integers in `a` and `b`.
340 ///
341 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_avg_epu16)
342 #[inline]
343 #[target_feature(enable = "avx2")]
344 #[cfg_attr(test, assert_instr(vpavgw))]
345 #[stable(feature = "simd_x86", since = "1.27.0")]
346 pub unsafe fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i {
347 transmute(pavgw(a.as_u16x16(), b.as_u16x16()))
348 }
349
350 /// Averages packed unsigned 8-bit integers in `a` and `b`.
351 ///
352 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_avg_epu8)
353 #[inline]
354 #[target_feature(enable = "avx2")]
355 #[cfg_attr(test, assert_instr(vpavgb))]
356 #[stable(feature = "simd_x86", since = "1.27.0")]
357 pub unsafe fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i {
358 transmute(pavgb(a.as_u8x32(), b.as_u8x32()))
359 }
360
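// Illustrative sketch (not part of the upstream file): `vpavgw`/`vpavgb`
// compute a *rounding* average, i.e. (a + b + 1) >> 1 evaluated in a widened
// type. The helper name `avg_rounding_example` is invented here.
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn avg_rounding_example() -> __m256i {
    let a = _mm256_set1_epi16(1);
    let b = _mm256_set1_epi16(2);
    // (1 + 2 + 1) >> 1 = 2 in every unsigned 16-bit lane, not 1.
    _mm256_avg_epu16(a, b)
}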
361 /// Blends packed 32-bit integers from `a` and `b` using control mask `imm8`.
362 ///
363 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_epi32)
364 #[inline]
365 #[target_feature(enable = "avx2")]
366 #[cfg_attr(test, assert_instr(vblendps, imm8 = 9))]
367 #[rustc_args_required_const(2)]
368 #[stable(feature = "simd_x86", since = "1.27.0")]
369 pub unsafe fn _mm_blend_epi32(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
370 let imm8 = (imm8 & 0xFF) as u8;
371 let a = a.as_i32x4();
372 let b = b.as_i32x4();
373 macro_rules! blend2 {
374 ($a:expr, $b:expr, $c:expr, $d:expr) => {
375 simd_shuffle4(a, b, [$a, $b, $c, $d]);
376 };
377 }
378 macro_rules! blend1 {
379 ($a:expr, $b:expr) => {
380 match (imm8 >> 2) & 0b11 {
381 0b00 => blend2!($a, $b, 2, 3),
382 0b01 => blend2!($a, $b, 6, 3),
383 0b10 => blend2!($a, $b, 2, 7),
384 _ => blend2!($a, $b, 6, 7),
385 }
386 };
387 }
388 let r: i32x4 = match imm8 & 0b11 {
389 0b00 => blend1!(0, 1),
390 0b01 => blend1!(4, 1),
391 0b10 => blend1!(0, 5),
392 _ => blend1!(4, 5),
393 };
394 transmute(r)
395 }
396
397 /// Blends packed 32-bit integers from `a` and `b` using control mask `imm8`.
398 ///
399 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_blend_epi32)
400 #[inline]
401 #[target_feature(enable = "avx2")]
402 #[cfg_attr(test, assert_instr(vblendps, imm8 = 9))]
403 #[rustc_args_required_const(2)]
404 #[stable(feature = "simd_x86", since = "1.27.0")]
405 pub unsafe fn _mm256_blend_epi32(a: __m256i, b: __m256i, imm8: i32) -> __m256i {
406 let imm8 = (imm8 & 0xFF) as u8;
407 let a = a.as_i32x8();
408 let b = b.as_i32x8();
409 macro_rules! blend4 {
410 (
411 $a:expr,
412 $b:expr,
413 $c:expr,
414 $d:expr,
415 $e:expr,
416 $f:expr,
417 $g:expr,
418 $h:expr
419 ) => {
420 simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h]);
421 };
422 }
423 macro_rules! blend3 {
424 ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr) => {
425 match (imm8 >> 6) & 0b11 {
426 0b00 => blend4!($a, $b, $c, $d, $e, $f, 6, 7),
427 0b01 => blend4!($a, $b, $c, $d, $e, $f, 14, 7),
428 0b10 => blend4!($a, $b, $c, $d, $e, $f, 6, 15),
429 _ => blend4!($a, $b, $c, $d, $e, $f, 14, 15),
430 }
431 };
432 }
433 macro_rules! blend2 {
434 ($a:expr, $b:expr, $c:expr, $d:expr) => {
435 match (imm8 >> 4) & 0b11 {
436 0b00 => blend3!($a, $b, $c, $d, 4, 5),
437 0b01 => blend3!($a, $b, $c, $d, 12, 5),
438 0b10 => blend3!($a, $b, $c, $d, 4, 13),
439 _ => blend3!($a, $b, $c, $d, 12, 13),
440 }
441 };
442 }
443 macro_rules! blend1 {
444 ($a:expr, $b:expr) => {
445 match (imm8 >> 2) & 0b11 {
446 0b00 => blend2!($a, $b, 2, 3),
447 0b01 => blend2!($a, $b, 10, 3),
448 0b10 => blend2!($a, $b, 2, 11),
449 _ => blend2!($a, $b, 10, 11),
450 }
451 };
452 }
453 let r: i32x8 = match imm8 & 0b11 {
454 0b00 => blend1!(0, 1),
455 0b01 => blend1!(8, 1),
456 0b10 => blend1!(0, 9),
457 _ => blend1!(8, 9),
458 };
459 transmute(r)
460 }
461
462 /// Blends packed 16-bit integers from `a` and `b` using control mask `imm8`.
463 ///
464 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_blend_epi16)
465 #[inline]
466 #[target_feature(enable = "avx2")]
467 #[cfg_attr(test, assert_instr(vpblendw, imm8 = 9))]
468 #[rustc_args_required_const(2)]
469 #[stable(feature = "simd_x86", since = "1.27.0")]
470 pub unsafe fn _mm256_blend_epi16(a: __m256i, b: __m256i, imm8: i32) -> __m256i {
471 let imm8 = (imm8 & 0xFF) as u8;
472 let a = a.as_i16x16();
473 let b = b.as_i16x16();
474 macro_rules! blend4 {
475 (
476 $a:expr,
477 $b:expr,
478 $c:expr,
479 $d:expr,
480 $e:expr,
481 $f:expr,
482 $g:expr,
483 $h:expr,
484 $i:expr,
485 $j:expr,
486 $k:expr,
487 $l:expr,
488 $m:expr,
489 $n:expr,
490 $o:expr,
491 $p:expr
492 ) => {
493 simd_shuffle16(
494 a,
495 b,
496 [
497 $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p,
498 ],
499 )
500 };
501 }
502 macro_rules! blend3 {
503 (
504 $a:expr,
505 $b:expr,
506 $c:expr,
507 $d:expr,
508 $e:expr,
509 $f:expr,
510 $a2:expr,
511 $b2:expr,
512 $c2:expr,
513 $d2:expr,
514 $e2:expr,
515 $f2:expr
516 ) => {
517 match (imm8 >> 6) & 0b11 {
518 0b00 => blend4!($a, $b, $c, $d, $e, $f, 6, 7, $a2, $b2, $c2, $d2, $e2, $f2, 14, 15),
519 0b01 => {
520 blend4!($a, $b, $c, $d, $e, $f, 22, 7, $a2, $b2, $c2, $d2, $e2, $f2, 30, 15)
521 }
522 0b10 => {
523 blend4!($a, $b, $c, $d, $e, $f, 6, 23, $a2, $b2, $c2, $d2, $e2, $f2, 14, 31)
524 }
525 _ => blend4!($a, $b, $c, $d, $e, $f, 22, 23, $a2, $b2, $c2, $d2, $e2, $f2, 30, 31),
526 }
527 };
528 }
529 macro_rules! blend2 {
530 (
531 $a:expr,
532 $b:expr,
533 $c:expr,
534 $d:expr,
535 $a2:expr,
536 $b2:expr,
537 $c2:expr,
538 $d2:expr
539 ) => {
540 match (imm8 >> 4) & 0b11 {
541 0b00 => blend3!($a, $b, $c, $d, 4, 5, $a2, $b2, $c2, $d2, 12, 13),
542 0b01 => blend3!($a, $b, $c, $d, 20, 5, $a2, $b2, $c2, $d2, 28, 13),
543 0b10 => blend3!($a, $b, $c, $d, 4, 21, $a2, $b2, $c2, $d2, 12, 29),
544 _ => blend3!($a, $b, $c, $d, 20, 21, $a2, $b2, $c2, $d2, 28, 29),
545 }
546 };
547 }
548 macro_rules! blend1 {
549 ($a1:expr, $b1:expr, $a2:expr, $b2:expr) => {
550 match (imm8 >> 2) & 0b11 {
551 0b00 => blend2!($a1, $b1, 2, 3, $a2, $b2, 10, 11),
552 0b01 => blend2!($a1, $b1, 18, 3, $a2, $b2, 26, 11),
553 0b10 => blend2!($a1, $b1, 2, 19, $a2, $b2, 10, 27),
554 _ => blend2!($a1, $b1, 18, 19, $a2, $b2, 26, 27),
555 }
556 };
557 }
558 let r: i16x16 = match imm8 & 0b11 {
559 0b00 => blend1!(0, 1, 8, 9),
560 0b01 => blend1!(16, 1, 24, 9),
561 0b10 => blend1!(0, 17, 8, 25),
562 _ => blend1!(16, 17, 24, 25),
563 };
564 transmute(r)
565 }
566
567 /// Blends packed 8-bit integers from `a` and `b` using `mask`.
568 ///
569 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_blendv_epi8)
570 #[inline]
571 #[target_feature(enable = "avx2")]
572 #[cfg_attr(test, assert_instr(vpblendvb))]
573 #[stable(feature = "simd_x86", since = "1.27.0")]
574 pub unsafe fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i {
575 transmute(pblendvb(a.as_i8x32(), b.as_i8x32(), mask.as_i8x32()))
576 }
577
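// Illustrative sketch (not part of the upstream file): `_mm256_blendv_epi8`
// selects from `b` where the most significant bit of the corresponding `mask`
// byte is 1 and from `a` otherwise, so a compare result can be used directly
// as the mask. The helper name `clamp_bytes_example` is invented here.
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn clamp_bytes_example(v: __m256i, limit: __m256i) -> __m256i {
    // Where v > limit (signed byte compare), take limit; otherwise keep v.
    let too_big = _mm256_cmpgt_epi8(v, limit);
    _mm256_blendv_epi8(v, limit, too_big)
}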
578 /// Broadcasts the low packed 8-bit integer from `a` to all elements of
579 /// the 128-bit returned value.
580 ///
581 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastb_epi8)
582 #[inline]
583 #[target_feature(enable = "avx2")]
584 #[cfg_attr(test, assert_instr(vpbroadcastb))]
585 #[stable(feature = "simd_x86", since = "1.27.0")]
586 pub unsafe fn _mm_broadcastb_epi8(a: __m128i) -> __m128i {
587 let zero = _mm_setzero_si128();
588 let ret = simd_shuffle16(a.as_i8x16(), zero.as_i8x16(), [0_u32; 16]);
589 transmute::<i8x16, _>(ret)
590 }
591
592 /// Broadcasts the low packed 8-bit integer from `a` to all elements of
593 /// the 256-bit returned value.
594 ///
595 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastb_epi8)
596 #[inline]
597 #[target_feature(enable = "avx2")]
598 #[cfg_attr(test, assert_instr(vpbroadcastb))]
599 #[stable(feature = "simd_x86", since = "1.27.0")]
600 pub unsafe fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i {
601 let zero = _mm_setzero_si128();
602 let ret = simd_shuffle32(a.as_i8x16(), zero.as_i8x16(), [0_u32; 32]);
603 transmute::<i8x32, _>(ret)
604 }
605
606 // N.B., `simd_shuffle4` with integer data types for `a` and `b` is
607 // often compiled to `vbroadcastss`.
608 /// Broadcasts the low packed 32-bit integer from `a` to all elements of
609 /// the 128-bit returned value.
610 ///
611 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastd_epi32)
612 #[inline]
613 #[target_feature(enable = "avx2")]
614 #[cfg_attr(test, assert_instr(vbroadcastss))]
615 #[stable(feature = "simd_x86", since = "1.27.0")]
616 pub unsafe fn _mm_broadcastd_epi32(a: __m128i) -> __m128i {
617 let zero = _mm_setzero_si128();
618 let ret = simd_shuffle4(a.as_i32x4(), zero.as_i32x4(), [0_u32; 4]);
619 transmute::<i32x4, _>(ret)
620 }
621
622 // N.B., `simd_shuffle4` with integer data types for `a` and `b` is
623 // often compiled to `vbroadcastss`.
624 /// Broadcasts the low packed 32-bit integer from `a` to all elements of
625 /// the 256-bit returned value.
626 ///
627 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastd_epi32)
628 #[inline]
629 #[target_feature(enable = "avx2")]
630 #[cfg_attr(test, assert_instr(vbroadcastss))]
631 #[stable(feature = "simd_x86", since = "1.27.0")]
632 pub unsafe fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i {
633 let zero = _mm_setzero_si128();
634 let ret = simd_shuffle8(a.as_i32x4(), zero.as_i32x4(), [0_u32; 8]);
635 transmute::<i32x8, _>(ret)
636 }
637
638 /// Broadcasts the low packed 64-bit integer from `a` to all elements of
639 /// the 128-bit returned value.
640 ///
641 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastq_epi64)
642 #[inline]
643 #[target_feature(enable = "avx2")]
644 // FIXME: https://github.com/rust-lang/stdarch/issues/791
645 #[cfg_attr(test, assert_instr(vmovddup))]
646 #[stable(feature = "simd_x86", since = "1.27.0")]
647 pub unsafe fn _mm_broadcastq_epi64(a: __m128i) -> __m128i {
648 let ret = simd_shuffle2(a.as_i64x2(), a.as_i64x2(), [0_u32; 2]);
649 transmute::<i64x2, _>(ret)
650 }
651
652 /// Broadcasts the low packed 64-bit integer from `a` to all elements of
653 /// the 256-bit returned value.
654 ///
655 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastq_epi64)
656 #[inline]
657 #[target_feature(enable = "avx2")]
658 #[cfg_attr(test, assert_instr(vbroadcastsd))]
659 #[stable(feature = "simd_x86", since = "1.27.0")]
660 pub unsafe fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i {
661 let ret = simd_shuffle4(a.as_i64x2(), a.as_i64x2(), [0_u32; 4]);
662 transmute::<i64x4, _>(ret)
663 }
664
665 /// Broadcasts the low double-precision (64-bit) floating-point element
666 /// from `a` to all elements of the 128-bit returned value.
667 ///
668 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastsd_pd)
669 #[inline]
670 #[target_feature(enable = "avx2")]
671 #[cfg_attr(test, assert_instr(vmovddup))]
672 #[stable(feature = "simd_x86", since = "1.27.0")]
673 pub unsafe fn _mm_broadcastsd_pd(a: __m128d) -> __m128d {
674 simd_shuffle2(a, _mm_setzero_pd(), [0_u32; 2])
675 }
676
677 /// Broadcasts the low double-precision (64-bit) floating-point element
678 /// from `a` to all elements of the 256-bit returned value.
679 ///
680 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastsd_pd)
681 #[inline]
682 #[target_feature(enable = "avx2")]
683 #[cfg_attr(test, assert_instr(vbroadcastsd))]
684 #[stable(feature = "simd_x86", since = "1.27.0")]
685 pub unsafe fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d {
686 simd_shuffle4(a, _mm_setzero_pd(), [0_u32; 4])
687 }
688
689 // N.B., `broadcastsi128_si256` is often compiled to `vinsertf128` or
690 // `vbroadcastf128`.
691 /// Broadcasts 128 bits of integer data from `a` to all 128-bit lanes in
692 /// the 256-bit returned value.
693 ///
694 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastsi128_si256)
695 #[inline]
696 #[target_feature(enable = "avx2")]
697 #[stable(feature = "simd_x86", since = "1.27.0")]
698 pub unsafe fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i {
699 let zero = _mm_setzero_si128();
700 let ret = simd_shuffle4(a.as_i64x2(), zero.as_i64x2(), [0, 1, 0, 1]);
701 transmute::<i64x4, _>(ret)
702 }
703
704 /// Broadcasts the low single-precision (32-bit) floating-point element
705 /// from `a` to all elements of the 128-bit returned value.
706 ///
707 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastss_ps)
708 #[inline]
709 #[target_feature(enable = "avx2")]
710 #[cfg_attr(test, assert_instr(vbroadcastss))]
711 #[stable(feature = "simd_x86", since = "1.27.0")]
712 pub unsafe fn _mm_broadcastss_ps(a: __m128) -> __m128 {
713 simd_shuffle4(a, _mm_setzero_ps(), [0_u32; 4])
714 }
715
716 /// Broadcasts the low single-precision (32-bit) floating-point element
717 /// from `a` to all elements of the 256-bit returned value.
718 ///
719 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastss_ps)
720 #[inline]
721 #[target_feature(enable = "avx2")]
722 #[cfg_attr(test, assert_instr(vbroadcastss))]
723 #[stable(feature = "simd_x86", since = "1.27.0")]
724 pub unsafe fn _mm256_broadcastss_ps(a: __m128) -> __m256 {
725 simd_shuffle8(a, _mm_setzero_ps(), [0_u32; 8])
726 }
727
728 /// Broadcasts the low packed 16-bit integer from `a` to all elements of
729 /// the 128-bit returned value.
730 ///
731 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastw_epi16)
732 #[inline]
733 #[target_feature(enable = "avx2")]
734 #[cfg_attr(test, assert_instr(vpbroadcastw))]
735 #[stable(feature = "simd_x86", since = "1.27.0")]
736 pub unsafe fn _mm_broadcastw_epi16(a: __m128i) -> __m128i {
737 let zero = _mm_setzero_si128();
738 let ret = simd_shuffle8(a.as_i16x8(), zero.as_i16x8(), [0_u32; 8]);
739 transmute::<i16x8, _>(ret)
740 }
741
742 /// Broadcasts the low packed 16-bit integer from `a` to all elements of
743 /// the 256-bit returned value.
744 ///
745 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastw_epi16)
746 #[inline]
747 #[target_feature(enable = "avx2")]
748 #[cfg_attr(test, assert_instr(vpbroadcastw))]
749 #[stable(feature = "simd_x86", since = "1.27.0")]
750 pub unsafe fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i {
751 let zero = _mm_setzero_si128();
752 let ret = simd_shuffle16(a.as_i16x8(), zero.as_i16x8(), [0_u32; 16]);
753 transmute::<i16x16, _>(ret)
754 }
755
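// Illustrative sketch (not part of the upstream file): broadcasting a scalar
// into every lane, a common way to build a constant operand for the packed
// operations in this module. The helper name `splat_i16_example` is invented
// here.
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn splat_i16_example(x: i16) -> __m256i {
    // `_mm_set1_epi16` materializes `x` in a 128-bit register; the broadcast
    // then copies lane 0 into all sixteen 16-bit lanes of the 256-bit result.
    _mm256_broadcastw_epi16(_mm_set1_epi16(x))
}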
756 /// Compares packed 64-bit integers in `a` and `b` for equality.
757 ///
758 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epi64)
759 #[inline]
760 #[target_feature(enable = "avx2")]
761 #[cfg_attr(test, assert_instr(vpcmpeqq))]
762 #[stable(feature = "simd_x86", since = "1.27.0")]
763 pub unsafe fn _mm256_cmpeq_epi64(a: __m256i, b: __m256i) -> __m256i {
764 transmute::<i64x4, _>(simd_eq(a.as_i64x4(), b.as_i64x4()))
765 }
766
767 /// Compares packed 32-bit integers in `a` and `b` for equality.
768 ///
769 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epi32)
770 #[inline]
771 #[target_feature(enable = "avx2")]
772 #[cfg_attr(test, assert_instr(vpcmpeqd))]
773 #[stable(feature = "simd_x86", since = "1.27.0")]
774 pub unsafe fn _mm256_cmpeq_epi32(a: __m256i, b: __m256i) -> __m256i {
775 transmute::<i32x8, _>(simd_eq(a.as_i32x8(), b.as_i32x8()))
776 }
777
778 /// Compares packed 16-bit integers in `a` and `b` for equality.
779 ///
780 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epi16)
781 #[inline]
782 #[target_feature(enable = "avx2")]
783 #[cfg_attr(test, assert_instr(vpcmpeqw))]
784 #[stable(feature = "simd_x86", since = "1.27.0")]
785 pub unsafe fn _mm256_cmpeq_epi16(a: __m256i, b: __m256i) -> __m256i {
786 transmute::<i16x16, _>(simd_eq(a.as_i16x16(), b.as_i16x16()))
787 }
788
789 /// Compares packed 8-bit integers in `a` and `b` for equality.
790 ///
791 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epi8)
792 #[inline]
793 #[target_feature(enable = "avx2")]
794 #[cfg_attr(test, assert_instr(vpcmpeqb))]
795 #[stable(feature = "simd_x86", since = "1.27.0")]
796 pub unsafe fn _mm256_cmpeq_epi8(a: __m256i, b: __m256i) -> __m256i {
797 transmute::<i8x32, _>(simd_eq(a.as_i8x32(), b.as_i8x32()))
798 }
799
800 /// Compares packed 64-bit integers in `a` and `b` for greater-than.
801 ///
802 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epi64)
803 #[inline]
804 #[target_feature(enable = "avx2")]
805 #[cfg_attr(test, assert_instr(vpcmpgtq))]
806 #[stable(feature = "simd_x86", since = "1.27.0")]
807 pub unsafe fn _mm256_cmpgt_epi64(a: __m256i, b: __m256i) -> __m256i {
808 transmute::<i64x4, _>(simd_gt(a.as_i64x4(), b.as_i64x4()))
809 }
810
811 /// Compares packed 32-bit integers in `a` and `b` for greater-than.
812 ///
813 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epi32)
814 #[inline]
815 #[target_feature(enable = "avx2")]
816 #[cfg_attr(test, assert_instr(vpcmpgtd))]
817 #[stable(feature = "simd_x86", since = "1.27.0")]
818 pub unsafe fn _mm256_cmpgt_epi32(a: __m256i, b: __m256i) -> __m256i {
819 transmute::<i32x8, _>(simd_gt(a.as_i32x8(), b.as_i32x8()))
820 }
821
822 /// Compares packed 16-bit integers in `a` and `b` for greater-than.
823 ///
824 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epi16)
825 #[inline]
826 #[target_feature(enable = "avx2")]
827 #[cfg_attr(test, assert_instr(vpcmpgtw))]
828 #[stable(feature = "simd_x86", since = "1.27.0")]
829 pub unsafe fn _mm256_cmpgt_epi16(a: __m256i, b: __m256i) -> __m256i {
830 transmute::<i16x16, _>(simd_gt(a.as_i16x16(), b.as_i16x16()))
831 }
832
833 /// Compares packed 8-bit integers in `a` and `b` for greater-than.
834 ///
835 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epi8)
836 #[inline]
837 #[target_feature(enable = "avx2")]
838 #[cfg_attr(test, assert_instr(vpcmpgtb))]
839 #[stable(feature = "simd_x86", since = "1.27.0")]
840 pub unsafe fn _mm256_cmpgt_epi8(a: __m256i, b: __m256i) -> __m256i {
841 transmute::<i8x32, _>(simd_gt(a.as_i8x32(), b.as_i8x32()))
842 }
843
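// Illustrative sketch (not part of the upstream file): the compare intrinsics
// above return all-ones/all-zeros lanes, which pair naturally with
// `_mm256_movemask_epi8` (defined later in this file) to count matches. The
// helper name `count_gt_bytes_example` is invented here.
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn count_gt_bytes_example(v: __m256i, threshold: __m256i) -> u32 {
    // One mask bit per byte lane; count how many lanes compared greater-than.
    let gt = _mm256_cmpgt_epi8(v, threshold);
    _mm256_movemask_epi8(gt).count_ones()
}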
844 /// Sign-extend 16-bit integers to 32-bit integers.
845 ///
846 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi16_epi32)
847 #[inline]
848 #[target_feature(enable = "avx2")]
849 #[cfg_attr(test, assert_instr(vpmovsxwd))]
850 #[stable(feature = "simd_x86", since = "1.27.0")]
851 pub unsafe fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i {
852 transmute::<i32x8, _>(simd_cast(a.as_i16x8()))
853 }
854
855 /// Sign-extend 16-bit integers to 64-bit integers.
856 ///
857 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi16_epi64)
858 #[inline]
859 #[target_feature(enable = "avx2")]
860 #[cfg_attr(test, assert_instr(vpmovsxwq))]
861 #[stable(feature = "simd_x86", since = "1.27.0")]
862 pub unsafe fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i {
863 let a = a.as_i16x8();
864 let v64: i16x4 = simd_shuffle4(a, a, [0, 1, 2, 3]);
865 transmute::<i64x4, _>(simd_cast(v64))
866 }
867
868 /// Sign-extend 32-bit integers to 64-bit integers.
869 ///
870 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi32_epi64)
871 #[inline]
872 #[target_feature(enable = "avx2")]
873 #[cfg_attr(test, assert_instr(vpmovsxdq))]
874 #[stable(feature = "simd_x86", since = "1.27.0")]
875 pub unsafe fn _mm256_cvtepi32_epi64(a: __m128i) -> __m256i {
876 transmute::<i64x4, _>(simd_cast(a.as_i32x4()))
877 }
878
879 /// Sign-extend 8-bit integers to 16-bit integers.
880 ///
881 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi8_epi16)
882 #[inline]
883 #[target_feature(enable = "avx2")]
884 #[cfg_attr(test, assert_instr(vpmovsxbw))]
885 #[stable(feature = "simd_x86", since = "1.27.0")]
886 pub unsafe fn _mm256_cvtepi8_epi16(a: __m128i) -> __m256i {
887 transmute::<i16x16, _>(simd_cast(a.as_i8x16()))
888 }
889
890 /// Sign-extend 8-bit integers to 32-bit integers.
891 ///
892 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi8_epi32)
893 #[inline]
894 #[target_feature(enable = "avx2")]
895 #[cfg_attr(test, assert_instr(vpmovsxbd))]
896 #[stable(feature = "simd_x86", since = "1.27.0")]
897 pub unsafe fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i {
898 let a = a.as_i8x16();
899 let v64: i8x8 = simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
900 transmute::<i32x8, _>(simd_cast(v64))
901 }
902
903 /// Sign-extend 8-bit integers to 64-bit integers.
904 ///
905 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi8_epi64)
906 #[inline]
907 #[target_feature(enable = "avx2")]
908 #[cfg_attr(test, assert_instr(vpmovsxbq))]
909 #[stable(feature = "simd_x86", since = "1.27.0")]
910 pub unsafe fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i {
911 let a = a.as_i8x16();
912 let v32: i8x4 = simd_shuffle4(a, a, [0, 1, 2, 3]);
913 transmute::<i64x4, _>(simd_cast(v32))
914 }
915
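// Illustrative sketch (not part of the upstream file): widening before
// arithmetic so that intermediate sums cannot overflow the narrow type. The
// helper name `widening_add_example` is invented here.
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn widening_add_example(a: __m128i, b: __m128i) -> __m256i {
    // Sign-extend sixteen i8 values to sixteen i16 values, then add without
    // any risk of i8 overflow.
    _mm256_add_epi16(_mm256_cvtepi8_epi16(a), _mm256_cvtepi8_epi16(b))
}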
916 /// Zero-extend packed unsigned 16-bit integers in `a` to packed 32-bit
917 /// integers.
918 ///
919 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu16_epi32)
920 #[inline]
921 #[target_feature(enable = "avx2")]
922 #[cfg_attr(test, assert_instr(vpmovzxwd))]
923 #[stable(feature = "simd_x86", since = "1.27.0")]
924 pub unsafe fn _mm256_cvtepu16_epi32(a: __m128i) -> __m256i {
925 transmute::<i32x8, _>(simd_cast(a.as_u16x8()))
926 }
927
928 /// Zero-extend the lower four unsigned 16-bit integers in `a` to 64-bit
929 /// integers. The upper four elements of `a` are unused.
930 ///
931 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu16_epi64)
932 #[inline]
933 #[target_feature(enable = "avx2")]
934 #[cfg_attr(test, assert_instr(vpmovzxwq))]
935 #[stable(feature = "simd_x86", since = "1.27.0")]
936 pub unsafe fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i {
937 let a = a.as_u16x8();
938 let v64: u16x4 = simd_shuffle4(a, a, [0, 1, 2, 3]);
939 transmute::<i64x4, _>(simd_cast(v64))
940 }
941
942 /// Zero-extend unsigned 32-bit integers in `a` to 64-bit integers.
943 ///
944 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu32_epi64)
945 #[inline]
946 #[target_feature(enable = "avx2")]
947 #[cfg_attr(test, assert_instr(vpmovzxdq))]
948 #[stable(feature = "simd_x86", since = "1.27.0")]
949 pub unsafe fn _mm256_cvtepu32_epi64(a: __m128i) -> __m256i {
950 transmute::<i64x4, _>(simd_cast(a.as_u32x4()))
951 }
952
953 /// Zero-extend unsigned 8-bit integers in `a` to 16-bit integers.
954 ///
955 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu8_epi16)
956 #[inline]
957 #[target_feature(enable = "avx2")]
958 #[cfg_attr(test, assert_instr(vpmovzxbw))]
959 #[stable(feature = "simd_x86", since = "1.27.0")]
960 pub unsafe fn _mm256_cvtepu8_epi16(a: __m128i) -> __m256i {
961 transmute::<i16x16, _>(simd_cast(a.as_u8x16()))
962 }
963
964 /// Zero-extend the lower eight unsigned 8-bit integers in `a` to 32-bit
965 /// integers. The upper eight elements of `a` are unused.
966 ///
967 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu8_epi32)
968 #[inline]
969 #[target_feature(enable = "avx2")]
970 #[cfg_attr(test, assert_instr(vpmovzxbd))]
971 #[stable(feature = "simd_x86", since = "1.27.0")]
972 pub unsafe fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i {
973 let a = a.as_u8x16();
974 let v64: u8x8 = simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
975 transmute::<i32x8, _>(simd_cast(v64))
976 }
977
978 /// Zero-extend the lower four unsigned 8-bit integers in `a` to 64-bit
979 /// integers. The upper twelve elements of `a` are unused.
980 ///
981 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu8_epi64)
982 #[inline]
983 #[target_feature(enable = "avx2")]
984 #[cfg_attr(test, assert_instr(vpmovzxbq))]
985 #[stable(feature = "simd_x86", since = "1.27.0")]
986 pub unsafe fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i {
987 let a = a.as_u8x16();
988 let v32: u8x4 = simd_shuffle4(a, a, [0, 1, 2, 3]);
989 transmute::<i64x4, _>(simd_cast(v32))
990 }
991
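// Illustrative sketch (not part of the upstream file): zero-extending packed
// bytes, e.g. 8-bit pixel data, before accumulating in 32-bit lanes. The
// helper name `bytes_to_u32_example` is invented here.
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn bytes_to_u32_example(pixels: __m128i) -> __m256i {
    // Only the low eight bytes of `pixels` are used; each becomes a 32-bit lane.
    _mm256_cvtepu8_epi32(pixels)
}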
992 /// Extracts 128 bits (of integer data) from `a` selected with `imm8`.
993 ///
994 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extracti128_si256)
995 #[inline]
996 #[target_feature(enable = "avx2")]
997 #[cfg_attr(
998 all(test, not(target_os = "windows")),
999 assert_instr(vextractf128, imm8 = 1)
1000 )]
1001 #[rustc_args_required_const(1)]
1002 #[stable(feature = "simd_x86", since = "1.27.0")]
1003 pub unsafe fn _mm256_extracti128_si256(a: __m256i, imm8: i32) -> __m128i {
1004 let a = a.as_i64x4();
1005 let b = _mm256_undefined_si256().as_i64x4();
1006 let dst: i64x2 = match imm8 & 0b01 {
1007 0 => simd_shuffle2(a, b, [0, 1]),
1008 _ => simd_shuffle2(a, b, [2, 3]),
1009 };
1010 transmute(dst)
1011 }
1012
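// Illustrative sketch (not part of the upstream file): splitting a 256-bit
// vector into its two 128-bit halves; the low half can also be taken with the
// cheaper `_mm256_castsi256_si128`. The helper name `split_halves_example` is
// invented here.
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn split_halves_example(v: __m256i) -> (__m128i, __m128i) {
    let lo = _mm256_extracti128_si256(v, 0);
    let hi = _mm256_extracti128_si256(v, 1);
    (lo, hi)
}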
1013 /// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`.
1014 ///
1015 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hadd_epi16)
1016 #[inline]
1017 #[target_feature(enable = "avx2")]
1018 #[cfg_attr(test, assert_instr(vphaddw))]
1019 #[stable(feature = "simd_x86", since = "1.27.0")]
1020 pub unsafe fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i {
1021 transmute(phaddw(a.as_i16x16(), b.as_i16x16()))
1022 }
1023
1024 /// Horizontally adds adjacent pairs of 32-bit integers in `a` and `b`.
1025 ///
1026 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hadd_epi32)
1027 #[inline]
1028 #[target_feature(enable = "avx2")]
1029 #[cfg_attr(test, assert_instr(vphaddd))]
1030 #[stable(feature = "simd_x86", since = "1.27.0")]
1031 pub unsafe fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i {
1032 transmute(phaddd(a.as_i32x8(), b.as_i32x8()))
1033 }
1034
1035 /// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`
1036 /// using saturation.
1037 ///
1038 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hadds_epi16)
1039 #[inline]
1040 #[target_feature(enable = "avx2")]
1041 #[cfg_attr(test, assert_instr(vphaddsw))]
1042 #[stable(feature = "simd_x86", since = "1.27.0")]
1043 pub unsafe fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i {
1044 transmute(phaddsw(a.as_i16x16(), b.as_i16x16()))
1045 }
1046
1047 /// Horizontally subtracts adjacent pairs of 16-bit integers in `a` and `b`.
1048 ///
1049 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hsub_epi16)
1050 #[inline]
1051 #[target_feature(enable = "avx2")]
1052 #[cfg_attr(test, assert_instr(vphsubw))]
1053 #[stable(feature = "simd_x86", since = "1.27.0")]
1054 pub unsafe fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i {
1055 transmute(phsubw(a.as_i16x16(), b.as_i16x16()))
1056 }
1057
1058 /// Horizontally subtracts adjacent pairs of 32-bit integers in `a` and `b`.
1059 ///
1060 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hsub_epi32)
1061 #[inline]
1062 #[target_feature(enable = "avx2")]
1063 #[cfg_attr(test, assert_instr(vphsubd))]
1064 #[stable(feature = "simd_x86", since = "1.27.0")]
1065 pub unsafe fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i {
1066 transmute(phsubd(a.as_i32x8(), b.as_i32x8()))
1067 }
1068
1069 /// Horizontally subtracts adjacent pairs of 16-bit integers in `a` and `b`
1070 /// using saturation.
1071 ///
1072 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hsubs_epi16)
1073 #[inline]
1074 #[target_feature(enable = "avx2")]
1075 #[cfg_attr(test, assert_instr(vphsubsw))]
1076 #[stable(feature = "simd_x86", since = "1.27.0")]
1077 pub unsafe fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i {
1078 transmute(phsubsw(a.as_i16x16(), b.as_i16x16()))
1079 }
1080
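// Illustrative sketch (not part of the upstream file): reducing eight 32-bit
// lanes to a single sum with the horizontal adds above. Note that
// `_mm256_hadd_epi32` operates within each 128-bit lane, so the two lane sums
// still have to be combined at the end. The helper name `sum_i32x8_example`
// is invented here.
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn sum_i32x8_example(v: __m256i) -> i32 {
    // After two in-lane horizontal adds, lane 0 of each 128-bit half holds
    // that half's total.
    let s = _mm256_hadd_epi32(v, v);
    let s = _mm256_hadd_epi32(s, s);
    let lo = _mm256_castsi256_si128(s);
    let hi = _mm256_extracti128_si256(s, 1);
    _mm_cvtsi128_si32(_mm_add_epi32(lo, hi))
}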
1081 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1082 /// where
1083 /// `scale` should be 1, 2, 4 or 8.
1084 ///
1085 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32gather_epi32)
1086 #[inline]
1087 #[target_feature(enable = "avx2")]
1088 #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))]
1089 #[rustc_args_required_const(2)]
1090 #[stable(feature = "simd_x86", since = "1.27.0")]
1091 pub unsafe fn _mm_i32gather_epi32(slice: *const i32, offsets: __m128i, scale: i32) -> __m128i {
1092 let zero = _mm_setzero_si128().as_i32x4();
1093 let neg_one = _mm_set1_epi32(-1).as_i32x4();
1094 let offsets = offsets.as_i32x4();
1095 let slice = slice as *const i8;
1096 macro_rules! call {
1097 ($imm8:expr) => {
1098 pgatherdd(zero, slice, offsets, neg_one, $imm8)
1099 };
1100 }
1101 let r = constify_imm8!(scale, call);
1102 transmute(r)
1103 }
1104
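// Illustrative sketch (not part of the upstream file): gathering four table
// entries with `vpgatherdd`. The scale of 4 matches the size of an `i32`
// element, and the array name `TABLE` is invented here.
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn gather_example() -> __m128i {
    static TABLE: [i32; 8] = [10, 11, 12, 13, 14, 15, 16, 17];
    // The byte address of each element is `TABLE.as_ptr() + index * 4`.
    let indices = _mm_setr_epi32(0, 2, 4, 6);
    // Loads [10, 12, 14, 16].
    _mm_i32gather_epi32(TABLE.as_ptr(), indices, 4)
}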
1105 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1106 /// where
1107 /// `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
1108 /// element of `mask` is not set, the value from `src` at that position is used instead.
1109 ///
1110 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32gather_epi32)
1111 #[inline]
1112 #[target_feature(enable = "avx2")]
1113 #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))]
1114 #[rustc_args_required_const(4)]
1115 #[stable(feature = "simd_x86", since = "1.27.0")]
1116 pub unsafe fn _mm_mask_i32gather_epi32(
1117 src: __m128i,
1118 slice: *const i32,
1119 offsets: __m128i,
1120 mask: __m128i,
1121 scale: i32,
1122 ) -> __m128i {
1123 let src = src.as_i32x4();
1124 let mask = mask.as_i32x4();
1125 let offsets = offsets.as_i32x4();
1126 let slice = slice as *const i8;
1127 macro_rules! call {
1128 ($imm8:expr) => {
1129 pgatherdd(src, slice, offsets, mask, $imm8)
1130 };
1131 }
1132 let r = constify_imm8!(scale, call);
1133 transmute(r)
1134 }
1135
1136 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1137 /// where
1138 /// `scale` should be 1, 2, 4 or 8.
1139 ///
1140 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32gather_epi32)
1141 #[inline]
1142 #[target_feature(enable = "avx2")]
1143 #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))]
1144 #[rustc_args_required_const(2)]
1145 #[stable(feature = "simd_x86", since = "1.27.0")]
1146 pub unsafe fn _mm256_i32gather_epi32(slice: *const i32, offsets: __m256i, scale: i32) -> __m256i {
1147 let zero = _mm256_setzero_si256().as_i32x8();
1148 let neg_one = _mm256_set1_epi32(-1).as_i32x8();
1149 let offsets = offsets.as_i32x8();
1150 let slice = slice as *const i8;
1151 macro_rules! call {
1152 ($imm8:expr) => {
1153 vpgatherdd(zero, slice, offsets, neg_one, $imm8)
1154 };
1155 }
1156 let r = constify_imm8!(scale, call);
1157 transmute(r)
1158 }
1159
1160 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1161 /// where
1162 /// `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
1163 /// element of `mask` is not set, the value from `src` at that position is used instead.
1164 ///
1165 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32gather_epi32)
1166 #[inline]
1167 #[target_feature(enable = "avx2")]
1168 #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))]
1169 #[rustc_args_required_const(4)]
1170 #[stable(feature = "simd_x86", since = "1.27.0")]
1171 pub unsafe fn _mm256_mask_i32gather_epi32(
1172 src: __m256i,
1173 slice: *const i32,
1174 offsets: __m256i,
1175 mask: __m256i,
1176 scale: i32,
1177 ) -> __m256i {
1178 let src = src.as_i32x8();
1179 let mask = mask.as_i32x8();
1180 let offsets = offsets.as_i32x8();
1181 let slice = slice as *const i8;
1182 macro_rules! call {
1183 ($imm8:expr) => {
1184 vpgatherdd(src, slice, offsets, mask, $imm8)
1185 };
1186 }
1187 let r = constify_imm8!(scale, call);
1188 transmute(r)
1189 }
1190
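// Illustrative sketch (not part of the upstream file): a masked gather that
// only touches memory for lanes whose mask element has its highest bit set;
// the remaining lanes keep the fallback value from `src`. The names used here
// are invented for the example, and the caller must ensure the unmasked
// indices are in bounds for `table`.
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn masked_gather_example(table: *const i32, indices: __m256i) -> __m256i {
    // Gather only the first four lanes; the last four stay zero.
    let mask = _mm256_setr_epi32(-1, -1, -1, -1, 0, 0, 0, 0);
    let fallback = _mm256_set1_epi32(0);
    _mm256_mask_i32gather_epi32(fallback, table, indices, mask, 4)
}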
1191 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1192 /// where
1193 /// `scale` should be 1, 2, 4 or 8.
1194 ///
1195 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32gather_ps)
1196 #[inline]
1197 #[target_feature(enable = "avx2")]
1198 #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))]
1199 #[rustc_args_required_const(2)]
1200 #[stable(feature = "simd_x86", since = "1.27.0")]
1201 pub unsafe fn _mm_i32gather_ps(slice: *const f32, offsets: __m128i, scale: i32) -> __m128 {
1202 let zero = _mm_setzero_ps();
1203 let neg_one = _mm_set1_ps(-1.0);
1204 let offsets = offsets.as_i32x4();
1205 let slice = slice as *const i8;
1206 macro_rules! call {
1207 ($imm8:expr) => {
1208 pgatherdps(zero, slice, offsets, neg_one, $imm8)
1209 };
1210 }
1211 constify_imm8!(scale, call)
1212 }
1213
1214 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1215 /// where
1216 /// `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
1217 /// element of `mask` is not set, the value from `src` at that position is used instead.
1218 ///
1219 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32gather_ps)
1220 #[inline]
1221 #[target_feature(enable = "avx2")]
1222 #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))]
1223 #[rustc_args_required_const(4)]
1224 #[stable(feature = "simd_x86", since = "1.27.0")]
1225 pub unsafe fn _mm_mask_i32gather_ps(
1226 src: __m128,
1227 slice: *const f32,
1228 offsets: __m128i,
1229 mask: __m128,
1230 scale: i32,
1231 ) -> __m128 {
1232 let offsets = offsets.as_i32x4();
1233 let slice = slice as *const i8;
1234 macro_rules! call {
1235 ($imm8:expr) => {
1236 pgatherdps(src, slice, offsets, mask, $imm8)
1237 };
1238 }
1239 constify_imm8!(scale, call)
1240 }
1241
1242 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1243 /// where
1244 /// `scale` should be 1, 2, 4 or 8.
1245 ///
1246 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32gather_ps)
1247 #[inline]
1248 #[target_feature(enable = "avx2")]
1249 #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))]
1250 #[rustc_args_required_const(2)]
1251 #[stable(feature = "simd_x86", since = "1.27.0")]
1252 pub unsafe fn _mm256_i32gather_ps(slice: *const f32, offsets: __m256i, scale: i32) -> __m256 {
1253 let zero = _mm256_setzero_ps();
1254 let neg_one = _mm256_set1_ps(-1.0);
1255 let offsets = offsets.as_i32x8();
1256 let slice = slice as *const i8;
1257 macro_rules! call {
1258 ($imm8:expr) => {
1259 vpgatherdps(zero, slice, offsets, neg_one, $imm8)
1260 };
1261 }
1262 constify_imm8!(scale, call)
1263 }
1264
1265 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1266 /// where
1267 /// `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
1268 /// element of `mask` is not set, the value from `src` at that position is used instead.
1269 ///
1270 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32gather_ps)
1271 #[inline]
1272 #[target_feature(enable = "avx2")]
1273 #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))]
1274 #[rustc_args_required_const(4)]
1275 #[stable(feature = "simd_x86", since = "1.27.0")]
1276 pub unsafe fn _mm256_mask_i32gather_ps(
1277 src: __m256,
1278 slice: *const f32,
1279 offsets: __m256i,
1280 mask: __m256,
1281 scale: i32,
1282 ) -> __m256 {
1283 let offsets = offsets.as_i32x8();
1284 let slice = slice as *const i8;
1285 macro_rules! call {
1286 ($imm8:expr) => {
1287 vpgatherdps(src, slice, offsets, mask, $imm8)
1288 };
1289 }
1290 constify_imm8!(scale, call)
1291 }
1292
1293 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1294 /// where
1295 /// `scale` should be 1, 2, 4 or 8.
1296 ///
1297 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32gather_epi64)
1298 #[inline]
1299 #[target_feature(enable = "avx2")]
1300 #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))]
1301 #[rustc_args_required_const(2)]
1302 #[stable(feature = "simd_x86", since = "1.27.0")]
1303 pub unsafe fn _mm_i32gather_epi64(slice: *const i64, offsets: __m128i, scale: i32) -> __m128i {
1304 let zero = _mm_setzero_si128().as_i64x2();
1305 let neg_one = _mm_set1_epi64x(-1).as_i64x2();
1306 let offsets = offsets.as_i32x4();
1307 let slice = slice as *const i8;
1308 macro_rules! call {
1309 ($imm8:expr) => {
1310 pgatherdq(zero, slice, offsets, neg_one, $imm8)
1311 };
1312 }
1313 let r = constify_imm8!(scale, call);
1314 transmute(r)
1315 }
1316
1317 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1318 /// where
1319 /// `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
1320 /// element of `mask` is not set, the value from `src` at that position is used instead.
1321 ///
1322 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32gather_epi64)
1323 #[inline]
1324 #[target_feature(enable = "avx2")]
1325 #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))]
1326 #[rustc_args_required_const(4)]
1327 #[stable(feature = "simd_x86", since = "1.27.0")]
1328 pub unsafe fn _mm_mask_i32gather_epi64(
1329 src: __m128i,
1330 slice: *const i64,
1331 offsets: __m128i,
1332 mask: __m128i,
1333 scale: i32,
1334 ) -> __m128i {
1335 let src = src.as_i64x2();
1336 let mask = mask.as_i64x2();
1337 let offsets = offsets.as_i32x4();
1338 let slice = slice as *const i8;
1339 macro_rules! call {
1340 ($imm8:expr) => {
1341 pgatherdq(src, slice, offsets, mask, $imm8)
1342 };
1343 }
1344 let r = constify_imm8!(scale, call);
1345 transmute(r)
1346 }
1347
1348 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1349 /// where
1350 /// `scale` should be 1, 2, 4 or 8.
1351 ///
1352 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32gather_epi64)
1353 #[inline]
1354 #[target_feature(enable = "avx2")]
1355 #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))]
1356 #[rustc_args_required_const(2)]
1357 #[stable(feature = "simd_x86", since = "1.27.0")]
1358 pub unsafe fn _mm256_i32gather_epi64(slice: *const i64, offsets: __m128i, scale: i32) -> __m256i {
1359 let zero = _mm256_setzero_si256().as_i64x4();
1360 let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
1361 let offsets = offsets.as_i32x4();
1362 let slice = slice as *const i8;
1363 macro_rules! call {
1364 ($imm8:expr) => {
1365 vpgatherdq(zero, slice, offsets, neg_one, $imm8)
1366 };
1367 }
1368 let r = constify_imm8!(scale, call);
1369 transmute(r)
1370 }
1371
1372 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1373 /// where
1374 /// `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
1375 /// element of `mask` is not set, the value from `src` at that position is used instead.
1376 ///
1377 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32gather_epi64)
1378 #[inline]
1379 #[target_feature(enable = "avx2")]
1380 #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))]
1381 #[rustc_args_required_const(4)]
1382 #[stable(feature = "simd_x86", since = "1.27.0")]
1383 pub unsafe fn _mm256_mask_i32gather_epi64(
1384 src: __m256i,
1385 slice: *const i64,
1386 offsets: __m128i,
1387 mask: __m256i,
1388 scale: i32,
1389 ) -> __m256i {
1390 let src = src.as_i64x4();
1391 let mask = mask.as_i64x4();
1392 let offsets = offsets.as_i32x4();
1393 let slice = slice as *const i8;
1394 macro_rules! call {
1395 ($imm8:expr) => {
1396 vpgatherdq(src, slice, offsets, mask, $imm8)
1397 };
1398 }
1399 let r = constify_imm8!(scale, call);
1400 transmute(r)
1401 }
1402
1403 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1404 /// where
1405 /// `scale` should be 1, 2, 4 or 8.
1406 ///
1407 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32gather_pd)
1408 #[inline]
1409 #[target_feature(enable = "avx2")]
1410 #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))]
1411 #[rustc_args_required_const(2)]
1412 #[stable(feature = "simd_x86", since = "1.27.0")]
1413 pub unsafe fn _mm_i32gather_pd(slice: *const f64, offsets: __m128i, scale: i32) -> __m128d {
1414 let zero = _mm_setzero_pd();
1415 let neg_one = _mm_set1_pd(-1.0);
1416 let offsets = offsets.as_i32x4();
1417 let slice = slice as *const i8;
1418 macro_rules! call {
1419 ($imm8:expr) => {
1420 pgatherdpd(zero, slice, offsets, neg_one, $imm8)
1421 };
1422 }
1423 constify_imm8!(scale, call)
1424 }
1425
1426 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1427 /// where
1428 /// `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
1429 /// element of `mask` is not set, the value from `src` at that position is used instead.
1430 ///
1431 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32gather_pd)
1432 #[inline]
1433 #[target_feature(enable = "avx2")]
1434 #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))]
1435 #[rustc_args_required_const(4)]
1436 #[stable(feature = "simd_x86", since = "1.27.0")]
1437 pub unsafe fn _mm_mask_i32gather_pd(
1438 src: __m128d,
1439 slice: *const f64,
1440 offsets: __m128i,
1441 mask: __m128d,
1442 scale: i32,
1443 ) -> __m128d {
1444 let offsets = offsets.as_i32x4();
1445 let slice = slice as *const i8;
1446 macro_rules! call {
1447 ($imm8:expr) => {
1448 pgatherdpd(src, slice, offsets, mask, $imm8)
1449 };
1450 }
1451 constify_imm8!(scale, call)
1452 }
1453
1454 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1455 /// where
1456 /// `scale` should be 1, 2, 4 or 8.
1457 ///
1458 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32gather_pd)
1459 #[inline]
1460 #[target_feature(enable = "avx2")]
1461 #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))]
1462 #[rustc_args_required_const(2)]
1463 #[stable(feature = "simd_x86", since = "1.27.0")]
1464 pub unsafe fn _mm256_i32gather_pd(slice: *const f64, offsets: __m128i, scale: i32) -> __m256d {
1465 let zero = _mm256_setzero_pd();
1466 let neg_one = _mm256_set1_pd(-1.0);
1467 let offsets = offsets.as_i32x4();
1468 let slice = slice as *const i8;
1469 macro_rules! call {
1470 ($imm8:expr) => {
1471 vpgatherdpd(zero, slice, offsets, neg_one, $imm8)
1472 };
1473 }
1474 constify_imm8!(scale, call)
1475 }
1476
1477 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1478 /// where
1479 /// `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
1480 /// element of `mask` is not set, the value from `src` at that position is used instead.
1481 ///
1482 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32gather_pd)
1483 #[inline]
1484 #[target_feature(enable = "avx2")]
1485 #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))]
1486 #[rustc_args_required_const(4)]
1487 #[stable(feature = "simd_x86", since = "1.27.0")]
1488 pub unsafe fn _mm256_mask_i32gather_pd(
1489 src: __m256d,
1490 slice: *const f64,
1491 offsets: __m128i,
1492 mask: __m256d,
1493 scale: i32,
1494 ) -> __m256d {
1495 let offsets = offsets.as_i32x4();
1496 let slice = slice as *const i8;
1497 macro_rules! call {
1498 ($imm8:expr) => {
1499 vpgatherdpd(src, slice, offsets, mask, $imm8)
1500 };
1501 }
1502 constify_imm8!(scale, call)
1503 }
1504
1505 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1506 /// where
1507 /// `scale` should be 1, 2, 4 or 8.
1508 ///
1509 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64gather_epi32)
1510 #[inline]
1511 #[target_feature(enable = "avx2")]
1512 #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))]
1513 #[rustc_args_required_const(2)]
1514 #[stable(feature = "simd_x86", since = "1.27.0")]
1515 pub unsafe fn _mm_i64gather_epi32(slice: *const i32, offsets: __m128i, scale: i32) -> __m128i {
1516 let zero = _mm_setzero_si128().as_i32x4();
1517 let neg_one = _mm_set1_epi64x(-1).as_i32x4();
1518 let offsets = offsets.as_i64x2();
1519 let slice = slice as *const i8;
1520 macro_rules! call {
1521 ($imm8:expr) => {
1522 pgatherqd(zero, slice, offsets, neg_one, $imm8)
1523 };
1524 }
1525 let r = constify_imm8!(scale, call);
1526 transmute(r)
1527 }
1528
1529 /// Returns values from `slice` at byte offsets determined by `offsets * scale`,
1530 /// where `scale` is 1, 2, 4, or 8. For positions whose `mask` element does not
1531 /// have its highest bit set, the value is copied from `src` instead of being
1532 /// loaded from memory.
1533 ///
1534 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64gather_epi32)
1535 #[inline]
1536 #[target_feature(enable = "avx2")]
1537 #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))]
1538 #[rustc_args_required_const(4)]
1539 #[stable(feature = "simd_x86", since = "1.27.0")]
1540 pub unsafe fn _mm_mask_i64gather_epi32(
1541 src: __m128i,
1542 slice: *const i32,
1543 offsets: __m128i,
1544 mask: __m128i,
1545 scale: i32,
1546 ) -> __m128i {
1547 let src = src.as_i32x4();
1548 let mask = mask.as_i32x4();
1549 let offsets = offsets.as_i64x2();
1550 let slice = slice as *const i8;
1551 macro_rules! call {
1552 ($imm8:expr) => {
1553 pgatherqd(src, slice, offsets, mask, $imm8)
1554 };
1555 }
1556 let r = constify_imm8!(scale, call);
1557 transmute(r)
1558 }
1559
1560 /// Returns values from `slice` at byte offsets determined by `offsets * scale`,
1561 /// where `scale` is 1, 2, 4, or 8.
1563 ///
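/// A short illustrative sketch (not from Intel's documentation) showing the
/// four 64-bit offsets producing a 128-bit result of 32-bit values:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let data = [100_i32, 101, 102, 103, 104, 105, 106, 107];
/// let offsets = _mm256_setr_epi64x(0, 3, 5, 7);
/// // `scale` is 4 because each offset indexes a 4-byte `i32`.
/// let r = _mm256_i64gather_epi32(data.as_ptr(), offsets, 4);
///
/// let mut out = [0_i32; 4];
/// _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, r);
/// assert_eq!(out, [100, 103, 105, 107]);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///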
1564 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64gather_epi32)
1565 #[inline]
1566 #[target_feature(enable = "avx2")]
1567 #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))]
1568 #[rustc_args_required_const(2)]
1569 #[stable(feature = "simd_x86", since = "1.27.0")]
1570 pub unsafe fn _mm256_i64gather_epi32(slice: *const i32, offsets: __m256i, scale: i32) -> __m128i {
1571 let zero = _mm_setzero_si128().as_i32x4();
1572 let neg_one = _mm_set1_epi64x(-1).as_i32x4();
1573 let offsets = offsets.as_i64x4();
1574 let slice = slice as *const i8;
1575 macro_rules! call {
1576 ($imm8:expr) => {
1577 vpgatherqd(zero, slice, offsets, neg_one, $imm8)
1578 };
1579 }
1580 let r = constify_imm8!(scale, call);
1581 transmute(r)
1582 }
1583
1584 /// Returns values from `slice` at byte offsets determined by `offsets * scale`,
1585 /// where `scale` is 1, 2, 4, or 8. For positions whose `mask` element does not
1586 /// have its highest bit set, the value is copied from `src` instead of being
1587 /// loaded from memory.
1588 ///
1589 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64gather_epi32)
1590 #[inline]
1591 #[target_feature(enable = "avx2")]
1592 #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))]
1593 #[rustc_args_required_const(4)]
1594 #[stable(feature = "simd_x86", since = "1.27.0")]
1595 pub unsafe fn _mm256_mask_i64gather_epi32(
1596 src: __m128i,
1597 slice: *const i32,
1598 offsets: __m256i,
1599 mask: __m128i,
1600 scale: i32,
1601 ) -> __m128i {
1602 let src = src.as_i32x4();
1603 let mask = mask.as_i32x4();
1604 let offsets = offsets.as_i64x4();
1605 let slice = slice as *const i8;
1606 macro_rules! call {
1607 ($imm8:expr) => {
1608 vpgatherqd(src, slice, offsets, mask, $imm8)
1609 };
1610 }
1611 let r = constify_imm8!(scale, call);
1612 transmute(r)
1613 }
1614
1615 /// Returns values from `slice` at byte offsets determined by `offsets * scale`,
1616 /// where `scale` is 1, 2, 4, or 8.
1618 ///
1619 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64gather_ps)
1620 #[inline]
1621 #[target_feature(enable = "avx2")]
1622 #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))]
1623 #[rustc_args_required_const(2)]
1624 #[stable(feature = "simd_x86", since = "1.27.0")]
1625 pub unsafe fn _mm_i64gather_ps(slice: *const f32, offsets: __m128i, scale: i32) -> __m128 {
1626 let zero = _mm_setzero_ps();
1627 let neg_one = _mm_set1_ps(-1.0);
1628 let offsets = offsets.as_i64x2();
1629 let slice = slice as *const i8;
1630 macro_rules! call {
1631 ($imm8:expr) => {
1632 pgatherqps(zero, slice, offsets, neg_one, $imm8)
1633 };
1634 }
1635 constify_imm8!(scale, call)
1636 }
1637
1638 /// Returns values from `slice` at byte offsets determined by `offsets * scale`,
1639 /// where `scale` is 1, 2, 4, or 8. For positions whose `mask` element does not
1640 /// have its highest bit set, the value is copied from `src` instead of being
1641 /// loaded from memory.
1642 ///
1643 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64gather_ps)
1644 #[inline]
1645 #[target_feature(enable = "avx2")]
1646 #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))]
1647 #[rustc_args_required_const(4)]
1648 #[stable(feature = "simd_x86", since = "1.27.0")]
1649 pub unsafe fn _mm_mask_i64gather_ps(
1650 src: __m128,
1651 slice: *const f32,
1652 offsets: __m128i,
1653 mask: __m128,
1654 scale: i32,
1655 ) -> __m128 {
1656 let offsets = offsets.as_i64x2();
1657 let slice = slice as *const i8;
1658 macro_rules! call {
1659 ($imm8:expr) => {
1660 pgatherqps(src, slice, offsets, mask, $imm8)
1661 };
1662 }
1663 constify_imm8!(scale, call)
1664 }
1665
1666 /// Returns values from `slice` at byte offsets determined by `offsets * scale`,
1667 /// where `scale` is 1, 2, 4, or 8.
1669 ///
1670 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64gather_ps)
1671 #[inline]
1672 #[target_feature(enable = "avx2")]
1673 #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))]
1674 #[rustc_args_required_const(2)]
1675 #[stable(feature = "simd_x86", since = "1.27.0")]
1676 pub unsafe fn _mm256_i64gather_ps(slice: *const f32, offsets: __m256i, scale: i32) -> __m128 {
1677 let zero = _mm_setzero_ps();
1678 let neg_one = _mm_set1_ps(-1.0);
1679 let offsets = offsets.as_i64x4();
1680 let slice = slice as *const i8;
1681 macro_rules! call {
1682 ($imm8:expr) => {
1683 vpgatherqps(zero, slice, offsets, neg_one, $imm8)
1684 };
1685 }
1686 constify_imm8!(scale, call)
1687 }
1688
1689 /// Returns values from `slice` at byte offsets determined by `offsets * scale`,
1690 /// where `scale` is 1, 2, 4, or 8. For positions whose `mask` element does not
1691 /// have its highest bit set, the value is copied from `src` instead of being
1692 /// loaded from memory.
1693 ///
1694 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64gather_ps)
1695 #[inline]
1696 #[target_feature(enable = "avx2")]
1697 #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))]
1698 #[rustc_args_required_const(4)]
1699 #[stable(feature = "simd_x86", since = "1.27.0")]
1700 pub unsafe fn _mm256_mask_i64gather_ps(
1701 src: __m128,
1702 slice: *const f32,
1703 offsets: __m256i,
1704 mask: __m128,
1705 scale: i32,
1706 ) -> __m128 {
1707 let offsets = offsets.as_i64x4();
1708 let slice = slice as *const i8;
1709 macro_rules! call {
1710 ($imm8:expr) => {
1711 vpgatherqps(src, slice, offsets, mask, $imm8)
1712 };
1713 }
1714 constify_imm8!(scale, call)
1715 }
1716
1717 /// Returns values from `slice` at byte offsets determined by `offsets * scale`,
1718 /// where `scale` is 1, 2, 4, or 8.
1720 ///
1721 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64gather_epi64)
1722 #[inline]
1723 #[target_feature(enable = "avx2")]
1724 #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))]
1725 #[rustc_args_required_const(2)]
1726 #[stable(feature = "simd_x86", since = "1.27.0")]
1727 pub unsafe fn _mm_i64gather_epi64(slice: *const i64, offsets: __m128i, scale: i32) -> __m128i {
1728 let zero = _mm_setzero_si128().as_i64x2();
1729 let neg_one = _mm_set1_epi64x(-1).as_i64x2();
1730 let slice = slice as *const i8;
1731 let offsets = offsets.as_i64x2();
1732 macro_rules! call {
1733 ($imm8:expr) => {
1734 pgatherqq(zero, slice, offsets, neg_one, $imm8)
1735 };
1736 }
1737 let r = constify_imm8!(scale, call);
1738 transmute(r)
1739 }
1740
1741 /// Returns values from `slice` at byte offsets determined by `offsets * scale`,
1742 /// where `scale` is 1, 2, 4, or 8. For positions whose `mask` element does not
1743 /// have its highest bit set, the value is copied from `src` instead of being
1744 /// loaded from memory.
1745 ///
1746 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64gather_epi64)
1747 #[inline]
1748 #[target_feature(enable = "avx2")]
1749 #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))]
1750 #[rustc_args_required_const(4)]
1751 #[stable(feature = "simd_x86", since = "1.27.0")]
1752 pub unsafe fn _mm_mask_i64gather_epi64(
1753 src: __m128i,
1754 slice: *const i64,
1755 offsets: __m128i,
1756 mask: __m128i,
1757 scale: i32,
1758 ) -> __m128i {
1759 let src = src.as_i64x2();
1760 let mask = mask.as_i64x2();
1761 let offsets = offsets.as_i64x2();
1762 let slice = slice as *const i8;
1763 macro_rules! call {
1764 ($imm8:expr) => {
1765 pgatherqq(src, slice, offsets, mask, $imm8)
1766 };
1767 }
1768 let r = constify_imm8!(scale, call);
1769 transmute(r)
1770 }
1771
1772 /// Returns values from `slice` at byte offsets determined by `offsets * scale`,
1773 /// where `scale` is 1, 2, 4, or 8.
1775 ///
1776 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64gather_epi64)
1777 #[inline]
1778 #[target_feature(enable = "avx2")]
1779 #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))]
1780 #[rustc_args_required_const(2)]
1781 #[stable(feature = "simd_x86", since = "1.27.0")]
1782 pub unsafe fn _mm256_i64gather_epi64(slice: *const i64, offsets: __m256i, scale: i32) -> __m256i {
1783 let zero = _mm256_setzero_si256().as_i64x4();
1784 let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
1785 let slice = slice as *const i8;
1786 let offsets = offsets.as_i64x4();
1787 macro_rules! call {
1788 ($imm8:expr) => {
1789 vpgatherqq(zero, slice, offsets, neg_one, $imm8)
1790 };
1791 }
1792 let r = constify_imm8!(scale, call);
1793 transmute(r)
1794 }
1795
1796 /// Returns values from `slice` at byte offsets determined by `offsets * scale`,
1797 /// where `scale` is 1, 2, 4, or 8. For positions whose `mask` element does not
1798 /// have its highest bit set, the value is copied from `src` instead of being
1799 /// loaded from memory.
1800 ///
1801 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64gather_epi64)
1802 #[inline]
1803 #[target_feature(enable = "avx2")]
1804 #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))]
1805 #[rustc_args_required_const(4)]
1806 #[stable(feature = "simd_x86", since = "1.27.0")]
1807 pub unsafe fn _mm256_mask_i64gather_epi64(
1808 src: __m256i,
1809 slice: *const i64,
1810 offsets: __m256i,
1811 mask: __m256i,
1812 scale: i32,
1813 ) -> __m256i {
1814 let src = src.as_i64x4();
1815 let mask = mask.as_i64x4();
1816 let offsets = offsets.as_i64x4();
1817 let slice = slice as *const i8;
1818 macro_rules! call {
1819 ($imm8:expr) => {
1820 vpgatherqq(src, slice, offsets, mask, $imm8)
1821 };
1822 }
1823 let r = constify_imm8!(scale, call);
1824 transmute(r)
1825 }
1826
1827 /// Returns values from `slice` at byte offsets determined by `offsets * scale`,
1828 /// where `scale` is 1, 2, 4, or 8.
1830 ///
1831 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64gather_pd)
1832 #[inline]
1833 #[target_feature(enable = "avx2")]
1834 #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))]
1835 #[rustc_args_required_const(2)]
1836 #[stable(feature = "simd_x86", since = "1.27.0")]
1837 pub unsafe fn _mm_i64gather_pd(slice: *const f64, offsets: __m128i, scale: i32) -> __m128d {
1838 let zero = _mm_setzero_pd();
1839 let neg_one = _mm_set1_pd(-1.0);
1840 let slice = slice as *const i8;
1841 let offsets = offsets.as_i64x2();
1842 macro_rules! call {
1843 ($imm8:expr) => {
1844 pgatherqpd(zero, slice, offsets, neg_one, $imm8)
1845 };
1846 }
1847 constify_imm8!(scale, call)
1848 }
1849
1850 /// Returns values from `slice` at byte offsets determined by `offsets * scale`,
1851 /// where `scale` is 1, 2, 4, or 8. For positions whose `mask` element does not
1852 /// have its highest bit set, the value is copied from `src` instead of being
1853 /// loaded from memory.
1854 ///
1855 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64gather_pd)
1856 #[inline]
1857 #[target_feature(enable = "avx2")]
1858 #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))]
1859 #[rustc_args_required_const(4)]
1860 #[stable(feature = "simd_x86", since = "1.27.0")]
1861 pub unsafe fn _mm_mask_i64gather_pd(
1862 src: __m128d,
1863 slice: *const f64,
1864 offsets: __m128i,
1865 mask: __m128d,
1866 scale: i32,
1867 ) -> __m128d {
1868 let slice = slice as *const i8;
1869 let offsets = offsets.as_i64x2();
1870 macro_rules! call {
1871 ($imm8:expr) => {
1872 pgatherqpd(src, slice, offsets, mask, $imm8)
1873 };
1874 }
1875 constify_imm8!(scale, call)
1876 }
1877
1878 /// Returns values from `slice` at byte offsets determined by `offsets * scale`,
1879 /// where `scale` is 1, 2, 4, or 8.
1881 ///
1882 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64gather_pd)
1883 #[inline]
1884 #[target_feature(enable = "avx2")]
1885 #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))]
1886 #[rustc_args_required_const(2)]
1887 #[stable(feature = "simd_x86", since = "1.27.0")]
1888 pub unsafe fn _mm256_i64gather_pd(slice: *const f64, offsets: __m256i, scale: i32) -> __m256d {
1889 let zero = _mm256_setzero_pd();
1890 let neg_one = _mm256_set1_pd(-1.0);
1891 let slice = slice as *const i8;
1892 let offsets = offsets.as_i64x4();
1893 macro_rules! call {
1894 ($imm8:expr) => {
1895 vpgatherqpd(zero, slice, offsets, neg_one, $imm8)
1896 };
1897 }
1898 constify_imm8!(scale, call)
1899 }
1900
1901 /// Returns values from `slice` at byte offsets determined by `offsets * scale`,
1902 /// where `scale` is 1, 2, 4, or 8. For positions whose `mask` element does not
1903 /// have its highest bit set, the value is copied from `src` instead of being
1904 /// loaded from memory.
1905 ///
1906 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64gather_pd)
1907 #[inline]
1908 #[target_feature(enable = "avx2")]
1909 #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))]
1910 #[rustc_args_required_const(4)]
1911 #[stable(feature = "simd_x86", since = "1.27.0")]
1912 pub unsafe fn _mm256_mask_i64gather_pd(
1913 src: __m256d,
1914 slice: *const f64,
1915 offsets: __m256i,
1916 mask: __m256d,
1917 scale: i32,
1918 ) -> __m256d {
1919 let slice = slice as *const i8;
1920 let offsets = offsets.as_i64x4();
1921 macro_rules! call {
1922 ($imm8:expr) => {
1923 vpgatherqpd(src, slice, offsets, mask, $imm8)
1924 };
1925 }
1926 constify_imm8!(scale, call)
1927 }
1928
1929 /// Copies `a` to `dst`, then inserts 128 bits (of integer data) from `b` at the
1930 /// location specified by `imm8`.
1931 ///
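/// A usage sketch (added for illustration, not from Intel's documentation):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_setr_epi64x(0, 1, 2, 3);
/// let b = _mm_set_epi64x(11, 10); // highest element first
/// // `imm8 = 1` replaces the upper 128-bit half of `a` with `b`.
/// let r = _mm256_inserti128_si256(a, b, 1);
///
/// let mut out = [0_i64; 4];
/// _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
/// assert_eq!(out, [0, 1, 10, 11]);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///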
1932 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_inserti128_si256)
1933 #[inline]
1934 #[target_feature(enable = "avx2")]
1935 #[cfg_attr(
1936 all(test, not(target_os = "windows")),
1937 assert_instr(vinsertf128, imm8 = 1)
1938 )]
1939 #[rustc_args_required_const(2)]
1940 #[stable(feature = "simd_x86", since = "1.27.0")]
1941 pub unsafe fn _mm256_inserti128_si256(a: __m256i, b: __m128i, imm8: i32) -> __m256i {
1942 let a = a.as_i64x4();
1943 let b = _mm256_castsi128_si256(b).as_i64x4();
1944 let dst: i64x4 = match imm8 & 0b01 {
1945 0 => simd_shuffle4(a, b, [4, 5, 2, 3]),
1946 _ => simd_shuffle4(a, b, [0, 1, 4, 5]),
1947 };
1948 transmute(dst)
1949 }
1950
1951 /// Multiplies packed signed 16-bit integers in `a` and `b`, producing
1952 /// intermediate signed 32-bit integers. Horizontally adds adjacent pairs
1953 /// of intermediate 32-bit integers.
1954 ///
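/// A small illustrative sketch (not part of Intel's documentation):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_set1_epi16(2);
/// let b = _mm256_set1_epi16(3);
/// // Each 32-bit result is 2 * 3 + 2 * 3 = 12.
/// let r = _mm256_madd_epi16(a, b);
///
/// let mut out = [0_i32; 8];
/// _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
/// assert_eq!(out, [12; 8]);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///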
1955 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_madd_epi16)
1956 #[inline]
1957 #[target_feature(enable = "avx2")]
1958 #[cfg_attr(test, assert_instr(vpmaddwd))]
1959 #[stable(feature = "simd_x86", since = "1.27.0")]
1960 pub unsafe fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i {
1961 transmute(pmaddwd(a.as_i16x16(), b.as_i16x16()))
1962 }
1963
1964 /// Vertically multiplies each unsigned 8-bit integer from `a` with the
1965 /// corresponding signed 8-bit integer from `b`, producing intermediate
1966 /// signed 16-bit integers. Horizontally adds adjacent pairs of intermediate
1967 /// signed 16-bit integers.
1968 ///
1969 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maddubs_epi16)
1970 #[inline]
1971 #[target_feature(enable = "avx2")]
1972 #[cfg_attr(test, assert_instr(vpmaddubsw))]
1973 #[stable(feature = "simd_x86", since = "1.27.0")]
1974 pub unsafe fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i {
1975 transmute(pmaddubsw(a.as_u8x32(), b.as_u8x32()))
1976 }
1977
1978 /// Loads packed 32-bit integers from memory pointed to by `mem_addr` using `mask`
1979 /// (elements are zeroed out when the highest bit is not set in the
1980 /// corresponding element).
1981 ///
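/// A usage sketch (illustrative only, not from Intel's documentation):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let data = [1_i32, 2, 3, 4];
/// // Only lanes whose mask element has its highest bit set are loaded.
/// let mask = _mm_setr_epi32(-1, 0, -1, 0);
/// let r = _mm_maskload_epi32(data.as_ptr(), mask);
///
/// let mut out = [0_i32; 4];
/// _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, r);
/// assert_eq!(out, [1, 0, 3, 0]);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///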
1982 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskload_epi32)
1983 #[inline]
1984 #[target_feature(enable = "avx2")]
1985 #[cfg_attr(test, assert_instr(vpmaskmovd))]
1986 #[stable(feature = "simd_x86", since = "1.27.0")]
1987 pub unsafe fn _mm_maskload_epi32(mem_addr: *const i32, mask: __m128i) -> __m128i {
1988 transmute(maskloadd(mem_addr as *const i8, mask.as_i32x4()))
1989 }
1990
1991 /// Loads packed 32-bit integers from memory pointed to by `mem_addr` using `mask`
1992 /// (elements are zeroed out when the highest bit is not set in the
1993 /// corresponding element).
1994 ///
1995 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskload_epi32)
1996 #[inline]
1997 #[target_feature(enable = "avx2")]
1998 #[cfg_attr(test, assert_instr(vpmaskmovd))]
1999 #[stable(feature = "simd_x86", since = "1.27.0")]
2000 pub unsafe fn _mm256_maskload_epi32(mem_addr: *const i32, mask: __m256i) -> __m256i {
2001 transmute(maskloadd256(mem_addr as *const i8, mask.as_i32x8()))
2002 }
2003
2004 /// Loads packed 64-bit integers from memory pointed to by `mem_addr` using `mask`
2005 /// (elements are zeroed out when the highest bit is not set in the
2006 /// corresponding element).
2007 ///
2008 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskload_epi64)
2009 #[inline]
2010 #[target_feature(enable = "avx2")]
2011 #[cfg_attr(test, assert_instr(vpmaskmovq))]
2012 #[stable(feature = "simd_x86", since = "1.27.0")]
2013 pub unsafe fn _mm_maskload_epi64(mem_addr: *const i64, mask: __m128i) -> __m128i {
2014 transmute(maskloadq(mem_addr as *const i8, mask.as_i64x2()))
2015 }
2016
2017 /// Loads packed 64-bit integers from memory pointed to by `mem_addr` using `mask`
2018 /// (elements are zeroed out when the highest bit is not set in the
2019 /// corresponding element).
2020 ///
2021 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskload_epi64)
2022 #[inline]
2023 #[target_feature(enable = "avx2")]
2024 #[cfg_attr(test, assert_instr(vpmaskmovq))]
2025 #[stable(feature = "simd_x86", since = "1.27.0")]
2026 pub unsafe fn _mm256_maskload_epi64(mem_addr: *const i64, mask: __m256i) -> __m256i {
2027 transmute(maskloadq256(mem_addr as *const i8, mask.as_i64x4()))
2028 }
2029
2030 /// Stores packed 32-bit integers from `a` into memory pointed to by `mem_addr`
2031 /// using `mask` (elements are not stored when the highest bit is not set
2032 /// in the corresponding element).
2033 ///
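/// An illustrative sketch of typical usage (not part of Intel's
/// documentation):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm_setr_epi32(1, 2, 3, 4);
/// let mask = _mm_setr_epi32(-1, 0, 0, -1);
/// let mut out = [9_i32; 4];
/// // Only lanes whose mask element has its highest bit set are written.
/// _mm_maskstore_epi32(out.as_mut_ptr(), mask, a);
/// assert_eq!(out, [1, 9, 9, 4]);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///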
2034 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskstore_epi32)
2035 #[inline]
2036 #[target_feature(enable = "avx2")]
2037 #[cfg_attr(test, assert_instr(vpmaskmovd))]
2038 #[stable(feature = "simd_x86", since = "1.27.0")]
2039 pub unsafe fn _mm_maskstore_epi32(mem_addr: *mut i32, mask: __m128i, a: __m128i) {
2040 maskstored(mem_addr as *mut i8, mask.as_i32x4(), a.as_i32x4())
2041 }
2042
2043 /// Stores packed 32-bit integers from `a` into memory pointed to by `mem_addr`
2044 /// using `mask` (elements are not stored when the highest bit is not set
2045 /// in the corresponding element).
2046 ///
2047 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskstore_epi32)
2048 #[inline]
2049 #[target_feature(enable = "avx2")]
2050 #[cfg_attr(test, assert_instr(vpmaskmovd))]
2051 #[stable(feature = "simd_x86", since = "1.27.0")]
2052 pub unsafe fn _mm256_maskstore_epi32(mem_addr: *mut i32, mask: __m256i, a: __m256i) {
2053 maskstored256(mem_addr as *mut i8, mask.as_i32x8(), a.as_i32x8())
2054 }
2055
2056 /// Stores packed 64-bit integers from `a` into memory pointed to by `mem_addr`
2057 /// using `mask` (elements are not stored when the highest bit is not set
2058 /// in the corresponding element).
2059 ///
2060 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskstore_epi64)
2061 #[inline]
2062 #[target_feature(enable = "avx2")]
2063 #[cfg_attr(test, assert_instr(vpmaskmovq))]
2064 #[stable(feature = "simd_x86", since = "1.27.0")]
2065 pub unsafe fn _mm_maskstore_epi64(mem_addr: *mut i64, mask: __m128i, a: __m128i) {
2066 maskstoreq(mem_addr as *mut i8, mask.as_i64x2(), a.as_i64x2())
2067 }
2068
2069 /// Stores packed 64-bit integers from `a` into memory pointed to by `mem_addr`
2070 /// using `mask` (elements are not stored when the highest bit is not set
2071 /// in the corresponding element).
2072 ///
2073 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskstore_epi64)
2074 #[inline]
2075 #[target_feature(enable = "avx2")]
2076 #[cfg_attr(test, assert_instr(vpmaskmovq))]
2077 #[stable(feature = "simd_x86", since = "1.27.0")]
2078 pub unsafe fn _mm256_maskstore_epi64(mem_addr: *mut i64, mask: __m256i, a: __m256i) {
2079 maskstoreq256(mem_addr as *mut i8, mask.as_i64x4(), a.as_i64x4())
2080 }
2081
2082 /// Compares packed 16-bit integers in `a` and `b`, and returns the packed
2083 /// maximum values.
2084 ///
2085 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epi16)
2086 #[inline]
2087 #[target_feature(enable = "avx2")]
2088 #[cfg_attr(test, assert_instr(vpmaxsw))]
2089 #[stable(feature = "simd_x86", since = "1.27.0")]
2090 pub unsafe fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i {
2091 transmute(pmaxsw(a.as_i16x16(), b.as_i16x16()))
2092 }
2093
2094 /// Compares packed 32-bit integers in `a` and `b`, and returns the packed
2095 /// maximum values.
2096 ///
2097 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epi32)
2098 #[inline]
2099 #[target_feature(enable = "avx2")]
2100 #[cfg_attr(test, assert_instr(vpmaxsd))]
2101 #[stable(feature = "simd_x86", since = "1.27.0")]
2102 pub unsafe fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i {
2103 transmute(pmaxsd(a.as_i32x8(), b.as_i32x8()))
2104 }
2105
2106 /// Compares packed 8-bit integers in `a` and `b`, and returns the packed
2107 /// maximum values.
2108 ///
2109 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epi8)
2110 #[inline]
2111 #[target_feature(enable = "avx2")]
2112 #[cfg_attr(test, assert_instr(vpmaxsb))]
2113 #[stable(feature = "simd_x86", since = "1.27.0")]
2114 pub unsafe fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i {
2115 transmute(pmaxsb(a.as_i8x32(), b.as_i8x32()))
2116 }
2117
2118 /// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
2119 /// the packed maximum values.
2120 ///
2121 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epu16)
2122 #[inline]
2123 #[target_feature(enable = "avx2")]
2124 #[cfg_attr(test, assert_instr(vpmaxuw))]
2125 #[stable(feature = "simd_x86", since = "1.27.0")]
2126 pub unsafe fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i {
2127 transmute(pmaxuw(a.as_u16x16(), b.as_u16x16()))
2128 }
2129
2130 /// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
2131 /// the packed maximum values.
2132 ///
2133 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epu32)
2134 #[inline]
2135 #[target_feature(enable = "avx2")]
2136 #[cfg_attr(test, assert_instr(vpmaxud))]
2137 #[stable(feature = "simd_x86", since = "1.27.0")]
2138 pub unsafe fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i {
2139 transmute(pmaxud(a.as_u32x8(), b.as_u32x8()))
2140 }
2141
2142 /// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
2143 /// the packed maximum values.
2144 ///
2145 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epu8)
2146 #[inline]
2147 #[target_feature(enable = "avx2")]
2148 #[cfg_attr(test, assert_instr(vpmaxub))]
2149 #[stable(feature = "simd_x86", since = "1.27.0")]
2150 pub unsafe fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i {
2151 transmute(pmaxub(a.as_u8x32(), b.as_u8x32()))
2152 }
2153
2154 /// Compares packed 16-bit integers in `a` and `b`, and returns the packed
2155 /// minimum values.
2156 ///
2157 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epi16)
2158 #[inline]
2159 #[target_feature(enable = "avx2")]
2160 #[cfg_attr(test, assert_instr(vpminsw))]
2161 #[stable(feature = "simd_x86", since = "1.27.0")]
2162 pub unsafe fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i {
2163 transmute(pminsw(a.as_i16x16(), b.as_i16x16()))
2164 }
2165
2166 /// Compares packed 32-bit integers in `a` and `b`, and returns the packed
2167 /// minimum values.
2168 ///
2169 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epi32)
2170 #[inline]
2171 #[target_feature(enable = "avx2")]
2172 #[cfg_attr(test, assert_instr(vpminsd))]
2173 #[stable(feature = "simd_x86", since = "1.27.0")]
2174 pub unsafe fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i {
2175 transmute(pminsd(a.as_i32x8(), b.as_i32x8()))
2176 }
2177
2178 /// Compares packed 8-bit integers in `a` and `b`, and returns the packed
2179 /// minimum values.
2180 ///
2181 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epi8)
2182 #[inline]
2183 #[target_feature(enable = "avx2")]
2184 #[cfg_attr(test, assert_instr(vpminsb))]
2185 #[stable(feature = "simd_x86", since = "1.27.0")]
2186 pub unsafe fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i {
2187 transmute(pminsb(a.as_i8x32(), b.as_i8x32()))
2188 }
2189
2190 /// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
2191 /// the packed minimum values.
2192 ///
2193 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epu16)
2194 #[inline]
2195 #[target_feature(enable = "avx2")]
2196 #[cfg_attr(test, assert_instr(vpminuw))]
2197 #[stable(feature = "simd_x86", since = "1.27.0")]
2198 pub unsafe fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i {
2199 transmute(pminuw(a.as_u16x16(), b.as_u16x16()))
2200 }
2201
2202 /// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
2203 /// the packed minimum values.
2204 ///
2205 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epu32)
2206 #[inline]
2207 #[target_feature(enable = "avx2")]
2208 #[cfg_attr(test, assert_instr(vpminud))]
2209 #[stable(feature = "simd_x86", since = "1.27.0")]
2210 pub unsafe fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i {
2211 transmute(pminud(a.as_u32x8(), b.as_u32x8()))
2212 }
2213
2214 /// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
2215 /// the packed minimum values.
2216 ///
2217 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epu8)
2218 #[inline]
2219 #[target_feature(enable = "avx2")]
2220 #[cfg_attr(test, assert_instr(vpminub))]
2221 #[stable(feature = "simd_x86", since = "1.27.0")]
2222 pub unsafe fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i {
2223 transmute(pminub(a.as_u8x32(), b.as_u8x32()))
2224 }
2225
2226 /// Creates a mask from the most significant bit of each 8-bit element in `a`
2227 /// and returns the result.
2228 ///
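/// A brief usage sketch (illustrative, not from Intel's documentation):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// // All 32 sign bits set -> all 32 mask bits set.
/// let a = _mm256_set1_epi8(-1);
/// assert_eq!(_mm256_movemask_epi8(a), !0);
/// // No sign bit set -> empty mask.
/// let b = _mm256_set1_epi8(0);
/// assert_eq!(_mm256_movemask_epi8(b), 0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///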
2229 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_movemask_epi8)
2230 #[inline]
2231 #[target_feature(enable = "avx2")]
2232 #[cfg_attr(test, assert_instr(vpmovmskb))]
2233 #[stable(feature = "simd_x86", since = "1.27.0")]
2234 pub unsafe fn _mm256_movemask_epi8(a: __m256i) -> i32 {
2235 pmovmskb(a.as_i8x32())
2236 }
2237
2238 /// Computes the sum of absolute differences (SADs) of quadruplets of unsigned
2239 /// 8-bit integers in `a` compared to those in `b`, and stores the 16-bit
2240 /// results in dst. Eight SADs are performed for each 128-bit lane using one
2241 /// quadruplet from `b` and eight quadruplets from `a`. One quadruplet is
2242 /// selected from `b` starting at the offset specified in `imm8`. Eight
2243 /// quadruplets are formed from sequential 8-bit integers selected from `a`
2244 /// starting at the offset specified in `imm8`.
2245 ///
2246 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mpsadbw_epu8)
2247 #[inline]
2248 #[target_feature(enable = "avx2")]
2249 #[cfg_attr(test, assert_instr(vmpsadbw, imm8 = 0))]
2250 #[rustc_args_required_const(2)]
2251 #[stable(feature = "simd_x86", since = "1.27.0")]
2252 pub unsafe fn _mm256_mpsadbw_epu8(a: __m256i, b: __m256i, imm8: i32) -> __m256i {
2253 let a = a.as_u8x32();
2254 let b = b.as_u8x32();
2255 macro_rules! call {
2256 ($imm8:expr) => {
2257 mpsadbw(a, b, $imm8)
2258 };
2259 }
2260 let r = constify_imm8!(imm8, call);
2261 transmute(r)
2262 }
2263
2264 /// Multiplies the low 32-bit integers from each packed 64-bit element in
2265 /// `a` and `b`
2266 ///
2267 /// Returns the 64-bit results.
2268 ///
2269 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_epi32)
2270 #[inline]
2271 #[target_feature(enable = "avx2")]
2272 #[cfg_attr(test, assert_instr(vpmuldq))]
2273 #[stable(feature = "simd_x86", since = "1.27.0")]
2274 pub unsafe fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i {
2275 transmute(pmuldq(a.as_i32x8(), b.as_i32x8()))
2276 }
2277
2278 /// Multiplies the low unsigned 32-bit integers from each packed 64-bit
2279 /// element in `a` and `b`
2280 ///
2281 /// Returns the unsigned 64-bit results.
2282 ///
2283 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_epu32)
2284 #[inline]
2285 #[target_feature(enable = "avx2")]
2286 #[cfg_attr(test, assert_instr(vpmuludq))]
2287 #[stable(feature = "simd_x86", since = "1.27.0")]
2288 pub unsafe fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i {
2289 transmute(pmuludq(a.as_u32x8(), b.as_u32x8()))
2290 }
2291
2292 /// Multiplies the packed 16-bit integers in `a` and `b`, producing
2293 /// intermediate 32-bit integers and returning the high 16 bits of the
2294 /// intermediate integers.
2295 ///
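/// A small illustrative sketch (not part of Intel's documentation):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_set1_epi16(300);
/// let b = _mm256_set1_epi16(300);
/// // 300 * 300 = 90_000; the high 16 bits of 90_000 are 1.
/// let r = _mm256_mulhi_epi16(a, b);
///
/// let mut out = [0_i16; 16];
/// _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
/// assert_eq!(out, [1; 16]);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///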
2296 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mulhi_epi16)
2297 #[inline]
2298 #[target_feature(enable = "avx2")]
2299 #[cfg_attr(test, assert_instr(vpmulhw))]
2300 #[stable(feature = "simd_x86", since = "1.27.0")]
2301 pub unsafe fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i {
2302 transmute(pmulhw(a.as_i16x16(), b.as_i16x16()))
2303 }
2304
2305 /// Multiplies the packed unsigned 16-bit integers in `a` and `b`, producing
2306 /// intermediate 32-bit integers and returning the high 16 bits of the
2307 /// intermediate integers.
2308 ///
2309 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mulhi_epu16)
2310 #[inline]
2311 #[target_feature(enable = "avx2")]
2312 #[cfg_attr(test, assert_instr(vpmulhuw))]
2313 #[stable(feature = "simd_x86", since = "1.27.0")]
2314 pub unsafe fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i {
2315 transmute(pmulhuw(a.as_u16x16(), b.as_u16x16()))
2316 }
2317
2318 /// Multiplies the packed 16-bit integers in `a` and `b`, producing
2319 /// intermediate 32-bit integers, and returns the low 16 bits of the
2320 /// intermediate integers
2321 ///
2322 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mullo_epi16)
2323 #[inline]
2324 #[target_feature(enable = "avx2")]
2325 #[cfg_attr(test, assert_instr(vpmullw))]
2326 #[stable(feature = "simd_x86", since = "1.27.0")]
2327 pub unsafe fn _mm256_mullo_epi16(a: __m256i, b: __m256i) -> __m256i {
2328 transmute(simd_mul(a.as_i16x16(), b.as_i16x16()))
2329 }
2330
2331 /// Multiplies the packed 32-bit integers in `a` and `b`, producing
2332 /// intermediate 64-bit integers, and returns the low 32 bits of the
2333 /// intermediate integers
2334 ///
2335 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mullo_epi32)
2336 #[inline]
2337 #[target_feature(enable = "avx2")]
2338 #[cfg_attr(test, assert_instr(vpmulld))]
2339 #[stable(feature = "simd_x86", since = "1.27.0")]
2340 pub unsafe fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i {
2341 transmute(simd_mul(a.as_i32x8(), b.as_i32x8()))
2342 }
2343
2344 /// Multiplies packed 16-bit integers in `a` and `b`, producing
2345 /// intermediate signed 32-bit integers. Truncates each intermediate
2346 /// integer to the 18 most significant bits, rounds by adding 1, and
2347 /// returns bits `[16:1]`.
2348 ///
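/// A usage sketch (added for illustration, not from Intel's documentation),
/// reading the inputs as Q15 fixed-point values:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_set1_epi16(0x4000); // 0.5 in Q15
/// let b = _mm256_set1_epi16(0x2000); // 0.25 in Q15
/// // 0.5 * 0.25 = 0.125, which is 0x1000 in Q15.
/// let r = _mm256_mulhrs_epi16(a, b);
///
/// let mut out = [0_i16; 16];
/// _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
/// assert_eq!(out, [0x1000; 16]);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///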
2349 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mulhrs_epi16)
2350 #[inline]
2351 #[target_feature(enable = "avx2")]
2352 #[cfg_attr(test, assert_instr(vpmulhrsw))]
2353 #[stable(feature = "simd_x86", since = "1.27.0")]
2354 pub unsafe fn _mm256_mulhrs_epi16(a: __m256i, b: __m256i) -> __m256i {
2355 transmute(pmulhrsw(a.as_i16x16(), b.as_i16x16()))
2356 }
2357
2358 /// Computes the bitwise OR of 256 bits (representing integer data) in `a`
2359 /// and `b`
2360 ///
2361 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_or_si256)
2362 #[inline]
2363 #[target_feature(enable = "avx2")]
2364 #[cfg_attr(test, assert_instr(vorps))]
2365 #[stable(feature = "simd_x86", since = "1.27.0")]
2366 pub unsafe fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i {
2367 transmute(simd_or(a.as_i32x8(), b.as_i32x8()))
2368 }
2369
2370 /// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
2371 /// using signed saturation
2372 ///
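/// An illustrative sketch (not part of Intel's documentation) showing the
/// saturation and the per-lane ordering of the packed result:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_set1_epi16(300); // saturates to 127
/// let b = _mm256_set1_epi16(-300); // saturates to -128
/// let r = _mm256_packs_epi16(a, b);
///
/// let mut out = [0_i8; 32];
/// _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
/// // Within each 128-bit lane: eight bytes from `a`, then eight from `b`.
/// for (i, &v) in out.iter().enumerate() {
///     assert_eq!(v, if (i / 8) % 2 == 0 { 127 } else { -128 });
/// }
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///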
2373 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_packs_epi16)
2374 #[inline]
2375 #[target_feature(enable = "avx2")]
2376 #[cfg_attr(test, assert_instr(vpacksswb))]
2377 #[stable(feature = "simd_x86", since = "1.27.0")]
2378 pub unsafe fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i {
2379 transmute(packsswb(a.as_i16x16(), b.as_i16x16()))
2380 }
2381
2382 /// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
2383 /// using signed saturation
2384 ///
2385 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_packs_epi32)
2386 #[inline]
2387 #[target_feature(enable = "avx2")]
2388 #[cfg_attr(test, assert_instr(vpackssdw))]
2389 #[stable(feature = "simd_x86", since = "1.27.0")]
2390 pub unsafe fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i {
2391 transmute(packssdw(a.as_i32x8(), b.as_i32x8()))
2392 }
2393
2394 /// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
2395 /// using unsigned saturation
2396 ///
2397 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_packus_epi16)
2398 #[inline]
2399 #[target_feature(enable = "avx2")]
2400 #[cfg_attr(test, assert_instr(vpackuswb))]
2401 #[stable(feature = "simd_x86", since = "1.27.0")]
2402 pub unsafe fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i {
2403 transmute(packuswb(a.as_i16x16(), b.as_i16x16()))
2404 }
2405
2406 /// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
2407 /// using unsigned saturation
2408 ///
2409 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_packus_epi32)
2410 #[inline]
2411 #[target_feature(enable = "avx2")]
2412 #[cfg_attr(test, assert_instr(vpackusdw))]
2413 #[stable(feature = "simd_x86", since = "1.27.0")]
2414 pub unsafe fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i {
2415 transmute(packusdw(a.as_i32x8(), b.as_i32x8()))
2416 }
2417
2418 /// Permutes packed 32-bit integers from `a` according to the content of `b`.
2419 ///
2420 /// The lowest 3 bits of each integer of `b` are used as indices into the 8
2421 /// integers of `a`.
2422 ///
2423 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutevar8x32_epi32)
2424 #[inline]
2425 #[target_feature(enable = "avx2")]
2426 #[cfg_attr(test, assert_instr(vpermps))]
2427 #[stable(feature = "simd_x86", since = "1.27.0")]
2428 pub unsafe fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i {
2429 transmute(permd(a.as_u32x8(), b.as_u32x8()))
2430 }
2431
2432 /// Permutes 64-bit integers from `a` using control mask `imm8`.
2433 ///
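/// A usage sketch (added for illustration, not from Intel's documentation):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_setr_epi64x(10, 20, 30, 40);
/// // Each 2-bit field of `imm8` selects a source element; 0b00_01_10_11
/// // reverses the four 64-bit elements.
/// let r = _mm256_permute4x64_epi64(a, 0b00_01_10_11);
///
/// let mut out = [0_i64; 4];
/// _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
/// assert_eq!(out, [40, 30, 20, 10]);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///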
2434 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute4x64_epi64)
2435 #[inline]
2436 #[target_feature(enable = "avx2")]
2437 #[cfg_attr(test, assert_instr(vpermpd, imm8 = 9))]
2438 #[rustc_args_required_const(1)]
2439 #[stable(feature = "simd_x86", since = "1.27.0")]
2440 pub unsafe fn _mm256_permute4x64_epi64(a: __m256i, imm8: i32) -> __m256i {
2441 let imm8 = (imm8 & 0xFF) as u8;
2442 let zero = _mm256_setzero_si256().as_i64x4();
2443 let a = a.as_i64x4();
2444 macro_rules! permute4 {
2445 ($a:expr, $b:expr, $c:expr, $d:expr) => {
2446 simd_shuffle4(a, zero, [$a, $b, $c, $d]);
2447 };
2448 }
2449 macro_rules! permute3 {
2450 ($a:expr, $b:expr, $c:expr) => {
2451 match (imm8 >> 6) & 0b11 {
2452 0b00 => permute4!($a, $b, $c, 0),
2453 0b01 => permute4!($a, $b, $c, 1),
2454 0b10 => permute4!($a, $b, $c, 2),
2455 _ => permute4!($a, $b, $c, 3),
2456 }
2457 };
2458 }
2459 macro_rules! permute2 {
2460 ($a:expr, $b:expr) => {
2461 match (imm8 >> 4) & 0b11 {
2462 0b00 => permute3!($a, $b, 0),
2463 0b01 => permute3!($a, $b, 1),
2464 0b10 => permute3!($a, $b, 2),
2465 _ => permute3!($a, $b, 3),
2466 }
2467 };
2468 }
2469 macro_rules! permute1 {
2470 ($a:expr) => {
2471 match (imm8 >> 2) & 0b11 {
2472 0b00 => permute2!($a, 0),
2473 0b01 => permute2!($a, 1),
2474 0b10 => permute2!($a, 2),
2475 _ => permute2!($a, 3),
2476 }
2477 };
2478 }
2479 let r: i64x4 = match imm8 & 0b11 {
2480 0b00 => permute1!(0),
2481 0b01 => permute1!(1),
2482 0b10 => permute1!(2),
2483 _ => permute1!(3),
2484 };
2485 transmute(r)
2486 }
2487
2488 /// Shuffles 128 bits of integer data selected by `imm8` from `a` and `b`.
2489 ///
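/// An illustrative sketch (not part of Intel's documentation):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_setr_epi64x(1, 2, 3, 4);
/// let b = _mm256_setr_epi64x(5, 6, 7, 8);
/// // 0x20 selects the low half of `a` for the low half of the result and
/// // the low half of `b` for the high half.
/// let r = _mm256_permute2x128_si256(a, b, 0x20);
///
/// let mut out = [0_i64; 4];
/// _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
/// assert_eq!(out, [1, 2, 5, 6]);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///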
2490 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute2x128_si256)
2491 #[inline]
2492 #[target_feature(enable = "avx2")]
2493 #[cfg_attr(test, assert_instr(vperm2f128, imm8 = 9))]
2494 #[rustc_args_required_const(2)]
2495 #[stable(feature = "simd_x86", since = "1.27.0")]
2496 pub unsafe fn _mm256_permute2x128_si256(a: __m256i, b: __m256i, imm8: i32) -> __m256i {
2497 let a = a.as_i64x4();
2498 let b = b.as_i64x4();
2499 macro_rules! call {
2500 ($imm8:expr) => {
2501 vperm2i128(a, b, $imm8)
2502 };
2503 }
2504 transmute(constify_imm8!(imm8, call))
2505 }
2506
2507 /// Shuffles 64-bit floating-point elements in `a` across lanes using the
2508 /// control in `imm8`.
2509 ///
2510 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute4x64_pd)
2511 #[inline]
2512 #[target_feature(enable = "avx2")]
2513 #[cfg_attr(test, assert_instr(vpermpd, imm8 = 1))]
2514 #[rustc_args_required_const(1)]
2515 #[stable(feature = "simd_x86", since = "1.27.0")]
2516 pub unsafe fn _mm256_permute4x64_pd(a: __m256d, imm8: i32) -> __m256d {
2517 let imm8 = (imm8 & 0xFF) as u8;
2518 let undef = _mm256_undefined_pd();
2519 macro_rules! shuffle_done {
2520 ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
2521 simd_shuffle4(a, undef, [$x01, $x23, $x45, $x67])
2522 };
2523 }
2524 macro_rules! shuffle_x67 {
2525 ($x01:expr, $x23:expr, $x45:expr) => {
2526 match (imm8 >> 6) & 0b11 {
2527 0b00 => shuffle_done!($x01, $x23, $x45, 0),
2528 0b01 => shuffle_done!($x01, $x23, $x45, 1),
2529 0b10 => shuffle_done!($x01, $x23, $x45, 2),
2530 _ => shuffle_done!($x01, $x23, $x45, 3),
2531 }
2532 };
2533 }
2534 macro_rules! shuffle_x45 {
2535 ($x01:expr, $x23:expr) => {
2536 match (imm8 >> 4) & 0b11 {
2537 0b00 => shuffle_x67!($x01, $x23, 0),
2538 0b01 => shuffle_x67!($x01, $x23, 1),
2539 0b10 => shuffle_x67!($x01, $x23, 2),
2540 _ => shuffle_x67!($x01, $x23, 3),
2541 }
2542 };
2543 }
2544 macro_rules! shuffle_x23 {
2545 ($x01:expr) => {
2546 match (imm8 >> 2) & 0b11 {
2547 0b00 => shuffle_x45!($x01, 0),
2548 0b01 => shuffle_x45!($x01, 1),
2549 0b10 => shuffle_x45!($x01, 2),
2550 _ => shuffle_x45!($x01, 3),
2551 }
2552 };
2553 }
2554 match imm8 & 0b11 {
2555 0b00 => shuffle_x23!(0),
2556 0b01 => shuffle_x23!(1),
2557 0b10 => shuffle_x23!(2),
2558 _ => shuffle_x23!(3),
2559 }
2560 }
2561
2562 /// Shuffles eight 32-bit floating-point elements in `a` across lanes using
2563 /// the corresponding 32-bit integer index in `idx`.
2564 ///
2565 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutevar8x32_ps)
2566 #[inline]
2567 #[target_feature(enable = "avx2")]
2568 #[cfg_attr(test, assert_instr(vpermps))]
2569 #[stable(feature = "simd_x86", since = "1.27.0")]
2570 pub unsafe fn _mm256_permutevar8x32_ps(a: __m256, idx: __m256i) -> __m256 {
2571 permps(a, idx.as_i32x8())
2572 }
2573
2574 /// Computes the absolute differences of packed unsigned 8-bit integers in `a`
2575 /// and `b`, then horizontally sums each consecutive 8 differences to
2576 /// produce four unsigned 16-bit integers, and packs these unsigned 16-bit
2577 /// integers in the low 16 bits of each 64-bit element of the return value.
2578 ///
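/// A small usage sketch (illustrative, not from Intel's documentation):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_set1_epi8(5);
/// let b = _mm256_set1_epi8(3);
/// // Each group of eight |5 - 3| differences sums to 16.
/// let r = _mm256_sad_epu8(a, b);
///
/// let mut out = [0_i64; 4];
/// _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
/// assert_eq!(out, [16; 4]);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///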
2579 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sad_epu8)
2580 #[inline]
2581 #[target_feature(enable = "avx2")]
2582 #[cfg_attr(test, assert_instr(vpsadbw))]
2583 #[stable(feature = "simd_x86", since = "1.27.0")]
2584 pub unsafe fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i {
2585 transmute(psadbw(a.as_u8x32(), b.as_u8x32()))
2586 }
2587
2588 /// Shuffles bytes from `a` according to the content of `b`.
2589 ///
2590 /// The lowest 4 bits of each byte of `b` are used as indices into the 32 bytes
2591 /// of `a`.
2592 ///
2593 /// In addition, if the most significant bit of a byte of `b` is set, the
2594 /// respective destination byte is set to 0.
2595 ///
2596 /// The low and high halves of the vectors are shuffled separately.
2597 ///
2598 /// Picturing `a` and `b` as `[u8; 32]`, `_mm256_shuffle_epi8` is logically
2599 /// equivalent to:
2600 ///
2601 /// ```
2602 /// fn mm256_shuffle_epi8(a: [u8; 32], b: [u8; 32]) -> [u8; 32] {
2603 /// let mut r = [0; 32];
2604 /// for i in 0..16 {
2605 /// // if the most significant bit of b is set,
2606 /// // then the destination byte is set to 0.
2607 /// if b[i] & 0x80 == 0u8 {
2608 /// r[i] = a[(b[i] % 16) as usize];
2609 /// }
2610 /// if b[i + 16] & 0x80 == 0u8 {
2611 /// r[i + 16] = a[(b[i + 16] % 16 + 16) as usize];
2612 /// }
2613 /// }
2614 /// r
2615 /// }
2616 /// ```
2617 ///
2618 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_epi8)
2619 #[inline]
2620 #[target_feature(enable = "avx2")]
2621 #[cfg_attr(test, assert_instr(vpshufb))]
2622 #[stable(feature = "simd_x86", since = "1.27.0")]
2623 pub unsafe fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i {
2624 transmute(pshufb(a.as_u8x32(), b.as_u8x32()))
2625 }
2626
2627 /// Shuffles 32-bit integers in 128-bit lanes of `a` using the control in
2628 /// `imm8`.
2629 ///
2630 /// ```rust
2631 /// #[cfg(target_arch = "x86")]
2632 /// use std::arch::x86::*;
2633 /// #[cfg(target_arch = "x86_64")]
2634 /// use std::arch::x86_64::*;
2635 ///
2636 /// # fn main() {
2637 /// # if is_x86_feature_detected!("avx2") {
2638 /// # #[target_feature(enable = "avx2")]
2639 /// # unsafe fn worker() {
2640 /// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
2641 ///
2642 /// let c1 = _mm256_shuffle_epi32(a, 0b00_11_10_01);
2643 /// let c2 = _mm256_shuffle_epi32(a, 0b01_00_10_11);
2644 ///
2645 /// let expected1 = _mm256_setr_epi32(1, 2, 3, 0, 5, 6, 7, 4);
2646 /// let expected2 = _mm256_setr_epi32(3, 2, 0, 1, 7, 6, 4, 5);
2647 ///
2648 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c1, expected1)), !0);
2649 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c2, expected2)), !0);
2650 /// # }
2651 /// # unsafe { worker(); }
2652 /// # }
2653 /// # }
2654 /// ```
2655 ///
2656 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_epi32)
2657 #[inline]
2658 #[target_feature(enable = "avx2")]
2659 #[cfg_attr(test, assert_instr(vpermilps, imm8 = 9))]
2660 #[rustc_args_required_const(1)]
2661 #[stable(feature = "simd_x86", since = "1.27.0")]
2662 pub unsafe fn _mm256_shuffle_epi32(a: __m256i, imm8: i32) -> __m256i {
2663 // simd_shuffleX requires that its selector parameter be made up of
2664 // constant values, but we can't enforce that here. In spirit, we need
2665 // to write a `match` on all possible values of a byte, and for each value,
2666 // hard-code the correct `simd_shuffleX` call using only constants. We
2667 // then hope for LLVM to do the rest.
2668 //
2669 // Of course, that's... awful. So we try to use macros to do it for us.
2670 let imm8 = (imm8 & 0xFF) as u8;
2671
2672 let a = a.as_i32x8();
2673 macro_rules! shuffle_done {
2674 ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
2675 simd_shuffle8(
2676 a,
2677 a,
2678 [
2679 $x01,
2680 $x23,
2681 $x45,
2682 $x67,
2683 4 + $x01,
2684 4 + $x23,
2685 4 + $x45,
2686 4 + $x67,
2687 ],
2688 )
2689 };
2690 }
2691 macro_rules! shuffle_x67 {
2692 ($x01:expr, $x23:expr, $x45:expr) => {
2693 match (imm8 >> 6) & 0b11 {
2694 0b00 => shuffle_done!($x01, $x23, $x45, 0),
2695 0b01 => shuffle_done!($x01, $x23, $x45, 1),
2696 0b10 => shuffle_done!($x01, $x23, $x45, 2),
2697 _ => shuffle_done!($x01, $x23, $x45, 3),
2698 }
2699 };
2700 }
2701 macro_rules! shuffle_x45 {
2702 ($x01:expr, $x23:expr) => {
2703 match (imm8 >> 4) & 0b11 {
2704 0b00 => shuffle_x67!($x01, $x23, 0),
2705 0b01 => shuffle_x67!($x01, $x23, 1),
2706 0b10 => shuffle_x67!($x01, $x23, 2),
2707 _ => shuffle_x67!($x01, $x23, 3),
2708 }
2709 };
2710 }
2711 macro_rules! shuffle_x23 {
2712 ($x01:expr) => {
2713 match (imm8 >> 2) & 0b11 {
2714 0b00 => shuffle_x45!($x01, 0),
2715 0b01 => shuffle_x45!($x01, 1),
2716 0b10 => shuffle_x45!($x01, 2),
2717 _ => shuffle_x45!($x01, 3),
2718 }
2719 };
2720 }
2721 let r: i32x8 = match imm8 & 0b11 {
2722 0b00 => shuffle_x23!(0),
2723 0b01 => shuffle_x23!(1),
2724 0b10 => shuffle_x23!(2),
2725 _ => shuffle_x23!(3),
2726 };
2727 transmute(r)
2728 }
2729
2730 /// Shuffles 16-bit integers in the high 64 bits of 128-bit lanes of `a` using
2731 /// the control in `imm8`. The low 64 bits of 128-bit lanes of `a` are copied
2732 /// to the output.
2733 ///
2734 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shufflehi_epi16)
2735 #[inline]
2736 #[target_feature(enable = "avx2")]
2737 #[cfg_attr(test, assert_instr(vpshufhw, imm8 = 9))]
2738 #[rustc_args_required_const(1)]
2739 #[stable(feature = "simd_x86", since = "1.27.0")]
2740 pub unsafe fn _mm256_shufflehi_epi16(a: __m256i, imm8: i32) -> __m256i {
2741 let imm8 = (imm8 & 0xFF) as u8;
2742 let a = a.as_i16x16();
2743 macro_rules! shuffle_done {
2744 ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
2745 #[rustfmt::skip]
2746 simd_shuffle16(a, a, [
2747 0, 1, 2, 3, 4+$x01, 4+$x23, 4+$x45, 4+$x67,
2748 8, 9, 10, 11, 12+$x01, 12+$x23, 12+$x45, 12+$x67
2749 ]);
2750 };
2751 }
2752 macro_rules! shuffle_x67 {
2753 ($x01:expr, $x23:expr, $x45:expr) => {
2754 match (imm8 >> 6) & 0b11 {
2755 0b00 => shuffle_done!($x01, $x23, $x45, 0),
2756 0b01 => shuffle_done!($x01, $x23, $x45, 1),
2757 0b10 => shuffle_done!($x01, $x23, $x45, 2),
2758 _ => shuffle_done!($x01, $x23, $x45, 3),
2759 }
2760 };
2761 }
2762 macro_rules! shuffle_x45 {
2763 ($x01:expr, $x23:expr) => {
2764 match (imm8 >> 4) & 0b11 {
2765 0b00 => shuffle_x67!($x01, $x23, 0),
2766 0b01 => shuffle_x67!($x01, $x23, 1),
2767 0b10 => shuffle_x67!($x01, $x23, 2),
2768 _ => shuffle_x67!($x01, $x23, 3),
2769 }
2770 };
2771 }
2772 macro_rules! shuffle_x23 {
2773 ($x01:expr) => {
2774 match (imm8 >> 2) & 0b11 {
2775 0b00 => shuffle_x45!($x01, 0),
2776 0b01 => shuffle_x45!($x01, 1),
2777 0b10 => shuffle_x45!($x01, 2),
2778 _ => shuffle_x45!($x01, 3),
2779 }
2780 };
2781 }
2782 let r: i16x16 = match imm8 & 0b11 {
2783 0b00 => shuffle_x23!(0),
2784 0b01 => shuffle_x23!(1),
2785 0b10 => shuffle_x23!(2),
2786 _ => shuffle_x23!(3),
2787 };
2788 transmute(r)
2789 }
2790
2791 /// Shuffles 16-bit integers in the low 64 bits of 128-bit lanes of `a` using
2792 /// the control in `imm8`. The high 64 bits of 128-bit lanes of `a` are copied
2793 /// to the output.
2794 ///
2795 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shufflelo_epi16)
2796 #[inline]
2797 #[target_feature(enable = "avx2")]
2798 #[cfg_attr(test, assert_instr(vpshuflw, imm8 = 9))]
2799 #[rustc_args_required_const(1)]
2800 #[stable(feature = "simd_x86", since = "1.27.0")]
2801 pub unsafe fn _mm256_shufflelo_epi16(a: __m256i, imm8: i32) -> __m256i {
2802 let imm8 = (imm8 & 0xFF) as u8;
2803 let a = a.as_i16x16();
2804 macro_rules! shuffle_done {
2805 ($x01: expr, $x23: expr, $x45: expr, $x67: expr) => {
2806 #[rustfmt::skip]
2807 simd_shuffle16(a, a, [
2808 0+$x01, 0+$x23, 0+$x45, 0+$x67, 4, 5, 6, 7,
2809 8+$x01, 8+$x23, 8+$x45, 8+$x67, 12, 13, 14, 15,
2810 ]);
2811 };
2812 }
2813 macro_rules! shuffle_x67 {
2814 ($x01:expr, $x23:expr, $x45:expr) => {
2815 match (imm8 >> 6) & 0b11 {
2816 0b00 => shuffle_done!($x01, $x23, $x45, 0),
2817 0b01 => shuffle_done!($x01, $x23, $x45, 1),
2818 0b10 => shuffle_done!($x01, $x23, $x45, 2),
2819 _ => shuffle_done!($x01, $x23, $x45, 3),
2820 }
2821 };
2822 }
2823 macro_rules! shuffle_x45 {
2824 ($x01:expr, $x23:expr) => {
2825 match (imm8 >> 4) & 0b11 {
2826 0b00 => shuffle_x67!($x01, $x23, 0),
2827 0b01 => shuffle_x67!($x01, $x23, 1),
2828 0b10 => shuffle_x67!($x01, $x23, 2),
2829 _ => shuffle_x67!($x01, $x23, 3),
2830 }
2831 };
2832 }
2833 macro_rules! shuffle_x23 {
2834 ($x01:expr) => {
2835 match (imm8 >> 2) & 0b11 {
2836 0b00 => shuffle_x45!($x01, 0),
2837 0b01 => shuffle_x45!($x01, 1),
2838 0b10 => shuffle_x45!($x01, 2),
2839 _ => shuffle_x45!($x01, 3),
2840 }
2841 };
2842 }
2843 let r: i16x16 = match imm8 & 0b11 {
2844 0b00 => shuffle_x23!(0),
2845 0b01 => shuffle_x23!(1),
2846 0b10 => shuffle_x23!(2),
2847 _ => shuffle_x23!(3),
2848 };
2849 transmute(r)
2850 }
2851
2852 /// Negates packed 16-bit integers in `a` when the corresponding signed
2853 /// 16-bit integer in `b` is negative, and returns the results.
2854 /// Results are zeroed out when the corresponding element in `b` is zero.
2855 ///
2856 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sign_epi16)
2857 #[inline]
2858 #[target_feature(enable = "avx2")]
2859 #[cfg_attr(test, assert_instr(vpsignw))]
2860 #[stable(feature = "simd_x86", since = "1.27.0")]
2861 pub unsafe fn _mm256_sign_epi16(a: __m256i, b: __m256i) -> __m256i {
2862 transmute(psignw(a.as_i16x16(), b.as_i16x16()))
2863 }
2864
2865 /// Negates packed 32-bit integers in `a` when the corresponding signed
2866 /// 32-bit integer in `b` is negative, and returns the results.
2867 /// Results are zeroed out when the corresponding element in `b` is zero.
2868 ///
2869 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sign_epi32)
2870 #[inline]
2871 #[target_feature(enable = "avx2")]
2872 #[cfg_attr(test, assert_instr(vpsignd))]
2873 #[stable(feature = "simd_x86", since = "1.27.0")]
2874 pub unsafe fn _mm256_sign_epi32(a: __m256i, b: __m256i) -> __m256i {
2875 transmute(psignd(a.as_i32x8(), b.as_i32x8()))
2876 }
2877
2878 /// Negates packed 8-bit integers in `a` when the corresponding signed
2879 /// 8-bit integer in `b` is negative, and returns the results.
2880 /// Results are zeroed out when the corresponding element in `b` is zero.
2881 ///
2882 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sign_epi8)
2883 #[inline]
2884 #[target_feature(enable = "avx2")]
2885 #[cfg_attr(test, assert_instr(vpsignb))]
2886 #[stable(feature = "simd_x86", since = "1.27.0")]
2887 pub unsafe fn _mm256_sign_epi8(a: __m256i, b: __m256i) -> __m256i {
2888 transmute(psignb(a.as_i8x32(), b.as_i8x32()))
2889 }
2890
2891 /// Shifts packed 16-bit integers in `a` left by `count` while
2892 /// shifting in zeros, and returns the result
2893 ///
2894 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sll_epi16)
2895 #[inline]
2896 #[target_feature(enable = "avx2")]
2897 #[cfg_attr(test, assert_instr(vpsllw))]
2898 #[stable(feature = "simd_x86", since = "1.27.0")]
2899 pub unsafe fn _mm256_sll_epi16(a: __m256i, count: __m128i) -> __m256i {
2900 transmute(psllw(a.as_i16x16(), count.as_i16x8()))
2901 }
2902
2903 /// Shifts packed 32-bit integers in `a` left by `count` while
2904 /// shifting in zeros, and returns the result
2905 ///
2906 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sll_epi32)
2907 #[inline]
2908 #[target_feature(enable = "avx2")]
2909 #[cfg_attr(test, assert_instr(vpslld))]
2910 #[stable(feature = "simd_x86", since = "1.27.0")]
2911 pub unsafe fn _mm256_sll_epi32(a: __m256i, count: __m128i) -> __m256i {
2912 transmute(pslld(a.as_i32x8(), count.as_i32x4()))
2913 }
2914
2915 /// Shifts packed 64-bit integers in `a` left by `count` while
2916 /// shifting in zeros, and returns the result
2917 ///
2918 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sll_epi64)
2919 #[inline]
2920 #[target_feature(enable = "avx2")]
2921 #[cfg_attr(test, assert_instr(vpsllq))]
2922 #[stable(feature = "simd_x86", since = "1.27.0")]
2923 pub unsafe fn _mm256_sll_epi64(a: __m256i, count: __m128i) -> __m256i {
2924 transmute(psllq(a.as_i64x4(), count.as_i64x2()))
2925 }
2926
2927 /// Shifts packed 16-bit integers in `a` left by `imm8` while
2928 /// shifting in zeros, and returns the results.
2929 ///
2930 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_slli_epi16)
2931 #[inline]
2932 #[target_feature(enable = "avx2")]
2933 #[cfg_attr(test, assert_instr(vpsllw))]
2934 #[stable(feature = "simd_x86", since = "1.27.0")]
2935 pub unsafe fn _mm256_slli_epi16(a: __m256i, imm8: i32) -> __m256i {
2936 transmute(pslliw(a.as_i16x16(), imm8))
2937 }
2938
2939 /// Shifts packed 32-bit integers in `a` left by `imm8` while
2940 /// shifting in zeros, and returns the results.
2941 ///
2942 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_slli_epi32)
2943 #[inline]
2944 #[target_feature(enable = "avx2")]
2945 #[cfg_attr(test, assert_instr(vpslld))]
2946 #[stable(feature = "simd_x86", since = "1.27.0")]
2947 pub unsafe fn _mm256_slli_epi32(a: __m256i, imm8: i32) -> __m256i {
2948 transmute(psllid(a.as_i32x8(), imm8))
2949 }
2950
2951 /// Shifts packed 64-bit integers in `a` left by `imm8` while
2952 /// shifting in zeros, and returns the results.
2953 ///
2954 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_slli_epi64)
2955 #[inline]
2956 #[target_feature(enable = "avx2")]
2957 #[cfg_attr(test, assert_instr(vpsllq))]
2958 #[stable(feature = "simd_x86", since = "1.27.0")]
2959 pub unsafe fn _mm256_slli_epi64(a: __m256i, imm8: i32) -> __m256i {
2960 transmute(pslliq(a.as_i64x4(), imm8))
2961 }
2962
2963 /// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros.
2964 ///
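/// A minimal usage sketch (illustrative, not part of the upstream docs),
/// assuming AVX2 support has been detected at runtime; each 128-bit lane is
/// shifted independently:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// // shifting each 128-bit lane left by 8 bytes moves the low 64-bit
/// // element of every lane into the high 64-bit element
/// let a = _mm256_set1_epi64x(1);
/// let c = _mm256_slli_si256(a, 8);
/// let expected = _mm256_setr_epi64x(0, 1, 0, 1);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///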
2965 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_slli_si256)
2966 #[inline]
2967 #[target_feature(enable = "avx2")]
2968 #[cfg_attr(test, assert_instr(vpslldq, imm8 = 3))]
2969 #[rustc_args_required_const(1)]
2970 #[stable(feature = "simd_x86", since = "1.27.0")]
2971 pub unsafe fn _mm256_slli_si256(a: __m256i, imm8: i32) -> __m256i {
2972 let a = a.as_i64x4();
2973 macro_rules! call {
2974 ($imm8:expr) => {
2975 vpslldq(a, $imm8)
2976 };
2977 }
2978 transmute(constify_imm8!(imm8 * 8, call))
2979 }
2980
2981 /// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros.
2982 ///
2983 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bslli_epi128)
2984 #[inline]
2985 #[target_feature(enable = "avx2")]
2986 #[cfg_attr(test, assert_instr(vpslldq, imm8 = 3))]
2987 #[rustc_args_required_const(1)]
2988 #[stable(feature = "simd_x86", since = "1.27.0")]
2989 pub unsafe fn _mm256_bslli_epi128(a: __m256i, imm8: i32) -> __m256i {
2990 let a = a.as_i64x4();
2991 macro_rules! call {
2992 ($imm8:expr) => {
2993 vpslldq(a, $imm8)
2994 };
2995 }
2996 transmute(constify_imm8!(imm8 * 8, call))
2997 }
2998
2999 /// Shifts packed 32-bit integers in `a` left by the amount
3000 /// specified by the corresponding element in `count` while
3001 /// shifting in zeros, and returns the result.
3002 ///
3003 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sllv_epi32)
3004 #[inline]
3005 #[target_feature(enable = "avx2")]
3006 #[cfg_attr(test, assert_instr(vpsllvd))]
3007 #[stable(feature = "simd_x86", since = "1.27.0")]
3008 pub unsafe fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i {
3009 transmute(psllvd(a.as_i32x4(), count.as_i32x4()))
3010 }
3011
3012 /// Shifts packed 32-bit integers in `a` left by the amount
3013 /// specified by the corresponding element in `count` while
3014 /// shifting in zeros, and returns the result.
3015 ///
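/// A minimal usage sketch (illustrative, not part of the upstream docs),
/// assuming AVX2 support has been detected at runtime; unlike `_mm256_sll_epi32`,
/// each element gets its own shift amount:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// // each 32-bit element is shifted by the matching element of `count`
/// let a = _mm256_set1_epi32(1);
/// let count = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
/// let c = _mm256_sllv_epi32(a, count);
/// let expected = _mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 128);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///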
3016 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sllv_epi32)
3017 #[inline]
3018 #[target_feature(enable = "avx2")]
3019 #[cfg_attr(test, assert_instr(vpsllvd))]
3020 #[stable(feature = "simd_x86", since = "1.27.0")]
3021 pub unsafe fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i {
3022 transmute(psllvd256(a.as_i32x8(), count.as_i32x8()))
3023 }
3024
3025 /// Shifts packed 64-bit integers in `a` left by the amount
3026 /// specified by the corresponding element in `count` while
3027 /// shifting in zeros, and returns the result.
3028 ///
3029 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sllv_epi64)
3030 #[inline]
3031 #[target_feature(enable = "avx2")]
3032 #[cfg_attr(test, assert_instr(vpsllvq))]
3033 #[stable(feature = "simd_x86", since = "1.27.0")]
3034 pub unsafe fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i {
3035 transmute(psllvq(a.as_i64x2(), count.as_i64x2()))
3036 }
3037
3038 /// Shifts packed 64-bit integers in `a` left by the amount
3039 /// specified by the corresponding element in `count` while
3040 /// shifting in zeros, and returns the result.
3041 ///
3042 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sllv_epi64)
3043 #[inline]
3044 #[target_feature(enable = "avx2")]
3045 #[cfg_attr(test, assert_instr(vpsllvq))]
3046 #[stable(feature = "simd_x86", since = "1.27.0")]
3047 pub unsafe fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i {
3048 transmute(psllvq256(a.as_i64x4(), count.as_i64x4()))
3049 }
3050
3051 /// Shifts packed 16-bit integers in `a` right by `count` while
3052 /// shifting in sign bits.
3053 ///
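/// A minimal usage sketch (illustrative, not part of the upstream docs),
/// assuming AVX2 support has been detected at runtime:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// // arithmetic shift: sign bits are replicated into the vacated positions
/// let a = _mm256_set1_epi16(-32);
/// let count = _mm_cvtsi32_si128(2);
/// let c = _mm256_sra_epi16(a, count);
/// let expected = _mm256_set1_epi16(-8);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///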
3054 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sra_epi16)
3055 #[inline]
3056 #[target_feature(enable = "avx2")]
3057 #[cfg_attr(test, assert_instr(vpsraw))]
3058 #[stable(feature = "simd_x86", since = "1.27.0")]
3059 pub unsafe fn _mm256_sra_epi16(a: __m256i, count: __m128i) -> __m256i {
3060 transmute(psraw(a.as_i16x16(), count.as_i16x8()))
3061 }
3062
3063 /// Shifts packed 32-bit integers in `a` right by `count` while
3064 /// shifting in sign bits.
3065 ///
3066 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sra_epi32)
3067 #[inline]
3068 #[target_feature(enable = "avx2")]
3069 #[cfg_attr(test, assert_instr(vpsrad))]
3070 #[stable(feature = "simd_x86", since = "1.27.0")]
3071 pub unsafe fn _mm256_sra_epi32(a: __m256i, count: __m128i) -> __m256i {
3072 transmute(psrad(a.as_i32x8(), count.as_i32x4()))
3073 }
3074
3075 /// Shifts packed 16-bit integers in `a` right by `imm8` while
3076 /// shifting in sign bits.
3077 ///
3078 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srai_epi16)
3079 #[inline]
3080 #[target_feature(enable = "avx2")]
3081 #[cfg_attr(test, assert_instr(vpsraw))]
3082 #[stable(feature = "simd_x86", since = "1.27.0")]
3083 pub unsafe fn _mm256_srai_epi16(a: __m256i, imm8: i32) -> __m256i {
3084 transmute(psraiw(a.as_i16x16(), imm8))
3085 }
3086
3087 /// Shifts packed 32-bit integers in `a` right by `imm8` while
3088 /// shifting in sign bits.
3089 ///
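/// A minimal usage sketch (illustrative, not part of the upstream docs),
/// assuming AVX2 support has been detected at runtime:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// // arithmetic shift by an immediate: the sign is preserved
/// let a = _mm256_set1_epi32(-64);
/// let c = _mm256_srai_epi32(a, 3);
/// let expected = _mm256_set1_epi32(-8);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///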
3090 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srai_epi32)
3091 #[inline]
3092 #[target_feature(enable = "avx2")]
3093 #[cfg_attr(test, assert_instr(vpsrad))]
3094 #[stable(feature = "simd_x86", since = "1.27.0")]
3095 pub unsafe fn _mm256_srai_epi32(a: __m256i, imm8: i32) -> __m256i {
3096 transmute(psraid(a.as_i32x8(), imm8))
3097 }
3098
3099 /// Shifts packed 32-bit integers in `a` right by the amount specified by the
3100 /// corresponding element in `count` while shifting in sign bits.
3101 ///
3102 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srav_epi32)
3103 #[inline]
3104 #[target_feature(enable = "avx2")]
3105 #[cfg_attr(test, assert_instr(vpsravd))]
3106 #[stable(feature = "simd_x86", since = "1.27.0")]
3107 pub unsafe fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i {
3108 transmute(psravd(a.as_i32x4(), count.as_i32x4()))
3109 }
3110
3111 /// Shifts packed 32-bit integers in `a` right by the amount specified by the
3112 /// corresponding element in `count` while shifting in sign bits.
3113 ///
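/// A minimal usage sketch (illustrative, not part of the upstream docs),
/// assuming AVX2 support has been detected at runtime:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// // each 32-bit element is shifted right arithmetically by the matching
/// // element of `count`
/// let a = _mm256_set1_epi32(-16);
/// let count = _mm256_setr_epi32(0, 1, 2, 3, 4, 0, 1, 2);
/// let c = _mm256_srav_epi32(a, count);
/// let expected = _mm256_setr_epi32(-16, -8, -4, -2, -1, -16, -8, -4);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///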
3114 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srav_epi32)
3115 #[inline]
3116 #[target_feature(enable = "avx2")]
3117 #[cfg_attr(test, assert_instr(vpsravd))]
3118 #[stable(feature = "simd_x86", since = "1.27.0")]
3119 pub unsafe fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i {
3120 transmute(psravd256(a.as_i32x8(), count.as_i32x8()))
3121 }
3122
3123 /// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros.
3124 ///
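/// A minimal usage sketch (illustrative, not part of the upstream docs),
/// assuming AVX2 support has been detected at runtime; each 128-bit lane is
/// shifted independently:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// // shifting each 128-bit lane right by 8 bytes moves the high 64-bit
/// // element of every lane into the low 64-bit element
/// let a = _mm256_setr_epi64x(1, 2, 3, 4);
/// let c = _mm256_srli_si256(a, 8);
/// let expected = _mm256_setr_epi64x(2, 0, 4, 0);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///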
3125 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srli_si256)
3126 #[inline]
3127 #[target_feature(enable = "avx2")]
3128 #[cfg_attr(test, assert_instr(vpsrldq, imm8 = 3))]
3129 #[rustc_args_required_const(1)]
3130 #[stable(feature = "simd_x86", since = "1.27.0")]
3131 pub unsafe fn _mm256_srli_si256(a: __m256i, imm8: i32) -> __m256i {
3132 let a = a.as_i64x4();
3133 macro_rules! call {
3134 ($imm8:expr) => {
3135 vpsrldq(a, $imm8)
3136 };
3137 }
3138 transmute(constify_imm8!(imm8 * 8, call))
3139 }
3140
3141 /// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros.
3142 ///
3143 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bsrli_epi128)
3144 #[inline]
3145 #[target_feature(enable = "avx2")]
3146 #[cfg_attr(test, assert_instr(vpsrldq, imm8 = 3))]
3147 #[rustc_args_required_const(1)]
3148 #[stable(feature = "simd_x86", since = "1.27.0")]
3149 pub unsafe fn _mm256_bsrli_epi128(a: __m256i, imm8: i32) -> __m256i {
3150 let a = a.as_i64x4();
3151 macro_rules! call {
3152 ($imm8:expr) => {
3153 vpsrldq(a, $imm8)
3154 };
3155 }
3156 transmute(constify_imm8!(imm8 * 8, call))
3157 }
3158
3159 /// Shifts packed 16-bit integers in `a` right by `count` while shifting in
3160 /// zeros.
3161 ///
3162 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srl_epi16)
3163 #[inline]
3164 #[target_feature(enable = "avx2")]
3165 #[cfg_attr(test, assert_instr(vpsrlw))]
3166 #[stable(feature = "simd_x86", since = "1.27.0")]
3167 pub unsafe fn _mm256_srl_epi16(a: __m256i, count: __m128i) -> __m256i {
3168 transmute(psrlw(a.as_i16x16(), count.as_i16x8()))
3169 }
3170
3171 /// Shifts packed 32-bit integers in `a` right by `count` while shifting in
3172 /// zeros.
3173 ///
3174 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srl_epi32)
3175 #[inline]
3176 #[target_feature(enable = "avx2")]
3177 #[cfg_attr(test, assert_instr(vpsrld))]
3178 #[stable(feature = "simd_x86", since = "1.27.0")]
3179 pub unsafe fn _mm256_srl_epi32(a: __m256i, count: __m128i) -> __m256i {
3180 transmute(psrld(a.as_i32x8(), count.as_i32x4()))
3181 }
3182
3183 /// Shifts packed 64-bit integers in `a` right by `count` while shifting in
3184 /// zeros.
3185 ///
3186 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srl_epi64)
3187 #[inline]
3188 #[target_feature(enable = "avx2")]
3189 #[cfg_attr(test, assert_instr(vpsrlq))]
3190 #[stable(feature = "simd_x86", since = "1.27.0")]
3191 pub unsafe fn _mm256_srl_epi64(a: __m256i, count: __m128i) -> __m256i {
3192 transmute(psrlq(a.as_i64x4(), count.as_i64x2()))
3193 }
3194
3195 /// Shifts packed 16-bit integers in `a` right by `imm8` while shifting in
3196 /// zeros.
3197 ///
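/// A minimal usage sketch (illustrative, not part of the upstream docs),
/// assuming AVX2 support has been detected at runtime:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// // logical shift: zeros are shifted in from the left
/// let a = _mm256_set1_epi16(64);
/// let c = _mm256_srli_epi16(a, 4);
/// let expected = _mm256_set1_epi16(4);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///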
3198 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srli_epi16)
3199 #[inline]
3200 #[target_feature(enable = "avx2")]
3201 #[cfg_attr(test, assert_instr(vpsrlw))]
3202 #[stable(feature = "simd_x86", since = "1.27.0")]
3203 pub unsafe fn _mm256_srli_epi16(a: __m256i, imm8: i32) -> __m256i {
3204 transmute(psrliw(a.as_i16x16(), imm8))
3205 }
3206
3207 /// Shifts packed 32-bit integers in `a` right by `imm8` while shifting in
3208 /// zeros.
3209 ///
3210 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srli_epi32)
3211 #[inline]
3212 #[target_feature(enable = "avx2")]
3213 #[cfg_attr(test, assert_instr(vpsrld))]
3214 #[stable(feature = "simd_x86", since = "1.27.0")]
3215 pub unsafe fn _mm256_srli_epi32(a: __m256i, imm8: i32) -> __m256i {
3216 transmute(psrlid(a.as_i32x8(), imm8))
3217 }
3218
3219 /// Shifts packed 64-bit integers in `a` right by `imm8` while shifting in
3220 /// zeros.
3221 ///
3222 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srli_epi64)
3223 #[inline]
3224 #[target_feature(enable = "avx2")]
3225 #[cfg_attr(test, assert_instr(vpsrlq))]
3226 #[stable(feature = "simd_x86", since = "1.27.0")]
3227 pub unsafe fn _mm256_srli_epi64(a: __m256i, imm8: i32) -> __m256i {
3228 transmute(psrliq(a.as_i64x4(), imm8))
3229 }
3230
3231 /// Shifts packed 32-bit integers in `a` right by the amount specified by
3232 /// the corresponding element in `count` while shifting in zeros.
3233 ///
3234 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srlv_epi32)
3235 #[inline]
3236 #[target_feature(enable = "avx2")]
3237 #[cfg_attr(test, assert_instr(vpsrlvd))]
3238 #[stable(feature = "simd_x86", since = "1.27.0")]
3239 pub unsafe fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i {
3240 transmute(psrlvd(a.as_i32x4(), count.as_i32x4()))
3241 }
3242
3243 /// Shifts packed 32-bit integers in `a` right by the amount specified by
3244 /// the corresponding element in `count` while shifting in zeros.
3245 ///
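/// A minimal usage sketch (illustrative, not part of the upstream docs),
/// assuming AVX2 support has been detected at runtime:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// // each 32-bit element is shifted right logically by the matching
/// // element of `count`
/// let a = _mm256_set1_epi32(128);
/// let count = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
/// let c = _mm256_srlv_epi32(a, count);
/// let expected = _mm256_setr_epi32(128, 64, 32, 16, 8, 4, 2, 1);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///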
3246 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srlv_epi32)
3247 #[inline]
3248 #[target_feature(enable = "avx2")]
3249 #[cfg_attr(test, assert_instr(vpsrlvd))]
3250 #[stable(feature = "simd_x86", since = "1.27.0")]
3251 pub unsafe fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i {
3252 transmute(psrlvd256(a.as_i32x8(), count.as_i32x8()))
3253 }
3254
3255 /// Shifts packed 64-bit integers in `a` right by the amount specified by
3256 /// the corresponding element in `count` while shifting in zeros.
3257 ///
3258 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srlv_epi64)
3259 #[inline]
3260 #[target_feature(enable = "avx2")]
3261 #[cfg_attr(test, assert_instr(vpsrlvq))]
3262 #[stable(feature = "simd_x86", since = "1.27.0")]
3263 pub unsafe fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i {
3264 transmute(psrlvq(a.as_i64x2(), count.as_i64x2()))
3265 }
3266
3267 /// Shifts packed 64-bit integers in `a` right by the amount specified by
3268 /// the corresponding element in `count` while shifting in zeros.
3269 ///
3270 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srlv_epi64)
3271 #[inline]
3272 #[target_feature(enable = "avx2")]
3273 #[cfg_attr(test, assert_instr(vpsrlvq))]
3274 #[stable(feature = "simd_x86", since = "1.27.0")]
3275 pub unsafe fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i {
3276 transmute(psrlvq256(a.as_i64x4(), count.as_i64x4()))
3277 }
3278
3279 // TODO _mm256_stream_load_si256 (__m256i const* mem_addr)
3280
3281 /// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`.
3282 ///
3283 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_epi16)
3284 #[inline]
3285 #[target_feature(enable = "avx2")]
3286 #[cfg_attr(test, assert_instr(vpsubw))]
3287 #[stable(feature = "simd_x86", since = "1.27.0")]
3288 pub unsafe fn _mm256_sub_epi16(a: __m256i, b: __m256i) -> __m256i {
3289 transmute(simd_sub(a.as_i16x16(), b.as_i16x16()))
3290 }
3291
3292 /// Subtracts packed 32-bit integers in `b` from packed 32-bit integers in `a`.
3293 ///
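/// A minimal usage sketch (illustrative, not part of the upstream docs),
/// assuming AVX2 support has been detected at runtime:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// // element-wise wrapping subtraction, `a - b`
/// let a = _mm256_setr_epi32(10, 20, 30, 40, 50, 60, 70, 80);
/// let b = _mm256_set1_epi32(5);
/// let c = _mm256_sub_epi32(a, b);
/// let expected = _mm256_setr_epi32(5, 15, 25, 35, 45, 55, 65, 75);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///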
3294 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_epi32)
3295 #[inline]
3296 #[target_feature(enable = "avx2")]
3297 #[cfg_attr(test, assert_instr(vpsubd))]
3298 #[stable(feature = "simd_x86", since = "1.27.0")]
3299 pub unsafe fn _mm256_sub_epi32(a: __m256i, b: __m256i) -> __m256i {
3300 transmute(simd_sub(a.as_i32x8(), b.as_i32x8()))
3301 }
3302
3303 /// Subtracts packed 64-bit integers in `b` from packed 64-bit integers in `a`.
3304 ///
3305 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_epi64)
3306 #[inline]
3307 #[target_feature(enable = "avx2")]
3308 #[cfg_attr(test, assert_instr(vpsubq))]
3309 #[stable(feature = "simd_x86", since = "1.27.0")]
3310 pub unsafe fn _mm256_sub_epi64(a: __m256i, b: __m256i) -> __m256i {
3311 transmute(simd_sub(a.as_i64x4(), b.as_i64x4()))
3312 }
3313
3314 /// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`.
3315 ///
3316 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_epi8)
3317 #[inline]
3318 #[target_feature(enable = "avx2")]
3319 #[cfg_attr(test, assert_instr(vpsubb))]
3320 #[stable(feature = "simd_x86", since = "1.27.0")]
3321 pub unsafe fn _mm256_sub_epi8(a: __m256i, b: __m256i) -> __m256i {
3322 transmute(simd_sub(a.as_i8x32(), b.as_i8x32()))
3323 }
3324
3325 /// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in
3326 /// `a` using saturation.
3327 ///
3328 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_subs_epi16)
3329 #[inline]
3330 #[target_feature(enable = "avx2")]
3331 #[cfg_attr(test, assert_instr(vpsubsw))]
3332 #[stable(feature = "simd_x86", since = "1.27.0")]
3333 pub unsafe fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i {
3334 transmute(simd_saturating_sub(a.as_i16x16(), b.as_i16x16()))
3335 }
3336
3337 /// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in
3338 /// `a` using saturation.
3339 ///
3340 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_subs_epi8)
3341 #[inline]
3342 #[target_feature(enable = "avx2")]
3343 #[cfg_attr(test, assert_instr(vpsubsb))]
3344 #[stable(feature = "simd_x86", since = "1.27.0")]
3345 pub unsafe fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i {
3346 transmute(simd_saturating_sub(a.as_i8x32(), b.as_i8x32()))
3347 }
3348
3349 /// Subtracts packed unsigned 16-bit integers in `b` from packed unsigned
3350 /// 16-bit integers in `a` using saturation.
3351 ///
3352 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_subs_epu16)
3353 #[inline]
3354 #[target_feature(enable = "avx2")]
3355 #[cfg_attr(test, assert_instr(vpsubusw))]
3356 #[stable(feature = "simd_x86", since = "1.27.0")]
3357 pub unsafe fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> __m256i {
3358 transmute(simd_saturating_sub(a.as_u16x16(), b.as_u16x16()))
3359 }
3360
3361 /// Subtracts packed unsigned 8-bit integers in `b` from packed unsigned
3362 /// 8-bit integers in `a` using saturation.
3363 ///
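/// A minimal usage sketch (illustrative, not part of the upstream docs),
/// assuming AVX2 support has been detected at runtime:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// // unsigned saturating subtraction clamps at zero instead of wrapping
/// let a = _mm256_set1_epi8(1);
/// let b = _mm256_set1_epi8(2);
/// let c = _mm256_subs_epu8(a, b);
/// let expected = _mm256_set1_epi8(0);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///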
3364 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_subs_epu8)
3365 #[inline]
3366 #[target_feature(enable = "avx2")]
3367 #[cfg_attr(test, assert_instr(vpsubusb))]
3368 #[stable(feature = "simd_x86", since = "1.27.0")]
3369 pub unsafe fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i {
3370 transmute(simd_saturating_sub(a.as_u8x32(), b.as_u8x32()))
3371 }
3372
3373 /// Unpacks and interleaves 8-bit integers from the high half of each
3374 /// 128-bit lane of `a` and `b`.
3375 ///
3376 /// ```rust
3377 /// #[cfg(target_arch = "x86")]
3378 /// use std::arch::x86::*;
3379 /// #[cfg(target_arch = "x86_64")]
3380 /// use std::arch::x86_64::*;
3381 ///
3382 /// # fn main() {
3383 /// # if is_x86_feature_detected!("avx2") {
3384 /// # #[target_feature(enable = "avx2")]
3385 /// # unsafe fn worker() {
3386 /// let a = _mm256_setr_epi8(
3387 /// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
3388 /// 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3389 /// );
3390 /// let b = _mm256_setr_epi8(
3391 /// 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3392 /// -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
3393 /// -30, -31,
3394 /// );
3395 ///
3396 /// let c = _mm256_unpackhi_epi8(a, b);
3397 ///
3398 /// let expected = _mm256_setr_epi8(
3399 /// 8, -8, 9, -9, 10, -10, 11, -11, 12, -12, 13, -13, 14, -14, 15, -15,
3400 /// 24, -24, 25, -25, 26, -26, 27, -27, 28, -28, 29, -29, 30, -30, 31,
3401 /// -31,
3402 /// );
3403 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3404 ///
3405 /// # }
3406 /// # unsafe { worker(); }
3407 /// # }
3408 /// # }
3409 /// ```
3410 ///
3411 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_epi8)
3412 #[inline]
3413 #[target_feature(enable = "avx2")]
3414 #[cfg_attr(test, assert_instr(vpunpckhbw))]
3415 #[stable(feature = "simd_x86", since = "1.27.0")]
3416 pub unsafe fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i {
3417 #[rustfmt::skip]
3418 let r: i8x32 = simd_shuffle32(a.as_i8x32(), b.as_i8x32(), [
3419 8, 40, 9, 41, 10, 42, 11, 43,
3420 12, 44, 13, 45, 14, 46, 15, 47,
3421 24, 56, 25, 57, 26, 58, 27, 59,
3422 28, 60, 29, 61, 30, 62, 31, 63,
3423 ]);
3424 transmute(r)
3425 }
3426
3427 /// Unpacks and interleaves 8-bit integers from the low half of each
3428 /// 128-bit lane of `a` and `b`.
3429 ///
3430 /// ```rust
3431 /// #[cfg(target_arch = "x86")]
3432 /// use std::arch::x86::*;
3433 /// #[cfg(target_arch = "x86_64")]
3434 /// use std::arch::x86_64::*;
3435 ///
3436 /// # fn main() {
3437 /// # if is_x86_feature_detected!("avx2") {
3438 /// # #[target_feature(enable = "avx2")]
3439 /// # unsafe fn worker() {
3440 /// let a = _mm256_setr_epi8(
3441 /// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
3442 /// 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3443 /// );
3444 /// let b = _mm256_setr_epi8(
3445 /// 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3446 /// -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
3447 /// -30, -31,
3448 /// );
3449 ///
3450 /// let c = _mm256_unpacklo_epi8(a, b);
3451 ///
3452 /// let expected = _mm256_setr_epi8(
3453 /// 0, 0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7, 16, -16, 17,
3454 /// -17, 18, -18, 19, -19, 20, -20, 21, -21, 22, -22, 23, -23,
3455 /// );
3456 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3457 ///
3458 /// # }
3459 /// # unsafe { worker(); }
3460 /// # }
3461 /// # }
3462 /// ```
3463 ///
3464 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_epi8)
3465 #[inline]
3466 #[target_feature(enable = "avx2")]
3467 #[cfg_attr(test, assert_instr(vpunpcklbw))]
3468 #[stable(feature = "simd_x86", since = "1.27.0")]
3469 pub unsafe fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i {
3470 #[rustfmt::skip]
3471 let r: i8x32 = simd_shuffle32(a.as_i8x32(), b.as_i8x32(), [
3472 0, 32, 1, 33, 2, 34, 3, 35,
3473 4, 36, 5, 37, 6, 38, 7, 39,
3474 16, 48, 17, 49, 18, 50, 19, 51,
3475 20, 52, 21, 53, 22, 54, 23, 55,
3476 ]);
3477 transmute(r)
3478 }
3479
3480 /// Unpacks and interleaves 16-bit integers from the high half of each
3481 /// 128-bit lane of `a` and `b`.
3482 ///
3483 /// ```rust
3484 /// #[cfg(target_arch = "x86")]
3485 /// use std::arch::x86::*;
3486 /// #[cfg(target_arch = "x86_64")]
3487 /// use std::arch::x86_64::*;
3488 ///
3489 /// # fn main() {
3490 /// # if is_x86_feature_detected!("avx2") {
3491 /// # #[target_feature(enable = "avx2")]
3492 /// # unsafe fn worker() {
3493 /// let a = _mm256_setr_epi16(
3494 /// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3495 /// );
3496 /// let b = _mm256_setr_epi16(
3497 /// 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3498 /// );
3499 ///
3500 /// let c = _mm256_unpackhi_epi16(a, b);
3501 ///
3502 /// let expected = _mm256_setr_epi16(
3503 /// 4, -4, 5, -5, 6, -6, 7, -7, 12, -12, 13, -13, 14, -14, 15, -15,
3504 /// );
3505 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3506 ///
3507 /// # }
3508 /// # unsafe { worker(); }
3509 /// # }
3510 /// # }
3511 /// ```
3512 ///
3513 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_epi16)
3514 #[inline]
3515 #[target_feature(enable = "avx2")]
3516 #[cfg_attr(test, assert_instr(vpunpckhwd))]
3517 #[stable(feature = "simd_x86", since = "1.27.0")]
3518 pub unsafe fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i {
3519 let r: i16x16 = simd_shuffle16(
3520 a.as_i16x16(),
3521 b.as_i16x16(),
3522 [4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31],
3523 );
3524 transmute(r)
3525 }
3526
3527 /// Unpacks and interleaves 16-bit integers from the low half of each
3528 /// 128-bit lane of `a` and `b`.
3529 ///
3530 /// ```rust
3531 /// #[cfg(target_arch = "x86")]
3532 /// use std::arch::x86::*;
3533 /// #[cfg(target_arch = "x86_64")]
3534 /// use std::arch::x86_64::*;
3535 ///
3536 /// # fn main() {
3537 /// # if is_x86_feature_detected!("avx2") {
3538 /// # #[target_feature(enable = "avx2")]
3539 /// # unsafe fn worker() {
3540 ///
3541 /// let a = _mm256_setr_epi16(
3542 /// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3543 /// );
3544 /// let b = _mm256_setr_epi16(
3545 /// 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3546 /// );
3547 ///
3548 /// let c = _mm256_unpacklo_epi16(a, b);
3549 ///
3550 /// let expected = _mm256_setr_epi16(
3551 /// 0, 0, 1, -1, 2, -2, 3, -3, 8, -8, 9, -9, 10, -10, 11, -11,
3552 /// );
3553 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3554 ///
3555 /// # }
3556 /// # unsafe { worker(); }
3557 /// # }
3558 /// # }
3559 /// ```
3560 ///
3561 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_epi16)
3562 #[inline]
3563 #[target_feature(enable = "avx2")]
3564 #[cfg_attr(test, assert_instr(vpunpcklwd))]
3565 #[stable(feature = "simd_x86", since = "1.27.0")]
3566 pub unsafe fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i {
3567 let r: i16x16 = simd_shuffle16(
3568 a.as_i16x16(),
3569 b.as_i16x16(),
3570 [0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27],
3571 );
3572 transmute(r)
3573 }
3574
3575 /// Unpacks and interleaves 32-bit integers from the high half of each
3576 /// 128-bit lane of `a` and `b`.
3577 ///
3578 /// ```rust
3579 /// #[cfg(target_arch = "x86")]
3580 /// use std::arch::x86::*;
3581 /// #[cfg(target_arch = "x86_64")]
3582 /// use std::arch::x86_64::*;
3583 ///
3584 /// # fn main() {
3585 /// # if is_x86_feature_detected!("avx2") {
3586 /// # #[target_feature(enable = "avx2")]
3587 /// # unsafe fn worker() {
3588 /// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
3589 /// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7);
3590 ///
3591 /// let c = _mm256_unpackhi_epi32(a, b);
3592 ///
3593 /// let expected = _mm256_setr_epi32(2, -2, 3, -3, 6, -6, 7, -7);
3594 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3595 ///
3596 /// # }
3597 /// # unsafe { worker(); }
3598 /// # }
3599 /// # }
3600 /// ```
3601 ///
3602 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_epi32)
3603 #[inline]
3604 #[target_feature(enable = "avx2")]
3605 #[cfg_attr(test, assert_instr(vunpckhps))]
3606 #[stable(feature = "simd_x86", since = "1.27.0")]
3607 pub unsafe fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i {
3608 let r: i32x8 = simd_shuffle8(a.as_i32x8(), b.as_i32x8(), [2, 10, 3, 11, 6, 14, 7, 15]);
3609 transmute(r)
3610 }
3611
3612 /// Unpacks and interleaves 32-bit integers from the low half of each
3613 /// 128-bit lane of `a` and `b`.
3614 ///
3615 /// ```rust
3616 /// #[cfg(target_arch = "x86")]
3617 /// use std::arch::x86::*;
3618 /// #[cfg(target_arch = "x86_64")]
3619 /// use std::arch::x86_64::*;
3620 ///
3621 /// # fn main() {
3622 /// # if is_x86_feature_detected!("avx2") {
3623 /// # #[target_feature(enable = "avx2")]
3624 /// # unsafe fn worker() {
3625 /// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
3626 /// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7);
3627 ///
3628 /// let c = _mm256_unpacklo_epi32(a, b);
3629 ///
3630 /// let expected = _mm256_setr_epi32(0, 0, 1, -1, 4, -4, 5, -5);
3631 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3632 ///
3633 /// # }
3634 /// # unsafe { worker(); }
3635 /// # }
3636 /// # }
3637 /// ```
3638 ///
3639 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_epi32)
3640 #[inline]
3641 #[target_feature(enable = "avx2")]
3642 #[cfg_attr(test, assert_instr(vunpcklps))]
3643 #[stable(feature = "simd_x86", since = "1.27.0")]
3644 pub unsafe fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i {
3645 let r: i32x8 = simd_shuffle8(a.as_i32x8(), b.as_i32x8(), [0, 8, 1, 9, 4, 12, 5, 13]);
3646 transmute(r)
3647 }
3648
3649 /// Unpacks and interleaves 64-bit integers from the high half of each
3650 /// 128-bit lane of `a` and `b`.
3651 ///
3652 /// ```rust
3653 /// #[cfg(target_arch = "x86")]
3654 /// use std::arch::x86::*;
3655 /// #[cfg(target_arch = "x86_64")]
3656 /// use std::arch::x86_64::*;
3657 ///
3658 /// # fn main() {
3659 /// # if is_x86_feature_detected!("avx2") {
3660 /// # #[target_feature(enable = "avx2")]
3661 /// # unsafe fn worker() {
3662 /// let a = _mm256_setr_epi64x(0, 1, 2, 3);
3663 /// let b = _mm256_setr_epi64x(0, -1, -2, -3);
3664 ///
3665 /// let c = _mm256_unpackhi_epi64(a, b);
3666 ///
3667 /// let expected = _mm256_setr_epi64x(1, -1, 3, -3);
3668 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3669 ///
3670 /// # }
3671 /// # unsafe { worker(); }
3672 /// # }
3673 /// # }
3674 /// ```
3675 ///
3676 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_epi64)
3677 #[inline]
3678 #[target_feature(enable = "avx2")]
3679 #[cfg_attr(test, assert_instr(vunpckhpd))]
3680 #[stable(feature = "simd_x86", since = "1.27.0")]
3681 pub unsafe fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i {
3682 let r: i64x4 = simd_shuffle4(a.as_i64x4(), b.as_i64x4(), [1, 5, 3, 7]);
3683 transmute(r)
3684 }
3685
3686 /// Unpacks and interleaves 64-bit integers from the low half of each
3687 /// 128-bit lane of `a` and `b`.
3688 ///
3689 /// ```rust
3690 /// #[cfg(target_arch = "x86")]
3691 /// use std::arch::x86::*;
3692 /// #[cfg(target_arch = "x86_64")]
3693 /// use std::arch::x86_64::*;
3694 ///
3695 /// # fn main() {
3696 /// # if is_x86_feature_detected!("avx2") {
3697 /// # #[target_feature(enable = "avx2")]
3698 /// # unsafe fn worker() {
3699 /// let a = _mm256_setr_epi64x(0, 1, 2, 3);
3700 /// let b = _mm256_setr_epi64x(0, -1, -2, -3);
3701 ///
3702 /// let c = _mm256_unpacklo_epi64(a, b);
3703 ///
3704 /// let expected = _mm256_setr_epi64x(0, 0, 2, -2);
3705 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3706 ///
3707 /// # }
3708 /// # unsafe { worker(); }
3709 /// # }
3710 /// # }
3711 /// ```
3712 ///
3713 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_epi64)
3714 #[inline]
3715 #[target_feature(enable = "avx2")]
3716 #[cfg_attr(test, assert_instr(vunpcklpd))]
3717 #[stable(feature = "simd_x86", since = "1.27.0")]
3718 pub unsafe fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i {
3719 let r: i64x4 = simd_shuffle4(a.as_i64x4(), b.as_i64x4(), [0, 4, 2, 6]);
3720 transmute(r)
3721 }
3722
3723 /// Computes the bitwise XOR of 256 bits (representing integer data)
3724 /// in `a` and `b`.
3725 ///
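/// A minimal usage sketch (illustrative, not part of the upstream docs),
/// assuming AVX2 support has been detected at runtime:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// // the XOR operates on the full 256 bits regardless of element size
/// let a = _mm256_set1_epi8(0b0101);
/// let b = _mm256_set1_epi8(0b0011);
/// let c = _mm256_xor_si256(a, b);
/// let expected = _mm256_set1_epi8(0b0110);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///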
3726 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_xor_si256)
3727 #[inline]
3728 #[target_feature(enable = "avx2")]
3729 #[cfg_attr(test, assert_instr(vxorps))]
3730 #[stable(feature = "simd_x86", since = "1.27.0")]
3731 pub unsafe fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i {
3732 transmute(simd_xor(a.as_i64x4(), b.as_i64x4()))
3733 }
3734
3735 /// Extracts an 8-bit integer from `a`, selected with `imm8`. Returns a 32-bit
3736 /// integer containing the zero-extended integer data.
3737 ///
3738 /// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
3739 ///
3740 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extract_epi8)
3741 #[inline]
3742 #[target_feature(enable = "avx2")]
3743 // This intrinsic has no corresponding instruction.
3744 #[rustc_args_required_const(1)]
3745 #[stable(feature = "simd_x86", since = "1.27.0")]
3746 pub unsafe fn _mm256_extract_epi8(a: __m256i, imm8: i32) -> i8 {
3747 let imm8 = (imm8 & 31) as u32;
3748 simd_extract(a.as_i8x32(), imm8)
3749 }
3750
3751 /// Extracts a 16-bit integer from `a`, selected with `imm8`. Returns a 32-bit
3752 /// integer containing the zero-extended integer data.
3753 ///
3754 /// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
3755 ///
3756 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extract_epi16)
3757 #[inline]
3758 #[target_feature(enable = "avx2")]
3759 // This intrinsic has no corresponding instruction.
3760 #[rustc_args_required_const(1)]
3761 #[stable(feature = "simd_x86", since = "1.27.0")]
3762 pub unsafe fn _mm256_extract_epi16(a: __m256i, imm8: i32) -> i16 {
3763 let imm8 = (imm8 & 15) as u32;
3764 simd_extract(a.as_i16x16(), imm8)
3765 }
3766
3767 /// Extracts a 32-bit integer from `a`, selected with `imm8`.
3768 ///
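/// A minimal usage sketch (illustrative, not part of the upstream docs),
/// assuming AVX2 support has been detected at runtime:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// // `imm8` selects one of the eight 32-bit elements
/// let a = _mm256_setr_epi32(0, 10, 20, 30, 40, 50, 60, 70);
/// assert_eq!(_mm256_extract_epi32(a, 3), 30);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///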
3769 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extract_epi32)
3770 #[inline]
3771 #[target_feature(enable = "avx2")]
3772 // This intrinsic has no corresponding instruction.
3773 #[rustc_args_required_const(1)]
3774 #[stable(feature = "simd_x86", since = "1.27.0")]
3775 pub unsafe fn _mm256_extract_epi32(a: __m256i, imm8: i32) -> i32 {
3776 let imm8 = (imm8 & 7) as u32;
3777 simd_extract(a.as_i32x8(), imm8)
3778 }
3779
3780 /// Returns the first element of the input vector of `[4 x double]`.
3781 ///
3782 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsd_f64)
3783 #[inline]
3784 #[target_feature(enable = "avx2")]
3785 //#[cfg_attr(test, assert_instr(movsd))] FIXME
3786 #[stable(feature = "simd_x86", since = "1.27.0")]
3787 pub unsafe fn _mm256_cvtsd_f64(a: __m256d) -> f64 {
3788 simd_extract(a, 0)
3789 }
3790
3791 /// Returns the first element of the input vector of `[8 x i32]`.
3792 ///
3793 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsi256_si32)
3794 #[inline]
3795 #[target_feature(enable = "avx2")]
3796 //#[cfg_attr(test, assert_instr(movd))] FIXME
3797 #[stable(feature = "simd_x86", since = "1.27.0")]
3798 pub unsafe fn _mm256_cvtsi256_si32(a: __m256i) -> i32 {
3799 simd_extract(a.as_i32x8(), 0)
3800 }
3801
3802 #[allow(improper_ctypes)]
3803 extern "C" {
3804 #[link_name = "llvm.x86.avx2.pabs.b"]
3805 fn pabsb(a: i8x32) -> u8x32;
3806 #[link_name = "llvm.x86.avx2.pabs.w"]
3807 fn pabsw(a: i16x16) -> u16x16;
3808 #[link_name = "llvm.x86.avx2.pabs.d"]
3809 fn pabsd(a: i32x8) -> u32x8;
3810 #[link_name = "llvm.x86.avx2.pavg.b"]
3811 fn pavgb(a: u8x32, b: u8x32) -> u8x32;
3812 #[link_name = "llvm.x86.avx2.pavg.w"]
3813 fn pavgw(a: u16x16, b: u16x16) -> u16x16;
3814 #[link_name = "llvm.x86.avx2.pblendvb"]
3815 fn pblendvb(a: i8x32, b: i8x32, mask: i8x32) -> i8x32;
3816 #[link_name = "llvm.x86.avx2.phadd.w"]
3817 fn phaddw(a: i16x16, b: i16x16) -> i16x16;
3818 #[link_name = "llvm.x86.avx2.phadd.d"]
3819 fn phaddd(a: i32x8, b: i32x8) -> i32x8;
3820 #[link_name = "llvm.x86.avx2.phadd.sw"]
3821 fn phaddsw(a: i16x16, b: i16x16) -> i16x16;
3822 #[link_name = "llvm.x86.avx2.phsub.w"]
3823 fn phsubw(a: i16x16, b: i16x16) -> i16x16;
3824 #[link_name = "llvm.x86.avx2.phsub.d"]
3825 fn phsubd(a: i32x8, b: i32x8) -> i32x8;
3826 #[link_name = "llvm.x86.avx2.phsub.sw"]
3827 fn phsubsw(a: i16x16, b: i16x16) -> i16x16;
3828 #[link_name = "llvm.x86.avx2.pmadd.wd"]
3829 fn pmaddwd(a: i16x16, b: i16x16) -> i32x8;
3830 #[link_name = "llvm.x86.avx2.pmadd.ub.sw"]
3831 fn pmaddubsw(a: u8x32, b: u8x32) -> i16x16;
3832 #[link_name = "llvm.x86.avx2.maskload.d"]
3833 fn maskloadd(mem_addr: *const i8, mask: i32x4) -> i32x4;
3834 #[link_name = "llvm.x86.avx2.maskload.d.256"]
3835 fn maskloadd256(mem_addr: *const i8, mask: i32x8) -> i32x8;
3836 #[link_name = "llvm.x86.avx2.maskload.q"]
3837 fn maskloadq(mem_addr: *const i8, mask: i64x2) -> i64x2;
3838 #[link_name = "llvm.x86.avx2.maskload.q.256"]
3839 fn maskloadq256(mem_addr: *const i8, mask: i64x4) -> i64x4;
3840 #[link_name = "llvm.x86.avx2.maskstore.d"]
3841 fn maskstored(mem_addr: *mut i8, mask: i32x4, a: i32x4);
3842 #[link_name = "llvm.x86.avx2.maskstore.d.256"]
3843 fn maskstored256(mem_addr: *mut i8, mask: i32x8, a: i32x8);
3844 #[link_name = "llvm.x86.avx2.maskstore.q"]
3845 fn maskstoreq(mem_addr: *mut i8, mask: i64x2, a: i64x2);
3846 #[link_name = "llvm.x86.avx2.maskstore.q.256"]
3847 fn maskstoreq256(mem_addr: *mut i8, mask: i64x4, a: i64x4);
3848 #[link_name = "llvm.x86.avx2.pmaxs.w"]
3849 fn pmaxsw(a: i16x16, b: i16x16) -> i16x16;
3850 #[link_name = "llvm.x86.avx2.pmaxs.d"]
3851 fn pmaxsd(a: i32x8, b: i32x8) -> i32x8;
3852 #[link_name = "llvm.x86.avx2.pmaxs.b"]
3853 fn pmaxsb(a: i8x32, b: i8x32) -> i8x32;
3854 #[link_name = "llvm.x86.avx2.pmaxu.w"]
3855 fn pmaxuw(a: u16x16, b: u16x16) -> u16x16;
3856 #[link_name = "llvm.x86.avx2.pmaxu.d"]
3857 fn pmaxud(a: u32x8, b: u32x8) -> u32x8;
3858 #[link_name = "llvm.x86.avx2.pmaxu.b"]
3859 fn pmaxub(a: u8x32, b: u8x32) -> u8x32;
3860 #[link_name = "llvm.x86.avx2.pmins.w"]
3861 fn pminsw(a: i16x16, b: i16x16) -> i16x16;
3862 #[link_name = "llvm.x86.avx2.pmins.d"]
3863 fn pminsd(a: i32x8, b: i32x8) -> i32x8;
3864 #[link_name = "llvm.x86.avx2.pmins.b"]
3865 fn pminsb(a: i8x32, b: i8x32) -> i8x32;
3866 #[link_name = "llvm.x86.avx2.pminu.w"]
3867 fn pminuw(a: u16x16, b: u16x16) -> u16x16;
3868 #[link_name = "llvm.x86.avx2.pminu.d"]
3869 fn pminud(a: u32x8, b: u32x8) -> u32x8;
3870 #[link_name = "llvm.x86.avx2.pminu.b"]
3871 fn pminub(a: u8x32, b: u8x32) -> u8x32;
3872 #[link_name = "llvm.x86.avx2.pmovmskb"]
3873 fn pmovmskb(a: i8x32) -> i32;
3874 #[link_name = "llvm.x86.avx2.mpsadbw"]
3875 fn mpsadbw(a: u8x32, b: u8x32, imm8: i32) -> u16x16;
3876 #[link_name = "llvm.x86.avx2.pmulhu.w"]
3877 fn pmulhuw(a: u16x16, b: u16x16) -> u16x16;
3878 #[link_name = "llvm.x86.avx2.pmulh.w"]
3879 fn pmulhw(a: i16x16, b: i16x16) -> i16x16;
3880 #[link_name = "llvm.x86.avx2.pmul.dq"]
3881 fn pmuldq(a: i32x8, b: i32x8) -> i64x4;
3882 #[link_name = "llvm.x86.avx2.pmulu.dq"]
3883 fn pmuludq(a: u32x8, b: u32x8) -> u64x4;
3884 #[link_name = "llvm.x86.avx2.pmul.hr.sw"]
3885 fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16;
3886 #[link_name = "llvm.x86.avx2.packsswb"]
3887 fn packsswb(a: i16x16, b: i16x16) -> i8x32;
3888 #[link_name = "llvm.x86.avx2.packssdw"]
3889 fn packssdw(a: i32x8, b: i32x8) -> i16x16;
3890 #[link_name = "llvm.x86.avx2.packuswb"]
3891 fn packuswb(a: i16x16, b: i16x16) -> u8x32;
3892 #[link_name = "llvm.x86.avx2.packusdw"]
3893 fn packusdw(a: i32x8, b: i32x8) -> u16x16;
3894 #[link_name = "llvm.x86.avx2.psad.bw"]
3895 fn psadbw(a: u8x32, b: u8x32) -> u64x4;
3896 #[link_name = "llvm.x86.avx2.psign.b"]
3897 fn psignb(a: i8x32, b: i8x32) -> i8x32;
3898 #[link_name = "llvm.x86.avx2.psign.w"]
3899 fn psignw(a: i16x16, b: i16x16) -> i16x16;
3900 #[link_name = "llvm.x86.avx2.psign.d"]
3901 fn psignd(a: i32x8, b: i32x8) -> i32x8;
3902 #[link_name = "llvm.x86.avx2.psll.w"]
3903 fn psllw(a: i16x16, count: i16x8) -> i16x16;
3904 #[link_name = "llvm.x86.avx2.psll.d"]
3905 fn pslld(a: i32x8, count: i32x4) -> i32x8;
3906 #[link_name = "llvm.x86.avx2.psll.q"]
3907 fn psllq(a: i64x4, count: i64x2) -> i64x4;
3908 #[link_name = "llvm.x86.avx2.pslli.w"]
3909 fn pslliw(a: i16x16, imm8: i32) -> i16x16;
3910 #[link_name = "llvm.x86.avx2.pslli.d"]
3911 fn psllid(a: i32x8, imm8: i32) -> i32x8;
3912 #[link_name = "llvm.x86.avx2.pslli.q"]
3913 fn pslliq(a: i64x4, imm8: i32) -> i64x4;
3914 #[link_name = "llvm.x86.avx2.psllv.d"]
3915 fn psllvd(a: i32x4, count: i32x4) -> i32x4;
3916 #[link_name = "llvm.x86.avx2.psllv.d.256"]
3917 fn psllvd256(a: i32x8, count: i32x8) -> i32x8;
3918 #[link_name = "llvm.x86.avx2.psllv.q"]
3919 fn psllvq(a: i64x2, count: i64x2) -> i64x2;
3920 #[link_name = "llvm.x86.avx2.psllv.q.256"]
3921 fn psllvq256(a: i64x4, count: i64x4) -> i64x4;
3922 #[link_name = "llvm.x86.avx2.psra.w"]
3923 fn psraw(a: i16x16, count: i16x8) -> i16x16;
3924 #[link_name = "llvm.x86.avx2.psra.d"]
3925 fn psrad(a: i32x8, count: i32x4) -> i32x8;
3926 #[link_name = "llvm.x86.avx2.psrai.w"]
3927 fn psraiw(a: i16x16, imm8: i32) -> i16x16;
3928 #[link_name = "llvm.x86.avx2.psrai.d"]
3929 fn psraid(a: i32x8, imm8: i32) -> i32x8;
3930 #[link_name = "llvm.x86.avx2.psrav.d"]
3931 fn psravd(a: i32x4, count: i32x4) -> i32x4;
3932 #[link_name = "llvm.x86.avx2.psrav.d.256"]
3933 fn psravd256(a: i32x8, count: i32x8) -> i32x8;
3934 #[link_name = "llvm.x86.avx2.psrl.w"]
3935 fn psrlw(a: i16x16, count: i16x8) -> i16x16;
3936 #[link_name = "llvm.x86.avx2.psrl.d"]
3937 fn psrld(a: i32x8, count: i32x4) -> i32x8;
3938 #[link_name = "llvm.x86.avx2.psrl.q"]
3939 fn psrlq(a: i64x4, count: i64x2) -> i64x4;
3940 #[link_name = "llvm.x86.avx2.psrli.w"]
3941 fn psrliw(a: i16x16, imm8: i32) -> i16x16;
3942 #[link_name = "llvm.x86.avx2.psrli.d"]
3943 fn psrlid(a: i32x8, imm8: i32) -> i32x8;
3944 #[link_name = "llvm.x86.avx2.psrli.q"]
3945 fn psrliq(a: i64x4, imm8: i32) -> i64x4;
3946 #[link_name = "llvm.x86.avx2.psrlv.d"]
3947 fn psrlvd(a: i32x4, count: i32x4) -> i32x4;
3948 #[link_name = "llvm.x86.avx2.psrlv.d.256"]
3949 fn psrlvd256(a: i32x8, count: i32x8) -> i32x8;
3950 #[link_name = "llvm.x86.avx2.psrlv.q"]
3951 fn psrlvq(a: i64x2, count: i64x2) -> i64x2;
3952 #[link_name = "llvm.x86.avx2.psrlv.q.256"]
3953 fn psrlvq256(a: i64x4, count: i64x4) -> i64x4;
3954 #[link_name = "llvm.x86.avx2.pshuf.b"]
3955 fn pshufb(a: u8x32, b: u8x32) -> u8x32;
3956 #[link_name = "llvm.x86.avx2.permd"]
3957 fn permd(a: u32x8, b: u32x8) -> u32x8;
3958 #[link_name = "llvm.x86.avx2.permps"]
3959 fn permps(a: __m256, b: i32x8) -> __m256;
3960 #[link_name = "llvm.x86.avx2.vperm2i128"]
3961 fn vperm2i128(a: i64x4, b: i64x4, imm8: i8) -> i64x4;
3962 #[link_name = "llvm.x86.avx2.gather.d.d"]
3963 fn pgatherdd(src: i32x4, slice: *const i8, offsets: i32x4, mask: i32x4, scale: i8) -> i32x4;
3964 #[link_name = "llvm.x86.avx2.gather.d.d.256"]
3965 fn vpgatherdd(src: i32x8, slice: *const i8, offsets: i32x8, mask: i32x8, scale: i8) -> i32x8;
3966 #[link_name = "llvm.x86.avx2.gather.d.q"]
3967 fn pgatherdq(src: i64x2, slice: *const i8, offsets: i32x4, mask: i64x2, scale: i8) -> i64x2;
3968 #[link_name = "llvm.x86.avx2.gather.d.q.256"]
3969 fn vpgatherdq(src: i64x4, slice: *const i8, offsets: i32x4, mask: i64x4, scale: i8) -> i64x4;
3970 #[link_name = "llvm.x86.avx2.gather.q.d"]
3971 fn pgatherqd(src: i32x4, slice: *const i8, offsets: i64x2, mask: i32x4, scale: i8) -> i32x4;
3972 #[link_name = "llvm.x86.avx2.gather.q.d.256"]
3973 fn vpgatherqd(src: i32x4, slice: *const i8, offsets: i64x4, mask: i32x4, scale: i8) -> i32x4;
3974 #[link_name = "llvm.x86.avx2.gather.q.q"]
3975 fn pgatherqq(src: i64x2, slice: *const i8, offsets: i64x2, mask: i64x2, scale: i8) -> i64x2;
3976 #[link_name = "llvm.x86.avx2.gather.q.q.256"]
3977 fn vpgatherqq(src: i64x4, slice: *const i8, offsets: i64x4, mask: i64x4, scale: i8) -> i64x4;
3978 #[link_name = "llvm.x86.avx2.gather.d.pd"]
3979 fn pgatherdpd(
3980 src: __m128d,
3981 slice: *const i8,
3982 offsets: i32x4,
3983 mask: __m128d,
3984 scale: i8,
3985 ) -> __m128d;
3986 #[link_name = "llvm.x86.avx2.gather.d.pd.256"]
3987 fn vpgatherdpd(
3988 src: __m256d,
3989 slice: *const i8,
3990 offsets: i32x4,
3991 mask: __m256d,
3992 scale: i8,
3993 ) -> __m256d;
3994 #[link_name = "llvm.x86.avx2.gather.q.pd"]
3995 fn pgatherqpd(
3996 src: __m128d,
3997 slice: *const i8,
3998 offsets: i64x2,
3999 mask: __m128d,
4000 scale: i8,
4001 ) -> __m128d;
4002 #[link_name = "llvm.x86.avx2.gather.q.pd.256"]
4003 fn vpgatherqpd(
4004 src: __m256d,
4005 slice: *const i8,
4006 offsets: i64x4,
4007 mask: __m256d,
4008 scale: i8,
4009 ) -> __m256d;
4010 #[link_name = "llvm.x86.avx2.gather.d.ps"]
4011 fn pgatherdps(src: __m128, slice: *const i8, offsets: i32x4, mask: __m128, scale: i8)
4012 -> __m128;
4013 #[link_name = "llvm.x86.avx2.gather.d.ps.256"]
4014 fn vpgatherdps(
4015 src: __m256,
4016 slice: *const i8,
4017 offsets: i32x8,
4018 mask: __m256,
4019 scale: i8,
4020 ) -> __m256;
4021 #[link_name = "llvm.x86.avx2.gather.q.ps"]
4022 fn pgatherqps(src: __m128, slice: *const i8, offsets: i64x2, mask: __m128, scale: i8)
4023 -> __m128;
4024 #[link_name = "llvm.x86.avx2.gather.q.ps.256"]
4025 fn vpgatherqps(
4026 src: __m128,
4027 slice: *const i8,
4028 offsets: i64x4,
4029 mask: __m128,
4030 scale: i8,
4031 ) -> __m128;
4032 #[link_name = "llvm.x86.avx2.psll.dq"]
4033 fn vpslldq(a: i64x4, b: i32) -> i64x4;
4034 #[link_name = "llvm.x86.avx2.psrl.dq"]
4035 fn vpsrldq(a: i64x4, b: i32) -> i64x4;
4036 }
4037
4038 #[cfg(test)]
4039 mod tests {
4040 use std;
4041 use stdarch_test::simd_test;
4042
4043 use crate::core_arch::x86::*;
4044
4045 #[simd_test(enable = "avx2")]
4046 unsafe fn test_mm256_abs_epi32() {
4047 #[rustfmt::skip]
4048 let a = _mm256_setr_epi32(
4049 0, 1, -1, i32::MAX,
4050 i32::MIN, 100, -100, -32,
4051 );
4052 let r = _mm256_abs_epi32(a);
4053 #[rustfmt::skip]
4054 let e = _mm256_setr_epi32(
4055 0, 1, 1, i32::MAX,
4056 i32::MAX.wrapping_add(1), 100, 100, 32,
4057 );
4058 assert_eq_m256i(r, e);
4059 }
4060
4061 #[simd_test(enable = "avx2")]
4062 unsafe fn test_mm256_abs_epi16() {
4063 #[rustfmt::skip]
4064 let a = _mm256_setr_epi16(
4065 0, 1, -1, 2, -2, 3, -3, 4,
4066 -4, 5, -5, i16::MAX, i16::MIN, 100, -100, -32,
4067 );
4068 let r = _mm256_abs_epi16(a);
4069 #[rustfmt::skip]
4070 let e = _mm256_setr_epi16(
4071 0, 1, 1, 2, 2, 3, 3, 4,
4072 4, 5, 5, i16::MAX, i16::MAX.wrapping_add(1), 100, 100, 32,
4073 );
4074 assert_eq_m256i(r, e);
4075 }
4076
4077 #[simd_test(enable = "avx2")]
4078 unsafe fn test_mm256_abs_epi8() {
4079 #[rustfmt::skip]
4080 let a = _mm256_setr_epi8(
4081 0, 1, -1, 2, -2, 3, -3, 4,
4082 -4, 5, -5, i8::MAX, i8::MIN, 100, -100, -32,
4083 0, 1, -1, 2, -2, 3, -3, 4,
4084 -4, 5, -5, i8::MAX, i8::MIN, 100, -100, -32,
4085 );
4086 let r = _mm256_abs_epi8(a);
4087 #[rustfmt::skip]
4088 let e = _mm256_setr_epi8(
4089 0, 1, 1, 2, 2, 3, 3, 4,
4090 4, 5, 5, i8::MAX, i8::MAX.wrapping_add(1), 100, 100, 32,
4091 0, 1, 1, 2, 2, 3, 3, 4,
4092 4, 5, 5, i8::MAX, i8::MAX.wrapping_add(1), 100, 100, 32,
4093 );
4094 assert_eq_m256i(r, e);
4095 }
4096
4097 #[simd_test(enable = "avx2")]
4098 unsafe fn test_mm256_add_epi64() {
4099 let a = _mm256_setr_epi64x(-10, 0, 100, 1_000_000_000);
4100 let b = _mm256_setr_epi64x(-1, 0, 1, 2);
4101 let r = _mm256_add_epi64(a, b);
4102 let e = _mm256_setr_epi64x(-11, 0, 101, 1_000_000_002);
4103 assert_eq_m256i(r, e);
4104 }
4105
4106 #[simd_test(enable = "avx2")]
4107 unsafe fn test_mm256_add_epi32() {
4108 let a = _mm256_setr_epi32(-1, 0, 1, 2, 3, 4, 5, 6);
4109 let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4110 let r = _mm256_add_epi32(a, b);
4111 let e = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14);
4112 assert_eq_m256i(r, e);
4113 }
4114
4115 #[simd_test(enable = "avx2")]
4116 unsafe fn test_mm256_add_epi16() {
4117 #[rustfmt::skip]
4118 let a = _mm256_setr_epi16(
4119 0, 1, 2, 3, 4, 5, 6, 7,
4120 8, 9, 10, 11, 12, 13, 14, 15,
4121 );
4122 #[rustfmt::skip]
4123 let b = _mm256_setr_epi16(
4124 0, 1, 2, 3, 4, 5, 6, 7,
4125 8, 9, 10, 11, 12, 13, 14, 15,
4126 );
4127 let r = _mm256_add_epi16(a, b);
4128 #[rustfmt::skip]
4129 let e = _mm256_setr_epi16(
4130 0, 2, 4, 6, 8, 10, 12, 14,
4131 16, 18, 20, 22, 24, 26, 28, 30,
4132 );
4133 assert_eq_m256i(r, e);
4134 }
4135
4136 #[simd_test(enable = "avx2")]
4137 unsafe fn test_mm256_add_epi8() {
4138 #[rustfmt::skip]
4139 let a = _mm256_setr_epi8(
4140 0, 1, 2, 3, 4, 5, 6, 7,
4141 8, 9, 10, 11, 12, 13, 14, 15,
4142 16, 17, 18, 19, 20, 21, 22, 23,
4143 24, 25, 26, 27, 28, 29, 30, 31,
4144 );
4145 #[rustfmt::skip]
4146 let b = _mm256_setr_epi8(
4147 0, 1, 2, 3, 4, 5, 6, 7,
4148 8, 9, 10, 11, 12, 13, 14, 15,
4149 16, 17, 18, 19, 20, 21, 22, 23,
4150 24, 25, 26, 27, 28, 29, 30, 31,
4151 );
4152 let r = _mm256_add_epi8(a, b);
4153 #[rustfmt::skip]
4154 let e = _mm256_setr_epi8(
4155 0, 2, 4, 6, 8, 10, 12, 14,
4156 16, 18, 20, 22, 24, 26, 28, 30,
4157 32, 34, 36, 38, 40, 42, 44, 46,
4158 48, 50, 52, 54, 56, 58, 60, 62,
4159 );
4160 assert_eq_m256i(r, e);
4161 }
4162
4163 #[simd_test(enable = "avx2")]
4164 unsafe fn test_mm256_adds_epi8() {
4165 #[rustfmt::skip]
4166 let a = _mm256_setr_epi8(
4167 0, 1, 2, 3, 4, 5, 6, 7,
4168 8, 9, 10, 11, 12, 13, 14, 15,
4169 16, 17, 18, 19, 20, 21, 22, 23,
4170 24, 25, 26, 27, 28, 29, 30, 31,
4171 );
4172 #[rustfmt::skip]
4173 let b = _mm256_setr_epi8(
4174 32, 33, 34, 35, 36, 37, 38, 39,
4175 40, 41, 42, 43, 44, 45, 46, 47,
4176 48, 49, 50, 51, 52, 53, 54, 55,
4177 56, 57, 58, 59, 60, 61, 62, 63,
4178 );
4179 let r = _mm256_adds_epi8(a, b);
4180 #[rustfmt::skip]
4181 let e = _mm256_setr_epi8(
4182 32, 34, 36, 38, 40, 42, 44, 46,
4183 48, 50, 52, 54, 56, 58, 60, 62,
4184 64, 66, 68, 70, 72, 74, 76, 78,
4185 80, 82, 84, 86, 88, 90, 92, 94,
4186 );
4187 assert_eq_m256i(r, e);
4188 }
4189
4190 #[simd_test(enable = "avx2")]
4191 unsafe fn test_mm256_adds_epi8_saturate_positive() {
4192 let a = _mm256_set1_epi8(0x7F);
4193 let b = _mm256_set1_epi8(1);
4194 let r = _mm256_adds_epi8(a, b);
4195 assert_eq_m256i(r, a);
4196 }
4197
4198 #[simd_test(enable = "avx2")]
4199 unsafe fn test_mm256_adds_epi8_saturate_negative() {
4200 let a = _mm256_set1_epi8(-0x80);
4201 let b = _mm256_set1_epi8(-1);
4202 let r = _mm256_adds_epi8(a, b);
4203 assert_eq_m256i(r, a);
4204 }
4205
4206 #[simd_test(enable = "avx2")]
4207 unsafe fn test_mm256_adds_epi16() {
4208 #[rustfmt::skip]
4209 let a = _mm256_setr_epi16(
4210 0, 1, 2, 3, 4, 5, 6, 7,
4211 8, 9, 10, 11, 12, 13, 14, 15,
4212 );
4213 #[rustfmt::skip]
4214 let b = _mm256_setr_epi16(
4215 32, 33, 34, 35, 36, 37, 38, 39,
4216 40, 41, 42, 43, 44, 45, 46, 47,
4217 );
4218 let r = _mm256_adds_epi16(a, b);
4219 #[rustfmt::skip]
4220 let e = _mm256_setr_epi16(
4221 32, 34, 36, 38, 40, 42, 44, 46,
4222 48, 50, 52, 54, 56, 58, 60, 62,
4223 );
4224
4225 assert_eq_m256i(r, e);
4226 }
4227
4228 #[simd_test(enable = "avx2")]
4229 unsafe fn test_mm256_adds_epi16_saturate_positive() {
4230 let a = _mm256_set1_epi16(0x7FFF);
4231 let b = _mm256_set1_epi16(1);
4232 let r = _mm256_adds_epi16(a, b);
4233 assert_eq_m256i(r, a);
4234 }
4235
4236 #[simd_test(enable = "avx2")]
4237 unsafe fn test_mm256_adds_epi16_saturate_negative() {
4238 let a = _mm256_set1_epi16(-0x8000);
4239 let b = _mm256_set1_epi16(-1);
4240 let r = _mm256_adds_epi16(a, b);
4241 assert_eq_m256i(r, a);
4242 }
4243
4244 #[simd_test(enable = "avx2")]
4245 unsafe fn test_mm256_adds_epu8() {
4246 #[rustfmt::skip]
4247 let a = _mm256_setr_epi8(
4248 0, 1, 2, 3, 4, 5, 6, 7,
4249 8, 9, 10, 11, 12, 13, 14, 15,
4250 16, 17, 18, 19, 20, 21, 22, 23,
4251 24, 25, 26, 27, 28, 29, 30, 31,
4252 );
4253 #[rustfmt::skip]
4254 let b = _mm256_setr_epi8(
4255 32, 33, 34, 35, 36, 37, 38, 39,
4256 40, 41, 42, 43, 44, 45, 46, 47,
4257 48, 49, 50, 51, 52, 53, 54, 55,
4258 56, 57, 58, 59, 60, 61, 62, 63,
4259 );
4260 let r = _mm256_adds_epu8(a, b);
4261 #[rustfmt::skip]
4262 let e = _mm256_setr_epi8(
4263 32, 34, 36, 38, 40, 42, 44, 46,
4264 48, 50, 52, 54, 56, 58, 60, 62,
4265 64, 66, 68, 70, 72, 74, 76, 78,
4266 80, 82, 84, 86, 88, 90, 92, 94,
4267 );
4268 assert_eq_m256i(r, e);
4269 }
4270
4271 #[simd_test(enable = "avx2")]
4272 unsafe fn test_mm256_adds_epu8_saturate() {
4273 let a = _mm256_set1_epi8(!0);
4274 let b = _mm256_set1_epi8(1);
4275 let r = _mm256_adds_epu8(a, b);
4276 assert_eq_m256i(r, a);
4277 }
4278
4279 #[simd_test(enable = "avx2")]
4280 unsafe fn test_mm256_adds_epu16() {
4281 #[rustfmt::skip]
4282 let a = _mm256_setr_epi16(
4283 0, 1, 2, 3, 4, 5, 6, 7,
4284 8, 9, 10, 11, 12, 13, 14, 15,
4285 );
4286 #[rustfmt::skip]
4287 let b = _mm256_setr_epi16(
4288 32, 33, 34, 35, 36, 37, 38, 39,
4289 40, 41, 42, 43, 44, 45, 46, 47,
4290 );
4291 let r = _mm256_adds_epu16(a, b);
4292 #[rustfmt::skip]
4293 let e = _mm256_setr_epi16(
4294 32, 34, 36, 38, 40, 42, 44, 46,
4295 48, 50, 52, 54, 56, 58, 60, 62,
4296 );
4297
4298 assert_eq_m256i(r, e);
4299 }
4300
4301 #[simd_test(enable = "avx2")]
4302 unsafe fn test_mm256_adds_epu16_saturate() {
4303 let a = _mm256_set1_epi16(!0);
4304 let b = _mm256_set1_epi16(1);
4305 let r = _mm256_adds_epu16(a, b);
4306 assert_eq_m256i(r, a);
4307 }
4308
4309 #[simd_test(enable = "avx2")]
4310 unsafe fn test_mm256_and_si256() {
4311 let a = _mm256_set1_epi8(5);
4312 let b = _mm256_set1_epi8(3);
4313 let got = _mm256_and_si256(a, b);
4314 assert_eq_m256i(got, _mm256_set1_epi8(1));
4315 }
4316
4317 #[simd_test(enable = "avx2")]
4318 unsafe fn test_mm256_andnot_si256() {
4319 let a = _mm256_set1_epi8(5);
4320 let b = _mm256_set1_epi8(3);
4321 let got = _mm256_andnot_si256(a, b);
4322 assert_eq_m256i(got, _mm256_set1_epi8(2));
4323 }
4324
4325 #[simd_test(enable = "avx2")]
4326 unsafe fn test_mm256_avg_epu8() {
4327 let (a, b) = (_mm256_set1_epi8(3), _mm256_set1_epi8(9));
4328 let r = _mm256_avg_epu8(a, b);
4329 assert_eq_m256i(r, _mm256_set1_epi8(6));
4330 }
4331
4332 #[simd_test(enable = "avx2")]
4333 unsafe fn test_mm256_avg_epu16() {
4334 let (a, b) = (_mm256_set1_epi16(3), _mm256_set1_epi16(9));
4335 let r = _mm256_avg_epu16(a, b);
4336 assert_eq_m256i(r, _mm256_set1_epi16(6));
4337 }
4338
4339 #[simd_test(enable = "avx2")]
4340 unsafe fn test_mm_blend_epi32() {
4341 let (a, b) = (_mm_set1_epi32(3), _mm_set1_epi32(9));
4342 let e = _mm_setr_epi32(9, 3, 3, 3);
4343 let r = _mm_blend_epi32(a, b, 0x01 as i32);
4344 assert_eq_m128i(r, e);
4345
4346 let r = _mm_blend_epi32(b, a, 0x0E as i32);
4347 assert_eq_m128i(r, e);
4348 }
4349
4350 #[simd_test(enable = "avx2")]
4351 unsafe fn test_mm256_blend_epi32() {
4352 let (a, b) = (_mm256_set1_epi32(3), _mm256_set1_epi32(9));
4353 let e = _mm256_setr_epi32(9, 3, 3, 3, 3, 3, 3, 3);
4354 let r = _mm256_blend_epi32(a, b, 0x01 as i32);
4355 assert_eq_m256i(r, e);
4356
4357 let e = _mm256_setr_epi32(3, 9, 3, 3, 3, 3, 3, 9);
4358 let r = _mm256_blend_epi32(a, b, 0x82 as i32);
4359 assert_eq_m256i(r, e);
4360
4361 let e = _mm256_setr_epi32(3, 3, 9, 9, 9, 9, 9, 3);
4362 let r = _mm256_blend_epi32(a, b, 0x7C as i32);
4363 assert_eq_m256i(r, e);
4364 }
4365
4366 #[simd_test(enable = "avx2")]
4367 unsafe fn test_mm256_blend_epi16() {
4368 let (a, b) = (_mm256_set1_epi16(3), _mm256_set1_epi16(9));
4369 let e = _mm256_setr_epi16(9, 3, 3, 3, 3, 3, 3, 3, 9, 3, 3, 3, 3, 3, 3, 3);
4370 let r = _mm256_blend_epi16(a, b, 0x01 as i32);
4371 assert_eq_m256i(r, e);
4372
4373 let r = _mm256_blend_epi16(b, a, 0xFE as i32);
4374 assert_eq_m256i(r, e);
4375 }
4376
4377 #[simd_test(enable = "avx2")]
4378 unsafe fn test_mm256_blendv_epi8() {
4379 let (a, b) = (_mm256_set1_epi8(4), _mm256_set1_epi8(2));
4380 let mask = _mm256_insert_epi8(_mm256_set1_epi8(0), -1, 2);
4381 let e = _mm256_insert_epi8(_mm256_set1_epi8(4), 2, 2);
4382 let r = _mm256_blendv_epi8(a, b, mask);
4383 assert_eq_m256i(r, e);
4384 }
4385
4386 #[simd_test(enable = "avx2")]
4387 unsafe fn test_mm_broadcastb_epi8() {
4388 let a = _mm_insert_epi8(_mm_set1_epi8(0x00), 0x2a, 0);
4389 let res = _mm_broadcastb_epi8(a);
4390 assert_eq_m128i(res, _mm_set1_epi8(0x2a));
4391 }
4392
4393 #[simd_test(enable = "avx2")]
4394 unsafe fn test_mm256_broadcastb_epi8() {
4395 let a = _mm_insert_epi8(_mm_set1_epi8(0x00), 0x2a, 0);
4396 let res = _mm256_broadcastb_epi8(a);
4397 assert_eq_m256i(res, _mm256_set1_epi8(0x2a));
4398 }
4399
4400 #[simd_test(enable = "avx2")]
4401 unsafe fn test_mm_broadcastd_epi32() {
4402 let a = _mm_setr_epi32(0x2a, 0x8000000, 0, 0);
4403 let res = _mm_broadcastd_epi32(a);
4404 assert_eq_m128i(res, _mm_set1_epi32(0x2a));
4405 }
4406
4407 #[simd_test(enable = "avx2")]
4408 unsafe fn test_mm256_broadcastd_epi32() {
4409 let a = _mm_setr_epi32(0x2a, 0x8000000, 0, 0);
4410 let res = _mm256_broadcastd_epi32(a);
4411 assert_eq_m256i(res, _mm256_set1_epi32(0x2a));
4412 }
4413
4414 #[simd_test(enable = "avx2")]
4415 unsafe fn test_mm_broadcastq_epi64() {
4416 let a = _mm_setr_epi64x(0x1ffffffff, 0);
4417 let res = _mm_broadcastq_epi64(a);
4418 assert_eq_m128i(res, _mm_set1_epi64x(0x1ffffffff));
4419 }
4420
4421 #[simd_test(enable = "avx2")]
4422 unsafe fn test_mm256_broadcastq_epi64() {
4423 let a = _mm_setr_epi64x(0x1ffffffff, 0);
4424 let res = _mm256_broadcastq_epi64(a);
4425 assert_eq_m256i(res, _mm256_set1_epi64x(0x1ffffffff));
4426 }
4427
4428 #[simd_test(enable = "avx2")]
4429 unsafe fn test_mm_broadcastsd_pd() {
4430 let a = _mm_setr_pd(6.28, 3.14);
4431 let res = _mm_broadcastsd_pd(a);
4432 assert_eq_m128d(res, _mm_set1_pd(6.28f64));
4433 }
4434
4435 #[simd_test(enable = "avx2")]
4436 unsafe fn test_mm256_broadcastsd_pd() {
4437 let a = _mm_setr_pd(6.28, 3.14);
4438 let res = _mm256_broadcastsd_pd(a);
4439 assert_eq_m256d(res, _mm256_set1_pd(6.28f64));
4440 }
4441
4442 #[simd_test(enable = "avx2")]
4443 unsafe fn test_mm256_broadcastsi128_si256() {
4444 let a = _mm_setr_epi64x(0x0987654321012334, 0x5678909876543210);
4445 let res = _mm256_broadcastsi128_si256(a);
4446 let retval = _mm256_setr_epi64x(
4447 0x0987654321012334,
4448 0x5678909876543210,
4449 0x0987654321012334,
4450 0x5678909876543210,
4451 );
4452 assert_eq_m256i(res, retval);
4453 }
4454
4455 #[simd_test(enable = "avx2")]
4456 unsafe fn test_mm_broadcastss_ps() {
4457 let a = _mm_setr_ps(6.28, 3.14, 0.0, 0.0);
4458 let res = _mm_broadcastss_ps(a);
4459 assert_eq_m128(res, _mm_set1_ps(6.28f32));
4460 }
4461
4462 #[simd_test(enable = "avx2")]
4463 unsafe fn test_mm256_broadcastss_ps() {
4464 let a = _mm_setr_ps(6.28, 3.14, 0.0, 0.0);
4465 let res = _mm256_broadcastss_ps(a);
4466 assert_eq_m256(res, _mm256_set1_ps(6.28f32));
4467 }
4468
4469 #[simd_test(enable = "avx2")]
4470 unsafe fn test_mm_broadcastw_epi16() {
4471 let a = _mm_insert_epi16(_mm_set1_epi16(0x2a), 0x22b, 0);
4472 let res = _mm_broadcastw_epi16(a);
4473 assert_eq_m128i(res, _mm_set1_epi16(0x22b));
4474 }
4475
4476 #[simd_test(enable = "avx2")]
4477 unsafe fn test_mm256_broadcastw_epi16() {
4478 let a = _mm_insert_epi16(_mm_set1_epi16(0x2a), 0x22b, 0);
4479 let res = _mm256_broadcastw_epi16(a);
4480 assert_eq_m256i(res, _mm256_set1_epi16(0x22b));
4481 }
4482
4483 #[simd_test(enable = "avx2")]
4484 unsafe fn test_mm256_cmpeq_epi8() {
4485 #[rustfmt::skip]
4486 let a = _mm256_setr_epi8(
4487 0, 1, 2, 3, 4, 5, 6, 7,
4488 8, 9, 10, 11, 12, 13, 14, 15,
4489 16, 17, 18, 19, 20, 21, 22, 23,
4490 24, 25, 26, 27, 28, 29, 30, 31,
4491 );
4492 #[rustfmt::skip]
4493 let b = _mm256_setr_epi8(
4494 31, 30, 2, 28, 27, 26, 25, 24,
4495 23, 22, 21, 20, 19, 18, 17, 16,
4496 15, 14, 13, 12, 11, 10, 9, 8,
4497 7, 6, 5, 4, 3, 2, 1, 0,
4498 );
4499 let r = _mm256_cmpeq_epi8(a, b);
4500 assert_eq_m256i(r, _mm256_insert_epi8(_mm256_set1_epi8(0), !0, 2));
4501 }
4502
4503 #[simd_test(enable = "avx2")]
4504 unsafe fn test_mm256_cmpeq_epi16() {
4505 #[rustfmt::skip]
4506 let a = _mm256_setr_epi16(
4507 0, 1, 2, 3, 4, 5, 6, 7,
4508 8, 9, 10, 11, 12, 13, 14, 15,
4509 );
4510 #[rustfmt::skip]
4511 let b = _mm256_setr_epi16(
4512 15, 14, 2, 12, 11, 10, 9, 8,
4513 7, 6, 5, 4, 3, 2, 1, 0,
4514 );
4515 let r = _mm256_cmpeq_epi16(a, b);
4516 assert_eq_m256i(r, _mm256_insert_epi16(_mm256_set1_epi16(0), !0, 2));
4517 }
4518
4519 #[simd_test(enable = "avx2")]
4520 unsafe fn test_mm256_cmpeq_epi32() {
4521 let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
4522 let b = _mm256_setr_epi32(7, 6, 2, 4, 3, 2, 1, 0);
4523 let r = _mm256_cmpeq_epi32(a, b);
4524 let e = _mm256_set1_epi32(0);
4525 let e = _mm256_insert_epi32(e, !0, 2);
4526 assert_eq_m256i(r, e);
4527 }
4528
4529 #[simd_test(enable = "avx2")]
4530 unsafe fn test_mm256_cmpeq_epi64() {
4531 let a = _mm256_setr_epi64x(0, 1, 2, 3);
4532 let b = _mm256_setr_epi64x(3, 2, 2, 0);
4533 let r = _mm256_cmpeq_epi64(a, b);
4534 assert_eq_m256i(r, _mm256_insert_epi64(_mm256_set1_epi64x(0), !0, 2));
4535 }
4536
4537 #[simd_test(enable = "avx2")]
4538 unsafe fn test_mm256_cmpgt_epi8() {
4539 let a = _mm256_insert_epi8(_mm256_set1_epi8(0), 5, 0);
4540 let b = _mm256_set1_epi8(0);
4541 let r = _mm256_cmpgt_epi8(a, b);
4542 assert_eq_m256i(r, _mm256_insert_epi8(_mm256_set1_epi8(0), !0, 0));
4543 }
4544
4545 #[simd_test(enable = "avx2")]
4546 unsafe fn test_mm256_cmpgt_epi16() {
4547 let a = _mm256_insert_epi16(_mm256_set1_epi16(0), 5, 0);
4548 let b = _mm256_set1_epi16(0);
4549 let r = _mm256_cmpgt_epi16(a, b);
4550 assert_eq_m256i(r, _mm256_insert_epi16(_mm256_set1_epi16(0), !0, 0));
4551 }
4552
4553 #[simd_test(enable = "avx2")]
4554 unsafe fn test_mm256_cmpgt_epi32() {
4555 let a = _mm256_insert_epi32(_mm256_set1_epi32(0), 5, 0);
4556 let b = _mm256_set1_epi32(0);
4557 let r = _mm256_cmpgt_epi32(a, b);
4558 assert_eq_m256i(r, _mm256_insert_epi32(_mm256_set1_epi32(0), !0, 0));
4559 }
4560
4561 #[simd_test(enable = "avx2")]
4562 unsafe fn test_mm256_cmpgt_epi64() {
4563 let a = _mm256_insert_epi64(_mm256_set1_epi64x(0), 5, 0);
4564 let b = _mm256_set1_epi64x(0);
4565 let r = _mm256_cmpgt_epi64(a, b);
4566 assert_eq_m256i(r, _mm256_insert_epi64(_mm256_set1_epi64x(0), !0, 0));
4567 }
4568
4569 #[simd_test(enable = "avx2")]
4570 unsafe fn test_mm256_cvtepi8_epi16() {
4571 #[rustfmt::skip]
4572 let a = _mm_setr_epi8(
4573 0, 0, -1, 1, -2, 2, -3, 3,
4574 -4, 4, -5, 5, -6, 6, -7, 7,
4575 );
4576 #[rustfmt::skip]
4577 let r = _mm256_setr_epi16(
4578 0, 0, -1, 1, -2, 2, -3, 3,
4579 -4, 4, -5, 5, -6, 6, -7, 7,
4580 );
4581 assert_eq_m256i(r, _mm256_cvtepi8_epi16(a));
4582 }
4583
4584 #[simd_test(enable = "avx2")]
4585 unsafe fn test_mm256_cvtepi8_epi32() {
4586 #[rustfmt::skip]
4587 let a = _mm_setr_epi8(
4588 0, 0, -1, 1, -2, 2, -3, 3,
4589 -4, 4, -5, 5, -6, 6, -7, 7,
4590 );
4591 let r = _mm256_setr_epi32(0, 0, -1, 1, -2, 2, -3, 3);
4592 assert_eq_m256i(r, _mm256_cvtepi8_epi32(a));
4593 }
4594
4595 #[simd_test(enable = "avx2")]
4596 unsafe fn test_mm256_cvtepi8_epi64() {
4597 #[rustfmt::skip]
4598 let a = _mm_setr_epi8(
4599 0, 0, -1, 1, -2, 2, -3, 3,
4600 -4, 4, -5, 5, -6, 6, -7, 7,
4601 );
4602 let r = _mm256_setr_epi64x(0, 0, -1, 1);
4603 assert_eq_m256i(r, _mm256_cvtepi8_epi64(a));
4604 }
4605
4606 #[simd_test(enable = "avx2")]
4607 unsafe fn test_mm256_cvtepi16_epi32() {
4608 let a = _mm_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3);
4609 let r = _mm256_setr_epi32(0, 0, -1, 1, -2, 2, -3, 3);
4610 assert_eq_m256i(r, _mm256_cvtepi16_epi32(a));
4611 }
4612
4613 #[simd_test(enable = "avx2")]
4614 unsafe fn test_mm256_cvtepi16_epi64() {
4615 let a = _mm_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3);
4616 let r = _mm256_setr_epi64x(0, 0, -1, 1);
4617 assert_eq_m256i(r, _mm256_cvtepi16_epi64(a));
4618 }
4619
4620 #[simd_test(enable = "avx2")]
4621 unsafe fn test_mm256_cvtepi32_epi64() {
4622 let a = _mm_setr_epi32(0, 0, -1, 1);
4623 let r = _mm256_setr_epi64x(0, 0, -1, 1);
4624 assert_eq_m256i(r, _mm256_cvtepi32_epi64(a));
4625 }
4626
4627 #[simd_test(enable = "avx2")]
4628 unsafe fn test_mm256_cvtepu16_epi32() {
4629 let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4630 let r = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
4631 assert_eq_m256i(r, _mm256_cvtepu16_epi32(a));
4632 }
4633
4634 #[simd_test(enable = "avx2")]
4635 unsafe fn test_mm256_cvtepu16_epi64() {
4636 let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4637 let r = _mm256_setr_epi64x(0, 1, 2, 3);
4638 assert_eq_m256i(r, _mm256_cvtepu16_epi64(a));
4639 }
4640
4641 #[simd_test(enable = "avx2")]
4642 unsafe fn test_mm256_cvtepu32_epi64() {
4643 let a = _mm_setr_epi32(0, 1, 2, 3);
4644 let r = _mm256_setr_epi64x(0, 1, 2, 3);
4645 assert_eq_m256i(r, _mm256_cvtepu32_epi64(a));
4646 }
4647
4648 #[simd_test(enable = "avx2")]
4649 unsafe fn test_mm256_cvtepu8_epi16() {
4650 #[rustfmt::skip]
4651 let a = _mm_setr_epi8(
4652 0, 1, 2, 3, 4, 5, 6, 7,
4653 8, 9, 10, 11, 12, 13, 14, 15,
4654 );
4655 #[rustfmt::skip]
4656 let r = _mm256_setr_epi16(
4657 0, 1, 2, 3, 4, 5, 6, 7,
4658 8, 9, 10, 11, 12, 13, 14, 15,
4659 );
4660 assert_eq_m256i(r, _mm256_cvtepu8_epi16(a));
4661 }
4662
4663 #[simd_test(enable = "avx2")]
4664 unsafe fn test_mm256_cvtepu8_epi32() {
4665 #[rustfmt::skip]
4666 let a = _mm_setr_epi8(
4667 0, 1, 2, 3, 4, 5, 6, 7,
4668 8, 9, 10, 11, 12, 13, 14, 15,
4669 );
4670 let r = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
4671 assert_eq_m256i(r, _mm256_cvtepu8_epi32(a));
4672 }
4673
4674 #[simd_test(enable = "avx2")]
4675 unsafe fn test_mm256_cvtepu8_epi64() {
4676 #[rustfmt::skip]
4677 let a = _mm_setr_epi8(
4678 0, 1, 2, 3, 4, 5, 6, 7,
4679 8, 9, 10, 11, 12, 13, 14, 15,
4680 );
4681 let r = _mm256_setr_epi64x(0, 1, 2, 3);
4682 assert_eq_m256i(r, _mm256_cvtepu8_epi64(a));
4683 }
4684
4685 #[simd_test(enable = "avx2")]
4686 unsafe fn test_mm256_extracti128_si256() {
4687 let a = _mm256_setr_epi64x(1, 2, 3, 4);
4688 let r = _mm256_extracti128_si256(a, 0b01);
4689 let e = _mm_setr_epi64x(3, 4);
4690 assert_eq_m128i(r, e);
4691 }
4692
4693 #[simd_test(enable = "avx2")]
4694 unsafe fn test_mm256_hadd_epi16() {
4695 let a = _mm256_set1_epi16(2);
4696 let b = _mm256_set1_epi16(4);
4697 let r = _mm256_hadd_epi16(a, b);
4698 let e = _mm256_setr_epi16(4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8);
4699 assert_eq_m256i(r, e);
4700 }
4701
4702 #[simd_test(enable = "avx2")]
4703 unsafe fn test_mm256_hadd_epi32() {
4704 let a = _mm256_set1_epi32(2);
4705 let b = _mm256_set1_epi32(4);
4706 let r = _mm256_hadd_epi32(a, b);
4707 let e = _mm256_setr_epi32(4, 4, 8, 8, 4, 4, 8, 8);
4708 assert_eq_m256i(r, e);
4709 }
4710
4711 #[simd_test(enable = "avx2")]
4712 unsafe fn test_mm256_hadds_epi16() {
4713 let a = _mm256_set1_epi16(2);
4714 let a = _mm256_insert_epi16(a, 0x7fff, 0);
4715 let a = _mm256_insert_epi16(a, 1, 1);
4716 let b = _mm256_set1_epi16(4);
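// Horizontal add with signed saturation: 0x7fff + 1 saturates to 0x7FFF
// in the first result lane.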
4717 let r = _mm256_hadds_epi16(a, b);
4718 #[rustfmt::skip]
4719 let e = _mm256_setr_epi16(
4720 0x7FFF, 4, 4, 4, 8, 8, 8, 8,
4721 4, 4, 4, 4, 8, 8, 8, 8,
4722 );
4723 assert_eq_m256i(r, e);
4724 }
4725
4726 #[simd_test(enable = "avx2")]
4727 unsafe fn test_mm256_hsub_epi16() {
4728 let a = _mm256_set1_epi16(2);
4729 let b = _mm256_set1_epi16(4);
4730 let r = _mm256_hsub_epi16(a, b);
4731 let e = _mm256_set1_epi16(0);
4732 assert_eq_m256i(r, e);
4733 }
4734
4735 #[simd_test(enable = "avx2")]
4736 unsafe fn test_mm256_hsub_epi32() {
4737 let a = _mm256_set1_epi32(2);
4738 let b = _mm256_set1_epi32(4);
4739 let r = _mm256_hsub_epi32(a, b);
4740 let e = _mm256_set1_epi32(0);
4741 assert_eq_m256i(r, e);
4742 }
4743
4744 #[simd_test(enable = "avx2")]
4745 unsafe fn test_mm256_hsubs_epi16() {
4746 let a = _mm256_set1_epi16(2);
4747 let a = _mm256_insert_epi16(a, 0x7fff, 0);
4748 let a = _mm256_insert_epi16(a, -1, 1);
4749 let b = _mm256_set1_epi16(4);
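// Horizontal subtract with signed saturation: 0x7fff - (-1) saturates to 0x7FFF.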
4750 let r = _mm256_hsubs_epi16(a, b);
4751 let e = _mm256_insert_epi16(_mm256_set1_epi16(0), 0x7FFF, 0);
4752 assert_eq_m256i(r, e);
4753 }
4754
4755 #[simd_test(enable = "avx2")]
4756 unsafe fn test_mm256_madd_epi16() {
4757 let a = _mm256_set1_epi16(2);
4758 let b = _mm256_set1_epi16(4);
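// Adjacent signed 16-bit products are summed into each 32-bit lane: 2*4 + 2*4 = 16.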
4759 let r = _mm256_madd_epi16(a, b);
4760 let e = _mm256_set1_epi32(16);
4761 assert_eq_m256i(r, e);
4762 }
4763
4764 #[simd_test(enable = "avx2")]
4765 unsafe fn test_mm256_inserti128_si256() {
4766 let a = _mm256_setr_epi64x(1, 2, 3, 4);
4767 let b = _mm_setr_epi64x(7, 8);
4768 let r = _mm256_inserti128_si256(a, b, 0b01);
4769 let e = _mm256_setr_epi64x(1, 2, 7, 8);
4770 assert_eq_m256i(r, e);
4771 }
4772
4773 #[simd_test(enable = "avx2")]
4774 unsafe fn test_mm256_maddubs_epi16() {
4775 let a = _mm256_set1_epi8(2);
4776 let b = _mm256_set1_epi8(4);
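// Unsigned bytes of `a` times signed bytes of `b`, adjacent products summed: 2*4 + 2*4 = 16.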
4777 let r = _mm256_maddubs_epi16(a, b);
4778 let e = _mm256_set1_epi16(16);
4779 assert_eq_m256i(r, e);
4780 }
4781
4782 #[simd_test(enable = "avx2")]
4783 unsafe fn test_mm_maskload_epi32() {
4784 let nums = [1, 2, 3, 4];
4785 let a = &nums as *const i32;
4786 let mask = _mm_setr_epi32(-1, 0, 0, -1);
4787 let r = _mm_maskload_epi32(a, mask);
4788 let e = _mm_setr_epi32(1, 0, 0, 4);
4789 assert_eq_m128i(r, e);
4790 }
4791
4792 #[simd_test(enable = "avx2")]
4793 unsafe fn test_mm256_maskload_epi32() {
4794 let nums = [1, 2, 3, 4, 5, 6, 7, 8];
4795 let a = &nums as *const i32;
4796 let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0);
4797 let r = _mm256_maskload_epi32(a, mask);
4798 let e = _mm256_setr_epi32(1, 0, 0, 4, 0, 6, 7, 0);
4799 assert_eq_m256i(r, e);
4800 }
4801
4802 #[simd_test(enable = "avx2")]
4803 unsafe fn test_mm_maskload_epi64() {
4804 let nums = [1_i64, 2_i64];
4805 let a = &nums as *const i64;
4806 let mask = _mm_setr_epi64x(0, -1);
4807 let r = _mm_maskload_epi64(a, mask);
4808 let e = _mm_setr_epi64x(0, 2);
4809 assert_eq_m128i(r, e);
4810 }
4811
4812 #[simd_test(enable = "avx2")]
4813 unsafe fn test_mm256_maskload_epi64() {
4814 let nums = [1_i64, 2_i64, 3_i64, 4_i64];
4815 let a = &nums as *const i64;
4816 let mask = _mm256_setr_epi64x(0, -1, -1, 0);
4817 let r = _mm256_maskload_epi64(a, mask);
4818 let e = _mm256_setr_epi64x(0, 2, 3, 0);
4819 assert_eq_m256i(r, e);
4820 }
4821
4822 #[simd_test(enable = "avx2")]
4823 unsafe fn test_mm_maskstore_epi32() {
4824 let a = _mm_setr_epi32(1, 2, 3, 4);
4825 let mut arr = [-1, -1, -1, -1];
4826 let mask = _mm_setr_epi32(-1, 0, 0, -1);
4827 _mm_maskstore_epi32(arr.as_mut_ptr(), mask, a);
4828 let e = [1, -1, -1, 4];
4829 assert_eq!(arr, e);
4830 }
4831
4832 #[simd_test(enable = "avx2")]
4833 unsafe fn test_mm256_maskstore_epi32() {
4834 let a = _mm256_setr_epi32(1, 0x6d726f, 3, 42, 0x777161, 6, 7, 8);
4835 let mut arr = [-1, -1, -1, 0x776173, -1, 0x68657265, -1, -1];
4836 let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0);
4837 _mm256_maskstore_epi32(arr.as_mut_ptr(), mask, a);
4838 let e = [1, -1, -1, 42, -1, 6, 7, -1];
4839 assert_eq!(arr, e);
4840 }
4841
4842 #[simd_test(enable = "avx2")]
4843 unsafe fn test_mm_maskstore_epi64() {
4844 let a = _mm_setr_epi64x(1_i64, 2_i64);
4845 let mut arr = [-1_i64, -1_i64];
4846 let mask = _mm_setr_epi64x(0, -1);
4847 _mm_maskstore_epi64(arr.as_mut_ptr(), mask, a);
4848 let e = [-1, 2];
4849 assert_eq!(arr, e);
4850 }
4851
4852 #[simd_test(enable = "avx2")]
4853 unsafe fn test_mm256_maskstore_epi64() {
4854 let a = _mm256_setr_epi64x(1_i64, 2_i64, 3_i64, 4_i64);
4855 let mut arr = [-1_i64, -1_i64, -1_i64, -1_i64];
4856 let mask = _mm256_setr_epi64x(0, -1, -1, 0);
4857 _mm256_maskstore_epi64(arr.as_mut_ptr(), mask, a);
4858 let e = [-1, 2, 3, -1];
4859 assert_eq!(arr, e);
4860 }
4861
4862 #[simd_test(enable = "avx2")]
4863 unsafe fn test_mm256_max_epi16() {
4864 let a = _mm256_set1_epi16(2);
4865 let b = _mm256_set1_epi16(4);
4866 let r = _mm256_max_epi16(a, b);
4867 assert_eq_m256i(r, b);
4868 }
4869
4870 #[simd_test(enable = "avx2")]
4871 unsafe fn test_mm256_max_epi32() {
4872 let a = _mm256_set1_epi32(2);
4873 let b = _mm256_set1_epi32(4);
4874 let r = _mm256_max_epi32(a, b);
4875 assert_eq_m256i(r, b);
4876 }
4877
4878 #[simd_test(enable = "avx2")]
4879 unsafe fn test_mm256_max_epi8() {
4880 let a = _mm256_set1_epi8(2);
4881 let b = _mm256_set1_epi8(4);
4882 let r = _mm256_max_epi8(a, b);
4883 assert_eq_m256i(r, b);
4884 }
4885
4886 #[simd_test(enable = "avx2")]
4887 unsafe fn test_mm256_max_epu16() {
4888 let a = _mm256_set1_epi16(2);
4889 let b = _mm256_set1_epi16(4);
4890 let r = _mm256_max_epu16(a, b);
4891 assert_eq_m256i(r, b);
4892 }
4893
4894 #[simd_test(enable = "avx2")]
4895 unsafe fn test_mm256_max_epu32() {
4896 let a = _mm256_set1_epi32(2);
4897 let b = _mm256_set1_epi32(4);
4898 let r = _mm256_max_epu32(a, b);
4899 assert_eq_m256i(r, b);
4900 }
4901
4902 #[simd_test(enable = "avx2")]
4903 unsafe fn test_mm256_max_epu8() {
4904 let a = _mm256_set1_epi8(2);
4905 let b = _mm256_set1_epi8(4);
4906 let r = _mm256_max_epu8(a, b);
4907 assert_eq_m256i(r, b);
4908 }
4909
4910 #[simd_test(enable = "avx2")]
4911 unsafe fn test_mm256_min_epi16() {
4912 let a = _mm256_set1_epi16(2);
4913 let b = _mm256_set1_epi16(4);
4914 let r = _mm256_min_epi16(a, b);
4915 assert_eq_m256i(r, a);
4916 }
4917
4918 #[simd_test(enable = "avx2")]
4919 unsafe fn test_mm256_min_epi32() {
4920 let a = _mm256_set1_epi32(2);
4921 let b = _mm256_set1_epi32(4);
4922 let r = _mm256_min_epi32(a, b);
4923 assert_eq_m256i(r, a);
4924 }
4925
4926 #[simd_test(enable = "avx2")]
4927 unsafe fn test_mm256_min_epi8() {
4928 let a = _mm256_set1_epi8(2);
4929 let b = _mm256_set1_epi8(4);
4930 let r = _mm256_min_epi8(a, b);
4931 assert_eq_m256i(r, a);
4932 }
4933
4934 #[simd_test(enable = "avx2")]
4935 unsafe fn test_mm256_min_epu16() {
4936 let a = _mm256_set1_epi16(2);
4937 let b = _mm256_set1_epi16(4);
4938 let r = _mm256_min_epu16(a, b);
4939 assert_eq_m256i(r, a);
4940 }
4941
4942 #[simd_test(enable = "avx2")]
4943 unsafe fn test_mm256_min_epu32() {
4944 let a = _mm256_set1_epi32(2);
4945 let b = _mm256_set1_epi32(4);
4946 let r = _mm256_min_epu32(a, b);
4947 assert_eq_m256i(r, a);
4948 }
4949
4950 #[simd_test(enable = "avx2")]
4951 unsafe fn test_mm256_min_epu8() {
4952 let a = _mm256_set1_epi8(2);
4953 let b = _mm256_set1_epi8(4);
4954 let r = _mm256_min_epu8(a, b);
4955 assert_eq_m256i(r, a);
4956 }
4957
4958 #[simd_test(enable = "avx2")]
4959 unsafe fn test_mm256_movemask_epi8() {
4960 let a = _mm256_set1_epi8(-1);
4961 let r = _mm256_movemask_epi8(a);
4962 let e = -1;
4963 assert_eq!(r, e);
4964 }
4965
4966 #[simd_test(enable = "avx2")]
4967 unsafe fn test_mm256_mpsadbw_epu8() {
4968 let a = _mm256_set1_epi8(2);
4969 let b = _mm256_set1_epi8(4);
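// Every absolute byte difference is |2 - 4| = 2, so each 4-byte sum is 8.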
4970 let r = _mm256_mpsadbw_epu8(a, b, 0);
4971 let e = _mm256_set1_epi16(8);
4972 assert_eq_m256i(r, e);
4973 }
4974
4975 #[simd_test(enable = "avx2")]
4976 unsafe fn test_mm256_mul_epi32() {
4977 let a = _mm256_setr_epi32(0, 0, 0, 0, 2, 2, 2, 2);
4978 let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4979 let r = _mm256_mul_epi32(a, b);
4980 let e = _mm256_setr_epi64x(0, 0, 10, 14);
4981 assert_eq_m256i(r, e);
4982 }
4983
4984 #[simd_test(enable = "avx2")]
4985 unsafe fn test_mm256_mul_epu32() {
4986 let a = _mm256_setr_epi32(0, 0, 0, 0, 2, 2, 2, 2);
4987 let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4988 let r = _mm256_mul_epu32(a, b);
4989 let e = _mm256_setr_epi64x(0, 0, 10, 14);
4990 assert_eq_m256i(r, e);
4991 }
4992
4993 #[simd_test(enable = "avx2")]
4994 unsafe fn test_mm256_mulhi_epi16() {
4995 let a = _mm256_set1_epi16(6535);
4996 let b = _mm256_set1_epi16(6535);
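// 6535 * 6535 = 42_706_225; its high 16 bits are 651.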
4997 let r = _mm256_mulhi_epi16(a, b);
4998 let e = _mm256_set1_epi16(651);
4999 assert_eq_m256i(r, e);
5000 }
5001
5002 #[simd_test(enable = "avx2")]
5003 unsafe fn test_mm256_mulhi_epu16() {
5004 let a = _mm256_set1_epi16(6535);
5005 let b = _mm256_set1_epi16(6535);
5006 let r = _mm256_mulhi_epu16(a, b);
5007 let e = _mm256_set1_epi16(651);
5008 assert_eq_m256i(r, e);
5009 }
5010
5011 #[simd_test(enable = "avx2")]
5012 unsafe fn test_mm256_mullo_epi16() {
5013 let a = _mm256_set1_epi16(2);
5014 let b = _mm256_set1_epi16(4);
5015 let r = _mm256_mullo_epi16(a, b);
5016 let e = _mm256_set1_epi16(8);
5017 assert_eq_m256i(r, e);
5018 }
5019
5020 #[simd_test(enable = "avx2")]
5021 unsafe fn test_mm256_mullo_epi32() {
5022 let a = _mm256_set1_epi32(2);
5023 let b = _mm256_set1_epi32(4);
5024 let r = _mm256_mullo_epi32(a, b);
5025 let e = _mm256_set1_epi32(8);
5026 assert_eq_m256i(r, e);
5027 }
5028
5029 #[simd_test(enable = "avx2")]
5030 unsafe fn test_mm256_mulhrs_epi16() {
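// _mm256_mulhrs_epi16 computes ((a * b) >> 14) + 1, then shifts right by 1:
// a rounded Q15 fixed-point multiply, so 0.5 * 0.5 yields 0.25 (0x2000).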
5031 let a = _mm256_set1_epi16(0x4000);
5032 let b = _mm256_set1_epi16(0x4000);
5033 let r = _mm256_mulhrs_epi16(a, b);
5034 let e = _mm256_set1_epi16(0x2000);
5035 assert_eq_m256i(r, e);
5036 }
5037
5038 #[simd_test(enable = "avx2")]
5039 unsafe fn test_mm256_or_si256() {
5040 let a = _mm256_set1_epi8(-1);
5041 let b = _mm256_set1_epi8(0);
5042 let r = _mm256_or_si256(a, b);
5043 assert_eq_m256i(r, a);
5044 }
5045
5046 #[simd_test(enable = "avx2")]
5047 unsafe fn test_mm256_packs_epi16() {
5048 let a = _mm256_set1_epi16(2);
5049 let b = _mm256_set1_epi16(4);
5050 let r = _mm256_packs_epi16(a, b);
5051 #[rustfmt::skip]
5052 let e = _mm256_setr_epi8(
5053 2, 2, 2, 2, 2, 2, 2, 2,
5054 4, 4, 4, 4, 4, 4, 4, 4,
5055 2, 2, 2, 2, 2, 2, 2, 2,
5056 4, 4, 4, 4, 4, 4, 4, 4,
5057 );
5058
5059 assert_eq_m256i(r, e);
5060 }
5061
5062 #[simd_test(enable = "avx2")]
5063 unsafe fn test_mm256_packs_epi32() {
5064 let a = _mm256_set1_epi32(2);
5065 let b = _mm256_set1_epi32(4);
5066 let r = _mm256_packs_epi32(a, b);
5067 let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
5068
5069 assert_eq_m256i(r, e);
5070 }
5071
5072 #[simd_test(enable = "avx2")]
5073 unsafe fn test_mm256_packus_epi16() {
5074 let a = _mm256_set1_epi16(2);
5075 let b = _mm256_set1_epi16(4);
5076 let r = _mm256_packus_epi16(a, b);
5077 #[rustfmt::skip]
5078 let e = _mm256_setr_epi8(
5079 2, 2, 2, 2, 2, 2, 2, 2,
5080 4, 4, 4, 4, 4, 4, 4, 4,
5081 2, 2, 2, 2, 2, 2, 2, 2,
5082 4, 4, 4, 4, 4, 4, 4, 4,
5083 );
5084
5085 assert_eq_m256i(r, e);
5086 }
5087
5088 #[simd_test(enable = "avx2")]
5089 unsafe fn test_mm256_packus_epi32() {
5090 let a = _mm256_set1_epi32(2);
5091 let b = _mm256_set1_epi32(4);
5092 let r = _mm256_packus_epi32(a, b);
5093 let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
5094
5095 assert_eq_m256i(r, e);
5096 }
5097
5098 #[simd_test(enable = "avx2")]
5099 unsafe fn test_mm256_sad_epu8() {
5100 let a = _mm256_set1_epi8(2);
5101 let b = _mm256_set1_epi8(4);
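// |2 - 4| = 2 summed over each group of 8 bytes gives 16 per 64-bit result.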
5102 let r = _mm256_sad_epu8(a, b);
5103 let e = _mm256_set1_epi64x(16);
5104 assert_eq_m256i(r, e);
5105 }
5106
5107 #[simd_test(enable = "avx2")]
5108 unsafe fn test_mm256_shufflehi_epi16() {
5109 #[rustfmt::skip]
5110 let a = _mm256_setr_epi16(
5111 0, 1, 2, 3, 11, 22, 33, 44,
5112 4, 5, 6, 7, 55, 66, 77, 88,
5113 );
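// The 2-bit fields of 0b00_01_01_11 pick words (3, 1, 1, 0) from the upper four
// 16-bit words of each 128-bit lane; _mm256_shufflelo_epi16 below does the same
// for the lower four words.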
5114 #[rustfmt::skip]
5115 let e = _mm256_setr_epi16(
5116 0, 1, 2, 3, 44, 22, 22, 11,
5117 4, 5, 6, 7, 88, 66, 66, 55,
5118 );
5119 let r = _mm256_shufflehi_epi16(a, 0b00_01_01_11);
5120 assert_eq_m256i(r, e);
5121 }
5122
5123 #[simd_test(enable = "avx2")]
5124 unsafe fn test_mm256_shufflelo_epi16() {
5125 #[rustfmt::skip]
5126 let a = _mm256_setr_epi16(
5127 11, 22, 33, 44, 0, 1, 2, 3,
5128 55, 66, 77, 88, 4, 5, 6, 7,
5129 );
5130 #[rustfmt::skip]
5131 let e = _mm256_setr_epi16(
5132 44, 22, 22, 11, 0, 1, 2, 3,
5133 88, 66, 66, 55, 4, 5, 6, 7,
5134 );
5135 let r = _mm256_shufflelo_epi16(a, 0b00_01_01_11);
5136 assert_eq_m256i(r, e);
5137 }
5138
5139 #[simd_test(enable = "avx2")]
5140 unsafe fn test_mm256_sign_epi16() {
5141 let a = _mm256_set1_epi16(2);
5142 let b = _mm256_set1_epi16(-1);
5143 let r = _mm256_sign_epi16(a, b);
5144 let e = _mm256_set1_epi16(-2);
5145 assert_eq_m256i(r, e);
5146 }
5147
5148 #[simd_test(enable = "avx2")]
5149 unsafe fn test_mm256_sign_epi32() {
5150 let a = _mm256_set1_epi32(2);
5151 let b = _mm256_set1_epi32(-1);
5152 let r = _mm256_sign_epi32(a, b);
5153 let e = _mm256_set1_epi32(-2);
5154 assert_eq_m256i(r, e);
5155 }
5156
5157 #[simd_test(enable = "avx2")]
5158 unsafe fn test_mm256_sign_epi8() {
5159 let a = _mm256_set1_epi8(2);
5160 let b = _mm256_set1_epi8(-1);
5161 let r = _mm256_sign_epi8(a, b);
5162 let e = _mm256_set1_epi8(-2);
5163 assert_eq_m256i(r, e);
5164 }
5165
5166 #[simd_test(enable = "avx2")]
5167 unsafe fn test_mm256_sll_epi16() {
5168 let a = _mm256_set1_epi16(0xFF);
5169 let b = _mm_insert_epi16(_mm_set1_epi16(0), 4, 0);
5170 let r = _mm256_sll_epi16(a, b);
5171 assert_eq_m256i(r, _mm256_set1_epi16(0xFF0));
5172 }
5173
5174 #[simd_test(enable = "avx2")]
5175 unsafe fn test_mm256_sll_epi32() {
5176 let a = _mm256_set1_epi32(0xFFFF);
5177 let b = _mm_insert_epi32(_mm_set1_epi32(0), 4, 0);
5178 let r = _mm256_sll_epi32(a, b);
5179 assert_eq_m256i(r, _mm256_set1_epi32(0xFFFF0));
5180 }
5181
5182 #[simd_test(enable = "avx2")]
5183 unsafe fn test_mm256_sll_epi64() {
5184 let a = _mm256_set1_epi64x(0xFFFFFFFF);
5185 let b = _mm_insert_epi64(_mm_set1_epi64x(0), 4, 0);
5186 let r = _mm256_sll_epi64(a, b);
5187 assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFFF0));
5188 }
5189
5190 #[simd_test(enable = "avx2")]
5191 unsafe fn test_mm256_slli_epi16() {
5192 assert_eq_m256i(
5193 _mm256_slli_epi16(_mm256_set1_epi16(0xFF), 4),
5194 _mm256_set1_epi16(0xFF0),
5195 );
5196 }
5197
5198 #[simd_test(enable = "avx2")]
5199 unsafe fn test_mm256_slli_epi32() {
5200 assert_eq_m256i(
5201 _mm256_slli_epi32(_mm256_set1_epi32(0xFFFF), 4),
5202 _mm256_set1_epi32(0xFFFF0),
5203 );
5204 }
5205
5206 #[simd_test(enable = "avx2")]
5207 unsafe fn test_mm256_slli_epi64() {
5208 assert_eq_m256i(
5209 _mm256_slli_epi64(_mm256_set1_epi64x(0xFFFFFFFF), 4),
5210 _mm256_set1_epi64x(0xFFFFFFFF0),
5211 );
5212 }
5213
5214 #[simd_test(enable = "avx2")]
5215 unsafe fn test_mm256_slli_si256() {
5216 let a = _mm256_set1_epi64x(0xFFFFFFFF);
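// The byte shift is applied to each 128-bit lane independently.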
5217 let r = _mm256_slli_si256(a, 3);
5218 assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFFF000000));
5219 }
5220
5221 #[simd_test(enable = "avx2")]
5222 unsafe fn test_mm_sllv_epi32() {
5223 let a = _mm_set1_epi32(2);
5224 let b = _mm_set1_epi32(1);
5225 let r = _mm_sllv_epi32(a, b);
5226 let e = _mm_set1_epi32(4);
5227 assert_eq_m128i(r, e);
5228 }
5229
5230 #[simd_test(enable = "avx2")]
5231 unsafe fn test_mm256_sllv_epi32() {
5232 let a = _mm256_set1_epi32(2);
5233 let b = _mm256_set1_epi32(1);
5234 let r = _mm256_sllv_epi32(a, b);
5235 let e = _mm256_set1_epi32(4);
5236 assert_eq_m256i(r, e);
5237 }
5238
5239 #[simd_test(enable = "avx2")]
5240 unsafe fn test_mm_sllv_epi64() {
5241 let a = _mm_set1_epi64x(2);
5242 let b = _mm_set1_epi64x(1);
5243 let r = _mm_sllv_epi64(a, b);
5244 let e = _mm_set1_epi64x(4);
5245 assert_eq_m128i(r, e);
5246 }
5247
5248 #[simd_test(enable = "avx2")]
5249 unsafe fn test_mm256_sllv_epi64() {
5250 let a = _mm256_set1_epi64x(2);
5251 let b = _mm256_set1_epi64x(1);
5252 let r = _mm256_sllv_epi64(a, b);
5253 let e = _mm256_set1_epi64x(4);
5254 assert_eq_m256i(r, e);
5255 }
5256
5257 #[simd_test(enable = "avx2")]
5258 unsafe fn test_mm256_sra_epi16() {
5259 let a = _mm256_set1_epi16(-1);
5260 let b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
5261 let r = _mm256_sra_epi16(a, b);
5262 assert_eq_m256i(r, _mm256_set1_epi16(-1));
5263 }
5264
5265 #[simd_test(enable = "avx2")]
5266 unsafe fn test_mm256_sra_epi32() {
5267 let a = _mm256_set1_epi32(-1);
5268 let b = _mm_insert_epi32(_mm_set1_epi32(0), 1, 0);
5269 let r = _mm256_sra_epi32(a, b);
5270 assert_eq_m256i(r, _mm256_set1_epi32(-1));
5271 }
5272
5273 #[simd_test(enable = "avx2")]
5274 unsafe fn test_mm256_srai_epi16() {
5275 assert_eq_m256i(
5276 _mm256_srai_epi16(_mm256_set1_epi16(-1), 1),
5277 _mm256_set1_epi16(-1),
5278 );
5279 }
5280
5281 #[simd_test(enable = "avx2")]
5282 unsafe fn test_mm256_srai_epi32() {
5283 assert_eq_m256i(
5284 _mm256_srai_epi32(_mm256_set1_epi32(-1), 1),
5285 _mm256_set1_epi32(-1),
5286 );
5287 }
5288
5289 #[simd_test(enable = "avx2")]
5290 unsafe fn test_mm_srav_epi32() {
5291 let a = _mm_set1_epi32(4);
5292 let count = _mm_set1_epi32(1);
5293 let r = _mm_srav_epi32(a, count);
5294 let e = _mm_set1_epi32(2);
5295 assert_eq_m128i(r, e);
5296 }
5297
5298 #[simd_test(enable = "avx2")]
5299 unsafe fn test_mm256_srav_epi32() {
5300 let a = _mm256_set1_epi32(4);
5301 let count = _mm256_set1_epi32(1);
5302 let r = _mm256_srav_epi32(a, count);
5303 let e = _mm256_set1_epi32(2);
5304 assert_eq_m256i(r, e);
5305 }
5306
5307 #[simd_test(enable = "avx2")]
5308 unsafe fn test_mm256_srli_si256() {
5309 #[rustfmt::skip]
5310 let a = _mm256_setr_epi8(
5311 1, 2, 3, 4, 5, 6, 7, 8,
5312 9, 10, 11, 12, 13, 14, 15, 16,
5313 17, 18, 19, 20, 21, 22, 23, 24,
5314 25, 26, 27, 28, 29, 30, 31, 32,
5315 );
5316 let r = _mm256_srli_si256(a, 3);
5317 #[rustfmt::skip]
5318 let e = _mm256_setr_epi8(
5319 4, 5, 6, 7, 8, 9, 10, 11,
5320 12, 13, 14, 15, 16, 0, 0, 0,
5321 20, 21, 22, 23, 24, 25, 26, 27,
5322 28, 29, 30, 31, 32, 0, 0, 0,
5323 );
5324 assert_eq_m256i(r, e);
5325 }
5326
5327 #[simd_test(enable = "avx2")]
5328 unsafe fn test_mm256_srl_epi16() {
5329 let a = _mm256_set1_epi16(0xFF);
5330 let b = _mm_insert_epi16(_mm_set1_epi16(0), 4, 0);
5331 let r = _mm256_srl_epi16(a, b);
5332 assert_eq_m256i(r, _mm256_set1_epi16(0xF));
5333 }
5334
5335 #[simd_test(enable = "avx2")]
5336 unsafe fn test_mm256_srl_epi32() {
5337 let a = _mm256_set1_epi32(0xFFFF);
5338 let b = _mm_insert_epi32(_mm_set1_epi32(0), 4, 0);
5339 let r = _mm256_srl_epi32(a, b);
5340 assert_eq_m256i(r, _mm256_set1_epi32(0xFFF));
5341 }
5342
5343 #[simd_test(enable = "avx2")]
5344 unsafe fn test_mm256_srl_epi64() {
5345 let a = _mm256_set1_epi64x(0xFFFFFFFF);
5346 let b = _mm_setr_epi64x(4, 0);
5347 let r = _mm256_srl_epi64(a, b);
5348 assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFF));
5349 }
5350
5351 #[simd_test(enable = "avx2")]
5352 unsafe fn test_mm256_srli_epi16() {
5353 assert_eq_m256i(
5354 _mm256_srli_epi16(_mm256_set1_epi16(0xFF), 4),
5355 _mm256_set1_epi16(0xF),
5356 );
5357 }
5358
5359 #[simd_test(enable = "avx2")]
5360 unsafe fn test_mm256_srli_epi32() {
5361 assert_eq_m256i(
5362 _mm256_srli_epi32(_mm256_set1_epi32(0xFFFF), 4),
5363 _mm256_set1_epi32(0xFFF),
5364 );
5365 }
5366
5367 #[simd_test(enable = "avx2")]
5368 unsafe fn test_mm256_srli_epi64() {
5369 assert_eq_m256i(
5370 _mm256_srli_epi64(_mm256_set1_epi64x(0xFFFFFFFF), 4),
5371 _mm256_set1_epi64x(0xFFFFFFF),
5372 );
5373 }
5374
5375 #[simd_test(enable = "avx2")]
5376 unsafe fn test_mm_srlv_epi32() {
5377 let a = _mm_set1_epi32(2);
5378 let count = _mm_set1_epi32(1);
5379 let r = _mm_srlv_epi32(a, count);
5380 let e = _mm_set1_epi32(1);
5381 assert_eq_m128i(r, e);
5382 }
5383
5384 #[simd_test(enable = "avx2")]
5385 unsafe fn test_mm256_srlv_epi32() {
5386 let a = _mm256_set1_epi32(2);
5387 let count = _mm256_set1_epi32(1);
5388 let r = _mm256_srlv_epi32(a, count);
5389 let e = _mm256_set1_epi32(1);
5390 assert_eq_m256i(r, e);
5391 }
5392
5393 #[simd_test(enable = "avx2")]
5394 unsafe fn test_mm_srlv_epi64() {
5395 let a = _mm_set1_epi64x(2);
5396 let count = _mm_set1_epi64x(1);
5397 let r = _mm_srlv_epi64(a, count);
5398 let e = _mm_set1_epi64x(1);
5399 assert_eq_m128i(r, e);
5400 }
5401
5402 #[simd_test(enable = "avx2")]
5403 unsafe fn test_mm256_srlv_epi64() {
5404 let a = _mm256_set1_epi64x(2);
5405 let count = _mm256_set1_epi64x(1);
5406 let r = _mm256_srlv_epi64(a, count);
5407 let e = _mm256_set1_epi64x(1);
5408 assert_eq_m256i(r, e);
5409 }
5410
5411 #[simd_test(enable = "avx2")]
5412 unsafe fn test_mm256_sub_epi16() {
5413 let a = _mm256_set1_epi16(4);
5414 let b = _mm256_set1_epi16(2);
5415 let r = _mm256_sub_epi16(a, b);
5416 assert_eq_m256i(r, b);
5417 }
5418
5419 #[simd_test(enable = "avx2")]
5420 unsafe fn test_mm256_sub_epi32() {
5421 let a = _mm256_set1_epi32(4);
5422 let b = _mm256_set1_epi32(2);
5423 let r = _mm256_sub_epi32(a, b);
5424 assert_eq_m256i(r, b);
5425 }
5426
5427 #[simd_test(enable = "avx2")]
5428 unsafe fn test_mm256_sub_epi64() {
5429 let a = _mm256_set1_epi64x(4);
5430 let b = _mm256_set1_epi64x(2);
5431 let r = _mm256_sub_epi64(a, b);
5432 assert_eq_m256i(r, b);
5433 }
5434
5435 #[simd_test(enable = "avx2")]
5436 unsafe fn test_mm256_sub_epi8() {
5437 let a = _mm256_set1_epi8(4);
5438 let b = _mm256_set1_epi8(2);
5439 let r = _mm256_sub_epi8(a, b);
5440 assert_eq_m256i(r, b);
5441 }
5442
5443 #[simd_test(enable = "avx2")]
5444 unsafe fn test_mm256_subs_epi16() {
5445 let a = _mm256_set1_epi16(4);
5446 let b = _mm256_set1_epi16(2);
5447 let r = _mm256_subs_epi16(a, b);
5448 assert_eq_m256i(r, b);
5449 }
5450
5451 #[simd_test(enable = "avx2")]
5452 unsafe fn test_mm256_subs_epi8() {
5453 let a = _mm256_set1_epi8(4);
5454 let b = _mm256_set1_epi8(2);
5455 let r = _mm256_subs_epi8(a, b);
5456 assert_eq_m256i(r, b);
5457 }
5458
5459 #[simd_test(enable = "avx2")]
5460 unsafe fn test_mm256_subs_epu16() {
5461 let a = _mm256_set1_epi16(4);
5462 let b = _mm256_set1_epi16(2);
5463 let r = _mm256_subs_epu16(a, b);
5464 assert_eq_m256i(r, b);
5465 }
5466
5467 #[simd_test(enable = "avx2")]
5468 unsafe fn test_mm256_subs_epu8() {
5469 let a = _mm256_set1_epi8(4);
5470 let b = _mm256_set1_epi8(2);
5471 let r = _mm256_subs_epu8(a, b);
5472 assert_eq_m256i(r, b);
5473 }
5474
5475 #[simd_test(enable = "avx2")]
5476 unsafe fn test_mm256_xor_si256() {
5477 let a = _mm256_set1_epi8(5);
5478 let b = _mm256_set1_epi8(3);
5479 let r = _mm256_xor_si256(a, b);
5480 assert_eq_m256i(r, _mm256_set1_epi8(6));
5481 }
5482
5483 #[simd_test(enable = "avx2")]
5484 unsafe fn test_mm256_alignr_epi8() {
5485 #[rustfmt::skip]
5486 let a = _mm256_setr_epi8(
5487 1, 2, 3, 4, 5, 6, 7, 8,
5488 9, 10, 11, 12, 13, 14, 15, 16,
5489 17, 18, 19, 20, 21, 22, 23, 24,
5490 25, 26, 27, 28, 29, 30, 31, 32,
5491 );
5492 #[rustfmt::skip]
5493 let b = _mm256_setr_epi8(
5494 -1, -2, -3, -4, -5, -6, -7, -8,
5495 -9, -10, -11, -12, -13, -14, -15, -16,
5496 -17, -18, -19, -20, -21, -22, -23, -24,
5497 -25, -26, -27, -28, -29, -30, -31, -32,
5498 );
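// The alignment operates on each pair of 128-bit lanes independently,
// and a shift of 32 or more produces all zeros.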
5499 let r = _mm256_alignr_epi8(a, b, 33);
5500 assert_eq_m256i(r, _mm256_set1_epi8(0));
5501
5502 let r = _mm256_alignr_epi8(a, b, 17);
5503 #[rustfmt::skip]
5504 let expected = _mm256_setr_epi8(
5505 2, 3, 4, 5, 6, 7, 8, 9,
5506 10, 11, 12, 13, 14, 15, 16, 0,
5507 18, 19, 20, 21, 22, 23, 24, 25,
5508 26, 27, 28, 29, 30, 31, 32, 0,
5509 );
5510 assert_eq_m256i(r, expected);
5511
5512 let r = _mm256_alignr_epi8(a, b, 4);
5513 #[rustfmt::skip]
5514 let expected = _mm256_setr_epi8(
5515 -5, -6, -7, -8, -9, -10, -11, -12,
5516 -13, -14, -15, -16, 1, 2, 3, 4,
5517 -21, -22, -23, -24, -25, -26, -27, -28,
5518 -29, -30, -31, -32, 17, 18, 19, 20,
5519 );
5520 assert_eq_m256i(r, expected);
5521
5522 #[rustfmt::skip]
5523 let expected = _mm256_setr_epi8(
5524 -1, -2, -3, -4, -5, -6, -7, -8,
5525 -9, -10, -11, -12, -13, -14, -15, -16,
5526 -17, -18, -19, -20, -21, -22, -23, -24,
5527 -25, -26, -27, -28, -29, -30, -31, -32,
5528 );
5529 let r = _mm256_alignr_epi8(a, b, 16);
5530 assert_eq_m256i(r, expected);
5531
5532 let r = _mm256_alignr_epi8(a, b, 15);
5533 #[rustfmt::skip]
5534 let expected = _mm256_setr_epi8(
5535 -16, 1, 2, 3, 4, 5, 6, 7,
5536 8, 9, 10, 11, 12, 13, 14, 15,
5537 -32, 17, 18, 19, 20, 21, 22, 23,
5538 24, 25, 26, 27, 28, 29, 30, 31,
5539 );
5540 assert_eq_m256i(r, expected);
5541
5542 let r = _mm256_alignr_epi8(a, b, 0);
5543 assert_eq_m256i(r, b);
5544 }
5545
5546 #[simd_test(enable = "avx2")]
5547 unsafe fn test_mm256_shuffle_epi8() {
5548 #[rustfmt::skip]
5549 let a = _mm256_setr_epi8(
5550 1, 2, 3, 4, 5, 6, 7, 8,
5551 9, 10, 11, 12, 13, 14, 15, 16,
5552 17, 18, 19, 20, 21, 22, 23, 24,
5553 25, 26, 27, 28, 29, 30, 31, 32,
5554 );
5555 #[rustfmt::skip]
5556 let b = _mm256_setr_epi8(
5557 4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
5558 12, 5, 5, 10, 4, 1, 8, 0,
5559 4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
5560 12, 5, 5, 10, 4, 1, 8, 0,
5561 );
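// Control bytes with the high bit set (128) zero the destination byte;
// the low 4 bits index within the same 128-bit lane.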
5562 #[rustfmt::skip]
5563 let expected = _mm256_setr_epi8(
5564 5, 0, 5, 4, 9, 13, 7, 4,
5565 13, 6, 6, 11, 5, 2, 9, 1,
5566 21, 0, 21, 20, 25, 29, 23, 20,
5567 29, 22, 22, 27, 21, 18, 25, 17,
5568 );
5569 let r = _mm256_shuffle_epi8(a, b);
5570 assert_eq_m256i(r, expected);
5571 }
5572
5573 #[simd_test(enable = "avx2")]
5574 unsafe fn test_mm256_permutevar8x32_epi32() {
5575 let a = _mm256_setr_epi32(100, 200, 300, 400, 500, 600, 700, 800);
5576 let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
5577 let expected = _mm256_setr_epi32(600, 100, 600, 200, 800, 700, 400, 500);
5578 let r = _mm256_permutevar8x32_epi32(a, b);
5579 assert_eq_m256i(r, expected);
5580 }
5581
5582 #[simd_test(enable = "avx2")]
5583 unsafe fn test_mm256_permute4x64_epi64() {
5584 let a = _mm256_setr_epi64x(100, 200, 300, 400);
5585 let expected = _mm256_setr_epi64x(400, 100, 200, 100);
5586 let r = _mm256_permute4x64_epi64(a, 0b00010011);
5587 assert_eq_m256i(r, expected);
5588 }
5589
5590 #[simd_test(enable = "avx2")]
5591 unsafe fn test_mm256_permute2x128_si256() {
5592 let a = _mm256_setr_epi64x(100, 200, 500, 600);
5593 let b = _mm256_setr_epi64x(300, 400, 700, 800);
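// imm 0b00_01_00_11: bits [1:0] = 3 select the high half of `b` for the low
// 128-bit lane, bits [5:4] = 1 select the high half of `a` for the high lane.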
5594 let r = _mm256_permute2x128_si256(a, b, 0b00_01_00_11);
5595 let e = _mm256_setr_epi64x(700, 800, 500, 600);
5596 assert_eq_m256i(r, e);
5597 }
5598
5599 #[simd_test(enable = "avx2")]
5600 unsafe fn test_mm256_permute4x64_pd() {
5601 let a = _mm256_setr_pd(1., 2., 3., 4.);
5602 let r = _mm256_permute4x64_pd(a, 0b00_01_00_11);
5603 let e = _mm256_setr_pd(4., 1., 2., 1.);
5604 assert_eq_m256d(r, e);
5605 }
5606
5607 #[simd_test(enable = "avx2")]
5608 unsafe fn test_mm256_permutevar8x32_ps() {
5609 let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
5610 let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
5611 let r = _mm256_permutevar8x32_ps(a, b);
5612 let e = _mm256_setr_ps(6., 1., 6., 2., 8., 7., 4., 5.);
5613 assert_eq_m256(r, e);
5614 }
5615
5616 #[simd_test(enable = "avx2")]
5617 unsafe fn test_mm_i32gather_epi32() {
5618 let mut arr = [0i32; 128];
5619 for i in 0..128i32 {
5620 arr[i as usize] = i;
5621 }
5622 // A multiplier of 4 is word-addressing
5623 let r = _mm_i32gather_epi32(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48), 4);
5624 assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48));
5625 }
5626
5627 #[simd_test(enable = "avx2")]
5628 unsafe fn test_mm_mask_i32gather_epi32() {
5629 let mut arr = [0i32; 128];
5630 for i in 0..128i32 {
5631 arr[i as usize] = i;
5632 }
5633 // A multiplier of 4 is word-addressing
5634 let r = _mm_mask_i32gather_epi32(
5635 _mm_set1_epi32(256),
5636 arr.as_ptr(),
5637 _mm_setr_epi32(0, 16, 64, 96),
5638 _mm_setr_epi32(-1, -1, -1, 0),
5639 4,
5640 );
5641 assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256));
5642 }
5643
5644 #[simd_test(enable = "avx2")]
5645 unsafe fn test_mm256_i32gather_epi32() {
5646 let mut arr = [0i32; 128];
5647 for i in 0..128i32 {
5648 arr[i as usize] = i;
5649 }
5650 // A multiplier of 4 is word-addressing
5651 let r = _mm256_i32gather_epi32(
5652 arr.as_ptr(),
5653 _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4),
5654 4,
5655 );
5656 assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
5657 }
5658
5659 #[simd_test(enable = "avx2")]
5660 unsafe fn test_mm256_mask_i32gather_epi32() {
5661 let mut arr = [0i32; 128];
5662 for i in 0..128i32 {
5663 arr[i as usize] = i;
5664 }
5665 // A multiplier of 4 is word-addressing
5666 let r = _mm256_mask_i32gather_epi32(
5667 _mm256_set1_epi32(256),
5668 arr.as_ptr(),
5669 _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0),
5670 _mm256_setr_epi32(-1, -1, -1, 0, 0, 0, 0, 0),
5671 4,
5672 );
5673 assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 64, 256, 256, 256, 256, 256));
5674 }
5675
5676 #[simd_test(enable = "avx2")]
5677 unsafe fn test_mm_i32gather_ps() {
5678 let mut arr = [0.0f32; 128];
5679 let mut j = 0.0;
5680 for i in 0..128usize {
5681 arr[i] = j;
5682 j += 1.0;
5683 }
5684 // A multiplier of 4 is word-addressing for f32s
5685 let r = _mm_i32gather_ps(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48), 4);
5686 assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0));
5687 }
5688
5689 #[simd_test(enable = "avx2")]
5690 unsafe fn test_mm_mask_i32gather_ps() {
5691 let mut arr = [0.0f32; 128];
5692 let mut j = 0.0;
5693 for i in 0..128usize {
5694 arr[i] = j;
5695 j += 1.0;
5696 }
5697 // A multiplier of 4 is word-addressing for f32s
5698 let r = _mm_mask_i32gather_ps(
5699 _mm_set1_ps(256.0),
5700 arr.as_ptr(),
5701 _mm_setr_epi32(0, 16, 64, 96),
5702 _mm_setr_ps(-1.0, -1.0, -1.0, 0.0),
5703 4,
5704 );
5705 assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0));
5706 }
5707
5708 #[simd_test(enable = "avx2")]
5709 unsafe fn test_mm256_i32gather_ps() {
5710 let mut arr = [0.0f32; 128];
5711 let mut j = 0.0;
5712 for i in 0..128usize {
5713 arr[i] = j;
5714 j += 1.0;
5715 }
5716 // A multiplier of 4 is word-addressing for f32s
5717 let r = _mm256_i32gather_ps(
5718 arr.as_ptr(),
5719 _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4),
5720 4,
5721 );
5722 assert_eq_m256(r, _mm256_setr_ps(0.0, 16.0, 32.0, 48.0, 1.0, 2.0, 3.0, 4.0));
5723 }
5724
5725 #[simd_test(enable = "avx2")]
5726 unsafe fn test_mm256_mask_i32gather_ps() {
5727 let mut arr = [0.0f32; 128];
5728 let mut j = 0.0;
5729 for i in 0..128usize {
5730 arr[i] = j;
5731 j += 1.0;
5732 }
5733 // A multiplier of 4 is word-addressing for f32s
5734 let r = _mm256_mask_i32gather_ps(
5735 _mm256_set1_ps(256.0),
5736 arr.as_ptr(),
5737 _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0),
5738 _mm256_setr_ps(-1.0, -1.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0),
5739 4,
5740 );
5741 assert_eq_m256(
5742 r,
5743 _mm256_setr_ps(0.0, 16.0, 64.0, 256.0, 256.0, 256.0, 256.0, 256.0),
5744 );
5745 }
5746
5747 #[simd_test(enable = "avx2")]
5748 unsafe fn test_mm_i32gather_epi64() {
5749 let mut arr = [0i64; 128];
5750 for i in 0..128i64 {
5751 arr[i as usize] = i;
5752 }
5753 // A multiplier of 8 is word-addressing for i64s
5754 let r = _mm_i32gather_epi64(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0), 8);
5755 assert_eq_m128i(r, _mm_setr_epi64x(0, 16));
5756 }
5757
5758 #[simd_test(enable = "avx2")]
5759 unsafe fn test_mm_mask_i32gather_epi64() {
5760 let mut arr = [0i64; 128];
5761 for i in 0..128i64 {
5762 arr[i as usize] = i;
5763 }
5764 // A multiplier of 8 is word-addressing for i64s
5765 let r = _mm_mask_i32gather_epi64(
5766 _mm_set1_epi64x(256),
5767 arr.as_ptr(),
5768 _mm_setr_epi32(16, 16, 16, 16),
5769 _mm_setr_epi64x(-1, 0),
5770 8,
5771 );
5772 assert_eq_m128i(r, _mm_setr_epi64x(16, 256));
5773 }
5774
5775 #[simd_test(enable = "avx2")]
5776 unsafe fn test_mm256_i32gather_epi64() {
5777 let mut arr = [0i64; 128];
5778 for i in 0..128i64 {
5779 arr[i as usize] = i;
5780 }
5781 // A multiplier of 8 is word-addressing for i64s
5782 let r = _mm256_i32gather_epi64(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48), 8);
5783 assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48));
5784 }
5785
5786 #[simd_test(enable = "avx2")]
5787 unsafe fn test_mm256_mask_i32gather_epi64() {
5788 let mut arr = [0i64; 128];
5789 for i in 0..128i64 {
5790 arr[i as usize] = i;
5791 }
5792 // A multiplier of 8 is word-addressing for i64s
5793 let r = _mm256_mask_i32gather_epi64(
5794 _mm256_set1_epi64x(256),
5795 arr.as_ptr(),
5796 _mm_setr_epi32(0, 16, 64, 96),
5797 _mm256_setr_epi64x(-1, -1, -1, 0),
5798 8,
5799 );
5800 assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256));
5801 }
5802
5803 #[simd_test(enable = "avx2")]
5804 unsafe fn test_mm_i32gather_pd() {
5805 let mut arr = [0.0f64; 128];
5806 let mut j = 0.0;
5807 for i in 0..128usize {
5808 arr[i] = j;
5809 j += 1.0;
5810 }
5811 // A multiplier of 8 is word-addressing for f64s
5812 let r = _mm_i32gather_pd(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0), 8);
5813 assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0));
5814 }
5815
5816 #[simd_test(enable = "avx2")]
5817 unsafe fn test_mm_mask_i32gather_pd() {
5818 let mut arr = [0.0f64; 128];
5819 let mut j = 0.0;
5820 for i in 0..128usize {
5821 arr[i] = j;
5822 j += 1.0;
5823 }
5824 // A multiplier of 8 is word-addressing for f64s
5825 let r = _mm_mask_i32gather_pd(
5826 _mm_set1_pd(256.0),
5827 arr.as_ptr(),
5828 _mm_setr_epi32(16, 16, 16, 16),
5829 _mm_setr_pd(-1.0, 0.0),
5830 8,
5831 );
5832 assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0));
5833 }
5834
5835 #[simd_test(enable = "avx2")]
5836 unsafe fn test_mm256_i32gather_pd() {
5837 let mut arr = [0.0f64; 128];
5838 let mut j = 0.0;
5839 for i in 0..128usize {
5840 arr[i] = j;
5841 j += 1.0;
5842 }
5843 // A multiplier of 8 is word-addressing for f64s
5844 let r = _mm256_i32gather_pd(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48), 8);
5845 assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0));
5846 }
5847
5848 #[simd_test(enable = "avx2")]
5849 unsafe fn test_mm256_mask_i32gather_pd() {
5850 let mut arr = [0.0f64; 128];
5851 let mut j = 0.0;
5852 for i in 0..128usize {
5853 arr[i] = j;
5854 j += 1.0;
5855 }
5856 // A multiplier of 8 is word-addressing for f64s
5857 let r = _mm256_mask_i32gather_pd(
5858 _mm256_set1_pd(256.0),
5859 arr.as_ptr(),
5860 _mm_setr_epi32(0, 16, 64, 96),
5861 _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0),
5862 8,
5863 );
5864 assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0));
5865 }
5866
5867 #[simd_test(enable = "avx2")]
5868 unsafe fn test_mm_i64gather_epi32() {
5869 let mut arr = [0i32; 128];
5870 for i in 0..128i32 {
5871 arr[i as usize] = i;
5872 }
5873 // A multiplier of 4 is word-addressing
5874 let r = _mm_i64gather_epi32(arr.as_ptr(), _mm_setr_epi64x(0, 16), 4);
5875 assert_eq_m128i(r, _mm_setr_epi32(0, 16, 0, 0));
5876 }
5877
5878 #[simd_test(enable = "avx2")]
5879 unsafe fn test_mm_mask_i64gather_epi32() {
5880 let mut arr = [0i32; 128];
5881 for i in 0..128i32 {
5882 arr[i as usize] = i;
5883 }
5884 // A multiplier of 4 is word-addressing
5885 let r = _mm_mask_i64gather_epi32(
5886 _mm_set1_epi32(256),
5887 arr.as_ptr(),
5888 _mm_setr_epi64x(0, 16),
5889 _mm_setr_epi32(-1, 0, -1, 0),
5890 4,
5891 );
5892 assert_eq_m128i(r, _mm_setr_epi32(0, 256, 0, 0));
5893 }
5894
5895 #[simd_test(enable = "avx2")]
5896 unsafe fn test_mm256_i64gather_epi32() {
5897 let mut arr = [0i32; 128];
5898 for i in 0..128i32 {
5899 arr[i as usize] = i;
5900 }
5901 // A multiplier of 4 is word-addressing
5902 let r = _mm256_i64gather_epi32(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48), 4);
5903 assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48));
5904 }
5905
5906 #[simd_test(enable = "avx2")]
5907 unsafe fn test_mm256_mask_i64gather_epi32() {
5908 let mut arr = [0i32; 128];
5909 for i in 0..128i32 {
5910 arr[i as usize] = i;
5911 }
5912 // A multiplier of 4 is word-addressing
5913 let r = _mm256_mask_i64gather_epi32(
5914 _mm_set1_epi32(256),
5915 arr.as_ptr(),
5916 _mm256_setr_epi64x(0, 16, 64, 96),
5917 _mm_setr_epi32(-1, -1, -1, 0),
5918 4,
5919 );
5920 assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256));
5921 }
5922
5923 #[simd_test(enable = "avx2")]
5924 unsafe fn test_mm_i64gather_ps() {
5925 let mut arr = [0.0f32; 128];
5926 let mut j = 0.0;
5927 for i in 0..128usize {
5928 arr[i] = j;
5929 j += 1.0;
5930 }
5931 // A multiplier of 4 is word-addressing for f32s
5932 let r = _mm_i64gather_ps(arr.as_ptr(), _mm_setr_epi64x(0, 16), 4);
5933 assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 0.0, 0.0));
5934 }
5935
5936 #[simd_test(enable = "avx2")]
5937 unsafe fn test_mm_mask_i64gather_ps() {
5938 let mut arr = [0.0f32; 128];
5939 let mut j = 0.0;
5940 for i in 0..128usize {
5941 arr[i] = j;
5942 j += 1.0;
5943 }
5944 // A multiplier of 4 is word-addressing for f32s
5945 let r = _mm_mask_i64gather_ps(
5946 _mm_set1_ps(256.0),
5947 arr.as_ptr(),
5948 _mm_setr_epi64x(0, 16),
5949 _mm_setr_ps(-1.0, 0.0, -1.0, 0.0),
5950 4,
5951 );
5952 assert_eq_m128(r, _mm_setr_ps(0.0, 256.0, 0.0, 0.0));
5953 }
5954
5955 #[simd_test(enable = "avx2")]
5956 unsafe fn test_mm256_i64gather_ps() {
5957 let mut arr = [0.0f32; 128];
5958 let mut j = 0.0;
5959 for i in 0..128usize {
5960 arr[i] = j;
5961 j += 1.0;
5962 }
5963 // A multiplier of 4 is word-addressing for f32s
5964 let r = _mm256_i64gather_ps(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48), 4);
5965 assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0));
5966 }
5967
5968 #[simd_test(enable = "avx2")]
5969 unsafe fn test_mm256_mask_i64gather_ps() {
5970 let mut arr = [0.0f32; 128];
5971 let mut j = 0.0;
5972 for i in 0..128usize {
5973 arr[i] = j;
5974 j += 1.0;
5975 }
5976 // A multiplier of 4 is word-addressing for f32s
5977 let r = _mm256_mask_i64gather_ps(
5978 _mm_set1_ps(256.0),
5979 arr.as_ptr(),
5980 _mm256_setr_epi64x(0, 16, 64, 96),
5981 _mm_setr_ps(-1.0, -1.0, -1.0, 0.0),
5982 4,
5983 );
5984 assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0));
5985 }
5986
5987 #[simd_test(enable = "avx2")]
5988 unsafe fn test_mm_i64gather_epi64() {
5989 let mut arr = [0i64; 128];
5990 for i in 0..128i64 {
5991 arr[i as usize] = i;
5992 }
5993 // A multiplier of 8 is word-addressing for i64s
5994 let r = _mm_i64gather_epi64(arr.as_ptr(), _mm_setr_epi64x(0, 16), 8);
5995 assert_eq_m128i(r, _mm_setr_epi64x(0, 16));
5996 }
5997
5998 #[simd_test(enable = "avx2")]
5999 unsafe fn test_mm_mask_i64gather_epi64() {
6000 let mut arr = [0i64; 128];
6001 for i in 0..128i64 {
6002 arr[i as usize] = i;
6003 }
6004 // A multiplier of 8 is word-addressing for i64s
6005 let r = _mm_mask_i64gather_epi64(
6006 _mm_set1_epi64x(256),
6007 arr.as_ptr(),
6008 _mm_setr_epi64x(16, 16),
6009 _mm_setr_epi64x(-1, 0),
6010 8,
6011 );
6012 assert_eq_m128i(r, _mm_setr_epi64x(16, 256));
6013 }
6014
6015 #[simd_test(enable = "avx2")]
6016 unsafe fn test_mm256_i64gather_epi64() {
6017 let mut arr = [0i64; 128];
6018 for i in 0..128i64 {
6019 arr[i as usize] = i;
6020 }
6021 // A multiplier of 8 is word-addressing for i64s
6022 let r = _mm256_i64gather_epi64(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48), 8);
6023 assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48));
6024 }
6025
6026 #[simd_test(enable = "avx2")]
6027 unsafe fn test_mm256_mask_i64gather_epi64() {
6028 let mut arr = [0i64; 128];
6029 for i in 0..128i64 {
6030 arr[i as usize] = i;
6031 }
6032 // A multiplier of 8 is word-addressing for i64s
6033 let r = _mm256_mask_i64gather_epi64(
6034 _mm256_set1_epi64x(256),
6035 arr.as_ptr(),
6036 _mm256_setr_epi64x(0, 16, 64, 96),
6037 _mm256_setr_epi64x(-1, -1, -1, 0),
6038 8,
6039 );
6040 assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256));
6041 }
6042
6043 #[simd_test(enable = "avx2")]
6044 unsafe fn test_mm_i64gather_pd() {
6045 let mut arr = [0.0f64; 128];
6046 let mut j = 0.0;
6047 for i in 0..128usize {
6048 arr[i] = j;
6049 j += 1.0;
6050 }
6051 // A multiplier of 8 is word-addressing for f64s
6052 let r = _mm_i64gather_pd(arr.as_ptr(), _mm_setr_epi64x(0, 16), 8);
6053 assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0));
6054 }
6055
6056 #[simd_test(enable = "avx2")]
6057 unsafe fn test_mm_mask_i64gather_pd() {
6058 let mut arr = [0.0f64; 128];
6059 let mut j = 0.0;
6060 for i in 0..128usize {
6061 arr[i] = j;
6062 j += 1.0;
6063 }
6064 // A multiplier of 8 is word-addressing for f64s
6065 let r = _mm_mask_i64gather_pd(
6066 _mm_set1_pd(256.0),
6067 arr.as_ptr(),
6068 _mm_setr_epi64x(16, 16),
6069 _mm_setr_pd(-1.0, 0.0),
6070 8,
6071 );
6072 assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0));
6073 }
6074
6075 #[simd_test(enable = "avx2")]
6076 unsafe fn test_mm256_i64gather_pd() {
6077 let mut arr = [0.0f64; 128];
6078 let mut j = 0.0;
6079 for i in 0..128usize {
6080 arr[i] = j;
6081 j += 1.0;
6082 }
6083 // A multiplier of 8 is word-addressing for f64s
6084 let r = _mm256_i64gather_pd(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48), 8);
6085 assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0));
6086 }
6087
6088 #[simd_test(enable = "avx2")]
6089 unsafe fn test_mm256_mask_i64gather_pd() {
6090 let mut arr = [0.0f64; 128];
6091 let mut j = 0.0;
6092 for i in 0..128usize {
6093 arr[i] = j;
6094 j += 1.0;
6095 }
6096 // A multiplier of 8 is word-addressing for f64s
6097 let r = _mm256_mask_i64gather_pd(
6098 _mm256_set1_pd(256.0),
6099 arr.as_ptr(),
6100 _mm256_setr_epi64x(0, 16, 64, 96),
6101 _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0),
6102 8,
6103 );
6104 assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0));
6105 }
6106
6107 #[simd_test(enable = "avx2")]
6108 unsafe fn test_mm256_extract_epi8() {
6109 #[rustfmt::skip]
6110 let a = _mm256_setr_epi8(
6111 -1, 1, 2, 3, 4, 5, 6, 7,
6112 8, 9, 10, 11, 12, 13, 14, 15,
6113 16, 17, 18, 19, 20, 21, 22, 23,
6114 24, 25, 26, 27, 28, 29, 30, 31
6115 );
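// The index is masked to the lane count, so 35 & 31 == 3; the extract_epi16
// and extract_epi32 tests below rely on the same wrapping.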
6116 let r1 = _mm256_extract_epi8(a, 0);
6117 let r2 = _mm256_extract_epi8(a, 35);
6118 assert_eq!(r1, -1);
6119 assert_eq!(r2, 3);
6120 }
6121
6122 #[simd_test(enable = "avx2")]
6123 unsafe fn test_mm256_extract_epi16() {
6124 #[rustfmt::skip]
6125 let a = _mm256_setr_epi16(
6126 -1, 1, 2, 3, 4, 5, 6, 7,
6127 8, 9, 10, 11, 12, 13, 14, 15,
6128 );
6129 let r1 = _mm256_extract_epi16(a, 0);
6130 let r2 = _mm256_extract_epi16(a, 19);
6131 assert_eq!(r1, -1);
6132 assert_eq!(r2, 3);
6133 }
6134
6135 #[simd_test(enable = "avx2")]
6136 unsafe fn test_mm256_extract_epi32() {
6137 let a = _mm256_setr_epi32(-1, 1, 2, 3, 4, 5, 6, 7);
6138 let r1 = _mm256_extract_epi32(a, 0);
6139 let r2 = _mm256_extract_epi32(a, 11);
6140 assert_eq!(r1, -1);
6141 assert_eq!(r2, 3);
6142 }
6143
6144 #[simd_test(enable = "avx2")]
6145 unsafe fn test_mm256_cvtsd_f64() {
6146 let a = _mm256_setr_pd(1., 2., 3., 4.);
6147 let r = _mm256_cvtsd_f64(a);
6148 assert_eq!(r, 1.);
6149 }
6150
6151 #[simd_test(enable = "avx2")]
6152 unsafe fn test_mm256_cvtsi256_si32() {
6153 let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
6154 let r = _mm256_cvtsi256_si32(a);
6155 assert_eq!(r, 1);
6156 }
6157 }