library/stdarch/crates/core_arch/src/x86/avx2.rs (rustc.git, upstream version 1.48.0~beta.8+dfsg1)
1 //! Advanced Vector Extensions 2 (AVX2)
2 //!
3 //! AVX2 expands most AVX commands to 256-bit wide vector registers and
4 //! adds [FMA](https://en.wikipedia.org/wiki/Fused_multiply-accumulate).
5 //!
6 //! The references are:
7 //!
8 //! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
9 //! Instruction Set Reference, A-Z][intel64_ref].
10 //! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
11 //! System Instructions][amd64_ref].
12 //!
13 //! Wikipedia's [AVX][wiki_avx] and [FMA][wiki_fma] pages provide a quick
14 //! overview of the instructions available.
15 //!
16 //! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
17 //! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
18 //! [wiki_avx]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions
19 //! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate
20
21 use crate::{
22 core_arch::{simd::*, simd_llvm::*, x86::*},
23 mem::transmute,
24 };
25
26 #[cfg(test)]
27 use stdarch_test::assert_instr;
28
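// Illustrative sketch (not part of the upstream source): how callers typically reach the
// intrinsics in this module. The helper names `add_arrays` / `add_arrays_avx2` and the
// fixed element count of eight are made up for the example; the AVX2 path is selected at
// runtime with `is_x86_feature_detected!` and a scalar loop is the fallback.
//
//     #[cfg(target_arch = "x86_64")]
//     fn add_arrays(a: &[i32; 8], b: &[i32; 8]) -> [i32; 8] {
//         if is_x86_feature_detected!("avx2") {
//             // SAFETY: AVX2 support was just verified at runtime.
//             unsafe { add_arrays_avx2(a, b) }
//         } else {
//             let mut out = [0i32; 8];
//             for i in 0..8 {
//                 out[i] = a[i].wrapping_add(b[i]); // matches the wrapping `vpaddd` semantics
//             }
//             out
//         }
//     }
//
//     #[cfg(target_arch = "x86_64")]
//     #[target_feature(enable = "avx2")]
//     unsafe fn add_arrays_avx2(a: &[i32; 8], b: &[i32; 8]) -> [i32; 8] {
//         use std::arch::x86_64::*;
//         let va = _mm256_loadu_si256(a.as_ptr() as *const __m256i);
//         let vb = _mm256_loadu_si256(b.as_ptr() as *const __m256i);
//         let vr = _mm256_add_epi32(va, vb);
//         let mut out = [0i32; 8];
//         _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, vr);
//         out
//     }
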
29 /// Computes the absolute values of packed 32-bit integers in `a`.
30 ///
31 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_abs_epi32)
32 #[inline]
33 #[target_feature(enable = "avx2")]
34 #[cfg_attr(test, assert_instr(vpabsd))]
35 #[stable(feature = "simd_x86", since = "1.27.0")]
36 pub unsafe fn _mm256_abs_epi32(a: __m256i) -> __m256i {
37 transmute(pabsd(a.as_i32x8()))
38 }
39
40 /// Computes the absolute values of packed 16-bit integers in `a`.
41 ///
42 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_abs_epi16)
43 #[inline]
44 #[target_feature(enable = "avx2")]
45 #[cfg_attr(test, assert_instr(vpabsw))]
46 #[stable(feature = "simd_x86", since = "1.27.0")]
47 pub unsafe fn _mm256_abs_epi16(a: __m256i) -> __m256i {
48 transmute(pabsw(a.as_i16x16()))
49 }
50
51 /// Computes the absolute values of packed 8-bit integers in `a`.
52 ///
53 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_abs_epi8)
54 #[inline]
55 #[target_feature(enable = "avx2")]
56 #[cfg_attr(test, assert_instr(vpabsb))]
57 #[stable(feature = "simd_x86", since = "1.27.0")]
58 pub unsafe fn _mm256_abs_epi8(a: __m256i) -> __m256i {
59 transmute(pabsb(a.as_i8x32()))
60 }
61
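// Illustrative sketch (not part of the upstream source): the absolute-value intrinsics as
// seen from caller code. `abs_demo` and the element values are made up; note that, like
// the underlying `vpabsd` instruction, `i32::MIN` has no positive counterpart and comes
// back unchanged.
//
//     use std::arch::x86_64::*;
//
//     #[target_feature(enable = "avx2")]
//     unsafe fn abs_demo() {
//         let a = _mm256_setr_epi32(-1, 2, -3, 4, -5, 6, -7, i32::MIN);
//         let r = _mm256_abs_epi32(a); // [1, 2, 3, 4, 5, 6, 7, i32::MIN]
//     }
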
62 /// Adds packed 64-bit integers in `a` and `b`.
63 ///
64 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_epi64)
65 #[inline]
66 #[target_feature(enable = "avx2")]
67 #[cfg_attr(test, assert_instr(vpaddq))]
68 #[stable(feature = "simd_x86", since = "1.27.0")]
69 pub unsafe fn _mm256_add_epi64(a: __m256i, b: __m256i) -> __m256i {
70 transmute(simd_add(a.as_i64x4(), b.as_i64x4()))
71 }
72
73 /// Adds packed 32-bit integers in `a` and `b`.
74 ///
75 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_epi32)
76 #[inline]
77 #[target_feature(enable = "avx2")]
78 #[cfg_attr(test, assert_instr(vpaddd))]
79 #[stable(feature = "simd_x86", since = "1.27.0")]
80 pub unsafe fn _mm256_add_epi32(a: __m256i, b: __m256i) -> __m256i {
81 transmute(simd_add(a.as_i32x8(), b.as_i32x8()))
82 }
83
84 /// Adds packed 16-bit integers in `a` and `b`.
85 ///
86 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_epi16)
87 #[inline]
88 #[target_feature(enable = "avx2")]
89 #[cfg_attr(test, assert_instr(vpaddw))]
90 #[stable(feature = "simd_x86", since = "1.27.0")]
91 pub unsafe fn _mm256_add_epi16(a: __m256i, b: __m256i) -> __m256i {
92 transmute(simd_add(a.as_i16x16(), b.as_i16x16()))
93 }
94
95 /// Adds packed 8-bit integers in `a` and `b`.
96 ///
97 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_epi8)
98 #[inline]
99 #[target_feature(enable = "avx2")]
100 #[cfg_attr(test, assert_instr(vpaddb))]
101 #[stable(feature = "simd_x86", since = "1.27.0")]
102 pub unsafe fn _mm256_add_epi8(a: __m256i, b: __m256i) -> __m256i {
103 transmute(simd_add(a.as_i8x32(), b.as_i8x32()))
104 }
105
106 /// Adds packed 8-bit integers in `a` and `b` using saturation.
107 ///
108 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_adds_epi8)
109 #[inline]
110 #[target_feature(enable = "avx2")]
111 #[cfg_attr(test, assert_instr(vpaddsb))]
112 #[stable(feature = "simd_x86", since = "1.27.0")]
113 pub unsafe fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i {
114 transmute(simd_saturating_add(a.as_i8x32(), b.as_i8x32()))
115 }
116
117 /// Adds packed 16-bit integers in `a` and `b` using saturation.
118 ///
119 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_adds_epi16)
120 #[inline]
121 #[target_feature(enable = "avx2")]
122 #[cfg_attr(test, assert_instr(vpaddsw))]
123 #[stable(feature = "simd_x86", since = "1.27.0")]
124 pub unsafe fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i {
125 transmute(simd_saturating_add(a.as_i16x16(), b.as_i16x16()))
126 }
127
128 /// Adds packed unsigned 8-bit integers in `a` and `b` using saturation.
129 ///
130 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_adds_epu8)
131 #[inline]
132 #[target_feature(enable = "avx2")]
133 #[cfg_attr(test, assert_instr(vpaddusb))]
134 #[stable(feature = "simd_x86", since = "1.27.0")]
135 pub unsafe fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i {
136 transmute(simd_saturating_add(a.as_u8x32(), b.as_u8x32()))
137 }
138
139 /// Adds packed unsigned 16-bit integers in `a` and `b` using saturation.
140 ///
141 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_adds_epu16)
142 #[inline]
143 #[target_feature(enable = "avx2")]
144 #[cfg_attr(test, assert_instr(vpaddusw))]
145 #[stable(feature = "simd_x86", since = "1.27.0")]
146 pub unsafe fn _mm256_adds_epu16(a: __m256i, b: __m256i) -> __m256i {
147 transmute(simd_saturating_add(a.as_u16x16(), b.as_u16x16()))
148 }
149
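// Illustrative sketch (not part of the upstream source): the difference between the
// wrapping additions above and the saturating ones. `saturation_demo` and the byte values
// are made up for the example.
//
//     use std::arch::x86_64::*;
//
//     #[target_feature(enable = "avx2")]
//     unsafe fn saturation_demo() {
//         let a = _mm256_set1_epi8(-56);        // the byte 200 when viewed as unsigned
//         let b = _mm256_set1_epi8(100);
//         let wrapped = _mm256_add_epi8(a, b);  // 200 + 100 wraps around to 44
//         let clamped = _mm256_adds_epu8(a, b); // 200 + 100 saturates to 255
//     }
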
150 /// Concatenates pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary
151 /// result, shifts the result right by `n` bytes, and returns the low 16 bytes.
152 ///
153 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_alignr_epi8)
154 #[inline]
155 #[target_feature(enable = "avx2")]
156 #[cfg_attr(test, assert_instr(vpalignr, n = 7))]
157 #[rustc_args_required_const(2)]
158 #[stable(feature = "simd_x86", since = "1.27.0")]
159 pub unsafe fn _mm256_alignr_epi8(a: __m256i, b: __m256i, n: i32) -> __m256i {
160 let n = n as u32;
161 // If `palignr` is shifting the pair of vectors more than the size of two
162 // lanes, emit zero.
163 if n > 32 {
164 return _mm256_set1_epi8(0);
165 }
166 // If `palignr` is shifting the pair of input vectors more than one lane,
167 // but less than two lanes, convert to shifting in zeroes.
168 let (a, b, n) = if n > 16 {
169 (_mm256_set1_epi8(0), a, n - 16)
170 } else {
171 (a, b, n)
172 };
173
174 let a = a.as_i8x32();
175 let b = b.as_i8x32();
176
177 let r: i8x32 = match n {
178 0 => simd_shuffle32(
179 b,
180 a,
181 [
182 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
183 23, 24, 25, 26, 27, 28, 29, 30, 31,
184 ],
185 ),
186 1 => simd_shuffle32(
187 b,
188 a,
189 [
190 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 17, 18, 19, 20, 21, 22, 23,
191 24, 25, 26, 27, 28, 29, 30, 31, 48,
192 ],
193 ),
194 2 => simd_shuffle32(
195 b,
196 a,
197 [
198 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 18, 19, 20, 21, 22, 23, 24,
199 25, 26, 27, 28, 29, 30, 31, 48, 49,
200 ],
201 ),
202 3 => simd_shuffle32(
203 b,
204 a,
205 [
206 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 19, 20, 21, 22, 23, 24,
207 25, 26, 27, 28, 29, 30, 31, 48, 49, 50,
208 ],
209 ),
210 4 => simd_shuffle32(
211 b,
212 a,
213 [
214 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 20, 21, 22, 23, 24, 25,
215 26, 27, 28, 29, 30, 31, 48, 49, 50, 51,
216 ],
217 ),
218 5 => simd_shuffle32(
219 b,
220 a,
221 [
222 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 21, 22, 23, 24, 25, 26,
223 27, 28, 29, 30, 31, 48, 49, 50, 51, 52,
224 ],
225 ),
226 6 => simd_shuffle32(
227 b,
228 a,
229 [
230 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 22, 23, 24, 25, 26, 27,
231 28, 29, 30, 31, 48, 49, 50, 51, 52, 53,
232 ],
233 ),
234 7 => simd_shuffle32(
235 b,
236 a,
237 [
238 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 23, 24, 25, 26, 27,
239 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54,
240 ],
241 ),
242 8 => simd_shuffle32(
243 b,
244 a,
245 [
246 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 24, 25, 26, 27, 28,
247 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55,
248 ],
249 ),
250 9 => simd_shuffle32(
251 b,
252 a,
253 [
254 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 25, 26, 27, 28, 29,
255 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56,
256 ],
257 ),
258 10 => simd_shuffle32(
259 b,
260 a,
261 [
262 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 26, 27, 28, 29, 30,
263 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
264 ],
265 ),
266 11 => simd_shuffle32(
267 b,
268 a,
269 [
270 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 27, 28, 29, 30, 31,
271 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58,
272 ],
273 ),
274 12 => simd_shuffle32(
275 b,
276 a,
277 [
278 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 28, 29, 30, 31, 48,
279 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
280 ],
281 ),
282 13 => simd_shuffle32(
283 b,
284 a,
285 [
286 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 29, 30, 31, 48, 49,
287 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
288 ],
289 ),
290 14 => simd_shuffle32(
291 b,
292 a,
293 [
294 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 30, 31, 48, 49, 50,
295 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
296 ],
297 ),
298 15 => simd_shuffle32(
299 b,
300 a,
301 [
302 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 31, 48, 49, 50, 51,
303 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
304 ],
305 ),
306 _ => b,
307 };
308 transmute(r)
309 }
310
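// Illustrative sketch (not part of the upstream source): `_mm256_alignr_epi8` works on
// each 128-bit lane separately, so passing the same vector twice with a shift of one
// rotates every 16-byte half by one byte. `alignr_demo` and the byte values are made up.
//
//     use std::arch::x86_64::*;
//
//     #[target_feature(enable = "avx2")]
//     unsafe fn alignr_demo() {
//         let a = _mm256_setr_epi8(
//             0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
//             16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
//         );
//         // Low lane: [1, 2, ..., 15, 0]; high lane: [17, 18, ..., 31, 16].
//         let r = _mm256_alignr_epi8(a, a, 1);
//     }
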
311 /// Computes the bitwise AND of 256 bits (representing integer data)
312 /// in `a` and `b`.
313 ///
314 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_and_si256)
315 #[inline]
316 #[target_feature(enable = "avx2")]
317 #[cfg_attr(test, assert_instr(vandps))]
318 #[stable(feature = "simd_x86", since = "1.27.0")]
319 pub unsafe fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i {
320 transmute(simd_and(a.as_i64x4(), b.as_i64x4()))
321 }
322
323 /// Computes the bitwise NOT of 256 bits (representing integer data)
324 /// in `a` and then AND with `b`.
325 ///
326 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_andnot_si256)
327 #[inline]
328 #[target_feature(enable = "avx2")]
329 #[cfg_attr(test, assert_instr(vandnps))]
330 #[stable(feature = "simd_x86", since = "1.27.0")]
331 pub unsafe fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i {
332 let all_ones = _mm256_set1_epi8(-1);
333 transmute(simd_and(
334 simd_xor(a.as_i64x4(), all_ones.as_i64x4()),
335 b.as_i64x4(),
336 ))
337 }
338
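// Illustrative sketch (not part of the upstream source): `_mm256_andnot_si256(a, b)`
// computes `(!a) & b`, which makes it convenient for clearing the bits of `b` selected by
// a mask. `andnot_demo` and the constants are made up for the example.
//
//     use std::arch::x86_64::*;
//
//     #[target_feature(enable = "avx2")]
//     unsafe fn andnot_demo() {
//         let low_nibbles = _mm256_set1_epi8(0x0f);
//         let data = _mm256_set1_epi8(0x5a);
//         let kept = _mm256_and_si256(low_nibbles, data);       // 0x0a in every byte
//         let cleared = _mm256_andnot_si256(low_nibbles, data); // 0x50 in every byte
//     }
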
339 /// Averages packed unsigned 16-bit integers in `a` and `b`.
340 ///
341 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_avg_epu16)
342 #[inline]
343 #[target_feature(enable = "avx2")]
344 #[cfg_attr(test, assert_instr(vpavgw))]
345 #[stable(feature = "simd_x86", since = "1.27.0")]
346 pub unsafe fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i {
347 transmute(pavgw(a.as_u16x16(), b.as_u16x16()))
348 }
349
350 /// Averages packed unsigned 8-bit integers in `a` and `b`.
351 ///
352 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_avg_epu8)
353 #[inline]
354 #[target_feature(enable = "avx2")]
355 #[cfg_attr(test, assert_instr(vpavgb))]
356 #[stable(feature = "simd_x86", since = "1.27.0")]
357 pub unsafe fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i {
358 transmute(pavgb(a.as_u8x32(), b.as_u8x32()))
359 }
360
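// Illustrative sketch (not part of the upstream source): the packed averages round up,
// i.e. each output element is `(a + b + 1) >> 1` computed in a wider intermediate, so the
// sum cannot overflow. `avg_demo` and the values are made up for the example.
//
//     use std::arch::x86_64::*;
//
//     #[target_feature(enable = "avx2")]
//     unsafe fn avg_demo() {
//         let a = _mm256_set1_epi16(250);
//         let b = _mm256_set1_epi16(255);
//         let r = _mm256_avg_epu16(a, b); // (250 + 255 + 1) >> 1 = 253 in every element
//     }
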
361 /// Blends packed 32-bit integers from `a` and `b` using control mask `imm8`.
362 ///
363 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_epi32)
364 #[inline]
365 #[target_feature(enable = "avx2")]
366 #[cfg_attr(test, assert_instr(vblendps, imm8 = 9))]
367 #[rustc_args_required_const(2)]
368 #[stable(feature = "simd_x86", since = "1.27.0")]
369 pub unsafe fn _mm_blend_epi32(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
370 let imm8 = (imm8 & 0xFF) as u8;
371 let a = a.as_i32x4();
372 let b = b.as_i32x4();
373 macro_rules! blend2 {
374 ($a:expr, $b:expr, $c:expr, $d:expr) => {
375 simd_shuffle4(a, b, [$a, $b, $c, $d]);
376 };
377 }
378 macro_rules! blend1 {
379 ($a:expr, $b:expr) => {
380 match (imm8 >> 2) & 0b11 {
381 0b00 => blend2!($a, $b, 2, 3),
382 0b01 => blend2!($a, $b, 6, 3),
383 0b10 => blend2!($a, $b, 2, 7),
384 _ => blend2!($a, $b, 6, 7),
385 }
386 };
387 }
388 let r: i32x4 = match imm8 & 0b11 {
389 0b00 => blend1!(0, 1),
390 0b01 => blend1!(4, 1),
391 0b10 => blend1!(0, 5),
392 _ => blend1!(4, 5),
393 };
394 transmute(r)
395 }
396
397 /// Blends packed 32-bit integers from `a` and `b` using control mask `imm8`.
398 ///
399 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_blend_epi32)
400 #[inline]
401 #[target_feature(enable = "avx2")]
402 #[cfg_attr(test, assert_instr(vblendps, imm8 = 9))]
403 #[rustc_args_required_const(2)]
404 #[stable(feature = "simd_x86", since = "1.27.0")]
405 pub unsafe fn _mm256_blend_epi32(a: __m256i, b: __m256i, imm8: i32) -> __m256i {
406 let imm8 = (imm8 & 0xFF) as u8;
407 let a = a.as_i32x8();
408 let b = b.as_i32x8();
409 macro_rules! blend4 {
410 (
411 $a:expr,
412 $b:expr,
413 $c:expr,
414 $d:expr,
415 $e:expr,
416 $f:expr,
417 $g:expr,
418 $h:expr
419 ) => {
420 simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h]);
421 };
422 }
423 macro_rules! blend3 {
424 ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr) => {
425 match (imm8 >> 6) & 0b11 {
426 0b00 => blend4!($a, $b, $c, $d, $e, $f, 6, 7),
427 0b01 => blend4!($a, $b, $c, $d, $e, $f, 14, 7),
428 0b10 => blend4!($a, $b, $c, $d, $e, $f, 6, 15),
429 _ => blend4!($a, $b, $c, $d, $e, $f, 14, 15),
430 }
431 };
432 }
433 macro_rules! blend2 {
434 ($a:expr, $b:expr, $c:expr, $d:expr) => {
435 match (imm8 >> 4) & 0b11 {
436 0b00 => blend3!($a, $b, $c, $d, 4, 5),
437 0b01 => blend3!($a, $b, $c, $d, 12, 5),
438 0b10 => blend3!($a, $b, $c, $d, 4, 13),
439 _ => blend3!($a, $b, $c, $d, 12, 13),
440 }
441 };
442 }
443 macro_rules! blend1 {
444 ($a:expr, $b:expr) => {
445 match (imm8 >> 2) & 0b11 {
446 0b00 => blend2!($a, $b, 2, 3),
447 0b01 => blend2!($a, $b, 10, 3),
448 0b10 => blend2!($a, $b, 2, 11),
449 _ => blend2!($a, $b, 10, 11),
450 }
451 };
452 }
453 let r: i32x8 = match imm8 & 0b11 {
454 0b00 => blend1!(0, 1),
455 0b01 => blend1!(8, 1),
456 0b10 => blend1!(0, 9),
457 _ => blend1!(8, 9),
458 };
459 transmute(r)
460 }
461
462 /// Blends packed 16-bit integers from `a` and `b` using control mask `imm8`.
463 ///
464 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_blend_epi16)
465 #[inline]
466 #[target_feature(enable = "avx2")]
467 #[cfg_attr(test, assert_instr(vpblendw, imm8 = 9))]
468 #[rustc_args_required_const(2)]
469 #[stable(feature = "simd_x86", since = "1.27.0")]
470 pub unsafe fn _mm256_blend_epi16(a: __m256i, b: __m256i, imm8: i32) -> __m256i {
471 let imm8 = (imm8 & 0xFF) as u8;
472 let a = a.as_i16x16();
473 let b = b.as_i16x16();
474 macro_rules! blend4 {
475 (
476 $a:expr,
477 $b:expr,
478 $c:expr,
479 $d:expr,
480 $e:expr,
481 $f:expr,
482 $g:expr,
483 $h:expr,
484 $i:expr,
485 $j:expr,
486 $k:expr,
487 $l:expr,
488 $m:expr,
489 $n:expr,
490 $o:expr,
491 $p:expr
492 ) => {
493 simd_shuffle16(
494 a,
495 b,
496 [
497 $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p,
498 ],
499 )
500 };
501 }
502 macro_rules! blend3 {
503 (
504 $a:expr,
505 $b:expr,
506 $c:expr,
507 $d:expr,
508 $e:expr,
509 $f:expr,
510 $a2:expr,
511 $b2:expr,
512 $c2:expr,
513 $d2:expr,
514 $e2:expr,
515 $f2:expr
516 ) => {
517 match (imm8 >> 6) & 0b11 {
518 0b00 => blend4!($a, $b, $c, $d, $e, $f, 6, 7, $a2, $b2, $c2, $d2, $e2, $f2, 14, 15),
519 0b01 => {
520 blend4!($a, $b, $c, $d, $e, $f, 22, 7, $a2, $b2, $c2, $d2, $e2, $f2, 30, 15)
521 }
522 0b10 => {
523 blend4!($a, $b, $c, $d, $e, $f, 6, 23, $a2, $b2, $c2, $d2, $e2, $f2, 14, 31)
524 }
525 _ => blend4!($a, $b, $c, $d, $e, $f, 22, 23, $a2, $b2, $c2, $d2, $e2, $f2, 30, 31),
526 }
527 };
528 }
529 macro_rules! blend2 {
530 (
531 $a:expr,
532 $b:expr,
533 $c:expr,
534 $d:expr,
535 $a2:expr,
536 $b2:expr,
537 $c2:expr,
538 $d2:expr
539 ) => {
540 match (imm8 >> 4) & 0b11 {
541 0b00 => blend3!($a, $b, $c, $d, 4, 5, $a2, $b2, $c2, $d2, 12, 13),
542 0b01 => blend3!($a, $b, $c, $d, 20, 5, $a2, $b2, $c2, $d2, 28, 13),
543 0b10 => blend3!($a, $b, $c, $d, 4, 21, $a2, $b2, $c2, $d2, 12, 29),
544 _ => blend3!($a, $b, $c, $d, 20, 21, $a2, $b2, $c2, $d2, 28, 29),
545 }
546 };
547 }
548 macro_rules! blend1 {
549 ($a1:expr, $b1:expr, $a2:expr, $b2:expr) => {
550 match (imm8 >> 2) & 0b11 {
551 0b00 => blend2!($a1, $b1, 2, 3, $a2, $b2, 10, 11),
552 0b01 => blend2!($a1, $b1, 18, 3, $a2, $b2, 26, 11),
553 0b10 => blend2!($a1, $b1, 2, 19, $a2, $b2, 10, 27),
554 _ => blend2!($a1, $b1, 18, 19, $a2, $b2, 26, 27),
555 }
556 };
557 }
558 let r: i16x16 = match imm8 & 0b11 {
559 0b00 => blend1!(0, 1, 8, 9),
560 0b01 => blend1!(16, 1, 24, 9),
561 0b10 => blend1!(0, 17, 8, 25),
562 _ => blend1!(16, 17, 24, 25),
563 };
564 transmute(r)
565 }
566
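// Illustrative sketch (not part of the upstream source): in the immediate-controlled
// blends, bit `i` of `imm8` selects element `i` of `b` when set and element `i` of `a`
// when clear. `blend_demo` and the constants are made up for the example.
//
//     use std::arch::x86_64::*;
//
//     #[target_feature(enable = "avx2")]
//     unsafe fn blend_demo() {
//         let a = _mm256_set1_epi32(1);
//         let b = _mm256_set1_epi32(2);
//         // 0b1010_1010 takes `b` in the odd 32-bit positions and `a` in the even ones,
//         // producing [1, 2, 1, 2, 1, 2, 1, 2].
//         let r = _mm256_blend_epi32(a, b, 0b1010_1010);
//     }
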
567 /// Blends packed 8-bit integers from `a` and `b` using `mask`.
568 ///
569 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_blendv_epi8)
570 #[inline]
571 #[target_feature(enable = "avx2")]
572 #[cfg_attr(test, assert_instr(vpblendvb))]
573 #[stable(feature = "simd_x86", since = "1.27.0")]
574 pub unsafe fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i {
575 transmute(pblendvb(a.as_i8x32(), b.as_i8x32(), mask.as_i8x32()))
576 }
577
578 /// Broadcasts the low packed 8-bit integer from `a` to all elements of
579 /// the 128-bit returned value.
580 ///
581 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastb_epi8)
582 #[inline]
583 #[target_feature(enable = "avx2")]
584 #[cfg_attr(test, assert_instr(vpbroadcastb))]
585 #[stable(feature = "simd_x86", since = "1.27.0")]
586 pub unsafe fn _mm_broadcastb_epi8(a: __m128i) -> __m128i {
587 let zero = _mm_setzero_si128();
588 let ret = simd_shuffle16(a.as_i8x16(), zero.as_i8x16(), [0_u32; 16]);
589 transmute::<i8x16, _>(ret)
590 }
591
592 /// Broadcasts the low packed 8-bit integer from `a` to all elements of
593 /// the 256-bit returned value.
594 ///
595 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastb_epi8)
596 #[inline]
597 #[target_feature(enable = "avx2")]
598 #[cfg_attr(test, assert_instr(vpbroadcastb))]
599 #[stable(feature = "simd_x86", since = "1.27.0")]
600 pub unsafe fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i {
601 let zero = _mm_setzero_si128();
602 let ret = simd_shuffle32(a.as_i8x16(), zero.as_i8x16(), [0_u32; 32]);
603 transmute::<i8x32, _>(ret)
604 }
605
606 // N.B., `simd_shuffle4` with integer data types for `a` and `b` is
607 // often compiled to `vbroadcastss`.
608 /// Broadcasts the low packed 32-bit integer from `a` to all elements of
609 /// the 128-bit returned value.
610 ///
611 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastd_epi32)
612 #[inline]
613 #[target_feature(enable = "avx2")]
614 #[cfg_attr(test, assert_instr(vbroadcastss))]
615 #[stable(feature = "simd_x86", since = "1.27.0")]
616 pub unsafe fn _mm_broadcastd_epi32(a: __m128i) -> __m128i {
617 let zero = _mm_setzero_si128();
618 let ret = simd_shuffle4(a.as_i32x4(), zero.as_i32x4(), [0_u32; 4]);
619 transmute::<i32x4, _>(ret)
620 }
621
622 // N.B., `simd_shuffle8` with integer data types for `a` and `b` is
623 // often compiled to `vbroadcastss`.
624 /// Broadcasts the low packed 32-bit integer from `a` to all elements of
625 /// the 256-bit returned value.
626 ///
627 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastd_epi32)
628 #[inline]
629 #[target_feature(enable = "avx2")]
630 #[cfg_attr(test, assert_instr(vbroadcastss))]
631 #[stable(feature = "simd_x86", since = "1.27.0")]
632 pub unsafe fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i {
633 let zero = _mm_setzero_si128();
634 let ret = simd_shuffle8(a.as_i32x4(), zero.as_i32x4(), [0_u32; 8]);
635 transmute::<i32x8, _>(ret)
636 }
637
638 /// Broadcasts the low packed 64-bit integer from `a` to all elements of
639 /// the 128-bit returned value.
640 ///
641 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastq_epi64)
642 #[inline]
643 #[target_feature(enable = "avx2")]
644 // FIXME: https://github.com/rust-lang/stdarch/issues/791
645 #[cfg_attr(test, assert_instr(vmovddup))]
646 #[stable(feature = "simd_x86", since = "1.27.0")]
647 pub unsafe fn _mm_broadcastq_epi64(a: __m128i) -> __m128i {
648 let ret = simd_shuffle2(a.as_i64x2(), a.as_i64x2(), [0_u32; 2]);
649 transmute::<i64x2, _>(ret)
650 }
651
652 /// Broadcasts the low packed 64-bit integer from `a` to all elements of
653 /// the 256-bit returned value.
654 ///
655 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastq_epi64)
656 #[inline]
657 #[target_feature(enable = "avx2")]
658 #[cfg_attr(test, assert_instr(vbroadcastsd))]
659 #[stable(feature = "simd_x86", since = "1.27.0")]
660 pub unsafe fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i {
661 let ret = simd_shuffle4(a.as_i64x2(), a.as_i64x2(), [0_u32; 4]);
662 transmute::<i64x4, _>(ret)
663 }
664
665 /// Broadcasts the low double-precision (64-bit) floating-point element
666 /// from `a` to all elements of the 128-bit returned value.
667 ///
668 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastsd_pd)
669 #[inline]
670 #[target_feature(enable = "avx2")]
671 #[cfg_attr(test, assert_instr(vmovddup))]
672 #[stable(feature = "simd_x86", since = "1.27.0")]
673 pub unsafe fn _mm_broadcastsd_pd(a: __m128d) -> __m128d {
674 simd_shuffle2(a, _mm_setzero_pd(), [0_u32; 2])
675 }
676
677 /// Broadcasts the low double-precision (64-bit) floating-point element
678 /// from `a` to all elements of the 256-bit returned value.
679 ///
680 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastsd_pd)
681 #[inline]
682 #[target_feature(enable = "avx2")]
683 #[cfg_attr(test, assert_instr(vbroadcastsd))]
684 #[stable(feature = "simd_x86", since = "1.27.0")]
685 pub unsafe fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d {
686 simd_shuffle4(a, _mm_setzero_pd(), [0_u32; 4])
687 }
688
689 // N.B., `broadcastsi128_si256` is often compiled to `vinsertf128` or
690 // `vbroadcastf128`.
691 /// Broadcasts 128 bits of integer data from `a` to all 128-bit lanes in
692 /// the 256-bit returned value.
693 ///
694 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastsi128_si256)
695 #[inline]
696 #[target_feature(enable = "avx2")]
697 #[stable(feature = "simd_x86", since = "1.27.0")]
698 pub unsafe fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i {
699 let zero = _mm_setzero_si128();
700 let ret = simd_shuffle4(a.as_i64x2(), zero.as_i64x2(), [0, 1, 0, 1]);
701 transmute::<i64x4, _>(ret)
702 }
703
704 /// Broadcasts the low single-precision (32-bit) floating-point element
705 /// from `a` to all elements of the 128-bit returned value.
706 ///
707 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastss_ps)
708 #[inline]
709 #[target_feature(enable = "avx2")]
710 #[cfg_attr(test, assert_instr(vbroadcastss))]
711 #[stable(feature = "simd_x86", since = "1.27.0")]
712 pub unsafe fn _mm_broadcastss_ps(a: __m128) -> __m128 {
713 simd_shuffle4(a, _mm_setzero_ps(), [0_u32; 4])
714 }
715
716 /// Broadcasts the low single-precision (32-bit) floating-point element
717 /// from `a` to all elements of the 256-bit returned value.
718 ///
719 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastss_ps)
720 #[inline]
721 #[target_feature(enable = "avx2")]
722 #[cfg_attr(test, assert_instr(vbroadcastss))]
723 #[stable(feature = "simd_x86", since = "1.27.0")]
724 pub unsafe fn _mm256_broadcastss_ps(a: __m128) -> __m256 {
725 simd_shuffle8(a, _mm_setzero_ps(), [0_u32; 8])
726 }
727
728 /// Broadcasts the low packed 16-bit integer from `a` to all elements of
729 /// the 128-bit returned value.
730 ///
731 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastw_epi16)
732 #[inline]
733 #[target_feature(enable = "avx2")]
734 #[cfg_attr(test, assert_instr(vpbroadcastw))]
735 #[stable(feature = "simd_x86", since = "1.27.0")]
736 pub unsafe fn _mm_broadcastw_epi16(a: __m128i) -> __m128i {
737 let zero = _mm_setzero_si128();
738 let ret = simd_shuffle8(a.as_i16x8(), zero.as_i16x8(), [0_u32; 8]);
739 transmute::<i16x8, _>(ret)
740 }
741
742 /// Broadcasts the low packed 16-bit integer from `a` to all elements of
743 /// the 256-bit returned value.
744 ///
745 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastw_epi16)
746 #[inline]
747 #[target_feature(enable = "avx2")]
748 #[cfg_attr(test, assert_instr(vpbroadcastw))]
749 #[stable(feature = "simd_x86", since = "1.27.0")]
750 pub unsafe fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i {
751 let zero = _mm_setzero_si128();
752 let ret = simd_shuffle16(a.as_i16x8(), zero.as_i16x8(), [0_u32; 16]);
753 transmute::<i16x16, _>(ret)
754 }
755
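// Illustrative sketch (not part of the upstream source): splatting a single scalar into
// every element of a 256-bit vector via the broadcast intrinsics. `broadcast_demo` is a
// made-up helper; in practice `_mm256_set1_epi32` compiles to the same broadcast.
//
//     use std::arch::x86_64::*;
//
//     #[target_feature(enable = "avx2")]
//     unsafe fn broadcast_demo(coeff: i32) -> __m256i {
//         let v = _mm_cvtsi32_si128(coeff); // [coeff, 0, 0, 0]
//         _mm256_broadcastd_epi32(v)        // `coeff` in all eight elements
//     }
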
756 /// Compares packed 64-bit integers in `a` and `b` for equality.
757 ///
758 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epi64)
759 #[inline]
760 #[target_feature(enable = "avx2")]
761 #[cfg_attr(test, assert_instr(vpcmpeqq))]
762 #[stable(feature = "simd_x86", since = "1.27.0")]
763 pub unsafe fn _mm256_cmpeq_epi64(a: __m256i, b: __m256i) -> __m256i {
764 transmute::<i64x4, _>(simd_eq(a.as_i64x4(), b.as_i64x4()))
765 }
766
767 /// Compares packed 32-bit integers in `a` and `b` for equality.
768 ///
769 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epi32)
770 #[inline]
771 #[target_feature(enable = "avx2")]
772 #[cfg_attr(test, assert_instr(vpcmpeqd))]
773 #[stable(feature = "simd_x86", since = "1.27.0")]
774 pub unsafe fn _mm256_cmpeq_epi32(a: __m256i, b: __m256i) -> __m256i {
775 transmute::<i32x8, _>(simd_eq(a.as_i32x8(), b.as_i32x8()))
776 }
777
778 /// Compares packed 16-bit integers in `a` and `b` for equality.
779 ///
780 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epi16)
781 #[inline]
782 #[target_feature(enable = "avx2")]
783 #[cfg_attr(test, assert_instr(vpcmpeqw))]
784 #[stable(feature = "simd_x86", since = "1.27.0")]
785 pub unsafe fn _mm256_cmpeq_epi16(a: __m256i, b: __m256i) -> __m256i {
786 transmute::<i16x16, _>(simd_eq(a.as_i16x16(), b.as_i16x16()))
787 }
788
789 /// Compares packed 8-bit integers in `a` and `b` for equality.
790 ///
791 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epi8)
792 #[inline]
793 #[target_feature(enable = "avx2")]
794 #[cfg_attr(test, assert_instr(vpcmpeqb))]
795 #[stable(feature = "simd_x86", since = "1.27.0")]
796 pub unsafe fn _mm256_cmpeq_epi8(a: __m256i, b: __m256i) -> __m256i {
797 transmute::<i8x32, _>(simd_eq(a.as_i8x32(), b.as_i8x32()))
798 }
799
800 /// Compares packed 64-bit integers in `a` and `b` for greater-than.
801 ///
802 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epi64)
803 #[inline]
804 #[target_feature(enable = "avx2")]
805 #[cfg_attr(test, assert_instr(vpcmpgtq))]
806 #[stable(feature = "simd_x86", since = "1.27.0")]
807 pub unsafe fn _mm256_cmpgt_epi64(a: __m256i, b: __m256i) -> __m256i {
808 transmute::<i64x4, _>(simd_gt(a.as_i64x4(), b.as_i64x4()))
809 }
810
811 /// Compares packed 32-bit integers in `a` and `b` for greater-than.
812 ///
813 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epi32)
814 #[inline]
815 #[target_feature(enable = "avx2")]
816 #[cfg_attr(test, assert_instr(vpcmpgtd))]
817 #[stable(feature = "simd_x86", since = "1.27.0")]
818 pub unsafe fn _mm256_cmpgt_epi32(a: __m256i, b: __m256i) -> __m256i {
819 transmute::<i32x8, _>(simd_gt(a.as_i32x8(), b.as_i32x8()))
820 }
821
822 /// Compares packed 16-bit integers in `a` and `b` for greater-than.
823 ///
824 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epi16)
825 #[inline]
826 #[target_feature(enable = "avx2")]
827 #[cfg_attr(test, assert_instr(vpcmpgtw))]
828 #[stable(feature = "simd_x86", since = "1.27.0")]
829 pub unsafe fn _mm256_cmpgt_epi16(a: __m256i, b: __m256i) -> __m256i {
830 transmute::<i16x16, _>(simd_gt(a.as_i16x16(), b.as_i16x16()))
831 }
832
833 /// Compares packed 8-bit integers in `a` and `b` for greater-than.
834 ///
835 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epi8)
836 #[inline]
837 #[target_feature(enable = "avx2")]
838 #[cfg_attr(test, assert_instr(vpcmpgtb))]
839 #[stable(feature = "simd_x86", since = "1.27.0")]
840 pub unsafe fn _mm256_cmpgt_epi8(a: __m256i, b: __m256i) -> __m256i {
841 transmute::<i8x32, _>(simd_gt(a.as_i8x32(), b.as_i8x32()))
842 }
843
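// Illustrative sketch (not part of the upstream source): the comparisons return all-ones
// (that is, -1) in every element where the predicate holds and zero elsewhere, which
// composes directly with `_mm256_blendv_epi8` or `_mm256_movemask_epi8`. `compare_demo`
// and the values are made up for the example.
//
//     use std::arch::x86_64::*;
//
//     #[target_feature(enable = "avx2")]
//     unsafe fn compare_demo() {
//         let x = _mm256_setr_epi32(5, -3, 7, -1, 0, -9, 2, -4);
//         let zero = _mm256_setzero_si256();
//         let gt = _mm256_cmpgt_epi32(x, zero);             // -1 where x > 0, else 0
//         let relu = _mm256_blendv_epi8(zero, x, gt);       // max(x, 0) per element
//         let any_positive = _mm256_movemask_epi8(gt) != 0; // scalar view of the mask
//     }
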
844 /// Sign-extend 16-bit integers to 32-bit integers.
845 ///
846 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi16_epi32)
847 #[inline]
848 #[target_feature(enable = "avx2")]
849 #[cfg_attr(test, assert_instr(vpmovsxwd))]
850 #[stable(feature = "simd_x86", since = "1.27.0")]
851 pub unsafe fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i {
852 transmute::<i32x8, _>(simd_cast(a.as_i16x8()))
853 }
854
855 /// Sign-extend 16-bit integers to 64-bit integers.
856 ///
857 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi16_epi64)
858 #[inline]
859 #[target_feature(enable = "avx2")]
860 #[cfg_attr(test, assert_instr(vpmovsxwq))]
861 #[stable(feature = "simd_x86", since = "1.27.0")]
862 pub unsafe fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i {
863 let a = a.as_i16x8();
864 let v64: i16x4 = simd_shuffle4(a, a, [0, 1, 2, 3]);
865 transmute::<i64x4, _>(simd_cast(v64))
866 }
867
868 /// Sign-extend 32-bit integers to 64-bit integers.
869 ///
870 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi32_epi64)
871 #[inline]
872 #[target_feature(enable = "avx2")]
873 #[cfg_attr(test, assert_instr(vpmovsxdq))]
874 #[stable(feature = "simd_x86", since = "1.27.0")]
875 pub unsafe fn _mm256_cvtepi32_epi64(a: __m128i) -> __m256i {
876 transmute::<i64x4, _>(simd_cast(a.as_i32x4()))
877 }
878
879 /// Sign-extend 8-bit integers to 16-bit integers.
880 ///
881 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi8_epi16)
882 #[inline]
883 #[target_feature(enable = "avx2")]
884 #[cfg_attr(test, assert_instr(vpmovsxbw))]
885 #[stable(feature = "simd_x86", since = "1.27.0")]
886 pub unsafe fn _mm256_cvtepi8_epi16(a: __m128i) -> __m256i {
887 transmute::<i16x16, _>(simd_cast(a.as_i8x16()))
888 }
889
890 /// Sign-extend 8-bit integers to 32-bit integers.
891 ///
892 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi8_epi32)
893 #[inline]
894 #[target_feature(enable = "avx2")]
895 #[cfg_attr(test, assert_instr(vpmovsxbd))]
896 #[stable(feature = "simd_x86", since = "1.27.0")]
897 pub unsafe fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i {
898 let a = a.as_i8x16();
899 let v64: i8x8 = simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
900 transmute::<i32x8, _>(simd_cast(v64))
901 }
902
903 /// Sign-extend 8-bit integers to 64-bit integers.
904 ///
905 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi8_epi64)
906 #[inline]
907 #[target_feature(enable = "avx2")]
908 #[cfg_attr(test, assert_instr(vpmovsxbq))]
909 #[stable(feature = "simd_x86", since = "1.27.0")]
910 pub unsafe fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i {
911 let a = a.as_i8x16();
912 let v32: i8x4 = simd_shuffle4(a, a, [0, 1, 2, 3]);
913 transmute::<i64x4, _>(simd_cast(v32))
914 }
915
916 /// Zero-extend unsigned 16-bit integers in `a` to packed 32-bit
917 /// integers.
918 ///
919 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu16_epi32)
920 #[inline]
921 #[target_feature(enable = "avx2")]
922 #[cfg_attr(test, assert_instr(vpmovzxwd))]
923 #[stable(feature = "simd_x86", since = "1.27.0")]
924 pub unsafe fn _mm256_cvtepu16_epi32(a: __m128i) -> __m256i {
925 transmute::<i32x8, _>(simd_cast(a.as_u16x8()))
926 }
927
928 /// Zero-extend the lower four unsigned 16-bit integers in `a` to 64-bit
929 /// integers. The upper four elements of `a` are unused.
930 ///
931 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu16_epi64)
932 #[inline]
933 #[target_feature(enable = "avx2")]
934 #[cfg_attr(test, assert_instr(vpmovzxwq))]
935 #[stable(feature = "simd_x86", since = "1.27.0")]
936 pub unsafe fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i {
937 let a = a.as_u16x8();
938 let v64: u16x4 = simd_shuffle4(a, a, [0, 1, 2, 3]);
939 transmute::<i64x4, _>(simd_cast(v64))
940 }
941
942 /// Zero-extend unsigned 32-bit integers in `a` to 64-bit integers.
943 ///
944 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu32_epi64)
945 #[inline]
946 #[target_feature(enable = "avx2")]
947 #[cfg_attr(test, assert_instr(vpmovzxdq))]
948 #[stable(feature = "simd_x86", since = "1.27.0")]
949 pub unsafe fn _mm256_cvtepu32_epi64(a: __m128i) -> __m256i {
950 transmute::<i64x4, _>(simd_cast(a.as_u32x4()))
951 }
952
953 /// Zero-extend unsigned 8-bit integers in `a` to 16-bit integers.
954 ///
955 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu8_epi16)
956 #[inline]
957 #[target_feature(enable = "avx2")]
958 #[cfg_attr(test, assert_instr(vpmovzxbw))]
959 #[stable(feature = "simd_x86", since = "1.27.0")]
960 pub unsafe fn _mm256_cvtepu8_epi16(a: __m128i) -> __m256i {
961 transmute::<i16x16, _>(simd_cast(a.as_u8x16()))
962 }
963
964 /// Zero-extend the lower eight unsigned 8-bit integers in `a` to 32-bit
965 /// integers. The upper eight elements of `a` are unused.
966 ///
967 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu8_epi32)
968 #[inline]
969 #[target_feature(enable = "avx2")]
970 #[cfg_attr(test, assert_instr(vpmovzxbd))]
971 #[stable(feature = "simd_x86", since = "1.27.0")]
972 pub unsafe fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i {
973 let a = a.as_u8x16();
974 let v64: u8x8 = simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
975 transmute::<i32x8, _>(simd_cast(v64))
976 }
977
978 /// Zero-extend the lower four unsigned 8-bit integers in `a` to 64-bit
979 /// integers. The upper twelve elements of `a` are unused.
980 ///
981 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu8_epi64)
982 #[inline]
983 #[target_feature(enable = "avx2")]
984 #[cfg_attr(test, assert_instr(vpmovzxbq))]
985 #[stable(feature = "simd_x86", since = "1.27.0")]
986 pub unsafe fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i {
987 let a = a.as_u8x16();
988 let v32: u8x4 = simd_shuffle4(a, a, [0, 1, 2, 3]);
989 transmute::<i64x4, _>(simd_cast(v32))
990 }
991
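// Illustrative sketch (not part of the upstream source): widening unsigned bytes before
// doing arithmetic that would overflow in 8 bits. `widen_demo` and the buffer layout are
// made up for the example.
//
//     use std::arch::x86_64::*;
//
//     #[target_feature(enable = "avx2")]
//     unsafe fn widen_demo(pixels: &[u8; 16]) {
//         let v = _mm_loadu_si128(pixels.as_ptr() as *const __m128i);
//         let as_u16 = _mm256_cvtepu8_epi16(v); // all 16 bytes, zero-extended to 16 bits
//         let as_u32 = _mm256_cvtepu8_epi32(v); // low 8 bytes, zero-extended to 32 bits
//     }
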
992 /// Extracts 128 bits (of integer data) from `a` selected with `imm8`.
993 ///
994 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extracti128_si256)
995 #[inline]
996 #[target_feature(enable = "avx2")]
997 #[cfg_attr(
998 all(test, not(target_os = "windows")),
999 assert_instr(vextractf128, imm8 = 1)
1000 )]
1001 #[rustc_args_required_const(1)]
1002 #[stable(feature = "simd_x86", since = "1.27.0")]
1003 pub unsafe fn _mm256_extracti128_si256(a: __m256i, imm8: i32) -> __m128i {
1004 let a = a.as_i64x4();
1005 let b = _mm256_undefined_si256().as_i64x4();
1006 let dst: i64x2 = match imm8 & 0b01 {
1007 0 => simd_shuffle2(a, b, [0, 1]),
1008 _ => simd_shuffle2(a, b, [2, 3]),
1009 };
1010 transmute(dst)
1011 }
1012
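// Illustrative sketch (not part of the upstream source): splitting a 256-bit vector into
// its 128-bit halves, a common final step of a reduction. `split_demo` is a made-up
// helper; `_mm256_castsi256_si128` and `_mm_add_epi32` come from the AVX and SSE2 modules.
//
//     use std::arch::x86_64::*;
//
//     #[target_feature(enable = "avx2")]
//     unsafe fn split_demo(v: __m256i) -> __m128i {
//         let lo = _mm256_castsi256_si128(v);      // low 128 bits (no instruction emitted)
//         let hi = _mm256_extracti128_si256(v, 1); // high 128 bits
//         _mm_add_epi32(lo, hi)                    // combine the halves pairwise
//     }
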
1013 /// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`.
1014 ///
1015 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hadd_epi16)
1016 #[inline]
1017 #[target_feature(enable = "avx2")]
1018 #[cfg_attr(test, assert_instr(vphaddw))]
1019 #[stable(feature = "simd_x86", since = "1.27.0")]
1020 pub unsafe fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i {
1021 transmute(phaddw(a.as_i16x16(), b.as_i16x16()))
1022 }
1023
1024 /// Horizontally adds adjacent pairs of 32-bit integers in `a` and `b`.
1025 ///
1026 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hadd_epi32)
1027 #[inline]
1028 #[target_feature(enable = "avx2")]
1029 #[cfg_attr(test, assert_instr(vphaddd))]
1030 #[stable(feature = "simd_x86", since = "1.27.0")]
1031 pub unsafe fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i {
1032 transmute(phaddd(a.as_i32x8(), b.as_i32x8()))
1033 }
1034
1035 /// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`
1036 /// using saturation.
1037 ///
1038 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hadds_epi16)
1039 #[inline]
1040 #[target_feature(enable = "avx2")]
1041 #[cfg_attr(test, assert_instr(vphaddsw))]
1042 #[stable(feature = "simd_x86", since = "1.27.0")]
1043 pub unsafe fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i {
1044 transmute(phaddsw(a.as_i16x16(), b.as_i16x16()))
1045 }
1046
1047 /// Horizontally subtracts adjacent pairs of 16-bit integers in `a` and `b`.
1048 ///
1049 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hsub_epi16)
1050 #[inline]
1051 #[target_feature(enable = "avx2")]
1052 #[cfg_attr(test, assert_instr(vphsubw))]
1053 #[stable(feature = "simd_x86", since = "1.27.0")]
1054 pub unsafe fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i {
1055 transmute(phsubw(a.as_i16x16(), b.as_i16x16()))
1056 }
1057
1058 /// Horizontally subtracts adjacent pairs of 32-bit integers in `a` and `b`.
1059 ///
1060 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hsub_epi32)
1061 #[inline]
1062 #[target_feature(enable = "avx2")]
1063 #[cfg_attr(test, assert_instr(vphsubd))]
1064 #[stable(feature = "simd_x86", since = "1.27.0")]
1065 pub unsafe fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i {
1066 transmute(phsubd(a.as_i32x8(), b.as_i32x8()))
1067 }
1068
1069 /// Horizontally subtracts adjacent pairs of 16-bit integers in `a` and `b`
1070 /// using saturation.
1071 ///
1072 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hsubs_epi16)
1073 #[inline]
1074 #[target_feature(enable = "avx2")]
1075 #[cfg_attr(test, assert_instr(vphsubsw))]
1076 #[stable(feature = "simd_x86", since = "1.27.0")]
1077 pub unsafe fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i {
1078 transmute(phsubsw(a.as_i16x16(), b.as_i16x16()))
1079 }
1080
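// Illustrative sketch (not part of the upstream source): the horizontal operations work
// within each 128-bit lane and interleave results from `a` and `b`, so the output is not
// a straight concatenation. `hadd_demo` and the values are made up for the example.
//
//     use std::arch::x86_64::*;
//
//     #[target_feature(enable = "avx2")]
//     unsafe fn hadd_demo() {
//         let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
//         let b = _mm256_setr_epi32(8, 9, 10, 11, 12, 13, 14, 15);
//         // Low lane: [0+1, 2+3, 8+9, 10+11]; high lane: [4+5, 6+7, 12+13, 14+15].
//         let r = _mm256_hadd_epi32(a, b); // [1, 5, 17, 21, 9, 13, 25, 29]
//     }
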
1081 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1082 /// where
1083 /// `scale` should be 1, 2, 4 or 8.
1084 ///
1085 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32gather_epi32)
1086 #[inline]
1087 #[target_feature(enable = "avx2")]
1088 #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))]
1089 #[rustc_args_required_const(2)]
1090 #[stable(feature = "simd_x86", since = "1.27.0")]
1091 pub unsafe fn _mm_i32gather_epi32(slice: *const i32, offsets: __m128i, scale: i32) -> __m128i {
1092 let zero = _mm_setzero_si128().as_i32x4();
1093 let neg_one = _mm_set1_epi32(-1).as_i32x4();
1094 let offsets = offsets.as_i32x4();
1095 let slice = slice as *const i8;
1096 macro_rules! call {
1097 ($imm8:expr) => {
1098 pgatherdd(zero, slice, offsets, neg_one, $imm8)
1099 };
1100 }
1101 let r = constify_imm8!(scale, call);
1102 transmute(r)
1103 }
1104
1105 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1106 /// where
1107 /// `scale` should be 1, 2, 4 or 8. If an element's mask bit (its highest bit) is
1108 /// not set, the value from `src` in that position is returned instead.
1109 ///
1110 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32gather_epi32)
1111 #[inline]
1112 #[target_feature(enable = "avx2")]
1113 #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))]
1114 #[rustc_args_required_const(4)]
1115 #[stable(feature = "simd_x86", since = "1.27.0")]
1116 pub unsafe fn _mm_mask_i32gather_epi32(
1117 src: __m128i,
1118 slice: *const i32,
1119 offsets: __m128i,
1120 mask: __m128i,
1121 scale: i32,
1122 ) -> __m128i {
1123 let src = src.as_i32x4();
1124 let mask = mask.as_i32x4();
1125 let offsets = offsets.as_i32x4();
1126 let slice = slice as *const i8;
1127 macro_rules! call {
1128 ($imm8:expr) => {
1129 pgatherdd(src, slice, offsets, mask, $imm8)
1130 };
1131 }
1132 let r = constify_imm8!(scale, call);
1133 transmute(r)
1134 }
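// Illustrative sketch (not part of the upstream source): gathering four 32-bit values
// from arbitrary indices of a table. The scale is 4 because `offsets` holds element
// indices and each element is four bytes wide; `gather_demo` and the index values are
// made up, and every index must stay in bounds of `table`.
//
//     use std::arch::x86_64::*;
//
//     #[target_feature(enable = "avx2")]
//     unsafe fn gather_demo(table: &[i32]) -> __m128i {
//         let idx = _mm_setr_epi32(3, 0, 2, 1);
//         _mm_i32gather_epi32(table.as_ptr(), idx, 4)
//     }
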
1135
1136 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1137 /// where
1138 /// `scale` should be 1, 2, 4 or 8.
1139 ///
1140 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32gather_epi32)
1141 #[inline]
1142 #[target_feature(enable = "avx2")]
1143 #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))]
1144 #[rustc_args_required_const(2)]
1145 #[stable(feature = "simd_x86", since = "1.27.0")]
1146 pub unsafe fn _mm256_i32gather_epi32(slice: *const i32, offsets: __m256i, scale: i32) -> __m256i {
1147 let zero = _mm256_setzero_si256().as_i32x8();
1148 let neg_one = _mm256_set1_epi32(-1).as_i32x8();
1149 let offsets = offsets.as_i32x8();
1150 let slice = slice as *const i8;
1151 macro_rules! call {
1152 ($imm8:expr) => {
1153 vpgatherdd(zero, slice, offsets, neg_one, $imm8)
1154 };
1155 }
1156 let r = constify_imm8!(scale, call);
1157 transmute(r)
1158 }
1159
1160 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1161 /// where
1162 /// `scale` should be 1, 2, 4 or 8. If an element's mask bit (its highest bit) is
1163 /// not set, the value from `src` in that position is returned instead.
1164 ///
1165 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32gather_epi32)
1166 #[inline]
1167 #[target_feature(enable = "avx2")]
1168 #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))]
1169 #[rustc_args_required_const(4)]
1170 #[stable(feature = "simd_x86", since = "1.27.0")]
1171 pub unsafe fn _mm256_mask_i32gather_epi32(
1172 src: __m256i,
1173 slice: *const i32,
1174 offsets: __m256i,
1175 mask: __m256i,
1176 scale: i32,
1177 ) -> __m256i {
1178 let src = src.as_i32x8();
1179 let mask = mask.as_i32x8();
1180 let offsets = offsets.as_i32x8();
1181 let slice = slice as *const i8;
1182 macro_rules! call {
1183 ($imm8:expr) => {
1184 vpgatherdd(src, slice, offsets, mask, $imm8)
1185 };
1186 }
1187 let r = constify_imm8!(scale, call);
1188 transmute(r)
1189 }
1190
1191 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1192 /// where
1193 /// `scale` should be 1, 2, 4 or 8.
1194 ///
1195 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32gather_ps)
1196 #[inline]
1197 #[target_feature(enable = "avx2")]
1198 #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))]
1199 #[rustc_args_required_const(2)]
1200 #[stable(feature = "simd_x86", since = "1.27.0")]
1201 pub unsafe fn _mm_i32gather_ps(slice: *const f32, offsets: __m128i, scale: i32) -> __m128 {
1202 let zero = _mm_setzero_ps();
1203 let neg_one = _mm_set1_ps(-1.0);
1204 let offsets = offsets.as_i32x4();
1205 let slice = slice as *const i8;
1206 macro_rules! call {
1207 ($imm8:expr) => {
1208 pgatherdps(zero, slice, offsets, neg_one, $imm8)
1209 };
1210 }
1211 constify_imm8!(scale, call)
1212 }
1213
1214 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1215 /// where
1216 /// `scale` should be 1, 2, 4 or 8. If an element's mask bit (its highest bit) is
1217 /// not set, the value from `src` in that position is returned instead.
1218 ///
1219 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32gather_ps)
1220 #[inline]
1221 #[target_feature(enable = "avx2")]
1222 #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))]
1223 #[rustc_args_required_const(4)]
1224 #[stable(feature = "simd_x86", since = "1.27.0")]
1225 pub unsafe fn _mm_mask_i32gather_ps(
1226 src: __m128,
1227 slice: *const f32,
1228 offsets: __m128i,
1229 mask: __m128,
1230 scale: i32,
1231 ) -> __m128 {
1232 let offsets = offsets.as_i32x4();
1233 let slice = slice as *const i8;
1234 macro_rules! call {
1235 ($imm8:expr) => {
1236 pgatherdps(src, slice, offsets, mask, $imm8)
1237 };
1238 }
1239 constify_imm8!(scale, call)
1240 }
1241
1242 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1243 /// where
1244 /// `scale` should be 1, 2, 4 or 8.
1245 ///
1246 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32gather_ps)
1247 #[inline]
1248 #[target_feature(enable = "avx2")]
1249 #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))]
1250 #[rustc_args_required_const(2)]
1251 #[stable(feature = "simd_x86", since = "1.27.0")]
1252 pub unsafe fn _mm256_i32gather_ps(slice: *const f32, offsets: __m256i, scale: i32) -> __m256 {
1253 let zero = _mm256_setzero_ps();
1254 let neg_one = _mm256_set1_ps(-1.0);
1255 let offsets = offsets.as_i32x8();
1256 let slice = slice as *const i8;
1257 macro_rules! call {
1258 ($imm8:expr) => {
1259 vpgatherdps(zero, slice, offsets, neg_one, $imm8)
1260 };
1261 }
1262 constify_imm8!(scale, call)
1263 }
1264
1265 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1266 /// where
1267 /// `scale` should be 1, 2, 4 or 8. If an element's mask bit (its highest bit) is
1268 /// not set, the value from `src` in that position is returned instead.
1269 ///
1270 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32gather_ps)
1271 #[inline]
1272 #[target_feature(enable = "avx2")]
1273 #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))]
1274 #[rustc_args_required_const(4)]
1275 #[stable(feature = "simd_x86", since = "1.27.0")]
1276 pub unsafe fn _mm256_mask_i32gather_ps(
1277 src: __m256,
1278 slice: *const f32,
1279 offsets: __m256i,
1280 mask: __m256,
1281 scale: i32,
1282 ) -> __m256 {
1283 let offsets = offsets.as_i32x8();
1284 let slice = slice as *const i8;
1285 macro_rules! call {
1286 ($imm8:expr) => {
1287 vpgatherdps(src, slice, offsets, mask, $imm8)
1288 };
1289 }
1290 constify_imm8!(scale, call)
1291 }
1292
1293 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1294 /// where
1295 /// `scale` should be 1, 2, 4 or 8.
1296 ///
1297 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32gather_epi64)
1298 #[inline]
1299 #[target_feature(enable = "avx2")]
1300 #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))]
1301 #[rustc_args_required_const(2)]
1302 #[stable(feature = "simd_x86", since = "1.27.0")]
1303 pub unsafe fn _mm_i32gather_epi64(slice: *const i64, offsets: __m128i, scale: i32) -> __m128i {
1304 let zero = _mm_setzero_si128().as_i64x2();
1305 let neg_one = _mm_set1_epi64x(-1).as_i64x2();
1306 let offsets = offsets.as_i32x4();
1307 let slice = slice as *const i8;
1308 macro_rules! call {
1309 ($imm8:expr) => {
1310 pgatherdq(zero, slice, offsets, neg_one, $imm8)
1311 };
1312 }
1313 let r = constify_imm8!(scale, call);
1314 transmute(r)
1315 }
1316
1317 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1318 /// where
1319 /// `scale` should be 1, 2, 4 or 8. If an element's mask bit (its highest bit) is
1320 /// not set, the value from `src` in that position is returned instead.
1321 ///
1322 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32gather_epi64)
1323 #[inline]
1324 #[target_feature(enable = "avx2")]
1325 #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))]
1326 #[rustc_args_required_const(4)]
1327 #[stable(feature = "simd_x86", since = "1.27.0")]
1328 pub unsafe fn _mm_mask_i32gather_epi64(
1329 src: __m128i,
1330 slice: *const i64,
1331 offsets: __m128i,
1332 mask: __m128i,
1333 scale: i32,
1334 ) -> __m128i {
1335 let src = src.as_i64x2();
1336 let mask = mask.as_i64x2();
1337 let offsets = offsets.as_i32x4();
1338 let slice = slice as *const i8;
1339 macro_rules! call {
1340 ($imm8:expr) => {
1341 pgatherdq(src, slice, offsets, mask, $imm8)
1342 };
1343 }
1344 let r = constify_imm8!(scale, call);
1345 transmute(r)
1346 }
1347
1348 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1349 /// where
1350 /// `scale` should be 1, 2, 4 or 8.
1351 ///
1352 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32gather_epi64)
1353 #[inline]
1354 #[target_feature(enable = "avx2")]
1355 #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))]
1356 #[rustc_args_required_const(2)]
1357 #[stable(feature = "simd_x86", since = "1.27.0")]
1358 pub unsafe fn _mm256_i32gather_epi64(slice: *const i64, offsets: __m128i, scale: i32) -> __m256i {
1359 let zero = _mm256_setzero_si256().as_i64x4();
1360 let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
1361 let offsets = offsets.as_i32x4();
1362 let slice = slice as *const i8;
1363 macro_rules! call {
1364 ($imm8:expr) => {
1365 vpgatherdq(zero, slice, offsets, neg_one, $imm8)
1366 };
1367 }
1368 let r = constify_imm8!(scale, call);
1369 transmute(r)
1370 }
1371
1372 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1373 /// where
1374 /// `scale` should be 1, 2, 4 or 8. If an element's mask bit (its highest bit) is
1375 /// not set, the value from `src` in that position is returned instead.
1376 ///
1377 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32gather_epi64)
1378 #[inline]
1379 #[target_feature(enable = "avx2")]
1380 #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))]
1381 #[rustc_args_required_const(4)]
1382 #[stable(feature = "simd_x86", since = "1.27.0")]
1383 pub unsafe fn _mm256_mask_i32gather_epi64(
1384 src: __m256i,
1385 slice: *const i64,
1386 offsets: __m128i,
1387 mask: __m256i,
1388 scale: i32,
1389 ) -> __m256i {
1390 let src = src.as_i64x4();
1391 let mask = mask.as_i64x4();
1392 let offsets = offsets.as_i32x4();
1393 let slice = slice as *const i8;
1394 macro_rules! call {
1395 ($imm8:expr) => {
1396 vpgatherdq(src, slice, offsets, mask, $imm8)
1397 };
1398 }
1399 let r = constify_imm8!(scale, call);
1400 transmute(r)
1401 }
1402
1403 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1404 /// where
1405 /// `scale` should be 1, 2, 4 or 8.
1406 ///
1407 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32gather_pd)
1408 #[inline]
1409 #[target_feature(enable = "avx2")]
1410 #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))]
1411 #[rustc_args_required_const(2)]
1412 #[stable(feature = "simd_x86", since = "1.27.0")]
1413 pub unsafe fn _mm_i32gather_pd(slice: *const f64, offsets: __m128i, scale: i32) -> __m128d {
1414 let zero = _mm_setzero_pd();
1415 let neg_one = _mm_set1_pd(-1.0);
1416 let offsets = offsets.as_i32x4();
1417 let slice = slice as *const i8;
1418 macro_rules! call {
1419 ($imm8:expr) => {
1420 pgatherdpd(zero, slice, offsets, neg_one, $imm8)
1421 };
1422 }
1423 constify_imm8!(scale, call)
1424 }
1425
1426 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1427 /// where
1428 /// `scale` should be 1, 2, 4 or 8. If an element's mask bit (its highest bit) is
1429 /// not set, the value from `src` in that position is returned instead.
1430 ///
1431 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32gather_pd)
1432 #[inline]
1433 #[target_feature(enable = "avx2")]
1434 #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))]
1435 #[rustc_args_required_const(4)]
1436 #[stable(feature = "simd_x86", since = "1.27.0")]
1437 pub unsafe fn _mm_mask_i32gather_pd(
1438 src: __m128d,
1439 slice: *const f64,
1440 offsets: __m128i,
1441 mask: __m128d,
1442 scale: i32,
1443 ) -> __m128d {
1444 let offsets = offsets.as_i32x4();
1445 let slice = slice as *const i8;
1446 macro_rules! call {
1447 ($imm8:expr) => {
1448 pgatherdpd(src, slice, offsets, mask, $imm8)
1449 };
1450 }
1451 constify_imm8!(scale, call)
1452 }
1453
1454 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1455 /// where
1456 /// `scale` should be 1, 2, 4 or 8.
1457 ///
1458 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32gather_pd)
1459 #[inline]
1460 #[target_feature(enable = "avx2")]
1461 #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))]
1462 #[rustc_args_required_const(2)]
1463 #[stable(feature = "simd_x86", since = "1.27.0")]
1464 pub unsafe fn _mm256_i32gather_pd(slice: *const f64, offsets: __m128i, scale: i32) -> __m256d {
1465 let zero = _mm256_setzero_pd();
1466 let neg_one = _mm256_set1_pd(-1.0);
1467 let offsets = offsets.as_i32x4();
1468 let slice = slice as *const i8;
1469 macro_rules! call {
1470 ($imm8:expr) => {
1471 vpgatherdpd(zero, slice, offsets, neg_one, $imm8)
1472 };
1473 }
1474 constify_imm8!(scale, call)
1475 }
1476
1477 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1478 /// where
1479 /// `scale` should be 1, 2, 4 or 8. If an element's mask bit (its highest bit) is
1480 /// not set, the value from `src` in that position is returned instead.
1481 ///
1482 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32gather_pd)
1483 #[inline]
1484 #[target_feature(enable = "avx2")]
1485 #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))]
1486 #[rustc_args_required_const(4)]
1487 #[stable(feature = "simd_x86", since = "1.27.0")]
1488 pub unsafe fn _mm256_mask_i32gather_pd(
1489 src: __m256d,
1490 slice: *const f64,
1491 offsets: __m128i,
1492 mask: __m256d,
1493 scale: i32,
1494 ) -> __m256d {
1495 let offsets = offsets.as_i32x4();
1496 let slice = slice as *const i8;
1497 macro_rules! call {
1498 ($imm8:expr) => {
1499 vpgatherdpd(src, slice, offsets, mask, $imm8)
1500 };
1501 }
1502 constify_imm8!(scale, call)
1503 }
1504
1505 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1506 /// where
1507 /// `scale` should be 1, 2, 4 or 8.
1508 ///
1509 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64gather_epi32)
1510 #[inline]
1511 #[target_feature(enable = "avx2")]
1512 #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))]
1513 #[rustc_args_required_const(2)]
1514 #[stable(feature = "simd_x86", since = "1.27.0")]
1515 pub unsafe fn _mm_i64gather_epi32(slice: *const i32, offsets: __m128i, scale: i32) -> __m128i {
1516 let zero = _mm_setzero_si128().as_i32x4();
1517 let neg_one = _mm_set1_epi64x(-1).as_i32x4();
1518 let offsets = offsets.as_i64x2();
1519 let slice = slice as *const i8;
1520 macro_rules! call {
1521 ($imm8:expr) => {
1522 pgatherqd(zero, slice, offsets, neg_one, $imm8)
1523 };
1524 }
1525 let r = constify_imm8!(scale, call);
1526 transmute(r)
1527 }
1528
1529 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1530 /// where
1531 /// `scale` should be 1, 2, 4 or 8. If the highest bit is not set in the
1532 /// corresponding element of `mask`, that element is copied from `src` instead.
1533 ///
1534 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64gather_epi32)
1535 #[inline]
1536 #[target_feature(enable = "avx2")]
1537 #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))]
1538 #[rustc_args_required_const(4)]
1539 #[stable(feature = "simd_x86", since = "1.27.0")]
1540 pub unsafe fn _mm_mask_i64gather_epi32(
1541 src: __m128i,
1542 slice: *const i32,
1543 offsets: __m128i,
1544 mask: __m128i,
1545 scale: i32,
1546 ) -> __m128i {
1547 let src = src.as_i32x4();
1548 let mask = mask.as_i32x4();
1549 let offsets = offsets.as_i64x2();
1550 let slice = slice as *const i8;
1551 macro_rules! call {
1552 ($imm8:expr) => {
1553 pgatherqd(src, slice, offsets, mask, $imm8)
1554 };
1555 }
1556 let r = constify_imm8!(scale, call);
1557 transmute(r)
1558 }
1559
1560 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1561 /// where
1562 /// `scale` should be 1, 2, 4 or 8.
1563 ///
1564 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64gather_epi32)
1565 #[inline]
1566 #[target_feature(enable = "avx2")]
1567 #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))]
1568 #[rustc_args_required_const(2)]
1569 #[stable(feature = "simd_x86", since = "1.27.0")]
1570 pub unsafe fn _mm256_i64gather_epi32(slice: *const i32, offsets: __m256i, scale: i32) -> __m128i {
1571 let zero = _mm_setzero_si128().as_i32x4();
1572 let neg_one = _mm_set1_epi64x(-1).as_i32x4();
1573 let offsets = offsets.as_i64x4();
1574 let slice = slice as *const i8;
1575 macro_rules! call {
1576 ($imm8:expr) => {
1577 vpgatherqd(zero, slice, offsets, neg_one, $imm8)
1578 };
1579 }
1580 let r = constify_imm8!(scale, call);
1581 transmute(r)
1582 }
1583
1584 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1585 /// where
1586 /// `scale` should be 1, 2, 4 or 8. If the highest bit is not set in the
1587 /// corresponding element of `mask`, that element is copied from `src` instead.
1588 ///
1589 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64gather_epi32)
1590 #[inline]
1591 #[target_feature(enable = "avx2")]
1592 #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))]
1593 #[rustc_args_required_const(4)]
1594 #[stable(feature = "simd_x86", since = "1.27.0")]
1595 pub unsafe fn _mm256_mask_i64gather_epi32(
1596 src: __m128i,
1597 slice: *const i32,
1598 offsets: __m256i,
1599 mask: __m128i,
1600 scale: i32,
1601 ) -> __m128i {
1602 let src = src.as_i32x4();
1603 let mask = mask.as_i32x4();
1604 let offsets = offsets.as_i64x4();
1605 let slice = slice as *const i8;
1606 macro_rules! call {
1607 ($imm8:expr) => {
1608 vpgatherqd(src, slice, offsets, mask, $imm8)
1609 };
1610 }
1611 let r = constify_imm8!(scale, call);
1612 transmute(r)
1613 }
1614
1615 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1616 /// where
1617 /// `scale` should be 1, 2, 4 or 8.
1618 ///
1619 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64gather_ps)
1620 #[inline]
1621 #[target_feature(enable = "avx2")]
1622 #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))]
1623 #[rustc_args_required_const(2)]
1624 #[stable(feature = "simd_x86", since = "1.27.0")]
1625 pub unsafe fn _mm_i64gather_ps(slice: *const f32, offsets: __m128i, scale: i32) -> __m128 {
1626 let zero = _mm_setzero_ps();
1627 let neg_one = _mm_set1_ps(-1.0);
1628 let offsets = offsets.as_i64x2();
1629 let slice = slice as *const i8;
1630 macro_rules! call {
1631 ($imm8:expr) => {
1632 pgatherqps(zero, slice, offsets, neg_one, $imm8)
1633 };
1634 }
1635 constify_imm8!(scale, call)
1636 }
1637
1638 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1639 /// where
1640 /// `scale` should be 1, 2, 4 or 8. If the highest bit is not set in the
1641 /// corresponding element of `mask`, that element is copied from `src` instead.
1642 ///
1643 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64gather_ps)
1644 #[inline]
1645 #[target_feature(enable = "avx2")]
1646 #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))]
1647 #[rustc_args_required_const(4)]
1648 #[stable(feature = "simd_x86", since = "1.27.0")]
1649 pub unsafe fn _mm_mask_i64gather_ps(
1650 src: __m128,
1651 slice: *const f32,
1652 offsets: __m128i,
1653 mask: __m128,
1654 scale: i32,
1655 ) -> __m128 {
1656 let offsets = offsets.as_i64x2();
1657 let slice = slice as *const i8;
1658 macro_rules! call {
1659 ($imm8:expr) => {
1660 pgatherqps(src, slice, offsets, mask, $imm8)
1661 };
1662 }
1663 constify_imm8!(scale, call)
1664 }
1665
1666 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1667 /// where
1668 /// `scale` should be 1, 2, 4 or 8.
1669 ///
1670 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64gather_ps)
1671 #[inline]
1672 #[target_feature(enable = "avx2")]
1673 #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))]
1674 #[rustc_args_required_const(2)]
1675 #[stable(feature = "simd_x86", since = "1.27.0")]
1676 pub unsafe fn _mm256_i64gather_ps(slice: *const f32, offsets: __m256i, scale: i32) -> __m128 {
1677 let zero = _mm_setzero_ps();
1678 let neg_one = _mm_set1_ps(-1.0);
1679 let offsets = offsets.as_i64x4();
1680 let slice = slice as *const i8;
1681 macro_rules! call {
1682 ($imm8:expr) => {
1683 vpgatherqps(zero, slice, offsets, neg_one, $imm8)
1684 };
1685 }
1686 constify_imm8!(scale, call)
1687 }
1688
1689 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1690 /// where
1691 /// `scale` should be 1, 2, 4 or 8. If the highest bit is not set in the
1692 /// corresponding element of `mask`, that element is copied from `src` instead.
1693 ///
1694 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64gather_ps)
1695 #[inline]
1696 #[target_feature(enable = "avx2")]
1697 #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))]
1698 #[rustc_args_required_const(4)]
1699 #[stable(feature = "simd_x86", since = "1.27.0")]
1700 pub unsafe fn _mm256_mask_i64gather_ps(
1701 src: __m128,
1702 slice: *const f32,
1703 offsets: __m256i,
1704 mask: __m128,
1705 scale: i32,
1706 ) -> __m128 {
1707 let offsets = offsets.as_i64x4();
1708 let slice = slice as *const i8;
1709 macro_rules! call {
1710 ($imm8:expr) => {
1711 vpgatherqps(src, slice, offsets, mask, $imm8)
1712 };
1713 }
1714 constify_imm8!(scale, call)
1715 }
1716
1717 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1718 /// where
1719 /// `scale` should be 1, 2, 4 or 8.
1720 ///
1721 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64gather_epi64)
1722 #[inline]
1723 #[target_feature(enable = "avx2")]
1724 #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))]
1725 #[rustc_args_required_const(2)]
1726 #[stable(feature = "simd_x86", since = "1.27.0")]
1727 pub unsafe fn _mm_i64gather_epi64(slice: *const i64, offsets: __m128i, scale: i32) -> __m128i {
1728 let zero = _mm_setzero_si128().as_i64x2();
1729 let neg_one = _mm_set1_epi64x(-1).as_i64x2();
1730 let slice = slice as *const i8;
1731 let offsets = offsets.as_i64x2();
1732 macro_rules! call {
1733 ($imm8:expr) => {
1734 pgatherqq(zero, slice, offsets, neg_one, $imm8)
1735 };
1736 }
1737 let r = constify_imm8!(scale, call);
1738 transmute(r)
1739 }
1740
1741 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1742 /// where
1743 /// `scale` should be 1, 2, 4 or 8. If the highest bit is not set in the
1744 /// corresponding element of `mask`, that element is copied from `src` instead.
1745 ///
1746 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64gather_epi64)
1747 #[inline]
1748 #[target_feature(enable = "avx2")]
1749 #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))]
1750 #[rustc_args_required_const(4)]
1751 #[stable(feature = "simd_x86", since = "1.27.0")]
1752 pub unsafe fn _mm_mask_i64gather_epi64(
1753 src: __m128i,
1754 slice: *const i64,
1755 offsets: __m128i,
1756 mask: __m128i,
1757 scale: i32,
1758 ) -> __m128i {
1759 let src = src.as_i64x2();
1760 let mask = mask.as_i64x2();
1761 let offsets = offsets.as_i64x2();
1762 let slice = slice as *const i8;
1763 macro_rules! call {
1764 ($imm8:expr) => {
1765 pgatherqq(src, slice, offsets, mask, $imm8)
1766 };
1767 }
1768 let r = constify_imm8!(scale, call);
1769 transmute(r)
1770 }
1771
1772 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1773 /// where
1774 /// `scale` should be 1, 2, 4 or 8.
1775 ///
1776 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64gather_epi64)
1777 #[inline]
1778 #[target_feature(enable = "avx2")]
1779 #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))]
1780 #[rustc_args_required_const(2)]
1781 #[stable(feature = "simd_x86", since = "1.27.0")]
1782 pub unsafe fn _mm256_i64gather_epi64(slice: *const i64, offsets: __m256i, scale: i32) -> __m256i {
1783 let zero = _mm256_setzero_si256().as_i64x4();
1784 let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
1785 let slice = slice as *const i8;
1786 let offsets = offsets.as_i64x4();
1787 macro_rules! call {
1788 ($imm8:expr) => {
1789 vpgatherqq(zero, slice, offsets, neg_one, $imm8)
1790 };
1791 }
1792 let r = constify_imm8!(scale, call);
1793 transmute(r)
1794 }
1795
1796 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1797 /// where
1798 /// `scale` should be 1, 2, 4 or 8. If the highest bit is not set in the
1799 /// corresponding element of `mask`, that element is copied from `src` instead.
1800 ///
1801 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64gather_epi64)
1802 #[inline]
1803 #[target_feature(enable = "avx2")]
1804 #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))]
1805 #[rustc_args_required_const(4)]
1806 #[stable(feature = "simd_x86", since = "1.27.0")]
1807 pub unsafe fn _mm256_mask_i64gather_epi64(
1808 src: __m256i,
1809 slice: *const i64,
1810 offsets: __m256i,
1811 mask: __m256i,
1812 scale: i32,
1813 ) -> __m256i {
1814 let src = src.as_i64x4();
1815 let mask = mask.as_i64x4();
1816 let offsets = offsets.as_i64x4();
1817 let slice = slice as *const i8;
1818 macro_rules! call {
1819 ($imm8:expr) => {
1820 vpgatherqq(src, slice, offsets, mask, $imm8)
1821 };
1822 }
1823 let r = constify_imm8!(scale, call);
1824 transmute(r)
1825 }
1826
1827 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1828 /// where
1829 /// `scale` should be 1, 2, 4 or 8.
1830 ///
1831 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64gather_pd)
1832 #[inline]
1833 #[target_feature(enable = "avx2")]
1834 #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))]
1835 #[rustc_args_required_const(2)]
1836 #[stable(feature = "simd_x86", since = "1.27.0")]
1837 pub unsafe fn _mm_i64gather_pd(slice: *const f64, offsets: __m128i, scale: i32) -> __m128d {
1838 let zero = _mm_setzero_pd();
1839 let neg_one = _mm_set1_pd(-1.0);
1840 let slice = slice as *const i8;
1841 let offsets = offsets.as_i64x2();
1842 macro_rules! call {
1843 ($imm8:expr) => {
1844 pgatherqpd(zero, slice, offsets, neg_one, $imm8)
1845 };
1846 }
1847 constify_imm8!(scale, call)
1848 }
1849
1850 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1851 /// where
1852 /// `scale` should be 1, 2, 4 or 8. If the highest bit is not set in the
1853 /// corresponding element of `mask`, that element is copied from `src` instead.
1854 ///
1855 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64gather_pd)
1856 #[inline]
1857 #[target_feature(enable = "avx2")]
1858 #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))]
1859 #[rustc_args_required_const(4)]
1860 #[stable(feature = "simd_x86", since = "1.27.0")]
1861 pub unsafe fn _mm_mask_i64gather_pd(
1862 src: __m128d,
1863 slice: *const f64,
1864 offsets: __m128i,
1865 mask: __m128d,
1866 scale: i32,
1867 ) -> __m128d {
1868 let slice = slice as *const i8;
1869 let offsets = offsets.as_i64x2();
1870 macro_rules! call {
1871 ($imm8:expr) => {
1872 pgatherqpd(src, slice, offsets, mask, $imm8)
1873 };
1874 }
1875 constify_imm8!(scale, call)
1876 }
1877
1878 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1879 /// where
1880 /// `scale` should be 1, 2, 4 or 8.
1881 ///
1882 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64gather_pd)
1883 #[inline]
1884 #[target_feature(enable = "avx2")]
1885 #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))]
1886 #[rustc_args_required_const(2)]
1887 #[stable(feature = "simd_x86", since = "1.27.0")]
1888 pub unsafe fn _mm256_i64gather_pd(slice: *const f64, offsets: __m256i, scale: i32) -> __m256d {
1889 let zero = _mm256_setzero_pd();
1890 let neg_one = _mm256_set1_pd(-1.0);
1891 let slice = slice as *const i8;
1892 let offsets = offsets.as_i64x4();
1893 macro_rules! call {
1894 ($imm8:expr) => {
1895 vpgatherqpd(zero, slice, offsets, neg_one, $imm8)
1896 };
1897 }
1898 constify_imm8!(scale, call)
1899 }
1900
1901 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1902 /// where
1903 /// `scale` should be 1, 2, 4 or 8. If the highest bit is not set in the
1904 /// corresponding element of `mask`, that element is copied from `src` instead.
1905 ///
1906 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64gather_pd)
1907 #[inline]
1908 #[target_feature(enable = "avx2")]
1909 #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))]
1910 #[rustc_args_required_const(4)]
1911 #[stable(feature = "simd_x86", since = "1.27.0")]
1912 pub unsafe fn _mm256_mask_i64gather_pd(
1913 src: __m256d,
1914 slice: *const f64,
1915 offsets: __m256i,
1916 mask: __m256d,
1917 scale: i32,
1918 ) -> __m256d {
1919 let slice = slice as *const i8;
1920 let offsets = offsets.as_i64x4();
1921 macro_rules! call {
1922 ($imm8:expr) => {
1923 vpgatherqpd(src, slice, offsets, mask, $imm8)
1924 };
1925 }
1926 constify_imm8!(scale, call)
1927 }
1928
1929 /// Copies `a` to `dst`, then inserts 128 bits (of integer data) from `b` at the
1930 /// location specified by `imm8`.
1931 ///
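/// A short illustrative example (arbitrary values), assuming AVX2 is available:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_set1_epi64x(1);
/// let b = _mm_set1_epi64x(7);
/// // `imm8 = 1` replaces the upper 128-bit half of `a` with `b`.
/// let r = _mm256_inserti128_si256(a, b, 1);
/// let expected = _mm256_setr_epi64x(1, 1, 7, 7);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi64(r, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///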
1932 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_inserti128_si256)
1933 #[inline]
1934 #[target_feature(enable = "avx2")]
1935 #[cfg_attr(
1936 all(test, not(target_os = "windows")),
1937 assert_instr(vinsertf128, imm8 = 1)
1938 )]
1939 #[rustc_args_required_const(2)]
1940 #[stable(feature = "simd_x86", since = "1.27.0")]
1941 pub unsafe fn _mm256_inserti128_si256(a: __m256i, b: __m128i, imm8: i32) -> __m256i {
1942 let a = a.as_i64x4();
1943 let b = _mm256_castsi128_si256(b).as_i64x4();
1944 let dst: i64x4 = match imm8 & 0b01 {
1945 0 => simd_shuffle4(a, b, [4, 5, 2, 3]),
1946 _ => simd_shuffle4(a, b, [0, 1, 4, 5]),
1947 };
1948 transmute(dst)
1949 }
1950
1951 /// Multiplies packed signed 16-bit integers in `a` and `b`, producing
1952 /// intermediate signed 32-bit integers. Horizontally adds adjacent pairs
1953 /// of intermediate 32-bit integers.
1954 ///
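/// A small sketch of the pairwise multiply-add, with values chosen only for
/// illustration:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_set1_epi16(2);
/// let b = _mm256_setr_epi16(
///     1, 2, 3, 4, 5, 6, 7, 8,
///     9, 10, 11, 12, 13, 14, 15, 16,
/// );
/// // Each 32-bit result is 2 * b[2i] + 2 * b[2i + 1].
/// let r = _mm256_madd_epi16(a, b);
/// let expected = _mm256_setr_epi32(6, 14, 22, 30, 38, 46, 54, 62);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///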
1955 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_madd_epi16)
1956 #[inline]
1957 #[target_feature(enable = "avx2")]
1958 #[cfg_attr(test, assert_instr(vpmaddwd))]
1959 #[stable(feature = "simd_x86", since = "1.27.0")]
1960 pub unsafe fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i {
1961 transmute(pmaddwd(a.as_i16x16(), b.as_i16x16()))
1962 }
1963
1964 /// Vertically multiplies each unsigned 8-bit integer from `a` with the
1965 /// corresponding signed 8-bit integer from `b`, producing intermediate
1966 /// signed 16-bit integers. Horizontally adds adjacent pairs of intermediate
1967 /// signed 16-bit integers.
1968 ///
1969 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maddubs_epi16)
1970 #[inline]
1971 #[target_feature(enable = "avx2")]
1972 #[cfg_attr(test, assert_instr(vpmaddubsw))]
1973 #[stable(feature = "simd_x86", since = "1.27.0")]
1974 pub unsafe fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i {
1975 transmute(pmaddubsw(a.as_u8x32(), b.as_u8x32()))
1976 }
1977
1978 /// Loads packed 32-bit integers from memory pointed to by `mem_addr` using `mask`
1979 /// (elements are zeroed out when the highest bit is not set in the
1980 /// corresponding element).
1981 ///
1982 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskload_epi32)
1983 #[inline]
1984 #[target_feature(enable = "avx2")]
1985 #[cfg_attr(test, assert_instr(vpmaskmovd))]
1986 #[stable(feature = "simd_x86", since = "1.27.0")]
1987 pub unsafe fn _mm_maskload_epi32(mem_addr: *const i32, mask: __m128i) -> __m128i {
1988 transmute(maskloadd(mem_addr as *const i8, mask.as_i32x4()))
1989 }
1990
1991 /// Loads packed 32-bit integers from memory pointed to by `mem_addr` using `mask`
1992 /// (elements are zeroed out when the highest bit is not set in the
1993 /// corresponding element).
1994 ///
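/// An illustrative sketch (arbitrary mask and data), assuming AVX2 support:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let data = [1_i32, 2, 3, 4, 5, 6, 7, 8];
/// // Only elements whose mask lane has the highest bit set are loaded;
/// // the remaining lanes are zeroed.
/// let mask = _mm256_setr_epi32(-1, 0, -1, 0, -1, 0, -1, 0);
/// let r = _mm256_maskload_epi32(data.as_ptr(), mask);
/// let expected = _mm256_setr_epi32(1, 0, 3, 0, 5, 0, 7, 0);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///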
1995 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskload_epi32)
1996 #[inline]
1997 #[target_feature(enable = "avx2")]
1998 #[cfg_attr(test, assert_instr(vpmaskmovd))]
1999 #[stable(feature = "simd_x86", since = "1.27.0")]
2000 pub unsafe fn _mm256_maskload_epi32(mem_addr: *const i32, mask: __m256i) -> __m256i {
2001 transmute(maskloadd256(mem_addr as *const i8, mask.as_i32x8()))
2002 }
2003
2004 /// Loads packed 64-bit integers from memory pointed to by `mem_addr` using `mask`
2005 /// (elements are zeroed out when the highest bit is not set in the
2006 /// corresponding element).
2007 ///
2008 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskload_epi64)
2009 #[inline]
2010 #[target_feature(enable = "avx2")]
2011 #[cfg_attr(test, assert_instr(vpmaskmovq))]
2012 #[stable(feature = "simd_x86", since = "1.27.0")]
2013 pub unsafe fn _mm_maskload_epi64(mem_addr: *const i64, mask: __m128i) -> __m128i {
2014 transmute(maskloadq(mem_addr as *const i8, mask.as_i64x2()))
2015 }
2016
2017 /// Loads packed 64-bit integers from memory pointed to by `mem_addr` using `mask`
2018 /// (elements are zeroed out when the highest bit is not set in the
2019 /// corresponding element).
2020 ///
2021 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskload_epi64)
2022 #[inline]
2023 #[target_feature(enable = "avx2")]
2024 #[cfg_attr(test, assert_instr(vpmaskmovq))]
2025 #[stable(feature = "simd_x86", since = "1.27.0")]
2026 pub unsafe fn _mm256_maskload_epi64(mem_addr: *const i64, mask: __m256i) -> __m256i {
2027 transmute(maskloadq256(mem_addr as *const i8, mask.as_i64x4()))
2028 }
2029
2030 /// Stores packed 32-bit integers from `a` into memory pointed to by `mem_addr`
2031 /// using `mask` (elements are not stored when the highest bit is not set
2032 /// in the corresponding element).
2033 ///
2034 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskstore_epi32)
2035 #[inline]
2036 #[target_feature(enable = "avx2")]
2037 #[cfg_attr(test, assert_instr(vpmaskmovd))]
2038 #[stable(feature = "simd_x86", since = "1.27.0")]
2039 pub unsafe fn _mm_maskstore_epi32(mem_addr: *mut i32, mask: __m128i, a: __m128i) {
2040 maskstored(mem_addr as *mut i8, mask.as_i32x4(), a.as_i32x4())
2041 }
2042
2043 /// Stores packed 32-bit integers from `a` into memory pointed to by `mem_addr`
2044 /// using `mask` (elements are not stored when the highest bit is not set
2045 /// in the corresponding element).
2046 ///
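/// A small sketch of the masked store, with arbitrary example values:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let mut out = [0_i32; 8];
/// let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
/// // Only the even-indexed lanes are written; the others keep their old value.
/// let mask = _mm256_setr_epi32(-1, 0, -1, 0, -1, 0, -1, 0);
/// _mm256_maskstore_epi32(out.as_mut_ptr(), mask, a);
/// assert_eq!(out, [1, 0, 3, 0, 5, 0, 7, 0]);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///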
2047 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskstore_epi32)
2048 #[inline]
2049 #[target_feature(enable = "avx2")]
2050 #[cfg_attr(test, assert_instr(vpmaskmovd))]
2051 #[stable(feature = "simd_x86", since = "1.27.0")]
2052 pub unsafe fn _mm256_maskstore_epi32(mem_addr: *mut i32, mask: __m256i, a: __m256i) {
2053 maskstored256(mem_addr as *mut i8, mask.as_i32x8(), a.as_i32x8())
2054 }
2055
2056 /// Stores packed 64-bit integers from `a` into memory pointed to by `mem_addr`
2057 /// using `mask` (elements are not stored when the highest bit is not set
2058 /// in the corresponding element).
2059 ///
2060 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskstore_epi64)
2061 #[inline]
2062 #[target_feature(enable = "avx2")]
2063 #[cfg_attr(test, assert_instr(vpmaskmovq))]
2064 #[stable(feature = "simd_x86", since = "1.27.0")]
2065 pub unsafe fn _mm_maskstore_epi64(mem_addr: *mut i64, mask: __m128i, a: __m128i) {
2066 maskstoreq(mem_addr as *mut i8, mask.as_i64x2(), a.as_i64x2())
2067 }
2068
2069 /// Stores packed 64-bit integers from `a` into memory pointed to by `mem_addr`
2070 /// using `mask` (elements are not stored when the highest bit is not set
2071 /// in the corresponding element).
2072 ///
2073 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskstore_epi64)
2074 #[inline]
2075 #[target_feature(enable = "avx2")]
2076 #[cfg_attr(test, assert_instr(vpmaskmovq))]
2077 #[stable(feature = "simd_x86", since = "1.27.0")]
2078 pub unsafe fn _mm256_maskstore_epi64(mem_addr: *mut i64, mask: __m256i, a: __m256i) {
2079 maskstoreq256(mem_addr as *mut i8, mask.as_i64x4(), a.as_i64x4())
2080 }
2081
2082 /// Compares packed 16-bit integers in `a` and `b`, and returns the packed
2083 /// maximum values.
2084 ///
2085 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epi16)
2086 #[inline]
2087 #[target_feature(enable = "avx2")]
2088 #[cfg_attr(test, assert_instr(vpmaxsw))]
2089 #[stable(feature = "simd_x86", since = "1.27.0")]
2090 pub unsafe fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i {
2091 transmute(pmaxsw(a.as_i16x16(), b.as_i16x16()))
2092 }
2093
2094 /// Compares packed 32-bit integers in `a` and `b`, and returns the packed
2095 /// maximum values.
2096 ///
2097 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epi32)
2098 #[inline]
2099 #[target_feature(enable = "avx2")]
2100 #[cfg_attr(test, assert_instr(vpmaxsd))]
2101 #[stable(feature = "simd_x86", since = "1.27.0")]
2102 pub unsafe fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i {
2103 transmute(pmaxsd(a.as_i32x8(), b.as_i32x8()))
2104 }
2105
2106 /// Compares packed 8-bit integers in `a` and `b`, and returns the packed
2107 /// maximum values.
2108 ///
2109 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epi8)
2110 #[inline]
2111 #[target_feature(enable = "avx2")]
2112 #[cfg_attr(test, assert_instr(vpmaxsb))]
2113 #[stable(feature = "simd_x86", since = "1.27.0")]
2114 pub unsafe fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i {
2115 transmute(pmaxsb(a.as_i8x32(), b.as_i8x32()))
2116 }
2117
2118 /// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
2119 /// the packed maximum values.
2120 ///
2121 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epu16)
2122 #[inline]
2123 #[target_feature(enable = "avx2")]
2124 #[cfg_attr(test, assert_instr(vpmaxuw))]
2125 #[stable(feature = "simd_x86", since = "1.27.0")]
2126 pub unsafe fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i {
2127 transmute(pmaxuw(a.as_u16x16(), b.as_u16x16()))
2128 }
2129
2130 /// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
2131 /// the packed maximum values.
2132 ///
2133 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epu32)
2134 #[inline]
2135 #[target_feature(enable = "avx2")]
2136 #[cfg_attr(test, assert_instr(vpmaxud))]
2137 #[stable(feature = "simd_x86", since = "1.27.0")]
2138 pub unsafe fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i {
2139 transmute(pmaxud(a.as_u32x8(), b.as_u32x8()))
2140 }
2141
2142 /// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
2143 /// the packed maximum values.
2144 ///
2145 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epu8)
2146 #[inline]
2147 #[target_feature(enable = "avx2")]
2148 #[cfg_attr(test, assert_instr(vpmaxub))]
2149 #[stable(feature = "simd_x86", since = "1.27.0")]
2150 pub unsafe fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i {
2151 transmute(pmaxub(a.as_u8x32(), b.as_u8x32()))
2152 }
2153
2154 /// Compares packed 16-bit integers in `a` and `b`, and returns the packed
2155 /// minimum values.
2156 ///
2157 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epi16)
2158 #[inline]
2159 #[target_feature(enable = "avx2")]
2160 #[cfg_attr(test, assert_instr(vpminsw))]
2161 #[stable(feature = "simd_x86", since = "1.27.0")]
2162 pub unsafe fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i {
2163 transmute(pminsw(a.as_i16x16(), b.as_i16x16()))
2164 }
2165
2166 /// Compares packed 32-bit integers in `a` and `b`, and returns the packed
2167 /// minimum values.
2168 ///
2169 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epi32)
2170 #[inline]
2171 #[target_feature(enable = "avx2")]
2172 #[cfg_attr(test, assert_instr(vpminsd))]
2173 #[stable(feature = "simd_x86", since = "1.27.0")]
2174 pub unsafe fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i {
2175 transmute(pminsd(a.as_i32x8(), b.as_i32x8()))
2176 }
2177
2178 /// Compares packed 8-bit integers in `a` and `b`, and returns the packed
2179 /// minimum values.
2180 ///
2181 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epi8)
2182 #[inline]
2183 #[target_feature(enable = "avx2")]
2184 #[cfg_attr(test, assert_instr(vpminsb))]
2185 #[stable(feature = "simd_x86", since = "1.27.0")]
2186 pub unsafe fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i {
2187 transmute(pminsb(a.as_i8x32(), b.as_i8x32()))
2188 }
2189
2190 /// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
2191 /// the packed minimum values.
2192 ///
2193 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epu16)
2194 #[inline]
2195 #[target_feature(enable = "avx2")]
2196 #[cfg_attr(test, assert_instr(vpminuw))]
2197 #[stable(feature = "simd_x86", since = "1.27.0")]
2198 pub unsafe fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i {
2199 transmute(pminuw(a.as_u16x16(), b.as_u16x16()))
2200 }
2201
2202 /// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
2203 /// the packed minimum values.
2204 ///
2205 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epu32)
2206 #[inline]
2207 #[target_feature(enable = "avx2")]
2208 #[cfg_attr(test, assert_instr(vpminud))]
2209 #[stable(feature = "simd_x86", since = "1.27.0")]
2210 pub unsafe fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i {
2211 transmute(pminud(a.as_u32x8(), b.as_u32x8()))
2212 }
2213
2214 /// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
2215 /// the packed minimum values.
2216 ///
2217 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epu8)
2218 #[inline]
2219 #[target_feature(enable = "avx2")]
2220 #[cfg_attr(test, assert_instr(vpminub))]
2221 #[stable(feature = "simd_x86", since = "1.27.0")]
2222 pub unsafe fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i {
2223 transmute(pminub(a.as_u8x32(), b.as_u8x32()))
2224 }
2225
2226 /// Creates a mask from the most significant bit of each 8-bit element in `a`
2227 /// and returns the result.
2228 ///
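/// A brief illustration with arbitrary values:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// // Only byte 0 and byte 31 have their most significant bit set.
/// let a = _mm256_setr_epi8(
///     -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
///     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1,
/// );
/// let m = _mm256_movemask_epi8(a);
/// assert_eq!(m as u32, 0x8000_0001);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///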
2229 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_movemask_epi8)
2230 #[inline]
2231 #[target_feature(enable = "avx2")]
2232 #[cfg_attr(test, assert_instr(vpmovmskb))]
2233 #[stable(feature = "simd_x86", since = "1.27.0")]
2234 pub unsafe fn _mm256_movemask_epi8(a: __m256i) -> i32 {
2235 pmovmskb(a.as_i8x32())
2236 }
2237
2238 /// Computes the sum of absolute differences (SADs) of quadruplets of unsigned
2239 /// 8-bit integers in `a` compared to those in `b`, and stores the 16-bit
2240 /// results in dst. Eight SADs are performed for each 128-bit lane using one
2241 /// quadruplet from `b` and eight quadruplets from `a`. One quadruplet is
2242 /// selected from `b` starting at the offset specified in `imm8`. Eight
2243 /// quadruplets are formed from sequential 8-bit integers selected from `a`
2244 /// starting at the offset specified in `imm8`.
2245 ///
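/// A worked sketch with arbitrary inputs; with `b` all zeros and `imm8 = 0`,
/// each 16-bit result is simply the sum of four consecutive bytes of `a`
/// within its 128-bit lane:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_setr_epi8(
///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
///     16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
/// );
/// let b = _mm256_setzero_si256();
/// let r = _mm256_mpsadbw_epu8(a, b, 0);
/// // e.g. the first result is 0 + 1 + 2 + 3 = 6, the second 1 + 2 + 3 + 4 = 10, ...
/// let expected = _mm256_setr_epi16(
///     6, 10, 14, 18, 22, 26, 30, 34,
///     70, 74, 78, 82, 86, 90, 94, 98,
/// );
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi16(r, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///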
2246 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mpsadbw_epu8)
2247 #[inline]
2248 #[target_feature(enable = "avx2")]
2249 #[cfg_attr(test, assert_instr(vmpsadbw, imm8 = 0))]
2250 #[rustc_args_required_const(2)]
2251 #[stable(feature = "simd_x86", since = "1.27.0")]
2252 pub unsafe fn _mm256_mpsadbw_epu8(a: __m256i, b: __m256i, imm8: i32) -> __m256i {
2253 let a = a.as_u8x32();
2254 let b = b.as_u8x32();
2255 macro_rules! call {
2256 ($imm8:expr) => {
2257 mpsadbw(a, b, $imm8)
2258 };
2259 }
2260 let r = constify_imm8!(imm8, call);
2261 transmute(r)
2262 }
2263
2264 /// Multiplies the low 32-bit integers from each packed 64-bit element in
2265 /// `a` and `b`
2266 ///
2267 /// Returns the 64-bit results.
2268 ///
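/// A short sketch (arbitrary values); only the low 32 bits of each 64-bit
/// element take part in the multiplication:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_setr_epi32(1, 0, 2, 0, 3, 0, -4, 0);
/// let b = _mm256_setr_epi32(10, 0, 20, 0, 30, 0, 40, 0);
/// let r = _mm256_mul_epi32(a, b);
/// let expected = _mm256_setr_epi64x(10, 40, 90, -160);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi64(r, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///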
2269 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_epi32)
2270 #[inline]
2271 #[target_feature(enable = "avx2")]
2272 #[cfg_attr(test, assert_instr(vpmuldq))]
2273 #[stable(feature = "simd_x86", since = "1.27.0")]
2274 pub unsafe fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i {
2275 transmute(pmuldq(a.as_i32x8(), b.as_i32x8()))
2276 }
2277
2278 /// Multiplies the low unsigned 32-bit integers from each packed 64-bit
2279 /// element in `a` and `b`
2280 ///
2281 /// Returns the unsigned 64-bit results.
2282 ///
2283 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_epu32)
2284 #[inline]
2285 #[target_feature(enable = "avx2")]
2286 #[cfg_attr(test, assert_instr(vpmuludq))]
2287 #[stable(feature = "simd_x86", since = "1.27.0")]
2288 pub unsafe fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i {
2289 transmute(pmuludq(a.as_u32x8(), b.as_u32x8()))
2290 }
2291
2292 /// Multiplies the packed 16-bit integers in `a` and `b`, producing
2293 /// intermediate 32-bit integers and returning the high 16 bits of the
2294 /// intermediate integers.
2295 ///
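/// A quick sketch with arbitrary values:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// // 1000 * 1000 = 1_000_000 = 0x000F_4240; the high 16 bits are 0x000F = 15.
/// let a = _mm256_set1_epi16(1000);
/// let b = _mm256_set1_epi16(1000);
/// let r = _mm256_mulhi_epi16(a, b);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi16(r, _mm256_set1_epi16(15))), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///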
2296 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mulhi_epi16)
2297 #[inline]
2298 #[target_feature(enable = "avx2")]
2299 #[cfg_attr(test, assert_instr(vpmulhw))]
2300 #[stable(feature = "simd_x86", since = "1.27.0")]
2301 pub unsafe fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i {
2302 transmute(pmulhw(a.as_i16x16(), b.as_i16x16()))
2303 }
2304
2305 /// Multiplies the packed unsigned 16-bit integers in `a` and `b`, producing
2306 /// intermediate 32-bit integers and returning the high 16 bits of the
2307 /// intermediate integers.
2308 ///
2309 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mulhi_epu16)
2310 #[inline]
2311 #[target_feature(enable = "avx2")]
2312 #[cfg_attr(test, assert_instr(vpmulhuw))]
2313 #[stable(feature = "simd_x86", since = "1.27.0")]
2314 pub unsafe fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i {
2315 transmute(pmulhuw(a.as_u16x16(), b.as_u16x16()))
2316 }
2317
2318 /// Multiplies the packed 16-bit integers in `a` and `b`, producing
2319 /// intermediate 32-bit integers, and returns the low 16 bits of the
2320 /// intermediate integers
2321 ///
2322 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mullo_epi16)
2323 #[inline]
2324 #[target_feature(enable = "avx2")]
2325 #[cfg_attr(test, assert_instr(vpmullw))]
2326 #[stable(feature = "simd_x86", since = "1.27.0")]
2327 pub unsafe fn _mm256_mullo_epi16(a: __m256i, b: __m256i) -> __m256i {
2328 transmute(simd_mul(a.as_i16x16(), b.as_i16x16()))
2329 }
2330
2331 /// Multiplies the packed 32-bit integers in `a` and `b`, producing
2332 /// intermediate 64-bit integers, and returns the low 32 bits of the
2333 /// intermediate integers
2334 ///
2335 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mullo_epi32)
2336 #[inline]
2337 #[target_feature(enable = "avx2")]
2338 #[cfg_attr(test, assert_instr(vpmulld))]
2339 #[stable(feature = "simd_x86", since = "1.27.0")]
2340 pub unsafe fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i {
2341 transmute(simd_mul(a.as_i32x8(), b.as_i32x8()))
2342 }
2343
2344 /// Multiplies packed 16-bit integers in `a` and `b`, producing
2345 /// intermediate signed 32-bit integers. Truncates each intermediate
2346 /// integer to the 18 most significant bits, rounds by adding 1, and
2347 /// returns bits `[16:1]`.
2348 ///
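/// A small fixed-point sketch, interpreting the inputs as Q15 fractions
/// (values chosen only for illustration):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// // In Q15: 0x4000 = 0.5, 0x2000 = 0.25, and 0.5 * 0.25 = 0.125 = 0x1000.
/// let a = _mm256_set1_epi16(0x4000);
/// let b = _mm256_set1_epi16(0x2000);
/// let r = _mm256_mulhrs_epi16(a, b);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi16(r, _mm256_set1_epi16(0x1000))), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///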
2349 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mulhrs_epi16)
2350 #[inline]
2351 #[target_feature(enable = "avx2")]
2352 #[cfg_attr(test, assert_instr(vpmulhrsw))]
2353 #[stable(feature = "simd_x86", since = "1.27.0")]
2354 pub unsafe fn _mm256_mulhrs_epi16(a: __m256i, b: __m256i) -> __m256i {
2355 transmute(pmulhrsw(a.as_i16x16(), b.as_i16x16()))
2356 }
2357
2358 /// Computes the bitwise OR of 256 bits (representing integer data) in `a`
2359 /// and `b`
2360 ///
2361 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_or_si256)
2362 #[inline]
2363 #[target_feature(enable = "avx2")]
2364 #[cfg_attr(test, assert_instr(vorps))]
2365 #[stable(feature = "simd_x86", since = "1.27.0")]
2366 pub unsafe fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i {
2367 transmute(simd_or(a.as_i32x8(), b.as_i32x8()))
2368 }
2369
2370 /// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
2371 /// using signed saturation
2372 ///
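/// A saturation sketch with arbitrary values:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_set1_epi16(300); // saturates to 127 as an `i8`
/// let b = _mm256_set1_epi16(-300); // saturates to -128 as an `i8`
/// let r = _mm256_packs_epi16(a, b);
/// // Each 128-bit lane holds 8 saturated bytes from `a` followed by 8 from `b`.
/// let expected = _mm256_setr_epi8(
///     127, 127, 127, 127, 127, 127, 127, 127,
///     -128, -128, -128, -128, -128, -128, -128, -128,
///     127, 127, 127, 127, 127, 127, 127, 127,
///     -128, -128, -128, -128, -128, -128, -128, -128,
/// );
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///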
2373 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_packs_epi16)
2374 #[inline]
2375 #[target_feature(enable = "avx2")]
2376 #[cfg_attr(test, assert_instr(vpacksswb))]
2377 #[stable(feature = "simd_x86", since = "1.27.0")]
2378 pub unsafe fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i {
2379 transmute(packsswb(a.as_i16x16(), b.as_i16x16()))
2380 }
2381
2382 /// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
2383 /// using signed saturation
2384 ///
2385 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_packs_epi32)
2386 #[inline]
2387 #[target_feature(enable = "avx2")]
2388 #[cfg_attr(test, assert_instr(vpackssdw))]
2389 #[stable(feature = "simd_x86", since = "1.27.0")]
2390 pub unsafe fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i {
2391 transmute(packssdw(a.as_i32x8(), b.as_i32x8()))
2392 }
2393
2394 /// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
2395 /// using unsigned saturation
2396 ///
2397 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_packus_epi16)
2398 #[inline]
2399 #[target_feature(enable = "avx2")]
2400 #[cfg_attr(test, assert_instr(vpackuswb))]
2401 #[stable(feature = "simd_x86", since = "1.27.0")]
2402 pub unsafe fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i {
2403 transmute(packuswb(a.as_i16x16(), b.as_i16x16()))
2404 }
2405
2406 /// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
2407 /// using unsigned saturation
2408 ///
2409 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_packus_epi32)
2410 #[inline]
2411 #[target_feature(enable = "avx2")]
2412 #[cfg_attr(test, assert_instr(vpackusdw))]
2413 #[stable(feature = "simd_x86", since = "1.27.0")]
2414 pub unsafe fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i {
2415 transmute(packusdw(a.as_i32x8(), b.as_i32x8()))
2416 }
2417
2418 /// Permutes packed 32-bit integers from `a` according to the content of `b`.
2419 ///
2420 /// The lowest 3 bits of each integer of `b` are used as addresses into the 8
2421 /// integers of `a`.
2422 ///
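/// A reversal sketch with arbitrary values:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_setr_epi32(10, 11, 12, 13, 14, 15, 16, 17);
/// // Indices 7..0 reverse the eight 32-bit integers of `a`.
/// let idx = _mm256_setr_epi32(7, 6, 5, 4, 3, 2, 1, 0);
/// let r = _mm256_permutevar8x32_epi32(a, idx);
/// let expected = _mm256_setr_epi32(17, 16, 15, 14, 13, 12, 11, 10);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///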
2423 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutevar8x32_epi32)
2424 #[inline]
2425 #[target_feature(enable = "avx2")]
2426 #[cfg_attr(test, assert_instr(vpermps))]
2427 #[stable(feature = "simd_x86", since = "1.27.0")]
2428 pub unsafe fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i {
2429 transmute(permd(a.as_u32x8(), b.as_u32x8()))
2430 }
2431
2432 /// Permutes 64-bit integers from `a` using control mask `imm8`.
2433 ///
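/// A small sketch with an arbitrary control value; each 2-bit field of `imm8`
/// selects the source lane for one output lane:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_setr_epi64x(10, 11, 12, 13);
/// // 0b00_01_10_11 selects lanes 3, 2, 1, 0, i.e. it reverses `a`.
/// let r = _mm256_permute4x64_epi64(a, 0b00_01_10_11);
/// let expected = _mm256_setr_epi64x(13, 12, 11, 10);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi64(r, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///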
2434 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute4x64_epi64)
2435 #[inline]
2436 #[target_feature(enable = "avx2")]
2437 #[cfg_attr(test, assert_instr(vpermpd, imm8 = 9))]
2438 #[rustc_args_required_const(1)]
2439 #[stable(feature = "simd_x86", since = "1.27.0")]
2440 pub unsafe fn _mm256_permute4x64_epi64(a: __m256i, imm8: i32) -> __m256i {
2441 let imm8 = (imm8 & 0xFF) as u8;
2442 let zero = _mm256_setzero_si256().as_i64x4();
2443 let a = a.as_i64x4();
2444 macro_rules! permute4 {
2445 ($a:expr, $b:expr, $c:expr, $d:expr) => {
2446 simd_shuffle4(a, zero, [$a, $b, $c, $d]);
2447 };
2448 }
2449 macro_rules! permute3 {
2450 ($a:expr, $b:expr, $c:expr) => {
2451 match (imm8 >> 6) & 0b11 {
2452 0b00 => permute4!($a, $b, $c, 0),
2453 0b01 => permute4!($a, $b, $c, 1),
2454 0b10 => permute4!($a, $b, $c, 2),
2455 _ => permute4!($a, $b, $c, 3),
2456 }
2457 };
2458 }
2459 macro_rules! permute2 {
2460 ($a:expr, $b:expr) => {
2461 match (imm8 >> 4) & 0b11 {
2462 0b00 => permute3!($a, $b, 0),
2463 0b01 => permute3!($a, $b, 1),
2464 0b10 => permute3!($a, $b, 2),
2465 _ => permute3!($a, $b, 3),
2466 }
2467 };
2468 }
2469 macro_rules! permute1 {
2470 ($a:expr) => {
2471 match (imm8 >> 2) & 0b11 {
2472 0b00 => permute2!($a, 0),
2473 0b01 => permute2!($a, 1),
2474 0b10 => permute2!($a, 2),
2475 _ => permute2!($a, 3),
2476 }
2477 };
2478 }
2479 let r: i64x4 = match imm8 & 0b11 {
2480 0b00 => permute1!(0),
2481 0b01 => permute1!(1),
2482 0b10 => permute1!(2),
2483 _ => permute1!(3),
2484 };
2485 transmute(r)
2486 }
2487
2488 /// Shuffles 128 bits of integer data selected by `imm8` from `a` and `b`.
2489 ///
2490 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute2x128_si256)
2491 #[inline]
2492 #[target_feature(enable = "avx2")]
2493 #[cfg_attr(test, assert_instr(vperm2f128, imm8 = 9))]
2494 #[rustc_args_required_const(2)]
2495 #[stable(feature = "simd_x86", since = "1.27.0")]
2496 pub unsafe fn _mm256_permute2x128_si256(a: __m256i, b: __m256i, imm8: i32) -> __m256i {
2497 let a = a.as_i64x4();
2498 let b = b.as_i64x4();
2499 macro_rules! call {
2500 ($imm8:expr) => {
2501 vperm2i128(a, b, $imm8)
2502 };
2503 }
2504 transmute(constify_imm8!(imm8, call))
2505 }
2506
2507 /// Shuffles 64-bit floating-point elements in `a` across lanes using the
2508 /// control in `imm8`.
2509 ///
2510 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute4x64_pd)
2511 #[inline]
2512 #[target_feature(enable = "avx2")]
2513 #[cfg_attr(test, assert_instr(vpermpd, imm8 = 1))]
2514 #[rustc_args_required_const(1)]
2515 #[stable(feature = "simd_x86", since = "1.27.0")]
2516 pub unsafe fn _mm256_permute4x64_pd(a: __m256d, imm8: i32) -> __m256d {
2517 let imm8 = (imm8 & 0xFF) as u8;
2518 let undef = _mm256_undefined_pd();
2519 macro_rules! shuffle_done {
2520 ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
2521 simd_shuffle4(a, undef, [$x01, $x23, $x45, $x67])
2522 };
2523 }
2524 macro_rules! shuffle_x67 {
2525 ($x01:expr, $x23:expr, $x45:expr) => {
2526 match (imm8 >> 6) & 0b11 {
2527 0b00 => shuffle_done!($x01, $x23, $x45, 0),
2528 0b01 => shuffle_done!($x01, $x23, $x45, 1),
2529 0b10 => shuffle_done!($x01, $x23, $x45, 2),
2530 _ => shuffle_done!($x01, $x23, $x45, 3),
2531 }
2532 };
2533 }
2534 macro_rules! shuffle_x45 {
2535 ($x01:expr, $x23:expr) => {
2536 match (imm8 >> 4) & 0b11 {
2537 0b00 => shuffle_x67!($x01, $x23, 0),
2538 0b01 => shuffle_x67!($x01, $x23, 1),
2539 0b10 => shuffle_x67!($x01, $x23, 2),
2540 _ => shuffle_x67!($x01, $x23, 3),
2541 }
2542 };
2543 }
2544 macro_rules! shuffle_x23 {
2545 ($x01:expr) => {
2546 match (imm8 >> 2) & 0b11 {
2547 0b00 => shuffle_x45!($x01, 0),
2548 0b01 => shuffle_x45!($x01, 1),
2549 0b10 => shuffle_x45!($x01, 2),
2550 _ => shuffle_x45!($x01, 3),
2551 }
2552 };
2553 }
2554 match imm8 & 0b11 {
2555 0b00 => shuffle_x23!(0),
2556 0b01 => shuffle_x23!(1),
2557 0b10 => shuffle_x23!(2),
2558 _ => shuffle_x23!(3),
2559 }
2560 }
2561
2562 /// Shuffles eight 32-bit floating-point elements in `a` across lanes using
2563 /// the corresponding 32-bit integer index in `idx`.
2564 ///
2565 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutevar8x32_ps)
2566 #[inline]
2567 #[target_feature(enable = "avx2")]
2568 #[cfg_attr(test, assert_instr(vpermps))]
2569 #[stable(feature = "simd_x86", since = "1.27.0")]
2570 pub unsafe fn _mm256_permutevar8x32_ps(a: __m256, idx: __m256i) -> __m256 {
2571 permps(a, idx.as_i32x8())
2572 }
2573
2574 /// Computes the absolute differences of packed unsigned 8-bit integers in `a`
2575 /// and `b`, then horizontally sums each consecutive 8 differences to
2576 /// produce four unsigned 16-bit integers, and packs these unsigned 16-bit
2577 /// integers in the low 16 bits of the four 64-bit elements of the return value.
2578 ///
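/// A short sketch with arbitrary values:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_set1_epi8(3);
/// let b = _mm256_set1_epi8(1);
/// // Every group of 8 bytes contributes 8 * |3 - 1| = 16.
/// let r = _mm256_sad_epu8(a, b);
/// let expected = _mm256_setr_epi64x(16, 16, 16, 16);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi64(r, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///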
2579 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sad_epu8)
2580 #[inline]
2581 #[target_feature(enable = "avx2")]
2582 #[cfg_attr(test, assert_instr(vpsadbw))]
2583 #[stable(feature = "simd_x86", since = "1.27.0")]
2584 pub unsafe fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i {
2585 transmute(psadbw(a.as_u8x32(), b.as_u8x32()))
2586 }
2587
2588 /// Shuffles bytes from `a` according to the content of `b`.
2589 ///
2590 /// The lowest 4 bits of each byte of `b` are used as addresses into the 32 bytes
2591 /// of `a`.
2592 ///
2593 /// In addition, if the most significant bit of a byte of `b` is set, the
2594 /// respective destination byte is set to 0.
2595 ///
2596 /// The low and high halves of the vectors are shuffled separately.
2597 ///
2598 /// Picturing `a` and `b` as `[u8; 32]`, `_mm256_shuffle_epi8` is logically
2599 /// equivalent to:
2600 ///
2601 /// ```
2602 /// fn mm256_shuffle_epi8(a: [u8; 32], b: [u8; 32]) -> [u8; 32] {
2603 /// let mut r = [0; 32];
2604 /// for i in 0..16 {
2605 /// // if the most significant bit of b is set,
2606 /// // then the destination byte is set to 0.
2607 /// if b[i] & 0x80 == 0u8 {
2608 /// r[i] = a[(b[i] % 16) as usize];
2609 /// }
2610 /// if b[i + 16] & 0x80 == 0u8 {
2611 /// r[i + 16] = a[(b[i + 16] % 16 + 16) as usize];
2612 /// }
2613 /// }
2614 /// r
2615 /// }
2616 /// ```
2617 ///
2618 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_epi8)
2619 #[inline]
2620 #[target_feature(enable = "avx2")]
2621 #[cfg_attr(test, assert_instr(vpshufb))]
2622 #[stable(feature = "simd_x86", since = "1.27.0")]
2623 pub unsafe fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i {
2624 transmute(pshufb(a.as_u8x32(), b.as_u8x32()))
2625 }
2626
2627 /// Shuffles 32-bit integers in 128-bit lanes of `a` using the control in
2628 /// `imm8`.
2629 ///
2630 /// ```rust
2631 /// #[cfg(target_arch = "x86")]
2632 /// use std::arch::x86::*;
2633 /// #[cfg(target_arch = "x86_64")]
2634 /// use std::arch::x86_64::*;
2635 ///
2636 /// # fn main() {
2637 /// # if is_x86_feature_detected!("avx2") {
2638 /// # #[target_feature(enable = "avx2")]
2639 /// # unsafe fn worker() {
2640 /// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
2641 ///
2642 /// let c1 = _mm256_shuffle_epi32(a, 0b00_11_10_01);
2643 /// let c2 = _mm256_shuffle_epi32(a, 0b01_00_10_11);
2644 ///
2645 /// let expected1 = _mm256_setr_epi32(1, 2, 3, 0, 5, 6, 7, 4);
2646 /// let expected2 = _mm256_setr_epi32(3, 2, 0, 1, 7, 6, 4, 5);
2647 ///
2648 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c1, expected1)), !0);
2649 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c2, expected2)), !0);
2650 /// # }
2651 /// # unsafe { worker(); }
2652 /// # }
2653 /// # }
2654 /// ```
2655 ///
2656 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_epi32)
2657 #[inline]
2658 #[target_feature(enable = "avx2")]
2659 #[cfg_attr(test, assert_instr(vpermilps, imm8 = 9))]
2660 #[rustc_args_required_const(1)]
2661 #[stable(feature = "simd_x86", since = "1.27.0")]
2662 pub unsafe fn _mm256_shuffle_epi32(a: __m256i, imm8: i32) -> __m256i {
2663 // simd_shuffleX requires that its selector parameter be made up of
2664 // constant values, but we can't enforce that here. In spirit, we need
2665 // to write a `match` on all possible values of a byte, and for each value,
2666 // hard-code the correct `simd_shuffleX` call using only constants. We
2667 // then hope for LLVM to do the rest.
2668 //
2669 // Of course, that's... awful. So we try to use macros to do it for us.
2670 let imm8 = (imm8 & 0xFF) as u8;
2671
2672 let a = a.as_i32x8();
2673 macro_rules! shuffle_done {
2674 ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
2675 simd_shuffle8(
2676 a,
2677 a,
2678 [
2679 $x01,
2680 $x23,
2681 $x45,
2682 $x67,
2683 4 + $x01,
2684 4 + $x23,
2685 4 + $x45,
2686 4 + $x67,
2687 ],
2688 )
2689 };
2690 }
2691 macro_rules! shuffle_x67 {
2692 ($x01:expr, $x23:expr, $x45:expr) => {
2693 match (imm8 >> 6) & 0b11 {
2694 0b00 => shuffle_done!($x01, $x23, $x45, 0),
2695 0b01 => shuffle_done!($x01, $x23, $x45, 1),
2696 0b10 => shuffle_done!($x01, $x23, $x45, 2),
2697 _ => shuffle_done!($x01, $x23, $x45, 3),
2698 }
2699 };
2700 }
2701 macro_rules! shuffle_x45 {
2702 ($x01:expr, $x23:expr) => {
2703 match (imm8 >> 4) & 0b11 {
2704 0b00 => shuffle_x67!($x01, $x23, 0),
2705 0b01 => shuffle_x67!($x01, $x23, 1),
2706 0b10 => shuffle_x67!($x01, $x23, 2),
2707 _ => shuffle_x67!($x01, $x23, 3),
2708 }
2709 };
2710 }
2711 macro_rules! shuffle_x23 {
2712 ($x01:expr) => {
2713 match (imm8 >> 2) & 0b11 {
2714 0b00 => shuffle_x45!($x01, 0),
2715 0b01 => shuffle_x45!($x01, 1),
2716 0b10 => shuffle_x45!($x01, 2),
2717 _ => shuffle_x45!($x01, 3),
2718 }
2719 };
2720 }
2721 let r: i32x8 = match imm8 & 0b11 {
2722 0b00 => shuffle_x23!(0),
2723 0b01 => shuffle_x23!(1),
2724 0b10 => shuffle_x23!(2),
2725 _ => shuffle_x23!(3),
2726 };
2727 transmute(r)
2728 }
2729
2730 /// Shuffles 16-bit integers in the high 64 bits of 128-bit lanes of `a` using
2731 /// the control in `imm8`. The low 64 bits of 128-bit lanes of `a` are copied
2732 /// to the output.
2733 ///
2734 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shufflehi_epi16)
2735 #[inline]
2736 #[target_feature(enable = "avx2")]
2737 #[cfg_attr(test, assert_instr(vpshufhw, imm8 = 9))]
2738 #[rustc_args_required_const(1)]
2739 #[stable(feature = "simd_x86", since = "1.27.0")]
2740 pub unsafe fn _mm256_shufflehi_epi16(a: __m256i, imm8: i32) -> __m256i {
2741 let imm8 = (imm8 & 0xFF) as u8;
2742 let a = a.as_i16x16();
2743 macro_rules! shuffle_done {
2744 ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
2745 #[rustfmt::skip]
2746 simd_shuffle16(a, a, [
2747 0, 1, 2, 3, 4+$x01, 4+$x23, 4+$x45, 4+$x67,
2748 8, 9, 10, 11, 12+$x01, 12+$x23, 12+$x45, 12+$x67
2749 ]);
2750 };
2751 }
2752 macro_rules! shuffle_x67 {
2753 ($x01:expr, $x23:expr, $x45:expr) => {
2754 match (imm8 >> 6) & 0b11 {
2755 0b00 => shuffle_done!($x01, $x23, $x45, 0),
2756 0b01 => shuffle_done!($x01, $x23, $x45, 1),
2757 0b10 => shuffle_done!($x01, $x23, $x45, 2),
2758 _ => shuffle_done!($x01, $x23, $x45, 3),
2759 }
2760 };
2761 }
2762 macro_rules! shuffle_x45 {
2763 ($x01:expr, $x23:expr) => {
2764 match (imm8 >> 4) & 0b11 {
2765 0b00 => shuffle_x67!($x01, $x23, 0),
2766 0b01 => shuffle_x67!($x01, $x23, 1),
2767 0b10 => shuffle_x67!($x01, $x23, 2),
2768 _ => shuffle_x67!($x01, $x23, 3),
2769 }
2770 };
2771 }
2772 macro_rules! shuffle_x23 {
2773 ($x01:expr) => {
2774 match (imm8 >> 2) & 0b11 {
2775 0b00 => shuffle_x45!($x01, 0),
2776 0b01 => shuffle_x45!($x01, 1),
2777 0b10 => shuffle_x45!($x01, 2),
2778 _ => shuffle_x45!($x01, 3),
2779 }
2780 };
2781 }
2782 let r: i16x16 = match imm8 & 0b11 {
2783 0b00 => shuffle_x23!(0),
2784 0b01 => shuffle_x23!(1),
2785 0b10 => shuffle_x23!(2),
2786 _ => shuffle_x23!(3),
2787 };
2788 transmute(r)
2789 }
2790
2791 /// Shuffles 16-bit integers in the low 64 bits of 128-bit lanes of `a` using
2792 /// the control in `imm8`. The high 64 bits of 128-bit lanes of `a` are copied
2793 /// to the output.
2794 ///
2795 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shufflelo_epi16)
2796 #[inline]
2797 #[target_feature(enable = "avx2")]
2798 #[cfg_attr(test, assert_instr(vpshuflw, imm8 = 9))]
2799 #[rustc_args_required_const(1)]
2800 #[stable(feature = "simd_x86", since = "1.27.0")]
2801 pub unsafe fn _mm256_shufflelo_epi16(a: __m256i, imm8: i32) -> __m256i {
2802 let imm8 = (imm8 & 0xFF) as u8;
2803 let a = a.as_i16x16();
2804 macro_rules! shuffle_done {
2805 ($x01: expr, $x23: expr, $x45: expr, $x67: expr) => {
2806 #[rustfmt::skip]
2807 simd_shuffle16(a, a, [
2808 0+$x01, 0+$x23, 0+$x45, 0+$x67, 4, 5, 6, 7,
2809 8+$x01, 8+$x23, 8+$x45, 8+$x67, 12, 13, 14, 15,
2810 ]);
2811 };
2812 }
2813 macro_rules! shuffle_x67 {
2814 ($x01:expr, $x23:expr, $x45:expr) => {
2815 match (imm8 >> 6) & 0b11 {
2816 0b00 => shuffle_done!($x01, $x23, $x45, 0),
2817 0b01 => shuffle_done!($x01, $x23, $x45, 1),
2818 0b10 => shuffle_done!($x01, $x23, $x45, 2),
2819 _ => shuffle_done!($x01, $x23, $x45, 3),
2820 }
2821 };
2822 }
2823 macro_rules! shuffle_x45 {
2824 ($x01:expr, $x23:expr) => {
2825 match (imm8 >> 4) & 0b11 {
2826 0b00 => shuffle_x67!($x01, $x23, 0),
2827 0b01 => shuffle_x67!($x01, $x23, 1),
2828 0b10 => shuffle_x67!($x01, $x23, 2),
2829 _ => shuffle_x67!($x01, $x23, 3),
2830 }
2831 };
2832 }
2833 macro_rules! shuffle_x23 {
2834 ($x01:expr) => {
2835 match (imm8 >> 2) & 0b11 {
2836 0b00 => shuffle_x45!($x01, 0),
2837 0b01 => shuffle_x45!($x01, 1),
2838 0b10 => shuffle_x45!($x01, 2),
2839 _ => shuffle_x45!($x01, 3),
2840 }
2841 };
2842 }
2843 let r: i16x16 = match imm8 & 0b11 {
2844 0b00 => shuffle_x23!(0),
2845 0b01 => shuffle_x23!(1),
2846 0b10 => shuffle_x23!(2),
2847 _ => shuffle_x23!(3),
2848 };
2849 transmute(r)
2850 }
2851
2852 /// Negates packed 16-bit integers in `a` when the corresponding signed
2853 /// 16-bit integer in `b` is negative, and returns the results.
2854 /// Results are zeroed out when the corresponding element in `b` is zero.
2855 ///
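/// An illustrative sketch with arbitrary values:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_set1_epi16(5);
/// let b = _mm256_setr_epi16(
///     -1, 0, 1, -2, 2, 0, -3, 3,
///     -1, 0, 1, -2, 2, 0, -3, 3,
/// );
/// // Negative `b` negates, zero `b` zeroes, positive `b` keeps the value.
/// let r = _mm256_sign_epi16(a, b);
/// let expected = _mm256_setr_epi16(
///     -5, 0, 5, -5, 5, 0, -5, 5,
///     -5, 0, 5, -5, 5, 0, -5, 5,
/// );
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi16(r, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///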
2856 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sign_epi16)
2857 #[inline]
2858 #[target_feature(enable = "avx2")]
2859 #[cfg_attr(test, assert_instr(vpsignw))]
2860 #[stable(feature = "simd_x86", since = "1.27.0")]
2861 pub unsafe fn _mm256_sign_epi16(a: __m256i, b: __m256i) -> __m256i {
2862 transmute(psignw(a.as_i16x16(), b.as_i16x16()))
2863 }
2864
2865 /// Negates packed 32-bit integers in `a` when the corresponding signed
2866 /// 32-bit integer in `b` is negative, and returns the results.
2867 /// Results are zeroed out when the corresponding element in `b` is zero.
2868 ///
2869 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sign_epi32)
2870 #[inline]
2871 #[target_feature(enable = "avx2")]
2872 #[cfg_attr(test, assert_instr(vpsignd))]
2873 #[stable(feature = "simd_x86", since = "1.27.0")]
2874 pub unsafe fn _mm256_sign_epi32(a: __m256i, b: __m256i) -> __m256i {
2875 transmute(psignd(a.as_i32x8(), b.as_i32x8()))
2876 }
2877
2878 /// Negates packed 8-bit integers in `a` when the corresponding signed
2879 /// 8-bit integer in `b` is negative, and returns the results.
2880 /// Results are zeroed out when the corresponding element in `b` is zero.
2881 ///
2882 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sign_epi8)
2883 #[inline]
2884 #[target_feature(enable = "avx2")]
2885 #[cfg_attr(test, assert_instr(vpsignb))]
2886 #[stable(feature = "simd_x86", since = "1.27.0")]
2887 pub unsafe fn _mm256_sign_epi8(a: __m256i, b: __m256i) -> __m256i {
2888 transmute(psignb(a.as_i8x32(), b.as_i8x32()))
2889 }
2890
2891 /// Shifts packed 16-bit integers in `a` left by `count` while
2892 /// shifting in zeros, and returns the result
2893 ///
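/// A brief sketch with an arbitrary shift count; the shift amount is taken from
/// the low 64 bits of `count`:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_set1_epi16(1);
/// let count = _mm_set_epi64x(0, 3);
/// let r = _mm256_sll_epi16(a, count);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi16(r, _mm256_set1_epi16(8))), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///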
2894 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sll_epi16)
2895 #[inline]
2896 #[target_feature(enable = "avx2")]
2897 #[cfg_attr(test, assert_instr(vpsllw))]
2898 #[stable(feature = "simd_x86", since = "1.27.0")]
2899 pub unsafe fn _mm256_sll_epi16(a: __m256i, count: __m128i) -> __m256i {
2900 transmute(psllw(a.as_i16x16(), count.as_i16x8()))
2901 }
2902
2903 /// Shifts packed 32-bit integers in `a` left by `count` while
2904 /// shifting in zeros, and returns the result
2905 ///
2906 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sll_epi32)
2907 #[inline]
2908 #[target_feature(enable = "avx2")]
2909 #[cfg_attr(test, assert_instr(vpslld))]
2910 #[stable(feature = "simd_x86", since = "1.27.0")]
2911 pub unsafe fn _mm256_sll_epi32(a: __m256i, count: __m128i) -> __m256i {
2912 transmute(pslld(a.as_i32x8(), count.as_i32x4()))
2913 }
2914
2915 /// Shifts packed 64-bit integers in `a` left by `count` while
2916 /// shifting in zeros, and returns the result
2917 ///
2918 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sll_epi64)
2919 #[inline]
2920 #[target_feature(enable = "avx2")]
2921 #[cfg_attr(test, assert_instr(vpsllq))]
2922 #[stable(feature = "simd_x86", since = "1.27.0")]
2923 pub unsafe fn _mm256_sll_epi64(a: __m256i, count: __m128i) -> __m256i {
2924 transmute(psllq(a.as_i64x4(), count.as_i64x2()))
2925 }
2926
2927 /// Shifts packed 16-bit integers in `a` left by `imm8` while
2928 /// shifting in zeros, and returns the results.
2929 ///
2930 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_slli_epi16)
2931 #[inline]
2932 #[target_feature(enable = "avx2")]
2933 #[cfg_attr(test, assert_instr(vpsllw))]
2934 #[stable(feature = "simd_x86", since = "1.27.0")]
2935 pub unsafe fn _mm256_slli_epi16(a: __m256i, imm8: i32) -> __m256i {
2936 transmute(pslliw(a.as_i16x16(), imm8))
2937 }
2938
2939 /// Shifts packed 32-bit integers in `a` left by `imm8` while
2940 /// shifting in zeros, and returns the results.
2941 ///
2942 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_slli_epi32)
2943 #[inline]
2944 #[target_feature(enable = "avx2")]
2945 #[cfg_attr(test, assert_instr(vpslld))]
2946 #[stable(feature = "simd_x86", since = "1.27.0")]
2947 pub unsafe fn _mm256_slli_epi32(a: __m256i, imm8: i32) -> __m256i {
2948 transmute(psllid(a.as_i32x8(), imm8))
2949 }
2950
2951 /// Shifts packed 64-bit integers in `a` left by `imm8` while
2952 /// shifting in zeros, and returns the results.
2953 ///
2954 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_slli_epi64)
2955 #[inline]
2956 #[target_feature(enable = "avx2")]
2957 #[cfg_attr(test, assert_instr(vpsllq))]
2958 #[stable(feature = "simd_x86", since = "1.27.0")]
2959 pub unsafe fn _mm256_slli_epi64(a: __m256i, imm8: i32) -> __m256i {
2960 transmute(pslliq(a.as_i64x4(), imm8))
2961 }
2962
2963 /// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros.
2964 ///
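/// A minimal usage sketch (illustrative only, not from Intel's docs; it uses
/// the `(a, imm8)` form declared in this module, an arbitrary 4-byte shift,
/// and assumes runtime AVX2 detection). Note that each 128-bit lane is
/// shifted independently:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
///
/// // Shift each 128-bit lane left by 4 bytes (one 32-bit element).
/// let r = _mm256_slli_si256(a, 4);
///
/// let expected = _mm256_setr_epi32(0, 1, 2, 3, 0, 5, 6, 7);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, expected)), !0);
///
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///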
2965 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_slli_si256)
2966 #[inline]
2967 #[target_feature(enable = "avx2")]
2968 #[cfg_attr(test, assert_instr(vpslldq, imm8 = 3))]
2969 #[rustc_args_required_const(1)]
2970 #[stable(feature = "simd_x86", since = "1.27.0")]
2971 pub unsafe fn _mm256_slli_si256(a: __m256i, imm8: i32) -> __m256i {
2972 let a = a.as_i64x4();
2973 macro_rules! call {
2974 ($imm8:expr) => {
2975 vpslldq(a, $imm8)
2976 };
2977 }
2978 transmute(constify_imm8!(imm8 * 8, call))
2979 }
2980
2981 /// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros.
2982 ///
2983 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bslli_epi128)
2984 #[inline]
2985 #[target_feature(enable = "avx2")]
2986 #[cfg_attr(test, assert_instr(vpslldq, imm8 = 3))]
2987 #[rustc_args_required_const(1)]
2988 #[stable(feature = "simd_x86", since = "1.27.0")]
2989 pub unsafe fn _mm256_bslli_epi128(a: __m256i, imm8: i32) -> __m256i {
2990 let a = a.as_i64x4();
2991 macro_rules! call {
2992 ($imm8:expr) => {
2993 vpslldq(a, $imm8)
2994 };
2995 }
2996 transmute(constify_imm8!(imm8 * 8, call))
2997 }
2998
2999 /// Shifts packed 32-bit integers in `a` left by the amount
3000 /// specified by the corresponding element in `count` while
3001 /// shifting in zeros, and returns the result.
3002 ///
3003 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sllv_epi32)
3004 #[inline]
3005 #[target_feature(enable = "avx2")]
3006 #[cfg_attr(test, assert_instr(vpsllvd))]
3007 #[stable(feature = "simd_x86", since = "1.27.0")]
3008 pub unsafe fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i {
3009 transmute(psllvd(a.as_i32x4(), count.as_i32x4()))
3010 }
3011
3012 /// Shifts packed 32-bit integers in `a` left by the amount
3013 /// specified by the corresponding element in `count` while
3014 /// shifting in zeros, and returns the result.
3015 ///
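/// A minimal usage sketch (illustrative only, not from Intel's docs; the shift
/// counts are arbitrary and runtime AVX2 detection is assumed):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// // Each element is shifted by the matching element of `count`.
/// let a = _mm256_set1_epi32(1);
/// let count = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
///
/// let r = _mm256_sllv_epi32(a, count);
///
/// let expected = _mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 128);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, expected)), !0);
///
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///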
3016 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sllv_epi32)
3017 #[inline]
3018 #[target_feature(enable = "avx2")]
3019 #[cfg_attr(test, assert_instr(vpsllvd))]
3020 #[stable(feature = "simd_x86", since = "1.27.0")]
3021 pub unsafe fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i {
3022 transmute(psllvd256(a.as_i32x8(), count.as_i32x8()))
3023 }
3024
3025 /// Shifts packed 64-bit integers in `a` left by the amount
3026 /// specified by the corresponding element in `count` while
3027 /// shifting in zeros, and returns the result.
3028 ///
3029 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sllv_epi64)
3030 #[inline]
3031 #[target_feature(enable = "avx2")]
3032 #[cfg_attr(test, assert_instr(vpsllvq))]
3033 #[stable(feature = "simd_x86", since = "1.27.0")]
3034 pub unsafe fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i {
3035 transmute(psllvq(a.as_i64x2(), count.as_i64x2()))
3036 }
3037
3038 /// Shifts packed 64-bit integers in `a` left by the amount
3039 /// specified by the corresponding element in `count` while
3040 /// shifting in zeros, and returns the result.
3041 ///
3042 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sllv_epi64)
3043 #[inline]
3044 #[target_feature(enable = "avx2")]
3045 #[cfg_attr(test, assert_instr(vpsllvq))]
3046 #[stable(feature = "simd_x86", since = "1.27.0")]
3047 pub unsafe fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i {
3048 transmute(psllvq256(a.as_i64x4(), count.as_i64x4()))
3049 }
3050
3051 /// Shifts packed 16-bit integers in `a` right by `count` while
3052 /// shifting in sign bits.
3053 ///
3054 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sra_epi16)
3055 #[inline]
3056 #[target_feature(enable = "avx2")]
3057 #[cfg_attr(test, assert_instr(vpsraw))]
3058 #[stable(feature = "simd_x86", since = "1.27.0")]
3059 pub unsafe fn _mm256_sra_epi16(a: __m256i, count: __m128i) -> __m256i {
3060 transmute(psraw(a.as_i16x16(), count.as_i16x8()))
3061 }
3062
3063 /// Shifts packed 32-bit integers in `a` right by `count` while
3064 /// shifting in sign bits.
3065 ///
3066 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sra_epi32)
3067 #[inline]
3068 #[target_feature(enable = "avx2")]
3069 #[cfg_attr(test, assert_instr(vpsrad))]
3070 #[stable(feature = "simd_x86", since = "1.27.0")]
3071 pub unsafe fn _mm256_sra_epi32(a: __m256i, count: __m128i) -> __m256i {
3072 transmute(psrad(a.as_i32x8(), count.as_i32x4()))
3073 }
3074
3075 /// Shifts packed 16-bit integers in `a` right by `imm8` while
3076 /// shifting in sign bits.
3077 ///
3078 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srai_epi16)
3079 #[inline]
3080 #[target_feature(enable = "avx2")]
3081 #[cfg_attr(test, assert_instr(vpsraw))]
3082 #[stable(feature = "simd_x86", since = "1.27.0")]
3083 pub unsafe fn _mm256_srai_epi16(a: __m256i, imm8: i32) -> __m256i {
3084 transmute(psraiw(a.as_i16x16(), imm8))
3085 }
3086
3087 /// Shifts packed 32-bit integers in `a` right by `imm8` while
3088 /// shifting in sign bits.
3089 ///
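/// A minimal usage sketch (illustrative only, not from Intel's docs; it uses
/// the `(a, imm8)` form declared in this module and assumes runtime AVX2
/// detection):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// // Arithmetic right shifts replicate the sign bit.
/// let a = _mm256_set1_epi32(-16);
///
/// let r = _mm256_srai_epi32(a, 2);
///
/// let expected = _mm256_set1_epi32(-4);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, expected)), !0);
///
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///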
3090 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srai_epi32)
3091 #[inline]
3092 #[target_feature(enable = "avx2")]
3093 #[cfg_attr(test, assert_instr(vpsrad))]
3094 #[stable(feature = "simd_x86", since = "1.27.0")]
3095 pub unsafe fn _mm256_srai_epi32(a: __m256i, imm8: i32) -> __m256i {
3096 transmute(psraid(a.as_i32x8(), imm8))
3097 }
3098
3099 /// Shifts packed 32-bit integers in `a` right by the amount specified by the
3100 /// corresponding element in `count` while shifting in sign bits.
3101 ///
3102 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srav_epi32)
3103 #[inline]
3104 #[target_feature(enable = "avx2")]
3105 #[cfg_attr(test, assert_instr(vpsravd))]
3106 #[stable(feature = "simd_x86", since = "1.27.0")]
3107 pub unsafe fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i {
3108 transmute(psravd(a.as_i32x4(), count.as_i32x4()))
3109 }
3110
3111 /// Shifts packed 32-bit integers in `a` right by the amount specified by the
3112 /// corresponding element in `count` while shifting in sign bits.
3113 ///
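/// A minimal usage sketch (illustrative only, not from Intel's docs; operand
/// values are arbitrary and runtime AVX2 detection is assumed):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_setr_epi32(-8, -8, -8, -8, 8, 8, 8, 8);
/// let count = _mm256_setr_epi32(0, 1, 2, 3, 0, 1, 2, 3);
///
/// let r = _mm256_srav_epi32(a, count);
///
/// // Sign bits are shifted in, so negative elements stay negative.
/// let expected = _mm256_setr_epi32(-8, -4, -2, -1, 8, 4, 2, 1);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, expected)), !0);
///
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///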
3114 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srav_epi32)
3115 #[inline]
3116 #[target_feature(enable = "avx2")]
3117 #[cfg_attr(test, assert_instr(vpsravd))]
3118 #[stable(feature = "simd_x86", since = "1.27.0")]
3119 pub unsafe fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i {
3120 transmute(psravd256(a.as_i32x8(), count.as_i32x8()))
3121 }
3122
3123 /// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros.
3124 ///
3125 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srli_si256)
3126 #[inline]
3127 #[target_feature(enable = "avx2")]
3128 #[cfg_attr(test, assert_instr(vpsrldq, imm8 = 3))]
3129 #[rustc_args_required_const(1)]
3130 #[stable(feature = "simd_x86", since = "1.27.0")]
3131 pub unsafe fn _mm256_srli_si256(a: __m256i, imm8: i32) -> __m256i {
3132 let a = a.as_i64x4();
3133 macro_rules! call {
3134 ($imm8:expr) => {
3135 vpsrldq(a, $imm8)
3136 };
3137 }
3138 transmute(constify_imm8!(imm8 * 8, call))
3139 }
3140
3141 /// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros.
3142 ///
3143 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bsrli_epi128)
3144 #[inline]
3145 #[target_feature(enable = "avx2")]
3146 #[cfg_attr(test, assert_instr(vpsrldq, imm8 = 3))]
3147 #[rustc_args_required_const(1)]
3148 #[stable(feature = "simd_x86", since = "1.27.0")]
3149 pub unsafe fn _mm256_bsrli_epi128(a: __m256i, imm8: i32) -> __m256i {
3150 let a = a.as_i64x4();
3151 macro_rules! call {
3152 ($imm8:expr) => {
3153 vpsrldq(a, $imm8)
3154 };
3155 }
3156 transmute(constify_imm8!(imm8 * 8, call))
3157 }
3158
3159 /// Shifts packed 16-bit integers in `a` right by `count` while shifting in
3160 /// zeros.
3161 ///
3162 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srl_epi16)
3163 #[inline]
3164 #[target_feature(enable = "avx2")]
3165 #[cfg_attr(test, assert_instr(vpsrlw))]
3166 #[stable(feature = "simd_x86", since = "1.27.0")]
3167 pub unsafe fn _mm256_srl_epi16(a: __m256i, count: __m128i) -> __m256i {
3168 transmute(psrlw(a.as_i16x16(), count.as_i16x8()))
3169 }
3170
3171 /// Shifts packed 32-bit integers in `a` right by `count` while shifting in
3172 /// zeros.
3173 ///
3174 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srl_epi32)
3175 #[inline]
3176 #[target_feature(enable = "avx2")]
3177 #[cfg_attr(test, assert_instr(vpsrld))]
3178 #[stable(feature = "simd_x86", since = "1.27.0")]
3179 pub unsafe fn _mm256_srl_epi32(a: __m256i, count: __m128i) -> __m256i {
3180 transmute(psrld(a.as_i32x8(), count.as_i32x4()))
3181 }
3182
3183 /// Shifts packed 64-bit integers in `a` right by `count` while shifting in
3184 /// zeros.
3185 ///
3186 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srl_epi64)
3187 #[inline]
3188 #[target_feature(enable = "avx2")]
3189 #[cfg_attr(test, assert_instr(vpsrlq))]
3190 #[stable(feature = "simd_x86", since = "1.27.0")]
3191 pub unsafe fn _mm256_srl_epi64(a: __m256i, count: __m128i) -> __m256i {
3192 transmute(psrlq(a.as_i64x4(), count.as_i64x2()))
3193 }
3194
3195 /// Shifts packed 16-bit integers in `a` right by `imm8` while shifting in
3196 /// zeros.
3197 ///
3198 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srli_epi16)
3199 #[inline]
3200 #[target_feature(enable = "avx2")]
3201 #[cfg_attr(test, assert_instr(vpsrlw))]
3202 #[stable(feature = "simd_x86", since = "1.27.0")]
3203 pub unsafe fn _mm256_srli_epi16(a: __m256i, imm8: i32) -> __m256i {
3204 transmute(psrliw(a.as_i16x16(), imm8))
3205 }
3206
3207 /// Shifts packed 32-bit integers in `a` right by `imm8` while shifting in
3208 /// zeros.
3209 ///
3210 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srli_epi32)
3211 #[inline]
3212 #[target_feature(enable = "avx2")]
3213 #[cfg_attr(test, assert_instr(vpsrld))]
3214 #[stable(feature = "simd_x86", since = "1.27.0")]
3215 pub unsafe fn _mm256_srli_epi32(a: __m256i, imm8: i32) -> __m256i {
3216 transmute(psrlid(a.as_i32x8(), imm8))
3217 }
3218
3219 /// Shifts packed 64-bit integers in `a` right by `imm8` while shifting in
3220 /// zeros.
3221 ///
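/// A minimal usage sketch (illustrative only, not from Intel's docs; it uses
/// the `(a, imm8)` form declared in this module and assumes runtime AVX2
/// detection):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// // A logical shift fills the vacated bits with zeros, even for
/// // negative inputs.
/// let a = _mm256_set1_epi64x(-1);
///
/// let r = _mm256_srli_epi64(a, 60);
///
/// let expected = _mm256_set1_epi64x(0xF);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi64(r, expected)), !0);
///
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///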
3222 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srli_epi64)
3223 #[inline]
3224 #[target_feature(enable = "avx2")]
3225 #[cfg_attr(test, assert_instr(vpsrlq))]
3226 #[stable(feature = "simd_x86", since = "1.27.0")]
3227 pub unsafe fn _mm256_srli_epi64(a: __m256i, imm8: i32) -> __m256i {
3228 transmute(psrliq(a.as_i64x4(), imm8))
3229 }
3230
3231 /// Shifts packed 32-bit integers in `a` right by the amount specified by
3232 /// the corresponding element in `count` while shifting in zeros.
3233 ///
3234 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srlv_epi32)
3235 #[inline]
3236 #[target_feature(enable = "avx2")]
3237 #[cfg_attr(test, assert_instr(vpsrlvd))]
3238 #[stable(feature = "simd_x86", since = "1.27.0")]
3239 pub unsafe fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i {
3240 transmute(psrlvd(a.as_i32x4(), count.as_i32x4()))
3241 }
3242
3243 /// Shifts packed 32-bit integers in `a` right by the amount specified by
3244 /// the corresponding element in `count` while shifting in zeros.
3245 ///
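/// A minimal usage sketch (illustrative only, not from Intel's docs; the shift
/// counts are arbitrary and runtime AVX2 detection is assumed):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_set1_epi32(256);
/// let count = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
///
/// let r = _mm256_srlv_epi32(a, count);
///
/// let expected = _mm256_setr_epi32(256, 128, 64, 32, 16, 8, 4, 2);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, expected)), !0);
///
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///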
3246 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srlv_epi32)
3247 #[inline]
3248 #[target_feature(enable = "avx2")]
3249 #[cfg_attr(test, assert_instr(vpsrlvd))]
3250 #[stable(feature = "simd_x86", since = "1.27.0")]
3251 pub unsafe fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i {
3252 transmute(psrlvd256(a.as_i32x8(), count.as_i32x8()))
3253 }
3254
3255 /// Shifts packed 64-bit integers in `a` right by the amount specified by
3256 /// the corresponding element in `count` while shifting in zeros.
3257 ///
3258 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srlv_epi64)
3259 #[inline]
3260 #[target_feature(enable = "avx2")]
3261 #[cfg_attr(test, assert_instr(vpsrlvq))]
3262 #[stable(feature = "simd_x86", since = "1.27.0")]
3263 pub unsafe fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i {
3264 transmute(psrlvq(a.as_i64x2(), count.as_i64x2()))
3265 }
3266
3267 /// Shifts packed 64-bit integers in `a` right by the amount specified by
3268 /// the corresponding element in `count` while shifting in zeros.
3269 ///
3270 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srlv_epi64)
3271 #[inline]
3272 #[target_feature(enable = "avx2")]
3273 #[cfg_attr(test, assert_instr(vpsrlvq))]
3274 #[stable(feature = "simd_x86", since = "1.27.0")]
3275 pub unsafe fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i {
3276 transmute(psrlvq256(a.as_i64x4(), count.as_i64x4()))
3277 }
3278
3279 // TODO _mm256_stream_load_si256 (__m256i const* mem_addr)
3280
3281 /// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`.
3282 ///
3283 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_epi16)
3284 #[inline]
3285 #[target_feature(enable = "avx2")]
3286 #[cfg_attr(test, assert_instr(vpsubw))]
3287 #[stable(feature = "simd_x86", since = "1.27.0")]
3288 pub unsafe fn _mm256_sub_epi16(a: __m256i, b: __m256i) -> __m256i {
3289 transmute(simd_sub(a.as_i16x16(), b.as_i16x16()))
3290 }
3291
3292 /// Subtracts packed 32-bit integers in `b` from packed 32-bit integers in `a`.
3293 ///
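/// A minimal usage sketch (illustrative only, not from Intel's docs; operand
/// values are arbitrary and runtime AVX2 detection is assumed):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_setr_epi32(10, 20, 30, 40, 50, 60, 70, 80);
/// let b = _mm256_set1_epi32(5);
///
/// let r = _mm256_sub_epi32(a, b);
///
/// let expected = _mm256_setr_epi32(5, 15, 25, 35, 45, 55, 65, 75);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, expected)), !0);
///
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///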
3294 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_epi32)
3295 #[inline]
3296 #[target_feature(enable = "avx2")]
3297 #[cfg_attr(test, assert_instr(vpsubd))]
3298 #[stable(feature = "simd_x86", since = "1.27.0")]
3299 pub unsafe fn _mm256_sub_epi32(a: __m256i, b: __m256i) -> __m256i {
3300 transmute(simd_sub(a.as_i32x8(), b.as_i32x8()))
3301 }
3302
3303 /// Subtracts packed 64-bit integers in `b` from packed 64-bit integers in `a`.
3304 ///
3305 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_epi64)
3306 #[inline]
3307 #[target_feature(enable = "avx2")]
3308 #[cfg_attr(test, assert_instr(vpsubq))]
3309 #[stable(feature = "simd_x86", since = "1.27.0")]
3310 pub unsafe fn _mm256_sub_epi64(a: __m256i, b: __m256i) -> __m256i {
3311 transmute(simd_sub(a.as_i64x4(), b.as_i64x4()))
3312 }
3313
3314 /// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`.
3315 ///
3316 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_epi8)
3317 #[inline]
3318 #[target_feature(enable = "avx2")]
3319 #[cfg_attr(test, assert_instr(vpsubb))]
3320 #[stable(feature = "simd_x86", since = "1.27.0")]
3321 pub unsafe fn _mm256_sub_epi8(a: __m256i, b: __m256i) -> __m256i {
3322 transmute(simd_sub(a.as_i8x32(), b.as_i8x32()))
3323 }
3324
3325 /// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in
3326 /// `a` using saturation.
3327 ///
3328 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_subs_epi16)
3329 #[inline]
3330 #[target_feature(enable = "avx2")]
3331 #[cfg_attr(test, assert_instr(vpsubsw))]
3332 #[stable(feature = "simd_x86", since = "1.27.0")]
3333 pub unsafe fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i {
3334 transmute(simd_saturating_sub(a.as_i16x16(), b.as_i16x16()))
3335 }
3336
3337 /// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in
3338 /// `a` using saturation.
3339 ///
3340 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_subs_epi8)
3341 #[inline]
3342 #[target_feature(enable = "avx2")]
3343 #[cfg_attr(test, assert_instr(vpsubsb))]
3344 #[stable(feature = "simd_x86", since = "1.27.0")]
3345 pub unsafe fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i {
3346 transmute(simd_saturating_sub(a.as_i8x32(), b.as_i8x32()))
3347 }
3348
3349 /// Subtracts packed unsigned 16-bit integers in `b` from packed unsigned
3350 /// 16-bit integers in `a` using saturation.
3351 ///
3352 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_subs_epu16)
3353 #[inline]
3354 #[target_feature(enable = "avx2")]
3355 #[cfg_attr(test, assert_instr(vpsubusw))]
3356 #[stable(feature = "simd_x86", since = "1.27.0")]
3357 pub unsafe fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> __m256i {
3358 transmute(simd_saturating_sub(a.as_u16x16(), b.as_u16x16()))
3359 }
3360
3361 /// Subtracts packed unsigned 8-bit integers in `b` from packed unsigned
3362 /// 8-bit integers in `a` using saturation.
3363 ///
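/// A minimal usage sketch (illustrative only, not from Intel's docs; operand
/// values are arbitrary and runtime AVX2 detection is assumed):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// // 10 - 20 would wrap around for an unsigned byte, so the result
/// // saturates to 0 instead.
/// let a = _mm256_set1_epi8(10);
/// let b = _mm256_set1_epi8(20);
///
/// let r = _mm256_subs_epu8(a, b);
///
/// let expected = _mm256_set1_epi8(0);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
///
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///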
3364 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_subs_epu8)
3365 #[inline]
3366 #[target_feature(enable = "avx2")]
3367 #[cfg_attr(test, assert_instr(vpsubusb))]
3368 #[stable(feature = "simd_x86", since = "1.27.0")]
3369 pub unsafe fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i {
3370 transmute(simd_saturating_sub(a.as_u8x32(), b.as_u8x32()))
3371 }
3372
3373 /// Unpacks and interleaves 8-bit integers from the high half of each
3374 /// 128-bit lane of `a` and `b`.
3375 ///
3376 /// ```rust
3377 /// #[cfg(target_arch = "x86")]
3378 /// use std::arch::x86::*;
3379 /// #[cfg(target_arch = "x86_64")]
3380 /// use std::arch::x86_64::*;
3381 ///
3382 /// # fn main() {
3383 /// # if is_x86_feature_detected!("avx2") {
3384 /// # #[target_feature(enable = "avx2")]
3385 /// # unsafe fn worker() {
3386 /// let a = _mm256_setr_epi8(
3387 /// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
3388 /// 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3389 /// );
3390 /// let b = _mm256_setr_epi8(
3391 /// 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3392 /// -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
3393 /// -30, -31,
3394 /// );
3395 ///
3396 /// let c = _mm256_unpackhi_epi8(a, b);
3397 ///
3398 /// let expected = _mm256_setr_epi8(
3399 /// 8, -8, 9, -9, 10, -10, 11, -11, 12, -12, 13, -13, 14, -14, 15, -15,
3400 /// 24, -24, 25, -25, 26, -26, 27, -27, 28, -28, 29, -29, 30, -30, 31,
3401 /// -31,
3402 /// );
3403 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3404 ///
3405 /// # }
3406 /// # unsafe { worker(); }
3407 /// # }
3408 /// # }
3409 /// ```
3410 ///
3411 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_epi8)
3412 #[inline]
3413 #[target_feature(enable = "avx2")]
3414 #[cfg_attr(test, assert_instr(vpunpckhbw))]
3415 #[stable(feature = "simd_x86", since = "1.27.0")]
3416 pub unsafe fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i {
3417 #[rustfmt::skip]
3418 let r: i8x32 = simd_shuffle32(a.as_i8x32(), b.as_i8x32(), [
3419 8, 40, 9, 41, 10, 42, 11, 43,
3420 12, 44, 13, 45, 14, 46, 15, 47,
3421 24, 56, 25, 57, 26, 58, 27, 59,
3422 28, 60, 29, 61, 30, 62, 31, 63,
3423 ]);
3424 transmute(r)
3425 }
3426
3427 /// Unpacks and interleaves 8-bit integers from the low half of each
3428 /// 128-bit lane of `a` and `b`.
3429 ///
3430 /// ```rust
3431 /// #[cfg(target_arch = "x86")]
3432 /// use std::arch::x86::*;
3433 /// #[cfg(target_arch = "x86_64")]
3434 /// use std::arch::x86_64::*;
3435 ///
3436 /// # fn main() {
3437 /// # if is_x86_feature_detected!("avx2") {
3438 /// # #[target_feature(enable = "avx2")]
3439 /// # unsafe fn worker() {
3440 /// let a = _mm256_setr_epi8(
3441 /// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
3442 /// 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3443 /// );
3444 /// let b = _mm256_setr_epi8(
3445 /// 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3446 /// -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
3447 /// -30, -31,
3448 /// );
3449 ///
3450 /// let c = _mm256_unpacklo_epi8(a, b);
3451 ///
3452 /// let expected = _mm256_setr_epi8(
3453 /// 0, 0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7, 16, -16, 17,
3454 /// -17, 18, -18, 19, -19, 20, -20, 21, -21, 22, -22, 23, -23,
3455 /// );
3456 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3457 ///
3458 /// # }
3459 /// # unsafe { worker(); }
3460 /// # }
3461 /// # }
3462 /// ```
3463 ///
3464 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_epi8)
3465 #[inline]
3466 #[target_feature(enable = "avx2")]
3467 #[cfg_attr(test, assert_instr(vpunpcklbw))]
3468 #[stable(feature = "simd_x86", since = "1.27.0")]
3469 pub unsafe fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i {
3470 #[rustfmt::skip]
3471 let r: i8x32 = simd_shuffle32(a.as_i8x32(), b.as_i8x32(), [
3472 0, 32, 1, 33, 2, 34, 3, 35,
3473 4, 36, 5, 37, 6, 38, 7, 39,
3474 16, 48, 17, 49, 18, 50, 19, 51,
3475 20, 52, 21, 53, 22, 54, 23, 55,
3476 ]);
3477 transmute(r)
3478 }
3479
3480 /// Unpacks and interleaves 16-bit integers from the high half of each
3481 /// 128-bit lane of `a` and `b`.
3482 ///
3483 /// ```rust
3484 /// #[cfg(target_arch = "x86")]
3485 /// use std::arch::x86::*;
3486 /// #[cfg(target_arch = "x86_64")]
3487 /// use std::arch::x86_64::*;
3488 ///
3489 /// # fn main() {
3490 /// # if is_x86_feature_detected!("avx2") {
3491 /// # #[target_feature(enable = "avx2")]
3492 /// # unsafe fn worker() {
3493 /// let a = _mm256_setr_epi16(
3494 /// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3495 /// );
3496 /// let b = _mm256_setr_epi16(
3497 /// 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3498 /// );
3499 ///
3500 /// let c = _mm256_unpackhi_epi16(a, b);
3501 ///
3502 /// let expected = _mm256_setr_epi16(
3503 /// 4, -4, 5, -5, 6, -6, 7, -7, 12, -12, 13, -13, 14, -14, 15, -15,
3504 /// );
3505 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3506 ///
3507 /// # }
3508 /// # unsafe { worker(); }
3509 /// # }
3510 /// # }
3511 /// ```
3512 ///
3513 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_epi16)
3514 #[inline]
3515 #[target_feature(enable = "avx2")]
3516 #[cfg_attr(test, assert_instr(vpunpckhwd))]
3517 #[stable(feature = "simd_x86", since = "1.27.0")]
3518 pub unsafe fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i {
3519 let r: i16x16 = simd_shuffle16(
3520 a.as_i16x16(),
3521 b.as_i16x16(),
3522 [4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31],
3523 );
3524 transmute(r)
3525 }
3526
3527 /// Unpacks and interleaves 16-bit integers from the low half of each
3528 /// 128-bit lane of `a` and `b`.
3529 ///
3530 /// ```rust
3531 /// #[cfg(target_arch = "x86")]
3532 /// use std::arch::x86::*;
3533 /// #[cfg(target_arch = "x86_64")]
3534 /// use std::arch::x86_64::*;
3535 ///
3536 /// # fn main() {
3537 /// # if is_x86_feature_detected!("avx2") {
3538 /// # #[target_feature(enable = "avx2")]
3539 /// # unsafe fn worker() {
3540 ///
3541 /// let a = _mm256_setr_epi16(
3542 /// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3543 /// );
3544 /// let b = _mm256_setr_epi16(
3545 /// 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3546 /// );
3547 ///
3548 /// let c = _mm256_unpacklo_epi16(a, b);
3549 ///
3550 /// let expected = _mm256_setr_epi16(
3551 /// 0, 0, 1, -1, 2, -2, 3, -3, 8, -8, 9, -9, 10, -10, 11, -11,
3552 /// );
3553 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3554 ///
3555 /// # }
3556 /// # unsafe { worker(); }
3557 /// # }
3558 /// # }
3559 /// ```
3560 ///
3561 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_epi16)
3562 #[inline]
3563 #[target_feature(enable = "avx2")]
3564 #[cfg_attr(test, assert_instr(vpunpcklwd))]
3565 #[stable(feature = "simd_x86", since = "1.27.0")]
3566 pub unsafe fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i {
3567 let r: i16x16 = simd_shuffle16(
3568 a.as_i16x16(),
3569 b.as_i16x16(),
3570 [0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27],
3571 );
3572 transmute(r)
3573 }
3574
3575 /// Unpacks and interleaves 32-bit integers from the high half of each
3576 /// 128-bit lane of `a` and `b`.
3577 ///
3578 /// ```rust
3579 /// #[cfg(target_arch = "x86")]
3580 /// use std::arch::x86::*;
3581 /// #[cfg(target_arch = "x86_64")]
3582 /// use std::arch::x86_64::*;
3583 ///
3584 /// # fn main() {
3585 /// # if is_x86_feature_detected!("avx2") {
3586 /// # #[target_feature(enable = "avx2")]
3587 /// # unsafe fn worker() {
3588 /// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
3589 /// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7);
3590 ///
3591 /// let c = _mm256_unpackhi_epi32(a, b);
3592 ///
3593 /// let expected = _mm256_setr_epi32(2, -2, 3, -3, 6, -6, 7, -7);
3594 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3595 ///
3596 /// # }
3597 /// # unsafe { worker(); }
3598 /// # }
3599 /// # }
3600 /// ```
3601 ///
3602 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_epi32)
3603 #[inline]
3604 #[target_feature(enable = "avx2")]
3605 #[cfg_attr(test, assert_instr(vunpckhps))]
3606 #[stable(feature = "simd_x86", since = "1.27.0")]
3607 pub unsafe fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i {
3608 let r: i32x8 = simd_shuffle8(a.as_i32x8(), b.as_i32x8(), [2, 10, 3, 11, 6, 14, 7, 15]);
3609 transmute(r)
3610 }
3611
3612 /// Unpacks and interleaves 32-bit integers from the low half of each
3613 /// 128-bit lane of `a` and `b`.
3614 ///
3615 /// ```rust
3616 /// #[cfg(target_arch = "x86")]
3617 /// use std::arch::x86::*;
3618 /// #[cfg(target_arch = "x86_64")]
3619 /// use std::arch::x86_64::*;
3620 ///
3621 /// # fn main() {
3622 /// # if is_x86_feature_detected!("avx2") {
3623 /// # #[target_feature(enable = "avx2")]
3624 /// # unsafe fn worker() {
3625 /// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
3626 /// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7);
3627 ///
3628 /// let c = _mm256_unpacklo_epi32(a, b);
3629 ///
3630 /// let expected = _mm256_setr_epi32(0, 0, 1, -1, 4, -4, 5, -5);
3631 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3632 ///
3633 /// # }
3634 /// # unsafe { worker(); }
3635 /// # }
3636 /// # }
3637 /// ```
3638 ///
3639 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_epi32)
3640 #[inline]
3641 #[target_feature(enable = "avx2")]
3642 #[cfg_attr(test, assert_instr(vunpcklps))]
3643 #[stable(feature = "simd_x86", since = "1.27.0")]
3644 pub unsafe fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i {
3645 let r: i32x8 = simd_shuffle8(a.as_i32x8(), b.as_i32x8(), [0, 8, 1, 9, 4, 12, 5, 13]);
3646 transmute(r)
3647 }
3648
3649 /// Unpacks and interleaves 64-bit integers from the high half of each
3650 /// 128-bit lane of `a` and `b`.
3651 ///
3652 /// ```rust
3653 /// #[cfg(target_arch = "x86")]
3654 /// use std::arch::x86::*;
3655 /// #[cfg(target_arch = "x86_64")]
3656 /// use std::arch::x86_64::*;
3657 ///
3658 /// # fn main() {
3659 /// # if is_x86_feature_detected!("avx2") {
3660 /// # #[target_feature(enable = "avx2")]
3661 /// # unsafe fn worker() {
3662 /// let a = _mm256_setr_epi64x(0, 1, 2, 3);
3663 /// let b = _mm256_setr_epi64x(0, -1, -2, -3);
3664 ///
3665 /// let c = _mm256_unpackhi_epi64(a, b);
3666 ///
3667 /// let expected = _mm256_setr_epi64x(1, -1, 3, -3);
3668 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3669 ///
3670 /// # }
3671 /// # unsafe { worker(); }
3672 /// # }
3673 /// # }
3674 /// ```
3675 ///
3676 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_epi64)
3677 #[inline]
3678 #[target_feature(enable = "avx2")]
3679 #[cfg_attr(test, assert_instr(vunpckhpd))]
3680 #[stable(feature = "simd_x86", since = "1.27.0")]
3681 pub unsafe fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i {
3682 let r: i64x4 = simd_shuffle4(a.as_i64x4(), b.as_i64x4(), [1, 5, 3, 7]);
3683 transmute(r)
3684 }
3685
3686 /// Unpacks and interleaves 64-bit integers from the low half of each
3687 /// 128-bit lane of `a` and `b`.
3688 ///
3689 /// ```rust
3690 /// #[cfg(target_arch = "x86")]
3691 /// use std::arch::x86::*;
3692 /// #[cfg(target_arch = "x86_64")]
3693 /// use std::arch::x86_64::*;
3694 ///
3695 /// # fn main() {
3696 /// # if is_x86_feature_detected!("avx2") {
3697 /// # #[target_feature(enable = "avx2")]
3698 /// # unsafe fn worker() {
3699 /// let a = _mm256_setr_epi64x(0, 1, 2, 3);
3700 /// let b = _mm256_setr_epi64x(0, -1, -2, -3);
3701 ///
3702 /// let c = _mm256_unpacklo_epi64(a, b);
3703 ///
3704 /// let expected = _mm256_setr_epi64x(0, 0, 2, -2);
3705 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3706 ///
3707 /// # }
3708 /// # unsafe { worker(); }
3709 /// # }
3710 /// # }
3711 /// ```
3712 ///
3713 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_epi64)
3714 #[inline]
3715 #[target_feature(enable = "avx2")]
3716 #[cfg_attr(test, assert_instr(vunpcklpd))]
3717 #[stable(feature = "simd_x86", since = "1.27.0")]
3718 pub unsafe fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i {
3719 let r: i64x4 = simd_shuffle4(a.as_i64x4(), b.as_i64x4(), [0, 4, 2, 6]);
3720 transmute(r)
3721 }
3722
3723 /// Computes the bitwise XOR of 256 bits (representing integer data)
3724 /// in `a` and `b`.
3725 ///
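/// A minimal usage sketch (illustrative only, not from Intel's docs; operand
/// values are arbitrary and runtime AVX2 detection is assumed):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_set1_epi8(0b0101);
/// let b = _mm256_set1_epi8(0b0011);
///
/// let r = _mm256_xor_si256(a, b);
///
/// let expected = _mm256_set1_epi8(0b0110);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
///
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///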
3726 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_xor_si256)
3727 #[inline]
3728 #[target_feature(enable = "avx2")]
3729 #[cfg_attr(test, assert_instr(vxorps))]
3730 #[stable(feature = "simd_x86", since = "1.27.0")]
3731 pub unsafe fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i {
3732 transmute(simd_xor(a.as_i64x4(), b.as_i64x4()))
3733 }
3734
3735 /// Extracts an 8-bit integer from `a`, selected with `imm8`. Returns a 32-bit
3736 /// integer containing the zero-extended integer data.
3737 ///
3738 /// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
3739 ///
3740 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extract_epi8)
3741 #[inline]
3742 #[target_feature(enable = "avx2")]
3743 // This intrinsic has no corresponding instruction.
3744 #[rustc_args_required_const(1)]
3745 #[stable(feature = "simd_x86", since = "1.27.0")]
3746 pub unsafe fn _mm256_extract_epi8(a: __m256i, imm8: i32) -> i32 {
3747 let a = a.as_u8x32();
3748 macro_rules! call {
3749 ($imm5:expr) => {
3750 simd_extract::<_, u8>(a, $imm5) as i32
3751 };
3752 }
3753 constify_imm5!(imm8, call)
3754 }
3755
3756 /// Extracts a 16-bit integer from `a`, selected with `imm8`. Returns a 32-bit
3757 /// integer containing the zero-extended integer data.
3758 ///
3759 /// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
3760 ///
3761 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extract_epi16)
3762 #[inline]
3763 #[target_feature(enable = "avx2")]
3764 // This intrinsic has no corresponding instruction.
3765 #[rustc_args_required_const(1)]
3766 #[stable(feature = "simd_x86", since = "1.27.0")]
3767 pub unsafe fn _mm256_extract_epi16(a: __m256i, imm8: i32) -> i32 {
3768 let a = a.as_u16x16();
3769 macro_rules! call {
3770 ($imm4:expr) => {
3771 simd_extract::<_, u16>(a, $imm4) as i32
3772 };
3773 }
3774 constify_imm4!((imm8 & 15), call)
3775 }
3776
3777 /// Extracts a 32-bit integer from `a`, selected with `imm8`.
3778 ///
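/// A minimal usage sketch (illustrative only, not from Intel's docs; it uses
/// the `(a, imm8)` form declared in this module and assumes runtime AVX2
/// detection):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_setr_epi32(0, 10, 20, 30, 40, 50, 60, 70);
///
/// // Extract the element at index 3.
/// assert_eq!(_mm256_extract_epi32(a, 3), 30);
///
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///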
3779 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extract_epi32)
3780 #[inline]
3781 #[target_feature(enable = "avx2")]
3782 // This intrinsic has no corresponding instruction.
3783 #[rustc_args_required_const(1)]
3784 #[stable(feature = "simd_x86", since = "1.27.0")]
3785 pub unsafe fn _mm256_extract_epi32(a: __m256i, imm8: i32) -> i32 {
3786 let a = a.as_i32x8();
3787 macro_rules! call {
3788 ($imm3:expr) => {
3789 simd_extract(a, $imm3)
3790 };
3791 }
3792 constify_imm3!((imm8 & 7), call)
3793 }
3794
3795 /// Returns the first element of the input vector of `[4 x double]`.
3796 ///
3797 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsd_f64)
3798 #[inline]
3799 #[target_feature(enable = "avx2")]
3800 //#[cfg_attr(test, assert_instr(movsd))] FIXME
3801 #[stable(feature = "simd_x86", since = "1.27.0")]
3802 pub unsafe fn _mm256_cvtsd_f64(a: __m256d) -> f64 {
3803 simd_extract(a, 0)
3804 }
3805
3806 /// Returns the first element of the input vector of `[8 x i32]`.
3807 ///
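/// A minimal usage sketch (illustrative only, not from Intel's docs; operand
/// values are arbitrary and runtime AVX2 detection is assumed):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_setr_epi32(9, 1, 2, 3, 4, 5, 6, 7);
///
/// // The lowest 32-bit element is returned.
/// assert_eq!(_mm256_cvtsi256_si32(a), 9);
///
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///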
3808 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsi256_si32)
3809 #[inline]
3810 #[target_feature(enable = "avx2")]
3811 //#[cfg_attr(test, assert_instr(movd))] FIXME
3812 #[stable(feature = "simd_x86", since = "1.27.0")]
3813 pub unsafe fn _mm256_cvtsi256_si32(a: __m256i) -> i32 {
3814 simd_extract(a.as_i32x8(), 0)
3815 }
3816
3817 #[allow(improper_ctypes)]
3818 extern "C" {
3819 #[link_name = "llvm.x86.avx2.pabs.b"]
3820 fn pabsb(a: i8x32) -> u8x32;
3821 #[link_name = "llvm.x86.avx2.pabs.w"]
3822 fn pabsw(a: i16x16) -> u16x16;
3823 #[link_name = "llvm.x86.avx2.pabs.d"]
3824 fn pabsd(a: i32x8) -> u32x8;
3825 #[link_name = "llvm.x86.avx2.pavg.b"]
3826 fn pavgb(a: u8x32, b: u8x32) -> u8x32;
3827 #[link_name = "llvm.x86.avx2.pavg.w"]
3828 fn pavgw(a: u16x16, b: u16x16) -> u16x16;
3829 #[link_name = "llvm.x86.avx2.pblendvb"]
3830 fn pblendvb(a: i8x32, b: i8x32, mask: i8x32) -> i8x32;
3831 #[link_name = "llvm.x86.avx2.phadd.w"]
3832 fn phaddw(a: i16x16, b: i16x16) -> i16x16;
3833 #[link_name = "llvm.x86.avx2.phadd.d"]
3834 fn phaddd(a: i32x8, b: i32x8) -> i32x8;
3835 #[link_name = "llvm.x86.avx2.phadd.sw"]
3836 fn phaddsw(a: i16x16, b: i16x16) -> i16x16;
3837 #[link_name = "llvm.x86.avx2.phsub.w"]
3838 fn phsubw(a: i16x16, b: i16x16) -> i16x16;
3839 #[link_name = "llvm.x86.avx2.phsub.d"]
3840 fn phsubd(a: i32x8, b: i32x8) -> i32x8;
3841 #[link_name = "llvm.x86.avx2.phsub.sw"]
3842 fn phsubsw(a: i16x16, b: i16x16) -> i16x16;
3843 #[link_name = "llvm.x86.avx2.pmadd.wd"]
3844 fn pmaddwd(a: i16x16, b: i16x16) -> i32x8;
3845 #[link_name = "llvm.x86.avx2.pmadd.ub.sw"]
3846 fn pmaddubsw(a: u8x32, b: u8x32) -> i16x16;
3847 #[link_name = "llvm.x86.avx2.maskload.d"]
3848 fn maskloadd(mem_addr: *const i8, mask: i32x4) -> i32x4;
3849 #[link_name = "llvm.x86.avx2.maskload.d.256"]
3850 fn maskloadd256(mem_addr: *const i8, mask: i32x8) -> i32x8;
3851 #[link_name = "llvm.x86.avx2.maskload.q"]
3852 fn maskloadq(mem_addr: *const i8, mask: i64x2) -> i64x2;
3853 #[link_name = "llvm.x86.avx2.maskload.q.256"]
3854 fn maskloadq256(mem_addr: *const i8, mask: i64x4) -> i64x4;
3855 #[link_name = "llvm.x86.avx2.maskstore.d"]
3856 fn maskstored(mem_addr: *mut i8, mask: i32x4, a: i32x4);
3857 #[link_name = "llvm.x86.avx2.maskstore.d.256"]
3858 fn maskstored256(mem_addr: *mut i8, mask: i32x8, a: i32x8);
3859 #[link_name = "llvm.x86.avx2.maskstore.q"]
3860 fn maskstoreq(mem_addr: *mut i8, mask: i64x2, a: i64x2);
3861 #[link_name = "llvm.x86.avx2.maskstore.q.256"]
3862 fn maskstoreq256(mem_addr: *mut i8, mask: i64x4, a: i64x4);
3863 #[link_name = "llvm.x86.avx2.pmaxs.w"]
3864 fn pmaxsw(a: i16x16, b: i16x16) -> i16x16;
3865 #[link_name = "llvm.x86.avx2.pmaxs.d"]
3866 fn pmaxsd(a: i32x8, b: i32x8) -> i32x8;
3867 #[link_name = "llvm.x86.avx2.pmaxs.b"]
3868 fn pmaxsb(a: i8x32, b: i8x32) -> i8x32;
3869 #[link_name = "llvm.x86.avx2.pmaxu.w"]
3870 fn pmaxuw(a: u16x16, b: u16x16) -> u16x16;
3871 #[link_name = "llvm.x86.avx2.pmaxu.d"]
3872 fn pmaxud(a: u32x8, b: u32x8) -> u32x8;
3873 #[link_name = "llvm.x86.avx2.pmaxu.b"]
3874 fn pmaxub(a: u8x32, b: u8x32) -> u8x32;
3875 #[link_name = "llvm.x86.avx2.pmins.w"]
3876 fn pminsw(a: i16x16, b: i16x16) -> i16x16;
3877 #[link_name = "llvm.x86.avx2.pmins.d"]
3878 fn pminsd(a: i32x8, b: i32x8) -> i32x8;
3879 #[link_name = "llvm.x86.avx2.pmins.b"]
3880 fn pminsb(a: i8x32, b: i8x32) -> i8x32;
3881 #[link_name = "llvm.x86.avx2.pminu.w"]
3882 fn pminuw(a: u16x16, b: u16x16) -> u16x16;
3883 #[link_name = "llvm.x86.avx2.pminu.d"]
3884 fn pminud(a: u32x8, b: u32x8) -> u32x8;
3885 #[link_name = "llvm.x86.avx2.pminu.b"]
3886 fn pminub(a: u8x32, b: u8x32) -> u8x32;
3887 #[link_name = "llvm.x86.avx2.pmovmskb"]
3888 fn pmovmskb(a: i8x32) -> i32;
3889 #[link_name = "llvm.x86.avx2.mpsadbw"]
3890 fn mpsadbw(a: u8x32, b: u8x32, imm8: i32) -> u16x16;
3891 #[link_name = "llvm.x86.avx2.pmulhu.w"]
3892 fn pmulhuw(a: u16x16, b: u16x16) -> u16x16;
3893 #[link_name = "llvm.x86.avx2.pmulh.w"]
3894 fn pmulhw(a: i16x16, b: i16x16) -> i16x16;
3895 #[link_name = "llvm.x86.avx2.pmul.dq"]
3896 fn pmuldq(a: i32x8, b: i32x8) -> i64x4;
3897 #[link_name = "llvm.x86.avx2.pmulu.dq"]
3898 fn pmuludq(a: u32x8, b: u32x8) -> u64x4;
3899 #[link_name = "llvm.x86.avx2.pmul.hr.sw"]
3900 fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16;
3901 #[link_name = "llvm.x86.avx2.packsswb"]
3902 fn packsswb(a: i16x16, b: i16x16) -> i8x32;
3903 #[link_name = "llvm.x86.avx2.packssdw"]
3904 fn packssdw(a: i32x8, b: i32x8) -> i16x16;
3905 #[link_name = "llvm.x86.avx2.packuswb"]
3906 fn packuswb(a: i16x16, b: i16x16) -> u8x32;
3907 #[link_name = "llvm.x86.avx2.packusdw"]
3908 fn packusdw(a: i32x8, b: i32x8) -> u16x16;
3909 #[link_name = "llvm.x86.avx2.psad.bw"]
3910 fn psadbw(a: u8x32, b: u8x32) -> u64x4;
3911 #[link_name = "llvm.x86.avx2.psign.b"]
3912 fn psignb(a: i8x32, b: i8x32) -> i8x32;
3913 #[link_name = "llvm.x86.avx2.psign.w"]
3914 fn psignw(a: i16x16, b: i16x16) -> i16x16;
3915 #[link_name = "llvm.x86.avx2.psign.d"]
3916 fn psignd(a: i32x8, b: i32x8) -> i32x8;
3917 #[link_name = "llvm.x86.avx2.psll.w"]
3918 fn psllw(a: i16x16, count: i16x8) -> i16x16;
3919 #[link_name = "llvm.x86.avx2.psll.d"]
3920 fn pslld(a: i32x8, count: i32x4) -> i32x8;
3921 #[link_name = "llvm.x86.avx2.psll.q"]
3922 fn psllq(a: i64x4, count: i64x2) -> i64x4;
3923 #[link_name = "llvm.x86.avx2.pslli.w"]
3924 fn pslliw(a: i16x16, imm8: i32) -> i16x16;
3925 #[link_name = "llvm.x86.avx2.pslli.d"]
3926 fn psllid(a: i32x8, imm8: i32) -> i32x8;
3927 #[link_name = "llvm.x86.avx2.pslli.q"]
3928 fn pslliq(a: i64x4, imm8: i32) -> i64x4;
3929 #[link_name = "llvm.x86.avx2.psllv.d"]
3930 fn psllvd(a: i32x4, count: i32x4) -> i32x4;
3931 #[link_name = "llvm.x86.avx2.psllv.d.256"]
3932 fn psllvd256(a: i32x8, count: i32x8) -> i32x8;
3933 #[link_name = "llvm.x86.avx2.psllv.q"]
3934 fn psllvq(a: i64x2, count: i64x2) -> i64x2;
3935 #[link_name = "llvm.x86.avx2.psllv.q.256"]
3936 fn psllvq256(a: i64x4, count: i64x4) -> i64x4;
3937 #[link_name = "llvm.x86.avx2.psra.w"]
3938 fn psraw(a: i16x16, count: i16x8) -> i16x16;
3939 #[link_name = "llvm.x86.avx2.psra.d"]
3940 fn psrad(a: i32x8, count: i32x4) -> i32x8;
3941 #[link_name = "llvm.x86.avx2.psrai.w"]
3942 fn psraiw(a: i16x16, imm8: i32) -> i16x16;
3943 #[link_name = "llvm.x86.avx2.psrai.d"]
3944 fn psraid(a: i32x8, imm8: i32) -> i32x8;
3945 #[link_name = "llvm.x86.avx2.psrav.d"]
3946 fn psravd(a: i32x4, count: i32x4) -> i32x4;
3947 #[link_name = "llvm.x86.avx2.psrav.d.256"]
3948 fn psravd256(a: i32x8, count: i32x8) -> i32x8;
3949 #[link_name = "llvm.x86.avx2.psrl.w"]
3950 fn psrlw(a: i16x16, count: i16x8) -> i16x16;
3951 #[link_name = "llvm.x86.avx2.psrl.d"]
3952 fn psrld(a: i32x8, count: i32x4) -> i32x8;
3953 #[link_name = "llvm.x86.avx2.psrl.q"]
3954 fn psrlq(a: i64x4, count: i64x2) -> i64x4;
3955 #[link_name = "llvm.x86.avx2.psrli.w"]
3956 fn psrliw(a: i16x16, imm8: i32) -> i16x16;
3957 #[link_name = "llvm.x86.avx2.psrli.d"]
3958 fn psrlid(a: i32x8, imm8: i32) -> i32x8;
3959 #[link_name = "llvm.x86.avx2.psrli.q"]
3960 fn psrliq(a: i64x4, imm8: i32) -> i64x4;
3961 #[link_name = "llvm.x86.avx2.psrlv.d"]
3962 fn psrlvd(a: i32x4, count: i32x4) -> i32x4;
3963 #[link_name = "llvm.x86.avx2.psrlv.d.256"]
3964 fn psrlvd256(a: i32x8, count: i32x8) -> i32x8;
3965 #[link_name = "llvm.x86.avx2.psrlv.q"]
3966 fn psrlvq(a: i64x2, count: i64x2) -> i64x2;
3967 #[link_name = "llvm.x86.avx2.psrlv.q.256"]
3968 fn psrlvq256(a: i64x4, count: i64x4) -> i64x4;
3969 #[link_name = "llvm.x86.avx2.pshuf.b"]
3970 fn pshufb(a: u8x32, b: u8x32) -> u8x32;
3971 #[link_name = "llvm.x86.avx2.permd"]
3972 fn permd(a: u32x8, b: u32x8) -> u32x8;
3973 #[link_name = "llvm.x86.avx2.permps"]
3974 fn permps(a: __m256, b: i32x8) -> __m256;
3975 #[link_name = "llvm.x86.avx2.vperm2i128"]
3976 fn vperm2i128(a: i64x4, b: i64x4, imm8: i8) -> i64x4;
3977 #[link_name = "llvm.x86.avx2.gather.d.d"]
3978 fn pgatherdd(src: i32x4, slice: *const i8, offsets: i32x4, mask: i32x4, scale: i8) -> i32x4;
3979 #[link_name = "llvm.x86.avx2.gather.d.d.256"]
3980 fn vpgatherdd(src: i32x8, slice: *const i8, offsets: i32x8, mask: i32x8, scale: i8) -> i32x8;
3981 #[link_name = "llvm.x86.avx2.gather.d.q"]
3982 fn pgatherdq(src: i64x2, slice: *const i8, offsets: i32x4, mask: i64x2, scale: i8) -> i64x2;
3983 #[link_name = "llvm.x86.avx2.gather.d.q.256"]
3984 fn vpgatherdq(src: i64x4, slice: *const i8, offsets: i32x4, mask: i64x4, scale: i8) -> i64x4;
3985 #[link_name = "llvm.x86.avx2.gather.q.d"]
3986 fn pgatherqd(src: i32x4, slice: *const i8, offsets: i64x2, mask: i32x4, scale: i8) -> i32x4;
3987 #[link_name = "llvm.x86.avx2.gather.q.d.256"]
3988 fn vpgatherqd(src: i32x4, slice: *const i8, offsets: i64x4, mask: i32x4, scale: i8) -> i32x4;
3989 #[link_name = "llvm.x86.avx2.gather.q.q"]
3990 fn pgatherqq(src: i64x2, slice: *const i8, offsets: i64x2, mask: i64x2, scale: i8) -> i64x2;
3991 #[link_name = "llvm.x86.avx2.gather.q.q.256"]
3992 fn vpgatherqq(src: i64x4, slice: *const i8, offsets: i64x4, mask: i64x4, scale: i8) -> i64x4;
3993 #[link_name = "llvm.x86.avx2.gather.d.pd"]
3994 fn pgatherdpd(
3995 src: __m128d,
3996 slice: *const i8,
3997 offsets: i32x4,
3998 mask: __m128d,
3999 scale: i8,
4000 ) -> __m128d;
4001 #[link_name = "llvm.x86.avx2.gather.d.pd.256"]
4002 fn vpgatherdpd(
4003 src: __m256d,
4004 slice: *const i8,
4005 offsets: i32x4,
4006 mask: __m256d,
4007 scale: i8,
4008 ) -> __m256d;
4009 #[link_name = "llvm.x86.avx2.gather.q.pd"]
4010 fn pgatherqpd(
4011 src: __m128d,
4012 slice: *const i8,
4013 offsets: i64x2,
4014 mask: __m128d,
4015 scale: i8,
4016 ) -> __m128d;
4017 #[link_name = "llvm.x86.avx2.gather.q.pd.256"]
4018 fn vpgatherqpd(
4019 src: __m256d,
4020 slice: *const i8,
4021 offsets: i64x4,
4022 mask: __m256d,
4023 scale: i8,
4024 ) -> __m256d;
4025 #[link_name = "llvm.x86.avx2.gather.d.ps"]
4026 fn pgatherdps(src: __m128, slice: *const i8, offsets: i32x4, mask: __m128, scale: i8)
4027 -> __m128;
4028 #[link_name = "llvm.x86.avx2.gather.d.ps.256"]
4029 fn vpgatherdps(
4030 src: __m256,
4031 slice: *const i8,
4032 offsets: i32x8,
4033 mask: __m256,
4034 scale: i8,
4035 ) -> __m256;
4036 #[link_name = "llvm.x86.avx2.gather.q.ps"]
4037 fn pgatherqps(src: __m128, slice: *const i8, offsets: i64x2, mask: __m128, scale: i8)
4038 -> __m128;
4039 #[link_name = "llvm.x86.avx2.gather.q.ps.256"]
4040 fn vpgatherqps(
4041 src: __m128,
4042 slice: *const i8,
4043 offsets: i64x4,
4044 mask: __m128,
4045 scale: i8,
4046 ) -> __m128;
4047 #[link_name = "llvm.x86.avx2.psll.dq"]
4048 fn vpslldq(a: i64x4, b: i32) -> i64x4;
4049 #[link_name = "llvm.x86.avx2.psrl.dq"]
4050 fn vpsrldq(a: i64x4, b: i32) -> i64x4;
4051 }
4052
4053 #[cfg(test)]
4054 mod tests {
4055 use std;
4056 use stdarch_test::simd_test;
4057
4058 use crate::core_arch::x86::*;
4059
4060 #[simd_test(enable = "avx2")]
4061 unsafe fn test_mm256_abs_epi32() {
4062 #[rustfmt::skip]
4063 let a = _mm256_setr_epi32(
4064 0, 1, -1, i32::MAX,
4065 i32::MIN, 100, -100, -32,
4066 );
4067 let r = _mm256_abs_epi32(a);
4068 #[rustfmt::skip]
4069 let e = _mm256_setr_epi32(
4070 0, 1, 1, i32::MAX,
4071 i32::MAX.wrapping_add(1), 100, 100, 32,
4072 );
4073 assert_eq_m256i(r, e);
4074 }
4075
4076 #[simd_test(enable = "avx2")]
4077 unsafe fn test_mm256_abs_epi16() {
4078 #[rustfmt::skip]
4079 let a = _mm256_setr_epi16(
4080 0, 1, -1, 2, -2, 3, -3, 4,
4081 -4, 5, -5, i16::MAX, i16::MIN, 100, -100, -32,
4082 );
4083 let r = _mm256_abs_epi16(a);
4084 #[rustfmt::skip]
4085 let e = _mm256_setr_epi16(
4086 0, 1, 1, 2, 2, 3, 3, 4,
4087 4, 5, 5, i16::MAX, i16::MAX.wrapping_add(1), 100, 100, 32,
4088 );
4089 assert_eq_m256i(r, e);
4090 }
4091
4092 #[simd_test(enable = "avx2")]
4093 unsafe fn test_mm256_abs_epi8() {
4094 #[rustfmt::skip]
4095 let a = _mm256_setr_epi8(
4096 0, 1, -1, 2, -2, 3, -3, 4,
4097 -4, 5, -5, i8::MAX, i8::MIN, 100, -100, -32,
4098 0, 1, -1, 2, -2, 3, -3, 4,
4099 -4, 5, -5, i8::MAX, i8::MIN, 100, -100, -32,
4100 );
4101 let r = _mm256_abs_epi8(a);
4102 #[rustfmt::skip]
4103 let e = _mm256_setr_epi8(
4104 0, 1, 1, 2, 2, 3, 3, 4,
4105 4, 5, 5, i8::MAX, i8::MAX.wrapping_add(1), 100, 100, 32,
4106 0, 1, 1, 2, 2, 3, 3, 4,
4107 4, 5, 5, i8::MAX, i8::MAX.wrapping_add(1), 100, 100, 32,
4108 );
4109 assert_eq_m256i(r, e);
4110 }
4111
4112 #[simd_test(enable = "avx2")]
4113 unsafe fn test_mm256_add_epi64() {
4114 let a = _mm256_setr_epi64x(-10, 0, 100, 1_000_000_000);
4115 let b = _mm256_setr_epi64x(-1, 0, 1, 2);
4116 let r = _mm256_add_epi64(a, b);
4117 let e = _mm256_setr_epi64x(-11, 0, 101, 1_000_000_002);
4118 assert_eq_m256i(r, e);
4119 }
4120
4121 #[simd_test(enable = "avx2")]
4122 unsafe fn test_mm256_add_epi32() {
4123 let a = _mm256_setr_epi32(-1, 0, 1, 2, 3, 4, 5, 6);
4124 let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4125 let r = _mm256_add_epi32(a, b);
4126 let e = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14);
4127 assert_eq_m256i(r, e);
4128 }
4129
4130 #[simd_test(enable = "avx2")]
4131 unsafe fn test_mm256_add_epi16() {
4132 #[rustfmt::skip]
4133 let a = _mm256_setr_epi16(
4134 0, 1, 2, 3, 4, 5, 6, 7,
4135 8, 9, 10, 11, 12, 13, 14, 15,
4136 );
4137 #[rustfmt::skip]
4138 let b = _mm256_setr_epi16(
4139 0, 1, 2, 3, 4, 5, 6, 7,
4140 8, 9, 10, 11, 12, 13, 14, 15,
4141 );
4142 let r = _mm256_add_epi16(a, b);
4143 #[rustfmt::skip]
4144 let e = _mm256_setr_epi16(
4145 0, 2, 4, 6, 8, 10, 12, 14,
4146 16, 18, 20, 22, 24, 26, 28, 30,
4147 );
4148 assert_eq_m256i(r, e);
4149 }
4150
4151 #[simd_test(enable = "avx2")]
4152 unsafe fn test_mm256_add_epi8() {
4153 #[rustfmt::skip]
4154 let a = _mm256_setr_epi8(
4155 0, 1, 2, 3, 4, 5, 6, 7,
4156 8, 9, 10, 11, 12, 13, 14, 15,
4157 16, 17, 18, 19, 20, 21, 22, 23,
4158 24, 25, 26, 27, 28, 29, 30, 31,
4159 );
4160 #[rustfmt::skip]
4161 let b = _mm256_setr_epi8(
4162 0, 1, 2, 3, 4, 5, 6, 7,
4163 8, 9, 10, 11, 12, 13, 14, 15,
4164 16, 17, 18, 19, 20, 21, 22, 23,
4165 24, 25, 26, 27, 28, 29, 30, 31,
4166 );
4167 let r = _mm256_add_epi8(a, b);
4168 #[rustfmt::skip]
4169 let e = _mm256_setr_epi8(
4170 0, 2, 4, 6, 8, 10, 12, 14,
4171 16, 18, 20, 22, 24, 26, 28, 30,
4172 32, 34, 36, 38, 40, 42, 44, 46,
4173 48, 50, 52, 54, 56, 58, 60, 62,
4174 );
4175 assert_eq_m256i(r, e);
4176 }
4177
4178 #[simd_test(enable = "avx2")]
4179 unsafe fn test_mm256_adds_epi8() {
4180 #[rustfmt::skip]
4181 let a = _mm256_setr_epi8(
4182 0, 1, 2, 3, 4, 5, 6, 7,
4183 8, 9, 10, 11, 12, 13, 14, 15,
4184 16, 17, 18, 19, 20, 21, 22, 23,
4185 24, 25, 26, 27, 28, 29, 30, 31,
4186 );
4187 #[rustfmt::skip]
4188 let b = _mm256_setr_epi8(
4189 32, 33, 34, 35, 36, 37, 38, 39,
4190 40, 41, 42, 43, 44, 45, 46, 47,
4191 48, 49, 50, 51, 52, 53, 54, 55,
4192 56, 57, 58, 59, 60, 61, 62, 63,
4193 );
4194 let r = _mm256_adds_epi8(a, b);
4195 #[rustfmt::skip]
4196 let e = _mm256_setr_epi8(
4197 32, 34, 36, 38, 40, 42, 44, 46,
4198 48, 50, 52, 54, 56, 58, 60, 62,
4199 64, 66, 68, 70, 72, 74, 76, 78,
4200 80, 82, 84, 86, 88, 90, 92, 94,
4201 );
4202 assert_eq_m256i(r, e);
4203 }
4204
4205 #[simd_test(enable = "avx2")]
4206 unsafe fn test_mm256_adds_epi8_saturate_positive() {
4207 let a = _mm256_set1_epi8(0x7F);
4208 let b = _mm256_set1_epi8(1);
4209 let r = _mm256_adds_epi8(a, b);
4210 assert_eq_m256i(r, a);
4211 }
4212
4213 #[simd_test(enable = "avx2")]
4214 unsafe fn test_mm256_adds_epi8_saturate_negative() {
4215 let a = _mm256_set1_epi8(-0x80);
4216 let b = _mm256_set1_epi8(-1);
4217 let r = _mm256_adds_epi8(a, b);
4218 assert_eq_m256i(r, a);
4219 }
4220
4221 #[simd_test(enable = "avx2")]
4222 unsafe fn test_mm256_adds_epi16() {
4223 #[rustfmt::skip]
4224 let a = _mm256_setr_epi16(
4225 0, 1, 2, 3, 4, 5, 6, 7,
4226 8, 9, 10, 11, 12, 13, 14, 15,
4227 );
4228 #[rustfmt::skip]
4229 let b = _mm256_setr_epi16(
4230 32, 33, 34, 35, 36, 37, 38, 39,
4231 40, 41, 42, 43, 44, 45, 46, 47,
4232 );
4233 let r = _mm256_adds_epi16(a, b);
4234 #[rustfmt::skip]
4235 let e = _mm256_setr_epi16(
4236 32, 34, 36, 38, 40, 42, 44, 46,
4237 48, 50, 52, 54, 56, 58, 60, 62,
4238 );
4239
4240 assert_eq_m256i(r, e);
4241 }
4242
4243 #[simd_test(enable = "avx2")]
4244 unsafe fn test_mm256_adds_epi16_saturate_positive() {
4245 let a = _mm256_set1_epi16(0x7FFF);
4246 let b = _mm256_set1_epi16(1);
4247 let r = _mm256_adds_epi16(a, b);
4248 assert_eq_m256i(r, a);
4249 }
4250
4251 #[simd_test(enable = "avx2")]
4252 unsafe fn test_mm256_adds_epi16_saturate_negative() {
4253 let a = _mm256_set1_epi16(-0x8000);
4254 let b = _mm256_set1_epi16(-1);
4255 let r = _mm256_adds_epi16(a, b);
4256 assert_eq_m256i(r, a);
4257 }
4258
4259 #[simd_test(enable = "avx2")]
4260 unsafe fn test_mm256_adds_epu8() {
4261 #[rustfmt::skip]
4262 let a = _mm256_setr_epi8(
4263 0, 1, 2, 3, 4, 5, 6, 7,
4264 8, 9, 10, 11, 12, 13, 14, 15,
4265 16, 17, 18, 19, 20, 21, 22, 23,
4266 24, 25, 26, 27, 28, 29, 30, 31,
4267 );
4268 #[rustfmt::skip]
4269 let b = _mm256_setr_epi8(
4270 32, 33, 34, 35, 36, 37, 38, 39,
4271 40, 41, 42, 43, 44, 45, 46, 47,
4272 48, 49, 50, 51, 52, 53, 54, 55,
4273 56, 57, 58, 59, 60, 61, 62, 63,
4274 );
4275 let r = _mm256_adds_epu8(a, b);
4276 #[rustfmt::skip]
4277 let e = _mm256_setr_epi8(
4278 32, 34, 36, 38, 40, 42, 44, 46,
4279 48, 50, 52, 54, 56, 58, 60, 62,
4280 64, 66, 68, 70, 72, 74, 76, 78,
4281 80, 82, 84, 86, 88, 90, 92, 94,
4282 );
4283 assert_eq_m256i(r, e);
4284 }
4285
4286 #[simd_test(enable = "avx2")]
4287 unsafe fn test_mm256_adds_epu8_saturate() {
4288 let a = _mm256_set1_epi8(!0);
4289 let b = _mm256_set1_epi8(1);
4290 let r = _mm256_adds_epu8(a, b);
4291 assert_eq_m256i(r, a);
4292 }
4293
4294 #[simd_test(enable = "avx2")]
4295 unsafe fn test_mm256_adds_epu16() {
4296 #[rustfmt::skip]
4297 let a = _mm256_setr_epi16(
4298 0, 1, 2, 3, 4, 5, 6, 7,
4299 8, 9, 10, 11, 12, 13, 14, 15,
4300 );
4301 #[rustfmt::skip]
4302 let b = _mm256_setr_epi16(
4303 32, 33, 34, 35, 36, 37, 38, 39,
4304 40, 41, 42, 43, 44, 45, 46, 47,
4305 );
4306 let r = _mm256_adds_epu16(a, b);
4307 #[rustfmt::skip]
4308 let e = _mm256_setr_epi16(
4309 32, 34, 36, 38, 40, 42, 44, 46,
4310 48, 50, 52, 54, 56, 58, 60, 62,
4311 );
4312
4313 assert_eq_m256i(r, e);
4314 }
4315
4316 #[simd_test(enable = "avx2")]
4317 unsafe fn test_mm256_adds_epu16_saturate() {
4318 let a = _mm256_set1_epi16(!0);
4319 let b = _mm256_set1_epi16(1);
4320 let r = _mm256_adds_epu16(a, b);
4321 assert_eq_m256i(r, a);
4322 }
4323
4324 #[simd_test(enable = "avx2")]
4325 unsafe fn test_mm256_and_si256() {
4326 let a = _mm256_set1_epi8(5);
4327 let b = _mm256_set1_epi8(3);
4328 let got = _mm256_and_si256(a, b);
4329 assert_eq_m256i(got, _mm256_set1_epi8(1));
4330 }
4331
4332 #[simd_test(enable = "avx2")]
4333 unsafe fn test_mm256_andnot_si256() {
4334 let a = _mm256_set1_epi8(5);
4335 let b = _mm256_set1_epi8(3);
4336 let got = _mm256_andnot_si256(a, b);
4337 assert_eq_m256i(got, _mm256_set1_epi8(2));
4338 }
4339
4340 #[simd_test(enable = "avx2")]
4341 unsafe fn test_mm256_avg_epu8() {
4342 let (a, b) = (_mm256_set1_epi8(3), _mm256_set1_epi8(9));
4343 let r = _mm256_avg_epu8(a, b);
4344 assert_eq_m256i(r, _mm256_set1_epi8(6));
4345 }
4346
4347 #[simd_test(enable = "avx2")]
4348 unsafe fn test_mm256_avg_epu16() {
4349 let (a, b) = (_mm256_set1_epi16(3), _mm256_set1_epi16(9));
4350 let r = _mm256_avg_epu16(a, b);
4351 assert_eq_m256i(r, _mm256_set1_epi16(6));
4352 }
4353
4354 #[simd_test(enable = "avx2")]
4355 unsafe fn test_mm_blend_epi32() {
4356 let (a, b) = (_mm_set1_epi32(3), _mm_set1_epi32(9));
4357 let e = _mm_setr_epi32(9, 3, 3, 3);
4358 let r = _mm_blend_epi32(a, b, 0x01 as i32);
4359 assert_eq_m128i(r, e);
4360
4361 let r = _mm_blend_epi32(b, a, 0x0E as i32);
4362 assert_eq_m128i(r, e);
4363 }
4364
4365 #[simd_test(enable = "avx2")]
4366 unsafe fn test_mm256_blend_epi32() {
4367 let (a, b) = (_mm256_set1_epi32(3), _mm256_set1_epi32(9));
4368 let e = _mm256_setr_epi32(9, 3, 3, 3, 3, 3, 3, 3);
4369 let r = _mm256_blend_epi32(a, b, 0x01 as i32);
4370 assert_eq_m256i(r, e);
4371
4372 let e = _mm256_setr_epi32(3, 9, 3, 3, 3, 3, 3, 9);
4373 let r = _mm256_blend_epi32(a, b, 0x82 as i32);
4374 assert_eq_m256i(r, e);
4375
4376 let e = _mm256_setr_epi32(3, 3, 9, 9, 9, 9, 9, 3);
4377 let r = _mm256_blend_epi32(a, b, 0x7C as i32);
4378 assert_eq_m256i(r, e);
4379 }
4380
4381 #[simd_test(enable = "avx2")]
4382 unsafe fn test_mm256_blend_epi16() {
4383 let (a, b) = (_mm256_set1_epi16(3), _mm256_set1_epi16(9));
4384 let e = _mm256_setr_epi16(9, 3, 3, 3, 3, 3, 3, 3, 9, 3, 3, 3, 3, 3, 3, 3);
4385 let r = _mm256_blend_epi16(a, b, 0x01 as i32);
4386 assert_eq_m256i(r, e);
4387
4388 let r = _mm256_blend_epi16(b, a, 0xFE as i32);
4389 assert_eq_m256i(r, e);
4390 }
4391
4392 #[simd_test(enable = "avx2")]
4393 unsafe fn test_mm256_blendv_epi8() {
4394 let (a, b) = (_mm256_set1_epi8(4), _mm256_set1_epi8(2));
4395 let mask = _mm256_insert_epi8(_mm256_set1_epi8(0), -1, 2);
4396 let e = _mm256_insert_epi8(_mm256_set1_epi8(4), 2, 2);
4397 let r = _mm256_blendv_epi8(a, b, mask);
4398 assert_eq_m256i(r, e);
4399 }
4400
4401 #[simd_test(enable = "avx2")]
4402 unsafe fn test_mm_broadcastb_epi8() {
4403 let a = _mm_insert_epi8(_mm_set1_epi8(0x00), 0x2a, 0);
4404 let res = _mm_broadcastb_epi8(a);
4405 assert_eq_m128i(res, _mm_set1_epi8(0x2a));
4406 }
4407
4408 #[simd_test(enable = "avx2")]
4409 unsafe fn test_mm256_broadcastb_epi8() {
4410 let a = _mm_insert_epi8(_mm_set1_epi8(0x00), 0x2a, 0);
4411 let res = _mm256_broadcastb_epi8(a);
4412 assert_eq_m256i(res, _mm256_set1_epi8(0x2a));
4413 }
4414
4415 #[simd_test(enable = "avx2")]
4416 unsafe fn test_mm_broadcastd_epi32() {
4417 let a = _mm_setr_epi32(0x2a, 0x8000000, 0, 0);
4418 let res = _mm_broadcastd_epi32(a);
4419 assert_eq_m128i(res, _mm_set1_epi32(0x2a));
4420 }
4421
4422 #[simd_test(enable = "avx2")]
4423 unsafe fn test_mm256_broadcastd_epi32() {
4424 let a = _mm_setr_epi32(0x2a, 0x8000000, 0, 0);
4425 let res = _mm256_broadcastd_epi32(a);
4426 assert_eq_m256i(res, _mm256_set1_epi32(0x2a));
4427 }
4428
4429 #[simd_test(enable = "avx2")]
4430 unsafe fn test_mm_broadcastq_epi64() {
4431 let a = _mm_setr_epi64x(0x1ffffffff, 0);
4432 let res = _mm_broadcastq_epi64(a);
4433 assert_eq_m128i(res, _mm_set1_epi64x(0x1ffffffff));
4434 }
4435
4436 #[simd_test(enable = "avx2")]
4437 unsafe fn test_mm256_broadcastq_epi64() {
4438 let a = _mm_setr_epi64x(0x1ffffffff, 0);
4439 let res = _mm256_broadcastq_epi64(a);
4440 assert_eq_m256i(res, _mm256_set1_epi64x(0x1ffffffff));
4441 }
4442
4443 #[simd_test(enable = "avx2")]
4444 unsafe fn test_mm_broadcastsd_pd() {
4445 let a = _mm_setr_pd(6.28, 3.14);
4446 let res = _mm_broadcastsd_pd(a);
4447 assert_eq_m128d(res, _mm_set1_pd(6.28f64));
4448 }
4449
4450 #[simd_test(enable = "avx2")]
4451 unsafe fn test_mm256_broadcastsd_pd() {
4452 let a = _mm_setr_pd(6.28, 3.14);
4453 let res = _mm256_broadcastsd_pd(a);
4454 assert_eq_m256d(res, _mm256_set1_pd(6.28f64));
4455 }
4456
4457 #[simd_test(enable = "avx2")]
4458 unsafe fn test_mm256_broadcastsi128_si256() {
4459 let a = _mm_setr_epi64x(0x0987654321012334, 0x5678909876543210);
4460 let res = _mm256_broadcastsi128_si256(a);
4461 let retval = _mm256_setr_epi64x(
4462 0x0987654321012334,
4463 0x5678909876543210,
4464 0x0987654321012334,
4465 0x5678909876543210,
4466 );
4467 assert_eq_m256i(res, retval);
4468 }
4469
4470 #[simd_test(enable = "avx2")]
4471 unsafe fn test_mm_broadcastss_ps() {
4472 let a = _mm_setr_ps(6.28, 3.14, 0.0, 0.0);
4473 let res = _mm_broadcastss_ps(a);
4474 assert_eq_m128(res, _mm_set1_ps(6.28f32));
4475 }
4476
4477 #[simd_test(enable = "avx2")]
4478 unsafe fn test_mm256_broadcastss_ps() {
4479 let a = _mm_setr_ps(6.28, 3.14, 0.0, 0.0);
4480 let res = _mm256_broadcastss_ps(a);
4481 assert_eq_m256(res, _mm256_set1_ps(6.28f32));
4482 }
4483
4484 #[simd_test(enable = "avx2")]
4485 unsafe fn test_mm_broadcastw_epi16() {
4486 let a = _mm_insert_epi16(_mm_set1_epi16(0x2a), 0x22b, 0);
4487 let res = _mm_broadcastw_epi16(a);
4488 assert_eq_m128i(res, _mm_set1_epi16(0x22b));
4489 }
4490
4491 #[simd_test(enable = "avx2")]
4492 unsafe fn test_mm256_broadcastw_epi16() {
4493 let a = _mm_insert_epi16(_mm_set1_epi16(0x2a), 0x22b, 0);
4494 let res = _mm256_broadcastw_epi16(a);
4495 assert_eq_m256i(res, _mm256_set1_epi16(0x22b));
4496 }
4497
4498 #[simd_test(enable = "avx2")]
4499 unsafe fn test_mm256_cmpeq_epi8() {
4500 #[rustfmt::skip]
4501 let a = _mm256_setr_epi8(
4502 0, 1, 2, 3, 4, 5, 6, 7,
4503 8, 9, 10, 11, 12, 13, 14, 15,
4504 16, 17, 18, 19, 20, 21, 22, 23,
4505 24, 25, 26, 27, 28, 29, 30, 31,
4506 );
4507 #[rustfmt::skip]
4508 let b = _mm256_setr_epi8(
4509 31, 30, 2, 28, 27, 26, 25, 24,
4510 23, 22, 21, 20, 19, 18, 17, 16,
4511 15, 14, 13, 12, 11, 10, 9, 8,
4512 7, 6, 5, 4, 3, 2, 1, 0,
4513 );
4514 let r = _mm256_cmpeq_epi8(a, b);
4515 assert_eq_m256i(r, _mm256_insert_epi8(_mm256_set1_epi8(0), !0, 2));
4516 }
4517
4518 #[simd_test(enable = "avx2")]
4519 unsafe fn test_mm256_cmpeq_epi16() {
4520 #[rustfmt::skip]
4521 let a = _mm256_setr_epi16(
4522 0, 1, 2, 3, 4, 5, 6, 7,
4523 8, 9, 10, 11, 12, 13, 14, 15,
4524 );
4525 #[rustfmt::skip]
4526 let b = _mm256_setr_epi16(
4527 15, 14, 2, 12, 11, 10, 9, 8,
4528 7, 6, 5, 4, 3, 2, 1, 0,
4529 );
4530 let r = _mm256_cmpeq_epi16(a, b);
4531 assert_eq_m256i(r, _mm256_insert_epi16(_mm256_set1_epi16(0), !0, 2));
4532 }
4533
4534 #[simd_test(enable = "avx2")]
4535 unsafe fn test_mm256_cmpeq_epi32() {
4536 let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
4537 let b = _mm256_setr_epi32(7, 6, 2, 4, 3, 2, 1, 0);
4538 let r = _mm256_cmpeq_epi32(a, b);
4539 let e = _mm256_set1_epi32(0);
4540 let e = _mm256_insert_epi32(e, !0, 2);
4541 assert_eq_m256i(r, e);
4542 }
4543
4544 #[simd_test(enable = "avx2")]
4545 unsafe fn test_mm256_cmpeq_epi64() {
4546 let a = _mm256_setr_epi64x(0, 1, 2, 3);
4547 let b = _mm256_setr_epi64x(3, 2, 2, 0);
4548 let r = _mm256_cmpeq_epi64(a, b);
4549 assert_eq_m256i(r, _mm256_insert_epi64(_mm256_set1_epi64x(0), !0, 2));
4550 }
4551
4552 #[simd_test(enable = "avx2")]
4553 unsafe fn test_mm256_cmpgt_epi8() {
4554 let a = _mm256_insert_epi8(_mm256_set1_epi8(0), 5, 0);
4555 let b = _mm256_set1_epi8(0);
4556 let r = _mm256_cmpgt_epi8(a, b);
4557 assert_eq_m256i(r, _mm256_insert_epi8(_mm256_set1_epi8(0), !0, 0));
4558 }
4559
4560 #[simd_test(enable = "avx2")]
4561 unsafe fn test_mm256_cmpgt_epi16() {
4562 let a = _mm256_insert_epi16(_mm256_set1_epi16(0), 5, 0);
4563 let b = _mm256_set1_epi16(0);
4564 let r = _mm256_cmpgt_epi16(a, b);
4565 assert_eq_m256i(r, _mm256_insert_epi16(_mm256_set1_epi16(0), !0, 0));
4566 }
4567
4568 #[simd_test(enable = "avx2")]
4569 unsafe fn test_mm256_cmpgt_epi32() {
4570 let a = _mm256_insert_epi32(_mm256_set1_epi32(0), 5, 0);
4571 let b = _mm256_set1_epi32(0);
4572 let r = _mm256_cmpgt_epi32(a, b);
4573 assert_eq_m256i(r, _mm256_insert_epi32(_mm256_set1_epi32(0), !0, 0));
4574 }
4575
4576 #[simd_test(enable = "avx2")]
4577 unsafe fn test_mm256_cmpgt_epi64() {
4578 let a = _mm256_insert_epi64(_mm256_set1_epi64x(0), 5, 0);
4579 let b = _mm256_set1_epi64x(0);
4580 let r = _mm256_cmpgt_epi64(a, b);
4581 assert_eq_m256i(r, _mm256_insert_epi64(_mm256_set1_epi64x(0), !0, 0));
4582 }
4583
4584 #[simd_test(enable = "avx2")]
4585 unsafe fn test_mm256_cvtepi8_epi16() {
4586 #[rustfmt::skip]
4587 let a = _mm_setr_epi8(
4588 0, 0, -1, 1, -2, 2, -3, 3,
4589 -4, 4, -5, 5, -6, 6, -7, 7,
4590 );
4591 #[rustfmt::skip]
4592 let r = _mm256_setr_epi16(
4593 0, 0, -1, 1, -2, 2, -3, 3,
4594 -4, 4, -5, 5, -6, 6, -7, 7,
4595 );
4596 assert_eq_m256i(r, _mm256_cvtepi8_epi16(a));
4597 }
4598
4599 #[simd_test(enable = "avx2")]
4600 unsafe fn test_mm256_cvtepi8_epi32() {
4601 #[rustfmt::skip]
4602 let a = _mm_setr_epi8(
4603 0, 0, -1, 1, -2, 2, -3, 3,
4604 -4, 4, -5, 5, -6, 6, -7, 7,
4605 );
4606 let r = _mm256_setr_epi32(0, 0, -1, 1, -2, 2, -3, 3);
4607 assert_eq_m256i(r, _mm256_cvtepi8_epi32(a));
4608 }
4609
4610 #[simd_test(enable = "avx2")]
4611 unsafe fn test_mm256_cvtepi8_epi64() {
4612 #[rustfmt::skip]
4613 let a = _mm_setr_epi8(
4614 0, 0, -1, 1, -2, 2, -3, 3,
4615 -4, 4, -5, 5, -6, 6, -7, 7,
4616 );
4617 let r = _mm256_setr_epi64x(0, 0, -1, 1);
4618 assert_eq_m256i(r, _mm256_cvtepi8_epi64(a));
4619 }
4620
4621 #[simd_test(enable = "avx2")]
4622 unsafe fn test_mm256_cvtepi16_epi32() {
4623 let a = _mm_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3);
4624 let r = _mm256_setr_epi32(0, 0, -1, 1, -2, 2, -3, 3);
4625 assert_eq_m256i(r, _mm256_cvtepi16_epi32(a));
4626 }
4627
4628 #[simd_test(enable = "avx2")]
4629 unsafe fn test_mm256_cvtepi16_epi64() {
4630 let a = _mm_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3);
4631 let r = _mm256_setr_epi64x(0, 0, -1, 1);
4632 assert_eq_m256i(r, _mm256_cvtepi16_epi64(a));
4633 }
4634
4635 #[simd_test(enable = "avx2")]
4636 unsafe fn test_mm256_cvtepi32_epi64() {
4637 let a = _mm_setr_epi32(0, 0, -1, 1);
4638 let r = _mm256_setr_epi64x(0, 0, -1, 1);
4639 assert_eq_m256i(r, _mm256_cvtepi32_epi64(a));
4640 }
4641
4642 #[simd_test(enable = "avx2")]
4643 unsafe fn test_mm256_cvtepu16_epi32() {
4644 let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4645 let r = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
4646 assert_eq_m256i(r, _mm256_cvtepu16_epi32(a));
4647 }
4648
4649 #[simd_test(enable = "avx2")]
4650 unsafe fn test_mm256_cvtepu16_epi64() {
4651 let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4652 let r = _mm256_setr_epi64x(0, 1, 2, 3);
4653 assert_eq_m256i(r, _mm256_cvtepu16_epi64(a));
4654 }
4655
4656 #[simd_test(enable = "avx2")]
4657 unsafe fn test_mm256_cvtepu32_epi64() {
4658 let a = _mm_setr_epi32(0, 1, 2, 3);
4659 let r = _mm256_setr_epi64x(0, 1, 2, 3);
4660 assert_eq_m256i(r, _mm256_cvtepu32_epi64(a));
4661 }
4662
4663 #[simd_test(enable = "avx2")]
4664 unsafe fn test_mm256_cvtepu8_epi16() {
4665 #[rustfmt::skip]
4666 let a = _mm_setr_epi8(
4667 0, 1, 2, 3, 4, 5, 6, 7,
4668 8, 9, 10, 11, 12, 13, 14, 15,
4669 );
4670 #[rustfmt::skip]
4671 let r = _mm256_setr_epi16(
4672 0, 1, 2, 3, 4, 5, 6, 7,
4673 8, 9, 10, 11, 12, 13, 14, 15,
4674 );
4675 assert_eq_m256i(r, _mm256_cvtepu8_epi16(a));
4676 }
4677
4678 #[simd_test(enable = "avx2")]
4679 unsafe fn test_mm256_cvtepu8_epi32() {
4680 #[rustfmt::skip]
4681 let a = _mm_setr_epi8(
4682 0, 1, 2, 3, 4, 5, 6, 7,
4683 8, 9, 10, 11, 12, 13, 14, 15,
4684 );
4685 let r = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
4686 assert_eq_m256i(r, _mm256_cvtepu8_epi32(a));
4687 }
4688
4689 #[simd_test(enable = "avx2")]
4690 unsafe fn test_mm256_cvtepu8_epi64() {
4691 #[rustfmt::skip]
4692 let a = _mm_setr_epi8(
4693 0, 1, 2, 3, 4, 5, 6, 7,
4694 8, 9, 10, 11, 12, 13, 14, 15,
4695 );
4696 let r = _mm256_setr_epi64x(0, 1, 2, 3);
4697 assert_eq_m256i(r, _mm256_cvtepu8_epi64(a));
4698 }
4699
4700 #[simd_test(enable = "avx2")]
4701 unsafe fn test_mm256_extracti128_si256() {
4702 let a = _mm256_setr_epi64x(1, 2, 3, 4);
4703 let r = _mm256_extracti128_si256(a, 0b01);
4704 let e = _mm_setr_epi64x(3, 4);
4705 assert_eq_m128i(r, e);
4706 }
4707
4708 #[simd_test(enable = "avx2")]
4709 unsafe fn test_mm256_hadd_epi16() {
4710 let a = _mm256_set1_epi16(2);
4711 let b = _mm256_set1_epi16(4);
4712 let r = _mm256_hadd_epi16(a, b);
4713 let e = _mm256_setr_epi16(4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8);
4714 assert_eq_m256i(r, e);
4715 }
4716
4717 #[simd_test(enable = "avx2")]
4718 unsafe fn test_mm256_hadd_epi32() {
4719 let a = _mm256_set1_epi32(2);
4720 let b = _mm256_set1_epi32(4);
4721 let r = _mm256_hadd_epi32(a, b);
4722 let e = _mm256_setr_epi32(4, 4, 8, 8, 4, 4, 8, 8);
4723 assert_eq_m256i(r, e);
4724 }
4725
4726 #[simd_test(enable = "avx2")]
4727 unsafe fn test_mm256_hadds_epi16() {
4728 let a = _mm256_set1_epi16(2);
4729 let a = _mm256_insert_epi16(a, 0x7fff, 0);
4730 let a = _mm256_insert_epi16(a, 1, 1);
4731 let b = _mm256_set1_epi16(4);
4732 let r = _mm256_hadds_epi16(a, b);
4733 #[rustfmt::skip]
4734 let e = _mm256_setr_epi16(
4735 0x7FFF, 4, 4, 4, 8, 8, 8, 8,
4736 4, 4, 4, 4, 8, 8, 8, 8,
4737 );
4738 assert_eq_m256i(r, e);
4739 }
4740
4741 #[simd_test(enable = "avx2")]
4742 unsafe fn test_mm256_hsub_epi16() {
4743 let a = _mm256_set1_epi16(2);
4744 let b = _mm256_set1_epi16(4);
4745 let r = _mm256_hsub_epi16(a, b);
4746 let e = _mm256_set1_epi16(0);
4747 assert_eq_m256i(r, e);
4748 }
4749
4750 #[simd_test(enable = "avx2")]
4751 unsafe fn test_mm256_hsub_epi32() {
4752 let a = _mm256_set1_epi32(2);
4753 let b = _mm256_set1_epi32(4);
4754 let r = _mm256_hsub_epi32(a, b);
4755 let e = _mm256_set1_epi32(0);
4756 assert_eq_m256i(r, e);
4757 }
4758
4759 #[simd_test(enable = "avx2")]
4760 unsafe fn test_mm256_hsubs_epi16() {
4761 let a = _mm256_set1_epi16(2);
4762 let a = _mm256_insert_epi16(a, 0x7fff, 0);
4763 let a = _mm256_insert_epi16(a, -1, 1);
4764 let b = _mm256_set1_epi16(4);
4765 let r = _mm256_hsubs_epi16(a, b);
4766 let e = _mm256_insert_epi16(_mm256_set1_epi16(0), 0x7FFF, 0);
4767 assert_eq_m256i(r, e);
4768 }
4769
4770 #[simd_test(enable = "avx2")]
4771 unsafe fn test_mm256_madd_epi16() {
4772 let a = _mm256_set1_epi16(2);
4773 let b = _mm256_set1_epi16(4);
4774 let r = _mm256_madd_epi16(a, b);
4775 let e = _mm256_set1_epi32(16);
4776 assert_eq_m256i(r, e);
4777 }
4778
4779 #[simd_test(enable = "avx2")]
4780 unsafe fn test_mm256_inserti128_si256() {
4781 let a = _mm256_setr_epi64x(1, 2, 3, 4);
4782 let b = _mm_setr_epi64x(7, 8);
4783 let r = _mm256_inserti128_si256(a, b, 0b01);
4784 let e = _mm256_setr_epi64x(1, 2, 7, 8);
4785 assert_eq_m256i(r, e);
4786 }
4787
4788 #[simd_test(enable = "avx2")]
4789 unsafe fn test_mm256_maddubs_epi16() {
4790 let a = _mm256_set1_epi8(2);
4791 let b = _mm256_set1_epi8(4);
4792 let r = _mm256_maddubs_epi16(a, b);
4793 let e = _mm256_set1_epi16(16);
4794 assert_eq_m256i(r, e);
4795 }
4796
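// The maskload/maskstore tests below rely on the masked-memory semantics:
// only elements whose mask element has its MSB set are transferred;
// masked-off lanes read back as 0 on loads and leave the destination memory
// untouched on stores.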
4797 #[simd_test(enable = "avx2")]
4798 unsafe fn test_mm_maskload_epi32() {
4799 let nums = [1, 2, 3, 4];
4800 let a = &nums as *const i32;
4801 let mask = _mm_setr_epi32(-1, 0, 0, -1);
4802 let r = _mm_maskload_epi32(a, mask);
4803 let e = _mm_setr_epi32(1, 0, 0, 4);
4804 assert_eq_m128i(r, e);
4805 }
4806
4807 #[simd_test(enable = "avx2")]
4808 unsafe fn test_mm256_maskload_epi32() {
4809 let nums = [1, 2, 3, 4, 5, 6, 7, 8];
4810 let a = &nums as *const i32;
4811 let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0);
4812 let r = _mm256_maskload_epi32(a, mask);
4813 let e = _mm256_setr_epi32(1, 0, 0, 4, 0, 6, 7, 0);
4814 assert_eq_m256i(r, e);
4815 }
4816
4817 #[simd_test(enable = "avx2")]
4818 unsafe fn test_mm_maskload_epi64() {
4819 let nums = [1_i64, 2_i64];
4820 let a = &nums as *const i64;
4821 let mask = _mm_setr_epi64x(0, -1);
4822 let r = _mm_maskload_epi64(a, mask);
4823 let e = _mm_setr_epi64x(0, 2);
4824 assert_eq_m128i(r, e);
4825 }
4826
4827 #[simd_test(enable = "avx2")]
4828 unsafe fn test_mm256_maskload_epi64() {
4829 let nums = [1_i64, 2_i64, 3_i64, 4_i64];
4830 let a = &nums as *const i64;
4831 let mask = _mm256_setr_epi64x(0, -1, -1, 0);
4832 let r = _mm256_maskload_epi64(a, mask);
4833 let e = _mm256_setr_epi64x(0, 2, 3, 0);
4834 assert_eq_m256i(r, e);
4835 }
4836
4837 #[simd_test(enable = "avx2")]
4838 unsafe fn test_mm_maskstore_epi32() {
4839 let a = _mm_setr_epi32(1, 2, 3, 4);
4840 let mut arr = [-1, -1, -1, -1];
4841 let mask = _mm_setr_epi32(-1, 0, 0, -1);
4842 _mm_maskstore_epi32(arr.as_mut_ptr(), mask, a);
4843 let e = [1, -1, -1, 4];
4844 assert_eq!(arr, e);
4845 }
4846
4847 #[simd_test(enable = "avx2")]
4848 unsafe fn test_mm256_maskstore_epi32() {
4849 let a = _mm256_setr_epi32(1, 0x6d726f, 3, 42, 0x777161, 6, 7, 8);
4850 let mut arr = [-1, -1, -1, 0x776173, -1, 0x68657265, -1, -1];
4851 let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0);
4852 _mm256_maskstore_epi32(arr.as_mut_ptr(), mask, a);
4853 let e = [1, -1, -1, 42, -1, 6, 7, -1];
4854 assert_eq!(arr, e);
4855 }
4856
4857 #[simd_test(enable = "avx2")]
4858 unsafe fn test_mm_maskstore_epi64() {
4859 let a = _mm_setr_epi64x(1_i64, 2_i64);
4860 let mut arr = [-1_i64, -1_i64];
4861 let mask = _mm_setr_epi64x(0, -1);
4862 _mm_maskstore_epi64(arr.as_mut_ptr(), mask, a);
4863 let e = [-1, 2];
4864 assert_eq!(arr, e);
4865 }
4866
4867 #[simd_test(enable = "avx2")]
4868 unsafe fn test_mm256_maskstore_epi64() {
4869 let a = _mm256_setr_epi64x(1_i64, 2_i64, 3_i64, 4_i64);
4870 let mut arr = [-1_i64, -1_i64, -1_i64, -1_i64];
4871 let mask = _mm256_setr_epi64x(0, -1, -1, 0);
4872 _mm256_maskstore_epi64(arr.as_mut_ptr(), mask, a);
4873 let e = [-1, 2, 3, -1];
4874 assert_eq!(arr, e);
4875 }
4876
4877 #[simd_test(enable = "avx2")]
4878 unsafe fn test_mm256_max_epi16() {
4879 let a = _mm256_set1_epi16(2);
4880 let b = _mm256_set1_epi16(4);
4881 let r = _mm256_max_epi16(a, b);
4882 assert_eq_m256i(r, b);
4883 }
4884
4885 #[simd_test(enable = "avx2")]
4886 unsafe fn test_mm256_max_epi32() {
4887 let a = _mm256_set1_epi32(2);
4888 let b = _mm256_set1_epi32(4);
4889 let r = _mm256_max_epi32(a, b);
4890 assert_eq_m256i(r, b);
4891 }
4892
4893 #[simd_test(enable = "avx2")]
4894 unsafe fn test_mm256_max_epi8() {
4895 let a = _mm256_set1_epi8(2);
4896 let b = _mm256_set1_epi8(4);
4897 let r = _mm256_max_epi8(a, b);
4898 assert_eq_m256i(r, b);
4899 }
4900
4901 #[simd_test(enable = "avx2")]
4902 unsafe fn test_mm256_max_epu16() {
4903 let a = _mm256_set1_epi16(2);
4904 let b = _mm256_set1_epi16(4);
4905 let r = _mm256_max_epu16(a, b);
4906 assert_eq_m256i(r, b);
4907 }
4908
4909 #[simd_test(enable = "avx2")]
4910 unsafe fn test_mm256_max_epu32() {
4911 let a = _mm256_set1_epi32(2);
4912 let b = _mm256_set1_epi32(4);
4913 let r = _mm256_max_epu32(a, b);
4914 assert_eq_m256i(r, b);
4915 }
4916
4917 #[simd_test(enable = "avx2")]
4918 unsafe fn test_mm256_max_epu8() {
4919 let a = _mm256_set1_epi8(2);
4920 let b = _mm256_set1_epi8(4);
4921 let r = _mm256_max_epu8(a, b);
4922 assert_eq_m256i(r, b);
4923 }
4924
4925 #[simd_test(enable = "avx2")]
4926 unsafe fn test_mm256_min_epi16() {
4927 let a = _mm256_set1_epi16(2);
4928 let b = _mm256_set1_epi16(4);
4929 let r = _mm256_min_epi16(a, b);
4930 assert_eq_m256i(r, a);
4931 }
4932
4933 #[simd_test(enable = "avx2")]
4934 unsafe fn test_mm256_min_epi32() {
4935 let a = _mm256_set1_epi32(2);
4936 let b = _mm256_set1_epi32(4);
4937 let r = _mm256_min_epi32(a, b);
4938 assert_eq_m256i(r, a);
4939 }
4940
4941 #[simd_test(enable = "avx2")]
4942 unsafe fn test_mm256_min_epi8() {
4943 let a = _mm256_set1_epi8(2);
4944 let b = _mm256_set1_epi8(4);
4945 let r = _mm256_min_epi8(a, b);
4946 assert_eq_m256i(r, a);
4947 }
4948
4949 #[simd_test(enable = "avx2")]
4950 unsafe fn test_mm256_min_epu16() {
4951 let a = _mm256_set1_epi16(2);
4952 let b = _mm256_set1_epi16(4);
4953 let r = _mm256_min_epu16(a, b);
4954 assert_eq_m256i(r, a);
4955 }
4956
4957 #[simd_test(enable = "avx2")]
4958 unsafe fn test_mm256_min_epu32() {
4959 let a = _mm256_set1_epi32(2);
4960 let b = _mm256_set1_epi32(4);
4961 let r = _mm256_min_epu32(a, b);
4962 assert_eq_m256i(r, a);
4963 }
4964
4965 #[simd_test(enable = "avx2")]
4966 unsafe fn test_mm256_min_epu8() {
4967 let a = _mm256_set1_epi8(2);
4968 let b = _mm256_set1_epi8(4);
4969 let r = _mm256_min_epu8(a, b);
4970 assert_eq_m256i(r, a);
4971 }
4972
4973 #[simd_test(enable = "avx2")]
4974 unsafe fn test_mm256_movemask_epi8() {
4975 let a = _mm256_set1_epi8(-1);
4976 let r = _mm256_movemask_epi8(a);
4977 let e = -1;
4978 assert_eq!(r, e);
4979 }
4980
4981 #[simd_test(enable = "avx2")]
4982 unsafe fn test_mm256_mpsadbw_epu8() {
4983 let a = _mm256_set1_epi8(2);
4984 let b = _mm256_set1_epi8(4);
4985 let r = _mm256_mpsadbw_epu8(a, b, 0);
4986 let e = _mm256_set1_epi16(8);
4987 assert_eq_m256i(r, e);
4988 }
4989
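// `_mm256_mul_epi32`/`_mm256_mul_epu32` multiply only the low 32-bit half of
// each 64-bit element (the even-indexed dwords 0, 2, 4, 6), widening to
// 64-bit products; hence the expected (0, 0, 2*5, 2*7) = (0, 0, 10, 14).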
4990 #[simd_test(enable = "avx2")]
4991 unsafe fn test_mm256_mul_epi32() {
4992 let a = _mm256_setr_epi32(0, 0, 0, 0, 2, 2, 2, 2);
4993 let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4994 let r = _mm256_mul_epi32(a, b);
4995 let e = _mm256_setr_epi64x(0, 0, 10, 14);
4996 assert_eq_m256i(r, e);
4997 }
4998
4999 #[simd_test(enable = "avx2")]
5000 unsafe fn test_mm256_mul_epu32() {
5001 let a = _mm256_setr_epi32(0, 0, 0, 0, 2, 2, 2, 2);
5002 let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
5003 let r = _mm256_mul_epu32(a, b);
5004 let e = _mm256_setr_epi64x(0, 0, 10, 14);
5005 assert_eq_m256i(r, e);
5006 }
5007
5008 #[simd_test(enable = "avx2")]
5009 unsafe fn test_mm256_mulhi_epi16() {
5010 let a = _mm256_set1_epi16(6535);
5011 let b = _mm256_set1_epi16(6535);
5012 let r = _mm256_mulhi_epi16(a, b);
5013 let e = _mm256_set1_epi16(651);
5014 assert_eq_m256i(r, e);
5015 }
5016
5017 #[simd_test(enable = "avx2")]
5018 unsafe fn test_mm256_mulhi_epu16() {
5019 let a = _mm256_set1_epi16(6535);
5020 let b = _mm256_set1_epi16(6535);
5021 let r = _mm256_mulhi_epu16(a, b);
5022 let e = _mm256_set1_epi16(651);
5023 assert_eq_m256i(r, e);
5024 }
5025
5026 #[simd_test(enable = "avx2")]
5027 unsafe fn test_mm256_mullo_epi16() {
5028 let a = _mm256_set1_epi16(2);
5029 let b = _mm256_set1_epi16(4);
5030 let r = _mm256_mullo_epi16(a, b);
5031 let e = _mm256_set1_epi16(8);
5032 assert_eq_m256i(r, e);
5033 }
5034
5035 #[simd_test(enable = "avx2")]
5036 unsafe fn test_mm256_mullo_epi32() {
5037 let a = _mm256_set1_epi32(2);
5038 let b = _mm256_set1_epi32(4);
5039 let r = _mm256_mullo_epi32(a, b);
5040 let e = _mm256_set1_epi32(8);
5041 assert_eq_m256i(r, e);
5042 }
5043
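// `_mm256_mulhrs_epi16` computes (a * b + 0x4000) >> 15 per element
// (multiply, round, keep the high half), so 2 * 4 rounds down to 0 below.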
5044 #[simd_test(enable = "avx2")]
5045 unsafe fn test_mm256_mulhrs_epi16() {
5046 let a = _mm256_set1_epi16(2);
5047 let b = _mm256_set1_epi16(4);
5048 let r = _mm256_mulhrs_epi16(a, b);
5049 let e = _mm256_set1_epi16(0);
5050 assert_eq_m256i(r, e);
5051 }
5052
5053 #[simd_test(enable = "avx2")]
5054 unsafe fn test_mm256_or_si256() {
5055 let a = _mm256_set1_epi8(-1);
5056 let b = _mm256_set1_epi8(0);
5057 let r = _mm256_or_si256(a, b);
5058 assert_eq_m256i(r, a);
5059 }
5060
5061 #[simd_test(enable = "avx2")]
5062 unsafe fn test_mm256_packs_epi16() {
5063 let a = _mm256_set1_epi16(2);
5064 let b = _mm256_set1_epi16(4);
5065 let r = _mm256_packs_epi16(a, b);
5066 #[rustfmt::skip]
5067 let e = _mm256_setr_epi8(
5068 2, 2, 2, 2, 2, 2, 2, 2,
5069 4, 4, 4, 4, 4, 4, 4, 4,
5070 2, 2, 2, 2, 2, 2, 2, 2,
5071 4, 4, 4, 4, 4, 4, 4, 4,
5072 );
5073
5074 assert_eq_m256i(r, e);
5075 }
5076
5077 #[simd_test(enable = "avx2")]
5078 unsafe fn test_mm256_packs_epi32() {
5079 let a = _mm256_set1_epi32(2);
5080 let b = _mm256_set1_epi32(4);
5081 let r = _mm256_packs_epi32(a, b);
5082 let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
5083
5084 assert_eq_m256i(r, e);
5085 }
5086
5087 #[simd_test(enable = "avx2")]
5088 unsafe fn test_mm256_packus_epi16() {
5089 let a = _mm256_set1_epi16(2);
5090 let b = _mm256_set1_epi16(4);
5091 let r = _mm256_packus_epi16(a, b);
5092 #[rustfmt::skip]
5093 let e = _mm256_setr_epi8(
5094 2, 2, 2, 2, 2, 2, 2, 2,
5095 4, 4, 4, 4, 4, 4, 4, 4,
5096 2, 2, 2, 2, 2, 2, 2, 2,
5097 4, 4, 4, 4, 4, 4, 4, 4,
5098 );
5099
5100 assert_eq_m256i(r, e);
5101 }
5102
5103 #[simd_test(enable = "avx2")]
5104 unsafe fn test_mm256_packus_epi32() {
5105 let a = _mm256_set1_epi32(2);
5106 let b = _mm256_set1_epi32(4);
5107 let r = _mm256_packus_epi32(a, b);
5108 let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
5109
5110 assert_eq_m256i(r, e);
5111 }
5112
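// `_mm256_sad_epu8` sums the absolute differences of each group of eight
// unsigned bytes into one 64-bit result: 8 * |2 - 4| = 16.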
5113 #[simd_test(enable = "avx2")]
5114 unsafe fn test_mm256_sad_epu8() {
5115 let a = _mm256_set1_epi8(2);
5116 let b = _mm256_set1_epi8(4);
5117 let r = _mm256_sad_epu8(a, b);
5118 let e = _mm256_set1_epi64x(16);
5119 assert_eq_m256i(r, e);
5120 }
5121
5122 #[simd_test(enable = "avx2")]
5123 unsafe fn test_mm256_shufflehi_epi16() {
5124 #[rustfmt::skip]
5125 let a = _mm256_setr_epi16(
5126 0, 1, 2, 3, 11, 22, 33, 44,
5127 4, 5, 6, 7, 55, 66, 77, 88,
5128 );
5129 #[rustfmt::skip]
5130 let e = _mm256_setr_epi16(
5131 0, 1, 2, 3, 44, 22, 22, 11,
5132 4, 5, 6, 7, 88, 66, 66, 55,
5133 );
5134 let r = _mm256_shufflehi_epi16(a, 0b00_01_01_11);
5135 assert_eq_m256i(r, e);
5136 }
5137
5138 #[simd_test(enable = "avx2")]
5139 unsafe fn test_mm256_shufflelo_epi16() {
5140 #[rustfmt::skip]
5141 let a = _mm256_setr_epi16(
5142 11, 22, 33, 44, 0, 1, 2, 3,
5143 55, 66, 77, 88, 4, 5, 6, 7,
5144 );
5145 #[rustfmt::skip]
5146 let e = _mm256_setr_epi16(
5147 44, 22, 22, 11, 0, 1, 2, 3,
5148 88, 66, 66, 55, 4, 5, 6, 7,
5149 );
5150 let r = _mm256_shufflelo_epi16(a, 0b00_01_01_11);
5151 assert_eq_m256i(r, e);
5152 }
5153
5154 #[simd_test(enable = "avx2")]
5155 unsafe fn test_mm256_sign_epi16() {
5156 let a = _mm256_set1_epi16(2);
5157 let b = _mm256_set1_epi16(-1);
5158 let r = _mm256_sign_epi16(a, b);
5159 let e = _mm256_set1_epi16(-2);
5160 assert_eq_m256i(r, e);
5161 }
5162
5163 #[simd_test(enable = "avx2")]
5164 unsafe fn test_mm256_sign_epi32() {
5165 let a = _mm256_set1_epi32(2);
5166 let b = _mm256_set1_epi32(-1);
5167 let r = _mm256_sign_epi32(a, b);
5168 let e = _mm256_set1_epi32(-2);
5169 assert_eq_m256i(r, e);
5170 }
5171
5172 #[simd_test(enable = "avx2")]
5173 unsafe fn test_mm256_sign_epi8() {
5174 let a = _mm256_set1_epi8(2);
5175 let b = _mm256_set1_epi8(-1);
5176 let r = _mm256_sign_epi8(a, b);
5177 let e = _mm256_set1_epi8(-2);
5178 assert_eq_m256i(r, e);
5179 }
5180
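// Shift-left variants tested below: `_mm256_sll_*` take the shift count from
// the low 64 bits of an `__m128i`, `_mm256_slli_*` take an immediate count,
// and `_mm256_sllv_*` shift each element by the matching element of a count
// vector.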
5181 #[simd_test(enable = "avx2")]
5182 unsafe fn test_mm256_sll_epi16() {
5183 let a = _mm256_set1_epi16(0xFF);
5184 let b = _mm_insert_epi16(_mm_set1_epi16(0), 4, 0);
5185 let r = _mm256_sll_epi16(a, b);
5186 assert_eq_m256i(r, _mm256_set1_epi16(0xFF0));
5187 }
5188
5189 #[simd_test(enable = "avx2")]
5190 unsafe fn test_mm256_sll_epi32() {
5191 let a = _mm256_set1_epi32(0xFFFF);
5192 let b = _mm_insert_epi32(_mm_set1_epi32(0), 4, 0);
5193 let r = _mm256_sll_epi32(a, b);
5194 assert_eq_m256i(r, _mm256_set1_epi32(0xFFFF0));
5195 }
5196
5197 #[simd_test(enable = "avx2")]
5198 unsafe fn test_mm256_sll_epi64() {
5199 let a = _mm256_set1_epi64x(0xFFFFFFFF);
5200 let b = _mm_insert_epi64(_mm_set1_epi64x(0), 4, 0);
5201 let r = _mm256_sll_epi64(a, b);
5202 assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFFF0));
5203 }
5204
5205 #[simd_test(enable = "avx2")]
5206 unsafe fn test_mm256_slli_epi16() {
5207 assert_eq_m256i(
5208 _mm256_slli_epi16(_mm256_set1_epi16(0xFF), 4),
5209 _mm256_set1_epi16(0xFF0),
5210 );
5211 }
5212
5213 #[simd_test(enable = "avx2")]
5214 unsafe fn test_mm256_slli_epi32() {
5215 assert_eq_m256i(
5216 _mm256_slli_epi32(_mm256_set1_epi32(0xFFFF), 4),
5217 _mm256_set1_epi32(0xFFFF0),
5218 );
5219 }
5220
5221 #[simd_test(enable = "avx2")]
5222 unsafe fn test_mm256_slli_epi64() {
5223 assert_eq_m256i(
5224 _mm256_slli_epi64(_mm256_set1_epi64x(0xFFFFFFFF), 4),
5225 _mm256_set1_epi64x(0xFFFFFFFF0),
5226 );
5227 }
5228
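// `_mm256_slli_si256`/`_mm256_srli_si256` shift by whole bytes and operate on
// each 128-bit lane independently, so zero bytes are shifted in at both lane
// boundaries.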
5229 #[simd_test(enable = "avx2")]
5230 unsafe fn test_mm256_slli_si256() {
5231 let a = _mm256_set1_epi64x(0xFFFFFFFF);
5232 let r = _mm256_slli_si256(a, 3);
5233 assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFFF000000));
5234 }
5235
5236 #[simd_test(enable = "avx2")]
5237 unsafe fn test_mm_sllv_epi32() {
5238 let a = _mm_set1_epi32(2);
5239 let b = _mm_set1_epi32(1);
5240 let r = _mm_sllv_epi32(a, b);
5241 let e = _mm_set1_epi32(4);
5242 assert_eq_m128i(r, e);
5243 }
5244
5245 #[simd_test(enable = "avx2")]
5246 unsafe fn test_mm256_sllv_epi32() {
5247 let a = _mm256_set1_epi32(2);
5248 let b = _mm256_set1_epi32(1);
5249 let r = _mm256_sllv_epi32(a, b);
5250 let e = _mm256_set1_epi32(4);
5251 assert_eq_m256i(r, e);
5252 }
5253
5254 #[simd_test(enable = "avx2")]
5255 unsafe fn test_mm_sllv_epi64() {
5256 let a = _mm_set1_epi64x(2);
5257 let b = _mm_set1_epi64x(1);
5258 let r = _mm_sllv_epi64(a, b);
5259 let e = _mm_set1_epi64x(4);
5260 assert_eq_m128i(r, e);
5261 }
5262
5263 #[simd_test(enable = "avx2")]
5264 unsafe fn test_mm256_sllv_epi64() {
5265 let a = _mm256_set1_epi64x(2);
5266 let b = _mm256_set1_epi64x(1);
5267 let r = _mm256_sllv_epi64(a, b);
5268 let e = _mm256_set1_epi64x(4);
5269 assert_eq_m256i(r, e);
5270 }
5271
5272 #[simd_test(enable = "avx2")]
5273 unsafe fn test_mm256_sra_epi16() {
5274 let a = _mm256_set1_epi16(-1);
5275 let b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
5276 let r = _mm256_sra_epi16(a, b);
5277 assert_eq_m256i(r, _mm256_set1_epi16(-1));
5278 }
5279
5280 #[simd_test(enable = "avx2")]
5281 unsafe fn test_mm256_sra_epi32() {
5282 let a = _mm256_set1_epi32(-1);
5283 let b = _mm_insert_epi32(_mm_set1_epi32(0), 1, 0);
5284 let r = _mm256_sra_epi32(a, b);
5285 assert_eq_m256i(r, _mm256_set1_epi32(-1));
5286 }
5287
5288 #[simd_test(enable = "avx2")]
5289 unsafe fn test_mm256_srai_epi16() {
5290 assert_eq_m256i(
5291 _mm256_srai_epi16(_mm256_set1_epi16(-1), 1),
5292 _mm256_set1_epi16(-1),
5293 );
5294 }
5295
5296 #[simd_test(enable = "avx2")]
5297 unsafe fn test_mm256_srai_epi32() {
5298 assert_eq_m256i(
5299 _mm256_srai_epi32(_mm256_set1_epi32(-1), 1),
5300 _mm256_set1_epi32(-1),
5301 );
5302 }
5303
5304 #[simd_test(enable = "avx2")]
5305 unsafe fn test_mm_srav_epi32() {
5306 let a = _mm_set1_epi32(4);
5307 let count = _mm_set1_epi32(1);
5308 let r = _mm_srav_epi32(a, count);
5309 let e = _mm_set1_epi32(2);
5310 assert_eq_m128i(r, e);
5311 }
5312
5313 #[simd_test(enable = "avx2")]
5314 unsafe fn test_mm256_srav_epi32() {
5315 let a = _mm256_set1_epi32(4);
5316 let count = _mm256_set1_epi32(1);
5317 let r = _mm256_srav_epi32(a, count);
5318 let e = _mm256_set1_epi32(2);
5319 assert_eq_m256i(r, e);
5320 }
5321
5322 #[simd_test(enable = "avx2")]
5323 unsafe fn test_mm256_srli_si256() {
5324 #[rustfmt::skip]
5325 let a = _mm256_setr_epi8(
5326 1, 2, 3, 4, 5, 6, 7, 8,
5327 9, 10, 11, 12, 13, 14, 15, 16,
5328 17, 18, 19, 20, 21, 22, 23, 24,
5329 25, 26, 27, 28, 29, 30, 31, 32,
5330 );
5331 let r = _mm256_srli_si256(a, 3);
5332 #[rustfmt::skip]
5333 let e = _mm256_setr_epi8(
5334 4, 5, 6, 7, 8, 9, 10, 11,
5335 12, 13, 14, 15, 16, 0, 0, 0,
5336 20, 21, 22, 23, 24, 25, 26, 27,
5337 28, 29, 30, 31, 32, 0, 0, 0,
5338 );
5339 assert_eq_m256i(r, e);
5340 }
5341
5342 #[simd_test(enable = "avx2")]
5343 unsafe fn test_mm256_srl_epi16() {
5344 let a = _mm256_set1_epi16(0xFF);
5345 let b = _mm_insert_epi16(_mm_set1_epi16(0), 4, 0);
5346 let r = _mm256_srl_epi16(a, b);
5347 assert_eq_m256i(r, _mm256_set1_epi16(0xF));
5348 }
5349
5350 #[simd_test(enable = "avx2")]
5351 unsafe fn test_mm256_srl_epi32() {
5352 let a = _mm256_set1_epi32(0xFFFF);
5353 let b = _mm_insert_epi32(_mm_set1_epi32(0), 4, 0);
5354 let r = _mm256_srl_epi32(a, b);
5355 assert_eq_m256i(r, _mm256_set1_epi32(0xFFF));
5356 }
5357
5358 #[simd_test(enable = "avx2")]
5359 unsafe fn test_mm256_srl_epi64() {
5360 let a = _mm256_set1_epi64x(0xFFFFFFFF);
5361 let b = _mm_setr_epi64x(4, 0);
5362 let r = _mm256_srl_epi64(a, b);
5363 assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFF));
5364 }
5365
5366 #[simd_test(enable = "avx2")]
5367 unsafe fn test_mm256_srli_epi16() {
5368 assert_eq_m256i(
5369 _mm256_srli_epi16(_mm256_set1_epi16(0xFF), 4),
5370 _mm256_set1_epi16(0xF),
5371 );
5372 }
5373
5374 #[simd_test(enable = "avx2")]
5375 unsafe fn test_mm256_srli_epi32() {
5376 assert_eq_m256i(
5377 _mm256_srli_epi32(_mm256_set1_epi32(0xFFFF), 4),
5378 _mm256_set1_epi32(0xFFF),
5379 );
5380 }
5381
5382 #[simd_test(enable = "avx2")]
5383 unsafe fn test_mm256_srli_epi64() {
5384 assert_eq_m256i(
5385 _mm256_srli_epi64(_mm256_set1_epi64x(0xFFFFFFFF), 4),
5386 _mm256_set1_epi64x(0xFFFFFFF),
5387 );
5388 }
5389
5390 #[simd_test(enable = "avx2")]
5391 unsafe fn test_mm_srlv_epi32() {
5392 let a = _mm_set1_epi32(2);
5393 let count = _mm_set1_epi32(1);
5394 let r = _mm_srlv_epi32(a, count);
5395 let e = _mm_set1_epi32(1);
5396 assert_eq_m128i(r, e);
5397 }
5398
5399 #[simd_test(enable = "avx2")]
5400 unsafe fn test_mm256_srlv_epi32() {
5401 let a = _mm256_set1_epi32(2);
5402 let count = _mm256_set1_epi32(1);
5403 let r = _mm256_srlv_epi32(a, count);
5404 let e = _mm256_set1_epi32(1);
5405 assert_eq_m256i(r, e);
5406 }
5407
5408 #[simd_test(enable = "avx2")]
5409 unsafe fn test_mm_srlv_epi64() {
5410 let a = _mm_set1_epi64x(2);
5411 let count = _mm_set1_epi64x(1);
5412 let r = _mm_srlv_epi64(a, count);
5413 let e = _mm_set1_epi64x(1);
5414 assert_eq_m128i(r, e);
5415 }
5416
5417 #[simd_test(enable = "avx2")]
5418 unsafe fn test_mm256_srlv_epi64() {
5419 let a = _mm256_set1_epi64x(2);
5420 let count = _mm256_set1_epi64x(1);
5421 let r = _mm256_srlv_epi64(a, count);
5422 let e = _mm256_set1_epi64x(1);
5423 assert_eq_m256i(r, e);
5424 }
5425
5426 #[simd_test(enable = "avx2")]
5427 unsafe fn test_mm256_sub_epi16() {
5428 let a = _mm256_set1_epi16(4);
5429 let b = _mm256_set1_epi16(2);
5430 let r = _mm256_sub_epi16(a, b);
5431 assert_eq_m256i(r, b);
5432 }
5433
5434 #[simd_test(enable = "avx2")]
5435 unsafe fn test_mm256_sub_epi32() {
5436 let a = _mm256_set1_epi32(4);
5437 let b = _mm256_set1_epi32(2);
5438 let r = _mm256_sub_epi32(a, b);
5439 assert_eq_m256i(r, b);
5440 }
5441
5442 #[simd_test(enable = "avx2")]
5443 unsafe fn test_mm256_sub_epi64() {
5444 let a = _mm256_set1_epi64x(4);
5445 let b = _mm256_set1_epi64x(2);
5446 let r = _mm256_sub_epi64(a, b);
5447 assert_eq_m256i(r, b);
5448 }
5449
5450 #[simd_test(enable = "avx2")]
5451 unsafe fn test_mm256_sub_epi8() {
5452 let a = _mm256_set1_epi8(4);
5453 let b = _mm256_set1_epi8(2);
5454 let r = _mm256_sub_epi8(a, b);
5455 assert_eq_m256i(r, b);
5456 }
5457
5458 #[simd_test(enable = "avx2")]
5459 unsafe fn test_mm256_subs_epi16() {
5460 let a = _mm256_set1_epi16(4);
5461 let b = _mm256_set1_epi16(2);
5462 let r = _mm256_subs_epi16(a, b);
5463 assert_eq_m256i(r, b);
5464 }
5465
5466 #[simd_test(enable = "avx2")]
5467 unsafe fn test_mm256_subs_epi8() {
5468 let a = _mm256_set1_epi8(4);
5469 let b = _mm256_set1_epi8(2);
5470 let r = _mm256_subs_epi8(a, b);
5471 assert_eq_m256i(r, b);
5472 }
5473
5474 #[simd_test(enable = "avx2")]
5475 unsafe fn test_mm256_subs_epu16() {
5476 let a = _mm256_set1_epi16(4);
5477 let b = _mm256_set1_epi16(2);
5478 let r = _mm256_subs_epu16(a, b);
5479 assert_eq_m256i(r, b);
5480 }
5481
5482 #[simd_test(enable = "avx2")]
5483 unsafe fn test_mm256_subs_epu8() {
5484 let a = _mm256_set1_epi8(4);
5485 let b = _mm256_set1_epi8(2);
5486 let r = _mm256_subs_epu8(a, b);
5487 assert_eq_m256i(r, b);
5488 }
5489
5490 #[simd_test(enable = "avx2")]
5491 unsafe fn test_mm256_xor_si256() {
5492 let a = _mm256_set1_epi8(5);
5493 let b = _mm256_set1_epi8(3);
5494 let r = _mm256_xor_si256(a, b);
5495 assert_eq_m256i(r, _mm256_set1_epi8(6));
5496 }
5497
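// `_mm256_alignr_epi8` works per 128-bit lane: it concatenates the lane of
// `a` (high) with the lane of `b` (low), shifts the 32-byte value right by the
// byte offset, and keeps the low 16 bytes. Offsets of 32 or more produce
// zero, offset 16 yields `a`, and offset 0 yields `b`, as asserted below.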
5498 #[simd_test(enable = "avx2")]
5499 unsafe fn test_mm256_alignr_epi8() {
5500 #[rustfmt::skip]
5501 let a = _mm256_setr_epi8(
5502 1, 2, 3, 4, 5, 6, 7, 8,
5503 9, 10, 11, 12, 13, 14, 15, 16,
5504 17, 18, 19, 20, 21, 22, 23, 24,
5505 25, 26, 27, 28, 29, 30, 31, 32,
5506 );
5507 #[rustfmt::skip]
5508 let b = _mm256_setr_epi8(
5509 -1, -2, -3, -4, -5, -6, -7, -8,
5510 -9, -10, -11, -12, -13, -14, -15, -16,
5511 -17, -18, -19, -20, -21, -22, -23, -24,
5512 -25, -26, -27, -28, -29, -30, -31, -32,
5513 );
5514 let r = _mm256_alignr_epi8(a, b, 33);
5515 assert_eq_m256i(r, _mm256_set1_epi8(0));
5516
5517 let r = _mm256_alignr_epi8(a, b, 17);
5518 #[rustfmt::skip]
5519 let expected = _mm256_setr_epi8(
5520 2, 3, 4, 5, 6, 7, 8, 9,
5521 10, 11, 12, 13, 14, 15, 16, 0,
5522 18, 19, 20, 21, 22, 23, 24, 25,
5523 26, 27, 28, 29, 30, 31, 32, 0,
5524 );
5525 assert_eq_m256i(r, expected);
5526
5527 let r = _mm256_alignr_epi8(a, b, 4);
5528 #[rustfmt::skip]
5529 let expected = _mm256_setr_epi8(
5530 -5, -6, -7, -8, -9, -10, -11, -12,
5531 -13, -14, -15, -16, 1, 2, 3, 4,
5532 -21, -22, -23, -24, -25, -26, -27, -28,
5533 -29, -30, -31, -32, 17, 18, 19, 20,
5534 );
5535 assert_eq_m256i(r, expected);
5536
5537 #[rustfmt::skip]
5538 let expected = _mm256_setr_epi8(
5539 1, 2, 3, 4, 5, 6, 7, 8,
5540 9, 10, 11, 12, 13, 14, 15, 16,
5541 17, 18, 19, 20, 21, 22, 23, 24,
5542 25, 26, 27, 28, 29, 30, 31, 32,
5543 );
5544 let r = _mm256_alignr_epi8(a, b, 16);
5545 assert_eq_m256i(r, expected);
5546
5547 let r = _mm256_alignr_epi8(a, b, 15);
5548 #[rustfmt::skip]
5549 let expected = _mm256_setr_epi8(
5550 -16, 1, 2, 3, 4, 5, 6, 7,
5551 8, 9, 10, 11, 12, 13, 14, 15,
5552 -32, 17, 18, 19, 20, 21, 22, 23,
5553 24, 25, 26, 27, 28, 29, 30, 31,
5554 );
5555 assert_eq_m256i(r, expected);
5556
5557 let r = _mm256_alignr_epi8(a, b, 0);
5558 assert_eq_m256i(r, b);
5559 }
5560
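// `_mm256_shuffle_epi8` is a per-lane byte shuffle: the low 4 bits of each
// control byte index into the same 128-bit lane of `a`, and a set MSB
// (e.g. the `128u8` controls below) zeroes that output byte.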
5561 #[simd_test(enable = "avx2")]
5562 unsafe fn test_mm256_shuffle_epi8() {
5563 #[rustfmt::skip]
5564 let a = _mm256_setr_epi8(
5565 1, 2, 3, 4, 5, 6, 7, 8,
5566 9, 10, 11, 12, 13, 14, 15, 16,
5567 17, 18, 19, 20, 21, 22, 23, 24,
5568 25, 26, 27, 28, 29, 30, 31, 32,
5569 );
5570 #[rustfmt::skip]
5571 let b = _mm256_setr_epi8(
5572 4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
5573 12, 5, 5, 10, 4, 1, 8, 0,
5574 4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
5575 12, 5, 5, 10, 4, 1, 8, 0,
5576 );
5577 #[rustfmt::skip]
5578 let expected = _mm256_setr_epi8(
5579 5, 0, 5, 4, 9, 13, 7, 4,
5580 13, 6, 6, 11, 5, 2, 9, 1,
5581 21, 0, 21, 20, 25, 29, 23, 20,
5582 29, 22, 22, 27, 21, 18, 25, 17,
5583 );
5584 let r = _mm256_shuffle_epi8(a, b);
5585 assert_eq_m256i(r, expected);
5586 }
5587
5588 #[simd_test(enable = "avx2")]
5589 unsafe fn test_mm256_permutevar8x32_epi32() {
5590 let a = _mm256_setr_epi32(100, 200, 300, 400, 500, 600, 700, 800);
5591 let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
5592 let expected = _mm256_setr_epi32(600, 100, 600, 200, 800, 700, 400, 500);
5593 let r = _mm256_permutevar8x32_epi32(a, b);
5594 assert_eq_m256i(r, expected);
5595 }
5596
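// `_mm256_permute4x64_epi64` packs four 2-bit source selectors into the
// immediate: 0b00_01_00_11 reads lanes [3, 0, 1, 0], giving
// (400, 100, 200, 100) below.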
5597 #[simd_test(enable = "avx2")]
5598 unsafe fn test_mm256_permute4x64_epi64() {
5599 let a = _mm256_setr_epi64x(100, 200, 300, 400);
5600 let expected = _mm256_setr_epi64x(400, 100, 200, 100);
5601 let r = _mm256_permute4x64_epi64(a, 0b00010011);
5602 assert_eq_m256i(r, expected);
5603 }
5604
5605 #[simd_test(enable = "avx2")]
5606 unsafe fn test_mm256_permute2x128_si256() {
5607 let a = _mm256_setr_epi64x(100, 200, 500, 600);
5608 let b = _mm256_setr_epi64x(300, 400, 700, 800);
5609 let r = _mm256_permute2x128_si256(a, b, 0b00_01_00_11);
5610 let e = _mm256_setr_epi64x(700, 800, 500, 600);
5611 assert_eq_m256i(r, e);
5612 }
5613
5614 #[simd_test(enable = "avx2")]
5615 unsafe fn test_mm256_permute4x64_pd() {
5616 let a = _mm256_setr_pd(1., 2., 3., 4.);
5617 let r = _mm256_permute4x64_pd(a, 0b00_01_00_11);
5618 let e = _mm256_setr_pd(4., 1., 2., 1.);
5619 assert_eq_m256d(r, e);
5620 }
5621
5622 #[simd_test(enable = "avx2")]
5623 unsafe fn test_mm256_permutevar8x32_ps() {
5624 let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
5625 let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
5626 let r = _mm256_permutevar8x32_ps(a, b);
5627 let e = _mm256_setr_ps(6., 1., 6., 2., 8., 7., 4., 5.);
5628 assert_eq_m256(r, e);
5629 }
5630
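// The gather tests below load from base + index * scale, where the scale is
// given in bytes; a scale of 4 therefore steps through whole i32/f32 elements
// and a scale of 8 through whole i64/f64 elements.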
5631 #[simd_test(enable = "avx2")]
5632 unsafe fn test_mm_i32gather_epi32() {
5633 let mut arr = [0i32; 128];
5634 for i in 0..128i32 {
5635 arr[i as usize] = i;
5636 }
5637 // A multiplier of 4 is word-addressing
5638 let r = _mm_i32gather_epi32(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48), 4);
5639 assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48));
5640 }
5641
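// For the masked gathers, only indices whose mask element has its MSB set are
// fetched; the remaining lanes are taken from the `src` argument (256 or
// 256.0 in these tests).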
5642 #[simd_test(enable = "avx2")]
5643 unsafe fn test_mm_mask_i32gather_epi32() {
5644 let mut arr = [0i32; 128];
5645 for i in 0..128i32 {
5646 arr[i as usize] = i;
5647 }
5648 // A multiplier of 4 is word-addressing
5649 let r = _mm_mask_i32gather_epi32(
5650 _mm_set1_epi32(256),
5651 arr.as_ptr(),
5652 _mm_setr_epi32(0, 16, 64, 96),
5653 _mm_setr_epi32(-1, -1, -1, 0),
5654 4,
5655 );
5656 assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256));
5657 }
5658
5659 #[simd_test(enable = "avx2")]
5660 unsafe fn test_mm256_i32gather_epi32() {
5661 let mut arr = [0i32; 128];
5662 for i in 0..128i32 {
5663 arr[i as usize] = i;
5664 }
5665 // A multiplier of 4 is word-addressing
5666 let r = _mm256_i32gather_epi32(
5667 arr.as_ptr(),
5668 _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4),
5669 4,
5670 );
5671 assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
5672 }
5673
5674 #[simd_test(enable = "avx2")]
5675 unsafe fn test_mm256_mask_i32gather_epi32() {
5676 let mut arr = [0i32; 128];
5677 for i in 0..128i32 {
5678 arr[i as usize] = i;
5679 }
5680 // A multiplier of 4 is word-addressing
5681 let r = _mm256_mask_i32gather_epi32(
5682 _mm256_set1_epi32(256),
5683 arr.as_ptr(),
5684 _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0),
5685 _mm256_setr_epi32(-1, -1, -1, 0, 0, 0, 0, 0),
5686 4,
5687 );
5688 assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 64, 256, 256, 256, 256, 256));
5689 }
5690
5691 #[simd_test(enable = "avx2")]
5692 unsafe fn test_mm_i32gather_ps() {
5693 let mut arr = [0.0f32; 128];
5694 let mut j = 0.0;
5695 for i in 0..128usize {
5696 arr[i] = j;
5697 j += 1.0;
5698 }
5699 // A multiplier of 4 is word-addressing for f32s
5700 let r = _mm_i32gather_ps(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48), 4);
5701 assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0));
5702 }
5703
5704 #[simd_test(enable = "avx2")]
5705 unsafe fn test_mm_mask_i32gather_ps() {
5706 let mut arr = [0.0f32; 128];
5707 let mut j = 0.0;
5708 for i in 0..128usize {
5709 arr[i] = j;
5710 j += 1.0;
5711 }
5712 // A multiplier of 4 is word-addressing for f32s
5713 let r = _mm_mask_i32gather_ps(
5714 _mm_set1_ps(256.0),
5715 arr.as_ptr(),
5716 _mm_setr_epi32(0, 16, 64, 96),
5717 _mm_setr_ps(-1.0, -1.0, -1.0, 0.0),
5718 4,
5719 );
5720 assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0));
5721 }
5722
5723 #[simd_test(enable = "avx2")]
5724 unsafe fn test_mm256_i32gather_ps() {
5725 let mut arr = [0.0f32; 128];
5726 let mut j = 0.0;
5727 for i in 0..128usize {
5728 arr[i] = j;
5729 j += 1.0;
5730 }
5731 // A multiplier of 4 is word-addressing for f32s
5732 let r = _mm256_i32gather_ps(
5733 arr.as_ptr(),
5734 _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4),
5735 4,
5736 );
5737 assert_eq_m256(r, _mm256_setr_ps(0.0, 16.0, 32.0, 48.0, 1.0, 2.0, 3.0, 4.0));
5738 }
5739
5740 #[simd_test(enable = "avx2")]
5741 unsafe fn test_mm256_mask_i32gather_ps() {
5742 let mut arr = [0.0f32; 128];
5743 let mut j = 0.0;
5744 for i in 0..128usize {
5745 arr[i] = j;
5746 j += 1.0;
5747 }
5748 // A multiplier of 4 is word-addressing for f32s
5749 let r = _mm256_mask_i32gather_ps(
5750 _mm256_set1_ps(256.0),
5751 arr.as_ptr(),
5752 _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0),
5753 _mm256_setr_ps(-1.0, -1.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0),
5754 4,
5755 );
5756 assert_eq_m256(
5757 r,
5758 _mm256_setr_ps(0.0, 16.0, 64.0, 256.0, 256.0, 256.0, 256.0, 256.0),
5759 );
5760 }
5761
5762 #[simd_test(enable = "avx2")]
5763 unsafe fn test_mm_i32gather_epi64() {
5764 let mut arr = [0i64; 128];
5765 for i in 0..128i64 {
5766 arr[i as usize] = i;
5767 }
5768 // A multiplier of 8 is word-addressing for i64s
5769 let r = _mm_i32gather_epi64(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0), 8);
5770 assert_eq_m128i(r, _mm_setr_epi64x(0, 16));
5771 }
5772
5773 #[simd_test(enable = "avx2")]
5774 unsafe fn test_mm_mask_i32gather_epi64() {
5775 let mut arr = [0i64; 128];
5776 for i in 0..128i64 {
5777 arr[i as usize] = i;
5778 }
5779 // A multiplier of 8 is word-addressing for i64s
5780 let r = _mm_mask_i32gather_epi64(
5781 _mm_set1_epi64x(256),
5782 arr.as_ptr(),
5783 _mm_setr_epi32(16, 16, 16, 16),
5784 _mm_setr_epi64x(-1, 0),
5785 8,
5786 );
5787 assert_eq_m128i(r, _mm_setr_epi64x(16, 256));
5788 }
5789
5790 #[simd_test(enable = "avx2")]
5791 unsafe fn test_mm256_i32gather_epi64() {
5792 let mut arr = [0i64; 128];
5793 for i in 0..128i64 {
5794 arr[i as usize] = i;
5795 }
5796 // A multiplier of 8 is word-addressing for i64s
5797 let r = _mm256_i32gather_epi64(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48), 8);
5798 assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48));
5799 }
5800
5801 #[simd_test(enable = "avx2")]
5802 unsafe fn test_mm256_mask_i32gather_epi64() {
5803 let mut arr = [0i64; 128];
5804 for i in 0..128i64 {
5805 arr[i as usize] = i;
5806 }
5807 // A multiplier of 8 is word-addressing for i64s
5808 let r = _mm256_mask_i32gather_epi64(
5809 _mm256_set1_epi64x(256),
5810 arr.as_ptr(),
5811 _mm_setr_epi32(0, 16, 64, 96),
5812 _mm256_setr_epi64x(-1, -1, -1, 0),
5813 8,
5814 );
5815 assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256));
5816 }
5817
5818 #[simd_test(enable = "avx2")]
5819 unsafe fn test_mm_i32gather_pd() {
5820 let mut arr = [0.0f64; 128];
5821 let mut j = 0.0;
5822 for i in 0..128usize {
5823 arr[i] = j;
5824 j += 1.0;
5825 }
5826 // A multiplier of 8 is word-addressing for f64s
5827 let r = _mm_i32gather_pd(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0), 8);
5828 assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0));
5829 }
5830
5831 #[simd_test(enable = "avx2")]
5832 unsafe fn test_mm_mask_i32gather_pd() {
5833 let mut arr = [0.0f64; 128];
5834 let mut j = 0.0;
5835 for i in 0..128usize {
5836 arr[i] = j;
5837 j += 1.0;
5838 }
5839 // A multiplier of 8 is word-addressing for f64s
5840 let r = _mm_mask_i32gather_pd(
5841 _mm_set1_pd(256.0),
5842 arr.as_ptr(),
5843 _mm_setr_epi32(16, 16, 16, 16),
5844 _mm_setr_pd(-1.0, 0.0),
5845 8,
5846 );
5847 assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0));
5848 }
5849
5850 #[simd_test(enable = "avx2")]
5851 unsafe fn test_mm256_i32gather_pd() {
5852 let mut arr = [0.0f64; 128];
5853 let mut j = 0.0;
5854 for i in 0..128usize {
5855 arr[i] = j;
5856 j += 1.0;
5857 }
5858 // A multiplier of 8 is word-addressing for f64s
5859 let r = _mm256_i32gather_pd(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48), 8);
5860 assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0));
5861 }
5862
5863 #[simd_test(enable = "avx2")]
5864 unsafe fn test_mm256_mask_i32gather_pd() {
5865 let mut arr = [0.0f64; 128];
5866 let mut j = 0.0;
5867 for i in 0..128usize {
5868 arr[i] = j;
5869 j += 1.0;
5870 }
5871 // A multiplier of 8 is word-addressing for f64s
5872 let r = _mm256_mask_i32gather_pd(
5873 _mm256_set1_pd(256.0),
5874 arr.as_ptr(),
5875 _mm_setr_epi32(0, 16, 64, 96),
5876 _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0),
5877 8,
5878 );
5879 assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0));
5880 }
5881
5882 #[simd_test(enable = "avx2")]
5883 unsafe fn test_mm_i64gather_epi32() {
5884 let mut arr = [0i32; 128];
5885 for i in 0..128i32 {
5886 arr[i as usize] = i;
5887 }
5888 // A multiplier of 4 is word-addressing
5889 let r = _mm_i64gather_epi32(arr.as_ptr(), _mm_setr_epi64x(0, 16), 4);
5890 assert_eq_m128i(r, _mm_setr_epi32(0, 16, 0, 0));
5891 }
5892
5893 #[simd_test(enable = "avx2")]
5894 unsafe fn test_mm_mask_i64gather_epi32() {
5895 let mut arr = [0i32; 128];
5896 for i in 0..128i32 {
5897 arr[i as usize] = i;
5898 }
5899 // A multiplier of 4 is word-addressing
5900 let r = _mm_mask_i64gather_epi32(
5901 _mm_set1_epi32(256),
5902 arr.as_ptr(),
5903 _mm_setr_epi64x(0, 16),
5904 _mm_setr_epi32(-1, 0, -1, 0),
5905 4,
5906 );
5907 assert_eq_m128i(r, _mm_setr_epi32(0, 256, 0, 0));
5908 }
5909
5910 #[simd_test(enable = "avx2")]
5911 unsafe fn test_mm256_i64gather_epi32() {
5912 let mut arr = [0i32; 128];
5913 for i in 0..128i32 {
5914 arr[i as usize] = i;
5915 }
5916 // A multiplier of 4 is word-addressing
5917 let r = _mm256_i64gather_epi32(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48), 4);
5918 assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48));
5919 }
5920
5921 #[simd_test(enable = "avx2")]
5922 unsafe fn test_mm256_mask_i64gather_epi32() {
5923 let mut arr = [0i32; 128];
5924 for i in 0..128i32 {
5925 arr[i as usize] = i;
5926 }
5927 // A multiplier of 4 is word-addressing
5928 let r = _mm256_mask_i64gather_epi32(
5929 _mm_set1_epi32(256),
5930 arr.as_ptr(),
5931 _mm256_setr_epi64x(0, 16, 64, 96),
5932 _mm_setr_epi32(-1, -1, -1, 0),
5933 4,
5934 );
5935 assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256));
5936 }
5937
5938 #[simd_test(enable = "avx2")]
5939 unsafe fn test_mm_i64gather_ps() {
5940 let mut arr = [0.0f32; 128];
5941 let mut j = 0.0;
5942 for i in 0..128usize {
5943 arr[i] = j;
5944 j += 1.0;
5945 }
5946 // A multiplier of 4 is word-addressing for f32s
5947 let r = _mm_i64gather_ps(arr.as_ptr(), _mm_setr_epi64x(0, 16), 4);
5948 assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 0.0, 0.0));
5949 }
5950
5951 #[simd_test(enable = "avx2")]
5952 unsafe fn test_mm_mask_i64gather_ps() {
5953 let mut arr = [0.0f32; 128];
5954 let mut j = 0.0;
5955 for i in 0..128usize {
5956 arr[i] = j;
5957 j += 1.0;
5958 }
5959 // A multiplier of 4 is word-addressing for f32s
5960 let r = _mm_mask_i64gather_ps(
5961 _mm_set1_ps(256.0),
5962 arr.as_ptr(),
5963 _mm_setr_epi64x(0, 16),
5964 _mm_setr_ps(-1.0, 0.0, -1.0, 0.0),
5965 4,
5966 );
5967 assert_eq_m128(r, _mm_setr_ps(0.0, 256.0, 0.0, 0.0));
5968 }
5969
5970 #[simd_test(enable = "avx2")]
5971 unsafe fn test_mm256_i64gather_ps() {
5972 let mut arr = [0.0f32; 128];
5973 let mut j = 0.0;
5974 for i in 0..128usize {
5975 arr[i] = j;
5976 j += 1.0;
5977 }
5978 // A multiplier of 4 is word-addressing for f32s
5979 let r = _mm256_i64gather_ps(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48), 4);
5980 assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0));
5981 }
5982
5983 #[simd_test(enable = "avx2")]
5984 unsafe fn test_mm256_mask_i64gather_ps() {
5985 let mut arr = [0.0f32; 128];
5986 let mut j = 0.0;
5987 for i in 0..128usize {
5988 arr[i] = j;
5989 j += 1.0;
5990 }
5991 // A multiplier of 4 is word-addressing for f32s
5992 let r = _mm256_mask_i64gather_ps(
5993 _mm_set1_ps(256.0),
5994 arr.as_ptr(),
5995 _mm256_setr_epi64x(0, 16, 64, 96),
5996 _mm_setr_ps(-1.0, -1.0, -1.0, 0.0),
5997 4,
5998 );
5999 assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0));
6000 }
6001
6002 #[simd_test(enable = "avx2")]
6003 unsafe fn test_mm_i64gather_epi64() {
6004 let mut arr = [0i64; 128];
6005 for i in 0..128i64 {
6006 arr[i as usize] = i;
6007 }
6008 // A multiplier of 8 is word-addressing for i64s
6009 let r = _mm_i64gather_epi64(arr.as_ptr(), _mm_setr_epi64x(0, 16), 8);
6010 assert_eq_m128i(r, _mm_setr_epi64x(0, 16));
6011 }
6012
6013 #[simd_test(enable = "avx2")]
6014 unsafe fn test_mm_mask_i64gather_epi64() {
6015 let mut arr = [0i64; 128];
6016 for i in 0..128i64 {
6017 arr[i as usize] = i;
6018 }
6019 // A multiplier of 8 is word-addressing for i64s
6020 let r = _mm_mask_i64gather_epi64(
6021 _mm_set1_epi64x(256),
6022 arr.as_ptr(),
6023 _mm_setr_epi64x(16, 16),
6024 _mm_setr_epi64x(-1, 0),
6025 8,
6026 );
6027 assert_eq_m128i(r, _mm_setr_epi64x(16, 256));
6028 }
6029
6030 #[simd_test(enable = "avx2")]
6031 unsafe fn test_mm256_i64gather_epi64() {
6032 let mut arr = [0i64; 128];
6033 for i in 0..128i64 {
6034 arr[i as usize] = i;
6035 }
6036 // A multiplier of 8 is word-addressing for i64s
6037 let r = _mm256_i64gather_epi64(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48), 8);
6038 assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48));
6039 }
6040
6041 #[simd_test(enable = "avx2")]
6042 unsafe fn test_mm256_mask_i64gather_epi64() {
6043 let mut arr = [0i64; 128];
6044 for i in 0..128i64 {
6045 arr[i as usize] = i;
6046 }
6047 // A multiplier of 8 is word-addressing for i64s
6048 let r = _mm256_mask_i64gather_epi64(
6049 _mm256_set1_epi64x(256),
6050 arr.as_ptr(),
6051 _mm256_setr_epi64x(0, 16, 64, 96),
6052 _mm256_setr_epi64x(-1, -1, -1, 0),
6053 8,
6054 );
6055 assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256));
6056 }
6057
6058 #[simd_test(enable = "avx2")]
6059 unsafe fn test_mm_i64gather_pd() {
6060 let mut arr = [0.0f64; 128];
6061 let mut j = 0.0;
6062 for i in 0..128usize {
6063 arr[i] = j;
6064 j += 1.0;
6065 }
6066 // A multiplier of 8 is word-addressing for f64s
6067 let r = _mm_i64gather_pd(arr.as_ptr(), _mm_setr_epi64x(0, 16), 8);
6068 assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0));
6069 }
6070
6071 #[simd_test(enable = "avx2")]
6072 unsafe fn test_mm_mask_i64gather_pd() {
6073 let mut arr = [0.0f64; 128];
6074 let mut j = 0.0;
6075 for i in 0..128usize {
6076 arr[i] = j;
6077 j += 1.0;
6078 }
6079 // A multiplier of 8 is word-addressing for f64s
6080 let r = _mm_mask_i64gather_pd(
6081 _mm_set1_pd(256.0),
6082 arr.as_ptr(),
6083 _mm_setr_epi64x(16, 16),
6084 _mm_setr_pd(-1.0, 0.0),
6085 8,
6086 );
6087 assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0));
6088 }
6089
6090 #[simd_test(enable = "avx2")]
6091 unsafe fn test_mm256_i64gather_pd() {
6092 let mut arr = [0.0f64; 128];
6093 let mut j = 0.0;
6094 for i in 0..128usize {
6095 arr[i] = j;
6096 j += 1.0;
6097 }
6098 // A multiplier of 8 is word-addressing for f64s
6099 let r = _mm256_i64gather_pd(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48), 8);
6100 assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0));
6101 }
6102
6103 #[simd_test(enable = "avx2")]
6104 unsafe fn test_mm256_mask_i64gather_pd() {
6105 let mut arr = [0.0f64; 128];
6106 let mut j = 0.0;
6107 for i in 0..128usize {
6108 arr[i] = j;
6109 j += 1.0;
6110 }
6111 // A multiplier of 8 is word-addressing for f64s
6112 let r = _mm256_mask_i64gather_pd(
6113 _mm256_set1_pd(256.0),
6114 arr.as_ptr(),
6115 _mm256_setr_epi64x(0, 16, 64, 96),
6116 _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0),
6117 8,
6118 );
6119 assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0));
6120 }
6121
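// The extract tests below rely on the index being reduced modulo the element
// count (35 & 31 == 3, 19 & 15 == 3, 11 & 7 == 3) and on the 8/16-bit
// extracts being zero-extended, which is why -1 reads back as 0xFF / 0xFFFF,
// while the 32-bit extract returns the value as-is.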
6122 #[simd_test(enable = "avx2")]
6123 unsafe fn test_mm256_extract_epi8() {
6124 #[rustfmt::skip]
6125 let a = _mm256_setr_epi8(
6126 -1, 1, 2, 3, 4, 5, 6, 7,
6127 8, 9, 10, 11, 12, 13, 14, 15,
6128 16, 17, 18, 19, 20, 21, 22, 23,
6129 24, 25, 26, 27, 28, 29, 30, 31
6130 );
6131 let r1 = _mm256_extract_epi8(a, 0);
6132 let r2 = _mm256_extract_epi8(a, 35);
6133 assert_eq!(r1, 0xFF);
6134 assert_eq!(r2, 3);
6135 }
6136
6137 #[simd_test(enable = "avx2")]
6138 unsafe fn test_mm256_extract_epi16() {
6139 #[rustfmt::skip]
6140 let a = _mm256_setr_epi16(
6141 -1, 1, 2, 3, 4, 5, 6, 7,
6142 8, 9, 10, 11, 12, 13, 14, 15,
6143 );
6144 let r1 = _mm256_extract_epi16(a, 0);
6145 let r2 = _mm256_extract_epi16(a, 19);
6146 assert_eq!(r1, 0xFFFF);
6147 assert_eq!(r2, 3);
6148 }
6149
6150 #[simd_test(enable = "avx2")]
6151 unsafe fn test_mm256_extract_epi32() {
6152 let a = _mm256_setr_epi32(-1, 1, 2, 3, 4, 5, 6, 7);
6153 let r1 = _mm256_extract_epi32(a, 0);
6154 let r2 = _mm256_extract_epi32(a, 11);
6155 assert_eq!(r1, -1);
6156 assert_eq!(r2, 3);
6157 }
6158
6159 #[simd_test(enable = "avx2")]
6160 unsafe fn test_mm256_cvtsd_f64() {
6161 let a = _mm256_setr_pd(1., 2., 3., 4.);
6162 let r = _mm256_cvtsd_f64(a);
6163 assert_eq!(r, 1.);
6164 }
6165
6166 #[simd_test(enable = "avx2")]
6167 unsafe fn test_mm256_cvtsi256_si32() {
6168 let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
6169 let r = _mm256_cvtsi256_si32(a);
6170 assert_eq!(r, 1);
6171 }
6172 }