1 //! Advanced Vector Extensions 2 (AVX2)
2 //!
3 //! AVX2 expands most AVX commands to 256-bit wide vector registers and
4 //! adds [FMA](https://en.wikipedia.org/wiki/Fused_multiply-accumulate).
5 //!
6 //! The references are:
7 //!
8 //! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
9 //! Instruction Set Reference, A-Z][intel64_ref].
10 //! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
11 //! System Instructions][amd64_ref].
12 //!
13 //! Wikipedia's [AVX][wiki_avx] and [FMA][wiki_fma] pages provide a quick
14 //! overview of the instructions available.
15 //!
16 //! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
17 //! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
18 //! [wiki_avx]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions
19 //! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate
20
21 use crate::{
22 core_arch::{simd::*, simd_llvm::*, x86::*},
23 mem::transmute,
24 };
25
26 #[cfg(test)]
27 use stdarch_test::assert_instr;
28
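// Usage note (caller-side sketch, not part of this module): these intrinsics are
// all `unsafe` and compiled with `#[target_feature(enable = "avx2")]`, so callers
// outside `core` are expected to guard them with a runtime check, e.g.
// (`do_avx2_work` is a hypothetical user-defined wrapper):
//
//     if is_x86_feature_detected!("avx2") {
//         unsafe { do_avx2_work() };
//     }
//
// `is_x86_feature_detected!` is provided by `std`, which is why the check lives
// in user code rather than here.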
29 /// Computes the absolute values of packed 32-bit integers in `a`.
30 ///
31 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_abs_epi32)
32 #[inline]
33 #[target_feature(enable = "avx2")]
34 #[cfg_attr(test, assert_instr(vpabsd))]
35 #[stable(feature = "simd_x86", since = "1.27.0")]
36 pub unsafe fn _mm256_abs_epi32(a: __m256i) -> __m256i {
37 transmute(pabsd(a.as_i32x8()))
38 }
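// A minimal sketch of the lane-wise behaviour (illustration only, not upstream
// code; the function name and values are assumptions):
#[cfg(test)]
#[target_feature(enable = "avx2")]
unsafe fn _abs_epi32_demo() {
    let a = _mm256_setr_epi32(-1, 2, -3, 4, -5, 6, -7, 8);
    // Each 32-bit lane is replaced by its absolute value.
    let r: [i32; 8] = transmute(_mm256_abs_epi32(a));
    assert_eq!(r, [1, 2, 3, 4, 5, 6, 7, 8]);
}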
39
40 /// Computes the absolute values of packed 16-bit integers in `a`.
41 ///
42 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_abs_epi16)
43 #[inline]
44 #[target_feature(enable = "avx2")]
45 #[cfg_attr(test, assert_instr(vpabsw))]
46 #[stable(feature = "simd_x86", since = "1.27.0")]
47 pub unsafe fn _mm256_abs_epi16(a: __m256i) -> __m256i {
48 transmute(pabsw(a.as_i16x16()))
49 }
50
51 /// Computes the absolute values of packed 8-bit integers in `a`.
52 ///
53 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_abs_epi8)
54 #[inline]
55 #[target_feature(enable = "avx2")]
56 #[cfg_attr(test, assert_instr(vpabsb))]
57 #[stable(feature = "simd_x86", since = "1.27.0")]
58 pub unsafe fn _mm256_abs_epi8(a: __m256i) -> __m256i {
59 transmute(pabsb(a.as_i8x32()))
60 }
61
62 /// Adds packed 64-bit integers in `a` and `b`.
63 ///
64 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_epi64)
65 #[inline]
66 #[target_feature(enable = "avx2")]
67 #[cfg_attr(test, assert_instr(vpaddq))]
68 #[stable(feature = "simd_x86", since = "1.27.0")]
69 pub unsafe fn _mm256_add_epi64(a: __m256i, b: __m256i) -> __m256i {
70 transmute(simd_add(a.as_i64x4(), b.as_i64x4()))
71 }
72
73 /// Adds packed 32-bit integers in `a` and `b`.
74 ///
75 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_epi32)
76 #[inline]
77 #[target_feature(enable = "avx2")]
78 #[cfg_attr(test, assert_instr(vpaddd))]
79 #[stable(feature = "simd_x86", since = "1.27.0")]
80 pub unsafe fn _mm256_add_epi32(a: __m256i, b: __m256i) -> __m256i {
81 transmute(simd_add(a.as_i32x8(), b.as_i32x8()))
82 }
83
84 /// Adds packed 16-bit integers in `a` and `b`.
85 ///
86 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_epi16)
87 #[inline]
88 #[target_feature(enable = "avx2")]
89 #[cfg_attr(test, assert_instr(vpaddw))]
90 #[stable(feature = "simd_x86", since = "1.27.0")]
91 pub unsafe fn _mm256_add_epi16(a: __m256i, b: __m256i) -> __m256i {
92 transmute(simd_add(a.as_i16x16(), b.as_i16x16()))
93 }
94
95 /// Adds packed 8-bit integers in `a` and `b`.
96 ///
97 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_epi8)
98 #[inline]
99 #[target_feature(enable = "avx2")]
100 #[cfg_attr(test, assert_instr(vpaddb))]
101 #[stable(feature = "simd_x86", since = "1.27.0")]
102 pub unsafe fn _mm256_add_epi8(a: __m256i, b: __m256i) -> __m256i {
103 transmute(simd_add(a.as_i8x32(), b.as_i8x32()))
104 }
105
106 /// Adds packed 8-bit integers in `a` and `b` using saturation.
107 ///
108 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_adds_epi8)
109 #[inline]
110 #[target_feature(enable = "avx2")]
111 #[cfg_attr(test, assert_instr(vpaddsb))]
112 #[stable(feature = "simd_x86", since = "1.27.0")]
113 pub unsafe fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i {
114 transmute(paddsb(a.as_i8x32(), b.as_i8x32()))
115 }
116
117 /// Adds packed 16-bit integers in `a` and `b` using saturation.
118 ///
119 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_adds_epi16)
120 #[inline]
121 #[target_feature(enable = "avx2")]
122 #[cfg_attr(test, assert_instr(vpaddsw))]
123 #[stable(feature = "simd_x86", since = "1.27.0")]
124 pub unsafe fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i {
125 transmute(paddsw(a.as_i16x16(), b.as_i16x16()))
126 }
127
128 /// Adds packed unsigned 8-bit integers in `a` and `b` using saturation.
129 ///
130 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_adds_epu8)
131 #[inline]
132 #[target_feature(enable = "avx2")]
133 #[cfg_attr(test, assert_instr(vpaddusb))]
134 #[stable(feature = "simd_x86", since = "1.27.0")]
135 pub unsafe fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i {
136 transmute(paddusb(a.as_u8x32(), b.as_u8x32()))
137 }
138
139 /// Adds packed unsigned 16-bit integers in `a` and `b` using saturation.
140 ///
141 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_adds_epu16)
142 #[inline]
143 #[target_feature(enable = "avx2")]
144 #[cfg_attr(test, assert_instr(vpaddusw))]
145 #[stable(feature = "simd_x86", since = "1.27.0")]
146 pub unsafe fn _mm256_adds_epu16(a: __m256i, b: __m256i) -> __m256i {
147 transmute(paddusw(a.as_u16x16(), b.as_u16x16()))
148 }
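// Sketch of wrapping vs. saturating addition (illustration only; name and values
// are assumptions):
#[cfg(test)]
#[target_feature(enable = "avx2")]
unsafe fn _adds_epi8_demo() {
    let a = _mm256_set1_epi8(120);
    let b = _mm256_set1_epi8(20);
    // `_mm256_add_epi8` wraps: 120 + 20 == 140, which wraps to -116 as an i8.
    let wrapped: [i8; 32] = transmute(_mm256_add_epi8(a, b));
    // `_mm256_adds_epi8` saturates at i8::MAX == 127.
    let saturated: [i8; 32] = transmute(_mm256_adds_epi8(a, b));
    assert_eq!(wrapped[0], -116);
    assert_eq!(saturated[0], 127);
}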
149
150 /// Concatenates pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary
151 /// result, shifts the result right by `n` bytes, and returns the low 16 bytes.
152 ///
153 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_alignr_epi8)
154 #[inline]
155 #[target_feature(enable = "avx2")]
156 #[cfg_attr(test, assert_instr(vpalignr, n = 7))]
157 #[rustc_args_required_const(2)]
158 #[stable(feature = "simd_x86", since = "1.27.0")]
159 pub unsafe fn _mm256_alignr_epi8(a: __m256i, b: __m256i, n: i32) -> __m256i {
160 let n = n as u32;
161 // If `palignr` is shifting the pair of vectors more than the size of two
162 // lanes, emit zero.
163 if n > 32 {
164 return _mm256_set1_epi8(0);
165 }
166 // If `palignr` is shifting the pair of input vectors more than one lane,
167 // but less than two lanes, convert to shifting in zeroes.
168 let (a, b, n) = if n > 16 {
169 (_mm256_set1_epi8(0), a, n - 16)
170 } else {
171 (a, b, n)
172 };
173
174 let a = a.as_i8x32();
175 let b = b.as_i8x32();
176
177 let r: i8x32 = match n {
178 0 => simd_shuffle32(
179 b,
180 a,
181 [
182 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
183 23, 24, 25, 26, 27, 28, 29, 30, 31,
184 ],
185 ),
186 1 => simd_shuffle32(
187 b,
188 a,
189 [
190 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 17, 18, 19, 20, 21, 22, 23,
191 24, 25, 26, 27, 28, 29, 30, 31, 48,
192 ],
193 ),
194 2 => simd_shuffle32(
195 b,
196 a,
197 [
198 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 18, 19, 20, 21, 22, 23, 24,
199 25, 26, 27, 28, 29, 30, 31, 48, 49,
200 ],
201 ),
202 3 => simd_shuffle32(
203 b,
204 a,
205 [
206 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 19, 20, 21, 22, 23, 24,
207 25, 26, 27, 28, 29, 30, 31, 48, 49, 50,
208 ],
209 ),
210 4 => simd_shuffle32(
211 b,
212 a,
213 [
214 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 20, 21, 22, 23, 24, 25,
215 26, 27, 28, 29, 30, 31, 48, 49, 50, 51,
216 ],
217 ),
218 5 => simd_shuffle32(
219 b,
220 a,
221 [
222 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 21, 22, 23, 24, 25, 26,
223 27, 28, 29, 30, 31, 48, 49, 50, 51, 52,
224 ],
225 ),
226 6 => simd_shuffle32(
227 b,
228 a,
229 [
230 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 22, 23, 24, 25, 26, 27,
231 28, 29, 30, 31, 48, 49, 50, 51, 52, 53,
232 ],
233 ),
234 7 => simd_shuffle32(
235 b,
236 a,
237 [
238 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 23, 24, 25, 26, 27,
239 28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54,
240 ],
241 ),
242 8 => simd_shuffle32(
243 b,
244 a,
245 [
246 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 24, 25, 26, 27, 28,
247 29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55,
248 ],
249 ),
250 9 => simd_shuffle32(
251 b,
252 a,
253 [
254 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 25, 26, 27, 28, 29,
255 30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56,
256 ],
257 ),
258 10 => simd_shuffle32(
259 b,
260 a,
261 [
262 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 26, 27, 28, 29, 30,
263 31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
264 ],
265 ),
266 11 => simd_shuffle32(
267 b,
268 a,
269 [
270 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 27, 28, 29, 30, 31,
271 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58,
272 ],
273 ),
274 12 => simd_shuffle32(
275 b,
276 a,
277 [
278 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 28, 29, 30, 31, 48,
279 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
280 ],
281 ),
282 13 => simd_shuffle32(
283 b,
284 a,
285 [
286 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 29, 30, 31, 48, 49,
287 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
288 ],
289 ),
290 14 => simd_shuffle32(
291 b,
292 a,
293 [
294 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 30, 31, 48, 49, 50,
295 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
296 ],
297 ),
298 15 => simd_shuffle32(
299 b,
300 a,
301 [
302 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 31, 48, 49, 50, 51,
303 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
304 ],
305 ),
306 _ => a, // `n == 16` lands here: the result is `a` (and `a` is already zero when the original `n` was 32)
307 };
308 transmute(r)
309 }
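// Sketch of the per-128-bit-lane shift performed above (illustration only; name
// and values are assumptions): with `n == 4`, each 16-byte half of the result is
// the last 12 bytes of the matching half of `b` followed by the first 4 bytes of
// the matching half of `a`.
#[cfg(test)]
#[target_feature(enable = "avx2")]
unsafe fn _alignr_epi8_demo() {
    let a = _mm256_set1_epi8(1);
    let b = _mm256_set1_epi8(2);
    let r: [i8; 32] = transmute(_mm256_alignr_epi8(a, b, 4));
    assert_eq!(r[0], 2); // bytes 0..12 of each lane come from `b`
    assert_eq!(r[12], 1); // bytes 12..16 of each lane come from `a`
}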
310
311 /// Computes the bitwise AND of 256 bits (representing integer data)
312 /// in `a` and `b`.
313 ///
314 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_and_si256)
315 #[inline]
316 #[target_feature(enable = "avx2")]
317 #[cfg_attr(test, assert_instr(vandps))]
318 #[stable(feature = "simd_x86", since = "1.27.0")]
319 pub unsafe fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i {
320 transmute(simd_and(a.as_i64x4(), b.as_i64x4()))
321 }
322
323 /// Computes the bitwise NOT of 256 bits (representing integer data)
324 /// in `a` and then AND with `b`.
325 ///
326 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_andnot_si256)
327 #[inline]
328 #[target_feature(enable = "avx2")]
329 #[cfg_attr(test, assert_instr(vandnps))]
330 #[stable(feature = "simd_x86", since = "1.27.0")]
331 pub unsafe fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i {
332 let all_ones = _mm256_set1_epi8(-1);
333 transmute(simd_and(
334 simd_xor(a.as_i64x4(), all_ones.as_i64x4()),
335 b.as_i64x4(),
336 ))
337 }
338
339 /// Averages packed unsigned 16-bit integers in `a` and `b`.
340 ///
341 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_avg_epu16)
342 #[inline]
343 #[target_feature(enable = "avx2")]
344 #[cfg_attr(test, assert_instr(vpavgw))]
345 #[stable(feature = "simd_x86", since = "1.27.0")]
346 pub unsafe fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i {
347 transmute(pavgw(a.as_u16x16(), b.as_u16x16()))
348 }
349
350 /// Averages packed unsigned 8-bit integers in `a` and `b`.
351 ///
352 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_avg_epu8)
353 #[inline]
354 #[target_feature(enable = "avx2")]
355 #[cfg_attr(test, assert_instr(vpavgb))]
356 #[stable(feature = "simd_x86", since = "1.27.0")]
357 pub unsafe fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i {
358 transmute(pavgb(a.as_u8x32(), b.as_u8x32()))
359 }
360
361 /// Blends packed 32-bit integers from `a` and `b` using control mask `imm8`.
362 ///
363 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_epi32)
364 #[inline]
365 #[target_feature(enable = "avx2")]
366 #[cfg_attr(test, assert_instr(vblendps, imm8 = 9))]
367 #[rustc_args_required_const(2)]
368 #[stable(feature = "simd_x86", since = "1.27.0")]
369 pub unsafe fn _mm_blend_epi32(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
370 let imm8 = (imm8 & 0xFF) as u8;
371 let a = a.as_i32x4();
372 let b = b.as_i32x4();
373 macro_rules! blend2 {
374 ($a:expr, $b:expr, $c:expr, $d:expr) => {
375 simd_shuffle4(a, b, [$a, $b, $c, $d]);
376 };
377 }
378 macro_rules! blend1 {
379 ($a:expr, $b:expr) => {
380 match (imm8 >> 2) & 0b11 {
381 0b00 => blend2!($a, $b, 2, 3),
382 0b01 => blend2!($a, $b, 6, 3),
383 0b10 => blend2!($a, $b, 2, 7),
384 _ => blend2!($a, $b, 6, 7),
385 }
386 };
387 }
388 let r: i32x4 = match imm8 & 0b11 {
389 0b00 => blend1!(0, 1),
390 0b01 => blend1!(4, 1),
391 0b10 => blend1!(0, 5),
392 _ => blend1!(4, 5),
393 };
394 transmute(r)
395 }
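// Sketch of how `imm8` drives `_mm_blend_epi32` (illustration only; name and
// values are assumptions): bit `i` set takes lane `i` from `b`, clear keeps `a`.
#[cfg(test)]
#[target_feature(enable = "avx2")]
unsafe fn _blend_epi32_demo() {
    let a = _mm_setr_epi32(10, 11, 12, 13);
    let b = _mm_setr_epi32(20, 21, 22, 23);
    // 0b0101: lanes 0 and 2 come from `b`, lanes 1 and 3 from `a`.
    let r: [i32; 4] = transmute(_mm_blend_epi32(a, b, 0b0101));
    assert_eq!(r, [20, 11, 22, 13]);
}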
396
397 /// Blends packed 32-bit integers from `a` and `b` using control mask `imm8`.
398 ///
399 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_blend_epi32)
400 #[inline]
401 #[target_feature(enable = "avx2")]
402 #[cfg_attr(test, assert_instr(vblendps, imm8 = 9))]
403 #[rustc_args_required_const(2)]
404 #[stable(feature = "simd_x86", since = "1.27.0")]
405 pub unsafe fn _mm256_blend_epi32(a: __m256i, b: __m256i, imm8: i32) -> __m256i {
406 let imm8 = (imm8 & 0xFF) as u8;
407 let a = a.as_i32x8();
408 let b = b.as_i32x8();
409 macro_rules! blend4 {
410 (
411 $a:expr,
412 $b:expr,
413 $c:expr,
414 $d:expr,
415 $e:expr,
416 $f:expr,
417 $g:expr,
418 $h:expr
419 ) => {
420 simd_shuffle8(a, b, [$a, $b, $c, $d, $e, $f, $g, $h]);
421 };
422 }
423 macro_rules! blend3 {
424 ($a:expr, $b:expr, $c:expr, $d:expr, $e:expr, $f:expr) => {
425 match (imm8 >> 6) & 0b11 {
426 0b00 => blend4!($a, $b, $c, $d, $e, $f, 6, 7),
427 0b01 => blend4!($a, $b, $c, $d, $e, $f, 14, 7),
428 0b10 => blend4!($a, $b, $c, $d, $e, $f, 6, 15),
429 _ => blend4!($a, $b, $c, $d, $e, $f, 14, 15),
430 }
431 };
432 }
433 macro_rules! blend2 {
434 ($a:expr, $b:expr, $c:expr, $d:expr) => {
435 match (imm8 >> 4) & 0b11 {
436 0b00 => blend3!($a, $b, $c, $d, 4, 5),
437 0b01 => blend3!($a, $b, $c, $d, 12, 5),
438 0b10 => blend3!($a, $b, $c, $d, 4, 13),
439 _ => blend3!($a, $b, $c, $d, 12, 13),
440 }
441 };
442 }
443 macro_rules! blend1 {
444 ($a:expr, $b:expr) => {
445 match (imm8 >> 2) & 0b11 {
446 0b00 => blend2!($a, $b, 2, 3),
447 0b01 => blend2!($a, $b, 10, 3),
448 0b10 => blend2!($a, $b, 2, 11),
449 _ => blend2!($a, $b, 10, 11),
450 }
451 };
452 }
453 let r: i32x8 = match imm8 & 0b11 {
454 0b00 => blend1!(0, 1),
455 0b01 => blend1!(8, 1),
456 0b10 => blend1!(0, 9),
457 _ => blend1!(8, 9),
458 };
459 transmute(r)
460 }
461
462 /// Blends packed 16-bit integers from `a` and `b` using control mask `imm8`.
463 ///
464 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_blend_epi16)
465 #[inline]
466 #[target_feature(enable = "avx2")]
467 #[cfg_attr(test, assert_instr(vpblendw, imm8 = 9))]
468 #[rustc_args_required_const(2)]
469 #[stable(feature = "simd_x86", since = "1.27.0")]
470 pub unsafe fn _mm256_blend_epi16(a: __m256i, b: __m256i, imm8: i32) -> __m256i {
471 let imm8 = (imm8 & 0xFF) as u8;
472 let a = a.as_i16x16();
473 let b = b.as_i16x16();
474 macro_rules! blend4 {
475 (
476 $a:expr,
477 $b:expr,
478 $c:expr,
479 $d:expr,
480 $e:expr,
481 $f:expr,
482 $g:expr,
483 $h:expr,
484 $i:expr,
485 $j:expr,
486 $k:expr,
487 $l:expr,
488 $m:expr,
489 $n:expr,
490 $o:expr,
491 $p:expr
492 ) => {
493 simd_shuffle16(
494 a,
495 b,
496 [
497 $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p,
498 ],
499 )
500 };
501 }
502 macro_rules! blend3 {
503 (
504 $a:expr,
505 $b:expr,
506 $c:expr,
507 $d:expr,
508 $e:expr,
509 $f:expr,
510 $a2:expr,
511 $b2:expr,
512 $c2:expr,
513 $d2:expr,
514 $e2:expr,
515 $f2:expr
516 ) => {
517 match (imm8 >> 6) & 0b11 {
518 0b00 => blend4!($a, $b, $c, $d, $e, $f, 6, 7, $a2, $b2, $c2, $d2, $e2, $f2, 14, 15),
519 0b01 => {
520 blend4!($a, $b, $c, $d, $e, $f, 22, 7, $a2, $b2, $c2, $d2, $e2, $f2, 30, 15)
521 }
522 0b10 => {
523 blend4!($a, $b, $c, $d, $e, $f, 6, 23, $a2, $b2, $c2, $d2, $e2, $f2, 14, 31)
524 }
525 _ => blend4!($a, $b, $c, $d, $e, $f, 22, 23, $a2, $b2, $c2, $d2, $e2, $f2, 30, 31),
526 }
527 };
528 }
529 macro_rules! blend2 {
530 (
531 $a:expr,
532 $b:expr,
533 $c:expr,
534 $d:expr,
535 $a2:expr,
536 $b2:expr,
537 $c2:expr,
538 $d2:expr
539 ) => {
540 match (imm8 >> 4) & 0b11 {
541 0b00 => blend3!($a, $b, $c, $d, 4, 5, $a2, $b2, $c2, $d2, 12, 13),
542 0b01 => blend3!($a, $b, $c, $d, 20, 5, $a2, $b2, $c2, $d2, 28, 13),
543 0b10 => blend3!($a, $b, $c, $d, 4, 21, $a2, $b2, $c2, $d2, 12, 29),
544 _ => blend3!($a, $b, $c, $d, 20, 21, $a2, $b2, $c2, $d2, 28, 29),
545 }
546 };
547 }
548 macro_rules! blend1 {
549 ($a1:expr, $b1:expr, $a2:expr, $b2:expr) => {
550 match (imm8 >> 2) & 0b11 {
551 0b00 => blend2!($a1, $b1, 2, 3, $a2, $b2, 10, 11),
552 0b01 => blend2!($a1, $b1, 18, 3, $a2, $b2, 26, 11),
553 0b10 => blend2!($a1, $b1, 2, 19, $a2, $b2, 10, 27),
554 _ => blend2!($a1, $b1, 18, 19, $a2, $b2, 26, 27),
555 }
556 };
557 }
558 let r: i16x16 = match imm8 & 0b11 {
559 0b00 => blend1!(0, 1, 8, 9),
560 0b01 => blend1!(16, 1, 24, 9),
561 0b10 => blend1!(0, 17, 8, 25),
562 _ => blend1!(16, 17, 24, 25),
563 };
564 transmute(r)
565 }
566
567 /// Blends packed 8-bit integers from `a` and `b` using `mask`.
568 ///
569 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_blendv_epi8)
570 #[inline]
571 #[target_feature(enable = "avx2")]
572 #[cfg_attr(test, assert_instr(vpblendvb))]
573 #[stable(feature = "simd_x86", since = "1.27.0")]
574 pub unsafe fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i {
575 transmute(pblendvb(a.as_i8x32(), b.as_i8x32(), mask.as_i8x32()))
576 }
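// Sketch: a comparison result can be fed straight into `_mm256_blendv_epi8`,
// e.g. to build a per-byte signed max (illustration only; the helper name is an
// assumption):
#[cfg(test)]
#[target_feature(enable = "avx2")]
unsafe fn _blendv_max_i8_demo(a: __m256i, b: __m256i) -> __m256i {
    // Where `b > a` the mask byte is all-ones (high bit set), so `b` is chosen.
    let mask = _mm256_cmpgt_epi8(b, a);
    _mm256_blendv_epi8(a, b, mask)
}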
577
578 /// Broadcasts the low packed 8-bit integer from `a` to all elements of
579 /// the 128-bit returned value.
580 ///
581 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastb_epi8)
582 #[inline]
583 #[target_feature(enable = "avx2")]
584 #[cfg_attr(test, assert_instr(vpbroadcastb))]
585 #[stable(feature = "simd_x86", since = "1.27.0")]
586 pub unsafe fn _mm_broadcastb_epi8(a: __m128i) -> __m128i {
587 let zero = _mm_setzero_si128();
588 let ret = simd_shuffle16(a.as_i8x16(), zero.as_i8x16(), [0_u32; 16]);
589 transmute::<i8x16, _>(ret)
590 }
591
592 /// Broadcasts the low packed 8-bit integer from `a` to all elements of
593 /// the 256-bit returned value.
594 ///
595 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastb_epi8)
596 #[inline]
597 #[target_feature(enable = "avx2")]
598 #[cfg_attr(test, assert_instr(vpbroadcastb))]
599 #[stable(feature = "simd_x86", since = "1.27.0")]
600 pub unsafe fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i {
601 let zero = _mm_setzero_si128();
602 let ret = simd_shuffle32(a.as_i8x16(), zero.as_i8x16(), [0_u32; 32]);
603 transmute::<i8x32, _>(ret)
604 }
605
606 // N.B., `simd_shuffle4` with integer data types for `a` and `b` is
607 // often compiled to `vbroadcastss`.
608 /// Broadcasts the low packed 32-bit integer from `a` to all elements of
609 /// the 128-bit returned value.
610 ///
611 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastd_epi32)
612 #[inline]
613 #[target_feature(enable = "avx2")]
614 #[cfg_attr(test, assert_instr(vbroadcastss))]
615 #[stable(feature = "simd_x86", since = "1.27.0")]
616 pub unsafe fn _mm_broadcastd_epi32(a: __m128i) -> __m128i {
617 let zero = _mm_setzero_si128();
618 let ret = simd_shuffle4(a.as_i32x4(), zero.as_i32x4(), [0_u32; 4]);
619 transmute::<i32x4, _>(ret)
620 }
621
622 // N.B., `simd_shuffle4` with integer data types for `a` and `b` is
623 // often compiled to `vbroadcastss`.
624 /// Broadcasts the low packed 32-bit integer from `a` to all elements of
625 /// the 256-bit returned value.
626 ///
627 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastd_epi32)
628 #[inline]
629 #[target_feature(enable = "avx2")]
630 #[cfg_attr(test, assert_instr(vbroadcastss))]
631 #[stable(feature = "simd_x86", since = "1.27.0")]
632 pub unsafe fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i {
633 let zero = _mm_setzero_si128();
634 let ret = simd_shuffle8(a.as_i32x4(), zero.as_i32x4(), [0_u32; 8]);
635 transmute::<i32x8, _>(ret)
636 }
637
638 /// Broadcasts the low packed 64-bit integer from `a` to all elements of
639 /// the 128-bit returned value.
640 ///
641 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastq_epi64)
642 #[inline]
643 #[target_feature(enable = "avx2")]
644 #[cfg_attr(test, assert_instr(vpbroadcastq))]
645 #[stable(feature = "simd_x86", since = "1.27.0")]
646 pub unsafe fn _mm_broadcastq_epi64(a: __m128i) -> __m128i {
647 let zero = _mm_setzero_si128().as_i64x2();
648 let ret = simd_shuffle2(a.as_i64x2(), zero, [0_u32; 2]);
649 transmute::<i64x2, _>(ret)
650 }
651
652 // N.B. `simd_shuffle4` with integer data types for `a` and `b` is
653 // often compiled to `vbroadcastsd`.
654 /// Broadcasts the low packed 64-bit integer from `a` to all elements of
655 /// the 256-bit returned value.
656 ///
657 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastq_epi64)
658 #[inline]
659 #[target_feature(enable = "avx2")]
660 #[cfg_attr(test, assert_instr(vbroadcastsd))]
661 #[stable(feature = "simd_x86", since = "1.27.0")]
662 pub unsafe fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i {
663 let zero = _mm_setzero_si128();
664 let ret = simd_shuffle4(a.as_i64x2(), zero.as_i64x2(), [0_u32; 4]);
665 transmute::<i64x4, _>(ret)
666 }
667
668 /// Broadcasts the low double-precision (64-bit) floating-point element
669 /// from `a` to all elements of the 128-bit returned value.
670 ///
671 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastsd_pd)
672 #[inline]
673 #[target_feature(enable = "avx2")]
674 #[cfg_attr(test, assert_instr(vmovddup))]
675 #[stable(feature = "simd_x86", since = "1.27.0")]
676 pub unsafe fn _mm_broadcastsd_pd(a: __m128d) -> __m128d {
677 simd_shuffle2(a, _mm_setzero_pd(), [0_u32; 2])
678 }
679
680 /// Broadcasts the low double-precision (64-bit) floating-point element
681 /// from `a` to all elements of the 256-bit returned value.
682 ///
683 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastsd_pd)
684 #[inline]
685 #[target_feature(enable = "avx2")]
686 #[cfg_attr(test, assert_instr(vbroadcastsd))]
687 #[stable(feature = "simd_x86", since = "1.27.0")]
688 pub unsafe fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d {
689 simd_shuffle4(a, _mm_setzero_pd(), [0_u32; 4])
690 }
691
692 // N.B., `broadcastsi128_si256` is often compiled to `vinsertf128` or
693 // `vbroadcastf128`.
694 /// Broadcasts 128 bits of integer data from `a` to all 128-bit lanes in
695 /// the 256-bit returned value.
696 ///
697 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastsi128_si256)
698 #[inline]
699 #[target_feature(enable = "avx2")]
700 #[stable(feature = "simd_x86", since = "1.27.0")]
701 pub unsafe fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i {
702 let zero = _mm_setzero_si128();
703 let ret = simd_shuffle4(a.as_i64x2(), zero.as_i64x2(), [0, 1, 0, 1]);
704 transmute::<i64x4, _>(ret)
705 }
706
707 /// Broadcasts the low single-precision (32-bit) floating-point element
708 /// from `a` to all elements of the 128-bit returned value.
709 ///
710 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastss_ps)
711 #[inline]
712 #[target_feature(enable = "avx2")]
713 #[cfg_attr(test, assert_instr(vbroadcastss))]
714 #[stable(feature = "simd_x86", since = "1.27.0")]
715 pub unsafe fn _mm_broadcastss_ps(a: __m128) -> __m128 {
716 simd_shuffle4(a, _mm_setzero_ps(), [0_u32; 4])
717 }
718
719 /// Broadcasts the low single-precision (32-bit) floating-point element
720 /// from `a` to all elements of the 256-bit returned value.
721 ///
722 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastss_ps)
723 #[inline]
724 #[target_feature(enable = "avx2")]
725 #[cfg_attr(test, assert_instr(vbroadcastss))]
726 #[stable(feature = "simd_x86", since = "1.27.0")]
727 pub unsafe fn _mm256_broadcastss_ps(a: __m128) -> __m256 {
728 simd_shuffle8(a, _mm_setzero_ps(), [0_u32; 8])
729 }
730
731 /// Broadcasts the low packed 16-bit integer from `a` to all elements of
732 /// the 128-bit returned value.
733 ///
734 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastw_epi16)
735 #[inline]
736 #[target_feature(enable = "avx2")]
737 #[cfg_attr(test, assert_instr(vpbroadcastw))]
738 #[stable(feature = "simd_x86", since = "1.27.0")]
739 pub unsafe fn _mm_broadcastw_epi16(a: __m128i) -> __m128i {
740 let zero = _mm_setzero_si128();
741 let ret = simd_shuffle8(a.as_i16x8(), zero.as_i16x8(), [0_u32; 8]);
742 transmute::<i16x8, _>(ret)
743 }
744
745 /// Broadcasts the low packed 16-bit integer from `a` to all elements of
746 /// the 256-bit returned value.
747 ///
748 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastw_epi16)
749 #[inline]
750 #[target_feature(enable = "avx2")]
751 #[cfg_attr(test, assert_instr(vpbroadcastw))]
752 #[stable(feature = "simd_x86", since = "1.27.0")]
753 pub unsafe fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i {
754 let zero = _mm_setzero_si128();
755 let ret = simd_shuffle16(a.as_i16x8(), zero.as_i16x8(), [0_u32; 16]);
756 transmute::<i16x16, _>(ret)
757 }
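// Sketch of the broadcast family above (illustration only; name and values are
// assumptions): only the lowest element of the source vector is used.
#[cfg(test)]
#[target_feature(enable = "avx2")]
unsafe fn _broadcastd_demo() {
    // `_mm_setr_epi32` places 7 in the lowest 32-bit lane.
    let a = _mm_setr_epi32(7, 8, 9, 10);
    let r: [i32; 8] = transmute(_mm256_broadcastd_epi32(a));
    assert_eq!(r, [7; 8]);
}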
758
759 /// Compares packed 64-bit integers in `a` and `b` for equality.
760 ///
761 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epi64)
762 #[inline]
763 #[target_feature(enable = "avx2")]
764 #[cfg_attr(test, assert_instr(vpcmpeqq))]
765 #[stable(feature = "simd_x86", since = "1.27.0")]
766 pub unsafe fn _mm256_cmpeq_epi64(a: __m256i, b: __m256i) -> __m256i {
767 transmute::<i64x4, _>(simd_eq(a.as_i64x4(), b.as_i64x4()))
768 }
769
770 /// Compares packed 32-bit integers in `a` and `b` for equality.
771 ///
772 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epi32)
773 #[inline]
774 #[target_feature(enable = "avx2")]
775 #[cfg_attr(test, assert_instr(vpcmpeqd))]
776 #[stable(feature = "simd_x86", since = "1.27.0")]
777 pub unsafe fn _mm256_cmpeq_epi32(a: __m256i, b: __m256i) -> __m256i {
778 transmute::<i32x8, _>(simd_eq(a.as_i32x8(), b.as_i32x8()))
779 }
780
781 /// Compares packed 16-bit integers in `a` and `b` for equality.
782 ///
783 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epi16)
784 #[inline]
785 #[target_feature(enable = "avx2")]
786 #[cfg_attr(test, assert_instr(vpcmpeqw))]
787 #[stable(feature = "simd_x86", since = "1.27.0")]
788 pub unsafe fn _mm256_cmpeq_epi16(a: __m256i, b: __m256i) -> __m256i {
789 transmute::<i16x16, _>(simd_eq(a.as_i16x16(), b.as_i16x16()))
790 }
791
792 /// Compares packed 8-bit integers in `a` and `b` for equality.
793 ///
794 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epi8)
795 #[inline]
796 #[target_feature(enable = "avx2")]
797 #[cfg_attr(test, assert_instr(vpcmpeqb))]
798 #[stable(feature = "simd_x86", since = "1.27.0")]
799 pub unsafe fn _mm256_cmpeq_epi8(a: __m256i, b: __m256i) -> __m256i {
800 transmute::<i8x32, _>(simd_eq(a.as_i8x32(), b.as_i8x32()))
801 }
802
803 /// Compares packed 64-bit integers in `a` and `b` for greater-than.
804 ///
805 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epi64)
806 #[inline]
807 #[target_feature(enable = "avx2")]
808 #[cfg_attr(test, assert_instr(vpcmpgtq))]
809 #[stable(feature = "simd_x86", since = "1.27.0")]
810 pub unsafe fn _mm256_cmpgt_epi64(a: __m256i, b: __m256i) -> __m256i {
811 transmute::<i64x4, _>(simd_gt(a.as_i64x4(), b.as_i64x4()))
812 }
813
814 /// Compares packed 32-bit integers in `a` and `b` for greater-than.
815 ///
816 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epi32)
817 #[inline]
818 #[target_feature(enable = "avx2")]
819 #[cfg_attr(test, assert_instr(vpcmpgtd))]
820 #[stable(feature = "simd_x86", since = "1.27.0")]
821 pub unsafe fn _mm256_cmpgt_epi32(a: __m256i, b: __m256i) -> __m256i {
822 transmute::<i32x8, _>(simd_gt(a.as_i32x8(), b.as_i32x8()))
823 }
824
825 /// Compares packed 16-bit integers in `a` and `b` for greater-than.
826 ///
827 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epi16)
828 #[inline]
829 #[target_feature(enable = "avx2")]
830 #[cfg_attr(test, assert_instr(vpcmpgtw))]
831 #[stable(feature = "simd_x86", since = "1.27.0")]
832 pub unsafe fn _mm256_cmpgt_epi16(a: __m256i, b: __m256i) -> __m256i {
833 transmute::<i16x16, _>(simd_gt(a.as_i16x16(), b.as_i16x16()))
834 }
835
836 /// Compares packed 8-bit integers in `a` and `b` for greater-than.
837 ///
838 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epi8)
839 #[inline]
840 #[target_feature(enable = "avx2")]
841 #[cfg_attr(test, assert_instr(vpcmpgtb))]
842 #[stable(feature = "simd_x86", since = "1.27.0")]
843 pub unsafe fn _mm256_cmpgt_epi8(a: __m256i, b: __m256i) -> __m256i {
844 transmute::<i8x32, _>(simd_gt(a.as_i8x32(), b.as_i8x32()))
845 }
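// Sketch: the comparisons above return all-ones or all-zero lanes, which pairs
// naturally with `_mm256_movemask_epi8` (defined elsewhere in this module) to
// obtain one bit per byte (illustration only; name and values are assumptions):
#[cfg(test)]
#[target_feature(enable = "avx2")]
unsafe fn _cmpgt_movemask_demo() {
    let a = _mm256_set1_epi8(5);
    let b = _mm256_set1_epi8(3);
    let gt = _mm256_cmpgt_epi8(a, b); // every byte is 0xFF
    let mask = _mm256_movemask_epi8(gt); // one bit per byte lane
    assert_eq!(mask.count_ones(), 32);
}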
846
847 /// Sign-extend 16-bit integers to 32-bit integers.
848 ///
849 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi16_epi32)
850 #[inline]
851 #[target_feature(enable = "avx2")]
852 #[cfg_attr(test, assert_instr(vpmovsxwd))]
853 #[stable(feature = "simd_x86", since = "1.27.0")]
854 pub unsafe fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i {
855 transmute::<i32x8, _>(simd_cast(a.as_i16x8()))
856 }
857
858 /// Sign-extend 16-bit integers to 64-bit integers.
859 ///
860 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi16_epi64)
861 #[inline]
862 #[target_feature(enable = "avx2")]
863 #[cfg_attr(test, assert_instr(vpmovsxwq))]
864 #[stable(feature = "simd_x86", since = "1.27.0")]
865 pub unsafe fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i {
866 let a = a.as_i16x8();
867 let v64: i16x4 = simd_shuffle4(a, a, [0, 1, 2, 3]);
868 transmute::<i64x4, _>(simd_cast(v64))
869 }
870
871 /// Sign-extend 32-bit integers to 64-bit integers.
872 ///
873 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi32_epi64)
874 #[inline]
875 #[target_feature(enable = "avx2")]
876 #[cfg_attr(test, assert_instr(vpmovsxdq))]
877 #[stable(feature = "simd_x86", since = "1.27.0")]
878 pub unsafe fn _mm256_cvtepi32_epi64(a: __m128i) -> __m256i {
879 transmute::<i64x4, _>(simd_cast(a.as_i32x4()))
880 }
881
882 /// Sign-extend 8-bit integers to 16-bit integers.
883 ///
884 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi8_epi16)
885 #[inline]
886 #[target_feature(enable = "avx2")]
887 #[cfg_attr(test, assert_instr(vpmovsxbw))]
888 #[stable(feature = "simd_x86", since = "1.27.0")]
889 pub unsafe fn _mm256_cvtepi8_epi16(a: __m128i) -> __m256i {
890 transmute::<i16x16, _>(simd_cast(a.as_i8x16()))
891 }
892
893 /// Sign-extend 8-bit integers to 32-bit integers.
894 ///
895 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi8_epi32)
896 #[inline]
897 #[target_feature(enable = "avx2")]
898 #[cfg_attr(test, assert_instr(vpmovsxbd))]
899 #[stable(feature = "simd_x86", since = "1.27.0")]
900 pub unsafe fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i {
901 let a = a.as_i8x16();
902 let v64: i8x8 = simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
903 transmute::<i32x8, _>(simd_cast(v64))
904 }
905
906 /// Sign-extend 8-bit integers to 64-bit integers.
907 ///
908 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi8_epi64)
909 #[inline]
910 #[target_feature(enable = "avx2")]
911 #[cfg_attr(test, assert_instr(vpmovsxbq))]
912 #[stable(feature = "simd_x86", since = "1.27.0")]
913 pub unsafe fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i {
914 let a = a.as_i8x16();
915 let v32: i8x4 = simd_shuffle4(a, a, [0, 1, 2, 3]);
916 transmute::<i64x4, _>(simd_cast(v32))
917 }
918
919 /// Zero-extend packed unsigned 16-bit integers in `a` to packed 32-bit
920 /// integers.
921 ///
922 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu16_epi32)
923 #[inline]
924 #[target_feature(enable = "avx2")]
925 #[cfg_attr(test, assert_instr(vpmovzxwd))]
926 #[stable(feature = "simd_x86", since = "1.27.0")]
927 pub unsafe fn _mm256_cvtepu16_epi32(a: __m128i) -> __m256i {
928 transmute::<i32x8, _>(simd_cast(a.as_u16x8()))
929 }
930
931 /// Zero-extend the lower four unsigned 16-bit integers in `a` to 64-bit
932 /// integers. The upper four elements of `a` are unused.
933 ///
934 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu16_epi64)
935 #[inline]
936 #[target_feature(enable = "avx2")]
937 #[cfg_attr(test, assert_instr(vpmovzxwq))]
938 #[stable(feature = "simd_x86", since = "1.27.0")]
939 pub unsafe fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i {
940 let a = a.as_u16x8();
941 let v64: u16x4 = simd_shuffle4(a, a, [0, 1, 2, 3]);
942 transmute::<i64x4, _>(simd_cast(v64))
943 }
944
945 /// Zero-extend unsigned 32-bit integers in `a` to 64-bit integers.
946 ///
947 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu32_epi64)
948 #[inline]
949 #[target_feature(enable = "avx2")]
950 #[cfg_attr(test, assert_instr(vpmovzxdq))]
951 #[stable(feature = "simd_x86", since = "1.27.0")]
952 pub unsafe fn _mm256_cvtepu32_epi64(a: __m128i) -> __m256i {
953 transmute::<i64x4, _>(simd_cast(a.as_u32x4()))
954 }
955
956 /// Zero-extend unsigned 8-bit integers in `a` to 16-bit integers.
957 ///
958 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu8_epi16)
959 #[inline]
960 #[target_feature(enable = "avx2")]
961 #[cfg_attr(test, assert_instr(vpmovzxbw))]
962 #[stable(feature = "simd_x86", since = "1.27.0")]
963 pub unsafe fn _mm256_cvtepu8_epi16(a: __m128i) -> __m256i {
964 transmute::<i16x16, _>(simd_cast(a.as_u8x16()))
965 }
966
967 /// Zero-extend the lower eight unsigned 8-bit integers in `a` to 32-bit
968 /// integers. The upper eight elements of `a` are unused.
969 ///
970 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu8_epi32)
971 #[inline]
972 #[target_feature(enable = "avx2")]
973 #[cfg_attr(test, assert_instr(vpmovzxbd))]
974 #[stable(feature = "simd_x86", since = "1.27.0")]
975 pub unsafe fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i {
976 let a = a.as_u8x16();
977 let v64: u8x8 = simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
978 transmute::<i32x8, _>(simd_cast(v64))
979 }
980
981 /// Zero-extend the lower four unsigned 8-bit integers in `a` to 64-bit
982 /// integers. The upper twelve elements of `a` are unused.
983 ///
984 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu8_epi64)
985 #[inline]
986 #[target_feature(enable = "avx2")]
987 #[cfg_attr(test, assert_instr(vpmovzxbq))]
988 #[stable(feature = "simd_x86", since = "1.27.0")]
989 pub unsafe fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i {
990 let a = a.as_u8x16();
991 let v32: u8x4 = simd_shuffle4(a, a, [0, 1, 2, 3]);
992 transmute::<i64x4, _>(simd_cast(v32))
993 }
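// Sketch of sign- vs. zero-extension (illustration only; name and values are
// assumptions): the same 0xFF source byte becomes -1 or 255.
#[cfg(test)]
#[target_feature(enable = "avx2")]
unsafe fn _extend_demo() {
    let a = _mm_set1_epi8(-1); // every byte is 0xFF
    let signed: [i16; 16] = transmute(_mm256_cvtepi8_epi16(a));
    let unsigned: [i16; 16] = transmute(_mm256_cvtepu8_epi16(a));
    assert_eq!(signed[0], -1);
    assert_eq!(unsigned[0], 255);
}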
994
995 /// Extracts 128 bits (of integer data) from `a` selected with `imm8`.
996 ///
997 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extracti128_si256)
998 #[inline]
999 #[target_feature(enable = "avx2")]
1000 #[cfg_attr(
1001 all(test, not(target_os = "windows")),
1002 assert_instr(vextractf128, imm8 = 1)
1003 )]
1004 #[rustc_args_required_const(1)]
1005 #[stable(feature = "simd_x86", since = "1.27.0")]
1006 pub unsafe fn _mm256_extracti128_si256(a: __m256i, imm8: i32) -> __m128i {
1007 let a = a.as_i64x4();
1008 let b = _mm256_undefined_si256().as_i64x4();
1009 let dst: i64x2 = match imm8 & 0b01 {
1010 0 => simd_shuffle2(a, b, [0, 1]),
1011 _ => simd_shuffle2(a, b, [2, 3]),
1012 };
1013 transmute(dst)
1014 }
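// Sketch of `_mm256_extracti128_si256` (illustration only; name and values are
// assumptions): `imm8 & 1` selects the low (0) or high (1) 128-bit half.
#[cfg(test)]
#[target_feature(enable = "avx2")]
unsafe fn _extracti128_demo() {
    let a = _mm256_setr_epi64x(0, 1, 2, 3);
    let hi: [i64; 2] = transmute(_mm256_extracti128_si256(a, 1));
    assert_eq!(hi, [2, 3]);
}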
1015
1016 /// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`.
1017 ///
1018 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hadd_epi16)
1019 #[inline]
1020 #[target_feature(enable = "avx2")]
1021 #[cfg_attr(test, assert_instr(vphaddw))]
1022 #[stable(feature = "simd_x86", since = "1.27.0")]
1023 pub unsafe fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i {
1024 transmute(phaddw(a.as_i16x16(), b.as_i16x16()))
1025 }
1026
1027 /// Horizontally adds adjacent pairs of 32-bit integers in `a` and `b`.
1028 ///
1029 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hadd_epi32)
1030 #[inline]
1031 #[target_feature(enable = "avx2")]
1032 #[cfg_attr(test, assert_instr(vphaddd))]
1033 #[stable(feature = "simd_x86", since = "1.27.0")]
1034 pub unsafe fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i {
1035 transmute(phaddd(a.as_i32x8(), b.as_i32x8()))
1036 }
1037
1038 /// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`
1039 /// using saturation.
1040 ///
1041 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hadds_epi16)
1042 #[inline]
1043 #[target_feature(enable = "avx2")]
1044 #[cfg_attr(test, assert_instr(vphaddsw))]
1045 #[stable(feature = "simd_x86", since = "1.27.0")]
1046 pub unsafe fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i {
1047 transmute(phaddsw(a.as_i16x16(), b.as_i16x16()))
1048 }
1049
1050 /// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`.
1051 ///
1052 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hsub_epi16)
1053 #[inline]
1054 #[target_feature(enable = "avx2")]
1055 #[cfg_attr(test, assert_instr(vphsubw))]
1056 #[stable(feature = "simd_x86", since = "1.27.0")]
1057 pub unsafe fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i {
1058 transmute(phsubw(a.as_i16x16(), b.as_i16x16()))
1059 }
1060
1061 /// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`.
1062 ///
1063 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hsub_epi32)
1064 #[inline]
1065 #[target_feature(enable = "avx2")]
1066 #[cfg_attr(test, assert_instr(vphsubd))]
1067 #[stable(feature = "simd_x86", since = "1.27.0")]
1068 pub unsafe fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i {
1069 transmute(phsubd(a.as_i32x8(), b.as_i32x8()))
1070 }
1071
1072 /// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`
1073 /// using saturation.
1074 ///
1075 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hsubs_epi16)
1076 #[inline]
1077 #[target_feature(enable = "avx2")]
1078 #[cfg_attr(test, assert_instr(vphsubsw))]
1079 #[stable(feature = "simd_x86", since = "1.27.0")]
1080 pub unsafe fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i {
1081 transmute(phsubsw(a.as_i16x16(), b.as_i16x16()))
1082 }
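// Sketch of the horizontal-add lane ordering (illustration only; name and values
// are assumptions): within each 128-bit lane the pair sums from `a` come first,
// then the pair sums from `b`.
#[cfg(test)]
#[target_feature(enable = "avx2")]
unsafe fn _hadd_epi32_demo() {
    let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    let b = _mm256_setr_epi32(10, 11, 12, 13, 14, 15, 16, 17);
    let r: [i32; 8] = transmute(_mm256_hadd_epi32(a, b));
    assert_eq!(r, [1, 5, 21, 25, 9, 13, 29, 33]);
}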
1083
1084 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1085 /// where
1086 /// `scale` should be 1, 2, 4 or 8.
1087 ///
1088 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32gather_epi32)
1089 #[inline]
1090 #[target_feature(enable = "avx2")]
1091 #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))]
1092 #[rustc_args_required_const(2)]
1093 #[stable(feature = "simd_x86", since = "1.27.0")]
1094 pub unsafe fn _mm_i32gather_epi32(slice: *const i32, offsets: __m128i, scale: i32) -> __m128i {
1095 let zero = _mm_setzero_si128().as_i32x4();
1096 let neg_one = _mm_set1_epi32(-1).as_i32x4();
1097 let offsets = offsets.as_i32x4();
1098 let slice = slice as *const i8;
1099 macro_rules! call {
1100 ($imm8:expr) => {
1101 pgatherdd(zero, slice, offsets, neg_one, $imm8)
1102 };
1103 }
1104 let r = constify_imm8!(scale, call);
1105 transmute(r)
1106 }
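// Sketch of gathering from a slice of `i32` (illustration only; name and values
// are assumptions): with `scale == 4`, the offsets act as element indices.
#[cfg(test)]
#[target_feature(enable = "avx2")]
unsafe fn _i32gather_demo() {
    let data: [i32; 8] = [10, 11, 12, 13, 14, 15, 16, 17];
    // Result lane `i` reads `data[offsets[i]]`.
    let offsets = _mm_setr_epi32(7, 5, 3, 1);
    let r: [i32; 4] = transmute(_mm_i32gather_epi32(data.as_ptr(), offsets, 4));
    assert_eq!(r, [17, 15, 13, 11]);
}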
1107
1108 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1109 /// where
1110 /// `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
1111 /// element in `mask` is not set, the value from `src` is used instead.
1112 ///
1113 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32gather_epi32)
1114 #[inline]
1115 #[target_feature(enable = "avx2")]
1116 #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))]
1117 #[rustc_args_required_const(4)]
1118 #[stable(feature = "simd_x86", since = "1.27.0")]
1119 pub unsafe fn _mm_mask_i32gather_epi32(
1120 src: __m128i,
1121 slice: *const i32,
1122 offsets: __m128i,
1123 mask: __m128i,
1124 scale: i32,
1125 ) -> __m128i {
1126 let src = src.as_i32x4();
1127 let mask = mask.as_i32x4();
1128 let offsets = offsets.as_i32x4();
1129 let slice = slice as *const i8;
1130 macro_rules! call {
1131 ($imm8:expr) => {
1132 pgatherdd(src, slice, offsets, mask, $imm8)
1133 };
1134 }
1135 let r = constify_imm8!(scale, call);
1136 transmute(r)
1137 }
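// Sketch of the masked form (illustration only; name and values are assumptions):
// lanes whose `mask` element has its highest bit clear keep the value from `src`.
#[cfg(test)]
#[target_feature(enable = "avx2")]
unsafe fn _mask_i32gather_demo() {
    let data: [i32; 4] = [100, 101, 102, 103];
    let src = _mm_set1_epi32(-7);
    let offsets = _mm_setr_epi32(0, 1, 2, 3);
    // Only lanes 0 and 2 are gathered; lanes 1 and 3 fall back to `src`.
    let mask = _mm_setr_epi32(-1, 0, -1, 0);
    let r: [i32; 4] =
        transmute(_mm_mask_i32gather_epi32(src, data.as_ptr(), offsets, mask, 4));
    assert_eq!(r, [100, -7, 102, -7]);
}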
1138
1139 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1140 /// where
1141 /// `scale` should be 1, 2, 4 or 8.
1142 ///
1143 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32gather_epi32)
1144 #[inline]
1145 #[target_feature(enable = "avx2")]
1146 #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))]
1147 #[rustc_args_required_const(2)]
1148 #[stable(feature = "simd_x86", since = "1.27.0")]
1149 pub unsafe fn _mm256_i32gather_epi32(slice: *const i32, offsets: __m256i, scale: i32) -> __m256i {
1150 let zero = _mm256_setzero_si256().as_i32x8();
1151 let neg_one = _mm256_set1_epi32(-1).as_i32x8();
1152 let offsets = offsets.as_i32x8();
1153 let slice = slice as *const i8;
1154 macro_rules! call {
1155 ($imm8:expr) => {
1156 vpgatherdd(zero, slice, offsets, neg_one, $imm8)
1157 };
1158 }
1159 let r = constify_imm8!(scale, call);
1160 transmute(r)
1161 }
1162
1163 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1164 /// where
1165 /// `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
1166 /// element in `mask` is not set, the value from `src` is used instead.
1167 ///
1168 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32gather_epi32)
1169 #[inline]
1170 #[target_feature(enable = "avx2")]
1171 #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))]
1172 #[rustc_args_required_const(4)]
1173 #[stable(feature = "simd_x86", since = "1.27.0")]
1174 pub unsafe fn _mm256_mask_i32gather_epi32(
1175 src: __m256i,
1176 slice: *const i32,
1177 offsets: __m256i,
1178 mask: __m256i,
1179 scale: i32,
1180 ) -> __m256i {
1181 let src = src.as_i32x8();
1182 let mask = mask.as_i32x8();
1183 let offsets = offsets.as_i32x8();
1184 let slice = slice as *const i8;
1185 macro_rules! call {
1186 ($imm8:expr) => {
1187 vpgatherdd(src, slice, offsets, mask, $imm8)
1188 };
1189 }
1190 let r = constify_imm8!(scale, call);
1191 transmute(r)
1192 }
1193
1194 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1195 /// where
1196 /// `scale` should be 1, 2, 4 or 8.
1197 ///
1198 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32gather_ps)
1199 #[inline]
1200 #[target_feature(enable = "avx2")]
1201 #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))]
1202 #[rustc_args_required_const(2)]
1203 #[stable(feature = "simd_x86", since = "1.27.0")]
1204 pub unsafe fn _mm_i32gather_ps(slice: *const f32, offsets: __m128i, scale: i32) -> __m128 {
1205 let zero = _mm_setzero_ps();
1206 let neg_one = _mm_set1_ps(-1.0);
1207 let offsets = offsets.as_i32x4();
1208 let slice = slice as *const i8;
1209 macro_rules! call {
1210 ($imm8:expr) => {
1211 pgatherdps(zero, slice, offsets, neg_one, $imm8)
1212 };
1213 }
1214 constify_imm8!(scale, call)
1215 }
1216
1217 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1218 /// where
1219 /// `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
1220 /// element in `mask` is not set, the value from `src` is used instead.
1221 ///
1222 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32gather_ps)
1223 #[inline]
1224 #[target_feature(enable = "avx2")]
1225 #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))]
1226 #[rustc_args_required_const(4)]
1227 #[stable(feature = "simd_x86", since = "1.27.0")]
1228 pub unsafe fn _mm_mask_i32gather_ps(
1229 src: __m128,
1230 slice: *const f32,
1231 offsets: __m128i,
1232 mask: __m128,
1233 scale: i32,
1234 ) -> __m128 {
1235 let offsets = offsets.as_i32x4();
1236 let slice = slice as *const i8;
1237 macro_rules! call {
1238 ($imm8:expr) => {
1239 pgatherdps(src, slice, offsets, mask, $imm8)
1240 };
1241 }
1242 constify_imm8!(scale, call)
1243 }
1244
1245 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1246 /// where
1247 /// `scale` should be 1, 2, 4 or 8.
1248 ///
1249 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32gather_ps)
1250 #[inline]
1251 #[target_feature(enable = "avx2")]
1252 #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))]
1253 #[rustc_args_required_const(2)]
1254 #[stable(feature = "simd_x86", since = "1.27.0")]
1255 pub unsafe fn _mm256_i32gather_ps(slice: *const f32, offsets: __m256i, scale: i32) -> __m256 {
1256 let zero = _mm256_setzero_ps();
1257 let neg_one = _mm256_set1_ps(-1.0);
1258 let offsets = offsets.as_i32x8();
1259 let slice = slice as *const i8;
1260 macro_rules! call {
1261 ($imm8:expr) => {
1262 vpgatherdps(zero, slice, offsets, neg_one, $imm8)
1263 };
1264 }
1265 constify_imm8!(scale, call)
1266 }
1267
1268 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1269 /// where
1270 /// `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
1271 /// element in `mask` is not set, the value from `src` is used instead.
1272 ///
1273 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32gather_ps)
1274 #[inline]
1275 #[target_feature(enable = "avx2")]
1276 #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))]
1277 #[rustc_args_required_const(4)]
1278 #[stable(feature = "simd_x86", since = "1.27.0")]
1279 pub unsafe fn _mm256_mask_i32gather_ps(
1280 src: __m256,
1281 slice: *const f32,
1282 offsets: __m256i,
1283 mask: __m256,
1284 scale: i32,
1285 ) -> __m256 {
1286 let offsets = offsets.as_i32x8();
1287 let slice = slice as *const i8;
1288 macro_rules! call {
1289 ($imm8:expr) => {
1290 vpgatherdps(src, slice, offsets, mask, $imm8)
1291 };
1292 }
1293 constify_imm8!(scale, call)
1294 }
1295
1296 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1297 /// where
1298 /// `scale` should be 1, 2, 4 or 8.
1299 ///
1300 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32gather_epi64)
1301 #[inline]
1302 #[target_feature(enable = "avx2")]
1303 #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))]
1304 #[rustc_args_required_const(2)]
1305 #[stable(feature = "simd_x86", since = "1.27.0")]
1306 pub unsafe fn _mm_i32gather_epi64(slice: *const i64, offsets: __m128i, scale: i32) -> __m128i {
1307 let zero = _mm_setzero_si128().as_i64x2();
1308 let neg_one = _mm_set1_epi64x(-1).as_i64x2();
1309 let offsets = offsets.as_i32x4();
1310 let slice = slice as *const i8;
1311 macro_rules! call {
1312 ($imm8:expr) => {
1313 pgatherdq(zero, slice, offsets, neg_one, $imm8)
1314 };
1315 }
1316 let r = constify_imm8!(scale, call);
1317 transmute(r)
1318 }
1319
1320 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1321 /// where
1322 /// `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
1323 /// element in `mask` is not set, the value from `src` is used instead.
1324 ///
1325 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32gather_epi64)
1326 #[inline]
1327 #[target_feature(enable = "avx2")]
1328 #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))]
1329 #[rustc_args_required_const(4)]
1330 #[stable(feature = "simd_x86", since = "1.27.0")]
1331 pub unsafe fn _mm_mask_i32gather_epi64(
1332 src: __m128i,
1333 slice: *const i64,
1334 offsets: __m128i,
1335 mask: __m128i,
1336 scale: i32,
1337 ) -> __m128i {
1338 let src = src.as_i64x2();
1339 let mask = mask.as_i64x2();
1340 let offsets = offsets.as_i32x4();
1341 let slice = slice as *const i8;
1342 macro_rules! call {
1343 ($imm8:expr) => {
1344 pgatherdq(src, slice, offsets, mask, $imm8)
1345 };
1346 }
1347 let r = constify_imm8!(scale, call);
1348 transmute(r)
1349 }
1350
1351 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1352 /// where
1353 /// `scale` should be 1, 2, 4 or 8.
1354 ///
1355 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32gather_epi64)
1356 #[inline]
1357 #[target_feature(enable = "avx2")]
1358 #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))]
1359 #[rustc_args_required_const(2)]
1360 #[stable(feature = "simd_x86", since = "1.27.0")]
1361 pub unsafe fn _mm256_i32gather_epi64(slice: *const i64, offsets: __m128i, scale: i32) -> __m256i {
1362 let zero = _mm256_setzero_si256().as_i64x4();
1363 let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
1364 let offsets = offsets.as_i32x4();
1365 let slice = slice as *const i8;
1366 macro_rules! call {
1367 ($imm8:expr) => {
1368 vpgatherdq(zero, slice, offsets, neg_one, $imm8)
1369 };
1370 }
1371 let r = constify_imm8!(scale, call);
1372 transmute(r)
1373 }
1374
1375 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1376 /// where
1377 /// `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
1378 /// element in `mask` is not set, the value from `src` is used instead.
1379 ///
1380 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32gather_epi64)
1381 #[inline]
1382 #[target_feature(enable = "avx2")]
1383 #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))]
1384 #[rustc_args_required_const(4)]
1385 #[stable(feature = "simd_x86", since = "1.27.0")]
1386 pub unsafe fn _mm256_mask_i32gather_epi64(
1387 src: __m256i,
1388 slice: *const i64,
1389 offsets: __m128i,
1390 mask: __m256i,
1391 scale: i32,
1392 ) -> __m256i {
1393 let src = src.as_i64x4();
1394 let mask = mask.as_i64x4();
1395 let offsets = offsets.as_i32x4();
1396 let slice = slice as *const i8;
1397 macro_rules! call {
1398 ($imm8:expr) => {
1399 vpgatherdq(src, slice, offsets, mask, $imm8)
1400 };
1401 }
1402 let r = constify_imm8!(scale, call);
1403 transmute(r)
1404 }
1405
1406 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1407 /// where
1408 /// `scale` should be 1, 2, 4 or 8.
1409 ///
1410 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32gather_pd)
1411 #[inline]
1412 #[target_feature(enable = "avx2")]
1413 #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))]
1414 #[rustc_args_required_const(2)]
1415 #[stable(feature = "simd_x86", since = "1.27.0")]
1416 pub unsafe fn _mm_i32gather_pd(slice: *const f64, offsets: __m128i, scale: i32) -> __m128d {
1417 let zero = _mm_setzero_pd();
1418 let neg_one = _mm_set1_pd(-1.0);
1419 let offsets = offsets.as_i32x4();
1420 let slice = slice as *const i8;
1421 macro_rules! call {
1422 ($imm8:expr) => {
1423 pgatherdpd(zero, slice, offsets, neg_one, $imm8)
1424 };
1425 }
1426 constify_imm8!(scale, call)
1427 }
1428
1429 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1430 /// where
1431 /// `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
1432 /// element in `mask` is not set, the value from `src` is used instead.
1433 ///
1434 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32gather_pd)
1435 #[inline]
1436 #[target_feature(enable = "avx2")]
1437 #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))]
1438 #[rustc_args_required_const(4)]
1439 #[stable(feature = "simd_x86", since = "1.27.0")]
1440 pub unsafe fn _mm_mask_i32gather_pd(
1441 src: __m128d,
1442 slice: *const f64,
1443 offsets: __m128i,
1444 mask: __m128d,
1445 scale: i32,
1446 ) -> __m128d {
1447 let offsets = offsets.as_i32x4();
1448 let slice = slice as *const i8;
1449 macro_rules! call {
1450 ($imm8:expr) => {
1451 pgatherdpd(src, slice, offsets, mask, $imm8)
1452 };
1453 }
1454 constify_imm8!(scale, call)
1455 }
1456
1457 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1458 /// where
1459 /// `scale` should be 1, 2, 4 or 8.
1460 ///
1461 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32gather_pd)
1462 #[inline]
1463 #[target_feature(enable = "avx2")]
1464 #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))]
1465 #[rustc_args_required_const(2)]
1466 #[stable(feature = "simd_x86", since = "1.27.0")]
1467 pub unsafe fn _mm256_i32gather_pd(slice: *const f64, offsets: __m128i, scale: i32) -> __m256d {
1468 let zero = _mm256_setzero_pd();
1469 let neg_one = _mm256_set1_pd(-1.0);
1470 let offsets = offsets.as_i32x4();
1471 let slice = slice as *const i8;
1472 macro_rules! call {
1473 ($imm8:expr) => {
1474 vpgatherdpd(zero, slice, offsets, neg_one, $imm8)
1475 };
1476 }
1477 constify_imm8!(scale, call)
1478 }
1479
1480 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1481 /// where
1482 /// `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
1483 /// element in `mask` is not set, the value from `src` is used instead.
1484 ///
1485 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32gather_pd)
1486 #[inline]
1487 #[target_feature(enable = "avx2")]
1488 #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))]
1489 #[rustc_args_required_const(4)]
1490 #[stable(feature = "simd_x86", since = "1.27.0")]
1491 pub unsafe fn _mm256_mask_i32gather_pd(
1492 src: __m256d,
1493 slice: *const f64,
1494 offsets: __m128i,
1495 mask: __m256d,
1496 scale: i32,
1497 ) -> __m256d {
1498 let offsets = offsets.as_i32x4();
1499 let slice = slice as *const i8;
1500 macro_rules! call {
1501 ($imm8:expr) => {
1502 vpgatherdpd(src, slice, offsets, mask, $imm8)
1503 };
1504 }
1505 constify_imm8!(scale, call)
1506 }
1507
1508 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1509 /// where
1510 /// `scale` should be 1, 2, 4 or 8.
1511 ///
1512 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64gather_epi32)
1513 #[inline]
1514 #[target_feature(enable = "avx2")]
1515 #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))]
1516 #[rustc_args_required_const(2)]
1517 #[stable(feature = "simd_x86", since = "1.27.0")]
1518 pub unsafe fn _mm_i64gather_epi32(slice: *const i32, offsets: __m128i, scale: i32) -> __m128i {
1519 let zero = _mm_setzero_si128().as_i32x4();
1520 let neg_one = _mm_set1_epi64x(-1).as_i32x4();
1521 let offsets = offsets.as_i64x2();
1522 let slice = slice as *const i8;
1523 macro_rules! call {
1524 ($imm8:expr) => {
1525 pgatherqd(zero, slice, offsets, neg_one, $imm8)
1526 };
1527 }
1528 let r = constify_imm8!(scale, call);
1529 transmute(r)
1530 }
1531
1532 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1533 /// where
1534 /// `scale` should be 1, 2, 4 or 8. If the highest bit is not set in the
1535 /// corresponding element of `mask`, the element from `src` is used instead.
1536 ///
1537 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64gather_epi32)
1538 #[inline]
1539 #[target_feature(enable = "avx2")]
1540 #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))]
1541 #[rustc_args_required_const(4)]
1542 #[stable(feature = "simd_x86", since = "1.27.0")]
1543 pub unsafe fn _mm_mask_i64gather_epi32(
1544 src: __m128i,
1545 slice: *const i32,
1546 offsets: __m128i,
1547 mask: __m128i,
1548 scale: i32,
1549 ) -> __m128i {
1550 let src = src.as_i32x4();
1551 let mask = mask.as_i32x4();
1552 let offsets = offsets.as_i64x2();
1553 let slice = slice as *const i8;
1554 macro_rules! call {
1555 ($imm8:expr) => {
1556 pgatherqd(src, slice, offsets, mask, $imm8)
1557 };
1558 }
1559 let r = constify_imm8!(scale, call);
1560 transmute(r)
1561 }
1562
1563 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1564 /// where
1565 /// `scale` should be 1, 2, 4 or 8.
1566 ///
1567 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64gather_epi32)
1568 #[inline]
1569 #[target_feature(enable = "avx2")]
1570 #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))]
1571 #[rustc_args_required_const(2)]
1572 #[stable(feature = "simd_x86", since = "1.27.0")]
1573 pub unsafe fn _mm256_i64gather_epi32(slice: *const i32, offsets: __m256i, scale: i32) -> __m128i {
1574 let zero = _mm_setzero_si128().as_i32x4();
1575 let neg_one = _mm_set1_epi64x(-1).as_i32x4();
1576 let offsets = offsets.as_i64x4();
1577 let slice = slice as *const i8;
1578 macro_rules! call {
1579 ($imm8:expr) => {
1580 vpgatherqd(zero, slice, offsets, neg_one, $imm8)
1581 };
1582 }
1583 let r = constify_imm8!(scale, call);
1584 transmute(r)
1585 }
1586
1587 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1588 /// where
1589 /// `scale` should be 1, 2, 4 or 8. If the highest bit is not set in the
1590 /// corresponding element of `mask`, the element from `src` is used instead.
1591 ///
1592 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64gather_epi32)
1593 #[inline]
1594 #[target_feature(enable = "avx2")]
1595 #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))]
1596 #[rustc_args_required_const(4)]
1597 #[stable(feature = "simd_x86", since = "1.27.0")]
1598 pub unsafe fn _mm256_mask_i64gather_epi32(
1599 src: __m128i,
1600 slice: *const i32,
1601 offsets: __m256i,
1602 mask: __m128i,
1603 scale: i32,
1604 ) -> __m128i {
1605 let src = src.as_i32x4();
1606 let mask = mask.as_i32x4();
1607 let offsets = offsets.as_i64x4();
1608 let slice = slice as *const i8;
1609 macro_rules! call {
1610 ($imm8:expr) => {
1611 vpgatherqd(src, slice, offsets, mask, $imm8)
1612 };
1613 }
1614 let r = constify_imm8!(scale, call);
1615 transmute(r)
1616 }
1617
1618 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1619 /// where
1620 /// `scale` should be 1, 2, 4 or 8.
1621 ///
1622 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64gather_ps)
1623 #[inline]
1624 #[target_feature(enable = "avx2")]
1625 #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))]
1626 #[rustc_args_required_const(2)]
1627 #[stable(feature = "simd_x86", since = "1.27.0")]
1628 pub unsafe fn _mm_i64gather_ps(slice: *const f32, offsets: __m128i, scale: i32) -> __m128 {
1629 let zero = _mm_setzero_ps();
1630 let neg_one = _mm_set1_ps(-1.0);
1631 let offsets = offsets.as_i64x2();
1632 let slice = slice as *const i8;
1633 macro_rules! call {
1634 ($imm8:expr) => {
1635 pgatherqps(zero, slice, offsets, neg_one, $imm8)
1636 };
1637 }
1638 constify_imm8!(scale, call)
1639 }
1640
1641 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1642 /// where
1643 /// `scale` should be 1, 2, 4 or 8. If the highest bit is not set in the
1644 /// corresponding element of `mask`, the element from `src` is used instead.
1645 ///
1646 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64gather_ps)
1647 #[inline]
1648 #[target_feature(enable = "avx2")]
1649 #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))]
1650 #[rustc_args_required_const(4)]
1651 #[stable(feature = "simd_x86", since = "1.27.0")]
1652 pub unsafe fn _mm_mask_i64gather_ps(
1653 src: __m128,
1654 slice: *const f32,
1655 offsets: __m128i,
1656 mask: __m128,
1657 scale: i32,
1658 ) -> __m128 {
1659 let offsets = offsets.as_i64x2();
1660 let slice = slice as *const i8;
1661 macro_rules! call {
1662 ($imm8:expr) => {
1663 pgatherqps(src, slice, offsets, mask, $imm8)
1664 };
1665 }
1666 constify_imm8!(scale, call)
1667 }
1668
1669 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1670 /// where
1671 /// `scale` should be 1, 2, 4 or 8.
1672 ///
1673 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64gather_ps)
1674 #[inline]
1675 #[target_feature(enable = "avx2")]
1676 #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))]
1677 #[rustc_args_required_const(2)]
1678 #[stable(feature = "simd_x86", since = "1.27.0")]
1679 pub unsafe fn _mm256_i64gather_ps(slice: *const f32, offsets: __m256i, scale: i32) -> __m128 {
1680 let zero = _mm_setzero_ps();
1681 let neg_one = _mm_set1_ps(-1.0);
1682 let offsets = offsets.as_i64x4();
1683 let slice = slice as *const i8;
1684 macro_rules! call {
1685 ($imm8:expr) => {
1686 vpgatherqps(zero, slice, offsets, neg_one, $imm8)
1687 };
1688 }
1689 constify_imm8!(scale, call)
1690 }
1691
1692 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1693 /// where
1694 /// `scale` should be 1, 2, 4 or 8. If the highest bit is not set in the
1695 /// corresponding element of `mask`, the element from `src` is used instead.
1696 ///
1697 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64gather_ps)
1698 #[inline]
1699 #[target_feature(enable = "avx2")]
1700 #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))]
1701 #[rustc_args_required_const(4)]
1702 #[stable(feature = "simd_x86", since = "1.27.0")]
1703 pub unsafe fn _mm256_mask_i64gather_ps(
1704 src: __m128,
1705 slice: *const f32,
1706 offsets: __m256i,
1707 mask: __m128,
1708 scale: i32,
1709 ) -> __m128 {
1710 let offsets = offsets.as_i64x4();
1711 let slice = slice as *const i8;
1712 macro_rules! call {
1713 ($imm8:expr) => {
1714 vpgatherqps(src, slice, offsets, mask, $imm8)
1715 };
1716 }
1717 constify_imm8!(scale, call)
1718 }
1719
1720 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1721 /// where
1722 /// `scale` should be 1, 2, 4 or 8.
1723 ///
1724 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64gather_epi64)
1725 #[inline]
1726 #[target_feature(enable = "avx2")]
1727 #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))]
1728 #[rustc_args_required_const(2)]
1729 #[stable(feature = "simd_x86", since = "1.27.0")]
1730 pub unsafe fn _mm_i64gather_epi64(slice: *const i64, offsets: __m128i, scale: i32) -> __m128i {
1731 let zero = _mm_setzero_si128().as_i64x2();
1732 let neg_one = _mm_set1_epi64x(-1).as_i64x2();
1733 let slice = slice as *const i8;
1734 let offsets = offsets.as_i64x2();
1735 macro_rules! call {
1736 ($imm8:expr) => {
1737 pgatherqq(zero, slice, offsets, neg_one, $imm8)
1738 };
1739 }
1740 let r = constify_imm8!(scale, call);
1741 transmute(r)
1742 }
1743
1744 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1745 /// where
1746 /// `scale` should be 1, 2, 4 or 8. If the highest bit is not set in the
1747 /// corresponding element of `mask`, the element from `src` is used instead.
1748 ///
1749 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64gather_epi64)
1750 #[inline]
1751 #[target_feature(enable = "avx2")]
1752 #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))]
1753 #[rustc_args_required_const(4)]
1754 #[stable(feature = "simd_x86", since = "1.27.0")]
1755 pub unsafe fn _mm_mask_i64gather_epi64(
1756 src: __m128i,
1757 slice: *const i64,
1758 offsets: __m128i,
1759 mask: __m128i,
1760 scale: i32,
1761 ) -> __m128i {
1762 let src = src.as_i64x2();
1763 let mask = mask.as_i64x2();
1764 let offsets = offsets.as_i64x2();
1765 let slice = slice as *const i8;
1766 macro_rules! call {
1767 ($imm8:expr) => {
1768 pgatherqq(src, slice, offsets, mask, $imm8)
1769 };
1770 }
1771 let r = constify_imm8!(scale, call);
1772 transmute(r)
1773 }
1774
1775 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1776 /// where
1777 /// `scale` should be 1, 2, 4 or 8.
1778 ///
1779 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64gather_epi64)
1780 #[inline]
1781 #[target_feature(enable = "avx2")]
1782 #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))]
1783 #[rustc_args_required_const(2)]
1784 #[stable(feature = "simd_x86", since = "1.27.0")]
1785 pub unsafe fn _mm256_i64gather_epi64(slice: *const i64, offsets: __m256i, scale: i32) -> __m256i {
1786 let zero = _mm256_setzero_si256().as_i64x4();
1787 let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
1788 let slice = slice as *const i8;
1789 let offsets = offsets.as_i64x4();
1790 macro_rules! call {
1791 ($imm8:expr) => {
1792 vpgatherqq(zero, slice, offsets, neg_one, $imm8)
1793 };
1794 }
1795 let r = constify_imm8!(scale, call);
1796 transmute(r)
1797 }
1798
1799 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1800 /// where
1801 /// `scale` should be 1, 2, 4 or 8. If the highest bit is not set in the
1802 /// corresponding element of `mask`, the element from `src` is used instead.
1803 ///
1804 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64gather_epi64)
1805 #[inline]
1806 #[target_feature(enable = "avx2")]
1807 #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))]
1808 #[rustc_args_required_const(4)]
1809 #[stable(feature = "simd_x86", since = "1.27.0")]
1810 pub unsafe fn _mm256_mask_i64gather_epi64(
1811 src: __m256i,
1812 slice: *const i64,
1813 offsets: __m256i,
1814 mask: __m256i,
1815 scale: i32,
1816 ) -> __m256i {
1817 let src = src.as_i64x4();
1818 let mask = mask.as_i64x4();
1819 let offsets = offsets.as_i64x4();
1820 let slice = slice as *const i8;
1821 macro_rules! call {
1822 ($imm8:expr) => {
1823 vpgatherqq(src, slice, offsets, mask, $imm8)
1824 };
1825 }
1826 let r = constify_imm8!(scale, call);
1827 transmute(r)
1828 }
1829
1830 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1831 /// where
1832 /// `scale` should be 1, 2, 4 or 8.
1833 ///
1834 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64gather_pd)
1835 #[inline]
1836 #[target_feature(enable = "avx2")]
1837 #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))]
1838 #[rustc_args_required_const(2)]
1839 #[stable(feature = "simd_x86", since = "1.27.0")]
1840 pub unsafe fn _mm_i64gather_pd(slice: *const f64, offsets: __m128i, scale: i32) -> __m128d {
1841 let zero = _mm_setzero_pd();
1842 let neg_one = _mm_set1_pd(-1.0);
1843 let slice = slice as *const i8;
1844 let offsets = offsets.as_i64x2();
1845 macro_rules! call {
1846 ($imm8:expr) => {
1847 pgatherqpd(zero, slice, offsets, neg_one, $imm8)
1848 };
1849 }
1850 constify_imm8!(scale, call)
1851 }
1852
1853 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1854 /// where
1855 /// `scale` should be 1, 2, 4 or 8. If the highest bit is not set in the
1856 /// corresponding element of `mask`, the element from `src` is used instead.
1857 ///
1858 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64gather_pd)
1859 #[inline]
1860 #[target_feature(enable = "avx2")]
1861 #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))]
1862 #[rustc_args_required_const(4)]
1863 #[stable(feature = "simd_x86", since = "1.27.0")]
1864 pub unsafe fn _mm_mask_i64gather_pd(
1865 src: __m128d,
1866 slice: *const f64,
1867 offsets: __m128i,
1868 mask: __m128d,
1869 scale: i32,
1870 ) -> __m128d {
1871 let slice = slice as *const i8;
1872 let offsets = offsets.as_i64x2();
1873 macro_rules! call {
1874 ($imm8:expr) => {
1875 pgatherqpd(src, slice, offsets, mask, $imm8)
1876 };
1877 }
1878 constify_imm8!(scale, call)
1879 }
1880
1881 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1882 /// where
1883 /// `scale` should be 1, 2, 4 or 8.
1884 ///
1885 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64gather_pd)
1886 #[inline]
1887 #[target_feature(enable = "avx2")]
1888 #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))]
1889 #[rustc_args_required_const(2)]
1890 #[stable(feature = "simd_x86", since = "1.27.0")]
1891 pub unsafe fn _mm256_i64gather_pd(slice: *const f64, offsets: __m256i, scale: i32) -> __m256d {
1892 let zero = _mm256_setzero_pd();
1893 let neg_one = _mm256_set1_pd(-1.0);
1894 let slice = slice as *const i8;
1895 let offsets = offsets.as_i64x4();
1896 macro_rules! call {
1897 ($imm8:expr) => {
1898 vpgatherqpd(zero, slice, offsets, neg_one, $imm8)
1899 };
1900 }
1901 constify_imm8!(scale, call)
1902 }
1903
1904 /// Returns values from `slice` at offsets determined by `offsets * scale`,
1905 /// where
1906 /// `scale` should be 1, 2, 4 or 8. If the highest bit is not set in the
1907 /// corresponding element of `mask`, the element from `src` is used instead.
1908 ///
1909 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64gather_pd)
1910 #[inline]
1911 #[target_feature(enable = "avx2")]
1912 #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))]
1913 #[rustc_args_required_const(4)]
1914 #[stable(feature = "simd_x86", since = "1.27.0")]
1915 pub unsafe fn _mm256_mask_i64gather_pd(
1916 src: __m256d,
1917 slice: *const f64,
1918 offsets: __m256i,
1919 mask: __m256d,
1920 scale: i32,
1921 ) -> __m256d {
1922 let slice = slice as *const i8;
1923 let offsets = offsets.as_i64x4();
1924 macro_rules! call {
1925 ($imm8:expr) => {
1926 vpgatherqpd(src, slice, offsets, mask, $imm8)
1927 };
1928 }
1929 constify_imm8!(scale, call)
1930 }
1931
1932 /// Copies `a` to `dst`, then inserts 128 bits (of integer data) from `b` at the
1933 /// location specified by `imm8`.
1934 ///
1935 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_inserti128_si256)
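///
/// A minimal usage sketch with illustrative values (assumes `avx2` is
/// detected at runtime; `imm8 = 1` replaces the upper 128-bit half):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_setr_epi64x(0, 1, 2, 3);
/// let b = _mm_set_epi64x(8, 7); // lanes [7, 8] in memory order
/// let r = _mm256_inserti128_si256(a, b, 1);
/// let e = _mm256_setr_epi64x(0, 1, 7, 8);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, e)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```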
1936 #[inline]
1937 #[target_feature(enable = "avx2")]
1938 #[cfg_attr(
1939 all(test, not(target_os = "windows")),
1940 assert_instr(vinsertf128, imm8 = 1)
1941 )]
1942 #[rustc_args_required_const(2)]
1943 #[stable(feature = "simd_x86", since = "1.27.0")]
1944 pub unsafe fn _mm256_inserti128_si256(a: __m256i, b: __m128i, imm8: i32) -> __m256i {
1945 let a = a.as_i64x4();
1946 let b = _mm256_castsi128_si256(b).as_i64x4();
1947 let dst: i64x4 = match imm8 & 0b01 {
1948 0 => simd_shuffle4(a, b, [4, 5, 2, 3]),
1949 _ => simd_shuffle4(a, b, [0, 1, 4, 5]),
1950 };
1951 transmute(dst)
1952 }
1953
1954 /// Multiplies packed signed 16-bit integers in `a` and `b`, producing
1955 /// intermediate signed 32-bit integers. Horizontally adds adjacent pairs
1956 /// of intermediate 32-bit integers.
1957 ///
1958 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_madd_epi16)
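///
/// A minimal usage sketch with illustrative values (assumes `avx2` is
/// detected at runtime; each 32-bit lane is the sum of one multiplied pair):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_set1_epi16(2);
/// let b = _mm256_setr_epi16(
///     1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
/// );
/// // First lane: 2 * 1 + 2 * 2 = 6, and so on for each pair.
/// let r = _mm256_madd_epi16(a, b);
/// let e = _mm256_setr_epi32(6, 14, 22, 30, 38, 46, 54, 62);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, e)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```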
1959 #[inline]
1960 #[target_feature(enable = "avx2")]
1961 #[cfg_attr(test, assert_instr(vpmaddwd))]
1962 #[stable(feature = "simd_x86", since = "1.27.0")]
1963 pub unsafe fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i {
1964 transmute(pmaddwd(a.as_i16x16(), b.as_i16x16()))
1965 }
1966
1967 /// Vertically multiplies each unsigned 8-bit integer from `a` with the
1968 /// corresponding signed 8-bit integer from `b`, producing intermediate
1969 /// signed 16-bit integers. Horizontally adds adjacent pairs of intermediate
1970 /// signed 16-bit integers.
1971 ///
1972 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maddubs_epi16)
1973 #[inline]
1974 #[target_feature(enable = "avx2")]
1975 #[cfg_attr(test, assert_instr(vpmaddubsw))]
1976 #[stable(feature = "simd_x86", since = "1.27.0")]
1977 pub unsafe fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i {
1978 transmute(pmaddubsw(a.as_u8x32(), b.as_u8x32()))
1979 }
1980
1981 /// Loads packed 32-bit integers from memory pointed by `mem_addr` using `mask`
1982 /// (elements are zeroed out when the highest bit is not set in the
1983 /// corresponding element).
1984 ///
1985 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskload_epi32)
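///
/// A minimal usage sketch with illustrative values (assumes `avx2` is
/// detected at runtime; lanes with a clear mask bit read as zero):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let data = [1_i32, 2, 3, 4];
/// // Only lanes whose mask element has its highest bit set are loaded.
/// let mask = _mm_setr_epi32(-1, 0, -1, 0);
/// let r = _mm_maskload_epi32(data.as_ptr(), mask);
/// let e = _mm_setr_epi32(1, 0, 3, 0);
/// assert_eq!(_mm_movemask_epi8(_mm_cmpeq_epi32(r, e)), 0xFFFF);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```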
1986 #[inline]
1987 #[target_feature(enable = "avx2")]
1988 #[cfg_attr(test, assert_instr(vpmaskmovd))]
1989 #[stable(feature = "simd_x86", since = "1.27.0")]
1990 pub unsafe fn _mm_maskload_epi32(mem_addr: *const i32, mask: __m128i) -> __m128i {
1991 transmute(maskloadd(mem_addr as *const i8, mask.as_i32x4()))
1992 }
1993
1994 /// Loads packed 32-bit integers from memory pointed by `mem_addr` using `mask`
1995 /// (elements are zeroed out when the highest bit is not set in the
1996 /// corresponding element).
1997 ///
1998 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskload_epi32)
1999 #[inline]
2000 #[target_feature(enable = "avx2")]
2001 #[cfg_attr(test, assert_instr(vpmaskmovd))]
2002 #[stable(feature = "simd_x86", since = "1.27.0")]
2003 pub unsafe fn _mm256_maskload_epi32(mem_addr: *const i32, mask: __m256i) -> __m256i {
2004 transmute(maskloadd256(mem_addr as *const i8, mask.as_i32x8()))
2005 }
2006
2007 /// Loads packed 64-bit integers from memory pointed by `mem_addr` using `mask`
2008 /// (elements are zeroed out when the highest bit is not set in the
2009 /// corresponding element).
2010 ///
2011 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskload_epi64)
2012 #[inline]
2013 #[target_feature(enable = "avx2")]
2014 #[cfg_attr(test, assert_instr(vpmaskmovq))]
2015 #[stable(feature = "simd_x86", since = "1.27.0")]
2016 pub unsafe fn _mm_maskload_epi64(mem_addr: *const i64, mask: __m128i) -> __m128i {
2017 transmute(maskloadq(mem_addr as *const i8, mask.as_i64x2()))
2018 }
2019
2020 /// Loads packed 64-bit integers from memory pointed by `mem_addr` using `mask`
2021 /// (elements are zeroed out when the highest bit is not set in the
2022 /// corresponding element).
2023 ///
2024 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskload_epi64)
2025 #[inline]
2026 #[target_feature(enable = "avx2")]
2027 #[cfg_attr(test, assert_instr(vpmaskmovq))]
2028 #[stable(feature = "simd_x86", since = "1.27.0")]
2029 pub unsafe fn _mm256_maskload_epi64(mem_addr: *const i64, mask: __m256i) -> __m256i {
2030 transmute(maskloadq256(mem_addr as *const i8, mask.as_i64x4()))
2031 }
2032
2033 /// Stores packed 32-bit integers from `a` into memory pointed by `mem_addr`
2034 /// using `mask` (elements are not stored when the highest bit is not set
2035 /// in the corresponding element).
2036 ///
2037 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskstore_epi32)
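///
/// A minimal usage sketch with illustrative values (assumes `avx2` is
/// detected at runtime; lanes with a clear mask bit are left untouched):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let mut out = [0_i32; 4];
/// let a = _mm_setr_epi32(1, 2, 3, 4);
/// // Only lanes whose mask element has its highest bit set are written.
/// let mask = _mm_setr_epi32(-1, 0, 0, -1);
/// _mm_maskstore_epi32(out.as_mut_ptr(), mask, a);
/// assert_eq!(out, [1, 0, 0, 4]);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```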
2038 #[inline]
2039 #[target_feature(enable = "avx2")]
2040 #[cfg_attr(test, assert_instr(vpmaskmovd))]
2041 #[stable(feature = "simd_x86", since = "1.27.0")]
2042 pub unsafe fn _mm_maskstore_epi32(mem_addr: *mut i32, mask: __m128i, a: __m128i) {
2043 maskstored(mem_addr as *mut i8, mask.as_i32x4(), a.as_i32x4())
2044 }
2045
2046 /// Stores packed 32-bit integers from `a` into memory pointed by `mem_addr`
2047 /// using `mask` (elements are not stored when the highest bit is not set
2048 /// in the corresponding element).
2049 ///
2050 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskstore_epi32)
2051 #[inline]
2052 #[target_feature(enable = "avx2")]
2053 #[cfg_attr(test, assert_instr(vpmaskmovd))]
2054 #[stable(feature = "simd_x86", since = "1.27.0")]
2055 pub unsafe fn _mm256_maskstore_epi32(mem_addr: *mut i32, mask: __m256i, a: __m256i) {
2056 maskstored256(mem_addr as *mut i8, mask.as_i32x8(), a.as_i32x8())
2057 }
2058
2059 /// Stores packed 64-bit integers from `a` into memory pointed by `mem_addr`
2060 /// using `mask` (elements are not stored when the highest bit is not set
2061 /// in the corresponding element).
2062 ///
2063 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskstore_epi64)
2064 #[inline]
2065 #[target_feature(enable = "avx2")]
2066 #[cfg_attr(test, assert_instr(vpmaskmovq))]
2067 #[stable(feature = "simd_x86", since = "1.27.0")]
2068 pub unsafe fn _mm_maskstore_epi64(mem_addr: *mut i64, mask: __m128i, a: __m128i) {
2069 maskstoreq(mem_addr as *mut i8, mask.as_i64x2(), a.as_i64x2())
2070 }
2071
2072 /// Stores packed 64-bit integers from `a` into memory pointed by `mem_addr`
2073 /// using `mask` (elements are not stored when the highest bit is not set
2074 /// in the corresponding element).
2075 ///
2076 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskstore_epi64)
2077 #[inline]
2078 #[target_feature(enable = "avx2")]
2079 #[cfg_attr(test, assert_instr(vpmaskmovq))]
2080 #[stable(feature = "simd_x86", since = "1.27.0")]
2081 pub unsafe fn _mm256_maskstore_epi64(mem_addr: *mut i64, mask: __m256i, a: __m256i) {
2082 maskstoreq256(mem_addr as *mut i8, mask.as_i64x4(), a.as_i64x4())
2083 }
2084
2085 /// Compares packed 16-bit integers in `a` and `b`, and returns the packed
2086 /// maximum values.
2087 ///
2088 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epi16)
2089 #[inline]
2090 #[target_feature(enable = "avx2")]
2091 #[cfg_attr(test, assert_instr(vpmaxsw))]
2092 #[stable(feature = "simd_x86", since = "1.27.0")]
2093 pub unsafe fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i {
2094 transmute(pmaxsw(a.as_i16x16(), b.as_i16x16()))
2095 }
2096
2097 /// Compares packed 32-bit integers in `a` and `b`, and returns the packed
2098 /// maximum values.
2099 ///
2100 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epi32)
2101 #[inline]
2102 #[target_feature(enable = "avx2")]
2103 #[cfg_attr(test, assert_instr(vpmaxsd))]
2104 #[stable(feature = "simd_x86", since = "1.27.0")]
2105 pub unsafe fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i {
2106 transmute(pmaxsd(a.as_i32x8(), b.as_i32x8()))
2107 }
2108
2109 /// Compares packed 8-bit integers in `a` and `b`, and returns the packed
2110 /// maximum values.
2111 ///
2112 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epi8)
2113 #[inline]
2114 #[target_feature(enable = "avx2")]
2115 #[cfg_attr(test, assert_instr(vpmaxsb))]
2116 #[stable(feature = "simd_x86", since = "1.27.0")]
2117 pub unsafe fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i {
2118 transmute(pmaxsb(a.as_i8x32(), b.as_i8x32()))
2119 }
2120
2121 /// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
2122 /// the packed maximum values.
2123 ///
2124 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epu16)
2125 #[inline]
2126 #[target_feature(enable = "avx2")]
2127 #[cfg_attr(test, assert_instr(vpmaxuw))]
2128 #[stable(feature = "simd_x86", since = "1.27.0")]
2129 pub unsafe fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i {
2130 transmute(pmaxuw(a.as_u16x16(), b.as_u16x16()))
2131 }
2132
2133 /// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
2134 /// the packed maximum values.
2135 ///
2136 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epu32)
2137 #[inline]
2138 #[target_feature(enable = "avx2")]
2139 #[cfg_attr(test, assert_instr(vpmaxud))]
2140 #[stable(feature = "simd_x86", since = "1.27.0")]
2141 pub unsafe fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i {
2142 transmute(pmaxud(a.as_u32x8(), b.as_u32x8()))
2143 }
2144
2145 /// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
2146 /// the packed maximum values.
2147 ///
2148 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epu8)
2149 #[inline]
2150 #[target_feature(enable = "avx2")]
2151 #[cfg_attr(test, assert_instr(vpmaxub))]
2152 #[stable(feature = "simd_x86", since = "1.27.0")]
2153 pub unsafe fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i {
2154 transmute(pmaxub(a.as_u8x32(), b.as_u8x32()))
2155 }
2156
2157 /// Compares packed 16-bit integers in `a` and `b`, and returns the packed
2158 /// minimum values.
2159 ///
2160 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epi16)
2161 #[inline]
2162 #[target_feature(enable = "avx2")]
2163 #[cfg_attr(test, assert_instr(vpminsw))]
2164 #[stable(feature = "simd_x86", since = "1.27.0")]
2165 pub unsafe fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i {
2166 transmute(pminsw(a.as_i16x16(), b.as_i16x16()))
2167 }
2168
2169 /// Compares packed 32-bit integers in `a` and `b`, and returns the packed
2170 /// minimum values.
2171 ///
2172 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epi32)
2173 #[inline]
2174 #[target_feature(enable = "avx2")]
2175 #[cfg_attr(test, assert_instr(vpminsd))]
2176 #[stable(feature = "simd_x86", since = "1.27.0")]
2177 pub unsafe fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i {
2178 transmute(pminsd(a.as_i32x8(), b.as_i32x8()))
2179 }
2180
2181 /// Compares packed 8-bit integers in `a` and `b`, and returns the packed
2182 /// minimum values.
2183 ///
2184 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epi8)
2185 #[inline]
2186 #[target_feature(enable = "avx2")]
2187 #[cfg_attr(test, assert_instr(vpminsb))]
2188 #[stable(feature = "simd_x86", since = "1.27.0")]
2189 pub unsafe fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i {
2190 transmute(pminsb(a.as_i8x32(), b.as_i8x32()))
2191 }
2192
2193 /// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
2194 /// the packed minimum values.
2195 ///
2196 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epu16)
2197 #[inline]
2198 #[target_feature(enable = "avx2")]
2199 #[cfg_attr(test, assert_instr(vpminuw))]
2200 #[stable(feature = "simd_x86", since = "1.27.0")]
2201 pub unsafe fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i {
2202 transmute(pminuw(a.as_u16x16(), b.as_u16x16()))
2203 }
2204
2205 /// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
2206 /// the packed minimum values.
2207 ///
2208 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epu32)
2209 #[inline]
2210 #[target_feature(enable = "avx2")]
2211 #[cfg_attr(test, assert_instr(vpminud))]
2212 #[stable(feature = "simd_x86", since = "1.27.0")]
2213 pub unsafe fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i {
2214 transmute(pminud(a.as_u32x8(), b.as_u32x8()))
2215 }
2216
2217 /// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
2218 /// the packed minimum values.
2219 ///
2220 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epu8)
2221 #[inline]
2222 #[target_feature(enable = "avx2")]
2223 #[cfg_attr(test, assert_instr(vpminub))]
2224 #[stable(feature = "simd_x86", since = "1.27.0")]
2225 pub unsafe fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i {
2226 transmute(pminub(a.as_u8x32(), b.as_u8x32()))
2227 }
2228
2229 /// Creates a mask from the most significant bit of each 8-bit element in `a`,
2230 /// and returns the result.
2231 ///
2232 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_movemask_epi8)
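///
/// A minimal usage sketch with illustrative values (assumes `avx2` is
/// detected at runtime; bit `i` of the result is the sign bit of byte `i`):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_setr_epi8(
///     -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
///     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1,
/// );
/// // Bytes 0 and 31 have their most significant bit set.
/// assert_eq!(_mm256_movemask_epi8(a) as u32, 0x8000_0001);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```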
2233 #[inline]
2234 #[target_feature(enable = "avx2")]
2235 #[cfg_attr(test, assert_instr(vpmovmskb))]
2236 #[stable(feature = "simd_x86", since = "1.27.0")]
2237 pub unsafe fn _mm256_movemask_epi8(a: __m256i) -> i32 {
2238 pmovmskb(a.as_i8x32())
2239 }
2240
2241 /// Computes the sum of absolute differences (SADs) of quadruplets of unsigned
2242 /// 8-bit integers in `a` compared to those in `b`, and stores the 16-bit
2243 /// results in dst. Eight SADs are performed for each 128-bit lane using one
2244 /// quadruplet from `b` and eight quadruplets from `a`. One quadruplet is
2245 /// selected from `b` starting at the offset specified in `imm8`. Eight
2246 /// quadruplets are formed from sequential 8-bit integers selected from `a`
2247 /// starting at the offset specified in `imm8`.
2248 ///
2249 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mpsadbw_epu8)
2250 #[inline]
2251 #[target_feature(enable = "avx2")]
2252 #[cfg_attr(test, assert_instr(vmpsadbw, imm8 = 0))]
2253 #[rustc_args_required_const(2)]
2254 #[stable(feature = "simd_x86", since = "1.27.0")]
2255 pub unsafe fn _mm256_mpsadbw_epu8(a: __m256i, b: __m256i, imm8: i32) -> __m256i {
2256 let a = a.as_u8x32();
2257 let b = b.as_u8x32();
2258 macro_rules! call {
2259 ($imm8:expr) => {
2260 mpsadbw(a, b, $imm8)
2261 };
2262 }
2263 let r = constify_imm8!(imm8, call);
2264 transmute(r)
2265 }
2266
2267 /// Multiplies the low 32-bit integers from each packed 64-bit element in
2268 /// `a` and `b`
2269 ///
2270 /// Returns the 64-bit results.
2271 ///
2272 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_epi32)
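///
/// A minimal usage sketch with illustrative values (assumes `avx2` is
/// detected at runtime; only the even-indexed 32-bit lanes take part):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_setr_epi32(1, 0, -2, 0, 3, 0, -4, 0);
/// let b = _mm256_set1_epi32(10);
/// // The low 32 bits of each 64-bit element are sign-extended and multiplied.
/// let r = _mm256_mul_epi32(a, b);
/// let e = _mm256_setr_epi64x(10, -20, 30, -40);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi64(r, e)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```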
2273 #[inline]
2274 #[target_feature(enable = "avx2")]
2275 #[cfg_attr(test, assert_instr(vpmuldq))]
2276 #[stable(feature = "simd_x86", since = "1.27.0")]
2277 pub unsafe fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i {
2278 transmute(pmuldq(a.as_i32x8(), b.as_i32x8()))
2279 }
2280
2281 /// Multiplies the low unsigned 32-bit integers from each packed 64-bit
2282 /// element in `a` and `b`
2283 ///
2284 /// Returns the unsigned 64-bit results.
2285 ///
2286 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_epu32)
2287 #[inline]
2288 #[target_feature(enable = "avx2")]
2289 #[cfg_attr(test, assert_instr(vpmuludq))]
2290 #[stable(feature = "simd_x86", since = "1.27.0")]
2291 pub unsafe fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i {
2292 transmute(pmuludq(a.as_u32x8(), b.as_u32x8()))
2293 }
2294
2295 /// Multiplies the packed 16-bit integers in `a` and `b`, producing
2296 /// intermediate 32-bit integers and returning the high 16 bits of the
2297 /// intermediate integers.
2298 ///
2299 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mulhi_epi16)
2300 #[inline]
2301 #[target_feature(enable = "avx2")]
2302 #[cfg_attr(test, assert_instr(vpmulhw))]
2303 #[stable(feature = "simd_x86", since = "1.27.0")]
2304 pub unsafe fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i {
2305 transmute(pmulhw(a.as_i16x16(), b.as_i16x16()))
2306 }
2307
2308 /// Multiplies the packed unsigned 16-bit integers in `a` and `b`, producing
2309 /// intermediate 32-bit integers and returning the high 16 bits of the
2310 /// intermediate integers.
2311 ///
2312 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mulhi_epu16)
2313 #[inline]
2314 #[target_feature(enable = "avx2")]
2315 #[cfg_attr(test, assert_instr(vpmulhuw))]
2316 #[stable(feature = "simd_x86", since = "1.27.0")]
2317 pub unsafe fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i {
2318 transmute(pmulhuw(a.as_u16x16(), b.as_u16x16()))
2319 }
2320
2321 /// Multiplies the packed 16-bit integers in `a` and `b`, producing
2322 /// intermediate 32-bit integers, and returns the low 16 bits of the
2323 /// intermediate integers
2324 ///
2325 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mullo_epi16)
2326 #[inline]
2327 #[target_feature(enable = "avx2")]
2328 #[cfg_attr(test, assert_instr(vpmullw))]
2329 #[stable(feature = "simd_x86", since = "1.27.0")]
2330 pub unsafe fn _mm256_mullo_epi16(a: __m256i, b: __m256i) -> __m256i {
2331 transmute(simd_mul(a.as_i16x16(), b.as_i16x16()))
2332 }
2333
2334 /// Multiplies the packed 32-bit integers in `a` and `b`, producing
2335 /// intermediate 64-bit integers, and returns the low 32 bits of the
2336 /// intermediate integers
2337 ///
2338 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mullo_epi32)
2339 #[inline]
2340 #[target_feature(enable = "avx2")]
2341 #[cfg_attr(test, assert_instr(vpmulld))]
2342 #[stable(feature = "simd_x86", since = "1.27.0")]
2343 pub unsafe fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i {
2344 transmute(simd_mul(a.as_i32x8(), b.as_i32x8()))
2345 }
2346
2347 /// Multiplies packed 16-bit integers in `a` and `b`, producing
2348 /// intermediate signed 32-bit integers. Truncate each intermediate
2349 /// integer to the 18 most significant bits, round by adding 1, and
2350 /// return bits `[16:1]`.
2351 ///
2352 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mulhrs_epi16)
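///
/// A minimal usage sketch reading the inputs as Q15 fixed-point values (an
/// illustrative interpretation, assuming `avx2` is detected at runtime):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// // In Q15, 16384 is 0.5; the rounded product 0.5 * 0.5 = 0.25 is 8192.
/// let a = _mm256_set1_epi16(16384);
/// let b = _mm256_set1_epi16(16384);
/// let r = _mm256_mulhrs_epi16(a, b);
/// let e = _mm256_set1_epi16(8192);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi16(r, e)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```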
2353 #[inline]
2354 #[target_feature(enable = "avx2")]
2355 #[cfg_attr(test, assert_instr(vpmulhrsw))]
2356 #[stable(feature = "simd_x86", since = "1.27.0")]
2357 pub unsafe fn _mm256_mulhrs_epi16(a: __m256i, b: __m256i) -> __m256i {
2358 transmute(pmulhrsw(a.as_i16x16(), b.as_i16x16()))
2359 }
2360
2361 /// Computes the bitwise OR of 256 bits (representing integer data) in `a`
2362 /// and `b`
2363 ///
2364 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_or_si256)
2365 #[inline]
2366 #[target_feature(enable = "avx2")]
2367 #[cfg_attr(test, assert_instr(vorps))]
2368 #[stable(feature = "simd_x86", since = "1.27.0")]
2369 pub unsafe fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i {
2370 transmute(simd_or(a.as_i32x8(), b.as_i32x8()))
2371 }
2372
2373 /// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
2374 /// using signed saturation
2375 ///
2376 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_packs_epi16)
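///
/// A minimal usage sketch with illustrative values (assumes `avx2` is
/// detected at runtime; note the per-128-bit-lane interleaving of `a` and `b`):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_set1_epi16(300);
/// let b = _mm256_set1_epi16(-300);
/// // 300 saturates to 127 and -300 to -128; each 128-bit lane holds eight
/// // bytes from `a` followed by eight bytes from `b`.
/// let r = _mm256_packs_epi16(a, b);
/// let e = _mm256_setr_epi8(
///     127, 127, 127, 127, 127, 127, 127, 127,
///     -128, -128, -128, -128, -128, -128, -128, -128,
///     127, 127, 127, 127, 127, 127, 127, 127,
///     -128, -128, -128, -128, -128, -128, -128, -128,
/// );
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, e)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```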
2377 #[inline]
2378 #[target_feature(enable = "avx2")]
2379 #[cfg_attr(test, assert_instr(vpacksswb))]
2380 #[stable(feature = "simd_x86", since = "1.27.0")]
2381 pub unsafe fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i {
2382 transmute(packsswb(a.as_i16x16(), b.as_i16x16()))
2383 }
2384
2385 /// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
2386 /// using signed saturation
2387 ///
2388 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_packs_epi32)
2389 #[inline]
2390 #[target_feature(enable = "avx2")]
2391 #[cfg_attr(test, assert_instr(vpackssdw))]
2392 #[stable(feature = "simd_x86", since = "1.27.0")]
2393 pub unsafe fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i {
2394 transmute(packssdw(a.as_i32x8(), b.as_i32x8()))
2395 }
2396
2397 /// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
2398 /// using unsigned saturation
2399 ///
2400 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_packus_epi16)
2401 #[inline]
2402 #[target_feature(enable = "avx2")]
2403 #[cfg_attr(test, assert_instr(vpackuswb))]
2404 #[stable(feature = "simd_x86", since = "1.27.0")]
2405 pub unsafe fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i {
2406 transmute(packuswb(a.as_i16x16(), b.as_i16x16()))
2407 }
2408
2409 /// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
2410 /// using unsigned saturation
2411 ///
2412 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_packus_epi32)
2413 #[inline]
2414 #[target_feature(enable = "avx2")]
2415 #[cfg_attr(test, assert_instr(vpackusdw))]
2416 #[stable(feature = "simd_x86", since = "1.27.0")]
2417 pub unsafe fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i {
2418 transmute(packusdw(a.as_i32x8(), b.as_i32x8()))
2419 }
2420
2421 /// Permutes packed 32-bit integers from `a` according to the content of `b`.
2422 ///
2423 /// The lowest 3 bits of each integer of `b` are used as addresses into the 8
2424 /// integers of `a`.
2425 ///
2426 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutevar8x32_epi32)
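///
/// A minimal usage sketch with illustrative values (assumes `avx2` is
/// detected at runtime; the index vector here reverses the eight lanes):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_setr_epi32(10, 11, 12, 13, 14, 15, 16, 17);
/// // Each result lane picks the element of `a` indexed by the low 3 bits of `b`.
/// let b = _mm256_setr_epi32(7, 6, 5, 4, 3, 2, 1, 0);
/// let r = _mm256_permutevar8x32_epi32(a, b);
/// let e = _mm256_setr_epi32(17, 16, 15, 14, 13, 12, 11, 10);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, e)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```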
2427 #[inline]
2428 #[target_feature(enable = "avx2")]
2429 #[cfg_attr(test, assert_instr(vpermps))]
2430 #[stable(feature = "simd_x86", since = "1.27.0")]
2431 pub unsafe fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i {
2432 transmute(permd(a.as_u32x8(), b.as_u32x8()))
2433 }
2434
2435 /// Permutes 64-bit integers from `a` using control mask `imm8`.
2436 ///
2437 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute4x64_epi64)
2438 #[inline]
2439 #[target_feature(enable = "avx2")]
2440 #[cfg_attr(test, assert_instr(vpermpd, imm8 = 9))]
2441 #[rustc_args_required_const(1)]
2442 #[stable(feature = "simd_x86", since = "1.27.0")]
2443 pub unsafe fn _mm256_permute4x64_epi64(a: __m256i, imm8: i32) -> __m256i {
2444 let imm8 = (imm8 & 0xFF) as u8;
2445 let zero = _mm256_setzero_si256().as_i64x4();
2446 let a = a.as_i64x4();
2447 macro_rules! permute4 {
2448 ($a:expr, $b:expr, $c:expr, $d:expr) => {
2449 simd_shuffle4(a, zero, [$a, $b, $c, $d])
2450 };
2451 }
2452 macro_rules! permute3 {
2453 ($a:expr, $b:expr, $c:expr) => {
2454 match (imm8 >> 6) & 0b11 {
2455 0b00 => permute4!($a, $b, $c, 0),
2456 0b01 => permute4!($a, $b, $c, 1),
2457 0b10 => permute4!($a, $b, $c, 2),
2458 _ => permute4!($a, $b, $c, 3),
2459 }
2460 };
2461 }
2462 macro_rules! permute2 {
2463 ($a:expr, $b:expr) => {
2464 match (imm8 >> 4) & 0b11 {
2465 0b00 => permute3!($a, $b, 0),
2466 0b01 => permute3!($a, $b, 1),
2467 0b10 => permute3!($a, $b, 2),
2468 _ => permute3!($a, $b, 3),
2469 }
2470 };
2471 }
2472 macro_rules! permute1 {
2473 ($a:expr) => {
2474 match (imm8 >> 2) & 0b11 {
2475 0b00 => permute2!($a, 0),
2476 0b01 => permute2!($a, 1),
2477 0b10 => permute2!($a, 2),
2478 _ => permute2!($a, 3),
2479 }
2480 };
2481 }
2482 let r: i64x4 = match imm8 & 0b11 {
2483 0b00 => permute1!(0),
2484 0b01 => permute1!(1),
2485 0b10 => permute1!(2),
2486 _ => permute1!(3),
2487 };
2488 transmute(r)
2489 }
2490
2491 /// Shuffles 128 bits of integer data selected by `imm8` from `a` and `b`.
2492 ///
2493 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute2x128_si256)
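///
/// A minimal usage sketch with illustrative values (assumes `avx2` is
/// detected at runtime; `0x20` combines the low halves of `a` and `b`):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_setr_epi64x(10, 11, 12, 13);
/// let b = _mm256_setr_epi64x(20, 21, 22, 23);
/// // Low nibble 0 selects the low half of `a`, high nibble 2 the low half of `b`.
/// let r = _mm256_permute2x128_si256(a, b, 0x20);
/// let e = _mm256_setr_epi64x(10, 11, 20, 21);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi64(r, e)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```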
2494 #[inline]
2495 #[target_feature(enable = "avx2")]
2496 #[cfg_attr(test, assert_instr(vperm2f128, imm8 = 9))]
2497 #[rustc_args_required_const(2)]
2498 #[stable(feature = "simd_x86", since = "1.27.0")]
2499 pub unsafe fn _mm256_permute2x128_si256(a: __m256i, b: __m256i, imm8: i32) -> __m256i {
2500 let a = a.as_i64x4();
2501 let b = b.as_i64x4();
2502 macro_rules! call {
2503 ($imm8:expr) => {
2504 vperm2i128(a, b, $imm8)
2505 };
2506 }
2507 transmute(constify_imm8!(imm8, call))
2508 }
2509
2510 /// Shuffles 64-bit floating-point elements in `a` across lanes using the
2511 /// control in `imm8`.
2512 ///
2513 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute4x64_pd)
2514 #[inline]
2515 #[target_feature(enable = "avx2")]
2516 #[cfg_attr(test, assert_instr(vpermpd, imm8 = 1))]
2517 #[rustc_args_required_const(1)]
2518 #[stable(feature = "simd_x86", since = "1.27.0")]
2519 pub unsafe fn _mm256_permute4x64_pd(a: __m256d, imm8: i32) -> __m256d {
2520 let imm8 = (imm8 & 0xFF) as u8;
2521 let undef = _mm256_undefined_pd();
2522 macro_rules! shuffle_done {
2523 ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
2524 simd_shuffle4(a, undef, [$x01, $x23, $x45, $x67])
2525 };
2526 }
2527 macro_rules! shuffle_x67 {
2528 ($x01:expr, $x23:expr, $x45:expr) => {
2529 match (imm8 >> 6) & 0b11 {
2530 0b00 => shuffle_done!($x01, $x23, $x45, 0),
2531 0b01 => shuffle_done!($x01, $x23, $x45, 1),
2532 0b10 => shuffle_done!($x01, $x23, $x45, 2),
2533 _ => shuffle_done!($x01, $x23, $x45, 3),
2534 }
2535 };
2536 }
2537 macro_rules! shuffle_x45 {
2538 ($x01:expr, $x23:expr) => {
2539 match (imm8 >> 4) & 0b11 {
2540 0b00 => shuffle_x67!($x01, $x23, 0),
2541 0b01 => shuffle_x67!($x01, $x23, 1),
2542 0b10 => shuffle_x67!($x01, $x23, 2),
2543 _ => shuffle_x67!($x01, $x23, 3),
2544 }
2545 };
2546 }
2547 macro_rules! shuffle_x23 {
2548 ($x01:expr) => {
2549 match (imm8 >> 2) & 0b11 {
2550 0b00 => shuffle_x45!($x01, 0),
2551 0b01 => shuffle_x45!($x01, 1),
2552 0b10 => shuffle_x45!($x01, 2),
2553 _ => shuffle_x45!($x01, 3),
2554 }
2555 };
2556 }
2557 match imm8 & 0b11 {
2558 0b00 => shuffle_x23!(0),
2559 0b01 => shuffle_x23!(1),
2560 0b10 => shuffle_x23!(2),
2561 _ => shuffle_x23!(3),
2562 }
2563 }
2564
2565 /// Shuffles eight 32-bit floating-point elements in `a` across lanes using
2566 /// the corresponding 32-bit integer index in `idx`.
2567 ///
2568 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutevar8x32_ps)
2569 #[inline]
2570 #[target_feature(enable = "avx2")]
2571 #[cfg_attr(test, assert_instr(vpermps))]
2572 #[stable(feature = "simd_x86", since = "1.27.0")]
2573 pub unsafe fn _mm256_permutevar8x32_ps(a: __m256, idx: __m256i) -> __m256 {
2574 permps(a, idx.as_i32x8())
2575 }
2576
2577 /// Computes the absolute differences of packed unsigned 8-bit integers in `a`
2578 /// and `b`, then horizontally sums each consecutive 8 differences to
2579 /// produce four unsigned 16-bit integers, and packs these unsigned 16-bit
2580 /// integers in the low 16 bits of the four 64-bit elements of the result.
2581 ///
2582 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sad_epu8)
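///
/// A minimal usage sketch with illustrative values (assumes `avx2` is
/// detected at runtime; each 64-bit element sums eight absolute differences):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_set1_epi8(3);
/// let b = _mm256_set1_epi8(1);
/// // Eight differences of |3 - 1| = 2 sum to 16 in each 64-bit element.
/// let r = _mm256_sad_epu8(a, b);
/// let e = _mm256_set1_epi64x(16);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi64(r, e)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```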
2583 #[inline]
2584 #[target_feature(enable = "avx2")]
2585 #[cfg_attr(test, assert_instr(vpsadbw))]
2586 #[stable(feature = "simd_x86", since = "1.27.0")]
2587 pub unsafe fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i {
2588 transmute(psadbw(a.as_u8x32(), b.as_u8x32()))
2589 }
2590
2591 /// Shuffles bytes from `a` according to the content of `b`.
2592 ///
2593 /// The lowest 4 bits of each byte of `b` are used as addresses into the 32 bytes
2594 /// of `a`.
2595 ///
2596 /// In addition, if the most significant bit of a byte of `b` is set, the
2597 /// respective destination byte is set to 0.
2598 ///
2599 /// The low and high halves of the vectors are shuffled separately.
2600 ///
2601 /// Picturing `a` and `b` as `[u8; 32]`, `_mm256_shuffle_epi8` is logically
2602 /// equivalent to:
2603 ///
2604 /// ```
2605 /// fn mm256_shuffle_epi8(a: [u8; 32], b: [u8; 32]) -> [u8; 32] {
2606 /// let mut r = [0; 32];
2607 /// for i in 0..16 {
2608 /// // if the most significant bit of b is set,
2609 /// // then the destination byte is set to 0.
2610 /// if b[i] & 0x80 == 0u8 {
2611 /// r[i] = a[(b[i] % 16) as usize];
2612 /// }
2613 /// if b[i + 16] & 0x80 == 0u8 {
2614 /// r[i + 16] = a[(b[i + 16] % 16 + 16) as usize];
2615 /// }
2616 /// }
2617 /// r
2618 /// }
2619 /// ```
2620 ///
2621 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_epi8)
2622 #[inline]
2623 #[target_feature(enable = "avx2")]
2624 #[cfg_attr(test, assert_instr(vpshufb))]
2625 #[stable(feature = "simd_x86", since = "1.27.0")]
2626 pub unsafe fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i {
2627 transmute(pshufb(a.as_u8x32(), b.as_u8x32()))
2628 }
2629
2630 /// Shuffles 32-bit integers in 128-bit lanes of `a` using the control in
2631 /// `imm8`.
2632 ///
2633 /// ```rust
2634 /// #[cfg(target_arch = "x86")]
2635 /// use std::arch::x86::*;
2636 /// #[cfg(target_arch = "x86_64")]
2637 /// use std::arch::x86_64::*;
2638 ///
2639 /// # fn main() {
2640 /// # if is_x86_feature_detected!("avx2") {
2641 /// # #[target_feature(enable = "avx2")]
2642 /// # unsafe fn worker() {
2643 /// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
2644 ///
2645 /// let c1 = _mm256_shuffle_epi32(a, 0b00_11_10_01);
2646 /// let c2 = _mm256_shuffle_epi32(a, 0b01_00_10_11);
2647 ///
2648 /// let expected1 = _mm256_setr_epi32(1, 2, 3, 0, 5, 6, 7, 4);
2649 /// let expected2 = _mm256_setr_epi32(3, 2, 0, 1, 7, 6, 4, 5);
2650 ///
2651 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c1, expected1)), !0);
2652 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c2, expected2)), !0);
2653 /// # }
2654 /// # unsafe { worker(); }
2655 /// # }
2656 /// # }
2657 /// ```
2658 ///
2659 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_epi32)
2660 #[inline]
2661 #[target_feature(enable = "avx2")]
2662 #[cfg_attr(test, assert_instr(vpermilps, imm8 = 9))]
2663 #[rustc_args_required_const(1)]
2664 #[stable(feature = "simd_x86", since = "1.27.0")]
2665 pub unsafe fn _mm256_shuffle_epi32(a: __m256i, imm8: i32) -> __m256i {
2666 // simd_shuffleX requires that its selector parameter be made up of
2667 // constant values, but we can't enforce that here. In spirit, we need
2668 // to write a `match` on all possible values of a byte, and for each value,
2669 // hard-code the correct `simd_shuffleX` call using only constants. We
2670 // then hope for LLVM to do the rest.
2671 //
2672 // Of course, that's... awful. So we try to use macros to do it for us.
2673 let imm8 = (imm8 & 0xFF) as u8;
2674
2675 let a = a.as_i32x8();
2676 macro_rules! shuffle_done {
2677 ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
2678 simd_shuffle8(
2679 a,
2680 a,
2681 [
2682 $x01,
2683 $x23,
2684 $x45,
2685 $x67,
2686 4 + $x01,
2687 4 + $x23,
2688 4 + $x45,
2689 4 + $x67,
2690 ],
2691 )
2692 };
2693 }
2694 macro_rules! shuffle_x67 {
2695 ($x01:expr, $x23:expr, $x45:expr) => {
2696 match (imm8 >> 6) & 0b11 {
2697 0b00 => shuffle_done!($x01, $x23, $x45, 0),
2698 0b01 => shuffle_done!($x01, $x23, $x45, 1),
2699 0b10 => shuffle_done!($x01, $x23, $x45, 2),
2700 _ => shuffle_done!($x01, $x23, $x45, 3),
2701 }
2702 };
2703 }
2704 macro_rules! shuffle_x45 {
2705 ($x01:expr, $x23:expr) => {
2706 match (imm8 >> 4) & 0b11 {
2707 0b00 => shuffle_x67!($x01, $x23, 0),
2708 0b01 => shuffle_x67!($x01, $x23, 1),
2709 0b10 => shuffle_x67!($x01, $x23, 2),
2710 _ => shuffle_x67!($x01, $x23, 3),
2711 }
2712 };
2713 }
2714 macro_rules! shuffle_x23 {
2715 ($x01:expr) => {
2716 match (imm8 >> 2) & 0b11 {
2717 0b00 => shuffle_x45!($x01, 0),
2718 0b01 => shuffle_x45!($x01, 1),
2719 0b10 => shuffle_x45!($x01, 2),
2720 _ => shuffle_x45!($x01, 3),
2721 }
2722 };
2723 }
2724 let r: i32x8 = match imm8 & 0b11 {
2725 0b00 => shuffle_x23!(0),
2726 0b01 => shuffle_x23!(1),
2727 0b10 => shuffle_x23!(2),
2728 _ => shuffle_x23!(3),
2729 };
2730 transmute(r)
2731 }
2732
2733 /// Shuffles 16-bit integers in the high 64 bits of 128-bit lanes of `a` using
2734 /// the control in `imm8`. The low 64 bits of 128-bit lanes of `a` are copied
2735 /// to the output.
2736 ///
2737 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shufflehi_epi16)
2738 #[inline]
2739 #[target_feature(enable = "avx2")]
2740 #[cfg_attr(test, assert_instr(vpshufhw, imm8 = 9))]
2741 #[rustc_args_required_const(1)]
2742 #[stable(feature = "simd_x86", since = "1.27.0")]
2743 pub unsafe fn _mm256_shufflehi_epi16(a: __m256i, imm8: i32) -> __m256i {
2744 let imm8 = (imm8 & 0xFF) as u8;
2745 let a = a.as_i16x16();
2746 macro_rules! shuffle_done {
2747 ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
2748 #[rustfmt::skip]
2749 simd_shuffle16(a, a, [
2750 0, 1, 2, 3, 4+$x01, 4+$x23, 4+$x45, 4+$x67,
2751 8, 9, 10, 11, 12+$x01, 12+$x23, 12+$x45, 12+$x67
2752 ])
2753 };
2754 }
2755 macro_rules! shuffle_x67 {
2756 ($x01:expr, $x23:expr, $x45:expr) => {
2757 match (imm8 >> 6) & 0b11 {
2758 0b00 => shuffle_done!($x01, $x23, $x45, 0),
2759 0b01 => shuffle_done!($x01, $x23, $x45, 1),
2760 0b10 => shuffle_done!($x01, $x23, $x45, 2),
2761 _ => shuffle_done!($x01, $x23, $x45, 3),
2762 }
2763 };
2764 }
2765 macro_rules! shuffle_x45 {
2766 ($x01:expr, $x23:expr) => {
2767 match (imm8 >> 4) & 0b11 {
2768 0b00 => shuffle_x67!($x01, $x23, 0),
2769 0b01 => shuffle_x67!($x01, $x23, 1),
2770 0b10 => shuffle_x67!($x01, $x23, 2),
2771 _ => shuffle_x67!($x01, $x23, 3),
2772 }
2773 };
2774 }
2775 macro_rules! shuffle_x23 {
2776 ($x01:expr) => {
2777 match (imm8 >> 2) & 0b11 {
2778 0b00 => shuffle_x45!($x01, 0),
2779 0b01 => shuffle_x45!($x01, 1),
2780 0b10 => shuffle_x45!($x01, 2),
2781 _ => shuffle_x45!($x01, 3),
2782 }
2783 };
2784 }
2785 let r: i16x16 = match imm8 & 0b11 {
2786 0b00 => shuffle_x23!(0),
2787 0b01 => shuffle_x23!(1),
2788 0b10 => shuffle_x23!(2),
2789 _ => shuffle_x23!(3),
2790 };
2791 transmute(r)
2792 }
2793
2794 /// Shuffles 16-bit integers in the low 64 bits of 128-bit lanes of `a` using
2795 /// the control in `imm8`. The high 64 bits of 128-bit lanes of `a` are copied
2796 /// to the output.
2797 ///
2798 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shufflelo_epi16)
2799 #[inline]
2800 #[target_feature(enable = "avx2")]
2801 #[cfg_attr(test, assert_instr(vpshuflw, imm8 = 9))]
2802 #[rustc_args_required_const(1)]
2803 #[stable(feature = "simd_x86", since = "1.27.0")]
2804 pub unsafe fn _mm256_shufflelo_epi16(a: __m256i, imm8: i32) -> __m256i {
2805 let imm8 = (imm8 & 0xFF) as u8;
2806 let a = a.as_i16x16();
2807 macro_rules! shuffle_done {
2808 ($x01: expr, $x23: expr, $x45: expr, $x67: expr) => {
2809 #[rustfmt::skip]
2810 simd_shuffle16(a, a, [
2811 0+$x01, 0+$x23, 0+$x45, 0+$x67, 4, 5, 6, 7,
2812 8+$x01, 8+$x23, 8+$x45, 8+$x67, 12, 13, 14, 15,
2813 ])
2814 };
2815 }
2816 macro_rules! shuffle_x67 {
2817 ($x01:expr, $x23:expr, $x45:expr) => {
2818 match (imm8 >> 6) & 0b11 {
2819 0b00 => shuffle_done!($x01, $x23, $x45, 0),
2820 0b01 => shuffle_done!($x01, $x23, $x45, 1),
2821 0b10 => shuffle_done!($x01, $x23, $x45, 2),
2822 _ => shuffle_done!($x01, $x23, $x45, 3),
2823 }
2824 };
2825 }
2826 macro_rules! shuffle_x45 {
2827 ($x01:expr, $x23:expr) => {
2828 match (imm8 >> 4) & 0b11 {
2829 0b00 => shuffle_x67!($x01, $x23, 0),
2830 0b01 => shuffle_x67!($x01, $x23, 1),
2831 0b10 => shuffle_x67!($x01, $x23, 2),
2832 _ => shuffle_x67!($x01, $x23, 3),
2833 }
2834 };
2835 }
2836 macro_rules! shuffle_x23 {
2837 ($x01:expr) => {
2838 match (imm8 >> 2) & 0b11 {
2839 0b00 => shuffle_x45!($x01, 0),
2840 0b01 => shuffle_x45!($x01, 1),
2841 0b10 => shuffle_x45!($x01, 2),
2842 _ => shuffle_x45!($x01, 3),
2843 }
2844 };
2845 }
2846 let r: i16x16 = match imm8 & 0b11 {
2847 0b00 => shuffle_x23!(0),
2848 0b01 => shuffle_x23!(1),
2849 0b10 => shuffle_x23!(2),
2850 _ => shuffle_x23!(3),
2851 };
2852 transmute(r)
2853 }
2854
2855 /// Negates packed 16-bit integers in `a` when the corresponding signed
2856 /// 16-bit integer in `b` is negative, and returns the results.
2857 /// Results are zeroed out when the corresponding element in `b` is zero.
2858 ///
2859 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sign_epi16)
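///
/// A minimal usage sketch with illustrative values (assumes `avx2` is
/// detected at runtime; negative, zero and positive control values):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_set1_epi16(5);
/// let b = _mm256_setr_epi16(
///     -1, 0, 1, -1, 0, 1, -1, 0, 1, -1, 0, 1, -1, 0, 1, -1,
/// );
/// // Negative in `b` negates, zero zeroes, positive passes through.
/// let r = _mm256_sign_epi16(a, b);
/// let e = _mm256_setr_epi16(
///     -5, 0, 5, -5, 0, 5, -5, 0, 5, -5, 0, 5, -5, 0, 5, -5,
/// );
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi16(r, e)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```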
2860 #[inline]
2861 #[target_feature(enable = "avx2")]
2862 #[cfg_attr(test, assert_instr(vpsignw))]
2863 #[stable(feature = "simd_x86", since = "1.27.0")]
2864 pub unsafe fn _mm256_sign_epi16(a: __m256i, b: __m256i) -> __m256i {
2865 transmute(psignw(a.as_i16x16(), b.as_i16x16()))
2866 }
2867
2868 /// Negates packed 32-bit integers in `a` when the corresponding signed
2869 /// 32-bit integer in `b` is negative, and returns the results.
2870 /// Results are zeroed out when the corresponding element in `b` is zero.
2871 ///
2872 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sign_epi32)
2873 #[inline]
2874 #[target_feature(enable = "avx2")]
2875 #[cfg_attr(test, assert_instr(vpsignd))]
2876 #[stable(feature = "simd_x86", since = "1.27.0")]
2877 pub unsafe fn _mm256_sign_epi32(a: __m256i, b: __m256i) -> __m256i {
2878 transmute(psignd(a.as_i32x8(), b.as_i32x8()))
2879 }
2880
2881 /// Negates packed 8-bit integers in `a` when the corresponding signed
2882 /// 8-bit integer in `b` is negative, and returns the results.
2883 /// Results are zeroed out when the corresponding element in `b` is zero.
2884 ///
2885 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sign_epi8)
2886 #[inline]
2887 #[target_feature(enable = "avx2")]
2888 #[cfg_attr(test, assert_instr(vpsignb))]
2889 #[stable(feature = "simd_x86", since = "1.27.0")]
2890 pub unsafe fn _mm256_sign_epi8(a: __m256i, b: __m256i) -> __m256i {
2891 transmute(psignb(a.as_i8x32(), b.as_i8x32()))
2892 }
2893
2894 /// Shifts packed 16-bit integers in `a` left by `count` while
2895 /// shifting in zeros, and returns the result
2896 ///
2897 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sll_epi16)
2898 #[inline]
2899 #[target_feature(enable = "avx2")]
2900 #[cfg_attr(test, assert_instr(vpsllw))]
2901 #[stable(feature = "simd_x86", since = "1.27.0")]
2902 pub unsafe fn _mm256_sll_epi16(a: __m256i, count: __m128i) -> __m256i {
2903 transmute(psllw(a.as_i16x16(), count.as_i16x8()))
2904 }
2905
2906 /// Shifts packed 32-bit integers in `a` left by `count` while
2907 /// shifting in zeros, and returns the result
2908 ///
2909 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sll_epi32)
2910 #[inline]
2911 #[target_feature(enable = "avx2")]
2912 #[cfg_attr(test, assert_instr(vpslld))]
2913 #[stable(feature = "simd_x86", since = "1.27.0")]
2914 pub unsafe fn _mm256_sll_epi32(a: __m256i, count: __m128i) -> __m256i {
2915 transmute(pslld(a.as_i32x8(), count.as_i32x4()))
2916 }
2917
2918 /// Shifts packed 64-bit integers in `a` left by `count` while
2919 /// shifting in zeros, and returns the result
2920 ///
2921 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sll_epi64)
2922 #[inline]
2923 #[target_feature(enable = "avx2")]
2924 #[cfg_attr(test, assert_instr(vpsllq))]
2925 #[stable(feature = "simd_x86", since = "1.27.0")]
2926 pub unsafe fn _mm256_sll_epi64(a: __m256i, count: __m128i) -> __m256i {
2927 transmute(psllq(a.as_i64x4(), count.as_i64x2()))
2928 }
2929
2930 /// Shifts packed 16-bit integers in `a` left by `imm8` while
2931 /// shifting in zeros, and returns the result.
2932 ///
2933 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_slli_epi16)
2934 #[inline]
2935 #[target_feature(enable = "avx2")]
2936 #[cfg_attr(test, assert_instr(vpsllw))]
2937 #[stable(feature = "simd_x86", since = "1.27.0")]
2938 pub unsafe fn _mm256_slli_epi16(a: __m256i, imm8: i32) -> __m256i {
2939 transmute(pslliw(a.as_i16x16(), imm8))
2940 }
2941
2942 /// Shifts packed 32-bit integers in `a` left by `imm8` while
2943 /// shifting in zeros, and returns the result.
2944 ///
2945 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_slli_epi32)
2946 #[inline]
2947 #[target_feature(enable = "avx2")]
2948 #[cfg_attr(test, assert_instr(vpslld))]
2949 #[stable(feature = "simd_x86", since = "1.27.0")]
2950 pub unsafe fn _mm256_slli_epi32(a: __m256i, imm8: i32) -> __m256i {
2951 transmute(psllid(a.as_i32x8(), imm8))
2952 }
2953
2954 /// Shifts packed 64-bit integers in `a` left by `imm8` while
2955 /// shifting in zeros, and returns the result.
2956 ///
2957 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_slli_epi64)
2958 #[inline]
2959 #[target_feature(enable = "avx2")]
2960 #[cfg_attr(test, assert_instr(vpsllq))]
2961 #[stable(feature = "simd_x86", since = "1.27.0")]
2962 pub unsafe fn _mm256_slli_epi64(a: __m256i, imm8: i32) -> __m256i {
2963 transmute(pslliq(a.as_i64x4(), imm8))
2964 }
2965
2966 /// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros.
2967 ///
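/// An illustrative sketch (not upstream documentation), assuming AVX2 is
/// detected at runtime; the byte count of 4 is arbitrary:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_set1_epi32(1);
/// // each 128-bit lane is shifted left by 4 bytes, so its lowest
/// // 32-bit element becomes zero
/// let r = _mm256_slli_si256(a, 4);
/// let expected = _mm256_setr_epi32(0, 1, 1, 1, 0, 1, 1, 1);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///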
2968 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_slli_si256)
2969 #[inline]
2970 #[target_feature(enable = "avx2")]
2971 #[cfg_attr(test, assert_instr(vpslldq, imm8 = 3))]
2972 #[rustc_args_required_const(1)]
2973 #[stable(feature = "simd_x86", since = "1.27.0")]
2974 pub unsafe fn _mm256_slli_si256(a: __m256i, imm8: i32) -> __m256i {
2975 let a = a.as_i64x4();
2976 macro_rules! call {
2977 ($imm8:expr) => {
2978 vpslldq(a, $imm8)
2979 };
2980 }
2981 transmute(constify_imm8!(imm8 * 8, call))
2982 }
2983
2984 /// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros.
2985 ///
2986 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bslli_epi128)
2987 #[inline]
2988 #[target_feature(enable = "avx2")]
2989 #[cfg_attr(test, assert_instr(vpslldq, imm8 = 3))]
2990 #[rustc_args_required_const(1)]
2991 #[stable(feature = "simd_x86", since = "1.27.0")]
2992 pub unsafe fn _mm256_bslli_epi128(a: __m256i, imm8: i32) -> __m256i {
2993 let a = a.as_i64x4();
2994 macro_rules! call {
2995 ($imm8:expr) => {
2996 vpslldq(a, $imm8)
2997 };
2998 }
2999 transmute(constify_imm8!(imm8 * 8, call))
3000 }
3001
3002 /// Shifts packed 32-bit integers in `a` left by the amount
3003 /// specified by the corresponding element in `count` while
3004 /// shifting in zeros, and returns the result.
3005 ///
3006 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sllv_epi32)
3007 #[inline]
3008 #[target_feature(enable = "avx2")]
3009 #[cfg_attr(test, assert_instr(vpsllvd))]
3010 #[stable(feature = "simd_x86", since = "1.27.0")]
3011 pub unsafe fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i {
3012 transmute(psllvd(a.as_i32x4(), count.as_i32x4()))
3013 }
3014
3015 /// Shifts packed 32-bit integers in `a` left by the amount
3016 /// specified by the corresponding element in `count` while
3017 /// shifting in zeros, and returns the result.
3018 ///
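/// An illustrative sketch (not upstream documentation), assuming AVX2 is
/// detected at runtime; each 32-bit lane uses its own shift count:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_set1_epi32(1);
/// let count = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
/// let r = _mm256_sllv_epi32(a, count);
/// let expected = _mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 128);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///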
3019 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sllv_epi32)
3020 #[inline]
3021 #[target_feature(enable = "avx2")]
3022 #[cfg_attr(test, assert_instr(vpsllvd))]
3023 #[stable(feature = "simd_x86", since = "1.27.0")]
3024 pub unsafe fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i {
3025 transmute(psllvd256(a.as_i32x8(), count.as_i32x8()))
3026 }
3027
3028 /// Shifts packed 64-bit integers in `a` left by the amount
3029 /// specified by the corresponding element in `count` while
3030 /// shifting in zeros, and returns the result.
3031 ///
3032 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sllv_epi64)
3033 #[inline]
3034 #[target_feature(enable = "avx2")]
3035 #[cfg_attr(test, assert_instr(vpsllvq))]
3036 #[stable(feature = "simd_x86", since = "1.27.0")]
3037 pub unsafe fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i {
3038 transmute(psllvq(a.as_i64x2(), count.as_i64x2()))
3039 }
3040
3041 /// Shifts packed 64-bit integers in `a` left by the amount
3042 /// specified by the corresponding element in `count` while
3043 /// shifting in zeros, and returns the result.
3044 ///
3045 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sllv_epi64)
3046 #[inline]
3047 #[target_feature(enable = "avx2")]
3048 #[cfg_attr(test, assert_instr(vpsllvq))]
3049 #[stable(feature = "simd_x86", since = "1.27.0")]
3050 pub unsafe fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i {
3051 transmute(psllvq256(a.as_i64x4(), count.as_i64x4()))
3052 }
3053
3054 /// Shifts packed 16-bit integers in `a` right by `count` while
3055 /// shifting in sign bits.
3056 ///
3057 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sra_epi16)
3058 #[inline]
3059 #[target_feature(enable = "avx2")]
3060 #[cfg_attr(test, assert_instr(vpsraw))]
3061 #[stable(feature = "simd_x86", since = "1.27.0")]
3062 pub unsafe fn _mm256_sra_epi16(a: __m256i, count: __m128i) -> __m256i {
3063 transmute(psraw(a.as_i16x16(), count.as_i16x8()))
3064 }
3065
3066 /// Shifts packed 32-bit integers in `a` right by `count` while
3067 /// shifting in sign bits.
3068 ///
3069 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sra_epi32)
3070 #[inline]
3071 #[target_feature(enable = "avx2")]
3072 #[cfg_attr(test, assert_instr(vpsrad))]
3073 #[stable(feature = "simd_x86", since = "1.27.0")]
3074 pub unsafe fn _mm256_sra_epi32(a: __m256i, count: __m128i) -> __m256i {
3075 transmute(psrad(a.as_i32x8(), count.as_i32x4()))
3076 }
3077
3078 /// Shifts packed 16-bit integers in `a` right by `imm8` while
3079 /// shifting in sign bits.
3080 ///
3081 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srai_epi16)
3082 #[inline]
3083 #[target_feature(enable = "avx2")]
3084 #[cfg_attr(test, assert_instr(vpsraw))]
3085 #[stable(feature = "simd_x86", since = "1.27.0")]
3086 pub unsafe fn _mm256_srai_epi16(a: __m256i, imm8: i32) -> __m256i {
3087 transmute(psraiw(a.as_i16x16(), imm8))
3088 }
3089
3090 /// Shifts packed 32-bit integers in `a` right by `imm8` while
3091 /// shifting in sign bits.
3092 ///
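/// An illustrative sketch (not upstream documentation), assuming AVX2 is
/// detected at runtime; it shows that the arithmetic shift keeps the sign:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_set1_epi32(-8);
/// // the sign bit is replicated: -8 >> 2 == -2
/// let r = _mm256_srai_epi32(a, 2);
/// let expected = _mm256_set1_epi32(-2);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///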
3093 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srai_epi32)
3094 #[inline]
3095 #[target_feature(enable = "avx2")]
3096 #[cfg_attr(test, assert_instr(vpsrad))]
3097 #[stable(feature = "simd_x86", since = "1.27.0")]
3098 pub unsafe fn _mm256_srai_epi32(a: __m256i, imm8: i32) -> __m256i {
3099 transmute(psraid(a.as_i32x8(), imm8))
3100 }
3101
3102 /// Shifts packed 32-bit integers in `a` right by the amount specified by the
3103 /// corresponding element in `count` while shifting in sign bits.
3104 ///
3105 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srav_epi32)
3106 #[inline]
3107 #[target_feature(enable = "avx2")]
3108 #[cfg_attr(test, assert_instr(vpsravd))]
3109 #[stable(feature = "simd_x86", since = "1.27.0")]
3110 pub unsafe fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i {
3111 transmute(psravd(a.as_i32x4(), count.as_i32x4()))
3112 }
3113
3114 /// Shifts packed 32-bit integers in `a` right by the amount specified by the
3115 /// corresponding element in `count` while shifting in sign bits.
3116 ///
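/// An illustrative sketch (not upstream documentation), assuming AVX2 is
/// detected at runtime; negative lanes keep their sign while each lane uses
/// its own shift count:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_setr_epi32(-16, -16, -16, -16, 16, 16, 16, 16);
/// let count = _mm256_setr_epi32(1, 2, 3, 4, 1, 2, 3, 4);
/// let r = _mm256_srav_epi32(a, count);
/// let expected = _mm256_setr_epi32(-8, -4, -2, -1, 8, 4, 2, 1);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///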
3117 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srav_epi32)
3118 #[inline]
3119 #[target_feature(enable = "avx2")]
3120 #[cfg_attr(test, assert_instr(vpsravd))]
3121 #[stable(feature = "simd_x86", since = "1.27.0")]
3122 pub unsafe fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i {
3123 transmute(psravd256(a.as_i32x8(), count.as_i32x8()))
3124 }
3125
3126 /// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros.
3127 ///
3128 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srli_si256)
3129 #[inline]
3130 #[target_feature(enable = "avx2")]
3131 #[cfg_attr(test, assert_instr(vpsrldq, imm8 = 3))]
3132 #[rustc_args_required_const(1)]
3133 #[stable(feature = "simd_x86", since = "1.27.0")]
3134 pub unsafe fn _mm256_srli_si256(a: __m256i, imm8: i32) -> __m256i {
3135 let a = a.as_i64x4();
3136 macro_rules! call {
3137 ($imm8:expr) => {
3138 vpsrldq(a, $imm8)
3139 };
3140 }
3141 transmute(constify_imm8!(imm8 * 8, call))
3142 }
3143
3144 /// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros.
3145 ///
3146 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bsrli_epi128)
3147 #[inline]
3148 #[target_feature(enable = "avx2")]
3149 #[cfg_attr(test, assert_instr(vpsrldq, imm8 = 3))]
3150 #[rustc_args_required_const(1)]
3151 #[stable(feature = "simd_x86", since = "1.27.0")]
3152 pub unsafe fn _mm256_bsrli_epi128(a: __m256i, imm8: i32) -> __m256i {
3153 let a = a.as_i64x4();
3154 macro_rules! call {
3155 ($imm8:expr) => {
3156 vpsrldq(a, $imm8)
3157 };
3158 }
3159 transmute(constify_imm8!(imm8 * 8, call))
3160 }
3161
3162 /// Shifts packed 16-bit integers in `a` right by `count` while shifting in
3163 /// zeros.
3164 ///
3165 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srl_epi16)
3166 #[inline]
3167 #[target_feature(enable = "avx2")]
3168 #[cfg_attr(test, assert_instr(vpsrlw))]
3169 #[stable(feature = "simd_x86", since = "1.27.0")]
3170 pub unsafe fn _mm256_srl_epi16(a: __m256i, count: __m128i) -> __m256i {
3171 transmute(psrlw(a.as_i16x16(), count.as_i16x8()))
3172 }
3173
3174 /// Shifts packed 32-bit integers in `a` right by `count` while shifting in
3175 /// zeros.
3176 ///
3177 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srl_epi32)
3178 #[inline]
3179 #[target_feature(enable = "avx2")]
3180 #[cfg_attr(test, assert_instr(vpsrld))]
3181 #[stable(feature = "simd_x86", since = "1.27.0")]
3182 pub unsafe fn _mm256_srl_epi32(a: __m256i, count: __m128i) -> __m256i {
3183 transmute(psrld(a.as_i32x8(), count.as_i32x4()))
3184 }
3185
3186 /// Shifts packed 64-bit integers in `a` right by `count` while shifting in
3187 /// zeros.
3188 ///
3189 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srl_epi64)
3190 #[inline]
3191 #[target_feature(enable = "avx2")]
3192 #[cfg_attr(test, assert_instr(vpsrlq))]
3193 #[stable(feature = "simd_x86", since = "1.27.0")]
3194 pub unsafe fn _mm256_srl_epi64(a: __m256i, count: __m128i) -> __m256i {
3195 transmute(psrlq(a.as_i64x4(), count.as_i64x2()))
3196 }
3197
3198 /// Shifts packed 16-bit integers in `a` right by `imm8` while shifting in
3199 /// zeros.
3200 ///
3201 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srli_epi16)
3202 #[inline]
3203 #[target_feature(enable = "avx2")]
3204 #[cfg_attr(test, assert_instr(vpsrlw))]
3205 #[stable(feature = "simd_x86", since = "1.27.0")]
3206 pub unsafe fn _mm256_srli_epi16(a: __m256i, imm8: i32) -> __m256i {
3207 transmute(psrliw(a.as_i16x16(), imm8))
3208 }
3209
3210 /// Shifts packed 32-bit integers in `a` right by `imm8` while shifting in
3211 /// zeros.
3212 ///
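/// An illustrative sketch (not upstream documentation), assuming AVX2 is
/// detected at runtime; it shows that the logical shift fills with zeros:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_set1_epi32(-1);
/// // 0xFFFF_FFFF logically shifted right by 28 is 0xF
/// let r = _mm256_srli_epi32(a, 28);
/// let expected = _mm256_set1_epi32(0xF);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///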
3213 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srli_epi32)
3214 #[inline]
3215 #[target_feature(enable = "avx2")]
3216 #[cfg_attr(test, assert_instr(vpsrld))]
3217 #[stable(feature = "simd_x86", since = "1.27.0")]
3218 pub unsafe fn _mm256_srli_epi32(a: __m256i, imm8: i32) -> __m256i {
3219 transmute(psrlid(a.as_i32x8(), imm8))
3220 }
3221
3222 /// Shifts packed 64-bit integers in `a` right by `imm8` while shifting in
3223 /// zeros.
3224 ///
3225 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srli_epi64)
3226 #[inline]
3227 #[target_feature(enable = "avx2")]
3228 #[cfg_attr(test, assert_instr(vpsrlq))]
3229 #[stable(feature = "simd_x86", since = "1.27.0")]
3230 pub unsafe fn _mm256_srli_epi64(a: __m256i, imm8: i32) -> __m256i {
3231 transmute(psrliq(a.as_i64x4(), imm8))
3232 }
3233
3234 /// Shifts packed 32-bit integers in `a` right by the amount specified by
3235 /// the corresponding element in `count` while shifting in zeros.
3236 ///
3237 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srlv_epi32)
3238 #[inline]
3239 #[target_feature(enable = "avx2")]
3240 #[cfg_attr(test, assert_instr(vpsrlvd))]
3241 #[stable(feature = "simd_x86", since = "1.27.0")]
3242 pub unsafe fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i {
3243 transmute(psrlvd(a.as_i32x4(), count.as_i32x4()))
3244 }
3245
3246 /// Shifts packed 32-bit integers in `a` right by the amount specified by
3247 /// the corresponding element in `count` while shifting in zeros.
3248 ///
3249 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srlv_epi32)
3250 #[inline]
3251 #[target_feature(enable = "avx2")]
3252 #[cfg_attr(test, assert_instr(vpsrlvd))]
3253 #[stable(feature = "simd_x86", since = "1.27.0")]
3254 pub unsafe fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i {
3255 transmute(psrlvd256(a.as_i32x8(), count.as_i32x8()))
3256 }
3257
3258 /// Shifts packed 64-bit integers in `a` right by the amount specified by
3259 /// the corresponding element in `count` while shifting in zeros.
3260 ///
3261 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srlv_epi64)
3262 #[inline]
3263 #[target_feature(enable = "avx2")]
3264 #[cfg_attr(test, assert_instr(vpsrlvq))]
3265 #[stable(feature = "simd_x86", since = "1.27.0")]
3266 pub unsafe fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i {
3267 transmute(psrlvq(a.as_i64x2(), count.as_i64x2()))
3268 }
3269
3270 /// Shifts packed 64-bit integers in `a` right by the amount specified by
3271 /// the corresponding element in `count` while shifting in zeros.
3272 ///
3273 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srlv_epi64)
3274 #[inline]
3275 #[target_feature(enable = "avx2")]
3276 #[cfg_attr(test, assert_instr(vpsrlvq))]
3277 #[stable(feature = "simd_x86", since = "1.27.0")]
3278 pub unsafe fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i {
3279 transmute(psrlvq256(a.as_i64x4(), count.as_i64x4()))
3280 }
3281
3282 // TODO _mm256_stream_load_si256 (__m256i const* mem_addr)
3283
3284 /// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`.
3285 ///
3286 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_epi16)
3287 #[inline]
3288 #[target_feature(enable = "avx2")]
3289 #[cfg_attr(test, assert_instr(vpsubw))]
3290 #[stable(feature = "simd_x86", since = "1.27.0")]
3291 pub unsafe fn _mm256_sub_epi16(a: __m256i, b: __m256i) -> __m256i {
3292 transmute(simd_sub(a.as_i16x16(), b.as_i16x16()))
3293 }
3294
3295 /// Subtracts packed 32-bit integers in `b` from packed 32-bit integers in `a`.
3296 ///
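/// A short usage sketch, added for illustration (not upstream documentation);
/// it assumes AVX2 is detected at runtime and uses arbitrary values:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_setr_epi32(10, 20, 30, 40, 50, 60, 70, 80);
/// let b = _mm256_set1_epi32(5);
/// let r = _mm256_sub_epi32(a, b);
/// let expected = _mm256_setr_epi32(5, 15, 25, 35, 45, 55, 65, 75);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi32(r, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///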
3297 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_epi32)
3298 #[inline]
3299 #[target_feature(enable = "avx2")]
3300 #[cfg_attr(test, assert_instr(vpsubd))]
3301 #[stable(feature = "simd_x86", since = "1.27.0")]
3302 pub unsafe fn _mm256_sub_epi32(a: __m256i, b: __m256i) -> __m256i {
3303 transmute(simd_sub(a.as_i32x8(), b.as_i32x8()))
3304 }
3305
3306 /// Subtracts packed 64-bit integers in `b` from packed 64-bit integers in `a`.
3307 ///
3308 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_epi64)
3309 #[inline]
3310 #[target_feature(enable = "avx2")]
3311 #[cfg_attr(test, assert_instr(vpsubq))]
3312 #[stable(feature = "simd_x86", since = "1.27.0")]
3313 pub unsafe fn _mm256_sub_epi64(a: __m256i, b: __m256i) -> __m256i {
3314 transmute(simd_sub(a.as_i64x4(), b.as_i64x4()))
3315 }
3316
3317 /// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`.
3318 ///
3319 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_epi8)
3320 #[inline]
3321 #[target_feature(enable = "avx2")]
3322 #[cfg_attr(test, assert_instr(vpsubb))]
3323 #[stable(feature = "simd_x86", since = "1.27.0")]
3324 pub unsafe fn _mm256_sub_epi8(a: __m256i, b: __m256i) -> __m256i {
3325 transmute(simd_sub(a.as_i8x32(), b.as_i8x32()))
3326 }
3327
3328 /// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in
3329 /// `a` using saturation.
3330 ///
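/// An illustrative sketch (not upstream documentation), assuming AVX2 is
/// detected at runtime; it shows the result clamping at `i16::MIN` instead
/// of wrapping:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_set1_epi16(std::i16::MIN);
/// let b = _mm256_set1_epi16(1);
/// // `i16::MIN - 1` saturates to `i16::MIN`
/// let r = _mm256_subs_epi16(a, b);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi16(r, a)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///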
3331 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_subs_epi16)
3332 #[inline]
3333 #[target_feature(enable = "avx2")]
3334 #[cfg_attr(test, assert_instr(vpsubsw))]
3335 #[stable(feature = "simd_x86", since = "1.27.0")]
3336 pub unsafe fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i {
3337 transmute(psubsw(a.as_i16x16(), b.as_i16x16()))
3338 }
3339
3340 /// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in
3341 /// `a` using saturation.
3342 ///
3343 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_subs_epi8)
3344 #[inline]
3345 #[target_feature(enable = "avx2")]
3346 #[cfg_attr(test, assert_instr(vpsubsb))]
3347 #[stable(feature = "simd_x86", since = "1.27.0")]
3348 pub unsafe fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i {
3349 transmute(psubsb(a.as_i8x32(), b.as_i8x32()))
3350 }
3351
3352 /// Subtracts packed unsigned 16-bit integers in `b` from packed unsigned
3353 /// 16-bit integers in `a` using saturation.
3354 ///
3355 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_subs_epu16)
3356 #[inline]
3357 #[target_feature(enable = "avx2")]
3358 #[cfg_attr(test, assert_instr(vpsubusw))]
3359 #[stable(feature = "simd_x86", since = "1.27.0")]
3360 pub unsafe fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> __m256i {
3361 transmute(psubusw(a.as_u16x16(), b.as_u16x16()))
3362 }
3363
3364 /// Subtracts packed unsigned 8-bit integers in `b` from packed unsigned
3365 /// 8-bit integers in `a` using saturation.
3366 ///
3367 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_subs_epu8)
3368 #[inline]
3369 #[target_feature(enable = "avx2")]
3370 #[cfg_attr(test, assert_instr(vpsubusb))]
3371 #[stable(feature = "simd_x86", since = "1.27.0")]
3372 pub unsafe fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i {
3373 transmute(psubusb(a.as_u8x32(), b.as_u8x32()))
3374 }
3375
3376 /// Unpacks and interleaves 8-bit integers from the high half of each
3377 /// 128-bit lane in `a` and `b`.
3378 ///
3379 /// ```rust
3380 /// #[cfg(target_arch = "x86")]
3381 /// use std::arch::x86::*;
3382 /// #[cfg(target_arch = "x86_64")]
3383 /// use std::arch::x86_64::*;
3384 ///
3385 /// # fn main() {
3386 /// # if is_x86_feature_detected!("avx2") {
3387 /// # #[target_feature(enable = "avx2")]
3388 /// # unsafe fn worker() {
3389 /// let a = _mm256_setr_epi8(
3390 /// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
3391 /// 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3392 /// );
3393 /// let b = _mm256_setr_epi8(
3394 /// 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3395 /// -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
3396 /// -30, -31,
3397 /// );
3398 ///
3399 /// let c = _mm256_unpackhi_epi8(a, b);
3400 ///
3401 /// let expected = _mm256_setr_epi8(
3402 /// 8, -8, 9, -9, 10, -10, 11, -11, 12, -12, 13, -13, 14, -14, 15, -15,
3403 /// 24, -24, 25, -25, 26, -26, 27, -27, 28, -28, 29, -29, 30, -30, 31,
3404 /// -31,
3405 /// );
3406 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3407 ///
3408 /// # }
3409 /// # unsafe { worker(); }
3410 /// # }
3411 /// # }
3412 /// ```
3413 ///
3414 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_epi8)
3415 #[inline]
3416 #[target_feature(enable = "avx2")]
3417 #[cfg_attr(test, assert_instr(vpunpckhbw))]
3418 #[stable(feature = "simd_x86", since = "1.27.0")]
3419 pub unsafe fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i {
3420 #[rustfmt::skip]
3421 let r: i8x32 = simd_shuffle32(a.as_i8x32(), b.as_i8x32(), [
3422 8, 40, 9, 41, 10, 42, 11, 43,
3423 12, 44, 13, 45, 14, 46, 15, 47,
3424 24, 56, 25, 57, 26, 58, 27, 59,
3425 28, 60, 29, 61, 30, 62, 31, 63,
3426 ]);
3427 transmute(r)
3428 }
3429
3430 /// Unpacks and interleaves 8-bit integers from the low half of each
3431 /// 128-bit lane of `a` and `b`.
3432 ///
3433 /// ```rust
3434 /// #[cfg(target_arch = "x86")]
3435 /// use std::arch::x86::*;
3436 /// #[cfg(target_arch = "x86_64")]
3437 /// use std::arch::x86_64::*;
3438 ///
3439 /// # fn main() {
3440 /// # if is_x86_feature_detected!("avx2") {
3441 /// # #[target_feature(enable = "avx2")]
3442 /// # unsafe fn worker() {
3443 /// let a = _mm256_setr_epi8(
3444 /// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
3445 /// 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3446 /// );
3447 /// let b = _mm256_setr_epi8(
3448 /// 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3449 /// -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
3450 /// -30, -31,
3451 /// );
3452 ///
3453 /// let c = _mm256_unpacklo_epi8(a, b);
3454 ///
3455 /// let expected = _mm256_setr_epi8(
3456 /// 0, 0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7, 16, -16, 17,
3457 /// -17, 18, -18, 19, -19, 20, -20, 21, -21, 22, -22, 23, -23,
3458 /// );
3459 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3460 ///
3461 /// # }
3462 /// # unsafe { worker(); }
3463 /// # }
3464 /// # }
3465 /// ```
3466 ///
3467 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_epi8)
3468 #[inline]
3469 #[target_feature(enable = "avx2")]
3470 #[cfg_attr(test, assert_instr(vpunpcklbw))]
3471 #[stable(feature = "simd_x86", since = "1.27.0")]
3472 pub unsafe fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i {
3473 #[rustfmt::skip]
3474 let r: i8x32 = simd_shuffle32(a.as_i8x32(), b.as_i8x32(), [
3475 0, 32, 1, 33, 2, 34, 3, 35,
3476 4, 36, 5, 37, 6, 38, 7, 39,
3477 16, 48, 17, 49, 18, 50, 19, 51,
3478 20, 52, 21, 53, 22, 54, 23, 55,
3479 ]);
3480 transmute(r)
3481 }
3482
3483 /// Unpacks and interleaves 16-bit integers from the high half of each
3484 /// 128-bit lane of `a` and `b`.
3485 ///
3486 /// ```rust
3487 /// #[cfg(target_arch = "x86")]
3488 /// use std::arch::x86::*;
3489 /// #[cfg(target_arch = "x86_64")]
3490 /// use std::arch::x86_64::*;
3491 ///
3492 /// # fn main() {
3493 /// # if is_x86_feature_detected!("avx2") {
3494 /// # #[target_feature(enable = "avx2")]
3495 /// # unsafe fn worker() {
3496 /// let a = _mm256_setr_epi16(
3497 /// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3498 /// );
3499 /// let b = _mm256_setr_epi16(
3500 /// 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3501 /// );
3502 ///
3503 /// let c = _mm256_unpackhi_epi16(a, b);
3504 ///
3505 /// let expected = _mm256_setr_epi16(
3506 /// 4, -4, 5, -5, 6, -6, 7, -7, 12, -12, 13, -13, 14, -14, 15, -15,
3507 /// );
3508 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3509 ///
3510 /// # }
3511 /// # unsafe { worker(); }
3512 /// # }
3513 /// # }
3514 /// ```
3515 ///
3516 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_epi16)
3517 #[inline]
3518 #[target_feature(enable = "avx2")]
3519 #[cfg_attr(test, assert_instr(vpunpckhwd))]
3520 #[stable(feature = "simd_x86", since = "1.27.0")]
3521 pub unsafe fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i {
3522 let r: i16x16 = simd_shuffle16(
3523 a.as_i16x16(),
3524 b.as_i16x16(),
3525 [4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31],
3526 );
3527 transmute(r)
3528 }
3529
3530 /// Unpacks and interleaves 16-bit integers from the low half of each
3531 /// 128-bit lane of `a` and `b`.
3532 ///
3533 /// ```rust
3534 /// #[cfg(target_arch = "x86")]
3535 /// use std::arch::x86::*;
3536 /// #[cfg(target_arch = "x86_64")]
3537 /// use std::arch::x86_64::*;
3538 ///
3539 /// # fn main() {
3540 /// # if is_x86_feature_detected!("avx2") {
3541 /// # #[target_feature(enable = "avx2")]
3542 /// # unsafe fn worker() {
3544 /// let a = _mm256_setr_epi16(
3545 /// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3546 /// );
3547 /// let b = _mm256_setr_epi16(
3548 /// 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3549 /// );
3550 ///
3551 /// let c = _mm256_unpacklo_epi16(a, b);
3552 ///
3553 /// let expected = _mm256_setr_epi16(
3554 /// 0, 0, 1, -1, 2, -2, 3, -3, 8, -8, 9, -9, 10, -10, 11, -11,
3555 /// );
3556 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3557 ///
3558 /// # }
3559 /// # unsafe { worker(); }
3560 /// # }
3561 /// # }
3562 /// ```
3563 ///
3564 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_epi16)
3565 #[inline]
3566 #[target_feature(enable = "avx2")]
3567 #[cfg_attr(test, assert_instr(vpunpcklwd))]
3568 #[stable(feature = "simd_x86", since = "1.27.0")]
3569 pub unsafe fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i {
3570 let r: i16x16 = simd_shuffle16(
3571 a.as_i16x16(),
3572 b.as_i16x16(),
3573 [0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27],
3574 );
3575 transmute(r)
3576 }
3577
3578 /// Unpacks and interleaves 32-bit integers from the high half of each
3579 /// 128-bit lane of `a` and `b`.
3580 ///
3581 /// ```rust
3582 /// #[cfg(target_arch = "x86")]
3583 /// use std::arch::x86::*;
3584 /// #[cfg(target_arch = "x86_64")]
3585 /// use std::arch::x86_64::*;
3586 ///
3587 /// # fn main() {
3588 /// # if is_x86_feature_detected!("avx2") {
3589 /// # #[target_feature(enable = "avx2")]
3590 /// # unsafe fn worker() {
3591 /// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
3592 /// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7);
3593 ///
3594 /// let c = _mm256_unpackhi_epi32(a, b);
3595 ///
3596 /// let expected = _mm256_setr_epi32(2, -2, 3, -3, 6, -6, 7, -7);
3597 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3598 ///
3599 /// # }
3600 /// # unsafe { worker(); }
3601 /// # }
3602 /// # }
3603 /// ```
3604 ///
3605 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_epi32)
3606 #[inline]
3607 #[target_feature(enable = "avx2")]
3608 #[cfg_attr(test, assert_instr(vunpckhps))]
3609 #[stable(feature = "simd_x86", since = "1.27.0")]
3610 pub unsafe fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i {
3611 let r: i32x8 = simd_shuffle8(a.as_i32x8(), b.as_i32x8(), [2, 10, 3, 11, 6, 14, 7, 15]);
3612 transmute(r)
3613 }
3614
3615 /// Unpacks and interleaves 32-bit integers from the low half of each
3616 /// 128-bit lane of `a` and `b`.
3617 ///
3618 /// ```rust
3619 /// #[cfg(target_arch = "x86")]
3620 /// use std::arch::x86::*;
3621 /// #[cfg(target_arch = "x86_64")]
3622 /// use std::arch::x86_64::*;
3623 ///
3624 /// # fn main() {
3625 /// # if is_x86_feature_detected!("avx2") {
3626 /// # #[target_feature(enable = "avx2")]
3627 /// # unsafe fn worker() {
3628 /// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
3629 /// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7);
3630 ///
3631 /// let c = _mm256_unpacklo_epi32(a, b);
3632 ///
3633 /// let expected = _mm256_setr_epi32(0, 0, 1, -1, 4, -4, 5, -5);
3634 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3635 ///
3636 /// # }
3637 /// # unsafe { worker(); }
3638 /// # }
3639 /// # }
3640 /// ```
3641 ///
3642 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_epi32)
3643 #[inline]
3644 #[target_feature(enable = "avx2")]
3645 #[cfg_attr(test, assert_instr(vunpcklps))]
3646 #[stable(feature = "simd_x86", since = "1.27.0")]
3647 pub unsafe fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i {
3648 let r: i32x8 = simd_shuffle8(a.as_i32x8(), b.as_i32x8(), [0, 8, 1, 9, 4, 12, 5, 13]);
3649 transmute(r)
3650 }
3651
3652 /// Unpacks and interleaves 64-bit integers from the high half of each
3653 /// 128-bit lane of `a` and `b`.
3654 ///
3655 /// ```rust
3656 /// #[cfg(target_arch = "x86")]
3657 /// use std::arch::x86::*;
3658 /// #[cfg(target_arch = "x86_64")]
3659 /// use std::arch::x86_64::*;
3660 ///
3661 /// # fn main() {
3662 /// # if is_x86_feature_detected!("avx2") {
3663 /// # #[target_feature(enable = "avx2")]
3664 /// # unsafe fn worker() {
3665 /// let a = _mm256_setr_epi64x(0, 1, 2, 3);
3666 /// let b = _mm256_setr_epi64x(0, -1, -2, -3);
3667 ///
3668 /// let c = _mm256_unpackhi_epi64(a, b);
3669 ///
3670 /// let expected = _mm256_setr_epi64x(1, -1, 3, -3);
3671 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3672 ///
3673 /// # }
3674 /// # unsafe { worker(); }
3675 /// # }
3676 /// # }
3677 /// ```
3678 ///
3679 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_epi64)
3680 #[inline]
3681 #[target_feature(enable = "avx2")]
3682 #[cfg_attr(test, assert_instr(vunpckhpd))]
3683 #[stable(feature = "simd_x86", since = "1.27.0")]
3684 pub unsafe fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i {
3685 let r: i64x4 = simd_shuffle4(a.as_i64x4(), b.as_i64x4(), [1, 5, 3, 7]);
3686 transmute(r)
3687 }
3688
3689 /// Unpacks and interleaves 64-bit integers from the low half of each
3690 /// 128-bit lane of `a` and `b`.
3691 ///
3692 /// ```rust
3693 /// #[cfg(target_arch = "x86")]
3694 /// use std::arch::x86::*;
3695 /// #[cfg(target_arch = "x86_64")]
3696 /// use std::arch::x86_64::*;
3697 ///
3698 /// # fn main() {
3699 /// # if is_x86_feature_detected!("avx2") {
3700 /// # #[target_feature(enable = "avx2")]
3701 /// # unsafe fn worker() {
3702 /// let a = _mm256_setr_epi64x(0, 1, 2, 3);
3703 /// let b = _mm256_setr_epi64x(0, -1, -2, -3);
3704 ///
3705 /// let c = _mm256_unpacklo_epi64(a, b);
3706 ///
3707 /// let expected = _mm256_setr_epi64x(0, 0, 2, -2);
3708 /// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3709 ///
3710 /// # }
3711 /// # unsafe { worker(); }
3712 /// # }
3713 /// # }
3714 /// ```
3715 ///
3716 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_epi64)
3717 #[inline]
3718 #[target_feature(enable = "avx2")]
3719 #[cfg_attr(test, assert_instr(vunpcklpd))]
3720 #[stable(feature = "simd_x86", since = "1.27.0")]
3721 pub unsafe fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i {
3722 let r: i64x4 = simd_shuffle4(a.as_i64x4(), b.as_i64x4(), [0, 4, 2, 6]);
3723 transmute(r)
3724 }
3725
3726 /// Computes the bitwise XOR of 256 bits (representing integer data)
3727 /// in `a` and `b`.
3728 ///
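/// A short usage sketch, added for illustration (not upstream documentation);
/// it assumes AVX2 is detected at runtime and XORs two arbitrary bit patterns:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_set1_epi8(0b0101);
/// let b = _mm256_set1_epi8(0b0011);
/// let r = _mm256_xor_si256(a, b);
/// // 0b0101 ^ 0b0011 == 0b0110 in every byte
/// let expected = _mm256_set1_epi8(0b0110);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///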
3729 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_xor_si256)
3730 #[inline]
3731 #[target_feature(enable = "avx2")]
3732 #[cfg_attr(test, assert_instr(vxorps))]
3733 #[stable(feature = "simd_x86", since = "1.27.0")]
3734 pub unsafe fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i {
3735 transmute(simd_xor(a.as_i64x4(), b.as_i64x4()))
3736 }
3737
3738 /// Extracts an 8-bit integer from `a`, selected with `imm8`. Returns the
3739 /// extracted 8-bit integer.
3740 ///
3741 /// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
3742 ///
3743 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extract_epi8)
3744 #[inline]
3745 #[target_feature(enable = "avx2")]
3746 // This intrinsic has no corresponding instruction.
3747 #[rustc_args_required_const(1)]
3748 #[stable(feature = "simd_x86", since = "1.27.0")]
3749 pub unsafe fn _mm256_extract_epi8(a: __m256i, imm8: i32) -> i8 {
3750 let imm8 = (imm8 & 31) as u32;
3751 simd_extract(a.as_i8x32(), imm8)
3752 }
3753
3754 /// Extracts a 16-bit integer from `a`, selected with `imm8`. Returns the
3755 /// extracted 16-bit integer.
3756 ///
3757 /// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
3758 ///
3759 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extract_epi16)
3760 #[inline]
3761 #[target_feature(enable = "avx2")]
3762 // This intrinsic has no corresponding instruction.
3763 #[rustc_args_required_const(1)]
3764 #[stable(feature = "simd_x86", since = "1.27.0")]
3765 pub unsafe fn _mm256_extract_epi16(a: __m256i, imm8: i32) -> i16 {
3766 let imm8 = (imm8 & 15) as u32;
3767 simd_extract(a.as_i16x16(), imm8)
3768 }
3769
3770 /// Extracts a 32-bit integer from `a`, selected with `imm8`.
3771 ///
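/// A short usage sketch, added for illustration (not upstream documentation);
/// it assumes AVX2 is detected at runtime and extracts element index 3:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
/// assert_eq!(_mm256_extract_epi32(a, 3), 3);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///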
3772 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extract_epi32)
3773 #[inline]
3774 #[target_feature(enable = "avx2")]
3775 // This intrinsic has no corresponding instruction.
3776 #[rustc_args_required_const(1)]
3777 #[stable(feature = "simd_x86", since = "1.27.0")]
3778 pub unsafe fn _mm256_extract_epi32(a: __m256i, imm8: i32) -> i32 {
3779 let imm8 = (imm8 & 7) as u32;
3780 simd_extract(a.as_i32x8(), imm8)
3781 }
3782
3783 /// Returns the first element of the input vector of `[4 x double]`.
3784 ///
3785 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsd_f64)
3786 #[inline]
3787 #[target_feature(enable = "avx2")]
3788 //#[cfg_attr(test, assert_instr(movsd))] FIXME
3789 #[stable(feature = "simd_x86", since = "1.27.0")]
3790 pub unsafe fn _mm256_cvtsd_f64(a: __m256d) -> f64 {
3791 simd_extract(a, 0)
3792 }
3793
3794 /// Returns the first element of the input vector of `[8 x i32]`.
3795 ///
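/// A short usage sketch, added for illustration (not upstream documentation);
/// it assumes AVX2 is detected at runtime and reads the lowest 32-bit lane:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// # if is_x86_feature_detected!("avx2") {
/// # #[target_feature(enable = "avx2")]
/// # unsafe fn worker() {
/// let a = _mm256_setr_epi32(7, 1, 2, 3, 4, 5, 6, 0);
/// assert_eq!(_mm256_cvtsi256_si32(a), 7);
/// # }
/// # unsafe { worker(); }
/// # }
/// # }
/// ```
///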
3796 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsi256_si32)
3797 #[inline]
3798 #[target_feature(enable = "avx2")]
3799 //#[cfg_attr(test, assert_instr(movd))] FIXME
3800 #[stable(feature = "simd_x86", since = "1.27.0")]
3801 pub unsafe fn _mm256_cvtsi256_si32(a: __m256i) -> i32 {
3802 simd_extract(a.as_i32x8(), 0)
3803 }
3804
3805 #[allow(improper_ctypes)]
3806 extern "C" {
3807 #[link_name = "llvm.x86.avx2.pabs.b"]
3808 fn pabsb(a: i8x32) -> u8x32;
3809 #[link_name = "llvm.x86.avx2.pabs.w"]
3810 fn pabsw(a: i16x16) -> u16x16;
3811 #[link_name = "llvm.x86.avx2.pabs.d"]
3812 fn pabsd(a: i32x8) -> u32x8;
3813 #[link_name = "llvm.x86.avx2.padds.b"]
3814 fn paddsb(a: i8x32, b: i8x32) -> i8x32;
3815 #[link_name = "llvm.x86.avx2.padds.w"]
3816 fn paddsw(a: i16x16, b: i16x16) -> i16x16;
3817 #[link_name = "llvm.x86.avx2.paddus.b"]
3818 fn paddusb(a: u8x32, b: u8x32) -> u8x32;
3819 #[link_name = "llvm.x86.avx2.paddus.w"]
3820 fn paddusw(a: u16x16, b: u16x16) -> u16x16;
3821 #[link_name = "llvm.x86.avx2.pavg.b"]
3822 fn pavgb(a: u8x32, b: u8x32) -> u8x32;
3823 #[link_name = "llvm.x86.avx2.pavg.w"]
3824 fn pavgw(a: u16x16, b: u16x16) -> u16x16;
3825 #[link_name = "llvm.x86.avx2.pblendvb"]
3826 fn pblendvb(a: i8x32, b: i8x32, mask: i8x32) -> i8x32;
3827 #[link_name = "llvm.x86.avx2.phadd.w"]
3828 fn phaddw(a: i16x16, b: i16x16) -> i16x16;
3829 #[link_name = "llvm.x86.avx2.phadd.d"]
3830 fn phaddd(a: i32x8, b: i32x8) -> i32x8;
3831 #[link_name = "llvm.x86.avx2.phadd.sw"]
3832 fn phaddsw(a: i16x16, b: i16x16) -> i16x16;
3833 #[link_name = "llvm.x86.avx2.phsub.w"]
3834 fn phsubw(a: i16x16, b: i16x16) -> i16x16;
3835 #[link_name = "llvm.x86.avx2.phsub.d"]
3836 fn phsubd(a: i32x8, b: i32x8) -> i32x8;
3837 #[link_name = "llvm.x86.avx2.phsub.sw"]
3838 fn phsubsw(a: i16x16, b: i16x16) -> i16x16;
3839 #[link_name = "llvm.x86.avx2.pmadd.wd"]
3840 fn pmaddwd(a: i16x16, b: i16x16) -> i32x8;
3841 #[link_name = "llvm.x86.avx2.pmadd.ub.sw"]
3842 fn pmaddubsw(a: u8x32, b: u8x32) -> i16x16;
3843 #[link_name = "llvm.x86.avx2.maskload.d"]
3844 fn maskloadd(mem_addr: *const i8, mask: i32x4) -> i32x4;
3845 #[link_name = "llvm.x86.avx2.maskload.d.256"]
3846 fn maskloadd256(mem_addr: *const i8, mask: i32x8) -> i32x8;
3847 #[link_name = "llvm.x86.avx2.maskload.q"]
3848 fn maskloadq(mem_addr: *const i8, mask: i64x2) -> i64x2;
3849 #[link_name = "llvm.x86.avx2.maskload.q.256"]
3850 fn maskloadq256(mem_addr: *const i8, mask: i64x4) -> i64x4;
3851 #[link_name = "llvm.x86.avx2.maskstore.d"]
3852 fn maskstored(mem_addr: *mut i8, mask: i32x4, a: i32x4);
3853 #[link_name = "llvm.x86.avx2.maskstore.d.256"]
3854 fn maskstored256(mem_addr: *mut i8, mask: i32x8, a: i32x8);
3855 #[link_name = "llvm.x86.avx2.maskstore.q"]
3856 fn maskstoreq(mem_addr: *mut i8, mask: i64x2, a: i64x2);
3857 #[link_name = "llvm.x86.avx2.maskstore.q.256"]
3858 fn maskstoreq256(mem_addr: *mut i8, mask: i64x4, a: i64x4);
3859 #[link_name = "llvm.x86.avx2.pmaxs.w"]
3860 fn pmaxsw(a: i16x16, b: i16x16) -> i16x16;
3861 #[link_name = "llvm.x86.avx2.pmaxs.d"]
3862 fn pmaxsd(a: i32x8, b: i32x8) -> i32x8;
3863 #[link_name = "llvm.x86.avx2.pmaxs.b"]
3864 fn pmaxsb(a: i8x32, b: i8x32) -> i8x32;
3865 #[link_name = "llvm.x86.avx2.pmaxu.w"]
3866 fn pmaxuw(a: u16x16, b: u16x16) -> u16x16;
3867 #[link_name = "llvm.x86.avx2.pmaxu.d"]
3868 fn pmaxud(a: u32x8, b: u32x8) -> u32x8;
3869 #[link_name = "llvm.x86.avx2.pmaxu.b"]
3870 fn pmaxub(a: u8x32, b: u8x32) -> u8x32;
3871 #[link_name = "llvm.x86.avx2.pmins.w"]
3872 fn pminsw(a: i16x16, b: i16x16) -> i16x16;
3873 #[link_name = "llvm.x86.avx2.pmins.d"]
3874 fn pminsd(a: i32x8, b: i32x8) -> i32x8;
3875 #[link_name = "llvm.x86.avx2.pmins.b"]
3876 fn pminsb(a: i8x32, b: i8x32) -> i8x32;
3877 #[link_name = "llvm.x86.avx2.pminu.w"]
3878 fn pminuw(a: u16x16, b: u16x16) -> u16x16;
3879 #[link_name = "llvm.x86.avx2.pminu.d"]
3880 fn pminud(a: u32x8, b: u32x8) -> u32x8;
3881 #[link_name = "llvm.x86.avx2.pminu.b"]
3882 fn pminub(a: u8x32, b: u8x32) -> u8x32;
3883 #[link_name = "llvm.x86.avx2.pmovmskb"]
3884 fn pmovmskb(a: i8x32) -> i32;
3885 #[link_name = "llvm.x86.avx2.mpsadbw"]
3886 fn mpsadbw(a: u8x32, b: u8x32, imm8: i32) -> u16x16;
3887 #[link_name = "llvm.x86.avx2.pmulhu.w"]
3888 fn pmulhuw(a: u16x16, b: u16x16) -> u16x16;
3889 #[link_name = "llvm.x86.avx2.pmulh.w"]
3890 fn pmulhw(a: i16x16, b: i16x16) -> i16x16;
3891 #[link_name = "llvm.x86.avx2.pmul.dq"]
3892 fn pmuldq(a: i32x8, b: i32x8) -> i64x4;
3893 #[link_name = "llvm.x86.avx2.pmulu.dq"]
3894 fn pmuludq(a: u32x8, b: u32x8) -> u64x4;
3895 #[link_name = "llvm.x86.avx2.pmul.hr.sw"]
3896 fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16;
3897 #[link_name = "llvm.x86.avx2.packsswb"]
3898 fn packsswb(a: i16x16, b: i16x16) -> i8x32;
3899 #[link_name = "llvm.x86.avx2.packssdw"]
3900 fn packssdw(a: i32x8, b: i32x8) -> i16x16;
3901 #[link_name = "llvm.x86.avx2.packuswb"]
3902 fn packuswb(a: i16x16, b: i16x16) -> u8x32;
3903 #[link_name = "llvm.x86.avx2.packusdw"]
3904 fn packusdw(a: i32x8, b: i32x8) -> u16x16;
3905 #[link_name = "llvm.x86.avx2.psad.bw"]
3906 fn psadbw(a: u8x32, b: u8x32) -> u64x4;
3907 #[link_name = "llvm.x86.avx2.psign.b"]
3908 fn psignb(a: i8x32, b: i8x32) -> i8x32;
3909 #[link_name = "llvm.x86.avx2.psign.w"]
3910 fn psignw(a: i16x16, b: i16x16) -> i16x16;
3911 #[link_name = "llvm.x86.avx2.psign.d"]
3912 fn psignd(a: i32x8, b: i32x8) -> i32x8;
3913 #[link_name = "llvm.x86.avx2.psll.w"]
3914 fn psllw(a: i16x16, count: i16x8) -> i16x16;
3915 #[link_name = "llvm.x86.avx2.psll.d"]
3916 fn pslld(a: i32x8, count: i32x4) -> i32x8;
3917 #[link_name = "llvm.x86.avx2.psll.q"]
3918 fn psllq(a: i64x4, count: i64x2) -> i64x4;
3919 #[link_name = "llvm.x86.avx2.pslli.w"]
3920 fn pslliw(a: i16x16, imm8: i32) -> i16x16;
3921 #[link_name = "llvm.x86.avx2.pslli.d"]
3922 fn psllid(a: i32x8, imm8: i32) -> i32x8;
3923 #[link_name = "llvm.x86.avx2.pslli.q"]
3924 fn pslliq(a: i64x4, imm8: i32) -> i64x4;
3925 #[link_name = "llvm.x86.avx2.psllv.d"]
3926 fn psllvd(a: i32x4, count: i32x4) -> i32x4;
3927 #[link_name = "llvm.x86.avx2.psllv.d.256"]
3928 fn psllvd256(a: i32x8, count: i32x8) -> i32x8;
3929 #[link_name = "llvm.x86.avx2.psllv.q"]
3930 fn psllvq(a: i64x2, count: i64x2) -> i64x2;
3931 #[link_name = "llvm.x86.avx2.psllv.q.256"]
3932 fn psllvq256(a: i64x4, count: i64x4) -> i64x4;
3933 #[link_name = "llvm.x86.avx2.psra.w"]
3934 fn psraw(a: i16x16, count: i16x8) -> i16x16;
3935 #[link_name = "llvm.x86.avx2.psra.d"]
3936 fn psrad(a: i32x8, count: i32x4) -> i32x8;
3937 #[link_name = "llvm.x86.avx2.psrai.w"]
3938 fn psraiw(a: i16x16, imm8: i32) -> i16x16;
3939 #[link_name = "llvm.x86.avx2.psrai.d"]
3940 fn psraid(a: i32x8, imm8: i32) -> i32x8;
3941 #[link_name = "llvm.x86.avx2.psrav.d"]
3942 fn psravd(a: i32x4, count: i32x4) -> i32x4;
3943 #[link_name = "llvm.x86.avx2.psrav.d.256"]
3944 fn psravd256(a: i32x8, count: i32x8) -> i32x8;
3945 #[link_name = "llvm.x86.avx2.psrl.w"]
3946 fn psrlw(a: i16x16, count: i16x8) -> i16x16;
3947 #[link_name = "llvm.x86.avx2.psrl.d"]
3948 fn psrld(a: i32x8, count: i32x4) -> i32x8;
3949 #[link_name = "llvm.x86.avx2.psrl.q"]
3950 fn psrlq(a: i64x4, count: i64x2) -> i64x4;
3951 #[link_name = "llvm.x86.avx2.psrli.w"]
3952 fn psrliw(a: i16x16, imm8: i32) -> i16x16;
3953 #[link_name = "llvm.x86.avx2.psrli.d"]
3954 fn psrlid(a: i32x8, imm8: i32) -> i32x8;
3955 #[link_name = "llvm.x86.avx2.psrli.q"]
3956 fn psrliq(a: i64x4, imm8: i32) -> i64x4;
3957 #[link_name = "llvm.x86.avx2.psrlv.d"]
3958 fn psrlvd(a: i32x4, count: i32x4) -> i32x4;
3959 #[link_name = "llvm.x86.avx2.psrlv.d.256"]
3960 fn psrlvd256(a: i32x8, count: i32x8) -> i32x8;
3961 #[link_name = "llvm.x86.avx2.psrlv.q"]
3962 fn psrlvq(a: i64x2, count: i64x2) -> i64x2;
3963 #[link_name = "llvm.x86.avx2.psrlv.q.256"]
3964 fn psrlvq256(a: i64x4, count: i64x4) -> i64x4;
3965 #[link_name = "llvm.x86.avx2.psubs.b"]
3966 fn psubsb(a: i8x32, b: i8x32) -> i8x32;
3967 #[link_name = "llvm.x86.avx2.psubs.w"]
3968 fn psubsw(a: i16x16, b: i16x16) -> i16x16;
3969 #[link_name = "llvm.x86.avx2.psubus.b"]
3970 fn psubusb(a: u8x32, b: u8x32) -> u8x32;
3971 #[link_name = "llvm.x86.avx2.psubus.w"]
3972 fn psubusw(a: u16x16, b: u16x16) -> u16x16;
3973 #[link_name = "llvm.x86.avx2.pshuf.b"]
3974 fn pshufb(a: u8x32, b: u8x32) -> u8x32;
3975 #[link_name = "llvm.x86.avx2.permd"]
3976 fn permd(a: u32x8, b: u32x8) -> u32x8;
3977 #[link_name = "llvm.x86.avx2.permps"]
3978 fn permps(a: __m256, b: i32x8) -> __m256;
3979 #[link_name = "llvm.x86.avx2.vperm2i128"]
3980 fn vperm2i128(a: i64x4, b: i64x4, imm8: i8) -> i64x4;
3981 #[link_name = "llvm.x86.avx2.gather.d.d"]
3982 fn pgatherdd(src: i32x4, slice: *const i8, offsets: i32x4, mask: i32x4, scale: i8) -> i32x4;
3983 #[link_name = "llvm.x86.avx2.gather.d.d.256"]
3984 fn vpgatherdd(src: i32x8, slice: *const i8, offsets: i32x8, mask: i32x8, scale: i8) -> i32x8;
3985 #[link_name = "llvm.x86.avx2.gather.d.q"]
3986 fn pgatherdq(src: i64x2, slice: *const i8, offsets: i32x4, mask: i64x2, scale: i8) -> i64x2;
3987 #[link_name = "llvm.x86.avx2.gather.d.q.256"]
3988 fn vpgatherdq(src: i64x4, slice: *const i8, offsets: i32x4, mask: i64x4, scale: i8) -> i64x4;
3989 #[link_name = "llvm.x86.avx2.gather.q.d"]
3990 fn pgatherqd(src: i32x4, slice: *const i8, offsets: i64x2, mask: i32x4, scale: i8) -> i32x4;
3991 #[link_name = "llvm.x86.avx2.gather.q.d.256"]
3992 fn vpgatherqd(src: i32x4, slice: *const i8, offsets: i64x4, mask: i32x4, scale: i8) -> i32x4;
3993 #[link_name = "llvm.x86.avx2.gather.q.q"]
3994 fn pgatherqq(src: i64x2, slice: *const i8, offsets: i64x2, mask: i64x2, scale: i8) -> i64x2;
3995 #[link_name = "llvm.x86.avx2.gather.q.q.256"]
3996 fn vpgatherqq(src: i64x4, slice: *const i8, offsets: i64x4, mask: i64x4, scale: i8) -> i64x4;
3997 #[link_name = "llvm.x86.avx2.gather.d.pd"]
3998 fn pgatherdpd(
3999 src: __m128d,
4000 slice: *const i8,
4001 offsets: i32x4,
4002 mask: __m128d,
4003 scale: i8,
4004 ) -> __m128d;
4005 #[link_name = "llvm.x86.avx2.gather.d.pd.256"]
4006 fn vpgatherdpd(
4007 src: __m256d,
4008 slice: *const i8,
4009 offsets: i32x4,
4010 mask: __m256d,
4011 scale: i8,
4012 ) -> __m256d;
4013 #[link_name = "llvm.x86.avx2.gather.q.pd"]
4014 fn pgatherqpd(
4015 src: __m128d,
4016 slice: *const i8,
4017 offsets: i64x2,
4018 mask: __m128d,
4019 scale: i8,
4020 ) -> __m128d;
4021 #[link_name = "llvm.x86.avx2.gather.q.pd.256"]
4022 fn vpgatherqpd(
4023 src: __m256d,
4024 slice: *const i8,
4025 offsets: i64x4,
4026 mask: __m256d,
4027 scale: i8,
4028 ) -> __m256d;
4029 #[link_name = "llvm.x86.avx2.gather.d.ps"]
4030 fn pgatherdps(src: __m128, slice: *const i8, offsets: i32x4, mask: __m128, scale: i8)
4031 -> __m128;
4032 #[link_name = "llvm.x86.avx2.gather.d.ps.256"]
4033 fn vpgatherdps(
4034 src: __m256,
4035 slice: *const i8,
4036 offsets: i32x8,
4037 mask: __m256,
4038 scale: i8,
4039 ) -> __m256;
4040 #[link_name = "llvm.x86.avx2.gather.q.ps"]
4041 fn pgatherqps(src: __m128, slice: *const i8, offsets: i64x2, mask: __m128, scale: i8)
4042 -> __m128;
4043 #[link_name = "llvm.x86.avx2.gather.q.ps.256"]
4044 fn vpgatherqps(
4045 src: __m128,
4046 slice: *const i8,
4047 offsets: i64x4,
4048 mask: __m128,
4049 scale: i8,
4050 ) -> __m128;
4051 #[link_name = "llvm.x86.avx2.psll.dq"]
4052 fn vpslldq(a: i64x4, b: i32) -> i64x4;
4053 #[link_name = "llvm.x86.avx2.psrl.dq"]
4054 fn vpsrldq(a: i64x4, b: i32) -> i64x4;
4055 }
4056
4057 #[cfg(test)]
4058 mod tests {
4059 use std;
4060 use stdarch_test::simd_test;
4061
4062 use crate::core_arch::x86::*;
4063
4064 #[simd_test(enable = "avx2")]
4065 unsafe fn test_mm256_abs_epi32() {
4066 #[rustfmt::skip]
4067 let a = _mm256_setr_epi32(
4068 0, 1, -1, std::i32::MAX,
4069 std::i32::MIN, 100, -100, -32,
4070 );
4071 let r = _mm256_abs_epi32(a);
4072 #[rustfmt::skip]
4073 let e = _mm256_setr_epi32(
4074 0, 1, 1, std::i32::MAX,
4075 std::i32::MAX.wrapping_add(1), 100, 100, 32,
4076 );
4077 assert_eq_m256i(r, e);
4078 }
4079
4080 #[simd_test(enable = "avx2")]
4081 unsafe fn test_mm256_abs_epi16() {
4082 #[rustfmt::skip]
4083 let a = _mm256_setr_epi16(
4084 0, 1, -1, 2, -2, 3, -3, 4,
4085 -4, 5, -5, std::i16::MAX, std::i16::MIN, 100, -100, -32,
4086 );
4087 let r = _mm256_abs_epi16(a);
4088 #[rustfmt::skip]
4089 let e = _mm256_setr_epi16(
4090 0, 1, 1, 2, 2, 3, 3, 4,
4091 4, 5, 5, std::i16::MAX, std::i16::MAX.wrapping_add(1), 100, 100, 32,
4092 );
4093 assert_eq_m256i(r, e);
4094 }
4095
4096 #[simd_test(enable = "avx2")]
4097 unsafe fn test_mm256_abs_epi8() {
4098 #[rustfmt::skip]
4099 let a = _mm256_setr_epi8(
4100 0, 1, -1, 2, -2, 3, -3, 4,
4101 -4, 5, -5, std::i8::MAX, std::i8::MIN, 100, -100, -32,
4102 0, 1, -1, 2, -2, 3, -3, 4,
4103 -4, 5, -5, std::i8::MAX, std::i8::MIN, 100, -100, -32,
4104 );
4105 let r = _mm256_abs_epi8(a);
4106 #[rustfmt::skip]
4107 let e = _mm256_setr_epi8(
4108 0, 1, 1, 2, 2, 3, 3, 4,
4109 4, 5, 5, std::i8::MAX, std::i8::MAX.wrapping_add(1), 100, 100, 32,
4110 0, 1, 1, 2, 2, 3, 3, 4,
4111 4, 5, 5, std::i8::MAX, std::i8::MAX.wrapping_add(1), 100, 100, 32,
4112 );
4113 assert_eq_m256i(r, e);
4114 }
4115
4116 #[simd_test(enable = "avx2")]
4117 unsafe fn test_mm256_add_epi64() {
4118 let a = _mm256_setr_epi64x(-10, 0, 100, 1_000_000_000);
4119 let b = _mm256_setr_epi64x(-1, 0, 1, 2);
4120 let r = _mm256_add_epi64(a, b);
4121 let e = _mm256_setr_epi64x(-11, 0, 101, 1_000_000_002);
4122 assert_eq_m256i(r, e);
4123 }
4124
4125 #[simd_test(enable = "avx2")]
4126 unsafe fn test_mm256_add_epi32() {
4127 let a = _mm256_setr_epi32(-1, 0, 1, 2, 3, 4, 5, 6);
4128 let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4129 let r = _mm256_add_epi32(a, b);
4130 let e = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14);
4131 assert_eq_m256i(r, e);
4132 }
4133
4134 #[simd_test(enable = "avx2")]
4135 unsafe fn test_mm256_add_epi16() {
4136 #[rustfmt::skip]
4137 let a = _mm256_setr_epi16(
4138 0, 1, 2, 3, 4, 5, 6, 7,
4139 8, 9, 10, 11, 12, 13, 14, 15,
4140 );
4141 #[rustfmt::skip]
4142 let b = _mm256_setr_epi16(
4143 0, 1, 2, 3, 4, 5, 6, 7,
4144 8, 9, 10, 11, 12, 13, 14, 15,
4145 );
4146 let r = _mm256_add_epi16(a, b);
4147 #[rustfmt::skip]
4148 let e = _mm256_setr_epi16(
4149 0, 2, 4, 6, 8, 10, 12, 14,
4150 16, 18, 20, 22, 24, 26, 28, 30,
4151 );
4152 assert_eq_m256i(r, e);
4153 }
4154
4155 #[simd_test(enable = "avx2")]
4156 unsafe fn test_mm256_add_epi8() {
4157 #[rustfmt::skip]
4158 let a = _mm256_setr_epi8(
4159 0, 1, 2, 3, 4, 5, 6, 7,
4160 8, 9, 10, 11, 12, 13, 14, 15,
4161 16, 17, 18, 19, 20, 21, 22, 23,
4162 24, 25, 26, 27, 28, 29, 30, 31,
4163 );
4164 #[rustfmt::skip]
4165 let b = _mm256_setr_epi8(
4166 0, 1, 2, 3, 4, 5, 6, 7,
4167 8, 9, 10, 11, 12, 13, 14, 15,
4168 16, 17, 18, 19, 20, 21, 22, 23,
4169 24, 25, 26, 27, 28, 29, 30, 31,
4170 );
4171 let r = _mm256_add_epi8(a, b);
4172 #[rustfmt::skip]
4173 let e = _mm256_setr_epi8(
4174 0, 2, 4, 6, 8, 10, 12, 14,
4175 16, 18, 20, 22, 24, 26, 28, 30,
4176 32, 34, 36, 38, 40, 42, 44, 46,
4177 48, 50, 52, 54, 56, 58, 60, 62,
4178 );
4179 assert_eq_m256i(r, e);
4180 }
4181
4182 #[simd_test(enable = "avx2")]
4183 unsafe fn test_mm256_adds_epi8() {
4184 #[rustfmt::skip]
4185 let a = _mm256_setr_epi8(
4186 0, 1, 2, 3, 4, 5, 6, 7,
4187 8, 9, 10, 11, 12, 13, 14, 15,
4188 16, 17, 18, 19, 20, 21, 22, 23,
4189 24, 25, 26, 27, 28, 29, 30, 31,
4190 );
4191 #[rustfmt::skip]
4192 let b = _mm256_setr_epi8(
4193 32, 33, 34, 35, 36, 37, 38, 39,
4194 40, 41, 42, 43, 44, 45, 46, 47,
4195 48, 49, 50, 51, 52, 53, 54, 55,
4196 56, 57, 58, 59, 60, 61, 62, 63,
4197 );
4198 let r = _mm256_adds_epi8(a, b);
4199 #[rustfmt::skip]
4200 let e = _mm256_setr_epi8(
4201 32, 34, 36, 38, 40, 42, 44, 46,
4202 48, 50, 52, 54, 56, 58, 60, 62,
4203 64, 66, 68, 70, 72, 74, 76, 78,
4204 80, 82, 84, 86, 88, 90, 92, 94,
4205 );
4206 assert_eq_m256i(r, e);
4207 }
4208
4209 #[simd_test(enable = "avx2")]
4210 unsafe fn test_mm256_adds_epi8_saturate_positive() {
4211 let a = _mm256_set1_epi8(0x7F);
4212 let b = _mm256_set1_epi8(1);
4213 let r = _mm256_adds_epi8(a, b);
4214 assert_eq_m256i(r, a);
4215 }
4216
4217 #[simd_test(enable = "avx2")]
4218 unsafe fn test_mm256_adds_epi8_saturate_negative() {
4219 let a = _mm256_set1_epi8(-0x80);
4220 let b = _mm256_set1_epi8(-1);
4221 let r = _mm256_adds_epi8(a, b);
4222 assert_eq_m256i(r, a);
4223 }
4224
4225 #[simd_test(enable = "avx2")]
4226 unsafe fn test_mm256_adds_epi16() {
4227 #[rustfmt::skip]
4228 let a = _mm256_setr_epi16(
4229 0, 1, 2, 3, 4, 5, 6, 7,
4230 8, 9, 10, 11, 12, 13, 14, 15,
4231 );
4232 #[rustfmt::skip]
4233 let b = _mm256_setr_epi16(
4234 32, 33, 34, 35, 36, 37, 38, 39,
4235 40, 41, 42, 43, 44, 45, 46, 47,
4236 );
4237 let r = _mm256_adds_epi16(a, b);
4238 #[rustfmt::skip]
4239 let e = _mm256_setr_epi16(
4240 32, 34, 36, 38, 40, 42, 44, 46,
4241 48, 50, 52, 54, 56, 58, 60, 62,
4242 );
4243
4244 assert_eq_m256i(r, e);
4245 }
4246
4247 #[simd_test(enable = "avx2")]
4248 unsafe fn test_mm256_adds_epi16_saturate_positive() {
4249 let a = _mm256_set1_epi16(0x7FFF);
4250 let b = _mm256_set1_epi16(1);
4251 let r = _mm256_adds_epi16(a, b);
4252 assert_eq_m256i(r, a);
4253 }
4254
4255 #[simd_test(enable = "avx2")]
4256 unsafe fn test_mm256_adds_epi16_saturate_negative() {
4257 let a = _mm256_set1_epi16(-0x8000);
4258 let b = _mm256_set1_epi16(-1);
4259 let r = _mm256_adds_epi16(a, b);
4260 assert_eq_m256i(r, a);
4261 }
4262
4263 #[simd_test(enable = "avx2")]
4264 unsafe fn test_mm256_adds_epu8() {
4265 #[rustfmt::skip]
4266 let a = _mm256_setr_epi8(
4267 0, 1, 2, 3, 4, 5, 6, 7,
4268 8, 9, 10, 11, 12, 13, 14, 15,
4269 16, 17, 18, 19, 20, 21, 22, 23,
4270 24, 25, 26, 27, 28, 29, 30, 31,
4271 );
4272 #[rustfmt::skip]
4273 let b = _mm256_setr_epi8(
4274 32, 33, 34, 35, 36, 37, 38, 39,
4275 40, 41, 42, 43, 44, 45, 46, 47,
4276 48, 49, 50, 51, 52, 53, 54, 55,
4277 56, 57, 58, 59, 60, 61, 62, 63,
4278 );
4279 let r = _mm256_adds_epu8(a, b);
4280 #[rustfmt::skip]
4281 let e = _mm256_setr_epi8(
4282 32, 34, 36, 38, 40, 42, 44, 46,
4283 48, 50, 52, 54, 56, 58, 60, 62,
4284 64, 66, 68, 70, 72, 74, 76, 78,
4285 80, 82, 84, 86, 88, 90, 92, 94,
4286 );
4287 assert_eq_m256i(r, e);
4288 }
4289
4290 #[simd_test(enable = "avx2")]
4291 unsafe fn test_mm256_adds_epu8_saturate() {
4292 let a = _mm256_set1_epi8(!0);
4293 let b = _mm256_set1_epi8(1);
4294 let r = _mm256_adds_epu8(a, b);
4295 assert_eq_m256i(r, a);
4296 }
4297
4298 #[simd_test(enable = "avx2")]
4299 unsafe fn test_mm256_adds_epu16() {
4300 #[rustfmt::skip]
4301 let a = _mm256_setr_epi16(
4302 0, 1, 2, 3, 4, 5, 6, 7,
4303 8, 9, 10, 11, 12, 13, 14, 15,
4304 );
4305 #[rustfmt::skip]
4306 let b = _mm256_setr_epi16(
4307 32, 33, 34, 35, 36, 37, 38, 39,
4308 40, 41, 42, 43, 44, 45, 46, 47,
4309 );
4310 let r = _mm256_adds_epu16(a, b);
4311 #[rustfmt::skip]
4312 let e = _mm256_setr_epi16(
4313 32, 34, 36, 38, 40, 42, 44, 46,
4314 48, 50, 52, 54, 56, 58, 60, 62,
4315 );
4316
4317 assert_eq_m256i(r, e);
4318 }
4319
4320 #[simd_test(enable = "avx2")]
4321 unsafe fn test_mm256_adds_epu16_saturate() {
4322 let a = _mm256_set1_epi16(!0);
4323 let b = _mm256_set1_epi16(1);
4324 let r = _mm256_adds_epu16(a, b);
4325 assert_eq_m256i(r, a);
4326 }
4327
4328 #[simd_test(enable = "avx2")]
4329 unsafe fn test_mm256_and_si256() {
4330 let a = _mm256_set1_epi8(5);
4331 let b = _mm256_set1_epi8(3);
4332 let got = _mm256_and_si256(a, b);
4333 assert_eq_m256i(got, _mm256_set1_epi8(1));
4334 }
4335
4336 #[simd_test(enable = "avx2")]
4337 unsafe fn test_mm256_andnot_si256() {
4338 let a = _mm256_set1_epi8(5);
4339 let b = _mm256_set1_epi8(3);
4340 let got = _mm256_andnot_si256(a, b);
4341 assert_eq_m256i(got, _mm256_set1_epi8(2));
4342 }
4343
4344 #[simd_test(enable = "avx2")]
4345 unsafe fn test_mm256_avg_epu8() {
4346 let (a, b) = (_mm256_set1_epi8(3), _mm256_set1_epi8(9));
4347 let r = _mm256_avg_epu8(a, b);
4348 assert_eq_m256i(r, _mm256_set1_epi8(6));
4349 }
4350
4351 #[simd_test(enable = "avx2")]
4352 unsafe fn test_mm256_avg_epu16() {
4353 let (a, b) = (_mm256_set1_epi16(3), _mm256_set1_epi16(9));
4354 let r = _mm256_avg_epu16(a, b);
4355 assert_eq_m256i(r, _mm256_set1_epi16(6));
4356 }
4357
4358 #[simd_test(enable = "avx2")]
4359 unsafe fn test_mm_blend_epi32() {
4360 let (a, b) = (_mm_set1_epi32(3), _mm_set1_epi32(9));
4361 let e = _mm_setr_epi32(9, 3, 3, 3);
4362 let r = _mm_blend_epi32(a, b, 0x01 as i32);
4363 assert_eq_m128i(r, e);
4364
4365 let r = _mm_blend_epi32(b, a, 0x0E as i32);
4366 assert_eq_m128i(r, e);
4367 }
4368
4369 #[simd_test(enable = "avx2")]
4370 unsafe fn test_mm256_blend_epi32() {
4371 let (a, b) = (_mm256_set1_epi32(3), _mm256_set1_epi32(9));
4372 let e = _mm256_setr_epi32(9, 3, 3, 3, 3, 3, 3, 3);
4373 let r = _mm256_blend_epi32(a, b, 0x01 as i32);
4374 assert_eq_m256i(r, e);
4375
4376 let e = _mm256_setr_epi32(3, 9, 3, 3, 3, 3, 3, 9);
4377 let r = _mm256_blend_epi32(a, b, 0x82 as i32);
4378 assert_eq_m256i(r, e);
4379
4380 let e = _mm256_setr_epi32(3, 3, 9, 9, 9, 9, 9, 3);
4381 let r = _mm256_blend_epi32(a, b, 0x7C as i32);
4382 assert_eq_m256i(r, e);
4383 }
4384
4385 #[simd_test(enable = "avx2")]
4386 unsafe fn test_mm256_blend_epi16() {
4387 let (a, b) = (_mm256_set1_epi16(3), _mm256_set1_epi16(9));
4388 let e = _mm256_setr_epi16(9, 3, 3, 3, 3, 3, 3, 3, 9, 3, 3, 3, 3, 3, 3, 3);
4389 let r = _mm256_blend_epi16(a, b, 0x01 as i32);
4390 assert_eq_m256i(r, e);
4391
4392 let r = _mm256_blend_epi16(b, a, 0xFE as i32);
4393 assert_eq_m256i(r, e);
4394 }
4395
4396 #[simd_test(enable = "avx2")]
4397 unsafe fn test_mm256_blendv_epi8() {
4398 let (a, b) = (_mm256_set1_epi8(4), _mm256_set1_epi8(2));
4399 let mask = _mm256_insert_epi8(_mm256_set1_epi8(0), -1, 2);
4400 let e = _mm256_insert_epi8(_mm256_set1_epi8(4), 2, 2);
4401 let r = _mm256_blendv_epi8(a, b, mask);
4402 assert_eq_m256i(r, e);
4403 }
4404
4405 #[simd_test(enable = "avx2")]
4406 unsafe fn test_mm_broadcastb_epi8() {
4407 let a = _mm_insert_epi8(_mm_set1_epi8(0x00), 0x2a, 0);
4408 let res = _mm_broadcastb_epi8(a);
4409 assert_eq_m128i(res, _mm_set1_epi8(0x2a));
4410 }
4411
4412 #[simd_test(enable = "avx2")]
4413 unsafe fn test_mm256_broadcastb_epi8() {
4414 let a = _mm_insert_epi8(_mm_set1_epi8(0x00), 0x2a, 0);
4415 let res = _mm256_broadcastb_epi8(a);
4416 assert_eq_m256i(res, _mm256_set1_epi8(0x2a));
4417 }
4418
4419 #[simd_test(enable = "avx2")]
4420 unsafe fn test_mm_broadcastd_epi32() {
4421 let a = _mm_setr_epi32(0x2a, 0x8000000, 0, 0);
4422 let res = _mm_broadcastd_epi32(a);
4423 assert_eq_m128i(res, _mm_set1_epi32(0x2a));
4424 }
4425
4426 #[simd_test(enable = "avx2")]
4427 unsafe fn test_mm256_broadcastd_epi32() {
4428 let a = _mm_setr_epi32(0x2a, 0x8000000, 0, 0);
4429 let res = _mm256_broadcastd_epi32(a);
4430 assert_eq_m256i(res, _mm256_set1_epi32(0x2a));
4431 }
4432
4433 #[simd_test(enable = "avx2")]
4434 unsafe fn test_mm_broadcastq_epi64() {
4435 let a = _mm_setr_epi64x(0x1ffffffff, 0);
4436 let res = _mm_broadcastq_epi64(a);
4437 assert_eq_m128i(res, _mm_set1_epi64x(0x1ffffffff));
4438 }
4439
4440 #[simd_test(enable = "avx2")]
4441 unsafe fn test_mm256_broadcastq_epi64() {
4442 let a = _mm_setr_epi64x(0x1ffffffff, 0);
4443 let res = _mm256_broadcastq_epi64(a);
4444 assert_eq_m256i(res, _mm256_set1_epi64x(0x1ffffffff));
4445 }
4446
4447 #[simd_test(enable = "avx2")]
4448 unsafe fn test_mm_broadcastsd_pd() {
4449 let a = _mm_setr_pd(6.28, 3.14);
4450 let res = _mm_broadcastsd_pd(a);
4451 assert_eq_m128d(res, _mm_set1_pd(6.28f64));
4452 }
4453
4454 #[simd_test(enable = "avx2")]
4455 unsafe fn test_mm256_broadcastsd_pd() {
4456 let a = _mm_setr_pd(6.28, 3.14);
4457 let res = _mm256_broadcastsd_pd(a);
4458 assert_eq_m256d(res, _mm256_set1_pd(6.28f64));
4459 }
4460
4461 #[simd_test(enable = "avx2")]
4462 unsafe fn test_mm256_broadcastsi128_si256() {
4463 let a = _mm_setr_epi64x(0x0987654321012334, 0x5678909876543210);
4464 let res = _mm256_broadcastsi128_si256(a);
4465 let retval = _mm256_setr_epi64x(
4466 0x0987654321012334,
4467 0x5678909876543210,
4468 0x0987654321012334,
4469 0x5678909876543210,
4470 );
4471 assert_eq_m256i(res, retval);
4472 }
4473
4474 #[simd_test(enable = "avx2")]
4475 unsafe fn test_mm_broadcastss_ps() {
4476 let a = _mm_setr_ps(6.28, 3.14, 0.0, 0.0);
4477 let res = _mm_broadcastss_ps(a);
4478 assert_eq_m128(res, _mm_set1_ps(6.28f32));
4479 }
4480
4481 #[simd_test(enable = "avx2")]
4482 unsafe fn test_mm256_broadcastss_ps() {
4483 let a = _mm_setr_ps(6.28, 3.14, 0.0, 0.0);
4484 let res = _mm256_broadcastss_ps(a);
4485 assert_eq_m256(res, _mm256_set1_ps(6.28f32));
4486 }
4487
4488 #[simd_test(enable = "avx2")]
4489 unsafe fn test_mm_broadcastw_epi16() {
4490 let a = _mm_insert_epi16(_mm_set1_epi16(0x2a), 0x22b, 0);
4491 let res = _mm_broadcastw_epi16(a);
4492 assert_eq_m128i(res, _mm_set1_epi16(0x22b));
4493 }
4494
4495 #[simd_test(enable = "avx2")]
4496 unsafe fn test_mm256_broadcastw_epi16() {
4497 let a = _mm_insert_epi16(_mm_set1_epi16(0x2a), 0x22b, 0);
4498 let res = _mm256_broadcastw_epi16(a);
4499 assert_eq_m256i(res, _mm256_set1_epi16(0x22b));
4500 }
4501
4502 #[simd_test(enable = "avx2")]
4503 unsafe fn test_mm256_cmpeq_epi8() {
4504 #[rustfmt::skip]
4505 let a = _mm256_setr_epi8(
4506 0, 1, 2, 3, 4, 5, 6, 7,
4507 8, 9, 10, 11, 12, 13, 14, 15,
4508 16, 17, 18, 19, 20, 21, 22, 23,
4509 24, 25, 26, 27, 28, 29, 30, 31,
4510 );
4511 #[rustfmt::skip]
4512 let b = _mm256_setr_epi8(
4513 31, 30, 2, 28, 27, 26, 25, 24,
4514 23, 22, 21, 20, 19, 18, 17, 16,
4515 15, 14, 13, 12, 11, 10, 9, 8,
4516 7, 6, 5, 4, 3, 2, 1, 0,
4517 );
4518 let r = _mm256_cmpeq_epi8(a, b);
4519 assert_eq_m256i(r, _mm256_insert_epi8(_mm256_set1_epi8(0), !0, 2));
4520 }
4521
4522 #[simd_test(enable = "avx2")]
4523 unsafe fn test_mm256_cmpeq_epi16() {
4524 #[rustfmt::skip]
4525 let a = _mm256_setr_epi16(
4526 0, 1, 2, 3, 4, 5, 6, 7,
4527 8, 9, 10, 11, 12, 13, 14, 15,
4528 );
4529 #[rustfmt::skip]
4530 let b = _mm256_setr_epi16(
4531 15, 14, 2, 12, 11, 10, 9, 8,
4532 7, 6, 5, 4, 3, 2, 1, 0,
4533 );
4534 let r = _mm256_cmpeq_epi16(a, b);
4535 assert_eq_m256i(r, _mm256_insert_epi16(_mm256_set1_epi16(0), !0, 2));
4536 }
4537
4538 #[simd_test(enable = "avx2")]
4539 unsafe fn test_mm256_cmpeq_epi32() {
4540 let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
4541 let b = _mm256_setr_epi32(7, 6, 2, 4, 3, 2, 1, 0);
4542 let r = _mm256_cmpeq_epi32(a, b);
4543 let e = _mm256_set1_epi32(0);
4544 let e = _mm256_insert_epi32(e, !0, 2);
4545 assert_eq_m256i(r, e);
4546 }
4547
4548 #[simd_test(enable = "avx2")]
4549 unsafe fn test_mm256_cmpeq_epi64() {
4550 let a = _mm256_setr_epi64x(0, 1, 2, 3);
4551 let b = _mm256_setr_epi64x(3, 2, 2, 0);
4552 let r = _mm256_cmpeq_epi64(a, b);
4553 assert_eq_m256i(r, _mm256_insert_epi64(_mm256_set1_epi64x(0), !0, 2));
4554 }
4555
4556 #[simd_test(enable = "avx2")]
4557 unsafe fn test_mm256_cmpgt_epi8() {
4558 let a = _mm256_insert_epi8(_mm256_set1_epi8(0), 5, 0);
4559 let b = _mm256_set1_epi8(0);
4560 let r = _mm256_cmpgt_epi8(a, b);
4561 assert_eq_m256i(r, _mm256_insert_epi8(_mm256_set1_epi8(0), !0, 0));
4562 }
4563
4564 #[simd_test(enable = "avx2")]
4565 unsafe fn test_mm256_cmpgt_epi16() {
4566 let a = _mm256_insert_epi16(_mm256_set1_epi16(0), 5, 0);
4567 let b = _mm256_set1_epi16(0);
4568 let r = _mm256_cmpgt_epi16(a, b);
4569 assert_eq_m256i(r, _mm256_insert_epi16(_mm256_set1_epi16(0), !0, 0));
4570 }
4571
4572 #[simd_test(enable = "avx2")]
4573 unsafe fn test_mm256_cmpgt_epi32() {
4574 let a = _mm256_insert_epi32(_mm256_set1_epi32(0), 5, 0);
4575 let b = _mm256_set1_epi32(0);
4576 let r = _mm256_cmpgt_epi32(a, b);
4577 assert_eq_m256i(r, _mm256_insert_epi32(_mm256_set1_epi32(0), !0, 0));
4578 }
4579
4580 #[simd_test(enable = "avx2")]
4581 unsafe fn test_mm256_cmpgt_epi64() {
4582 let a = _mm256_insert_epi64(_mm256_set1_epi64x(0), 5, 0);
4583 let b = _mm256_set1_epi64x(0);
4584 let r = _mm256_cmpgt_epi64(a, b);
4585 assert_eq_m256i(r, _mm256_insert_epi64(_mm256_set1_epi64x(0), !0, 0));
4586 }
4587
4588 #[simd_test(enable = "avx2")]
4589 unsafe fn test_mm256_cvtepi8_epi16() {
4590 #[rustfmt::skip]
4591 let a = _mm_setr_epi8(
4592 0, 0, -1, 1, -2, 2, -3, 3,
4593 -4, 4, -5, 5, -6, 6, -7, 7,
4594 );
4595 #[rustfmt::skip]
4596 let r = _mm256_setr_epi16(
4597 0, 0, -1, 1, -2, 2, -3, 3,
4598 -4, 4, -5, 5, -6, 6, -7, 7,
4599 );
4600 assert_eq_m256i(r, _mm256_cvtepi8_epi16(a));
4601 }
4602
4603 #[simd_test(enable = "avx2")]
4604 unsafe fn test_mm256_cvtepi8_epi32() {
4605 #[rustfmt::skip]
4606 let a = _mm_setr_epi8(
4607 0, 0, -1, 1, -2, 2, -3, 3,
4608 -4, 4, -5, 5, -6, 6, -7, 7,
4609 );
4610 let r = _mm256_setr_epi32(0, 0, -1, 1, -2, 2, -3, 3);
4611 assert_eq_m256i(r, _mm256_cvtepi8_epi32(a));
4612 }
4613
4614 #[simd_test(enable = "avx2")]
4615 unsafe fn test_mm256_cvtepi8_epi64() {
4616 #[rustfmt::skip]
4617 let a = _mm_setr_epi8(
4618 0, 0, -1, 1, -2, 2, -3, 3,
4619 -4, 4, -5, 5, -6, 6, -7, 7,
4620 );
4621 let r = _mm256_setr_epi64x(0, 0, -1, 1);
4622 assert_eq_m256i(r, _mm256_cvtepi8_epi64(a));
4623 }
4624
4625 #[simd_test(enable = "avx2")]
4626 unsafe fn test_mm256_cvtepi16_epi32() {
4627 let a = _mm_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3);
4628 let r = _mm256_setr_epi32(0, 0, -1, 1, -2, 2, -3, 3);
4629 assert_eq_m256i(r, _mm256_cvtepi16_epi32(a));
4630 }
4631
4632 #[simd_test(enable = "avx2")]
4633 unsafe fn test_mm256_cvtepi16_epi64() {
4634 let a = _mm_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3);
4635 let r = _mm256_setr_epi64x(0, 0, -1, 1);
4636 assert_eq_m256i(r, _mm256_cvtepi16_epi64(a));
4637 }
4638
4639 #[simd_test(enable = "avx2")]
4640 unsafe fn test_mm256_cvtepi32_epi64() {
4641 let a = _mm_setr_epi32(0, 0, -1, 1);
4642 let r = _mm256_setr_epi64x(0, 0, -1, 1);
4643 assert_eq_m256i(r, _mm256_cvtepi32_epi64(a));
4644 }
4645
4646 #[simd_test(enable = "avx2")]
4647 unsafe fn test_mm256_cvtepu16_epi32() {
4648 let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4649 let r = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
4650 assert_eq_m256i(r, _mm256_cvtepu16_epi32(a));
4651 }
4652
4653 #[simd_test(enable = "avx2")]
4654 unsafe fn test_mm256_cvtepu16_epi64() {
4655 let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4656 let r = _mm256_setr_epi64x(0, 1, 2, 3);
4657 assert_eq_m256i(r, _mm256_cvtepu16_epi64(a));
4658 }
4659
4660 #[simd_test(enable = "avx2")]
4661 unsafe fn test_mm256_cvtepu32_epi64() {
4662 let a = _mm_setr_epi32(0, 1, 2, 3);
4663 let r = _mm256_setr_epi64x(0, 1, 2, 3);
4664 assert_eq_m256i(r, _mm256_cvtepu32_epi64(a));
4665 }
4666
4667 #[simd_test(enable = "avx2")]
4668 unsafe fn test_mm256_cvtepu8_epi16() {
4669 #[rustfmt::skip]
4670 let a = _mm_setr_epi8(
4671 0, 1, 2, 3, 4, 5, 6, 7,
4672 8, 9, 10, 11, 12, 13, 14, 15,
4673 );
4674 #[rustfmt::skip]
4675 let r = _mm256_setr_epi16(
4676 0, 1, 2, 3, 4, 5, 6, 7,
4677 8, 9, 10, 11, 12, 13, 14, 15,
4678 );
4679 assert_eq_m256i(r, _mm256_cvtepu8_epi16(a));
4680 }
4681
4682 #[simd_test(enable = "avx2")]
4683 unsafe fn test_mm256_cvtepu8_epi32() {
4684 #[rustfmt::skip]
4685 let a = _mm_setr_epi8(
4686 0, 1, 2, 3, 4, 5, 6, 7,
4687 8, 9, 10, 11, 12, 13, 14, 15,
4688 );
4689 let r = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
4690 assert_eq_m256i(r, _mm256_cvtepu8_epi32(a));
4691 }
4692
4693 #[simd_test(enable = "avx2")]
4694 unsafe fn test_mm256_cvtepu8_epi64() {
4695 #[rustfmt::skip]
4696 let a = _mm_setr_epi8(
4697 0, 1, 2, 3, 4, 5, 6, 7,
4698 8, 9, 10, 11, 12, 13, 14, 15,
4699 );
4700 let r = _mm256_setr_epi64x(0, 1, 2, 3);
4701 assert_eq_m256i(r, _mm256_cvtepu8_epi64(a));
4702 }
4703
4704 #[simd_test(enable = "avx2")]
4705 unsafe fn test_mm256_extracti128_si256() {
4706 let a = _mm256_setr_epi64x(1, 2, 3, 4);
4707 let r = _mm256_extracti128_si256(a, 0b01);
4708 let e = _mm_setr_epi64x(3, 4);
4709 assert_eq_m128i(r, e);
4710 }
4711
4712 #[simd_test(enable = "avx2")]
4713 unsafe fn test_mm256_hadd_epi16() {
4714 let a = _mm256_set1_epi16(2);
4715 let b = _mm256_set1_epi16(4);
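// Horizontal sums: pairs from a give 2 + 2 == 4, pairs from b give 4 + 4 == 8, grouped per 128-bit lane.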
4716 let r = _mm256_hadd_epi16(a, b);
4717 let e = _mm256_setr_epi16(4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8);
4718 assert_eq_m256i(r, e);
4719 }
4720
4721 #[simd_test(enable = "avx2")]
4722 unsafe fn test_mm256_hadd_epi32() {
4723 let a = _mm256_set1_epi32(2);
4724 let b = _mm256_set1_epi32(4);
4725 let r = _mm256_hadd_epi32(a, b);
4726 let e = _mm256_setr_epi32(4, 4, 8, 8, 4, 4, 8, 8);
4727 assert_eq_m256i(r, e);
4728 }
4729
4730 #[simd_test(enable = "avx2")]
4731 unsafe fn test_mm256_hadds_epi16() {
4732 let a = _mm256_set1_epi16(2);
4733 let a = _mm256_insert_epi16(a, 0x7fff, 0);
4734 let a = _mm256_insert_epi16(a, 1, 1);
4735 let b = _mm256_set1_epi16(4);
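// The first horizontal sum, 0x7fff + 1, saturates to 0x7fff.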
4736 let r = _mm256_hadds_epi16(a, b);
4737 #[rustfmt::skip]
4738 let e = _mm256_setr_epi16(
4739 0x7FFF, 4, 4, 4, 8, 8, 8, 8,
4740 4, 4, 4, 4, 8, 8, 8, 8,
4741 );
4742 assert_eq_m256i(r, e);
4743 }
4744
4745 #[simd_test(enable = "avx2")]
4746 unsafe fn test_mm256_hsub_epi16() {
4747 let a = _mm256_set1_epi16(2);
4748 let b = _mm256_set1_epi16(4);
4749 let r = _mm256_hsub_epi16(a, b);
4750 let e = _mm256_set1_epi16(0);
4751 assert_eq_m256i(r, e);
4752 }
4753
4754 #[simd_test(enable = "avx2")]
4755 unsafe fn test_mm256_hsub_epi32() {
4756 let a = _mm256_set1_epi32(2);
4757 let b = _mm256_set1_epi32(4);
4758 let r = _mm256_hsub_epi32(a, b);
4759 let e = _mm256_set1_epi32(0);
4760 assert_eq_m256i(r, e);
4761 }
4762
4763 #[simd_test(enable = "avx2")]
4764 unsafe fn test_mm256_hsubs_epi16() {
4765 let a = _mm256_set1_epi16(2);
4766 let a = _mm256_insert_epi16(a, 0x7fff, 0);
4767 let a = _mm256_insert_epi16(a, -1, 1);
4768 let b = _mm256_set1_epi16(4);
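// The first horizontal difference, 0x7fff - (-1), saturates to 0x7fff.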
4769 let r = _mm256_hsubs_epi16(a, b);
4770 let e = _mm256_insert_epi16(_mm256_set1_epi16(0), 0x7FFF, 0);
4771 assert_eq_m256i(r, e);
4772 }
4773
4774 #[simd_test(enable = "avx2")]
4775 unsafe fn test_mm256_madd_epi16() {
4776 let a = _mm256_set1_epi16(2);
4777 let b = _mm256_set1_epi16(4);
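// Adjacent 16-bit products are summed into 32-bit lanes: 2 * 4 + 2 * 4 == 16.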
4778 let r = _mm256_madd_epi16(a, b);
4779 let e = _mm256_set1_epi32(16);
4780 assert_eq_m256i(r, e);
4781 }
4782
4783 #[simd_test(enable = "avx2")]
4784 unsafe fn test_mm256_inserti128_si256() {
4785 let a = _mm256_setr_epi64x(1, 2, 3, 4);
4786 let b = _mm_setr_epi64x(7, 8);
4787 let r = _mm256_inserti128_si256(a, b, 0b01);
4788 let e = _mm256_setr_epi64x(1, 2, 7, 8);
4789 assert_eq_m256i(r, e);
4790 }
4791
4792 #[simd_test(enable = "avx2")]
4793 unsafe fn test_mm256_maddubs_epi16() {
4794 let a = _mm256_set1_epi8(2);
4795 let b = _mm256_set1_epi8(4);
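// Unsigned bytes from a times signed bytes from b, adjacent pairs summed: 2 * 4 + 2 * 4 == 16.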
4796 let r = _mm256_maddubs_epi16(a, b);
4797 let e = _mm256_set1_epi16(16);
4798 assert_eq_m256i(r, e);
4799 }
4800
4801 #[simd_test(enable = "avx2")]
4802 unsafe fn test_mm_maskload_epi32() {
4803 let nums = [1, 2, 3, 4];
4804 let a = &nums as *const i32;
4805 let mask = _mm_setr_epi32(-1, 0, 0, -1);
4806 let r = _mm_maskload_epi32(a, mask);
4807 let e = _mm_setr_epi32(1, 0, 0, 4);
4808 assert_eq_m128i(r, e);
4809 }
4810
4811 #[simd_test(enable = "avx2")]
4812 unsafe fn test_mm256_maskload_epi32() {
4813 let nums = [1, 2, 3, 4, 5, 6, 7, 8];
4814 let a = &nums as *const i32;
4815 let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0);
4816 let r = _mm256_maskload_epi32(a, mask);
4817 let e = _mm256_setr_epi32(1, 0, 0, 4, 0, 6, 7, 0);
4818 assert_eq_m256i(r, e);
4819 }
4820
4821 #[simd_test(enable = "avx2")]
4822 unsafe fn test_mm_maskload_epi64() {
4823 let nums = [1_i64, 2_i64];
4824 let a = &nums as *const i64;
4825 let mask = _mm_setr_epi64x(0, -1);
4826 let r = _mm_maskload_epi64(a, mask);
4827 let e = _mm_setr_epi64x(0, 2);
4828 assert_eq_m128i(r, e);
4829 }
4830
4831 #[simd_test(enable = "avx2")]
4832 unsafe fn test_mm256_maskload_epi64() {
4833 let nums = [1_i64, 2_i64, 3_i64, 4_i64];
4834 let a = &nums as *const i64;
4835 let mask = _mm256_setr_epi64x(0, -1, -1, 0);
4836 let r = _mm256_maskload_epi64(a, mask);
4837 let e = _mm256_setr_epi64x(0, 2, 3, 0);
4838 assert_eq_m256i(r, e);
4839 }
4840
4841 #[simd_test(enable = "avx2")]
4842 unsafe fn test_mm_maskstore_epi32() {
4843 let a = _mm_setr_epi32(1, 2, 3, 4);
4844 let mut arr = [-1, -1, -1, -1];
4845 let mask = _mm_setr_epi32(-1, 0, 0, -1);
4846 _mm_maskstore_epi32(arr.as_mut_ptr(), mask, a);
4847 let e = [1, -1, -1, 4];
4848 assert_eq!(arr, e);
4849 }
4850
4851 #[simd_test(enable = "avx2")]
4852 unsafe fn test_mm256_maskstore_epi32() {
4853 let a = _mm256_setr_epi32(1, 0x6d726f, 3, 42, 0x777161, 6, 7, 8);
4854 let mut arr = [-1, -1, -1, 0x776173, -1, 0x68657265, -1, -1];
4855 let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0);
4856 _mm256_maskstore_epi32(arr.as_mut_ptr(), mask, a);
4857 let e = [1, -1, -1, 42, -1, 6, 7, -1];
4858 assert_eq!(arr, e);
4859 }
4860
4861 #[simd_test(enable = "avx2")]
4862 unsafe fn test_mm_maskstore_epi64() {
4863 let a = _mm_setr_epi64x(1_i64, 2_i64);
4864 let mut arr = [-1_i64, -1_i64];
4865 let mask = _mm_setr_epi64x(0, -1);
4866 _mm_maskstore_epi64(arr.as_mut_ptr(), mask, a);
4867 let e = [-1, 2];
4868 assert_eq!(arr, e);
4869 }
4870
4871 #[simd_test(enable = "avx2")]
4872 unsafe fn test_mm256_maskstore_epi64() {
4873 let a = _mm256_setr_epi64x(1_i64, 2_i64, 3_i64, 4_i64);
4874 let mut arr = [-1_i64, -1_i64, -1_i64, -1_i64];
4875 let mask = _mm256_setr_epi64x(0, -1, -1, 0);
4876 _mm256_maskstore_epi64(arr.as_mut_ptr(), mask, a);
4877 let e = [-1, 2, 3, -1];
4878 assert_eq!(arr, e);
4879 }
4880
4881 #[simd_test(enable = "avx2")]
4882 unsafe fn test_mm256_max_epi16() {
4883 let a = _mm256_set1_epi16(2);
4884 let b = _mm256_set1_epi16(4);
4885 let r = _mm256_max_epi16(a, b);
4886 assert_eq_m256i(r, b);
4887 }
4888
4889 #[simd_test(enable = "avx2")]
4890 unsafe fn test_mm256_max_epi32() {
4891 let a = _mm256_set1_epi32(2);
4892 let b = _mm256_set1_epi32(4);
4893 let r = _mm256_max_epi32(a, b);
4894 assert_eq_m256i(r, b);
4895 }
4896
4897 #[simd_test(enable = "avx2")]
4898 unsafe fn test_mm256_max_epi8() {
4899 let a = _mm256_set1_epi8(2);
4900 let b = _mm256_set1_epi8(4);
4901 let r = _mm256_max_epi8(a, b);
4902 assert_eq_m256i(r, b);
4903 }
4904
4905 #[simd_test(enable = "avx2")]
4906 unsafe fn test_mm256_max_epu16() {
4907 let a = _mm256_set1_epi16(2);
4908 let b = _mm256_set1_epi16(4);
4909 let r = _mm256_max_epu16(a, b);
4910 assert_eq_m256i(r, b);
4911 }
4912
4913 #[simd_test(enable = "avx2")]
4914 unsafe fn test_mm256_max_epu32() {
4915 let a = _mm256_set1_epi32(2);
4916 let b = _mm256_set1_epi32(4);
4917 let r = _mm256_max_epu32(a, b);
4918 assert_eq_m256i(r, b);
4919 }
4920
4921 #[simd_test(enable = "avx2")]
4922 unsafe fn test_mm256_max_epu8() {
4923 let a = _mm256_set1_epi8(2);
4924 let b = _mm256_set1_epi8(4);
4925 let r = _mm256_max_epu8(a, b);
4926 assert_eq_m256i(r, b);
4927 }
4928
4929 #[simd_test(enable = "avx2")]
4930 unsafe fn test_mm256_min_epi16() {
4931 let a = _mm256_set1_epi16(2);
4932 let b = _mm256_set1_epi16(4);
4933 let r = _mm256_min_epi16(a, b);
4934 assert_eq_m256i(r, a);
4935 }
4936
4937 #[simd_test(enable = "avx2")]
4938 unsafe fn test_mm256_min_epi32() {
4939 let a = _mm256_set1_epi32(2);
4940 let b = _mm256_set1_epi32(4);
4941 let r = _mm256_min_epi32(a, b);
4942 assert_eq_m256i(r, a);
4943 }
4944
4945 #[simd_test(enable = "avx2")]
4946 unsafe fn test_mm256_min_epi8() {
4947 let a = _mm256_set1_epi8(2);
4948 let b = _mm256_set1_epi8(4);
4949 let r = _mm256_min_epi8(a, b);
4950 assert_eq_m256i(r, a);
4951 }
4952
4953 #[simd_test(enable = "avx2")]
4954 unsafe fn test_mm256_min_epu16() {
4955 let a = _mm256_set1_epi16(2);
4956 let b = _mm256_set1_epi16(4);
4957 let r = _mm256_min_epu16(a, b);
4958 assert_eq_m256i(r, a);
4959 }
4960
4961 #[simd_test(enable = "avx2")]
4962 unsafe fn test_mm256_min_epu32() {
4963 let a = _mm256_set1_epi32(2);
4964 let b = _mm256_set1_epi32(4);
4965 let r = _mm256_min_epu32(a, b);
4966 assert_eq_m256i(r, a);
4967 }
4968
4969 #[simd_test(enable = "avx2")]
4970 unsafe fn test_mm256_min_epu8() {
4971 let a = _mm256_set1_epi8(2);
4972 let b = _mm256_set1_epi8(4);
4973 let r = _mm256_min_epu8(a, b);
4974 assert_eq_m256i(r, a);
4975 }
4976
4977 #[simd_test(enable = "avx2")]
4978 unsafe fn test_mm256_movemask_epi8() {
4979 let a = _mm256_set1_epi8(-1);
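// Every byte has its sign bit set, so the 32-bit mask is all ones, i.e. -1.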
4980 let r = _mm256_movemask_epi8(a);
4981 let e = -1;
4982 assert_eq!(r, e);
4983 }
4984
4985 #[simd_test(enable = "avx2")]
4986 unsafe fn test_mm256_mpsadbw_epu8() {
4987 let a = _mm256_set1_epi8(2);
4988 let b = _mm256_set1_epi8(4);
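// Each result sums four absolute byte differences: 4 * |2 - 4| == 8.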
4989 let r = _mm256_mpsadbw_epu8(a, b, 0);
4990 let e = _mm256_set1_epi16(8);
4991 assert_eq_m256i(r, e);
4992 }
4993
4994 #[simd_test(enable = "avx2")]
4995 unsafe fn test_mm256_mul_epi32() {
4996 let a = _mm256_setr_epi32(0, 0, 0, 0, 2, 2, 2, 2);
4997 let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
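// Only the even-indexed 32-bit elements are multiplied: 0*1, 0*3, 2*5, 2*7.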
4998 let r = _mm256_mul_epi32(a, b);
4999 let e = _mm256_setr_epi64x(0, 0, 10, 14);
5000 assert_eq_m256i(r, e);
5001 }
5002
5003 #[simd_test(enable = "avx2")]
5004 unsafe fn test_mm256_mul_epu32() {
5005 let a = _mm256_setr_epi32(0, 0, 0, 0, 2, 2, 2, 2);
5006 let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
5007 let r = _mm256_mul_epu32(a, b);
5008 let e = _mm256_setr_epi64x(0, 0, 10, 14);
5009 assert_eq_m256i(r, e);
5010 }
5011
5012 #[simd_test(enable = "avx2")]
5013 unsafe fn test_mm256_mulhi_epi16() {
5014 let a = _mm256_set1_epi16(6535);
5015 let b = _mm256_set1_epi16(6535);
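// 6535 * 6535 == 42_706_225; its high 16 bits are 42_706_225 >> 16 == 651.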
5016 let r = _mm256_mulhi_epi16(a, b);
5017 let e = _mm256_set1_epi16(651);
5018 assert_eq_m256i(r, e);
5019 }
5020
5021 #[simd_test(enable = "avx2")]
5022 unsafe fn test_mm256_mulhi_epu16() {
5023 let a = _mm256_set1_epi16(6535);
5024 let b = _mm256_set1_epi16(6535);
5025 let r = _mm256_mulhi_epu16(a, b);
5026 let e = _mm256_set1_epi16(651);
5027 assert_eq_m256i(r, e);
5028 }
5029
5030 #[simd_test(enable = "avx2")]
5031 unsafe fn test_mm256_mullo_epi16() {
5032 let a = _mm256_set1_epi16(2);
5033 let b = _mm256_set1_epi16(4);
5034 let r = _mm256_mullo_epi16(a, b);
5035 let e = _mm256_set1_epi16(8);
5036 assert_eq_m256i(r, e);
5037 }
5038
5039 #[simd_test(enable = "avx2")]
5040 unsafe fn test_mm256_mullo_epi32() {
5041 let a = _mm256_set1_epi32(2);
5042 let b = _mm256_set1_epi32(4);
5043 let r = _mm256_mullo_epi32(a, b);
5044 let e = _mm256_set1_epi32(8);
5045 assert_eq_m256i(r, e);
5046 }
5047
5048 #[simd_test(enable = "avx2")]
5049 unsafe fn test_mm256_mulhrs_epi16() {
5050 let a = _mm256_set1_epi16(2);
5051 let b = _mm256_set1_epi16(4);
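// Fixed-point rounding multiply: (((2 * 4) >> 14) + 1) >> 1 == 0.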
5052 let r = _mm256_mulhrs_epi16(a, b);
5053 let e = _mm256_set1_epi16(0);
5054 assert_eq_m256i(r, e);
5055 }
5056
5057 #[simd_test(enable = "avx2")]
5058 unsafe fn test_mm256_or_si256() {
5059 let a = _mm256_set1_epi8(-1);
5060 let b = _mm256_set1_epi8(0);
5061 let r = _mm256_or_si256(a, b);
5062 assert_eq_m256i(r, a);
5063 }
5064
5065 #[simd_test(enable = "avx2")]
5066 unsafe fn test_mm256_packs_epi16() {
5067 let a = _mm256_set1_epi16(2);
5068 let b = _mm256_set1_epi16(4);
5069 let r = _mm256_packs_epi16(a, b);
5070 #[rustfmt::skip]
5071 let e = _mm256_setr_epi8(
5072 2, 2, 2, 2, 2, 2, 2, 2,
5073 4, 4, 4, 4, 4, 4, 4, 4,
5074 2, 2, 2, 2, 2, 2, 2, 2,
5075 4, 4, 4, 4, 4, 4, 4, 4,
5076 );
5077
5078 assert_eq_m256i(r, e);
5079 }
5080
5081 #[simd_test(enable = "avx2")]
5082 unsafe fn test_mm256_packs_epi32() {
5083 let a = _mm256_set1_epi32(2);
5084 let b = _mm256_set1_epi32(4);
5085 let r = _mm256_packs_epi32(a, b);
5086 let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
5087
5088 assert_eq_m256i(r, e);
5089 }
5090
5091 #[simd_test(enable = "avx2")]
5092 unsafe fn test_mm256_packus_epi16() {
5093 let a = _mm256_set1_epi16(2);
5094 let b = _mm256_set1_epi16(4);
5095 let r = _mm256_packus_epi16(a, b);
5096 #[rustfmt::skip]
5097 let e = _mm256_setr_epi8(
5098 2, 2, 2, 2, 2, 2, 2, 2,
5099 4, 4, 4, 4, 4, 4, 4, 4,
5100 2, 2, 2, 2, 2, 2, 2, 2,
5101 4, 4, 4, 4, 4, 4, 4, 4,
5102 );
5103
5104 assert_eq_m256i(r, e);
5105 }
5106
5107 #[simd_test(enable = "avx2")]
5108 unsafe fn test_mm256_packus_epi32() {
5109 let a = _mm256_set1_epi32(2);
5110 let b = _mm256_set1_epi32(4);
5111 let r = _mm256_packus_epi32(a, b);
5112 let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
5113
5114 assert_eq_m256i(r, e);
5115 }
5116
5117 #[simd_test(enable = "avx2")]
5118 unsafe fn test_mm256_sad_epu8() {
5119 let a = _mm256_set1_epi8(2);
5120 let b = _mm256_set1_epi8(4);
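// Each 64-bit lane sums eight absolute byte differences: 8 * |2 - 4| == 16.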
5121 let r = _mm256_sad_epu8(a, b);
5122 let e = _mm256_set1_epi64x(16);
5123 assert_eq_m256i(r, e);
5124 }
5125
5126 #[simd_test(enable = "avx2")]
5127 unsafe fn test_mm256_shufflehi_epi16() {
5128 #[rustfmt::skip]
5129 let a = _mm256_setr_epi16(
5130 0, 1, 2, 3, 11, 22, 33, 44,
5131 4, 5, 6, 7, 55, 66, 77, 88,
5132 );
5133 #[rustfmt::skip]
5134 let e = _mm256_setr_epi16(
5135 0, 1, 2, 3, 44, 22, 22, 11,
5136 4, 5, 6, 7, 88, 66, 66, 55,
5137 );
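// 0b00_01_01_11 reorders the upper four words of each lane to source indices 3, 1, 1, 0.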
5138 let r = _mm256_shufflehi_epi16(a, 0b00_01_01_11);
5139 assert_eq_m256i(r, e);
5140 }
5141
5142 #[simd_test(enable = "avx2")]
5143 unsafe fn test_mm256_shufflelo_epi16() {
5144 #[rustfmt::skip]
5145 let a = _mm256_setr_epi16(
5146 11, 22, 33, 44, 0, 1, 2, 3,
5147 55, 66, 77, 88, 4, 5, 6, 7,
5148 );
5149 #[rustfmt::skip]
5150 let e = _mm256_setr_epi16(
5151 44, 22, 22, 11, 0, 1, 2, 3,
5152 88, 66, 66, 55, 4, 5, 6, 7,
5153 );
5154 let r = _mm256_shufflelo_epi16(a, 0b00_01_01_11);
5155 assert_eq_m256i(r, e);
5156 }
5157
5158 #[simd_test(enable = "avx2")]
5159 unsafe fn test_mm256_sign_epi16() {
5160 let a = _mm256_set1_epi16(2);
5161 let b = _mm256_set1_epi16(-1);
5162 let r = _mm256_sign_epi16(a, b);
5163 let e = _mm256_set1_epi16(-2);
5164 assert_eq_m256i(r, e);
5165 }
5166
5167 #[simd_test(enable = "avx2")]
5168 unsafe fn test_mm256_sign_epi32() {
5169 let a = _mm256_set1_epi32(2);
5170 let b = _mm256_set1_epi32(-1);
5171 let r = _mm256_sign_epi32(a, b);
5172 let e = _mm256_set1_epi32(-2);
5173 assert_eq_m256i(r, e);
5174 }
5175
5176 #[simd_test(enable = "avx2")]
5177 unsafe fn test_mm256_sign_epi8() {
5178 let a = _mm256_set1_epi8(2);
5179 let b = _mm256_set1_epi8(-1);
5180 let r = _mm256_sign_epi8(a, b);
5181 let e = _mm256_set1_epi8(-2);
5182 assert_eq_m256i(r, e);
5183 }
5184
5185 #[simd_test(enable = "avx2")]
5186 unsafe fn test_mm256_sll_epi16() {
5187 let a = _mm256_set1_epi16(0xFF);
5188 let b = _mm_insert_epi16(_mm_set1_epi16(0), 4, 0);
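// The shift count comes from the low 64 bits of b; 0xff << 4 == 0xff0.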
5189 let r = _mm256_sll_epi16(a, b);
5190 assert_eq_m256i(r, _mm256_set1_epi16(0xFF0));
5191 }
5192
5193 #[simd_test(enable = "avx2")]
5194 unsafe fn test_mm256_sll_epi32() {
5195 let a = _mm256_set1_epi32(0xFFFF);
5196 let b = _mm_insert_epi32(_mm_set1_epi32(0), 4, 0);
5197 let r = _mm256_sll_epi32(a, b);
5198 assert_eq_m256i(r, _mm256_set1_epi32(0xFFFF0));
5199 }
5200
5201 #[simd_test(enable = "avx2")]
5202 unsafe fn test_mm256_sll_epi64() {
5203 let a = _mm256_set1_epi64x(0xFFFFFFFF);
5204 let b = _mm_insert_epi64(_mm_set1_epi64x(0), 4, 0);
5205 let r = _mm256_sll_epi64(a, b);
5206 assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFFF0));
5207 }
5208
5209 #[simd_test(enable = "avx2")]
5210 unsafe fn test_mm256_slli_epi16() {
5211 assert_eq_m256i(
5212 _mm256_slli_epi16(_mm256_set1_epi16(0xFF), 4),
5213 _mm256_set1_epi16(0xFF0),
5214 );
5215 }
5216
5217 #[simd_test(enable = "avx2")]
5218 unsafe fn test_mm256_slli_epi32() {
5219 assert_eq_m256i(
5220 _mm256_slli_epi32(_mm256_set1_epi32(0xFFFF), 4),
5221 _mm256_set1_epi32(0xFFFF0),
5222 );
5223 }
5224
5225 #[simd_test(enable = "avx2")]
5226 unsafe fn test_mm256_slli_epi64() {
5227 assert_eq_m256i(
5228 _mm256_slli_epi64(_mm256_set1_epi64x(0xFFFFFFFF), 4),
5229 _mm256_set1_epi64x(0xFFFFFFFF0),
5230 );
5231 }
5232
5233 #[simd_test(enable = "avx2")]
5234 unsafe fn test_mm256_slli_si256() {
5235 let a = _mm256_set1_epi64x(0xFFFFFFFF);
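// The byte shift applies to each 128-bit lane independently.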
5236 let r = _mm256_slli_si256(a, 3);
5237 assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFFF000000));
5238 }
5239
5240 #[simd_test(enable = "avx2")]
5241 unsafe fn test_mm_sllv_epi32() {
5242 let a = _mm_set1_epi32(2);
5243 let b = _mm_set1_epi32(1);
5244 let r = _mm_sllv_epi32(a, b);
5245 let e = _mm_set1_epi32(4);
5246 assert_eq_m128i(r, e);
5247 }
5248
5249 #[simd_test(enable = "avx2")]
5250 unsafe fn test_mm256_sllv_epi32() {
5251 let a = _mm256_set1_epi32(2);
5252 let b = _mm256_set1_epi32(1);
5253 let r = _mm256_sllv_epi32(a, b);
5254 let e = _mm256_set1_epi32(4);
5255 assert_eq_m256i(r, e);
5256 }
5257
5258 #[simd_test(enable = "avx2")]
5259 unsafe fn test_mm_sllv_epi64() {
5260 let a = _mm_set1_epi64x(2);
5261 let b = _mm_set1_epi64x(1);
5262 let r = _mm_sllv_epi64(a, b);
5263 let e = _mm_set1_epi64x(4);
5264 assert_eq_m128i(r, e);
5265 }
5266
5267 #[simd_test(enable = "avx2")]
5268 unsafe fn test_mm256_sllv_epi64() {
5269 let a = _mm256_set1_epi64x(2);
5270 let b = _mm256_set1_epi64x(1);
5271 let r = _mm256_sllv_epi64(a, b);
5272 let e = _mm256_set1_epi64x(4);
5273 assert_eq_m256i(r, e);
5274 }
5275
5276 #[simd_test(enable = "avx2")]
5277 unsafe fn test_mm256_sra_epi16() {
5278 let a = _mm256_set1_epi16(-1);
5279 let b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
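// The low 64 bits of b give a count of 1; the arithmetic shift keeps the sign, so -1 >> 1 == -1.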
5280 let r = _mm256_sra_epi16(a, b);
5281 assert_eq_m256i(r, _mm256_set1_epi16(-1));
5282 }
5283
5284 #[simd_test(enable = "avx2")]
5285 unsafe fn test_mm256_sra_epi32() {
5286 let a = _mm256_set1_epi32(-1);
5287 let b = _mm_insert_epi32(_mm_set1_epi32(0), 1, 0);
5288 let r = _mm256_sra_epi32(a, b);
5289 assert_eq_m256i(r, _mm256_set1_epi32(-1));
5290 }
5291
5292 #[simd_test(enable = "avx2")]
5293 unsafe fn test_mm256_srai_epi16() {
5294 assert_eq_m256i(
5295 _mm256_srai_epi16(_mm256_set1_epi16(-1), 1),
5296 _mm256_set1_epi16(-1),
5297 );
5298 }
5299
5300 #[simd_test(enable = "avx2")]
5301 unsafe fn test_mm256_srai_epi32() {
5302 assert_eq_m256i(
5303 _mm256_srai_epi32(_mm256_set1_epi32(-1), 1),
5304 _mm256_set1_epi32(-1),
5305 );
5306 }
5307
5308 #[simd_test(enable = "avx2")]
5309 unsafe fn test_mm_srav_epi32() {
5310 let a = _mm_set1_epi32(4);
5311 let count = _mm_set1_epi32(1);
5312 let r = _mm_srav_epi32(a, count);
5313 let e = _mm_set1_epi32(2);
5314 assert_eq_m128i(r, e);
5315 }
5316
5317 #[simd_test(enable = "avx2")]
5318 unsafe fn test_mm256_srav_epi32() {
5319 let a = _mm256_set1_epi32(4);
5320 let count = _mm256_set1_epi32(1);
5321 let r = _mm256_srav_epi32(a, count);
5322 let e = _mm256_set1_epi32(2);
5323 assert_eq_m256i(r, e);
5324 }
5325
5326 #[simd_test(enable = "avx2")]
5327 unsafe fn test_mm256_srli_si256() {
5328 #[rustfmt::skip]
5329 let a = _mm256_setr_epi8(
5330 1, 2, 3, 4, 5, 6, 7, 8,
5331 9, 10, 11, 12, 13, 14, 15, 16,
5332 17, 18, 19, 20, 21, 22, 23, 24,
5333 25, 26, 27, 28, 29, 30, 31, 32,
5334 );
5335 let r = _mm256_srli_si256(a, 3);
5336 #[rustfmt::skip]
5337 let e = _mm256_setr_epi8(
5338 4, 5, 6, 7, 8, 9, 10, 11,
5339 12, 13, 14, 15, 16, 0, 0, 0,
5340 20, 21, 22, 23, 24, 25, 26, 27,
5341 28, 29, 30, 31, 32, 0, 0, 0,
5342 );
5343 assert_eq_m256i(r, e);
5344 }
5345
5346 #[simd_test(enable = "avx2")]
5347 unsafe fn test_mm256_srl_epi16() {
5348 let a = _mm256_set1_epi16(0xFF);
5349 let b = _mm_insert_epi16(_mm_set1_epi16(0), 4, 0);
5350 let r = _mm256_srl_epi16(a, b);
5351 assert_eq_m256i(r, _mm256_set1_epi16(0xF));
5352 }
5353
5354 #[simd_test(enable = "avx2")]
5355 unsafe fn test_mm256_srl_epi32() {
5356 let a = _mm256_set1_epi32(0xFFFF);
5357 let b = _mm_insert_epi32(_mm_set1_epi32(0), 4, 0);
5358 let r = _mm256_srl_epi32(a, b);
5359 assert_eq_m256i(r, _mm256_set1_epi32(0xFFF));
5360 }
5361
5362 #[simd_test(enable = "avx2")]
5363 unsafe fn test_mm256_srl_epi64() {
5364 let a = _mm256_set1_epi64x(0xFFFFFFFF);
5365 let b = _mm_setr_epi64x(4, 0);
5366 let r = _mm256_srl_epi64(a, b);
5367 assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFF));
5368 }
5369
5370 #[simd_test(enable = "avx2")]
5371 unsafe fn test_mm256_srli_epi16() {
5372 assert_eq_m256i(
5373 _mm256_srli_epi16(_mm256_set1_epi16(0xFF), 4),
5374 _mm256_set1_epi16(0xF),
5375 );
5376 }
5377
5378 #[simd_test(enable = "avx2")]
5379 unsafe fn test_mm256_srli_epi32() {
5380 assert_eq_m256i(
5381 _mm256_srli_epi32(_mm256_set1_epi32(0xFFFF), 4),
5382 _mm256_set1_epi32(0xFFF),
5383 );
5384 }
5385
5386 #[simd_test(enable = "avx2")]
5387 unsafe fn test_mm256_srli_epi64() {
5388 assert_eq_m256i(
5389 _mm256_srli_epi64(_mm256_set1_epi64x(0xFFFFFFFF), 4),
5390 _mm256_set1_epi64x(0xFFFFFFF),
5391 );
5392 }
5393
5394 #[simd_test(enable = "avx2")]
5395 unsafe fn test_mm_srlv_epi32() {
5396 let a = _mm_set1_epi32(2);
5397 let count = _mm_set1_epi32(1);
5398 let r = _mm_srlv_epi32(a, count);
5399 let e = _mm_set1_epi32(1);
5400 assert_eq_m128i(r, e);
5401 }
5402
5403 #[simd_test(enable = "avx2")]
5404 unsafe fn test_mm256_srlv_epi32() {
5405 let a = _mm256_set1_epi32(2);
5406 let count = _mm256_set1_epi32(1);
5407 let r = _mm256_srlv_epi32(a, count);
5408 let e = _mm256_set1_epi32(1);
5409 assert_eq_m256i(r, e);
5410 }
5411
5412 #[simd_test(enable = "avx2")]
5413 unsafe fn test_mm_srlv_epi64() {
5414 let a = _mm_set1_epi64x(2);
5415 let count = _mm_set1_epi64x(1);
5416 let r = _mm_srlv_epi64(a, count);
5417 let e = _mm_set1_epi64x(1);
5418 assert_eq_m128i(r, e);
5419 }
5420
5421 #[simd_test(enable = "avx2")]
5422 unsafe fn test_mm256_srlv_epi64() {
5423 let a = _mm256_set1_epi64x(2);
5424 let count = _mm256_set1_epi64x(1);
5425 let r = _mm256_srlv_epi64(a, count);
5426 let e = _mm256_set1_epi64x(1);
5427 assert_eq_m256i(r, e);
5428 }
5429
5430 #[simd_test(enable = "avx2")]
5431 unsafe fn test_mm256_sub_epi16() {
5432 let a = _mm256_set1_epi16(4);
5433 let b = _mm256_set1_epi16(2);
5434 let r = _mm256_sub_epi16(a, b);
5435 assert_eq_m256i(r, b);
5436 }
5437
5438 #[simd_test(enable = "avx2")]
5439 unsafe fn test_mm256_sub_epi32() {
5440 let a = _mm256_set1_epi32(4);
5441 let b = _mm256_set1_epi32(2);
5442 let r = _mm256_sub_epi32(a, b);
5443 assert_eq_m256i(r, b);
5444 }
5445
5446 #[simd_test(enable = "avx2")]
5447 unsafe fn test_mm256_sub_epi64() {
5448 let a = _mm256_set1_epi64x(4);
5449 let b = _mm256_set1_epi64x(2);
5450 let r = _mm256_sub_epi64(a, b);
5451 assert_eq_m256i(r, b);
5452 }
5453
5454 #[simd_test(enable = "avx2")]
5455 unsafe fn test_mm256_sub_epi8() {
5456 let a = _mm256_set1_epi8(4);
5457 let b = _mm256_set1_epi8(2);
5458 let r = _mm256_sub_epi8(a, b);
5459 assert_eq_m256i(r, b);
5460 }
5461
5462 #[simd_test(enable = "avx2")]
5463 unsafe fn test_mm256_subs_epi16() {
5464 let a = _mm256_set1_epi16(4);
5465 let b = _mm256_set1_epi16(2);
5466 let r = _mm256_subs_epi16(a, b);
5467 assert_eq_m256i(r, b);
5468 }
5469
5470 #[simd_test(enable = "avx2")]
5471 unsafe fn test_mm256_subs_epi8() {
5472 let a = _mm256_set1_epi8(4);
5473 let b = _mm256_set1_epi8(2);
5474 let r = _mm256_subs_epi8(a, b);
5475 assert_eq_m256i(r, b);
5476 }
5477
5478 #[simd_test(enable = "avx2")]
5479 unsafe fn test_mm256_subs_epu16() {
5480 let a = _mm256_set1_epi16(4);
5481 let b = _mm256_set1_epi16(2);
5482 let r = _mm256_subs_epu16(a, b);
5483 assert_eq_m256i(r, b);
5484 }
5485
5486 #[simd_test(enable = "avx2")]
5487 unsafe fn test_mm256_subs_epu8() {
5488 let a = _mm256_set1_epi8(4);
5489 let b = _mm256_set1_epi8(2);
5490 let r = _mm256_subs_epu8(a, b);
5491 assert_eq_m256i(r, b);
5492 }
5493
5494 #[simd_test(enable = "avx2")]
5495 unsafe fn test_mm256_xor_si256() {
5496 let a = _mm256_set1_epi8(5);
5497 let b = _mm256_set1_epi8(3);
5498 let r = _mm256_xor_si256(a, b);
5499 assert_eq_m256i(r, _mm256_set1_epi8(6));
5500 }
5501
5502 #[simd_test(enable = "avx2")]
5503 unsafe fn test_mm256_alignr_epi8() {
5504 #[rustfmt::skip]
5505 let a = _mm256_setr_epi8(
5506 1, 2, 3, 4, 5, 6, 7, 8,
5507 9, 10, 11, 12, 13, 14, 15, 16,
5508 17, 18, 19, 20, 21, 22, 23, 24,
5509 25, 26, 27, 28, 29, 30, 31, 32,
5510 );
5511 #[rustfmt::skip]
5512 let b = _mm256_setr_epi8(
5513 -1, -2, -3, -4, -5, -6, -7, -8,
5514 -9, -10, -11, -12, -13, -14, -15, -16,
5515 -17, -18, -19, -20, -21, -22, -23, -24,
5516 -25, -26, -27, -28, -29, -30, -31, -32,
5517 );
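// A byte offset of 33 exceeds the 32-byte concatenation of each lane pair, so the result is all zeros.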
5518 let r = _mm256_alignr_epi8(a, b, 33);
5519 assert_eq_m256i(r, _mm256_set1_epi8(0));
5520
5521 let r = _mm256_alignr_epi8(a, b, 17);
5522 #[rustfmt::skip]
5523 let expected = _mm256_setr_epi8(
5524 2, 3, 4, 5, 6, 7, 8, 9,
5525 10, 11, 12, 13, 14, 15, 16, 0,
5526 18, 19, 20, 21, 22, 23, 24, 25,
5527 26, 27, 28, 29, 30, 31, 32, 0,
5528 );
5529 assert_eq_m256i(r, expected);
5530
5531 let r = _mm256_alignr_epi8(a, b, 4);
5532 #[rustfmt::skip]
5533 let expected = _mm256_setr_epi8(
5534 -5, -6, -7, -8, -9, -10, -11, -12,
5535 -13, -14, -15, -16, 1, 2, 3, 4,
5536 -21, -22, -23, -24, -25, -26, -27, -28,
5537 -29, -30, -31, -32, 17, 18, 19, 20,
5538 );
5539 assert_eq_m256i(r, expected);
5540
5541 #[rustfmt::skip]
5542 let expected = _mm256_setr_epi8(
5543 -1, -2, -3, -4, -5, -6, -7, -8,
5544 -9, -10, -11, -12, -13, -14, -15, -16,
5545 -17, -18, -19, -20, -21, -22, -23, -24,
5546 -25, -26, -27, -28, -29, -30, -31, -32,
5547 );
5548 let r = _mm256_alignr_epi8(a, b, 16);
5549 assert_eq_m256i(r, expected);
5550
5551 let r = _mm256_alignr_epi8(a, b, 15);
5552 #[rustfmt::skip]
5553 let expected = _mm256_setr_epi8(
5554 -16, 1, 2, 3, 4, 5, 6, 7,
5555 8, 9, 10, 11, 12, 13, 14, 15,
5556 -32, 17, 18, 19, 20, 21, 22, 23,
5557 24, 25, 26, 27, 28, 29, 30, 31,
5558 );
5559 assert_eq_m256i(r, expected);
5560
5561 let r = _mm256_alignr_epi8(a, b, 0);
5562 assert_eq_m256i(r, b);
5563 }
5564
5565 #[simd_test(enable = "avx2")]
5566 unsafe fn test_mm256_shuffle_epi8() {
5567 #[rustfmt::skip]
5568 let a = _mm256_setr_epi8(
5569 1, 2, 3, 4, 5, 6, 7, 8,
5570 9, 10, 11, 12, 13, 14, 15, 16,
5571 17, 18, 19, 20, 21, 22, 23, 24,
5572 25, 26, 27, 28, 29, 30, 31, 32,
5573 );
5574 #[rustfmt::skip]
5575 let b = _mm256_setr_epi8(
5576 4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
5577 12, 5, 5, 10, 4, 1, 8, 0,
5578 4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
5579 12, 5, 5, 10, 4, 1, 8, 0,
5580 );
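// Control bytes with the high bit set (128) zero the result byte; otherwise the low four bits index within the same 128-bit lane.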
5581 #[rustfmt::skip]
5582 let expected = _mm256_setr_epi8(
5583 5, 0, 5, 4, 9, 13, 7, 4,
5584 13, 6, 6, 11, 5, 2, 9, 1,
5585 21, 0, 21, 20, 25, 29, 23, 20,
5586 29, 22, 22, 27, 21, 18, 25, 17,
5587 );
5588 let r = _mm256_shuffle_epi8(a, b);
5589 assert_eq_m256i(r, expected);
5590 }
5591
5592 #[simd_test(enable = "avx2")]
5593 unsafe fn test_mm256_permutevar8x32_epi32() {
5594 let a = _mm256_setr_epi32(100, 200, 300, 400, 500, 600, 700, 800);
5595 let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
5596 let expected = _mm256_setr_epi32(600, 100, 600, 200, 800, 700, 400, 500);
5597 let r = _mm256_permutevar8x32_epi32(a, b);
5598 assert_eq_m256i(r, expected);
5599 }
5600
5601 #[simd_test(enable = "avx2")]
5602 unsafe fn test_mm256_permute4x64_epi64() {
5603 let a = _mm256_setr_epi64x(100, 200, 300, 400);
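// Selector 0b00_01_00_11 picks source elements 3, 0, 1, 0.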
5604 let expected = _mm256_setr_epi64x(400, 100, 200, 100);
5605 let r = _mm256_permute4x64_epi64(a, 0b00010011);
5606 assert_eq_m256i(r, expected);
5607 }
5608
5609 #[simd_test(enable = "avx2")]
5610 unsafe fn test_mm256_permute2x128_si256() {
5611 let a = _mm256_setr_epi64x(100, 200, 500, 600);
5612 let b = _mm256_setr_epi64x(300, 400, 700, 800);
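// Selector 0b00_01_00_11: the result's low half takes b's high 128 bits, the high half takes a's high 128 bits.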
5613 let r = _mm256_permute2x128_si256(a, b, 0b00_01_00_11);
5614 let e = _mm256_setr_epi64x(700, 800, 500, 600);
5615 assert_eq_m256i(r, e);
5616 }
5617
5618 #[simd_test(enable = "avx2")]
5619 unsafe fn test_mm256_permute4x64_pd() {
5620 let a = _mm256_setr_pd(1., 2., 3., 4.);
5621 let r = _mm256_permute4x64_pd(a, 0b00_01_00_11);
5622 let e = _mm256_setr_pd(4., 1., 2., 1.);
5623 assert_eq_m256d(r, e);
5624 }
5625
5626 #[simd_test(enable = "avx2")]
5627 unsafe fn test_mm256_permutevar8x32_ps() {
5628 let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
5629 let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
5630 let r = _mm256_permutevar8x32_ps(a, b);
5631 let e = _mm256_setr_ps(6., 1., 6., 2., 8., 7., 4., 5.);
5632 assert_eq_m256(r, e);
5633 }
5634
5635 #[simd_test(enable = "avx2")]
5636 unsafe fn test_mm_i32gather_epi32() {
5637 let mut arr = [0i32; 128];
5638 for i in 0..128i32 {
5639 arr[i as usize] = i;
5640 }
5641 // A multiplier of 4 is word-addressing
5642 let r = _mm_i32gather_epi32(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48), 4);
5643 assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48));
5644 }
5645
5646 #[simd_test(enable = "avx2")]
5647 unsafe fn test_mm_mask_i32gather_epi32() {
5648 let mut arr = [0i32; 128];
5649 for i in 0..128i32 {
5650 arr[i as usize] = i;
5651 }
5652 // A multiplier of 4 is word-addressing
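// Scalar sketch of the masked gather as used here (offsets are element indices at scale 4):
//   dst[i] = if mask[i] < 0 { arr[offset[i]] } else { src[i] }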
5653 let r = _mm_mask_i32gather_epi32(
5654 _mm_set1_epi32(256),
5655 arr.as_ptr(),
5656 _mm_setr_epi32(0, 16, 64, 96),
5657 _mm_setr_epi32(-1, -1, -1, 0),
5658 4,
5659 );
5660 assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256));
5661 }
5662
5663 #[simd_test(enable = "avx2")]
5664 unsafe fn test_mm256_i32gather_epi32() {
5665 let mut arr = [0i32; 128];
5666 for i in 0..128i32 {
5667 arr[i as usize] = i;
5668 }
5669 // A multiplier of 4 is word-addressing
5670 let r = _mm256_i32gather_epi32(
5671 arr.as_ptr(),
5672 _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4),
5673 4,
5674 );
5675 assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
5676 }
5677
5678 #[simd_test(enable = "avx2")]
5679 unsafe fn test_mm256_mask_i32gather_epi32() {
5680 let mut arr = [0i32; 128];
5681 for i in 0..128i32 {
5682 arr[i as usize] = i;
5683 }
5684 // A multiplier of 4 is word-addressing
5685 let r = _mm256_mask_i32gather_epi32(
5686 _mm256_set1_epi32(256),
5687 arr.as_ptr(),
5688 _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0),
5689 _mm256_setr_epi32(-1, -1, -1, 0, 0, 0, 0, 0),
5690 4,
5691 );
5692 assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 64, 256, 256, 256, 256, 256));
5693 }
5694
5695 #[simd_test(enable = "avx2")]
5696 unsafe fn test_mm_i32gather_ps() {
5697 let mut arr = [0.0f32; 128];
5698 let mut j = 0.0;
5699 for i in 0..128usize {
5700 arr[i] = j;
5701 j += 1.0;
5702 }
5703 // A multiplier of 4 is word-addressing for f32s
5704 let r = _mm_i32gather_ps(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48), 4);
5705 assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0));
5706 }
5707
5708 #[simd_test(enable = "avx2")]
5709 unsafe fn test_mm_mask_i32gather_ps() {
5710 let mut arr = [0.0f32; 128];
5711 let mut j = 0.0;
5712 for i in 0..128usize {
5713 arr[i] = j;
5714 j += 1.0;
5715 }
5716 // A multiplier of 4 is word-addressing for f32s
5717 let r = _mm_mask_i32gather_ps(
5718 _mm_set1_ps(256.0),
5719 arr.as_ptr(),
5720 _mm_setr_epi32(0, 16, 64, 96),
5721 _mm_setr_ps(-1.0, -1.0, -1.0, 0.0),
5722 4,
5723 );
5724 assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0));
5725 }
5726
5727 #[simd_test(enable = "avx2")]
5728 unsafe fn test_mm256_i32gather_ps() {
5729 let mut arr = [0.0f32; 128];
5730 let mut j = 0.0;
5731 for i in 0..128usize {
5732 arr[i] = j;
5733 j += 1.0;
5734 }
5735 // A multiplier of 4 is word-addressing for f32s
5736 let r = _mm256_i32gather_ps(
5737 arr.as_ptr(),
5738 _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4),
5739 4,
5740 );
5741 assert_eq_m256(r, _mm256_setr_ps(0.0, 16.0, 32.0, 48.0, 1.0, 2.0, 3.0, 4.0));
5742 }
5743
5744 #[simd_test(enable = "avx2")]
5745 unsafe fn test_mm256_mask_i32gather_ps() {
5746 let mut arr = [0.0f32; 128];
5747 let mut j = 0.0;
5748 for i in 0..128usize {
5749 arr[i] = j;
5750 j += 1.0;
5751 }
5752 // A multiplier of 4 is word-addressing for f32s
5753 let r = _mm256_mask_i32gather_ps(
5754 _mm256_set1_ps(256.0),
5755 arr.as_ptr(),
5756 _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0),
5757 _mm256_setr_ps(-1.0, -1.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0),
5758 4,
5759 );
5760 assert_eq_m256(
5761 r,
5762 _mm256_setr_ps(0.0, 16.0, 64.0, 256.0, 256.0, 256.0, 256.0, 256.0),
5763 );
5764 }
5765
5766 #[simd_test(enable = "avx2")]
5767 unsafe fn test_mm_i32gather_epi64() {
5768 let mut arr = [0i64; 128];
5769 for i in 0..128i64 {
5770 arr[i as usize] = i;
5771 }
5772 // A multiplier of 8 is word-addressing for i64s
5773 let r = _mm_i32gather_epi64(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0), 8);
5774 assert_eq_m128i(r, _mm_setr_epi64x(0, 16));
5775 }
5776
5777 #[simd_test(enable = "avx2")]
5778 unsafe fn test_mm_mask_i32gather_epi64() {
5779 let mut arr = [0i64; 128];
5780 for i in 0..128i64 {
5781 arr[i as usize] = i;
5782 }
5783 // A multiplier of 8 is word-addressing for i64s
5784 let r = _mm_mask_i32gather_epi64(
5785 _mm_set1_epi64x(256),
5786 arr.as_ptr(),
5787 _mm_setr_epi32(16, 16, 16, 16),
5788 _mm_setr_epi64x(-1, 0),
5789 8,
5790 );
5791 assert_eq_m128i(r, _mm_setr_epi64x(16, 256));
5792 }
5793
5794 #[simd_test(enable = "avx2")]
5795 unsafe fn test_mm256_i32gather_epi64() {
5796 let mut arr = [0i64; 128];
5797 for i in 0..128i64 {
5798 arr[i as usize] = i;
5799 }
5800 // A multiplier of 8 is word-addressing for i64s
5801 let r = _mm256_i32gather_epi64(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48), 8);
5802 assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48));
5803 }
5804
5805 #[simd_test(enable = "avx2")]
5806 unsafe fn test_mm256_mask_i32gather_epi64() {
5807 let mut arr = [0i64; 128];
5808 for i in 0..128i64 {
5809 arr[i as usize] = i;
5810 }
5811 // A multiplier of 8 is word-addressing for i64s
5812 let r = _mm256_mask_i32gather_epi64(
5813 _mm256_set1_epi64x(256),
5814 arr.as_ptr(),
5815 _mm_setr_epi32(0, 16, 64, 96),
5816 _mm256_setr_epi64x(-1, -1, -1, 0),
5817 8,
5818 );
5819 assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256));
5820 }
5821
5822 #[simd_test(enable = "avx2")]
5823 unsafe fn test_mm_i32gather_pd() {
5824 let mut arr = [0.0f64; 128];
5825 let mut j = 0.0;
5826 for i in 0..128usize {
5827 arr[i] = j;
5828 j += 1.0;
5829 }
5830 // A multiplier of 8 is word-addressing for f64s
5831 let r = _mm_i32gather_pd(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0), 8);
5832 assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0));
5833 }
5834
5835 #[simd_test(enable = "avx2")]
5836 unsafe fn test_mm_mask_i32gather_pd() {
5837 let mut arr = [0.0f64; 128];
5838 let mut j = 0.0;
5839 for i in 0..128usize {
5840 arr[i] = j;
5841 j += 1.0;
5842 }
5843 // A multiplier of 8 is word-addressing for f64s
5844 let r = _mm_mask_i32gather_pd(
5845 _mm_set1_pd(256.0),
5846 arr.as_ptr(),
5847 _mm_setr_epi32(16, 16, 16, 16),
5848 _mm_setr_pd(-1.0, 0.0),
5849 8,
5850 );
5851 assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0));
5852 }
5853
5854 #[simd_test(enable = "avx2")]
5855 unsafe fn test_mm256_i32gather_pd() {
5856 let mut arr = [0.0f64; 128];
5857 let mut j = 0.0;
5858 for i in 0..128usize {
5859 arr[i] = j;
5860 j += 1.0;
5861 }
5862 // A multiplier of 8 is word-addressing for f64s
5863 let r = _mm256_i32gather_pd(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48), 8);
5864 assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0));
5865 }
5866
5867 #[simd_test(enable = "avx2")]
5868 unsafe fn test_mm256_mask_i32gather_pd() {
5869 let mut arr = [0.0f64; 128];
5870 let mut j = 0.0;
5871 for i in 0..128usize {
5872 arr[i] = j;
5873 j += 1.0;
5874 }
5875 // A multiplier of 8 is word-addressing for f64s
5876 let r = _mm256_mask_i32gather_pd(
5877 _mm256_set1_pd(256.0),
5878 arr.as_ptr(),
5879 _mm_setr_epi32(0, 16, 64, 96),
5880 _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0),
5881 8,
5882 );
5883 assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0));
5884 }
5885
5886 #[simd_test(enable = "avx2")]
5887 unsafe fn test_mm_i64gather_epi32() {
5888 let mut arr = [0i32; 128];
5889 for i in 0..128i32 {
5890 arr[i as usize] = i;
5891 }
5892 // A multiplier of 4 is word-addressing
5893 let r = _mm_i64gather_epi32(arr.as_ptr(), _mm_setr_epi64x(0, 16), 4);
5894 assert_eq_m128i(r, _mm_setr_epi32(0, 16, 0, 0));
5895 }
5896
5897 #[simd_test(enable = "avx2")]
5898 unsafe fn test_mm_mask_i64gather_epi32() {
5899 let mut arr = [0i32; 128];
5900 for i in 0..128i32 {
5901 arr[i as usize] = i;
5902 }
5903 // A multiplier of 4 is word-addressing
5904 let r = _mm_mask_i64gather_epi32(
5905 _mm_set1_epi32(256),
5906 arr.as_ptr(),
5907 _mm_setr_epi64x(0, 16),
5908 _mm_setr_epi32(-1, 0, -1, 0),
5909 4,
5910 );
5911 assert_eq_m128i(r, _mm_setr_epi32(0, 256, 0, 0));
5912 }
5913
5914 #[simd_test(enable = "avx2")]
5915 unsafe fn test_mm256_i64gather_epi32() {
5916 let mut arr = [0i32; 128];
5917 for i in 0..128i32 {
5918 arr[i as usize] = i;
5919 }
5920 // A multiplier of 4 is word-addressing
5921 let r = _mm256_i64gather_epi32(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48), 4);
5922 assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48));
5923 }
5924
5925 #[simd_test(enable = "avx2")]
5926 unsafe fn test_mm256_mask_i64gather_epi32() {
5927 let mut arr = [0i32; 128];
5928 for i in 0..128i32 {
5929 arr[i as usize] = i;
5930 }
5931 // A multiplier of 4 is word-addressing
5932 let r = _mm256_mask_i64gather_epi32(
5933 _mm_set1_epi32(256),
5934 arr.as_ptr(),
5935 _mm256_setr_epi64x(0, 16, 64, 96),
5936 _mm_setr_epi32(-1, -1, -1, 0),
5937 4,
5938 );
5939 assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256));
5940 }
5941
5942 #[simd_test(enable = "avx2")]
5943 unsafe fn test_mm_i64gather_ps() {
5944 let mut arr = [0.0f32; 128];
5945 let mut j = 0.0;
5946 for i in 0..128usize {
5947 arr[i] = j;
5948 j += 1.0;
5949 }
5950 // A multiplier of 4 is word-addressing for f32s
5951 let r = _mm_i64gather_ps(arr.as_ptr(), _mm_setr_epi64x(0, 16), 4);
5952 assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 0.0, 0.0));
5953 }
5954
5955 #[simd_test(enable = "avx2")]
5956 unsafe fn test_mm_mask_i64gather_ps() {
5957 let mut arr = [0.0f32; 128];
5958 let mut j = 0.0;
5959 for i in 0..128usize {
5960 arr[i] = j;
5961 j += 1.0;
5962 }
5963 // A multiplier of 4 is word-addressing for f32s
5964 let r = _mm_mask_i64gather_ps(
5965 _mm_set1_ps(256.0),
5966 arr.as_ptr(),
5967 _mm_setr_epi64x(0, 16),
5968 _mm_setr_ps(-1.0, 0.0, -1.0, 0.0),
5969 4,
5970 );
5971 assert_eq_m128(r, _mm_setr_ps(0.0, 256.0, 0.0, 0.0));
5972 }
5973
5974 #[simd_test(enable = "avx2")]
5975 unsafe fn test_mm256_i64gather_ps() {
5976 let mut arr = [0.0f32; 128];
5977 let mut j = 0.0;
5978 for i in 0..128usize {
5979 arr[i] = j;
5980 j += 1.0;
5981 }
5982 // A multiplier of 4 is word-addressing for f32s
5983 let r = _mm256_i64gather_ps(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48), 4);
5984 assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0));
5985 }
5986
5987 #[simd_test(enable = "avx2")]
5988 unsafe fn test_mm256_mask_i64gather_ps() {
5989 let mut arr = [0.0f32; 128];
5990 let mut j = 0.0;
5991 for i in 0..128usize {
5992 arr[i] = j;
5993 j += 1.0;
5994 }
5995 // A multiplier of 4 is word-addressing for f32s
5996 let r = _mm256_mask_i64gather_ps(
5997 _mm_set1_ps(256.0),
5998 arr.as_ptr(),
5999 _mm256_setr_epi64x(0, 16, 64, 96),
6000 _mm_setr_ps(-1.0, -1.0, -1.0, 0.0),
6001 4,
6002 );
6003 assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0));
6004 }
6005
6006 #[simd_test(enable = "avx2")]
6007 unsafe fn test_mm_i64gather_epi64() {
6008 let mut arr = [0i64; 128];
6009 for i in 0..128i64 {
6010 arr[i as usize] = i;
6011 }
6012 // A multiplier of 8 is word-addressing for i64s
6013 let r = _mm_i64gather_epi64(arr.as_ptr(), _mm_setr_epi64x(0, 16), 8);
6014 assert_eq_m128i(r, _mm_setr_epi64x(0, 16));
6015 }
6016
6017 #[simd_test(enable = "avx2")]
6018 unsafe fn test_mm_mask_i64gather_epi64() {
6019 let mut arr = [0i64; 128];
6020 for i in 0..128i64 {
6021 arr[i as usize] = i;
6022 }
6023 // A multiplier of 8 is word-addressing for i64s
6024 let r = _mm_mask_i64gather_epi64(
6025 _mm_set1_epi64x(256),
6026 arr.as_ptr(),
6027 _mm_setr_epi64x(16, 16),
6028 _mm_setr_epi64x(-1, 0),
6029 8,
6030 );
6031 assert_eq_m128i(r, _mm_setr_epi64x(16, 256));
6032 }
6033
6034 #[simd_test(enable = "avx2")]
6035 unsafe fn test_mm256_i64gather_epi64() {
6036 let mut arr = [0i64; 128];
6037 for i in 0..128i64 {
6038 arr[i as usize] = i;
6039 }
6040 // A multiplier of 8 is word-addressing for i64s
6041 let r = _mm256_i64gather_epi64(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48), 8);
6042 assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48));
6043 }
6044
6045 #[simd_test(enable = "avx2")]
6046 unsafe fn test_mm256_mask_i64gather_epi64() {
6047 let mut arr = [0i64; 128];
6048 for i in 0..128i64 {
6049 arr[i as usize] = i;
6050 }
6051 // A multiplier of 8 is word-addressing for i64s
6052 let r = _mm256_mask_i64gather_epi64(
6053 _mm256_set1_epi64x(256),
6054 arr.as_ptr(),
6055 _mm256_setr_epi64x(0, 16, 64, 96),
6056 _mm256_setr_epi64x(-1, -1, -1, 0),
6057 8,
6058 );
6059 assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256));
6060 }
6061
6062 #[simd_test(enable = "avx2")]
6063 unsafe fn test_mm_i64gather_pd() {
6064 let mut arr = [0.0f64; 128];
6065 let mut j = 0.0;
6066 for i in 0..128usize {
6067 arr[i] = j;
6068 j += 1.0;
6069 }
6070 // A multiplier of 8 is word-addressing for f64s
6071 let r = _mm_i64gather_pd(arr.as_ptr(), _mm_setr_epi64x(0, 16), 8);
6072 assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0));
6073 }
6074
6075 #[simd_test(enable = "avx2")]
6076 unsafe fn test_mm_mask_i64gather_pd() {
6077 let mut arr = [0.0f64; 128];
6078 let mut j = 0.0;
6079 for i in 0..128usize {
6080 arr[i] = j;
6081 j += 1.0;
6082 }
6083 // A multiplier of 8 is word-addressing for f64s
6084 let r = _mm_mask_i64gather_pd(
6085 _mm_set1_pd(256.0),
6086 arr.as_ptr(),
6087 _mm_setr_epi64x(16, 16),
6088 _mm_setr_pd(-1.0, 0.0),
6089 8,
6090 );
6091 assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0));
6092 }
6093
6094 #[simd_test(enable = "avx2")]
6095 unsafe fn test_mm256_i64gather_pd() {
6096 let mut arr = [0.0f64; 128];
6097 let mut j = 0.0;
6098 for i in 0..128usize {
6099 arr[i] = j;
6100 j += 1.0;
6101 }
6102 // A multiplier of 8 is word-addressing for f64s
6103 let r = _mm256_i64gather_pd(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48), 8);
6104 assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0));
6105 }
6106
6107 #[simd_test(enable = "avx2")]
6108 unsafe fn test_mm256_mask_i64gather_pd() {
6109 let mut arr = [0.0f64; 128];
6110 let mut j = 0.0;
6111 for i in 0..128usize {
6112 arr[i] = j;
6113 j += 1.0;
6114 }
6115 // A multiplier of 8 is word-addressing for f64s
6116 let r = _mm256_mask_i64gather_pd(
6117 _mm256_set1_pd(256.0),
6118 arr.as_ptr(),
6119 _mm256_setr_epi64x(0, 16, 64, 96),
6120 _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0),
6121 8,
6122 );
6123 assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0));
6124 }
6125
6126 #[simd_test(enable = "avx2")]
6127 unsafe fn test_mm256_extract_epi8() {
6128 #[rustfmt::skip]
6129 let a = _mm256_setr_epi8(
6130 -1, 1, 2, 3, 4, 5, 6, 7,
6131 8, 9, 10, 11, 12, 13, 14, 15,
6132 16, 17, 18, 19, 20, 21, 22, 23,
6133 24, 25, 26, 27, 28, 29, 30, 31
6134 );
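// Out-of-range indices are masked: 35 & 31 == 3; the epi16/epi32 extract tests below rely on the same wrapping.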
6135 let r1 = _mm256_extract_epi8(a, 0);
6136 let r2 = _mm256_extract_epi8(a, 35);
6137 assert_eq!(r1, -1);
6138 assert_eq!(r2, 3);
6139 }
6140
6141 #[simd_test(enable = "avx2")]
6142 unsafe fn test_mm256_extract_epi16() {
6143 #[rustfmt::skip]
6144 let a = _mm256_setr_epi16(
6145 -1, 1, 2, 3, 4, 5, 6, 7,
6146 8, 9, 10, 11, 12, 13, 14, 15,
6147 );
6148 let r1 = _mm256_extract_epi16(a, 0);
6149 let r2 = _mm256_extract_epi16(a, 19);
6150 assert_eq!(r1, -1);
6151 assert_eq!(r2, 3);
6152 }
6153
6154 #[simd_test(enable = "avx2")]
6155 unsafe fn test_mm256_extract_epi32() {
6156 let a = _mm256_setr_epi32(-1, 1, 2, 3, 4, 5, 6, 7);
6157 let r1 = _mm256_extract_epi32(a, 0);
6158 let r2 = _mm256_extract_epi32(a, 11);
6159 assert_eq!(r1, -1);
6160 assert_eq!(r2, 3);
6161 }
6162
6163 #[simd_test(enable = "avx2")]
6164 unsafe fn test_mm256_cvtsd_f64() {
6165 let a = _mm256_setr_pd(1., 2., 3., 4.);
6166 let r = _mm256_cvtsd_f64(a);
6167 assert_eq!(r, 1.);
6168 }
6169
6170 #[simd_test(enable = "avx2")]
6171 unsafe fn test_mm256_cvtsi256_si32() {
6172 let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
6173 let r = _mm256_cvtsi256_si32(a);
6174 assert_eq!(r, 1);
6175 }
6176 }