]> git.proxmox.com Git - rustc.git/blame - library/stdarch/crates/core_arch/src/x86/sse41.rs
New upstream version 1.50.0+dfsg1
[rustc.git] / library / stdarch / crates / core_arch / src / x86 / sse41.rs
CommitLineData
0531ce1d
XL
1//! Streaming SIMD Extensions 4.1 (SSE4.1)
2
532ac7d7
XL
3use crate::{
4 core_arch::{simd::*, simd_llvm::*, x86::*},
5 mem::transmute,
6};
0531ce1d
XL
7
8#[cfg(test)]
416331ca 9use stdarch_test::assert_instr;
0531ce1d
XL
10
// SSE4 rounding constants
/// round to nearest
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_NEAREST_INT: i32 = 0x00;
/// round down
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_NEG_INF: i32 = 0x01;
/// round up
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_POS_INF: i32 = 0x02;
/// truncate
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_ZERO: i32 = 0x03;
/// use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_CUR_DIRECTION: i32 = 0x04;
/// do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_RAISE_EXC: i32 = 0x00;
/// suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NO_EXC: i32 = 0x08;
/// round to nearest and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NINT: i32 = 0x00;
/// round down and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_FLOOR: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF;
/// round up and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_CEIL: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF;
/// truncate and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TRUNC: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO;
/// use MXCSR.RC and do not suppress exceptions; see
/// `vendor::_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_RINT: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION;
/// use MXCSR.RC and suppress exceptions; see `vendor::_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NEARBYINT: i32 = _MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION;
0531ce1d
XL
52
/// Blend packed 8-bit integers from `a` and `b` using `mask`
///
/// The high bit of each corresponding mask byte determines the selection.
/// If the high bit is set the element of `b` is selected. The element
/// of `a` is selected otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pblendvb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i {
    // pblendvb picks from its second operand where the mask byte's MSB is set.
    transmute(pblendvb(a.as_i8x16(), b.as_i8x16(), mask.as_i8x16()))
}
67
/// Blend packed 16-bit integers from `a` and `b` using the mask `imm8`.
///
/// The mask bits determine the selection. A clear bit selects the
/// corresponding element of `a`, and a set bit the corresponding
/// element of `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_epi16)
#[inline]
#[target_feature(enable = "sse4.1")]
// Note: LLVM7 prefers the single-precision floating-point domain when possible
// see https://bugs.llvm.org/show_bug.cgi?id=38195
// #[cfg_attr(test, assert_instr(pblendw, imm8 = 0xF0))]
#[cfg_attr(test, assert_instr(blendps, imm8 = 0xF0))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blend_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
    let a = a.as_i16x8();
    let b = b.as_i16x8();
    // The instruction needs a compile-time immediate; `constify_imm8!`
    // dispatches over all 256 possible `imm8` values, invoking `call!`
    // with a literal constant for the one that matches.
    macro_rules! call {
        ($imm8:expr) => {
            pblendw(a, b, $imm8)
        };
    }
    transmute(constify_imm8!(imm8, call))
}
93
/// Blend packed double-precision (64-bit) floating-point elements from `a`
/// and `b` using `mask`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendvpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d {
    // Thin wrapper over the LLVM blendvpd intrinsic; the sign bit of each
    // 64-bit mask lane selects between the corresponding lanes of `a` and `b`.
    blendvpd(a, b, mask)
}
105
/// Blend packed single-precision (32-bit) floating-point elements from `a`
/// and `b` using `mask`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendvps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 {
    // Thin wrapper over the LLVM blendvps intrinsic; the sign bit of each
    // 32-bit mask lane selects between the corresponding lanes of `a` and `b`.
    blendvps(a, b, mask)
}
117
/// Blend packed double-precision (64-bit) floating-point elements from `a`
/// and `b` using control mask `imm2`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
// Note: LLVM7 prefers the single-precision floating-point domain when possible
// see https://bugs.llvm.org/show_bug.cgi?id=38195
// #[cfg_attr(test, assert_instr(blendpd, imm2 = 0b10))]
#[cfg_attr(test, assert_instr(blendps, imm2 = 0b10))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blend_pd(a: __m128d, b: __m128d, imm2: i32) -> __m128d {
    // Only 2 lanes, so only the low 2 bits of the immediate are meaningful;
    // `constify_imm2!` turns the runtime value into a literal constant.
    macro_rules! call {
        ($imm2:expr) => {
            blendpd(a, b, $imm2)
        };
    }
    constify_imm2!(imm2, call)
}
138
/// Blend packed single-precision (32-bit) floating-point elements from `a`
/// and `b` using mask `imm4`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendps, imm4 = 0b0101))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blend_ps(a: __m128, b: __m128, imm4: i32) -> __m128 {
    // 4 lanes => low 4 bits of the immediate; `constify_imm4!` materializes
    // the runtime value as a literal constant for the instruction encoding.
    macro_rules! call {
        ($imm4:expr) => {
            blendps(a, b, $imm4)
        };
    }
    constify_imm4!(imm4, call)
}
156
/// Extracts a single-precision (32-bit) floating-point element from `a`,
/// selected with `imm8`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(extractps, imm8 = 0)
)]
#[rustc_args_required_const(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_extract_ps(a: __m128, imm8: i32) -> i32 {
    // Only 4 lanes, so just the low 2 bits of `imm8` matter. The extracted
    // f32 is returned bit-cast to i32, matching the Intel intrinsic.
    macro_rules! call {
        ($imm2:expr) => {
            transmute(simd_extract::<_, f32>(a, $imm2))
        };
    }
    constify_imm2!(imm8, call)
}
177
/// Extracts an 8-bit integer from `a`, selected with `imm8`. Returns a 32-bit
/// integer containing the zero-extended integer data.
///
/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pextrb, imm8 = 0))]
#[rustc_args_required_const(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_extract_epi8(a: __m128i, imm8: i32) -> i32 {
    // View as unsigned bytes so the `as i32` widening zero-extends.
    let a = a.as_u8x16();
    // 16 lanes => low 4 bits of `imm8` select the lane.
    macro_rules! call {
        ($imm4:expr) => {
            simd_extract::<_, u8>(a, $imm4) as i32
        };
    }
    constify_imm4!(imm8, call)
}
198
/// Extracts a 32-bit integer from `a` selected with `imm8`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(extractps, imm8 = 1)
)]
#[rustc_args_required_const(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_extract_epi32(a: __m128i, imm8: i32) -> i32 {
    let a = a.as_i32x4();
    // 4 lanes => low 2 bits of `imm8` select the lane.
    macro_rules! call {
        ($imm2:expr) => {
            simd_extract::<_, i32>(a, $imm2)
        };
    }
    constify_imm2!(imm8, call)
}
219
/// Select a single value in `a` to store at some position in `b`,
/// Then zero elements according to `imm8`.
///
/// `imm8` specifies which bits from operand `a` will be copied, which bits in
/// the result they will be copied to, and which bits in the result will be
/// cleared. The following assignments are made:
///
/// * Bits `[7:6]` specify the bits to copy from operand `a`:
///     - `00`: Selects bits `[31:0]` from operand `a`.
///     - `01`: Selects bits `[63:32]` from operand `a`.
///     - `10`: Selects bits `[95:64]` from operand `a`.
///     - `11`: Selects bits `[127:96]` from operand `a`.
///
/// * Bits `[5:4]` specify the bits in the result to which the selected bits
/// from operand `a` are copied:
///     - `00`: Copies the selected bits from `a` to result bits `[31:0]`.
///     - `01`: Copies the selected bits from `a` to result bits `[63:32]`.
///     - `10`: Copies the selected bits from `a` to result bits `[95:64]`.
///     - `11`: Copies the selected bits from `a` to result bits `[127:96]`.
///
/// * Bits `[3:0]`: If any of these bits are set, the corresponding result
/// element is cleared.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(insertps, imm8 = 0b1010))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_insert_ps(a: __m128, b: __m128, imm8: i32) -> __m128 {
    // The full 8-bit immediate is meaningful here (select/dest/zero fields),
    // so dispatch over all 256 values.
    macro_rules! call {
        ($imm8:expr) => {
            insertps(a, b, $imm8)
        };
    }
    constify_imm8!(imm8, call)
}
257
/// Returns a copy of `a` with the 8-bit integer from `i` inserted at a
/// location specified by `imm8`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pinsrb, imm8 = 0))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_insert_epi8(a: __m128i, i: i32, imm8: i32) -> __m128i {
    let a = a.as_i8x16();
    // 16 lanes => low 4 bits of `imm8`; `i` is truncated to its low byte,
    // matching the pinsrb instruction.
    macro_rules! call {
        ($imm4:expr) => {
            transmute(simd_insert(a, $imm4, i as i8))
        };
    }
    constify_imm4!(imm8, call)
}
276
/// Returns a copy of `a` with the 32-bit integer from `i` inserted at a
/// location specified by `imm8`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pinsrd, imm8 = 0))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_insert_epi32(a: __m128i, i: i32, imm8: i32) -> __m128i {
    let a = a.as_i32x4();
    // 4 lanes => low 2 bits of `imm8` select the destination lane.
    macro_rules! call {
        ($imm2:expr) => {
            transmute(simd_insert(a, $imm2, i))
        };
    }
    constify_imm2!(imm8, call)
}
295
532ac7d7 296/// Compares packed 8-bit integers in `a` and `b` and returns packed maximum
0531ce1d 297/// values in dst.
83c7162d
XL
298///
299/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8)
0531ce1d
XL
300#[inline]
301#[target_feature(enable = "sse4.1")]
302#[cfg_attr(test, assert_instr(pmaxsb))]
83c7162d 303#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 304pub unsafe fn _mm_max_epi8(a: __m128i, b: __m128i) -> __m128i {
532ac7d7 305 transmute(pmaxsb(a.as_i8x16(), b.as_i8x16()))
0531ce1d
XL
306}
307
532ac7d7 308/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed
0531ce1d 309/// maximum.
83c7162d
XL
310///
311/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16)
0531ce1d
XL
312#[inline]
313#[target_feature(enable = "sse4.1")]
314#[cfg_attr(test, assert_instr(pmaxuw))]
83c7162d 315#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 316pub unsafe fn _mm_max_epu16(a: __m128i, b: __m128i) -> __m128i {
532ac7d7 317 transmute(pmaxuw(a.as_u16x8(), b.as_u16x8()))
0531ce1d
XL
318}
319
532ac7d7 320/// Compares packed 32-bit integers in `a` and `b`, and returns packed maximum
0531ce1d 321/// values.
83c7162d
XL
322///
323/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi32)
0531ce1d
XL
324#[inline]
325#[target_feature(enable = "sse4.1")]
326#[cfg_attr(test, assert_instr(pmaxsd))]
83c7162d 327#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 328pub unsafe fn _mm_max_epi32(a: __m128i, b: __m128i) -> __m128i {
532ac7d7 329 transmute(pmaxsd(a.as_i32x4(), b.as_i32x4()))
0531ce1d
XL
330}
331
532ac7d7 332/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed
0531ce1d 333/// maximum values.
83c7162d
XL
334///
335/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32)
0531ce1d
XL
336#[inline]
337#[target_feature(enable = "sse4.1")]
338#[cfg_attr(test, assert_instr(pmaxud))]
83c7162d 339#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 340pub unsafe fn _mm_max_epu32(a: __m128i, b: __m128i) -> __m128i {
532ac7d7 341 transmute(pmaxud(a.as_u32x4(), b.as_u32x4()))
0531ce1d
XL
342}
343
532ac7d7 344/// Compares packed 8-bit integers in `a` and `b` and returns packed minimum
0531ce1d 345/// values in dst.
83c7162d
XL
346///
347/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8)
0531ce1d
XL
348#[inline]
349#[target_feature(enable = "sse4.1")]
350#[cfg_attr(test, assert_instr(pminsb))]
83c7162d 351#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 352pub unsafe fn _mm_min_epi8(a: __m128i, b: __m128i) -> __m128i {
532ac7d7 353 transmute(pminsb(a.as_i8x16(), b.as_i8x16()))
0531ce1d
XL
354}
355
532ac7d7 356/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed
0531ce1d 357/// minimum.
83c7162d
XL
358///
359/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16)
0531ce1d
XL
360#[inline]
361#[target_feature(enable = "sse4.1")]
362#[cfg_attr(test, assert_instr(pminuw))]
83c7162d 363#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 364pub unsafe fn _mm_min_epu16(a: __m128i, b: __m128i) -> __m128i {
532ac7d7 365 transmute(pminuw(a.as_u16x8(), b.as_u16x8()))
0531ce1d
XL
366}
367
532ac7d7 368/// Compares packed 32-bit integers in `a` and `b`, and returns packed minimum
0531ce1d 369/// values.
83c7162d
XL
370///
371/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi32)
0531ce1d
XL
372#[inline]
373#[target_feature(enable = "sse4.1")]
374#[cfg_attr(test, assert_instr(pminsd))]
83c7162d 375#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 376pub unsafe fn _mm_min_epi32(a: __m128i, b: __m128i) -> __m128i {
532ac7d7 377 transmute(pminsd(a.as_i32x4(), b.as_i32x4()))
0531ce1d
XL
378}
379
532ac7d7 380/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed
0531ce1d 381/// minimum values.
83c7162d
XL
382///
383/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu32)
0531ce1d
XL
384#[inline]
385#[target_feature(enable = "sse4.1")]
386#[cfg_attr(test, assert_instr(pminud))]
83c7162d 387#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 388pub unsafe fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i {
532ac7d7 389 transmute(pminud(a.as_u32x4(), b.as_u32x4()))
0531ce1d
XL
390}
391
532ac7d7 392/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
0531ce1d 393/// using unsigned saturation
83c7162d
XL
394///
395/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_packus_epi32)
0531ce1d
XL
396#[inline]
397#[target_feature(enable = "sse4.1")]
398#[cfg_attr(test, assert_instr(packusdw))]
83c7162d 399#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 400pub unsafe fn _mm_packus_epi32(a: __m128i, b: __m128i) -> __m128i {
532ac7d7 401 transmute(packusdw(a.as_i32x4(), b.as_i32x4()))
0531ce1d
XL
402}
403
532ac7d7 404/// Compares packed 64-bit integers in `a` and `b` for equality
83c7162d
XL
405///
406/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_epi64)
0531ce1d
XL
407#[inline]
408#[target_feature(enable = "sse4.1")]
409#[cfg_attr(test, assert_instr(pcmpeqq))]
83c7162d 410#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 411pub unsafe fn _mm_cmpeq_epi64(a: __m128i, b: __m128i) -> __m128i {
532ac7d7 412 transmute(simd_eq::<_, i64x2>(a.as_i64x2(), b.as_i64x2()))
0531ce1d
XL
413}
414
/// Sign extend packed 8-bit integers in `a` to packed 16-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi8_epi16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi8_epi16(a: __m128i) -> __m128i {
    let a = a.as_i8x16();
    // Keep only the low 8 bytes, then widen; this shuffle+cast pair is the
    // pattern LLVM recognizes as pmovsxbw.
    let a = simd_shuffle8::<_, i8x8>(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
    transmute(simd_cast::<_, i16x8>(a))
}
427
/// Sign extend packed 8-bit integers in `a` to packed 32-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi8_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi8_epi32(a: __m128i) -> __m128i {
    let a = a.as_i8x16();
    // Keep only the low 4 bytes, then sign-extend each to 32 bits.
    let a = simd_shuffle4::<_, i8x4>(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}
440
/// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed
/// 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi8_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi8_epi64(a: __m128i) -> __m128i {
    let a = a.as_i8x16();
    // Keep only the low 2 bytes, then sign-extend each to 64 bits.
    let a = simd_shuffle2::<_, i8x2>(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}
454
/// Sign extend packed 16-bit integers in `a` to packed 32-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi16_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi16_epi32(a: __m128i) -> __m128i {
    let a = a.as_i16x8();
    // Keep only the low 4 words, then sign-extend each to 32 bits.
    let a = simd_shuffle4::<_, i16x4>(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}
467
/// Sign extend packed 16-bit integers in `a` to packed 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi16_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxwq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi16_epi64(a: __m128i) -> __m128i {
    let a = a.as_i16x8();
    // Keep only the low 2 words, then sign-extend each to 64 bits.
    let a = simd_shuffle2::<_, i16x2>(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}
480
/// Sign extend packed 32-bit integers in `a` to packed 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi32_epi64(a: __m128i) -> __m128i {
    let a = a.as_i32x4();
    // Keep only the low 2 dwords, then sign-extend each to 64 bits.
    let a = simd_shuffle2::<_, i32x2>(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}
493
/// Zero-extend packed unsigned 8-bit integers in `a` to packed 16-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu8_epi16(a: __m128i) -> __m128i {
    // Unsigned source type makes the widening cast a zero-extension.
    let a = a.as_u8x16();
    let a = simd_shuffle8::<_, u8x8>(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
    transmute(simd_cast::<_, i16x8>(a))
}
506
/// Zero-extend packed unsigned 8-bit integers in `a` to packed 32-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu8_epi32(a: __m128i) -> __m128i {
    // Unsigned source type makes the widening cast a zero-extension.
    let a = a.as_u8x16();
    let a = simd_shuffle4::<_, u8x4>(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}
519
/// Zero-extend packed unsigned 8-bit integers in `a` to packed 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu8_epi64(a: __m128i) -> __m128i {
    // Unsigned source type makes the widening cast a zero-extension.
    let a = a.as_u8x16();
    let a = simd_shuffle2::<_, u8x2>(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}
532
/// Zero-extend packed unsigned 16-bit integers in `a`
/// to packed 32-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu16_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu16_epi32(a: __m128i) -> __m128i {
    // Unsigned source type makes the widening cast a zero-extension.
    let a = a.as_u16x8();
    let a = simd_shuffle4::<_, u16x4>(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}
546
/// Zero-extend packed unsigned 16-bit integers in `a`
/// to packed 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu16_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxwq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu16_epi64(a: __m128i) -> __m128i {
    // Unsigned source type makes the widening cast a zero-extension.
    let a = a.as_u16x8();
    let a = simd_shuffle2::<_, u16x2>(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}
560
/// Zero-extend packed unsigned 32-bit integers in `a`
/// to packed 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu32_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu32_epi64(a: __m128i) -> __m128i {
    // Unsigned source type makes the widening cast a zero-extension.
    let a = a.as_u32x4();
    let a = simd_shuffle2::<_, u32x2>(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}
574
/// Returns the dot product of two __m128d vectors.
///
/// `imm8[1:0]` is the broadcast mask, and `imm8[5:4]` is the condition mask.
/// If a condition mask bit is zero, the corresponding multiplication is
/// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of
/// the dot product will be stored in the return value component. Otherwise if
/// the broadcast mask bit is zero then the return component will be zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(dppd, imm8 = 0))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_dp_pd(a: __m128d, b: __m128d, imm8: i32) -> __m128d {
    // dppd encodes both masks in one immediate; constify to a literal.
    macro_rules! call {
        ($imm8:expr) => {
            dppd(a, b, $imm8)
        };
    }
    constify_imm8!(imm8, call)
}
597
/// Returns the dot product of two __m128 vectors.
///
/// `imm8[3:0]` is the broadcast mask, and `imm8[7:4]` is the condition mask.
/// If a condition mask bit is zero, the corresponding multiplication is
/// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of
/// the dot product will be stored in the return value component. Otherwise if
/// the broadcast mask bit is zero then the return component will be zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(dpps, imm8 = 0))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_dp_ps(a: __m128, b: __m128, imm8: i32) -> __m128 {
    // dpps encodes both masks in one immediate; constify to a literal.
    macro_rules! call {
        ($imm8:expr) => {
            dpps(a, b, $imm8)
        };
    }
    constify_imm8!(imm8, call)
}
620
/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// down to an integer value, and stores the results as packed double-precision
/// floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_pd(a: __m128d) -> __m128d {
    // Lowered by LLVM to roundpd with the floor rounding mode.
    simd_floor(a)
}
633
/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// down to an integer value, and stores the results as packed single-precision
/// floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_ps(a: __m128) -> __m128 {
    // Lowered by LLVM to roundps with the floor rounding mode.
    simd_floor(a)
}
646
/// Round the lower double-precision (64-bit) floating-point element in `b`
/// down to an integer value, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element of the intrinsic
/// result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_sd(a: __m128d, b: __m128d) -> __m128d {
    // _MM_FROUND_FLOOR = round toward -inf, exceptions not suppressed.
    roundsd(a, b, _MM_FROUND_FLOOR)
}
661
/// Round the lower single-precision (32-bit) floating-point element in `b`
/// down to an integer value, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_ss(a: __m128, b: __m128) -> __m128 {
    // _MM_FROUND_FLOOR = round toward -inf, exceptions not suppressed.
    roundss(a, b, _MM_FROUND_FLOOR)
}
676
/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// up to an integer value, and stores the results as packed double-precision
/// floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_pd(a: __m128d) -> __m128d {
    // Lowered by LLVM to roundpd with the ceil rounding mode.
    simd_ceil(a)
}
689
/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// up to an integer value, and stores the results as packed single-precision
/// floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_ps(a: __m128) -> __m128 {
    // Lowered by LLVM to roundps with the ceil rounding mode.
    simd_ceil(a)
}
702
703/// Round the lower double-precision (64-bit) floating-point element in `b`
704/// up to an integer value, store the result as a double-precision
705/// floating-point element in the lower element of the intrisic result,
532ac7d7 706/// and copies the upper element from `a` to the upper element
0531ce1d 707/// of the intrinsic result.
83c7162d
XL
708///
709/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_sd)
0531ce1d
XL
710#[inline]
711#[target_feature(enable = "sse4.1")]
712#[cfg_attr(test, assert_instr(roundsd))]
83c7162d 713#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
714pub unsafe fn _mm_ceil_sd(a: __m128d, b: __m128d) -> __m128d {
715 roundsd(a, b, _MM_FROUND_CEIL)
716}
717
718/// Round the lower single-precision (32-bit) floating-point element in `b`
719/// up to an integer value, store the result as a single-precision
720/// floating-point element in the lower element of the intrinsic result,
532ac7d7 721/// and copies the upper 3 packed elements from `a` to the upper elements
0531ce1d 722/// of the intrinsic result.
83c7162d
XL
723///
724/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ss)
0531ce1d
XL
725#[inline]
726#[target_feature(enable = "sse4.1")]
727#[cfg_attr(test, assert_instr(roundss))]
83c7162d 728#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
729pub unsafe fn _mm_ceil_ss(a: __m128, b: __m128) -> __m128 {
730 roundss(a, b, _MM_FROUND_CEIL)
731}
732
733/// Round the packed double-precision (64-bit) floating-point elements in `a`
532ac7d7 734/// using the `rounding` parameter, and stores the results as packed
0531ce1d
XL
735/// double-precision floating-point elements.
736/// Rounding is done according to the rounding parameter, which can be one of:
737///
738/// ```
0531ce1d
XL
739/// #[cfg(target_arch = "x86")]
740/// use std::arch::x86::*;
741/// #[cfg(target_arch = "x86_64")]
742/// use std::arch::x86_64::*;
743///
744/// # fn main() {
745/// // round to nearest, and suppress exceptions:
746/// # let _x =
747/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
748/// // round down, and suppress exceptions:
749/// # let _x =
750/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
751/// // round up, and suppress exceptions:
752/// # let _x =
753/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
754/// // truncate, and suppress exceptions:
755/// # let _x =
756/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
757/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
758/// # let _x =
759/// _MM_FROUND_CUR_DIRECTION;
760/// # }
761/// ```
83c7162d
XL
762///
763/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_pd)
0531ce1d
XL
764#[inline]
765#[target_feature(enable = "sse4.1")]
766#[cfg_attr(test, assert_instr(roundpd, rounding = 0))]
767#[rustc_args_required_const(1)]
83c7162d 768#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
769pub unsafe fn _mm_round_pd(a: __m128d, rounding: i32) -> __m128d {
770 macro_rules! call {
83c7162d
XL
771 ($imm4:expr) => {
772 roundpd(a, $imm4)
773 };
0531ce1d
XL
774 }
775 constify_imm4!(rounding, call)
776}
777
778/// Round the packed single-precision (32-bit) floating-point elements in `a`
532ac7d7 779/// using the `rounding` parameter, and stores the results as packed
0531ce1d
XL
780/// single-precision floating-point elements.
781/// Rounding is done according to the rounding parameter, which can be one of:
782///
783/// ```
0531ce1d
XL
784/// #[cfg(target_arch = "x86")]
785/// use std::arch::x86::*;
786/// #[cfg(target_arch = "x86_64")]
787/// use std::arch::x86_64::*;
788///
789/// # fn main() {
790/// // round to nearest, and suppress exceptions:
791/// # let _x =
792/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
793/// // round down, and suppress exceptions:
794/// # let _x =
795/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
796/// // round up, and suppress exceptions:
797/// # let _x =
798/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
799/// // truncate, and suppress exceptions:
800/// # let _x =
801/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
802/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
803/// # let _x =
804/// _MM_FROUND_CUR_DIRECTION;
805/// # }
806/// ```
83c7162d
XL
807///
808/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps)
0531ce1d
XL
809#[inline]
810#[target_feature(enable = "sse4.1")]
811#[cfg_attr(test, assert_instr(roundps, rounding = 0))]
812#[rustc_args_required_const(1)]
83c7162d 813#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
814pub unsafe fn _mm_round_ps(a: __m128, rounding: i32) -> __m128 {
815 macro_rules! call {
83c7162d
XL
816 ($imm4:expr) => {
817 roundps(a, $imm4)
818 };
0531ce1d
XL
819 }
820 constify_imm4!(rounding, call)
821}
822
823/// Round the lower double-precision (64-bit) floating-point element in `b`
824/// using the `rounding` parameter, store the result as a double-precision
825/// floating-point element in the lower element of the intrinsic result,
532ac7d7 826/// and copies the upper element from `a` to the upper element of the intrinsic
0531ce1d
XL
827/// result.
828/// Rounding is done according to the rounding parameter, which can be one of:
829///
830/// ```
0531ce1d
XL
831/// #[cfg(target_arch = "x86")]
832/// use std::arch::x86::*;
833/// #[cfg(target_arch = "x86_64")]
834/// use std::arch::x86_64::*;
835///
836/// # fn main() {
837/// // round to nearest, and suppress exceptions:
838/// # let _x =
839/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
840/// // round down, and suppress exceptions:
841/// # let _x =
842/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
843/// // round up, and suppress exceptions:
844/// # let _x =
845/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
846/// // truncate, and suppress exceptions:
847/// # let _x =
848/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
849/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
850/// # let _x =
851/// _MM_FROUND_CUR_DIRECTION;
852/// # }
853/// ```
83c7162d
XL
854///
855/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_sd)
0531ce1d
XL
856#[inline]
857#[target_feature(enable = "sse4.1")]
858#[cfg_attr(test, assert_instr(roundsd, rounding = 0))]
859#[rustc_args_required_const(2)]
83c7162d 860#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
861pub unsafe fn _mm_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d {
862 macro_rules! call {
83c7162d
XL
863 ($imm4:expr) => {
864 roundsd(a, b, $imm4)
865 };
0531ce1d
XL
866 }
867 constify_imm4!(rounding, call)
868}
869
870/// Round the lower single-precision (32-bit) floating-point element in `b`
871/// using the `rounding` parameter, store the result as a single-precision
872/// floating-point element in the lower element of the intrinsic result,
532ac7d7 873/// and copies the upper 3 packed elements from `a` to the upper elements
0531ce1d
XL
874/// of the instrinsic result.
875/// Rounding is done according to the rounding parameter, which can be one of:
876///
877/// ```
0531ce1d
XL
878/// #[cfg(target_arch = "x86")]
879/// use std::arch::x86::*;
880/// #[cfg(target_arch = "x86_64")]
881/// use std::arch::x86_64::*;
882///
883/// # fn main() {
884/// // round to nearest, and suppress exceptions:
885/// # let _x =
886/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
887/// // round down, and suppress exceptions:
888/// # let _x =
889/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
890/// // round up, and suppress exceptions:
891/// # let _x =
892/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
893/// // truncate, and suppress exceptions:
894/// # let _x =
895/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
896/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
897/// # let _x =
898/// _MM_FROUND_CUR_DIRECTION;
899/// # }
900/// ```
83c7162d
XL
901///
902/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ss)
0531ce1d
XL
903#[inline]
904#[target_feature(enable = "sse4.1")]
905#[cfg_attr(test, assert_instr(roundss, rounding = 0))]
906#[rustc_args_required_const(2)]
83c7162d 907#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
908pub unsafe fn _mm_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 {
909 macro_rules! call {
83c7162d
XL
910 ($imm4:expr) => {
911 roundss(a, b, $imm4)
912 };
0531ce1d
XL
913 }
914 constify_imm4!(rounding, call)
915}
916
917/// Finds the minimum unsigned 16-bit element in the 128-bit __m128i vector,
918/// returning a vector containing its value in its first position, and its
919/// index
920/// in its second position; all other elements are set to zero.
921///
fc512014 922/// This intrinsic corresponds to the `VPHMINPOSUW` / `PHMINPOSUW`
0531ce1d
XL
923/// instruction.
924///
925/// Arguments:
926///
927/// * `a` - A 128-bit vector of type `__m128i`.
928///
929/// Returns:
930///
931/// A 128-bit value where:
932///
933/// * bits `[15:0]` - contain the minimum value found in parameter `a`,
934/// * bits `[18:16]` - contain the index of the minimum value
935/// * remaining bits are set to `0`.
83c7162d
XL
936///
937/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16)
0531ce1d
XL
938#[inline]
939#[target_feature(enable = "sse4.1")]
940#[cfg_attr(test, assert_instr(phminposuw))]
83c7162d 941#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 942pub unsafe fn _mm_minpos_epu16(a: __m128i) -> __m128i {
532ac7d7 943 transmute(phminposuw(a.as_u16x8()))
0531ce1d
XL
944}
945
532ac7d7
XL
946/// Multiplies the low 32-bit integers from each packed 64-bit
947/// element in `a` and `b`, and returns the signed 64-bit result.
83c7162d
XL
948///
949/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_epi32)
0531ce1d
XL
950#[inline]
951#[target_feature(enable = "sse4.1")]
952#[cfg_attr(test, assert_instr(pmuldq))]
83c7162d 953#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 954pub unsafe fn _mm_mul_epi32(a: __m128i, b: __m128i) -> __m128i {
532ac7d7 955 transmute(pmuldq(a.as_i32x4(), b.as_i32x4()))
0531ce1d
XL
956}
957
532ac7d7 958/// Multiplies the packed 32-bit integers in `a` and `b`, producing intermediate
0531ce1d
XL
959/// 64-bit integers, and returns the lowest 32-bit, whatever they might be,
960/// reinterpreted as a signed integer. While `pmulld __m128i::splat(2),
961/// __m128i::splat(2)` returns the obvious `__m128i::splat(4)`, due to wrapping
962/// arithmetic `pmulld __m128i::splat(i32::MAX), __m128i::splat(2)` would
963/// return a negative number.
83c7162d
XL
964///
965/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mullo_epi32)
0531ce1d
XL
966#[inline]
967#[target_feature(enable = "sse4.1")]
968#[cfg_attr(test, assert_instr(pmulld))]
83c7162d 969#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 970pub unsafe fn _mm_mullo_epi32(a: __m128i, b: __m128i) -> __m128i {
532ac7d7 971 transmute(simd_mul(a.as_i32x4(), b.as_i32x4()))
0531ce1d
XL
972}
973
974/// Subtracts 8-bit unsigned integer values and computes the absolute
975/// values of the differences to the corresponding bits in the destination.
976/// Then sums of the absolute differences are returned according to the bit
977/// fields in the immediate operand.
978///
979/// The following algorithm is performed:
980///
981/// ```ignore
982/// i = imm8[2] * 4
983/// j = imm8[1:0] * 4
984/// for k := 0 to 7
985/// d0 = abs(a[i + k + 0] - b[j + 0])
986/// d1 = abs(a[i + k + 1] - b[j + 1])
987/// d2 = abs(a[i + k + 2] - b[j + 2])
988/// d3 = abs(a[i + k + 3] - b[j + 3])
989/// r[k] = d0 + d1 + d2 + d3
990/// ```
991///
992/// Arguments:
993///
994/// * `a` - A 128-bit vector of type `__m128i`.
995/// * `b` - A 128-bit vector of type `__m128i`.
996/// * `imm8` - An 8-bit immediate operand specifying how the absolute
0731742a 997/// differences are to be calculated
0531ce1d
XL
998/// * Bit `[2]` specify the offset for operand `a`
999/// * Bits `[1:0]` specify the offset for operand `b`
1000///
1001/// Returns:
1002///
0731742a
XL
1003/// * A `__m128i` vector containing the sums of the sets of absolute
1004/// differences between both operands.
83c7162d
XL
1005///
1006/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mpsadbw_epu8)
0531ce1d
XL
1007#[inline]
1008#[target_feature(enable = "sse4.1")]
1009#[cfg_attr(test, assert_instr(mpsadbw, imm8 = 0))]
1010#[rustc_args_required_const(2)]
83c7162d 1011#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1012pub unsafe fn _mm_mpsadbw_epu8(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
1013 let a = a.as_u8x16();
1014 let b = b.as_u8x16();
1015 macro_rules! call {
83c7162d
XL
1016 ($imm8:expr) => {
1017 mpsadbw(a, b, $imm8)
1018 };
0531ce1d 1019 }
532ac7d7 1020 transmute(constify_imm3!(imm8, call))
0531ce1d
XL
1021}
1022
1023/// Tests whether the specified bits in a 128-bit integer vector are all
1024/// zeros.
1025///
1026/// Arguments:
1027///
1028/// * `a` - A 128-bit integer vector containing the bits to be tested.
1029/// * `mask` - A 128-bit integer vector selecting which bits to test in
0731742a 1030/// operand `a`.
0531ce1d
XL
1031///
1032/// Returns:
1033///
1034/// * `1` - if the specified bits are all zeros,
1035/// * `0` - otherwise.
83c7162d
XL
1036///
1037/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128)
0531ce1d
XL
1038#[inline]
1039#[target_feature(enable = "sse4.1")]
1040#[cfg_attr(test, assert_instr(ptest))]
83c7162d 1041#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1042pub unsafe fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 {
1043 ptestz(a.as_i64x2(), mask.as_i64x2())
1044}
1045
1046/// Tests whether the specified bits in a 128-bit integer vector are all
1047/// ones.
1048///
1049/// Arguments:
1050///
1051/// * `a` - A 128-bit integer vector containing the bits to be tested.
1052/// * `mask` - A 128-bit integer vector selecting which bits to test in
0731742a 1053/// operand `a`.
0531ce1d
XL
1054///
1055/// Returns:
1056///
1057/// * `1` - if the specified bits are all ones,
1058/// * `0` - otherwise.
83c7162d
XL
1059///
1060/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128)
0531ce1d
XL
1061#[inline]
1062#[target_feature(enable = "sse4.1")]
1063#[cfg_attr(test, assert_instr(ptest))]
83c7162d 1064#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1065pub unsafe fn _mm_testc_si128(a: __m128i, mask: __m128i) -> i32 {
1066 ptestc(a.as_i64x2(), mask.as_i64x2())
1067}
1068
1069/// Tests whether the specified bits in a 128-bit integer vector are
1070/// neither all zeros nor all ones.
1071///
1072/// Arguments:
1073///
1074/// * `a` - A 128-bit integer vector containing the bits to be tested.
1075/// * `mask` - A 128-bit integer vector selecting which bits to test in
0731742a 1076/// operand `a`.
0531ce1d
XL
1077///
1078/// Returns:
1079///
1080/// * `1` - if the specified bits are neither all zeros nor all ones,
1081/// * `0` - otherwise.
83c7162d
XL
1082///
1083/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testnzc_si128)
0531ce1d
XL
1084#[inline]
1085#[target_feature(enable = "sse4.1")]
1086#[cfg_attr(test, assert_instr(ptest))]
83c7162d 1087#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1088pub unsafe fn _mm_testnzc_si128(a: __m128i, mask: __m128i) -> i32 {
1089 ptestnzc(a.as_i64x2(), mask.as_i64x2())
1090}
1091
1092/// Tests whether the specified bits in a 128-bit integer vector are all
1093/// zeros.
1094///
1095/// Arguments:
1096///
1097/// * `a` - A 128-bit integer vector containing the bits to be tested.
1098/// * `mask` - A 128-bit integer vector selecting which bits to test in
0731742a 1099/// operand `a`.
0531ce1d
XL
1100///
1101/// Returns:
1102///
1103/// * `1` - if the specified bits are all zeros,
1104/// * `0` - otherwise.
83c7162d
XL
1105///
1106/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros)
0531ce1d
XL
1107#[inline]
1108#[target_feature(enable = "sse4.1")]
1109#[cfg_attr(test, assert_instr(ptest))]
83c7162d 1110#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1111pub unsafe fn _mm_test_all_zeros(a: __m128i, mask: __m128i) -> i32 {
1112 _mm_testz_si128(a, mask)
1113}
1114
1115/// Tests whether the specified bits in `a` 128-bit integer vector are all
1116/// ones.
1117///
1118/// Argument:
1119///
1120/// * `a` - A 128-bit integer vector containing the bits to be tested.
1121///
1122/// Returns:
1123///
1124/// * `1` - if the bits specified in the operand are all set to 1,
1125/// * `0` - otherwise.
83c7162d
XL
1126///
1127/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones)
0531ce1d
XL
1128#[inline]
1129#[target_feature(enable = "sse4.1")]
1130#[cfg_attr(test, assert_instr(pcmpeqd))]
1131#[cfg_attr(test, assert_instr(ptest))]
83c7162d 1132#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1133pub unsafe fn _mm_test_all_ones(a: __m128i) -> i32 {
1134 _mm_testc_si128(a, _mm_cmpeq_epi32(a, a))
1135}
1136
1137/// Tests whether the specified bits in a 128-bit integer vector are
1138/// neither all zeros nor all ones.
1139///
1140/// Arguments:
1141///
1142/// * `a` - A 128-bit integer vector containing the bits to be tested.
1143/// * `mask` - A 128-bit integer vector selecting which bits to test in
0731742a 1144/// operand `a`.
0531ce1d
XL
1145///
1146/// Returns:
1147///
1148/// * `1` - if the specified bits are neither all zeros nor all ones,
1149/// * `0` - otherwise.
83c7162d
XL
1150///
1151/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_mix_ones_zeros)
0531ce1d
XL
1152#[inline]
1153#[target_feature(enable = "sse4.1")]
1154#[cfg_attr(test, assert_instr(ptest))]
83c7162d 1155#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
1156pub unsafe fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 {
1157 _mm_testnzc_si128(a, mask)
1158}
1159
1160#[allow(improper_ctypes)]
1161extern "C" {
1162 #[link_name = "llvm.x86.sse41.pblendvb"]
1163 fn pblendvb(a: i8x16, b: i8x16, mask: i8x16) -> i8x16;
1164 #[link_name = "llvm.x86.sse41.blendvpd"]
1165 fn blendvpd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d;
1166 #[link_name = "llvm.x86.sse41.blendvps"]
1167 fn blendvps(a: __m128, b: __m128, mask: __m128) -> __m128;
1168 #[link_name = "llvm.x86.sse41.blendpd"]
1169 fn blendpd(a: __m128d, b: __m128d, imm2: u8) -> __m128d;
1170 #[link_name = "llvm.x86.sse41.blendps"]
1171 fn blendps(a: __m128, b: __m128, imm4: u8) -> __m128;
1172 #[link_name = "llvm.x86.sse41.pblendw"]
1173 fn pblendw(a: i16x8, b: i16x8, imm8: u8) -> i16x8;
1174 #[link_name = "llvm.x86.sse41.insertps"]
1175 fn insertps(a: __m128, b: __m128, imm8: u8) -> __m128;
1176 #[link_name = "llvm.x86.sse41.pmaxsb"]
1177 fn pmaxsb(a: i8x16, b: i8x16) -> i8x16;
1178 #[link_name = "llvm.x86.sse41.pmaxuw"]
1179 fn pmaxuw(a: u16x8, b: u16x8) -> u16x8;
1180 #[link_name = "llvm.x86.sse41.pmaxsd"]
1181 fn pmaxsd(a: i32x4, b: i32x4) -> i32x4;
1182 #[link_name = "llvm.x86.sse41.pmaxud"]
1183 fn pmaxud(a: u32x4, b: u32x4) -> u32x4;
1184 #[link_name = "llvm.x86.sse41.pminsb"]
1185 fn pminsb(a: i8x16, b: i8x16) -> i8x16;
1186 #[link_name = "llvm.x86.sse41.pminuw"]
1187 fn pminuw(a: u16x8, b: u16x8) -> u16x8;
1188 #[link_name = "llvm.x86.sse41.pminsd"]
1189 fn pminsd(a: i32x4, b: i32x4) -> i32x4;
1190 #[link_name = "llvm.x86.sse41.pminud"]
1191 fn pminud(a: u32x4, b: u32x4) -> u32x4;
1192 #[link_name = "llvm.x86.sse41.packusdw"]
1193 fn packusdw(a: i32x4, b: i32x4) -> u16x8;
1194 #[link_name = "llvm.x86.sse41.dppd"]
1195 fn dppd(a: __m128d, b: __m128d, imm8: u8) -> __m128d;
1196 #[link_name = "llvm.x86.sse41.dpps"]
1197 fn dpps(a: __m128, b: __m128, imm8: u8) -> __m128;
1198 #[link_name = "llvm.x86.sse41.round.pd"]
1199 fn roundpd(a: __m128d, rounding: i32) -> __m128d;
1200 #[link_name = "llvm.x86.sse41.round.ps"]
1201 fn roundps(a: __m128, rounding: i32) -> __m128;
1202 #[link_name = "llvm.x86.sse41.round.sd"]
1203 fn roundsd(a: __m128d, b: __m128d, rounding: i32) -> __m128d;
1204 #[link_name = "llvm.x86.sse41.round.ss"]
1205 fn roundss(a: __m128, b: __m128, rounding: i32) -> __m128;
1206 #[link_name = "llvm.x86.sse41.phminposuw"]
1207 fn phminposuw(a: u16x8) -> u16x8;
1208 #[link_name = "llvm.x86.sse41.pmuldq"]
1209 fn pmuldq(a: i32x4, b: i32x4) -> i64x2;
1210 #[link_name = "llvm.x86.sse41.mpsadbw"]
1211 fn mpsadbw(a: u8x16, b: u8x16, imm8: u8) -> u16x8;
1212 #[link_name = "llvm.x86.sse41.ptestz"]
1213 fn ptestz(a: i64x2, mask: i64x2) -> i32;
1214 #[link_name = "llvm.x86.sse41.ptestc"]
1215 fn ptestc(a: i64x2, mask: i64x2) -> i32;
1216 #[link_name = "llvm.x86.sse41.ptestnzc"]
1217 fn ptestnzc(a: i64x2, mask: i64x2) -> i32;
1218}
1219
1220#[cfg(test)]
1221mod tests {
532ac7d7 1222 use crate::core_arch::x86::*;
0531ce1d 1223 use std::mem;
416331ca 1224 use stdarch_test::simd_test;
0531ce1d 1225
83c7162d 1226 #[simd_test(enable = "sse4.1")]
0531ce1d 1227 unsafe fn test_mm_blendv_epi8() {
0731742a 1228 #[rustfmt::skip]
0531ce1d
XL
1229 let a = _mm_setr_epi8(
1230 0, 1, 2, 3, 4, 5, 6, 7,
1231 8, 9, 10, 11, 12, 13, 14, 15,
1232 );
0731742a 1233 #[rustfmt::skip]
0531ce1d
XL
1234 let b = _mm_setr_epi8(
1235 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
1236 );
0731742a 1237 #[rustfmt::skip]
0531ce1d
XL
1238 let mask = _mm_setr_epi8(
1239 0, -1, 0, -1, 0, -1, 0, -1,
1240 0, -1, 0, -1, 0, -1, 0, -1,
1241 );
0731742a 1242 #[rustfmt::skip]
0531ce1d
XL
1243 let e = _mm_setr_epi8(
1244 0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31,
1245 );
1246 assert_eq_m128i(_mm_blendv_epi8(a, b, mask), e);
1247 }
1248
83c7162d 1249 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1250 unsafe fn test_mm_blendv_pd() {
1251 let a = _mm_set1_pd(0.0);
1252 let b = _mm_set1_pd(1.0);
532ac7d7 1253 let mask = transmute(_mm_setr_epi64x(0, -1));
0531ce1d
XL
1254 let r = _mm_blendv_pd(a, b, mask);
1255 let e = _mm_setr_pd(0.0, 1.0);
1256 assert_eq_m128d(r, e);
1257 }
1258
83c7162d 1259 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1260 unsafe fn test_mm_blendv_ps() {
1261 let a = _mm_set1_ps(0.0);
1262 let b = _mm_set1_ps(1.0);
532ac7d7 1263 let mask = transmute(_mm_setr_epi32(0, -1, 0, -1));
0531ce1d
XL
1264 let r = _mm_blendv_ps(a, b, mask);
1265 let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0);
1266 assert_eq_m128(r, e);
1267 }
1268
83c7162d 1269 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1270 unsafe fn test_mm_blend_pd() {
1271 let a = _mm_set1_pd(0.0);
1272 let b = _mm_set1_pd(1.0);
1273 let r = _mm_blend_pd(a, b, 0b10);
1274 let e = _mm_setr_pd(0.0, 1.0);
1275 assert_eq_m128d(r, e);
1276 }
1277
83c7162d 1278 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1279 unsafe fn test_mm_blend_ps() {
1280 let a = _mm_set1_ps(0.0);
1281 let b = _mm_set1_ps(1.0);
1282 let r = _mm_blend_ps(a, b, 0b1010);
1283 let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0);
1284 assert_eq_m128(r, e);
1285 }
1286
83c7162d 1287 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1288 unsafe fn test_mm_blend_epi16() {
1289 let a = _mm_set1_epi16(0);
1290 let b = _mm_set1_epi16(1);
1291 let r = _mm_blend_epi16(a, b, 0b1010_1100);
1292 let e = _mm_setr_epi16(0, 0, 1, 1, 0, 1, 0, 1);
1293 assert_eq_m128i(r, e);
1294 }
1295
83c7162d 1296 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1297 unsafe fn test_mm_extract_ps() {
1298 let a = _mm_setr_ps(0.0, 1.0, 2.0, 3.0);
532ac7d7 1299 let r: f32 = transmute(_mm_extract_ps(a, 1));
0531ce1d 1300 assert_eq!(r, 1.0);
532ac7d7 1301 let r: f32 = transmute(_mm_extract_ps(a, 5));
0531ce1d
XL
1302 assert_eq!(r, 1.0);
1303 }
1304
83c7162d 1305 #[simd_test(enable = "sse4.1")]
0531ce1d 1306 unsafe fn test_mm_extract_epi8() {
0731742a 1307 #[rustfmt::skip]
0531ce1d
XL
1308 let a = _mm_setr_epi8(
1309 -1, 1, 2, 3, 4, 5, 6, 7,
1310 8, 9, 10, 11, 12, 13, 14, 15
1311 );
1312 let r1 = _mm_extract_epi8(a, 0);
1313 let r2 = _mm_extract_epi8(a, 19);
1314 assert_eq!(r1, 0xFF);
1315 assert_eq!(r2, 3);
1316 }
1317
83c7162d 1318 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1319 unsafe fn test_mm_extract_epi32() {
1320 let a = _mm_setr_epi32(0, 1, 2, 3);
1321 let r = _mm_extract_epi32(a, 1);
1322 assert_eq!(r, 1);
1323 let r = _mm_extract_epi32(a, 5);
1324 assert_eq!(r, 1);
1325 }
1326
83c7162d 1327 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1328 unsafe fn test_mm_insert_ps() {
1329 let a = _mm_set1_ps(1.0);
1330 let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
1331 let r = _mm_insert_ps(a, b, 0b11_00_1100);
1332 let e = _mm_setr_ps(4.0, 1.0, 0.0, 0.0);
1333 assert_eq_m128(r, e);
1334 }
1335
83c7162d 1336 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1337 unsafe fn test_mm_insert_epi8() {
1338 let a = _mm_set1_epi8(0);
1339 let e = _mm_setr_epi8(0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
1340 let r = _mm_insert_epi8(a, 32, 1);
1341 assert_eq_m128i(r, e);
1342 let r = _mm_insert_epi8(a, 32, 17);
1343 assert_eq_m128i(r, e);
1344 }
1345
83c7162d 1346 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1347 unsafe fn test_mm_insert_epi32() {
1348 let a = _mm_set1_epi32(0);
1349 let e = _mm_setr_epi32(0, 32, 0, 0);
1350 let r = _mm_insert_epi32(a, 32, 1);
1351 assert_eq_m128i(r, e);
1352 let r = _mm_insert_epi32(a, 32, 5);
1353 assert_eq_m128i(r, e);
1354 }
1355
83c7162d 1356 #[simd_test(enable = "sse4.1")]
0531ce1d 1357 unsafe fn test_mm_max_epi8() {
0731742a 1358 #[rustfmt::skip]
0531ce1d
XL
1359 let a = _mm_setr_epi8(
1360 1, 4, 5, 8, 9, 12, 13, 16,
1361 17, 20, 21, 24, 25, 28, 29, 32,
1362 );
0731742a 1363 #[rustfmt::skip]
0531ce1d
XL
1364 let b = _mm_setr_epi8(
1365 2, 3, 6, 7, 10, 11, 14, 15,
1366 18, 19, 22, 23, 26, 27, 30, 31,
1367 );
1368 let r = _mm_max_epi8(a, b);
0731742a 1369 #[rustfmt::skip]
0531ce1d
XL
1370 let e = _mm_setr_epi8(
1371 2, 4, 6, 8, 10, 12, 14, 16,
1372 18, 20, 22, 24, 26, 28, 30, 32,
1373 );
1374 assert_eq_m128i(r, e);
1375 }
1376
83c7162d 1377 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1378 unsafe fn test_mm_max_epu16() {
1379 let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16);
1380 let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15);
1381 let r = _mm_max_epu16(a, b);
1382 let e = _mm_setr_epi16(2, 4, 6, 8, 10, 12, 14, 16);
1383 assert_eq_m128i(r, e);
1384 }
1385
83c7162d 1386 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1387 unsafe fn test_mm_max_epi32() {
1388 let a = _mm_setr_epi32(1, 4, 5, 8);
1389 let b = _mm_setr_epi32(2, 3, 6, 7);
1390 let r = _mm_max_epi32(a, b);
1391 let e = _mm_setr_epi32(2, 4, 6, 8);
1392 assert_eq_m128i(r, e);
1393 }
1394
83c7162d 1395 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1396 unsafe fn test_mm_max_epu32() {
1397 let a = _mm_setr_epi32(1, 4, 5, 8);
1398 let b = _mm_setr_epi32(2, 3, 6, 7);
1399 let r = _mm_max_epu32(a, b);
1400 let e = _mm_setr_epi32(2, 4, 6, 8);
1401 assert_eq_m128i(r, e);
1402 }
1403
83c7162d 1404 #[simd_test(enable = "sse4.1")]
0531ce1d 1405 unsafe fn test_mm_min_epi8_1() {
0731742a 1406 #[rustfmt::skip]
0531ce1d
XL
1407 let a = _mm_setr_epi8(
1408 1, 4, 5, 8, 9, 12, 13, 16,
1409 17, 20, 21, 24, 25, 28, 29, 32,
1410 );
0731742a 1411 #[rustfmt::skip]
0531ce1d
XL
1412 let b = _mm_setr_epi8(
1413 2, 3, 6, 7, 10, 11, 14, 15,
1414 18, 19, 22, 23, 26, 27, 30, 31,
1415 );
1416 let r = _mm_min_epi8(a, b);
0731742a 1417 #[rustfmt::skip]
0531ce1d
XL
1418 let e = _mm_setr_epi8(
1419 1, 3, 5, 7, 9, 11, 13, 15,
1420 17, 19, 21, 23, 25, 27, 29, 31,
1421 );
1422 assert_eq_m128i(r, e);
1423 }
1424
83c7162d 1425 #[simd_test(enable = "sse4.1")]
0531ce1d 1426 unsafe fn test_mm_min_epi8_2() {
0731742a 1427 #[rustfmt::skip]
0531ce1d
XL
1428 let a = _mm_setr_epi8(
1429 1, -4, -5, 8, -9, -12, 13, -16,
1430 17, 20, 21, 24, 25, 28, 29, 32,
1431 );
0731742a 1432 #[rustfmt::skip]
0531ce1d
XL
1433 let b = _mm_setr_epi8(
1434 2, -3, -6, 7, -10, -11, 14, -15,
1435 18, 19, 22, 23, 26, 27, 30, 31,
1436 );
1437 let r = _mm_min_epi8(a, b);
0731742a 1438 #[rustfmt::skip]
0531ce1d
XL
1439 let e = _mm_setr_epi8(
1440 1, -4, -6, 7, -10, -12, 13, -16,
1441 17, 19, 21, 23, 25, 27, 29, 31,
1442 );
1443 assert_eq_m128i(r, e);
1444 }
1445
83c7162d 1446 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1447 unsafe fn test_mm_min_epu16() {
1448 let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16);
1449 let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15);
1450 let r = _mm_min_epu16(a, b);
1451 let e = _mm_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15);
1452 assert_eq_m128i(r, e);
1453 }
1454
83c7162d 1455 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1456 unsafe fn test_mm_min_epi32_1() {
1457 let a = _mm_setr_epi32(1, 4, 5, 8);
1458 let b = _mm_setr_epi32(2, 3, 6, 7);
1459 let r = _mm_min_epi32(a, b);
1460 let e = _mm_setr_epi32(1, 3, 5, 7);
1461 assert_eq_m128i(r, e);
1462 }
1463
83c7162d 1464 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1465 unsafe fn test_mm_min_epi32_2() {
1466 let a = _mm_setr_epi32(-1, 4, 5, -7);
1467 let b = _mm_setr_epi32(-2, 3, -6, 8);
1468 let r = _mm_min_epi32(a, b);
1469 let e = _mm_setr_epi32(-2, 3, -6, -7);
1470 assert_eq_m128i(r, e);
1471 }
1472
83c7162d 1473 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1474 unsafe fn test_mm_min_epu32() {
1475 let a = _mm_setr_epi32(1, 4, 5, 8);
1476 let b = _mm_setr_epi32(2, 3, 6, 7);
1477 let r = _mm_min_epu32(a, b);
1478 let e = _mm_setr_epi32(1, 3, 5, 7);
1479 assert_eq_m128i(r, e);
1480 }
1481
83c7162d 1482 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1483 unsafe fn test_mm_packus_epi32() {
1484 let a = _mm_setr_epi32(1, 2, 3, 4);
1485 let b = _mm_setr_epi32(-1, -2, -3, -4);
1486 let r = _mm_packus_epi32(a, b);
1487 let e = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0);
1488 assert_eq_m128i(r, e);
1489 }
1490
83c7162d 1491 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1492 unsafe fn test_mm_cmpeq_epi64() {
1493 let a = _mm_setr_epi64x(0, 1);
1494 let b = _mm_setr_epi64x(0, 0);
1495 let r = _mm_cmpeq_epi64(a, b);
1496 let e = _mm_setr_epi64x(-1, 0);
1497 assert_eq_m128i(r, e);
1498 }
1499
83c7162d 1500 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1501 unsafe fn test_mm_cvtepi8_epi16() {
1502 let a = _mm_set1_epi8(10);
1503 let r = _mm_cvtepi8_epi16(a);
1504 let e = _mm_set1_epi16(10);
1505 assert_eq_m128i(r, e);
1506 let a = _mm_set1_epi8(-10);
1507 let r = _mm_cvtepi8_epi16(a);
1508 let e = _mm_set1_epi16(-10);
1509 assert_eq_m128i(r, e);
1510 }
1511
83c7162d 1512 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1513 unsafe fn test_mm_cvtepi8_epi32() {
1514 let a = _mm_set1_epi8(10);
1515 let r = _mm_cvtepi8_epi32(a);
1516 let e = _mm_set1_epi32(10);
1517 assert_eq_m128i(r, e);
1518 let a = _mm_set1_epi8(-10);
1519 let r = _mm_cvtepi8_epi32(a);
1520 let e = _mm_set1_epi32(-10);
1521 assert_eq_m128i(r, e);
1522 }
1523
83c7162d 1524 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1525 unsafe fn test_mm_cvtepi8_epi64() {
1526 let a = _mm_set1_epi8(10);
1527 let r = _mm_cvtepi8_epi64(a);
1528 let e = _mm_set1_epi64x(10);
1529 assert_eq_m128i(r, e);
1530 let a = _mm_set1_epi8(-10);
1531 let r = _mm_cvtepi8_epi64(a);
1532 let e = _mm_set1_epi64x(-10);
1533 assert_eq_m128i(r, e);
1534 }
1535
83c7162d 1536 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1537 unsafe fn test_mm_cvtepi16_epi32() {
1538 let a = _mm_set1_epi16(10);
1539 let r = _mm_cvtepi16_epi32(a);
1540 let e = _mm_set1_epi32(10);
1541 assert_eq_m128i(r, e);
1542 let a = _mm_set1_epi16(-10);
1543 let r = _mm_cvtepi16_epi32(a);
1544 let e = _mm_set1_epi32(-10);
1545 assert_eq_m128i(r, e);
1546 }
1547
83c7162d 1548 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1549 unsafe fn test_mm_cvtepi16_epi64() {
1550 let a = _mm_set1_epi16(10);
1551 let r = _mm_cvtepi16_epi64(a);
1552 let e = _mm_set1_epi64x(10);
1553 assert_eq_m128i(r, e);
1554 let a = _mm_set1_epi16(-10);
1555 let r = _mm_cvtepi16_epi64(a);
1556 let e = _mm_set1_epi64x(-10);
1557 assert_eq_m128i(r, e);
1558 }
1559
83c7162d 1560 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1561 unsafe fn test_mm_cvtepi32_epi64() {
1562 let a = _mm_set1_epi32(10);
1563 let r = _mm_cvtepi32_epi64(a);
1564 let e = _mm_set1_epi64x(10);
1565 assert_eq_m128i(r, e);
1566 let a = _mm_set1_epi32(-10);
1567 let r = _mm_cvtepi32_epi64(a);
1568 let e = _mm_set1_epi64x(-10);
1569 assert_eq_m128i(r, e);
1570 }
1571
83c7162d 1572 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1573 unsafe fn test_mm_cvtepu8_epi16() {
1574 let a = _mm_set1_epi8(10);
1575 let r = _mm_cvtepu8_epi16(a);
1576 let e = _mm_set1_epi16(10);
1577 assert_eq_m128i(r, e);
1578 }
1579
83c7162d 1580 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1581 unsafe fn test_mm_cvtepu8_epi32() {
1582 let a = _mm_set1_epi8(10);
1583 let r = _mm_cvtepu8_epi32(a);
1584 let e = _mm_set1_epi32(10);
1585 assert_eq_m128i(r, e);
1586 }
1587
83c7162d 1588 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1589 unsafe fn test_mm_cvtepu8_epi64() {
1590 let a = _mm_set1_epi8(10);
1591 let r = _mm_cvtepu8_epi64(a);
1592 let e = _mm_set1_epi64x(10);
1593 assert_eq_m128i(r, e);
1594 }
1595
83c7162d 1596 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1597 unsafe fn test_mm_cvtepu16_epi32() {
1598 let a = _mm_set1_epi16(10);
1599 let r = _mm_cvtepu16_epi32(a);
1600 let e = _mm_set1_epi32(10);
1601 assert_eq_m128i(r, e);
1602 }
1603
83c7162d 1604 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1605 unsafe fn test_mm_cvtepu16_epi64() {
1606 let a = _mm_set1_epi16(10);
1607 let r = _mm_cvtepu16_epi64(a);
1608 let e = _mm_set1_epi64x(10);
1609 assert_eq_m128i(r, e);
1610 }
1611
83c7162d 1612 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1613 unsafe fn test_mm_cvtepu32_epi64() {
1614 let a = _mm_set1_epi32(10);
1615 let r = _mm_cvtepu32_epi64(a);
1616 let e = _mm_set1_epi64x(10);
1617 assert_eq_m128i(r, e);
1618 }
1619
83c7162d 1620 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1621 unsafe fn test_mm_dp_pd() {
1622 let a = _mm_setr_pd(2.0, 3.0);
1623 let b = _mm_setr_pd(1.0, 4.0);
1624 let e = _mm_setr_pd(14.0, 0.0);
1625 assert_eq_m128d(_mm_dp_pd(a, b, 0b00110001), e);
1626 }
1627
83c7162d 1628 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1629 unsafe fn test_mm_dp_ps() {
1630 let a = _mm_setr_ps(2.0, 3.0, 1.0, 10.0);
1631 let b = _mm_setr_ps(1.0, 4.0, 0.5, 10.0);
1632 let e = _mm_setr_ps(14.5, 0.0, 14.5, 0.0);
1633 assert_eq_m128(_mm_dp_ps(a, b, 0b01110101), e);
1634 }
1635
83c7162d 1636 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1637 unsafe fn test_mm_floor_pd() {
1638 let a = _mm_setr_pd(2.5, 4.5);
1639 let r = _mm_floor_pd(a);
1640 let e = _mm_setr_pd(2.0, 4.0);
1641 assert_eq_m128d(r, e);
1642 }
1643
83c7162d 1644 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1645 unsafe fn test_mm_floor_ps() {
1646 let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5);
1647 let r = _mm_floor_ps(a);
1648 let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0);
1649 assert_eq_m128(r, e);
1650 }
1651
83c7162d 1652 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1653 unsafe fn test_mm_floor_sd() {
1654 let a = _mm_setr_pd(2.5, 4.5);
1655 let b = _mm_setr_pd(-1.5, -3.5);
1656 let r = _mm_floor_sd(a, b);
1657 let e = _mm_setr_pd(-2.0, 4.5);
1658 assert_eq_m128d(r, e);
1659 }
1660
83c7162d 1661 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1662 unsafe fn test_mm_floor_ss() {
1663 let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5);
1664 let b = _mm_setr_ps(-1.5, -3.5, -7.5, -15.5);
1665 let r = _mm_floor_ss(a, b);
1666 let e = _mm_setr_ps(-2.0, 4.5, 8.5, 16.5);
1667 assert_eq_m128(r, e);
1668 }
1669
83c7162d 1670 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1671 unsafe fn test_mm_ceil_pd() {
1672 let a = _mm_setr_pd(1.5, 3.5);
1673 let r = _mm_ceil_pd(a);
1674 let e = _mm_setr_pd(2.0, 4.0);
1675 assert_eq_m128d(r, e);
1676 }
1677
83c7162d 1678 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1679 unsafe fn test_mm_ceil_ps() {
1680 let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
1681 let r = _mm_ceil_ps(a);
1682 let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0);
1683 assert_eq_m128(r, e);
1684 }
1685
83c7162d 1686 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1687 unsafe fn test_mm_ceil_sd() {
1688 let a = _mm_setr_pd(1.5, 3.5);
1689 let b = _mm_setr_pd(-2.5, -4.5);
1690 let r = _mm_ceil_sd(a, b);
1691 let e = _mm_setr_pd(-2.0, 3.5);
1692 assert_eq_m128d(r, e);
1693 }
1694
83c7162d 1695 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1696 unsafe fn test_mm_ceil_ss() {
1697 let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
1698 let b = _mm_setr_ps(-2.5, -4.5, -8.5, -16.5);
1699 let r = _mm_ceil_ss(a, b);
1700 let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5);
1701 assert_eq_m128(r, e);
1702 }
1703
83c7162d 1704 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1705 unsafe fn test_mm_round_pd() {
1706 let a = _mm_setr_pd(1.25, 3.75);
1707 let r = _mm_round_pd(a, _MM_FROUND_TO_NEAREST_INT);
1708 let e = _mm_setr_pd(1.0, 4.0);
1709 assert_eq_m128d(r, e);
1710 }
1711
83c7162d 1712 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1713 unsafe fn test_mm_round_ps() {
1714 let a = _mm_setr_ps(2.25, 4.75, -1.75, -4.25);
1715 let r = _mm_round_ps(a, _MM_FROUND_TO_ZERO);
1716 let e = _mm_setr_ps(2.0, 4.0, -1.0, -4.0);
1717 assert_eq_m128(r, e);
1718 }
1719
83c7162d 1720 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1721 unsafe fn test_mm_round_sd() {
1722 let a = _mm_setr_pd(1.5, 3.5);
1723 let b = _mm_setr_pd(-2.5, -4.5);
1724 let old_mode = _MM_GET_ROUNDING_MODE();
1725 _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
1726 let r = _mm_round_sd(a, b, _MM_FROUND_CUR_DIRECTION);
1727 _MM_SET_ROUNDING_MODE(old_mode);
1728 let e = _mm_setr_pd(-2.0, 3.5);
1729 assert_eq_m128d(r, e);
1730 }
1731
83c7162d 1732 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1733 unsafe fn test_mm_round_ss() {
1734 let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
1735 let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5);
1736 let old_mode = _MM_GET_ROUNDING_MODE();
1737 _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
1738 let r = _mm_round_ss(a, b, _MM_FROUND_CUR_DIRECTION);
1739 _MM_SET_ROUNDING_MODE(old_mode);
1740 let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5);
1741 assert_eq_m128(r, e);
1742 }
1743
83c7162d 1744 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1745 unsafe fn test_mm_minpos_epu16_1() {
1746 let a = _mm_setr_epi16(23, 18, 44, 97, 50, 13, 67, 66);
1747 let r = _mm_minpos_epu16(a);
1748 let e = _mm_setr_epi16(13, 5, 0, 0, 0, 0, 0, 0);
1749 assert_eq_m128i(r, e);
1750 }
1751
83c7162d 1752 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1753 unsafe fn test_mm_minpos_epu16_2() {
1754 let a = _mm_setr_epi16(0, 18, 44, 97, 50, 13, 67, 66);
1755 let r = _mm_minpos_epu16(a);
1756 let e = _mm_setr_epi16(0, 0, 0, 0, 0, 0, 0, 0);
1757 assert_eq_m128i(r, e);
1758 }
1759
83c7162d 1760 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1761 unsafe fn test_mm_mul_epi32() {
1762 {
1763 let a = _mm_setr_epi32(1, 1, 1, 1);
1764 let b = _mm_setr_epi32(1, 2, 3, 4);
1765 let r = _mm_mul_epi32(a, b);
1766 let e = _mm_setr_epi64x(1, 3);
1767 assert_eq_m128i(r, e);
1768 }
1769 {
0731742a 1770 let a = _mm_setr_epi32(15, 2 /* ignored */, 1234567, 4 /* ignored */);
0531ce1d 1771 let b = _mm_setr_epi32(
8faf50e0
XL
1772 -20, -256, /* ignored */
1773 666666, 666666, /* ignored */
0531ce1d
XL
1774 );
1775 let r = _mm_mul_epi32(a, b);
1776 let e = _mm_setr_epi64x(-300, 823043843622);
1777 assert_eq_m128i(r, e);
1778 }
1779 }
1780
83c7162d 1781 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1782 unsafe fn test_mm_mullo_epi32() {
1783 {
1784 let a = _mm_setr_epi32(1, 1, 1, 1);
1785 let b = _mm_setr_epi32(1, 2, 3, 4);
1786 let r = _mm_mullo_epi32(a, b);
1787 let e = _mm_setr_epi32(1, 2, 3, 4);
1788 assert_eq_m128i(r, e);
1789 }
1790 {
1791 let a = _mm_setr_epi32(15, -2, 1234567, 99999);
1792 let b = _mm_setr_epi32(-20, -256, 666666, -99999);
1793 let r = _mm_mullo_epi32(a, b);
1794 // Attention, most significant bit in r[2] is treated
1795 // as a sign bit:
1796 // 1234567 * 666666 = -1589877210
1797 let e = _mm_setr_epi32(-300, 512, -1589877210, -1409865409);
1798 assert_eq_m128i(r, e);
1799 }
1800 }
1801
83c7162d 1802 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1803 unsafe fn test_mm_minpos_epu16() {
1804 let a = _mm_setr_epi16(8, 7, 6, 5, 4, 1, 2, 3);
1805 let r = _mm_minpos_epu16(a);
1806 let e = _mm_setr_epi16(1, 5, 0, 0, 0, 0, 0, 0);
1807 assert_eq_m128i(r, e);
1808 }
1809
83c7162d 1810 #[simd_test(enable = "sse4.1")]
0531ce1d 1811 unsafe fn test_mm_mpsadbw_epu8() {
0731742a 1812 #[rustfmt::skip]
0531ce1d
XL
1813 let a = _mm_setr_epi8(
1814 0, 1, 2, 3, 4, 5, 6, 7,
1815 8, 9, 10, 11, 12, 13, 14, 15,
1816 );
1817
1818 let r = _mm_mpsadbw_epu8(a, a, 0b000);
1819 let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28);
1820 assert_eq_m128i(r, e);
1821
1822 let r = _mm_mpsadbw_epu8(a, a, 0b001);
1823 let e = _mm_setr_epi16(16, 12, 8, 4, 0, 4, 8, 12);
1824 assert_eq_m128i(r, e);
1825
1826 let r = _mm_mpsadbw_epu8(a, a, 0b100);
1827 let e = _mm_setr_epi16(16, 20, 24, 28, 32, 36, 40, 44);
1828 assert_eq_m128i(r, e);
1829
1830 let r = _mm_mpsadbw_epu8(a, a, 0b101);
1831 let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28);
1832 assert_eq_m128i(r, e);
1833
1834 let r = _mm_mpsadbw_epu8(a, a, 0b111);
1835 let e = _mm_setr_epi16(32, 28, 24, 20, 16, 12, 8, 4);
1836 assert_eq_m128i(r, e);
1837 }
1838
83c7162d 1839 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1840 unsafe fn test_mm_testz_si128() {
1841 let a = _mm_set1_epi8(1);
1842 let mask = _mm_set1_epi8(0);
1843 let r = _mm_testz_si128(a, mask);
1844 assert_eq!(r, 1);
1845 let a = _mm_set1_epi8(0b101);
1846 let mask = _mm_set1_epi8(0b110);
1847 let r = _mm_testz_si128(a, mask);
1848 assert_eq!(r, 0);
1849 let a = _mm_set1_epi8(0b011);
1850 let mask = _mm_set1_epi8(0b100);
1851 let r = _mm_testz_si128(a, mask);
1852 assert_eq!(r, 1);
1853 }
1854
83c7162d 1855 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1856 unsafe fn test_mm_testc_si128() {
1857 let a = _mm_set1_epi8(-1);
1858 let mask = _mm_set1_epi8(0);
1859 let r = _mm_testc_si128(a, mask);
1860 assert_eq!(r, 1);
1861 let a = _mm_set1_epi8(0b101);
1862 let mask = _mm_set1_epi8(0b110);
1863 let r = _mm_testc_si128(a, mask);
1864 assert_eq!(r, 0);
1865 let a = _mm_set1_epi8(0b101);
1866 let mask = _mm_set1_epi8(0b100);
1867 let r = _mm_testc_si128(a, mask);
1868 assert_eq!(r, 1);
1869 }
1870
83c7162d 1871 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1872 unsafe fn test_mm_testnzc_si128() {
1873 let a = _mm_set1_epi8(0);
1874 let mask = _mm_set1_epi8(1);
1875 let r = _mm_testnzc_si128(a, mask);
1876 assert_eq!(r, 0);
1877 let a = _mm_set1_epi8(-1);
1878 let mask = _mm_set1_epi8(0);
1879 let r = _mm_testnzc_si128(a, mask);
1880 assert_eq!(r, 0);
1881 let a = _mm_set1_epi8(0b101);
1882 let mask = _mm_set1_epi8(0b110);
1883 let r = _mm_testnzc_si128(a, mask);
1884 assert_eq!(r, 1);
1885 let a = _mm_set1_epi8(0b101);
1886 let mask = _mm_set1_epi8(0b101);
1887 let r = _mm_testnzc_si128(a, mask);
1888 assert_eq!(r, 0);
1889 }
1890
83c7162d 1891 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1892 unsafe fn test_mm_test_all_zeros() {
1893 let a = _mm_set1_epi8(1);
1894 let mask = _mm_set1_epi8(0);
1895 let r = _mm_test_all_zeros(a, mask);
1896 assert_eq!(r, 1);
1897 let a = _mm_set1_epi8(0b101);
1898 let mask = _mm_set1_epi8(0b110);
1899 let r = _mm_test_all_zeros(a, mask);
1900 assert_eq!(r, 0);
1901 let a = _mm_set1_epi8(0b011);
1902 let mask = _mm_set1_epi8(0b100);
1903 let r = _mm_test_all_zeros(a, mask);
1904 assert_eq!(r, 1);
1905 }
1906
83c7162d 1907 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1908 unsafe fn test_mm_test_all_ones() {
1909 let a = _mm_set1_epi8(-1);
1910 let r = _mm_test_all_ones(a);
1911 assert_eq!(r, 1);
1912 let a = _mm_set1_epi8(0b101);
1913 let r = _mm_test_all_ones(a);
1914 assert_eq!(r, 0);
1915 }
1916
83c7162d 1917 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1918 unsafe fn test_mm_test_mix_ones_zeros() {
1919 let a = _mm_set1_epi8(0);
1920 let mask = _mm_set1_epi8(1);
1921 let r = _mm_test_mix_ones_zeros(a, mask);
1922 assert_eq!(r, 0);
1923 let a = _mm_set1_epi8(-1);
1924 let mask = _mm_set1_epi8(0);
1925 let r = _mm_test_mix_ones_zeros(a, mask);
1926 assert_eq!(r, 0);
1927 let a = _mm_set1_epi8(0b101);
1928 let mask = _mm_set1_epi8(0b110);
1929 let r = _mm_test_mix_ones_zeros(a, mask);
1930 assert_eq!(r, 1);
1931 let a = _mm_set1_epi8(0b101);
1932 let mask = _mm_set1_epi8(0b101);
1933 let r = _mm_test_mix_ones_zeros(a, mask);
1934 assert_eq!(r, 0);
1935 }
1936}