]> git.proxmox.com Git - rustc.git/blame - library/stdarch/crates/core_arch/src/x86/sse41.rs
New upstream version 1.58.1+dfsg1
[rustc.git] / library / stdarch / crates / core_arch / src / x86 / sse41.rs
CommitLineData
0531ce1d
XL
1//! Streaming SIMD Extensions 4.1 (SSE4.1)
2
532ac7d7
XL
3use crate::{
4 core_arch::{simd::*, simd_llvm::*, x86::*},
5 mem::transmute,
6};
0531ce1d
XL
7
8#[cfg(test)]
416331ca 9use stdarch_test::assert_instr;
0531ce1d
XL
10
11// SSE4 rounding constans
12/// round to nearest
83c7162d 13#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
14pub const _MM_FROUND_TO_NEAREST_INT: i32 = 0x00;
15/// round down
83c7162d 16#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
17pub const _MM_FROUND_TO_NEG_INF: i32 = 0x01;
18/// round up
83c7162d 19#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
20pub const _MM_FROUND_TO_POS_INF: i32 = 0x02;
21/// truncate
83c7162d 22#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
23pub const _MM_FROUND_TO_ZERO: i32 = 0x03;
24/// use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE`
83c7162d 25#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
26pub const _MM_FROUND_CUR_DIRECTION: i32 = 0x04;
27/// do not suppress exceptions
83c7162d 28#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
29pub const _MM_FROUND_RAISE_EXC: i32 = 0x00;
30/// suppress exceptions
83c7162d 31#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
32pub const _MM_FROUND_NO_EXC: i32 = 0x08;
33/// round to nearest and do not suppress exceptions
83c7162d 34#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
35pub const _MM_FROUND_NINT: i32 = 0x00;
36/// round down and do not suppress exceptions
83c7162d 37#[stable(feature = "simd_x86", since = "1.27.0")]
74b04a01 38pub const _MM_FROUND_FLOOR: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF;
0531ce1d 39/// round up and do not suppress exceptions
83c7162d 40#[stable(feature = "simd_x86", since = "1.27.0")]
74b04a01 41pub const _MM_FROUND_CEIL: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF;
0531ce1d 42/// truncate and do not suppress exceptions
83c7162d 43#[stable(feature = "simd_x86", since = "1.27.0")]
74b04a01 44pub const _MM_FROUND_TRUNC: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO;
0531ce1d
XL
45/// use MXCSR.RC and do not suppress exceptions; see
46/// `vendor::_MM_SET_ROUNDING_MODE`
83c7162d 47#[stable(feature = "simd_x86", since = "1.27.0")]
74b04a01 48pub const _MM_FROUND_RINT: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION;
0531ce1d 49/// use MXCSR.RC and suppress exceptions; see `vendor::_MM_SET_ROUNDING_MODE`
83c7162d 50#[stable(feature = "simd_x86", since = "1.27.0")]
74b04a01 51pub const _MM_FROUND_NEARBYINT: i32 = _MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION;
0531ce1d
XL
52
53/// Blend packed 8-bit integers from `a` and `b` using `mask`
54///
55/// The high bit of each corresponding mask byte determines the selection.
56/// If the high bit is set the element of `a` is selected. The element
57/// of `b` is selected otherwise.
83c7162d
XL
58///
59/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_epi8)
0531ce1d
XL
60#[inline]
61#[target_feature(enable = "sse4.1")]
62#[cfg_attr(test, assert_instr(pblendvb))]
83c7162d 63#[stable(feature = "simd_x86", since = "1.27.0")]
0731742a 64pub unsafe fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i {
532ac7d7 65 transmute(pblendvb(a.as_i8x16(), b.as_i8x16(), mask.as_i8x16()))
0531ce1d
XL
66}
67
17df50a5 68/// Blend packed 16-bit integers from `a` and `b` using the mask `IMM8`.
0531ce1d
XL
69///
70/// The mask bits determine the selection. A clear bit selects the
71/// corresponding element of `a`, and a set bit the corresponding
72/// element of `b`.
83c7162d
XL
73///
74/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_epi16)
0531ce1d
XL
75#[inline]
76#[target_feature(enable = "sse4.1")]
8faf50e0
XL
77// Note: LLVM7 prefers the single-precision floating-point domain when possible
78// see https://bugs.llvm.org/show_bug.cgi?id=38195
17df50a5
XL
79// #[cfg_attr(test, assert_instr(pblendw, IMM8 = 0xF0))]
80#[cfg_attr(test, assert_instr(blendps, IMM8 = 0xF0))]
81#[rustc_legacy_const_generics(2)]
83c7162d 82#[stable(feature = "simd_x86", since = "1.27.0")]
17df50a5
XL
83pub unsafe fn _mm_blend_epi16<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
84 static_assert_imm8!(IMM8);
85 transmute(pblendw(a.as_i16x8(), b.as_i16x8(), IMM8 as u8))
0531ce1d
XL
86}
87
88/// Blend packed double-precision (64-bit) floating-point elements from `a`
89/// and `b` using `mask`
83c7162d
XL
90///
91/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_pd)
0531ce1d
XL
92#[inline]
93#[target_feature(enable = "sse4.1")]
94#[cfg_attr(test, assert_instr(blendvpd))]
83c7162d 95#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
96pub unsafe fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d {
97 blendvpd(a, b, mask)
98}
99
100/// Blend packed single-precision (32-bit) floating-point elements from `a`
101/// and `b` using `mask`
83c7162d
XL
102///
103/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps)
0531ce1d
XL
104#[inline]
105#[target_feature(enable = "sse4.1")]
106#[cfg_attr(test, assert_instr(blendvps))]
83c7162d 107#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
108pub unsafe fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 {
109 blendvps(a, b, mask)
110}
111
112/// Blend packed double-precision (64-bit) floating-point elements from `a`
17df50a5 113/// and `b` using control mask `IMM2`
83c7162d
XL
114///
115/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_pd)
0531ce1d
XL
116#[inline]
117#[target_feature(enable = "sse4.1")]
8faf50e0
XL
118// Note: LLVM7 prefers the single-precision floating-point domain when possible
119// see https://bugs.llvm.org/show_bug.cgi?id=38195
17df50a5
XL
120// #[cfg_attr(test, assert_instr(blendpd, IMM2 = 0b10))]
121#[cfg_attr(test, assert_instr(blendps, IMM2 = 0b10))]
122#[rustc_legacy_const_generics(2)]
123#[stable(feature = "simd_x86", since = "1.27.0")]
124pub unsafe fn _mm_blend_pd<const IMM2: i32>(a: __m128d, b: __m128d) -> __m128d {
125 static_assert_imm2!(IMM2);
126 blendpd(a, b, IMM2 as u8)
0531ce1d
XL
127}
128
129/// Blend packed single-precision (32-bit) floating-point elements from `a`
17df50a5 130/// and `b` using mask `IMM4`
83c7162d
XL
131///
132/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_ps)
0531ce1d
XL
133#[inline]
134#[target_feature(enable = "sse4.1")]
17df50a5
XL
135#[cfg_attr(test, assert_instr(blendps, IMM4 = 0b0101))]
136#[rustc_legacy_const_generics(2)]
137#[stable(feature = "simd_x86", since = "1.27.0")]
138pub unsafe fn _mm_blend_ps<const IMM4: i32>(a: __m128, b: __m128) -> __m128 {
139 static_assert_imm4!(IMM4);
140 blendps(a, b, IMM4 as u8)
0531ce1d
XL
141}
142
532ac7d7 143/// Extracts a single-precision (32-bit) floating-point element from `a`,
3c0e092e
XL
144/// selected with `IMM8`. The returned `i32` stores the float's bit-pattern,
145/// and may be converted back to a floating point number via casting.
83c7162d 146///
3c0e092e
XL
147/// # Example
148/// ```rust
149/// # #[cfg(target_arch = "x86")]
150/// # use std::arch::x86::*;
151/// # #[cfg(target_arch = "x86_64")]
152/// # use std::arch::x86_64::*;
153/// # fn main() {
154/// # if is_x86_feature_detected!("sse4.1") {
155/// # #[target_feature(enable = "sse4.1")]
156/// # unsafe fn worker() {
157/// let mut float_store = vec![1.0, 1.0, 2.0, 3.0];
158/// unsafe {
159/// let simd_floats = _mm_set_ps(2.5, 5.0, 7.5, 10.0);
160/// let x: i32 = _mm_extract_ps::<2>(simd_floats);
161/// float_store.push(f32::from_bits(x as u32));
162/// }
163/// assert_eq!(float_store, vec![1.0, 1.0, 2.0, 3.0, 5.0]);
164/// # }
165/// # unsafe { worker() }
166/// # }
167/// # }
168/// ```
83c7162d 169/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_ps)
0531ce1d
XL
170#[inline]
171#[target_feature(enable = "sse4.1")]
0731742a
XL
172#[cfg_attr(
173 all(test, not(target_os = "windows")),
17df50a5 174 assert_instr(extractps, IMM8 = 0)
0731742a 175)]
17df50a5 176#[rustc_legacy_const_generics(1)]
83c7162d 177#[stable(feature = "simd_x86", since = "1.27.0")]
17df50a5
XL
178pub unsafe fn _mm_extract_ps<const IMM8: i32>(a: __m128) -> i32 {
179 static_assert_imm2!(IMM8);
180 transmute(simd_extract::<_, f32>(a, IMM8 as u32))
0531ce1d
XL
181}
182
17df50a5 183/// Extracts an 8-bit integer from `a`, selected with `IMM8`. Returns a 32-bit
0531ce1d
XL
184/// integer containing the zero-extended integer data.
185///
fc512014 186/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
83c7162d
XL
187///
188/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi8)
0531ce1d
XL
189#[inline]
190#[target_feature(enable = "sse4.1")]
17df50a5
XL
191#[cfg_attr(test, assert_instr(pextrb, IMM8 = 0))]
192#[rustc_legacy_const_generics(1)]
83c7162d 193#[stable(feature = "simd_x86", since = "1.27.0")]
17df50a5
XL
194pub unsafe fn _mm_extract_epi8<const IMM8: i32>(a: __m128i) -> i32 {
195 static_assert_imm4!(IMM8);
196 simd_extract::<_, u8>(a.as_u8x16(), IMM8 as u32) as i32
0531ce1d
XL
197}
198
17df50a5 199/// Extracts an 32-bit integer from `a` selected with `IMM8`
83c7162d
XL
200///
201/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi32)
0531ce1d
XL
202#[inline]
203#[target_feature(enable = "sse4.1")]
0731742a
XL
204#[cfg_attr(
205 all(test, not(target_os = "windows")),
17df50a5 206 assert_instr(extractps, IMM8 = 1)
0731742a 207)]
17df50a5 208#[rustc_legacy_const_generics(1)]
83c7162d 209#[stable(feature = "simd_x86", since = "1.27.0")]
17df50a5
XL
210pub unsafe fn _mm_extract_epi32<const IMM8: i32>(a: __m128i) -> i32 {
211 static_assert_imm2!(IMM8);
212 simd_extract::<_, i32>(a.as_i32x4(), IMM8 as u32)
0531ce1d
XL
213}
214
215/// Select a single value in `a` to store at some position in `b`,
17df50a5 216/// Then zero elements according to `IMM8`.
0531ce1d 217///
17df50a5 218/// `IMM8` specifies which bits from operand `a` will be copied, which bits in
0531ce1d
XL
219/// the result they will be copied to, and which bits in the result will be
220/// cleared. The following assignments are made:
221///
222/// * Bits `[7:6]` specify the bits to copy from operand `a`:
223/// - `00`: Selects bits `[31:0]` from operand `a`.
224/// - `01`: Selects bits `[63:32]` from operand `a`.
225/// - `10`: Selects bits `[95:64]` from operand `a`.
226/// - `11`: Selects bits `[127:96]` from operand `a`.
227///
228/// * Bits `[5:4]` specify the bits in the result to which the selected bits
229/// from operand `a` are copied:
230/// - `00`: Copies the selected bits from `a` to result bits `[31:0]`.
231/// - `01`: Copies the selected bits from `a` to result bits `[63:32]`.
232/// - `10`: Copies the selected bits from `a` to result bits `[95:64]`.
233/// - `11`: Copies the selected bits from `a` to result bits `[127:96]`.
234///
235/// * Bits `[3:0]`: If any of these bits are set, the corresponding result
236/// element is cleared.
83c7162d
XL
237///
238/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_ps)
0531ce1d
XL
239#[inline]
240#[target_feature(enable = "sse4.1")]
17df50a5
XL
241#[cfg_attr(test, assert_instr(insertps, IMM8 = 0b1010))]
242#[rustc_legacy_const_generics(2)]
243#[stable(feature = "simd_x86", since = "1.27.0")]
244pub unsafe fn _mm_insert_ps<const IMM8: i32>(a: __m128, b: __m128) -> __m128 {
245 static_assert_imm8!(IMM8);
246 insertps(a, b, IMM8 as u8)
0531ce1d
XL
247}
248
532ac7d7 249/// Returns a copy of `a` with the 8-bit integer from `i` inserted at a
17df50a5 250/// location specified by `IMM8`.
83c7162d
XL
251///
252/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_epi8)
0531ce1d
XL
253#[inline]
254#[target_feature(enable = "sse4.1")]
17df50a5
XL
255#[cfg_attr(test, assert_instr(pinsrb, IMM8 = 0))]
256#[rustc_legacy_const_generics(2)]
83c7162d 257#[stable(feature = "simd_x86", since = "1.27.0")]
17df50a5
XL
258pub unsafe fn _mm_insert_epi8<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
259 static_assert_imm4!(IMM8);
260 transmute(simd_insert(a.as_i8x16(), IMM8 as u32, i as i8))
0531ce1d
XL
261}
262
532ac7d7 263/// Returns a copy of `a` with the 32-bit integer from `i` inserted at a
17df50a5 264/// location specified by `IMM8`.
83c7162d
XL
265///
266/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_epi32)
0531ce1d
XL
267#[inline]
268#[target_feature(enable = "sse4.1")]
17df50a5
XL
269#[cfg_attr(test, assert_instr(pinsrd, IMM8 = 0))]
270#[rustc_legacy_const_generics(2)]
83c7162d 271#[stable(feature = "simd_x86", since = "1.27.0")]
17df50a5
XL
272pub unsafe fn _mm_insert_epi32<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
273 static_assert_imm2!(IMM8);
274 transmute(simd_insert(a.as_i32x4(), IMM8 as u32, i))
0531ce1d
XL
275}
276
532ac7d7 277/// Compares packed 8-bit integers in `a` and `b` and returns packed maximum
0531ce1d 278/// values in dst.
83c7162d
XL
279///
280/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8)
0531ce1d
XL
281#[inline]
282#[target_feature(enable = "sse4.1")]
283#[cfg_attr(test, assert_instr(pmaxsb))]
83c7162d 284#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 285pub unsafe fn _mm_max_epi8(a: __m128i, b: __m128i) -> __m128i {
532ac7d7 286 transmute(pmaxsb(a.as_i8x16(), b.as_i8x16()))
0531ce1d
XL
287}
288
532ac7d7 289/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed
0531ce1d 290/// maximum.
83c7162d
XL
291///
292/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16)
0531ce1d
XL
293#[inline]
294#[target_feature(enable = "sse4.1")]
295#[cfg_attr(test, assert_instr(pmaxuw))]
83c7162d 296#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 297pub unsafe fn _mm_max_epu16(a: __m128i, b: __m128i) -> __m128i {
532ac7d7 298 transmute(pmaxuw(a.as_u16x8(), b.as_u16x8()))
0531ce1d
XL
299}
300
532ac7d7 301/// Compares packed 32-bit integers in `a` and `b`, and returns packed maximum
0531ce1d 302/// values.
83c7162d
XL
303///
304/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi32)
0531ce1d
XL
305#[inline]
306#[target_feature(enable = "sse4.1")]
307#[cfg_attr(test, assert_instr(pmaxsd))]
83c7162d 308#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 309pub unsafe fn _mm_max_epi32(a: __m128i, b: __m128i) -> __m128i {
532ac7d7 310 transmute(pmaxsd(a.as_i32x4(), b.as_i32x4()))
0531ce1d
XL
311}
312
532ac7d7 313/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed
0531ce1d 314/// maximum values.
83c7162d
XL
315///
316/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32)
0531ce1d
XL
317#[inline]
318#[target_feature(enable = "sse4.1")]
319#[cfg_attr(test, assert_instr(pmaxud))]
83c7162d 320#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 321pub unsafe fn _mm_max_epu32(a: __m128i, b: __m128i) -> __m128i {
532ac7d7 322 transmute(pmaxud(a.as_u32x4(), b.as_u32x4()))
0531ce1d
XL
323}
324
532ac7d7 325/// Compares packed 8-bit integers in `a` and `b` and returns packed minimum
0531ce1d 326/// values in dst.
83c7162d
XL
327///
328/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8)
0531ce1d
XL
329#[inline]
330#[target_feature(enable = "sse4.1")]
331#[cfg_attr(test, assert_instr(pminsb))]
83c7162d 332#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 333pub unsafe fn _mm_min_epi8(a: __m128i, b: __m128i) -> __m128i {
532ac7d7 334 transmute(pminsb(a.as_i8x16(), b.as_i8x16()))
0531ce1d
XL
335}
336
532ac7d7 337/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed
0531ce1d 338/// minimum.
83c7162d
XL
339///
340/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16)
0531ce1d
XL
341#[inline]
342#[target_feature(enable = "sse4.1")]
343#[cfg_attr(test, assert_instr(pminuw))]
83c7162d 344#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 345pub unsafe fn _mm_min_epu16(a: __m128i, b: __m128i) -> __m128i {
532ac7d7 346 transmute(pminuw(a.as_u16x8(), b.as_u16x8()))
0531ce1d
XL
347}
348
532ac7d7 349/// Compares packed 32-bit integers in `a` and `b`, and returns packed minimum
0531ce1d 350/// values.
83c7162d
XL
351///
352/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi32)
0531ce1d
XL
353#[inline]
354#[target_feature(enable = "sse4.1")]
355#[cfg_attr(test, assert_instr(pminsd))]
83c7162d 356#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 357pub unsafe fn _mm_min_epi32(a: __m128i, b: __m128i) -> __m128i {
532ac7d7 358 transmute(pminsd(a.as_i32x4(), b.as_i32x4()))
0531ce1d
XL
359}
360
532ac7d7 361/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed
0531ce1d 362/// minimum values.
83c7162d
XL
363///
364/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu32)
0531ce1d
XL
365#[inline]
366#[target_feature(enable = "sse4.1")]
367#[cfg_attr(test, assert_instr(pminud))]
83c7162d 368#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 369pub unsafe fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i {
532ac7d7 370 transmute(pminud(a.as_u32x4(), b.as_u32x4()))
0531ce1d
XL
371}
372
532ac7d7 373/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
0531ce1d 374/// using unsigned saturation
83c7162d
XL
375///
376/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_packus_epi32)
0531ce1d
XL
377#[inline]
378#[target_feature(enable = "sse4.1")]
379#[cfg_attr(test, assert_instr(packusdw))]
83c7162d 380#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 381pub unsafe fn _mm_packus_epi32(a: __m128i, b: __m128i) -> __m128i {
532ac7d7 382 transmute(packusdw(a.as_i32x4(), b.as_i32x4()))
0531ce1d
XL
383}
384
532ac7d7 385/// Compares packed 64-bit integers in `a` and `b` for equality
83c7162d
XL
386///
387/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_epi64)
0531ce1d
XL
388#[inline]
389#[target_feature(enable = "sse4.1")]
390#[cfg_attr(test, assert_instr(pcmpeqq))]
83c7162d 391#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 392pub unsafe fn _mm_cmpeq_epi64(a: __m128i, b: __m128i) -> __m128i {
532ac7d7 393 transmute(simd_eq::<_, i64x2>(a.as_i64x2(), b.as_i64x2()))
0531ce1d
XL
394}
395
396/// Sign extend packed 8-bit integers in `a` to packed 16-bit integers
83c7162d
XL
397///
398/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi8_epi16)
0531ce1d
XL
399#[inline]
400#[target_feature(enable = "sse4.1")]
401#[cfg_attr(test, assert_instr(pmovsxbw))]
83c7162d 402#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
403pub unsafe fn _mm_cvtepi8_epi16(a: __m128i) -> __m128i {
404 let a = a.as_i8x16();
17df50a5 405 let a: i8x8 = simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
532ac7d7 406 transmute(simd_cast::<_, i16x8>(a))
0531ce1d
XL
407}
408
409/// Sign extend packed 8-bit integers in `a` to packed 32-bit integers
83c7162d
XL
410///
411/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi8_epi32)
0531ce1d
XL
412#[inline]
413#[target_feature(enable = "sse4.1")]
414#[cfg_attr(test, assert_instr(pmovsxbd))]
83c7162d 415#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
416pub unsafe fn _mm_cvtepi8_epi32(a: __m128i) -> __m128i {
417 let a = a.as_i8x16();
17df50a5 418 let a: i8x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
532ac7d7 419 transmute(simd_cast::<_, i32x4>(a))
0531ce1d
XL
420}
421
422/// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed
423/// 64-bit integers
83c7162d
XL
424///
425/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi8_epi64)
0531ce1d
XL
426#[inline]
427#[target_feature(enable = "sse4.1")]
428#[cfg_attr(test, assert_instr(pmovsxbq))]
83c7162d 429#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
430pub unsafe fn _mm_cvtepi8_epi64(a: __m128i) -> __m128i {
431 let a = a.as_i8x16();
17df50a5 432 let a: i8x2 = simd_shuffle2!(a, a, [0, 1]);
532ac7d7 433 transmute(simd_cast::<_, i64x2>(a))
0531ce1d
XL
434}
435
436/// Sign extend packed 16-bit integers in `a` to packed 32-bit integers
83c7162d
XL
437///
438/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi16_epi32)
0531ce1d
XL
439#[inline]
440#[target_feature(enable = "sse4.1")]
441#[cfg_attr(test, assert_instr(pmovsxwd))]
83c7162d 442#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
443pub unsafe fn _mm_cvtepi16_epi32(a: __m128i) -> __m128i {
444 let a = a.as_i16x8();
17df50a5 445 let a: i16x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
532ac7d7 446 transmute(simd_cast::<_, i32x4>(a))
0531ce1d
XL
447}
448
449/// Sign extend packed 16-bit integers in `a` to packed 64-bit integers
83c7162d
XL
450///
451/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi16_epi64)
0531ce1d
XL
452#[inline]
453#[target_feature(enable = "sse4.1")]
454#[cfg_attr(test, assert_instr(pmovsxwq))]
83c7162d 455#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
456pub unsafe fn _mm_cvtepi16_epi64(a: __m128i) -> __m128i {
457 let a = a.as_i16x8();
17df50a5 458 let a: i16x2 = simd_shuffle2!(a, a, [0, 1]);
532ac7d7 459 transmute(simd_cast::<_, i64x2>(a))
0531ce1d
XL
460}
461
462/// Sign extend packed 32-bit integers in `a` to packed 64-bit integers
83c7162d
XL
463///
464/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_epi64)
0531ce1d
XL
465#[inline]
466#[target_feature(enable = "sse4.1")]
467#[cfg_attr(test, assert_instr(pmovsxdq))]
83c7162d 468#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
469pub unsafe fn _mm_cvtepi32_epi64(a: __m128i) -> __m128i {
470 let a = a.as_i32x4();
17df50a5 471 let a: i32x2 = simd_shuffle2!(a, a, [0, 1]);
532ac7d7 472 transmute(simd_cast::<_, i64x2>(a))
0531ce1d
XL
473}
474
532ac7d7 475/// Zeroes extend packed unsigned 8-bit integers in `a` to packed 16-bit integers
83c7162d
XL
476///
477/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi16)
0531ce1d
XL
478#[inline]
479#[target_feature(enable = "sse4.1")]
480#[cfg_attr(test, assert_instr(pmovzxbw))]
83c7162d 481#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
482pub unsafe fn _mm_cvtepu8_epi16(a: __m128i) -> __m128i {
483 let a = a.as_u8x16();
17df50a5 484 let a: u8x8 = simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
532ac7d7 485 transmute(simd_cast::<_, i16x8>(a))
0531ce1d
XL
486}
487
532ac7d7 488/// Zeroes extend packed unsigned 8-bit integers in `a` to packed 32-bit integers
83c7162d
XL
489///
490/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi32)
0531ce1d
XL
491#[inline]
492#[target_feature(enable = "sse4.1")]
493#[cfg_attr(test, assert_instr(pmovzxbd))]
83c7162d 494#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
495pub unsafe fn _mm_cvtepu8_epi32(a: __m128i) -> __m128i {
496 let a = a.as_u8x16();
17df50a5 497 let a: u8x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
532ac7d7 498 transmute(simd_cast::<_, i32x4>(a))
0531ce1d
XL
499}
500
532ac7d7 501/// Zeroes extend packed unsigned 8-bit integers in `a` to packed 64-bit integers
83c7162d
XL
502///
503/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi64)
0531ce1d
XL
504#[inline]
505#[target_feature(enable = "sse4.1")]
506#[cfg_attr(test, assert_instr(pmovzxbq))]
83c7162d 507#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
508pub unsafe fn _mm_cvtepu8_epi64(a: __m128i) -> __m128i {
509 let a = a.as_u8x16();
17df50a5 510 let a: u8x2 = simd_shuffle2!(a, a, [0, 1]);
532ac7d7 511 transmute(simd_cast::<_, i64x2>(a))
0531ce1d
XL
512}
513
532ac7d7 514/// Zeroes extend packed unsigned 16-bit integers in `a`
0531ce1d 515/// to packed 32-bit integers
83c7162d
XL
516///
517/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu16_epi32)
0531ce1d
XL
518#[inline]
519#[target_feature(enable = "sse4.1")]
520#[cfg_attr(test, assert_instr(pmovzxwd))]
83c7162d 521#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
522pub unsafe fn _mm_cvtepu16_epi32(a: __m128i) -> __m128i {
523 let a = a.as_u16x8();
17df50a5 524 let a: u16x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
532ac7d7 525 transmute(simd_cast::<_, i32x4>(a))
0531ce1d
XL
526}
527
532ac7d7 528/// Zeroes extend packed unsigned 16-bit integers in `a`
0531ce1d 529/// to packed 64-bit integers
83c7162d
XL
530///
531/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu16_epi64)
0531ce1d
XL
532#[inline]
533#[target_feature(enable = "sse4.1")]
534#[cfg_attr(test, assert_instr(pmovzxwq))]
83c7162d 535#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
536pub unsafe fn _mm_cvtepu16_epi64(a: __m128i) -> __m128i {
537 let a = a.as_u16x8();
17df50a5 538 let a: u16x2 = simd_shuffle2!(a, a, [0, 1]);
532ac7d7 539 transmute(simd_cast::<_, i64x2>(a))
0531ce1d
XL
540}
541
532ac7d7 542/// Zeroes extend packed unsigned 32-bit integers in `a`
0531ce1d 543/// to packed 64-bit integers
83c7162d
XL
544///
545/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu32_epi64)
0531ce1d
XL
546#[inline]
547#[target_feature(enable = "sse4.1")]
548#[cfg_attr(test, assert_instr(pmovzxdq))]
83c7162d 549#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
550pub unsafe fn _mm_cvtepu32_epi64(a: __m128i) -> __m128i {
551 let a = a.as_u32x4();
17df50a5 552 let a: u32x2 = simd_shuffle2!(a, a, [0, 1]);
532ac7d7 553 transmute(simd_cast::<_, i64x2>(a))
0531ce1d
XL
554}
555
556/// Returns the dot product of two __m128d vectors.
557///
17df50a5 558/// `IMM8[1:0]` is the broadcast mask, and `IMM8[5:4]` is the condition mask.
0531ce1d
XL
559/// If a condition mask bit is zero, the corresponding multiplication is
560/// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of
561/// the dot product will be stored in the return value component. Otherwise if
562/// the broadcast mask bit is zero then the return component will be zero.
83c7162d
XL
563///
564/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_pd)
0531ce1d
XL
565#[inline]
566#[target_feature(enable = "sse4.1")]
17df50a5
XL
567#[cfg_attr(test, assert_instr(dppd, IMM8 = 0))]
568#[rustc_legacy_const_generics(2)]
569#[stable(feature = "simd_x86", since = "1.27.0")]
570pub unsafe fn _mm_dp_pd<const IMM8: i32>(a: __m128d, b: __m128d) -> __m128d {
571 static_assert_imm8!(IMM8);
572 dppd(a, b, IMM8 as u8)
0531ce1d
XL
573}
574
575/// Returns the dot product of two __m128 vectors.
576///
17df50a5 577/// `IMM8[3:0]` is the broadcast mask, and `IMM8[7:4]` is the condition mask.
0531ce1d
XL
578/// If a condition mask bit is zero, the corresponding multiplication is
579/// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of
580/// the dot product will be stored in the return value component. Otherwise if
581/// the broadcast mask bit is zero then the return component will be zero.
83c7162d
XL
582///
583/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps)
0531ce1d
XL
584#[inline]
585#[target_feature(enable = "sse4.1")]
17df50a5
XL
586#[cfg_attr(test, assert_instr(dpps, IMM8 = 0))]
587#[rustc_legacy_const_generics(2)]
588#[stable(feature = "simd_x86", since = "1.27.0")]
589pub unsafe fn _mm_dp_ps<const IMM8: i32>(a: __m128, b: __m128) -> __m128 {
590 static_assert_imm8!(IMM8);
591 dpps(a, b, IMM8 as u8)
0531ce1d
XL
592}
593
594/// Round the packed double-precision (64-bit) floating-point elements in `a`
532ac7d7 595/// down to an integer value, and stores the results as packed double-precision
0531ce1d 596/// floating-point elements.
83c7162d
XL
597///
598/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_pd)
0531ce1d
XL
599#[inline]
600#[target_feature(enable = "sse4.1")]
601#[cfg_attr(test, assert_instr(roundpd))]
83c7162d 602#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 603pub unsafe fn _mm_floor_pd(a: __m128d) -> __m128d {
74b04a01 604 simd_floor(a)
0531ce1d
XL
605}
606
607/// Round the packed single-precision (32-bit) floating-point elements in `a`
532ac7d7 608/// down to an integer value, and stores the results as packed single-precision
0531ce1d 609/// floating-point elements.
83c7162d
XL
610///
611/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps)
0531ce1d
XL
612#[inline]
613#[target_feature(enable = "sse4.1")]
614#[cfg_attr(test, assert_instr(roundps))]
83c7162d 615#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d 616pub unsafe fn _mm_floor_ps(a: __m128) -> __m128 {
74b04a01 617 simd_floor(a)
0531ce1d
XL
618}
619
620/// Round the lower double-precision (64-bit) floating-point element in `b`
621/// down to an integer value, store the result as a double-precision
622/// floating-point element in the lower element of the intrinsic result,
532ac7d7 623/// and copies the upper element from `a` to the upper element of the intrinsic
0531ce1d 624/// result.
83c7162d
XL
625///
626/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_sd)
0531ce1d
XL
627#[inline]
628#[target_feature(enable = "sse4.1")]
629#[cfg_attr(test, assert_instr(roundsd))]
83c7162d 630#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
631pub unsafe fn _mm_floor_sd(a: __m128d, b: __m128d) -> __m128d {
632 roundsd(a, b, _MM_FROUND_FLOOR)
633}
634
635/// Round the lower single-precision (32-bit) floating-point element in `b`
636/// down to an integer value, store the result as a single-precision
637/// floating-point element in the lower element of the intrinsic result,
532ac7d7 638/// and copies the upper 3 packed elements from `a` to the upper elements
0531ce1d 639/// of the intrinsic result.
83c7162d
XL
640///
641/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ss)
0531ce1d
XL
642#[inline]
643#[target_feature(enable = "sse4.1")]
644#[cfg_attr(test, assert_instr(roundss))]
83c7162d 645#[stable(feature = "simd_x86", since = "1.27.0")]
0531ce1d
XL
646pub unsafe fn _mm_floor_ss(a: __m128, b: __m128) -> __m128 {
647 roundss(a, b, _MM_FROUND_FLOOR)
648}
649
/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// up to an integer value, and stores the results as packed double-precision
/// floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_pd(a: __m128d) -> __m128d {
    // Lowered to the generic SIMD ceil; codegen still emits `roundpd` (see
    // the assert_instr above).
    simd_ceil(a)
}
662
/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// up to an integer value, and stores the results as packed single-precision
/// floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_ps(a: __m128) -> __m128 {
    // Lowered to the generic SIMD ceil; codegen still emits `roundps` (see
    // the assert_instr above).
    simd_ceil(a)
}
675
/// Round the lower double-precision (64-bit) floating-point element in `b`
/// up to an integer value, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element
/// of the intrinsic result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_sd(a: __m128d, b: __m128d) -> __m128d {
    // `_MM_FROUND_CEIL` = round toward positive infinity, exceptions raised.
    roundsd(a, b, _MM_FROUND_CEIL)
}
690
/// Round the lower single-precision (32-bit) floating-point element in `b`
/// up to an integer value, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_ss(a: __m128, b: __m128) -> __m128 {
    // `_MM_FROUND_CEIL` = round toward positive infinity, exceptions raised.
    roundss(a, b, _MM_FROUND_CEIL)
}
705
/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// using the `ROUNDING` parameter, and stores the results as packed
/// double-precision floating-point elements.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd, ROUNDING = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_pd<const ROUNDING: i32>(a: __m128d) -> __m128d {
    // `ROUNDING` must fit in the instruction's 4-bit immediate field.
    static_assert_imm4!(ROUNDING);
    roundpd(a, ROUNDING)
}
746
/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// using the `ROUNDING` parameter, and stores the results as packed
/// single-precision floating-point elements.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps, ROUNDING = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_ps<const ROUNDING: i32>(a: __m128) -> __m128 {
    // `ROUNDING` must fit in the instruction's 4-bit immediate field.
    static_assert_imm4!(ROUNDING);
    roundps(a, ROUNDING)
}
787
/// Round the lower double-precision (64-bit) floating-point element in `b`
/// using the `ROUNDING` parameter, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element of the intrinsic
/// result.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd, ROUNDING = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_sd<const ROUNDING: i32>(a: __m128d, b: __m128d) -> __m128d {
    // `ROUNDING` must fit in the instruction's 4-bit immediate field.
    static_assert_imm4!(ROUNDING);
    roundsd(a, b, ROUNDING)
}
830
/// Round the lower single-precision (32-bit) floating-point element in `b`
/// using the `ROUNDING` parameter, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss, ROUNDING = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_ss<const ROUNDING: i32>(a: __m128, b: __m128) -> __m128 {
    // `ROUNDING` must fit in the instruction's 4-bit immediate field.
    static_assert_imm4!(ROUNDING);
    roundss(a, b, ROUNDING)
}
873
/// Finds the minimum unsigned 16-bit element in the 128-bit __m128i vector,
/// returning a vector containing its value in its first position, and its
/// index
/// in its second position; all other elements are set to zero.
///
/// This intrinsic corresponds to the `VPHMINPOSUW` / `PHMINPOSUW`
/// instruction.
///
/// Arguments:
///
/// * `a` - A 128-bit vector of type `__m128i`.
///
/// Returns:
///
/// A 128-bit value where:
///
/// * bits `[15:0]` - contain the minimum value found in parameter `a`,
/// * bits `[18:16]` - contain the index of the minimum value
/// * remaining bits are set to `0`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(phminposuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_minpos_epu16(a: __m128i) -> __m128i {
    // Reinterpret lanes as u16 for the LLVM intrinsic, then back to __m128i.
    transmute(phminposuw(a.as_u16x8()))
}
902
/// Multiplies the low 32-bit integers from each packed 64-bit
/// element in `a` and `b`, and returns the signed 64-bit result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmuldq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mul_epi32(a: __m128i, b: __m128i) -> __m128i {
    // `pmuldq` multiplies lanes 0 and 2 of each operand, producing two full
    // signed 64-bit products.
    transmute(pmuldq(a.as_i32x4(), b.as_i32x4()))
}
914
/// Multiplies the packed 32-bit integers in `a` and `b`, producing intermediate
/// 64-bit integers, and returns the lowest 32-bit, whatever they might be,
/// reinterpreted as a signed integer. While `pmulld __m128i::splat(2),
/// __m128i::splat(2)` returns the obvious `__m128i::splat(4)`, due to wrapping
/// arithmetic `pmulld __m128i::splat(i32::MAX), __m128i::splat(2)` would
/// return a negative number.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mullo_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmulld))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mullo_epi32(a: __m128i, b: __m128i) -> __m128i {
    // Plain lane-wise wrapping multiply; codegen emits `pmulld` (see
    // assert_instr above).
    transmute(simd_mul(a.as_i32x4(), b.as_i32x4()))
}
930
/// Subtracts 8-bit unsigned integer values and computes the absolute
/// values of the differences to the corresponding bits in the destination.
/// Then sums of the absolute differences are returned according to the bit
/// fields in the immediate operand.
///
/// The following algorithm is performed:
///
/// ```ignore
/// i = IMM8[2] * 4
/// j = IMM8[1:0] * 4
/// for k := 0 to 7
///     d0 = abs(a[i + k + 0] - b[j + 0])
///     d1 = abs(a[i + k + 1] - b[j + 1])
///     d2 = abs(a[i + k + 2] - b[j + 2])
///     d3 = abs(a[i + k + 3] - b[j + 3])
///     r[k] = d0 + d1 + d2 + d3
/// ```
///
/// Arguments:
///
/// * `a` - A 128-bit vector of type `__m128i`.
/// * `b` - A 128-bit vector of type `__m128i`.
/// * `IMM8` - An 8-bit immediate operand specifying how the absolute
///   differences are to be calculated
///     * Bit `[2]` specify the offset for operand `a`
///     * Bits `[1:0]` specify the offset for operand `b`
///
/// Returns:
///
/// * A `__m128i` vector containing the sums of the sets of absolute
///   differences between both operands.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mpsadbw_epu8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(mpsadbw, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mpsadbw_epu8<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
    // Only the low 3 bits of the immediate are meaningful (see doc above).
    static_assert_imm3!(IMM8);
    transmute(mpsadbw(a.as_u8x16(), b.as_u8x16(), IMM8 as u8))
}
973
/// Tests whether the specified bits in a 128-bit integer vector are all
/// zeros.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all zeros,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 {
    // `ptest` sets ZF when (a AND mask) == 0; the intrinsic returns ZF.
    ptestz(a.as_i64x2(), mask.as_i64x2())
}
996
/// Tests whether the specified bits in a 128-bit integer vector are all
/// ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_testc_si128(a: __m128i, mask: __m128i) -> i32 {
    // `ptest` sets CF when (NOT a AND mask) == 0; the intrinsic returns CF.
    ptestc(a.as_i64x2(), mask.as_i64x2())
}
1019
/// Tests whether the specified bits in a 128-bit integer vector are
/// neither all zeros nor all ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are neither all zeros nor all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testnzc_si128)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_testnzc_si128(a: __m128i, mask: __m128i) -> i32 {
    // Returns 1 exactly when `ptest` leaves both ZF and CF clear.
    ptestnzc(a.as_i64x2(), mask.as_i64x2())
}
1042
/// Tests whether the specified bits in a 128-bit integer vector are all
/// zeros.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all zeros,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_test_all_zeros(a: __m128i, mask: __m128i) -> i32 {
    // Alias for `_mm_testz_si128`, kept for Intel API parity.
    _mm_testz_si128(a, mask)
}
1065
/// Tests whether the specified bits in `a` 128-bit integer vector are all
/// ones.
///
/// Argument:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
///
/// Returns:
///
/// * `1` - if the bits specified in the operand are all set to 1,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pcmpeqd))]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_test_all_ones(a: __m128i) -> i32 {
    // `_mm_cmpeq_epi32(a, a)` yields an all-ones mask, so this checks every
    // bit of `a` via the carry-flag form of `ptest`.
    _mm_testc_si128(a, _mm_cmpeq_epi32(a, a))
}
1087
/// Tests whether the specified bits in a 128-bit integer vector are
/// neither all zeros nor all ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are neither all zeros nor all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_mix_ones_zeros)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 {
    // Alias for `_mm_testnzc_si128`, kept for Intel API parity.
    _mm_testnzc_si128(a, mask)
}
1110
// Declarations of the LLVM intrinsics backing the SSE4.1 functions above.
// The `link_name` strings are LLVM intrinsic names resolved by the compiler,
// not C symbols; `improper_ctypes` is allowed because SIMD vector types are
// not FFI-safe in the usual sense but are understood by LLVM here.
#[allow(improper_ctypes)]
extern "C" {
    #[link_name = "llvm.x86.sse41.pblendvb"]
    fn pblendvb(a: i8x16, b: i8x16, mask: i8x16) -> i8x16;
    #[link_name = "llvm.x86.sse41.blendvpd"]
    fn blendvpd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d;
    #[link_name = "llvm.x86.sse41.blendvps"]
    fn blendvps(a: __m128, b: __m128, mask: __m128) -> __m128;
    #[link_name = "llvm.x86.sse41.blendpd"]
    fn blendpd(a: __m128d, b: __m128d, imm2: u8) -> __m128d;
    #[link_name = "llvm.x86.sse41.blendps"]
    fn blendps(a: __m128, b: __m128, imm4: u8) -> __m128;
    #[link_name = "llvm.x86.sse41.pblendw"]
    fn pblendw(a: i16x8, b: i16x8, imm8: u8) -> i16x8;
    #[link_name = "llvm.x86.sse41.insertps"]
    fn insertps(a: __m128, b: __m128, imm8: u8) -> __m128;
    #[link_name = "llvm.x86.sse41.pmaxsb"]
    fn pmaxsb(a: i8x16, b: i8x16) -> i8x16;
    #[link_name = "llvm.x86.sse41.pmaxuw"]
    fn pmaxuw(a: u16x8, b: u16x8) -> u16x8;
    #[link_name = "llvm.x86.sse41.pmaxsd"]
    fn pmaxsd(a: i32x4, b: i32x4) -> i32x4;
    #[link_name = "llvm.x86.sse41.pmaxud"]
    fn pmaxud(a: u32x4, b: u32x4) -> u32x4;
    #[link_name = "llvm.x86.sse41.pminsb"]
    fn pminsb(a: i8x16, b: i8x16) -> i8x16;
    #[link_name = "llvm.x86.sse41.pminuw"]
    fn pminuw(a: u16x8, b: u16x8) -> u16x8;
    #[link_name = "llvm.x86.sse41.pminsd"]
    fn pminsd(a: i32x4, b: i32x4) -> i32x4;
    #[link_name = "llvm.x86.sse41.pminud"]
    fn pminud(a: u32x4, b: u32x4) -> u32x4;
    #[link_name = "llvm.x86.sse41.packusdw"]
    fn packusdw(a: i32x4, b: i32x4) -> u16x8;
    #[link_name = "llvm.x86.sse41.dppd"]
    fn dppd(a: __m128d, b: __m128d, imm8: u8) -> __m128d;
    #[link_name = "llvm.x86.sse41.dpps"]
    fn dpps(a: __m128, b: __m128, imm8: u8) -> __m128;
    #[link_name = "llvm.x86.sse41.round.pd"]
    fn roundpd(a: __m128d, rounding: i32) -> __m128d;
    #[link_name = "llvm.x86.sse41.round.ps"]
    fn roundps(a: __m128, rounding: i32) -> __m128;
    #[link_name = "llvm.x86.sse41.round.sd"]
    fn roundsd(a: __m128d, b: __m128d, rounding: i32) -> __m128d;
    #[link_name = "llvm.x86.sse41.round.ss"]
    fn roundss(a: __m128, b: __m128, rounding: i32) -> __m128;
    #[link_name = "llvm.x86.sse41.phminposuw"]
    fn phminposuw(a: u16x8) -> u16x8;
    #[link_name = "llvm.x86.sse41.pmuldq"]
    fn pmuldq(a: i32x4, b: i32x4) -> i64x2;
    #[link_name = "llvm.x86.sse41.mpsadbw"]
    fn mpsadbw(a: u8x16, b: u8x16, imm8: u8) -> u16x8;
    #[link_name = "llvm.x86.sse41.ptestz"]
    fn ptestz(a: i64x2, mask: i64x2) -> i32;
    #[link_name = "llvm.x86.sse41.ptestc"]
    fn ptestc(a: i64x2, mask: i64x2) -> i32;
    #[link_name = "llvm.x86.sse41.ptestnzc"]
    fn ptestnzc(a: i64x2, mask: i64x2) -> i32;
}
1170
1171#[cfg(test)]
1172mod tests {
532ac7d7 1173 use crate::core_arch::x86::*;
0531ce1d 1174 use std::mem;
416331ca 1175 use stdarch_test::simd_test;
0531ce1d 1176
    // Variable blend: bytes whose mask lane has the sign bit set come from `b`.
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_blendv_epi8() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
        );
        #[rustfmt::skip]
        let mask = _mm_setr_epi8(
            0, -1, 0, -1, 0, -1, 0, -1,
            0, -1, 0, -1, 0, -1, 0, -1,
        );
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31,
        );
        assert_eq_m128i(_mm_blendv_epi8(a, b, mask), e);
    }
1199
    // Variable blend on f64 lanes: lane selected from `b` where mask sign bit set.
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_blendv_pd() {
        let a = _mm_set1_pd(0.0);
        let b = _mm_set1_pd(1.0);
        let mask = transmute(_mm_setr_epi64x(0, -1));
        let r = _mm_blendv_pd(a, b, mask);
        let e = _mm_setr_pd(0.0, 1.0);
        assert_eq_m128d(r, e);
    }

    // Variable blend on f32 lanes.
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_blendv_ps() {
        let a = _mm_set1_ps(0.0);
        let b = _mm_set1_ps(1.0);
        let mask = transmute(_mm_setr_epi32(0, -1, 0, -1));
        let r = _mm_blendv_ps(a, b, mask);
        let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0);
        assert_eq_m128(r, e);
    }

    // Immediate blend: bit i of the const selects lane i from `b`.
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_blend_pd() {
        let a = _mm_set1_pd(0.0);
        let b = _mm_set1_pd(1.0);
        let r = _mm_blend_pd::<0b10>(a, b);
        let e = _mm_setr_pd(0.0, 1.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_blend_ps() {
        let a = _mm_set1_ps(0.0);
        let b = _mm_set1_ps(1.0);
        let r = _mm_blend_ps::<0b1010>(a, b);
        let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_blend_epi16() {
        let a = _mm_set1_epi16(0);
        let b = _mm_set1_epi16(1);
        let r = _mm_blend_epi16::<0b1010_1100>(a, b);
        let e = _mm_setr_epi16(0, 0, 1, 1, 0, 1, 0, 1);
        assert_eq_m128i(r, e);
    }
1246
    // `_mm_extract_ps` returns the raw bits of the selected f32 lane as i32.
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_extract_ps() {
        let a = _mm_setr_ps(0.0, 1.0, 2.0, 3.0);
        let r: f32 = transmute(_mm_extract_ps::<1>(a));
        assert_eq!(r, 1.0);
        let r: f32 = transmute(_mm_extract_ps::<3>(a));
        assert_eq!(r, 3.0);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_extract_epi8() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            -1, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15
        );
        let r1 = _mm_extract_epi8::<0>(a);
        let r2 = _mm_extract_epi8::<3>(a);
        // The extracted byte is zero-extended, so -1 comes back as 0xFF.
        assert_eq!(r1, 0xFF);
        assert_eq!(r2, 3);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_extract_epi32() {
        let a = _mm_setr_epi32(0, 1, 2, 3);
        let r = _mm_extract_epi32::<1>(a);
        assert_eq!(r, 1);
        let r = _mm_extract_epi32::<3>(a);
        assert_eq!(r, 3);
    }

    // Immediate encodes: source lane of `b`, destination lane in `a`, zero mask.
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_insert_ps() {
        let a = _mm_set1_ps(1.0);
        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let r = _mm_insert_ps::<0b11_00_1100>(a, b);
        let e = _mm_setr_ps(4.0, 1.0, 0.0, 0.0);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_insert_epi8() {
        let a = _mm_set1_epi8(0);
        let e = _mm_setr_epi8(0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        let r = _mm_insert_epi8::<1>(a, 32);
        assert_eq_m128i(r, e);
        let e = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0);
        let r = _mm_insert_epi8::<14>(a, 32);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_insert_epi32() {
        let a = _mm_set1_epi32(0);
        let e = _mm_setr_epi32(0, 32, 0, 0);
        let r = _mm_insert_epi32::<1>(a, 32);
        assert_eq_m128i(r, e);
        let e = _mm_setr_epi32(0, 0, 0, 32);
        let r = _mm_insert_epi32::<3>(a, 32);
        assert_eq_m128i(r, e);
    }
1308
    // Lane-wise signed max on bytes.
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_max_epi8() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, 4, 5, 8, 9, 12, 13, 16,
            17, 20, 21, 24, 25, 28, 29, 32,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            2, 3, 6, 7, 10, 11, 14, 15,
            18, 19, 22, 23, 26, 27, 30, 31,
        );
        let r = _mm_max_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            2, 4, 6, 8, 10, 12, 14, 16,
            18, 20, 22, 24, 26, 28, 30, 32,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_max_epu16() {
        let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16);
        let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15);
        let r = _mm_max_epu16(a, b);
        let e = _mm_setr_epi16(2, 4, 6, 8, 10, 12, 14, 16);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_max_epi32() {
        let a = _mm_setr_epi32(1, 4, 5, 8);
        let b = _mm_setr_epi32(2, 3, 6, 7);
        let r = _mm_max_epi32(a, b);
        let e = _mm_setr_epi32(2, 4, 6, 8);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_max_epu32() {
        let a = _mm_setr_epi32(1, 4, 5, 8);
        let b = _mm_setr_epi32(2, 3, 6, 7);
        let r = _mm_max_epu32(a, b);
        let e = _mm_setr_epi32(2, 4, 6, 8);
        assert_eq_m128i(r, e);
    }

    // Signed min on bytes, non-negative operands.
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_min_epi8_1() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, 4, 5, 8, 9, 12, 13, 16,
            17, 20, 21, 24, 25, 28, 29, 32,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            2, 3, 6, 7, 10, 11, 14, 15,
            18, 19, 22, 23, 26, 27, 30, 31,
        );
        let r = _mm_min_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            1, 3, 5, 7, 9, 11, 13, 15,
            17, 19, 21, 23, 25, 27, 29, 31,
        );
        assert_eq_m128i(r, e);
    }

    // Signed min on bytes with negative operands (checks signed comparison).
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_min_epi8_2() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, -4, -5, 8, -9, -12, 13, -16,
            17, 20, 21, 24, 25, 28, 29, 32,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            2, -3, -6, 7, -10, -11, 14, -15,
            18, 19, 22, 23, 26, 27, 30, 31,
        );
        let r = _mm_min_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            1, -4, -6, 7, -10, -12, 13, -16,
            17, 19, 21, 23, 25, 27, 29, 31,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_min_epu16() {
        let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16);
        let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15);
        let r = _mm_min_epu16(a, b);
        let e = _mm_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_min_epi32_1() {
        let a = _mm_setr_epi32(1, 4, 5, 8);
        let b = _mm_setr_epi32(2, 3, 6, 7);
        let r = _mm_min_epi32(a, b);
        let e = _mm_setr_epi32(1, 3, 5, 7);
        assert_eq_m128i(r, e);
    }

    // Signed min on dwords with negative operands.
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_min_epi32_2() {
        let a = _mm_setr_epi32(-1, 4, 5, -7);
        let b = _mm_setr_epi32(-2, 3, -6, 8);
        let r = _mm_min_epi32(a, b);
        let e = _mm_setr_epi32(-2, 3, -6, -7);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_min_epu32() {
        let a = _mm_setr_epi32(1, 4, 5, 8);
        let b = _mm_setr_epi32(2, 3, 6, 7);
        let r = _mm_min_epu32(a, b);
        let e = _mm_setr_epi32(1, 3, 5, 7);
        assert_eq_m128i(r, e);
    }
1434
    // Pack i32 -> u16 with unsigned saturation: negatives clamp to 0.
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_packus_epi32() {
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let b = _mm_setr_epi32(-1, -2, -3, -4);
        let r = _mm_packus_epi32(a, b);
        let e = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

    // 64-bit lane equality produces an all-ones (-1) or all-zeros lane.
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cmpeq_epi64() {
        let a = _mm_setr_epi64x(0, 1);
        let b = _mm_setr_epi64x(0, 0);
        let r = _mm_cmpeq_epi64(a, b);
        let e = _mm_setr_epi64x(-1, 0);
        assert_eq_m128i(r, e);
    }
1452
    // Sign-extending conversions: negative values must survive widening.
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi8_epi16() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepi8_epi16(a);
        let e = _mm_set1_epi16(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi8(-10);
        let r = _mm_cvtepi8_epi16(a);
        let e = _mm_set1_epi16(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi8_epi32() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepi8_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi8(-10);
        let r = _mm_cvtepi8_epi32(a);
        let e = _mm_set1_epi32(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi8_epi64() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepi8_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi8(-10);
        let r = _mm_cvtepi8_epi64(a);
        let e = _mm_set1_epi64x(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi16_epi32() {
        let a = _mm_set1_epi16(10);
        let r = _mm_cvtepi16_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi16(-10);
        let r = _mm_cvtepi16_epi32(a);
        let e = _mm_set1_epi32(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi16_epi64() {
        let a = _mm_set1_epi16(10);
        let r = _mm_cvtepi16_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi16(-10);
        let r = _mm_cvtepi16_epi64(a);
        let e = _mm_set1_epi64x(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi32_epi64() {
        let a = _mm_set1_epi32(10);
        let r = _mm_cvtepi32_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi32(-10);
        let r = _mm_cvtepi32_epi64(a);
        let e = _mm_set1_epi64x(-10);
        assert_eq_m128i(r, e);
    }

    // Zero-extending conversions: only non-negative inputs are exercised.
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu8_epi16() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepu8_epi16(a);
        let e = _mm_set1_epi16(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu8_epi32() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepu8_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu8_epi64() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepu8_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
    }
1548
83c7162d 1549 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1550 unsafe fn test_mm_cvtepu16_epi32() {
1551 let a = _mm_set1_epi16(10);
1552 let r = _mm_cvtepu16_epi32(a);
1553 let e = _mm_set1_epi32(10);
1554 assert_eq_m128i(r, e);
1555 }
1556
83c7162d 1557 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1558 unsafe fn test_mm_cvtepu16_epi64() {
1559 let a = _mm_set1_epi16(10);
1560 let r = _mm_cvtepu16_epi64(a);
1561 let e = _mm_set1_epi64x(10);
1562 assert_eq_m128i(r, e);
1563 }
1564
83c7162d 1565 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1566 unsafe fn test_mm_cvtepu32_epi64() {
1567 let a = _mm_set1_epi32(10);
1568 let r = _mm_cvtepu32_epi64(a);
1569 let e = _mm_set1_epi64x(10);
1570 assert_eq_m128i(r, e);
1571 }
1572
83c7162d 1573 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1574 unsafe fn test_mm_dp_pd() {
1575 let a = _mm_setr_pd(2.0, 3.0);
1576 let b = _mm_setr_pd(1.0, 4.0);
1577 let e = _mm_setr_pd(14.0, 0.0);
17df50a5 1578 assert_eq_m128d(_mm_dp_pd::<0b00110001>(a, b), e);
0531ce1d
XL
1579 }
1580
83c7162d 1581 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1582 unsafe fn test_mm_dp_ps() {
1583 let a = _mm_setr_ps(2.0, 3.0, 1.0, 10.0);
1584 let b = _mm_setr_ps(1.0, 4.0, 0.5, 10.0);
1585 let e = _mm_setr_ps(14.5, 0.0, 14.5, 0.0);
17df50a5 1586 assert_eq_m128(_mm_dp_ps::<0b01110101>(a, b), e);
0531ce1d
XL
1587 }
1588
83c7162d 1589 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1590 unsafe fn test_mm_floor_pd() {
1591 let a = _mm_setr_pd(2.5, 4.5);
1592 let r = _mm_floor_pd(a);
1593 let e = _mm_setr_pd(2.0, 4.0);
1594 assert_eq_m128d(r, e);
1595 }
1596
83c7162d 1597 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1598 unsafe fn test_mm_floor_ps() {
1599 let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5);
1600 let r = _mm_floor_ps(a);
1601 let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0);
1602 assert_eq_m128(r, e);
1603 }
1604
83c7162d 1605 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1606 unsafe fn test_mm_floor_sd() {
1607 let a = _mm_setr_pd(2.5, 4.5);
1608 let b = _mm_setr_pd(-1.5, -3.5);
1609 let r = _mm_floor_sd(a, b);
1610 let e = _mm_setr_pd(-2.0, 4.5);
1611 assert_eq_m128d(r, e);
1612 }
1613
83c7162d 1614 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1615 unsafe fn test_mm_floor_ss() {
1616 let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5);
1617 let b = _mm_setr_ps(-1.5, -3.5, -7.5, -15.5);
1618 let r = _mm_floor_ss(a, b);
1619 let e = _mm_setr_ps(-2.0, 4.5, 8.5, 16.5);
1620 assert_eq_m128(r, e);
1621 }
1622
83c7162d 1623 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1624 unsafe fn test_mm_ceil_pd() {
1625 let a = _mm_setr_pd(1.5, 3.5);
1626 let r = _mm_ceil_pd(a);
1627 let e = _mm_setr_pd(2.0, 4.0);
1628 assert_eq_m128d(r, e);
1629 }
1630
83c7162d 1631 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1632 unsafe fn test_mm_ceil_ps() {
1633 let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
1634 let r = _mm_ceil_ps(a);
1635 let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0);
1636 assert_eq_m128(r, e);
1637 }
1638
83c7162d 1639 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1640 unsafe fn test_mm_ceil_sd() {
1641 let a = _mm_setr_pd(1.5, 3.5);
1642 let b = _mm_setr_pd(-2.5, -4.5);
1643 let r = _mm_ceil_sd(a, b);
1644 let e = _mm_setr_pd(-2.0, 3.5);
1645 assert_eq_m128d(r, e);
1646 }
1647
83c7162d 1648 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1649 unsafe fn test_mm_ceil_ss() {
1650 let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
1651 let b = _mm_setr_ps(-2.5, -4.5, -8.5, -16.5);
1652 let r = _mm_ceil_ss(a, b);
1653 let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5);
1654 assert_eq_m128(r, e);
1655 }
1656
83c7162d 1657 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1658 unsafe fn test_mm_round_pd() {
1659 let a = _mm_setr_pd(1.25, 3.75);
17df50a5 1660 let r = _mm_round_pd::<_MM_FROUND_TO_NEAREST_INT>(a);
0531ce1d
XL
1661 let e = _mm_setr_pd(1.0, 4.0);
1662 assert_eq_m128d(r, e);
1663 }
1664
83c7162d 1665 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1666 unsafe fn test_mm_round_ps() {
1667 let a = _mm_setr_ps(2.25, 4.75, -1.75, -4.25);
17df50a5 1668 let r = _mm_round_ps::<_MM_FROUND_TO_ZERO>(a);
0531ce1d
XL
1669 let e = _mm_setr_ps(2.0, 4.0, -1.0, -4.0);
1670 assert_eq_m128(r, e);
1671 }
1672
83c7162d 1673 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1674 unsafe fn test_mm_round_sd() {
1675 let a = _mm_setr_pd(1.5, 3.5);
1676 let b = _mm_setr_pd(-2.5, -4.5);
1677 let old_mode = _MM_GET_ROUNDING_MODE();
1678 _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
17df50a5 1679 let r = _mm_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, b);
0531ce1d
XL
1680 _MM_SET_ROUNDING_MODE(old_mode);
1681 let e = _mm_setr_pd(-2.0, 3.5);
1682 assert_eq_m128d(r, e);
1683 }
1684
83c7162d 1685 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1686 unsafe fn test_mm_round_ss() {
1687 let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
1688 let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5);
1689 let old_mode = _MM_GET_ROUNDING_MODE();
1690 _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
17df50a5 1691 let r = _mm_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, b);
0531ce1d
XL
1692 _MM_SET_ROUNDING_MODE(old_mode);
1693 let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5);
1694 assert_eq_m128(r, e);
1695 }
1696
83c7162d 1697 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1698 unsafe fn test_mm_minpos_epu16_1() {
1699 let a = _mm_setr_epi16(23, 18, 44, 97, 50, 13, 67, 66);
1700 let r = _mm_minpos_epu16(a);
1701 let e = _mm_setr_epi16(13, 5, 0, 0, 0, 0, 0, 0);
1702 assert_eq_m128i(r, e);
1703 }
1704
83c7162d 1705 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1706 unsafe fn test_mm_minpos_epu16_2() {
1707 let a = _mm_setr_epi16(0, 18, 44, 97, 50, 13, 67, 66);
1708 let r = _mm_minpos_epu16(a);
1709 let e = _mm_setr_epi16(0, 0, 0, 0, 0, 0, 0, 0);
1710 assert_eq_m128i(r, e);
1711 }
1712
83c7162d 1713 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1714 unsafe fn test_mm_mul_epi32() {
1715 {
1716 let a = _mm_setr_epi32(1, 1, 1, 1);
1717 let b = _mm_setr_epi32(1, 2, 3, 4);
1718 let r = _mm_mul_epi32(a, b);
1719 let e = _mm_setr_epi64x(1, 3);
1720 assert_eq_m128i(r, e);
1721 }
1722 {
0731742a 1723 let a = _mm_setr_epi32(15, 2 /* ignored */, 1234567, 4 /* ignored */);
0531ce1d 1724 let b = _mm_setr_epi32(
8faf50e0
XL
1725 -20, -256, /* ignored */
1726 666666, 666666, /* ignored */
0531ce1d
XL
1727 );
1728 let r = _mm_mul_epi32(a, b);
1729 let e = _mm_setr_epi64x(-300, 823043843622);
1730 assert_eq_m128i(r, e);
1731 }
1732 }
1733
83c7162d 1734 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1735 unsafe fn test_mm_mullo_epi32() {
1736 {
1737 let a = _mm_setr_epi32(1, 1, 1, 1);
1738 let b = _mm_setr_epi32(1, 2, 3, 4);
1739 let r = _mm_mullo_epi32(a, b);
1740 let e = _mm_setr_epi32(1, 2, 3, 4);
1741 assert_eq_m128i(r, e);
1742 }
1743 {
1744 let a = _mm_setr_epi32(15, -2, 1234567, 99999);
1745 let b = _mm_setr_epi32(-20, -256, 666666, -99999);
1746 let r = _mm_mullo_epi32(a, b);
1747 // Attention, most significant bit in r[2] is treated
1748 // as a sign bit:
1749 // 1234567 * 666666 = -1589877210
1750 let e = _mm_setr_epi32(-300, 512, -1589877210, -1409865409);
1751 assert_eq_m128i(r, e);
1752 }
1753 }
1754
83c7162d 1755 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1756 unsafe fn test_mm_minpos_epu16() {
1757 let a = _mm_setr_epi16(8, 7, 6, 5, 4, 1, 2, 3);
1758 let r = _mm_minpos_epu16(a);
1759 let e = _mm_setr_epi16(1, 5, 0, 0, 0, 0, 0, 0);
1760 assert_eq_m128i(r, e);
1761 }
1762
83c7162d 1763 #[simd_test(enable = "sse4.1")]
0531ce1d 1764 unsafe fn test_mm_mpsadbw_epu8() {
0731742a 1765 #[rustfmt::skip]
0531ce1d
XL
1766 let a = _mm_setr_epi8(
1767 0, 1, 2, 3, 4, 5, 6, 7,
1768 8, 9, 10, 11, 12, 13, 14, 15,
1769 );
1770
17df50a5 1771 let r = _mm_mpsadbw_epu8::<0b000>(a, a);
0531ce1d
XL
1772 let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28);
1773 assert_eq_m128i(r, e);
1774
17df50a5 1775 let r = _mm_mpsadbw_epu8::<0b001>(a, a);
0531ce1d
XL
1776 let e = _mm_setr_epi16(16, 12, 8, 4, 0, 4, 8, 12);
1777 assert_eq_m128i(r, e);
1778
17df50a5 1779 let r = _mm_mpsadbw_epu8::<0b100>(a, a);
0531ce1d
XL
1780 let e = _mm_setr_epi16(16, 20, 24, 28, 32, 36, 40, 44);
1781 assert_eq_m128i(r, e);
1782
17df50a5 1783 let r = _mm_mpsadbw_epu8::<0b101>(a, a);
0531ce1d
XL
1784 let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28);
1785 assert_eq_m128i(r, e);
1786
17df50a5 1787 let r = _mm_mpsadbw_epu8::<0b111>(a, a);
0531ce1d
XL
1788 let e = _mm_setr_epi16(32, 28, 24, 20, 16, 12, 8, 4);
1789 assert_eq_m128i(r, e);
1790 }
1791
83c7162d 1792 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1793 unsafe fn test_mm_testz_si128() {
1794 let a = _mm_set1_epi8(1);
1795 let mask = _mm_set1_epi8(0);
1796 let r = _mm_testz_si128(a, mask);
1797 assert_eq!(r, 1);
1798 let a = _mm_set1_epi8(0b101);
1799 let mask = _mm_set1_epi8(0b110);
1800 let r = _mm_testz_si128(a, mask);
1801 assert_eq!(r, 0);
1802 let a = _mm_set1_epi8(0b011);
1803 let mask = _mm_set1_epi8(0b100);
1804 let r = _mm_testz_si128(a, mask);
1805 assert_eq!(r, 1);
1806 }
1807
83c7162d 1808 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1809 unsafe fn test_mm_testc_si128() {
1810 let a = _mm_set1_epi8(-1);
1811 let mask = _mm_set1_epi8(0);
1812 let r = _mm_testc_si128(a, mask);
1813 assert_eq!(r, 1);
1814 let a = _mm_set1_epi8(0b101);
1815 let mask = _mm_set1_epi8(0b110);
1816 let r = _mm_testc_si128(a, mask);
1817 assert_eq!(r, 0);
1818 let a = _mm_set1_epi8(0b101);
1819 let mask = _mm_set1_epi8(0b100);
1820 let r = _mm_testc_si128(a, mask);
1821 assert_eq!(r, 1);
1822 }
1823
83c7162d 1824 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1825 unsafe fn test_mm_testnzc_si128() {
1826 let a = _mm_set1_epi8(0);
1827 let mask = _mm_set1_epi8(1);
1828 let r = _mm_testnzc_si128(a, mask);
1829 assert_eq!(r, 0);
1830 let a = _mm_set1_epi8(-1);
1831 let mask = _mm_set1_epi8(0);
1832 let r = _mm_testnzc_si128(a, mask);
1833 assert_eq!(r, 0);
1834 let a = _mm_set1_epi8(0b101);
1835 let mask = _mm_set1_epi8(0b110);
1836 let r = _mm_testnzc_si128(a, mask);
1837 assert_eq!(r, 1);
1838 let a = _mm_set1_epi8(0b101);
1839 let mask = _mm_set1_epi8(0b101);
1840 let r = _mm_testnzc_si128(a, mask);
1841 assert_eq!(r, 0);
1842 }
1843
83c7162d 1844 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1845 unsafe fn test_mm_test_all_zeros() {
1846 let a = _mm_set1_epi8(1);
1847 let mask = _mm_set1_epi8(0);
1848 let r = _mm_test_all_zeros(a, mask);
1849 assert_eq!(r, 1);
1850 let a = _mm_set1_epi8(0b101);
1851 let mask = _mm_set1_epi8(0b110);
1852 let r = _mm_test_all_zeros(a, mask);
1853 assert_eq!(r, 0);
1854 let a = _mm_set1_epi8(0b011);
1855 let mask = _mm_set1_epi8(0b100);
1856 let r = _mm_test_all_zeros(a, mask);
1857 assert_eq!(r, 1);
1858 }
1859
83c7162d 1860 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1861 unsafe fn test_mm_test_all_ones() {
1862 let a = _mm_set1_epi8(-1);
1863 let r = _mm_test_all_ones(a);
1864 assert_eq!(r, 1);
1865 let a = _mm_set1_epi8(0b101);
1866 let r = _mm_test_all_ones(a);
1867 assert_eq!(r, 0);
1868 }
1869
83c7162d 1870 #[simd_test(enable = "sse4.1")]
0531ce1d
XL
1871 unsafe fn test_mm_test_mix_ones_zeros() {
1872 let a = _mm_set1_epi8(0);
1873 let mask = _mm_set1_epi8(1);
1874 let r = _mm_test_mix_ones_zeros(a, mask);
1875 assert_eq!(r, 0);
1876 let a = _mm_set1_epi8(-1);
1877 let mask = _mm_set1_epi8(0);
1878 let r = _mm_test_mix_ones_zeros(a, mask);
1879 assert_eq!(r, 0);
1880 let a = _mm_set1_epi8(0b101);
1881 let mask = _mm_set1_epi8(0b110);
1882 let r = _mm_test_mix_ones_zeros(a, mask);
1883 assert_eq!(r, 1);
1884 let a = _mm_set1_epi8(0b101);
1885 let mask = _mm_set1_epi8(0b101);
1886 let r = _mm_test_mix_ones_zeros(a, mask);
1887 assert_eq!(r, 0);
1888 }
1889}