//! Streaming SIMD Extensions 4.1 (SSE4.1)

use crate::{
    core_arch::{simd::*, simd_llvm::*, x86::*},
    mem::transmute,
};

#[cfg(test)]
use stdsimd_test::assert_instr;

// SSE4 rounding constants
/// round to nearest
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_NEAREST_INT: i32 = 0x00;
/// round down
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_NEG_INF: i32 = 0x01;
/// round up
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_POS_INF: i32 = 0x02;
/// truncate
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_ZERO: i32 = 0x03;
/// use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_CUR_DIRECTION: i32 = 0x04;
/// do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_RAISE_EXC: i32 = 0x00;
/// suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NO_EXC: i32 = 0x08;
/// round to nearest and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NINT: i32 = 0x00;
/// round down and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_FLOOR: i32 = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF);
/// round up and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_CEIL: i32 = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF);
/// truncate and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TRUNC: i32 = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO);
/// use MXCSR.RC and do not suppress exceptions; see
/// `_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_RINT: i32 = (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION);
/// use MXCSR.RC and suppress exceptions; see `_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NEARBYINT: i32 = (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION);
/// Blend packed 8-bit integers from `a` and `b` using `mask`
///
/// The high bit of each corresponding mask byte determines the selection.
/// If the high bit is set, the element of `b` is selected; otherwise, the
/// element of `a` is selected.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_epi8)
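///
/// A brief usage sketch (an editorial addition, not part of Intel's
/// documentation; it assumes a CPU with SSE4.1, hence the `ignore` fence):
///
/// ```ignore
/// use std::arch::x86_64::*; // or std::arch::x86 on 32-bit targets
/// unsafe {
///     let a = _mm_set1_epi8(1);
///     let b = _mm_set1_epi8(2);
///     // Mask bytes with the high bit set pick the element from `b`.
///     let mask = _mm_setr_epi8(0, -1, 0, -1, 0, -1, 0, -1,
///                              0, -1, 0, -1, 0, -1, 0, -1);
///     let r = _mm_blendv_epi8(a, b, mask); // [1, 2, 1, 2, ...]
/// }
/// ```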
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pblendvb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i {
    transmute(pblendvb(a.as_i8x16(), b.as_i8x16(), mask.as_i8x16()))
}

/// Blend packed 16-bit integers from `a` and `b` using the mask `imm8`.
///
/// The mask bits determine the selection. A clear bit selects the
/// corresponding element of `a`, and a set bit the corresponding
/// element of `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_epi16)
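///
/// A brief usage sketch (an editorial addition; values mirror this module's
/// own test, and SSE4.1 support is assumed):
///
/// ```ignore
/// use std::arch::x86_64::*;
/// unsafe {
///     let a = _mm_set1_epi16(0);
///     let b = _mm_set1_epi16(1);
///     // Bit k of the immediate picks element k from `b`.
///     let r = _mm_blend_epi16(a, b, 0b1010_1100);
///     // r = [0, 0, 1, 1, 0, 1, 0, 1]
/// }
/// ```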
#[inline]
#[target_feature(enable = "sse4.1")]
// Note: LLVM7 prefers the single-precision floating-point domain when possible
// see https://bugs.llvm.org/show_bug.cgi?id=38195
// #[cfg_attr(test, assert_instr(pblendw, imm8 = 0xF0))]
#[cfg_attr(test, assert_instr(blendps, imm8 = 0xF0))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blend_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
    let a = a.as_i16x8();
    let b = b.as_i16x8();
    macro_rules! call {
        ($imm8:expr) => {
            pblendw(a, b, $imm8)
        };
    }
    transmute(constify_imm8!(imm8, call))
}

/// Blend packed double-precision (64-bit) floating-point elements from `a`
/// and `b` using `mask`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendvpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d {
    blendvpd(a, b, mask)
}

/// Blend packed single-precision (32-bit) floating-point elements from `a`
/// and `b` using `mask`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendvps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 {
    blendvps(a, b, mask)
}

/// Blend packed double-precision (64-bit) floating-point elements from `a`
/// and `b` using control mask `imm2`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
// Note: LLVM7 prefers the single-precision floating-point domain when possible
// see https://bugs.llvm.org/show_bug.cgi?id=38195
// #[cfg_attr(test, assert_instr(blendpd, imm2 = 0b10))]
#[cfg_attr(test, assert_instr(blendps, imm2 = 0b10))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blend_pd(a: __m128d, b: __m128d, imm2: i32) -> __m128d {
    macro_rules! call {
        ($imm2:expr) => {
            blendpd(a, b, $imm2)
        };
    }
    constify_imm2!(imm2, call)
}

/// Blend packed single-precision (32-bit) floating-point elements from `a`
/// and `b` using mask `imm4`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendps, imm4 = 0b0101))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blend_ps(a: __m128, b: __m128, imm4: i32) -> __m128 {
    macro_rules! call {
        ($imm4:expr) => {
            blendps(a, b, $imm4)
        };
    }
    constify_imm4!(imm4, call)
}

/// Extracts a single-precision (32-bit) floating-point element from `a`,
/// selected with `imm8`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_ps)
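///
/// Note that the result is the raw bit pattern of the selected `f32` lane,
/// returned as an `i32`. A brief sketch (an editorial addition; values follow
/// this module's own test, SSE4.1 assumed):
///
/// ```ignore
/// use std::arch::x86_64::*;
/// unsafe {
///     let a = _mm_setr_ps(0.0, 1.0, 2.0, 3.0);
///     let bits = _mm_extract_ps(a, 1);
///     assert_eq!(f32::from_bits(bits as u32), 1.0);
/// }
/// ```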
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(extractps, imm8 = 0)
)]
#[rustc_args_required_const(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_extract_ps(a: __m128, imm8: i32) -> i32 {
    transmute(simd_extract::<_, f32>(a, imm8 as u32 & 0b11))
}

/// Extracts an 8-bit integer from `a`, selected with `imm8`. Returns a 32-bit
/// integer containing the zero-extended integer data.
///
/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pextrb, imm8 = 0))]
#[rustc_args_required_const(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_extract_epi8(a: __m128i, imm8: i32) -> i32 {
    let imm8 = (imm8 & 15) as u32;
    simd_extract::<_, u8>(a.as_u8x16(), imm8) as i32
}

/// Extracts a 32-bit integer from `a`, selected with `imm8`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(extractps, imm8 = 1)
)]
#[rustc_args_required_const(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_extract_epi32(a: __m128i, imm8: i32) -> i32 {
    let imm8 = (imm8 & 3) as u32;
    simd_extract::<_, i32>(a.as_i32x4(), imm8)
}

/// Select a single value in `b` to store at some position in `a`,
/// then zero elements according to `imm8`.
///
/// `imm8` specifies which bits from operand `b` will be copied, which bits in
/// the result they will be copied to, and which bits in the result will be
/// cleared. The following assignments are made:
///
/// * Bits `[7:6]` specify the bits to copy from operand `b`:
///     - `00`: Selects bits `[31:0]` from operand `b`.
///     - `01`: Selects bits `[63:32]` from operand `b`.
///     - `10`: Selects bits `[95:64]` from operand `b`.
///     - `11`: Selects bits `[127:96]` from operand `b`.
///
/// * Bits `[5:4]` specify the bits in the result to which the selected bits
/// from operand `b` are copied:
///     - `00`: Copies the selected bits from `b` to result bits `[31:0]`.
///     - `01`: Copies the selected bits from `b` to result bits `[63:32]`.
///     - `10`: Copies the selected bits from `b` to result bits `[95:64]`.
///     - `11`: Copies the selected bits from `b` to result bits `[127:96]`.
///
/// * Bits `[3:0]`: If any of these bits are set, the corresponding result
/// element is cleared.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_ps)
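///
/// A brief sketch of the immediate encoding (an editorial addition; values
/// follow this module's own test, SSE4.1 assumed):
///
/// ```ignore
/// use std::arch::x86_64::*;
/// unsafe {
///     let a = _mm_set1_ps(1.0);
///     let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
///     // 0b11_00_1100: take b[3], write it to result[0], clear elements 2 and 3.
///     let r = _mm_insert_ps(a, b, 0b11_00_1100);
///     // r = [4.0, 1.0, 0.0, 0.0]
/// }
/// ```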
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(insertps, imm8 = 0b1010))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_insert_ps(a: __m128, b: __m128, imm8: i32) -> __m128 {
    macro_rules! call {
        ($imm8:expr) => {
            insertps(a, b, $imm8)
        };
    }
    constify_imm8!(imm8, call)
}

/// Returns a copy of `a` with the 8-bit integer from `i` inserted at a
/// location specified by `imm8`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pinsrb, imm8 = 0))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_insert_epi8(a: __m128i, i: i32, imm8: i32) -> __m128i {
    transmute(simd_insert(a.as_i8x16(), (imm8 & 0b1111) as u32, i as i8))
}

/// Returns a copy of `a` with the 32-bit integer from `i` inserted at a
/// location specified by `imm8`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pinsrd, imm8 = 0))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_insert_epi32(a: __m128i, i: i32, imm8: i32) -> __m128i {
    transmute(simd_insert(a.as_i32x4(), (imm8 & 0b11) as u32, i))
}

/// Compares packed 8-bit integers in `a` and `b` and returns packed maximum
/// values in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epi8(a: __m128i, b: __m128i) -> __m128i {
    transmute(pmaxsb(a.as_i8x16(), b.as_i8x16()))
}

/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed
/// maximum.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epu16(a: __m128i, b: __m128i) -> __m128i {
    transmute(pmaxuw(a.as_u16x8(), b.as_u16x8()))
}

/// Compares packed 32-bit integers in `a` and `b`, and returns packed maximum
/// values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epi32(a: __m128i, b: __m128i) -> __m128i {
    transmute(pmaxsd(a.as_i32x4(), b.as_i32x4()))
}

/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed
/// maximum values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxud))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epu32(a: __m128i, b: __m128i) -> __m128i {
    transmute(pmaxud(a.as_u32x4(), b.as_u32x4()))
}

/// Compares packed 8-bit integers in `a` and `b` and returns packed minimum
/// values in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epi8(a: __m128i, b: __m128i) -> __m128i {
    transmute(pminsb(a.as_i8x16(), b.as_i8x16()))
}

/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed
/// minimum.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epu16(a: __m128i, b: __m128i) -> __m128i {
    transmute(pminuw(a.as_u16x8(), b.as_u16x8()))
}

/// Compares packed 32-bit integers in `a` and `b`, and returns packed minimum
/// values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epi32(a: __m128i, b: __m128i) -> __m128i {
    transmute(pminsd(a.as_i32x4(), b.as_i32x4()))
}

/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed
/// minimum values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminud))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i {
    transmute(pminud(a.as_u32x4(), b.as_u32x4()))
}

/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
/// using unsigned saturation
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_packus_epi32)
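///
/// A brief sketch of the saturating behavior (an editorial addition; values
/// mirror this module's own test, SSE4.1 assumed):
///
/// ```ignore
/// use std::arch::x86_64::*;
/// unsafe {
///     let a = _mm_setr_epi32(1, 2, 3, 4);
///     let b = _mm_setr_epi32(-1, -2, -3, -4);
///     // Negative inputs saturate to 0 in the unsigned 16-bit result.
///     let r = _mm_packus_epi32(a, b);
///     // r = [1, 2, 3, 4, 0, 0, 0, 0]
/// }
/// ```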
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(packusdw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_packus_epi32(a: __m128i, b: __m128i) -> __m128i {
    transmute(packusdw(a.as_i32x4(), b.as_i32x4()))
}

/// Compares packed 64-bit integers in `a` and `b` for equality
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pcmpeqq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpeq_epi64(a: __m128i, b: __m128i) -> __m128i {
    transmute(simd_eq::<_, i64x2>(a.as_i64x2(), b.as_i64x2()))
}

/// Sign extend packed 8-bit integers in `a` to packed 16-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi8_epi16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi8_epi16(a: __m128i) -> __m128i {
    let a = a.as_i8x16();
    let a = simd_shuffle8::<_, i8x8>(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
    transmute(simd_cast::<_, i16x8>(a))
}

/// Sign extend packed 8-bit integers in `a` to packed 32-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi8_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi8_epi32(a: __m128i) -> __m128i {
    let a = a.as_i8x16();
    let a = simd_shuffle4::<_, i8x4>(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}

/// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed
/// 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi8_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi8_epi64(a: __m128i) -> __m128i {
    let a = a.as_i8x16();
    let a = simd_shuffle2::<_, i8x2>(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Sign extend packed 16-bit integers in `a` to packed 32-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi16_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi16_epi32(a: __m128i) -> __m128i {
    let a = a.as_i16x8();
    let a = simd_shuffle4::<_, i16x4>(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}

/// Sign extend packed 16-bit integers in `a` to packed 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi16_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxwq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi16_epi64(a: __m128i) -> __m128i {
    let a = a.as_i16x8();
    let a = simd_shuffle2::<_, i16x2>(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Sign extend packed 32-bit integers in `a` to packed 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi32_epi64(a: __m128i) -> __m128i {
    let a = a.as_i32x4();
    let a = simd_shuffle2::<_, i32x2>(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Zero extend packed unsigned 8-bit integers in `a` to packed 16-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu8_epi16(a: __m128i) -> __m128i {
    let a = a.as_u8x16();
    let a = simd_shuffle8::<_, u8x8>(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
    transmute(simd_cast::<_, i16x8>(a))
}

/// Zero extend packed unsigned 8-bit integers in `a` to packed 32-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu8_epi32(a: __m128i) -> __m128i {
    let a = a.as_u8x16();
    let a = simd_shuffle4::<_, u8x4>(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}

/// Zero extend packed unsigned 8-bit integers in `a` to packed 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu8_epi64(a: __m128i) -> __m128i {
    let a = a.as_u8x16();
    let a = simd_shuffle2::<_, u8x2>(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Zero extend packed unsigned 16-bit integers in `a`
/// to packed 32-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu16_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu16_epi32(a: __m128i) -> __m128i {
    let a = a.as_u16x8();
    let a = simd_shuffle4::<_, u16x4>(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}

/// Zero extend packed unsigned 16-bit integers in `a`
/// to packed 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu16_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxwq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu16_epi64(a: __m128i) -> __m128i {
    let a = a.as_u16x8();
    let a = simd_shuffle2::<_, u16x2>(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Zero extend packed unsigned 32-bit integers in `a`
/// to packed 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu32_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu32_epi64(a: __m128i) -> __m128i {
    let a = a.as_u32x4();
    let a = simd_shuffle2::<_, u32x2>(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Returns the dot product of two `__m128d` vectors.
///
/// `imm8[1:0]` is the broadcast mask, and `imm8[5:4]` is the condition mask.
/// If a condition mask bit is zero, the corresponding multiplication is
/// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of
/// the dot product will be stored in the return value component. Otherwise if
/// the broadcast mask bit is zero then the return component will be zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(dppd, imm8 = 0))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_dp_pd(a: __m128d, b: __m128d, imm8: i32) -> __m128d {
    macro_rules! call {
        ($imm8:expr) => {
            dppd(a, b, $imm8)
        };
    }
    constify_imm8!(imm8, call)
}

/// Returns the dot product of two `__m128` vectors.
///
/// `imm8[3:0]` is the broadcast mask, and `imm8[7:4]` is the condition mask.
/// If a condition mask bit is zero, the corresponding multiplication is
/// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of
/// the dot product will be stored in the return value component. Otherwise if
/// the broadcast mask bit is zero then the return component will be zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps)
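///
/// A brief sketch of the two masks (an editorial addition; the arithmetic was
/// worked out by hand and SSE4.1 is assumed):
///
/// ```ignore
/// use std::arch::x86_64::*;
/// unsafe {
///     let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
///     let b = _mm_setr_ps(4.0, 3.0, 2.0, 1.0);
///     // Condition mask 0b0111: multiply lanes 0..3 (lane 3 contributes 0.0).
///     // Broadcast mask 0b0001: store the sum (4 + 6 + 6 = 16) only in lane 0.
///     let r = _mm_dp_ps(a, b, 0b0111_0001);
///     // r = [16.0, 0.0, 0.0, 0.0]
/// }
/// ```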
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(dpps, imm8 = 0))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_dp_ps(a: __m128, b: __m128, imm8: i32) -> __m128 {
    macro_rules! call {
        ($imm8:expr) => {
            dpps(a, b, $imm8)
        };
    }
    constify_imm8!(imm8, call)
}

/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// down to an integer value, and stores the results as packed double-precision
/// floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_pd(a: __m128d) -> __m128d {
    roundpd(a, _MM_FROUND_FLOOR)
}

/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// down to an integer value, and stores the results as packed single-precision
/// floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_ps(a: __m128) -> __m128 {
    roundps(a, _MM_FROUND_FLOOR)
}

/// Round the lower double-precision (64-bit) floating-point element in `b`
/// down to an integer value, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element of the intrinsic
/// result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_sd(a: __m128d, b: __m128d) -> __m128d {
    roundsd(a, b, _MM_FROUND_FLOOR)
}

/// Round the lower single-precision (32-bit) floating-point element in `b`
/// down to an integer value, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_ss(a: __m128, b: __m128) -> __m128 {
    roundss(a, b, _MM_FROUND_FLOOR)
}

/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// up to an integer value, and stores the results as packed double-precision
/// floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_pd(a: __m128d) -> __m128d {
    roundpd(a, _MM_FROUND_CEIL)
}

/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// up to an integer value, and stores the results as packed single-precision
/// floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_ps(a: __m128) -> __m128 {
    roundps(a, _MM_FROUND_CEIL)
}

/// Round the lower double-precision (64-bit) floating-point element in `b`
/// up to an integer value, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element
/// of the intrinsic result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_sd(a: __m128d, b: __m128d) -> __m128d {
    roundsd(a, b, _MM_FROUND_CEIL)
}

/// Round the lower single-precision (32-bit) floating-point element in `b`
/// up to an integer value, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_ss(a: __m128, b: __m128) -> __m128 {
    roundss(a, b, _MM_FROUND_CEIL)
}

/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// using the `rounding` parameter, and stores the results as packed
/// double-precision floating-point elements.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd, rounding = 0))]
#[rustc_args_required_const(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_pd(a: __m128d, rounding: i32) -> __m128d {
    macro_rules! call {
        ($imm4:expr) => {
            roundpd(a, $imm4)
        };
    }
    constify_imm4!(rounding, call)
}

/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// using the `rounding` parameter, and stores the results as packed
/// single-precision floating-point elements.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps, rounding = 0))]
#[rustc_args_required_const(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_ps(a: __m128, rounding: i32) -> __m128 {
    macro_rules! call {
        ($imm4:expr) => {
            roundps(a, $imm4)
        };
    }
    constify_imm4!(rounding, call)
}

/// Round the lower double-precision (64-bit) floating-point element in `b`
/// using the `rounding` parameter, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element of the intrinsic
/// result.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd, rounding = 0))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d {
    macro_rules! call {
        ($imm4:expr) => {
            roundsd(a, b, $imm4)
        };
    }
    constify_imm4!(rounding, call)
}

/// Round the lower single-precision (32-bit) floating-point element in `b`
/// using the `rounding` parameter, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss, rounding = 0))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 {
    macro_rules! call {
        ($imm4:expr) => {
            roundss(a, b, $imm4)
        };
    }
    constify_imm4!(rounding, call)
}

/// Finds the minimum unsigned 16-bit element in the 128-bit `__m128i` vector,
/// returning a vector containing its value in its first position, and its
/// index in its second position; all other elements are set to zero.
///
/// This intrinsic corresponds to the `VPHMINPOSUW` / `PHMINPOSUW`
/// instruction.
///
/// Arguments:
///
/// * `a` - A 128-bit vector of type `__m128i`.
///
/// Returns:
///
/// A 128-bit value where:
///
/// * bits `[15:0]` - contain the minimum value found in parameter `a`,
/// * bits `[18:16]` - contain the index of the minimum value
/// * remaining bits are set to `0`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16)
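///
/// A brief usage sketch (an editorial addition with hand-picked values;
/// SSE4.1 assumed):
///
/// ```ignore
/// use std::arch::x86_64::*;
/// unsafe {
///     let a = _mm_setr_epi16(23, 18, 44, 97, 50, 13, 67, 66);
///     let r = _mm_minpos_epu16(a);
///     // r = [13, 5, 0, 0, 0, 0, 0, 0]: minimum value 13, found at index 5
/// }
/// ```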
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(phminposuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_minpos_epu16(a: __m128i) -> __m128i {
    transmute(phminposuw(a.as_u16x8()))
}

/// Multiplies the low 32-bit integers from each packed 64-bit
/// element in `a` and `b`, and returns the signed 64-bit result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmuldq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mul_epi32(a: __m128i, b: __m128i) -> __m128i {
    transmute(pmuldq(a.as_i32x4(), b.as_i32x4()))
}

/// Multiplies the packed 32-bit integers in `a` and `b`, producing
/// intermediate 64-bit integers, and returns the low 32 bits of each
/// intermediate result, reinterpreted as a signed integer. While
/// `pmulld __m128i::splat(2), __m128i::splat(2)` returns the obvious
/// `__m128i::splat(4)`, due to wrapping arithmetic `pmulld
/// __m128i::splat(i32::MAX), __m128i::splat(2)` would return a negative number.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mullo_epi32)
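///
/// A brief sketch of the wrapping behavior described above (an editorial
/// addition; SSE4.1 assumed):
///
/// ```ignore
/// use std::arch::x86_64::*;
/// unsafe {
///     let a = _mm_set1_epi32(i32::max_value());
///     let b = _mm_set1_epi32(2);
///     // 2 * i32::MAX overflows; only the low 32 bits are kept.
///     let r = _mm_mullo_epi32(a, b); // each lane is -2
/// }
/// ```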
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmulld))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mullo_epi32(a: __m128i, b: __m128i) -> __m128i {
    transmute(simd_mul(a.as_i32x4(), b.as_i32x4()))
}

/// Subtracts 8-bit unsigned integer values and computes the absolute
/// values of the differences. The sums of those absolute differences are
/// then returned according to the bit fields in the immediate operand.
///
/// The following algorithm is performed:
///
/// ```ignore
/// i = imm8[2] * 4
/// j = imm8[1:0] * 4
/// for k := 0 to 7
///     d0 = abs(a[i + k + 0] - b[j + 0])
///     d1 = abs(a[i + k + 1] - b[j + 1])
///     d2 = abs(a[i + k + 2] - b[j + 2])
///     d3 = abs(a[i + k + 3] - b[j + 3])
///     r[k] = d0 + d1 + d2 + d3
/// ```
///
/// Arguments:
///
/// * `a` - A 128-bit vector of type `__m128i`.
/// * `b` - A 128-bit vector of type `__m128i`.
/// * `imm8` - An 8-bit immediate operand specifying how the absolute
///   differences are to be calculated
///     * Bit `[2]` specifies the offset for operand `a`
///     * Bits `[1:0]` specify the offset for operand `b`
///
/// Returns:
///
/// * A `__m128i` vector containing the sums of the sets of absolute
///   differences between both operands.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mpsadbw_epu8)
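///
/// A brief sketch tracing the algorithm above (an editorial addition; the
/// expected values were worked out by hand from the pseudocode, SSE4.1
/// assumed):
///
/// ```ignore
/// use std::arch::x86_64::*;
/// unsafe {
///     let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7,
///                           8, 9, 10, 11, 12, 13, 14, 15);
///     // With imm8 = 0 both offsets are 0, so
///     // r[k] = sum of |a[k + l] - a[l]| for l in 0..4, i.e. 4 * k.
///     let r = _mm_mpsadbw_epu8(a, a, 0);
///     // r = [0, 4, 8, 12, 16, 20, 24, 28]
/// }
/// ```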
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(mpsadbw, imm8 = 0))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mpsadbw_epu8(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
    let a = a.as_u8x16();
    let b = b.as_u8x16();
    macro_rules! call {
        ($imm8:expr) => {
            mpsadbw(a, b, $imm8)
        };
    }
    transmute(constify_imm3!(imm8, call))
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// zeros.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
/// operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all zeros,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128)
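///
/// A brief usage sketch (an editorial addition; SSE4.1 assumed):
///
/// ```ignore
/// use std::arch::x86_64::*;
/// unsafe {
///     let a = _mm_set1_epi32(0b0101);
///     let mask = _mm_set1_epi32(0b1010);
///     // `a & mask` is all zeros, so the result is 1.
///     assert_eq!(_mm_testz_si128(a, mask), 1);
/// }
/// ```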
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 {
    ptestz(a.as_i64x2(), mask.as_i64x2())
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
/// operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_testc_si128(a: __m128i, mask: __m128i) -> i32 {
    ptestc(a.as_i64x2(), mask.as_i64x2())
}

/// Tests whether the specified bits in a 128-bit integer vector are
/// neither all zeros nor all ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
/// operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are neither all zeros nor all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testnzc_si128)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_testnzc_si128(a: __m128i, mask: __m128i) -> i32 {
    ptestnzc(a.as_i64x2(), mask.as_i64x2())
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// zeros.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
/// operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all zeros,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_test_all_zeros(a: __m128i, mask: __m128i) -> i32 {
    _mm_testz_si128(a, mask)
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// ones.
///
/// Argument:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
///
/// Returns:
///
/// * `1` - if the bits specified in the operand are all set to 1,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones)
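///
/// A brief usage sketch (an editorial addition; SSE4.1 assumed):
///
/// ```ignore
/// use std::arch::x86_64::*;
/// unsafe {
///     assert_eq!(_mm_test_all_ones(_mm_set1_epi32(-1)), 1);
///     assert_eq!(_mm_test_all_ones(_mm_setr_epi32(-1, 0, -1, -1)), 0);
/// }
/// ```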
1101 #[inline]
1102 #[target_feature(enable = "sse4.1")]
1103 #[cfg_attr(test, assert_instr(pcmpeqd))]
1104 #[cfg_attr(test, assert_instr(ptest))]
1105 #[stable(feature = "simd_x86", since = "1.27.0")]
1106 pub unsafe fn _mm_test_all_ones(a: __m128i) -> i32 {
1107 _mm_testc_si128(a, _mm_cmpeq_epi32(a, a))
1108 }
1109
1110 /// Tests whether the specified bits in a 128-bit integer vector are
1111 /// neither all zeros nor all ones.
1112 ///
1113 /// Arguments:
1114 ///
1115 /// * `a` - A 128-bit integer vector containing the bits to be tested.
1116 /// * `mask` - A 128-bit integer vector selecting which bits to test in
1117 /// operand `a`.
1118 ///
1119 /// Returns:
1120 ///
1121 /// * `1` - if the specified bits are neither all zeros nor all ones,
1122 /// * `0` - otherwise.
1123 ///
1124 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_mix_ones_zeros)
1125 #[inline]
1126 #[target_feature(enable = "sse4.1")]
1127 #[cfg_attr(test, assert_instr(ptest))]
1128 #[stable(feature = "simd_x86", since = "1.27.0")]
1129 pub unsafe fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 {
1130 _mm_testnzc_si128(a, mask)
1131 }
1132
1133 #[allow(improper_ctypes)]
1134 extern "C" {
1135 #[link_name = "llvm.x86.sse41.pblendvb"]
1136 fn pblendvb(a: i8x16, b: i8x16, mask: i8x16) -> i8x16;
1137 #[link_name = "llvm.x86.sse41.blendvpd"]
1138 fn blendvpd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d;
1139 #[link_name = "llvm.x86.sse41.blendvps"]
1140 fn blendvps(a: __m128, b: __m128, mask: __m128) -> __m128;
1141 #[link_name = "llvm.x86.sse41.blendpd"]
1142 fn blendpd(a: __m128d, b: __m128d, imm2: u8) -> __m128d;
1143 #[link_name = "llvm.x86.sse41.blendps"]
1144 fn blendps(a: __m128, b: __m128, imm4: u8) -> __m128;
1145 #[link_name = "llvm.x86.sse41.pblendw"]
1146 fn pblendw(a: i16x8, b: i16x8, imm8: u8) -> i16x8;
1147 #[link_name = "llvm.x86.sse41.insertps"]
1148 fn insertps(a: __m128, b: __m128, imm8: u8) -> __m128;
1149 #[link_name = "llvm.x86.sse41.pmaxsb"]
1150 fn pmaxsb(a: i8x16, b: i8x16) -> i8x16;
1151 #[link_name = "llvm.x86.sse41.pmaxuw"]
1152 fn pmaxuw(a: u16x8, b: u16x8) -> u16x8;
1153 #[link_name = "llvm.x86.sse41.pmaxsd"]
1154 fn pmaxsd(a: i32x4, b: i32x4) -> i32x4;
1155 #[link_name = "llvm.x86.sse41.pmaxud"]
1156 fn pmaxud(a: u32x4, b: u32x4) -> u32x4;
1157 #[link_name = "llvm.x86.sse41.pminsb"]
1158 fn pminsb(a: i8x16, b: i8x16) -> i8x16;
1159 #[link_name = "llvm.x86.sse41.pminuw"]
1160 fn pminuw(a: u16x8, b: u16x8) -> u16x8;
1161 #[link_name = "llvm.x86.sse41.pminsd"]
1162 fn pminsd(a: i32x4, b: i32x4) -> i32x4;
1163 #[link_name = "llvm.x86.sse41.pminud"]
1164 fn pminud(a: u32x4, b: u32x4) -> u32x4;
1165 #[link_name = "llvm.x86.sse41.packusdw"]
1166 fn packusdw(a: i32x4, b: i32x4) -> u16x8;
1167 #[link_name = "llvm.x86.sse41.dppd"]
1168 fn dppd(a: __m128d, b: __m128d, imm8: u8) -> __m128d;
1169 #[link_name = "llvm.x86.sse41.dpps"]
1170 fn dpps(a: __m128, b: __m128, imm8: u8) -> __m128;
1171 #[link_name = "llvm.x86.sse41.round.pd"]
1172 fn roundpd(a: __m128d, rounding: i32) -> __m128d;
1173 #[link_name = "llvm.x86.sse41.round.ps"]
1174 fn roundps(a: __m128, rounding: i32) -> __m128;
1175 #[link_name = "llvm.x86.sse41.round.sd"]
1176 fn roundsd(a: __m128d, b: __m128d, rounding: i32) -> __m128d;
1177 #[link_name = "llvm.x86.sse41.round.ss"]
1178 fn roundss(a: __m128, b: __m128, rounding: i32) -> __m128;
1179 #[link_name = "llvm.x86.sse41.phminposuw"]
1180 fn phminposuw(a: u16x8) -> u16x8;
1181 #[link_name = "llvm.x86.sse41.pmuldq"]
1182 fn pmuldq(a: i32x4, b: i32x4) -> i64x2;
1183 #[link_name = "llvm.x86.sse41.mpsadbw"]
1184 fn mpsadbw(a: u8x16, b: u8x16, imm8: u8) -> u16x8;
1185 #[link_name = "llvm.x86.sse41.ptestz"]
1186 fn ptestz(a: i64x2, mask: i64x2) -> i32;
1187 #[link_name = "llvm.x86.sse41.ptestc"]
1188 fn ptestc(a: i64x2, mask: i64x2) -> i32;
1189 #[link_name = "llvm.x86.sse41.ptestnzc"]
1190 fn ptestnzc(a: i64x2, mask: i64x2) -> i32;
1191 }
1192
1193 #[cfg(test)]
1194 mod tests {
1195 use crate::core_arch::x86::*;
1196 use std::mem;
1197 use stdsimd_test::simd_test;
1198
1199 #[simd_test(enable = "sse4.1")]
1200 unsafe fn test_mm_blendv_epi8() {
1201 #[rustfmt::skip]
1202 let a = _mm_setr_epi8(
1203 0, 1, 2, 3, 4, 5, 6, 7,
1204 8, 9, 10, 11, 12, 13, 14, 15,
1205 );
1206 #[rustfmt::skip]
1207 let b = _mm_setr_epi8(
1208 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
1209 );
1210 #[rustfmt::skip]
1211 let mask = _mm_setr_epi8(
1212 0, -1, 0, -1, 0, -1, 0, -1,
1213 0, -1, 0, -1, 0, -1, 0, -1,
1214 );
1215 #[rustfmt::skip]
1216 let e = _mm_setr_epi8(
1217 0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31,
1218 );
1219 assert_eq_m128i(_mm_blendv_epi8(a, b, mask), e);
1220 }
1221
1222 #[simd_test(enable = "sse4.1")]
1223 unsafe fn test_mm_blendv_pd() {
1224 let a = _mm_set1_pd(0.0);
1225 let b = _mm_set1_pd(1.0);
1226 let mask = transmute(_mm_setr_epi64x(0, -1));
1227 let r = _mm_blendv_pd(a, b, mask);
1228 let e = _mm_setr_pd(0.0, 1.0);
1229 assert_eq_m128d(r, e);
1230 }
1231
1232 #[simd_test(enable = "sse4.1")]
1233 unsafe fn test_mm_blendv_ps() {
1234 let a = _mm_set1_ps(0.0);
1235 let b = _mm_set1_ps(1.0);
1236 let mask = transmute(_mm_setr_epi32(0, -1, 0, -1));
1237 let r = _mm_blendv_ps(a, b, mask);
1238 let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0);
1239 assert_eq_m128(r, e);
1240 }
1241
1242 #[simd_test(enable = "sse4.1")]
1243 unsafe fn test_mm_blend_pd() {
1244 let a = _mm_set1_pd(0.0);
1245 let b = _mm_set1_pd(1.0);
1246 let r = _mm_blend_pd(a, b, 0b10);
1247 let e = _mm_setr_pd(0.0, 1.0);
1248 assert_eq_m128d(r, e);
1249 }
1250
1251 #[simd_test(enable = "sse4.1")]
1252 unsafe fn test_mm_blend_ps() {
1253 let a = _mm_set1_ps(0.0);
1254 let b = _mm_set1_ps(1.0);
1255 let r = _mm_blend_ps(a, b, 0b1010);
1256 let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0);
1257 assert_eq_m128(r, e);
1258 }
1259
1260 #[simd_test(enable = "sse4.1")]
1261 unsafe fn test_mm_blend_epi16() {
1262 let a = _mm_set1_epi16(0);
1263 let b = _mm_set1_epi16(1);
1264 let r = _mm_blend_epi16(a, b, 0b1010_1100);
1265 let e = _mm_setr_epi16(0, 0, 1, 1, 0, 1, 0, 1);
1266 assert_eq_m128i(r, e);
1267 }
1268
1269 #[simd_test(enable = "sse4.1")]
1270 unsafe fn test_mm_extract_ps() {
1271 let a = _mm_setr_ps(0.0, 1.0, 2.0, 3.0);
1272 let r: f32 = transmute(_mm_extract_ps(a, 1));
1273 assert_eq!(r, 1.0);
1274 let r: f32 = transmute(_mm_extract_ps(a, 5));
1275 assert_eq!(r, 1.0);
1276 }
1277
1278 #[simd_test(enable = "sse4.1")]
1279 unsafe fn test_mm_extract_epi8() {
1280 #[rustfmt::skip]
1281 let a = _mm_setr_epi8(
1282 -1, 1, 2, 3, 4, 5, 6, 7,
1283 8, 9, 10, 11, 12, 13, 14, 15
1284 );
1285 let r1 = _mm_extract_epi8(a, 0);
1286 let r2 = _mm_extract_epi8(a, 19);
1287 assert_eq!(r1, 0xFF);
1288 assert_eq!(r2, 3);
1289 }
1290
1291 #[simd_test(enable = "sse4.1")]
1292 unsafe fn test_mm_extract_epi32() {
1293 let a = _mm_setr_epi32(0, 1, 2, 3);
1294 let r = _mm_extract_epi32(a, 1);
1295 assert_eq!(r, 1);
1296 let r = _mm_extract_epi32(a, 5);
1297 assert_eq!(r, 1);
1298 }
1299
1300 #[simd_test(enable = "sse4.1")]
1301 unsafe fn test_mm_insert_ps() {
1302 let a = _mm_set1_ps(1.0);
1303 let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
1304 let r = _mm_insert_ps(a, b, 0b11_00_1100);
1305 let e = _mm_setr_ps(4.0, 1.0, 0.0, 0.0);
1306 assert_eq_m128(r, e);
1307 }
1308
1309 #[simd_test(enable = "sse4.1")]
1310 unsafe fn test_mm_insert_epi8() {
1311 let a = _mm_set1_epi8(0);
1312 let e = _mm_setr_epi8(0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
1313 let r = _mm_insert_epi8(a, 32, 1);
1314 assert_eq_m128i(r, e);
1315 let r = _mm_insert_epi8(a, 32, 17);
1316 assert_eq_m128i(r, e);
1317 }
1318
1319 #[simd_test(enable = "sse4.1")]
1320 unsafe fn test_mm_insert_epi32() {
1321 let a = _mm_set1_epi32(0);
1322 let e = _mm_setr_epi32(0, 32, 0, 0);
1323 let r = _mm_insert_epi32(a, 32, 1);
1324 assert_eq_m128i(r, e);
1325 let r = _mm_insert_epi32(a, 32, 5);
1326 assert_eq_m128i(r, e);
1327 }
1328
1329 #[simd_test(enable = "sse4.1")]
1330 unsafe fn test_mm_max_epi8() {
1331 #[rustfmt::skip]
1332 let a = _mm_setr_epi8(
1333 1, 4, 5, 8, 9, 12, 13, 16,
1334 17, 20, 21, 24, 25, 28, 29, 32,
1335 );
1336 #[rustfmt::skip]
1337 let b = _mm_setr_epi8(
1338 2, 3, 6, 7, 10, 11, 14, 15,
1339 18, 19, 22, 23, 26, 27, 30, 31,
1340 );
1341 let r = _mm_max_epi8(a, b);
1342 #[rustfmt::skip]
1343 let e = _mm_setr_epi8(
1344 2, 4, 6, 8, 10, 12, 14, 16,
1345 18, 20, 22, 24, 26, 28, 30, 32,
1346 );
1347 assert_eq_m128i(r, e);
1348 }
1349
1350 #[simd_test(enable = "sse4.1")]
1351 unsafe fn test_mm_max_epu16() {
1352 let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16);
1353 let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15);
1354 let r = _mm_max_epu16(a, b);
1355 let e = _mm_setr_epi16(2, 4, 6, 8, 10, 12, 14, 16);
1356 assert_eq_m128i(r, e);
1357 }
1358
1359 #[simd_test(enable = "sse4.1")]
1360 unsafe fn test_mm_max_epi32() {
1361 let a = _mm_setr_epi32(1, 4, 5, 8);
1362 let b = _mm_setr_epi32(2, 3, 6, 7);
1363 let r = _mm_max_epi32(a, b);
1364 let e = _mm_setr_epi32(2, 4, 6, 8);
1365 assert_eq_m128i(r, e);
1366 }
1367
1368 #[simd_test(enable = "sse4.1")]
1369 unsafe fn test_mm_max_epu32() {
1370 let a = _mm_setr_epi32(1, 4, 5, 8);
1371 let b = _mm_setr_epi32(2, 3, 6, 7);
1372 let r = _mm_max_epu32(a, b);
1373 let e = _mm_setr_epi32(2, 4, 6, 8);
1374 assert_eq_m128i(r, e);
1375 }
1376
1377 #[simd_test(enable = "sse4.1")]
1378 unsafe fn test_mm_min_epi8_1() {
1379 #[rustfmt::skip]
1380 let a = _mm_setr_epi8(
1381 1, 4, 5, 8, 9, 12, 13, 16,
1382 17, 20, 21, 24, 25, 28, 29, 32,
1383 );
1384 #[rustfmt::skip]
1385 let b = _mm_setr_epi8(
1386 2, 3, 6, 7, 10, 11, 14, 15,
1387 18, 19, 22, 23, 26, 27, 30, 31,
1388 );
1389 let r = _mm_min_epi8(a, b);
1390 #[rustfmt::skip]
1391 let e = _mm_setr_epi8(
1392 1, 3, 5, 7, 9, 11, 13, 15,
1393 17, 19, 21, 23, 25, 27, 29, 31,
1394 );
1395 assert_eq_m128i(r, e);
1396 }
1397
1398 #[simd_test(enable = "sse4.1")]
1399 unsafe fn test_mm_min_epi8_2() {
1400 #[rustfmt::skip]
1401 let a = _mm_setr_epi8(
1402 1, -4, -5, 8, -9, -12, 13, -16,
1403 17, 20, 21, 24, 25, 28, 29, 32,
1404 );
1405 #[rustfmt::skip]
1406 let b = _mm_setr_epi8(
1407 2, -3, -6, 7, -10, -11, 14, -15,
1408 18, 19, 22, 23, 26, 27, 30, 31,
1409 );
1410 let r = _mm_min_epi8(a, b);
1411 #[rustfmt::skip]
1412 let e = _mm_setr_epi8(
1413 1, -4, -6, 7, -10, -12, 13, -16,
1414 17, 19, 21, 23, 25, 27, 29, 31,
1415 );
1416 assert_eq_m128i(r, e);
1417 }
1418
1419 #[simd_test(enable = "sse4.1")]
1420 unsafe fn test_mm_min_epu16() {
1421 let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16);
1422 let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15);
1423 let r = _mm_min_epu16(a, b);
1424 let e = _mm_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15);
1425 assert_eq_m128i(r, e);
1426 }
1427
1428 #[simd_test(enable = "sse4.1")]
1429 unsafe fn test_mm_min_epi32_1() {
1430 let a = _mm_setr_epi32(1, 4, 5, 8);
1431 let b = _mm_setr_epi32(2, 3, 6, 7);
1432 let r = _mm_min_epi32(a, b);
1433 let e = _mm_setr_epi32(1, 3, 5, 7);
1434 assert_eq_m128i(r, e);
1435 }
1436
1437 #[simd_test(enable = "sse4.1")]
1438 unsafe fn test_mm_min_epi32_2() {
1439 let a = _mm_setr_epi32(-1, 4, 5, -7);
1440 let b = _mm_setr_epi32(-2, 3, -6, 8);
1441 let r = _mm_min_epi32(a, b);
1442 let e = _mm_setr_epi32(-2, 3, -6, -7);
1443 assert_eq_m128i(r, e);
1444 }
1445
1446 #[simd_test(enable = "sse4.1")]
1447 unsafe fn test_mm_min_epu32() {
1448 let a = _mm_setr_epi32(1, 4, 5, 8);
1449 let b = _mm_setr_epi32(2, 3, 6, 7);
1450 let r = _mm_min_epu32(a, b);
1451 let e = _mm_setr_epi32(1, 3, 5, 7);
1452 assert_eq_m128i(r, e);
1453 }
1454
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_packus_epi32() {
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let b = _mm_setr_epi32(-1, -2, -3, -4);
        let r = _mm_packus_epi32(a, b);
        let e = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

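    // `_mm_cmpeq_epi64` compares 64-bit lanes for equality, producing
    // all-ones (-1 when read as a signed integer) for equal lanes and 0
    // otherwise.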
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cmpeq_epi64() {
        let a = _mm_setr_epi64x(0, 1);
        let b = _mm_setr_epi64x(0, 0);
        let r = _mm_cmpeq_epi64(a, b);
        let e = _mm_setr_epi64x(-1, 0);
        assert_eq_m128i(r, e);
    }

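    // The `_mm_cvtepi*` widening conversions sign-extend the lowest
    // lanes of the source, so each test below checks a positive and a
    // negative input.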
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi8_epi16() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepi8_epi16(a);
        let e = _mm_set1_epi16(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi8(-10);
        let r = _mm_cvtepi8_epi16(a);
        let e = _mm_set1_epi16(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi8_epi32() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepi8_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi8(-10);
        let r = _mm_cvtepi8_epi32(a);
        let e = _mm_set1_epi32(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi8_epi64() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepi8_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi8(-10);
        let r = _mm_cvtepi8_epi64(a);
        let e = _mm_set1_epi64x(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi16_epi32() {
        let a = _mm_set1_epi16(10);
        let r = _mm_cvtepi16_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi16(-10);
        let r = _mm_cvtepi16_epi32(a);
        let e = _mm_set1_epi32(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi16_epi64() {
        let a = _mm_set1_epi16(10);
        let r = _mm_cvtepi16_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi16(-10);
        let r = _mm_cvtepi16_epi64(a);
        let e = _mm_set1_epi64x(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi32_epi64() {
        let a = _mm_set1_epi32(10);
        let r = _mm_cvtepi32_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi32(-10);
        let r = _mm_cvtepi32_epi64(a);
        let e = _mm_set1_epi64x(-10);
        assert_eq_m128i(r, e);
    }

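    // The `_mm_cvtepu*` conversions zero-extend instead, treating the
    // source lanes as unsigned, so a single non-negative case suffices
    // for each.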
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu8_epi16() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepu8_epi16(a);
        let e = _mm_set1_epi16(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu8_epi32() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepu8_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu8_epi64() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepu8_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu16_epi32() {
        let a = _mm_set1_epi16(10);
        let r = _mm_cvtepu16_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu16_epi64() {
        let a = _mm_set1_epi16(10);
        let r = _mm_cvtepu16_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu32_epi64() {
        let a = _mm_set1_epi32(10);
        let r = _mm_cvtepu32_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
    }

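    // `_mm_dp_pd`/`_mm_dp_ps` use the immediate as two masks: the high
    // nibble selects which lane products enter the sum, and the low
    // nibble selects which result lanes receive the sum (the rest are
    // zeroed). E.g. for `_mm_dp_pd(a, b, 0b00110001)` below:
    //   sum = a[0]*b[0] + a[1]*b[1] = 2.0*1.0 + 3.0*4.0 = 14.0,
    // stored only in lane 0.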
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_dp_pd() {
        let a = _mm_setr_pd(2.0, 3.0);
        let b = _mm_setr_pd(1.0, 4.0);
        let e = _mm_setr_pd(14.0, 0.0);
        assert_eq_m128d(_mm_dp_pd(a, b, 0b00110001), e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_dp_ps() {
        let a = _mm_setr_ps(2.0, 3.0, 1.0, 10.0);
        let b = _mm_setr_ps(1.0, 4.0, 0.5, 10.0);
        let e = _mm_setr_ps(14.5, 0.0, 14.5, 0.0);
        assert_eq_m128(_mm_dp_ps(a, b, 0b01110101), e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_floor_pd() {
        let a = _mm_setr_pd(2.5, 4.5);
        let r = _mm_floor_pd(a);
        let e = _mm_setr_pd(2.0, 4.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_floor_ps() {
        let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5);
        let r = _mm_floor_ps(a);
        let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0);
        assert_eq_m128(r, e);
    }

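    // The `_sd`/`_ss` scalar variants that follow round only `b[0]`,
    // copying the remaining lanes from `a`; the same pattern applies to
    // the `_mm_ceil_*` and `_mm_round_*` scalar tests further down.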
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_floor_sd() {
        let a = _mm_setr_pd(2.5, 4.5);
        let b = _mm_setr_pd(-1.5, -3.5);
        let r = _mm_floor_sd(a, b);
        let e = _mm_setr_pd(-2.0, 4.5);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_floor_ss() {
        let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5);
        let b = _mm_setr_ps(-1.5, -3.5, -7.5, -15.5);
        let r = _mm_floor_ss(a, b);
        let e = _mm_setr_ps(-2.0, 4.5, 8.5, 16.5);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_ceil_pd() {
        let a = _mm_setr_pd(1.5, 3.5);
        let r = _mm_ceil_pd(a);
        let e = _mm_setr_pd(2.0, 4.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_ceil_ps() {
        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
        let r = _mm_ceil_ps(a);
        let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_ceil_sd() {
        let a = _mm_setr_pd(1.5, 3.5);
        let b = _mm_setr_pd(-2.5, -4.5);
        let r = _mm_ceil_sd(a, b);
        let e = _mm_setr_pd(-2.0, 3.5);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_ceil_ss() {
        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
        let b = _mm_setr_ps(-2.5, -4.5, -8.5, -16.5);
        let r = _mm_ceil_ss(a, b);
        let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5);
        assert_eq_m128(r, e);
    }

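    // `_mm_round_pd`/`_mm_round_ps` take the rounding mode from the
    // immediate (`_MM_FROUND_TO_NEAREST_INT`, `_MM_FROUND_TO_ZERO`,
    // ...), independent of the current MXCSR setting.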
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_round_pd() {
        let a = _mm_setr_pd(1.25, 3.75);
        let r = _mm_round_pd(a, _MM_FROUND_TO_NEAREST_INT);
        let e = _mm_setr_pd(1.0, 4.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_round_ps() {
        let a = _mm_setr_ps(2.25, 4.75, -1.75, -4.25);
        let r = _mm_round_ps(a, _MM_FROUND_TO_ZERO);
        let e = _mm_setr_ps(2.0, 4.0, -1.0, -4.0);
        assert_eq_m128(r, e);
    }

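    // With `_MM_FROUND_CUR_DIRECTION` the rounding mode comes from
    // MXCSR.RC instead, so the next two tests set it explicitly via
    // `_MM_SET_ROUNDING_MODE` and restore the previous mode afterwards
    // to avoid leaking state into other tests.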
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_round_sd() {
        let a = _mm_setr_pd(1.5, 3.5);
        let b = _mm_setr_pd(-2.5, -4.5);
        let old_mode = _MM_GET_ROUNDING_MODE();
        _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
        let r = _mm_round_sd(a, b, _MM_FROUND_CUR_DIRECTION);
        _MM_SET_ROUNDING_MODE(old_mode);
        let e = _mm_setr_pd(-2.0, 3.5);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_round_ss() {
        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
        let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5);
        let old_mode = _MM_GET_ROUNDING_MODE();
        _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
        let r = _mm_round_ss(a, b, _MM_FROUND_CUR_DIRECTION);
        _MM_SET_ROUNDING_MODE(old_mode);
        let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5);
        assert_eq_m128(r, e);
    }

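    // `_mm_minpos_epu16` returns the minimum unsigned 16-bit lane in
    // result lane 0 and the index of its first occurrence in lane 1;
    // the remaining lanes are zeroed.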
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_minpos_epu16_1() {
        let a = _mm_setr_epi16(23, 18, 44, 97, 50, 13, 67, 66);
        let r = _mm_minpos_epu16(a);
        let e = _mm_setr_epi16(13, 5, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_minpos_epu16_2() {
        let a = _mm_setr_epi16(0, 18, 44, 97, 50, 13, 67, 66);
        let r = _mm_minpos_epu16(a);
        let e = _mm_setr_epi16(0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

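    // `_mm_mul_epi32` is a widening multiply: it multiplies only the
    // even-indexed 32-bit lanes (0 and 2) and produces two full 64-bit
    // products, whereas `_mm_mullo_epi32` multiplies all four lanes and
    // keeps only the low 32 bits of each product.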
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_mul_epi32() {
        {
            let a = _mm_setr_epi32(1, 1, 1, 1);
            let b = _mm_setr_epi32(1, 2, 3, 4);
            let r = _mm_mul_epi32(a, b);
            let e = _mm_setr_epi64x(1, 3);
            assert_eq_m128i(r, e);
        }
        {
            let a = _mm_setr_epi32(15, 2 /* ignored */, 1234567, 4 /* ignored */);
            let b = _mm_setr_epi32(
                -20, -256 /* ignored */,
                666666, 666666 /* ignored */,
            );
            let r = _mm_mul_epi32(a, b);
            let e = _mm_setr_epi64x(-300, 823043843622);
            assert_eq_m128i(r, e);
        }
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_mullo_epi32() {
        {
            let a = _mm_setr_epi32(1, 1, 1, 1);
            let b = _mm_setr_epi32(1, 2, 3, 4);
            let r = _mm_mullo_epi32(a, b);
            let e = _mm_setr_epi32(1, 2, 3, 4);
            assert_eq_m128i(r, e);
        }
        {
            let a = _mm_setr_epi32(15, -2, 1234567, 99999);
            let b = _mm_setr_epi32(-20, -256, 666666, -99999);
            let r = _mm_mullo_epi32(a, b);
            // Note: the full product 1234567 * 666666 = 823043843622 does
            // not fit in 32 bits; its low 32 bits have the most significant
            // bit set, so r[2] reads back as the negative value -1589877210.
            let e = _mm_setr_epi32(-300, 512, -1589877210, -1409865409);
            assert_eq_m128i(r, e);
        }
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_minpos_epu16() {
        let a = _mm_setr_epi16(8, 7, 6, 5, 4, 1, 2, 3);
        let r = _mm_minpos_epu16(a);
        let e = _mm_setr_epi16(1, 5, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

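    // `_mm_mpsadbw_epu8(a, b, imm8)` computes eight 16-bit sums of
    // absolute differences. Bits 1:0 of `imm8` pick a 4-byte block of
    // `b` (offset 4 * (imm8 & 3)) and bit 2 picks the starting offset
    // in `a` (offset 4 * ((imm8 >> 2) & 1)); result lane k is
    //   sum over j in 0..4 of |a[aoff + k + j] - b[boff + j]|.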
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_mpsadbw_epu8() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );

        let r = _mm_mpsadbw_epu8(a, a, 0b000);
        let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28);
        assert_eq_m128i(r, e);

        let r = _mm_mpsadbw_epu8(a, a, 0b001);
        let e = _mm_setr_epi16(16, 12, 8, 4, 0, 4, 8, 12);
        assert_eq_m128i(r, e);

        let r = _mm_mpsadbw_epu8(a, a, 0b100);
        let e = _mm_setr_epi16(16, 20, 24, 28, 32, 36, 40, 44);
        assert_eq_m128i(r, e);

        let r = _mm_mpsadbw_epu8(a, a, 0b101);
        let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28);
        assert_eq_m128i(r, e);

        let r = _mm_mpsadbw_epu8(a, a, 0b111);
        let e = _mm_setr_epi16(32, 28, 24, 20, 16, 12, 8, 4);
        assert_eq_m128i(r, e);
    }

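    // The PTEST family reports flag results as integers:
    // `_mm_testz_si128` returns 1 iff (a AND mask) is all zeros (ZF),
    // `_mm_testc_si128` returns 1 iff (NOT a AND mask) is all zeros
    // (CF), and `_mm_testnzc_si128` returns 1 iff both of those are
    // nonzero, i.e. `mask` selects a mix of set and clear bits in `a`.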
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_testz_si128() {
        let a = _mm_set1_epi8(1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_testz_si128(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_testz_si128(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b011);
        let mask = _mm_set1_epi8(0b100);
        let r = _mm_testz_si128(a, mask);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_testc_si128() {
        let a = _mm_set1_epi8(-1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_testc_si128(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_testc_si128(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b100);
        let r = _mm_testc_si128(a, mask);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_testnzc_si128() {
        let a = _mm_set1_epi8(0);
        let mask = _mm_set1_epi8(1);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(-1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b101);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 0);
    }

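    // `_mm_test_all_zeros`, `_mm_test_all_ones` and
    // `_mm_test_mix_ones_zeros` are convenience wrappers over the same
    // PTEST flags: all-zeros matches `_mm_testz_si128`, mix matches
    // `_mm_testnzc_si128`, and all-ones tests `a` against an all-ones
    // mask, returning 1 only when every bit of `a` is set.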
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_test_all_zeros() {
        let a = _mm_set1_epi8(1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_test_all_zeros(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_test_all_zeros(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b011);
        let mask = _mm_set1_epi8(0b100);
        let r = _mm_test_all_zeros(a, mask);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_test_all_ones() {
        let a = _mm_set1_epi8(-1);
        let r = _mm_test_all_ones(a);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let r = _mm_test_all_ones(a);
        assert_eq!(r, 0);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_test_mix_ones_zeros() {
        let a = _mm_set1_epi8(0);
        let mask = _mm_set1_epi8(1);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(-1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b101);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 0);
    }
}
1909 }