//! Streaming SIMD Extensions 4.1 (SSE4.1)

use crate::{
    core_arch::{simd::*, simd_llvm::*, x86::*},
    mem::transmute,
};

#[cfg(test)]
use stdarch_test::assert_instr;

// SSE4 rounding constants
/// round to nearest
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_NEAREST_INT: i32 = 0x00;
/// round down
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_NEG_INF: i32 = 0x01;
/// round up
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_POS_INF: i32 = 0x02;
/// truncate
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_ZERO: i32 = 0x03;
/// use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_CUR_DIRECTION: i32 = 0x04;
/// do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_RAISE_EXC: i32 = 0x00;
/// suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NO_EXC: i32 = 0x08;
/// round to nearest and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NINT: i32 = 0x00;
/// round down and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_FLOOR: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF;
/// round up and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_CEIL: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF;
/// truncate and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TRUNC: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO;
/// use MXCSR.RC and do not suppress exceptions; see
/// `_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_RINT: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION;
/// use MXCSR.RC and suppress exceptions; see `_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NEARBYINT: i32 = _MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION;

/// Blend packed 8-bit integers from `a` and `b` using `mask`
///
/// The high bit of each corresponding mask byte determines the selection.
/// If the high bit is set the element of `b` is selected. The element
/// of `a` is selected otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_epi8)
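///
/// A minimal usage sketch (values chosen only for illustration): with the
/// high bit set in every mask byte, each lane comes from `b`:
///
/// ```
/// #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
/// {
///     #[cfg(target_arch = "x86")]
///     use std::arch::x86::*;
///     #[cfg(target_arch = "x86_64")]
///     use std::arch::x86_64::*;
///
///     if is_x86_feature_detected!("sse4.1") {
///         unsafe {
///             let a = _mm_set1_epi8(1);
///             let b = _mm_set1_epi8(2);
///             let mask = _mm_set1_epi8(-1); // high bit set in every byte
///             let r = _mm_blendv_epi8(a, b, mask);
///             assert_eq!(_mm_extract_epi8(r, 0), 2);
///         }
///     }
/// }
/// ```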
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pblendvb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i {
    transmute(pblendvb(a.as_i8x16(), b.as_i8x16(), mask.as_i8x16()))
}

/// Blend packed 16-bit integers from `a` and `b` using the mask `imm8`.
///
/// The mask bits determine the selection. A clear bit selects the
/// corresponding element of `a`, and a set bit the corresponding
/// element of `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_epi16)
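///
/// A minimal sketch (mask chosen only for illustration): `0b0000_0001` takes
/// just the lowest 16-bit lane from `b`:
///
/// ```
/// #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
/// {
///     #[cfg(target_arch = "x86")]
///     use std::arch::x86::*;
///     #[cfg(target_arch = "x86_64")]
///     use std::arch::x86_64::*;
///
///     if is_x86_feature_detected!("sse4.1") {
///         unsafe {
///             let a = _mm_set1_epi16(0);
///             let b = _mm_set1_epi16(7);
///             let r = _mm_blend_epi16(a, b, 0b0000_0001);
///             assert_eq!(_mm_extract_epi16(r, 0), 7); // lane 0 from `b`
///             assert_eq!(_mm_extract_epi16(r, 1), 0); // lane 1 from `a`
///         }
///     }
/// }
/// ```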
#[inline]
#[target_feature(enable = "sse4.1")]
// Note: LLVM7 prefers the single-precision floating-point domain when possible
// see https://bugs.llvm.org/show_bug.cgi?id=38195
// #[cfg_attr(test, assert_instr(pblendw, imm8 = 0xF0))]
#[cfg_attr(test, assert_instr(blendps, imm8 = 0xF0))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blend_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
    let a = a.as_i16x8();
    let b = b.as_i16x8();
    macro_rules! call {
        ($imm8:expr) => {
            pblendw(a, b, $imm8)
        };
    }
    transmute(constify_imm8!(imm8, call))
}

/// Blend packed double-precision (64-bit) floating-point elements from `a`
/// and `b` using `mask`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendvpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d {
    blendvpd(a, b, mask)
}

/// Blend packed single-precision (32-bit) floating-point elements from `a`
/// and `b` using `mask`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendvps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 {
    blendvps(a, b, mask)
}

/// Blend packed double-precision (64-bit) floating-point elements from `a`
/// and `b` using control mask `imm2`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
// Note: LLVM7 prefers the single-precision floating-point domain when possible
// see https://bugs.llvm.org/show_bug.cgi?id=38195
// #[cfg_attr(test, assert_instr(blendpd, imm2 = 0b10))]
#[cfg_attr(test, assert_instr(blendps, imm2 = 0b10))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blend_pd(a: __m128d, b: __m128d, imm2: i32) -> __m128d {
    macro_rules! call {
        ($imm2:expr) => {
            blendpd(a, b, $imm2)
        };
    }
    constify_imm2!(imm2, call)
}

/// Blend packed single-precision (32-bit) floating-point elements from `a`
/// and `b` using mask `imm4`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendps, imm4 = 0b0101))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blend_ps(a: __m128, b: __m128, imm4: i32) -> __m128 {
    macro_rules! call {
        ($imm4:expr) => {
            blendps(a, b, $imm4)
        };
    }
    constify_imm4!(imm4, call)
}

/// Extracts a single-precision (32-bit) floating-point element from `a`,
/// selected with `imm8`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_ps)
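///
/// A minimal sketch (values chosen only for illustration): the lane comes back
/// as its raw bit pattern in an `i32`, so `f32::from_bits` recovers the value:
///
/// ```
/// #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
/// {
///     #[cfg(target_arch = "x86")]
///     use std::arch::x86::*;
///     #[cfg(target_arch = "x86_64")]
///     use std::arch::x86_64::*;
///
///     if is_x86_feature_detected!("sse4.1") {
///         unsafe {
///             let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
///             let bits = _mm_extract_ps(a, 2) as u32;
///             assert_eq!(f32::from_bits(bits), 3.0);
///         }
///     }
/// }
/// ```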
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(extractps, imm8 = 0)
)]
#[rustc_args_required_const(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_extract_ps(a: __m128, imm8: i32) -> i32 {
    macro_rules! call {
        ($imm2:expr) => {
            transmute(simd_extract::<_, f32>(a, $imm2))
        };
    }
    constify_imm2!(imm8, call)
}

/// Extracts an 8-bit integer from `a`, selected with `imm8`. Returns a 32-bit
/// integer containing the zero-extended integer data.
///
/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pextrb, imm8 = 0))]
#[rustc_args_required_const(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_extract_epi8(a: __m128i, imm8: i32) -> i32 {
    let a = a.as_u8x16();
    macro_rules! call {
        ($imm4:expr) => {
            simd_extract::<_, u8>(a, $imm4) as i32
        };
    }
    constify_imm4!(imm8, call)
}

/// Extracts a 32-bit integer from `a` selected with `imm8`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(extractps, imm8 = 1)
)]
#[rustc_args_required_const(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_extract_epi32(a: __m128i, imm8: i32) -> i32 {
    let a = a.as_i32x4();
    macro_rules! call {
        ($imm2:expr) => {
            simd_extract::<_, i32>(a, $imm2)
        };
    }
    constify_imm2!(imm8, call)
}

/// Selects a single value in `b` to store at some position in `a`,
/// then zeroes elements according to `imm8`.
///
/// `imm8` specifies which bits from operand `b` will be copied, which bits in
/// the result they will be copied to, and which bits in the result will be
/// cleared. The following assignments are made:
///
/// * Bits `[7:6]` specify the bits to copy from operand `b`:
///     - `00`: Selects bits `[31:0]` from operand `b`.
///     - `01`: Selects bits `[63:32]` from operand `b`.
///     - `10`: Selects bits `[95:64]` from operand `b`.
///     - `11`: Selects bits `[127:96]` from operand `b`.
///
/// * Bits `[5:4]` specify the bits in the result to which the selected bits
///   from operand `b` are copied:
///     - `00`: Copies the selected bits from `b` to result bits `[31:0]`.
///     - `01`: Copies the selected bits from `b` to result bits `[63:32]`.
///     - `10`: Copies the selected bits from `b` to result bits `[95:64]`.
///     - `11`: Copies the selected bits from `b` to result bits `[127:96]`.
///
/// * Bits `[3:0]`: If any of these bits are set, the corresponding result
///   element is cleared.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_ps)
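///
/// A minimal sketch of the bit fields (mask chosen only for illustration):
/// `0b11_00_0000` copies lane 3 of `b` into lane 0 of the result and clears
/// no lanes:
///
/// ```
/// #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
/// {
///     #[cfg(target_arch = "x86")]
///     use std::arch::x86::*;
///     #[cfg(target_arch = "x86_64")]
///     use std::arch::x86_64::*;
///
///     if is_x86_feature_detected!("sse4.1") {
///         unsafe {
///             let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
///             let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
///             let r = _mm_insert_ps(a, b, 0b11_00_0000);
///             // lane 0 now holds b's lane 3; the other lanes keep a's values
///             assert_eq!(_mm_cvtss_f32(r), 8.0);
///         }
///     }
/// }
/// ```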
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(insertps, imm8 = 0b1010))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_insert_ps(a: __m128, b: __m128, imm8: i32) -> __m128 {
    macro_rules! call {
        ($imm8:expr) => {
            insertps(a, b, $imm8)
        };
    }
    constify_imm8!(imm8, call)
}

/// Returns a copy of `a` with the 8-bit integer from `i` inserted at a
/// location specified by `imm8`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pinsrb, imm8 = 0))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_insert_epi8(a: __m128i, i: i32, imm8: i32) -> __m128i {
    let a = a.as_i8x16();
    macro_rules! call {
        ($imm4:expr) => {
            transmute(simd_insert(a, $imm4, i as i8))
        };
    }
    constify_imm4!(imm8, call)
}

/// Returns a copy of `a` with the 32-bit integer from `i` inserted at a
/// location specified by `imm8`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pinsrd, imm8 = 0))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_insert_epi32(a: __m128i, i: i32, imm8: i32) -> __m128i {
    let a = a.as_i32x4();
    macro_rules! call {
        ($imm2:expr) => {
            transmute(simd_insert(a, $imm2, i))
        };
    }
    constify_imm2!(imm8, call)
}

/// Compares packed 8-bit integers in `a` and `b` and returns packed maximum
/// values in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epi8(a: __m128i, b: __m128i) -> __m128i {
    transmute(pmaxsb(a.as_i8x16(), b.as_i8x16()))
}

/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed
/// maximum.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epu16(a: __m128i, b: __m128i) -> __m128i {
    transmute(pmaxuw(a.as_u16x8(), b.as_u16x8()))
}

/// Compares packed 32-bit integers in `a` and `b`, and returns packed maximum
/// values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epi32(a: __m128i, b: __m128i) -> __m128i {
    transmute(pmaxsd(a.as_i32x4(), b.as_i32x4()))
}

/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed
/// maximum values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxud))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epu32(a: __m128i, b: __m128i) -> __m128i {
    transmute(pmaxud(a.as_u32x4(), b.as_u32x4()))
}

/// Compares packed 8-bit integers in `a` and `b` and returns packed minimum
/// values in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epi8(a: __m128i, b: __m128i) -> __m128i {
    transmute(pminsb(a.as_i8x16(), b.as_i8x16()))
}

/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed
/// minimum.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epu16(a: __m128i, b: __m128i) -> __m128i {
    transmute(pminuw(a.as_u16x8(), b.as_u16x8()))
}

/// Compares packed 32-bit integers in `a` and `b`, and returns packed minimum
/// values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epi32(a: __m128i, b: __m128i) -> __m128i {
    transmute(pminsd(a.as_i32x4(), b.as_i32x4()))
}

/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed
/// minimum values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminud))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i {
    transmute(pminud(a.as_u32x4(), b.as_u32x4()))
}

/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
/// using unsigned saturation
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_packus_epi32)
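///
/// A minimal sketch of the saturation (values chosen only for illustration):
/// negative inputs clamp to `0` and values above `65535` clamp to `65535`:
///
/// ```
/// #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
/// {
///     #[cfg(target_arch = "x86")]
///     use std::arch::x86::*;
///     #[cfg(target_arch = "x86_64")]
///     use std::arch::x86_64::*;
///
///     if is_x86_feature_detected!("sse4.1") {
///         unsafe {
///             let a = _mm_setr_epi32(-1, 70_000, 1, 2);
///             let r = _mm_packus_epi32(a, a);
///             assert_eq!(_mm_extract_epi16(r, 0), 0); // -1 saturates to 0
///             assert_eq!(_mm_extract_epi16(r, 1), 65_535); // 70_000 saturates
///         }
///     }
/// }
/// ```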
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(packusdw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_packus_epi32(a: __m128i, b: __m128i) -> __m128i {
    transmute(packusdw(a.as_i32x4(), b.as_i32x4()))
}

/// Compares packed 64-bit integers in `a` and `b` for equality
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pcmpeqq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpeq_epi64(a: __m128i, b: __m128i) -> __m128i {
    transmute(simd_eq::<_, i64x2>(a.as_i64x2(), b.as_i64x2()))
}

/// Sign extend packed 8-bit integers in `a` to packed 16-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi8_epi16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi8_epi16(a: __m128i) -> __m128i {
    let a = a.as_i8x16();
    let a = simd_shuffle8::<_, i8x8>(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
    transmute(simd_cast::<_, i16x8>(a))
}

/// Sign extend packed 8-bit integers in `a` to packed 32-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi8_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi8_epi32(a: __m128i) -> __m128i {
    let a = a.as_i8x16();
    let a = simd_shuffle4::<_, i8x4>(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}

/// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed
/// 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi8_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi8_epi64(a: __m128i) -> __m128i {
    let a = a.as_i8x16();
    let a = simd_shuffle2::<_, i8x2>(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Sign extend packed 16-bit integers in `a` to packed 32-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi16_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi16_epi32(a: __m128i) -> __m128i {
    let a = a.as_i16x8();
    let a = simd_shuffle4::<_, i16x4>(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}

/// Sign extend packed 16-bit integers in `a` to packed 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi16_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxwq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi16_epi64(a: __m128i) -> __m128i {
    let a = a.as_i16x8();
    let a = simd_shuffle2::<_, i16x2>(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Sign extend packed 32-bit integers in `a` to packed 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi32_epi64(a: __m128i) -> __m128i {
    let a = a.as_i32x4();
    let a = simd_shuffle2::<_, i32x2>(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Zero extend packed unsigned 8-bit integers in `a` to packed 16-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu8_epi16(a: __m128i) -> __m128i {
    let a = a.as_u8x16();
    let a = simd_shuffle8::<_, u8x8>(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
    transmute(simd_cast::<_, i16x8>(a))
}

/// Zero extend packed unsigned 8-bit integers in `a` to packed 32-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu8_epi32(a: __m128i) -> __m128i {
    let a = a.as_u8x16();
    let a = simd_shuffle4::<_, u8x4>(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}

/// Zero extend packed unsigned 8-bit integers in `a` to packed 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu8_epi64(a: __m128i) -> __m128i {
    let a = a.as_u8x16();
    let a = simd_shuffle2::<_, u8x2>(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Zero extend packed unsigned 16-bit integers in `a`
/// to packed 32-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu16_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu16_epi32(a: __m128i) -> __m128i {
    let a = a.as_u16x8();
    let a = simd_shuffle4::<_, u16x4>(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}

/// Zero extend packed unsigned 16-bit integers in `a`
/// to packed 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu16_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxwq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu16_epi64(a: __m128i) -> __m128i {
    let a = a.as_u16x8();
    let a = simd_shuffle2::<_, u16x2>(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Zero extend packed unsigned 32-bit integers in `a`
/// to packed 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu32_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu32_epi64(a: __m128i) -> __m128i {
    let a = a.as_u32x4();
    let a = simd_shuffle2::<_, u32x2>(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Returns the dot product of two `__m128d` vectors.
///
/// `imm8[1:0]` is the broadcast mask, and `imm8[5:4]` is the condition mask.
/// If a condition mask bit is zero, the corresponding multiplication is
/// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of
/// the dot product will be stored in the return value component. Otherwise if
/// the broadcast mask bit is zero then the return component will be zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(dppd, imm8 = 0))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_dp_pd(a: __m128d, b: __m128d, imm8: i32) -> __m128d {
    macro_rules! call {
        ($imm8:expr) => {
            dppd(a, b, $imm8)
        };
    }
    constify_imm8!(imm8, call)
}

/// Returns the dot product of two `__m128` vectors.
///
/// `imm8[3:0]` is the broadcast mask, and `imm8[7:4]` is the condition mask.
/// If a condition mask bit is zero, the corresponding multiplication is
/// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of
/// the dot product will be stored in the return value component. Otherwise if
/// the broadcast mask bit is zero then the return component will be zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps)
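///
/// A minimal sketch (mask chosen only for illustration): `0xFF` multiplies all
/// four lane pairs, sums them, and broadcasts the sum to every result lane:
///
/// ```
/// #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
/// {
///     #[cfg(target_arch = "x86")]
///     use std::arch::x86::*;
///     #[cfg(target_arch = "x86_64")]
///     use std::arch::x86_64::*;
///
///     if is_x86_feature_detected!("sse4.1") {
///         unsafe {
///             let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
///             let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
///             let r = _mm_dp_ps(a, b, 0xFF);
///             // 1*5 + 2*6 + 3*7 + 4*8 = 70
///             assert_eq!(_mm_cvtss_f32(r), 70.0);
///         }
///     }
/// }
/// ```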
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(dpps, imm8 = 0))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_dp_ps(a: __m128, b: __m128, imm8: i32) -> __m128 {
    macro_rules! call {
        ($imm8:expr) => {
            dpps(a, b, $imm8)
        };
    }
    constify_imm8!(imm8, call)
}

/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// down to an integer value, and stores the results as packed double-precision
/// floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_pd(a: __m128d) -> __m128d {
    simd_floor(a)
}

/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// down to an integer value, and stores the results as packed single-precision
/// floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_ps(a: __m128) -> __m128 {
    simd_floor(a)
}

/// Round the lower double-precision (64-bit) floating-point element in `b`
/// down to an integer value, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element of the intrinsic
/// result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_sd(a: __m128d, b: __m128d) -> __m128d {
    roundsd(a, b, _MM_FROUND_FLOOR)
}

/// Round the lower single-precision (32-bit) floating-point element in `b`
/// down to an integer value, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_ss(a: __m128, b: __m128) -> __m128 {
    roundss(a, b, _MM_FROUND_FLOOR)
}

/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// up to an integer value, and stores the results as packed double-precision
/// floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_pd(a: __m128d) -> __m128d {
    simd_ceil(a)
}

/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// up to an integer value, and stores the results as packed single-precision
/// floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_ps(a: __m128) -> __m128 {
    simd_ceil(a)
}

/// Round the lower double-precision (64-bit) floating-point element in `b`
/// up to an integer value, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element
/// of the intrinsic result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_sd(a: __m128d, b: __m128d) -> __m128d {
    roundsd(a, b, _MM_FROUND_CEIL)
}

/// Round the lower single-precision (32-bit) floating-point element in `b`
/// up to an integer value, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_ss(a: __m128, b: __m128) -> __m128 {
    roundss(a, b, _MM_FROUND_CEIL)
}

/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// using the `rounding` parameter, and stores the results as packed
/// double-precision floating-point elements.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd, rounding = 0))]
#[rustc_args_required_const(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_pd(a: __m128d, rounding: i32) -> __m128d {
    macro_rules! call {
        ($imm4:expr) => {
            roundpd(a, $imm4)
        };
    }
    constify_imm4!(rounding, call)
}

/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// using the `rounding` parameter, and stores the results as packed
/// single-precision floating-point elements.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps, rounding = 0))]
#[rustc_args_required_const(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_ps(a: __m128, rounding: i32) -> __m128 {
    macro_rules! call {
        ($imm4:expr) => {
            roundps(a, $imm4)
        };
    }
    constify_imm4!(rounding, call)
}

/// Round the lower double-precision (64-bit) floating-point element in `b`
/// using the `rounding` parameter, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element of the intrinsic
/// result.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd, rounding = 0))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d {
    macro_rules! call {
        ($imm4:expr) => {
            roundsd(a, b, $imm4)
        };
    }
    constify_imm4!(rounding, call)
}

/// Round the lower single-precision (32-bit) floating-point element in `b`
/// using the `rounding` parameter, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss, rounding = 0))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 {
    macro_rules! call {
        ($imm4:expr) => {
            roundss(a, b, $imm4)
        };
    }
    constify_imm4!(rounding, call)
}

/// Finds the minimum unsigned 16-bit element in the 128-bit `__m128i` vector,
/// returning a vector containing its value in its first position, and its
/// index in its second position; all other elements are set to zero.
///
/// This intrinsic corresponds to the `VPHMINPOSUW` / `PHMINPOSUW`
/// instruction.
///
/// Arguments:
///
/// * `a` - A 128-bit vector of type `__m128i`.
///
/// Returns:
///
/// A 128-bit value where:
///
/// * bits `[15:0]` - contain the minimum value found in parameter `a`,
/// * bits `[18:16]` - contain the index of the minimum value
/// * remaining bits are set to `0`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16)
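///
/// A minimal sketch (values chosen only for illustration): the minimum `9`
/// sits at index `3`, so lane 0 of the result is `9` and lane 1 is `3`:
///
/// ```
/// #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
/// {
///     #[cfg(target_arch = "x86")]
///     use std::arch::x86::*;
///     #[cfg(target_arch = "x86_64")]
///     use std::arch::x86_64::*;
///
///     if is_x86_feature_detected!("sse4.1") {
///         unsafe {
///             let a = _mm_setr_epi16(23, 18, 44, 9, 12, 77, 51, 30);
///             let r = _mm_minpos_epu16(a);
///             assert_eq!(_mm_extract_epi16(r, 0), 9); // minimum value
///             assert_eq!(_mm_extract_epi16(r, 1), 3); // its index
///         }
///     }
/// }
/// ```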
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(phminposuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_minpos_epu16(a: __m128i) -> __m128i {
    transmute(phminposuw(a.as_u16x8()))
}

/// Multiplies the low 32-bit integers from each packed 64-bit
/// element in `a` and `b`, and returns the signed 64-bit result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmuldq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mul_epi32(a: __m128i, b: __m128i) -> __m128i {
    transmute(pmuldq(a.as_i32x4(), b.as_i32x4()))
}

/// Multiplies the packed 32-bit integers in `a` and `b`, producing
/// intermediate 64-bit integers, and returns the low 32 bits of each
/// intermediate result, reinterpreted as a signed integer. The arithmetic
/// wraps: while `pmulld __m128i::splat(2), __m128i::splat(2)` returns the
/// obvious `__m128i::splat(4)`, `pmulld __m128i::splat(i32::MAX),
/// __m128i::splat(2)` returns a negative number.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mullo_epi32)
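///
/// A minimal sketch of the wrapping behavior (values chosen only for
/// illustration):
///
/// ```
/// #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
/// {
///     #[cfg(target_arch = "x86")]
///     use std::arch::x86::*;
///     #[cfg(target_arch = "x86_64")]
///     use std::arch::x86_64::*;
///
///     if is_x86_feature_detected!("sse4.1") {
///         unsafe {
///             let a = _mm_set1_epi32(i32::MAX);
///             let r = _mm_mullo_epi32(a, _mm_set1_epi32(2));
///             // the low 32 bits of i32::MAX * 2 reinterpret as -2
///             assert_eq!(_mm_extract_epi32(r, 0), -2);
///         }
///     }
/// }
/// ```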
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmulld))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mullo_epi32(a: __m128i, b: __m128i) -> __m128i {
    transmute(simd_mul(a.as_i32x4(), b.as_i32x4()))
}

/// Subtracts 8-bit unsigned integer values in `b` from the corresponding
/// values in `a` and computes the absolute values of the differences.
/// Sums of four absolute differences at a time are then written to the
/// destination, with the operand offsets controlled by the bit fields in
/// the immediate operand.
///
/// The following algorithm is performed:
///
/// ```ignore
/// i = imm8[2] * 4
/// j = imm8[1:0] * 4
/// for k := 0 to 7
///     d0 = abs(a[i + k + 0] - b[j + 0])
///     d1 = abs(a[i + k + 1] - b[j + 1])
///     d2 = abs(a[i + k + 2] - b[j + 2])
///     d3 = abs(a[i + k + 3] - b[j + 3])
///     r[k] = d0 + d1 + d2 + d3
/// ```
///
/// Arguments:
///
/// * `a` - A 128-bit vector of type `__m128i`.
/// * `b` - A 128-bit vector of type `__m128i`.
/// * `imm8` - An 8-bit immediate operand specifying how the absolute
///   differences are to be calculated
///     * Bit `[2]` specifies the offset for operand `a`
///     * Bits `[1:0]` specify the offset for operand `b`
///
/// Returns:
///
/// * A `__m128i` vector containing the sums of the sets of absolute
///   differences between both operands.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mpsadbw_epu8)
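///
/// A minimal sketch (values chosen only for illustration): with `imm8 = 0`
/// both offsets are zero, so `r[0]` compares `a[0..4]` against `b[0..4]`:
///
/// ```
/// #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
/// {
///     #[cfg(target_arch = "x86")]
///     use std::arch::x86::*;
///     #[cfg(target_arch = "x86_64")]
///     use std::arch::x86_64::*;
///
///     if is_x86_feature_detected!("sse4.1") {
///         unsafe {
///             let a = _mm_setr_epi8(1, 2, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
///             let b = _mm_setr_epi8(5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
///             let r = _mm_mpsadbw_epu8(a, b, 0);
///             // |1-5| + |2-5| + |3-5| + |4-5| = 10
///             assert_eq!(_mm_extract_epi16(r, 0), 10);
///         }
///     }
/// }
/// ```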
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(mpsadbw, imm8 = 0))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mpsadbw_epu8(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
    let a = a.as_u8x16();
    let b = b.as_u8x16();
    macro_rules! call {
        ($imm8:expr) => {
            mpsadbw(a, b, $imm8)
        };
    }
    transmute(constify_imm3!(imm8, call))
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// zeros.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all zeros,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128)
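///
/// A minimal sketch (values chosen only for illustration): `a & mask` is zero
/// here, so the result is `1`:
///
/// ```
/// #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
/// {
///     #[cfg(target_arch = "x86")]
///     use std::arch::x86::*;
///     #[cfg(target_arch = "x86_64")]
///     use std::arch::x86_64::*;
///
///     if is_x86_feature_detected!("sse4.1") {
///         unsafe {
///             let a = _mm_setr_epi32(0b0101, 0, 0, 0);
///             let mask = _mm_setr_epi32(0b1010, 0, 0, 0);
///             assert_eq!(_mm_testz_si128(a, mask), 1);
///         }
///     }
/// }
/// ```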
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 {
    ptestz(a.as_i64x2(), mask.as_i64x2())
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_testc_si128(a: __m128i, mask: __m128i) -> i32 {
    ptestc(a.as_i64x2(), mask.as_i64x2())
}

/// Tests whether the specified bits in a 128-bit integer vector are
/// neither all zeros nor all ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are neither all zeros nor all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testnzc_si128)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_testnzc_si128(a: __m128i, mask: __m128i) -> i32 {
    ptestnzc(a.as_i64x2(), mask.as_i64x2())
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// zeros.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all zeros,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_test_all_zeros(a: __m128i, mask: __m128i) -> i32 {
    _mm_testz_si128(a, mask)
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// ones.
///
/// Argument:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
///
/// Returns:
///
/// * `1` - if the bits specified in the operand are all set to 1,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pcmpeqd))]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_test_all_ones(a: __m128i) -> i32 {
    _mm_testc_si128(a, _mm_cmpeq_epi32(a, a))
}

/// Tests whether the specified bits in a 128-bit integer vector are
/// neither all zeros nor all ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are neither all zeros nor all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_mix_ones_zeros)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 {
    _mm_testnzc_si128(a, mask)
}

1160 #[allow(improper_ctypes)]
1161 extern "C" {
1162 #[link_name = "llvm.x86.sse41.pblendvb"]
1163 fn pblendvb(a: i8x16, b: i8x16, mask: i8x16) -> i8x16;
1164 #[link_name = "llvm.x86.sse41.blendvpd"]
1165 fn blendvpd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d;
1166 #[link_name = "llvm.x86.sse41.blendvps"]
1167 fn blendvps(a: __m128, b: __m128, mask: __m128) -> __m128;
1168 #[link_name = "llvm.x86.sse41.blendpd"]
1169 fn blendpd(a: __m128d, b: __m128d, imm2: u8) -> __m128d;
1170 #[link_name = "llvm.x86.sse41.blendps"]
1171 fn blendps(a: __m128, b: __m128, imm4: u8) -> __m128;
1172 #[link_name = "llvm.x86.sse41.pblendw"]
1173 fn pblendw(a: i16x8, b: i16x8, imm8: u8) -> i16x8;
1174 #[link_name = "llvm.x86.sse41.insertps"]
1175 fn insertps(a: __m128, b: __m128, imm8: u8) -> __m128;
1176 #[link_name = "llvm.x86.sse41.pmaxsb"]
1177 fn pmaxsb(a: i8x16, b: i8x16) -> i8x16;
1178 #[link_name = "llvm.x86.sse41.pmaxuw"]
1179 fn pmaxuw(a: u16x8, b: u16x8) -> u16x8;
1180 #[link_name = "llvm.x86.sse41.pmaxsd"]
1181 fn pmaxsd(a: i32x4, b: i32x4) -> i32x4;
1182 #[link_name = "llvm.x86.sse41.pmaxud"]
1183 fn pmaxud(a: u32x4, b: u32x4) -> u32x4;
1184 #[link_name = "llvm.x86.sse41.pminsb"]
1185 fn pminsb(a: i8x16, b: i8x16) -> i8x16;
1186 #[link_name = "llvm.x86.sse41.pminuw"]
1187 fn pminuw(a: u16x8, b: u16x8) -> u16x8;
1188 #[link_name = "llvm.x86.sse41.pminsd"]
1189 fn pminsd(a: i32x4, b: i32x4) -> i32x4;
1190 #[link_name = "llvm.x86.sse41.pminud"]
1191 fn pminud(a: u32x4, b: u32x4) -> u32x4;
1192 #[link_name = "llvm.x86.sse41.packusdw"]
1193 fn packusdw(a: i32x4, b: i32x4) -> u16x8;
1194 #[link_name = "llvm.x86.sse41.dppd"]
1195 fn dppd(a: __m128d, b: __m128d, imm8: u8) -> __m128d;
1196 #[link_name = "llvm.x86.sse41.dpps"]
1197 fn dpps(a: __m128, b: __m128, imm8: u8) -> __m128;
1198 #[link_name = "llvm.x86.sse41.round.pd"]
1199 fn roundpd(a: __m128d, rounding: i32) -> __m128d;
1200 #[link_name = "llvm.x86.sse41.round.ps"]
1201 fn roundps(a: __m128, rounding: i32) -> __m128;
1202 #[link_name = "llvm.x86.sse41.round.sd"]
1203 fn roundsd(a: __m128d, b: __m128d, rounding: i32) -> __m128d;
1204 #[link_name = "llvm.x86.sse41.round.ss"]
1205 fn roundss(a: __m128, b: __m128, rounding: i32) -> __m128;
1206 #[link_name = "llvm.x86.sse41.phminposuw"]
1207 fn phminposuw(a: u16x8) -> u16x8;
1208 #[link_name = "llvm.x86.sse41.pmuldq"]
1209 fn pmuldq(a: i32x4, b: i32x4) -> i64x2;
1210 #[link_name = "llvm.x86.sse41.mpsadbw"]
1211 fn mpsadbw(a: u8x16, b: u8x16, imm8: u8) -> u16x8;
1212 #[link_name = "llvm.x86.sse41.ptestz"]
1213 fn ptestz(a: i64x2, mask: i64x2) -> i32;
1214 #[link_name = "llvm.x86.sse41.ptestc"]
1215 fn ptestc(a: i64x2, mask: i64x2) -> i32;
1216 #[link_name = "llvm.x86.sse41.ptestnzc"]
1217 fn ptestnzc(a: i64x2, mask: i64x2) -> i32;
1218 }
1219
1220 #[cfg(test)]
1221 mod tests {
1222 use crate::core_arch::x86::*;
1223 use std::mem;
1224 use stdarch_test::simd_test;
1225
1226 #[simd_test(enable = "sse4.1")]
1227 unsafe fn test_mm_blendv_epi8() {
1228 #[rustfmt::skip]
1229 let a = _mm_setr_epi8(
1230 0, 1, 2, 3, 4, 5, 6, 7,
1231 8, 9, 10, 11, 12, 13, 14, 15,
1232 );
1233 #[rustfmt::skip]
1234 let b = _mm_setr_epi8(
1235 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
1236 );
1237 #[rustfmt::skip]
1238 let mask = _mm_setr_epi8(
1239 0, -1, 0, -1, 0, -1, 0, -1,
1240 0, -1, 0, -1, 0, -1, 0, -1,
1241 );
1242 #[rustfmt::skip]
1243 let e = _mm_setr_epi8(
1244 0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31,
1245 );
1246 assert_eq_m128i(_mm_blendv_epi8(a, b, mask), e);
1247 }
1248
1249 #[simd_test(enable = "sse4.1")]
1250 unsafe fn test_mm_blendv_pd() {
1251 let a = _mm_set1_pd(0.0);
1252 let b = _mm_set1_pd(1.0);
1253 let mask = transmute(_mm_setr_epi64x(0, -1));
1254 let r = _mm_blendv_pd(a, b, mask);
1255 let e = _mm_setr_pd(0.0, 1.0);
1256 assert_eq_m128d(r, e);
1257 }
1258
1259 #[simd_test(enable = "sse4.1")]
1260 unsafe fn test_mm_blendv_ps() {
1261 let a = _mm_set1_ps(0.0);
1262 let b = _mm_set1_ps(1.0);
1263 let mask = transmute(_mm_setr_epi32(0, -1, 0, -1));
1264 let r = _mm_blendv_ps(a, b, mask);
1265 let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0);
1266 assert_eq_m128(r, e);
1267 }
1268
1269 #[simd_test(enable = "sse4.1")]
1270 unsafe fn test_mm_blend_pd() {
1271 let a = _mm_set1_pd(0.0);
1272 let b = _mm_set1_pd(1.0);
1273 let r = _mm_blend_pd(a, b, 0b10);
1274 let e = _mm_setr_pd(0.0, 1.0);
1275 assert_eq_m128d(r, e);
1276 }
1277
1278 #[simd_test(enable = "sse4.1")]
1279 unsafe fn test_mm_blend_ps() {
1280 let a = _mm_set1_ps(0.0);
1281 let b = _mm_set1_ps(1.0);
1282 let r = _mm_blend_ps(a, b, 0b1010);
1283 let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0);
1284 assert_eq_m128(r, e);
1285 }
1286
1287 #[simd_test(enable = "sse4.1")]
1288 unsafe fn test_mm_blend_epi16() {
1289 let a = _mm_set1_epi16(0);
1290 let b = _mm_set1_epi16(1);
1291 let r = _mm_blend_epi16(a, b, 0b1010_1100);
1292 let e = _mm_setr_epi16(0, 0, 1, 1, 0, 1, 0, 1);
1293 assert_eq_m128i(r, e);
1294 }
1295
1296 #[simd_test(enable = "sse4.1")]
1297 unsafe fn test_mm_extract_ps() {
1298 let a = _mm_setr_ps(0.0, 1.0, 2.0, 3.0);
1299 let r: f32 = transmute(_mm_extract_ps(a, 1));
1300 assert_eq!(r, 1.0);
1301 let r: f32 = transmute(_mm_extract_ps(a, 5));
1302 assert_eq!(r, 1.0);
1303 }
1304
1305 #[simd_test(enable = "sse4.1")]
1306 unsafe fn test_mm_extract_epi8() {
1307 #[rustfmt::skip]
1308 let a = _mm_setr_epi8(
1309 -1, 1, 2, 3, 4, 5, 6, 7,
1310 8, 9, 10, 11, 12, 13, 14, 15
1311 );
1312 let r1 = _mm_extract_epi8(a, 0);
1313 let r2 = _mm_extract_epi8(a, 19);
1314 assert_eq!(r1, 0xFF);
1315 assert_eq!(r2, 3);
1316 }
1317
1318 #[simd_test(enable = "sse4.1")]
1319 unsafe fn test_mm_extract_epi32() {
1320 let a = _mm_setr_epi32(0, 1, 2, 3);
1321 let r = _mm_extract_epi32(a, 1);
1322 assert_eq!(r, 1);
1323 let r = _mm_extract_epi32(a, 5);
1324 assert_eq!(r, 1);
1325 }
1326
1327 #[simd_test(enable = "sse4.1")]
1328 unsafe fn test_mm_insert_ps() {
1329 let a = _mm_set1_ps(1.0);
1330 let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
1331 let r = _mm_insert_ps(a, b, 0b11_00_1100);
1332 let e = _mm_setr_ps(4.0, 1.0, 0.0, 0.0);
1333 assert_eq_m128(r, e);
1334 }
1335
1336 #[simd_test(enable = "sse4.1")]
1337 unsafe fn test_mm_insert_epi8() {
1338 let a = _mm_set1_epi8(0);
1339 let e = _mm_setr_epi8(0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
1340 let r = _mm_insert_epi8(a, 32, 1);
1341 assert_eq_m128i(r, e);
1342 let r = _mm_insert_epi8(a, 32, 17);
1343 assert_eq_m128i(r, e);
1344 }
1345
1346 #[simd_test(enable = "sse4.1")]
1347 unsafe fn test_mm_insert_epi32() {
1348 let a = _mm_set1_epi32(0);
1349 let e = _mm_setr_epi32(0, 32, 0, 0);
1350 let r = _mm_insert_epi32(a, 32, 1);
1351 assert_eq_m128i(r, e);
1352 let r = _mm_insert_epi32(a, 32, 5);
1353 assert_eq_m128i(r, e);
1354 }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_max_epi8() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, 4, 5, 8, 9, 12, 13, 16,
            17, 20, 21, 24, 25, 28, 29, 32,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            2, 3, 6, 7, 10, 11, 14, 15,
            18, 19, 22, 23, 26, 27, 30, 31,
        );
        let r = _mm_max_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            2, 4, 6, 8, 10, 12, 14, 16,
            18, 20, 22, 24, 26, 28, 30, 32,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_max_epu16() {
        let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16);
        let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15);
        let r = _mm_max_epu16(a, b);
        let e = _mm_setr_epi16(2, 4, 6, 8, 10, 12, 14, 16);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_max_epi32() {
        let a = _mm_setr_epi32(1, 4, 5, 8);
        let b = _mm_setr_epi32(2, 3, 6, 7);
        let r = _mm_max_epi32(a, b);
        let e = _mm_setr_epi32(2, 4, 6, 8);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_max_epu32() {
        let a = _mm_setr_epi32(1, 4, 5, 8);
        let b = _mm_setr_epi32(2, 3, 6, 7);
        let r = _mm_max_epu32(a, b);
        let e = _mm_setr_epi32(2, 4, 6, 8);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_min_epi8_1() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, 4, 5, 8, 9, 12, 13, 16,
            17, 20, 21, 24, 25, 28, 29, 32,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            2, 3, 6, 7, 10, 11, 14, 15,
            18, 19, 22, 23, 26, 27, 30, 31,
        );
        let r = _mm_min_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            1, 3, 5, 7, 9, 11, 13, 15,
            17, 19, 21, 23, 25, 27, 29, 31,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_min_epi8_2() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, -4, -5, 8, -9, -12, 13, -16,
            17, 20, 21, 24, 25, 28, 29, 32,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            2, -3, -6, 7, -10, -11, 14, -15,
            18, 19, 22, 23, 26, 27, 30, 31,
        );
        let r = _mm_min_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            1, -4, -6, 7, -10, -12, 13, -16,
            17, 19, 21, 23, 25, 27, 29, 31,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_min_epu16() {
        let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16);
        let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15);
        let r = _mm_min_epu16(a, b);
        let e = _mm_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_min_epi32_1() {
        let a = _mm_setr_epi32(1, 4, 5, 8);
        let b = _mm_setr_epi32(2, 3, 6, 7);
        let r = _mm_min_epi32(a, b);
        let e = _mm_setr_epi32(1, 3, 5, 7);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_min_epi32_2() {
        let a = _mm_setr_epi32(-1, 4, 5, -7);
        let b = _mm_setr_epi32(-2, 3, -6, 8);
        let r = _mm_min_epi32(a, b);
        let e = _mm_setr_epi32(-2, 3, -6, -7);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_min_epu32() {
        let a = _mm_setr_epi32(1, 4, 5, 8);
        let b = _mm_setr_epi32(2, 3, 6, 7);
        let r = _mm_min_epu32(a, b);
        let e = _mm_setr_epi32(1, 3, 5, 7);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_packus_epi32() {
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let b = _mm_setr_epi32(-1, -2, -3, -4);
        let r = _mm_packus_epi32(a, b);
        // Unsigned saturation clamps the negative lanes of `b` to 0.
        let e = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }
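
    // Supplementary sketch (hypothetical test, not from the upstream suite):
    // packus also saturates at the top of the unsigned 16-bit range, so
    // values above u16::MAX clamp to 0xFFFF, which reads back as -1 through
    // _mm_setr_epi16.
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_packus_epi32_saturates_high() {
        let a = _mm_setr_epi32(70000, 65535, 65536, 0);
        let b = _mm_set1_epi32(0);
        let r = _mm_packus_epi32(a, b);
        let e = _mm_setr_epi16(-1, -1, -1, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }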

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cmpeq_epi64() {
        let a = _mm_setr_epi64x(0, 1);
        let b = _mm_setr_epi64x(0, 0);
        let r = _mm_cmpeq_epi64(a, b);
        // An equal lane compares to all ones (-1), an unequal lane to 0.
        let e = _mm_setr_epi64x(-1, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi8_epi16() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepi8_epi16(a);
        let e = _mm_set1_epi16(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi8(-10);
        let r = _mm_cvtepi8_epi16(a);
        let e = _mm_set1_epi16(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi8_epi32() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepi8_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi8(-10);
        let r = _mm_cvtepi8_epi32(a);
        let e = _mm_set1_epi32(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi8_epi64() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepi8_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi8(-10);
        let r = _mm_cvtepi8_epi64(a);
        let e = _mm_set1_epi64x(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi16_epi32() {
        let a = _mm_set1_epi16(10);
        let r = _mm_cvtepi16_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi16(-10);
        let r = _mm_cvtepi16_epi32(a);
        let e = _mm_set1_epi32(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi16_epi64() {
        let a = _mm_set1_epi16(10);
        let r = _mm_cvtepi16_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi16(-10);
        let r = _mm_cvtepi16_epi64(a);
        let e = _mm_set1_epi64x(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi32_epi64() {
        let a = _mm_set1_epi32(10);
        let r = _mm_cvtepi32_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi32(-10);
        let r = _mm_cvtepi32_epi64(a);
        let e = _mm_set1_epi64x(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu8_epi16() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepu8_epi16(a);
        let e = _mm_set1_epi16(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu8_epi32() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepu8_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu8_epi64() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepu8_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu16_epi32() {
        let a = _mm_set1_epi16(10);
        let r = _mm_cvtepu16_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu16_epi64() {
        let a = _mm_set1_epi16(10);
        let r = _mm_cvtepu16_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu32_epi64() {
        let a = _mm_set1_epi32(10);
        let r = _mm_cvtepu32_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
    }
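
    // Supplementary sketch (hypothetical test, not from the upstream suite):
    // the epi conversions above sign-extend while the epu conversions
    // zero-extend; the two only differ for negative inputs, e.g. the byte
    // -10 (0xF6) widens to 246 when treated as unsigned.
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_cvtepi8_vs_cvtepu8_on_negatives() {
        let a = _mm_set1_epi8(-10);
        assert_eq_m128i(_mm_cvtepi8_epi16(a), _mm_set1_epi16(-10));
        assert_eq_m128i(_mm_cvtepu8_epi16(a), _mm_set1_epi16(246));
    }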

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_dp_pd() {
        let a = _mm_setr_pd(2.0, 3.0);
        let b = _mm_setr_pd(1.0, 4.0);
        // Bits 4-5 select both products (2*1 + 3*4 = 14); bit 0 stores the
        // sum in lane 0 only.
        let e = _mm_setr_pd(14.0, 0.0);
        assert_eq_m128d(_mm_dp_pd(a, b, 0b00110001), e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_dp_ps() {
        let a = _mm_setr_ps(2.0, 3.0, 1.0, 10.0);
        let b = _mm_setr_ps(1.0, 4.0, 0.5, 10.0);
        // Bits 4-6 select the first three products (2*1 + 3*4 + 1*0.5 = 14.5);
        // bits 0 and 2 store the sum in lanes 0 and 2.
        let e = _mm_setr_ps(14.5, 0.0, 14.5, 0.0);
        assert_eq_m128(_mm_dp_ps(a, b, 0b01110101), e);
    }
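
    // Supplementary sketch (hypothetical test, not from the upstream suite):
    // a scalar model of the dpps immediate. The high nibble gates which lane
    // products enter the dot product and the low nibble gates which result
    // lanes receive the sum; every other result lane is zeroed. This mirrors
    // the 0b01110101 case above.
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_dp_ps_scalar_model() {
        let a = [2.0f32, 3.0, 1.0, 10.0];
        let b = [1.0f32, 4.0, 0.5, 10.0];
        let imm8 = 0b0111_0101;
        // High nibble: accumulate the selected products.
        let mut sum = 0.0;
        for i in 0..4 {
            if imm8 & (1 << (4 + i)) != 0 {
                sum += a[i] * b[i];
            }
        }
        // Low nibble: broadcast the sum into the selected result lanes.
        let mut e = [0.0f32; 4];
        for i in 0..4 {
            if imm8 & (1 << i) != 0 {
                e[i] = sum;
            }
        }
        let r = _mm_dp_ps(
            _mm_setr_ps(a[0], a[1], a[2], a[3]),
            _mm_setr_ps(b[0], b[1], b[2], b[3]),
            0b0111_0101,
        );
        assert_eq_m128(r, _mm_setr_ps(e[0], e[1], e[2], e[3]));
    }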

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_floor_pd() {
        let a = _mm_setr_pd(2.5, 4.5);
        let r = _mm_floor_pd(a);
        let e = _mm_setr_pd(2.0, 4.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_floor_ps() {
        let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5);
        let r = _mm_floor_ps(a);
        let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_floor_sd() {
        let a = _mm_setr_pd(2.5, 4.5);
        let b = _mm_setr_pd(-1.5, -3.5);
        let r = _mm_floor_sd(a, b);
        // Lane 0 is floor(b[0]); the upper lane is copied from `a`.
        let e = _mm_setr_pd(-2.0, 4.5);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_floor_ss() {
        let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5);
        let b = _mm_setr_ps(-1.5, -3.5, -7.5, -15.5);
        let r = _mm_floor_ss(a, b);
        // Lane 0 is floor(b[0]); the upper lanes are copied from `a`.
        let e = _mm_setr_ps(-2.0, 4.5, 8.5, 16.5);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_ceil_pd() {
        let a = _mm_setr_pd(1.5, 3.5);
        let r = _mm_ceil_pd(a);
        let e = _mm_setr_pd(2.0, 4.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_ceil_ps() {
        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
        let r = _mm_ceil_ps(a);
        let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_ceil_sd() {
        let a = _mm_setr_pd(1.5, 3.5);
        let b = _mm_setr_pd(-2.5, -4.5);
        let r = _mm_ceil_sd(a, b);
        let e = _mm_setr_pd(-2.0, 3.5);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_ceil_ss() {
        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
        let b = _mm_setr_ps(-2.5, -4.5, -8.5, -16.5);
        let r = _mm_ceil_ss(a, b);
        let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_round_pd() {
        let a = _mm_setr_pd(1.25, 3.75);
        let r = _mm_round_pd(a, _MM_FROUND_TO_NEAREST_INT);
        let e = _mm_setr_pd(1.0, 4.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_round_ps() {
        let a = _mm_setr_ps(2.25, 4.75, -1.75, -4.25);
        let r = _mm_round_ps(a, _MM_FROUND_TO_ZERO);
        let e = _mm_setr_ps(2.0, 4.0, -1.0, -4.0);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_round_sd() {
        let a = _mm_setr_pd(1.5, 3.5);
        let b = _mm_setr_pd(-2.5, -4.5);
        let old_mode = _MM_GET_ROUNDING_MODE();
        _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
        // _MM_FROUND_CUR_DIRECTION rounds according to MXCSR.RC, set here to
        // truncation, so b[0] = -2.5 rounds to -2.0; the upper lane comes
        // from `a`.
        let r = _mm_round_sd(a, b, _MM_FROUND_CUR_DIRECTION);
        _MM_SET_ROUNDING_MODE(old_mode);
        let e = _mm_setr_pd(-2.0, 3.5);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_round_ss() {
        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
        let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5);
        let old_mode = _MM_GET_ROUNDING_MODE();
        _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
        // Round-to-nearest-even sends b[0] = -1.75 to -2.0; the upper lanes
        // come from `a`.
        let r = _mm_round_ss(a, b, _MM_FROUND_CUR_DIRECTION);
        _MM_SET_ROUNDING_MODE(old_mode);
        let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5);
        assert_eq_m128(r, e);
    }
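
    // Supplementary sketch (hypothetical test, not from the upstream suite):
    // unlike the two tests above, which defer to MXCSR.RC via
    // _MM_FROUND_CUR_DIRECTION, an explicit _MM_FROUND_TO_* constant
    // overrides whatever rounding mode is currently in effect.
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_round_pd_explicit_modes() {
        let a = _mm_setr_pd(-2.5, 2.5);
        let r = _mm_round_pd(a, _MM_FROUND_TO_NEG_INF);
        assert_eq_m128d(r, _mm_setr_pd(-3.0, 2.0));
        let r = _mm_round_pd(a, _MM_FROUND_TO_POS_INF);
        assert_eq_m128d(r, _mm_setr_pd(-2.0, 3.0));
        let r = _mm_round_pd(a, _MM_FROUND_TO_ZERO);
        assert_eq_m128d(r, _mm_setr_pd(-2.0, 2.0));
    }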

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_minpos_epu16_1() {
        let a = _mm_setr_epi16(23, 18, 44, 97, 50, 13, 67, 66);
        let r = _mm_minpos_epu16(a);
        // The minimum value (13) lands in lane 0 and its index (5) in
        // lane 1; the remaining lanes are zeroed.
        let e = _mm_setr_epi16(13, 5, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_minpos_epu16_2() {
        let a = _mm_setr_epi16(0, 18, 44, 97, 50, 13, 67, 66);
        let r = _mm_minpos_epu16(a);
        let e = _mm_setr_epi16(0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_mul_epi32() {
        {
            let a = _mm_setr_epi32(1, 1, 1, 1);
            let b = _mm_setr_epi32(1, 2, 3, 4);
            let r = _mm_mul_epi32(a, b);
            // Only lanes 0 and 2 are multiplied, as widened 64-bit values.
            let e = _mm_setr_epi64x(1, 3);
            assert_eq_m128i(r, e);
        }
        {
            let a = _mm_setr_epi32(15, 2 /* ignored */, 1234567, 4 /* ignored */);
            let b = _mm_setr_epi32(
                -20, -256, /* ignored */
                666666, 666666, /* ignored */
            );
            let r = _mm_mul_epi32(a, b);
            let e = _mm_setr_epi64x(-300, 823043843622);
            assert_eq_m128i(r, e);
        }
    }
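
    // Supplementary sketch (hypothetical test, not from the upstream suite):
    // pmuldq sign-extends lanes 0 and 2 to 64 bits before multiplying, so
    // the packed products match plain i64 arithmetic.
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_mul_epi32_scalar_model() {
        assert_eq!(15i64 * -20, -300);
        assert_eq!(1_234_567i64 * 666_666, 823_043_843_622);
        let a = _mm_setr_epi32(15, 0, 1234567, 0);
        let b = _mm_setr_epi32(-20, 0, 666666, 0);
        let r = _mm_mul_epi32(a, b);
        assert_eq_m128i(r, _mm_setr_epi64x(15i64 * -20, 1234567i64 * 666666));
    }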

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_mullo_epi32() {
        {
            let a = _mm_setr_epi32(1, 1, 1, 1);
            let b = _mm_setr_epi32(1, 2, 3, 4);
            let r = _mm_mullo_epi32(a, b);
            let e = _mm_setr_epi32(1, 2, 3, 4);
            assert_eq_m128i(r, e);
        }
        {
            let a = _mm_setr_epi32(15, -2, 1234567, 99999);
            let b = _mm_setr_epi32(-20, -256, 666666, -99999);
            let r = _mm_mullo_epi32(a, b);
            // Note that the most significant bit of the truncated product in
            // lane 2 is set, so the lane reads back as a negative i32: the
            // low 32 bits of 1234567 * 666666 give -1589877210.
            let e = _mm_setr_epi32(-300, 512, -1589877210, -1409865409);
            assert_eq_m128i(r, e);
        }
    }
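
    // Supplementary sketch (hypothetical test, not from the upstream suite):
    // pmulld keeps only the low 32 bits of each 64-bit product, which is
    // exactly i32 wrapping multiplication and is where the negative lane
    // values noted above come from.
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_mullo_epi32_matches_wrapping_mul() {
        assert_eq!(1234567i32.wrapping_mul(666666), -1589877210);
        let a = _mm_setr_epi32(15, -2, 1234567, 99999);
        let b = _mm_setr_epi32(-20, -256, 666666, -99999);
        let r = _mm_mullo_epi32(a, b);
        let e = _mm_setr_epi32(
            15i32.wrapping_mul(-20),
            (-2i32).wrapping_mul(-256),
            1234567i32.wrapping_mul(666666),
            99999i32.wrapping_mul(-99999),
        );
        assert_eq_m128i(r, e);
    }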

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_minpos_epu16() {
        let a = _mm_setr_epi16(8, 7, 6, 5, 4, 1, 2, 3);
        let r = _mm_minpos_epu16(a);
        let e = _mm_setr_epi16(1, 5, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }
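
    // Supplementary sketch (hypothetical test, not from the upstream suite):
    // phminposuw is a horizontal fold to the minimum unsigned 16-bit lane,
    // returning the value in lane 0 and its (lowest) index in lane 1.
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_minpos_epu16_scalar_model() {
        let vals: [u16; 8] = [8, 7, 6, 5, 4, 1, 2, 3];
        let mut min = vals[0];
        let mut idx = 0;
        for (i, &v) in vals.iter().enumerate() {
            // Strict `<` keeps the first occurrence on ties.
            if v < min {
                min = v;
                idx = i;
            }
        }
        let a = _mm_setr_epi16(8, 7, 6, 5, 4, 1, 2, 3);
        let r = _mm_minpos_epu16(a);
        let e = _mm_setr_epi16(min as i16, idx as i16, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }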

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_mpsadbw_epu8() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );

        // Bit 2 of the immediate selects a 4-byte offset into `a` (0 or 4)
        // and bits 1:0 select the 4-byte block of `b` to compare against.
        let r = _mm_mpsadbw_epu8(a, a, 0b000);
        let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28);
        assert_eq_m128i(r, e);

        let r = _mm_mpsadbw_epu8(a, a, 0b001);
        let e = _mm_setr_epi16(16, 12, 8, 4, 0, 4, 8, 12);
        assert_eq_m128i(r, e);

        let r = _mm_mpsadbw_epu8(a, a, 0b100);
        let e = _mm_setr_epi16(16, 20, 24, 28, 32, 36, 40, 44);
        assert_eq_m128i(r, e);

        let r = _mm_mpsadbw_epu8(a, a, 0b101);
        let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28);
        assert_eq_m128i(r, e);

        let r = _mm_mpsadbw_epu8(a, a, 0b111);
        let e = _mm_setr_epi16(32, 28, 24, 20, 16, 12, 8, 4);
        assert_eq_m128i(r, e);
    }
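
    // Supplementary sketch (hypothetical test, not from the upstream suite):
    // a scalar model of mpsadbw for the 0b101 case above. Result lane j is
    // the sum of absolute differences between the sliding 4-byte window
    // a[a_off + j..a_off + j + 4] and the fixed block b[b_off..b_off + 4].
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_mpsadbw_epu8_scalar_model() {
        let bytes: [u8; 16] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
        let imm8 = 0b101usize;
        let a_off = ((imm8 >> 2) & 1) * 4; // bit 2: offset into `a`
        let b_off = (imm8 & 0b11) * 4; // bits 1:0: block of `b`
        let mut sums = [0i16; 8];
        for j in 0..8 {
            for i in 0..4 {
                let d = bytes[a_off + j + i] as i16 - bytes[b_off + i] as i16;
                sums[j] += d.abs();
            }
        }
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        let r = _mm_mpsadbw_epu8(a, a, 0b101);
        #[rustfmt::skip]
        let e = _mm_setr_epi16(
            sums[0], sums[1], sums[2], sums[3],
            sums[4], sums[5], sums[6], sums[7],
        );
        assert_eq_m128i(r, e);
    }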

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_testz_si128() {
        // testz returns 1 if `a & mask` is all zeros.
        let a = _mm_set1_epi8(1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_testz_si128(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_testz_si128(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b011);
        let mask = _mm_set1_epi8(0b100);
        let r = _mm_testz_si128(a, mask);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_testc_si128() {
        // testc returns 1 if `!a & mask` is all zeros, i.e. `a` covers every
        // bit set in `mask`.
        let a = _mm_set1_epi8(-1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_testc_si128(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_testc_si128(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b100);
        let r = _mm_testc_si128(a, mask);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_testnzc_si128() {
        // testnzc returns 1 if `a & mask` and `!a & mask` are both non-zero.
        let a = _mm_set1_epi8(0);
        let mask = _mm_set1_epi8(1);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(-1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b101);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 0);
    }
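
    // Supplementary sketch (hypothetical test, not from the upstream suite):
    // PTEST in scalar form. ZF (testz) is set when `a & mask` is all zero,
    // CF (testc) when `!a & mask` is all zero, and testnzc reports that both
    // flags are clear, i.e. the mask selects a mix of ones and zeros in `a`.
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_ptest_scalar_model() {
        // The vectors below broadcast one byte, so a per-byte model suffices.
        let (a, mask) = (0b101u8, 0b110u8);
        let zf = (a & mask) == 0;
        let cf = (!a & mask) == 0;
        let va = _mm_set1_epi8(0b101);
        let vmask = _mm_set1_epi8(0b110);
        assert_eq!(_mm_testz_si128(va, vmask), zf as i32);
        assert_eq!(_mm_testc_si128(va, vmask), cf as i32);
        assert_eq!(_mm_testnzc_si128(va, vmask), (!zf && !cf) as i32);
    }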

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_test_all_zeros() {
        // Identical semantics to _mm_testz_si128.
        let a = _mm_set1_epi8(1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_test_all_zeros(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_test_all_zeros(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b011);
        let mask = _mm_set1_epi8(0b100);
        let r = _mm_test_all_zeros(a, mask);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_test_all_ones() {
        // Returns 1 only when every bit of `a` is set.
        let a = _mm_set1_epi8(-1);
        let r = _mm_test_all_ones(a);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let r = _mm_test_all_ones(a);
        assert_eq!(r, 0);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_test_mix_ones_zeros() {
        // Identical semantics to _mm_testnzc_si128.
        let a = _mm_set1_epi8(0);
        let mask = _mm_set1_epi8(1);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(-1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b101);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 0);
    }
}