//! Streaming SIMD Extensions 4.1 (SSE4.1)

use crate::{
    core_arch::{simd::*, simd_llvm::*, x86::*},
    mem::transmute,
};

#[cfg(test)]
use stdarch_test::assert_instr;

// SSE4 rounding constants
/// round to nearest
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_NEAREST_INT: i32 = 0x00;
/// round down
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_NEG_INF: i32 = 0x01;
/// round up
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_POS_INF: i32 = 0x02;
/// truncate
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_ZERO: i32 = 0x03;
/// use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_CUR_DIRECTION: i32 = 0x04;
/// do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_RAISE_EXC: i32 = 0x00;
/// suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NO_EXC: i32 = 0x08;
/// round to nearest and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NINT: i32 = 0x00;
/// round down and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_FLOOR: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF;
/// round up and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_CEIL: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF;
/// truncate and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TRUNC: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO;
/// use MXCSR.RC and do not suppress exceptions; see
/// `_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_RINT: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION;
/// use MXCSR.RC and suppress exceptions; see `_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NEARBYINT: i32 = _MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION;

/// Blend packed 8-bit integers from `a` and `b` using `mask`
///
/// The high bit of each corresponding mask byte determines the selection.
/// If the high bit is set the element of `b` is selected. The element
/// of `a` is selected otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_epi8)
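///
/// An illustrative sketch (values mirror this file's unit test; assumes the
/// `sse4.1` target feature and an `unsafe` context):
///
/// ```ignore
/// let a = _mm_set1_epi8(0);
/// let b = _mm_set1_epi8(1);
/// // Only bytes whose mask byte has its high bit set (-1 here) take `b`:
/// let mask = _mm_setr_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
/// let r = _mm_blendv_epi8(a, b, mask); // 0, 1, 0, 1, ...
/// ```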
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pblendvb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i {
    transmute(pblendvb(a.as_i8x16(), b.as_i8x16(), mask.as_i8x16()))
}

/// Blend packed 16-bit integers from `a` and `b` using the mask `IMM8`.
///
/// The mask bits determine the selection. A clear bit selects the
/// corresponding element of `a`, and a set bit the corresponding
/// element of `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_epi16)
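///
/// An illustrative sketch (values from this file's unit test; assumes
/// `sse4.1` and an `unsafe` context):
///
/// ```ignore
/// let a = _mm_set1_epi16(0);
/// let b = _mm_set1_epi16(1);
/// // Bit k of the immediate picks element k from `b`; clear bits keep `a`:
/// let r = _mm_blend_epi16::<0b1010_1100>(a, b); // 0, 0, 1, 1, 0, 1, 0, 1
/// ```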
#[inline]
#[target_feature(enable = "sse4.1")]
// Note: LLVM7 prefers the single-precision floating-point domain when possible
// see https://bugs.llvm.org/show_bug.cgi?id=38195
// #[cfg_attr(test, assert_instr(pblendw, IMM8 = 0xF0))]
#[cfg_attr(test, assert_instr(blendps, IMM8 = 0xF0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blend_epi16<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
    static_assert_imm8!(IMM8);
    transmute(pblendw(a.as_i16x8(), b.as_i16x8(), IMM8 as u8))
}

/// Blend packed double-precision (64-bit) floating-point elements from `a`
/// and `b` using `mask`. The high bit of each corresponding 64-bit mask
/// element selects the element of `b`; the element of `a` is used otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendvpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d {
    blendvpd(a, b, mask)
}

/// Blend packed single-precision (32-bit) floating-point elements from `a`
/// and `b` using `mask`. The high bit of each corresponding 32-bit mask
/// element selects the element of `b`; the element of `a` is used otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendvps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 {
    blendvps(a, b, mask)
}

/// Blend packed double-precision (64-bit) floating-point elements from `a`
/// and `b` using control mask `IMM2`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
// Note: LLVM7 prefers the single-precision floating-point domain when possible
// see https://bugs.llvm.org/show_bug.cgi?id=38195
// #[cfg_attr(test, assert_instr(blendpd, IMM2 = 0b10))]
#[cfg_attr(test, assert_instr(blendps, IMM2 = 0b10))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blend_pd<const IMM2: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_imm2!(IMM2);
    blendpd(a, b, IMM2 as u8)
}

/// Blend packed single-precision (32-bit) floating-point elements from `a`
/// and `b` using mask `IMM4`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendps, IMM4 = 0b0101))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blend_ps<const IMM4: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_imm4!(IMM4);
    blendps(a, b, IMM4 as u8)
}

/// Extracts a single-precision (32-bit) floating-point element from `a`,
/// selected with `IMM8`. The result is returned as an `i32` containing the
/// bit pattern of the selected element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_ps)
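///
/// A brief sketch of recovering the float from the returned bits (values
/// from this file's unit test; assumes `sse4.1` and an `unsafe` context):
///
/// ```ignore
/// let a = _mm_setr_ps(0.0, 1.0, 2.0, 3.0);
/// let bits = _mm_extract_ps::<1>(a);
/// assert_eq!(f32::from_bits(bits as u32), 1.0);
/// ```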
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(extractps, IMM8 = 0)
)]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_extract_ps<const IMM8: i32>(a: __m128) -> i32 {
    static_assert_imm2!(IMM8);
    transmute(simd_extract::<_, f32>(a, IMM8 as u32))
}

/// Extracts an 8-bit integer from `a`, selected with `IMM8`. Returns a 32-bit
/// integer containing the zero-extended integer data.
///
/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi8)
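///
/// A quick sketch of the zero extension (values from this file's unit test;
/// assumes `sse4.1` and an `unsafe` context):
///
/// ```ignore
/// let a = _mm_setr_epi8(-1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
/// // Byte 0 is -1 (0xFF); zero extension yields 255 rather than -1:
/// assert_eq!(_mm_extract_epi8::<0>(a), 0xFF);
/// ```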
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pextrb, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_extract_epi8<const IMM8: i32>(a: __m128i) -> i32 {
    static_assert_imm4!(IMM8);
    simd_extract::<_, u8>(a.as_u8x16(), IMM8 as u32) as i32
}

/// Extracts a 32-bit integer from `a`, selected with `IMM8`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(extractps, IMM8 = 1)
)]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_extract_epi32<const IMM8: i32>(a: __m128i) -> i32 {
    static_assert_imm2!(IMM8);
    simd_extract::<_, i32>(a.as_i32x4(), IMM8 as u32)
}

/// Select a single value in `b` to store at some position in `a`,
/// then zero elements according to `IMM8`.
///
/// `IMM8` specifies which bits from operand `b` will be copied, which bits in
/// the result they will be copied to, and which bits in the result will be
/// cleared. The following assignments are made:
///
/// * Bits `[7:6]` specify the bits to copy from operand `b`:
///     - `00`: Selects bits `[31:0]` from operand `b`.
///     - `01`: Selects bits `[63:32]` from operand `b`.
///     - `10`: Selects bits `[95:64]` from operand `b`.
///     - `11`: Selects bits `[127:96]` from operand `b`.
///
/// * Bits `[5:4]` specify the bits in the result to which the selected bits
/// from operand `b` are copied:
///     - `00`: Copies the selected bits from `b` to result bits `[31:0]`.
///     - `01`: Copies the selected bits from `b` to result bits `[63:32]`.
///     - `10`: Copies the selected bits from `b` to result bits `[95:64]`.
///     - `11`: Copies the selected bits from `b` to result bits `[127:96]`.
///
/// * Bits `[3:0]`: If any of these bits are set, the corresponding result
/// element is cleared.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_ps)
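///
/// An illustrative sketch (values mirror this file's unit test; assumes
/// `sse4.1` and an `unsafe` context):
///
/// ```ignore
/// let a = _mm_set1_ps(1.0);
/// let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
/// // IMM8 = 0b11_00_1100: select b[3] (4.0), copy it to result[0],
/// // then clear result elements 2 and 3:
/// let r = _mm_insert_ps::<0b11_00_1100>(a, b); // 4.0, 1.0, 0.0, 0.0
/// ```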
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(insertps, IMM8 = 0b1010))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_insert_ps<const IMM8: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_imm8!(IMM8);
    insertps(a, b, IMM8 as u8)
}

/// Returns a copy of `a` with the 8-bit integer from `i` inserted at a
/// location specified by `IMM8`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pinsrb, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_insert_epi8<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
    static_assert_imm4!(IMM8);
    transmute(simd_insert(a.as_i8x16(), IMM8 as u32, i as i8))
}

/// Returns a copy of `a` with the 32-bit integer from `i` inserted at a
/// location specified by `IMM8`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pinsrd, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_insert_epi32<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
    static_assert_imm2!(IMM8);
    transmute(simd_insert(a.as_i32x4(), IMM8 as u32, i))
}

/// Compares packed 8-bit integers in `a` and `b` and returns the packed
/// maximum values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epi8(a: __m128i, b: __m128i) -> __m128i {
    transmute(pmaxsb(a.as_i8x16(), b.as_i8x16()))
}

/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed
/// maximum.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epu16(a: __m128i, b: __m128i) -> __m128i {
    transmute(pmaxuw(a.as_u16x8(), b.as_u16x8()))
}

/// Compares packed 32-bit integers in `a` and `b`, and returns packed maximum
/// values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epi32(a: __m128i, b: __m128i) -> __m128i {
    transmute(pmaxsd(a.as_i32x4(), b.as_i32x4()))
}

/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed
/// maximum values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxud))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epu32(a: __m128i, b: __m128i) -> __m128i {
    transmute(pmaxud(a.as_u32x4(), b.as_u32x4()))
}

/// Compares packed 8-bit integers in `a` and `b` and returns the packed
/// minimum values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epi8(a: __m128i, b: __m128i) -> __m128i {
    transmute(pminsb(a.as_i8x16(), b.as_i8x16()))
}

/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed
/// minimum.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epu16(a: __m128i, b: __m128i) -> __m128i {
    transmute(pminuw(a.as_u16x8(), b.as_u16x8()))
}

/// Compares packed 32-bit integers in `a` and `b`, and returns packed minimum
/// values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epi32(a: __m128i, b: __m128i) -> __m128i {
    transmute(pminsd(a.as_i32x4(), b.as_i32x4()))
}

/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed
/// minimum values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminud))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i {
    transmute(pminud(a.as_u32x4(), b.as_u32x4()))
}

/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
/// using unsigned saturation
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_packus_epi32)
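///
/// A short sketch of the saturation (hypothetical values; assumes `sse4.1`
/// and an `unsafe` context):
///
/// ```ignore
/// let a = _mm_setr_epi32(1, 2, 3, 4);
/// let b = _mm_setr_epi32(-1, 70000, 0, 65535);
/// // Negative inputs saturate to 0; values above 65535 saturate to 65535:
/// let r = _mm_packus_epi32(a, b); // u16 lanes: 1, 2, 3, 4, 0, 65535, 0, 65535
/// ```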
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(packusdw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_packus_epi32(a: __m128i, b: __m128i) -> __m128i {
    transmute(packusdw(a.as_i32x4(), b.as_i32x4()))
}

/// Compares packed 64-bit integers in `a` and `b` for equality
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pcmpeqq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpeq_epi64(a: __m128i, b: __m128i) -> __m128i {
    transmute(simd_eq::<_, i64x2>(a.as_i64x2(), b.as_i64x2()))
}

/// Sign extend packed 8-bit integers in `a` to packed 16-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi8_epi16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi8_epi16(a: __m128i) -> __m128i {
    let a = a.as_i8x16();
    let a: i8x8 = simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
    transmute(simd_cast::<_, i16x8>(a))
}

/// Sign extend packed 8-bit integers in `a` to packed 32-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi8_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi8_epi32(a: __m128i) -> __m128i {
    let a = a.as_i8x16();
    let a: i8x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}

/// Sign extend packed 8-bit integers in the low 2 bytes of `a` to packed
/// 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi8_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi8_epi64(a: __m128i) -> __m128i {
    let a = a.as_i8x16();
    let a: i8x2 = simd_shuffle2!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Sign extend packed 16-bit integers in `a` to packed 32-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi16_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi16_epi32(a: __m128i) -> __m128i {
    let a = a.as_i16x8();
    let a: i16x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}

/// Sign extend packed 16-bit integers in `a` to packed 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi16_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxwq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi16_epi64(a: __m128i) -> __m128i {
    let a = a.as_i16x8();
    let a: i16x2 = simd_shuffle2!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Sign extend packed 32-bit integers in `a` to packed 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi32_epi64(a: __m128i) -> __m128i {
    let a = a.as_i32x4();
    let a: i32x2 = simd_shuffle2!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Zero extend packed unsigned 8-bit integers in `a` to packed 16-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu8_epi16(a: __m128i) -> __m128i {
    let a = a.as_u8x16();
    let a: u8x8 = simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
    transmute(simd_cast::<_, i16x8>(a))
}

/// Zero extend packed unsigned 8-bit integers in `a` to packed 32-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu8_epi32(a: __m128i) -> __m128i {
    let a = a.as_u8x16();
    let a: u8x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}

/// Zero extend packed unsigned 8-bit integers in `a` to packed 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu8_epi64(a: __m128i) -> __m128i {
    let a = a.as_u8x16();
    let a: u8x2 = simd_shuffle2!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Zero extend packed unsigned 16-bit integers in `a`
/// to packed 32-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu16_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu16_epi32(a: __m128i) -> __m128i {
    let a = a.as_u16x8();
    let a: u16x4 = simd_shuffle4!(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}

/// Zero extend packed unsigned 16-bit integers in `a`
/// to packed 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu16_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxwq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu16_epi64(a: __m128i) -> __m128i {
    let a = a.as_u16x8();
    let a: u16x2 = simd_shuffle2!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Zero extend packed unsigned 32-bit integers in `a`
/// to packed 64-bit integers
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu32_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu32_epi64(a: __m128i) -> __m128i {
    let a = a.as_u32x4();
    let a: u32x2 = simd_shuffle2!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Returns the dot product of two `__m128d` vectors.
///
/// `IMM8[1:0]` is the broadcast mask, and `IMM8[5:4]` is the condition mask.
/// If a condition mask bit is zero, the corresponding multiplication is
/// replaced by a value of `0.0`. If a broadcast mask bit is one, the result
/// of the dot product is stored in the corresponding component of the return
/// value; otherwise that component is set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_pd)
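///
/// A worked sketch (values from this file's unit test; assumes `sse4.1`
/// and an `unsafe` context):
///
/// ```ignore
/// let a = _mm_setr_pd(2.0, 3.0);
/// let b = _mm_setr_pd(1.0, 4.0);
/// // Condition mask 0b11 keeps both products (2*1 + 3*4 = 14.0);
/// // broadcast mask 0b01 stores the sum only in element 0:
/// let r = _mm_dp_pd::<0b0011_0001>(a, b); // 14.0, 0.0
/// ```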
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(dppd, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_dp_pd<const IMM8: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_imm8!(IMM8);
    dppd(a, b, IMM8 as u8)
}

/// Returns the dot product of two `__m128` vectors.
///
/// `IMM8[3:0]` is the broadcast mask, and `IMM8[7:4]` is the condition mask.
/// If a condition mask bit is zero, the corresponding multiplication is
/// replaced by a value of `0.0`. If a broadcast mask bit is one, the result
/// of the dot product is stored in the corresponding component of the return
/// value; otherwise that component is set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps)
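///
/// A worked sketch (hypothetical values; assumes `sse4.1` and an `unsafe`
/// context):
///
/// ```ignore
/// let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
/// let b = _mm_setr_ps(4.0, 3.0, 2.0, 1.0);
/// // Condition mask 0b1111 keeps all products (1*4 + 2*3 + 3*2 + 4*1 = 20.0);
/// // broadcast mask 0b1111 stores the sum in every element:
/// let r = _mm_dp_ps::<0b1111_1111>(a, b); // 20.0 in all four lanes
/// ```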
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(dpps, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_dp_ps<const IMM8: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_imm8!(IMM8);
    dpps(a, b, IMM8 as u8)
}

/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// down to an integer value, and stores the results as packed double-precision
/// floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_pd(a: __m128d) -> __m128d {
    simd_floor(a)
}

/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// down to an integer value, and stores the results as packed single-precision
/// floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_ps(a: __m128) -> __m128 {
    simd_floor(a)
}

/// Round the lower double-precision (64-bit) floating-point element in `b`
/// down to an integer value, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element of the intrinsic
/// result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_sd(a: __m128d, b: __m128d) -> __m128d {
    roundsd(a, b, _MM_FROUND_FLOOR)
}

/// Round the lower single-precision (32-bit) floating-point element in `b`
/// down to an integer value, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_ss(a: __m128, b: __m128) -> __m128 {
    roundss(a, b, _MM_FROUND_FLOOR)
}

/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// up to an integer value, and stores the results as packed double-precision
/// floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_pd(a: __m128d) -> __m128d {
    simd_ceil(a)
}

/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// up to an integer value, and stores the results as packed single-precision
/// floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_ps(a: __m128) -> __m128 {
    simd_ceil(a)
}

/// Round the lower double-precision (64-bit) floating-point element in `b`
/// up to an integer value, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element
/// of the intrinsic result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_sd(a: __m128d, b: __m128d) -> __m128d {
    roundsd(a, b, _MM_FROUND_CEIL)
}

/// Round the lower single-precision (32-bit) floating-point element in `b`
/// up to an integer value, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_ss(a: __m128, b: __m128) -> __m128 {
    roundss(a, b, _MM_FROUND_CEIL)
}

/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// using the `ROUNDING` parameter, and stores the results as packed
/// double-precision floating-point elements.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd, ROUNDING = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_pd<const ROUNDING: i32>(a: __m128d) -> __m128d {
    static_assert_imm4!(ROUNDING);
    roundpd(a, ROUNDING)
}

/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// using the `ROUNDING` parameter, and stores the results as packed
/// single-precision floating-point elements.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps, ROUNDING = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_ps<const ROUNDING: i32>(a: __m128) -> __m128 {
    static_assert_imm4!(ROUNDING);
    roundps(a, ROUNDING)
}

/// Round the lower double-precision (64-bit) floating-point element in `b`
/// using the `ROUNDING` parameter, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element of the intrinsic
/// result.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd, ROUNDING = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_sd<const ROUNDING: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_imm4!(ROUNDING);
    roundsd(a, b, ROUNDING)
}

/// Round the lower single-precision (32-bit) floating-point element in `b`
/// using the `ROUNDING` parameter, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss, ROUNDING = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_ss<const ROUNDING: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_imm4!(ROUNDING);
    roundss(a, b, ROUNDING)
}

/// Finds the minimum unsigned 16-bit element in the 128-bit `__m128i`
/// vector, returning a vector containing its value in its first position,
/// and its index in its second position; all other elements are set to
/// zero.
///
/// This intrinsic corresponds to the `VPHMINPOSUW` / `PHMINPOSUW`
/// instruction.
///
/// Arguments:
///
/// * `a` - A 128-bit vector of type `__m128i`.
///
/// Returns:
///
/// A 128-bit value where:
///
/// * bits `[15:0]` - contain the minimum value found in parameter `a`,
/// * bits `[18:16]` - contain the index of the minimum value
/// * remaining bits are set to `0`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16)
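///
/// An illustrative sketch (hypothetical values; assumes `sse4.1` and an
/// `unsafe` context):
///
/// ```ignore
/// let a = _mm_setr_epi16(23, 18, 44, 97, 50, 13, 67, 66);
/// // The minimum is 13 at index 5; the result packs the value in lane 0
/// // and the index in lane 1:
/// let r = _mm_minpos_epu16(a); // 13, 5, 0, 0, 0, 0, 0, 0
/// ```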
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(phminposuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_minpos_epu16(a: __m128i) -> __m128i {
    transmute(phminposuw(a.as_u16x8()))
}

/// Multiplies the low 32-bit integers from each packed 64-bit
/// element in `a` and `b`, and returns the signed 64-bit result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_epi32)
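///
/// A quick sketch (hypothetical values; assumes `sse4.1` and an `unsafe`
/// context): only the low 32-bit integer of each 64-bit element
/// participates.
///
/// ```ignore
/// let a = _mm_setr_epi32(1, 999, 2, 999); // lanes 1 and 3 are ignored
/// let b = _mm_setr_epi32(3, 999, 4, 999);
/// let r = _mm_mul_epi32(a, b); // two i64 lanes: 3, 8
/// ```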
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmuldq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mul_epi32(a: __m128i, b: __m128i) -> __m128i {
    transmute(pmuldq(a.as_i32x4(), b.as_i32x4()))
}

/// Multiplies the packed 32-bit integers in `a` and `b`, producing
/// intermediate 64-bit integers, and returns the low 32 bits of each
/// intermediate result, reinterpreted as a signed integer. While
/// `pmulld(__m128i::splat(2), __m128i::splat(2))` returns the obvious
/// `__m128i::splat(4)`, due to wrapping arithmetic
/// `pmulld(__m128i::splat(i32::MAX), __m128i::splat(2))` returns a
/// negative number.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mullo_epi32)
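///
/// A short sketch of the wrapping behavior (hypothetical values; assumes
/// `sse4.1` and an `unsafe` context):
///
/// ```ignore
/// let a = _mm_set1_epi32(i32::MAX);
/// let b = _mm_set1_epi32(2);
/// // i32::MAX * 2 = 0x1_FFFF_FFFE; keeping the low 32 bits wraps to -2:
/// let r = _mm_mullo_epi32(a, b); // -2 in every lane
/// ```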
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmulld))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mullo_epi32(a: __m128i, b: __m128i) -> __m128i {
    transmute(simd_mul(a.as_i32x4(), b.as_i32x4()))
}

/// Subtracts 8-bit unsigned integer values and computes the absolute
/// values of the differences. The sums of those absolute differences are
/// then returned according to the bit fields in the immediate operand.
///
/// The following algorithm is performed:
///
/// ```ignore
/// i = IMM8[2] * 4
/// j = IMM8[1:0] * 4
/// for k := 0 to 7
///     d0 = abs(a[i + k + 0] - b[j + 0])
///     d1 = abs(a[i + k + 1] - b[j + 1])
///     d2 = abs(a[i + k + 2] - b[j + 2])
///     d3 = abs(a[i + k + 3] - b[j + 3])
///     r[k] = d0 + d1 + d2 + d3
/// ```
///
/// Arguments:
///
/// * `a` - A 128-bit vector of type `__m128i`.
/// * `b` - A 128-bit vector of type `__m128i`.
/// * `IMM8` - An 8-bit immediate operand specifying how the absolute
///   differences are to be calculated
///     * Bit `[2]` specifies the offset for operand `a`
///     * Bits `[1:0]` specify the offset for operand `b`
///
/// Returns:
///
/// * A `__m128i` vector containing the sums of the sets of absolute
///   differences between both operands.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mpsadbw_epu8)
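///
/// A worked sketch (hypothetical values; assumes `sse4.1` and an `unsafe`
/// context). With `IMM8 = 0` both offsets are zero, so for the input below
/// `r[k] = |k - 0| + |k + 1 - 1| + |k + 2 - 2| + |k + 3 - 3| = 4 * k`:
///
/// ```ignore
/// let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
/// let b = a;
/// let r = _mm_mpsadbw_epu8::<0>(a, b); // 0, 4, 8, 12, 16, 20, 24, 28
/// ```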
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(mpsadbw, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mpsadbw_epu8<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
    static_assert_imm3!(IMM8);
    transmute(mpsadbw(a.as_u8x16(), b.as_u8x16(), IMM8 as u8))
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// zeros.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all zeros,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128)
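///
/// A small sketch (hypothetical values; assumes `sse4.1` and an `unsafe`
/// context): the test is `(a & mask) == 0`.
///
/// ```ignore
/// let a = _mm_set1_epi8(0b0101_0101u8 as i8);
/// let mask = _mm_set1_epi8(0b1010_1010u8 as i8);
/// // `a` and `mask` share no set bits, so the selected bits are all zero:
/// assert_eq!(_mm_testz_si128(a, mask), 1);
/// ```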
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 {
    ptestz(a.as_i64x2(), mask.as_i64x2())
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128)
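///
/// A small sketch (hypothetical values; assumes `sse4.1` and an `unsafe`
/// context): the test is `(!a & mask) == 0`, i.e. every bit selected by
/// `mask` is set in `a`.
///
/// ```ignore
/// let a = _mm_set1_epi8(0b1111_0000u8 as i8);
/// let mask = _mm_set1_epi8(0b1100_0000u8 as i8);
/// // All bits selected by `mask` are ones in `a`:
/// assert_eq!(_mm_testc_si128(a, mask), 1);
/// ```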
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_testc_si128(a: __m128i, mask: __m128i) -> i32 {
    ptestc(a.as_i64x2(), mask.as_i64x2())
}

/// Tests whether the specified bits in a 128-bit integer vector are
/// neither all zeros nor all ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are neither all zeros nor all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testnzc_si128)
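///
/// A small sketch (hypothetical values; assumes `sse4.1` and an `unsafe`
/// context): returns `1` when the bits selected by `mask` are a mix of
/// ones and zeros in `a`.
///
/// ```ignore
/// let a = _mm_set1_epi8(0b0101_0101u8 as i8);
/// let mask = _mm_set1_epi8(0b0000_1111u8 as i8);
/// // Within the mask, `a` has both set and clear bits:
/// assert_eq!(_mm_testnzc_si128(a, mask), 1);
/// ```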
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_testnzc_si128(a: __m128i, mask: __m128i) -> i32 {
    ptestnzc(a.as_i64x2(), mask.as_i64x2())
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// zeros.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all zeros,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_test_all_zeros(a: __m128i, mask: __m128i) -> i32 {
    _mm_testz_si128(a, mask)
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// ones.
///
/// Argument:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
///
/// Returns:
///
/// * `1` - if the bits specified in the operand are all set to 1,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pcmpeqd))]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_test_all_ones(a: __m128i) -> i32 {
    _mm_testc_si128(a, _mm_cmpeq_epi32(a, a))
}

/// Tests whether the specified bits in a 128-bit integer vector are
/// neither all zeros nor all ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are neither all zeros nor all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_mix_ones_zeros)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 {
    _mm_testnzc_si128(a, mask)
}

#[allow(improper_ctypes)]
extern "C" {
    #[link_name = "llvm.x86.sse41.pblendvb"]
    fn pblendvb(a: i8x16, b: i8x16, mask: i8x16) -> i8x16;
    #[link_name = "llvm.x86.sse41.blendvpd"]
    fn blendvpd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d;
    #[link_name = "llvm.x86.sse41.blendvps"]
    fn blendvps(a: __m128, b: __m128, mask: __m128) -> __m128;
    #[link_name = "llvm.x86.sse41.blendpd"]
    fn blendpd(a: __m128d, b: __m128d, imm2: u8) -> __m128d;
    #[link_name = "llvm.x86.sse41.blendps"]
    fn blendps(a: __m128, b: __m128, imm4: u8) -> __m128;
    #[link_name = "llvm.x86.sse41.pblendw"]
    fn pblendw(a: i16x8, b: i16x8, imm8: u8) -> i16x8;
    #[link_name = "llvm.x86.sse41.insertps"]
    fn insertps(a: __m128, b: __m128, imm8: u8) -> __m128;
    #[link_name = "llvm.x86.sse41.pmaxsb"]
    fn pmaxsb(a: i8x16, b: i8x16) -> i8x16;
    #[link_name = "llvm.x86.sse41.pmaxuw"]
    fn pmaxuw(a: u16x8, b: u16x8) -> u16x8;
    #[link_name = "llvm.x86.sse41.pmaxsd"]
    fn pmaxsd(a: i32x4, b: i32x4) -> i32x4;
    #[link_name = "llvm.x86.sse41.pmaxud"]
    fn pmaxud(a: u32x4, b: u32x4) -> u32x4;
    #[link_name = "llvm.x86.sse41.pminsb"]
    fn pminsb(a: i8x16, b: i8x16) -> i8x16;
    #[link_name = "llvm.x86.sse41.pminuw"]
    fn pminuw(a: u16x8, b: u16x8) -> u16x8;
    #[link_name = "llvm.x86.sse41.pminsd"]
    fn pminsd(a: i32x4, b: i32x4) -> i32x4;
    #[link_name = "llvm.x86.sse41.pminud"]
    fn pminud(a: u32x4, b: u32x4) -> u32x4;
    #[link_name = "llvm.x86.sse41.packusdw"]
    fn packusdw(a: i32x4, b: i32x4) -> u16x8;
    #[link_name = "llvm.x86.sse41.dppd"]
    fn dppd(a: __m128d, b: __m128d, imm8: u8) -> __m128d;
    #[link_name = "llvm.x86.sse41.dpps"]
    fn dpps(a: __m128, b: __m128, imm8: u8) -> __m128;
    #[link_name = "llvm.x86.sse41.round.pd"]
    fn roundpd(a: __m128d, rounding: i32) -> __m128d;
    #[link_name = "llvm.x86.sse41.round.ps"]
    fn roundps(a: __m128, rounding: i32) -> __m128;
    #[link_name = "llvm.x86.sse41.round.sd"]
    fn roundsd(a: __m128d, b: __m128d, rounding: i32) -> __m128d;
    #[link_name = "llvm.x86.sse41.round.ss"]
    fn roundss(a: __m128, b: __m128, rounding: i32) -> __m128;
    #[link_name = "llvm.x86.sse41.phminposuw"]
    fn phminposuw(a: u16x8) -> u16x8;
    #[link_name = "llvm.x86.sse41.pmuldq"]
    fn pmuldq(a: i32x4, b: i32x4) -> i64x2;
    #[link_name = "llvm.x86.sse41.mpsadbw"]
    fn mpsadbw(a: u8x16, b: u8x16, imm8: u8) -> u16x8;
    #[link_name = "llvm.x86.sse41.ptestz"]
    fn ptestz(a: i64x2, mask: i64x2) -> i32;
    #[link_name = "llvm.x86.sse41.ptestc"]
    fn ptestc(a: i64x2, mask: i64x2) -> i32;
    #[link_name = "llvm.x86.sse41.ptestnzc"]
    fn ptestnzc(a: i64x2, mask: i64x2) -> i32;
}

1148 #[cfg(test)]
1149 mod tests {
1150 use crate::core_arch::x86::*;
1151 use std::mem;
1152 use stdarch_test::simd_test;
1153
1154 #[simd_test(enable = "sse4.1")]
1155 unsafe fn test_mm_blendv_epi8() {
1156 #[rustfmt::skip]
1157 let a = _mm_setr_epi8(
1158 0, 1, 2, 3, 4, 5, 6, 7,
1159 8, 9, 10, 11, 12, 13, 14, 15,
1160 );
1161 #[rustfmt::skip]
1162 let b = _mm_setr_epi8(
1163 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
1164 );
1165 #[rustfmt::skip]
1166 let mask = _mm_setr_epi8(
1167 0, -1, 0, -1, 0, -1, 0, -1,
1168 0, -1, 0, -1, 0, -1, 0, -1,
1169 );
1170 #[rustfmt::skip]
1171 let e = _mm_setr_epi8(
1172 0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31,
1173 );
1174 assert_eq_m128i(_mm_blendv_epi8(a, b, mask), e);
1175 }
1176
1177 #[simd_test(enable = "sse4.1")]
1178 unsafe fn test_mm_blendv_pd() {
1179 let a = _mm_set1_pd(0.0);
1180 let b = _mm_set1_pd(1.0);
1181 let mask = transmute(_mm_setr_epi64x(0, -1));
1182 let r = _mm_blendv_pd(a, b, mask);
1183 let e = _mm_setr_pd(0.0, 1.0);
1184 assert_eq_m128d(r, e);
1185 }
1186
1187 #[simd_test(enable = "sse4.1")]
1188 unsafe fn test_mm_blendv_ps() {
1189 let a = _mm_set1_ps(0.0);
1190 let b = _mm_set1_ps(1.0);
1191 let mask = transmute(_mm_setr_epi32(0, -1, 0, -1));
1192 let r = _mm_blendv_ps(a, b, mask);
1193 let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0);
1194 assert_eq_m128(r, e);
1195 }
1196
1197 #[simd_test(enable = "sse4.1")]
1198 unsafe fn test_mm_blend_pd() {
1199 let a = _mm_set1_pd(0.0);
1200 let b = _mm_set1_pd(1.0);
1201 let r = _mm_blend_pd::<0b10>(a, b);
1202 let e = _mm_setr_pd(0.0, 1.0);
1203 assert_eq_m128d(r, e);
1204 }
1205
1206 #[simd_test(enable = "sse4.1")]
1207 unsafe fn test_mm_blend_ps() {
1208 let a = _mm_set1_ps(0.0);
1209 let b = _mm_set1_ps(1.0);
1210 let r = _mm_blend_ps::<0b1010>(a, b);
1211 let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0);
1212 assert_eq_m128(r, e);
1213 }
1214
1215 #[simd_test(enable = "sse4.1")]
1216 unsafe fn test_mm_blend_epi16() {
1217 let a = _mm_set1_epi16(0);
1218 let b = _mm_set1_epi16(1);
1219 let r = _mm_blend_epi16::<0b1010_1100>(a, b);
1220 let e = _mm_setr_epi16(0, 0, 1, 1, 0, 1, 0, 1);
1221 assert_eq_m128i(r, e);
1222 }
1223
1224 #[simd_test(enable = "sse4.1")]
1225 unsafe fn test_mm_extract_ps() {
1226 let a = _mm_setr_ps(0.0, 1.0, 2.0, 3.0);
1227 let r: f32 = transmute(_mm_extract_ps::<1>(a));
1228 assert_eq!(r, 1.0);
1229 let r: f32 = transmute(_mm_extract_ps::<3>(a));
1230 assert_eq!(r, 3.0);
1231 }
1232
1233 #[simd_test(enable = "sse4.1")]
1234 unsafe fn test_mm_extract_epi8() {
1235 #[rustfmt::skip]
1236 let a = _mm_setr_epi8(
1237 -1, 1, 2, 3, 4, 5, 6, 7,
1238 8, 9, 10, 11, 12, 13, 14, 15
1239 );
1240 let r1 = _mm_extract_epi8::<0>(a);
1241 let r2 = _mm_extract_epi8::<3>(a);
1242 assert_eq!(r1, 0xFF);
1243 assert_eq!(r2, 3);
1244 }
1245
1246 #[simd_test(enable = "sse4.1")]
1247 unsafe fn test_mm_extract_epi32() {
1248 let a = _mm_setr_epi32(0, 1, 2, 3);
1249 let r = _mm_extract_epi32::<1>(a);
1250 assert_eq!(r, 1);
1251 let r = _mm_extract_epi32::<3>(a);
1252 assert_eq!(r, 3);
1253 }
1254
1255 #[simd_test(enable = "sse4.1")]
1256 unsafe fn test_mm_insert_ps() {
1257 let a = _mm_set1_ps(1.0);
1258 let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
1259 let r = _mm_insert_ps::<0b11_00_1100>(a, b);
1260 let e = _mm_setr_ps(4.0, 1.0, 0.0, 0.0);
1261 assert_eq_m128(r, e);
1262 }
1263
1264 #[simd_test(enable = "sse4.1")]
1265 unsafe fn test_mm_insert_epi8() {
1266 let a = _mm_set1_epi8(0);
1267 let e = _mm_setr_epi8(0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
1268 let r = _mm_insert_epi8::<1>(a, 32);
1269 assert_eq_m128i(r, e);
1270 let e = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0);
1271 let r = _mm_insert_epi8::<14>(a, 32);
1272 assert_eq_m128i(r, e);
1273 }
1274
1275 #[simd_test(enable = "sse4.1")]
1276 unsafe fn test_mm_insert_epi32() {
1277 let a = _mm_set1_epi32(0);
1278 let e = _mm_setr_epi32(0, 32, 0, 0);
1279 let r = _mm_insert_epi32::<1>(a, 32);
1280 assert_eq_m128i(r, e);
1281 let e = _mm_setr_epi32(0, 0, 0, 32);
1282 let r = _mm_insert_epi32::<3>(a, 32);
1283 assert_eq_m128i(r, e);
1284 }
1285
1286 #[simd_test(enable = "sse4.1")]
1287 unsafe fn test_mm_max_epi8() {
1288 #[rustfmt::skip]
1289 let a = _mm_setr_epi8(
1290 1, 4, 5, 8, 9, 12, 13, 16,
1291 17, 20, 21, 24, 25, 28, 29, 32,
1292 );
1293 #[rustfmt::skip]
1294 let b = _mm_setr_epi8(
1295 2, 3, 6, 7, 10, 11, 14, 15,
1296 18, 19, 22, 23, 26, 27, 30, 31,
1297 );
1298 let r = _mm_max_epi8(a, b);
1299 #[rustfmt::skip]
1300 let e = _mm_setr_epi8(
1301 2, 4, 6, 8, 10, 12, 14, 16,
1302 18, 20, 22, 24, 26, 28, 30, 32,
1303 );
1304 assert_eq_m128i(r, e);
1305 }
1306
1307 #[simd_test(enable = "sse4.1")]
1308 unsafe fn test_mm_max_epu16() {
1309 let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16);
1310 let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15);
1311 let r = _mm_max_epu16(a, b);
1312 let e = _mm_setr_epi16(2, 4, 6, 8, 10, 12, 14, 16);
1313 assert_eq_m128i(r, e);
1314 }
1315
1316 #[simd_test(enable = "sse4.1")]
1317 unsafe fn test_mm_max_epi32() {
1318 let a = _mm_setr_epi32(1, 4, 5, 8);
1319 let b = _mm_setr_epi32(2, 3, 6, 7);
1320 let r = _mm_max_epi32(a, b);
1321 let e = _mm_setr_epi32(2, 4, 6, 8);
1322 assert_eq_m128i(r, e);
1323 }
1324
1325 #[simd_test(enable = "sse4.1")]
1326 unsafe fn test_mm_max_epu32() {
1327 let a = _mm_setr_epi32(1, 4, 5, 8);
1328 let b = _mm_setr_epi32(2, 3, 6, 7);
1329 let r = _mm_max_epu32(a, b);
1330 let e = _mm_setr_epi32(2, 4, 6, 8);
1331 assert_eq_m128i(r, e);
1332 }
1333
1334 #[simd_test(enable = "sse4.1")]
1335 unsafe fn test_mm_min_epi8_1() {
1336 #[rustfmt::skip]
1337 let a = _mm_setr_epi8(
1338 1, 4, 5, 8, 9, 12, 13, 16,
1339 17, 20, 21, 24, 25, 28, 29, 32,
1340 );
1341 #[rustfmt::skip]
1342 let b = _mm_setr_epi8(
1343 2, 3, 6, 7, 10, 11, 14, 15,
1344 18, 19, 22, 23, 26, 27, 30, 31,
1345 );
1346 let r = _mm_min_epi8(a, b);
1347 #[rustfmt::skip]
1348 let e = _mm_setr_epi8(
1349 1, 3, 5, 7, 9, 11, 13, 15,
1350 17, 19, 21, 23, 25, 27, 29, 31,
1351 );
1352 assert_eq_m128i(r, e);
1353 }
1354
1355 #[simd_test(enable = "sse4.1")]
1356 unsafe fn test_mm_min_epi8_2() {
1357 #[rustfmt::skip]
1358 let a = _mm_setr_epi8(
1359 1, -4, -5, 8, -9, -12, 13, -16,
1360 17, 20, 21, 24, 25, 28, 29, 32,
1361 );
1362 #[rustfmt::skip]
1363 let b = _mm_setr_epi8(
1364 2, -3, -6, 7, -10, -11, 14, -15,
1365 18, 19, 22, 23, 26, 27, 30, 31,
1366 );
1367 let r = _mm_min_epi8(a, b);
1368 #[rustfmt::skip]
1369 let e = _mm_setr_epi8(
1370 1, -4, -6, 7, -10, -12, 13, -16,
1371 17, 19, 21, 23, 25, 27, 29, 31,
1372 );
1373 assert_eq_m128i(r, e);
1374 }
1375
1376 #[simd_test(enable = "sse4.1")]
1377 unsafe fn test_mm_min_epu16() {
1378 let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16);
1379 let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15);
1380 let r = _mm_min_epu16(a, b);
1381 let e = _mm_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15);
1382 assert_eq_m128i(r, e);
1383 }
1384
1385 #[simd_test(enable = "sse4.1")]
1386 unsafe fn test_mm_min_epi32_1() {
1387 let a = _mm_setr_epi32(1, 4, 5, 8);
1388 let b = _mm_setr_epi32(2, 3, 6, 7);
1389 let r = _mm_min_epi32(a, b);
1390 let e = _mm_setr_epi32(1, 3, 5, 7);
1391 assert_eq_m128i(r, e);
1392 }
1393
1394 #[simd_test(enable = "sse4.1")]
1395 unsafe fn test_mm_min_epi32_2() {
1396 let a = _mm_setr_epi32(-1, 4, 5, -7);
1397 let b = _mm_setr_epi32(-2, 3, -6, 8);
1398 let r = _mm_min_epi32(a, b);
1399 let e = _mm_setr_epi32(-2, 3, -6, -7);
1400 assert_eq_m128i(r, e);
1401 }
1402
1403 #[simd_test(enable = "sse4.1")]
1404 unsafe fn test_mm_min_epu32() {
1405 let a = _mm_setr_epi32(1, 4, 5, 8);
1406 let b = _mm_setr_epi32(2, 3, 6, 7);
1407 let r = _mm_min_epu32(a, b);
1408 let e = _mm_setr_epi32(1, 3, 5, 7);
1409 assert_eq_m128i(r, e);
1410 }
1411
1412 #[simd_test(enable = "sse4.1")]
1413 unsafe fn test_mm_packus_epi32() {
1414 let a = _mm_setr_epi32(1, 2, 3, 4);
1415 let b = _mm_setr_epi32(-1, -2, -3, -4);
1416 let r = _mm_packus_epi32(a, b);
1417 let e = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0);
1418 assert_eq_m128i(r, e);
1419 }
1420
1421 #[simd_test(enable = "sse4.1")]
1422 unsafe fn test_mm_cmpeq_epi64() {
1423 let a = _mm_setr_epi64x(0, 1);
1424 let b = _mm_setr_epi64x(0, 0);
1425 let r = _mm_cmpeq_epi64(a, b);
1426 let e = _mm_setr_epi64x(-1, 0);
1427 assert_eq_m128i(r, e);
1428 }
1429
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi8_epi16() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepi8_epi16(a);
        let e = _mm_set1_epi16(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi8(-10);
        let r = _mm_cvtepi8_epi16(a);
        let e = _mm_set1_epi16(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi8_epi32() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepi8_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi8(-10);
        let r = _mm_cvtepi8_epi32(a);
        let e = _mm_set1_epi32(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi8_epi64() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepi8_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi8(-10);
        let r = _mm_cvtepi8_epi64(a);
        let e = _mm_set1_epi64x(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi16_epi32() {
        let a = _mm_set1_epi16(10);
        let r = _mm_cvtepi16_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi16(-10);
        let r = _mm_cvtepi16_epi32(a);
        let e = _mm_set1_epi32(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi16_epi64() {
        let a = _mm_set1_epi16(10);
        let r = _mm_cvtepi16_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi16(-10);
        let r = _mm_cvtepi16_epi64(a);
        let e = _mm_set1_epi64x(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi32_epi64() {
        let a = _mm_set1_epi32(10);
        let r = _mm_cvtepi32_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi32(-10);
        let r = _mm_cvtepi32_epi64(a);
        let e = _mm_set1_epi64x(-10);
        assert_eq_m128i(r, e);
    }

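    // The `_mm_cvtepu*` conversions zero-extend instead; an i8 of -10 would
    // widen to 246, which is why these tests only use positive values.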
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu8_epi16() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepu8_epi16(a);
        let e = _mm_set1_epi16(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu8_epi32() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepu8_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu8_epi64() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepu8_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu16_epi32() {
        let a = _mm_set1_epi16(10);
        let r = _mm_cvtepu16_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu16_epi64() {
        let a = _mm_set1_epi16(10);
        let r = _mm_cvtepu16_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu32_epi64() {
        let a = _mm_set1_epi32(10);
        let r = _mm_cvtepu32_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
    }

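    // For `_mm_dp_pd`, the high nibble of the immediate selects which lanes
    // are multiplied and the low nibble selects which result lanes receive
    // the sum: 0b0011_0001 multiplies both lanes (2*1 + 3*4 = 14) and writes
    // the sum to lane 0 only.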
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_dp_pd() {
        let a = _mm_setr_pd(2.0, 3.0);
        let b = _mm_setr_pd(1.0, 4.0);
        let e = _mm_setr_pd(14.0, 0.0);
        assert_eq_m128d(_mm_dp_pd::<0b00110001>(a, b), e);
    }

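    // Same immediate encoding for `_mm_dp_ps`: 0b0111_0101 multiplies lanes
    // 0..=2 (2*1 + 3*4 + 1*0.5 = 14.5) and broadcasts the sum to lanes 0 and 2.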
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_dp_ps() {
        let a = _mm_setr_ps(2.0, 3.0, 1.0, 10.0);
        let b = _mm_setr_ps(1.0, 4.0, 0.5, 10.0);
        let e = _mm_setr_ps(14.5, 0.0, 14.5, 0.0);
        assert_eq_m128(_mm_dp_ps::<0b01110101>(a, b), e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_floor_pd() {
        let a = _mm_setr_pd(2.5, 4.5);
        let r = _mm_floor_pd(a);
        let e = _mm_setr_pd(2.0, 4.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_floor_ps() {
        let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5);
        let r = _mm_floor_ps(a);
        let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0);
        assert_eq_m128(r, e);
    }

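    // The scalar variants (`_sd`/`_ss`) round only lane 0 of `b` and copy
    // the remaining lanes from `a`.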
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_floor_sd() {
        let a = _mm_setr_pd(2.5, 4.5);
        let b = _mm_setr_pd(-1.5, -3.5);
        let r = _mm_floor_sd(a, b);
        let e = _mm_setr_pd(-2.0, 4.5);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_floor_ss() {
        let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5);
        let b = _mm_setr_ps(-1.5, -3.5, -7.5, -15.5);
        let r = _mm_floor_ss(a, b);
        let e = _mm_setr_ps(-2.0, 4.5, 8.5, 16.5);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_ceil_pd() {
        let a = _mm_setr_pd(1.5, 3.5);
        let r = _mm_ceil_pd(a);
        let e = _mm_setr_pd(2.0, 4.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_ceil_ps() {
        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
        let r = _mm_ceil_ps(a);
        let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0);
        assert_eq_m128(r, e);
    }

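    // ceil rounds toward +infinity, so ceil(-2.5) = -2.0 in lane 0.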
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_ceil_sd() {
        let a = _mm_setr_pd(1.5, 3.5);
        let b = _mm_setr_pd(-2.5, -4.5);
        let r = _mm_ceil_sd(a, b);
        let e = _mm_setr_pd(-2.0, 3.5);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_ceil_ss() {
        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
        let b = _mm_setr_ps(-2.5, -4.5, -8.5, -16.5);
        let r = _mm_ceil_ss(a, b);
        let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_round_pd() {
        let a = _mm_setr_pd(1.25, 3.75);
        let r = _mm_round_pd::<_MM_FROUND_TO_NEAREST_INT>(a);
        let e = _mm_setr_pd(1.0, 4.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_round_ps() {
        let a = _mm_setr_ps(2.25, 4.75, -1.75, -4.25);
        let r = _mm_round_ps::<_MM_FROUND_TO_ZERO>(a);
        let e = _mm_setr_ps(2.0, 4.0, -1.0, -4.0);
        assert_eq_m128(r, e);
    }

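    // `_MM_FROUND_CUR_DIRECTION` defers to MXCSR.RC, so these tests pin the
    // rounding mode explicitly and restore the previous mode afterwards.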
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_round_sd() {
        let a = _mm_setr_pd(1.5, 3.5);
        let b = _mm_setr_pd(-2.5, -4.5);
        let old_mode = _MM_GET_ROUNDING_MODE();
        _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
        let r = _mm_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, b);
        _MM_SET_ROUNDING_MODE(old_mode);
        let e = _mm_setr_pd(-2.0, 3.5);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_round_ss() {
        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
        let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5);
        let old_mode = _MM_GET_ROUNDING_MODE();
        _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
        let r = _mm_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, b);
        _MM_SET_ROUNDING_MODE(old_mode);
        let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5);
        assert_eq_m128(r, e);
    }

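    // `_mm_minpos_epu16` returns the smallest u16 in lane 0, its index in
    // lane 1, and zeroes the remaining lanes.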
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_minpos_epu16_1() {
        let a = _mm_setr_epi16(23, 18, 44, 97, 50, 13, 67, 66);
        let r = _mm_minpos_epu16(a);
        let e = _mm_setr_epi16(13, 5, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_minpos_epu16_2() {
        let a = _mm_setr_epi16(0, 18, 44, 97, 50, 13, 67, 66);
        let r = _mm_minpos_epu16(a);
        let e = _mm_setr_epi16(0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

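    // `_mm_mul_epi32` multiplies only lanes 0 and 2, sign-extending each to
    // i64; lanes 1 and 3 are ignored.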
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_mul_epi32() {
        {
            let a = _mm_setr_epi32(1, 1, 1, 1);
            let b = _mm_setr_epi32(1, 2, 3, 4);
            let r = _mm_mul_epi32(a, b);
            let e = _mm_setr_epi64x(1, 3);
            assert_eq_m128i(r, e);
        }
        {
            let a = _mm_setr_epi32(15, 2 /* ignored */, 1234567, 4 /* ignored */);
            let b = _mm_setr_epi32(
                -20, -256, /* ignored */
                666666, 666666, /* ignored */
            );
            let r = _mm_mul_epi32(a, b);
            let e = _mm_setr_epi64x(-300, 823043843622);
            assert_eq_m128i(r, e);
        }
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_mullo_epi32() {
        {
            let a = _mm_setr_epi32(1, 1, 1, 1);
            let b = _mm_setr_epi32(1, 2, 3, 4);
            let r = _mm_mullo_epi32(a, b);
            let e = _mm_setr_epi32(1, 2, 3, 4);
            assert_eq_m128i(r, e);
        }
        {
            let a = _mm_setr_epi32(15, -2, 1234567, 99999);
            let b = _mm_setr_epi32(-20, -256, 666666, -99999);
            let r = _mm_mullo_epi32(a, b);
            // Note: only the low 32 bits of each product are kept, and the
            // most significant bit of r[2] ends up set, so the lane reads
            // as a negative i32: 1234567 * 666666 (mod 2^32) = -1589877210.
            let e = _mm_setr_epi32(-300, 512, -1589877210, -1409865409);
            assert_eq_m128i(r, e);
        }
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_minpos_epu16() {
        let a = _mm_setr_epi16(8, 7, 6, 5, 4, 1, 2, 3);
        let r = _mm_minpos_epu16(a);
        let e = _mm_setr_epi16(1, 5, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

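    // `_mm_mpsadbw_epu8` immediate: bit 2 selects a 4-byte offset into `a`
    // (0 or 4) and bits 1:0 select one aligned 4-byte block of `b`; each of
    // the eight results is the sum of absolute differences between that
    // block and a 4-byte window of `a` sliding one byte at a time.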
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_mpsadbw_epu8() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );

        let r = _mm_mpsadbw_epu8::<0b000>(a, a);
        let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28);
        assert_eq_m128i(r, e);

        let r = _mm_mpsadbw_epu8::<0b001>(a, a);
        let e = _mm_setr_epi16(16, 12, 8, 4, 0, 4, 8, 12);
        assert_eq_m128i(r, e);

        let r = _mm_mpsadbw_epu8::<0b100>(a, a);
        let e = _mm_setr_epi16(16, 20, 24, 28, 32, 36, 40, 44);
        assert_eq_m128i(r, e);

        let r = _mm_mpsadbw_epu8::<0b101>(a, a);
        let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28);
        assert_eq_m128i(r, e);

        let r = _mm_mpsadbw_epu8::<0b111>(a, a);
        let e = _mm_setr_epi16(32, 28, 24, 20, 16, 12, 8, 4);
        assert_eq_m128i(r, e);
    }

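    // `_mm_testz_si128` returns 1 iff `a & mask` is all zeros (the ZF
    // result of PTEST).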
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_testz_si128() {
        let a = _mm_set1_epi8(1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_testz_si128(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_testz_si128(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b011);
        let mask = _mm_set1_epi8(0b100);
        let r = _mm_testz_si128(a, mask);
        assert_eq!(r, 1);
    }

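    // `_mm_testc_si128` returns 1 iff `!a & mask` is all zeros (the CF
    // result of PTEST), i.e. every bit set in `mask` is also set in `a`.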
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_testc_si128() {
        let a = _mm_set1_epi8(-1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_testc_si128(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_testc_si128(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b100);
        let r = _mm_testc_si128(a, mask);
        assert_eq!(r, 1);
    }

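    // `_mm_testnzc_si128` returns 1 iff both `a & mask` and `!a & mask` are
    // nonzero, i.e. `mask` selects a mix of ones and zeros from `a`.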
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_testnzc_si128() {
        let a = _mm_set1_epi8(0);
        let mask = _mm_set1_epi8(1);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(-1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b101);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 0);
    }

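    // `_mm_test_all_zeros(a, mask)` behaves like `_mm_testz_si128(a, mask)`.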
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_test_all_zeros() {
        let a = _mm_set1_epi8(1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_test_all_zeros(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_test_all_zeros(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b011);
        let mask = _mm_set1_epi8(0b100);
        let r = _mm_test_all_zeros(a, mask);
        assert_eq!(r, 1);
    }

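    // `_mm_test_all_ones` returns 1 iff all 128 bits of `a` are set.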
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_test_all_ones() {
        let a = _mm_set1_epi8(-1);
        let r = _mm_test_all_ones(a);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let r = _mm_test_all_ones(a);
        assert_eq!(r, 0);
    }

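    // `_mm_test_mix_ones_zeros(a, mask)` behaves like
    // `_mm_testnzc_si128(a, mask)`.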
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_test_mix_ones_zeros() {
        let a = _mm_set1_epi8(0);
        let mask = _mm_set1_epi8(1);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(-1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b101);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 0);
    }
}