//! Streaming SIMD Extensions 2 (SSE2)

#[cfg(test)]
use stdsimd_test::assert_instr;

use coresimd::simd::*;
use coresimd::simd_llvm::*;
use coresimd::x86::*;
use intrinsics;
use mem;
use ptr;

/// Provide a hint to the processor that the code sequence is a spin-wait loop.
///
/// This can help improve the performance and power consumption of spin-wait
/// loops.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_pause)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pause))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_pause() {
    pause()
}

/// Invalidate and flush the cache line that contains `p` from all levels of
/// the cache hierarchy.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_clflush)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(clflush))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_clflush(p: *mut u8) {
    clflush(p)
}

/// Perform a serializing operation on all load-from-memory instructions
/// that were issued prior to this instruction.
///
/// Guarantees that every load instruction that precedes, in program order,
/// the load-fence instruction is globally visible before any load instruction
/// which follows the fence in program order.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lfence)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(lfence))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_lfence() {
    lfence()
}

/// Perform a serializing operation on all load-from-memory and store-to-memory
/// instructions that were issued prior to this instruction.
///
/// Guarantees that every memory access that precedes, in program order, the
/// memory fence instruction is globally visible before any memory instruction
/// which follows the fence in program order.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mfence)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(mfence))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mfence() {
    mfence()
}

/// Add packed 8-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_add_epi8(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(simd_add(a.as_i8x16(), b.as_i8x16()))
}

/// Add packed 16-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_add_epi16(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(simd_add(a.as_i16x8(), b.as_i16x8()))
}

/// Add packed 32-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_add_epi32(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(simd_add(a.as_i32x4(), b.as_i32x4()))
}

/// Add packed 64-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_add_epi64(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(simd_add(a.as_i64x2(), b.as_i64x2()))
}

/// Add packed 8-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_adds_epi8(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(paddsb(a.as_i8x16(), b.as_i8x16()))
}
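
// Illustrative sketch (not part of the upstream file): contrasting wrapping
// and saturating byte addition. Assumes an SSE2-capable target.
#[allow(dead_code)]
#[target_feature(enable = "sse2")]
unsafe fn example_saturating_add() {
    let a = _mm_set1_epi8(0x7F); // i8::MAX in every lane
    let b = _mm_set1_epi8(1);
    let wrapped = _mm_add_epi8(a, b); // every lane wraps around to -128
    let saturated = _mm_adds_epi8(a, b); // every lane is clamped at 127
    let _ = (wrapped, saturated);
}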

/// Add packed 16-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_adds_epi16(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(paddsw(a.as_i16x8(), b.as_i16x8()))
}

/// Add packed unsigned 8-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epu8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddusb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_adds_epu8(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(paddsub(a.as_u8x16(), b.as_u8x16()))
}

/// Add packed unsigned 16-bit integers in `a` and `b` using saturation.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epu16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(paddusw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_adds_epu16(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(paddsuw(a.as_u16x8(), b.as_u16x8()))
}

/// Average packed unsigned 8-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_epu8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pavgb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(pavgb(a.as_u8x16(), b.as_u8x16()))
}

/// Average packed unsigned 16-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_epu16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pavgw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(pavgw(a.as_u16x8(), b.as_u16x8()))
}

/// Multiply and then horizontally add signed 16-bit integers in `a` and `b`.
///
/// Multiply packed signed 16-bit integers in `a` and `b`, producing
/// intermediate signed 32-bit integers. Horizontally add adjacent pairs of
/// intermediate 32-bit integers.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_madd_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmaddwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(pmaddwd(a.as_i16x8(), b.as_i16x8()))
}
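
// Illustrative sketch (not part of the upstream file): `_mm_madd_epi16` is
// the core of an integer dot product, squaring here for simplicity.
#[allow(dead_code)]
#[target_feature(enable = "sse2")]
unsafe fn example_madd() -> __m128i {
    let a = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
    // Result lanes, low to high: 1*1 + 2*2 = 5, 3*3 + 4*4 = 25,
    // 5*5 + 6*6 = 61, 7*7 + 8*8 = 113.
    _mm_madd_epi16(a, a)
}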

/// Compare packed 16-bit integers in `a` and `b`, and return the packed
/// maximum values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmaxsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epi16(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(pmaxsw(a.as_i16x8(), b.as_i16x8()))
}

/// Compare packed unsigned 8-bit integers in `a` and `b`, and return the
/// packed maximum values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmaxub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epu8(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(pmaxub(a.as_u8x16(), b.as_u8x16()))
}

/// Compare packed 16-bit integers in `a` and `b`, and return the packed
/// minimum values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pminsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epi16(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(pminsw(a.as_i16x8(), b.as_i16x8()))
}

/// Compare packed unsigned 8-bit integers in `a` and `b`, and return the
/// packed minimum values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pminub))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epu8(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(pminub(a.as_u8x16(), b.as_u8x16()))
}

/// Multiply the packed 16-bit integers in `a` and `b`.
///
/// The multiplication produces intermediate 32-bit integers, and returns the
/// high 16 bits of the intermediate integers.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmulhw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(pmulhw(a.as_i16x8(), b.as_i16x8()))
}
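
// Illustrative sketch (not part of the upstream file): the high half of each
// 16 x 16 -> 32 bit product.
#[allow(dead_code)]
#[target_feature(enable = "sse2")]
unsafe fn example_mulhi() -> __m128i {
    let a = _mm_set1_epi16(20_000);
    let b = _mm_set1_epi16(16_384);
    // Full product per lane is 20_000 * 16_384 = 327_680_000; the returned
    // high half is 327_680_000 >> 16 = 5_000 in every lane.
    _mm_mulhi_epi16(a, b)
}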

/// Multiply the packed unsigned 16-bit integers in `a` and `b`.
///
/// The multiplication produces intermediate 32-bit integers, and returns the
/// high 16 bits of the intermediate integers.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_epu16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmulhuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mulhi_epu16(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(pmulhuw(a.as_u16x8(), b.as_u16x8()))
}

/// Multiply the packed 16-bit integers in `a` and `b`.
///
/// The multiplication produces intermediate 32-bit integers, and returns the
/// low 16 bits of the intermediate integers.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mullo_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmullw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(simd_mul(a.as_i16x8(), b.as_i16x8()))
}

/// Multiply the low unsigned 32-bit integers from each packed 64-bit element
/// in `a` and `b`.
///
/// Return the unsigned 64-bit results.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_epu32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmuludq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(pmuludq(a.as_u32x4(), b.as_u32x4()))
}
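
// Illustrative sketch (not part of the upstream file): `pmuludq` reads only
// the even 32-bit lanes (indices 0 and 2) and produces full 64-bit products.
#[allow(dead_code)]
#[target_feature(enable = "sse2")]
unsafe fn example_mul_epu32() -> __m128i {
    let a = _mm_set_epi32(0, 2, 0, 3_000_000_000u32 as i32);
    let b = _mm_set_epi32(0, 4, 0, 2);
    // Result lanes (as u64): 3_000_000_000 * 2 = 6_000_000_000, which does
    // not fit in 32 bits, and 2 * 4 = 8. The odd input lanes are ignored.
    _mm_mul_epu32(a, b)
}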

/// Sum the absolute differences of packed unsigned 8-bit integers.
///
/// Compute the absolute differences of packed unsigned 8-bit integers in `a`
/// and `b`, then horizontally sum each consecutive group of 8 differences to
/// produce two unsigned 16-bit integers, and pack these unsigned 16-bit
/// integers into the low 16 bits of the two 64-bit elements returned.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psadbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sad_epu8(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(psadbw(a.as_u8x16(), b.as_u8x16()))
}
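
// Illustrative sketch (not part of the upstream file): each eight-byte group
// of absolute differences collapses into one 16-bit sum per 64-bit lane.
#[allow(dead_code)]
#[target_feature(enable = "sse2")]
unsafe fn example_sad() -> __m128i {
    let a = _mm_set1_epi8(10);
    let b = _mm_set1_epi8(7);
    // |10 - 7| = 3 per byte; each group of eight bytes sums to 24, so both
    // 64-bit result lanes hold 24.
    _mm_sad_epu8(a, b)
}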

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sub_epi8(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(simd_sub(a.as_i8x16(), b.as_i8x16()))
}

/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sub_epi16(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(simd_sub(a.as_i16x8(), b.as_i16x8()))
}

/// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sub_epi32(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(simd_sub(a.as_i32x4(), b.as_i32x4()))
}

/// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sub_epi64(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(simd_sub(a.as_i64x2(), b.as_i64x2()))
}

/// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`
/// using saturation.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_subs_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_subs_epi8(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(psubsb(a.as_i8x16(), b.as_i8x16()))
}

/// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`
/// using saturation.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_subs_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubsw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_subs_epi16(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(psubsw(a.as_i16x8(), b.as_i16x8()))
}

/// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit
/// integers in `a` using saturation.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_subs_epu8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubusb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_subs_epu8(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(psubusb(a.as_u8x16(), b.as_u8x16()))
}

/// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit
/// integers in `a` using saturation.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_subs_epu16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psubusw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_subs_epu16(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(psubusw(a.as_u16x8(), b.as_u16x8()))
}

/// Shift `a` left by `imm8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pslldq, imm8 = 1))]
#[rustc_args_required_const(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_slli_si128(a: __m128i, imm8: i32) -> __m128i {
    _mm_slli_si128_impl(a, imm8)
}

/// Implementation detail: converts the immediate argument of the
/// `_mm_slli_si128` intrinsic into a compile-time constant.
#[inline]
#[target_feature(enable = "sse2")]
unsafe fn _mm_slli_si128_impl(a: __m128i, imm8: i32) -> __m128i {
    let (zero, imm8) = (_mm_set1_epi8(0).as_i8x16(), imm8 as u32);
    let a = a.as_i8x16();
    macro_rules! shuffle {
        ($shift:expr) => {
            simd_shuffle16::<i8x16, i8x16>(
                zero,
                a,
                [
                    16 - $shift, 17 - $shift, 18 - $shift, 19 - $shift,
                    20 - $shift, 21 - $shift, 22 - $shift, 23 - $shift,
                    24 - $shift, 25 - $shift, 26 - $shift, 27 - $shift,
                    28 - $shift, 29 - $shift, 30 - $shift, 31 - $shift,
                ],
            )
        };
    }
    let x = match imm8 {
        0 => shuffle!(0),
        1 => shuffle!(1),
        2 => shuffle!(2),
        3 => shuffle!(3),
        4 => shuffle!(4),
        5 => shuffle!(5),
        6 => shuffle!(6),
        7 => shuffle!(7),
        8 => shuffle!(8),
        9 => shuffle!(9),
        10 => shuffle!(10),
        11 => shuffle!(11),
        12 => shuffle!(12),
        13 => shuffle!(13),
        14 => shuffle!(14),
        15 => shuffle!(15),
        _ => shuffle!(16),
    };
    mem::transmute(x)
}
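
// Illustrative sketch (not part of the upstream file): a byte shift moves
// whole lanes, so shifting left by 4 bytes moves each 32-bit lane up by one
// position and zero-fills the lowest lane.
#[allow(dead_code)]
#[target_feature(enable = "sse2")]
unsafe fn example_byte_shift() -> __m128i {
    let a = _mm_set_epi32(4, 3, 2, 1); // lanes, low to high: 1, 2, 3, 4
    // After shifting left by 4 bytes the lanes are 0, 1, 2, 3.
    _mm_slli_si128(a, 4)
}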

/// Shift `a` left by `imm8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bslli_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pslldq, imm8 = 1))]
#[rustc_args_required_const(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_bslli_si128(a: __m128i, imm8: i32) -> __m128i {
    _mm_slli_si128_impl(a, imm8)
}

/// Shift `a` right by `imm8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bsrli_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrldq, imm8 = 1))]
#[rustc_args_required_const(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_bsrli_si128(a: __m128i, imm8: i32) -> __m128i {
    _mm_srli_si128_impl(a, imm8)
}

/// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psllw, imm8 = 7))]
#[rustc_args_required_const(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_slli_epi16(a: __m128i, imm8: i32) -> __m128i {
    mem::transmute(pslliw(a.as_i16x8(), imm8))
}

/// Shift packed 16-bit integers in `a` left by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psllw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sll_epi16(a: __m128i, count: __m128i) -> __m128i {
    mem::transmute(psllw(a.as_i16x8(), count.as_i16x8()))
}

/// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pslld, imm8 = 7))]
#[rustc_args_required_const(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_slli_epi32(a: __m128i, imm8: i32) -> __m128i {
    mem::transmute(psllid(a.as_i32x4(), imm8))
}

/// Shift packed 32-bit integers in `a` left by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pslld))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sll_epi32(a: __m128i, count: __m128i) -> __m128i {
    mem::transmute(pslld(a.as_i32x4(), count.as_i32x4()))
}

/// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psllq, imm8 = 7))]
#[rustc_args_required_const(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_slli_epi64(a: __m128i, imm8: i32) -> __m128i {
    mem::transmute(pslliq(a.as_i64x2(), imm8))
}

/// Shift packed 64-bit integers in `a` left by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psllq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sll_epi64(a: __m128i, count: __m128i) -> __m128i {
    mem::transmute(psllq(a.as_i64x2(), count.as_i64x2()))
}

/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign
/// bits.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psraw, imm8 = 1))]
#[rustc_args_required_const(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_srai_epi16(a: __m128i, imm8: i32) -> __m128i {
    mem::transmute(psraiw(a.as_i16x8(), imm8))
}
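
// Illustrative sketch (not part of the upstream file): arithmetic vs. logical
// right shift on a negative value.
#[allow(dead_code)]
#[target_feature(enable = "sse2")]
unsafe fn example_right_shifts() {
    let a = _mm_set1_epi16(-16);
    let arithmetic = _mm_srai_epi16(a, 2); // sign bits shift in: -16 >> 2 = -4
    let logical = _mm_srli_epi16(a, 2); // zeros shift in: 0xFFF0 >> 2 = 0x3FFC
    let _ = (arithmetic, logical);
}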

/// Shift packed 16-bit integers in `a` right by `count` while shifting in sign
/// bits.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sra_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psraw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sra_epi16(a: __m128i, count: __m128i) -> __m128i {
    mem::transmute(psraw(a.as_i16x8(), count.as_i16x8()))
}

/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign
/// bits.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrad, imm8 = 1))]
#[rustc_args_required_const(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_srai_epi32(a: __m128i, imm8: i32) -> __m128i {
    mem::transmute(psraid(a.as_i32x4(), imm8))
}

/// Shift packed 32-bit integers in `a` right by `count` while shifting in sign
/// bits.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sra_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrad))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sra_epi32(a: __m128i, count: __m128i) -> __m128i {
    mem::transmute(psrad(a.as_i32x4(), count.as_i32x4()))
}

/// Shift `a` right by `imm8` bytes while shifting in zeros.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrldq, imm8 = 1))]
#[rustc_args_required_const(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_srli_si128(a: __m128i, imm8: i32) -> __m128i {
    _mm_srli_si128_impl(a, imm8)
}

/// Implementation detail: converts the immediate argument of the
/// `_mm_srli_si128` intrinsic into a compile-time constant.
#[inline]
#[target_feature(enable = "sse2")]
unsafe fn _mm_srli_si128_impl(a: __m128i, imm8: i32) -> __m128i {
    let (zero, imm8) = (_mm_set1_epi8(0).as_i8x16(), imm8 as u32);
    let a = a.as_i8x16();
    macro_rules! shuffle {
        ($shift:expr) => {
            simd_shuffle16(
                a,
                zero,
                [
                    0 + $shift, 1 + $shift, 2 + $shift, 3 + $shift,
                    4 + $shift, 5 + $shift, 6 + $shift, 7 + $shift,
                    8 + $shift, 9 + $shift, 10 + $shift, 11 + $shift,
                    12 + $shift, 13 + $shift, 14 + $shift, 15 + $shift,
                ],
            )
        };
    }
    let x: i8x16 = match imm8 {
        0 => shuffle!(0),
        1 => shuffle!(1),
        2 => shuffle!(2),
        3 => shuffle!(3),
        4 => shuffle!(4),
        5 => shuffle!(5),
        6 => shuffle!(6),
        7 => shuffle!(7),
        8 => shuffle!(8),
        9 => shuffle!(9),
        10 => shuffle!(10),
        11 => shuffle!(11),
        12 => shuffle!(12),
        13 => shuffle!(13),
        14 => shuffle!(14),
        15 => shuffle!(15),
        _ => shuffle!(16),
    };
    mem::transmute(x)
}

/// Shift packed 16-bit integers in `a` right by `imm8` while shifting in
/// zeros.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrlw, imm8 = 1))]
#[rustc_args_required_const(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_srli_epi16(a: __m128i, imm8: i32) -> __m128i {
    mem::transmute(psrliw(a.as_i16x8(), imm8))
}

/// Shift packed 16-bit integers in `a` right by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrlw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_srl_epi16(a: __m128i, count: __m128i) -> __m128i {
    mem::transmute(psrlw(a.as_i16x8(), count.as_i16x8()))
}

/// Shift packed 32-bit integers in `a` right by `imm8` while shifting in
/// zeros.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrld, imm8 = 8))]
#[rustc_args_required_const(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_srli_epi32(a: __m128i, imm8: i32) -> __m128i {
    mem::transmute(psrlid(a.as_i32x4(), imm8))
}

/// Shift packed 32-bit integers in `a` right by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrld))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_srl_epi32(a: __m128i, count: __m128i) -> __m128i {
    mem::transmute(psrld(a.as_i32x4(), count.as_i32x4()))
}

/// Shift packed 64-bit integers in `a` right by `imm8` while shifting in
/// zeros.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrlq, imm8 = 1))]
#[rustc_args_required_const(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_srli_epi64(a: __m128i, imm8: i32) -> __m128i {
    mem::transmute(psrliq(a.as_i64x2(), imm8))
}

/// Shift packed 64-bit integers in `a` right by `count` while shifting in
/// zeros.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(psrlq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_srl_epi64(a: __m128i, count: __m128i) -> __m128i {
    mem::transmute(psrlq(a.as_i64x2(), count.as_i64x2()))
}

/// Compute the bitwise AND of 128 bits (representing integer data) in `a` and
/// `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(andps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_and_si128(a: __m128i, b: __m128i) -> __m128i {
    simd_and(a, b)
}

/// Compute the bitwise NOT of 128 bits (representing integer data) in `a` and
/// then AND with `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(andnps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_andnot_si128(a: __m128i, b: __m128i) -> __m128i {
    simd_and(simd_xor(_mm_set1_epi8(-1), a), b)
}
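
// Illustrative sketch (not part of the upstream file): `andnot` clears
// exactly the bits that are set in the first operand, a common masking idiom.
#[allow(dead_code)]
#[target_feature(enable = "sse2")]
unsafe fn example_andnot() -> __m128i {
    let mask = _mm_set1_epi8(0x0F);
    let data = _mm_set1_epi8(0x77);
    // (!0x0F) & 0x77 = 0x70 in every byte: the low nibble is cleared.
    _mm_andnot_si128(mask, data)
}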

/// Compute the bitwise OR of 128 bits (representing integer data) in `a` and
/// `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_or_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(orps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_or_si128(a: __m128i, b: __m128i) -> __m128i {
    simd_or(a, b)
}

/// Compute the bitwise XOR of 128 bits (representing integer data) in `a` and
/// `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(xorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_xor_si128(a: __m128i, b: __m128i) -> __m128i {
    simd_xor(a, b)
}

/// Compare packed 8-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpeqb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpeq_epi8(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute::<i8x16, _>(simd_eq(a.as_i8x16(), b.as_i8x16()))
}

/// Compare packed 16-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpeqw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpeq_epi16(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute::<i16x8, _>(simd_eq(a.as_i16x8(), b.as_i16x8()))
}

/// Compare packed 32-bit integers in `a` and `b` for equality.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpeqd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpeq_epi32(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute::<i32x4, _>(simd_eq(a.as_i32x4(), b.as_i32x4()))
}

/// Compare packed 8-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpgt_epi8(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute::<i8x16, _>(simd_gt(a.as_i8x16(), b.as_i8x16()))
}

/// Compare packed 16-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpgt_epi16(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute::<i16x8, _>(simd_gt(a.as_i16x8(), b.as_i16x8()))
}

/// Compare packed 32-bit integers in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpgt_epi32(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute::<i32x4, _>(simd_gt(a.as_i32x4(), b.as_i32x4()))
}

/// Compare packed 8-bit integers in `a` and `b` for less-than.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmplt_epi8(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute::<i8x16, _>(simd_lt(a.as_i8x16(), b.as_i8x16()))
}
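
// Illustrative note (not part of the upstream file): there is no `pcmpltb`
// instruction; as the `assert_instr(pcmpgtb)` above hints, `a < b` is emitted
// as `b > a` with the operands swapped.
#[allow(dead_code)]
#[target_feature(enable = "sse2")]
unsafe fn example_cmplt() {
    let a = _mm_set1_epi8(1);
    let b = _mm_set1_epi8(2);
    let lt = _mm_cmplt_epi8(a, b); // all lanes 0xFF (true)
    let gt = _mm_cmpgt_epi8(b, a); // identical result
    let _ = (lt, gt);
}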

/// Compare packed 16-bit integers in `a` and `b` for less-than.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmplt_epi16(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute::<i16x8, _>(simd_lt(a.as_i16x8(), b.as_i16x8()))
}

/// Compare packed 32-bit integers in `a` and `b` for less-than.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pcmpgtd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmplt_epi32(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute::<i32x4, _>(simd_lt(a.as_i32x4(), b.as_i32x4()))
}

/// Convert the lower two packed 32-bit integers in `a` to packed
/// double-precision (64-bit) floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtdq2pd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi32_pd(a: __m128i) -> __m128d {
    let a = a.as_i32x4();
    simd_cast::<i32x2, __m128d>(simd_shuffle2(a, a, [0, 1]))
}

/// Return `a` with its lower element replaced by `b` after converting it to
/// an `f64`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtsi2sd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtsi32_sd(a: __m128d, b: i32) -> __m128d {
    simd_insert(a, 0, b as f64)
}

/// Convert packed 32-bit integers in `a` to packed single-precision (32-bit)
/// floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_ps)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtdq2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi32_ps(a: __m128i) -> __m128 {
    cvtdq2ps(a.as_i32x4())
}

/// Convert packed single-precision (32-bit) floating-point elements in `a`
/// to packed 32-bit integers.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtps2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtps_epi32(a: __m128) -> __m128i {
    mem::transmute(cvtps2dq(a))
}

/// Return a vector whose lowest element is `a` and all higher elements are
/// `0`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(movd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtsi32_si128(a: i32) -> __m128i {
    mem::transmute(i32x4::new(a, 0, 0, 0))
}

/// Return the lowest element of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtsi128_si32(a: __m128i) -> i32 {
    simd_extract(a.as_i32x4(), 0)
}

/// Set packed 64-bit integers with the supplied values, from highest to
/// lowest.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_epi64x)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_set_epi64x(e1: i64, e0: i64) -> __m128i {
    mem::transmute(i64x2::new(e0, e1))
}

/// Set packed 32-bit integers with the supplied values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_epi32)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i {
    mem::transmute(i32x4::new(e0, e1, e2, e3))
}
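
// Illustrative sketch (not part of the upstream file): `_mm_set_epi32` takes
// its arguments from the highest element down to the lowest, so element 0 is
// the *last* argument.
#[allow(dead_code)]
#[target_feature(enable = "sse2")]
unsafe fn example_set_order() {
    let v = _mm_set_epi32(3, 2, 1, 0);
    let lowest: i32 = simd_extract(v.as_i32x4(), 0); // 0, not 3
    let _ = lowest;
}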

/// Set packed 16-bit integers with the supplied values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_epi16)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_set_epi16(
    e7: i16,
    e6: i16,
    e5: i16,
    e4: i16,
    e3: i16,
    e2: i16,
    e1: i16,
    e0: i16,
) -> __m128i {
    mem::transmute(i16x8::new(e0, e1, e2, e3, e4, e5, e6, e7))
}

/// Set packed 8-bit integers with the supplied values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_epi8)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_set_epi8(
    e15: i8,
    e14: i8,
    e13: i8,
    e12: i8,
    e11: i8,
    e10: i8,
    e9: i8,
    e8: i8,
    e7: i8,
    e6: i8,
    e5: i8,
    e4: i8,
    e3: i8,
    e2: i8,
    e1: i8,
    e0: i8,
) -> __m128i {
    #[rustfmt::skip]
    mem::transmute(i8x16::new(
        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
    ))
}

/// Broadcast 64-bit integer `a` to all elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_set1_epi64x(a: i64) -> __m128i {
    _mm_set_epi64x(a, a)
}

/// Broadcast 32-bit integer `a` to all elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi32)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_set1_epi32(a: i32) -> __m128i {
    _mm_set_epi32(a, a, a, a)
}

/// Broadcast 16-bit integer `a` to all elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi16)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_set1_epi16(a: i16) -> __m128i {
    _mm_set_epi16(a, a, a, a, a, a, a, a)
}

/// Broadcast 8-bit integer `a` to all elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi8)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_set1_epi8(a: i8) -> __m128i {
    _mm_set_epi8(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a)
}

/// Set packed 32-bit integers with the supplied values in reverse order.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi32)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i {
    _mm_set_epi32(e0, e1, e2, e3)
}

/// Set packed 16-bit integers with the supplied values in reverse order.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi16)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_setr_epi16(
    e7: i16,
    e6: i16,
    e5: i16,
    e4: i16,
    e3: i16,
    e2: i16,
    e1: i16,
    e0: i16,
) -> __m128i {
    _mm_set_epi16(e0, e1, e2, e3, e4, e5, e6, e7)
}

/// Set packed 8-bit integers with the supplied values in reverse order.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi8)
#[inline]
#[target_feature(enable = "sse2")]
// no particular instruction to test
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_setr_epi8(
    e15: i8,
    e14: i8,
    e13: i8,
    e12: i8,
    e11: i8,
    e10: i8,
    e9: i8,
    e8: i8,
    e7: i8,
    e6: i8,
    e5: i8,
    e4: i8,
    e3: i8,
    e2: i8,
    e1: i8,
    e0: i8,
) -> __m128i {
    #[rustfmt::skip]
    _mm_set_epi8(
        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
    )
}

/// Returns a vector with all elements set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(xorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_setzero_si128() -> __m128i {
    _mm_set1_epi64x(0)
}

/// Load 64-bit integer from memory into first element of returned vector.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_epi64)
#[inline]
#[target_feature(enable = "sse2")]
// FIXME movsd on windows
#[cfg_attr(
    all(
        test,
        not(windows),
        not(all(target_os = "linux", target_arch = "x86_64")),
        target_arch = "x86_64"
    ),
    assert_instr(movq)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_loadl_epi64(mem_addr: *const __m128i) -> __m128i {
    _mm_set_epi64x(0, ptr::read_unaligned(mem_addr as *const i64))
}

/// Load 128-bits of integer data from memory into a new vector.
///
/// `mem_addr` must be aligned on a 16-byte boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_load_si128(mem_addr: *const __m128i) -> __m128i {
    *mem_addr
}

/// Load 128-bits of integer data from memory into a new vector.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movups))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_loadu_si128(mem_addr: *const __m128i) -> __m128i {
    let mut dst: __m128i = _mm_undefined_si128();
    ptr::copy_nonoverlapping(
        mem_addr as *const u8,
        &mut dst as *mut __m128i as *mut u8,
        mem::size_of::<__m128i>(),
    );
    dst
}
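
// Illustrative sketch (not part of the upstream file): loading from a plain
// `[i32; 4]`, whose alignment is only 4 bytes, must use the unaligned load.
#[allow(dead_code)]
#[target_feature(enable = "sse2")]
unsafe fn example_loadu() -> __m128i {
    let data: [i32; 4] = [1, 2, 3, 4];
    // `_mm_load_si128` here would require 16-byte alignment, which `[i32; 4]`
    // does not guarantee; `_mm_loadu_si128` has no such requirement.
    _mm_loadu_si128(data.as_ptr() as *const __m128i)
}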

/// Conditionally store 8-bit integer elements from `a` into memory using
/// `mask`.
///
/// Elements are not stored when the highest bit is not set in the
/// corresponding element.
///
/// `mem_addr` should correspond to a 128-bit memory location and does not need
/// to be aligned on any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmoveu_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(maskmovdqu))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_maskmoveu_si128(a: __m128i, mask: __m128i, mem_addr: *mut i8) {
    maskmovdqu(a.as_i8x16(), mask.as_i8x16(), mem_addr)
}

/// Store 128-bits of integer data from `a` into memory.
///
/// `mem_addr` must be aligned on a 16-byte boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_store_si128(mem_addr: *mut __m128i, a: __m128i) {
    *mem_addr = a;
}

/// Store 128-bits of integer data from `a` into memory.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movups))] // FIXME movdqu expected
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_storeu_si128(mem_addr: *mut __m128i, a: __m128i) {
    storeudq(mem_addr as *mut i8, a);
}

/// Store the lower 64-bit integer `a` to a memory location.
///
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storel_epi64)
#[inline]
#[target_feature(enable = "sse2")]
// FIXME mov on windows, movlps on i686
#[cfg_attr(
    all(
        test,
        not(windows),
        not(all(target_os = "linux", target_arch = "x86_64")),
        target_arch = "x86_64"
    ),
    assert_instr(movq)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) {
    ptr::copy_nonoverlapping(&a as *const _ as *const u8, mem_addr as *mut u8, 8);
}

/// Stores a 128-bit integer vector to a 128-bit aligned memory location.
/// To minimize caching, the data is flagged as non-temporal (unlikely to be
/// used again soon).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movntps))] // FIXME movntdq
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) {
    ::intrinsics::nontemporal_store(mem_addr, a);
}

/// Stores a 32-bit integer value in the specified memory location.
/// To minimize caching, the data is flagged as non-temporal (unlikely to be
/// used again soon).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_si32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movnti))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) {
    ::intrinsics::nontemporal_store(mem_addr, a);
}

/// Return a vector where the low element is extracted from `a` and its upper
/// element is zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64)
#[inline]
#[target_feature(enable = "sse2")]
// FIXME movd on windows, movd on i686
#[cfg_attr(all(test, not(windows), target_arch = "x86_64"), assert_instr(movq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_move_epi64(a: __m128i) -> __m128i {
    let zero = _mm_setzero_si128();
    let r: i64x2 = simd_shuffle2(a.as_i64x2(), zero.as_i64x2(), [0, 2]);
    mem::transmute(r)
}

/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers
/// using signed saturation.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_packs_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(packsswb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_packs_epi16(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(packsswb(a.as_i16x8(), b.as_i16x8()))
}
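
// Illustrative sketch (not part of the upstream file): packing narrows with
// saturation, so 16-bit values outside the i8 range are clamped, not
// truncated.
#[allow(dead_code)]
#[target_feature(enable = "sse2")]
unsafe fn example_packs() -> __m128i {
    let a = _mm_set1_epi16(300); // above i8::MAX
    let b = _mm_set1_epi16(-300); // below i8::MIN
    // The low 8 result bytes become 127 and the high 8 bytes become -128.
    _mm_packs_epi16(a, b)
}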
1343
/// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers
/// using signed saturation.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_packs_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(packssdw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_packs_epi32(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(packssdw(a.as_i32x4(), b.as_i32x4()))
}

/// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers
/// using unsigned saturation.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_packus_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(packuswb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_packus_epi16(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute(packuswb(a.as_i16x8(), b.as_i16x8()))
}

/// Return the element of `a` at the position specified by `imm8` (only the
/// lowest 3 bits of `imm8` are used).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi16)
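///
/// # Examples
///
/// A minimal usage sketch, not part of the original docs; it assumes an
/// `x86_64` target with SSE2, so it is not compiled as a doctest here:
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// unsafe {
///     // Lanes are numbered from the least significant end.
///     let a = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
///     assert_eq!(_mm_extract_epi16(a, 1), 1);
/// }
/// ```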
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pextrw, imm8 = 9))]
#[rustc_args_required_const(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_extract_epi16(a: __m128i, imm8: i32) -> i32 {
    simd_extract::<_, i16>(a.as_i16x8(), (imm8 & 7) as u32) as i32
}

/// Return a new vector where the element of `a` at the position specified by
/// `imm8` (only the lowest 3 bits are used) is replaced with `i`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pinsrw, imm8 = 9))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_insert_epi16(a: __m128i, i: i32, imm8: i32) -> __m128i {
    mem::transmute(simd_insert(a.as_i16x8(), (imm8 & 7) as u32, i as i16))
}

/// Return a mask of the most significant bit of each element in `a`.
///
/// The mask is stored in the 16 least significant bits of the return value.
/// All other bits are set to `0`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_epi8)
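///
/// # Examples
///
/// An illustrative sketch, not part of the original docs; it assumes an
/// `x86_64` target with SSE2, so it is not compiled as a doctest here:
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// unsafe {
///     // Only lane 15 (the most significant lane, listed first in
///     // `_mm_set_epi8`) has its sign bit set.
///     let a = _mm_set_epi8(-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
///     assert_eq!(_mm_movemask_epi8(a), 0b1000_0000_0000_0000);
/// }
/// ```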
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmovmskb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_movemask_epi8(a: __m128i) -> i32 {
    pmovmskb(a.as_i8x16())
}

/// Shuffle 32-bit integers in `a` using the control in `imm8`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi32)
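///
/// # Examples
///
/// A minimal usage sketch, not part of the original docs; it assumes an
/// `x86_64` target with SSE2, so it is not compiled as a doctest here:
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// unsafe {
///     let a = _mm_set_epi32(3, 2, 1, 0); // lanes, low to high: [0, 1, 2, 3]
///     // Each 2-bit field of `imm8` selects the source lane for one output
///     // lane, from the low output lane upward; `0b00_01_10_11` therefore
///     // reverses the vector.
///     let r = _mm_shuffle_epi32(a, 0b00_01_10_11);
///     let e = _mm_set_epi32(0, 1, 2, 3);
///     assert_eq!(_mm_movemask_epi8(_mm_cmpeq_epi32(r, e)), 0xFFFF);
/// }
/// ```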
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pshufd, imm8 = 9))]
#[rustc_args_required_const(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_shuffle_epi32(a: __m128i, imm8: i32) -> __m128i {
    // simd_shuffleX requires that its selector parameter be made up of
    // constant values, but we can't enforce that here. In spirit, we need
    // to write a `match` on all possible values of a byte, and for each value,
    // hard-code the correct `simd_shuffleX` call using only constants. We
    // then hope for LLVM to do the rest.
    //
    // Of course, that's... awful. So we try to use macros to do it for us.
    let imm8 = (imm8 & 0xFF) as u8;
    let a = a.as_i32x4();

    macro_rules! shuffle_done {
        ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
            simd_shuffle4(a, a, [$x01, $x23, $x45, $x67])
        };
    }
    macro_rules! shuffle_x67 {
        ($x01:expr, $x23:expr, $x45:expr) => {
            match (imm8 >> 6) & 0b11 {
                0b00 => shuffle_done!($x01, $x23, $x45, 0),
                0b01 => shuffle_done!($x01, $x23, $x45, 1),
                0b10 => shuffle_done!($x01, $x23, $x45, 2),
                _ => shuffle_done!($x01, $x23, $x45, 3),
            }
        };
    }
    macro_rules! shuffle_x45 {
        ($x01:expr, $x23:expr) => {
            match (imm8 >> 4) & 0b11 {
                0b00 => shuffle_x67!($x01, $x23, 0),
                0b01 => shuffle_x67!($x01, $x23, 1),
                0b10 => shuffle_x67!($x01, $x23, 2),
                _ => shuffle_x67!($x01, $x23, 3),
            }
        };
    }
    macro_rules! shuffle_x23 {
        ($x01:expr) => {
            match (imm8 >> 2) & 0b11 {
                0b00 => shuffle_x45!($x01, 0),
                0b01 => shuffle_x45!($x01, 1),
                0b10 => shuffle_x45!($x01, 2),
                _ => shuffle_x45!($x01, 3),
            }
        };
    }
    let x: i32x4 = match imm8 & 0b11 {
        0b00 => shuffle_x23!(0),
        0b01 => shuffle_x23!(1),
        0b10 => shuffle_x23!(2),
        _ => shuffle_x23!(3),
    };
    mem::transmute(x)
}

/// Shuffle 16-bit integers in the high 64 bits of `a` using the control in
/// `imm8`.
///
/// Put the results in the high 64 bits of the returned vector, with the low
/// 64 bits being copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shufflehi_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pshufhw, imm8 = 9))]
#[rustc_args_required_const(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_shufflehi_epi16(a: __m128i, imm8: i32) -> __m128i {
    // See _mm_shuffle_epi32.
    let imm8 = (imm8 & 0xFF) as u8;
    let a = a.as_i16x8();
    macro_rules! shuffle_done {
        ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
            simd_shuffle8(a, a, [0, 1, 2, 3, $x01 + 4, $x23 + 4, $x45 + 4, $x67 + 4])
        };
    }
    macro_rules! shuffle_x67 {
        ($x01:expr, $x23:expr, $x45:expr) => {
            match (imm8 >> 6) & 0b11 {
                0b00 => shuffle_done!($x01, $x23, $x45, 0),
                0b01 => shuffle_done!($x01, $x23, $x45, 1),
                0b10 => shuffle_done!($x01, $x23, $x45, 2),
                _ => shuffle_done!($x01, $x23, $x45, 3),
            }
        };
    }
    macro_rules! shuffle_x45 {
        ($x01:expr, $x23:expr) => {
            match (imm8 >> 4) & 0b11 {
                0b00 => shuffle_x67!($x01, $x23, 0),
                0b01 => shuffle_x67!($x01, $x23, 1),
                0b10 => shuffle_x67!($x01, $x23, 2),
                _ => shuffle_x67!($x01, $x23, 3),
            }
        };
    }
    macro_rules! shuffle_x23 {
        ($x01:expr) => {
            match (imm8 >> 2) & 0b11 {
                0b00 => shuffle_x45!($x01, 0),
                0b01 => shuffle_x45!($x01, 1),
                0b10 => shuffle_x45!($x01, 2),
                _ => shuffle_x45!($x01, 3),
            }
        };
    }
    let x: i16x8 = match imm8 & 0b11 {
        0b00 => shuffle_x23!(0),
        0b01 => shuffle_x23!(1),
        0b10 => shuffle_x23!(2),
        _ => shuffle_x23!(3),
    };
    mem::transmute(x)
}

/// Shuffle 16-bit integers in the low 64 bits of `a` using the control in
/// `imm8`.
///
/// Put the results in the low 64 bits of the returned vector, with the high
/// 64 bits being copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shufflelo_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pshuflw, imm8 = 9))]
#[rustc_args_required_const(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_shufflelo_epi16(a: __m128i, imm8: i32) -> __m128i {
    // See _mm_shuffle_epi32.
    let imm8 = (imm8 & 0xFF) as u8;
    let a = a.as_i16x8();

    macro_rules! shuffle_done {
        ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
            simd_shuffle8(a, a, [$x01, $x23, $x45, $x67, 4, 5, 6, 7])
        };
    }
    macro_rules! shuffle_x67 {
        ($x01:expr, $x23:expr, $x45:expr) => {
            match (imm8 >> 6) & 0b11 {
                0b00 => shuffle_done!($x01, $x23, $x45, 0),
                0b01 => shuffle_done!($x01, $x23, $x45, 1),
                0b10 => shuffle_done!($x01, $x23, $x45, 2),
                _ => shuffle_done!($x01, $x23, $x45, 3),
            }
        };
    }
    macro_rules! shuffle_x45 {
        ($x01:expr, $x23:expr) => {
            match (imm8 >> 4) & 0b11 {
                0b00 => shuffle_x67!($x01, $x23, 0),
                0b01 => shuffle_x67!($x01, $x23, 1),
                0b10 => shuffle_x67!($x01, $x23, 2),
                _ => shuffle_x67!($x01, $x23, 3),
            }
        };
    }
    macro_rules! shuffle_x23 {
        ($x01:expr) => {
            match (imm8 >> 2) & 0b11 {
                0b00 => shuffle_x45!($x01, 0),
                0b01 => shuffle_x45!($x01, 1),
                0b10 => shuffle_x45!($x01, 2),
                _ => shuffle_x45!($x01, 3),
            }
        };
    }
    let x: i16x8 = match imm8 & 0b11 {
        0b00 => shuffle_x23!(0),
        0b01 => shuffle_x23!(1),
        0b10 => shuffle_x23!(2),
        _ => shuffle_x23!(3),
    };
    mem::transmute(x)
}

/// Unpack and interleave 8-bit integers from the high half of `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_epi8)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(punpckhbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpackhi_epi8(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute::<i8x16, _>(simd_shuffle16(
        a.as_i8x16(),
        b.as_i8x16(),
        [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31],
    ))
}

/// Unpack and interleave 16-bit integers from the high half of `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(punpckhwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpackhi_epi16(a: __m128i, b: __m128i) -> __m128i {
    let x = simd_shuffle8(a.as_i16x8(), b.as_i16x8(), [4, 12, 5, 13, 6, 14, 7, 15]);
    mem::transmute::<i16x8, _>(x)
}

/// Unpack and interleave 32-bit integers from the high half of `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(unpckhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpackhi_epi32(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute::<i32x4, _>(simd_shuffle4(a.as_i32x4(), b.as_i32x4(), [2, 6, 3, 7]))
}

/// Unpack and interleave 64-bit integers from the high half of `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(unpckhpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpackhi_epi64(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute::<i64x2, _>(simd_shuffle2(a.as_i64x2(), b.as_i64x2(), [1, 3]))
}

/// Unpack and interleave 8-bit integers from the low half of `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_epi8)
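///
/// # Examples
///
/// An interleaving sketch, not part of the original docs; it assumes an
/// `x86_64` target with SSE2, so it is not compiled as a doctest here:
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// unsafe {
///     let a = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
///     let b = _mm_set1_epi8(-1);
///     let r = _mm_unpacklo_epi8(a, b);
///     // The low 8 lanes of `a` alternate with the low 8 lanes of `b`:
///     // [0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1]
///     let e = _mm_set_epi8(-1, 7, -1, 6, -1, 5, -1, 4, -1, 3, -1, 2, -1, 1, -1, 0);
///     assert_eq!(_mm_movemask_epi8(_mm_cmpeq_epi8(r, e)), 0xFFFF);
/// }
/// ```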
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(punpcklbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute::<i8x16, _>(simd_shuffle16(
        a.as_i8x16(),
        b.as_i8x16(),
        [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23],
    ))
}

/// Unpack and interleave 16-bit integers from the low half of `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_epi16)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(punpcklwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i {
    let x = simd_shuffle8(a.as_i16x8(), b.as_i16x8(), [0, 8, 1, 9, 2, 10, 3, 11]);
    mem::transmute::<i16x8, _>(x)
}

/// Unpack and interleave 32-bit integers from the low half of `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(unpcklps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpacklo_epi32(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute::<i32x4, _>(simd_shuffle4(a.as_i32x4(), b.as_i32x4(), [0, 4, 1, 5]))
}

/// Unpack and interleave 64-bit integers from the low half of `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_epi64)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movlhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i {
    mem::transmute::<i64x2, _>(simd_shuffle2(a.as_i64x2(), b.as_i64x2(), [0, 2]))
}

/// Return a new vector with the low element of `a` replaced by the sum of the
/// low elements of `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sd)
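///
/// # Examples
///
/// A minimal sketch of the scalar ("sd") semantics, not part of the original
/// docs; it assumes an `x86_64` target with SSE2, so it is not compiled as a
/// doctest here:
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// unsafe {
///     let a = _mm_set_pd(10.0, 1.0); // low = 1.0, high = 10.0
///     let b = _mm_set_pd(20.0, 2.0); // low = 2.0, high = 20.0
///     let r = _mm_add_sd(a, b);
///     // Only the low element is summed; the high element of `a` (10.0)
///     // passes through unchanged.
///     assert_eq!(_mm_cvtsd_f64(r), 3.0);
/// }
/// ```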
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(addsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_add_sd(a: __m128d, b: __m128d) -> __m128d {
    simd_insert(a, 0, _mm_cvtsd_f64(a) + _mm_cvtsd_f64(b))
}

/// Add packed double-precision (64-bit) floating-point elements in `a` and
/// `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(addpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_add_pd(a: __m128d, b: __m128d) -> __m128d {
    simd_add(a, b)
}

/// Return a new vector with the low element of `a` replaced by the result of
/// dividing the lower element of `a` by the lower element of `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(divsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_div_sd(a: __m128d, b: __m128d) -> __m128d {
    simd_insert(a, 0, _mm_cvtsd_f64(a) / _mm_cvtsd_f64(b))
}

/// Divide packed double-precision (64-bit) floating-point elements in `a` by
/// packed elements in `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(divpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_div_pd(a: __m128d, b: __m128d) -> __m128d {
    simd_div(a, b)
}

/// Return a new vector with the low element of `a` replaced by the maximum
/// of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(maxsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_sd(a: __m128d, b: __m128d) -> __m128d {
    maxsd(a, b)
}

/// Return a new vector with the maximum values from corresponding elements in
/// `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(maxpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_pd(a: __m128d, b: __m128d) -> __m128d {
    maxpd(a, b)
}

/// Return a new vector with the low element of `a` replaced by the minimum
/// of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(minsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_sd(a: __m128d, b: __m128d) -> __m128d {
    minsd(a, b)
}

/// Return a new vector with the minimum values from corresponding elements in
/// `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(minpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_pd(a: __m128d, b: __m128d) -> __m128d {
    minpd(a, b)
}

/// Return a new vector with the low element of `a` replaced by multiplying the
/// low elements of `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(mulsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mul_sd(a: __m128d, b: __m128d) -> __m128d {
    simd_insert(a, 0, _mm_cvtsd_f64(a) * _mm_cvtsd_f64(b))
}

/// Multiply packed double-precision (64-bit) floating-point elements in `a`
/// and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(mulpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mul_pd(a: __m128d, b: __m128d) -> __m128d {
    simd_mul(a, b)
}

/// Return a new vector with the low element of `a` replaced by the square
/// root of the lower element of `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(sqrtsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sqrt_sd(a: __m128d, b: __m128d) -> __m128d {
    simd_insert(a, 0, _mm_cvtsd_f64(sqrtsd(b)))
}

/// Return a new vector with the square root of each of the values in `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(sqrtpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sqrt_pd(a: __m128d) -> __m128d {
    sqrtpd(a)
}

/// Return a new vector with the low element of `a` replaced by the result of
/// subtracting the low element of `b` from the low element of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(subsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sub_sd(a: __m128d, b: __m128d) -> __m128d {
    simd_insert(a, 0, _mm_cvtsd_f64(a) - _mm_cvtsd_f64(b))
}

/// Subtract packed double-precision (64-bit) floating-point elements in `b`
/// from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(subpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sub_pd(a: __m128d, b: __m128d) -> __m128d {
    simd_sub(a, b)
}

/// Compute the bitwise AND of packed double-precision (64-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(andps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_and_pd(a: __m128d, b: __m128d) -> __m128d {
    let a: __m128i = mem::transmute(a);
    let b: __m128i = mem::transmute(b);
    mem::transmute(_mm_and_si128(a, b))
}

/// Compute the bitwise NOT of `a` and then AND with `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(andnps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_andnot_pd(a: __m128d, b: __m128d) -> __m128d {
    let a: __m128i = mem::transmute(a);
    let b: __m128i = mem::transmute(b);
    mem::transmute(_mm_andnot_si128(a, b))
}

/// Compute the bitwise OR of `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_or_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(orps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_or_pd(a: __m128d, b: __m128d) -> __m128d {
    let a: __m128i = mem::transmute(a);
    let b: __m128i = mem::transmute(b);
    mem::transmute(_mm_or_si128(a, b))
}

/// Compute the bitwise XOR of `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(xorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_xor_pd(a: __m128d, b: __m128d) -> __m128d {
    let a: __m128i = mem::transmute(a);
    let b: __m128i = mem::transmute(b);
    mem::transmute(_mm_xor_si128(a, b))
}

/// Return a new vector with the low element of `a` replaced by the equality
/// comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpeqsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpeq_sd(a: __m128d, b: __m128d) -> __m128d {
    cmpsd(a, b, 0)
}

/// Return a new vector with the low element of `a` replaced by the less-than
/// comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpltsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmplt_sd(a: __m128d, b: __m128d) -> __m128d {
    cmpsd(a, b, 1)
}

/// Return a new vector with the low element of `a` replaced by the
/// less-than-or-equal comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmplesd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmple_sd(a: __m128d, b: __m128d) -> __m128d {
    cmpsd(a, b, 2)
}

/// Return a new vector with the low element of `a` replaced by the
/// greater-than comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpltsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpgt_sd(a: __m128d, b: __m128d) -> __m128d {
    simd_insert(_mm_cmplt_sd(b, a), 1, simd_extract::<_, f64>(a, 1))
}

/// Return a new vector with the low element of `a` replaced by the
/// greater-than-or-equal comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmplesd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpge_sd(a: __m128d, b: __m128d) -> __m128d {
    simd_insert(_mm_cmple_sd(b, a), 1, simd_extract::<_, f64>(a, 1))
}

/// Return a new vector with the low element of `a` replaced by the result
/// of comparing both of the lower elements of `a` and `b` to `NaN`. If
/// neither is equal to `NaN` then `0xFFFFFFFFFFFFFFFF` is used, and `0`
/// otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpordsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpord_sd(a: __m128d, b: __m128d) -> __m128d {
    cmpsd(a, b, 7)
}

/// Return a new vector with the low element of `a` replaced by the result of
/// comparing both of the lower elements of `a` and `b` to `NaN`. If either is
/// equal to `NaN` then `0xFFFFFFFFFFFFFFFF` is used, and `0` otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpunordsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpunord_sd(a: __m128d, b: __m128d) -> __m128d {
    cmpsd(a, b, 3)
}

/// Return a new vector with the low element of `a` replaced by the not-equal
/// comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpneqsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpneq_sd(a: __m128d, b: __m128d) -> __m128d {
    cmpsd(a, b, 4)
}

/// Return a new vector with the low element of `a` replaced by the
/// not-less-than comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpnltsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpnlt_sd(a: __m128d, b: __m128d) -> __m128d {
    cmpsd(a, b, 5)
}

/// Return a new vector with the low element of `a` replaced by the
/// not-less-than-or-equal comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpnlesd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpnle_sd(a: __m128d, b: __m128d) -> __m128d {
    cmpsd(a, b, 6)
}

/// Return a new vector with the low element of `a` replaced by the
/// not-greater-than comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpngt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpnltsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpngt_sd(a: __m128d, b: __m128d) -> __m128d {
    simd_insert(_mm_cmpnlt_sd(b, a), 1, simd_extract::<_, f64>(a, 1))
}

/// Return a new vector with the low element of `a` replaced by the
/// not-greater-than-or-equal comparison of the lower elements of `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpnlesd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpnge_sd(a: __m128d, b: __m128d) -> __m128d {
    simd_insert(_mm_cmpnle_sd(b, a), 1, simd_extract::<_, f64>(a, 1))
}

/// Compare corresponding elements in `a` and `b` for equality.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_pd)
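///
/// # Examples
///
/// A minimal sketch of the mask semantics, not part of the original docs; it
/// assumes an `x86_64` target with SSE2, so it is not compiled as a doctest
/// here:
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// unsafe {
///     let a = _mm_set_pd(1.0, 2.0); // low = 2.0, high = 1.0
///     let b = _mm_set_pd(1.0, 3.0); // low = 3.0, high = 1.0
///     let r = _mm_cmpeq_pd(a, b);
///     // Equal lanes become all-ones masks; `_mm_movemask_pd` collects the
///     // sign bits, so only the high lane sets its bit here.
///     assert_eq!(_mm_movemask_pd(r), 0b10);
/// }
/// ```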
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpeqpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpeq_pd(a: __m128d, b: __m128d) -> __m128d {
    cmppd(a, b, 0)
}

/// Compare corresponding elements in `a` and `b` for less-than.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpltpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmplt_pd(a: __m128d, b: __m128d) -> __m128d {
    cmppd(a, b, 1)
}

/// Compare corresponding elements in `a` and `b` for less-than-or-equal.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmplepd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmple_pd(a: __m128d, b: __m128d) -> __m128d {
    cmppd(a, b, 2)
}

/// Compare corresponding elements in `a` and `b` for greater-than.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpltpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpgt_pd(a: __m128d, b: __m128d) -> __m128d {
    _mm_cmplt_pd(b, a)
}

/// Compare corresponding elements in `a` and `b` for greater-than-or-equal.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmplepd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpge_pd(a: __m128d, b: __m128d) -> __m128d {
    _mm_cmple_pd(b, a)
}

/// Compare corresponding elements in `a` and `b` to see if neither is `NaN`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpordpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpord_pd(a: __m128d, b: __m128d) -> __m128d {
    cmppd(a, b, 7)
}

/// Compare corresponding elements in `a` and `b` to see if either is `NaN`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpunordpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpunord_pd(a: __m128d, b: __m128d) -> __m128d {
    cmppd(a, b, 3)
}

/// Compare corresponding elements in `a` and `b` for not-equal.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpneqpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpneq_pd(a: __m128d, b: __m128d) -> __m128d {
    cmppd(a, b, 4)
}

/// Compare corresponding elements in `a` and `b` for not-less-than.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpnltpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpnlt_pd(a: __m128d, b: __m128d) -> __m128d {
    cmppd(a, b, 5)
}

/// Compare corresponding elements in `a` and `b` for not-less-than-or-equal.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpnlepd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpnle_pd(a: __m128d, b: __m128d) -> __m128d {
    cmppd(a, b, 6)
}

/// Compare corresponding elements in `a` and `b` for not-greater-than.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpngt_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpnltpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpngt_pd(a: __m128d, b: __m128d) -> __m128d {
    _mm_cmpnlt_pd(b, a)
}

/// Compare corresponding elements in `a` and `b` for
/// not-greater-than-or-equal.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cmpnlepd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpnge_pd(a: __m128d, b: __m128d) -> __m128d {
    _mm_cmpnle_pd(b, a)
}

/// Compare the lower element of `a` and `b` for equality.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comieq_sd)
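///
/// # Examples
///
/// A minimal sketch showing the `0`/`1` integer result, not part of the
/// original docs; it assumes an `x86_64` target with SSE2, so it is not
/// compiled as a doctest here:
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// unsafe {
///     let a = _mm_set_sd(1.0);
///     assert_eq!(_mm_comieq_sd(a, _mm_set_sd(1.0)), 1);
///     assert_eq!(_mm_comieq_sd(a, _mm_set_sd(2.0)), 0);
/// }
/// ```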
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(comisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_comieq_sd(a: __m128d, b: __m128d) -> i32 {
    comieqsd(a, b)
}

/// Compare the lower element of `a` and `b` for less-than.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comilt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(comisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_comilt_sd(a: __m128d, b: __m128d) -> i32 {
    comiltsd(a, b)
}

/// Compare the lower element of `a` and `b` for less-than-or-equal.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comile_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(comisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_comile_sd(a: __m128d, b: __m128d) -> i32 {
    comilesd(a, b)
}

/// Compare the lower element of `a` and `b` for greater-than.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comigt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(comisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_comigt_sd(a: __m128d, b: __m128d) -> i32 {
    comigtsd(a, b)
}

/// Compare the lower element of `a` and `b` for greater-than-or-equal.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comige_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(comisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_comige_sd(a: __m128d, b: __m128d) -> i32 {
    comigesd(a, b)
}

/// Compare the lower element of `a` and `b` for not-equal.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comineq_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(comisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_comineq_sd(a: __m128d, b: __m128d) -> i32 {
    comineqsd(a, b)
}

/// Compare the lower element of `a` and `b` for equality.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomieq_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(ucomisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ucomieq_sd(a: __m128d, b: __m128d) -> i32 {
    ucomieqsd(a, b)
}

/// Compare the lower element of `a` and `b` for less-than.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomilt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(ucomisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ucomilt_sd(a: __m128d, b: __m128d) -> i32 {
    ucomiltsd(a, b)
}

/// Compare the lower element of `a` and `b` for less-than-or-equal.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomile_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(ucomisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ucomile_sd(a: __m128d, b: __m128d) -> i32 {
    ucomilesd(a, b)
}

/// Compare the lower element of `a` and `b` for greater-than.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomigt_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(ucomisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ucomigt_sd(a: __m128d, b: __m128d) -> i32 {
    ucomigtsd(a, b)
}

/// Compare the lower element of `a` and `b` for greater-than-or-equal.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomige_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(ucomisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ucomige_sd(a: __m128d, b: __m128d) -> i32 {
    ucomigesd(a, b)
}

/// Compare the lower element of `a` and `b` for not-equal.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomineq_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(ucomisd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ucomineq_sd(a: __m128d, b: __m128d) -> i32 {
    ucomineqsd(a, b)
}

/// Convert packed double-precision (64-bit) floating-point elements in `a` to
/// packed single-precision (32-bit) floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtpd2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtpd_ps(a: __m128d) -> __m128 {
    cvtpd2ps(a)
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to
/// packed double-precision (64-bit) floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtps2pd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtps_pd(a: __m128) -> __m128d {
    cvtps2pd(a)
}

/// Convert packed double-precision (64-bit) floating-point elements in `a` to
/// packed 32-bit integers.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtpd2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtpd_epi32(a: __m128d) -> __m128i {
    mem::transmute(cvtpd2dq(a))
}

/// Convert the lower double-precision (64-bit) floating-point element in `a`
/// to a 32-bit integer.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtsd2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtsd_si32(a: __m128d) -> i32 {
    cvtsd2si(a)
}

/// Convert the lower double-precision (64-bit) floating-point element in `b`
/// to a single-precision (32-bit) floating-point element, store the result in
/// the lower element of the return value, and copy the upper element from `a`
/// to the upper element of the return value.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_ss)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtsd2ss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtsd_ss(a: __m128, b: __m128d) -> __m128 {
    cvtsd2ss(a, b)
}

/// Return the lower double-precision (64-bit) floating-point element of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtsd_f64(a: __m128d) -> f64 {
    simd_extract(a, 0)
}

/// Convert the lower single-precision (32-bit) floating-point element in `b`
/// to a double-precision (64-bit) floating-point element, store the result in
/// the lower element of the return value, and copy the upper element from `a`
/// to the upper element of the return value.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvtss2sd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtss_sd(a: __m128d, b: __m128) -> __m128d {
    cvtss2sd(a, b)
}

/// Convert packed double-precision (64-bit) floating-point elements in `a` to
/// packed 32-bit integers with truncation.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttpd_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvttpd2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvttpd_epi32(a: __m128d) -> __m128i {
    mem::transmute(cvttpd2dq(a))
}

/// Convert the lower double-precision (64-bit) floating-point element in `a`
/// to a 32-bit integer with truncation.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si32)
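///
/// # Examples
///
/// A minimal sketch, not part of the original docs; it assumes an `x86_64`
/// target with SSE2, so it is not compiled as a doctest here:
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// unsafe {
///     let a = _mm_set_sd(2.9);
///     // Truncation always rounds toward zero, whereas `_mm_cvtsd_si32`
///     // follows the current rounding mode (round-to-nearest-even by
///     // default) and would yield 3 here.
///     assert_eq!(_mm_cvttsd_si32(a), 2);
/// }
/// ```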
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvttsd2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvttsd_si32(a: __m128d) -> i32 {
    cvttsd2si(a)
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to
/// packed 32-bit integers with truncation.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttps_epi32)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(cvttps2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvttps_epi32(a: __m128) -> __m128i {
    mem::transmute(cvttps2dq(a))
}

/// Copy double-precision (64-bit) floating-point value `a` to the lower
/// element of the return value, and zero the upper element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_set_sd(a: f64) -> __m128d {
    _mm_set_pd(0.0, a)
}

/// Broadcast double-precision (64-bit) floating-point value `a` to all
/// elements of the return value.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_set1_pd(a: f64) -> __m128d {
    _mm_set_pd(a, a)
}

/// Broadcast double-precision (64-bit) floating-point value `a` to all
/// elements of the return value.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd1)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_set_pd1(a: f64) -> __m128d {
    _mm_set_pd(a, a)
}

/// Set packed double-precision (64-bit) floating-point elements in the return
/// value with the supplied values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd)
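///
/// # Examples
///
/// A minimal sketch of the argument order, not part of the original docs; it
/// assumes an `x86_64` target with SSE2, so it is not compiled as a doctest
/// here:
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// unsafe {
///     // Arguments are given from the highest lane to the lowest, so the
///     // second argument ends up in the low element.
///     let v = _mm_set_pd(2.0, 1.0);
///     assert_eq!(_mm_cvtsd_f64(v), 1.0);
///     // `_mm_setr_pd(1.0, 2.0)` builds the same vector with the arguments
///     // in memory (low-to-high) order.
/// }
/// ```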
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_set_pd(a: f64, b: f64) -> __m128d {
    __m128d(b, a)
}

/// Set packed double-precision (64-bit) floating-point elements in the return
/// value with the supplied values in reverse order.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_setr_pd(a: f64, b: f64) -> __m128d {
    _mm_set_pd(b, a)
}

/// Returns packed double-precision (64-bit) floating-point elements with all
/// zeros.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(xorps))] // FIXME xorpd expected
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_setzero_pd() -> __m128d {
    _mm_set_pd(0.0, 0.0)
}

/// Return a mask of the most significant bit of each element in `a`.
///
/// The mask is stored in the 2 least significant bits of the return value.
/// All other bits are set to `0`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pd)
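///
/// # Examples
///
/// An illustrative sketch, not part of the original docs; it assumes an
/// `x86_64` target with SSE2, so it is not compiled as a doctest here:
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// unsafe {
///     // Negative values have their sign (most significant) bit set.
///     let a = _mm_set_pd(-1.0, 1.0); // low = 1.0, high = -1.0
///     assert_eq!(_mm_movemask_pd(a), 0b10);
/// }
/// ```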
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movmskpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_movemask_pd(a: __m128d) -> i32 {
    movmskpd(a)
}

/// Load 128-bits (composed of 2 packed double-precision (64-bit)
/// floating-point elements) from memory into the returned vector.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
/// exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd)
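///
/// # Examples
///
/// A minimal sketch of the alignment requirement, not part of the original
/// docs; it assumes an `x86_64` target with SSE2, so it is not compiled as a
/// doctest here:
///
/// ```ignore
/// use std::arch::x86_64::*;
///
/// unsafe {
///     // `__m128d` is itself 16-byte aligned, so round-tripping through one
///     // is an easy way to obtain a suitably aligned buffer for the load.
///     let buf = _mm_set_pd(2.0, 1.0);
///     let p = &buf as *const __m128d as *const f64;
///     let v = _mm_load_pd(p);
///     assert_eq!(_mm_cvtsd_f64(v), 1.0);
/// }
/// ```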
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
pub unsafe fn _mm_load_pd(mem_addr: *const f64) -> __m128d {
    *(mem_addr as *const __m128d)
}

/// Loads a 64-bit double-precision value to the low element of a
/// 128-bit vector of `[2 x double]` and clears the upper element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_load_sd(mem_addr: *const f64) -> __m128d {
    _mm_setr_pd(*mem_addr, 0.)
}

/// Loads a double-precision value into the high-order bits of a 128-bit
/// vector of `[2 x double]`. The low-order bits are copied from the low-order
/// bits of the first operand.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movhpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_loadh_pd(a: __m128d, mem_addr: *const f64) -> __m128d {
    _mm_setr_pd(simd_extract(a, 0), *mem_addr)
}

/// Loads a double-precision value into the low-order bits of a 128-bit
/// vector of `[2 x double]`. The high-order bits are copied from the
/// high-order bits of the first operand.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movlpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_loadl_pd(a: __m128d, mem_addr: *const f64) -> __m128d {
    _mm_setr_pd(*mem_addr, simd_extract(a, 1))
}

/// Stores a 128-bit floating point vector of `[2 x double]` to a 128-bit
/// aligned memory location.
/// To minimize caching, the data is flagged as non-temporal (unlikely to be
/// used again soon).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movntps))] // FIXME movntpd
#[stable(feature = "simd_x86", since = "1.27.0")]
#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
pub unsafe fn _mm_stream_pd(mem_addr: *mut f64, a: __m128d) {
    intrinsics::nontemporal_store(mem_addr as *mut __m128d, a);
}

/// Stores the lower 64 bits of a 128-bit vector of `[2 x double]` to a
/// memory location.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movlps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_store_sd(mem_addr: *mut f64, a: __m128d) {
    *mem_addr = simd_extract(a, 0)
}

/// Store 128-bits (composed of 2 packed double-precision (64-bit)
/// floating-point elements) from `a` into memory. `mem_addr` must be aligned
/// on a 16-byte boundary or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
pub unsafe fn _mm_store_pd(mem_addr: *mut f64, a: __m128d) {
    *(mem_addr as *mut __m128d) = a;
}

/// Store 128-bits (composed of 2 packed double-precision (64-bit)
/// floating-point elements) from `a` into memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movups))] // FIXME movupd expected
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_storeu_pd(mem_addr: *mut f64, a: __m128d) {
    storeupd(mem_addr as *mut i8, a);
}

/// Store the lower double-precision (64-bit) floating-point element from `a`
/// into 2 contiguous elements in memory. `mem_addr` must be aligned on a
/// 16-byte boundary or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store1_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
pub unsafe fn _mm_store1_pd(mem_addr: *mut f64, a: __m128d) {
    let b: __m128d = simd_shuffle2(a, a, [0, 0]);
    *(mem_addr as *mut __m128d) = b;
}

/// Store the lower double-precision (64-bit) floating-point element from `a`
/// into 2 contiguous elements in memory. `mem_addr` must be aligned on a
/// 16-byte boundary or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd1)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
pub unsafe fn _mm_store_pd1(mem_addr: *mut f64, a: __m128d) {
    let b: __m128d = simd_shuffle2(a, a, [0, 0]);
    *(mem_addr as *mut __m128d) = b;
}

/// Store 2 double-precision (64-bit) floating-point elements from `a` into
/// memory in reverse order.
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
/// exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
pub unsafe fn _mm_storer_pd(mem_addr: *mut f64, a: __m128d) {
    let b: __m128d = simd_shuffle2(a, a, [1, 0]);
    *(mem_addr as *mut __m128d) = b;
}

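// Sketch of the aligned double-precision store family, assuming a
// hypothetical 16-byte-aligned `p: *mut f64` and `a = _mm_setr_pd(1.0, 2.0)`:
//
//     _mm_store_pd(p, a);  // memory: [1.0, 2.0]
//     _mm_store1_pd(p, a); // memory: [1.0, 1.0] (low element duplicated)
//     _mm_storer_pd(p, a); // memory: [2.0, 1.0] (elements reversed)
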
/// Stores the upper 64 bits of a 128-bit vector of `[2 x double]` to a
/// memory location.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeh_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movhpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_storeh_pd(mem_addr: *mut f64, a: __m128d) {
    *mem_addr = simd_extract(a, 1);
}

/// Stores the lower 64 bits of a 128-bit vector of `[2 x double]` to a
/// memory location.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storel_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movlps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_storel_pd(mem_addr: *mut f64, a: __m128d) {
    *mem_addr = simd_extract(a, 0);
}

/// Load a double-precision (64-bit) floating-point element from memory
/// into both elements of returned vector.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd)
#[inline]
#[target_feature(enable = "sse2")]
// #[cfg_attr(test, assert_instr(movapd))] // FIXME LLVM uses different codegen
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_load1_pd(mem_addr: *const f64) -> __m128d {
    let d = *mem_addr;
    _mm_setr_pd(d, d)
}

/// Load a double-precision (64-bit) floating-point element from memory
/// into both elements of returned vector.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1)
#[inline]
#[target_feature(enable = "sse2")]
// #[cfg_attr(test, assert_instr(movapd))] // FIXME same as _mm_load1_pd
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_load_pd1(mem_addr: *const f64) -> __m128d {
    _mm_load1_pd(mem_addr)
}

/// Load 2 double-precision (64-bit) floating-point elements from memory into
/// the returned vector in reverse order. `mem_addr` must be aligned on a
/// 16-byte boundary or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movapd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_loadr_pd(mem_addr: *const f64) -> __m128d {
    let a = _mm_load_pd(mem_addr);
    simd_shuffle2(a, a, [1, 0])
}

/// Load 128-bits (composed of 2 packed double-precision (64-bit)
/// floating-point elements) from memory into the returned vector.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movups))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_loadu_pd(mem_addr: *const f64) -> __m128d {
    let mut dst = _mm_undefined_pd();
    ptr::copy_nonoverlapping(
        mem_addr as *const u8,
        &mut dst as *mut __m128d as *mut u8,
        mem::size_of::<__m128d>(),
    );
    dst
}

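// Illustrative sketch: unlike `_mm_load_pd`, this load tolerates arbitrary
// alignment, so it can read from any offset of a `[f64]` slice:
//
//     let xs = [0.0f64, 1.0, 2.0, 3.0];
//     let v = _mm_loadu_pd(xs.as_ptr().add(1)); // v = [1.0, 2.0]
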
/// Constructs a 128-bit floating-point vector of `[2 x double]` from two
/// 128-bit vector parameters of `[2 x double]`, using the immediate-value
/// parameter as a specifier.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(shufpd, imm8 = 1))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_shuffle_pd(a: __m128d, b: __m128d, imm8: i32) -> __m128d {
    match imm8 & 0b11 {
        0b00 => simd_shuffle2(a, b, [0, 2]),
        0b01 => simd_shuffle2(a, b, [1, 2]),
        0b10 => simd_shuffle2(a, b, [0, 3]),
        _ => simd_shuffle2(a, b, [1, 3]),
    }
}

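// Sketch of how the two low bits of `imm8` pick lanes (bit 0 selects the
// lane taken from `a`, bit 1 the lane taken from `b`), e.g. with
// `a = _mm_setr_pd(1.0, 2.0)` and `b = _mm_setr_pd(3.0, 4.0)`:
//
//     _mm_shuffle_pd(a, b, 0b00); // [1.0, 3.0]
//     _mm_shuffle_pd(a, b, 0b01); // [2.0, 3.0]
//     _mm_shuffle_pd(a, b, 0b11); // [2.0, 4.0]
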
/// Constructs a 128-bit floating-point vector of `[2 x double]`. The lower
/// 64 bits are set to the lower 64 bits of the second parameter. The upper
/// 64 bits are set to the upper 64 bits of the first parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(movsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_move_sd(a: __m128d, b: __m128d) -> __m128d {
    _mm_setr_pd(simd_extract(b, 0), simd_extract(a, 1))
}

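// Illustrative sketch: only the low lane is replaced, e.g. with
// `a = _mm_setr_pd(1.0, 2.0)` and `b = _mm_setr_pd(3.0, 4.0)`:
//
//     _mm_move_sd(a, b); // [3.0, 2.0]
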
/// Casts a 128-bit floating-point vector of `[2 x double]` into a 128-bit
/// floating-point vector of `[4 x float]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ps)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_castpd_ps(a: __m128d) -> __m128 {
    mem::transmute(a)
}

/// Casts a 128-bit floating-point vector of `[2 x double]` into a 128-bit
/// integer vector.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_castpd_si128(a: __m128d) -> __m128i {
    mem::transmute(a)
}

/// Casts a 128-bit floating-point vector of `[4 x float]` into a 128-bit
/// floating-point vector of `[2 x double]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_castps_pd(a: __m128) -> __m128d {
    mem::transmute(a)
}

/// Casts a 128-bit floating-point vector of `[4 x float]` into a 128-bit
/// integer vector.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_castps_si128(a: __m128) -> __m128i {
    mem::transmute(a)
}

/// Casts a 128-bit integer vector into a 128-bit floating-point vector
/// of `[2 x double]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_castsi128_pd(a: __m128i) -> __m128d {
    mem::transmute(a)
}

/// Casts a 128-bit integer vector into a 128-bit floating-point vector
/// of `[4 x float]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_ps)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_castsi128_ps(a: __m128i) -> __m128 {
    mem::transmute(a)
}

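// Note on the `_mm_cast*` family above: each is a bit-pattern-preserving
// reinterpretation that compiles away to nothing. For example (illustrative):
//
//     let ones = _mm_set1_epi32(-1);   // all 128 bits set
//     let pd = _mm_castsi128_pd(ones); // same bits, viewed as 2 x f64
//     let back = _mm_castpd_si128(pd); // round-trips losslessly
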
/// Return vector of type __m128d with undefined elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_undefined_pd() -> __m128d {
    // FIXME: this function should return MaybeUninit<__m128d>
    mem::MaybeUninit::<__m128d>::uninitialized().into_inner()
}

/// Return vector of type __m128i with undefined elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_si128)
#[inline]
#[target_feature(enable = "sse2")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_undefined_si128() -> __m128i {
    // FIXME: this function should return MaybeUninit<__m128i>
    mem::MaybeUninit::<__m128i>::uninitialized().into_inner()
}

/// The resulting `__m128d` element is composed by the high-order values of
/// the two `__m128d` interleaved input elements, i.e.:
///
/// * The `[127:64]` bits are copied from the `[127:64]` bits of the second
///   input
/// * The `[63:0]` bits are copied from the `[127:64]` bits of the first
///   input
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(unpckhpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpackhi_pd(a: __m128d, b: __m128d) -> __m128d {
    simd_shuffle2(a, b, [1, 3])
}

/// The resulting `__m128d` element is composed by the low-order values of
/// the two `__m128d` interleaved input elements, i.e.:
///
/// * The `[127:64]` bits are copied from the `[63:0]` bits of the second input
/// * The `[63:0]` bits are copied from the `[63:0]` bits of the first input
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_pd)
#[inline]
#[target_feature(enable = "sse2")]
#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movlhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpacklo_pd(a: __m128d, b: __m128d) -> __m128d {
    simd_shuffle2(a, b, [0, 2])
}

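// Sketch of the unpack pair, with `a = _mm_setr_pd(1.0, 2.0)` and
// `b = _mm_setr_pd(3.0, 4.0)`:
//
//     _mm_unpackhi_pd(a, b); // [2.0, 4.0] (high lanes of `a` and `b`)
//     _mm_unpacklo_pd(a, b); // [1.0, 3.0] (low lanes of `a` and `b`)
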
/// Adds two signed or unsigned 64-bit integer values, returning the
/// lower 64 bits of the sum.
#[inline]
#[target_feature(enable = "sse2,mmx")]
#[cfg_attr(test, assert_instr(paddq))]
pub unsafe fn _mm_add_si64(a: __m64, b: __m64) -> __m64 {
    paddq(a, b)
}

/// Multiplies 32-bit unsigned integer values contained in the lower bits
/// of the two 64-bit integer vectors and returns the 64-bit unsigned
/// product.
#[inline]
#[target_feature(enable = "sse2,mmx")]
#[cfg_attr(test, assert_instr(pmuludq))]
pub unsafe fn _mm_mul_su32(a: __m64, b: __m64) -> __m64 {
    pmuludq2(a, b)
}

/// Subtracts signed or unsigned 64-bit integer values and writes the
/// difference to the corresponding bits in the destination.
#[inline]
#[target_feature(enable = "sse2,mmx")]
#[cfg_attr(test, assert_instr(psubq))]
pub unsafe fn _mm_sub_si64(a: __m64, b: __m64) -> __m64 {
    psubq(a, b)
}

/// Converts the two signed 32-bit integer elements of a 64-bit vector of
/// `[2 x i32]` into two double-precision floating-point values, returned in a
/// 128-bit vector of `[2 x double]`.
#[inline]
#[target_feature(enable = "sse2,mmx")]
#[cfg_attr(test, assert_instr(cvtpi2pd))]
pub unsafe fn _mm_cvtpi32_pd(a: __m64) -> __m128d {
    cvtpi2pd(a)
}

/// Initializes both 64-bit values in a 128-bit vector of `[2 x i64]` with
/// the specified 64-bit integer values.
#[inline]
#[target_feature(enable = "sse2,mmx")]
// no particular instruction to test
pub unsafe fn _mm_set_epi64(e1: __m64, e0: __m64) -> __m128i {
    _mm_set_epi64x(mem::transmute(e1), mem::transmute(e0))
}

/// Initializes both values in a 128-bit vector of `[2 x i64]` with the
/// specified 64-bit value.
#[inline]
#[target_feature(enable = "sse2,mmx")]
// no particular instruction to test
pub unsafe fn _mm_set1_epi64(a: __m64) -> __m128i {
    _mm_set_epi64x(mem::transmute(a), mem::transmute(a))
}

/// Constructs a 128-bit integer vector, initialized in reverse order
/// with the specified 64-bit integral values.
#[inline]
#[target_feature(enable = "sse2,mmx")]
// no particular instruction to test
pub unsafe fn _mm_setr_epi64(e1: __m64, e0: __m64) -> __m128i {
    _mm_set_epi64x(mem::transmute(e0), mem::transmute(e1))
}

/// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
/// integer.
#[inline]
#[target_feature(enable = "sse2,mmx")]
// #[cfg_attr(test, assert_instr(movdq2q))] // FIXME: llvm codegens wrong
// instr?
pub unsafe fn _mm_movepi64_pi64(a: __m128i) -> __m64 {
    mem::transmute(simd_extract::<_, i64>(a.as_i64x2(), 0))
}

/// Moves the 64-bit operand to a 128-bit integer vector, zeroing the
/// upper bits.
#[inline]
#[target_feature(enable = "sse2,mmx")]
// #[cfg_attr(test, assert_instr(movq2dq))] // FIXME: llvm codegens wrong
// instr?
pub unsafe fn _mm_movpi64_epi64(a: __m64) -> __m128i {
    _mm_set_epi64x(0, mem::transmute(a))
}

/// Converts the two double-precision floating-point elements of a
/// 128-bit vector of `[2 x double]` into two signed 32-bit integer values,
/// returned in a 64-bit vector of `[2 x i32]`.
#[inline]
#[target_feature(enable = "sse2,mmx")]
#[cfg_attr(test, assert_instr(cvtpd2pi))]
pub unsafe fn _mm_cvtpd_pi32(a: __m128d) -> __m64 {
    cvtpd2pi(a)
}

/// Converts the two double-precision floating-point elements of a
/// 128-bit vector of `[2 x double]` into two signed 32-bit integer values,
/// returned in a 64-bit vector of `[2 x i32]`.
/// If the result of either conversion is inexact, the result is truncated
/// (rounded towards zero) regardless of the current MXCSR setting.
#[inline]
#[target_feature(enable = "sse2,mmx")]
#[cfg_attr(test, assert_instr(cvttpd2pi))]
pub unsafe fn _mm_cvttpd_pi32(a: __m128d) -> __m64 {
    cvttpd2pi(a)
}

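// Sketch contrasting the two conversions (assuming the default MXCSR
// rounding mode, round-to-nearest with ties to even):
//
//     let a = _mm_setr_pd(1.5, 2.5);
//     _mm_cvtpd_pi32(a);  // [2, 2]: rounds to nearest, ties to even
//     _mm_cvttpd_pi32(a); // [1, 2]: always truncates toward zero
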
#[allow(improper_ctypes)]
extern "C" {
    #[link_name = "llvm.x86.sse2.pause"]
    fn pause();
    #[link_name = "llvm.x86.sse2.clflush"]
    fn clflush(p: *mut u8);
    #[link_name = "llvm.x86.sse2.lfence"]
    fn lfence();
    #[link_name = "llvm.x86.sse2.mfence"]
    fn mfence();
    #[link_name = "llvm.x86.sse2.padds.b"]
    fn paddsb(a: i8x16, b: i8x16) -> i8x16;
    #[link_name = "llvm.x86.sse2.padds.w"]
    fn paddsw(a: i16x8, b: i16x8) -> i16x8;
    #[link_name = "llvm.x86.sse2.paddus.b"]
    fn paddsub(a: u8x16, b: u8x16) -> u8x16;
    #[link_name = "llvm.x86.sse2.paddus.w"]
    fn paddsuw(a: u16x8, b: u16x8) -> u16x8;
    #[link_name = "llvm.x86.sse2.pavg.b"]
    fn pavgb(a: u8x16, b: u8x16) -> u8x16;
    #[link_name = "llvm.x86.sse2.pavg.w"]
    fn pavgw(a: u16x8, b: u16x8) -> u16x8;
    #[link_name = "llvm.x86.sse2.pmadd.wd"]
    fn pmaddwd(a: i16x8, b: i16x8) -> i32x4;
    #[link_name = "llvm.x86.sse2.pmaxs.w"]
    fn pmaxsw(a: i16x8, b: i16x8) -> i16x8;
    #[link_name = "llvm.x86.sse2.pmaxu.b"]
    fn pmaxub(a: u8x16, b: u8x16) -> u8x16;
    #[link_name = "llvm.x86.sse2.pmins.w"]
    fn pminsw(a: i16x8, b: i16x8) -> i16x8;
    #[link_name = "llvm.x86.sse2.pminu.b"]
    fn pminub(a: u8x16, b: u8x16) -> u8x16;
    #[link_name = "llvm.x86.sse2.pmulh.w"]
    fn pmulhw(a: i16x8, b: i16x8) -> i16x8;
    #[link_name = "llvm.x86.sse2.pmulhu.w"]
    fn pmulhuw(a: u16x8, b: u16x8) -> u16x8;
    #[link_name = "llvm.x86.sse2.pmulu.dq"]
    fn pmuludq(a: u32x4, b: u32x4) -> u64x2;
    #[link_name = "llvm.x86.sse2.psad.bw"]
    fn psadbw(a: u8x16, b: u8x16) -> u64x2;
    #[link_name = "llvm.x86.sse2.psubs.b"]
    fn psubsb(a: i8x16, b: i8x16) -> i8x16;
    #[link_name = "llvm.x86.sse2.psubs.w"]
    fn psubsw(a: i16x8, b: i16x8) -> i16x8;
    #[link_name = "llvm.x86.sse2.psubus.b"]
    fn psubusb(a: u8x16, b: u8x16) -> u8x16;
    #[link_name = "llvm.x86.sse2.psubus.w"]
    fn psubusw(a: u16x8, b: u16x8) -> u16x8;
    #[link_name = "llvm.x86.sse2.pslli.w"]
    fn pslliw(a: i16x8, imm8: i32) -> i16x8;
    #[link_name = "llvm.x86.sse2.psll.w"]
    fn psllw(a: i16x8, count: i16x8) -> i16x8;
    #[link_name = "llvm.x86.sse2.pslli.d"]
    fn psllid(a: i32x4, imm8: i32) -> i32x4;
    #[link_name = "llvm.x86.sse2.psll.d"]
    fn pslld(a: i32x4, count: i32x4) -> i32x4;
    #[link_name = "llvm.x86.sse2.pslli.q"]
    fn pslliq(a: i64x2, imm8: i32) -> i64x2;
    #[link_name = "llvm.x86.sse2.psll.q"]
    fn psllq(a: i64x2, count: i64x2) -> i64x2;
    #[link_name = "llvm.x86.sse2.psrai.w"]
    fn psraiw(a: i16x8, imm8: i32) -> i16x8;
    #[link_name = "llvm.x86.sse2.psra.w"]
    fn psraw(a: i16x8, count: i16x8) -> i16x8;
    #[link_name = "llvm.x86.sse2.psrai.d"]
    fn psraid(a: i32x4, imm8: i32) -> i32x4;
    #[link_name = "llvm.x86.sse2.psra.d"]
    fn psrad(a: i32x4, count: i32x4) -> i32x4;
    #[link_name = "llvm.x86.sse2.psrli.w"]
    fn psrliw(a: i16x8, imm8: i32) -> i16x8;
    #[link_name = "llvm.x86.sse2.psrl.w"]
    fn psrlw(a: i16x8, count: i16x8) -> i16x8;
    #[link_name = "llvm.x86.sse2.psrli.d"]
    fn psrlid(a: i32x4, imm8: i32) -> i32x4;
    #[link_name = "llvm.x86.sse2.psrl.d"]
    fn psrld(a: i32x4, count: i32x4) -> i32x4;
    #[link_name = "llvm.x86.sse2.psrli.q"]
    fn psrliq(a: i64x2, imm8: i32) -> i64x2;
    #[link_name = "llvm.x86.sse2.psrl.q"]
    fn psrlq(a: i64x2, count: i64x2) -> i64x2;
    #[link_name = "llvm.x86.sse2.cvtdq2ps"]
    fn cvtdq2ps(a: i32x4) -> __m128;
    #[link_name = "llvm.x86.sse2.cvtps2dq"]
    fn cvtps2dq(a: __m128) -> i32x4;
    #[link_name = "llvm.x86.sse2.maskmov.dqu"]
    fn maskmovdqu(a: i8x16, mask: i8x16, mem_addr: *mut i8);
    #[link_name = "llvm.x86.sse2.packsswb.128"]
    fn packsswb(a: i16x8, b: i16x8) -> i8x16;
    #[link_name = "llvm.x86.sse2.packssdw.128"]
    fn packssdw(a: i32x4, b: i32x4) -> i16x8;
    #[link_name = "llvm.x86.sse2.packuswb.128"]
    fn packuswb(a: i16x8, b: i16x8) -> u8x16;
    #[link_name = "llvm.x86.sse2.pmovmskb.128"]
    fn pmovmskb(a: i8x16) -> i32;
    #[link_name = "llvm.x86.sse2.max.sd"]
    fn maxsd(a: __m128d, b: __m128d) -> __m128d;
    #[link_name = "llvm.x86.sse2.max.pd"]
    fn maxpd(a: __m128d, b: __m128d) -> __m128d;
    #[link_name = "llvm.x86.sse2.min.sd"]
    fn minsd(a: __m128d, b: __m128d) -> __m128d;
    #[link_name = "llvm.x86.sse2.min.pd"]
    fn minpd(a: __m128d, b: __m128d) -> __m128d;
    #[link_name = "llvm.x86.sse2.sqrt.sd"]
    fn sqrtsd(a: __m128d) -> __m128d;
    #[link_name = "llvm.x86.sse2.sqrt.pd"]
    fn sqrtpd(a: __m128d) -> __m128d;
    #[link_name = "llvm.x86.sse2.cmp.sd"]
    fn cmpsd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
    #[link_name = "llvm.x86.sse2.cmp.pd"]
    fn cmppd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
    #[link_name = "llvm.x86.sse2.comieq.sd"]
    fn comieqsd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.comilt.sd"]
    fn comiltsd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.comile.sd"]
    fn comilesd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.comigt.sd"]
    fn comigtsd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.comige.sd"]
    fn comigesd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.comineq.sd"]
    fn comineqsd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.ucomieq.sd"]
    fn ucomieqsd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.ucomilt.sd"]
    fn ucomiltsd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.ucomile.sd"]
    fn ucomilesd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.ucomigt.sd"]
    fn ucomigtsd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.ucomige.sd"]
    fn ucomigesd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.ucomineq.sd"]
    fn ucomineqsd(a: __m128d, b: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.movmsk.pd"]
    fn movmskpd(a: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.cvtpd2ps"]
    fn cvtpd2ps(a: __m128d) -> __m128;
    #[link_name = "llvm.x86.sse2.cvtps2pd"]
    fn cvtps2pd(a: __m128) -> __m128d;
    #[link_name = "llvm.x86.sse2.cvtpd2dq"]
    fn cvtpd2dq(a: __m128d) -> i32x4;
    #[link_name = "llvm.x86.sse2.cvtsd2si"]
    fn cvtsd2si(a: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.cvtsd2ss"]
    fn cvtsd2ss(a: __m128, b: __m128d) -> __m128;
    #[link_name = "llvm.x86.sse2.cvtss2sd"]
    fn cvtss2sd(a: __m128d, b: __m128) -> __m128d;
    #[link_name = "llvm.x86.sse2.cvttpd2dq"]
    fn cvttpd2dq(a: __m128d) -> i32x4;
    #[link_name = "llvm.x86.sse2.cvttsd2si"]
    fn cvttsd2si(a: __m128d) -> i32;
    #[link_name = "llvm.x86.sse2.cvttps2dq"]
    fn cvttps2dq(a: __m128) -> i32x4;
    #[link_name = "llvm.x86.sse2.storeu.dq"]
    fn storeudq(mem_addr: *mut i8, a: __m128i);
    #[link_name = "llvm.x86.sse2.storeu.pd"]
    fn storeupd(mem_addr: *mut i8, a: __m128d);
    #[link_name = "llvm.x86.mmx.padd.q"]
    fn paddq(a: __m64, b: __m64) -> __m64;
    #[link_name = "llvm.x86.mmx.pmulu.dq"]
    fn pmuludq2(a: __m64, b: __m64) -> __m64;
    #[link_name = "llvm.x86.mmx.psub.q"]
    fn psubq(a: __m64, b: __m64) -> __m64;
    #[link_name = "llvm.x86.sse.cvtpi2pd"]
    fn cvtpi2pd(a: __m64) -> __m128d;
    #[link_name = "llvm.x86.sse.cvtpd2pi"]
    fn cvtpd2pi(a: __m128d) -> __m64;
    #[link_name = "llvm.x86.sse.cvttpd2pi"]
    fn cvttpd2pi(a: __m128d) -> __m64;
}

#[cfg(test)]
mod tests {
    use std::f32;
    use std::f64::{self, NAN};
    use std::i32;
    use std::mem::{self, transmute};

    use coresimd::simd::*;
    use coresimd::x86::*;
    use stdsimd_test::simd_test;
    use test::black_box; // Used to inhibit constant-folding.

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_pause() {
        _mm_pause();
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_clflush() {
        let x = 0;
        _mm_clflush(&x as *const _ as *mut u8);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_lfence() {
        _mm_lfence();
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_mfence() {
        _mm_mfence();
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_add_epi8() {
        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
        );
        let r = _mm_add_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_add_epi8_overflow() {
        let a = _mm_set1_epi8(0x7F);
        let b = _mm_set1_epi8(1);
        let r = _mm_add_epi8(a, b);
        assert_eq_m128i(r, _mm_set1_epi8(-128));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_add_epi16() {
        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
        let r = _mm_add_epi16(a, b);
        let e = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_add_epi32() {
        let a = _mm_setr_epi32(0, 1, 2, 3);
        let b = _mm_setr_epi32(4, 5, 6, 7);
        let r = _mm_add_epi32(a, b);
        let e = _mm_setr_epi32(4, 6, 8, 10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_add_epi64() {
        let a = _mm_setr_epi64x(0, 1);
        let b = _mm_setr_epi64x(2, 3);
        let r = _mm_add_epi64(a, b);
        let e = _mm_setr_epi64x(2, 4);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_adds_epi8() {
        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
        );
        let r = _mm_adds_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_adds_epi8_saturate_positive() {
        let a = _mm_set1_epi8(0x7F);
        let b = _mm_set1_epi8(1);
        let r = _mm_adds_epi8(a, b);
        assert_eq_m128i(r, a);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_adds_epi8_saturate_negative() {
        let a = _mm_set1_epi8(-0x80);
        let b = _mm_set1_epi8(-1);
        let r = _mm_adds_epi8(a, b);
        assert_eq_m128i(r, a);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_adds_epi16() {
        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
        let r = _mm_adds_epi16(a, b);
        let e = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_adds_epi16_saturate_positive() {
        let a = _mm_set1_epi16(0x7FFF);
        let b = _mm_set1_epi16(1);
        let r = _mm_adds_epi16(a, b);
        assert_eq_m128i(r, a);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_adds_epi16_saturate_negative() {
        let a = _mm_set1_epi16(-0x8000);
        let b = _mm_set1_epi16(-1);
        let r = _mm_adds_epi16(a, b);
        assert_eq_m128i(r, a);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_adds_epu8() {
        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
        );
        let r = _mm_adds_epu8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_adds_epu8_saturate() {
        let a = _mm_set1_epi8(!0);
        let b = _mm_set1_epi8(1);
        let r = _mm_adds_epu8(a, b);
        assert_eq_m128i(r, a);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_adds_epu16() {
        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
        let r = _mm_adds_epu16(a, b);
        let e = _mm_setr_epi16(8, 10, 12, 14, 16, 18, 20, 22);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_adds_epu16_saturate() {
        let a = _mm_set1_epi16(!0);
        let b = _mm_set1_epi16(1);
        let r = _mm_adds_epu16(a, b);
        assert_eq_m128i(r, a);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_avg_epu8() {
        let (a, b) = (_mm_set1_epi8(3), _mm_set1_epi8(9));
        let r = _mm_avg_epu8(a, b);
        assert_eq_m128i(r, _mm_set1_epi8(6));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_avg_epu16() {
        let (a, b) = (_mm_set1_epi16(3), _mm_set1_epi16(9));
        let r = _mm_avg_epu16(a, b);
        assert_eq_m128i(r, _mm_set1_epi16(6));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_madd_epi16() {
        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let b = _mm_setr_epi16(9, 10, 11, 12, 13, 14, 15, 16);
        let r = _mm_madd_epi16(a, b);
        let e = _mm_setr_epi32(29, 81, 149, 233);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_max_epi16() {
        let a = _mm_set1_epi16(1);
        let b = _mm_set1_epi16(-1);
        let r = _mm_max_epi16(a, b);
        assert_eq_m128i(r, a);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_max_epu8() {
        let a = _mm_set1_epi8(1);
        let b = _mm_set1_epi8(!0);
        let r = _mm_max_epu8(a, b);
        assert_eq_m128i(r, b);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_min_epi16() {
        let a = _mm_set1_epi16(1);
        let b = _mm_set1_epi16(-1);
        let r = _mm_min_epi16(a, b);
        assert_eq_m128i(r, b);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_min_epu8() {
        let a = _mm_set1_epi8(1);
        let b = _mm_set1_epi8(!0);
        let r = _mm_min_epu8(a, b);
        assert_eq_m128i(r, a);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_mulhi_epi16() {
        let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(-1001));
        let r = _mm_mulhi_epi16(a, b);
        assert_eq_m128i(r, _mm_set1_epi16(-16));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_mulhi_epu16() {
        let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(1001));
        let r = _mm_mulhi_epu16(a, b);
        assert_eq_m128i(r, _mm_set1_epi16(15));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_mullo_epi16() {
        let (a, b) = (_mm_set1_epi16(1000), _mm_set1_epi16(-1001));
        let r = _mm_mullo_epi16(a, b);
        assert_eq_m128i(r, _mm_set1_epi16(-17960));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_mul_epu32() {
        let a = _mm_setr_epi64x(1_000_000_000, 1 << 34);
        let b = _mm_setr_epi64x(1_000_000_000, 1 << 35);
        let r = _mm_mul_epu32(a, b);
        let e = _mm_setr_epi64x(1_000_000_000 * 1_000_000_000, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_sad_epu8() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            255u8 as i8, 254u8 as i8, 253u8 as i8, 252u8 as i8,
            1, 2, 3, 4,
            155u8 as i8, 154u8 as i8, 153u8 as i8, 152u8 as i8,
            1, 2, 3, 4,
        );
        let b = _mm_setr_epi8(0, 0, 0, 0, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2);
        let r = _mm_sad_epu8(a, b);
        let e = _mm_setr_epi64x(1020, 614);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_sub_epi8() {
        let (a, b) = (_mm_set1_epi8(5), _mm_set1_epi8(6));
        let r = _mm_sub_epi8(a, b);
        assert_eq_m128i(r, _mm_set1_epi8(-1));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_sub_epi16() {
        let (a, b) = (_mm_set1_epi16(5), _mm_set1_epi16(6));
        let r = _mm_sub_epi16(a, b);
        assert_eq_m128i(r, _mm_set1_epi16(-1));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_sub_epi32() {
        let (a, b) = (_mm_set1_epi32(5), _mm_set1_epi32(6));
        let r = _mm_sub_epi32(a, b);
        assert_eq_m128i(r, _mm_set1_epi32(-1));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_sub_epi64() {
        let (a, b) = (_mm_set1_epi64x(5), _mm_set1_epi64x(6));
        let r = _mm_sub_epi64(a, b);
        assert_eq_m128i(r, _mm_set1_epi64x(-1));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_subs_epi8() {
        let (a, b) = (_mm_set1_epi8(5), _mm_set1_epi8(2));
        let r = _mm_subs_epi8(a, b);
        assert_eq_m128i(r, _mm_set1_epi8(3));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_subs_epi8_saturate_positive() {
        let a = _mm_set1_epi8(0x7F);
        let b = _mm_set1_epi8(-1);
        let r = _mm_subs_epi8(a, b);
        assert_eq_m128i(r, a);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_subs_epi8_saturate_negative() {
        let a = _mm_set1_epi8(-0x80);
        let b = _mm_set1_epi8(1);
        let r = _mm_subs_epi8(a, b);
        assert_eq_m128i(r, a);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_subs_epi16() {
        let (a, b) = (_mm_set1_epi16(5), _mm_set1_epi16(2));
        let r = _mm_subs_epi16(a, b);
        assert_eq_m128i(r, _mm_set1_epi16(3));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_subs_epi16_saturate_positive() {
        let a = _mm_set1_epi16(0x7FFF);
        let b = _mm_set1_epi16(-1);
        let r = _mm_subs_epi16(a, b);
        assert_eq_m128i(r, a);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_subs_epi16_saturate_negative() {
        let a = _mm_set1_epi16(-0x8000);
        let b = _mm_set1_epi16(1);
        let r = _mm_subs_epi16(a, b);
        assert_eq_m128i(r, a);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_subs_epu8() {
        let (a, b) = (_mm_set1_epi8(5), _mm_set1_epi8(2));
        let r = _mm_subs_epu8(a, b);
        assert_eq_m128i(r, _mm_set1_epi8(3));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_subs_epu8_saturate() {
        let a = _mm_set1_epi8(0);
        let b = _mm_set1_epi8(1);
        let r = _mm_subs_epu8(a, b);
        assert_eq_m128i(r, a);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_subs_epu16() {
        let (a, b) = (_mm_set1_epi16(5), _mm_set1_epi16(2));
        let r = _mm_subs_epu16(a, b);
        assert_eq_m128i(r, _mm_set1_epi16(3));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_subs_epu16_saturate() {
        let a = _mm_set1_epi16(0);
        let b = _mm_set1_epi16(1);
        let r = _mm_subs_epu16(a, b);
        assert_eq_m128i(r, a);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_slli_si128() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
        );
        let r = _mm_slli_si128(a, 1);
        let e = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        assert_eq_m128i(r, e);

        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
        );
        let r = _mm_slli_si128(a, 15);
        let e = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1);
        assert_eq_m128i(r, e);

        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
        );
        let r = _mm_slli_si128(a, 16);
        assert_eq_m128i(r, _mm_set1_epi8(0));

        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
        );
        let r = _mm_slli_si128(a, -1);
        assert_eq_m128i(_mm_set1_epi8(0), r);

        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
        );
        let r = _mm_slli_si128(a, -0x80000000);
        assert_eq_m128i(r, _mm_set1_epi8(0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_slli_epi16() {
        #[rustfmt::skip]
        let a = _mm_setr_epi16(
            0xFFFF as u16 as i16, 0x0FFF, 0x00FF, 0x000F, 0, 0, 0, 0,
        );
        let r = _mm_slli_epi16(a, 4);

        #[rustfmt::skip]
        let e = _mm_setr_epi16(
            0xFFF0 as u16 as i16, 0xFFF0 as u16 as i16, 0x0FF0, 0x00F0,
            0, 0, 0, 0,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_sll_epi16() {
        let a = _mm_setr_epi16(0xFF, 0, 0, 0, 0, 0, 0, 0);
        let r = _mm_sll_epi16(a, _mm_setr_epi16(4, 0, 0, 0, 0, 0, 0, 0));
        assert_eq_m128i(r, _mm_setr_epi16(0xFF0, 0, 0, 0, 0, 0, 0, 0));
        let r = _mm_sll_epi16(a, _mm_setr_epi16(0, 0, 0, 0, 4, 0, 0, 0));
        assert_eq_m128i(r, _mm_setr_epi16(0xFF, 0, 0, 0, 0, 0, 0, 0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_slli_epi32() {
        let r = _mm_slli_epi32(_mm_set1_epi32(0xFFFF), 4);
        assert_eq_m128i(r, _mm_set1_epi32(0xFFFF0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_sll_epi32() {
        let a = _mm_set1_epi32(0xFFFF);
        let b = _mm_setr_epi32(4, 0, 0, 0);
        let r = _mm_sll_epi32(a, b);
        assert_eq_m128i(r, _mm_set1_epi32(0xFFFF0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_slli_epi64() {
        let r = _mm_slli_epi64(_mm_set1_epi64x(0xFFFFFFFF), 4);
        assert_eq_m128i(r, _mm_set1_epi64x(0xFFFFFFFF0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_sll_epi64() {
        let a = _mm_set1_epi64x(0xFFFFFFFF);
        let b = _mm_setr_epi64x(4, 0);
        let r = _mm_sll_epi64(a, b);
        assert_eq_m128i(r, _mm_set1_epi64x(0xFFFFFFFF0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_srai_epi16() {
        let r = _mm_srai_epi16(_mm_set1_epi16(-1), 1);
        assert_eq_m128i(r, _mm_set1_epi16(-1));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_sra_epi16() {
        let a = _mm_set1_epi16(-1);
        let b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
        let r = _mm_sra_epi16(a, b);
        assert_eq_m128i(r, _mm_set1_epi16(-1));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_srai_epi32() {
        let r = _mm_srai_epi32(_mm_set1_epi32(-1), 1);
        assert_eq_m128i(r, _mm_set1_epi32(-1));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_sra_epi32() {
        let a = _mm_set1_epi32(-1);
        let b = _mm_setr_epi32(1, 0, 0, 0);
        let r = _mm_sra_epi32(a, b);
        assert_eq_m128i(r, _mm_set1_epi32(-1));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_srli_si128() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
        );
        let r = _mm_srli_si128(a, 1);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0,
        );
        assert_eq_m128i(r, e);

        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
        );
        let r = _mm_srli_si128(a, 15);
        let e = _mm_setr_epi8(16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);

        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
        );
        let r = _mm_srli_si128(a, 16);
        assert_eq_m128i(r, _mm_set1_epi8(0));

        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
        );
        let r = _mm_srli_si128(a, -1);
        assert_eq_m128i(r, _mm_set1_epi8(0));

        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
        );
        let r = _mm_srli_si128(a, -0x80000000);
        assert_eq_m128i(r, _mm_set1_epi8(0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_srli_epi16() {
        #[rustfmt::skip]
        let a = _mm_setr_epi16(
            0xFFFF as u16 as i16, 0x0FFF, 0x00FF, 0x000F, 0, 0, 0, 0,
        );
        let r = _mm_srli_epi16(a, 4);
        #[rustfmt::skip]
        let e = _mm_setr_epi16(
            0xFFF as u16 as i16, 0xFF as u16 as i16, 0xF, 0, 0, 0, 0, 0,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_srl_epi16() {
        let a = _mm_setr_epi16(0xFF, 0, 0, 0, 0, 0, 0, 0);
        let r = _mm_srl_epi16(a, _mm_setr_epi16(4, 0, 0, 0, 0, 0, 0, 0));
        assert_eq_m128i(r, _mm_setr_epi16(0xF, 0, 0, 0, 0, 0, 0, 0));
        let r = _mm_srl_epi16(a, _mm_setr_epi16(0, 0, 0, 0, 4, 0, 0, 0));
        assert_eq_m128i(r, _mm_setr_epi16(0xFF, 0, 0, 0, 0, 0, 0, 0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_srli_epi32() {
        let r = _mm_srli_epi32(_mm_set1_epi32(0xFFFF), 4);
        assert_eq_m128i(r, _mm_set1_epi32(0xFFF));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_srl_epi32() {
        let a = _mm_set1_epi32(0xFFFF);
        let b = _mm_setr_epi32(4, 0, 0, 0);
        let r = _mm_srl_epi32(a, b);
        assert_eq_m128i(r, _mm_set1_epi32(0xFFF));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_srli_epi64() {
        let r = _mm_srli_epi64(_mm_set1_epi64x(0xFFFFFFFF), 4);
        assert_eq_m128i(r, _mm_set1_epi64x(0xFFFFFFF));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_srl_epi64() {
        let a = _mm_set1_epi64x(0xFFFFFFFF);
        let b = _mm_setr_epi64x(4, 0);
        let r = _mm_srl_epi64(a, b);
        assert_eq_m128i(r, _mm_set1_epi64x(0xFFFFFFF));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_and_si128() {
        let a = _mm_set1_epi8(5);
        let b = _mm_set1_epi8(3);
        let r = _mm_and_si128(a, b);
        assert_eq_m128i(r, _mm_set1_epi8(1));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_andnot_si128() {
        let a = _mm_set1_epi8(5);
        let b = _mm_set1_epi8(3);
        let r = _mm_andnot_si128(a, b);
        assert_eq_m128i(r, _mm_set1_epi8(2));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_or_si128() {
        let a = _mm_set1_epi8(5);
        let b = _mm_set1_epi8(3);
        let r = _mm_or_si128(a, b);
        assert_eq_m128i(r, _mm_set1_epi8(7));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_xor_si128() {
        let a = _mm_set1_epi8(5);
        let b = _mm_set1_epi8(3);
        let r = _mm_xor_si128(a, b);
        assert_eq_m128i(r, _mm_set1_epi8(6));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpeq_epi8() {
        let a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let b = _mm_setr_epi8(15, 14, 2, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
        let r = _mm_cmpeq_epi8(a, b);
        #[rustfmt::skip]
        assert_eq_m128i(
            r,
            _mm_setr_epi8(
                0, 0, 0xFFu8 as i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
            )
        );
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpeq_epi16() {
        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let b = _mm_setr_epi16(7, 6, 2, 4, 3, 2, 1, 0);
        let r = _mm_cmpeq_epi16(a, b);
        assert_eq_m128i(r, _mm_setr_epi16(0, 0, !0, 0, 0, 0, 0, 0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpeq_epi32() {
        let a = _mm_setr_epi32(0, 1, 2, 3);
        let b = _mm_setr_epi32(3, 2, 2, 0);
        let r = _mm_cmpeq_epi32(a, b);
        assert_eq_m128i(r, _mm_setr_epi32(0, 0, !0, 0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpgt_epi8() {
        let a = _mm_set_epi8(5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        let b = _mm_set1_epi8(0);
        let r = _mm_cmpgt_epi8(a, b);
        let e = _mm_set_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpgt_epi16() {
        let a = _mm_set_epi16(5, 0, 0, 0, 0, 0, 0, 0);
        let b = _mm_set1_epi16(0);
        let r = _mm_cmpgt_epi16(a, b);
        let e = _mm_set_epi16(!0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpgt_epi32() {
        let a = _mm_set_epi32(5, 0, 0, 0);
        let b = _mm_set1_epi32(0);
        let r = _mm_cmpgt_epi32(a, b);
        assert_eq_m128i(r, _mm_set_epi32(!0, 0, 0, 0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmplt_epi8() {
        let a = _mm_set1_epi8(0);
        let b = _mm_set_epi8(5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        let r = _mm_cmplt_epi8(a, b);
        let e = _mm_set_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmplt_epi16() {
        let a = _mm_set1_epi16(0);
        let b = _mm_set_epi16(5, 0, 0, 0, 0, 0, 0, 0);
        let r = _mm_cmplt_epi16(a, b);
        let e = _mm_set_epi16(!0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmplt_epi32() {
        let a = _mm_set1_epi32(0);
        let b = _mm_set_epi32(5, 0, 0, 0);
        let r = _mm_cmplt_epi32(a, b);
        assert_eq_m128i(r, _mm_set_epi32(!0, 0, 0, 0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvtepi32_pd() {
        let a = _mm_set_epi32(35, 25, 15, 5);
        let r = _mm_cvtepi32_pd(a);
        assert_eq_m128d(r, _mm_setr_pd(5.0, 15.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvtsi32_sd() {
        let a = _mm_set1_pd(3.5);
        let r = _mm_cvtsi32_sd(a, 5);
        assert_eq_m128d(r, _mm_setr_pd(5.0, 3.5));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvtepi32_ps() {
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let r = _mm_cvtepi32_ps(a);
        assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 3.0, 4.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvtps_epi32() {
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let r = _mm_cvtps_epi32(a);
        assert_eq_m128i(r, _mm_setr_epi32(1, 2, 3, 4));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvtsi32_si128() {
        let r = _mm_cvtsi32_si128(5);
        assert_eq_m128i(r, _mm_setr_epi32(5, 0, 0, 0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvtsi128_si32() {
        let r = _mm_cvtsi128_si32(_mm_setr_epi32(5, 0, 0, 0));
        assert_eq!(r, 5);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_set_epi64x() {
        let r = _mm_set_epi64x(0, 1);
        assert_eq_m128i(r, _mm_setr_epi64x(1, 0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_set_epi32() {
        let r = _mm_set_epi32(0, 1, 2, 3);
        assert_eq_m128i(r, _mm_setr_epi32(3, 2, 1, 0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_set_epi16() {
        let r = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        assert_eq_m128i(r, _mm_setr_epi16(7, 6, 5, 4, 3, 2, 1, 0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_set_epi8() {
        #[rustfmt::skip]
        let r = _mm_set_epi8(
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            15, 14, 13, 12, 11, 10, 9, 8,
            7, 6, 5, 4, 3, 2, 1, 0,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_set1_epi64x() {
        let r = _mm_set1_epi64x(1);
        assert_eq_m128i(r, _mm_set1_epi64x(1));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_set1_epi32() {
        let r = _mm_set1_epi32(1);
        assert_eq_m128i(r, _mm_set1_epi32(1));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_set1_epi16() {
        let r = _mm_set1_epi16(1);
        assert_eq_m128i(r, _mm_set1_epi16(1));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_set1_epi8() {
        let r = _mm_set1_epi8(1);
        assert_eq_m128i(r, _mm_set1_epi8(1));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_setr_epi32() {
        let r = _mm_setr_epi32(0, 1, 2, 3);
        assert_eq_m128i(r, _mm_setr_epi32(0, 1, 2, 3));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_setr_epi16() {
        let r = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        assert_eq_m128i(r, _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_setr_epi8() {
        #[rustfmt::skip]
        let r = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_setzero_si128() {
        let r = _mm_setzero_si128();
        assert_eq_m128i(r, _mm_set1_epi64x(0));
    }

83c7162d 4023 #[simd_test(enable = "sse2")]
0531ce1d
XL
4024 unsafe fn test_mm_loadl_epi64() {
4025 let a = _mm_setr_epi64x(6, 5);
4026 let r = _mm_loadl_epi64(&a as *const _);
4027 assert_eq_m128i(r, _mm_setr_epi64x(6, 0));
4028 }
4029
83c7162d 4030 #[simd_test(enable = "sse2")]
0531ce1d
XL
4031 unsafe fn test_mm_load_si128() {
4032 let a = _mm_set_epi64x(5, 6);
4033 let r = _mm_load_si128(&a as *const _ as *const _);
4034 assert_eq_m128i(a, r);
4035 }
4036
83c7162d 4037 #[simd_test(enable = "sse2")]
0531ce1d
XL
4038 unsafe fn test_mm_loadu_si128() {
4039 let a = _mm_set_epi64x(5, 6);
4040 let r = _mm_loadu_si128(&a as *const _ as *const _);
4041 assert_eq_m128i(a, r);
4042 }
4043
83c7162d 4044 #[simd_test(enable = "sse2")]
0531ce1d
XL
4045 unsafe fn test_mm_maskmoveu_si128() {
4046 let a = _mm_set1_epi8(9);
0731742a 4047 #[rustfmt::skip]
0531ce1d
XL
4048 let mask = _mm_set_epi8(
4049 0, 0, 0x80u8 as i8, 0, 0, 0, 0, 0,
4050 0, 0, 0, 0, 0, 0, 0, 0,
4051 );
4052 let mut r = _mm_set1_epi8(0);
4053 _mm_maskmoveu_si128(a, mask, &mut r as *mut _ as *mut i8);
4054 let e = _mm_set_epi8(0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
4055 assert_eq_m128i(r, e);
4056 }
4057
83c7162d 4058 #[simd_test(enable = "sse2")]
0531ce1d
XL
4059 unsafe fn test_mm_store_si128() {
4060 let a = _mm_set1_epi8(9);
4061 let mut r = _mm_set1_epi8(0);
4062 _mm_store_si128(&mut r as *mut _ as *mut __m128i, a);
4063 assert_eq_m128i(r, a);
4064 }
4065
83c7162d 4066 #[simd_test(enable = "sse2")]
0531ce1d
XL
4067 unsafe fn test_mm_storeu_si128() {
4068 let a = _mm_set1_epi8(9);
4069 let mut r = _mm_set1_epi8(0);
4070 _mm_storeu_si128(&mut r as *mut _ as *mut __m128i, a);
4071 assert_eq_m128i(r, a);
4072 }
4073
83c7162d 4074 #[simd_test(enable = "sse2")]
0531ce1d
XL
4075 unsafe fn test_mm_storel_epi64() {
4076 let a = _mm_setr_epi64x(2, 9);
4077 let mut r = _mm_set1_epi8(0);
4078 _mm_storel_epi64(&mut r as *mut _ as *mut __m128i, a);
4079 assert_eq_m128i(r, _mm_setr_epi64x(2, 0));
4080 }
4081
83c7162d 4082 #[simd_test(enable = "sse2")]
0531ce1d
XL
4083 unsafe fn test_mm_stream_si128() {
4084 let a = _mm_setr_epi32(1, 2, 3, 4);
4085 let mut r = _mm_undefined_si128();
4086 _mm_stream_si128(&mut r as *mut _, a);
4087 assert_eq_m128i(r, a);
4088 }
4089
83c7162d 4090 #[simd_test(enable = "sse2")]
0531ce1d
XL
4091 unsafe fn test_mm_stream_si32() {
4092 let a: i32 = 7;
4093 let mut mem = ::std::boxed::Box::<i32>::new(-1);
4094 _mm_stream_si32(&mut *mem as *mut i32, a);
4095 assert_eq!(a, *mem);
4096 }
4097
83c7162d 4098 #[simd_test(enable = "sse2")]
0531ce1d
XL
4099 unsafe fn test_mm_move_epi64() {
4100 let a = _mm_setr_epi64x(5, 6);
4101 let r = _mm_move_epi64(a);
4102 assert_eq_m128i(r, _mm_setr_epi64x(5, 0));
4103 }
4104
    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_packs_epi16() {
        let a = _mm_setr_epi16(0x80, -0x81, 0, 0, 0, 0, 0, 0);
        let b = _mm_setr_epi16(0, 0, 0, 0, 0, 0, -0x81, 0x80);
        let r = _mm_packs_epi16(a, b);
        #[rustfmt::skip]
        assert_eq_m128i(
            r,
            _mm_setr_epi8(
                0x7F, -0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0x80, 0x7F
            )
        );
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_packs_epi32() {
        let a = _mm_setr_epi32(0x8000, -0x8001, 0, 0);
        let b = _mm_setr_epi32(0, 0, -0x8001, 0x8000);
        let r = _mm_packs_epi32(a, b);
        assert_eq_m128i(
            r,
            _mm_setr_epi16(0x7FFF, -0x8000, 0, 0, 0, 0, -0x8000, 0x7FFF),
        );
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_packus_epi16() {
        let a = _mm_setr_epi16(0x100, -1, 0, 0, 0, 0, 0, 0);
        let b = _mm_setr_epi16(0, 0, 0, 0, 0, 0, -1, 0x100);
        let r = _mm_packus_epi16(a, b);
        assert_eq_m128i(
            r,
            _mm_setr_epi8(!0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, !0),
        );
    }

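    // pextrw decodes only the low three bits of the index operand, so the
    // out-of-range index 11 wraps around to lane 11 & 7 == 3.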
    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_extract_epi16() {
        let a = _mm_setr_epi16(-1, 1, 2, 3, 4, 5, 6, 7);
        let r1 = _mm_extract_epi16(a, 0);
        let r2 = _mm_extract_epi16(a, 11);
        assert_eq!(r1, -1);
        assert_eq!(r2, 3);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_insert_epi16() {
        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let r = _mm_insert_epi16(a, 9, 0);
        let e = _mm_setr_epi16(9, 1, 2, 3, 4, 5, 6, 7);
        assert_eq_m128i(r, e);
    }

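    // pmovmskb gathers the most significant bit of each byte into a 16-bit
    // mask, with byte 0 landing in bit 0.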
    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_movemask_epi8() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0b1000_0000u8 as i8, 0b0, 0b1000_0000u8 as i8, 0b01,
            0b0101, 0b1111_0000u8 as i8, 0, 0,
            0, 0, 0b1111_0000u8 as i8, 0b0101,
            0b01, 0b1000_0000u8 as i8, 0b0, 0b1000_0000u8 as i8,
        );
        let r = _mm_movemask_epi8(a);
        assert_eq!(r, 0b10100100_00100101);
    }

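    // In the shuffle immediates below, each 2-bit field selects a source
    // lane for one destination lane, starting with the least significant
    // field: 0b00_01_01_11 picks lanes 3, 1, 1, 0.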
    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_shuffle_epi32() {
        let a = _mm_setr_epi32(5, 10, 15, 20);
        let r = _mm_shuffle_epi32(a, 0b00_01_01_11);
        let e = _mm_setr_epi32(20, 10, 10, 5);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_shufflehi_epi16() {
        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 10, 15, 20);
        let r = _mm_shufflehi_epi16(a, 0b00_01_01_11);
        let e = _mm_setr_epi16(1, 2, 3, 4, 20, 10, 10, 5);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_shufflelo_epi16() {
        let a = _mm_setr_epi16(5, 10, 15, 20, 1, 2, 3, 4);
        let r = _mm_shufflelo_epi16(a, 0b00_01_01_11);
        let e = _mm_setr_epi16(20, 10, 10, 5, 1, 2, 3, 4);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpackhi_epi8() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
        );
        let r = _mm_unpackhi_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpackhi_epi16() {
        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
        let r = _mm_unpackhi_epi16(a, b);
        let e = _mm_setr_epi16(4, 12, 5, 13, 6, 14, 7, 15);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpackhi_epi32() {
        let a = _mm_setr_epi32(0, 1, 2, 3);
        let b = _mm_setr_epi32(4, 5, 6, 7);
        let r = _mm_unpackhi_epi32(a, b);
        let e = _mm_setr_epi32(2, 6, 3, 7);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpackhi_epi64() {
        let a = _mm_setr_epi64x(0, 1);
        let b = _mm_setr_epi64x(2, 3);
        let r = _mm_unpackhi_epi64(a, b);
        let e = _mm_setr_epi64x(1, 3);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpacklo_epi8() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
        );
        let r = _mm_unpacklo_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            0, 16, 1, 17, 2, 18, 3, 19,
            4, 20, 5, 21, 6, 22, 7, 23,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpacklo_epi16() {
        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let b = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
        let r = _mm_unpacklo_epi16(a, b);
        let e = _mm_setr_epi16(0, 8, 1, 9, 2, 10, 3, 11);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpacklo_epi32() {
        let a = _mm_setr_epi32(0, 1, 2, 3);
        let b = _mm_setr_epi32(4, 5, 6, 7);
        let r = _mm_unpacklo_epi32(a, b);
        let e = _mm_setr_epi32(0, 4, 1, 5);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpacklo_epi64() {
        let a = _mm_setr_epi64x(0, 1);
        let b = _mm_setr_epi64x(2, 3);
        let r = _mm_unpacklo_epi64(a, b);
        let e = _mm_setr_epi64x(0, 2);
        assert_eq_m128i(r, e);
    }

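    // The `_sd` arithmetic intrinsics operate on the low f64 lane only,
    // copying the high lane of `a` through unchanged; the `_pd` variants
    // operate on both lanes.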
    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_add_sd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_add_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(6.0, 2.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_add_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_add_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(6.0, 12.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_div_sd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_div_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(0.2, 2.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_div_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_div_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(0.2, 0.2));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_max_sd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_max_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(5.0, 2.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_max_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_max_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(5.0, 10.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_min_sd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_min_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_min_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_min_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_mul_sd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_mul_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(5.0, 2.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_mul_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_mul_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(5.0, 20.0));
    }

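    // Note the operand order: _mm_sqrt_sd(a, b) computes the square root of
    // the low lane of `b` and takes its high lane from `a`.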
    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_sqrt_sd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_sqrt_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(5.0f64.sqrt(), 2.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_sqrt_pd() {
        let r = _mm_sqrt_pd(_mm_setr_pd(1.0, 2.0));
        assert_eq_m128d(r, _mm_setr_pd(1.0f64.sqrt(), 2.0f64.sqrt()));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_sub_sd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_sub_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(-4.0, 2.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_sub_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(5.0, 10.0);
        let r = _mm_sub_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(-4.0, -8.0));
    }

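    // The bitwise `_pd` intrinsics operate on raw bit patterns, so these
    // tests build their operands by transmuting small u64x2 constants.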
    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_and_pd() {
        let a = transmute(u64x2::splat(5));
        let b = transmute(u64x2::splat(3));
        let r = _mm_and_pd(a, b);
        let e = transmute(u64x2::splat(1));
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_andnot_pd() {
        let a = transmute(u64x2::splat(5));
        let b = transmute(u64x2::splat(3));
        let r = _mm_andnot_pd(a, b);
        let e = transmute(u64x2::splat(2));
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_or_pd() {
        let a = transmute(u64x2::splat(5));
        let b = transmute(u64x2::splat(3));
        let r = _mm_or_pd(a, b);
        let e = transmute(u64x2::splat(7));
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_xor_pd() {
        let a = transmute(u64x2::splat(5));
        let b = transmute(u64x2::splat(3));
        let r = _mm_xor_pd(a, b);
        let e = transmute(u64x2::splat(6));
        assert_eq_m128d(r, e);
    }

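    // The scalar compares return their result as a mask in the low lane
    // (all ones for true, all zeros for false) and copy the high lane of
    // `a`, hence the `transmute(2.0f64)` in every expected value below.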
    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpeq_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, transmute(2.0f64));
        let r = transmute::<_, __m128i>(_mm_cmpeq_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmplt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(!0, transmute(2.0f64));
        let r = transmute::<_, __m128i>(_mm_cmplt_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmple_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, transmute(2.0f64));
        let r = transmute::<_, __m128i>(_mm_cmple_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpgt_sd() {
        let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, transmute(2.0f64));
        let r = transmute::<_, __m128i>(_mm_cmpgt_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpge_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, transmute(2.0f64));
        let r = transmute::<_, __m128i>(_mm_cmpge_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpord_sd() {
        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(0, transmute(2.0f64));
        let r = transmute::<_, __m128i>(_mm_cmpord_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpunord_sd() {
        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(!0, transmute(2.0f64));
        let r = transmute::<_, __m128i>(_mm_cmpunord_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpneq_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(!0, transmute(2.0f64));
        let r = transmute::<_, __m128i>(_mm_cmpneq_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpnlt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(0, transmute(2.0f64));
        let r = transmute::<_, __m128i>(_mm_cmpnlt_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpnle_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, transmute(2.0f64));
        let r = transmute::<_, __m128i>(_mm_cmpnle_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpngt_sd() {
        let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, transmute(2.0f64));
        let r = transmute::<_, __m128i>(_mm_cmpngt_sd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpnge_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, transmute(2.0f64));
        let r = transmute::<_, __m128i>(_mm_cmpnge_sd(a, b));
        assert_eq_m128i(r, e);
    }

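    // The packed compares produce a per-lane all-ones/all-zeros mask. The
    // negated predicates (cmpneq, cmpnlt, ...) are IEEE negations, so they
    // also report true when the operands are unordered (either is NaN).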
    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpeq_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, 0);
        let r = transmute::<_, __m128i>(_mm_cmpeq_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmplt_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, !0);
        let r = transmute::<_, __m128i>(_mm_cmplt_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmple_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, !0);
        let r = transmute::<_, __m128i>(_mm_cmple_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpgt_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, 0);
        let r = transmute::<_, __m128i>(_mm_cmpgt_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpge_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(!0, 0);
        let r = transmute::<_, __m128i>(_mm_cmpge_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpord_pd() {
        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(0, !0);
        let r = transmute::<_, __m128i>(_mm_cmpord_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpunord_pd() {
        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(!0, 0);
        let r = transmute::<_, __m128i>(_mm_cmpunord_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpneq_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(!0, !0);
        let r = transmute::<_, __m128i>(_mm_cmpneq_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpnlt_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(5.0, 3.0));
        let e = _mm_setr_epi64x(0, 0);
        let r = transmute::<_, __m128i>(_mm_cmpnlt_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpnle_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, 0);
        let r = transmute::<_, __m128i>(_mm_cmpnle_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpngt_pd() {
        let (a, b) = (_mm_setr_pd(5.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, !0);
        let r = transmute::<_, __m128i>(_mm_cmpngt_pd(a, b));
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cmpnge_pd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        let e = _mm_setr_epi64x(0, !0);
        let r = transmute::<_, __m128i>(_mm_cmpnge_pd(a, b));
        assert_eq_m128i(r, e);
    }

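    // comisd and ucomisd return the comparison result as 0 or 1 instead of
    // a mask, and both report false for equality involving NaN; they differ
    // only in whether a quiet NaN raises the invalid-operation exception.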
    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_comieq_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comieq_sd(a, b) != 0);

        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comieq_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_comilt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comilt_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_comile_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comile_sd(a, b) != 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_comigt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comigt_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_comige_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comige_sd(a, b) != 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_comineq_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_comineq_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_ucomieq_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomieq_sd(a, b) != 0);

        let (a, b) = (_mm_setr_pd(NAN, 2.0), _mm_setr_pd(NAN, 3.0));
        assert!(_mm_ucomieq_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_ucomilt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomilt_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_ucomile_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomile_sd(a, b) != 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_ucomigt_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomigt_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_ucomige_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomige_sd(a, b) != 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_ucomineq_sd() {
        let (a, b) = (_mm_setr_pd(1.0, 2.0), _mm_setr_pd(1.0, 3.0));
        assert!(_mm_ucomineq_sd(a, b) == 0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_movemask_pd() {
        let r = _mm_movemask_pd(_mm_setr_pd(-1.0, 5.0));
        assert_eq!(r, 0b01);

        let r = _mm_movemask_pd(_mm_setr_pd(-1.0, -5.0));
        assert_eq!(r, 0b11);
    }

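    // Backing storage with the 16-byte alignment required by the aligned
    // load/store intrinsics exercised below.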
    #[repr(align(16))]
    struct Memory {
        data: [f64; 4],
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_load_pd() {
        let mem = Memory {
            data: [1.0f64, 2.0, 3.0, 4.0],
        };
        let vals = &mem.data;
        let d = vals.as_ptr();

        let r = _mm_load_pd(d);
        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_load_sd() {
        let a = 1.;
        let expected = _mm_setr_pd(a, 0.);
        let r = _mm_load_sd(&a);
        assert_eq_m128d(r, expected);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_loadh_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = 3.;
        let expected = _mm_setr_pd(_mm_cvtsd_f64(a), 3.);
        let r = _mm_loadh_pd(a, &b);
        assert_eq_m128d(r, expected);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_loadl_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = 3.;
        let expected = _mm_setr_pd(3., get_m128d(a, 1));
        let r = _mm_loadl_pd(a, &b);
        assert_eq_m128d(r, expected);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_stream_pd() {
        #[repr(align(128))]
        struct Memory {
            pub data: [f64; 2],
        }
        let a = _mm_set1_pd(7.0);
        let mut mem = Memory { data: [-1.0; 2] };

        _mm_stream_pd(&mut mem.data[0] as *mut f64, a);
        for i in 0..2 {
            assert_eq!(mem.data[i], get_m128d(a, i));
        }
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_store_sd() {
        let mut dest = 0.;
        let a = _mm_setr_pd(1., 2.);
        _mm_store_sd(&mut dest, a);
        assert_eq!(dest, _mm_cvtsd_f64(a));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_store_pd() {
        let mut mem = Memory { data: [0.0f64; 4] };
        let vals = &mut mem.data;
        let a = _mm_setr_pd(1.0, 2.0);
        let d = vals.as_mut_ptr();

        _mm_store_pd(d, *black_box(&a));
        assert_eq!(vals[0], 1.0);
        assert_eq!(vals[1], 2.0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_storeu_pd() {
        let mut mem = Memory { data: [0.0f64; 4] };
        let vals = &mut mem.data;
        let a = _mm_setr_pd(1.0, 2.0);

        let mut ofs = 0;
        let mut p = vals.as_mut_ptr();

        // Make sure p is *not* aligned to 16-byte boundary
        if (p as usize) & 0xf == 0 {
            ofs = 1;
            p = p.offset(1);
        }

        _mm_storeu_pd(p, *black_box(&a));

        if ofs > 0 {
            assert_eq!(vals[ofs - 1], 0.0);
        }
        assert_eq!(vals[ofs + 0], 1.0);
        assert_eq!(vals[ofs + 1], 2.0);
    }

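    // store1_pd and its alias store_pd1 duplicate the low lane into both
    // memory slots; storer_pd stores the two lanes in reversed order.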
    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_store1_pd() {
        let mut mem = Memory { data: [0.0f64; 4] };
        let vals = &mut mem.data;
        let a = _mm_setr_pd(1.0, 2.0);
        let d = vals.as_mut_ptr();

        _mm_store1_pd(d, *black_box(&a));
        assert_eq!(vals[0], 1.0);
        assert_eq!(vals[1], 1.0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_store_pd1() {
        let mut mem = Memory { data: [0.0f64; 4] };
        let vals = &mut mem.data;
        let a = _mm_setr_pd(1.0, 2.0);
        let d = vals.as_mut_ptr();

        _mm_store_pd1(d, *black_box(&a));
        assert_eq!(vals[0], 1.0);
        assert_eq!(vals[1], 1.0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_storer_pd() {
        let mut mem = Memory { data: [0.0f64; 4] };
        let vals = &mut mem.data;
        let a = _mm_setr_pd(1.0, 2.0);
        let d = vals.as_mut_ptr();

        _mm_storer_pd(d, *black_box(&a));
        assert_eq!(vals[0], 2.0);
        assert_eq!(vals[1], 1.0);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_storeh_pd() {
        let mut dest = 0.;
        let a = _mm_setr_pd(1., 2.);
        _mm_storeh_pd(&mut dest, a);
        assert_eq!(dest, get_m128d(a, 1));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_storel_pd() {
        let mut dest = 0.;
        let a = _mm_setr_pd(1., 2.);
        _mm_storel_pd(&mut dest, a);
        assert_eq!(dest, _mm_cvtsd_f64(a));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_loadr_pd() {
        let mut mem = Memory {
            data: [1.0f64, 2.0, 3.0, 4.0],
        };
        let vals = &mut mem.data;
        let d = vals.as_ptr();

        let r = _mm_loadr_pd(d);
        assert_eq_m128d(r, _mm_setr_pd(2.0, 1.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_loadu_pd() {
        let mut mem = Memory {
            data: [1.0f64, 2.0, 3.0, 4.0],
        };
        let vals = &mut mem.data;
        let mut d = vals.as_ptr();

        // make sure d is not aligned to 16-byte boundary
        let mut offset = 0;
        if (d as usize) & 0xf == 0 {
            offset = 1;
            d = d.offset(offset as isize);
        }

        let r = _mm_loadu_pd(d);
        let e = _mm_add_pd(_mm_setr_pd(1.0, 2.0), _mm_set1_pd(offset as f64));
        assert_eq_m128d(r, e);
    }

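    // Narrowing f64 -> f32 converts values beyond the f32 range to
    // +/-infinity, and cvtpd_ps zeroes the upper two lanes of the result.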
    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvtpd_ps() {
        let r = _mm_cvtpd_ps(_mm_setr_pd(-1.0, 5.0));
        assert_eq_m128(r, _mm_setr_ps(-1.0, 5.0, 0.0, 0.0));

        let r = _mm_cvtpd_ps(_mm_setr_pd(-1.0, -5.0));
        assert_eq_m128(r, _mm_setr_ps(-1.0, -5.0, 0.0, 0.0));

        let r = _mm_cvtpd_ps(_mm_setr_pd(f64::MAX, f64::MIN));
        assert_eq_m128(r, _mm_setr_ps(f32::INFINITY, f32::NEG_INFINITY, 0.0, 0.0));

        let r = _mm_cvtpd_ps(_mm_setr_pd(f32::MAX as f64, f32::MIN as f64));
        assert_eq_m128(r, _mm_setr_ps(f32::MAX, f32::MIN, 0.0, 0.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvtps_pd() {
        let r = _mm_cvtps_pd(_mm_setr_ps(-1.0, 2.0, -3.0, 5.0));
        assert_eq_m128d(r, _mm_setr_pd(-1.0, 2.0));

        let r = _mm_cvtps_pd(_mm_setr_ps(
            f32::MAX,
            f32::INFINITY,
            f32::NEG_INFINITY,
            f32::MIN,
        ));
        assert_eq_m128d(r, _mm_setr_pd(f32::MAX as f64, f64::INFINITY));
    }

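    // Conversions to i32 that overflow, or start from NaN or infinity,
    // return the "integer indefinite" value, i.e. i32::MIN.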
    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvtpd_epi32() {
        let r = _mm_cvtpd_epi32(_mm_setr_pd(-1.0, 5.0));
        assert_eq_m128i(r, _mm_setr_epi32(-1, 5, 0, 0));

        let r = _mm_cvtpd_epi32(_mm_setr_pd(-1.0, -5.0));
        assert_eq_m128i(r, _mm_setr_epi32(-1, -5, 0, 0));

        let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::MAX, f64::MIN));
        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));

        let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::INFINITY, f64::NEG_INFINITY));
        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));

        let r = _mm_cvtpd_epi32(_mm_setr_pd(f64::NAN, f64::NAN));
        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvtsd_si32() {
        let r = _mm_cvtsd_si32(_mm_setr_pd(-2.0, 5.0));
        assert_eq!(r, -2);

        let r = _mm_cvtsd_si32(_mm_setr_pd(f64::MAX, f64::MIN));
        assert_eq!(r, i32::MIN);

        let r = _mm_cvtsd_si32(_mm_setr_pd(f64::NAN, f64::NAN));
        assert_eq!(r, i32::MIN);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvtsd_ss() {
        let a = _mm_setr_ps(-1.1, -2.2, 3.3, 4.4);
        let b = _mm_setr_pd(2.0, -5.0);

        let r = _mm_cvtsd_ss(a, b);

        assert_eq_m128(r, _mm_setr_ps(2.0, -2.2, 3.3, 4.4));

        let a = _mm_setr_ps(-1.1, f32::NEG_INFINITY, f32::MAX, f32::NEG_INFINITY);
        let b = _mm_setr_pd(f64::INFINITY, -5.0);

        let r = _mm_cvtsd_ss(a, b);

        assert_eq_m128(
            r,
            _mm_setr_ps(
                f32::INFINITY,
                f32::NEG_INFINITY,
                f32::MAX,
                f32::NEG_INFINITY,
            ),
        );
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvtsd_f64() {
        let r = _mm_cvtsd_f64(_mm_setr_pd(-1.1, 2.2));
        assert_eq!(r, -1.1);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvtss_sd() {
        let a = _mm_setr_pd(-1.1, 2.2);
        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);

        let r = _mm_cvtss_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(1.0, 2.2));

        let a = _mm_setr_pd(-1.1, f64::INFINITY);
        let b = _mm_setr_ps(f32::NEG_INFINITY, 2.0, 3.0, 4.0);

        let r = _mm_cvtss_sd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(f64::NEG_INFINITY, f64::INFINITY));
    }

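    // The cvtt* variants truncate toward zero instead of using the current
    // rounding mode (round-to-nearest-even by default).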
    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvttpd_epi32() {
        let a = _mm_setr_pd(-1.1, 2.2);
        let r = _mm_cvttpd_epi32(a);
        assert_eq_m128i(r, _mm_setr_epi32(-1, 2, 0, 0));

        let a = _mm_setr_pd(f64::NEG_INFINITY, f64::NAN);
        let r = _mm_cvttpd_epi32(a);
        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, 0, 0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvttsd_si32() {
        let a = _mm_setr_pd(-1.1, 2.2);
        let r = _mm_cvttsd_si32(a);
        assert_eq!(r, -1);

        let a = _mm_setr_pd(f64::NEG_INFINITY, f64::NAN);
        let r = _mm_cvttsd_si32(a);
        assert_eq!(r, i32::MIN);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_cvttps_epi32() {
        let a = _mm_setr_ps(-1.1, 2.2, -3.3, 6.6);
        let r = _mm_cvttps_epi32(a);
        assert_eq_m128i(r, _mm_setr_epi32(-1, 2, -3, 6));

        let a = _mm_setr_ps(f32::NEG_INFINITY, f32::INFINITY, f32::MIN, f32::MAX);
        let r = _mm_cvttps_epi32(a);
        assert_eq_m128i(r, _mm_setr_epi32(i32::MIN, i32::MIN, i32::MIN, i32::MIN));
    }

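    // _mm_set_pd takes its arguments from high lane to low lane, while
    // _mm_setr_pd takes them in memory (low-to-high) order, which is why
    // the expected values throughout these tests are written with setr.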
    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_set_sd() {
        let r = _mm_set_sd(-1.0_f64);
        assert_eq_m128d(r, _mm_setr_pd(-1.0_f64, 0_f64));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_set1_pd() {
        let r = _mm_set1_pd(-1.0_f64);
        assert_eq_m128d(r, _mm_setr_pd(-1.0_f64, -1.0_f64));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_set_pd1() {
        let r = _mm_set_pd1(-2.0_f64);
        assert_eq_m128d(r, _mm_setr_pd(-2.0_f64, -2.0_f64));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_set_pd() {
        let r = _mm_set_pd(1.0_f64, 5.0_f64);
        assert_eq_m128d(r, _mm_setr_pd(5.0_f64, 1.0_f64));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_setr_pd() {
        let r = _mm_setr_pd(1.0_f64, -5.0_f64);
        assert_eq_m128d(r, _mm_setr_pd(1.0_f64, -5.0_f64));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_setzero_pd() {
        let r = _mm_setzero_pd();
        assert_eq_m128d(r, _mm_setr_pd(0_f64, 0_f64));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_load1_pd() {
        let d = -5.0;
        let r = _mm_load1_pd(&d);
        assert_eq_m128d(r, _mm_setr_pd(d, d));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_load_pd1() {
        let d = -5.0;
        let r = _mm_load_pd1(&d);
        assert_eq_m128d(r, _mm_setr_pd(d, d));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpackhi_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(3.0, 4.0);
        let r = _mm_unpackhi_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(2.0, 4.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_unpacklo_pd() {
        let a = _mm_setr_pd(1.0, 2.0);
        let b = _mm_setr_pd(3.0, 4.0);
        let r = _mm_unpacklo_pd(a, b);
        assert_eq_m128d(r, _mm_setr_pd(1.0, 3.0));
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_shuffle_pd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(3., 4.);
        let expected = _mm_setr_pd(1., 3.);
        let r = _mm_shuffle_pd(a, b, 0);
        assert_eq_m128d(r, expected);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_move_sd() {
        let a = _mm_setr_pd(1., 2.);
        let b = _mm_setr_pd(3., 4.);
        let expected = _mm_setr_pd(3., 2.);
        let r = _mm_move_sd(a, b);
        assert_eq_m128d(r, expected);
    }

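    // The cast intrinsics only reinterpret the 128-bit value under another
    // type; no conversion is performed, so an all-zero pattern stays zero.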
    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_castpd_ps() {
        let a = _mm_set1_pd(0.);
        let expected = _mm_set1_ps(0.);
        let r = _mm_castpd_ps(a);
        assert_eq_m128(r, expected);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_castpd_si128() {
        let a = _mm_set1_pd(0.);
        let expected = _mm_set1_epi64x(0);
        let r = _mm_castpd_si128(a);
        assert_eq_m128i(r, expected);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_castps_pd() {
        let a = _mm_set1_ps(0.);
        let expected = _mm_set1_pd(0.);
        let r = _mm_castps_pd(a);
        assert_eq_m128d(r, expected);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_castps_si128() {
        let a = _mm_set1_ps(0.);
        let expected = _mm_set1_epi32(0);
        let r = _mm_castps_si128(a);
        assert_eq_m128i(r, expected);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_castsi128_pd() {
        let a = _mm_set1_epi64x(0);
        let expected = _mm_set1_pd(0.);
        let r = _mm_castsi128_pd(a);
        assert_eq_m128d(r, expected);
    }

    #[simd_test(enable = "sse2")]
    unsafe fn test_mm_castsi128_ps() {
        let a = _mm_set1_epi32(0);
        let expected = _mm_set1_ps(0.);
        let r = _mm_castsi128_ps(a);
        assert_eq_m128(r, expected);
    }

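    // The remaining tests cover the MMX-interop intrinsics, which operate
    // on __m64 values and therefore need both "sse2" and "mmx" enabled.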
    #[simd_test(enable = "sse2,mmx")]
    unsafe fn test_mm_add_si64() {
        let a = 1i64;
        let b = 2i64;
        let expected = 3i64;
        let r = _mm_add_si64(mem::transmute(a), mem::transmute(b));
        assert_eq!(mem::transmute::<__m64, i64>(r), expected);
    }

    #[simd_test(enable = "sse2,mmx")]
    unsafe fn test_mm_mul_su32() {
        let a = _mm_setr_pi32(1, 2);
        let b = _mm_setr_pi32(3, 4);
        let expected = 3u64;
        let r = _mm_mul_su32(a, b);
        assert_eq_m64(r, mem::transmute(expected));
    }

    #[simd_test(enable = "sse2,mmx")]
    unsafe fn test_mm_sub_si64() {
        let a = 1i64;
        let b = 2i64;
        let expected = -1i64;
        let r = _mm_sub_si64(mem::transmute(a), mem::transmute(b));
        assert_eq!(mem::transmute::<__m64, i64>(r), expected);
    }

    #[simd_test(enable = "sse2,mmx")]
    unsafe fn test_mm_cvtpi32_pd() {
        let a = _mm_setr_pi32(1, 2);
        let expected = _mm_setr_pd(1., 2.);
        let r = _mm_cvtpi32_pd(a);
        assert_eq_m128d(r, expected);
    }

    #[simd_test(enable = "sse2,mmx")]
    unsafe fn test_mm_set_epi64() {
        let r = _mm_set_epi64(mem::transmute(1i64), mem::transmute(2i64));
        assert_eq_m128i(r, _mm_setr_epi64x(2, 1));
    }

    #[simd_test(enable = "sse2,mmx")]
    unsafe fn test_mm_set1_epi64() {
        let r = _mm_set1_epi64(mem::transmute(1i64));
        assert_eq_m128i(r, _mm_setr_epi64x(1, 1));
    }

    #[simd_test(enable = "sse2,mmx")]
    unsafe fn test_mm_setr_epi64() {
        let r = _mm_setr_epi64(mem::transmute(1i64), mem::transmute(2i64));
        assert_eq_m128i(r, _mm_setr_epi64x(1, 2));
    }

    #[simd_test(enable = "sse2,mmx")]
    unsafe fn test_mm_movepi64_pi64() {
        let r = _mm_movepi64_pi64(_mm_setr_epi64x(5, 0));
        assert_eq_m64(r, _mm_setr_pi8(5, 0, 0, 0, 0, 0, 0, 0));
    }

    #[simd_test(enable = "sse2,mmx")]
    unsafe fn test_mm_movpi64_epi64() {
        let r = _mm_movpi64_epi64(_mm_setr_pi8(5, 0, 0, 0, 0, 0, 0, 0));
        assert_eq_m128i(r, _mm_setr_epi64x(5, 0));
    }

    #[simd_test(enable = "sse2,mmx")]
    unsafe fn test_mm_cvtpd_pi32() {
        let a = _mm_setr_pd(5., 0.);
        let r = _mm_cvtpd_pi32(a);
        assert_eq_m64(r, _mm_setr_pi32(5, 0));
    }

    #[simd_test(enable = "sse2,mmx")]
    unsafe fn test_mm_cvttpd_pi32() {
        use std::{f64, i32};

        let a = _mm_setr_pd(5., 0.);
        let r = _mm_cvttpd_pi32(a);
        assert_eq_m64(r, _mm_setr_pi32(5, 0));

        let a = _mm_setr_pd(f64::NEG_INFINITY, f64::NAN);
        let r = _mm_cvttpd_pi32(a);
        assert_eq_m64(r, _mm_setr_pi32(i32::MIN, i32::MIN));
    }
}