]>
Commit | Line | Data |
---|---|---|
0531ce1d XL |
1 | //! Streaming SIMD Extensions 4.1 (SSE4.1) |
2 | ||
532ac7d7 XL |
3 | use crate::{ |
4 | core_arch::{simd::*, simd_llvm::*, x86::*}, | |
5 | mem::transmute, | |
6 | }; | |
0531ce1d XL |
7 | |
8 | #[cfg(test)] | |
416331ca | 9 | use stdarch_test::assert_instr; |
0531ce1d XL |
10 | |
11 | // SSE4 rounding constans | |
12 | /// round to nearest | |
83c7162d | 13 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
14 | pub const _MM_FROUND_TO_NEAREST_INT: i32 = 0x00; |
15 | /// round down | |
83c7162d | 16 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
17 | pub const _MM_FROUND_TO_NEG_INF: i32 = 0x01; |
18 | /// round up | |
83c7162d | 19 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
20 | pub const _MM_FROUND_TO_POS_INF: i32 = 0x02; |
21 | /// truncate | |
83c7162d | 22 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
23 | pub const _MM_FROUND_TO_ZERO: i32 = 0x03; |
24 | /// use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE` | |
83c7162d | 25 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
26 | pub const _MM_FROUND_CUR_DIRECTION: i32 = 0x04; |
27 | /// do not suppress exceptions | |
83c7162d | 28 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
29 | pub const _MM_FROUND_RAISE_EXC: i32 = 0x00; |
30 | /// suppress exceptions | |
83c7162d | 31 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
32 | pub const _MM_FROUND_NO_EXC: i32 = 0x08; |
33 | /// round to nearest and do not suppress exceptions | |
83c7162d | 34 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
35 | pub const _MM_FROUND_NINT: i32 = 0x00; |
36 | /// round down and do not suppress exceptions | |
83c7162d | 37 | #[stable(feature = "simd_x86", since = "1.27.0")] |
74b04a01 | 38 | pub const _MM_FROUND_FLOOR: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF; |
0531ce1d | 39 | /// round up and do not suppress exceptions |
83c7162d | 40 | #[stable(feature = "simd_x86", since = "1.27.0")] |
74b04a01 | 41 | pub const _MM_FROUND_CEIL: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF; |
0531ce1d | 42 | /// truncate and do not suppress exceptions |
83c7162d | 43 | #[stable(feature = "simd_x86", since = "1.27.0")] |
74b04a01 | 44 | pub const _MM_FROUND_TRUNC: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO; |
0531ce1d XL |
45 | /// use MXCSR.RC and do not suppress exceptions; see |
46 | /// `vendor::_MM_SET_ROUNDING_MODE` | |
83c7162d | 47 | #[stable(feature = "simd_x86", since = "1.27.0")] |
74b04a01 | 48 | pub const _MM_FROUND_RINT: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION; |
0531ce1d | 49 | /// use MXCSR.RC and suppress exceptions; see `vendor::_MM_SET_ROUNDING_MODE` |
83c7162d | 50 | #[stable(feature = "simd_x86", since = "1.27.0")] |
74b04a01 | 51 | pub const _MM_FROUND_NEARBYINT: i32 = _MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION; |
0531ce1d XL |
52 | |
53 | /// Blend packed 8-bit integers from `a` and `b` using `mask` | |
54 | /// | |
55 | /// The high bit of each corresponding mask byte determines the selection. | |
56 | /// If the high bit is set the element of `a` is selected. The element | |
57 | /// of `b` is selected otherwise. | |
83c7162d XL |
58 | /// |
59 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_epi8) | |
0531ce1d XL |
60 | #[inline] |
61 | #[target_feature(enable = "sse4.1")] | |
62 | #[cfg_attr(test, assert_instr(pblendvb))] | |
83c7162d | 63 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0731742a | 64 | pub unsafe fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i { |
532ac7d7 | 65 | transmute(pblendvb(a.as_i8x16(), b.as_i8x16(), mask.as_i8x16())) |
0531ce1d XL |
66 | } |
67 | ||
68 | /// Blend packed 16-bit integers from `a` and `b` using the mask `imm8`. | |
69 | /// | |
70 | /// The mask bits determine the selection. A clear bit selects the | |
71 | /// corresponding element of `a`, and a set bit the corresponding | |
72 | /// element of `b`. | |
83c7162d XL |
73 | /// |
74 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_epi16) | |
0531ce1d XL |
75 | #[inline] |
76 | #[target_feature(enable = "sse4.1")] | |
8faf50e0 XL |
77 | // Note: LLVM7 prefers the single-precision floating-point domain when possible |
78 | // see https://bugs.llvm.org/show_bug.cgi?id=38195 | |
79 | // #[cfg_attr(test, assert_instr(pblendw, imm8 = 0xF0))] | |
80 | #[cfg_attr(test, assert_instr(blendps, imm8 = 0xF0))] | |
0531ce1d | 81 | #[rustc_args_required_const(2)] |
83c7162d | 82 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
83 | pub unsafe fn _mm_blend_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i { |
84 | let a = a.as_i16x8(); | |
85 | let b = b.as_i16x8(); | |
86 | macro_rules! call { | |
83c7162d XL |
87 | ($imm8:expr) => { |
88 | pblendw(a, b, $imm8) | |
89 | }; | |
0531ce1d | 90 | } |
532ac7d7 | 91 | transmute(constify_imm8!(imm8, call)) |
0531ce1d XL |
92 | } |
93 | ||
94 | /// Blend packed double-precision (64-bit) floating-point elements from `a` | |
95 | /// and `b` using `mask` | |
83c7162d XL |
96 | /// |
97 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_pd) | |
0531ce1d XL |
98 | #[inline] |
99 | #[target_feature(enable = "sse4.1")] | |
100 | #[cfg_attr(test, assert_instr(blendvpd))] | |
83c7162d | 101 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
102 | pub unsafe fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d { |
103 | blendvpd(a, b, mask) | |
104 | } | |
105 | ||
106 | /// Blend packed single-precision (32-bit) floating-point elements from `a` | |
107 | /// and `b` using `mask` | |
83c7162d XL |
108 | /// |
109 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps) | |
0531ce1d XL |
110 | #[inline] |
111 | #[target_feature(enable = "sse4.1")] | |
112 | #[cfg_attr(test, assert_instr(blendvps))] | |
83c7162d | 113 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
114 | pub unsafe fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 { |
115 | blendvps(a, b, mask) | |
116 | } | |
117 | ||
118 | /// Blend packed double-precision (64-bit) floating-point elements from `a` | |
119 | /// and `b` using control mask `imm2` | |
83c7162d XL |
120 | /// |
121 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_pd) | |
0531ce1d XL |
122 | #[inline] |
123 | #[target_feature(enable = "sse4.1")] | |
8faf50e0 XL |
124 | // Note: LLVM7 prefers the single-precision floating-point domain when possible |
125 | // see https://bugs.llvm.org/show_bug.cgi?id=38195 | |
126 | // #[cfg_attr(test, assert_instr(blendpd, imm2 = 0b10))] | |
127 | #[cfg_attr(test, assert_instr(blendps, imm2 = 0b10))] | |
0531ce1d | 128 | #[rustc_args_required_const(2)] |
83c7162d | 129 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
130 | pub unsafe fn _mm_blend_pd(a: __m128d, b: __m128d, imm2: i32) -> __m128d { |
131 | macro_rules! call { | |
83c7162d XL |
132 | ($imm2:expr) => { |
133 | blendpd(a, b, $imm2) | |
134 | }; | |
0531ce1d XL |
135 | } |
136 | constify_imm2!(imm2, call) | |
137 | } | |
138 | ||
139 | /// Blend packed single-precision (32-bit) floating-point elements from `a` | |
140 | /// and `b` using mask `imm4` | |
83c7162d XL |
141 | /// |
142 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_ps) | |
0531ce1d XL |
143 | #[inline] |
144 | #[target_feature(enable = "sse4.1")] | |
145 | #[cfg_attr(test, assert_instr(blendps, imm4 = 0b0101))] | |
146 | #[rustc_args_required_const(2)] | |
83c7162d | 147 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
148 | pub unsafe fn _mm_blend_ps(a: __m128, b: __m128, imm4: i32) -> __m128 { |
149 | macro_rules! call { | |
83c7162d XL |
150 | ($imm4:expr) => { |
151 | blendps(a, b, $imm4) | |
152 | }; | |
0531ce1d XL |
153 | } |
154 | constify_imm4!(imm4, call) | |
155 | } | |
156 | ||
532ac7d7 | 157 | /// Extracts a single-precision (32-bit) floating-point element from `a`, |
0531ce1d | 158 | /// selected with `imm8` |
83c7162d XL |
159 | /// |
160 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_ps) | |
0531ce1d XL |
161 | #[inline] |
162 | #[target_feature(enable = "sse4.1")] | |
0731742a XL |
163 | #[cfg_attr( |
164 | all(test, not(target_os = "windows")), | |
165 | assert_instr(extractps, imm8 = 0) | |
166 | )] | |
0531ce1d | 167 | #[rustc_args_required_const(1)] |
83c7162d | 168 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 169 | pub unsafe fn _mm_extract_ps(a: __m128, imm8: i32) -> i32 { |
3dfed10e XL |
170 | macro_rules! call { |
171 | ($imm2:expr) => { | |
172 | transmute(simd_extract::<_, f32>(a, $imm2)) | |
173 | }; | |
174 | } | |
175 | constify_imm2!(imm8, call) | |
0531ce1d XL |
176 | } |
177 | ||
532ac7d7 | 178 | /// Extracts an 8-bit integer from `a`, selected with `imm8`. Returns a 32-bit |
0531ce1d XL |
179 | /// integer containing the zero-extended integer data. |
180 | /// | |
fc512014 | 181 | /// See [LLVM commit D20468](https://reviews.llvm.org/D20468). |
83c7162d XL |
182 | /// |
183 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi8) | |
0531ce1d XL |
184 | #[inline] |
185 | #[target_feature(enable = "sse4.1")] | |
186 | #[cfg_attr(test, assert_instr(pextrb, imm8 = 0))] | |
187 | #[rustc_args_required_const(1)] | |
83c7162d | 188 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 189 | pub unsafe fn _mm_extract_epi8(a: __m128i, imm8: i32) -> i32 { |
3dfed10e XL |
190 | let a = a.as_u8x16(); |
191 | macro_rules! call { | |
192 | ($imm4:expr) => { | |
193 | simd_extract::<_, u8>(a, $imm4) as i32 | |
194 | }; | |
195 | } | |
196 | constify_imm4!(imm8, call) | |
0531ce1d XL |
197 | } |
198 | ||
532ac7d7 | 199 | /// Extracts an 32-bit integer from `a` selected with `imm8` |
83c7162d XL |
200 | /// |
201 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi32) | |
0531ce1d XL |
202 | #[inline] |
203 | #[target_feature(enable = "sse4.1")] | |
0731742a XL |
204 | #[cfg_attr( |
205 | all(test, not(target_os = "windows")), | |
206 | assert_instr(extractps, imm8 = 1) | |
207 | )] | |
0531ce1d | 208 | #[rustc_args_required_const(1)] |
83c7162d | 209 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 210 | pub unsafe fn _mm_extract_epi32(a: __m128i, imm8: i32) -> i32 { |
3dfed10e XL |
211 | let a = a.as_i32x4(); |
212 | macro_rules! call { | |
213 | ($imm2:expr) => { | |
214 | simd_extract::<_, i32>(a, $imm2) | |
215 | }; | |
216 | } | |
217 | constify_imm2!(imm8, call) | |
0531ce1d XL |
218 | } |
219 | ||
220 | /// Select a single value in `a` to store at some position in `b`, | |
221 | /// Then zero elements according to `imm8`. | |
222 | /// | |
223 | /// `imm8` specifies which bits from operand `a` will be copied, which bits in | |
224 | /// the result they will be copied to, and which bits in the result will be | |
225 | /// cleared. The following assignments are made: | |
226 | /// | |
227 | /// * Bits `[7:6]` specify the bits to copy from operand `a`: | |
228 | /// - `00`: Selects bits `[31:0]` from operand `a`. | |
229 | /// - `01`: Selects bits `[63:32]` from operand `a`. | |
230 | /// - `10`: Selects bits `[95:64]` from operand `a`. | |
231 | /// - `11`: Selects bits `[127:96]` from operand `a`. | |
232 | /// | |
233 | /// * Bits `[5:4]` specify the bits in the result to which the selected bits | |
234 | /// from operand `a` are copied: | |
235 | /// - `00`: Copies the selected bits from `a` to result bits `[31:0]`. | |
236 | /// - `01`: Copies the selected bits from `a` to result bits `[63:32]`. | |
237 | /// - `10`: Copies the selected bits from `a` to result bits `[95:64]`. | |
238 | /// - `11`: Copies the selected bits from `a` to result bits `[127:96]`. | |
239 | /// | |
240 | /// * Bits `[3:0]`: If any of these bits are set, the corresponding result | |
241 | /// element is cleared. | |
83c7162d XL |
242 | /// |
243 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_ps) | |
0531ce1d XL |
244 | #[inline] |
245 | #[target_feature(enable = "sse4.1")] | |
246 | #[cfg_attr(test, assert_instr(insertps, imm8 = 0b1010))] | |
247 | #[rustc_args_required_const(2)] | |
83c7162d | 248 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
249 | pub unsafe fn _mm_insert_ps(a: __m128, b: __m128, imm8: i32) -> __m128 { |
250 | macro_rules! call { | |
83c7162d XL |
251 | ($imm8:expr) => { |
252 | insertps(a, b, $imm8) | |
253 | }; | |
0531ce1d XL |
254 | } |
255 | constify_imm8!(imm8, call) | |
256 | } | |
257 | ||
532ac7d7 | 258 | /// Returns a copy of `a` with the 8-bit integer from `i` inserted at a |
0531ce1d | 259 | /// location specified by `imm8`. |
83c7162d XL |
260 | /// |
261 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_epi8) | |
0531ce1d XL |
262 | #[inline] |
263 | #[target_feature(enable = "sse4.1")] | |
264 | #[cfg_attr(test, assert_instr(pinsrb, imm8 = 0))] | |
265 | #[rustc_args_required_const(2)] | |
83c7162d | 266 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 267 | pub unsafe fn _mm_insert_epi8(a: __m128i, i: i32, imm8: i32) -> __m128i { |
3dfed10e XL |
268 | let a = a.as_i8x16(); |
269 | macro_rules! call { | |
270 | ($imm4:expr) => { | |
271 | transmute(simd_insert(a, $imm4, i as i8)) | |
272 | }; | |
273 | } | |
274 | constify_imm4!(imm8, call) | |
0531ce1d XL |
275 | } |
276 | ||
532ac7d7 | 277 | /// Returns a copy of `a` with the 32-bit integer from `i` inserted at a |
0531ce1d | 278 | /// location specified by `imm8`. |
83c7162d XL |
279 | /// |
280 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_epi32) | |
0531ce1d XL |
281 | #[inline] |
282 | #[target_feature(enable = "sse4.1")] | |
283 | #[cfg_attr(test, assert_instr(pinsrd, imm8 = 0))] | |
284 | #[rustc_args_required_const(2)] | |
83c7162d | 285 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 286 | pub unsafe fn _mm_insert_epi32(a: __m128i, i: i32, imm8: i32) -> __m128i { |
3dfed10e XL |
287 | let a = a.as_i32x4(); |
288 | macro_rules! call { | |
289 | ($imm2:expr) => { | |
290 | transmute(simd_insert(a, $imm2, i)) | |
291 | }; | |
292 | } | |
293 | constify_imm2!(imm8, call) | |
0531ce1d XL |
294 | } |
295 | ||
532ac7d7 | 296 | /// Compares packed 8-bit integers in `a` and `b` and returns packed maximum |
0531ce1d | 297 | /// values in dst. |
83c7162d XL |
298 | /// |
299 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8) | |
0531ce1d XL |
300 | #[inline] |
301 | #[target_feature(enable = "sse4.1")] | |
302 | #[cfg_attr(test, assert_instr(pmaxsb))] | |
83c7162d | 303 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 304 | pub unsafe fn _mm_max_epi8(a: __m128i, b: __m128i) -> __m128i { |
532ac7d7 | 305 | transmute(pmaxsb(a.as_i8x16(), b.as_i8x16())) |
0531ce1d XL |
306 | } |
307 | ||
532ac7d7 | 308 | /// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed |
0531ce1d | 309 | /// maximum. |
83c7162d XL |
310 | /// |
311 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16) | |
0531ce1d XL |
312 | #[inline] |
313 | #[target_feature(enable = "sse4.1")] | |
314 | #[cfg_attr(test, assert_instr(pmaxuw))] | |
83c7162d | 315 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 316 | pub unsafe fn _mm_max_epu16(a: __m128i, b: __m128i) -> __m128i { |
532ac7d7 | 317 | transmute(pmaxuw(a.as_u16x8(), b.as_u16x8())) |
0531ce1d XL |
318 | } |
319 | ||
532ac7d7 | 320 | /// Compares packed 32-bit integers in `a` and `b`, and returns packed maximum |
0531ce1d | 321 | /// values. |
83c7162d XL |
322 | /// |
323 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi32) | |
0531ce1d XL |
324 | #[inline] |
325 | #[target_feature(enable = "sse4.1")] | |
326 | #[cfg_attr(test, assert_instr(pmaxsd))] | |
83c7162d | 327 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 328 | pub unsafe fn _mm_max_epi32(a: __m128i, b: __m128i) -> __m128i { |
532ac7d7 | 329 | transmute(pmaxsd(a.as_i32x4(), b.as_i32x4())) |
0531ce1d XL |
330 | } |
331 | ||
532ac7d7 | 332 | /// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed |
0531ce1d | 333 | /// maximum values. |
83c7162d XL |
334 | /// |
335 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32) | |
0531ce1d XL |
336 | #[inline] |
337 | #[target_feature(enable = "sse4.1")] | |
338 | #[cfg_attr(test, assert_instr(pmaxud))] | |
83c7162d | 339 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 340 | pub unsafe fn _mm_max_epu32(a: __m128i, b: __m128i) -> __m128i { |
532ac7d7 | 341 | transmute(pmaxud(a.as_u32x4(), b.as_u32x4())) |
0531ce1d XL |
342 | } |
343 | ||
532ac7d7 | 344 | /// Compares packed 8-bit integers in `a` and `b` and returns packed minimum |
0531ce1d | 345 | /// values in dst. |
83c7162d XL |
346 | /// |
347 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8) | |
0531ce1d XL |
348 | #[inline] |
349 | #[target_feature(enable = "sse4.1")] | |
350 | #[cfg_attr(test, assert_instr(pminsb))] | |
83c7162d | 351 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 352 | pub unsafe fn _mm_min_epi8(a: __m128i, b: __m128i) -> __m128i { |
532ac7d7 | 353 | transmute(pminsb(a.as_i8x16(), b.as_i8x16())) |
0531ce1d XL |
354 | } |
355 | ||
532ac7d7 | 356 | /// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed |
0531ce1d | 357 | /// minimum. |
83c7162d XL |
358 | /// |
359 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16) | |
0531ce1d XL |
360 | #[inline] |
361 | #[target_feature(enable = "sse4.1")] | |
362 | #[cfg_attr(test, assert_instr(pminuw))] | |
83c7162d | 363 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 364 | pub unsafe fn _mm_min_epu16(a: __m128i, b: __m128i) -> __m128i { |
532ac7d7 | 365 | transmute(pminuw(a.as_u16x8(), b.as_u16x8())) |
0531ce1d XL |
366 | } |
367 | ||
532ac7d7 | 368 | /// Compares packed 32-bit integers in `a` and `b`, and returns packed minimum |
0531ce1d | 369 | /// values. |
83c7162d XL |
370 | /// |
371 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi32) | |
0531ce1d XL |
372 | #[inline] |
373 | #[target_feature(enable = "sse4.1")] | |
374 | #[cfg_attr(test, assert_instr(pminsd))] | |
83c7162d | 375 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 376 | pub unsafe fn _mm_min_epi32(a: __m128i, b: __m128i) -> __m128i { |
532ac7d7 | 377 | transmute(pminsd(a.as_i32x4(), b.as_i32x4())) |
0531ce1d XL |
378 | } |
379 | ||
532ac7d7 | 380 | /// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed |
0531ce1d | 381 | /// minimum values. |
83c7162d XL |
382 | /// |
383 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu32) | |
0531ce1d XL |
384 | #[inline] |
385 | #[target_feature(enable = "sse4.1")] | |
386 | #[cfg_attr(test, assert_instr(pminud))] | |
83c7162d | 387 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 388 | pub unsafe fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i { |
532ac7d7 | 389 | transmute(pminud(a.as_u32x4(), b.as_u32x4())) |
0531ce1d XL |
390 | } |
391 | ||
532ac7d7 | 392 | /// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers |
0531ce1d | 393 | /// using unsigned saturation |
83c7162d XL |
394 | /// |
395 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_packus_epi32) | |
0531ce1d XL |
396 | #[inline] |
397 | #[target_feature(enable = "sse4.1")] | |
398 | #[cfg_attr(test, assert_instr(packusdw))] | |
83c7162d | 399 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 400 | pub unsafe fn _mm_packus_epi32(a: __m128i, b: __m128i) -> __m128i { |
532ac7d7 | 401 | transmute(packusdw(a.as_i32x4(), b.as_i32x4())) |
0531ce1d XL |
402 | } |
403 | ||
532ac7d7 | 404 | /// Compares packed 64-bit integers in `a` and `b` for equality |
83c7162d XL |
405 | /// |
406 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_epi64) | |
0531ce1d XL |
407 | #[inline] |
408 | #[target_feature(enable = "sse4.1")] | |
409 | #[cfg_attr(test, assert_instr(pcmpeqq))] | |
83c7162d | 410 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d | 411 | pub unsafe fn _mm_cmpeq_epi64(a: __m128i, b: __m128i) -> __m128i { |
532ac7d7 | 412 | transmute(simd_eq::<_, i64x2>(a.as_i64x2(), b.as_i64x2())) |
0531ce1d XL |
413 | } |
414 | ||
415 | /// Sign extend packed 8-bit integers in `a` to packed 16-bit integers | |
83c7162d XL |
416 | /// |
417 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi8_epi16) | |
0531ce1d XL |
418 | #[inline] |
419 | #[target_feature(enable = "sse4.1")] | |
420 | #[cfg_attr(test, assert_instr(pmovsxbw))] | |
83c7162d | 421 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
422 | pub unsafe fn _mm_cvtepi8_epi16(a: __m128i) -> __m128i { |
423 | let a = a.as_i8x16(); | |
424 | let a = simd_shuffle8::<_, i8x8>(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); | |
532ac7d7 | 425 | transmute(simd_cast::<_, i16x8>(a)) |
0531ce1d XL |
426 | } |
427 | ||
428 | /// Sign extend packed 8-bit integers in `a` to packed 32-bit integers | |
83c7162d XL |
429 | /// |
430 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi8_epi32) | |
0531ce1d XL |
431 | #[inline] |
432 | #[target_feature(enable = "sse4.1")] | |
433 | #[cfg_attr(test, assert_instr(pmovsxbd))] | |
83c7162d | 434 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
435 | pub unsafe fn _mm_cvtepi8_epi32(a: __m128i) -> __m128i { |
436 | let a = a.as_i8x16(); | |
437 | let a = simd_shuffle4::<_, i8x4>(a, a, [0, 1, 2, 3]); | |
532ac7d7 | 438 | transmute(simd_cast::<_, i32x4>(a)) |
0531ce1d XL |
439 | } |
440 | ||
441 | /// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed | |
442 | /// 64-bit integers | |
83c7162d XL |
443 | /// |
444 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi8_epi64) | |
0531ce1d XL |
445 | #[inline] |
446 | #[target_feature(enable = "sse4.1")] | |
447 | #[cfg_attr(test, assert_instr(pmovsxbq))] | |
83c7162d | 448 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
449 | pub unsafe fn _mm_cvtepi8_epi64(a: __m128i) -> __m128i { |
450 | let a = a.as_i8x16(); | |
451 | let a = simd_shuffle2::<_, i8x2>(a, a, [0, 1]); | |
532ac7d7 | 452 | transmute(simd_cast::<_, i64x2>(a)) |
0531ce1d XL |
453 | } |
454 | ||
455 | /// Sign extend packed 16-bit integers in `a` to packed 32-bit integers | |
83c7162d XL |
456 | /// |
457 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi16_epi32) | |
0531ce1d XL |
458 | #[inline] |
459 | #[target_feature(enable = "sse4.1")] | |
460 | #[cfg_attr(test, assert_instr(pmovsxwd))] | |
83c7162d | 461 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
462 | pub unsafe fn _mm_cvtepi16_epi32(a: __m128i) -> __m128i { |
463 | let a = a.as_i16x8(); | |
464 | let a = simd_shuffle4::<_, i16x4>(a, a, [0, 1, 2, 3]); | |
532ac7d7 | 465 | transmute(simd_cast::<_, i32x4>(a)) |
0531ce1d XL |
466 | } |
467 | ||
468 | /// Sign extend packed 16-bit integers in `a` to packed 64-bit integers | |
83c7162d XL |
469 | /// |
470 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi16_epi64) | |
0531ce1d XL |
471 | #[inline] |
472 | #[target_feature(enable = "sse4.1")] | |
473 | #[cfg_attr(test, assert_instr(pmovsxwq))] | |
83c7162d | 474 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
475 | pub unsafe fn _mm_cvtepi16_epi64(a: __m128i) -> __m128i { |
476 | let a = a.as_i16x8(); | |
477 | let a = simd_shuffle2::<_, i16x2>(a, a, [0, 1]); | |
532ac7d7 | 478 | transmute(simd_cast::<_, i64x2>(a)) |
0531ce1d XL |
479 | } |
480 | ||
481 | /// Sign extend packed 32-bit integers in `a` to packed 64-bit integers | |
83c7162d XL |
482 | /// |
483 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_epi64) | |
0531ce1d XL |
484 | #[inline] |
485 | #[target_feature(enable = "sse4.1")] | |
486 | #[cfg_attr(test, assert_instr(pmovsxdq))] | |
83c7162d | 487 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
488 | pub unsafe fn _mm_cvtepi32_epi64(a: __m128i) -> __m128i { |
489 | let a = a.as_i32x4(); | |
490 | let a = simd_shuffle2::<_, i32x2>(a, a, [0, 1]); | |
532ac7d7 | 491 | transmute(simd_cast::<_, i64x2>(a)) |
0531ce1d XL |
492 | } |
493 | ||
532ac7d7 | 494 | /// Zeroes extend packed unsigned 8-bit integers in `a` to packed 16-bit integers |
83c7162d XL |
495 | /// |
496 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi16) | |
0531ce1d XL |
497 | #[inline] |
498 | #[target_feature(enable = "sse4.1")] | |
499 | #[cfg_attr(test, assert_instr(pmovzxbw))] | |
83c7162d | 500 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
501 | pub unsafe fn _mm_cvtepu8_epi16(a: __m128i) -> __m128i { |
502 | let a = a.as_u8x16(); | |
503 | let a = simd_shuffle8::<_, u8x8>(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); | |
532ac7d7 | 504 | transmute(simd_cast::<_, i16x8>(a)) |
0531ce1d XL |
505 | } |
506 | ||
532ac7d7 | 507 | /// Zeroes extend packed unsigned 8-bit integers in `a` to packed 32-bit integers |
83c7162d XL |
508 | /// |
509 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi32) | |
0531ce1d XL |
510 | #[inline] |
511 | #[target_feature(enable = "sse4.1")] | |
512 | #[cfg_attr(test, assert_instr(pmovzxbd))] | |
83c7162d | 513 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
514 | pub unsafe fn _mm_cvtepu8_epi32(a: __m128i) -> __m128i { |
515 | let a = a.as_u8x16(); | |
516 | let a = simd_shuffle4::<_, u8x4>(a, a, [0, 1, 2, 3]); | |
532ac7d7 | 517 | transmute(simd_cast::<_, i32x4>(a)) |
0531ce1d XL |
518 | } |
519 | ||
532ac7d7 | 520 | /// Zeroes extend packed unsigned 8-bit integers in `a` to packed 64-bit integers |
83c7162d XL |
521 | /// |
522 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi64) | |
0531ce1d XL |
523 | #[inline] |
524 | #[target_feature(enable = "sse4.1")] | |
525 | #[cfg_attr(test, assert_instr(pmovzxbq))] | |
83c7162d | 526 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
527 | pub unsafe fn _mm_cvtepu8_epi64(a: __m128i) -> __m128i { |
528 | let a = a.as_u8x16(); | |
529 | let a = simd_shuffle2::<_, u8x2>(a, a, [0, 1]); | |
532ac7d7 | 530 | transmute(simd_cast::<_, i64x2>(a)) |
0531ce1d XL |
531 | } |
532 | ||
532ac7d7 | 533 | /// Zeroes extend packed unsigned 16-bit integers in `a` |
0531ce1d | 534 | /// to packed 32-bit integers |
83c7162d XL |
535 | /// |
536 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu16_epi32) | |
0531ce1d XL |
537 | #[inline] |
538 | #[target_feature(enable = "sse4.1")] | |
539 | #[cfg_attr(test, assert_instr(pmovzxwd))] | |
83c7162d | 540 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
541 | pub unsafe fn _mm_cvtepu16_epi32(a: __m128i) -> __m128i { |
542 | let a = a.as_u16x8(); | |
543 | let a = simd_shuffle4::<_, u16x4>(a, a, [0, 1, 2, 3]); | |
532ac7d7 | 544 | transmute(simd_cast::<_, i32x4>(a)) |
0531ce1d XL |
545 | } |
546 | ||
532ac7d7 | 547 | /// Zeroes extend packed unsigned 16-bit integers in `a` |
0531ce1d | 548 | /// to packed 64-bit integers |
83c7162d XL |
549 | /// |
550 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu16_epi64) | |
0531ce1d XL |
551 | #[inline] |
552 | #[target_feature(enable = "sse4.1")] | |
553 | #[cfg_attr(test, assert_instr(pmovzxwq))] | |
83c7162d | 554 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
555 | pub unsafe fn _mm_cvtepu16_epi64(a: __m128i) -> __m128i { |
556 | let a = a.as_u16x8(); | |
557 | let a = simd_shuffle2::<_, u16x2>(a, a, [0, 1]); | |
532ac7d7 | 558 | transmute(simd_cast::<_, i64x2>(a)) |
0531ce1d XL |
559 | } |
560 | ||
532ac7d7 | 561 | /// Zeroes extend packed unsigned 32-bit integers in `a` |
0531ce1d | 562 | /// to packed 64-bit integers |
83c7162d XL |
563 | /// |
564 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu32_epi64) | |
0531ce1d XL |
565 | #[inline] |
566 | #[target_feature(enable = "sse4.1")] | |
567 | #[cfg_attr(test, assert_instr(pmovzxdq))] | |
83c7162d | 568 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
569 | pub unsafe fn _mm_cvtepu32_epi64(a: __m128i) -> __m128i { |
570 | let a = a.as_u32x4(); | |
571 | let a = simd_shuffle2::<_, u32x2>(a, a, [0, 1]); | |
532ac7d7 | 572 | transmute(simd_cast::<_, i64x2>(a)) |
0531ce1d XL |
573 | } |
574 | ||
575 | /// Returns the dot product of two __m128d vectors. | |
576 | /// | |
577 | /// `imm8[1:0]` is the broadcast mask, and `imm8[5:4]` is the condition mask. | |
578 | /// If a condition mask bit is zero, the corresponding multiplication is | |
579 | /// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of | |
580 | /// the dot product will be stored in the return value component. Otherwise if | |
581 | /// the broadcast mask bit is zero then the return component will be zero. | |
83c7162d XL |
582 | /// |
583 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_pd) | |
0531ce1d XL |
584 | #[inline] |
585 | #[target_feature(enable = "sse4.1")] | |
586 | #[cfg_attr(test, assert_instr(dppd, imm8 = 0))] | |
587 | #[rustc_args_required_const(2)] | |
83c7162d | 588 | #[stable(feature = "simd_x86", since = "1.27.0")] |
0531ce1d XL |
589 | pub unsafe fn _mm_dp_pd(a: __m128d, b: __m128d, imm8: i32) -> __m128d { |
590 | macro_rules! call { | |
83c7162d XL |
591 | ($imm8:expr) => { |
592 | dppd(a, b, $imm8) | |
593 | }; | |
0531ce1d XL |
594 | } |
595 | constify_imm8!(imm8, call) | |
596 | } | |
597 | ||
/// Returns the dot product of two __m128 vectors.
///
/// `imm8[3:0]` is the broadcast mask, and `imm8[7:4]` is the condition mask.
/// If a condition mask bit is zero, the corresponding multiplication is
/// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of
/// the dot product will be stored in the return value component. Otherwise if
/// the broadcast mask bit is zero then the return component will be zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(dpps, imm8 = 0))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_dp_ps(a: __m128, b: __m128, imm8: i32) -> __m128 {
    // The `dpps` instruction takes an 8-bit immediate; `constify_imm8!`
    // dispatches to a literal constant for each possible `imm8` value.
    macro_rules! call {
        ($imm8:expr) => {
            dpps(a, b, $imm8)
        };
    }
    constify_imm8!(imm8, call)
}
620 | ||
/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// down to an integer value, and stores the results as packed double-precision
/// floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_pd(a: __m128d) -> __m128d {
    // Generic SIMD floor; LLVM lowers this to `roundpd` under SSE4.1
    // (checked by the `assert_instr` above).
    simd_floor(a)
}
633 | ||
/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// down to an integer value, and stores the results as packed single-precision
/// floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_ps(a: __m128) -> __m128 {
    // Generic SIMD floor; LLVM lowers this to `roundps` under SSE4.1.
    simd_floor(a)
}
646 | ||
/// Round the lower double-precision (64-bit) floating-point element in `b`
/// down to an integer value, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element of the intrinsic
/// result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_sd(a: __m128d, b: __m128d) -> __m128d {
    // `_MM_FROUND_FLOOR` selects round-toward-negative-infinity.
    roundsd(a, b, _MM_FROUND_FLOOR)
}
661 | ||
/// Round the lower single-precision (32-bit) floating-point element in `b`
/// down to an integer value, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_ss(a: __m128, b: __m128) -> __m128 {
    // `_MM_FROUND_FLOOR` selects round-toward-negative-infinity.
    roundss(a, b, _MM_FROUND_FLOOR)
}
676 | ||
/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// up to an integer value, and stores the results as packed double-precision
/// floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_pd(a: __m128d) -> __m128d {
    // Generic SIMD ceil; LLVM lowers this to `roundpd` under SSE4.1.
    simd_ceil(a)
}
689 | ||
/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// up to an integer value, and stores the results as packed single-precision
/// floating-point elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_ps(a: __m128) -> __m128 {
    // Generic SIMD ceil; LLVM lowers this to `roundps` under SSE4.1.
    simd_ceil(a)
}
702 | ||
/// Round the lower double-precision (64-bit) floating-point element in `b`
/// up to an integer value, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element
/// of the intrinsic result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_sd(a: __m128d, b: __m128d) -> __m128d {
    // `_MM_FROUND_CEIL` selects round-toward-positive-infinity.
    roundsd(a, b, _MM_FROUND_CEIL)
}
717 | ||
/// Round the lower single-precision (32-bit) floating-point element in `b`
/// up to an integer value, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_ss(a: __m128, b: __m128) -> __m128 {
    // `_MM_FROUND_CEIL` selects round-toward-positive-infinity.
    roundss(a, b, _MM_FROUND_CEIL)
}
732 | ||
/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// using the `rounding` parameter, and stores the results as packed
/// double-precision floating-point elements.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd, rounding = 0))]
#[rustc_args_required_const(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_pd(a: __m128d, rounding: i32) -> __m128d {
    // `roundpd` takes a 4-bit immediate; `constify_imm4!` turns the
    // `rounding` argument into a compile-time constant for each value.
    macro_rules! call {
        ($imm4:expr) => {
            roundpd(a, $imm4)
        };
    }
    constify_imm4!(rounding, call)
}
777 | ||
/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// using the `rounding` parameter, and stores the results as packed
/// single-precision floating-point elements.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps, rounding = 0))]
#[rustc_args_required_const(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_ps(a: __m128, rounding: i32) -> __m128 {
    // `roundps` takes a 4-bit immediate; `constify_imm4!` turns the
    // `rounding` argument into a compile-time constant for each value.
    macro_rules! call {
        ($imm4:expr) => {
            roundps(a, $imm4)
        };
    }
    constify_imm4!(rounding, call)
}
822 | ||
/// Round the lower double-precision (64-bit) floating-point element in `b`
/// using the `rounding` parameter, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element of the intrinsic
/// result.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd, rounding = 0))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d {
    // `roundsd` takes a 4-bit immediate; `constify_imm4!` turns the
    // `rounding` argument into a compile-time constant for each value.
    macro_rules! call {
        ($imm4:expr) => {
            roundsd(a, b, $imm4)
        };
    }
    constify_imm4!(rounding, call)
}
869 | ||
/// Round the lower single-precision (32-bit) floating-point element in `b`
/// using the `rounding` parameter, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// // round to nearest, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
/// // round down, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;
/// // round up, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
/// // truncate, and suppress exceptions:
/// # let _x =
/// _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
/// // use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`:
/// # let _x =
/// _MM_FROUND_CUR_DIRECTION;
/// # }
/// ```
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss, rounding = 0))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 {
    // `roundss` takes a 4-bit immediate; `constify_imm4!` turns the
    // `rounding` argument into a compile-time constant for each value.
    macro_rules! call {
        ($imm4:expr) => {
            roundss(a, b, $imm4)
        };
    }
    constify_imm4!(rounding, call)
}
916 | ||
/// Finds the minimum unsigned 16-bit element in the 128-bit __m128i vector,
/// returning a vector containing its value in its first position, and its
/// index
/// in its second position; all other elements are set to zero.
///
/// This intrinsic corresponds to the `VPHMINPOSUW` / `PHMINPOSUW`
/// instruction.
///
/// Arguments:
///
/// * `a` - A 128-bit vector of type `__m128i`.
///
/// Returns:
///
/// A 128-bit value where:
///
/// * bits `[15:0]` - contain the minimum value found in parameter `a`,
/// * bits `[18:16]` - contain the index of the minimum value
/// * remaining bits are set to `0`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(phminposuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_minpos_epu16(a: __m128i) -> __m128i {
    // Reinterpret the opaque `__m128i` as eight u16 lanes for the intrinsic,
    // then transmute the typed result back.
    transmute(phminposuw(a.as_u16x8()))
}
945 | ||
532ac7d7 XL |
/// Multiplies the low 32-bit integers from each packed 64-bit
/// element in `a` and `b`, and returns the signed 64-bit result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmuldq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mul_epi32(a: __m128i, b: __m128i) -> __m128i {
    // `pmuldq` reads lanes 0 and 2 of each i32x4 operand (the low halves of
    // the two 64-bit elements) and produces two signed 64-bit products.
    transmute(pmuldq(a.as_i32x4(), b.as_i32x4()))
}
957 | ||
/// Multiplies the packed 32-bit integers in `a` and `b`, producing intermediate
/// 64-bit integers, and returns the lowest 32-bit, whatever they might be,
/// reinterpreted as a signed integer. While `pmulld __m128i::splat(2),
/// __m128i::splat(2)` returns the obvious `__m128i::splat(4)`, due to wrapping
/// arithmetic `pmulld __m128i::splat(i32::MAX), __m128i::splat(2)` would
/// return a negative number.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mullo_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmulld))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mullo_epi32(a: __m128i, b: __m128i) -> __m128i {
    // Generic lane-wise wrapping multiply; lowered to `pmulld` under SSE4.1.
    transmute(simd_mul(a.as_i32x4(), b.as_i32x4()))
}
973 | ||
/// Subtracts 8-bit unsigned integer values and computes the absolute
/// values of the differences to the corresponding bits in the destination.
/// Then sums of the absolute differences are returned according to the bit
/// fields in the immediate operand.
///
/// The following algorithm is performed:
///
/// ```ignore
/// i = imm8[2] * 4
/// j = imm8[1:0] * 4
/// for k := 0 to 7
///     d0 = abs(a[i + k + 0] - b[j + 0])
///     d1 = abs(a[i + k + 1] - b[j + 1])
///     d2 = abs(a[i + k + 2] - b[j + 2])
///     d3 = abs(a[i + k + 3] - b[j + 3])
///     r[k] = d0 + d1 + d2 + d3
/// ```
///
/// Arguments:
///
/// * `a` - A 128-bit vector of type `__m128i`.
/// * `b` - A 128-bit vector of type `__m128i`.
/// * `imm8` - An 8-bit immediate operand specifying how the absolute
///   differences are to be calculated
///     * Bit `[2]` specify the offset for operand `a`
///     * Bits `[1:0]` specify the offset for operand `b`
///
/// Returns:
///
/// * A `__m128i` vector containing the sums of the sets of absolute
///   differences between both operands.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mpsadbw_epu8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(mpsadbw, imm8 = 0))]
#[rustc_args_required_const(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mpsadbw_epu8(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
    let a = a.as_u8x16();
    let b = b.as_u8x16();
    // Only the low 3 bits of the immediate are meaningful, so a
    // `constify_imm3!` (8-way match) suffices here.
    macro_rules! call {
        ($imm8:expr) => {
            mpsadbw(a, b, $imm8)
        };
    }
    transmute(constify_imm3!(imm8, call))
}
1022 | ||
/// Tests whether the specified bits in a 128-bit integer vector are all
/// zeros.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all zeros,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 {
    // `ptest` sets ZF when `a AND mask` is all-zero; this returns that flag.
    ptestz(a.as_i64x2(), mask.as_i64x2())
}
1045 | ||
/// Tests whether the specified bits in a 128-bit integer vector are all
/// ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_testc_si128(a: __m128i, mask: __m128i) -> i32 {
    // `ptest` sets CF when `(NOT a) AND mask` is all-zero; this returns CF.
    ptestc(a.as_i64x2(), mask.as_i64x2())
}
1068 | ||
/// Tests whether the specified bits in a 128-bit integer vector are
/// neither all zeros nor all ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are neither all zeros nor all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testnzc_si128)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_testnzc_si128(a: __m128i, mask: __m128i) -> i32 {
    // Returns 1 only when `ptest` leaves both ZF and CF clear.
    ptestnzc(a.as_i64x2(), mask.as_i64x2())
}
1091 | ||
/// Tests whether the specified bits in a 128-bit integer vector are all
/// zeros.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all zeros,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_test_all_zeros(a: __m128i, mask: __m128i) -> i32 {
    // Alias for `_mm_testz_si128`; Intel defines both names for this test.
    _mm_testz_si128(a, mask)
}
1114 | ||
/// Tests whether the specified bits in `a` 128-bit integer vector are all
/// ones.
///
/// Argument:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
///
/// Returns:
///
/// * `1` - if the bits specified in the operand are all set to 1,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pcmpeqd))]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_test_all_ones(a: __m128i) -> i32 {
    // `_mm_cmpeq_epi32(a, a)` yields an all-ones vector, so this checks `a`
    // against a full mask via the carry-flag form of `ptest`.
    _mm_testc_si128(a, _mm_cmpeq_epi32(a, a))
}
1136 | ||
/// Tests whether the specified bits in a 128-bit integer vector are
/// neither all zeros nor all ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are neither all zeros nor all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_mix_ones_zeros)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 {
    // Alias for `_mm_testnzc_si128`; Intel defines both names for this test.
    _mm_testnzc_si128(a, mask)
}
1159 | ||
// Raw LLVM intrinsic declarations backing the SSE4.1 wrappers above.
// Each `link_name` binds the Rust function to an LLVM builtin that lowers
// to the corresponding x86 instruction.
#[allow(improper_ctypes)]
extern "C" {
    #[link_name = "llvm.x86.sse41.pblendvb"]
    fn pblendvb(a: i8x16, b: i8x16, mask: i8x16) -> i8x16;
    #[link_name = "llvm.x86.sse41.blendvpd"]
    fn blendvpd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d;
    #[link_name = "llvm.x86.sse41.blendvps"]
    fn blendvps(a: __m128, b: __m128, mask: __m128) -> __m128;
    #[link_name = "llvm.x86.sse41.blendpd"]
    fn blendpd(a: __m128d, b: __m128d, imm2: u8) -> __m128d;
    #[link_name = "llvm.x86.sse41.blendps"]
    fn blendps(a: __m128, b: __m128, imm4: u8) -> __m128;
    #[link_name = "llvm.x86.sse41.pblendw"]
    fn pblendw(a: i16x8, b: i16x8, imm8: u8) -> i16x8;
    #[link_name = "llvm.x86.sse41.insertps"]
    fn insertps(a: __m128, b: __m128, imm8: u8) -> __m128;
    #[link_name = "llvm.x86.sse41.pmaxsb"]
    fn pmaxsb(a: i8x16, b: i8x16) -> i8x16;
    #[link_name = "llvm.x86.sse41.pmaxuw"]
    fn pmaxuw(a: u16x8, b: u16x8) -> u16x8;
    #[link_name = "llvm.x86.sse41.pmaxsd"]
    fn pmaxsd(a: i32x4, b: i32x4) -> i32x4;
    #[link_name = "llvm.x86.sse41.pmaxud"]
    fn pmaxud(a: u32x4, b: u32x4) -> u32x4;
    #[link_name = "llvm.x86.sse41.pminsb"]
    fn pminsb(a: i8x16, b: i8x16) -> i8x16;
    #[link_name = "llvm.x86.sse41.pminuw"]
    fn pminuw(a: u16x8, b: u16x8) -> u16x8;
    #[link_name = "llvm.x86.sse41.pminsd"]
    fn pminsd(a: i32x4, b: i32x4) -> i32x4;
    #[link_name = "llvm.x86.sse41.pminud"]
    fn pminud(a: u32x4, b: u32x4) -> u32x4;
    #[link_name = "llvm.x86.sse41.packusdw"]
    fn packusdw(a: i32x4, b: i32x4) -> u16x8;
    #[link_name = "llvm.x86.sse41.dppd"]
    fn dppd(a: __m128d, b: __m128d, imm8: u8) -> __m128d;
    #[link_name = "llvm.x86.sse41.dpps"]
    fn dpps(a: __m128, b: __m128, imm8: u8) -> __m128;
    #[link_name = "llvm.x86.sse41.round.pd"]
    fn roundpd(a: __m128d, rounding: i32) -> __m128d;
    #[link_name = "llvm.x86.sse41.round.ps"]
    fn roundps(a: __m128, rounding: i32) -> __m128;
    #[link_name = "llvm.x86.sse41.round.sd"]
    fn roundsd(a: __m128d, b: __m128d, rounding: i32) -> __m128d;
    #[link_name = "llvm.x86.sse41.round.ss"]
    fn roundss(a: __m128, b: __m128, rounding: i32) -> __m128;
    #[link_name = "llvm.x86.sse41.phminposuw"]
    fn phminposuw(a: u16x8) -> u16x8;
    #[link_name = "llvm.x86.sse41.pmuldq"]
    fn pmuldq(a: i32x4, b: i32x4) -> i64x2;
    #[link_name = "llvm.x86.sse41.mpsadbw"]
    fn mpsadbw(a: u8x16, b: u8x16, imm8: u8) -> u16x8;
    #[link_name = "llvm.x86.sse41.ptestz"]
    fn ptestz(a: i64x2, mask: i64x2) -> i32;
    #[link_name = "llvm.x86.sse41.ptestc"]
    fn ptestc(a: i64x2, mask: i64x2) -> i32;
    #[link_name = "llvm.x86.sse41.ptestnzc"]
    fn ptestnzc(a: i64x2, mask: i64x2) -> i32;
}
1219 | ||
1220 | #[cfg(test)] | |
1221 | mod tests { | |
532ac7d7 | 1222 | use crate::core_arch::x86::*; |
0531ce1d | 1223 | use std::mem; |
416331ca | 1224 | use stdarch_test::simd_test; |
0531ce1d | 1225 | |
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_blendv_epi8() {
        // Byte-wise variable blend: a byte comes from `b` when the
        // corresponding mask byte has its MSB set (-1 = 0xFF), else from `a`.
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
        );
        // Alternating mask: odd lanes select from `b`.
        #[rustfmt::skip]
        let mask = _mm_setr_epi8(
            0, -1, 0, -1, 0, -1, 0, -1,
            0, -1, 0, -1, 0, -1, 0, -1,
        );
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31,
        );
        assert_eq_m128i(_mm_blendv_epi8(a, b, mask), e);
    }
1248 | ||
83c7162d | 1249 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1250 | unsafe fn test_mm_blendv_pd() { |
1251 | let a = _mm_set1_pd(0.0); | |
1252 | let b = _mm_set1_pd(1.0); | |
532ac7d7 | 1253 | let mask = transmute(_mm_setr_epi64x(0, -1)); |
0531ce1d XL |
1254 | let r = _mm_blendv_pd(a, b, mask); |
1255 | let e = _mm_setr_pd(0.0, 1.0); | |
1256 | assert_eq_m128d(r, e); | |
1257 | } | |
1258 | ||
83c7162d | 1259 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1260 | unsafe fn test_mm_blendv_ps() { |
1261 | let a = _mm_set1_ps(0.0); | |
1262 | let b = _mm_set1_ps(1.0); | |
532ac7d7 | 1263 | let mask = transmute(_mm_setr_epi32(0, -1, 0, -1)); |
0531ce1d XL |
1264 | let r = _mm_blendv_ps(a, b, mask); |
1265 | let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0); | |
1266 | assert_eq_m128(r, e); | |
1267 | } | |
1268 | ||
83c7162d | 1269 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1270 | unsafe fn test_mm_blend_pd() { |
1271 | let a = _mm_set1_pd(0.0); | |
1272 | let b = _mm_set1_pd(1.0); | |
1273 | let r = _mm_blend_pd(a, b, 0b10); | |
1274 | let e = _mm_setr_pd(0.0, 1.0); | |
1275 | assert_eq_m128d(r, e); | |
1276 | } | |
1277 | ||
83c7162d | 1278 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1279 | unsafe fn test_mm_blend_ps() { |
1280 | let a = _mm_set1_ps(0.0); | |
1281 | let b = _mm_set1_ps(1.0); | |
1282 | let r = _mm_blend_ps(a, b, 0b1010); | |
1283 | let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0); | |
1284 | assert_eq_m128(r, e); | |
1285 | } | |
1286 | ||
83c7162d | 1287 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1288 | unsafe fn test_mm_blend_epi16() { |
1289 | let a = _mm_set1_epi16(0); | |
1290 | let b = _mm_set1_epi16(1); | |
1291 | let r = _mm_blend_epi16(a, b, 0b1010_1100); | |
1292 | let e = _mm_setr_epi16(0, 0, 1, 1, 0, 1, 0, 1); | |
1293 | assert_eq_m128i(r, e); | |
1294 | } | |
1295 | ||
83c7162d | 1296 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1297 | unsafe fn test_mm_extract_ps() { |
1298 | let a = _mm_setr_ps(0.0, 1.0, 2.0, 3.0); | |
532ac7d7 | 1299 | let r: f32 = transmute(_mm_extract_ps(a, 1)); |
0531ce1d | 1300 | assert_eq!(r, 1.0); |
532ac7d7 | 1301 | let r: f32 = transmute(_mm_extract_ps(a, 5)); |
0531ce1d XL |
1302 | assert_eq!(r, 1.0); |
1303 | } | |
1304 | ||
83c7162d | 1305 | #[simd_test(enable = "sse4.1")] |
0531ce1d | 1306 | unsafe fn test_mm_extract_epi8() { |
0731742a | 1307 | #[rustfmt::skip] |
0531ce1d XL |
1308 | let a = _mm_setr_epi8( |
1309 | -1, 1, 2, 3, 4, 5, 6, 7, | |
1310 | 8, 9, 10, 11, 12, 13, 14, 15 | |
1311 | ); | |
1312 | let r1 = _mm_extract_epi8(a, 0); | |
1313 | let r2 = _mm_extract_epi8(a, 19); | |
1314 | assert_eq!(r1, 0xFF); | |
1315 | assert_eq!(r2, 3); | |
1316 | } | |
1317 | ||
83c7162d | 1318 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1319 | unsafe fn test_mm_extract_epi32() { |
1320 | let a = _mm_setr_epi32(0, 1, 2, 3); | |
1321 | let r = _mm_extract_epi32(a, 1); | |
1322 | assert_eq!(r, 1); | |
1323 | let r = _mm_extract_epi32(a, 5); | |
1324 | assert_eq!(r, 1); | |
1325 | } | |
1326 | ||
83c7162d | 1327 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1328 | unsafe fn test_mm_insert_ps() { |
1329 | let a = _mm_set1_ps(1.0); | |
1330 | let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0); | |
1331 | let r = _mm_insert_ps(a, b, 0b11_00_1100); | |
1332 | let e = _mm_setr_ps(4.0, 1.0, 0.0, 0.0); | |
1333 | assert_eq_m128(r, e); | |
1334 | } | |
1335 | ||
83c7162d | 1336 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1337 | unsafe fn test_mm_insert_epi8() { |
1338 | let a = _mm_set1_epi8(0); | |
1339 | let e = _mm_setr_epi8(0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); | |
1340 | let r = _mm_insert_epi8(a, 32, 1); | |
1341 | assert_eq_m128i(r, e); | |
1342 | let r = _mm_insert_epi8(a, 32, 17); | |
1343 | assert_eq_m128i(r, e); | |
1344 | } | |
1345 | ||
83c7162d | 1346 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1347 | unsafe fn test_mm_insert_epi32() { |
1348 | let a = _mm_set1_epi32(0); | |
1349 | let e = _mm_setr_epi32(0, 32, 0, 0); | |
1350 | let r = _mm_insert_epi32(a, 32, 1); | |
1351 | assert_eq_m128i(r, e); | |
1352 | let r = _mm_insert_epi32(a, 32, 5); | |
1353 | assert_eq_m128i(r, e); | |
1354 | } | |
1355 | ||
83c7162d | 1356 | #[simd_test(enable = "sse4.1")] |
0531ce1d | 1357 | unsafe fn test_mm_max_epi8() { |
0731742a | 1358 | #[rustfmt::skip] |
0531ce1d XL |
1359 | let a = _mm_setr_epi8( |
1360 | 1, 4, 5, 8, 9, 12, 13, 16, | |
1361 | 17, 20, 21, 24, 25, 28, 29, 32, | |
1362 | ); | |
0731742a | 1363 | #[rustfmt::skip] |
0531ce1d XL |
1364 | let b = _mm_setr_epi8( |
1365 | 2, 3, 6, 7, 10, 11, 14, 15, | |
1366 | 18, 19, 22, 23, 26, 27, 30, 31, | |
1367 | ); | |
1368 | let r = _mm_max_epi8(a, b); | |
0731742a | 1369 | #[rustfmt::skip] |
0531ce1d XL |
1370 | let e = _mm_setr_epi8( |
1371 | 2, 4, 6, 8, 10, 12, 14, 16, | |
1372 | 18, 20, 22, 24, 26, 28, 30, 32, | |
1373 | ); | |
1374 | assert_eq_m128i(r, e); | |
1375 | } | |
1376 | ||
83c7162d | 1377 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1378 | unsafe fn test_mm_max_epu16() { |
1379 | let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16); | |
1380 | let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15); | |
1381 | let r = _mm_max_epu16(a, b); | |
1382 | let e = _mm_setr_epi16(2, 4, 6, 8, 10, 12, 14, 16); | |
1383 | assert_eq_m128i(r, e); | |
1384 | } | |
1385 | ||
83c7162d | 1386 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1387 | unsafe fn test_mm_max_epi32() { |
1388 | let a = _mm_setr_epi32(1, 4, 5, 8); | |
1389 | let b = _mm_setr_epi32(2, 3, 6, 7); | |
1390 | let r = _mm_max_epi32(a, b); | |
1391 | let e = _mm_setr_epi32(2, 4, 6, 8); | |
1392 | assert_eq_m128i(r, e); | |
1393 | } | |
1394 | ||
83c7162d | 1395 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1396 | unsafe fn test_mm_max_epu32() { |
1397 | let a = _mm_setr_epi32(1, 4, 5, 8); | |
1398 | let b = _mm_setr_epi32(2, 3, 6, 7); | |
1399 | let r = _mm_max_epu32(a, b); | |
1400 | let e = _mm_setr_epi32(2, 4, 6, 8); | |
1401 | assert_eq_m128i(r, e); | |
1402 | } | |
1403 | ||
83c7162d | 1404 | #[simd_test(enable = "sse4.1")] |
0531ce1d | 1405 | unsafe fn test_mm_min_epi8_1() { |
0731742a | 1406 | #[rustfmt::skip] |
0531ce1d XL |
1407 | let a = _mm_setr_epi8( |
1408 | 1, 4, 5, 8, 9, 12, 13, 16, | |
1409 | 17, 20, 21, 24, 25, 28, 29, 32, | |
1410 | ); | |
0731742a | 1411 | #[rustfmt::skip] |
0531ce1d XL |
1412 | let b = _mm_setr_epi8( |
1413 | 2, 3, 6, 7, 10, 11, 14, 15, | |
1414 | 18, 19, 22, 23, 26, 27, 30, 31, | |
1415 | ); | |
1416 | let r = _mm_min_epi8(a, b); | |
0731742a | 1417 | #[rustfmt::skip] |
0531ce1d XL |
1418 | let e = _mm_setr_epi8( |
1419 | 1, 3, 5, 7, 9, 11, 13, 15, | |
1420 | 17, 19, 21, 23, 25, 27, 29, 31, | |
1421 | ); | |
1422 | assert_eq_m128i(r, e); | |
1423 | } | |
1424 | ||
83c7162d | 1425 | #[simd_test(enable = "sse4.1")] |
0531ce1d | 1426 | unsafe fn test_mm_min_epi8_2() { |
0731742a | 1427 | #[rustfmt::skip] |
0531ce1d XL |
1428 | let a = _mm_setr_epi8( |
1429 | 1, -4, -5, 8, -9, -12, 13, -16, | |
1430 | 17, 20, 21, 24, 25, 28, 29, 32, | |
1431 | ); | |
0731742a | 1432 | #[rustfmt::skip] |
0531ce1d XL |
1433 | let b = _mm_setr_epi8( |
1434 | 2, -3, -6, 7, -10, -11, 14, -15, | |
1435 | 18, 19, 22, 23, 26, 27, 30, 31, | |
1436 | ); | |
1437 | let r = _mm_min_epi8(a, b); | |
0731742a | 1438 | #[rustfmt::skip] |
0531ce1d XL |
1439 | let e = _mm_setr_epi8( |
1440 | 1, -4, -6, 7, -10, -12, 13, -16, | |
1441 | 17, 19, 21, 23, 25, 27, 29, 31, | |
1442 | ); | |
1443 | assert_eq_m128i(r, e); | |
1444 | } | |
1445 | ||
83c7162d | 1446 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1447 | unsafe fn test_mm_min_epu16() { |
1448 | let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16); | |
1449 | let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15); | |
1450 | let r = _mm_min_epu16(a, b); | |
1451 | let e = _mm_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15); | |
1452 | assert_eq_m128i(r, e); | |
1453 | } | |
1454 | ||
83c7162d | 1455 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1456 | unsafe fn test_mm_min_epi32_1() { |
1457 | let a = _mm_setr_epi32(1, 4, 5, 8); | |
1458 | let b = _mm_setr_epi32(2, 3, 6, 7); | |
1459 | let r = _mm_min_epi32(a, b); | |
1460 | let e = _mm_setr_epi32(1, 3, 5, 7); | |
1461 | assert_eq_m128i(r, e); | |
1462 | } | |
1463 | ||
83c7162d | 1464 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1465 | unsafe fn test_mm_min_epi32_2() { |
1466 | let a = _mm_setr_epi32(-1, 4, 5, -7); | |
1467 | let b = _mm_setr_epi32(-2, 3, -6, 8); | |
1468 | let r = _mm_min_epi32(a, b); | |
1469 | let e = _mm_setr_epi32(-2, 3, -6, -7); | |
1470 | assert_eq_m128i(r, e); | |
1471 | } | |
1472 | ||
83c7162d | 1473 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1474 | unsafe fn test_mm_min_epu32() { |
1475 | let a = _mm_setr_epi32(1, 4, 5, 8); | |
1476 | let b = _mm_setr_epi32(2, 3, 6, 7); | |
1477 | let r = _mm_min_epu32(a, b); | |
1478 | let e = _mm_setr_epi32(1, 3, 5, 7); | |
1479 | assert_eq_m128i(r, e); | |
1480 | } | |
1481 | ||
83c7162d | 1482 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1483 | unsafe fn test_mm_packus_epi32() { |
1484 | let a = _mm_setr_epi32(1, 2, 3, 4); | |
1485 | let b = _mm_setr_epi32(-1, -2, -3, -4); | |
1486 | let r = _mm_packus_epi32(a, b); | |
1487 | let e = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0); | |
1488 | assert_eq_m128i(r, e); | |
1489 | } | |
1490 | ||
83c7162d | 1491 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1492 | unsafe fn test_mm_cmpeq_epi64() { |
1493 | let a = _mm_setr_epi64x(0, 1); | |
1494 | let b = _mm_setr_epi64x(0, 0); | |
1495 | let r = _mm_cmpeq_epi64(a, b); | |
1496 | let e = _mm_setr_epi64x(-1, 0); | |
1497 | assert_eq_m128i(r, e); | |
1498 | } | |
1499 | ||
83c7162d | 1500 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1501 | unsafe fn test_mm_cvtepi8_epi16() { |
1502 | let a = _mm_set1_epi8(10); | |
1503 | let r = _mm_cvtepi8_epi16(a); | |
1504 | let e = _mm_set1_epi16(10); | |
1505 | assert_eq_m128i(r, e); | |
1506 | let a = _mm_set1_epi8(-10); | |
1507 | let r = _mm_cvtepi8_epi16(a); | |
1508 | let e = _mm_set1_epi16(-10); | |
1509 | assert_eq_m128i(r, e); | |
1510 | } | |
1511 | ||
83c7162d | 1512 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1513 | unsafe fn test_mm_cvtepi8_epi32() { |
1514 | let a = _mm_set1_epi8(10); | |
1515 | let r = _mm_cvtepi8_epi32(a); | |
1516 | let e = _mm_set1_epi32(10); | |
1517 | assert_eq_m128i(r, e); | |
1518 | let a = _mm_set1_epi8(-10); | |
1519 | let r = _mm_cvtepi8_epi32(a); | |
1520 | let e = _mm_set1_epi32(-10); | |
1521 | assert_eq_m128i(r, e); | |
1522 | } | |
1523 | ||
83c7162d | 1524 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1525 | unsafe fn test_mm_cvtepi8_epi64() { |
1526 | let a = _mm_set1_epi8(10); | |
1527 | let r = _mm_cvtepi8_epi64(a); | |
1528 | let e = _mm_set1_epi64x(10); | |
1529 | assert_eq_m128i(r, e); | |
1530 | let a = _mm_set1_epi8(-10); | |
1531 | let r = _mm_cvtepi8_epi64(a); | |
1532 | let e = _mm_set1_epi64x(-10); | |
1533 | assert_eq_m128i(r, e); | |
1534 | } | |
1535 | ||
83c7162d | 1536 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1537 | unsafe fn test_mm_cvtepi16_epi32() { |
1538 | let a = _mm_set1_epi16(10); | |
1539 | let r = _mm_cvtepi16_epi32(a); | |
1540 | let e = _mm_set1_epi32(10); | |
1541 | assert_eq_m128i(r, e); | |
1542 | let a = _mm_set1_epi16(-10); | |
1543 | let r = _mm_cvtepi16_epi32(a); | |
1544 | let e = _mm_set1_epi32(-10); | |
1545 | assert_eq_m128i(r, e); | |
1546 | } | |
1547 | ||
83c7162d | 1548 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1549 | unsafe fn test_mm_cvtepi16_epi64() { |
1550 | let a = _mm_set1_epi16(10); | |
1551 | let r = _mm_cvtepi16_epi64(a); | |
1552 | let e = _mm_set1_epi64x(10); | |
1553 | assert_eq_m128i(r, e); | |
1554 | let a = _mm_set1_epi16(-10); | |
1555 | let r = _mm_cvtepi16_epi64(a); | |
1556 | let e = _mm_set1_epi64x(-10); | |
1557 | assert_eq_m128i(r, e); | |
1558 | } | |
1559 | ||
83c7162d | 1560 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1561 | unsafe fn test_mm_cvtepi32_epi64() { |
1562 | let a = _mm_set1_epi32(10); | |
1563 | let r = _mm_cvtepi32_epi64(a); | |
1564 | let e = _mm_set1_epi64x(10); | |
1565 | assert_eq_m128i(r, e); | |
1566 | let a = _mm_set1_epi32(-10); | |
1567 | let r = _mm_cvtepi32_epi64(a); | |
1568 | let e = _mm_set1_epi64x(-10); | |
1569 | assert_eq_m128i(r, e); | |
1570 | } | |
1571 | ||
83c7162d | 1572 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1573 | unsafe fn test_mm_cvtepu8_epi16() { |
1574 | let a = _mm_set1_epi8(10); | |
1575 | let r = _mm_cvtepu8_epi16(a); | |
1576 | let e = _mm_set1_epi16(10); | |
1577 | assert_eq_m128i(r, e); | |
1578 | } | |
1579 | ||
83c7162d | 1580 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1581 | unsafe fn test_mm_cvtepu8_epi32() { |
1582 | let a = _mm_set1_epi8(10); | |
1583 | let r = _mm_cvtepu8_epi32(a); | |
1584 | let e = _mm_set1_epi32(10); | |
1585 | assert_eq_m128i(r, e); | |
1586 | } | |
1587 | ||
83c7162d | 1588 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1589 | unsafe fn test_mm_cvtepu8_epi64() { |
1590 | let a = _mm_set1_epi8(10); | |
1591 | let r = _mm_cvtepu8_epi64(a); | |
1592 | let e = _mm_set1_epi64x(10); | |
1593 | assert_eq_m128i(r, e); | |
1594 | } | |
1595 | ||
83c7162d | 1596 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1597 | unsafe fn test_mm_cvtepu16_epi32() { |
1598 | let a = _mm_set1_epi16(10); | |
1599 | let r = _mm_cvtepu16_epi32(a); | |
1600 | let e = _mm_set1_epi32(10); | |
1601 | assert_eq_m128i(r, e); | |
1602 | } | |
1603 | ||
83c7162d | 1604 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1605 | unsafe fn test_mm_cvtepu16_epi64() { |
1606 | let a = _mm_set1_epi16(10); | |
1607 | let r = _mm_cvtepu16_epi64(a); | |
1608 | let e = _mm_set1_epi64x(10); | |
1609 | assert_eq_m128i(r, e); | |
1610 | } | |
1611 | ||
83c7162d | 1612 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1613 | unsafe fn test_mm_cvtepu32_epi64() { |
1614 | let a = _mm_set1_epi32(10); | |
1615 | let r = _mm_cvtepu32_epi64(a); | |
1616 | let e = _mm_set1_epi64x(10); | |
1617 | assert_eq_m128i(r, e); | |
1618 | } | |
1619 | ||
83c7162d | 1620 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1621 | unsafe fn test_mm_dp_pd() { |
1622 | let a = _mm_setr_pd(2.0, 3.0); | |
1623 | let b = _mm_setr_pd(1.0, 4.0); | |
1624 | let e = _mm_setr_pd(14.0, 0.0); | |
1625 | assert_eq_m128d(_mm_dp_pd(a, b, 0b00110001), e); | |
1626 | } | |
1627 | ||
83c7162d | 1628 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1629 | unsafe fn test_mm_dp_ps() { |
1630 | let a = _mm_setr_ps(2.0, 3.0, 1.0, 10.0); | |
1631 | let b = _mm_setr_ps(1.0, 4.0, 0.5, 10.0); | |
1632 | let e = _mm_setr_ps(14.5, 0.0, 14.5, 0.0); | |
1633 | assert_eq_m128(_mm_dp_ps(a, b, 0b01110101), e); | |
1634 | } | |
1635 | ||
83c7162d | 1636 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1637 | unsafe fn test_mm_floor_pd() { |
1638 | let a = _mm_setr_pd(2.5, 4.5); | |
1639 | let r = _mm_floor_pd(a); | |
1640 | let e = _mm_setr_pd(2.0, 4.0); | |
1641 | assert_eq_m128d(r, e); | |
1642 | } | |
1643 | ||
83c7162d | 1644 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1645 | unsafe fn test_mm_floor_ps() { |
1646 | let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5); | |
1647 | let r = _mm_floor_ps(a); | |
1648 | let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0); | |
1649 | assert_eq_m128(r, e); | |
1650 | } | |
1651 | ||
83c7162d | 1652 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1653 | unsafe fn test_mm_floor_sd() { |
1654 | let a = _mm_setr_pd(2.5, 4.5); | |
1655 | let b = _mm_setr_pd(-1.5, -3.5); | |
1656 | let r = _mm_floor_sd(a, b); | |
1657 | let e = _mm_setr_pd(-2.0, 4.5); | |
1658 | assert_eq_m128d(r, e); | |
1659 | } | |
1660 | ||
83c7162d | 1661 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1662 | unsafe fn test_mm_floor_ss() { |
1663 | let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5); | |
1664 | let b = _mm_setr_ps(-1.5, -3.5, -7.5, -15.5); | |
1665 | let r = _mm_floor_ss(a, b); | |
1666 | let e = _mm_setr_ps(-2.0, 4.5, 8.5, 16.5); | |
1667 | assert_eq_m128(r, e); | |
1668 | } | |
1669 | ||
83c7162d | 1670 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1671 | unsafe fn test_mm_ceil_pd() { |
1672 | let a = _mm_setr_pd(1.5, 3.5); | |
1673 | let r = _mm_ceil_pd(a); | |
1674 | let e = _mm_setr_pd(2.0, 4.0); | |
1675 | assert_eq_m128d(r, e); | |
1676 | } | |
1677 | ||
83c7162d | 1678 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1679 | unsafe fn test_mm_ceil_ps() { |
1680 | let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5); | |
1681 | let r = _mm_ceil_ps(a); | |
1682 | let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0); | |
1683 | assert_eq_m128(r, e); | |
1684 | } | |
1685 | ||
83c7162d | 1686 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1687 | unsafe fn test_mm_ceil_sd() { |
1688 | let a = _mm_setr_pd(1.5, 3.5); | |
1689 | let b = _mm_setr_pd(-2.5, -4.5); | |
1690 | let r = _mm_ceil_sd(a, b); | |
1691 | let e = _mm_setr_pd(-2.0, 3.5); | |
1692 | assert_eq_m128d(r, e); | |
1693 | } | |
1694 | ||
83c7162d | 1695 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1696 | unsafe fn test_mm_ceil_ss() { |
1697 | let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5); | |
1698 | let b = _mm_setr_ps(-2.5, -4.5, -8.5, -16.5); | |
1699 | let r = _mm_ceil_ss(a, b); | |
1700 | let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5); | |
1701 | assert_eq_m128(r, e); | |
1702 | } | |
1703 | ||
83c7162d | 1704 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1705 | unsafe fn test_mm_round_pd() { |
1706 | let a = _mm_setr_pd(1.25, 3.75); | |
1707 | let r = _mm_round_pd(a, _MM_FROUND_TO_NEAREST_INT); | |
1708 | let e = _mm_setr_pd(1.0, 4.0); | |
1709 | assert_eq_m128d(r, e); | |
1710 | } | |
1711 | ||
83c7162d | 1712 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1713 | unsafe fn test_mm_round_ps() { |
1714 | let a = _mm_setr_ps(2.25, 4.75, -1.75, -4.25); | |
1715 | let r = _mm_round_ps(a, _MM_FROUND_TO_ZERO); | |
1716 | let e = _mm_setr_ps(2.0, 4.0, -1.0, -4.0); | |
1717 | assert_eq_m128(r, e); | |
1718 | } | |
1719 | ||
83c7162d | 1720 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1721 | unsafe fn test_mm_round_sd() { |
1722 | let a = _mm_setr_pd(1.5, 3.5); | |
1723 | let b = _mm_setr_pd(-2.5, -4.5); | |
1724 | let old_mode = _MM_GET_ROUNDING_MODE(); | |
1725 | _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO); | |
1726 | let r = _mm_round_sd(a, b, _MM_FROUND_CUR_DIRECTION); | |
1727 | _MM_SET_ROUNDING_MODE(old_mode); | |
1728 | let e = _mm_setr_pd(-2.0, 3.5); | |
1729 | assert_eq_m128d(r, e); | |
1730 | } | |
1731 | ||
83c7162d | 1732 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1733 | unsafe fn test_mm_round_ss() { |
1734 | let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5); | |
1735 | let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5); | |
1736 | let old_mode = _MM_GET_ROUNDING_MODE(); | |
1737 | _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); | |
1738 | let r = _mm_round_ss(a, b, _MM_FROUND_CUR_DIRECTION); | |
1739 | _MM_SET_ROUNDING_MODE(old_mode); | |
1740 | let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5); | |
1741 | assert_eq_m128(r, e); | |
1742 | } | |
1743 | ||
83c7162d | 1744 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1745 | unsafe fn test_mm_minpos_epu16_1() { |
1746 | let a = _mm_setr_epi16(23, 18, 44, 97, 50, 13, 67, 66); | |
1747 | let r = _mm_minpos_epu16(a); | |
1748 | let e = _mm_setr_epi16(13, 5, 0, 0, 0, 0, 0, 0); | |
1749 | assert_eq_m128i(r, e); | |
1750 | } | |
1751 | ||
83c7162d | 1752 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1753 | unsafe fn test_mm_minpos_epu16_2() { |
1754 | let a = _mm_setr_epi16(0, 18, 44, 97, 50, 13, 67, 66); | |
1755 | let r = _mm_minpos_epu16(a); | |
1756 | let e = _mm_setr_epi16(0, 0, 0, 0, 0, 0, 0, 0); | |
1757 | assert_eq_m128i(r, e); | |
1758 | } | |
1759 | ||
83c7162d | 1760 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1761 | unsafe fn test_mm_mul_epi32() { |
1762 | { | |
1763 | let a = _mm_setr_epi32(1, 1, 1, 1); | |
1764 | let b = _mm_setr_epi32(1, 2, 3, 4); | |
1765 | let r = _mm_mul_epi32(a, b); | |
1766 | let e = _mm_setr_epi64x(1, 3); | |
1767 | assert_eq_m128i(r, e); | |
1768 | } | |
1769 | { | |
0731742a | 1770 | let a = _mm_setr_epi32(15, 2 /* ignored */, 1234567, 4 /* ignored */); |
0531ce1d | 1771 | let b = _mm_setr_epi32( |
8faf50e0 XL |
1772 | -20, -256, /* ignored */ |
1773 | 666666, 666666, /* ignored */ | |
0531ce1d XL |
1774 | ); |
1775 | let r = _mm_mul_epi32(a, b); | |
1776 | let e = _mm_setr_epi64x(-300, 823043843622); | |
1777 | assert_eq_m128i(r, e); | |
1778 | } | |
1779 | } | |
1780 | ||
83c7162d | 1781 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1782 | unsafe fn test_mm_mullo_epi32() { |
1783 | { | |
1784 | let a = _mm_setr_epi32(1, 1, 1, 1); | |
1785 | let b = _mm_setr_epi32(1, 2, 3, 4); | |
1786 | let r = _mm_mullo_epi32(a, b); | |
1787 | let e = _mm_setr_epi32(1, 2, 3, 4); | |
1788 | assert_eq_m128i(r, e); | |
1789 | } | |
1790 | { | |
1791 | let a = _mm_setr_epi32(15, -2, 1234567, 99999); | |
1792 | let b = _mm_setr_epi32(-20, -256, 666666, -99999); | |
1793 | let r = _mm_mullo_epi32(a, b); | |
1794 | // Attention, most significant bit in r[2] is treated | |
1795 | // as a sign bit: | |
1796 | // 1234567 * 666666 = -1589877210 | |
1797 | let e = _mm_setr_epi32(-300, 512, -1589877210, -1409865409); | |
1798 | assert_eq_m128i(r, e); | |
1799 | } | |
1800 | } | |
1801 | ||
83c7162d | 1802 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1803 | unsafe fn test_mm_minpos_epu16() { |
1804 | let a = _mm_setr_epi16(8, 7, 6, 5, 4, 1, 2, 3); | |
1805 | let r = _mm_minpos_epu16(a); | |
1806 | let e = _mm_setr_epi16(1, 5, 0, 0, 0, 0, 0, 0); | |
1807 | assert_eq_m128i(r, e); | |
1808 | } | |
1809 | ||
83c7162d | 1810 | #[simd_test(enable = "sse4.1")] |
0531ce1d | 1811 | unsafe fn test_mm_mpsadbw_epu8() { |
0731742a | 1812 | #[rustfmt::skip] |
0531ce1d XL |
1813 | let a = _mm_setr_epi8( |
1814 | 0, 1, 2, 3, 4, 5, 6, 7, | |
1815 | 8, 9, 10, 11, 12, 13, 14, 15, | |
1816 | ); | |
1817 | ||
1818 | let r = _mm_mpsadbw_epu8(a, a, 0b000); | |
1819 | let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28); | |
1820 | assert_eq_m128i(r, e); | |
1821 | ||
1822 | let r = _mm_mpsadbw_epu8(a, a, 0b001); | |
1823 | let e = _mm_setr_epi16(16, 12, 8, 4, 0, 4, 8, 12); | |
1824 | assert_eq_m128i(r, e); | |
1825 | ||
1826 | let r = _mm_mpsadbw_epu8(a, a, 0b100); | |
1827 | let e = _mm_setr_epi16(16, 20, 24, 28, 32, 36, 40, 44); | |
1828 | assert_eq_m128i(r, e); | |
1829 | ||
1830 | let r = _mm_mpsadbw_epu8(a, a, 0b101); | |
1831 | let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28); | |
1832 | assert_eq_m128i(r, e); | |
1833 | ||
1834 | let r = _mm_mpsadbw_epu8(a, a, 0b111); | |
1835 | let e = _mm_setr_epi16(32, 28, 24, 20, 16, 12, 8, 4); | |
1836 | assert_eq_m128i(r, e); | |
1837 | } | |
1838 | ||
83c7162d | 1839 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1840 | unsafe fn test_mm_testz_si128() { |
1841 | let a = _mm_set1_epi8(1); | |
1842 | let mask = _mm_set1_epi8(0); | |
1843 | let r = _mm_testz_si128(a, mask); | |
1844 | assert_eq!(r, 1); | |
1845 | let a = _mm_set1_epi8(0b101); | |
1846 | let mask = _mm_set1_epi8(0b110); | |
1847 | let r = _mm_testz_si128(a, mask); | |
1848 | assert_eq!(r, 0); | |
1849 | let a = _mm_set1_epi8(0b011); | |
1850 | let mask = _mm_set1_epi8(0b100); | |
1851 | let r = _mm_testz_si128(a, mask); | |
1852 | assert_eq!(r, 1); | |
1853 | } | |
1854 | ||
83c7162d | 1855 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1856 | unsafe fn test_mm_testc_si128() { |
1857 | let a = _mm_set1_epi8(-1); | |
1858 | let mask = _mm_set1_epi8(0); | |
1859 | let r = _mm_testc_si128(a, mask); | |
1860 | assert_eq!(r, 1); | |
1861 | let a = _mm_set1_epi8(0b101); | |
1862 | let mask = _mm_set1_epi8(0b110); | |
1863 | let r = _mm_testc_si128(a, mask); | |
1864 | assert_eq!(r, 0); | |
1865 | let a = _mm_set1_epi8(0b101); | |
1866 | let mask = _mm_set1_epi8(0b100); | |
1867 | let r = _mm_testc_si128(a, mask); | |
1868 | assert_eq!(r, 1); | |
1869 | } | |
1870 | ||
83c7162d | 1871 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1872 | unsafe fn test_mm_testnzc_si128() { |
1873 | let a = _mm_set1_epi8(0); | |
1874 | let mask = _mm_set1_epi8(1); | |
1875 | let r = _mm_testnzc_si128(a, mask); | |
1876 | assert_eq!(r, 0); | |
1877 | let a = _mm_set1_epi8(-1); | |
1878 | let mask = _mm_set1_epi8(0); | |
1879 | let r = _mm_testnzc_si128(a, mask); | |
1880 | assert_eq!(r, 0); | |
1881 | let a = _mm_set1_epi8(0b101); | |
1882 | let mask = _mm_set1_epi8(0b110); | |
1883 | let r = _mm_testnzc_si128(a, mask); | |
1884 | assert_eq!(r, 1); | |
1885 | let a = _mm_set1_epi8(0b101); | |
1886 | let mask = _mm_set1_epi8(0b101); | |
1887 | let r = _mm_testnzc_si128(a, mask); | |
1888 | assert_eq!(r, 0); | |
1889 | } | |
1890 | ||
83c7162d | 1891 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1892 | unsafe fn test_mm_test_all_zeros() { |
1893 | let a = _mm_set1_epi8(1); | |
1894 | let mask = _mm_set1_epi8(0); | |
1895 | let r = _mm_test_all_zeros(a, mask); | |
1896 | assert_eq!(r, 1); | |
1897 | let a = _mm_set1_epi8(0b101); | |
1898 | let mask = _mm_set1_epi8(0b110); | |
1899 | let r = _mm_test_all_zeros(a, mask); | |
1900 | assert_eq!(r, 0); | |
1901 | let a = _mm_set1_epi8(0b011); | |
1902 | let mask = _mm_set1_epi8(0b100); | |
1903 | let r = _mm_test_all_zeros(a, mask); | |
1904 | assert_eq!(r, 1); | |
1905 | } | |
1906 | ||
83c7162d | 1907 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1908 | unsafe fn test_mm_test_all_ones() { |
1909 | let a = _mm_set1_epi8(-1); | |
1910 | let r = _mm_test_all_ones(a); | |
1911 | assert_eq!(r, 1); | |
1912 | let a = _mm_set1_epi8(0b101); | |
1913 | let r = _mm_test_all_ones(a); | |
1914 | assert_eq!(r, 0); | |
1915 | } | |
1916 | ||
83c7162d | 1917 | #[simd_test(enable = "sse4.1")] |
0531ce1d XL |
1918 | unsafe fn test_mm_test_mix_ones_zeros() { |
1919 | let a = _mm_set1_epi8(0); | |
1920 | let mask = _mm_set1_epi8(1); | |
1921 | let r = _mm_test_mix_ones_zeros(a, mask); | |
1922 | assert_eq!(r, 0); | |
1923 | let a = _mm_set1_epi8(-1); | |
1924 | let mask = _mm_set1_epi8(0); | |
1925 | let r = _mm_test_mix_ones_zeros(a, mask); | |
1926 | assert_eq!(r, 0); | |
1927 | let a = _mm_set1_epi8(0b101); | |
1928 | let mask = _mm_set1_epi8(0b110); | |
1929 | let r = _mm_test_mix_ones_zeros(a, mask); | |
1930 | assert_eq!(r, 1); | |
1931 | let a = _mm_set1_epi8(0b101); | |
1932 | let mask = _mm_set1_epi8(0b101); | |
1933 | let r = _mm_test_mix_ones_zeros(a, mask); | |
1934 | assert_eq!(r, 0); | |
1935 | } | |
1936 | } |