git.proxmox.com Git - cargo.git/blob - vendor/ppv-lite86/src/x86_64/mod.rs
New upstream version 0.47.0
[cargo.git] / vendor / ppv-lite86 / src / x86_64 / mod.rs
1 // crate minimums: sse2, x86_64
2
3 use core::arch::x86_64::{__m128i, __m256i};
4 use crate::types::*;
5
6 mod sse2;
7
/// Marker type: SSSE3 code paths are enabled (the `S3` slot of `SseMachine`).
#[derive(Clone, Copy)]
pub struct YesS3;
/// Marker type: SSSE3 code paths are disabled.
#[derive(Clone, Copy)]
pub struct NoS3;

/// Marker type: SSE4.1 code paths are enabled (the `S4` slot of `SseMachine`).
#[derive(Clone, Copy)]
pub struct YesS4;
/// Marker type: SSE4.1 code paths are disabled.
#[derive(Clone, Copy)]
pub struct NoS4;

/// Marker type for the "yes" side of the `A1` capability.
///
/// NOTE(review): not referenced by any machine in this file; presumably "AVX available" --
/// confirm against the `sse2` module.
#[derive(Clone, Copy)]
pub struct YesA1;
/// Marker type for the "no" side of the `A1` capability.
#[derive(Clone, Copy)]
pub struct NoA1;

/// Marker type for the "yes" side of the `A2` capability.
///
/// NOTE(review): not referenced by any machine in this file; presumably "AVX2 available" --
/// confirm against the `sse2` module.
#[derive(Clone, Copy)]
pub struct YesA2;
/// Marker type for the "no" side of the `A2` capability.
#[derive(Clone, Copy)]
pub struct NoA2;

/// Marker type for the "yes" side of the `NI` slot of `SseMachine` / `Avx2Machine`.
///
/// NOTE(review): presumably AES-NI; every alias in this file uses `NoNI` -- confirm.
#[derive(Clone, Copy)]
pub struct YesNI;
/// Marker type for the "no" side of the `NI` slot.
#[derive(Clone, Copy)]
pub struct NoNI;
32
33 use core::marker::PhantomData;
34
35 #[derive(Copy, Clone)]
36 pub struct SseMachine<S3, S4, NI>(PhantomData<(S3, S4, NI)>);
37 impl<S3: Copy, S4: Copy, NI: Copy> Machine for SseMachine<S3, S4, NI>
38 where
39 sse2::u128x1_sse2<S3, S4, NI>: Swap64,
40 sse2::u64x2_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>,
41 sse2::u32x4_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>,
42 sse2::u64x4_sse2<S3, S4, NI>: BSwap + Words4,
43 sse2::u128x1_sse2<S3, S4, NI>: BSwap,
44 sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x2x2_sse2<S3, S4, NI>>,
45 sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x4_sse2<S3, S4, NI>>,
46 sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u32x4x2_sse2<S3, S4, NI>>,
47 sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u64x2x4_sse2<S3, S4, NI>>,
48 sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u32x4x4_sse2<S3, S4, NI>>,
49 {
50 type u32x4 = sse2::u32x4_sse2<S3, S4, NI>;
51 type u64x2 = sse2::u64x2_sse2<S3, S4, NI>;
52 type u128x1 = sse2::u128x1_sse2<S3, S4, NI>;
53
54 type u32x4x2 = sse2::u32x4x2_sse2<S3, S4, NI>;
55 type u64x2x2 = sse2::u64x2x2_sse2<S3, S4, NI>;
56 type u64x4 = sse2::u64x4_sse2<S3, S4, NI>;
57 type u128x2 = sse2::u128x2_sse2<S3, S4, NI>;
58
59 type u32x4x4 = sse2::u32x4x4_sse2<S3, S4, NI>;
60 type u64x2x4 = sse2::u64x2x4_sse2<S3, S4, NI>;
61 type u128x4 = sse2::u128x4_sse2<S3, S4, NI>;
62
63 #[inline(always)]
64 unsafe fn instance() -> Self {
65 SseMachine(PhantomData)
66 }
67 }
68
69 #[derive(Copy, Clone)]
70 pub struct Avx2Machine<NI>(PhantomData<NI>);
71 impl<NI: Copy> Machine for Avx2Machine<NI>
72 where
73 sse2::u128x1_sse2<YesS3, YesS4, NI>: BSwap + Swap64,
74 sse2::u64x2_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>,
75 sse2::u32x4_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>,
76 sse2::u64x4_sse2<YesS3, YesS4, NI>: BSwap + Words4,
77 {
78 type u32x4 = sse2::u32x4_sse2<YesS3, YesS4, NI>;
79 type u64x2 = sse2::u64x2_sse2<YesS3, YesS4, NI>;
80 type u128x1 = sse2::u128x1_sse2<YesS3, YesS4, NI>;
81
82 type u32x4x2 = sse2::u32x4x2_sse2<YesS3, YesS4, NI>;
83 type u64x2x2 = sse2::u64x2x2_sse2<YesS3, YesS4, NI>;
84 type u64x4 = sse2::u64x4_sse2<YesS3, YesS4, NI>;
85 type u128x2 = sse2::u128x2_sse2<YesS3, YesS4, NI>;
86
87 type u32x4x4 = sse2::avx2::u32x4x4_avx2<NI>;
88 type u64x2x4 = sse2::u64x2x4_sse2<YesS3, YesS4, NI>;
89 type u128x4 = sse2::u128x4_sse2<YesS3, YesS4, NI>;
90
91 #[inline(always)]
92 unsafe fn instance() -> Self {
93 Avx2Machine(PhantomData)
94 }
95 }
96
97 pub type SSE2 = SseMachine<NoS3, NoS4, NoNI>;
98 pub type SSSE3 = SseMachine<YesS3, NoS4, NoNI>;
99 pub type SSE41 = SseMachine<YesS3, YesS4, NoNI>;
100 /// AVX but not AVX2: only 128-bit integer operations, but use VEX versions of everything
101 /// to avoid expensive SSE/VEX conflicts.
102 pub type AVX = SseMachine<YesS3, YesS4, NoNI>;
103 pub type AVX2 = Avx2Machine<NoNI>;
104
105 /// Generic wrapper for unparameterized storage of any of the possible impls.
106 /// Converting into and out of this type should be essentially free, although it may be more
107 /// aligned than a particular impl requires.
108 #[allow(non_camel_case_types)]
109 #[derive(Copy, Clone)]
110 pub union vec128_storage {
111 u32x4: [u32; 4],
112 u64x2: [u64; 2],
113 u128x1: [u128; 1],
114 sse2: __m128i,
115 }
116 impl Store<vec128_storage> for vec128_storage {
117 #[inline(always)]
118 unsafe fn unpack(p: vec128_storage) -> Self {
119 p
120 }
121 }
122 impl<'a> Into<&'a [u32; 4]> for &'a vec128_storage {
123 #[inline(always)]
124 fn into(self) -> &'a [u32; 4] {
125 unsafe { &self.u32x4 }
126 }
127 }
128 impl Into<vec128_storage> for [u32; 4] {
129 #[inline(always)]
130 fn into(self) -> vec128_storage {
131 vec128_storage { u32x4: self }
132 }
133 }
134 impl Default for vec128_storage {
135 #[inline(always)]
136 fn default() -> Self {
137 vec128_storage { u128x1: [0] }
138 }
139 }
140 impl Eq for vec128_storage {}
141 impl PartialEq for vec128_storage {
142 #[inline(always)]
143 fn eq(&self, rhs: &Self) -> bool {
144 unsafe { self.u128x1 == rhs.u128x1 }
145 }
146 }
147
148 #[allow(non_camel_case_types)]
149 #[derive(Copy, Clone)]
150 pub union vec256_storage {
151 u32x8: [u32; 8],
152 u64x4: [u64; 4],
153 u128x2: [u128; 2],
154 sse2: [vec128_storage; 2],
155 avx: __m256i,
156 }
157 impl Into<vec256_storage> for [u64; 4] {
158 #[inline(always)]
159 fn into(self) -> vec256_storage {
160 vec256_storage { u64x4: self }
161 }
162 }
163 impl Default for vec256_storage {
164 #[inline(always)]
165 fn default() -> Self {
166 vec256_storage { u128x2: [0, 0] }
167 }
168 }
169 impl vec256_storage {
170 pub fn new128(xs: [vec128_storage; 2]) -> Self {
171 Self { sse2: xs }
172 }
173 pub fn split128(self) -> [vec128_storage; 2] {
174 unsafe { self.sse2 }
175 }
176 }
177 impl Eq for vec256_storage {}
178 impl PartialEq for vec256_storage {
179 #[inline(always)]
180 fn eq(&self, rhs: &Self) -> bool {
181 unsafe { self.sse2 == rhs.sse2 }
182 }
183 }
184
185 #[allow(non_camel_case_types)]
186 #[derive(Copy, Clone)]
187 pub union vec512_storage {
188 u32x16: [u32; 16],
189 u64x8: [u64; 8],
190 u128x4: [u128; 4],
191 sse2: [vec128_storage; 4],
192 avx: [vec256_storage; 2],
193 }
194 impl Default for vec512_storage {
195 #[inline(always)]
196 fn default() -> Self {
197 vec512_storage {
198 u128x4: [0, 0, 0, 0],
199 }
200 }
201 }
202 impl vec512_storage {
203 pub fn new128(xs: [vec128_storage; 4]) -> Self {
204 Self { sse2: xs }
205 }
206 pub fn split128(self) -> [vec128_storage; 4] {
207 unsafe { self.sse2 }
208 }
209 }
210 impl Eq for vec512_storage {}
211 impl PartialEq for vec512_storage {
212 #[inline(always)]
213 fn eq(&self, rhs: &Self) -> bool {
214 unsafe { self.avx == rhs.avx }
215 }
216 }
217
218 macro_rules! impl_into {
219 ($storage:ident, $array:ty, $name:ident) => {
220 impl Into<$array> for $storage {
221 #[inline(always)]
222 fn into(self) -> $array {
223 unsafe { self.$name }
224 }
225 }
226 };
227 }
228 impl_into!(vec128_storage, [u32; 4], u32x4);
229 impl_into!(vec128_storage, [u64; 2], u64x2);
230 impl_into!(vec128_storage, [u128; 1], u128x1);
231 impl_into!(vec256_storage, [u32; 8], u32x8);
232 impl_into!(vec256_storage, [u64; 4], u64x4);
233 impl_into!(vec256_storage, [u128; 2], u128x2);
234 impl_into!(vec512_storage, [u32; 16], u32x16);
235 impl_into!(vec512_storage, [u64; 8], u64x8);
236 impl_into!(vec512_storage, [u128; 4], u128x4);
237
/// Generate the full set of optimized implementations to take advantage of the most important
/// hardware feature sets.
///
/// This dispatcher is suitable for maximizing throughput.
///
/// Invocation shape: `dispatch!(mach, MTy, { [pub] fn name(arg: Ty, ...) -> Ret { body } })`.
/// Inside `body`, `mach` is the machine value and `MTy` its generic type. The bracketed
/// visibility is optional, and an omitted return type is treated as `()`.
///
/// * With the `std` feature, the generated function detects CPU features at run time and calls
///   a `#[target_feature]` instantiation of `body`, preferring AVX2, then AVX, SSE4.1, SSSE3 and
///   finally SSE2.
/// * Without it, the instantiation is chosen at compile time from the enabled target features.
#[macro_export]
macro_rules! dispatch {
    ($m:ident, $M:ident, { $([$access:tt $(($scope:tt))*])* fn $name:ident($($p:ident: $pty:ty),* $(,)*) -> $out:ty $code:block }) => {
        #[cfg(feature = "std")]
        $($access $(($scope))*)* fn $name($($p: $pty),*) -> $out {
            use std::arch::x86_64::*;

            // The caller's body, generic over the machine; inlined into each wrapper below so
            // that it is compiled with that wrapper's target features.
            #[inline(always)]
            fn fn_impl<$M: $crate::Machine>($m: $M, $($p: $pty),*) -> $out $code

            #[target_feature(enable = "avx2")]
            unsafe fn impl_avx2($($p: $pty),*) -> $out {
                let result = fn_impl($crate::x86_64::AVX2::instance(), $($p),*);
                // Zero the upper YMM halves before returning to code that may use SSE encodings.
                _mm256_zeroupper();
                result
            }

            #[target_feature(enable = "avx")]
            #[target_feature(enable = "sse4.1")]
            #[target_feature(enable = "ssse3")]
            unsafe fn impl_avx($($p: $pty),*) -> $out {
                let result = fn_impl($crate::x86_64::AVX::instance(), $($p),*);
                _mm256_zeroupper();
                result
            }

            #[target_feature(enable = "sse4.1")]
            #[target_feature(enable = "ssse3")]
            unsafe fn impl_sse41($($p: $pty),*) -> $out {
                fn_impl($crate::x86_64::SSE41::instance(), $($p),*)
            }

            #[target_feature(enable = "ssse3")]
            unsafe fn impl_ssse3($($p: $pty),*) -> $out {
                fn_impl($crate::x86_64::SSSE3::instance(), $($p),*)
            }

            #[target_feature(enable = "sse2")]
            unsafe fn impl_sse2($($p: $pty),*) -> $out {
                fn_impl($crate::x86_64::SSE2::instance(), $($p),*)
            }

            // Most capable feature set first. Each wrapper is only called after the feature it
            // was compiled for has been detected on the running CPU.
            if is_x86_feature_detected!("avx2") {
                return unsafe { impl_avx2($($p),*) };
            }
            if is_x86_feature_detected!("avx") {
                return unsafe { impl_avx($($p),*) };
            }
            if is_x86_feature_detected!("sse4.1") {
                return unsafe { impl_sse41($($p),*) };
            }
            if is_x86_feature_detected!("ssse3") {
                return unsafe { impl_ssse3($($p),*) };
            }
            if is_x86_feature_detected!("sse2") {
                return unsafe { impl_sse2($($p),*) };
            }
            unimplemented!()
        }

        #[cfg(not(feature = "std"))]
        #[inline(always)]
        $($access $(($scope))*)* fn $name($($p: $pty),*) -> $out {
            unsafe fn fn_impl<$M: $crate::Machine>($m: $M, $($p: $pty),*) -> $out $code

            // No runtime detection here: `cfg!` is a compile-time constant, so exactly one of
            // the calls below is ever reached.
            if cfg!(target_feature = "avx2") {
                return unsafe { fn_impl($crate::x86_64::AVX2::instance(), $($p),*) };
            }
            if cfg!(target_feature = "avx") {
                return unsafe { fn_impl($crate::x86_64::AVX::instance(), $($p),*) };
            }
            if cfg!(target_feature = "sse4.1") {
                return unsafe { fn_impl($crate::x86_64::SSE41::instance(), $($p),*) };
            }
            if cfg!(target_feature = "ssse3") {
                return unsafe { fn_impl($crate::x86_64::SSSE3::instance(), $($p),*) };
            }
            unsafe { fn_impl($crate::x86_64::SSE2::instance(), $($p),*) }
        }
    };
    // No return type given: forward to the main arm with `-> ()`.
    ($m:ident, $M:ident, { $([$access:tt $(($scope:tt))*])* fn $name:ident($($p:ident: $pty:ty),* $(,)*) $code:block }) => {
        dispatch!($m, $M, {
            $([$access $(($scope))*])* fn $name($($p: $pty),*) -> () $code
        });
    }
}
318
/// Generate only the basic implementations necessary to be able to operate efficiently on 128-bit
/// vectors on this platform. For x86-64, that would mean SSE2 and AVX.
///
/// This dispatcher is suitable for vector operations that do not benefit from advanced hardware
/// features (e.g. because they are done infrequently), so minimizing their contribution to code
/// size is more important.
///
/// The invocation shape is the same as for `dispatch!`:
/// `dispatch_light128!(mach, MTy, { [pub] fn name(arg: Ty, ...) -> Ret { body } })`, with the
/// bracketed visibility and the return type both optional.
#[macro_export]
macro_rules! dispatch_light128 {
    ($m:ident, $M:ident, { $([$access:tt $(($scope:tt))*])* fn $name:ident($($p:ident: $pty:ty),* $(,)*) -> $out:ty $code:block }) => {
        #[cfg(feature = "std")]
        $($access $(($scope))*)* fn $name($($p: $pty),*) -> $out {
            use std::arch::x86_64::*;

            // The caller's body, generic over the machine; inlined into each wrapper below so
            // that it is compiled with that wrapper's target features.
            #[inline(always)]
            fn fn_impl<$M: $crate::Machine>($m: $M, $($p: $pty),*) -> $out $code

            #[target_feature(enable = "avx")]
            unsafe fn impl_avx($($p: $pty),*) -> $out {
                fn_impl($crate::x86_64::AVX::instance(), $($p),*)
            }

            #[target_feature(enable = "sse2")]
            unsafe fn impl_sse2($($p: $pty),*) -> $out {
                fn_impl($crate::x86_64::SSE2::instance(), $($p),*)
            }

            // Only two variants are generated to keep code size down. Each wrapper is only
            // called after the feature it was compiled for has been detected.
            if is_x86_feature_detected!("avx") {
                return unsafe { impl_avx($($p),*) };
            }
            if is_x86_feature_detected!("sse2") {
                return unsafe { impl_sse2($($p),*) };
            }
            unimplemented!()
        }

        #[cfg(not(feature = "std"))]
        #[inline(always)]
        $($access $(($scope))*)* fn $name($($p: $pty),*) -> $out {
            unsafe fn fn_impl<$M: $crate::Machine>($m: $M, $($p: $pty),*) -> $out $code

            // No runtime detection here: `cfg!` is a compile-time constant, so exactly one of
            // the calls below is ever reached.
            if cfg!(target_feature = "avx2") {
                return unsafe { fn_impl($crate::x86_64::AVX2::instance(), $($p),*) };
            }
            if cfg!(target_feature = "avx") {
                return unsafe { fn_impl($crate::x86_64::AVX::instance(), $($p),*) };
            }
            if cfg!(target_feature = "sse4.1") {
                return unsafe { fn_impl($crate::x86_64::SSE41::instance(), $($p),*) };
            }
            if cfg!(target_feature = "ssse3") {
                return unsafe { fn_impl($crate::x86_64::SSSE3::instance(), $($p),*) };
            }
            unsafe { fn_impl($crate::x86_64::SSE2::instance(), $($p),*) }
        }
    };
    // No return type given: forward to the main arm with `-> ()`.
    ($m:ident, $M:ident, { $([$access:tt $(($scope:tt))*])* fn $name:ident($($p:ident: $pty:ty),* $(,)*) $code:block }) => {
        dispatch_light128!($m, $M, {
            $([$access $(($scope))*])* fn $name($($p: $pty),*) -> () $code
        });
    }
}
376
/// Generate only the basic implementations necessary to be able to operate efficiently on 256-bit
/// vectors on this platform. For x86-64, that would mean SSE2, AVX, and AVX2.
///
/// This dispatcher is suitable for vector operations that do not benefit from advanced hardware
/// features (e.g. because they are done infrequently), so minimizing their contribution to code
/// size is more important.
///
/// The invocation shape is the same as for `dispatch!`:
/// `dispatch_light256!(mach, MTy, { [pub] fn name(arg: Ty, ...) -> Ret { body } })`, with the
/// bracketed visibility and the return type both optional.
///
/// NOTE(review): with the `std` feature, runtime detection currently chooses only between the
/// AVX and SSE2 instantiations; no AVX2 variant is generated despite the sentence above.
#[macro_export]
macro_rules! dispatch_light256 {
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
        #[cfg(feature = "std")]
        // Emit the visibility tokens themselves. Re-emitting the bracketed group they were
        // matched in (`[pub] fn ...`) is not valid item syntax and breaks every invocation that
        // passes a visibility, even when this item is cfg'd out.
        $($pub $(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            // The caller's body, generic over the machine; inlined into each wrapper below so
            // that it is compiled with that wrapper's target features.
            #[inline(always)]
            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            use std::arch::x86_64::*;
            #[target_feature(enable = "avx")]
            unsafe fn impl_avx($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
            }
            #[target_feature(enable = "sse2")]
            unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
            }
            // Each wrapper is only called after the feature it was compiled for has been
            // detected on the running CPU.
            unsafe {
                if is_x86_feature_detected!("avx") {
                    impl_avx($($arg),*)
                } else if is_x86_feature_detected!("sse2") {
                    impl_sse2($($arg),*)
                } else {
                    unimplemented!()
                }
            }
        }
        #[cfg(not(feature = "std"))]
        #[inline(always)]
        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            // No runtime detection here: `cfg!` is a compile-time constant, so exactly one
            // branch is ever taken.
            unsafe {
                if cfg!(target_feature = "avx2") {
                    fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
                } else if cfg!(target_feature = "avx") {
                    fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
                } else if cfg!(target_feature = "sse4.1") {
                    fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
                } else if cfg!(target_feature = "ssse3") {
                    fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
                } else {
                    fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
                }
            }
        }
    };
    // No return type given: forward to the main arm with `-> ()`.
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
        dispatch_light256!($mach, $MTy, {
            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
        });
    }
}