1 // crate minimums: sse2, x86_64
3 use core
::arch
::x86_64
::{__m128i, __m256i}
;
10 #[derive(Copy, Clone)]
13 #[derive(Copy, Clone)]
15 #[derive(Copy, Clone)]
18 #[derive(Copy, Clone)]
20 #[derive(Copy, Clone)]
23 #[derive(Copy, Clone)]
25 #[derive(Copy, Clone)]
28 #[derive(Copy, Clone)]
30 #[derive(Copy, Clone)]
33 use core
::marker
::PhantomData
;
/// Marker type selecting one of the SSE-family machine configurations.
///
/// The three type parameters are carried only through `PhantomData`, so a value of this type
/// holds no data of type `S3`, `S4` or `NI`. The `SSE2`, `SSSE3`, `SSE41` and `AVX` aliases in
/// this file instantiate it with the `YesS3`/`NoS3`, `YesS4`/`NoS4` and `NoNI` markers.
35 #[derive(Copy, Clone)]
36 pub struct SseMachine
<S3
, S4
, NI
>(PhantomData
<(S3
, S4
, NI
)>);
37 impl<S3
: Copy
, S4
: Copy
, NI
: Copy
> Machine
for SseMachine
<S3
, S4
, NI
>
39 sse2
::u128x1_sse2
<S3
, S4
, NI
>: Swap64
,
40 sse2
::u64x2_sse2
<S3
, S4
, NI
>: BSwap
+ RotateEachWord32
+ MultiLane
<[u64; 2]> + Vec2
<u64>,
41 sse2
::u32x4_sse2
<S3
, S4
, NI
>: BSwap
+ RotateEachWord32
+ MultiLane
<[u32; 4]> + Vec4
<u32>,
42 sse2
::u64x4_sse2
<S3
, S4
, NI
>: BSwap
+ Words4
,
43 sse2
::u128x1_sse2
<S3
, S4
, NI
>: BSwap
,
44 sse2
::u128x2_sse2
<S3
, S4
, NI
>: Into
<sse2
::u64x2x2_sse2
<S3
, S4
, NI
>>,
45 sse2
::u128x2_sse2
<S3
, S4
, NI
>: Into
<sse2
::u64x4_sse2
<S3
, S4
, NI
>>,
46 sse2
::u128x2_sse2
<S3
, S4
, NI
>: Into
<sse2
::u32x4x2_sse2
<S3
, S4
, NI
>>,
47 sse2
::u128x4_sse2
<S3
, S4
, NI
>: Into
<sse2
::u64x2x4_sse2
<S3
, S4
, NI
>>,
48 sse2
::u128x4_sse2
<S3
, S4
, NI
>: Into
<sse2
::u32x4x4_sse2
<S3
, S4
, NI
>>,
50 type u32x4
= sse2
::u32x4_sse2
<S3
, S4
, NI
>;
51 type u64x2
= sse2
::u64x2_sse2
<S3
, S4
, NI
>;
52 type u128x1
= sse2
::u128x1_sse2
<S3
, S4
, NI
>;
54 type u32x4x2
= sse2
::u32x4x2_sse2
<S3
, S4
, NI
>;
55 type u64x2x2
= sse2
::u64x2x2_sse2
<S3
, S4
, NI
>;
56 type u64x4
= sse2
::u64x4_sse2
<S3
, S4
, NI
>;
57 type u128x2
= sse2
::u128x2_sse2
<S3
, S4
, NI
>;
59 type u32x4x4
= sse2
::u32x4x4_sse2
<S3
, S4
, NI
>;
60 type u64x2x4
= sse2
::u64x2x4_sse2
<S3
, S4
, NI
>;
61 type u128x4
= sse2
::u128x4_sse2
<S3
, S4
, NI
>;
64 unsafe fn instance() -> Self {
65 SseMachine(PhantomData
)
/// Marker type selecting the AVX2 machine configuration, generic over `NI` only.
///
/// Its single field is a `PhantomData<NI>`, so it holds no data of type `NI`. The `AVX2` alias
/// in this file instantiates it with `NoNI`.
69 #[derive(Copy, Clone)]
70 pub struct Avx2Machine
<NI
>(PhantomData
<NI
>);
71 impl<NI
: Copy
> Machine
for Avx2Machine
<NI
>
73 sse2
::u128x1_sse2
<YesS3
, YesS4
, NI
>: BSwap
+ Swap64
,
74 sse2
::u64x2_sse2
<YesS3
, YesS4
, NI
>: BSwap
+ RotateEachWord32
+ MultiLane
<[u64; 2]> + Vec2
<u64>,
75 sse2
::u32x4_sse2
<YesS3
, YesS4
, NI
>: BSwap
+ RotateEachWord32
+ MultiLane
<[u32; 4]> + Vec4
<u32>,
76 sse2
::u64x4_sse2
<YesS3
, YesS4
, NI
>: BSwap
+ Words4
,
78 type u32x4
= sse2
::u32x4_sse2
<YesS3
, YesS4
, NI
>;
79 type u64x2
= sse2
::u64x2_sse2
<YesS3
, YesS4
, NI
>;
80 type u128x1
= sse2
::u128x1_sse2
<YesS3
, YesS4
, NI
>;
82 type u32x4x2
= sse2
::u32x4x2_sse2
<YesS3
, YesS4
, NI
>;
83 type u64x2x2
= sse2
::u64x2x2_sse2
<YesS3
, YesS4
, NI
>;
84 type u64x4
= sse2
::u64x4_sse2
<YesS3
, YesS4
, NI
>;
85 type u128x2
= sse2
::u128x2_sse2
<YesS3
, YesS4
, NI
>;
87 type u32x4x4
= sse2
::avx2
::u32x4x4_avx2
<NI
>;
88 type u64x2x4
= sse2
::u64x2x4_sse2
<YesS3
, YesS4
, NI
>;
89 type u128x4
= sse2
::u128x4_sse2
<YesS3
, YesS4
, NI
>;
92 unsafe fn instance() -> Self {
93 Avx2Machine(PhantomData
)
/// `SseMachine` with every parameter set to its `No*` marker (`NoS3`, `NoS4`, `NoNI`).
97 pub type SSE2
= SseMachine
<NoS3
, NoS4
, NoNI
>;
/// `SseMachine` with `YesS3`, but still `NoS4` and `NoNI`.
98 pub type SSSE3
= SseMachine
<YesS3
, NoS4
, NoNI
>;
/// `SseMachine` with `YesS3` and `YesS4`, and `NoNI`.
99 pub type SSE41
= SseMachine
<YesS3
, YesS4
, NoNI
>;
100 /// AVX but not AVX2: only 128-bit integer operations, but use VEX versions of everything
101 /// to avoid expensive SSE/VEX conflicts.
///
/// Resolves to the same `SseMachine<YesS3, YesS4, NoNI>` type as `SSE41`.
102 pub type AVX
= SseMachine
<YesS3
, YesS4
, NoNI
>;
/// `Avx2Machine` instantiated with `NoNI`.
103 pub type AVX2
= Avx2Machine
<NoNI
>;
105 /// Generic wrapper for unparameterized storage of any of the possible impls.
106 /// Converting into and out of this type should be essentially free, although it may be more
107 /// aligned than a particular impl requires.
108 #[allow(non_camel_case_types)]
109 #[derive(Copy, Clone)]
110 pub union vec128_storage
{
116 impl Store
<vec128_storage
> for vec128_storage
{
118 unsafe fn unpack(p
: vec128_storage
) -> Self {
122 impl<'a
> Into
<&'a
[u32; 4]> for &'a vec128_storage
{
124 fn into(self) -> &'a
[u32; 4] {
125 unsafe { &self.u32x4 }
128 impl Into
<vec128_storage
> for [u32; 4] {
130 fn into(self) -> vec128_storage
{
131 vec128_storage { u32x4: self }
134 impl Default
for vec128_storage
{
136 fn default() -> Self {
137 vec128_storage { u128x1: [0] }
140 impl Eq
for vec128_storage {}
141 impl PartialEq
for vec128_storage
{
143 fn eq(&self, rhs
: &Self) -> bool
{
144 unsafe { self.u128x1 == rhs.u128x1 }
148 #[allow(non_camel_case_types)]
149 #[derive(Copy, Clone)]
150 pub union vec256_storage
{
154 sse2
: [vec128_storage
; 2],
157 impl Into
<vec256_storage
> for [u64; 4] {
159 fn into(self) -> vec256_storage
{
160 vec256_storage { u64x4: self }
163 impl Default
for vec256_storage
{
165 fn default() -> Self {
166 vec256_storage { u128x2: [0, 0] }
169 impl vec256_storage
{
170 pub fn new128(xs
: [vec128_storage
; 2]) -> Self {
173 pub fn split128(self) -> [vec128_storage
; 2] {
177 impl Eq
for vec256_storage {}
178 impl PartialEq
for vec256_storage
{
180 fn eq(&self, rhs
: &Self) -> bool
{
181 unsafe { self.sse2 == rhs.sse2 }
185 #[allow(non_camel_case_types)]
186 #[derive(Copy, Clone)]
187 pub union vec512_storage
{
191 sse2
: [vec128_storage
; 4],
192 avx
: [vec256_storage
; 2],
194 impl Default
for vec512_storage
{
196 fn default() -> Self {
198 u128x4
: [0, 0, 0, 0],
202 impl vec512_storage
{
203 pub fn new128(xs
: [vec128_storage
; 4]) -> Self {
206 pub fn split128(self) -> [vec128_storage
; 4] {
210 impl Eq
for vec512_storage {}
211 impl PartialEq
for vec512_storage
{
213 fn eq(&self, rhs
: &Self) -> bool
{
214 unsafe { self.avx == rhs.avx }
218 macro_rules
! impl_into
{
219 ($storage
:ident
, $array
:ty
, $name
:ident
) => {
220 impl Into
<$array
> for $storage
{
222 fn into(self) -> $array
{
223 unsafe { self.$name }
228 impl_into
!(vec128_storage
, [u32; 4], u32x4
);
229 impl_into
!(vec128_storage
, [u64; 2], u64x2
);
230 impl_into
!(vec128_storage
, [u128
; 1], u128x1
);
231 impl_into
!(vec256_storage
, [u32; 8], u32x8
);
232 impl_into
!(vec256_storage
, [u64; 4], u64x4
);
233 impl_into
!(vec256_storage
, [u128
; 2], u128x2
);
234 impl_into
!(vec512_storage
, [u32; 16], u32x16
);
235 impl_into
!(vec512_storage
, [u64; 8], u64x8
);
236 impl_into
!(vec512_storage
, [u128
; 4], u128x4
);
238 /// Generate the full set of optimized implementations to take advantage of the most important
239 /// hardware feature sets.
241 /// This dispatcher is suitable for maximizing throughput.
243 macro_rules
! dispatch
{
244 ($mach
:ident
, $MTy
:ident
, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }
) => {
245 #[cfg(feature = "std")]
246 $
($
pub$
(($krate
))*)* fn $
name($
($arg
: $argty
),*) -> $ret
{
248 fn fn_impl
<$MTy
: $
crate::Machine
>($mach
: $MTy
, $
($arg
: $argty
),*) -> $ret $body
249 use std
::arch
::x86_64
::*;
250 #[target_feature(enable = "avx2")]
251 unsafe fn impl_avx2($
($arg
: $argty
),*) -> $ret
{
252 let ret
= fn_impl($
crate::x86_64
::AVX2
::instance(), $
($arg
),*);
256 #[target_feature(enable = "avx")]
257 #[target_feature(enable = "sse4.1")]
258 #[target_feature(enable = "ssse3")]
259 unsafe fn impl_avx($
($arg
: $argty
),*) -> $ret
{
260 let ret
= fn_impl($
crate::x86_64
::AVX
::instance(), $
($arg
),*);
264 #[target_feature(enable = "sse4.1")]
265 #[target_feature(enable = "ssse3")]
266 unsafe fn impl_sse41($
($arg
: $argty
),*) -> $ret
{
267 fn_impl($
crate::x86_64
::SSE41
::instance(), $
($arg
),*)
269 #[target_feature(enable = "ssse3")]
270 unsafe fn impl_ssse3($
($arg
: $argty
),*) -> $ret
{
271 fn_impl($
crate::x86_64
::SSSE3
::instance(), $
($arg
),*)
273 #[target_feature(enable = "sse2")]
274 unsafe fn impl_sse2($
($arg
: $argty
),*) -> $ret
{
275 fn_impl($
crate::x86_64
::SSE2
::instance(), $
($arg
),*)
278 if is_x86_feature_detected
!("avx2") {
280 } else if is_x86_feature_detected
!("avx") {
282 } else if is_x86_feature_detected
!("sse4.1") {
283 impl_sse41($
($arg
),*)
284 } else if is_x86_feature_detected
!("ssse3") {
285 impl_ssse3($
($arg
),*)
286 } else if is_x86_feature_detected
!("sse2") {
293 #[cfg(not(feature = "std"))]
295 $
($
pub$
(($krate
))*)* fn $
name($
($arg
: $argty
),*) -> $ret
{
296 unsafe fn fn_impl
<$MTy
: $
crate::Machine
>($mach
: $MTy
, $
($arg
: $argty
),*) -> $ret $body
298 if cfg
!(target_feature
= "avx2") {
299 fn_impl($
crate::x86_64
::AVX2
::instance(), $
($arg
),*)
300 } else if cfg
!(target_feature
= "avx") {
301 fn_impl($
crate::x86_64
::AVX
::instance(), $
($arg
),*)
302 } else if cfg
!(target_feature
= "sse4.1") {
303 fn_impl($
crate::x86_64
::SSE41
::instance(), $
($arg
),*)
304 } else if cfg
!(target_feature
= "ssse3") {
305 fn_impl($
crate::x86_64
::SSSE3
::instance(), $
($arg
),*)
307 fn_impl($
crate::x86_64
::SSE2
::instance(), $
($arg
),*)
312 ($mach
:ident
, $MTy
:ident
, { $([$pub:tt $(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }
) => {
313 dispatch
!($mach
, $MTy
, {
314 $
([$
pub $
(($krate
))*])* fn $
name($
($arg
: $argty
),*) -> () $body
319 /// Generate only the basic implementations necessary to be able to operate efficiently on 128-bit
320 /// vectors on this platform. For x86-64, that would mean SSE2 and AVX.
322 /// This dispatcher is suitable for vector operations that do not benefit from advanced hardware
323 /// features (e.g. because they are done infrequently), so minimizing their contribution to code
324 /// size is more important.
326 macro_rules
! dispatch_light128
{
327 ($mach
:ident
, $MTy
:ident
, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }
) => {
328 #[cfg(feature = "std")]
329 $
($
pub $
(($krate
))*)* fn $
name($
($arg
: $argty
),*) -> $ret
{
331 fn fn_impl
<$MTy
: $
crate::Machine
>($mach
: $MTy
, $
($arg
: $argty
),*) -> $ret $body
332 use std
::arch
::x86_64
::*;
333 #[target_feature(enable = "avx")]
334 unsafe fn impl_avx($
($arg
: $argty
),*) -> $ret
{
335 fn_impl($
crate::x86_64
::AVX
::instance(), $
($arg
),*)
337 #[target_feature(enable = "sse2")]
338 unsafe fn impl_sse2($
($arg
: $argty
),*) -> $ret
{
339 fn_impl($
crate::x86_64
::SSE2
::instance(), $
($arg
),*)
342 if is_x86_feature_detected
!("avx") {
344 } else if is_x86_feature_detected
!("sse2") {
351 #[cfg(not(feature = "std"))]
353 $
($
pub$
(($krate
))*)* fn $
name($
($arg
: $argty
),*) -> $ret
{
354 unsafe fn fn_impl
<$MTy
: $
crate::Machine
>($mach
: $MTy
, $
($arg
: $argty
),*) -> $ret $body
356 if cfg
!(target_feature
= "avx2") {
357 fn_impl($
crate::x86_64
::AVX2
::instance(), $
($arg
),*)
358 } else if cfg
!(target_feature
= "avx") {
359 fn_impl($
crate::x86_64
::AVX
::instance(), $
($arg
),*)
360 } else if cfg
!(target_feature
= "sse4.1") {
361 fn_impl($
crate::x86_64
::SSE41
::instance(), $
($arg
),*)
362 } else if cfg
!(target_feature
= "ssse3") {
363 fn_impl($
crate::x86_64
::SSSE3
::instance(), $
($arg
),*)
365 fn_impl($
crate::x86_64
::SSE2
::instance(), $
($arg
),*)
370 ($mach
:ident
, $MTy
:ident
, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }
) => {
371 dispatch_light128
!($mach
, $MTy
, {
372 $
([$
pub $
(($krate
))*])* fn $
name($
($arg
: $argty
),*) -> () $body
377 /// Generate only the basic implementations necessary to be able to operate efficiently on 256-bit
378 /// vectors on this platform. For x86-64, that would mean SSE2, AVX, and AVX2.
380 /// This dispatcher is suitable for vector operations that do not benefit from advanced hardware
381 /// features (e.g. because they are done infrequently), so minimizing their contribution to code
382 /// size is more important.
// NOTE(review): the `std` branch below only generates `impl_avx` and `impl_sse2`; no
// AVX2-specific function is visible there. Confirm whether the AVX2 mention above is intended.
//
// Input shape: `dispatch_light256!(mach, MTy, { [vis]* fn name(args) -> Ret { body } })`.
// Visibility is passed in square brackets (e.g. `[pub]`) and captured by the matcher as
// `$([$pub:tt$(($krate:tt))*])*`.
384 macro_rules
! dispatch_light256
{
385 ($mach
:ident
, $MTy
:ident
, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }
) => {
// `std` build: the implementation is chosen at run time with `is_x86_feature_detected!`.
386 #[cfg(feature = "std")]
// The square brackets are part of the matcher syntax only and must be stripped when the item
// is emitted, as the `not(feature = "std")` branch below already does. Re-emitting them
// expanded to `[pub] fn ...`, which is not valid item syntax.
387 $
($
pub $
(($krate
))*)* fn $
name($
($arg
: $argty
),*) -> $ret
{
// Generic body supplied by the caller, instantiated once per machine type below.
389 fn fn_impl
<$MTy
: $
crate::Machine
>($mach
: $MTy
, $
($arg
: $argty
),*) -> $ret $body
390 use std
::arch
::x86_64
::*;
391 #[target_feature(enable = "avx")]
392 unsafe fn impl_avx($
($arg
: $argty
),*) -> $ret
{
393 fn_impl($
crate::x86_64
::AVX
::instance(), $
($arg
),*)
395 #[target_feature(enable = "sse2")]
396 unsafe fn impl_sse2($
($arg
: $argty
),*) -> $ret
{
397 fn_impl($
crate::x86_64
::SSE2
::instance(), $
($arg
),*)
400 if is_x86_feature_detected
!("avx") {
402 } else if is_x86_feature_detected
!("sse2") {
// Non-`std` build: the machine is chosen at compile time from the enabled `target_feature`s.
409 #[cfg(not(feature = "std"))]
411 $
($
pub$
(($krate
))*)* fn $
name($
($arg
: $argty
),*) -> $ret
{
412 unsafe fn fn_impl
<$MTy
: $
crate::Machine
>($mach
: $MTy
, $
($arg
: $argty
),*) -> $ret $body
414 if cfg
!(target_feature
= "avx2") {
415 fn_impl($
crate::x86_64
::AVX2
::instance(), $
($arg
),*)
416 } else if cfg
!(target_feature
= "avx") {
417 fn_impl($
crate::x86_64
::AVX
::instance(), $
($arg
),*)
418 } else if cfg
!(target_feature
= "sse4.1") {
419 fn_impl($
crate::x86_64
::SSE41
::instance(), $
($arg
),*)
420 } else if cfg
!(target_feature
= "ssse3") {
421 fn_impl($
crate::x86_64
::SSSE3
::instance(), $
($arg
),*)
423 fn_impl($
crate::x86_64
::SSE2
::instance(), $
($arg
),*)
// Convenience arm: a function written without a return type is forwarded with `-> ()`.
// The bracketed visibility is kept here because it is fed back into this macro's matcher.
428 ($mach
:ident
, $MTy
:ident
, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }
) => {
429 dispatch_light256
!($mach
, $MTy
, {
430 $
([$
pub $
(($krate
))*])* fn $
name($
($arg
: $argty
),*) -> () $body