//! Vectorized Carry-less Multiplication (VCLMUL)
//!
//! The reference is [Intel 64 and IA-32 Architectures Software Developer's
//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref] (p. 4-241).
//!
//! [intel64_ref]: http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
8 use crate::core_arch
::x86
::__m256i
;
9 use crate::core_arch
::x86
::__m512i
;
12 use crate::stdarch_test
::assert_instr
;
14 #[allow(improper_ctypes)]
16 #[link_name = "llvm.x86.pclmulqdq.256"]
17 fn pclmulqdq_256(a
: __m256i
, round_key
: __m256i
, imm8
: u8) -> __m256i
;
18 #[link_name = "llvm.x86.pclmulqdq.512"]
19 fn pclmulqdq_512(a
: __m512i
, round_key
: __m512i
, imm8
: u8) -> __m512i
;
22 // for some odd reason on x86_64 we generate the correct long name instructions
23 // but on i686 we generate the short name + imm8
24 // so we need to special-case on that...
26 /// Performs a carry-less multiplication of two 64-bit polynomials over the
27 /// finite field GF(2^k) - in each of the 4 128-bit lanes.
29 /// The immediate byte is used for determining which halves of each lane `a` and `b`
30 /// should be used. Immediate bits other than 0 and 4 are ignored.
31 /// All lanes share immediate byte.
33 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_clmulepi64_epi128)
35 #[target_feature(enable = "avx512vpclmulqdq,avx512f")]
36 // technically according to Intel's documentation we don't need avx512f here, however LLVM gets confused otherwise
37 #[cfg_attr(test, assert_instr(vpclmul, imm8 = 0))]
38 #[rustc_args_required_const(2)]
39 pub unsafe fn _mm512_clmulepi64_epi128(a
: __m512i
, b
: __m512i
, imm8
: i32) -> __m512i
{
42 pclmulqdq_512(a
, b
, $imm8
)
45 constify_imm8
!(imm8
, call
)
48 /// Performs a carry-less multiplication of two 64-bit polynomials over the
49 /// finite field GF(2^k) - in each of the 2 128-bit lanes.
51 /// The immediate byte is used for determining which halves of each lane `a` and `b`
52 /// should be used. Immediate bits other than 0 and 4 are ignored.
53 /// All lanes share immediate byte.
55 /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_clmulepi64_epi128)
57 #[target_feature(enable = "avx512vpclmulqdq,avx512vl")]
58 #[cfg_attr(test, assert_instr(vpclmul, imm8 = 0))]
59 #[rustc_args_required_const(2)]
60 pub unsafe fn _mm256_clmulepi64_epi128(a
: __m256i
, b
: __m256i
, imm8
: i32) -> __m256i
{
63 pclmulqdq_256(a
, b
, $imm8
)
66 constify_imm8
!(imm8
, call
)
71 // The constants in the tests below are just bit patterns. They should not
72 // be interpreted as integers; signedness does not make sense for them, but
73 // __mXXXi happens to be defined in terms of signed integers.
74 #![allow(overflowing_literals)]
76 use stdarch_test
::simd_test
;
78 use crate::core_arch
::x86
::*;
// Known-answer test shared by the 256- and 512-bit intrinsics: broadcasts a
// 128-bit test vector to every lane with `$broadcast`, runs `$clmul` for each
// of the four immediates, and compares against reference results with
// `$assert`. (Fix: restored the macro braces and arm terminator dropped from
// the mangled source.)
macro_rules! verify_kat_pclmul {
    ($broadcast:ident, $clmul:ident, $assert:ident) => {
        // Constants taken from https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf
        let a = _mm_set_epi64x(0x7b5b546573745665, 0x63746f725d53475d);
        let a = $broadcast(a);
        let b = _mm_set_epi64x(0x4869285368617929, 0x5b477565726f6e5d);
        let b = $broadcast(b);
        let r00 = _mm_set_epi64x(0x1d4d84c85c3440c0, 0x929633d5d36f0451);
        let r00 = $broadcast(r00);
        let r01 = _mm_set_epi64x(0x1bd17c8d556ab5a1, 0x7fa540ac2a281315);
        let r01 = $broadcast(r01);
        let r10 = _mm_set_epi64x(0x1a2bf6db3a30862f, 0xbabf262df4b7d5c9);
        let r10 = $broadcast(r10);
        let r11 = _mm_set_epi64x(0x1d1e1f2c592e7c45, 0xd66ee03e410fd4ed);
        let r11 = $broadcast(r11);

        // One check per immediate: low/high 64-bit half selection of each lane.
        $assert($clmul(a, b, 0x00), r00);
        $assert($clmul(a, b, 0x10), r01);
        $assert($clmul(a, b, 0x01), r10);
        $assert($clmul(a, b, 0x11), r11);

        // Minimal sanity case: 0x8000...0000 clmul itself carries into the
        // high half (0x4000...0000 : 0x0000...0000).
        let a0 = _mm_set_epi64x(0x0000000000000000, 0x8000000000000000);
        let a0 = $broadcast(a0);
        let r = _mm_set_epi64x(0x4000000000000000, 0x0000000000000000);
        let r = $broadcast(r);
        $assert($clmul(a0, a0, 0x00), r);
    };
}
// Manually-unrolled helper: expands `$target[i] = $op($source, i);` (or the
// `assert_eq_m128i` comparison form) for i = N-1 down to 0. The 4-element arms
// emit indices 3 and 2 and then recurse into the matching 2-element arm.
// (Fix: restored the `};` arm terminators and macro braces dropped from the
// mangled source.)
macro_rules! unroll {
    ($target:ident[4] = $op:ident($source:ident,4);) => {
        $target[3] = $op($source, 3);
        $target[2] = $op($source, 2);
        unroll! {$target[2] = $op($source,2);}
    };
    ($target:ident[2] = $op:ident($source:ident,2);) => {
        $target[1] = $op($source, 1);
        $target[0] = $op($source, 0);
    };
    (assert_eq_m128i($op:ident($vec_res:ident,4),$lin_res:ident[4]);) => {
        assert_eq_m128i($op($vec_res, 3), $lin_res[3]);
        assert_eq_m128i($op($vec_res, 2), $lin_res[2]);
        unroll! {assert_eq_m128i($op($vec_res,2),$lin_res[2]);}
    };
    (assert_eq_m128i($op:ident($vec_res:ident,2),$lin_res:ident[2]);) => {
        assert_eq_m128i($op($vec_res, 1), $lin_res[1]);
        assert_eq_m128i($op($vec_res, 0), $lin_res[0]);
    };
}
130 // this function tests one of the possible 4 instances
131 // with different inputs across lanes
// NOTE(review): this fragment is whitespace-mangled and several original lines
// are missing (the `_mm512_set_epi64` constant arguments, the `for` loop
// header, and closing braces). Left byte-identical rather than reconstructed,
// since the test constants cannot be recovered from what is visible here.
132 #[target_feature(enable = "avx512vpclmulqdq,avx512f")]
// Takes the 128-bit reference implementation (`linear`) and the 512-bit
// implementation under test (`vectorized`) and compares them lane by lane.
133 unsafe fn verify_512_helper(
134 linear
: unsafe fn(__m128i
, __m128i
) -> __m128i
,
135 vectorized
: unsafe fn(__m512i
, __m512i
) -> __m512i
,
// Build one 512-bit input per operand; the eight 64-bit constant arguments
// were lost in the mangling — TODO restore from upstream.
137 let a
= _mm512_set_epi64(
147 let b
= _mm512_set_epi64(
// Decompose each operand into its four 128-bit lanes via `unroll!`.
158 let mut a_decomp
= [_mm_setzero_si128(); 4];
159 unroll
! {a_decomp[4] = _mm512_extracti32x4_epi32(a,4);}
160 let mut b_decomp
= [_mm_setzero_si128(); 4];
161 unroll
! {b_decomp[4] = _mm512_extracti32x4_epi32(b,4);}
// Single full-width call under test.
163 let r
= vectorized(a
, b
);
// Per-lane expected results from the 128-bit reference implementation
// (the enclosing `for` header over lane index `i` is missing here).
164 let mut e_decomp
= [_mm_setzero_si128(); 4];
166 e_decomp
[i
] = linear(a_decomp
[i
], b_decomp
[i
]);
// Every extracted lane of the vectorized result must match its reference.
168 unroll
! {assert_eq_m128i(_mm512_extracti32x4_epi32(r,4),e_decomp[4]);}
171 // this function tests one of the possible 4 instances
172 // with different inputs across lanes for the VL version
// NOTE(review): mangled fragment with missing lines (the `_mm512_set_epi64`
// constants, the `let r = vectorized(...)` header, the `for` loop header, and
// closing braces). Left byte-identical; constants are not recoverable here.
173 #[target_feature(enable = "avx512vpclmulqdq,avx512vl")]
// Compares the 128-bit reference (`linear`) against the 256-bit VL
// implementation under test (`vectorized`), lane by lane.
174 unsafe fn verify_256_helper(
175 linear
: unsafe fn(__m128i
, __m128i
) -> __m128i
,
176 vectorized
: unsafe fn(__m256i
, __m256i
) -> __m256i
,
// 512-bit staging values; only the low 256 bits appear to be fed to
// `vectorized` below via `_mm512_extracti64x4_epi64(_, 0)`.
178 let a
= _mm512_set_epi64(
188 let b
= _mm512_set_epi64(
// Decompose the two 128-bit lanes of each operand via `unroll!`.
199 let mut a_decomp
= [_mm_setzero_si128(); 2];
200 unroll
! {a_decomp[2] = _mm512_extracti32x4_epi32(a,2);}
201 let mut b_decomp
= [_mm_setzero_si128(); 2];
202 unroll
! {b_decomp[2] = _mm512_extracti32x4_epi32(b,2);}
// Arguments to the (missing) `vectorized(...)` call: the low 256 bits of
// each 512-bit staging value.
205 _mm512_extracti64x4_epi64(a
, 0),
206 _mm512_extracti64x4_epi64(b
, 0),
// Per-lane expected results (the `for` header over `i` is missing here).
208 let mut e_decomp
= [_mm_setzero_si128(); 2];
210 e_decomp
[i
] = linear(a_decomp
[i
], b_decomp
[i
]);
// Both extracted lanes of the vectorized result must match their reference.
212 unroll
! {assert_eq_m128i(_mm256_extracti128_si256(r,2),e_decomp[2]);}
215 #[simd_test(enable = "avx512vpclmulqdq,avx512f")]
216 unsafe fn test_mm512_clmulepi64_epi128() {
218 _mm512_broadcast_i32x4
,
219 _mm512_clmulepi64_epi128
,
224 |a
, b
| _mm_clmulepi64_si128(a
, b
, 0x00),
225 |a
, b
| _mm512_clmulepi64_epi128(a
, b
, 0x00),
228 |a
, b
| _mm_clmulepi64_si128(a
, b
, 0x01),
229 |a
, b
| _mm512_clmulepi64_epi128(a
, b
, 0x01),
232 |a
, b
| _mm_clmulepi64_si128(a
, b
, 0x10),
233 |a
, b
| _mm512_clmulepi64_epi128(a
, b
, 0x10),
236 |a
, b
| _mm_clmulepi64_si128(a
, b
, 0x11),
237 |a
, b
| _mm512_clmulepi64_epi128(a
, b
, 0x11),
// NOTE(review): mangled fragment whose closing tokens run past the visible
// end of this chunk; left byte-identical rather than reconstructed.
241 #[simd_test(enable = "avx512vpclmulqdq,avx512vl")]
242 unsafe fn test_mm256_clmulepi64_epi128() {
// Known-answer test arguments — presumably wrapped in a (missing)
// `verify_kat_pclmul!(...)` invocation, per the macro's
// ($broadcast, $clmul, $assert) signature; the $assert argument is not
// visible here — TODO confirm against upstream.
244 _mm256_broadcastsi128_si256
,
245 _mm256_clmulepi64_epi128
,
// Cross-lane checks: closure pairs (reference, under-test), one per
// immediate — presumably each pair is an argument list to a (missing)
// `verify_256_helper(...)` call.
250 |a
, b
| _mm_clmulepi64_si128(a
, b
, 0x00),
251 |a
, b
| _mm256_clmulepi64_epi128(a
, b
, 0x00),
254 |a
, b
| _mm_clmulepi64_si128(a
, b
, 0x01),
255 |a
, b
| _mm256_clmulepi64_epi128(a
, b
, 0x01),
258 |a
, b
| _mm_clmulepi64_si128(a
, b
, 0x10),
259 |a
, b
| _mm256_clmulepi64_epi128(a
, b
, 0x10),
262 |a
, b
| _mm_clmulepi64_si128(a
, b
, 0x11),
263 |a
, b
| _mm256_clmulepi64_epi128(a
, b
, 0x11),