]> git.proxmox.com Git - rustc.git/blob - library/stdarch/crates/core_arch/src/x86/avx512vpclmulqdq.rs
New upstream version 1.50.0+dfsg1
[rustc.git] / library / stdarch / crates / core_arch / src / x86 / avx512vpclmulqdq.rs
1 //! Vectorized Carry-less Multiplication (VCLMUL)
2 //!
3 //! The reference is [Intel 64 and IA-32 Architectures Software Developer's
4 //! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref] (p. 4-241).
5 //!
6 //! [intel64_ref]: http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
7
8 use crate::core_arch::x86::__m256i;
9 use crate::core_arch::x86::__m512i;
10
11 #[cfg(test)]
12 use crate::stdarch_test::assert_instr;
13
// Raw FFI bindings to the LLVM intrinsics that back VPCLMULQDQ.
// The `imm8` half-selector is forwarded verbatim; the public wrappers below
// are responsible for turning it into a compile-time constant (via
// `constify_imm8!`) before it reaches these declarations.
// NOTE(review): the second parameter is named `round_key` (CLMUL is commonly
// used in AES-GCM), but it is just the second polynomial operand.
#[allow(improper_ctypes)]
extern "C" {
    #[link_name = "llvm.x86.pclmulqdq.256"]
    fn pclmulqdq_256(a: __m256i, round_key: __m256i, imm8: u8) -> __m256i;
    #[link_name = "llvm.x86.pclmulqdq.512"]
    fn pclmulqdq_512(a: __m512i, round_key: __m512i, imm8: u8) -> __m512i;
}
21
// For some odd reason, on x86_64 we generate the correct long-name
// instructions, but on i686 we generate the short name + imm8, so we need
// to special-case on that in the `assert_instr` expectations below.
25
/// Performs a carry-less multiplication of two 64-bit polynomials over the
/// finite field GF(2^k) - in each of the 4 128-bit lanes.
///
/// The immediate byte is used for determining which halves of each lane `a` and `b`
/// should be used. Immediate bits other than 0 and 4 are ignored.
/// All lanes share immediate byte.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_clmulepi64_epi128)
#[inline]
#[target_feature(enable = "avx512vpclmulqdq,avx512f")]
// technically according to Intel's documentation we don't need avx512f here, however LLVM gets confused otherwise
#[cfg_attr(test, assert_instr(vpclmul, imm8 = 0))]
#[rustc_args_required_const(2)]
pub unsafe fn _mm512_clmulepi64_epi128(a: __m512i, b: __m512i, imm8: i32) -> __m512i {
    // `call!` is invoked by `constify_imm8!` with a literal in every arm, so
    // `pclmulqdq_512` always receives a compile-time-constant selector even
    // though `imm8` arrives here as a runtime `i32` (the
    // `#[rustc_args_required_const(2)]` attribute guarantees callers pass a
    // constant).
    macro_rules! call {
        ($imm8:expr) => {
            pclmulqdq_512(a, b, $imm8)
        };
    }
    constify_imm8!(imm8, call)
}
47
/// Performs a carry-less multiplication of two 64-bit polynomials over the
/// finite field GF(2^k) - in each of the 2 128-bit lanes.
///
/// The immediate byte is used for determining which halves of each lane `a` and `b`
/// should be used. Immediate bits other than 0 and 4 are ignored.
/// All lanes share immediate byte.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_clmulepi64_epi128)
#[inline]
#[target_feature(enable = "avx512vpclmulqdq,avx512vl")]
#[cfg_attr(test, assert_instr(vpclmul, imm8 = 0))]
#[rustc_args_required_const(2)]
pub unsafe fn _mm256_clmulepi64_epi128(a: __m256i, b: __m256i, imm8: i32) -> __m256i {
    // Same const-immediate dispatch as `_mm512_clmulepi64_epi128`:
    // `constify_imm8!` expands `call!` with a literal selector so the FFI
    // intrinsic never sees a non-constant `imm8`.
    macro_rules! call {
        ($imm8:expr) => {
            pclmulqdq_256(a, b, $imm8)
        };
    }
    constify_imm8!(imm8, call)
}
68
#[cfg(test)]
mod tests {
    // The constants in the tests below are just bit patterns. They should not
    // be interpreted as integers; signedness does not make sense for them, but
    // __mXXXi happens to be defined in terms of signed integers.
    #![allow(overflowing_literals)]

    use stdarch_test::simd_test;

    use crate::core_arch::x86::*;

    // Known-answer test: broadcast one 128-bit operand pair to every lane,
    // run the widened clmul for all four half-selector immediates, and
    // compare each lane against the reference products.
    //
    // `$broadcast` replicates an `__m128i` across all lanes of the wide type,
    // `$clmul` is the intrinsic under test, `$assert` compares wide vectors.
    macro_rules! verify_kat_pclmul {
        ($broadcast:ident, $clmul:ident, $assert:ident) => {
            // Constants taken from https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf
            let a = _mm_set_epi64x(0x7b5b546573745665, 0x63746f725d53475d);
            let a = $broadcast(a);
            let b = _mm_set_epi64x(0x4869285368617929, 0x5b477565726f6e5d);
            let b = $broadcast(b);
            // rXY = clmul of half X of `a` with half Y of `b` (imm8 = 0xY_X).
            let r00 = _mm_set_epi64x(0x1d4d84c85c3440c0, 0x929633d5d36f0451);
            let r00 = $broadcast(r00);
            let r01 = _mm_set_epi64x(0x1bd17c8d556ab5a1, 0x7fa540ac2a281315);
            let r01 = $broadcast(r01);
            let r10 = _mm_set_epi64x(0x1a2bf6db3a30862f, 0xbabf262df4b7d5c9);
            let r10 = $broadcast(r10);
            let r11 = _mm_set_epi64x(0x1d1e1f2c592e7c45, 0xd66ee03e410fd4ed);
            let r11 = $broadcast(r11);

            $assert($clmul(a, b, 0x00), r00);
            $assert($clmul(a, b, 0x10), r01);
            $assert($clmul(a, b, 0x01), r10);
            $assert($clmul(a, b, 0x11), r11);

            // Sanity check: x^63 * x^63 = x^126, i.e. bit 126 of the 128-bit
            // product is set and nothing else.
            let a0 = _mm_set_epi64x(0x0000000000000000, 0x8000000000000000);
            let a0 = $broadcast(a0);
            let r = _mm_set_epi64x(0x4000000000000000, 0x0000000000000000);
            let r = $broadcast(r);
            $assert($clmul(a0, a0, 0x00), r);
        }
    }

    // Manually unrolls per-lane array assignments/assertions with a literal
    // lane index, since the extract intrinsics require a compile-time-constant
    // selector (a `for` loop variable would not be const).
    // The `[4]` arms handle lanes 3 and 2 and then recurse into the matching
    // `[2]` arm (dispatch is on the literal `2` token), which handles lanes
    // 1 and 0.
    macro_rules! unroll {
        ($target:ident[4] = $op:ident($source:ident,4);) => {
            $target[3] = $op($source, 3);
            $target[2] = $op($source, 2);
            unroll! {$target[2] = $op($source,2);}
        };
        ($target:ident[2] = $op:ident($source:ident,2);) => {
            $target[1] = $op($source, 1);
            $target[0] = $op($source, 0);
        };
        (assert_eq_m128i($op:ident($vec_res:ident,4),$lin_res:ident[4]);) => {
            assert_eq_m128i($op($vec_res, 3), $lin_res[3]);
            assert_eq_m128i($op($vec_res, 2), $lin_res[2]);
            unroll! {assert_eq_m128i($op($vec_res,2),$lin_res[2]);}
        };
        (assert_eq_m128i($op:ident($vec_res:ident,2),$lin_res:ident[2]);) => {
            assert_eq_m128i($op($vec_res, 1), $lin_res[1]);
            assert_eq_m128i($op($vec_res, 0), $lin_res[0]);
        };
    }

    // this function tests one of the possible 4 instances
    // with different inputs across lanes
    //
    // `linear` is the scalar (128-bit) reference implementation; `vectorized`
    // is the 512-bit lane-parallel version. The 512-bit result must match the
    // reference applied lane-by-lane.
    #[target_feature(enable = "avx512vpclmulqdq,avx512f")]
    unsafe fn verify_512_helper(
        linear: unsafe fn(__m128i, __m128i) -> __m128i,
        vectorized: unsafe fn(__m512i, __m512i) -> __m512i,
    ) {
        // Arbitrary distinct bit patterns so every lane exercises different
        // inputs.
        let a = _mm512_set_epi64(
            0xDCB4DB3657BF0B7D,
            0x18DB0601068EDD9F,
            0xB76B908233200DC5,
            0xE478235FA8E22D5E,
            0xAB05CFFA2621154C,
            0x1171B47A186174C9,
            0x8C6B6C0E7595CEC9,
            0xBE3E7D4934E961BD,
        );
        let b = _mm512_set_epi64(
            0x672F6F105A94CEA7,
            0x8298B8FFCA5F829C,
            0xA3927047B3FB61D8,
            0x978093862CDE7187,
            0xB1927AB22F31D0EC,
            0xA9A5DA619BE4D7AF,
            0xCA2590F56884FDC6,
            0x19BE9F660038BDB5,
        );

        // Split both 512-bit inputs into their four 128-bit lanes.
        let mut a_decomp = [_mm_setzero_si128(); 4];
        unroll! {a_decomp[4] = _mm512_extracti32x4_epi32(a,4);}
        let mut b_decomp = [_mm_setzero_si128(); 4];
        unroll! {b_decomp[4] = _mm512_extracti32x4_epi32(b,4);}

        // Expected result: the scalar reference applied to each lane pair.
        let r = vectorized(a, b);
        let mut e_decomp = [_mm_setzero_si128(); 4];
        for i in 0..4 {
            e_decomp[i] = linear(a_decomp[i], b_decomp[i]);
        }
        unroll! {assert_eq_m128i(_mm512_extracti32x4_epi32(r,4),e_decomp[4]);}
    }

    // this function tests one of the possible 4 instances
    // with different inputs across lanes for the VL version
    //
    // Same scheme as `verify_512_helper`, but only the low 256 bits (two
    // lanes) of the 512-bit inputs are used; the upper four quadwords are
    // never read.
    #[target_feature(enable = "avx512vpclmulqdq,avx512vl")]
    unsafe fn verify_256_helper(
        linear: unsafe fn(__m128i, __m128i) -> __m128i,
        vectorized: unsafe fn(__m256i, __m256i) -> __m256i,
    ) {
        let a = _mm512_set_epi64(
            0xDCB4DB3657BF0B7D,
            0x18DB0601068EDD9F,
            0xB76B908233200DC5,
            0xE478235FA8E22D5E,
            0xAB05CFFA2621154C,
            0x1171B47A186174C9,
            0x8C6B6C0E7595CEC9,
            0xBE3E7D4934E961BD,
        );
        let b = _mm512_set_epi64(
            0x672F6F105A94CEA7,
            0x8298B8FFCA5F829C,
            0xA3927047B3FB61D8,
            0x978093862CDE7187,
            0xB1927AB22F31D0EC,
            0xA9A5DA619BE4D7AF,
            0xCA2590F56884FDC6,
            0x19BE9F660038BDB5,
        );

        // Lanes 0 and 1 of the 512-bit inputs are the two 128-bit lanes of
        // the 256-bit operands extracted below.
        let mut a_decomp = [_mm_setzero_si128(); 2];
        unroll! {a_decomp[2] = _mm512_extracti32x4_epi32(a,2);}
        let mut b_decomp = [_mm_setzero_si128(); 2];
        unroll! {b_decomp[2] = _mm512_extracti32x4_epi32(b,2);}

        // Run the 256-bit intrinsic on the low halves of `a` and `b`.
        let r = vectorized(
            _mm512_extracti64x4_epi64(a, 0),
            _mm512_extracti64x4_epi64(b, 0),
        );
        let mut e_decomp = [_mm_setzero_si128(); 2];
        for i in 0..2 {
            e_decomp[i] = linear(a_decomp[i], b_decomp[i]);
        }
        unroll! {assert_eq_m128i(_mm256_extracti128_si256(r,2),e_decomp[2]);}
    }

    #[simd_test(enable = "avx512vpclmulqdq,avx512f")]
    unsafe fn test_mm512_clmulepi64_epi128() {
        // Known-answer vectors, identical in every lane.
        verify_kat_pclmul!(
            _mm512_broadcast_i32x4,
            _mm512_clmulepi64_epi128,
            assert_eq_m512i
        );

        // Cross-check against the 128-bit PCLMULQDQ with per-lane-distinct
        // inputs, once for each of the four half-selector immediates.
        verify_512_helper(
            |a, b| _mm_clmulepi64_si128(a, b, 0x00),
            |a, b| _mm512_clmulepi64_epi128(a, b, 0x00),
        );
        verify_512_helper(
            |a, b| _mm_clmulepi64_si128(a, b, 0x01),
            |a, b| _mm512_clmulepi64_epi128(a, b, 0x01),
        );
        verify_512_helper(
            |a, b| _mm_clmulepi64_si128(a, b, 0x10),
            |a, b| _mm512_clmulepi64_epi128(a, b, 0x10),
        );
        verify_512_helper(
            |a, b| _mm_clmulepi64_si128(a, b, 0x11),
            |a, b| _mm512_clmulepi64_epi128(a, b, 0x11),
        );
    }

    #[simd_test(enable = "avx512vpclmulqdq,avx512vl")]
    unsafe fn test_mm256_clmulepi64_epi128() {
        // Known-answer vectors, identical in both lanes.
        verify_kat_pclmul!(
            _mm256_broadcastsi128_si256,
            _mm256_clmulepi64_epi128,
            assert_eq_m256i
        );

        // Cross-check against the 128-bit PCLMULQDQ with per-lane-distinct
        // inputs, once for each of the four half-selector immediates.
        verify_256_helper(
            |a, b| _mm_clmulepi64_si128(a, b, 0x00),
            |a, b| _mm256_clmulepi64_epi128(a, b, 0x00),
        );
        verify_256_helper(
            |a, b| _mm_clmulepi64_si128(a, b, 0x01),
            |a, b| _mm256_clmulepi64_epi128(a, b, 0x01),
        );
        verify_256_helper(
            |a, b| _mm_clmulepi64_si128(a, b, 0x10),
            |a, b| _mm256_clmulepi64_epi128(a, b, 0x10),
        );
        verify_256_helper(
            |a, b| _mm_clmulepi64_si128(a, b, 0x11),
            |a, b| _mm256_clmulepi64_epi128(a, b, 0x11),
        );
    }
}