//! Galois Field New Instructions (GFNI)
//!
//! The intrinsics here correspond to those in the `immintrin.h` C header.
//!
//! The reference is [Intel 64 and IA-32 Architectures Software Developer's
//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref].
//!
//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf

use crate::core_arch::simd::i8x16;
use crate::core_arch::simd::i8x32;
use crate::core_arch::simd::i8x64;
use crate::core_arch::simd_llvm::simd_select_bitmask;
use crate::core_arch::x86::__m128i;
use crate::core_arch::x86::__m256i;
use crate::core_arch::x86::__m512i;
use crate::core_arch::x86::__mmask16;
use crate::core_arch::x86::__mmask32;
use crate::core_arch::x86::__mmask64;
use crate::core_arch::x86::_mm256_setzero_si256;
use crate::core_arch::x86::_mm512_setzero_si512;
use crate::core_arch::x86::_mm_setzero_si128;
use crate::core_arch::x86::m128iExt;
use crate::core_arch::x86::m256iExt;
use crate::core_arch::x86::m512iExt;
use crate::mem::transmute;

#[cfg(test)]
use stdarch_test::assert_instr;

#[allow(improper_ctypes)]
extern "C" {
    #[link_name = "llvm.x86.vgf2p8affineinvqb.512"]
    fn vgf2p8affineinvqb_512(x: i8x64, a: i8x64, imm8: u8) -> i8x64;
    #[link_name = "llvm.x86.vgf2p8affineinvqb.256"]
    fn vgf2p8affineinvqb_256(x: i8x32, a: i8x32, imm8: u8) -> i8x32;
    #[link_name = "llvm.x86.vgf2p8affineinvqb.128"]
    fn vgf2p8affineinvqb_128(x: i8x16, a: i8x16, imm8: u8) -> i8x16;
    #[link_name = "llvm.x86.vgf2p8affineqb.512"]
    fn vgf2p8affineqb_512(x: i8x64, a: i8x64, imm8: u8) -> i8x64;
    #[link_name = "llvm.x86.vgf2p8affineqb.256"]
    fn vgf2p8affineqb_256(x: i8x32, a: i8x32, imm8: u8) -> i8x32;
    #[link_name = "llvm.x86.vgf2p8affineqb.128"]
    fn vgf2p8affineqb_128(x: i8x16, a: i8x16, imm8: u8) -> i8x16;
    #[link_name = "llvm.x86.vgf2p8mulb.512"]
    fn vgf2p8mulb_512(a: i8x64, b: i8x64) -> i8x64;
    #[link_name = "llvm.x86.vgf2p8mulb.256"]
    fn vgf2p8mulb_256(a: i8x32, b: i8x32) -> i8x32;
    #[link_name = "llvm.x86.vgf2p8mulb.128"]
    fn vgf2p8mulb_128(a: i8x16, b: i8x16) -> i8x16;
}

// LLVM requires AVX512BW for a lot of these instructions, see
// https://github.com/llvm/llvm-project/blob/release/9.x/clang/include/clang/Basic/BuiltinsX86.def#L457
// however our tests also require the target feature list to match Intel's
// which *doesn't* require AVX512BW but only AVX512F, so we added the redundant AVX512F
// requirement (for now)
// also see
// https://github.com/llvm/llvm-project/blob/release/9.x/clang/lib/Headers/gfniintrin.h
// for forcing GFNI, BW and optionally VL extension

/// Performs a multiplication in GF(2^8) on the packed bytes.
/// The field is in polynomial representation with the reduction polynomial
/// x^8 + x^4 + x^3 + x + 1.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_gf2p8mul_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512f")]
#[cfg_attr(test, assert_instr(vgf2p8mulb))]
pub unsafe fn _mm512_gf2p8mul_epi8(a: __m512i, b: __m512i) -> __m512i {
    transmute(vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64()))
}
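
// Implementation note (illustrative, not from Intel's docs): each byte lane is
// multiplied as a polynomial over GF(2) and the 16-bit carryless product is then
// reduced modulo x^8 + x^4 + x^3 + x + 1 (bit pattern 0x11b). A scalar sketch of
// one lane, mirroring the `mulbyte` reference helper in the tests below:
//
//     fn gf2p8mul_byte(a: u8, b: u8) -> u8 {
//         let (a, b) = (a as u16, b as u16);
//         let mut product = 0u16;
//         // carryless multiplication
//         for i in 0..8 {
//             if (a >> i) & 1 != 0 {
//                 product ^= b << i;
//             }
//         }
//         // reduction: XOR in shifted copies of 0x11b to clear the high bits
//         for i in (8..=14).rev() {
//             if (product >> i) & 1 != 0 {
//                 product ^= 0x11b << (i - 8);
//             }
//         }
//         product as u8
//     }
//
// For example, 0x02 * 0x87 has the carryless product 0x10e, and
// 0x10e ^ 0x11b = 0x15, so that lane's result is 0x15.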

/// Performs a multiplication in GF(2^8) on the packed bytes.
/// The field is in polynomial representation with the reduction polynomial
/// x^8 + x^4 + x^3 + x + 1.
///
/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_gf2p8mul_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512f")]
#[cfg_attr(test, assert_instr(vgf2p8mulb))]
pub unsafe fn _mm512_mask_gf2p8mul_epi8(
    src: __m512i,
    k: __mmask64,
    a: __m512i,
    b: __m512i,
) -> __m512i {
    transmute(simd_select_bitmask(
        k,
        vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64()),
        src.as_i8x64(),
    ))
}
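
// Illustrative writemask semantics: for each bit i of k, byte lane i of the
// result is the GF(2^8) product of a and b when the bit is set, and a copy of
// the corresponding byte of src when it is clear; the maskz variant below
// zeroes cleared lanes instead.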

/// Performs a multiplication in GF(2^8) on the packed bytes.
/// The field is in polynomial representation with the reduction polynomial
/// x^8 + x^4 + x^3 + x + 1.
///
/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_gf2p8mul_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512f")]
#[cfg_attr(test, assert_instr(vgf2p8mulb))]
pub unsafe fn _mm512_maskz_gf2p8mul_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
    let zero = _mm512_setzero_si512().as_i8x64();
    transmute(simd_select_bitmask(
        k,
        vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64()),
        zero,
    ))
}

/// Performs a multiplication in GF(2^8) on the packed bytes.
/// The field is in polynomial representation with the reduction polynomial
/// x^8 + x^4 + x^3 + x + 1.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_gf2p8mul_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8mulb))]
pub unsafe fn _mm256_gf2p8mul_epi8(a: __m256i, b: __m256i) -> __m256i {
    transmute(vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32()))
}

/// Performs a multiplication in GF(2^8) on the packed bytes.
/// The field is in polynomial representation with the reduction polynomial
/// x^8 + x^4 + x^3 + x + 1.
///
/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_gf2p8mul_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8mulb))]
pub unsafe fn _mm256_mask_gf2p8mul_epi8(
    src: __m256i,
    k: __mmask32,
    a: __m256i,
    b: __m256i,
) -> __m256i {
    transmute(simd_select_bitmask(
        k,
        vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32()),
        src.as_i8x32(),
    ))
}

/// Performs a multiplication in GF(2^8) on the packed bytes.
/// The field is in polynomial representation with the reduction polynomial
/// x^8 + x^4 + x^3 + x + 1.
///
/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_gf2p8mul_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8mulb))]
pub unsafe fn _mm256_maskz_gf2p8mul_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
    let zero = _mm256_setzero_si256().as_i8x32();
    transmute(simd_select_bitmask(
        k,
        vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32()),
        zero,
    ))
}

/// Performs a multiplication in GF(2^8) on the packed bytes.
/// The field is in polynomial representation with the reduction polynomial
/// x^8 + x^4 + x^3 + x + 1.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_gf2p8mul_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8mulb))]
pub unsafe fn _mm_gf2p8mul_epi8(a: __m128i, b: __m128i) -> __m128i {
    transmute(vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16()))
}

/// Performs a multiplication in GF(2^8) on the packed bytes.
/// The field is in polynomial representation with the reduction polynomial
/// x^8 + x^4 + x^3 + x + 1.
///
/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_gf2p8mul_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8mulb))]
pub unsafe fn _mm_mask_gf2p8mul_epi8(
    src: __m128i,
    k: __mmask16,
    a: __m128i,
    b: __m128i,
) -> __m128i {
    transmute(simd_select_bitmask(
        k,
        vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16()),
        src.as_i8x16(),
    ))
}

/// Performs a multiplication in GF(2^8) on the packed bytes.
/// The field is in polynomial representation with the reduction polynomial
/// x^8 + x^4 + x^3 + x + 1.
///
/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_gf2p8mul_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8mulb))]
pub unsafe fn _mm_maskz_gf2p8mul_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
    let zero = _mm_setzero_si128().as_i8x16();
    transmute(simd_select_bitmask(
        k,
        vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16()),
        zero,
    ))
}

/// Performs an affine transformation on the packed bytes in x.
/// That is, it computes a*x+b over the Galois Field 2^8 for each packed byte, with a being an
/// 8x8 bit matrix and b being a constant 8-bit immediate value.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_gf2p8affine_epi64_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512f")]
#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_gf2p8affine_epi64_epi8<const B: i32>(x: __m512i, a: __m512i) -> __m512i {
    static_assert_imm8!(B);
    let b = B as u8;
    let x = x.as_i8x64();
    let a = a.as_i8x64();
    let r = vgf2p8affineqb_512(x, a, b);
    transmute(r)
}
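
// Illustrative: the qword 0x01_02_04_08_10_20_40_80 encodes the 8x8 identity bit
// matrix (one row byte per matrix row), so with B = 0 the transformation returns
// x unchanged. The tests below use exactly this matrix as a sanity check:
//
//     let identity = _mm512_set1_epi64(0x01_02_04_08_10_20_40_80);
//     let y = _mm512_gf2p8affine_epi64_epi8::<0>(x, identity); // y == x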

/// Performs an affine transformation on the packed bytes in x.
/// That is, it computes a*x+b over the Galois Field 2^8 for each packed byte, with a being an
/// 8x8 bit matrix and b being a constant 8-bit immediate value.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_gf2p8affine_epi64_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512f")]
#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_maskz_gf2p8affine_epi64_epi8<const B: i32>(
    k: __mmask64,
    x: __m512i,
    a: __m512i,
) -> __m512i {
    static_assert_imm8!(B);
    let b = B as u8;
    let zero = _mm512_setzero_si512().as_i8x64();
    let x = x.as_i8x64();
    let a = a.as_i8x64();
    let r = vgf2p8affineqb_512(x, a, b);
    transmute(simd_select_bitmask(k, r, zero))
}

/// Performs an affine transformation on the packed bytes in x.
/// That is, it computes a*x+b over the Galois Field 2^8 for each packed byte, with a being an
/// 8x8 bit matrix and b being a constant 8-bit immediate value.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_gf2p8affine_epi64_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512f")]
#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_gf2p8affine_epi64_epi8<const B: i32>(
    src: __m512i,
    k: __mmask64,
    x: __m512i,
    a: __m512i,
) -> __m512i {
    static_assert_imm8!(B);
    let b = B as u8;
    let x = x.as_i8x64();
    let a = a.as_i8x64();
    let r = vgf2p8affineqb_512(x, a, b);
    transmute(simd_select_bitmask(k, r, src.as_i8x64()))
}

/// Performs an affine transformation on the packed bytes in x.
/// That is, it computes a*x+b over the Galois Field 2^8 for each packed byte, with a being an
/// 8x8 bit matrix and b being a constant 8-bit immediate value.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_gf2p8affine_epi64_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm256_gf2p8affine_epi64_epi8<const B: i32>(x: __m256i, a: __m256i) -> __m256i {
    static_assert_imm8!(B);
    let b = B as u8;
    let x = x.as_i8x32();
    let a = a.as_i8x32();
    let r = vgf2p8affineqb_256(x, a, b);
    transmute(r)
}

/// Performs an affine transformation on the packed bytes in x.
/// That is, it computes a*x+b over the Galois Field 2^8 for each packed byte, with a being an
/// 8x8 bit matrix and b being a constant 8-bit immediate value.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_gf2p8affine_epi64_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm256_maskz_gf2p8affine_epi64_epi8<const B: i32>(
    k: __mmask32,
    x: __m256i,
    a: __m256i,
) -> __m256i {
    static_assert_imm8!(B);
    let b = B as u8;
    let zero = _mm256_setzero_si256().as_i8x32();
    let x = x.as_i8x32();
    let a = a.as_i8x32();
    let r = vgf2p8affineqb_256(x, a, b);
    transmute(simd_select_bitmask(k, r, zero))
}

/// Performs an affine transformation on the packed bytes in x.
/// That is, it computes a*x+b over the Galois Field 2^8 for each packed byte, with a being an
/// 8x8 bit matrix and b being a constant 8-bit immediate value.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_gf2p8affine_epi64_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm256_mask_gf2p8affine_epi64_epi8<const B: i32>(
    src: __m256i,
    k: __mmask32,
    x: __m256i,
    a: __m256i,
) -> __m256i {
    static_assert_imm8!(B);
    let b = B as u8;
    let x = x.as_i8x32();
    let a = a.as_i8x32();
    let r = vgf2p8affineqb_256(x, a, b);
    transmute(simd_select_bitmask(k, r, src.as_i8x32()))
}

/// Performs an affine transformation on the packed bytes in x.
/// That is, it computes a*x+b over the Galois Field 2^8 for each packed byte, with a being an
/// 8x8 bit matrix and b being a constant 8-bit immediate value.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_gf2p8affine_epi64_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_gf2p8affine_epi64_epi8<const B: i32>(x: __m128i, a: __m128i) -> __m128i {
    static_assert_imm8!(B);
    let b = B as u8;
    let x = x.as_i8x16();
    let a = a.as_i8x16();
    let r = vgf2p8affineqb_128(x, a, b);
    transmute(r)
}

/// Performs an affine transformation on the packed bytes in x.
/// That is, it computes a*x+b over the Galois Field 2^8 for each packed byte, with a being an
/// 8x8 bit matrix and b being a constant 8-bit immediate value.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_gf2p8affine_epi64_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_maskz_gf2p8affine_epi64_epi8<const B: i32>(
    k: __mmask16,
    x: __m128i,
    a: __m128i,
) -> __m128i {
    static_assert_imm8!(B);
    let b = B as u8;
    let zero = _mm_setzero_si128().as_i8x16();
    let x = x.as_i8x16();
    let a = a.as_i8x16();
    let r = vgf2p8affineqb_128(x, a, b);
    transmute(simd_select_bitmask(k, r, zero))
}

/// Performs an affine transformation on the packed bytes in x.
/// That is, it computes a*x+b over the Galois Field 2^8 for each packed byte, with a being an
/// 8x8 bit matrix and b being a constant 8-bit immediate value.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_gf2p8affine_epi64_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_gf2p8affine_epi64_epi8<const B: i32>(
    src: __m128i,
    k: __mmask16,
    x: __m128i,
    a: __m128i,
) -> __m128i {
    static_assert_imm8!(B);
    let b = B as u8;
    let x = x.as_i8x16();
    let a = a.as_i8x16();
    let r = vgf2p8affineqb_128(x, a, b);
    transmute(simd_select_bitmask(k, r, src.as_i8x16()))
}

/// Performs an affine transformation on the inverted packed bytes in x.
/// That is, it computes a*inv(x)+b over the Galois Field 2^8 for each packed byte, with a being
/// an 8x8 bit matrix and b being a constant 8-bit immediate value.
/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
/// The inverse of 0 is 0.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_gf2p8affineinv_epi64_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512f")]
#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_gf2p8affineinv_epi64_epi8<const B: i32>(x: __m512i, a: __m512i) -> __m512i {
    static_assert_imm8!(B);
    let b = B as u8;
    let x = x.as_i8x64();
    let a = a.as_i8x64();
    let r = vgf2p8affineinvqb_512(x, a, b);
    transmute(r)
}
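
// Illustrative: combining the field inversion with the affine transformation
// reproduces the AES S-box when a is 0xF1_E3_C7_8F_1F_3E_7C_F8 and b is 0x63,
// which is exactly how the tests below validate this intrinsic:
//
//     let matrix = _mm512_set1_epi64(0xF1_E3_C7_8F_1F_3E_7C_F8);
//     let sbox_of_x = _mm512_gf2p8affineinv_epi64_epi8::<0x63>(x, matrix);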

/// Performs an affine transformation on the inverted packed bytes in x.
/// That is, it computes a*inv(x)+b over the Galois Field 2^8 for each packed byte, with a being
/// an 8x8 bit matrix and b being a constant 8-bit immediate value.
/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
/// The inverse of 0 is 0.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_gf2p8affineinv_epi64_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512f")]
#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_maskz_gf2p8affineinv_epi64_epi8<const B: i32>(
    k: __mmask64,
    x: __m512i,
    a: __m512i,
) -> __m512i {
    static_assert_imm8!(B);
    let b = B as u8;
    let zero = _mm512_setzero_si512().as_i8x64();
    let x = x.as_i8x64();
    let a = a.as_i8x64();
    let r = vgf2p8affineinvqb_512(x, a, b);
    transmute(simd_select_bitmask(k, r, zero))
}

/// Performs an affine transformation on the inverted packed bytes in x.
/// That is, it computes a*inv(x)+b over the Galois Field 2^8 for each packed byte, with a being
/// an 8x8 bit matrix and b being a constant 8-bit immediate value.
/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
/// The inverse of 0 is 0.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_gf2p8affineinv_epi64_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512f")]
#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_gf2p8affineinv_epi64_epi8<const B: i32>(
    src: __m512i,
    k: __mmask64,
    x: __m512i,
    a: __m512i,
) -> __m512i {
    static_assert_imm8!(B);
    let b = B as u8;
    let x = x.as_i8x64();
    let a = a.as_i8x64();
    let r = vgf2p8affineinvqb_512(x, a, b);
    transmute(simd_select_bitmask(k, r, src.as_i8x64()))
}

/// Performs an affine transformation on the inverted packed bytes in x.
/// That is, it computes a*inv(x)+b over the Galois Field 2^8 for each packed byte, with a being
/// an 8x8 bit matrix and b being a constant 8-bit immediate value.
/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
/// The inverse of 0 is 0.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_gf2p8affineinv_epi64_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm256_gf2p8affineinv_epi64_epi8<const B: i32>(x: __m256i, a: __m256i) -> __m256i {
    static_assert_imm8!(B);
    let b = B as u8;
    let x = x.as_i8x32();
    let a = a.as_i8x32();
    let r = vgf2p8affineinvqb_256(x, a, b);
    transmute(r)
}

/// Performs an affine transformation on the inverted packed bytes in x.
/// That is, it computes a*inv(x)+b over the Galois Field 2^8 for each packed byte, with a being
/// an 8x8 bit matrix and b being a constant 8-bit immediate value.
/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
/// The inverse of 0 is 0.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_gf2p8affineinv_epi64_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm256_maskz_gf2p8affineinv_epi64_epi8<const B: i32>(
    k: __mmask32,
    x: __m256i,
    a: __m256i,
) -> __m256i {
    static_assert_imm8!(B);
    let b = B as u8;
    let zero = _mm256_setzero_si256().as_i8x32();
    let x = x.as_i8x32();
    let a = a.as_i8x32();
    let r = vgf2p8affineinvqb_256(x, a, b);
    transmute(simd_select_bitmask(k, r, zero))
}

/// Performs an affine transformation on the inverted packed bytes in x.
/// That is, it computes a*inv(x)+b over the Galois Field 2^8 for each packed byte, with a being
/// an 8x8 bit matrix and b being a constant 8-bit immediate value.
/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
/// The inverse of 0 is 0.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_gf2p8affineinv_epi64_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm256_mask_gf2p8affineinv_epi64_epi8<const B: i32>(
    src: __m256i,
    k: __mmask32,
    x: __m256i,
    a: __m256i,
) -> __m256i {
    static_assert_imm8!(B);
    let b = B as u8;
    let x = x.as_i8x32();
    let a = a.as_i8x32();
    let r = vgf2p8affineinvqb_256(x, a, b);
    transmute(simd_select_bitmask(k, r, src.as_i8x32()))
}

/// Performs an affine transformation on the inverted packed bytes in x.
/// That is, it computes a*inv(x)+b over the Galois Field 2^8 for each packed byte, with a being
/// an 8x8 bit matrix and b being a constant 8-bit immediate value.
/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
/// The inverse of 0 is 0.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_gf2p8affineinv_epi64_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_gf2p8affineinv_epi64_epi8<const B: i32>(x: __m128i, a: __m128i) -> __m128i {
    static_assert_imm8!(B);
    let b = B as u8;
    let x = x.as_i8x16();
    let a = a.as_i8x16();
    let r = vgf2p8affineinvqb_128(x, a, b);
    transmute(r)
}

/// Performs an affine transformation on the inverted packed bytes in x.
/// That is, it computes a*inv(x)+b over the Galois Field 2^8 for each packed byte, with a being
/// an 8x8 bit matrix and b being a constant 8-bit immediate value.
/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
/// The inverse of 0 is 0.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_gf2p8affineinv_epi64_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_maskz_gf2p8affineinv_epi64_epi8<const B: i32>(
    k: __mmask16,
    x: __m128i,
    a: __m128i,
) -> __m128i {
    static_assert_imm8!(B);
    let b = B as u8;
    let zero = _mm_setzero_si128().as_i8x16();
    let x = x.as_i8x16();
    let a = a.as_i8x16();
    let r = vgf2p8affineinvqb_128(x, a, b);
    transmute(simd_select_bitmask(k, r, zero))
}

/// Performs an affine transformation on the inverted packed bytes in x.
/// That is, it computes a*inv(x)+b over the Galois Field 2^8 for each packed byte, with a being
/// an 8x8 bit matrix and b being a constant 8-bit immediate value.
/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
/// The inverse of 0 is 0.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_gf2p8affineinv_epi64_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_gf2p8affineinv_epi64_epi8<const B: i32>(
    src: __m128i,
    k: __mmask16,
    x: __m128i,
    a: __m128i,
) -> __m128i {
    static_assert_imm8!(B);
    let b = B as u8;
    let x = x.as_i8x16();
    let a = a.as_i8x16();
    let r = vgf2p8affineinvqb_128(x, a, b);
    transmute(simd_select_bitmask(k, r, src.as_i8x16()))
}

#[cfg(test)]
mod tests {
    // The constants in the tests below are just bit patterns. They should not
    // be interpreted as integers; signedness does not make sense for them, but
    // __mXXXi happens to be defined in terms of signed integers.
    #![allow(overflowing_literals)]

    use core::hint::black_box;
    use core::intrinsics::size_of;
    use stdarch_test::simd_test;

    use crate::core_arch::x86::*;

    fn mulbyte(left: u8, right: u8) -> u8 {
        // this implementation follows the description in
        // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_gf2p8mul_epi8
        const REDUCTION_POLYNOMIAL: u16 = 0x11b;
        let left: u16 = left.into();
        let right: u16 = right.into();
        let mut carryless_product: u16 = 0;

        // Carryless multiplication
        for i in 0..8 {
            if ((left >> i) & 0x01) != 0 {
                carryless_product ^= right << i;
            }
        }

        // Reduction: XOR in shifted copies of the reduction polynomial, which
        // represents zero in the field, to clear the high bits one at a time.
        for i in (8..=14).rev() {
            if ((carryless_product >> i) & 0x01) != 0 {
                carryless_product ^= REDUCTION_POLYNOMIAL << (i - 8);
            }
        }

        carryless_product as u8
    }

    const NUM_TEST_WORDS_512: usize = 4;
    const NUM_TEST_WORDS_256: usize = NUM_TEST_WORDS_512 * 2;
    const NUM_TEST_WORDS_128: usize = NUM_TEST_WORDS_256 * 2;
    const NUM_TEST_ENTRIES: usize = NUM_TEST_WORDS_512 * 64;
    const NUM_TEST_WORDS_64: usize = NUM_TEST_WORDS_128 * 2;
    const NUM_BYTES: usize = 256;
    const NUM_BYTES_WORDS_128: usize = NUM_BYTES / 16;
    const NUM_BYTES_WORDS_256: usize = NUM_BYTES_WORDS_128 / 2;
    const NUM_BYTES_WORDS_512: usize = NUM_BYTES_WORDS_256 / 2;

    fn parity(input: u8) -> u8 {
        let mut accumulator = 0;
        for i in 0..8 {
            accumulator ^= (input >> i) & 0x01;
        }
        accumulator
    }

    fn mat_vec_multiply_affine(matrix: u64, x: u8, b: u8) -> u8 {
        // this implementation follows the description in
        // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_gf2p8affine_epi64_epi8
        let mut accumulator = 0;

        for bit in 0..8 {
            accumulator |= parity(x & matrix.to_le_bytes()[bit]) << (7 - bit);
        }

        accumulator ^ b
    }
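
    // Sanity note: for the identity matrix 0x01_02_04_08_10_20_40_80,
    // to_le_bytes() yields [0x80, 0x40, ..., 0x01], so row `bit` selects bit
    // (7 - bit) of x and the function returns x ^ b, i.e. x itself when b == 0.
    // The affine tests below rely on this property.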

    fn generate_affine_mul_test_data(
        immediate: u8,
    ) -> (
        [u64; NUM_TEST_WORDS_64],
        [u8; NUM_TEST_ENTRIES],
        [u8; NUM_TEST_ENTRIES],
    ) {
        let mut left: [u64; NUM_TEST_WORDS_64] = [0; NUM_TEST_WORDS_64];
        let mut right: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES];
        let mut result: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES];

        for i in 0..NUM_TEST_WORDS_64 {
            left[i] = (i as u64) * 103 * 101;
            for j in 0..8 {
                let j64 = j as u64;
                right[i * 8 + j] = ((left[i] + j64) % 256) as u8;
                result[i * 8 + j] = mat_vec_multiply_affine(left[i], right[i * 8 + j], immediate);
            }
        }

        (left, right, result)
    }

    fn generate_inv_tests_data() -> ([u8; NUM_BYTES], [u8; NUM_BYTES]) {
        let mut input: [u8; NUM_BYTES] = [0; NUM_BYTES];
        let mut result: [u8; NUM_BYTES] = [0; NUM_BYTES];

        for i in 0..NUM_BYTES {
            input[i] = (i % 256) as u8;
            result[i] = if i == 0 { 0 } else { 1 };
        }

        (input, result)
    }

    const AES_S_BOX: [u8; NUM_BYTES] = [
        0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab,
        0x76, 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4,
        0x72, 0xc0, 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71,
        0xd8, 0x31, 0x15, 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2,
        0xeb, 0x27, 0xb2, 0x75, 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6,
        0xb3, 0x29, 0xe3, 0x2f, 0x84, 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb,
        0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45,
        0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5,
        0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44,
        0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a,
        0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, 0xe0, 0x32, 0x3a, 0x0a, 0x49,
        0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, 0xe7, 0xc8, 0x37, 0x6d,
        0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, 0xba, 0x78, 0x25,
        0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, 0x70, 0x3e,
        0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, 0xe1,
        0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
        0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb,
        0x16,
    ];

    fn generate_byte_mul_test_data() -> (
        [u8; NUM_TEST_ENTRIES],
        [u8; NUM_TEST_ENTRIES],
        [u8; NUM_TEST_ENTRIES],
    ) {
        let mut left: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES];
        let mut right: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES];
        let mut result: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES];

        for i in 0..NUM_TEST_ENTRIES {
            left[i] = (i % 256) as u8;
            // wrapping_mul avoids a debug-mode overflow panic; only the bit
            // pattern matters here
            right[i] = left[i].wrapping_mul(101);
            result[i] = mulbyte(left[i], right[i]);
        }

        (left, right, result)
    }
828
829 #[target_feature(enable = "sse2")]
830 unsafe fn load_m128i_word<T>(data: &[T], word_index: usize) -> __m128i {
831 let byte_offset = word_index * 16 / size_of::<T>();
832 let pointer = data.as_ptr().offset(byte_offset as isize) as *const __m128i;
833 _mm_loadu_si128(black_box(pointer))
834 }
835
836 #[target_feature(enable = "avx")]
837 unsafe fn load_m256i_word<T>(data: &[T], word_index: usize) -> __m256i {
838 let byte_offset = word_index * 32 / size_of::<T>();
839 let pointer = data.as_ptr().offset(byte_offset as isize) as *const __m256i;
840 _mm256_loadu_si256(black_box(pointer))
841 }
842
843 #[target_feature(enable = "avx512f")]
844 unsafe fn load_m512i_word<T>(data: &[T], word_index: usize) -> __m512i {
845 let byte_offset = word_index * 64 / size_of::<T>();
846 let pointer = data.as_ptr().offset(byte_offset as isize) as *const i32;
847 _mm512_loadu_si512(black_box(pointer))
848 }
849
850 #[simd_test(enable = "avx512gfni,avx512bw")]
851 unsafe fn test_mm512_gf2p8mul_epi8() {
852 let (left, right, expected) = generate_byte_mul_test_data();
853
854 for i in 0..NUM_TEST_WORDS_512 {
855 let left = load_m512i_word(&left, i);
856 let right = load_m512i_word(&right, i);
857 let expected = load_m512i_word(&expected, i);
858 let result = _mm512_gf2p8mul_epi8(left, right);
859 assert_eq_m512i(result, expected);
860 }
861 }
862
863 #[simd_test(enable = "avx512gfni,avx512bw")]
864 unsafe fn test_mm512_maskz_gf2p8mul_epi8() {
865 let (left, right, _expected) = generate_byte_mul_test_data();
866
867 for i in 0..NUM_TEST_WORDS_512 {
868 let left = load_m512i_word(&left, i);
869 let right = load_m512i_word(&right, i);
870 let result_zero = _mm512_maskz_gf2p8mul_epi8(0, left, right);
871 assert_eq_m512i(result_zero, _mm512_setzero_si512());
872 let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
873 let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
874 let expected_result = _mm512_gf2p8mul_epi8(left, right);
875 let result_masked = _mm512_maskz_gf2p8mul_epi8(mask_bytes, left, right);
876 let expected_masked =
877 _mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result);
878 assert_eq_m512i(result_masked, expected_masked);
879 }
880 }
881
882 #[simd_test(enable = "avx512gfni,avx512bw")]
883 unsafe fn test_mm512_mask_gf2p8mul_epi8() {
884 let (left, right, _expected) = generate_byte_mul_test_data();
885
886 for i in 0..NUM_TEST_WORDS_512 {
887 let left = load_m512i_word(&left, i);
888 let right = load_m512i_word(&right, i);
889 let result_left = _mm512_mask_gf2p8mul_epi8(left, 0, left, right);
890 assert_eq_m512i(result_left, left);
891 let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
892 let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
893 let expected_result = _mm512_gf2p8mul_epi8(left, right);
894 let result_masked = _mm512_mask_gf2p8mul_epi8(left, mask_bytes, left, right);
895 let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result);
896 assert_eq_m512i(result_masked, expected_masked);
897 }
898 }
899
900 #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
901 unsafe fn test_mm256_gf2p8mul_epi8() {
902 let (left, right, expected) = generate_byte_mul_test_data();
903
904 for i in 0..NUM_TEST_WORDS_256 {
905 let left = load_m256i_word(&left, i);
906 let right = load_m256i_word(&right, i);
907 let expected = load_m256i_word(&expected, i);
908 let result = _mm256_gf2p8mul_epi8(left, right);
909 assert_eq_m256i(result, expected);
910 }
911 }
912
913 #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
914 unsafe fn test_mm256_maskz_gf2p8mul_epi8() {
915 let (left, right, _expected) = generate_byte_mul_test_data();
916
917 for i in 0..NUM_TEST_WORDS_256 {
918 let left = load_m256i_word(&left, i);
919 let right = load_m256i_word(&right, i);
920 let result_zero = _mm256_maskz_gf2p8mul_epi8(0, left, right);
921 assert_eq_m256i(result_zero, _mm256_setzero_si256());
922 let mask_bytes: __mmask32 = 0x0F_F0_FF_00;
923 const MASK_WORDS: i32 = 0b01_10_11_00;
924 let expected_result = _mm256_gf2p8mul_epi8(left, right);
925 let result_masked = _mm256_maskz_gf2p8mul_epi8(mask_bytes, left, right);
926 let expected_masked =
17df50a5 927 _mm256_blend_epi32::<MASK_WORDS>(_mm256_setzero_si256(), expected_result);
fc512014
XL
928 assert_eq_m256i(result_masked, expected_masked);
929 }
930 }
931
932 #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
933 unsafe fn test_mm256_mask_gf2p8mul_epi8() {
934 let (left, right, _expected) = generate_byte_mul_test_data();
935
936 for i in 0..NUM_TEST_WORDS_256 {
937 let left = load_m256i_word(&left, i);
938 let right = load_m256i_word(&right, i);
939 let result_left = _mm256_mask_gf2p8mul_epi8(left, 0, left, right);
940 assert_eq_m256i(result_left, left);
941 let mask_bytes: __mmask32 = 0x0F_F0_FF_00;
942 const MASK_WORDS: i32 = 0b01_10_11_00;
943 let expected_result = _mm256_gf2p8mul_epi8(left, right);
944 let result_masked = _mm256_mask_gf2p8mul_epi8(left, mask_bytes, left, right);
17df50a5 945 let expected_masked = _mm256_blend_epi32::<MASK_WORDS>(left, expected_result);
fc512014
XL
946 assert_eq_m256i(result_masked, expected_masked);
947 }
948 }
949
950 #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
951 unsafe fn test_mm_gf2p8mul_epi8() {
952 let (left, right, expected) = generate_byte_mul_test_data();
953
954 for i in 0..NUM_TEST_WORDS_128 {
955 let left = load_m128i_word(&left, i);
956 let right = load_m128i_word(&right, i);
957 let expected = load_m128i_word(&expected, i);
958 let result = _mm_gf2p8mul_epi8(left, right);
959 assert_eq_m128i(result, expected);
960 }
961 }
962
963 #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
964 unsafe fn test_mm_maskz_gf2p8mul_epi8() {
965 let (left, right, _expected) = generate_byte_mul_test_data();
966
967 for i in 0..NUM_TEST_WORDS_128 {
968 let left = load_m128i_word(&left, i);
969 let right = load_m128i_word(&right, i);
970 let result_zero = _mm_maskz_gf2p8mul_epi8(0, left, right);
971 assert_eq_m128i(result_zero, _mm_setzero_si128());
972 let mask_bytes: __mmask16 = 0x0F_F0;
973 const MASK_WORDS: i32 = 0b01_10;
974 let expected_result = _mm_gf2p8mul_epi8(left, right);
975 let result_masked = _mm_maskz_gf2p8mul_epi8(mask_bytes, left, right);
17df50a5
XL
976 let expected_masked =
977 _mm_blend_epi32::<MASK_WORDS>(_mm_setzero_si128(), expected_result);
fc512014
XL
978 assert_eq_m128i(result_masked, expected_masked);
979 }
980 }
981
982 #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
983 unsafe fn test_mm_mask_gf2p8mul_epi8() {
984 let (left, right, _expected) = generate_byte_mul_test_data();
985
986 for i in 0..NUM_TEST_WORDS_128 {
987 let left = load_m128i_word(&left, i);
988 let right = load_m128i_word(&right, i);
989 let result_left = _mm_mask_gf2p8mul_epi8(left, 0, left, right);
990 assert_eq_m128i(result_left, left);
991 let mask_bytes: __mmask16 = 0x0F_F0;
992 const MASK_WORDS: i32 = 0b01_10;
993 let expected_result = _mm_gf2p8mul_epi8(left, right);
994 let result_masked = _mm_mask_gf2p8mul_epi8(left, mask_bytes, left, right);
17df50a5 995 let expected_masked = _mm_blend_epi32::<MASK_WORDS>(left, expected_result);
fc512014
XL
996 assert_eq_m128i(result_masked, expected_masked);
997 }
998 }
999
1000 #[simd_test(enable = "avx512gfni,avx512bw")]
1001 unsafe fn test_mm512_gf2p8affine_epi64_epi8() {
1002 let identity: i64 = 0x01_02_04_08_10_20_40_80;
1003 const IDENTITY_BYTE: i32 = 0;
1004 let constant: i64 = 0;
1005 const CONSTANT_BYTE: i32 = 0x63;
1006 let identity = _mm512_set1_epi64(identity);
1007 let constant = _mm512_set1_epi64(constant);
1008 let constant_reference = _mm512_set1_epi8(CONSTANT_BYTE as i8);
1009
1010 let (bytes, more_bytes, _) = generate_byte_mul_test_data();
1011 let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8);
1012
1013 for i in 0..NUM_TEST_WORDS_512 {
1014 let data = load_m512i_word(&bytes, i);
17df50a5 1015 let result = _mm512_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
fc512014 1016 assert_eq_m512i(result, data);
17df50a5 1017 let result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
fc512014
XL
1018 assert_eq_m512i(result, constant_reference);
1019 let data = load_m512i_word(&more_bytes, i);
17df50a5 1020 let result = _mm512_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
fc512014 1021 assert_eq_m512i(result, data);
17df50a5 1022 let result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
fc512014
XL
1023 assert_eq_m512i(result, constant_reference);
1024
1025 let matrix = load_m512i_word(&matrices, i);
1026 let vector = load_m512i_word(&vectors, i);
1027 let reference = load_m512i_word(&references, i);
1028
17df50a5 1029 let result = _mm512_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(vector, matrix);
fc512014
XL
1030 assert_eq_m512i(result, reference);
1031 }
1032 }
1033
1034 #[simd_test(enable = "avx512gfni,avx512bw")]
1035 unsafe fn test_mm512_maskz_gf2p8affine_epi64_epi8() {
1036 const CONSTANT_BYTE: i32 = 0x63;
1037 let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1038
1039 for i in 0..NUM_TEST_WORDS_512 {
1040 let matrix = load_m512i_word(&matrices, i);
1041 let vector = load_m512i_word(&vectors, i);
17df50a5
XL
1042 let result_zero =
1043 _mm512_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
fc512014
XL
1044 assert_eq_m512i(result_zero, _mm512_setzero_si512());
1045 let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
1046 let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
17df50a5 1047 let expected_result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
fc512014 1048 let result_masked =
17df50a5 1049 _mm512_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
fc512014
XL
1050 let expected_masked =
1051 _mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result);
1052 assert_eq_m512i(result_masked, expected_masked);
1053 }
1054 }
1055
1056 #[simd_test(enable = "avx512gfni,avx512bw")]
1057 unsafe fn test_mm512_mask_gf2p8affine_epi64_epi8() {
1058 const CONSTANT_BYTE: i32 = 0x63;
1059 let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1060
1061 for i in 0..NUM_TEST_WORDS_512 {
1062 let left = load_m512i_word(&vectors, i);
1063 let right = load_m512i_word(&matrices, i);
1064 let result_left =
17df50a5 1065 _mm512_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
fc512014
XL
1066 assert_eq_m512i(result_left, left);
1067 let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
1068 let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
17df50a5 1069 let expected_result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, right);
fc512014 1070 let result_masked =
17df50a5 1071 _mm512_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right);
fc512014
XL
1072 let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result);
1073 assert_eq_m512i(result_masked, expected_masked);
1074 }
1075 }
1076
1077 #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
1078 unsafe fn test_mm256_gf2p8affine_epi64_epi8() {
1079 let identity: i64 = 0x01_02_04_08_10_20_40_80;
1080 const IDENTITY_BYTE: i32 = 0;
1081 let constant: i64 = 0;
1082 const CONSTANT_BYTE: i32 = 0x63;
1083 let identity = _mm256_set1_epi64x(identity);
1084 let constant = _mm256_set1_epi64x(constant);
1085 let constant_reference = _mm256_set1_epi8(CONSTANT_BYTE as i8);
1086
1087 let (bytes, more_bytes, _) = generate_byte_mul_test_data();
1088 let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8);
1089
1090 for i in 0..NUM_TEST_WORDS_256 {
1091 let data = load_m256i_word(&bytes, i);
17df50a5 1092 let result = _mm256_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
fc512014 1093 assert_eq_m256i(result, data);
17df50a5 1094 let result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
fc512014
XL
1095 assert_eq_m256i(result, constant_reference);
1096 let data = load_m256i_word(&more_bytes, i);
17df50a5 1097 let result = _mm256_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
fc512014 1098 assert_eq_m256i(result, data);
17df50a5 1099 let result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
fc512014
XL
1100 assert_eq_m256i(result, constant_reference);
1101
1102 let matrix = load_m256i_word(&matrices, i);
1103 let vector = load_m256i_word(&vectors, i);
1104 let reference = load_m256i_word(&references, i);
1105
17df50a5 1106 let result = _mm256_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(vector, matrix);
fc512014
XL
1107 assert_eq_m256i(result, reference);
1108 }
1109 }
1110
1111 #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
1112 unsafe fn test_mm256_maskz_gf2p8affine_epi64_epi8() {
1113 const CONSTANT_BYTE: i32 = 0x63;
1114 let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1115
1116 for i in 0..NUM_TEST_WORDS_256 {
1117 let matrix = load_m256i_word(&matrices, i);
1118 let vector = load_m256i_word(&vectors, i);
17df50a5
XL
1119 let result_zero =
1120 _mm256_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
fc512014
XL
1121 assert_eq_m256i(result_zero, _mm256_setzero_si256());
1122 let mask_bytes: __mmask32 = 0xFF_0F_F0_00;
1123 const MASK_WORDS: i32 = 0b11_01_10_00;
17df50a5 1124 let expected_result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
fc512014 1125 let result_masked =
17df50a5 1126 _mm256_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
fc512014 1127 let expected_masked =
17df50a5 1128 _mm256_blend_epi32::<MASK_WORDS>(_mm256_setzero_si256(), expected_result);
fc512014
XL
1129 assert_eq_m256i(result_masked, expected_masked);
1130 }
1131 }
1132
1133 #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
1134 unsafe fn test_mm256_mask_gf2p8affine_epi64_epi8() {
1135 const CONSTANT_BYTE: i32 = 0x63;
1136 let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1137
1138 for i in 0..NUM_TEST_WORDS_256 {
1139 let left = load_m256i_word(&vectors, i);
1140 let right = load_m256i_word(&matrices, i);
1141 let result_left =
17df50a5 1142 _mm256_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
fc512014
XL
1143 assert_eq_m256i(result_left, left);
1144 let mask_bytes: __mmask32 = 0xFF_0F_F0_00;
1145 const MASK_WORDS: i32 = 0b11_01_10_00;
17df50a5 1146 let expected_result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, right);
fc512014 1147 let result_masked =
17df50a5
XL
1148 _mm256_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right);
1149 let expected_masked = _mm256_blend_epi32::<MASK_WORDS>(left, expected_result);
fc512014
XL
1150 assert_eq_m256i(result_masked, expected_masked);
1151 }
1152 }
1153
1154 #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
1155 unsafe fn test_mm_gf2p8affine_epi64_epi8() {
1156 let identity: i64 = 0x01_02_04_08_10_20_40_80;
1157 const IDENTITY_BYTE: i32 = 0;
1158 let constant: i64 = 0;
1159 const CONSTANT_BYTE: i32 = 0x63;
1160 let identity = _mm_set1_epi64x(identity);
1161 let constant = _mm_set1_epi64x(constant);
1162 let constant_reference = _mm_set1_epi8(CONSTANT_BYTE as i8);
1163
1164 let (bytes, more_bytes, _) = generate_byte_mul_test_data();
1165 let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8);
1166
1167 for i in 0..NUM_TEST_WORDS_128 {
1168 let data = load_m128i_word(&bytes, i);
17df50a5 1169 let result = _mm_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
fc512014 1170 assert_eq_m128i(result, data);
17df50a5 1171 let result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
fc512014
XL
1172 assert_eq_m128i(result, constant_reference);
1173 let data = load_m128i_word(&more_bytes, i);
17df50a5 1174 let result = _mm_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
fc512014 1175 assert_eq_m128i(result, data);
17df50a5 1176 let result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
fc512014
XL
1177 assert_eq_m128i(result, constant_reference);
1178
1179 let matrix = load_m128i_word(&matrices, i);
1180 let vector = load_m128i_word(&vectors, i);
1181 let reference = load_m128i_word(&references, i);
1182
17df50a5 1183 let result = _mm_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(vector, matrix);
fc512014
XL
1184 assert_eq_m128i(result, reference);
1185 }
1186 }
1187
1188 #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
1189 unsafe fn test_mm_maskz_gf2p8affine_epi64_epi8() {
1190 const CONSTANT_BYTE: i32 = 0x63;
1191 let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1192
1193 for i in 0..NUM_TEST_WORDS_128 {
1194 let matrix = load_m128i_word(&matrices, i);
1195 let vector = load_m128i_word(&vectors, i);
17df50a5 1196 let result_zero = _mm_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
fc512014
XL
1197 assert_eq_m128i(result_zero, _mm_setzero_si128());
1198 let mask_bytes: __mmask16 = 0x0F_F0;
1199 const MASK_WORDS: i32 = 0b01_10;
17df50a5 1200 let expected_result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
fc512014 1201 let result_masked =
17df50a5
XL
1202 _mm_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
1203 let expected_masked =
1204 _mm_blend_epi32::<MASK_WORDS>(_mm_setzero_si128(), expected_result);
fc512014
XL
1205 assert_eq_m128i(result_masked, expected_masked);
1206 }
1207 }
1208
1209 #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
1210 unsafe fn test_mm_mask_gf2p8affine_epi64_epi8() {
1211 const CONSTANT_BYTE: i32 = 0x63;
1212 let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1213
1214 for i in 0..NUM_TEST_WORDS_128 {
1215 let left = load_m128i_word(&vectors, i);
1216 let right = load_m128i_word(&matrices, i);
17df50a5
XL
1217 let result_left =
1218 _mm_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
fc512014
XL
1219 assert_eq_m128i(result_left, left);
1220 let mask_bytes: __mmask16 = 0x0F_F0;
1221 const MASK_WORDS: i32 = 0b01_10;
17df50a5 1222 let expected_result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, right);
fc512014 1223 let result_masked =
17df50a5
XL
1224 _mm_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right);
1225 let expected_masked = _mm_blend_epi32::<MASK_WORDS>(left, expected_result);
fc512014
XL
1226 assert_eq_m128i(result_masked, expected_masked);
1227 }
1228 }
1229
    #[simd_test(enable = "avx512gfni,avx512bw")]
    unsafe fn test_mm512_gf2p8affineinv_epi64_epi8() {
        let identity: i64 = 0x01_02_04_08_10_20_40_80;
        const IDENTITY_BYTE: i32 = 0;
        const CONSTANT_BYTE: i32 = 0x63;
        let identity = _mm512_set1_epi64(identity);

        // validate inversion
        let (inputs, results) = generate_inv_tests_data();

        for i in 0..NUM_BYTES_WORDS_512 {
            let input = load_m512i_word(&inputs, i);
            let reference = load_m512i_word(&results, i);
            let result = _mm512_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(input, identity);
            let remultiplied = _mm512_gf2p8mul_epi8(result, input);
            assert_eq_m512i(remultiplied, reference);
        }

        // validate subsequent affine operation
        let (matrices, vectors, _affine_expected) =
            generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_512 {
            let vector = load_m512i_word(&vectors, i);
            let matrix = load_m512i_word(&matrices, i);

            let inv_vec = _mm512_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(vector, identity);
            let reference = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(inv_vec, matrix);
            let result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
            assert_eq_m512i(result, reference);
        }

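        // The AES S-box is exactly this operation: GF(2^8) inversion followed
        // by a fixed affine transform with constant 0x63, so comparing all 256
        // byte values against the reference table exercises both stages.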
        // validate everything by virtue of checking against the AES SBox
        const AES_S_BOX_MATRIX: i64 = 0xF1_E3_C7_8F_1F_3E_7C_F8;
        let sbox_matrix = _mm512_set1_epi64(AES_S_BOX_MATRIX);

        for i in 0..NUM_BYTES_WORDS_512 {
            let reference = load_m512i_word(&AES_S_BOX, i);
            let input = load_m512i_word(&inputs, i);
            let result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(input, sbox_matrix);
            assert_eq_m512i(result, reference);
        }
    }

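    // The 512-bit masked forms take a `__mmask64` byte mask. Each group of
    // four byte-mask bits (one hex nibble) covers one 32-bit word, so the
    // matching `__mmask16` word mask below drives a runtime
    // `_mm512_mask_blend_epi32`; the 128/256-bit tests use an immediate blend
    // instead.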
    #[simd_test(enable = "avx512gfni,avx512bw")]
    unsafe fn test_mm512_maskz_gf2p8affineinv_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_512 {
            let matrix = load_m512i_word(&matrices, i);
            let vector = load_m512i_word(&vectors, i);
            let result_zero =
                _mm512_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
            assert_eq_m512i(result_zero, _mm512_setzero_si512());
            let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
            let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
            let expected_result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
            let result_masked =
                _mm512_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
            let expected_masked =
                _mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result);
            assert_eq_m512i(result_masked, expected_masked);
        }
    }

    #[simd_test(enable = "avx512gfni,avx512bw")]
    unsafe fn test_mm512_mask_gf2p8affineinv_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_512 {
            let left = load_m512i_word(&vectors, i);
            let right = load_m512i_word(&matrices, i);
            let result_left =
                _mm512_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
            assert_eq_m512i(result_left, left);
            let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
            let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
            let expected_result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, right);
            let result_masked = _mm512_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(
                left, mask_bytes, left, right,
            );
            let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result);
            assert_eq_m512i(result_masked, expected_masked);
        }
    }

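    // The 256-bit and 128-bit variants below repeat the same checks at their
    // respective widths; being narrower than 512 bits, they additionally
    // require AVX512VL, hence the extended `simd_test` feature lists.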
    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
    unsafe fn test_mm256_gf2p8affineinv_epi64_epi8() {
        let identity: i64 = 0x01_02_04_08_10_20_40_80;
        const IDENTITY_BYTE: i32 = 0;
        const CONSTANT_BYTE: i32 = 0x63;
        let identity = _mm256_set1_epi64x(identity);

        // validate inversion
        let (inputs, results) = generate_inv_tests_data();

        for i in 0..NUM_BYTES_WORDS_256 {
            let input = load_m256i_word(&inputs, i);
            let reference = load_m256i_word(&results, i);
            let result = _mm256_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(input, identity);
            let remultiplied = _mm256_gf2p8mul_epi8(result, input);
            assert_eq_m256i(remultiplied, reference);
        }

        // validate subsequent affine operation
        let (matrices, vectors, _affine_expected) =
            generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_256 {
            let vector = load_m256i_word(&vectors, i);
            let matrix = load_m256i_word(&matrices, i);

            let inv_vec = _mm256_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(vector, identity);
            let reference = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(inv_vec, matrix);
            let result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
            assert_eq_m256i(result, reference);
        }

        // validate everything by virtue of checking against the AES SBox
        const AES_S_BOX_MATRIX: i64 = 0xF1_E3_C7_8F_1F_3E_7C_F8;
        let sbox_matrix = _mm256_set1_epi64x(AES_S_BOX_MATRIX);

        for i in 0..NUM_BYTES_WORDS_256 {
            let reference = load_m256i_word(&AES_S_BOX, i);
            let input = load_m256i_word(&inputs, i);
            let result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(input, sbox_matrix);
            assert_eq_m256i(result, reference);
        }
    }

    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
    unsafe fn test_mm256_maskz_gf2p8affineinv_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_256 {
            let matrix = load_m256i_word(&matrices, i);
            let vector = load_m256i_word(&vectors, i);
            let result_zero =
                _mm256_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
            assert_eq_m256i(result_zero, _mm256_setzero_si256());
            let mask_bytes: __mmask32 = 0xFF_0F_F0_00;
            const MASK_WORDS: i32 = 0b11_01_10_00;
            let expected_result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
            let result_masked =
                _mm256_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
            let expected_masked =
                _mm256_blend_epi32::<MASK_WORDS>(_mm256_setzero_si256(), expected_result);
            assert_eq_m256i(result_masked, expected_masked);
        }
    }

    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
    unsafe fn test_mm256_mask_gf2p8affineinv_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_256 {
            let left = load_m256i_word(&vectors, i);
            let right = load_m256i_word(&matrices, i);
            let result_left =
                _mm256_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
            assert_eq_m256i(result_left, left);
            let mask_bytes: __mmask32 = 0xFF_0F_F0_00;
            const MASK_WORDS: i32 = 0b11_01_10_00;
            let expected_result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, right);
            let result_masked = _mm256_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(
                left, mask_bytes, left, right,
            );
            let expected_masked = _mm256_blend_epi32::<MASK_WORDS>(left, expected_result);
            assert_eq_m256i(result_masked, expected_masked);
        }
    }

    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
    unsafe fn test_mm_gf2p8affineinv_epi64_epi8() {
        let identity: i64 = 0x01_02_04_08_10_20_40_80;
        const IDENTITY_BYTE: i32 = 0;
        const CONSTANT_BYTE: i32 = 0x63;
        let identity = _mm_set1_epi64x(identity);

        // validate inversion
        let (inputs, results) = generate_inv_tests_data();

        for i in 0..NUM_BYTES_WORDS_128 {
            let input = load_m128i_word(&inputs, i);
            let reference = load_m128i_word(&results, i);
            let result = _mm_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(input, identity);
            let remultiplied = _mm_gf2p8mul_epi8(result, input);
            assert_eq_m128i(remultiplied, reference);
        }

        // validate subsequent affine operation
        let (matrices, vectors, _affine_expected) =
            generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_128 {
            let vector = load_m128i_word(&vectors, i);
            let matrix = load_m128i_word(&matrices, i);

            let inv_vec = _mm_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(vector, identity);
            let reference = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(inv_vec, matrix);
            let result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
            assert_eq_m128i(result, reference);
        }

        // validate everything by virtue of checking against the AES SBox
        const AES_S_BOX_MATRIX: i64 = 0xF1_E3_C7_8F_1F_3E_7C_F8;
        let sbox_matrix = _mm_set1_epi64x(AES_S_BOX_MATRIX);

        for i in 0..NUM_BYTES_WORDS_128 {
            let reference = load_m128i_word(&AES_S_BOX, i);
            let input = load_m128i_word(&inputs, i);
            let result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(input, sbox_matrix);
            assert_eq_m128i(result, reference);
        }
    }

    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
    unsafe fn test_mm_maskz_gf2p8affineinv_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_128 {
            let matrix = load_m128i_word(&matrices, i);
            let vector = load_m128i_word(&vectors, i);
            let result_zero =
                _mm_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
            assert_eq_m128i(result_zero, _mm_setzero_si128());
            let mask_bytes: __mmask16 = 0x0F_F0;
            const MASK_WORDS: i32 = 0b01_10;
            let expected_result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
            let result_masked =
                _mm_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
            let expected_masked =
                _mm_blend_epi32::<MASK_WORDS>(_mm_setzero_si128(), expected_result);
            assert_eq_m128i(result_masked, expected_masked);
        }
    }

    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
    unsafe fn test_mm_mask_gf2p8affineinv_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_128 {
            let left = load_m128i_word(&vectors, i);
            let right = load_m128i_word(&matrices, i);
            let result_left =
                _mm_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
            assert_eq_m128i(result_left, left);
            let mask_bytes: __mmask16 = 0x0F_F0;
            const MASK_WORDS: i32 = 0b01_10;
            let expected_result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, right);
            let result_masked =
                _mm_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right);
            let expected_masked = _mm_blend_epi32::<MASK_WORDS>(left, expected_result);
            assert_eq_m128i(result_masked, expected_masked);
        }
    }
}