]>
Commit | Line | Data |
---|---|---|
fc512014 XL |
1 | //! Galois Field New Instructions (GFNI) |
2 | //! | |
3 | //! The intrinsics here correspond to those in the `immintrin.h` C header. | |
4 | //! | |
5 | //! The reference is [Intel 64 and IA-32 Architectures Software Developer's | |
6 | //! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref]. | |
7 | //! | |
8 | //! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf | |
9 | ||
10 | use crate::core_arch::simd::i8x16; | |
11 | use crate::core_arch::simd::i8x32; | |
12 | use crate::core_arch::simd::i8x64; | |
13 | use crate::core_arch::simd_llvm::simd_select_bitmask; | |
14 | use crate::core_arch::x86::__m128i; | |
15 | use crate::core_arch::x86::__m256i; | |
16 | use crate::core_arch::x86::__m512i; | |
17 | use crate::core_arch::x86::__mmask16; | |
18 | use crate::core_arch::x86::__mmask32; | |
19 | use crate::core_arch::x86::__mmask64; | |
20 | use crate::core_arch::x86::_mm256_setzero_si256; | |
21 | use crate::core_arch::x86::_mm512_setzero_si512; | |
22 | use crate::core_arch::x86::_mm_setzero_si128; | |
23 | use crate::core_arch::x86::m128iExt; | |
24 | use crate::core_arch::x86::m256iExt; | |
25 | use crate::core_arch::x86::m512iExt; | |
26 | use crate::mem::transmute; | |
27 | ||
28 | #[cfg(test)] | |
29 | use stdarch_test::assert_instr; | |
30 | ||
// Raw FFI bindings to the LLVM GFNI builtins. One binding per instruction
// per vector width (512 / 256 / 128 bits); the `link_name` selects the
// exact LLVM intrinsic. The safe-to-call wrappers below transmute the
// public `__m128i`/`__m256i`/`__m512i` types to these packed-i8 vectors.
#[allow(improper_ctypes)]
extern "C" {
    // Affine transform of the GF(2^8)-inverted bytes (vgf2p8affineinvqb).
    #[link_name = "llvm.x86.vgf2p8affineinvqb.512"]
    fn vgf2p8affineinvqb_512(x: i8x64, a: i8x64, imm8: u8) -> i8x64;
    #[link_name = "llvm.x86.vgf2p8affineinvqb.256"]
    fn vgf2p8affineinvqb_256(x: i8x32, a: i8x32, imm8: u8) -> i8x32;
    #[link_name = "llvm.x86.vgf2p8affineinvqb.128"]
    fn vgf2p8affineinvqb_128(x: i8x16, a: i8x16, imm8: u8) -> i8x16;
    // Affine transform a*x + imm8 over GF(2) bit matrices (vgf2p8affineqb).
    #[link_name = "llvm.x86.vgf2p8affineqb.512"]
    fn vgf2p8affineqb_512(x: i8x64, a: i8x64, imm8: u8) -> i8x64;
    #[link_name = "llvm.x86.vgf2p8affineqb.256"]
    fn vgf2p8affineqb_256(x: i8x32, a: i8x32, imm8: u8) -> i8x32;
    #[link_name = "llvm.x86.vgf2p8affineqb.128"]
    fn vgf2p8affineqb_128(x: i8x16, a: i8x16, imm8: u8) -> i8x16;
    // Byte-wise GF(2^8) multiplication (vgf2p8mulb).
    #[link_name = "llvm.x86.vgf2p8mulb.512"]
    fn vgf2p8mulb_512(a: i8x64, b: i8x64) -> i8x64;
    #[link_name = "llvm.x86.vgf2p8mulb.256"]
    fn vgf2p8mulb_256(a: i8x32, b: i8x32) -> i8x32;
    #[link_name = "llvm.x86.vgf2p8mulb.128"]
    fn vgf2p8mulb_128(a: i8x16, b: i8x16) -> i8x16;
}
52 | ||
53 | // LLVM requires AVX512BW for a lot of these instructions, see | |
54 | // https://github.com/llvm/llvm-project/blob/release/9.x/clang/include/clang/Basic/BuiltinsX86.def#L457 | |
55 | // however our tests also require the target feature list to match Intel's | |
56 | // which *doesn't* require AVX512BW but only AVX512F, so we added the redundant AVX512F | |
57 | // requirement (for now) | |
58 | // also see | |
59 | // https://github.com/llvm/llvm-project/blob/release/9.x/clang/lib/Headers/gfniintrin.h | |
60 | // for forcing GFNI, BW and optionally VL extension | |
61 | ||
62 | /// Performs a multiplication in GF(2^8) on the packed bytes. | |
63 | /// The field is in polynomial representation with the reduction polynomial | |
64 | /// x^8 + x^4 + x^3 + x + 1. | |
65 | /// | |
66 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_gf2p8mul_epi8) | |
67 | #[inline] | |
68 | #[target_feature(enable = "avx512gfni,avx512bw,avx512f")] | |
69 | #[cfg_attr(test, assert_instr(vgf2p8mulb))] | |
70 | pub unsafe fn _mm512_gf2p8mul_epi8(a: __m512i, b: __m512i) -> __m512i { | |
71 | transmute(vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64())) | |
72 | } | |
73 | ||
74 | /// Performs a multiplication in GF(2^8) on the packed bytes. | |
75 | /// The field is in polynomial representation with the reduction polynomial | |
76 | /// x^8 + x^4 + x^3 + x + 1. | |
77 | /// | |
78 | /// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. | |
79 | /// Otherwise the computation result is written into the result. | |
80 | /// | |
81 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_gf2p8mul_epi8) | |
82 | #[inline] | |
83 | #[target_feature(enable = "avx512gfni,avx512bw,avx512f")] | |
84 | #[cfg_attr(test, assert_instr(vgf2p8mulb))] | |
85 | pub unsafe fn _mm512_mask_gf2p8mul_epi8( | |
86 | src: __m512i, | |
87 | k: __mmask64, | |
88 | a: __m512i, | |
89 | b: __m512i, | |
90 | ) -> __m512i { | |
91 | transmute(simd_select_bitmask( | |
92 | k, | |
93 | vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64()), | |
94 | src.as_i8x64(), | |
95 | )) | |
96 | } | |
97 | ||
98 | /// Performs a multiplication in GF(2^8) on the packed bytes. | |
99 | /// The field is in polynomial representation with the reduction polynomial | |
100 | /// x^8 + x^4 + x^3 + x + 1. | |
101 | /// | |
102 | /// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. | |
103 | /// Otherwise the computation result is written into the result. | |
104 | /// | |
105 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_gf2p8mul_epi8) | |
106 | #[inline] | |
107 | #[target_feature(enable = "avx512gfni,avx512bw,avx512f")] | |
108 | #[cfg_attr(test, assert_instr(vgf2p8mulb))] | |
109 | pub unsafe fn _mm512_maskz_gf2p8mul_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i { | |
110 | let zero = _mm512_setzero_si512().as_i8x64(); | |
111 | transmute(simd_select_bitmask( | |
112 | k, | |
113 | vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64()), | |
114 | zero, | |
115 | )) | |
116 | } | |
117 | ||
118 | /// Performs a multiplication in GF(2^8) on the packed bytes. | |
119 | /// The field is in polynomial representation with the reduction polynomial | |
120 | /// x^8 + x^4 + x^3 + x + 1. | |
121 | /// | |
122 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_gf2p8mul_epi8) | |
123 | #[inline] | |
124 | #[target_feature(enable = "avx512gfni,avx512bw,avx512vl")] | |
125 | #[cfg_attr(test, assert_instr(vgf2p8mulb))] | |
126 | pub unsafe fn _mm256_gf2p8mul_epi8(a: __m256i, b: __m256i) -> __m256i { | |
127 | transmute(vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32())) | |
128 | } | |
129 | ||
130 | /// Performs a multiplication in GF(2^8) on the packed bytes. | |
131 | /// The field is in polynomial representation with the reduction polynomial | |
132 | /// x^8 + x^4 + x^3 + x + 1. | |
133 | /// | |
134 | /// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. | |
135 | /// Otherwise the computation result is written into the result. | |
136 | /// | |
137 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_gf2p8mul_epi8) | |
138 | #[inline] | |
139 | #[target_feature(enable = "avx512gfni,avx512bw,avx512vl")] | |
140 | #[cfg_attr(test, assert_instr(vgf2p8mulb))] | |
141 | pub unsafe fn _mm256_mask_gf2p8mul_epi8( | |
142 | src: __m256i, | |
143 | k: __mmask32, | |
144 | a: __m256i, | |
145 | b: __m256i, | |
146 | ) -> __m256i { | |
147 | transmute(simd_select_bitmask( | |
148 | k, | |
149 | vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32()), | |
150 | src.as_i8x32(), | |
151 | )) | |
152 | } | |
153 | ||
154 | /// Performs a multiplication in GF(2^8) on the packed bytes. | |
155 | /// The field is in polynomial representation with the reduction polynomial | |
156 | /// x^8 + x^4 + x^3 + x + 1. | |
157 | /// | |
158 | /// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. | |
159 | /// Otherwise the computation result is written into the result. | |
160 | /// | |
161 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_gf2p8mul_epi8) | |
162 | #[inline] | |
163 | #[target_feature(enable = "avx512gfni,avx512bw,avx512vl")] | |
164 | #[cfg_attr(test, assert_instr(vgf2p8mulb))] | |
165 | pub unsafe fn _mm256_maskz_gf2p8mul_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i { | |
166 | let zero = _mm256_setzero_si256().as_i8x32(); | |
167 | transmute(simd_select_bitmask( | |
168 | k, | |
169 | vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32()), | |
170 | zero, | |
171 | )) | |
172 | } | |
173 | ||
174 | /// Performs a multiplication in GF(2^8) on the packed bytes. | |
175 | /// The field is in polynomial representation with the reduction polynomial | |
176 | /// x^8 + x^4 + x^3 + x + 1. | |
177 | /// | |
178 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_gf2p8mul_epi8) | |
179 | #[inline] | |
180 | #[target_feature(enable = "avx512gfni,avx512bw,avx512vl")] | |
181 | #[cfg_attr(test, assert_instr(vgf2p8mulb))] | |
182 | pub unsafe fn _mm_gf2p8mul_epi8(a: __m128i, b: __m128i) -> __m128i { | |
183 | transmute(vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16())) | |
184 | } | |
185 | ||
186 | /// Performs a multiplication in GF(2^8) on the packed bytes. | |
187 | /// The field is in polynomial representation with the reduction polynomial | |
188 | /// x^8 + x^4 + x^3 + x + 1. | |
189 | /// | |
190 | /// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. | |
191 | /// Otherwise the computation result is written into the result. | |
192 | /// | |
193 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_gf2p8mul_epi8) | |
194 | #[inline] | |
195 | #[target_feature(enable = "avx512gfni,avx512bw,avx512vl")] | |
196 | #[cfg_attr(test, assert_instr(vgf2p8mulb))] | |
197 | pub unsafe fn _mm_mask_gf2p8mul_epi8( | |
198 | src: __m128i, | |
199 | k: __mmask16, | |
200 | a: __m128i, | |
201 | b: __m128i, | |
202 | ) -> __m128i { | |
203 | transmute(simd_select_bitmask( | |
204 | k, | |
205 | vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16()), | |
206 | src.as_i8x16(), | |
207 | )) | |
208 | } | |
209 | ||
210 | /// Performs a multiplication in GF(2^8) on the packed bytes. | |
211 | /// The field is in polynomial representation with the reduction polynomial | |
212 | /// x^8 + x^4 + x^3 + x + 1. | |
213 | /// | |
214 | /// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. | |
215 | /// Otherwise the computation result is written into the result. | |
216 | /// | |
217 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_gf2p8mul_epi8) | |
218 | #[inline] | |
219 | #[target_feature(enable = "avx512gfni,avx512bw,avx512vl")] | |
220 | #[cfg_attr(test, assert_instr(vgf2p8mulb))] | |
221 | pub unsafe fn _mm_maskz_gf2p8mul_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i { | |
222 | let zero = _mm_setzero_si128().as_i8x16(); | |
223 | transmute(simd_select_bitmask( | |
224 | k, | |
225 | vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16()), | |
226 | zero, | |
227 | )) | |
228 | } | |
229 | ||
230 | /// Performs an affine transformation on the packed bytes in x. | |
231 | /// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix | |
232 | /// and b being a constant 8-bit immediate value. | |
233 | /// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. | |
234 | /// | |
235 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_gf2p8affine_epi8) | |
236 | #[inline] | |
237 | #[target_feature(enable = "avx512gfni,avx512bw,avx512f")] | |
17df50a5 XL |
238 | #[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))] |
239 | #[rustc_legacy_const_generics(2)] | |
240 | pub unsafe fn _mm512_gf2p8affine_epi64_epi8<const B: i32>(x: __m512i, a: __m512i) -> __m512i { | |
241 | static_assert_imm8!(B); | |
242 | let b = B as u8; | |
fc512014 XL |
243 | let x = x.as_i8x64(); |
244 | let a = a.as_i8x64(); | |
17df50a5 | 245 | let r = vgf2p8affineqb_512(x, a, b); |
fc512014 XL |
246 | transmute(r) |
247 | } | |
248 | ||
249 | /// Performs an affine transformation on the packed bytes in x. | |
250 | /// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix | |
251 | /// and b being a constant 8-bit immediate value. | |
252 | /// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. | |
253 | /// | |
254 | /// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. | |
255 | /// Otherwise the computation result is written into the result. | |
256 | /// | |
257 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_gf2p8affine_epi8) | |
258 | #[inline] | |
259 | #[target_feature(enable = "avx512gfni,avx512bw,avx512f")] | |
17df50a5 XL |
260 | #[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))] |
261 | #[rustc_legacy_const_generics(3)] | |
262 | pub unsafe fn _mm512_maskz_gf2p8affine_epi64_epi8<const B: i32>( | |
fc512014 XL |
263 | k: __mmask64, |
264 | x: __m512i, | |
265 | a: __m512i, | |
fc512014 | 266 | ) -> __m512i { |
17df50a5 XL |
267 | static_assert_imm8!(B); |
268 | let b = B as u8; | |
fc512014 | 269 | let zero = _mm512_setzero_si512().as_i8x64(); |
fc512014 XL |
270 | let x = x.as_i8x64(); |
271 | let a = a.as_i8x64(); | |
17df50a5 | 272 | let r = vgf2p8affineqb_512(x, a, b); |
fc512014 XL |
273 | transmute(simd_select_bitmask(k, r, zero)) |
274 | } | |
275 | ||
276 | /// Performs an affine transformation on the packed bytes in x. | |
277 | /// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix | |
278 | /// and b being a constant 8-bit immediate value. | |
279 | /// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. | |
280 | /// | |
281 | /// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. | |
282 | /// Otherwise the computation result is written into the result. | |
283 | /// | |
284 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_gf2p8affine_epi8) | |
285 | #[inline] | |
286 | #[target_feature(enable = "avx512gfni,avx512bw,avx512f")] | |
17df50a5 XL |
287 | #[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))] |
288 | #[rustc_legacy_const_generics(4)] | |
289 | pub unsafe fn _mm512_mask_gf2p8affine_epi64_epi8<const B: i32>( | |
fc512014 XL |
290 | src: __m512i, |
291 | k: __mmask64, | |
292 | x: __m512i, | |
293 | a: __m512i, | |
fc512014 | 294 | ) -> __m512i { |
17df50a5 XL |
295 | static_assert_imm8!(B); |
296 | let b = B as u8; | |
fc512014 XL |
297 | let x = x.as_i8x64(); |
298 | let a = a.as_i8x64(); | |
17df50a5 | 299 | let r = vgf2p8affineqb_512(x, a, b); |
fc512014 XL |
300 | transmute(simd_select_bitmask(k, r, src.as_i8x64())) |
301 | } | |
302 | ||
303 | /// Performs an affine transformation on the packed bytes in x. | |
304 | /// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix | |
305 | /// and b being a constant 8-bit immediate value. | |
306 | /// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. | |
307 | /// | |
308 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_gf2p8affine_epi8) | |
309 | #[inline] | |
310 | #[target_feature(enable = "avx512gfni,avx512bw,avx512vl")] | |
17df50a5 XL |
311 | #[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))] |
312 | #[rustc_legacy_const_generics(2)] | |
313 | pub unsafe fn _mm256_gf2p8affine_epi64_epi8<const B: i32>(x: __m256i, a: __m256i) -> __m256i { | |
314 | static_assert_imm8!(B); | |
315 | let b = B as u8; | |
fc512014 XL |
316 | let x = x.as_i8x32(); |
317 | let a = a.as_i8x32(); | |
17df50a5 | 318 | let r = vgf2p8affineqb_256(x, a, b); |
fc512014 XL |
319 | transmute(r) |
320 | } | |
321 | ||
322 | /// Performs an affine transformation on the packed bytes in x. | |
323 | /// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix | |
324 | /// and b being a constant 8-bit immediate value. | |
325 | /// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. | |
326 | /// | |
327 | /// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. | |
328 | /// Otherwise the computation result is written into the result. | |
329 | /// | |
330 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_gf2p8affine_epi8) | |
331 | #[inline] | |
332 | #[target_feature(enable = "avx512gfni,avx512bw,avx512vl")] | |
17df50a5 XL |
333 | #[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))] |
334 | #[rustc_legacy_const_generics(3)] | |
335 | pub unsafe fn _mm256_maskz_gf2p8affine_epi64_epi8<const B: i32>( | |
fc512014 XL |
336 | k: __mmask32, |
337 | x: __m256i, | |
338 | a: __m256i, | |
fc512014 | 339 | ) -> __m256i { |
17df50a5 XL |
340 | static_assert_imm8!(B); |
341 | let b = B as u8; | |
fc512014 | 342 | let zero = _mm256_setzero_si256().as_i8x32(); |
fc512014 XL |
343 | let x = x.as_i8x32(); |
344 | let a = a.as_i8x32(); | |
17df50a5 | 345 | let r = vgf2p8affineqb_256(x, a, b); |
fc512014 XL |
346 | transmute(simd_select_bitmask(k, r, zero)) |
347 | } | |
348 | ||
349 | /// Performs an affine transformation on the packed bytes in x. | |
350 | /// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix | |
351 | /// and b being a constant 8-bit immediate value. | |
352 | /// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. | |
353 | /// | |
354 | /// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. | |
355 | /// Otherwise the computation result is written into the result. | |
356 | /// | |
357 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_gf2p8affine_epi8) | |
358 | #[inline] | |
359 | #[target_feature(enable = "avx512gfni,avx512bw,avx512vl")] | |
17df50a5 XL |
360 | #[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))] |
361 | #[rustc_legacy_const_generics(4)] | |
362 | pub unsafe fn _mm256_mask_gf2p8affine_epi64_epi8<const B: i32>( | |
fc512014 XL |
363 | src: __m256i, |
364 | k: __mmask32, | |
365 | x: __m256i, | |
366 | a: __m256i, | |
fc512014 | 367 | ) -> __m256i { |
17df50a5 XL |
368 | static_assert_imm8!(B); |
369 | let b = B as u8; | |
fc512014 XL |
370 | let x = x.as_i8x32(); |
371 | let a = a.as_i8x32(); | |
17df50a5 | 372 | let r = vgf2p8affineqb_256(x, a, b); |
fc512014 XL |
373 | transmute(simd_select_bitmask(k, r, src.as_i8x32())) |
374 | } | |
375 | ||
376 | /// Performs an affine transformation on the packed bytes in x. | |
377 | /// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix | |
378 | /// and b being a constant 8-bit immediate value. | |
379 | /// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. | |
380 | /// | |
381 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_gf2p8affine_epi8) | |
382 | #[inline] | |
383 | #[target_feature(enable = "avx512gfni,avx512bw,avx512vl")] | |
17df50a5 XL |
384 | #[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))] |
385 | #[rustc_legacy_const_generics(2)] | |
386 | pub unsafe fn _mm_gf2p8affine_epi64_epi8<const B: i32>(x: __m128i, a: __m128i) -> __m128i { | |
387 | static_assert_imm8!(B); | |
388 | let b = B as u8; | |
fc512014 XL |
389 | let x = x.as_i8x16(); |
390 | let a = a.as_i8x16(); | |
17df50a5 | 391 | let r = vgf2p8affineqb_128(x, a, b); |
fc512014 XL |
392 | transmute(r) |
393 | } | |
394 | ||
395 | /// Performs an affine transformation on the packed bytes in x. | |
396 | /// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix | |
397 | /// and b being a constant 8-bit immediate value. | |
398 | /// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. | |
399 | /// | |
400 | /// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. | |
401 | /// Otherwise the computation result is written into the result. | |
402 | /// | |
403 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_gf2p8affine_epi8) | |
404 | #[inline] | |
405 | #[target_feature(enable = "avx512gfni,avx512bw,avx512vl")] | |
17df50a5 XL |
406 | #[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))] |
407 | #[rustc_legacy_const_generics(3)] | |
408 | pub unsafe fn _mm_maskz_gf2p8affine_epi64_epi8<const B: i32>( | |
fc512014 XL |
409 | k: __mmask16, |
410 | x: __m128i, | |
411 | a: __m128i, | |
fc512014 | 412 | ) -> __m128i { |
17df50a5 XL |
413 | static_assert_imm8!(B); |
414 | let b = B as u8; | |
fc512014 | 415 | let zero = _mm_setzero_si128().as_i8x16(); |
fc512014 XL |
416 | let x = x.as_i8x16(); |
417 | let a = a.as_i8x16(); | |
17df50a5 | 418 | let r = vgf2p8affineqb_128(x, a, b); |
fc512014 XL |
419 | transmute(simd_select_bitmask(k, r, zero)) |
420 | } | |
421 | ||
422 | /// Performs an affine transformation on the packed bytes in x. | |
423 | /// That is computes a*x+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix | |
424 | /// and b being a constant 8-bit immediate value. | |
425 | /// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. | |
426 | /// | |
427 | /// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. | |
428 | /// Otherwise the computation result is written into the result. | |
429 | /// | |
430 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_gf2p8affine_epi8) | |
431 | #[inline] | |
432 | #[target_feature(enable = "avx512gfni,avx512bw,avx512vl")] | |
17df50a5 XL |
433 | #[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))] |
434 | #[rustc_legacy_const_generics(4)] | |
435 | pub unsafe fn _mm_mask_gf2p8affine_epi64_epi8<const B: i32>( | |
fc512014 XL |
436 | src: __m128i, |
437 | k: __mmask16, | |
438 | x: __m128i, | |
439 | a: __m128i, | |
fc512014 | 440 | ) -> __m128i { |
17df50a5 XL |
441 | static_assert_imm8!(B); |
442 | let b = B as u8; | |
fc512014 XL |
443 | let x = x.as_i8x16(); |
444 | let a = a.as_i8x16(); | |
17df50a5 | 445 | let r = vgf2p8affineqb_128(x, a, b); |
fc512014 XL |
446 | transmute(simd_select_bitmask(k, r, src.as_i8x16())) |
447 | } | |
448 | ||
449 | /// Performs an affine transformation on the inverted packed bytes in x. | |
450 | /// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix | |
451 | /// and b being a constant 8-bit immediate value. | |
452 | /// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1. | |
453 | /// The inverse of 0 is 0. | |
454 | /// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. | |
455 | /// | |
456 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_gf2p8affineinv_epi64_epi8) | |
457 | #[inline] | |
458 | #[target_feature(enable = "avx512gfni,avx512bw,avx512f")] | |
17df50a5 XL |
459 | #[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))] |
460 | #[rustc_legacy_const_generics(2)] | |
461 | pub unsafe fn _mm512_gf2p8affineinv_epi64_epi8<const B: i32>(x: __m512i, a: __m512i) -> __m512i { | |
462 | static_assert_imm8!(B); | |
463 | let b = B as u8; | |
fc512014 XL |
464 | let x = x.as_i8x64(); |
465 | let a = a.as_i8x64(); | |
17df50a5 | 466 | let r = vgf2p8affineinvqb_512(x, a, b); |
fc512014 XL |
467 | transmute(r) |
468 | } | |
469 | ||
470 | /// Performs an affine transformation on the inverted packed bytes in x. | |
471 | /// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix | |
472 | /// and b being a constant 8-bit immediate value. | |
473 | /// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1. | |
474 | /// The inverse of 0 is 0. | |
475 | /// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. | |
476 | /// | |
477 | /// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. | |
478 | /// Otherwise the computation result is written into the result. | |
479 | /// | |
480 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_gf2p8affineinv_epi64_epi8) | |
481 | #[inline] | |
482 | #[target_feature(enable = "avx512gfni,avx512bw,avx512f")] | |
17df50a5 XL |
483 | #[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))] |
484 | #[rustc_legacy_const_generics(3)] | |
485 | pub unsafe fn _mm512_maskz_gf2p8affineinv_epi64_epi8<const B: i32>( | |
fc512014 XL |
486 | k: __mmask64, |
487 | x: __m512i, | |
488 | a: __m512i, | |
fc512014 | 489 | ) -> __m512i { |
17df50a5 XL |
490 | static_assert_imm8!(B); |
491 | let b = B as u8; | |
fc512014 XL |
492 | let zero = _mm512_setzero_si512().as_i8x64(); |
493 | let x = x.as_i8x64(); | |
494 | let a = a.as_i8x64(); | |
17df50a5 | 495 | let r = vgf2p8affineinvqb_512(x, a, b); |
fc512014 XL |
496 | transmute(simd_select_bitmask(k, r, zero)) |
497 | } | |
498 | ||
499 | /// Performs an affine transformation on the inverted packed bytes in x. | |
500 | /// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix | |
501 | /// and b being a constant 8-bit immediate value. | |
502 | /// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1. | |
503 | /// The inverse of 0 is 0. | |
504 | /// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. | |
505 | /// | |
506 | /// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. | |
507 | /// Otherwise the computation result is written into the result. | |
508 | /// | |
509 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_gf2p8affineinv_epi64_epi8) | |
510 | #[inline] | |
511 | #[target_feature(enable = "avx512gfni,avx512bw,avx512f")] | |
17df50a5 XL |
512 | #[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))] |
513 | #[rustc_legacy_const_generics(4)] | |
514 | pub unsafe fn _mm512_mask_gf2p8affineinv_epi64_epi8<const B: i32>( | |
fc512014 XL |
515 | src: __m512i, |
516 | k: __mmask64, | |
517 | x: __m512i, | |
518 | a: __m512i, | |
fc512014 | 519 | ) -> __m512i { |
17df50a5 XL |
520 | static_assert_imm8!(B); |
521 | let b = B as u8; | |
fc512014 XL |
522 | let x = x.as_i8x64(); |
523 | let a = a.as_i8x64(); | |
17df50a5 | 524 | let r = vgf2p8affineinvqb_512(x, a, b); |
fc512014 XL |
525 | transmute(simd_select_bitmask(k, r, src.as_i8x64())) |
526 | } | |
527 | ||
528 | /// Performs an affine transformation on the inverted packed bytes in x. | |
529 | /// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix | |
530 | /// and b being a constant 8-bit immediate value. | |
531 | /// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1. | |
532 | /// The inverse of 0 is 0. | |
533 | /// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. | |
534 | /// | |
535 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_gf2p8affineinv_epi64_epi8) | |
536 | #[inline] | |
537 | #[target_feature(enable = "avx512gfni,avx512bw,avx512vl")] | |
17df50a5 XL |
538 | #[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))] |
539 | #[rustc_legacy_const_generics(2)] | |
540 | pub unsafe fn _mm256_gf2p8affineinv_epi64_epi8<const B: i32>(x: __m256i, a: __m256i) -> __m256i { | |
541 | static_assert_imm8!(B); | |
542 | let b = B as u8; | |
fc512014 XL |
543 | let x = x.as_i8x32(); |
544 | let a = a.as_i8x32(); | |
17df50a5 | 545 | let r = vgf2p8affineinvqb_256(x, a, b); |
fc512014 XL |
546 | transmute(r) |
547 | } | |
548 | ||
549 | /// Performs an affine transformation on the inverted packed bytes in x. | |
550 | /// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix | |
551 | /// and b being a constant 8-bit immediate value. | |
552 | /// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1. | |
553 | /// The inverse of 0 is 0. | |
554 | /// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. | |
555 | /// | |
556 | /// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set. | |
557 | /// Otherwise the computation result is written into the result. | |
558 | /// | |
559 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_gf2p8affineinv_epi64_epi8) | |
560 | #[inline] | |
561 | #[target_feature(enable = "avx512gfni,avx512bw,avx512vl")] | |
17df50a5 XL |
562 | #[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))] |
563 | #[rustc_legacy_const_generics(3)] | |
564 | pub unsafe fn _mm256_maskz_gf2p8affineinv_epi64_epi8<const B: i32>( | |
fc512014 XL |
565 | k: __mmask32, |
566 | x: __m256i, | |
567 | a: __m256i, | |
fc512014 | 568 | ) -> __m256i { |
17df50a5 XL |
569 | static_assert_imm8!(B); |
570 | let b = B as u8; | |
fc512014 XL |
571 | let zero = _mm256_setzero_si256().as_i8x32(); |
572 | let x = x.as_i8x32(); | |
573 | let a = a.as_i8x32(); | |
17df50a5 | 574 | let r = vgf2p8affineinvqb_256(x, a, b); |
fc512014 XL |
575 | transmute(simd_select_bitmask(k, r, zero)) |
576 | } | |
577 | ||
578 | /// Performs an affine transformation on the inverted packed bytes in x. | |
579 | /// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix | |
580 | /// and b being a constant 8-bit immediate value. | |
581 | /// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1. | |
582 | /// The inverse of 0 is 0. | |
583 | /// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a. | |
584 | /// | |
585 | /// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set. | |
586 | /// Otherwise the computation result is written into the result. | |
587 | /// | |
588 | /// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_gf2p8affineinv_epi64_epi8) | |
589 | #[inline] | |
590 | #[target_feature(enable = "avx512gfni,avx512bw,avx512vl")] | |
17df50a5 XL |
591 | #[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))] |
592 | #[rustc_legacy_const_generics(4)] | |
593 | pub unsafe fn _mm256_mask_gf2p8affineinv_epi64_epi8<const B: i32>( | |
fc512014 XL |
594 | src: __m256i, |
595 | k: __mmask32, | |
596 | x: __m256i, | |
597 | a: __m256i, | |
fc512014 | 598 | ) -> __m256i { |
17df50a5 XL |
599 | static_assert_imm8!(B); |
600 | let b = B as u8; | |
fc512014 XL |
601 | let x = x.as_i8x32(); |
602 | let a = a.as_i8x32(); | |
17df50a5 | 603 | let r = vgf2p8affineinvqb_256(x, a, b); |
fc512014 XL |
604 | transmute(simd_select_bitmask(k, r, src.as_i8x32())) |
605 | } | |
606 | ||
/// Performs an affine transformation on the inverted packed bytes in x.
/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
/// and b being a constant 8-bit immediate value.
/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
/// The inverse of 0 is 0.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_gf2p8affineinv_epi64_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_gf2p8affineinv_epi64_epi8<const B: i32>(x: __m128i, a: __m128i) -> __m128i {
    // The constant term must fit in the instruction's 8-bit immediate field.
    static_assert_imm8!(B);
    let b = B as u8;
    let x = x.as_i8x16();
    let a = a.as_i8x16();
    let r = vgf2p8affineinvqb_128(x, a, b);
    // Unmasked form: the raw instruction result is returned as-is.
    transmute(r)
}
627 | ||
/// Performs an affine transformation on the inverted packed bytes in x.
/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
/// and b being a constant 8-bit immediate value.
/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
/// The inverse of 0 is 0.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_gf2p8affineinv_epi64_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_maskz_gf2p8affineinv_epi64_epi8<const B: i32>(
    k: __mmask16,
    x: __m128i,
    a: __m128i,
) -> __m128i {
    // The constant term must fit in the instruction's 8-bit immediate field.
    static_assert_imm8!(B);
    let b = B as u8;
    let zero = _mm_setzero_si128().as_i8x16();
    let x = x.as_i8x16();
    let a = a.as_i8x16();
    let r = vgf2p8affineinvqb_128(x, a, b);
    // Zeromask: take the computed byte where the k bit is set, zero otherwise.
    transmute(simd_select_bitmask(k, r, zero))
}
656 | ||
/// Performs an affine transformation on the inverted packed bytes in x.
/// That is computes a*inv(x)+b over the Galois Field 2^8 for each packed byte with a being a 8x8 bit matrix
/// and b being a constant 8-bit immediate value.
/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
/// The inverse of 0 is 0.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
/// Otherwise the computation result is written into the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_gf2p8affineinv_epi64_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_gf2p8affineinv_epi64_epi8<const B: i32>(
    src: __m128i,
    k: __mmask16,
    x: __m128i,
    a: __m128i,
) -> __m128i {
    // The constant term must fit in the instruction's 8-bit immediate field.
    static_assert_imm8!(B);
    let b = B as u8;
    let x = x.as_i8x16();
    let a = a.as_i8x16();
    let r = vgf2p8affineinvqb_128(x, a, b);
    // Per-byte writemask: take the computed byte where the k bit is set,
    // otherwise keep the corresponding byte from src.
    transmute(simd_select_bitmask(k, r, src.as_i8x16()))
}
685 | ||
686 | #[cfg(test)] | |
687 | mod tests { | |
688 | // The constants in the tests below are just bit patterns. They should not | |
689 | // be interpreted as integers; signedness does not make sense for them, but | |
690 | // __mXXXi happens to be defined in terms of signed integers. | |
691 | #![allow(overflowing_literals)] | |
692 | ||
693 | use core::hint::black_box; | |
694 | use core::intrinsics::size_of; | |
695 | use stdarch_test::simd_test; | |
696 | ||
697 | use crate::core_arch::x86::*; | |
698 | ||
699 | fn mulbyte(left: u8, right: u8) -> u8 { | |
700 | // this implementation follows the description in | |
701 | // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_gf2p8mul_epi8 | |
702 | const REDUCTION_POLYNOMIAL: u16 = 0x11b; | |
703 | let left: u16 = left.into(); | |
704 | let right: u16 = right.into(); | |
705 | let mut carryless_product: u16 = 0; | |
706 | ||
707 | // Carryless multiplication | |
708 | for i in 0..8 { | |
709 | if ((left >> i) & 0x01) != 0 { | |
710 | carryless_product ^= right << i; | |
711 | } | |
712 | } | |
713 | ||
714 | // reduction, adding in "0" where appropriate to clear out high bits | |
715 | // note that REDUCTION_POLYNOMIAL is zero in this context | |
716 | for i in (8..=14).rev() { | |
717 | if ((carryless_product >> i) & 0x01) != 0 { | |
718 | carryless_product ^= REDUCTION_POLYNOMIAL << (i - 8); | |
719 | } | |
720 | } | |
721 | ||
722 | carryless_product as u8 | |
723 | } | |
724 | ||
    // Sizes of the generated test data, expressed as counts of SIMD words of
    // each width. All widths cover the same NUM_TEST_ENTRIES bytes in total.
    const NUM_TEST_WORDS_512: usize = 4;
    const NUM_TEST_WORDS_256: usize = NUM_TEST_WORDS_512 * 2;
    const NUM_TEST_WORDS_128: usize = NUM_TEST_WORDS_256 * 2;
    // Total number of test bytes (4 x 64 = 256).
    const NUM_TEST_ENTRIES: usize = NUM_TEST_WORDS_512 * 64;
    // Number of 64-bit words (one affine matrix per 8 test bytes).
    const NUM_TEST_WORDS_64: usize = NUM_TEST_WORDS_128 * 2;
    // The inversion/S-box tests enumerate every possible byte value once.
    const NUM_BYTES: usize = 256;
    const NUM_BYTES_WORDS_128: usize = NUM_BYTES / 16;
    const NUM_BYTES_WORDS_256: usize = NUM_BYTES_WORDS_128 / 2;
    const NUM_BYTES_WORDS_512: usize = NUM_BYTES_WORDS_256 / 2;
734 | ||
735 | fn parity(input: u8) -> u8 { | |
736 | let mut accumulator = 0; | |
737 | for i in 0..8 { | |
738 | accumulator ^= (input >> i) & 0x01; | |
739 | } | |
740 | accumulator | |
741 | } | |
742 | ||
743 | fn mat_vec_multiply_affine(matrix: u64, x: u8, b: u8) -> u8 { | |
744 | // this implementation follows the description in | |
745 | // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_gf2p8affine_epi64_epi8 | |
746 | let mut accumulator = 0; | |
747 | ||
748 | for bit in 0..8 { | |
749 | accumulator |= parity(x & matrix.to_le_bytes()[bit]) << (7 - bit); | |
750 | } | |
751 | ||
752 | accumulator ^ b | |
753 | } | |
754 | ||
    /// Builds matched (matrix, input-byte, expected-result) arrays for the
    /// affine-transform tests, with `mat_vec_multiply_affine` as the scalar
    /// reference and `immediate` as the constant term of every transform.
    fn generate_affine_mul_test_data(
        immediate: u8,
    ) -> (
        [u64; NUM_TEST_WORDS_64],
        [u8; NUM_TEST_ENTRIES],
        [u8; NUM_TEST_ENTRIES],
    ) {
        let mut left: [u64; NUM_TEST_WORDS_64] = [0; NUM_TEST_WORDS_64];
        let mut right: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES];
        let mut result: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES];

        for i in 0..NUM_TEST_WORDS_64 {
            // Arbitrary deterministic bit pattern for the 8x8 matrix.
            left[i] = (i as u64) * 103 * 101;
            // Each 64-bit matrix is paired with the 8 bytes at the same position,
            // matching the instruction's per-qword pairing.
            for j in 0..8 {
                let j64 = j as u64;
                right[i * 8 + j] = ((left[i] + j64) % 256) as u8;
                result[i * 8 + j] = mat_vec_multiply_affine(left[i], right[i * 8 + j], immediate);
            }
        }

        (left, right, result)
    }
777 | ||
778 | fn generate_inv_tests_data() -> ([u8; NUM_BYTES], [u8; NUM_BYTES]) { | |
779 | let mut input: [u8; NUM_BYTES] = [0; NUM_BYTES]; | |
780 | let mut result: [u8; NUM_BYTES] = [0; NUM_BYTES]; | |
781 | ||
782 | for i in 0..NUM_BYTES { | |
783 | input[i] = (i % 256) as u8; | |
784 | result[i] = if i == 0 { 0 } else { 1 }; | |
785 | } | |
786 | ||
787 | (input, result) | |
788 | } | |
789 | ||
    // The forward AES (Rijndael) S-box for all 256 byte values; used below to
    // cross-check the affine-inverse intrinsics (the S-box is exactly
    // inv(x) transformed by matrix 0xF1_E3_C7_8F_1F_3E_7C_F8 with constant 0x63).
    const AES_S_BOX: [u8; NUM_BYTES] = [
        0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab,
        0x76, 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4,
        0x72, 0xc0, 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71,
        0xd8, 0x31, 0x15, 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2,
        0xeb, 0x27, 0xb2, 0x75, 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6,
        0xb3, 0x29, 0xe3, 0x2f, 0x84, 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb,
        0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45,
        0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5,
        0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44,
        0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a,
        0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, 0xe0, 0x32, 0x3a, 0x0a, 0x49,
        0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, 0xe7, 0xc8, 0x37, 0x6d,
        0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, 0xba, 0x78, 0x25,
        0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, 0x70, 0x3e,
        0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, 0xe1,
        0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
        0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb,
        0x16,
    ];
810 | ||
811 | fn generate_byte_mul_test_data() -> ( | |
812 | [u8; NUM_TEST_ENTRIES], | |
813 | [u8; NUM_TEST_ENTRIES], | |
814 | [u8; NUM_TEST_ENTRIES], | |
815 | ) { | |
816 | let mut left: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES]; | |
817 | let mut right: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES]; | |
818 | let mut result: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES]; | |
819 | ||
820 | for i in 0..NUM_TEST_ENTRIES { | |
821 | left[i] = (i % 256) as u8; | |
822 | right[i] = left[i] * 101; | |
823 | result[i] = mulbyte(left[i], right[i]); | |
824 | } | |
825 | ||
826 | (left, right, result) | |
827 | } | |
828 | ||
    /// Loads the `word_index`-th 128-bit word from `data` (unaligned load).
    #[target_feature(enable = "sse2")]
    unsafe fn load_m128i_word<T>(data: &[T], word_index: usize) -> __m128i {
        // Despite the name, this is an element offset: 16 bytes per word,
        // expressed in units of T.
        let byte_offset = word_index * 16 / size_of::<T>();
        let pointer = data.as_ptr().offset(byte_offset as isize) as *const __m128i;
        _mm_loadu_si128(black_box(pointer))
    }
835 | ||
    /// Loads the `word_index`-th 256-bit word from `data` (unaligned load).
    #[target_feature(enable = "avx")]
    unsafe fn load_m256i_word<T>(data: &[T], word_index: usize) -> __m256i {
        // Despite the name, this is an element offset: 32 bytes per word,
        // expressed in units of T.
        let byte_offset = word_index * 32 / size_of::<T>();
        let pointer = data.as_ptr().offset(byte_offset as isize) as *const __m256i;
        _mm256_loadu_si256(black_box(pointer))
    }
842 | ||
    /// Loads the `word_index`-th 512-bit word from `data` (unaligned load).
    #[target_feature(enable = "avx512f")]
    unsafe fn load_m512i_word<T>(data: &[T], word_index: usize) -> __m512i {
        // Despite the name, this is an element offset: 64 bytes per word,
        // expressed in units of T. `_mm512_loadu_si512` takes a *const i32.
        let byte_offset = word_index * 64 / size_of::<T>();
        let pointer = data.as_ptr().offset(byte_offset as isize) as *const i32;
        _mm512_loadu_si512(black_box(pointer))
    }
849 | ||
    // Checks the 512-bit GF(2^8) multiply against the scalar reference.
    #[simd_test(enable = "avx512gfni,avx512bw")]
    unsafe fn test_mm512_gf2p8mul_epi8() {
        let (left, right, expected) = generate_byte_mul_test_data();

        for i in 0..NUM_TEST_WORDS_512 {
            let left = load_m512i_word(&left, i);
            let right = load_m512i_word(&right, i);
            let expected = load_m512i_word(&expected, i);
            let result = _mm512_gf2p8mul_epi8(left, right);
            assert_eq_m512i(result, expected);
        }
    }
862 | ||
    // Checks the zero-masked 512-bit GF(2^8) multiply against an epi32 blend
    // of zeros and the unmasked result.
    #[simd_test(enable = "avx512gfni,avx512bw")]
    unsafe fn test_mm512_maskz_gf2p8mul_epi8() {
        let (left, right, _expected) = generate_byte_mul_test_data();

        for i in 0..NUM_TEST_WORDS_512 {
            let left = load_m512i_word(&left, i);
            let right = load_m512i_word(&right, i);
            // An all-zero mask must zero every byte of the result.
            let result_zero = _mm512_maskz_gf2p8mul_epi8(0, left, right);
            assert_eq_m512i(result_zero, _mm512_setzero_si512());
            let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
            // Same lane selection as mask_bytes at epi32 granularity
            // (one word bit per group of 4 byte bits).
            let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
            let expected_result = _mm512_gf2p8mul_epi8(left, right);
            let result_masked = _mm512_maskz_gf2p8mul_epi8(mask_bytes, left, right);
            let expected_masked =
                _mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result);
            assert_eq_m512i(result_masked, expected_masked);
        }
    }
881 | ||
    // Checks the write-masked 512-bit GF(2^8) multiply against an epi32 blend
    // of src and the unmasked result.
    #[simd_test(enable = "avx512gfni,avx512bw")]
    unsafe fn test_mm512_mask_gf2p8mul_epi8() {
        let (left, right, _expected) = generate_byte_mul_test_data();

        for i in 0..NUM_TEST_WORDS_512 {
            let left = load_m512i_word(&left, i);
            let right = load_m512i_word(&right, i);
            // An all-zero mask must leave src (here: left) intact.
            let result_left = _mm512_mask_gf2p8mul_epi8(left, 0, left, right);
            assert_eq_m512i(result_left, left);
            let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
            // Same lane selection as mask_bytes at epi32 granularity.
            let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
            let expected_result = _mm512_gf2p8mul_epi8(left, right);
            let result_masked = _mm512_mask_gf2p8mul_epi8(left, mask_bytes, left, right);
            let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result);
            assert_eq_m512i(result_masked, expected_masked);
        }
    }
899 | ||
    // Checks the 256-bit GF(2^8) multiply against the scalar reference.
    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
    unsafe fn test_mm256_gf2p8mul_epi8() {
        let (left, right, expected) = generate_byte_mul_test_data();

        for i in 0..NUM_TEST_WORDS_256 {
            let left = load_m256i_word(&left, i);
            let right = load_m256i_word(&right, i);
            let expected = load_m256i_word(&expected, i);
            let result = _mm256_gf2p8mul_epi8(left, right);
            assert_eq_m256i(result, expected);
        }
    }
912 | ||
    // Checks the zero-masked 256-bit GF(2^8) multiply against an epi32 blend
    // of zeros and the unmasked result.
    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
    unsafe fn test_mm256_maskz_gf2p8mul_epi8() {
        let (left, right, _expected) = generate_byte_mul_test_data();

        for i in 0..NUM_TEST_WORDS_256 {
            let left = load_m256i_word(&left, i);
            let right = load_m256i_word(&right, i);
            // An all-zero mask must zero every byte of the result.
            let result_zero = _mm256_maskz_gf2p8mul_epi8(0, left, right);
            assert_eq_m256i(result_zero, _mm256_setzero_si256());
            let mask_bytes: __mmask32 = 0x0F_F0_FF_00;
            // Same lane selection as mask_bytes at epi32 granularity.
            const MASK_WORDS: i32 = 0b01_10_11_00;
            let expected_result = _mm256_gf2p8mul_epi8(left, right);
            let result_masked = _mm256_maskz_gf2p8mul_epi8(mask_bytes, left, right);
            let expected_masked =
                _mm256_blend_epi32::<MASK_WORDS>(_mm256_setzero_si256(), expected_result);
            assert_eq_m256i(result_masked, expected_masked);
        }
    }
931 | ||
    // Checks the write-masked 256-bit GF(2^8) multiply against an epi32 blend
    // of src and the unmasked result.
    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
    unsafe fn test_mm256_mask_gf2p8mul_epi8() {
        let (left, right, _expected) = generate_byte_mul_test_data();

        for i in 0..NUM_TEST_WORDS_256 {
            let left = load_m256i_word(&left, i);
            let right = load_m256i_word(&right, i);
            // An all-zero mask must leave src (here: left) intact.
            let result_left = _mm256_mask_gf2p8mul_epi8(left, 0, left, right);
            assert_eq_m256i(result_left, left);
            let mask_bytes: __mmask32 = 0x0F_F0_FF_00;
            // Same lane selection as mask_bytes at epi32 granularity.
            const MASK_WORDS: i32 = 0b01_10_11_00;
            let expected_result = _mm256_gf2p8mul_epi8(left, right);
            let result_masked = _mm256_mask_gf2p8mul_epi8(left, mask_bytes, left, right);
            let expected_masked = _mm256_blend_epi32::<MASK_WORDS>(left, expected_result);
            assert_eq_m256i(result_masked, expected_masked);
        }
    }
949 | ||
    // Checks the 128-bit GF(2^8) multiply against the scalar reference.
    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
    unsafe fn test_mm_gf2p8mul_epi8() {
        let (left, right, expected) = generate_byte_mul_test_data();

        for i in 0..NUM_TEST_WORDS_128 {
            let left = load_m128i_word(&left, i);
            let right = load_m128i_word(&right, i);
            let expected = load_m128i_word(&expected, i);
            let result = _mm_gf2p8mul_epi8(left, right);
            assert_eq_m128i(result, expected);
        }
    }
962 | ||
    // Checks the zero-masked 128-bit GF(2^8) multiply against an epi32 blend
    // of zeros and the unmasked result.
    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
    unsafe fn test_mm_maskz_gf2p8mul_epi8() {
        let (left, right, _expected) = generate_byte_mul_test_data();

        for i in 0..NUM_TEST_WORDS_128 {
            let left = load_m128i_word(&left, i);
            let right = load_m128i_word(&right, i);
            // An all-zero mask must zero every byte of the result.
            let result_zero = _mm_maskz_gf2p8mul_epi8(0, left, right);
            assert_eq_m128i(result_zero, _mm_setzero_si128());
            let mask_bytes: __mmask16 = 0x0F_F0;
            // Same lane selection as mask_bytes at epi32 granularity.
            const MASK_WORDS: i32 = 0b01_10;
            let expected_result = _mm_gf2p8mul_epi8(left, right);
            let result_masked = _mm_maskz_gf2p8mul_epi8(mask_bytes, left, right);
            let expected_masked =
                _mm_blend_epi32::<MASK_WORDS>(_mm_setzero_si128(), expected_result);
            assert_eq_m128i(result_masked, expected_masked);
        }
    }
981 | ||
    // Checks the write-masked 128-bit GF(2^8) multiply against an epi32 blend
    // of src and the unmasked result.
    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
    unsafe fn test_mm_mask_gf2p8mul_epi8() {
        let (left, right, _expected) = generate_byte_mul_test_data();

        for i in 0..NUM_TEST_WORDS_128 {
            let left = load_m128i_word(&left, i);
            let right = load_m128i_word(&right, i);
            // An all-zero mask must leave src (here: left) intact.
            let result_left = _mm_mask_gf2p8mul_epi8(left, 0, left, right);
            assert_eq_m128i(result_left, left);
            let mask_bytes: __mmask16 = 0x0F_F0;
            // Same lane selection as mask_bytes at epi32 granularity.
            const MASK_WORDS: i32 = 0b01_10;
            let expected_result = _mm_gf2p8mul_epi8(left, right);
            let result_masked = _mm_mask_gf2p8mul_epi8(left, mask_bytes, left, right);
            let expected_masked = _mm_blend_epi32::<MASK_WORDS>(left, expected_result);
            assert_eq_m128i(result_masked, expected_masked);
        }
    }
999 | ||
    // Checks the 512-bit affine transform via the identity matrix, the zero
    // matrix plus constant, and the scalar reference implementation.
    #[simd_test(enable = "avx512gfni,avx512bw")]
    unsafe fn test_mm512_gf2p8affine_epi64_epi8() {
        // 0x01_02_04_08_10_20_40_80 is the identity bit-matrix of the transform.
        let identity: i64 = 0x01_02_04_08_10_20_40_80;
        const IDENTITY_BYTE: i32 = 0;
        let constant: i64 = 0;
        const CONSTANT_BYTE: i32 = 0x63;
        let identity = _mm512_set1_epi64(identity);
        let constant = _mm512_set1_epi64(constant);
        let constant_reference = _mm512_set1_epi8(CONSTANT_BYTE as i8);

        let (bytes, more_bytes, _) = generate_byte_mul_test_data();
        let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_512 {
            let data = load_m512i_word(&bytes, i);
            // identity matrix + zero constant leaves the data unchanged
            let result = _mm512_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
            assert_eq_m512i(result, data);
            // zero matrix + constant yields the constant in every byte
            let result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
            assert_eq_m512i(result, constant_reference);
            let data = load_m512i_word(&more_bytes, i);
            let result = _mm512_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
            assert_eq_m512i(result, data);
            let result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
            assert_eq_m512i(result, constant_reference);

            // cross-check arbitrary matrices against the scalar reference
            let matrix = load_m512i_word(&matrices, i);
            let vector = load_m512i_word(&vectors, i);
            let reference = load_m512i_word(&references, i);

            let result = _mm512_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(vector, matrix);
            assert_eq_m512i(result, reference);
        }
    }
1033 | ||
    // Checks the zero-masked 512-bit affine transform against an epi32 blend
    // of zeros and the unmasked result.
    #[simd_test(enable = "avx512gfni,avx512bw")]
    unsafe fn test_mm512_maskz_gf2p8affine_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_512 {
            let matrix = load_m512i_word(&matrices, i);
            let vector = load_m512i_word(&vectors, i);
            // An all-zero mask must zero every byte of the result.
            let result_zero =
                _mm512_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
            assert_eq_m512i(result_zero, _mm512_setzero_si512());
            let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
            // Same lane selection as mask_bytes at epi32 granularity.
            let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
            let expected_result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
            let result_masked =
                _mm512_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
            let expected_masked =
                _mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result);
            assert_eq_m512i(result_masked, expected_masked);
        }
    }
1055 | ||
    // Checks the write-masked 512-bit affine transform against an epi32 blend
    // of src and the unmasked result.
    #[simd_test(enable = "avx512gfni,avx512bw")]
    unsafe fn test_mm512_mask_gf2p8affine_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_512 {
            let left = load_m512i_word(&vectors, i);
            let right = load_m512i_word(&matrices, i);
            // An all-zero mask must leave src (here: left) intact.
            let result_left =
                _mm512_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
            assert_eq_m512i(result_left, left);
            let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
            // Same lane selection as mask_bytes at epi32 granularity.
            let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
            let expected_result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, right);
            let result_masked =
                _mm512_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right);
            let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result);
            assert_eq_m512i(result_masked, expected_masked);
        }
    }
1076 | ||
    // Checks the 256-bit affine transform via the identity matrix, the zero
    // matrix plus constant, and the scalar reference implementation.
    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
    unsafe fn test_mm256_gf2p8affine_epi64_epi8() {
        // 0x01_02_04_08_10_20_40_80 is the identity bit-matrix of the transform.
        let identity: i64 = 0x01_02_04_08_10_20_40_80;
        const IDENTITY_BYTE: i32 = 0;
        let constant: i64 = 0;
        const CONSTANT_BYTE: i32 = 0x63;
        let identity = _mm256_set1_epi64x(identity);
        let constant = _mm256_set1_epi64x(constant);
        let constant_reference = _mm256_set1_epi8(CONSTANT_BYTE as i8);

        let (bytes, more_bytes, _) = generate_byte_mul_test_data();
        let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_256 {
            let data = load_m256i_word(&bytes, i);
            // identity matrix + zero constant leaves the data unchanged
            let result = _mm256_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
            assert_eq_m256i(result, data);
            // zero matrix + constant yields the constant in every byte
            let result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
            assert_eq_m256i(result, constant_reference);
            let data = load_m256i_word(&more_bytes, i);
            let result = _mm256_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
            assert_eq_m256i(result, data);
            let result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
            assert_eq_m256i(result, constant_reference);

            // cross-check arbitrary matrices against the scalar reference
            let matrix = load_m256i_word(&matrices, i);
            let vector = load_m256i_word(&vectors, i);
            let reference = load_m256i_word(&references, i);

            let result = _mm256_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(vector, matrix);
            assert_eq_m256i(result, reference);
        }
    }
1110 | ||
    // Checks the zero-masked 256-bit affine transform against an epi32 blend
    // of zeros and the unmasked result.
    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
    unsafe fn test_mm256_maskz_gf2p8affine_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_256 {
            let matrix = load_m256i_word(&matrices, i);
            let vector = load_m256i_word(&vectors, i);
            // An all-zero mask must zero every byte of the result.
            let result_zero =
                _mm256_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
            assert_eq_m256i(result_zero, _mm256_setzero_si256());
            let mask_bytes: __mmask32 = 0xFF_0F_F0_00;
            // Same lane selection as mask_bytes at epi32 granularity.
            const MASK_WORDS: i32 = 0b11_01_10_00;
            let expected_result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
            let result_masked =
                _mm256_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
            let expected_masked =
                _mm256_blend_epi32::<MASK_WORDS>(_mm256_setzero_si256(), expected_result);
            assert_eq_m256i(result_masked, expected_masked);
        }
    }
1132 | ||
    // Checks the write-masked 256-bit affine transform against an epi32 blend
    // of src and the unmasked result.
    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
    unsafe fn test_mm256_mask_gf2p8affine_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_256 {
            let left = load_m256i_word(&vectors, i);
            let right = load_m256i_word(&matrices, i);
            // An all-zero mask must leave src (here: left) intact.
            let result_left =
                _mm256_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
            assert_eq_m256i(result_left, left);
            let mask_bytes: __mmask32 = 0xFF_0F_F0_00;
            // Same lane selection as mask_bytes at epi32 granularity.
            const MASK_WORDS: i32 = 0b11_01_10_00;
            let expected_result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, right);
            let result_masked =
                _mm256_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right);
            let expected_masked = _mm256_blend_epi32::<MASK_WORDS>(left, expected_result);
            assert_eq_m256i(result_masked, expected_masked);
        }
    }
1153 | ||
    // Checks the 128-bit affine transform via the identity matrix, the zero
    // matrix plus constant, and the scalar reference implementation.
    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
    unsafe fn test_mm_gf2p8affine_epi64_epi8() {
        // 0x01_02_04_08_10_20_40_80 is the identity bit-matrix of the transform.
        let identity: i64 = 0x01_02_04_08_10_20_40_80;
        const IDENTITY_BYTE: i32 = 0;
        let constant: i64 = 0;
        const CONSTANT_BYTE: i32 = 0x63;
        let identity = _mm_set1_epi64x(identity);
        let constant = _mm_set1_epi64x(constant);
        let constant_reference = _mm_set1_epi8(CONSTANT_BYTE as i8);

        let (bytes, more_bytes, _) = generate_byte_mul_test_data();
        let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_128 {
            let data = load_m128i_word(&bytes, i);
            // identity matrix + zero constant leaves the data unchanged
            let result = _mm_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
            assert_eq_m128i(result, data);
            // zero matrix + constant yields the constant in every byte
            let result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
            assert_eq_m128i(result, constant_reference);
            let data = load_m128i_word(&more_bytes, i);
            let result = _mm_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
            assert_eq_m128i(result, data);
            let result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
            assert_eq_m128i(result, constant_reference);

            // cross-check arbitrary matrices against the scalar reference
            let matrix = load_m128i_word(&matrices, i);
            let vector = load_m128i_word(&vectors, i);
            let reference = load_m128i_word(&references, i);

            let result = _mm_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(vector, matrix);
            assert_eq_m128i(result, reference);
        }
    }
1187 | ||
    // Checks the zero-masked 128-bit affine transform against an epi32 blend
    // of zeros and the unmasked result.
    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
    unsafe fn test_mm_maskz_gf2p8affine_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_128 {
            let matrix = load_m128i_word(&matrices, i);
            let vector = load_m128i_word(&vectors, i);
            // An all-zero mask must zero every byte of the result.
            let result_zero = _mm_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
            assert_eq_m128i(result_zero, _mm_setzero_si128());
            let mask_bytes: __mmask16 = 0x0F_F0;
            // Same lane selection as mask_bytes at epi32 granularity.
            const MASK_WORDS: i32 = 0b01_10;
            let expected_result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
            let result_masked =
                _mm_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
            let expected_masked =
                _mm_blend_epi32::<MASK_WORDS>(_mm_setzero_si128(), expected_result);
            assert_eq_m128i(result_masked, expected_masked);
        }
    }
1208 | ||
    // Checks the write-masked 128-bit affine transform against an epi32 blend
    // of src and the unmasked result.
    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
    unsafe fn test_mm_mask_gf2p8affine_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_128 {
            let left = load_m128i_word(&vectors, i);
            let right = load_m128i_word(&matrices, i);
            // An all-zero mask must leave src (here: left) intact.
            let result_left =
                _mm_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
            assert_eq_m128i(result_left, left);
            let mask_bytes: __mmask16 = 0x0F_F0;
            // Same lane selection as mask_bytes at epi32 granularity.
            const MASK_WORDS: i32 = 0b01_10;
            let expected_result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, right);
            let result_masked =
                _mm_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right);
            let expected_masked = _mm_blend_epi32::<MASK_WORDS>(left, expected_result);
            assert_eq_m128i(result_masked, expected_masked);
        }
    }
1229 | ||
    // Checks the 512-bit affine-inverse transform three ways: inversion via
    // re-multiplication, composition with the plain affine transform, and the
    // full AES S-box construction.
    #[simd_test(enable = "avx512gfni,avx512bw")]
    unsafe fn test_mm512_gf2p8affineinv_epi64_epi8() {
        // 0x01_02_04_08_10_20_40_80 is the identity bit-matrix of the transform.
        let identity: i64 = 0x01_02_04_08_10_20_40_80;
        const IDENTITY_BYTE: i32 = 0;
        const CONSTANT_BYTE: i32 = 0x63;
        let identity = _mm512_set1_epi64(identity);

        // validate inversion: inv(x) * x must be 1 for every x except 0
        let (inputs, results) = generate_inv_tests_data();

        for i in 0..NUM_BYTES_WORDS_512 {
            let input = load_m512i_word(&inputs, i);
            let reference = load_m512i_word(&results, i);
            let result = _mm512_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(input, identity);
            let remultiplied = _mm512_gf2p8mul_epi8(result, input);
            assert_eq_m512i(remultiplied, reference);
        }

        // validate subsequent affine operation:
        // affineinv(x, M, b) == affine(inv(x), M, b)
        let (matrices, vectors, _affine_expected) =
            generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_512 {
            let vector = load_m512i_word(&vectors, i);
            let matrix = load_m512i_word(&matrices, i);

            let inv_vec = _mm512_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(vector, identity);
            let reference = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(inv_vec, matrix);
            let result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
            assert_eq_m512i(result, reference);
        }

        // validate everything by virtue of checking against the AES SBox
        const AES_S_BOX_MATRIX: i64 = 0xF1_E3_C7_8F_1F_3E_7C_F8;
        let sbox_matrix = _mm512_set1_epi64(AES_S_BOX_MATRIX);

        for i in 0..NUM_BYTES_WORDS_512 {
            let reference = load_m512i_word(&AES_S_BOX, i);
            let input = load_m512i_word(&inputs, i);
            let result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(input, sbox_matrix);
            assert_eq_m512i(result, reference);
        }
    }
1273 | ||
1274 | #[simd_test(enable = "avx512gfni,avx512bw")] | |
1275 | unsafe fn test_mm512_maskz_gf2p8affineinv_epi64_epi8() { | |
1276 | const CONSTANT_BYTE: i32 = 0x63; | |
1277 | let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); | |
1278 | ||
1279 | for i in 0..NUM_TEST_WORDS_512 { | |
1280 | let matrix = load_m512i_word(&matrices, i); | |
1281 | let vector = load_m512i_word(&vectors, i); | |
1282 | let result_zero = | |
17df50a5 | 1283 | _mm512_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix); |
fc512014 XL |
1284 | assert_eq_m512i(result_zero, _mm512_setzero_si512()); |
1285 | let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00; | |
1286 | let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00; | |
17df50a5 | 1287 | let expected_result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix); |
fc512014 | 1288 | let result_masked = |
17df50a5 | 1289 | _mm512_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix); |
fc512014 XL |
1290 | let expected_masked = |
1291 | _mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result); | |
1292 | assert_eq_m512i(result_masked, expected_masked); | |
1293 | } | |
1294 | } | |
1295 | ||
1296 | #[simd_test(enable = "avx512gfni,avx512bw")] | |
1297 | unsafe fn test_mm512_mask_gf2p8affineinv_epi64_epi8() { | |
1298 | const CONSTANT_BYTE: i32 = 0x63; | |
1299 | let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); | |
1300 | ||
1301 | for i in 0..NUM_TEST_WORDS_512 { | |
1302 | let left = load_m512i_word(&vectors, i); | |
1303 | let right = load_m512i_word(&matrices, i); | |
1304 | let result_left = | |
17df50a5 | 1305 | _mm512_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right); |
fc512014 XL |
1306 | assert_eq_m512i(result_left, left); |
1307 | let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00; | |
1308 | let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00; | |
17df50a5 XL |
1309 | let expected_result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, right); |
1310 | let result_masked = _mm512_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>( | |
1311 | left, mask_bytes, left, right, | |
1312 | ); | |
fc512014 XL |
1313 | let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result); |
1314 | assert_eq_m512i(result_masked, expected_masked); | |
1315 | } | |
1316 | } | |
1317 | ||
1318 | #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")] | |
1319 | unsafe fn test_mm256_gf2p8affineinv_epi64_epi8() { | |
1320 | let identity: i64 = 0x01_02_04_08_10_20_40_80; | |
1321 | const IDENTITY_BYTE: i32 = 0; | |
1322 | const CONSTANT_BYTE: i32 = 0x63; | |
1323 | let identity = _mm256_set1_epi64x(identity); | |
1324 | ||
1325 | // validate inversion | |
1326 | let (inputs, results) = generate_inv_tests_data(); | |
1327 | ||
1328 | for i in 0..NUM_BYTES_WORDS_256 { | |
1329 | let input = load_m256i_word(&inputs, i); | |
1330 | let reference = load_m256i_word(&results, i); | |
17df50a5 | 1331 | let result = _mm256_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(input, identity); |
fc512014 XL |
1332 | let remultiplied = _mm256_gf2p8mul_epi8(result, input); |
1333 | assert_eq_m256i(remultiplied, reference); | |
1334 | } | |
1335 | ||
1336 | // validate subsequent affine operation | |
1337 | let (matrices, vectors, _affine_expected) = | |
1338 | generate_affine_mul_test_data(CONSTANT_BYTE as u8); | |
1339 | ||
1340 | for i in 0..NUM_TEST_WORDS_256 { | |
1341 | let vector = load_m256i_word(&vectors, i); | |
1342 | let matrix = load_m256i_word(&matrices, i); | |
1343 | ||
17df50a5 XL |
1344 | let inv_vec = _mm256_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(vector, identity); |
1345 | let reference = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(inv_vec, matrix); | |
1346 | let result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix); | |
fc512014 XL |
1347 | assert_eq_m256i(result, reference); |
1348 | } | |
1349 | ||
1350 | // validate everything by virtue of checking against the AES SBox | |
1351 | const AES_S_BOX_MATRIX: i64 = 0xF1_E3_C7_8F_1F_3E_7C_F8; | |
1352 | let sbox_matrix = _mm256_set1_epi64x(AES_S_BOX_MATRIX); | |
1353 | ||
1354 | for i in 0..NUM_BYTES_WORDS_256 { | |
1355 | let reference = load_m256i_word(&AES_S_BOX, i); | |
1356 | let input = load_m256i_word(&inputs, i); | |
17df50a5 | 1357 | let result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(input, sbox_matrix); |
fc512014 XL |
1358 | assert_eq_m256i(result, reference); |
1359 | } | |
1360 | } | |
1361 | ||
1362 | #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")] | |
1363 | unsafe fn test_mm256_maskz_gf2p8affineinv_epi64_epi8() { | |
1364 | const CONSTANT_BYTE: i32 = 0x63; | |
1365 | let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); | |
1366 | ||
1367 | for i in 0..NUM_TEST_WORDS_256 { | |
1368 | let matrix = load_m256i_word(&matrices, i); | |
1369 | let vector = load_m256i_word(&vectors, i); | |
1370 | let result_zero = | |
17df50a5 | 1371 | _mm256_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix); |
fc512014 XL |
1372 | assert_eq_m256i(result_zero, _mm256_setzero_si256()); |
1373 | let mask_bytes: __mmask32 = 0xFF_0F_F0_00; | |
1374 | const MASK_WORDS: i32 = 0b11_01_10_00; | |
17df50a5 | 1375 | let expected_result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix); |
fc512014 | 1376 | let result_masked = |
17df50a5 | 1377 | _mm256_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix); |
fc512014 | 1378 | let expected_masked = |
17df50a5 | 1379 | _mm256_blend_epi32::<MASK_WORDS>(_mm256_setzero_si256(), expected_result); |
fc512014 XL |
1380 | assert_eq_m256i(result_masked, expected_masked); |
1381 | } | |
1382 | } | |
1383 | ||
1384 | #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")] | |
1385 | unsafe fn test_mm256_mask_gf2p8affineinv_epi64_epi8() { | |
1386 | const CONSTANT_BYTE: i32 = 0x63; | |
1387 | let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); | |
1388 | ||
1389 | for i in 0..NUM_TEST_WORDS_256 { | |
1390 | let left = load_m256i_word(&vectors, i); | |
1391 | let right = load_m256i_word(&matrices, i); | |
1392 | let result_left = | |
17df50a5 | 1393 | _mm256_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right); |
fc512014 XL |
1394 | assert_eq_m256i(result_left, left); |
1395 | let mask_bytes: __mmask32 = 0xFF_0F_F0_00; | |
1396 | const MASK_WORDS: i32 = 0b11_01_10_00; | |
17df50a5 XL |
1397 | let expected_result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, right); |
1398 | let result_masked = _mm256_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>( | |
1399 | left, mask_bytes, left, right, | |
1400 | ); | |
1401 | let expected_masked = _mm256_blend_epi32::<MASK_WORDS>(left, expected_result); | |
fc512014 XL |
1402 | assert_eq_m256i(result_masked, expected_masked); |
1403 | } | |
1404 | } | |
1405 | ||
1406 | #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")] | |
1407 | unsafe fn test_mm_gf2p8affineinv_epi64_epi8() { | |
1408 | let identity: i64 = 0x01_02_04_08_10_20_40_80; | |
1409 | const IDENTITY_BYTE: i32 = 0; | |
1410 | const CONSTANT_BYTE: i32 = 0x63; | |
1411 | let identity = _mm_set1_epi64x(identity); | |
1412 | ||
1413 | // validate inversion | |
1414 | let (inputs, results) = generate_inv_tests_data(); | |
1415 | ||
1416 | for i in 0..NUM_BYTES_WORDS_128 { | |
1417 | let input = load_m128i_word(&inputs, i); | |
1418 | let reference = load_m128i_word(&results, i); | |
17df50a5 | 1419 | let result = _mm_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(input, identity); |
fc512014 XL |
1420 | let remultiplied = _mm_gf2p8mul_epi8(result, input); |
1421 | assert_eq_m128i(remultiplied, reference); | |
1422 | } | |
1423 | ||
1424 | // validate subsequent affine operation | |
1425 | let (matrices, vectors, _affine_expected) = | |
1426 | generate_affine_mul_test_data(CONSTANT_BYTE as u8); | |
1427 | ||
1428 | for i in 0..NUM_TEST_WORDS_128 { | |
1429 | let vector = load_m128i_word(&vectors, i); | |
1430 | let matrix = load_m128i_word(&matrices, i); | |
1431 | ||
17df50a5 XL |
1432 | let inv_vec = _mm_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(vector, identity); |
1433 | let reference = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(inv_vec, matrix); | |
1434 | let result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix); | |
fc512014 XL |
1435 | assert_eq_m128i(result, reference); |
1436 | } | |
1437 | ||
1438 | // validate everything by virtue of checking against the AES SBox | |
1439 | const AES_S_BOX_MATRIX: i64 = 0xF1_E3_C7_8F_1F_3E_7C_F8; | |
1440 | let sbox_matrix = _mm_set1_epi64x(AES_S_BOX_MATRIX); | |
1441 | ||
1442 | for i in 0..NUM_BYTES_WORDS_128 { | |
1443 | let reference = load_m128i_word(&AES_S_BOX, i); | |
1444 | let input = load_m128i_word(&inputs, i); | |
17df50a5 | 1445 | let result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(input, sbox_matrix); |
fc512014 XL |
1446 | assert_eq_m128i(result, reference); |
1447 | } | |
1448 | } | |
1449 | ||
1450 | #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")] | |
1451 | unsafe fn test_mm_maskz_gf2p8affineinv_epi64_epi8() { | |
1452 | const CONSTANT_BYTE: i32 = 0x63; | |
1453 | let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); | |
1454 | ||
1455 | for i in 0..NUM_TEST_WORDS_128 { | |
1456 | let matrix = load_m128i_word(&matrices, i); | |
1457 | let vector = load_m128i_word(&vectors, i); | |
17df50a5 XL |
1458 | let result_zero = |
1459 | _mm_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix); | |
fc512014 XL |
1460 | assert_eq_m128i(result_zero, _mm_setzero_si128()); |
1461 | let mask_bytes: __mmask16 = 0x0F_F0; | |
1462 | const MASK_WORDS: i32 = 0b01_10; | |
17df50a5 | 1463 | let expected_result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix); |
fc512014 | 1464 | let result_masked = |
17df50a5 XL |
1465 | _mm_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix); |
1466 | let expected_masked = | |
1467 | _mm_blend_epi32::<MASK_WORDS>(_mm_setzero_si128(), expected_result); | |
fc512014 XL |
1468 | assert_eq_m128i(result_masked, expected_masked); |
1469 | } | |
1470 | } | |
1471 | ||
1472 | #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")] | |
1473 | unsafe fn test_mm_mask_gf2p8affineinv_epi64_epi8() { | |
1474 | const CONSTANT_BYTE: i32 = 0x63; | |
1475 | let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8); | |
1476 | ||
1477 | for i in 0..NUM_TEST_WORDS_128 { | |
1478 | let left = load_m128i_word(&vectors, i); | |
1479 | let right = load_m128i_word(&matrices, i); | |
1480 | let result_left = | |
17df50a5 | 1481 | _mm_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right); |
fc512014 XL |
1482 | assert_eq_m128i(result_left, left); |
1483 | let mask_bytes: __mmask16 = 0x0F_F0; | |
1484 | const MASK_WORDS: i32 = 0b01_10; | |
17df50a5 | 1485 | let expected_result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, right); |
fc512014 | 1486 | let result_masked = |
17df50a5 XL |
1487 | _mm_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right); |
1488 | let expected_masked = _mm_blend_epi32::<MASK_WORDS>(left, expected_result); | |
fc512014 XL |
1489 | assert_eq_m128i(result_masked, expected_masked); |
1490 | } | |
1491 | } | |
1492 | } |