#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

use crate::guts::{
    assemble_count, count_high, count_low, final_block, flag_word, input_debug_asserts, Finalize,
    Job, LastNode, Stride,
};
use crate::{Count, Word, BLOCKBYTES, IV, SIGMA};
use arrayref::{array_refs, mut_array_refs};
use core::cmp;
use core::mem;

pub const DEGREE: usize = 4;
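
// An AVX2 vector is 256 bits, i.e. four 64-bit BLAKE2b words, which is where
// the parallel degree of 4 comes from.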

#[inline(always)]
unsafe fn loadu(src: *const [Word; DEGREE]) -> __m256i {
    // This is an unaligned load, so the pointer cast is allowed.
    _mm256_loadu_si256(src as *const __m256i)
}

#[inline(always)]
unsafe fn storeu(src: __m256i, dest: *mut [Word; DEGREE]) {
    // This is an unaligned store, so the pointer cast is allowed.
    _mm256_storeu_si256(dest as *mut __m256i, src)
}

#[inline(always)]
unsafe fn loadu_128(mem_addr: &[u8; 16]) -> __m128i {
    _mm_loadu_si128(mem_addr.as_ptr() as *const __m128i)
}

#[inline(always)]
unsafe fn add(a: __m256i, b: __m256i) -> __m256i {
    _mm256_add_epi64(a, b)
}

#[inline(always)]
unsafe fn eq(a: __m256i, b: __m256i) -> __m256i {
    _mm256_cmpeq_epi64(a, b)
}

#[inline(always)]
unsafe fn and(a: __m256i, b: __m256i) -> __m256i {
    _mm256_and_si256(a, b)
}

#[inline(always)]
unsafe fn negate_and(a: __m256i, b: __m256i) -> __m256i {
    // Note that "and not" implies the reverse of the actual arg order.
    // That is, this returns !a & b.
    _mm256_andnot_si256(a, b)
}

#[inline(always)]
unsafe fn xor(a: __m256i, b: __m256i) -> __m256i {
    _mm256_xor_si256(a, b)
}

#[inline(always)]
unsafe fn set1(x: u64) -> __m256i {
    _mm256_set1_epi64x(x as i64)
}

#[inline(always)]
unsafe fn set4(a: u64, b: u64, c: u64, d: u64) -> __m256i {
    _mm256_setr_epi64x(a as i64, b as i64, c as i64, d as i64)
}

// Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479.
macro_rules! _MM_SHUFFLE {
    ($z:expr, $y:expr, $x:expr, $w:expr) => {
        ($z << 6) | ($y << 4) | ($x << 2) | $w
    };
}
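
// For example, _MM_SHUFFLE!(2, 1, 0, 3) evaluates to
// (2 << 6) | (1 << 4) | (0 << 2) | 3 = 0b10_01_00_11 = 0x93, i.e. "take source
// lane 3 for destination lane 0, lane 0 for lane 1, lane 1 for lane 2, and
// lane 2 for lane 3".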

// These rotations are the "simple version". For the "complicated version", see
// https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2b-common.h#L43-L46.
// For a discussion of the tradeoffs, see
// https://github.com/sneves/blake2-avx2/pull/5. In short:
// - This version performs better on modern x86 chips, Skylake and later.
// - LLVM is able to optimize this version to AVX-512 rotation instructions
//   when those are enabled.
#[inline(always)]
unsafe fn rot32(x: __m256i) -> __m256i {
    _mm256_or_si256(_mm256_srli_epi64(x, 32), _mm256_slli_epi64(x, 64 - 32))
}

#[inline(always)]
unsafe fn rot24(x: __m256i) -> __m256i {
    _mm256_or_si256(_mm256_srli_epi64(x, 24), _mm256_slli_epi64(x, 64 - 24))
}

#[inline(always)]
unsafe fn rot16(x: __m256i) -> __m256i {
    _mm256_or_si256(_mm256_srli_epi64(x, 16), _mm256_slli_epi64(x, 64 - 16))
}

#[inline(always)]
unsafe fn rot63(x: __m256i) -> __m256i {
    _mm256_or_si256(_mm256_srli_epi64(x, 63), _mm256_slli_epi64(x, 64 - 63))
}
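
// A sanity-check sketch (an addition to this file, not upstream code): each
// rotN helper should match u64::rotate_right(N) in every lane. This assumes a
// test environment with std available, and it skips when AVX2 is missing.
#[cfg(test)]
#[test]
fn test_rotations_match_scalar() {
    if !is_x86_feature_detected!("avx2") {
        return;
    }
    unsafe {
        let words: [Word; DEGREE] = [1, 0xdead_beef, !0, 42];
        let vec = loadu(&words);
        for &(rot, n) in &[
            (rot32 as unsafe fn(__m256i) -> __m256i, 32),
            (rot24, 24),
            (rot16, 16),
            (rot63, 63),
        ] {
            let mut out = [0; DEGREE];
            storeu(rot(vec), &mut out);
            for i in 0..DEGREE {
                assert_eq!(words[i].rotate_right(n), out[i]);
            }
        }
    }
}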

#[inline(always)]
unsafe fn g1(
    a: &mut __m256i,
    b: &mut __m256i,
    c: &mut __m256i,
    d: &mut __m256i,
    m: &mut __m256i,
) {
    *a = add(*a, *m);
    *a = add(*a, *b);
    *d = xor(*d, *a);
    *d = rot32(*d);
    *c = add(*c, *d);
    *b = xor(*b, *c);
    *b = rot24(*b);
}

#[inline(always)]
unsafe fn g2(
    a: &mut __m256i,
    b: &mut __m256i,
    c: &mut __m256i,
    d: &mut __m256i,
    m: &mut __m256i,
) {
    *a = add(*a, *m);
    *a = add(*a, *b);
    *d = xor(*d, *a);
    *d = rot16(*d);
    *c = add(*c, *d);
    *b = xor(*b, *c);
    *b = rot63(*b);
}

// Note the optimization here of leaving b as the unrotated row, rather than a.
// All the message loads below are adjusted to compensate for this. See
// discussion at https://github.com/sneves/blake2-avx2/pull/4.
#[inline(always)]
unsafe fn diagonalize(a: &mut __m256i, _b: &mut __m256i, c: &mut __m256i, d: &mut __m256i) {
    *a = _mm256_permute4x64_epi64(*a, _MM_SHUFFLE!(2, 1, 0, 3));
    *d = _mm256_permute4x64_epi64(*d, _MM_SHUFFLE!(1, 0, 3, 2));
    *c = _mm256_permute4x64_epi64(*c, _MM_SHUFFLE!(0, 3, 2, 1));
}

#[inline(always)]
unsafe fn undiagonalize(a: &mut __m256i, _b: &mut __m256i, c: &mut __m256i, d: &mut __m256i) {
    *a = _mm256_permute4x64_epi64(*a, _MM_SHUFFLE!(0, 3, 2, 1));
    *d = _mm256_permute4x64_epi64(*d, _MM_SHUFFLE!(1, 0, 3, 2));
    *c = _mm256_permute4x64_epi64(*c, _MM_SHUFFLE!(2, 1, 0, 3));
}
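
// As an illustration (an added note): with d = [d0, d1, d2, d3],
// _MM_SHUFFLE!(1, 0, 3, 2) produces [d2, d3, d0, d1], a rotation by two lanes,
// which is its own inverse. That's why the same shuffle of d appears in both
// diagonalize and undiagonalize, while the shuffles of a and c swap roles.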

#[target_feature(enable = "avx2")]
unsafe fn compress_block(
    block: &[u8; BLOCKBYTES],
    words: &mut [Word; 8],
    count: Count,
    last_block: Word,
    last_node: Word,
) {
    let (words_low, words_high) = mut_array_refs!(words, DEGREE, DEGREE);
    let (iv_low, iv_high) = array_refs!(&IV, DEGREE, DEGREE);
    let mut a = loadu(words_low);
    let mut b = loadu(words_high);
    let mut c = loadu(iv_low);
    let flags = set4(count_low(count), count_high(count), last_block, last_node);
    let mut d = xor(loadu(iv_high), flags);

    let msg_chunks = array_refs!(block, 16, 16, 16, 16, 16, 16, 16, 16);
    let m0 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.0));
    let m1 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.1));
    let m2 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.2));
    let m3 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.3));
    let m4 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.4));
    let m5 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.5));
    let m6 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.6));
    let m7 = _mm256_broadcastsi128_si256(loadu_128(msg_chunks.7));

    let iv0 = a;
    let iv1 = b;
    let mut t0;
    let mut t1;
    let mut b0;

    // round 1
    t0 = _mm256_unpacklo_epi64(m0, m1);
    t1 = _mm256_unpacklo_epi64(m2, m3);
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
    t0 = _mm256_unpackhi_epi64(m0, m1);
    t1 = _mm256_unpackhi_epi64(m2, m3);
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
    diagonalize(&mut a, &mut b, &mut c, &mut d);
    t0 = _mm256_unpacklo_epi64(m7, m4);
    t1 = _mm256_unpacklo_epi64(m5, m6);
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
    t0 = _mm256_unpackhi_epi64(m7, m4);
    t1 = _mm256_unpackhi_epi64(m5, m6);
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
    undiagonalize(&mut a, &mut b, &mut c, &mut d);

    // round 2
    t0 = _mm256_unpacklo_epi64(m7, m2);
    t1 = _mm256_unpackhi_epi64(m4, m6);
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
    t0 = _mm256_unpacklo_epi64(m5, m4);
    t1 = _mm256_alignr_epi8(m3, m7, 8);
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
    diagonalize(&mut a, &mut b, &mut c, &mut d);
    t0 = _mm256_unpackhi_epi64(m2, m0);
    t1 = _mm256_blend_epi32(m5, m0, 0x33);
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
    t0 = _mm256_alignr_epi8(m6, m1, 8);
    t1 = _mm256_blend_epi32(m3, m1, 0x33);
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
    undiagonalize(&mut a, &mut b, &mut c, &mut d);

    // round 3
    t0 = _mm256_alignr_epi8(m6, m5, 8);
    t1 = _mm256_unpackhi_epi64(m2, m7);
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
    t0 = _mm256_unpacklo_epi64(m4, m0);
    t1 = _mm256_blend_epi32(m6, m1, 0x33);
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
    diagonalize(&mut a, &mut b, &mut c, &mut d);
    t0 = _mm256_alignr_epi8(m5, m4, 8);
    t1 = _mm256_unpackhi_epi64(m1, m3);
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
    t0 = _mm256_unpacklo_epi64(m2, m7);
    t1 = _mm256_blend_epi32(m0, m3, 0x33);
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
    undiagonalize(&mut a, &mut b, &mut c, &mut d);

    // round 4
    t0 = _mm256_unpackhi_epi64(m3, m1);
    t1 = _mm256_unpackhi_epi64(m6, m5);
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
    t0 = _mm256_unpackhi_epi64(m4, m0);
    t1 = _mm256_unpacklo_epi64(m6, m7);
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
    diagonalize(&mut a, &mut b, &mut c, &mut d);
    t0 = _mm256_alignr_epi8(m1, m7, 8);
    t1 = _mm256_shuffle_epi32(m2, _MM_SHUFFLE!(1, 0, 3, 2));
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
    t0 = _mm256_unpacklo_epi64(m4, m3);
    t1 = _mm256_unpacklo_epi64(m5, m0);
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
    undiagonalize(&mut a, &mut b, &mut c, &mut d);

    // round 5
    t0 = _mm256_unpackhi_epi64(m4, m2);
    t1 = _mm256_unpacklo_epi64(m1, m5);
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
    t0 = _mm256_blend_epi32(m3, m0, 0x33);
    t1 = _mm256_blend_epi32(m7, m2, 0x33);
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
    diagonalize(&mut a, &mut b, &mut c, &mut d);
    t0 = _mm256_alignr_epi8(m7, m1, 8);
    t1 = _mm256_alignr_epi8(m3, m5, 8);
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
    t0 = _mm256_unpackhi_epi64(m6, m0);
    t1 = _mm256_unpacklo_epi64(m6, m4);
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
    undiagonalize(&mut a, &mut b, &mut c, &mut d);

    // round 6
    t0 = _mm256_unpacklo_epi64(m1, m3);
    t1 = _mm256_unpacklo_epi64(m0, m4);
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
    t0 = _mm256_unpacklo_epi64(m6, m5);
    t1 = _mm256_unpackhi_epi64(m5, m1);
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
    diagonalize(&mut a, &mut b, &mut c, &mut d);
    t0 = _mm256_alignr_epi8(m2, m0, 8);
    t1 = _mm256_unpackhi_epi64(m3, m7);
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
    t0 = _mm256_unpackhi_epi64(m4, m6);
    t1 = _mm256_alignr_epi8(m7, m2, 8);
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
    undiagonalize(&mut a, &mut b, &mut c, &mut d);

    // round 7
    t0 = _mm256_blend_epi32(m0, m6, 0x33);
    t1 = _mm256_unpacklo_epi64(m7, m2);
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
    t0 = _mm256_unpackhi_epi64(m2, m7);
    t1 = _mm256_alignr_epi8(m5, m6, 8);
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
    diagonalize(&mut a, &mut b, &mut c, &mut d);
    t0 = _mm256_unpacklo_epi64(m4, m0);
    t1 = _mm256_blend_epi32(m4, m3, 0x33);
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
    t0 = _mm256_unpackhi_epi64(m5, m3);
    t1 = _mm256_shuffle_epi32(m1, _MM_SHUFFLE!(1, 0, 3, 2));
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
    undiagonalize(&mut a, &mut b, &mut c, &mut d);

    // round 8
    t0 = _mm256_unpackhi_epi64(m6, m3);
    t1 = _mm256_blend_epi32(m1, m6, 0x33);
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
    t0 = _mm256_alignr_epi8(m7, m5, 8);
    t1 = _mm256_unpackhi_epi64(m0, m4);
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
    diagonalize(&mut a, &mut b, &mut c, &mut d);
    t0 = _mm256_blend_epi32(m2, m1, 0x33);
    t1 = _mm256_alignr_epi8(m4, m7, 8);
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
    t0 = _mm256_unpacklo_epi64(m5, m0);
    t1 = _mm256_unpacklo_epi64(m2, m3);
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
    undiagonalize(&mut a, &mut b, &mut c, &mut d);

    // round 9
    t0 = _mm256_unpacklo_epi64(m3, m7);
    t1 = _mm256_alignr_epi8(m0, m5, 8);
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
    t0 = _mm256_unpackhi_epi64(m7, m4);
    t1 = _mm256_alignr_epi8(m4, m1, 8);
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
    diagonalize(&mut a, &mut b, &mut c, &mut d);
    t0 = _mm256_unpacklo_epi64(m5, m6);
    t1 = _mm256_unpackhi_epi64(m6, m0);
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
    t0 = _mm256_alignr_epi8(m1, m2, 8);
    t1 = _mm256_alignr_epi8(m2, m3, 8);
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
    undiagonalize(&mut a, &mut b, &mut c, &mut d);

    // round 10
    t0 = _mm256_unpacklo_epi64(m5, m4);
    t1 = _mm256_unpackhi_epi64(m3, m0);
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
    t0 = _mm256_unpacklo_epi64(m1, m2);
    t1 = _mm256_blend_epi32(m2, m3, 0x33);
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
    diagonalize(&mut a, &mut b, &mut c, &mut d);
    t0 = _mm256_unpackhi_epi64(m6, m7);
    t1 = _mm256_unpackhi_epi64(m4, m1);
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
    t0 = _mm256_blend_epi32(m5, m0, 0x33);
    t1 = _mm256_unpacklo_epi64(m7, m6);
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
    undiagonalize(&mut a, &mut b, &mut c, &mut d);

    // round 11
    t0 = _mm256_unpacklo_epi64(m0, m1);
    t1 = _mm256_unpacklo_epi64(m2, m3);
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
    t0 = _mm256_unpackhi_epi64(m0, m1);
    t1 = _mm256_unpackhi_epi64(m2, m3);
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
    diagonalize(&mut a, &mut b, &mut c, &mut d);
    t0 = _mm256_unpacklo_epi64(m7, m4);
    t1 = _mm256_unpacklo_epi64(m5, m6);
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
    t0 = _mm256_unpackhi_epi64(m7, m4);
    t1 = _mm256_unpackhi_epi64(m5, m6);
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
    undiagonalize(&mut a, &mut b, &mut c, &mut d);

    // round 12
    t0 = _mm256_unpacklo_epi64(m7, m2);
    t1 = _mm256_unpackhi_epi64(m4, m6);
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
    t0 = _mm256_unpacklo_epi64(m5, m4);
    t1 = _mm256_alignr_epi8(m3, m7, 8);
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
    diagonalize(&mut a, &mut b, &mut c, &mut d);
    t0 = _mm256_unpackhi_epi64(m2, m0);
    t1 = _mm256_blend_epi32(m5, m0, 0x33);
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g1(&mut a, &mut b, &mut c, &mut d, &mut b0);
    t0 = _mm256_alignr_epi8(m6, m1, 8);
    t1 = _mm256_blend_epi32(m3, m1, 0x33);
    b0 = _mm256_blend_epi32(t0, t1, 0xF0);
    g2(&mut a, &mut b, &mut c, &mut d, &mut b0);
    undiagonalize(&mut a, &mut b, &mut c, &mut d);

    a = xor(a, c);
    b = xor(b, d);
    a = xor(a, iv0);
    b = xor(b, iv1);

    storeu(a, words_low);
    storeu(b, words_high);
}
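
// An added observation: BLAKE2b's SIGMA message schedule repeats after ten
// rounds, so rounds 11 and 12 above reuse the exact message loads of rounds
// 1 and 2.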

#[target_feature(enable = "avx2")]
pub unsafe fn compress1_loop(
    input: &[u8],
    words: &mut [Word; 8],
    mut count: Count,
    last_node: LastNode,
    finalize: Finalize,
    stride: Stride,
) {
    input_debug_asserts(input, finalize);

    let mut local_words = *words;

    let mut fin_offset = input.len().saturating_sub(1);
    fin_offset -= fin_offset % stride.padded_blockbytes();
    let mut buf = [0; BLOCKBYTES];
    let (fin_block, fin_len, _) = final_block(input, fin_offset, &mut buf, stride);
    let fin_last_block = flag_word(finalize.yes());
    let fin_last_node = flag_word(finalize.yes() && last_node.yes());

    let mut offset = 0;
    loop {
        let block;
        let count_delta;
        let last_block;
        let last_node;
        if offset == fin_offset {
            block = fin_block;
            count_delta = fin_len;
            last_block = fin_last_block;
            last_node = fin_last_node;
        } else {
            // This unsafe cast avoids bounds checks. There's guaranteed to be
            // enough input because `offset < fin_offset`.
            block = &*(input.as_ptr().add(offset) as *const [u8; BLOCKBYTES]);
            count_delta = BLOCKBYTES;
            last_block = flag_word(false);
            last_node = flag_word(false);
        }
        count = count.wrapping_add(count_delta as Count);
        compress_block(block, &mut local_words, count, last_block, last_node);

        // Check for termination before bumping the offset, to avoid overflow.
        if offset == fin_offset {
            break;
        }

        offset += stride.padded_blockbytes();
    }

    *words = local_words;
}

// Performance note: Factoring out a G function here doesn't hurt performance,
// unlike in the case of BLAKE2s where it hurts substantially. In fact, on my
// machine, it helps a tiny bit. But the difference is tiny, so I'm going to
// stick to the approach used by https://github.com/sneves/blake2-avx2
// until/unless I can be sure the (tiny) improvement is consistent across
// different Intel microarchitectures. Smaller code size is nice, but a
// divergence between the BLAKE2b and BLAKE2s implementations is less nice.
#[inline(always)]
unsafe fn round(v: &mut [__m256i; 16], m: &[__m256i; 16], r: usize) {
    v[0] = add(v[0], m[SIGMA[r][0] as usize]);
    v[1] = add(v[1], m[SIGMA[r][2] as usize]);
    v[2] = add(v[2], m[SIGMA[r][4] as usize]);
    v[3] = add(v[3], m[SIGMA[r][6] as usize]);
    v[0] = add(v[0], v[4]);
    v[1] = add(v[1], v[5]);
    v[2] = add(v[2], v[6]);
    v[3] = add(v[3], v[7]);
    v[12] = xor(v[12], v[0]);
    v[13] = xor(v[13], v[1]);
    v[14] = xor(v[14], v[2]);
    v[15] = xor(v[15], v[3]);
    v[12] = rot32(v[12]);
    v[13] = rot32(v[13]);
    v[14] = rot32(v[14]);
    v[15] = rot32(v[15]);
    v[8] = add(v[8], v[12]);
    v[9] = add(v[9], v[13]);
    v[10] = add(v[10], v[14]);
    v[11] = add(v[11], v[15]);
    v[4] = xor(v[4], v[8]);
    v[5] = xor(v[5], v[9]);
    v[6] = xor(v[6], v[10]);
    v[7] = xor(v[7], v[11]);
    v[4] = rot24(v[4]);
    v[5] = rot24(v[5]);
    v[6] = rot24(v[6]);
    v[7] = rot24(v[7]);
    v[0] = add(v[0], m[SIGMA[r][1] as usize]);
    v[1] = add(v[1], m[SIGMA[r][3] as usize]);
    v[2] = add(v[2], m[SIGMA[r][5] as usize]);
    v[3] = add(v[3], m[SIGMA[r][7] as usize]);
    v[0] = add(v[0], v[4]);
    v[1] = add(v[1], v[5]);
    v[2] = add(v[2], v[6]);
    v[3] = add(v[3], v[7]);
    v[12] = xor(v[12], v[0]);
    v[13] = xor(v[13], v[1]);
    v[14] = xor(v[14], v[2]);
    v[15] = xor(v[15], v[3]);
    v[12] = rot16(v[12]);
    v[13] = rot16(v[13]);
    v[14] = rot16(v[14]);
    v[15] = rot16(v[15]);
    v[8] = add(v[8], v[12]);
    v[9] = add(v[9], v[13]);
    v[10] = add(v[10], v[14]);
    v[11] = add(v[11], v[15]);
    v[4] = xor(v[4], v[8]);
    v[5] = xor(v[5], v[9]);
    v[6] = xor(v[6], v[10]);
    v[7] = xor(v[7], v[11]);
    v[4] = rot63(v[4]);
    v[5] = rot63(v[5]);
    v[6] = rot63(v[6]);
    v[7] = rot63(v[7]);

    v[0] = add(v[0], m[SIGMA[r][8] as usize]);
    v[1] = add(v[1], m[SIGMA[r][10] as usize]);
    v[2] = add(v[2], m[SIGMA[r][12] as usize]);
    v[3] = add(v[3], m[SIGMA[r][14] as usize]);
    v[0] = add(v[0], v[5]);
    v[1] = add(v[1], v[6]);
    v[2] = add(v[2], v[7]);
    v[3] = add(v[3], v[4]);
    v[15] = xor(v[15], v[0]);
    v[12] = xor(v[12], v[1]);
    v[13] = xor(v[13], v[2]);
    v[14] = xor(v[14], v[3]);
    v[15] = rot32(v[15]);
    v[12] = rot32(v[12]);
    v[13] = rot32(v[13]);
    v[14] = rot32(v[14]);
    v[10] = add(v[10], v[15]);
    v[11] = add(v[11], v[12]);
    v[8] = add(v[8], v[13]);
    v[9] = add(v[9], v[14]);
    v[5] = xor(v[5], v[10]);
    v[6] = xor(v[6], v[11]);
    v[7] = xor(v[7], v[8]);
    v[4] = xor(v[4], v[9]);
    v[5] = rot24(v[5]);
    v[6] = rot24(v[6]);
    v[7] = rot24(v[7]);
    v[4] = rot24(v[4]);
    v[0] = add(v[0], m[SIGMA[r][9] as usize]);
    v[1] = add(v[1], m[SIGMA[r][11] as usize]);
    v[2] = add(v[2], m[SIGMA[r][13] as usize]);
    v[3] = add(v[3], m[SIGMA[r][15] as usize]);
    v[0] = add(v[0], v[5]);
    v[1] = add(v[1], v[6]);
    v[2] = add(v[2], v[7]);
    v[3] = add(v[3], v[4]);
    v[15] = xor(v[15], v[0]);
    v[12] = xor(v[12], v[1]);
    v[13] = xor(v[13], v[2]);
    v[14] = xor(v[14], v[3]);
    v[15] = rot16(v[15]);
    v[12] = rot16(v[12]);
    v[13] = rot16(v[13]);
    v[14] = rot16(v[14]);
    v[10] = add(v[10], v[15]);
    v[11] = add(v[11], v[12]);
    v[8] = add(v[8], v[13]);
    v[9] = add(v[9], v[14]);
    v[5] = xor(v[5], v[10]);
    v[6] = xor(v[6], v[11]);
    v[7] = xor(v[7], v[8]);
    v[4] = xor(v[4], v[9]);
    v[5] = rot63(v[5]);
    v[6] = rot63(v[6]);
    v[7] = rot63(v[7]);
    v[4] = rot63(v[4]);
}
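
// An added note: one call to round() is a full BLAKE2b round across four
// transposed states. The first two quarters mix the columns (v[i], v[i+4],
// v[i+8], v[i+12]) and the last two mix the shifted diagonals, with the
// message words selected by SIGMA[r].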

// We'd rather make this a regular function with #[inline(always)], but for
// some reason that blows up compile times by about 10 seconds, at least in
// some cases (BLAKE2b avx2.rs). This macro seems to get the same performance
// result, without the compile time issue.
macro_rules! compress4_transposed {
    (
        $h_vecs:expr,
        $msg_vecs:expr,
        $count_low:expr,
        $count_high:expr,
        $lastblock:expr,
        $lastnode:expr,
    ) => {
        let h_vecs: &mut [__m256i; 8] = $h_vecs;
        let msg_vecs: &[__m256i; 16] = $msg_vecs;
        let count_low: __m256i = $count_low;
        let count_high: __m256i = $count_high;
        let lastblock: __m256i = $lastblock;
        let lastnode: __m256i = $lastnode;

        let mut v = [
            h_vecs[0],
            h_vecs[1],
            h_vecs[2],
            h_vecs[3],
            h_vecs[4],
            h_vecs[5],
            h_vecs[6],
            h_vecs[7],
            set1(IV[0]),
            set1(IV[1]),
            set1(IV[2]),
            set1(IV[3]),
            xor(set1(IV[4]), count_low),
            xor(set1(IV[5]), count_high),
            xor(set1(IV[6]), lastblock),
            xor(set1(IV[7]), lastnode),
        ];

        round(&mut v, &msg_vecs, 0);
        round(&mut v, &msg_vecs, 1);
        round(&mut v, &msg_vecs, 2);
        round(&mut v, &msg_vecs, 3);
        round(&mut v, &msg_vecs, 4);
        round(&mut v, &msg_vecs, 5);
        round(&mut v, &msg_vecs, 6);
        round(&mut v, &msg_vecs, 7);
        round(&mut v, &msg_vecs, 8);
        round(&mut v, &msg_vecs, 9);
        round(&mut v, &msg_vecs, 10);
        round(&mut v, &msg_vecs, 11);

        h_vecs[0] = xor(xor(h_vecs[0], v[0]), v[8]);
        h_vecs[1] = xor(xor(h_vecs[1], v[1]), v[9]);
        h_vecs[2] = xor(xor(h_vecs[2], v[2]), v[10]);
        h_vecs[3] = xor(xor(h_vecs[3], v[3]), v[11]);
        h_vecs[4] = xor(xor(h_vecs[4], v[4]), v[12]);
        h_vecs[5] = xor(xor(h_vecs[5], v[5]), v[13]);
        h_vecs[6] = xor(xor(h_vecs[6], v[6]), v[14]);
        h_vecs[7] = xor(xor(h_vecs[7], v[7]), v[15]);
    };
}
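
// An added note: the final xors in compress4_transposed! are the BLAKE2
// "feed-forward": each updated state word is h[i] ^ v[i] ^ v[i + 8], computed
// here for all four states at once.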

#[inline(always)]
unsafe fn interleave128(a: __m256i, b: __m256i) -> (__m256i, __m256i) {
    (
        _mm256_permute2x128_si256(a, b, 0x20),
        _mm256_permute2x128_si256(a, b, 0x31),
    )
}

// There are several ways to do a transposition. We could do it naively, with 8 separate
// _mm256_set_epi64x instructions, referencing each of the 64 words explicitly. Or we could copy
// the vecs into contiguous storage and then use gather instructions. The third approach, used
// here, is a series of unpack instructions to interleave the vectors. In my benchmarks,
// interleaving is the fastest approach. To test this, run
// `cargo +nightly bench --bench libtest load_4` in the
// https://github.com/oconnor663/bao_experiments repo.
#[inline(always)]
unsafe fn transpose_vecs(
    vec_a: __m256i,
    vec_b: __m256i,
    vec_c: __m256i,
    vec_d: __m256i,
) -> [__m256i; DEGREE] {
    // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is 11/33.
    let ab_02 = _mm256_unpacklo_epi64(vec_a, vec_b);
    let ab_13 = _mm256_unpackhi_epi64(vec_a, vec_b);
    let cd_02 = _mm256_unpacklo_epi64(vec_c, vec_d);
    let cd_13 = _mm256_unpackhi_epi64(vec_c, vec_d);

    // Interleave 128-bit lanes.
    let (abcd_0, abcd_2) = interleave128(ab_02, cd_02);
    let (abcd_1, abcd_3) = interleave128(ab_13, cd_13);

    [abcd_0, abcd_1, abcd_2, abcd_3]
}
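
// An added illustration: for rows a = [a0, a1, a2, a3], b, c, d, the 64-bit
// unpacks give ab_02 = [a0, b0, a2, b2] and ab_13 = [a1, b1, a3, b3], and the
// 128-bit interleaves regroup those into columns [a0, b0, c0, d0], etc.
// Transposing is an involution, which the sketch below (an addition to this
// file, assuming a std test environment) checks by round-tripping a 4x4 block.
#[cfg(test)]
#[test]
fn test_transpose_vecs_round_trip() {
    if !is_x86_feature_detected!("avx2") {
        return;
    }
    unsafe {
        let rows: [[Word; DEGREE]; DEGREE] = [
            [0, 1, 2, 3],
            [4, 5, 6, 7],
            [8, 9, 10, 11],
            [12, 13, 14, 15],
        ];
        let t = transpose_vecs(
            loadu(&rows[0]),
            loadu(&rows[1]),
            loadu(&rows[2]),
            loadu(&rows[3]),
        );
        let back = transpose_vecs(t[0], t[1], t[2], t[3]);
        for i in 0..DEGREE {
            let mut out = [0; DEGREE];
            storeu(back[i], &mut out);
            assert_eq!(rows[i], out);
        }
    }
}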

#[inline(always)]
unsafe fn transpose_state_vecs(jobs: &[Job; DEGREE]) -> [__m256i; 8] {
    // Load all the state words into transposed vectors, where the first vector
    // has the first word of each state, etc. Transposing once at the beginning
    // and once at the end is more efficient than repeating it for each block.
    let words0 = array_refs!(&jobs[0].words, DEGREE, DEGREE);
    let words1 = array_refs!(&jobs[1].words, DEGREE, DEGREE);
    let words2 = array_refs!(&jobs[2].words, DEGREE, DEGREE);
    let words3 = array_refs!(&jobs[3].words, DEGREE, DEGREE);
    let [h0, h1, h2, h3] = transpose_vecs(
        loadu(words0.0),
        loadu(words1.0),
        loadu(words2.0),
        loadu(words3.0),
    );
    let [h4, h5, h6, h7] = transpose_vecs(
        loadu(words0.1),
        loadu(words1.1),
        loadu(words2.1),
        loadu(words3.1),
    );
    [h0, h1, h2, h3, h4, h5, h6, h7]
}

#[inline(always)]
unsafe fn untranspose_state_vecs(h_vecs: &[__m256i; 8], jobs: &mut [Job; DEGREE]) {
    // Un-transpose the updated state vectors back into the caller's arrays.
    let [job0, job1, job2, job3] = jobs;
    let words0 = mut_array_refs!(&mut job0.words, DEGREE, DEGREE);
    let words1 = mut_array_refs!(&mut job1.words, DEGREE, DEGREE);
    let words2 = mut_array_refs!(&mut job2.words, DEGREE, DEGREE);
    let words3 = mut_array_refs!(&mut job3.words, DEGREE, DEGREE);
    let out = transpose_vecs(h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3]);
    storeu(out[0], words0.0);
    storeu(out[1], words1.0);
    storeu(out[2], words2.0);
    storeu(out[3], words3.0);
    let out = transpose_vecs(h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7]);
    storeu(out[0], words0.1);
    storeu(out[1], words1.1);
    storeu(out[2], words2.1);
    storeu(out[3], words3.1);
}

#[inline(always)]
unsafe fn transpose_msg_vecs(blocks: [*const [u8; BLOCKBYTES]; DEGREE]) -> [__m256i; 16] {
    // These input arrays have no particular alignment, so we use unaligned
    // loads to read from them.
    let block0 = blocks[0] as *const [Word; DEGREE];
    let block1 = blocks[1] as *const [Word; DEGREE];
    let block2 = blocks[2] as *const [Word; DEGREE];
    let block3 = blocks[3] as *const [Word; DEGREE];
    let [m0, m1, m2, m3] = transpose_vecs(
        loadu(block0.add(0)),
        loadu(block1.add(0)),
        loadu(block2.add(0)),
        loadu(block3.add(0)),
    );
    let [m4, m5, m6, m7] = transpose_vecs(
        loadu(block0.add(1)),
        loadu(block1.add(1)),
        loadu(block2.add(1)),
        loadu(block3.add(1)),
    );
    let [m8, m9, m10, m11] = transpose_vecs(
        loadu(block0.add(2)),
        loadu(block1.add(2)),
        loadu(block2.add(2)),
        loadu(block3.add(2)),
    );
    let [m12, m13, m14, m15] = transpose_vecs(
        loadu(block0.add(3)),
        loadu(block1.add(3)),
        loadu(block2.add(3)),
        loadu(block3.add(3)),
    );
    [
        m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15,
    ]
}

#[inline(always)]
unsafe fn load_counts(jobs: &[Job; DEGREE]) -> (__m256i, __m256i) {
    (
        set4(
            count_low(jobs[0].count),
            count_low(jobs[1].count),
            count_low(jobs[2].count),
            count_low(jobs[3].count),
        ),
        set4(
            count_high(jobs[0].count),
            count_high(jobs[1].count),
            count_high(jobs[2].count),
            count_high(jobs[3].count),
        ),
    )
}

#[inline(always)]
unsafe fn store_counts(jobs: &mut [Job; DEGREE], low: __m256i, high: __m256i) {
    let low_ints: [Word; DEGREE] = mem::transmute(low);
    let high_ints: [Word; DEGREE] = mem::transmute(high);
    for i in 0..DEGREE {
        jobs[i].count = assemble_count(low_ints[i], high_ints[i]);
    }
}
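
// An added note: the transmutes above rely on __m256i having the same
// in-memory layout as [u64; DEGREE], with lane 0 first, which matches how
// set4 and loadu arrange the counts.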

#[inline(always)]
unsafe fn add_to_counts(lo: &mut __m256i, hi: &mut __m256i, delta: __m256i) {
    // If the low counts reach zero, that means they wrapped, unless the delta
    // was also zero.
    *lo = add(*lo, delta);
    let lo_reached_zero = eq(*lo, set1(0));
    let delta_was_zero = eq(delta, set1(0));
    let hi_inc = and(set1(1), negate_and(delta_was_zero, lo_reached_zero));
    *hi = add(*hi, hi_inc);
}
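
// A worked example (an added note): if a lane of *lo was u64::MAX and its
// delta lane was 1, the add wraps that lane to 0. lo_reached_zero is then
// all-ones in that lane and delta_was_zero is all-zeros, so negate_and yields
// all-ones, hi_inc is 1, and the carry propagates into *hi.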

#[inline(always)]
unsafe fn flags_vec(flags: [bool; DEGREE]) -> __m256i {
    set4(
        flag_word(flags[0]),
        flag_word(flags[1]),
        flag_word(flags[2]),
        flag_word(flags[3]),
    )
}

#[target_feature(enable = "avx2")]
pub unsafe fn compress4_loop(jobs: &mut [Job; DEGREE], finalize: Finalize, stride: Stride) {
    // If we're not finalizing, there can't be a partial block at the end.
    for job in jobs.iter() {
        input_debug_asserts(job.input, finalize);
    }

    let msg_ptrs = [
        jobs[0].input.as_ptr(),
        jobs[1].input.as_ptr(),
        jobs[2].input.as_ptr(),
        jobs[3].input.as_ptr(),
    ];
    let mut h_vecs = transpose_state_vecs(&jobs);
    let (mut counts_lo, mut counts_hi) = load_counts(&jobs);

    // Prepare the final blocks (note: these could be empty, if the input is
    // empty). Do all this before entering the main loop.
    let min_len = jobs.iter().map(|job| job.input.len()).min().unwrap();
    let mut fin_offset = min_len.saturating_sub(1);
    fin_offset -= fin_offset % stride.padded_blockbytes();
    // Performance note: making these buffers mem::uninitialized() seems to
    // cause problems in the optimizer.
    let mut buf0: [u8; BLOCKBYTES] = [0; BLOCKBYTES];
    let mut buf1: [u8; BLOCKBYTES] = [0; BLOCKBYTES];
    let mut buf2: [u8; BLOCKBYTES] = [0; BLOCKBYTES];
    let mut buf3: [u8; BLOCKBYTES] = [0; BLOCKBYTES];
    let (block0, len0, finalize0) = final_block(jobs[0].input, fin_offset, &mut buf0, stride);
    let (block1, len1, finalize1) = final_block(jobs[1].input, fin_offset, &mut buf1, stride);
    let (block2, len2, finalize2) = final_block(jobs[2].input, fin_offset, &mut buf2, stride);
    let (block3, len3, finalize3) = final_block(jobs[3].input, fin_offset, &mut buf3, stride);
    let fin_blocks: [*const [u8; BLOCKBYTES]; DEGREE] = [block0, block1, block2, block3];
    let fin_counts_delta = set4(len0 as Word, len1 as Word, len2 as Word, len3 as Word);
    let fin_last_block;
    let fin_last_node;
    if finalize.yes() {
        fin_last_block = flags_vec([finalize0, finalize1, finalize2, finalize3]);
        fin_last_node = flags_vec([
            finalize0 && jobs[0].last_node.yes(),
            finalize1 && jobs[1].last_node.yes(),
            finalize2 && jobs[2].last_node.yes(),
            finalize3 && jobs[3].last_node.yes(),
        ]);
    } else {
        fin_last_block = set1(0);
        fin_last_node = set1(0);
    }

    let mut offset = 0;
    loop {
        let blocks;
        let counts_delta;
        let last_block;
        let last_node;
        if offset == fin_offset {
            blocks = fin_blocks;
            counts_delta = fin_counts_delta;
            last_block = fin_last_block;
            last_node = fin_last_node;
        } else {
            blocks = [
                msg_ptrs[0].add(offset) as *const [u8; BLOCKBYTES],
                msg_ptrs[1].add(offset) as *const [u8; BLOCKBYTES],
                msg_ptrs[2].add(offset) as *const [u8; BLOCKBYTES],
                msg_ptrs[3].add(offset) as *const [u8; BLOCKBYTES],
            ];
            counts_delta = set1(BLOCKBYTES as Word);
            last_block = set1(0);
            last_node = set1(0);
        }

        let m_vecs = transpose_msg_vecs(blocks);
        add_to_counts(&mut counts_lo, &mut counts_hi, counts_delta);
        compress4_transposed!(
            &mut h_vecs,
            &m_vecs,
            counts_lo,
            counts_hi,
            last_block,
            last_node,
        );

        // Check for termination before bumping the offset, to avoid overflow.
        if offset == fin_offset {
            break;
        }

        offset += stride.padded_blockbytes();
    }

    // Write out the results.
    untranspose_state_vecs(&h_vecs, &mut *jobs);
    store_counts(&mut *jobs, counts_lo, counts_hi);
    let max_consumed = offset.saturating_add(stride.padded_blockbytes());
    for job in jobs.iter_mut() {
        let consumed = cmp::min(max_consumed, job.input.len());
        job.input = &job.input[consumed..];
    }
}