1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
30 %include "sha256_mb_mgr_datastruct.asm"
31 %include "reg_sizes.asm"
37 ;; code to compute oct SHA256 using SSE-256 / AVX2
38 ;; outer calling routine takes care of save and restore of XMM registers
39 ;; Logic designed/laid out by JDG
41 ;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15
42 ;; Windows clobbers: rax rbx rdx rsi rdi r9 r10 r11 r12 r13 r14 r15
43 ;; Windows preserves: rcx rbp r8
45 ;; Linux clobbers: rax rbx rcx rdx rsi r9 r10 r11 r12 r13 r14 r15
46 ;; Linux preserves: rdi rbp r8
50 %ifidn __OUTPUT_FORMAT__, elf64
; SZ8 = bytes of one "row" of 8 lane dwords (8 * 4 = 32 B, exactly one ymm
; register).  Note: SHA256_DIGEST_WORD_SIZE is %define'd a few lines below;
; NASM single-line macros expand lazily at the point of use, so this forward
; reference is harmless.
126 %define SZ8 8*SHA256_DIGEST_WORD_SIZE ; Size of one vector register
127 %define ROUNDS 64*SZ8
129 %define SHA256_DIGEST_WORD_SIZE 4
130 %define MAX_SHA256_LANES 8
131 %define NUM_SHA256_DIGEST_WORDS 8
; Byte stride between consecutive digest words of one lane inside the
; transposed STATE layout (one row = the same digest word for all 8 lanes).
132 %define SHA256_DIGEST_ROW_SIZE (MAX_SHA256_LANES * SHA256_DIGEST_WORD_SIZE)
136 ;; Assume stack aligned to 32 bytes before call
137 ;; Therefore FRAMESZ mod 32 must be 32-8 = 24
; Frame offsets come from the stack_frame struc declared elsewhere in this
; file: saved digest snapshot, ymm scratch area, and the caller's rsp.
144 %define FRAMESZ stack_frame_size
145 %define _DIGEST stack_frame.digest
146 %define _YTMP stack_frame.ytmp
147 %define _RSP_SAVE stack_frame.rsp
; Four ymm-sized scratch slots inside the local frame.
149 %define YTMP0 rsp + _YTMP + 0*SZ8
150 %define YTMP1 rsp + _YTMP + 1*SZ8
151 %define YTMP2 rsp + _YTMP + 2*SZ8
152 %define YTMP3 rsp + _YTMP + 3*SZ8
; Unaligned 256-bit load/store: the lane message pointers carry no 32-byte
; alignment guarantee.
154 %define VMOVPS vmovups
156 ; TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
157 ; "transpose" data in {r0...r7} using temps {t0...t1}
158 ; Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
159 ; r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
160 ; r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
161 ; r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
162 ; r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
163 ; r4 = {e7 e6 e5 e4 e3 e2 e1 e0}
164 ; r5 = {f7 f6 f5 f4 f3 f2 f1 f0}
165 ; r6 = {g7 g6 g5 g4 g3 g2 g1 g0}
166 ; r7 = {h7 h6 h5 h4 h3 h2 h1 h0}
168 ; Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
169 ; r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
170 ; r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
171 ; r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
172 ; r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
173 ; r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
174 ; r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
175 ; r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
176 ; r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
; Phase 1: dword-level 4x4 transposes inside each 128-bit half, done
; independently for the top four rows (a..d) and the bottom four (e..h).
189 ; process top half (r0..r3) {a...d}
190 vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b5 b4 a5 a4 b1 b0 a1 a0}
191 vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b7 b6 a7 a6 b3 b2 a3 a2}
192 vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d5 d4 c5 c4 d1 d0 c1 c0}
193 vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d7 d6 c7 c6 d3 d2 c3 c2}
194 vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d5 c5 b5 a5 d1 c1 b1 a1}
195 vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d6 c6 b6 a6 d2 c2 b2 a2}
196 vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d7 c7 b7 a7 d3 c3 b3 a3}
197 vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d4 c4 b4 a4 d0 c0 b0 a0}
199 ; use r2 in place of t0
200 ; process bottom half (r4..r7) {e...h}
201 vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f5 f4 e5 e4 f1 f0 e1 e0}
202 vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f7 f6 e7 e6 f3 f2 e3 e2}
203 vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h5 h4 g5 g4 h1 h0 g1 g0}
204 vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h7 h6 g7 g6 h3 h2 g3 g2}
205 vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h5 g5 f5 e5 h1 g1 f1 e1}
206 vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h6 g6 f6 e6 h2 g2 f2 e2}
207 vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h7 g7 f7 e7 h3 g3 f3 e3}
208 vshufps %%t1, %%r2, %%t1, 0x88 ; t1 = {h4 g4 f4 e4 h0 g0 f0 e0}
; Phase 2: stitch 128-bit halves from the two 4x4 results into full 256-bit
; output rows.  vperm2f128 imm 0x13 pairs the high lanes of both sources,
; imm 0x02 pairs the low lanes.
210 vperm2f128 %%r6, %%r5, %%r1, 0x13 ; h6...a6
211 vperm2f128 %%r2, %%r5, %%r1, 0x02 ; h2...a2
212 vperm2f128 %%r5, %%r7, %%r3, 0x13 ; h5...a5
213 vperm2f128 %%r1, %%r7, %%r3, 0x02 ; h1...a1
214 vperm2f128 %%r7, %%r4, %%r0, 0x13 ; h7...a7
215 vperm2f128 %%r3, %%r4, %%r0, 0x02 ; h3...a3
216 vperm2f128 %%r4, %%t1, %%t0, 0x13 ; h4...a4
217 vperm2f128 %%r0, %%t1, %%t0, 0x02 ; h0...a0
234 ; PRORD reg, imm, tmp
; Packed rotate-right: rotates each 32-bit lane of reg right by imm, in place.
; Emulated with shift-left / shift-right / or, since AVX2 has no packed
; dword-rotate instruction.
239 vpslld %%tmp, %%reg, (32-(%%imm))
240 vpsrld %%reg, %%reg, %%imm
241 vpor %%reg, %%reg, %%tmp
245 ; PRORD_nd reg, imm, tmp, src
; Non-destructive variant: reg = src ror imm (per 32-bit lane); src is
; left untouched.
251 vpslld %%tmp, %%src, (32-(%%imm))
252 vpsrld %%reg, %%src, %%imm
253 vpor %%reg, %%reg, %%tmp
261 ; PRORD_nd dst, src, amt
; Convenience wrapper around PRORD_nd that supplies the shared TMP register
; as the scratch operand.
263 PRORD_nd %1, %3, TMP, %2
266 ;; arguments passed implicitly in preprocessor symbols i, a...h
; One SHA-256 round (rounds 0..15) computed across all 8 lanes at once.
; %%T1 carries the byte-swapped message word W[i]; it is also stored into the
; 16-entry W ring buffer on the stack for later schedule expansion.
; NB: ">>" in the comments below denotes rotate-right (PRORD), not a shift.
270 PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5)
272 vpxor a2, f, g ; ch: a2 = f^g
273 vpand a2, a2, e ; ch: a2 = (f^g)&e
274 vpxor a2, a2, g ; a2 = ch
276 PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25)
; Stash W[i] at slot i mod 16 of the on-stack schedule ring.
277 vmovdqa [SZ8*(%%i&0xf) + rsp], %%T1
278 vpaddd %%T1, %%T1, [TBL + ROUND] ; T1 = W + K
279 vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5)
280 PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11)
281 vpaddd h, h, a2 ; h = h + ch
282 PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11)
283 vpaddd h, h, %%T1 ; h = h + ch + W + K
284 vpxor a0, a0, a1 ; a0 = sigma1
285 PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22)
286 vpxor %%T1, a, c ; maj: T1 = a^c
; ROUND is a byte offset into the K table; one round = SZ8 (32) bytes.
287 add ROUND, SZ8 ; ROUND++
288 vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b
293 vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11)
294 PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13)
295 vpxor a2, a2, a1 ; a2 = sig0
296 vpand a1, a, c ; maj: a1 = a&c
297 vpor a1, a1, %%T1 ; a1 = maj
298 vpaddd h, h, a1 ; h = h + ch + W + K + maj
299 vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0
305 ;; arguments passed implicitly in preprocessor symbols i, a...h
; Message-schedule expansion for rounds 16..63: builds W[i] from W[i-2],
; W[i-7], W[i-15] and W[i-16], all kept in the 16-entry ring buffer at rsp
; (index masked with 0xf), then runs the common round body.
309 vmovdqa %%T1, [SZ8*((%%i-15)&0xf) + rsp] ; load W[i-15]
310 vmovdqa a1, [SZ8*((%%i-2)&0xf) + rsp] ; load W[i-2]
; (The sigma0/sigma1 mixing of %%T1 and a1 happens in intervening code.)
323 vpaddd %%T1, %%T1, [SZ8*((%%i-16)&0xf) + rsp] ; + W[i-16]
324 vpaddd a1, a1, [SZ8*((%%i-7)&0xf) + rsp] ; + W[i-7]
325 vpaddd %%T1, %%T1, a1 ; T1 = new W[i]
327 ROUND_00_15 %%T1, %%i
332 ;; void sha256_mb_x8_avx2(SHA256_ARGS *args, uint64_t bytes);
333 ;; arg 1 : STATE : pointer to args structure (per-lane digests + data pointers)
334 ;; arg 2 : INP_SIZE : size of input in blocks
335 mk_global sha256_mb_x8_avx2, function, internal
339 ; general registers preserved in outer calling routine
340 ; outer calling routine saves all the XMM registers
342 ; save rsp, allocate 32-byte aligned for local variables
; NOTE(review): IDX presumably holds the caller's original rsp at this point
; (the frame-setup instructions precede this); the epilogue restores rsp from
; this slot.
346 mov [rsp + _RSP_SAVE], IDX
349 ;; Load the pre-transposed incoming digest.
; STATE stores digests row-major: row k holds digest word k for all 8 lanes,
; so one vmovdqu fills one working register (a..h) with all lanes' values.
350 vmovdqu a,[STATE + 0*SHA256_DIGEST_ROW_SIZE]
351 vmovdqu b,[STATE + 1*SHA256_DIGEST_ROW_SIZE]
352 vmovdqu c,[STATE + 2*SHA256_DIGEST_ROW_SIZE]
353 vmovdqu d,[STATE + 3*SHA256_DIGEST_ROW_SIZE]
354 vmovdqu e,[STATE + 4*SHA256_DIGEST_ROW_SIZE]
355 vmovdqu f,[STATE + 5*SHA256_DIGEST_ROW_SIZE]
356 vmovdqu g,[STATE + 6*SHA256_DIGEST_ROW_SIZE]
357 vmovdqu h,[STATE + 7*SHA256_DIGEST_ROW_SIZE]
361 ;; load the address of each of the 8 message lanes
362 ;; getting ready to transpose input onto stack
363 mov inp0,[STATE + _args_data_ptr + 0*PTR_SZ]
364 mov inp1,[STATE + _args_data_ptr + 1*PTR_SZ]
365 mov inp2,[STATE + _args_data_ptr + 2*PTR_SZ]
366 mov inp3,[STATE + _args_data_ptr + 3*PTR_SZ]
367 mov inp4,[STATE + _args_data_ptr + 4*PTR_SZ]
368 mov inp5,[STATE + _args_data_ptr + 5*PTR_SZ]
369 mov inp6,[STATE + _args_data_ptr + 6*PTR_SZ]
370 mov inp7,[STATE + _args_data_ptr + 7*PTR_SZ]
; Snapshot the working digest so it can be added back after the block
; (the per-block feed-forward at lines further below).
377 vmovdqa [rsp + _DIGEST + 0*SZ8], a
378 vmovdqa [rsp + _DIGEST + 1*SZ8], b
379 vmovdqa [rsp + _DIGEST + 2*SZ8], c
380 vmovdqa [rsp + _DIGEST + 3*SZ8], d
381 vmovdqa [rsp + _DIGEST + 4*SZ8], e
382 vmovdqa [rsp + _DIGEST + 5*SZ8], f
383 vmovdqa [rsp + _DIGEST + 6*SZ8], g
384 vmovdqa [rsp + _DIGEST + 7*SZ8], h
; Gather one 32-byte chunk of message from each of the 8 lane pointers
; (IDX = running byte offset within the current block).
387 VMOVPS TT0,[inp0+IDX+i*32]
388 VMOVPS TT1,[inp1+IDX+i*32]
389 VMOVPS TT2,[inp2+IDX+i*32]
390 VMOVPS TT3,[inp3+IDX+i*32]
391 VMOVPS TT4,[inp4+IDX+i*32]
392 VMOVPS TT5,[inp5+IDX+i*32]
393 VMOVPS TT6,[inp6+IDX+i*32]
394 VMOVPS TT7,[inp7+IDX+i*32]
; Transpose so each TTk holds message dword k for all 8 lanes, then
; byte-swap every dword from little-endian load order to the big-endian
; order SHA-256 expects.
397 TRANSPOSE8 TT0, TT1, TT2, TT3, TT4, TT5, TT6, TT7, TMP0, TMP1
398 vmovdqa TMP1, [PSHUFFLE_BYTE_FLIP_MASK]
400 vpshufb TT0, TT0, TMP1
401 vpshufb TT1, TT1, TMP1
402 vpshufb TT2, TT2, TMP1
403 vpshufb TT3, TT3, TMP1
404 vpshufb TT4, TT4, TMP1
405 vpshufb TT5, TT5, TMP1
406 vpshufb TT6, TT6, TMP1
407 vpshufb TT7, TT7, TMP1
; Eight rounds per group: the message word rotates through TT0..TT3, one
; quartet for rounds i*8+0..3 and the same registers reused for +4..+7.
413 ROUND_00_15 TT0,(i*8+0)
415 ROUND_00_15 TT1,(i*8+1)
417 ROUND_00_15 TT2,(i*8+2)
419 ROUND_00_15 TT3,(i*8+3)
421 ROUND_00_15 TT0,(i*8+4)
422 ROUND_00_15 TT1,(i*8+5)
423 ROUND_00_15 TT2,(i*8+6)
424 ROUND_00_15 TT3,(i*8+7)
; Feed-forward: add the digest saved before this block back into the
; working state, completing the SHA-256 compression for the block.
443 vpaddd a, a, [rsp + _DIGEST + 0*SZ8]
444 vpaddd b, b, [rsp + _DIGEST + 1*SZ8]
445 vpaddd c, c, [rsp + _DIGEST + 2*SZ8]
446 vpaddd d, d, [rsp + _DIGEST + 3*SZ8]
447 vpaddd e, e, [rsp + _DIGEST + 4*SZ8]
448 vpaddd f, f, [rsp + _DIGEST + 5*SZ8]
449 vpaddd g, g, [rsp + _DIGEST + 6*SZ8]
450 vpaddd h, h, [rsp + _DIGEST + 7*SZ8]
452 sub INP_SIZE, 1 ;; unit is blocks
455 ; write back to memory (state object) the transposed digest
456 vmovdqu [STATE + 0*SHA256_DIGEST_ROW_SIZE],a
457 vmovdqu [STATE + 1*SHA256_DIGEST_ROW_SIZE],b
458 vmovdqu [STATE + 2*SHA256_DIGEST_ROW_SIZE],c
459 vmovdqu [STATE + 3*SHA256_DIGEST_ROW_SIZE],d
460 vmovdqu [STATE + 4*SHA256_DIGEST_ROW_SIZE],e
461 vmovdqu [STATE + 5*SHA256_DIGEST_ROW_SIZE],f
462 vmovdqu [STATE + 6*SHA256_DIGEST_ROW_SIZE],g
463 vmovdqu [STATE + 7*SHA256_DIGEST_ROW_SIZE],h
465 ; update input pointers
; Each lane pointer is advanced past the data consumed (inpN was updated
; before this point) and written back so the caller can resume mid-stream.
467 mov [STATE + _args_data_ptr + 0*8], inp0
469 mov [STATE + _args_data_ptr + 1*8], inp1
471 mov [STATE + _args_data_ptr + 2*8], inp2
473 mov [STATE + _args_data_ptr + 3*8], inp3
475 mov [STATE + _args_data_ptr + 4*8], inp4
477 mov [STATE + _args_data_ptr + 5*8], inp5
479 mov [STATE + _args_data_ptr + 6*8], inp6
481 mov [STATE + _args_data_ptr + 7*8], inp7
; Restore the caller's rsp stashed in the prologue.
485 mov rsp, [rsp + _RSP_SAVE]
; SHA-256 round constants K[0..63].  Each 32-bit constant is replicated into
; all 8 dword positions of a 32-byte row (two dq lines per constant) so that
; [TBL + ROUND] feeds the same K value to every lane; ROUND advances by
; SZ8 (32 bytes) per round.
491 dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
492 dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
493 dq 0x7137449171374491, 0x7137449171374491
494 dq 0x7137449171374491, 0x7137449171374491
495 dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
496 dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
497 dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
498 dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
499 dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
500 dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
501 dq 0x59f111f159f111f1, 0x59f111f159f111f1
502 dq 0x59f111f159f111f1, 0x59f111f159f111f1
503 dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
504 dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
505 dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
506 dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
507 dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
508 dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
509 dq 0x12835b0112835b01, 0x12835b0112835b01
510 dq 0x12835b0112835b01, 0x12835b0112835b01
511 dq 0x243185be243185be, 0x243185be243185be
512 dq 0x243185be243185be, 0x243185be243185be
513 dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
514 dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
515 dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
516 dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
517 dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
518 dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
519 dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
520 dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
521 dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
522 dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
523 dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
524 dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
525 dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
526 dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
527 dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
528 dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
529 dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
530 dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
531 dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
532 dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
533 dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
534 dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
535 dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
536 dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
537 dq 0x76f988da76f988da, 0x76f988da76f988da
538 dq 0x76f988da76f988da, 0x76f988da76f988da
539 dq 0x983e5152983e5152, 0x983e5152983e5152
540 dq 0x983e5152983e5152, 0x983e5152983e5152
541 dq 0xa831c66da831c66d, 0xa831c66da831c66d
542 dq 0xa831c66da831c66d, 0xa831c66da831c66d
543 dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
544 dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
545 dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
546 dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
547 dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
548 dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
549 dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
550 dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
551 dq 0x06ca635106ca6351, 0x06ca635106ca6351
552 dq 0x06ca635106ca6351, 0x06ca635106ca6351
553 dq 0x1429296714292967, 0x1429296714292967
554 dq 0x1429296714292967, 0x1429296714292967
555 dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
556 dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
557 dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
558 dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
559 dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
560 dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
561 dq 0x53380d1353380d13, 0x53380d1353380d13
562 dq 0x53380d1353380d13, 0x53380d1353380d13
563 dq 0x650a7354650a7354, 0x650a7354650a7354
564 dq 0x650a7354650a7354, 0x650a7354650a7354
565 dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
566 dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
567 dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
568 dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
569 dq 0x92722c8592722c85, 0x92722c8592722c85
570 dq 0x92722c8592722c85, 0x92722c8592722c85
571 dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
572 dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
573 dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
574 dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
575 dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
576 dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
577 dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
578 dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
579 dq 0xd192e819d192e819, 0xd192e819d192e819
580 dq 0xd192e819d192e819, 0xd192e819d192e819
581 dq 0xd6990624d6990624, 0xd6990624d6990624
582 dq 0xd6990624d6990624, 0xd6990624d6990624
583 dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
584 dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
585 dq 0x106aa070106aa070, 0x106aa070106aa070
586 dq 0x106aa070106aa070, 0x106aa070106aa070
587 dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
588 dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
589 dq 0x1e376c081e376c08, 0x1e376c081e376c08
590 dq 0x1e376c081e376c08, 0x1e376c081e376c08
591 dq 0x2748774c2748774c, 0x2748774c2748774c
592 dq 0x2748774c2748774c, 0x2748774c2748774c
593 dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
594 dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
595 dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
596 dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
597 dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
598 dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
599 dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
600 dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
601 dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
602 dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
603 dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
604 dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
605 dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
606 dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
607 dq 0x84c8781484c87814, 0x84c8781484c87814
608 dq 0x84c8781484c87814, 0x84c8781484c87814
609 dq 0x8cc702088cc70208, 0x8cc702088cc70208
610 dq 0x8cc702088cc70208, 0x8cc702088cc70208
611 dq 0x90befffa90befffa, 0x90befffa90befffa
612 dq 0x90befffa90befffa, 0x90befffa90befffa
613 dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
614 dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
615 dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
616 dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
617 dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
618 dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
; vpshufb control mask that reverses the bytes within each 32-bit dword
; (little-endian load order -> the big-endian order SHA-256 expects),
; with identical patterns in both 128-bit lanes.
619 PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
620 dq 0x0405060700010203, 0x0c0d0e0f08090a0b