;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
;   * Redistributions of source code must retain the above copyright
;     notice, this list of conditions and the following disclaimer.
;   * Redistributions in binary form must reproduce the above copyright
;     notice, this list of conditions and the following disclaimer in
;     the documentation and/or other materials provided with the
;     distribution.
;   * Neither the name of Intel Corporation nor the names of its
;     contributors may be used to endorse or promote products derived
;     from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%include "sha256_mb_mgr_datastruct.asm"
%include "reg_sizes.asm"

[bits 64]
default rel
section .text

;; code to compute oct SHA256 (8 lanes in parallel) using AVX2
;; outer calling routine takes care of save and restore of XMM registers
;; Logic designed/laid out by JDG

;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15
;; Windows clobbers:  rax rbx rdx rsi rdi r9 r10 r11 r12 r13 r14 r15
;; Windows preserves: rcx rbp r8
;;
;; Linux clobbers:    rax rbx rcx rdx rsi r9 r10 r11 r12 r13 r14 r15
;; Linux preserves:   rdi rbp r8
;;
;; clobbers ymm0-15

%ifidn __OUTPUT_FORMAT__, elf64
; Linux definitions
%define arg1    rdi
%define arg2    rsi
%define reg3    rcx
%define reg4    rdx
%else
; Windows definitions
%define arg1    rcx
%define arg2    rdx
%define reg3    rsi
%define reg4    rdi
%endif

; Common definitions
%define STATE    arg1
%define INP_SIZE arg2

%define IDX     rax
%define ROUND   rbx
%define TBL     reg3

%define inp0 r9
%define inp1 r10
%define inp2 r11
%define inp3 r12
%define inp4 r13
%define inp5 r14
%define inp6 r15
%define inp7 reg4

; ymm0   a
; ymm1   b
; ymm2   c
; ymm3   d
; ymm4   e
; ymm5   f
; ymm6   g    TMP0
; ymm7   h    TMP1
; ymm8   T1   TT0
; ymm9        TT1
; ymm10       TT2
; ymm11       TT3
; ymm12  a0   TT4
; ymm13  a1   TT5
; ymm14  a2   TT6
; ymm15  TMP  TT7

%define a    ymm0
%define b    ymm1
%define c    ymm2
%define d    ymm3
%define e    ymm4
%define f    ymm5
%define g    ymm6
%define h    ymm7

%define T1   ymm8

%define a0   ymm12
%define a1   ymm13
%define a2   ymm14
%define TMP  ymm15

%define TMP0 ymm6
%define TMP1 ymm7

%define TT0  ymm8
%define TT1  ymm9
%define TT2  ymm10
%define TT3  ymm11
%define TT4  ymm12
%define TT5  ymm13
%define TT6  ymm14
%define TT7  ymm15

%define SZ8 8*SHA256_DIGEST_WORD_SIZE   ; Size of one vector register
%define ROUNDS 64*SZ8
%define PTR_SZ 8
%define SHA256_DIGEST_WORD_SIZE 4
%define MAX_SHA256_LANES 8
%define NUM_SHA256_DIGEST_WORDS 8
%define SHA256_DIGEST_ROW_SIZE (MAX_SHA256_LANES * SHA256_DIGEST_WORD_SIZE)

; Define stack usage

;; The frame is carved out of the stack below and rsp is re-aligned to 32
;; bytes, so aligned (vmovdqa) accesses to the frame fields are safe.
struc stack_frame
        .data   resb 16*SZ8
        .digest resb 8*SZ8
        .ytmp   resb 4*SZ8
        .rsp    resb 8
endstruc
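
; Frame layout (relative to the re-aligned rsp):
;   .data   - 16 ymm-sized slots holding the rolling W[i] message schedule,
;             addressed as [rsp + SZ8*(i & 0xf)]
;   .digest - copy of the digest taken at the top of each block, re-added
;             after the 64 rounds
;   .ytmp   - four ymm-sized scratch slots (YTMP0..YTMP3)
;   .rsp    - caller's stack pointer, restored in the postamble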
%define FRAMESZ   stack_frame_size
%define _DIGEST   stack_frame.digest
%define _YTMP     stack_frame.ytmp
%define _RSP_SAVE stack_frame.rsp

%define YTMP0 rsp + _YTMP + 0*SZ8
%define YTMP1 rsp + _YTMP + 1*SZ8
%define YTMP2 rsp + _YTMP + 2*SZ8
%define YTMP3 rsp + _YTMP + 3*SZ8

%define VMOVPS vmovups

; TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
; "transpose" data in {r0...r7} using temps {t0...t1}
; Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
; r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
; r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
; r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
; r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
; r4 = {e7 e6 e5 e4 e3 e2 e1 e0}
; r5 = {f7 f6 f5 f4 f3 f2 f1 f0}
; r6 = {g7 g6 g5 g4 g3 g2 g1 g0}
; r7 = {h7 h6 h5 h4 h3 h2 h1 h0}
;
; Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
; r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
; r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
; r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
; r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
; r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
; r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
; r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
; r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
;
%macro TRANSPOSE8 10
%define %%r0 %1
%define %%r1 %2
%define %%r2 %3
%define %%r3 %4
%define %%r4 %5
%define %%r5 %6
%define %%r6 %7
%define %%r7 %8
%define %%t0 %9
%define %%t1 %10
        ; process top half (r0..r3) {a...d}
        vshufps %%t0, %%r0, %%r1, 0x44  ; t0 = {b5 b4 a5 a4   b1 b0 a1 a0}
        vshufps %%r0, %%r0, %%r1, 0xEE  ; r0 = {b7 b6 a7 a6   b3 b2 a3 a2}
        vshufps %%t1, %%r2, %%r3, 0x44  ; t1 = {d5 d4 c5 c4   d1 d0 c1 c0}
        vshufps %%r2, %%r2, %%r3, 0xEE  ; r2 = {d7 d6 c7 c6   d3 d2 c3 c2}
        vshufps %%r3, %%t0, %%t1, 0xDD  ; r3 = {d5 c5 b5 a5   d1 c1 b1 a1}
        vshufps %%r1, %%r0, %%r2, 0x88  ; r1 = {d6 c6 b6 a6   d2 c2 b2 a2}
        vshufps %%r0, %%r0, %%r2, 0xDD  ; r0 = {d7 c7 b7 a7   d3 c3 b3 a3}
        vshufps %%t0, %%t0, %%t1, 0x88  ; t0 = {d4 c4 b4 a4   d0 c0 b0 a0}

        ; use r2 in place of t0
        ; process bottom half (r4..r7) {e...h}
        vshufps %%r2, %%r4, %%r5, 0x44  ; r2 = {f5 f4 e5 e4   f1 f0 e1 e0}
        vshufps %%r4, %%r4, %%r5, 0xEE  ; r4 = {f7 f6 e7 e6   f3 f2 e3 e2}
        vshufps %%t1, %%r6, %%r7, 0x44  ; t1 = {h5 h4 g5 g4   h1 h0 g1 g0}
        vshufps %%r6, %%r6, %%r7, 0xEE  ; r6 = {h7 h6 g7 g6   h3 h2 g3 g2}
        vshufps %%r7, %%r2, %%t1, 0xDD  ; r7 = {h5 g5 f5 e5   h1 g1 f1 e1}
        vshufps %%r5, %%r4, %%r6, 0x88  ; r5 = {h6 g6 f6 e6   h2 g2 f2 e2}
        vshufps %%r4, %%r4, %%r6, 0xDD  ; r4 = {h7 g7 f7 e7   h3 g3 f3 e3}
        vshufps %%t1, %%r2, %%t1, 0x88  ; t1 = {h4 g4 f4 e4   h0 g0 f0 e0}

        vperm2f128 %%r6, %%r5, %%r1, 0x13  ; h6...a6
        vperm2f128 %%r2, %%r5, %%r1, 0x02  ; h2...a2
        vperm2f128 %%r5, %%r7, %%r3, 0x13  ; h5...a5
        vperm2f128 %%r1, %%r7, %%r3, 0x02  ; h1...a1
        vperm2f128 %%r7, %%r4, %%r0, 0x13  ; h7...a7
        vperm2f128 %%r3, %%r4, %%r0, 0x02  ; h3...a3
        vperm2f128 %%r4, %%t1, %%t0, 0x13  ; h4...a4
        vperm2f128 %%r0, %%t1, %%t0, 0x02  ; h0...a0
%endmacro
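
; For reference, the shuffle sequence above is equivalent to this scalar
; model of an 8x8 transpose of 32-bit words (a sketch, not assembled):
;
;   // in[lane][w]:  8 dwords loaded from each of the 8 lanes
;   // out[w][lane]: the same data with lane and word indices swapped
;   for (w = 0; w < 8; w++)
;       for (lane = 0; lane < 8; lane++)
;           out[w][lane] = in[lane][w];
;
; The vshufps steps interleave dwords within each 128-bit half and the
; vperm2f128 steps then recombine the halves: 16 shuffles + 8 permutes
; per 8 registers.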


%macro ROTATE_ARGS 0
%xdefine TMP_ h
%xdefine h g
%xdefine g f
%xdefine f e
%xdefine e d
%xdefine d c
%xdefine c b
%xdefine b a
%xdefine a TMP_
%endm

; PRORD reg, imm, tmp
%macro PRORD 3
%define %%reg %1
%define %%imm %2
%define %%tmp %3
        vpslld  %%tmp, %%reg, (32-(%%imm))
        vpsrld  %%reg, %%reg, %%imm
        vpor    %%reg, %%reg, %%tmp
%endmacro

; non-destructive
; PRORD_nd reg, imm, tmp, src
%macro PRORD_nd 4
%define %%reg %1
%define %%imm %2
%define %%tmp %3
%define %%src %4
        vpslld  %%tmp, %%src, (32-(%%imm))
        vpsrld  %%reg, %%src, %%imm
        vpor    %%reg, %%reg, %%tmp
%endmacro

; PRORD dst/src, amt
%macro PRORD 2
        PRORD   %1, %2, TMP
%endmacro

; PRORD_nd dst, src, amt
%macro PRORD_nd 3
        PRORD_nd %1, %3, TMP, %2
%endmacro

;; arguments passed implicitly in preprocessor symbols i, a...h
%macro ROUND_00_15 2
%define %%T1 %1
%define %%i  %2
        PRORD_nd a0, e, (11-6)          ; sig1: a0 = (e >> 5)

        vpxor   a2, f, g                ; ch: a2 = f^g
        vpand   a2, a2, e               ; ch: a2 = (f^g)&e
        vpxor   a2, a2, g               ; a2 = ch

        PRORD_nd a1, e, 25              ; sig1: a1 = (e >> 25)
        vmovdqa [SZ8*(%%i&0xf) + rsp], %%T1
        vpaddd  %%T1, %%T1, [TBL + ROUND]   ; T1 = W + K
        vpxor   a0, a0, e               ; sig1: a0 = e ^ (e >> 5)
        PRORD   a0, 6                   ; sig1: a0 = (e >> 6) ^ (e >> 11)
        vpaddd  h, h, a2                ; h = h + ch
        PRORD_nd a2, a, (13-2)          ; sig0: a2 = (a >> 11)
        vpaddd  h, h, %%T1              ; h = h + ch + W + K
        vpxor   a0, a0, a1              ; a0 = sigma1
        PRORD_nd a1, a, 22              ; sig0: a1 = (a >> 22)
        vpxor   %%T1, a, c              ; maj: T1 = a^c
        add     ROUND, SZ8              ; ROUND++
        vpand   %%T1, %%T1, b           ; maj: T1 = (a^c)&b
        vpaddd  h, h, a0                ; h = h + ch + W + K + sigma1

        vpaddd  d, d, h                 ; d = d + T1 (h now holds T1)

        vpxor   a2, a2, a               ; sig0: a2 = a ^ (a >> 11)
        PRORD   a2, 2                   ; sig0: a2 = (a >> 2) ^ (a >> 13)
        vpxor   a2, a2, a1              ; a2 = sig0
        vpand   a1, a, c                ; maj: a1 = a&c
        vpor    a1, a1, %%T1            ; a1 = maj
        vpaddd  h, h, a1                ; h = h + ch + W + K + maj
        vpaddd  h, h, a2                ; h = h + ch + W + K + maj + sigma0

        ROTATE_ARGS
%endm
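
; For reference, one scalar SHA-256 round (FIPS 180-4), which the macro
; above performs for all 8 lanes at once:
;
;   T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + W[i]
;   T2 = Sigma0(a) + Maj(a,b,c)
;   h = g; g = f; f = e; e = d + T1
;   d = c; c = b; b = a; a = T1 + T2
;
;   Sigma1(e) = rotr(e,6) ^ rotr(e,11) ^ rotr(e,25)
;   Sigma0(a) = rotr(a,2) ^ rotr(a,13) ^ rotr(a,22)
;   Ch(e,f,g) = (e & f) ^ (~e & g)      Maj(a,b,c) = (a&b) ^ (a&c) ^ (b&c)
;
; The a..h renaming is done at assembly time by ROTATE_ARGS rather than by
; moving registers.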


;; arguments passed implicitly in preprocessor symbols i, a...h
%macro ROUND_16_XX 2
%define %%T1 %1
%define %%i  %2
        vmovdqa %%T1, [SZ8*((%%i-15)&0xf) + rsp]
        vmovdqa a1, [SZ8*((%%i-2)&0xf) + rsp]
        vmovdqa a0, %%T1
        PRORD   %%T1, 18-7
        vmovdqa a2, a1
        PRORD   a1, 19-17
        vpxor   %%T1, %%T1, a0
        PRORD   %%T1, 7
        vpxor   a1, a1, a2
        PRORD   a1, 17
        vpsrld  a0, a0, 3
        vpxor   %%T1, %%T1, a0
        vpsrld  a2, a2, 10
        vpxor   a1, a1, a2
        vpaddd  %%T1, %%T1, [SZ8*((%%i-16)&0xf) + rsp]
        vpaddd  a1, a1, [SZ8*((%%i-7)&0xf) + rsp]
        vpaddd  %%T1, %%T1, a1

        ROUND_00_15 %%T1, %%i

%endm
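
; For reference, the message schedule computed above for rounds 16..63
; (again one dword per lane):
;
;   W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]
;
;   sigma0(x) = rotr(x,7)  ^ rotr(x,18) ^ (x >> 3)
;   sigma1(x) = rotr(x,17) ^ rotr(x,19) ^ (x >> 10)
;
; Only the most recent 16 W values are live, so they are kept in the
; 16-entry ring buffer at [rsp + SZ8*(i & 0xf)].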


;; void sha256_mb_x8_avx2(SHA256_ARGS *args, uint64_t size_in_blocks);
;; arg 1 : STATE    : pointer to args structure (digests and data pointers)
;; arg 2 : INP_SIZE : size of input in blocks
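;;
;; STATE stores the lanes "transposed": digest word j of all 8 lanes is
;; kept contiguously in row j (rows are SHA256_DIGEST_ROW_SIZE apart), and
;; _args_data_ptr (from sha256_mb_mgr_datastruct.asm) holds the 8 per-lane
;; input pointers. The caller is expected to provide at least INP_SIZE
;; blocks of data in every lane.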
mk_global sha256_mb_x8_avx2, function, internal
align 16
sha256_mb_x8_avx2:
        endbranch
        ; general registers preserved in outer calling routine
        ; outer calling routine saves all the XMM registers

        ; save rsp, allocate 32-byte aligned for local variables
        mov     IDX, rsp
        sub     rsp, FRAMESZ
        and     rsp, ~31
        mov     [rsp + _RSP_SAVE], IDX


        ;; Load the pre-transposed incoming digest.
        vmovdqu a, [STATE + 0*SHA256_DIGEST_ROW_SIZE]
        vmovdqu b, [STATE + 1*SHA256_DIGEST_ROW_SIZE]
        vmovdqu c, [STATE + 2*SHA256_DIGEST_ROW_SIZE]
        vmovdqu d, [STATE + 3*SHA256_DIGEST_ROW_SIZE]
        vmovdqu e, [STATE + 4*SHA256_DIGEST_ROW_SIZE]
        vmovdqu f, [STATE + 5*SHA256_DIGEST_ROW_SIZE]
        vmovdqu g, [STATE + 6*SHA256_DIGEST_ROW_SIZE]
        vmovdqu h, [STATE + 7*SHA256_DIGEST_ROW_SIZE]

        lea     TBL, [K256_8_MB]

        ;; load the address of each of the 8 message lanes
        ;; getting ready to transpose input onto stack
        mov     inp0, [STATE + _args_data_ptr + 0*PTR_SZ]
        mov     inp1, [STATE + _args_data_ptr + 1*PTR_SZ]
        mov     inp2, [STATE + _args_data_ptr + 2*PTR_SZ]
        mov     inp3, [STATE + _args_data_ptr + 3*PTR_SZ]
        mov     inp4, [STATE + _args_data_ptr + 4*PTR_SZ]
        mov     inp5, [STATE + _args_data_ptr + 5*PTR_SZ]
        mov     inp6, [STATE + _args_data_ptr + 6*PTR_SZ]
        mov     inp7, [STATE + _args_data_ptr + 7*PTR_SZ]

        xor     IDX, IDX
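
;; Main loop: each iteration consumes one 64-byte block from every lane.
;; IDX tracks the byte offset consumed so far (used to address the next
;; block and, on exit, to advance the saved lane pointers); ROUND steps
;; through the K table in SZ8-byte increments.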
lloop:
        xor     ROUND, ROUND

        ;; save old digest
        vmovdqa [rsp + _DIGEST + 0*SZ8], a
        vmovdqa [rsp + _DIGEST + 1*SZ8], b
        vmovdqa [rsp + _DIGEST + 2*SZ8], c
        vmovdqa [rsp + _DIGEST + 3*SZ8], d
        vmovdqa [rsp + _DIGEST + 4*SZ8], e
        vmovdqa [rsp + _DIGEST + 5*SZ8], f
        vmovdqa [rsp + _DIGEST + 6*SZ8], g
        vmovdqa [rsp + _DIGEST + 7*SZ8], h
%assign i 0
%rep 2
        ;; load the next 32 bytes from each of the 8 lanes and transpose
        ;; them so each ymm register holds the same W dword of all lanes
        VMOVPS  TT0, [inp0+IDX+i*32]
        VMOVPS  TT1, [inp1+IDX+i*32]
        VMOVPS  TT2, [inp2+IDX+i*32]
        VMOVPS  TT3, [inp3+IDX+i*32]
        VMOVPS  TT4, [inp4+IDX+i*32]
        VMOVPS  TT5, [inp5+IDX+i*32]
        VMOVPS  TT6, [inp6+IDX+i*32]
        VMOVPS  TT7, [inp7+IDX+i*32]
        vmovdqa [YTMP0], g              ; TMP0/TMP1 alias g/h, so spill g and h
        vmovdqa [YTMP1], h              ; around the transpose
        TRANSPOSE8 TT0, TT1, TT2, TT3, TT4, TT5, TT6, TT7, TMP0, TMP1
        vmovdqa TMP1, [PSHUFFLE_BYTE_FLIP_MASK]
        vmovdqa g, [YTMP0]
        vpshufb TT0, TT0, TMP1
        vpshufb TT1, TT1, TMP1
        vpshufb TT2, TT2, TMP1
        vpshufb TT3, TT3, TMP1
        vpshufb TT4, TT4, TMP1
        vpshufb TT5, TT5, TMP1
        vpshufb TT6, TT6, TMP1
        vpshufb TT7, TT7, TMP1
        vmovdqa h, [YTMP1]
        vmovdqa [YTMP0], TT4
        vmovdqa [YTMP1], TT5
        vmovdqa [YTMP2], TT6
        vmovdqa [YTMP3], TT7
        ROUND_00_15 TT0, (i*8+0)
        vmovdqa TT0, [YTMP0]
        ROUND_00_15 TT1, (i*8+1)
        vmovdqa TT1, [YTMP1]
        ROUND_00_15 TT2, (i*8+2)
        vmovdqa TT2, [YTMP2]
        ROUND_00_15 TT3, (i*8+3)
        vmovdqa TT3, [YTMP3]
        ROUND_00_15 TT0, (i*8+4)
        ROUND_00_15 TT1, (i*8+5)
        ROUND_00_15 TT2, (i*8+6)
        ROUND_00_15 TT3, (i*8+7)
%assign i (i+1)
%endrep
        add     IDX, 4*4*4              ; move past the 64-byte block just consumed

%assign i (i*8)         ; i == 2 after the %rep above; scale to 16 so the
                        ; scheduled rounds start at W[16]

        jmp     Lrounds_16_xx
align 16
Lrounds_16_xx:
%rep 16
        ROUND_16_XX T1, i
%assign i (i+1)
%endrep

        cmp     ROUND, ROUNDS
        jb      Lrounds_16_xx

        ;; add old digest
        vpaddd  a, a, [rsp + _DIGEST + 0*SZ8]
        vpaddd  b, b, [rsp + _DIGEST + 1*SZ8]
        vpaddd  c, c, [rsp + _DIGEST + 2*SZ8]
        vpaddd  d, d, [rsp + _DIGEST + 3*SZ8]
        vpaddd  e, e, [rsp + _DIGEST + 4*SZ8]
        vpaddd  f, f, [rsp + _DIGEST + 5*SZ8]
        vpaddd  g, g, [rsp + _DIGEST + 6*SZ8]
        vpaddd  h, h, [rsp + _DIGEST + 7*SZ8]

        sub     INP_SIZE, 1             ;; unit is blocks
        jne     lloop

        ; write back to memory (state object) the transposed digest
        vmovdqu [STATE + 0*SHA256_DIGEST_ROW_SIZE], a
        vmovdqu [STATE + 1*SHA256_DIGEST_ROW_SIZE], b
        vmovdqu [STATE + 2*SHA256_DIGEST_ROW_SIZE], c
        vmovdqu [STATE + 3*SHA256_DIGEST_ROW_SIZE], d
        vmovdqu [STATE + 4*SHA256_DIGEST_ROW_SIZE], e
        vmovdqu [STATE + 5*SHA256_DIGEST_ROW_SIZE], f
        vmovdqu [STATE + 6*SHA256_DIGEST_ROW_SIZE], g
        vmovdqu [STATE + 7*SHA256_DIGEST_ROW_SIZE], h

        ; update input pointers
        add     inp0, IDX
        mov     [STATE + _args_data_ptr + 0*8], inp0
        add     inp1, IDX
        mov     [STATE + _args_data_ptr + 1*8], inp1
        add     inp2, IDX
        mov     [STATE + _args_data_ptr + 2*8], inp2
        add     inp3, IDX
        mov     [STATE + _args_data_ptr + 3*8], inp3
        add     inp4, IDX
        mov     [STATE + _args_data_ptr + 4*8], inp4
        add     inp5, IDX
        mov     [STATE + _args_data_ptr + 5*8], inp5
        add     inp6, IDX
        mov     [STATE + _args_data_ptr + 6*8], inp6
        add     inp7, IDX
        mov     [STATE + _args_data_ptr + 7*8], inp7

        ;;;;;;;;;;;;;;;;
        ;; Postamble
        mov     rsp, [rsp + _RSP_SAVE]
        ret

section .data
align 64
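; Round constant table: each of the 64 SHA-256 K values is replicated
; eight times, so one 32-byte row can be added to the per-lane W values
; of a round with a single vpaddd. ROUND indexes this table in SZ8-byte
; steps (see ROUND_00_15).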
K256_8_MB:
        dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
        dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
        dq 0x7137449171374491, 0x7137449171374491
        dq 0x7137449171374491, 0x7137449171374491
        dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
        dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
        dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
        dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
        dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
        dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
        dq 0x59f111f159f111f1, 0x59f111f159f111f1
        dq 0x59f111f159f111f1, 0x59f111f159f111f1
        dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
        dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
        dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
        dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
        dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
        dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
        dq 0x12835b0112835b01, 0x12835b0112835b01
        dq 0x12835b0112835b01, 0x12835b0112835b01
        dq 0x243185be243185be, 0x243185be243185be
        dq 0x243185be243185be, 0x243185be243185be
        dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
        dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
        dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
        dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
        dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
        dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
        dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
        dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
        dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
        dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
        dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
        dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
        dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
        dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
        dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
        dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
        dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
        dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
        dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
        dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
        dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
        dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
        dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
        dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
        dq 0x76f988da76f988da, 0x76f988da76f988da
        dq 0x76f988da76f988da, 0x76f988da76f988da
        dq 0x983e5152983e5152, 0x983e5152983e5152
        dq 0x983e5152983e5152, 0x983e5152983e5152
        dq 0xa831c66da831c66d, 0xa831c66da831c66d
        dq 0xa831c66da831c66d, 0xa831c66da831c66d
        dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
        dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
        dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
        dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
        dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
        dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
        dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
        dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
        dq 0x06ca635106ca6351, 0x06ca635106ca6351
        dq 0x06ca635106ca6351, 0x06ca635106ca6351
        dq 0x1429296714292967, 0x1429296714292967
        dq 0x1429296714292967, 0x1429296714292967
        dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
        dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
        dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
        dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
        dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
        dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
        dq 0x53380d1353380d13, 0x53380d1353380d13
        dq 0x53380d1353380d13, 0x53380d1353380d13
        dq 0x650a7354650a7354, 0x650a7354650a7354
        dq 0x650a7354650a7354, 0x650a7354650a7354
        dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
        dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
        dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
        dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
        dq 0x92722c8592722c85, 0x92722c8592722c85
        dq 0x92722c8592722c85, 0x92722c8592722c85
        dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
        dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
        dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
        dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
        dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
        dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
        dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
        dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
        dq 0xd192e819d192e819, 0xd192e819d192e819
        dq 0xd192e819d192e819, 0xd192e819d192e819
        dq 0xd6990624d6990624, 0xd6990624d6990624
        dq 0xd6990624d6990624, 0xd6990624d6990624
        dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
        dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
        dq 0x106aa070106aa070, 0x106aa070106aa070
        dq 0x106aa070106aa070, 0x106aa070106aa070
        dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
        dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
        dq 0x1e376c081e376c08, 0x1e376c081e376c08
        dq 0x1e376c081e376c08, 0x1e376c081e376c08
        dq 0x2748774c2748774c, 0x2748774c2748774c
        dq 0x2748774c2748774c, 0x2748774c2748774c
        dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
        dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
        dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
        dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
        dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
        dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
        dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
        dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
        dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
        dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
        dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
        dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
        dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
        dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
        dq 0x84c8781484c87814, 0x84c8781484c87814
        dq 0x84c8781484c87814, 0x84c8781484c87814
        dq 0x8cc702088cc70208, 0x8cc702088cc70208
        dq 0x8cc702088cc70208, 0x8cc702088cc70208
        dq 0x90befffa90befffa, 0x90befffa90befffa
        dq 0x90befffa90befffa, 0x90befffa90befffa
        dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
        dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
        dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
        dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
        dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
        dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
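
; vpshufb mask that byte-swaps every 32-bit word: the message bytes are
; loaded little-endian, while SHA-256 treats them as big-endian words.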
PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
                         dq 0x0405060700010203, 0x0c0d0e0f08090a0b