]> git.proxmox.com Git - ceph.git/blob - ceph/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x8_avx512.asm
a93fecb1bab9a5415b6fe7a5c51e0069bc3374db
[ceph.git] / ceph / src / crypto / isa-l / isa-l_crypto / sha512_mb / sha512_mb_x8_avx512.asm
1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
3 ;
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
6 ; are met:
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
12 ; distribution.
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
16 ;
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29
30 %include "sha512_mb_mgr_datastruct.asm"
31 %include "reg_sizes.asm"
32
33 %ifdef HAVE_AS_KNOWS_AVX512
34
35 [bits 64]
36 default rel
37 section .text
38
39 ;; code to compute 8-lane (x8) SHA512 using AVX512
40 ;; use ZMMs to tackle the larger digest size
41 ;; outer calling routine takes care of save and restore of XMM registers
42 ;; Logic designed/laid out by JDG
43
44 ;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; zmm0-31
45 ;; Stack must be aligned to 32 bytes before call
46 ;; Windows clobbers: rax rbx rdx rdi rbp r8 r9 r10 r11 r12 r13 r14 r15
47 ;; Windows preserves: rcx rsi
48 ;;
49 ;; Linux clobbers: rax rbx rcx rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
50 ;; Linux preserves: rdx rdi
51 ;;
52 ;; clobbers zmm0-31
53
; APPEND(a,b): paste the tokens a and b together with NASM's %+ operator.
; Used further down to build the register names W0..W15 from a numeric
; %assign counter.
54 %define APPEND(a,b) a %+ b
55
; Per-ABI register roles.
;   arg1 / arg2 : the two function arguments (aliased to state / num_blks below)
;   var1        : scratch register, aliased to IDX below
;   reg3, reg4, var2 : defined here but not referenced elsewhere in the
;                      visible part of this file
; local_func_decl(name) emits the symbol declaration for the entry point:
; plain `global` on win64, `mk_global name, function, internal` otherwise
; (mk_global is not defined in this file).
56 %ifidn __OUTPUT_FORMAT__, win64
57 %define arg1 rcx ; arg0 preserved
58 %define arg2 rdx ; arg1
59 %define reg3 r8 ; arg2 -- NOTE: r8 is also TBL and is overwritten by the function
60 %define reg4 r9 ; arg3 -- NOTE: r9 is also inp0 and is overwritten by the function
61 %define var1 rdi ; usable
62 %define var2 rsi
63 %define local_func_decl(func_name) global func_name
64 %else
65 %define arg1 rdi ; arg0
66 %define arg2 rsi ; arg1
67 %define var2 rdx ; arg2
68 %define var1 rcx ; arg3 usable
69 %define local_func_decl(func_name) mk_global func_name, function, internal
70 %endif
71
; Symbolic names for the two arguments.
72 %define state arg1
73 %define num_blks arg2
74
; IN     : address of the per-lane input pointers inside the state structure
;          (_data_ptr is not defined in this file; presumably it comes from
;          sha512_mb_mgr_datastruct.asm -- TODO confirm).
; DIGEST : the digest rows are accessed at offset 0 of the state structure.
; SIZE   : remaining block count; decremented in place by the main loop.
75 %define IN (state + _data_ptr)
76 %define DIGEST state
77 %define SIZE num_blks
78
; IDX : running byte offset added to every lane's input pointer.
; TBL : base address of the round-constant TABLE.
79 %define IDX var1
80 %define TBL r8
81
; Unaligned 512-bit move, used for the final digest write-back.
82 %define VMOVDQ32 vmovdqu32
83
; The next five names are not referenced elsewhere in the visible part of this file.
84 %define SHA512_DIGEST_WORD_SIZE 8
85 %define NUM_SHA512_DIGEST_WORDS 8
86 %define SHA512_DIGEST_ROW_SIZE 8*8
87 %define PTR_SZ 8
88 %define _data_ptr_sha512 _data_ptr
89
; NOTE: SZ8, DIGEST_SZ and DIGEST_SAVE expand to unparenthesized products.
; That is harmless in the additive address expressions used in this file
; (e.g. [inp0 + SZ8 + IDX]) but would misgroup next to a higher-precedence
; operator. DIGEST_SAVE and RSP_SAVE are not referenced in the visible part
; of this file; the stack frame uses the _DIGEST_SAVE / _RSP fields declared below.
90 %define NUM_LANES 8
91 %define SZ 8
92 %define SZ8 8 * SZ
93 %define DIGEST_SZ 8 * SZ8
94 %define DIGEST_SAVE NUM_LANES * DIGEST_SZ
95 %define RSP_SAVE 1*8
96
97 ; Define Stack Layout
; START_FIELDS / FIELD are macros that are not defined in this file.
; _DIGEST_SAVE reserves NUM_LANES*8*64 = 4096 bytes; the code in this file
; only stores eight 64-byte rows (offsets 64*0 .. 64*7) into it.
; _RSP holds the caller's stack pointer across the 64-byte realignment.
98 START_FIELDS
99 ;;; name size align
100 FIELD _DIGEST_SAVE, NUM_LANES*8*64, 64
101 FIELD _RSP, 8, 8
102 %assign STACK_SPACE _FIELD_OFFSET
103
104
; Per-lane input data pointers, loaded from [IN + lane*8] at function entry.
; NOTE: inp7 is rax, which the prologue also uses to carry the caller's rsp;
; that value is stored to [rsp + _RSP] before inp7 is loaded.
105 %define inp0 r9
106 %define inp1 r10
107 %define inp2 r11
108 %define inp3 r12
109 %define inp4 r13
110 %define inp5 r14
111 %define inp6 r15
112 %define inp7 rax
113
; SHA-512 working variables a..h, one zmm register each. The names are
; re-bound every round by ROTATE_ARGS, so these are only the initial bindings.
114 %define A zmm0
115 %define B zmm1
116 %define C zmm2
117 %define D zmm3
118 %define E zmm4
119 %define F zmm5
120 %define G zmm6
121 %define H zmm7
; Scratch registers: T1 and TMP0-TMP3 are used by PROCESS_LOOP (TMP3 carries
; the round constant Kt); TMP0-TMP3 are also passed to TRANSPOSE8;
; TMP4-TMP6 are used by MSG_SCHED_ROUND_16_79.
122 %define T1 zmm8
123 %define TMP0 zmm9
124 %define TMP1 zmm10
125 %define TMP2 zmm11
126 %define TMP3 zmm12
127 %define TMP4 zmm13
128 %define TMP5 zmm14
129 %define TMP6 zmm15
130
131
; Sixteen-entry rolling window of message-schedule words W[t]..W[t+15].
132 %define W0 zmm16
133 %define W1 zmm17
134 %define W2 zmm18
135 %define W3 zmm19
136 %define W4 zmm20
137 %define W5 zmm21
138 %define W6 zmm22
139 %define W7 zmm23
140 %define W8 zmm24
141 %define W9 zmm25
142 %define W10 zmm26
143 %define W11 zmm27
144 %define W12 zmm28
145 %define W13 zmm29
146 %define W14 zmm30
147 %define W15 zmm31
148
149 ; from sha256_fips180-2.pdf (the values below are the SHA-512 counts, applied to 64-bit words)
150 ; define rotate counts for the Sigma functions used in the main loop steps (all three are rotates, vprorq)
151 %define BIG_SIGMA_0_0 28 ; Sigma0
152 %define BIG_SIGMA_0_1 34
153 %define BIG_SIGMA_0_2 39
154 %define BIG_SIGMA_1_0 14 ; Sigma1
155 %define BIG_SIGMA_1_1 18
156 %define BIG_SIGMA_1_2 41
157
158 ; define rotate/shift counts for the sigma functions used in the scheduling steps
; NOTE: for the small sigmas the *_0 and *_1 values are rotate counts (vprorq)
; while the *_2 value is a logical right-shift count (vpsrlq).
159
160 %define SMALL_SIGMA_0_0 1 ; sigma0
161 %define SMALL_SIGMA_0_1 8
162 %define SMALL_SIGMA_0_2 7
163 %define SMALL_SIGMA_1_0 19 ; sigma1
164 %define SMALL_SIGMA_1_1 61
165 %define SMALL_SIGMA_1_2 6
166
; 80 rounds per block; the first 64 also compute the schedule word needed 16 rounds later.
167 %define SHA_MAX_ROUNDS 80
168 %define SHA_ROUNDS_LESS_16 (SHA_MAX_ROUNDS - 16)
169
;; TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1, PERM_INDEX1, PERM_INDEX2
;;   Transposes an 8x8 matrix of 64-bit elements held in eight zmm registers.
;;   In      : %1..%8 (r0..r7) = rows a..h
;;   Out     : %1..%7 (r0..r6) = transposed rows 0..6
;;             %9     (t0)     = transposed row 7 (NOT r7, see the end of the macro)
;;   Scratch : %8 (r7) and %10 (t1) are overwritten; %11 / %12 are loaded with
;;             TRANSPOSE8_PERM_INDEX_1 / TRANSPOSE8_PERM_INDEX_2 (aligned loads).
;;   The instruction order matters: registers are reused as soon as they are free.
170 %macro TRANSPOSE8 12
171 %define %%r0 %1
172 %define %%r1 %2
173 %define %%r2 %3
174 %define %%r3 %4
175 %define %%r4 %5
176 %define %%r5 %6
177 %define %%r6 %7
178 %define %%r7 %8
179 %define %%t0 %9
180 %define %%t1 %10
181 %define %%PERM_INDEX1 %11
182 %define %%PERM_INDEX2 %12
183
184
185 ; each element is 64 bits, 8 * 64 = 512 ==> one full zmm register per row
186 ; r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
187 ; r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
188 ; r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
189 ; r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
190 ; r4 = {e7 e6 e5 e4 e3 e2 e1 e0}
191 ; r5 = {f7 f6 f5 f4 f3 f2 f1 f0}
192 ; r6 = {g7 g6 g5 g4 g3 g2 g1 g0}
193 ; r7 = {h7 h6 h5 h4 h3 h2 h1 h0}
194
195 ;; ;;; will not get clobbered
196 vmovdqa32 %%PERM_INDEX1, [TRANSPOSE8_PERM_INDEX_1] ; temp
197 vmovdqa32 %%PERM_INDEX2, [TRANSPOSE8_PERM_INDEX_2] ; temp
198
199 ; process top half (r0..r3) {a...d}
200 vshufpd %%t0, %%r0, %%r1, 0x00 ; t0 = {b6 a6 b4 a4 b2 a2 b0 a0}
201 vshufpd %%r0, %%r0, %%r1, 0xFF ; r0 = {b7 a7 b5 a5 b3 a3 b1 a1}
202 vshufpd %%t1, %%r2, %%r3, 0x00 ; t1 = {d6 c6 d4 c4 d2 c2 d0 c0}
203 vshufpd %%r2, %%r2, %%r3, 0xFF ; r2 = {d7 c7 d5 c5 d3 c3 d1 c1}
204
205 vmovdqa32 %%r1, %%t0 ; r1 and r3 free
206 vpermt2q %%r1, %%PERM_INDEX1,%%t1 ; r1 = {d4 c4 b4 a4 d0 c0 b0 a0}
207 vpermt2q %%t0, %%PERM_INDEX2,%%t1 ; t0 = {d6 c6 b6 a6 d2 c2 b2 a2}
208
209 vmovdqa32 %%t1, %%r0 ; t1 and r3 free
210 vpermt2q %%t1, %%PERM_INDEX1,%%r2 ; t1 = {d5 c5 b5 a5 d1 c1 b1 a1}
211 vpermt2q %%r0, %%PERM_INDEX2,%%r2 ; r0 = {d7 c7 b7 a7 d3 c3 b3 a3}
212
213 ;; Likewise for bottom half (r4..r7) {e...h} ; r2 and r3 free
214 vshufpd %%r2, %%r4, %%r5, 0x00 ; r2 = {f6 e6 f4 e4 f2 e2 f0 e0}
215 vshufpd %%r4, %%r4, %%r5, 0xFF ; r4 = {f7 e7 f5 e5 f3 e3 f1 e1}
216 vshufpd %%r3, %%r6, %%r7, 0x00 ; r3 = {h6 g6 h4 g4 h2 g2 h0 g0}
217 vshufpd %%r6, %%r6, %%r7, 0xFF ; r6 = {h7 g7 h5 g5 h3 g3 h1 g1}
218
219 vmovdqa32 %%r5, %%r2 ; r5 and r7 free
220 vpermt2q %%r5, %%PERM_INDEX1,%%r3 ; r5 = {h4 g4 f4 e4 h0 g0 f0 e0}
221 vpermt2q %%r2, %%PERM_INDEX2,%%r3 ; r2 = {h6 g6 f6 e6 h2 g2 f2 e2}
222
223 vmovdqa32 %%r7, %%r4
224 vpermt2q %%r7, %%PERM_INDEX1,%%r6 ; r7 = {h5 g5 f5 e5 h1 g1 f1 e1}
225 vpermt2q %%r4, %%PERM_INDEX2,%%r6 ; r4 = {h7 g7 f7 e7 h3 g3 f3 e3}
226
227 ;;; free r3, r6
228 vshuff64x2 %%r6, %%t0, %%r2, 0xEE ; r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
229 vshuff64x2 %%r2, %%t0, %%r2, 0x44 ; r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
230
231 ;;; t0 and r3 free
232 vshuff64x2 %%r3, %%r0, %%r4, 0x44 ; r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
233 vshuff64x2 %%t0, %%r0, %%r4, 0xEE ; t0 = {h7 g7 f7 e7 d7 c7 b7 a7}
234
235 vshuff64x2 %%r4, %%r1, %%r5, 0xEE ; r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
236 vshuff64x2 %%r0, %%r1, %%r5, 0x44 ; r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
237
238
239 vshuff64x2 %%r5, %%t1, %%r7, 0xEE ; r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
240 vshuff64x2 %%r1, %%t1, %%r7, 0x44 ; r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
241
242 ;; callers re-order the arguments (real row-7 register passed as t0) to avoid this move
243 ;vmovdqa32 %%r7, %%t0
244
245 ; Output looks like: {r0 r1 r2 r3 r4 r5 r6 t0}
246 ; r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
247 ; r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
248 ; r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
249 ; r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
250 ; r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
251 ; r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
252 ; r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
253 ; row 7 is delivered in t0; r7 is left holding {h5 g5 f5 e5 h1 g1 f1 e1}
254 ; t0 = {h7 g7 f7 e7 d7 c7 b7 a7}
255 %endmacro
256
;; ROTATE_ARGS
;;   Re-binds the names A..H for the next round without moving any data:
;;   the register that was called H becomes A, A becomes B, ..., G becomes H.
;;   %xdefine expands its right-hand side when the line is processed, so the
;;   order of the lines below is essential (H must be captured in TMP_ first,
;;   and each name must be re-bound before the name it was copied from changes).
;;   Side effect: leaves the single-line macro TMP_ defined.
257 %macro ROTATE_ARGS 0
258 %xdefine TMP_ H
259 %xdefine H G
260 %xdefine G F
261 %xdefine F E
262 %xdefine E D
263 %xdefine D C
264 %xdefine C B
265 %xdefine B A
266 %xdefine A TMP_
267 %endm
268
269
270
271 ;; CH(E, F, G) = (E&F) ^ (~E&G)
272 ;; MAJ(A, B, C) = (A&B) ^ (A&C) ^ (B&C)
273 ;; SIGMA0 = ROR_28 ^ ROR_34 ^ ROR_39
274 ;; SIGMA1 = ROR_14 ^ ROR_18 ^ ROR_41
275 ;; sigma0 = ROR_1 ^ ROR_8 ^ SHR_7
276 ;; sigma1 = ROR_19 ^ ROR_61 ^ SHR_6
277
278 ;; Main processing loop per round
279 ;; equivalent to %macro ROUND_00_15 2
;; PROCESS_LOOP Wt, round
;;   One SHA-512 compression round applied to every 64-bit element of the
;;   working-variable registers.
;;     %1 (Wt)    : zmm register holding the message-schedule word for this round
;;     %2 (round) : assemble-time round number, 0 .. SHA_MAX_ROUNDS-1
;;   In       : A..H = working variables, TMP3 = Kt for this round,
;;              TBL  = base of the round-constant TABLE (64 bytes per round)
;;   Out      : working variables advanced by one round (names re-bound by
;;              ROTATE_ARGS); TMP3 = Kt for round+1, except after the final
;;              round, where no further constant exists and TMP3 is left
;;              holding ROR_39(A).
;;   Clobbers : T1, TMP0, TMP1, TMP2, TMP3
%macro PROCESS_LOOP 2
%define %%WT    %1
%define %%ROUND %2
        ;; T1 = H + BIG_SIGMA_1(E) + CH(E, F, G) + Kt + Wt
        ;; T2 = BIG_SIGMA_0(A) + MAJ(A, B, C)
        ;; H=G, G=F, F=E, E=D+T1, D=C, C=B, B=A, A=T1+T2

        ;; H becomes T2, then add T1 for A
        ;; D becomes D + T1 for E

        vpaddq          T1, H, TMP3                     ; T1 = H + Kt
        vmovdqa32       TMP0, E
        ;; compute BIG_SIGMA_1(E)
        vprorq          TMP1, E, BIG_SIGMA_1_0          ; ROR_14(E)
        vprorq          TMP2, E, BIG_SIGMA_1_1          ; ROR_18(E)
        vprorq          TMP3, E, BIG_SIGMA_1_2          ; ROR_41(E)
        vpternlogq      TMP1, TMP2, TMP3, 0x96          ; TMP1 = BIG_SIGMA_1(E)
        vpternlogq      TMP0, F, G, 0xCA                ; TMP0 = CH(E,F,G)
        vpaddq          T1, T1, %%WT                    ; T1 = T1 + Wt
        vpaddq          T1, T1, TMP0                    ; T1 = T1 + CH(E,F,G)
        vpaddq          T1, T1, TMP1                    ; T1 = T1 + BIG_SIGMA_1(E)
        vpaddq          D, D, T1                        ; D = D + T1
        vprorq          H, A, BIG_SIGMA_0_0             ; ROR_28(A)
        vprorq          TMP2, A, BIG_SIGMA_0_1          ; ROR_34(A)
        vprorq          TMP3, A, BIG_SIGMA_0_2          ; ROR_39(A)
        vmovdqa32       TMP0, A
        vpternlogq      TMP0, B, C, 0xE8                ; TMP0 = MAJ(A,B,C)
        vpternlogq      H, TMP2, TMP3, 0x96             ; H(T2) = BIG_SIGMA_0(A)
        vpaddq          H, H, TMP0                      ; H(T2) = BIG_SIGMA_0(A) + MAJ(A,B,C)
        vpaddq          H, H, T1                        ; H(A) = H(T2) + T1

        ;; Pre-load Kt for the following round. TABLE has exactly
        ;; SHA_MAX_ROUNDS rows, so the final round must not load: that read
        ;; would land one row past the end of TABLE and its value is never used.
%if (%%ROUND + 1) < SHA_MAX_ROUNDS
        vmovdqa32       TMP3, [TBL + ((%%ROUND+1)*64)]  ; Next Kt
%endif

        ;; Rotate the args A-H (rotation of names associated with regs)
        ROTATE_ARGS
%endmacro
315
;; MSG_SCHED_ROUND_16_79 Wt, Wt+1, Wt+9, Wt+14
;;   Message-schedule step on the rolling 16-register window. Overwrites %1
;;   in place with the schedule word needed 16 rounds later:
;;       %1 = %1 + sigma_1(%4) + %3 + sigma_0(%2)
;;   Relative to the word being produced, %1 holds W[t-16], %2 W[t-15],
;;   %3 W[t-7] and %4 W[t-2].
;;   Clobbers : TMP4, TMP5, TMP6
%macro MSG_SCHED_ROUND_16_79 4
%define %%W_M16 %1                      ; in: W[t-16], out: W[t]
%define %%W_M15 %2                      ; W[t-15]
%define %%W_M7  %3                      ; W[t-7]
%define %%W_M2  %4                      ; W[t-2]

        ;; sigma_1(W[t-2]) = ROR_19 ^ ROR_61 ^ SHR_6
        vprorq          TMP4, %%W_M2, SMALL_SIGMA_1_0
        vprorq          TMP5, %%W_M2, SMALL_SIGMA_1_1
        vpsrlq          TMP6, %%W_M2, SMALL_SIGMA_1_2
        vpternlogq      TMP4, TMP5, TMP6, 0x96          ; three-way XOR

        vpaddq          %%W_M16, %%W_M16, TMP4          ; += sigma_1(W[t-2])
        vpaddq          %%W_M16, %%W_M16, %%W_M7        ; += W[t-7]

        ;; sigma_0(W[t-15]) = ROR_1 ^ ROR_8 ^ SHR_7
        vprorq          TMP4, %%W_M15, SMALL_SIGMA_0_0
        vprorq          TMP5, %%W_M15, SMALL_SIGMA_0_1
        vpsrlq          TMP6, %%W_M15, SMALL_SIGMA_0_2
        vpternlogq      TMP4, TMP5, TMP6, 0x96          ; three-way XOR

        vpaddq          %%W_M16, %%W_M16, TMP4          ; += sigma_0(W[t-15])

%endmacro
338
align 64

;; void sha512_mb_x8_avx512(SHA512_MB_ARGS_X8, uint32_t size)
;;   arg 1 : pointer to the multi-buffer args structure ("state"), not to raw
;;           input data. The eight 64-byte digest rows are read from and
;;           written back to offset 0 of it (DIGEST); the eight per-lane input
;;           pointers are read from state + _data_ptr (IN) and are advanced
;;           past the consumed data before returning.
;;   arg 2 : size (in blocks) ;; assumed to be >= 1
;;           NOTE: the count is decremented and tested as a full 64-bit
;;           register (sub SIZE, 1), so the caller must pass a value that is
;;           valid in all 64 bits. A count of 0 is not handled.
;;
;;   Every lane processes the same number of 128-byte blocks.
;;   Stack: the function builds its own 64-byte-aligned frame of STACK_SPACE
;;          bytes and restores the caller's rsp from [rsp + _RSP] on exit.
;;   Registers written: rax, arg2 (SIZE), var1 (IDX), r8 (TBL), r9-r15,
;;          zmm0-zmm31. r12-r15 are not saved here; per the note at the top of
;;          the file the outer calling routine is responsible for that.
local_func_decl(sha512_mb_x8_avx512)
sha512_mb_x8_avx512:
        endbranch
        mov     rax, rsp                        ; caller's rsp, stored below
        sub     rsp, STACK_SPACE
        and     rsp, ~63                        ; align stack to multiple of 64
        mov     [rsp + _RSP], rax
        lea     TBL, [TABLE]

        ;; Initialize digests
        vmovups A, [DIGEST + 0*8*8]
        vmovups B, [DIGEST + 1*8*8]
        vmovups C, [DIGEST + 2*8*8]
        vmovups D, [DIGEST + 3*8*8]
        vmovups E, [DIGEST + 4*8*8]
        vmovups F, [DIGEST + 5*8*8]
        vmovups G, [DIGEST + 6*8*8]
        vmovups H, [DIGEST + 7*8*8]

        xor     IDX, IDX
        ;; Read in input data address, saving them in registers because
        ;; they will serve as variables, which we shall keep incrementing
        mov     inp0, [IN + 0*8]
        mov     inp1, [IN + 1*8]
        mov     inp2, [IN + 2*8]
        mov     inp3, [IN + 3*8]
        mov     inp4, [IN + 4*8]
        mov     inp5, [IN + 5*8]
        mov     inp6, [IN + 6*8]
        mov     inp7, [IN + 7*8]                ; inp7 is rax; saved rsp is already on the stack

lloop:

        ;; first half of 1024 (need to transpose before use)
        ;; Row 7 is loaded into TMP0 and W7 is passed as TRANSPOSE8's t0,
        ;; because the macro delivers transposed row 7 in t0.
        vmovups W0, [inp0 + IDX]
        vmovups W1, [inp1 + IDX]
        vmovups W2, [inp2 + IDX]
        vmovups W3, [inp3 + IDX]
        vmovups W4, [inp4 + IDX]
        vmovups W5, [inp5 + IDX]
        vmovups W6, [inp6 + IDX]
        vmovups TMP0, [inp7 + IDX]
        TRANSPOSE8 W0, W1, W2, W3, W4, W5, W6, TMP0, W7, TMP1, TMP2, TMP3
        ;; second half of 1024 (need to transpose before use)
        vmovups W8, [inp0 + SZ8 + IDX]
        vmovups W9, [inp1 + SZ8 + IDX]
        vmovups W10, [inp2 + SZ8 + IDX]
        vmovups W11, [inp3 + SZ8 + IDX]
        vmovups W12, [inp4 + SZ8 + IDX]
        vmovups W13, [inp5 + SZ8 + IDX]
        vmovups W14, [inp6 + SZ8 + IDX]
        vmovups TMP0, [inp7 + SZ8 + IDX]
        TRANSPOSE8 W8, W9, W10, W11, W12, W13, W14, TMP0, W15, TMP1, TMP2, TMP3

        vmovdqa32 TMP2, [PSHUFFLE_BYTE_FLIP_MASK]

        vmovdqa32 TMP3, [TBL]                   ; First K

        ; Save digests for later addition.
        ; (Stored once per block; A..H are not modified again until the
        ; first round below.)
        vmovdqa32 [rsp + _DIGEST_SAVE + 64*0], A
        vmovdqa32 [rsp + _DIGEST_SAVE + 64*1], B
        vmovdqa32 [rsp + _DIGEST_SAVE + 64*2], C
        vmovdqa32 [rsp + _DIGEST_SAVE + 64*3], D
        vmovdqa32 [rsp + _DIGEST_SAVE + 64*4], E
        vmovdqa32 [rsp + _DIGEST_SAVE + 64*5], F
        vmovdqa32 [rsp + _DIGEST_SAVE + 64*6], G
        vmovdqa32 [rsp + _DIGEST_SAVE + 64*7], H

        add     IDX, 128                        ; increment by message block length in bytes

%assign I 0
%rep 16
        ;;; little endian to big endian
        vpshufb APPEND(W,I), APPEND(W,I), TMP2
%assign I (I+1)
%endrep

        ; MSG Schedule for W0-W15 is now complete in registers
        ; Process first (max-rounds -16)
        ; Calculate next Wt+16 after processing is complete and Wt is unneeded

        ; PROCESS_LOOP_00_79 APPEND(W,J), I, APPEND(W,K), APPEND(W,L), APPEND(W,M)

%assign I 0
%assign J 0
%assign K 1
%assign L 9
%assign M 14
%rep SHA_ROUNDS_LESS_16
        PROCESS_LOOP APPEND(W,J), I
        MSG_SCHED_ROUND_16_79 APPEND(W,J), APPEND(W,K), APPEND(W,L), APPEND(W,M)
%assign I (I+1)
%assign J ((J+1)% 16)
%assign K ((K+1)% 16)
%assign L ((L+1)% 16)
%assign M ((M+1)% 16)
%endrep
        ; Check if this is the last block
        sub     SIZE, 1
        je      lastLoop

        ; Process last 16 rounds (no further schedule words are needed),
        ; then fold in the saved digest and go back for the next block.
%assign I SHA_ROUNDS_LESS_16
%assign J 0
%rep 16
        PROCESS_LOOP APPEND(W,J), I
%assign I (I+1)
%assign J (J+1)
%endrep
        ; Add old digest
        vpaddq  A, A, [rsp + _DIGEST_SAVE + 64*0]
        vpaddq  B, B, [rsp + _DIGEST_SAVE + 64*1]
        vpaddq  C, C, [rsp + _DIGEST_SAVE + 64*2]
        vpaddq  D, D, [rsp + _DIGEST_SAVE + 64*3]
        vpaddq  E, E, [rsp + _DIGEST_SAVE + 64*4]
        vpaddq  F, F, [rsp + _DIGEST_SAVE + 64*5]
        vpaddq  G, G, [rsp + _DIGEST_SAVE + 64*6]
        vpaddq  H, H, [rsp + _DIGEST_SAVE + 64*7]

        jmp     lloop


lastLoop:
        ; Process last 16 rounds
%assign I SHA_ROUNDS_LESS_16
%assign J 0

%rep 16
        PROCESS_LOOP APPEND(W,J), I
%assign I (I+1)
%assign J (J+1)
%endrep

        ; Add old digest
        vpaddq  A, A, [rsp + _DIGEST_SAVE + 64*0]
        vpaddq  B, B, [rsp + _DIGEST_SAVE + 64*1]
        vpaddq  C, C, [rsp + _DIGEST_SAVE + 64*2]
        vpaddq  D, D, [rsp + _DIGEST_SAVE + 64*3]
        vpaddq  E, E, [rsp + _DIGEST_SAVE + 64*4]
        vpaddq  F, F, [rsp + _DIGEST_SAVE + 64*5]
        vpaddq  G, G, [rsp + _DIGEST_SAVE + 64*6]
        vpaddq  H, H, [rsp + _DIGEST_SAVE + 64*7]

        ;; update into data pointers
        ;; (two lanes per iteration: all eight stored pointers advance by IDX,
        ;; the number of bytes consumed; inp0/inp1 are reused as scratch)
%assign I 0
%rep 4
        mov     inp0, [IN + (2*I)*8]
        mov     inp1, [IN + (2*I +1)*8]
        add     inp0, IDX
        add     inp1, IDX
        mov     [IN + (2*I)*8], inp0
        mov     [IN + (2*I+1)*8], inp1
%assign I (I+1)
%endrep

        ;; write the updated digests back (unaligned stores)
        VMOVDQ32 [DIGEST + 0*8*8], A
        VMOVDQ32 [DIGEST + 1*8*8], B
        VMOVDQ32 [DIGEST + 2*8*8], C
        VMOVDQ32 [DIGEST + 3*8*8], D
        VMOVDQ32 [DIGEST + 4*8*8], E
        VMOVDQ32 [DIGEST + 5*8*8], F
        VMOVDQ32 [DIGEST + 6*8*8], G
        VMOVDQ32 [DIGEST + 7*8*8], H

        mov     rsp, [rsp + _RSP]               ; restore caller's stack pointer
        ret
523
section .data
align 64
; 80 round constants for SHA512.
; Each constant is replicated 8 times (one copy per lane) so that a single
; 64-byte load yields Kt for all lanes: row t lives at TABLE + t*64.
; .. space tradeoff for time!
; local to asm file, used nowhere else
TABLE:
        times 8 dq 0x428a2f98d728ae22
        times 8 dq 0x7137449123ef65cd
        times 8 dq 0xb5c0fbcfec4d3b2f
        times 8 dq 0xe9b5dba58189dbbc
        times 8 dq 0x3956c25bf348b538
        times 8 dq 0x59f111f1b605d019
        times 8 dq 0x923f82a4af194f9b
        times 8 dq 0xab1c5ed5da6d8118
        times 8 dq 0xd807aa98a3030242
        times 8 dq 0x12835b0145706fbe
        times 8 dq 0x243185be4ee4b28c
        times 8 dq 0x550c7dc3d5ffb4e2
        times 8 dq 0x72be5d74f27b896f
        times 8 dq 0x80deb1fe3b1696b1
        times 8 dq 0x9bdc06a725c71235
        times 8 dq 0xc19bf174cf692694
        times 8 dq 0xe49b69c19ef14ad2
        times 8 dq 0xefbe4786384f25e3
        times 8 dq 0x0fc19dc68b8cd5b5
        times 8 dq 0x240ca1cc77ac9c65
        times 8 dq 0x2de92c6f592b0275
        times 8 dq 0x4a7484aa6ea6e483
        times 8 dq 0x5cb0a9dcbd41fbd4
        times 8 dq 0x76f988da831153b5
        times 8 dq 0x983e5152ee66dfab
        times 8 dq 0xa831c66d2db43210
        times 8 dq 0xb00327c898fb213f
        times 8 dq 0xbf597fc7beef0ee4
        times 8 dq 0xc6e00bf33da88fc2
        times 8 dq 0xd5a79147930aa725
        times 8 dq 0x06ca6351e003826f
        times 8 dq 0x142929670a0e6e70
        times 8 dq 0x27b70a8546d22ffc
        times 8 dq 0x2e1b21385c26c926
        times 8 dq 0x4d2c6dfc5ac42aed
        times 8 dq 0x53380d139d95b3df
        times 8 dq 0x650a73548baf63de
        times 8 dq 0x766a0abb3c77b2a8
        times 8 dq 0x81c2c92e47edaee6
        times 8 dq 0x92722c851482353b
        times 8 dq 0xa2bfe8a14cf10364
        times 8 dq 0xa81a664bbc423001
        times 8 dq 0xc24b8b70d0f89791
        times 8 dq 0xc76c51a30654be30
        times 8 dq 0xd192e819d6ef5218
        times 8 dq 0xd69906245565a910
        times 8 dq 0xf40e35855771202a
        times 8 dq 0x106aa07032bbd1b8
        times 8 dq 0x19a4c116b8d2d0c8
        times 8 dq 0x1e376c085141ab53
        times 8 dq 0x2748774cdf8eeb99
        times 8 dq 0x34b0bcb5e19b48a8
        times 8 dq 0x391c0cb3c5c95a63
        times 8 dq 0x4ed8aa4ae3418acb
        times 8 dq 0x5b9cca4f7763e373
        times 8 dq 0x682e6ff3d6b2b8a3
        times 8 dq 0x748f82ee5defb2fc
        times 8 dq 0x78a5636f43172f60
        times 8 dq 0x84c87814a1f0ab72
        times 8 dq 0x8cc702081a6439ec
        times 8 dq 0x90befffa23631e28
        times 8 dq 0xa4506cebde82bde9
        times 8 dq 0xbef9a3f7b2c67915
        times 8 dq 0xc67178f2e372532b
        times 8 dq 0xca273eceea26619c
        times 8 dq 0xd186b8c721c0c207
        times 8 dq 0xeada7dd6cde0eb1e
        times 8 dq 0xf57d4f7fee6ed178
        times 8 dq 0x06f067aa72176fba
        times 8 dq 0x0a637dc5a2c898a6
        times 8 dq 0x113f9804bef90dae
        times 8 dq 0x1b710b35131c471b
        times 8 dq 0x28db77f523047d84
        times 8 dq 0x32caab7b40c72493
        times 8 dq 0x3c9ebe0a15c9bebc
        times 8 dq 0x431d67c49c100d4c
        times 8 dq 0x4cc5d4becb3e42b6
        times 8 dq 0x597f299cfc657e2a
        times 8 dq 0x5fcb6fab3ad6faec
        times 8 dq 0x6c44198c4a475817

align 64
; Byte-reversal mask for vpshufb: reverses the bytes of every 64-bit word,
; i.e. converts the big-endian message words to little endian.
;; shuffle on ZMM is shuffle on 4 XMM size chunks, 128 bits
PSHUFFLE_BYTE_FLIP_MASK:
        dq 0x0001020304050607, 0x08090a0b0c0d0e0f
        dq 0x1011121314151617, 0x18191a1b1c1d1e1f
        dq 0x2021222324252627, 0x28292a2b2c2d2e2f
        dq 0x3031323334353637, 0x38393a3b3c3d3e3f

align 64
; vpermt2q index tables used by TRANSPOSE8 (indices 0-7 pick from the
; destination operand, 8-15 from the second source operand).
TRANSPOSE8_PERM_INDEX_1:
        dq 0x0000000000000000, 0x0000000000000001, 0x0000000000000008, 0x0000000000000009
        dq 0x0000000000000004, 0x0000000000000005, 0x000000000000000C, 0x000000000000000D

; No explicit align here: this table is loaded with an aligned move and stays
; 64-byte aligned only because the table above is exactly 64 bytes long.
TRANSPOSE8_PERM_INDEX_2:
        dq 0x0000000000000002, 0x0000000000000003, 0x000000000000000A, 0x000000000000000B
        dq 0x0000000000000006, 0x0000000000000007, 0x000000000000000E, 0x000000000000000F
638
; Fallback when HAVE_AS_KNOWS_AVX512 is not defined: none of the code or data
; above is assembled. On win64 only, a global placeholder label is emitted
; instead (presumably so the object file still defines a symbol -- TODO confirm
; against the Windows build).
639 %else
640 %ifidn __OUTPUT_FORMAT__, win64
641 global no_sha512_mb_x8_avx512
642 no_sha512_mb_x8_avx512:
643 %endif
644 %endif ; HAVE_AS_KNOWS_AVX512