;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Implement fast SHA-256 with SSSE3 instructions. (x86_64)
;
; Copyright (C) 2013 Intel Corporation.
;
; Authors:
;     James Guilford <james.guilford@intel.com>
;     Kirk Yap <kirk.s.yap@intel.com>
;     Tim Chen <tim.c.chen@linux.intel.com>
; Transcoded by:
;     Xiaodong Liu <xiaodong.liu@intel.com>
;
; This software is available to you under the OpenIB.org BSD license
; below:
;
;     Redistribution and use in source and binary forms, with or
;     without modification, are permitted provided that the following
;     conditions are met:
;
;      - Redistributions of source code must retain the above
;        copyright notice, this list of conditions and the following
;        disclaimer.
;
;      - Redistributions in binary form must reproduce the above
;        copyright notice, this list of conditions and the following
;        disclaimer in the documentation and/or other materials
;        provided with the distribution.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
; BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
; ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
; CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
; SOFTWARE.
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; This code is described in an Intel White-Paper:
; "Fast SHA-256 Implementations on Intel Architecture Processors"
;
; To find it, surf to http://www.intel.com/p/en_US/embedded
; and search for that title.
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%include "sha256_mb_mgr_datastruct.asm"
%include "reg_sizes.asm"

[bits 64]
default rel
section .text

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%ifidn __OUTPUT_FORMAT__, elf64
        ; Linux
        %define arg0    rdi
        %define arg1    rsi
%else
        ; Windows
        %define arg0    rcx
        %define arg1    rdx
%endif

%xdefine X0 xmm4
%xdefine X1 xmm5
%xdefine X2 xmm6
%xdefine X3 xmm7

%xdefine XTMP0 xmm0
%xdefine XTMP1 xmm1
%xdefine XTMP2 xmm2
%xdefine XTMP3 xmm3
%xdefine XTMP4 xmm8
%xdefine XFER xmm9

%define SHUF_00BA       xmm10   ; shuffle xBxA -> 00BA
%define SHUF_DC00       xmm11   ; shuffle xDxC -> DC00
%define BYTE_FLIP_MASK  xmm12

; arg indices start from 0, while mgr_flush/submit count from 1
%define MGR     arg0    ; rdi or rcx
%define NBLK    arg1    ; rsi or rdx
%define IDX     r8      ; local variable -- consistent with caller
%define NLANX4  r10     ; consistent with caller, should be r10

%define TMGR    r9      ; manager pointer, saved on the stack at _TMGR
%define INP     r9      ; data pointer, saved on the stack at _INP
%define SRND    r9      ; clobbers INP
%define TMP     r9      ; local variable -- helps address the digest

%xdefine TBL rbp
%xdefine c ecx
%xdefine d esi
%xdefine e edx
%xdefine a eax
%xdefine b ebx

%xdefine f edi
%xdefine g r12d
%xdefine h r11d

%xdefine y0 r13d
%xdefine y1 r14d
%xdefine y2 r15d


;; FRAMESZ plus pushes must be an odd multiple of 8
%define _STACK_ALIGN_SIZE 8     ; 0 or 8 depends on pushes
%define _INP_END_SIZE   8
%define _INP_SIZE       8
%define _TMGR_SIZE      8
%define _XFER_SIZE      16
%define _XMM_SAVE_SIZE  0
%define _GPR_SAVE_SIZE  8*9     ; rbx, rdx, rbp, (rdi, rsi), r12~r15

%define _STACK_ALIGN    0
%define _INP_END        (_STACK_ALIGN + _STACK_ALIGN_SIZE)
%define _INP            (_INP_END + _INP_END_SIZE)
%define _TMGR           (_INP + _INP_SIZE)
%define _XFER           (_TMGR + _TMGR_SIZE)
%define _XMM_SAVE       (_XFER + _XFER_SIZE)
%define _GPR_SAVE       (_XMM_SAVE + _XMM_SAVE_SIZE)
%define STACK_SIZE      (_GPR_SAVE + _GPR_SAVE_SIZE)
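
;; Resulting offsets from rsp, as computed from the defines above:
;;   _INP_END = 8, _INP = 16, _TMGR = 24, _XFER = 32, _GPR_SAVE = 48,
;;   STACK_SIZE = 48 + 8*9 = 120.  120 is an odd multiple of 8, so after the
;;   call's return-address push "sub rsp, STACK_SIZE" leaves rsp 16-byte
;;   aligned, as required by the movdqa stores to [rsp + _XFER].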

;; assume buffers not aligned
%define MOVDQ movdqu

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros

; addm reg, [mem]
; Add reg to [mem] and load the sum back into reg
%macro addm 2
        add     %2, %1  ;changed
        mov     %1, %2  ;changed
%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
; Load xmm with mem and byte swap each dword
%macro COPY_XMM_AND_BSWAP 3
        MOVDQ   %1, %2  ;changed
        pshufb  %1, %3  ;changed
%endmacro

; rotate_Xs
; Rotate values of symbols X0...X3
%macro rotate_Xs 0
%xdefine X_ X0
%xdefine X0 X1
%xdefine X1 X2
%xdefine X2 X3
%xdefine X3 X_
%endmacro

; ROTATE_ARGS
; Rotate values of symbols a...h
%macro ROTATE_ARGS 0
%xdefine TMP_ h
%xdefine h g
%xdefine g f
%xdefine f e
%xdefine e d
%xdefine d c
%xdefine c b
%xdefine b a
%xdefine a TMP_
%endmacro

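;; Message schedule implemented by FOUR_ROUNDS_AND_SCHED (FIPS 180-4):
;;   s0(x) = (x ror 7) ^ (x ror 18) ^ (x >> 3)
;;   s1(x) = (x ror 17) ^ (x ror 19) ^ (x >> 10)
;;   W[t]  = W[t-16] + s0(W[t-15]) + W[t-7] + s1(W[t-2])
;; X0..X3 hold the 16 most recent schedule words; each invocation of the
;; macro executes four rounds and computes the next four W values.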
%macro FOUR_ROUNDS_AND_SCHED 0
        ;; compute s0 four at a time and s1 two at a time
        ;; compute W[-16] + W[-7] 4 at a time
        movdqa  XTMP0, X3
        mov     y0, e           ; y0 = e
        ror     y0, (25-11)     ; y0 = e >> (25-11)
        mov     y1, a           ; y1 = a
        palignr XTMP0, X2, 4    ; XTMP0 = W[-7]
        ror     y1, (22-13)     ; y1 = a >> (22-13)
        xor     y0, e           ; y0 = e ^ (e >> (25-11))
        mov     y2, f           ; y2 = f
        ror     y0, (11-6)      ; y0 = (e >> (11-6)) ^ (e >> (25-6))
        movdqa  XTMP1, X1
        xor     y1, a           ; y1 = a ^ (a >> (22-13))
        xor     y2, g           ; y2 = f^g
        paddd   XTMP0, X0       ; XTMP0 = W[-7] + W[-16]
        xor     y0, e           ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and     y2, e           ; y2 = (f^g)&e
        ror     y1, (13-2)      ; y1 = (a >> (13-2)) ^ (a >> (22-2))
        ;; compute s0
        palignr XTMP1, X0, 4    ; XTMP1 = W[-15]
        xor     y1, a           ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        ror     y0, 6           ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        xor     y2, g           ; y2 = CH = ((f^g)&e)^g
        movdqa  XTMP2, XTMP1    ; XTMP2 = W[-15]
        ror     y1, 2           ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        add     y2, y0          ; y2 = S1 + CH
        add     y2, [rsp + _XFER]       ; y2 = k + w + S1 + CH
        movdqa  XTMP3, XTMP1    ; XTMP3 = W[-15]
        mov     y0, a           ; y0 = a
        add     h, y2           ; h = h + S1 + CH + k + w
        mov     y2, a           ; y2 = a
        pslld   XTMP1, (32-7)   ;
        or      y0, c           ; y0 = a|c
        add     d, h            ; d = d + h + S1 + CH + k + w
        and     y2, c           ; y2 = a&c
        psrld   XTMP2, 7        ;
        and     y0, b           ; y0 = (a|c)&b
        add     h, y1           ; h = h + S1 + CH + k + w + S0
        por     XTMP1, XTMP2    ; XTMP1 = W[-15] ror 7
        or      y0, y2          ; y0 = MAJ = ((a|c)&b)|(a&c)
        add     h, y0           ; h = h + S1 + CH + k + w + S0 + MAJ

        ROTATE_ARGS
        movdqa  XTMP2, XTMP3    ; XTMP2 = W[-15]
        mov     y0, e           ; y0 = e
        mov     y1, a           ; y1 = a
        movdqa  XTMP4, XTMP3    ; XTMP4 = W[-15]
        ror     y0, (25-11)     ; y0 = e >> (25-11)
        xor     y0, e           ; y0 = e ^ (e >> (25-11))
        mov     y2, f           ; y2 = f
        ror     y1, (22-13)     ; y1 = a >> (22-13)
        pslld   XTMP3, (32-18)  ;
        xor     y1, a           ; y1 = a ^ (a >> (22-13))
        ror     y0, (11-6)      ; y0 = (e >> (11-6)) ^ (e >> (25-6))
        xor     y2, g           ; y2 = f^g
        psrld   XTMP2, 18       ;
        ror     y1, (13-2)      ; y1 = (a >> (13-2)) ^ (a >> (22-2))
        xor     y0, e           ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and     y2, e           ; y2 = (f^g)&e
        ror     y0, 6           ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        pxor    XTMP1, XTMP3
        xor     y1, a           ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        xor     y2, g           ; y2 = CH = ((f^g)&e)^g
        psrld   XTMP4, 3        ; XTMP4 = W[-15] >> 3
        add     y2, y0          ; y2 = S1 + CH
        add     y2, [rsp + (1*4 + _XFER)]       ; y2 = k + w + S1 + CH
        ror     y1, 2           ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        pxor    XTMP1, XTMP2    ; XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
        mov     y0, a           ; y0 = a
        add     h, y2           ; h = h + S1 + CH + k + w
        mov     y2, a           ; y2 = a
        pxor    XTMP1, XTMP4    ; XTMP1 = s0
        or      y0, c           ; y0 = a|c
        add     d, h            ; d = d + h + S1 + CH + k + w
        and     y2, c           ; y2 = a&c
        ;; compute low s1
        pshufd  XTMP2, X3, 11111010B    ; XTMP2 = W[-2] {BBAA}
        and     y0, b           ; y0 = (a|c)&b
        add     h, y1           ; h = h + S1 + CH + k + w + S0
        paddd   XTMP0, XTMP1    ; XTMP0 = W[-16] + W[-7] + s0
        or      y0, y2          ; y0 = MAJ = ((a|c)&b)|(a&c)
        add     h, y0           ; h = h + S1 + CH + k + w + S0 + MAJ

        ROTATE_ARGS
        movdqa  XTMP3, XTMP2    ; XTMP3 = W[-2] {BBAA}
        mov     y0, e           ; y0 = e
        mov     y1, a           ; y1 = a
        ror     y0, (25-11)     ; y0 = e >> (25-11)
        movdqa  XTMP4, XTMP2    ; XTMP4 = W[-2] {BBAA}
        xor     y0, e           ; y0 = e ^ (e >> (25-11))
        ror     y1, (22-13)     ; y1 = a >> (22-13)
        mov     y2, f           ; y2 = f
        xor     y1, a           ; y1 = a ^ (a >> (22-13))
        ror     y0, (11-6)      ; y0 = (e >> (11-6)) ^ (e >> (25-6))
        psrlq   XTMP2, 17       ; XTMP2 = W[-2] ror 17 {xBxA}
        xor     y2, g           ; y2 = f^g
        psrlq   XTMP3, 19       ; XTMP3 = W[-2] ror 19 {xBxA}
        xor     y0, e           ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and     y2, e           ; y2 = (f^g)&e
        psrld   XTMP4, 10       ; XTMP4 = W[-2] >> 10 {BBAA}
        ror     y1, (13-2)      ; y1 = (a >> (13-2)) ^ (a >> (22-2))
        xor     y1, a           ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        xor     y2, g           ; y2 = CH = ((f^g)&e)^g
        ror     y0, 6           ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        pxor    XTMP2, XTMP3
        add     y2, y0          ; y2 = S1 + CH
        ror     y1, 2           ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        add     y2, [rsp + (2*4 + _XFER)]       ; y2 = k + w + S1 + CH
        pxor    XTMP4, XTMP2    ; XTMP4 = s1 {xBxA}
        mov     y0, a           ; y0 = a
        add     h, y2           ; h = h + S1 + CH + k + w
        mov     y2, a           ; y2 = a
        pshufb  XTMP4, SHUF_00BA        ; XTMP4 = s1 {00BA}
        or      y0, c           ; y0 = a|c
        add     d, h            ; d = d + h + S1 + CH + k + w
        and     y2, c           ; y2 = a&c
        paddd   XTMP0, XTMP4    ; XTMP0 = {..., ..., W[1], W[0]}
        and     y0, b           ; y0 = (a|c)&b
        add     h, y1           ; h = h + S1 + CH + k + w + S0
        ;; compute high s1
        pshufd  XTMP2, XTMP0, 01010000B ; XTMP2 = W[-2] {DDCC}
        or      y0, y2          ; y0 = MAJ = ((a|c)&b)|(a&c)
        add     h, y0           ; h = h + S1 + CH + k + w + S0 + MAJ

        ROTATE_ARGS
        movdqa  XTMP3, XTMP2    ; XTMP3 = W[-2] {DDCC}
        mov     y0, e           ; y0 = e
        ror     y0, (25-11)     ; y0 = e >> (25-11)
        mov     y1, a           ; y1 = a
        movdqa  X0, XTMP2       ; X0 = W[-2] {DDCC}
        ror     y1, (22-13)     ; y1 = a >> (22-13)
        xor     y0, e           ; y0 = e ^ (e >> (25-11))
        mov     y2, f           ; y2 = f
        ror     y0, (11-6)      ; y0 = (e >> (11-6)) ^ (e >> (25-6))
        psrlq   XTMP2, 17       ; XTMP2 = W[-2] ror 17 {xDxC}
        xor     y1, a           ; y1 = a ^ (a >> (22-13))
        xor     y2, g           ; y2 = f^g
        psrlq   XTMP3, 19       ; XTMP3 = W[-2] ror 19 {xDxC}
        xor     y0, e           ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and     y2, e           ; y2 = (f^g)&e
        ror     y1, (13-2)      ; y1 = (a >> (13-2)) ^ (a >> (22-2))
        psrld   X0, 10          ; X0 = W[-2] >> 10 {DDCC}
        xor     y1, a           ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        ror     y0, 6           ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        xor     y2, g           ; y2 = CH = ((f^g)&e)^g
        pxor    XTMP2, XTMP3    ;
        ror     y1, 2           ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        add     y2, y0          ; y2 = S1 + CH
        add     y2, [rsp + (3*4 + _XFER)]       ; y2 = k + w + S1 + CH
        pxor    X0, XTMP2       ; X0 = s1 {xDxC}
        mov     y0, a           ; y0 = a
        add     h, y2           ; h = h + S1 + CH + k + w
        mov     y2, a           ; y2 = a
        pshufb  X0, SHUF_DC00   ; X0 = s1 {DC00}
        or      y0, c           ; y0 = a|c
        add     d, h            ; d = d + h + S1 + CH + k + w
        and     y2, c           ; y2 = a&c
        paddd   X0, XTMP0       ; X0 = {W[3], W[2], W[1], W[0]}
        and     y0, b           ; y0 = (a|c)&b
        add     h, y1           ; h = h + S1 + CH + k + w + S0
        or      y0, y2          ; y0 = MAJ = ((a|c)&b)|(a&c)
        add     h, y0           ; h = h + S1 + CH + k + w + S0 + MAJ

        ROTATE_ARGS
        rotate_Xs
%endmacro

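;; One SHA-256 round as computed by DO_ROUND and by the scalar half of
;; FOUR_ROUNDS_AND_SCHED above, with W[t] + K[t] already staged in _XFER:
;;   S1  = (e ror 6) ^ (e ror 11) ^ (e ror 25)
;;   S0  = (a ror 2) ^ (a ror 13) ^ (a ror 22)
;;   CH  = (e & f) ^ (~e & g)          ; computed as ((f ^ g) & e) ^ g
;;   MAJ = (a & b) ^ (a & c) ^ (b & c) ; computed as ((a | c) & b) | (a & c)
;;   T1  = h + S1 + CH + K[t] + W[t]
;;   T2  = S0 + MAJ
;;   d += T1, h = T1 + T2; ROTATE_ARGS then renames the old h to a and
;;   the old d to e for the next round.
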
;; input is [rsp + _XFER + %1 * 4]
%macro DO_ROUND 1
        mov     y0, e           ; y0 = e
        ror     y0, (25-11)     ; y0 = e >> (25-11)
        mov     y1, a           ; y1 = a
        xor     y0, e           ; y0 = e ^ (e >> (25-11))
        ror     y1, (22-13)     ; y1 = a >> (22-13)
        mov     y2, f           ; y2 = f
        xor     y1, a           ; y1 = a ^ (a >> (22-13))
        ror     y0, (11-6)      ; y0 = (e >> (11-6)) ^ (e >> (25-6))
        xor     y2, g           ; y2 = f^g
        xor     y0, e           ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        ror     y1, (13-2)      ; y1 = (a >> (13-2)) ^ (a >> (22-2))
        and     y2, e           ; y2 = (f^g)&e
        xor     y1, a           ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        ror     y0, 6           ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        xor     y2, g           ; y2 = CH = ((f^g)&e)^g
        add     y2, y0          ; y2 = S1 + CH
        ror     y1, 2           ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        %xdefine offset (%1 * 4 + _XFER)
        add     y2, [rsp + offset]      ; y2 = k + w + S1 + CH
        mov     y0, a           ; y0 = a
        add     h, y2           ; h = h + S1 + CH + k + w
        mov     y2, a           ; y2 = a
        or      y0, c           ; y0 = a|c
        add     d, h            ; d = d + h + S1 + CH + k + w
        and     y2, c           ; y2 = a&c
        and     y0, b           ; y0 = (a|c)&b
        add     h, y1           ; h = h + S1 + CH + k + w + S0
        or      y0, y2          ; y0 = MAJ = ((a|c)&b)|(a&c)
        add     h, y0           ; h = h + S1 + CH + k + w + S0 + MAJ
        ROTATE_ARGS
%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; void sha256_opt_x1(SHA256_MB_ARGS_Xn *args, uint32_t size_in_blocks);
; arg 0 : MGR : pointer to args (only 4 of the 16 lanes used)
; arg 1 : NBLK : size (in blocks) ;; assumed to be >= 1
; invisible arg 2 : IDX : lane on which to hash
; invisible arg 3 : NLANX4 : max lanes*4 for this arch (the digest location is derived from it)
;       (sse/avx is 4, avx2 is 8, avx512 is 16)
;
; Clobbers registers: all general regs, xmm0-xmm12
;       {rbx, rdx, rbp, (rdi, rsi), r12~r15 are saved on stack}
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
section .text
mk_global sha256_opt_x1, function, internal
sha256_opt_x1:
        endbranch
        sub     rsp, STACK_SIZE
        mov     [rsp + _GPR_SAVE + 8*0], rbx
        mov     [rsp + _GPR_SAVE + 8*1], rbp
%ifidn __OUTPUT_FORMAT__, win64
        mov     [rsp + _GPR_SAVE + 8*2], rdi
        mov     [rsp + _GPR_SAVE + 8*3], rsi
        ; caller has already stored XMM6~10
%endif
        mov     [rsp + _GPR_SAVE + 8*4], r12
        mov     [rsp + _GPR_SAVE + 8*5], r13
        mov     [rsp + _GPR_SAVE + 8*6], r14
        mov     [rsp + _GPR_SAVE + 8*7], r15
        mov     [rsp + _GPR_SAVE + 8*8], rdx

        shl     NBLK, 6         ; convert to bytes
        jz      done_hash

        ; detach idx from nlanx4
        mov     IDX, NLANX4
        shr     NLANX4, 8
        and     IDX, 0xff

        mov     [rsp + _TMGR], MGR
        ;; Load input pointer
        mov     INP, [MGR + _data_ptr + IDX*8]
        mov     [rsp + _INP], INP
        ;; nblk is used to indicate data end
        add     NBLK, INP
        mov     [rsp + _INP_END], NBLK  ; pointer to end of data


        mov     TMGR, [rsp + _TMGR]
        ;; load initial digest
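        ;; Digest words are stored interleaved across lanes: word j of lane
        ;; IDX lives at [MGR + 4*IDX + j*NLANX4], NLANX4 being 4 * (max lanes).
        ;; The loads below therefore fetch words 0..7 of this lane into a..h.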
        lea     TMP, [TMGR + 4*IDX]
        mov     a, [TMP + 0*NLANX4]
        mov     b, [TMP + 1*NLANX4]
        mov     c, [TMP + 2*NLANX4]
        lea     TMP, [TMP + 2*NLANX4]   ; MGR + 4*IDX + 2*NLANX4
        mov     d, [TMP + 1*NLANX4]
        mov     e, [TMP + 2*NLANX4]
        mov     g, [TMP + 4*NLANX4]
        lea     TMP, [TMP + 1*NLANX4]   ; MGR + 4*IDX + 3*NLANX4
        mov     f, [TMP + 2*NLANX4]
        mov     h, [TMP + 4*NLANX4]

        movdqa  BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK]
        movdqa  SHUF_00BA, [_SHUF_00BA]
        movdqa  SHUF_DC00, [_SHUF_DC00]

        mov     INP, [rsp + _INP]
loop0:
        lea     TBL, [K256]

        ;; byte swap first 16 dwords
        COPY_XMM_AND_BSWAP      X0, [INP + 0*16], BYTE_FLIP_MASK
        COPY_XMM_AND_BSWAP      X1, [INP + 1*16], BYTE_FLIP_MASK
        COPY_XMM_AND_BSWAP      X2, [INP + 2*16], BYTE_FLIP_MASK
        COPY_XMM_AND_BSWAP      X3, [INP + 3*16], BYTE_FLIP_MASK

        mov     [rsp + _INP], INP

        ;; schedule 48 input dwords, by doing 3 iterations of 16 rounds each
        mov     SRND, 3

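        ;; loop1: rounds 0..47.  Each iteration stages K[t..t+3] + W[t..t+3]
        ;; in _XFER four times and runs FOUR_ROUNDS_AND_SCHED (16 rounds per
        ;; iteration), scheduling the next 16 message words on the fly.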
loop1:
        movdqa  XFER, [TBL]
        paddd   XFER, X0
        movdqa  [rsp + _XFER], XFER
        FOUR_ROUNDS_AND_SCHED

        movdqa  XFER, [TBL + 1*16]
        paddd   XFER, X0
        movdqa  [rsp + _XFER], XFER
        FOUR_ROUNDS_AND_SCHED

        movdqa  XFER, [TBL + 2*16]
        paddd   XFER, X0
        movdqa  [rsp + _XFER], XFER
        FOUR_ROUNDS_AND_SCHED

        movdqa  XFER, [TBL + 3*16]
        paddd   XFER, X0
        movdqa  [rsp + _XFER], XFER
        add     TBL, 4*16
        FOUR_ROUNDS_AND_SCHED

        sub     SRND, 1
        jne     loop1

        mov     SRND, 2
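        ;; loop2: rounds 48..63.  The remaining W[48..63] are already in
        ;; X0..X3, so each of the two iterations just adds the round constants
        ;; and runs DO_ROUND eight times.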
loop2:
        paddd   X0, [TBL]
        movdqa  [rsp + _XFER], X0
        DO_ROUND 0
        DO_ROUND 1
        DO_ROUND 2
        DO_ROUND 3
        paddd   X1, [TBL + 1*16]
        movdqa  [rsp + _XFER], X1
        add     TBL, 2*16
        DO_ROUND 0
        DO_ROUND 1
        DO_ROUND 2
        DO_ROUND 3

        movdqa  X0, X2
        movdqa  X1, X3

        sub     SRND, 1
        jne     loop2

        ; write out digests
        mov     TMGR, [rsp + _TMGR]
        lea     TMP, [TMGR + 4*IDX]
        addm    a, [TMP + 0*NLANX4]
        addm    b, [TMP + 1*NLANX4]
        addm    c, [TMP + 2*NLANX4]
        lea     TMP, [TMP + 2*NLANX4]   ; MGR + 4*IDX + 2*NLANX4
        addm    d, [TMP + 1*NLANX4]
        addm    e, [TMP + 2*NLANX4]
        addm    g, [TMP + 4*NLANX4]
        lea     TMP, [TMP + 1*NLANX4]   ; MGR + 4*IDX + 3*NLANX4
        addm    f, [TMP + 2*NLANX4]
        addm    h, [TMP + 4*NLANX4]

        mov     INP, [rsp + _INP]
        add     INP, 64
        cmp     INP, [rsp + _INP_END]
        jne     loop0

done_hash:
        mov     MGR, [rsp + _TMGR]

        mov     rdx, [rsp + _GPR_SAVE + 8*8]
        mov     r15, [rsp + _GPR_SAVE + 8*7]
        mov     r14, [rsp + _GPR_SAVE + 8*6]
        mov     r13, [rsp + _GPR_SAVE + 8*5]
        mov     r12, [rsp + _GPR_SAVE + 8*4]
%ifidn __OUTPUT_FORMAT__, win64
        mov     rsi, [rsp + _GPR_SAVE + 8*3]
        mov     rdi, [rsp + _GPR_SAVE + 8*2]
%endif
        mov     rbp, [rsp + _GPR_SAVE + 8*1]
        mov     rbx, [rsp + _GPR_SAVE + 8*0]
        add     rsp, STACK_SIZE

        ret

section .data
align 64
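; K256: the 64 SHA-256 round constants from FIPS 180-4 (the first 32 bits of
; the fractional parts of the cube roots of the first 64 primes)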
K256:
        DD 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        DD 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        DD 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        DD 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        DD 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        DD 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        DD 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        DD 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        DD 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        DD 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        DD 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        DD 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        DD 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        DD 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        DD 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        DD 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

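; pshufb mask that byte-swaps each dword: SHA-256 message words are
; big-endian, so they are flipped when loaded on little-endian x86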
PSHUFFLE_BYTE_FLIP_MASK:
        DQ 0x0405060700010203, 0x0c0d0e0f08090a0b

; shuffle xBxA -> 00BA
_SHUF_00BA:
        DQ 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF

; shuffle xDxC -> DC00
_SHUF_DC00:
        DQ 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100