;;
;; Copyright (c) 2012-2018, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;;     * Redistributions of source code must retain the above copyright notice,
;;       this list of conditions and the following disclaimer.
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;     * Neither the name of Intel Corporation nor the names of its contributors
;;       may be used to endorse or promote products derived from this software
;;       without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;

; This code schedules 1 block at a time, with 4 lanes per block
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%include "os.asm"

section .data
default rel
align 64
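; K256: the 64 SHA-256 round constants K[0..63] (FIPS 180-4, section 4.2.2)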
K256:
        dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203
        dq 0x0405060700010203, 0x0c0d0e0f08090a0b

; shuffle xBxA -> 00BA
_SHUF_00BA: ;ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100
        dq 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF

; shuffle xDxC -> DC00
_SHUF_DC00: ;ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
        dq 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100
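; Note: vpshufb zeroes a destination byte whenever bit 7 of the corresponding
; mask byte is set, so the 0xFF halves of _SHUF_00BA/_SHUF_DC00 clear the
; unused lanes while the other half places the two valid s1 dwords.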

section .text

%define VMOVDQ vmovdqu ;; assume buffers not aligned

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros

%macro MY_ROR 2
        shld %1,%1,(32-(%2))
%endm
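; MY_ROR emulates a 32-bit rotate right by %2: shld with identical destination
; and source rotates the register left by (32-%2), which gives the same result.
; (Presumably preferred over a plain ror for scheduling on the target uarch.)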

; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
; Load xmm with mem and byte swap each dword
%macro COPY_XMM_AND_BSWAP 3
        VMOVDQ %1, %2
        vpshufb %1, %1, %3
%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%define X0 xmm4
%define X1 xmm5
%define X2 xmm6
%define X3 xmm7

%define XTMP0 xmm0
%define XTMP1 xmm1
%define XTMP2 xmm2
%define XTMP3 xmm3
%define XTMP4 xmm8
%define XFER xmm9
%define XTMP5 xmm11

%define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA
%define SHUF_DC00 xmm12 ; shuffle xDxC -> DC00
%define BYTE_FLIP_MASK xmm13

%ifdef LINUX
%define CTX rsi ; 2nd arg
%define INP rdi ; 1st arg

%define SRND rdi ; clobbers INP
%define c ecx
%define d r8d
%define e edx
%else
%define CTX rdx ; 2nd arg
%define INP rcx ; 1st arg

%define SRND rcx ; clobbers INP
%define c edi
%define d esi
%define e r8d

%endif
%define TBL rbp
%define a eax
%define b ebx

%define f r9d
%define g r10d
%define h r11d

%define y0 r13d
%define y1 r14d
%define y2 r15d

struc STACK
%ifndef LINUX
_XMM_SAVE:      reso 8          ; room to save xmm6-xmm13
%endif
_XFER:          reso 1
endstruc
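; The stack frame holds the non-volatile xmm save area (Windows only) plus a
; single 16-byte _XFER slot; each round fetches its precomputed K[t]+W[t]
; dword from [rsp + _XFER + n*4].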

%ifndef FUNC
%define FUNC sha256_block_avx
%endif

; rotate_Xs
; Rotate values of symbols X0...X3
%macro rotate_Xs 0
%xdefine X_ X0
%xdefine X0 X1
%xdefine X1 X2
%xdefine X2 X3
%xdefine X3 X_
%endm

; ROTATE_ARGS
; Rotate values of symbols a...h
%macro ROTATE_ARGS 0
%xdefine TMP_ h
%xdefine h g
%xdefine g f
%xdefine f e
%xdefine e d
%xdefine d c
%xdefine c b
%xdefine b a
%xdefine a TMP_
%endm

%macro FOUR_ROUNDS_AND_SCHED 0
        ;; compute s0 four at a time and s1 two at a time
        ;; compute W[-16] + W[-7] 4 at a time
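        ;; Message schedule (FIPS 180-4):
        ;;   s0(x) = ror(x,7)  ^ ror(x,18) ^ (x >> 3)
        ;;   s1(x) = ror(x,17) ^ ror(x,19) ^ (x >> 10)
        ;;   W[t]  = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
        ;; Each invocation produces four new W dwords in X0 while executing
        ;; four scalar rounds interleaved with the vector work.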
        ;vmovdqa XTMP0, X3
        mov y0, e                       ; y0 = e
        MY_ROR y0, (25-11)              ; y0 = e >> (25-11)
        mov y1, a                       ; y1 = a
        vpalignr XTMP0, X3, X2, 4       ; XTMP0 = W[-7]
        MY_ROR y1, (22-13)              ; y1 = a >> (22-13)
        xor y0, e                       ; y0 = e ^ (e >> (25-11))
        mov y2, f                       ; y2 = f
        MY_ROR y0, (11-6)               ; y0 = (e >> (11-6)) ^ (e >> (25-6))
        ;vmovdqa XTMP1, X1
        xor y1, a                       ; y1 = a ^ (a >> (22-13))
        xor y2, g                       ; y2 = f^g
        vpaddd XTMP0, XTMP0, X0         ; XTMP0 = W[-7] + W[-16]
        xor y0, e                       ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and y2, e                       ; y2 = (f^g)&e
        MY_ROR y1, (13-2)               ; y1 = (a >> (13-2)) ^ (a >> (22-2))
        ;; compute s0
        vpalignr XTMP1, X1, X0, 4       ; XTMP1 = W[-15]
        xor y1, a                       ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        MY_ROR y0, 6                    ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        xor y2, g                       ; y2 = CH = ((f^g)&e)^g

        MY_ROR y1, 2                    ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        add y2, y0                      ; y2 = S1 + CH
        add y2, [rsp + _XFER + 0*4]     ; y2 = k + w + S1 + CH

        mov y0, a                       ; y0 = a
        add h, y2                       ; h = h + S1 + CH + k + w
        mov y2, a                       ; y2 = a

        vpsrld XTMP2, XTMP1, 7

        or y0, c                        ; y0 = a|c
        add d, h                        ; d = d + h + S1 + CH + k + w
        and y2, c                       ; y2 = a&c

        vpslld XTMP3, XTMP1, (32-7)

        and y0, b                       ; y0 = (a|c)&b
        add h, y1                       ; h = h + S1 + CH + k + w + S0

        vpor XTMP3, XTMP3, XTMP2        ; XTMP3 = W[-15] MY_ROR 7

        or y0, y2                       ; y0 = MAJ = ((a|c)&b)|(a&c)
        add h, y0                       ; h = h + S1 + CH + k + w + S0 + MAJ

        ROTATE_ARGS

        mov y0, e                       ; y0 = e
        mov y1, a                       ; y1 = a


        MY_ROR y0, (25-11)              ; y0 = e >> (25-11)
        xor y0, e                       ; y0 = e ^ (e >> (25-11))
        mov y2, f                       ; y2 = f
        MY_ROR y1, (22-13)              ; y1 = a >> (22-13)

        vpsrld XTMP2, XTMP1, 18

        xor y1, a                       ; y1 = a ^ (a >> (22-13))
        MY_ROR y0, (11-6)               ; y0 = (e >> (11-6)) ^ (e >> (25-6))
        xor y2, g                       ; y2 = f^g

        vpsrld XTMP4, XTMP1, 3          ; XTMP4 = W[-15] >> 3

        MY_ROR y1, (13-2)               ; y1 = (a >> (13-2)) ^ (a >> (22-2))
        xor y0, e                       ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and y2, e                       ; y2 = (f^g)&e
        MY_ROR y0, 6                    ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)

        vpslld XTMP1, XTMP1, (32-18)

        xor y1, a                       ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        xor y2, g                       ; y2 = CH = ((f^g)&e)^g

        vpxor XTMP3, XTMP3, XTMP1

        add y2, y0                      ; y2 = S1 + CH
        add y2, [rsp + _XFER + 1*4]     ; y2 = k + w + S1 + CH
        MY_ROR y1, 2                    ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)

        vpxor XTMP3, XTMP3, XTMP2       ; XTMP3 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18

        mov y0, a                       ; y0 = a
        add h, y2                       ; h = h + S1 + CH + k + w
        mov y2, a                       ; y2 = a

        vpxor XTMP1, XTMP3, XTMP4       ; XTMP1 = s0

        or y0, c                        ; y0 = a|c
        add d, h                        ; d = d + h + S1 + CH + k + w
        and y2, c                       ; y2 = a&c
        ;; compute low s1
        vpshufd XTMP2, X3, 11111010b    ; XTMP2 = W[-2] {BBAA}
        and y0, b                       ; y0 = (a|c)&b
        add h, y1                       ; h = h + S1 + CH + k + w + S0
        vpaddd XTMP0, XTMP0, XTMP1      ; XTMP0 = W[-16] + W[-7] + s0
        or y0, y2                       ; y0 = MAJ = ((a|c)&b)|(a&c)
        add h, y0                       ; h = h + S1 + CH + k + w + S0 + MAJ

        ROTATE_ARGS
        ;vmovdqa XTMP3, XTMP2           ; XTMP3 = W[-2] {BBAA}

        mov y0, e                       ; y0 = e
        mov y1, a                       ; y1 = a
        MY_ROR y0, (25-11)              ; y0 = e >> (25-11)

        ;vmovdqa XTMP4, XTMP2           ; XTMP4 = W[-2] {BBAA}

        xor y0, e                       ; y0 = e ^ (e >> (25-11))
        MY_ROR y1, (22-13)              ; y1 = a >> (22-13)
        mov y2, f                       ; y2 = f
        xor y1, a                       ; y1 = a ^ (a >> (22-13))
        MY_ROR y0, (11-6)               ; y0 = (e >> (11-6)) ^ (e >> (25-6))

        vpsrld XTMP4, XTMP2, 10         ; XTMP4 = W[-2] >> 10 {BBAA}

        xor y2, g                       ; y2 = f^g

        vpsrlq XTMP3, XTMP2, 19         ; XTMP3 = W[-2] MY_ROR 19 {xBxA}

        xor y0, e                       ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and y2, e                       ; y2 = (f^g)&e

        vpsrlq XTMP2, XTMP2, 17         ; XTMP2 = W[-2] MY_ROR 17 {xBxA}

        MY_ROR y1, (13-2)               ; y1 = (a >> (13-2)) ^ (a >> (22-2))
        xor y1, a                       ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        xor y2, g                       ; y2 = CH = ((f^g)&e)^g
        MY_ROR y0, 6                    ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        vpxor XTMP2, XTMP2, XTMP3
        add y2, y0                      ; y2 = S1 + CH
        MY_ROR y1, 2                    ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        add y2, [rsp + _XFER + 2*4]     ; y2 = k + w + S1 + CH
        vpxor XTMP4, XTMP4, XTMP2       ; XTMP4 = s1 {xBxA}
        mov y0, a                       ; y0 = a
        add h, y2                       ; h = h + S1 + CH + k + w
        mov y2, a                       ; y2 = a
        vpshufb XTMP4, XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA}
        or y0, c                        ; y0 = a|c
        add d, h                        ; d = d + h + S1 + CH + k + w
        and y2, c                       ; y2 = a&c
        vpaddd XTMP0, XTMP0, XTMP4      ; XTMP0 = {..., ..., W[1], W[0]}
        and y0, b                       ; y0 = (a|c)&b
        add h, y1                       ; h = h + S1 + CH + k + w + S0
        ;; compute high s1
        vpshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC}
        or y0, y2                       ; y0 = MAJ = ((a|c)&b)|(a&c)
        add h, y0                       ; h = h + S1 + CH + k + w + S0 + MAJ

        ROTATE_ARGS
        ;vmovdqa XTMP3, XTMP2           ; XTMP3 = W[-2] {DDCC}
        mov y0, e                       ; y0 = e
        MY_ROR y0, (25-11)              ; y0 = e >> (25-11)
        mov y1, a                       ; y1 = a
        ;vmovdqa XTMP5, XTMP2           ; XTMP5 = W[-2] {DDCC}
        MY_ROR y1, (22-13)              ; y1 = a >> (22-13)
        xor y0, e                       ; y0 = e ^ (e >> (25-11))
        mov y2, f                       ; y2 = f
        MY_ROR y0, (11-6)               ; y0 = (e >> (11-6)) ^ (e >> (25-6))

        vpsrld XTMP5, XTMP2, 10         ; XTMP5 = W[-2] >> 10 {DDCC}

        xor y1, a                       ; y1 = a ^ (a >> (22-13))
        xor y2, g                       ; y2 = f^g

        vpsrlq XTMP3, XTMP2, 19         ; XTMP3 = W[-2] MY_ROR 19 {xDxC}

        xor y0, e                       ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and y2, e                       ; y2 = (f^g)&e
        MY_ROR y1, (13-2)               ; y1 = (a >> (13-2)) ^ (a >> (22-2))

        vpsrlq XTMP2, XTMP2, 17         ; XTMP2 = W[-2] MY_ROR 17 {xDxC}

        xor y1, a                       ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        MY_ROR y0, 6                    ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        xor y2, g                       ; y2 = CH = ((f^g)&e)^g

        vpxor XTMP2, XTMP2, XTMP3

        MY_ROR y1, 2                    ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        add y2, y0                      ; y2 = S1 + CH
        add y2, [rsp + _XFER + 3*4]     ; y2 = k + w + S1 + CH
        vpxor XTMP5, XTMP5, XTMP2       ; XTMP5 = s1 {xDxC}
        mov y0, a                       ; y0 = a
        add h, y2                       ; h = h + S1 + CH + k + w
        mov y2, a                       ; y2 = a
        vpshufb XTMP5, XTMP5, SHUF_DC00 ; XTMP5 = s1 {DC00}
        or y0, c                        ; y0 = a|c
        add d, h                        ; d = d + h + S1 + CH + k + w
        and y2, c                       ; y2 = a&c
        vpaddd X0, XTMP5, XTMP0         ; X0 = {W[3], W[2], W[1], W[0]}
        and y0, b                       ; y0 = (a|c)&b
        add h, y1                       ; h = h + S1 + CH + k + w + S0
        or y0, y2                       ; y0 = MAJ = ((a|c)&b)|(a&c)
        add h, y0                       ; h = h + S1 + CH + k + w + S0 + MAJ

        ROTATE_ARGS
        rotate_Xs
%endm
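; Per-round computation (FIPS 180-4), implemented by the scalar code in both
; FOUR_ROUNDS_AND_SCHED above and DO_ROUND below:
;   S1 = ror(e,6) ^ ror(e,11) ^ ror(e,25)    S0  = ror(a,2) ^ ror(a,13) ^ ror(a,22)
;   CH = (e & f) ^ (~e & g)                  MAJ = (a & b) ^ (a & c) ^ (b & c)
;   T1 = h + S1 + CH + K[t] + W[t]           T2  = S0 + MAJ
;   d += T1 ; h = T1 + T2   (then ROTATE_ARGS renames the a..h symbols)
; The code uses the equivalent boolean forms CH = ((f^g)&e)^g and
; MAJ = ((a|c)&b)|(a&c).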

;; input is [rsp + _XFER + %1 * 4]
%macro DO_ROUND 1
        mov y0, e                       ; y0 = e
        MY_ROR y0, (25-11)              ; y0 = e >> (25-11)
        mov y1, a                       ; y1 = a
        xor y0, e                       ; y0 = e ^ (e >> (25-11))
        MY_ROR y1, (22-13)              ; y1 = a >> (22-13)
        mov y2, f                       ; y2 = f
        xor y1, a                       ; y1 = a ^ (a >> (22-13))
        MY_ROR y0, (11-6)               ; y0 = (e >> (11-6)) ^ (e >> (25-6))
        xor y2, g                       ; y2 = f^g
        xor y0, e                       ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        MY_ROR y1, (13-2)               ; y1 = (a >> (13-2)) ^ (a >> (22-2))
        and y2, e                       ; y2 = (f^g)&e
        xor y1, a                       ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        MY_ROR y0, 6                    ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        xor y2, g                       ; y2 = CH = ((f^g)&e)^g
        add y2, y0                      ; y2 = S1 + CH
        MY_ROR y1, 2                    ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        add y2, [rsp + _XFER + %1 * 4]  ; y2 = k + w + S1 + CH
        mov y0, a                       ; y0 = a
        add h, y2                       ; h = h + S1 + CH + k + w
        mov y2, a                       ; y2 = a
        or y0, c                        ; y0 = a|c
        add d, h                        ; d = d + h + S1 + CH + k + w
        and y2, c                       ; y2 = a&c
        and y0, b                       ; y0 = (a|c)&b
        add h, y1                       ; h = h + S1 + CH + k + w + S0
        or y0, y2                       ; y0 = MAJ = ((a|c)&b)|(a&c)
        add h, y0                       ; h = h + S1 + CH + k + w + S0 + MAJ
        ROTATE_ARGS
%endm

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; void FUNC(void *input_data, UINT32 digest[8], UINT64 num_blks)
;; arg 1 : pointer to input data
;; arg 2 : pointer to digest
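;;
;; Illustrative usage sketch (an assumption, not part of this file: called from
;; C with the prototype above; num_blks is not referenced by this one-block
;; routine, and the caller supplies a single, already padded 64-byte block):
;;
;;     UINT32 digest[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
;;                          0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };
;;     sha256_block_avx(block, digest, 1);   /* digest holds the SHA-256 IV */
;;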
section .text
MKGLOBAL(FUNC,function,)
align 32
FUNC:
        push rbx
%ifndef LINUX
        push rsi
        push rdi
%endif
        push rbp
        push r13
        push r14
        push r15

        sub rsp,STACK_size
%ifndef LINUX
        vmovdqa [rsp + _XMM_SAVE + 0*16],xmm6
        vmovdqa [rsp + _XMM_SAVE + 1*16],xmm7
        vmovdqa [rsp + _XMM_SAVE + 2*16],xmm8
        vmovdqa [rsp + _XMM_SAVE + 3*16],xmm9
        vmovdqa [rsp + _XMM_SAVE + 4*16],xmm10
        vmovdqa [rsp + _XMM_SAVE + 5*16],xmm11
        vmovdqa [rsp + _XMM_SAVE + 6*16],xmm12
        vmovdqa [rsp + _XMM_SAVE + 7*16],xmm13
%endif

        ;; load initial digest
        mov a, [4*0 + CTX]
        mov b, [4*1 + CTX]
        mov c, [4*2 + CTX]
        mov d, [4*3 + CTX]
        mov e, [4*4 + CTX]
        mov f, [4*5 + CTX]
        mov g, [4*6 + CTX]
        mov h, [4*7 + CTX]

        vmovdqa BYTE_FLIP_MASK, [rel PSHUFFLE_BYTE_FLIP_MASK]
        vmovdqa SHUF_00BA, [rel _SHUF_00BA]
        vmovdqa SHUF_DC00, [rel _SHUF_DC00]

        lea TBL,[rel K256]

        ;; byte swap first 16 dwords
        COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK
        COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK
        COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK
        COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK
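        ;; X0..X3 now hold the 16 message dwords W[0..15], byte-swapped into
        ;; native dword order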

        ;; schedule 48 input dwords, by doing 3 iterations of 16 rounds each
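        ;; loop1 covers rounds 0-47 and keeps extending the message schedule;
        ;; loop2 below covers rounds 48-63, which need no further scheduling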
        mov SRND, 3
align 16
loop1:
        vpaddd XFER, X0, [TBL + 0*16]
        vmovdqa [rsp + _XFER], XFER
        FOUR_ROUNDS_AND_SCHED

        vpaddd XFER, X0, [TBL + 1*16]
        vmovdqa [rsp + _XFER], XFER
        FOUR_ROUNDS_AND_SCHED

        vpaddd XFER, X0, [TBL + 2*16]
        vmovdqa [rsp + _XFER], XFER
        FOUR_ROUNDS_AND_SCHED

        vpaddd XFER, X0, [TBL + 3*16]
        vmovdqa [rsp + _XFER], XFER
        add TBL, 4*16
        FOUR_ROUNDS_AND_SCHED

        sub SRND, 1
        jne loop1

        mov SRND, 2
loop2:
        vpaddd XFER, X0, [TBL + 0*16]
        vmovdqa [rsp + _XFER], XFER
        DO_ROUND 0
        DO_ROUND 1
        DO_ROUND 2
        DO_ROUND 3

        vpaddd XFER, X1, [TBL + 1*16]
        vmovdqa [rsp + _XFER], XFER
        add TBL, 2*16
        DO_ROUND 0
        DO_ROUND 1
        DO_ROUND 2
        DO_ROUND 3

        vmovdqa X0, X2
        vmovdqa X1, X3

        sub SRND, 1
        jne loop2

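        ;; feed-forward: add the working variables back into the caller's digest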
        add [4*0 + CTX], a
        add [4*1 + CTX], b
        add [4*2 + CTX], c
        add [4*3 + CTX], d
        add [4*4 + CTX], e
        add [4*5 + CTX], f
        add [4*6 + CTX], g
        add [4*7 + CTX], h

done_hash:
%ifndef LINUX
        vmovdqa xmm6,[rsp + _XMM_SAVE + 0*16]
        vmovdqa xmm7,[rsp + _XMM_SAVE + 1*16]
        vmovdqa xmm8,[rsp + _XMM_SAVE + 2*16]
        vmovdqa xmm9,[rsp + _XMM_SAVE + 3*16]
        vmovdqa xmm10,[rsp + _XMM_SAVE + 4*16]
        vmovdqa xmm11,[rsp + _XMM_SAVE + 5*16]
        vmovdqa xmm12,[rsp + _XMM_SAVE + 6*16]
        vmovdqa xmm13,[rsp + _XMM_SAVE + 7*16]
%endif

        add rsp, STACK_size

        pop r15
        pop r14
        pop r13
        pop rbp
%ifndef LINUX
        pop rdi
        pop rsi
%endif
        pop rbx

        ret


%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif