;;
;; Copyright (c) 2012-2018, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;; * Redistributions of source code must retain the above copyright notice,
;; this list of conditions and the following disclaimer.
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;; * Neither the name of Intel Corporation nor the names of its contributors
;; may be used to endorse or promote products derived from this software
;; without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;

; This code schedules 1 block at a time, with 4 lanes per block
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%include "os.asm"

section .data
default rel
align 64
K256:
        dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203
        dq 0x0405060700010203, 0x0c0d0e0f08090a0b

; shuffle xBxA -> 00BA
_SHUF_00BA: ;ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100
        dq 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF

; shuffle xDxC -> DC00
_SHUF_DC00: ;ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
        dq 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100

section .text

%define VMOVDQ vmovdqu ;; assume buffers not aligned

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros

%macro MY_ROR 2
        shld %1, %1, (32-(%2))
%endm
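;; (With both operands being the same register, shld by (32-n) is a rotate
;; left by (32-n), i.e. a rotate right by n, so MY_ROR x, n behaves like
;; "ror x, n".)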

; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
; Load xmm with mem and byte swap each dword
%macro COPY_XMM_AND_BSWAP 3
        VMOVDQ  %1, %2
        vpshufb %1, %1, %3
%endmacro
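;; The byte_flip_mask argument is expected to be PSHUFFLE_BYTE_FLIP_MASK, which
;; reverses the bytes within each dword so the big-endian message words of the
;; input block become little-endian values in the xmm lanes.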

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%define X0 xmm4
%define X1 xmm5
%define X2 xmm6
%define X3 xmm7

%define XTMP0 xmm0
%define XTMP1 xmm1
%define XTMP2 xmm2
%define XTMP3 xmm3
%define XTMP4 xmm8
%define XFER xmm9
%define XTMP5 xmm11

%define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA
%define SHUF_DC00 xmm12 ; shuffle xDxC -> DC00
%define BYTE_FLIP_MASK xmm13

%ifdef LINUX
%define CTX rsi ; 2nd arg
%define INP rdi ; 1st arg

%define SRND rdi ; clobbers INP
%define c ecx
%define d r8d
%define e edx
%else
%define CTX rdx ; 2nd arg
%define INP rcx ; 1st arg

%define SRND rcx ; clobbers INP
%define c edi
%define d esi
%define e r8d

%endif
%define TBL rbp
%define a eax
%define b ebx

%define f r9d
%define g r10d
%define h r11d

%define y0 r13d
%define y1 r14d
%define y2 r15d


struc STACK
%ifndef LINUX
_XMM_SAVE:      reso 8
%endif
_XFER:          reso 1
endstruc
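;; Stack layout: on Windows, _XMM_SAVE holds the callee-saved registers
;; xmm6-xmm13 (8 owords); _XFER is one 16-byte slot that stages the four
;; W[t]+K[t] values consumed by each group of four rounds.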

%ifndef H0
%define H0 0x6a09e667
%define H1 0xbb67ae85
%define H2 0x3c6ef372
%define H3 0xa54ff53a
%define H4 0x510e527f
%define H5 0x9b05688c
%define H6 0x1f83d9ab
%define H7 0x5be0cd19
%define FUNC sha256_one_block_avx
%endif
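;; H0-H7 above are the standard SHA-256 initial hash values (FIPS 180-4).
;; The %ifndef guard lets a wrapper pre-define different initial values and a
;; different FUNC name before including this file (e.g. for a SHA-224 variant).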

; rotate_Xs
; Rotate values of symbols X0...X3
%macro rotate_Xs 0
%xdefine X_ X0
%xdefine X0 X1
%xdefine X1 X2
%xdefine X2 X3
%xdefine X3 X_
%endm

; ROTATE_ARGS
; Rotate values of symbols a...h
%macro ROTATE_ARGS 0
%xdefine TMP_ h
%xdefine h g
%xdefine g f
%xdefine f e
%xdefine e d
%xdefine d c
%xdefine c b
%xdefine b a
%xdefine a TMP_
%endm
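;; ROTATE_ARGS renames symbols instead of moving data: after a round, the
;; register that was h (now holding T1+T2) becomes a, a becomes b, ..., and g
;; becomes h, giving the SHA-256 state rotation with no mov instructions.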

%macro FOUR_ROUNDS_AND_SCHED 0
        ;; compute s0 four at a time and s1 two at a time
        ;; compute W[-16] + W[-7] 4 at a time
        ;vmovdqa XTMP0, X3
        mov     y0, e                   ; y0 = e
        MY_ROR  y0, (25-11)             ; y0 = e >> (25-11)
        mov     y1, a                   ; y1 = a
        vpalignr XTMP0, X3, X2, 4       ; XTMP0 = W[-7]
        MY_ROR  y1, (22-13)             ; y1 = a >> (22-13)
        xor     y0, e                   ; y0 = e ^ (e >> (25-11))
        mov     y2, f                   ; y2 = f
        MY_ROR  y0, (11-6)              ; y0 = (e >> (11-6)) ^ (e >> (25-6))
        ;vmovdqa XTMP1, X1
        xor     y1, a                   ; y1 = a ^ (a >> (22-13))
        xor     y2, g                   ; y2 = f^g
        vpaddd  XTMP0, XTMP0, X0        ; XTMP0 = W[-7] + W[-16]
        xor     y0, e                   ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and     y2, e                   ; y2 = (f^g)&e
        MY_ROR  y1, (13-2)              ; y1 = (a >> (13-2)) ^ (a >> (22-2))
        ;; compute s0
        vpalignr XTMP1, X1, X0, 4       ; XTMP1 = W[-15]
        xor     y1, a                   ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        MY_ROR  y0, 6                   ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        xor     y2, g                   ; y2 = CH = ((f^g)&e)^g

        MY_ROR  y1, 2                   ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        add     y2, y0                  ; y2 = S1 + CH
        add     y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH

        mov     y0, a                   ; y0 = a
        add     h, y2                   ; h = h + S1 + CH + k + w
        mov     y2, a                   ; y2 = a

        vpsrld  XTMP2, XTMP1, 7

        or      y0, c                   ; y0 = a|c
        add     d, h                    ; d = d + h + S1 + CH + k + w
        and     y2, c                   ; y2 = a&c

        vpslld  XTMP3, XTMP1, (32-7)

        and     y0, b                   ; y0 = (a|c)&b
        add     h, y1                   ; h = h + S1 + CH + k + w + S0

        vpor    XTMP3, XTMP3, XTMP2     ; XTMP3 = W[-15] MY_ROR 7

        or      y0, y2                  ; y0 = MAJ = ((a|c)&b)|(a&c)
        add     h, y0                   ; h = h + S1 + CH + k + w + S0 + MAJ

        ROTATE_ARGS

        mov     y0, e                   ; y0 = e
        mov     y1, a                   ; y1 = a


        MY_ROR  y0, (25-11)             ; y0 = e >> (25-11)
        xor     y0, e                   ; y0 = e ^ (e >> (25-11))
        mov     y2, f                   ; y2 = f
        MY_ROR  y1, (22-13)             ; y1 = a >> (22-13)

        vpsrld  XTMP2, XTMP1, 18

        xor     y1, a                   ; y1 = a ^ (a >> (22-13))
        MY_ROR  y0, (11-6)              ; y0 = (e >> (11-6)) ^ (e >> (25-6))
        xor     y2, g                   ; y2 = f^g

        vpsrld  XTMP4, XTMP1, 3         ; XTMP4 = W[-15] >> 3

        MY_ROR  y1, (13-2)              ; y1 = (a >> (13-2)) ^ (a >> (22-2))
        xor     y0, e                   ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and     y2, e                   ; y2 = (f^g)&e
        MY_ROR  y0, 6                   ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)

        vpslld  XTMP1, XTMP1, (32-18)

        xor     y1, a                   ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        xor     y2, g                   ; y2 = CH = ((f^g)&e)^g

        vpxor   XTMP3, XTMP3, XTMP1

        add     y2, y0                  ; y2 = S1 + CH
        add     y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH
        MY_ROR  y1, 2                   ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)

        vpxor   XTMP3, XTMP3, XTMP2     ; XTMP3 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18

        mov     y0, a                   ; y0 = a
        add     h, y2                   ; h = h + S1 + CH + k + w
        mov     y2, a                   ; y2 = a

        vpxor   XTMP1, XTMP3, XTMP4     ; XTMP1 = s0

        or      y0, c                   ; y0 = a|c
        add     d, h                    ; d = d + h + S1 + CH + k + w
        and     y2, c                   ; y2 = a&c
        ;; compute low s1
        vpshufd XTMP2, X3, 11111010b    ; XTMP2 = W[-2] {BBAA}
        and     y0, b                   ; y0 = (a|c)&b
        add     h, y1                   ; h = h + S1 + CH + k + w + S0
        vpaddd  XTMP0, XTMP0, XTMP1     ; XTMP0 = W[-16] + W[-7] + s0
        or      y0, y2                  ; y0 = MAJ = ((a|c)&b)|(a&c)
        add     h, y0                   ; h = h + S1 + CH + k + w + S0 + MAJ

        ROTATE_ARGS
        ;vmovdqa XTMP3, XTMP2           ; XTMP3 = W[-2] {BBAA}

        mov     y0, e                   ; y0 = e
        mov     y1, a                   ; y1 = a
        MY_ROR  y0, (25-11)             ; y0 = e >> (25-11)

        ;vmovdqa XTMP4, XTMP2           ; XTMP4 = W[-2] {BBAA}

        xor     y0, e                   ; y0 = e ^ (e >> (25-11))
        MY_ROR  y1, (22-13)             ; y1 = a >> (22-13)
        mov     y2, f                   ; y2 = f
        xor     y1, a                   ; y1 = a ^ (a >> (22-13))
        MY_ROR  y0, (11-6)              ; y0 = (e >> (11-6)) ^ (e >> (25-6))

        vpsrld  XTMP4, XTMP2, 10        ; XTMP4 = W[-2] >> 10 {BBAA}

        xor     y2, g                   ; y2 = f^g

        vpsrlq  XTMP3, XTMP2, 19        ; XTMP3 = W[-2] MY_ROR 19 {xBxA}

        xor     y0, e                   ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and     y2, e                   ; y2 = (f^g)&e

        vpsrlq  XTMP2, XTMP2, 17        ; XTMP2 = W[-2] MY_ROR 17 {xBxA}

        MY_ROR  y1, (13-2)              ; y1 = (a >> (13-2)) ^ (a >> (22-2))
        xor     y1, a                   ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        xor     y2, g                   ; y2 = CH = ((f^g)&e)^g
        MY_ROR  y0, 6                   ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        vpxor   XTMP2, XTMP2, XTMP3
        add     y2, y0                  ; y2 = S1 + CH
        MY_ROR  y1, 2                   ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        add     y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH
        vpxor   XTMP4, XTMP4, XTMP2     ; XTMP4 = s1 {xBxA}
        mov     y0, a                   ; y0 = a
        add     h, y2                   ; h = h + S1 + CH + k + w
        mov     y2, a                   ; y2 = a
        vpshufb XTMP4, XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA}
        or      y0, c                   ; y0 = a|c
        add     d, h                    ; d = d + h + S1 + CH + k + w
        and     y2, c                   ; y2 = a&c
        vpaddd  XTMP0, XTMP0, XTMP4     ; XTMP0 = {..., ..., W[1], W[0]}
        and     y0, b                   ; y0 = (a|c)&b
        add     h, y1                   ; h = h + S1 + CH + k + w + S0
        ;; compute high s1
        vpshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC}
        or      y0, y2                  ; y0 = MAJ = ((a|c)&b)|(a&c)
        add     h, y0                   ; h = h + S1 + CH + k + w + S0 + MAJ

        ROTATE_ARGS
        ;vmovdqa XTMP3, XTMP2           ; XTMP3 = W[-2] {DDCC}
        mov     y0, e                   ; y0 = e
        MY_ROR  y0, (25-11)             ; y0 = e >> (25-11)
        mov     y1, a                   ; y1 = a
        ;vmovdqa XTMP5, XTMP2           ; XTMP5 = W[-2] {DDCC}
        MY_ROR  y1, (22-13)             ; y1 = a >> (22-13)
        xor     y0, e                   ; y0 = e ^ (e >> (25-11))
        mov     y2, f                   ; y2 = f
        MY_ROR  y0, (11-6)              ; y0 = (e >> (11-6)) ^ (e >> (25-6))

        vpsrld  XTMP5, XTMP2, 10        ; XTMP5 = W[-2] >> 10 {DDCC}

        xor     y1, a                   ; y1 = a ^ (a >> (22-13))
        xor     y2, g                   ; y2 = f^g

        vpsrlq  XTMP3, XTMP2, 19        ; XTMP3 = W[-2] MY_ROR 19 {xDxC}

        xor     y0, e                   ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and     y2, e                   ; y2 = (f^g)&e
        MY_ROR  y1, (13-2)              ; y1 = (a >> (13-2)) ^ (a >> (22-2))

        vpsrlq  XTMP2, XTMP2, 17        ; XTMP2 = W[-2] MY_ROR 17 {xDxC}

        xor     y1, a                   ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        MY_ROR  y0, 6                   ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        xor     y2, g                   ; y2 = CH = ((f^g)&e)^g

        vpxor   XTMP2, XTMP2, XTMP3

        MY_ROR  y1, 2                   ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        add     y2, y0                  ; y2 = S1 + CH
        add     y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH
        vpxor   XTMP5, XTMP5, XTMP2     ; XTMP5 = s1 {xDxC}
        mov     y0, a                   ; y0 = a
        add     h, y2                   ; h = h + S1 + CH + k + w
        mov     y2, a                   ; y2 = a
        vpshufb XTMP5, XTMP5, SHUF_DC00 ; XTMP5 = s1 {DC00}
        or      y0, c                   ; y0 = a|c
        add     d, h                    ; d = d + h + S1 + CH + k + w
        and     y2, c                   ; y2 = a&c
        vpaddd  X0, XTMP5, XTMP0        ; X0 = {W[3], W[2], W[1], W[0]}
        and     y0, b                   ; y0 = (a|c)&b
        add     h, y1                   ; h = h + S1 + CH + k + w + S0
        or      y0, y2                  ; y0 = MAJ = ((a|c)&b)|(a&c)
        add     h, y0                   ; h = h + S1 + CH + k + w + S0 + MAJ

        ROTATE_ARGS
        rotate_Xs
%endm
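;; Each FOUR_ROUNDS_AND_SCHED invocation performs four rounds using the four
;; W[t]+K[t] values staged in _XFER and, in parallel, computes the next four
;; message schedule words W[t+16..t+19]; rotate_Xs then ages X1..X3 down to
;; X0..X2 and makes the freshly computed words the new X3.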

;; input is [rsp + _XFER + %1 * 4]
%macro DO_ROUND 1
        mov     y0, e                   ; y0 = e
        MY_ROR  y0, (25-11)             ; y0 = e >> (25-11)
        mov     y1, a                   ; y1 = a
        xor     y0, e                   ; y0 = e ^ (e >> (25-11))
        MY_ROR  y1, (22-13)             ; y1 = a >> (22-13)
        mov     y2, f                   ; y2 = f
        xor     y1, a                   ; y1 = a ^ (a >> (22-13))
        MY_ROR  y0, (11-6)              ; y0 = (e >> (11-6)) ^ (e >> (25-6))
        xor     y2, g                   ; y2 = f^g
        xor     y0, e                   ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        MY_ROR  y1, (13-2)              ; y1 = (a >> (13-2)) ^ (a >> (22-2))
        and     y2, e                   ; y2 = (f^g)&e
        xor     y1, a                   ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        MY_ROR  y0, 6                   ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        xor     y2, g                   ; y2 = CH = ((f^g)&e)^g
        add     y2, y0                  ; y2 = S1 + CH
        MY_ROR  y1, 2                   ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        add     y2, [rsp + _XFER + %1 * 4] ; y2 = k + w + S1 + CH
        mov     y0, a                   ; y0 = a
        add     h, y2                   ; h = h + S1 + CH + k + w
        mov     y2, a                   ; y2 = a
        or      y0, c                   ; y0 = a|c
        add     d, h                    ; d = d + h + S1 + CH + k + w
        and     y2, c                   ; y2 = a&c
        and     y0, b                   ; y0 = (a|c)&b
        add     h, y1                   ; h = h + S1 + CH + k + w + S0
        or      y0, y2                  ; y0 = MAJ = ((a|c)&b)|(a&c)
        add     h, y0                   ; h = h + S1 + CH + k + w + S0 + MAJ
        ROTATE_ARGS
%endm
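;; For reference, one scalar SHA-256 round as computed by DO_ROUND (and by each
;; round of FOUR_ROUNDS_AND_SCHED) is:
;;   S1  = ROR(e,6) ^ ROR(e,11) ^ ROR(e,25)
;;   CH  = (e & f) ^ (~e & g)           ; computed here as ((f^g) & e) ^ g
;;   S0  = ROR(a,2) ^ ROR(a,13) ^ ROR(a,22)
;;   MAJ = (a & b) ^ (a & c) ^ (b & c)  ; computed here as ((a|c) & b) | (a&c)
;;   h += S1 + CH + K[t] + W[t]  ;  d += h  ;  h += S0 + MAJ
;; followed by ROTATE_ARGS.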

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; void FUNC(void *input_data, UINT32 digest[8], UINT64 num_blks)
;; arg 1 : pointer to input data
;; arg 2 : pointer to digest
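;; Illustrative C usage (a sketch only: the prototype is taken from the comment
;; above, the message is assumed to be already padded into a single 64-byte
;; block per FIPS 180-4, and num_blks is not referenced by this one-block
;; implementation):
;;
;;     uint8_t  block[64];    /* one padded 64-byte message block */
;;     uint32_t digest[8];    /* receives the resulting state H0..H7 */
;;     sha256_one_block_avx(block, digest, 1);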
section .text
MKGLOBAL(FUNC,function,)
align 32
FUNC:
        push    rbx
%ifndef LINUX
        push    rsi
        push    rdi
%endif
        push    rbp
        push    r13
        push    r14
        push    r15

        sub     rsp, STACK_size
%ifndef LINUX
        vmovdqa [rsp + _XMM_SAVE + 0*16], xmm6
        vmovdqa [rsp + _XMM_SAVE + 1*16], xmm7
        vmovdqa [rsp + _XMM_SAVE + 2*16], xmm8
        vmovdqa [rsp + _XMM_SAVE + 3*16], xmm9
        vmovdqa [rsp + _XMM_SAVE + 4*16], xmm10
        vmovdqa [rsp + _XMM_SAVE + 5*16], xmm11
        vmovdqa [rsp + _XMM_SAVE + 6*16], xmm12
        vmovdqa [rsp + _XMM_SAVE + 7*16], xmm13
%endif

        ;; load initial digest
        mov     a, H0
        mov     b, H1
        mov     c, H2
        mov     d, H3
        mov     e, H4
        mov     f, H5
        mov     g, H6
        mov     h, H7

        vmovdqa BYTE_FLIP_MASK, [rel PSHUFFLE_BYTE_FLIP_MASK]
        vmovdqa SHUF_00BA, [rel _SHUF_00BA]
        vmovdqa SHUF_DC00, [rel _SHUF_DC00]

        lea     TBL, [rel K256]

        ;; byte swap first 16 dwords
        COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK
        COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK
        COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK
        COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK

        ;; schedule 48 input dwords, by doing 3 iterations of 16 rounds each
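        ;; (loop1 covers rounds 0-47 and keeps extending the message schedule;
        ;; loop2 below covers rounds 48-63, which only consume W values that
        ;; have already been computed)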
        mov     SRND, 3
align 16
loop1:
        vpaddd  XFER, X0, [TBL + 0*16]
        vmovdqa [rsp + _XFER], XFER
        FOUR_ROUNDS_AND_SCHED

        vpaddd  XFER, X0, [TBL + 1*16]
        vmovdqa [rsp + _XFER], XFER
        FOUR_ROUNDS_AND_SCHED

        vpaddd  XFER, X0, [TBL + 2*16]
        vmovdqa [rsp + _XFER], XFER
        FOUR_ROUNDS_AND_SCHED

        vpaddd  XFER, X0, [TBL + 3*16]
        vmovdqa [rsp + _XFER], XFER
        add     TBL, 4*16
        FOUR_ROUNDS_AND_SCHED

        sub     SRND, 1
        jne     loop1

        mov     SRND, 2
loop2:
        vpaddd  XFER, X0, [TBL + 0*16]
        vmovdqa [rsp + _XFER], XFER
        DO_ROUND 0
        DO_ROUND 1
        DO_ROUND 2
        DO_ROUND 3

        vpaddd  XFER, X1, [TBL + 1*16]
        vmovdqa [rsp + _XFER], XFER
        add     TBL, 2*16
        DO_ROUND 0
        DO_ROUND 1
        DO_ROUND 2
        DO_ROUND 3

        vmovdqa X0, X2
        vmovdqa X1, X3

        sub     SRND, 1
        jne     loop2

        add     a, H0
        add     b, H1
        add     c, H2
        add     d, H3
        add     e, H4
        add     f, H5
        add     g, H6
        mov     [4*0 + CTX], a
        mov     [4*1 + CTX], b
        mov     [4*2 + CTX], c
        mov     [4*3 + CTX], d
        mov     [4*4 + CTX], e
        mov     [4*5 + CTX], f
        mov     [4*6 + CTX], g
        add     h, H7
        mov     [4*7 + CTX], h

done_hash:
%ifndef LINUX
        vmovdqa xmm6,  [rsp + _XMM_SAVE + 0*16]
        vmovdqa xmm7,  [rsp + _XMM_SAVE + 1*16]
        vmovdqa xmm8,  [rsp + _XMM_SAVE + 2*16]
        vmovdqa xmm9,  [rsp + _XMM_SAVE + 3*16]
        vmovdqa xmm10, [rsp + _XMM_SAVE + 4*16]
        vmovdqa xmm11, [rsp + _XMM_SAVE + 5*16]
        vmovdqa xmm12, [rsp + _XMM_SAVE + 6*16]
        vmovdqa xmm13, [rsp + _XMM_SAVE + 7*16]
%endif

        add     rsp, STACK_size

        pop     r15
        pop     r14
        pop     r13
        pop     rbp
%ifndef LINUX
        pop     rdi
        pop     rsi
%endif
        pop     rbx

        ret


%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif