;;
;; Copyright (c) 2012-2018, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;;     * Redistributions of source code must retain the above copyright notice,
;;       this list of conditions and the following disclaimer.
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;     * Neither the name of Intel Corporation nor the names of its contributors
;;       may be used to endorse or promote products derived from this software
;;       without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;

; This code schedules 1 block at a time, with 4 lanes per block
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
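;
; Overview (a sketch of the flow below, for orientation only):
;   - the 64-byte input block is loaded into X0..X3 and byte-swapped
;     (SHA-256 message words are big-endian);
;   - loop1 runs rounds 0..47, expanding the message schedule four words
;     at a time with FOUR_ROUNDS_AND_SCHED;
;   - loop2 runs rounds 48..63 with DO_ROUND, using the already scheduled
;     words left in X0..X3;
;   - the working variables a..h are then added back into the digest.
;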
%include "include/os.asm"

section .data
default rel
align 64
K256:
        dd      0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        dd      0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        dd      0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        dd      0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        dd      0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        dd      0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        dd      0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        dd      0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        dd      0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        dd      0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        dd      0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        dd      0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        dd      0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        dd      0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        dd      0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        dd      0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203
        dq      0x0405060700010203, 0x0c0d0e0f08090a0b

; shuffle xBxA -> 00BA
_SHUF_00BA:              ;ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100
        dq      0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF
; shuffle xDxC -> DC00
_SHUF_DC00:              ;ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
        dq      0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100
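;
; Note: BYTE_FLIP_MASK converts the big-endian message words of the input
; block into native little-endian dwords. _SHUF_00BA and _SHUF_DC00 move the
; two half-results of the s1 computation into the low and high qwords so each
; half can be folded into the schedule accumulator with a single paddd.
;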

section .text


%define MOVDQ movdqu            ;; assume buffers not aligned

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros

; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
; Load xmm with mem and byte swap each dword
%macro COPY_XMM_AND_BSWAP 3
        MOVDQ   %1, %2
        pshufb  %1, %3
%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%define X0 xmm4
%define X1 xmm5
%define X2 xmm6
%define X3 xmm7

%define XTMP0 xmm0
%define XTMP1 xmm1
%define XTMP2 xmm2
%define XTMP3 xmm3
%define XTMP4 xmm8
%define XFER  xmm9

%define SHUF_00BA       xmm10   ; shuffle xBxA -> 00BA
%define SHUF_DC00       xmm11   ; shuffle xDxC -> DC00
%define BYTE_FLIP_MASK  xmm12

%ifdef LINUX
%define CTX     rsi     ; 2nd arg
%define INP     rdi     ; 1st arg

%define SRND    rdi     ; clobbers INP
%define c       ecx
%define d       r8d
%define e       edx
%else
%define CTX     rdx     ; 2nd arg
%define INP     rcx     ; 1st arg

%define SRND    rcx     ; clobbers INP
%define c       edi
%define d       esi
%define e       r8d

%endif
%define TBL     rbp
%define a       eax
%define b       ebx

%define f       r9d
%define g       r10d
%define h       r11d

%define y0      r13d
%define y1      r14d
%define y2      r15d


struc STACK
%ifndef LINUX
_XMM_SAVE:      reso    7
%endif
_XFER:          reso    1
endstruc

%ifndef FUNC
%define FUNC sha256_block_sse
%endif

; rotate_Xs
; Rotate values of symbols X0...X3
%macro rotate_Xs 0
%xdefine X_ X0
%xdefine X0 X1
%xdefine X1 X2
%xdefine X2 X3
%xdefine X3 X_
%endm

; ROTATE_ARGS
; Rotate values of symbols a...h
%macro ROTATE_ARGS 0
%xdefine TMP_ h
%xdefine h g
%xdefine g f
%xdefine f e
%xdefine e d
%xdefine d c
%xdefine c b
%xdefine b a
%xdefine a TMP_
%endm
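
; Note: ROTATE_ARGS (and rotate_Xs for the schedule registers) only renames
; the preprocessor symbols, so each round writes its result into whichever
; register is currently called "h" without moving any data; after eight
; rotations the mapping is back where it started.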

%macro FOUR_ROUNDS_AND_SCHED 0
                ;; compute s0 four at a time and s1 two at a time
                ;; compute W[-16] + W[-7] 4 at a time
        movdqa  XTMP0, X3
        mov     y0, e                   ; y0 = e
        ror     y0, (25-11)             ; y0 = e >> (25-11)
        mov     y1, a                   ; y1 = a
        palignr XTMP0, X2, 4            ; XTMP0 = W[-7]
        ror     y1, (22-13)             ; y1 = a >> (22-13)
        xor     y0, e                   ; y0 = e ^ (e >> (25-11))
        mov     y2, f                   ; y2 = f
        ror     y0, (11-6)              ; y0 = (e >> (11-6)) ^ (e >> (25-6))
        movdqa  XTMP1, X1
        xor     y1, a                   ; y1 = a ^ (a >> (22-13))
        xor     y2, g                   ; y2 = f^g
        paddd   XTMP0, X0               ; XTMP0 = W[-7] + W[-16]
        xor     y0, e                   ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and     y2, e                   ; y2 = (f^g)&e
        ror     y1, (13-2)              ; y1 = (a >> (13-2)) ^ (a >> (22-2))
                ;; compute s0
        palignr XTMP1, X0, 4            ; XTMP1 = W[-15]
        xor     y1, a                   ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        ror     y0, 6                   ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        xor     y2, g                   ; y2 = CH = ((f^g)&e)^g
        movdqa  XTMP2, XTMP1            ; XTMP2 = W[-15]
        ror     y1, 2                   ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        add     y2, y0                  ; y2 = S1 + CH
        add     y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH
        movdqa  XTMP3, XTMP1            ; XTMP3 = W[-15]
        mov     y0, a                   ; y0 = a
        add     h, y2                   ; h = h + S1 + CH + k + w
        mov     y2, a                   ; y2 = a
        pslld   XTMP1, (32-7)
        or      y0, c                   ; y0 = a|c
        add     d, h                    ; d = d + h + S1 + CH + k + w
        and     y2, c                   ; y2 = a&c
        psrld   XTMP2, 7
        and     y0, b                   ; y0 = (a|c)&b
        add     h, y1                   ; h = h + S1 + CH + k + w + S0
        por     XTMP1, XTMP2            ; XTMP1 = W[-15] ror 7
        or      y0, y2                  ; y0 = MAJ = ((a|c)&b)|(a&c)
        add     h, y0                   ; h = h + S1 + CH + k + w + S0 + MAJ

        ROTATE_ARGS
        movdqa  XTMP2, XTMP3            ; XTMP2 = W[-15]
        mov     y0, e                   ; y0 = e
        mov     y1, a                   ; y1 = a
        movdqa  XTMP4, XTMP3            ; XTMP4 = W[-15]
        ror     y0, (25-11)             ; y0 = e >> (25-11)
        xor     y0, e                   ; y0 = e ^ (e >> (25-11))
        mov     y2, f                   ; y2 = f
        ror     y1, (22-13)             ; y1 = a >> (22-13)
        pslld   XTMP3, (32-18)
        xor     y1, a                   ; y1 = a ^ (a >> (22-13))
        ror     y0, (11-6)              ; y0 = (e >> (11-6)) ^ (e >> (25-6))
        xor     y2, g                   ; y2 = f^g
        psrld   XTMP2, 18
        ror     y1, (13-2)              ; y1 = (a >> (13-2)) ^ (a >> (22-2))
        xor     y0, e                   ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and     y2, e                   ; y2 = (f^g)&e
        ror     y0, 6                   ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        pxor    XTMP1, XTMP3
        xor     y1, a                   ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        xor     y2, g                   ; y2 = CH = ((f^g)&e)^g
        psrld   XTMP4, 3                ; XTMP4 = W[-15] >> 3
        add     y2, y0                  ; y2 = S1 + CH
        add     y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH
        ror     y1, 2                   ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        pxor    XTMP1, XTMP2            ; XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
        mov     y0, a                   ; y0 = a
        add     h, y2                   ; h = h + S1 + CH + k + w
        mov     y2, a                   ; y2 = a
        pxor    XTMP1, XTMP4            ; XTMP1 = s0
        or      y0, c                   ; y0 = a|c
        add     d, h                    ; d = d + h + S1 + CH + k + w
        and     y2, c                   ; y2 = a&c
                ;; compute low s1
        pshufd  XTMP2, X3, 11111010b    ; XTMP2 = W[-2] {BBAA}
        and     y0, b                   ; y0 = (a|c)&b
        add     h, y1                   ; h = h + S1 + CH + k + w + S0
        paddd   XTMP0, XTMP1            ; XTMP0 = W[-16] + W[-7] + s0
        or      y0, y2                  ; y0 = MAJ = ((a|c)&b)|(a&c)
        add     h, y0                   ; h = h + S1 + CH + k + w + S0 + MAJ

        ROTATE_ARGS
        movdqa  XTMP3, XTMP2            ; XTMP3 = W[-2] {BBAA}
        mov     y0, e                   ; y0 = e
        mov     y1, a                   ; y1 = a
        ror     y0, (25-11)             ; y0 = e >> (25-11)
        movdqa  XTMP4, XTMP2            ; XTMP4 = W[-2] {BBAA}
        xor     y0, e                   ; y0 = e ^ (e >> (25-11))
        ror     y1, (22-13)             ; y1 = a >> (22-13)
        mov     y2, f                   ; y2 = f
        xor     y1, a                   ; y1 = a ^ (a >> (22-13))
        ror     y0, (11-6)              ; y0 = (e >> (11-6)) ^ (e >> (25-6))
        psrlq   XTMP2, 17               ; XTMP2 = W[-2] ror 17 {xBxA}
        xor     y2, g                   ; y2 = f^g
        psrlq   XTMP3, 19               ; XTMP3 = W[-2] ror 19 {xBxA}
        xor     y0, e                   ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and     y2, e                   ; y2 = (f^g)&e
        psrld   XTMP4, 10               ; XTMP4 = W[-2] >> 10 {BBAA}
        ror     y1, (13-2)              ; y1 = (a >> (13-2)) ^ (a >> (22-2))
        xor     y1, a                   ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        xor     y2, g                   ; y2 = CH = ((f^g)&e)^g
        ror     y0, 6                   ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        pxor    XTMP2, XTMP3
        add     y2, y0                  ; y2 = S1 + CH
        ror     y1, 2                   ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        add     y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH
        pxor    XTMP4, XTMP2            ; XTMP4 = s1 {xBxA}
        mov     y0, a                   ; y0 = a
        add     h, y2                   ; h = h + S1 + CH + k + w
        mov     y2, a                   ; y2 = a
        pshufb  XTMP4, SHUF_00BA        ; XTMP4 = s1 {00BA}
        or      y0, c                   ; y0 = a|c
        add     d, h                    ; d = d + h + S1 + CH + k + w
        and     y2, c                   ; y2 = a&c
        paddd   XTMP0, XTMP4            ; XTMP0 = {..., ..., W[1], W[0]}
        and     y0, b                   ; y0 = (a|c)&b
        add     h, y1                   ; h = h + S1 + CH + k + w + S0
                ;; compute high s1
        pshufd  XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC}
        or      y0, y2                  ; y0 = MAJ = ((a|c)&b)|(a&c)
        add     h, y0                   ; h = h + S1 + CH + k + w + S0 + MAJ

        ROTATE_ARGS
        movdqa  XTMP3, XTMP2            ; XTMP3 = W[-2] {DDCC}
        mov     y0, e                   ; y0 = e
        ror     y0, (25-11)             ; y0 = e >> (25-11)
        mov     y1, a                   ; y1 = a
        movdqa  X0, XTMP2               ; X0 = W[-2] {DDCC}
        ror     y1, (22-13)             ; y1 = a >> (22-13)
        xor     y0, e                   ; y0 = e ^ (e >> (25-11))
        mov     y2, f                   ; y2 = f
        ror     y0, (11-6)              ; y0 = (e >> (11-6)) ^ (e >> (25-6))
        psrlq   XTMP2, 17               ; XTMP2 = W[-2] ror 17 {xDxC}
        xor     y1, a                   ; y1 = a ^ (a >> (22-13))
        xor     y2, g                   ; y2 = f^g
        psrlq   XTMP3, 19               ; XTMP3 = W[-2] ror 19 {xDxC}
        xor     y0, e                   ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and     y2, e                   ; y2 = (f^g)&e
        ror     y1, (13-2)              ; y1 = (a >> (13-2)) ^ (a >> (22-2))
        psrld   X0, 10                  ; X0 = W[-2] >> 10 {DDCC}
        xor     y1, a                   ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        ror     y0, 6                   ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        xor     y2, g                   ; y2 = CH = ((f^g)&e)^g
        pxor    XTMP2, XTMP3
        ror     y1, 2                   ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        add     y2, y0                  ; y2 = S1 + CH
        add     y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH
        pxor    X0, XTMP2               ; X0 = s1 {xDxC}
        mov     y0, a                   ; y0 = a
        add     h, y2                   ; h = h + S1 + CH + k + w
        mov     y2, a                   ; y2 = a
        pshufb  X0, SHUF_DC00           ; X0 = s1 {DC00}
        or      y0, c                   ; y0 = a|c
        add     d, h                    ; d = d + h + S1 + CH + k + w
        and     y2, c                   ; y2 = a&c
        paddd   X0, XTMP0               ; X0 = {W[3], W[2], W[1], W[0]}
        and     y0, b                   ; y0 = (a|c)&b
        add     h, y1                   ; h = h + S1 + CH + k + w + S0
        or      y0, y2                  ; y0 = MAJ = ((a|c)&b)|(a&c)
        add     h, y0                   ; h = h + S1 + CH + k + w + S0 + MAJ

        ROTATE_ARGS
        rotate_Xs
%endm
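
;; For reference, the message schedule computed above follows FIPS 180-4:
;;   s0(x) = ROTR(x,7)  ^ ROTR(x,18) ^ (x >> 3)
;;   s1(x) = ROTR(x,17) ^ ROTR(x,19) ^ (x >> 10)
;;   W[t]  = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
;; Each FOUR_ROUNDS_AND_SCHED invocation produces the next four W[t] in X0
;; while performing four rounds on the values already staged at _XFER.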

;; input is [rsp + _XFER + %1 * 4]
%macro DO_ROUND 1
        mov     y0, e                   ; y0 = e
        ror     y0, (25-11)             ; y0 = e >> (25-11)
        mov     y1, a                   ; y1 = a
        xor     y0, e                   ; y0 = e ^ (e >> (25-11))
        ror     y1, (22-13)             ; y1 = a >> (22-13)
        mov     y2, f                   ; y2 = f
        xor     y1, a                   ; y1 = a ^ (a >> (22-13))
        ror     y0, (11-6)              ; y0 = (e >> (11-6)) ^ (e >> (25-6))
        xor     y2, g                   ; y2 = f^g
        xor     y0, e                   ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        ror     y1, (13-2)              ; y1 = (a >> (13-2)) ^ (a >> (22-2))
        and     y2, e                   ; y2 = (f^g)&e
        xor     y1, a                   ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        ror     y0, 6                   ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        xor     y2, g                   ; y2 = CH = ((f^g)&e)^g
        add     y2, y0                  ; y2 = S1 + CH
        ror     y1, 2                   ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        add     y2, [rsp + _XFER + %1 * 4] ; y2 = k + w + S1 + CH
        mov     y0, a                   ; y0 = a
        add     h, y2                   ; h = h + S1 + CH + k + w
        mov     y2, a                   ; y2 = a
        or      y0, c                   ; y0 = a|c
        add     d, h                    ; d = d + h + S1 + CH + k + w
        and     y2, c                   ; y2 = a&c
        and     y0, b                   ; y0 = (a|c)&b
        add     h, y1                   ; h = h + S1 + CH + k + w + S0
        or      y0, y2                  ; y0 = MAJ = ((a|c)&b)|(a&c)
        add     h, y0                   ; h = h + S1 + CH + k + w + S0 + MAJ
        ROTATE_ARGS
%endm
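
;; For reference, each round implements the standard SHA-256 step:
;;   T1 = h + S1(e) + CH(e,f,g) + K[t] + W[t]
;;   T2 = S0(a) + MAJ(a,b,c)
;;   d += T1 ; h = T1 + T2   (the other variables shift via ROTATE_ARGS)
;; with S1(e) = ROTR(e,6)^ROTR(e,11)^ROTR(e,25),
;;      S0(a) = ROTR(a,2)^ROTR(a,13)^ROTR(a,22),
;;      CH(e,f,g)  = (e&f)^(~e&g), computed here as ((f^g)&e)^g, and
;;      MAJ(a,b,c) = (a&b)^(a&c)^(b&c), computed here as ((a|c)&b)|(a&c).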

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; void FUNC(void *input_data, UINT32 digest[8])
;; arg 1 : pointer to input data
;; arg 2 : pointer to digest
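;;
;; Illustrative caller-side sketch (not part of this file; variable names and
;; C types are assumptions): the routine compresses exactly one 64-byte block
;; into a digest the caller has already initialized, e.g. with the SHA-256 IV.
;; Padding and length encoding are the caller's responsibility.
;;
;;   /* extern void sha256_block_sse(void *block, uint32_t digest[8]); */
;;   uint32_t d[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
;;                     0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };
;;   sha256_block_sse(block64, d);   /* d now holds the updated state */
;;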
section .text
MKGLOBAL(FUNC,function,internal)
align 32
FUNC:
        push    rbx
%ifndef LINUX
        push    rsi
        push    rdi
%endif
        push    rbp
        push    r13
        push    r14
        push    r15

        sub     rsp, STACK_size
%ifndef LINUX
        movdqa  [rsp + _XMM_SAVE + 0*16], xmm6
        movdqa  [rsp + _XMM_SAVE + 1*16], xmm7
        movdqa  [rsp + _XMM_SAVE + 2*16], xmm8
        movdqa  [rsp + _XMM_SAVE + 3*16], xmm9
        movdqa  [rsp + _XMM_SAVE + 4*16], xmm10
        movdqa  [rsp + _XMM_SAVE + 5*16], xmm11
        movdqa  [rsp + _XMM_SAVE + 6*16], xmm12
%endif
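        ;; note: xmm6-xmm15 are callee-saved in the Windows x64 ABI, so the
        ;; xmm6-xmm12 registers clobbered by this routine are saved above and
        ;; restored in the epilogue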

        ;; load initial digest
        mov     a, [4*0 + CTX]
        mov     b, [4*1 + CTX]
        mov     c, [4*2 + CTX]
        mov     d, [4*3 + CTX]
        mov     e, [4*4 + CTX]
        mov     f, [4*5 + CTX]
        mov     g, [4*6 + CTX]
        mov     h, [4*7 + CTX]

        movdqa  BYTE_FLIP_MASK, [rel PSHUFFLE_BYTE_FLIP_MASK]
        movdqa  SHUF_00BA, [rel _SHUF_00BA]
        movdqa  SHUF_DC00, [rel _SHUF_DC00]

        lea     TBL, [rel K256]

        ;; byte swap first 16 dwords
        COPY_XMM_AND_BSWAP      X0, [INP + 0*16], BYTE_FLIP_MASK
        COPY_XMM_AND_BSWAP      X1, [INP + 1*16], BYTE_FLIP_MASK
        COPY_XMM_AND_BSWAP      X2, [INP + 2*16], BYTE_FLIP_MASK
        COPY_XMM_AND_BSWAP      X3, [INP + 3*16], BYTE_FLIP_MASK

        ;; schedule 48 input dwords, by doing 3 rounds of 16 each
        mov     SRND, 3
align 16
loop1:
        movdqa  XFER, [TBL + 0*16]
        paddd   XFER, X0
        movdqa  [rsp + _XFER], XFER
        FOUR_ROUNDS_AND_SCHED

        movdqa  XFER, [TBL + 1*16]
        paddd   XFER, X0
        movdqa  [rsp + _XFER], XFER
        FOUR_ROUNDS_AND_SCHED

        movdqa  XFER, [TBL + 2*16]
        paddd   XFER, X0
        movdqa  [rsp + _XFER], XFER
        FOUR_ROUNDS_AND_SCHED

        movdqa  XFER, [TBL + 3*16]
        paddd   XFER, X0
        movdqa  [rsp + _XFER], XFER
        add     TBL, 4*16
        FOUR_ROUNDS_AND_SCHED

        sub     SRND, 1
        jne     loop1

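        ;; last 16 rounds: W[48..63] already sit in X0..X3, so no further
        ;; message scheduling is needed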
        mov     SRND, 2
loop2:
        paddd   X0, [TBL + 0*16]
        movdqa  [rsp + _XFER], X0
        DO_ROUND        0
        DO_ROUND        1
        DO_ROUND        2
        DO_ROUND        3
        paddd   X1, [TBL + 1*16]
        movdqa  [rsp + _XFER], X1
        add     TBL, 2*16
        DO_ROUND        0
        DO_ROUND        1
        DO_ROUND        2
        DO_ROUND        3

        movdqa  X0, X2
        movdqa  X1, X3

        sub     SRND, 1
        jne     loop2

        add     [4*0 + CTX], a
        add     [4*1 + CTX], b
        add     [4*2 + CTX], c
        add     [4*3 + CTX], d
        add     [4*4 + CTX], e
        add     [4*5 + CTX], f
        add     [4*6 + CTX], g
        add     [4*7 + CTX], h

done_hash:
%ifndef LINUX
        movdqa  xmm6, [rsp + _XMM_SAVE + 0*16]
        movdqa  xmm7, [rsp + _XMM_SAVE + 1*16]
        movdqa  xmm8, [rsp + _XMM_SAVE + 2*16]
        movdqa  xmm9, [rsp + _XMM_SAVE + 3*16]
        movdqa  xmm10, [rsp + _XMM_SAVE + 4*16]
        movdqa  xmm11, [rsp + _XMM_SAVE + 5*16]
        movdqa  xmm12, [rsp + _XMM_SAVE + 6*16]
%ifdef SAFE_DATA
        ;; Clear potentially sensitive data stored on the stack
        pxor    xmm0, xmm0
        movdqa  [rsp + _XMM_SAVE + 0 * 16], xmm0
        movdqa  [rsp + _XMM_SAVE + 1 * 16], xmm0
        movdqa  [rsp + _XMM_SAVE + 2 * 16], xmm0
        movdqa  [rsp + _XMM_SAVE + 3 * 16], xmm0
        movdqa  [rsp + _XMM_SAVE + 4 * 16], xmm0
        movdqa  [rsp + _XMM_SAVE + 5 * 16], xmm0
        movdqa  [rsp + _XMM_SAVE + 6 * 16], xmm0
%endif
%endif ;; LINUX

        add     rsp, STACK_size

        pop     r15
        pop     r14
        pop     r13
        pop     rbp
%ifndef LINUX
        pop     rdi
        pop     rsi
%endif
        pop     rbx

        ret


%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif