;;
;; Copyright (c) 2012-2018, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;;     * Redistributions of source code must retain the above copyright notice,
;;       this list of conditions and the following disclaimer.
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;     * Neither the name of Intel Corporation nor the names of its contributors
;;       may be used to endorse or promote products derived from this software
;;       without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;

; This code schedules 1 block at a time, with 4 lanes per block
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%include "os.asm"

section .data
default rel
align 64
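;; K256 holds the 64 SHA-256 round constants defined in FIPS 180-4: the first
;; 32 bits of the fractional parts of the cube roots of the first 64 prime
;; numbers, one dword per round.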
K256:
        dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203
        dq 0x0405060700010203, 0x0c0d0e0f08090a0b
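;; vpshufb with PSHUFFLE_BYTE_FLIP_MASK reverses the byte order within each
;; 32-bit dword of an XMM register, converting the big-endian message words of
;; the input block into the little-endian form used by the arithmetic below.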

; shuffle xBxA -> 00BA
_SHUF_00BA: ;ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100
        dq 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF

; shuffle xDxC -> DC00
_SHUF_DC00: ;ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
        dq 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100

section .text

%define VMOVDQ vmovdqu ;; assume buffers not aligned

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros

%macro MY_ROR 2
        shld    %1, %1, (32-(%2))
%endm
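;; MY_ROR rotates a 32-bit register right by %2 bits: SHLD of a register with
;; itself by (32 - n) shifts the value left by (32 - n) while shifting its own
;; top bits back in from the right, which is equivalent to a rotate right by n.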

; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
; Load xmm with mem and byte swap each dword
%macro COPY_XMM_AND_BSWAP 3
        VMOVDQ  %1, %2
        vpshufb %1, %1, %3
%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%define X0 xmm4
%define X1 xmm5
%define X2 xmm6
%define X3 xmm7

%define XTMP0 xmm0
%define XTMP1 xmm1
%define XTMP2 xmm2
%define XTMP3 xmm3
%define XTMP4 xmm8
%define XFER  xmm9
%define XTMP5 xmm11

%define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA
%define SHUF_DC00 xmm12 ; shuffle xDxC -> DC00
%define BYTE_FLIP_MASK xmm13

%ifdef LINUX
%define CTX rsi ; 2nd arg
%define INP rdi ; 1st arg

%define SRND rdi ; clobbers INP
%define c ecx
%define d r8d
%define e edx
%else
%define CTX rdx ; 2nd arg
%define INP rcx ; 1st arg

%define SRND rcx ; clobbers INP
%define c edi
%define d esi
%define e r8d

%endif
%define TBL rbp
%define a eax
%define b ebx

%define f r9d
%define g r10d
%define h r11d

%define y0 r13d
%define y1 r14d
%define y2 r15d


struc STACK
%ifndef LINUX
_XMM_SAVE:      reso 8
%endif
_XFER:          reso 1
endstruc
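;; Stack frame: on Windows the first area preserves the callee-saved registers
;; xmm6-xmm13 (eight 16-byte slots); _XFER is a single 16-byte slot holding the
;; four K[t] + W[t] values consumed by the current group of four rounds.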

%ifndef FUNC
%define FUNC sha256_block_avx
%endif

; rotate_Xs
; Rotate values of symbols X0...X3
%macro rotate_Xs 0
%xdefine X_ X0
%xdefine X0 X1
%xdefine X1 X2
%xdefine X2 X3
%xdefine X3 X_
%endm

; ROTATE_ARGS
; Rotate values of symbols a...h
%macro ROTATE_ARGS 0
%xdefine TMP_ h
%xdefine h g
%xdefine g f
%xdefine f e
%xdefine e d
%xdefine d c
%xdefine c b
%xdefine b a
%xdefine a TMP_
%endm
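;; ROTATE_ARGS performs the SHA-256 state rotation (a..h each move down one
;; position, with the new a and e produced by the round) purely at assembly
;; time: the macro renames the symbols instead of moving registers, so each
;; round is emitted with the correct register mapping, and after eight rounds
;; the names line up with the original assignment again.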

%macro FOUR_ROUNDS_AND_SCHED 0
        ;; compute s0 four at a time and s1 two at a time
        ;; compute W[-16] + W[-7] 4 at a time
        ;vmovdqa XTMP0, X3
        mov     y0, e           ; y0 = e
        MY_ROR  y0, (25-11)     ; y0 = e >> (25-11)
        mov     y1, a           ; y1 = a
        vpalignr XTMP0, X3, X2, 4       ; XTMP0 = W[-7]
        MY_ROR  y1, (22-13)     ; y1 = a >> (22-13)
        xor     y0, e           ; y0 = e ^ (e >> (25-11))
        mov     y2, f           ; y2 = f
        MY_ROR  y0, (11-6)      ; y0 = (e >> (11-6)) ^ (e >> (25-6))
        ;vmovdqa XTMP1, X1
        xor     y1, a           ; y1 = a ^ (a >> (22-13))
        xor     y2, g           ; y2 = f^g
        vpaddd  XTMP0, XTMP0, X0        ; XTMP0 = W[-7] + W[-16]
        xor     y0, e           ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and     y2, e           ; y2 = (f^g)&e
        MY_ROR  y1, (13-2)      ; y1 = (a >> (13-2)) ^ (a >> (22-2))
        ;; compute s0
        vpalignr XTMP1, X1, X0, 4       ; XTMP1 = W[-15]
        xor     y1, a           ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        MY_ROR  y0, 6           ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        xor     y2, g           ; y2 = CH = ((f^g)&e)^g

        MY_ROR  y1, 2           ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        add     y2, y0          ; y2 = S1 + CH
        add     y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH

        mov     y0, a           ; y0 = a
        add     h, y2           ; h = h + S1 + CH + k + w
        mov     y2, a           ; y2 = a

        vpsrld  XTMP2, XTMP1, 7

        or      y0, c           ; y0 = a|c
        add     d, h            ; d = d + h + S1 + CH + k + w
        and     y2, c           ; y2 = a&c

        vpslld  XTMP3, XTMP1, (32-7)

        and     y0, b           ; y0 = (a|c)&b
        add     h, y1           ; h = h + S1 + CH + k + w + S0

        vpor    XTMP3, XTMP3, XTMP2     ; XTMP3 = W[-15] MY_ROR 7

        or      y0, y2          ; y0 = MAJ = ((a|c)&b)|(a&c)
        add     h, y0           ; h = h + S1 + CH + k + w + S0 + MAJ

        ROTATE_ARGS

        mov     y0, e           ; y0 = e
        mov     y1, a           ; y1 = a

        MY_ROR  y0, (25-11)     ; y0 = e >> (25-11)
        xor     y0, e           ; y0 = e ^ (e >> (25-11))
        mov     y2, f           ; y2 = f
        MY_ROR  y1, (22-13)     ; y1 = a >> (22-13)

        vpsrld  XTMP2, XTMP1, 18

        xor     y1, a           ; y1 = a ^ (a >> (22-13))
        MY_ROR  y0, (11-6)      ; y0 = (e >> (11-6)) ^ (e >> (25-6))
        xor     y2, g           ; y2 = f^g

        vpsrld  XTMP4, XTMP1, 3 ; XTMP4 = W[-15] >> 3

        MY_ROR  y1, (13-2)      ; y1 = (a >> (13-2)) ^ (a >> (22-2))
        xor     y0, e           ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and     y2, e           ; y2 = (f^g)&e
        MY_ROR  y0, 6           ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)

        vpslld  XTMP1, XTMP1, (32-18)

        xor     y1, a           ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        xor     y2, g           ; y2 = CH = ((f^g)&e)^g

        vpxor   XTMP3, XTMP3, XTMP1

        add     y2, y0          ; y2 = S1 + CH
        add     y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH
        MY_ROR  y1, 2           ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)

        vpxor   XTMP3, XTMP3, XTMP2     ; XTMP3 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18

        mov     y0, a           ; y0 = a
        add     h, y2           ; h = h + S1 + CH + k + w
        mov     y2, a           ; y2 = a

        vpxor   XTMP1, XTMP3, XTMP4     ; XTMP1 = s0

        or      y0, c           ; y0 = a|c
        add     d, h            ; d = d + h + S1 + CH + k + w
        and     y2, c           ; y2 = a&c
        ;; compute low s1
        vpshufd XTMP2, X3, 11111010b    ; XTMP2 = W[-2] {BBAA}
        and     y0, b           ; y0 = (a|c)&b
        add     h, y1           ; h = h + S1 + CH + k + w + S0
        vpaddd  XTMP0, XTMP0, XTMP1     ; XTMP0 = W[-16] + W[-7] + s0
        or      y0, y2          ; y0 = MAJ = ((a|c)&b)|(a&c)
        add     h, y0           ; h = h + S1 + CH + k + w + S0 + MAJ

        ROTATE_ARGS
        ;vmovdqa XTMP3, XTMP2   ; XTMP3 = W[-2] {BBAA}

        mov     y0, e           ; y0 = e
        mov     y1, a           ; y1 = a
        MY_ROR  y0, (25-11)     ; y0 = e >> (25-11)

        ;vmovdqa XTMP4, XTMP2   ; XTMP4 = W[-2] {BBAA}

        xor     y0, e           ; y0 = e ^ (e >> (25-11))
        MY_ROR  y1, (22-13)     ; y1 = a >> (22-13)
        mov     y2, f           ; y2 = f
        xor     y1, a           ; y1 = a ^ (a >> (22-13))
        MY_ROR  y0, (11-6)      ; y0 = (e >> (11-6)) ^ (e >> (25-6))

        vpsrld  XTMP4, XTMP2, 10        ; XTMP4 = W[-2] >> 10 {BBAA}

        xor     y2, g           ; y2 = f^g

        vpsrlq  XTMP3, XTMP2, 19        ; XTMP3 = W[-2] MY_ROR 19 {xBxA}

        xor     y0, e           ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and     y2, e           ; y2 = (f^g)&e

        vpsrlq  XTMP2, XTMP2, 17        ; XTMP2 = W[-2] MY_ROR 17 {xBxA}

        MY_ROR  y1, (13-2)      ; y1 = (a >> (13-2)) ^ (a >> (22-2))
        xor     y1, a           ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        xor     y2, g           ; y2 = CH = ((f^g)&e)^g
        MY_ROR  y0, 6           ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        vpxor   XTMP2, XTMP2, XTMP3
        add     y2, y0          ; y2 = S1 + CH
        MY_ROR  y1, 2           ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        add     y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH
        vpxor   XTMP4, XTMP4, XTMP2     ; XTMP4 = s1 {xBxA}
        mov     y0, a           ; y0 = a
        add     h, y2           ; h = h + S1 + CH + k + w
        mov     y2, a           ; y2 = a
        vpshufb XTMP4, XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA}
        or      y0, c           ; y0 = a|c
        add     d, h            ; d = d + h + S1 + CH + k + w
        and     y2, c           ; y2 = a&c
        vpaddd  XTMP0, XTMP0, XTMP4     ; XTMP0 = {..., ..., W[1], W[0]}
        and     y0, b           ; y0 = (a|c)&b
        add     h, y1           ; h = h + S1 + CH + k + w + S0
        ;; compute high s1
        vpshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC}
        or      y0, y2          ; y0 = MAJ = ((a|c)&b)|(a&c)
        add     h, y0           ; h = h + S1 + CH + k + w + S0 + MAJ

        ROTATE_ARGS
        ;vmovdqa XTMP3, XTMP2   ; XTMP3 = W[-2] {DDCC}
        mov     y0, e           ; y0 = e
        MY_ROR  y0, (25-11)     ; y0 = e >> (25-11)
        mov     y1, a           ; y1 = a
        ;vmovdqa XTMP5, XTMP2   ; XTMP5 = W[-2] {DDCC}
        MY_ROR  y1, (22-13)     ; y1 = a >> (22-13)
        xor     y0, e           ; y0 = e ^ (e >> (25-11))
        mov     y2, f           ; y2 = f
        MY_ROR  y0, (11-6)      ; y0 = (e >> (11-6)) ^ (e >> (25-6))

        vpsrld  XTMP5, XTMP2, 10        ; XTMP5 = W[-2] >> 10 {DDCC}

        xor     y1, a           ; y1 = a ^ (a >> (22-13))
        xor     y2, g           ; y2 = f^g

        vpsrlq  XTMP3, XTMP2, 19        ; XTMP3 = W[-2] MY_ROR 19 {xDxC}

        xor     y0, e           ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and     y2, e           ; y2 = (f^g)&e
        MY_ROR  y1, (13-2)      ; y1 = (a >> (13-2)) ^ (a >> (22-2))

        vpsrlq  XTMP2, XTMP2, 17        ; XTMP2 = W[-2] MY_ROR 17 {xDxC}

        xor     y1, a           ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        MY_ROR  y0, 6           ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        xor     y2, g           ; y2 = CH = ((f^g)&e)^g

        vpxor   XTMP2, XTMP2, XTMP3

        MY_ROR  y1, 2           ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        add     y2, y0          ; y2 = S1 + CH
        add     y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH
        vpxor   XTMP5, XTMP5, XTMP2     ; XTMP5 = s1 {xDxC}
        mov     y0, a           ; y0 = a
        add     h, y2           ; h = h + S1 + CH + k + w
        mov     y2, a           ; y2 = a
        vpshufb XTMP5, XTMP5, SHUF_DC00 ; XTMP5 = s1 {DC00}
        or      y0, c           ; y0 = a|c
        add     d, h            ; d = d + h + S1 + CH + k + w
        and     y2, c           ; y2 = a&c
        vpaddd  X0, XTMP5, XTMP0        ; X0 = {W[3], W[2], W[1], W[0]}
        and     y0, b           ; y0 = (a|c)&b
        add     h, y1           ; h = h + S1 + CH + k + w + S0
        or      y0, y2          ; y0 = MAJ = ((a|c)&b)|(a&c)
        add     h, y0           ; h = h + S1 + CH + k + w + S0 + MAJ

        ROTATE_ARGS
        rotate_Xs
%endm
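;; Reference for the macro above (standard FIPS 180-4 message schedule, all
;; operations on 32-bit words):
;;   s0(x) = (x ror 7)  ^ (x ror 18) ^ (x >> 3)
;;   s1(x) = (x ror 17) ^ (x ror 19) ^ (x >> 10)
;;   W[i]  = W[i-16] + s0(W[i-15]) + W[i-7] + s1(W[i-2])
;; Each invocation also runs four rounds of the compression function, using the
;; four K[t] + W[t] values staged in the _XFER stack slot.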

;; input is [rsp + _XFER + %1 * 4]
%macro DO_ROUND 1
        mov     y0, e           ; y0 = e
        MY_ROR  y0, (25-11)     ; y0 = e >> (25-11)
        mov     y1, a           ; y1 = a
        xor     y0, e           ; y0 = e ^ (e >> (25-11))
        MY_ROR  y1, (22-13)     ; y1 = a >> (22-13)
        mov     y2, f           ; y2 = f
        xor     y1, a           ; y1 = a ^ (a >> (22-13))
        MY_ROR  y0, (11-6)      ; y0 = (e >> (11-6)) ^ (e >> (25-6))
        xor     y2, g           ; y2 = f^g
        xor     y0, e           ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        MY_ROR  y1, (13-2)      ; y1 = (a >> (13-2)) ^ (a >> (22-2))
        and     y2, e           ; y2 = (f^g)&e
        xor     y1, a           ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        MY_ROR  y0, 6           ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        xor     y2, g           ; y2 = CH = ((f^g)&e)^g
        add     y2, y0          ; y2 = S1 + CH
        MY_ROR  y1, 2           ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        add     y2, [rsp + _XFER + %1 * 4]      ; y2 = k + w + S1 + CH
        mov     y0, a           ; y0 = a
        add     h, y2           ; h = h + S1 + CH + k + w
        mov     y2, a           ; y2 = a
        or      y0, c           ; y0 = a|c
        add     d, h            ; d = d + h + S1 + CH + k + w
        and     y2, c           ; y2 = a&c
        and     y0, b           ; y0 = (a|c)&b
        add     h, y1           ; h = h + S1 + CH + k + w + S0
        or      y0, y2          ; y0 = MAJ = ((a|c)&b)|(a&c)
        add     h, y0           ; h = h + S1 + CH + k + w + S0 + MAJ
        ROTATE_ARGS
%endm
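;; Reference for a single round (FIPS 180-4), as computed by DO_ROUND and by
;; the scalar half of FOUR_ROUNDS_AND_SCHED:
;;   S1  = (e ror 6) ^ (e ror 11) ^ (e ror 25)
;;   CH  = (e & f) ^ (~e & g)              ; computed here as ((f ^ g) & e) ^ g
;;   S0  = (a ror 2) ^ (a ror 13) ^ (a ror 22)
;;   MAJ = (a & b) ^ (a & c) ^ (b & c)     ; computed here as ((a | c) & b) | (a & c)
;;   T1  = h + S1 + CH + K[t] + W[t]
;;   h, g, f, e, d, c, b, a  <-  g, f, e, d + T1, c, b, a, T1 + S0 + MAJ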

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; void FUNC(void *input_data, UINT32 digest[8], UINT64 num_blks)
;; arg 1 : pointer to input data
;; arg 2 : pointer to digest
section .text
MKGLOBAL(FUNC,function,)
align 32
FUNC:
        push    rbx
%ifndef LINUX
        push    rsi
        push    rdi
%endif
        push    rbp
        push    r13
        push    r14
        push    r15

        sub     rsp, STACK_size
%ifndef LINUX
        vmovdqa [rsp + _XMM_SAVE + 0*16], xmm6
        vmovdqa [rsp + _XMM_SAVE + 1*16], xmm7
        vmovdqa [rsp + _XMM_SAVE + 2*16], xmm8
        vmovdqa [rsp + _XMM_SAVE + 3*16], xmm9
        vmovdqa [rsp + _XMM_SAVE + 4*16], xmm10
        vmovdqa [rsp + _XMM_SAVE + 5*16], xmm11
        vmovdqa [rsp + _XMM_SAVE + 6*16], xmm12
        vmovdqa [rsp + _XMM_SAVE + 7*16], xmm13
%endif
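;; xmm6-xmm15 are callee-saved in the Windows x64 calling convention, so the
;; XMM registers clobbered below must be preserved; the System V (LINUX) ABI
;; treats all XMM registers as caller-saved, so no save area is needed there.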

        ;; load initial digest
        mov     a, [4*0 + CTX]
        mov     b, [4*1 + CTX]
        mov     c, [4*2 + CTX]
        mov     d, [4*3 + CTX]
        mov     e, [4*4 + CTX]
        mov     f, [4*5 + CTX]
        mov     g, [4*6 + CTX]
        mov     h, [4*7 + CTX]

        vmovdqa BYTE_FLIP_MASK, [rel PSHUFFLE_BYTE_FLIP_MASK]
        vmovdqa SHUF_00BA, [rel _SHUF_00BA]
        vmovdqa SHUF_DC00, [rel _SHUF_DC00]

        lea     TBL, [rel K256]

        ;; byte swap first 16 dwords
        COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK
        COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK
        COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK
        COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK

        ;; schedule 48 input dwords, by doing 3 rounds of 16 each
        mov     SRND, 3
align 16
loop1:
        vpaddd  XFER, X0, [TBL + 0*16]
        vmovdqa [rsp + _XFER], XFER
        FOUR_ROUNDS_AND_SCHED

        vpaddd  XFER, X0, [TBL + 1*16]
        vmovdqa [rsp + _XFER], XFER
        FOUR_ROUNDS_AND_SCHED

        vpaddd  XFER, X0, [TBL + 2*16]
        vmovdqa [rsp + _XFER], XFER
        FOUR_ROUNDS_AND_SCHED

        vpaddd  XFER, X0, [TBL + 3*16]
        vmovdqa [rsp + _XFER], XFER
        add     TBL, 4*16
        FOUR_ROUNDS_AND_SCHED

        sub     SRND, 1
        jne     loop1

        mov     SRND, 2
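        ;; final 16 rounds: W[48..63] are already scheduled in X0..X3, so two
        ;; passes of eight rounds each consume them with no further expansion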
loop2:
        vpaddd  XFER, X0, [TBL + 0*16]
        vmovdqa [rsp + _XFER], XFER
        DO_ROUND 0
        DO_ROUND 1
        DO_ROUND 2
        DO_ROUND 3

        vpaddd  XFER, X1, [TBL + 1*16]
        vmovdqa [rsp + _XFER], XFER
        add     TBL, 2*16
        DO_ROUND 0
        DO_ROUND 1
        DO_ROUND 2
        DO_ROUND 3

        vmovdqa X0, X2
        vmovdqa X1, X3

        sub     SRND, 1
        jne     loop2

        add     [4*0 + CTX], a
        add     [4*1 + CTX], b
        add     [4*2 + CTX], c
        add     [4*3 + CTX], d
        add     [4*4 + CTX], e
        add     [4*5 + CTX], f
        add     [4*6 + CTX], g
        add     [4*7 + CTX], h
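        ;; the adds above fold the working variables back into the caller's
        ;; digest (the Davies-Meyer feed-forward), completing one 64-byte block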

done_hash:
%ifndef LINUX
        vmovdqa xmm6, [rsp + _XMM_SAVE + 0*16]
        vmovdqa xmm7, [rsp + _XMM_SAVE + 1*16]
        vmovdqa xmm8, [rsp + _XMM_SAVE + 2*16]
        vmovdqa xmm9, [rsp + _XMM_SAVE + 3*16]
        vmovdqa xmm10, [rsp + _XMM_SAVE + 4*16]
        vmovdqa xmm11, [rsp + _XMM_SAVE + 5*16]
        vmovdqa xmm12, [rsp + _XMM_SAVE + 6*16]
        vmovdqa xmm13, [rsp + _XMM_SAVE + 7*16]
%endif

        add     rsp, STACK_size

        pop     r15
        pop     r14
        pop     r13
        pop     rbp
%ifndef LINUX
        pop     rdi
        pop     rsi
%endif
        pop     rbx

        ret


%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif