;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Implement fast SHA-256 with SSSE3 instructions. (x86_64)
;
; Copyright (C) 2013 Intel Corporation.
;
; Authors:
;     James Guilford <james.guilford@intel.com>
;     Kirk Yap <kirk.s.yap@intel.com>
;     Tim Chen <tim.c.chen@linux.intel.com>
; Transcoded by:
;     Xiaodong Liu <xiaodong.liu@intel.com>
;
; This software is available to you under the OpenIB.org BSD license
; below:
;
; Redistribution and use in source and binary forms, with or
; without modification, are permitted provided that the following
; conditions are met:
;
;  - Redistributions of source code must retain the above
;    copyright notice, this list of conditions and the following
;    disclaimer.
;
;  - Redistributions in binary form must reproduce the above
;    copyright notice, this list of conditions and the following
;    disclaimer in the documentation and/or other materials
;    provided with the distribution.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
; BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
; ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
; CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
; SOFTWARE.
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; This code is described in an Intel White-Paper:
; "Fast SHA-256 Implementations on Intel Architecture Processors"
;
; To find it, surf to http://www.intel.com/p/en_US/embedded
; and search for that title.
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%include "sha256_mb_mgr_datastruct.asm"
%include "reg_sizes.asm"

[bits 64]
default rel
section .text

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%ifidn __OUTPUT_FORMAT__, elf64
; Linux
%define arg0    rdi
%define arg1    rsi
%else
; Windows
%define arg0    rcx
%define arg1    rdx
%endif

%xdefine X0 xmm4
%xdefine X1 xmm5
%xdefine X2 xmm6
%xdefine X3 xmm7

%xdefine XTMP0 xmm0
%xdefine XTMP1 xmm1
%xdefine XTMP2 xmm2
%xdefine XTMP3 xmm3
%xdefine XTMP4 xmm8
%xdefine XFER xmm9

%define SHUF_00BA xmm10         ; shuffle xBxA -> 00BA
%define SHUF_DC00 xmm11         ; shuffle xDxC -> DC00
%define BYTE_FLIP_MASK xmm12

; arg index starts from 0 here, while in mgr_flush/submit it starts from 1
%define MGR     arg0    ; rdi or rcx
%define NBLK    arg1    ; rsi or rdx
%define IDX     r8      ; local variable -- consistent with caller
%define NLANX4  r10     ; consistent with caller, should be r10

%define TMGR    r9      ; data pointer stored in stack named _TMGR
%define INP     r9      ; data pointer stored in stack named _INP
%define SRND    r9      ; clobbers INP
%define TMP     r9      ; local variable -- used to address the digest

%xdefine TBL rbp
%xdefine c ecx
%xdefine d esi
%xdefine e edx
%xdefine a eax
%xdefine b ebx

%xdefine f edi
%xdefine g r12d
%xdefine h r11d

%xdefine y0 r13d
%xdefine y1 r14d
%xdefine y2 r15d


;; FRAMESZ plus pushes must be an odd multiple of 8
%define _STACK_ALIGN_SIZE 8     ; 0 or 8 depends on pushes
%define _INP_END_SIZE   8
%define _INP_SIZE       8
%define _TMGR_SIZE      8
%define _XFER_SIZE      16
%define _XMM_SAVE_SIZE  0
%define _GPR_SAVE_SIZE  8*9     ; rbx, rdx, rbp, (rdi, rsi), r12~r15

%define _STACK_ALIGN    0
%define _INP_END        (_STACK_ALIGN + _STACK_ALIGN_SIZE)
%define _INP            (_INP_END + _INP_END_SIZE)
%define _TMGR           (_INP + _INP_SIZE)
%define _XFER           (_TMGR + _TMGR_SIZE)
%define _XMM_SAVE       (_XFER + _XFER_SIZE)
%define _GPR_SAVE       (_XMM_SAVE + _XMM_SAVE_SIZE)
%define STACK_SIZE      (_GPR_SAVE + _GPR_SAVE_SIZE)
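
; For reference, with the sizes above the frame offsets work out to:
;   _INP_END = 8, _INP = 16, _TMGR = 24, _XFER = 32,
;   _XMM_SAVE = _GPR_SAVE = 48, STACK_SIZE = 48 + 8*9 = 120.
; 120 is an odd multiple of 8, so after "sub rsp, STACK_SIZE" the stack is
; 16-byte aligned again and _XFER (offset 32) is aligned as the movdqa
; stores into it require.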
126 | ||
127 | ;; assume buffers not aligned | |
128 | %define MOVDQ movdqu | |
129 | ||
130 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros | |
131 | ||
132 | ; addm [mem], reg | |
133 | ; Add reg to mem using reg-mem add and store | |
134 | %macro addm 2 | |
135 | add %2, %1 ;changed | |
136 | mov %1, %2 ;changed | |
137 | %endmacro | |
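
; (Expansion note: "addm a, [TMP]" becomes "add [TMP], a" followed by
; "mov a, [TMP]", so the dword in memory is incremented by the register and
; the register is left holding the new sum; [TMP] is only an illustrative
; operand here.)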
138 | ||
139 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
140 | ||
141 | ; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask | |
142 | ; Load xmm with mem and byte swap each dword | |
143 | %macro COPY_XMM_AND_BSWAP 3 | |
144 | MOVDQ %1, %2 ;changed | |
145 | pshufb %1, %3 ;changed | |
146 | %endmacro | |
147 | ||
148 | ; rotate_Xs | |
149 | ; Rotate values of symbols X0...X3 | |
150 | %macro rotate_Xs 0 | |
151 | %xdefine X_ X0 | |
152 | %xdefine X0 X1 | |
153 | %xdefine X1 X2 | |
154 | %xdefine X2 X3 | |
155 | %xdefine X3 X_ | |
156 | %endmacro | |
157 | ||
158 | ; ROTATE_ARGS | |
159 | ; Rotate values of symbols a...h | |
160 | %macro ROTATE_ARGS 0 | |
161 | %xdefine TMP_ h | |
162 | %xdefine h g | |
163 | %xdefine g f | |
164 | %xdefine f e | |
165 | %xdefine e d | |
166 | %xdefine d c | |
167 | %xdefine c b | |
168 | %xdefine b a | |
169 | %xdefine a TMP_ | |
170 | %endmacro | |
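
; (Note: rotate_Xs and ROTATE_ARGS move no data at run time. They only
; re-bind the preprocessor symbols, so every round body can be written in
; terms of "a..h" and "X0..X3" while the underlying registers shift by one
; position per round; after 8 ROTATE_ARGS the mapping is back where it
; started.)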
171 | ||
172 | %macro FOUR_ROUNDS_AND_SCHED 0 | |
173 | ;; compute s0 four at a time and s1 two at a time | |
174 | ;; compute W[-16] + W[-7] 4 at a time | |
175 | movdqa XTMP0, X3 | |
176 | mov y0, e ; y0 = e | |
177 | ror y0, (25-11) ; y0 = e >> (25-11) | |
178 | mov y1, a ; y1 = a | |
179 | palignr XTMP0, X2, 4 ; XTMP0 = W[-7] | |
180 | ror y1, (22-13) ; y1 = a >> (22-13) | |
181 | xor y0, e ; y0 = e ^ (e >> (25-11)) | |
182 | mov y2, f ; y2 = f | |
183 | ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) | |
184 | movdqa XTMP1, X1 | |
185 | xor y1, a ; y1 = a ^ (a >> (22-13) | |
186 | xor y2, g ; y2 = f^g | |
187 | paddd XTMP0, X0 ; XTMP0 = W[-7] + W[-16] | |
188 | xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) | |
189 | and y2, e ; y2 = (f^g)&e | |
190 | ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) | |
191 | ;; compute s0 | |
192 | palignr XTMP1, X0, 4 ; XTMP1 = W[-15] | |
193 | xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) | |
194 | ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) | |
195 | xor y2, g ; y2 = CH = ((f^g)&e)^g | |
196 | movdqa XTMP2, XTMP1 ; XTMP2 = W[-15] | |
197 | ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) | |
198 | add y2, y0 ; y2 = S1 + CH | |
199 | add y2 , [rsp + _XFER] ; y2 = k + w + S1 + CH | |
200 | movdqa XTMP3, XTMP1 ; XTMP3 = W[-15] | |
201 | mov y0, a ; y0 = a | |
202 | add h, y2 ; h = h + S1 + CH + k + w | |
203 | mov y2, a ; y2 = a | |
204 | pslld XTMP1, (32-7) ; | |
205 | or y0, c ; y0 = a|c | |
206 | add d, h ; d = d + h + S1 + CH + k + w | |
207 | and y2, c ; y2 = a&c | |
208 | psrld XTMP2, 7 ; | |
209 | and y0, b ; y0 = (a|c)&b | |
210 | add h, y1 ; h = h + S1 + CH + k + w + S0 | |
211 | por XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7 | |
212 | or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) | |
213 | add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ | |
214 | ||
215 | ROTATE_ARGS | |
216 | movdqa XTMP2, XTMP3 ; XTMP2 = W[-15] | |
217 | mov y0, e ; y0 = e | |
218 | mov y1, a ; y1 = a | |
219 | movdqa XTMP4, XTMP3 ; XTMP4 = W[-15] | |
220 | ror y0, (25-11) ; y0 = e >> (25-11) | |
221 | xor y0, e ; y0 = e ^ (e >> (25-11)) | |
222 | mov y2, f ; y2 = f | |
223 | ror y1, (22-13) ; y1 = a >> (22-13) | |
224 | pslld XTMP3, (32-18) ; | |
225 | xor y1, a ; y1 = a ^ (a >> (22-13) | |
226 | ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) | |
227 | xor y2, g ; y2 = f^g | |
228 | psrld XTMP2, 18 ; | |
229 | ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) | |
230 | xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) | |
231 | and y2, e ; y2 = (f^g)&e | |
232 | ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) | |
233 | pxor XTMP1, XTMP3 | |
234 | xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) | |
235 | xor y2, g ; y2 = CH = ((f^g)&e)^g | |
236 | psrld XTMP4, 3 ; XTMP4 = W[-15] >> 3 | |
237 | add y2, y0 ; y2 = S1 + CH | |
238 | add y2, [rsp + (1*4 + _XFER)] ; y2 = k + w + S1 + CH | |
239 | ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) | |
240 | pxor XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 | |
241 | mov y0, a ; y0 = a | |
242 | add h, y2 ; h = h + S1 + CH + k + w | |
243 | mov y2, a ; y2 = a | |
244 | pxor XTMP1, XTMP4 ; XTMP1 = s0 | |
245 | or y0, c ; y0 = a|c | |
246 | add d, h ; d = d + h + S1 + CH + k + w | |
247 | and y2, c ; y2 = a&c | |
248 | ;; compute low s1 | |
249 | pshufd XTMP2, X3, 11111010B ; XTMP2 = W[-2] {BBAA} | |
250 | and y0, b ; y0 = (a|c)&b | |
251 | add h, y1 ; h = h + S1 + CH + k + w + S0 | |
252 | paddd XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0 | |
253 | or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) | |
254 | add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ | |
255 | ||
256 | ROTATE_ARGS | |
257 | movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {BBAA} | |
258 | mov y0, e ; y0 = e | |
259 | mov y1, a ; y1 = a | |
260 | ror y0, (25-11) ; y0 = e >> (25-11) | |
261 | movdqa XTMP4, XTMP2 ; XTMP4 = W[-2] {BBAA} | |
262 | xor y0, e ; y0 = e ^ (e >> (25-11)) | |
263 | ror y1, (22-13) ; y1 = a >> (22-13) | |
264 | mov y2, f ; y2 = f | |
265 | xor y1, a ; y1 = a ^ (a >> (22-13) | |
266 | ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) | |
267 | psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xBxA} | |
268 | xor y2, g ; y2 = f^g | |
269 | psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xBxA} | |
270 | xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) | |
271 | and y2, e ; y2 = (f^g)&e | |
272 | psrld XTMP4, 10 ; XTMP4 = W[-2] >> 10 {BBAA} | |
273 | ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) | |
274 | xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) | |
275 | xor y2, g ; y2 = CH = ((f^g)&e)^g | |
276 | ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) | |
277 | pxor XTMP2, XTMP3 | |
278 | add y2, y0 ; y2 = S1 + CH | |
279 | ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) | |
280 | add y2, [rsp + (2*4 + _XFER)] ; y2 = k + w + S1 + CH | |
281 | pxor XTMP4, XTMP2 ; XTMP4 = s1 {xBxA} | |
282 | mov y0, a ; y0 = a | |
283 | add h, y2 ; h = h + S1 + CH + k + w | |
284 | mov y2, a ; y2 = a | |
285 | pshufb XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA} | |
286 | or y0, c ; y0 = a|c | |
287 | add d, h ; d = d + h + S1 + CH + k + w | |
288 | and y2, c ; y2 = a&c | |
289 | paddd XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]} | |
290 | and y0, b ; y0 = (a|c)&b | |
291 | add h, y1 ; h = h + S1 + CH + k + w + S0 | |
292 | ;; compute high s1 | |
293 | pshufd XTMP2, XTMP0, 01010000B ; XTMP2 = W[-2] {BBAA} | |
294 | or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) | |
295 | add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ | |
296 | ||
297 | ROTATE_ARGS | |
298 | movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {DDCC} | |
299 | mov y0, e ; y0 = e | |
300 | ror y0, (25-11) ; y0 = e >> (25-11) | |
301 | mov y1, a ; y1 = a | |
302 | movdqa X0, XTMP2 ; X0 = W[-2] {DDCC} | |
303 | ror y1, (22-13) ; y1 = a >> (22-13) | |
304 | xor y0, e ; y0 = e ^ (e >> (25-11)) | |
305 | mov y2, f ; y2 = f | |
306 | ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) | |
307 | psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xDxC} | |
308 | xor y1, a ; y1 = a ^ (a >> (22-13) | |
309 | xor y2, g ; y2 = f^g | |
310 | psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xDxC} | |
311 | xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25 | |
312 | and y2, e ; y2 = (f^g)&e | |
313 | ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) | |
314 | psrld X0, 10 ; X0 = W[-2] >> 10 {DDCC} | |
315 | xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22 | |
316 | ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>2 | |
317 | xor y2, g ; y2 = CH = ((f^g)&e)^g | |
318 | pxor XTMP2, XTMP3 ; | |
319 | ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>2 | |
320 | add y2, y0 ; y2 = S1 + CH | |
321 | add y2, [rsp + (3*4 + _XFER)] ; y2 = k + w + S1 + CH | |
322 | pxor X0, XTMP2 ; X0 = s1 {xDxC} | |
323 | mov y0, a ; y0 = a | |
324 | add h, y2 ; h = h + S1 + CH + k + w | |
325 | mov y2, a ; y2 = a | |
326 | pshufb X0, SHUF_DC00 ; X0 = s1 {DC00} | |
327 | or y0, c ; y0 = a|c | |
328 | add d, h ; d = d + h + S1 + CH + k + w | |
329 | and y2, c ; y2 = a&c | |
330 | paddd X0, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]} | |
331 | and y0, b ; y0 = (a|c)&b | |
332 | add h, y1 ; h = h + S1 + CH + k + w + S0 | |
333 | or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) | |
334 | add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ | |
335 | ||
336 | ROTATE_ARGS | |
337 | rotate_Xs | |
338 | %endmacro | |
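
; For reference, the schedule computed above is the standard SHA-256
; recurrence (four new words per invocation of this macro):
;     s0   = (W[-15] ror 7)  ^ (W[-15] ror 18) ^ (W[-15] >> 3)
;     s1   = (W[-2]  ror 17) ^ (W[-2]  ror 19) ^ (W[-2]  >> 10)
;     W[0] = W[-16] + s0 + W[-7] + s1
; s0 is computed for all four lanes at once; s1 only two lanes at a time,
; since the 32-bit rotates are emulated with 64-bit psrlq shifts on dwords
; that the {BBAA}/{DDCC} pshufd duplicated within each qword.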
339 | ||
340 | ;; input is [rsp + _XFER + %1 * 4] | |
341 | %macro DO_ROUND 1 | |
342 | mov y0, e ; y0 = e | |
343 | ror y0, (25-11) ; y0 = e >> (25-11) | |
344 | mov y1, a ; y1 = a | |
345 | xor y0, e ; y0 = e ^ (e >> (25-11)) | |
346 | ror y1, (22-13) ; y1 = a >> (22-13) | |
347 | mov y2, f ; y2 = f | |
348 | xor y1, a ; y1 = a ^ (a >> (22-13) | |
349 | ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) | |
350 | xor y2, g ; y2 = f^g | |
351 | xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) | |
352 | ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) | |
353 | and y2, e ; y2 = (f^g)&e | |
354 | xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) | |
355 | ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) | |
356 | xor y2, g ; y2 = CH = ((f^g)&e)^g | |
357 | add y2, y0 ; y2 = S1 + CH | |
358 | ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) | |
359 | %xdefine offset (%1 * 4 + _XFER) | |
360 | add y2, [rsp + offset] ; y2 = k + w + S1 + CH | |
361 | mov y0, a ; y0 = a | |
362 | add h, y2 ; h = h + S1 + CH + k + w | |
363 | mov y2, a ; y2 = a | |
364 | or y0, c ; y0 = a|c | |
365 | add d, h ; d = d + h + S1 + CH + k + w | |
366 | and y2, c ; y2 = a&c | |
367 | and y0, b ; y0 = (a|c)&b | |
368 | add h, y1 ; h = h + S1 + CH + k + w + S0 | |
369 | or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) | |
370 | add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ | |
371 | ROTATE_ARGS | |
372 | %endmacro | |
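
; For reference, one DO_ROUND is the standard SHA-256 round applied to the
; working variables a..h (which ROTATE_ARGS renames after each round):
;     S1  = (e ror 6) ^ (e ror 11) ^ (e ror 25)
;     CH  = (e & f) ^ (~e & g)           ; computed here as ((f ^ g) & e) ^ g
;     S0  = (a ror 2) ^ (a ror 13) ^ (a ror 22)
;     MAJ = (a & b) ^ (a & c) ^ (b & c)  ; computed here as ((a | c) & b) | (a & c)
;     T1  = h + S1 + CH + K[t] + W[t]
;     h   = T1 + S0 + MAJ
;     d   = d + T1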
373 | ||
374 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
375 | ; void sha1_opt_x1(SHA1_MB_ARGS_Xn *args, uint32_t size_in_blocks); | |
376 | ; arg 0 : MGR : pointer to args (only 4 of the 16 lanes used) | |
377 | ; arg 1 : NBLK : size (in blocks) ;; assumed to be >= 1 | |
378 | ; invisibile arg 2 : IDX : hash on which lane | |
379 | ; invisibile arg 3 : NLANX4 : max lanes*4 for this arch (digest is placed by it) | |
380 | ; (sse/avx is 4, avx2 is 8, avx512 is 16) | |
381 | ; | |
382 | ; Clobbers registers: all general regs, xmm0-xmm12 | |
383 | ; {rbx, rdx, rbp, (rdi, rsi), r12~r15 are saved on stack} | |
384 | ; | |
385 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
section .text
mk_global sha256_opt_x1, function, internal
sha256_opt_x1:
        endbranch
        sub     rsp, STACK_SIZE
        mov     [rsp + _GPR_SAVE + 8*0], rbx
        mov     [rsp + _GPR_SAVE + 8*1], rbp
%ifidn __OUTPUT_FORMAT__, win64
        mov     [rsp + _GPR_SAVE + 8*2], rdi
        mov     [rsp + _GPR_SAVE + 8*3], rsi
        ; caller has already stored XMM6~10
%endif
        mov     [rsp + _GPR_SAVE + 8*4], r12
        mov     [rsp + _GPR_SAVE + 8*5], r13
        mov     [rsp + _GPR_SAVE + 8*6], r14
        mov     [rsp + _GPR_SAVE + 8*7], r15
        mov     [rsp + _GPR_SAVE + 8*8], rdx

        shl     NBLK, 6                 ; convert to bytes
        jz      done_hash

        ; detach idx from nlanx4
        mov     IDX, NLANX4
        shr     NLANX4, 8
        and     IDX, 0xff
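        ; (Illustrative example, not a value taken from the callers: if r10
        ; arrives as 0x1002, the sequence above leaves IDX = 2 and
        ; NLANX4 = 0x10, i.e. lane 2 with a 16-byte stride between lanes.)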
411 | ||
412 | mov [rsp + _TMGR], MGR | |
413 | ;; Load input pointers | |
414 | mov INP, [MGR + _data_ptr + IDX*8] | |
415 | mov [rsp + _INP], INP | |
416 | ;; nblk is used to indicate data end | |
417 | add NBLK, INP | |
418 | mov [rsp + _INP_END], NBLK ; pointer to end of data | |
419 | ||
420 | ||
421 | mov TMGR, [rsp + _TMGR] | |
422 | ;; load initial digest | |
423 | lea TMP, [TMGR + 4*IDX] | |
424 | mov a, [TMP + 0*NLANX4] | |
425 | mov b, [TMP + 1*NLANX4] | |
426 | mov c, [TMP + 2*NLANX4] | |
427 | lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4 | |
428 | mov d, [TMP + 1*NLANX4] | |
429 | mov e, [TMP + 2*NLANX4] | |
430 | mov g, [TMP + 4*NLANX4] | |
431 | lea TMP, [TMP + 1*NLANX4] ; MGR + 4*IDX + 3*NLANX4 | |
432 | mov f, [TMP + 2*NLANX4] | |
433 | mov h, [TMP + 4*NLANX4] | |
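        ; (The digest words of all lanes are stored interleaved, so word j of
        ; lane IDX sits at offset 4*IDX + j*NLANX4 from the digest base; the
        ; loads above therefore pick up a..h at strides 0..7 * NLANX4.)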
434 | ||
435 | movdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK] | |
436 | movdqa SHUF_00BA, [_SHUF_00BA] | |
437 | movdqa SHUF_DC00, [_SHUF_DC00] | |
438 | ||
439 | mov INP, [rsp + _INP] | |
440 | loop0: | |
441 | lea TBL, [K256] | |
442 | ||
443 | ;; byte swap first 16 dwords | |
444 | COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK | |
445 | COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK | |
446 | COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK | |
447 | COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK | |
448 | ||
449 | mov [rsp + _INP], INP | |
450 | ||
451 | ;; schedule 48 input dwords, by doing 3 rounds of 16 each | |
452 | mov SRND, 3 | |
453 | ||
454 | loop1: | |
455 | movdqa XFER, [TBL] | |
456 | paddd XFER, X0 | |
457 | movdqa [rsp + _XFER], XFER | |
458 | FOUR_ROUNDS_AND_SCHED | |
459 | ||
460 | movdqa XFER, [TBL + 1*16] | |
461 | paddd XFER, X0 | |
462 | movdqa [rsp + _XFER], XFER | |
463 | FOUR_ROUNDS_AND_SCHED | |
464 | ||
465 | movdqa XFER, [TBL + 2*16] | |
466 | paddd XFER, X0 | |
467 | movdqa [rsp + _XFER], XFER | |
468 | FOUR_ROUNDS_AND_SCHED | |
469 | ||
470 | movdqa XFER, [TBL + 3*16] | |
471 | paddd XFER, X0 | |
472 | movdqa [rsp + _XFER], XFER | |
473 | add TBL, 4*16 | |
474 | FOUR_ROUNDS_AND_SCHED | |
475 | ||
476 | sub SRND, 1 | |
477 | jne loop1 | |
478 | ||
479 | mov SRND, 2 | |
480 | loop2: | |
481 | paddd X0, [TBL] | |
482 | movdqa [rsp + _XFER], X0 | |
483 | DO_ROUND 0 | |
484 | DO_ROUND 1 | |
485 | DO_ROUND 2 | |
486 | DO_ROUND 3 | |
487 | paddd X1, [TBL + 1*16] | |
488 | movdqa [rsp + _XFER], X1 | |
489 | add TBL, 2*16 | |
490 | DO_ROUND 0 | |
491 | DO_ROUND 1 | |
492 | DO_ROUND 2 | |
493 | DO_ROUND 3 | |
494 | ||
495 | movdqa X0, X2 | |
496 | movdqa X1, X3 | |
497 | ||
498 | sub SRND, 1 | |
499 | jne loop2 | |
500 | ||
501 | ; write out digests | |
502 | mov TMGR, [rsp + _TMGR] | |
503 | lea TMP, [TMGR + 4*IDX] | |
504 | addm a, [TMP + 0*NLANX4] | |
505 | addm b, [TMP + 1*NLANX4] | |
506 | addm c, [TMP + 2*NLANX4] | |
507 | lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4 | |
508 | addm d, [TMP + 1*NLANX4] | |
509 | addm e, [TMP + 2*NLANX4] | |
510 | addm g, [TMP + 4*NLANX4] | |
511 | lea TMP, [TMP + 1*NLANX4] ; MGR + 4*IDX + 3*NLANX4 | |
512 | addm f, [TMP + 2*NLANX4] | |
513 | addm h, [TMP + 4*NLANX4] | |
514 | ||
515 | mov INP, [rsp + _INP] | |
516 | add INP, 64 | |
517 | cmp INP, [rsp + _INP_END] | |
518 | jne loop0 | |
519 | ||
520 | done_hash: | |
521 | mov MGR, [rsp + _TMGR] | |
522 | ||
523 | mov rdx, [rsp + _GPR_SAVE + 8*8] | |
524 | mov r15, [rsp + _GPR_SAVE + 8*7] | |
525 | mov r14, [rsp + _GPR_SAVE + 8*6] | |
526 | mov r13, [rsp + _GPR_SAVE + 8*5] | |
527 | mov r12, [rsp + _GPR_SAVE + 8*4] | |
528 | %ifidn __OUTPUT_FORMAT__, win64 | |
529 | mov rsi, [rsp + _GPR_SAVE + 8*3] | |
530 | mov rdi, [rsp + _GPR_SAVE + 8*2] | |
531 | %endif | |
532 | mov rbp, [rsp + _GPR_SAVE + 8*1] | |
533 | mov rbx, [rsp + _GPR_SAVE + 8*0] | |
534 | add rsp, STACK_SIZE | |
535 | ||
536 | ret | |
537 | ||
section .data
align 64
K256:
        DD 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        DD 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        DD 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        DD 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        DD 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        DD 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        DD 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        DD 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        DD 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        DD 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        DD 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        DD 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        DD 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        DD 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        DD 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        DD 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

PSHUFFLE_BYTE_FLIP_MASK:
        DQ 0x0405060700010203, 0x0c0d0e0f08090a0b
560 | ||
561 | ; shuffle xBxA -> 00BA | |
562 | _SHUF_00BA: | |
563 | DQ 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF | |
564 | ||
565 | ; shuffle xDxC -> DC00 | |
566 | _SHUF_DC00: | |
567 | DQ 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100 |
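
; (Note on the masks above: PSHUFFLE_BYTE_FLIP_MASK reverses the bytes within
; each dword, converting the big-endian message words to host order on load.
; In the two shuffle masks, index bytes with the high bit set (0xFF) make
; pshufb write zero, so _SHUF_00BA keeps dwords 0 and 2 in the low half and
; zeroes the high half, while _SHUF_DC00 does the mirror image.)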