]>
Commit | Line | Data |
---|---|---|
1e59de90 TL |
1 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
2 | ; Copyright(c) 2011-2017 Intel Corporation All rights reserved. | |
3 | ; | |
4 | ; Redistribution and use in source and binary forms, with or without | |
5 | ; modification, are permitted provided that the following conditions | |
6 | ; are met: | |
7 | ; * Redistributions of source code must retain the above copyright | |
8 | ; notice, this list of conditions and the following disclaimer. | |
9 | ; * Redistributions in binary form must reproduce the above copyright | |
10 | ; notice, this list of conditions and the following disclaimer in | |
11 | ; the documentation and/or other materials provided with the | |
12 | ; distribution. | |
13 | ; * Neither the name of Intel Corporation nor the names of its | |
14 | ; contributors may be used to endorse or promote products derived | |
15 | ; from this software without specific prior written permission. | |
16 | ; | |
17 | ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
18 | ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
19 | ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
20 | ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
21 | ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
22 | ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
23 | ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
24 | ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
25 | ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
26 | ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
27 | ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
28 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
29 | ||
30 | %include "sha256_mb_mgr_datastruct.asm" | |
31 | %include "reg_sizes.asm" | |
32 | ||
33 | %ifdef HAVE_AS_KNOWS_SHANI | |
34 | ||
35 | [bits 64] | |
36 | default rel | |
37 | section .text | |
38 | ||
39 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
40 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
41 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
42 | ||
43 | %ifidn __OUTPUT_FORMAT__, elf64 | |
44 | ; Linux (elf64 output): first two integer arguments arrive in rdi, rsi | |
45 | %define arg0 rdi | |
46 | %define arg1 rsi | |
47 | %else | |
48 | ; Windows (this branch is taken for every non-elf64 output format): arguments arrive in rcx, rdx | |
49 | %define arg0 rcx | |
50 | %define arg1 rdx | |
51 | %endif | |
52 | ||
53 | ;; FRAMESZ plus pushes must be an odd multiple of 8 -- NOTE(review): sha256_ni_x2 re-aligns rsp with an AND and does no pushes, so this rule looks vestigial; confirm | |
54 | %define FRAMESZ 64 ; 4 x 16 bytes: saved STATE0/STATE1 and STATE0b/STATE1b at [rsp + 0*16 .. 3*16] | |
55 | %define RSPSAVE rax | |
56 | ||
57 | %define MSG xmm0 | |
58 | %define STATE0 xmm1 | |
59 | %define STATE1 xmm2 | |
60 | %define MSGTMP0 xmm3 | |
61 | %define MSGTMP1 xmm4 | |
62 | %define MSGTMP2 xmm5 | |
63 | %define MSGTMP3 xmm6 | |
64 | %define MSGTMP4 xmm7 | |
65 | ||
66 | %define STATE0b xmm8 | |
67 | %define STATE1b xmm9 | |
68 | %define MSGTMP0b xmm10 | |
69 | %define MSGTMP1b xmm11 | |
70 | %define MSGTMP2b xmm12 | |
71 | %define MSGTMP3b xmm13 | |
72 | %define MSGTMP4b xmm14 | |
73 | ||
74 | %define SHUF_MASK xmm15 | |
75 | ||
76 | ; argument indices start from 0 here, while mgr_flush/submit number them from 1 | |
77 | %define MGR arg0 | |
78 | %define NBLK arg1 | |
79 | %define NLANX4 r10 ; consistent with caller; arrives packed with IDX in its low byte | |
80 | %define IDX r8 ; local variable -- consistent with caller | |
81 | %define DPTR r11 ; local variable -- input buffer pointer (first lane) | |
82 | %define DPTRb r12 | |
83 | %define TMP r9 ; local variable -- helper for addressing the digest (first lane) | |
84 | %define TBL r13 | |
85 | %define TMPb r14 ; local variable -- helper for addressing the digest (second lane) | |
86 | align 32 | |
87 | ||
88 | ; void sha256_ni_x2(SHA256_MB_ARGS_Xn *args, uint32_t size_in_blocks); | |
89 | ; arg 0 : MGR : pointer to args; this routine always works on lanes 0 and 1 | |
90 | ; arg 1 : NBLK : size (in blocks) ;; assumed to be >= 1 (0 returns at once without touching the digests) | |
91 | ; invisible arg 2 : IDX : lane index, unpacked from the low byte of r10 (not used for addressing here) | |
92 | ; invisible arg 3 : NLANX4 : r10 >> 8 = max lanes*4 for this arch, used as the byte stride between digest words | |
93 | ; (sse/avx is 4, avx2 is 8, avx512 is 16) | |
94 | ; Stack: a 64-byte, 16-byte-aligned scratch frame holds the pre-block state of both lanes | |
95 | ; Clobbers registers: rax, r8~r14, arg1 (NBLK), xmm0-xmm15 | |
96 | ; NOTE(review): r12~r14 are not saved here; presumably the calling manager preserves them -- confirm | |
97 | mk_global sha256_ni_x2, function, internal | |
98 | sha256_ni_x2: | |
99 | endbranch | |
100 | mov RSPSAVE, rsp | |
101 | sub rsp, FRAMESZ | |
102 | and rsp, ~0xF ; Align 16Bytes downward so the frame can be accessed with movdqa | |
103 | ||
104 | shl NBLK, 6 ; transform blk amount into bytes (64 bytes per block); sets ZF when the result is 0 | |
105 | jz backto_mgr | |
106 | ||
107 | ; detach idx from nlanx4: r10 arrives packed as (NLANX4 << 8) + IDX | |
108 | mov IDX, NLANX4 | |
109 | shr NLANX4, 8 | |
110 | and IDX, 0xff | |
111 | ||
112 | lea TMP, [MGR + 4*0] | |
113 | lea TMPb, [MGR + 4*1] | |
114 | ||
115 | ;; Initialize digest: word w of lane n is read from [MGR + 4*n + w*NLANX4] | |
116 | ;; digests -> ABEF(state0), CDGH(state1) | |
117 | pinsrd STATE0, [TMP + 0*NLANX4], 3 ; A | |
118 | pinsrd STATE0, [TMP + 1*NLANX4], 2 ; B | |
119 | pinsrd STATE1, [TMP + 2*NLANX4], 3 ; C | |
120 | lea TMP, [TMP + 2*NLANX4] ; TMP = MGR + 4*0 + 2*NLANX4 | |
121 | pinsrd STATE1, [TMP + 1*NLANX4], 2 ; D | |
122 | pinsrd STATE0, [TMP + 2*NLANX4], 1 ; E | |
123 | pinsrd STATE1, [TMP + 4*NLANX4], 1 ; G | |
124 | lea TMP, [TMP + 1*NLANX4] ; TMP = MGR + 4*0 + 3*NLANX4 | |
125 | pinsrd STATE0, [TMP + 2*NLANX4], 0 ; F | |
126 | pinsrd STATE1, [TMP + 4*NLANX4], 0 ; H | |
127 | ||
128 | pinsrd STATE0b, [TMPb + 0*NLANX4], 3 ; A | |
129 | pinsrd STATE0b, [TMPb + 1*NLANX4], 2 ; B | |
130 | pinsrd STATE1b, [TMPb + 2*NLANX4], 3 ; C | |
131 | lea TMPb, [TMPb + 2*NLANX4] ; TMPb = MGR + 4*1 + 2*NLANX4 | |
132 | pinsrd STATE1b, [TMPb + 1*NLANX4], 2 ; D | |
133 | pinsrd STATE0b, [TMPb + 2*NLANX4], 1 ; E | |
134 | pinsrd STATE1b, [TMPb + 4*NLANX4], 1 ; G | |
135 | lea TMPb, [TMPb + 1*NLANX4] ; TMPb = MGR + 4*1 + 3*NLANX4 | |
136 | pinsrd STATE0b, [TMPb + 2*NLANX4], 0 ; F | |
137 | pinsrd STATE1b, [TMPb + 4*NLANX4], 0 ; H | |
138 | ||
139 | movdqa SHUF_MASK, [PSHUFFLE_SHANI_MASK] | |
140 | lea TBL, [TABLE] | |
141 | ||
142 | ;; Load input pointers for lane 0 (DPTR) and lane 1 (DPTRb) | |
143 | mov DPTR, [MGR + _data_ptr + 8*0] | |
144 | mov DPTRb,[MGR + _data_ptr + 8*1] | |
145 | ;; nblk is used to indicate data end: NBLK = lane 0 start address + byte count | |
146 | add NBLK, DPTR | |
147 | ||
148 | lloop: | |
149 | ; /* Save hash values for addition after rounds */ | |
150 | movdqa [rsp + 0*16], STATE0 | |
151 | movdqa [rsp + 1*16], STATE1 | |
152 | ||
153 | movdqa [rsp + 2*16], STATE0b | |
154 | movdqa [rsp + 3*16], STATE1b | |
155 | ||
156 | ; /* Rounds 0-3 */ | |
157 | movdqu MSG, [DPTR + 0*16] | |
158 | pshufb MSG, SHUF_MASK | |
159 | movdqa MSGTMP0, MSG | |
160 | paddd MSG, [TBL + 0*16] | |
161 | sha256rnds2 STATE1, STATE0, MSG | |
162 | pshufd MSG, MSG, 0x0E | |
163 | sha256rnds2 STATE0, STATE1, MSG | |
164 | ||
165 | movdqu MSG, [DPTRb + 0*16] | |
166 | pshufb MSG, SHUF_MASK | |
167 | movdqa MSGTMP0b, MSG | |
168 | paddd MSG, [TBL + 0*16] | |
169 | sha256rnds2 STATE1b, STATE0b, MSG | |
170 | pshufd MSG, MSG, 0x0E | |
171 | sha256rnds2 STATE0b, STATE1b, MSG | |
172 | ||
173 | ; /* Rounds 4-7 */ | |
174 | movdqu MSG, [DPTR + 1*16] | |
175 | pshufb MSG, SHUF_MASK | |
176 | movdqa MSGTMP1, MSG | |
177 | paddd MSG, [TBL + 1*16] | |
178 | sha256rnds2 STATE1, STATE0, MSG | |
179 | pshufd MSG, MSG, 0x0E | |
180 | sha256rnds2 STATE0, STATE1, MSG | |
181 | sha256msg1 MSGTMP0, MSGTMP1 | |
182 | ||
183 | movdqu MSG, [DPTRb + 1*16] | |
184 | pshufb MSG, SHUF_MASK | |
185 | movdqa MSGTMP1b, MSG | |
186 | paddd MSG, [TBL + 1*16] | |
187 | sha256rnds2 STATE1b, STATE0b, MSG | |
188 | pshufd MSG, MSG, 0x0E | |
189 | sha256rnds2 STATE0b, STATE1b, MSG | |
190 | sha256msg1 MSGTMP0b, MSGTMP1b | |
191 | ||
192 | ; /* Rounds 8-11 */ | |
193 | movdqu MSG, [DPTR + 2*16] | |
194 | pshufb MSG, SHUF_MASK | |
195 | movdqa MSGTMP2, MSG | |
196 | paddd MSG, [TBL + 2*16] | |
197 | sha256rnds2 STATE1, STATE0, MSG | |
198 | pshufd MSG, MSG, 0x0E | |
199 | sha256rnds2 STATE0, STATE1, MSG | |
200 | sha256msg1 MSGTMP1, MSGTMP2 | |
201 | ||
202 | movdqu MSG, [DPTRb + 2*16] | |
203 | pshufb MSG, SHUF_MASK | |
204 | movdqa MSGTMP2b, MSG | |
205 | paddd MSG, [TBL + 2*16] | |
206 | sha256rnds2 STATE1b, STATE0b, MSG | |
207 | pshufd MSG, MSG, 0x0E | |
208 | sha256rnds2 STATE0b, STATE1b, MSG | |
209 | sha256msg1 MSGTMP1b, MSGTMP2b | |
210 | ||
211 | ; /* Rounds 12-15 */ | |
212 | movdqu MSG, [DPTR + 3*16] | |
213 | pshufb MSG, SHUF_MASK | |
214 | movdqa MSGTMP3, MSG | |
215 | paddd MSG, [TBL + 3*16] | |
216 | sha256rnds2 STATE1, STATE0, MSG | |
217 | movdqa MSGTMP4, MSGTMP3 | |
218 | palignr MSGTMP4, MSGTMP2, 4 | |
219 | paddd MSGTMP0, MSGTMP4 | |
220 | sha256msg2 MSGTMP0, MSGTMP3 | |
221 | pshufd MSG, MSG, 0x0E | |
222 | sha256rnds2 STATE0, STATE1, MSG | |
223 | sha256msg1 MSGTMP2, MSGTMP3 | |
224 | ||
225 | movdqu MSG, [DPTRb + 3*16] | |
226 | pshufb MSG, SHUF_MASK | |
227 | movdqa MSGTMP3b, MSG | |
228 | paddd MSG, [TBL + 3*16] | |
229 | sha256rnds2 STATE1b, STATE0b, MSG | |
230 | movdqa MSGTMP4b, MSGTMP3b | |
231 | palignr MSGTMP4b, MSGTMP2b, 4 | |
232 | paddd MSGTMP0b, MSGTMP4b | |
233 | sha256msg2 MSGTMP0b, MSGTMP3b | |
234 | pshufd MSG, MSG, 0x0E | |
235 | sha256rnds2 STATE0b, STATE1b, MSG | |
236 | sha256msg1 MSGTMP2b, MSGTMP3b | |
237 | ||
238 | ; /* Rounds 16-19 */ | |
239 | movdqa MSG, MSGTMP0 | |
240 | paddd MSG, [TBL + 4*16] | |
241 | sha256rnds2 STATE1, STATE0, MSG | |
242 | movdqa MSGTMP4, MSGTMP0 | |
243 | palignr MSGTMP4, MSGTMP3, 4 | |
244 | paddd MSGTMP1, MSGTMP4 | |
245 | sha256msg2 MSGTMP1, MSGTMP0 | |
246 | pshufd MSG, MSG, 0x0E | |
247 | sha256rnds2 STATE0, STATE1, MSG | |
248 | sha256msg1 MSGTMP3, MSGTMP0 | |
249 | ||
250 | movdqa MSG, MSGTMP0b | |
251 | paddd MSG, [TBL + 4*16] | |
252 | sha256rnds2 STATE1b, STATE0b, MSG | |
253 | movdqa MSGTMP4b, MSGTMP0b | |
254 | palignr MSGTMP4b, MSGTMP3b, 4 | |
255 | paddd MSGTMP1b, MSGTMP4b | |
256 | sha256msg2 MSGTMP1b, MSGTMP0b | |
257 | pshufd MSG, MSG, 0x0E | |
258 | sha256rnds2 STATE0b, STATE1b, MSG | |
259 | sha256msg1 MSGTMP3b, MSGTMP0b | |
260 | ||
261 | ; /* Rounds 20-23 */ | |
262 | movdqa MSG, MSGTMP1 | |
263 | paddd MSG, [TBL + 5*16] | |
264 | sha256rnds2 STATE1, STATE0, MSG | |
265 | movdqa MSGTMP4, MSGTMP1 | |
266 | palignr MSGTMP4, MSGTMP0, 4 | |
267 | paddd MSGTMP2, MSGTMP4 | |
268 | sha256msg2 MSGTMP2, MSGTMP1 | |
269 | pshufd MSG, MSG, 0x0E | |
270 | sha256rnds2 STATE0, STATE1, MSG | |
271 | sha256msg1 MSGTMP0, MSGTMP1 | |
272 | ||
273 | movdqa MSG, MSGTMP1b | |
274 | paddd MSG, [TBL + 5*16] | |
275 | sha256rnds2 STATE1b, STATE0b, MSG | |
276 | movdqa MSGTMP4b, MSGTMP1b | |
277 | palignr MSGTMP4b, MSGTMP0b, 4 | |
278 | paddd MSGTMP2b, MSGTMP4b | |
279 | sha256msg2 MSGTMP2b, MSGTMP1b | |
280 | pshufd MSG, MSG, 0x0E | |
281 | sha256rnds2 STATE0b, STATE1b, MSG | |
282 | sha256msg1 MSGTMP0b, MSGTMP1b | |
283 | ||
284 | ; /* Rounds 24-27 */ | |
285 | movdqa MSG, MSGTMP2 | |
286 | paddd MSG, [TBL + 6*16] | |
287 | sha256rnds2 STATE1, STATE0, MSG | |
288 | movdqa MSGTMP4, MSGTMP2 | |
289 | palignr MSGTMP4, MSGTMP1, 4 | |
290 | paddd MSGTMP3, MSGTMP4 | |
291 | sha256msg2 MSGTMP3, MSGTMP2 | |
292 | pshufd MSG, MSG, 0x0E | |
293 | sha256rnds2 STATE0, STATE1, MSG | |
294 | sha256msg1 MSGTMP1, MSGTMP2 | |
295 | ||
296 | movdqa MSG, MSGTMP2b | |
297 | paddd MSG, [TBL + 6*16] | |
298 | sha256rnds2 STATE1b, STATE0b, MSG | |
299 | movdqa MSGTMP4b, MSGTMP2b | |
300 | palignr MSGTMP4b, MSGTMP1b, 4 | |
301 | paddd MSGTMP3b, MSGTMP4b | |
302 | sha256msg2 MSGTMP3b, MSGTMP2b | |
303 | pshufd MSG, MSG, 0x0E | |
304 | sha256rnds2 STATE0b, STATE1b, MSG | |
305 | sha256msg1 MSGTMP1b, MSGTMP2b | |
306 | ||
307 | ; /* Rounds 28-31 */ | |
308 | movdqa MSG, MSGTMP3 | |
309 | paddd MSG, [TBL + 7*16] | |
310 | sha256rnds2 STATE1, STATE0, MSG | |
311 | movdqa MSGTMP4, MSGTMP3 | |
312 | palignr MSGTMP4, MSGTMP2, 4 | |
313 | paddd MSGTMP0, MSGTMP4 | |
314 | sha256msg2 MSGTMP0, MSGTMP3 | |
315 | pshufd MSG, MSG, 0x0E | |
316 | sha256rnds2 STATE0, STATE1, MSG | |
317 | sha256msg1 MSGTMP2, MSGTMP3 | |
318 | ||
319 | movdqa MSG, MSGTMP3b | |
320 | paddd MSG, [TBL + 7*16] | |
321 | sha256rnds2 STATE1b, STATE0b, MSG | |
322 | movdqa MSGTMP4b, MSGTMP3b | |
323 | palignr MSGTMP4b, MSGTMP2b, 4 | |
324 | paddd MSGTMP0b, MSGTMP4b | |
325 | sha256msg2 MSGTMP0b, MSGTMP3b | |
326 | pshufd MSG, MSG, 0x0E | |
327 | sha256rnds2 STATE0b, STATE1b, MSG | |
328 | sha256msg1 MSGTMP2b, MSGTMP3b | |
329 | ||
330 | ; /* Rounds 32-35 */ | |
331 | movdqa MSG, MSGTMP0 | |
332 | paddd MSG, [TBL + 8*16] | |
333 | sha256rnds2 STATE1, STATE0, MSG | |
334 | movdqa MSGTMP4, MSGTMP0 | |
335 | palignr MSGTMP4, MSGTMP3, 4 | |
336 | paddd MSGTMP1, MSGTMP4 | |
337 | sha256msg2 MSGTMP1, MSGTMP0 | |
338 | pshufd MSG, MSG, 0x0E | |
339 | sha256rnds2 STATE0, STATE1, MSG | |
340 | sha256msg1 MSGTMP3, MSGTMP0 | |
341 | ||
342 | movdqa MSG, MSGTMP0b | |
343 | paddd MSG, [TBL + 8*16] | |
344 | sha256rnds2 STATE1b, STATE0b, MSG | |
345 | movdqa MSGTMP4b, MSGTMP0b | |
346 | palignr MSGTMP4b, MSGTMP3b, 4 | |
347 | paddd MSGTMP1b, MSGTMP4b | |
348 | sha256msg2 MSGTMP1b, MSGTMP0b | |
349 | pshufd MSG, MSG, 0x0E | |
350 | sha256rnds2 STATE0b, STATE1b, MSG | |
351 | sha256msg1 MSGTMP3b, MSGTMP0b | |
352 | ||
353 | ; /* Rounds 36-39 */ | |
354 | movdqa MSG, MSGTMP1 | |
355 | paddd MSG, [TBL + 9*16] | |
356 | sha256rnds2 STATE1, STATE0, MSG | |
357 | movdqa MSGTMP4, MSGTMP1 | |
358 | palignr MSGTMP4, MSGTMP0, 4 | |
359 | paddd MSGTMP2, MSGTMP4 | |
360 | sha256msg2 MSGTMP2, MSGTMP1 | |
361 | pshufd MSG, MSG, 0x0E | |
362 | sha256rnds2 STATE0, STATE1, MSG | |
363 | sha256msg1 MSGTMP0, MSGTMP1 | |
364 | ||
365 | movdqa MSG, MSGTMP1b | |
366 | paddd MSG, [TBL + 9*16] | |
367 | sha256rnds2 STATE1b, STATE0b, MSG | |
368 | movdqa MSGTMP4b, MSGTMP1b | |
369 | palignr MSGTMP4b, MSGTMP0b, 4 | |
370 | paddd MSGTMP2b, MSGTMP4b | |
371 | sha256msg2 MSGTMP2b, MSGTMP1b | |
372 | pshufd MSG, MSG, 0x0E | |
373 | sha256rnds2 STATE0b, STATE1b, MSG | |
374 | sha256msg1 MSGTMP0b, MSGTMP1b | |
375 | ||
376 | ; /* Rounds 40-43 */ | |
377 | movdqa MSG, MSGTMP2 | |
378 | paddd MSG, [TBL + 10*16] | |
379 | sha256rnds2 STATE1, STATE0, MSG | |
380 | movdqa MSGTMP4, MSGTMP2 | |
381 | palignr MSGTMP4, MSGTMP1, 4 | |
382 | paddd MSGTMP3, MSGTMP4 | |
383 | sha256msg2 MSGTMP3, MSGTMP2 | |
384 | pshufd MSG, MSG, 0x0E | |
385 | sha256rnds2 STATE0, STATE1, MSG | |
386 | sha256msg1 MSGTMP1, MSGTMP2 | |
387 | ||
388 | movdqa MSG, MSGTMP2b | |
389 | paddd MSG, [TBL + 10*16] | |
390 | sha256rnds2 STATE1b, STATE0b, MSG | |
391 | movdqa MSGTMP4b, MSGTMP2b | |
392 | palignr MSGTMP4b, MSGTMP1b, 4 | |
393 | paddd MSGTMP3b, MSGTMP4b | |
394 | sha256msg2 MSGTMP3b, MSGTMP2b | |
395 | pshufd MSG, MSG, 0x0E | |
396 | sha256rnds2 STATE0b, STATE1b, MSG | |
397 | sha256msg1 MSGTMP1b, MSGTMP2b | |
398 | ||
399 | ; /* Rounds 44-47 */ | |
400 | movdqa MSG, MSGTMP3 | |
401 | paddd MSG, [TBL + 11*16] | |
402 | sha256rnds2 STATE1, STATE0, MSG | |
403 | movdqa MSGTMP4, MSGTMP3 | |
404 | palignr MSGTMP4, MSGTMP2, 4 | |
405 | paddd MSGTMP0, MSGTMP4 | |
406 | sha256msg2 MSGTMP0, MSGTMP3 | |
407 | pshufd MSG, MSG, 0x0E | |
408 | sha256rnds2 STATE0, STATE1, MSG | |
409 | sha256msg1 MSGTMP2, MSGTMP3 | |
410 | ||
411 | movdqa MSG, MSGTMP3b | |
412 | paddd MSG, [TBL + 11*16] | |
413 | sha256rnds2 STATE1b, STATE0b, MSG | |
414 | movdqa MSGTMP4b, MSGTMP3b | |
415 | palignr MSGTMP4b, MSGTMP2b, 4 | |
416 | paddd MSGTMP0b, MSGTMP4b | |
417 | sha256msg2 MSGTMP0b, MSGTMP3b | |
418 | pshufd MSG, MSG, 0x0E | |
419 | sha256rnds2 STATE0b, STATE1b, MSG | |
420 | sha256msg1 MSGTMP2b, MSGTMP3b | |
421 | ||
422 | ; /* Rounds 48-51 */ | |
423 | movdqa MSG, MSGTMP0 | |
424 | paddd MSG, [TBL + 12*16] | |
425 | sha256rnds2 STATE1, STATE0, MSG | |
426 | movdqa MSGTMP4, MSGTMP0 | |
427 | palignr MSGTMP4, MSGTMP3, 4 | |
428 | paddd MSGTMP1, MSGTMP4 | |
429 | sha256msg2 MSGTMP1, MSGTMP0 | |
430 | pshufd MSG, MSG, 0x0E | |
431 | sha256rnds2 STATE0, STATE1, MSG | |
432 | sha256msg1 MSGTMP3, MSGTMP0 | |
433 | ||
434 | movdqa MSG, MSGTMP0b | |
435 | paddd MSG, [TBL + 12*16] | |
436 | sha256rnds2 STATE1b, STATE0b, MSG | |
437 | movdqa MSGTMP4b, MSGTMP0b | |
438 | palignr MSGTMP4b, MSGTMP3b, 4 | |
439 | paddd MSGTMP1b, MSGTMP4b | |
440 | sha256msg2 MSGTMP1b, MSGTMP0b | |
441 | pshufd MSG, MSG, 0x0E | |
442 | sha256rnds2 STATE0b, STATE1b, MSG | |
443 | sha256msg1 MSGTMP3b, MSGTMP0b | |
444 | ||
445 | ; /* Rounds 52-55 */ | |
446 | movdqa MSG, MSGTMP1 | |
447 | paddd MSG, [TBL + 13*16] | |
448 | sha256rnds2 STATE1, STATE0, MSG | |
449 | movdqa MSGTMP4, MSGTMP1 | |
450 | palignr MSGTMP4, MSGTMP0, 4 | |
451 | paddd MSGTMP2, MSGTMP4 | |
452 | sha256msg2 MSGTMP2, MSGTMP1 | |
453 | pshufd MSG, MSG, 0x0E | |
454 | sha256rnds2 STATE0, STATE1, MSG | |
455 | ||
456 | movdqa MSG, MSGTMP1b | |
457 | paddd MSG, [TBL + 13*16] | |
458 | sha256rnds2 STATE1b, STATE0b, MSG | |
459 | movdqa MSGTMP4b, MSGTMP1b | |
460 | palignr MSGTMP4b, MSGTMP0b, 4 | |
461 | paddd MSGTMP2b, MSGTMP4b | |
462 | sha256msg2 MSGTMP2b, MSGTMP1b | |
463 | pshufd MSG, MSG, 0x0E | |
464 | sha256rnds2 STATE0b, STATE1b, MSG | |
465 | ||
466 | ; /* Rounds 56-59 */ | |
467 | movdqa MSG, MSGTMP2 | |
468 | paddd MSG, [TBL + 14*16] | |
469 | sha256rnds2 STATE1, STATE0, MSG | |
470 | movdqa MSGTMP4, MSGTMP2 | |
471 | palignr MSGTMP4, MSGTMP1, 4 | |
472 | paddd MSGTMP3, MSGTMP4 | |
473 | sha256msg2 MSGTMP3, MSGTMP2 | |
474 | pshufd MSG, MSG, 0x0E | |
475 | sha256rnds2 STATE0, STATE1, MSG | |
476 | ||
477 | movdqa MSG, MSGTMP2b | |
478 | paddd MSG, [TBL + 14*16] | |
479 | sha256rnds2 STATE1b, STATE0b, MSG | |
480 | movdqa MSGTMP4b, MSGTMP2b | |
481 | palignr MSGTMP4b, MSGTMP1b, 4 | |
482 | paddd MSGTMP3b, MSGTMP4b | |
483 | sha256msg2 MSGTMP3b, MSGTMP2b | |
484 | pshufd MSG, MSG, 0x0E | |
485 | sha256rnds2 STATE0b, STATE1b, MSG | |
486 | ||
487 | ; /* Rounds 60-63 */ | |
488 | movdqa MSG, MSGTMP3 | |
489 | paddd MSG, [TBL + 15*16] | |
490 | sha256rnds2 STATE1, STATE0, MSG | |
491 | pshufd MSG, MSG, 0x0E | |
492 | sha256rnds2 STATE0, STATE1, MSG | |
493 | ||
494 | movdqa MSG, MSGTMP3b | |
495 | paddd MSG, [TBL + 15*16] | |
496 | sha256rnds2 STATE1b, STATE0b, MSG | |
497 | pshufd MSG, MSG, 0x0E | |
498 | sha256rnds2 STATE0b, STATE1b, MSG | |
499 | ||
500 | ; /* Add current hash values with previously saved */ | |
501 | paddd STATE0, [rsp + 0*16] | |
502 | paddd STATE1, [rsp + 1*16] | |
503 | ||
504 | paddd STATE0b, [rsp + 2*16] | |
505 | paddd STATE1b, [rsp + 3*16] | |
506 | ||
507 | ; Increment both data pointers by one 64-byte block; loop until lane 0 reaches the end address in NBLK | |
508 | add DPTR, 64 | |
509 | add DPTRb, 64 | |
510 | cmp DPTR, NBLK | |
511 | jne lloop | |
512 | ||
513 | ; write out digests (lane 0) | |
514 | lea TMP, [MGR + 4*0] | |
515 | ;; ABEF(state0), CDGH(state1) -> digests | |
516 | pextrd [TMP + 0*NLANX4], STATE0, 3 ; A | |
517 | pextrd [TMP + 1*NLANX4], STATE0, 2 ; B | |
518 | pextrd [TMP + 2*NLANX4], STATE1, 3 ; C | |
519 | lea TMP, [TMP + 2*NLANX4] ; TMP = MGR + 4*0 + 2*NLANX4 | |
520 | pextrd [TMP + 1*NLANX4], STATE1, 2 ; D | |
521 | pextrd [TMP + 2*NLANX4], STATE0, 1 ; E | |
522 | pextrd [TMP + 4*NLANX4], STATE1, 1 ; G | |
523 | lea TMP, [TMP + 1*NLANX4] ; TMP = MGR + 4*0 + 3*NLANX4 | |
524 | pextrd [TMP + 2*NLANX4], STATE0, 0 ; F | |
525 | pextrd [TMP + 4*NLANX4], STATE1, 0 ; H | |
526 | ||
527 | lea TMPb, [MGR + 4*1] | |
528 | ;; ABEF(state0b), CDGH(state1b) -> digests (lane 1) | |
529 | pextrd [TMPb + 0*NLANX4], STATE0b, 3 ; A | |
530 | pextrd [TMPb + 1*NLANX4], STATE0b, 2 ; B | |
531 | pextrd [TMPb + 2*NLANX4], STATE1b, 3 ; C | |
532 | lea TMPb, [TMPb + 2*NLANX4] ; TMPb = MGR + 4*1 + 2*NLANX4 | |
533 | pextrd [TMPb + 1*NLANX4], STATE1b, 2 ; D | |
534 | pextrd [TMPb + 2*NLANX4], STATE0b, 1 ; E | |
535 | pextrd [TMPb + 4*NLANX4], STATE1b, 1 ; G | |
536 | lea TMPb, [TMPb + 1*NLANX4] ; TMPb = MGR + 4*1 + 3*NLANX4 | |
537 | pextrd [TMPb + 2*NLANX4], STATE0b, 0 ; F | |
538 | pextrd [TMPb + 4*NLANX4], STATE1b, 0 ; H | |
539 | ||
540 | ; update input pointers: store the advanced DPTR/DPTRb back into the args structure | |
541 | mov [MGR + _data_ptr + 0*8], DPTR | |
542 | mov [MGR + _data_ptr + 1*8], DPTRb | |
543 | ||
544 | backto_mgr: | |
545 | ;;;;;;;;;;;;;;;; | |
546 | ;; Postamble: restore the stack pointer saved in RSPSAVE at entry | |
547 | mov rsp, RSPSAVE | |
548 | ||
549 | ret | |
550 | ||
551 | section .data align=16 | |
552 | PSHUFFLE_SHANI_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b | | ; pshufb mask: reverses the byte order inside each 32-bit word of a 16-byte message chunk
553 | TABLE: dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 | | ; 64 SHA-256 round constants; row i (4 dwords) is read as [TBL + i*16] for rounds 4*i .. 4*i+3
554 | dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 | |
555 | dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 | |
556 | dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 | |
557 | dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc | |
558 | dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da | |
559 | dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 | |
560 | dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 | |
561 | dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 | |
562 | dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 | |
563 | dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 | |
564 | dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 | |
565 | dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 | |
566 | dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 | |
567 | dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 | |
568 | dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 | |
569 | %else | |
570 | %ifidn __OUTPUT_FORMAT__, win64 | |
571 | global no_sha256_ni_x2 | |
572 | no_sha256_ni_x2: | |
573 | %endif | |
574 | %endif ; HAVE_AS_KNOWS_SHANI -- when it is not defined, only win64 builds emit the empty placeholder label no_sha256_ni_x2 above |