]> git.proxmox.com Git - ceph.git/blob - ceph/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ni_x2.asm
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / crypto / isa-l / isa-l_crypto / sha256_mb / sha256_ni_x2.asm
1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
3 ;
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
6 ; are met:
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
12 ; distribution.
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
16 ;
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29
30 %include "sha256_mb_mgr_datastruct.asm"
31 %include "reg_sizes.asm"
32
33 %ifdef HAVE_AS_KNOWS_SHANI
34
35 [bits 64]
36 default rel
37 section .text
38
39 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
40 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
41 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
42
43 %ifidn __OUTPUT_FORMAT__, elf64
44 ; Linux (ELF64 output): the first two integer arguments are taken from rdi and rsi
45 %define arg0 rdi
46 %define arg1 rsi
47 %else
48 ; Windows (any non-ELF64 output format): the first two integer arguments are taken from rcx and rdx
49 %define arg0 rcx
50 %define arg1 rdx
51 %endif
52
53 ;; FRAMESZ plus pushes must be an odd multiple of 8 (the routine below makes no pushes and re-aligns rsp with an AND)
54 %define FRAMESZ 64 ; stack space for four 16-byte saved state vectors (STATE0/STATE1 of both lanes)
55 %define RSPSAVE rax ; holds the caller's rsp between prologue and epilogue
56
57 %define MSG xmm0 ; current message words + constants; xmm0 is the implicit third operand of sha256rnds2
58 %define STATE0 xmm1 ; first lane: working state, words A,B,E,F
59 %define STATE1 xmm2 ; first lane: working state, words C,D,G,H
60 %define MSGTMP0 xmm3 ; first lane: message schedule window, part 0
61 %define MSGTMP1 xmm4 ; first lane: message schedule window, part 1
62 %define MSGTMP2 xmm5 ; first lane: message schedule window, part 2
63 %define MSGTMP3 xmm6 ; first lane: message schedule window, part 3
64 %define MSGTMP4 xmm7 ; first lane: scratch for the palignr step of the schedule
65
66 %define STATE0b xmm8 ; second lane: working state, words A,B,E,F
67 %define STATE1b xmm9 ; second lane: working state, words C,D,G,H
68 %define MSGTMP0b xmm10 ; second lane: message schedule window, part 0
69 %define MSGTMP1b xmm11 ; second lane: message schedule window, part 1
70 %define MSGTMP2b xmm12 ; second lane: message schedule window, part 2
71 %define MSGTMP3b xmm13 ; second lane: message schedule window, part 3
72 %define MSGTMP4b xmm14 ; second lane: scratch for the palignr step of the schedule
73
74 %define SHUF_MASK xmm15 ; pshufb control loaded from PSHUFFLE_SHANI_MASK
75
76 ; argument indices start from 0 here, while mgr_flush/submit number them from 1
77 %define MGR arg0
78 %define NBLK arg1
79 %define NLANX4 r10 ; consistent with caller; on entry the low byte carries IDX, the bits above it carry NLANX4
80 %define IDX r8 ; local variable -- consistent with caller
81 %define DPTR r11 ; local variable -- input buffer pointer (first lane)
82 %define DPTRb r12 ; local variable -- input buffer pointer (second lane)
83 %define TMP r9 ; local variable -- assistant to address digest (first lane)
84 %define TBL r13 ; local variable -- address of the constant TABLE
85 %define TMPb r14 ; local variable -- assistant to address digest (second lane)
86 align 32 ; the entry point below is the next thing emitted, so it starts on a 32-byte boundary
87
88 ; void sha256_ni_x2(SHA256_MB_ARGS_Xn *args, uint32_t size_in_blocks);
89 ; arg 0 : MGR : pointer to args (this routine reads and writes only lanes 0 and 1)
90 ; arg 1 : NBLK : size (in blocks) ;; assumed to be >= 1 (a value of 0 returns without touching args)
91 ; invisible arg 2 : IDX : hash on which lane (low byte of r10; unpacked into r8 but not otherwise used here)
92 ; invisible arg 3 : NLANX4 : max lanes*4 for this arch (digest is placed by it), passed in r10 above the low byte
93 ; (sse/avx is 4, avx2 is 8, avx512 is 16)
94 ; Digest word k of lane j is addressed as [MGR + 4*j + k*NLANX4]; both lanes process the same number of blocks.
95 ; Clobbers registers: rax, r9~r14, xmm0-xmm15; also rewrites r8 (IDX), r10 (shifted) and the arg1 register (becomes an end pointer)
96 ; NOTE(review): r12-r14 (and xmm6-xmm15 on Win64) are callee-saved in the standard ABIs and are not saved here -- presumably the caller preserves them; confirm
97 mk_global sha256_ni_x2, function, internal ; mk_global is a macro defined outside this file
98 sha256_ni_x2:
99 endbranch ; macro defined outside this file -- presumably an indirect-branch landing pad; confirm
100 mov RSPSAVE, rsp ; keep the caller's rsp in rax for the epilogue
101 sub rsp, FRAMESZ ; reserve the state save area
102 and rsp, ~0xF ; Align 16 bytes downward so the movdqa/paddd accesses to [rsp + n*16] are aligned
103
104 shl NBLK, 6 ; transform blk amount into bytes (64 bytes per block); ZF is set when the result is 0
105 jz backto_mgr ; no data: restore rsp and return
106
107 ; detach idx from nlanx4 (r10 arrives holding both: IDX in the low byte, NLANX4 above it)
108 mov IDX, NLANX4
109 shr NLANX4, 8 ; NLANX4 = byte stride between consecutive digest words
110 and IDX, 0xff ; IDX = lane index (unused below; lanes 0 and 1 are hard-coded)
111
112 lea TMP, [MGR + 4*0] ; TMP -> digest word A of lane 0
113 lea TMPb, [MGR + 4*1] ; TMPb -> digest word A of lane 1
114
115 ;; Initialize digest
116 ;; digests -> ABEF(state0), CDGH(state1); all four dwords of each register get written
117 pinsrd STATE0, [TMP + 0*NLANX4], 3 ; A
118 pinsrd STATE0, [TMP + 1*NLANX4], 2 ; B
119 pinsrd STATE1, [TMP + 2*NLANX4], 3 ; C
120 lea TMP, [TMP + 2*NLANX4] ; TMP = MGR + 4*0 + 2*NLANX4 (only scale factors 1, 2, 4, 8 are encodable)
121 pinsrd STATE1, [TMP + 1*NLANX4], 2 ; D
122 pinsrd STATE0, [TMP + 2*NLANX4], 1 ; E
123 pinsrd STATE1, [TMP + 4*NLANX4], 1 ; G
124 lea TMP, [TMP + 1*NLANX4] ; TMP = MGR + 4*0 + 3*NLANX4
125 pinsrd STATE0, [TMP + 2*NLANX4], 0 ; F
126 pinsrd STATE1, [TMP + 4*NLANX4], 0 ; H
127
128 pinsrd STATE0b, [TMPb + 0*NLANX4], 3 ; A
129 pinsrd STATE0b, [TMPb + 1*NLANX4], 2 ; B
130 pinsrd STATE1b, [TMPb + 2*NLANX4], 3 ; C
131 lea TMPb, [TMPb + 2*NLANX4] ; TMPb = MGR + 4*1 + 2*NLANX4
132 pinsrd STATE1b, [TMPb + 1*NLANX4], 2 ; D
133 pinsrd STATE0b, [TMPb + 2*NLANX4], 1 ; E
134 pinsrd STATE1b, [TMPb + 4*NLANX4], 1 ; G
135 lea TMPb, [TMPb + 1*NLANX4] ; TMPb = MGR + 4*1 + 3*NLANX4
136 pinsrd STATE0b, [TMPb + 2*NLANX4], 0 ; F
137 pinsrd STATE1b, [TMPb + 4*NLANX4], 0 ; H
138
139 movdqa SHUF_MASK, [PSHUFFLE_SHANI_MASK] ; byte-reversal mask for pshufb (aligned load from the align=16 data section)
140 lea TBL, [TABLE] ; TBL -> constant table (RIP-relative because of "default rel")
141
142 ;; Load input pointers
143 mov DPTR, [MGR + _data_ptr + 8*0] ; lane 0 input pointer
144 mov DPTRb,[MGR + _data_ptr + 8*1] ; lane 1 input pointer
145 ;; nblk is used to indicate data end
146 add NBLK, DPTR ; NBLK = lane 0 start + byte count = lane 0 end pointer
147
148 lloop: ; one iteration hashes one 64-byte block of lane 0 and one of lane 1
149 ; /* Save hash values for addition after rounds */
150 movdqa [rsp + 0*16], STATE0
151 movdqa [rsp + 1*16], STATE1
152
153 movdqa [rsp + 2*16], STATE0b
154 movdqa [rsp + 3*16], STATE1b
155
156 ; /* Rounds 0-3 */
157 movdqu MSG, [DPTR + 0*16] ; lane 0: message bytes 0-15 (input may be unaligned)
158 pshufb MSG, SHUF_MASK ; reverse the bytes inside each dword
159 movdqa MSGTMP0, MSG ; keep the words for the message schedule
160 paddd MSG, [TBL + 0*16] ; add the four constants for these rounds
161 sha256rnds2 STATE1, STATE0, MSG ; two rounds, consuming the low two dwords of MSG
162 pshufd MSG, MSG, 0x0E ; bring the high two dwords of MSG down into the low half
163 sha256rnds2 STATE0, STATE1, MSG ; two more rounds
164
165 movdqu MSG, [DPTRb + 0*16] ; lane 1: same four rounds (MSG/xmm0 is shared between the lanes)
166 pshufb MSG, SHUF_MASK
167 movdqa MSGTMP0b, MSG
168 paddd MSG, [TBL + 0*16]
169 sha256rnds2 STATE1b, STATE0b, MSG
170 pshufd MSG, MSG, 0x0E
171 sha256rnds2 STATE0b, STATE1b, MSG
172
173 ; /* Rounds 4-7 */
174 movdqu MSG, [DPTR + 1*16] ; lane 0: message bytes 16-31
175 pshufb MSG, SHUF_MASK
176 movdqa MSGTMP1, MSG
177 paddd MSG, [TBL + 1*16]
178 sha256rnds2 STATE1, STATE0, MSG
179 pshufd MSG, MSG, 0x0E
180 sha256rnds2 STATE0, STATE1, MSG
181 sha256msg1 MSGTMP0, MSGTMP1 ; first half of the schedule update for the words consumed in rounds 16-19
182
183 movdqu MSG, [DPTRb + 1*16] ; lane 1: same four rounds
184 pshufb MSG, SHUF_MASK
185 movdqa MSGTMP1b, MSG
186 paddd MSG, [TBL + 1*16]
187 sha256rnds2 STATE1b, STATE0b, MSG
188 pshufd MSG, MSG, 0x0E
189 sha256rnds2 STATE0b, STATE1b, MSG
190 sha256msg1 MSGTMP0b, MSGTMP1b
191
192 ; /* Rounds 8-11 */
193 movdqu MSG, [DPTR + 2*16] ; lane 0: message bytes 32-47
194 pshufb MSG, SHUF_MASK
195 movdqa MSGTMP2, MSG
196 paddd MSG, [TBL + 2*16]
197 sha256rnds2 STATE1, STATE0, MSG
198 pshufd MSG, MSG, 0x0E
199 sha256rnds2 STATE0, STATE1, MSG
200 sha256msg1 MSGTMP1, MSGTMP2
201
202 movdqu MSG, [DPTRb + 2*16] ; lane 1: same four rounds
203 pshufb MSG, SHUF_MASK
204 movdqa MSGTMP2b, MSG
205 paddd MSG, [TBL + 2*16]
206 sha256rnds2 STATE1b, STATE0b, MSG
207 pshufd MSG, MSG, 0x0E
208 sha256rnds2 STATE0b, STATE1b, MSG
209 sha256msg1 MSGTMP1b, MSGTMP2b
210
211 ; /* Rounds 12-15 */
212 movdqu MSG, [DPTR + 3*16] ; lane 0: message bytes 48-63 (last load from the input block)
213 pshufb MSG, SHUF_MASK
214 movdqa MSGTMP3, MSG
215 paddd MSG, [TBL + 3*16]
216 sha256rnds2 STATE1, STATE0, MSG
217 movdqa MSGTMP4, MSGTMP3
218 palignr MSGTMP4, MSGTMP2, 4 ; MSGTMP4 = dwords 1-3 of MSGTMP2 followed by dword 0 of MSGTMP3
219 paddd MSGTMP0, MSGTMP4
220 sha256msg2 MSGTMP0, MSGTMP3 ; MSGTMP0 now holds the four schedule words consumed in rounds 16-19
221 pshufd MSG, MSG, 0x0E
222 sha256rnds2 STATE0, STATE1, MSG
223 sha256msg1 MSGTMP2, MSGTMP3
224
225 movdqu MSG, [DPTRb + 3*16] ; lane 1: same four rounds
226 pshufb MSG, SHUF_MASK
227 movdqa MSGTMP3b, MSG
228 paddd MSG, [TBL + 3*16]
229 sha256rnds2 STATE1b, STATE0b, MSG
230 movdqa MSGTMP4b, MSGTMP3b
231 palignr MSGTMP4b, MSGTMP2b, 4
232 paddd MSGTMP0b, MSGTMP4b
233 sha256msg2 MSGTMP0b, MSGTMP3b
234 pshufd MSG, MSG, 0x0E
235 sha256rnds2 STATE0b, STATE1b, MSG
236 sha256msg1 MSGTMP2b, MSGTMP3b
237
238 ; /* Rounds 16-19 */ from here on the round input comes from the schedule registers, rotating MSGTMP0 -> 1 -> 2 -> 3
239 movdqa MSG, MSGTMP0
240 paddd MSG, [TBL + 4*16]
241 sha256rnds2 STATE1, STATE0, MSG
242 movdqa MSGTMP4, MSGTMP0
243 palignr MSGTMP4, MSGTMP3, 4
244 paddd MSGTMP1, MSGTMP4
245 sha256msg2 MSGTMP1, MSGTMP0 ; MSGTMP1 = schedule words for rounds 20-23
246 pshufd MSG, MSG, 0x0E
247 sha256rnds2 STATE0, STATE1, MSG
248 sha256msg1 MSGTMP3, MSGTMP0
249
250 movdqa MSG, MSGTMP0b ; lane 1: same four rounds
251 paddd MSG, [TBL + 4*16]
252 sha256rnds2 STATE1b, STATE0b, MSG
253 movdqa MSGTMP4b, MSGTMP0b
254 palignr MSGTMP4b, MSGTMP3b, 4
255 paddd MSGTMP1b, MSGTMP4b
256 sha256msg2 MSGTMP1b, MSGTMP0b
257 pshufd MSG, MSG, 0x0E
258 sha256rnds2 STATE0b, STATE1b, MSG
259 sha256msg1 MSGTMP3b, MSGTMP0b
260
261 ; /* Rounds 20-23 */
262 movdqa MSG, MSGTMP1
263 paddd MSG, [TBL + 5*16]
264 sha256rnds2 STATE1, STATE0, MSG
265 movdqa MSGTMP4, MSGTMP1
266 palignr MSGTMP4, MSGTMP0, 4
267 paddd MSGTMP2, MSGTMP4
268 sha256msg2 MSGTMP2, MSGTMP1 ; MSGTMP2 = schedule words for rounds 24-27
269 pshufd MSG, MSG, 0x0E
270 sha256rnds2 STATE0, STATE1, MSG
271 sha256msg1 MSGTMP0, MSGTMP1
272
273 movdqa MSG, MSGTMP1b ; lane 1: same four rounds
274 paddd MSG, [TBL + 5*16]
275 sha256rnds2 STATE1b, STATE0b, MSG
276 movdqa MSGTMP4b, MSGTMP1b
277 palignr MSGTMP4b, MSGTMP0b, 4
278 paddd MSGTMP2b, MSGTMP4b
279 sha256msg2 MSGTMP2b, MSGTMP1b
280 pshufd MSG, MSG, 0x0E
281 sha256rnds2 STATE0b, STATE1b, MSG
282 sha256msg1 MSGTMP0b, MSGTMP1b
283
284 ; /* Rounds 24-27 */
285 movdqa MSG, MSGTMP2
286 paddd MSG, [TBL + 6*16]
287 sha256rnds2 STATE1, STATE0, MSG
288 movdqa MSGTMP4, MSGTMP2
289 palignr MSGTMP4, MSGTMP1, 4
290 paddd MSGTMP3, MSGTMP4
291 sha256msg2 MSGTMP3, MSGTMP2 ; MSGTMP3 = schedule words for rounds 28-31
292 pshufd MSG, MSG, 0x0E
293 sha256rnds2 STATE0, STATE1, MSG
294 sha256msg1 MSGTMP1, MSGTMP2
295
296 movdqa MSG, MSGTMP2b ; lane 1: same four rounds
297 paddd MSG, [TBL + 6*16]
298 sha256rnds2 STATE1b, STATE0b, MSG
299 movdqa MSGTMP4b, MSGTMP2b
300 palignr MSGTMP4b, MSGTMP1b, 4
301 paddd MSGTMP3b, MSGTMP4b
302 sha256msg2 MSGTMP3b, MSGTMP2b
303 pshufd MSG, MSG, 0x0E
304 sha256rnds2 STATE0b, STATE1b, MSG
305 sha256msg1 MSGTMP1b, MSGTMP2b
306
307 ; /* Rounds 28-31 */
308 movdqa MSG, MSGTMP3
309 paddd MSG, [TBL + 7*16]
310 sha256rnds2 STATE1, STATE0, MSG
311 movdqa MSGTMP4, MSGTMP3
312 palignr MSGTMP4, MSGTMP2, 4
313 paddd MSGTMP0, MSGTMP4
314 sha256msg2 MSGTMP0, MSGTMP3 ; MSGTMP0 = schedule words for rounds 32-35
315 pshufd MSG, MSG, 0x0E
316 sha256rnds2 STATE0, STATE1, MSG
317 sha256msg1 MSGTMP2, MSGTMP3
318
319 movdqa MSG, MSGTMP3b ; lane 1: same four rounds
320 paddd MSG, [TBL + 7*16]
321 sha256rnds2 STATE1b, STATE0b, MSG
322 movdqa MSGTMP4b, MSGTMP3b
323 palignr MSGTMP4b, MSGTMP2b, 4
324 paddd MSGTMP0b, MSGTMP4b
325 sha256msg2 MSGTMP0b, MSGTMP3b
326 pshufd MSG, MSG, 0x0E
327 sha256rnds2 STATE0b, STATE1b, MSG
328 sha256msg1 MSGTMP2b, MSGTMP3b
329
330 ; /* Rounds 32-35 */
331 movdqa MSG, MSGTMP0
332 paddd MSG, [TBL + 8*16]
333 sha256rnds2 STATE1, STATE0, MSG
334 movdqa MSGTMP4, MSGTMP0
335 palignr MSGTMP4, MSGTMP3, 4
336 paddd MSGTMP1, MSGTMP4
337 sha256msg2 MSGTMP1, MSGTMP0 ; MSGTMP1 = schedule words for rounds 36-39
338 pshufd MSG, MSG, 0x0E
339 sha256rnds2 STATE0, STATE1, MSG
340 sha256msg1 MSGTMP3, MSGTMP0
341
342 movdqa MSG, MSGTMP0b ; lane 1: same four rounds
343 paddd MSG, [TBL + 8*16]
344 sha256rnds2 STATE1b, STATE0b, MSG
345 movdqa MSGTMP4b, MSGTMP0b
346 palignr MSGTMP4b, MSGTMP3b, 4
347 paddd MSGTMP1b, MSGTMP4b
348 sha256msg2 MSGTMP1b, MSGTMP0b
349 pshufd MSG, MSG, 0x0E
350 sha256rnds2 STATE0b, STATE1b, MSG
351 sha256msg1 MSGTMP3b, MSGTMP0b
352
353 ; /* Rounds 36-39 */
354 movdqa MSG, MSGTMP1
355 paddd MSG, [TBL + 9*16]
356 sha256rnds2 STATE1, STATE0, MSG
357 movdqa MSGTMP4, MSGTMP1
358 palignr MSGTMP4, MSGTMP0, 4
359 paddd MSGTMP2, MSGTMP4
360 sha256msg2 MSGTMP2, MSGTMP1 ; MSGTMP2 = schedule words for rounds 40-43
361 pshufd MSG, MSG, 0x0E
362 sha256rnds2 STATE0, STATE1, MSG
363 sha256msg1 MSGTMP0, MSGTMP1
364
365 movdqa MSG, MSGTMP1b ; lane 1: same four rounds
366 paddd MSG, [TBL + 9*16]
367 sha256rnds2 STATE1b, STATE0b, MSG
368 movdqa MSGTMP4b, MSGTMP1b
369 palignr MSGTMP4b, MSGTMP0b, 4
370 paddd MSGTMP2b, MSGTMP4b
371 sha256msg2 MSGTMP2b, MSGTMP1b
372 pshufd MSG, MSG, 0x0E
373 sha256rnds2 STATE0b, STATE1b, MSG
374 sha256msg1 MSGTMP0b, MSGTMP1b
375
376 ; /* Rounds 40-43 */
377 movdqa MSG, MSGTMP2
378 paddd MSG, [TBL + 10*16]
379 sha256rnds2 STATE1, STATE0, MSG
380 movdqa MSGTMP4, MSGTMP2
381 palignr MSGTMP4, MSGTMP1, 4
382 paddd MSGTMP3, MSGTMP4
383 sha256msg2 MSGTMP3, MSGTMP2 ; MSGTMP3 = schedule words for rounds 44-47
384 pshufd MSG, MSG, 0x0E
385 sha256rnds2 STATE0, STATE1, MSG
386 sha256msg1 MSGTMP1, MSGTMP2
387
388 movdqa MSG, MSGTMP2b ; lane 1: same four rounds
389 paddd MSG, [TBL + 10*16]
390 sha256rnds2 STATE1b, STATE0b, MSG
391 movdqa MSGTMP4b, MSGTMP2b
392 palignr MSGTMP4b, MSGTMP1b, 4
393 paddd MSGTMP3b, MSGTMP4b
394 sha256msg2 MSGTMP3b, MSGTMP2b
395 pshufd MSG, MSG, 0x0E
396 sha256rnds2 STATE0b, STATE1b, MSG
397 sha256msg1 MSGTMP1b, MSGTMP2b
398
399 ; /* Rounds 44-47 */
400 movdqa MSG, MSGTMP3
401 paddd MSG, [TBL + 11*16]
402 sha256rnds2 STATE1, STATE0, MSG
403 movdqa MSGTMP4, MSGTMP3
404 palignr MSGTMP4, MSGTMP2, 4
405 paddd MSGTMP0, MSGTMP4
406 sha256msg2 MSGTMP0, MSGTMP3 ; MSGTMP0 = schedule words for rounds 48-51
407 pshufd MSG, MSG, 0x0E
408 sha256rnds2 STATE0, STATE1, MSG
409 sha256msg1 MSGTMP2, MSGTMP3
410
411 movdqa MSG, MSGTMP3b ; lane 1: same four rounds
412 paddd MSG, [TBL + 11*16]
413 sha256rnds2 STATE1b, STATE0b, MSG
414 movdqa MSGTMP4b, MSGTMP3b
415 palignr MSGTMP4b, MSGTMP2b, 4
416 paddd MSGTMP0b, MSGTMP4b
417 sha256msg2 MSGTMP0b, MSGTMP3b
418 pshufd MSG, MSG, 0x0E
419 sha256rnds2 STATE0b, STATE1b, MSG
420 sha256msg1 MSGTMP2b, MSGTMP3b
421
422 ; /* Rounds 48-51 */
423 movdqa MSG, MSGTMP0
424 paddd MSG, [TBL + 12*16]
425 sha256rnds2 STATE1, STATE0, MSG
426 movdqa MSGTMP4, MSGTMP0
427 palignr MSGTMP4, MSGTMP3, 4
428 paddd MSGTMP1, MSGTMP4
429 sha256msg2 MSGTMP1, MSGTMP0 ; MSGTMP1 = schedule words for rounds 52-55
430 pshufd MSG, MSG, 0x0E
431 sha256rnds2 STATE0, STATE1, MSG
432 sha256msg1 MSGTMP3, MSGTMP0 ; last sha256msg1 for lane 0 in this block
433
434 movdqa MSG, MSGTMP0b ; lane 1: same four rounds
435 paddd MSG, [TBL + 12*16]
436 sha256rnds2 STATE1b, STATE0b, MSG
437 movdqa MSGTMP4b, MSGTMP0b
438 palignr MSGTMP4b, MSGTMP3b, 4
439 paddd MSGTMP1b, MSGTMP4b
440 sha256msg2 MSGTMP1b, MSGTMP0b
441 pshufd MSG, MSG, 0x0E
442 sha256rnds2 STATE0b, STATE1b, MSG
443 sha256msg1 MSGTMP3b, MSGTMP0b
444
445 ; /* Rounds 52-55 */ no sha256msg1 from here on; only the sha256msg2 halves still pending are completed
446 movdqa MSG, MSGTMP1
447 paddd MSG, [TBL + 13*16]
448 sha256rnds2 STATE1, STATE0, MSG
449 movdqa MSGTMP4, MSGTMP1
450 palignr MSGTMP4, MSGTMP0, 4
451 paddd MSGTMP2, MSGTMP4
452 sha256msg2 MSGTMP2, MSGTMP1 ; MSGTMP2 = schedule words for rounds 56-59
453 pshufd MSG, MSG, 0x0E
454 sha256rnds2 STATE0, STATE1, MSG
455
456 movdqa MSG, MSGTMP1b ; lane 1: same four rounds
457 paddd MSG, [TBL + 13*16]
458 sha256rnds2 STATE1b, STATE0b, MSG
459 movdqa MSGTMP4b, MSGTMP1b
460 palignr MSGTMP4b, MSGTMP0b, 4
461 paddd MSGTMP2b, MSGTMP4b
462 sha256msg2 MSGTMP2b, MSGTMP1b
463 pshufd MSG, MSG, 0x0E
464 sha256rnds2 STATE0b, STATE1b, MSG
465
466 ; /* Rounds 56-59 */
467 movdqa MSG, MSGTMP2
468 paddd MSG, [TBL + 14*16]
469 sha256rnds2 STATE1, STATE0, MSG
470 movdqa MSGTMP4, MSGTMP2
471 palignr MSGTMP4, MSGTMP1, 4
472 paddd MSGTMP3, MSGTMP4
473 sha256msg2 MSGTMP3, MSGTMP2 ; MSGTMP3 = schedule words for rounds 60-63
474 pshufd MSG, MSG, 0x0E
475 sha256rnds2 STATE0, STATE1, MSG
476
477 movdqa MSG, MSGTMP2b ; lane 1: same four rounds
478 paddd MSG, [TBL + 14*16]
479 sha256rnds2 STATE1b, STATE0b, MSG
480 movdqa MSGTMP4b, MSGTMP2b
481 palignr MSGTMP4b, MSGTMP1b, 4
482 paddd MSGTMP3b, MSGTMP4b
483 sha256msg2 MSGTMP3b, MSGTMP2b
484 pshufd MSG, MSG, 0x0E
485 sha256rnds2 STATE0b, STATE1b, MSG
486
487 ; /* Rounds 60-63 */ final rounds: no schedule work is left
488 movdqa MSG, MSGTMP3
489 paddd MSG, [TBL + 15*16]
490 sha256rnds2 STATE1, STATE0, MSG
491 pshufd MSG, MSG, 0x0E
492 sha256rnds2 STATE0, STATE1, MSG
493
494 movdqa MSG, MSGTMP3b ; lane 1: same four rounds
495 paddd MSG, [TBL + 15*16]
496 sha256rnds2 STATE1b, STATE0b, MSG
497 pshufd MSG, MSG, 0x0E
498 sha256rnds2 STATE0b, STATE1b, MSG
499
500 ; /* Add current hash values with previously saved */
501 paddd STATE0, [rsp + 0*16]
502 paddd STATE1, [rsp + 1*16]
503
504 paddd STATE0b, [rsp + 2*16]
505 paddd STATE1b, [rsp + 3*16]
506
507 ; Increment data pointer and loop if more to process
508 add DPTR, 64
509 add DPTRb, 64 ; lane 1 advances in lockstep with lane 0
510 cmp DPTR, NBLK ; only lane 0's pointer is checked against the end pointer
511 jne lloop
512
513 ; write out digests (same addressing pattern as the loads in the prologue)
514 lea TMP, [MGR + 4*0] ; TMP -> digest word A of lane 0
515 ;; ABEF(state0), CDGH(state1) -> digests
516 pextrd [TMP + 0*NLANX4], STATE0, 3 ; A
517 pextrd [TMP + 1*NLANX4], STATE0, 2 ; B
518 pextrd [TMP + 2*NLANX4], STATE1, 3 ; C
519 lea TMP, [TMP + 2*NLANX4] ; TMP = MGR + 4*0 + 2*NLANX4
520 pextrd [TMP + 1*NLANX4], STATE1, 2 ; D
521 pextrd [TMP + 2*NLANX4], STATE0, 1 ; E
522 pextrd [TMP + 4*NLANX4], STATE1, 1 ; G
523 lea TMP, [TMP + 1*NLANX4] ; TMP = MGR + 4*0 + 3*NLANX4
524 pextrd [TMP + 2*NLANX4], STATE0, 0 ; F
525 pextrd [TMP + 4*NLANX4], STATE1, 0 ; H
526
527 lea TMPb, [MGR + 4*1] ; TMPb -> digest word A of lane 1
528 ;; ABEF(state0), CDGH(state1) -> digests
529 pextrd [TMPb + 0*NLANX4], STATE0b, 3 ; A
530 pextrd [TMPb + 1*NLANX4], STATE0b, 2 ; B
531 pextrd [TMPb + 2*NLANX4], STATE1b, 3 ; C
532 lea TMPb, [TMPb + 2*NLANX4] ; TMPb = MGR + 4*1 + 2*NLANX4
533 pextrd [TMPb + 1*NLANX4], STATE1b, 2 ; D
534 pextrd [TMPb + 2*NLANX4], STATE0b, 1 ; E
535 pextrd [TMPb + 4*NLANX4], STATE1b, 1 ; G
536 lea TMPb, [TMPb + 1*NLANX4] ; TMPb = MGR + 4*1 + 3*NLANX4
537 pextrd [TMPb + 2*NLANX4], STATE0b, 0 ; F
538 pextrd [TMPb + 4*NLANX4], STATE1b, 0 ; H
539
540 ; update input pointers so args records how far each lane has been consumed
541 mov [MGR + _data_ptr + 0*8], DPTR
542 mov [MGR + _data_ptr + 1*8], DPTRb
543
544 backto_mgr: ; also reached directly from the prologue when the byte count is 0
545 ;;;;;;;;;;;;;;;;
546 ;; Postamble
547 mov rsp, RSPSAVE ; undo both the frame reservation and the alignment adjustment
548
549 ret
550
551 section .data align=16 ; 16-byte alignment is needed by the movdqa / paddd memory operands that read these constants. NOTE(review): the routine only reads this data; a read-only section may be preferable -- confirm per output format
552 PSHUFFLE_SHANI_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b ; pshufb control bytes 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12: reverses the bytes of each dword
553 TABLE: dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 ; SHA-256 round constants, 64 dwords; row i is read as [TBL + i*16] for rounds 4*i .. 4*i+3 (this row: rounds 0-3)
554 dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 ; rounds 4-7
555 dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 ; rounds 8-11
556 dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 ; rounds 12-15
557 dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc ; rounds 16-19
558 dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da ; rounds 20-23
559 dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 ; rounds 24-27
560 dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 ; rounds 28-31
561 dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 ; rounds 32-35
562 dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 ; rounds 36-39
563 dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 ; rounds 40-43
564 dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 ; rounds 44-47
565 dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 ; rounds 48-51
566 dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 ; rounds 52-55
567 dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 ; rounds 56-59
568 dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 ; rounds 60-63
569 %else ; HAVE_AS_KNOWS_SHANI is not defined: sha256_ni_x2 and its constants are not assembled
570 %ifidn __OUTPUT_FORMAT__, win64
571 global no_sha256_ni_x2 ; win64 only: export a placeholder label with no code behind it -- presumably so the object file is not empty; confirm
572 no_sha256_ni_x2:
573 %endif
574 %endif ; HAVE_AS_KNOWS_SHANI