]>
Commit | Line | Data |
---|---|---|
1e59de90 TL |
1 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
2 | ; Copyright(c) 2011-2020 Intel Corporation All rights reserved. | |
3 | ; | |
4 | ; Redistribution and use in source and binary forms, with or without | |
5 | ; modification, are permitted provided that the following conditions | |
6 | ; are met: | |
7 | ; * Redistributions of source code must retain the above copyright | |
8 | ; notice, this list of conditions and the following disclaimer. | |
9 | ; * Redistributions in binary form must reproduce the above copyright | |
10 | ; notice, this list of conditions and the following disclaimer in | |
11 | ; the documentation and/or other materials provided with the | |
12 | ; distribution. | |
13 | ; * Neither the name of Intel Corporation nor the names of its | |
14 | ; contributors may be used to endorse or promote products derived | |
15 | ; from this software without specific prior written permission. | |
16 | ; | |
17 | ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
18 | ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
19 | ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
20 | ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
21 | ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
22 | ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
23 | ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
24 | ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
25 | ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
26 | ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
27 | ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
28 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
29 | ||
%include "sm3_mb_mgr_datastruct.asm"
%include "reg_sizes.asm"

[bits 64]
default rel
section .text

;; code to compute oct SM3 using SSE-256 / AVX2 (8 independent lanes per call)
;; outer calling routine takes care of save and restore of XMM registers
;; Logic designed/laid out by JDG

;; Function clobbers: rax, rcx, rdx, rsi, rdi, r9-r15; eax; ymm0-15
;; Windows clobbers:  rax rdx rsi rdi r9 r10 r11 r12 r13 r14 r15
;; Windows preserves: rcx rbp r8
;;
;; Linux clobbers:    rax rcx rdx rsi r9 r10 r11 r12 r13 r14 r15
;; Linux preserves:   rdi rbp r8
;;
;; clobbers ymm0-15

%ifidn __OUTPUT_FORMAT__, elf64
	; Linux (System V AMD64) definitions
	%define arg1	rdi
	%define arg2	rsi
	%define reg3	rcx
	%define reg4	rdx
%else
	; Windows (Microsoft x64) definitions
	%define arg1	rcx
	%define arg2	rdx
	%define reg3	rsi
	%define reg4	rdi
%endif

; Common definitions
%define STATE		arg1		; SM3_ARGS*: lane data pointers + transposed digests
%define INP_SIZE	arg2		; input length, in 64-byte blocks
%define SIZE		INP_SIZE	; rsi (alias; decremented as the block-loop counter)

%define IDX	rax			; byte offset into each lane's message buffer
%define TBL	reg3			; pointer to the per-round constant TABLE

; the 8 per-lane input pointers
%define inp0	r9
%define inp1	r10
%define inp2	r11
%define inp3	r12
%define inp4	r13
%define inp5	r14
%define inp6	r15
%define inp7	reg4

; token-pasting helper: APPEND(WB,3) expands to WB3
%define	APPEND(a,b) a %+ b

; message-schedule registers WB(j) for the current 16-word window
%define WB0	ymm0
%define WB1	ymm1
%define WB2	ymm2
%define WB3	ymm3
%define WB4	ymm4
%define WB5	ymm5
%define WB6	ymm6
%define WB7	ymm7
%define WB8	ymm8
%define WB9	ymm9
%define WB10	ymm10
%define WB11	ymm11
%define WB12	ymm12
%define WB13	ymm13
%define WB14	ymm14
%define WB15	ymm15

; transpose/byte-swap scratch.
; NOTE: WBTMP0/1 alias WB8/WB9 (free before the second 32-byte load) and
; WBTMP2/3 alias WB0/WB1 (free after WB0/WB1 are spilled to the stack).
%define WBTMP0	ymm8
%define WBTMP1	ymm9

%define WBTMP2	ymm0
%define WBTMP3	ymm1

; SM3 working variables A..H (8 lanes each); they alias WB0..WB7, which
; have been saved to the stack schedule before round processing starts
%define A	ymm0
%define B	ymm1
%define C	ymm2
%define D	ymm3
%define E	ymm4
%define F	ymm5
%define G	ymm6
%define H	ymm7

%define TMP0	ymm8
%define TMP1	ymm9
%define TMP2	ymm10

; W'(j) = W(j) xor W(j+4)
; Keep a 5-wide sliding window W(j)..W(j+4) in registers to reduce memory reads
%define Wj0	ymm11
%define Wj1	ymm12
%define Wj2	ymm13
%define Wj3	ymm14
%define Wj4	ymm15


%define SZ8			8*SM3_DIGEST_WORD_SIZE	; Size of one vector register
%define PTR_SZ			8
%define SM3_DIGEST_WORD_SIZE	4
%define MAX_SM3_LANES		8
%define NUM_SM3_DIGEST_WORDS	8
%define SM3_DIGEST_ROW_SIZE	(MAX_SM3_LANES * SM3_DIGEST_WORD_SIZE)

; Define stack usage

;; Assume stack aligned to 32 bytes before call
;; Therefore FRAMESZ mod 32 must be 32-8 = 24
struc stack_frame
	.data	resb	16*SZ8
	.digest	resb	8*SZ8	; NOTE(review): _DIGEST appears unused in this file — confirm
	.wbtmp	resb	69*SZ8	; WB[0..68]; the round loop prefetches slot I+5, max 63+5 = 68
	.rsp	resb	8
endstruc
%define FRAMESZ		stack_frame_size
%define _DIGEST		stack_frame.digest
%define _WBTMP		stack_frame.wbtmp
%define _RSP_SAVE	stack_frame.rsp

; fixed slots of the on-stack message schedule
%define YTMP0	rsp + _WBTMP + 0*SZ8
%define YTMP1	rsp + _WBTMP + 1*SZ8
%define YTMP2	rsp + _WBTMP + 2*SZ8
%define YTMP3	rsp + _WBTMP + 3*SZ8
%define YTMP4	rsp + _WBTMP + 4*SZ8

; schedule slots indexed by the preprocessor variable I at expansion time
%define YTMPI	rsp + _WBTMP + I*SZ8
%define YTMPI_1	rsp + _WBTMP + (I - 1)*SZ8
%define YTMPI_2	rsp + _WBTMP + (I - 2)*SZ8
%define YTMPI_4	rsp + _WBTMP + (I - 4)*SZ8
%define YTMPI5	rsp + _WBTMP + (I + 5)*SZ8


%define VMOVPS	vmovups
164 | ||
;;;;;;;;
; 8x8 32-bit matrix transpose, same as sha256
; In:  %1..%8 = r0..r7, row-major: r0 = {a7..a0}, r1 = {b7..b0}, ... r7 = {h7..h0}
; Out: column-major: r0 = {h0 g0 f0 e0 d0 c0 b0 a0}, ..., r7 = {h7..a7}
; %9/%10 = t0/t1 scratch registers (clobbered)
;;;;;;;;
%macro TRANSPOSE8 10
%define %%r0 %1
%define %%r1 %2
%define %%r2 %3
%define %%r3 %4
%define %%r4 %5
%define %%r5 %6
%define %%r6 %7
%define %%r7 %8
%define %%t0 %9
%define %%t1 %10
	; process top half (r0..r3) {a...d}
	vshufps	%%t0, %%r0, %%r1, 0x44	; t0 = {b5 b4 a5 a4   b1 b0 a1 a0}
	vshufps	%%r0, %%r0, %%r1, 0xEE	; r0 = {b7 b6 a7 a6   b3 b2 a3 a2}
	vshufps	%%t1, %%r2, %%r3, 0x44	; t1 = {d5 d4 c5 c4   d1 d0 c1 c0}
	vshufps	%%r2, %%r2, %%r3, 0xEE	; r2 = {d7 d6 c7 c6   d3 d2 c3 c2}
	vshufps	%%r3, %%t0, %%t1, 0xDD	; r3 = {d5 c5 b5 a5   d1 c1 b1 a1}
	vshufps	%%r1, %%r0, %%r2, 0x88	; r1 = {d6 c6 b6 a6   d2 c2 b2 a2}
	vshufps	%%r0, %%r0, %%r2, 0xDD	; r0 = {d7 c7 b7 a7   d3 c3 b3 a3}
	vshufps	%%t0, %%t0, %%t1, 0x88	; t0 = {d4 c4 b4 a4   d0 c0 b0 a0}

	; use r2 in place of t0
	; process bottom half (r4..r7) {e...h}
	vshufps	%%r2, %%r4, %%r5, 0x44	; r2 = {f5 f4 e5 e4   f1 f0 e1 e0}
	vshufps	%%r4, %%r4, %%r5, 0xEE	; r4 = {f7 f6 e7 e6   f3 f2 e3 e2}
	vshufps	%%t1, %%r6, %%r7, 0x44	; t1 = {h5 h4 g5 g4   h1 h0 g1 g0}
	vshufps	%%r6, %%r6, %%r7, 0xEE	; r6 = {h7 h6 g7 g6   h3 h2 g3 g2}
	vshufps	%%r7, %%r2, %%t1, 0xDD	; r7 = {h5 g5 f5 e5   h1 g1 f1 e1}
	vshufps	%%r5, %%r4, %%r6, 0x88	; r5 = {h6 g6 f6 e6   h2 g2 f2 e2}
	vshufps	%%r4, %%r4, %%r6, 0xDD	; r4 = {h7 g7 f7 e7   h3 g3 f3 e3}
	vshufps	%%t1, %%r2, %%t1, 0x88	; t1 = {h4 g4 f4 e4   h0 g0 f0 e0}

	; merge 128-bit halves of the two quadrants into full rows
	vperm2f128	%%r6, %%r5, %%r1, 0x13	; h6...a6
	vperm2f128	%%r2, %%r5, %%r1, 0x02	; h2...a2
	vperm2f128	%%r5, %%r7, %%r3, 0x13	; h5...a5
	vperm2f128	%%r1, %%r7, %%r3, 0x02	; h1...a1
	vperm2f128	%%r7, %%r4, %%r0, 0x13	; h7...a7
	vperm2f128	%%r3, %%r4, %%r0, 0x02	; h3...a3
	vperm2f128	%%r4, %%t1, %%t0, 0x13	; h4...a4
	vperm2f128	%%r0, %%t1, %%t0, 0x02	; h0...a0
%endmacro
209 | ||
; Slide the register window over the message schedule one step:
; Wj0<-Wj1, Wj1<-Wj2, Wj2<-Wj3, Wj3<-Wj4, and the register that held Wj0
; becomes the new Wj4 (refilled from the stack schedule by the caller).
; Pure %xdefine renaming — emits no instructions.
%macro ROTATE_W 0

%xdefine TMP_ Wj0
%xdefine Wj0 Wj1
%xdefine Wj1 Wj2
%xdefine Wj2 Wj3
%xdefine Wj3 Wj4

%xdefine Wj4 TMP_

%endmacro
221 | ||
; ROTATE A,B,C,D
; Rename-rotate of the first four working variables after a round:
; D<-C, C<-B, B<-A, A<-TMP2 (TMP2 holds the freshly computed TT1),
; and the register that held D becomes the new TMP2.
; Pure %xdefine renaming — emits no instructions.
%macro ROTATE_ARGS_AD 0

%xdefine TMP_ D
%xdefine D C
%xdefine C B
%xdefine B A
%xdefine A TMP2
%xdefine TMP2 TMP_

%endmacro
233 | ||
; Rename-rotate of the last four working variables after a round:
; H<-G, G<-F, F<-E, E<-TMP0 (TMP0 holds the freshly computed P0(TT2)),
; and the register that held H becomes the new TMP0.
; Pure %xdefine renaming — emits no instructions.
%macro ROTATE_ARGS_EH 0

%xdefine TMP_ H
%xdefine H G
%xdefine G F
%xdefine F E
%xdefine E TMP0
%xdefine TMP0 TMP_

%endmacro
244 | ||
;; ROLD  dst, count, scratch
;; In-place rotate-left of every 32-bit lane: dst = (dst <<< count).
;; count must be an immediate in 1..31; scratch (a ymm register) is clobbered.
%macro ROLD 3
	vpslld	%3, %1, %2		; scratch = dst << count
	vpsrld	%1, %1, (32-(%2))	; dst     = dst >> (32-count)
	vpor	%1, %1, %3		; dst    |= scratch
%endmacro
255 | ||
;; ROLD_nd  dst, count, scratch, src       (non-destructive rotate)
;; dst = (src <<< count) for every 32-bit lane; src is left untouched.
;; count must be an immediate in 1..31; scratch (a ymm register) is clobbered.
%macro ROLD_nd 3-4
	vpslld	%3, %4, %2		; scratch = src << count
	vpsrld	%1, %4, (32-(%2))	; dst     = src >> (32-count)
	vpor	%1, %1, %3		; dst    |= scratch
%endmacro
265 | ||
;; void sm3_mb_x8_avx2(SM3_ARGS *args, uint64_t size_in_blocks);
;; arg 1 : STATE    : pointer to args structure (8 lane data pointers + digests)
;; arg 2 : INP_SIZE : size of input in 64-byte blocks
;; (header previously named this sm3_x8_avx2 — the exported symbol is sm3_mb_x8_avx2)
mk_global sm3_mb_x8_avx2,function,internal
align 16
sm3_mb_x8_avx2:
	endbranch
	; general registers preserved in outer calling routine
	; outer calling routine saves all the YMM registers

	; save rsp, allocate 32-byte aligned scratch for local variables
	mov	IDX, rsp
	sub	rsp, FRAMESZ
	and	rsp, ~31
	mov	[rsp + _RSP_SAVE], IDX

	lea	TBL,[TABLE]

	;; load the address of each of the 8 message lanes
	;; getting ready to transpose input onto stack
	mov	inp0,[STATE + _args_data_ptr + 0*PTR_SZ]
	mov	inp1,[STATE + _args_data_ptr + 1*PTR_SZ]
	mov	inp2,[STATE + _args_data_ptr + 2*PTR_SZ]
	mov	inp3,[STATE + _args_data_ptr + 3*PTR_SZ]
	mov	inp4,[STATE + _args_data_ptr + 4*PTR_SZ]
	mov	inp5,[STATE + _args_data_ptr + 5*PTR_SZ]
	mov	inp6,[STATE + _args_data_ptr + 6*PTR_SZ]
	mov	inp7,[STATE + _args_data_ptr + 7*PTR_SZ]

	xor	IDX, IDX		; IDX = byte offset of the current block

lloop:
	;
	; Pre-calculate the whole schedule WB[0..68] (and W[0..64]) up front;
	; this beats computing WB/W round-by-round.
	;
	; ps : SHA256 (AVX2) computes WB/W with the round-by-round method.
	;
	; Pre-calculation memory io count:
	;	read  : 68 + 3 * 52 (read WB)
	;	write : 52 (write WB[17..68])
	; Round-method memory io count:
	;	read  : 48 * 6 (6 WB reads each round)
	;	write : 52 + 64 (same as above)
	;
	; Load and transpose the first 32 bytes of each lane's block.
	VMOVPS	WB0,[inp0+IDX]
	VMOVPS	WB1,[inp1+IDX]
	VMOVPS	WB2,[inp2+IDX]
	VMOVPS	WB3,[inp3+IDX]
	VMOVPS	WB4,[inp4+IDX]
	VMOVPS	WB5,[inp5+IDX]
	VMOVPS	WB6,[inp6+IDX]
	VMOVPS	WB7,[inp7+IDX]

	TRANSPOSE8 WB0, WB1, WB2, WB3, WB4, WB5, WB6, WB7, WBTMP0, WBTMP1
	; byte-swap every dword to big-endian (SM3 operates on big-endian words)
	vmovdqa	WBTMP0, [SHUF_MASK]
	vpshufb	WB0,WBTMP0
	vpshufb	WB1,WBTMP0
	vpshufb	WB2,WBTMP0
	vpshufb	WB3,WBTMP0
	vpshufb	WB4,WBTMP0
	vpshufb	WB5,WBTMP0
	vpshufb	WB6,WBTMP0
	vpshufb	WB7,WBTMP0

	; spill WB0/WB1 now: their registers are aliased by WBTMP2/WBTMP3 below
	vmovdqa	[YTMP0], WB0
	vmovdqa	[YTMP1], WB1

	; Load and transpose the second 32 bytes of each lane's block.
	VMOVPS	WB8,[inp0+IDX + 32]
	VMOVPS	WB9,[inp1+IDX + 32]
	VMOVPS	WB10,[inp2+IDX + 32]
	VMOVPS	WB11,[inp3+IDX + 32]
	VMOVPS	WB12,[inp4+IDX + 32]
	VMOVPS	WB13,[inp5+IDX + 32]
	VMOVPS	WB14,[inp6+IDX + 32]
	VMOVPS	WB15,[inp7+IDX + 32]

	TRANSPOSE8 WB8, WB9, WB10, WB11, WB12, WB13, WB14, WB15, WBTMP2, WBTMP3
	vmovdqa	WBTMP2, [SHUF_MASK]
	vpshufb	WB8,WBTMP2
	vpshufb	WB9,WBTMP2
	vpshufb	WB10,WBTMP2
	vpshufb	WB11,WBTMP2
	vpshufb	WB12,WBTMP2
	vpshufb	WB13,WBTMP2
	vpshufb	WB14,WBTMP2
	vpshufb	WB15,WBTMP2

	; store WB[2..15] to the stack schedule area (WB0 WB1 already saved)
%assign I 2
%rep 14
	vmovdqa	[YTMPI], APPEND(WB,I)
%assign I (I+1)
%endrep

	; reload WB0/WB1 (their registers were borrowed as WBTMP2/WBTMP3)
	vmovdqa	WB0 , [YTMP0]
	vmovdqa	WB1 , [YTMP1]

	; Calculate WB[16..67]:
	;   WB[j] = P1(WB[j-16] ^ WB[j-9] ^ (WB[j-3] <<< 15))
	;           ^ (WB[j-13] <<< 7) ^ WB[j-6]
	; with P1(x) = x ^ (x <<< 15) ^ (x <<< 23).
	; The registers holding WB[j-1], WB[j-2], WB[j-4] are borrowed as
	; scratch and restored from the stack afterwards.
%rep 52
%assign J (I % 16)
%assign J_1 ((I-1) % 16)	; scratch
%assign J_2 ((I-2) % 16)	; scratch
%assign J_3 ((I-3) % 16)
%assign J_4 ((I-4) % 16)	; scratch
%assign J_9 ((I-9) % 16)
%assign J_13 ((I-13) % 16)
%assign J_6 ((I-6) % 16)

	; WB[J] (holding WB[j-16]) ^= (WB[j-3] <<< 15) ^ WB[j-9]
	ROLD_nd	APPEND(WB,J_2),15,APPEND(WB,J_1),APPEND(WB,J_3)
	vpxor	APPEND(WB,J),APPEND(WB,J_2)
	vpxor	APPEND(WB,J),APPEND(WB,J_9)

	; apply P1: WB[J] ^= (WB[J] <<< 15) ^ (WB[J] <<< 23)
	ROLD_nd	APPEND(WB,J_2),15,APPEND(WB,J_1),APPEND(WB,J)
	ROLD_nd	APPEND(WB,J_1),23,APPEND(WB,J_4),APPEND(WB,J)
	vpxor	APPEND(WB,J),APPEND(WB,J_2)
	vpxor	APPEND(WB,J),APPEND(WB,J_1)

	; WB[J] ^= (WB[j-13] <<< 7) ^ WB[j-6]
	ROLD_nd	APPEND(WB,J_2),7,APPEND(WB,J_1),APPEND(WB,J_13)
	vpxor	APPEND(WB,J),APPEND(WB,J_2)
	vpxor	APPEND(WB,J),APPEND(WB,J_6)

	vmovdqa	[YTMPI], APPEND(WB,J)

	; restore the three registers borrowed as scratch
	vmovdqa	APPEND(WB,J_1), [YTMPI_1]
	vmovdqa	APPEND(WB,J_2), [YTMPI_2]
	vmovdqa	APPEND(WB,J_4), [YTMPI_4]

%assign I (I+1)
%endrep

	add	IDX, 4*4*4		; advance one 64-byte block

	; A-H must be reloaded every block: the working registers were
	; consumed by the schedule pre-calculation above.
	vmovdqu	A,[STATE + 0*SM3_DIGEST_ROW_SIZE]
	vmovdqu	B,[STATE + 1*SM3_DIGEST_ROW_SIZE]
	vmovdqu	C,[STATE + 2*SM3_DIGEST_ROW_SIZE]
	vmovdqu	D,[STATE + 3*SM3_DIGEST_ROW_SIZE]
	vmovdqu	E,[STATE + 4*SM3_DIGEST_ROW_SIZE]
	vmovdqu	F,[STATE + 5*SM3_DIGEST_ROW_SIZE]
	vmovdqu	G,[STATE + 6*SM3_DIGEST_ROW_SIZE]
	vmovdqu	H,[STATE + 7*SM3_DIGEST_ROW_SIZE]

	; prime the sliding window W[j]..W[j+4] (W'[j] = Wj0 ^ Wj4 below)
	vmovdqa	Wj0, [YTMP0]
	vmovdqa	Wj1, [YTMP1]
	vmovdqa	Wj2, [YTMP2]
	vmovdqa	Wj3, [YTMP3]
	vmovdqa	Wj4, [YTMP4]


	; Rounds 0-15: FF(x,y,z) = GG(x,y,z) = x ^ y ^ z
%assign I 0
%rep 16

	; SS1 -> TMP1
	ROLD_nd	TMP0,12,TMP1,A		; TMP0 = A <<< 12
	vmovdqa	TMP1, [TBL + (I*32)]	; TMP1 = Tj <<< j (pre-rotated constant)
	vpaddd	TMP1,E
	vpaddd	TMP1,TMP0
	ROLD	TMP1,7,TMP2		; SS1 = ((A<<<12) + E + (Tj<<<j)) <<< 7

	; SS2 -> TMP2
	vpxor	TMP2,TMP1,TMP0		; SS2 = SS1 ^ (A <<< 12)

	; TT1 = FF(A,B,C) + D + SS2 + W'[j]
	vpxor	TMP0,A,B
	vpxor	TMP0,C
	vpaddd	TMP2,TMP0
	vpaddd	TMP2,D
	vpxor	TMP0,Wj0,Wj4		; W'[j] = W[j] ^ W[j+4]
	vpaddd	TMP2,TMP0

	ROLD	B,9,TMP0

	; Rotate a,b,c,d first;
	; after P0(TT2) below, Wj0 will be released
	ROTATE_ARGS_AD

	; TT2 = GG(E,F,G) + H + SS1 + W[j]; new E = P0(TT2)
	vpxor	TMP0,E,F
	vpxor	TMP0,G
	vpaddd	TMP0,H
	vpaddd	TMP0,TMP1
	vpaddd	TMP0,Wj0

	ROLD_nd	TMP1,9,TMP2,TMP0
	ROLD_nd	Wj0,17,TMP2,TMP0

	vpxor	TMP0,TMP1		; P0(x) = x ^ (x<<<9) ^ (x<<<17)
	vpxor	TMP0,Wj0

	ROLD	F,19,TMP2

	ROTATE_ARGS_EH

	ROTATE_W

	vmovdqa	Wj4, [YTMPI5]		; fetch W[j+5] from the stack schedule
%assign I (I+1)
%endrep

	; Rounds 16-63: FF = majority, GG = choose
%rep 48
	; SS1 -> TMP1
	ROLD_nd	TMP0,12,TMP1,A
	vmovdqa	TMP1, [TBL + (I*32)]
	vpaddd	TMP1,E
	vpaddd	TMP1,TMP0
	ROLD	TMP1,7,TMP2

	; SS2 -> TMP2
	vpxor	TMP2,TMP1,TMP0

	; Add D into SS2 first so the register holding D is released:
	; FF16/GG16 differ from FF64/GG64, and majority needs an extra
	; scratch register, which D provides once accumulated.
	vpaddd	TMP2,D

	; TT1: FF(A,B,C) = (A&B) | (A&C) | (B&C) = ((A|B) & C) | (A & B)
	vpor	TMP0,A,B
	vpand	TMP0,C
	vpand	D,A,B
	vpor	TMP0,D

	vpaddd	TMP2,TMP0
	vpxor	TMP0,Wj0,Wj4		; W'[j]
	vpaddd	TMP2,TMP0

	ROLD	B,9,TMP0

	ROTATE_ARGS_AD

	; TT2: GG(E,F,G) = (E & F) | (~E & G); new E = P0(TT2)
	vpaddd	TMP1,H
	vpaddd	TMP1,Wj0

	vpand	TMP0,E,F
	vpandn	Wj0,E,G
	vpor	TMP0,Wj0

	vpaddd	TMP0,TMP1

	ROLD_nd	TMP1,9,TMP2,TMP0
	ROLD_nd	Wj0,17,TMP2,TMP0

	vpxor	TMP0,TMP1
	vpxor	TMP0,Wj0

	ROLD	F,19,TMP2

	ROTATE_ARGS_EH

	ROTATE_W
	vmovdqa	Wj4, [YTMPI5]
%assign I (I+1)
%endrep

	; fold the working variables back into the saved digest (SM3 feed-forward is XOR)
	vpxor	A, A, [STATE + 0*SM3_DIGEST_ROW_SIZE]
	vpxor	B, B, [STATE + 1*SM3_DIGEST_ROW_SIZE]
	vpxor	C, C, [STATE + 2*SM3_DIGEST_ROW_SIZE]
	vpxor	D, D, [STATE + 3*SM3_DIGEST_ROW_SIZE]
	vpxor	E, E, [STATE + 4*SM3_DIGEST_ROW_SIZE]
	vpxor	F, F, [STATE + 5*SM3_DIGEST_ROW_SIZE]
	vpxor	G, G, [STATE + 6*SM3_DIGEST_ROW_SIZE]
	vpxor	H, H, [STATE + 7*SM3_DIGEST_ROW_SIZE]

	; Write back to memory (state object) the transposed digest
	vmovdqu	[STATE + 0*SM3_DIGEST_ROW_SIZE],A
	vmovdqu	[STATE + 1*SM3_DIGEST_ROW_SIZE],B
	vmovdqu	[STATE + 2*SM3_DIGEST_ROW_SIZE],C
	vmovdqu	[STATE + 3*SM3_DIGEST_ROW_SIZE],D
	vmovdqu	[STATE + 4*SM3_DIGEST_ROW_SIZE],E
	vmovdqu	[STATE + 5*SM3_DIGEST_ROW_SIZE],F
	vmovdqu	[STATE + 6*SM3_DIGEST_ROW_SIZE],G
	vmovdqu	[STATE + 7*SM3_DIGEST_ROW_SIZE],H

	; loop while blocks remain; fall through to the exit path when done
	; (was sub/je last_loop/jmp lloop — inverted so the back-edge is a
	;  single taken branch and the exit is the fall-through path)
	sub	SIZE, 1
	jnz	lloop

last_loop:

	; advance the stored input pointers past the consumed bytes
	add	inp0, IDX
	mov	[STATE + _args_data_ptr + 0*8], inp0
	add	inp1, IDX
	mov	[STATE + _args_data_ptr + 1*8], inp1
	add	inp2, IDX
	mov	[STATE + _args_data_ptr + 2*8], inp2
	add	inp3, IDX
	mov	[STATE + _args_data_ptr + 3*8], inp3
	add	inp4, IDX
	mov	[STATE + _args_data_ptr + 4*8], inp4
	add	inp5, IDX
	mov	[STATE + _args_data_ptr + 5*8], inp5
	add	inp6, IDX
	mov	[STATE + _args_data_ptr + 6*8], inp6
	add	inp7, IDX
	mov	[STATE + _args_data_ptr + 7*8], inp7

	;;;;;;;;;;;;;;;;
	;; Postamble
	mov	rsp, [rsp + _RSP_SAVE]
	ret
573 | ||
574 | ||
; Byte-flip (endianness) mask, duplicated across both 128-bit lanes.
; NOTE(review): appears unused in this file — the code references SHUF_MASK
; (bottom of file), which holds the same value; confirm before removing.
PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
			 dq 0x0405060700010203, 0x0c0d0e0f08090a0b

align 64
global TABLE
; Per-round SM3 constants: entry j (32 bytes = one ymm, 64 rounds total) holds
; (Tj <<< (j mod 32)) broadcast to all 8 lanes, with Tj = 0x79cc4519 for
; rounds 0-15 and Tj = 0x7a879d8a for rounds 16-63 (first 16-round entry
; below is 0x9d8a7a87 = 0x7a879d8a <<< 16).
TABLE:
581 | dq 0x79cc451979cc4519,0x79cc451979cc4519 | |
582 | dq 0x79cc451979cc4519,0x79cc451979cc4519 | |
583 | dq 0xf3988a32f3988a32,0xf3988a32f3988a32 | |
584 | dq 0xf3988a32f3988a32,0xf3988a32f3988a32 | |
585 | dq 0xe7311465e7311465,0xe7311465e7311465 | |
586 | dq 0xe7311465e7311465,0xe7311465e7311465 | |
587 | dq 0xce6228cbce6228cb,0xce6228cbce6228cb | |
588 | dq 0xce6228cbce6228cb,0xce6228cbce6228cb | |
589 | dq 0x9cc451979cc45197,0x9cc451979cc45197 | |
590 | dq 0x9cc451979cc45197,0x9cc451979cc45197 | |
591 | dq 0x3988a32f3988a32f,0x3988a32f3988a32f | |
592 | dq 0x3988a32f3988a32f,0x3988a32f3988a32f | |
593 | dq 0x7311465e7311465e,0x7311465e7311465e | |
594 | dq 0x7311465e7311465e,0x7311465e7311465e | |
595 | dq 0xe6228cbce6228cbc,0xe6228cbce6228cbc | |
596 | dq 0xe6228cbce6228cbc,0xe6228cbce6228cbc | |
597 | dq 0xcc451979cc451979,0xcc451979cc451979 | |
598 | dq 0xcc451979cc451979,0xcc451979cc451979 | |
599 | dq 0x988a32f3988a32f3,0x988a32f3988a32f3 | |
600 | dq 0x988a32f3988a32f3,0x988a32f3988a32f3 | |
601 | dq 0x311465e7311465e7,0x311465e7311465e7 | |
602 | dq 0x311465e7311465e7,0x311465e7311465e7 | |
603 | dq 0x6228cbce6228cbce,0x6228cbce6228cbce | |
604 | dq 0x6228cbce6228cbce,0x6228cbce6228cbce | |
605 | dq 0xc451979cc451979c,0xc451979cc451979c | |
606 | dq 0xc451979cc451979c,0xc451979cc451979c | |
607 | dq 0x88a32f3988a32f39,0x88a32f3988a32f39 | |
608 | dq 0x88a32f3988a32f39,0x88a32f3988a32f39 | |
609 | dq 0x11465e7311465e73,0x11465e7311465e73 | |
610 | dq 0x11465e7311465e73,0x11465e7311465e73 | |
611 | dq 0x228cbce6228cbce6,0x228cbce6228cbce6 | |
612 | dq 0x228cbce6228cbce6,0x228cbce6228cbce6 | |
613 | dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87 | |
614 | dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87 | |
615 | dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f | |
616 | dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f | |
617 | dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e | |
618 | dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e | |
619 | dq 0xec53d43cec53d43c,0xec53d43cec53d43c | |
620 | dq 0xec53d43cec53d43c,0xec53d43cec53d43c | |
621 | dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879 | |
622 | dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879 | |
623 | dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3 | |
624 | dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3 | |
625 | dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7 | |
626 | dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7 | |
627 | dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce | |
628 | dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce | |
629 | dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d | |
630 | dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d | |
631 | dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b | |
632 | dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b | |
633 | dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76 | |
634 | dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76 | |
635 | dq 0x53d43cec53d43cec,0x53d43cec53d43cec | |
636 | dq 0x53d43cec53d43cec,0x53d43cec53d43cec | |
637 | dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8 | |
638 | dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8 | |
639 | dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1 | |
640 | dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1 | |
641 | dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762 | |
642 | dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762 | |
643 | dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5 | |
644 | dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5 | |
645 | dq 0x7a879d8a7a879d8a,0x7a879d8a7a879d8a | |
646 | dq 0x7a879d8a7a879d8a,0x7a879d8a7a879d8a | |
647 | dq 0xf50f3b14f50f3b14,0xf50f3b14f50f3b14 | |
648 | dq 0xf50f3b14f50f3b14,0xf50f3b14f50f3b14 | |
649 | dq 0xea1e7629ea1e7629,0xea1e7629ea1e7629 | |
650 | dq 0xea1e7629ea1e7629,0xea1e7629ea1e7629 | |
651 | dq 0xd43cec53d43cec53,0xd43cec53d43cec53 | |
652 | dq 0xd43cec53d43cec53,0xd43cec53d43cec53 | |
653 | dq 0xa879d8a7a879d8a7,0xa879d8a7a879d8a7 | |
654 | dq 0xa879d8a7a879d8a7,0xa879d8a7a879d8a7 | |
655 | dq 0x50f3b14f50f3b14f,0x50f3b14f50f3b14f | |
656 | dq 0x50f3b14f50f3b14f,0x50f3b14f50f3b14f | |
657 | dq 0xa1e7629ea1e7629e,0xa1e7629ea1e7629e | |
658 | dq 0xa1e7629ea1e7629e,0xa1e7629ea1e7629e | |
659 | dq 0x43cec53d43cec53d,0x43cec53d43cec53d | |
660 | dq 0x43cec53d43cec53d,0x43cec53d43cec53d | |
661 | dq 0x879d8a7a879d8a7a,0x879d8a7a879d8a7a | |
662 | dq 0x879d8a7a879d8a7a,0x879d8a7a879d8a7a | |
663 | dq 0x0f3b14f50f3b14f5,0x0f3b14f50f3b14f5 | |
664 | dq 0x0f3b14f50f3b14f5,0x0f3b14f50f3b14f5 | |
665 | dq 0x1e7629ea1e7629ea,0x1e7629ea1e7629ea | |
666 | dq 0x1e7629ea1e7629ea,0x1e7629ea1e7629ea | |
667 | dq 0x3cec53d43cec53d4,0x3cec53d43cec53d4 | |
668 | dq 0x3cec53d43cec53d4,0x3cec53d43cec53d4 | |
669 | dq 0x79d8a7a879d8a7a8,0x79d8a7a879d8a7a8 | |
670 | dq 0x79d8a7a879d8a7a8,0x79d8a7a879d8a7a8 | |
671 | dq 0xf3b14f50f3b14f50,0xf3b14f50f3b14f50 | |
672 | dq 0xf3b14f50f3b14f50,0xf3b14f50f3b14f50 | |
673 | dq 0xe7629ea1e7629ea1,0xe7629ea1e7629ea1 | |
674 | dq 0xe7629ea1e7629ea1,0xe7629ea1e7629ea1 | |
675 | dq 0xcec53d43cec53d43,0xcec53d43cec53d43 | |
676 | dq 0xcec53d43cec53d43,0xcec53d43cec53d43 | |
677 | dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87 | |
678 | dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87 | |
679 | dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f | |
680 | dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f | |
681 | dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e | |
682 | dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e | |
683 | dq 0xec53d43cec53d43c,0xec53d43cec53d43c | |
684 | dq 0xec53d43cec53d43c,0xec53d43cec53d43c | |
685 | dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879 | |
686 | dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879 | |
687 | dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3 | |
688 | dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3 | |
689 | dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7 | |
690 | dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7 | |
691 | dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce | |
692 | dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce | |
693 | dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d | |
694 | dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d | |
695 | dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b | |
696 | dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b | |
697 | dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76 | |
698 | dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76 | |
699 | dq 0x53d43cec53d43cec,0x53d43cec53d43cec | |
700 | dq 0x53d43cec53d43cec,0x53d43cec53d43cec | |
701 | dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8 | |
702 | dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8 | |
703 | dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1 | |
704 | dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1 | |
705 | dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762 | |
706 | dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762 | |
707 | dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5 | |
708 | dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5 | |
709 | ||
; vpshufb mask: byte-swap each 32-bit dword (little-endian -> big-endian),
; duplicated across both 128-bit lanes; used when loading message words.
SHUF_MASK:	dq 0x0405060700010203,0x0c0d0e0f08090a0b
		dq 0x0405060700010203,0x0c0d0e0f08090a0b