]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
2 | ; Copyright(c) 2011-2016 Intel Corporation All rights reserved. | |
3 | ; | |
4 | ; Redistribution and use in source and binary forms, with or without | |
1e59de90 | 5 | ; modification, are permitted provided that the following conditions |
7c673cae FG |
6 | ; are met: |
7 | ; * Redistributions of source code must retain the above copyright | |
8 | ; notice, this list of conditions and the following disclaimer. | |
9 | ; * Redistributions in binary form must reproduce the above copyright | |
10 | ; notice, this list of conditions and the following disclaimer in | |
11 | ; the documentation and/or other materials provided with the | |
12 | ; distribution. | |
13 | ; * Neither the name of Intel Corporation nor the names of its | |
14 | ; contributors may be used to endorse or promote products derived | |
15 | ; from this software without specific prior written permission. | |
16 | ; | |
17 | ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
18 | ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
19 | ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
20 | ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
21 | ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
22 | ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
23 | ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
24 | ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
25 | ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
26 | ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
27 | ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
28 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
29 | ||
30 | %include "sha1_mb_mgr_datastruct.asm" | |
31 | %include "reg_sizes.asm" | |
32 | ||
1e59de90 | 33 | [bits 64] |
7c673cae | 34 | default rel |
1e59de90 | 35 | section .text |
7c673cae FG |
36 | |
37 | ;; code to compute quad SHA1 using SSE | |
38 | ;; derived from ...\sha1_multiple\sha1_quad4.asm | |
39 | ;; variation of sha1_mult2.asm | |
40 | ||
41 | ; transpose r0, r1, r2, r3, t0, t1 | |
42 | ; "transpose" data in {r0..r3} using temps {t0..t3} | |
43 | ; Input looks like: {r0 r1 r2 r3} | |
44 | ; r0 = {a3 a2 a1 a0} | |
45 | ; r1 = {b3 b2 b1 b0} | |
46 | ; r2 = {c3 c2 c1 c0} | |
47 | ; r3 = {d3 d2 d1 d0} | |
48 | ; | |
49 | ; output looks like: {t0 r1 r0 r3} | |
50 | ; t0 = {d0 c0 b0 a0} | |
51 | ; r1 = {d1 c1 b1 a1} | |
52 | ; r0 = {d2 c2 b2 a2} | |
53 | ; r3 = {d3 c3 b3 a3} | |
1e59de90 | 54 | ; |
7c673cae FG |
;; TRANSPOSE r0, r1, r2, r3, t0, t1
;; 4x4 dword transpose of {r0..r3} (see diagram above).
;; Outputs land in {t0, r1, r0, r3} = columns {0, 1, 2, 3}.
;; r2 and t1 are clobbered (left holding intermediates).
%macro TRANSPOSE 6
%define %%r0 %1
%define %%r1 %2
%define %%r2 %3
%define %%r3 %4
%define %%t0 %5
%define %%t1 %6
	movaps	%%t0, %%r0		; t0 = {a3 a2 a1 a0}
	shufps	%%t0, %%r1, 0x44	; t0 = {b1 b0 a1 a0} (low halves of r0,r1)
	shufps	%%r0, %%r1, 0xEE	; r0 = {b3 b2 a3 a2} (high halves of r0,r1)

	movaps	%%t1, %%r2		; t1 = {c3 c2 c1 c0}
	shufps	%%t1, %%r3, 0x44	; t1 = {d1 d0 c1 c0} (low halves of r2,r3)
	shufps	%%r2, %%r3, 0xEE	; r2 = {d3 d2 c3 c2} (high halves of r2,r3)

	movaps	%%r1, %%t0		; r1 = {b1 b0 a1 a0}
	shufps	%%r1, %%t1, 0xDD	; r1 = {d1 c1 b1 a1} (odd dwords)

	movaps	%%r3, %%r0		; r3 = {b3 b2 a3 a2}
	shufps	%%r3, %%r2, 0xDD	; r3 = {d3 c3 b3 a3} (odd dwords)

	shufps	%%r0, %%r2, 0x88	; r0 = {d2 c2 b2 a2} (even dwords)
	shufps	%%t0, %%t1, 0x88	; t0 = {d0 c0 b0 a0} (even dwords)
%endmacro
7c673cae FG |
79 | ;; |
80 | ;; Magic functions defined in FIPS 180-1 | |
81 | ;; | |
82 | ; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D))) | |
%macro MAGIC_F0 5
;; SHA-1 Ch function for rounds 0..19: f = d ^ (b & (c ^ d)).
;; Result left in %%f; %%t is accepted but unused.
%define %%f %1
%define %%b %2
%define %%c %3
%define %%d %4
%define %%t %5
	movdqa	%%f, %%d	; f = d
	pxor	%%f, %%c	; f = c ^ d   (xor commutes)
	pand	%%f, %%b	; f = b & (c ^ d)
	pxor	%%f, %%d	; f = d ^ (b & (c ^ d))
%endmacro
94 | ||
95 | ; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D) | |
%macro MAGIC_F1 5
;; SHA-1 Parity function for rounds 20..39: f = b ^ c ^ d.
;; Result left in %%f; %%t is accepted but unused.
%define %%f %1
%define %%b %2
%define %%c %3
%define %%d %4
%define %%t %5
	movdqa	%%f, %%b	; f = b
	pxor	%%f, %%c	; f = b ^ c   (xor is commutative/associative)
	pxor	%%f, %%d	; f = b ^ c ^ d
%endmacro
106 | ||
107 | ; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D)) | |
%macro MAGIC_F2 5
;; SHA-1 Maj function for rounds 40..59:
;;   f = (b & c) | (b & d) | (c & d), computed as ((c | b) & d) | (c & b).
;; Result left in %%f; %%t is clobbered as scratch.
%define %%f %1
%define %%b %2
%define %%c %3
%define %%d %4
%define %%t %5
	movdqa	%%f, %%c	; f = c
	movdqa	%%t, %%c	; t = c
	por	%%f, %%b	; f = b | c   (or commutes)
	pand	%%t, %%b	; t = b & c   (and commutes)
	pand	%%f, %%d	; f = (b | c) & d
	por	%%f, %%t	; f = ((b | c) & d) | (b & c)
%endmacro
121 | ||
122 | ; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D) | |
%macro MAGIC_F3 5
;; Rounds 60..79 reuse the Parity function: f = b ^ c ^ d.
;; Inlined here (identical to MAGIC_F1); %%t is accepted but unused.
%define %%f %1
%define %%b %2
%define %%c %3
%define %%d %4
%define %%t %5
	movdqa	%%f, %%d	; f = d
	pxor	%%f, %%c	; f = c ^ d
	pxor	%%f, %%b	; f = b ^ c ^ d
%endmacro
131 | ||
132 | ; PROLD reg, imm, tmp | |
%macro PROLD 3
;; PROLD reg, imm, tmp
;; Rotate each 32-bit lane of %%r left by the immediate %%n.
;; %%t is clobbered as scratch.
%define %%r %1
%define %%n %2
%define %%t %3
	movdqa	%%t, %%r
	psrld	%%t, (32-%%n)	; t = r >> (32-n)  (the two shifts are independent)
	pslld	%%r, %%n	; r = r << n
	por	%%r, %%t	; r = rol(r, n) per lane
%endmacro
142 | ||
;; SHA1_STEP_00_15 A,B,C,D,E, T,F, memW, immCNT, MAGIC
;; One SHA-1 round for message indexes 0..15, 4 lanes at once:
;;   E += K + W[i] + ROL(A,5) + MAGIC(B,C,D);  B = ROL(B,30)
;; W[i] is read from the stack schedule area at rsp + memW*16.
;; regT and regF are scratch; regF ends up holding MAGIC's result.
%macro SHA1_STEP_00_15 10
%define %%regA %1
%define %%regB %2
%define %%regC %3
%define %%regD %4
%define %%regE %5
%define %%regT %6
%define %%regF %7
%define %%memW %8		; round index i (0..15), selects the W slot
%define %%immCNT %9		; xmm holding round constant K in all lanes
%define %%MAGIC %10		; MAGIC_F0..F3, the round boolean function
	paddd %%regE,%%immCNT			; E += K
	paddd %%regE,[rsp + (%%memW * 16)]	; E += W[i]
	movdqa %%regT,%%regA
	PROLD %%regT,5, %%regF			; T = ROL(A,5); regF used as PROLD scratch
	paddd %%regE,%%regT			; E += ROL(A,5)
	%%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
	PROLD %%regB,30, %%regT			; B = ROL(B,30)
	paddd %%regE,%%regF			; E += MAGIC_Fi(B,C,D)
%endmacro
163 | ||
;; SHA1_STEP_16_79 A,B,C,D,E, T,F, memW, immCNT, MAGIC
;; One SHA-1 round for i = 16..79, 4 lanes at once, computing the
;; message schedule on the fly:
;;   W[i] = ROL1(W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16])
;; W16/W15 are registers caching W[i-16]/W[i-15]; W[i-14], W[i-8] and
;; W[i-3] come from the 16-entry circular buffer on the stack. The new
;; W[i] overwrites stack slot (i & 15) and is added into E.
%macro SHA1_STEP_16_79 10
%define %%regA %1
%define %%regB %2
%define %%regC %3
%define %%regD %4
%define %%regE %5
%define %%regT %6
%define %%regF %7
%define %%memW %8		; round index i, reduced mod 16 for slot access
%define %%immCNT %9		; xmm holding round constant K in all lanes
%define %%MAGIC %10		; MAGIC_F0..F3, the round boolean function
	paddd %%regE,%%immCNT				; E += K
	movdqa W14, [rsp + ((%%memW - 14) & 15) * 16]	; W14 = W[i-14]
	pxor W16, W14					; W16 = W[i-16] ^ W[i-14]
	pxor W16, [rsp + ((%%memW - 8) & 15) * 16]	; ... ^ W[i-8]
	pxor W16, [rsp + ((%%memW - 3) & 15) * 16]	; ... ^ W[i-3]
	movdqa %%regF, W16
	pslld W16, 1
	psrld %%regF, (32-1)
	por %%regF, W16					; regF = ROL1(...) = W[i]
	ROTATE_W					; slide window: W16<-W15, W15<-W14

	movdqa [rsp + ((%%memW - 0) & 15) * 16],%%regF	; store W[i] into slot i&15
	paddd %%regE,%%regF				; E += W[i]
	movdqa %%regT,%%regA
	PROLD %%regT,5, %%regF				; T = ROL(A,5); regF is scratch here
	paddd %%regE,%%regT				; E += ROL(A,5)
	%%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
	PROLD %%regB,30, %%regT				; B = ROL(B,30)
	paddd %%regE,%%regF				; E += MAGIC_Fi(B,C,D)
%endmacro
195 | ||
196 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
197 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
198 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
199 | ||
;; FRAMESZ plus pushes must be an odd multiple of 8 so that rsp stays
;; 16-byte aligned inside the function (rsp % 16 == 8 on entry due to
;; the pushed return address; this function pushes nothing else).
%define XMM_SAVE ((15-15)*16 + 1*8)	; no xmm spill slots; 8-byte alignment pad
%define FRAMESZ 16*16 + XMM_SAVE	; 16 x 16-byte message-schedule slots + pad
%define _XMM FRAMESZ - XMM_SAVE		; offset of the (empty) xmm save area

%define MOVPS movups			; unaligned loads for the input streams

;; Input data pointers, one per lane
%define inp0 r8
%define inp1 r9
%define inp2 r10
%define inp3 r11

%define IDX rax				; byte offset consumed from each input stream

;; SHA-1 working state: one 32-bit digest word per lane, 4 lanes per xmm
%define A xmm0
%define B xmm1
%define C xmm2
%define D xmm3
%define E xmm4
%define F xmm5 ; tmp
%define G xmm6 ; tmp

%define TMP G
%define FUN F
%define K xmm7				; current round constant, broadcast to all lanes

;; Digest values saved at the top of each block, added back afterwards
%define AA xmm8
%define BB xmm9
%define CC xmm10
%define DD xmm11
%define EE xmm12

;; Load/transpose temporaries. NOTE(review): T0..T5 alias G, K and
;; AA..DD; this is safe because the load/transpose phase completes
;; before K is loaded and before the old digests are saved.
%define T0 xmm6
%define T1 xmm7
%define T2 xmm8
%define T3 xmm9
%define T4 xmm10
%define T5 xmm11
238 | ||
;; Rename the working-variable registers after each round so the usual
;; SHA-1 rotation holds without moving data:
;; new {A,B,C,D,E} = old {E,A,B,C,D}.
%macro ROTATE_ARGS 0
%xdefine TMP_ E
%xdefine E D
%xdefine D C
%xdefine C B
%xdefine B A
%xdefine A TMP_
%endm
247 | ||
;; Register window over the message schedule: for round i,
;; W16 caches W[i-16], W15 caches W[i-15], W14 is loaded with W[i-14].
%define W14 xmm13
%define W15 xmm14
%define W16 xmm15

;; Slide the window after each round: W16 <- W15, W15 <- W14; the old
;; W16 register becomes the new W14 (dead, reloaded next round).
%macro ROTATE_W 0
%xdefine TMP_ W16
%xdefine W16 W15
%xdefine W15 W14
%xdefine W14 TMP_
%endm
258 | ||
%define DIGEST_SIZE (4*5*4)	; 4 lanes * 5 words * 4 bytes (not referenced in this chunk)

;; Pick argument registers per the output format's calling convention.
%ifidn __OUTPUT_FORMAT__, elf64
	; Linux (System V AMD64): args in rdi, rsi
 %define ARG1 rdi
 %define ARG2 rsi
%else
	; Windows (Microsoft x64): args in rcx, rdx
 %define ARG1 rcx
 %define ARG2 rdx
%endif
270 | ||
271 | align 32 | |
272 | ||
273 | ; void sha1_mb_x4_sse(SHA1_MB_ARGS_X8 *args, uint32_t size_in_blocks); | |
274 | ; arg 1 : ARG1 : pointer to args (only 4 of the 8 lanes used) | |
275 | ; arg 2 : ARG2 : size (in blocks) ;; assumed to be >= 1 | |
276 | ; | |
277 | ; Clobbers registers: ARG2, rax, r8-r11, xmm0-xmm15 | |
278 | ; | |
mk_global sha1_mb_x4_sse, function, internal
sha1_mb_x4_sse:
	endbranch

	sub	rsp, FRAMESZ	;; FRAMESZ + pushes must be odd multiple of 8

	;; Initialize digests: each xmm holds one digest word for 4 lanes
	movdqa	A, [ARG1 + 0*16]
	movdqa	B, [ARG1 + 1*16]
	movdqa	C, [ARG1 + 2*16]
	movdqa	D, [ARG1 + 3*16]
	movdqa	E, [ARG1 + 4*16]

	;; load input pointers for lanes 0..3
	mov	inp0,[ARG1 + _data_ptr + 0*8]
	mov	inp1,[ARG1 + _data_ptr + 1*8]
	mov	inp2,[ARG1 + _data_ptr + 2*8]
	mov	inp3,[ARG1 + _data_ptr + 3*8]

	xor	IDX, IDX		; IDX = running byte offset into each stream
lloop:
	;; F (xmm5) temporarily holds the byte-swap mask; reloaded every
	;; block because the rounds below reuse xmm5 as scratch.
	movdqa	F, [PSHUFFLE_BYTE_FLIP_MASK]
%assign I 0
%rep 4
	;; Load 16 bytes from each lane and transpose so that T0..T3 end
	;; up holding message words 4*I+0 .. 4*I+3 across the 4 lanes
	;; (TRANSPOSE returns its outputs in {t0, r1, r0, r3} = T0..T3).
	MOVPS	T2,[inp0+IDX]
	MOVPS	T1,[inp1+IDX]
	MOVPS	T4,[inp2+IDX]
	MOVPS	T3,[inp3+IDX]
	TRANSPOSE	T2, T1, T4, T3, T0, T5
	pshufb	T0, F			; byte-swap each dword (big-endian message)
	movdqa	[rsp+(I*4+0)*16],T0	; W[4I+0] for all lanes
	pshufb	T1, F
	movdqa	[rsp+(I*4+1)*16],T1	; W[4I+1]
	pshufb	T2, F
	movdqa	[rsp+(I*4+2)*16],T2	; W[4I+2]
	pshufb	T3, F
	movdqa	[rsp+(I*4+3)*16],T3	; W[4I+3]
	add	IDX, 4*4
%assign I (I+1)
%endrep

	; save old digests (added back in after the 80 rounds)
	movdqa	AA, A
	movdqa	BB, B
	movdqa	CC, C
	movdqa	DD, D
	movdqa	EE, E

	;;
	;; perform 0-79 steps
	;;
	movdqa	K, [K00_19]
	;; do rounds 0...15 (W straight from the transposed block)
%assign I 0
%rep 16
	SHA1_STEP_00_15	A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
	ROTATE_ARGS
%assign I (I+1)
%endrep

	;; do rounds 16...19 (prime the W16/W15 register window first:
	;; W16 = W[0] = W[i-16], W15 = W[1] = W[i-15] at i = 16)
	movdqa	W16, [rsp + ((16 - 16) & 15) * 16]
	movdqa	W15, [rsp + ((16 - 15) & 15) * 16]
%rep 4
	SHA1_STEP_16_79	A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
	ROTATE_ARGS
%assign I (I+1)
%endrep

	;; do rounds 20...39
	movdqa	K, [K20_39]
%rep 20
	SHA1_STEP_16_79	A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1
	ROTATE_ARGS
%assign I (I+1)
%endrep

	;; do rounds 40...59
	movdqa	K, [K40_59]
%rep 20
	SHA1_STEP_16_79	A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2
	ROTATE_ARGS
%assign I (I+1)
%endrep

	;; do rounds 60...79
	movdqa	K, [K60_79]
%rep 20
	SHA1_STEP_16_79	A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3
	ROTATE_ARGS
%assign I (I+1)
%endrep

	;; add saved digests back in to form this block's chaining value
	paddd	A,AA
	paddd	B,BB
	paddd	C,CC
	paddd	D,DD
	paddd	E,EE

	sub	ARG2, 1			; one more 64-byte block consumed per lane
	jne	lloop

	; write out digests
	movdqa	[ARG1 + 0*16], A
	movdqa	[ARG1 + 1*16], B
	movdqa	[ARG1 + 2*16], C
	movdqa	[ARG1 + 3*16], D
	movdqa	[ARG1 + 4*16], E

	; update input pointers, advancing past the bytes consumed
	add	inp0, IDX
	mov	[ARG1 + _data_ptr + 0*8], inp0
	add	inp1, IDX
	mov	[ARG1 + _data_ptr + 1*8], inp1
	add	inp2, IDX
	mov	[ARG1 + _data_ptr + 2*8], inp2
	add	inp3, IDX
	mov	[ARG1 + _data_ptr + 3*8], inp3

	;;;;;;;;;;;;;;;;
	;; Postamble

	add	rsp, FRAMESZ

	ret
404 | ||
405 | ||
section .data align=16

align 16
; pshufb mask that byte-reverses each 32-bit dword (big-endian message
; words -> host order). NOTE(review): read-only; could live in .rodata.
PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
; SHA-1 round constants, each broadcast to all four 32-bit lanes
K00_19:			dq 0x5A8279995A827999, 0x5A8279995A827999	; rounds  0..19
K20_39:			dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1	; rounds 20..39
K40_59:			dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC	; rounds 40..59
K60_79:			dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6	; rounds 60..79