]>
Commit | Line | Data |
---|---|---|
11fdf7f2 TL |
1 | ;; |
2 | ;; Copyright (c) 2012-2018, Intel Corporation | |
3 | ;; | |
4 | ;; Redistribution and use in source and binary forms, with or without | |
5 | ;; modification, are permitted provided that the following conditions are met: | |
6 | ;; | |
7 | ;; * Redistributions of source code must retain the above copyright notice, | |
8 | ;; this list of conditions and the following disclaimer. | |
9 | ;; * Redistributions in binary form must reproduce the above copyright | |
10 | ;; notice, this list of conditions and the following disclaimer in the | |
11 | ;; documentation and/or other materials provided with the distribution. | |
12 | ;; * Neither the name of Intel Corporation nor the names of its contributors | |
13 | ;; may be used to endorse or promote products derived from this software | |
14 | ;; without specific prior written permission. | |
15 | ;; | |
16 | ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
17 | ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
18 | ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
19 | ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE | |
20 | ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
21 | ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |
22 | ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |
23 | ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
24 | ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
25 | ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
26 | ;; | |
27 | ||
f67539c2 | 28 | %include "include/os.asm" |
11fdf7f2 TL |
29 | |
30 | ;%define DO_DBGPRINT | |
f67539c2 | 31 | %include "include/dbgprint.asm" |
11fdf7f2 TL |
32 | |
33 | %include "mb_mgr_datastruct.asm" | |
34 | ||
;; Constant tables used by the SHA-1 kernel below.
;; None of these are ever written, so they live in the read-only data
;; section (.rodata for ELF, .rdata for Windows COFF) instead of writable
;; .data. All labels are file-local; every access is RIP-relative.
%ifdef LINUX
section .rodata
%else
section .rdata
%endif
default rel
align 16
;; pshufb control mask: reverses the byte order inside each 32-bit lane
PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203
	dq 0x0405060700010203, 0x0c0d0e0f08090a0b
;; SHA-1 round constants, each replicated into all four 32-bit lanes
K00_19:                  ;ddq 0x5A8279995A8279995A8279995A827999
	dq 0x5A8279995A827999, 0x5A8279995A827999
K20_39:                  ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
	dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
K40_59:                  ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
	dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
K60_79:                  ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
	dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
48 | ||
49 | section .text | |
50 | ||
51 | ;; code to compute quad SHA1 using SSE | |
52 | ;; derived from ...\sha1_multiple\sha1_quad4.asm | |
53 | ;; variation of sha1_mult2.asm : clobbers all xmm regs, rcx left intact | |
54 | ;; rbx, rsi, rdi, rbp, r12-r15 left intact | |
55 | ;; This version is not safe to call from C/C++ | |
56 | ||
57 | ;; Stack must be aligned to 16 bytes before call | |
58 | ;; Windows clobbers: rax rdx r8 r9 r10 r11 | |
59 | ;; Windows preserves: rbx rcx rsi rdi rbp r12 r13 r14 r15 | |
60 | ;; | |
61 | ;; Linux clobbers: rax rsi r8 r9 r10 r11 | |
62 | ;; Linux preserves: rbx rcx rdx rdi rbp r12 r13 r14 r15 | |
63 | ;; | |
64 | ;; clobbers xmm0-15 | |
65 | ||
;; TRANSPOSE r0, r1, r2, r3, t0, t1
;;
;; Transposes the 4x4 matrix of 32-bit words held in {r0..r3}, using the
;; two scratch registers {t0, t1}.
;;
;; On entry:
;;   r0 = {a3 a2 a1 a0}
;;   r1 = {b3 b2 b1 b0}
;;   r2 = {c3 c2 c1 c0}
;;   r3 = {d3 d2 d1 d0}
;;
;; On exit the transposed rows are found in {t0 r1 r0 r3}:
;;   t0 = {d0 c0 b0 a0}
;;   r1 = {d1 c1 b1 a1}
;;   r0 = {d2 c2 b2 a2}
;;   r3 = {d3 c3 b3 a3}
;; r2 and t1 are left holding intermediate values.
;;
%macro TRANSPOSE 6
%define %%row0	%1
%define %%row1	%2
%define %%row2	%3
%define %%row3	%4
%define %%tmp0	%5
%define %%tmp1	%6
	;; pair up the low / high 64-bit halves of rows a and b
	movaps	%%tmp0, %%row0		; tmp0 = {a3 a2 a1 a0}
	shufps	%%tmp0, %%row1, 0x44	; tmp0 = {b1 b0 a1 a0}
	shufps	%%row0, %%row1, 0xEE	; row0 = {b3 b2 a3 a2}

	;; same for rows c and d
	movaps	%%tmp1, %%row2		; tmp1 = {c3 c2 c1 c0}
	shufps	%%tmp1, %%row3, 0x44	; tmp1 = {d1 d0 c1 c0}
	shufps	%%row2, %%row3, 0xEE	; row2 = {d3 d2 c3 c2}

	;; odd-indexed words -> columns 1 and 3
	movaps	%%row1, %%tmp0		; row1 = {b1 b0 a1 a0}
	shufps	%%row1, %%tmp1, 0xDD	; row1 = {d1 c1 b1 a1}

	movaps	%%row3, %%row0		; row3 = {b3 b2 a3 a2}
	shufps	%%row3, %%row2, 0xDD	; row3 = {d3 c3 b3 a3}

	;; even-indexed words -> columns 2 and 0
	shufps	%%row0, %%row2, 0x88	; row0 = {d2 c2 b2 a2}
	shufps	%%tmp0, %%tmp1, 0x88	; tmp0 = {d0 c0 b0 a0}
%endmacro
;;
;; Magic functions defined in FIPS 180-1
;;
;; MAGIC_F0 F,B,C,D,T
;;   F = D ^ (B & (C ^ D))
;; B, C and D are only read. T is accepted so that all MAGIC_Fx macros share
;; one signature; this variant does not touch it.
%macro MAGIC_F0 5
%define %%dst	%1
%define %%inB	%2
%define %%inC	%3
%define %%inD	%4
	movdqa	%%dst, %%inC		; dst = C
	pxor	%%dst, %%inD		; dst = C ^ D
	pand	%%dst, %%inB		; dst = B & (C ^ D)
	pxor	%%dst, %%inD		; dst = D ^ (B & (C ^ D))
%endmacro
119 | ||
;; MAGIC_F1 F,B,C,D,T
;;   F = B ^ C ^ D
;; B, C and D are only read. T is part of the common MAGIC_Fx signature and
;; is not touched here.
%macro MAGIC_F1 5
%define %%dst	%1
%define %%inB	%2
%define %%inC	%3
%define %%inD	%4
	movdqa	%%dst, %%inD		; dst = D
	pxor	%%dst, %%inC		; dst = D ^ C
	pxor	%%dst, %%inB		; dst = D ^ C ^ B
%endmacro
131 | ||
;; MAGIC_F2 F,B,C,D,T
;;   F = (B & C) | (B & D) | (C & D)
;; computed as ((B | C) & D) | (B & C). B, C and D are only read;
;; T is overwritten (it ends up holding B & C).
%macro MAGIC_F2 5
%define %%dst	%1
%define %%inB	%2
%define %%inC	%3
%define %%inD	%4
%define %%aux	%5
	movdqa	%%dst, %%inB		; dst = B
	movdqa	%%aux, %%inB		; aux = B
	por	%%dst, %%inC		; dst = B | C
	pand	%%aux, %%inC		; aux = B & C
	pand	%%dst, %%inD		; dst = (B | C) & D
	por	%%dst, %%aux		; dst = ((B | C) & D) | (B & C)
%endmacro
146 | ||
;; MAGIC_F3 F,B,C,D,T
;;   F = B ^ C ^ D
;; Same parity function as MAGIC_F1, written out directly.
;; B, C and D are only read; T is not touched.
%macro MAGIC_F3 5
%define %%dst	%1
%define %%inB	%2
%define %%inC	%3
%define %%inD	%4
	movdqa	%%dst, %%inD		; dst = D
	pxor	%%dst, %%inC		; dst = D ^ C
	pxor	%%dst, %%inB		; dst = D ^ C ^ B
%endmacro
156 | ||
;; PROLD reg, imm, tmp
;; Rotates every 32-bit lane of reg left by imm bits (imm is an assembly-time
;; constant). tmp is clobbered.
%macro PROLD 3
%define %%val	%1
%define %%cnt	%2
%define %%spare	%3
	movdqa	%%spare, %%val		; keep a copy for the wrapped-around bits
	pslld	%%val, %%cnt		; val   = x << cnt
	psrld	%%spare, (32-%%cnt)	; spare = x >> (32 - cnt)
	por	%%val, %%spare		; val   = rotl32(x, cnt)
%endmacro
167 | ||
;; SHA1_STEP_00_15 A,B,C,D,E, T,F, W_IDX, K, MAGIC_FN
;;
;; One SHA-1 step whose message word is read straight from the stack buffer:
;;   E += K + [rsp + W_IDX*16] + rotl(A,5) + MAGIC_FN(B,C,D)
;;   B  = rotl(B,30)
;; A, C and D are only read. T and F are scratch (F ends up holding the
;; MAGIC_FN result). The working-variable names are rotated by the caller
;; (ROTATE_ARGS) after each step.
%macro SHA1_STEP_00_15 10
%define %%stA		%1
%define %%stB		%2
%define %%stC		%3
%define %%stD		%4
%define %%stE		%5
%define %%scratch	%6
%define %%fval		%7
%define %%widx		%8
%define %%kreg		%9
%define %%round_fn	%10
	paddd	%%stE, %%kreg				; E += K
	paddd	%%stE, [rsp + (%%widx * 16)]		; E += W[widx]
	movdqa	%%scratch, %%stA
	PROLD	%%scratch, 5, %%fval			; scratch = rotl(A, 5)
	paddd	%%stE, %%scratch			; E += rotl(A, 5)
	%%round_fn	%%fval, %%stB, %%stC, %%stD, %%scratch	;; fval = MAGIC_Fi(B,C,D)
	PROLD	%%stB, 30, %%scratch			; B = rotl(B, 30)
	paddd	%%stE, %%fval				; E += fval
%endmacro
188 | ||
;; SHA1_STEP_16_79 A,B,C,D,E, T,F, W_IDX, K, MAGIC_FN
;;
;; One SHA-1 step that first extends the message schedule in place.
;; The 16 most recent schedule words live in a circular buffer at [rsp]
;; (slot = index & 15). The register currently named W16 is XORed with the
;; words at W_IDX-14, W_IDX-8 and W_IDX-3 and rotated left by one; the result
;; is stored to slot W_IDX & 15 and used as this step's message word:
;;   E += K + W[W_IDX] + rotl(A,5) + MAGIC_FN(B,C,D)
;;   B  = rotl(B,30)
;; Expects W16 (and W15, for the following step) to be preloaded by the
;; caller; W14 is loaded here. ROTATE_W then renames the three W registers
;; for the next step. T and F are scratch.
%macro SHA1_STEP_16_79 10
%define %%stA		%1
%define %%stB		%2
%define %%stC		%3
%define %%stD		%4
%define %%stE		%5
%define %%scratch	%6
%define %%fval		%7
%define %%widx		%8
%define %%kreg		%9
%define %%round_fn	%10
	paddd	%%stE, %%kreg				; E += K

	;; schedule: W16 ^= W[i-14] ^ W[i-8] ^ W[i-3]
	movdqa	W14, [rsp + ((%%widx - 14) & 15) * 16]
	pxor	W16, W14
	pxor	W16, [rsp + ((%%widx - 8) & 15) * 16]
	pxor	W16, [rsp + ((%%widx - 3) & 15) * 16]

	;; fval = rotl(W16, 1)
	movdqa	%%fval, W16
	pslld	W16, 1
	psrld	%%fval, (32-1)
	por	%%fval, W16
	ROTATE_W

	movdqa	[rsp + ((%%widx - 0) & 15) * 16], %%fval	; save the new W[i]
	paddd	%%stE, %%fval				; E += W[i]
	movdqa	%%scratch, %%stA
	PROLD	%%scratch, 5, %%fval			; scratch = rotl(A, 5)
	paddd	%%stE, %%scratch			; E += rotl(A, 5)
	%%round_fn	%%fval, %%stB, %%stC, %%stD, %%scratch	;; fval = MAGIC_Fi(B,C,D)
	PROLD	%%stB, 30, %%scratch			; B = rotl(B, 30)
	paddd	%%stE, %%fval				; E += fval
%endmacro
220 | ||
221 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
222 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
223 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
224 | ||
;; Stack frame: 16 slots of 16 bytes (the message-schedule buffer addressed
;; off rsp) plus 8 bytes of padding.
;; FRAMESZ must be an odd multiple of 8: the stack is 16-byte aligned before
;; the call, so rsp is 8 off on entry, and subtracting an odd multiple of 8
;; makes rsp 16-byte aligned again for the movdqa accesses to [rsp + n*16].
;; The expression is parenthesized so it stays intact in any context
;; (e.g. "-FRAMESZ" or "2*FRAMESZ").
%define FRAMESZ	(16*16 + 8)

;; Load instruction used for the input message data (no alignment assumed)
%define MOVPS	movdqu

;; Argument registers: selected by the LINUX build define
%ifdef LINUX
%define arg1	rdi
%define arg2	rsi
%else
%define arg1	rcx
%define arg2	rdx
%endif

;; Per-lane input data pointers
%define inp0	r8
%define inp1	r9
%define inp2	r10
%define inp3	r11

;; Byte offset into the input buffers, shared by all four lanes
%define IDX	rax

;; SHA-1 working variables; each register holds one variable for 4 lanes
%define A	xmm0
%define B	xmm1
%define C	xmm2
%define D	xmm3
%define E	xmm4
%define F	xmm5 ; tmp
%define G	xmm6 ; tmp

%define TMP	G
%define FUN	F
%define K	xmm7

;; Copies of the digest taken before the 80 rounds of a block
%define AA	xmm8
%define BB	xmm9
%define CC	xmm10
%define DD	xmm11
%define EE	xmm12

;; Temporaries for the load/transpose phase.
;; NOTE: these alias other names (T0 = G/TMP, T1 = K, T2..T5 = AA..DD);
;; the load phase finishes before K and AA..DD are written.
%define T0	xmm6
%define T1	xmm7
%define T2	xmm8
%define T3	xmm9
%define T4	xmm10
%define T5	xmm11

;; Message-schedule registers, renamed after every step by ROTATE_W
%define W14	xmm13
%define W15	xmm14
%define W16	xmm15
273 | ||
;; ROTATE_ARGS
;; Renames the five working variables at assembly time (no instructions are
;; emitted): new A = old E, new B = old A, new C = old B, new D = old C,
;; new E = old D. TMP_ is a preprocessor scratch symbol.
%macro ROTATE_ARGS 0
%xdefine TMP_	E
%xdefine E	D
%xdefine D	C
%xdefine C	B
%xdefine B	A
%xdefine A	TMP_
%endmacro
282 | ||
;; ROTATE_W
;; Renames the three message-schedule registers at assembly time (no
;; instructions are emitted): new W16 = old W15, new W15 = old W14,
;; new W14 = old W16. TMP_ is a preprocessor scratch symbol.
%macro ROTATE_W 0
%xdefine TMP_	W16
%xdefine W16	W15
%xdefine W15	W14
%xdefine W14	TMP_
%endmacro
289 | ||
align 32

;; void sha1_mult_sse(SHA1_ARGS *args, UINT32 size_in_blocks);
;;   arg1 : pointer to args           (rdi if LINUX is defined, else rcx)
;;   arg2 : size (in blocks), assumed to be >= 1
;;                                    (rsi if LINUX is defined, else rdx)
;;
;; Runs SHA-1 over four input buffers at once, one buffer per 32-bit lane.
;; Each loop iteration consumes 64 bytes from each of the four data pointers.
;; On return the five digest rows and the four data pointers inside *args
;; have been updated.
;;
;; Clobbers: rax, arg2, r8-r11, flags and all XMM registers. XMM registers
;; are not saved here; saving/restoring must be done at a higher level.
;; Uses FRAMESZ bytes of stack.
MKGLOBAL(sha1_mult_sse,function,internal)
sha1_mult_sse:

	sub	rsp, FRAMESZ

	;; Fetch the current digest rows (one working variable per row)
	movdqa	A, [arg1 + 0*SHA1_DIGEST_ROW_SIZE]
	movdqa	B, [arg1 + 1*SHA1_DIGEST_ROW_SIZE]
	movdqa	C, [arg1 + 2*SHA1_DIGEST_ROW_SIZE]
	movdqa	D, [arg1 + 3*SHA1_DIGEST_ROW_SIZE]
	movdqa	E, [arg1 + 4*SHA1_DIGEST_ROW_SIZE]
	DBGPRINTL_XMM "Sha1-SSE Incoming transposed digest", A, B, C, D, E

	;; Fetch the four per-lane input pointers
	mov	inp0, [arg1 + _data_ptr_sha1 + 0*PTR_SZ]
	mov	inp1, [arg1 + _data_ptr_sha1 + 1*PTR_SZ]
	mov	inp2, [arg1 + _data_ptr_sha1 + 2*PTR_SZ]
	mov	inp3, [arg1 + _data_ptr_sha1 + 3*PTR_SZ]
	DBGPRINTL64 "Sha1-SSE Incoming data ptrs", inp0, inp1, inp2, inp3

	xor	IDX, IDX		; byte offset into every input buffer

lloop:
	;; ---- one 64-byte block per lane per iteration ----

	;; F is reused as a temporary by the round macros, so the byte-swap
	;; mask has to be reloaded for every block.
	movdqa	F, [rel PSHUFFLE_BYTE_FLIP_MASK]

	;; Load 16 bytes from each lane, transpose so that each register holds
	;; the same message word for all four lanes, byte-swap, and store into
	;; the schedule buffer. Four passes fill all 16 slots.
%assign I 0
%rep 4
	MOVPS	T2, [inp0+IDX]
	MOVPS	T1, [inp1+IDX]
	MOVPS	T4, [inp2+IDX]
	MOVPS	T3, [inp3+IDX]
	;; argument order is chosen so the transposed rows land in T0..T3
	TRANSPOSE	T2, T1, T4, T3, T0, T5
	DBGPRINTL_XMM "sha1 incoming data", T0, T1, T2, T3
	pshufb	T0, F
	movdqa	[rsp+(I*4+0)*16], T0
	pshufb	T1, F
	movdqa	[rsp+(I*4+1)*16], T1
	pshufb	T2, F
	movdqa	[rsp+(I*4+2)*16], T2
	pshufb	T3, F
	movdqa	[rsp+(I*4+3)*16], T3
	add	IDX, 4*4
%assign I (I+1)
%endrep

	;; Keep the incoming digest for the feed-forward addition below
	movdqa	AA, A
	movdqa	BB, B
	movdqa	CC, C
	movdqa	DD, D
	movdqa	EE, E

	;;
	;; Steps 0..79
	;;

	;; steps 0..15: message words come straight from the buffer
	movdqa	K, [rel K00_19]
%assign I 0
%rep 16
	SHA1_STEP_00_15	A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
	ROTATE_ARGS
%assign I (I+1)
%endrep

	;; steps 16..19: same K and round function, schedule now extended
	;; on the fly. Prime W16/W15 for the first extended step.
	movdqa	W16, [rsp + ((16 - 16) & 15) * 16]
	movdqa	W15, [rsp + ((16 - 15) & 15) * 16]
%rep 4
	SHA1_STEP_16_79	A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
	ROTATE_ARGS
%assign I (I+1)
%endrep

	;; steps 20..39
	movdqa	K, [rel K20_39]
%rep 20
	SHA1_STEP_16_79	A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1
	ROTATE_ARGS
%assign I (I+1)
%endrep

	;; steps 40..59
	movdqa	K, [rel K40_59]
%rep 20
	SHA1_STEP_16_79	A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2
	ROTATE_ARGS
%assign I (I+1)
%endrep

	;; steps 60..79
	movdqa	K, [rel K60_79]
%rep 20
	SHA1_STEP_16_79	A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3
	ROTATE_ARGS
%assign I (I+1)
%endrep

	;; Feed-forward: add the digest saved before the rounds
	paddd	A, AA
	paddd	B, BB
	paddd	C, CC
	paddd	D, DD
	paddd	E, EE

	;; Next block, if any remain
	sub	arg2, 1
	jne	lloop

	;; Store the updated digest rows
	movdqa	[arg1 + 0*SHA1_DIGEST_ROW_SIZE], A
	movdqa	[arg1 + 1*SHA1_DIGEST_ROW_SIZE], B
	movdqa	[arg1 + 2*SHA1_DIGEST_ROW_SIZE], C
	movdqa	[arg1 + 3*SHA1_DIGEST_ROW_SIZE], D
	movdqa	[arg1 + 4*SHA1_DIGEST_ROW_SIZE], E
	DBGPRINTL_XMM "Sha1 Outgoing transposed digest", A, B, C, D, E

	;; Advance every input pointer by the number of bytes consumed,
	;; then write the new pointers back
	add	inp0, IDX
	add	inp1, IDX
	add	inp2, IDX
	add	inp3, IDX
	mov	[arg1 + _data_ptr_sha1 + 0*PTR_SZ], inp0
	mov	[arg1 + _data_ptr_sha1 + 1*PTR_SZ], inp1
	mov	[arg1 + _data_ptr_sha1 + 2*PTR_SZ], inp2
	mov	[arg1 + _data_ptr_sha1 + 3*PTR_SZ], inp3
	DBGPRINTL64 "Sha1-sse outgoing data ptrs", inp0, inp1, inp2, inp3

	;;;;;;;;;;;;;;;;
	;; Postamble

	;; With SAFE_DATA, zero the 16 schedule slots (16*16 bytes) so no
	;; message-derived data is left on the stack
%ifdef SAFE_DATA
	pxor	xmm0, xmm0
%assign i 0
%rep 16
	movdqa	[rsp + i*16], xmm0
%assign i (i+1)
%endrep
%endif

	add	rsp, FRAMESZ

	ret
432 | ||
;; On Linux builds, emit an empty, non-executable .note.GNU-stack section.
;; By GNU toolchain convention this marks the object as not needing an
;; executable stack.
433 | %ifdef LINUX | |
434 | section .note.GNU-stack noalloc noexec nowrite progbits | |
435 | %endif |