/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
 * SSE3 instruction set extensions introduced in Intel Core Microarchitecture
 * processors. CPUs supporting Intel(R) AVX extensions will get an additional
 * boost.
 *
 * This work was inspired by the vectorized implementation of Dean Gaudet.
 * Additional information on it can be found at:
 *    http://www.arctic.org/~dean/crypto/sha1.html
 *
 * It was improved upon with more efficient vectorization of the message
 * scheduling. This implementation has also been optimized for all current and
 * several future generations of Intel CPUs.
 *
 * See this article for more information about the implementation details:
 *   http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * Copyright (C) 2010, Intel Corp.
 *   Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com>
 *            Ronen Zohar <ronen.zohar@intel.com>
 *
 * Converted to AT&T syntax and adapted for inclusion in the Linux kernel:
 *   Author: Mathias Krause <minipli@googlemail.com>
 */

#include <linux/linkage.h>

#define CTX	%rdi	// arg1
#define BUF	%rsi	// arg2
#define CNT	%rdx	// arg3

#define REG_A	%ecx
#define REG_B	%esi
#define REG_C	%edi
#define REG_D	%r12d
#define REG_E	%edx

#define REG_T1	%eax
#define REG_T2	%ebx

#define K_BASE		%r8
#define HASH_PTR	%r9
#define BUFFER_PTR	%r10
#define BUFFER_END	%r11

#define W_TMP1	%xmm0
#define W_TMP2	%xmm9

#define W0	%xmm1
#define W4	%xmm2
#define W8	%xmm3
#define W12	%xmm4
#define W16	%xmm5
#define W20	%xmm6
#define W24	%xmm7
#define W28	%xmm8

#define XMM_SHUFB_BSWAP	%xmm10

/* we keep a circular window of 16 pre-calculated w[i]+K values (64 bytes)
 * on the stack */
#define WK(t)	(((t) & 15) * 4)(%rsp)
#define W_PRECALC_AHEAD	16

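/* e.g. (illustrative): WK(18) expands to ((18 & 15) * 4)(%rsp) = 8(%rsp),
 * reusing the slot WK(2) wrote -- by round 18 the round-2 value has already
 * been consumed, so the 16-entry window may wrap */
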
/*
 * This macro implements the SHA-1 function's body for a single 64-byte block.
 * param: the function's name
 */
.macro SHA1_VECTOR_ASM  name
	ENTRY(\name)

	push	%rbx
	push	%r12
	push	%rbp
	mov	%rsp, %rbp

	sub	$64, %rsp		# allocate workspace
	and	$~15, %rsp		# align stack to 16 bytes

	mov	CTX, HASH_PTR
	mov	BUF, BUFFER_PTR

	shl	$6, CNT			# multiply by 64
	add	BUF, CNT
	mov	CNT, BUFFER_END

	lea	K_XMM_AR(%rip), K_BASE
	xmm_mov	BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP

	SHA1_PIPELINED_MAIN_BODY

	# cleanup workspace: zero the 64-byte w[i]+K scratch area (8 quadwords)
	mov	$8, %ecx
	mov	%rsp, %rdi
	xor	%eax, %eax
	rep stosq

	mov	%rbp, %rsp		# deallocate workspace
	pop	%rbp
	pop	%r12
	pop	%rbx
	ret

	ENDPROC(\name)
.endm

/*
 * This macro implements 80 rounds of SHA-1 for one 64-byte block
 */
.macro SHA1_PIPELINED_MAIN_BODY
	INIT_REGALLOC

	mov	  (HASH_PTR), A
	mov	 4(HASH_PTR), B
	mov	 8(HASH_PTR), C
	mov	12(HASH_PTR), D
	mov	16(HASH_PTR), E

	.set i, 0
	.rept W_PRECALC_AHEAD	# prime the first 16 w[i]+K values
	W_PRECALC i
	.set i, (i+1)
	.endr

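	/*
	 * Main loop: every RR round below also issues W_PRECALC for round
	 * i + W_PRECALC_AHEAD, keeping the message schedule computed 16
	 * rounds ahead of its consumption.
	 */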
	.align 4
1:
	RR F1,A,B,C,D,E,0
	RR F1,D,E,A,B,C,2
	RR F1,B,C,D,E,A,4
	RR F1,E,A,B,C,D,6
	RR F1,C,D,E,A,B,8

	RR F1,A,B,C,D,E,10
	RR F1,D,E,A,B,C,12
	RR F1,B,C,D,E,A,14
	RR F1,E,A,B,C,D,16
	RR F1,C,D,E,A,B,18

	RR F2,A,B,C,D,E,20
	RR F2,D,E,A,B,C,22
	RR F2,B,C,D,E,A,24
	RR F2,E,A,B,C,D,26
	RR F2,C,D,E,A,B,28

	RR F2,A,B,C,D,E,30
	RR F2,D,E,A,B,C,32
	RR F2,B,C,D,E,A,34
	RR F2,E,A,B,C,D,36
	RR F2,C,D,E,A,B,38

	RR F3,A,B,C,D,E,40
	RR F3,D,E,A,B,C,42
	RR F3,B,C,D,E,A,44
	RR F3,E,A,B,C,D,46
	RR F3,C,D,E,A,B,48

	RR F3,A,B,C,D,E,50
	RR F3,D,E,A,B,C,52
	RR F3,B,C,D,E,A,54
	RR F3,E,A,B,C,D,56
	RR F3,C,D,E,A,B,58

	add	$64, BUFFER_PTR		# move to the next 64-byte block
	cmp	BUFFER_END, BUFFER_PTR	# if the current is the last one use
	cmovae	K_BASE, BUFFER_PTR	# dummy source to avoid buffer overrun
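	# (from round 64 on, W_PRECALC runs for i >= 80, i.e. it starts
	# loading w[0..15] of the *next* block through BUFFER_PTR -- hence
	# the pointer must point at readable memory, real or dummy, by then)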

	RR F4,A,B,C,D,E,60
	RR F4,D,E,A,B,C,62
	RR F4,B,C,D,E,A,64
	RR F4,E,A,B,C,D,66
	RR F4,C,D,E,A,B,68

	RR F4,A,B,C,D,E,70
	RR F4,D,E,A,B,C,72
	RR F4,B,C,D,E,A,74
	RR F4,E,A,B,C,D,76
	RR F4,C,D,E,A,B,78

	UPDATE_HASH   (HASH_PTR), A
	UPDATE_HASH  4(HASH_PTR), B
	UPDATE_HASH  8(HASH_PTR), C
	UPDATE_HASH 12(HASH_PTR), D
	UPDATE_HASH 16(HASH_PTR), E

	RESTORE_RENAMED_REGS
	cmp	K_BASE, BUFFER_PTR	# K_BASE means we reached the end
	jne	1b
.endm

.macro INIT_REGALLOC
	.set A, REG_A
	.set B, REG_B
	.set C, REG_C
	.set D, REG_D
	.set E, REG_E
	.set T1, REG_T1
	.set T2, REG_T2
.endm

.macro RESTORE_RENAMED_REGS
	# order is important (REG_C is where it should be)
	mov	B, REG_B
	mov	D, REG_D
	mov	A, REG_A
	mov	E, REG_E
.endm

.macro SWAP_REG_NAMES  a, b
	.set _T, \a
	.set \a, \b
	.set \b, _T
.endm

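/*
 * SHA-1 round functions (standard definitions):
 *   F1 = Ch(b, c, d)     = (b & c) | (~b & d), computed as ((c ^ d) & b) ^ d
 *   F2 = Parity(b, c, d) = b ^ c ^ d
 *   F3 = Maj(b, c, d)    = (b & c) | (b & d) | (c & d),
 *                          computed as (b & c) | ((b | c) & d)
 *   F4 = Parity again (rounds 60-79)
 * Each leaves its result in T1 and clobbers one source register as scratch;
 * the mov + SWAP_REG_NAMES pair migrates that source's name onto the fresh
 * copy, so no restoring move is needed afterwards.
 */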
.macro F1 b, c, d
	mov	\c, T1
	SWAP_REG_NAMES \c, T1
	xor	\d, T1
	and	\b, T1
	xor	\d, T1
.endm

.macro F2 b, c, d
	mov	\d, T1
	SWAP_REG_NAMES \d, T1
	xor	\c, T1
	xor	\b, T1
.endm

.macro F3 b, c, d
	mov	\c, T1
	SWAP_REG_NAMES \c, T1
	mov	\b, T2
	or	\b, T1
	and	\c, T2
	and	\d, T1
	or	T2, T1
.endm

.macro F4 b, c, d
	F2 \b, \c, \d
.endm

.macro UPDATE_HASH  hash, val
	add	\hash, \val	# val += old state word
	mov	\val, \hash	# write the updated word back
.endm

/*
 * RR does two rounds of SHA-1 back to back with W[] pre-calc
 *   t1 = F(b, c, d);   e += w(i)
 *   e += t1;           b <<= 30;   d  += w(i+1);
 *   t1 = F(a, b, c);
 *   d += t1;           a <<= 5;
 *   e += a;
 *   t1 = e;            a >>= 7;
 *   t1 <<= 5;
 *   d += t1;
 */
.macro RR F, a, b, c, d, e, round
	add	WK(\round), \e
	\F   \b, \c, \d		# t1 = F(b, c, d);
	W_PRECALC (\round + W_PRECALC_AHEAD)
	rol	$30, \b
	add	T1, \e
	add	WK(\round + 1), \d

	\F   \a, \b, \c
	W_PRECALC (\round + W_PRECALC_AHEAD + 1)
	rol	$5, \a
	add	\a, \e
	add	T1, \d
	ror	$7, \a		# ((a <<r 5) >>r 7) => a <<r 30

	mov	\e, T1
	SWAP_REG_NAMES \e, T1

	rol	$5, T1
	add	T1, \d

	# write:  \a, \b
	# rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
.endm

.macro W_PRECALC r
	.set i, \r

	.if (i < 20)
		.set K_XMM, 0
	.elseif (i < 40)
		.set K_XMM, 16
	.elseif (i < 60)
		.set K_XMM, 32
	.elseif (i < 80)
		.set K_XMM, 48
	.endif

	.if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD))))
		.set i, ((\r) % 80)	# pre-compute for the next iteration
		.if (i == 0)
			W_PRECALC_RESET
		.endif
		W_PRECALC_00_15
	.elseif (i < 32)
		W_PRECALC_16_31
	.elseif (i < 80)	// rounds 32-79
		W_PRECALC_32_79
	.endif
.endm

.macro W_PRECALC_RESET
	.set W,          W0
	.set W_minus_04, W4
	.set W_minus_08, W8
	.set W_minus_12, W12
	.set W_minus_16, W16
	.set W_minus_20, W20
	.set W_minus_24, W24
	.set W_minus_28, W28
	.set W_minus_32, W
.endm

.macro W_PRECALC_ROTATE
	.set W_minus_32, W_minus_28
	.set W_minus_28, W_minus_24
	.set W_minus_24, W_minus_20
	.set W_minus_20, W_minus_16
	.set W_minus_16, W_minus_12
	.set W_minus_12, W_minus_08
	.set W_minus_08, W_minus_04
	.set W_minus_04, W
	.set W, W_minus_32
.endm
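
/*
 * Note: W and the W_minus_* names are assembler symbols, not architectural
 * registers -- the eight XMM registers hold the last 32 w[] values in
 * 4-dword chunks, and W_PRECALC_ROTATE slides that window purely by
 * renaming, spending no vector move instructions on it.
 */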

.macro W_PRECALC_SSSE3

.macro W_PRECALC_00_15
	W_PRECALC_00_15_SSSE3
.endm
.macro W_PRECALC_16_31
	W_PRECALC_16_31_SSSE3
.endm
.macro W_PRECALC_32_79
	W_PRECALC_32_79_SSSE3
.endm

/* message scheduling pre-compute for rounds 0-15 */
.macro W_PRECALC_00_15_SSSE3
.if ((i & 3) == 0)
	movdqu	(i*4)(BUFFER_PTR), W_TMP1	# load 16 message bytes
.elseif ((i & 3) == 1)
	pshufb	XMM_SHUFB_BSWAP, W_TMP1		# convert to big-endian words
	movdqa	W_TMP1, W
.elseif ((i & 3) == 2)
	paddd	(K_BASE), W_TMP1		# w[i] + K1
.elseif ((i & 3) == 3)
	movdqa	W_TMP1, WK(i&~3)		# spill w[i..i+3]+K to the stack
	W_PRECALC_ROTATE
.endif
.endm

/* message scheduling pre-compute for rounds 16-31
 *
 * - keep the last 32 w[i] values in 8 XMM registers
 * - pre-calculate the K+w[i] values and store them to memory, for a later
 *   load by a scalar ALU add instruction
 *
 * rounds 16-31 need some vectorization "heavy lifting" because of the
 * w[i] -> w[i-3] dependency; it disappears for rounds 32-79 (see below)
 */
.macro W_PRECALC_16_31_SSSE3
	# blended scheduling of vector and scalar instruction streams, one 4-wide
	# vector iteration / 4 scalar rounds
.if ((i & 3) == 0)
	movdqa	W_minus_12, W
	palignr	$8, W_minus_16, W	# w[i-14]
	movdqa	W_minus_04, W_TMP1
	psrldq	$4, W_TMP1		# w[i-3]
	pxor	W_minus_08, W
.elseif ((i & 3) == 1)
	pxor	W_minus_16, W_TMP1
	pxor	W_TMP1, W
	movdqa	W, W_TMP2
	movdqa	W, W_TMP1
	pslldq	$12, W_TMP2
.elseif ((i & 3) == 2)
	psrld	$31, W
	pslld	$1, W_TMP1
	por	W, W_TMP1
	movdqa	W_TMP2, W
	psrld	$30, W_TMP2
	pslld	$2, W
.elseif ((i & 3) == 3)
	pxor	W, W_TMP1
	pxor	W_TMP2, W_TMP1
	movdqa	W_TMP1, W
	paddd	K_XMM(K_BASE), W_TMP1
	movdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
.endif
.endm

/* message scheduling pre-compute for rounds 32-79
 *
 * in the SHA-1 specification: w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
 * instead we use the equivalent: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
 * which allows more efficient vectorization, since the w[i] -> w[i-3]
 * dependency is broken
 */
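/*
 * derivation sketch (valid for i >= 32): expanding each term of the original
 * recurrence once more gives
 *   w[i-3]  = (w[i-6]  ^ w[i-11] ^ w[i-17] ^ w[i-19]) rol 1
 *   w[i-8]  = (w[i-11] ^ w[i-16] ^ w[i-22] ^ w[i-24]) rol 1
 *   w[i-14] = (w[i-17] ^ w[i-22] ^ w[i-28] ^ w[i-30]) rol 1
 *   w[i-16] = (w[i-19] ^ w[i-24] ^ w[i-30] ^ w[i-32]) rol 1
 * XORing the four lines cancels every index that appears twice, leaving
 * w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32], and the two rol 1 combine to rol 2.
 */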
.macro W_PRECALC_32_79_SSSE3
.if ((i & 3) == 0)
	movdqa	W_minus_04, W_TMP1
	pxor	W_minus_28, W		# W is W_minus_32 before xor
	palignr	$8, W_minus_08, W_TMP1
.elseif ((i & 3) == 1)
	pxor	W_minus_16, W
	pxor	W_TMP1, W
	movdqa	W, W_TMP1
.elseif ((i & 3) == 2)
	psrld	$30, W
	pslld	$2, W_TMP1
	por	W, W_TMP1
.elseif ((i & 3) == 3)
	movdqa	W_TMP1, W
	paddd	K_XMM(K_BASE), W_TMP1
	movdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
.endif
.endm

.endm		// W_PRECALC_SSSE3

#define K1	0x5a827999
#define K2	0x6ed9eba1
#define K3	0x8f1bbcdc
#define K4	0xca62c1d6
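/* the standard SHA-1 round constants: floor(2^30 * sqrt(n)) for
 * n = 2, 3, 5 and 10 respectively */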

.section .rodata
.align 16

K_XMM_AR:
	.long K1, K1, K1, K1
	.long K2, K2, K2, K2
	.long K3, K3, K3, K3
	.long K4, K4, K4, K4

BSWAP_SHUFB_CTL:
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f
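/* pshufb control mask: reverses the byte order within each 32-bit word, i.e.
 * converts the little-endian loads to SHA-1's big-endian word order */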

.section .text

W_PRECALC_SSSE3
/* xmm_mov abstracts the load instruction so the shared SHA1_VECTOR_ASM body
 * serves both builds; it is re-defined to vmovdqu for the AVX version below */
.macro xmm_mov a, b
	movdqu	\a,\b
.endm

/* SSSE3 optimized implementation:
 * extern "C" void sha1_transform_ssse3(u32 *digest, const char *data, u32 *ws,
 *					unsigned int rounds);
 */
SHA1_VECTOR_ASM	sha1_transform_ssse3
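/* note: as used here, the third register argument (CNT/%rdx) is treated as a
 * count of 64-byte input blocks -- SHA1_VECTOR_ASM multiplies it by 64 to
 * compute BUFFER_END */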

#ifdef CONFIG_AS_AVX

.macro W_PRECALC_AVX

.purgem W_PRECALC_00_15
.macro  W_PRECALC_00_15
	W_PRECALC_00_15_AVX
.endm
.purgem W_PRECALC_16_31
.macro  W_PRECALC_16_31
	W_PRECALC_16_31_AVX
.endm
.purgem W_PRECALC_32_79
.macro  W_PRECALC_32_79
	W_PRECALC_32_79_AVX
.endm

.macro W_PRECALC_00_15_AVX
.if ((i & 3) == 0)
	vmovdqu	(i*4)(BUFFER_PTR), W_TMP1
.elseif ((i & 3) == 1)
	vpshufb	XMM_SHUFB_BSWAP, W_TMP1, W
.elseif ((i & 3) == 2)
	vpaddd	(K_BASE), W, W_TMP1
.elseif ((i & 3) == 3)
	vmovdqa	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
.endif
.endm

.macro W_PRECALC_16_31_AVX
.if ((i & 3) == 0)
	vpalignr $8, W_minus_16, W_minus_12, W	# w[i-14]
	vpsrldq	$4, W_minus_04, W_TMP1		# w[i-3]
	vpxor	W_minus_08, W, W
	vpxor	W_minus_16, W_TMP1, W_TMP1
.elseif ((i & 3) == 1)
	vpxor	W_TMP1, W, W
	vpslldq	$12, W, W_TMP2
	vpslld	$1, W, W_TMP1
.elseif ((i & 3) == 2)
	vpsrld	$31, W, W
	vpor	W, W_TMP1, W_TMP1
	vpslld	$2, W_TMP2, W
	vpsrld	$30, W_TMP2, W_TMP2
.elseif ((i & 3) == 3)
	vpxor	W, W_TMP1, W_TMP1
	vpxor	W_TMP2, W_TMP1, W
	vpaddd	K_XMM(K_BASE), W, W_TMP1
	vmovdqu	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
.endif
.endm

.macro W_PRECALC_32_79_AVX
.if ((i & 3) == 0)
	vpalignr $8, W_minus_08, W_minus_04, W_TMP1
	vpxor	W_minus_28, W, W	# W is W_minus_32 before xor
.elseif ((i & 3) == 1)
	vpxor	W_minus_16, W_TMP1, W_TMP1
	vpxor	W_TMP1, W, W
.elseif ((i & 3) == 2)
	vpslld	$2, W, W_TMP1
	vpsrld	$30, W, W
	vpor	W, W_TMP1, W
.elseif ((i & 3) == 3)
	vpaddd	K_XMM(K_BASE), W, W_TMP1
	vmovdqu	W_TMP1, WK(i&~3)
	W_PRECALC_ROTATE
.endif
.endm

.endm		// W_PRECALC_AVX
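
/*
 * The AVX variants are shorter than their SSSE3 counterparts mainly because
 * the VEX three-operand forms (with a separate destination operand) make the
 * destructive-source movdqa copies of the SSSE3 code unnecessary.
 */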

W_PRECALC_AVX
.purgem xmm_mov
.macro xmm_mov a, b
	vmovdqu	\a,\b
.endm


/* AVX optimized implementation:
 * extern "C" void sha1_transform_avx(u32 *digest, const char *data, u32 *ws,
 *				      unsigned int rounds);
 */
SHA1_VECTOR_ASM	sha1_transform_avx

#endif