/*
 * Implement fast SHA-1 with AVX2 instructions. (x86_64)
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 * Ilya Albrekht <ilya.albrekht@intel.com>
 * Maxim Locktyukhin <maxim.locktyukhin@intel.com>
 * Ronen Zohar <ronen.zohar@intel.com>
 * Chandramouli Narayanan <mouli@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 * Neither the name of Intel Corporation nor the names of its
 * contributors may be used to endorse or promote products derived
 * from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

/*
 * SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
 *
 * This implementation is based on the previous SSSE3 release:
 * visit http://software.intel.com/en-us/articles/ and refer to
 * improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * Updates the 20-byte SHA-1 record in 'hash' for an even number
 * ('num_blocks') of consecutive 64-byte blocks.
 *
 * extern "C" void sha1_transform_avx2(
 *        int *hash, const char *input, size_t num_blocks );
 */
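
/*
 * A minimal sketch of a hypothetical C-side caller (illustrative only,
 * not the actual kernel glue code):
 *
 *        u32 digest[5] = { 0x67452301, 0xefcdab89, 0x98badcfe,
 *                          0x10325476, 0xc3d2e1f0 };  /- SHA-1 IV -/
 *        sha1_transform_avx2((int *)digest, data, nblocks);
 *
 * The routine reads 'nblocks' consecutive 64-byte blocks from 'data'
 * and updates the five 32-bit state words in place.
 */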
71 | ||
72 | #include <linux/linkage.h> | |
73 | ||
74 | #define CTX %rdi /* arg1 */ | |
75 | #define BUF %rsi /* arg2 */ | |
76 | #define CNT %rdx /* arg3 */ | |
77 | ||
78 | #define REG_A %ecx | |
79 | #define REG_B %esi | |
80 | #define REG_C %edi | |
81 | #define REG_D %eax | |
82 | #define REG_E %edx | |
83 | #define REG_TB %ebx | |
84 | #define REG_TA %r12d | |
85 | #define REG_RA %rcx | |
86 | #define REG_RB %rsi | |
87 | #define REG_RC %rdi | |
88 | #define REG_RD %rax | |
89 | #define REG_RE %rdx | |
90 | #define REG_RTA %r12 | |
91 | #define REG_RTB %rbx | |
92 | #define REG_T1 %ebp | |
93 | #define xmm_mov vmovups | |
94 | #define avx2_zeroupper vzeroupper | |
95 | #define RND_F1 1 | |
96 | #define RND_F2 2 | |
97 | #define RND_F3 3 | |
98 | ||
99 | .macro REGALLOC | |
100 | .set A, REG_A | |
101 | .set B, REG_B | |
102 | .set C, REG_C | |
103 | .set D, REG_D | |
104 | .set E, REG_E | |
105 | .set TB, REG_TB | |
106 | .set TA, REG_TA | |
107 | ||
108 | .set RA, REG_RA | |
109 | .set RB, REG_RB | |
110 | .set RC, REG_RC | |
111 | .set RD, REG_RD | |
112 | .set RE, REG_RE | |
113 | ||
114 | .set RTA, REG_RTA | |
115 | .set RTB, REG_RTB | |
116 | ||
117 | .set T1, REG_T1 | |
118 | .endm | |
119 | ||
120 | #define HASH_PTR %r9 | |
121 | #define BLOCKS_CTR %r8 | |
122 | #define BUFFER_PTR %r10 | |
123 | #define BUFFER_PTR2 %r13 | |
124 | ||
125 | #define PRECALC_BUF %r14 | |
126 | #define WK_BUF %r15 | |
127 | ||
128 | #define W_TMP %xmm0 | |
129 | #define WY_TMP %ymm0 | |
130 | #define WY_TMP2 %ymm9 | |
131 | ||
132 | # AVX2 variables | |
133 | #define WY0 %ymm3 | |
134 | #define WY4 %ymm5 | |
135 | #define WY08 %ymm7 | |
136 | #define WY12 %ymm8 | |
137 | #define WY16 %ymm12 | |
138 | #define WY20 %ymm13 | |
139 | #define WY24 %ymm14 | |
140 | #define WY28 %ymm15 | |
141 | ||
142 | #define YMM_SHUFB_BSWAP %ymm10 | |
143 | ||
144 | /* | |
145 | * Keep 2 iterations precalculated at a time: | |
146 | * - 80 DWORDs per iteration * 2 | |
147 | */ | |
148 | #define W_SIZE (80*2*2 +16) | |
149 | ||
150 | #define WK(t) ((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF) | |
151 | #define PRECALC_WK(t) ((t)*2*2)(PRECALC_BUF) | |
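
/*
 * Worked example of the WK() layout above (derived from the macro, not
 * separate documentation): the two interleaved blocks share one
 * 32-byte row per group of four rounds, 16 bytes per block, e.g.
 *
 *        WK(5)  = (5/4)*32 + (5%4)*4 + (5/80)*16  = 32 + 4 + 0  = 36(WK_BUF)
 *        WK(85) = (5/4)*32 + (5%4)*4 + (85/80)*16 = 32 + 4 + 16 = 52(WK_BUF)
 *
 * so round t of the first block and round t of the second block sit
 * 16 bytes apart.
 */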
152 | ||
153 | ||
154 | .macro UPDATE_HASH hash, val | |
155 | add \hash, \val | |
156 | mov \val, \hash | |
157 | .endm | |
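
/*
 * Note: UPDATE_HASH computes "\hash += \val" in memory (and leaves the
 * sum in \val): in AT&T syntax, "add \hash, \val" reads the memory
 * operand into the register, and "mov \val, \hash" writes it back.
 */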
158 | ||
159 | .macro PRECALC_RESET_WY | |
160 | .set WY_00, WY0 | |
161 | .set WY_04, WY4 | |
162 | .set WY_08, WY08 | |
163 | .set WY_12, WY12 | |
164 | .set WY_16, WY16 | |
165 | .set WY_20, WY20 | |
166 | .set WY_24, WY24 | |
167 | .set WY_28, WY28 | |
168 | .set WY_32, WY_00 | |
169 | .endm | |
170 | ||
171 | .macro PRECALC_ROTATE_WY | |
172 | /* Rotate macros */ | |
173 | .set WY_32, WY_28 | |
174 | .set WY_28, WY_24 | |
175 | .set WY_24, WY_20 | |
176 | .set WY_20, WY_16 | |
177 | .set WY_16, WY_12 | |
178 | .set WY_12, WY_08 | |
179 | .set WY_08, WY_04 | |
180 | .set WY_04, WY_00 | |
181 | .set WY_00, WY_32 | |
182 | ||
183 | /* Define register aliases */ | |
184 | .set WY, WY_00 | |
185 | .set WY_minus_04, WY_04 | |
186 | .set WY_minus_08, WY_08 | |
187 | .set WY_minus_12, WY_12 | |
188 | .set WY_minus_16, WY_16 | |
189 | .set WY_minus_20, WY_20 | |
190 | .set WY_minus_24, WY_24 | |
191 | .set WY_minus_28, WY_28 | |
192 | .set WY_minus_32, WY | |
193 | .endm | |
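
/*
 * Both macros above only rebind assembler symbols with .set; they emit
 * no instructions.  Rotating the WY_xx aliases once per 8 rounds walks
 * the eight ymm registers as a ring buffer of the most recent w[]
 * values, so no data ever has to be copied between registers.
 */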
194 | ||
195 | .macro PRECALC_00_15 | |
196 | .if (i == 0) # Initialize and rotate registers | |
197 | PRECALC_RESET_WY | |
198 | PRECALC_ROTATE_WY | |
199 | .endif | |
200 | ||
201 | /* message scheduling pre-compute for rounds 0-15 */ | |
202 | .if ((i & 7) == 0) | |
203 | /* | |
204 | * blended AVX2 and ALU instruction scheduling | |
205 | * 1 vector iteration per 8 rounds | |
206 | */ | |
207 | vmovdqu (i * 2)(BUFFER_PTR), W_TMP | |
208 | .elseif ((i & 7) == 1) | |
209 | vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\ | |
210 | WY_TMP, WY_TMP | |
211 | .elseif ((i & 7) == 2) | |
212 | vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY | |
213 | .elseif ((i & 7) == 4) | |
214 | vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP | |
215 | .elseif ((i & 7) == 7) | |
216 | vmovdqu WY_TMP, PRECALC_WK(i&~7) | |
217 | ||
218 | PRECALC_ROTATE_WY | |
219 | .endif | |
220 | .endm | |
221 | ||
222 | .macro PRECALC_16_31 | |
223 | /* | |
224 | * message scheduling pre-compute for rounds 16-31 | |
225 | * calculating last 32 w[i] values in 8 XMM registers | |
226 | * pre-calculate K+w[i] values and store to mem | |
227 | * for later load by ALU add instruction | |
228 | * | |
229 | * "brute force" vectorization for rounds 16-31 only | |
230 | * due to w[i]->w[i-3] dependency | |
231 | */ | |
232 | .if ((i & 7) == 0) | |
233 | /* | |
234 | * blended AVX2 and ALU instruction scheduling | |
235 | * 1 vector iteration per 8 rounds | |
236 | */ | |
237 | /* w[i-14] */ | |
238 | vpalignr $8, WY_minus_16, WY_minus_12, WY | |
239 | vpsrldq $4, WY_minus_04, WY_TMP /* w[i-3] */ | |
240 | .elseif ((i & 7) == 1) | |
241 | vpxor WY_minus_08, WY, WY | |
242 | vpxor WY_minus_16, WY_TMP, WY_TMP | |
243 | .elseif ((i & 7) == 2) | |
244 | vpxor WY_TMP, WY, WY | |
245 | vpslldq $12, WY, WY_TMP2 | |
246 | .elseif ((i & 7) == 3) | |
247 | vpslld $1, WY, WY_TMP | |
248 | vpsrld $31, WY, WY | |
249 | .elseif ((i & 7) == 4) | |
250 | vpor WY, WY_TMP, WY_TMP | |
251 | vpslld $2, WY_TMP2, WY | |
252 | .elseif ((i & 7) == 5) | |
253 | vpsrld $30, WY_TMP2, WY_TMP2 | |
254 | vpxor WY, WY_TMP, WY_TMP | |
255 | .elseif ((i & 7) == 7) | |
256 | vpxor WY_TMP2, WY_TMP, WY | |
257 | vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP | |
258 | vmovdqu WY_TMP, PRECALC_WK(i&~7) | |
259 | ||
260 | PRECALC_ROTATE_WY | |
261 | .endif | |
262 | .endm | |
263 | ||
264 | .macro PRECALC_32_79 | |
265 | /* | |
266 | * in SHA-1 specification: | |
267 | * w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1 | |
268 | * instead we do equal: | |
269 | * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2 | |
270 | * allows more efficient vectorization | |
271 | * since w[i]=>w[i-3] dependency is broken | |
272 | */ | |
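        /*
         * Why the rewrite is valid (a sketch, not part of the original
         * notes): XOR the recurrence with shifted/rotated copies of
         * itself.  Over XOR, "squaring" the relation
         *        w[i] ^ ((w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1) = 0
         * cancels all cross terms, doubling every index distance and
         * the rotate amount, which yields
         *        w[i] ^ ((w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2) = 0
         * for i >= 32.
         */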
273 | ||
274 | .if ((i & 7) == 0) | |
275 | /* | |
276 | * blended AVX2 and ALU instruction scheduling | |
277 | * 1 vector iteration per 8 rounds | |
278 | */ | |
279 | vpalignr $8, WY_minus_08, WY_minus_04, WY_TMP | |
280 | .elseif ((i & 7) == 1) | |
281 | /* W is W_minus_32 before xor */ | |
282 | vpxor WY_minus_28, WY, WY | |
283 | .elseif ((i & 7) == 2) | |
284 | vpxor WY_minus_16, WY_TMP, WY_TMP | |
285 | .elseif ((i & 7) == 3) | |
286 | vpxor WY_TMP, WY, WY | |
287 | .elseif ((i & 7) == 4) | |
288 | vpslld $2, WY, WY_TMP | |
289 | .elseif ((i & 7) == 5) | |
290 | vpsrld $30, WY, WY | |
291 | vpor WY, WY_TMP, WY | |
292 | .elseif ((i & 7) == 7) | |
293 | vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP | |
294 | vmovdqu WY_TMP, PRECALC_WK(i&~7) | |
295 | ||
296 | PRECALC_ROTATE_WY | |
297 | .endif | |
298 | .endm | |
299 | ||
.macro PRECALC r, s
        .set i, \r

        .if (i < 40)
                .set K_XMM, 32*0
        .elseif (i < 80)
                .set K_XMM, 32*1
        .elseif (i < 120)
                .set K_XMM, 32*2
        .else
                .set K_XMM, 32*3
        .endif

        .if (i < 32)
                PRECALC_00_15 \s
        .elseif (i < 64)
                PRECALC_16_31 \s
        .elseif (i < 160)
                PRECALC_32_79 \s
        .endif
.endm
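
/*
 * Note on the index ranges above: the precalc loop runs i = 0..159
 * because two 64-byte blocks are scheduled together, so every round
 * boundary appears doubled (i < 32 covers rounds 0-15 of both blocks,
 * and K_XMM switches to the next round constant every 40 steps of i,
 * i.e. every 20 rounds).
 */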
321 | ||
322 | .macro ROTATE_STATE | |
323 | .set T_REG, E | |
324 | .set E, D | |
325 | .set D, C | |
326 | .set C, B | |
327 | .set B, TB | |
328 | .set TB, A | |
329 | .set A, T_REG | |
330 | ||
331 | .set T_REG, RE | |
332 | .set RE, RD | |
333 | .set RD, RC | |
334 | .set RC, RB | |
335 | .set RB, RTB | |
336 | .set RTB, RA | |
337 | .set RA, T_REG | |
338 | .endm | |
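
/*
 * Like the WY rotation, ROTATE_STATE renames A..E (and their 64-bit
 * views RA..RE) at assembly time, so each round "rotates" the working
 * state without emitting a single mov instruction.
 */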
339 | ||
340 | /* Macro relies on saved ROUND_Fx */ | |
341 | ||
342 | .macro RND_FUN f, r | |
343 | .if (\f == RND_F1) | |
344 | ROUND_F1 \r | |
345 | .elseif (\f == RND_F2) | |
346 | ROUND_F2 \r | |
347 | .elseif (\f == RND_F3) | |
348 | ROUND_F3 \r | |
349 | .endif | |
350 | .endm | |
351 | ||
352 | .macro RR r | |
353 | .set round_id, (\r % 80) | |
354 | ||
355 | .if (round_id == 0) /* Precalculate F for first round */ | |
356 | .set ROUND_FUNC, RND_F1 | |
357 | mov B, TB | |
358 | ||
359 | rorx $(32-30), B, B /* b>>>2 */ | |
360 | andn D, TB, T1 | |
361 | and C, TB | |
362 | xor T1, TB | |
363 | .endif | |
364 | ||
365 | RND_FUN ROUND_FUNC, \r | |
366 | ROTATE_STATE | |
367 | ||
368 | .if (round_id == 18) | |
369 | .set ROUND_FUNC, RND_F2 | |
370 | .elseif (round_id == 38) | |
371 | .set ROUND_FUNC, RND_F3 | |
372 | .elseif (round_id == 58) | |
373 | .set ROUND_FUNC, RND_F2 | |
374 | .endif | |
375 | ||
376 | .set round_id, ( (\r+1) % 80) | |
377 | ||
378 | RND_FUN ROUND_FUNC, (\r+1) | |
379 | ROTATE_STATE | |
380 | .endm | |
381 | ||
.macro ROUND_F1 r
        add WK(\r), E

        andn C, A, T1       /* ~b&d */
        lea (RE,RTB), E     /* Add F from the previous round */

        rorx $(32-5), A, TA  /* T2 = A >>> 5 */
        rorx $(32-30), A, TB /* b>>>2 for next round */

        PRECALC (\r)        /* msg scheduling for next 2 blocks */

        /*
         * Calculate F for the next round:
         * (b & c) ^ (~b & d), with the andn result already in T1
         */
        and B, A            /* b&c */
        xor T1, A           /* F1 = (b&c) ^ (~b&d) */

        lea (RE,RTA), E     /* E += A >>> 5 */
.endm
402 | ||
403 | .macro ROUND_F2 r | |
404 | add WK(\r), E | |
405 | lea (RE,RTB), E /* Add F from the previous round */ | |
406 | ||
407 | /* Calculate F for the next round */ | |
408 | rorx $(32-5), A, TA /* T2 = A >>> 5 */ | |
409 | .if ((round_id) < 79) | |
410 | rorx $(32-30), A, TB /* b>>>2 for next round */ | |
411 | .endif | |
412 | PRECALC (\r) /* msg scheduling for next 2 blocks */ | |
413 | ||
414 | .if ((round_id) < 79) | |
415 | xor B, A | |
416 | .endif | |
417 | ||
418 | add TA, E /* E += A >>> 5 */ | |
419 | ||
420 | .if ((round_id) < 79) | |
421 | xor C, A | |
422 | .endif | |
423 | .endm | |
424 | ||
425 | .macro ROUND_F3 r | |
426 | add WK(\r), E | |
427 | PRECALC (\r) /* msg scheduling for next 2 blocks */ | |
428 | ||
429 | lea (RE,RTB), E /* Add F from the previous round */ | |
430 | ||
431 | mov B, T1 | |
432 | or A, T1 | |
433 | ||
434 | rorx $(32-5), A, TA /* T2 = A >>> 5 */ | |
435 | rorx $(32-30), A, TB /* b>>>2 for next round */ | |
436 | ||
437 | /* Calculate F for the next round | |
438 | * (b and c) or (d and (b or c)) | |
439 | */ | |
440 | and C, T1 | |
441 | and B, A | |
442 | or T1, A | |
443 | ||
444 | add TA, E /* E += A >>> 5 */ | |
445 | ||
446 | .endm | |
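
/*
 * Side note (not from the original comments): F2 above is the SHA-1
 * parity function b ^ c ^ d, and F3 is the majority function
 * MAJ(b,c,d) = (b & c) | (d & (b | c)), exactly as in the SHA-1
 * specification.
 */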
447 | ||
448 | /* Add constant only if (%2 > %3) condition met (uses RTA as temp) | |
449 | * %1 + %2 >= %3 ? %4 : 0 | |
450 | */ | |
451 | .macro ADD_IF_GE a, b, c, d | |
452 | mov \a, RTA | |
453 | add $\d, RTA | |
454 | cmp $\c, \b | |
455 | cmovge RTA, \a | |
456 | .endm | |
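
/*
 * Example (taken from its use below): "ADD_IF_GE BUFFER_PTR,
 * BLOCKS_CTR, 4, 128" advances BUFFER_PTR by 128 bytes (two 64-byte
 * blocks) only while at least 4 blocks remain, so the pointer never
 * runs past the end of the input on the final iterations.
 */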
457 | ||
458 | /* | |
459 | * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining | |
460 | */ | |
461 | .macro SHA1_PIPELINED_MAIN_BODY | |
462 | ||
463 | REGALLOC | |
464 | ||
465 | mov (HASH_PTR), A | |
466 | mov 4(HASH_PTR), B | |
467 | mov 8(HASH_PTR), C | |
468 | mov 12(HASH_PTR), D | |
469 | mov 16(HASH_PTR), E | |
470 | ||
471 | mov %rsp, PRECALC_BUF | |
472 | lea (2*4*80+32)(%rsp), WK_BUF | |
473 | ||
474 | # Precalc WK for first 2 blocks | |
475 | ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64 | |
476 | .set i, 0 | |
477 | .rept 160 | |
478 | PRECALC i | |
479 | .set i, i + 1 | |
480 | .endr | |
481 | ||
482 | /* Go to next block if needed */ | |
483 | ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128 | |
484 | ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128 | |
485 | xchg WK_BUF, PRECALC_BUF | |
486 | ||
487 | .align 32 | |
_loop:
        /*
         * code loops through more than one block;
         * BLOCKS_CTR reaching zero (tested below) signals the last block
         */
        test BLOCKS_CTR, BLOCKS_CTR
        jnz _begin
        .align 32
        jmp _end
        .align 32
_begin:
500 | ||
501 | /* | |
502 | * Do first block | |
503 | * rounds: 0,2,4,6,8 | |
504 | */ | |
505 | .set j, 0 | |
506 | .rept 5 | |
507 | RR j | |
508 | .set j, j+2 | |
509 | .endr | |
510 | ||
511 | jmp _loop0 | |
512 | _loop0: | |
513 | ||
514 | /* | |
515 | * rounds: | |
516 | * 10,12,14,16,18 | |
517 | * 20,22,24,26,28 | |
518 | * 30,32,34,36,38 | |
519 | * 40,42,44,46,48 | |
520 | * 50,52,54,56,58 | |
521 | */ | |
522 | .rept 25 | |
523 | RR j | |
524 | .set j, j+2 | |
525 | .endr | |
526 | ||
527 | /* Update Counter */ | |
528 | sub $1, BLOCKS_CTR | |
529 | /* Move to the next block only if needed*/ | |
530 | ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128 | |
531 | /* | |
532 | * rounds | |
533 | * 60,62,64,66,68 | |
534 | * 70,72,74,76,78 | |
535 | */ | |
536 | .rept 10 | |
537 | RR j | |
538 | .set j, j+2 | |
539 | .endr | |
540 | ||
541 | UPDATE_HASH (HASH_PTR), A | |
542 | UPDATE_HASH 4(HASH_PTR), TB | |
543 | UPDATE_HASH 8(HASH_PTR), C | |
544 | UPDATE_HASH 12(HASH_PTR), D | |
545 | UPDATE_HASH 16(HASH_PTR), E | |
546 | ||
547 | test BLOCKS_CTR, BLOCKS_CTR | |
548 | jz _loop | |
549 | ||
550 | mov TB, B | |
551 | ||
552 | /* Process second block */ | |
553 | /* | |
554 | * rounds | |
555 | * 0+80, 2+80, 4+80, 6+80, 8+80 | |
556 | * 10+80,12+80,14+80,16+80,18+80 | |
557 | */ | |
558 | ||
559 | .set j, 0 | |
560 | .rept 10 | |
561 | RR j+80 | |
562 | .set j, j+2 | |
563 | .endr | |
564 | ||
565 | jmp _loop1 | |
566 | _loop1: | |
567 | /* | |
568 | * rounds | |
569 | * 20+80,22+80,24+80,26+80,28+80 | |
570 | * 30+80,32+80,34+80,36+80,38+80 | |
571 | */ | |
572 | .rept 10 | |
573 | RR j+80 | |
574 | .set j, j+2 | |
575 | .endr | |
576 | ||
577 | jmp _loop2 | |
578 | _loop2: | |
579 | ||
580 | /* | |
581 | * rounds | |
582 | * 40+80,42+80,44+80,46+80,48+80 | |
583 | * 50+80,52+80,54+80,56+80,58+80 | |
584 | */ | |
585 | .rept 10 | |
586 | RR j+80 | |
587 | .set j, j+2 | |
588 | .endr | |
589 | ||
590 | /* update counter */ | |
591 | sub $1, BLOCKS_CTR | |
592 | /* Move to the next block only if needed*/ | |
593 | ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128 | |
594 | ||
595 | jmp _loop3 | |
596 | _loop3: | |
597 | ||
598 | /* | |
599 | * rounds | |
600 | * 60+80,62+80,64+80,66+80,68+80 | |
601 | * 70+80,72+80,74+80,76+80,78+80 | |
602 | */ | |
603 | .rept 10 | |
604 | RR j+80 | |
605 | .set j, j+2 | |
606 | .endr | |
607 | ||
608 | UPDATE_HASH (HASH_PTR), A | |
609 | UPDATE_HASH 4(HASH_PTR), TB | |
610 | UPDATE_HASH 8(HASH_PTR), C | |
611 | UPDATE_HASH 12(HASH_PTR), D | |
612 | UPDATE_HASH 16(HASH_PTR), E | |
613 | ||
614 | /* Reset state for AVX2 reg permutation */ | |
615 | mov A, TA | |
616 | mov TB, A | |
617 | mov C, TB | |
618 | mov E, C | |
619 | mov D, B | |
620 | mov TA, D | |
621 | ||
622 | REGALLOC | |
623 | ||
624 | xchg WK_BUF, PRECALC_BUF | |
625 | ||
626 | jmp _loop | |
627 | ||
628 | .align 32 | |
629 | _end: | |
630 | ||
631 | .endm | |
/*
 * macro implements SHA-1 function's body for several 64-byte blocks
 * param: function's name
 */
.macro SHA1_VECTOR_ASM name
ENTRY(\name)

        push %rbx
        push %rbp
        push %r12
        push %r13
        push %r14
        push %r15

        RESERVE_STACK = (W_SIZE*4 + 8 + 24)

        /* Align stack */
        mov %rsp, %rbx
        and $~(0x20-1), %rsp
        push %rbx
        sub $RESERVE_STACK, %rsp

        avx2_zeroupper

        /* Setup initial values */
        mov CTX, HASH_PTR
        mov BUF, BUFFER_PTR

        mov BUF, BUFFER_PTR2
        mov CNT, BLOCKS_CTR

        xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP

        SHA1_PIPELINED_MAIN_BODY

        avx2_zeroupper

        add $RESERVE_STACK, %rsp
        pop %rsp

        pop %r15
        pop %r14
        pop %r13
        pop %r12
        pop %rbp
        pop %rbx

        ret

ENDPROC(\name)
.endm
683 | ||
684 | .section .rodata | |
685 | ||
686 | #define K1 0x5a827999 | |
687 | #define K2 0x6ed9eba1 | |
688 | #define K3 0x8f1bbcdc | |
689 | #define K4 0xca62c1d6 | |
690 | ||
691 | .align 128 | |
692 | K_XMM_AR: | |
693 | .long K1, K1, K1, K1 | |
694 | .long K1, K1, K1, K1 | |
695 | .long K2, K2, K2, K2 | |
696 | .long K2, K2, K2, K2 | |
697 | .long K3, K3, K3, K3 | |
698 | .long K3, K3, K3, K3 | |
699 | .long K4, K4, K4, K4 | |
700 | .long K4, K4, K4, K4 | |
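
/*
 * Each round constant is replicated eight times so a 32-byte row fills
 * a full ymm register (four lanes per interleaved block); K_XMM in the
 * PRECALC macros selects the row as a 32-byte offset into this table.
 */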
701 | ||
702 | BSWAP_SHUFB_CTL: | |
703 | .long 0x00010203 | |
704 | .long 0x04050607 | |
705 | .long 0x08090a0b | |
706 | .long 0x0c0d0e0f | |
707 | .long 0x00010203 | |
708 | .long 0x04050607 | |
709 | .long 0x08090a0b | |
710 | .long 0x0c0d0e0f | |
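
/*
 * vpshufb control mask: each dword selects its own bytes in reverse
 * order (e.g. 0x00010203 picks bytes 3,2,1,0), converting the
 * big-endian message words to host byte order; the pattern is
 * duplicated across both 128-bit lanes.
 */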
.text

SHA1_VECTOR_ASM sha1_transform_avx2