1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
30 ;; code to compute 16 SHA1 using AVX2
33 %include "reg_sizes.asm"
36 ;; Magic functions defined in FIPS 180-1
;; MAGIC_F0 body: leaves regF = D ^ (B & (C ^ D)), computed bitwise.
;; Only regF is written by the visible instructions; regT is not referenced.
38 ;MAGIC_F0 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; (D ^ (B & (C ^ D)))
45 vpxor %%regF, %%regC,%%regD ;; regF = C ^ D
46 vpand %%regF, %%regF,%%regB ;; regF = B & (C ^ D)
47 vpxor %%regF, %%regF,%%regD ;; regF = D ^ (B & (C ^ D))
;; MAGIC_F1 body: leaves regF = B ^ C ^ D (bitwise parity of the three inputs).
;; Only regF is written by the visible instructions; regT is not referenced.
50 ;MAGIC_F1 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; (B ^ C ^ D)
57 vpxor %%regF,%%regD,%%regC ;; regF = D ^ C
58 vpxor %%regF,%%regF,%%regB ;; regF = B ^ C ^ D
;; MAGIC_F2 body: leaves regF = ((B | C) & D) | (B & C), which is the bitwise
;; majority of B, C, D, i.e. the same value as the formula in the header comment.
;; regT is clobbered: it holds the intermediate B & C.
63 ;MAGIC_F2 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; ((B & C) | (B & D) | (C & D))
70 vpor %%regF,%%regB,%%regC ;; regF = B | C
71 vpand %%regT,%%regB,%%regC ;; regT = B & C
72 vpand %%regF,%%regF,%%regD ;; regF = (B | C) & D
73 vpor %%regF,%%regF,%%regT ;; regF = ((B | C) & D) | (B & C)
;; MAGIC_F3 body: forwards all five operands unchanged to MAGIC_F1, so the
;; result is the same B ^ C ^ D parity function.
76 ;MAGIC_F3 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ
83 MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT ;; regF = B ^ C ^ D
;; PROLD body: in-place rotate-left of every 32-bit dword of reg by imm bits.
;; tmp is clobbered (it holds the bits that wrap around).
;; Call sites in this file pass the operands as: reg, imm, tmp.
91 vpsrld %%tmp, %%reg, (32-%%imm) ;; tmp = reg >> (32 - imm): bits that wrap to the bottom
92 vpslld %%reg, %%reg, %%imm ;; reg = reg << imm
93 vpor %%reg, %%reg, %%tmp ;; reg = rotl32(reg, imm)
;; PROLD_nd body: non-destructive rotate-left; reg = rotl32(src, imm) per dword.
;; src is only read; tmp is clobbered.
;; Call sites in this file pass the operands as: reg, imm, tmp, src.
102 vpsrld %%tmp, %%src, (32-%%imm) ;; tmp = src >> (32 - imm): bits that wrap to the bottom
103 vpslld %%reg, %%src, %%imm ;; reg = src << imm
104 vpor %%reg, %%reg, %%tmp ;; reg = rotl32(src, imm)
;; SHA1_STEP_00_15: one SHA-1 step that takes its message word straight from
;; the buffer at data (no schedule expansion).
;; Visible effect: E += immCNT + W[memW] + rotl(A,5) + MAGIC(B,C,D); B = rotl(B,30).
;; regT and regF are scratch and are overwritten.
;; NOTE(review): the lines binding %1..%11 to the %%names are not visible here;
;; judging from the call site they are presumably, in order:
;; regA, regB, regC, regD, regE, regT, regF, memW, immCNT, MAGIC, data - confirm.
107 %macro SHA1_STEP_00_15 11
119 vpaddd %%regE, %%regE,%%immCNT ;; E += immCNT
120 vpaddd %%regE, %%regE,[%%data + (%%memW * 32)] ;; E += W[memW]; each W entry is 32 bytes
121 PROLD_nd %%regT,5, %%regF,%%regA ;; regT = rotl(A, 5); regF is only scratch here
122 vpaddd %%regE, %%regE,%%regT ;; E += rotl(A, 5)
123 %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
124 PROLD %%regB,30, %%regT ;; B = rotl(B, 30); regT is scratch
125 vpaddd %%regE, %%regE,%%regF ;; E += FUN
127 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
128 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; SHA1_STEP_16_79: one SHA-1 step that also derives the next message word
;; from a 16-entry circular buffer at data (32 bytes per entry, indices taken
;; modulo 16 via "& 15"), stores it back into slot memW & 15, and then does
;; the same E/B update as SHA1_STEP_00_15.
;; Uses the file-level registers W14 and W16 in addition to its operands.
;; NOTE(review): some original lines of this macro are not visible in this
;; excerpt; the comments below describe only the instructions that are shown.
129 %macro SHA1_STEP_16_79 11
141 vpaddd %%regE, %%regE,%%immCNT ;; E += immCNT
143 vmovdqa W14, [%%data + ((%%memW - 14) & 15) * 32] ;; W14 = W[memW - 14]
145 vpxor W16, W16, [%%data + ((%%memW - 8) & 15) * 32] ;; W16 ^= W[memW - 8]
146 vpxor W16, W16, [%%data + ((%%memW - 3) & 15) * 32] ;; W16 ^= W[memW - 3]
148 vpsrld %%regF, W16, (32-1) ;; regF = W16 >> 31 (top bit of each dword)
150 vpor %%regF, %%regF, W16 ;; regF |= W16 (presumably completes a rotate-left by 1 - confirm)
153 vmovdqa [%%data + ((%%memW - 0) & 15) * 32],%%regF ;; W[memW & 15] = regF (new schedule word)
154 vpaddd %%regE, %%regE,%%regF ;; E += new schedule word
156 PROLD_nd %%regT,5, %%regF, %%regA ;; regT = rotl(A, 5); regF is only scratch here
157 vpaddd %%regE, %%regE,%%regT ;; E += rotl(A, 5)
158 %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
159 PROLD %%regB,30, %%regT ;; B = rotl(B, 30); regT is scratch
160 vpaddd %%regE,%%regE,%%regF ;; E += FUN
163 ;; Insert murmur's instructions into this macro.
164 ;; Every section_loop of mh_sha1 calls SHA1_STEP_16_79 64 times and processes 512 bytes.
165 ;; So insert 1 murmur block into every 2 SHA1_STEP_16_79 invocations.
;; SHA1_STEP_16_79(J) pastes J onto the name, so it expands to the variant
;; macro SHA1_STEP_16_79_<J> (the _0 and _1 variants are defined below).
166 %define SHA1_STEP_16_79(J) SHA1_STEP_16_79_ %+ J
;; SHA1_STEP_16_79_0: the SHA1_STEP_16_79 step with the first part of a murmur3
;; block interleaved: it loads two qwords from mur_in_p and multiplies them by
;; the constants held in mur_c1_r / mur_c2_r. The integer instructions are
;; independent of the vector ones; they are only interleaved with them.
;; Integer registers written: mur_data1, mur_data2 (and flags, via imul).
;; NOTE(review): some original lines of this macro are not visible in this
;; excerpt; the comments below describe only the instructions that are shown.
168 %macro SHA1_STEP_16_79_0 11
180 vpaddd %%regE, %%regE,%%immCNT ;; E += immCNT
182 vmovdqa W14, [%%data + ((%%memW - 14) & 15) * 32] ;; W14 = W[memW - 14]
184 vpxor W16, W16, [%%data + ((%%memW - 8) & 15) * 32] ;; W16 ^= W[memW - 8]
185 vpxor W16, W16, [%%data + ((%%memW - 3) & 15) * 32] ;; W16 ^= W[memW - 3]
186 mov mur_data1, [mur_in_p] ;; murmur: first qword of the current 16-byte block
187 mov mur_data2, [mur_in_p + 8] ;; murmur: second qword of the current 16-byte block
189 vpsrld %%regF, W16, (32-1) ;; regF = W16 >> 31 (top bit of each dword)
190 imul mur_data1, mur_c1_r ;; murmur: mur_data1 *= mur_c1_r
192 vpor %%regF, %%regF, W16 ;; regF |= W16 (presumably completes a rotate-left by 1 - confirm)
193 imul mur_data2, mur_c2_r ;; murmur: mur_data2 *= mur_c2_r
196 vmovdqa [%%data + ((%%memW - 0) & 15) * 32],%%regF ;; W[memW & 15] = regF (new schedule word)
198 vpaddd %%regE, %%regE,%%regF ;; E += new schedule word
200 PROLD_nd %%regT,5, %%regF, %%regA ;; regT = rotl(A, 5); regF is only scratch here
201 vpaddd %%regE, %%regE,%%regT ;; E += rotl(A, 5)
202 imul mur_data1, mur_c2_r ;; murmur: mur_data1 *= mur_c2_r
203 %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
204 PROLD %%regB,30, %%regT ;; B = rotl(B, 30); regT is scratch
205 imul mur_data2, mur_c1_r ;; murmur: mur_data2 *= mur_c1_r
206 vpaddd %%regE,%%regE,%%regF ;; E += FUN
;; SHA1_STEP_16_79_1: the SHA1_STEP_16_79 step with the second part of a
;; murmur3 block interleaved: it folds mur_data1 / mur_data2 (produced by the
;; _0 variant) into the running state mur_hash1 / mur_hash2.
;; Integer registers written: mur_hash1, mur_hash2 (and flags, via xor/add).
;; NOTE(review): some original lines of this macro are not visible in this
;; excerpt; the comments below describe only the instructions that are shown.
210 %macro SHA1_STEP_16_79_1 11
222 vpaddd %%regE, %%regE,%%immCNT ;; E += immCNT
223 xor mur_hash1, mur_data1 ;; murmur: hash1 ^= data1
224 vmovdqa W14, [%%data + ((%%memW - 14) & 15) * 32] ;; W14 = W[memW - 14]
227 add mur_hash1, mur_hash2 ;; murmur: hash1 += hash2
228 vpxor W16, W16, [%%data + ((%%memW - 8) & 15) * 32] ;; W16 ^= W[memW - 8]
229 vpxor W16, W16, [%%data + ((%%memW - 3) & 15) * 32] ;; W16 ^= W[memW - 3]
230 lea mur_hash1, [mur_hash1 + mur_hash1*4 + N1] ;; murmur: hash1 = hash1*5 + N1 (lea leaves flags alone)
231 vpsrld %%regF, W16, (32-1) ;; regF = W16 >> 31 (top bit of each dword)
233 xor mur_hash2, mur_data2 ;; murmur: hash2 ^= data2
234 vpor %%regF, %%regF, W16 ;; regF |= W16 (presumably completes a rotate-left by 1 - confirm)
238 vmovdqa [%%data + ((%%memW - 0) & 15) * 32],%%regF ;; W[memW & 15] = regF (new schedule word)
239 vpaddd %%regE, %%regE,%%regF ;; E += new schedule word
240 add mur_hash2, mur_hash1 ;; murmur: hash2 += hash1
241 PROLD_nd %%regT,5, %%regF, %%regA ;; regT = rotl(A, 5); regF is only scratch here
242 vpaddd %%regE, %%regE,%%regT ;; E += rotl(A, 5)
243 lea mur_hash2, [mur_hash2 + mur_hash2*4 + N2] ;; murmur: hash2 = hash2*5 + N2
244 %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
245 PROLD %%regB,30, %%regT ;; B = rotl(B, 30); regT is scratch
247 vpaddd %%regE,%%regE,%%regF ;; E += FUN
250 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
251 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
252 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; elf64 (System V AMD64) build: scratch-register aliases.
; r12-r15, rbx and rbp are callee-saved under this ABI, hence the
; "must be saved and restored" notes on tmp3..tmp8.
; NOTE(review): the argument aliases, tmp1/tmp2, the save macro and the body of
; FUNC_RESTORE for this branch are not visible in this excerpt.
253 %ifidn __OUTPUT_FORMAT__, elf64
265 %define tmp3 r12 ; must be saved and restored
266 %define tmp4 r13 ; must be saved and restored
267 %define tmp5 r14 ; must be saved and restored
268 %define tmp6 r15 ; must be saved and restored
269 %define tmp7 rbx ; must be saved and restored
270 %define tmp8 rbp ; must be saved and restored
282 %macro FUNC_RESTORE 0
; Second ABI branch: all eight tmp registers are callee-saved here.
; NOTE(review): presumably the win64 branch - rdi/rsi and xmm6-xmm15 are only
; callee-saved under the Microsoft x64 convention; the selecting %if/%else line
; is not visible in this excerpt.
299 %define tmp1 r12 ; must be saved and restored
300 %define tmp2 r13 ; must be saved and restored
301 %define tmp3 r14 ; must be saved and restored
302 %define tmp4 r15 ; must be saved and restored
303 %define tmp5 rdi ; must be saved and restored
304 %define tmp6 rsi ; must be saved and restored
305 %define tmp7 rbx ; must be saved and restored
306 %define tmp8 rbp ; must be saved and restored
; 10 XMM slots (16 bytes each) + 9 qword slots = 232 bytes = 29*8.
309 %define stack_size 10*16 + 9*8 ; must be an odd multiple of 8
; arg(x): stack argument x, addressed above the local frame plus one PS-sized
; slot (PS is defined elsewhere; presumably the return address slot - confirm).
311 %define arg(x) [rsp + stack_size + PS + PS*x]
; func(x) opens a proc_frame for x (proc_frame is defined elsewhere).
312 %define func(x) proc_frame x
; Register save sequence: xmm6-xmm15 occupy the first ten 16-byte slots and the
; eight general-purpose registers follow from offset 10*16. The visible saves
; use eight of the nine reserved qword slots.
314 alloc_stack stack_size
315 save_xmm128 xmm6, 0*16
316 save_xmm128 xmm7, 1*16
317 save_xmm128 xmm8, 2*16
318 save_xmm128 xmm9, 3*16
319 save_xmm128 xmm10, 4*16
320 save_xmm128 xmm11, 5*16
321 save_xmm128 xmm12, 6*16
322 save_xmm128 xmm13, 7*16
323 save_xmm128 xmm14, 8*16
324 save_xmm128 xmm15, 9*16
325 save_reg r12, 10*16 + 0*8
326 save_reg r13, 10*16 + 1*8
327 save_reg r14, 10*16 + 2*8
328 save_reg r15, 10*16 + 3*8
329 save_reg rdi, 10*16 + 4*8
330 save_reg rsi, 10*16 + 5*8
331 save_reg rbx, 10*16 + 6*8
332 save_reg rbp, 10*16 + 7*8
; FUNC_RESTORE: reloads xmm6-xmm15 and r12-r15, rdi, rsi, rbx, rbp from the
; same rsp-relative slots that the save sequence wrote, so rsp must hold the
; value it had right after alloc_stack when this macro runs.
; movdqa needs a 16-byte aligned address, so rsp must be 16-byte aligned here.
; NOTE(review): these are legacy-SSE loads inside an AVX2 routine; confirm the
; upper YMM state is clean (e.g. vzeroupper) before this point to avoid
; SSE/AVX transition penalties.
; NOTE(review): the stack release and %endmacro are not visible in this excerpt.
337 %macro FUNC_RESTORE 0
338 movdqa xmm6, [rsp + 0*16]
339 movdqa xmm7, [rsp + 1*16]
340 movdqa xmm8, [rsp + 2*16]
341 movdqa xmm9, [rsp + 3*16]
342 movdqa xmm10, [rsp + 4*16]
343 movdqa xmm11, [rsp + 5*16]
344 movdqa xmm12, [rsp + 6*16]
345 movdqa xmm13, [rsp + 7*16]
346 movdqa xmm14, [rsp + 8*16]
347 movdqa xmm15, [rsp + 9*16]
348 mov r12, [rsp + 10*16 + 0*8]
349 mov r13, [rsp + 10*16 + 1*8]
350 mov r14, [rsp + 10*16 + 2*8]
351 mov r15, [rsp + 10*16 + 3*8]
352 mov rdi, [rsp + 10*16 + 4*8]
353 mov rsi, [rsp + 10*16 + 5*8]
354 mov rbx, [rsp + 10*16 + 6*8]
355 mov rbp, [rsp + 10*16 + 7*8]
359 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Symbolic names for the registers and constants used by the routine below.
; NOTE(review): further aliases used by the code (mh_in_p, mh_segs, pref, the
; YMM names A..E/F/T0/T1/W14/W15/W16, ...) are not visible in this excerpt.
361 ;variables of mh_sha1
363 %define mh_digests_p arg1
364 %define mh_data_p arg2
366 ;variables of murmur3
367 %define mur_in_p tmp2
368 %define mur_digest_p arg3
369 %define mur_hash1 tmp3
370 %define mur_hash2 tmp4
371 %define mur_data1 tmp5
; "return" is an alias defined elsewhere (presumably the return-value register - confirm)
372 %define mur_data2 return
373 %define mur_c1_r tmp6
374 %define mur_c2_r arg5
375 ; constants of murmur3_x64_128
; N1/N2 are the additive constants used by the "hash = hash*5 + N" lea steps.
381 %define N1 0x52dce729;DWORD
382 %define N2 0x38495ab5;DWORD
; C1/C2 are 64-bit constants (presumably the multipliers loaded into mur_c1_r / mur_c2_r - confirm).
383 %define C1 QWORD(0x87c37b91114253d5)
384 %define C2 QWORD(0x4cf5ad432745937f)
385 ;variables used by storing segs_digests on stack
; RSP_SAVE is the register from which rsp is restored at the end of the routine.
386 %define RSP_SAVE tmp7
; FRAMESZ = 4 bytes * 5 digest words * 16 segments = 320 bytes.
387 %define FRAMESZ 4*5*16 ;BYTES*DWORDS*SEGS
394 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; VMOVPS is the unaligned 256-bit move used for caller-supplied buffers.
395 %define VMOVPS vmovups
446 ; segs_digests:low addr-> high_addr
447 ; a | b | c | ...| p | (16)
448 ; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap |
449 ; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp |
451 ; h4 | h4 | h4 | ...| h4 | | Ea| Eb | Ec |...| Ep |
454 ;void mh_sha1_murmur3_x64_128_block_avx2 (const uint8_t * input_data,
455 ; uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS],
456 ; uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE],
457 ; uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS],
458 ; uint32_t num_blocks);
459 ; arg 0 pointer to input data
460 ; arg 1 pointer to digests, include segments digests(uint32_t digests[5][16])
461 ; arg 2 pointer to aligned_frame_buffer which is used to save the big_endian data.
462 ; arg 3 pointer to murmur3 digest
463 ; arg 4 number of 1KB blocks
; mh_sha1_murmur3_x64_128_block_avx2: multi-hash SHA-1 (AVX2) with murmur3
; integer work interleaved into the SHA-1 steps.
; Visible flow:
;   1. copy the segment digests from mh_digests_p to a 32-byte aligned stack area
;   2. load the murmur3 state from mur_digest_p
;   3. stage input from mh_in_p into the frame buffer at mh_data_p
;   4. run SHA-1 steps 0-79 on one group of 8 segments at a time
;   5. store the murmur3 state and copy the segment digests back
; NOTE(review): many original lines (prologue/epilogue, loop labels, %rep
; headers, digest accumulation, pointer updates) are not visible in this
; excerpt; the comments below describe only the instructions that are shown.
465 global mh_sha1_murmur3_x64_128_block_avx2:function internal
466 func(mh_sha1_murmur3_x64_128_block_avx2)
475 ; leave enough space to store segs_digests
477 ; align rsp to 32 Bytes needed by avx2
480 %assign I 0 ; copy segs_digests into stack
482 VMOVPS A, [mh_digests_p + I*32*5 + 32*0] ; unaligned loads: caller buffer alignment is not assumed
483 VMOVPS B, [mh_digests_p + I*32*5 + 32*1]
484 VMOVPS C, [mh_digests_p + I*32*5 + 32*2]
485 VMOVPS D, [mh_digests_p + I*32*5 + 32*3]
486 VMOVPS E, [mh_digests_p + I*32*5 + 32*4]
488 vmovdqa [rsp + I*32*5 + 32*0], A ; aligned stores: rely on rsp being 32-byte aligned
489 vmovdqa [rsp + I*32*5 + 32*1], B
490 vmovdqa [rsp + I*32*5 + 32*2], C
491 vmovdqa [rsp + I*32*5 + 32*3], D
492 vmovdqa [rsp + I*32*5 + 32*4], E
496 ;init murmur variables
497 mov mur_in_p, mh_in_p ;different steps between murmur and mh_sha1
498 ;load murmur hash digests and multiplier
499 mov mur_hash1, [mur_digest_p] ; first qword of the murmur3 state
500 mov mur_hash2, [mur_digest_p + 8] ; second qword of the murmur3 state
505 ;transform to big-endian data and store on aligned_frame
506 vmovdqa F, [PSHUFFLE_BYTE_FLIP_MASK] ; F = byte-flip shuffle mask
507 ;transform input data from DWORD*16_SEGS*5 to DWORD*8_SEGS*5*2
510 VMOVPS T0,[mh_in_p + I*64+0*32] ; low 32 bytes of input row I (unaligned load)
511 VMOVPS T1,[mh_in_p + I*64+1*32] ; high 32 bytes of input row I
514 vmovdqa [mh_data_p +I*32+0*512],T0 ; first 512-byte half of the frame buffer
516 vmovdqa [mh_data_p +I*32+1*512],T1 ; second 512-byte half of the frame buffer
520 mov mh_segs, 0 ;start from the first 8 segments
521 mov pref, 1024 ;avoid prefetch repeatedly
523 ;; Initialize digests
524 vmovdqa A, [rsp + 0*64 + mh_segs] ; each digest word is a 64-byte row; mh_segs is the byte offset within the row
525 vmovdqa B, [rsp + 1*64 + mh_segs]
526 vmovdqa C, [rsp + 2*64 + mh_segs]
527 vmovdqa D, [rsp + 3*64 + mh_segs]
528 vmovdqa E, [rsp + 4*64 + mh_segs]
536 ;; perform 0-79 steps
542 SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p
548 vmovdqa W16, [mh_data_p + ((16 - 16) & 15) * 32] ; preload schedule slot 0
549 vmovdqa W15, [mh_data_p + ((16 - 15) & 15) * 32] ; preload schedule slot 1
552 SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p
556 PREFETCH_X [mh_in_p + pref+128*0] ; prefetch upcoming input, pref bytes ahead
557 PREFETCH_X [mh_in_p + pref+128*1]
562 SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1, mh_data_p
570 SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2, mh_data_p
574 PREFETCH_X [mh_in_p + pref+128*2]
575 PREFETCH_X [mh_in_p + pref+128*3]
580 SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3, mh_data_p
592 vmovdqa [rsp + 0*64 + mh_segs], A ; write the working digests back to the stack copy
593 vmovdqa [rsp + 1*64 + mh_segs], B
594 vmovdqa [rsp + 2*64 + mh_segs], C
595 vmovdqa [rsp + 3*64 + mh_segs], D
596 vmovdqa [rsp + 4*64 + mh_segs], E
605 sub mh_data_p, (1024) ; step mh_data_p back by 1024 bytes (the matching advance is not visible here)
610 ;store murmur-hash digest
611 mov [mur_digest_p], mur_hash1
612 mov [mur_digest_p + 8], mur_hash2
614 %assign I 0 ; copy segs_digests back to mh_digests_p
616 vmovdqa A, [rsp + I*32*5 + 32*0]
617 vmovdqa B, [rsp + I*32*5 + 32*1]
618 vmovdqa C, [rsp + I*32*5 + 32*2]
619 vmovdqa D, [rsp + I*32*5 + 32*3]
620 vmovdqa E, [rsp + I*32*5 + 32*4]
622 VMOVPS [mh_digests_p + I*32*5 + 32*0], A
623 VMOVPS [mh_digests_p + I*32*5 + 32*1], B
624 VMOVPS [mh_digests_p + I*32*5 + 32*2], C
625 VMOVPS [mh_digests_p + I*32*5 + 32*3], D
626 VMOVPS [mh_digests_p + I*32*5 + 32*4], E
629 mov rsp, RSP_SAVE ; restore rsp
; Constant tables; each entry is 32 bytes so that it fills one YMM register.
; NOTE(review): nothing visible writes to these tables; consider a read-only
; section if the build allows it.
637 section .data align=32
; Shuffle-control bytes 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12, repeated for
; both 128-bit lanes: reverses the byte order inside each dword.
640 PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
641 dq 0x0405060700010203, 0x0c0d0e0f08090a0b
; SHA-1 round constants, each 32-bit value replicated into all eight dword lanes.
642 K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999
643 dq 0x5A8279995A827999, 0x5A8279995A827999
644 K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
645 dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
646 K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
647 dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
648 K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
649 dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6