1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
30 ;; code to compute 16 SHA1 using SSE
33 %include "reg_sizes.asm"
39 ;; Magic functions defined in FIPS 180-1
41 ; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D)))
54 ; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D)
66 ; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D))
81 ; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D)
88 MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
98 psrld %%tmp, (32-%%imm)
;; SHA1_STEP_00_15: one SHA-1 step for steps 0..15, applied to every 32-bit
;; lane of the XMM operands at once.
;; Takes 11 macro parameters; judging by the invocation later in this file
;; they are: regA,regB,regC,regD,regE, regT,regF, memW, immCNT, MAGIC, data.
;; NOTE(review): only part of the macro body is visible in this view; the
;; comments below describe the visible lines only.
102 %macro SHA1_STEP_00_15 11
114 paddd %%regE,%%immCNT ;; E += round constant
115 paddd %%regE,[%%data + (%%memW * 16)] ;; E += message word slot memW (16 bytes per slot)
117 PROLD %%regT,5, %%regF ;; rotate regT left by 5 (third operand looks like PROLD scratch - confirm)
119 %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
120 PROLD %%regB,30, %%regT ;; rotate B left by 30, regT passed as the third (scratch?) operand
;; SHA1_STEP_16_79: one SHA-1 step for steps 16..79. In addition to the work
;; done by the 0..15 variant it touches the 16-slot circular message buffer
;; at %%data: slots are addressed as (memW - k) & 15, 16 bytes per slot.
;; Takes 11 macro parameters, passed in the same order as SHA1_STEP_00_15
;; at the invocation sites in this file.
;; NOTE(review): several body lines are not visible in this view (e.g. how
;; W15/W16 are rotated and how regF receives the new word); the comments
;; below cover only what is shown.
124 %macro SHA1_STEP_16_79 11
136 paddd %%regE,%%immCNT ;; E += round constant
137 movdqa W14, [%%data + ((%%memW - 14) & 15) * 16] ;; load slot (t-14)
139 pxor W16, [%%data + ((%%memW - 8) & 15) * 16] ;; W16 ^= slot (t-8)
140 pxor W16, [%%data + ((%%memW - 3) & 15) * 16] ;; W16 ^= slot (t-3)
147 movdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF ;; write regF into slot t (presumably the new schedule word - confirm)
150 PROLD %%regT,5, %%regF ;; rotate regT left by 5
152 %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
153 PROLD %%regB,30, %%regT ;; rotate B left by 30
157 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
158 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
159 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Output-format specific register aliases and register save/restore helpers.
; NOTE(review): this section is only partly visible here - the %else/%endif,
; %endmacro lines, the argN/tmp1/tmp2 defines of the first branch and the
; FUNC_SAVE macro header are not shown.
160 %ifidn __OUTPUT_FORMAT__, elf64
; elf64 branch: tmp3..tmp6 live in r12-r15.
172 %define tmp3 r12 ; must be saved and restored
173 %define tmp4 r13 ; must be saved and restored
174 %define tmp5 r14 ; must be saved and restored
175 %define tmp6 r15 ; must be saved and restored
; FUNC_RESTORE for the elf64 branch (body not visible in this view).
185 %macro FUNC_RESTORE 0
; The lines below appear to belong to the alternative (non-elf64, presumably
; win64) branch: all six temporaries use registers that must be preserved.
200 %define tmp1 r12 ; must be saved and restored
201 %define tmp2 r13 ; must be saved and restored
202 %define tmp3 r14 ; must be saved and restored
203 %define tmp4 r15 ; must be saved and restored
204 %define tmp5 rdi ; must be saved and restored
205 %define tmp6 rsi ; must be saved and restored
; Save area layout: 10 XMM slots of 16 bytes at offset 0, followed by 7 qword
; slots at offset 10*16 (216 bytes total). Six qword slots are used below.
; Presumably the "odd multiple of 8" keeps rsp 16-byte aligned after the
; allocation, which the movdqa accesses at [rsp + n*16] require - confirm.
208 %define stack_size 10*16 + 7*8 ; must be an odd multiple of 8
209 %define func(x) proc_frame x
; Save sequence: xmm6-xmm15 go to slots 0..9, then r12-r15, rdi, rsi go to
; the qword slots. alloc_stack/save_xmm128/save_reg are defined elsewhere.
211 alloc_stack stack_size
212 save_xmm128 xmm6, 0*16
213 save_xmm128 xmm7, 1*16
214 save_xmm128 xmm8, 2*16
215 save_xmm128 xmm9, 3*16
216 save_xmm128 xmm10, 4*16
217 save_xmm128 xmm11, 5*16
218 save_xmm128 xmm12, 6*16
219 save_xmm128 xmm13, 7*16
220 save_xmm128 xmm14, 8*16
221 save_xmm128 xmm15, 9*16
222 save_reg r12, 10*16 + 0*8
223 save_reg r13, 10*16 + 1*8
224 save_reg r14, 10*16 + 2*8
225 save_reg r15, 10*16 + 3*8
226 save_reg rdi, 10*16 + 4*8
227 save_reg rsi, 10*16 + 5*8
; FUNC_RESTORE: reload every register from the slot it was saved to above.
; The offsets here must stay in sync with the save sequence.
231 %macro FUNC_RESTORE 0
232 movdqa xmm6, [rsp + 0*16]
233 movdqa xmm7, [rsp + 1*16]
234 movdqa xmm8, [rsp + 2*16]
235 movdqa xmm9, [rsp + 3*16]
236 movdqa xmm10, [rsp + 4*16]
237 movdqa xmm11, [rsp + 5*16]
238 movdqa xmm12, [rsp + 6*16]
239 movdqa xmm13, [rsp + 7*16]
240 movdqa xmm14, [rsp + 8*16]
241 movdqa xmm15, [rsp + 9*16]
242 mov r12, [rsp + 10*16 + 0*8]
243 mov r13, [rsp + 10*16 + 1*8]
244 mov r14, [rsp + 10*16 + 2*8]
245 mov r15, [rsp + 10*16 + 3*8]
246 mov rdi, [rsp + 10*16 + 4*8]
247 mov rsi, [rsp + 10*16 + 5*8]
251 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
253 ;variables of mh_sha1
; Symbolic names for the function arguments (other argument aliases such as
; the input pointer are not visible in this view).
255 %define mh_digests_p arg1 ; pointer to the segment digests
256 %define mh_data_p arg2 ; pointer to the frame buffer holding the byte-swapped block
258 ;variables used by storing segs_digests on stack
259 %define RSP_SAVE tmp2 ; holds the caller's rsp; copied back into rsp before returning
260 %define FRAMESZ 4*5*16 ;BYTES*DWORDS*SEGS = 320 bytes of stack for the digest copy
267 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
317 ; segs_digests:low addr-> high_addr
318 ; a | b | c | ...| p | (16)
319 ; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap |
320 ; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp |
322 ; h4 | h4 | h4 | ...| h4 | | Ea| Eb | Ec |...| Ep |
326 ;void mh_sha1_block_sse(const uint8_t * input_data, uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
327 ; uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks);
328 ; arg 0 pointer to input data
329 ; arg 1 pointer to digests, including the segment digests (uint32_t digests[5][16])
330 ; arg 2 pointer to aligned_frame_buffer which is used to save the big_endian data.
331 ; arg 3 number of 1KB blocks
; mh_sha1_block_sse: SSE implementation of the multi-hash SHA-1 block update.
; NOTE(review): this view shows only fragments of the function. Loop
; labels, %rep/%endrep lines, the byte-swap instructions, the stack frame
; setup and the return sequence are not visible. Names such as A..E, F,
; T0..T3, W14..W16, TMP, FUN, K, mh_in_p, mh_segs, pref, MOVPS and PREFETCH_X
; are defined elsewhere. Comments below describe the visible lines only.
333 mk_global mh_sha1_block_sse, function, internal
334 func(mh_sha1_block_sse)
343 ; leave enough space to store segs_digests
345 ; align rsp to 16 Bytes needed by sse
; Copy the digests to the stack, one 64-byte row (index I) per pass:
; loaded from mh_digests_p via MOVPS, stored with aligned movdqa.
348 %assign I 0 ; copy segs_digests into stack
350 MOVPS A, [mh_digests_p + I*64 + 16*0]
351 MOVPS B, [mh_digests_p + I*64 + 16*1]
352 MOVPS C, [mh_digests_p + I*64 + 16*2]
353 MOVPS D, [mh_digests_p + I*64 + 16*3]
355 movdqa [rsp + I*64 + 16*0], A
356 movdqa [rsp + I*64 + 16*1], B
357 movdqa [rsp + I*64 + 16*2], C
358 movdqa [rsp + I*64 + 16*3], D
363 ;transform to big-endian data and store on aligned_frame
364 movdqa F, [PSHUFFLE_BYTE_FLIP_MASK]
365 ;transform input data from DWORD*16_SEGS*5 to DWORD*4_SEGS*5*4
; Each 64-byte input row I is read as four 16-byte pieces; piece k is
; written to mh_data_p + k*256 + I*16, i.e. into one of four 256-byte
; regions of the frame buffer (the byte swap itself is not visible here).
368 MOVPS T0,[mh_in_p + I*64+0*16]
369 MOVPS T1,[mh_in_p + I*64+1*16]
370 MOVPS T2,[mh_in_p + I*64+2*16]
371 MOVPS T3,[mh_in_p + I*64+3*16]
374 movdqa [mh_data_p +(I)*16 +0*256],T0
376 movdqa [mh_data_p +(I)*16 +1*256],T1
378 movdqa [mh_data_p +(I)*16 +2*256],T2
380 movdqa [mh_data_p +(I)*16 +3*256],T3
384 mov mh_segs, 0 ;start from the first 4 segments
385 mov pref, 1024 ;avoid prefetch repeatedly
387 ;; Initialize digests
; Working variables A..E come from the stack copy: row k at rsp + k*64,
; with mh_segs as the byte offset of the current group of segments.
388 movdqa A, [rsp + 0*64 + mh_segs]
389 movdqa B, [rsp + 1*64 + mh_segs]
390 movdqa C, [rsp + 2*64 + mh_segs]
391 movdqa D, [rsp + 3*64 + mh_segs]
392 movdqa E, [rsp + 4*64 + mh_segs]
400 ;; perform 0-79 steps
406 SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p
; Preload message slots 0 and 1 into W16/W15 before the 16..79 steps.
412 movdqa W16, [mh_data_p + ((16 - 16) & 15) * 16]
413 movdqa W15, [mh_data_p + ((16 - 15) & 15) * 16]
; The step macro is invoked with MAGIC_F0, F1, F2 and F3 in turn; the
; repeat counts and the loading of K are not visible here.
415 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p
419 PREFETCH_X [mh_in_p + pref+128*0]
423 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1, mh_data_p
431 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2, mh_data_p
435 PREFETCH_X [mh_in_p + pref+128*1]
439 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3, mh_data_p
; Write A..E back to the stack copy for the current segment group.
451 movdqa [rsp + 0*64 + mh_segs], A
452 movdqa [rsp + 1*64 + mh_segs], B
453 movdqa [rsp + 2*64 + mh_segs], C
454 movdqa [rsp + 3*64 + mh_segs], D
455 movdqa [rsp + 4*64 + mh_segs], E
; Move mh_data_p back by 1024 bytes (presumably undoing per-group
; advances of the frame-buffer pointer - confirm against the loop code).
463 sub mh_data_p, (1024)
469 %assign I 0 ; copy segs_digests back to mh_digests_p
471 movdqa A, [rsp + I*64 + 16*0]
472 movdqa B, [rsp + I*64 + 16*1]
473 movdqa C, [rsp + I*64 + 16*2]
474 movdqa D, [rsp + I*64 + 16*3]
476 MOVPS [mh_digests_p + I*64 + 16*0], A
477 MOVPS [mh_digests_p + I*64 + 16*1], B
478 MOVPS [mh_digests_p + I*64 + 16*2], C
479 MOVPS [mh_digests_p + I*64 + 16*3], D
482 mov rsp, RSP_SAVE ; restore rsp
; Constant tables, 16-byte aligned so they can be read with movdqa.
490 section .data align=16
; Byte-shuffle control that reverses the byte order inside each 32-bit lane
; (memory bytes 03 02 01 00, 07 06 05 04, ...). Loaded into F before the
; "transform to big-endian" step; presumably used as a pshufb mask - confirm.
493 PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
; One 32-bit constant replicated into all four lanes of a 16-byte vector;
; the label names give the step range each one is for.
495 K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999
496 K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
497 K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
498 K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6