2 ;; Copyright (c) 2012-2018, Intel Corporation
4 ;; Redistribution and use in source and binary forms, with or without
5 ;; modification, are permitted provided that the following conditions are met:
7 ;; * Redistributions of source code must retain the above copyright notice,
8 ;; this list of conditions and the following disclaimer.
9 ;; * Redistributions in binary form must reproduce the above copyright
10 ;; notice, this list of conditions and the following disclaimer in the
11 ;; documentation and/or other materials provided with the distribution.
12 ;; * Neither the name of Intel Corporation nor the names of its contributors
13 ;; may be used to endorse or promote products derived from this software
14 ;; without specific prior written permission.
16 ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20 ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 %include "include/os.asm"
29 %include "mb_mgr_datastruct.asm"
;; ---------------------------------------------------------------------
;; Read-only 128-bit constants, each written as two qwords (low first).
;; ---------------------------------------------------------------------
;; vpshufb mask: converts each 32-bit lane between big-endian (SHA-1
;; wire order of the message) and the CPU's little-endian layout.
35 PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203
36 dq 0x0405060700010203, 0x0c0d0e0f08090a0b
;; SHA-1 round constants K (FIPS 180-1), one per group of 20 rounds,
;; each broadcast across all four 32-bit lanes so a single vpaddd adds
;; the constant to all four interleaved message streams at once.
37 K00_19: ;ddq 0x5A8279995A8279995A8279995A827999
38 dq 0x5A8279995A827999, 0x5A8279995A827999
39 K20_39: ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
40 dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
41 K40_59: ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
42 dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
43 K60_79: ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
44 dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
48 ;; code to compute quad SHA1 using AVX
49 ;; derived from ...\sha1_multiple\sha1_quad4.asm
50 ;; variation of sha1_mult2.asm : clobbers all xmm regs, rcx left intact
51 ;; rbx, rsi, rdi, rbp, r12-r15 left intact
52 ;; This version is not safe to call from C/C++
54 ;; Stack must be aligned to 16 bytes before call
55 ;; Windows clobbers: rax rdx r8 r9 r10 r11
56 ;; Windows preserves: rbx rcx rsi rdi rbp r12 r13 r14 r15
58 ;; Linux clobbers: rax rsi r8 r9 r10 r11
59 ;; Linux preserves: rbx rcx rdx rdi rbp r12 r13 r14 r15
48 ;; code to compute quad SHA1 using AVX
49 ;; derived from ...\sha1_multiple\sha1_quad4.asm
50 ;; variation of sha1_mult2.asm : clobbers all xmm regs, rcx left intact
51 ;; rbx, rsi, rdi, rbp, r12-r15 left intact
52 ;; This version is not safe to call from C/C++
54 ;; Stack must be aligned to 16 bytes before call
55 ;; Windows clobbers: rax rdx r8 r9 r10 r11
56 ;; Windows preserves: rbx rcx rsi rdi rbp r12 r13 r14 r15
58 ;; Linux clobbers: rax rsi r8 r9 r10 r11
59 ;; Linux preserves: rbx rcx rdx rdi rbp r12 r13 r14 r15
63 ; transpose r0, r1, r2, r3, t0, t1
64 ; "transpose" data in {r0..r3} using temps {t0..t3}
65 ; Input looks like: {r0 r1 r2 r3}
71 ; output looks like: {t0 r1 r0 r3}
;; 4x4 transpose of 32-bit lanes built from three layers of vshufps.
;; NOTE: per the comment above, the transposed rows come out in
;; registers t0, r1, r0, r3 (in that order), NOT back in r0..r3 --
;; callers must collect results from those registers.
;; (The %macro header line itself falls outside this chunk.)
84 vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0}
85 vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2}
87 vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d1 d0 c1 c0}
88 vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2}
90 vshufps %%r1, %%t0, %%t1, 0xDD ; r1 = {d1 c1 b1 a1}
92 vshufps %%r3, %%r0, %%r2, 0xDD ; r3 = {d3 c3 b3 a3}
94 vshufps %%r0, %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2}
95 vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0}
98 ;; Magic functions defined in FIPS 180-1
100 ; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D)))
;; SHA-1 "Ch" function (rounds 0-19), algebraically rewritten from
;; (B & C) | (~B & D) to D ^ (B & (C ^ D)): same result, but needs no
;; NOT and only a single AND (3 ops total, per 32-bit lane).
107 vpxor %%regF, %%regC,%%regD
108 vpand %%regF, %%regF,%%regB
109 vpxor %%regF, %%regF,%%regD
112 ; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D)
;; SHA-1 "Parity" function (rounds 20-39): three-way XOR per lane.
119 vpxor %%regF,%%regD,%%regC
120 vpxor %%regF,%%regF,%%regB
123 ; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D))
;; SHA-1 "Maj" function (rounds 40-59), computed as ((B|C) & D)|(B & C)
;; -- equivalent to the canonical three-AND/two-OR form, one op shorter.
;; regT is clobbered as scratch.
130 vpor %%regF,%%regB,%%regC
131 vpand %%regT,%%regB,%%regC
132 vpand %%regF,%%regF,%%regD
133 vpor %%regF,%%regF,%%regT
136 ; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D)
;; Rounds 60-79 reuse the same Parity function as rounds 20-39.
143 MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
146 ; PROLD reg, imm, tmp
;; In-place rotate-left of every 32-bit lane of reg by imm bits:
;; (reg << imm) | (reg >> (32-imm)).  tmp is clobbered as scratch.
151 vpsrld %%tmp, %%reg, (32-(%%imm))
152 vpslld %%reg, %%reg, %%imm
153 vpor %%reg, %%reg, %%tmp
157 ; PROLD_nd reg, imm, tmp, src
;; Non-destructive rotate-left: reg = src rotated left by imm bits in
;; every 32-bit lane; src itself is left untouched.  tmp is scratch.
163 vpsrld %%tmp, %%src, (32-(%%imm))
164 vpslld %%reg, %%src, %%imm
165 vpor %%reg, %%reg, %%tmp
168 %macro SHA1_STEP_00_15 10
;; One SHA-1 round for rounds 0-15, applied to 4 lanes at once:
;;   E += K + W[i] + ROL(A,5) + MAGIC(B,C,D);  B = ROL(B,30)
;; The message word W[i] is read from the 16-slot schedule on the
;; stack (16 bytes per slot).  The %define lines that bind the ten
;; macro parameters (regA..regE, regT, regF, memW, immCNT, MAGIC) are
;; on lines not visible in this chunk.
179 vpaddd %%regE, %%regE,%%immCNT
180 vpaddd %%regE, %%regE,[rsp + (%%memW * 16)]
181 PROLD_nd %%regT,5, %%regF,%%regA
182 vpaddd %%regE, %%regE,%%regT
183 %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
184 PROLD %%regB,30, %%regT
185 vpaddd %%regE, %%regE,%%regF
188 %macro SHA1_STEP_16_79 10
;; One SHA-1 round for rounds 16-79.  Same round update as
;; SHA1_STEP_00_15, but the message word is first derived on the fly:
;;   W[i] = ROL(W[i-16] ^ W[i-14] ^ W[i-8] ^ W[i-3], 1)
;; using a 16-entry circular schedule on the stack ("& 15" indexing).
;; NOTE(review): some schedule instructions (presumably the xor of W16
;; with W14 and a vpslld of W16 by 1 to complete the rotate) fall on
;; original lines not visible in this chunk -- confirm in full source.
199 vpaddd %%regE, %%regE,%%immCNT
201 vmovdqa W14, [rsp + ((%%memW - 14) & 15) * 16]
203 vpxor W16, W16, [rsp + ((%%memW - 8) & 15) * 16]
204 vpxor W16, W16, [rsp + ((%%memW - 3) & 15) * 16]
206 vpsrld %%regF, W16, (32-1)
208 vpor %%regF, %%regF, W16
;; regF now holds the new W[i]; write it back into its circular
;; schedule slot, then finish the round exactly as in rounds 0-15.
211 vmovdqa [rsp + ((%%memW - 0) & 15) * 16],%%regF
212 vpaddd %%regE, %%regE,%%regF
214 PROLD_nd %%regT,5, %%regF, %%regA
215 vpaddd %%regE, %%regE,%%regT
216 %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
217 PROLD %%regB,30, %%regT
218 vpaddd %%regE,%%regE,%%regF
221 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
222 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
223 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
225 ;; FRAMESZ must be an odd multiple of 8
;; 16 schedule slots * 16 bytes = 256, plus 8: with rsp % 16 == 8 on
;; entry (return address pushed), an odd multiple of 8 restores 16-byte
;; alignment for the aligned vmovdqa accesses to the schedule area.
226 %define FRAMESZ 16*16 + 8
;; Unaligned-load alias -- presumably used for reading the message
;; blocks, whose pointers carry no alignment guarantee (use sites are
;; outside this chunk).
228 %define VMOVPS vmovdqu
292 ; XMM registers are clobbered. Saving/restoring must be done at a higher level
294 ; void sha1_mult_avx(SHA1_ARGS *args, UINT32 size_in_blocks);
295 ; arg 1 : rcx : pointer to args
296 ; arg 2 : rdx : size (in blocks) ;; assumed to be >= 1
297 MKGLOBAL(sha1_mult_avx,function,internal)
;; Computes SHA-1 over 4 independent data streams in parallel, one
;; stream per 32-bit lane of each xmm register.  The prologue, the
;; register aliases (A..E, F, K, T0..T5, W14/W15/W16, inp0..inp3,
;; arg1, I, i), the per-block loop labels and the epilogue all fall on
;; original lines not visible in this chunk.
302 ;; Initialize digests
;; Digest state is stored transposed in SHA1_ARGS: row i holds digest
;; word i for all four lanes, so one aligned load fills a register.
303 vmovdqa A, [arg1 + 0*SHA1_DIGEST_ROW_SIZE]
304 vmovdqa B, [arg1 + 1*SHA1_DIGEST_ROW_SIZE]
305 vmovdqa C, [arg1 + 2*SHA1_DIGEST_ROW_SIZE]
306 vmovdqa D, [arg1 + 3*SHA1_DIGEST_ROW_SIZE]
307 vmovdqa E, [arg1 + 4*SHA1_DIGEST_ROW_SIZE]
309 ;; transpose input onto stack
;; One message pointer per lane.
310 mov inp0,[arg1 + _data_ptr_sha1 + 0*PTR_SZ]
311 mov inp1,[arg1 + _data_ptr_sha1 + 1*PTR_SZ]
312 mov inp2,[arg1 + _data_ptr_sha1 + 2*PTR_SZ]
313 mov inp3,[arg1 + _data_ptr_sha1 + 3*PTR_SZ]
;; F is reused here as a temporary for the byte-swap mask.
317 vmovdqa F, [rel PSHUFFLE_BYTE_FLIP_MASK]
;; Transpose 4 dwords from each stream (loads/byte-swaps are on hidden
;; lines) so each register holds one word index across all 4 lanes,
;; then park W[0..15] in the 16-slot schedule area on the stack.
324 TRANSPOSE T2, T1, T4, T3, T0, T5
326 vmovdqa [rsp+(I*4+0)*16],T0
328 vmovdqa [rsp+(I*4+1)*16],T1
330 vmovdqa [rsp+(I*4+2)*16],T2
332 vmovdqa [rsp+(I*4+3)*16],T3
345 ;; perform 0-79 steps
347 vmovdqa K, [rel K00_19]
;; Rounds 0-15: message words straight from the stack schedule.
;; (The %rep/%assign round-unrolling directives are on hidden lines.)
351 SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
;; Prime W16/W15 for the on-the-fly schedule of rounds 16 onward.
357 vmovdqa W16, [rsp + ((16 - 16) & 15) * 16]
358 vmovdqa W15, [rsp + ((16 - 15) & 15) * 16]
360 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
;; Each 20-round group swaps in its K constant and boolean function.
366 vmovdqa K, [rel K20_39]
368 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1
374 vmovdqa K, [rel K40_59]
376 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2
382 vmovdqa K, [rel K60_79]
384 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3
;; Store the updated digest back in the same transposed row layout.
;; NOTE(review): the Davies-Meyer additions of the previous digest
;; values (and the block loop back-edge) are on hidden lines.
399 vmovdqa [arg1 + 0*SHA1_DIGEST_ROW_SIZE], A
400 vmovdqa [arg1 + 1*SHA1_DIGEST_ROW_SIZE], B
401 vmovdqa [arg1 + 2*SHA1_DIGEST_ROW_SIZE], C
402 vmovdqa [arg1 + 3*SHA1_DIGEST_ROW_SIZE], D
403 vmovdqa [arg1 + 4*SHA1_DIGEST_ROW_SIZE], E
405 ; update input pointers
;; inp0..inp3 are presumably advanced past the consumed blocks before
;; this point -- the advancing instructions are on hidden lines.
407 mov [arg1 + _data_ptr_sha1 + 0*PTR_SZ], inp0
409 mov [arg1 + _data_ptr_sha1 + 1*PTR_SZ], inp1
411 mov [arg1 + _data_ptr_sha1 + 2*PTR_SZ], inp2
413 mov [arg1 + _data_ptr_sha1 + 3*PTR_SZ], inp3
418 ;; Clear all stack containing part of message
;; Scrub the schedule area so no message-derived data lingers on the
;; stack after return (xmm0 is presumably zeroed, and this store sits
;; inside a %rep/%assign loop over i, on hidden lines -- confirm).
423 vmovdqa [rsp + i*16], xmm0
433 section .note.GNU-stack noalloc noexec nowrite progbits