2 ;; Copyright (c) 2012-2018, Intel Corporation
4 ;; Redistribution and use in source and binary forms, with or without
5 ;; modification, are permitted provided that the following conditions are met:
7 ;; * Redistributions of source code must retain the above copyright notice,
8 ;; this list of conditions and the following disclaimer.
9 ;; * Redistributions in binary form must reproduce the above copyright
10 ;; notice, this list of conditions and the following disclaimer in the
11 ;; documentation and/or other materials provided with the distribution.
12 ;; * Neither the name of Intel Corporation nor the names of its contributors
13 ;; may be used to endorse or promote products derived from this software
14 ;; without specific prior written permission.
16 ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20 ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;; Stack must be aligned to 32 bytes before call
29 ;; Windows clobbers: rax rdx r8 r9 r10 r11 r12 r13 r14 r15
30 ;; Windows preserves: rbx rcx rsi rdi rbp
32 ;; Linux clobbers: rax rdx rsi r9 r10 r11 r12 r13 r14 r15
33 ;; Linux preserves: rbx rcx rdi rbp r8
37 %include "include/os.asm"
39 %include "include/dbgprint.asm"
40 %include "mb_mgr_datastruct.asm"
41 %include "include/transpose_avx2.asm"
;; Read-only constant tables, each 32 bytes (one ymm register wide).
;; NOTE(review): the function below loads PSHUFFLE_BYTE_FLIP_MASK and the K tables
;; with vmovdqa, which requires 32-byte alignment; the section/align directives
;; are not visible in this excerpt - confirm they are present.
;;
;; PSHUFFLE_BYTE_FLIP_MASK: byte indices 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12,
;; stored twice (once per 128-bit half). Used as a byte-shuffle control this
;; reverses the bytes of every 32-bit word (the shuffle itself is not visible here).
46 PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203
47 ;ddq 0x0c0d0e0f08090a0b0405060700010203
48 dq 0x0405060700010203, 0x0c0d0e0f08090a0b
49 dq 0x0405060700010203, 0x0c0d0e0f08090a0b
;; K00_19: 0x5A827999 replicated into all eight 32-bit lanes.
50 K00_19: ;ddq 0x5A8279995A8279995A8279995A827999
51 ;ddq 0x5A8279995A8279995A8279995A827999
52 dq 0x5A8279995A827999, 0x5A8279995A827999
53 dq 0x5A8279995A827999, 0x5A8279995A827999
;; K20_39: 0x6ED9EBA1 replicated into all eight 32-bit lanes.
54 K20_39: ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
55 ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
56 dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
57 dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
;; K40_59: 0x8F1BBCDC replicated into all eight 32-bit lanes.
58 K40_59: ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
59 ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
60 dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
61 dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
;; K60_79: 0xCA62C1D6 replicated into all eight 32-bit lanes.
62 K60_79: ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
63 ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
64 dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
65 dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
141 ;; Assume stack aligned to 32 bytes before call
142 ;; Therefore FRAMESZ mod 32 must be 32-8 = 24
;; (the 8 is the return address pushed by the call, so rsp is 32-byte aligned
;; again once FRAMESZ has been subtracted).
;; The 32*16 part is the 16-slot, 32-bytes-per-slot buffer addressed as
;; [rsp + n*32] by the step macros and cleared at the end of the function.
;; NOTE(review): the expansion is not parenthesized, so FRAMESZ is only safe
;; as a stand-alone operand (e.g. not in "FRAMESZ*2"); uses are not visible here.
143 %define FRAMESZ 32*16 + 24
;; VMOVPS expands to the unaligned move vmovups (no use is visible in this excerpt).
145 %define VMOVPS vmovups
148 ;; Magic functions defined in FIPS 180-1
150 ;MAGIC_F0 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; (D ^ (B & (C ^ D)))
;; Result is written to regF. regT is not referenced by the lines shown.
;; (The %macro header and the parameter %defines are not visible in this excerpt.)
157 ;vmovdqa %%regF,%%regC
158 vpxor %%regF, %%regC,%%regD ; F = C ^ D
159 vpand %%regF, %%regF,%%regB ; F = B & (C ^ D)
160 vpxor %%regF, %%regF,%%regD ; F = D ^ (B & (C ^ D))
163 ;MAGIC_F1 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; (B ^ C ^ D)
;; Result is written to regF. regT is not referenced by the lines shown.
;; (The %macro header and the parameter %defines are not visible in this excerpt.)
170 ;vmovdqa %%regF,%%regD
171 vpxor %%regF,%%regD,%%regC ; F = D ^ C
172 vpxor %%regF,%%regF,%%regB ; F = B ^ C ^ D
177 ;MAGIC_F2 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; ((B & C) | (B & D) | (C & D))
;; Computed as ((B | C) & D) | (B & C), which is the same bitwise majority
;; function with one fewer operation. Result in regF; regT is clobbered.
;; (The %macro header and the parameter %defines are not visible in this excerpt.)
184 ;vmovdqa %%regF,%%regB
185 ;vmovdqa %%regT,%%regB
186 vpor %%regF,%%regB,%%regC ; F = B | C
187 vpand %%regT,%%regB,%%regC ; T = B & C
188 vpand %%regF,%%regF,%%regD ; F = (B | C) & D
189 vpor %%regF,%%regF,%%regT ; F = ((B | C) & D) | (B & C)
192 ;MAGIC_F3 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; (B ^ C ^ D)
;; Delegates to MAGIC_F1 with the same five arguments, so the result is identical.
199 MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
202 ; PROLD reg, imm, tmp
;; Rotates every 32-bit element of reg left by imm bits, in place.
;; tmp is overwritten and must be a different register from reg.
;; (The %macro header and the parameter %defines are not visible in this excerpt.)
207 ;vmovdqa %%tmp, %%reg
208 vpsrld %%tmp, %%reg, (32-%%imm) ; tmp = bits that wrap around to the low end
209 vpslld %%reg, %%reg, %%imm ; reg = remaining bits moved up
210 vpor %%reg, %%reg, %%tmp ; reg = rotated value
213 ; PROLD_nd reg, imm, tmp, src
;; Non-destructive rotate: reg = each 32-bit element of src rotated left by imm.
;; src is only read (it keeps its value provided it is not also passed as reg or tmp);
;; tmp is overwritten.
;; (The %macro header and the parameter %defines are not visible in this excerpt.)
219 ;vmovdqa %%tmp, %%reg
220 vpsrld %%tmp, %%src, (32-%%imm) ; tmp = bits that wrap around to the low end
221 vpslld %%reg, %%src, %%imm ; reg = remaining bits moved up
222 vpor %%reg, %%reg, %%tmp ; reg = rotated value
;; One SHA-1 step that takes its message word straight from the stack buffer.
;; Argument order as used at the call site in this file:
;;   regA, regB, regC, regD, regE, regT (temp), regF (function result),
;;   memW (buffer slot index), immCNT (round constant operand), MAGIC (macro name).
;; (The %define lines binding %1..%10 and the %endmacro are not visible in this excerpt.)
;; Effect of the lines shown: E += K + W[memW] + rol(A,5) + MAGIC(B,C,D); B = rol(B,30).
;; regT and regF are clobbered.
225 %macro SHA1_STEP_00_15 10
236 vpaddd %%regE, %%regE,%%immCNT
237 vpaddd %%regE, %%regE,[rsp + (%%memW * 32)]
238 ;vmovdqa %%regT,%%regA
;; regF serves only as scratch for the rotate here; MAGIC overwrites it below.
239 PROLD_nd %%regT,5, %%regF,%%regA
240 vpaddd %%regE, %%regE,%%regT
241 %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
242 PROLD %%regB,30, %%regT
243 vpaddd %%regE, %%regE,%%regF
;; One SHA-1 step that also derives a new message word and stores it back into
;; the 16-slot circular buffer on the stack (slot indices are taken "& 15").
;; Takes the same ten arguments as SHA1_STEP_00_15 at the call sites in this file.
;; NOTE(review): several original lines of this macro are not visible in this
;; excerpt (parameter %defines, part of the W16 update/rotate, %endmacro), so
;; only the visible operations are described below.
246 %macro SHA1_STEP_16_79 10
257 vpaddd %%regE, %%regE,%%immCNT
;; Fetch the word 14 steps back into W14.
259 vmovdqa W14, [rsp + ((%%memW - 14) & 15) * 32]
;; Fold the words 8 and 3 steps back into W16.
261 vpxor W16, W16, [rsp + ((%%memW - 8) & 15) * 32]
262 vpxor W16, W16, [rsp + ((%%memW - 3) & 15) * 32]
;; F receives the top bit of each lane and is then OR-ed with W16.
265 vpsrld %%regF, W16, (32-1)
267 vpor %%regF, %%regF, W16
;; Store the new word into its buffer slot and add it to E.
270 vmovdqa [rsp + ((%%memW - 0) & 15) * 32],%%regF
271 vpaddd %%regE, %%regE,%%regF
273 ;vmovdqa %%regT,%%regA
;; Remainder is the same as SHA1_STEP_00_15: E += rol(A,5) + MAGIC(B,C,D); B = rol(B,30).
274 PROLD_nd %%regT,5, %%regF, %%regA
275 vpaddd %%regE, %%regE,%%regT
276 %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
277 PROLD %%regB,30, %%regT
278 vpaddd %%regE,%%regE,%%regF
282 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
283 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
284 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
304 ; void sha1_x8_avx2(void *state, int num_blks)
305 ; arg 1 : rcx : pointer to state; the code below reads five digest rows and eight input data pointers from it
306 ; arg 2 : rdx : size (in blocks) ;; assumed to be >= 1
; NOTE(review): rcx/rdx are the Windows x64 argument registers; the defines that
; map the arguments per OS are not visible in this excerpt - confirm.
; NOTE(review): the function label, stack-frame setup, loop structure and return
; are not visible in this excerpt; comments below cover only the lines shown.
307 MKGLOBAL(sha1_x8_avx2,function,internal)
311 ;; Initialize digests
;; Five rows (A..E) are loaded with unaligned moves, one ymm register per row.
312 vmovdqu A, [state + 0*SHA1_DIGEST_ROW_SIZE]
313 vmovdqu B, [state + 1*SHA1_DIGEST_ROW_SIZE]
314 vmovdqu C, [state + 2*SHA1_DIGEST_ROW_SIZE]
315 vmovdqu D, [state + 3*SHA1_DIGEST_ROW_SIZE]
316 vmovdqu E, [state + 4*SHA1_DIGEST_ROW_SIZE]
317 DBGPRINTL_YMM "Sha1-AVX2 incoming transposed digest", A, B, C, D, E
319 ;; transpose input onto stack
;; Fetch the eight per-lane input pointers stored in the state.
320 mov inp0,[state+_data_ptr_sha1+0*PTR_SZ]
321 mov inp1,[state+_data_ptr_sha1+1*PTR_SZ]
322 mov inp2,[state+_data_ptr_sha1+2*PTR_SZ]
323 mov inp3,[state+_data_ptr_sha1+3*PTR_SZ]
324 mov inp4,[state+_data_ptr_sha1+4*PTR_SZ]
325 mov inp5,[state+_data_ptr_sha1+5*PTR_SZ]
326 mov inp6,[state+_data_ptr_sha1+6*PTR_SZ]
327 mov inp7,[state+_data_ptr_sha1+7*PTR_SZ]
;; F holds the byte-flip mask here (aligned load: the table must be 32-byte aligned).
331 vmovdqa F, [rel PSHUFFLE_BYTE_FLIP_MASK]
334 TRANSPOSE8_U32_LOAD8 T0, T1, T2, T3, T4, T5, T6, T7, \
335 inp0, inp1, inp2, inp3, inp4, inp5, \
338 TRANSPOSE8_U32 T0, T1, T2, T3, T4, T5, T6, T7, T8, T9
339 DBGPRINTL_YMM "Sha1-AVX2 incoming transposed input", T0, T1, T2, T3, T4, T5, T6, T7, T8, T9
;; Store rows T0..T7 into stack slots I*8+0 .. I*8+7 (32 bytes each).
;; I is an assembly-time value whose assignment is not visible in this excerpt.
341 vmovdqa [rsp+(I*8+0)*32],T0
343 vmovdqa [rsp+(I*8+1)*32],T1
345 vmovdqa [rsp+(I*8+2)*32],T2
347 vmovdqa [rsp+(I*8+3)*32],T3
349 vmovdqa [rsp+(I*8+4)*32],T4
351 vmovdqa [rsp+(I*8+5)*32],T5
353 vmovdqa [rsp+(I*8+6)*32],T6
355 vmovdqa [rsp+(I*8+7)*32],T7
369 ;; perform 0-79 steps
;; K holds the round constant for the current group of steps.
371 vmovdqa K, [rel K00_19]
375 SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
;; Preload W16 and W15 from buffer slots 0 and 1 before the scheduled steps.
381 vmovdqa W16, [rsp + ((16 - 16) & 15) * 32]
382 vmovdqa W15, [rsp + ((16 - 15) & 15) * 32]
384 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
390 vmovdqa K, [rel K20_39]
392 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1
398 vmovdqa K, [rel K40_59]
400 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2
406 vmovdqa K, [rel K60_79]
408 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3
;; Write the five digest rows back to the state.
423 vmovdqu [state + 0*SHA1_DIGEST_ROW_SIZE], A
424 vmovdqu [state + 1*SHA1_DIGEST_ROW_SIZE], B
425 vmovdqu [state + 2*SHA1_DIGEST_ROW_SIZE], C
426 vmovdqu [state + 3*SHA1_DIGEST_ROW_SIZE], D
427 vmovdqu [state + 4*SHA1_DIGEST_ROW_SIZE], E
428 DBGPRINTL_YMM "Sha1-AVX2 outgoing transposed digest", A, B, C, D, E
429 ;; update input pointers
;; Store inp0..inp7 back into the state (the code that advances them is not visible here).
438 mov [state+_data_ptr_sha1+0*PTR_SZ], inp0
439 mov [state+_data_ptr_sha1+1*PTR_SZ], inp1
440 mov [state+_data_ptr_sha1+2*PTR_SZ], inp2
441 mov [state+_data_ptr_sha1+3*PTR_SZ], inp3
442 mov [state+_data_ptr_sha1+4*PTR_SZ], inp4
443 mov [state+_data_ptr_sha1+5*PTR_SZ], inp5
444 mov [state+_data_ptr_sha1+6*PTR_SZ], inp6
445 mov [state+_data_ptr_sha1+7*PTR_SZ], inp7
450 ;; Clear stack frame (16*32 bytes)
;; NOTE(review): presumably ymm0 is zeroed and i iterates over the 16 slots;
;; neither is visible in this excerpt - confirm.
455 vmovdqa [rsp + i*32], ymm0
465 section .note.GNU-stack noalloc noexec nowrite progbits