2 ;; Copyright (c) 2012-2018, Intel Corporation
4 ;; Redistribution and use in source and binary forms, with or without
5 ;; modification, are permitted provided that the following conditions are met:
7 ;; * Redistributions of source code must retain the above copyright notice,
8 ;; this list of conditions and the following disclaimer.
9 ;; * Redistributions in binary form must reproduce the above copyright
10 ;; notice, this list of conditions and the following disclaimer in the
11 ;; documentation and/or other materials provided with the distribution.
12 ;; * Neither the name of Intel Corporation nor the names of its contributors
13 ;; may be used to endorse or promote products derived from this software
14 ;; without specific prior written permission.
16 ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20 ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;; Stack must be aligned to 32 bytes before call
29 ;; Windows clobbers: rax rdx r8 r9 r10 r11 r12 r13 r14 r15
30 ;; Windows preserves: rbx rcx rsi rdi rbp
32 ;; Linux clobbers: rax rdx rsi r9 r10 r11 r12 r13 r14 r15
33 ;; Linux preserves: rbx rcx rdi rbp r8
39 %include "dbgprint.asm"
40 %include "mb_mgr_datastruct.asm"
; Constant tables used by sha1_x8_avx2.
; Every table is two `dq` pairs = 32 bytes, i.e. exactly one YMM register.
; The commented-out `ddq` lines show the same values written as 128-bit literals.
; NOTE(review): the tables are read with vmovdqa further down, which requires
; 32-byte alignment; the section/align directives are not visible in this
; excerpt -- confirm they are present.
;
; PSHUFFLE_BYTE_FLIP_MASK: byte indices 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12
; repeated for both 128-bit halves. Used as a byte-shuffle control this reverses
; the byte order inside each 32-bit dword (the shuffle instruction itself is not
; visible in this excerpt).
45 PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203
46 ;ddq 0x0c0d0e0f08090a0b0405060700010203
47 dq 0x0405060700010203, 0x0c0d0e0f08090a0b
48 dq 0x0405060700010203, 0x0c0d0e0f08090a0b
; K00_19: SHA-1 round constant 0x5A827999 replicated into all eight dwords;
; loaded into K before the steps that use MAGIC_F0.
49 K00_19: ;ddq 0x5A8279995A8279995A8279995A827999
50 ;ddq 0x5A8279995A8279995A8279995A827999
51 dq 0x5A8279995A827999, 0x5A8279995A827999
52 dq 0x5A8279995A827999, 0x5A8279995A827999
; K20_39: SHA-1 round constant 0x6ED9EBA1 replicated into all eight dwords;
; loaded into K before the steps that use MAGIC_F1.
53 K20_39: ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
54 ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
55 dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
56 dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
; K40_59: SHA-1 round constant 0x8F1BBCDC replicated into all eight dwords;
; loaded into K before the steps that use MAGIC_F2.
57 K40_59: ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
58 ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
59 dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
60 dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
; K60_79: SHA-1 round constant 0xCA62C1D6 replicated into all eight dwords;
; loaded into K before the steps that use MAGIC_F3.
61 K60_79: ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
62 ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
63 dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
64 dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
140 ;; Assume stack aligned to 32 bytes before call
141 ;; Therefore FRAMESZ mod 32 must be 32-8 = 24
;; (the call pushes an 8-byte return address, so a frame whose size is
;; 24 mod 32 brings rsp back to a 32-byte boundary; the instruction that
;; subtracts FRAMESZ from rsp is presumably in the prologue, which is not
;; visible in this excerpt).
;; 32*16 = sixteen 32-byte slots; the step macros address them as [rsp + n*32].
142 %define FRAMESZ 32*16 + 24
;; VMOVPS is an alias for the unaligned packed-single move.
144 %define VMOVPS vmovups
148 ; TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
149 ; "transpose" data in {r0...r7} using temps {t0...t1}
; Treats the eight YMM registers as an 8x8 matrix of 32-bit elements and
; transposes it in place; t0 and t1 are clobbered.
; NOTE(review): the %macro header and the %define lines that bind
; %%r0..%%r7 / %%t0 / %%t1 to the macro parameters are not visible in this
; excerpt.
150 ; Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
151 ; r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
152 ; r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
153 ; r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
154 ; r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
155 ; r4 = {e7 e6 e5 e4 e3 e2 e1 e0}
156 ; r5 = {f7 f6 f5 f4 f3 f2 f1 f0}
157 ; r6 = {g7 g6 g5 g4 g3 g2 g1 g0}
158 ; r7 = {h7 h6 h5 h4 h3 h2 h1 h0}
160 ; Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
161 ; r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
162 ; r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
163 ; r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
164 ; r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
165 ; r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
166 ; r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
167 ; r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
168 ; r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
; vshufps works independently inside each 128-bit lane. Selectors used below:
;   0x44 -> dwords 1,0 of each source      0xEE -> dwords 3,2 of each source
;   0x88 -> even dwords (2,0) of each      0xDD -> odd dwords (3,1) of each
181 ; process top half (r0..r3) {a...d}
182 vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b5 b4 a5 a4 b1 b0 a1 a0}
183 vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b7 b6 a7 a6 b3 b2 a3 a2}
184 vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d5 d4 c5 c4 d1 d0 c1 c0}
185 vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d7 d6 c7 c6 d3 d2 c3 c2}
186 vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d5 c5 b5 a5 d1 c1 b1 a1}
187 vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d6 c6 b6 a6 d2 c2 b2 a2}
188 vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d7 c7 b7 a7 d3 c3 b3 a3}
189 vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d4 c4 b4 a4 d0 c0 b0 a0}
191 ; use r2 in place of t0
; (t0 still holds the top-half result needed by the final permutes, while r2
; has already been consumed, so r2 serves as the scratch register here)
192 ; process bottom half (r4..r7) {e...h}
193 vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f5 f4 e5 e4 f1 f0 e1 e0}
194 vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f7 f6 e7 e6 f3 f2 e3 e2}
195 vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h5 h4 g5 g4 h1 h0 g1 g0}
196 vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h7 h6 g7 g6 h3 h2 g3 g2}
197 vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h5 g5 f5 e5 h1 g1 f1 e1}
198 vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h6 g6 f6 e6 h2 g2 f2 e2}
199 vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h7 g7 f7 e7 h3 g3 f3 e3}
200 vshufps %%t1, %%r2, %%t1, 0x88 ; t1 = {h4 g4 f4 e4 h0 g0 f0 e0}
; Combine the halves across the 128-bit lane boundary.
; vperm2f128 dst, s1, s2, 0x13 -> dst = {high lane of s1 : high lane of s2}
; vperm2f128 dst, s1, s2, 0x02 -> dst = {low  lane of s1 : low  lane of s2}
; Each source pair is read twice, so the 0x13 form must target a register
; other than the pair's sources; the 0x02 form may then overwrite a source.
202 vperm2f128 %%r6, %%r5, %%r1, 0x13 ; h6...a6
203 vperm2f128 %%r2, %%r5, %%r1, 0x02 ; h2...a2
204 vperm2f128 %%r5, %%r7, %%r3, 0x13 ; h5...a5
205 vperm2f128 %%r1, %%r7, %%r3, 0x02 ; h1...a1
206 vperm2f128 %%r7, %%r4, %%r0, 0x13 ; h7...a7
207 vperm2f128 %%r3, %%r4, %%r0, 0x02 ; h3...a3
208 vperm2f128 %%r4, %%t1, %%t0, 0x13 ; h4...a4
209 vperm2f128 %%r0, %%t1, %%t0, 0x02 ; h0...a0
213 ;; Magic functions defined in FIPS 180-1
;; Each macro leaves its result in regF and reads regB/regC/regD without
;; modifying them. Only MAGIC_F2 writes regT.
;; NOTE(review): the %macro headers and parameter %defines for these four
;; macros are not visible in this excerpt.
215 ;MAGIC_F0 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; (D ^ (B & (C ^ D)))
222 ;vmovdqa %%regF,%%regC
223 vpxor %%regF, %%regC,%%regD ; F = C ^ D
224 vpand %%regF, %%regF,%%regB ; F = B & (C ^ D)
225 vpxor %%regF, %%regF,%%regD ; F = D ^ (B & (C ^ D))
228 ;MAGIC_F1 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; (B ^ C ^ D)
235 ;vmovdqa %%regF,%%regD
236 vpxor %%regF,%%regD,%%regC ; F = D ^ C
237 vpxor %%regF,%%regF,%%regB ; F = B ^ C ^ D
242 ;MAGIC_F2 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; ((B & C) | (B & D) | (C & D))
;; Computed as ((B | C) & D) | (B & C), which is the same bitwise majority
;; function with one operation fewer. regT is clobbered.
249 ;vmovdqa %%regF,%%regB
250 ;vmovdqa %%regT,%%regB
251 vpor %%regF,%%regB,%%regC ; F = B | C
252 vpand %%regT,%%regB,%%regC ; T = B & C
253 vpand %%regF,%%regF,%%regD ; F = (B | C) & D
254 vpor %%regF,%%regF,%%regT ; F = ((B | C) & D) | (B & C)
257 ;MAGIC_F3 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ
;; MAGIC_F3 is the same function as MAGIC_F1 (B ^ C ^ D) and simply forwards to it.
264 MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
267 ; PROLD reg, imm, tmp
; In-place rotate left: every 32-bit element of reg is rotated left by imm
; bits. tmp is clobbered.
; NOTE(review): the %macro header and parameter %defines are not visible in
; this excerpt.
272 ;vmovdqa %%tmp, %%reg
273 vpsrld %%tmp, %%reg, (32-%%imm) ; tmp = reg >> (32 - imm)
274 vpslld %%reg, %%reg, %%imm ; reg = reg << imm
275 vpor %%reg, %%reg, %%tmp ; reg = rol(reg, imm)
278 ; PROLD_nd reg, imm, tmp, src
; Non-destructive rotate left: reg = src rotated left by imm bits per 32-bit
; element. src is only read; tmp is clobbered.
284 ;vmovdqa %%tmp, %%reg
285 vpsrld %%tmp, %%src, (32-%%imm) ; tmp = src >> (32 - imm)
286 vpslld %%reg, %%src, %%imm ; reg = src << imm
287 vpor %%reg, %%reg, %%tmp ; reg = rol(src, imm)
; SHA1_STEP_00_15: one SHA-1 step whose message word is read directly from
; the stack slot [rsp + memW*32].
; Parameters, presumably in invocation order (the %define aliases and the
; %endmacro are not visible in this excerpt -- confirm):
;   regA, regB, regC, regD, regE : working variables
;   regT, regF                   : temporaries (regF receives the MAGIC result)
;   memW                         : index of the 32-byte message-word slot
;   immCNT                       : register holding the round constant
;   MAGIC                        : name of the MAGIC_Fx macro to apply
; Effect: regE += immCNT + W[memW] + rol(regA, 5) + MAGIC(regB, regC, regD);
;         regB  = rol(regB, 30). regT and regF are clobbered.
290 %macro SHA1_STEP_00_15 10
301 vpaddd %%regE, %%regE,%%immCNT ; E += K
302 vpaddd %%regE, %%regE,[rsp + (%%memW * 32)] ; E += W[memW]
303 ;vmovdqa %%regT,%%regA
304 PROLD_nd %%regT,5, %%regF,%%regA ; T = rol(A, 5); regF is used as scratch
305 vpaddd %%regE, %%regE,%%regT ; E += rol(A, 5)
306 %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
307 PROLD %%regB,30, %%regT ; B = rol(B, 30)
308 vpaddd %%regE, %%regE,%%regF ; E += FUN
; SHA1_STEP_16_79: one SHA-1 step that first derives the next message word
; from earlier ones kept in a 16-slot ring of 32-byte entries on the stack
; (slot index = (memW - k) & 15), stores it back into the ring, then performs
; the same update as SHA1_STEP_00_15.
; Takes the same ten parameters as SHA1_STEP_00_15 and additionally uses the
; W14 and W16 register aliases, which are defined outside this macro.
; NOTE(review): this excerpt appears to omit some lines of the macro (the
; %define aliases, %endmacro and possibly instructions in the message-schedule
; part); the comments below describe only the instructions that are visible.
311 %macro SHA1_STEP_16_79 10
322 vpaddd %%regE, %%regE,%%immCNT ; E += K
324 vmovdqa W14, [rsp + ((%%memW - 14) & 15) * 32] ; W14 = W[memW - 14]
326 vpxor W16, W16, [rsp + ((%%memW - 8) & 15) * 32] ; W16 ^= W[memW - 8]
327 vpxor W16, W16, [rsp + ((%%memW - 3) & 15) * 32] ; W16 ^= W[memW - 3]
330 vpsrld %%regF, W16, (32-1) ; F = W16 >> 31
332 vpor %%regF, %%regF, W16 ; F |= W16
335 vmovdqa [rsp + ((%%memW - 0) & 15) * 32],%%regF ; store the new word in slot memW & 15
336 vpaddd %%regE, %%regE,%%regF ; E += new message word
338 ;vmovdqa %%regT,%%regA
339 PROLD_nd %%regT,5, %%regF, %%regA ; T = rol(A, 5); regF is used as scratch
340 vpaddd %%regE, %%regE,%%regT ; E += rol(A, 5)
341 %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
342 PROLD %%regB,30, %%regT ; B = rol(B, 30)
343 vpaddd %%regE,%%regE,%%regF ; E += FUN
347 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
348 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
349 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
369 ; void sha1_x8_avx2(void *state, int num_blks)
370 ; arg 1 : rcx : pointer to the state structure: five digest rows (A..E, one
;          32-byte row each, spaced SHA1_DIGEST_ROW_SIZE apart) followed, at
;          offset _data_ptr_sha1, by eight input data pointers (one per lane)
;          NOTE(review): rcx is the register named by the original comment;
;          the arg-register %defines are not visible in this excerpt, so the
;          mapping for other ABIs cannot be confirmed from here.
371 ; arg 2 : rdx : size (in blocks) ;; assumed to be >= 1
; The digest rows and the eight data pointers in *state are both updated in
; place on exit.
; NOTE(review): this excerpt omits parts of the body (prologue/epilogue,
; loop labels, %assign/%rep scaffolding around the repeated lines, and the
; input loads); the comments below describe only what is visible.
372 MKGLOBAL(sha1_x8_avx2,function,internal)
376 ;; Initialize digests
377 vmovdqu A, [state + 0*SHA1_DIGEST_ROW_SIZE]
378 vmovdqu B, [state + 1*SHA1_DIGEST_ROW_SIZE]
379 vmovdqu C, [state + 2*SHA1_DIGEST_ROW_SIZE]
380 vmovdqu D, [state + 3*SHA1_DIGEST_ROW_SIZE]
381 vmovdqu E, [state + 4*SHA1_DIGEST_ROW_SIZE]
382 DBGPRINTL_YMM "Sha1-AVX2 incoming transposed digest", A, B, C, D, E
384 ;; transpose input onto stack
;; Fetch the eight per-lane input pointers from the state structure.
385 mov inp0,[state+_data_ptr_sha1+0*PTR_SZ]
386 mov inp1,[state+_data_ptr_sha1+1*PTR_SZ]
387 mov inp2,[state+_data_ptr_sha1+2*PTR_SZ]
388 mov inp3,[state+_data_ptr_sha1+3*PTR_SZ]
389 mov inp4,[state+_data_ptr_sha1+4*PTR_SZ]
390 mov inp5,[state+_data_ptr_sha1+5*PTR_SZ]
391 mov inp6,[state+_data_ptr_sha1+6*PTR_SZ]
392 mov inp7,[state+_data_ptr_sha1+7*PTR_SZ]
;; F holds the byte-flip mask while the input is being laid out.
396 vmovdqa F, [rel PSHUFFLE_BYTE_FLIP_MASK]
;; Transpose eight registers of lane data so that each register holds the
;; same message word for all eight lanes; T8 and T9 are scratch.
407 TRANSPOSE8 T0, T1, T2, T3, T4, T5, T6, T7, T8, T9
408 DBGPRINTL_YMM "Sha1-AVX2 incoming transposed input", T0, T1, T2, T3, T4, T5, T6, T7, T8, T9
;; Store the transposed words into stack slots I*8+0 .. I*8+7 (32 bytes each).
;; vmovdqa requires rsp to be 32-byte aligned here.
410 vmovdqa [rsp+(I*8+0)*32],T0
412 vmovdqa [rsp+(I*8+1)*32],T1
414 vmovdqa [rsp+(I*8+2)*32],T2
416 vmovdqa [rsp+(I*8+3)*32],T3
418 vmovdqa [rsp+(I*8+4)*32],T4
420 vmovdqa [rsp+(I*8+5)*32],T5
422 vmovdqa [rsp+(I*8+6)*32],T6
424 vmovdqa [rsp+(I*8+7)*32],T7
438 ;; perform 0-79 steps
;; K is reloaded with the matching round-constant table before each group.
440 vmovdqa K, [rel K00_19]
444 SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
;; Preload the message-word registers used by SHA1_STEP_16_79 from slots 0 and 1.
450 vmovdqa W16, [rsp + ((16 - 16) & 15) * 32]
451 vmovdqa W15, [rsp + ((16 - 15) & 15) * 32]
453 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
459 vmovdqa K, [rel K20_39]
461 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1
467 vmovdqa K, [rel K40_59]
469 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2
475 vmovdqa K, [rel K60_79]
477 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3
;; Write the updated digest rows back to the state structure.
492 vmovdqu [state + 0*SHA1_DIGEST_ROW_SIZE], A
493 vmovdqu [state + 1*SHA1_DIGEST_ROW_SIZE], B
494 vmovdqu [state + 2*SHA1_DIGEST_ROW_SIZE], C
495 vmovdqu [state + 3*SHA1_DIGEST_ROW_SIZE], D
496 vmovdqu [state + 4*SHA1_DIGEST_ROW_SIZE], E
497 DBGPRINTL_YMM "Sha1-AVX2 outgoing transposed digest", A, B, C, D, E
498 ;; update input pointers
;; Store the eight lane pointers back into the state structure.
;; NOTE(review): the instructions that advance inp0..inp7 are not visible
;; in this excerpt.
507 mov [state+_data_ptr_sha1+0*PTR_SZ], inp0
508 mov [state+_data_ptr_sha1+1*PTR_SZ], inp1
509 mov [state+_data_ptr_sha1+2*PTR_SZ], inp2
510 mov [state+_data_ptr_sha1+3*PTR_SZ], inp3
511 mov [state+_data_ptr_sha1+4*PTR_SZ], inp4
512 mov [state+_data_ptr_sha1+5*PTR_SZ], inp5
513 mov [state+_data_ptr_sha1+6*PTR_SZ], inp6
514 mov [state+_data_ptr_sha1+7*PTR_SZ], inp7
524 section .note.GNU-stack noalloc noexec nowrite progbits