2 ;; Copyright (c) 2012-2018, Intel Corporation
4 ;; Redistribution and use in source and binary forms, with or without
5 ;; modification, are permitted provided that the following conditions are met:
7 ;; * Redistributions of source code must retain the above copyright notice,
8 ;; this list of conditions and the following disclaimer.
9 ;; * Redistributions in binary form must reproduce the above copyright
10 ;; notice, this list of conditions and the following disclaimer in the
11 ;; documentation and/or other materials provided with the distribution.
12 ;; * Neither the name of Intel Corporation nor the names of its contributors
13 ;; may be used to endorse or promote products derived from this software
14 ;; without specific prior written permission.
16 ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20 ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ; SHA1 code, hybrid, rolled, interleaved
29 ; Uses AVX instructions
; 16-byte constants. Each one is emitted as two qwords, low qword first; the
; ";ddq" comment on the label line shows the same value as a single 128-bit
; literal.
; These labels are read with vmovdqa further down, which requires 16-byte
; alignment. The section/align directives that would guarantee it are not
; visible in this excerpt - TODO confirm.
;
; Named as a byte-shuffle control. Read that way, destination bytes 0..15 select
; source bytes 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 - the byte order inside
; each dword is reversed.
35 PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203
36 dq 0x0405060700010203, 0x0c0d0e0f08090a0b
; Constants for each group of 20 rounds (the label suffix names the round
; range). Every 32-bit value is repeated in all four dword lanes.
37 K00_19: ;ddq 0x5A8279995A8279995A8279995A827999
38 dq 0x5A8279995A827999, 0x5A8279995A827999
39 K20_39: ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
40 dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
41 K40_59: ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
42 dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
43 K60_79: ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
44 dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
; VMOVDQ is the load used for the caller's input buffer; it maps to the
; unaligned form so the buffer needs no particular alignment.
48 %define VMOVDQ vmovdqu ;; assume buffers not aligned
; INP and CTX are each defined twice below. rdi/rsi are the first two integer
; argument registers of the System V AMD64 ABI; rcx/rdx are the first two of the
; Microsoft x64 ABI.
; NOTE(review): the conditional-assembly directives that pick one pair are not
; visible in this excerpt - confirm they exist in the full file.
51 %define INP rdi ; 1st arg
52 %define CTX rsi ; 2nd arg
56 %define INP rcx ; 1st arg
57 %define CTX rdx ; 2nd arg
; Stack frame: three 16-byte slots (offsets 0, 16, 32 hold xmm6-xmm8 in the
; function body) followed by one 8-byte slot, 56 bytes in total.
; _RSP expands to "FRAMESZ-1*8 + rsp", i.e. rsp + 48: the 8-byte slot, used to
; keep the caller's original rsp.
; Both expansions are unparenthesized text substitutions, so they are only safe
; where no higher-precedence operator is applied to them (e.g. inside "[_RSP]").
62 %define FRAMESZ 3*16 + 1*8
63 %define _RSP FRAMESZ-1*8 + rsp
90 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
114 ;; Magic functions defined in FIPS 180-1
116 ; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D)))
129 ; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D)
141 ; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D))
156 ; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D)
; NOTE(review): the lines below are fragments of several macro bodies. Their
; %macro/%endmacro lines and many of their instructions are not visible in this
; excerpt, so the comments here describe the visible lines only.
;
; Delegates to MAGIC_F1 with the same five operands; the MAGIC_F3 formula in the
; comment above is identical to the MAGIC_F1 formula.
163 MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
; Invokes the macro-local name %%MAGIC (its definition is not visible here).
173 %%MAGIC h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D)
; Message-schedule fragment. Vector instructions are interleaved with four
; invocations of the macro passed as parameter %1, always with the same
; operands. The trailing comments track which W[] words each register holds;
; {xxBA} / {DCxx} mark which dword lanes are valid at that point.
185 vpalignr XTMP0, X1, X0, 8 ; XTMP0 = W[-14] (upper 8 bytes of X0, lower 8 bytes of X1)
188 vpxor XTMP1, X2, X0 ; XTMP1 = W[-8] ^ W[-16]
190 vpxor XTMP0, XTMP0, XTMP1 ; XTMP0 = W[-8] ^ W[-14] ^ W[-16]
191 %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D)
195 vpsrldq X4, X3, 4 ; X4 = W[-3] {xxBA} (X3 shifted right by one dword)
; Bit 31 of each dword moved down to bit 0. Presumably the right-shift half of a
; rotate-left-by-1 of X4, matching the explicit rotate of XTMP0 further down;
; the left-shift half is not visible in this excerpt.
205 vpsrld XTMP1, X4, (32-1)
208 %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D)
209 vpxor X4, X4, XTMP1 ; X4 = W[0] {xxBA}
219 vpalignr XTMP1, X4, X3, 4 ; XTMP1 = W[-3] {DCxx}
222 vpxor XTMP0, XTMP0, XTMP1
223 %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D)
224 ;; rotate XTMP0 left 1: (x >> 31) combined by xor with (x << 1), per dword
225 vpsrld XTMP1, XTMP0, (32-1)
233 vpslld XTMP0, XTMP0, 1
236 vpxor XTMP0, XTMP0, XTMP1 ; XTMP0 = W[0] {DCxx}
237 %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D)
; Selector 11100100b keeps dwords 0,1 from X4 and takes dwords 2,3 from XTMP0,
; merging the two half-valid results into one full vector.
239 vshufps X4, X4, XTMP0, 11100100b ; X4 = W[0] {DCBA}
247 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
248 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
249 ;; void sha1_block_avx(void *input_data, UINT32 digest[5])
250 ;; arg 1 : (in) pointer to input data
251 ;; arg 2 : (in/out) pointer to read/write digest
;; The argument registers are named by the INP / CTX defines near the top of
;; the file. The four unaligned 16-byte loads below read 64 bytes starting at
;; INP.
;; NOTE(review): this excerpt omits many lines (entry label, frame allocation,
;; the round bodies, the rsp restore and ret). The comments added here describe
;; the visible lines only.
;; MKGLOBAL is defined outside this excerpt; presumably it declares the symbol
;; global with the given type/visibility - confirm.
252 MKGLOBAL(sha1_block_avx,function,internal)
261 vmovdqa XTMP0, [rel PSHUFFLE_BYTE_FLIP_MASK] ; aligned load of the byte-flip shuffle control
263 mov rax,rsp ; copy rsp
264 VMOVDQ X0, [INP + 0*16] ; input bytes 0..15
266 VMOVDQ X1, [INP + 1*16] ; input bytes 16..31
;; NOTE(review): no "sub rsp, FRAMESZ" is visible before the alignment below;
;; original line 265 is missing from this excerpt and presumably allocates the
;; frame - confirm.
267 and rsp,-64 ; align stack frame (round rsp down to a 64-byte boundary)
268 mov [_RSP],rax ; save copy of rsp
;; Spill xmm6-xmm8 into the three 16-byte frame slots. These registers are
;; callee-saved under the Microsoft x64 ABI. The aligned stores are valid
;; because rsp was just aligned to 64 bytes.
270 vmovdqa [rsp + 0 * 16], xmm6
271 vmovdqa [rsp + 1 * 16], xmm7
272 vmovdqa [rsp + 2 * 16], xmm8
274 ;; load next message block
275 VMOVDQ X2, [INP + 2*16] ; input bytes 32..47
276 VMOVDQ X3, [INP + 3*16] ; input bytes 48..63
278 ;; set up a-f based on h0-h4
279 ;; byte swap first 16 dwords
290 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
292 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
293 vmovdqa XK, [rel K00_19] ; XK = constant for the first group of rounds
329 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
331 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
333 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
335 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
336 vmovdqa XK, [rel K20_39] ; XK = constant for the second group of rounds
372 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
374 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
376 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
378 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
379 vmovdqa XK, [rel K40_59] ; XK = constant for the third group of rounds
415 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
417 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
419 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
421 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
422 vmovdqa XK, [rel K60_79] ; XK = constant for the fourth group of rounds
466 ;; update result digest h0-h4
;; Reload xmm8..xmm6 from the same frame slots they were spilled to above.
;; The restore of rsp from [_RSP] and the return are not visible in this excerpt.
473 vmovdqa xmm8, [rsp + 2 * 16]
474 vmovdqa xmm7, [rsp + 1 * 16]
475 vmovdqa xmm6, [rsp + 0 * 16]
; Emits an empty .note.GNU-stack section with non-executable attributes. GNU
; toolchains read this marker as "this object does not need an executable stack".
; NOTE(review): any conditional guard around this directive is not visible in
; this excerpt.
488 section .note.GNU-stack noalloc noexec nowrite progbits