;; Copyright (c) 2012-2018, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;;     * Redistributions of source code must retain the above copyright notice,
;;       this list of conditions and the following disclaimer.
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;     * Neither the name of Intel Corporation nor the names of its contributors
;;       may be used to endorse or promote products derived from this software
;;       without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
; This code schedules one block at a time, with 4 lanes per block
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
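;; SHA-256 round constants K[0..63] (FIPS 180-4): the first 32 bits of the
;; fractional parts of the cube roots of the first 64 prime numbers.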
	dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203
	dq 0x0405060700010203, 0x0c0d0e0f08090a0b
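;; vpshufb with this mask sets dest byte i = src byte mask[i]; the mask bytes
;; run 03 02 01 00 07 06 05 04 ... in memory, so it reverses the byte order
;; within each 32-bit dword, e.g. {b3,b2,b1,b0} -> {b0,b1,b2,b3}, converting
;; the big-endian message words into little-endian lane order.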
; shuffle xBxA -> 00BA
_SHUF_00BA: ;ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100
	dq 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF
; shuffle xDxC -> DC00
_SHUF_DC00: ;ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
	dq 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100
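;; Mask bytes with the top bit set make vpshufb write zero, so _SHUF_00BA maps
;; {xBxA} -> {0,0,B,A} and _SHUF_DC00 maps {xDxC} -> {D,C,0,0}; the zeroed
;; lanes let the two halves of s1 be merged later with a plain vpaddd.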
%define VMOVDQ vmovdqu ;; assume buffers not aligned

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
; Load xmm with mem and byte swap each dword
%macro COPY_XMM_AND_BSWAP 3
	VMOVDQ	%1, %2
	vpshufb	%1, %1, %3
%endmacro
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA
%define SHUF_DC00 xmm12 ; shuffle xDxC -> DC00
%define BYTE_FLIP_MASK xmm13
%ifdef LINUX
%define CTX rsi ; 2nd arg
%define INP rdi ; 1st arg
%define SRND rdi ; clobbers INP
%else
%define CTX rdx ; 2nd arg
%define INP rcx ; 1st arg
%define SRND rcx ; clobbers INP
%endif
%define FUNC sha256_block_avx

; Rotate values of symbols X0...X3
; Rotate values of symbols a...h

%macro FOUR_ROUNDS_AND_SCHED 0
	;; compute s0 four at a time and s1 two at a time
	;; compute W[-16] + W[-7] 4 at a time
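	;; Reference (FIPS 180-4) message schedule, computed four lanes at a time:
	;;   W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]
	;;   sigma0(x) = ROR(x, 7) ^ ROR(x,18) ^ (x >>  3)
	;;   sigma1(x) = ROR(x,17) ^ ROR(x,19) ^ (x >> 10)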
	MY_ROR	y0, (25-11)		; y0 = e >> (25-11)
	vpalignr XTMP0, X3, X2, 4	; XTMP0 = W[-7]
	MY_ROR	y1, (22-13)		; y1 = a >> (22-13)
	xor	y0, e			; y0 = e ^ (e >> (25-11))
	MY_ROR	y0, (11-6)		; y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor	y1, a			; y1 = a ^ (a >> (22-13))
	vpaddd	XTMP0, XTMP0, X0	; XTMP0 = W[-7] + W[-16]
	xor	y0, e			; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	y2, e			; y2 = (f^g)&e
	MY_ROR	y1, (13-2)		; y1 = (a >> (13-2)) ^ (a >> (22-2))
	vpalignr XTMP1, X1, X0, 4	; XTMP1 = W[-15]
	xor	y1, a			; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR	y0, 6			; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor	y2, g			; y2 = CH = ((f^g)&e)^g
	MY_ROR	y1, 2			; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add	y2, y0			; y2 = S1 + CH
	add	y2, [rsp + _XFER + 0*4]	; y2 = k + w + S1 + CH
	add	h, y2			; h = h + S1 + CH + k + w
	vpsrld	XTMP2, XTMP1, 7		; XTMP2 = W[-15] >> 7
	add	d, h			; d = d + h + S1 + CH + k + w
	vpslld	XTMP3, XTMP1, (32-7)	; XTMP3 = W[-15] << (32-7)
	and	y0, b			; y0 = (a|c)&b
	add	h, y1			; h = h + S1 + CH + k + w + S0
	vpor	XTMP3, XTMP3, XTMP2	; XTMP3 = W[-15] MY_ROR 7
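	;; AVX (pre-AVX-512) has no packed dword rotate, so ROR(W[-15],7) above
	;; is synthesized from a right shift, a left shift and an OR.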
	or	y0, y2			; y0 = MAJ = ((a|c)&b)|(a&c)
	add	h, y0			; h = h + S1 + CH + k + w + S0 + MAJ
	MY_ROR	y0, (25-11)		; y0 = e >> (25-11)
	xor	y0, e			; y0 = e ^ (e >> (25-11))
	MY_ROR	y1, (22-13)		; y1 = a >> (22-13)
	vpsrld	XTMP2, XTMP1, 18	; XTMP2 = W[-15] >> 18
	xor	y1, a			; y1 = a ^ (a >> (22-13))
	MY_ROR	y0, (11-6)		; y0 = (e >> (11-6)) ^ (e >> (25-6))
	vpsrld	XTMP4, XTMP1, 3		; XTMP4 = W[-15] >> 3
	MY_ROR	y1, (13-2)		; y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor	y0, e			; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	y2, e			; y2 = (f^g)&e
	MY_ROR	y0, 6			; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	vpslld	XTMP1, XTMP1, (32-18)	; XTMP1 = W[-15] << (32-18)
	xor	y1, a			; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor	y2, g			; y2 = CH = ((f^g)&e)^g
	vpxor	XTMP3, XTMP3, XTMP1	; XTMP3 = W[-15] MY_ROR 7 ^ W[-15] << (32-18)
	add	y2, y0			; y2 = S1 + CH
	add	y2, [rsp + _XFER + 1*4]	; y2 = k + w + S1 + CH
	MY_ROR	y1, 2			; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	vpxor	XTMP3, XTMP3, XTMP2	; XTMP3 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18
	add	h, y2			; h = h + S1 + CH + k + w
	vpxor	XTMP1, XTMP3, XTMP4	; XTMP1 = s0
	add	d, h			; d = d + h + S1 + CH + k + w
	vpshufd	XTMP2, X3, 11111010b	; XTMP2 = W[-2] {BBAA}
	and	y0, b			; y0 = (a|c)&b
	add	h, y1			; h = h + S1 + CH + k + w + S0
	vpaddd	XTMP0, XTMP0, XTMP1	; XTMP0 = W[-16] + W[-7] + s0
	or	y0, y2			; y0 = MAJ = ((a|c)&b)|(a&c)
	add	h, y0			; h = h + S1 + CH + k + w + S0 + MAJ
	;vmovdqa XTMP3, XTMP2		; XTMP3 = W[-2] {BBAA}
	MY_ROR	y0, (25-11)		; y0 = e >> (25-11)
	;vmovdqa XTMP4, XTMP2		; XTMP4 = W[-2] {BBAA}
	xor	y0, e			; y0 = e ^ (e >> (25-11))
	MY_ROR	y1, (22-13)		; y1 = a >> (22-13)
	xor	y1, a			; y1 = a ^ (a >> (22-13))
	MY_ROR	y0, (11-6)		; y0 = (e >> (11-6)) ^ (e >> (25-6))
	vpsrld	XTMP4, XTMP2, 10	; XTMP4 = W[-2] >> 10 {BBAA}
	vpsrlq	XTMP3, XTMP2, 19	; XTMP3 = W[-2] MY_ROR 19 {xBxA}
	xor	y0, e			; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	y2, e			; y2 = (f^g)&e
	vpsrlq	XTMP2, XTMP2, 17	; XTMP2 = W[-2] MY_ROR 17 {xBxA}
	MY_ROR	y1, (13-2)		; y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor	y1, a			; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor	y2, g			; y2 = CH = ((f^g)&e)^g
	MY_ROR	y0, 6			; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	vpxor	XTMP2, XTMP2, XTMP3	; XTMP2 = W[-2] MY_ROR 17 ^ W[-2] MY_ROR 19
	add	y2, y0			; y2 = S1 + CH
	MY_ROR	y1, 2			; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add	y2, [rsp + _XFER + 2*4]	; y2 = k + w + S1 + CH
	vpxor	XTMP4, XTMP4, XTMP2	; XTMP4 = s1 {xBxA}
	add	h, y2			; h = h + S1 + CH + k + w
	vpshufb	XTMP4, XTMP4, SHUF_00BA	; XTMP4 = s1 {00BA}
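	;; Since each qword of XTMP2 held a dword duplicated in both halves
	;; ({BBAA}), the 64-bit vpsrlq by 17 and 19 leaves ROR(W[-2],17) and
	;; ROR(W[-2],19) in the low dword of each qword; the junk left in the
	;; odd dwords is discarded by the SHUF_00BA shuffle.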
	add	d, h			; d = d + h + S1 + CH + k + w
	vpaddd	XTMP0, XTMP0, XTMP4	; XTMP0 = {..., ..., W[1], W[0]}
	and	y0, b			; y0 = (a|c)&b
	add	h, y1			; h = h + S1 + CH + k + w + S0
	vpshufd	XTMP2, XTMP0, 01010000b	; XTMP2 = W[-2] {DDCC}
	or	y0, y2			; y0 = MAJ = ((a|c)&b)|(a&c)
	add	h, y0			; h = h + S1 + CH + k + w + S0 + MAJ
	;vmovdqa XTMP3, XTMP2		; XTMP3 = W[-2] {DDCC}
	MY_ROR	y0, (25-11)		; y0 = e >> (25-11)
	;vmovdqa XTMP5, XTMP2		; XTMP5 = W[-2] {DDCC}
	MY_ROR	y1, (22-13)		; y1 = a >> (22-13)
	xor	y0, e			; y0 = e ^ (e >> (25-11))
	MY_ROR	y0, (11-6)		; y0 = (e >> (11-6)) ^ (e >> (25-6))
	vpsrld	XTMP5, XTMP2, 10	; XTMP5 = W[-2] >> 10 {DDCC}
	xor	y1, a			; y1 = a ^ (a >> (22-13))
	vpsrlq	XTMP3, XTMP2, 19	; XTMP3 = W[-2] MY_ROR 19 {xDxC}
	xor	y0, e			; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and	y2, e			; y2 = (f^g)&e
	MY_ROR	y1, (13-2)		; y1 = (a >> (13-2)) ^ (a >> (22-2))
	vpsrlq	XTMP2, XTMP2, 17	; XTMP2 = W[-2] MY_ROR 17 {xDxC}
	xor	y1, a			; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR	y0, 6			; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor	y2, g			; y2 = CH = ((f^g)&e)^g
	vpxor	XTMP2, XTMP2, XTMP3	; XTMP2 = W[-2] MY_ROR 17 ^ W[-2] MY_ROR 19
	MY_ROR	y1, 2			; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add	y2, y0			; y2 = S1 + CH
	add	y2, [rsp + _XFER + 3*4]	; y2 = k + w + S1 + CH
	vpxor	XTMP5, XTMP5, XTMP2	; XTMP5 = s1 {xDxC}
	add	h, y2			; h = h + S1 + CH + k + w
	vpshufb	XTMP5, XTMP5, SHUF_DC00	; XTMP5 = s1 {DC00}
	add	d, h			; d = d + h + S1 + CH + k + w
	vpaddd	X0, XTMP5, XTMP0	; X0 = {W[3], W[2], W[1], W[0]}
	and	y0, b			; y0 = (a|c)&b
	add	h, y1			; h = h + S1 + CH + k + w + S0
	or	y0, y2			; y0 = MAJ = ((a|c)&b)|(a&c)
	add	h, y0			; h = h + S1 + CH + k + w + S0 + MAJ
%endmacro
%macro DO_ROUND 1
	;; input is [rsp + _XFER + %1 * 4]
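	;; Reference round (FIPS 180-4), with a..h the working variables:
	;;   T1 = h + Sigma1(e) + Ch(e,f,g) + K[t] + W[t]
	;;   T2 = Sigma0(a) + Maj(a,b,c)
	;;   Sigma1(e) = ROR(e,6) ^ ROR(e,11) ^ ROR(e,25)
	;;   Sigma0(a) = ROR(a,2) ^ ROR(a,13) ^ ROR(a,22)
	;; The staged rotates below use the identity
	;;   ROR(ROR(ROR(e,25-11) ^ e, 11-6) ^ e, 6)
	;;     = ROR(e,6) ^ ROR(e,11) ^ ROR(e,25)
	;; so each Sigma needs only one temporary register.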
	MY_ROR	y0, (25-11)		; y0 = e >> (25-11)
	xor	y0, e			; y0 = e ^ (e >> (25-11))
	MY_ROR	y1, (22-13)		; y1 = a >> (22-13)
	xor	y1, a			; y1 = a ^ (a >> (22-13))
	MY_ROR	y0, (11-6)		; y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor	y0, e			; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	MY_ROR	y1, (13-2)		; y1 = (a >> (13-2)) ^ (a >> (22-2))
	and	y2, e			; y2 = (f^g)&e
	xor	y1, a			; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	MY_ROR	y0, 6			; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor	y2, g			; y2 = CH = ((f^g)&e)^g
	add	y2, y0			; y2 = S1 + CH
	MY_ROR	y1, 2			; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add	y2, [rsp + _XFER + %1 * 4]	; y2 = k + w + S1 + CH
	add	h, y2			; h = h + S1 + CH + k + w
	add	d, h			; d = d + h + S1 + CH + k + w
	and	y0, b			; y0 = (a|c)&b
	add	h, y1			; h = h + S1 + CH + k + w + S0
	or	y0, y2			; y0 = MAJ = ((a|c)&b)|(a&c)
	add	h, y0			; h = h + S1 + CH + k + w + S0 + MAJ
%endmacro
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; void FUNC(void *input_data, UINT32 digest[8], UINT64 num_blks)
;; arg 1 : pointer to input data
;; arg 2 : pointer to digest
;; arg 3 : number of 64-byte blocks to process
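;; An equivalent C declaration (illustrative; FUNC resolves to
;; sha256_block_avx via the define above):
;;   void sha256_block_avx(const void *input_data, uint32_t digest[8],
;;                         uint64_t num_blks);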
MKGLOBAL(FUNC,function,)
	;; xmm6-xmm13 are callee-saved in the Windows x64 ABI; preserve them
	vmovdqa	[rsp + _XMM_SAVE + 0*16], xmm6
	vmovdqa	[rsp + _XMM_SAVE + 1*16], xmm7
	vmovdqa	[rsp + _XMM_SAVE + 2*16], xmm8
	vmovdqa	[rsp + _XMM_SAVE + 3*16], xmm9
	vmovdqa	[rsp + _XMM_SAVE + 4*16], xmm10
	vmovdqa	[rsp + _XMM_SAVE + 5*16], xmm11
	vmovdqa	[rsp + _XMM_SAVE + 6*16], xmm12
	vmovdqa	[rsp + _XMM_SAVE + 7*16], xmm13
	;; load initial digest

	vmovdqa	BYTE_FLIP_MASK, [rel PSHUFFLE_BYTE_FLIP_MASK]
	vmovdqa	SHUF_00BA, [rel _SHUF_00BA]
	vmovdqa	SHUF_DC00, [rel _SHUF_DC00]

	;; byte swap first 16 dwords
	COPY_XMM_AND_BSWAP	X0, [INP + 0*16], BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X1, [INP + 1*16], BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X2, [INP + 2*16], BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP	X3, [INP + 3*16], BYTE_FLIP_MASK
	;; schedule 48 input dwords, by doing 3 iterations of 16 rounds each
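	;; each FOUR_ROUNDS_AND_SCHED below consumes one XFER slot holding four
	;; precomputed K[t]+W[t] values and schedules the next four W dwords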
	vpaddd	XFER, X0, [TBL + 0*16]
	vmovdqa	[rsp + _XFER], XFER
	FOUR_ROUNDS_AND_SCHED

	vpaddd	XFER, X0, [TBL + 1*16]
	vmovdqa	[rsp + _XFER], XFER
	FOUR_ROUNDS_AND_SCHED

	vpaddd	XFER, X0, [TBL + 2*16]
	vmovdqa	[rsp + _XFER], XFER
	FOUR_ROUNDS_AND_SCHED

	vpaddd	XFER, X0, [TBL + 3*16]
	vmovdqa	[rsp + _XFER], XFER
	FOUR_ROUNDS_AND_SCHED

	vpaddd	XFER, X0, [TBL + 0*16]
	vmovdqa	[rsp + _XFER], XFER

	vpaddd	XFER, X1, [TBL + 1*16]
	vmovdqa	[rsp + _XFER], XFER
	;; restore the callee-saved xmm registers (Windows x64 ABI)
	vmovdqa	xmm6, [rsp + _XMM_SAVE + 0*16]
	vmovdqa	xmm7, [rsp + _XMM_SAVE + 1*16]
	vmovdqa	xmm8, [rsp + _XMM_SAVE + 2*16]
	vmovdqa	xmm9, [rsp + _XMM_SAVE + 3*16]
	vmovdqa	xmm10, [rsp + _XMM_SAVE + 4*16]
	vmovdqa	xmm11, [rsp + _XMM_SAVE + 5*16]
	vmovdqa	xmm12, [rsp + _XMM_SAVE + 6*16]
	vmovdqa	xmm13, [rsp + _XMM_SAVE + 7*16]
section .note.GNU-stack noalloc noexec nowrite progbits