;; Copyright (c) 2012-2018, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;;     * Redistributions of source code must retain the above copyright notice,
;;       this list of conditions and the following disclaimer.
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;     * Neither the name of Intel Corporation nor the names of its contributors
;;       may be used to endorse or promote products derived from this software
;;       without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
; This code schedules 1 block at a time, with 4 lanes per block
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
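
;; The table above holds the SHA-256 round constants K[0..63]: the first
;; 32 bits of the fractional parts of the cube roots of the first 64
;; primes (FIPS 180-4).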
PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203
        dq 0x0405060700010203, 0x0c0d0e0f08090a0b
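;; Used with vpshufb to reverse the bytes within each dword: destination
;; byte i takes source byte mask[i]. For example, memory bytes 00 01 02 03
;; load as the little-endian dword 0x03020100 and shuffle to 0x00010203,
;; i.e. the big-endian interpretation SHA-256 requires.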
; shuffle xBxA -> 00BA
_SHUF_00BA: ;ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100
        dq 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF

; shuffle xDxC -> DC00
_SHUF_DC00: ;ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
        dq 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100
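;; vpshufb zeroes any destination byte whose control byte has bit 7 set,
;; so the 0xFF halves of these masks clear lanes: they compact the two
;; valid dwords of {xBxA} into {00BA} and of {xDxC} into {DC00}.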
%define VMOVDQ vmovdqu ;; assume buffers not aligned
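;; vmovdqu tolerates unaligned addresses; vmovdqa would fault on a
;; misaligned 16-byte access, so the unaligned form is the safe default.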
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros

; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
; Load xmm with mem and byte swap each dword
%macro COPY_XMM_AND_BSWAP 3
        VMOVDQ %1, %2           ; unaligned 16-byte load
        vpshufb %1, %1, %3      ; byte swap each dword
%endmacro
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA
%define SHUF_DC00 xmm12 ; shuffle xDxC -> DC00
%define BYTE_FLIP_MASK xmm13
%ifdef LINUX
;; System V AMD64 calling convention
%define CTX rsi ; 2nd arg
%define INP rdi ; 1st arg
%define SRND rdi ; clobbers INP
%else
;; Windows x64 calling convention
%define CTX rdx ; 2nd arg
%define INP rcx ; 1st arg
%define SRND rcx ; clobbers INP
%endif
%define H0 0x6a09e667
%define H1 0xbb67ae85
%define H2 0x3c6ef372
%define H3 0xa54ff53a
%define H4 0x510e527f
%define H5 0x9b05688c
%define H6 0x1f83d9ab
%define H7 0x5be0cd19
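;; H0..H7 are the SHA-256 initial digest words: the first 32 bits of the
;; fractional parts of the square roots of the first 8 primes (FIPS 180-4).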
%define FUNC sha256_one_block_avx
; Rotate values of symbols X0...X3

; Rotate values of symbols a...h
%macro FOUR_ROUNDS_AND_SCHED 0
        ;; compute s0 four at a time and s1 two at a time
        ;; compute W[-16] + W[-7] 4 at a time
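        ;; Message schedule recurrence being computed (FIPS 180-4):
        ;;   s0(x) = (x ror 7) ^ (x ror 18) ^ (x >> 3)
        ;;   s1(x) = (x ror 17) ^ (x ror 19) ^ (x >> 10)
        ;;   W[t]  = W[t-16] + s0(W[t-15]) + W[t-7] + s1(W[t-2])
        ;; Four W dwords are produced per invocation, interleaved with
        ;; 4 rounds of the compression function.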
        MY_ROR y0, (25-11) ; y0 = e >> (25-11)
        vpalignr XTMP0, X3, X2, 4 ; XTMP0 = W[-7]
        MY_ROR y1, (22-13) ; y1 = a >> (22-13)
        xor y0, e ; y0 = e ^ (e >> (25-11))
        MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
        xor y1, a ; y1 = a ^ (a >> (22-13))
        vpaddd XTMP0, XTMP0, X0 ; XTMP0 = W[-7] + W[-16]
        xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and y2, e ; y2 = (f^g)&e
        MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
        vpalignr XTMP1, X1, X0, 4 ; XTMP1 = W[-15]
        xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        MY_ROR y0, 6 ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        xor y2, g ; y2 = CH = ((f^g)&e)^g
        MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        add y2, y0 ; y2 = S1 + CH
        add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH
        add h, y2 ; h = h + S1 + CH + k + w
        vpsrld XTMP2, XTMP1, 7 ; XTMP2 = W[-15] >> 7
        add d, h ; d = d + h + S1 + CH + k + w
        vpslld XTMP3, XTMP1, (32-7) ; XTMP3 = W[-15] << (32-7)
        and y0, b ; y0 = (a|c)&b
        add h, y1 ; h = h + S1 + CH + k + w + S0
        vpor XTMP3, XTMP3, XTMP2 ; XTMP3 = W[-15] MY_ROR 7
        or y0, y2 ; y0 = MAJ = ((a|c)&b)|(a&c)
        add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
        MY_ROR y0, (25-11) ; y0 = e >> (25-11)
        xor y0, e ; y0 = e ^ (e >> (25-11))
        MY_ROR y1, (22-13) ; y1 = a >> (22-13)
        vpsrld XTMP2, XTMP1, 18 ; XTMP2 = W[-15] >> 18
        xor y1, a ; y1 = a ^ (a >> (22-13))
        MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
        vpsrld XTMP4, XTMP1, 3 ; XTMP4 = W[-15] >> 3
        MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
        xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and y2, e ; y2 = (f^g)&e
        MY_ROR y0, 6 ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        vpslld XTMP1, XTMP1, (32-18) ; XTMP1 = W[-15] << (32-18)
        xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        xor y2, g ; y2 = CH = ((f^g)&e)^g
        vpxor XTMP3, XTMP3, XTMP1 ; XTMP3 = W[-15] MY_ROR 7 ^ W[-15] << (32-18)
        add y2, y0 ; y2 = S1 + CH
        add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH
        MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        vpxor XTMP3, XTMP3, XTMP2 ; XTMP3 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18
        add h, y2 ; h = h + S1 + CH + k + w
        vpxor XTMP1, XTMP3, XTMP4 ; XTMP1 = s0
        add d, h ; d = d + h + S1 + CH + k + w
        vpshufd XTMP2, X3, 11111010b ; XTMP2 = W[-2] {BBAA}
        and y0, b ; y0 = (a|c)&b
        add h, y1 ; h = h + S1 + CH + k + w + S0
        vpaddd XTMP0, XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0
        or y0, y2 ; y0 = MAJ = ((a|c)&b)|(a&c)
        add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
        ;vmovdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {BBAA}
        MY_ROR y0, (25-11) ; y0 = e >> (25-11)
        ;vmovdqa XTMP4, XTMP2 ; XTMP4 = W[-2] {BBAA}
        xor y0, e ; y0 = e ^ (e >> (25-11))
        MY_ROR y1, (22-13) ; y1 = a >> (22-13)
        xor y1, a ; y1 = a ^ (a >> (22-13))
        MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
        vpsrld XTMP4, XTMP2, 10 ; XTMP4 = W[-2] >> 10 {BBAA}
        vpsrlq XTMP3, XTMP2, 19 ; XTMP3 = W[-2] MY_ROR 19 {xBxA}
        xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and y2, e ; y2 = (f^g)&e
        vpsrlq XTMP2, XTMP2, 17 ; XTMP2 = W[-2] MY_ROR 17 {xBxA}
        MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
        xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        xor y2, g ; y2 = CH = ((f^g)&e)^g
        MY_ROR y0, 6 ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        vpxor XTMP2, XTMP2, XTMP3 ; XTMP2 = W[-2] MY_ROR 17 ^ W[-2] MY_ROR 19 {xBxA}
        add y2, y0 ; y2 = S1 + CH
        MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH
        vpxor XTMP4, XTMP4, XTMP2 ; XTMP4 = s1 {xBxA}
        add h, y2 ; h = h + S1 + CH + k + w
        vpshufb XTMP4, XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA}
        add d, h ; d = d + h + S1 + CH + k + w
        vpaddd XTMP0, XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]}
        and y0, b ; y0 = (a|c)&b
        add h, y1 ; h = h + S1 + CH + k + w + S0
        vpshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC}
        or y0, y2 ; y0 = MAJ = ((a|c)&b)|(a&c)
        add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
        ;vmovdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {DDCC}
        MY_ROR y0, (25-11) ; y0 = e >> (25-11)
        ;vmovdqa XTMP5, XTMP2 ; XTMP5 = W[-2] {DDCC}
        MY_ROR y1, (22-13) ; y1 = a >> (22-13)
        xor y0, e ; y0 = e ^ (e >> (25-11))
        MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
        vpsrld XTMP5, XTMP2, 10 ; XTMP5 = W[-2] >> 10 {DDCC}
        xor y1, a ; y1 = a ^ (a >> (22-13))
        vpsrlq XTMP3, XTMP2, 19 ; XTMP3 = W[-2] MY_ROR 19 {xDxC}
        xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and y2, e ; y2 = (f^g)&e
        MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
        vpsrlq XTMP2, XTMP2, 17 ; XTMP2 = W[-2] MY_ROR 17 {xDxC}
        xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        MY_ROR y0, 6 ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        xor y2, g ; y2 = CH = ((f^g)&e)^g
        vpxor XTMP2, XTMP2, XTMP3 ; XTMP2 = W[-2] MY_ROR 17 ^ W[-2] MY_ROR 19 {xDxC}
        MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        add y2, y0 ; y2 = S1 + CH
        add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH
        vpxor XTMP5, XTMP5, XTMP2 ; XTMP5 = s1 {xDxC}
        add h, y2 ; h = h + S1 + CH + k + w
        vpshufb XTMP5, XTMP5, SHUF_DC00 ; XTMP5 = s1 {DC00}
        add d, h ; d = d + h + S1 + CH + k + w
        vpaddd X0, XTMP5, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]}
        and y0, b ; y0 = (a|c)&b
        add h, y1 ; h = h + S1 + CH + k + w + S0
        or y0, y2 ; y0 = MAJ = ((a|c)&b)|(a&c)
        add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
;; input is [rsp + _XFER + %1 * 4]
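;; One SHA-256 round (FIPS 180-4), as computed below:
;;   S1  = (e ror 6) ^ (e ror 11) ^ (e ror 25)
;;   CH  = (e & f) ^ (~e & g)          ; computed here as ((f^g)&e)^g
;;   T1  = h + S1 + CH + K[t] + W[t]
;;   S0  = (a ror 2) ^ (a ror 13) ^ (a ror 22)
;;   MAJ = (a & b) ^ (a & c) ^ (b & c) ; computed here as ((a|c)&b)|(a&c)
;;   d += T1 ; h = T1 + S0 + MAJ ; then the a..h symbols rotate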
        MY_ROR y0, (25-11) ; y0 = e >> (25-11)
        xor y0, e ; y0 = e ^ (e >> (25-11))
        MY_ROR y1, (22-13) ; y1 = a >> (22-13)
        xor y1, a ; y1 = a ^ (a >> (22-13))
        MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
        xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
        and y2, e ; y2 = (f^g)&e
        xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        MY_ROR y0, 6 ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        xor y2, g ; y2 = CH = ((f^g)&e)^g
        add y2, y0 ; y2 = S1 + CH
        MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        add y2, [rsp + _XFER + %1 * 4] ; y2 = k + w + S1 + CH
        add h, y2 ; h = h + S1 + CH + k + w
        add d, h ; d = d + h + S1 + CH + k + w
        and y0, b ; y0 = (a|c)&b
        add h, y1 ; h = h + S1 + CH + k + w + S0
        or y0, y2 ; y0 = MAJ = ((a|c)&b)|(a&c)
        add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; void FUNC(void *input_data, UINT32 digest[8])
;; arg 1 : pointer to input data
;; arg 2 : pointer to digest
MKGLOBAL(FUNC,function,)
        vmovdqa [rsp + _XMM_SAVE + 0*16], xmm6
        vmovdqa [rsp + _XMM_SAVE + 1*16], xmm7
        vmovdqa [rsp + _XMM_SAVE + 2*16], xmm8
        vmovdqa [rsp + _XMM_SAVE + 3*16], xmm9
        vmovdqa [rsp + _XMM_SAVE + 4*16], xmm10
        vmovdqa [rsp + _XMM_SAVE + 5*16], xmm11
        vmovdqa [rsp + _XMM_SAVE + 6*16], xmm12
        vmovdqa [rsp + _XMM_SAVE + 7*16], xmm13
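        ;; xmm6-xmm15 are callee-saved under the Windows x64 calling
        ;; convention, so the xmm registers used here are spilled to the
        ;; stack and restored before returning.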
        ;; load initial digest
        vmovdqa BYTE_FLIP_MASK, [rel PSHUFFLE_BYTE_FLIP_MASK]
        vmovdqa SHUF_00BA, [rel _SHUF_00BA]
        vmovdqa SHUF_DC00, [rel _SHUF_DC00]
        ;; byte swap first 16 dwords
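        ;; SHA-256 treats the message block as big-endian dwords; x86 is
        ;; little-endian, so each dword's bytes are reversed as it is loaded.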
        COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK
        COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK
        COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK
        COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK
        ;; schedule 48 input dwords, by doing 3 iterations of 16 rounds each

        vpaddd XFER, X0, [TBL + 0*16]
        vmovdqa [rsp + _XFER], XFER
        FOUR_ROUNDS_AND_SCHED

        vpaddd XFER, X0, [TBL + 1*16]
        vmovdqa [rsp + _XFER], XFER
        FOUR_ROUNDS_AND_SCHED

        vpaddd XFER, X0, [TBL + 2*16]
        vmovdqa [rsp + _XFER], XFER
        FOUR_ROUNDS_AND_SCHED

        vpaddd XFER, X0, [TBL + 3*16]
        vmovdqa [rsp + _XFER], XFER
        FOUR_ROUNDS_AND_SCHED
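
        ;; Each FOUR_ROUNDS_AND_SCHED call performs 4 rounds on the K+W
        ;; values staged in _XFER and rotates the X0..X3 symbols, so one
        ;; pass of four calls covers 16 rounds while scheduling the W
        ;; dwords for 16 future rounds.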
        vpaddd XFER, X0, [TBL + 0*16]
        vmovdqa [rsp + _XFER], XFER

        vpaddd XFER, X1, [TBL + 1*16]
        vmovdqa [rsp + _XFER], XFER
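
        ;; For the final 16 rounds the message schedule is already
        ;; complete, so only DO_ROUND is needed: each group of 4 rounds
        ;; consumes one register of precomputed W added to its K constants.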
        vmovdqa xmm6,  [rsp + _XMM_SAVE + 0*16]
        vmovdqa xmm7,  [rsp + _XMM_SAVE + 1*16]
        vmovdqa xmm8,  [rsp + _XMM_SAVE + 2*16]
        vmovdqa xmm9,  [rsp + _XMM_SAVE + 3*16]
        vmovdqa xmm10, [rsp + _XMM_SAVE + 4*16]
        vmovdqa xmm11, [rsp + _XMM_SAVE + 5*16]
        vmovdqa xmm12, [rsp + _XMM_SAVE + 6*16]
        vmovdqa xmm13, [rsp + _XMM_SAVE + 7*16]
section .note.GNU-stack noalloc noexec nowrite progbits