2 ;; Copyright (c) 2012-2018, Intel Corporation
4 ;; Redistribution and use in source and binary forms, with or without
5 ;; modification, are permitted provided that the following conditions are met:
7 ;; * Redistributions of source code must retain the above copyright notice,
8 ;; this list of conditions and the following disclaimer.
9 ;; * Redistributions in binary form must reproduce the above copyright
10 ;; notice, this list of conditions and the following disclaimer in the
11 ;; documentation and/or other materials provided with the distribution.
12 ;; * Neither the name of Intel Corporation nor the names of its contributors
13 ;; may be used to endorse or promote products derived from this software
14 ;; without specific prior written permission.
16 ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20 ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ; This code schedules 1 block at a time, with 4 lanes per block
29 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
30 %include "include/os.asm"
;; SHA-256 round constants K[0..63] (FIPS 180-4, sec. 4.2.2): the first 32
;; bits of the fractional parts of the cube roots of the first 64 primes.
;; One dword is consumed per round; they are fetched 16 bytes (4 rounds)
;; at a time via TBL below.
36 dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
37 dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
38 dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
39 dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
40 dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
41 dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
42 dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
43 dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
44 dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
45 dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
46 dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
47 dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
48 dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
49 dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
50 dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
51 dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
;; pshufb control that reverses the byte order within each dword, used to
;; load the big-endian message into little-endian registers.
53 PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203
54 dq 0x0405060700010203, 0x0c0d0e0f08090a0b
56 ; shuffle xBxA -> 00BA (pack low dword of each qword into the low half;
; 0xFF bytes have the high bit set, so pshufb zeroes the upper half)
57 _SHUF_00BA: ;ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100
58 dq 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF
59 ; shuffle xDxC -> DC00 (pack low dword of each qword into the high half;
; the 0xFF bytes zero the lower half)
60 _SHUF_DC00: ;ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
61 dq 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100
66 %define MOVDQ movdqu ;; assume buffers not aligned
68 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
70 ; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
71 ; Load xmm with mem and byte swap each dword
; NOTE(review): the macro body is not visible in this excerpt; per its
; header it loads %2 via MOVDQ into %1 and applies pshufb with %3.
72 %macro COPY_XMM_AND_BSWAP 3
77 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Constant shuffle masks kept resident in high XMM registers for the
; whole function (loaded once from .rodata below; xmm10-xmm12 are among
; the registers saved to _XMM_SAVE in the prologue).
91 %define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA
92 %define SHUF_DC00 xmm11 ; shuffle xDxC -> DC00
93 %define BYTE_FLIP_MASK xmm12
; SysV AMD64 (Linux/BSD) argument registers: 1st arg rdi, 2nd arg rsi.
96 %define CTX rsi ; 2nd arg
97 %define INP rdi ; 1st arg
99 %define SRND rdi ; clobbers INP
; Win64 argument registers: 1st arg rcx, 2nd arg rdx.
; NOTE(review): the %ifdef/%else guards selecting between the two ABI
; blocks are not visible in this excerpt — confirm in the full file.
104 %define CTX rdx ; 2nd arg
105 %define INP rcx ; 1st arg
107 %define SRND rcx ; clobbers INP
134 %define FUNC sha256_block_sse
138 ; Rotate values of symbols X0...X3
148 ; Rotate values of symbols a...h
;; Perform 4 SHA-256 rounds while computing the next 4 message-schedule
;; words: s0 is computed four lanes at a time, s1 in two pairs of two.
;; In the comments, '>>' on e/a denotes rotate-right (the ror instruction).
;; NOTE(review): this excerpt is sampled — interleaved lines (y0/y1/y2
;; register loads, some vector steps, register rotation, %endmacro) are
;; not visible here.
161 %macro FOUR_ROUNDS_AND_SCHED 0
162 ;; compute s0 four at a time and s1 two at a time
163 ;; compute W[-16] + W[-7] 4 at a time
166 ror y0, (25-11) ; y0 = e >> (25-11)
168 palignr XTMP0, X2, 4 ; XTMP0 = W[-7]
169 ror y1, (22-13) ; y1 = a >> (22-13)
170 xor y0, e ; y0 = e ^ (e >> (25-11))
172 ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
174 xor y1, a ; y1 = a ^ (a >> (22-13))
176 paddd XTMP0, X0 ; XTMP0 = W[-7] + W[-16]
177 xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
178 and y2, e ; y2 = (f^g)&e
179 ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
181 palignr XTMP1, X0, 4 ; XTMP1 = W[-15]
182 xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
183 ror y0, 6 ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
184 xor y2, g ; y2 = CH = ((f^g)&e)^g
185 movdqa XTMP2, XTMP1 ; XTMP2 = W[-15]
186 ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
187 add y2, y0 ; y2 = S1 + CH
188 add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH
189 movdqa XTMP3, XTMP1 ; XTMP3 = W[-15]
191 add h, y2 ; h = h + S1 + CH + k + w
195 add d, h ; d = d + h + S1 + CH + k + w
198 and y0, b ; y0 = (a|c)&b
199 add h, y1 ; h = h + S1 + CH + k + w + S0
200 por XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7
201 or y0, y2 ; y0 = MAJ = ((a|c)&b)|(a&c)
202 add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
;; round 2 of 4: continue s0 computation on W[-15]
205 movdqa XTMP2, XTMP3 ; XTMP2 = W[-15]
208 movdqa XTMP4, XTMP3 ; XTMP4 = W[-15]
209 ror y0, (25-11) ; y0 = e >> (25-11)
210 xor y0, e ; y0 = e ^ (e >> (25-11))
212 ror y1, (22-13) ; y1 = a >> (22-13)
214 xor y1, a ; y1 = a ^ (a >> (22-13))
215 ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
218 ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
219 xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
220 and y2, e ; y2 = (f^g)&e
221 ror y0, 6 ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
223 xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
224 xor y2, g ; y2 = CH = ((f^g)&e)^g
225 psrld XTMP4, 3 ; XTMP4 = W[-15] >> 3
226 add y2, y0 ; y2 = S1 + CH
227 add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH
228 ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
229 pxor XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
231 add h, y2 ; h = h + S1 + CH + k + w
233 pxor XTMP1, XTMP4 ; XTMP1 = s0
235 add d, h ; d = d + h + S1 + CH + k + w
238 pshufd XTMP2, X3, 11111010b ; XTMP2 = W[-2] {BBAA}
239 and y0, b ; y0 = (a|c)&b
240 add h, y1 ; h = h + S1 + CH + k + w + S0
241 paddd XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0
242 or y0, y2 ; y0 = MAJ = ((a|c)&b)|(a&c)
243 add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
;; round 3 of 4: s1 for the low pair of new words (lanes A,B)
246 movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {BBAA}
249 ror y0, (25-11) ; y0 = e >> (25-11)
250 movdqa XTMP4, XTMP2 ; XTMP4 = W[-2] {BBAA}
251 xor y0, e ; y0 = e ^ (e >> (25-11))
252 ror y1, (22-13) ; y1 = a >> (22-13)
254 xor y1, a ; y1 = a ^ (a >> (22-13))
255 ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
256 psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xBxA}
258 psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xBxA}
259 xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
260 and y2, e ; y2 = (f^g)&e
261 psrld XTMP4, 10 ; XTMP4 = W[-2] >> 10 {BBAA}
262 ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
263 xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
264 xor y2, g ; y2 = CH = ((f^g)&e)^g
265 ror y0, 6 ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
267 add y2, y0 ; y2 = S1 + CH
268 ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
269 add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH
270 pxor XTMP4, XTMP2 ; XTMP4 = s1 {xBxA}
272 add h, y2 ; h = h + S1 + CH + k + w
274 pshufb XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA}
276 add d, h ; d = d + h + S1 + CH + k + w
278 paddd XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]}
279 and y0, b ; y0 = (a|c)&b
280 add h, y1 ; h = h + S1 + CH + k + w + S0
282 pshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC}
283 or y0, y2 ; y0 = MAJ = ((a|c)&b)|(a&c)
284 add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
;; round 4 of 4: s1 for the high pair of new words (lanes C,D)
287 movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {DDCC}
289 ror y0, (25-11) ; y0 = e >> (25-11)
291 movdqa X0, XTMP2 ; X0 = W[-2] {DDCC}
292 ror y1, (22-13) ; y1 = a >> (22-13)
293 xor y0, e ; y0 = e ^ (e >> (25-11))
295 ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
296 psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xDxC}
297 xor y1, a ; y1 = a ^ (a >> (22-13))
299 psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xDxC}
300 xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
301 and y2, e ; y2 = (f^g)&e
302 ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
303 psrld X0, 10 ; X0 = W[-2] >> 10 {DDCC}
304 xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
305 ror y0, 6 ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
306 xor y2, g ; y2 = CH = ((f^g)&e)^g
308 ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
309 add y2, y0 ; y2 = S1 + CH
310 add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH
311 pxor X0, XTMP2 ; X0 = s1 {xDxC}
313 add h, y2 ; h = h + S1 + CH + k + w
315 pshufb X0, SHUF_DC00 ; X0 = s1 {DC00}
317 add d, h ; d = d + h + S1 + CH + k + w
319 paddd X0, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]}
320 and y0, b ; y0 = (a|c)&b
321 add h, y1 ; h = h + S1 + CH + k + w + S0
322 or y0, y2 ; y0 = MAJ = ((a|c)&b)|(a&c)
323 add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
;; One plain SHA-256 round (no message scheduling) — used for the final 16
;; rounds once W[] is fully computed. %1 = round index (0..3) within the
;; current 4-dword _XFER block. '>>' on e/a in the comments denotes
;; rotate-right (ror).
;; NOTE(review): the %macro header line and the y0/y1/y2 register loads
;; are not visible in this excerpt.
329 ;; input is [rsp + _XFER + %1 * 4]
332 ror y0, (25-11) ; y0 = e >> (25-11)
334 xor y0, e ; y0 = e ^ (e >> (25-11))
335 ror y1, (22-13) ; y1 = a >> (22-13)
337 xor y1, a ; y1 = a ^ (a >> (22-13))
338 ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
340 xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
341 ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
342 and y2, e ; y2 = (f^g)&e
343 xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
344 ror y0, 6 ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
345 xor y2, g ; y2 = CH = ((f^g)&e)^g
346 add y2, y0 ; y2 = S1 + CH
347 ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
348 add y2, [rsp + _XFER + %1 * 4] ; y2 = k + w + S1 + CH
350 add h, y2 ; h = h + S1 + CH + k + w
353 add d, h ; d = d + h + S1 + CH + k + w
355 and y0, b ; y0 = (a|c)&b
356 add h, y1 ; h = h + S1 + CH + k + w + S0
357 or y0, y2 ; y0 = MAJ = ((a|c)&b)|(a&c)
358 add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
362 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
363 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
364 ;; void FUNC(void *input_data, UINT32 digest[8])
365 ;; arg 1 : pointer to input data
366 ;; arg 2 : pointer to digest
368 MKGLOBAL(FUNC,function,internal)
;; Save XMM registers that this function clobbers (xmm6-xmm15 are
;; callee-saved in the Win64 ABI).
;; NOTE(review): the function label, stack-frame setup and the %ifndef
;; LINUX guard around this save block are not visible in this excerpt.
383 movdqa [rsp + _XMM_SAVE + 0*16],xmm6
384 movdqa [rsp + _XMM_SAVE + 1*16],xmm7
385 movdqa [rsp + _XMM_SAVE + 2*16],xmm8
386 movdqa [rsp + _XMM_SAVE + 3*16],xmm9
387 movdqa [rsp + _XMM_SAVE + 4*16],xmm10
388 movdqa [rsp + _XMM_SAVE + 5*16],xmm11
389 movdqa [rsp + _XMM_SAVE + 6*16],xmm12
392 ;; load initial digest
;; Load the constant shuffle masks once; they stay resident in
;; xmm10-xmm12 for the whole block.
402 movdqa BYTE_FLIP_MASK, [rel PSHUFFLE_BYTE_FLIP_MASK]
403 movdqa SHUF_00BA, [rel _SHUF_00BA]
404 movdqa SHUF_DC00, [rel _SHUF_DC00]
408 ;; byte swap first 16 dwords
409 COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK
410 COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK
411 COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK
412 COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK
414 ;; schedule 48 input dwords, by doing 3 rounds of 16 each
;; Per 4-round group: fetch K[t..t+3] from TBL, park the k+w values in
;; _XFER on the stack, then run 4 rounds while scheduling 4 new words.
;; NOTE(review): the paddd combining XFER with the message words and the
;; loop control around these groups are elided from this excerpt.
418 movdqa XFER, [TBL + 0*16]
420 movdqa [rsp + _XFER], XFER
421 FOUR_ROUNDS_AND_SCHED
423 movdqa XFER, [TBL + 1*16]
425 movdqa [rsp + _XFER], XFER
426 FOUR_ROUNDS_AND_SCHED
428 movdqa XFER, [TBL + 2*16]
430 movdqa [rsp + _XFER], XFER
431 FOUR_ROUNDS_AND_SCHED
433 movdqa XFER, [TBL + 3*16]
435 movdqa [rsp + _XFER], XFER
437 FOUR_ROUNDS_AND_SCHED
;; Final 16 rounds: W[] is fully computed, so just add the remaining K
;; constants and run plain rounds.
;; NOTE(review): the DO_ROUND invocations are elided from this excerpt.
444 paddd X0, [TBL + 0*16]
445 movdqa [rsp + _XFER], X0
450 paddd X1, [TBL + 1*16]
451 movdqa [rsp + _XFER], X1
;; Restore the callee-saved XMM registers saved in the prologue.
475 movdqa xmm6,[rsp + _XMM_SAVE + 0*16]
476 movdqa xmm7,[rsp + _XMM_SAVE + 1*16]
477 movdqa xmm8,[rsp + _XMM_SAVE + 2*16]
478 movdqa xmm9,[rsp + _XMM_SAVE + 3*16]
479 movdqa xmm10,[rsp + _XMM_SAVE + 4*16]
480 movdqa xmm11,[rsp + _XMM_SAVE + 5*16]
481 movdqa xmm12,[rsp + _XMM_SAVE + 6*16]
483 ;; Clear potential sensitive data stored in stack
;; NOTE(review): xmm0 is presumably zeroed just before these stores (the
;; zeroing instruction is not visible in this excerpt) — confirm in the
;; full file.
485 movdqa [rsp + _XMM_SAVE + 0 * 16], xmm0
486 movdqa [rsp + _XMM_SAVE + 1 * 16], xmm0
487 movdqa [rsp + _XMM_SAVE + 2 * 16], xmm0
488 movdqa [rsp + _XMM_SAVE + 3 * 16], xmm0
489 movdqa [rsp + _XMM_SAVE + 4 * 16], xmm0
490 movdqa [rsp + _XMM_SAVE + 5 * 16], xmm0
491 movdqa [rsp + _XMM_SAVE + 6 * 16], xmm0
;; Mark the stack non-executable for GNU ld (avoids an executable-stack
;; warning/requirement on ELF targets).
511 section .note.GNU-stack noalloc noexec nowrite progbits