]>
Commit | Line | Data |
---|---|---|
11fdf7f2 TL |
1 | ;; |
2 | ;; Copyright (c) 2012-2018, Intel Corporation | |
3 | ;; | |
4 | ;; Redistribution and use in source and binary forms, with or without | |
5 | ;; modification, are permitted provided that the following conditions are met: | |
6 | ;; | |
7 | ;; * Redistributions of source code must retain the above copyright notice, | |
8 | ;; this list of conditions and the following disclaimer. | |
9 | ;; * Redistributions in binary form must reproduce the above copyright | |
10 | ;; notice, this list of conditions and the following disclaimer in the | |
11 | ;; documentation and/or other materials provided with the distribution. | |
12 | ;; * Neither the name of Intel Corporation nor the names of its contributors | |
13 | ;; may be used to endorse or promote products derived from this software | |
14 | ;; without specific prior written permission. | |
15 | ;; | |
16 | ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |
17 | ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
18 | ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
19 | ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE | |
20 | ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
21 | ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |
22 | ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |
23 | ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |
24 | ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
25 | ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
26 | ;; | |
27 | ||
28 | ;; code to compute quad SHA512 using AVX | |
29 | ;; use YMMs to tackle the larger digest size | |
30 | ;; outer calling routine takes care of save and restore of XMM registers | |
31 | ;; Logic designed/laid out by JDG | |
32 | ||
33 | ;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15 | |
34 | ;; Stack must be aligned to 32 bytes before call | |
35 | ;; Windows clobbers: rax rbx rdx r8 r9 r10 r11 r12 | |
36 | ;; Windows preserves: rcx rsi rdi rbp r13 r14 r15 | |
37 | ;; | |
38 | ;; Linux clobbers: rax rbx rcx rdx rsi r8 r9 r10 r11 r12 | |
39 | ;; Linux preserves: rcx rdx rdi rbp r13 r14 r15 | |
40 | ;; | |
41 | ;; clobbers ymm0-15 | |
42 | ||
f67539c2 | 43 | %include "include/os.asm" |
11fdf7f2 | 44 | ;%define DO_DBGPRINT |
f67539c2 TL |
45 | %include "include/dbgprint.asm" |
46 | %include "include/transpose_avx2.asm" | |
47 | %include "include/dbgprint.asm" | |
11fdf7f2 TL |
48 | %include "mb_mgr_datastruct.asm" |
49 | ||
50 | section .data | |
51 | default rel | |
52 | align 64 | |
53 | K512_4: | |
54 | dq 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22 | |
55 | dq 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd | |
56 | dq 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f | |
57 | dq 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc | |
58 | dq 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538 | |
59 | dq 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019 | |
60 | dq 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b | |
61 | dq 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118 | |
62 | dq 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242 | |
63 | dq 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe | |
64 | dq 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c | |
65 | dq 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2 | |
66 | dq 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f | |
67 | dq 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1 | |
68 | dq 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235 | |
69 | dq 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694 | |
70 | dq 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2 | |
71 | dq 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3 | |
72 | dq 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5 | |
73 | dq 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65 | |
74 | dq 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275 | |
75 | dq 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483 | |
76 | dq 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4 | |
77 | dq 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5 | |
78 | dq 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab | |
79 | dq 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210 | |
80 | dq 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f | |
81 | dq 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4 | |
82 | dq 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2 | |
83 | dq 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725 | |
84 | dq 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f | |
85 | dq 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70 | |
86 | dq 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc | |
87 | dq 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926 | |
88 | dq 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed | |
89 | dq 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df | |
90 | dq 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de | |
91 | dq 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8 | |
92 | dq 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6 | |
93 | dq 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b | |
94 | dq 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364 | |
95 | dq 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001 | |
96 | dq 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791 | |
97 | dq 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30 | |
98 | dq 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218 | |
99 | dq 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910 | |
100 | dq 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a | |
101 | dq 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8 | |
102 | dq 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8 | |
103 | dq 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53 | |
104 | dq 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99 | |
105 | dq 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8 | |
106 | dq 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63 | |
107 | dq 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb | |
108 | dq 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373 | |
109 | dq 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3 | |
110 | dq 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc | |
111 | dq 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60 | |
112 | dq 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72 | |
113 | dq 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec | |
114 | dq 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28 | |
115 | dq 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9 | |
116 | dq 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915 | |
117 | dq 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b | |
118 | dq 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c | |
119 | dq 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207 | |
120 | dq 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e | |
121 | dq 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178 | |
122 | dq 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba | |
123 | dq 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6 | |
124 | dq 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae | |
125 | dq 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b | |
126 | dq 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84 | |
127 | dq 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493 | |
128 | dq 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc | |
129 | dq 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c | |
130 | dq 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6 | |
131 | dq 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a | |
132 | dq 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec | |
133 | dq 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817 | |
134 | ||
135 | align 32 | |
136 | PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x08090a0b0c0d0e0f0001020304050607 | |
137 | dq 0x0001020304050607, 0x08090a0b0c0d0e0f | |
138 | ;ddq 0x18191a1b1c1d1e1f1011121314151617 | |
139 | dq 0x1011121314151617, 0x18191a1b1c1d1e1f | |
140 | ||
141 | section .text | |
142 | ||
143 | %ifdef LINUX | |
144 | %define arg1 rdi | |
145 | %define arg2 rsi | |
146 | %else | |
147 | %define arg1 rcx | |
148 | %define arg2 rdx | |
149 | %endif | |
150 | ||
151 | ; Common definitions | |
152 | %define STATE arg1 | |
153 | %define INP_SIZE arg2 | |
154 | ||
155 | %define IDX rax | |
156 | %define ROUND rbx | |
157 | %define TBL r8 | |
158 | ||
159 | %define inp0 r9 | |
160 | %define inp1 r10 | |
161 | %define inp2 r11 | |
162 | %define inp3 r12 | |
163 | ||
164 | %define a ymm0 | |
165 | %define b ymm1 | |
166 | %define c ymm2 | |
167 | %define d ymm3 | |
168 | %define e ymm4 | |
169 | %define f ymm5 | |
170 | %define g ymm6 | |
171 | %define h ymm7 | |
172 | ||
173 | %define a0 ymm8 | |
174 | %define a1 ymm9 | |
175 | %define a2 ymm10 | |
176 | ||
177 | %define TT0 ymm14 | |
178 | %define TT1 ymm13 | |
179 | %define TT2 ymm12 | |
180 | %define TT3 ymm11 | |
181 | %define TT4 ymm10 | |
182 | %define TT5 ymm9 | |
183 | ||
184 | %define T1 ymm14 | |
185 | %define TMP ymm15 | |
186 | ||
187 | ||
188 | ||
189 | %define SZ4 4*SHA512_DIGEST_WORD_SIZE ; Size of one vector register | |
190 | %define ROUNDS 80*SZ4 | |
191 | ||
192 | ; Define stack usage | |
193 | ||
194 | ;; Assume stack aligned to 32 bytes before call | |
195 | ;; Therefore FRAMESZ mod 32 must be 32-8 = 24 | |
196 | struc stack_frame | |
197 | .data resb 16*SZ4 | |
198 | .digest resb NUM_SHA512_DIGEST_WORDS*SZ4 | |
199 | .align resb 24 | |
200 | endstruc | |
201 | ||
202 | %define _DIGEST stack_frame.digest | |
203 | ||
11fdf7f2 TL |
204 | %macro ROTATE_ARGS 0 |
205 | %xdefine TMP_ h | |
206 | %xdefine h g | |
207 | %xdefine g f | |
208 | %xdefine f e | |
209 | %xdefine e d | |
210 | %xdefine d c | |
211 | %xdefine c b | |
212 | %xdefine b a | |
213 | %xdefine a TMP_ | |
214 | %endm | |
215 | ||
216 | ; PRORQ reg, imm, tmp | |
217 | ; packed-rotate-right-double | |
218 | ; does a rotate by doing two shifts and an or | |
219 | %macro PRORQ 3 | |
220 | %define %%reg %1 | |
221 | %define %%imm %2 | |
222 | %define %%tmp %3 | |
223 | vpsllq %%tmp, %%reg, (64-(%%imm)) | |
224 | vpsrlq %%reg, %%reg, %%imm | |
225 | vpor %%reg, %%reg, %%tmp | |
226 | %endmacro | |
227 | ||
228 | ; non-destructive | |
229 | ; PRORQ_nd reg, imm, tmp, src | |
230 | %macro PRORQ_nd 4 | |
231 | %define %%reg %1 | |
232 | %define %%imm %2 | |
233 | %define %%tmp %3 | |
234 | %define %%src %4 | |
235 | vpsllq %%tmp, %%src, (64-(%%imm)) | |
236 | vpsrlq %%reg, %%src, %%imm | |
237 | vpor %%reg, %%reg, %%tmp | |
238 | %endmacro | |
239 | ||
240 | ; PRORQ dst/src, amt | |
241 | %macro PRORQ 2 | |
242 | PRORQ %1, %2, TMP | |
243 | %endmacro | |
244 | ||
245 | ; PRORQ_nd dst, src, amt | |
246 | %macro PRORQ_nd 3 | |
247 | PRORQ_nd %1, %3, TMP, %2 | |
248 | %endmacro | |
249 | ||
250 | ||
251 | ||
252 | ;; arguments passed implicitly in preprocessor symbols i, a...h | |
253 | %macro ROUND_00_15 2 | |
254 | %define %%T1 %1 | |
255 | %define %%i %2 | |
256 | PRORQ_nd a0, e, (18-14) ; sig1: a0 = (e >> 4) | |
257 | ||
258 | vpxor a2, f, g ; ch: a2 = f^g | |
259 | vpand a2, a2, e ; ch: a2 = (f^g)&e | |
260 | vpxor a2, a2, g ; a2 = ch | |
261 | ||
262 | PRORQ_nd a1, e, 41 ; sig1: a1 = (e >> 41) | |
263 | vmovdqa [SZ4*(%%i&0xf) + rsp],%%T1 | |
264 | vpaddq %%T1,%%T1,[TBL + ROUND] ; T1 = W + K | |
265 | vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5) | |
266 | PRORQ a0, 14 ; sig1: a0 = (e >> 14) ^ (e >> 18) | |
267 | vpaddq h, h, a2 ; h = h + ch | |
268 | PRORQ_nd a2, a, (34-28) ; sig0: a2 = (a >> 6) | |
269 | vpaddq h, h, %%T1 ; h = h + ch + W + K | |
270 | vpxor a0, a0, a1 ; a0 = sigma1 | |
271 | vmovdqa %%T1, a ; maj: T1 = a | |
272 | PRORQ_nd a1, a, 39 ; sig0: a1 = (a >> 39) | |
273 | vpxor %%T1, %%T1, c ; maj: T1 = a^c | |
274 | add ROUND, SZ4 ; ROUND++ | |
275 | vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b | |
276 | vpaddq h, h, a0 | |
277 | ||
278 | vpaddq d, d, h | |
279 | ||
280 | vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11) | |
281 | PRORQ a2, 28 ; sig0: a2 = (a >> 28) ^ (a >> 34) | |
282 | vpxor a2, a2, a1 ; a2 = sig0 | |
283 | vpand a1, a, c ; maj: a1 = a&c | |
284 | vpor a1, a1, %%T1 ; a1 = maj | |
285 | vpaddq h, h, a1 ; h = h + ch + W + K + maj | |
286 | vpaddq h, h, a2 ; h = h + ch + W + K + maj + sigma0 | |
287 | ROTATE_ARGS | |
288 | %endm | |
289 | ||
290 | ||
291 | ;; arguments passed implicitly in preprocessor symbols i, a...h | |
292 | %macro ROUND_16_XX 2 | |
293 | %define %%T1 %1 | |
294 | %define %%i %2 | |
295 | vmovdqa %%T1, [SZ4*((%%i-15)&0xf) + rsp] | |
296 | vmovdqa a1, [SZ4*((%%i-2)&0xf) + rsp] | |
297 | vmovdqa a0, %%T1 | |
298 | PRORQ %%T1, 8-1 | |
299 | vmovdqa a2, a1 | |
300 | PRORQ a1, 61-19 | |
301 | vpxor %%T1, %%T1, a0 | |
302 | PRORQ %%T1, 1 | |
303 | vpxor a1, a1, a2 | |
304 | PRORQ a1, 19 | |
305 | vpsrlq a0, a0, 7 | |
306 | vpxor %%T1, %%T1, a0 | |
307 | vpsrlq a2, a2, 6 | |
308 | vpxor a1, a1, a2 | |
309 | vpaddq %%T1, %%T1, [SZ4*((%%i-16)&0xf) + rsp] | |
310 | vpaddq a1, a1, [SZ4*((%%i-7)&0xf) + rsp] | |
311 | vpaddq %%T1, %%T1, a1 | |
312 | ||
313 | ROUND_00_15 %%T1, %%i | |
314 | ||
315 | %endm | |
316 | ||
317 | ||
318 | ;; void sha512_x4_avx2(void *STATE, const int INP_SIZE) | |
319 | ;; arg 1 : STATE : pointer to input data | |
320 | ;; arg 2 : INP_SIZE : size of data in blocks (assumed >= 1) | |
321 | MKGLOBAL(sha512_x4_avx2,function,internal) | |
322 | align 32 | |
323 | sha512_x4_avx2: | |
324 | ; general registers preserved in outer calling routine | |
325 | ; outer calling routine saves all the XMM registers | |
326 | ||
327 | sub rsp, stack_frame_size | |
328 | ||
329 | ;; Load the pre-transposed incoming digest. | |
330 | vmovdqu a, [STATE+ 0*SHA512_DIGEST_ROW_SIZE] | |
331 | vmovdqu b, [STATE+ 1*SHA512_DIGEST_ROW_SIZE] | |
332 | vmovdqu c, [STATE+ 2*SHA512_DIGEST_ROW_SIZE] | |
333 | vmovdqu d, [STATE+ 3*SHA512_DIGEST_ROW_SIZE] | |
334 | vmovdqu e, [STATE+ 4*SHA512_DIGEST_ROW_SIZE] | |
335 | vmovdqu f, [STATE+ 5*SHA512_DIGEST_ROW_SIZE] | |
336 | vmovdqu g, [STATE+ 6*SHA512_DIGEST_ROW_SIZE] | |
337 | vmovdqu h, [STATE+ 7*SHA512_DIGEST_ROW_SIZE] | |
338 | ||
339 | DBGPRINTL_YMM "sha512-avx2 Incoming digest", a, b, c, d, e, f, g, h | |
340 | lea TBL,[K512_4] | |
341 | ||
342 | ;; load the address of each of the MAX_LANES (4) message lanes | |
343 | ;; getting ready to transpose input onto stack | |
344 | mov inp0,[STATE + _data_ptr_sha512 + 0*PTR_SZ] | |
345 | mov inp1,[STATE + _data_ptr_sha512 + 1*PTR_SZ] | |
346 | mov inp2,[STATE + _data_ptr_sha512 + 2*PTR_SZ] | |
347 | mov inp3,[STATE + _data_ptr_sha512 + 3*PTR_SZ] | |
348 | ||
349 | xor IDX, IDX | |
350 | lloop: | |
351 | xor ROUND, ROUND | |
352 | ||
353 | ;; save old digest | |
354 | vmovdqa [rsp + _DIGEST + 0*SZ4], a | |
355 | vmovdqa [rsp + _DIGEST + 1*SZ4], b | |
356 | vmovdqa [rsp + _DIGEST + 2*SZ4], c | |
357 | vmovdqa [rsp + _DIGEST + 3*SZ4], d | |
358 | vmovdqa [rsp + _DIGEST + 4*SZ4], e | |
359 | vmovdqa [rsp + _DIGEST + 5*SZ4], f | |
360 | vmovdqa [rsp + _DIGEST + 6*SZ4], g | |
361 | vmovdqa [rsp + _DIGEST + 7*SZ4], h | |
362 | ||
363 | %assign i 0 | |
364 | %rep 4 | |
365 | ;; load up the shuffler for little-endian to big-endian format | |
366 | vmovdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK] | |
f67539c2 TL |
367 | |
368 | TRANSPOSE4_U64_LOAD4 TT4, TT1, TT5, TT3, inp0, inp1, inp2, inp3, IDX+i*32 | |
369 | ||
370 | TRANSPOSE4_U64 TT4, TT1, TT5, TT3, TT0, TT2 | |
371 | DBGPRINTL_YMM "sha512-avx2 Incoming data", TT0, TT1, TT2, TT3 | |
11fdf7f2 TL |
372 | vpshufb TT0, TT0, TMP |
373 | vpshufb TT1, TT1, TMP | |
374 | vpshufb TT2, TT2, TMP | |
375 | vpshufb TT3, TT3, TMP | |
376 | ROUND_00_15 TT0,(i*4+0) | |
377 | ROUND_00_15 TT1,(i*4+1) | |
378 | ROUND_00_15 TT2,(i*4+2) | |
379 | ROUND_00_15 TT3,(i*4+3) | |
380 | %assign i (i+1) | |
381 | %endrep | |
382 | ;; Increment IDX by message block size == 8 (loop) * 16 (XMM width in bytes) | |
383 | add IDX, 4 * 32 | |
384 | ||
385 | %assign i (i*4) | |
386 | ||
387 | jmp Lrounds_16_xx | |
388 | align 16 | |
389 | Lrounds_16_xx: | |
390 | %rep 16 | |
391 | ROUND_16_XX T1, i | |
392 | %assign i (i+1) | |
393 | %endrep | |
394 | ||
395 | cmp ROUND,ROUNDS | |
396 | jb Lrounds_16_xx | |
397 | ||
398 | ;; add old digest | |
399 | vpaddq a, a, [rsp + _DIGEST + 0*SZ4] | |
400 | vpaddq b, b, [rsp + _DIGEST + 1*SZ4] | |
401 | vpaddq c, c, [rsp + _DIGEST + 2*SZ4] | |
402 | vpaddq d, d, [rsp + _DIGEST + 3*SZ4] | |
403 | vpaddq e, e, [rsp + _DIGEST + 4*SZ4] | |
404 | vpaddq f, f, [rsp + _DIGEST + 5*SZ4] | |
405 | vpaddq g, g, [rsp + _DIGEST + 6*SZ4] | |
406 | vpaddq h, h, [rsp + _DIGEST + 7*SZ4] | |
407 | ||
408 | sub INP_SIZE, 1 ;; consumed one message block | |
409 | jne lloop | |
410 | ||
411 | ; write back to memory (state object) the transposed digest | |
412 | vmovdqu [STATE+ 0*SHA512_DIGEST_ROW_SIZE ],a | |
413 | vmovdqu [STATE+ 1*SHA512_DIGEST_ROW_SIZE ],b | |
414 | vmovdqu [STATE+ 2*SHA512_DIGEST_ROW_SIZE ],c | |
415 | vmovdqu [STATE+ 3*SHA512_DIGEST_ROW_SIZE ],d | |
416 | vmovdqu [STATE+ 4*SHA512_DIGEST_ROW_SIZE ],e | |
417 | vmovdqu [STATE+ 5*SHA512_DIGEST_ROW_SIZE ],f | |
418 | vmovdqu [STATE+ 6*SHA512_DIGEST_ROW_SIZE ],g | |
419 | vmovdqu [STATE+ 7*SHA512_DIGEST_ROW_SIZE ],h | |
420 | DBGPRINTL_YMM "sha512-avx2 Outgoing digest", a, b, c, d, e, f, g, h | |
421 | ||
422 | ;; update input data pointers | |
423 | add inp0, IDX | |
424 | mov [STATE + _data_ptr_sha512 + 0*PTR_SZ], inp0 | |
425 | add inp1, IDX | |
426 | mov [STATE + _data_ptr_sha512 + 1*PTR_SZ], inp1 | |
427 | add inp2, IDX | |
428 | mov [STATE + _data_ptr_sha512 + 2*PTR_SZ], inp2 | |
429 | add inp3, IDX | |
430 | mov [STATE + _data_ptr_sha512 + 3*PTR_SZ], inp3 | |
431 | ||
432 | ;;;;;;;;;;;;;;;; | |
433 | ;; Postamble | |
434 | ||
f67539c2 TL |
435 | ;; Clear stack frame ((16 + 8)*32 bytes) |
436 | %ifdef SAFE_DATA | |
437 | vpxor ymm0, ymm0 | |
438 | %assign i 0 | |
439 | %rep (16+NUM_SHA512_DIGEST_WORDS) | |
440 | vmovdqa [rsp + i*SZ4], ymm0 | |
441 | %assign i (i+1) | |
442 | %endrep | |
443 | %endif | |
444 | ||
11fdf7f2 TL |
445 | add rsp, stack_frame_size |
446 | ||
447 | ; outer calling routine restores XMM and other GP registers | |
448 | ret | |
449 | ||
450 | %ifdef LINUX | |
451 | section .note.GNU-stack noalloc noexec nowrite progbits | |
452 | %endif |