########################################################################
# Implement fast SHA-256 with AVX2 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 2 blocks at a time, with 4 lanes per block
########################################################################
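#
# Layout used below: two consecutive 64-byte blocks are loaded and
# interleaved so that the low 128-bit lane of each ymm register holds
# four message dwords of block 1 and the high lane the matching dwords
# of block 2.  One vpaddd then adds K[t] and schedules W[t] for both
# blocks at once; block 1 consumes the low-lane W+K values as they are
# produced, while the high-lane values stay in the stack _XFER area and
# are replayed for block 2 (loop3 below).
########################################################################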

#include <linux/linkage.h>

## assume buffers not aligned
#define	VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
	add	\p1, \p2
	mov	\p2, \p1
.endm
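
# Example expansion: "addm (4*0)(CTX), a" becomes
#	add	(4*0)(CTX), a
#	mov	a, (4*0)(CTX)
# folding the working variable back into the digest word in memory.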

################################

X0 = %ymm4
X1 = %ymm5
X2 = %ymm6
X3 = %ymm7

# XMM versions of above
XWORD0 = %xmm4
XWORD1 = %xmm5
XWORD2 = %xmm6
XWORD3 = %xmm7

XTMP0 = %ymm0
XTMP1 = %ymm1
XTMP2 = %ymm2
XTMP3 = %ymm3
XTMP4 = %ymm8
XFER  = %ymm9
XTMP5 = %ymm11

SHUF_00BA =	%ymm10	# shuffle xBxA -> 00BA
SHUF_DC00 =	%ymm12	# shuffle xDxC -> DC00
BYTE_FLIP_MASK = %ymm13

X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK

NUM_BLKS = %rdx	# 3rd arg
INP	= %rsi	# 2nd arg
CTX	= %rdi	# 1st arg
c	= %ecx
d	= %r8d
e	= %edx	# clobbers NUM_BLKS
y3	= %esi	# clobbers INP

SRND	= CTX	# SRND is same register as CTX

a = %eax
b = %ebx
f = %r9d
g = %r10d
h = %r11d
old_h = %r11d

T1 = %r12d
y0 = %r13d
y1 = %r14d
y2 = %r15d


_XFER_SIZE	= 2*64*4	# 2 blocks, 64 rounds, 4 bytes/round
_XMM_SAVE_SIZE	= 0
_INP_END_SIZE	= 8
_INP_SIZE	= 8
_CTX_SIZE	= 8

_XFER		= 0
_XMM_SAVE	= _XFER     + _XFER_SIZE
_INP_END	= _XMM_SAVE + _XMM_SAVE_SIZE
_INP		= _INP_END  + _INP_END_SIZE
_CTX		= _INP      + _INP_SIZE
STACK_SIZE	= _CTX      + _CTX_SIZE
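
# Resulting frame layout, relative to the 32-byte-aligned %rsp
# (offsets follow from the sizes above):
#	_XFER		  0..511	W[t]+K[t] for 2 blocks x 64 rounds
#	_INP_END	512..519	pointer to the last input block
#	_INP		520..527	current input pointer
#	_CTX		528..535	saved state pointer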

# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
	X_ = X0
	X0 = X1
	X1 = X2
	X2 = X3
	X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
	old_h = h
	TMP_ = h
	h = g
	g = f
	f = e
	e = d
	d = c
	c = b
	b = a
	a = TMP_
.endm
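
# Both macros expand at assembly time only, so no register moves are
# emitted.  After one ROTATE_ARGS, for example, the symbol "a" names
# the register that previously held "h" (just updated by the round),
# "b" names the old "a", and so on.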
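# The macros below implement the FIPS 180-4 SHA-256 round and message
# schedule.  As a plain-C sketch of what each round computes (symbols
# only, for orientation):
#
#	S1   = ror(e,6) ^ ror(e,11) ^ ror(e,25)
#	ch   = ((f ^ g) & e) ^ g		# == (e&f) ^ (~e&g)
#	t1   = h + S1 + ch + K[t] + W[t]
#	S0   = ror(a,2) ^ ror(a,13) ^ ror(a,22)
#	maj  = ((a | c) & b) | (a & c)		# == (a&b) ^ (a&c) ^ (b&c)
#	d   += t1
#	h    = t1 + S0 + maj
#
# and, for rounds 16..63, the schedule computed with the vector unit:
#
#	s0   = ror(W[t-15],7) ^ ror(W[t-15],18) ^ (W[t-15] >> 3)
#	s1   = ror(W[t-2],17) ^ ror(W[t-2],19)  ^ (W[t-2]  >> 10)
#	W[t] = W[t-16] + s0 + W[t-7] + s1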
.macro FOUR_ROUNDS_AND_SCHED disp
################################### RND N + 0 ############################

	mov	a, y3		# y3 = a				# MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B

	addl	\disp(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA
	vpalignr $4, X2, X3, XTMP0	# XTMP0 = W[-7]
	mov	f, y2		# y2 = f				# CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g				# CH
	vpaddd	X0, XTMP0, XTMP0	# XTMP0 = W[-7] + W[-16]
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1

	and	e, y2		# y2 = (f^g)&e				# CH
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	add	h, d		# d = k + w + h + d			# --

	and	b, y3		# y3 = (a|c)&b				# MAJA
	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0

	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH
	vpsrld	$7, XTMP1, XTMP2
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	c, T1		# T1 = a&c				# MAJB

	add	y0, y2		# y2 = S1 + CH				# --
	vpslld	$(32-7), XTMP1, XTMP3
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --
	vpor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7

	vpsrld	$18, XTMP1, XTMP2
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ			# --


	ROTATE_ARGS

################################### RND N + 1 ############################

	mov	a, y3		# y3 = a				# MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	offset = \disp + 1*4
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA


	vpsrld	$3, XTMP1, XTMP4	# XTMP4 = W[-15] >> 3
	mov	f, y2		# y2 = f				# CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g				# CH


	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	and	e, y2		# y2 = (f^g)&e				# CH
	add	h, d		# d = k + w + h + d			# --

	vpslld	$(32-18), XTMP1, XTMP1
	and	b, y3		# y3 = (a|c)&b				# MAJA
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0

	vpxor	XTMP1, XTMP3, XTMP3
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH

	vpxor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --

	vpxor	XTMP4, XTMP3, XTMP1	# XTMP1 = s0
	vpshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --

	vpaddd	XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ			# --

	vpsrld	$10, XTMP2, XTMP4	# XTMP4 = W[-2] >> 10 {BBAA}


	ROTATE_ARGS

################################### RND N + 2 ############################

	mov	a, y3		# y3 = a				# MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	offset = \disp + 2*4
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --

	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xBxA}
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	or	c, y3		# y3 = a|c				# MAJA
	mov	f, y2		# y2 = f				# CH
	xor	g, y2		# y2 = f^g				# CH

	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xBxA}
	and	e, y2		# y2 = (f^g)&e				# CH

	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	vpxor	XTMP3, XTMP2, XTMP2
	add	h, d		# d = k + w + h + d			# --
	and	b, y3		# y3 = (a|c)&b				# MAJA

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	vpxor	XTMP2, XTMP4, XTMP4	# XTMP4 = s1 {xBxA}
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH

	vpshufb	SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1 {00BA}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	vpaddd	XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --
	vpshufd	$0b01010000, XTMP0, XTMP2	# XTMP2 = W[-2] {DDCC}

	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3, h		# h = t1 + S0 + MAJ			# --


	ROTATE_ARGS

################################### RND N + 3 ############################

	mov	a, y3		# y3 = a				# MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	offset = \disp + 3*4
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA


	vpsrld	$10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
	mov	f, y2		# y2 = f				# CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g				# CH


	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xDxC}
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e				# CH
	add	h, d		# d = k + w + h + d			# --
	and	b, y3		# y3 = (a|c)&b				# MAJA

	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xDxC}
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH

	vpxor	XTMP3, XTMP2, XTMP2
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	add	y0, y2		# y2 = S1 + CH				# --

	vpxor	XTMP2, XTMP5, XTMP5	# XTMP5 = s1 {xDxC}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --

	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	vpshufb	SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1 {DC00}

	vpaddd	XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	c, T1		# T1 = a&c				# MAJB
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ

	add	y1, h		# h = k + w + h + S0			# --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ			# --

	ROTATE_ARGS
	rotate_Xs
.endm

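# DO_4ROUNDS performs the same four rounds on the working variables but
# computes no new message words; it is used for the last 16 rounds of a
# block and for replaying the second block from the W+K values already
# saved in the _XFER area.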
.macro DO_4ROUNDS disp
################################### RND N + 0 ###########################

	mov	f, y2		# y2 = f				# CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g				# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e				# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	addl	\disp(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	b, y3		# y3 = (a|c)&b				# MAJA
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --


	add	h, d		# d = k + w + h + d			# --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --

	ROTATE_ARGS

################################### RND N + 1 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f				# CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g				# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e				# CH
	add	y3, old_h	# h = t1 + S0 + MAJ			# --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*1 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	b, y3		# y3 = (a|c)&b				# MAJA
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --


	add	h, d		# d = k + w + h + d			# --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --

	ROTATE_ARGS

################################### RND N + 2 ##############################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f				# CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g				# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e				# CH
	add	y3, old_h	# h = t1 + S0 + MAJ			# --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*2 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	b, y3		# y3 = (a|c)&b				# MAJA
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --


	add	h, d		# d = k + w + h + d			# --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --

	ROTATE_ARGS

################################### RND N + 3 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f				# CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g				# CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e				# CH
	add	y3, old_h	# h = t1 + S0 + MAJ			# --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g			# CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*3 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c				# MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a				# MAJB
	and	b, y3		# y3 = (a|c)&b				# MAJA
	and	c, T1		# T1 = a&c				# MAJB
	add	y0, y2		# y2 = S1 + CH				# --


	add	h, d		# d = k + w + h + d			# --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)		# MAJ
	add	y1, h		# h = k + w + h + S0			# --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1	# --


	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3, h		# h = t1 + S0 + MAJ			# --

	ROTATE_ARGS

.endm

########################################################################
## void sha256_transform_rorx(struct sha256_state *state, const u8 *data, int blocks)
## arg 1 : pointer to state
## arg 2 : pointer to input data
## arg 3 : Num blocks
########################################################################
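## A minimal C-side sketch of the call (an assumption for illustration:
## the usual glue in arch/x86/crypto/sha256_ssse3_glue.c, names may
## differ by kernel version):
##
##	asmlinkage void sha256_transform_rorx(struct sha256_state *state,
##					      const u8 *data, int blocks);
##	...
##	/* data must contain "blocks" whole 64-byte message blocks */
##	sha256_transform_rorx(state, data, blocks);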
.text
SYM_FUNC_START(sha256_transform_rorx)
.align 32
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	push	%rbp
	mov	%rsp, %rbp

	subq	$STACK_SIZE, %rsp
	and	$-32, %rsp	# align rsp to 32 byte boundary
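	# Note: the 32-byte alignment is required by the aligned vmovdqa
	# stores of XFER into the _XFER area on the stack below.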

	shl	$6, NUM_BLKS	# convert to bytes
	jz	done_hash
	lea	-64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
	mov	NUM_BLKS, _INP_END(%rsp)

	cmp	NUM_BLKS, INP
	je	only_one_block

	## load initial digest
	mov	(CTX), a
	mov	4*1(CTX), b
	mov	4*2(CTX), c
	mov	4*3(CTX), d
	mov	4*4(CTX), e
	mov	4*5(CTX), f
	mov	4*6(CTX), g
	mov	4*7(CTX), h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)

loop0:
	## Load first 16 dwords from two blocks
	VMOVDQ	0*32(INP),XTMP0
	VMOVDQ	1*32(INP),XTMP1
	VMOVDQ	2*32(INP),XTMP2
	VMOVDQ	3*32(INP),XTMP3

	## byte swap data
	vpshufb	BYTE_FLIP_MASK, XTMP0, XTMP0
	vpshufb	BYTE_FLIP_MASK, XTMP1, XTMP1
	vpshufb	BYTE_FLIP_MASK, XTMP2, XTMP2
	vpshufb	BYTE_FLIP_MASK, XTMP3, XTMP3

	## transpose data into high/low halves
	vperm2i128	$0x20, XTMP2, XTMP0, X0
	vperm2i128	$0x31, XTMP2, XTMP0, X1
	vperm2i128	$0x20, XTMP3, XTMP1, X2
	vperm2i128	$0x31, XTMP3, XTMP1, X3
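	## Resulting lane layout (derived from the loads above): the low
	## 128-bit lane of X0..X3 holds W[0..15] of block 1 and the high
	## lane holds W[0..15] of block 2, four dwords per register.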

last_block_enter:
	add	$64, INP
	mov	INP, _INP(%rsp)

	## schedule 48 input dwords, by doing 3 iterations of 16 rounds each
	xor	SRND, SRND

.align 16
loop1:
	vpaddd	K256+0*32(SRND), X0, XFER
	vmovdqa	XFER, 0*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 0*32

	vpaddd	K256+1*32(SRND), X0, XFER
	vmovdqa	XFER, 1*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 1*32

	vpaddd	K256+2*32(SRND), X0, XFER
	vmovdqa	XFER, 2*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 2*32

	vpaddd	K256+3*32(SRND), X0, XFER
	vmovdqa	XFER, 3*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 3*32

	add	$4*32, SRND
	cmp	$3*4*32, SRND
	jb	loop1

loop2:
	## Do last 16 rounds with no scheduling
	vpaddd	K256+0*32(SRND), X0, XFER
	vmovdqa	XFER, 0*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 0*32

	vpaddd	K256+1*32(SRND), X1, XFER
	vmovdqa	XFER, 1*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 1*32
	add	$2*32, SRND

	vmovdqa	X2, X0
	vmovdqa	X3, X1

	cmp	$4*4*32, SRND
	jb	loop2

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP

	addm    (4*0)(CTX),a
	addm    (4*1)(CTX),b
	addm    (4*2)(CTX),c
	addm    (4*3)(CTX),d
	addm    (4*4)(CTX),e
	addm    (4*5)(CTX),f
	addm    (4*6)(CTX),g
	addm    (4*7)(CTX),h

	cmp	_INP_END(%rsp), INP
	ja	done_hash

	#### Do second block using previously scheduled results
	xor	SRND, SRND
.align 16
loop3:
	DO_4ROUNDS	 _XFER + 0*32 + 16
	DO_4ROUNDS	 _XFER + 1*32 + 16
	add	$2*32, SRND
	cmp	$4*4*32, SRND
	jb	loop3

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP
	add	$64, INP

	addm    (4*0)(CTX),a
	addm    (4*1)(CTX),b
	addm    (4*2)(CTX),c
	addm    (4*3)(CTX),d
	addm    (4*4)(CTX),e
	addm    (4*5)(CTX),f
	addm    (4*6)(CTX),g
	addm    (4*7)(CTX),h

	cmp	_INP_END(%rsp), INP
	jb	loop0
	ja	done_hash

do_last_block:
	VMOVDQ	0*16(INP),XWORD0
	VMOVDQ	1*16(INP),XWORD1
	VMOVDQ	2*16(INP),XWORD2
	VMOVDQ	3*16(INP),XWORD3

	vpshufb	X_BYTE_FLIP_MASK, XWORD0, XWORD0
	vpshufb	X_BYTE_FLIP_MASK, XWORD1, XWORD1
	vpshufb	X_BYTE_FLIP_MASK, XWORD2, XWORD2
	vpshufb	X_BYTE_FLIP_MASK, XWORD3, XWORD3

	jmp	last_block_enter

only_one_block:

	## load initial digest
	mov	(4*0)(CTX),a
	mov	(4*1)(CTX),b
	mov	(4*2)(CTX),c
	mov	(4*3)(CTX),d
	mov	(4*4)(CTX),e
	mov	(4*5)(CTX),f
	mov	(4*6)(CTX),g
	mov	(4*7)(CTX),h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)
	jmp	do_last_block

done_hash:

	mov	%rbp, %rsp
	pop	%rbp

	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	ret
SYM_FUNC_END(sha256_transform_rorx)

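# SHA-256 round constants; each group of four K values is stored twice
# so that one 256-bit vpaddd adds the same K[t..t+3] to the low lane
# (block 1) and the high lane (block 2).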
.section	.rodata.cst512.K256, "aM", @progbits, 512
.align 64
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

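# byte-swap mask: used with vpshufb to reverse the byte order within
# each 32-bit word of a lane, converting the big-endian message bytes
# into the little-endian dwords the rounds operate on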
.section	.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203

# shuffle xBxA -> 00BA
.section	.rodata.cst32._SHUF_00BA, "aM", @progbits, 32
.align 32
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100

# shuffle xDxC -> DC00
.section	.rodata.cst32._SHUF_DC00, "aM", @progbits, 32
.align 32
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF