;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
;   * Redistributions of source code must retain the above copyright
;     notice, this list of conditions and the following disclaimer.
;   * Redistributions in binary form must reproduce the above copyright
;     notice, this list of conditions and the following disclaimer in
;     the documentation and/or other materials provided with the
;     distribution.
;   * Neither the name of Intel Corporation nor the names of its
;     contributors may be used to endorse or promote products derived
;     from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%include "sha256_mb_mgr_datastruct.asm"
%include "reg_sizes.asm"

default rel

;; code to compute quad SHA256 using AVX
;; Logic designed/laid out by JDG

; TRANSPOSE r0, r1, r2, r3, t0, t1
; "transpose" data in {r0..r3} using temps {t0 t1}
; Input looks like: {r0 r1 r2 r3}
; r0 = {a3 a2 a1 a0}
; r1 = {b3 b2 b1 b0}
; r2 = {c3 c2 c1 c0}
; r3 = {d3 d2 d1 d0}
;
; output looks like: {t0 r1 r0 r3}
; t0 = {d0 c0 b0 a0}
; r1 = {d1 c1 b1 a1}
; r0 = {d2 c2 b2 a2}
; r3 = {d3 c3 b3 a3}
;
%macro TRANSPOSE 6
%define %%r0 %1
%define %%r1 %2
%define %%r2 %3
%define %%r3 %4
%define %%t0 %5
%define %%t1 %6
        vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0}
        vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2}

        vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d1 d0 c1 c0}
        vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2}

        vshufps %%r1, %%t0, %%t1, 0xDD ; r1 = {d1 c1 b1 a1}

        vshufps %%r3, %%r0, %%r2, 0xDD ; r3 = {d3 c3 b3 a3}

        vshufps %%r0, %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2}
        vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0}
%endmacro
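; For reference only (not part of this file): the macro above behaves like a
; plain 4x4 dword transpose, written here as a scalar C-style sketch in which
; in[r][c] denotes dword c of input register r:
;
;     for (int r = 0; r < 4; r++)
;         for (int c = 0; c < 4; c++)
;             out[c][r] = in[r][c];
;
; with the output rows landing in the rotated register set {t0, r1, r0, r3}
; for columns 0, 1, 2 and 3 respectively.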


%define TABLE K256_4_MB
%define SZ 4
%define SZ4 4*SZ
%define ROUNDS 64*SZ4

%define a xmm0
%define b xmm1
%define c xmm2
%define d xmm3
%define e xmm4
%define f xmm5
%define g xmm6
%define h xmm7

%define a0 xmm8
%define a1 xmm9
%define a2 xmm10

%define TT0 xmm14
%define TT1 xmm13
%define TT2 xmm12
%define TT3 xmm11
%define TT4 xmm10
%define TT5 xmm9

%define T1 xmm14
%define TMP xmm15


%macro ROTATE_ARGS 0
%xdefine TMP_ h
%xdefine h g
%xdefine g f
%xdefine f e
%xdefine e d
%xdefine d c
%xdefine c b
%xdefine b a
%xdefine a TMP_
%endm
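; ROTATE_ARGS moves no data: it re-binds the symbols a..h so that, after each
; round, 'h' names the register that held 'g', 'g' the one that held 'f', and
; so on, with the old 'h' register becoming the new 'a'.  This mirrors the
; scalar SHA-256 formulation in which the eight state words shift down by one
; position per round.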

; PRORD reg, imm, tmp
%macro PRORD 3
%define %%reg %1
%define %%imm %2
%define %%tmp %3
        vpslld %%tmp, %%reg, (32-(%%imm))
        vpsrld %%reg, %%reg, %%imm
        vpor %%reg, %%reg, %%tmp
%endmacro

; non-destructive
; PRORD_nd reg, imm, tmp, src
%macro PRORD_nd 4
%define %%reg %1
%define %%imm %2
%define %%tmp %3
%define %%src %4
        vpslld %%tmp, %%src, (32-(%%imm))
        vpsrld %%reg, %%src, %%imm
        vpor %%reg, %%reg, %%tmp
%endmacro

; PRORD dst/src, amt
%macro PRORD 2
        PRORD %1, %2, TMP
%endmacro

; PRORD_nd dst, src, amt
%macro PRORD_nd 3
        PRORD_nd %1, %3, TMP, %2
%endmacro
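; AVX (prior to AVX-512) has no packed dword rotate, so PRORD synthesizes a
; rotate-right from two shifts and an OR; as a scalar sketch, for reference
; only: ror32(x, n) == (x >> n) | (x << (32 - n)).  The two- and
; three-operand wrappers above simply supply TMP as the scratch register.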

;; arguments passed implicitly in preprocessor symbols i, a...h
%macro ROUND_00_15 2
%define %%T1 %1
%define %%i %2


        PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5)

        vpxor a2, f, g ; ch: a2 = f^g
        vpand a2, e ; ch: a2 = (f^g)&e
        vpxor a2, g ; a2 = ch

        PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25)
        vmovdqa [SZ4*(%%i&0xf) + rsp], %%T1
        vpaddd %%T1, %%T1, [TBL + ROUND] ; T1 = W + K
        vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5)
        PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11)
        vpaddd h, h, a2 ; h = h + ch
        PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11)
        vpaddd h, h, %%T1 ; h = h + ch + W + K
        vpxor a0, a0, a1 ; a0 = sigma1
        PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22)
        vpxor %%T1, a, c ; maj: T1 = a^c
        add ROUND, SZ4 ; ROUND++
        vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b
        vpaddd h, h, a0

        vpaddd d, d, h

        vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11)
        PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13)
        vpxor a2, a2, a1 ; a2 = sig0
        vpand a1, a, c ; maj: a1 = a&c
        vpor a1, a1, %%T1 ; a1 = maj
        vpaddd h, h, a1 ; h = h + ch + W + K + maj
        vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0

        ROTATE_ARGS
%endm
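; For reference only: one SHA-256 round in scalar form, which each vector
; instruction above applies to all four lanes at once:
;
;     T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + W[i]
;     T2 = Sigma0(a) + Maj(a,b,c)
;     h = g; g = f; f = e; e = d + T1; d = c; c = b; b = a; a = T1 + T2
;
; with Sigma1(e) = ror(e,6) ^ ror(e,11) ^ ror(e,25),
;      Sigma0(a) = ror(a,2) ^ ror(a,13) ^ ror(a,22),
;      Ch(e,f,g) = ((f ^ g) & e) ^ g,
;      Maj(a,b,c) = ((a ^ c) & b) | (a & c).
; The macro folds T2 into h as well (h ends up holding T1 + T2, the new a)
; and then relies on ROTATE_ARGS to rename the registers.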


;; arguments passed implicitly in preprocessor symbols i, a...h
%macro ROUND_16_XX 2
%define %%T1 %1
%define %%i %2

        vmovdqa %%T1, [SZ4*((%%i-15)&0xf) + rsp]
        vmovdqa a1, [SZ4*((%%i-2)&0xf) + rsp]
        vmovdqa a0, %%T1
        PRORD %%T1, 18-7
        vmovdqa a2, a1
        PRORD a1, 19-17
        vpxor %%T1, %%T1, a0
        PRORD %%T1, 7
        vpxor a1, a1, a2
        PRORD a1, 17
        vpsrld a0, a0, 3
        vpxor %%T1, %%T1, a0
        vpsrld a2, a2, 10
        vpxor a1, a1, a2
        vpaddd %%T1, %%T1, [SZ4*((%%i-16)&0xf) + rsp]
        vpaddd a1, a1, [SZ4*((%%i-7)&0xf) + rsp]
        vpaddd %%T1, %%T1, a1

        ROUND_00_15 %%T1, %%i
%endm
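; For reference only: the scalar message-schedule recurrence computed above,
; using the 16-entry circular W[] buffer kept on the stack:
;
;     W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]
;
; with sigma0(x) = ror(x,7)  ^ ror(x,18) ^ (x >> 3)
;  and sigma1(x) = ror(x,17) ^ ror(x,19) ^ (x >> 10).
; The expanded word is handed straight to ROUND_00_15, which also stores it
; over the W[i-16] slot that this round has just consumed.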

%define DIGEST_SIZE 8*SZ4
%define DATA 16*SZ4
%define ALIGNMENT 1*8
; ALIGNMENT makes FRAMESZ an odd multiple of 8, so that together with the
; 8-byte return address rsp stays 16-byte aligned for the vmovdqa accesses
%define FRAMESZ (DATA + DIGEST_SIZE + ALIGNMENT)
%define _DIGEST (DATA)
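; Stack frame (relative to rsp after the prologue):
;   [rsp + 0]        DATA        = 16*SZ4 bytes: W[] schedule, 4 lanes wide
;   [rsp + _DIGEST]  DIGEST_SIZE =  8*SZ4 bytes: digest saved at block start
;   followed by ALIGNMENT bytes of padding (see above)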

%define VMOVPS vmovups

%define inp0 r8
%define inp1 r9
%define inp2 r10
%define inp3 r11

%ifidn __OUTPUT_FORMAT__, elf64
; Linux definitions
%define arg1 rdi
%define arg2 rsi
%else
; Windows definitions
%define arg1 rcx
%define arg2 rdx
%endif

; Common definitions
%define IDX rax
%define ROUND rbx
%define TBL r12

;; void sha256_mb_x4_avx(SHA256_MB_ARGS_X8 *args, uint64_t len);
;; arg 1 : arg1 : pointer to args structure (only 4 of the 8 lanes used)
;; arg 2 : arg2 : size of data in blocks (assumed >= 1)
;;
;; Clobbers registers: arg2, rax, rbx, r8-r12, xmm0-xmm15
;;
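;; As used below, the digest area of the args structure is read as eight
;; consecutive 16-byte rows, so a single vmovdqa fetches one state word
;; (a..h) for all four lanes at once.  _data_ptr supplies one input pointer
;; per lane, and every lane is expected to have at least 'len' blocks of
;; data available.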
global sha256_mb_x4_avx:function internal
align 32
sha256_mb_x4_avx:
        sub rsp, FRAMESZ

        ;; Initialize digests
        vmovdqa a,[arg1+0*SZ4]
        vmovdqa b,[arg1+1*SZ4]
        vmovdqa c,[arg1+2*SZ4]
        vmovdqa d,[arg1+3*SZ4]
        vmovdqa e,[arg1+4*SZ4]
        vmovdqa f,[arg1+5*SZ4]
        vmovdqa g,[arg1+6*SZ4]
        vmovdqa h,[arg1+7*SZ4]

        lea TBL,[TABLE]

        ;; transpose input onto stack
        mov inp0,[arg1 + _data_ptr + 0*8]
        mov inp1,[arg1 + _data_ptr + 1*8]
        mov inp2,[arg1 + _data_ptr + 2*8]
        mov inp3,[arg1 + _data_ptr + 3*8]

        xor IDX, IDX
lloop:
        xor ROUND, ROUND

        ;; save old digest
        vmovdqa [rsp + _DIGEST + 0*SZ4], a
        vmovdqa [rsp + _DIGEST + 1*SZ4], b
        vmovdqa [rsp + _DIGEST + 2*SZ4], c
        vmovdqa [rsp + _DIGEST + 3*SZ4], d
        vmovdqa [rsp + _DIGEST + 4*SZ4], e
        vmovdqa [rsp + _DIGEST + 5*SZ4], f
        vmovdqa [rsp + _DIGEST + 6*SZ4], g
        vmovdqa [rsp + _DIGEST + 7*SZ4], h

%assign i 0
%rep 4
        vmovdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK]
        VMOVPS TT2,[inp0+IDX+i*16]
        VMOVPS TT1,[inp1+IDX+i*16]
        VMOVPS TT4,[inp2+IDX+i*16]
        VMOVPS TT3,[inp3+IDX+i*16]
        TRANSPOSE TT2, TT1, TT4, TT3, TT0, TT5
        vpshufb TT0, TT0, TMP
        vpshufb TT1, TT1, TMP
        vpshufb TT2, TT2, TMP
        vpshufb TT3, TT3, TMP
        ROUND_00_15 TT0,(i*4+0)
        ROUND_00_15 TT1,(i*4+1)
        ROUND_00_15 TT2,(i*4+2)
        ROUND_00_15 TT3,(i*4+3)
%assign i (i+1)
%endrep
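        ;; Rounds 0..15 are now done: each iteration above loaded 16 bytes
        ;; per lane, transposed them so one register holds the same dword
        ;; from all four lanes, byte-swapped to big-endian with
        ;; PSHUFFLE_BYTE_FLIP_MASK, and ran four rounds.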
        add IDX, 4*4*4


%assign i (i*4)

        jmp Lrounds_16_xx
align 16
Lrounds_16_xx:
%rep 16
        ROUND_16_XX T1, i
%assign i (i+1)
%endrep

        cmp ROUND,ROUNDS
        jb Lrounds_16_xx
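        ;; ROUND advances by SZ4 per round and ROUNDS = 64*SZ4, so the
        ;; unrolled 16-round body above runs three times here, covering
        ;; rounds 16..63 of the block.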

        ;; add old digest
        vpaddd a, a, [rsp + _DIGEST + 0*SZ4]
        vpaddd b, b, [rsp + _DIGEST + 1*SZ4]
        vpaddd c, c, [rsp + _DIGEST + 2*SZ4]
        vpaddd d, d, [rsp + _DIGEST + 3*SZ4]
        vpaddd e, e, [rsp + _DIGEST + 4*SZ4]
        vpaddd f, f, [rsp + _DIGEST + 5*SZ4]
        vpaddd g, g, [rsp + _DIGEST + 6*SZ4]
        vpaddd h, h, [rsp + _DIGEST + 7*SZ4]


        sub arg2, 1
        jne lloop

        ; write digests out
        vmovdqa [arg1+0*SZ4],a
        vmovdqa [arg1+1*SZ4],b
        vmovdqa [arg1+2*SZ4],c
        vmovdqa [arg1+3*SZ4],d
        vmovdqa [arg1+4*SZ4],e
        vmovdqa [arg1+5*SZ4],f
        vmovdqa [arg1+6*SZ4],g
        vmovdqa [arg1+7*SZ4],h

        ; update input pointers
        add inp0, IDX
        mov [arg1 + _data_ptr + 0*8], inp0
        add inp1, IDX
        mov [arg1 + _data_ptr + 1*8], inp1
        add inp2, IDX
        mov [arg1 + _data_ptr + 2*8], inp2
        add inp3, IDX
        mov [arg1 + _data_ptr + 3*8], inp3

;;;;;;;;;;;;;;;;
;; Postamble

        add rsp, FRAMESZ
        ret

section .data align=64

align 64
TABLE:
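        ; Each 32-bit round constant K[i] is laid out four times in a row
        ; (one dq holds two copies, two dq per constant), giving the 16-byte
        ; lane-replicated value that [TBL + ROUND] feeds to vpaddd each round.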
        dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
        dq 0x7137449171374491, 0x7137449171374491
        dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
        dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
        dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
        dq 0x59f111f159f111f1, 0x59f111f159f111f1
        dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
        dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
        dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
        dq 0x12835b0112835b01, 0x12835b0112835b01
        dq 0x243185be243185be, 0x243185be243185be
        dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
        dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
        dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
        dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
        dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
        dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
        dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
        dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
        dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
        dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
        dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
        dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
        dq 0x76f988da76f988da, 0x76f988da76f988da
        dq 0x983e5152983e5152, 0x983e5152983e5152
        dq 0xa831c66da831c66d, 0xa831c66da831c66d
        dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
        dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
        dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
        dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
        dq 0x06ca635106ca6351, 0x06ca635106ca6351
        dq 0x1429296714292967, 0x1429296714292967
        dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
        dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
        dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
        dq 0x53380d1353380d13, 0x53380d1353380d13
        dq 0x650a7354650a7354, 0x650a7354650a7354
        dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
        dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
        dq 0x92722c8592722c85, 0x92722c8592722c85
        dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
        dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
        dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
        dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
        dq 0xd192e819d192e819, 0xd192e819d192e819
        dq 0xd6990624d6990624, 0xd6990624d6990624
        dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
        dq 0x106aa070106aa070, 0x106aa070106aa070
        dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
        dq 0x1e376c081e376c08, 0x1e376c081e376c08
        dq 0x2748774c2748774c, 0x2748774c2748774c
        dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
        dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
        dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
        dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
        dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
        dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
        dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
        dq 0x84c8781484c87814, 0x84c8781484c87814
        dq 0x8cc702088cc70208, 0x8cc702088cc70208
        dq 0x90befffa90befffa, 0x90befffa90befffa
        dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
        dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
        dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
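        ; The mask above, applied with vpshufb, reverses the bytes within
        ; each dword, converting the little-endian message loads to the
        ; big-endian word order SHA-256 requires.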