;; ceph/src/spdk/intel-ipsec-mb/avx/sha_256_mult_avx.asm
;;
;; Copyright (c) 2012-2018, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;;     * Redistributions of source code must retain the above copyright notice,
;;       this list of conditions and the following disclaimer.
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;     * Neither the name of Intel Corporation nor the names of its contributors
;;       may be used to endorse or promote products derived from this software
;;       without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;

;; code to compute quad SHA256 using AVX
;; outer calling routine takes care of save and restore of XMM registers
;; Logic designed/laid out by JDG

;; Stack must be aligned to 16 bytes before call
;; Windows clobbers:  rax rbx rdx r8 r9 r10 r11 r12
;; Windows preserves: rcx rsi rdi rbp r13 r14 r15
;;
;; Linux clobbers:    rax rbx rsi r8 r9 r10 r11 r12
;; Linux preserves:   rcx rdx rdi rbp r13 r14 r15
;;
;; clobbers xmm0-15
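;;
;; Four independent message lanes are hashed in parallel: each xmm register
;; a..h holds one SHA-256 working variable, with lane N kept in 32-bit
;; element N, so every vector instruction below advances all four digests
;; by one round.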

%include "os.asm"
%include "mb_mgr_datastruct.asm"

extern K256_4

%ifdef LINUX
%define arg1 rdi
%define arg2 rsi
%else
; Windows definitions
%define arg1 rcx
%define arg2 rdx
%endif

; Common definitions
%define STATE    arg1
%define INP_SIZE arg2

%define IDX   rax
%define ROUND rbx
%define TBL   r12

%define inp0 r8
%define inp1 r9
%define inp2 r10
%define inp3 r11

%define a xmm0
%define b xmm1
%define c xmm2
%define d xmm3
%define e xmm4
%define f xmm5
%define g xmm6
%define h xmm7

%define a0 xmm8
%define a1 xmm9
%define a2 xmm10

%define TT0 xmm14
%define TT1 xmm13
%define TT2 xmm12
%define TT3 xmm11
%define TT4 xmm10
%define TT5 xmm9

%define T1  xmm14
%define TMP xmm15

%define SZ4    4*SHA256_DIGEST_WORD_SIZE ; Size of one vector register
%define ROUNDS 64*SZ4

; Define stack usage
struc STACK
_DATA:   resb SZ4 * 16
_DIGEST: resb SZ4 * NUM_SHA256_DIGEST_WORDS
         resb 8 ; for alignment, must be odd multiple of 8
endstruc

%define VMOVPS vmovups

; transpose r0, r1, r2, r3, t0, t1
; "transpose" data in {r0..r3} using temps {t0..t1}
; Input looks like: {r0 r1 r2 r3}
; r0 = {a3 a2 a1 a0}
; r1 = {b3 b2 b1 b0}
; r2 = {c3 c2 c1 c0}
; r3 = {d3 d2 d1 d0}
;
; output looks like: {t0 r1 r0 r3}
; t0 = {d0 c0 b0 a0}
; r1 = {d1 c1 b1 a1}
; r0 = {d2 c2 b2 a2}
; r3 = {d3 c3 b3 a3}
;
%macro TRANSPOSE 6
%define %%r0 %1
%define %%r1 %2
%define %%r2 %3
%define %%r3 %4
%define %%t0 %5
%define %%t1 %6
        vshufps %%t0, %%r0, %%r1, 0x44  ; t0 = {b1 b0 a1 a0}
        vshufps %%r0, %%r0, %%r1, 0xEE  ; r0 = {b3 b2 a3 a2}

        vshufps %%t1, %%r2, %%r3, 0x44  ; t1 = {d1 d0 c1 c0}
        vshufps %%r2, %%r2, %%r3, 0xEE  ; r2 = {d3 d2 c3 c2}

        vshufps %%r1, %%t0, %%t1, 0xDD  ; r1 = {d1 c1 b1 a1}

        vshufps %%r3, %%r0, %%r2, 0xDD  ; r3 = {d3 c3 b3 a3}

        vshufps %%r0, %%r0, %%r2, 0x88  ; r0 = {d2 c2 b2 a2}
        vshufps %%t0, %%t0, %%t1, 0x88  ; t0 = {d0 c0 b0 a0}
%endmacro


%macro ROTATE_ARGS 0
%xdefine TMP_ h
%xdefine h g
%xdefine g f
%xdefine f e
%xdefine e d
%xdefine d c
%xdefine c b
%xdefine b a
%xdefine a TMP_
%endm

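; AVX provides no packed 32-bit rotate instruction, so the PRORD macros
; below synthesize "rotate right by N" from two shifts and an OR:
;   ROTR(x, N) = (x >> N) | (x << (32 - N))
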
; PRORD reg, imm, tmp
%macro PRORD 3
%define %%reg %1
%define %%imm %2
%define %%tmp %3
        vpslld  %%tmp, %%reg, (32-(%%imm))
        vpsrld  %%reg, %%reg, %%imm
        vpor    %%reg, %%reg, %%tmp
%endmacro

; non-destructive
; PRORD_nd reg, imm, tmp, src
%macro PRORD_nd 4
%define %%reg %1
%define %%imm %2
%define %%tmp %3
%define %%src %4
        ;vmovdqa %%tmp, %%reg
        vpslld  %%tmp, %%src, (32-(%%imm))
        vpsrld  %%reg, %%src, %%imm
        vpor    %%reg, %%reg, %%tmp
%endmacro

; PRORD dst/src, amt
%macro PRORD 2
        PRORD   %1, %2, TMP
%endmacro

; PRORD_nd dst, src, amt
%macro PRORD_nd 3
        PRORD_nd %1, %3, TMP, %2
%endmacro

;; arguments passed implicitly in preprocessor symbols i, a...h
%macro ROUND_00_15 2
%define %%T1 %1
%define %%i  %2
        PRORD_nd a0, e, (11-6)          ; sig1: a0 = (e >> 5)

        vpxor   a2, f, g                ; ch: a2 = f^g
        vpand   a2, a2, e               ; ch: a2 = (f^g)&e
        vpxor   a2, a2, g               ; a2 = ch

        PRORD_nd a1, e, 25              ; sig1: a1 = (e >> 25)
        vmovdqa [SZ4*(%%i&0xf) + rsp + _DATA], %%T1
        vpaddd  %%T1, %%T1, [TBL + ROUND] ; T1 = W + K
        vpxor   a0, a0, e               ; sig1: a0 = e ^ (e >> 5)
        PRORD   a0, 6                   ; sig1: a0 = (e >> 6) ^ (e >> 11)
        vpaddd  h, h, a2                ; h = h + ch
        PRORD_nd a2, a, (13-2)          ; sig0: a2 = (a >> 11)
        vpaddd  h, h, %%T1              ; h = h + ch + W + K
        vpxor   a0, a0, a1              ; a0 = sigma1
        PRORD_nd a1, a, 22              ; sig0: a1 = (a >> 22)
        vpxor   %%T1, a, c              ; maj: T1 = a^c
        add     ROUND, SZ4              ; ROUND++
        vpand   %%T1, %%T1, b           ; maj: T1 = (a^c)&b
        vpaddd  h, h, a0

        vpaddd  d, d, h

        vpxor   a2, a2, a               ; sig0: a2 = a ^ (a >> 11)
        PRORD   a2, 2                   ; sig0: a2 = (a >> 2) ^ (a >> 13)
        vpxor   a2, a2, a1              ; a2 = sig0
        vpand   a1, a, c                ; maj: a1 = a&c
        vpor    a1, a1, %%T1            ; a1 = maj
        vpaddd  h, h, a1                ; h = h + ch + W + K + maj
        vpaddd  h, h, a2                ; h = h + ch + W + K + maj + sigma0

        ROTATE_ARGS
%endm
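
;; For reference, each ROUND_00_15 invocation performs one FIPS 180-4 round
;; on all four lanes:
;;   Ch(e,f,g)  = (e & f) ^ (~e & g)           ; built here as ((f ^ g) & e) ^ g
;;   Maj(a,b,c) = (a & b) ^ (a & c) ^ (b & c)  ; built here as ((a ^ c) & b) | (a & c)
;;   Sigma1(e)  = ROTR(e,6) ^ ROTR(e,11) ^ ROTR(e,25)
;;   Sigma0(a)  = ROTR(a,2) ^ ROTR(a,13) ^ ROTR(a,22)
;;   h += Sigma1(e) + Ch(e,f,g) + K[i] + W[i]
;;   d += h
;;   h += Sigma0(a) + Maj(a,b,c)
;; followed by ROTATE_ARGS to rename the working variables.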


;; arguments passed implicitly in preprocessor symbols i, a...h
%macro ROUND_16_XX 2
%define %%T1 %1
%define %%i  %2
        vmovdqa %%T1, [SZ4*((%%i-15)&0xf) + rsp + _DATA]
        vmovdqa a1, [SZ4*((%%i-2)&0xf) + rsp + _DATA]
        vmovdqa a0, %%T1
        PRORD   %%T1, 18-7
        vmovdqa a2, a1
        PRORD   a1, 19-17
        vpxor   %%T1, %%T1, a0
        PRORD   %%T1, 7
        vpxor   a1, a1, a2
        PRORD   a1, 17
        vpsrld  a0, a0, 3
        vpxor   %%T1, %%T1, a0
        vpsrld  a2, a2, 10
        vpxor   a1, a1, a2
        vpaddd  %%T1, %%T1, [SZ4*((%%i-16)&0xf) + rsp + _DATA]
        vpaddd  a1, a1, [SZ4*((%%i-7)&0xf) + rsp + _DATA]
        vpaddd  %%T1, %%T1, a1

        ROUND_00_15 %%T1, %%i
%endm
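
;; Rounds 16..63 first extend the message schedule in place on the stack:
;;   sigma0(x) = ROTR(x,7) ^ ROTR(x,18) ^ (x >> 3)
;;   sigma1(x) = ROTR(x,17) ^ ROTR(x,19) ^ (x >> 10)
;;   W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]
;; then feed the new W[i] into ROUND_00_15.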

section .data
default rel
align 16
PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203
        dq 0x0405060700010203, 0x0c0d0e0f08090a0b
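;; The mask above, used with vpshufb, reverses the bytes within each 32-bit
;; word, converting the little-endian message loads into the big-endian
;; word order that SHA-256 operates on.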

section .text

;; SHA256_ARGS:
;;   UINT128 digest[8];  // transposed digests
;;   UINT8  *data_ptr[4];
;;

;; void sha_256_mult_avx(SHA256_ARGS *args, UINT64 num_blocks);
;; arg 1 : STATE    : pointer to args
;; arg 2 : INP_SIZE : size of data in blocks (assumed >= 1)
;;
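;; digest[] is stored transposed: row i holds working variable i (a..h) for
;; all four lanes, one lane per dword. On return each lane's data_ptr has
;; been advanced by the 64*num_blocks bytes consumed.
;;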
MKGLOBAL(sha_256_mult_avx,function,internal)
align 16
sha_256_mult_avx:
        ; general registers preserved in outer calling routine
        ; outer calling routine saves all the XMM registers
        sub     rsp, STACK_size

        ;; Load the pre-transposed incoming digest.
        vmovdqa a, [STATE + 0*SHA256_DIGEST_ROW_SIZE]
        vmovdqa b, [STATE + 1*SHA256_DIGEST_ROW_SIZE]
        vmovdqa c, [STATE + 2*SHA256_DIGEST_ROW_SIZE]
        vmovdqa d, [STATE + 3*SHA256_DIGEST_ROW_SIZE]
        vmovdqa e, [STATE + 4*SHA256_DIGEST_ROW_SIZE]
        vmovdqa f, [STATE + 5*SHA256_DIGEST_ROW_SIZE]
        vmovdqa g, [STATE + 6*SHA256_DIGEST_ROW_SIZE]
        vmovdqa h, [STATE + 7*SHA256_DIGEST_ROW_SIZE]

        lea     TBL, [rel K256_4]

        ;; load the address of each of the 4 message lanes
        ;; getting ready to transpose input onto stack
        mov     inp0, [STATE + _data_ptr_sha256 + 0*PTR_SZ]
        mov     inp1, [STATE + _data_ptr_sha256 + 1*PTR_SZ]
        mov     inp2, [STATE + _data_ptr_sha256 + 2*PTR_SZ]
        mov     inp3, [STATE + _data_ptr_sha256 + 3*PTR_SZ]

        xor     IDX, IDX
lloop:
        xor     ROUND, ROUND

        ;; save old digest
        vmovdqa [rsp + _DIGEST + 0*SZ4], a
        vmovdqa [rsp + _DIGEST + 1*SZ4], b
        vmovdqa [rsp + _DIGEST + 2*SZ4], c
        vmovdqa [rsp + _DIGEST + 3*SZ4], d
        vmovdqa [rsp + _DIGEST + 4*SZ4], e
        vmovdqa [rsp + _DIGEST + 5*SZ4], f
        vmovdqa [rsp + _DIGEST + 6*SZ4], g
        vmovdqa [rsp + _DIGEST + 7*SZ4], h

%assign i 0
%rep 4
        vmovdqa TMP, [rel PSHUFFLE_BYTE_FLIP_MASK]
        VMOVPS  TT2, [inp0+IDX+i*16]
        VMOVPS  TT1, [inp1+IDX+i*16]
        VMOVPS  TT4, [inp2+IDX+i*16]
        VMOVPS  TT3, [inp3+IDX+i*16]
        TRANSPOSE TT2, TT1, TT4, TT3, TT0, TT5
        vpshufb TT0, TT0, TMP
        vpshufb TT1, TT1, TMP
        vpshufb TT2, TT2, TMP
        vpshufb TT3, TT3, TMP
        ROUND_00_15 TT0, (i*4+0)
        ROUND_00_15 TT1, (i*4+1)
        ROUND_00_15 TT2, (i*4+2)
        ROUND_00_15 TT3, (i*4+3)
%assign i (i+1)
%endrep
        add     IDX, 4*4*4      ; advance by one 64-byte block per lane

%assign i (i*4)

        jmp     Lrounds_16_xx
align 16
Lrounds_16_xx:
%rep 16
        ROUND_16_XX T1, i
%assign i (i+1)
%endrep

        cmp     ROUND, ROUNDS
        jb      Lrounds_16_xx

        ;; add old digest
        vpaddd  a, a, [rsp + _DIGEST + 0*SZ4]
        vpaddd  b, b, [rsp + _DIGEST + 1*SZ4]
        vpaddd  c, c, [rsp + _DIGEST + 2*SZ4]
        vpaddd  d, d, [rsp + _DIGEST + 3*SZ4]
        vpaddd  e, e, [rsp + _DIGEST + 4*SZ4]
        vpaddd  f, f, [rsp + _DIGEST + 5*SZ4]
        vpaddd  g, g, [rsp + _DIGEST + 6*SZ4]
        vpaddd  h, h, [rsp + _DIGEST + 7*SZ4]

        sub     INP_SIZE, 1     ;; unit is blocks
        jne     lloop

        ; write back to memory (state object) the transposed digest
        vmovdqa [STATE + 0*SHA256_DIGEST_ROW_SIZE], a
        vmovdqa [STATE + 1*SHA256_DIGEST_ROW_SIZE], b
        vmovdqa [STATE + 2*SHA256_DIGEST_ROW_SIZE], c
        vmovdqa [STATE + 3*SHA256_DIGEST_ROW_SIZE], d
        vmovdqa [STATE + 4*SHA256_DIGEST_ROW_SIZE], e
        vmovdqa [STATE + 5*SHA256_DIGEST_ROW_SIZE], f
        vmovdqa [STATE + 6*SHA256_DIGEST_ROW_SIZE], g
        vmovdqa [STATE + 7*SHA256_DIGEST_ROW_SIZE], h

        ; update input pointers
        add     inp0, IDX
        mov     [STATE + _data_ptr_sha256 + 0*8], inp0
        add     inp1, IDX
        mov     [STATE + _data_ptr_sha256 + 1*8], inp1
        add     inp2, IDX
        mov     [STATE + _data_ptr_sha256 + 2*8], inp2
        add     inp3, IDX
        mov     [STATE + _data_ptr_sha256 + 3*8], inp3

;;;;;;;;;;;;;;;;
;; Postamble

        add     rsp, STACK_size
        ; outer calling routine restores XMM and other GP registers
        ret

%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif