;;
;; Copyright (c) 2012-2018, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;;     * Redistributions of source code must retain the above copyright notice,
;;       this list of conditions and the following disclaimer.
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;     * Neither the name of Intel Corporation nor the names of its contributors
;;       may be used to endorse or promote products derived from this software
;;       without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;

;; code to compute SHA512 by-2 using AVX
;; outer calling routine takes care of save and restore of XMM registers
;; Logic designed/laid out by JDG

;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15
;; Stack must be aligned to 16 bytes before call
;; Windows clobbers:  rax rdx r8 r9 r10 r11
;; Windows preserves: rbx rcx rsi rdi rbp r12 r13 r14 r15
;;
;; Linux clobbers:    rax rsi r8 r9 r10 r11
;; Linux preserves:   rbx rcx rdx rdi rbp r12 r13 r14 r15
;;
;; clobbers xmm0-15
%include "os.asm"
%include "mb_mgr_datastruct.asm"
extern K512_2

section .data
default rel

align 32
; one from sha512_rorx
; this does the big endian to little endian conversion
; over a quad word
PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x08090a0b0c0d0e0f0001020304050607
        dq 0x0001020304050607, 0x08090a0b0c0d0e0f
        ;ddq 0x18191a1b1c1d1e1f1011121314151617
        dq 0x1011121314151617, 0x18191a1b1c1d1e1f
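; with vpshufb, each destination byte i is taken from source byte mask[i], so a
; quadword loaded as bytes {b7..b0} is written back as {b0..b7}: a byte swap
; within each 64-bit lane.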

section .text

%ifdef LINUX ; Linux definitions
%define arg1    rdi
%define arg2    rsi
%else ; Windows definitions
%define arg1    rcx
%define arg2    rdx
%endif

; Common definitions
%define STATE    arg1
%define INP_SIZE arg2

%define IDX     rax
%define ROUND   r8
%define TBL     r11

%define inp0    r9
%define inp1    r10

%define a xmm0
%define b xmm1
%define c xmm2
%define d xmm3
%define e xmm4
%define f xmm5
%define g xmm6
%define h xmm7

%define a0 xmm8
%define a1 xmm9
%define a2 xmm10

%define TT0 xmm14
%define TT1 xmm13
%define TT2 xmm12
%define TT3 xmm11
%define TT4 xmm10
%define TT5 xmm9

%define T1  xmm14
%define TMP xmm15


%define SZ2 2*SHA512_DIGEST_WORD_SIZE   ; Size of one vector register
%define ROUNDS 80*SZ2

; Define stack usage

struc STACK
_DATA:          resb    SZ2 * 16
_DIGEST:        resb    SZ2 * NUM_SHA512_DIGEST_WORDS
                resb    8       ; for alignment, must be odd multiple of 8
endstruc
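
;; _DATA holds the 16-entry message schedule for the current block (one SZ2-wide
;; slot per schedule word, with both lanes side by side); _DIGEST keeps a copy of
;; the digest on entry to the block so it can be added back at the end.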

%define VMOVPD  vmovupd

; transpose r0, r1, t0
; Input looks like {r0 r1}
; r0 = {a1 a0}
; r1 = {b1 b0}
;
; output looks like
; r0 = {b0, a0}
; t0 = {b1, a1}

%macro TRANSPOSE 3
%define %%r0 %1
%define %%r1 %2
%define %%t0 %3
        vshufpd %%t0, %%r0, %%r1, 11b   ; t0 = b1 a1
        vshufpd %%r0, %%r0, %%r1, 00b   ; r0 = b0 a0
%endm
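
;; In this file TRANSPOSE is applied to 16-byte chunks loaded from the two input
;; lanes, so that afterwards each XMM register holds the same 64-bit message word
;; from both lanes, matching the transposed digest layout.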


%macro ROTATE_ARGS 0
%xdefine TMP_ h
%xdefine h g
%xdefine g f
%xdefine f e
%xdefine e d
%xdefine d c
%xdefine c b
%xdefine b a
%xdefine a TMP_
%endm

; PRORQ reg, imm, tmp
; packed-rotate-right-double
; does a rotate by doing two shifts and an or
%macro PRORQ 3
%define %%reg %1
%define %%imm %2
%define %%tmp %3
        vpsllq  %%tmp, %%reg, (64-(%%imm))
        vpsrlq  %%reg, %%reg, %%imm
        vpor    %%reg, %%reg, %%tmp
%endmacro

; non-destructive
; PRORQ_nd reg, imm, tmp, src
%macro PRORQ_nd 4
%define %%reg %1
%define %%imm %2
%define %%tmp %3
%define %%src %4
        vpsllq  %%tmp, %%src, (64-(%%imm))
        vpsrlq  %%reg, %%src, %%imm
        vpor    %%reg, %%reg, %%tmp
%endmacro

; PRORQ dst/src, amt
%macro PRORQ 2
        PRORQ   %1, %2, TMP
%endmacro

; PRORQ_nd dst, src, amt
%macro PRORQ_nd 3
        PRORQ_nd %1, %3, TMP, %2
%endmacro
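
;; PRORQ/PRORQ_nd provide the 64-bit right-rotates used by the SHA-512 sigma
;; functions (FIPS 180-4):
;;   Sigma0(a) = ROTR(a,28) ^ ROTR(a,34) ^ ROTR(a,39)
;;   Sigma1(e) = ROTR(e,14) ^ ROTR(e,18) ^ ROTR(e,41)
;;   sigma0(w) = ROTR(w, 1) ^ ROTR(w, 8) ^ SHR(w,7)
;;   sigma1(w) = ROTR(w,19) ^ ROTR(w,61) ^ SHR(w,6)
;; The rounds below build these incrementally, rotating by the differences
;; between the amounts (e.g. 18-14, 34-28) so intermediate results are shared.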



;; arguments passed implicitly in preprocessor symbols i, a...h
%macro ROUND_00_15 2
%define %%T1 %1
%define %%i  %2
        PRORQ_nd a0, e, (18-14)         ; sig1: a0 = (e >> 4)

        vpxor   a2, f, g                ; ch: a2 = f^g
        vpand   a2, a2, e               ; ch: a2 = (f^g)&e
        vpxor   a2, a2, g               ; a2 = ch

        PRORQ_nd a1, e, 41              ; sig1: a1 = (e >> 41)
        vmovdqa [SZ2*(%%i&0xf) + rsp + _DATA], %%T1
        vpaddq  %%T1, %%T1, [TBL + ROUND]       ; T1 = W + K
        vpxor   a0, a0, e               ; sig1: a0 = e ^ (e >> 4)
        PRORQ   a0, 14                  ; sig1: a0 = (e >> 14) ^ (e >> 18)
        vpaddq  h, h, a2                ; h = h + ch
        PRORQ_nd a2, a, (34-28)         ; sig0: a2 = (a >> 6)
        vpaddq  h, h, %%T1              ; h = h + ch + W + K
        vpxor   a0, a0, a1              ; a0 = sigma1
        vmovdqa %%T1, a                 ; maj: T1 = a
        PRORQ_nd a1, a, 39              ; sig0: a1 = (a >> 39)
        vpxor   %%T1, %%T1, c           ; maj: T1 = a^c
        add     ROUND, SZ2              ; ROUND++
        vpand   %%T1, %%T1, b           ; maj: T1 = (a^c)&b
        vpaddq  h, h, a0                ; h = h + ch + W + K + sigma1

        vpaddq  d, d, h                 ; d = d + h

        vpxor   a2, a2, a               ; sig0: a2 = a ^ (a >> 6)
        PRORQ   a2, 28                  ; sig0: a2 = (a >> 28) ^ (a >> 34)
        vpxor   a2, a2, a1              ; a2 = sig0
        vpand   a1, a, c                ; maj: a1 = a&c
        vpor    a1, a1, %%T1            ; a1 = maj
        vpaddq  h, h, a1                ; h = h + ch + W + K + sigma1 + maj
        vpaddq  h, h, a2                ; h = h + ch + W + K + sigma1 + maj + sigma0
        ROTATE_ARGS

%endm
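
;; Each ROUND_00_15 invocation performs one standard SHA-512 round for both lanes:
;;   T1 = h + Sigma1(e) + Ch(e,f,g) + K[t] + W[t]
;;   T2 = Sigma0(a) + Maj(a,b,c)
;;   d += T1, h = T1 + T2
;; followed by ROTATE_ARGS to rename the working variables for the next round.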


;; arguments passed implicitly in preprocessor symbols i, a...h
%macro ROUND_16_XX 2
%define %%T1 %1
%define %%i  %2
        vmovdqa %%T1, [SZ2*((%%i-15)&0xf) + rsp + _DATA]
        vmovdqa a1, [SZ2*((%%i-2)&0xf) + rsp + _DATA]
        vmovdqa a0, %%T1
        PRORQ   %%T1, 8-1
        vmovdqa a2, a1
        PRORQ   a1, 61-19
        vpxor   %%T1, %%T1, a0
        PRORQ   %%T1, 1
        vpxor   a1, a1, a2
        PRORQ   a1, 19
        vpsrlq  a0, a0, 7
        vpxor   %%T1, %%T1, a0
        vpsrlq  a2, a2, 6
        vpxor   a1, a1, a2
        vpaddq  %%T1, %%T1, [SZ2*((%%i-16)&0xf) + rsp + _DATA]
        vpaddq  a1, a1, [SZ2*((%%i-7)&0xf) + rsp + _DATA]
        vpaddq  %%T1, %%T1, a1

        ROUND_00_15 %%T1, %%i

%endm
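
;; ROUND_16_XX first extends the message schedule in place:
;;   W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]
;; (the &0xf indexing treats _DATA as a 16-entry circular buffer), then feeds
;; the new W[t] into ROUND_00_15.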



;; SHA512_ARGS:
;;   UINT128 digest[8];  // transposed digests
;;   UINT8  *data_ptr[2];
;;

;; void sha512_x2_avx(SHA512_ARGS *args, UINT64 msg_size_in_blocks)
;; arg 1 : STATE    : pointer to args
;; arg 2 : INP_SIZE : size of data in blocks (assumed >= 1)
;;
MKGLOBAL(sha512_x2_avx,function,internal)
align 32
sha512_x2_avx:
        ; general registers preserved in outer calling routine
        ; outer calling routine saves all the XMM registers

        sub     rsp, STACK_size

        ;; Load the pre-transposed incoming digest.
        vmovdqa a, [STATE + 0*SHA512_DIGEST_ROW_SIZE]
        vmovdqa b, [STATE + 1*SHA512_DIGEST_ROW_SIZE]
        vmovdqa c, [STATE + 2*SHA512_DIGEST_ROW_SIZE]
        vmovdqa d, [STATE + 3*SHA512_DIGEST_ROW_SIZE]
        vmovdqa e, [STATE + 4*SHA512_DIGEST_ROW_SIZE]
        vmovdqa f, [STATE + 5*SHA512_DIGEST_ROW_SIZE]
        vmovdqa g, [STATE + 6*SHA512_DIGEST_ROW_SIZE]
        vmovdqa h, [STATE + 7*SHA512_DIGEST_ROW_SIZE]

        lea     TBL, [rel K512_2]

        ;; load the address of each of the 2 message lanes
        ;; getting ready to transpose input onto stack
        mov     inp0, [STATE + _data_ptr_sha512 + 0*PTR_SZ]
        mov     inp1, [STATE + _data_ptr_sha512 + 1*PTR_SZ]

        xor     IDX, IDX        ; IDX = byte offset into each lane's data
lloop:

        xor     ROUND, ROUND

        ;; save old digest
        vmovdqa [rsp + _DIGEST + 0*SZ2], a
        vmovdqa [rsp + _DIGEST + 1*SZ2], b
        vmovdqa [rsp + _DIGEST + 2*SZ2], c
        vmovdqa [rsp + _DIGEST + 3*SZ2], d
        vmovdqa [rsp + _DIGEST + 4*SZ2], e
        vmovdqa [rsp + _DIGEST + 5*SZ2], f
        vmovdqa [rsp + _DIGEST + 6*SZ2], g
        vmovdqa [rsp + _DIGEST + 7*SZ2], h

%assign i 0
%rep 8
        ;; load up the shuffler for little-endian to big-endian format
        vmovdqa TMP, [rel PSHUFFLE_BYTE_FLIP_MASK]
        VMOVPD  TT0, [inp0+IDX+i*16]    ;; double precision is 64 bits
        VMOVPD  TT2, [inp1+IDX+i*16]

        TRANSPOSE TT0, TT2, TT1
        vpshufb TT0, TT0, TMP
        vpshufb TT1, TT1, TMP

        ROUND_00_15 TT0, (i*2+0)
        ROUND_00_15 TT1, (i*2+1)
%assign i (i+1)
%endrep

        ;; Increment IDX by message block size == 8 (loop) * 16 (XMM width in bytes)
        add     IDX, 8 * 16

%assign i (i*4) ; i = 32; ROUND_16_XX only uses i modulo 16, so this behaves the same as i = 16

        jmp     Lrounds_16_xx
align 16
Lrounds_16_xx:
%rep 16
        ROUND_16_XX T1, i
%assign i (i+1)
%endrep

        cmp     ROUND, ROUNDS
        jb      Lrounds_16_xx

        ;; add old digest
        vpaddq  a, a, [rsp + _DIGEST + 0*SZ2]
        vpaddq  b, b, [rsp + _DIGEST + 1*SZ2]
        vpaddq  c, c, [rsp + _DIGEST + 2*SZ2]
        vpaddq  d, d, [rsp + _DIGEST + 3*SZ2]
        vpaddq  e, e, [rsp + _DIGEST + 4*SZ2]
        vpaddq  f, f, [rsp + _DIGEST + 5*SZ2]
        vpaddq  g, g, [rsp + _DIGEST + 6*SZ2]
        vpaddq  h, h, [rsp + _DIGEST + 7*SZ2]

        sub     INP_SIZE, 1     ;; consumed one message block
        jne     lloop

        ; write back to memory (state object) the transposed digest
        vmovdqa [STATE + 0*SHA512_DIGEST_ROW_SIZE], a
        vmovdqa [STATE + 1*SHA512_DIGEST_ROW_SIZE], b
        vmovdqa [STATE + 2*SHA512_DIGEST_ROW_SIZE], c
        vmovdqa [STATE + 3*SHA512_DIGEST_ROW_SIZE], d
        vmovdqa [STATE + 4*SHA512_DIGEST_ROW_SIZE], e
        vmovdqa [STATE + 5*SHA512_DIGEST_ROW_SIZE], f
        vmovdqa [STATE + 6*SHA512_DIGEST_ROW_SIZE], g
        vmovdqa [STATE + 7*SHA512_DIGEST_ROW_SIZE], h

        ; update input pointers
        add     inp0, IDX
        mov     [STATE + _data_ptr_sha512 + 0*PTR_SZ], inp0
        add     inp1, IDX
        mov     [STATE + _data_ptr_sha512 + 1*PTR_SZ], inp1

        ;;;;;;;;;;;;;;;;
        ;; Postamble

        add     rsp, STACK_size

        ; outer calling routine restores XMM and other GP registers
        ret

%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif