;;
;; Copyright (c) 2012-2018, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;;     * Redistributions of source code must retain the above copyright notice,
;;       this list of conditions and the following disclaimer.
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;     * Neither the name of Intel Corporation nor the names of its contributors
;;       may be used to endorse or promote products derived from this software
;;       without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;

%include "include/os.asm"
%include "mb_mgr_datastruct.asm"

section .data
default rel

align 16
PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203
        dq 0x0405060700010203, 0x0c0d0e0f08090a0b
K00_19:                  ;ddq 0x5A8279995A8279995A8279995A827999
        dq 0x5A8279995A827999, 0x5A8279995A827999
K20_39:                  ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
        dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
K40_59:                  ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
        dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
K60_79:                  ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
        dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6

section .text

;; code to compute quad SHA1 using AVX
;; derived from ...\sha1_multiple\sha1_quad4.asm
;; variation of sha1_mult2.asm : clobbers all xmm regs, rcx left intact
;; rbx, rsi, rdi, rbp, r12-r15 left intact
;; This version is not safe to call from C/C++

;; Stack must be aligned to 16 bytes before call
;; Windows clobbers:  rax rdx         r8 r9 r10 r11
;; Windows preserves: rbx rcx rsi rdi rbp r12 r13 r14 r15
;;
;; Linux clobbers:    rax rsi         r8 r9 r10 r11
;; Linux preserves:   rbx rcx rdx rdi rbp r12 r13 r14 r15
;;
;; clobbers xmm0-15

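;; "Quad" means four independent SHA-1 computations run in parallel: each of
;; the four 32-bit lanes of an XMM register carries one lane's state word
;; (A..E) or message word, so every vector instruction below advances all
;; four digests at once.
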
; TRANSPOSE r0, r1, r2, r3, t0, t1
; "transpose" data in {r0..r3} using temps {t0, t1}
; Input looks like: {r0 r1 r2 r3}
; r0 = {a3 a2 a1 a0}
; r1 = {b3 b2 b1 b0}
; r2 = {c3 c2 c1 c0}
; r3 = {d3 d2 d1 d0}
;
; output looks like: {t0 r1 r0 r3}
; t0 = {d0 c0 b0 a0}
; r1 = {d1 c1 b1 a1}
; r0 = {d2 c2 b2 a2}
; r3 = {d3 c3 b3 a3}
;
%macro TRANSPOSE 6
%define %%r0 %1
%define %%r1 %2
%define %%r2 %3
%define %%r3 %4
%define %%t0 %5
%define %%t1 %6
        vshufps %%t0, %%r0, %%r1, 0x44  ; t0 = {b1 b0 a1 a0}
        vshufps %%r0, %%r0, %%r1, 0xEE  ; r0 = {b3 b2 a3 a2}

        vshufps %%t1, %%r2, %%r3, 0x44  ; t1 = {d1 d0 c1 c0}
        vshufps %%r2, %%r2, %%r3, 0xEE  ; r2 = {d3 d2 c3 c2}

        vshufps %%r1, %%t0, %%t1, 0xDD  ; r1 = {d1 c1 b1 a1}

        vshufps %%r3, %%r0, %%r2, 0xDD  ; r3 = {d3 c3 b3 a3}

        vshufps %%r0, %%r0, %%r2, 0x88  ; r0 = {d2 c2 b2 a2}
        vshufps %%t0, %%t0, %%t1, 0x88  ; t0 = {d0 c0 b0 a0}
%endmacro
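;; Note on the shuffle immediates above: vshufps places two dwords selected
;; from its first source in the low half of the result and two from its
;; second source in the high half. 0x44/0xEE gather the low/high dword pairs
;; of each input, and 0x88/0xDD then pick the even/odd dwords, so the eight
;; shuffles implement a 4x4 dword transpose.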
;;
;; Magic functions defined in FIPS 180-1
;;
; macro MAGIC_F0 F,B,C,D,T   ;; F = (D ^ (B & (C ^ D)))
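; (this is the "Ch" choice function (B & C) | (~B & D) rewritten so no
;  NOT/ANDN is needed: where a bit of B is 1 the result takes C's bit,
;  otherwise D's bit)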
%macro MAGIC_F0 5
%define %%regF %1
%define %%regB %2
%define %%regC %3
%define %%regD %4
%define %%regT %5
        vpxor %%regF, %%regC, %%regD
        vpand %%regF, %%regF, %%regB
        vpxor %%regF, %%regF, %%regD
%endmacro

; macro MAGIC_F1 F,B,C,D,T   ;; F = (B ^ C ^ D)
%macro MAGIC_F1 5
%define %%regF %1
%define %%regB %2
%define %%regC %3
%define %%regD %4
%define %%regT %5
        vpxor %%regF, %%regD, %%regC
        vpxor %%regF, %%regF, %%regB
%endmacro

; macro MAGIC_F2 F,B,C,D,T   ;; F = ((B & C) | (B & D) | (C & D))
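; (the majority function, computed below in the equivalent four-instruction
;  form ((B | C) & D) | (B & C))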
%macro MAGIC_F2 5
%define %%regF %1
%define %%regB %2
%define %%regC %3
%define %%regD %4
%define %%regT %5
        vpor  %%regF, %%regB, %%regC
        vpand %%regT, %%regB, %%regC
        vpand %%regF, %%regF, %%regD
        vpor  %%regF, %%regF, %%regT
%endmacro

; macro MAGIC_F3 F,B,C,D,T   ;; F = (B ^ C ^ D)
%macro MAGIC_F3 5
%define %%regF %1
%define %%regB %2
%define %%regC %3
%define %%regD %4
%define %%regT %5
        MAGIC_F1 %%regF, %%regB, %%regC, %%regD, %%regT
%endmacro

; PROLD reg, imm, tmp
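; (rotate each 32-bit lane left by %%imm; AVX has no packed dword rotate
;  instruction, so it is built from two shifts and an OR)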
%macro PROLD 3
%define %%reg %1
%define %%imm %2
%define %%tmp %3
        vpsrld %%tmp, %%reg, (32-(%%imm))
        vpslld %%reg, %%reg, %%imm
        vpor   %%reg, %%reg, %%tmp
%endmacro

; non-destructive
; PROLD_nd reg, imm, tmp, src
%macro PROLD_nd 4
%define %%reg %1
%define %%imm %2
%define %%tmp %3
%define %%src %4
        vpsrld %%tmp, %%src, (32-(%%imm))
        vpslld %%reg, %%src, %%imm
        vpor   %%reg, %%reg, %%tmp
%endmacro

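;; One SHA-1 round for message words 0..15 (W[t] is read directly from the
;; transposed block on the stack):
;;   E += K + W[t] + ROL(A,5) + MAGIC_F(B,C,D);  B = ROL(B,30)
;; The caller then renames A..E with ROTATE_ARGS, so the updated E becomes
;; the next round's A.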
%macro SHA1_STEP_00_15 10
%define %%regA   %1
%define %%regB   %2
%define %%regC   %3
%define %%regD   %4
%define %%regE   %5
%define %%regT   %6
%define %%regF   %7
%define %%memW   %8
%define %%immCNT %9
%define %%MAGIC  %10
        vpaddd  %%regE, %%regE, %%immCNT
        vpaddd  %%regE, %%regE, [rsp + (%%memW * 16)]
        PROLD_nd %%regT, 5, %%regF, %%regA
        vpaddd  %%regE, %%regE, %%regT
        %%MAGIC %%regF, %%regB, %%regC, %%regD, %%regT    ;; FUN = MAGIC_Fi(B,C,D)
        PROLD   %%regB, 30, %%regT
        vpaddd  %%regE, %%regE, %%regF
%endmacro

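;; Rounds 16..79 also extend the message schedule in place:
;;   W[t] = ROL(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1)
;; The 16 most recent W values live in a circular buffer on the stack
;; (indexed by %%memW & 15); registers W16 and W15 carry W[t-16] and W[t-15]
;; across rounds, W14 is refilled from the stack each round, and ROTATE_W
;; renames the three after every step.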
%macro SHA1_STEP_16_79 10
%define %%regA   %1
%define %%regB   %2
%define %%regC   %3
%define %%regD   %4
%define %%regE   %5
%define %%regT   %6
%define %%regF   %7
%define %%memW   %8
%define %%immCNT %9
%define %%MAGIC  %10
        vpaddd  %%regE, %%regE, %%immCNT

        vmovdqa W14, [rsp + ((%%memW - 14) & 15) * 16]
        vpxor   W16, W16, W14
        vpxor   W16, W16, [rsp + ((%%memW - 8) & 15) * 16]
        vpxor   W16, W16, [rsp + ((%%memW - 3) & 15) * 16]

        vpsrld  %%regF, W16, (32-1)
        vpslld  W16, W16, 1
        vpor    %%regF, %%regF, W16
        ROTATE_W

        vmovdqa [rsp + ((%%memW - 0) & 15) * 16], %%regF
        vpaddd  %%regE, %%regE, %%regF

        PROLD_nd %%regT, 5, %%regF, %%regA
        vpaddd  %%regE, %%regE, %%regT
        %%MAGIC %%regF, %%regB, %%regC, %%regD, %%regT    ;; FUN = MAGIC_Fi(B,C,D)
        PROLD   %%regB, 30, %%regT
        vpaddd  %%regE, %%regE, %%regF
%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; FRAMESZ must be an odd multiple of 8: 16*16 bytes hold the W[0..15]
;; circular buffer, and the extra 8 bytes restore 16-byte stack alignment
;; (the caller's rsp is 16-byte aligned, so it is off by 8 after the call
;; pushes the return address), which the aligned vmovdqa accesses require.
%define FRAMESZ 16*16 + 8

%define VMOVPS vmovdqu

%ifdef LINUX
%define arg1 rdi
%define arg2 rsi
%else
%define arg1 rcx
%define arg2 rdx
%endif

%define inp0 r8
%define inp1 r9
%define inp2 r10
%define inp3 r11

%define IDX rax

%define A   xmm0
%define B   xmm1
%define C   xmm2
%define D   xmm3
%define E   xmm4
%define F   xmm5 ; tmp
%define G   xmm6 ; tmp

%define TMP G
%define FUN F
%define K   xmm7

%define AA  xmm8
%define BB  xmm9
%define CC  xmm10
%define DD  xmm11
%define EE  xmm12

%define T0  xmm6
%define T1  xmm7
%define T2  xmm8
%define T3  xmm9
%define T4  xmm10
%define T5  xmm11

%define W14 xmm13
%define W15 xmm14
%define W16 xmm15

%macro ROTATE_ARGS 0
%xdefine TMP_ E
%xdefine E D
%xdefine D C
%xdefine C B
%xdefine B A
%xdefine A TMP_
%endm

%macro ROTATE_W 0
%xdefine TMP_ W16
%xdefine W16 W15
%xdefine W15 W14
%xdefine W14 TMP_
%endm

align 32

; XMM registers are clobbered. Saving/restoring must be done at a higher level

; void sha1_mult_avx(SHA1_ARGS *args, UINT32 size_in_blocks);
; arg 1 : rcx (Windows) / rdi (Linux) : pointer to args
; arg 2 : rdx (Windows) / rsi (Linux) : size (in blocks) ;; assumed to be >= 1
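; The digest in SHA1_ARGS is stored row-major by state word: row 0 holds the
; lanes' A words, row 1 the B words, and so on (rows are SHA1_DIGEST_ROW_SIZE
; bytes apart), so one 16-byte vmovdqa loads or stores a given state word for
; the four lanes handled here.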
MKGLOBAL(sha1_mult_avx,function,internal)
sha1_mult_avx:

        sub     rsp, FRAMESZ

        ;; Initialize digests
        vmovdqa A, [arg1 + 0*SHA1_DIGEST_ROW_SIZE]
        vmovdqa B, [arg1 + 1*SHA1_DIGEST_ROW_SIZE]
        vmovdqa C, [arg1 + 2*SHA1_DIGEST_ROW_SIZE]
        vmovdqa D, [arg1 + 3*SHA1_DIGEST_ROW_SIZE]
        vmovdqa E, [arg1 + 4*SHA1_DIGEST_ROW_SIZE]

        ;; transpose input onto stack
        mov     inp0, [arg1 + _data_ptr_sha1 + 0*PTR_SZ]
        mov     inp1, [arg1 + _data_ptr_sha1 + 1*PTR_SZ]
        mov     inp2, [arg1 + _data_ptr_sha1 + 2*PTR_SZ]
        mov     inp3, [arg1 + _data_ptr_sha1 + 3*PTR_SZ]

        xor     IDX, IDX
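
        ;; Main loop: each iteration consumes one 64-byte block from every
        ;; lane, transposes and byte-swaps it into W[0..15] on the stack,
        ;; then runs the 80 SHA-1 rounds for all four lanes at once.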
lloop:
        vmovdqa F, [rel PSHUFFLE_BYTE_FLIP_MASK]
%assign I 0
%rep 4
        VMOVPS  T2, [inp0+IDX]
        VMOVPS  T1, [inp1+IDX]
        VMOVPS  T4, [inp2+IDX]
        VMOVPS  T3, [inp3+IDX]
        TRANSPOSE T2, T1, T4, T3, T0, T5
        vpshufb T0, T0, F
        vmovdqa [rsp+(I*4+0)*16], T0
        vpshufb T1, T1, F
        vmovdqa [rsp+(I*4+1)*16], T1
        vpshufb T2, T2, F
        vmovdqa [rsp+(I*4+2)*16], T2
        vpshufb T3, T3, F
        vmovdqa [rsp+(I*4+3)*16], T3
        add     IDX, 4*4
%assign I (I+1)
%endrep

        ; save old digests
        vmovdqa AA, A
        vmovdqa BB, B
        vmovdqa CC, C
        vmovdqa DD, D
        vmovdqa EE, E

;;
;; perform 0-79 steps
;;
        vmovdqa K, [rel K00_19]
;; do rounds 0...15
%assign I 0
%rep 16
        SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
        ROTATE_ARGS
%assign I (I+1)
%endrep

;; do rounds 16...19
        vmovdqa W16, [rsp + ((16 - 16) & 15) * 16]
        vmovdqa W15, [rsp + ((16 - 15) & 15) * 16]
%rep 4
        SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
        ROTATE_ARGS
%assign I (I+1)
%endrep

;; do rounds 20...39
        vmovdqa K, [rel K20_39]
%rep 20
        SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1
        ROTATE_ARGS
%assign I (I+1)
%endrep

;; do rounds 40...59
        vmovdqa K, [rel K40_59]
%rep 20
        SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2
        ROTATE_ARGS
%assign I (I+1)
%endrep

;; do rounds 60...79
        vmovdqa K, [rel K60_79]
%rep 20
        SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3
        ROTATE_ARGS
%assign I (I+1)
%endrep

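        ; Feed-forward: add the saved input digest back into the working state
        ; to form this block's intermediate hash value (per FIPS 180).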
        vpaddd  A, A, AA
        vpaddd  B, B, BB
        vpaddd  C, C, CC
        vpaddd  D, D, DD
        vpaddd  E, E, EE

        sub     arg2, 1
        jne     lloop

        ; write out digests
        vmovdqa [arg1 + 0*SHA1_DIGEST_ROW_SIZE], A
        vmovdqa [arg1 + 1*SHA1_DIGEST_ROW_SIZE], B
        vmovdqa [arg1 + 2*SHA1_DIGEST_ROW_SIZE], C
        vmovdqa [arg1 + 3*SHA1_DIGEST_ROW_SIZE], D
        vmovdqa [arg1 + 4*SHA1_DIGEST_ROW_SIZE], E

        ; update input pointers
        add     inp0, IDX
        mov     [arg1 + _data_ptr_sha1 + 0*PTR_SZ], inp0
        add     inp1, IDX
        mov     [arg1 + _data_ptr_sha1 + 1*PTR_SZ], inp1
        add     inp2, IDX
        mov     [arg1 + _data_ptr_sha1 + 2*PTR_SZ], inp2
        add     inp3, IDX
        mov     [arg1 + _data_ptr_sha1 + 3*PTR_SZ], inp3

        ;;;;;;;;;;;;;;;;
        ;; Postamble

        ;; Clear the stack area that held part of the message
%ifdef SAFE_DATA
        vpxor   xmm0, xmm0
%assign i 0
%rep 16
        vmovdqa [rsp + i*16], xmm0
%assign i (i+1)
%endrep
%endif

        add     rsp, FRAMESZ

        ret

%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif