;;
;; Copyright (c) 2012-2018, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;;     * Redistributions of source code must retain the above copyright notice,
;;       this list of conditions and the following disclaimer.
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;     * Neither the name of Intel Corporation nor the names of its contributors
;;       may be used to endorse or promote products derived from this software
;;       without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;

;; Stack must be aligned to 32 bytes before call
;; Windows clobbers:  rax rdx     r8 r9 r10 r11 r12 r13 r14 r15
;; Windows preserves: rbx rcx rsi rdi rbp
;;
;; Linux clobbers:    rax rdx rsi r9 r10 r11 r12 r13 r14 r15
;; Linux preserves:   rbx rcx rdi rbp r8
;;
;; clobbers ymm0-15

%include "include/os.asm"
;%define DO_DBGPRINT
%include "include/dbgprint.asm"
%include "mb_mgr_datastruct.asm"
%include "include/transpose_avx2.asm"
section .data
default rel
align 32
PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203
                         ;ddq 0x0c0d0e0f08090a0b0405060700010203
        dq 0x0405060700010203, 0x0c0d0e0f08090a0b
        dq 0x0405060700010203, 0x0c0d0e0f08090a0b
K00_19:                  ;ddq 0x5A8279995A8279995A8279995A827999
                         ;ddq 0x5A8279995A8279995A8279995A827999
        dq 0x5A8279995A827999, 0x5A8279995A827999
        dq 0x5A8279995A827999, 0x5A8279995A827999
K20_39:                  ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
                         ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
        dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
        dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
K40_59:                  ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
                         ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
        dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
        dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
K60_79:                  ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
                         ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
        dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
        dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
section .text

%ifdef LINUX
%define arg1    rdi
%define arg2    rsi
%define reg3    rdx
%else
%define arg1    rcx
%define arg2    rdx
%define reg3    r8
%endif

%define state    arg1
%define num_blks arg2

%define inp0 r9
%define inp1 r10
%define inp2 r11
%define inp3 r12
%define inp4 r13
%define inp5 r14
%define inp6 r15
%define inp7 reg3
%define IDX rax

; ymm0  A
; ymm1  B
; ymm2  C
; ymm3  D
; ymm4  E
; ymm5  F       AA
; ymm6  T0      BB
; ymm7  T1      CC
; ymm8  T2      DD
; ymm9  T3      EE
; ymm10 T4      TMP
; ymm11 T5      FUN
; ymm12 T6      K
; ymm13 T7      W14
; ymm14 T8      W15
; ymm15 T9      W16

%define A       ymm0
%define B       ymm1
%define C       ymm2
%define D       ymm3
%define E       ymm4

%define F       ymm5
%define T0      ymm6
%define T1      ymm7
%define T2      ymm8
%define T3      ymm9
%define T4      ymm10
%define T5      ymm11
%define T6      ymm12
%define T7      ymm13
%define T8      ymm14
%define T9      ymm15

%define AA      ymm5
%define BB      ymm6
%define CC      ymm7
%define DD      ymm8
%define EE      ymm9
%define TMP     ymm10
%define FUN     ymm11
%define K       ymm12
%define W14     ymm13
%define W15     ymm14
%define W16     ymm15

;; Assume stack aligned to 32 bytes before call
;; Therefore FRAMESZ mod 32 must be 32-8 = 24
%define FRAMESZ 32*16 + 24
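;; A quick check of that arithmetic: the caller's rsp is 0 mod 32 and
;; the call pushes an 8-byte return address, so on entry rsp is 24 mod 32.
;; FRAMESZ = 16*32 + 24 is also 24 mod 32, so rsp - FRAMESZ is 32-byte
;; aligned, which keeps the aligned vmovdqa accesses to the 16 x 32-byte
;; message-schedule slots below safe.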

%define VMOVPS  vmovups

;;
;; Magic functions defined in FIPS 180-1
;;
;MAGIC_F0 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; (D ^ (B & (C ^ D)))
%macro MAGIC_F0 5
%define %%regF %1
%define %%regB %2
%define %%regC %3
%define %%regD %4
%define %%regT %5
        ;vmovdqa %%regF,%%regC
        vpxor   %%regF, %%regC,%%regD
        vpand   %%regF, %%regF,%%regB
        vpxor   %%regF, %%regF,%%regD
%endmacro
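;; Note: D ^ (B & (C ^ D)) equals the FIPS 180-1 choice function
;; Ch(B,C,D) = (B & C) | (~B & D), rewritten so that no separate NOT
;; (and no extra temporary register) is needed.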

;MAGIC_F1 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; (B ^ C ^ D)
%macro MAGIC_F1 5
%define %%regF %1
%define %%regB %2
%define %%regC %3
%define %%regD %4
%define %%regT %5
        ;vmovdqa %%regF,%%regD
        vpxor   %%regF,%%regD,%%regC
        vpxor   %%regF,%%regF,%%regB
%endmacro

;MAGIC_F2 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; ((B & C) | (B & D) | (C & D))
%macro MAGIC_F2 5
%define %%regF %1
%define %%regB %2
%define %%regC %3
%define %%regD %4
%define %%regT %5
        ;vmovdqa %%regF,%%regB
        ;vmovdqa %%regT,%%regB
        vpor    %%regF,%%regB,%%regC
        vpand   %%regT,%%regB,%%regC
        vpand   %%regF,%%regF,%%regD
        vpor    %%regF,%%regF,%%regT
%endmacro
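;; Note: the majority function Maj(B,C,D) = (B & C) | (B & D) | (C & D)
;; is computed above via the equivalent ((B | C) & D) | (B & C), which
;; takes only four vector operations.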

;MAGIC_F3 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; (B ^ C ^ D), same parity function as MAGIC_F1
%macro MAGIC_F3 5
%define %%regF %1
%define %%regB %2
%define %%regC %3
%define %%regD %4
%define %%regT %5
        MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
%endmacro

; PROLD reg, imm, tmp
%macro PROLD 3
%define %%reg %1
%define %%imm %2
%define %%tmp %3
        ;vmovdqa %%tmp, %%reg
        vpsrld  %%tmp, %%reg, (32-%%imm)
        vpslld  %%reg, %%reg, %%imm
        vpor    %%reg, %%reg, %%tmp
%endmacro
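;; PROLD emulates a 32-bit rotate-left: (reg << imm) | (reg >> (32-imm)).
;; AVX2 has no vector rotate instruction (vprold arrives with AVX-512),
;; hence the shift/shift/or sequence.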

; PROLD_nd reg, imm, tmp, src  (non-destructive: reads src, writes reg)
%macro PROLD_nd 4
%define %%reg %1
%define %%imm %2
%define %%tmp %3
%define %%src %4
        ;vmovdqa %%tmp, %%reg
        vpsrld  %%tmp, %%src, (32-%%imm)
        vpslld  %%reg, %%src, %%imm
        vpor    %%reg, %%reg, %%tmp
%endmacro

%macro SHA1_STEP_00_15 10
%define %%regA   %1
%define %%regB   %2
%define %%regC   %3
%define %%regD   %4
%define %%regE   %5
%define %%regT   %6
%define %%regF   %7
%define %%memW   %8
%define %%immCNT %9
%define %%MAGIC  %10
        vpaddd  %%regE, %%regE,%%immCNT
        vpaddd  %%regE, %%regE,[rsp + (%%memW * 32)]
        ;vmovdqa %%regT,%%regA
        PROLD_nd %%regT,5, %%regF,%%regA
        vpaddd  %%regE, %%regE,%%regT
        %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT      ;; FUN  = MAGIC_Fi(B,C,D)
        PROLD   %%regB,30, %%regT
        vpaddd  %%regE, %%regE,%%regF
%endmacro
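;; Each step computes one SHA-1 round across all 8 lanes at once:
;;   E += K + W[i] + ROTL5(A) + F(B,C,D);  B = ROTL30(B)
;; ROTATE_ARGS (below) then renames the variables so that
;; (A,B,C,D,E) <- (E,A,B,C,D) without moving any data.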

%macro SHA1_STEP_16_79 10
%define %%regA   %1
%define %%regB   %2
%define %%regC   %3
%define %%regD   %4
%define %%regE   %5
%define %%regT   %6
%define %%regF   %7
%define %%memW   %8
%define %%immCNT %9
%define %%MAGIC  %10
        vpaddd  %%regE, %%regE,%%immCNT

        vmovdqa W14, [rsp + ((%%memW - 14) & 15) * 32]
        vpxor   W16, W16, W14
        vpxor   W16, W16, [rsp + ((%%memW - 8) & 15) * 32]
        vpxor   W16, W16, [rsp + ((%%memW - 3) & 15) * 32]

        ;vmovdqa %%regF, W16
        vpsrld  %%regF, W16, (32-1)
        vpslld  W16, W16, 1
        vpor    %%regF, %%regF, W16
        ROTATE_W

        vmovdqa [rsp + ((%%memW - 0) & 15) * 32],%%regF
        vpaddd  %%regE, %%regE,%%regF

        ;vmovdqa %%regT,%%regA
        PROLD_nd %%regT,5, %%regF, %%regA
        vpaddd  %%regE, %%regE,%%regT
        %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT      ;; FUN  = MAGIC_Fi(B,C,D)
        PROLD   %%regB,30, %%regT
        vpaddd  %%regE,%%regE,%%regF
%endmacro
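;; Rounds 16-79 also extend the message schedule in place:
;;   W[i] = ROTL1(W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16])
;; The 16 stack slots form a circular buffer indexed mod 16 (the
;; (%%memW - n) & 15 expressions). W15/W16 carry the two oldest live
;; entries across rounds via the ROTATE_W renaming, while W14 is
;; reloaded from the stack each round.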


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%macro ROTATE_ARGS 0
%xdefine TMP_ E
%xdefine E D
%xdefine D C
%xdefine C B
%xdefine B A
%xdefine A TMP_
%endm

%macro ROTATE_W 0
%xdefine TMP_ W16
%xdefine W16 W15
%xdefine W15 W14
%xdefine W14 TMP_
%endm
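;; Both macros "rotate" by reassigning %xdefine aliases at assembly
;; time, so the renaming costs no instructions at run time.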

align 32

; void sha1_x8_avx2(void *state, int num_blks)
; arg 1 : rcx : pointer to array[8] of pointers to input data
; arg 2 : rdx : size (in blocks) ;; assumed to be >= 1
MKGLOBAL(sha1_x8_avx2,function,internal)
sha1_x8_avx2:
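        ;; The state argument (layout defined in mb_mgr_datastruct.asm)
        ;; keeps the digests transposed: digest row j holds word j (A..E)
        ;; of all 8 lanes, hence the SHA1_DIGEST_ROW_SIZE stride in the
        ;; loads below.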
        sub     rsp, FRAMESZ

        ;; Initialize digests
        vmovdqu A, [state + 0*SHA1_DIGEST_ROW_SIZE]
        vmovdqu B, [state + 1*SHA1_DIGEST_ROW_SIZE]
        vmovdqu C, [state + 2*SHA1_DIGEST_ROW_SIZE]
        vmovdqu D, [state + 3*SHA1_DIGEST_ROW_SIZE]
        vmovdqu E, [state + 4*SHA1_DIGEST_ROW_SIZE]
        DBGPRINTL_YMM "Sha1-AVX2 incoming transposed digest", A, B, C, D, E

        ;; transpose input onto stack
        mov     inp0,[state+_data_ptr_sha1+0*PTR_SZ]
        mov     inp1,[state+_data_ptr_sha1+1*PTR_SZ]
        mov     inp2,[state+_data_ptr_sha1+2*PTR_SZ]
        mov     inp3,[state+_data_ptr_sha1+3*PTR_SZ]
        mov     inp4,[state+_data_ptr_sha1+4*PTR_SZ]
        mov     inp5,[state+_data_ptr_sha1+5*PTR_SZ]
        mov     inp6,[state+_data_ptr_sha1+6*PTR_SZ]
        mov     inp7,[state+_data_ptr_sha1+7*PTR_SZ]

        xor     IDX, IDX
lloop:
        vmovdqa F, [rel PSHUFFLE_BYTE_FLIP_MASK]
%assign I 0
%rep 2
        TRANSPOSE8_U32_LOAD8 T0, T1, T2, T3, T4, T5, T6, T7, \
                             inp0, inp1, inp2, inp3, inp4, inp5, \
                             inp6, inp7, IDX

        TRANSPOSE8_U32 T0, T1, T2, T3, T4, T5, T6, T7, T8, T9
        DBGPRINTL_YMM "Sha1-AVX2 incoming transposed input", T0, T1, T2, T3, T4, T5, T6, T7, T8, T9
        vpshufb T0, T0, F
        vmovdqa [rsp+(I*8+0)*32],T0
        vpshufb T1, T1, F
        vmovdqa [rsp+(I*8+1)*32],T1
        vpshufb T2, T2, F
        vmovdqa [rsp+(I*8+2)*32],T2
        vpshufb T3, T3, F
        vmovdqa [rsp+(I*8+3)*32],T3
        vpshufb T4, T4, F
        vmovdqa [rsp+(I*8+4)*32],T4
        vpshufb T5, T5, F
        vmovdqa [rsp+(I*8+5)*32],T5
        vpshufb T6, T6, F
        vmovdqa [rsp+(I*8+6)*32],T6
        vpshufb T7, T7, F
        vmovdqa [rsp+(I*8+7)*32],T7
        add     IDX, 32
%assign I (I+1)
%endrep
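        ;; The stack now holds the full 64-byte block of every lane as
        ;; W[0..15]: slot i contains message word i of all 8 lanes, and
        ;; the vpshufb byte swap has already converted each big-endian
        ;; word for the little-endian 32-bit adds that follow.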

        ; save old digests
        vmovdqa AA, A
        vmovdqa BB, B
        vmovdqa CC, C
        vmovdqa DD, D
        vmovdqa EE, E

;;
;; perform 0-79 steps
;;
        vmovdqa K, [rel K00_19]
;; do rounds 0...15
%assign I 0
%rep 16
        SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
        ROTATE_ARGS
%assign I (I+1)
%endrep

;; do rounds 16...19
        vmovdqa W16, [rsp + ((16 - 16) & 15) * 32]
        vmovdqa W15, [rsp + ((16 - 15) & 15) * 32]
%rep 4
        SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
        ROTATE_ARGS
%assign I (I+1)
%endrep

;; do rounds 20...39
        vmovdqa K, [rel K20_39]
%rep 20
        SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1
        ROTATE_ARGS
%assign I (I+1)
%endrep

;; do rounds 40...59
        vmovdqa K, [rel K40_59]
%rep 20
        SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2
        ROTATE_ARGS
%assign I (I+1)
%endrep

;; do rounds 60...79
        vmovdqa K, [rel K60_79]
%rep 20
        SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3
        ROTATE_ARGS
%assign I (I+1)
%endrep

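        ;; Merkle-Damgard chaining: fold this block's result back into
        ;; the digests saved at the top of the loop.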
        vpaddd  A,A,AA
        vpaddd  B,B,BB
        vpaddd  C,C,CC
        vpaddd  D,D,DD
        vpaddd  E,E,EE

        sub     num_blks, 1
        jne     lloop

        ; write out digests
        vmovdqu [state + 0*SHA1_DIGEST_ROW_SIZE], A
        vmovdqu [state + 1*SHA1_DIGEST_ROW_SIZE], B
        vmovdqu [state + 2*SHA1_DIGEST_ROW_SIZE], C
        vmovdqu [state + 3*SHA1_DIGEST_ROW_SIZE], D
        vmovdqu [state + 4*SHA1_DIGEST_ROW_SIZE], E
        DBGPRINTL_YMM "Sha1-AVX2 outgoing transposed digest", A, B, C, D, E

        ;; update input pointers
        add     inp0, IDX
        add     inp1, IDX
        add     inp2, IDX
        add     inp3, IDX
        add     inp4, IDX
        add     inp5, IDX
        add     inp6, IDX
        add     inp7, IDX
        mov     [state+_data_ptr_sha1+0*PTR_SZ], inp0
        mov     [state+_data_ptr_sha1+1*PTR_SZ], inp1
        mov     [state+_data_ptr_sha1+2*PTR_SZ], inp2
        mov     [state+_data_ptr_sha1+3*PTR_SZ], inp3
        mov     [state+_data_ptr_sha1+4*PTR_SZ], inp4
        mov     [state+_data_ptr_sha1+5*PTR_SZ], inp5
        mov     [state+_data_ptr_sha1+6*PTR_SZ], inp6
        mov     [state+_data_ptr_sha1+7*PTR_SZ], inp7
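        ;; IDX advanced by 64 bytes per block inside the loop, so each
        ;; lane's data pointer now points just past its processed input.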

        ;;;;;;;;;;;;;;;;
        ;; Postamble

        ;; Clear stack frame (16*32 bytes)
%ifdef SAFE_DATA
        vpxor   ymm0, ymm0
%assign i 0
%rep 16
        vmovdqa [rsp + i*32], ymm0
%assign i (i+1)
%endrep
%endif

        add     rsp, FRAMESZ

        ret

%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif