]> git.proxmox.com Git - ceph.git/blame - ceph/src/spdk/intel-ipsec-mb/sse/sha1_mult_sse.asm
update source to Ceph Pacific 16.2.2
[ceph.git] / ceph / src / spdk / intel-ipsec-mb / sse / sha1_mult_sse.asm
CommitLineData
11fdf7f2
TL
1;;
2;; Copyright (c) 2012-2018, Intel Corporation
3;;
4;; Redistribution and use in source and binary forms, with or without
5;; modification, are permitted provided that the following conditions are met:
6;;
7;; * Redistributions of source code must retain the above copyright notice,
8;; this list of conditions and the following disclaimer.
9;; * Redistributions in binary form must reproduce the above copyright
10;; notice, this list of conditions and the following disclaimer in the
11;; documentation and/or other materials provided with the distribution.
12;; * Neither the name of Intel Corporation nor the names of its contributors
13;; may be used to endorse or promote products derived from this software
14;; without specific prior written permission.
15;;
16;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26;;
27
f67539c2 28%include "include/os.asm"
11fdf7f2
TL
29
30;%define DO_DBGPRINT
f67539c2 31%include "include/dbgprint.asm"
11fdf7f2
TL
32
33%include "mb_mgr_datastruct.asm"
34
; Constants used by sha1_mult_sse. Each one is a single 16-byte value written
; as two qwords (low qword first); the equivalent one-piece "ddq" spelling is
; kept in the trailing comment on each label. "align 16" makes them usable as
; movdqa memory operands.
35section .data
36default rel
37align 16
; pshufb control mask: destination byte i takes source byte mask[i], which
; reverses the four bytes inside every 32-bit lane.
38PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203
39 dq 0x0405060700010203, 0x0c0d0e0f08090a0b
; Additive round constants, each replicated into all four 32-bit lanes.
; sha1_mult_sse loads them into K before rounds 0-19, 20-39, 40-59 and 60-79
; respectively.
40K00_19: ;ddq 0x5A8279995A8279995A8279995A827999
41 dq 0x5A8279995A827999, 0x5A8279995A827999
42K20_39: ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
43 dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
44K40_59: ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
45 dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
46K60_79: ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
47 dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
48
; Everything below this point is code.
49section .text
50
51;; code to compute quad SHA1 using SSE
52;; derived from ...\sha1_multiple\sha1_quad4.asm
53;; variation of sha1_mult2.asm : clobbers all xmm regs, rcx left intact
54;; rbx, rsi, rdi, rbp, r12-r15 left intact
55;; This version is not safe to call from C/C++
56
57;; Stack must be aligned to 16 bytes before call
58;; Windows clobbers: rax rdx r8 r9 r10 r11
59;; Windows preserves: rbx rcx rsi rdi rbp r12 r13 r14 r15
60;;
61;; Linux clobbers: rax rsi r8 r9 r10 r11
62;; Linux preserves: rbx rcx rdx rdi rbp r12 r13 r14 r15
63;;
64;; clobbers xmm0-15
65
66; transpose r0, r1, r2, r3, t0, t1
67; "transpose" data in {r0..r3} using temps {t0..t1}
68; Input looks like: {r0 r1 r2 r3}
69; r0 = {a3 a2 a1 a0}
70; r1 = {b3 b2 b1 b0}
71; r2 = {c3 c2 c1 c0}
72; r3 = {d3 d2 d1 d0}
73;
74; output looks like: {t0 r1 r0 r3}
75; t0 = {d0 c0 b0 a0}
76; r1 = {d1 c1 b1 a1}
77; r0 = {d2 c2 b2 a2}
78; r3 = {d3 c3 b3 a3}
79;
80%macro TRANSPOSE 6
; 4x4 transpose of 32-bit elements held in four xmm registers.
;   %1..%4 : r0..r3 - input rows (all four are overwritten)
;   %5, %6 : t0, t1 - scratch registers (both are overwritten)
; Result rows 0..3 end up in t0, r1, r0, r3 (in that order). On exit r2 and
; t1 still hold the intermediate values shown in the comments below; they are
; not part of the result.
81%define %%r0 %1
82%define %%r1 %2
83%define %%r2 %3
84%define %%r3 %4
85%define %%t0 %5
86%define %%t1 %6
; Step 1: interleave rows pairwise, 64 bits at a time (a with b, c with d).
87 movaps %%t0, %%r0 ; t0 = {a3 a2 a1 a0}
88 shufps %%t0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0}
89 shufps %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2}
90
91 movaps %%t1, %%r2 ; t1 = {c3 c2 c1 c0}
92 shufps %%t1, %%r3, 0x44 ; t1 = {d1 d0 c1 c0}
93 shufps %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2}
94
; Step 2: pick the odd elements (0xDD) or even elements (0x88) of each pair to
; form the final rows. r1 and r3 are written from copies first because t0 and
; r0 are still needed as sources for the last two shuffles.
95 movaps %%r1, %%t0 ; r1 = {b1 b0 a1 a0}
96 shufps %%r1, %%t1, 0xDD ; r1 = {d1 c1 b1 a1}
97
98 movaps %%r3, %%r0 ; r3 = {b3 b2 a3 a2}
99 shufps %%r3, %%r2, 0xDD ; r3 = {d3 c3 b3 a3}
100
101 shufps %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2}
102 shufps %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0}
103%endmacro
104;;
105;; Magic functions defined in FIPS 180-1
106;;
107; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D)))
108%macro MAGIC_F0 5
; Round function used for rounds 0-19: F = D ^ (B & (C ^ D)).
;   %1 F : destination (overwritten)
;   %2 B, %3 C, %4 D : inputs (not modified)
;   %5 T : not used by this macro; it is accepted so the macro can be invoked
;          through the same 5-argument %%MAGIC call as the other MAGIC_Fi.
109%define %%regF %1
110%define %%regB %2
111%define %%regC %3
112%define %%regD %4
113%define %%regT %5
114 movdqa %%regF,%%regC
115 pxor %%regF,%%regD
116 pand %%regF,%%regB
117 pxor %%regF,%%regD
118%endmacro
119
120; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D)
121%macro MAGIC_F1 5
; Round function used for rounds 20-39 (and, via MAGIC_F3, 60-79):
; F = B ^ C ^ D.
;   %1 F : destination (overwritten)
;   %2 B, %3 C, %4 D : inputs (not modified)
;   %5 T : not used by this macro
122%define %%regF %1
123%define %%regB %2
124%define %%regC %3
125%define %%regD %4
126%define %%regT %5
127 movdqa %%regF,%%regD
128 pxor %%regF,%%regC
129 pxor %%regF,%%regB
130%endmacro
131
132; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D))
133%macro MAGIC_F2 5
; Round function used for rounds 40-59 (bitwise majority of B, C, D).
; It is evaluated as F = ((B | C) & D) | (B & C), which is equivalent to the
; three-term form (B & C) | (B & D) | (C & D) but needs one operation fewer.
;   %1 F : destination (overwritten)
;   %2 B, %3 C, %4 D : inputs (not modified)
;   %5 T : scratch register, overwritten with (B & C)
134%define %%regF %1
135%define %%regB %2
136%define %%regC %3
137%define %%regD %4
138%define %%regT %5
139 movdqa %%regF,%%regB
140 movdqa %%regT,%%regB
141 por %%regF,%%regC
142 pand %%regT,%%regC
143 pand %%regF,%%regD
144 por %%regF,%%regT
145%endmacro
146
147; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D)
148%macro MAGIC_F3 5
; Round function used for rounds 60-79. It is the same computation as
; MAGIC_F1 (F = B ^ C ^ D), so this macro simply forwards its five arguments.
149%define %%regF %1
150%define %%regB %2
151%define %%regC %3
152%define %%regD %4
153%define %%regT %5
154 MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
155%endmacro
156
157; PROLD reg, imm, tmp
158%macro PROLD 3
; Rotate every 32-bit lane of an xmm register left by a constant bit count.
;   %1 reg : register to rotate (updated in place)
;   %2 imm : rotate count; it also appears in the expression (32-imm), so it
;            must be an assemble-time constant
;   %3 tmp : scratch register (overwritten)
; SSE has no packed rotate, so the result is built as
; (reg << imm) | (reg >> (32 - imm)).
159%define %%reg %1
160%define %%imm %2
161%define %%tmp %3
162 movdqa %%tmp, %%reg
163 pslld %%reg, %%imm
164 psrld %%tmp, (32-%%imm)
165 por %%reg, %%tmp
166%endmacro
167
168%macro SHA1_STEP_00_15 10
; One SHA-1 round for steps 0-15, where the message word is read directly
; from the stack frame (no schedule expansion). Per 32-bit lane:
;     E += K + W[memW] + rol(A, 5) + MAGIC(B, C, D)
;     B  = rol(B, 30)
; The a..e renaming for the next round is not done here; sha1_mult_sse
; invokes ROTATE_ARGS after each step.
;   %1..%5 A,B,C,D,E : working state registers (E and B are modified)
;   %6 T             : scratch register (overwritten)
;   %7 F             : scratch register; receives the MAGIC result
;   %8 memW          : index of the 16-byte W slot to read, at [rsp + memW*16]
;   %9 immCNT        : operand holding the round constant (sha1_mult_sse
;                      passes the K register)
;   %10 MAGIC        : name of the MAGIC_Fi macro to use for this round
169%define %%regA %1
170%define %%regB %2
171%define %%regC %3
172%define %%regD %4
173%define %%regE %5
174%define %%regT %6
175%define %%regF %7
176%define %%memW %8
177%define %%immCNT %9
178%define %%MAGIC %10
179 paddd %%regE,%%immCNT
180 paddd %%regE,[rsp + (%%memW * 16)]
; T = rol(A, 5); F is only borrowed as PROLD's temporary here and is
; overwritten by the MAGIC call below.
181 movdqa %%regT,%%regA
182 PROLD %%regT,5, %%regF
183 paddd %%regE,%%regT
184 %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
; B is rotated only after MAGIC has consumed its un-rotated value.
185 PROLD %%regB,30, %%regT
186 paddd %%regE,%%regF
187%endmacro
188
189%macro SHA1_STEP_16_79 10
; One SHA-1 round for steps 16-79: first expands the message schedule, then
; performs the same state update as SHA1_STEP_00_15. Per 32-bit lane, with
; t = memW:
;     W[t] = rol(W[t-16] ^ W[t-14] ^ W[t-8] ^ W[t-3], 1)
;     E   += K + W[t] + rol(A, 5) + MAGIC(B, C, D)
;     B    = rol(B, 30)
; W is kept as a ring of sixteen 16-byte slots at [rsp]; slot index is
; (t & 15), so W[t] overwrites the slot that held W[t-16].
;
; Besides its parameters the macro uses the file-level register aliases
; W14, W15, W16 and renames them with ROTATE_W:
;   on entry : W16 must hold W[t-16], W15 must hold W[t-15]
;   on exit  : (after renaming) W16 names the register holding W[t-15] and
;              W15 the register holding W[t-14], which is what the step for
;              t+1 needs; W14 names a register with no useful content.
;
; Parameters are the same as for SHA1_STEP_00_15:
;   %1..%5 A,B,C,D,E ; %6 T scratch ; %7 F scratch / MAGIC result ;
;   %8 memW = round number t ; %9 round-constant operand ; %10 MAGIC macro.
190%define %%regA %1
191%define %%regB %2
192%define %%regC %3
193%define %%regD %4
194%define %%regE %5
195%define %%regT %6
196%define %%regF %7
197%define %%memW %8
198%define %%immCNT %9
199%define %%MAGIC %10
200 paddd %%regE,%%immCNT
; W[t-14] is loaded into a register (rather than used as a memory operand)
; because it becomes W[t-15] / W[t-16] for the next two steps.
201 movdqa W14, [rsp + ((%%memW - 14) & 15) * 16]
202 pxor W16, W14
203 pxor W16, [rsp + ((%%memW - 8) & 15) * 16]
204 pxor W16, [rsp + ((%%memW - 3) & 15) * 16]
; F = rol(W16, 1), built from a left shift by 1 and a right shift by 31.
; This leaves W16 holding only the left-shifted half, which is why it is
; treated as free after the rename.
205 movdqa %%regF, W16
206 pslld W16, 1
207 psrld %%regF, (32-1)
208 por %%regF, W16
209 ROTATE_W
210
; Store the new W[t] into the ring and add it to E before F is reused as a
; temporary below.
211 movdqa [rsp + ((%%memW - 0) & 15) * 16],%%regF
212 paddd %%regE,%%regF
213 movdqa %%regT,%%regA
214 PROLD %%regT,5, %%regF
215 paddd %%regE,%%regT
216 %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
217 PROLD %%regB,30, %%regT
218 paddd %%regE,%%regF
219%endmacro
220
221;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
222;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
223;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
224
225;; FRAMESZ must be an odd multiple of 8
; Stack frame size: 16 slots of 16 bytes for the W ring, plus 8 bytes.
; With the stack 16-byte aligned before the call (see the note near the top
; of the file), rsp is 8 mod 16 on entry; the extra 8 makes rsp 16-byte
; aligned after "sub rsp, FRAMESZ", as required by the movdqa accesses to
; [rsp + n*16].
; NOTE(review): the value is an unparenthesized expression; it is only used
; as a lone operand in this file, where that is harmless.
226%define FRAMESZ 16*16 + 8
227
; Instruction used to load message data from the caller's buffers; the
; unaligned form is used, so those buffers need no particular alignment.
228%define MOVPS movdqu
229
; Integer argument registers, selected by the LINUX build define.
230%ifdef LINUX
231%define arg1 rdi
232%define arg2 rsi
233%else
234%define arg1 rcx
235%define arg2 rdx
236%endif
237
; Per-lane input data pointers.
238%define inp0 r8
239%define inp1 r9
240%define inp2 r10
241%define inp3 r11
242
; Byte offset into the input buffers, shared by all four lanes.
243%define IDX rax
244
; SHA-1 working state a..e, one 32-bit value per lane in each register.
; These names are re-pointed at assemble time by ROTATE_ARGS.
245%define A xmm0
246%define B xmm1
247%define C xmm2
248%define D xmm3
249%define E xmm4
250%define F xmm5 ; tmp
251%define G xmm6 ; tmp
252
; Scratch, round-function result and round-constant registers for the rounds.
253%define TMP G
254%define FUN F
255%define K xmm7
256
; Copies of the state taken at the start of a block (added back after
; round 79).
257%define AA xmm8
258%define BB xmm9
259%define CC xmm10
260%define DD xmm11
261%define EE xmm12
262
; Temporaries for the message load / transpose phase. They alias TMP/G (xmm6),
; K (xmm7) and AA..DD (xmm8-xmm11); sha1_mult_sse only uses them before those
; registers are written for the current block.
263%define T0 xmm6
264%define T1 xmm7
265%define T2 xmm8
266%define T3 xmm9
267%define T4 xmm10
268%define T5 xmm11
269
; Registers caching recent message-schedule words for SHA1_STEP_16_79.
; These names are re-pointed at assemble time by ROTATE_W.
270%define W14 xmm13
271%define W15 xmm14
272%define W16 xmm15
273
274%macro ROTATE_ARGS 0
; Assemble-time renaming of the state registers between rounds; emits no
; instructions. After it runs, A names the register previously called E,
; B the one previously called A, C <- B, D <- C and E <- D. Five invocations
; restore the original mapping.
; %xdefine (not %define) is required so each name is bound to the register
; the right-hand side refers to at this moment.
275%xdefine TMP_ E
276%xdefine E D
277%xdefine D C
278%xdefine C B
279%xdefine B A
280%xdefine A TMP_
281%endm
282
283%macro ROTATE_W 0
; Assemble-time renaming of the message-schedule cache registers; emits no
; instructions. After it runs, W16 names the register previously called W15,
; W15 the one previously called W14, and W14 the one previously called W16.
; Three invocations restore the original mapping.
284%xdefine TMP_ W16
285%xdefine W16 W15
286%xdefine W15 W14
287%xdefine W14 TMP_
288%endm
289
290align 32
291
292; XMM registers are clobbered. Saving/restoring must be done at a higher level
293
294; void sha1_mult_sse(SHA1_ARGS *args, UINT32 size_in_blocks);
295; arg 1 : arg1 (rcx on Windows, rdi on Linux) : pointer to args
296; arg 2 : arg2 (rdx on Windows, rsi on Linux) : size (in blocks) ;; assumed to be >= 1
;
; Processes size_in_blocks 64-byte blocks from each of four independent input
; streams, updating four SHA-1 states in parallel (one stream per 32-bit lane).
;
; Reads : five digest rows at [args + k*SHA1_DIGEST_ROW_SIZE], k = 0..4
;         (accessed with movdqa, so each row address must be 16-byte aligned),
;         and four data pointers at [args + _data_ptr_sha1 + n*PTR_SZ].
; Writes: the same five digest rows, and the four data pointers advanced past
;         the consumed input.
; Modifies: arg2 (counted down to 0), rax, r8-r11, flags, and the xmm
;         registers (see the note above).
; Stack : FRAMESZ bytes below the return address, used for the W ring.
; A block count of 0 is not handled: the loop counter is decremented before
; it is tested.
297MKGLOBAL(sha1_mult_sse,function,internal)
298sha1_mult_sse:
299
300 sub rsp, FRAMESZ
301
302 ;; Initialize digests
303 movdqa A, [arg1 + 0*SHA1_DIGEST_ROW_SIZE]
304 movdqa B, [arg1 + 1*SHA1_DIGEST_ROW_SIZE]
305 movdqa C, [arg1 + 2*SHA1_DIGEST_ROW_SIZE]
306 movdqa D, [arg1 + 3*SHA1_DIGEST_ROW_SIZE]
307 movdqa E, [arg1 + 4*SHA1_DIGEST_ROW_SIZE]
308 DBGPRINTL_XMM "Sha1-SSE Incoming transposed digest", A, B, C, D, E
309 ;; load input pointers
310 mov inp0,[arg1 + _data_ptr_sha1 + 0*PTR_SZ]
311 mov inp1,[arg1 + _data_ptr_sha1 + 1*PTR_SZ]
312 mov inp2,[arg1 + _data_ptr_sha1 + 2*PTR_SZ]
313 mov inp3,[arg1 + _data_ptr_sha1 + 3*PTR_SZ]
314 DBGPRINTL64 "Sha1-SSE Incoming data ptrs", inp0, inp1, inp2, inp3
; IDX is zeroed once and never reset inside the loop, so it accumulates the
; total number of bytes consumed per lane across all blocks.
315 xor IDX, IDX
; ---- per-block loop: one iteration consumes 64 bytes from every lane ----
316lloop:
; F doubles as FUN and is overwritten during the rounds, so the byte-flip
; mask has to be reloaded for every block.
317 movdqa F, [rel PSHUFFLE_BYTE_FLIP_MASK]
; Load the block: each of the 4 repetitions reads 16 bytes from every lane,
; transposes them so that one register holds the same word index for all four
; lanes, byte-swaps each 32-bit word and stores the four rows to W slots
; I*4+0 .. I*4+3. With this argument order TRANSPOSE leaves rows 0..3 in
; T0, T1, T2, T3.
318%assign I 0
319%rep 4
320 MOVPS T2,[inp0+IDX]
321 MOVPS T1,[inp1+IDX]
322 MOVPS T4,[inp2+IDX]
323 MOVPS T3,[inp3+IDX]
324 TRANSPOSE T2, T1, T4, T3, T0, T5
325 DBGPRINTL_XMM "sha1 incoming data", T0, T1, T2, T3
326 pshufb T0, F
327 movdqa [rsp+(I*4+0)*16],T0
328 pshufb T1, F
329 movdqa [rsp+(I*4+1)*16],T1
330 pshufb T2, F
331 movdqa [rsp+(I*4+2)*16],T2
332 pshufb T3, F
333 movdqa [rsp+(I*4+3)*16],T3
334 add IDX, 4*4
335%assign I (I+1)
336%endrep
337
338 ; save old digests
339 movdqa AA, A
340 movdqa BB, B
341 movdqa CC, C
342 movdqa DD, D
343 movdqa EE, E
344
345;;
346;; perform 0-79 steps
347;;
; The 80 rounds are fully unrolled at assemble time. I is the round number and
; keeps counting across the %rep groups below.
348 movdqa K, [rel K00_19]
349;; do rounds 0...15
350%assign I 0
351%rep 16
352 SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
353 ROTATE_ARGS
354%assign I (I+1)
355%endrep
356
357;; do rounds 16...19
; Prime the schedule cache for round 16: W16 = W[0], W15 = W[1].
358 movdqa W16, [rsp + ((16 - 16) & 15) * 16]
359 movdqa W15, [rsp + ((16 - 15) & 15) * 16]
360%rep 4
361 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
362 ROTATE_ARGS
363%assign I (I+1)
364%endrep
365
366;; do rounds 20...39
367 movdqa K, [rel K20_39]
368%rep 20
369 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1
370 ROTATE_ARGS
371%assign I (I+1)
372%endrep
373
374;; do rounds 40...59
375 movdqa K, [rel K40_59]
376%rep 20
377 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2
378 ROTATE_ARGS
379%assign I (I+1)
380%endrep
381
382;; do rounds 60...79
383 movdqa K, [rel K60_79]
384%rep 20
385 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3
386 ROTATE_ARGS
387%assign I (I+1)
388%endrep
389
; ROTATE_ARGS has now been applied 80 times, a multiple of its period of 5,
; so A..E name xmm0..xmm4 again here. Add the state saved at the start of
; the block.
390 paddd A,AA
391 paddd B,BB
392 paddd C,CC
393 paddd D,DD
394 paddd E,EE
395
; Count the block; loop while blocks remain.
396 sub arg2, 1
397 jne lloop
398
399 ; write out digests
400 movdqa [arg1 + 0*SHA1_DIGEST_ROW_SIZE], A
401 movdqa [arg1 + 1*SHA1_DIGEST_ROW_SIZE], B
402 movdqa [arg1 + 2*SHA1_DIGEST_ROW_SIZE], C
403 movdqa [arg1 + 3*SHA1_DIGEST_ROW_SIZE], D
404 movdqa [arg1 + 4*SHA1_DIGEST_ROW_SIZE], E
405 DBGPRINTL_XMM "Sha1 Outgoing transposed digest", A, B, C, D, E
406 ; update input pointers
; IDX holds the total byte count consumed per lane (64 * number of blocks).
407 add inp0, IDX
408 mov [arg1 + _data_ptr_sha1 + 0*PTR_SZ], inp0
409 add inp1, IDX
410 mov [arg1 + _data_ptr_sha1 + 1*PTR_SZ], inp1
411 add inp2, IDX
412 mov [arg1 + _data_ptr_sha1 + 2*PTR_SZ], inp2
413 add inp3, IDX
414 mov [arg1 + _data_ptr_sha1 + 3*PTR_SZ], inp3
415 DBGPRINTL64 "Sha1-sse outgoing data ptrs", inp0, inp1, inp2, inp3
416 ;;;;;;;;;;;;;;;;
417 ;; Postamble
418
f67539c2
TL
419 ;; Clear stack frame (16*16 bytes)
; Only assembled when SAFE_DATA is defined. Overwrites the 16 W slots with
; zeros so no message-schedule words remain in the frame. xmm0 can be reused
; as the zero source because the digests have already been written out above.
420%ifdef SAFE_DATA
421 pxor xmm0, xmm0
422%assign i 0
423%rep 16
424 movdqa [rsp + i*16], xmm0
425%assign i (i+1)
426%endrep
427%endif
428
11fdf7f2
TL
429 add rsp, FRAMESZ
430
431 ret
432
; On Linux builds, emit an empty .note.GNU-stack section. By GNU toolchain
; convention this marks the object as not requiring an executable stack.
433%ifdef LINUX
434section .note.GNU-stack noalloc noexec nowrite progbits
435%endif