;; Source: ceph/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_sse.asm
;; (Intel ISA-L crypto as vendored in ceph; from the reef 18.1.2 update)
1;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
3;
4; Redistribution and use in source and binary forms, with or without
1e59de90 5; modification, are permitted provided that the following conditions
7c673cae
FG
6; are met:
7; * Redistributions of source code must retain the above copyright
8; notice, this list of conditions and the following disclaimer.
9; * Redistributions in binary form must reproduce the above copyright
10; notice, this list of conditions and the following disclaimer in
11; the documentation and/or other materials provided with the
12; distribution.
13; * Neither the name of Intel Corporation nor the names of its
14; contributors may be used to endorse or promote products derived
15; from this software without specific prior written permission.
16;
17; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29
30%include "sha1_mb_mgr_datastruct.asm"
31%include "reg_sizes.asm"
32
1e59de90 33[bits 64]
7c673cae 34default rel
1e59de90 35section .text
7c673cae
FG
36
37;; code to compute quad SHA1 using SSE
38;; derived from ...\sha1_multiple\sha1_quad4.asm
39;; variation of sha1_mult2.asm
40
; TRANSPOSE r0, r1, r2, r3, t0, t1
; "transpose" data in {r0..r3} using temps {t0, t1}
; Input looks like: {r0 r1 r2 r3}
; r0 = {a3 a2 a1 a0}
; r1 = {b3 b2 b1 b0}
; r2 = {c3 c2 c1 c0}
; r3 = {d3 d2 d1 d0}
;
; output looks like: {t0 r1 r0 r3}
; t0 = {d0 c0 b0 a0}
; r1 = {d1 c1 b1 a1}
; r0 = {d2 c2 b2 a2}
; r3 = {d3 c3 b3 a3}
;
7c673cae
FG
55%macro TRANSPOSE 6
56%define %%r0 %1
57%define %%r1 %2
58%define %%r2 %3
59%define %%r3 %4
60%define %%t0 %5
61%define %%t1 %6
62 movaps %%t0, %%r0 ; t0 = {a3 a2 a1 a0}
63 shufps %%t0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0}
64 shufps %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2}
65
66 movaps %%t1, %%r2 ; t1 = {c3 c2 c1 c0}
67 shufps %%t1, %%r3, 0x44 ; t1 = {d1 d0 c1 c0}
68 shufps %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2}
69
70 movaps %%r1, %%t0 ; r1 = {b1 b0 a1 a0}
71 shufps %%r1, %%t1, 0xDD ; r1 = {d1 c1 b1 a1}
72
73 movaps %%r3, %%r0 ; r3 = {b3 b2 a3 a2}
74 shufps %%r3, %%r2, 0xDD ; r3 = {d3 c3 b3 a3}
75
76 shufps %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2}
77 shufps %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0}
1e59de90 78%endmacro
7c673cae
FG
79;;
80;; Magic functions defined in FIPS 180-1
81;;
82; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D)))
83%macro MAGIC_F0 5
84%define %%regF %1
85%define %%regB %2
86%define %%regC %3
87%define %%regD %4
88%define %%regT %5
89 movdqa %%regF,%%regC
90 pxor %%regF,%%regD
91 pand %%regF,%%regB
92 pxor %%regF,%%regD
93%endmacro
94
95; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D)
96%macro MAGIC_F1 5
97%define %%regF %1
98%define %%regB %2
99%define %%regC %3
100%define %%regD %4
101%define %%regT %5
102 movdqa %%regF,%%regD
103 pxor %%regF,%%regC
104 pxor %%regF,%%regB
105%endmacro
106
107; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D))
108%macro MAGIC_F2 5
109%define %%regF %1
110%define %%regB %2
111%define %%regC %3
112%define %%regD %4
113%define %%regT %5
114 movdqa %%regF,%%regB
115 movdqa %%regT,%%regB
116 por %%regF,%%regC
117 pand %%regT,%%regC
118 pand %%regF,%%regD
119 por %%regF,%%regT
120%endmacro
121
122; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D)
123%macro MAGIC_F3 5
124%define %%regF %1
125%define %%regB %2
126%define %%regC %3
127%define %%regD %4
128%define %%regT %5
129 MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
130%endmacro
131
132; PROLD reg, imm, tmp
133%macro PROLD 3
134%define %%reg %1
135%define %%imm %2
136%define %%tmp %3
137 movdqa %%tmp, %%reg
138 pslld %%reg, %%imm
139 psrld %%tmp, (32-%%imm)
140 por %%reg, %%tmp
141%endmacro
142
143%macro SHA1_STEP_00_15 10
144%define %%regA %1
145%define %%regB %2
146%define %%regC %3
147%define %%regD %4
148%define %%regE %5
149%define %%regT %6
150%define %%regF %7
151%define %%memW %8
152%define %%immCNT %9
153%define %%MAGIC %10
154 paddd %%regE,%%immCNT
155 paddd %%regE,[rsp + (%%memW * 16)]
156 movdqa %%regT,%%regA
157 PROLD %%regT,5, %%regF
158 paddd %%regE,%%regT
159 %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
160 PROLD %%regB,30, %%regT
161 paddd %%regE,%%regF
162%endmacro
163
164%macro SHA1_STEP_16_79 10
165%define %%regA %1
166%define %%regB %2
167%define %%regC %3
168%define %%regD %4
169%define %%regE %5
170%define %%regT %6
171%define %%regF %7
172%define %%memW %8
173%define %%immCNT %9
174%define %%MAGIC %10
175 paddd %%regE,%%immCNT
176 movdqa W14, [rsp + ((%%memW - 14) & 15) * 16]
177 pxor W16, W14
178 pxor W16, [rsp + ((%%memW - 8) & 15) * 16]
179 pxor W16, [rsp + ((%%memW - 3) & 15) * 16]
180 movdqa %%regF, W16
181 pslld W16, 1
182 psrld %%regF, (32-1)
183 por %%regF, W16
184 ROTATE_W
185
186 movdqa [rsp + ((%%memW - 0) & 15) * 16],%%regF
187 paddd %%regE,%%regF
188 movdqa %%regT,%%regA
189 PROLD %%regT,5, %%regF
190 paddd %%regE,%%regT
191 %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
192 PROLD %%regB,30, %%regT
193 paddd %%regE,%%regF
194%endmacro
195
196;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
197;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
198;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
199
200;; FRAMESZ plus pushes must be an odd multiple of 8
201%define XMM_SAVE ((15-15)*16 + 1*8)
202%define FRAMESZ 16*16 + XMM_SAVE
203%define _XMM FRAMESZ - XMM_SAVE
1e59de90 204
7c673cae
FG
205%define MOVPS movups
206
207%define inp0 r8
208%define inp1 r9
209%define inp2 r10
210%define inp3 r11
211
212%define IDX rax
213
214%define A xmm0
215%define B xmm1
216%define C xmm2
217%define D xmm3
218%define E xmm4
219%define F xmm5 ; tmp
220%define G xmm6 ; tmp
221
222%define TMP G
223%define FUN F
224%define K xmm7
225
226%define AA xmm8
227%define BB xmm9
228%define CC xmm10
229%define DD xmm11
230%define EE xmm12
1e59de90 231
7c673cae
FG
232%define T0 xmm6
233%define T1 xmm7
234%define T2 xmm8
235%define T3 xmm9
236%define T4 xmm10
237%define T5 xmm11
238
239%macro ROTATE_ARGS 0
240%xdefine TMP_ E
241%xdefine E D
242%xdefine D C
243%xdefine C B
244%xdefine B A
245%xdefine A TMP_
246%endm
247
248%define W14 xmm13
249%define W15 xmm14
250%define W16 xmm15
251
252%macro ROTATE_W 0
253%xdefine TMP_ W16
254%xdefine W16 W15
255%xdefine W15 W14
256%xdefine W14 TMP_
257%endm
258
259%define DIGEST_SIZE (4*5*4)
260
261%ifidn __OUTPUT_FORMAT__, elf64
262 ; Linux
263 %define ARG1 rdi
264 %define ARG2 rsi
265%else
266 ; Windows
267 %define ARG1 rcx
268 %define ARG2 rdx
269%endif
270
271align 32
272
273; void sha1_mb_x4_sse(SHA1_MB_ARGS_X8 *args, uint32_t size_in_blocks);
274; arg 1 : ARG1 : pointer to args (only 4 of the 8 lanes used)
275; arg 2 : ARG2 : size (in blocks) ;; assumed to be >= 1
276;
277; Clobbers registers: ARG2, rax, r8-r11, xmm0-xmm15
278;
1e59de90 279mk_global sha1_mb_x4_sse, function, internal
7c673cae 280sha1_mb_x4_sse:
1e59de90 281 endbranch
7c673cae
FG
282
283 sub rsp, FRAMESZ ;; FRAMESZ + pushes must be odd multiple of 8
1e59de90 284
7c673cae
FG
285 ;; Initialize digests
286 movdqa A, [ARG1 + 0*16]
287 movdqa B, [ARG1 + 1*16]
288 movdqa C, [ARG1 + 2*16]
289 movdqa D, [ARG1 + 3*16]
290 movdqa E, [ARG1 + 4*16]
1e59de90 291
7c673cae
FG
292 ;; load input pointers
293 mov inp0,[ARG1 + _data_ptr + 0*8]
294 mov inp1,[ARG1 + _data_ptr + 1*8]
295 mov inp2,[ARG1 + _data_ptr + 2*8]
296 mov inp3,[ARG1 + _data_ptr + 3*8]
297
298 xor IDX, IDX
299lloop:
300 movdqa F, [PSHUFFLE_BYTE_FLIP_MASK]
301%assign I 0
302%rep 4
303 MOVPS T2,[inp0+IDX]
304 MOVPS T1,[inp1+IDX]
305 MOVPS T4,[inp2+IDX]
306 MOVPS T3,[inp3+IDX]
307 TRANSPOSE T2, T1, T4, T3, T0, T5
308 pshufb T0, F
309 movdqa [rsp+(I*4+0)*16],T0
310 pshufb T1, F
311 movdqa [rsp+(I*4+1)*16],T1
312 pshufb T2, F
313 movdqa [rsp+(I*4+2)*16],T2
314 pshufb T3, F
315 movdqa [rsp+(I*4+3)*16],T3
316 add IDX, 4*4
317%assign I (I+1)
318%endrep
319
320 ; save old digests
321 movdqa AA, A
322 movdqa BB, B
323 movdqa CC, C
324 movdqa DD, D
325 movdqa EE, E
326
327;;
328;; perform 0-79 steps
329;;
330 movdqa K, [K00_19]
331;; do rounds 0...15
332%assign I 0
333%rep 16
334 SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
335 ROTATE_ARGS
336%assign I (I+1)
337%endrep
338
339;; do rounds 16...19
340 movdqa W16, [rsp + ((16 - 16) & 15) * 16]
341 movdqa W15, [rsp + ((16 - 15) & 15) * 16]
342%rep 4
343 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
344 ROTATE_ARGS
345%assign I (I+1)
346%endrep
347
348;; do rounds 20...39
349 movdqa K, [K20_39]
350%rep 20
351 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1
352 ROTATE_ARGS
353%assign I (I+1)
354%endrep
355
356;; do rounds 40...59
357 movdqa K, [K40_59]
358%rep 20
359 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2
360 ROTATE_ARGS
361%assign I (I+1)
362%endrep
363
364;; do rounds 60...79
365 movdqa K, [K60_79]
366%rep 20
367 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3
368 ROTATE_ARGS
369%assign I (I+1)
370%endrep
371
372 paddd A,AA
373 paddd B,BB
374 paddd C,CC
375 paddd D,DD
376 paddd E,EE
377
378 sub ARG2, 1
379 jne lloop
380
381 ; write out digests
382 movdqa [ARG1 + 0*16], A
383 movdqa [ARG1 + 1*16], B
384 movdqa [ARG1 + 2*16], C
385 movdqa [ARG1 + 3*16], D
386 movdqa [ARG1 + 4*16], E
387
388 ; update input pointers
389 add inp0, IDX
390 mov [ARG1 + _data_ptr + 0*8], inp0
391 add inp1, IDX
392 mov [ARG1 + _data_ptr + 1*8], inp1
393 add inp2, IDX
394 mov [ARG1 + _data_ptr + 2*8], inp2
395 add inp3, IDX
396 mov [ARG1 + _data_ptr + 3*8], inp3
397
398 ;;;;;;;;;;;;;;;;
399 ;; Postamble
1e59de90 400
7c673cae
FG
401 add rsp, FRAMESZ
402
403 ret
404
405
406section .data align=16
407
408align 16
409PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
410K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999
411K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
412K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
413K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6