;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%include "sha1_mb_mgr_datastruct.asm"
%include "reg_sizes.asm"

default rel

;; code to compute quad SHA1 using AVX
;; derived from ...\sha1_multiple\sha1_quad4.asm
;; variation of sha1_mult2.asm : clobbers all xmm regs, rcx left intact

; transpose r0, r1, r2, r3, t0, t1
40 ; "transpose" data in {r0..r3} using temps {t0..t3}
; Input looks like: {r0 r1 r2 r3}
; r0 = {a3 a2 a1 a0}
; r1 = {b3 b2 b1 b0}
; r2 = {c3 c2 c1 c0}
; r3 = {d3 d2 d1 d0}
;
; output looks like: {t0 r1 r0 r3}
; t0 = {d0 c0 b0 a0}
; r1 = {d1 c1 b1 a1}
; r0 = {d2 c2 b2 a2}
; r3 = {d3 c3 b3 a3}
;
%macro TRANSPOSE 6
%define %%r0 %1
%define %%r1 %2
%define %%r2 %3
%define %%r3 %4
%define %%t0 %5
%define %%t1 %6
        vshufps %%t0, %%r0, %%r1, 0x44  ; t0 = {b1 b0 a1 a0}
        vshufps %%r0, %%r0, %%r1, 0xEE  ; r0 = {b3 b2 a3 a2}

        vshufps %%t1, %%r2, %%r3, 0x44  ; t1 = {d1 d0 c1 c0}
        vshufps %%r2, %%r2, %%r3, 0xEE  ; r2 = {d3 d2 c3 c2}

        vshufps %%r1, %%t0, %%t1, 0xDD  ; r1 = {d1 c1 b1 a1}

        vshufps %%r3, %%r0, %%r2, 0xDD  ; r3 = {d3 c3 b3 a3}

        vshufps %%r0, %%r0, %%r2, 0x88  ; r0 = {d2 c2 b2 a2}
        vshufps %%t0, %%t0, %%t1, 0x88  ; t0 = {d0 c0 b0 a0}
%endmacro
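
; Note on the vshufps immediates: vshufps dst, src1, src2, imm selects
; dst[0]=src1[imm[1:0]], dst[1]=src1[imm[3:2]], dst[2]=src2[imm[5:4]],
; dst[3]=src2[imm[7:6]].  So 0x44/0xEE pair up the low/high dword halves
; of two inputs, and 0xDD/0x88 then gather the odd/even dwords, giving
; the 4x4 dword transpose above in six shuffles.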
;;
;; Magic functions defined in FIPS 180-1
;;
; macro MAGIC_F0 F,B,C,D,T   ;; F = (D ^ (B & (C ^ D)))
%macro MAGIC_F0 5
%define %%regF %1
%define %%regB %2
%define %%regC %3
%define %%regD %4
%define %%regT %5
        vpxor   %%regF, %%regC,%%regD
        vpand   %%regF, %%regF,%%regB
        vpxor   %%regF, %%regF,%%regD
%endmacro
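
; MAGIC_F0 is the "Ch" function of rounds 0..19: (B & C) | (~B & D),
; rewritten as D ^ (B & (C ^ D)) so it needs only three ops and no NOT.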

; macro MAGIC_F1 F,B,C,D,T   ;; F = (B ^ C ^ D)
%macro MAGIC_F1 5
%define %%regF %1
%define %%regB %2
%define %%regC %3
%define %%regD %4
%define %%regT %5
        vpxor   %%regF,%%regD,%%regC
        vpxor   %%regF,%%regF,%%regB
%endmacro

; macro MAGIC_F2 F,B,C,D,T   ;; F = ((B & C) | (B & D) | (C & D))
%macro MAGIC_F2 5
%define %%regF %1
%define %%regB %2
%define %%regC %3
%define %%regD %4
%define %%regT %5
        vpor    %%regF,%%regB,%%regC
        vpand   %%regT,%%regB,%%regC
        vpand   %%regF,%%regF,%%regD
        vpor    %%regF,%%regF,%%regT
%endmacro
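
; MAGIC_F2 is the "Maj" (majority) function of rounds 40..59, computed via
; the identity (B & C) | (B & D) | (C & D) == ((B | C) & D) | (B & C),
; which takes four ops instead of five.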

; macro MAGIC_F3 F,B,C,D,T   ;; F = (B ^ C ^ D)
%macro MAGIC_F3 5
%define %%regF %1
%define %%regB %2
%define %%regC %3
%define %%regD %4
%define %%regT %5
        MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
%endmacro

; PROLD reg, imm, tmp
%macro PROLD 3
%define %%reg %1
%define %%imm %2
%define %%tmp %3
        vpsrld  %%tmp, %%reg, (32-(%%imm))
        vpslld  %%reg, %%reg, %%imm
        vpor    %%reg, %%reg, %%tmp
%endmacro

; non-destructive
; PROLD_nd reg, imm, tmp, src
%macro PROLD_nd 4
%define %%reg %1
%define %%imm %2
%define %%tmp %3
%define %%src %4
        vpsrld  %%tmp, %%src, (32-(%%imm))
        vpslld  %%reg, %%src, %%imm
        vpor    %%reg, %%reg, %%tmp
%endmacro
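
; AVX has no packed dword rotate instruction, so a rotate-left by %%imm is
; built from a left shift, a right shift by (32 - %%imm) and an OR.  PROLD
; rotates %%reg in place; PROLD_nd writes the rotated %%src into %%reg and
; leaves %%src untouched.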

%macro SHA1_STEP_00_15 10
%define %%regA   %1
%define %%regB   %2
%define %%regC   %3
%define %%regD   %4
%define %%regE   %5
%define %%regT   %6
%define %%regF   %7
%define %%memW   %8
%define %%immCNT %9
%define %%MAGIC  %10
        vpaddd  %%regE, %%regE,%%immCNT
        vpaddd  %%regE, %%regE,[rsp + (%%memW * 16)]
        PROLD_nd %%regT,5, %%regF,%%regA
        vpaddd  %%regE, %%regE,%%regT
        %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT      ;; FUN = MAGIC_Fi(B,C,D)
        PROLD   %%regB,30, %%regT
        vpaddd  %%regE, %%regE,%%regF
%endmacro
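
; Scalar equivalent of one round i in 0..15, applied to four lanes at once:
;   E += W[i] + K + ROL(A,5) + MAGIC_F(B,C,D);  B = ROL(B,30)
; The caller then invokes ROTATE_ARGS to rename A..E for the next round.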

%macro SHA1_STEP_16_79 10
%define %%regA   %1
%define %%regB   %2
%define %%regC   %3
%define %%regD   %4
%define %%regE   %5
%define %%regT   %6
%define %%regF   %7
%define %%memW   %8
%define %%immCNT %9
%define %%MAGIC  %10
        vpaddd  %%regE, %%regE,%%immCNT

        vmovdqa W14, [rsp + ((%%memW - 14) & 15) * 16]
        vpxor   W16, W16, W14
        vpxor   W16, W16, [rsp + ((%%memW - 8) & 15) * 16]
        vpxor   W16, W16, [rsp + ((%%memW - 3) & 15) * 16]

        vpsrld  %%regF, W16, (32-1)
        vpslld  W16, W16, 1
        vpor    %%regF, %%regF, W16
        ROTATE_W

        vmovdqa [rsp + ((%%memW - 0) & 15) * 16],%%regF
        vpaddd  %%regE, %%regE,%%regF

        PROLD_nd %%regT,5, %%regF, %%regA
        vpaddd  %%regE, %%regE,%%regT
        %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT      ;; FUN = MAGIC_Fi(B,C,D)
        PROLD   %%regB,30, %%regT
        vpaddd  %%regE,%%regE,%%regF
%endmacro
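
; Rounds 16..79 first extend the message schedule,
;   W[i] = ROL(W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1)
; using a 16-entry circular buffer of W values on the stack (indexed by
; i & 15), with W[i-16] tracked in the rotating W16 register, and then do
; the same round update as SHA1_STEP_00_15 with the new W[i].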

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; FRAMESZ plus pushes must be an odd multiple of 8
%define XMM_SAVE ((15-15)*16 + 1*8)
%define FRAMESZ  16*16 + XMM_SAVE
%define _XMM     FRAMESZ - XMM_SAVE
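; FRAMESZ is 16 stack slots of 16 bytes for the W[] circular buffer plus
; XMM_SAVE (here just 8 bytes of padding; no xmm registers are saved).
; Keeping FRAMESZ plus any pushes an odd multiple of 8 leaves rsp, which
; is 8 mod 16 at function entry, 16-byte aligned after the prologue, as
; the aligned vmovdqa accesses to [rsp + ...] require.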

%define VMOVPS  vmovups

%define inp0 r8
%define inp1 r9
%define inp2 r10
%define inp3 r11

%define IDX rax

%define A       xmm0
%define B       xmm1
%define C       xmm2
%define D       xmm3
%define E       xmm4
%define F       xmm5 ; tmp
%define G       xmm6 ; tmp

%define TMP     G
%define FUN     F
%define K       xmm7

%define AA      xmm8
%define BB      xmm9
%define CC      xmm10
%define DD      xmm11
%define EE      xmm12

%define T0      xmm6
%define T1      xmm7
%define T2      xmm8
%define T3      xmm9
%define T4      xmm10
%define T5      xmm11

%macro ROTATE_ARGS 0
%xdefine TMP_ E
%xdefine E D
%xdefine D C
%xdefine C B
%xdefine B A
%xdefine A TMP_
%endm
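
; ROTATE_ARGS renames the A..E aliases at assembly time, so the register
; that accumulated the new value in E is addressed as A in the next round;
; SHA-1's per-round variable rotation therefore costs no register moves.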

%define W14     xmm13
%define W15     xmm14
%define W16     xmm15

%macro ROTATE_W 0
%xdefine TMP_ W16
%xdefine W16 W15
%xdefine W15 W14
%xdefine W14 TMP_
%endm
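
; ROTATE_W renames the schedule registers likewise: the value addressed as
; W15 (W[i-15]) in this round becomes W16 (W[i-16]) in the next one, while
; W14 is reloaded from the stack every round.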

%define DIGEST_SIZE     (4*5*4)

;%ifdef LINUX
%ifidn __OUTPUT_FORMAT__, elf64
%define ARG1 rdi
%define ARG2 rsi
%else
; Windows
%define ARG1 rcx
%define ARG2 rdx
%endif

align 32

; void sha1_mb_x4_avx(SHA1_MB_ARGS_X8 *args, uint32_t size_in_blocks);
; arg 1 : ARG1 : pointer to args (only 4 of the 8 lanes used)
; arg 2 : ARG2 : size (in blocks) ;; assumed to be >= 1
;
; Clobbers registers: ARG2, rax, r8-r11, xmm0-xmm15
;
global sha1_mb_x4_avx:function internal
sha1_mb_x4_avx:

        sub     rsp, FRAMESZ    ;; FRAMESZ + pushes must be odd multiple of 8

        ;; Initialize digests
        vmovdqa A, [ARG1 + 0*16]
        vmovdqa B, [ARG1 + 1*16]
        vmovdqa C, [ARG1 + 2*16]
        vmovdqa D, [ARG1 + 3*16]
        vmovdqa E, [ARG1 + 4*16]

        ;; load input pointers
        mov     inp0,[ARG1 + _data_ptr + 0*8]
        mov     inp1,[ARG1 + _data_ptr + 1*8]
        mov     inp2,[ARG1 + _data_ptr + 2*8]
        mov     inp3,[ARG1 + _data_ptr + 3*8]

        xor     IDX, IDX
lloop:
        vmovdqa F, [PSHUFFLE_BYTE_FLIP_MASK]
%assign I 0
%rep 4
        VMOVPS  T2,[inp0+IDX]
        VMOVPS  T1,[inp1+IDX]
        VMOVPS  T4,[inp2+IDX]
        VMOVPS  T3,[inp3+IDX]
        TRANSPOSE       T2, T1, T4, T3, T0, T5
        vpshufb T0, T0, F
        vmovdqa [rsp+(I*4+0)*16],T0
        vpshufb T1, T1, F
        vmovdqa [rsp+(I*4+1)*16],T1
        vpshufb T2, T2, F
        vmovdqa [rsp+(I*4+2)*16],T2
        vpshufb T3, T3, F
        vmovdqa [rsp+(I*4+3)*16],T3
        add     IDX, 4*4
%assign I (I+1)
%endrep
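
        ; Each of the four iterations above loads 16 bytes from every lane,
        ; transposes them so that one 16-byte stack slot holds the same
        ; message dword from all four lanes, and byte-swaps each dword to
        ; big-endian with vpshufb.  After four iterations the 64-byte block
        ; of each lane has been spread across W[0..15] on the stack.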

        ; save old digests
        vmovdqa AA, A
        vmovdqa BB, B
        vmovdqa CC, C
        vmovdqa DD, D
        vmovdqa EE, E

;;
;; perform 0-79 steps
;;
        vmovdqa K, [K00_19]
;; do rounds 0...15
%assign I 0
%rep 16
        SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
        ROTATE_ARGS
%assign I (I+1)
%endrep

;; do rounds 16...19
        vmovdqa W16, [rsp + ((16 - 16) & 15) * 16]
        vmovdqa W15, [rsp + ((16 - 15) & 15) * 16]
%rep 4
        SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
        ROTATE_ARGS
%assign I (I+1)
%endrep

;; do rounds 20...39
        vmovdqa K, [K20_39]
%rep 20
        SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1
        ROTATE_ARGS
%assign I (I+1)
%endrep

;; do rounds 40...59
        vmovdqa K, [K40_59]
%rep 20
        SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2
        ROTATE_ARGS
%assign I (I+1)
%endrep

;; do rounds 60...79
        vmovdqa K, [K60_79]
%rep 20
        SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3
        ROTATE_ARGS
%assign I (I+1)
%endrep

        vpaddd  A,A,AA
        vpaddd  B,B,BB
        vpaddd  C,C,CC
        vpaddd  D,D,DD
        vpaddd  E,E,EE
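
        ; Feed-forward per FIPS 180-1: add the working variables back into
        ; the digest values saved before this block, independently in each
        ; of the four lanes.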

        sub     ARG2, 1
        jne     lloop

        ; write out digests
        vmovdqa [ARG1 + 0*16], A
        vmovdqa [ARG1 + 1*16], B
        vmovdqa [ARG1 + 2*16], C
        vmovdqa [ARG1 + 3*16], D
        vmovdqa [ARG1 + 4*16], E

        ; update input pointers
        add     inp0, IDX
        mov     [ARG1 + _data_ptr + 0*8], inp0
        add     inp1, IDX
        mov     [ARG1 + _data_ptr + 1*8], inp1
        add     inp2, IDX
        mov     [ARG1 + _data_ptr + 2*8], inp2
        add     inp3, IDX
        mov     [ARG1 + _data_ptr + 3*8], inp3

        ;;;;;;;;;;;;;;;;
        ;; Postamble

        add     rsp, FRAMESZ

        ret


section .data align=16

align 16
PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999
K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
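
; PSHUFFLE_BYTE_FLIP_MASK reverses the bytes within each 32-bit dword so
; that vpshufb turns the little-endian loads into the big-endian words
; SHA-1 operates on.  K00_19..K60_79 are the four FIPS 180-1 round
; constants, each replicated into all four 32-bit lanes of an xmm register.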