]> git.proxmox.com Git - ceph.git/blob - ceph/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx.asm
add subtree-ish sources for 12.0.3
[ceph.git] / ceph / src / crypto / isa-l / isa-l_crypto / mh_sha1 / mh_sha1_block_avx.asm
1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
3 ;
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
6 ; are met:
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
12 ; distribution.
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
16 ;
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29
30 ;; code to compute 16 SHA1 using AVX
31 ;;
32
33 %include "reg_sizes.asm"
34 default rel
35
36 ;; Magic functions defined in FIPS 180-1
37 ;;
38 ; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D)))
39 %macro MAGIC_F0 5
40 %define %%regF %1
41 %define %%regB %2
42 %define %%regC %3
43 %define %%regD %4
44 %define %%regT %5
45 	vpxor %%regF, %%regC,%%regD		; F = C ^ D
46 	vpand %%regF, %%regF,%%regB		; F = B & (C ^ D)
47 	vpxor %%regF, %%regF,%%regD		; F = D ^ (B & (C ^ D))  -- the SHA-1 "Ch" function, rounds 0-19
48 %endmacro
49
50 ; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D)
51 %macro MAGIC_F1 5
52 %define %%regF %1
53 %define %%regB %2
54 %define %%regC %3
55 %define %%regD %4
56 %define %%regT %5
57 	vpxor %%regF,%%regD,%%regC		; F = D ^ C
58 	vpxor %%regF,%%regF,%%regB		; F = B ^ C ^ D  -- the SHA-1 "Parity" function, rounds 20-39
59 %endmacro
60
61 ; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D))
62 %macro MAGIC_F2 5
63 %define %%regF %1
64 %define %%regB %2
65 %define %%regC %3
66 %define %%regD %4
67 %define %%regT %5
68 	vpor %%regF,%%regB,%%regC		; F = B | C
69 	vpand %%regT,%%regB,%%regC		; T = B & C
70 	vpand %%regF,%%regF,%%regD		; F = (B | C) & D
71 	vpor %%regF,%%regF,%%regT		; F = ((B|C)&D) | (B&C)  -- equivalent to Maj(B,C,D), rounds 40-59
72 %endmacro
73
74 ; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D)
75 %macro MAGIC_F3 5
76 %define %%regF %1
77 %define %%regB %2
78 %define %%regC %3
79 %define %%regD %4
80 %define %%regT %5
; Rounds 60-79 reuse the Parity function (B ^ C ^ D), same as rounds 20-39.
81 	MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
82 %endmacro
83
84 ; PROLD reg, imm, tmp
; In-place rotate-left of each 32-bit lane: reg = ROL32(reg, imm).
; Destroys tmp; flags unaffected (vector ops).
85 %macro PROLD 3
86 %define %%reg %1
87 %define %%imm %2
88 %define %%tmp %3
89 	vpsrld %%tmp, %%reg, (32-(%%imm))	; tmp = reg >> (32-imm)
90 	vpslld %%reg, %%reg, %%imm		; reg = reg << imm
91 	vpor %%reg, %%reg, %%tmp		; combine halves -> rotate
92 %endmacro
93
94 ; non-destructive
95 ; PROLD_nd reg, imm, tmp, src
; Non-destructive rotate-left: reg = ROL32(src, imm); src is preserved.
; Destroys tmp.
96 %macro PROLD_nd 4
97 %define %%reg %1
98 %define %%imm %2
99 %define %%tmp %3
100 %define %%src %4
101 	vpsrld %%tmp, %%src, (32-(%%imm))	; tmp = src >> (32-imm)
102 	vpslld %%reg, %%src, %%imm		; reg = src << imm
103 	vpor %%reg, %%reg, %%tmp		; combine halves -> rotate
104 %endmacro
105
; One SHA-1 round for rounds 0..15, computed on 4 lanes (segments) at once.
; E += K + W[t] + ROL5(A) + MAGIC(B,C,D);  B = ROL30(B).
; W[t] is read directly from the big-endian message buffer at %%data.
; Caller rotates the A..E roles afterwards via ROTATE_ARGS.
106 %macro SHA1_STEP_00_15 11
107 %define %%regA %1
108 %define %%regB %2
109 %define %%regC %3
110 %define %%regD %4
111 %define %%regE %5
112 %define %%regT %6
113 %define %%regF %7
114 %define %%memW %8
115 %define %%immCNT %9
116 %define %%MAGIC %10
117 %define %%data %11
118 	vpaddd %%regE, %%regE,%%immCNT			; E += K (round constant, broadcast in all lanes)
119 	vpaddd %%regE, %%regE,[%%data + (%%memW * 16)]	; E += W[t] (16 bytes = one dword per segment lane)
120 	PROLD_nd %%regT,5, %%regF,%%regA		; T = ROL5(A)
121 	vpaddd %%regE, %%regE,%%regT			; E += ROL5(A)
122 	%%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT	;; FUN = MAGIC_Fi(B,C,D)
123 	PROLD %%regB,30, %%regT				; B = ROL30(B)
124 	vpaddd %%regE, %%regE,%%regF			; E += Fi(B,C,D)
125 %endmacro
126
; One SHA-1 round for rounds 16..79 with on-the-fly message schedule:
;   W[t] = ROL1(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16])
; The 16-entry schedule is kept as a circular buffer in %%data (index = t & 15,
; 16 bytes per entry, one dword per segment lane).  W16/W14 registers carry
; W[t-16]/W[t-14] across iterations; ROTATE_W renames them each round.
127 %macro SHA1_STEP_16_79 11
128 %define %%regA %1
129 %define %%regB %2
130 %define %%regC %3
131 %define %%regD %4
132 %define %%regE %5
133 %define %%regT %6
134 %define %%regF %7
135 %define %%memW %8
136 %define %%immCNT %9
137 %define %%MAGIC %10
138 %define %%data %11
139 	vpaddd %%regE, %%regE,%%immCNT			; E += K
140
141 	vmovdqa W14, [%%data + ((%%memW - 14) & 15) * 16]
142 	vpxor W16, W16, W14				; W[t-16] ^ W[t-14]
143 	vpxor W16, W16, [%%data + ((%%memW - 8) & 15) * 16]	; ^ W[t-8]
144 	vpxor W16, W16, [%%data + ((%%memW - 3) & 15) * 16]	; ^ W[t-3]
145
146 	vpsrld %%regF, W16, (32-1)			; F = ROL1(xor result) ...
147 	vpslld W16, W16, 1
148 	vpor %%regF, %%regF, W16			; ... = new W[t]
149 	ROTATE_W					; rename W16<-W15<-W14 for next round
150
151 	vmovdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF	; store W[t] back into circular buffer
152 	vpaddd %%regE, %%regE,%%regF			; E += W[t]
153
154 	PROLD_nd %%regT,5, %%regF, %%regA		; T = ROL5(A)
155 	vpaddd %%regE, %%regE,%%regT			; E += ROL5(A)
156 	%%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT	;; FUN = MAGIC_Fi(B,C,D)
157 	PROLD %%regB,30, %%regT				; B = ROL30(B)
158 	vpaddd %%regE,%%regE,%%regF			; E += Fi(B,C,D)
159 %endmacro
160
161 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
162 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
163 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; ABI selection: SysV AMD64 (elf64) vs Microsoft x64.  arg0..arg5 / tmp1..tmp6
; map the portable names used below onto each ABI's registers; FUNC_SAVE /
; FUNC_RESTORE preserve exactly the callee-saved registers this routine touches.
164 %ifidn __OUTPUT_FORMAT__, elf64
165 ; Linux
166 %define arg0 rdi
167 %define arg1 rsi
168 %define arg2 rdx
169 %define arg3 rcx
170
171 %define arg4 r8
172 %define arg5 r9
173
174 %define tmp1 r10
175 %define tmp2 r11
176 %define tmp3 r12 ; must be saved and restored
177 %define tmp4 r13 ; must be saved and restored
178 %define tmp5 r14 ; must be saved and restored
179 %define tmp6 r15 ; must be saved and restored
180 %define return rax
181
182 %define func(x) x:
183 %macro FUNC_SAVE 0
; SysV: only r12-r15 are callee-saved among the tmps; xmm regs are all volatile.
184 	push r12
185 	push r13
186 	push r14
187 	push r15
188 %endmacro
189 %macro FUNC_RESTORE 0
190 	pop r15
191 	pop r14
192 	pop r13
193 	pop r12
194 %endmacro
195 %else
196 ; Windows
197 %define arg0 rcx
198 %define arg1 rdx
199 %define arg2 r8
200 %define arg3 r9
201
202 %define arg4 r10
203 %define arg5 r11
204 %define tmp1 r12 ; must be saved and restored
205 %define tmp2 r13 ; must be saved and restored
206 %define tmp3 r14 ; must be saved and restored
207 %define tmp4 r15 ; must be saved and restored
208 %define tmp5 rdi ; must be saved and restored
209 %define tmp6 rsi ; must be saved and restored
210 %define return rax
211
212 %define stack_size 10*16 + 7*8 ; must be an odd multiple of 8
213 %define func(x) proc_frame x
214 %macro FUNC_SAVE 0
; Win64: xmm6-xmm15 and rdi/rsi/r12-r15 are callee-saved; save via the
; proc_frame unwind-aware helpers so SEH unwinding stays correct.
215 	alloc_stack stack_size
216 	save_xmm128 xmm6, 0*16
217 	save_xmm128 xmm7, 1*16
218 	save_xmm128 xmm8, 2*16
219 	save_xmm128 xmm9, 3*16
220 	save_xmm128 xmm10, 4*16
221 	save_xmm128 xmm11, 5*16
222 	save_xmm128 xmm12, 6*16
223 	save_xmm128 xmm13, 7*16
224 	save_xmm128 xmm14, 8*16
225 	save_xmm128 xmm15, 9*16
226 	save_reg r12, 10*16 + 0*8
227 	save_reg r13, 10*16 + 1*8
228 	save_reg r14, 10*16 + 2*8
229 	save_reg r15, 10*16 + 3*8
230 	save_reg rdi, 10*16 + 4*8
231 	save_reg rsi, 10*16 + 5*8
232 	end_prolog
233 %endmacro
234
235 %macro FUNC_RESTORE 0
236 	movdqa xmm6, [rsp + 0*16]
237 	movdqa xmm7, [rsp + 1*16]
238 	movdqa xmm8, [rsp + 2*16]
239 	movdqa xmm9, [rsp + 3*16]
240 	movdqa xmm10, [rsp + 4*16]
241 	movdqa xmm11, [rsp + 5*16]
242 	movdqa xmm12, [rsp + 6*16]
243 	movdqa xmm13, [rsp + 7*16]
244 	movdqa xmm14, [rsp + 8*16]
245 	movdqa xmm15, [rsp + 9*16]
246 	mov r12, [rsp + 10*16 + 0*8]
247 	mov r13, [rsp + 10*16 + 1*8]
248 	mov r14, [rsp + 10*16 + 2*8]
249 	mov r15, [rsp + 10*16 + 3*8]
250 	mov rdi, [rsp + 10*16 + 4*8]
251 	mov rsi, [rsp + 10*16 + 5*8]
252 	add rsp, stack_size
253 %endmacro
254 %endif
255 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
256 %define loops arg3		; num_blocks: count of 1KB blocks to process
257 ;variables of mh_sha1
258 %define mh_in_p arg0		; input data pointer (advances 1KB per block)
259 %define mh_digests_p arg1	; digests[SHA1_DIGEST_WORDS][HASH_SEGS]
260 %define mh_data_p arg2		; aligned frame buffer holding big-endian data
261 %define mh_segs tmp1		; byte offset selecting the current 4-segment group
262 ;variables used by storing segs_digests on stack
263 %define RSP_SAVE tmp2		; caller rsp, saved across the aligned stack frame
264 %define FRAMESZ 4*5*16 ;BYTES*DWORDS*SEGS
265
266 %define pref tmp3		; rolling prefetch offset into the input stream
267 %macro PREFETCH_X 1
268 %define %%mem %1
269 	prefetchnta %%mem	; non-temporal prefetch: input is streamed, not reused
270 %endmacro
271 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
272 %define VMOVPS vmovups		; unaligned load/store for caller-supplied buffers
273
; SHA-1 working state, one 32-bit lane per segment (4 segments per xmm).
274 %define A xmm0
275 %define B xmm1
276 %define C xmm2
277 %define D xmm3
278 %define E xmm4
279 %define F xmm5 ; tmp
280 %define G xmm6 ; tmp
281
282 %define TMP G
283 %define FUN F
284 %define K xmm7			; round constant, broadcast to all lanes
285
; Saved initial digests for the end-of-block additions (A+=AA, ...).
286 %define AA xmm8
287 %define BB xmm9
288 %define CC xmm10
289 %define DD xmm11
290 %define EE xmm12
291
; T0-T5 alias other registers; only used in the byte-swap stage, which does
; not overlap with the round computation that uses A..EE/K.
292 %define T0 xmm6
293 %define T1 xmm7
294 %define T2 xmm8
295 %define T3 xmm9
296 %define T4 xmm10
297 %define T5 xmm11
298
; Compile-time rename A<-B<-C<-D<-E<-A: avoids register moves between rounds.
299 %macro ROTATE_ARGS 0
300 %xdefine TMP_ E
301 %xdefine E D
302 %xdefine D C
303 %xdefine C B
304 %xdefine B A
305 %xdefine A TMP_
306 %endm
307
; Sliding window of the message schedule kept in registers (see SHA1_STEP_16_79).
308 %define W14 xmm13
309 %define W15 xmm14
310 %define W16 xmm15
311
; Compile-time rename W16<-W15<-W14<-W16, advancing the schedule window.
312 %macro ROTATE_W 0
313 %xdefine TMP_ W16
314 %xdefine W16 W15
315 %xdefine W15 W14
316 %xdefine W14 TMP_
317 %endm
318
319
320 ;init hash digests
321 ; segs_digests:low addr-> high_addr
322 ; a | b | c | ...| p | (16)
323 ; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap |
324 ; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp |
325 ; ....
326 ; h5 | h5 | h5 | ...| h5 | | Ea| Eb | Ec |...| Ep |
327
328 align 32
329
330 ;void mh_sha1_block_avx(const uint8_t * input_data, uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
331 ; uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks);
332 ; arg 0 pointer to input data
333 ; arg 1 pointer to digests, include segments digests(uint32_t digests[16][5])
334 ; arg 2 pointer to aligned_frame_buffer which is used to save the big_endian data.
335 ; arg 3 number of 1KB blocks
336 ;
; Processes num_blocks 1KB blocks; each block holds one 64-byte SHA-1 block for
; each of 16 interleaved segments, handled 4 segments at a time per xmm lane.
; Only xmm (128-bit) AVX is used, so no vzeroupper is needed on return.
337 global mh_sha1_block_avx:function internal
338 func(mh_sha1_block_avx)
339 	FUNC_SAVE
340 	; save rsp
341 	mov RSP_SAVE, rsp
342
343 	cmp loops, 0
344 	jle .return
345
346 	; leave enough space to store segs_digests
347 	sub rsp, FRAMESZ
348 	; align rsp to 16 Bytes needed by avx
349 	and rsp, ~0x0F
350
351 %assign I 0 ; copy segs_digests into stack
352 %rep 5
; Unaligned loads from caller's digest array, aligned stores to our frame.
353 	VMOVPS A, [mh_digests_p + I*64 + 16*0]
354 	VMOVPS B, [mh_digests_p + I*64 + 16*1]
355 	VMOVPS C, [mh_digests_p + I*64 + 16*2]
356 	VMOVPS D, [mh_digests_p + I*64 + 16*3]
357
358 	vmovdqa [rsp + I*64 + 16*0], A
359 	vmovdqa [rsp + I*64 + 16*1], B
360 	vmovdqa [rsp + I*64 + 16*2], C
361 	vmovdqa [rsp + I*64 + 16*3], D
362 %assign I (I+1)
363 %endrep
364
365
366 .block_loop:
367 	;transform to big-endian data and store on aligned_frame
368 	vmovdqa F, [PSHUFFLE_BYTE_FLIP_MASK]
369 	;transform input data from DWORD*16_SEGS*5 to DWORD*4_SEGS*5*4
370 %assign I 0
371 %rep 16
; Byte-swap each 16-byte group and scatter into four 256-byte regions of the
; frame buffer, one region per 4-segment group.
372 	VMOVPS T0,[mh_in_p + I*64+0*16]
373 	VMOVPS T1,[mh_in_p + I*64+1*16]
374 	VMOVPS T2,[mh_in_p + I*64+2*16]
375 	VMOVPS T3,[mh_in_p + I*64+3*16]
376
377 	vpshufb T0, F
378 	vmovdqa [mh_data_p +(I)*16 +0*256],T0
379 	vpshufb T1, F
380 	vmovdqa [mh_data_p +(I)*16 +1*256],T1
381 	vpshufb T2, F
382 	vmovdqa [mh_data_p +(I)*16 +2*256],T2
383 	vpshufb T3, F
384 	vmovdqa [mh_data_p +(I)*16 +3*256],T3
385 %assign I (I+1)
386 %endrep
387
388 	mov mh_segs, 0			;start from the first 4 segments
389 	mov pref, 1024			;avoid prefetching repeatedly
390 .segs_loop:
391 	;; Initialize digests
392 	vmovdqa A, [rsp + 0*64 + mh_segs]
393 	vmovdqa B, [rsp + 1*64 + mh_segs]
394 	vmovdqa C, [rsp + 2*64 + mh_segs]
395 	vmovdqa D, [rsp + 3*64 + mh_segs]
396 	vmovdqa E, [rsp + 4*64 + mh_segs]
397
; Keep copies of the initial state for the final Davies-Meyer additions.
398 	vmovdqa AA, A
399 	vmovdqa BB, B
400 	vmovdqa CC, C
401 	vmovdqa DD, D
402 	vmovdqa EE, E
403 ;;
404 ;; perform 0-79 steps
405 ;;
406 	vmovdqa K, [K00_19]
407 ;; do rounds 0...15
408 %assign I 0
409 %rep 16
410 	SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p
411 	ROTATE_ARGS
412 %assign I (I+1)
413 %endrep
414
415 ;; do rounds 16...19
; Seed the W register window before the schedule-generating rounds.
416 	vmovdqa W16, [mh_data_p + ((16 - 16) & 15) * 16]
417 	vmovdqa W15, [mh_data_p + ((16 - 15) & 15) * 16]
418 %rep 4
419 	SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p
420 	ROTATE_ARGS
421 %assign I (I+1)
422 %endrep
423 	PREFETCH_X [mh_in_p + pref+128*0]	; prefetch the next 1KB block while rounds run
424 ;; do rounds 20...39
425 	vmovdqa K, [K20_39]
426 %rep 20
427 	SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1, mh_data_p
428 	ROTATE_ARGS
429 %assign I (I+1)
430 %endrep
431
432 ;; do rounds 40...59
433 	vmovdqa K, [K40_59]
434 %rep 20
435 	SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2, mh_data_p
436 	ROTATE_ARGS
437 %assign I (I+1)
438 %endrep
439 	PREFETCH_X [mh_in_p + pref+128*1]
440 ;; do rounds 60...79
441 	vmovdqa K, [K60_79]
442 %rep 20
443 	SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3, mh_data_p
444 	ROTATE_ARGS
445 %assign I (I+1)
446 %endrep
447
; Davies-Meyer: add the block's initial state back into the result.
448 	vpaddd A, AA
449 	vpaddd B, BB
450 	vpaddd C, CC
451 	vpaddd D, DD
452 	vpaddd E, EE
453
454 	; write out digests
455 	vmovdqa [rsp + 0*64 + mh_segs], A
456 	vmovdqa [rsp + 1*64 + mh_segs], B
457 	vmovdqa [rsp + 2*64 + mh_segs], C
458 	vmovdqa [rsp + 3*64 + mh_segs], D
459 	vmovdqa [rsp + 4*64 + mh_segs], E
460
461 	add pref, 256
462 	add mh_data_p, 256		; next 4-segment region of the frame buffer
463 	add mh_segs, 16			; next 4-segment column of segs_digests
464 	cmp mh_segs, 64
465 	jc .segs_loop			; loop over the 4 groups of 4 segments
466
467 	sub mh_data_p, (1024)		; rewind frame buffer for the next block
468 	add mh_in_p, (1024)		; advance input by one 1KB block
469 	sub loops, 1
470 	jne .block_loop
471
472
473 %assign I 0 ; copy segs_digests back to mh_digests_p
474 %rep 5
475 	vmovdqa A, [rsp + I*64 + 16*0]
476 	vmovdqa B, [rsp + I*64 + 16*1]
477 	vmovdqa C, [rsp + I*64 + 16*2]
478 	vmovdqa D, [rsp + I*64 + 16*3]
479
480 	VMOVPS [mh_digests_p + I*64 + 16*0], A
481 	VMOVPS [mh_digests_p + I*64 + 16*1], B
482 	VMOVPS [mh_digests_p + I*64 + 16*2], C
483 	VMOVPS [mh_digests_p + I*64 + 16*3], D
484 %assign I (I+1)
485 %endrep
486 	mov rsp, RSP_SAVE		; restore rsp
487
488 .return:
489 	FUNC_RESTORE
490 	ret
491
492 endproc_frame
493
494 section .data align=16
495
496 align 16
; Shuffle mask for vpshufb: reverses byte order within each 32-bit dword
; (little-endian input -> big-endian words, as SHA-1 requires).
497 PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
498
; SHA-1 round constants (FIPS 180), each broadcast into all four 32-bit lanes.
499 K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999
500 K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
501 K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
502 K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6