]> git.proxmox.com Git - ceph.git/blame - ceph/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_sse.asm
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / crypto / isa-l / isa-l_crypto / mh_sha1 / mh_sha1_block_sse.asm
CommitLineData
7c673cae
FG
1;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
3;
4; Redistribution and use in source and binary forms, with or without
1e59de90 5; modification, are permitted provided that the following conditions
7c673cae
FG
6; are met:
7; * Redistributions of source code must retain the above copyright
8; notice, this list of conditions and the following disclaimer.
9; * Redistributions in binary form must reproduce the above copyright
10; notice, this list of conditions and the following disclaimer in
11; the documentation and/or other materials provided with the
12; distribution.
13; * Neither the name of Intel Corporation nor the names of its
14; contributors may be used to endorse or promote products derived
15; from this software without specific prior written permission.
16;
17; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29
30;; code to compute 16 SHA1 using SSE
31;;
32
33%include "reg_sizes.asm"
1e59de90
TL
34
35[bits 64]
7c673cae 36default rel
1e59de90 37section .text
7c673cae
FG
38
39;; Magic functions defined in FIPS 180-1
40;;
41; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D)))
42%macro MAGIC_F0 5
43%define %%regF %1
44%define %%regB %2
45%define %%regC %3
46%define %%regD %4
47%define %%regT %5
	; SHA-1 "Ch" function for rounds 0..19, per dword lane:
	;   F = D ^ (B & (C ^ D))   (equivalent to (B & C) | (~B & D))
	; Only regF is written; regT is accepted but unused by this variant.
48	movdqa	%%regF,%%regC
49	pxor	%%regF,%%regD
50	pand	%%regF,%%regB
51	pxor	%%regF,%%regD
52%endmacro
53
54; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D)
55%macro MAGIC_F1 5
56%define %%regF %1
57%define %%regB %2
58%define %%regC %3
59%define %%regD %4
60%define %%regT %5
	; SHA-1 "Parity" function for rounds 20..39, per dword lane:
	;   F = B ^ C ^ D
	; regT is accepted but unused by this variant.
61	movdqa	%%regF,%%regD
62	pxor	%%regF,%%regC
63	pxor	%%regF,%%regB
64%endmacro
65
66; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D))
67%macro MAGIC_F2 5
68%define %%regF %1
69%define %%regB %2
70%define %%regC %3
71%define %%regD %4
72%define %%regT %5
	; SHA-1 "Maj" function for rounds 40..59, per dword lane:
	;   F = (B & C) | (B & D) | (C & D)
	; computed here as (B & C) | ((B | C) & D), which needs one fewer op.
	; regT is clobbered as scratch.
73	movdqa	%%regF,%%regB
74	movdqa	%%regT,%%regB
75	por	%%regF,%%regC
76	pand	%%regT,%%regC
77	pand	%%regF,%%regD
78	por	%%regF,%%regT
79%endmacro
80
81; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D)
82%macro MAGIC_F3 5
83%define %%regF %1
84%define %%regB %2
85%define %%regC %3
86%define %%regD %4
87%define %%regT %5
	; Rounds 60..79 reuse the Parity function (F = B ^ C ^ D),
	; so this simply forwards to MAGIC_F1.
88	MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
89%endmacro
90
91; PROLD reg, imm, tmp
92%macro PROLD 3
%define %%reg %1
%define %%imm %2
%define %%tmp %3
	; Packed rotate-left of each dword in %%reg by immediate %%imm.
	; SSE has no rotate instruction, so emulate with shifts:
	;   reg = (reg << imm) | (reg >> (32 - imm)), using %%tmp as scratch.
96	movdqa	%%tmp, %%reg
97	pslld	%%reg, %%imm
98	psrld	%%tmp, (32-%%imm)
99	por	%%reg, %%tmp
100%endmacro
101
102%macro SHA1_STEP_00_15 11
103%define %%regA %1
104%define %%regB %2
105%define %%regC %3
106%define %%regD %4
107%define %%regE %5
108%define %%regT %6
109%define %%regF %7
110%define %%memW %8
111%define %%immCNT %9
112%define %%MAGIC %10
113%define %%data %11
	; One SHA-1 round for rounds 0..15, operating on 4 interleaved
	; segments (one dword lane each):
	;   E += K + W[i] + ROL(A,5) + MAGIC(B,C,D);  B = ROL(B,30)
	; W[i] is read straight from the byte-swapped frame buffer
	; (%%data + i*16); no schedule expansion is needed yet.
	; regT and regF are clobbered as scratch.
114	paddd	%%regE,%%immCNT
115	paddd	%%regE,[%%data + (%%memW * 16)]
116	movdqa	%%regT,%%regA
117	PROLD	%%regT,5, %%regF
118	paddd	%%regE,%%regT
119	%%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
120	PROLD	%%regB,30, %%regT
121	paddd	%%regE,%%regF
122%endmacro
123
124%macro SHA1_STEP_16_79 11
125%define %%regA %1
126%define %%regB %2
127%define %%regC %3
128%define %%regD %4
129%define %%regE %5
130%define %%regT %6
131%define %%regF %7
132%define %%memW %8
133%define %%immCNT %9
134%define %%MAGIC %10
135%define %%data %11
	; One SHA-1 round for rounds 16..79 with on-the-fly message-schedule
	; expansion.  W[] lives in %%data as a 16-entry circular buffer
	; (16 bytes per entry, indexed mod 16):
	;   W[i] = ROL(W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1)
	; W16 already holds W[i-16] from the previous round; W14/W15/W16 are
	; register aliases renamed by ROTATE_W so freshly loaded values are
	; reused across consecutive rounds without reloading.
136	paddd	%%regE,%%immCNT
137	movdqa	W14, [%%data + ((%%memW - 14) & 15) * 16]
138	pxor	W16, W14
139	pxor	W16, [%%data + ((%%memW - 8) & 15) * 16]
140	pxor	W16, [%%data + ((%%memW - 3) & 15) * 16]
	; regF = ROL(W16, 1): open-coded rotate so W16 can double as scratch.
141	movdqa	%%regF, W16
142	pslld	W16, 1
143	psrld	%%regF, (32-1)
144	por	%%regF, W16
145	ROTATE_W
146
	; Store the new W[i] back into the circular buffer, then do the
	; normal round: E += K + W[i] + ROL(A,5) + MAGIC(B,C,D); B = ROL(B,30).
147	movdqa	[%%data + ((%%memW - 0) & 15) * 16],%%regF
148	paddd	%%regE,%%regF
149	movdqa	%%regT,%%regA
150	PROLD	%%regT,5, %%regF
151	paddd	%%regE,%%regT
152	%%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
153	PROLD	%%regB,30, %%regT
154	paddd	%%regE,%%regF
155%endmacro
156
157;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
158;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
159;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
160%ifidn __OUTPUT_FORMAT__, elf64
161 ; Linux
	; System V AMD64 ABI: integer args arrive in rdi,rsi,rdx,rcx,r8,r9.
	; r12-r15 are callee-saved, hence FUNC_SAVE/FUNC_RESTORE push/pop them;
	; all xmm registers are volatile, so none need saving here.
162 %define arg0 rdi
163 %define arg1 rsi
164 %define arg2 rdx
165 %define arg3 rcx
166
167 %define arg4 r8
168 %define arg5 r9
169
170 %define tmp1 r10
171 %define tmp2 r11
172 %define tmp3 r12 ; must be saved and restored
173 %define tmp4 r13 ; must be saved and restored
174 %define tmp5 r14 ; must be saved and restored
175 %define tmp6 r15 ; must be saved and restored
176 %define return rax
177
178 %define func(x) x:
179 %macro FUNC_SAVE 0
180 push r12
181 push r13
182 push r14
183 push r15
184 %endmacro
185 %macro FUNC_RESTORE 0
186 pop r15
187 pop r14
188 pop r13
189 pop r12
190 %endmacro
191%else
192 ; Windows
	; Microsoft x64 ABI: integer args arrive in rcx,rdx,r8,r9.
	; rdi, rsi, r12-r15 AND xmm6-xmm15 are callee-saved, so FUNC_SAVE
	; spills them into a frame described by the proc_frame/save_* unwind
	; macros (required for correct SEH unwinding of a non-leaf function).
193 %define arg0 rcx
194 %define arg1 rdx
195 %define arg2 r8
196 %define arg3 r9
197
198 %define arg4 r10
199 %define arg5 r11
200 %define tmp1 r12 ; must be saved and restored
201 %define tmp2 r13 ; must be saved and restored
202 %define tmp3 r14 ; must be saved and restored
203 %define tmp4 r15 ; must be saved and restored
204 %define tmp5 rdi ; must be saved and restored
205 %define tmp6 rsi ; must be saved and restored
206 %define return rax
207
208 %define stack_size 10*16 + 7*8 ; must be an odd multiple of 8
209 %define func(x) proc_frame x
210 %macro FUNC_SAVE 0
211 alloc_stack stack_size
212 save_xmm128 xmm6, 0*16
213 save_xmm128 xmm7, 1*16
214 save_xmm128 xmm8, 2*16
215 save_xmm128 xmm9, 3*16
216 save_xmm128 xmm10, 4*16
217 save_xmm128 xmm11, 5*16
218 save_xmm128 xmm12, 6*16
219 save_xmm128 xmm13, 7*16
220 save_xmm128 xmm14, 8*16
221 save_xmm128 xmm15, 9*16
222 save_reg r12, 10*16 + 0*8
223 save_reg r13, 10*16 + 1*8
224 save_reg r14, 10*16 + 2*8
225 save_reg r15, 10*16 + 3*8
226 save_reg rdi, 10*16 + 4*8
227 save_reg rsi, 10*16 + 5*8
228 end_prolog
229 %endmacro
230
	; Restore mirrors FUNC_SAVE: reload the ten xmm and six gp registers
	; from the frame, then release the stack allocation.
231 %macro FUNC_RESTORE 0
232 movdqa xmm6, [rsp + 0*16]
233 movdqa xmm7, [rsp + 1*16]
234 movdqa xmm8, [rsp + 2*16]
235 movdqa xmm9, [rsp + 3*16]
236 movdqa xmm10, [rsp + 4*16]
237 movdqa xmm11, [rsp + 5*16]
238 movdqa xmm12, [rsp + 6*16]
239 movdqa xmm13, [rsp + 7*16]
240 movdqa xmm14, [rsp + 8*16]
241 movdqa xmm15, [rsp + 9*16]
242 mov r12, [rsp + 10*16 + 0*8]
243 mov r13, [rsp + 10*16 + 1*8]
244 mov r14, [rsp + 10*16 + 2*8]
245 mov r15, [rsp + 10*16 + 3*8]
246 mov rdi, [rsp + 10*16 + 4*8]
247 mov rsi, [rsp + 10*16 + 5*8]
248 add rsp, stack_size
249 %endmacro
250%endif
251;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Readable aliases for the function arguments and scratch registers.
252%define loops arg3
253;variables of mh_sha1
254%define mh_in_p arg0
255%define mh_digests_p arg1
256%define mh_data_p arg2
257%define mh_segs tmp1
;variables used by storing segs_digests on stack
259%define RSP_SAVE tmp2
260%define FRAMESZ 4*5*16 ;BYTES*DWORDS*SEGS
261
262%define pref tmp3
	; Prefetch upcoming input with a non-temporal hint: the data is
	; streamed once, so keep it out of the higher cache levels.
263%macro PREFETCH_X 1
264%define %%mem %1
265 prefetchnta %%mem
266%endmacro
267;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; MOVPS = unaligned load/store; used for user-supplied buffers whose
; alignment is not guaranteed (stack copies use aligned movdqa instead).
268%define MOVPS movups
269
; SHA-1 working state A..E, one dword lane per segment (4 segments per
; xmm register); F/G are scratch.
270%define A xmm0
271%define B xmm1
272%define C xmm2
273%define D xmm3
274%define E xmm4
275%define F xmm5 ; tmp
276%define G xmm6 ; tmp
277
278%define TMP G
279%define FUN F
280%define K xmm7
281
; AA..EE hold the digest values at block start, added back at block end.
282%define AA xmm8
283%define BB xmm9
284%define CC xmm10
285%define DD xmm11
286%define EE xmm12
287
; T0..T5 alias registers reused during the byte-swap staging phase,
; which runs before the round state above is live.
288%define T0 xmm6
289%define T1 xmm7
290%define T2 xmm8
291%define T3 xmm9
292%define T4 xmm10
293%define T5 xmm11
294
; Rename A..E after each round instead of moving data:
; (A,B,C,D,E) <- (E,A,B,C,D), matching SHA-1's variable rotation.
295%macro ROTATE_ARGS 0
296%xdefine TMP_ E
297%xdefine E D
298%xdefine D C
299%xdefine C B
300%xdefine B A
301%xdefine A TMP_
302%endm
303
; Message-schedule register aliases; see SHA1_STEP_16_79.
304%define W14 xmm13
305%define W15 xmm14
306%define W16 xmm15
307
; Rename the schedule registers each round so the value loaded as W14
; becomes next round's W15, then W16, without any data movement.
308%macro ROTATE_W 0
309%xdefine TMP_ W16
310%xdefine W16 W15
311%xdefine W15 W14
312%xdefine W14 TMP_
313%endm
314
315
316;init hash digests
317; segs_digests:low addr-> high_addr
318; a | b | c | ...| p | (16)
319; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap |
320; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp |
321; ....
1e59de90 322; h4 | h4 | h4 | ...| h4 | | Ea| Eb | Ec |...| Ep |
7c673cae
FG
323
324align 32

326;void mh_sha1_block_sse(const uint8_t * input_data, uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
327; uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks);
328; arg 0 pointer to input data
329; arg 1 pointer to digests, include segments digests(uint32_t digests[16][5])
330; arg 2 pointer to aligned_frame_buffer which is used to save the big_endian data.
331; arg 3 number of 1KB blocks
332;
; Per 1KB block (16 segments x 64 bytes) the code:
;   1. byte-swaps the input into mh_data_p via pshufb (SHA-1 is big-endian),
;   2. runs the 80 SHA-1 rounds on 4 segments at a time (.segs_loop walks
;      mh_segs = 0,16,32,48, i.e. four groups of 4 dword lanes),
;   3. accumulates the result into the segs_digests copy kept on the stack.
; Digests are written back to mh_digests_p only after all blocks finish.
1e59de90 333mk_global mh_sha1_block_sse, function, internal
7c673cae 334func(mh_sha1_block_sse)
1e59de90 335 endbranch
7c673cae
FG
336 FUNC_SAVE
337 ; save rsp
338 mov RSP_SAVE, rsp

 ; nothing to do for zero (or, defensively, negative) block counts
340 cmp loops, 0
341 jle .return

343 ; leave enough space to store segs_digests
344 sub rsp, FRAMESZ
345 ; align rsp to 16 Bytes needed by sse
346 and rsp, ~0x0F

348 %assign I 0 ; copy segs_digests into stack
349 %rep 5
350 MOVPS A, [mh_digests_p + I*64 + 16*0]
351 MOVPS B, [mh_digests_p + I*64 + 16*1]
352 MOVPS C, [mh_digests_p + I*64 + 16*2]
353 MOVPS D, [mh_digests_p + I*64 + 16*3]

355 movdqa [rsp + I*64 + 16*0], A
356 movdqa [rsp + I*64 + 16*1], B
357 movdqa [rsp + I*64 + 16*2], C
358 movdqa [rsp + I*64 + 16*3], D
359 %assign I (I+1)
360 %endrep

362.block_loop:
363 ;transform to big-endian data and store on aligned_frame
364 movdqa F, [PSHUFFLE_BYTE_FLIP_MASK]
365 ;transform input data from DWORD*16_SEGS*5 to DWORD*4_SEGS*5*4
366 %assign I 0
367 %rep 16
368 MOVPS T0,[mh_in_p + I*64+0*16]
369 MOVPS T1,[mh_in_p + I*64+1*16]
370 MOVPS T2,[mh_in_p + I*64+2*16]
371 MOVPS T3,[mh_in_p + I*64+3*16]

373 pshufb T0, F
374 movdqa [mh_data_p +(I)*16 +0*256],T0
375 pshufb T1, F
376 movdqa [mh_data_p +(I)*16 +1*256],T1
377 pshufb T2, F
378 movdqa [mh_data_p +(I)*16 +2*256],T2
379 pshufb T3, F
380 movdqa [mh_data_p +(I)*16 +3*256],T3
381 %assign I (I+1)
382 %endrep

384 mov mh_segs, 0 ;start from the first 4 segments
385 mov pref, 1024 ;avoid prefetching repeatedly
 .segs_loop:
387 ;; Initialize digests
388 movdqa A, [rsp + 0*64 + mh_segs]
389 movdqa B, [rsp + 1*64 + mh_segs]
390 movdqa C, [rsp + 2*64 + mh_segs]
391 movdqa D, [rsp + 3*64 + mh_segs]
392 movdqa E, [rsp + 4*64 + mh_segs]

 ;; keep a copy of the starting state for the final Davies-Meyer add
394 movdqa AA, A
395 movdqa BB, B
396 movdqa CC, C
397 movdqa DD, D
398 movdqa EE, E
399;;
400;; perform 0-79 steps
401;;
402 movdqa K, [K00_19]
403;; do rounds 0...15
404 %assign I 0
405 %rep 16
406 SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p
407 ROTATE_ARGS
408 %assign I (I+1)
409 %endrep

411;; do rounds 16...19
 ;; prime the W16/W15 schedule registers from the first two W entries
412 movdqa W16, [mh_data_p + ((16 - 16) & 15) * 16]
413 movdqa W15, [mh_data_p + ((16 - 15) & 15) * 16]
414 %rep 4
415 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p
416 ROTATE_ARGS
417 %assign I (I+1)
418 %endrep
419 PREFETCH_X [mh_in_p + pref+128*0]
420;; do rounds 20...39
421 movdqa K, [K20_39]
422 %rep 20
423 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1, mh_data_p
424 ROTATE_ARGS
425 %assign I (I+1)
426 %endrep

428;; do rounds 40...59
429 movdqa K, [K40_59]
430 %rep 20
431 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2, mh_data_p
432 ROTATE_ARGS
433 %assign I (I+1)
434 %endrep
435 PREFETCH_X [mh_in_p + pref+128*1]
436;; do rounds 60...79
437 movdqa K, [K60_79]
438 %rep 20
439 SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3, mh_data_p
440 ROTATE_ARGS
441 %assign I (I+1)
442 %endrep

 ;; add the saved starting state back in (Davies-Meyer feed-forward)
444 paddd A, AA
445 paddd B, BB
446 paddd C, CC
447 paddd D, DD
448 paddd E, EE

450 ; write out digests
451 movdqa [rsp + 0*64 + mh_segs], A
452 movdqa [rsp + 1*64 + mh_segs], B
453 movdqa [rsp + 2*64 + mh_segs], C
454 movdqa [rsp + 3*64 + mh_segs], D
455 movdqa [rsp + 4*64 + mh_segs], E

 ; advance to the next group of 4 segments (16 bytes = 4 dword lanes);
 ; loop while mh_segs < 64, i.e. exactly four iterations
457 add pref, 256
458 add mh_data_p, 256
459 add mh_segs, 16
460 cmp mh_segs, 64
461 jc .segs_loop

 ; rewind the frame buffer, advance the input by one 1KB block
463 sub mh_data_p, (1024)
464 add mh_in_p, (1024)
465 sub loops, 1
466 jne .block_loop


469 %assign I 0 ; copy segs_digests back to mh_digests_p
470 %rep 5
471 movdqa A, [rsp + I*64 + 16*0]
472 movdqa B, [rsp + I*64 + 16*1]
473 movdqa C, [rsp + I*64 + 16*2]
474 movdqa D, [rsp + I*64 + 16*3]

476 MOVPS [mh_digests_p + I*64 + 16*0], A
477 MOVPS [mh_digests_p + I*64 + 16*1], B
478 MOVPS [mh_digests_p + I*64 + 16*2], C
479 MOVPS [mh_digests_p + I*64 + 16*3], D
480 %assign I (I+1)
481 %endrep
482 mov rsp, RSP_SAVE ; restore rsp

484.return:
485 FUNC_RESTORE
486 ret

488endproc_frame
489
490section .data align=16

492align 16
; pshufb mask that reverses the bytes within each dword
; (little-endian input -> big-endian words, as SHA-1 requires).
493PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b

; SHA-1 round constants, each broadcast to all four dword lanes.
495K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999
496K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
497K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
498K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6