]> git.proxmox.com Git - ceph.git/blame - ceph/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx.asm
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / crypto / isa-l / isa-l_crypto / sha1_mb / sha1_mb_mgr_flush_avx.asm
CommitLineData
7c673cae
FG
1;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
3;
4; Redistribution and use in source and binary forms, with or without
1e59de90 5; modification, are permitted provided that the following conditions
7c673cae
FG
6; are met:
7; * Redistributions of source code must retain the above copyright
8; notice, this list of conditions and the following disclaimer.
9; * Redistributions in binary form must reproduce the above copyright
10; notice, this list of conditions and the following disclaimer in
11; the documentation and/or other materials provided with the
12; distribution.
13; * Neither the name of Intel Corporation nor the names of its
14; contributors may be used to endorse or promote products derived
15; from this software without specific prior written permission.
16;
17; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29
%include "sha1_job.asm"
%include "sha1_mb_mgr_datastruct.asm"

%include "reg_sizes.asm"

extern sha1_mb_x4_avx
extern sha1_opt_x1

[bits 64]
default rel
section .text

%ifidn __OUTPUT_FORMAT__, elf64
; LINUX register definitions (System V AMD64: args in rdi, rsi)
%define arg1    rdi ; rcx
%define arg2    rsi ; rdx

; idx needs to be in a register not clobberred by sha1_mult
%define idx     rdx ; rsi
%else
; WINDOWS register definitions (Microsoft x64: args in rcx, rdx)
%define arg1    rcx
%define arg2    rdx

; idx needs to be in a register not clobberred by sha1_mult
%define idx     rsi
%endif

; Common definitions
%define state   arg1            ; SHA1_MB_JOB_MGR *state (only argument)
%define job     arg2
%define len2    arg2            ; arg2 is reused as the min-length value

; rbx is callee-saved on both ABIs; the three aliases below are
; never live at the same time, so they share one register.
%define unused_lanes    rbx
%define lane_data       rbx
%define tmp2            rbx

; rax aliases — scratch / return value (completed job pointer)
%define job_rax         rax
%define tmp1            rax
%define size_offset     rax
%define tmp             rax
%define start_offset    rax

%define tmp3            arg1

%define extra_blocks    arg2
%define p               arg2

%define tmp4            r8
%define lens0           r8

%define lens1           r9
%define lens2           r10
%define lens3           r11

; STACK_SPACE needs to be an odd multiple of 8
; (entry rsp % 16 == 8, so an odd multiple of 8 restores 16-byte
; alignment for the calls into the hash kernels)
_XMM_SAVE_SIZE  equ 10*16       ; xmm6-xmm15 (Win64 callee-saved)
_GPR_SAVE_SIZE  equ 8*2         ; rbx + rsi
_ALIGN_SIZE     equ 8

_XMM_SAVE       equ 0
_GPR_SAVE       equ _XMM_SAVE + _XMM_SAVE_SIZE
STACK_SPACE     equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE

; Token-pasting helper used to generate per-lane labels below
%define APPEND(a,b) a %+ b
95
;-----------------------------------------------------------------------
; SHA1_JOB* sha1_mb_mgr_flush_avx(SHA1_MB_JOB_MGR *state)
;
; Flush the multi-buffer manager: pick the lane with the shortest
; remaining length, pad the empty lanes with a duplicate data pointer
; and a 0xFFFFFFFF length so they never win the min-scan, run the
; 4-lane AVX kernel (or the single-buffer kernel when few lanes are
; in use), then retire the completed job and return it.
;
; arg 1 : rcx : state        (Windows; rdi on elf64)
; Return: rax = completed SHA1_JOB*, or NULL if no jobs are queued
; Saves : rbx (both ABIs); rsi, xmm6-xmm15 (Win64 only)
;-----------------------------------------------------------------------
mk_global sha1_mb_mgr_flush_avx, function
sha1_mb_mgr_flush_avx:
	endbranch

	sub     rsp, STACK_SPACE
	mov     [rsp + _GPR_SAVE + 8*0], rbx
%ifidn __OUTPUT_FORMAT__, win64
	mov     [rsp + _GPR_SAVE + 8*1], rsi
	; xmm6-xmm15 are callee-saved in the Microsoft x64 ABI
	vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
	vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
	vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
	vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
	vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
	vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
	vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
	vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
	vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
	vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
%endif

	; use num_lanes_inuse to judge all lanes are empty
	cmp     dword [state + _num_lanes_inuse], 0
	jz      return_null

	; find a lane with a non-null job (idx defaults to lane 0)
	xor     idx, idx
	cmp     qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
	cmovne  idx, [one]
	cmp     qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
	cmovne  idx, [two]
	cmp     qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
	cmovne  idx, [three]

	; copy idx to empty lanes
copy_lane_data:
	mov     tmp, [state + _args + _data_ptr + 8*idx]

%assign I 0
%rep 4
	cmp     qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
	jne     APPEND(skip_,I)
	; empty lane: point it at the chosen lane's data and give it a
	; maximal length so it can never be selected as the minimum
	mov     [state + _args + _data_ptr + 8*I], tmp
	mov     dword [state + _lens + 4*I], 0xFFFFFFFF
APPEND(skip_,I):
%assign I (I+1)
%endrep

	; Find min length
	; Each _lens entry is (blocks << 4) | lane_index, so the min
	; carries its own lane number in the low 4 bits.
	mov     DWORD(lens0), [state + _lens + 0*4]
	mov     idx, lens0
	mov     DWORD(lens1), [state + _lens + 1*4]
	cmp     lens1, idx
	cmovb   idx, lens1
	mov     DWORD(lens2), [state + _lens + 2*4]
	cmp     lens2, idx
	cmovb   idx, lens2
	mov     DWORD(lens3), [state + _lens + 3*4]
	cmp     lens3, idx
	cmovb   idx, lens3
	mov     len2, idx
	and     idx, 0xF                ; idx  = winning lane number
	and     len2, ~0xF              ; len2 = length portion (blocks << 4)
	jz      len_is_0

	; compare with sha-sb threshold, if num_lanes_inuse <= threshold, using sb func
	cmp     dword [state + _num_lanes_inuse], SHA1_SB_THRESHOLD_AVX
	ja      mb_processing

	; lensN-len2=idx
	shr     len2, 4
	mov     [state + _lens + idx*4], DWORD(idx)
	mov     r10, idx
	or      r10, 0x1000     ; avx has 4 lanes *4, r10b is idx, r10b2 is 16
	; "state" and "args" are the same address, arg1
	; len is arg2, idx and nlane in r10
	call    sha1_opt_x1
	; state and idx are intact
	jmp     len_is_0

mb_processing:
	; subtract the min from every lane, then convert to block count
	sub     lens0, len2
	sub     lens1, len2
	sub     lens2, len2
	sub     lens3, len2
	shr     len2, 4
	mov     [state + _lens + 0*4], DWORD(lens0)
	mov     [state + _lens + 1*4], DWORD(lens1)
	mov     [state + _lens + 2*4], DWORD(lens2)
	mov     [state + _lens + 3*4], DWORD(lens3)

	; "state" and "args" are the same address, arg1
	; len is arg2
	call    sha1_mb_x4_avx
	; state and idx are intact

len_is_0:
	; process completed job "idx"
	imul    lane_data, idx, _LANE_DATA_size
	lea     lane_data, [state + _ldata + lane_data]

	mov     job_rax, [lane_data + _job_in_lane]
	mov     qword [lane_data + _job_in_lane], 0
	mov     dword [job_rax + _status], STS_COMPLETED
	; push the freed lane onto the unused_lanes nibble-stack
	mov     unused_lanes, [state + _unused_lanes]
	shl     unused_lanes, 4
	or      unused_lanes, idx
	mov     [state + _unused_lanes], unused_lanes

	sub     dword [state + _num_lanes_inuse], 1

	; gather the transposed digest (row-major, 16-byte row stride)
	; back into the job's contiguous result_digest
	vmovd   xmm0, [state + _args_digest + 4*idx + 0*16]
	vpinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
	vpinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
	vpinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
	mov     DWORD(tmp2), [state + _args_digest + 4*idx + 4*16]

	vmovdqa [job_rax + _result_digest + 0*16], xmm0
	mov     [job_rax + _result_digest + 1*16], DWORD(tmp2)

return:

%ifidn __OUTPUT_FORMAT__, win64
	vmovdqa xmm6,  [rsp + _XMM_SAVE + 16*0]
	vmovdqa xmm7,  [rsp + _XMM_SAVE + 16*1]
	vmovdqa xmm8,  [rsp + _XMM_SAVE + 16*2]
	vmovdqa xmm9,  [rsp + _XMM_SAVE + 16*3]
	vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
	vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
	vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
	vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
	vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
	vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
	mov     rsi, [rsp + _GPR_SAVE + 8*1]
%endif
	mov     rbx, [rsp + _GPR_SAVE + 8*0]
	add     rsp, STACK_SPACE

	ret

return_null:
	xor     job_rax, job_rax
	jmp     return
1e59de90 240
7c673cae
FG
section .data align=16

align 16
; qword lane indices used as cmovne memory sources in the
; non-null-job scan (cmov cannot take an immediate operand)
one:    dq  1
two:    dq  2
three:  dq  3