]> git.proxmox.com Git - ceph.git/blame - ceph/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx.asm
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / crypto / isa-l / isa-l_crypto / sha256_mb / sha256_mb_mgr_submit_avx.asm
CommitLineData
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Project include files, pulled in before any code is emitted.
%include "sha256_job.asm"
%include "sha256_mb_mgr_datastruct.asm"

%include "reg_sizes.asm"

; Four-lane kernel called by sha256_mb_mgr_submit_avx below.
extern sha256_mb_x4_avx

[bits 64]
default rel			; bare [symbol] references are RIP-relative
section .text

; Per-ABI register aliases.
; In the elf64 branch the trailing comment on each line names the register
; that the same alias maps to in the %else (Windows) branch.
%ifidn __OUTPUT_FORMAT__, elf64
; Linux register definitions
%define arg1		rdi ; rcx
%define arg2		rsi ; rdx

; idx needs to be other than arg1, arg2, rbx, r12
%define idx		rdx ; rsi
%define last_len	rdx ; rsi

%define size_offset	rcx ; rdi
%define tmp2		rcx ; rdi

%else
; WINDOWS register definitions
%define arg1		rcx
%define arg2		rdx

; idx needs to be other than arg1, arg2, rbx, r12
%define last_len	rsi
%define idx		rsi

%define size_offset	rdi
%define tmp2		rdi

%endif

; Common definitions
; Several aliases share one physical register; aliases that map to the same
; register must not be live at the same time.
%define state		arg1
%define job		arg2
%define len2		arg2
%define p2		arg2

%define p		r11
%define start_offset	r11

%define unused_lanes	rbx

%define job_rax		rax
%define len		rax

%define lane		rbp
%define tmp3		rbp
%define lens3		rbp

%define extra_blocks	r8
%define lens0		r8

%define tmp		r9
%define lens1		r9

%define lane_data	r10
%define lens2		r10


; Stack frame layout, relative to rsp after the prologue:
;   [rsp + 0         .. rsp + _XMM_SAVE)       ten 16-byte slots for xmm6-xmm15 (win64 only)
;   [rsp + _XMM_SAVE .. rsp + STACK_SPACE)     five 8-byte slots: rbx, rbp, r12, rsi, rdi
;
; STACK_SPACE needs to be an odd multiple of 8
; (rsp is 8 mod 16 at function entry, so subtracting an odd multiple of 8
; makes rsp 16-byte aligned for the vmovdqa saves and for the call).
%define _XMM_SAVE	16*10
%define _GPR_SAVE	8*5
%define STACK_SPACE	(_GPR_SAVE + _XMM_SAVE)	; 200 bytes; parenthesised so it expands safely

;-----------------------------------------------------------------------
; SHA256_JOB* sha256_mb_mgr_submit_avx(SHA256_MB_JOB_MGR *state, SHA256_JOB *job)
;
; Places `job` in a free lane of `state`. While other lanes are still free it
; returns NULL. Once the free-lane list is exhausted it runs the four-lane
; kernel for the shortest pending length and returns the job that finished.
;
; arg 1 : state : rdi when assembling for elf64, rcx otherwise
; arg 2 : job   : rsi when assembling for elf64, rdx otherwise
; Out   : rax = pointer to a completed job, or 0 if none completed
; Saved : rbx, rbp, r12 always; rsi, rdi, xmm6-xmm15 additionally for win64
; Stack : STACK_SPACE bytes (see the _XMM_SAVE / _GPR_SAVE layout)
;-----------------------------------------------------------------------
mk_global sha256_mb_mgr_submit_avx, function
sha256_mb_mgr_submit_avx:
	endbranch

	; Prologue: reserve the frame and spill every register this routine must preserve.
	sub	rsp, STACK_SPACE
	mov	[rsp + _XMM_SAVE + 8*0], rbx
	mov	[rsp + _XMM_SAVE + 8*1], rbp
	mov	[rsp + _XMM_SAVE + 8*2], r12
%ifidn __OUTPUT_FORMAT__, win64
	mov	[rsp + _XMM_SAVE + 8*3], rsi
	mov	[rsp + _XMM_SAVE + 8*4], rdi
	vmovdqa	[rsp + 16*0], xmm6
	vmovdqa	[rsp + 16*1], xmm7
	vmovdqa	[rsp + 16*2], xmm8
	vmovdqa	[rsp + 16*3], xmm9
	vmovdqa	[rsp + 16*4], xmm10
	vmovdqa	[rsp + 16*5], xmm11
	vmovdqa	[rsp + 16*6], xmm12
	vmovdqa	[rsp + 16*7], xmm13
	vmovdqa	[rsp + 16*8], xmm14
	vmovdqa	[rsp + 16*9], xmm15
%endif

	; Take the next free lane: it is the low nibble of the packed
	; unused_lanes word; the remaining nibbles shift down by one.
	mov	unused_lanes, [state + _unused_lanes]
	movzx	lane, BYTE(unused_lanes)
	and	lane, 0xF
	shr	unused_lanes, 4
	imul	lane_data, lane, _LANE_DATA_size
	mov	dword [job + _status], STS_BEING_PROCESSED
	lea	lane_data, [state + _ldata + lane_data]	; lane_data = &state->ldata[lane]
	mov	[state + _unused_lanes], unused_lanes
	mov	DWORD(len), [job + _len]

	; Pack the lane index into the low 4 bits of the length so that a single
	; unsigned minimum over the lens array yields both values.
	; NOTE(review): only the low 32 bits are stored, so the top 4 bits of a
	; length >= 2^28 are dropped - confirm callers never exceed that.
	shl	len, 4
	or	len, lane

	mov	[lane_data + _job_in_lane], job
	mov	[state + _lens + 4*lane], DWORD(len)

	; Load digest words from result_digest
	; and scatter them into args_digest: word w of this lane lives at
	; _args_digest + 16*w + 4*lane.
	; vmovdqa requires job + _result_digest to be 16-byte aligned.
	vmovdqa	xmm0, [job + _result_digest + 0*16]
	vmovdqa	xmm1, [job + _result_digest + 1*16]
	vmovd	[state + _args_digest + 4*lane + 0*16], xmm0
	vpextrd	[state + _args_digest + 4*lane + 1*16], xmm0, 1
	vpextrd	[state + _args_digest + 4*lane + 2*16], xmm0, 2
	vpextrd	[state + _args_digest + 4*lane + 3*16], xmm0, 3
	vmovd	[state + _args_digest + 4*lane + 4*16], xmm1
	vpextrd	[state + _args_digest + 4*lane + 5*16], xmm1, 1
	vpextrd	[state + _args_digest + 4*lane + 6*16], xmm1, 2
	vpextrd	[state + _args_digest + 4*lane + 7*16], xmm1, 3


	; Record the job's input buffer as this lane's data pointer.
	mov	p, [job + _buffer]
	mov	[state + _args_data_ptr + 8*lane], p

	add	dword [state + _num_lanes_inuse], 1
	; Anything other than the lone 0xF marker in the remaining free-lane
	; list means no processing is started yet: report "nothing completed".
	cmp	unused_lanes, 0xF
	jne	return_null

start_loop:
	; Find min length
	; (unsigned minimum over the four packed len/lane entries)
	mov	DWORD(lens0), [state + _lens + 0*4]
	mov	idx, lens0
	mov	DWORD(lens1), [state + _lens + 1*4]
	cmp	lens1, idx
	cmovb	idx, lens1
	mov	DWORD(lens2), [state + _lens + 2*4]
	cmp	lens2, idx
	cmovb	idx, lens2
	mov	DWORD(lens3), [state + _lens + 3*4]
	cmp	lens3, idx
	cmovb	idx, lens3
	mov	len2, idx
	and	idx, 0xF		; idx  = lane holding the minimum
	and	len2, ~0xF		; len2 = minimum length, still shifted left by 4
	jz	len_is_0		; shortest lane has nothing left to hash

	; Deduct the amount about to be processed from every lane; the lane
	; index in the low nibble is untouched because len2's low nibble is 0.
	sub	lens0, len2
	sub	lens1, len2
	sub	lens2, len2
	sub	lens3, len2
	shr	len2, 4			; back to an unshifted length for the kernel
	mov	[state + _lens + 0*4], DWORD(lens0)
	mov	[state + _lens + 1*4], DWORD(lens1)
	mov	[state + _lens + 2*4], DWORD(lens2)
	mov	[state + _lens + 3*4], DWORD(lens3)

	; "state" and "args" are the same address, arg1
	; len is arg2
	call	sha256_mb_x4_avx
	; state and idx are intact

len_is_0:
	; process completed job "idx"
	imul	lane_data, idx, _LANE_DATA_size
	lea	lane_data, [state + _ldata + lane_data]

	; Detach the job from its lane, mark it complete, and push the lane
	; index back onto the low nibble of the free-lane list.
	mov	job_rax, [lane_data + _job_in_lane]
	mov	unused_lanes, [state + _unused_lanes]
	mov	qword [lane_data + _job_in_lane], 0
	mov	dword [job_rax + _status], STS_COMPLETED
	shl	unused_lanes, 4
	or	unused_lanes, idx
	mov	[state + _unused_lanes], unused_lanes

	sub	dword [state + _num_lanes_inuse], 1

	; Gather the lane's eight digest words from args_digest and write them
	; back to the job's result_digest (reverse of the scatter done on submit).
	vmovd	xmm0, [state + _args_digest + 4*idx + 0*16]
	vpinsrd	xmm0, [state + _args_digest + 4*idx + 1*16], 1
	vpinsrd	xmm0, [state + _args_digest + 4*idx + 2*16], 2
	vpinsrd	xmm0, [state + _args_digest + 4*idx + 3*16], 3
	vmovd	xmm1, [state + _args_digest + 4*idx + 4*16]
	vpinsrd	xmm1, [state + _args_digest + 4*idx + 5*16], 1
	vpinsrd	xmm1, [state + _args_digest + 4*idx + 6*16], 2
	vpinsrd	xmm1, [state + _args_digest + 4*idx + 7*16], 3

	vmovdqa	[job_rax + _result_digest + 0*16], xmm0
	vmovdqa	[job_rax + _result_digest + 1*16], xmm1

return:
	; Epilogue: restore everything spilled in the prologue; rax carries the result.

%ifidn __OUTPUT_FORMAT__, win64
	vmovdqa	xmm6,  [rsp + 16*0]
	vmovdqa	xmm7,  [rsp + 16*1]
	vmovdqa	xmm8,  [rsp + 16*2]
	vmovdqa	xmm9,  [rsp + 16*3]
	vmovdqa	xmm10, [rsp + 16*4]
	vmovdqa	xmm11, [rsp + 16*5]
	vmovdqa	xmm12, [rsp + 16*6]
	vmovdqa	xmm13, [rsp + 16*7]
	vmovdqa	xmm14, [rsp + 16*8]
	vmovdqa	xmm15, [rsp + 16*9]
	mov	rsi, [rsp + _XMM_SAVE + 8*3]
	mov	rdi, [rsp + _XMM_SAVE + 8*4]
%endif
	mov	rbx, [rsp + _XMM_SAVE + 8*0]
	mov	rbp, [rsp + _XMM_SAVE + 8*1]
	mov	r12, [rsp + _XMM_SAVE + 8*2]
	add	rsp, STACK_SPACE

	ret

return_null:
	; No job completed on this call: return NULL through the shared epilogue.
	xor	job_rax, job_rax
	jmp	return

section .data align=16

; Eight 32-bit constants; the values are the SHA-256 initial hash words
; H(0)0..H(0)7 from FIPS 180-4. Nothing in the code above references them.
align 16
H0:	dd 0x6a09e667
H1:	dd 0xbb67ae85
H2:	dd 0x3c6ef372
H3:	dd 0xa54ff53a
H4:	dd 0x510e527f
H5:	dd 0x9b05688c
H6:	dd 0x1f83d9ab
H7:	dd 0x5be0cd19
