;; md5_mb_mgr_submit_avx.asm
;; Multi-buffer MD5 manager: job submit path, AVX implementation.
;; Part of ISA-L crypto (vendored under ceph/src/crypto/isa-l/isa-l_crypto/md5_mb).
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29
%include "md5_job.asm"
%include "md5_mb_mgr_datastruct.asm"

%include "reg_sizes.asm"

[bits 64]
default rel
section .text

; Lower-layer 8-lane (4x2) AVX MD5 block-processing routine.
extern md5_mb_x4x2_avx

%if 1
%ifidn __OUTPUT_FORMAT__, win64
; WINDOWS register definitions (Microsoft x64 ABI: args in rcx, rdx)
%define arg1	rcx
%define arg2	rdx

%else
; UN*X register definitions (System V AMD64 ABI: args in rdi, rsi)
%define arg1	rdi
%define arg2	rsi

%endif

; Common definitions
%define state	arg1		; MB_MGR *state (also the args struct — same address)
%define job	arg2		; JOB_MD5 *job on entry
%define len2	arg2		; reused as packed len/lane arg to md5_mb_x4x2_avx

; idx must be a register not clobbered by md5_mb_x4x2_avx
%define idx	r8

%define p	r9		; scratch: job buffer pointer

%define unused_lanes	rbx	; nibble-packed stack of free lane indices

%define job_rax	rax		; returned JOB* (NULL if nothing completed)
%define len	rax

%define lane	r10		; lane index popped from unused_lanes

%define lane_data	r11	; &state->ldata[lane]

%endif ; if 1

; STACK_SPACE needs to be an odd multiple of 8 so that rsp stays
; 16-byte aligned at the call site below (entry rsp % 16 == 8).
; Layout: 8 GPR save slots + 10 xmm save slots (win64) + 8 pad bytes.
%define STACK_SPACE	8*8 + 16*10 + 8
77
; JOB* submit_job(MB_MGR *state, JOB_MD5 *job)
; arg 1 : rcx : state  (win64)  /  rdi (SysV)
; arg 2 : rdx : job    (win64)  /  rsi (SysV)
;
; Submits a job into a free lane. If all lanes become busy, runs the
; 8-lane AVX MD5 core on the shortest pending length and returns the
; job that completed; otherwise returns NULL (rax = 0).
;
; Saves/restores all callee-saved GPRs (and rsi/rdi, xmm6-xmm15 on
; win64) because the lower layer clobbers them.
mk_global md5_mb_mgr_submit_avx, function
md5_mb_mgr_submit_avx:
	endbranch

	sub	rsp, STACK_SPACE
	; we need to save/restore all GPRs because lower layer clobbers them
	mov	[rsp + 8*0], rbx
	mov	[rsp + 8*1], rbp
	mov	[rsp + 8*2], r12
	mov	[rsp + 8*3], r13
	mov	[rsp + 8*4], r14
	mov	[rsp + 8*5], r15
%ifidn __OUTPUT_FORMAT__, win64
	; rsi, rdi and xmm6-xmm15 are callee-saved under the Microsoft x64 ABI
	mov	[rsp + 8*6], rsi
	mov	[rsp + 8*7], rdi
	vmovdqa	[rsp + 8*8 + 16*0], xmm6
	vmovdqa	[rsp + 8*8 + 16*1], xmm7
	vmovdqa	[rsp + 8*8 + 16*2], xmm8
	vmovdqa	[rsp + 8*8 + 16*3], xmm9
	vmovdqa	[rsp + 8*8 + 16*4], xmm10
	vmovdqa	[rsp + 8*8 + 16*5], xmm11
	vmovdqa	[rsp + 8*8 + 16*6], xmm12
	vmovdqa	[rsp + 8*8 + 16*7], xmm13
	vmovdqa	[rsp + 8*8 + 16*8], xmm14
	vmovdqa	[rsp + 8*8 + 16*9], xmm15
%endif

	; Pop a free lane index from the nibble-packed unused_lanes stack
	mov	unused_lanes, [state + _unused_lanes]
	mov	lane, unused_lanes
	and	lane, 0xF
	shr	unused_lanes, 4
	imul	lane_data, lane, _LANE_DATA_size
	mov	dword [job + _status], STS_BEING_PROCESSED
	lea	lane_data, [state + _ldata + lane_data]
	mov	[state + _unused_lanes], unused_lanes
	mov	DWORD(len), [job + _len]

	; Pack (len << 4) | lane so the min-length scan below can recover
	; both the length and the owning lane from a single dword
	shl	len, 4
	or	len, lane

	mov	[lane_data + _job_in_lane], job
	mov	[state + _lens + 4*lane], DWORD(len)

	; Load digest words from result_digest into the lane's transposed
	; args_digest columns (one dword per 32-byte row)
	vmovdqu	xmm0, [job + _result_digest + 0*16]
	vmovd	[state + _args_digest + 4*lane + 0*32], xmm0
	vpextrd	[state + _args_digest + 4*lane + 1*32], xmm0, 1
	vpextrd	[state + _args_digest + 4*lane + 2*32], xmm0, 2
	vpextrd	[state + _args_digest + 4*lane + 3*32], xmm0, 3

	mov	p, [job + _buffer]
	mov	[state + _args_data_ptr + 8*lane], p

	add	dword [state + _num_lanes_inuse], 1
	; If any lane is still free (sentinel 0xF not yet reached), defer work
	cmp	unused_lanes, 0xF
	jne	return_null

start_loop:
	; Find min length across all 8 lanes (two xmm registers of packed
	; len/lane dwords, reduced pairwise with vpminud/vpalignr)
	vmovdqa	xmm0, [state + _lens + 0*16]
	vmovdqa	xmm1, [state + _lens + 1*16]

	vpminud	xmm2, xmm0, xmm1	; xmm2 has {D,C,B,A}
	vpalignr xmm3, xmm3, xmm2, 8	; xmm3 has {x,x,D,C}
	vpminud	xmm2, xmm2, xmm3	; xmm2 has {x,x,E,F}
	vpalignr xmm3, xmm3, xmm2, 4	; xmm3 has {x,x,x,E}
	vpminud	xmm2, xmm2, xmm3	; xmm2 has min value in low dword

	; Unpack: low nibble = lane index of the minimum, rest = length
	vmovd	DWORD(idx), xmm2
	mov	len2, idx
	and	idx, 0xF
	shr	len2, 4
	jz	len_is_0		; zero blocks left: job already complete

	; Subtract the min length from every lane (lane nibbles are masked
	; off so they are preserved by the subtraction)
	vpand	xmm2, xmm2, [rel clear_low_nibble]
	vpshufd	xmm2, xmm2, 0

	vpsubd	xmm0, xmm0, xmm2
	vpsubd	xmm1, xmm1, xmm2

	vmovdqa	[state + _lens + 0*16], xmm0
	vmovdqa	[state + _lens + 1*16], xmm1

	; "state" and "args" are the same address, arg1
	; len is arg2
	call	md5_mb_x4x2_avx
	; state and idx are intact

len_is_0:
	; process completed job "idx"
	imul	lane_data, idx, _LANE_DATA_size
	lea	lane_data, [state + _ldata + lane_data]

	mov	job_rax, [lane_data + _job_in_lane]
	mov	unused_lanes, [state + _unused_lanes]
	mov	qword [lane_data + _job_in_lane], 0
	mov	dword [job_rax + _status], STS_COMPLETED
	; Push the freed lane back onto the nibble stack
	shl	unused_lanes, 4
	or	unused_lanes, idx
	mov	[state + _unused_lanes], unused_lanes

	; Park the empty lane at max length so it never wins the min scan
	mov	dword [state + _lens + 4*idx], 0xFFFFFFFF
	sub	dword [state + _num_lanes_inuse], 1

	; Gather the lane's transposed digest columns back into the job
	vmovd	xmm0, [state + _args_digest + 4*idx + 0*32]
	vpinsrd	xmm0, [state + _args_digest + 4*idx + 1*32], 1
	vpinsrd	xmm0, [state + _args_digest + 4*idx + 2*32], 2
	vpinsrd	xmm0, [state + _args_digest + 4*idx + 3*32], 3

	vmovdqa	[job_rax + _result_digest + 0*16], xmm0

return:

%ifidn __OUTPUT_FORMAT__, win64
	vmovdqa	xmm6,  [rsp + 8*8 + 16*0]
	vmovdqa	xmm7,  [rsp + 8*8 + 16*1]
	vmovdqa	xmm8,  [rsp + 8*8 + 16*2]
	vmovdqa	xmm9,  [rsp + 8*8 + 16*3]
	vmovdqa	xmm10, [rsp + 8*8 + 16*4]
	vmovdqa	xmm11, [rsp + 8*8 + 16*5]
	vmovdqa	xmm12, [rsp + 8*8 + 16*6]
	vmovdqa	xmm13, [rsp + 8*8 + 16*7]
	vmovdqa	xmm14, [rsp + 8*8 + 16*8]
	vmovdqa	xmm15, [rsp + 8*8 + 16*9]
	mov	rsi, [rsp + 8*6]
	mov	rdi, [rsp + 8*7]
%endif
	mov	rbx, [rsp + 8*0]
	mov	rbp, [rsp + 8*1]
	mov	r12, [rsp + 8*2]
	mov	r13, [rsp + 8*3]
	mov	r14, [rsp + 8*4]
	mov	r15, [rsp + 8*5]

	add	rsp, STACK_SPACE

	ret

return_null:
	xor	job_rax, job_rax	; rax = NULL: no job completed yet
	jmp	return
222
223
section .data align=16

align 16
; Mask that clears the lane-index nibble of the packed min len/lane
; dword (only the low dword is used; vpshufd broadcasts it).
clear_low_nibble:
	dq 0x00000000FFFFFFF0, 0x0000000000000000