]> git.proxmox.com Git - ceph.git/blame - ceph/src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_md5_flush_avx2.asm
import 15.2.0 Octopus source
[ceph.git] / ceph / src / spdk / intel-ipsec-mb / avx2 / mb_mgr_hmac_md5_flush_avx2.asm
CommitLineData
11fdf7f2
TL
1;;
2;; Copyright (c) 2012-2018, Intel Corporation
3;;
4;; Redistribution and use in source and binary forms, with or without
5;; modification, are permitted provided that the following conditions are met:
6;;
7;; * Redistributions of source code must retain the above copyright notice,
8;; this list of conditions and the following disclaimer.
9;; * Redistributions in binary form must reproduce the above copyright
10;; notice, this list of conditions and the following disclaimer in the
11;; documentation and/or other materials provided with the distribution.
12;; * Neither the name of Intel Corporation nor the names of its contributors
13;; may be used to endorse or promote products derived from this software
14;; without specific prior written permission.
15;;
16;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26;;
27
28%include "os.asm"
29%include "job_aes_hmac.asm"
30%include "mb_mgr_datastruct.asm"
31%include "reg_sizes.asm"
32;%define DO_DBGPRINT
33%include "dbgprint.asm"
34extern md5_x8x2_avx2
35
section .data
default rel
align 16

; 16-byte constant: 0x0100 in each of the eight 16-bit words
; (ddq 0x01000100010001000100010001000100)
dupw:
        dw      0x0100, 0x0100, 0x0100, 0x0100, 0x0100, 0x0100, 0x0100, 0x0100

; 16-byte constant: byte 0 is 0x80, remaining bytes are zero
; (ddq 0x00000000000000000000000000000080)
x80:
        db      0x80
        times 15 db 0

; 16-byte constant: all zero
x00:
        times 16 db 0

; Eight 16-byte masks; entry K has 0xFFFF in 16-bit word K and zero elsewhere.
; OR-ing entry K into a vector of eight 16-bit lengths forces word K to 0xFFFF.
len_masks:
        dw      0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
        dw      0x0000, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
        dw      0x0000, 0x0000, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000
        dw      0x0000, 0x0000, 0x0000, 0xFFFF, 0x0000, 0x0000, 0x0000, 0x0000
        dw      0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0x0000, 0x0000, 0x0000
        dw      0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0x0000, 0x0000
        dw      0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF, 0x0000
        dw      0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xFFFF

; 64-bit lane numbers, used as memory operands for cmovne (which cannot
; take an immediate) when searching for an occupied lane.
lane_1:         dq      1
lane_2:         dq      2
lane_3:         dq      3
lane_4:         dq      4
lane_5:         dq      5
lane_6:         dq      6
lane_7:         dq      7
lane_8:         dq      8
lane_9:         dq      9
lane_10:        dq      10
lane_11:        dq      11
lane_12:        dq      12
lane_13:        dq      13
lane_14:        dq      14
lane_15:        dq      15

78
79section .text
80
; ---------------------------------------------------------------------
; Register aliases, grouped by the physical register they map to.
; Several names share one register; they are only valid where their
; live ranges do not overlap.
; ---------------------------------------------------------------------

; Argument registers depend on the calling convention selected at build time
%ifdef LINUX
%define arg1            rdi
%define arg2            rsi
%else
%define arg1            rcx
%define arg2            rdx
%endif

; arg1
%define state           arg1
%define tmp3            arg1

; arg2
%define job             arg2
%define len2            arg2
%define extra_blocks    arg2
%define p               arg2

; rbp -- idx needs to be in rbp
%define idx             rbp

; rbx
%define unused_lanes    rbx
%define lane_data       rbx
%define tmp2            rbx

; rax
%define job_rax         rax
%define tmp1            rax
%define size_offset     rax
%define tmp             rax
%define start_offset    rax

; dedicated registers
%define tmp4            r8
%define tmp5            r9
%define num_lanes_inuse r12
%define len_upper       r13
%define idx_upper       r14

119
; This routine and/or the called routine clobbers all GPRs, so the
; callee-saved registers are spilled to a private, 32-byte aligned frame.
struc STACK
_gpr_save:      resq    8       ; rbx, rbp, r12-r15 (+ rsi, rdi when LINUX is not defined)
_rsp_save:      resq    1       ; value of rsp on entry, before re-alignment
endstruc

; Paste two tokens together (used to build numbered labels inside %rep)
%define APPEND(a,b) a %+ b

;-----------------------------------------------------------------------
; JOB* flush_job_hmac_md5_avx2(MB_MGR_HMAC_MD5_OOO *state)
; arg 1 : state (rdi when LINUX is defined, rcx otherwise)
; Out   : rax = job taken from the completed lane, or 0 when
;         _num_lanes_inuse_md5 is zero.
;
; Lanes without a job are pointed at the data of an occupied lane and
; given length 0xFFFF so they never win the minimum-length search.
; The shortest lane is then advanced until its job has finished both
; its extra blocks and its outer block, at which point the digest is
; copied to the job's tag buffer and the lane is released.
;-----------------------------------------------------------------------
MKGLOBAL(flush_job_hmac_md5_avx2,function,internal)
flush_job_hmac_md5_avx2:

        mov     rax, rsp
        sub     rsp, STACK_size
        and     rsp, -32

        mov     [rsp + _gpr_save + 8*0], rbx
        mov     [rsp + _gpr_save + 8*1], rbp
        mov     [rsp + _gpr_save + 8*2], r12
        mov     [rsp + _gpr_save + 8*3], r13
        mov     [rsp + _gpr_save + 8*4], r14
        mov     [rsp + _gpr_save + 8*5], r15
%ifndef LINUX
        mov     [rsp + _gpr_save + 8*6], rsi
        mov     [rsp + _gpr_save + 8*7], rdi
%endif
        mov     [rsp + _rsp_save], rax  ; original SP

        DBGPRINTL "---------- enter md5 flush -----------"
        mov     DWORD(num_lanes_inuse), [state + _num_lanes_inuse_md5] ;; empty?
        test    num_lanes_inuse, num_lanes_inuse
        jz      return_null

        ; find a lane with a non-null job -- flush does not have to be efficient!
        ; idx starts at lane 0 and is overwritten by every occupied lane in
        ; 1..15, so it ends up as the highest occupied lane (or 0).
        xor     DWORD(idx), DWORD(idx)  ; 32-bit write also clears the upper half
%assign I 1
%rep 15
        cmp     qword [state + _ldata_md5 + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
        cmovne  idx, [rel APPEND(lane_,I)]
%assign I (I+1)
%endrep


copy_lane_data:
        ; copy good lane (idx) to empty lanes
        mov     tmp, [state + _args_data_ptr_md5 + PTR_SZ*idx]
        ;; tackle lower 8 lanes
        vmovdqa xmm0, [state + _lens_md5 + 0*16] ;; lower 8 lengths
%assign I 0
%rep 8
        cmp     qword [state + _ldata_md5 + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
        jne     APPEND(lower_skip_,I)
        ; empty lane: borrow the good lane's data pointer, force length to 0xFFFF
        mov     [state + _args_data_ptr_md5 + PTR_SZ*I], tmp
        vpor    xmm0, xmm0, [rel len_masks + 16*I]
APPEND(lower_skip_,I):
%assign I (I+1)
%endrep
        ;; tackle upper lanes
        vmovdqa xmm1, [state + _lens_md5 + 1*16] ;; upper 8 lengths
%assign I 0
%rep 8
        cmp     qword [state + _ldata_md5 + (8 + I) * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
        jne     APPEND(upper_skip_,I)
        mov     [state + _args_data_ptr_md5 + PTR_SZ*(8+I)], tmp
        vpor    xmm1, xmm1, [rel len_masks + 16*I]
APPEND(upper_skip_,I):
%assign I (I+1)
%endrep
        jmp     start_loop0             ; skip the alignment padding

        align   32
start_loop0:
        ; Find min length
        vphminposuw xmm2, xmm0
        vpextrw DWORD(len2), xmm2, 0    ; min value
        vpextrw DWORD(idx), xmm2, 1     ; min index (0...7)

        vphminposuw xmm3, xmm1
        vpextrw DWORD(len_upper), xmm3, 0 ; min value
        vpextrw DWORD(idx_upper), xmm3, 1 ; min index within the upper half (0...7)

        ; lengths are unsigned 16-bit values, zero-extended by vpextrw
        cmp     len2, len_upper
        jbe     use_min

min_in_high:
        vmovdqa xmm2, xmm3
        mov     len2, len_upper
        mov     idx, idx_upper
        or      idx, 0x8                ; to reflect that index in 8-F
use_min:
        test    len2, len2
        jz      len_is_0
        DBGPRINTL64 "min_length min_index ", len2, idx
        DBGPRINTL_XMM "FLUSH md5 lens before sub lower", xmm0
        vpbroadcastw xmm2, xmm2         ; duplicate words across all lanes
        vpsubw  xmm0, xmm0, xmm2
        DBGPRINTL_XMM "FLUSH md5 lens after sub lower", xmm0
        vmovdqa [state + _lens_md5 + 0*16], xmm0

        vpsubw  xmm1, xmm1, xmm2
        DBGPRINTL_XMM "FLUSH md5 lens after sub upper", xmm1
        vmovdqa [state + _lens_md5 + 1*16], xmm1

        ; "state" and "args" are the same address, arg1
        ; len is arg2
        call    md5_x8x2_avx2
        ; state and idx are intact

len_is_0:
        ; process completed job "idx"
        imul    lane_data, idx, _HMAC_SHA1_LANE_DATA_size
        lea     lane_data, [state + _ldata_md5 + lane_data]
        mov     DWORD(extra_blocks), [lane_data + _extra_blocks]
        test    extra_blocks, extra_blocks
        jnz     proc_extra_blocks
        cmp     dword [lane_data + _outer_done], 0
        jne     end_loop

proc_outer:
        ; Queue one block at _outer_block for this lane: move the lane's
        ; current digest into _outer_block and reload the lane's digest
        ; from the 16 bytes at job->_auth_key_xor_opad.
        mov     dword [lane_data + _outer_done], 1
        mov     DWORD(size_offset), [lane_data + _size_offset]
        mov     qword [lane_data + _extra_block + size_offset], 0
        mov     word [state + _lens_md5 + 2*idx], 1
        lea     tmp, [lane_data + _outer_block]
        mov     job, [lane_data + _job_in_lane]
        mov     [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp

        ; gather the four digest words of lane idx (one per digest row)
        vmovd   xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE]
        vpinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], 1
        vpinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], 2
        vpinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], 3
        vmovdqa [lane_data + _outer_block], xmm0

        ; scatter the 16 bytes at _auth_key_xor_opad back into the digest rows
        mov     tmp, [job + _auth_key_xor_opad]
        vmovdqu xmm0, [tmp]
        vmovd   [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE], xmm0
        vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], xmm0, 1
        vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], xmm0, 2
        vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], xmm0, 3
        jmp     copy_lane_data

        align   16
proc_extra_blocks:
        ; Queue the lane's pending extra blocks: length = _extra_blocks,
        ; data = _extra_block + _start_offset, then clear the pending count.
        mov     DWORD(start_offset), [lane_data + _start_offset]
        mov     [state + _lens_md5 + 2*idx], WORD(extra_blocks)
        lea     tmp, [lane_data + _extra_block + start_offset]
        mov     [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp
        mov     dword [lane_data + _extra_blocks], 0
        jmp     copy_lane_data

return_null:
        xor     job_rax, job_rax
        jmp     return

        align   16
end_loop:
        ; Detach the job from the lane and flag it as HMAC-complete
        mov     job_rax, [lane_data + _job_in_lane]
        mov     qword [lane_data + _job_in_lane], 0
        or      dword [job_rax + _status], STS_COMPLETED_HMAC
        ; push idx as the lowest nibble of the unused-lanes list
        mov     unused_lanes, [state + _unused_lanes_md5]
        shl     unused_lanes, 4
        or      unused_lanes, idx
        mov     [state + _unused_lanes_md5], unused_lanes

        mov     DWORD(num_lanes_inuse), [state + _num_lanes_inuse_md5] ;; update lanes inuse
        sub     num_lanes_inuse, 1
        mov     [state + _num_lanes_inuse_md5], DWORD(num_lanes_inuse)

        mov     p, [job_rax + _auth_tag_output]

        ; copy 12 bytes
        mov     DWORD(tmp2), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE]
        mov     DWORD(tmp4), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE]
        mov     DWORD(tmp5), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE]
;       bswap   DWORD(tmp2)
;       bswap   DWORD(tmp4)
;       bswap   DWORD(tmp5)
        mov     [p + 0*4], DWORD(tmp2)
        mov     [p + 1*4], DWORD(tmp4)
        mov     [p + 2*4], DWORD(tmp5)

        cmp     DWORD [job_rax + _auth_tag_output_len_in_bytes], 12
        je      return

        ; copy 16 bytes (any tag length other than 12 gets the fourth word too)
        mov     DWORD(tmp5), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE]
        mov     [p + 3*4], DWORD(tmp5)

return:
        DBGPRINTL "---------- exit md5 flush -----------"
        vzeroupper

        mov     rbx, [rsp + _gpr_save + 8*0]
        mov     rbp, [rsp + _gpr_save + 8*1]
        mov     r12, [rsp + _gpr_save + 8*2]
        mov     r13, [rsp + _gpr_save + 8*3]
        mov     r14, [rsp + _gpr_save + 8*4]
        mov     r15, [rsp + _gpr_save + 8*5]
%ifndef LINUX
        mov     rsi, [rsp + _gpr_save + 8*6]
        mov     rdi, [rsp + _gpr_save + 8*7]
%endif
        mov     rsp, [rsp + _rsp_save]  ; original SP

        ret

%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif