]> git.proxmox.com Git - ceph.git/blame - ceph/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_flush_avx512.asm
import 15.2.0 Octopus source
[ceph.git] / ceph / src / spdk / intel-ipsec-mb / avx512 / mb_mgr_hmac_flush_avx512.asm
CommitLineData
11fdf7f2
TL
1;;
2;; Copyright (c) 2012-2018, Intel Corporation
3;;
4;; Redistribution and use in source and binary forms, with or without
5;; modification, are permitted provided that the following conditions are met:
6;;
7;; * Redistributions of source code must retain the above copyright notice,
8;; this list of conditions and the following disclaimer.
9;; * Redistributions in binary form must reproduce the above copyright
10;; notice, this list of conditions and the following disclaimer in the
11;; documentation and/or other materials provided with the distribution.
12;; * Neither the name of Intel Corporation nor the names of its contributors
13;; may be used to endorse or promote products derived from this software
14;; without specific prior written permission.
15;;
16;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26;;
27
28;; In System V AMD64 ABI
29;; callee saves: RBX, RBP, R12-R15
30;; Windows x64 ABI
31;; callee saves: RBX, RBP, RDI, RSI, RSP, R12-R15
32;;
33;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15
34;; -----------------------------------------------------------
35;; Windows clobbers: RAX RCX RDX R8 R9 R10 R11
36;; Windows preserves: RBX RBP RSI RDI R12 R13 R14 R15
37;; -----------------------------------------------------------
38;; Linux clobbers: RAX RSI RDI R8 R9 R10 R11
39;; Linux preserves: RBX RCX RDX RBP R12 R13 R14 R15
40;; -----------------------------------------------------------
41;; Clobbers ZMM0-31
42
43%include "os.asm"
44%include "job_aes_hmac.asm"
45%include "mb_mgr_datastruct.asm"
46%include "reg_sizes.asm"
47
48;; %define DO_DBGPRINT
49%include "dbgprint.asm"
50
51extern sha1_x16_avx512
52
53section .data
54default rel
55
;; vpshufb control mask that reverses the byte order inside each of the four
;; 32-bit words of an XMM register. It is applied to the gathered digest words
;; before they are written to a lane's outer block.
56align 16
57byteswap:
58 dq 0x0405060700010203
59 dq 0x0c0d0e0f08090a0b
60
;; Sixteen 32-byte rows, one per lane. Row I holds 0xFFFF in 16-bit word I and
;; zero everywhere else. OR-ing row I into the 16 packed lane lengths forces
;; lane I's length to 0xFFFF, which is how empty lanes are kept from being
;; selected as the minimum-length lane.
61align 32
62len_masks:
63 dq 0x000000000000FFFF, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
64 dq 0x00000000FFFF0000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
65 dq 0x0000FFFF00000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
66 dq 0xFFFF000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
67 dq 0x0000000000000000, 0x000000000000FFFF, 0x0000000000000000, 0x0000000000000000
68 dq 0x0000000000000000, 0x00000000FFFF0000, 0x0000000000000000, 0x0000000000000000
69 dq 0x0000000000000000, 0x0000FFFF00000000, 0x0000000000000000, 0x0000000000000000
70 dq 0x0000000000000000, 0xFFFF000000000000, 0x0000000000000000, 0x0000000000000000
71 dq 0x0000000000000000, 0x0000000000000000, 0x000000000000FFFF, 0x0000000000000000
72 dq 0x0000000000000000, 0x0000000000000000, 0x00000000FFFF0000, 0x0000000000000000
73 dq 0x0000000000000000, 0x0000000000000000, 0x0000FFFF00000000, 0x0000000000000000
74 dq 0x0000000000000000, 0x0000000000000000, 0xFFFF000000000000, 0x0000000000000000
75 dq 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x000000000000FFFF
76 dq 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x00000000FFFF0000
77 dq 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000FFFF00000000
78 dq 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xFFFF000000000000
79
;; Lane indices 1..15 kept in memory as qwords. The lane search loads them with
;; cmovne (via APPEND(lane_,I)); cmov has no immediate-operand form, so the
;; constants must live in memory or a register.
80lane_1: dq 1
81lane_2: dq 2
82lane_3: dq 3
83lane_4: dq 4
84lane_5: dq 5
85lane_6: dq 6
86lane_7: dq 7
87lane_8: dq 8
88lane_9: dq 9
89lane_10: dq 10
90lane_11: dq 11
91lane_12: dq 12
92lane_13: dq 13
93lane_14: dq 14
94lane_15: dq 15
95
96section .text
97
; Symbolic register names used by flush_job_hmac_avx512.
; Several names deliberately share one physical register (r9, rax, arg1, arg2);
; names that share a register must never be live at the same time.
98%if 1
; First two integer argument registers: rdi/rsi when LINUX is defined,
; rcx/rdx otherwise.
99%ifdef LINUX
100%define arg1 rdi
101%define arg2 rsi
102%else
103%define arg1 rcx
104%define arg2 rdx
105%endif
106
; state stays in arg1 for the whole function; arg2 is reused for job, len2,
; extra_blocks and p at different points.
107%define state arg1
108%define job arg2
109%define len2 arg2
110
111; idx needs to be in rbx, rdi, rbp
112%define idx rbp
113
; r9 is shared by unused_lanes, lane_data and tmp2.
; r12-r14 are saved in the prologue and restored before returning.
114%define unused_lanes r9
115%define lane_data r9
116%define tmp2 r9
117%define num_lanes_inuse r12
118%define len_upper r13
119%define idx_upper r14
120
; rax doubles as the return value (job_rax) and as general scratch.
121%define job_rax rax
122%define tmp1 rax
123%define size_offset rax
124%define tmp rax
125%define start_offset rax
126
; NOTE(review): tmp3 aliases arg1, i.e. the same register as state; it is not
; referenced in the code visible here.
127%define tmp3 arg1
128
129%define extra_blocks arg2
130%define p arg2
131
132%define tmp4 r8
133
134%endif
135
136; we clobber rbp, called routine clobbers r12-r15
; Stack frame used by flush_job_hmac_avx512 to preserve callee-saved registers
; and the caller's stack pointer across the 32-byte stack realignment.
137struc STACK
138_gpr_save: resq 5 ; slots 0..4 hold rbp, r12, r13, r14, r15
139_rsp_save: resq 1 ; rsp as it was on entry, before sub/and realignment
140endstruc
141
; Token-pasting helper: concatenates a and b into a single identifier,
; e.g. APPEND(lane_,3) becomes lane_3 and APPEND(skip_,0) becomes skip_0.
142%define APPEND(a,b) a %+ b
143
144; JOB* flush_job_hmac_avx512(MB_MGR_HMAC_SHA_1_OOO *state)
145; arg 1 : state (rdi when LINUX is defined, rcx otherwise)
; Returns in rax the job pointer taken from the lane that finished, or NULL (0)
; when _num_lanes_inuse_sha1 is zero.
;
; Flow:
;   1. Pick a lane that holds a job.
;   2. copy_lane_data: point every empty lane at that lane's data pointer and
;      force its length to 0xFFFF so it is never the minimum.
;   3. Find the minimum-length lane, subtract that length from all lanes and
;      call sha1_x16_avx512 (skipped when the minimum is already 0).
;   4. Advance the minimum-length lane: pending extra blocks are scheduled
;      first, then the outer hash; each of those loops back to step 2.
;      Once the outer hash is done the job is completed and returned.
;
; rbp and r12-r15 are saved on entry and restored before returning.
146MKGLOBAL(flush_job_hmac_avx512,function,internal)
147flush_job_hmac_avx512:
148
149 mov rax, rsp ; keep the entry rsp; the realignment below discards it
150 sub rsp, STACK_size
151 and rsp, -32 ; align stack to 32 byte boundary
152 mov [rsp + _gpr_save + 8*0], rbp
153 mov [rsp + _gpr_save + 8*1], r12
154 mov [rsp + _gpr_save + 8*2], r13
155 mov [rsp + _gpr_save + 8*3], r14
156 mov [rsp + _gpr_save + 8*4], r15
157 mov [rsp + _rsp_save], rax
158
159 DBGPRINTL "---------- start hmac flush avx512 -----------"
160
161 mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse_sha1] ;empty?
162 cmp num_lanes_inuse, 0
163 jz return_null
164
165 ; find a lane with a non-null job
 ; idx starts at 0 and is overwritten by every occupied lane in 1..15, so it
 ; ends as the highest occupied lane index (0 if lanes 1..15 are all empty).
166 xor idx, idx
167%assign I 1
168%rep 15
169 cmp qword [state + _ldata + (I * _HMAC_SHA1_LANE_DATA_size) + _job_in_lane], 0
170 cmovne idx, [rel APPEND(lane_,I)]
171%assign I (I+1)
172%endrep
173
174copy_lane_data:
175 ; copy valid lane (idx) to empty lanes
176 vmovdqa ymm0, [state + _lens] ; 16 lane lengths, one 16-bit word each
177 mov tmp, [state + _args_data_ptr + PTR_SZ*idx] ; data pointer of the valid lane
178
 ; For each lane without a job: reuse the valid lane's data pointer and OR
 ; 0xFFFF into that lane's length word.
179%assign I 0
180%rep 16
181 cmp qword [state + _ldata + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
182 jne APPEND(skip_,I)
183 mov [state + _args_data_ptr + PTR_SZ*I], tmp
184 vpor ymm0, ymm0, [rel len_masks + 32*I] ; 32 for ymm, 16 for xmm
185APPEND(skip_,I):
186%assign I (I+1)
187%endrep
188 vmovdqa [state + _lens], ymm0
189
 ; vphminposuw only covers 8 words, so the minimum is found separately for
 ; lanes 0..7 (low half of ymm0) and lanes 8..15 (reloaded from memory).
190 vphminposuw xmm1, xmm0
191 vpextrw DWORD(len2), xmm1, 0 ; min value
192 vpextrw DWORD(idx), xmm1, 1 ; min index (0...7)
193
194 vmovdqa xmm2, [state + _lens + 8*2] ; lengths of lanes 8..15 (8 words * 2 bytes in)
195 vphminposuw xmm3, xmm2
196 vpextrw DWORD(len_upper), xmm3, 0 ; min value
197 vpextrw DWORD(idx_upper), xmm3, 1 ; min index (8...F)
198
 ; Both values were zero-extended from 16 bits by vpextrw, so the signed
 ; condition below behaves the same as an unsigned one. Ties go to lanes 0..7.
199 cmp len2, len_upper
200 jle use_min
201
202 vmovdqa xmm1, xmm3 ; word 0 of xmm1 must hold the overall minimum for the broadcast
203 mov len2, len_upper
204 mov idx, idx_upper ; idx would be in range 0..7
205 add idx, 8 ; to reflect that index is in 8..F range
206
207use_min:
208 DBGPRINTL64 "FLUSH min_length", len2
209 DBGPRINTL64 "FLUSH min_length index ", idx
210 cmp len2, 0
211 je len_is_0
212
213 vpbroadcastw xmm1, xmm1 ; replicate word 0 (the minimum length) into all 8 words
214 DBGPRINTL_XMM "FLUSH lens after shuffle", xmm1
215
 ; Subtract the minimum from all 16 lane lengths and write both halves back.
216 vpsubw xmm0, xmm0, xmm1
217 vmovdqa [state + _lens], xmm0
218 vpsubw xmm2, xmm2, xmm1
219 vmovdqa [state + _lens + 8*2], xmm2
220 DBGPRINTL_XMM "FLUSH lens immediately after min subtraction (0..7)", xmm0
221 DBGPRINTL_XMM "FLUSH lens immediately after min subtraction (8..F)", xmm2
222
223 ; "state" and "args" are the same address, arg1
224 ; len is arg2
225 call sha1_x16_avx512
226 ; state and idx are intact
227
228len_is_0:
229 ; process completed job "idx"
230 imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
231 lea lane_data, [state + _ldata + lane_data] ; lane_data = &state->ldata[idx]
232 mov DWORD(extra_blocks), [lane_data + _extra_blocks]
233 cmp extra_blocks, 0
234 jne proc_extra_blocks ; pending extra blocks are hashed before anything else
235 cmp dword [lane_data + _outer_done], 0
236 jne end_loop ; outer hash already done: the job is complete
237
 ; Schedule the outer hash for lane idx: its input becomes the lane's
 ; _outer_block, filled with the byte-swapped digest just produced, and the
 ; lane's digest is replaced by the 5 dwords at job->_auth_key_xor_opad.
238proc_outer:
239 mov dword [lane_data + _outer_done], 1
240 mov DWORD(size_offset), [lane_data + _size_offset]
241 mov qword [lane_data + _extra_block + size_offset], 0 ; zero 8 bytes at extra_block + size_offset
242 mov word [state + _lens + 2*idx], 1 ; NOTE(review): presumably one block for the outer hash - confirm
243 lea tmp, [lane_data + _outer_block]
244 mov job, [lane_data + _job_in_lane]
245 mov [state + _args_data_ptr + PTR_SZ*idx], tmp
246
 ; Gather digest words 0..3 of lane idx; consecutive words of one lane are
 ; SHA1_DIGEST_ROW_SIZE bytes apart in _args_digest.
247 vmovd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE]
248 vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], 1
249 vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], 2
250 vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], 3
251 vpshufb xmm0, xmm0, [rel byteswap] ; reverse the bytes of each 32-bit word
252 mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE]
253 bswap DWORD(tmp) ; fifth digest word is byte-swapped in a GPR
254 vmovdqa [lane_data + _outer_block], xmm0
255 mov [lane_data + _outer_block + 4*4], DWORD(tmp)
256
257 mov tmp, [job + _auth_key_xor_opad] ; tmp = pointer stored in the job
258 vmovdqu xmm0, [tmp] ; first 4 dwords
259 mov DWORD(tmp), [tmp + 4*4] ; fifth dword (overwrites the pointer, no longer needed)
260 vmovd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE], xmm0
261 vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], xmm0, 1
262 vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], xmm0, 2
263 vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], xmm0, 3
264 mov [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE], DWORD(tmp)
265 jmp copy_lane_data
266
 ; Schedule the lane's pending extra blocks: length = _extra_blocks, data
 ; pointer = _extra_block + _start_offset, then clear the pending count.
267 align 16
268proc_extra_blocks:
269 mov DWORD(start_offset), [lane_data + _start_offset]
270 mov [state + _lens + 2*idx], WORD(extra_blocks)
271 lea tmp, [lane_data + _extra_block + start_offset]
272 mov [state + _args_data_ptr + PTR_SZ*idx], tmp
273 mov dword [lane_data + _extra_blocks], 0
274 jmp copy_lane_data
275
276return_null:
277 DBGPRINTL "FLUSH *** ---------- return null"
278 xor job_rax, job_rax ; return NULL
279 jmp return
280
 ; Lane idx has finished its outer hash: release the lane, mark the job
 ; completed and copy the byte-swapped digest to the job's tag buffer.
281 align 16
282end_loop:
283 mov job_rax, [lane_data + _job_in_lane]
284 mov qword [lane_data + _job_in_lane], 0 ; lane is now empty
285 or dword [job_rax + _status], STS_COMPLETED_HMAC
286
 ; unused_lanes shares r9 with lane_data; lane_data is not used past this point.
287 mov unused_lanes, [state + _unused_lanes]
288 shl unused_lanes, 4 ;; a nibble
289 or unused_lanes, idx ; push idx as the lowest nibble
290 mov [state + _unused_lanes], unused_lanes
291
292 sub dword [state + _num_lanes_inuse_sha1], 1
293
294 mov p, [job_rax + _auth_tag_output]
295
296 ; copy 12 bytes
 ; r12 (num_lanes_inuse) is no longer needed and is reused as scratch here.
297 mov DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE]
298 mov DWORD(tmp4), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE]
9f95a23c 299 mov DWORD(r12), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE]
11fdf7f2
TL
300 bswap DWORD(tmp2)
301 bswap DWORD(tmp4)
9f95a23c 302 bswap DWORD(r12)
11fdf7f2
TL
303 mov [p + 0*4], DWORD(tmp2)
304 mov [p + 1*4], DWORD(tmp4)
9f95a23c
TL
305 mov [p + 2*4], DWORD(r12)
306
307 cmp qword [job_rax + _auth_tag_output_len_in_bytes], 12
308 je return ; 12-byte tag requested: nothing more to copy
309
310 ;; copy remaining 8 bytes to return 20 byte digest
311 mov DWORD(r13), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE]
312 mov DWORD(r14), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE]
313 bswap DWORD(r13)
314 bswap DWORD(r14)
315 mov [p + 3*SHA1_DIGEST_WORD_SIZE], DWORD(r13)
316 mov [p + 4*SHA1_DIGEST_WORD_SIZE], DWORD(r14)
11fdf7f2
TL
317
 ; Common exit: rax already holds the return value (job pointer or NULL).
318return:
319 DBGPRINTL "---------- exit hmac flush avx512 -----------"
320 vzeroupper
321
322 mov rbp, [rsp + _gpr_save + 8*0]
323 mov r12, [rsp + _gpr_save + 8*1]
324 mov r13, [rsp + _gpr_save + 8*2]
325 mov r14, [rsp + _gpr_save + 8*3]
326 mov r15, [rsp + _gpr_save + 8*4]
327 mov rsp, [rsp + _rsp_save] ; undo the sub/and realignment from the prologue
328 ret
329
330
331%ifdef LINUX
332section .note.GNU-stack noalloc noexec nowrite progbits
333%endif