;; File: intel-ipsec-mb/avx512/mb_mgr_hmac_sha_256_flush_avx512.asm
;; (as bundled under ceph/src/spdk, updated with Ceph Pacific 16.2.2)
;;
;; Copyright (c) 2017-2018, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;;     * Redistributions of source code must retain the above copyright notice,
;;       this list of conditions and the following disclaimer.
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;     * Neither the name of Intel Corporation nor the names of its contributors
;;       may be used to endorse or promote products derived from this software
;;       without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;

;; In System V AMD64 ABI
;; callee saves: RBX, RBP, R12-R15
;; Windows x64 ABI
;; callee saves: RBX, RBP, RDI, RSI, RSP, R12-R15
;;
;; Registers:         RAX RBX RCX RDX RBP RSI RDI R8  R9  R10 R11 R12 R13 R14 R15
;;                    -----------------------------------------------------------
;; Windows clobbers:  RAX     RCX RDX             R8  R9  R10 R11
;; Windows preserves:     RBX         RBP RSI RDI                 R12 R13 R14 R15
;;                    -----------------------------------------------------------
;; Linux clobbers:    RAX     RCX RDX     RSI RDI R8  R9  R10 R11
;; Linux preserves:       RBX         RBP                         R12 R13 R14 R15
;;                    -----------------------------------------------------------
;; Clobbers ZMM0-31

;; OS/linkage abstraction (MKGLOBAL, LINUX detection)
%include "include/os.asm"
;; JOB_AES_HMAC structure field offsets (_status, _auth_tag_output, ...)
%include "job_aes_hmac.asm"
;; Multi-buffer manager OOO structure offsets (_lens_sha256, _ldata_sha256, ...)
%include "mb_mgr_datastruct.asm"
;; DWORD()/WORD() sub-register name helpers
%include "include/reg_sizes.asm"

;; %define DO_DBGPRINT
%include "include/dbgprint.asm"

;; 16-lane AVX512 SHA-256 block transform; per the call site below it takes
;; the state/args structure in arg1 and the common length in arg2
extern sha256_x16_avx512
section .data
default rel
align 16
;; pshufb control: byte-swaps each 32-bit dword (digest words -> big endian)
byteswap:
	dq 0x0405060700010203, 0x0c0d0e0f08090a0b

align 32
;; len_masks[I] is a 32-byte (16 x 16-bit lanes) mask with 0xFFFF in word I.
;; OR-ing entry I into the lens vector pins idle lane I's length at the
;; maximum so it can never win the minimum-length search below.
len_masks:
	dq 0x000000000000FFFF, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
	dq 0x00000000FFFF0000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
	dq 0x0000FFFF00000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
	dq 0xFFFF000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
	dq 0x0000000000000000, 0x000000000000FFFF, 0x0000000000000000, 0x0000000000000000
	dq 0x0000000000000000, 0x00000000FFFF0000, 0x0000000000000000, 0x0000000000000000
	dq 0x0000000000000000, 0x0000FFFF00000000, 0x0000000000000000, 0x0000000000000000
	dq 0x0000000000000000, 0xFFFF000000000000, 0x0000000000000000, 0x0000000000000000
	dq 0x0000000000000000, 0x0000000000000000, 0x000000000000FFFF, 0x0000000000000000
	dq 0x0000000000000000, 0x0000000000000000, 0x00000000FFFF0000, 0x0000000000000000
	dq 0x0000000000000000, 0x0000000000000000, 0x0000FFFF00000000, 0x0000000000000000
	dq 0x0000000000000000, 0x0000000000000000, 0xFFFF000000000000, 0x0000000000000000
	dq 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x000000000000FFFF
	dq 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x00000000FFFF0000
	dq 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000FFFF00000000
	dq 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xFFFF000000000000

;; Lane index constants kept in memory because the cmovne that selects a
;; non-null lane needs a register/memory source, not an immediate.
lane_1: dq 1
lane_2: dq 2
lane_3: dq 3
lane_4: dq 4
lane_5: dq 5
lane_6: dq 6
lane_7: dq 7
lane_8: dq 8
lane_9: dq 9
lane_10: dq 10
lane_11: dq 11
lane_12: dq 12
lane_13: dq 13
lane_14: dq 14
lane_15: dq 15
section .text

;; Argument registers per ABI (on Windows, rsi is not an argument register;
;; it is saved in the prologue and reused here as a third scratch slot).
%ifdef LINUX
%define arg1 rdi
%define arg2 rsi
%define arg3 rdx
%else
%define arg1 rcx
%define arg2 rdx
%define arg3 rsi
%endif

%define state arg1                      ; MB_MGR_HMAC_SHA_256_OOO * (only real argument)
%define job arg2
%define len2 arg2                       ; common min length passed to sha256_x16_avx512

; idx needs to be in rbp, r15
%define idx rbp                         ; currently selected lane, 0..15 (callee-saved,
                                        ; so it survives the call to sha256_x16_avx512)

;; NOTE: the scratch names below alias each other on the same physical
;; register; aliased names are never live at the same time.
%define unused_lanes r10
%define tmp5 r10

%define lane_data rbx                   ; pointer to the lane's HMAC_SHA1_LANE_DATA
%define tmp2 rbx

%define job_rax rax                     ; JOB pointer returned to the caller
%define tmp1 rax
%define size_offset rax
%define start_offset rax

%define tmp3 arg1

%define extra_blocks arg2
%define p arg2                          ; auth tag output pointer

%define tmp4 arg3
%define tmp r9

%define len_upper r13                   ; min length among lanes 8..15
%define idx_upper r14                   ; index of that minimum (before +8 adjust)


; we clobber rsi, rbp; called routine also clobbers rax, r9 to r15
struc STACK
_gpr_save: resq 8                       ; rbx, rbp, r12-r15 (+ rsi, rdi on Windows)
_rsp_save: resq 1                       ; original rsp (restored verbatim on exit)
endstruc

%define APPEND(a,b) a %+ b              ; token-paste helper for %rep-generated labels
; JOB* flush_job_hmac_sha_224_avx512(MB_MGR_HMAC_SHA_256_OOO *state)
; JOB* flush_job_hmac_sha_256_avx512(MB_MGR_HMAC_SHA_256_OOO *state)
; arg 1 : state
;
; Forces progress when fewer than 16 lanes hold real jobs: every empty
; lane is pointed at a valid lane's data and its length is masked to
; 0xFFFF so it never drives the minimum-length selection.  Returns a
; completed JOB pointer in rax, or NULL if no lane is in use.
align 32
%ifdef SHA224
MKGLOBAL(flush_job_hmac_sha_224_avx512,function,internal)
flush_job_hmac_sha_224_avx512:
%else
MKGLOBAL(flush_job_hmac_sha_256_avx512,function,internal)
flush_job_hmac_sha_256_avx512:
%endif
	mov	rax, rsp
	sub	rsp, STACK_size
	and	rsp, -32                ; 32-byte align for the vector code paths
	mov	[rsp + _gpr_save + 8*0], rbx
	mov	[rsp + _gpr_save + 8*1], rbp
	mov	[rsp + _gpr_save + 8*2], r12
	mov	[rsp + _gpr_save + 8*3], r13
	mov	[rsp + _gpr_save + 8*4], r14
	mov	[rsp + _gpr_save + 8*5], r15
%ifndef LINUX
	mov	[rsp + _gpr_save + 8*6], rsi
	mov	[rsp + _gpr_save + 8*7], rdi
%endif
	mov	[rsp + _rsp_save], rax  ; original SP

	; nothing to flush if no lanes are in use
	cmp	dword [state + _num_lanes_inuse_sha256], 0
	jz	return_null

	; find a lane with a non-null job
	xor	idx, idx

%assign I 1
%rep 15
	cmp	qword [state + _ldata_sha256 + (I * _HMAC_SHA1_LANE_DATA_size) + _job_in_lane], 0
	cmovne	idx, [rel APPEND(lane_,I)] ; cmov needs a mem source, hence lane_N constants
%assign I (I+1)
%endrep

copy_lane_data:
	; copy idx to empty lanes: point their data at lane "idx" and OR
	; their length up to 0xFFFF so they never win the min search
	vmovdqa	ymm0, [state + _lens_sha256]
	mov	tmp, [state + _args_data_ptr_sha256 + PTR_SZ*idx]

%assign I 0
%rep 16
	cmp	qword [state + _ldata_sha256 + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
	jne	APPEND(skip_,I)
	mov	[state + _args_data_ptr_sha256 + PTR_SZ*I], tmp
	vpor	ymm0, ymm0, [rel len_masks + 32*I]
APPEND(skip_,I):
%assign I (I+1)
%endrep

	vmovdqa	[state + _lens_sha256 ], ymm0

	; min length across lanes 0..7 (phminposuw covers 8 words at a time)
	vphminposuw	xmm1, xmm0
	vpextrw	DWORD(len2), xmm1, 0	; min value
	vpextrw	DWORD(idx), xmm1, 1	; min index (0...7)

	; min length across lanes 8..15
	vmovdqa	xmm2, [state + _lens_sha256 + 8*2]
	vphminposuw	xmm3, xmm2
	vpextrw	DWORD(len_upper), xmm3, 0 ; min value
	vpextrw	DWORD(idx_upper), xmm3, 1 ; min index (8...F)

	; keep whichever half produced the smaller minimum
	cmp	len2, len_upper
	jle	use_min

	vmovdqa	xmm1, xmm3
	mov	len2, len_upper
	mov	idx, idx_upper		; idx would be in range 0..7
	add	idx, 8			; to reflect that index is in 8..F range

use_min:
	cmp	len2, 0
	je	len_is_0

	; subtract the common min length from all 16 lane lengths
	vpbroadcastw	xmm1, xmm1	; duplicate words across all lanes
	vpsubw	xmm0, xmm0, xmm1
	vmovdqa	[state + _lens_sha256], xmm0
	vpsubw	xmm2, xmm2, xmm1
	vmovdqa	[state + _lens_sha256 + 8*2], xmm2

	; "state" and "args" are the same address, arg1
	; len is arg2
	call	sha256_x16_avx512
	; state and idx are intact

len_is_0:
	; process completed job "idx"
	imul	lane_data, idx, _HMAC_SHA1_LANE_DATA_size
	lea	lane_data, [state + _ldata_sha256 + lane_data]
	mov	DWORD(extra_blocks), [lane_data + _extra_blocks]
	cmp	extra_blocks, 0
	jne	proc_extra_blocks	; trailing partial data still pending
	cmp	dword [lane_data + _outer_done], 0
	jne	end_loop		; outer hash done -> job complete

proc_outer:
	; inner hash finished: byte-swap its digest into the outer block,
	; reload the lane digest with the precomputed H(key ^ opad), and
	; queue the single outer block for hashing
	mov	dword [lane_data + _outer_done], 1
	mov	DWORD(size_offset), [lane_data + _size_offset]
	mov	qword [lane_data + _extra_block + size_offset], 0
	mov	word [state + _lens_sha256 + 2*idx], 1 ; outer hash is one block
	lea	tmp, [lane_data + _outer_block]
	mov	[state + _args_data_ptr_sha256 + PTR_SZ*idx], tmp

	; gather lane idx's digest words from the transposed (row-major) layout
	vmovd	xmm0, [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
	vpinsrd	xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], 1
	vpinsrd	xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], 2
	vpinsrd	xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], 3
	vpshufb	xmm0, xmm0, [rel byteswap]
	vmovd	xmm1, [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE]
	vpinsrd	xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], 1
	vpinsrd	xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], 2
%ifndef SHA224
	vpinsrd	xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], 3
%endif
	vpshufb	xmm1, xmm1, [rel byteswap]

	vmovdqa	[lane_data + _outer_block], xmm0
	vmovdqa	[lane_data + _outer_block + 4*4], xmm1
%ifdef SHA224
	; SHA-224 digest is 28 bytes; padding byte 0x80 follows immediately
	mov	dword [lane_data + _outer_block + 7*4], 0x80
%endif

	; restart the lane digest from the precomputed opad state
	mov	job, [lane_data + _job_in_lane]
	mov	tmp, [job + _auth_key_xor_opad]
	vmovdqu	xmm0, [tmp]
	vmovdqu	xmm1, [tmp + 4*4]
	vmovd	[state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE], xmm0
	vpextrd	[state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], xmm0, 1
	vpextrd	[state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], xmm0, 2
	vpextrd	[state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], xmm0, 3
	vmovd	[state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE], xmm1
	vpextrd	[state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], xmm1, 1
	vpextrd	[state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], xmm1, 2
	vpextrd	[state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], xmm1, 3
	jmp	copy_lane_data

	align	16
proc_extra_blocks:
	; queue the lane's buffered trailing block(s) for hashing
	mov	DWORD(start_offset), [lane_data + _start_offset]
	mov	[state + _lens_sha256 + 2*idx], WORD(extra_blocks)
	lea	tmp, [lane_data + _extra_block + start_offset]
	mov	[state + _args_data_ptr_sha256 + PTR_SZ*idx], tmp
	mov	dword [lane_data + _extra_blocks], 0
	jmp	copy_lane_data

return_null:
	xor	job_rax, job_rax
	jmp	return

	align	16
end_loop:
	; job complete: mark it, return the lane to the free pool, and
	; write the (possibly truncated) auth tag
	mov	job_rax, [lane_data + _job_in_lane]
	mov	qword [lane_data + _job_in_lane], 0
	or	dword [job_rax + _status], STS_COMPLETED_HMAC
	mov	unused_lanes, [state + _unused_lanes_sha256]
	shl	unused_lanes, 4		; free-lane stack, 4 bits per lane index
	or	unused_lanes, idx
	mov	[state + _unused_lanes_sha256], unused_lanes

	sub	dword [state + _num_lanes_inuse_sha256], 1

	mov	p, [job_rax + _auth_tag_output]

	; fast path for the default truncated tag lengths
%ifdef SHA224
	cmp	qword [job_rax + _auth_tag_output_len_in_bytes], 14
	jne	copy_full_digest
%else
	cmp	qword [job_rax + _auth_tag_output_len_in_bytes], 16
	jne	copy_full_digest
%endif

	;; copy SHA224 14 bytes / SHA256 16 bytes
	mov	DWORD(tmp), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
	mov	DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE]
	mov	DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE]
	mov	DWORD(tmp5), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE]
	bswap	DWORD(tmp)
	bswap	DWORD(tmp2)
	bswap	DWORD(tmp4)
	bswap	DWORD(tmp5)
	mov	[p + 0*4], DWORD(tmp)
	mov	[p + 1*4], DWORD(tmp2)
	mov	[p + 2*4], DWORD(tmp4)
%ifdef SHA224
	mov	[p + 3*4], WORD(tmp5)	; only 2 of the last 4 bytes (14-byte tag)
%else
	mov	[p + 3*4], DWORD(tmp5)
%endif
	jmp	clear_ret

copy_full_digest:
	;; copy SHA224 28 bytes / SHA256 32 bytes
	mov	DWORD(tmp), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
	mov	DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE]
	mov	DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE]
	mov	DWORD(tmp5), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE]
	bswap	DWORD(tmp)
	bswap	DWORD(tmp2)
	bswap	DWORD(tmp4)
	bswap	DWORD(tmp5)
	mov	[p + 0*4], DWORD(tmp)
	mov	[p + 1*4], DWORD(tmp2)
	mov	[p + 2*4], DWORD(tmp4)
	mov	[p + 3*4], DWORD(tmp5)

	mov	DWORD(tmp), [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE]
	mov	DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE]
	mov	DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE]
%ifndef SHA224
	mov	DWORD(tmp5), [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE]
%endif
	bswap	DWORD(tmp)
	bswap	DWORD(tmp2)
	bswap	DWORD(tmp4)
%ifndef SHA224
	bswap	DWORD(tmp5)
%endif
	mov	[p + 4*4], DWORD(tmp)
	mov	[p + 5*4], DWORD(tmp2)
	mov	[p + 6*4], DWORD(tmp4)
%ifndef SHA224
	mov	[p + 7*4], DWORD(tmp5)	; 8th word exists for SHA-256 only
%endif

clear_ret:

%ifdef SAFE_DATA
	vpxorq	zmm0, zmm0

	;; Clear digest (28B/32B), outer_block (28B/32B) and extra_block (64B)
	;; of returned job and NULL jobs
%assign I 0
%rep 16
	cmp	qword [state + _ldata_sha256 + (I*_HMAC_SHA1_LANE_DATA_size) + _job_in_lane], 0
	jne	APPEND(skip_clear_,I)

	;; Clear digest (28 bytes for SHA-224, 32 bytes for SHA-256)
%assign J 0
%rep 7
	mov	dword [state + _args_digest_sha256 + SHA256_DIGEST_WORD_SIZE*I + J*SHA256_DIGEST_ROW_SIZE], 0
%assign J (J+1)
%endrep
%ifndef SHA224
	mov	dword [state + _args_digest_sha256 + SHA256_DIGEST_WORD_SIZE*I + 7*SHA256_DIGEST_ROW_SIZE], 0
%endif

	lea	lane_data, [state + _ldata_sha256 + (I*_HMAC_SHA1_LANE_DATA_size)]
	;; Clear first 64 bytes of extra_block
	vmovdqu64	[lane_data + _extra_block], zmm0

	;; Clear first 28 bytes (SHA-224) or 32 bytes (SHA-256) of outer_block
%ifdef SHA224
	vmovdqa64	[lane_data + _outer_block], xmm0
	mov	qword [lane_data + _outer_block + 16], 0
	mov	dword [lane_data + _outer_block + 24], 0
%else
	vmovdqu64	[lane_data + _outer_block], ymm0
%endif

APPEND(skip_clear_,I):
%assign I (I+1)
%endrep

%endif ;; SAFE_DATA

return:
	vzeroupper			; avoid AVX->SSE transition penalties in the caller

	mov	rbx, [rsp + _gpr_save + 8*0]
	mov	rbp, [rsp + _gpr_save + 8*1]
	mov	r12, [rsp + _gpr_save + 8*2]
	mov	r13, [rsp + _gpr_save + 8*3]
	mov	r14, [rsp + _gpr_save + 8*4]
	mov	r15, [rsp + _gpr_save + 8*5]
%ifndef LINUX
	mov	rsi, [rsp + _gpr_save + 8*6]
	mov	rdi, [rsp + _gpr_save + 8*7]
%endif
	mov	rsp, [rsp + _rsp_save]	; original SP

	ret

%ifdef LINUX
;; mark the stack non-executable for GNU toolchains
section .note.GNU-stack noalloc noexec nowrite progbits
%endif