;;
;; Copyright (c) 2018, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;;     * Redistributions of source code must retain the above copyright notice,
;;       this list of conditions and the following disclaimer.
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;     * Neither the name of Intel Corporation nor the names of its contributors
;;       may be used to endorse or promote products derived from this software
;;       without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;


%include "include/os.asm"
%include "job_aes_hmac.asm"
%include "mb_mgr_datastruct.asm"

%include "include/reg_sizes.asm"
%include "include/memcpy.asm"
%include "include/const.inc"
;%define DO_DBGPRINT
%include "include/dbgprint.asm"

%ifndef AES128_CBC_MAC

%define AES128_CBC_MAC aes128_cbc_mac_x4
%define SUBMIT_JOB_AES_CMAC_AUTH submit_job_aes_cmac_auth_sse
%define FLUSH_JOB_AES_CMAC_AUTH flush_job_aes_cmac_auth_sse

%endif

extern AES128_CBC_MAC

section .data
default rel

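;; len_masks: each entry selects one lane's 16-bit length word in the packed
;; lens vector; the flush path ORs a mask in to force an idle lane's length
;; to 0xFFFF so phminposuw never picks an empty lane as the minimum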
align 16
len_masks:
        ;ddq 0x0000000000000000000000000000FFFF
        dq 0x000000000000FFFF, 0x0000000000000000
        ;ddq 0x000000000000000000000000FFFF0000
        dq 0x00000000FFFF0000, 0x0000000000000000
        ;ddq 0x00000000000000000000FFFF00000000
        dq 0x0000FFFF00000000, 0x0000000000000000
        ;ddq 0x0000000000000000FFFF000000000000
        dq 0xFFFF000000000000, 0x0000000000000000
one:    dq 1
two:    dq 2
three:  dq 3

section .text

%define APPEND(a,b) a %+ b

%ifdef LINUX
%define arg1 rdi
%define arg2 rsi
%else
%define arg1 rcx
%define arg2 rdx
%endif

%define state arg1
%define job arg2
%define len2 arg2

%define job_rax rax

; idx needs to be in rbp
%define len rbp
%define idx rbp
%define tmp rbp

%define lane r8

%define iv r9
%define m_last r10
%define n r11

%define unused_lanes rbx
%define r rbx

%define tmp3 r12
%define tmp4 r13
%define tmp2 r14

%define good_lane r15
%define rbits r15

; STACK_SPACE needs to be an odd multiple of 8
; This routine and its callee clobber all GPRs
struc STACK
_gpr_save:      resq    8
_rsp_save:      resq    1
endstruc

;;; ===========================================================================
;;; ===========================================================================
;;; MACROS
;;; ===========================================================================
;;; ===========================================================================

;;; ===========================================================================
;;; AES CMAC job submit & flush
;;; ===========================================================================
;;; SUBMIT_FLUSH [in] - SUBMIT, FLUSH job selection
%macro GENERIC_SUBMIT_FLUSH_JOB_AES_CMAC_SSE 1
%define %%SUBMIT_FLUSH %1

        mov     rax, rsp
        sub     rsp, STACK_size
        and     rsp, -16

        mov     [rsp + _gpr_save + 8*0], rbx
        mov     [rsp + _gpr_save + 8*1], rbp
        mov     [rsp + _gpr_save + 8*2], r12
        mov     [rsp + _gpr_save + 8*3], r13
        mov     [rsp + _gpr_save + 8*4], r14
        mov     [rsp + _gpr_save + 8*5], r15
%ifndef LINUX
        mov     [rsp + _gpr_save + 8*6], rsi
        mov     [rsp + _gpr_save + 8*7], rdi
%endif
        mov     [rsp + _rsp_save], rax  ; original SP

        ;; Find free lane
        mov     unused_lanes, [state + _aes_cmac_unused_lanes]

%ifidn %%SUBMIT_FLUSH, SUBMIT

        mov     lane, unused_lanes
        and     lane, 0xF
        shr     unused_lanes, 4
        mov     [state + _aes_cmac_unused_lanes], unused_lanes
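        ;; unused_lanes is a nibble-packed LIFO of free lane ids (0-3) below a
        ;; 0xF sentinel; the low nibble is popped above to claim a lane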

        ;; Copy job info into lane
        mov     [state + _aes_cmac_job_in_lane + lane*8], job
        ;; Copy keys into lane args
        mov     tmp, [job + _key_expanded]
        mov     [state + _aes_cmac_args_keys + lane*8], tmp
        mov     tmp, lane
        shl     tmp, 4  ; lane*16

        ;; Zero IV to store digest (CBC-MAC accumulates the tag in the IV slot)
        pxor    xmm0, xmm0
        movdqa  [state + _aes_cmac_args_IV + tmp], xmm0

        lea     m_last, [state + _aes_cmac_scratch + tmp]

        ;; calculate len
        ;; convert bits to bytes (message length in bits for CMAC)
        mov     len, [job + _msg_len_to_hash_in_bits]
        mov     rbits, len
        add     len, 7          ; inc len if there are remainder bits
        shr     len, 3
        and     rbits, 7
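        ;; e.g. a 65-bit message gives len = (65 + 7) >> 3 = 9 bytes, rbits = 1
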
        ;; Compute number of blocks n = ceil(len / 16)
        mov     n, len
        add     n, 0xf
        shr     n, 4

        ;; Check for partial block
        mov     r, len
        and     r, 0xf

        or      n, n            ; n == 0 only for an empty message
        jz      %%_lt_one_block

        ;; One or more blocks, potentially partial
        mov     word [state + _aes_cmac_init_done + lane*2], 0

        mov     tmp2, [job + _src]
        add     tmp2, [job + _hash_start_src_offset_in_bytes]
        mov     [state + _aes_cmac_args_in + lane*8], tmp2

        ;; len = (n-1)*16
        lea     tmp2, [n - 1]
        shl     tmp2, 4
        movdqa  xmm0, [state + _aes_cmac_lens]
        XPINSRW xmm0, xmm1, tmp, lane, tmp2, scale_x16
        movdqa  [state + _aes_cmac_lens], xmm0
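        ;; only the first n-1 blocks go through the multi-buffer CBC-MAC pass;
        ;; the final block is prepared as M_last and processed on its own
        ;; (steps 5 and 6 of the AES-CMAC algorithm, RFC 4493)
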
        ;; check remainder bits
        or      rbits, rbits
        jnz     %%_not_complete_block_3gpp

        ;; check if complete block
        or      r, r
        jz      %%_complete_block

%%_not_complete_block:
        ;; M_last = padding(M_n) XOR K2
        lea     tmp, [rel padding_0x80_tab16 + 16]
        sub     tmp, r
        movdqu  xmm0, [tmp]
        movdqa  [m_last], xmm0
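        ;; assuming the const.inc layout of padding_0x80_tab16 (16 zero bytes,
        ;; 0x80, then 15 zero bytes), the 16-byte load at offset 16 - r places
        ;; 0x80 at byte r with zeros after it, i.e. the 10^j pad; the r message
        ;; bytes are then copied over the low bytes below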

        mov     tmp, [job + _src]
        add     tmp, [job + _hash_start_src_offset_in_bytes]
        lea     tmp3, [n - 1]
        shl     tmp3, 4
        add     tmp, tmp3

        memcpy_sse_16 m_last, tmp, r, tmp4, iv

        ;; XOR the padded last block with K2 (skey2)
        mov     tmp3, [job + _skey2]
        movdqa  xmm1, [m_last]
        movdqu  xmm0, [tmp3]
        pxor    xmm0, xmm1
        movdqa  [m_last], xmm0

%%_step_5:
        ;; Find min length
        movdqa  xmm0, [state + _aes_cmac_lens]
        phminposuw xmm1, xmm0

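        ;; submit only kicks off AES once all four lanes are busy; while free
        ;; lanes remain the job stays queued and NULL is returned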
        cmp     byte [state + _aes_cmac_unused_lanes], 0xf
        jne     %%_return_null

%else ; end SUBMIT

        ;; Check at least one job
        bt      unused_lanes, 19
        jc      %%_return_null
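        ;; bit 19 is only set in the 0xF3210 "all lanes free" reset pattern,
        ;; i.e. there is nothing to flush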

        ;; Find a lane with a non-null job
        xor     good_lane, good_lane
        cmp     qword [state + _aes_cmac_job_in_lane + 1*8], 0
        cmovne  good_lane, [rel one]
        cmp     qword [state + _aes_cmac_job_in_lane + 2*8], 0
        cmovne  good_lane, [rel two]
        cmp     qword [state + _aes_cmac_job_in_lane + 3*8], 0
        cmovne  good_lane, [rel three]

        ; Copy good_lane to empty lanes
        mov     tmp2, [state + _aes_cmac_args_in + good_lane*8]
        mov     tmp3, [state + _aes_cmac_args_keys + good_lane*8]
        shl     good_lane, 4 ; multiply by 16
        movdqa  xmm2, [state + _aes_cmac_args_IV + good_lane]
        movdqa  xmm0, [state + _aes_cmac_lens]

%assign I 0
%rep 4
        cmp     qword [state + _aes_cmac_job_in_lane + I*8], 0
        jne     APPEND(skip_,I)
        mov     [state + _aes_cmac_args_in + I*8], tmp2
        mov     [state + _aes_cmac_args_keys + I*8], tmp3
        movdqa  [state + _aes_cmac_args_IV + I*16], xmm2
        por     xmm0, [rel len_masks + 16*I]
APPEND(skip_,I):
%assign I (I+1)
%endrep
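        ;; idle lanes now point at the good lane's buffer and keys so the x4
        ;; CBC-MAC routine can process all four lanes safely; their ORed-in
        ;; 0xFFFF lengths keep them from ever being picked as the minimum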
        ;; Find min length
        phminposuw xmm1, xmm0

%endif ; end FLUSH

%%_cmac_round:
        pextrw  len2, xmm1, 0   ; min value
        pextrw  idx, xmm1, 1    ; min index (0...3)
        cmp     len2, 0
        je      %%_len_is_0
        pshuflw xmm1, xmm1, 0
        psubw   xmm0, xmm1
        movdqa  [state + _aes_cmac_lens], xmm0
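        ;; every lane length is reduced by the minimum; the winning lane
        ;; reaches zero and is completed once the call below returns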

        ; "state" and "args" are the same address, arg1
        ; len2 is arg2
        call    AES128_CBC_MAC
        ; state and idx are intact

        movdqa  xmm0, [state + _aes_cmac_lens]  ; preload lens
%%_len_is_0:
        ; Check if job complete
        test    word [state + _aes_cmac_init_done + idx*2], 0xffff
        jnz     %%_copy_complete_digest

        ; Finish step 6
        mov     word [state + _aes_cmac_init_done + idx*2], 1

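        ;; the first pass (init_done = 0) covered blocks 1..n-1; now point the
        ;; lane at M_last and run one final 16-byte block to produce the tag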
        XPINSRW xmm0, xmm1, tmp3, idx, 16, scale_x16
        movdqa  [state + _aes_cmac_lens], xmm0

        phminposuw xmm1, xmm0 ; find min length

        mov     tmp3, idx
        shl     tmp3, 4  ; idx*16
        lea     m_last, [state + _aes_cmac_scratch + tmp3]
        mov     [state + _aes_cmac_args_in + idx*8], m_last

        jmp     %%_cmac_round

%%_copy_complete_digest:
        ; Job complete, copy digest to auth tag (AT) output
        mov     job_rax, [state + _aes_cmac_job_in_lane + idx*8]

        mov     tmp4, idx
        shl     tmp4, 4
        lea     tmp3, [state + _aes_cmac_args_IV + tmp4]
        mov     tmp4, [job_rax + _auth_tag_output_len_in_bytes]
        mov     tmp2, [job_rax + _auth_tag_output]

        cmp     tmp4, 16
        jne     %%_ne_16_copy

        ;; 16 byte AT copy
        movdqu  xmm0, [tmp3]
        movdqu  [tmp2], xmm0
        jmp     %%_update_lanes

%%_ne_16_copy:
        memcpy_sse_16 tmp2, tmp3, tmp4, lane, iv

%%_update_lanes:
        ; Update unused lanes
        mov     unused_lanes, [state + _aes_cmac_unused_lanes]
        shl     unused_lanes, 4
        or      unused_lanes, idx
        mov     [state + _aes_cmac_unused_lanes], unused_lanes
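        ;; the freed lane id is pushed back onto the nibble-packed stack,
        ;; mirroring the pop done on submit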

        ; Set return job
        mov     job_rax, [state + _aes_cmac_job_in_lane + idx*8]

        mov     qword [state + _aes_cmac_job_in_lane + idx*8], 0
        or      dword [job_rax + _status], STS_COMPLETED_HMAC

%ifdef SAFE_DATA
        pxor    xmm0, xmm0
%ifidn %%SUBMIT_FLUSH, SUBMIT
        ;; Clear digest (held in the IV slot) and scratch memory of returned job
        movdqa  [tmp3], xmm0

        shl     idx, 4
        movdqa  [state + _aes_cmac_scratch + idx], xmm0

%else
        ;; Clear digest and scratch memory of returned job and "NULL lanes"
%assign I 0
%rep 4
        cmp     qword [state + _aes_cmac_job_in_lane + I*8], 0
        jne     APPEND(skip_clear_,I)
        movdqa  [state + _aes_cmac_args_IV + I*16], xmm0
        movdqa  [state + _aes_cmac_scratch + I*16], xmm0
APPEND(skip_clear_,I):
%assign I (I+1)
%endrep
%endif ;; SUBMIT

%endif ;; SAFE_DATA

%%_return:
        mov     rbx, [rsp + _gpr_save + 8*0]
        mov     rbp, [rsp + _gpr_save + 8*1]
        mov     r12, [rsp + _gpr_save + 8*2]
        mov     r13, [rsp + _gpr_save + 8*3]
        mov     r14, [rsp + _gpr_save + 8*4]
        mov     r15, [rsp + _gpr_save + 8*5]
%ifndef LINUX
        mov     rsi, [rsp + _gpr_save + 8*6]
        mov     rdi, [rsp + _gpr_save + 8*7]
%endif
        mov     rsp, [rsp + _rsp_save]  ; original SP
        ret

%%_return_null:
        xor     job_rax, job_rax
        jmp     %%_return

%ifidn %%SUBMIT_FLUSH, SUBMIT
%%_complete_block:

        ;; Block size aligned
        mov     tmp2, [job + _src]
        add     tmp2, [job + _hash_start_src_offset_in_bytes]
        lea     tmp3, [n - 1]
        shl     tmp3, 4
        add     tmp2, tmp3

        ;; M_last = M_n XOR K1
        mov     tmp3, [job + _skey1]
        movdqu  xmm0, [tmp3]
        movdqu  xmm1, [tmp2]
        pxor    xmm0, xmm1
        movdqa  [m_last], xmm0

        jmp     %%_step_5

%%_lt_one_block:
        ;; Empty message (n = 0): process as a single padded block
        mov     word [state + _aes_cmac_init_done + lane*2], 1
        mov     [state + _aes_cmac_args_in + lane*8], m_last

        movdqa  xmm0, [state + _aes_cmac_lens]
        XPINSRW xmm0, xmm1, tmp2, lane, 16, scale_x16
        movdqa  [state + _aes_cmac_lens], xmm0

        mov     n, 1
        jmp     %%_not_complete_block

%%_not_complete_block_3gpp:
        ;; bit pad last block
        ;; xor with skey2
        ;; copy to m_last

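        ;; The message bit length is not byte aligned: only the top rbits bits
        ;; of the final byte are valid.  Those bits are kept, a single 1 bit is
        ;; appended directly after them and the rest of the block is zero
        ;; filled (10* padding) before the XOR with K2.
        ;; e.g. rbits = 3: keep mask = ~(0xff >> 3) = 0xE0, pad bit = 0x10
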
        ;; load pointer to src
        mov     tmp, [job + _src]
        add     tmp, [job + _hash_start_src_offset_in_bytes]
        lea     tmp3, [n - 1]
        shl     tmp3, 4
        add     tmp, tmp3

        ;; check if partial block
        or      r, r
        jz      %%_load_full_block_3gpp

        simd_load_sse_15_1 xmm0, tmp, r
        dec     r

%%_update_mlast_3gpp:
        ;; set last byte padding mask
        ;; shift into correct xmm idx

        ;; save and restore rcx on windows
%ifndef LINUX
        mov     tmp, rcx
%endif
        mov     rcx, rbits
        mov     tmp3, 0xff
        shr     tmp3, cl
        movq    xmm2, tmp3
        XPSLLB  xmm2, r, xmm1, tmp2

        ;; pad final byte
        pandn   xmm2, xmm0
%ifndef LINUX
        mov     rcx, tmp
%endif
        ;; set OR mask to pad final bit
        mov     tmp2, tmp3
        shr     tmp2, 1
        xor     tmp2, tmp3 ; XOR to get OR mask
        movq    xmm3, tmp2
        ;; xmm1 contains shift table from previous shift
        pshufb  xmm3, xmm1

        ;; load skey2 address
        mov     tmp3, [job + _skey2]
        movdqu  xmm1, [tmp3]

        ;; set final padding bit
        por     xmm2, xmm3

        ;; XOR last partial block with skey2
        ;; update mlast
        pxor    xmm2, xmm1
        movdqa  [m_last], xmm2

        jmp     %%_step_5

%%_load_full_block_3gpp:
        movdqu  xmm0, [tmp]
        mov     r, 0xf
        jmp     %%_update_mlast_3gpp
%endif
%endmacro


align 64
; JOB_AES_HMAC * submit_job_aes_cmac_auth_sse(MB_MGR_CMAC_OOO *state, JOB_AES_HMAC *job)
; arg 1 : state
; arg 2 : job
MKGLOBAL(SUBMIT_JOB_AES_CMAC_AUTH,function,internal)
SUBMIT_JOB_AES_CMAC_AUTH:
        GENERIC_SUBMIT_FLUSH_JOB_AES_CMAC_SSE SUBMIT

; JOB_AES_HMAC * flush_job_aes_cmac_auth_sse(MB_MGR_CMAC_OOO *state)
; arg 1 : state
MKGLOBAL(FLUSH_JOB_AES_CMAC_AUTH,function,internal)
FLUSH_JOB_AES_CMAC_AUTH:
        GENERIC_SUBMIT_FLUSH_JOB_AES_CMAC_SSE FLUSH


%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif