]> git.proxmox.com Git - ceph.git/blob - ceph/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_ccm_auth_submit_flush_avx.asm
update source to Ceph Pacific 16.2.2
[ceph.git] / ceph / src / spdk / intel-ipsec-mb / avx / mb_mgr_aes_ccm_auth_submit_flush_avx.asm
1 ;;
2 ;; Copyright (c) 2019, Intel Corporation
3 ;;
4 ;; Redistribution and use in source and binary forms, with or without
5 ;; modification, are permitted provided that the following conditions are met:
6 ;;
7 ;; * Redistributions of source code must retain the above copyright notice,
8 ;; this list of conditions and the following disclaimer.
9 ;; * Redistributions in binary form must reproduce the above copyright
10 ;; notice, this list of conditions and the following disclaimer in the
11 ;; documentation and/or other materials provided with the distribution.
12 ;; * Neither the name of Intel Corporation nor the names of its contributors
13 ;; may be used to endorse or promote products derived from this software
14 ;; without specific prior written permission.
15 ;;
16 ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20 ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 ;;
27
28
29 %include "include/os.asm"
30 %include "job_aes_hmac.asm"
31 %include "mb_mgr_datastruct.asm"
32
33 %include "include/reg_sizes.asm"
34 %include "include/const.inc"
35 %include "include/memcpy.asm"
36
37 %ifndef AES128_CBC_MAC
38
39 %define AES128_CBC_MAC aes128_cbc_mac_x8
40 %define SUBMIT_JOB_AES_CCM_AUTH submit_job_aes_ccm_auth_avx
41 %define FLUSH_JOB_AES_CCM_AUTH flush_job_aes_ccm_auth_avx
42
43 %endif
44
45 extern AES128_CBC_MAC
46
47 section .data
48 default rel
49
50 align 16
;; Rounds a 64-bit length down to a multiple of 16 bytes (AES block size).
51 len_mask:
52 dq 0xFFFFFFFFFFFFFFF0
53 align 16
;; Per-lane word masks ORed into the packed lane-length vector: entry I has
;; 0xFFFF in 16-bit word position I, so an empty lane's length becomes 0xFFFF
;; and vphminposuw never selects it as the minimum.
54 len_masks:
55 dq 0x000000000000FFFF, 0x0000000000000000
56 dq 0x00000000FFFF0000, 0x0000000000000000
57 dq 0x0000FFFF00000000, 0x0000000000000000
58 dq 0xFFFF000000000000, 0x0000000000000000
59 dq 0x0000000000000000, 0x000000000000FFFF
60 dq 0x0000000000000000, 0x00000000FFFF0000
61 dq 0x0000000000000000, 0x0000FFFF00000000
62 dq 0x0000000000000000, 0xFFFF000000000000
;; vpshufb control that broadcasts word 0 of an XMM register to all 8 words
;; (used to splat the minimum length before the packed subtract).
63 dupw:
64 dq 0x0100010001000100, 0x0100010001000100
;; Converts CCM init block 0 into CTR counter block 0:
;; byte 0 AND 0x07 keeps only the L' field (clears the Adata and M' flag
;; bits), bytes 1-13 (nonce) are preserved, bytes 14-15 are zeroed so the
;; counter value is 0.
65 counter_mask:
66 dq 0xFFFFFFFFFFFFFF07, 0x0000FFFFFFFFFFFF
;; Lane indices 1..7 as qwords; memory sources for cmovne in the flush path.
67 one: dq 1
68 two: dq 2
69 three: dq 3
70 four: dq 4
71 five: dq 5
72 six: dq 6
73 seven: dq 7
74
75 section .text
76
;; Token-pasting helper used to generate per-iteration labels (skip_0, ...).
77 %define APPEND(a,b) a %+ b
78
79 %define NROUNDS 9 ; AES-CCM-128 (AES-128: 9 full rounds + final round)
80 %ifdef LINUX
81 %define arg1 rdi
82 %define arg2 rsi
83 %else
84 %define arg1 rcx
85 %define arg2 rdx
86 %endif
87
88 %define state arg1
89 %define job arg2
90 %define len2 arg2
91
;; NOTE(review): several aliases below share one physical register (e.g.
;; job_rax/tmp4/auth_len_aad on rax, min_idx/flags on rbp). The aliased
;; values appear never to be live at the same time — keep this in mind
;; before reordering any code in the macro below.
92 %define job_rax rax
93 %define tmp4 rax
94 %define auth_len_aad rax
95
96 %define min_idx rbp
97 %define flags rbp
98
99 %define lane r8
100
101 %define iv_len r9
102 %define auth_len r9
103
104 %define aad_len r10
105 %define init_block_addr r11
106
107 %define unused_lanes rbx
108 %define r rbx
109
110 %define tmp r12
111 %define tmp2 r13
112 %define tmp3 r14
113
114 %define good_lane r15
115 %define min_job r15
116
117 %define init_block0 xmm0
118 %define ccm_lens xmm1
119 %define min_len_idx xmm2
120 %define xtmp0 xmm3
121 %define xtmp1 xmm4
122 %define xtmp2 xmm5
123 %define xtmp3 xmm6
124
125 ; STACK_SPACE needs to be an odd multiple of 8
126 ; This routine and its callee clobbers all GPRs
;; Local stack frame: spill area for the callee-saved GPRs plus the caller's
;; original RSP, which is restored verbatim on exit (the frame itself is
;; 16-byte aligned via 'and rsp, -16' in the prologue).
127 struc STACK
128 _gpr_save: resq 8
129 _rsp_save: resq 1
130 endstruc
131
132 ;;; ===========================================================================
133 ;;; ===========================================================================
134 ;;; MACROS
135 ;;; ===========================================================================
136 ;;; ===========================================================================
137
;; ENCRYPT_SINGLE_BLOCK KEYS, BLOCK
;; AES-128 encrypts one 16-byte block in place:
;;   BLOCK = AES-128-ENCRYPT(KEYS, BLOCK)
;; KEYS  - GPR holding a pointer to the expanded key schedule
;;         (11 round keys, 16 bytes apart)
;; BLOCK - XMM register: plaintext on entry, ciphertext on exit
%macro ENCRYPT_SINGLE_BLOCK 2
%define %%KEYS  %1
%define %%BLOCK %2

        ;; initial AddRoundKey with round key 0
        vpxor   %%BLOCK, [%%KEYS + 16*0]
%assign round 1
        ;; NROUNDS (9) full AES rounds for AES-128
%rep NROUNDS
        vaesenc %%BLOCK, [%%KEYS + 16*round]
%assign round (round + 1)
%endrep
        ;; final round (no MixColumns)
        vaesenclast %%BLOCK, [%%KEYS + 16*round]
%endmacro
150
151 ;;; ===========================================================================
152 ;;; AES CCM auth job submit & flush
153 ;;; ===========================================================================
154 ;;; SUBMIT_FLUSH [in] - SUBMIT, FLUSH job selection
;;; Generates the body of either the submit or the flush entry point for the
;;; 8-lane AVX AES128-CBC-MAC (CCM authentication) out-of-order manager.
;;; In:   arg1 = MB_MGR_CCM_OOO pointer ("state")
;;;       arg2 = JOB_AES_HMAC pointer ("job", SUBMIT variant only)
;;; Out:  rax  = completed JOB_AES_HMAC pointer, or NULL if none completed
;;; Clobbers all GPRs; callee-saved GPRs are spilled into the local STACK
;;; frame and restored in %%_return. The CBC-MAC digest for each lane lives
;;; in that lane's _aes_ccm_args_IV slot.
155 %macro GENERIC_SUBMIT_FLUSH_JOB_AES_CCM_AUTH_AVX 1
156 %define %%SUBMIT_FLUSH %1
157
;; Prologue: build a 16-byte-aligned frame, save callee-saved GPRs and the
;; caller's original RSP.
158 mov rax, rsp
159 sub rsp, STACK_size
160 and rsp, -16
161
162 mov [rsp + _gpr_save + 8*0], rbx
163 mov [rsp + _gpr_save + 8*1], rbp
164 mov [rsp + _gpr_save + 8*2], r12
165 mov [rsp + _gpr_save + 8*3], r13
166 mov [rsp + _gpr_save + 8*4], r14
167 mov [rsp + _gpr_save + 8*5], r15
168 %ifndef LINUX
169 mov [rsp + _gpr_save + 8*6], rsi
170 mov [rsp + _gpr_save + 8*7], rdi
171 %endif
172 mov [rsp + _rsp_save], rax ; original SP
173
174 ;; Find free lane
175 mov unused_lanes, [state + _aes_ccm_unused_lanes]
176
177 %ifidn %%SUBMIT_FLUSH, SUBMIT
178
;; Pop the next free lane index off the unused_lanes nibble stack.
179 mov lane, unused_lanes
180 and lane, 15
181 shr unused_lanes, 4
182 mov [state + _aes_ccm_unused_lanes], unused_lanes
183
184 ;; Copy job info into lane
185 mov [state + _aes_ccm_job_in_lane + lane*8], job
186 ;; Copy keys into lane args
187 mov tmp, [job + _aes_enc_key_expanded]
188 mov [state + _aes_ccm_args_keys + lane*8], tmp
189 ;; init_done = 0
190 mov word [state + _aes_ccm_init_done + lane*2], 0
;; tmp = lane*8, so [... + tmp*2] below addresses the lane's 16-byte IV slot.
191 lea tmp, [lane * 8]
192
;; CBC-MAC starts from an all-zero IV (the digest accumulator).
193 vpxor init_block0, init_block0
194 vmovdqa [state + _aes_ccm_args_IV + tmp*2], init_block0
195
196 ;; Prepare initial Block 0 for CBC-MAC-128
197
198 ;; Byte 0: flags with L' and M' (AAD later)
199 ;; Calculate L' = 15 - IV length - 1 = 14 - IV length
200 mov flags, 14
201 mov iv_len, [job + _iv_len_in_bytes]
202 sub flags, iv_len
203 ;; Calculate M' = (Digest length - 2) / 2
204 mov tmp, [job + _auth_tag_output_len_in_bytes]
205 sub tmp, 2
206
;; M' occupies bits 3-5 of the flags byte, hence the net shift left by 2.
207 shl tmp, 2 ; M' << 3 (combine 1xshr, to div by 2, and 3xshl)
208 or flags, tmp
209
210 ;; Bytes 1 - 13: Nonce (7 - 13 bytes long)
211
212 ;; Bytes 1 - 7 are always copied (first 7 bytes)
213 mov tmp, [job + _iv]
214 vpinsrb init_block0, [tmp], 1
215 vpinsrw init_block0, [tmp + 1], 1
216 vpinsrd init_block0, [tmp + 3], 1
217
218 cmp iv_len, 7
219 je %%_finish_nonce_move
220
;; Dispatch on remaining nonce length; the labels below fall through so each
;; entry point copies exactly the bytes still missing.
221 cmp iv_len, 8
222 je %%_iv_length_8
223 cmp iv_len, 9
224 je %%_iv_length_9
225 cmp iv_len, 10
226 je %%_iv_length_10
227 cmp iv_len, 11
228 je %%_iv_length_11
229 cmp iv_len, 12
230 je %%_iv_length_12
231
232 ;; Bytes 8 - 13
233 %%_iv_length_13:
234 vpinsrb init_block0, [tmp + 12], 13
235 %%_iv_length_12:
236 vpinsrb init_block0, [tmp + 11], 12
237 %%_iv_length_11:
238 vpinsrd init_block0, [tmp + 7], 2
239 jmp %%_finish_nonce_move
240 %%_iv_length_10:
241 vpinsrb init_block0, [tmp + 9], 10
242 %%_iv_length_9:
243 vpinsrb init_block0, [tmp + 8], 9
244 %%_iv_length_8:
245 vpinsrb init_block0, [tmp + 7], 8
246
247 %%_finish_nonce_move:
248
249 ;; Bytes 14 & 15 (message length), in Big Endian
250 mov ax, [job + _msg_len_to_hash_in_bytes]
251 xchg al, ah
252 vpinsrw init_block0, ax, 7
253
254 mov aad_len, [job + _cbcmac_aad_len]
255 ;; Initial length to authenticate (Block 0)
256 mov auth_len, 16
257 ;; Length to authenticate (Block 0 + len(AAD) (2B) + AAD padded,
258 ;; so total length is a multiple of 16B - the AES block size)
;; auth_len_aad = 16 + round_up(2 + aad_len, 16)
259 lea auth_len_aad, [aad_len + (2 + 15) + 16]
260 and auth_len_aad, -16
261
262 or aad_len, aad_len
263 cmovne auth_len, auth_len_aad
264 ;; Update lengths to authenticate and find min length
265 vmovdqa ccm_lens, [state + _aes_ccm_lens]
266 XVPINSRW ccm_lens, xtmp0, tmp2, lane, auth_len, scale_x16
267 vmovdqa [state + _aes_ccm_lens], ccm_lens
268 vphminposuw min_len_idx, ccm_lens
269
;; Each lane owns a 64-byte init-block area (lane*64).
270 mov tmp, lane
271 shl tmp, 6
272 lea init_block_addr, [state + _aes_ccm_init_blocks + tmp]
273 or aad_len, aad_len
274 je %%_aad_complete
275
276 or flags, (1 << 6) ; Set Adata bit in flags
277
278 ;; Copy AAD
279 ;; Set all 0s in last block (padding)
280 lea tmp, [init_block_addr + auth_len]
281 sub tmp, 16
282 vpxor xtmp0, xtmp0
283 vmovdqa [tmp], xtmp0
284
;; AAD is prefixed by its 2-byte big-endian length, then copied after it.
285 ;; Start copying from second block
286 lea tmp, [init_block_addr+16]
287 mov rax, aad_len
288 xchg al, ah
289 mov [tmp], ax
290 add tmp, 2
291 mov tmp2, [job + _cbcmac_aad]
292 memcpy_avx_64_1 tmp, tmp2, aad_len, tmp3, tmp4, xtmp0, xtmp1, xtmp2, xtmp3
293
294 %%_aad_complete:
295
296 ;; Finish Block 0 with Byte 0
297 vpinsrb init_block0, BYTE(flags), 0
298 vmovdqa [init_block_addr], init_block0
299
300 mov [state + _aes_ccm_args_in + lane * 8], init_block_addr
301
;; Submit only returns a job once every lane is busy: at that point only the
;; sentinel nibble (0xF) remains in the low byte of unused_lanes.
302 cmp byte [state + _aes_ccm_unused_lanes], 0xf
303 jne %%_return_null
304
305 %else ; end SUBMIT
306
;; NOTE(review): when all 8 lanes are free, unused_lanes holds the free-lane
;; nibble stack plus a sentinel nibble with bit 35 set — nothing to flush.
;; (Initial value comes from the MB_MGR init code, not visible here; confirm.)
307 ;; Check at least one job
308 bt unused_lanes, 35
309 jc %%_return_null
310
;; Lane 0 is the default; each occupied higher lane overrides it via cmovne.
311 ;; Find a lane with a non-null job
312 xor good_lane, good_lane
313 cmp QWORD [state + _aes_ccm_job_in_lane + 1*8], 0
314 cmovne good_lane, [rel one]
315 cmp QWORD [state + _aes_ccm_job_in_lane + 2*8], 0
316 cmovne good_lane, [rel two]
317 cmp QWORD [state + _aes_ccm_job_in_lane + 3*8], 0
318 cmovne good_lane, [rel three]
319 cmp qword [state + _aes_ccm_job_in_lane + 4*8], 0
320 cmovne good_lane, [rel four]
321 cmp qword [state + _aes_ccm_job_in_lane + 5*8], 0
322 cmovne good_lane, [rel five]
323 cmp qword [state + _aes_ccm_job_in_lane + 6*8], 0
324 cmovne good_lane, [rel six]
325 cmp qword [state + _aes_ccm_job_in_lane + 7*8], 0
326 cmovne good_lane, [rel seven]
327
328 ; Copy good_lane to empty lanes
329 movzx tmp, word [state + _aes_ccm_init_done + good_lane*2]
330 mov tmp2, [state + _aes_ccm_args_in + good_lane*8]
331 mov tmp3, [state + _aes_ccm_args_keys + good_lane*8]
332 shl good_lane, 4 ; multiply by 16
333 vmovdqa xtmp0, [state + _aes_ccm_args_IV + good_lane]
334 vmovdqa ccm_lens, [state + _aes_ccm_lens]
335
;; For every empty lane: force its length word to 0xFFFF (so it is never the
;; minimum) and point its args at the good lane's data so it does harmless
;; duplicate work while the real lanes finish.
336 %assign I 0
337 %rep 8
338 cmp qword [state + _aes_ccm_job_in_lane + I*8], 0
339 jne APPEND(skip_,I)
340 vpor ccm_lens, [rel len_masks + 16*I]
341 mov [state + _aes_ccm_init_done + I*2], WORD(tmp)
342 mov [state + _aes_ccm_args_in + I*8], tmp2
343 mov [state + _aes_ccm_args_keys + I*8], tmp3
344 vmovdqa [state + _aes_ccm_args_IV + I*16], xtmp0
345 APPEND(skip_,I):
346 %assign I (I+1)
347 %endrep
348 vmovdqa [state + _aes_ccm_lens], ccm_lens
349 ;; Find min length
350 vphminposuw min_len_idx, ccm_lens
351
352 %endif ; end FLUSH
353
;; Main multi-buffer loop: find the lane with the minimum remaining length,
;; run all 8 lanes for that many bytes, then service the finished lane.
354 %%_ccm_round:
355 vpextrw len2, min_len_idx, 0 ; min value
356 vpextrw min_idx, min_len_idx, 1 ; min index (0...7)
357
358 mov min_job, [state + _aes_ccm_job_in_lane + min_idx*8]
359
;; Zero min length -> nothing to hash this round; go service min-idx lane.
360 or len2, len2
361 je %%_len_is_0
362 ;; subtract min length from all lengths
363 vpshufb min_len_idx, min_len_idx, [rel dupw] ; broadcast min length
364 vpsubw ccm_lens, min_len_idx
365 vmovdqa [state + _aes_ccm_lens], ccm_lens
366
367 ; "state" and "args" are the same address, arg1
368 ; len2 is arg2
369 call AES128_CBC_MAC
370 ; state and min_idx are intact
371
372 %%_len_is_0:
373
;; init_done state machine: 0 = only block 0/AAD hashed, 1 = full message
;; blocks queued, 2 = final partial block queued -> digest is ready.
374 movzx tmp, WORD [state + _aes_ccm_init_done + min_idx*2]
375 cmp WORD(tmp), 0
376 je %%_prepare_full_blocks_to_auth
377 cmp WORD(tmp), 1
378 je %%_prepare_partial_block_to_auth
379
380 %%_encrypt_digest:
381
;; tmp = min_idx*8; the scaled addressing below yields min_idx*64 (init
;; block area), min_idx*16 (IV slot) and min_idx*8 (pointer arrays).
382 ;; Set counter block 0 (reusing previous initial block 0)
383 mov tmp, min_idx
384 shl tmp, 3
385 vmovdqa init_block0, [state + _aes_ccm_init_blocks + tmp * 8]
386
;; Clear the Adata/M' flag bits and the 2-byte length field to turn block 0
;; into CTR counter block 0 (L' bits and nonce are preserved).
387 vpand init_block0, [rel counter_mask]
388
389 mov tmp2, [state + _aes_ccm_args_keys + tmp]
390 ENCRYPT_SINGLE_BLOCK tmp2, init_block0
;; tag = E(K, counter0) XOR CBC-MAC digest (digest lives in the IV slot).
391 vpxor init_block0, [state + _aes_ccm_args_IV + tmp * 2]
392
393 ;; Copy Mlen bytes into auth_tag_output (Mlen = 4,6,8,10,12,14,16)
394 mov min_job, [state + _aes_ccm_job_in_lane + tmp]
395 mov tmp3, [min_job + _auth_tag_output_len_in_bytes]
396 mov tmp2, [min_job + _auth_tag_output]
397
398 simd_store_avx tmp2, init_block0, tmp3, tmp, tmp4
399 %%_update_lanes:
;; Push the now-free lane index back onto the unused_lanes nibble stack.
400 ; Update unused lanes
401 mov unused_lanes, [state + _aes_ccm_unused_lanes]
402 shl unused_lanes, 4
403 or unused_lanes, min_idx
404 mov [state + _aes_ccm_unused_lanes], unused_lanes
405
406 ; Set return job
407 mov job_rax, min_job
408
409 mov qword [state + _aes_ccm_job_in_lane + min_idx*8], 0
410 or dword [job_rax + _status], STS_COMPLETED_HMAC
411
412 %ifdef SAFE_DATA
413 vpxor xtmp0, xtmp0
414 %ifidn %%SUBMIT_FLUSH, SUBMIT
;; min_idx is pre-scaled to min_idx*8 so *8/*2 scales reach the 64-byte
;; init-block area and the 16-byte IV slot of the returned lane.
415 shl min_idx, 3
416 ;; Clear digest (in memory for CBC IV), counter block 0 and AAD of returned job
417 vmovdqa [state + _aes_ccm_args_IV + min_idx * 2], xtmp0
418 vmovdqa [state + _aes_ccm_init_blocks + min_idx * 8], xtmp0
419 vmovdqa [state + _aes_ccm_init_blocks + min_idx * 8 + 16], xtmp0
420 vmovdqa [state + _aes_ccm_init_blocks + min_idx * 8 + 32], xtmp0
421 vmovdqa [state + _aes_ccm_init_blocks + min_idx * 8 + 48], xtmp0
422 mov qword [state + _aes_ccm_args_keys + min_idx], 0
423 %else
424 ;; Clear digest (in memory for CBC IV), counter block 0 and AAD
425 ;; of returned job and "NULL lanes"
426 %assign I 0
427 %rep 8
428 cmp qword [state + _aes_ccm_job_in_lane + I*8], 0
429 jne APPEND(skip_clear_,I)
430 vmovdqa [state + _aes_ccm_args_IV + I*16], xtmp0
431 vmovdqa [state + _aes_ccm_init_blocks + I*64], xtmp0
432 vmovdqa [state + _aes_ccm_init_blocks + I*64 + 16], xtmp0
433 vmovdqa [state + _aes_ccm_init_blocks + I*64 + 32], xtmp0
434 vmovdqa [state + _aes_ccm_init_blocks + I*64 + 48], xtmp0
435 mov qword [state + _aes_ccm_args_keys + I*8], 0
436 APPEND(skip_clear_,I):
437 %assign I (I+1)
438 %endrep
439
440 %endif ;; SUBMIT
441 %endif ;; SAFE_DATA
442
;; Epilogue: restore callee-saved GPRs and the caller's original RSP.
443 %%_return:
444 mov rbx, [rsp + _gpr_save + 8*0]
445 mov rbp, [rsp + _gpr_save + 8*1]
446 mov r12, [rsp + _gpr_save + 8*2]
447 mov r13, [rsp + _gpr_save + 8*3]
448 mov r14, [rsp + _gpr_save + 8*4]
449 mov r15, [rsp + _gpr_save + 8*5]
450 %ifndef LINUX
451 mov rsi, [rsp + _gpr_save + 8*6]
452 mov rdi, [rsp + _gpr_save + 8*7]
453 %endif
454 mov rsp, [rsp + _rsp_save] ; original SP
455 ret
456
457 %%_return_null:
458 xor job_rax, job_rax
459 jmp %%_return
460
;; init_done 0 -> 1: queue the message's full 16-byte blocks for hashing.
461 %%_prepare_full_blocks_to_auth:
462
463 cmp dword [min_job + _cipher_direction], 2 ; DECRYPT
464 je %%_decrypt
465
466 %%_encrypt:
467 mov tmp, [min_job + _src]
468 add tmp, [min_job + _hash_start_src_offset_in_bytes]
469 jmp %%_set_init_done_1
470
;; DECRYPT: authenticate the plaintext, which is produced at _dst.
471 %%_decrypt:
472 mov tmp, [min_job + _dst]
473
474 %%_set_init_done_1:
475 mov [state + _aes_ccm_args_in + min_idx*8], tmp
476 mov word [state + _aes_ccm_init_done + min_idx*2], 1
477
478 ; Check if there are full blocks to hash
479 mov tmp, [min_job + _msg_len_to_hash_in_bytes]
480 and tmp, -16
481 je %%_prepare_partial_block_to_auth
482
483 ;; Update lengths to authenticate and find min length
484 vmovdqa ccm_lens, [state + _aes_ccm_lens]
485 XVPINSRW ccm_lens, xtmp0, tmp2, min_idx, tmp, scale_x16
486 vphminposuw min_len_idx, ccm_lens
487 vmovdqa [state + _aes_ccm_lens], ccm_lens
488
489 jmp %%_ccm_round
490
;; init_done 1 -> 2: queue the zero-padded final partial block (if any).
491 %%_prepare_partial_block_to_auth:
492 ; Check if partial block needs to be hashed
493 mov auth_len, [min_job + _msg_len_to_hash_in_bytes]
494 and auth_len, 15
495 je %%_encrypt_digest
496
497 mov word [state + _aes_ccm_init_done + min_idx * 2], 2
498 ;; Update lengths to authenticate and find min length
499 vmovdqa ccm_lens, [state + _aes_ccm_lens]
500 XVPINSRW ccm_lens, xtmp0, tmp2, min_idx, 16, scale_x16
501 vphminposuw min_len_idx, ccm_lens
502 vmovdqa [state + _aes_ccm_lens], ccm_lens
503
;; Stage the partial block in the lane's scratch area (second 16-byte slot
;; of its init-block area); simd_load_avx_15_1 zero-pads the upper bytes.
504 mov tmp2, min_idx
505 shl tmp2, 6
506 add tmp2, 16 ; pb[AES_BLOCK_SIZE]
507 lea init_block_addr, [state + _aes_ccm_init_blocks + tmp2]
508 mov tmp2, [state + _aes_ccm_args_in + min_idx * 8]
509
510 simd_load_avx_15_1 xtmp0, tmp2, auth_len
511
512 %%_finish_partial_block_copy:
513 vmovdqa [init_block_addr], xtmp0
514 mov [state + _aes_ccm_args_in + min_idx * 8], init_block_addr
515
516 jmp %%_ccm_round
517 %endmacro
518
519
520 align 64
521 ; JOB_AES_HMAC * submit_job_aes_ccm_auth_avx(MB_MGR_CCM_OOO *state, JOB_AES_HMAC *job)
522 ; arg 1 : state
523 ; arg 2 : job
;; Queues a new CCM-auth job into a free lane. Returns a completed job in
;; rax, or NULL while lanes are still filling (jobs complete out of order).
524 MKGLOBAL(SUBMIT_JOB_AES_CCM_AUTH,function,internal)
525 SUBMIT_JOB_AES_CCM_AUTH:
526 GENERIC_SUBMIT_FLUSH_JOB_AES_CCM_AUTH_AVX SUBMIT
527
528 ; JOB_AES_HMAC * flush_job_aes_ccm_auth_avx(MB_MGR_CCM_OOO *state)
529 ; arg 1 : state
;; Forces completion of an outstanding job by duplicating a valid lane's
;; arguments into the empty lanes. Returns the job, or NULL if none pending.
530 MKGLOBAL(FLUSH_JOB_AES_CCM_AUTH,function,internal)
531 FLUSH_JOB_AES_CCM_AUTH:
532 GENERIC_SUBMIT_FLUSH_JOB_AES_CCM_AUTH_AVX FLUSH
533
534
535 %ifdef LINUX
;; Mark the stack non-executable in ELF builds.
536 section .note.GNU-stack noalloc noexec nowrite progbits
537 %endif