;;
;; Copyright (c) 2018, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;;     * Redistributions of source code must retain the above copyright notice,
;;       this list of conditions and the following disclaimer.
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;     * Neither the name of Intel Corporation nor the names of its contributors
;;       may be used to endorse or promote products derived from this software
;;       without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;


%include "include/os.asm"
%include "job_aes_hmac.asm"
%include "mb_mgr_datastruct.asm"

%include "include/reg_sizes.asm"
%include "include/memcpy.asm"
%include "include/const.inc"
;%define DO_DBGPRINT
%include "include/dbgprint.asm"

%ifndef AES128_CBC_MAC

%define AES128_CBC_MAC aes128_cbc_mac_x4
%define SUBMIT_JOB_AES_CMAC_AUTH submit_job_aes_cmac_auth_sse
%define FLUSH_JOB_AES_CMAC_AUTH flush_job_aes_cmac_auth_sse

%endif

extern AES128_CBC_MAC
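
;; Note: this manager drives AES-CMAC authentication jobs through the 4-lane
;; SSE AES-128 CBC-MAC core declared above (AES128_CBC_MAC). Message lengths
;; arrive in bits; the *_3gpp paths below handle messages that are not a whole
;; number of bytes (bit-padded CMAC as used by 3GPP integrity algorithms).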

section .data
default rel

align 16
len_masks:
        ;ddq 0x0000000000000000000000000000FFFF
        dq 0x000000000000FFFF, 0x0000000000000000
        ;ddq 0x000000000000000000000000FFFF0000
        dq 0x00000000FFFF0000, 0x0000000000000000
        ;ddq 0x00000000000000000000FFFF00000000
        dq 0x0000FFFF00000000, 0x0000000000000000
        ;ddq 0x0000000000000000FFFF000000000000
        dq 0xFFFF000000000000, 0x0000000000000000
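
;; Each len_masks entry sets one lane's 16-bit slot in the packed
;; _aes_cmac_lens vector to 0xFFFF, so an idle lane is never selected as the
;; minimum by phminposuw during a flush.
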
one:    dq 1
two:    dq 2
three:  dq 3

section .text

%define APPEND(a,b) a %+ b

%ifdef LINUX
%define arg1    rdi
%define arg2    rsi
%else
%define arg1    rcx
%define arg2    rdx
%endif

%define state   arg1
%define job     arg2
%define len2    arg2

%define job_rax rax

; idx needs to be in rbp
%define len     rbp
%define idx     rbp
%define tmp     rbp

%define lane    r8

%define iv      r9
%define m_last  r10
%define n       r11

%define unused_lanes    rbx
%define r               rbx

%define tmp3    r12
%define tmp4    r13
%define tmp2    r14

%define good_lane       r15
%define rbits           r15

; STACK_SPACE needs to be an odd multiple of 8
; This routine and its callee clobber all GPRs
struc STACK
_gpr_save:      resq    8
_rsp_save:      resq    1
endstruc

;;; ===========================================================================
;;; ===========================================================================
;;; MACROS
;;; ===========================================================================
;;; ===========================================================================

;;; ===========================================================================
;;; AES CMAC job submit & flush
;;; ===========================================================================
;;; SUBMIT_FLUSH [in] - SUBMIT, FLUSH job selection
%macro GENERIC_SUBMIT_FLUSH_JOB_AES_CMAC_SSE 1
%define %%SUBMIT_FLUSH %1

        mov     rax, rsp
        sub     rsp, STACK_size
        and     rsp, -16

        mov     [rsp + _gpr_save + 8*0], rbx
        mov     [rsp + _gpr_save + 8*1], rbp
        mov     [rsp + _gpr_save + 8*2], r12
        mov     [rsp + _gpr_save + 8*3], r13
        mov     [rsp + _gpr_save + 8*4], r14
        mov     [rsp + _gpr_save + 8*5], r15
%ifndef LINUX
        mov     [rsp + _gpr_save + 8*6], rsi
        mov     [rsp + _gpr_save + 8*7], rdi
%endif
        mov     [rsp + _rsp_save], rax  ; original SP

        ;; Find free lane
        mov     unused_lanes, [state + _aes_cmac_unused_lanes]
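        ;; unused_lanes packs free lane ids one per 4-bit nibble, with the
        ;; next free lane in the lowest nibble (the manager presumably
        ;; initializes it to 0xF3210, so a low byte of 0xF means all four
        ;; lanes are busy and a set bit 19 means all four lanes are free).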

%ifidn %%SUBMIT_FLUSH, SUBMIT

        mov     lane, unused_lanes
        and     lane, 0xF
        shr     unused_lanes, 4
        mov     [state + _aes_cmac_unused_lanes], unused_lanes

        ;; Copy job info into lane
        mov     [state + _aes_cmac_job_in_lane + lane*8], job
        ;; Copy keys into lane args
        mov     tmp, [job + _key_expanded]
        mov     [state + _aes_cmac_args_keys + lane*8], tmp
        mov     tmp, lane
        shl     tmp, 4  ; lane*16

        ;; Zero IV to store digest
        pxor    xmm0, xmm0
        movdqa  [state + _aes_cmac_args_IV + tmp], xmm0

        lea     m_last, [state + _aes_cmac_scratch + tmp]

        ;; calculate len
        ;; convert bits to bytes (message length in bits for CMAC)
        mov     len, [job + _msg_len_to_hash_in_bits]
        mov     rbits, len
        add     len, 7  ; inc len if there are remainder bits
        shr     len, 3
        and     rbits, 7
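
        ;; At this point len = ceil(bit_length/8) bytes and rbits holds the
        ;; 0-7 trailing bits of a message that is not byte-aligned; a non-zero
        ;; rbits selects the bit-padded (*_3gpp) path further down.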

        ;; Check at least 1 or more blocks (get n)
        mov     n, len
        add     n, 0xf
        shr     n, 4

        ;; Check for partial block
        mov     r, len
        and     r, 0xf

        or      n, n    ; check one or more blocks?
        jz      %%_lt_one_block

        ;; One or more blocks, potentially partial
        mov     word [state + _aes_cmac_init_done + lane*2], 0

        mov     tmp2, [job + _src]
        add     tmp2, [job + _hash_start_src_offset_in_bytes]
        mov     [state + _aes_cmac_args_in + lane*8], tmp2

        ;; len = (n-1)*16
        lea     tmp2, [n - 1]
        shl     tmp2, 4
        movdqa  xmm0, [state + _aes_cmac_lens]
        XPINSRW xmm0, xmm1, tmp, lane, tmp2, scale_x16
        movdqa  [state + _aes_cmac_lens], xmm0
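
        ;; Only the first n-1 complete blocks are queued for the multi-buffer
        ;; CBC-MAC core; the final block is turned into M_last below (XORed
        ;; with K1 or K2 depending on whether it is complete) and is hashed in
        ;; a second pass once init_done is set.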

        ;; check remainder bits
        or      rbits, rbits
        jnz     %%_not_complete_block_3gpp

        ;; check if complete block
        or      r, r
        jz      %%_complete_block

%%_not_complete_block:
        ;; M_last = padding(M_n) XOR K2
        lea     tmp, [rel padding_0x80_tab16 + 16]
        sub     tmp, r
        movdqu  xmm0, [tmp]
        movdqa  [m_last], xmm0
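        ;; (padding_0x80_tab16 is defined elsewhere in the library; the load
        ;; from offset 16-r is expected to yield 0x80 at byte index r and
        ;; zeros above it, i.e. the 10* padding of the final block, and the r
        ;; message bytes are copied over the low bytes just below)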

        mov     tmp, [job + _src]
        add     tmp, [job + _hash_start_src_offset_in_bytes]
        lea     tmp3, [n - 1]
        shl     tmp3, 4
        add     tmp, tmp3

        memcpy_sse_16 m_last, tmp, r, tmp4, iv

        ;; src + n + r
        mov     tmp3, [job + _skey2]
        movdqa  xmm1, [m_last]
        movdqu  xmm0, [tmp3]
        pxor    xmm0, xmm1
        movdqa  [m_last], xmm0

%%_step_5:
        ;; Find min length
        movdqa  xmm0, [state + _aes_cmac_lens]
        phminposuw xmm1, xmm0

        cmp     byte [state + _aes_cmac_unused_lanes], 0xf
        jne     %%_return_null
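
        ;; Submit only kicks off processing once all four lanes are occupied
        ;; (low byte of unused_lanes == 0xF); otherwise return NULL and let
        ;; later submits or a flush drive the lanes.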

%else ; end SUBMIT

        ;; Check at least one job
        bt      unused_lanes, 19
        jc      %%_return_null

        ;; Find a lane with a non-null job
        xor     good_lane, good_lane
        cmp     qword [state + _aes_cmac_job_in_lane + 1*8], 0
        cmovne  good_lane, [rel one]
        cmp     qword [state + _aes_cmac_job_in_lane + 2*8], 0
        cmovne  good_lane, [rel two]
        cmp     qword [state + _aes_cmac_job_in_lane + 3*8], 0
        cmovne  good_lane, [rel three]

        ; Copy good_lane to empty lanes
        mov     tmp2, [state + _aes_cmac_args_in + good_lane*8]
        mov     tmp3, [state + _aes_cmac_args_keys + good_lane*8]
        shl     good_lane, 4 ; multiply by 16
        movdqa  xmm2, [state + _aes_cmac_args_IV + good_lane]
        movdqa  xmm0, [state + _aes_cmac_lens]

%assign I 0
%rep 4
        cmp     qword [state + _aes_cmac_job_in_lane + I*8], 0
        jne     APPEND(skip_,I)
        mov     [state + _aes_cmac_args_in + I*8], tmp2
        mov     [state + _aes_cmac_args_keys + I*8], tmp3
        movdqa  [state + _aes_cmac_args_IV + I*16], xmm2
        por     xmm0, [rel len_masks + 16*I]
APPEND(skip_,I):
%assign I (I+1)
%endrep
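
        ;; Empty lanes now alias the good lane's input/keys/IV so the 4-lane
        ;; CBC-MAC core always reads valid data, and their length slots are
        ;; forced to 0xFFFF (len_masks) so phminposuw never picks them.
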
        ;; Find min length
        phminposuw xmm1, xmm0

%endif ; end FLUSH

%%_cmac_round:
        pextrw  len2, xmm1, 0   ; min value
        pextrw  idx, xmm1, 1    ; min index (0...3)
        cmp     len2, 0
        je      %%_len_is_0
        pshuflw xmm1, xmm1, 0
        psubw   xmm0, xmm1
        movdqa  [state + _aes_cmac_lens], xmm0

        ; "state" and "args" are the same address, arg1
        ; len2 is arg2
        call    AES128_CBC_MAC
        ; state and idx are intact

        movdqa  xmm0, [state + _aes_cmac_lens] ; preload lens
%%_len_is_0:
        ; Check if job complete
        test    word [state + _aes_cmac_init_done + idx*2], 0xffff
        jnz     %%_copy_complete_digest

        ; Finish step 6
        mov     word [state + _aes_cmac_init_done + idx*2], 1

        XPINSRW xmm0, xmm1, tmp3, idx, 16, scale_x16
        movdqa  [state + _aes_cmac_lens], xmm0

        phminposuw xmm1, xmm0 ; find min length

        mov     tmp3, idx
        shl     tmp3, 4 ; idx*16
        lea     m_last, [state + _aes_cmac_scratch + tmp3]
        mov     [state + _aes_cmac_args_in + idx*8], m_last
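
        ;; First pass for this lane is done (init_done was 0): redirect the
        ;; lane's input to the prepared M_last scratch block, give it a
        ;; length of 16 and loop back, so the final CMAC block (step 6) is
        ;; processed in the next CBC-MAC round.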

        jmp     %%_cmac_round

%%_copy_complete_digest:
        ; Job complete, copy digest to AT output
        mov     job_rax, [state + _aes_cmac_job_in_lane + idx*8]

        mov     tmp4, idx
        shl     tmp4, 4
        lea     tmp3, [state + _aes_cmac_args_IV + tmp4]
        mov     tmp4, [job_rax + _auth_tag_output_len_in_bytes]
        mov     tmp2, [job_rax + _auth_tag_output]

        cmp     tmp4, 16
        jne     %%_ne_16_copy

        ;; 16 byte AT copy
        movdqu  xmm0, [tmp3]
        movdqu  [tmp2], xmm0
        jmp     %%_update_lanes

%%_ne_16_copy:
        memcpy_sse_16 tmp2, tmp3, tmp4, lane, iv

%%_update_lanes:
        ; Update unused lanes
        mov     unused_lanes, [state + _aes_cmac_unused_lanes]
        shl     unused_lanes, 4
        or      unused_lanes, idx
        mov     [state + _aes_cmac_unused_lanes], unused_lanes

        ; Set return job
        mov     job_rax, [state + _aes_cmac_job_in_lane + idx*8]

        mov     qword [state + _aes_cmac_job_in_lane + idx*8], 0
        or      dword [job_rax + _status], STS_COMPLETED_HMAC

%ifdef SAFE_DATA
        pxor    xmm0, xmm0
%ifidn %%SUBMIT_FLUSH, SUBMIT
        ;; Clear digest (in memory for IV) and scratch memory of returned job
        movdqa  [tmp3], xmm0

        shl     idx, 4
        movdqa  [state + _aes_cmac_scratch + idx], xmm0

%else
        ;; Clear digest and scratch memory of returned job and "NULL lanes"
%assign I 0
%rep 4
        cmp     qword [state + _aes_cmac_job_in_lane + I*8], 0
        jne     APPEND(skip_clear_,I)
        movdqa  [state + _aes_cmac_args_IV + I*16], xmm0
        movdqa  [state + _aes_cmac_scratch + I*16], xmm0
APPEND(skip_clear_,I):
%assign I (I+1)
%endrep
%endif ;; SUBMIT

%endif ;; SAFE_DATA

%%_return:
        mov     rbx, [rsp + _gpr_save + 8*0]
        mov     rbp, [rsp + _gpr_save + 8*1]
        mov     r12, [rsp + _gpr_save + 8*2]
        mov     r13, [rsp + _gpr_save + 8*3]
        mov     r14, [rsp + _gpr_save + 8*4]
        mov     r15, [rsp + _gpr_save + 8*5]
%ifndef LINUX
        mov     rsi, [rsp + _gpr_save + 8*6]
        mov     rdi, [rsp + _gpr_save + 8*7]
%endif
        mov     rsp, [rsp + _rsp_save]  ; original SP
        ret

%%_return_null:
        xor     job_rax, job_rax
        jmp     %%_return

%ifidn %%SUBMIT_FLUSH, SUBMIT
%%_complete_block:

        ;; Block size aligned
        mov     tmp2, [job + _src]
        add     tmp2, [job + _hash_start_src_offset_in_bytes]
        lea     tmp3, [n - 1]
        shl     tmp3, 4
        add     tmp2, tmp3

        ;; M_last = M_n XOR K1
        mov     tmp3, [job + _skey1]
        movdqu  xmm0, [tmp3]
        movdqu  xmm1, [tmp2]
        pxor    xmm0, xmm1
        movdqa  [m_last], xmm0

        jmp     %%_step_5

%%_lt_one_block:
        ;; Single partial block
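        ;; (message shorter than one block: mark init_done up front, point the
        ;; lane's input at the scratch buffer with a length of 16, and reuse
        ;; the partial-block path above to build the padded, K2-XORed M_last
        ;; as the only block processed)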
        mov     word [state + _aes_cmac_init_done + lane*2], 1
        mov     [state + _aes_cmac_args_in + lane*8], m_last

        movdqa  xmm0, [state + _aes_cmac_lens]
        XPINSRW xmm0, xmm1, tmp2, lane, 16, scale_x16
        movdqa  [state + _aes_cmac_lens], xmm0

        mov     n, 1
        jmp     %%_not_complete_block

%%_not_complete_block_3gpp:
        ;; bit pad last block
        ;; xor with skey2
        ;; copy to m_last
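        ;;
        ;; For bit-length messages (rbits != 0) padding is applied at bit
        ;; granularity: the message bits of the last byte are kept (MSB
        ;; first), the next bit is set to 1 and all remaining bits of M_last
        ;; are cleared before the XOR with K2, i.e. the usual CMAC padding
        ;; rule applied to a non-byte-aligned message.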

        ;; load pointer to src
        mov     tmp, [job + _src]
        add     tmp, [job + _hash_start_src_offset_in_bytes]
        lea     tmp3, [n - 1]
        shl     tmp3, 4
        add     tmp, tmp3

        ;; check if partial block
        or      r, r
        jz      %%_load_full_block_3gpp

        simd_load_sse_15_1 xmm0, tmp, r
        dec     r

%%_update_mlast_3gpp:
        ;; set last byte padding mask
        ;; shift into correct xmm idx

        ;; save and restore rcx on windows
%ifndef LINUX
        mov     tmp, rcx
%endif
        mov     rcx, rbits
        mov     tmp3, 0xff
        shr     tmp3, cl
        movq    xmm2, tmp3
        XPSLLB  xmm2, r, xmm1, tmp2

        ;; pad final byte
        pandn   xmm2, xmm0
%ifndef LINUX
        mov     rcx, tmp
%endif
        ;; set OR mask to pad final bit
        mov     tmp2, tmp3
        shr     tmp2, 1
        xor     tmp2, tmp3 ; XOR to get OR mask
        movq    xmm3, tmp2
        ;; xmm1 contains shift table from previous shift
        pshufb  xmm3, xmm1

        ;; load skey2 address
        mov     tmp3, [job + _skey2]
        movdqu  xmm1, [tmp3]

        ;; set final padding bit
        por     xmm2, xmm3

        ;; XOR last partial block with skey2
        ;; update mlast
        pxor    xmm2, xmm1
        movdqa  [m_last], xmm2

        jmp     %%_step_5

%%_load_full_block_3gpp:
        movdqu  xmm0, [tmp]
        mov     r, 0xf
        jmp     %%_update_mlast_3gpp
%endif
%endmacro


align 64
; JOB_AES_HMAC * submit_job_aes_cmac_auth_sse(MB_MGR_CMAC_OOO *state, JOB_AES_HMAC *job)
; arg 1 : state
; arg 2 : job
MKGLOBAL(SUBMIT_JOB_AES_CMAC_AUTH,function,internal)
SUBMIT_JOB_AES_CMAC_AUTH:
        GENERIC_SUBMIT_FLUSH_JOB_AES_CMAC_SSE SUBMIT

; JOB_AES_HMAC * flush_job_aes_cmac_auth_sse(MB_MGR_CMAC_OOO *state)
; arg 1 : state
MKGLOBAL(FLUSH_JOB_AES_CMAC_AUTH,function,internal)
FLUSH_JOB_AES_CMAC_AUTH:
        GENERIC_SUBMIT_FLUSH_JOB_AES_CMAC_SSE FLUSH


%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif