;;
;; Copyright (c) 2018, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;;     * Redistributions of source code must retain the above copyright notice,
;;       this list of conditions and the following disclaimer.
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;     * Neither the name of Intel Corporation nor the names of its contributors
;;       may be used to endorse or promote products derived from this software
;;       without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;


%include "os.asm"
%include "job_aes_hmac.asm"
%include "mb_mgr_datastruct.asm"

%include "reg_sizes.asm"
%include "memcpy.asm"
%include "const.inc"
;%define DO_DBGPRINT
%include "dbgprint.asm"

%ifndef AES128_CBC_MAC

%define AES128_CBC_MAC aes128_cbc_mac_x4
%define SUBMIT_JOB_AES_CMAC_AUTH submit_job_aes_cmac_auth_sse
%define FLUSH_JOB_AES_CMAC_AUTH flush_job_aes_cmac_auth_sse

%endif

extern AES128_CBC_MAC

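;; AES128_CBC_MAC defaults to aes128_cbc_mac_x4, a core that runs AES-128
;; CBC-MAC over four independent buffers (lanes) in parallel. This manager's
;; job is to keep those four lanes fed and to apply the CMAC-specific
;; final-block handling around that core.
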
section .data
default rel

align 16
len_masks:
        ;ddq 0x0000000000000000000000000000FFFF
        dq 0x000000000000FFFF, 0x0000000000000000
        ;ddq 0x000000000000000000000000FFFF0000
        dq 0x00000000FFFF0000, 0x0000000000000000
        ;ddq 0x00000000000000000000FFFF00000000
        dq 0x0000FFFF00000000, 0x0000000000000000
        ;ddq 0x0000000000000000FFFF000000000000
        dq 0xFFFF000000000000, 0x0000000000000000
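;; One mask per lane: ORing a mask into the lens vector forces that lane's
;; 16-bit length word to 0xFFFF, so the phminposuw minimum search in the
;; flush path can never select an idle lane.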
one:    dq 1
two:    dq 2
three:  dq 3

section .text

%define APPEND(a,b) a %+ b

%ifdef LINUX
%define arg1    rdi
%define arg2    rsi
%else
%define arg1    rcx
%define arg2    rdx
%endif

%define state   arg1
%define job     arg2
%define len2    arg2

%define job_rax rax

; idx needs to be in rbp
%define len     rbp
%define idx     rbp
%define tmp     rbp

%define lane    r8

%define iv      r9
%define m_last  r10
%define n       r11

%define unused_lanes    rbx
%define r       rbx

%define tmp3    r12
%define tmp4    r13
%define tmp2    r14

%define flag    r15
%define good_lane       r15

; STACK_SPACE needs to be an odd multiple of 8
; This routine and its callees clobber all GPRs
struc STACK
_gpr_save:      resq    8
_rsp_save:      resq    1
endstruc
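;; The frame holds the six callee-saved GPRs (plus rsi/rdi on Windows) and
;; the caller's original rsp, which is restored on exit after the prologue's
;; 16-byte stack alignment.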

;;; ===========================================================================
;;; ===========================================================================
;;; MACROS
;;; ===========================================================================
;;; ===========================================================================

;;; ===========================================================================
;;; AES CMAC job submit & flush
;;; ===========================================================================
;;; SUBMIT_FLUSH [in] - SUBMIT, FLUSH job selection
%macro GENERIC_SUBMIT_FLUSH_JOB_AES_CMAC_SSE 1
%define %%SUBMIT_FLUSH %1

        mov     rax, rsp
        sub     rsp, STACK_size
        and     rsp, -16

        mov     [rsp + _gpr_save + 8*0], rbx
        mov     [rsp + _gpr_save + 8*1], rbp
        mov     [rsp + _gpr_save + 8*2], r12
        mov     [rsp + _gpr_save + 8*3], r13
        mov     [rsp + _gpr_save + 8*4], r14
        mov     [rsp + _gpr_save + 8*5], r15
%ifndef LINUX
        mov     [rsp + _gpr_save + 8*6], rsi
        mov     [rsp + _gpr_save + 8*7], rdi
%endif
        mov     [rsp + _rsp_save], rax  ; original SP

        ;; Find free lane
        mov     unused_lanes, [state + _aes_cmac_unused_lanes]
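        ;; unused_lanes is a stack of free lane indices packed one per nibble:
        ;; the low nibble holds the next free lane. Submit pops a lane
        ;; (and 0xF, shr 4); job completion pushes it back (shl 4, or idx).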

%ifidn %%SUBMIT_FLUSH, SUBMIT
        mov     flag, 0

        mov     lane, unused_lanes
        and     lane, 0xF
        shr     unused_lanes, 4
        mov     [state + _aes_cmac_unused_lanes], unused_lanes

        ;; Copy job info into lane
        mov     [state + _aes_cmac_job_in_lane + lane*8], job
        ;; Copy keys into lane args
        mov     tmp, [job + _key_expanded]
        mov     [state + _aes_cmac_args_keys + lane*8], tmp
        mov     tmp, lane
        shl     tmp, 4  ; lane*16

        ;; Zero IV to store digest
        pxor    xmm0, xmm0
        movdqa  [state + _aes_cmac_args_IV + tmp], xmm0

        lea     m_last, [state + _aes_cmac_scratch + tmp]

        ;; Number of blocks: n = ceil(len / 16)
        mov     len, [job + _msg_len_to_hash_in_bytes]
        mov     n, len
        add     n, 0xf
        shr     n, 4

        ;; Size of partial tail block: r = len mod 16
        mov     r, len
        and     r, 0xf

        or      n, n            ; any blocks at all?
        jz      %%_lt_one_block

        ;; One or more blocks, potentially partial
        mov     word [state + _aes_cmac_init_done + lane*2], 0

        mov     tmp2, [job + _src]
        add     tmp2, [job + _hash_start_src_offset_in_bytes]
        mov     [state + _aes_cmac_args_in + lane*8], tmp2

        ;; len = (n-1)*16: the bulk blocks; M_last is handled separately
        lea     tmp2, [n - 1]
        shl     tmp2, 4
        movdqa  xmm0, [state + _aes_cmac_lens]
        XPINSRW xmm0, xmm1, tmp, lane, tmp2, scale_x16
        movdqa  [state + _aes_cmac_lens], xmm0

        ;; If r == 0, the final block is complete (flag set at the target)
        or      r, r
        jz      %%_complete_block

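        ;; CMAC final block (NIST SP 800-38B): if the message ends on a block
        ;; boundary, M_last = M_n XOR K1; otherwise M_last is the tail padded
        ;; with 0x80 and zeros, XORed with K2. K1/K2 arrive pre-derived in the
        ;; job's _skey1/_skey2 fields.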
%%_not_complete_block:
        ;; M_last = padding(M_n) XOR K2
        lea     tmp, [rel padding_0x80_tab16 + 16]
        sub     tmp, r
        movdqu  xmm0, [tmp]
        movdqa  [m_last], xmm0
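        ;; Loading 16 bytes at padding_0x80_tab16 + 16 - r yields a vector
        ;; with 0x80 at byte r and zeros elsewhere; the first r bytes are
        ;; then overwritten with the message tail just below.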

        mov     tmp, [job + _src]
        add     tmp, [job + _hash_start_src_offset_in_bytes]
        lea     tmp3, [n - 1]
        shl     tmp3, 4
        add     tmp, tmp3       ; tmp = src + (n-1)*16, start of the tail

        memcpy_sse_16 m_last, tmp, r, tmp4, iv

        ;; M_last = padding(M_n) XOR K2
        mov     tmp3, [job + _skey2]
        movdqa  xmm1, [m_last]
        movdqu  xmm0, [tmp3]
        pxor    xmm0, xmm1
        movdqa  [m_last], xmm0

%%_step_5:
        ;; Find min length
        movdqa  xmm0, [state + _aes_cmac_lens]
        phminposuw xmm1, xmm0
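        ;; phminposuw leaves the minimum of the eight unsigned words of xmm0
        ;; in word 0 of xmm1 and its index in word 1; the lens vector tracks
        ;; each lane's outstanding byte count, so this picks the lane that
        ;; will finish first.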

        cmp     byte [state + _aes_cmac_unused_lanes], 0xf
        jne     %%_return_null  ; keep collecting jobs until all lanes are busy

%else ; end SUBMIT

        ;; Check at least one job
        bt      unused_lanes, 19
        jc      %%_return_null
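        ;; Bit 19 is the top bit of the fifth nibble of the lane stack; with
        ;; the manager's initial nibble layout it is only set once all four
        ;; lane indices have been pushed back, i.e. no job is in flight and
        ;; there is nothing to flush.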

        ;; Find a lane with a non-null job
        xor     good_lane, good_lane
        cmp     qword [state + _aes_cmac_job_in_lane + 1*8], 0
        cmovne  good_lane, [rel one]
        cmp     qword [state + _aes_cmac_job_in_lane + 2*8], 0
        cmovne  good_lane, [rel two]
        cmp     qword [state + _aes_cmac_job_in_lane + 3*8], 0
        cmovne  good_lane, [rel three]
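        ;; Duplicate the good lane's input pointer, key pointer and IV into
        ;; every idle lane so the 4-lane core has valid state to run on, and
        ;; force idle lanes' lengths to 0xFFFF (len_masks) so the minimum
        ;; search below only ever selects real jobs.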

        ; Copy good_lane to empty lanes
        mov     tmp2, [state + _aes_cmac_args_in + good_lane*8]
        mov     tmp3, [state + _aes_cmac_args_keys + good_lane*8]
        shl     good_lane, 4 ; multiply by 16
        movdqa  xmm2, [state + _aes_cmac_args_IV + good_lane]
        movdqa  xmm0, [state + _aes_cmac_lens]

%assign I 0
%rep 4
        cmp     qword [state + _aes_cmac_job_in_lane + I*8], 0
        jne     APPEND(skip_,I)
        mov     [state + _aes_cmac_args_in + I*8], tmp2
        mov     [state + _aes_cmac_args_keys + I*8], tmp3
        movdqa  [state + _aes_cmac_args_IV + I*16], xmm2
        por     xmm0, [rel len_masks + 16*I]
APPEND(skip_,I):
%assign I (I+1)
%endrep
        ;; Find min length
        phminposuw xmm1, xmm0

%endif ; end FLUSH


%%_cmac_round:
        pextrw  len2, xmm1, 0   ; min value
        pextrw  idx, xmm1, 1    ; min index (0...3)
        cmp     len2, 0
        je      %%_len_is_0
        pshuflw xmm1, xmm1, 0   ; broadcast min length to all four lanes
        psubw   xmm0, xmm1      ; and subtract it from every lane's length
        movdqa  [state + _aes_cmac_lens], xmm0

        ; "state" and "args" are the same address, arg1
        ; len2 is arg2
        call    AES128_CBC_MAC
        ; state and idx are intact

%%_len_is_0:
        ; Check if job complete
        test    word [state + _aes_cmac_init_done + idx*2], 0xffff
        jnz     %%_copy_complete_digest
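        ;; init_done == 0 means only the bulk (n-1) blocks have been hashed.
        ;; Re-queue the lane for one final 16-byte pass over M_last ("step 6"
        ;; in the code's numbering); once that completes, the digest sitting
        ;; in the lane's IV slot is the finished CMAC.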

        ; Finish step 6
        mov     word [state + _aes_cmac_init_done + idx*2], 1

        movdqa  xmm0, [state + _aes_cmac_lens]
        XPINSRW xmm0, xmm1, tmp3, idx, 16, scale_x16
        movdqa  [state + _aes_cmac_lens], xmm0

        phminposuw xmm1, xmm0 ; find min length

        mov     tmp3, idx
        shl     tmp3, 4 ; idx*16
        lea     m_last, [state + _aes_cmac_scratch + tmp3]
        mov     [state + _aes_cmac_args_in + idx*8], m_last

        jmp     %%_cmac_round

%%_copy_complete_digest:
        ; Job complete, copy digest to auth tag output
        mov     job_rax, [state + _aes_cmac_job_in_lane + idx*8]

        mov     tmp4, idx
        shl     tmp4, 4
        lea     tmp3, [state + _aes_cmac_args_IV + tmp4]
        mov     tmp4, [job_rax + _auth_tag_output_len_in_bytes]
        mov     tmp2, [job_rax + _auth_tag_output]

        cmp     tmp4, 16
        jne     %%_ne_16_copy

        ;; 16 byte auth tag copy
        movdqu  xmm0, [tmp3]
        movdqu  [tmp2], xmm0
        jmp     %%_update_lanes

%%_ne_16_copy:
        memcpy_sse_16 tmp2, tmp3, tmp4, lane, iv

%%_update_lanes:
        ; Update unused lanes
        mov     unused_lanes, [state + _aes_cmac_unused_lanes]
        shl     unused_lanes, 4
        or      unused_lanes, idx
        mov     [state + _aes_cmac_unused_lanes], unused_lanes

        ; Set return job
        mov     job_rax, [state + _aes_cmac_job_in_lane + idx*8]

        mov     qword [state + _aes_cmac_job_in_lane + idx*8], 0
        or      dword [job_rax + _status], STS_COMPLETED_HMAC

%%_return:
        mov     rbx, [rsp + _gpr_save + 8*0]
        mov     rbp, [rsp + _gpr_save + 8*1]
        mov     r12, [rsp + _gpr_save + 8*2]
        mov     r13, [rsp + _gpr_save + 8*3]
        mov     r14, [rsp + _gpr_save + 8*4]
        mov     r15, [rsp + _gpr_save + 8*5]
%ifndef LINUX
        mov     rsi, [rsp + _gpr_save + 8*6]
        mov     rdi, [rsp + _gpr_save + 8*7]
%endif
        mov     rsp, [rsp + _rsp_save]  ; original SP
        ret

%%_return_null:
        xor     job_rax, job_rax
        jmp     %%_return

%ifidn %%SUBMIT_FLUSH, SUBMIT
%%_complete_block:
        mov     flag, 1

        ;; Block size aligned
        mov     tmp2, [job + _src]
        add     tmp2, [job + _hash_start_src_offset_in_bytes]
        lea     tmp3, [n - 1]
        shl     tmp3, 4
        add     tmp2, tmp3

        ;; M_last = M_n XOR K1
        mov     tmp3, [job + _skey1]
        movdqu  xmm0, [tmp3]
        movdqu  xmm1, [tmp2]
        pxor    xmm0, xmm1
        movdqa  [m_last], xmm0

        jmp     %%_step_5

%%_lt_one_block:
        ;; Single partial block
        mov     word [state + _aes_cmac_init_done + lane*2], 1
        mov     [state + _aes_cmac_args_in + lane*8], m_last
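        ;; The whole message fits in M_last, so the single padded block is
        ;; the only CBC-MAC pass needed: mark init_done up front (no separate
        ;; "step 6" round) and point the lane's input at the scratch buffer.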

        movdqa  xmm0, [state + _aes_cmac_lens]
        XPINSRW xmm0, xmm1, tmp2, lane, 16, scale_x16
        movdqa  [state + _aes_cmac_lens], xmm0

        mov     n, 1
        jmp     %%_not_complete_block
%endif
%endmacro


align 64
; JOB_AES_HMAC * submit_job_aes_cmac_auth_sse(MB_MGR_CMAC_OOO *state, JOB_AES_HMAC *job)
; arg 1 : state
; arg 2 : job
MKGLOBAL(SUBMIT_JOB_AES_CMAC_AUTH,function,internal)
SUBMIT_JOB_AES_CMAC_AUTH:
        GENERIC_SUBMIT_FLUSH_JOB_AES_CMAC_SSE SUBMIT

; JOB_AES_HMAC * flush_job_aes_cmac_auth_sse(MB_MGR_CMAC_OOO *state)
; arg 1 : state
MKGLOBAL(FLUSH_JOB_AES_CMAC_AUTH,function,internal)
FLUSH_JOB_AES_CMAC_AUTH:
        GENERIC_SUBMIT_FLUSH_JOB_AES_CMAC_SSE FLUSH


%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif