;; Source: git.proxmox.com Git - ceph.git - ceph/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_xcbc_submit_sse.asm
;; update source to Ceph Pacific 16.2.2
;; [ceph.git] / ceph / src / spdk / intel-ipsec-mb / sse / mb_mgr_aes_xcbc_submit_sse.asm
1 ;;
2 ;; Copyright (c) 2012-2018, Intel Corporation
3 ;;
4 ;; Redistribution and use in source and binary forms, with or without
5 ;; modification, are permitted provided that the following conditions are met:
6 ;;
7 ;; * Redistributions of source code must retain the above copyright notice,
8 ;; this list of conditions and the following disclaimer.
9 ;; * Redistributions in binary form must reproduce the above copyright
10 ;; notice, this list of conditions and the following disclaimer in the
11 ;; documentation and/or other materials provided with the distribution.
12 ;; * Neither the name of Intel Corporation nor the names of its contributors
13 ;; may be used to endorse or promote products derived from this software
14 ;; without specific prior written permission.
15 ;;
16 ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20 ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 ;;
27
%include "include/os.asm"
%include "include/const.inc"
%include "job_aes_hmac.asm"
%include "mb_mgr_datastruct.asm"

%include "include/reg_sizes.asm"
%include "include/memcpy.asm"

;; Default to the 4-lane SSE AES-XCBC core unless an including file has
;; already selected a different width/arch variant of these names.
%ifndef AES_XCBC_X4
%define AES_XCBC_X4 aes_xcbc_mac_128_x4
%define SUBMIT_JOB_AES_XCBC submit_job_aes_xcbc_sse
%endif

; void AES_XCBC_X4(AES_XCBC_ARGS_x8 *args, UINT64 len_in_bytes);
extern AES_XCBC_X4
section .data
default rel                             ; all data references are RIP-relative

;; Padding constant: a single 0x80 byte followed by zeros. Appended after
;; the last message byte when the final block is partial (10* padding,
;; RFC 3566 AES-XCBC-MAC case E2).
align 16
x80:    ;ddq 0x00000000000000000000000000000080
        dq 0x0000000000000080, 0x0000000000000000
section .text

;; ABI argument registers (System V AMD64 vs Microsoft x64)
%ifdef LINUX
%define arg1    rdi
%define arg2    rsi
%else
%define arg1    rcx
%define arg2    rdx
%endif

%define state   arg1            ; MB_MGR_AES_XCBC_OOO *state
%define job     arg2            ; JOB_AES_HMAC *job (on entry)
%define len2    arg2            ; reused later as the min-length arg to AES_XCBC_X4

%define job_rax rax             ; JOB* return value

; idx needs to be in rbp
%define idx             rbp
%define last_len        rbp     ; shares rbp with idx - never live at the same time

%define lane            r8

%define icv             r9
%define p2              r9      ; shares r9 with icv - never live at the same time

%define tmp             r10
%define len             r11
%define lane_data       r12
%define p               r13
%define tmp2            r14

%define unused_lanes    rbx

; STACK_SPACE needs to be an odd multiple of 8
; This routine and its callee clobbers all GPRs
struc STACK
_gpr_save:      resq    8       ; rbx, rbp, r12-r15 (+ rsi, rdi on Windows)
_rsp_save:      resq    1       ; caller's rsp before 16-byte alignment
endstruc
; JOB* SUBMIT_JOB_AES_XCBC(MB_MGR_AES_XCBC_OOO *state, JOB_AES_HMAC *job)
; arg 1 : state
; arg 2 : job
;
; Queues the job into a free lane of the multi-buffer out-of-order manager.
; The final block is pre-built at submit time (XOR'ed with K2 for a full
; final block, or 10*-padded and XOR'ed with K3 for a partial one). The
; 4-lane AES-XCBC core is only invoked once all lanes are occupied; the
; routine then returns the job that finished, or NULL if lanes remain free.
; Clobbers all GPRs (callee-saved ones are preserved via the stack frame).
MKGLOBAL(SUBMIT_JOB_AES_XCBC,function,internal)
SUBMIT_JOB_AES_XCBC:

        mov     rax, rsp
        sub     rsp, STACK_size
        and     rsp, -16                ; 16B-align frame (original SP kept in rax)

        mov     [rsp + _gpr_save + 8*0], rbx
        mov     [rsp + _gpr_save + 8*1], rbp
        mov     [rsp + _gpr_save + 8*2], r12
        mov     [rsp + _gpr_save + 8*3], r13
        mov     [rsp + _gpr_save + 8*4], r14
        mov     [rsp + _gpr_save + 8*5], r15
%ifndef LINUX
        mov     [rsp + _gpr_save + 8*6], rsi
        mov     [rsp + _gpr_save + 8*7], rdi
%endif
        mov     [rsp + _rsp_save], rax  ; original SP

        ;; Pop a free lane index off the byte-packed unused_lanes stack
        mov     unused_lanes, [state + _aes_xcbc_unused_lanes]
        movzx   lane, BYTE(unused_lanes)
        shr     unused_lanes, 8
        imul    lane_data, lane, _XCBC_LANE_DATA_size
        lea     lane_data, [state + _aes_xcbc_ldata + lane_data]
        mov     [state + _aes_xcbc_unused_lanes], unused_lanes
        mov     len, [job + _msg_len_to_hash_in_bytes]
        mov     [lane_data + _xcbc_job_in_lane], job
        mov     dword [lane_data + _xcbc_final_done], 0
        mov     tmp, [job + _k1_expanded]
        mov     [state + _aes_xcbc_args_keys + lane*8], tmp
        mov     p, [job + _src]
        add     p, [job + _hash_start_src_offset_in_bytes]

        mov     last_len, len

        cmp     len, 16
        jle     small_buffer            ; at most one block: input goes to final block

        mov     [state + _aes_xcbc_args_in + lane*8], p
        add     p, len                  ; set point to end of data

        and     last_len, 15            ; Check lsbs of msg len
        jnz     slow_copy               ; if not 16B mult, do slow copy

fast_copy:
        ;; Full final block: pre-compute M[n] XOR K2
        movdqu  xmm0, [p - 16]          ; load last block M[n]
        mov     tmp, [job + _k2]        ; load K2 address
        movdqu  xmm1, [tmp]             ; load K2
        pxor    xmm0, xmm1              ; M[n] XOR K2
        movdqa  [lane_data + _xcbc_final_block], xmm0
        sub     len, 16                 ; take last block off length
end_fast_copy:
        pxor    xmm0, xmm0
        shl     lane, 4                 ; multiply by 16
        movdqa  [state + _aes_xcbc_args_ICV + lane], xmm0 ; zero initial ICV

        ;; insert len into proper lane
        movdqa  xmm0, [state + _aes_xcbc_lens]
        XPINSRW xmm0, xmm1, tmp, lane, len, no_scale
        movdqa  [state + _aes_xcbc_lens], xmm0

        cmp     unused_lanes, 0xff      ; 0xff sentinel => every lane now busy
        jne     return_null             ; lanes still free: defer processing

start_loop:
        ; Find min length
        phminposuw xmm1, xmm0
        pextrw  len2, xmm1, 0           ; min value
        pextrw  idx, xmm1, 1            ; min index (0...3)
        cmp     len2, 0
        je      len_is_0

        pshuflw xmm1, xmm1, 0
        psubw   xmm0, xmm1              ; credit the common length to all lanes
        movdqa  [state + _aes_xcbc_lens], xmm0

        ; "state" and "args" are the same address, arg1
        ; len is arg2
        call    AES_XCBC_X4
        ; state and idx are intact

len_is_0:
        ; process completed job "idx"
        imul    lane_data, idx, _XCBC_LANE_DATA_size
        lea     lane_data, [state + _aes_xcbc_ldata + lane_data]
        cmp     dword [lane_data + _xcbc_final_done], 0
        jne     end_loop

        ;; Main data done: queue the pre-built final block as one more
        ;; 16-byte pass through the AES-XCBC core for this lane.
        mov     dword [lane_data + _xcbc_final_done], 1
        mov     word [state + _aes_xcbc_lens + 2*idx], 16
        lea     tmp, [lane_data + _xcbc_final_block]
        mov     [state + _aes_xcbc_args_in + 8*idx], tmp
        movdqa  xmm0, [state + _aes_xcbc_lens]
        jmp     start_loop

end_loop:
        ; process completed job "idx"
        mov     job_rax, [lane_data + _xcbc_job_in_lane]
        mov     icv, [job_rax + _auth_tag_output]
        mov     unused_lanes, [state + _aes_xcbc_unused_lanes]
        mov     qword [lane_data + _xcbc_job_in_lane], 0
        or      dword [job_rax + _status], STS_COMPLETED_HMAC
        shl     unused_lanes, 8         ; push freed lane back on the stack
        or      unused_lanes, idx
        shl     idx, 4                  ; multiply by 16
        mov     [state + _aes_xcbc_unused_lanes], unused_lanes

        ; copy 12 bytes (96-bit truncated tag)
        movdqa  xmm0, [state + _aes_xcbc_args_ICV + idx]
        movq    [icv], xmm0
        pextrd  [icv + 8], xmm0, 2

%ifdef SAFE_DATA
        ;; Clear ICV
        pxor    xmm0, xmm0
        movdqa  [state + _aes_xcbc_args_ICV + idx], xmm0

        ;; Clear final block (32 bytes)
        movdqa  [lane_data + _xcbc_final_block], xmm0
        movdqa  [lane_data + _xcbc_final_block + 16], xmm0
%endif

return:

        mov     rbx, [rsp + _gpr_save + 8*0]
        mov     rbp, [rsp + _gpr_save + 8*1]
        mov     r12, [rsp + _gpr_save + 8*2]
        mov     r13, [rsp + _gpr_save + 8*3]
        mov     r14, [rsp + _gpr_save + 8*4]
        mov     r15, [rsp + _gpr_save + 8*5]
%ifndef LINUX
        mov     rsi, [rsp + _gpr_save + 8*6]
        mov     rdi, [rsp + _gpr_save + 8*7]
%endif
        mov     rsp, [rsp + _rsp_save]  ; original SP

        ret

small_buffer:
        ; For buffers <= 16 Bytes
        ; The input data is set to final block
        lea     tmp, [lane_data + _xcbc_final_block] ; final block
        mov     [state + _aes_xcbc_args_in + lane*8], tmp
        add     p, len                  ; set point to end of data
        cmp     len, 16
        je      fast_copy               ; exactly one block: full-block path (K2)

slow_copy:
        ;; Partial final block: copy the tail, append 10* padding, XOR with K3
        and     len, ~15                ; take final block off len
        sub     p, last_len             ; adjust data pointer
        lea     p2, [lane_data + _xcbc_final_block + 16] ; upper part of final
        sub     p2, last_len            ; adjust data pointer backwards
        memcpy_sse_16_1 p2, p, last_len, tmp, tmp2
        movdqa  xmm0, [rel x80]         ; fill reg with padding
        movdqu  [lane_data + _xcbc_final_block + 16], xmm0 ; add padding
        movdqu  xmm0, [p2]              ; load final block to process
        mov     tmp, [job + _k3]        ; load K3 address
        movdqu  xmm1, [tmp]             ; load K3
        pxor    xmm0, xmm1              ; M[n] XOR K3
        movdqu  [lane_data + _xcbc_final_block], xmm0 ; write final block
        jmp     end_fast_copy

return_null:
        xor     job_rax, job_rax        ; NULL: no job completed on this call
        jmp     return

%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif