;; Provenance: text recovered from the git.proxmox.com blame view of
;; ceph.git : ceph/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_md5_submit_sse.asm
;; Page header read "import 15.2.0 Octopus source"; blame commit ids that
;; appeared on the page: 11fdf7f2, 9f95a23c.
;;
;; Copyright (c) 2012-2018, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;;     * Redistributions of source code must retain the above copyright notice,
;;       this list of conditions and the following disclaimer.
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;     * Neither the name of Intel Corporation nor the names of its contributors
;;       may be used to endorse or promote products derived from this software
;;       without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;
;; Project include files. The code in this file uses names that are not
;; defined here (MKGLOBAL, DWORD(), PTR_SZ, XPINSRW, memcpy_sse_64_1, the
;; job / lane-data / OOO-manager field offsets); they are expected to come
;; from these includes.
%include "os.asm"
%include "job_aes_hmac.asm"
%include "mb_mgr_datastruct.asm"
%include "memcpy.asm"
%include "reg_sizes.asm"
%include "const.inc"

;; Multi-lane MD5 kernel, implemented in another object file.
extern md5_x4x2_sse

section .data
default rel
align 16
;byteswap:	ddq 0x0c0d0e0f08090a0b0405060700010203
;; pshufb control mask: every byte pair selects source bytes 0 and 1, so
;; word 0 of the source register is replicated into all eight words.
dupw:	;ddq 0x01000100010001000100010001000100
	dq 0x0100010001000100, 0x0100010001000100

section .text

;; ---------------------------------------------------------------------------
;; Register aliases used by submit_job_hmac_md5_sse.
;;
;; arg1/arg2 are the first two integer argument registers of the selected
;; calling convention; reg3/reg4 are two further general-purpose registers
;; chosen per convention.
;;
;; IMPORTANT: most physical registers carry TWO names below. The two names of
;; a pair are never live at the same time in the routine; writing one
;; destroys the other.
;; ---------------------------------------------------------------------------
%if 1
%ifdef LINUX
%define arg1	rdi
%define arg2	rsi
%define reg3	rcx
%define reg4	rdx
%else
%define arg1	rcx
%define arg2	rdx
%define reg3	rdi
%define reg4	rsi
%endif

%define state	arg1
; arg2: incoming job pointer, later the length argument for md5_x4x2_sse
%define job	arg2
%define len2	arg2


; idx needs to be in rbp
%define last_len	rbp
%define idx		rbp

; r11: source data pointer, later the start offset inside the extra block
%define p		r11
%define start_offset	r11

; rbx: unused-lanes nibble list, also handed out as a scratch register
%define unused_lanes	rbx
%define tmp4		rbx

; rax: message length on entry, returned job pointer on exit
%define job_rax	rax
%define len	rax

%define size_offset	reg3
%define tmp2		reg3

%define lane	reg4
%define tmp3	reg4

%define extra_blocks	r8

; r9: general scratch, also the destination pointer in the short-copy path
%define tmp	r9
%define p2	r9

%define lane_data	r10

%endif

; This routine and/or the called routine clobbers all GPRs
;
; Stack frame layout used by submit_job_hmac_md5_sse:
;   _gpr_save : 8 qword slots for rbx, rbp, r12-r15 and (when LINUX is not
;               defined) rsi, rdi
;   _rsp_save : caller's rsp, captured before the frame is carved out and
;               aligned
; NASM's struc also defines STACK_size, which the prologue subtracts from rsp.
struc STACK
_gpr_save:	resq	8
_rsp_save:	resq	1
endstruc

;; ---------------------------------------------------------------------------
;; JOB* submit_job_hmac_md5_sse(MB_MGR_HMAC_MD5_OOO *state, JOB_AES_HMAC *job)
;;
;; In:   arg 1 : state  (rdi when LINUX is defined, rcx otherwise)
;;       arg 2 : job    (rsi when LINUX is defined, rdx otherwise)
;; Out:  rax = 0 when, after this job has been placed in a lane, the
;;             unused-lanes value is not 0xf (nothing is hashed in that case);
;;       rax = pointer to a job whose status has had STS_COMPLETED_HMAC
;;             OR-ed in and whose tag has been written to its
;;             _auth_tag_output buffer.
;;
;; Outline:
;;   1. Pop a lane number from the low nibble of state->_unused_lanes_md5 and
;;      set up that lane: length in blocks, data pointer, tail ("extra")
;;      block, bit-length field and the initial digest loaded from
;;      job->_auth_key_xor_ipad.
;;   2. If the unused-lanes value is now 0xf, loop: run md5_x4x2_sse for the
;;      minimum lane length, then advance the lane that reached length 0
;;      through its remaining phases (extra blocks -> outer block -> done).
;;   3. When a lane has finished its outer block, release the lane, copy
;;      12 or 16 digest bytes to the job's tag buffer and return the job.
;;
;; rbx, rbp, r12-r15 (and rsi/rdi when LINUX is not defined) are saved in
;; the STACK frame and restored before returning.
;; ---------------------------------------------------------------------------
MKGLOBAL(submit_job_hmac_md5_sse,function,internal)
submit_job_hmac_md5_sse:

	;; Carve out a 16-byte aligned STACK frame; rax keeps the caller's rsp
	;; so it can be stored in the frame and restored on exit.
	mov	rax, rsp
	sub	rsp, STACK_size
	and	rsp, -16

	mov	[rsp + _gpr_save + 8*0], rbx
	mov	[rsp + _gpr_save + 8*1], rbp
	mov	[rsp + _gpr_save + 8*2], r12
	mov	[rsp + _gpr_save + 8*3], r13
	mov	[rsp + _gpr_save + 8*4], r14
	mov	[rsp + _gpr_save + 8*5], r15
%ifndef LINUX
	mov	[rsp + _gpr_save + 8*6], rsi
	mov	[rsp + _gpr_save + 8*7], rdi
%endif
	mov	[rsp + _rsp_save], rax	; original SP

	;; Pop the next lane number (low nibble) off the unused-lanes list and
	;; compute the address of that lane's bookkeeping record.
	mov	unused_lanes, [state + _unused_lanes_md5]
	mov	lane, unused_lanes
	and	lane, 0xF
	shr	unused_lanes, 4
	;; NOTE(review): the record size constant is the SHA1 one; presumably
	;; the MD5 manager shares that lane-data layout - confirm in
	;; mb_mgr_datastruct.asm.
	imul	lane_data, lane, _HMAC_SHA1_LANE_DATA_size
	lea	lane_data, [state + _ldata_md5 + lane_data]
	mov	[state + _unused_lanes_md5], unused_lanes
	mov	len, [job + _msg_len_to_hash_in_bytes]
	mov	tmp, len
	shr	tmp, 6	; divide by 64, len in terms of blocks

	mov	[lane_data + _job_in_lane], job
	mov	dword [lane_data + _outer_done], 0

	;; insert len into proper lane
	;; (XPINSRW is a macro from an include file; presumably xmm1 and p are
	;; scratch operands for it - p has no live value yet at this point)
	movdqa	xmm0, [state + _lens_md5]
	XPINSRW	xmm0, xmm1, p, lane, tmp, scale_x16
	movdqa	[state + _lens_md5], xmm0

	;; last_len     = len mod 64 (bytes beyond the last full block)
	;; extra_blocks = (last_len + 9) rounded up to whole 64-byte blocks;
	;;                the 9 presumably covers the 0x80 pad byte plus the
	;;                8-byte length field
	mov	last_len, len
	and	last_len, 63
	lea	extra_blocks, [last_len + 9 + 63]
	shr	extra_blocks, 6
	mov	[lane_data + _extra_blocks], DWORD(extra_blocks)

	;; Lane data pointer = job->src + hash start offset
	mov	p, [job + _src]
	add	p, [job + _hash_start_src_offset_in_bytes]
	mov	[state + _args_data_ptr_md5 + PTR_SZ*lane], p

	cmp	len, 64
	jb	copy_lt64

fast_copy:
	;; len >= 64: copy the LAST 64 bytes of the message into the start of
	;; the lane's extra block (source may be unaligned, hence movdqu).
	add	p, len
	movdqu	xmm0, [p - 64 + 0*16]
	movdqu	xmm1, [p - 64 + 1*16]
	movdqu	xmm2, [p - 64 + 2*16]
	movdqu	xmm3, [p - 64 + 3*16]
	movdqa	[lane_data + _extra_block + 0*16], xmm0
	movdqa	[lane_data + _extra_block + 1*16], xmm1
	movdqa	[lane_data + _extra_block + 2*16], xmm2
	movdqa	[lane_data + _extra_block + 3*16], xmm3
end_fast_copy:

	;; size_offset  = 64*extra_blocks - last_len + 56 : where the 8-byte
	;;                length field is stored inside the extra block
	;; start_offset = 64 - last_len : where the message tail begins inside
	;;                the extra block
	;; (start_offset shares r11 with p; the source pointer is dead here)
	mov	size_offset, extra_blocks
	shl	size_offset, 6
	sub	size_offset, last_len
	add	size_offset, 64-8
	mov	[lane_data + _size_offset], DWORD(size_offset)
	mov	start_offset, 64
	sub	start_offset, last_len
	mov	[lane_data + _start_offset], DWORD(start_offset)

	;; tmp = 8 * (64 + len): length in bits of the message plus one extra
	;; 64-byte block (presumably the ipad block). Stored without a byte swap.
	lea	tmp, [8*64 + 8*len]
;	bswap	tmp
	mov	[lane_data + _extra_block + size_offset], tmp

	;; Load 16 bytes from job->_auth_key_xor_ipad and scatter the four
	;; dwords into this lane's column of the digest array (one dword per
	;; digest row).
	mov	tmp, [job + _auth_key_xor_ipad]
	movdqu	xmm0, [tmp]
	movd	[state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 0*MD5_DIGEST_ROW_SIZE], xmm0
	pextrd	[state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 1*MD5_DIGEST_ROW_SIZE], xmm0, 1
	pextrd	[state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 2*MD5_DIGEST_ROW_SIZE], xmm0, 2
	pextrd	[state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 3*MD5_DIGEST_ROW_SIZE], xmm0, 3

	;; Any bit above the low six set => at least one full 64-byte block.
	test	len, ~63
	jnz	ge64_bytes

lt64_bytes:
	;; No full block in the message: the lane starts directly on the extra
	;; block(s). Lane length becomes extra_blocks, the data pointer moves
	;; to the tail inside the extra block, and the stored extra-block
	;; count is cleared so the extra blocks are not scheduled again.
	movdqa	xmm0, [state + _lens_md5]
	XPINSRW	xmm0, xmm1, tmp, lane, extra_blocks, scale_x16
	movdqa	[state + _lens_md5], xmm0

	lea	tmp, [lane_data + _extra_block + start_offset]
	mov	[state + _args_data_ptr_md5 + PTR_SZ*lane], tmp
	mov	dword [lane_data + _extra_blocks], 0

ge64_bytes:
	;; Only start hashing when the unused-lanes value is exactly 0xf
	;; (presumably the list's terminator nibble, i.e. no free lane is
	;; left); otherwise report "no job completed".
	cmp	unused_lanes, 0xf
	jne	return_null
	jmp	start_loop

	align	16
start_loop:
	; Find min length
	movdqa	xmm0, [state + _lens_md5]
	phminposuw	xmm1, xmm0
	pextrw	len2, xmm1, 0	; min value
	pextrw	idx, xmm1, 1	; min index (0...7): phminposuw scans 8 words
	cmp	len2, 0
	je	len_is_0

	;; Subtract the minimum from every lane length before hashing.
	pshufb	xmm1, [rel dupw]	; duplicate words across all lanes
	psubw	xmm0, xmm1
	movdqa	[state + _lens_md5], xmm0

	; "state" and "args" are the same address, arg1
	; len is arg2
	call	md5_x4x2_sse
	; state and idx are intact

len_is_0:
	; process completed job "idx"
	;; Phase selection for the lane whose length reached zero:
	;;   extra_blocks != 0 -> schedule the extra block(s)
	;;   outer_done  != 0  -> the outer block has been hashed; finish
	;;   otherwise         -> schedule the outer block
	imul	lane_data, idx, _HMAC_SHA1_LANE_DATA_size
	lea	lane_data, [state + _ldata_md5 + lane_data]
	mov	DWORD(extra_blocks), [lane_data + _extra_blocks]
	cmp	extra_blocks, 0
	jne	proc_extra_blocks
	cmp	dword [lane_data + _outer_done], 0
	jne	end_loop

proc_outer:
	;; Mark the outer pass as scheduled and zero the 8-byte length field
	;; that was written into the extra block at submit time.
	mov	dword [lane_data + _outer_done], 1
	mov	DWORD(size_offset), [lane_data + _size_offset]
	mov	qword [lane_data + _extra_block + size_offset], 0

	;; The outer pass is a single block for lane idx.
	movdqa	xmm0, [state + _lens_md5]
	XPINSRW	xmm0, xmm1, tmp, idx, 1, scale_x16
	movdqa	[state + _lens_md5], xmm0

	lea	tmp, [lane_data + _outer_block]
	mov	job, [lane_data + _job_in_lane]	; job shares arg2 with len2
	mov	[state + _args_data_ptr_md5 + PTR_SZ*idx], tmp

	;; Gather the lane's four digest dwords (the inner hash result) into
	;; the first 16 bytes of the outer block.
	movd	xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE]
	pinsrd	xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], 1
	pinsrd	xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], 2
	pinsrd	xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], 3
;	pshufb	xmm0, [rel byteswap]
	movdqa	[lane_data + _outer_block], xmm0

	;; Re-seed the lane's digest from job->_auth_key_xor_opad.
	mov	tmp, [job + _auth_key_xor_opad]
	movdqu	xmm0, [tmp]
	movd	[state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE], xmm0
	pextrd	[state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], xmm0, 1
	pextrd	[state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], xmm0, 2
	pextrd	[state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], xmm0, 3
	jmp	start_loop

	align	16
proc_extra_blocks:
	;; Point lane idx at the message tail inside its extra block, set its
	;; length to the pending extra-block count, and clear that count.
	mov	DWORD(start_offset), [lane_data + _start_offset]

	movdqa	xmm0, [state + _lens_md5]
	XPINSRW	xmm0, xmm1, tmp, idx, extra_blocks, scale_x16
	movdqa	[state + _lens_md5], xmm0

	lea	tmp, [lane_data + _extra_block + start_offset]
	mov	[state + _args_data_ptr_md5 + PTR_SZ*idx], tmp
	mov	dword [lane_data + _extra_blocks], 0
	jmp	start_loop

	align	16

copy_lt64:
	;; less than one message block of data
	;; source: beginning of the source block (p)
	;; destination: extra block, backwards by len from offset 64, i.e.
	;; from where the 0x80 is pre-populated
	;; tmp4 (the scratch register handed to memcpy_sse_64_1) is the same
	;; register as unused_lanes (rbx), so unused_lanes is reloaded from
	;; state right after the copy.
	lea	p2, [lane_data + _extra_block + 64]
	sub	p2, len
	memcpy_sse_64_1	p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3
	mov	unused_lanes, [state + _unused_lanes_md5]
	jmp	end_fast_copy

return_null:
	xor	job_rax, job_rax
	jmp	return

	align	16
end_loop:
	;; Lane idx has finished: fetch its job as the return value, clear the
	;; lane's job slot, flag the job, and push idx back onto the
	;; unused-lanes nibble list.
	mov	job_rax, [lane_data + _job_in_lane]
	mov	unused_lanes, [state + _unused_lanes_md5]
	mov	qword [lane_data + _job_in_lane], 0
	or	dword [job_rax + _status], STS_COMPLETED_HMAC
	shl	unused_lanes, 4
	or	unused_lanes, idx
	mov	[state + _unused_lanes_md5], unused_lanes

	mov	p, [job_rax + _auth_tag_output]

	; copy 12 bytes
	mov	DWORD(tmp),  [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE]
	mov	DWORD(tmp2), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE]
	mov	DWORD(tmp3), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE]
	mov	[p + 0*4], DWORD(tmp)
	mov	[p + 1*4], DWORD(tmp2)
	mov	[p + 2*4], DWORD(tmp3)

	;; Any requested tag length other than 12 gets the fourth dword too.
	cmp	DWORD [job_rax + _auth_tag_output_len_in_bytes], 12
	je	return

	; copy 16 bytes
	mov	DWORD(tmp3), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE]
	mov	[p + 3*4], DWORD(tmp3)

return:

	;; Restore saved registers, then the caller's stack pointer.
	mov	rbx, [rsp + _gpr_save + 8*0]
	mov	rbp, [rsp + _gpr_save + 8*1]
	mov	r12, [rsp + _gpr_save + 8*2]
	mov	r13, [rsp + _gpr_save + 8*3]
	mov	r14, [rsp + _gpr_save + 8*4]
	mov	r15, [rsp + _gpr_save + 8*5]
%ifndef LINUX
	mov	rsi, [rsp + _gpr_save + 8*6]
	mov	rdi, [rsp + _gpr_save + 8*7]
%endif
	mov	rsp, [rsp + _rsp_save]	; original SP

	ret

%ifdef LINUX
;; Empty, non-executable .note.GNU-stack section (the conventional GNU
;; toolchain marker that this object does not need an executable stack).
section .note.GNU-stack noalloc noexec nowrite progbits
%endif