]> git.proxmox.com Git - ceph.git/blame - ceph/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_submit_ni_sse.asm
update source to Ceph Pacific 16.2.2
[ceph.git] / ceph / src / spdk / intel-ipsec-mb / sse / mb_mgr_hmac_submit_ni_sse.asm
CommitLineData
11fdf7f2
TL
1;;
2;; Copyright (c) 2012-2018, Intel Corporation
3;;
4;; Redistribution and use in source and binary forms, with or without
5;; modification, are permitted provided that the following conditions are met:
6;;
7;; * Redistributions of source code must retain the above copyright notice,
8;; this list of conditions and the following disclaimer.
9;; * Redistributions in binary form must reproduce the above copyright
10;; notice, this list of conditions and the following disclaimer in the
11;; documentation and/or other materials provided with the distribution.
12;; * Neither the name of Intel Corporation nor the names of its contributors
13;; may be used to endorse or promote products derived from this software
14;; without specific prior written permission.
15;;
16;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26;;
27
28;; In System V AMD64 ABI
29;; calle saves: RBX, RBP, R12-R15
30;; Windows x64 ABI
31;; calle saves: RBX, RBP, RDI, RSI, RSP, R12-R15
32;;
33;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15
34;; -----------------------------------------------------------
35;; Windows clobbers: RAX RCX RDX R8 R9 R10 R11
36;; Windows preserves: RBX RBP RSI RDI R12 R13 R14 R15
37;; -----------------------------------------------------------
38;; Linux clobbers: RAX RCX RDX RSI RDI R8 R9 R10 R11
39;; Linux preserves: RBX RBP R12 R13 R14 R15
40;; -----------------------------------------------------------
41;;
42;; Linux/Windows clobbers: xmm0 - xmm15
43;;
44
%include "include/os.asm"
%include "job_aes_hmac.asm"
%include "mb_mgr_datastruct.asm"
%include "include/reg_sizes.asm"
%include "include/memcpy.asm"

;%define DO_DBGPRINT
%include "include/dbgprint.asm"

;; Two-lane SHA1 block processor built on the SHA-NI instructions,
;; implemented elsewhere in the library.  Called below with the OOO
;; state (args) in arg1 and the number of 64-byte blocks in arg2.
extern sha1_ni

section .data
default rel

;; pshufb mask that byte-swaps each of the four 32-bit words in an XMM
;; register (little-endian digest words -> big-endian wire order).
align 16
byteswap:
        dq 0x0405060700010203
        dq 0x0c0d0e0f08090a0b

section .text
;; Platform register aliases: arg1/arg2 follow the calling convention;
;; reg3/reg4 pick scratch registers that are volatile on each ABI.
%ifdef LINUX
%define arg1    rdi
%define arg2    rsi
%define reg3    rcx
%define reg4    rdx
%else
%define arg1    rcx
%define arg2    rdx
%define reg3    rdi
%define reg4    rsi
%endif

%define state   arg1
%define job     arg2
%define len2    arg2

; idx needs to be in rbx, rbp, r12-r15
;; NOTE: several symbolic names below alias the same physical register;
;; their live ranges do not overlap.
%define last_len        rbp
%define idx             rbp
%define p4              rbp

%define p               r11
%define start_offset    r11

%define unused_lanes    rbx
%define tmp4            rbx
%define p3              rbx

%define job_rax         rax
%define len             rax

%define size_offset     reg3
%define tmp2            reg3

%define lane            reg4
%define tmp3            reg4

%define extra_blocks    r8

%define tmp             r9
%define p2              r9

%define lane_data       r10

;; Stack frame layout: saved callee-preserved GPRs plus the caller's
;; original rsp (needed because rsp is aligned down with "and rsp,-16").
struc STACK
_gpr_save:      resq    4
_rsp_save:      resq    1
endstruc
114
;; ---------------------------------------------------------------------
;; JOB* submit_job_hmac_ni_sse(MB_MGR_HMAC_SHA_1_OOO *state, JOB_AES_HMAC *job)
;;
;; Submits one HMAC-SHA1 job to the two-lane SHA1-NI out-of-order
;; manager.  Returns a completed JOB pointer in rax, or NULL when the
;; job was only queued (a lane was still free after this submission).
;;
;; arg 1 : state (rdi on Linux / rcx on Windows)
;; arg 2 : job   (rsi on Linux / rdx on Windows)
MKGLOBAL(submit_job_hmac_ni_sse,function,internal)
submit_job_hmac_ni_sse:

        ;; Build an aligned stack frame; keep the caller's rsp in rax so
        ;; it can be restored regardless of the alignment adjustment.
        mov     rax, rsp
        sub     rsp, STACK_size
        and     rsp, -16

        mov     [rsp + _gpr_save + 8*0], rbx
        mov     [rsp + _gpr_save + 8*1], rbp
%ifndef LINUX
        mov     [rsp + _gpr_save + 8*2], rsi
        mov     [rsp + _gpr_save + 8*3], rdi
%endif
        mov     [rsp + _rsp_save], rax          ; original SP

        DBGPRINTL "enter sha1-ni-sse submit"
        ;; Pop a free lane index from the unused_lanes byte stack
        ;; (lowest byte = next free lane; 0xff is the empty sentinel).
        mov     unused_lanes, [state + _unused_lanes]
        movzx   lane, BYTE(unused_lanes)
        DBGPRINTL64 "lane: ", lane
        shr     unused_lanes, 8
        imul    lane_data, lane, _HMAC_SHA1_LANE_DATA_size
        lea     lane_data, [state + _ldata + lane_data]
        mov     [state + _unused_lanes], unused_lanes
        mov     len, [job + _msg_len_to_hash_in_bytes]
        DBGPRINTL64 "length: ", len
        mov     tmp, len
        shr     tmp, 6                          ; divide by 64, len in terms of blocks

        mov     [lane_data + _job_in_lane], job
        mov     dword [lane_data + _outer_done], 0
        mov     [state + _lens + 2*lane], WORD(tmp)     ; whole blocks to hash

        ;; extra_blocks = blocks needed for the message tail plus SHA1
        ;; padding (0x80 byte and 8-byte length field, hence the +9).
        mov     last_len, len
        and     last_len, 63
        lea     extra_blocks, [last_len + 9 + 63]
        shr     extra_blocks, 6
        mov     [lane_data + _extra_blocks], DWORD(extra_blocks)

        mov     p, [job + _src]
        add     p, [job + _hash_start_src_offset_in_bytes]
        DBGPRINTL64 "src pointer + offset:", p
        mov     [state + _args_data_ptr + PTR_SZ*lane], p
        cmp     len, 64
        jb      copy_lt64

fast_copy:
        ;; Copy the last 64 bytes of the message into the extra_block
        ;; staging buffer; the tail beyond the last full block is later
        ;; hashed from there with padding appended.
        add     p, len
        movdqu  xmm0, [p - 64 + 0*16]
        movdqu  xmm1, [p - 64 + 1*16]
        movdqu  xmm2, [p - 64 + 2*16]
        movdqu  xmm3, [p - 64 + 3*16]
        movdqa  [lane_data + _extra_block + 0*16], xmm0
        movdqa  [lane_data + _extra_block + 1*16], xmm1
        movdqa  [lane_data + _extra_block + 2*16], xmm2
        movdqa  [lane_data + _extra_block + 3*16], xmm3
end_fast_copy:

        ;; Compute where inside extra_block the 64-bit big-endian bit
        ;; length lives (last 8 bytes of the padded data) and where
        ;; hashing of the tail starts.
        mov     size_offset, extra_blocks
        shl     size_offset, 6
        sub     size_offset, last_len
        add     size_offset, 64-8
        mov     [lane_data + _size_offset], DWORD(size_offset)
        mov     start_offset, 64
        sub     start_offset, last_len
        mov     [lane_data + _start_offset], DWORD(start_offset)

        ;; Total length hashed = 64-byte ipad block + message, in bits,
        ;; stored big-endian per SHA1 padding rules.
        lea     tmp, [8*64 + 8*len]
        bswap   tmp
        mov     [lane_data + _extra_block + size_offset], tmp

        ;; Seed this lane's digest with (key XOR ipad): 4 dwords via
        ;; XMM plus the 5th dword separately.
        mov     tmp, [job + _auth_key_xor_ipad]
        movdqu  xmm0, [tmp]
        mov     DWORD(tmp), [tmp + 4*SHA1_DIGEST_WORD_SIZE]
%if SHA1NI_DIGEST_ROW_SIZE != 20
%error "Below code has been optimized for SHA1NI_DIGEST_ROW_SIZE = 20!"
%endif
        lea     p4, [lane + lane*4]     ; p4 = lane*5, so p4*4 = lane*20-byte digest row
        movdqu  [state + _args_digest + p4*4 + 0*SHA1_DIGEST_WORD_SIZE], xmm0
        mov     [state + _args_digest + p4*4 + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp)
        test    len, ~63
        jnz     ge64_bytes

lt64_bytes:
        ;; Message shorter than one block: hash straight from the padded
        ;; extra_block buffer instead of the caller's source pointer.
        mov     [state + _lens + 2*lane], WORD(extra_blocks)
        lea     tmp, [lane_data + _extra_block + start_offset]
        mov     [state + _args_data_ptr + PTR_SZ*lane], tmp
        mov     dword [lane_data + _extra_blocks], 0

ge64_bytes:
        ;; Only start hashing once both lanes are occupied, i.e. the
        ;; unused_lanes list has shrunk to the 0xff sentinel; otherwise
        ;; queue the job and return NULL.
        cmp     unused_lanes, 0xff
        jne     return_null
        jmp     start_loop

        align   16
start_loop:
        ; Find min length - only two lanes available
        xor     len2, len2
        mov     p3, 0x10000
        mov     WORD(len2), word [state + _lens + 0*2]  ; [0:15] - lane 0 length, [16:31] - lane index (0)
        mov     WORD(p3), word [state + _lens + 1*2]    ; [0:15] - lane 1 length, [16:31] - lane index (1)
        cmp     WORD(len2), WORD(p3)
        cmovg   DWORD(len2), DWORD(p3)  ; move if lane 0 length is greater than lane 1 length

        mov     idx, len2               ; retrieve index & length from [16:31] and [0:15] bit fields
        shr     DWORD(idx), 16
        and     DWORD(len2), 0xffff
        je      len_is_0                ; min length 0 => that lane has already finished

        ;; Deduct the blocks about to be hashed from both lanes.
        sub     word [state + _lens + 0*2], WORD(len2)
        sub     word [state + _lens + 1*2], WORD(len2)

        ; "state" and "args" are the same address, arg1
        ; len is arg2
        call    sha1_ni
        ; state is intact

len_is_0:
        ; process completed job "idx"
        imul    lane_data, idx, _HMAC_SHA1_LANE_DATA_size
        lea     lane_data, [state + _ldata + lane_data]
        mov     DWORD(extra_blocks), [lane_data + _extra_blocks]
        cmp     extra_blocks, 0
        jne     proc_extra_blocks       ; padded tail still to be hashed
        cmp     dword [lane_data + _outer_done], 0
        jne     end_loop                ; outer hash already done => job complete

proc_outer:
        ;; Inner hash finished - set up the outer HMAC hash.  The outer
        ;; data is the byte-swapped inner digest staged in outer_block;
        ;; it is exactly one SHA1 block.
        mov     dword [lane_data + _outer_done], 1
        mov     DWORD(size_offset), [lane_data + _size_offset]
        mov     qword [lane_data + _extra_block + size_offset], 0       ; clear stored length field
        mov     word [state + _lens + 2*idx], 1         ; outer hash = one block
        lea     tmp, [lane_data + _outer_block]
        mov     job, [lane_data + _job_in_lane]
        mov     [state + _args_data_ptr + PTR_SZ*idx], tmp

%if SHA1NI_DIGEST_ROW_SIZE != 20
%error "Below code has been optimized for SHA1NI_DIGEST_ROW_SIZE = 20!"
%endif
        lea     p3, [idx + idx*4]       ; p3*4 = idx*20-byte digest row
        movdqu  xmm0, [state + _args_digest + p3*4 + 0*SHA1_DIGEST_WORD_SIZE]
        pshufb  xmm0, [rel byteswap]    ; inner digest words to big-endian
        mov     DWORD(tmp), [state + _args_digest + p3*4 + 4*SHA1_DIGEST_WORD_SIZE]
        bswap   DWORD(tmp)
        movdqa  [lane_data + _outer_block], xmm0
        mov     [lane_data + _outer_block + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp)

        ;; Re-seed the lane digest with (key XOR opad) for the outer hash.
        mov     tmp, [job + _auth_key_xor_opad]
        movdqu  xmm0, [tmp]
        mov     DWORD(tmp), [tmp + 4*SHA1_DIGEST_WORD_SIZE]
        movdqu  [state + _args_digest + p3*4 + 0*SHA1_DIGEST_WORD_SIZE], xmm0
        mov     [state + _args_digest + p3*4 + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp)
        jmp     start_loop

        align   16
proc_extra_blocks:
        ;; Point the lane at the padded tail inside extra_block and
        ;; schedule those blocks for hashing.
        mov     DWORD(start_offset), [lane_data + _start_offset]
        mov     [state + _lens + 2*idx], WORD(extra_blocks)
        lea     tmp, [lane_data + _extra_block + start_offset]
        mov     [state + _args_data_ptr + PTR_SZ*idx], tmp
        mov     dword [lane_data + _extra_blocks], 0
        jmp     start_loop

        align   16
copy_lt64:
        ;; less than one message block of data
        ;; beginning of source block
        ;; destination extrablock but backwards by len from where 0x80 pre-populated
        lea     p2, [lane_data + _extra_block + 64]
        sub     p2, len
        memcpy_sse_64_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3
        mov     unused_lanes, [state + _unused_lanes]   ; reload - rbx was clobbered as tmp4
        jmp     end_fast_copy

return_null:
        xor     job_rax, job_rax
        jmp     return

        align   16
end_loop:
        ;; Job in lane "idx" is complete: detach it from the lane, push
        ;; the lane back on the free list and emit the authentication tag.
        mov     job_rax, [lane_data + _job_in_lane]
        mov     unused_lanes, [state + _unused_lanes]
        mov     qword [lane_data + _job_in_lane], 0
        or      dword [job_rax + _status], STS_COMPLETED_HMAC
        shl     unused_lanes, 8
        or      unused_lanes, idx
        mov     [state + _unused_lanes], unused_lanes

        mov     p, [job_rax + _auth_tag_output]

        ; copy 12 bytes
%if SHA1NI_DIGEST_ROW_SIZE != 20
%error "Below code has been optimized for SHA1NI_DIGEST_ROW_SIZE = 20!"
%endif
        lea     idx, [idx + 4*idx]      ; idx = lane*5; idx*4 = 20-byte digest row
        mov     DWORD(tmp), [state + _args_digest + idx*4 + 0*SHA1_DIGEST_WORD_SIZE]
        mov     DWORD(tmp2), [state + _args_digest + idx*4 + 1*SHA1_DIGEST_WORD_SIZE]
        mov     DWORD(tmp3), [state + _args_digest + idx*4 + 2*SHA1_DIGEST_WORD_SIZE]
        bswap   DWORD(tmp)
        bswap   DWORD(tmp2)
        bswap   DWORD(tmp3)
        mov     [p + 0*SHA1_DIGEST_WORD_SIZE], DWORD(tmp)
        mov     [p + 1*SHA1_DIGEST_WORD_SIZE], DWORD(tmp2)
        mov     [p + 2*SHA1_DIGEST_WORD_SIZE], DWORD(tmp3)

        ;; Truncated 12-byte tag requested? Otherwise emit all 20 bytes.
        cmp     qword [job_rax + _auth_tag_output_len_in_bytes], 12
        je      clear_ret

        ;; copy remaining 8 bytes to return 20 byte digest
        mov     DWORD(tmp), [state + _args_digest + idx*4 + 3*SHA1_DIGEST_WORD_SIZE]
        mov     DWORD(tmp2), [state + _args_digest + idx*4 + 4*SHA1_DIGEST_WORD_SIZE]
        bswap   DWORD(tmp)
        bswap   DWORD(tmp2)
        mov     [p + 3*4], DWORD(tmp)
        mov     [p + 4*4], DWORD(tmp2)

clear_ret:

%ifdef SAFE_DATA
        ;; Scrub key-derived material from the lane's state so it does
        ;; not linger in memory after the job is returned.
        pxor    xmm0, xmm0
        ;; Clear digest (20B), outer_block (20B) and extra_block (64B)
        ;; idx = 0 or 5 (depending on lane)
        movdqu  [state + _args_digest + idx*4], xmm0
        mov     dword [state + _args_digest + idx*4 + 16], 0

        shr     idx, 2          ;; idx == 5 ? 1 : 0
        imul    lane_data, idx, _HMAC_SHA1_LANE_DATA_size
        lea     lane_data, [state + _ldata + lane_data]
        ;; Clear first 64 bytes of extra_block
%assign offset 0
%rep 4
        movdqa  [lane_data + _extra_block + offset], xmm0
%assign offset (offset + 16)
%endrep

        ;; Clear 20 bytes of outer_block
        movdqa  [lane_data + _outer_block], xmm0
        mov     dword [lane_data + _outer_block + 16], 0
%endif

return:
        ;; Restore callee-saved registers and the caller's original rsp.
        mov     rbx, [rsp + _gpr_save + 8*0]
        mov     rbp, [rsp + _gpr_save + 8*1]
%ifndef LINUX
        mov     rsi, [rsp + _gpr_save + 8*2]
        mov     rdi, [rsp + _gpr_save + 8*3]
%endif
        mov     rsp, [rsp + _rsp_save]          ; original SP

        ret

%ifdef LINUX
;; Mark the stack non-executable on ELF builds.
section .note.GNU-stack noalloc noexec nowrite progbits
%endif