;;
;; Copyright (c) 2012-2018, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;; * Redistributions of source code must retain the above copyright notice,
;; this list of conditions and the following disclaimer.
;; * Redistributions in binary form must reproduce the above copyright
;; notice, this list of conditions and the following disclaimer in the
;; documentation and/or other materials provided with the distribution.
;; * Neither the name of Intel Corporation nor the names of its contributors
;; may be used to endorse or promote products derived from this software
;; without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;

%include "os.asm"
%include "job_aes_hmac.asm"
%include "mb_mgr_datastruct.asm"
%include "memcpy.asm"
%include "reg_sizes.asm"
%include "const.inc"

extern md5_x4x2_sse

section .data
default rel
align 16
;byteswap:     ddq 0x0c0d0e0f08090a0b0405060700010203
dupw:   ;ddq 0x01000100010001000100010001000100
        dq 0x0100010001000100, 0x0100010001000100

section .text

%if 1
%ifdef LINUX
%define arg1    rdi
%define arg2    rsi
%define reg3    rcx
%define reg4    rdx
%else
%define arg1    rcx
%define arg2    rdx
%define reg3    rdi
%define reg4    rsi
%endif

%define state   arg1
%define job     arg2
%define len2    arg2


; idx needs to be in rbp
%define last_len        rbp
%define idx             rbp

%define p               r11
%define start_offset    r11

%define unused_lanes    rbx
%define tmp4            rbx

%define job_rax         rax
%define len             rax

%define size_offset     reg3
%define tmp2            reg3

%define lane            reg4
%define tmp3            reg4

%define extra_blocks    r8

%define tmp             r9
%define p2              r9

%define lane_data       r10

%endif

; This routine and/or the called routine clobbers all GPRs
struc STACK
_gpr_save:      resq    8
_rsp_save:      resq    1
endstruc

; JOB* submit_job_hmac_md5_sse(MB_MGR_HMAC_MD5_OOO *state, JOB_AES_HMAC *job)
; arg 1 : rcx : state
; arg 2 : rdx : job
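;
; Adds the job to a free out-of-order lane.  The lane bookkeeping below
; implies an 8-lane SSE MD5 kernel (md5_x4x2_sse, i.e. 4x2 lanes): the routine
; returns NULL until every lane holds a job, then runs the kernel and returns
; the first job whose inner and outer HMAC hashes have both completed.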
MKGLOBAL(submit_job_hmac_md5_sse,function,internal)
submit_job_hmac_md5_sse:

        mov     rax, rsp
        sub     rsp, STACK_size
        and     rsp, -16

        mov     [rsp + _gpr_save + 8*0], rbx
        mov     [rsp + _gpr_save + 8*1], rbp
        mov     [rsp + _gpr_save + 8*2], r12
        mov     [rsp + _gpr_save + 8*3], r13
        mov     [rsp + _gpr_save + 8*4], r14
        mov     [rsp + _gpr_save + 8*5], r15
%ifndef LINUX
        mov     [rsp + _gpr_save + 8*6], rsi
        mov     [rsp + _gpr_save + 8*7], rdi
%endif
        mov     [rsp + _rsp_save], rax  ; original SP

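        ;; Pop a free lane: lane indices are packed four bits apiece in
        ;; _unused_lanes_md5, with the next free lane in the low nibble.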
        mov     unused_lanes, [state + _unused_lanes_md5]
        mov     lane, unused_lanes
        and     lane, 0xF
        shr     unused_lanes, 4
        imul    lane_data, lane, _HMAC_SHA1_LANE_DATA_size
        lea     lane_data, [state + _ldata_md5 + lane_data]
        mov     [state + _unused_lanes_md5], unused_lanes
        mov     len, [job + _msg_len_to_hash_in_bytes]
        mov     tmp, len
        shr     tmp, 6  ; divide by 64, len in terms of blocks

        mov     [lane_data + _job_in_lane], job
        mov     dword [lane_data + _outer_done], 0

        ;; insert len into proper lane
        movdqa  xmm0, [state + _lens_md5]
        XPINSRW xmm0, xmm1, p, lane, tmp, scale_x16
        movdqa  [state + _lens_md5], xmm0

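        ;; Blocks needed beyond the message's complete 64-byte blocks: the
        ;; trailing partial block plus the 0x80 pad byte and the 8-byte
        ;; length field (9 bytes total), rounded up to a block multiple.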
        mov     last_len, len
        and     last_len, 63
        lea     extra_blocks, [last_len + 9 + 63]
        shr     extra_blocks, 6
        mov     [lane_data + _extra_blocks], DWORD(extra_blocks)

        mov     p, [job + _src]
        add     p, [job + _hash_start_src_offset_in_bytes]
        mov     [state + _args_data_ptr_md5 + PTR_SZ*lane], p

        cmp     len, 64
        jb      copy_lt64

fast_copy:
        add     p, len
        movdqu  xmm0, [p - 64 + 0*16]
        movdqu  xmm1, [p - 64 + 1*16]
        movdqu  xmm2, [p - 64 + 2*16]
        movdqu  xmm3, [p - 64 + 3*16]
        movdqa  [lane_data + _extra_block + 0*16], xmm0
        movdqa  [lane_data + _extra_block + 1*16], xmm1
        movdqa  [lane_data + _extra_block + 2*16], xmm2
        movdqa  [lane_data + _extra_block + 3*16], xmm3
end_fast_copy:

        mov     size_offset, extra_blocks
        shl     size_offset, 6
        sub     size_offset, last_len
        add     size_offset, 64-8
        mov     [lane_data + _size_offset], DWORD(size_offset)
        mov     start_offset, 64
        sub     start_offset, last_len
        mov     [lane_data + _start_offset], DWORD(start_offset)

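        ;; Store the total length in bits at _size_offset inside the extra
        ;; block: 8 * (64 + len), counting the 64-byte ipad block as the
        ;; first block of the inner hash.  MD5 keeps this length
        ;; little-endian, hence no byte swap.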
        lea     tmp, [8*64 + 8*len]
        ; bswap tmp
        mov     [lane_data + _extra_block + size_offset], tmp

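        ;; Seed this lane's digest with the precomputed MD5 state of
        ;; (key XOR ipad) so hashing can start directly on the message.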
        mov     tmp, [job + _auth_key_xor_ipad]
        movdqu  xmm0, [tmp]
        movd    [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 0*MD5_DIGEST_ROW_SIZE], xmm0
        pextrd  [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 1*MD5_DIGEST_ROW_SIZE], xmm0, 1
        pextrd  [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 2*MD5_DIGEST_ROW_SIZE], xmm0, 2
        pextrd  [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 3*MD5_DIGEST_ROW_SIZE], xmm0, 3

        test    len, ~63
        jnz     ge64_bytes

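        ;; Message shorter than one block: hash straight from the padded
        ;; extra block rather than the source buffer, so only the extra
        ;; blocks remain to be processed.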
lt64_bytes:
        movdqa  xmm0, [state + _lens_md5]
        XPINSRW xmm0, xmm1, tmp, lane, extra_blocks, scale_x16
        movdqa  [state + _lens_md5], xmm0

        lea     tmp, [lane_data + _extra_block + start_offset]
        mov     [state + _args_data_ptr_md5 + PTR_SZ*lane], tmp
        mov     dword [lane_data + _extra_blocks], 0

ge64_bytes:
        cmp     unused_lanes, 0xf       ; any lanes still free?
        jne     return_null             ; yes - wait for more jobs
        jmp     start_loop

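        ;; Main loop: find the lane with the smallest remaining length,
        ;; subtract it from every lane and run the 8-lane MD5 kernel for
        ;; that many blocks; the minimum lane is then finished and moves on
        ;; to its next stage (extra blocks, outer hash, or completion).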
        align 16
start_loop:
        ; Find min length
        movdqa  xmm0, [state + _lens_md5]
        phminposuw xmm1, xmm0
        pextrw  len2, xmm1, 0           ; min value
        pextrw  idx, xmm1, 1            ; min index (0...7)
        cmp     len2, 0
        je      len_is_0

        pshufb  xmm1, [rel dupw]        ; broadcast min length to all lanes
        psubw   xmm0, xmm1
        movdqa  [state + _lens_md5], xmm0

        ; "state" and "args" are the same address, arg1
        ; len is arg2
        call    md5_x4x2_sse
        ; state and idx are intact

len_is_0:
        ; process completed job "idx"
        imul    lane_data, idx, _HMAC_SHA1_LANE_DATA_size
        lea     lane_data, [state + _ldata_md5 + lane_data]
        mov     DWORD(extra_blocks), [lane_data + _extra_blocks]
        cmp     extra_blocks, 0
        jne     proc_extra_blocks
        cmp     dword [lane_data + _outer_done], 0
        jne     end_loop

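        ;; Inner hash finished, outer hash not yet done: hash the lane's
        ;; _outer_block (which now carries the 16-byte inner digest) as a
        ;; single block, seeded with the precomputed (key XOR opad) state,
        ;; to produce the HMAC tag.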
proc_outer:
        mov     dword [lane_data + _outer_done], 1
        mov     DWORD(size_offset), [lane_data + _size_offset]
        mov     qword [lane_data + _extra_block + size_offset], 0

        movdqa  xmm0, [state + _lens_md5]
        XPINSRW xmm0, xmm1, tmp, idx, 1, scale_x16
        movdqa  [state + _lens_md5], xmm0

        lea     tmp, [lane_data + _outer_block]
        mov     job, [lane_data + _job_in_lane]
        mov     [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp

        movd    xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE]
        pinsrd  xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], 1
        pinsrd  xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], 2
        pinsrd  xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], 3
        ; pshufb xmm0, [rel byteswap]
        movdqa  [lane_data + _outer_block], xmm0

        mov     tmp, [job + _auth_key_xor_opad]
        movdqu  xmm0, [tmp]
        movd    [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE], xmm0
        pextrd  [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], xmm0, 1
        pextrd  [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], xmm0, 2
        pextrd  [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], xmm0, 3
        jmp     start_loop

        align 16
proc_extra_blocks:
        mov     DWORD(start_offset), [lane_data + _start_offset]

        movdqa  xmm0, [state + _lens_md5]
        XPINSRW xmm0, xmm1, tmp, idx, extra_blocks, scale_x16
        movdqa  [state + _lens_md5], xmm0

        lea     tmp, [lane_data + _extra_block + start_offset]
        mov     [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp
        mov     dword [lane_data + _extra_blocks], 0
        jmp     start_loop

        align 16

copy_lt64:
        ;; less than one message block of data
        ;; source: start of the message data (p)
        ;; destination: inside the extra block, back by len from offset 64,
        ;; where the 0x80 pad byte is pre-populated
        ;; p2 clobbers unused_lanes, undo before exiting
        lea     p2, [lane_data + _extra_block + 64]
        sub     p2, len
        memcpy_sse_64_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3
        mov     unused_lanes, [state + _unused_lanes_md5]
        jmp     end_fast_copy

return_null:
        xor     job_rax, job_rax
        jmp     return

        align 16
end_loop:
        mov     job_rax, [lane_data + _job_in_lane]
        mov     unused_lanes, [state + _unused_lanes_md5]
        mov     qword [lane_data + _job_in_lane], 0
        or      dword [job_rax + _status], STS_COMPLETED_HMAC
        shl     unused_lanes, 4
        or      unused_lanes, idx
        mov     [state + _unused_lanes_md5], unused_lanes

        mov     p, [job_rax + _auth_tag_output]

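        ;; Write the authentication tag: 12 bytes when
        ;; _auth_tag_output_len_in_bytes is 12, otherwise the full 16-byte
        ;; MD5 digest.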
        ; copy 12 bytes
        mov     DWORD(tmp),  [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE]
        mov     DWORD(tmp2), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE]
        mov     DWORD(tmp3), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE]
        mov     [p + 0*4], DWORD(tmp)
        mov     [p + 1*4], DWORD(tmp2)
        mov     [p + 2*4], DWORD(tmp3)

        cmp     DWORD [job_rax + _auth_tag_output_len_in_bytes], 12
        je      return

        ; copy the remaining 4 bytes for a full 16-byte tag
        mov     DWORD(tmp3), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE]
        mov     [p + 3*4], DWORD(tmp3)

return:

        mov     rbx, [rsp + _gpr_save + 8*0]
        mov     rbp, [rsp + _gpr_save + 8*1]
        mov     r12, [rsp + _gpr_save + 8*2]
        mov     r13, [rsp + _gpr_save + 8*3]
        mov     r14, [rsp + _gpr_save + 8*4]
        mov     r15, [rsp + _gpr_save + 8*5]
%ifndef LINUX
        mov     rsi, [rsp + _gpr_save + 8*6]
        mov     rdi, [rsp + _gpr_save + 8*7]
%endif
        mov     rsp, [rsp + _rsp_save]  ; original SP

        ret

%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif