;;
;; Copyright (c) 2012-2018, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;;     * Redistributions of source code must retain the above copyright notice,
;;       this list of conditions and the following disclaimer.
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;     * Neither the name of Intel Corporation nor the names of its contributors
;;       may be used to endorse or promote products derived from this software
;;       without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;

;; In System V AMD64 ABI
;;      callee saves: RBX, RBP, R12-R15
;; Windows x64 ABI
;;      callee saves: RBX, RBP, RDI, RSI, RSP, R12-R15
;;
;; Linux/Windows clobbers: xmm0 - xmm15
;;

%include "include/os.asm"
%include "job_aes_hmac.asm"
%include "mb_mgr_datastruct.asm"
%include "include/reg_sizes.asm"
%include "include/memcpy.asm"

;%define DO_DBGPRINT
%include "include/dbgprint.asm"

extern sha256_ni

%ifdef LINUX
%define arg1    rdi
%define arg2    rsi
%define reg3    rcx
%define reg4    rdx
%else
%define arg1    rcx
%define arg2    rdx
%define reg3    rdi
%define reg4    rsi
%endif

%define state   arg1
%define job     arg2
%define len2    arg2


; idx needs to be in rbx, rbp, r13-r15
%define last_len        rbp
%define idx             rbp

%define p               r11
%define start_offset    r11

%define unused_lanes    rbx
%define tmp4            rbx

%define job_rax         rax
%define len             rax

%define size_offset     reg3
%define tmp2            reg3

%define lane            reg4

%define extra_blocks    r8

%define tmp             r9
%define p2              r9

%define lane_data       r10

%define bswap_xmm4      xmm4

struc STACK
_gpr_save:      resq    4       ; rbx, rbp, rsi (win), rdi (win)
_rsp_save:      resq    1
endstruc

section .data
default rel
align 16
byteswap:
        dq 0x0405060700010203
        dq 0x0c0d0e0f08090a0b

section .text

%ifdef SHA224
; JOB* submit_job_hmac_sha_224_ni_sse(MB_MGR_HMAC_SHA_256_OOO *state, JOB_AES_HMAC *job)
; arg 1 : state
; arg 2 : job
MKGLOBAL(submit_job_hmac_sha_224_ni_sse,function,internal)
submit_job_hmac_sha_224_ni_sse:

%else

; JOB* submit_job_hmac_sha_256_ni_sse(MB_MGR_HMAC_SHA_256_OOO *state, JOB_AES_HMAC *job)
; arg 1 : state
; arg 2 : job
MKGLOBAL(submit_job_hmac_sha_256_ni_sse,function,internal)
submit_job_hmac_sha_256_ni_sse:
%endif

        mov     rax, rsp
        sub     rsp, STACK_size
        and     rsp, -16

        mov     [rsp + _gpr_save + 8*0], rbx
        mov     [rsp + _gpr_save + 8*1], rbp
%ifndef LINUX
        mov     [rsp + _gpr_save + 8*2], rsi
        mov     [rsp + _gpr_save + 8*3], rdi
%endif
        mov     [rsp + _rsp_save], rax  ; original SP

        DBGPRINTL "enter sha256-ni-sse submit"

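        ;; unused_lanes is a stack of free lane ids packed one per byte;
        ;; the bottom byte is the next lane to use (it is pushed back in
        ;; end_loop when the job completes)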
        mov     unused_lanes, [state + _unused_lanes_sha256]
        movzx   lane, BYTE(unused_lanes)
        DBGPRINTL64 "lane: ", lane
        shr     unused_lanes, 8
        imul    lane_data, lane, _HMAC_SHA1_LANE_DATA_size ; SHA1 & SHA256 lane data is the same
        lea     lane_data, [state + _ldata_sha256 + lane_data]
        mov     [state + _unused_lanes_sha256], unused_lanes
        mov     len, [job + _msg_len_to_hash_in_bytes]
        DBGPRINTL64 "length: ", len
        mov     tmp, len
        shr     tmp, 6  ; divide by 64, len in terms of blocks

        mov     [lane_data + _job_in_lane], job
        mov     dword [lane_data + _outer_done], 0
        mov     [state + _lens_sha256 + 2*lane], WORD(tmp)

        mov     last_len, len
        and     last_len, 63
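        ;; SHA-256 padding needs 1 byte (0x80) plus an 8-byte length after
        ;; the tail, hence last_len + 9; adding 63 and shifting right by 6
        ;; rounds up to whole 64-byte blocks (e.g. last_len = 56 gives
        ;; 2 extra blocks, last_len = 0 gives 1 block of padding alone)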
        lea     extra_blocks, [last_len + 9 + 63]
        shr     extra_blocks, 6
        mov     [lane_data + _extra_blocks], DWORD(extra_blocks)

        mov     p, [job + _src]
        add     p, [job + _hash_start_src_offset_in_bytes]
        mov     [state + _args_data_ptr_sha256 + 8*lane], p

        cmp     len, 64
        jb      copy_lt64

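        ;; copy the last 64 bytes of the message into the lane's extra_block
        ;; buffer; the message tail then ends at offset 64, just below the
        ;; pre-populated 0x80 padding byte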
fast_copy:
        add     p, len
        movdqu  xmm0, [p - 64 + 0*16]
        movdqu  xmm1, [p - 64 + 1*16]
        movdqu  xmm2, [p - 64 + 2*16]
        movdqu  xmm3, [p - 64 + 3*16]
        movdqa  [lane_data + _extra_block + 0*16], xmm0
        movdqa  [lane_data + _extra_block + 1*16], xmm1
        movdqa  [lane_data + _extra_block + 2*16], xmm2
        movdqa  [lane_data + _extra_block + 3*16], xmm3
end_fast_copy:

        mov     size_offset, extra_blocks
        shl     size_offset, 6
        sub     size_offset, last_len
        add     size_offset, 64-8
        mov     [lane_data + _size_offset], DWORD(size_offset)
        mov     start_offset, 64
        sub     start_offset, last_len
        mov     [lane_data + _start_offset], DWORD(start_offset)

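        ;; start_offset is where hashing of the extra blocks begins inside
        ;; extra_block and size_offset is where the 8-byte length field
        ;; lands, so that start_offset + extra_blocks*64 - 8 == size_offset

        ;; the hashed length in bits is (64 + len) * 8, since the 64-byte
        ;; key-XOR-ipad block counts towards the message length; bswap
        ;; stores the value big-endian as SHA-256 requires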
        lea     tmp, [8*64 + 8*len]
        bswap   tmp
        mov     [lane_data + _extra_block + size_offset], tmp

        mov     tmp, [job + _auth_key_xor_ipad]
        movdqu  xmm0, [tmp]
        movdqu  xmm1, [tmp + 4*4]
%if SHA256NI_DIGEST_ROW_SIZE != 32
%error "Below code has been optimized for SHA256NI_DIGEST_ROW_SIZE = 32!"
%endif
        lea     tmp, [lane*8]   ; x8 here plus the x4 scale factor below gives x32
        movdqu  [state + _args_digest_sha256 + tmp*4], xmm0
        movdqu  [state + _args_digest_sha256 + tmp*4 + 4*4], xmm1
        DBGPRINTL "args digest:"
        DBGPRINT_XMM xmm0
        DBGPRINT_XMM xmm1
        test    len, ~63
        jnz     ge64_bytes

lt64_bytes:
        mov     [state + _lens_sha256 + 2*lane], WORD(extra_blocks)
        lea     tmp, [lane_data + _extra_block + start_offset]
        mov     [state + _args_data_ptr_sha256 + 8*lane], tmp
        mov     dword [lane_data + _extra_blocks], 0

ge64_bytes:
        cmp     unused_lanes, 0xff
        jne     return_null
        jmp     start_loop

align 16
start_loop:
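        ;; Each lens_sha256 entry is a 16-bit block count. len2 is zeroed and
        ;; tmp preloaded with 0x10000 so that, after the 16-bit loads below,
        ;; bits [16:31] carry the lane index (0 or 1); cmovg then selects the
        ;; smaller length together with its lane in one move. Illustrative
        ;; C sketch of the selection (ignoring the signedness of the compare):
        ;;     uint32_t a = lens[0] | (0 << 16), b = lens[1] | (1 << 16);
        ;;     uint32_t min = ((uint16_t)a > (uint16_t)b) ? b : a;
        ;;     idx = min >> 16; len2 = min & 0xffff;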
        ; Find min length - only two lanes available
        xor     len2, len2
        mov     tmp, 0x10000
        mov     WORD(len2), word [state + _lens_sha256 + 0*2]   ; [0:15] - lane 0 length, [16:31] - lane index (0)
        mov     WORD(tmp), word [state + _lens_sha256 + 1*2]    ; [0:15] - lane 1 length, [16:31] - lane index (1)
        cmp     WORD(len2), WORD(tmp)
        cmovg   DWORD(len2), DWORD(tmp) ; move if lane 0 length is greater than lane 1 length

        mov     idx, len2       ; retrieve index & length from [16:31] and [0:15] bit fields
        shr     DWORD(idx), 16
        and     DWORD(len2), 0xffff
        je      len_is_0

        sub     word [state + _lens_sha256 + 0*2], WORD(len2)
        sub     word [state + _lens_sha256 + 1*2], WORD(len2)

        ; "state" and "args" are the same address, arg1
        ; len is arg2
        call    sha256_ni
        ; state is intact
len_is_0:
        ; process completed job "idx"
        imul    lane_data, idx, _HMAC_SHA1_LANE_DATA_size
        lea     lane_data, [state + _ldata_sha256 + lane_data]
        mov     DWORD(extra_blocks), [lane_data + _extra_blocks]
        cmp     extra_blocks, 0
        jne     proc_extra_blocks
        movdqa  bswap_xmm4, [rel byteswap]
        cmp     dword [lane_data + _outer_done], 0
        jne     end_loop

proc_outer:
        mov     dword [lane_data + _outer_done], 1
        mov     DWORD(size_offset), [lane_data + _size_offset]
        mov     qword [lane_data + _extra_block + size_offset], 0
        mov     word [state + _lens_sha256 + 2*idx], 1
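        ;; lens entry of 1 block: the outer hash input in outer_block
        ;; (byte-swapped inner digest followed by padding, the latter
        ;; presumably pre-populated at lane init) is a single 64-byte block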
        lea     tmp, [lane_data + _outer_block]
        mov     job, [lane_data + _job_in_lane]
        mov     [state + _args_data_ptr_sha256 + PTR_SZ*idx], tmp

%if SHA256NI_DIGEST_ROW_SIZE != 32
%error "Below code has been optimized for SHA256NI_DIGEST_ROW_SIZE = 32!"
%endif
        lea     tmp4, [idx*8]   ; x8 here plus the x4 scale factor below gives x32
        movdqu  xmm0, [state + _args_digest_sha256 + tmp4*4]
        movdqu  xmm1, [state + _args_digest_sha256 + tmp4*4 + 4*4]
        pshufb  xmm0, bswap_xmm4
        pshufb  xmm1, bswap_xmm4
        movdqa  [lane_data + _outer_block], xmm0
        movdqa  [lane_data + _outer_block + 4*4], xmm1
%ifdef SHA224
        ;; SHA-224 inner digest is 28 bytes: place the 0x80 padding byte at
        ;; offset 28 (the dword store zeroes bytes 29-31)
        mov     dword [lane_data + _outer_block + 7*4], 0x80
%endif

        mov     tmp, [job + _auth_key_xor_opad]
        movdqu  xmm0, [tmp]
        movdqu  xmm1, [tmp + 4*4]
        movdqu  [state + _args_digest_sha256 + tmp4*4], xmm0
        movdqu  [state + _args_digest_sha256 + tmp4*4 + 4*4], xmm1
        jmp     start_loop

align 16
proc_extra_blocks:
        mov     DWORD(start_offset), [lane_data + _start_offset]
        mov     [state + _lens_sha256 + 2*idx], WORD(extra_blocks)
        lea     tmp, [lane_data + _extra_block + start_offset]
        mov     [state + _args_data_ptr_sha256 + PTR_SZ*idx], tmp
        mov     dword [lane_data + _extra_blocks], 0
        jmp     start_loop

align 16

copy_lt64:
        ;; less than one message block of data
        ;; source: beginning of the message
        ;; destination: extra_block, placed so that the data ends at offset
        ;;              64, just below the pre-populated 0x80 padding byte
        ;; p2 clobbers unused_lanes, restore before exit
        lea     p2, [lane_data + _extra_block + 64]
        sub     p2, len
        memcpy_sse_64_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3
        mov     unused_lanes, [state + _unused_lanes_sha256]
        jmp     end_fast_copy

return_null:
        xor     job_rax, job_rax
        jmp     return

align 16
end_loop:
        mov     job_rax, [lane_data + _job_in_lane]
        mov     unused_lanes, [state + _unused_lanes_sha256]
        mov     qword [lane_data + _job_in_lane], 0
        or      dword [job_rax + _status], STS_COMPLETED_HMAC
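        ;; push the completed lane id back onto the byte-packed stack of
        ;; free lanes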
        shl     unused_lanes, 8
        or      unused_lanes, idx
        mov     [state + _unused_lanes_sha256], unused_lanes

        mov     p, [job_rax + _auth_tag_output]

        ;; copy 16 bytes for SHA-256 or 14 bytes for SHA-224 on the fast
        ;; path; any other requested tag length goes to copy_full_digest
%if SHA256NI_DIGEST_ROW_SIZE != 32
%error "Below code has been optimized for SHA256NI_DIGEST_ROW_SIZE = 32!"
%endif
        shl     idx, 5
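        ;; idx now holds the byte offset of this lane's digest row (lane * 32)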

%ifdef SHA224
        cmp     qword [job_rax + _auth_tag_output_len_in_bytes], 14
        jne     copy_full_digest
%else
        cmp     qword [job_rax + _auth_tag_output_len_in_bytes], 16
        jne     copy_full_digest
%endif

        movdqu  xmm0, [state + _args_digest_sha256 + idx]
        pshufb  xmm0, bswap_xmm4
%ifdef SHA224
        ;; SHA224
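        ;; movq (8 bytes) + pextrd (4 bytes) + pextrw (2 bytes) = 14-byte tag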
        movq    [p + 0*4], xmm0
        pextrd  [p + 2*4], xmm0, 2
        pextrw  [p + 3*4], xmm0, 6
%else
        ;; SHA256
        movdqu  [p], xmm0
%endif
        jmp     clear_ret

copy_full_digest:
        movdqu  xmm0, [state + _args_digest_sha256 + idx]
        movdqu  xmm1, [state + _args_digest_sha256 + idx + 16]
        pshufb  xmm0, bswap_xmm4
        pshufb  xmm1, bswap_xmm4
%ifdef SHA224
        ;; SHA224
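        ;; 16 + 8 + 4 = 28 bytes, the full SHA-224 digest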
        movdqu  [p], xmm0
        movq    [p + 16], xmm1
        pextrd  [p + 16 + 8], xmm1, 2
%else
        ;; SHA256
        movdqu  [p], xmm0
        movdqu  [p + 16], xmm1
%endif

clear_ret:

%ifdef SAFE_DATA
        pxor    xmm0, xmm0
        ;; Clear digest, outer_block (28B/32B) and extra_block (64B) of returned job
        movdqa  [state + _args_digest_sha256 + idx], xmm0
        movdqa  [state + _args_digest_sha256 + idx + 16], xmm0

        shr     idx, 5 ;; Restore lane idx to 0 or 1
        imul    lane_data, idx, _HMAC_SHA1_LANE_DATA_size
        lea     lane_data, [state + _ldata_sha256 + lane_data]
        ;; Clear first 64 bytes of extra_block
%assign offset 0
%rep 4
        movdqa  [lane_data + _extra_block + offset], xmm0
%assign offset (offset + 16)
%endrep

        ;; Clear first 28 bytes (SHA-224) or 32 bytes (SHA-256) of outer_block
        movdqa  [lane_data + _outer_block], xmm0
%ifdef SHA224
        mov     qword [lane_data + _outer_block + 16], 0
        mov     dword [lane_data + _outer_block + 24], 0
%else
        movdqa  [lane_data + _outer_block + 16], xmm0
%endif
%endif ;; SAFE_DATA

return:
        mov     rbx, [rsp + _gpr_save + 8*0]
        mov     rbp, [rsp + _gpr_save + 8*1]
%ifndef LINUX
        mov     rsi, [rsp + _gpr_save + 8*2]
        mov     rdi, [rsp + _gpr_save + 8*3]
%endif
        mov     rsp, [rsp + _rsp_save]  ; original SP
        ret

%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif