;;
;; Copyright (c) 2012-2018, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;;     * Redistributions of source code must retain the above copyright notice,
;;       this list of conditions and the following disclaimer.
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;     * Neither the name of Intel Corporation nor the names of its contributors
;;       may be used to endorse or promote products derived from this software
;;       without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;

%include "include/os.asm"
%include "job_aes_hmac.asm"
%include "mb_mgr_datastruct.asm"
%include "include/reg_sizes.asm"
%include "include/memcpy.asm"
%include "include/const.inc"

extern sha512_x4_avx2

section .data
default rel
align 16
byteswap:       ;ddq 0x08090a0b0c0d0e0f0001020304050607
        dq 0x0001020304050607, 0x08090a0b0c0d0e0f
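;; Note: this constant is used with vpshufb to reverse the byte order within
;; each 64-bit digest word, converting between the native little-endian layout
;; and SHA-512's big-endian word encoding.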

section .text

%ifndef FUNC
%define FUNC submit_job_hmac_sha_512_avx2
%define SHA_X_DIGEST_SIZE 512
%endif
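
;; Note: the SHA-384 flavour of this manager is built from the same source;
;; a wrapper file (the companion mb_mgr_hmac_sha_384_submit_avx2.asm upstream)
;; is expected to pre-define FUNC and SHA_X_DIGEST_SIZE=384 before this logic
;; is assembled, which is why the defaults above are guarded by %ifndef.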

%if 1
%ifdef LINUX
%define arg1    rdi
%define arg2    rsi
%define reg3    rcx
%define reg4    rdx
%else
%define arg1    rcx
%define arg2    rdx
%define reg3    rdi
%define reg4    rsi
%endif

%define state   arg1
%define job     arg2
%define len2    arg2

; idx needs to be in rbp, r13, r14, r15
%define last_len        rbp
%define idx             rbp

%define p               r11
%define start_offset    r11

%define unused_lanes    rbx
%define tmp4            rbx

%define job_rax         rax
%define len             rax

%define size_offset     reg3
%define tmp2            reg3

%define lane            reg4
%define tmp3            reg4

%define extra_blocks    r8

%define tmp             r9
%define p2              r9

%define lane_data       r10

%endif

; Define stack usage

; we clobber rbx, rsi, rdi, rbp; called routine also clobbers r12
struc STACK
_gpr_save:      resq    5
_rsp_save:      resq    1
endstruc
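
;; The frame holds only the callee-saved GPRs listed above plus the original
;; rsp; rsp itself is aligned down to 32 bytes after the incoming value is
;; stashed in rax, and is restored from _rsp_save on exit.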

; JOB* FUNC(MB_MGR_HMAC_sha_512_OOO *state, JOB_AES_HMAC *job)
; arg 1 : rcx : state
; arg 2 : rdx : job
MKGLOBAL(FUNC,function,internal)
FUNC:

        mov     rax, rsp
        sub     rsp, STACK_size
        and     rsp, -32
        mov     [rsp + _gpr_save + 8*0], rbx
        mov     [rsp + _gpr_save + 8*1], rbp
        mov     [rsp + _gpr_save + 8*2], r12
%ifndef LINUX
        mov     [rsp + _gpr_save + 8*3], rsi
        mov     [rsp + _gpr_save + 8*4], rdi
%endif
        mov     [rsp + _rsp_save], rax  ; original SP

        mov     unused_lanes, [state + _unused_lanes_sha512]
        movzx   lane, BYTE(unused_lanes)
        shr     unused_lanes, 8
        imul    lane_data, lane, _SHA512_LANE_DATA_size
        lea     lane_data, [state + _ldata_sha512 + lane_data]
        mov     [state + _unused_lanes_sha512], unused_lanes
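        ;; Note: _unused_lanes_sha512 appears to be kept as a byte-packed stack
        ;; of free lane ids terminated by 0xff sentinel bytes (the check at
        ;; ge128_bytes relies on this): the movzx/shr pair above pops the next
        ;; free lane, and end_loop pushes one back by shifting left 8 and
        ;; or-ing in idx.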
        mov     len, [job + _msg_len_to_hash_in_bytes]
        mov     tmp, len
        shr     tmp, 7  ; divide by 128, len in terms of blocks

        mov     [lane_data + _job_in_lane_sha512], job
        mov     dword [lane_data + _outer_done_sha512], 0

        vmovdqa xmm0, [state + _lens_sha512]
        XVPINSRW xmm0, xmm1, extra_blocks, lane, tmp, scale_x16
        vmovdqa [state + _lens_sha512], xmm0

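        ;; Compute how many additional 128-byte blocks the padded tail needs:
        ;; the final partial block (len mod 128) grows by the mandatory 0x80
        ;; pad byte plus the 16-byte length field (17 bytes in total), rounded
        ;; up to whole blocks by the +127 / shr 7 below.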
        mov     last_len, len
        and     last_len, 127
        lea     extra_blocks, [last_len + 17 + 127]
        shr     extra_blocks, 7
        mov     [lane_data + _extra_blocks_sha512], DWORD(extra_blocks)

        mov     p, [job + _src]
        add     p, [job + _hash_start_src_offset_in_bytes]
        mov     [state + _args_data_ptr_sha512 + PTR_SZ*lane], p

        cmp     len, 128
        jb      copy_lt128

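;; Message has at least one full block: copy the final 128 bytes of the input
;; into the lane's extra block, where the padded closing block(s) are built.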
fast_copy:
        add     p, len
        vmovdqu ymm0, [p - 128 + 0*32]
        vmovdqu ymm1, [p - 128 + 1*32]
        vmovdqu ymm2, [p - 128 + 2*32]
        vmovdqu ymm3, [p - 128 + 3*32]
        vmovdqu [lane_data + _extra_block_sha512 + 0*32], ymm0
        vmovdqu [lane_data + _extra_block_sha512 + 1*32], ymm1
        vmovdqu [lane_data + _extra_block_sha512 + 2*32], ymm2
        vmovdqu [lane_data + _extra_block_sha512 + 3*32], ymm3
end_fast_copy:

        mov     size_offset, extra_blocks
        shl     size_offset, 7
        sub     size_offset, last_len
        add     size_offset, 128-8
        mov     [lane_data + _size_offset_sha512], DWORD(size_offset)
        mov     start_offset, 128
        sub     start_offset, last_len
        mov     [lane_data + _start_offset_sha512], DWORD(start_offset)

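        ;; Store the total bit length of the hashed data (the 128-byte ipad
        ;; block plus the message itself, times 8) big-endian at the end of
        ;; the padded final block, per the SHA-512 padding rule.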
        lea     tmp, [8*128 + 8*len]
        bswap   tmp
        mov     [lane_data + _extra_block_sha512 + size_offset], tmp

        mov     tmp, [job + _auth_key_xor_ipad]

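        ;; Seed this lane's digest with the precomputed key XOR ipad state.
        ;; Digest words are stored transposed: one row of
        ;; SHA512_DIGEST_ROW_SIZE bytes per word, with a 64-bit slot per lane.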
%assign I 0
%rep 4
        vmovdqu xmm0, [tmp + I * 2 * SHA512_DIGEST_WORD_SIZE]
        vmovq   [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*lane + (2*I + 0)*SHA512_DIGEST_ROW_SIZE], xmm0
        vpextrq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*lane + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1
%assign I (I+1)
%endrep

        test    len, ~127
        jnz     ge128_bytes

lt128_bytes:
        vmovdqa xmm0, [state + _lens_sha512]
        XVPINSRW xmm0, xmm1, tmp, lane, extra_blocks, scale_x16
        vmovdqa [state + _lens_sha512], xmm0

        lea     tmp, [lane_data + _extra_block_sha512 + start_offset]
        mov     [state + _args_data_ptr_sha512 + PTR_SZ*lane], tmp ;; 8 to hold a UINT8
        mov     dword [lane_data + _extra_blocks_sha512], 0

ge128_bytes:
        cmp     unused_lanes, 0xff
        jne     return_null
        jmp     start_loop

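;; All four lanes are busy from here on (submit returns NULL above until the
;; free-lane stack is exhausted): pick the lane with the least work remaining,
;; hash that many blocks across all lanes, then retire the finished job.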
        align   16
start_loop:
        ; Find min length
        vmovdqa xmm0, [state + _lens_sha512]
        vphminposuw xmm1, xmm0
        vpextrw DWORD(len2), xmm1, 0    ; min value
        vpextrw DWORD(idx), xmm1, 1     ; min index (0...3)
        cmp     len2, 0
        je      len_is_0

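        ;; Broadcast the minimum length to every word and subtract, so each
        ;; lane's remaining block count drops by the amount hashed this pass.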
        vpshuflw xmm1, xmm1, 0x00
        vpsubw  xmm0, xmm0, xmm1
        vmovdqa [state + _lens_sha512], xmm0

        ; "state" and "args" are the same address, arg1
        ; len is arg2
        call    sha512_x4_avx2
        ; state and idx are intact

len_is_0:
        ; process completed job "idx"
        imul    lane_data, idx, _SHA512_LANE_DATA_size
        lea     lane_data, [state + _ldata_sha512 + lane_data]
        mov     DWORD(extra_blocks), [lane_data + _extra_blocks_sha512]
        cmp     extra_blocks, 0
        jne     proc_extra_blocks
        cmp     dword [lane_data + _outer_done_sha512], 0
        jne     end_loop

proc_outer:
        mov     dword [lane_data + _outer_done_sha512], 1
        mov     DWORD(size_offset), [lane_data + _size_offset_sha512]
        mov     qword [lane_data + _extra_block_sha512 + size_offset], 0

        vmovdqa xmm0, [state + _lens_sha512]
        XVPINSRW xmm0, xmm1, tmp, idx, 1, scale_x16
        vmovdqa [state + _lens_sha512], xmm0

        lea     tmp, [lane_data + _outer_block_sha512]
        mov     job, [lane_data + _job_in_lane_sha512]
        mov     [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp

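        ;; Byte-swap the finished inner digest into outer_block as the outer
        ;; hash input; each iteration moves 16 bytes, so the %rep count yields
        ;; 64 bytes for SHA-512 and 48 bytes for SHA-384.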
%assign I 0
%rep (SHA_X_DIGEST_SIZE / (8 * 16))
        vmovq   xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 0)*SHA512_DIGEST_ROW_SIZE]
        vpinsrq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], 1
        vpshufb xmm0, [rel byteswap]
        vmovdqa [lane_data + _outer_block_sha512 + I * 2 * SHA512_DIGEST_WORD_SIZE], xmm0
%assign I (I+1)
%endrep

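        ;; Re-seed the lane with the precomputed key XOR opad state; the lane
        ;; length was set to a single block above, so the next pass through
        ;; start_loop computes the outer hash over outer_block.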
        mov     tmp, [job + _auth_key_xor_opad]
%assign I 0
%rep 4
        vmovdqu xmm0, [tmp + I * 16]
        vmovq   [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 0)*SHA512_DIGEST_ROW_SIZE], xmm0
        vpextrq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1
%assign I (I+1)
%endrep

        jmp     start_loop

        align   16
proc_extra_blocks:
        mov     DWORD(start_offset), [lane_data + _start_offset_sha512]

        vmovdqa xmm0, [state + _lens_sha512]
        XVPINSRW xmm0, xmm1, tmp, idx, extra_blocks, scale_x16
        vmovdqa [state + _lens_sha512], xmm0

        lea     tmp, [lane_data + _extra_block_sha512 + start_offset]
        mov     [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp ;; idx is index of shortest length message
        mov     dword [lane_data + _extra_blocks_sha512], 0
        jmp     start_loop

        align   16
copy_lt128:
        ;; less than one message block of data
        ;; destination extra block but backwards by len from where 0x80 pre-populated
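        ;; memcpy_avx2_128_1 (from include/memcpy.asm) performs a variable-size
        ;; copy of up to 128 bytes, using the two GP and four ymm registers
        ;; passed here as scratch.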
        lea     p2, [lane_data + _extra_block_sha512 + 128]
        sub     p2, len
        memcpy_avx2_128_1 p2, p, len, tmp4, tmp2, ymm0, ymm1, ymm2, ymm3
        mov     unused_lanes, [state + _unused_lanes_sha512]
        jmp     end_fast_copy

return_null:
        xor     job_rax, job_rax
        jmp     return

        align   16
end_loop:
        mov     job_rax, [lane_data + _job_in_lane_sha512]
        mov     unused_lanes, [state + _unused_lanes_sha512]
        mov     qword [lane_data + _job_in_lane_sha512], 0
        or      dword [job_rax + _status], STS_COMPLETED_HMAC
        shl     unused_lanes, 8
        or      unused_lanes, idx
        mov     [state + _unused_lanes_sha512], unused_lanes

        mov     p, [job_rax + _auth_tag_output]

        vzeroupper

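;; Write the authentication tag: the standard truncated sizes (32 bytes for
;; HMAC-SHA-512, 24 bytes for HMAC-SHA-384) take the fast path below; any
;; other requested length is sent to copy_full_digest, which writes the full
;; 64 (SHA-512) or 48 (SHA-384) bytes.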
%if (SHA_X_DIGEST_SIZE != 384)
        cmp     qword [job_rax + _auth_tag_output_len_in_bytes], 32
        jne     copy_full_digest
%else
        cmp     qword [job_rax + _auth_tag_output_len_in_bytes], 24
        jne     copy_full_digest
%endif
        ;; copy 32 bytes for SHA512 / 24 bytes for SHA384
        mov     QWORD(tmp), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE]
        mov     QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE]
        mov     QWORD(tmp3), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE]
%if (SHA_X_DIGEST_SIZE != 384)
        mov     QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE]
%endif
        bswap   QWORD(tmp)
        bswap   QWORD(tmp2)
        bswap   QWORD(tmp3)
%if (SHA_X_DIGEST_SIZE != 384)
        bswap   QWORD(tmp4)
%endif
        mov     [p + 0*8], QWORD(tmp)
        mov     [p + 1*8], QWORD(tmp2)
        mov     [p + 2*8], QWORD(tmp3)
%if (SHA_X_DIGEST_SIZE != 384)
        mov     [p + 3*8], QWORD(tmp4)
%endif
        jmp     clear_ret

copy_full_digest:
        ;; copy 64 bytes for SHA512 / 48 bytes for SHA384
        mov     QWORD(tmp), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE]
        mov     QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE]
        mov     QWORD(tmp3), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE]
        mov     QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE]
        bswap   QWORD(tmp)
        bswap   QWORD(tmp2)
        bswap   QWORD(tmp3)
        bswap   QWORD(tmp4)
        mov     [p + 0*8], QWORD(tmp)
        mov     [p + 1*8], QWORD(tmp2)
        mov     [p + 2*8], QWORD(tmp3)
        mov     [p + 3*8], QWORD(tmp4)

        mov     QWORD(tmp), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 4*SHA512_DIGEST_ROW_SIZE]
        mov     QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 5*SHA512_DIGEST_ROW_SIZE]
%if (SHA_X_DIGEST_SIZE != 384)
        mov     QWORD(tmp3), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 6*SHA512_DIGEST_ROW_SIZE]
        mov     QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 7*SHA512_DIGEST_ROW_SIZE]
%endif
        bswap   QWORD(tmp)
        bswap   QWORD(tmp2)
%if (SHA_X_DIGEST_SIZE != 384)
        bswap   QWORD(tmp3)
        bswap   QWORD(tmp4)
%endif
        mov     [p + 4*8], QWORD(tmp)
        mov     [p + 5*8], QWORD(tmp2)
%if (SHA_X_DIGEST_SIZE != 384)
        mov     [p + 6*8], QWORD(tmp3)
        mov     [p + 7*8], QWORD(tmp4)
%endif

clear_ret:

%ifdef SAFE_DATA
        ;; Clear digest (48B/64B), outer_block (48B/64B) and extra_block (128B) of returned job
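        ;; Scrubbing these buffers keeps key-derived material (intermediate
        ;; digest state and the padded message tail) from lingering in the
        ;; manager after the job is returned.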
%assign J 0
%rep 6
        mov     qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + J*SHA512_DIGEST_ROW_SIZE], 0
%assign J (J+1)
%endrep
%if (SHA_X_DIGEST_SIZE != 384)
        mov     qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 6*SHA512_DIGEST_ROW_SIZE], 0
        mov     qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 7*SHA512_DIGEST_ROW_SIZE], 0
%endif

        vpxor   ymm0, ymm0
        imul    lane_data, idx, _SHA512_LANE_DATA_size
        lea     lane_data, [state + _ldata_sha512 + lane_data]
        ;; Clear first 128 bytes of extra_block
%assign offset 0
%rep 4
        vmovdqa [lane_data + _extra_block_sha512 + offset], ymm0
%assign offset (offset + 32)
%endrep

        ;; Clear first 48 bytes (SHA-384) or 64 bytes (SHA-512) of outer_block
        vmovdqu [lane_data + _outer_block_sha512], ymm0
%if (SHA_X_DIGEST_SIZE == 384)
        vmovdqa [lane_data + _outer_block_sha512 + 32], xmm0
%else
        vmovdqu [lane_data + _outer_block_sha512 + 32], ymm0
%endif
%endif ;; SAFE_DATA

return:
        mov     rbx, [rsp + _gpr_save + 8*0]
        mov     rbp, [rsp + _gpr_save + 8*1]
        mov     r12, [rsp + _gpr_save + 8*2]
%ifndef LINUX
        mov     rsi, [rsp + _gpr_save + 8*3]
        mov     rdi, [rsp + _gpr_save + 8*4]
%endif
        mov     rsp, [rsp + _rsp_save]  ; original SP
        ret

%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif