2 ;; Copyright (c) 2017-2018, Intel Corporation
4 ;; Redistribution and use in source and binary forms, with or without
5 ;; modification, are permitted provided that the following conditions are met:
7 ;; * Redistributions of source code must retain the above copyright notice,
8 ;; this list of conditions and the following disclaimer.
9 ;; * Redistributions in binary form must reproduce the above copyright
10 ;; notice, this list of conditions and the following disclaimer in the
11 ;; documentation and/or other materials provided with the distribution.
12 ;; * Neither the name of Intel Corporation nor the names of its contributors
13 ;; may be used to endorse or promote products derived from this software
14 ;; without specific prior written permission.
16 ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20 ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;; In System V AMD64 ABI
29 ;; callee saves: RBX, RBP, R12-R15
31 ;; callee saves: RBX, RBP, RDI, RSI, RSP, R12-R15
33 ;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15
34 ;; -----------------------------------------------------------
35 ;; Windows clobbers: RAX RCX RDX R8 R9 R10 R11
36 ;; Windows preserves: RBX RBP RSI RDI R12 R13 R14 R15
37 ;; -----------------------------------------------------------
38 ;; Linux clobbers: RAX RCX RDX RSI RDI R8 R9 R10 R11
39 ;; Linux preserves: RBX RBP R12 R13 R14 R15
40 ;; -----------------------------------------------------------
44 %include "job_aes_hmac.asm"
45 %include "mb_mgr_datastruct.asm"
46 %include "reg_sizes.asm"
48 ;; %define DO_DBGPRINT
49 %include "dbgprint.asm"
51 extern sha256_x16_avx512
; 16-byte shuffle control: read as bytes in memory order this is
; 03 02 01 00 | 07 06 05 04 | 0b 0a 09 08 | 0f 0e 0d 0c, i.e. it reverses
; the byte order inside each 32-bit dword when used as a pshufb mask.
; NOTE(review): presumably this is the "byteswap" constant referenced by
; the vpshufb instructions in the flush routine; its label is not visible
; in this view.
57 dq 0x0405060700010203, 0x0c0d0e0f08090a0b
; 16 rows of 32 bytes each. Row i contains 0xFFFF in 16-bit word i and
; zero everywhere else, so OR-ing row i into a vector of sixteen 16-bit
; values forces element i to 0xFFFF and leaves the others unchanged.
; NOTE(review): presumably this is the "len_masks" table that the flush
; routine indexes as [rel len_masks + 32*I]; its label is not visible in
; this view.
61 dq 0x000000000000FFFF, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
62 dq 0x00000000FFFF0000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
63 dq 0x0000FFFF00000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
64 dq 0xFFFF000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
65 dq 0x0000000000000000, 0x000000000000FFFF, 0x0000000000000000, 0x0000000000000000
66 dq 0x0000000000000000, 0x00000000FFFF0000, 0x0000000000000000, 0x0000000000000000
67 dq 0x0000000000000000, 0x0000FFFF00000000, 0x0000000000000000, 0x0000000000000000
68 dq 0x0000000000000000, 0xFFFF000000000000, 0x0000000000000000, 0x0000000000000000
69 dq 0x0000000000000000, 0x0000000000000000, 0x000000000000FFFF, 0x0000000000000000
70 dq 0x0000000000000000, 0x0000000000000000, 0x00000000FFFF0000, 0x0000000000000000
71 dq 0x0000000000000000, 0x0000000000000000, 0x0000FFFF00000000, 0x0000000000000000
72 dq 0x0000000000000000, 0x0000000000000000, 0xFFFF000000000000, 0x0000000000000000
73 dq 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x000000000000FFFF
74 dq 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x00000000FFFF0000
75 dq 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000FFFF00000000
76 dq 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xFFFF000000000000
; Register aliases for the flush routine that follows.
; NOTE(review): the routine also uses aliases (state, idx, job, job_rax,
; len2, tmp, tmp2, tmp4, tmp5, p, arg2) whose definitions are not visible
; in this view.
111 ; idx needs to be in rbp, r15
; unused_lanes: holds the value loaded from / stored to
; [state + _unused_lanes_sha256] when a job completes.
114 %define unused_lanes r10
; lane_data: address of the selected lane's entry inside _ldata_sha256.
117 %define lane_data rbx
; size_offset and start_offset both alias rax; each is loaded from the
; lane data immediately before it is used as an address offset.
122 %define size_offset rax
123 %define start_offset rax
; extra_blocks: loaded from [lane_data + _extra_blocks]; aliases arg2,
; which is defined outside this view.
127 %define extra_blocks arg2
; len_upper / idx_upper: minimum length and its position among the upper
; eight 16-bit entries of _lens_sha256.
133 %define len_upper r13
134 %define idx_upper r14
137 ; we clobber rsi, rbp; called routine also clobbers rax, r9 to r15
; APPEND(a,b): token-pastes a and b into one identifier; the flush routine
; uses it to build the lane_<I> symbol names.
143 %define APPEND(a,b) a %+ b
145 ; JOB* flush_job_hmac_sha_224_avx512(MB_MGR_HMAC_SHA_256_OOO *state)
146 ; JOB* flush_job_hmac_sha_256_avx512(MB_MGR_HMAC_SHA_256_OOO *state)
;
; Flush entry point for the HMAC-SHA-224 / HMAC-SHA-256 multi-buffer
; manager state. Flow that can be read from the visible instructions:
;   1. save general-purpose registers into the _gpr_save area
;   2. test the in-use lane counter
;   3. pick a lane that holds a job and point every job-less lane at that
;      lane's data pointer, OR-ing a per-lane mask into its length
;   4. find the minimum of the sixteen 16-bit lengths, subtract it from
;      all of them and call sha256_x16_avx512
;   5. for the selected lane: queue remaining extra blocks, or set up the
;      outer hash, or mark the job completed and copy out the tag
;   6. restore the saved registers and the original stack pointer
;
; NOTE(review): this view omits a number of lines (local labels, most
; conditional branches, any conditional-assembly / repeat directives, the
; stack frame setup and the final return). How the sections below are
; connected must be confirmed against the complete file.
150 MKGLOBAL(flush_job_hmac_sha_224_avx512,function,internal)
151 flush_job_hmac_sha_224_avx512:
153 MKGLOBAL(flush_job_hmac_sha_256_avx512,function,internal)
154 flush_job_hmac_sha_256_avx512:
; --- save registers that must be preserved for the caller ---
159 mov [rsp + _gpr_save + 8*0], rbx
160 mov [rsp + _gpr_save + 8*1], rbp
161 mov [rsp + _gpr_save + 8*2], r12
162 mov [rsp + _gpr_save + 8*3], r13
163 mov [rsp + _gpr_save + 8*4], r14
164 mov [rsp + _gpr_save + 8*5], r15
; rsi / rdi are additionally preserved by the Windows ABI (see the ABI
; notes at the top of the file).
166 mov [rsp + _gpr_save + 8*6], rsi
167 mov [rsp + _gpr_save + 8*7], rdi
; NOTE(review): rax presumably holds the caller's rsp captured before the
; frame was set up; that instruction is not visible in this view.
169 mov [rsp + _rsp_save], rax ; original SP
171 ; check the in-use lane counter; zero means all lanes are empty
172 cmp dword [state + _num_lanes_inuse_sha256], 0
175 ; find a lane with a non-null job
; NOTE(review): I is presumably a repeat counter over the lanes; the
; surrounding repeat directive is not visible in this view.
180 cmp qword [state + _ldata_sha256 + (I * _HMAC_SHA1_LANE_DATA_size) + _job_in_lane], 0
; idx = value stored at lane_<I> when lane I holds a job
181 cmovne idx, [rel APPEND(lane_,I)]
186 ; copy idx to empty lanes
; ymm0 = all sixteen 16-bit lane lengths; tmp = data pointer of lane idx
187 vmovdqa ymm0, [state + _lens_sha256]
188 mov tmp, [state + _args_data_ptr_sha256 + PTR_SZ*idx]
192 cmp qword [state + _ldata_sha256 + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
; job-less lane I: reuse lane idx's data pointer and OR the lane's mask
; row into the length vector (the branch that skips occupied lanes is not
; visible in this view)
194 mov [state + _args_data_ptr_sha256 + PTR_SZ*I], tmp
195 vpor ymm0, ymm0, [rel len_masks + 32*I]
200 vmovdqa [state + _lens_sha256 ], ymm0
; minimum over the lower eight lengths (low 128 bits of ymm0)
202 vphminposuw xmm1, xmm0
203 vpextrw DWORD(len2), xmm1, 0 ; min value
204 vpextrw DWORD(idx), xmm1, 1 ; min index (0...7)
; minimum over the upper eight lengths (bytes 16..31 of _lens_sha256)
206 vmovdqa xmm2, [state + _lens_sha256 + 8*2]
207 vphminposuw xmm3, xmm2
208 vpextrw DWORD(len_upper), xmm3, 0 ; min value
209 vpextrw DWORD(idx_upper), xmm3, 1 ; min index within upper half (0...7); 8 is added below
; NOTE(review): presumably executed only when the upper-half minimum is
; the smaller one; the comparison and branch are not visible in this view.
216 mov idx, idx_upper ; idx would be in range 0..7
217 add idx, 8 ; to reflect that index is in 8..F range
; subtract the broadcast minimum from all sixteen lengths and store them
223 vpbroadcastw xmm1, xmm1 ; duplicate words across all lanes
224 vpsubw xmm0, xmm0, xmm1
225 vmovdqa [state + _lens_sha256], xmm0
226 vpsubw xmm2, xmm2, xmm1
227 vmovdqa [state + _lens_sha256 + 8*2], xmm2
229 ; "state" and "args" are the same address, arg1
231 call sha256_x16_avx512
232 ; state and idx are intact
235 ; process completed job "idx"
; lane_data = state + _ldata_sha256 + idx * _HMAC_SHA1_LANE_DATA_size
236 imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
237 lea lane_data, [state + _ldata_sha256 + lane_data]
238 mov DWORD(extra_blocks), [lane_data + _extra_blocks]
; NOTE(review): the compare that sets the flags for this branch is not
; visible in this view.
240 jne proc_extra_blocks
241 cmp dword [lane_data + _outer_done], 0
; --- outer hash setup: mark the outer pass as started, queue one block
; from _outer_block for lane idx ---
245 mov dword [lane_data + _outer_done], 1
246 mov DWORD(size_offset), [lane_data + _size_offset]
247 mov qword [lane_data + _extra_block + size_offset], 0
248 mov word [state + _lens_sha256 + 2*idx], 1
249 lea tmp, [lane_data + _outer_block]
250 mov [state + _args_data_ptr_sha256 + PTR_SZ*idx], tmp
; gather lane idx's digest words (one dword per digest row) into
; xmm0 (rows 0..3) and xmm1 (rows 4..7), byte-swapping each dword
252 vmovd xmm0, [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
253 vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], 1
254 vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], 2
255 vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], 3
256 vpshufb xmm0, xmm0, [rel byteswap]
257 vmovd xmm1, [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE]
258 vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], 1
259 vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], 2
261 vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], 3
263 vpshufb xmm1, xmm1, [rel byteswap]
; store the 32 gathered bytes at the start of the outer block
265 vmovdqa [lane_data + _outer_block], xmm0
266 vmovdqa [lane_data + _outer_block + 4*4], xmm1
; NOTE(review): this overwrites dword 7 of the outer block just stored
; above; presumably it is assembled only for the SHA-224 variant (the
; conditional-assembly directives are not visible in this view).
268 mov dword [lane_data + _outer_block + 7*4], 0x80
; load the job's _auth_key_xor_opad pointer and scatter its words into
; lane idx's digest rows
271 mov job, [lane_data + _job_in_lane]
272 mov tmp, [job + _auth_key_xor_opad]
; NOTE(review): only the load of bytes 16..31 into xmm1 is visible here;
; the matching load of bytes 0..15 into xmm0 is not visible in this view.
274 vmovdqu xmm1, [tmp + 4*4]
275 vmovd [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE], xmm0
276 vpextrd [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], xmm0, 1
277 vpextrd [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], xmm0, 2
278 vpextrd [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], xmm0, 3
279 vmovd [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE], xmm1
280 vpextrd [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], xmm1, 1
281 vpextrd [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], xmm1, 2
282 vpextrd [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], xmm1, 3
; --- extra blocks: set lane idx's length to extra_blocks, point its data
; pointer at _extra_block + start_offset, then clear the stored count ---
; NOTE(review): presumably this is the proc_extra_blocks target; the label
; itself is not visible in this view.
287 mov DWORD(start_offset), [lane_data + _start_offset]
288 mov [state + _lens_sha256 + 2*idx], WORD(extra_blocks)
289 lea tmp, [lane_data + _extra_block + start_offset]
290 mov [state + _args_data_ptr_sha256 + PTR_SZ*idx], tmp
291 mov dword [lane_data + _extra_blocks], 0
; --- job completion: detach the job from the lane and flag it done ---
300 mov job_rax, [lane_data + _job_in_lane]
301 mov qword [lane_data + _job_in_lane], 0
302 or dword [job_rax + _status], STS_COMPLETED_HMAC
; NOTE(review): the instructions that modify unused_lanes between this
; load and the store below are not visible in this view.
303 mov unused_lanes, [state + _unused_lanes_sha256]
306 mov [state + _unused_lanes_sha256], unused_lanes
308 sub dword [state + _num_lanes_inuse_sha256], 1
; p = destination buffer for the authentication tag
310 mov p, [job_rax + _auth_tag_output]
312 ; copy SHA224=14bytes and SHA256=16bytes
313 mov DWORD(tmp), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
314 mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE]
315 mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE]
316 mov DWORD(tmp5), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE]
322 mov [p + 0*4], DWORD(tmp)
323 mov [p + 1*4], DWORD(tmp2)
324 mov [p + 2*4], DWORD(tmp4)
; NOTE(review): both stores below target [p + 3*4]; the 2-byte store
; gives the 14-byte tag and the 4-byte store the 16-byte tag, so presumably
; only one of them is assembled per variant (directives not visible here).
326 mov [p + 3*4], WORD(tmp5)
328 mov [p + 3*4], DWORD(tmp5)
; --- restore the registers saved on entry ---
334 mov rbx, [rsp + _gpr_save + 8*0]
335 mov rbp, [rsp + _gpr_save + 8*1]
336 mov r12, [rsp + _gpr_save + 8*2]
337 mov r13, [rsp + _gpr_save + 8*3]
338 mov r14, [rsp + _gpr_save + 8*4]
339 mov r15, [rsp + _gpr_save + 8*5]
341 mov rsi, [rsp + _gpr_save + 8*6]
342 mov rdi, [rsp + _gpr_save + 8*7]
344 mov rsp, [rsp + _rsp_save] ; original SP
349 section .note.GNU-stack noalloc noexec nowrite progbits