%include "mb_mgr_datastruct.asm"
%include "reg_sizes.asm"
%include "memcpy.asm"
+%include "const.inc"
extern sha_256_mult_avx
mov [lane_data + _job_in_lane], job
mov dword [lane_data + _outer_done], 0
- mov [state + _lens_sha256 + 2*lane], WORD(tmp)
+
+ vmovdqa xmm0, [state + _lens_sha256] ; aligned 128-bit load of the data at state + _lens_sha256
+ XVPINSRW xmm0, xmm1, p, lane, tmp, scale_x16 ; presumably sets 16-bit element [lane] of xmm0 to tmp, with xmm1 and p as scratch - macro is defined outside this view, confirm operand roles
+ vmovdqa [state + _lens_sha256], xmm0 ; write the whole 128-bit vector back to state
mov last_len, len
and last_len, 63
jnz ge64_bytes
lt64_bytes:
- mov [state + _lens_sha256 + 2*lane], WORD(extra_blocks)
+ vmovdqa xmm0, [state + _lens_sha256] ; aligned 128-bit load of the data at state + _lens_sha256
+ XVPINSRW xmm0, xmm1, tmp, lane, extra_blocks, scale_x16 ; presumably sets 16-bit element [lane] to extra_blocks; tmp looks like scratch here (it is reloaded by the lea that follows) - confirm against the macro definition
+ vmovdqa [state + _lens_sha256], xmm0 ; write the whole 128-bit vector back to state
+
lea tmp, [lane_data + _extra_block + start_offset]
mov [state + _args_data_ptr_sha256 + 8*lane], tmp
mov dword [lane_data + _extra_blocks], 0
mov dword [lane_data + _outer_done], 1
mov DWORD(size_offset), [lane_data + _size_offset]
mov qword [lane_data + _extra_block + size_offset], 0
- mov word [state + _lens_sha256 + 2*idx], 1
+
+ vmovdqa xmm0, [state + _lens_sha256] ; aligned 128-bit load of the data at state + _lens_sha256
+ XVPINSRW xmm0, xmm1, tmp, idx, 1, scale_x16 ; presumably sets 16-bit element [idx] to the constant 1; tmp looks like scratch (reloaded by the lea that follows) - confirm against the macro definition
+ vmovdqa [state + _lens_sha256], xmm0 ; write the whole 128-bit vector back to state
+
lea tmp, [lane_data + _outer_block]
mov job, [lane_data + _job_in_lane]
mov [state + _args_data_ptr_sha256 + 8*idx], tmp
align 16
proc_extra_blocks:
mov DWORD(start_offset), [lane_data + _start_offset]
- mov [state + _lens_sha256 + 2*idx], WORD(extra_blocks)
+
+ vmovdqa xmm0, [state + _lens_sha256] ; aligned 128-bit load of the data at state + _lens_sha256
+ XVPINSRW xmm0, xmm1, tmp, idx, extra_blocks, scale_x16 ; presumably sets 16-bit element [idx] to extra_blocks; tmp looks like scratch (reloaded by the lea that follows) - confirm against the macro definition
+ vmovdqa [state + _lens_sha256], xmm0 ; write the whole 128-bit vector back to state
+
lea tmp, [lane_data + _extra_block + start_offset]
mov [state + _args_data_ptr_sha256 + 8*idx], tmp
mov dword [lane_data + _extra_blocks], 0
mov p, [job_rax + _auth_tag_output]
- ; copy 14 bytes for SHA224 and 16 bytes for SHA256
+%ifdef SHA224
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 14 ; SHA224 build: is the requested tag length exactly 14 bytes?
+ jne copy_full_digest ; any other length takes the 28-byte copy - NOTE(review): lengths other than 14 or 28 also land there; confirm callers validate the length
+%else
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 16 ; non-SHA224 build: is the requested tag length exactly 16 bytes?
+ jne copy_full_digest ; any other length takes the 32-byte copy - NOTE(review): lengths other than 16 or 32 also land there; confirm callers validate the length
+%endif
+ ; truncated tag: fall through and copy 14 bytes for SHA224 / 16 bytes for SHA256
mov DWORD(tmp), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE]
mov DWORD(tmp3), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE]
mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE]
-
bswap DWORD(tmp)
bswap DWORD(tmp2)
bswap DWORD(tmp3)
bswap DWORD(tmp4)
-
mov [p + 0*4], DWORD(tmp)
mov [p + 1*4], DWORD(tmp2)
mov [p + 2*4], DWORD(tmp3)
-
%ifdef SHA224
mov [p + 3*4], WORD(tmp4)
%else
mov [p + 3*4], DWORD(tmp4)
%endif
+ jmp return
+
+copy_full_digest:
+ ;; Full-length tag: copy 28 bytes for SHA224 / 32 bytes for SHA256 to [p], byte-swapping each 32-bit word
+ mov DWORD(tmp), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE] ; dword at 4*idx within row 0 of the digest area
+ mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE] ; same column, row 1
+ mov DWORD(tmp3), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE] ; same column, row 2
+ mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE] ; same column, row 3
+ bswap DWORD(tmp) ; reverse the byte order of each word before it is stored
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp3)
+ bswap DWORD(tmp4)
+ mov [p + 0*4], DWORD(tmp) ; output bytes 0..3
+ mov [p + 1*4], DWORD(tmp2) ; output bytes 4..7
+ mov [p + 2*4], DWORD(tmp3) ; output bytes 8..11
+ mov [p + 3*4], DWORD(tmp4) ; output bytes 12..15
+
+ mov DWORD(tmp), [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE] ; second half: rows 4..6, reusing the same temporaries
+ mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp3), [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE]
+%ifndef SHA224
+ mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE] ; row 7 is read only when SHA224 is not defined
+%endif
+ bswap DWORD(tmp)
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp3)
+%ifndef SHA224
+ bswap DWORD(tmp4)
+%endif
+ mov [p + 4*4], DWORD(tmp) ; output bytes 16..19
+ mov [p + 5*4], DWORD(tmp2) ; output bytes 20..23
+ mov [p + 6*4], DWORD(tmp3) ; output bytes 24..27 (last store when SHA224 is defined)
+%ifndef SHA224
+ mov [p + 7*4], DWORD(tmp4) ; output bytes 28..31, written only when SHA224 is not defined
+%endif
+
return:
mov rbx, [rsp + _gpr_save + 8*0]