%include "mb_mgr_datastruct.asm"
%include "reg_sizes.asm"
%include "memcpy.asm"
+%include "const.inc"
extern sha512_x2_avx
mov [lane_data + _job_in_lane_sha512], job
mov dword [lane_data + _outer_done_sha512], 0
- mov [state + _lens_sha512 + 2*lane], WORD(tmp) ; 2 is word size in bytes
+
+ vmovdqa xmm0, [state + _lens_sha512]
+ XVPINSRW xmm0, xmm1, p, lane, tmp, scale_x16
+ vmovdqa [state + _lens_sha512], xmm0
mov last_len, len
and last_len, 127
jnz ge128_bytes
lt128_bytes:
- mov [state + _lens_sha512 + 2*lane], WORD(extra_blocks)
+ vmovdqa xmm0, [state + _lens_sha512]
+ XVPINSRW xmm0, xmm1, tmp, lane, extra_blocks, scale_x16
+ vmovdqa [state + _lens_sha512], xmm0
+
lea tmp, [lane_data + _extra_block_sha512 + start_offset]
mov [state + _args_data_ptr_sha512 + PTR_SZ*lane], tmp ;; 8 to hold a UINT8
mov dword [lane_data + _extra_blocks_sha512], 0
mov dword [lane_data + _outer_done_sha512], 1
mov DWORD(size_offset), [lane_data + _size_offset_sha512]
mov qword [lane_data + _extra_block_sha512 + size_offset], 0
- mov word [state + _lens_sha512 + 2*idx], 1
+
+ vmovdqa xmm0, [state + _lens_sha512]
+ XVPINSRW xmm0, xmm1, tmp, idx, 1, scale_x16
+ vmovdqa [state + _lens_sha512], xmm0
+
lea tmp, [lane_data + _outer_block_sha512]
mov job, [lane_data + _job_in_lane_sha512]
mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp
align 16
proc_extra_blocks:
mov DWORD(start_offset), [lane_data + _start_offset_sha512]
- mov [state + _lens_sha512 + 2*idx], WORD(extra_blocks)
+
+ vmovdqa xmm0, [state + _lens_sha512]
+ XVPINSRW xmm0, xmm1, tmp, idx, extra_blocks, scale_x16
+ vmovdqa [state + _lens_sha512], xmm0
+
lea tmp, [lane_data + _extra_block_sha512 + start_offset]
mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp ;; idx is index of shortest length message
mov dword [lane_data + _extra_blocks_sha512], 0
mov p, [job_rax + _auth_tag_output]
- ; below is the code for both SHA512 & SHA384. SHA512=32 bytes and SHA384=24 bytes
+%if (SHA_X_DIGEST_SIZE != 384)
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 32
+ jne copy_full_digest
+%else
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 24
+ jne copy_full_digest
+%endif
+ ;; copy 32 bytes for SHA512 / 24 bytes and SHA384
mov QWORD(tmp), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE]
mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE]
mov QWORD(tmp3), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE]
%if (SHA_X_DIGEST_SIZE != 384)
mov [p + 3*8], QWORD(tmp4)
%endif
+ jmp return
-return:
+copy_full_digest:
+ ;; copy 64 bytes for SHA512 / 48 bytes and SHA384
+ mov QWORD(tmp), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp3), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE]
+ bswap QWORD(tmp)
+ bswap QWORD(tmp2)
+ bswap QWORD(tmp3)
+ bswap QWORD(tmp4)
+ mov [p + 0*8], QWORD(tmp)
+ mov [p + 1*8], QWORD(tmp2)
+ mov [p + 2*8], QWORD(tmp3)
+ mov [p + 3*8], QWORD(tmp4)
+ mov QWORD(tmp), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 4*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 5*SHA512_DIGEST_ROW_SIZE]
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov QWORD(tmp3), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 6*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 7*SHA512_DIGEST_ROW_SIZE]
+%endif
+ bswap QWORD(tmp)
+ bswap QWORD(tmp2)
+%if (SHA_X_DIGEST_SIZE != 384)
+ bswap QWORD(tmp3)
+ bswap QWORD(tmp4)
+%endif
+ mov [p + 4*8], QWORD(tmp)
+ mov [p + 5*8], QWORD(tmp2)
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov [p + 6*8], QWORD(tmp3)
+ mov [p + 7*8], QWORD(tmp4)
+%endif
+
+return:
mov rbx, [rsp + _gpr_save + 8*0]
mov rbp, [rsp + _gpr_save + 8*1]
%ifndef LINUX