; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
%include "sha256_mb_mgr_datastruct.asm"
%include "reg_sizes.asm"
[bits 64]
default rel
section .text
;; code to compute quad SHA256 using AVX
;; Logic designed/laid out by JDG
; output looks like: {t0 r1 r0 r3}
; t0 = {d0 c0 b0 a0}
; r1 = {d1 c1 b1 a1}
; r0 = {d2 c2 b2 a2}
; r3 = {d3 c3 b3 a3}
;
%macro TRANSPOSE 6
%define %%r0 %1
%define %%r1 %2
%define %%r2 %3
%define %%r3 %4
%define %%t0 %5
%define %%t1 %6
vshufps %%r0, %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2}
vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0}
%endmacro
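;
; A usage sketch (register names here are illustrative, not the exact
; call sites in this file): with four message rows loaded as
;   TT0 = {a3 a2 a1 a0}    TT1 = {b3 b2 b1 b0}
;   TT2 = {c3 c2 c1 c0}    TT3 = {d3 d2 d1 d0}
; the invocation
;   TRANSPOSE TT0, TT1, TT2, TT3, TT4, TT5
; leaves each destination register holding the same word index from all
; four lanes, matching the output layout commented above the macro.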
%define TABLE K256_4_MB
; ALIGNMENT makes FRAMESZ + pushes an odd multiple of 8
%define FRAMESZ (DATA + DIGEST_SIZE + ALIGNMENT)
%define _DIGEST (DATA)
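;
; Worked example, assuming the sizes defined in the full source
; (SZ4 = 16, DATA = 16*SZ4 = 256, DIGEST_SIZE = 8*SZ4 = 128,
; ALIGNMENT = 1*8):
;   FRAMESZ = 256 + 128 + 8 = 392 = 49*8, an odd multiple of 8.
; Since the caller's CALL pushed 8 bytes, "sub rsp, FRAMESZ" below
; restores 16-byte stack alignment for the aligned vmovdqa spills.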
%define VMOVPS vmovups
%define inp0 r8
%define inp1 r9
%define inp2 r10
%define inp3 r11
%ifidn __OUTPUT_FORMAT__, elf64
; Linux definitions
%define arg1 rdi
%define arg2 rsi
%define ROUND rbx
%define TBL r12
;; void sha256_mb_x4_avx(SHA256_MB_ARGS_X8 *args, uint64_t len);
;; arg 1 : arg1 : pointer to args (only 4 of the 8 lanes used)
;; arg 2 : arg2 : size of data in blocks (assumed >= 1)
;;
;; Clobbers registers: arg2, rax, rbx, r8-r12, xmm0-xmm15
;;
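;; Call sketch (illustrative; "args" is a hypothetical buffer holding a
;; populated SHA256_MB_ARGS_X8 with lanes 0-3 valid):
;;	lea	rdi, [args]		; arg1: pointer to args
;;	mov	rsi, 1			; arg2: number of 64-byte blocks, >= 1
;;	call	sha256_mb_x4_avx
;;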
mk_global sha256_mb_x4_avx, function, internal
align 32
sha256_mb_x4_avx:
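	; endbranch (a macro from reg_sizes.asm) is expected to emit an
	; endbr64 CET landing pad when built with CET enabled, and nothing
	; otherwise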
	endbranch
sub rsp, FRAMESZ
;; Initialize digests
vmovdqa h,[arg1+7*SZ4]
lea TBL,[TABLE]
;; transpose input onto stack
mov inp0,[arg1 + _data_ptr + 0*8]
mov inp1,[arg1 + _data_ptr + 1*8]
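	; byte-swap each 32-bit word to big-endian; TMP is assumed to hold
	; the byte-flip shuffle mask (PSHUFFLE_BYTE_FLIP_MASK in the full
	; source), loaded in the elided code above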
vpshufb TT1, TT1, TMP
vpshufb TT2, TT2, TMP
vpshufb TT3, TT3, TMP
	ROUND_00_15	TT0,(i*4+0)
	ROUND_00_15	TT1,(i*4+1)
	ROUND_00_15	TT2,(i*4+2)
	ROUND_00_15	TT3,(i*4+3)
%assign i (i+1)
%endrep
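	; 4*4*4 = 64: sixteen 4-byte words were consumed from each lane,
	; i.e. one 64-byte SHA-256 block, so advance the input offset by 64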
add IDX, 4*4*4
%assign i (i*4)
jmp Lrounds_16_xx
vmovdqa [arg1+5*SZ4],f
vmovdqa [arg1+6*SZ4],g
vmovdqa [arg1+7*SZ4],h
; update input pointers
add inp0, IDX
mov [arg1 + _data_ptr + 0*8], inp0
;;;;;;;;;;;;;;;;
;; Postamble
add rsp, FRAMESZ
ret