; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
-; modification, are permitted provided that the following conditions
+; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
%include "sha256_mb_mgr_datastruct.asm"
%include "reg_sizes.asm"
+[bits 64]
default rel
+section .text
;; code to compute oct SHA256 using SSE-256 / AVX2
;; outer calling routine takes care of save and restore of XMM registers
;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15
;; Windows clobbers: rax rbx rdx rsi rdi r9 r10 r11 r12 r13 r14 r15
-;; Windows preserves: rcx rbp r8
+;; Windows preserves: rcx rbp r8
;;
;; Linux clobbers: rax rbx rcx rdx rsi r9 r10 r11 r12 r13 r14 r15
-;; Linux preserves: rdi rbp r8
+;; Linux preserves: rdi rbp r8
;;
;; clobbers ymm0-15
-%ifidn __OUTPUT_FORMAT__, elf64
+%ifidn __OUTPUT_FORMAT__, elf64
; Linux definitions
; arg1/arg2 name the registers that carry the first two integer arguments.
%define arg1 rdi
%define arg2 rsi
; IDX is added to the per-lane input pointers as a byte offset; on entry it
; also briefly carries the caller's rsp until that value is stored in the frame.
%define IDX rax
%define ROUND rbx
; TBL receives the address of the K256_8_MB table.
-%define TBL reg3
+%define TBL reg3
; inpN hold the per-lane message pointers read from the args structure.
%define inp0 r9
%define inp1 r10
%define MAX_SHA256_LANES 8
%define NUM_SHA256_DIGEST_WORDS 8
; One digest row = one digest word for every lane.
%define SHA256_DIGEST_ROW_SIZE (MAX_SHA256_LANES * SHA256_DIGEST_WORD_SIZE)
-%define SHA256_DIGEST_SIZE (SHA256_DIGEST_ROW_SIZE * NUM_SHA256_DIGEST_WORDS)
-%define SHA256_BLK_SZ 64 ; in bytes
; Define stack usage
; Offsets of the stack_frame fields (the structure itself is declared outside this excerpt).
%define _DIGEST stack_frame.digest
%define _YTMP stack_frame.ytmp
%define _RSP_SAVE stack_frame.rsp
-
+
; YTMPn: n-th SZ8-sized temporary slot inside the ytmp area of the frame,
; addressed relative to rsp (SZ8 is defined elsewhere — presumably the size of
; one 8-lane vector; confirm).
%define YTMP0 rsp + _YTMP + 0*SZ8
%define YTMP1 rsp + _YTMP + 1*SZ8
%define YTMP2 rsp + _YTMP + 2*SZ8
%endm
-;; void sha256_x8_avx2(SHA256_ARGS *args, uint64_t bytes);
+;; void sha256_x8_avx2(SHA256_ARGS *args, uint64_t bytes);
;; arg 1 : STATE : pointer to SHA256_ARGS (pre-transposed digest rows and per-lane data pointers)
;; arg 2 : INP_SIZE : size of input in blocks
-global sha256_mb_x8_avx2:function internal
+mk_global sha256_mb_x8_avx2, function, internal
align 16
sha256_mb_x8_avx2:
+ endbranch
; Entry point of the 8-lane (x8) SHA-256 multi-buffer routine.
; general registers preserved in outer calling routine
; outer calling routine saves all the XMM registers
-
+
; save rsp, allocate 32-byte aligned for local variables
; IDX temporarily carries the caller's rsp so it can be stored in the new
; frame; the postamble reloads rsp from that slot.
; NOTE(review): no explicit alignment of rsp (e.g. an AND mask) is visible
; between the SUB and the aligned vmovdqa stores to YTMPn below — confirm the
; frame is actually 32-byte aligned as the comment above promises.
mov IDX, rsp
sub rsp, FRAMESZ
mov [rsp + _RSP_SAVE], IDX
- ;; Load the pre-transposed incoming digest.
+ ;; Load the pre-transposed incoming digest.
;; Each load reads one digest row (SHA256_DIGEST_ROW_SIZE bytes apart) from STATE.
vmovdqu a,[STATE + 0*SHA256_DIGEST_ROW_SIZE]
vmovdqu b,[STATE + 1*SHA256_DIGEST_ROW_SIZE]
vmovdqu c,[STATE + 2*SHA256_DIGEST_ROW_SIZE]
vmovdqu h,[STATE + 7*SHA256_DIGEST_ROW_SIZE]
;; TBL = address of the K256_8_MB table
lea TBL,[K256_8_MB]
-
+
;; load the address of each of the 8 message lanes
;; getting ready to transpose input onto stack
mov inp0,[STATE + _args_data_ptr + 0*PTR_SZ]
;; park TT5..TT7 in the stack temporaries; they are reloaded into TT1..TT3 below
vmovdqa [YTMP1], TT5
vmovdqa [YTMP2], TT6
vmovdqa [YTMP3], TT7
;; Rounds i*8+0 .. i*8+7 (ROUND_00_15 is a macro defined elsewhere).
;; After each of the first four rounds the TTn register just used is
;; reloaded from its stack slot so it can feed rounds i*8+4 .. i*8+7.
- ROUND_00_15 TT0,(i*8+0)
+ ROUND_00_15 TT0,(i*8+0)
vmovdqa TT0, [YTMP0]
- ROUND_00_15 TT1,(i*8+1)
+ ROUND_00_15 TT1,(i*8+1)
vmovdqa TT1, [YTMP1]
- ROUND_00_15 TT2,(i*8+2)
+ ROUND_00_15 TT2,(i*8+2)
vmovdqa TT2, [YTMP2]
- ROUND_00_15 TT3,(i*8+3)
+ ROUND_00_15 TT3,(i*8+3)
vmovdqa TT3, [YTMP3]
- ROUND_00_15 TT0,(i*8+4)
- ROUND_00_15 TT1,(i*8+5)
- ROUND_00_15 TT2,(i*8+6)
- ROUND_00_15 TT3,(i*8+7)
+ ROUND_00_15 TT0,(i*8+4)
+ ROUND_00_15 TT1,(i*8+5)
+ ROUND_00_15 TT2,(i*8+6)
+ ROUND_00_15 TT3,(i*8+7)
%assign i (i+1)
%endrep
;; advance the input byte offset by 4*4*4 = 64
add IDX, 4*4*4
-
+
;; i counted groups of eight rounds above; scale it by 8
%assign i (i*8)
jmp Lrounds_16_xx
;; advance the lane 6 and lane 7 data pointers by IDX and write them back to the args structure
add inp6, IDX
mov [STATE + _args_data_ptr + 6*8], inp6
add inp7, IDX
- mov [STATE + _args_data_ptr + 7*8], inp7
+ mov [STATE + _args_data_ptr + 7*8], inp7
;;;;;;;;;;;;;;;;
;; Postamble
;; restore the caller's rsp saved on entry
mov rsp, [rsp + _RSP_SAVE]
- ret
+ ret
section .data
align 64
; Row of replicated constants: the 32-bit value 0xc67178f2 fills every dword
; slot of both qwords.
; NOTE(review): the label and any preceding rows of this table are not visible
; in this excerpt.
dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
; Byte-index pattern 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 (repeated for a
; second 16-byte half): used as a byte-shuffle control it reverses the byte
; order inside each 32-bit word. Presumably consumed by vpshufb when loading
; message words — the use site is outside this excerpt.
PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
dq 0x0405060700010203, 0x0c0d0e0f08090a0b
-
-align 64
-global K256_MB
-K256_MB:
- dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
- dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
- dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
- dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
- dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
- dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
- dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
- dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
- dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
- dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
- dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
- dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
- dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
- dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
- dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
- dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2