/*
 * Implement fast SHA-1 with AVX2 instructions. (x86_64)
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 * Ilya Albrekht <ilya.albrekht@intel.com>
 * Maxim Locktyukhin <maxim.locktyukhin@intel.com>
 * Ronen Zohar <ronen.zohar@intel.com>
 * Chandramouli Narayanan <mouli@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 * Neither the name of Intel Corporation nor the names of its
 * contributors may be used to endorse or promote products derived
 * from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

/*
 * SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
 *
 * This implementation is based on the previous SSSE3 release:
 * visit http://software.intel.com/en-us/articles/ and refer to
 * improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * Updates the 20-byte SHA-1 record in 'hash' for an even number
 * ('num_blocks') of consecutive 64-byte blocks.
 *
 * extern "C" void sha1_transform_avx2(
 *        int *hash, const char *input, size_t num_blocks );
 */
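
/*
 * A minimal sketch of a hypothetical C-side caller (illustrative only,
 * not the actual kernel glue code):
 *
 *        u32 digest[5] = { 0x67452301, 0xefcdab89, 0x98badcfe,
 *                          0x10325476, 0xc3d2e1f0 };  /- SHA-1 IV -/
 *        sha1_transform_avx2((int *)digest, data, nblocks);
 *
 * The routine reads 'nblocks' consecutive 64-byte blocks from 'data'
 * and updates the five 32-bit state words in place.
 */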
71 | ||
72 | #include <linux/linkage.h> | |
73 | ||
74 | #define CTX %rdi /* arg1 */ | |
75 | #define BUF %rsi /* arg2 */ | |
76 | #define CNT %rdx /* arg3 */ | |
77 | ||
78 | #define REG_A %ecx | |
79 | #define REG_B %esi | |
80 | #define REG_C %edi | |
81 | #define REG_D %eax | |
82 | #define REG_E %edx | |
83 | #define REG_TB %ebx | |
84 | #define REG_TA %r12d | |
85 | #define REG_RA %rcx | |
86 | #define REG_RB %rsi | |
87 | #define REG_RC %rdi | |
88 | #define REG_RD %rax | |
89 | #define REG_RE %rdx | |
90 | #define REG_RTA %r12 | |
91 | #define REG_RTB %rbx | |
92 | #define REG_T1 %ebp | |
93 | #define xmm_mov vmovups | |
94 | #define avx2_zeroupper vzeroupper | |
95 | #define RND_F1 1 | |
96 | #define RND_F2 2 | |
97 | #define RND_F3 3 | |
98 | ||
99 | .macro REGALLOC | |
100 | .set A, REG_A | |
101 | .set B, REG_B | |
102 | .set C, REG_C | |
103 | .set D, REG_D | |
104 | .set E, REG_E | |
105 | .set TB, REG_TB | |
106 | .set TA, REG_TA | |
107 | ||
108 | .set RA, REG_RA | |
109 | .set RB, REG_RB | |
110 | .set RC, REG_RC | |
111 | .set RD, REG_RD | |
112 | .set RE, REG_RE | |
113 | ||
114 | .set RTA, REG_RTA | |
115 | .set RTB, REG_RTB | |
116 | ||
117 | .set T1, REG_T1 | |
118 | .endm | |
119 | ||
120 | #define HASH_PTR %r9 | |
121 | #define BLOCKS_CTR %r8 | |
122 | #define BUFFER_PTR %r10 | |
123 | #define BUFFER_PTR2 %r13 | |
124 | ||
125 | #define PRECALC_BUF %r14 | |
126 | #define WK_BUF %r15 | |
127 | ||
128 | #define W_TMP %xmm0 | |
129 | #define WY_TMP %ymm0 | |
130 | #define WY_TMP2 %ymm9 | |
131 | ||
132 | # AVX2 variables | |
133 | #define WY0 %ymm3 | |
134 | #define WY4 %ymm5 | |
135 | #define WY08 %ymm7 | |
136 | #define WY12 %ymm8 | |
137 | #define WY16 %ymm12 | |
138 | #define WY20 %ymm13 | |
139 | #define WY24 %ymm14 | |
140 | #define WY28 %ymm15 | |
141 | ||
142 | #define YMM_SHUFB_BSWAP %ymm10 | |
143 | ||
144 | /* | |
145 | * Keep 2 iterations precalculated at a time: | |
146 | * - 80 DWORDs per iteration * 2 | |
147 | */ | |
148 | #define W_SIZE (80*2*2 +16) | |
149 | ||
150 | #define WK(t) ((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF) | |
151 | #define PRECALC_WK(t) ((t)*2*2)(PRECALC_BUF) | |
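
/*
 * Worked example of the WK() layout above (derived from the macro, not
 * separate documentation): the two interleaved blocks share one
 * 32-byte row per group of four rounds, 16 bytes per block, e.g.
 *
 *        WK(5)  = (5/4)*32 + (5%4)*4 + (5/80)*16  = 32 + 4 + 0  = 36(WK_BUF)
 *        WK(85) = (5/4)*32 + (5%4)*4 + (85/80)*16 = 32 + 4 + 16 = 52(WK_BUF)
 *
 * so round t of the first block and round t of the second block sit
 * 16 bytes apart.
 */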
152 | ||
153 | ||
154 | .macro UPDATE_HASH hash, val | |
155 | add \hash, \val | |
156 | mov \val, \hash | |
157 | .endm | |
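
/*
 * Note: UPDATE_HASH computes "\hash += \val" in memory (and leaves the
 * sum in \val): in AT&T syntax, "add \hash, \val" reads the memory
 * operand into the register, and "mov \val, \hash" writes it back.
 */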
158 | ||
159 | .macro PRECALC_RESET_WY | |
160 | .set WY_00, WY0 | |
161 | .set WY_04, WY4 | |
162 | .set WY_08, WY08 | |
163 | .set WY_12, WY12 | |
164 | .set WY_16, WY16 | |
165 | .set WY_20, WY20 | |
166 | .set WY_24, WY24 | |
167 | .set WY_28, WY28 | |
168 | .set WY_32, WY_00 | |
169 | .endm | |
170 | ||
171 | .macro PRECALC_ROTATE_WY | |
172 | /* Rotate macros */ | |
173 | .set WY_32, WY_28 | |
174 | .set WY_28, WY_24 | |
175 | .set WY_24, WY_20 | |
176 | .set WY_20, WY_16 | |
177 | .set WY_16, WY_12 | |
178 | .set WY_12, WY_08 | |
179 | .set WY_08, WY_04 | |
180 | .set WY_04, WY_00 | |
181 | .set WY_00, WY_32 | |
182 | ||
183 | /* Define register aliases */ | |
184 | .set WY, WY_00 | |
185 | .set WY_minus_04, WY_04 | |
186 | .set WY_minus_08, WY_08 | |
187 | .set WY_minus_12, WY_12 | |
188 | .set WY_minus_16, WY_16 | |
189 | .set WY_minus_20, WY_20 | |
190 | .set WY_minus_24, WY_24 | |
191 | .set WY_minus_28, WY_28 | |
192 | .set WY_minus_32, WY | |
193 | .endm | |
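
/*
 * Both macros above only rebind assembler symbols with .set; they emit
 * no instructions.  Rotating the WY_xx aliases once per 8 rounds walks
 * the eight ymm registers as a ring buffer of the most recent w[]
 * values, so no data ever has to be copied between registers.
 */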
194 | ||
195 | .macro PRECALC_00_15 | |
196 | .if (i == 0) # Initialize and rotate registers | |
197 | PRECALC_RESET_WY | |
198 | PRECALC_ROTATE_WY | |
199 | .endif | |
200 | ||
201 | /* message scheduling pre-compute for rounds 0-15 */ | |
202 | .if ((i & 7) == 0) | |
203 | /* | |
204 | * blended AVX2 and ALU instruction scheduling | |
205 | * 1 vector iteration per 8 rounds | |
206 | */ | |
207 | vmovdqu (i * 2)(BUFFER_PTR), W_TMP | |
208 | .elseif ((i & 7) == 1) | |
209 | vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\ | |
210 | WY_TMP, WY_TMP | |
211 | .elseif ((i & 7) == 2) | |
212 | vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY | |
213 | .elseif ((i & 7) == 4) | |
214 | vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP | |
215 | .elseif ((i & 7) == 7) | |
216 | vmovdqu WY_TMP, PRECALC_WK(i&~7) | |
217 | ||
218 | PRECALC_ROTATE_WY | |
219 | .endif | |
220 | .endm | |
221 | ||
222 | .macro PRECALC_16_31 | |
223 | /* | |
224 | * message scheduling pre-compute for rounds 16-31 | |
225 | * calculating last 32 w[i] values in 8 XMM registers | |
226 | * pre-calculate K+w[i] values and store to mem | |
227 | * for later load by ALU add instruction | |
228 | * | |
229 | * "brute force" vectorization for rounds 16-31 only | |
230 | * due to w[i]->w[i-3] dependency | |
231 | */ | |
232 | .if ((i & 7) == 0) | |
233 | /* | |
234 | * blended AVX2 and ALU instruction scheduling | |
235 | * 1 vector iteration per 8 rounds | |
236 | */ | |
237 | /* w[i-14] */ | |
238 | vpalignr $8, WY_minus_16, WY_minus_12, WY | |
239 | vpsrldq $4, WY_minus_04, WY_TMP /* w[i-3] */ | |
240 | .elseif ((i & 7) == 1) | |
241 | vpxor WY_minus_08, WY, WY | |
242 | vpxor WY_minus_16, WY_TMP, WY_TMP | |
243 | .elseif ((i & 7) == 2) | |
244 | vpxor WY_TMP, WY, WY | |
245 | vpslldq $12, WY, WY_TMP2 | |
246 | .elseif ((i & 7) == 3) | |
247 | vpslld $1, WY, WY_TMP | |
248 | vpsrld $31, WY, WY | |
249 | .elseif ((i & 7) == 4) | |
250 | vpor WY, WY_TMP, WY_TMP | |
251 | vpslld $2, WY_TMP2, WY | |
252 | .elseif ((i & 7) == 5) | |
253 | vpsrld $30, WY_TMP2, WY_TMP2 | |
254 | vpxor WY, WY_TMP, WY_TMP | |
255 | .elseif ((i & 7) == 7) | |
256 | vpxor WY_TMP2, WY_TMP, WY | |
257 | vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP | |
258 | vmovdqu WY_TMP, PRECALC_WK(i&~7) | |
259 | ||
260 | PRECALC_ROTATE_WY | |
261 | .endif | |
262 | .endm | |
263 | ||
264 | .macro PRECALC_32_79 | |
265 | /* | |
266 | * in SHA-1 specification: | |
267 | * w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1 | |
268 | * instead we do equal: | |
269 | * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2 | |
270 | * allows more efficient vectorization | |
271 | * since w[i]=>w[i-3] dependency is broken | |
272 | */ | |
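        /*
         * Why the rewrite is valid (a sketch, not part of the original
         * notes): XOR the recurrence with shifted/rotated copies of
         * itself.  Over XOR, "squaring" the relation
         *        w[i] ^ ((w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1) = 0
         * cancels all cross terms, doubling every index distance and
         * the rotate amount, which yields
         *        w[i] ^ ((w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2) = 0
         * for i >= 32.
         */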
273 | ||
274 | .if ((i & 7) == 0) | |
275 | /* | |
276 | * blended AVX2 and ALU instruction scheduling | |
277 | * 1 vector iteration per 8 rounds | |
278 | */ | |
279 | vpalignr $8, WY_minus_08, WY_minus_04, WY_TMP | |
280 | .elseif ((i & 7) == 1) | |
281 | /* W is W_minus_32 before xor */ | |
282 | vpxor WY_minus_28, WY, WY | |
283 | .elseif ((i & 7) == 2) | |
284 | vpxor WY_minus_16, WY_TMP, WY_TMP | |
285 | .elseif ((i & 7) == 3) | |
286 | vpxor WY_TMP, WY, WY | |
287 | .elseif ((i & 7) == 4) | |
288 | vpslld $2, WY, WY_TMP | |
289 | .elseif ((i & 7) == 5) | |
290 | vpsrld $30, WY, WY | |
291 | vpor WY, WY_TMP, WY | |
292 | .elseif ((i & 7) == 7) | |
293 | vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP | |
294 | vmovdqu WY_TMP, PRECALC_WK(i&~7) | |
295 | ||
296 | PRECALC_ROTATE_WY | |
297 | .endif | |
298 | .endm | |
299 | ||
.macro PRECALC r, s
        .set i, \r

        .if (i < 40)
                .set K_XMM, 32*0
        .elseif (i < 80)
                .set K_XMM, 32*1
        .elseif (i < 120)
                .set K_XMM, 32*2
        .else
                .set K_XMM, 32*3
        .endif

        .if (i < 32)
                PRECALC_00_15 \s
        .elseif (i < 64)
                PRECALC_16_31 \s
        .elseif (i < 160)
                PRECALC_32_79 \s
        .endif
.endm
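
/*
 * Note on the index ranges above: the precalc loop runs i = 0..159
 * because two 64-byte blocks are scheduled together, so every round
 * boundary appears doubled (i < 32 covers rounds 0-15 of both blocks,
 * and K_XMM switches to the next round constant every 40 steps of i,
 * i.e. every 20 rounds).
 */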
321 | ||
322 | .macro ROTATE_STATE | |
323 | .set T_REG, E | |
324 | .set E, D | |
325 | .set D, C | |
326 | .set C, B | |
327 | .set B, TB | |
328 | .set TB, A | |
329 | .set A, T_REG | |
330 | ||
331 | .set T_REG, RE | |
332 | .set RE, RD | |
333 | .set RD, RC | |
334 | .set RC, RB | |
335 | .set RB, RTB | |
336 | .set RTB, RA | |
337 | .set RA, T_REG | |
338 | .endm | |
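
/*
 * Like the WY rotation, ROTATE_STATE renames A..E (and their 64-bit
 * views RA..RE) at assembly time, so each round "rotates" the working
 * state without emitting a single mov instruction.
 */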
339 | ||
340 | /* Macro relies on saved ROUND_Fx */ | |
341 | ||
342 | .macro RND_FUN f, r | |
343 | .if (\f == RND_F1) | |
344 | ROUND_F1 \r | |
345 | .elseif (\f == RND_F2) | |
346 | ROUND_F2 \r | |
347 | .elseif (\f == RND_F3) | |
348 | ROUND_F3 \r | |
349 | .endif | |
350 | .endm | |
351 | ||
352 | .macro RR r | |
353 | .set round_id, (\r % 80) | |
354 | ||
355 | .if (round_id == 0) /* Precalculate F for first round */ | |
356 | .set ROUND_FUNC, RND_F1 | |
357 | mov B, TB | |
358 | ||
359 | rorx $(32-30), B, B /* b>>>2 */ | |
360 | andn D, TB, T1 | |
361 | and C, TB | |
362 | xor T1, TB | |
363 | .endif | |
364 | ||
365 | RND_FUN ROUND_FUNC, \r | |
366 | ROTATE_STATE | |
367 | ||
368 | .if (round_id == 18) | |
369 | .set ROUND_FUNC, RND_F2 | |
370 | .elseif (round_id == 38) | |
371 | .set ROUND_FUNC, RND_F3 | |
372 | .elseif (round_id == 58) | |
373 | .set ROUND_FUNC, RND_F2 | |
374 | .endif | |
375 | ||
376 | .set round_id, ( (\r+1) % 80) | |
377 | ||
378 | RND_FUN ROUND_FUNC, (\r+1) | |
379 | ROTATE_STATE | |
380 | .endm | |
381 | ||
.macro ROUND_F1 r
        add WK(\r), E

        andn C, A, T1       /* ~b&d */
        lea (RE,RTB), E     /* Add F from the previous round */

        rorx $(32-5), A, TA  /* T2 = A >>> 5 */
        rorx $(32-30), A, TB /* b>>>2 for next round */

        PRECALC (\r)        /* msg scheduling for next 2 blocks */

        /*
         * Calculate F for the next round:
         * (b & c) ^ (~b & d), with the andn result already in T1
         */
        and B, A            /* b&c */
        xor T1, A           /* F1 = (b&c) ^ (~b&d) */

        lea (RE,RTA), E     /* E += A >>> 5 */
.endm
402 | ||
403 | .macro ROUND_F2 r | |
404 | add WK(\r), E | |
405 | lea (RE,RTB), E /* Add F from the previous round */ | |
406 | ||
407 | /* Calculate F for the next round */ | |
408 | rorx $(32-5), A, TA /* T2 = A >>> 5 */ | |
409 | .if ((round_id) < 79) | |
410 | rorx $(32-30), A, TB /* b>>>2 for next round */ | |
411 | .endif | |
412 | PRECALC (\r) /* msg scheduling for next 2 blocks */ | |
413 | ||
414 | .if ((round_id) < 79) | |
415 | xor B, A | |
416 | .endif | |
417 | ||
418 | add TA, E /* E += A >>> 5 */ | |
419 | ||
420 | .if ((round_id) < 79) | |
421 | xor C, A | |
422 | .endif | |
423 | .endm | |
424 | ||
425 | .macro ROUND_F3 r | |
426 | add WK(\r), E | |
427 | PRECALC (\r) /* msg scheduling for next 2 blocks */ | |
428 | ||
429 | lea (RE,RTB), E /* Add F from the previous round */ | |
430 | ||
431 | mov B, T1 | |
432 | or A, T1 | |
433 | ||
434 | rorx $(32-5), A, TA /* T2 = A >>> 5 */ | |
435 | rorx $(32-30), A, TB /* b>>>2 for next round */ | |
436 | ||
437 | /* Calculate F for the next round | |
438 | * (b and c) or (d and (b or c)) | |
439 | */ | |
440 | and C, T1 | |
441 | and B, A | |
442 | or T1, A | |
443 | ||
444 | add TA, E /* E += A >>> 5 */ | |
445 | ||
446 | .endm | |
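
/*
 * Side note (not from the original comments): F2 above is the SHA-1
 * parity function b ^ c ^ d, and F3 is the majority function
 * MAJ(b,c,d) = (b & c) | (d & (b | c)), exactly as in the SHA-1
 * specification.
 */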
447 | ||
448 | /* Add constant only if (%2 > %3) condition met (uses RTA as temp) | |
449 | * %1 + %2 >= %3 ? %4 : 0 | |
450 | */ | |
451 | .macro ADD_IF_GE a, b, c, d | |
452 | mov \a, RTA | |
453 | add $\d, RTA | |
454 | cmp $\c, \b | |
455 | cmovge RTA, \a | |
456 | .endm | |
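
/*
 * Example (taken from its use below): "ADD_IF_GE BUFFER_PTR,
 * BLOCKS_CTR, 4, 128" advances BUFFER_PTR by 128 bytes (two 64-byte
 * blocks) only while at least 4 blocks remain, so the pointer never
 * runs past the end of the input on the final iterations.
 */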
457 | ||
458 | /* | |
459 | * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining | |
460 | */ | |
461 | .macro SHA1_PIPELINED_MAIN_BODY | |
462 | ||
463 | REGALLOC | |
464 | ||
465 | mov (HASH_PTR), A | |
466 | mov 4(HASH_PTR), B | |
467 | mov 8(HASH_PTR), C | |
468 | mov 12(HASH_PTR), D | |
469 | mov 16(HASH_PTR), E | |
470 | ||
471 | mov %rsp, PRECALC_BUF | |
472 | lea (2*4*80+32)(%rsp), WK_BUF | |
473 | ||
474 | # Precalc WK for first 2 blocks | |
475 | ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64 | |
476 | .set i, 0 | |
477 | .rept 160 | |
478 | PRECALC i | |
479 | .set i, i + 1 | |
480 | .endr | |
481 | ||
482 | /* Go to next block if needed */ | |
483 | ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128 | |
484 | ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128 | |
485 | xchg WK_BUF, PRECALC_BUF | |
486 | ||
487 | .align 32 | |
_loop:
        /*
         * code loops through more than one block;
         * BLOCKS_CTR reaching zero (tested below) signals the last block
         */
        test BLOCKS_CTR, BLOCKS_CTR
        jnz _begin
        .align 32
        jmp _end
        .align 32
_begin:
500 | ||
501 | /* | |
502 | * Do first block | |
503 | * rounds: 0,2,4,6,8 | |
504 | */ | |
505 | .set j, 0 | |
506 | .rept 5 | |
507 | RR j | |
508 | .set j, j+2 | |
509 | .endr | |
510 | ||
511 | jmp _loop0 | |
512 | _loop0: | |
513 | ||
514 | /* | |
515 | * rounds: | |
516 | * 10,12,14,16,18 | |
517 | * 20,22,24,26,28 | |
518 | * 30,32,34,36,38 | |
519 | * 40,42,44,46,48 | |
520 | * 50,52,54,56,58 | |
521 | */ | |
522 | .rept 25 | |
523 | RR j | |
524 | .set j, j+2 | |
525 | .endr | |
526 | ||
527 | /* Update Counter */ | |
528 | sub $1, BLOCKS_CTR | |
529 | /* Move to the next block only if needed*/ | |
530 | ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128 | |
531 | /* | |
532 | * rounds | |
533 | * 60,62,64,66,68 | |
534 | * 70,72,74,76,78 | |
535 | */ | |
536 | .rept 10 | |
537 | RR j | |
538 | .set j, j+2 | |
539 | .endr | |
540 | ||
541 | UPDATE_HASH (HASH_PTR), A | |
542 | UPDATE_HASH 4(HASH_PTR), TB | |
543 | UPDATE_HASH 8(HASH_PTR), C | |
544 | UPDATE_HASH 12(HASH_PTR), D | |
545 | UPDATE_HASH 16(HASH_PTR), E | |
546 | ||
547 | test BLOCKS_CTR, BLOCKS_CTR | |
548 | jz _loop | |
549 | ||
550 | mov TB, B | |
551 | ||
552 | /* Process second block */ | |
553 | /* | |
554 | * rounds | |
555 | * 0+80, 2+80, 4+80, 6+80, 8+80 | |
556 | * 10+80,12+80,14+80,16+80,18+80 | |
557 | */ | |
558 | ||
559 | .set j, 0 | |
560 | .rept 10 | |
561 | RR j+80 | |
562 | .set j, j+2 | |
563 | .endr | |
564 | ||
565 | jmp _loop1 | |
566 | _loop1: | |
567 | /* | |
568 | * rounds | |
569 | * 20+80,22+80,24+80,26+80,28+80 | |
570 | * 30+80,32+80,34+80,36+80,38+80 | |
571 | */ | |
572 | .rept 10 | |
573 | RR j+80 | |
574 | .set j, j+2 | |
575 | .endr | |
576 | ||
577 | jmp _loop2 | |
578 | _loop2: | |
579 | ||
580 | /* | |
581 | * rounds | |
582 | * 40+80,42+80,44+80,46+80,48+80 | |
583 | * 50+80,52+80,54+80,56+80,58+80 | |
584 | */ | |
585 | .rept 10 | |
586 | RR j+80 | |
587 | .set j, j+2 | |
588 | .endr | |
589 | ||
590 | /* update counter */ | |
591 | sub $1, BLOCKS_CTR | |
592 | /* Move to the next block only if needed*/ | |
593 | ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128 | |
594 | ||
595 | jmp _loop3 | |
596 | _loop3: | |
597 | ||
598 | /* | |
599 | * rounds | |
600 | * 60+80,62+80,64+80,66+80,68+80 | |
601 | * 70+80,72+80,74+80,76+80,78+80 | |
602 | */ | |
603 | .rept 10 | |
604 | RR j+80 | |
605 | .set j, j+2 | |
606 | .endr | |
607 | ||
608 | UPDATE_HASH (HASH_PTR), A | |
609 | UPDATE_HASH 4(HASH_PTR), TB | |
610 | UPDATE_HASH 8(HASH_PTR), C | |
611 | UPDATE_HASH 12(HASH_PTR), D | |
612 | UPDATE_HASH 16(HASH_PTR), E | |
613 | ||
614 | /* Reset state for AVX2 reg permutation */ | |
615 | mov A, TA | |
616 | mov TB, A | |
617 | mov C, TB | |
618 | mov E, C | |
619 | mov D, B | |
620 | mov TA, D | |
621 | ||
622 | REGALLOC | |
623 | ||
624 | xchg WK_BUF, PRECALC_BUF | |
625 | ||
626 | jmp _loop | |
627 | ||
628 | .align 32 | |
629 | _end: | |
630 | ||
631 | .endm | |
/*
 * macro implements SHA-1 function's body for several 64-byte blocks
 * param: function's name
 */
.macro SHA1_VECTOR_ASM name
ENTRY(\name)

        push %rbx
        push %rbp
        push %r12
        push %r13
        push %r14
        push %r15

        RESERVE_STACK = (W_SIZE*4 + 8 + 24)

        /* Align stack */
        mov %rsp, %rbx
        and $~(0x20-1), %rsp
        push %rbx
        sub $RESERVE_STACK, %rsp

        avx2_zeroupper

        /* Setup initial values */
        mov CTX, HASH_PTR
        mov BUF, BUFFER_PTR

        mov BUF, BUFFER_PTR2
        mov CNT, BLOCKS_CTR

        xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP

        SHA1_PIPELINED_MAIN_BODY

        avx2_zeroupper

        add $RESERVE_STACK, %rsp
        pop %rsp

        pop %r15
        pop %r14
        pop %r13
        pop %r12
        pop %rbp
        pop %rbx

        ret

ENDPROC(\name)
.endm
683 | ||
684 | .section .rodata | |
685 | ||
686 | #define K1 0x5a827999 | |
687 | #define K2 0x6ed9eba1 | |
688 | #define K3 0x8f1bbcdc | |
689 | #define K4 0xca62c1d6 | |
690 | ||
691 | .align 128 | |
692 | K_XMM_AR: | |
693 | .long K1, K1, K1, K1 | |
694 | .long K1, K1, K1, K1 | |
695 | .long K2, K2, K2, K2 | |
696 | .long K2, K2, K2, K2 | |
697 | .long K3, K3, K3, K3 | |
698 | .long K3, K3, K3, K3 | |
699 | .long K4, K4, K4, K4 | |
700 | .long K4, K4, K4, K4 | |
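
/*
 * Each round constant is replicated eight times so a 32-byte row fills
 * a full ymm register (four lanes per interleaved block); K_XMM in the
 * PRECALC macros selects the row as a 32-byte offset into this table.
 */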
701 | ||
702 | BSWAP_SHUFB_CTL: | |
703 | .long 0x00010203 | |
704 | .long 0x04050607 | |
705 | .long 0x08090a0b | |
706 | .long 0x0c0d0e0f | |
707 | .long 0x00010203 | |
708 | .long 0x04050607 | |
709 | .long 0x08090a0b | |
710 | .long 0x0c0d0e0f | |
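
/*
 * vpshufb control mask: each dword selects its own bytes in reverse
 * order (e.g. 0x00010203 picks bytes 3,2,1,0), converting the
 * big-endian message words to host byte order; the pattern is
 * duplicated across both 128-bit lanes.
 */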
.text

SHA1_VECTOR_ASM sha1_transform_avx2