;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;; WRT_OPT is a suffix placed inside the address operand of the `lea`
;;; instructions in this file that load implementation addresses.
;;; When assembling for elf64 it expands to `wrt ..plt`; for every other
;;; output format it expands to nothing.
%ifidn __OUTPUT_FORMAT__, elf64
 %define WRT_OPT		wrt ..plt
%else
 %define WRT_OPT
%endif

%include "reg_sizes.asm"

;;; Per-target configuration.
;;;   def_wrd     data directive for one pointer-sized dispatch slot
;;;   wrd_sz      operand size used for the indirect jump through a slot
;;;   arg1        register that carries the selected implementation address
;;;   arg2..arg5  the a/b/c/d registers (written by cpuid/xgetbv in the
;;;               dispatch-init routines, so they are saved around them)
%ifidn __OUTPUT_FORMAT__, elf32

;;; elf32 output: 32-bit code, 4-byte dispatch slots.
[bits 32]

 %define def_wrd		dd
 %define wrd_sz  	dword
 %define arg1		esi
 %define arg2		eax
 %define arg3		ebx
 %define arg4		ecx
 %define arg5		edx

%else

;;; Any other output format: 64-bit code, 8-byte dispatch slots, and
;;; RIP-relative addressing by default.
 default rel
 [bits 64]

 %define def_wrd 	dq
 %define wrd_sz  	qword
 %define arg1		rsi
 %define arg2		rax
 %define arg3		rbx
 %define arg4		rcx
 %define arg5		rdx


;;; These implementations are referenced only by the non-elf32 branches of
;;; the dispatch-init routines in this file, so they are declared only here.
 extern ec_encode_data_update_sse
 extern ec_encode_data_update_avx
 extern ec_encode_data_update_avx2
 extern gf_vect_mul_sse
 extern gf_vect_mul_avx

 extern gf_vect_mad_sse
 extern gf_vect_mad_avx
 extern gf_vect_mad_avx2
%endif

extern gf_vect_mul_base
extern ec_encode_data_base
extern ec_encode_data_update_base
extern gf_vect_dot_prod_base
extern gf_vect_mad_base

extern gf_vect_dot_prod_sse
extern gf_vect_dot_prod_avx
extern gf_vect_dot_prod_avx2
extern ec_encode_data_sse
extern ec_encode_data_avx
extern ec_encode_data_avx2


section .data
;;; One pointer-sized slot per multibinary function.
;;; Each slot initially holds the address of the matching *_mbinit stub, so
;;; the first call through a slot runs that function's *_dispatch_init
;;; routine. The routine overwrites the slot with the address of the selected
;;; implementation, and later calls jump straight to it.

ec_encode_data_dispatched:		def_wrd	ec_encode_data_mbinit

gf_vect_mul_dispatched:			def_wrd	gf_vect_mul_mbinit

gf_vect_dot_prod_dispatched:		def_wrd	gf_vect_dot_prod_mbinit

ec_encode_data_update_dispatched:	def_wrd	ec_encode_data_update_mbinit

gf_vect_mad_dispatched:			def_wrd	gf_vect_mad_mbinit

section .text
;;;;
; ec_encode_data multibinary function
;
; ec_encode_data               exported entry; jumps through
;                              ec_encode_data_dispatched.
; ec_encode_data_mbinit        initial content of that slot; runs the
;                              dispatch-init routine and then falls through
;                              into ec_encode_data.
; ec_encode_data_dispatch_init picks base / sse / avx / avx2 from CPUID and
;                              XGETBV and stores the choice in the slot.
;                              Every register it writes (arg1..arg5) is saved
;                              on entry and restored before returning.
;;;;
global ec_encode_data:function
ec_encode_data_mbinit:
	call	ec_encode_data_dispatch_init
	; no ret here: execution continues into ec_encode_data below

ec_encode_data:
	jmp	wrd_sz [ec_encode_data_dispatched]

ec_encode_data_dispatch_init:
	push	arg1
	push	arg2
	push	arg3
	push	arg4
	push	arg5

	lea	arg1, [ec_encode_data_base WRT_OPT]	; default choice

	;; CPUID leaf 1: feature bits in ecx
	mov	eax, 1
	cpuid
	test	ecx, FLAG_CPUID1_ECX_SSE4_1
	jz	.check_avx
	lea	arg1, [ec_encode_data_sse WRT_OPT]

.check_avx:
	;; AVX is considered only when both the AVX and OSXSAVE bits are set
	and	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
	cmp	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
	jne	.store
	lea	arg1, [ec_encode_data_avx WRT_OPT]

	;; CPUID leaf 7, sub-leaf 0: AVX2 bit in ebx
	mov	eax, 7
	xor	ecx, ecx
	cpuid
	test	ebx, FLAG_CPUID1_EBX_AVX2
	jz	.check_os
	lea	arg1, [ec_encode_data_avx2 WRT_OPT]

.check_os:
	;; XGETBV(0): are xmm and ymm state both enabled?
	xor	ecx, ecx
	xgetbv
	and	eax, FLAG_XGETBV_EAX_XMM_YMM
	cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
	je	.store
	;; not enabled: drop back from avx/avx2 to the sse version
	lea	arg1, [ec_encode_data_sse WRT_OPT]

.store:
	mov	[ec_encode_data_dispatched], arg1
	pop	arg5
	pop	arg4
	pop	arg3
	pop	arg2
	pop	arg1
	ret

;;;;
; gf_vect_mul multibinary function
;
; gf_vect_mul                  exported entry; jumps through
;                              gf_vect_mul_dispatched.
; gf_vect_mul_mbinit           initial content of that slot; runs the
;                              dispatch-init routine and then falls through
;                              into gf_vect_mul.
; gf_vect_mul_dispatch_init    elf32 output: always selects the base version.
;                              Other outputs: selects base / sse / avx from
;                              CPUID and XGETBV. Every register it writes is
;                              saved on entry and restored before returning.
;;;;
global gf_vect_mul:function
gf_vect_mul_mbinit:
	call	gf_vect_mul_dispatch_init
	; no ret here: execution continues into gf_vect_mul below

gf_vect_mul:
	jmp	wrd_sz [gf_vect_mul_dispatched]

gf_vect_mul_dispatch_init:
	push	arg1
%ifnidn __OUTPUT_FORMAT__, elf32	;; 64-bit path
	push	arg2
	push	arg3
	push	arg4
	push	arg5

	lea	arg1, [gf_vect_mul_base WRT_OPT]	; default choice

	;; CPUID leaf 1: without the SSE4.2 bit, stay on the base version
	mov	eax, 1
	cpuid
	test	ecx, FLAG_CPUID1_ECX_SSE4_2
	jz	.restore
	lea	arg1, [gf_vect_mul_sse WRT_OPT]

	;; AVX is considered only when both the OSXSAVE and AVX bits are set
	and	ecx, (FLAG_CPUID1_ECX_OSXSAVE | FLAG_CPUID1_ECX_AVX)
	cmp	ecx, (FLAG_CPUID1_ECX_OSXSAVE | FLAG_CPUID1_ECX_AVX)
	jne	.restore

	;; XGETBV(0): are xmm and ymm state both enabled?
	xor	ecx, ecx
	xgetbv
	and	eax, FLAG_XGETBV_EAX_XMM_YMM
	cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
	jne	.restore
	lea	arg1, [gf_vect_mul_avx WRT_OPT]

.restore:
	pop	arg5
	pop	arg4
	pop	arg3
	pop	arg2
%else					;; elf32 path: base version only
	lea	arg1, [gf_vect_mul_base]
%endif
	mov	[gf_vect_mul_dispatched], arg1
	pop	arg1
	ret

;;;;
; ec_encode_data_update multibinary function
;
; ec_encode_data_update               exported entry; jumps through
;                                     ec_encode_data_update_dispatched.
; ec_encode_data_update_mbinit        initial content of that slot; runs the
;                                     dispatch-init routine and then falls
;                                     through into ec_encode_data_update.
; ec_encode_data_update_dispatch_init elf32 output: always selects the base
;                                     version. Other outputs: selects
;                                     base / sse / avx / avx2 from CPUID and
;                                     XGETBV. Every register it writes is
;                                     saved on entry and restored before
;                                     returning.
;
; The selected address is held in arg1 throughout, and the cpuid/xgetbv
; registers are named through arg2..arg5, so this routine does not rely on
; arg1 being one particular hard-coded register.
;;;;
global ec_encode_data_update:function
ec_encode_data_update_mbinit:
	call	ec_encode_data_update_dispatch_init
	; no ret here: execution continues into ec_encode_data_update below

ec_encode_data_update:
	jmp	wrd_sz [ec_encode_data_update_dispatched]

ec_encode_data_update_dispatch_init:
	push    arg1
%ifidn __OUTPUT_FORMAT__, elf32		;; 32-bit check
	lea     arg1, [ec_encode_data_update_base]
%else
	push    arg2
	push    arg3
	push    arg4
	push    arg5
	lea     arg1, [ec_encode_data_update_base WRT_OPT] ; Default

	;; CPUID leaf 1: use the sse version when the SSE4.1 bit is set
	mov     eax, 1
	cpuid
	lea     arg3, [ec_encode_data_update_sse WRT_OPT]
	test    ecx, FLAG_CPUID1_ECX_SSE4_1
	cmovne  arg1, arg3

	;; AVX is considered only when both the AVX and OSXSAVE bits are set.
	;; lea does not modify flags, so the cmp result is still valid at jne.
	and	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
	cmp	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
	lea	arg3, [ec_encode_data_update_avx WRT_OPT]

	jne	.done
	mov	arg1, arg3

	;; Try for AVX2 (CPUID leaf 7, sub-leaf 0, bit in ebx)
	xor	ecx, ecx
	mov	eax, 7
	cpuid
	test	ebx, FLAG_CPUID1_EBX_AVX2
	lea     arg3, [ec_encode_data_update_avx2 WRT_OPT]
	cmovne	arg1, arg3

	;; Does it have xmm and ymm support
	xor	ecx, ecx
	xgetbv
	and	eax, FLAG_XGETBV_EAX_XMM_YMM
	cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
	je	.done
	;; not enabled: drop back from avx/avx2 to the sse version
	lea     arg1, [ec_encode_data_update_sse WRT_OPT]

.done:
	pop     arg5
	pop     arg4
	pop     arg3
	pop     arg2
%endif			;; END 32-bit check
	mov     [ec_encode_data_update_dispatched], arg1
	pop     arg1
	ret

;;;;
; gf_vect_dot_prod multibinary function
;
; gf_vect_dot_prod               exported entry; jumps through
;                                gf_vect_dot_prod_dispatched.
; gf_vect_dot_prod_mbinit        initial content of that slot; runs the
;                                dispatch-init routine and then falls through
;                                into gf_vect_dot_prod.
; gf_vect_dot_prod_dispatch_init picks base / sse / avx / avx2 from CPUID and
;                                XGETBV and stores the choice in the slot.
;                                Every register it writes (arg1..arg5) is
;                                saved on entry and restored before returning.
;;;;
global gf_vect_dot_prod:function
gf_vect_dot_prod_mbinit:
	call	gf_vect_dot_prod_dispatch_init
	; no ret here: execution continues into gf_vect_dot_prod below

gf_vect_dot_prod:
	jmp	wrd_sz [gf_vect_dot_prod_dispatched]

gf_vect_dot_prod_dispatch_init:
	push	arg1
	push	arg2
	push	arg3
	push	arg4
	push	arg5

	lea	arg1, [gf_vect_dot_prod_base WRT_OPT]	; default choice

	;; CPUID leaf 1: feature bits in ecx
	mov	eax, 1
	cpuid
	test	ecx, FLAG_CPUID1_ECX_SSE4_1
	jz	.check_avx
	lea	arg1, [gf_vect_dot_prod_sse WRT_OPT]

.check_avx:
	;; AVX is considered only when both the AVX and OSXSAVE bits are set
	and	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
	cmp	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
	jne	.store
	lea	arg1, [gf_vect_dot_prod_avx WRT_OPT]

	;; CPUID leaf 7, sub-leaf 0: AVX2 bit in ebx
	mov	eax, 7
	xor	ecx, ecx
	cpuid
	test	ebx, FLAG_CPUID1_EBX_AVX2
	jz	.check_os
	lea	arg1, [gf_vect_dot_prod_avx2 WRT_OPT]

.check_os:
	;; XGETBV(0): are xmm and ymm state both enabled?
	xor	ecx, ecx
	xgetbv
	and	eax, FLAG_XGETBV_EAX_XMM_YMM
	cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
	je	.store
	;; not enabled: drop back from avx/avx2 to the sse version
	lea	arg1, [gf_vect_dot_prod_sse WRT_OPT]

.store:
	mov	[gf_vect_dot_prod_dispatched], arg1
	pop	arg5
	pop	arg4
	pop	arg3
	pop	arg2
	pop	arg1
	ret

;;;;
; gf_vect_mad multibinary function
;
; gf_vect_mad                  exported entry; jumps through
;                              gf_vect_mad_dispatched.
; gf_vect_mad_mbinit           initial content of that slot; runs the
;                              dispatch-init routine and then falls through
;                              into gf_vect_mad.
; gf_vect_mad_dispatch_init    elf32 output: always selects the base version.
;                              Other outputs: selects base / sse / avx / avx2
;                              from CPUID and XGETBV. Every register it
;                              writes is saved on entry and restored before
;                              returning.
;
; The selected address is held in arg1 throughout, and the cpuid/xgetbv
; registers are named through arg2..arg5, so this routine does not rely on
; arg1 being one particular hard-coded register.
;;;;
global gf_vect_mad:function
gf_vect_mad_mbinit:
	call    gf_vect_mad_dispatch_init
	; no ret here: execution continues into gf_vect_mad below

gf_vect_mad:
	jmp     wrd_sz [gf_vect_mad_dispatched]

gf_vect_mad_dispatch_init:
	push    arg1
%ifidn __OUTPUT_FORMAT__, elf32         ;; 32-bit check
	lea     arg1, [gf_vect_mad_base]
%else
	push	arg2
	push	arg3
	push	arg4
	push	arg5
	lea     arg1, [gf_vect_mad_base WRT_OPT] ; Default

	;; CPUID leaf 1: use the sse version when the SSE4.1 bit is set
	mov     eax, 1
	cpuid
	lea     arg3, [gf_vect_mad_sse WRT_OPT]
	test    ecx, FLAG_CPUID1_ECX_SSE4_1
	cmovne  arg1, arg3

	;; AVX is considered only when both the AVX and OSXSAVE bits are set.
	;; lea does not modify flags, so the cmp result is still valid at jne.
	and	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
	cmp	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
	lea     arg3, [gf_vect_mad_avx WRT_OPT]

	jne     .done
	mov	arg1, arg3

	;; Try for AVX2 (CPUID leaf 7, sub-leaf 0, bit in ebx)
	xor	ecx, ecx
	mov	eax, 7
	cpuid
	test	ebx, FLAG_CPUID1_EBX_AVX2
	lea     arg3, [gf_vect_mad_avx2 WRT_OPT]
	cmovne	arg1, arg3

	;; Does it have xmm and ymm support
	xor	ecx, ecx
	xgetbv
	and	eax, FLAG_XGETBV_EAX_XMM_YMM
	cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
	je	.done
	;; not enabled: drop back from avx/avx2 to the sse version
	lea     arg1, [gf_vect_mad_sse WRT_OPT]

.done:
	pop     arg5
	pop     arg4
	pop     arg3
	pop     arg2
%endif			;; END 32-bit check
	mov     [gf_vect_mad_dispatched], arg1
	pop	arg1
	ret

;;; Version records, one per exported multibinary function.
;;; NOTE(review): slversion is not defined in this file; presumably it is a
;;; macro supplied by reg_sizes.asm -- confirm before changing these values.
;;;       func                 		core, ver, snum
slversion ec_encode_data,		00,   04,  0133
slversion gf_vect_mul,			00,   03,  0134
slversion ec_encode_data_update,	00,   03,  0212
slversion gf_vect_dot_prod,		00,   03,  0138
slversion gf_vect_mad,			00,   02,  0213