; Source: [ceph.git] / ceph / src / erasure-code / isa / isa-l / erasure_code / ec_multibinary.asm.s

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; WRT_OPT is appended to the symbol inside the "lea reg, [symbol WRT_OPT]"
;; operands of the *_dispatch_init routines in this file.
;; For elf64 output it expands to "wrt ..plt"; for every other output
;; format it expands to nothing, leaving a plain symbol reference.
%ifidn __OUTPUT_FORMAT__, elf64
 %define WRT_OPT		wrt ..plt
%else
 %define WRT_OPT
%endif

%include "reg_sizes.asm"

;; Per-output-format configuration:
;;   def_wrd      data directive used for each slot of the dispatch
;;                pointer table (dd for elf32, dq otherwise)
;;   wrd_sz       operand size used by the indirect "jmp wrd_sz [...]"
;;   arg1..arg5   register aliases that the *_dispatch_init routines
;;                push, use as scratch and pop again; arg1 carries the
;;                address of the selected implementation
%ifidn __OUTPUT_FORMAT__, elf32

[bits 32]

 %define def_wrd		dd
 %define wrd_sz  	dword
 %define arg1		esi
 %define arg2		eax
 %define arg3		ebx
 %define arg4		ecx
 %define arg5		edx

%else

 ;; Every output format other than elf32 is assembled as 64-bit code;
 ;; "default rel" makes plain [symbol] operands RIP-relative.
 default rel
 [bits 64]

 %define def_wrd 	dq
 %define wrd_sz  	qword
 %define arg1		rsi
 %define arg2		rax
 %define arg3		rbx
 %define arg4		rcx
 %define arg5		rdx


 ;; Optimized variants that are referenced only from the non-elf32
 ;; branches of the gf_vect_mul, ec_encode_data_update and gf_vect_mad
 ;; dispatch routines, hence declared only here.
 extern ec_encode_data_update_sse
 extern ec_encode_data_update_avx
 extern ec_encode_data_update_avx2
 extern gf_vect_mul_sse
 extern gf_vect_mul_avx

 extern gf_vect_mad_sse
 extern gf_vect_mad_avx
 extern gf_vect_mad_avx2
%endif

extern gf_vect_mul_base
extern ec_encode_data_base
extern ec_encode_data_update_base
extern gf_vect_dot_prod_base
extern gf_vect_mad_base

extern gf_vect_dot_prod_sse
extern gf_vect_dot_prod_avx
extern gf_vect_dot_prod_avx2
extern ec_encode_data_sse
extern ec_encode_data_avx
extern ec_encode_data_avx2


section .data
;;; Dispatch pointer table: one def_wrd-sized slot per multibinary function.
;;; Each *_dispatched slot initially holds the address of the matching
;;; *_mbinit stub.  On the first call through a slot, *_dispatch_init
;;; overwrites it with the address of the implementation it selected, so
;;; later calls jump straight to that implementation and *_dispatch_init
;;; is not entered again through that slot.

ec_encode_data_dispatched:
	def_wrd      ec_encode_data_mbinit

gf_vect_mul_dispatched:
	def_wrd      gf_vect_mul_mbinit

gf_vect_dot_prod_dispatched:
	def_wrd      gf_vect_dot_prod_mbinit

ec_encode_data_update_dispatched:
	def_wrd      ec_encode_data_update_mbinit

gf_vect_mad_dispatched:
	def_wrd      gf_vect_mad_mbinit

section .text
;-----------------------------------------------------------------------
; ec_encode_data -- multibinary entry point
;
; The exported symbol is a single indirect jump through the pointer
; stored at ec_encode_data_dispatched.  That pointer starts out aimed at
; ec_encode_data_mbinit, which calls ec_encode_data_dispatch_init and
; then falls through into the same jump, now using the updated pointer.
;-----------------------------------------------------------------------
global ec_encode_data:function
ec_encode_data_mbinit:
	call	ec_encode_data_dispatch_init

ec_encode_data:
	jmp	wrd_sz [ec_encode_data_dispatched]

;-----------------------------------------------------------------------
; ec_encode_data_dispatch_init
;
; Chooses one of ec_encode_data_{base,sse,avx,avx2} from CPUID/XGETBV
; results and writes its address to ec_encode_data_dispatched.
; arg1..arg5 are pushed on entry and popped before returning, so the
; general-purpose registers touched here hold their entry values again
; at the ret.  The FLAG_* masks are not defined in this file.
;-----------------------------------------------------------------------
ec_encode_data_dispatch_init:
	push	arg1
	push	arg2
	push	arg3
	push	arg4
	push	arg5

	lea	arg1, [ec_encode_data_base WRT_OPT]	; choice when no test below passes

	;; CPUID leaf 1: feature bits are examined in ecx
	mov	eax, 1
	cpuid
	test	ecx, FLAG_CPUID1_ECX_SSE4_1
	jz	.sse41_checked
	lea	arg1, [ec_encode_data_sse WRT_OPT]
.sse41_checked:

	;; Both the AVX and the OSXSAVE bit must be set to go any further
	and	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
	cmp	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
	jne	.store
	lea	arg1, [ec_encode_data_avx WRT_OPT]

	;; CPUID leaf 7, sub-leaf 0: upgrade to the AVX2 variant if flagged
	mov	eax, 7
	xor	ecx, ecx
	cpuid
	test	ebx, FLAG_CPUID1_EBX_AVX2
	jz	.avx2_checked
	lea	arg1, [ec_encode_data_avx2 WRT_OPT]
.avx2_checked:

	;; XGETBV with ecx = 0: unless every bit of FLAG_XGETBV_EAX_XMM_YMM
	;; is set in eax, drop back to the SSE variant
	xor	ecx, ecx
	xgetbv
	and	eax, FLAG_XGETBV_EAX_XMM_YMM
	cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
	lea	arg3, [ec_encode_data_sse WRT_OPT]	; lea leaves the flags from cmp intact
	cmovne	arg1, arg3

.store:
	mov	[ec_encode_data_dispatched], arg1	; publish the selection
	pop	arg5
	pop	arg4
	pop	arg3
	pop	arg2
	pop	arg1
	ret

;-----------------------------------------------------------------------
; gf_vect_mul -- multibinary entry point
;
; The exported symbol jumps through the pointer at gf_vect_mul_dispatched.
; The pointer initially targets gf_vect_mul_mbinit, which runs
; gf_vect_mul_dispatch_init and then falls through into the jump.
;-----------------------------------------------------------------------
global gf_vect_mul:function
gf_vect_mul_mbinit:
	call	gf_vect_mul_dispatch_init

gf_vect_mul:
	jmp	wrd_sz [gf_vect_mul_dispatched]

;-----------------------------------------------------------------------
; gf_vect_mul_dispatch_init
;
; Stores the address of the selected implementation in
; gf_vect_mul_dispatched.  elf32 builds always select gf_vect_mul_base;
; all other builds pick among gf_vect_mul_{base,sse,avx} from
; CPUID/XGETBV results.  Every register pushed here is popped again
; before the ret.  The FLAG_* masks are not defined in this file.
;-----------------------------------------------------------------------
gf_vect_mul_dispatch_init:
	push	arg1
%ifidn __OUTPUT_FORMAT__, elf32		;; 32-bit build: no feature probing
	lea	arg1, [gf_vect_mul_base]
%else
	push	arg2			; arg2..arg5 are rax, rbx, rcx, rdx here
	push	arg3
	push	arg4
	push	arg5
	lea	arg1, [gf_vect_mul_base WRT_OPT]	; choice when the first test fails

	;; CPUID leaf 1: without the SSE4_2 flag nothing else is tried
	mov	eax, 1
	cpuid
	test	ecx, FLAG_CPUID1_ECX_SSE4_2
	jz	.restore
	lea	arg1, [gf_vect_mul_sse WRT_OPT]

	;; The AVX variant needs both the OSXSAVE and the AVX bit ...
	and	ecx, (FLAG_CPUID1_ECX_OSXSAVE | FLAG_CPUID1_ECX_AVX)
	cmp	ecx, (FLAG_CPUID1_ECX_OSXSAVE | FLAG_CPUID1_ECX_AVX)
	jne	.restore

	;; ... and every bit of FLAG_XGETBV_EAX_XMM_YMM from XGETBV(ecx = 0)
	xor	ecx, ecx
	xgetbv
	and	eax, FLAG_XGETBV_EAX_XMM_YMM
	cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
	jne	.restore
	lea	arg1, [gf_vect_mul_avx WRT_OPT]

.restore:
	pop	arg5
	pop	arg4
	pop	arg3
	pop	arg2
%endif			;; END 32-bit check
	mov	[gf_vect_mul_dispatched], arg1	; publish the selection
	pop	arg1
	ret

;;;;
; ec_encode_data_update multibinary function
;
; The exported symbol jumps through the pointer stored at
; ec_encode_data_update_dispatched.  That pointer initially targets
; ec_encode_data_update_mbinit, which calls the dispatch-init routine
; and then falls through into the jump with the pointer updated.
;;;;
global ec_encode_data_update:function
ec_encode_data_update_mbinit:
	call	ec_encode_data_update_dispatch_init

ec_encode_data_update:
	jmp	wrd_sz [ec_encode_data_update_dispatched]

;; Stores the address of the selected implementation in
;; ec_encode_data_update_dispatched.  elf32 builds always select the
;; base version; other builds choose among the base/sse/avx/avx2
;; variants from CPUID/XGETBV results.  All registers pushed here are
;; popped again before returning.  The selection is held in arg1
;; throughout (never a hard-coded register name), so the value written
;; to the dispatch slot is always the one that was selected.
ec_encode_data_update_dispatch_init:
	push    arg1
%ifidn __OUTPUT_FORMAT__, elf32		;; 32-bit check
	lea     arg1, [ec_encode_data_update_base]
%else
	push    rax
	push    rbx
	push    rcx
	push    rdx
	lea     arg1, [ec_encode_data_update_base WRT_OPT] ; Default

	mov     eax, 1
	cpuid
	lea     rbx, [ec_encode_data_update_sse WRT_OPT]
	test    ecx, FLAG_CPUID1_ECX_SSE4_1
	cmovne  arg1, rbx

	and	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
	cmp	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
	lea	rbx, [ec_encode_data_update_avx WRT_OPT]	; lea keeps the flags from cmp

	jne	_done_ec_encode_data_update_init
	mov	arg1, rbx

	;; Try for AVX2
	xor	ecx, ecx
	mov	eax, 7
	cpuid
	test	ebx, FLAG_CPUID1_EBX_AVX2	; must precede the lea that overwrites rbx
	lea     rbx, [ec_encode_data_update_avx2 WRT_OPT]
	cmovne	arg1, rbx

	;; Does it have xmm and ymm support
	xor	ecx, ecx
	xgetbv
	and	eax, FLAG_XGETBV_EAX_XMM_YMM
	cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
	je	_done_ec_encode_data_update_init
	lea     arg1, [ec_encode_data_update_sse WRT_OPT]	; fall back to SSE

_done_ec_encode_data_update_init:
	pop     rdx
	pop     rcx
	pop     rbx
	pop     rax
%endif			;; END 32-bit check
	mov     [ec_encode_data_update_dispatched], arg1
	pop     arg1
	ret

;-----------------------------------------------------------------------
; gf_vect_dot_prod -- multibinary entry point
;
; The exported symbol jumps through the pointer at
; gf_vect_dot_prod_dispatched.  The pointer initially targets
; gf_vect_dot_prod_mbinit, which runs gf_vect_dot_prod_dispatch_init
; and then falls through into the jump.
;-----------------------------------------------------------------------
global gf_vect_dot_prod:function
gf_vect_dot_prod_mbinit:
	call	gf_vect_dot_prod_dispatch_init

gf_vect_dot_prod:
	jmp	wrd_sz [gf_vect_dot_prod_dispatched]

;-----------------------------------------------------------------------
; gf_vect_dot_prod_dispatch_init
;
; Chooses one of gf_vect_dot_prod_{base,sse,avx,avx2} from CPUID/XGETBV
; results and writes its address to gf_vect_dot_prod_dispatched.
; arg1..arg5 are saved on entry and restored before the ret.
; The FLAG_* masks are not defined in this file.
;-----------------------------------------------------------------------
gf_vect_dot_prod_dispatch_init:
	push	arg1
	push	arg2
	push	arg3
	push	arg4
	push	arg5

	lea	arg1, [gf_vect_dot_prod_base WRT_OPT]	; choice when no test below passes

	;; CPUID leaf 1
	mov	eax, 1
	cpuid
	test	ecx, FLAG_CPUID1_ECX_SSE4_1
	jz	.sse41_checked
	lea	arg1, [gf_vect_dot_prod_sse WRT_OPT]
.sse41_checked:

	;; AVX and OSXSAVE must both be flagged to continue
	and	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
	cmp	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
	jne	.store
	lea	arg1, [gf_vect_dot_prod_avx WRT_OPT]

	;; CPUID leaf 7, sub-leaf 0: upgrade to AVX2 if flagged
	mov	eax, 7
	xor	ecx, ecx
	cpuid
	test	ebx, FLAG_CPUID1_EBX_AVX2
	jz	.avx2_checked
	lea	arg1, [gf_vect_dot_prod_avx2 WRT_OPT]
.avx2_checked:

	;; XGETBV with ecx = 0: unless every bit of FLAG_XGETBV_EAX_XMM_YMM
	;; is set in eax, drop back to the SSE variant
	xor	ecx, ecx
	xgetbv
	and	eax, FLAG_XGETBV_EAX_XMM_YMM
	cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
	lea	arg3, [gf_vect_dot_prod_sse WRT_OPT]	; lea leaves the flags from cmp intact
	cmovne	arg1, arg3

.store:
	mov	[gf_vect_dot_prod_dispatched], arg1	; publish the selection
	pop	arg5
	pop	arg4
	pop	arg3
	pop	arg2
	pop	arg1
	ret

;;;;
; gf_vect_mad multibinary function
;
; The exported symbol jumps through the pointer stored at
; gf_vect_mad_dispatched.  That pointer initially targets
; gf_vect_mad_mbinit, which calls gf_vect_mad_dispatch_init and then
; falls through into the jump with the pointer updated.
;;;;
global gf_vect_mad:function
gf_vect_mad_mbinit:
	call    gf_vect_mad_dispatch_init

gf_vect_mad:
	jmp     wrd_sz [gf_vect_mad_dispatched]

;; Stores the address of the selected implementation in
;; gf_vect_mad_dispatched.  elf32 builds always select gf_vect_mad_base;
;; other builds choose among gf_vect_mad_{base,sse,avx,avx2} from
;; CPUID/XGETBV results.  All registers pushed here are popped again
;; before returning.  The selection is held in arg1 throughout (never a
;; hard-coded register name), so the value written to the dispatch slot
;; is always the one that was selected.
gf_vect_mad_dispatch_init:
	push    arg1
%ifidn __OUTPUT_FORMAT__, elf32         ;; 32-bit check
	lea     arg1, [gf_vect_mad_base]
%else
	push	rax
	push	rbx
	push	rcx
	push	rdx
	lea     arg1, [gf_vect_mad_base WRT_OPT] ; Default

	mov     eax, 1
	cpuid
	lea     rbx, [gf_vect_mad_sse WRT_OPT]
	test    ecx, FLAG_CPUID1_ECX_SSE4_1
	cmovne  arg1, rbx

	and	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
	cmp	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
	lea     rbx, [gf_vect_mad_avx WRT_OPT]	; lea keeps the flags from cmp

	jne     _done_gf_vect_mad_init
	mov	arg1, rbx

	;; Try for AVX2
	xor	ecx, ecx
	mov	eax, 7
	cpuid
	test	ebx, FLAG_CPUID1_EBX_AVX2	; must precede the lea that overwrites rbx
	lea     rbx, [gf_vect_mad_avx2 WRT_OPT]
	cmovne	arg1, rbx

	;; Does it have xmm and ymm support
	xor	ecx, ecx
	xgetbv
	and	eax, FLAG_XGETBV_EAX_XMM_YMM
	cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
	je	_done_gf_vect_mad_init
	lea     arg1, [gf_vect_mad_sse WRT_OPT]	; fall back to SSE

_done_gf_vect_mad_init:
	pop     rdx
	pop     rcx
	pop     rbx
	pop     rax
%endif			;; END 32-bit check
	mov     [gf_vect_mad_dispatched], arg1
	pop	arg1
	ret

;;; One slversion record per exported multibinary function.
;;; NOTE(review): slversion is not defined in this file; presumably it is a
;;; macro supplied by the included reg_sizes.asm -- confirm before relying
;;; on the exact meaning of the core/ver/snum fields.
;;;       func                 		core, ver, snum
slversion ec_encode_data,		00,   04,  0133
slversion gf_vect_mul,			00,   03,  0134
slversion ec_encode_data_update,	00,   03,  0212
slversion gf_vect_dot_prod,		00,   03,  0138
slversion gf_vect_mad,			00,   02,  0213
Commit	Line	Data
7c673cae FG	1	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	2	; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
	3	;
	4	; Redistribution and use in source and binary forms, with or without
	5	; modification, are permitted provided that the following conditions
	6	; are met:
	7	; * Redistributions of source code must retain the above copyright
	8	; notice, this list of conditions and the following disclaimer.
	9	; * Redistributions in binary form must reproduce the above copyright
	10	; notice, this list of conditions and the following disclaimer in
	11	; the documentation and/or other materials provided with the
	12	; distribution.
	13	; * Neither the name of Intel Corporation nor the names of its
	14	; contributors may be used to endorse or promote products derived
	15	; from this software without specific prior written permission.
	16	;
	17	; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	18	; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	19	; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
	20	; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
	21	; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	22	; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
	23	; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	24	; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	25	; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	26	; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	27	; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	28	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	29
	30	%ifidn __OUTPUT_FORMAT__, elf64
	31	%define WRT_OPT wrt ..plt
	32	%else
	33	%define WRT_OPT
	34	%endif
	35
	36	%include "reg_sizes.asm"
	37
	38	%ifidn __OUTPUT_FORMAT__, elf32
	39
	40	[bits 32]
	41
	42	%define def_wrd dd
	43	%define wrd_sz dword
	44	%define arg1 esi
	45	%define arg2 eax
	46	%define arg3 ebx
	47	%define arg4 ecx
	48	%define arg5 edx
	49
	50	%else
	51
	52	default rel
	53	[bits 64]
	54
	55	%define def_wrd dq
	56	%define wrd_sz qword
	57	%define arg1 rsi
	58	%define arg2 rax
	59	%define arg3 rbx
	60	%define arg4 rcx
	61	%define arg5 rdx
	62
	63
	64	extern ec_encode_data_update_sse
65	extern ec_encode_data_update_avx
66	extern ec_encode_data_update_avx2
67	extern gf_vect_mul_sse
68	extern gf_vect_mul_avx
69
70	extern gf_vect_mad_sse
71	extern gf_vect_mad_avx
72	extern gf_vect_mad_avx2
73	%endif
74
75	extern gf_vect_mul_base
76	extern ec_encode_data_base
77	extern ec_encode_data_update_base
78	extern gf_vect_dot_prod_base
79	extern gf_vect_mad_base
80
81	extern gf_vect_dot_prod_sse
82	extern gf_vect_dot_prod_avx
83	extern gf_vect_dot_prod_avx2
84	extern ec_encode_data_sse
85	extern ec_encode_data_avx
86	extern ec_encode_data_avx2
87
88
89	section .data
90	;;; _mbinit are initial values for _dispatched; is updated on first call.
91	;;; Therefore, *_dispatch_init is only executed on first call.
92
93	ec_encode_data_dispatched:
94	def_wrd ec_encode_data_mbinit
95
96	gf_vect_mul_dispatched:
97	def_wrd gf_vect_mul_mbinit
98
99	gf_vect_dot_prod_dispatched:
100	def_wrd gf_vect_dot_prod_mbinit
101
102	ec_encode_data_update_dispatched:
103	def_wrd ec_encode_data_update_mbinit
104
105	gf_vect_mad_dispatched:
106	def_wrd gf_vect_mad_mbinit
107
108	section .text
109	;;;;
110	; ec_encode_data multibinary function
111	;;;;
112	global ec_encode_data:function
113	ec_encode_data_mbinit:
114	call ec_encode_data_dispatch_init
115
116	ec_encode_data:
117	jmp wrd_sz [ec_encode_data_dispatched]
118
119	ec_encode_data_dispatch_init:
120	push arg1
121	push arg2
122	push arg3
123	push arg4
124	push arg5
125	lea arg1, [ec_encode_data_base WRT_OPT] ; Default
126
127	mov eax, 1
128	cpuid
129	lea arg3, [ec_encode_data_sse WRT_OPT]
130	test ecx, FLAG_CPUID1_ECX_SSE4_1
131	cmovne arg1, arg3
132
133	and ecx, (FLAG_CPUID1_ECX_AVX \| FLAG_CPUID1_ECX_OSXSAVE)
134	cmp ecx, (FLAG_CPUID1_ECX_AVX \| FLAG_CPUID1_ECX_OSXSAVE)
135	lea arg3, [ec_encode_data_avx WRT_OPT]
136
137	jne _done_ec_encode_data_init
138	mov arg1, arg3
139
140	;; Try for AVX2
141	xor ecx, ecx
142	mov eax, 7
143	cpuid
144	test ebx, FLAG_CPUID1_EBX_AVX2
145	lea arg3, [ec_encode_data_avx2 WRT_OPT]
146	cmovne arg1, arg3
147	;; Does it have xmm and ymm support
148	xor ecx, ecx
149	xgetbv
150	and eax, FLAG_XGETBV_EAX_XMM_YMM
151	cmp eax, FLAG_XGETBV_EAX_XMM_YMM
152	je _done_ec_encode_data_init
153	lea arg1, [ec_encode_data_sse WRT_OPT]
154
155	_done_ec_encode_data_init:
156	pop arg5
157	pop arg4
158	pop arg3
159	pop arg2
160	mov [ec_encode_data_dispatched], arg1
161	pop arg1
162	ret
163
164	;;;;
165	; gf_vect_mul multibinary function
166	;;;;
167	global gf_vect_mul:function
168	gf_vect_mul_mbinit:
169	call gf_vect_mul_dispatch_init
170
171	gf_vect_mul:
172	jmp wrd_sz [gf_vect_mul_dispatched]
173
174	gf_vect_mul_dispatch_init:
175	push arg1
176	%ifidn __OUTPUT_FORMAT__, elf32 ;; 32-bit check
177	lea arg1, [gf_vect_mul_base]
178	%else
179	push rax
180	push rbx
181	push rcx
182	push rdx
183	lea arg1, [gf_vect_mul_base WRT_OPT] ; Default
184
185	mov eax, 1
186	cpuid
187	test ecx, FLAG_CPUID1_ECX_SSE4_2
188	lea rbx, [gf_vect_mul_sse WRT_OPT]
189	je _done_gf_vect_mul_dispatch_init
190	mov arg1, rbx
191
192	;; Try for AVX
193	and ecx, (FLAG_CPUID1_ECX_OSXSAVE \| FLAG_CPUID1_ECX_AVX)
194	cmp ecx, (FLAG_CPUID1_ECX_OSXSAVE \| FLAG_CPUID1_ECX_AVX)
195	jne _done_gf_vect_mul_dispatch_init
196
197	;; Does it have xmm and ymm support
198	xor ecx, ecx
199	xgetbv
200	and eax, FLAG_XGETBV_EAX_XMM_YMM
201	cmp eax, FLAG_XGETBV_EAX_XMM_YMM
202	jne _done_gf_vect_mul_dispatch_init
203	lea arg1, [gf_vect_mul_avx WRT_OPT]
204
205	_done_gf_vect_mul_dispatch_init:
206	pop rdx
207	pop rcx
208	pop rbx
209	pop rax
210	%endif ;; END 32-bit check
211	mov [gf_vect_mul_dispatched], arg1
212	pop arg1
213	ret
214
215	;;;;
216	; ec_encode_data_update multibinary function
217	;;;;
218	global ec_encode_data_update:function
219	ec_encode_data_update_mbinit:
220	call ec_encode_data_update_dispatch_init
221
222	ec_encode_data_update:
223	jmp wrd_sz [ec_encode_data_update_dispatched]
224
225	ec_encode_data_update_dispatch_init:
226	push arg1
227	%ifidn __OUTPUT_FORMAT__, elf32 ;; 32-bit check
228	lea arg1, [ec_encode_data_update_base]
229	%else
230	push rax
231	push rbx
232	push rcx
233	push rdx
234	lea arg1, [ec_encode_data_update_base WRT_OPT] ; Default
235
236	mov eax, 1
237	cpuid
238	lea rbx, [ec_encode_data_update_sse WRT_OPT]
239	test ecx, FLAG_CPUID1_ECX_SSE4_1
240	cmovne arg1, rbx
241
242	and ecx, (FLAG_CPUID1_ECX_AVX \| FLAG_CPUID1_ECX_OSXSAVE)
243	cmp ecx, (FLAG_CPUID1_ECX_AVX \| FLAG_CPUID1_ECX_OSXSAVE)
244	lea rbx, [ec_encode_data_update_avx WRT_OPT]
245
246	jne _done_ec_encode_data_update_init
247	mov rsi, rbx
248
249	;; Try for AVX2
250	xor ecx, ecx
251	mov eax, 7
252	cpuid
253	test ebx, FLAG_CPUID1_EBX_AVX2
254	lea rbx, [ec_encode_data_update_avx2 WRT_OPT]
255	cmovne rsi, rbx
256
257	;; Does it have xmm and ymm support
258	xor ecx, ecx
259	xgetbv
260	and eax, FLAG_XGETBV_EAX_XMM_YMM
261	cmp eax, FLAG_XGETBV_EAX_XMM_YMM
262	je _done_ec_encode_data_update_init
263	lea rsi, [ec_encode_data_update_sse WRT_OPT]
264
265	_done_ec_encode_data_update_init:
266	pop rdx
267	pop rcx
268	pop rbx
269	pop rax
270	%endif ;; END 32-bit check
271	mov [ec_encode_data_update_dispatched], arg1
272	pop arg1
273	ret
274
275	;;;;
276	; gf_vect_dot_prod multibinary function
277	;;;;
278	global gf_vect_dot_prod:function
279	gf_vect_dot_prod_mbinit:
280	call gf_vect_dot_prod_dispatch_init
281
282	gf_vect_dot_prod:
283	jmp wrd_sz [gf_vect_dot_prod_dispatched]
284
285	gf_vect_dot_prod_dispatch_init:
286	push arg1
287	push arg2
288	push arg3
289	push arg4
290	push arg5
291	lea arg1, [gf_vect_dot_prod_base WRT_OPT] ; Default
292
293	mov eax, 1
294	cpuid
295	lea arg3, [gf_vect_dot_prod_sse WRT_OPT]
296	test ecx, FLAG_CPUID1_ECX_SSE4_1
297	cmovne arg1, arg3
298
299	and ecx, (FLAG_CPUID1_ECX_AVX \| FLAG_CPUID1_ECX_OSXSAVE)
300	cmp ecx, (FLAG_CPUID1_ECX_AVX \| FLAG_CPUID1_ECX_OSXSAVE)
301	lea arg3, [gf_vect_dot_prod_avx WRT_OPT]
302
303	jne _done_gf_vect_dot_prod_init
304	mov arg1, arg3
305
306	;; Try for AVX2
307	xor ecx, ecx
308	mov eax, 7
309	cpuid
310	test ebx, FLAG_CPUID1_EBX_AVX2
311	lea arg3, [gf_vect_dot_prod_avx2 WRT_OPT]
312	cmovne arg1, arg3
313	;; Does it have xmm and ymm support
314	xor ecx, ecx
315	xgetbv
316	and eax, FLAG_XGETBV_EAX_XMM_YMM
317	cmp eax, FLAG_XGETBV_EAX_XMM_YMM
318	je _done_gf_vect_dot_prod_init
319	lea arg1, [gf_vect_dot_prod_sse WRT_OPT]
320
321	_done_gf_vect_dot_prod_init:
322	pop arg5
323	pop arg4
324	pop arg3
325	pop arg2
326	mov [gf_vect_dot_prod_dispatched], arg1
327	pop arg1
328	ret
329
330	;;;;
331	; gf_vect_mad multibinary function
332	;;;;
333	global gf_vect_mad:function
334	gf_vect_mad_mbinit:
335	call gf_vect_mad_dispatch_init
336
337	gf_vect_mad:
338	jmp wrd_sz [gf_vect_mad_dispatched]
339
340	gf_vect_mad_dispatch_init:
341	push arg1
342	%ifidn __OUTPUT_FORMAT__, elf32 ;; 32-bit check
343	lea arg1, [gf_vect_mad_base]
344	%else
345	push rax
346	push rbx
347	push rcx
348	push rdx
349	lea arg1, [gf_vect_mad_base WRT_OPT] ; Default
350
351	mov eax, 1
352	cpuid
353	lea rbx, [gf_vect_mad_sse WRT_OPT]
354	test ecx, FLAG_CPUID1_ECX_SSE4_1
355	cmovne arg1, rbx
356
357	and ecx, (FLAG_CPUID1_ECX_AVX \| FLAG_CPUID1_ECX_OSXSAVE)
358	cmp ecx, (FLAG_CPUID1_ECX_AVX \| FLAG_CPUID1_ECX_OSXSAVE)
359	lea rbx, [gf_vect_mad_avx WRT_OPT]
360
361	jne _done_gf_vect_mad_init
362	mov rsi, rbx
363
364	;; Try for AVX2
365	xor ecx, ecx
366	mov eax, 7
367	cpuid
368	test ebx, FLAG_CPUID1_EBX_AVX2
369	lea rbx, [gf_vect_mad_avx2 WRT_OPT]
370	cmovne rsi, rbx
371
372	;; Does it have xmm and ymm support
373	xor ecx, ecx
374	xgetbv
375	and eax, FLAG_XGETBV_EAX_XMM_YMM
376	cmp eax, FLAG_XGETBV_EAX_XMM_YMM
377	je _done_gf_vect_mad_init
378	lea rsi, [gf_vect_mad_sse WRT_OPT]
379
380	_done_gf_vect_mad_init:
381	pop rdx
382	pop rcx
383	pop rbx
384	pop rax
385	%endif ;; END 32-bit check
386	mov [gf_vect_mad_dispatched], arg1
387	pop arg1
388	ret
389
390	;;; func core, ver, snum
391	slversion ec_encode_data, 00, 04, 0133
392	slversion gf_vect_mul, 00, 03, 0134
393	slversion ec_encode_data_update, 00, 03, 0212
394	slversion gf_vect_dot_prod, 00, 03, 0138
395	slversion gf_vect_mad, 00, 02, 0213