1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2019 Intel Corporation All rights reserved.
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
30 %ifndef _MULTIBINARY_ASM_
31 %define _MULTIBINARY_ASM_
;; Pointer-size abstraction so dispatch slots work for 32-bit and 64-bit
;; output formats: mbin_def_ptr declares a pointer-sized data cell and
;; mbin_ptr_sz is the matching operand-size keyword for memory accesses.
33 %ifidn __OUTPUT_FORMAT__, elf32
34 %define mbin_def_ptr dd
35 %define mbin_ptr_sz dword
43 %define mbin_def_ptr dq
44 %define mbin_ptr_sz qword
;; Default dispatch feature level when the build system does not set one.
53 %ifndef AS_FEATURE_LEVEL
54 %define AS_FEATURE_LEVEL 4
59 ; creates the visible entry point that uses HW optimized call pointer
60 ; creates the init of the HW optimized call pointer
62 %macro mbin_interface 1
64 ; *_dispatched is defaulted to *_mbinit and replaced on first call.
65 ; Therefore, *_dispatch_init is only executed on first call.
;; %1_dispatched: pointer-sized slot, initialized to the one-time init stub.
69 mbin_def_ptr %1_mbinit
72 mk_global %1, function
74 ;;; only called the first time to setup hardware match
76 ;;; falls thru to execute the hw optimized code
;; public entry %1: tail-jump through the dispatched function pointer
78 jmp mbin_ptr_sz [%1_dispatched]
82 ; mbin_dispatch_init parameters
83 ; Use this function when SSE/00/01 is a minimum requirement
85 ; 2-> SSE/00/01 optimized function used as base
86 ; 3-> AVX or AVX/02 opt func
87 ; 4-> AVX2 or AVX/04 opt func
89 %macro mbin_dispatch_init 4
;; Picks the best of %2/%3/%4 for this CPU and stores the pointer in
;; %1_dispatched.  mbin_rsi carries the currently selected function
;; pointer; mbin_rbx holds the candidate for the next feature level.
97 lea mbin_rsi, [%2 WRT_OPT] ; Default to SSE 00/01
;; cpuid(1).ecx: AVX is usable only if both AVX and OSXSAVE bits are set
101 and ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
102 cmp ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
103 lea mbin_rbx, [%3 WRT_OPT] ; AVX (gen2) opt func
104 jne _%1_init_done ; AVX is not available so end
105 mov mbin_rsi, mbin_rbx
;; cpuid(7).ebx: upgrade to the AVX2 version when supported
111 test ebx, FLAG_CPUID7_EBX_AVX2
112 lea mbin_rbx, [%4 WRT_OPT] ; AVX (gen4) opt func
113 cmovne mbin_rsi, mbin_rbx
115 ;; Does it have xmm and ymm support
118 and eax, FLAG_XGETBV_EAX_XMM_YMM
119 cmp eax, FLAG_XGETBV_EAX_XMM_YMM
;; OS does not save ymm state - fall back to the SSE base version
121 lea mbin_rsi, [%2 WRT_OPT]
128 mov [%1_dispatched], mbin_rsi
134 ; mbin_dispatch_init2 parameters
135 ; Cases where only base functions are available
139 %macro mbin_dispatch_init2 2
;; No CPU detection needed: always dispatch to the single base function %2.
143 lea mbin_rsi, [%2 WRT_OPT] ; Default
144 mov [%1_dispatched], mbin_rsi
150 ; mbin_dispatch_init5 parameters
153 ; 3-> SSE4_1 or 00/01 optimized function
154 ; 4-> AVX/02 opt func
155 ; 5-> AVX2/04 opt func
157 %macro mbin_dispatch_init5 5
;; Like mbin_dispatch_init but with a plain-C base (%2) below the SSE4.1
;; level (%3).  mbin_rsi holds the current choice; mbin_rbx the candidate.
165 lea mbin_rsi, [%2 WRT_OPT] ; Default - use base function
;; cpuid(1).ecx: upgrade to the SSE4.1 version when supported
170 test ecx, FLAG_CPUID1_ECX_SSE4_1
171 lea mbin_rbx, [%3 WRT_OPT] ; SSE opt func
172 cmovne mbin_rsi, mbin_rbx
;; AVX is usable only if both AVX and OSXSAVE cpuid bits are set
174 and ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
175 cmp ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
176 lea mbin_rbx, [%4 WRT_OPT] ; AVX (gen2) opt func
177 jne _%1_init_done ; AVX is not available so end
178 mov mbin_rsi, mbin_rbx
;; cpuid(7).ebx: upgrade to the AVX2 version when supported
184 test ebx, FLAG_CPUID7_EBX_AVX2
185 lea mbin_rbx, [%5 WRT_OPT] ; AVX (gen4) opt func
186 cmovne mbin_rsi, mbin_rbx
188 ;; Does it have xmm and ymm support
191 and eax, FLAG_XGETBV_EAX_XMM_YMM
192 cmp eax, FLAG_XGETBV_EAX_XMM_YMM
;; OS does not save ymm state - fall back to the SSE4.1 version
194 lea mbin_rsi, [%3 WRT_OPT]
201 mov [%1_dispatched], mbin_rsi
206 %if AS_FEATURE_LEVEL >= 6
208 ; mbin_dispatch_init6 parameters
211 ; 3-> SSE4_1 or 00/01 optimized function
212 ; 4-> AVX/02 opt func
213 ; 5-> AVX2/04 opt func
214 ; 6-> AVX512/06 opt func
216 %macro mbin_dispatch_init6 6
;; Dispatch across base/SSE4.1/AVX/AVX2/AVX512 levels.  mbin_rsi holds
;; the current choice; edi preserves xgetbv.eax for the later zmm check.
225 lea mbin_rsi, [%2 WRT_OPT] ; Default - use base function
229 mov ebx, ecx ; save cpuid1.ecx
230 test ecx, FLAG_CPUID1_ECX_SSE4_1
231 je _%1_init_done ; Use base function if no SSE4_1
232 lea mbin_rsi, [%3 WRT_OPT] ; SSE possible so use 00/01 opt
234 ;; Test for XMM_YMM support/AVX
235 test ecx, FLAG_CPUID1_ECX_OSXSAVE
238 xgetbv ; xcr -> edx:eax
239 mov edi, eax ; save xgetbv.eax
241 and eax, FLAG_XGETBV_EAX_XMM_YMM
242 cmp eax, FLAG_XGETBV_EAX_XMM_YMM
244 test ebx, FLAG_CPUID1_ECX_AVX
246 lea mbin_rsi, [%4 WRT_OPT] ; AVX/02 opt
252 test ebx, FLAG_CPUID7_EBX_AVX2
253 je _%1_init_done ; No AVX2 possible
254 lea mbin_rsi, [%5 WRT_OPT] ; AVX2/04 opt func
;; AVX512 needs OS save of zmm/opmask state plus the G1 cpuid feature group
257 and edi, FLAG_XGETBV_EAX_ZMM_OPM
258 cmp edi, FLAG_XGETBV_EAX_ZMM_OPM
259 jne _%1_init_done ; No AVX512 possible
260 and ebx, FLAGS_CPUID7_EBX_AVX512_G1
261 cmp ebx, FLAGS_CPUID7_EBX_AVX512_G1
262 lea mbin_rbx, [%6 WRT_OPT] ; AVX512/06 opt
263 cmove mbin_rsi, mbin_rbx
271 mov [%1_dispatched], mbin_rsi
277 %macro mbin_dispatch_init6 6
;; AS_FEATURE_LEVEL < 6 fallback: ignore the AVX512 function (%6) and
;; dispatch among the remaining levels via mbin_dispatch_init5.
278 mbin_dispatch_init5 %1, %2, %3, %4, %5
282 %if AS_FEATURE_LEVEL >= 10
284 ; mbin_dispatch_init7 parameters
287 ; 3-> SSE4_2 or 00/01 optimized function
288 ; 4-> AVX/02 opt func
289 ; 5-> AVX2/04 opt func
290 ; 6-> AVX512/06 opt func
291 ; 7-> AVX512 Update/10 opt func
293 %macro mbin_dispatch_init7 7
;; Like mbin_dispatch_init6 plus a top level gated by the newer AVX512
;; "update" feature group (cpuid7.ecx G2).  mbin_rsi holds the current
;; choice; edi preserves xgetbv.eax for the later zmm/opmask check.
302 lea mbin_rsi, [%2 WRT_OPT] ; Default - use base function
306 mov ebx, ecx ; save cpuid1.ecx
307 test ecx, FLAG_CPUID1_ECX_SSE4_2
308 je _%1_init_done ; Use base function if no SSE4_2
309 lea mbin_rsi, [%3 WRT_OPT] ; SSE possible so use 00/01 opt
311 ;; Test for XMM_YMM support/AVX
312 test ecx, FLAG_CPUID1_ECX_OSXSAVE
315 xgetbv ; xcr -> edx:eax
316 mov edi, eax ; save xgetbv.eax
318 and eax, FLAG_XGETBV_EAX_XMM_YMM
319 cmp eax, FLAG_XGETBV_EAX_XMM_YMM
321 test ebx, FLAG_CPUID1_ECX_AVX
323 lea mbin_rsi, [%4 WRT_OPT] ; AVX/02 opt
329 test ebx, FLAG_CPUID7_EBX_AVX2
330 je _%1_init_done ; No AVX2 possible
331 lea mbin_rsi, [%5 WRT_OPT] ; AVX2/04 opt func
334 and edi, FLAG_XGETBV_EAX_ZMM_OPM
335 cmp edi, FLAG_XGETBV_EAX_ZMM_OPM
336 jne _%1_init_done ; No AVX512 possible
337 and ebx, FLAGS_CPUID7_EBX_AVX512_G1
338 cmp ebx, FLAGS_CPUID7_EBX_AVX512_G1
339 lea mbin_rbx, [%6 WRT_OPT] ; AVX512/06 opt
340 cmove mbin_rsi, mbin_rbx
;; cpuid7.ecx G2 feature group gates the newer AVX512 update function
342 and ecx, FLAGS_CPUID7_ECX_AVX512_G2
343 cmp ecx, FLAGS_CPUID7_ECX_AVX512_G2
344 lea mbin_rbx, [%7 WRT_OPT] ; AVX512 Update/10 opt
345 cmove mbin_rsi, mbin_rbx
353 mov [%1_dispatched], mbin_rsi
358 %macro mbin_dispatch_init7 7
;; AS_FEATURE_LEVEL < 10 fallback: ignore the AVX512-update function (%7)
;; and dispatch among the remaining levels via mbin_dispatch_init6.
359 mbin_dispatch_init6 %1, %2, %3, %4, %5, %6
364 ; mbin_dispatch_sse_to_avx2_shani parameters
365 ; derived from mbin_dispatch_init
366 ; Use this function when SSE/00/01 is a minimum requirement
368 ; 2-> SSE/00/01 optimized function used as base
369 ; 3-> AVX or AVX/02 opt func
370 ; 4-> AVX2 or AVX/04 opt func
371 ; 5-> SHANI opt for GLM
373 %macro mbin_dispatch_sse_to_avx2_shani 5
;; Like mbin_dispatch_init, but when AVX is absent it branches to a
;; SHA-NI check and prefers the SHANI function (%5) over plain SSE.
381 lea mbin_rsi, [%2 WRT_OPT] ; Default to SSE 00/01
;; AVX is usable only if both AVX and OSXSAVE cpuid bits are set
385 and ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
386 cmp ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
387 lea mbin_rbx, [%3 WRT_OPT] ; AVX (gen2) opt func
388 jne _%1_shani_check ; AVX is not available so check shani
389 mov mbin_rsi, mbin_rbx
;; cpuid(7).ebx: upgrade to the AVX2 version when supported
395 test ebx, FLAG_CPUID7_EBX_AVX2
396 lea mbin_rbx, [%4 WRT_OPT] ; AVX (gen4) opt func
397 cmovne mbin_rsi, mbin_rbx
399 ;; Does it have xmm and ymm support
402 and eax, FLAG_XGETBV_EAX_XMM_YMM
403 cmp eax, FLAG_XGETBV_EAX_XMM_YMM
;; OS does not save ymm state - fall back to the SSE base version
405 lea mbin_rsi, [%2 WRT_OPT]
412 mov [%1_dispatched], mbin_rsi
;; no-AVX path (_%1_shani_check): take the SHANI function if SHA is present
420 test ebx, FLAG_CPUID7_EBX_SHA
421 lea mbin_rbx, [%5 WRT_OPT] ; SHANI opt func
422 cmovne mbin_rsi, mbin_rbx
423 jmp _%1_init_done ; end
427 ; mbin_dispatch_base_to_avx512_shani parameters
428 ; derived from mbin_dispatch_init6
431 ; 3-> SSE4_2 or 00/01 optimized function
432 ; 4-> AVX/02 opt func
433 ; 5-> AVX2/04 opt func
434 ; 6-> AVX512/06 opt func
435 ; 7-> SHANI opt for GLM
436 ; 8-> SHANI opt for CNL
438 %macro mbin_dispatch_base_to_avx512_shani 8
;; mbin_dispatch_init6 extended with two SHA-NI variants: %8 (CNL) on the
;; AVX512 path and %7 (GLM) on the non-AVX512 path.  mbin_rsi holds the
;; current choice; edi preserves xgetbv.eax for the later zmm check.
447 lea mbin_rsi, [%2 WRT_OPT] ; Default - use base function
451 mov ebx, ecx ; save cpuid1.ecx
452 test ecx, FLAG_CPUID1_ECX_SSE4_2
453 je _%1_init_done ; Use base function if no SSE4_2
454 lea mbin_rsi, [%3 WRT_OPT] ; SSE possible so use 00/01 opt
456 ;; Test for XMM_YMM support/AVX
457 test ecx, FLAG_CPUID1_ECX_OSXSAVE
460 xgetbv ; xcr -> edx:eax
461 mov edi, eax ; save xgetbv.eax
463 and eax, FLAG_XGETBV_EAX_XMM_YMM
464 cmp eax, FLAG_XGETBV_EAX_XMM_YMM
466 test ebx, FLAG_CPUID1_ECX_AVX
468 lea mbin_rsi, [%4 WRT_OPT] ; AVX/02 opt
474 test ebx, FLAG_CPUID7_EBX_AVX2
475 je _%1_init_done ; No AVX2 possible
476 lea mbin_rsi, [%5 WRT_OPT] ; AVX2/04 opt func
;; AVX512 needs OS save of zmm/opmask state plus the G1 cpuid feature group
479 and edi, FLAG_XGETBV_EAX_ZMM_OPM
480 cmp edi, FLAG_XGETBV_EAX_ZMM_OPM
481 jne _%1_init_done ; No AVX512 possible
482 and ebx, FLAGS_CPUID7_EBX_AVX512_G1
483 cmp ebx, FLAGS_CPUID7_EBX_AVX512_G1
484 lea mbin_rbx, [%6 WRT_OPT] ; AVX512/06 opt
485 cmove mbin_rsi, mbin_rbx
;; AVX512 capable: prefer the CNL SHANI variant (%8) when SHA is present
491 test ebx, FLAG_CPUID7_EBX_SHA
492 lea mbin_rbx, [%8 WRT_OPT] ; SHANI opt func for CNL
493 cmovne mbin_rsi, mbin_rbx
501 mov [%1_dispatched], mbin_rsi
;; non-AVX512 path: prefer the GLM SHANI variant (%7) when SHA is present
509 test ebx, FLAG_CPUID7_EBX_SHA
510 lea mbin_rbx, [%7 WRT_OPT] ; SHANI opt sse func for GLM
511 cmovne mbin_rsi, mbin_rbx
512 jmp _%1_init_done ; end
517 %endif ; ifndef _MULTIBINARY_ASM_