; Provenance (recovered from git web-viewer page chrome):
;   repo:   git.proxmox.com - ceph.git
;   path:   ceph/src/erasure-code/isa/isa-l/erasure_code/gf_vect_dot_prod_avx2.asm.s
;   commit: 7c673cae (FG) - "bump version to 12.0.3-pve3"
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;
;;; gf_vect_dot_prod_avx2(len, vec, *g_tbls, **buffs, *dest);
;;;

34%include "reg_sizes.asm"
35
36%ifidn __OUTPUT_FORMAT__, elf64
37 %define arg0 rdi
38 %define arg1 rsi
39 %define arg2 rdx
40 %define arg3 rcx
41 %define arg4 r8
42 %define arg5 r9
43
44 %define tmp r11
45 %define tmp.w r11d
46 %define tmp.b r11b
47 %define tmp2 r10
48 %define tmp3 r9
49 %define return rax
50 %macro SLDR 2
51 %endmacro
52 %define SSTR SLDR
53 %define PS 8
54 %define func(x) x:
55 %define FUNC_SAVE
56 %define FUNC_RESTORE
57%endif
58
59%ifidn __OUTPUT_FORMAT__, win64
60 %define arg0 rcx
61 %define arg1 rdx
62 %define arg2 r8
63 %define arg3 r9
64
65 %define arg4 r12 ; must be saved and loaded
66 %define tmp r11
67 %define tmp.w r11d
68 %define tmp.b r11b
69 %define tmp2 r10
70 %define tmp3 rdi ; must be saved and loaded
71 %define return rax
72 %macro SLDR 2
73 %endmacro
74 %define SSTR SLDR
75 %define PS 8
76 %define frame_size 2*8
77 %define arg(x) [rsp + frame_size + PS + PS*x]
78
79 %define func(x) proc_frame x
80 %macro FUNC_SAVE 0
81 rex_push_reg r12
82 push_reg rdi
83 end_prolog
84 mov arg4, arg(4)
85 %endmacro
86
87 %macro FUNC_RESTORE 0
88 pop rdi
89 pop r12
90 %endmacro
91%endif
92
93%ifidn __OUTPUT_FORMAT__, elf32
94
95;;;================== High Address;
96;;; arg4
97;;; arg3
98;;; arg2
99;;; arg1
100;;; arg0
101;;; return
102;;;<================= esp of caller
103;;; ebp
104;;;<================= ebp = esp
105;;; esi
106;;; edi
107;;; ebx
108;;;<================= esp of callee
109;;;
110;;;================== Low Address;
111
112 %define PS 4
113 %define LOG_PS 2
114 %define func(x) x:
115 %define arg(x) [ebp + PS*2 + PS*x]
116
117 %define trans ecx ;trans is for the variables in stack
118 %define arg0 trans
119 %define arg0_m arg(0)
120 %define arg1 trans
121 %define arg1_m arg(1)
122 %define arg2 arg2_m
123 %define arg2_m arg(2)
124 %define arg3 ebx
125 %define arg4 trans
126 %define arg4_m arg(4)
127 %define tmp edx
128 %define tmp.w edx
129 %define tmp.b dl
130 %define tmp2 edi
131 %define tmp3 esi
132 %define return eax
133 %macro SLDR 2 ;stack load/restore
134 mov %1, %2
135 %endmacro
136 %define SSTR SLDR
137
138 %macro FUNC_SAVE 0
139 push ebp
140 mov ebp, esp
141 push esi
142 push edi
143 push ebx
144 mov arg3, arg(3)
145 %endmacro
146
147 %macro FUNC_RESTORE 0
148 pop ebx
149 pop edi
150 pop esi
151 mov esp, ebp
152 pop ebp
153 %endmacro
154
155%endif ; output formats
156
157%define len arg0
158%define vec arg1
159%define mul_array arg2
160%define src arg3
161%define dest arg4
162
163%define vec_i tmp2
164%define ptr tmp3
165%define pos return
166
167%ifidn PS,4 ;32-bit code
168 %define vec_m arg1_m
169 %define len_m arg0_m
170 %define dest_m arg4_m
171%endif
172
173%ifndef EC_ALIGNED_ADDR
174;;; Use Un-aligned load/store
175 %define XLDR vmovdqu
176 %define XSTR vmovdqu
177%else
178;;; Use Non-temporal load/stor
179 %ifdef NO_NT_LDST
180 %define XLDR vmovdqa
181 %define XSTR vmovdqa
182 %else
183 %define XLDR vmovntdqa
184 %define XSTR vmovntdq
185 %endif
186%endif
187
188%ifidn PS,8 ;64-bit code
189 default rel
190 [bits 64]
191%endif
192
193section .text
194
195%define xmask0f ymm3
196%define xmask0fx xmm3
197%define xgft_lo ymm4
198%define xgft_hi ymm5
199
200%define x0 ymm0
201%define xtmpa ymm1
202%define xp ymm2
203
204align 16
205global gf_vect_dot_prod_avx2:function
206func(gf_vect_dot_prod_avx2)
207 FUNC_SAVE
208 SLDR len, len_m
209 sub len, 32
210 SSTR len_m, len
211 jl .return_fail
212 xor pos, pos
213 mov tmp.b, 0x0f
214 vpinsrb xmask0fx, xmask0fx, tmp.w, 0
215 vpbroadcastb xmask0f, xmask0fx ;Construct mask 0x0f0f0f...
216
217.loop32:
218 vpxor xp, xp
219 mov tmp, mul_array
220 xor vec_i, vec_i
221
222.next_vect:
223
224 mov ptr, [src+vec_i*PS]
225
226 vmovdqu xgft_lo, [tmp] ;Load array Cx{00}, Cx{01}, Cx{02}, ...
227 ; " Cx{00}, Cx{10}, Cx{20}, ... , Cx{f0}
228 vperm2i128 xgft_hi, xgft_lo, xgft_lo, 0x11 ; swapped to hi | hi
229 vperm2i128 xgft_lo, xgft_lo, xgft_lo, 0x00 ; swapped to lo | lo
230
231 XLDR x0, [ptr+pos] ;Get next source vector
232
233 add tmp, 32
234 add vec_i, 1
235
236 vpand xtmpa, x0, xmask0f ;Mask low src nibble in bits 4-0
237 vpsraw x0, x0, 4 ;Shift to put high nibble into bits 4-0
238 vpand x0, x0, xmask0f ;Mask high src nibble in bits 4-0
239
240 vpshufb xgft_hi, xgft_hi, x0 ;Lookup mul table of high nibble
241 vpshufb xgft_lo, xgft_lo, xtmpa ;Lookup mul table of low nibble
242 vpxor xgft_hi, xgft_hi, xgft_lo ;GF add high and low partials
243 vpxor xp, xp, xgft_hi ;xp += partial
244
245 SLDR vec, vec_m
246 cmp vec_i, vec
247 jl .next_vect
248
249 SLDR dest, dest_m
250 XSTR [dest+pos], xp
251
252 add pos, 32 ;Loop on 32 bytes at a time
253 SLDR len, len_m
254 cmp pos, len
255 jle .loop32
256
257 lea tmp, [len + 32]
258 cmp pos, tmp
259 je .return_pass
260
261 ;; Tail len
262 mov pos, len ;Overlapped offset length-32
263 jmp .loop32 ;Do one more overlap pass
264
265.return_pass:
266 mov return, 0
267 FUNC_RESTORE
268 ret
269
270.return_fail:
271 mov return, 1
272 FUNC_RESTORE
273 ret
274
275endproc_frame
276
277section .data
278
279;;; func core, ver, snum
280slversion gf_vect_dot_prod_avx2, 04, 05, 0190