; Source: ceph/src/spdk/isa-l/igzip/adler32_avx2_4.asm
; Imported from Ceph Pacific 16.2.2 (upstream: Intel ISA-L)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2017 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; uint32_t adler32_avx2(uint32_t init, const unsigned char *buf, uint64_t len)

; Max bytes that can be accumulated before a modulo-BASE reduction is
; required to avoid 32-bit overflow of the running sums (standard
; Adler-32 bound: 5552 = largest n with n*(n+1)/2*255 + (n+1)*(BASE-1) < 2^32).
%define LIMIT 5552
; Adler-32 modulus: 65521, the largest prime below 2^16.
%define BASE 0xFFF1 ; 65521

; Bytes consumed per inner-loop iteration (one 16-byte load, expanded
; into two groups of 8 dwords across the ymm lanes).
%define CHUNKSIZE 16
%define CHUNKSIZE_M1 (CHUNKSIZE-1)

%include "reg_sizes.asm"

default rel
[bits 64]

; need to keep free: eax, ecx, edx (used for the div-based reductions)
44
%ifidn __OUTPUT_FORMAT__, elf64
 ; System V AMD64: args arrive in rdi, rsi, rdx.
 %define arg1   rdi
 %define arg2   rsi
 %define arg3   rdx

 %define init_d edi            ; low 32 bits of arg1: incoming adler value
 %define data   r9             ; current read pointer into buf
 %define size   r10            ; bytes remaining
 %define s      r11            ; bytes to process this outer pass (<= LIMIT)
 %define a_d    r12d           ; scalar Adler "a" accumulator (callee-saved)
 %define b_d    r8d            ; scalar Adler "b" accumulator
 %define end    r13            ; loop end pointer (callee-saved)

 %define func(x) x:
 ; Only r12/r13 are callee-saved registers we touch; preserve them.
 %macro FUNC_SAVE 0
	push	r12
	push	r13
 %endmacro
 %macro FUNC_RESTORE 0
	pop	r13
	pop	r12
 %endmacro
%endif
68
%ifidn __OUTPUT_FORMAT__, win64
 ; Microsoft x64: args arrive in rcx, rdx, r8.
 %define arg1   rcx
 %define arg2   rdx
 %define arg3   r8

 %define init_d r12d           ; incoming adler value (moved out of ecx)
 %define data   r9             ; current read pointer into buf
 %define size   r10            ; bytes remaining
 %define s      r11            ; bytes to process this outer pass (<= LIMIT)
 %define a_d    esi            ; scalar Adler "a" accumulator (callee-saved)
 %define b_d    edi            ; scalar Adler "b" accumulator (callee-saved)
 %define end    r13            ; loop end pointer (callee-saved)

 ; 2 xmm save slots + 4 gpr save slots + 1 pad slot.
 %define stack_size  2*16 + 5*8         ; must be an odd multiple of 8
 %define arg(x)      [rsp + stack_size + PS + PS*x]
 %define func(x) proc_frame x
 ; Win64 callee-saved set we touch: rdi, rsi, r12, r13, xmm6, xmm7.
 %macro FUNC_SAVE 0
	alloc_stack	stack_size
	vmovdqa	[rsp + 0*16], xmm6
	vmovdqa	[rsp + 1*16], xmm7
	save_reg	rdi, 2*16 + 0*8
	save_reg	rsi, 2*16 + 1*8
	save_reg	r12, 2*16 + 2*8
	save_reg	r13, 2*16 + 3*8
	end_prolog
	mov	init_d, ecx	; initialize init_d from arg1 to keep ecx free
 %endmacro

 %macro FUNC_RESTORE 0
	vmovdqa	xmm6, [rsp + 0*16]
	vmovdqa	xmm7, [rsp + 1*16]
	mov	rdi, [rsp + 2*16 + 0*8]
	mov	rsi, [rsp + 2*16 + 1*8]
	mov	r12, [rsp + 2*16 + 2*8]
	mov	r13, [rsp + 2*16 + 3*8]
	add	rsp, stack_size
 %endmacro
%endif
107
; Vector register roles.
; NOTE: ydata aliases ysa, and ytmp0/ytmp1 alias ydata0/ydata1 — the
; temporaries are only live in the reduction phase, after the data
; registers are dead. Keep that in mind before reordering code.
%define ya     ymm0            ; 8 dword lanes of the "a" sum
%define yb     ymm1            ; 8 dword lanes of the "b" sum (scaled by 8)
%define ydata0 ymm2            ; bytes 0..7 expanded to dwords
%define ydata1 ymm3            ; bytes 8..15 expanded to dwords
%define ysa    ymm4            ; lane-index-scaled copy of ya (reduction)
%define ydata  ysa
%define ytmp0  ydata0
%define ytmp1  ydata1
%define ytmp2  ymm5
%define xa     xmm0            ; low 128 bits of the ymm counterparts
%define xb     xmm1
%define xtmp0  xmm2
%define xtmp1  xmm3
%define xsa    xmm4
%define xtmp2  xmm5
%define yshuf0 ymm6            ; byte->dword expansion mask, bytes 0..7
%define yshuf1 ymm7            ; byte->dword expansion mask, bytes 8..15
126
global adler32_avx2_4:ISAL_SYM_TYPE_FUNCTION

;-----------------------------------------------------------------------
; uint32_t adler32_avx2_4(uint32_t init, const unsigned char *buf,
;                         uint64_t len)
;
; AVX2 Adler-32: a = init.lo16 + sum(buf[i]); b = init.hi16 + sum of the
; running a values; result = (b % BASE) << 16 | (a % BASE).
; ABI:   SysV or Win64, selected by the FUNC_SAVE/FUNC_RESTORE macros.
; In:    arg1 = init, arg2 = buf, arg3 = len
; Out:   eax = adler32 checksum
; Clobb: eax, ecx, edx plus the vector regs aliased above; callee-saved
;        regs are preserved by FUNC_SAVE/FUNC_RESTORE.
; The vectorized "b" is accumulated 8 lanes at a time, so it is scaled
; by 8 and over-counts a by a per-lane index factor; both are corrected
; in the reduction (vpslld / A_SCALE / vpsubd).
;-----------------------------------------------------------------------
func(adler32_avx2_4)
	FUNC_SAVE

	vmovdqa	yshuf0, [SHUF0]
	vmovdqa	yshuf1, [SHUF1]

	mov	data, arg2
	mov	size, arg3

	; split init into b (high 16) and a (low 16)
	mov	b_d, init_d
	shr	b_d, 16
	and	init_d, 0xFFFF
	cmp	size, 32
	jb	.lt64
	vmovd	xa, init_d		; also zeroes upper lanes of ya
	vpxor	yb, yb, yb
.sloop1:
	; process at most LIMIT bytes before reducing mod BASE
	mov	s, LIMIT
	cmp	s, size
	cmova	s, size			; s = min(size, LIMIT)
	lea	end, [data + s - CHUNKSIZE_M1]
	cmp	data, end
	jae	.skip_loop_1a
align 32
.sloop1a:
	; do CHUNKSIZE adds: expand 16 bytes to 2x8 dwords and accumulate
	vbroadcastf128	ydata, [data]
	add	data, CHUNKSIZE
	vpshufb	ydata0, ydata, yshuf0
	vpaddd	ya, ya, ydata0
	vpaddd	yb, yb, ya
	vpshufb	ydata1, ydata, yshuf1
	vpaddd	ya, ya, ydata1
	vpaddd	yb, yb, ya
	cmp	data, end
	jb	.sloop1a

.skip_loop_1a:
	add	end, CHUNKSIZE_M1

	; leftover bytes (< CHUNKSIZE) are finished scalar in .do_final
	test	s, CHUNKSIZE_M1
	jnz	.do_final

	; either we're done, or we just did LIMIT
	sub	size, s

	; reduce
	vpslld	yb, 3			; b is scaled by 8 (8 lanes per add)
	vpmulld	ysa, ya, [A_SCALE]	; scaled a: lane-index over-count of b

	; compute horizontal sums of ya, yb, ysa
	vextracti128	xtmp0, ya, 1
	vextracti128	xtmp1, yb, 1
	vextracti128	xtmp2, ysa, 1
	vpaddd	xa, xa, xtmp0
	vpaddd	xb, xb, xtmp1
	vpaddd	xsa, xsa, xtmp2
	vphaddd	xa, xa, xa
	vphaddd	xb, xb, xb
	vphaddd	xsa, xsa, xsa
	vphaddd	xa, xa, xa
	vphaddd	xb, xb, xb
	vphaddd	xsa, xsa, xsa

	; a %= BASE
	vmovd	eax, xa
	xor	edx, edx
	mov	ecx, BASE
	div	ecx			; divide edx:eax by ecx, quot->eax, rem->edx
	mov	a_d, edx

	; b = (b - scaled_a + b_d) % BASE
	vpsubd	xb, xb, xsa
	vmovd	eax, xb
	add	eax, b_d
	xor	edx, edx
	mov	ecx, BASE
	div	ecx			; divide edx:eax by ecx, quot->eax, rem->edx
	mov	b_d, edx

	test	size, size
	jz	.finish

	; continue loop with reduced accumulators
	vmovd	xa, a_d
	vpxor	yb, yb
	jmp	.sloop1

.finish:
	mov	eax, b_d
	shl	eax, 16
	or	eax, a_d
	jmp	.end

.lt64:
	; short input: pure scalar path
	mov	a_d, init_d
	lea	end, [data + size]
	test	size, size
	jnz	.final_loop
	jmp	.zero_size

	; handle remaining 1...15 bytes
.do_final:
	; reduce vector accumulators into scalar a_d/b_d (same algebra as
	; the main reduction, but without the mod — .zero_size does that)
	vpslld	yb, 3			; b is scaled by 8
	vpmulld	ysa, ya, [A_SCALE]	; scaled a

	vextracti128	xtmp0, ya, 1
	vextracti128	xtmp1, yb, 1
	vextracti128	xtmp2, ysa, 1
	vpaddd	xa, xa, xtmp0
	vpaddd	xb, xb, xtmp1
	vpaddd	xsa, xsa, xtmp2
	vphaddd	xa, xa, xa
	vphaddd	xb, xb, xb
	vphaddd	xsa, xsa, xsa
	vphaddd	xa, xa, xa
	vphaddd	xb, xb, xb
	vphaddd	xsa, xsa, xsa
	vpsubd	xb, xb, xsa

	vmovd	a_d, xa
	vmovd	eax, xb
	add	b_d, eax

align 32
.final_loop:
	; scalar byte loop: a += *data++; b += a
	movzx	eax, byte[data]
	add	a_d, eax
	inc	data
	add	b_d, a_d
	cmp	data, end
	jb	.final_loop

.zero_size:
	; final mod-BASE of both halves, then pack (b << 16) | a
	mov	eax, a_d
	xor	edx, edx
	mov	ecx, BASE
	div	ecx			; divide edx:eax by ecx, quot->eax, rem->edx
	mov	a_d, edx

	mov	eax, b_d
	xor	edx, edx
	mov	ecx, BASE
	div	ecx			; divide edx:eax by ecx, quot->eax, rem->edx
	shl	edx, 16
	or	edx, a_d
	mov	eax, edx

.end:
	FUNC_RESTORE
	ret

endproc_frame
280
section .data
align 32
; Per-lane dword indices 0..7, used to cancel the lane-position
; over-count of "a" inside the vectorized "b" accumulator.
A_SCALE:
	dq	0x0000000100000000, 0x0000000300000002
	dq	0x0000000500000004, 0x0000000700000006
; vpshufb masks: zero-extend source bytes 0..7 (SHUF0) and 8..15
; (SHUF1) of each 128-bit lane into dwords (0xFF selects zero).
SHUF0:
	dq	0xFFFFFF01FFFFFF00, 0xFFFFFF03FFFFFF02
	dq	0xFFFFFF05FFFFFF04, 0xFFFFFF07FFFFFF06
SHUF1:
	dq	0xFFFFFF09FFFFFF08, 0xFFFFFF0BFFFFFF0A
	dq	0xFFFFFF0DFFFFFF0C, 0xFFFFFF0FFFFFFF0E
292