;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2017 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; uint32_t adler32_avx2_4(uint32_t init, const unsigned char *buf, uint64_t len)

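; Adler-32 keeps two 16-bit sums: a = 1 + (sum of all bytes) mod BASE and
; b = (sum of the successive values of a) mod BASE; the checksum packs
; them as (b << 16) | a.  A scalar sketch of what this routine computes
; (illustrative only, with the modulo deferred the same way the code
; below defers it):
;
;       a = init & 0xFFFF;  b = init >> 16;
;       for (i = 0; i < len; i++) {
;               a += buf[i];            // reduced mod BASE every LIMIT bytes
;               b += a;                 // likewise
;       }
;       return (b << 16) | a;
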
%define LIMIT 5552
%define BASE 0xFFF1 ; 65521
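; BASE is 65521, the largest prime smaller than 2^16.  LIMIT is the
; largest number of bytes that can be summed before the 32-bit b
; accumulator could overflow (the same NMAX = 5552 bound zlib uses),
; so a modulo reduction is performed at least every LIMIT bytes.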

%define CHUNKSIZE 16
%define CHUNKSIZE_M1 (CHUNKSIZE-1)
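; each inner-loop iteration consumes one CHUNKSIZE-byte chunk, processed
; as two groups of 8 bytes spread across the 8 dword lanes of a ymm register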

%include "reg_sizes.asm"

default rel
[bits 64]

; need to keep free: eax, ecx, edx
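; (they are used by the div-based modulo reductions: the dividend lives
; in edx:eax and the BASE divisor in ecx)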

%ifidn __OUTPUT_FORMAT__, elf64
 %define arg1   rdi
 %define arg2   rsi
 %define arg3   rdx

 %define init_d edi
 %define data   r9
 %define size   r10
 %define s      r11
 %define a_d    r12d
 %define b_d    r8d
 %define end    r13

 %define func(x) x: endbranch
 %macro FUNC_SAVE 0
        push    r12
        push    r13
 %endmacro
 %macro FUNC_RESTORE 0
        pop     r13
        pop     r12
 %endmacro
%endif

%ifidn __OUTPUT_FORMAT__, win64
 %define arg1   rcx
 %define arg2   rdx
 %define arg3   r8

 %define init_d r12d
 %define data   r9
 %define size   r10
 %define s      r11
 %define a_d    esi
 %define b_d    edi
 %define end    r13

 %define stack_size 2*16 + 5*8          ; must be an odd multiple of 8
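 ; (an odd multiple of 8 keeps rsp 16-byte aligned after alloc_stack:
 ; the return address leaves rsp at 8 mod 16 on entry, and the vmovdqa
 ; saves below require 16-byte alignment)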
 %define arg(x) [rsp + stack_size + PS + PS*x]
 %define func(x) proc_frame x
 %macro FUNC_SAVE 0
        alloc_stack stack_size
        vmovdqa [rsp + 0*16], xmm6
        vmovdqa [rsp + 1*16], xmm7
        save_reg rdi, 2*16 + 0*8
        save_reg rsi, 2*16 + 1*8
        save_reg r12, 2*16 + 2*8
        save_reg r13, 2*16 + 3*8
        end_prolog
        mov     init_d, ecx     ; initialize init_d from arg1 to keep ecx free
 %endmacro

 %macro FUNC_RESTORE 0
        vmovdqa xmm6, [rsp + 0*16]
        vmovdqa xmm7, [rsp + 1*16]
        mov     rdi, [rsp + 2*16 + 0*8]
        mov     rsi, [rsp + 2*16 + 1*8]
        mov     r12, [rsp + 2*16 + 2*8]
        mov     r13, [rsp + 2*16 + 3*8]
        add     rsp, stack_size
 %endmacro
%endif

%define ya     ymm0
%define yb     ymm1
%define ydata0 ymm2
%define ydata1 ymm3
%define ysa    ymm4
%define ydata  ysa
%define ytmp0  ydata0
%define ytmp1  ydata1
%define ytmp2  ymm5
%define xa     xmm0
%define xb     xmm1
%define xtmp0  xmm2
%define xtmp1  xmm3
%define xsa    xmm4
%define xtmp2  xmm5
%define yshuf0 ymm6
%define yshuf1 ymm7

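; ya/yb hold the eight per-lane partial a and b sums, ysa holds the
; lane-scaled a sums used when the lanes are combined, and yshuf0/yshuf1
; hold the byte-to-dword expansion masks loaded from SHUF0/SHUF1 below
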
[bits 64]
default rel
section .text

mk_global adler32_avx2_4, function
func(adler32_avx2_4)
        FUNC_SAVE

        vmovdqa yshuf0, [SHUF0]
        vmovdqa yshuf1, [SHUF1]
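        ; SHUF0 spreads bytes 0..7 of a 16-byte chunk into the eight dword
        ; lanes and SHUF1 does the same for bytes 8..15 (0xFF mask entries
        ; zero the upper three bytes of each dword)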

        mov     data, arg2
        mov     size, arg3

        mov     b_d, init_d
        shr     b_d, 16
        and     init_d, 0xFFFF
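        ; (the adler32 state packs b in the high 16 bits and a in the low
        ; 16 bits; b_d and init_d now hold the two running sums)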
        cmp     size, 32
        jb      .lt64
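        ; fewer than 32 bytes takes the scalar tail path; otherwise seed
        ; lane 0 of ya with the running a sum and clear the b lanes (the
        ; scalar b stays in b_d until the lanes are folded back together)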
        vmovd   xa, init_d
        vpxor   yb, yb, yb
.sloop1:
        mov     s, LIMIT
        cmp     s, size
        cmova   s, size         ; s = min(size, LIMIT)
        lea     end, [data + s - CHUNKSIZE_M1]
        cmp     data, end
        jae     .skip_loop_1a
align 32
.sloop1a:
        ; do CHUNKSIZE adds
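        ; broadcast one 16-byte chunk into both halves of ydata, widen
        ; bytes 0..7 and bytes 8..15 into dword lanes, and run the
        ; "a += byte; b += a" recurrence twice across all eight lanes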
        vbroadcastf128  ydata, [data]
        add     data, CHUNKSIZE
        vpshufb ydata0, ydata, yshuf0
        vpaddd  ya, ya, ydata0
        vpaddd  yb, yb, ya
        vpshufb ydata1, ydata, yshuf1
        vpaddd  ya, ya, ydata1
        vpaddd  yb, yb, ya
        cmp     data, end
        jb      .sloop1a

.skip_loop_1a:
        add     end, CHUNKSIZE_M1

        test    s, CHUNKSIZE_M1
        jnz     .do_final

        ; either we're done, or we just did LIMIT
        sub     size, s

        ; reduce
        vpslld  yb, 3                   ; b is scaled by 8
        vpmulld ysa, ya, [A_SCALE]      ; scaled a
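        ; lane i only ever sees the bytes at positions congruent to i mod 8,
        ; so the serial b equals 8*sum(b lanes) - sum(lane_index * a lane);
        ; the shift by 3 and the multiply by A_SCALE = {0,1,...,7} set up
        ; exactly that correction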

        ; compute horizontal sums of ya, yb, ysa
        vextracti128 xtmp0, ya, 1
        vextracti128 xtmp1, yb, 1
        vextracti128 xtmp2, ysa, 1
        vpaddd  xa, xa, xtmp0
        vpaddd  xb, xb, xtmp1
        vpaddd  xsa, xsa, xtmp2
        vphaddd xa, xa, xa
        vphaddd xb, xb, xb
        vphaddd xsa, xsa, xsa
        vphaddd xa, xa, xa
        vphaddd xb, xb, xb
        vphaddd xsa, xsa, xsa

        vmovd   eax, xa
        xor     edx, edx
        mov     ecx, BASE
        div     ecx             ; divide edx:eax by ecx, quot->eax, rem->edx
        mov     a_d, edx

        vpsubd  xb, xb, xsa
        vmovd   eax, xb
        add     eax, b_d
        xor     edx, edx
        mov     ecx, BASE
        div     ecx             ; divide edx:eax by ecx, quot->eax, rem->edx
        mov     b_d, edx

        test    size, size
        jz      .finish

        ; continue loop
        vmovd   xa, a_d
        vpxor   yb, yb
        jmp     .sloop1

.finish:
        mov     eax, b_d
        shl     eax, 16
        or      eax, a_d
        jmp     .end

.lt64:
        mov     a_d, init_d
        lea     end, [data + size]
        test    size, size
        jnz     .final_loop
        jmp     .zero_size

; handle remaining 1...15 bytes
.do_final:
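        ; the last chunk was partial, so fold the lanes down now with the
        ; same combine as above (no modulo yet; .zero_size reduces after
        ; the scalar tail loop finishes the remaining bytes)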
        ; reduce
        vpslld  yb, 3                   ; b is scaled by 8
        vpmulld ysa, ya, [A_SCALE]      ; scaled a

        vextracti128 xtmp0, ya, 1
        vextracti128 xtmp1, yb, 1
        vextracti128 xtmp2, ysa, 1
        vpaddd  xa, xa, xtmp0
        vpaddd  xb, xb, xtmp1
        vpaddd  xsa, xsa, xtmp2
        vphaddd xa, xa, xa
        vphaddd xb, xb, xb
        vphaddd xsa, xsa, xsa
        vphaddd xa, xa, xa
        vphaddd xb, xb, xb
        vphaddd xsa, xsa, xsa
        vpsubd  xb, xb, xsa

        vmovd   a_d, xa
        vmovd   eax, xb
        add     b_d, eax

align 32
.final_loop:
        movzx   eax, byte [data]
        add     a_d, eax
        inc     data
        add     b_d, a_d
        cmp     data, end
        jb      .final_loop

.zero_size:
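        ; final reduction and repack: eax = (b % BASE) << 16 | (a % BASE)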
        mov     eax, a_d
        xor     edx, edx
        mov     ecx, BASE
        div     ecx             ; divide edx:eax by ecx, quot->eax, rem->edx
        mov     a_d, edx

        mov     eax, b_d
        xor     edx, edx
        mov     ecx, BASE
        div     ecx             ; divide edx:eax by ecx, quot->eax, rem->edx
        shl     edx, 16
        or      edx, a_d
        mov     eax, edx

.end:
        FUNC_RESTORE
        ret

endproc_frame

section .data
align 32
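; A_SCALE holds the dword lane indices 0..7 used by the lane-combining
; correction; SHUF0/SHUF1 are the vpshufb masks that widen bytes 0..7 and
; bytes 8..15 of each 16-byte chunk to dwords (0xFF entries select zero)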
A_SCALE:
        dq      0x0000000100000000, 0x0000000300000002
        dq      0x0000000500000004, 0x0000000700000006
SHUF0:
        dq      0xFFFFFF01FFFFFF00, 0xFFFFFF03FFFFFF02
        dq      0xFFFFFF05FFFFFF04, 0xFFFFFF07FFFFFF06
SHUF1:
        dq      0xFFFFFF09FFFFFF08, 0xFFFFFF0BFFFFFF0A
        dq      0xFFFFFF0DFFFFFF0C, 0xFFFFFF0FFFFFFF0E