]> git.proxmox.com Git - ceph.git/blob - ceph/src/isa-l/igzip/adler32_sse.asm
Import ceph 15.2.8
[ceph.git] / ceph / src / isa-l / igzip / adler32_sse.asm
1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
3 ;
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
6 ; are met:
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
12 ; distribution.
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
16 ;
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29
30 ; uint32_t adler32_sse(uint32_t init, const unsigned char *buf, uint64_t len)
31
; LIMIT: largest number of input bytes folded into the running sums between
;        two "mod BASE" reductions (adler32_sse clamps each chunk to it).
;        NOTE(review): presumably sized so the 32-bit sums cannot overflow
;        before they are reduced -- confirm.
; BASE:  modulus applied to both running sums.
%define LIMIT   5552
%define BASE    0xFFF1                  ; 65521

%include "reg_sizes.asm"

default rel
[bits 64]

; The per-ABI register aliases below must leave eax, ecx and edx unassigned:
; the reductions use eax/edx as the implicit DIV operands and ecx as divisor.
41
%ifidn __OUTPUT_FORMAT__, elf64
; ---- elf64 builds: the three arguments arrive in rdi, rsi, rdx ----------
%define arg1    rdi                     ; init
%define arg2    rsi                     ; buffer pointer
%define arg3    rdx                     ; byte count

; Working registers used by adler32_sse.
%define data    r9                      ; read cursor into the buffer
%define end     r13                     ; loop bound for the current pass
%define size    r10                     ; bytes still to be consumed
%define s       r11                     ; length of the current chunk
%define init_d  edi                     ; init is used in place (low half of arg1)
%define a_d     r12d                    ; scalar low sum
%define b_d     r8d                     ; scalar high sum

; Plain label as the function entry point.
%define func(x) x:

; r12 and r13 are callee-saved, so preserve them around the body.
%macro FUNC_SAVE 0
        push    r12
        push    r13
%endmacro

; Undo FUNC_SAVE (pops in reverse push order).
%macro FUNC_RESTORE 0
        pop     r13
        pop     r12
%endmacro
%endif
65
66
%ifidn __OUTPUT_FORMAT__, win64
; ---- win64 builds: the three arguments arrive in rcx, rdx, r8 -----------
%define arg1    rcx                     ; init
%define arg2    rdx                     ; buffer pointer
%define arg3    r8                      ; byte count

; Working registers used by adler32_sse.
%define data    r9                      ; read cursor into the buffer
%define end     r13                     ; loop bound for the current pass
%define size    r10                     ; bytes still to be consumed
%define s       r11                     ; length of the current chunk
%define init_d  r12d                    ; copy of init (ecx must stay free)
%define a_d     esi                     ; scalar low sum
%define b_d     edi                     ; scalar high sum

; Four 8-byte save slots are used; the fifth keeps the total an odd
; multiple of 8.
; NOTE(review): presumably so rsp returns to 16-byte alignment -- confirm.
%define stack_size 5*8                  ; must be an odd multiple of 8
%define func(x) proc_frame x

; Save the callee-saved registers this routine overwrites (rdi, rsi, r12,
; r13). alloc_stack / save_reg / end_prolog are not defined in this file;
; presumably they come from reg_sizes.asm and also emit unwind data.
%macro FUNC_SAVE 0
        alloc_stack     stack_size
        save_reg        rdi, 0*8
        save_reg        rsi, 1*8
        save_reg        r12, 2*8
        save_reg        r13, 3*8
        end_prolog
        mov     init_d, ecx             ; initialize init_d from arg1 to keep ecx free
%endmacro

; Reload the saved registers and release the frame.
%macro FUNC_RESTORE 0
        mov     rdi, [rsp + 0*8]
        mov     rsi, [rsp + 1*8]
        mov     r12, [rsp + 2*8]
        mov     r13, [rsp + 3*8]
        add     rsp, stack_size
%endmacro
%endif
100
; Vector register roles inside adler32_sse (four dword lanes each).
%define xa      xmm0                    ; per-lane running byte sums
%define xb      xmm1                    ; per-lane running sums of xa
%define xdata0  xmm2                    ; input bytes 0..3, zero-extended to dwords
%define xdata1  xmm3                    ; input bytes 4..7, zero-extended to dwords
%define xsa     xmm4                    ; xa multiplied lane-wise by A_SCALE
106
;-----------------------------------------------------------------------
; ADLER32_SSE_HSUM
; Collapse the four dword lanes of xa and xb into scalars.
;   Out:   lane 0 of xa = sum of the xa lanes
;          lane 0 of xb = 4 * (sum of the xb lanes)
;                         - (0*xa[0] + 1*xa[1] + 2*xa[2] + 3*xa[3])
;   Clobb: xa, xb, xsa
; Why the correction: every vector step adds four bytes at once, so xb only
; advances once per four bytes (hence the *4) and the byte held in lane i
; ends up weighted i too high (hence the A_SCALE = {0,1,2,3} subtraction).
;-----------------------------------------------------------------------
%macro ADLER32_SSE_HSUM 0
        pslld   xb, 2                   ; b is scaled by 4
        movdqa  xsa, xa
        pmulld  xsa, [A_SCALE]          ; xsa[i] = i * xa[i]

        phaddd  xa, xa
        phaddd  xb, xb
        phaddd  xsa, xsa
        phaddd  xa, xa                  ; second pass leaves the full sum in lane 0
        phaddd  xb, xb
        phaddd  xsa, xsa
        psubd   xb, xsa
%endmacro

;-----------------------------------------------------------------------
; ADLER32_SSE_MOD_BASE
;   In:    eax = unsigned 32-bit value
;   Out:   edx = eax mod BASE (eax receives the quotient)
;   Clobb: eax, ecx, edx, flags
;-----------------------------------------------------------------------
%macro ADLER32_SSE_MOD_BASE 0
        xor     edx, edx                ; dividend is edx:eax, so clear the high half
        mov     ecx, BASE
        div     ecx
%endmacro

;-----------------------------------------------------------------------
; uint32_t adler32_sse(uint32_t init, const unsigned char *buf, uint64_t len)
;   In:    arg1 = init  (low 16 bits seed the a sum, high 16 bits the b sum)
;          arg2 = buf
;          arg3 = len
;   Out:   eax  = (b << 16) | a, both sums reduced mod BASE
;   Uses:  pmovzxbd / pmulld (SSE4.1) and phaddd (SSSE3)
;-----------------------------------------------------------------------
global adler32_sse:ISAL_SYM_TYPE_FUNCTION
func(adler32_sse)
        FUNC_SAVE

        mov     data, arg2
        mov     size, arg3

        ; Split init into its two 16-bit halves.
        mov     b_d, init_d
        shr     b_d, 16
        and     init_d, 0xFFFF

        ; Inputs shorter than 32 bytes never touch the vector path.
        cmp     size, 32
        jb      .small_input

        movd    xa, init_d              ; lane 0 = a seed, other lanes = 0
        pxor    xb, xb

        ; ---- one pass per chunk of at most LIMIT bytes ----
.chunk_loop:
        mov     s, LIMIT
        cmp     s, size
        cmova   s, size                 ; s = min(size, LIMIT)
        lea     end, [data + s - 7]     ; vector loop runs while 8 whole bytes remain
        cmp     data, end
        jae     .vec_done
        align   32
.vec_loop:
        ; Consume 8 bytes as two 4-lane steps; each step is a += bytes, b += a.
        pmovzxbd xdata0, [data]
        pmovzxbd xdata1, [data + 4]
        add     data, 8
        paddd   xa, xdata0
        paddd   xb, xa
        paddd   xa, xdata1
        paddd   xb, xa
        cmp     data, end
        jb      .vec_loop

.vec_done:
        add     end, 7                  ; end = first byte past this chunk

        ; LIMIT is a multiple of 8, so a non-zero remainder means this chunk
        ; is the last one and 1..7 bytes are left for the scalar loop.
        test    s, 7
        jnz     .tail_reduce

        ; Chunk was a whole number of 8-byte groups: either we're done, or
        ; we just did LIMIT.
        sub     size, s

        ADLER32_SSE_HSUM

        movd    eax, xa
        ADLER32_SSE_MOD_BASE
        mov     a_d, edx                ; a = (sum of xa lanes) mod BASE

        movd    eax, xb
        add     eax, b_d                ; fold in b carried from before this chunk
        ADLER32_SSE_MOD_BASE
        mov     b_d, edx

        test    size, size
        jz      .pack_result

        ; More input: reseed the vector sums from the reduced scalars.
        movd    xa, a_d
        pxor    xb, xb
        jmp     .chunk_loop

.pack_result:
        ; Both sums are already reduced; combine them into the return value.
        mov     eax, b_d
        shl     eax, 16
        or      eax, a_d
        jmp     .return

        ; ---- len < 32: scalar only ----
.small_input:
        mov     a_d, init_d
        lea     end, [data + size]
        test    size, size
        jz      .fold_scalar            ; empty input: just reduce the seeds
        jmp     .byte_loop

        ; ---- last chunk left 1..7 bytes: move the vector sums to scalars ----
.tail_reduce:
        ADLER32_SSE_HSUM                ; no mod BASE here; done after the byte loop

        movd    a_d, xa
        movd    eax, xb
        add     b_d, eax

        align   32
.byte_loop:
        ; One byte per iteration: a += byte, b += a.
        movzx   eax, byte[data]
        add     a_d, eax
        inc     data
        add     b_d, a_d
        cmp     data, end
        jb      .byte_loop

.fold_scalar:
        ; Reduce both scalar sums and build (b << 16) | a in eax.
        mov     eax, a_d
        ADLER32_SSE_MOD_BASE
        mov     a_d, edx

        mov     eax, b_d
        ADLER32_SSE_MOD_BASE
        shl     edx, 16
        or      edx, a_d
        mov     eax, edx

.return:
        FUNC_RESTORE
        ret

endproc_frame
245
section .data
align 32
; Per-lane dword multipliers {0, 1, 2, 3}; adler32_sse applies them to xa
; with pmulld before subtracting the result from the scaled b sum.
A_SCALE:
        dd      0, 1, 2, 3