1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
30 ; uint32_t adler32_avx2_4(uint32_t init, const unsigned char *buf, uint64_t len)
; BASE is 65521, the largest prime below 2^16.
; NOTE(review): presumably the divisor loaded into ecx for the `div ecx`
; reductions further down -- the load itself is not visible in this excerpt.
33 %define BASE 0xFFF1 ; 65521
; CHUNKSIZE is defined outside this excerpt. CHUNKSIZE_M1 is subtracted from
; and later added back to the `end` pointer around the vector loop.
36 %define CHUNKSIZE_M1 (CHUNKSIZE-1)
38 %include "reg_sizes.asm"
43 ; need to keep free: eax, ecx, edx (div ecx uses edx:eax as dividend, ecx as divisor)
; ---- ELF64 output format -------------------------------------------------
45 %ifidn __OUTPUT_FORMAT__, elf64
; func(x) emits the entry label followed by `endbranch`.
; NOTE(review): `endbranch` is not defined in this excerpt -- presumably a
; macro pulled in through reg_sizes.asm; confirm.
58 %define func(x) x: endbranch
; ---- win64 output format -------------------------------------------------
69 %ifidn __OUTPUT_FORMAT__, win64
; Frame layout: two 16-byte slots (xmm6, xmm7) followed by five 8-byte slots,
; of which the visible code uses four (rdi, rsi, r12, r13).
; NOTE(review): the "odd multiple of 8" rule presumably exists because rsp is
; 8 mod 16 at function entry, so subtracting an odd multiple of 8 leaves rsp
; 16-byte aligned, as the vmovdqa accesses to [rsp + n*16] require.
82 %define stack_size 2*16 + 5*8 ; must be an odd multiple of 8
; arg(x) addresses a slot above this function's own frame.
; NOTE(review): PS is defined elsewhere -- presumably the pointer size, with
; the extra "+ PS" skipping the return address; confirm.
83 %define arg(x) [rsp + stack_size + PS + PS*x]
; proc_frame / alloc_stack / save_reg are not defined in this excerpt.
84 %define func(x) proc_frame x
; Prologue fragment: reserve the frame, then spill the registers that the
; Microsoft x64 ABI treats as callee-saved (xmm6, xmm7, rdi, rsi, r12, r13).
86 alloc_stack stack_size
87 vmovdqa [rsp + 0*16], xmm6
88 vmovdqa [rsp + 1*16], xmm7
89 save_reg rdi, 2*16 + 0*8
90 save_reg rsi, 2*16 + 1*8
91 save_reg r12, 2*16 + 2*8
92 save_reg r13, 2*16 + 3*8
; The first integer argument arrives in rcx on win64; copy it out because
; ecx has to stay free for `div ecx`.
94 mov init_d, ecx ; initialize init_d from arg1 to keep ecx free
; Epilogue fragment: reload every register from the slot it was spilled to.
98 vmovdqa xmm6, [rsp + 0*16]
99 vmovdqa xmm7, [rsp + 1*16]
100 mov rdi, [rsp + 2*16 + 0*8]
101 mov rsi, [rsp + 2*16 + 1*8]
102 mov r12, [rsp + 2*16 + 2*8]
103 mov r13, [rsp + 2*16 + 3*8]
; adler32_avx2_4 -- only fragments of the routine are visible in this excerpt.
; Labels, branches and the register-alias defines (data, size, s, end, ya, yb,
; ysa, ydata*, yshuf*, xtmp*, xsa, ...) fall in the missing lines.
; mk_global is defined outside this excerpt; it is invoked here with the
; symbol name and the `function` type.
130 mk_global adler32_avx2_4, function
; Load the two vpshufb control masks once, before the data loop.
; vmovdqa is an aligned load, so SHUF0 and SHUF1 must be aligned to the vector
; width (NOTE(review): their alignment directive is not visible -- confirm).
134 vmovdqa yshuf0, [SHUF0]
135 vmovdqa yshuf1, [SHUF1]
150 cmova s, size ; s = min(size, LIMIT)
; Bias the end pointer down by CHUNKSIZE-1; the bias is undone after the loop.
; NOTE(review): presumably so that the loop compare (not visible) stops while
; a full chunk can still be read; confirm.
151 lea end, [data + s - CHUNKSIZE_M1]
; Load 16 input bytes and replicate them into both 128-bit lanes.
157 vbroadcastf128 ydata, [data]
; Shuffle the bytes with the first mask and add the result to ya as dwords.
159 vpshufb ydata0, ydata, yshuf0
160 vpaddd ya, ya, ydata0
; Same for the second mask.
162 vpshufb ydata1, ydata, yshuf1
163 vpaddd ya, ya, ydata1
; Undo the CHUNKSIZE_M1 bias applied before the loop: end = data_start + s.
169 add end, CHUNKSIZE_M1
174 ; either we're done, or we just did LIMIT
178 vpslld yb, 3 ; b is scaled by 8
; Multiply each dword lane of ya by the matching entry of the A_SCALE table.
179 vpmulld ysa, ya, [A_SCALE] ; scaled a
181 ; compute horizontal sums of ya, yb, ysa
; Step 1: pull the upper 128-bit half of each accumulator into an xmm temp.
182 vextracti128 xtmp0, ya, 1
183 vextracti128 xtmp1, yb, 1
184 vextracti128 xtmp2, ysa, 1
; Step 2: add the upper half to the lower half, then fold with two horizontal
; adds (the matching instructions for the other accumulators are not visible).
187 vpaddd xsa, xsa, xtmp2
190 vphaddd xsa, xsa, xsa
193 vphaddd xsa, xsa, xsa
; Reductions: the remainder is left in edx.
; NOTE(review): ecx presumably holds BASE here -- the load is not visible.
198 div ecx ; divide edx:eax by ecx, quot->eax, rem->edx
206 div ecx ; divide edx:eax by ecx, quot->eax, rem->edx
; End pointer for the whole input buffer.
225 lea end, [data + size]
230 ; handle remaining 1...15 bytes
; Same scale / horizontal-sum / reduce sequence as above, on the tail path.
233 vpslld yb, 3 ; b is scaled by 8
234 vpmulld ysa, ya, [A_SCALE] ; scaled a
236 vextracti128 xtmp0, ya, 1
237 vextracti128 xtmp1, yb, 1
238 vextracti128 xtmp2, ysa, 1
241 vpaddd xsa, xsa, xtmp2
244 vphaddd xsa, xsa, xsa
247 vphaddd xsa, xsa, xsa
; Scalar tail: fetch one input byte, zero-extended into eax.
256 movzx eax, byte[data]
267 div ecx ; divide edx:eax by ecx, quot->eax, rem->edx
273 div ecx ; divide edx:eax by ecx, quot->eax, rem->edx
; Constant tables. The labels fall in lines missing from this excerpt, so the
; label-to-table mapping noted below is an assumption to confirm.
;
; Eight dwords 0,1,2,3,4,5,6,7 in increasing address order (each dq stores its
; low dword first). NOTE(review): presumably A_SCALE, the vpmulld operand.
287 dq 0x0000000100000000, 0x0000000300000002
288 dq 0x0000000500000004, 0x0000000700000006
; vpshufb control bytes: each dword is (nn, FF, FF, FF) in memory order.
; Index nn copies source byte nn of the same 128-bit lane, and a control byte
; with bit 7 set (0xFF) writes zero, so each dword becomes a zero-extended
; input byte. These 32 bytes select bytes 0-3 (low lane) and 4-7 (high lane).
; NOTE(review): presumably SHUF0.
290 dq 0xFFFFFF01FFFFFF00, 0xFFFFFF03FFFFFF02
291 dq 0xFFFFFF05FFFFFF04, 0xFFFFFF07FFFFFF06
; Same pattern for bytes 8-11 (low lane) and 12-15 (high lane).
; NOTE(review): presumably SHUF1.
293 dq 0xFFFFFF09FFFFFF08, 0xFFFFFF0BFFFFFF0A
294 dq 0xFFFFFF0DFFFFFF0C, 0xFFFFFF0FFFFFFF0E