;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%include "options.asm"

extern	pshufb_shf_table

%define	vmovntdqa	vmovdqa
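;; Note on the define above: vmovntdqa is the non-temporal (streaming) load,
;; which expects 16-byte aligned memory and bypasses the cache hierarchy.
;; Aliasing it to a plain vmovdqa lets the copy loops below spell every load
;; as vmovntdqa while still getting ordinary cached loads; presumably this
;; define is only active for builds that do not want streaming loads (any
;; surrounding conditional is configured via options.asm).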
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; code for doing the CRC calculation as part of copy-in, using pclmulqdq

; "shift" 4 input registers down 4 places
; macro FOLD4	xmm0, xmm1, xmm2, xmm3, const, tmp0, tmp1
%macro FOLD4 7
%define	%%xmm0	%1	; xmm reg, in/out
%define	%%xmm1	%2	; xmm reg, in/out
%define	%%xmm2	%3	; xmm reg, in/out
%define	%%xmm3	%4	; xmm reg, in/out
%define	%%const	%5	; xmm reg, in
%define	%%tmp0	%6	; xmm reg, tmp
%define	%%tmp1	%7	; xmm reg, tmp
	vmovaps		%%tmp0, %%xmm0
	vmovaps		%%tmp1, %%xmm1

	vpclmulqdq	%%xmm0, %%const, 0x01
	vpclmulqdq	%%xmm1, %%const, 0x01

	vpclmulqdq	%%tmp0, %%const, 0x10
	vpclmulqdq	%%tmp1, %%const, 0x10

	vxorps		%%xmm0, %%tmp0		; xmm0 = fold of old xmm0
	vxorps		%%xmm1, %%tmp1		; xmm1 = fold of old xmm1

	vmovaps		%%tmp0, %%xmm2
	vmovaps		%%tmp1, %%xmm3

	vpclmulqdq	%%xmm2, %%const, 0x01
	vpclmulqdq	%%xmm3, %%const, 0x01

	vpclmulqdq	%%tmp0, %%const, 0x10
	vpclmulqdq	%%tmp1, %%const, 0x10

	vxorps		%%xmm2, %%tmp0		; xmm2 = fold of old xmm2
	vxorps		%%xmm3, %%tmp1		; xmm3 = fold of old xmm3
%endmacro
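;; FOLD4 advances the CRC state (x0..x3, the CRC spread over 64 bytes) by one
;; 64-byte block.  For each state register x and the fold-by-64-bytes
;; constant K (high:low qwords) it computes
;;
;;	x' = clmul(x_high, K_low) xor clmul(x_low, K_high)
;;
;; (imm 0x01 selects x_high * K_low, imm 0x10 selects x_low * K_high); the
;; caller then xors the next 64 data bytes into x0..x3 to complete the
;; update.  A minimal usage sketch, with purely illustrative register and
;; pointer choices (the "fold4" symbol name is assumed from the COPY_IN_CRC
;; comment below):
;;
;;	vmovdqa	xmm10, [fold4 WRT_OPT]	; fold-by-64-bytes constant
;;	FOLD4	xmm0, xmm1, xmm2, xmm3, xmm10, xmm8, xmm9
;;	vpxor	xmm0, [rsi + 0*16]	; mix in the next 64 input bytes
;;	vpxor	xmm1, [rsi + 1*16]
;;	vpxor	xmm2, [rsi + 2*16]
;;	vpxor	xmm3, [rsi + 3*16]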
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; "shift" 3 input registers down 4 places
; macro FOLD3	x0, x1, x2, x3, const, tmp0
%macro FOLD3 6
%define	%%x0	%1	; xmm reg, in/out
%define	%%x1	%2	; xmm reg, in/out
%define	%%x2	%3	; xmm reg, in/out
%define	%%x3	%4	; xmm reg, in/out
%define	%%const	%5	; xmm reg, in
%define	%%tmp0	%6	; xmm reg, tmp
	vmovaps		%%tmp0, %%x3		; save x3 (it becomes the new x0)

	vmovaps		%%x3, %%x2
	vpclmulqdq	%%x2, %%const, 0x01
	vpclmulqdq	%%x3, %%const, 0x10
	vxorps		%%x3, %%x2		; x3 = fold of old x2

	vmovaps		%%x2, %%x1
	vpclmulqdq	%%x1, %%const, 0x01
	vpclmulqdq	%%x2, %%const, 0x10
	vxorps		%%x2, %%x1		; x2 = fold of old x1

	vmovaps		%%x1, %%x0
	vpclmulqdq	%%x0, %%const, 0x01
	vpclmulqdq	%%x1, %%const, 0x10
	vxorps		%%x1, %%x0		; x1 = fold of old x0

	vmovaps		%%x0, %%tmp0		; x0 = old x3, unfolded
%endmacro
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; "shift" 2 input registers down 4 places
; macro FOLD2	x0, x1, x2, x3, const, tmp0
%macro FOLD2 6
%define	%%x0	%1	; xmm reg, in/out
%define	%%x1	%2	; xmm reg, in/out
%define	%%x2	%3	; xmm reg, in/out
%define	%%x3	%4	; xmm reg, in/out
%define	%%const	%5	; xmm reg, in
%define	%%tmp0	%6	; xmm reg, tmp
	vmovaps		%%tmp0, %%x3		; save x3 (it becomes the new x1)
	vmovaps		%%x3, %%x1
	vpclmulqdq	%%x1, %%const, 0x01
	vpclmulqdq	%%x3, %%const, 0x10
	vxorps		%%x3, %%x1		; x3 = fold of old x1
	vmovaps		%%x1, %%tmp0		; x1 = old x3, unfolded

	vmovaps		%%tmp0, %%x2		; save x2 (it becomes the new x0)
	vmovaps		%%x2, %%x0
	vpclmulqdq	%%x0, %%const, 0x01
	vpclmulqdq	%%x2, %%const, 0x10
	vxorps		%%x2, %%x0		; x2 = fold of old x0
	vmovaps		%%x0, %%tmp0		; x0 = old x2, unfolded
%endmacro
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; "shift" 1 input register down 4 places
; macro FOLD1	x0, x1, x2, x3, const, tmp0
%macro FOLD1 6
%define	%%x0	%1	; xmm reg, in/out
%define	%%x1	%2	; xmm reg, in/out
%define	%%x2	%3	; xmm reg, in/out
%define	%%x3	%4	; xmm reg, in/out
%define	%%const	%5	; xmm reg, in
%define	%%tmp0	%6	; xmm reg, tmp
	vmovaps		%%tmp0, %%x3		; save x3 (it becomes the new x2)

	vmovaps		%%x3, %%x0
	vpclmulqdq	%%x0, %%const, 0x01
	vpclmulqdq	%%x3, %%const, 0x10
	vxorps		%%x3, %%x0		; x3 = fold of old x0

	vmovaps		%%x0, %%x1		; x0 = old x1, unfolded
	vmovaps		%%x1, %%x2		; x1 = old x2, unfolded
	vmovaps		%%x2, %%tmp0		; x2 = old x3, unfolded
%endmacro
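;; In effect FOLD1/FOLD2/FOLD3 handle tails of 16/32/48 bytes: the 64-byte
;; state window slides forward by that amount, so the register(s) that fall
;; off the bottom are folded by the same 64-byte constant into the top
;; slot(s), the remaining registers simply move down, and the caller xors
;; the newly read data into the top 1/2/3 registers.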
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; macro PARTIAL_FOLD	x0, x1, x2, x3, xp, size, xfold, xt0, xt1, xt2, xt3, gtmp
;                 XP   X3   X2   X1   X0   tmp2
; Initial state   xI   HG   FE   DC   BA
; after shift          IH   GF   ED   CB   A0
; after fold           ff   GF   ED   CB        where ff = merge(IH, A0)
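;; In other words, for a partial block of 'size' bytes (1..15) the 64-byte
;; state window slides forward by 'size' bytes: each register drops its low
;; 'size' bytes, shifts the remaining bytes down, and takes the low 'size'
;; bytes of the next register on top; xp supplies the newest bytes for X3,
;; and the 'size' oldest bytes that fall off the bottom (A0) are folded by
;; the 64-byte constant and merged (xored) into X3.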
%macro PARTIAL_FOLD 12
%define	%%x0	%1	; xmm reg, in/out
%define	%%x1	%2	; xmm reg, in/out
%define	%%x2	%3	; xmm reg, in/out
%define	%%x3	%4	; xmm reg, in/out
%define	%%xp	%5	; xmm partial reg, in/clobbered
%define	%%size	%6	; GPR, in/clobbered (1...15)
%define	%%const	%7	; xmm reg, in
%define	%%shl	%8	; xmm reg, tmp
%define	%%shr	%9	; xmm reg, tmp
%define	%%tmp2	%10	; xmm reg, tmp
%define	%%tmp3	%11	; xmm reg, tmp
%define	%%gtmp	%12	; GPR, tmp
	; {XP X3 X2 X1 X0} = {xI HG FE DC BA}
	shl	%%size, 4			; size *= 16
	lea	%%gtmp, [pshufb_shf_table - 16 WRT_OPT]
	vmovdqa	%%shl, [%%gtmp + %%size]	; shl constant
	vmovdqa	%%shr, %%shl			; shr constant derived from shl
	vpxor	%%shr, [mask3 WRT_OPT]		; shr constant
	vmovdqa	%%tmp2, %%x0			; tmp2 = BA
	vpshufb	%%tmp2, %%shl			; tmp2 = A0

	vpshufb	%%x0, %%shr			; x0 = 0B
	vmovdqa	%%tmp3, %%x1			; tmp3 = DC
	vpshufb	%%tmp3, %%shl			; tmp3 = C0
	vpor	%%x0, %%tmp3			; x0 = CB

	vpshufb	%%x1, %%shr			; x1 = 0D
	vmovdqa	%%tmp3, %%x2			; tmp3 = FE
	vpshufb	%%tmp3, %%shl			; tmp3 = E0
	vpor	%%x1, %%tmp3			; x1 = ED

	vpshufb	%%x2, %%shr			; x2 = 0F
	vmovdqa	%%tmp3, %%x3			; tmp3 = HG
	vpshufb	%%tmp3, %%shl			; tmp3 = G0
	vpor	%%x2, %%tmp3			; x2 = GF

	vpshufb	%%x3, %%shr			; x3 = 0H
	vpshufb	%%xp, %%shl			; xp = I0
	vpor	%%x3, %%xp			; x3 = IH
	; fold A0 (tmp2) forward by 64 bytes and merge it into x3
	vmovaps		%%tmp3, %%tmp2
	vpclmulqdq	%%tmp2, %%const, 0x01
	vpclmulqdq	%%tmp3, %%const, 0x10
	vxorps		%%x3, %%tmp2
	vxorps		%%x3, %%tmp3		; x3 = merge(IH, fold(A0))
%endmacro
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; LOAD_FRACTIONAL_XMM: Packs an xmm register with data when the input is less than 16 bytes.
; Returns 0 if the data has length 0.
; Input: the input data (src) and that data's length (size).
; Output: the packed xmm register (xmm_out).
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro LOAD_FRACTIONAL_XMM 3
%define	%%xmm_out	%1	; %%xmm_out is an xmm register
%define	%%src		%2	; %%src is a GPR holding the input data pointer
%define	%%size		%3	; %%size is a GPR holding the data length (0...15)

	vpxor	%%xmm_out, %%xmm_out

	vpinsrq	%%xmm_out, [%%src], 0		; Read in 8 bytes if they exist

%%_byte_loop:					; Read in data 1 byte at a time while data is left
	vpinsrb	%%xmm_out, BYTE [%%src], 0

%endmacro ; LOAD_FRACTIONAL_XMM
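;; A minimal usage sketch (the register choices are illustrative only, not
;; mandated by this file):
;;
;;	mov	rax, 13				; 0 < length < 16
;;	LOAD_FRACTIONAL_XMM	xmm4, rsi, rax	; xmm4 = the 13 bytes at [rsi]
;;
;; COPY_IN_CRC below passes its in/clobbered %%src and %%size registers
;; here, so callers should not rely on those values afterwards.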
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; copy x bytes (rounded up to 16 bytes) from src to dst
; src & dst are unaligned
; macro COPY_IN_CRC	dst, src, size_in_bytes, tmp, x0, x1, x2, x3, xfold,
;			xt0, xt1, xt2, xt3, xt4
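;; Rough flow, inferred from the macro calls and labels below: an initial
;; unaligned 16-byte copy plus a PARTIAL_FOLD of those bytes aligns the
;; pointers for the main loop, the bulk is then copied 64 bytes per
;; iteration (four vmovntdqa loads, FOLD4, four vmovdqu stores), and the
;; 0..63-byte tail is finished with FOLD1/FOLD2/FOLD3 for whole 16-byte
;; blocks plus LOAD_FRACTIONAL_XMM and PARTIAL_FOLD for the final partial
;; block.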
%macro COPY_IN_CRC 14
%define	%%dst	%1	; reg, in/clobbered
%define	%%src	%2	; reg, in/clobbered
%define	%%size	%3	; reg, in/clobbered
%define	%%tmp	%4	; reg, tmp
%define	%%x0	%5	; xmm, in/out: crc state
%define	%%x1	%6	; xmm, in/out: crc state
%define	%%x2	%7	; xmm, in/out: crc state
%define	%%x3	%8	; xmm, in/out: crc state
%define	%%xfold	%9	; xmm, in: (loaded from fold4)
%define	%%xtmp0	%10	; xmm, tmp
%define	%%xtmp1	%11	; xmm, tmp
%define	%%xtmp2	%12	; xmm, tmp
%define	%%xtmp3	%13	; xmm, tmp
%define	%%xtmp4	%14	; xmm, tmp
	; need to align, tmp contains number of bytes to transfer
	vmovdqu	%%xtmp0, [%%src]
	vmovdqu	[%%dst], %%xtmp0

	PARTIAL_FOLD	%%x0, %%x1, %%x2, %%x3, %%xtmp0, %%tmp, %%xfold, \
			%%xtmp1, %%xtmp2, %%xtmp3, %%xtmp4, %%dst
	vmovntdqa	%%xtmp0, [%%src+0*16]
	vmovntdqa	%%xtmp1, [%%src+1*16]
	vmovntdqa	%%xtmp2, [%%src+2*16]

	FOLD4	%%x0, %%x1, %%x2, %%x3, %%xfold, %%xtmp3, %%xtmp4

	vmovntdqa	%%xtmp3, [%%src+3*16]

	vmovdqu	[%%dst+0*16], %%xtmp0
	vmovdqu	[%%dst+1*16], %%xtmp1
	vmovdqu	[%%dst+2*16], %%xtmp2
	vmovdqu	[%%dst+3*16], %%xtmp3
	; %%size contains (num bytes left - 64)
	jge	%%three_full_regs

%%no_full_regs:				; 0 <= %%size < 16, no full regs
	jz	%%done			; if no bytes left, we're done

	;; Handle case where input is <16 bytes
	jz	%%done			; if no bytes left, we're done

	vmovntdqa	%%xtmp0, [%%src+0*16]

	FOLD1	%%x0, %%x1, %%x2, %%x3, %%xfold, %%xtmp3

	vmovdqu	[%%dst+0*16], %%xtmp0

	jz	%%done			; if no bytes left, we're done

	vmovntdqa	%%xtmp0, [%%src+0*16]
	vmovntdqa	%%xtmp1, [%%src+1*16]

	FOLD2	%%x0, %%x1, %%x2, %%x3, %%xfold, %%xtmp3

	vmovdqu	[%%dst+0*16], %%xtmp0
	vmovdqu	[%%dst+1*16], %%xtmp1

	jz	%%done			; if no bytes left, we're done

	vmovntdqa	%%xtmp0, [%%src+0*16]
	vmovntdqa	%%xtmp1, [%%src+1*16]
	vmovntdqa	%%xtmp2, [%%src+2*16]

	FOLD3	%%x0, %%x1, %%x2, %%x3, %%xfold, %%xtmp3

	vmovdqu	[%%dst+0*16], %%xtmp0
	vmovdqu	[%%dst+1*16], %%xtmp1
	vmovdqu	[%%dst+2*16], %%xtmp2

	jz	%%done			; if no bytes left, we're done

	; fall through to %%partial
%%partial:				; 0 <= %%size < 16

	LOAD_FRACTIONAL_XMM	%%xtmp0, %%src, %%size

	vmovdqu	[%%dst], %%xtmp0

	PARTIAL_FOLD	%%x0, %%x1, %%x2, %%x3, %%xtmp0, %%tmp, %%xfold, \
			%%xtmp1, %%xtmp2, %%xtmp3, %%xtmp4, %%dst
;%assign D	8 * K		; Amount of history
;%assign LA	17 * 16		; Max look-ahead, rounded up to 32 byte boundary

; copy D + LA bytes from src to dst

;void copy_D_LA(uint8_t *dst, uint8_t *src);

; copy_D_LA	dst, src, tmp, xtmp0, xtmp1, xtmp2, xtmp3
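;; The copy below runs in 32-byte chunks: %%SIZE4 unrolled iterations of
;; 4 x 32 bytes, then the remaining (%%SIZE - 4*%%SIZE4) single 32-byte
;; chunks via %rep, then one trailing 16-byte chunk when D + LA leaves a
;; 16-byte remainder (%%MOD16 == 1).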
%define	%%dst	%1	; reg, clobbered
%define	%%src	%2	; reg, clobbered

%assign %%SIZE	(D + LA) / 32			; number of 32-byte blocks to be copied
%assign %%SIZE4	%%SIZE/4
%assign %%MOD16	((D + LA) - 32 * %%SIZE) / 16
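;; Worked example, using the commented values above and an assumed K = 1024:
;; D + LA = 8*1024 + 17*16 = 8464 = 264*32 + 16, so %%SIZE = 264,
;; %%SIZE4 = 66, and %%MOD16 = 1 (one trailing 16-byte copy).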
	lea	%%tmp, [%%dst + 4 * 32 * %%SIZE4]

	vmovdqu	%%ytmp0, [%%src]
	vmovdqu	%%ytmp1, [%%src + 1 * 32]
	vmovdqu	%%ytmp2, [%%src + 2 * 32]
	vmovdqu	%%ytmp3, [%%src + 3 * 32]
	vmovdqa	[%%dst], %%ytmp0
	vmovdqa	[%%dst + 1 * 32], %%ytmp1
	vmovdqa	[%%dst + 2 * 32], %%ytmp2
	vmovdqa	[%%dst + 3 * 32], %%ytmp3
%rep (%%SIZE - 4 * %%SIZE4)

	vmovdqu	%%ytmp0, [%%src + %%i*32]

	vmovdqu	%%ytmp1, [%%src + %%i*32]

	vmovdqu	%%ytmp2, [%%src + %%i*32]

	vmovdqu	%%ytmp3, [%%src + %%i*32]

%rep (%%SIZE - 4 * %%SIZE4)

	vmovdqa	[%%dst + %%i*32], %%ytmp0

	vmovdqa	[%%dst + %%i*32], %%ytmp1

	vmovdqa	[%%dst + %%i*32], %%ytmp2

	vmovdqa	[%%dst + %%i*32], %%ytmp3

	vmovdqu	%%xtmp0, [%%src + (%%SIZE - 4 * %%SIZE4)*32]
	vmovdqa	[%%dst + (%%SIZE - 4 * %%SIZE4)*32], %%xtmp0