;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%ifndef BUFFER_UTILS
%define BUFFER_UTILS

%include "options.asm"

extern pshufb_shf_table
extern mask3

%ifdef FIX_CACHE_READ
%define movntdqa movdqa
%else
%macro prefetchnta 1
%endm
%endif
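; When FIX_CACHE_READ is defined, the movntdqa streaming loads used below
; assemble as plain movdqa loads; otherwise prefetchnta is turned into an
; empty macro so any prefetch hints expand to nothing.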

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; code for doing the CRC calculation as part of copy-in, using pclmulqdq

; "shift" 4 input registers down 4 places
; macro FOLD4  xmm0, xmm1, xmm2, xmm3, const, tmp0, tmp1
%macro FOLD4 7
%define %%xmm0  %1      ; xmm reg, in/out
%define %%xmm1  %2      ; xmm reg, in/out
%define %%xmm2  %3      ; xmm reg, in/out
%define %%xmm3  %4      ; xmm reg, in/out
%define %%const %5      ; xmm reg, in
%define %%tmp0  %6      ; xmm reg, tmp
%define %%tmp1  %7      ; xmm reg, tmp

        movaps    %%tmp0, %%xmm0
        movaps    %%tmp1, %%xmm1

        pclmulqdq %%xmm0, %%const, 0x01
        pclmulqdq %%xmm1, %%const, 0x01

        pclmulqdq %%tmp0, %%const, 0x10
        pclmulqdq %%tmp1, %%const, 0x10

        xorps     %%xmm0, %%tmp0
        xorps     %%xmm1, %%tmp1


        movaps    %%tmp0, %%xmm2
        movaps    %%tmp1, %%xmm3

        pclmulqdq %%xmm2, %%const, 0x01
        pclmulqdq %%xmm3, %%const, 0x01

        pclmulqdq %%tmp0, %%const, 0x10
        pclmulqdq %%tmp1, %%const, 0x10

        xorps     %%xmm2, %%tmp0
        xorps     %%xmm3, %%tmp1
%endm
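; FOLD4 advances the four-register CRC state over one 64-byte block: for each
; state register X it computes clmul(X.high64, const.low64) XOR
; clmul(X.low64, const.high64), using the pclmulqdq immediates 0x01 and 0x10 to
; select the qword halves. Illustrative invocation (a sketch mirroring the call
; made in COPY_IN_CRC below; the concrete registers are only an example):
;       FOLD4   xmm0, xmm1, xmm2, xmm3, xmm7, xmm4, xmm5
; with the fold-by-4 constant in xmm7, the CRC state in xmm0-xmm3, and
; xmm4/xmm5 free as temporaries.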

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; "shift" 3 input registers down 4 places
; macro FOLD3  x0, x1, x2, x3, const, tmp0
;      x0   x1   x2   x3
; In   A    B    C    D
; Out  D    A'   B'   C'
%macro FOLD3 6
%define %%x0    %1      ; xmm reg, in/out
%define %%x1    %2      ; xmm reg, in/out
%define %%x2    %3      ; xmm reg, in/out
%define %%x3    %4      ; xmm reg, in/out
%define %%const %5      ; xmm reg, in
%define %%tmp0  %6      ; xmm reg, tmp

        movdqa    %%tmp0, %%x3

        movaps    %%x3, %%x2
        pclmulqdq %%x2, %%const, 0x01
        pclmulqdq %%x3, %%const, 0x10
        xorps     %%x3, %%x2

        movaps    %%x2, %%x1
        pclmulqdq %%x1, %%const, 0x01
        pclmulqdq %%x2, %%const, 0x10
        xorps     %%x2, %%x1

        movaps    %%x1, %%x0
        pclmulqdq %%x0, %%const, 0x01
        pclmulqdq %%x1, %%const, 0x10
        xorps     %%x1, %%x0

        movdqa    %%x0, %%tmp0
%endm
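; FOLD3 folds x0-x2 by one 16-byte position each and rotates the unmodified
; contents of x3 around into x0, giving the In/Out pattern documented above.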

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; "shift" 2 input registers down 4 places
; macro FOLD2  x0, x1, x2, x3, const, tmp0
;      x0   x1   x2   x3
; In   A    B    C    D
; Out  C    D    A'   B'
%macro FOLD2 6
%define %%x0    %1      ; xmm reg, in/out
%define %%x1    %2      ; xmm reg, in/out
%define %%x2    %3      ; xmm reg, in/out
%define %%x3    %4      ; xmm reg, in/out
%define %%const %5      ; xmm reg, in
%define %%tmp0  %6      ; xmm reg, tmp

        movdqa    %%tmp0, %%x3

        movaps    %%x3, %%x1
        pclmulqdq %%x1, %%const, 0x01
        pclmulqdq %%x3, %%const, 0x10
        xorps     %%x3, %%x1

        movdqa    %%x1, %%tmp0
        movdqa    %%tmp0, %%x2

        movaps    %%x2, %%x0
        pclmulqdq %%x0, %%const, 0x01
        pclmulqdq %%x2, %%const, 0x10
        xorps     %%x2, %%x0

        movdqa    %%x0, %%tmp0
%endm
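; FOLD2 folds only A and B (into x2/x3); C and D rotate down to x0/x1
; unchanged, matching the Out row C D A' B' above.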

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; "shift" 1 input register down 4 places
; macro FOLD1  x0, x1, x2, x3, const, tmp0
;      x0   x1   x2   x3
; In   A    B    C    D
; Out  B    C    D    A'
%macro FOLD1 6
%define %%x0    %1      ; xmm reg, in/out
%define %%x1    %2      ; xmm reg, in/out
%define %%x2    %3      ; xmm reg, in/out
%define %%x3    %4      ; xmm reg, in/out
%define %%const %5      ; xmm reg, in
%define %%tmp0  %6      ; xmm reg, tmp

        movdqa    %%tmp0, %%x3

        movaps    %%x3, %%x0
        pclmulqdq %%x0, %%const, 0x01
        pclmulqdq %%x3, %%const, 0x10
        xorps     %%x3, %%x0

        movdqa    %%x0, %%x1
        movdqa    %%x1, %%x2
        movdqa    %%x2, %%tmp0
%endm
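; FOLD1 folds only A; B, C and D shift down one register, matching the Out row
; B C D A' above.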

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; macro PARTIAL_FOLD  x0, x1, x2, x3, xp, size, xfold, xt0, xt1, xt2, xt3, gtmp

;                 XP   X3   X2   X1   X0   tmp2
; Initial state   xI   HG   FE   DC   BA
; after shift          IH   GF   ED   CB   A0
; after fold           ff   GF   ED   CB   ff = merge(IH, A0)
;
%macro PARTIAL_FOLD 12
%define %%x0    %1      ; xmm reg, in/out
%define %%x1    %2      ; xmm reg, in/out
%define %%x2    %3      ; xmm reg, in/out
%define %%x3    %4      ; xmm reg, in/out
%define %%xp    %5      ; xmm partial reg, in/clobbered
%define %%size  %6      ; GPR, in/clobbered (1...15)
%define %%const %7      ; xmm reg, in
%define %%shl   %8      ; xmm reg, tmp
%define %%shr   %9      ; xmm reg, tmp
%define %%tmp2  %10     ; xmm reg, tmp
%define %%tmp3  %11     ; xmm reg, tmp
%define %%gtmp  %12     ; GPR, tmp

        ; {XP X3 X2 X1 X0} = {xI HG FE DC BA}
        shl     %%size, 4                       ; size *= 16
        lea     %%gtmp, [pshufb_shf_table - 16 WRT_OPT]
        movdqa  %%shl, [%%gtmp + %%size]        ; shl constant
        movdqa  %%shr, %%shl
        pxor    %%shr, [mask3 WRT_OPT]          ; shr constant

        movdqa  %%tmp2, %%x0    ; tmp2 = BA
        pshufb  %%tmp2, %%shl   ; tmp2 = A0

        pshufb  %%x0, %%shr     ; x0 = 0B
        movdqa  %%tmp3, %%x1    ; tmp3 = DC
        pshufb  %%tmp3, %%shl   ; tmp3 = C0
        por     %%x0, %%tmp3    ; x0 = CB

        pshufb  %%x1, %%shr     ; x1 = 0D
        movdqa  %%tmp3, %%x2    ; tmp3 = FE
        pshufb  %%tmp3, %%shl   ; tmp3 = E0
        por     %%x1, %%tmp3    ; x1 = ED

        pshufb  %%x2, %%shr     ; x2 = 0F
        movdqa  %%tmp3, %%x3    ; tmp3 = HG
        pshufb  %%tmp3, %%shl   ; tmp3 = G0
        por     %%x2, %%tmp3    ; x2 = GF

        pshufb  %%x3, %%shr     ; x3 = 0H
        pshufb  %%xp, %%shl     ; xp = I0
        por     %%x3, %%xp      ; x3 = IH

        ; fold tmp2 into X3
        movaps    %%tmp3, %%tmp2
        pclmulqdq %%tmp2, %%const, 0x01
        pclmulqdq %%tmp3, %%const, 0x10
        xorps     %%x3, %%tmp2
        xorps     %%x3, %%tmp3
%endm
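; PARTIAL_FOLD handles a trailing block of 1..15 bytes: it loads a byte-shift
; mask from pshufb_shf_table, derives the matching right-shift mask by XORing
; with mask3, shifts the whole five-register window down by %%size bytes, and
; folds the bytes shifted out of x0 (the A0 value in the table above) back
; into x3 with one pclmulqdq pair.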


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; LOAD_FRACTIONAL_XMM: Packs an xmm register with data when the input is less than 16 bytes.
; Returns an all-zero register if the data has length 0.
; Input: the input data (src) and that data's length (size).
; Output: the packed xmm register (xmm_out).
; size is clobbered.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro LOAD_FRACTIONAL_XMM 3
%define %%xmm_out       %1 ; %%xmm_out is an xmm register
%define %%src           %2
%define %%size          %3

        pxor    %%xmm_out, %%xmm_out

        cmp     %%size, 0
        je      %%_done

        add     %%src, %%size

        cmp     %%size, 8
        jl      %%_byte_loop

        sub     %%src, 8
        pinsrq  %%xmm_out, [%%src], 0   ; read in 8 bytes if they exist
        sub     %%size, 8

        je      %%_done

%%_byte_loop:                   ; read in data 1 byte at a time while data is left
        pslldq  %%xmm_out, 1

        dec     %%src
        pinsrb  %%xmm_out, BYTE [%%src], 0
        dec     %%size

        jg      %%_byte_loop

%%_done:

%endmacro ; LOAD_FRACTIONAL_XMM
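; Illustrative invocation (a sketch; the concrete registers are only an
; example): load the final rax bytes (0 <= rax < 16) at rsi into xmm1:
;       LOAD_FRACTIONAL_XMM     xmm1, rsi, rax
; The bytes land in the low lanes of xmm1 in memory order (as if loaded with
; movdqu and zero-padded); rax is left at zero and rsi ends up back at its
; starting value.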

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; copy x bytes (rounded up to 16 bytes) from src to dst
; src & dst are unaligned
; macro COPY_IN_CRC  dst, src, size_in_bytes, tmp, x0, x1, x2, x3, xfold,
;                    xt0, xt1, xt2, xt3, xt4
%macro COPY_IN_CRC 14
%define %%dst    %1     ; reg, in/clobbered
%define %%src    %2     ; reg, in/clobbered
%define %%size   %3     ; reg, in/clobbered
%define %%tmp    %4     ; reg, tmp
%define %%x0     %5     ; xmm, in/out: crc state
%define %%x1     %6     ; xmm, in/out: crc state
%define %%x2     %7     ; xmm, in/out: crc state
%define %%x3     %8     ; xmm, in/out: crc state
%define %%xfold  %9     ; xmm, in: (loaded from fold4)
%define %%xtmp0  %10    ; xmm, tmp
%define %%xtmp1  %11    ; xmm, tmp
%define %%xtmp2  %12    ; xmm, tmp
%define %%xtmp3  %13    ; xmm, tmp
%define %%xtmp4  %14    ; xmm, tmp

        cmp     %%size, 16
        jl      %%lt_16

        ; align source
        xor     %%tmp, %%tmp
        sub     %%tmp, %%src
        and     %%tmp, 15
        jz      %%already_aligned

        ; need to align, tmp contains number of bytes to transfer
        movdqu  %%xtmp0, [%%src]
        movdqu  [%%dst], %%xtmp0
        add     %%dst, %%tmp
        add     %%src, %%tmp
        sub     %%size, %%tmp

%ifndef DEFLATE
        push    %%dst

        PARTIAL_FOLD %%x0, %%x1, %%x2, %%x3, %%xtmp0, %%tmp, %%xfold, \
                     %%xtmp1, %%xtmp2, %%xtmp3, %%xtmp4, %%dst
        pop     %%dst
%endif

%%already_aligned:
        sub     %%size, 64
        jl      %%end_loop
        jmp     %%loop
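        ; Main loop: stream in 64 bytes, fold the existing CRC state forward by
        ; 64 bytes (FOLD4), copy the data to dst, then XOR the fresh data into
        ; the state. When DEFLATE is defined the CRC steps compile out and only
        ; the copy remains.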
align 16
%%loop:
        movntdqa %%xtmp0, [%%src+0*16]
        movntdqa %%xtmp1, [%%src+1*16]
        movntdqa %%xtmp2, [%%src+2*16]

%ifndef DEFLATE
        FOLD4   %%x0, %%x1, %%x2, %%x3, %%xfold, %%xtmp3, %%xtmp4
%endif
        movntdqa %%xtmp3, [%%src+3*16]

        movdqu  [%%dst+0*16], %%xtmp0
        movdqu  [%%dst+1*16], %%xtmp1
        movdqu  [%%dst+2*16], %%xtmp2
        movdqu  [%%dst+3*16], %%xtmp3

%ifndef DEFLATE
        pxor    %%x0, %%xtmp0
        pxor    %%x1, %%xtmp1
        pxor    %%x2, %%xtmp2
        pxor    %%x3, %%xtmp3
%endif
        add     %%src, 4*16
        add     %%dst, 4*16
        sub     %%size, 4*16
        jge     %%loop

%%end_loop:
        ; %%size contains (num bytes left - 64)
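        ; each add of 16 below classifies how many full 16-byte blocks remain
        ; (3, 2, 1 or 0) and sets the flags for the following jge/jz branches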
        add     %%size, 16
        jge     %%three_full_regs
        add     %%size, 16
        jge     %%two_full_regs
        add     %%size, 16
        jge     %%one_full_reg
        add     %%size, 16

%%no_full_regs:         ; 0 <= %%size < 16, no full regs
        jz      %%done          ; if no bytes left, we're done
        jmp     %%partial

        ;; Handle case where input is < 16 bytes
%%lt_16:
        test    %%size, %%size
        jz      %%done          ; if no bytes left, we're done
        jmp     %%partial


%%one_full_reg:
        movntdqa %%xtmp0, [%%src+0*16]

%ifndef DEFLATE
        FOLD1   %%x0, %%x1, %%x2, %%x3, %%xfold, %%xtmp3
%endif
        movdqu  [%%dst+0*16], %%xtmp0

%ifndef DEFLATE
        pxor    %%x3, %%xtmp0
%endif
        test    %%size, %%size
        jz      %%done          ; if no bytes left, we're done

        add     %%dst, 1*16
        add     %%src, 1*16
        jmp     %%partial


%%two_full_regs:
        movntdqa %%xtmp0, [%%src+0*16]
        movntdqa %%xtmp1, [%%src+1*16]

%ifndef DEFLATE
        FOLD2   %%x0, %%x1, %%x2, %%x3, %%xfold, %%xtmp3
%endif
        movdqu  [%%dst+0*16], %%xtmp0
        movdqu  [%%dst+1*16], %%xtmp1

%ifndef DEFLATE
        pxor    %%x2, %%xtmp0
        pxor    %%x3, %%xtmp1
%endif
        test    %%size, %%size
        jz      %%done          ; if no bytes left, we're done

        add     %%dst, 2*16
        add     %%src, 2*16
        jmp     %%partial


%%three_full_regs:
        movntdqa %%xtmp0, [%%src+0*16]
        movntdqa %%xtmp1, [%%src+1*16]
        movntdqa %%xtmp2, [%%src+2*16]

%ifndef DEFLATE
        FOLD3   %%x0, %%x1, %%x2, %%x3, %%xfold, %%xtmp3
%endif
        movdqu  [%%dst+0*16], %%xtmp0
        movdqu  [%%dst+1*16], %%xtmp1
        movdqu  [%%dst+2*16], %%xtmp2

%ifndef DEFLATE
        pxor    %%x1, %%xtmp0
        pxor    %%x2, %%xtmp1
        pxor    %%x3, %%xtmp2
%endif
        test    %%size, %%size
        jz      %%done          ; if no bytes left, we're done

        add     %%dst, 3*16
        add     %%src, 3*16

        ; fall through to %%partial
%%partial:              ; 0 <= %%size < 16

%ifndef DEFLATE
        mov     %%tmp, %%size
%endif

        LOAD_FRACTIONAL_XMM %%xtmp0, %%src, %%size

        movdqu  [%%dst], %%xtmp0

%ifndef DEFLATE
        PARTIAL_FOLD %%x0, %%x1, %%x2, %%x3, %%xtmp0, %%tmp, %%xfold, \
                     %%xtmp1, %%xtmp2, %%xtmp3, %%xtmp4, %%dst
%endif

%%done:
%endm
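; Illustrative invocation (a sketch; the register assignments are only an
; example): copy rcx bytes from rsi to rdi while folding them into the CRC
; state kept in xmm0-xmm3, with the fold-by-4 constant in xmm7, rax as a
; scratch GPR and xmm8-xmm12 as scratch xmm registers:
;       COPY_IN_CRC     rdi, rsi, rcx, rax, xmm0, xmm1, xmm2, xmm3, xmm7, \
;                       xmm8, xmm9, xmm10, xmm11, xmm12
; dst, src, size and all scratch registers are clobbered; with DEFLATE defined
; the same call performs only the copy.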


;%assign K   1024;
;%assign D    8 * K;    ; Amount of history
;%assign LA  17 * 16;   ; Max look-ahead, rounded up to 32 byte boundary

; copy D + LA bytes from src to dst
; dst is aligned
;void copy_D_LA(uint8_t *dst, uint8_t *src);
; arg 1: rcx : dst
; arg 2: rdx : src
; copy_D_LA  dst, src, tmp, xtmp0, xtmp1, xtmp2, xtmp3
%macro copy_D_LA 7
%define %%dst    %1     ; reg, clobbered
%define %%src    %2     ; reg, clobbered
%define %%tmp    %3
%define %%xtmp0  %4
%define %%xtmp1  %5
%define %%xtmp2  %6
%define %%xtmp3  %7

%assign %%SIZE  (D + LA) / 16   ; number of DQ words to be copied
%assign %%SIZE4 %%SIZE/4

        lea     %%tmp, [%%dst + 4 * 16 * %%SIZE4]
        jmp     %%copy_D_LA_1
align 16
%%copy_D_LA_1:
        movdqu  %%xtmp0, [%%src]
        movdqu  %%xtmp1, [%%src+16]
        movdqu  %%xtmp2, [%%src+32]
        movdqu  %%xtmp3, [%%src+48]
        movdqa  [%%dst], %%xtmp0
        movdqa  [%%dst+16], %%xtmp1
        movdqa  [%%dst+32], %%xtmp2
        movdqa  [%%dst+48], %%xtmp3
        add     %%src, 4*16
        add     %%dst, 4*16
        cmp     %%dst, %%tmp
        jne     %%copy_D_LA_1
%assign %%i 0
%rep (%%SIZE - 4 * %%SIZE4)

%if (%%i == 0)
        movdqu  %%xtmp0, [%%src + %%i*16]
%elif (%%i == 1)
        movdqu  %%xtmp1, [%%src + %%i*16]
%elif (%%i == 2)
        movdqu  %%xtmp2, [%%src + %%i*16]
%elif (%%i == 3)
        movdqu  %%xtmp3, [%%src + %%i*16]
%else
        %error too many i
        % error
%endif

%assign %%i %%i+1
%endrep
%assign %%i 0
%rep (%%SIZE - 4 * %%SIZE4)

%if (%%i == 0)
        movdqa  [%%dst + %%i*16], %%xtmp0
%elif (%%i == 1)
        movdqa  [%%dst + %%i*16], %%xtmp1
%elif (%%i == 2)
        movdqa  [%%dst + %%i*16], %%xtmp2
%elif (%%i == 3)
        movdqa  [%%dst + %%i*16], %%xtmp3
%else
        %error too many i
        % error
%endif

%assign %%i %%i+1
%endrep
%endm
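; copy_D_LA copies D + LA bytes (a multiple of 16) in 64-byte chunks, then the
; %rep blocks above emit any remaining xmm-sized moves (up to 3) at assembly
; time. Loads are unaligned (movdqu) while stores assume a 16-byte aligned dst
; (movdqa), matching the "dst is aligned" note in the header comment.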
%endif