;; File: encode_df_04.asm — ceph/src/isa-l/igzip (vendored from Intel ISA-L,
;; imported with ceph 15.2.8).
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2018 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%include "reg_sizes.asm"
%include "lz0a_const.asm"
%include "data_struct2.asm"
%include "stdmac.asm"

;; ARCH 04 is the AVX2 build of this routine; USE_HSWNI enables the
;; Haswell new-instruction (BMI2) forms of SHLX/SHRX used below.
%define ARCH 04
%define USE_HSWNI
; tree entry is 4 bytes:
; lit/len tree (513 entries)
; |  3  |  2  |  1  |  0  |
; | len |       code      |
;
; dist tree
; |  3  |  2  |  1  |  0  |
; |eblen:codlen|   code   |

; token format:
; DIST_OFFSET:0 : lit/len
; 31:(DIST_OFFSET + 5) : dist Extra Bits
; (DIST_OFFSET + 5):DIST_OFFSET : dist code
; lit/len: 0-256 (literal)
;          257-512 (dist + 254)

; returns final token pointer
; equal to token_end if successful
; uint32_t* encode_df(uint32_t *token_start, uint32_t *token_end,
;                     BitBuf *out_buf, uint32_t *trees);

;; Per-ABI register assignments for the C argument registers and the
;; scratch roles used throughout the function.
%ifidn __OUTPUT_FORMAT__, win64
%define arg1		rcx
%define arg2		rdx
%define arg3		r8
%define arg4		r9
%define sym		rsi
%define dsym		rdi
%define hufftables	r9
%define ptr		r11
%else
; Linux (System V AMD64)
%define arg1		rdi
%define arg2		rsi
%define arg3		rdx
%define arg4		rcx
%define sym		r9
%define dsym		r8
%define hufftables	r11
%define ptr		rdi
%endif

%define in_buf_end	arg2
%define bitbuf		arg3
%define out_buf		bitbuf		; out_buf reuses bitbuf's register once loaded
; bit_count is rcx
%define bits		rax
%define data		r12
%define tmp		rbx
%define len		dsym
%define tmp2		r10
%define end_ptr		rbp

;; Masks extracting the lit/len symbol and the dist symbol from a token.
%define LIT_MASK	((0x1 << LIT_LEN_BIT_COUNT) - 1)
%define DIST_MASK	((0x1 << DIST_LIT_BIT_COUNT) - 1)

;; YMM register roles.  Note the deliberate aliases: syms shares ymm7 with
;; codes4 and dsyms shares ymm8 with code_lens4 — the symbol registers are
;; only live during the gather phase, the code_lens only in .long_codes.
%define codes1		ymm1
%define code_lens1	ymm2
%define codes2		ymm3
%define code_lens2	ymm4
%define codes3		ymm5
%define code_lens3	ymm6
%define codes4		ymm7
%define syms		ymm7

%define code_lens4	ymm8
%define dsyms		ymm8

%define ytmp		ymm9
%define codes_lookup1	ymm10
%define codes_lookup2	ymm11
%define datas		ymm12
%define ybits		ymm13
%define ybits_count	ymm14
%define yoffset_mask	ymm15

%define VECTOR_SIZE 0x20			; bytes of tokens consumed per iteration
%define VECTOR_LOOP_PROCESSED (2 * VECTOR_SIZE)	; lookahead: loop prefetches next vector
%define VECTOR_SLOP (0x20 - 8)			; output slack reserved for a full vector store

;; Stack frame layout (offsets from rsp after FUNC_SAVE).
gpr_save_mem_offset	equ 0
gpr_save_mem_size	equ 8 * 6
xmm_save_mem_offset	equ gpr_save_mem_offset + gpr_save_mem_size
xmm_save_mem_size	equ 10 * 16		; ten 16-byte slots for xmm6-xmm15 (win64)
bitbuf_mem_offset	equ xmm_save_mem_offset + xmm_save_mem_size
bitbuf_mem_size		equ 8
stack_size		equ gpr_save_mem_size + xmm_save_mem_size + bitbuf_mem_size
126
;; Save callee-saved registers used by this function.  On win64 this also
;; covers rsi/rdi and xmm6-xmm15 (callee-saved under the Microsoft x64 ABI).
;; NOTE(fix): each xmm slot must be 16 bytes wide — the previous 8-byte
;; stride made successive 16-byte MOVDQU stores overlap, corrupting the
;; saved xmm6-xmm14 on restore.  xmm_save_mem_size already reserves 10*16.
%macro FUNC_SAVE 0
	sub	rsp, stack_size
	mov	[rsp + gpr_save_mem_offset + 0*8], rbx
	mov	[rsp + gpr_save_mem_offset + 1*8], rbp
	mov	[rsp + gpr_save_mem_offset + 2*8], r12

%ifidn __OUTPUT_FORMAT__, win64
	mov	[rsp + gpr_save_mem_offset + 3*8], rsi
	mov	[rsp + gpr_save_mem_offset + 4*8], rdi

	MOVDQU	[rsp + xmm_save_mem_offset + 0*16], xmm6
	MOVDQU	[rsp + xmm_save_mem_offset + 1*16], xmm7
	MOVDQU	[rsp + xmm_save_mem_offset + 2*16], xmm8
	MOVDQU	[rsp + xmm_save_mem_offset + 3*16], xmm9
	MOVDQU	[rsp + xmm_save_mem_offset + 4*16], xmm10
	MOVDQU	[rsp + xmm_save_mem_offset + 5*16], xmm11
	MOVDQU	[rsp + xmm_save_mem_offset + 6*16], xmm12
	MOVDQU	[rsp + xmm_save_mem_offset + 7*16], xmm13
	MOVDQU	[rsp + xmm_save_mem_offset + 8*16], xmm14
	MOVDQU	[rsp + xmm_save_mem_offset + 9*16], xmm15
%endif

%endm

;; Restore everything FUNC_SAVE stored, using the same 16-byte xmm stride.
%macro FUNC_RESTORE 0
	mov	rbx, [rsp + gpr_save_mem_offset + 0*8]
	mov	rbp, [rsp + gpr_save_mem_offset + 1*8]
	mov	r12, [rsp + gpr_save_mem_offset + 2*8]

%ifidn __OUTPUT_FORMAT__, win64
	mov	rsi, [rsp + gpr_save_mem_offset + 3*8]
	mov	rdi, [rsp + gpr_save_mem_offset + 4*8]

	MOVDQU	xmm6, [rsp + xmm_save_mem_offset + 0*16]
	MOVDQU	xmm7, [rsp + xmm_save_mem_offset + 1*16]
	MOVDQU	xmm8, [rsp + xmm_save_mem_offset + 2*16]
	MOVDQU	xmm9, [rsp + xmm_save_mem_offset + 3*16]
	MOVDQU	xmm10, [rsp + xmm_save_mem_offset + 4*16]
	MOVDQU	xmm11, [rsp + xmm_save_mem_offset + 5*16]
	MOVDQU	xmm12, [rsp + xmm_save_mem_offset + 6*16]
	MOVDQU	xmm13, [rsp + xmm_save_mem_offset + 7*16]
	MOVDQU	xmm14, [rsp + xmm_save_mem_offset + 8*16]
	MOVDQU	xmm15, [rsp + xmm_save_mem_offset + 9*16]
%endif
	add	rsp, stack_size

%endmacro
174
;-----------------------------------------------------------------------
; uint32_t *encode_deflate_icf_04(token *ptr, token *end, BitBuf *bb,
;                                 uint32_t *trees)
; AVX2/BMI2 deflate token encoder: converts 4-byte icf tokens into
; Huffman-coded bits appended to the BitBuf.  Returns (in rax) the final
; token pointer — equal to token_end on full success, earlier on output
; overflow.  Register roles are defined by the %define block above.
;-----------------------------------------------------------------------
global encode_deflate_icf_ %+ ARCH
encode_deflate_icf_ %+ ARCH:
	FUNC_SAVE

%ifnidn ptr, arg1
	mov	ptr, arg1
%endif
%ifnidn hufftables, arg4
	mov	hufftables, arg4
%endif

	;; Unpack the BitBuf state into registers; the BitBuf pointer itself
	;; is spilled to the stack because its register is reused as out_buf.
	mov	[rsp + bitbuf_mem_offset], bitbuf
	mov	bits, [bitbuf + _m_bits]
	mov	ecx, [bitbuf + _m_bit_count]
	mov	end_ptr, [bitbuf + _m_out_end]
	mov	out_buf, [bitbuf + _m_out_buf]	; clobbers bitbuf

	sub	end_ptr, VECTOR_SLOP
	sub	in_buf_end, VECTOR_LOOP_PROCESSED
	cmp	ptr, in_buf_end
	jge	.finish

	;; Prime the pipeline: gather lit/len and dist codes for the first
	;; vector of 8 tokens (vpgatherdd consumes its mask, so it is
	;; re-materialized with vpcmpeqq before every gather).
	vpcmpeqq	ytmp, ytmp, ytmp
	vmovdqu	datas, [ptr]
	vpand	syms, datas, [lit_mask]
	vpgatherdd codes_lookup1, [hufftables + _lit_len_table + 4 * syms], ytmp

	vpcmpeqq	ytmp, ytmp, ytmp
	vpsrld	dsyms, datas, DIST_OFFSET
	vpand	dsyms, dsyms, [dist_mask]
	vpgatherdd codes_lookup2, [hufftables + _dist_table + 4 * dsyms], ytmp

	vmovq	ybits %+ x, bits
	vmovq	ybits_count %+ x, rcx
	vmovdqa	yoffset_mask, [offset_mask]

.main_loop:
	;; Sets codes1 to contain lit/len codes and code_lens1 the corresponding lengths
	vpsrld	code_lens1, codes_lookup1, 24
	vpand	codes1, codes_lookup1, [lit_icr_mask]

	;; Sets codes2 to contain dist codes, code_lens2 the corresponding lengths,
	;; and code_lens3 the extra bit counts
	vpblendw codes2, ybits, codes_lookup2, 0x55 ;Bits 8 and above of ybits are 0
	vpsrld	code_lens2, codes_lookup2, 24
	vpsrld	code_lens3, codes_lookup2, 16
	vpand	code_lens3, [eb_icr_mask]

	;; Set codes3 to contain the extra bits
	vpsrld	codes3, datas, EXTRA_BITS_OFFSET

	cmp	out_buf, end_ptr
	ja	.main_loop_exit

	;; Start code lookups for next iteration
	add	ptr, VECTOR_SIZE
	vpcmpeqq	ytmp, ytmp, ytmp
	vmovdqu	datas, [ptr]
	vpand	syms, datas, [lit_mask]
	vpgatherdd codes_lookup1, [hufftables + _lit_len_table + 4 * syms], ytmp

	vpcmpeqq	ytmp, ytmp, ytmp
	vpsrld	dsyms, datas, DIST_OFFSET
	vpand	dsyms, dsyms, [dist_mask]
	vpgatherdd codes_lookup2, [hufftables + _dist_table + 4 * dsyms], ytmp

	;; Merge dist code with extra bits
	vpsllvd	codes3, codes3, code_lens2
	vpxor	codes2, codes2, codes3
	vpaddd	code_lens2, code_lens2, code_lens3

	;; Check for long codes
	vpaddd	code_lens3, code_lens1, code_lens2
	vpcmpgtd	ytmp, code_lens3, [max_write_d]
	vptest	ytmp, ytmp
	jnz	.long_codes

	;; Merge dist and len codes
	vpsllvd	codes2, codes2, code_lens1
	vpxor	codes1, codes1, codes2

	;; Split buffer data into qwords, ytmp is 0 after last branch
	vpblendd codes3, ytmp, codes1, 0x55
	vpsrlq	codes1, codes1, 32
	vpsrlq	code_lens1, code_lens3, 32
	vpblendd code_lens3, ytmp, code_lens3, 0x55

	;; Merge bitbuf bits
	vpsllvq	codes3, codes3, ybits_count
	vpxor	codes3, codes3, ybits
	vpaddq	code_lens3, code_lens3, ybits_count

	;; Merge two symbols into qwords
	vpsllvq	codes1, codes1, code_lens3
	vpxor	codes1, codes1, codes3
	vpaddq	code_lens1, code_lens1, code_lens3

	;; Split buffer data into dqwords, ytmp is 0 after last branch
	vpblendd codes2, ytmp, codes1, 0x33
	vpblendd code_lens2, ytmp, code_lens1, 0x33
	vpsrldq	codes1, 8
	vpsrldq	code_lens1, 8

	;; Bit align dqwords
	vpaddq	code_lens1, code_lens1, code_lens2
	vpand	ybits_count, code_lens1, yoffset_mask ;Extra bits
	vpermq	ybits_count, ybits_count, 0xcf
	vpaddq	code_lens2, ybits_count
	vpsllvq	codes2, codes2, ybits_count

	;; Merge two qwords into dqwords
	vmovdqa	ytmp, [q_64]
	vpsubq	code_lens3, ytmp, code_lens2
	vpsrlvq	codes3, codes1, code_lens3
	vpslldq	codes3, codes3, 8

	vpsllvq	codes1, codes1, code_lens2

	vpxor	codes1, codes1, codes3
	vpxor	codes1, codes1, codes2

	vmovq	tmp, code_lens1 %+ x	;Number of bytes
	shr	tmp, 3

	;; Extract last bytes
	vpaddq	code_lens2, code_lens1, ybits_count
	vpsrlq	code_lens2, code_lens2, 3
	vpshufb	codes2, codes1, code_lens2
	vpand	codes2, codes2, [bytes_mask]
	vextracti128 ybits %+ x, codes2, 1

	;; Check for short codes
	vptest	code_lens2, [min_write_mask]
	jz	.short_codes
.short_codes_next:

	vpermq	codes2, codes2, 0x45
	vpor	codes1, codes1, codes2

	;; bit shift upper dqword combined bits to line up with lower dqword
	vextracti128 code_lens2 %+ x, code_lens1, 1

	; Write out lower dqword of combined bits
	vmovdqu	[out_buf], codes1
	vpaddq	code_lens1, code_lens1, code_lens2

	vmovq	tmp2, code_lens1 %+ x	;Number of bytes
	shr	tmp2, 3
	vpand	ybits_count, code_lens1, yoffset_mask ;Extra bits

	; Write out upper dqword of combined bits
	vextracti128 [out_buf + tmp], codes1, 1
	add	out_buf, tmp2

	cmp	ptr, in_buf_end
	jbe	.main_loop

.main_loop_exit:
	;; Fold the vector bit-buffer tail back into scalar bits/bit_count.
	vmovq	rcx, ybits_count %+ x
	vmovq	bits, ybits %+ x
	jmp	.finish

.short_codes:
	;; Merge last bytes when the second dqword contains less than a byte
	vpor	ybits %+ x, codes2 %+ x
	jmp	.short_codes_next

.long_codes:
	;; Slow path: some token's combined code exceeds the packed-write
	;; limit, so emit this vector's 8 tokens one qword pair at a time.
	add	end_ptr, VECTOR_SLOP
	sub	ptr, VECTOR_SIZE

	vpxor	ytmp, ytmp, ytmp
	vpblendd codes3, ytmp, codes1, 0x55
	vpblendd code_lens3, ytmp, code_lens1, 0x55
	vpblendd codes4, ytmp, codes2, 0x55

	vpsllvq	codes4, codes4, code_lens3
	vpxor	codes3, codes3, codes4
	vpaddd	code_lens3, code_lens1, code_lens2

	vpsrlq	codes1, codes1, 32
	vpsrlq	code_lens1, code_lens1, 32
	vpsrlq	codes2, codes2, 32

	vpsllvq	codes2, codes2, code_lens1
	vpxor	codes1, codes1, codes2

	vpsrlq	code_lens1, code_lens3, 32
	vpblendd code_lens3, ytmp, code_lens3, 0x55

	;; Merge bitbuf bits
	vpsllvq	codes3, codes3, ybits_count
	vpxor	codes3, codes3, ybits
	vpaddq	code_lens3, code_lens3, ybits_count
	vpaddq	code_lens1, code_lens1, code_lens3

	xor	bits, bits
	xor	rcx, rcx
	vpsubq	code_lens1, code_lens1, code_lens3
%rep 2
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	cmp	out_buf, end_ptr
	ja	.overflow
	;; insert LL code
	vmovq	sym, codes3 %+ x
	vmovq	tmp2, code_lens3 %+ x
	SHLX	sym, sym, rcx
	or	bits, sym
	add	rcx, tmp2

	; empty bits
	mov	[out_buf], bits
	mov	tmp, rcx
	shr	tmp, 3		; byte count
	add	out_buf, tmp
	mov	tmp, rcx
	and	rcx, ~7
	SHRX	bits, bits, rcx
	mov	rcx, tmp
	and	rcx, 7
	add	ptr, 4

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	cmp	out_buf, end_ptr
	ja	.overflow
	;; insert LL code
	vmovq	sym, codes1 %+ x
	vmovq	tmp2, code_lens1 %+ x
	SHLX	sym, sym, rcx
	or	bits, sym
	add	rcx, tmp2

	; empty bits
	mov	[out_buf], bits
	mov	tmp, rcx
	shr	tmp, 3		; byte count
	add	out_buf, tmp
	mov	tmp, rcx
	and	rcx, ~7
	SHRX	bits, bits, rcx
	mov	rcx, tmp
	and	rcx, 7
	add	ptr, 4

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	cmp	out_buf, end_ptr
	ja	.overflow
	;; insert LL code
	vpextrq	sym, codes3 %+ x, 1
	vpextrq	tmp2, code_lens3 %+ x, 1
	SHLX	sym, sym, rcx
	or	bits, sym
	add	rcx, tmp2

	; empty bits
	mov	[out_buf], bits
	mov	tmp, rcx
	shr	tmp, 3		; byte count
	add	out_buf, tmp
	mov	tmp, rcx
	and	rcx, ~7
	SHRX	bits, bits, rcx
	mov	rcx, tmp
	and	rcx, 7
	add	ptr, 4

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	cmp	out_buf, end_ptr
	ja	.overflow
	;; insert LL code
	vpextrq	sym, codes1 %+ x, 1
	vpextrq	tmp2, code_lens1 %+ x, 1
	SHLX	sym, sym, rcx
	or	bits, sym
	add	rcx, tmp2

	; empty bits
	mov	[out_buf], bits
	mov	tmp, rcx
	shr	tmp, 3		; byte count
	add	out_buf, tmp
	mov	tmp, rcx
	and	rcx, ~7
	SHRX	bits, bits, rcx
	mov	rcx, tmp
	and	rcx, 7
	add	ptr, 4

	;; Second %rep pass processes the upper 128-bit lanes.
	vextracti128 codes3 %+ x, codes3, 1
	vextracti128 code_lens3 %+ x, code_lens3, 1
	vextracti128 codes1 %+ x, codes1, 1
	vextracti128 code_lens1 %+ x, code_lens1, 1
%endrep
	sub	end_ptr, VECTOR_SLOP

	vmovq	ybits %+ x, bits
	vmovq	ybits_count %+ x, rcx
	cmp	ptr, in_buf_end
	jbe	.main_loop

.finish:
	;; Scalar tail: undo the lookahead adjustments, then emit the
	;; remaining tokens one at a time.
	add	in_buf_end, VECTOR_LOOP_PROCESSED
	add	end_ptr, VECTOR_SLOP

	cmp	ptr, in_buf_end
	jge	.overflow

.finish_loop:
	mov	DWORD(data), [ptr]

	cmp	out_buf, end_ptr
	ja	.overflow

	mov	sym, data
	and	sym, LIT_MASK	; sym has ll_code
	mov	DWORD(sym), [hufftables + _lit_len_table + sym * 4]

	; look up dist sym
	mov	dsym, data
	shr	dsym, DIST_OFFSET
	and	dsym, DIST_MASK
	mov	DWORD(dsym), [hufftables + _dist_table + dsym * 4]

	; insert LL code
	; sym: 31:24 length; 23:0 code
	mov	tmp2, sym
	and	sym, 0xFFFFFF
	SHLX	sym, sym, rcx
	shr	tmp2, 24
	or	bits, sym
	add	rcx, tmp2

	; insert dist code
	movzx	tmp, WORD(dsym)
	SHLX	tmp, tmp, rcx
	or	bits, tmp
	mov	tmp, dsym
	shr	tmp, 24
	add	rcx, tmp

	; insert dist extra bits
	shr	data, EXTRA_BITS_OFFSET
	add	ptr, 4
	SHLX	data, data, rcx
	or	bits, data
	shr	dsym, 16
	and	dsym, 0xFF
	add	rcx, dsym

	; empty bits
	mov	[out_buf], bits
	mov	tmp, rcx
	shr	tmp, 3		; byte count
	add	out_buf, tmp
	mov	tmp, rcx
	and	rcx, ~7
	SHRX	bits, bits, rcx
	mov	rcx, tmp
	and	rcx, 7

	cmp	ptr, in_buf_end
	jb	.finish_loop

.overflow:
	;; Write the bit-buffer state back to the BitBuf and return the
	;; final token pointer in rax.
	mov	tmp, [rsp + bitbuf_mem_offset]
	mov	[tmp + _m_bits], bits
	mov	[tmp + _m_bit_count], ecx
	mov	[tmp + _m_out_buf], out_buf

	mov	rax, ptr

	FUNC_RESTORE

	ret
549
section .data
	align 32
;; Per-dword packed-write limits for the long-code check in .main_loop.
max_write_d:
	dd 0x1c, 0x1d, 0x1f, 0x20, 0x1c, 0x1d, 0x1f, 0x20
;; Mask used by vptest to detect the short-code case.
min_write_mask:
	dq 0x00, 0x00, 0xff, 0x00
;; Keeps only the low 3 bits (bit offset within a byte) of a bit count.
offset_mask:
	dq 0x0000000000000007, 0x0000000000000000
	dq 0x0000000000000000, 0x0000000000000000
;; Constant 64 per 128-bit lane, for computing complementary shifts.
q_64:
	dq 0x0000000000000040, 0x0000000000000000
	dq 0x0000000000000040, 0x0000000000000000
;; Broadcast copies of the token-field masks for the vector loop.
lit_mask:
	dd LIT_MASK, LIT_MASK, LIT_MASK, LIT_MASK
	dd LIT_MASK, LIT_MASK, LIT_MASK, LIT_MASK
dist_mask:
	dd DIST_MASK, DIST_MASK, DIST_MASK, DIST_MASK
	dd DIST_MASK, DIST_MASK, DIST_MASK, DIST_MASK
;; Low 24 bits of a lit/len table entry hold the code itself.
lit_icr_mask:
	dd 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF
	dd 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF
;; Byte 2 of a dist table entry holds the extra-bit count.
eb_icr_mask:
	dd 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF
	dd 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF
;; Keeps only the lowest byte of each 128-bit lane.
bytes_mask:
	dq 0x00000000000000ff, 0x0000000000000000
	dq 0x00000000000000ff, 0x0000000000000000