; Source: ceph/src/isa-l/igzip/encode_df_04.asm (Intel ISA-L, imported with ceph 15.2.8)
1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2018 Intel Corporation All rights reserved.
3 ;
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
6 ; are met:
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
12 ; distribution.
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
16 ;
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29
30 %include "reg_sizes.asm"
31 %include "lz0a_const.asm"
32 %include "data_struct2.asm"
33 %include "stdmac.asm"
34
35 %define ARCH 04
36 %define USE_HSWNI
37
38 ; tree entry is 4 bytes:
39 ; lit/len tree (513 entries)
40 ; | 3 | 2 | 1 | 0 |
41 ; | len | code |
42 ;
43 ; dist tree
44 ; | 3 | 2 | 1 | 0 |
45 ; |eblen:codlen| code |
46
47 ; token format:
48 ; DIST_OFFSET:0 : lit/len
49 ; 31:(DIST_OFFSET + 5) : dist Extra Bits
50 ; (DIST_OFFSET + 5):DIST_OFFSET : dist code
51 ; lit/len: 0-256 (literal)
52 ; 257-512 (dist + 254)
53
54 ; returns final token pointer
55 ; equal to token_end if successful
56 ; uint32_t* encode_df(uint32_t *token_start, uint32_t *token_end,
57 ; BitBuf *out_buf, uint32_t *trees);
58
;; Register assignment.  The function is ABI-dual: win64 passes args in
;; rcx/rdx/r8/r9, System V in rdi/rsi/rdx/rcx, so the working registers
;; are chosen per-ABI to avoid the argument registers that remain live.
%ifidn __OUTPUT_FORMAT__, win64
%define arg1 rcx
%define arg2 rdx
%define arg3 r8
%define arg4 r9
%define sym rsi			; scratch: lit/len table entry
%define dsym rdi		; scratch: dist table entry
%define hufftables r9		; aliases arg4, no copy needed on win64
%define ptr r11			; current token pointer
%else
; Linux
%define arg1 rdi
%define arg2 rsi
%define arg3 rdx
%define arg4 rcx
%define sym r9
%define dsym r8
%define hufftables r11		; arg4 (rcx) is needed as the bit count, so copy
%define ptr rdi			; aliases arg1, no copy needed on SysV
%endif

%define in_buf_end arg2		; one-past-last token (adjusted by VECTOR_LOOP_PROCESSED)
%define bitbuf arg3		; BitBuf pointer; only valid until out_buf is loaded
%define out_buf bitbuf		; output write pointer (reuses arg3)
; bit_count is rcx
%define bits rax		; pending-bits accumulator
%define data r12		; current raw token (scalar tail path)
%define tmp rbx
%define len dsym		; NOTE(review): `len` appears unused in this file
%define tmp2 r10
%define end_ptr rbp		; output-end sentinel (VECTOR_SLOP subtracted in vector loop)

;; Masks extracting the lit/len and dist fields of a 32-bit token
;; (see the token-format comment above).
%define LIT_MASK ((0x1 << LIT_LEN_BIT_COUNT) - 1)
%define DIST_MASK ((0x1 << DIST_LIT_BIT_COUNT) - 1)

;; ymm register roles for the vector loop.
%define codes1 ymm1		; lit/len Huffman codes
%define code_lens1 ymm2		; lit/len code bit lengths
%define codes2 ymm3		; dist codes (later merged with extra bits)
%define code_lens2 ymm4		; dist code bit lengths
%define codes3 ymm5		; dist extra bits
%define code_lens3 ymm6		; extra-bit counts / combined lengths
%define codes4 ymm7
%define syms ymm7		; lit/len symbols (shares ymm7 with codes4)

%define code_lens4 ymm8
%define dsyms ymm8		; dist symbols (shares ymm8 with code_lens4)

%define ytmp ymm9
%define codes_lookup1 ymm10	; gathered lit/len table entries
%define codes_lookup2 ymm11	; gathered dist table entries
%define datas ymm12		; 8 raw 32-bit tokens
%define ybits ymm13		; vector image of the bit accumulator
%define ybits_count ymm14	; vector image of the bit count
%define yoffset_mask ymm15

%define VECTOR_SIZE 0x20			; 32 bytes = 8 four-byte tokens per load
%define VECTOR_LOOP_PROCESSED (2 * VECTOR_SIZE)	; loop keeps one vector of lookahead
%define VECTOR_SLOP 0x20 - 8			; output headroom a vector store may overrun

;; Stack frame layout used by FUNC_SAVE/FUNC_RESTORE plus one slot for
;; the BitBuf pointer (out_buf clobbers the register holding it).
gpr_save_mem_offset equ 0
gpr_save_mem_size equ 8 * 6
xmm_save_mem_offset equ gpr_save_mem_offset + gpr_save_mem_size
xmm_save_mem_size equ 10 * 16
bitbuf_mem_offset equ xmm_save_mem_offset + xmm_save_mem_size
bitbuf_mem_size equ 8
stack_size equ gpr_save_mem_size + xmm_save_mem_size + bitbuf_mem_size
125
126
;; FUNC_SAVE / FUNC_RESTORE: spill and reload callee-saved state around the
;; function body.  rbx/rbp/r12 are callee-saved in both ABIs; win64
;; additionally requires rsi, rdi and xmm6-xmm15 to be preserved.
%macro FUNC_SAVE 0
	sub	rsp, stack_size
	mov	[rsp + gpr_save_mem_offset + 0*8], rbx
	mov	[rsp + gpr_save_mem_offset + 1*8], rbp
	mov	[rsp + gpr_save_mem_offset + 2*8], r12

%ifidn __OUTPUT_FORMAT__, win64
	mov	[rsp + gpr_save_mem_offset + 3*8], rsi
	mov	[rsp + gpr_save_mem_offset + 4*8], rdi

	;; Each MOVDQU transfers 16 bytes, so the save slots must be spaced
	;; 16 bytes apart (xmm_save_mem_size reserves 10 * 16 bytes).  The
	;; previous 8-byte stride made consecutive stores overlap, so
	;; FUNC_RESTORE reloaded corrupted values into the callee-saved
	;; xmm registers on win64.
	MOVDQU	[rsp + xmm_save_mem_offset + 0*16], xmm6
	MOVDQU	[rsp + xmm_save_mem_offset + 1*16], xmm7
	MOVDQU	[rsp + xmm_save_mem_offset + 2*16], xmm8
	MOVDQU	[rsp + xmm_save_mem_offset + 3*16], xmm9
	MOVDQU	[rsp + xmm_save_mem_offset + 4*16], xmm10
	MOVDQU	[rsp + xmm_save_mem_offset + 5*16], xmm11
	MOVDQU	[rsp + xmm_save_mem_offset + 6*16], xmm12
	MOVDQU	[rsp + xmm_save_mem_offset + 7*16], xmm13
	MOVDQU	[rsp + xmm_save_mem_offset + 8*16], xmm14
	MOVDQU	[rsp + xmm_save_mem_offset + 9*16], xmm15
%endif

%endm

%macro FUNC_RESTORE 0
	mov	rbx, [rsp + gpr_save_mem_offset + 0*8]
	mov	rbp, [rsp + gpr_save_mem_offset + 1*8]
	mov	r12, [rsp + gpr_save_mem_offset + 2*8]

%ifidn __OUTPUT_FORMAT__, win64
	mov	rsi, [rsp + gpr_save_mem_offset + 3*8]
	mov	rdi, [rsp + gpr_save_mem_offset + 4*8]

	;; Must mirror FUNC_SAVE's 16-byte slot stride exactly.
	MOVDQU	xmm6, [rsp + xmm_save_mem_offset + 0*16]
	MOVDQU	xmm7, [rsp + xmm_save_mem_offset + 1*16]
	MOVDQU	xmm8, [rsp + xmm_save_mem_offset + 2*16]
	MOVDQU	xmm9, [rsp + xmm_save_mem_offset + 3*16]
	MOVDQU	xmm10, [rsp + xmm_save_mem_offset + 4*16]
	MOVDQU	xmm11, [rsp + xmm_save_mem_offset + 5*16]
	MOVDQU	xmm12, [rsp + xmm_save_mem_offset + 6*16]
	MOVDQU	xmm13, [rsp + xmm_save_mem_offset + 7*16]
	MOVDQU	xmm14, [rsp + xmm_save_mem_offset + 8*16]
	MOVDQU	xmm15, [rsp + xmm_save_mem_offset + 9*16]
%endif
	add	rsp, stack_size

%endmacro
174
;-----------------------------------------------------------------------
; uint32_t *encode_deflate_icf_04(uint32_t *token_start, uint32_t *token_end,
;                                 BitBuf *out_buf, uint32_t *trees)
;
; Huffman-encodes the ICF token stream [token_start, token_end) into the
; bit buffer, 8 tokens per iteration, using AVX2 gathers and variable
; shifts plus BMI2 SHLX/SHRX (ARCH 04, USE_HSWNI).
; Returns in rax the address of the first token NOT consumed; this equals
; token_end when the whole stream fit in the output buffer.
;-----------------------------------------------------------------------
global encode_deflate_icf_ %+ ARCH
encode_deflate_icf_ %+ ARCH:
	FUNC_SAVE

	;; Copy args into working registers only where they differ per-ABI
	;; (see the %define blocks above).
%ifnidn ptr, arg1
	mov	ptr, arg1
%endif
%ifnidn hufftables, arg4
	mov	hufftables, arg4
%endif

	;; Unpack the BitBuf state.  out_buf aliases the bitbuf register,
	;; so the struct pointer is stashed on the stack first and re-read
	;; at .overflow to write the state back.
	mov	[rsp + bitbuf_mem_offset], bitbuf
	mov	bits, [bitbuf + _m_bits]
	mov	ecx, [bitbuf + _m_bit_count]
	mov	end_ptr, [bitbuf + _m_out_end]
	mov	out_buf, [bitbuf + _m_out_buf]	; clobbers bitbuf

	;; Reserve output headroom for a full vector store, and require two
	;; vectors of input lookahead (the main loop pre-gathers the next
	;; batch).  Otherwise fall through to the scalar tail.
	sub	end_ptr, VECTOR_SLOP
	sub	in_buf_end, VECTOR_LOOP_PROCESSED
	cmp	ptr, in_buf_end
	jge	.finish

	;; Prime the first batch: gather the lit/len and dist table entries
	;; for 8 tokens.  vpgatherdd consumes its all-ones mask register, so
	;; ytmp is re-set before each gather.
	vpcmpeqq	ytmp, ytmp, ytmp
	vmovdqu	datas, [ptr]
	vpand	syms, datas, [lit_mask]
	vpgatherdd codes_lookup1, [hufftables + _lit_len_table + 4 * syms], ytmp

	vpcmpeqq	ytmp, ytmp, ytmp
	vpsrld	dsyms, datas, DIST_OFFSET
	vpand	dsyms, dsyms, [dist_mask]
	vpgatherdd codes_lookup2, [hufftables + _dist_table + 4 * dsyms], ytmp

	;; Seed the vector bit accumulator/count from the scalar BitBuf state.
	vmovq	ybits %+ x, bits
	vmovq	ybits_count %+ x, rcx
	vmovdqa	yoffset_mask, [offset_mask]

.main_loop:
	;; Sets codes1 to contain lit/len codes and code_lens1 the corresponding lengths
	vpsrld	code_lens1, codes_lookup1, 24
	vpand	codes1, codes_lookup1, [lit_icr_mask]

	;; Sets codes2 to contain dist codes, code_lens2 the corresponding lengths,
	;; and code_lens3 the extra bit counts
	vpblendw codes2, ybits, codes_lookup2, 0x55	;Bits 8 and above of ybits are 0
	vpsrld	code_lens2, codes_lookup2, 24
	vpsrld	code_lens3, codes_lookup2, 16
	vpand	code_lens3, [eb_icr_mask]

	;; Set codes3 to contain the extra bits
	vpsrld	codes3, datas, EXTRA_BITS_OFFSET

	;; Bail out before writing if the output buffer may be exceeded.
	cmp	out_buf, end_ptr
	ja	.main_loop_exit

	;; Start code lookups for next iteration
	add	ptr, VECTOR_SIZE
	vpcmpeqq	ytmp, ytmp, ytmp
	vmovdqu	datas, [ptr]
	vpand	syms, datas, [lit_mask]
	vpgatherdd codes_lookup1, [hufftables + _lit_len_table + 4 * syms], ytmp

	vpcmpeqq	ytmp, ytmp, ytmp
	vpsrld	dsyms, datas, DIST_OFFSET
	vpand	dsyms, dsyms, [dist_mask]
	vpgatherdd codes_lookup2, [hufftables + _dist_table + 4 * dsyms], ytmp

	;; Merge dist code with extra bits
	vpsllvd	codes3, codes3, code_lens2
	vpxor	codes2, codes2, codes3
	vpaddd	code_lens2, code_lens2, code_lens3

	;; Check for long codes: if any lane's combined length exceeds its
	;; per-lane threshold in max_write_d, take the slow scalar-emit path.
	vpaddd	code_lens3, code_lens1, code_lens2
	vpcmpgtd	ytmp, code_lens3, [max_write_d]
	vptest	ytmp, ytmp
	jnz	.long_codes

	;; Merge dist and len codes
	vpsllvd	codes2, codes2, code_lens1
	vpxor	codes1, codes1, codes2

	;; Split buffer data into qwords, ytmp is 0 after last branch
	vpblendd codes3, ytmp, codes1, 0x55
	vpsrlq	codes1, codes1, 32
	vpsrlq	code_lens1, code_lens3, 32
	vpblendd code_lens3, ytmp, code_lens3, 0x55

	;; Merge bitbuf bits
	vpsllvq	codes3, codes3, ybits_count
	vpxor	codes3, codes3, ybits
	vpaddq	code_lens3, code_lens3, ybits_count

	;; Merge two symbols into qwords
	vpsllvq	codes1, codes1, code_lens3
	vpxor	codes1, codes1, codes3
	vpaddq	code_lens1, code_lens1, code_lens3

	;; Split buffer data into dqwords, ytmp is 0 after last branch
	vpblendd codes2, ytmp, codes1, 0x33
	vpblendd code_lens2, ytmp, code_lens1, 0x33
	vpsrldq	codes1, 8
	vpsrldq	code_lens1, 8

	;; Bit align dqwords
	vpaddq	code_lens1, code_lens1, code_lens2
	vpand	ybits_count, code_lens1, yoffset_mask	;Extra bits
	vpermq	ybits_count, ybits_count, 0xcf
	vpaddq	code_lens2, ybits_count
	vpsllvq	codes2, codes2, ybits_count

	;; Merge two qwords into dqwords
	vmovdqa	ytmp, [q_64]
	vpsubq	code_lens3, ytmp, code_lens2
	vpsrlvq	codes3, codes1, code_lens3
	vpslldq	codes3, codes3, 8

	vpsllvq	codes1, codes1, code_lens2

	vpxor	codes1, codes1, codes3
	vpxor	codes1, codes1, codes2

	vmovq	tmp, code_lens1 %+ x	;Number of bytes
	shr	tmp, 3

	;; Extract last bytes
	vpaddq	code_lens2, code_lens1, ybits_count
	vpsrlq	code_lens2, code_lens2, 3
	vpshufb	codes2, codes1, code_lens2
	vpand	codes2, codes2, [bytes_mask]
	vextracti128 ybits %+ x, codes2, 1

	;; Check for short codes
	vptest	code_lens2, [min_write_mask]
	jz	.short_codes
.short_codes_next:

	vpermq	codes2, codes2, 0x45
	vpor	codes1, codes1, codes2

	;; bit shift upper dqword combined bits to line up with lower dqword
	vextracti128 code_lens2 %+ x, code_lens1, 1

	; Write out lower dqword of combined bits
	vmovdqu	[out_buf], codes1
	vpaddq	code_lens1, code_lens1, code_lens2

	vmovq	tmp2, code_lens1 %+ x	;Number of bytes
	shr	tmp2, 3
	vpand	ybits_count, code_lens1, yoffset_mask	;Extra bits

	; Write out upper dqword of combined bits
	vextracti128 [out_buf + tmp], codes1, 1
	add	out_buf, tmp2

	cmp	ptr, in_buf_end
	jbe	.main_loop

.main_loop_exit:
	;; Move the vector bit-buffer state back into the scalar registers
	;; before the scalar tail / state write-back.
	vmovq	rcx, ybits_count %+ x
	vmovq	bits, ybits %+ x
	jmp	.finish

.short_codes:
	;; Merge last bytes when the second dqword contains less than a byte
	vpor	ybits %+ x, codes2 %+ x
	jmp	.short_codes_next

.long_codes:
	;; Slow path for batches containing very long codes: combine each
	;; token's codes into qword lanes, then emit them one qword at a
	;; time with scalar SHLX/SHRX bit packing.  ptr is rewound because
	;; the fast path already advanced it for the prefetch.
	add	end_ptr, VECTOR_SLOP
	sub	ptr, VECTOR_SIZE

	vpxor	ytmp, ytmp, ytmp
	vpblendd codes3, ytmp, codes1, 0x55
	vpblendd code_lens3, ytmp, code_lens1, 0x55
	vpblendd codes4, ytmp, codes2, 0x55

	vpsllvq	codes4, codes4, code_lens3
	vpxor	codes3, codes3, codes4
	vpaddd	code_lens3, code_lens1, code_lens2

	vpsrlq	codes1, codes1, 32
	vpsrlq	code_lens1, code_lens1, 32
	vpsrlq	codes2, codes2, 32

	vpsllvq	codes2, codes2, code_lens1
	vpxor	codes1, codes1, codes2

	vpsrlq	code_lens1, code_lens3, 32
	vpblendd code_lens3, ytmp, code_lens3, 0x55

	;; Merge bitbuf bits
	vpsllvq	codes3, codes3, ybits_count
	vpxor	codes3, codes3, ybits
	vpaddq	code_lens3, code_lens3, ybits_count
	vpaddq	code_lens1, code_lens1, code_lens3

	;; Scalar state restarts empty: the pending bits were merged into
	;; the first lane (codes3) above.
	xor	bits, bits
	xor	rcx, rcx
	vpsubq	code_lens1, code_lens1, code_lens3
	;; Emit 4 qword lanes per repetition (low xmm lane of codes3/codes1,
	;; then their high qwords); the %rep's trailing vextracti128s move
	;; the upper 128 bits down for the second pass.
%rep 2
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	cmp	out_buf, end_ptr
	ja	.overflow
	;; insert LL code
	vmovq	sym, codes3 %+ x
	vmovq	tmp2, code_lens3 %+ x
	SHLX	sym, sym, rcx
	or	bits, sym
	add	rcx, tmp2

	; empty bits
	mov	[out_buf], bits
	mov	tmp, rcx
	shr	tmp, 3	; byte count
	add	out_buf, tmp
	mov	tmp, rcx
	and	rcx, ~7
	SHRX	bits, bits, rcx
	mov	rcx, tmp
	and	rcx, 7
	add	ptr, 4

	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	cmp	out_buf, end_ptr
	ja	.overflow
	;; insert LL code
	vmovq	sym, codes1 %+ x
	vmovq	tmp2, code_lens1 %+ x
	SHLX	sym, sym, rcx
	or	bits, sym
	add	rcx, tmp2

	; empty bits
	mov	[out_buf], bits
	mov	tmp, rcx
	shr	tmp, 3	; byte count
	add	out_buf, tmp
	mov	tmp, rcx
	and	rcx, ~7
	SHRX	bits, bits, rcx
	mov	rcx, tmp
	and	rcx, 7
	add	ptr, 4

	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	cmp	out_buf, end_ptr
	ja	.overflow
	;; insert LL code
	vpextrq	sym, codes3 %+ x, 1
	vpextrq	tmp2, code_lens3 %+ x, 1
	SHLX	sym, sym, rcx
	or	bits, sym
	add	rcx, tmp2

	; empty bits
	mov	[out_buf], bits
	mov	tmp, rcx
	shr	tmp, 3	; byte count
	add	out_buf, tmp
	mov	tmp, rcx
	and	rcx, ~7
	SHRX	bits, bits, rcx
	mov	rcx, tmp
	and	rcx, 7
	add	ptr, 4

	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	cmp	out_buf, end_ptr
	ja	.overflow
	;; insert LL code
	vpextrq	sym, codes1 %+ x, 1
	vpextrq	tmp2, code_lens1 %+ x, 1
	SHLX	sym, sym, rcx
	or	bits, sym
	add	rcx, tmp2

	; empty bits
	mov	[out_buf], bits
	mov	tmp, rcx
	shr	tmp, 3	; byte count
	add	out_buf, tmp
	mov	tmp, rcx
	and	rcx, ~7
	SHRX	bits, bits, rcx
	mov	rcx, tmp
	and	rcx, 7
	add	ptr, 4

	vextracti128 codes3 %+ x, codes3, 1
	vextracti128 code_lens3 %+ x, code_lens3, 1
	vextracti128 codes1 %+ x, codes1, 1
	vextracti128 code_lens1 %+ x, code_lens1, 1
%endrep
	sub	end_ptr, VECTOR_SLOP

	;; Return the scalar bit state to the vector registers and rejoin
	;; the fast loop if enough input remains.
	vmovq	ybits %+ x, bits
	vmovq	ybits_count %+ x, rcx
	cmp	ptr, in_buf_end
	jbe	.main_loop

.finish:
	;; Scalar tail: undo the lookahead/slop adjustments and encode one
	;; token at a time.
	add	in_buf_end, VECTOR_LOOP_PROCESSED
	add	end_ptr, VECTOR_SLOP

	cmp	ptr, in_buf_end
	jge	.overflow

.finish_loop:
	mov	DWORD(data), [ptr]

	cmp	out_buf, end_ptr
	ja	.overflow

	mov	sym, data
	and	sym, LIT_MASK	; sym has ll_code
	mov	DWORD(sym), [hufftables + _lit_len_table + sym * 4]

	; look up dist sym
	mov	dsym, data
	shr	dsym, DIST_OFFSET
	and	dsym, DIST_MASK
	mov	DWORD(dsym), [hufftables + _dist_table + dsym * 4]

	; insert LL code
	; sym: 31:24 length; 23:0 code
	mov	tmp2, sym
	and	sym, 0xFFFFFF
	SHLX	sym, sym, rcx
	shr	tmp2, 24
	or	bits, sym
	add	rcx, tmp2

	; insert dist code
	movzx	tmp, WORD(dsym)
	SHLX	tmp, tmp, rcx
	or	bits, tmp
	mov	tmp, dsym
	shr	tmp, 24
	add	rcx, tmp

	; insert dist extra bits
	shr	data, EXTRA_BITS_OFFSET
	add	ptr, 4
	SHLX	data, data, rcx
	or	bits, data
	shr	dsym, 16
	and	dsym, 0xFF
	add	rcx, dsym

	; empty bits: flush whole bytes to the output, keep 0-7 leftovers
	mov	[out_buf], bits
	mov	tmp, rcx
	shr	tmp, 3	; byte count
	add	out_buf, tmp
	mov	tmp, rcx
	and	rcx, ~7
	SHRX	bits, bits, rcx
	mov	rcx, tmp
	and	rcx, 7

	cmp	ptr, in_buf_end
	jb	.finish_loop

.overflow:
	;; Normal exit AND out-of-space exit: write the bit-buffer state
	;; back into the BitBuf struct and return the current token pointer
	;; (== token_end on full success).
	mov	tmp, [rsp + bitbuf_mem_offset]
	mov	[tmp + _m_bits], bits
	mov	[tmp + _m_bit_count], ecx
	mov	[tmp + _m_out_buf], out_buf

	mov	rax, ptr

	FUNC_RESTORE

	ret
549
;; Read-only constant tables for the vector loop (32-byte aligned for
;; vmovdqa loads).
section .data
	align 32
;; Per-lane limits on the combined (lit/len + dist + extra) bit length;
;; exceeding any lane's limit diverts to .long_codes.
;; NOTE(review): the per-lane values presumably account for bits carried
;; between lanes during the qword/dqword merges — confirm against the
;; merge math in .main_loop before changing.
max_write_d:
	dd	0x1c, 0x1d, 0x1f, 0x20, 0x1c, 0x1d, 0x1f, 0x20
;; vptest mask selecting the low byte of qword 2 of code_lens2; zero
;; there means the upper dqword holds less than a byte (.short_codes).
min_write_mask:
	dq	0x00, 0x00, 0xff, 0x00
;; Keeps only the low 3 bits (bit-within-byte offset) of the first qword.
offset_mask:
	dq	0x0000000000000007, 0x0000000000000000
	dq	0x0000000000000000, 0x0000000000000000
;; Constant 64 in the low qword of each 128-bit lane, used to compute
;; complementary shift counts (64 - len).
q_64:
	dq	0x0000000000000040, 0x0000000000000000
	dq	0x0000000000000040, 0x0000000000000000
;; LIT_MASK broadcast across all 8 dwords (lit/len field extraction).
lit_mask:
	dd	LIT_MASK, LIT_MASK, LIT_MASK, LIT_MASK
	dd	LIT_MASK, LIT_MASK, LIT_MASK, LIT_MASK
;; DIST_MASK broadcast across all 8 dwords (dist code field extraction).
dist_mask:
	dd	DIST_MASK, DIST_MASK, DIST_MASK, DIST_MASK
	dd	DIST_MASK, DIST_MASK, DIST_MASK, DIST_MASK
;; Low 24 bits of each lit/len table entry = the Huffman code itself
;; (bits 31:24 hold the code length).
lit_icr_mask:
	dd	0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF
	dd	0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF
;; Low byte of the shifted dist entry = extra-bit count.
eb_icr_mask:
	dd	0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF
	dd	0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF
;; Keeps only the byte selected by vpshufb in each 128-bit lane.
bytes_mask:
	dq	0x00000000000000ff, 0x0000000000000000
	dq	0x00000000000000ff, 0x0000000000000000