%include "reg_sizes.asm"
%include "lz0a_const.asm"
%include "data_struct2.asm"
%include "stdmac.asm"

; tree entry is 4 bytes:
; lit/len tree (513 entries)
; |  3  |  2  |  1  |  0  |
; | len |       code      |
;
; dist tree
; |  3  |  2  |  1  |  0  |
; |eblen:codlen|   code   |

; token format:
; DIST_OFFSET:0 : lit/len
; 31:(DIST_OFFSET + 5) : dist Extra Bits
; (DIST_OFFSET + 5):DIST_OFFSET : dist code
; lit/len: 0-256 (literal)
;          257-512 (dist + 254)

; returns final token pointer
; equal to token_end if successful
; uint32_t* encode_df(uint32_t *token_start, uint32_t *token_end,
;                     BitBuf *out_buf, uint32_t *trees);
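;
; For illustration only, a token is unpacked field by field (LIT_MASK and
; DIST_MASK are defined below; the offset constants come from the included
; headers) exactly as done in .finish_loop:
;     lit_len    = token & LIT_MASK
;     dist_code  = (token >> DIST_OFFSET) & DIST_MASK
;     extra_bits = token >> EXTRA_BITS_OFFSET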

%ifidn __OUTPUT_FORMAT__, win64
%define arg1	rcx
%define arg2	rdx
%define arg3	r8
%define arg4	r9
%define sym	rsi
%define dsym	rdi
%define hufftables	r9
%define ptr	r11
%else
; Linux
%define arg1	rdi
%define arg2	rsi
%define arg3	rdx
%define arg4	rcx
%define sym	r9
%define dsym	r8
%define hufftables	r11
%define ptr	rdi
%endif

%define in_buf_end	arg2
%define bitbuf		arg3
%define out_buf		bitbuf
; bit_count is rcx
%define bits		rax
%define data		r12
%define tmp		rbx
%define len		dsym
%define tmp2		r10
%define end_ptr		rbp

%define LIT_MASK	((0x1 << LIT_LEN_BIT_COUNT) - 1)
%define DIST_MASK	((0x1 << DIST_LIT_BIT_COUNT) - 1)

%define codes1		ymm1
%define code_lens1	ymm2
%define codes2		ymm3
%define code_lens2	ymm4
%define codes3		ymm5
%define code_lens3	ymm6
%define codes4		ymm7
%define syms		ymm7

%define code_lens4	ymm8
%define dsyms		ymm8

%define ytmp		ymm9
%define codes_lookup1	ymm10
%define codes_lookup2	ymm11
%define datas		ymm12
%define ybits		ymm13
%define ybits_count	ymm14
%define yoffset_mask	ymm15

%define VECTOR_SIZE 0x20
%define VECTOR_LOOP_PROCESSED (2 * VECTOR_SIZE)
%define VECTOR_SLOP 0x20 - 8

gpr_save_mem_offset	equ 0
gpr_save_mem_size	equ 8 * 6
xmm_save_mem_offset	equ gpr_save_mem_offset + gpr_save_mem_size
xmm_save_mem_size	equ 10 * 16
bitbuf_mem_offset	equ xmm_save_mem_offset + xmm_save_mem_size
bitbuf_mem_size		equ 8
stack_size		equ gpr_save_mem_size + xmm_save_mem_size + bitbuf_mem_size

%macro FUNC_SAVE 0
	sub	rsp, stack_size
	mov	[rsp + gpr_save_mem_offset + 0*8], rbx
	mov	[rsp + gpr_save_mem_offset + 1*8], rbp
	mov	[rsp + gpr_save_mem_offset + 2*8], r12

%ifidn __OUTPUT_FORMAT__, win64
	mov	[rsp + gpr_save_mem_offset + 3*8], rsi
	mov	[rsp + gpr_save_mem_offset + 4*8], rdi

	MOVDQU	[rsp + xmm_save_mem_offset + 0*16], xmm6
	MOVDQU	[rsp + xmm_save_mem_offset + 1*16], xmm7
	MOVDQU	[rsp + xmm_save_mem_offset + 2*16], xmm8
	MOVDQU	[rsp + xmm_save_mem_offset + 3*16], xmm9
	MOVDQU	[rsp + xmm_save_mem_offset + 4*16], xmm10
	MOVDQU	[rsp + xmm_save_mem_offset + 5*16], xmm11
	MOVDQU	[rsp + xmm_save_mem_offset + 6*16], xmm12
	MOVDQU	[rsp + xmm_save_mem_offset + 7*16], xmm13
	MOVDQU	[rsp + xmm_save_mem_offset + 8*16], xmm14
	MOVDQU	[rsp + xmm_save_mem_offset + 9*16], xmm15
%endif

%endm

%macro FUNC_RESTORE 0
	mov	rbx, [rsp + gpr_save_mem_offset + 0*8]
	mov	rbp, [rsp + gpr_save_mem_offset + 1*8]
	mov	r12, [rsp + gpr_save_mem_offset + 2*8]

%ifidn __OUTPUT_FORMAT__, win64
	mov	rsi, [rsp + gpr_save_mem_offset + 3*8]
	mov	rdi, [rsp + gpr_save_mem_offset + 4*8]

	MOVDQU	xmm6, [rsp + xmm_save_mem_offset + 0*16]
	MOVDQU	xmm7, [rsp + xmm_save_mem_offset + 1*16]
	MOVDQU	xmm8, [rsp + xmm_save_mem_offset + 2*16]
	MOVDQU	xmm9, [rsp + xmm_save_mem_offset + 3*16]
	MOVDQU	xmm10, [rsp + xmm_save_mem_offset + 4*16]
	MOVDQU	xmm11, [rsp + xmm_save_mem_offset + 5*16]
	MOVDQU	xmm12, [rsp + xmm_save_mem_offset + 6*16]
	MOVDQU	xmm13, [rsp + xmm_save_mem_offset + 7*16]
	MOVDQU	xmm14, [rsp + xmm_save_mem_offset + 8*16]
	MOVDQU	xmm15, [rsp + xmm_save_mem_offset + 9*16]
%endif
	add	rsp, stack_size

%endmacro

global encode_deflate_icf_ %+ ARCH
encode_deflate_icf_ %+ ARCH:
	FUNC_SAVE

%ifnidn ptr, arg1
	mov	ptr, arg1
%endif
%ifnidn hufftables, arg4
	mov	hufftables, arg4
%endif

	mov	[rsp + bitbuf_mem_offset], bitbuf
	mov	bits, [bitbuf + _m_bits]
	mov	ecx, [bitbuf + _m_bit_count]
	mov	end_ptr, [bitbuf + _m_out_end]
	mov	out_buf, [bitbuf + _m_out_buf]	; clobbers bitbuf

	sub	end_ptr, VECTOR_SLOP
	sub	in_buf_end, VECTOR_LOOP_PROCESSED
	cmp	ptr, in_buf_end
	jge	.finish
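
	;; The vector main loop below is only entered while at least
	;; 2 * VECTOR_SIZE bytes of tokens remain; any leftover tokens are
	;; emitted one at a time in .finish_loop.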

	vpcmpeqq ytmp, ytmp, ytmp
	vmovdqu	datas, [ptr]
	vpand	syms, datas, [lit_mask]
	vpgatherdd codes_lookup1, [hufftables + _lit_len_table + 4 * syms], ytmp

	vpcmpeqq ytmp, ytmp, ytmp
	vpsrld	dsyms, datas, DIST_OFFSET
	vpand	dsyms, dsyms, [dist_mask]
	vpgatherdd codes_lookup2, [hufftables + _dist_table + 4 * dsyms], ytmp

	vmovq	ybits %+ x, bits
	vmovq	ybits_count %+ x, rcx
	vmovdqa	yoffset_mask, [offset_mask]

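	;; Each main loop iteration Huffman-encodes the 8 tokens loaded above
	;; (VECTOR_SIZE bytes), packs the resulting bit strings together with
	;; the pending bits carried in ybits/ybits_count, and writes the packed
	;; bytes to out_buf, while the table lookups for the next 8 tokens are
	;; started early to hide the gather latency.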
.main_loop:
	;; Sets codes1 to contain lit/len codes and code_lens1 the corresponding lengths
	vpsrld	code_lens1, codes_lookup1, 24
	vpand	codes1, codes_lookup1, [lit_icr_mask]

	;; Sets codes2 to contain dist codes, code_lens2 the corresponding lengths,
	;; and code_lens3 the extra bit counts
	vpblendw codes2, ybits, codes_lookup2, 0x55	;Bits 8 and above of ybits are 0
	vpsrld	code_lens2, codes_lookup2, 24
	vpsrld	code_lens3, codes_lookup2, 16
	vpand	code_lens3, [eb_icr_mask]

	;; Set codes3 to contain the extra bits
	vpsrld	codes3, datas, EXTRA_BITS_OFFSET

	cmp	out_buf, end_ptr
	ja	.main_loop_exit

	;; Start code lookups for next iteration
	add	ptr, VECTOR_SIZE
	vpcmpeqq ytmp, ytmp, ytmp
	vmovdqu	datas, [ptr]
	vpand	syms, datas, [lit_mask]
	vpgatherdd codes_lookup1, [hufftables + _lit_len_table + 4 * syms], ytmp

	vpcmpeqq ytmp, ytmp, ytmp
	vpsrld	dsyms, datas, DIST_OFFSET
	vpand	dsyms, dsyms, [dist_mask]
	vpgatherdd codes_lookup2, [hufftables + _dist_table + 4 * dsyms], ytmp

	;; Merge dist code with extra bits
	vpsllvd	codes3, codes3, code_lens2
	vpxor	codes2, codes2, codes3
	vpaddd	code_lens2, code_lens2, code_lens3

	;; Check for long codes
	vpaddd	code_lens3, code_lens1, code_lens2
	vpcmpgtd ytmp, code_lens3, [max_write_d]
	vptest	ytmp, ytmp
	jnz	.long_codes

	;; Merge dist and len codes
	vpsllvd	codes2, codes2, code_lens1
	vpxor	codes1, codes1, codes2

	;; Split buffer data into qwords, ytmp is 0 after last branch
	vpblendd codes3, ytmp, codes1, 0x55
	vpsrlq	codes1, codes1, 32
	vpsrlq	code_lens1, code_lens3, 32
	vpblendd code_lens3, ytmp, code_lens3, 0x55

	;; Merge bitbuf bits
	vpsllvq	codes3, codes3, ybits_count
	vpxor	codes3, codes3, ybits
	vpaddq	code_lens3, code_lens3, ybits_count

	;; Merge two symbols into qwords
	vpsllvq	codes1, codes1, code_lens3
	vpxor	codes1, codes1, codes3
	vpaddq	code_lens1, code_lens1, code_lens3

	;; Split buffer data into dqwords, ytmp is 0 after last branch
	vpblendd codes2, ytmp, codes1, 0x33
	vpblendd code_lens2, ytmp, code_lens1, 0x33
	vpsrldq	codes1, 8
	vpsrldq	code_lens1, 8

	;; Merge two qwords into dqwords
	vmovdqa	ytmp, [q_64]
	vpsubq	code_lens3, ytmp, code_lens2
	vpsrlvq	codes3, codes1, code_lens3
	vpslldq	codes3, codes3, 8

	vpsllvq	codes1, codes1, code_lens2

	vpxor	codes1, codes1, codes3
	vpxor	codes1, codes1, codes2
	vpaddq	code_lens1, code_lens1, code_lens2

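	;; The low qword of code_lens1 now holds the bit count of the lower
	;; dqword of packed codes (including the carried-in bit-buffer bits).
	;; The sequence below stores the whole bytes, re-reads the partially
	;; filled byte, appends the upper dqword at the correct bit offset, and
	;; keeps the final 0-7 leftover bits in ybits/ybits_count.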
	vmovq	tmp, code_lens1 %+ x	;Number of bytes
	shr	tmp, 3
	vpand	ybits_count, code_lens1, yoffset_mask	;Extra bits

	;; bit shift upper dqword combined bits to line up with lower dqword
	vextracti128 codes2 %+ x, codes1, 1
	vextracti128 code_lens2 %+ x, code_lens1, 1

	vpbroadcastq ybits_count, ybits_count %+ x
	vpsrldq	codes3, codes2, 1
	vpsllvq	codes2, codes2, ybits_count
	vpsllvq	codes3, codes3, ybits_count
	vpslldq	codes3, codes3, 1
	vpor	codes2, codes2, codes3

	; Write out lower dqword of combined bits
	vmovdqu	[out_buf], codes1
	movzx	bits, byte [out_buf + tmp]
	vmovq	codes1 %+ x, bits
	vpaddq	code_lens1, code_lens1, code_lens2

	vmovq	tmp2, code_lens1 %+ x	;Number of bytes
	shr	tmp2, 3
	vpand	ybits_count, code_lens1, yoffset_mask	;Extra bits

	; Write out upper dqword of combined bits
	vpor	codes1 %+ x, codes1 %+ x, codes2 %+ x
	vmovdqu	[out_buf + tmp], codes1 %+ x
	add	out_buf, tmp2
	movzx	bits, byte [out_buf]
	vmovq	ybits %+ x, bits

	cmp	ptr, in_buf_end
	jbe	.main_loop

.main_loop_exit:
	vmovq	rcx, ybits_count %+ x
	vmovq	bits, ybits %+ x
	jmp	.finish

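	;; Slow path: at least one token in the current block has a combined
	;; lit/len + dist + extra-bits length above the per-lane limit in
	;; max_write_d, so the qword packing used by the fast path could
	;; overflow.  The block's 8 tokens are instead emitted one at a time
	;; through the scalar bits/rcx bit buffer below.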
.long_codes:
	add	end_ptr, VECTOR_SLOP
	sub	ptr, VECTOR_SIZE

	vpxor	ytmp, ytmp, ytmp
	vpblendd codes3, ytmp, codes1, 0x55
	vpblendd code_lens3, ytmp, code_lens1, 0x55
	vpblendd codes4, ytmp, codes2, 0x55

	vpsllvq	codes4, codes4, code_lens3
	vpxor	codes3, codes3, codes4
	vpaddd	code_lens3, code_lens1, code_lens2

	vpsrlq	codes1, codes1, 32
	vpsrlq	code_lens1, code_lens1, 32
	vpsrlq	codes2, codes2, 32

	vpsllvq	codes2, codes2, code_lens1
	vpxor	codes1, codes1, codes2

	vpsrlq	code_lens1, code_lens3, 32
	vpblendd code_lens3, ytmp, code_lens3, 0x55

	;; Merge bitbuf bits
	vpsllvq	codes3, codes3, ybits_count
	vpxor	codes3, codes3, ybits
	vpaddq	code_lens3, code_lens3, ybits_count
	vpaddq	code_lens1, code_lens1, code_lens3

	xor	bits, bits
	xor	rcx, rcx
	vpsubq	code_lens1, code_lens1, code_lens3
%rep 2
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	cmp	out_buf, end_ptr
	ja	.overflow
	;; insert LL code
	vmovq	sym, codes3 %+ x
	vmovq	tmp2, code_lens3 %+ x
	SHLX	sym, sym, rcx
	or	bits, sym
	add	rcx, tmp2

	; empty bits
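	;; write out the accumulated bits, advance out_buf by the number of
	;; complete bytes, and keep only the remaining 0-7 bits in bits/rcx
	;; (this same flush sequence is repeated after each symbol below)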
	mov	[out_buf], bits
	mov	tmp, rcx
	shr	tmp, 3		; byte count
	add	out_buf, tmp
	mov	tmp, rcx
	and	rcx, ~7
	SHRX	bits, bits, rcx
	mov	rcx, tmp
	and	rcx, 7
	add	ptr, 4

	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	cmp	out_buf, end_ptr
	ja	.overflow
	;; insert LL code
	vmovq	sym, codes1 %+ x
	vmovq	tmp2, code_lens1 %+ x
	SHLX	sym, sym, rcx
	or	bits, sym
	add	rcx, tmp2

	; empty bits
	mov	[out_buf], bits
	mov	tmp, rcx
	shr	tmp, 3		; byte count
	add	out_buf, tmp
	mov	tmp, rcx
	and	rcx, ~7
	SHRX	bits, bits, rcx
	mov	rcx, tmp
	and	rcx, 7
	add	ptr, 4

	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	cmp	out_buf, end_ptr
	ja	.overflow
	;; insert LL code
	vpextrq	sym, codes3 %+ x, 1
	vpextrq	tmp2, code_lens3 %+ x, 1
	SHLX	sym, sym, rcx
	or	bits, sym
	add	rcx, tmp2

	; empty bits
	mov	[out_buf], bits
	mov	tmp, rcx
	shr	tmp, 3		; byte count
	add	out_buf, tmp
	mov	tmp, rcx
	and	rcx, ~7
	SHRX	bits, bits, rcx
	mov	rcx, tmp
	and	rcx, 7
	add	ptr, 4

	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	cmp	out_buf, end_ptr
	ja	.overflow
	;; insert LL code
	vpextrq	sym, codes1 %+ x, 1
	vpextrq	tmp2, code_lens1 %+ x, 1
	SHLX	sym, sym, rcx
	or	bits, sym
	add	rcx, tmp2

	; empty bits
	mov	[out_buf], bits
	mov	tmp, rcx
	shr	tmp, 3		; byte count
	add	out_buf, tmp
	mov	tmp, rcx
	and	rcx, ~7
	SHRX	bits, bits, rcx
	mov	rcx, tmp
	and	rcx, 7
	add	ptr, 4

	vextracti128 codes3 %+ x, codes3, 1
	vextracti128 code_lens3 %+ x, code_lens3, 1
	vextracti128 codes1 %+ x, codes1, 1
	vextracti128 code_lens1 %+ x, code_lens1, 1
%endrep
	sub	end_ptr, VECTOR_SLOP

	vmovq	ybits %+ x, bits
	vmovq	ybits_count %+ x, rcx
	cmp	ptr, in_buf_end
	jbe	.main_loop

.finish:
	add	in_buf_end, VECTOR_LOOP_PROCESSED
	add	end_ptr, VECTOR_SLOP

	cmp	ptr, in_buf_end
	jge	.overflow

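	;; Scalar tail: encode any remaining tokens one at a time, flushing the
	;; bit buffer to out_buf after each token; stop at token_end or when
	;; the output buffer is nearly full.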
.finish_loop:
	mov	DWORD(data), [ptr]

	cmp	out_buf, end_ptr
	ja	.overflow

	mov	sym, data
	and	sym, LIT_MASK	; sym has ll_code
	mov	DWORD(sym), [hufftables + _lit_len_table + sym * 4]

	; look up dist sym
	mov	dsym, data
	shr	dsym, DIST_OFFSET
	and	dsym, DIST_MASK
	mov	DWORD(dsym), [hufftables + _dist_table + dsym * 4]

	; insert LL code
	; sym: 31:24 length; 23:0 code
	mov	tmp2, sym
	and	sym, 0xFFFFFF
	SHLX	sym, sym, rcx
	shr	tmp2, 24
	or	bits, sym
	add	rcx, tmp2

	; insert dist code
	movzx	tmp, WORD(dsym)
	SHLX	tmp, tmp, rcx
	or	bits, tmp
	mov	tmp, dsym
	shr	tmp, 24
	add	rcx, tmp

	; insert dist extra bits
	shr	data, EXTRA_BITS_OFFSET
	add	ptr, 4
	SHLX	data, data, rcx
	or	bits, data
	shr	dsym, 16
	and	dsym, 0xFF
	add	rcx, dsym

	; empty bits
	mov	[out_buf], bits
	mov	tmp, rcx
	shr	tmp, 3		; byte count
	add	out_buf, tmp
	mov	tmp, rcx
	and	rcx, ~7
	SHRX	bits, bits, rcx
	mov	rcx, tmp
	and	rcx, 7

	cmp	ptr, in_buf_end
	jb	.finish_loop

.overflow:
	mov	tmp, [rsp + bitbuf_mem_offset]
	mov	[tmp + _m_bits], bits
	mov	[tmp + _m_bit_count], ecx
	mov	[tmp + _m_out_buf], out_buf

	mov	rax, ptr

	FUNC_RESTORE

	ret

section .data
	align 32
max_write_d:
	dd	0x1c, 0x1d, 0x20, 0x20, 0x1e, 0x1e, 0x1e, 0x1e
offset_mask:
	dq	0x0000000000000007, 0x0000000000000000
	dq	0x0000000000000000, 0x0000000000000000
q_64:
	dq	0x0000000000000040, 0x0000000000000000
	dq	0x0000000000000040, 0x0000000000000000
lit_mask:
	dd	LIT_MASK, LIT_MASK, LIT_MASK, LIT_MASK
	dd	LIT_MASK, LIT_MASK, LIT_MASK, LIT_MASK
dist_mask:
	dd	DIST_MASK, DIST_MASK, DIST_MASK, DIST_MASK
	dd	DIST_MASK, DIST_MASK, DIST_MASK, DIST_MASK
lit_icr_mask:
	dd	0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF
	dd	0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF
eb_icr_mask:
	dd	0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF
	dd	0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF