;; NOTE(review): a stray git-blame "Commit | Line | Data" table header from a
;; web scrape was removed here; the remainder of this file is NASM x86-64 asm.
1 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
2 | ; Copyright(c) 2011-2018 Intel Corporation All rights reserved. | |
3 | ; | |
4 | ; Redistribution and use in source and binary forms, with or without | |
5 | ; modification, are permitted provided that the following conditions | |
6 | ; are met: | |
7 | ; * Redistributions of source code must retain the above copyright | |
8 | ; notice, this list of conditions and the following disclaimer. | |
9 | ; * Redistributions in binary form must reproduce the above copyright | |
10 | ; notice, this list of conditions and the following disclaimer in | |
11 | ; the documentation and/or other materials provided with the | |
12 | ; distribution. | |
13 | ; * Neither the name of Intel Corporation nor the names of its | |
14 | ; contributors may be used to endorse or promote products derived | |
15 | ; from this software without specific prior written permission. | |
16 | ; | |
17 | ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
18 | ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
19 | ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
20 | ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
21 | ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
22 | ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
23 | ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
24 | ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
25 | ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
26 | ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
27 | ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
28 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
29 | ||
30 | %include "reg_sizes.asm" | |
31 | %include "lz0a_const.asm" | |
32 | %include "data_struct2.asm" | |
33 | %include "stdmac.asm" | |
34 | ||
224ce89b WB |
35 | %define ARCH 04 |
36 | %define USE_HSWNI | |
37 | ||
f91f0fd5 TL |
38 | ; tree entry is 4 bytes: |
39 | ; lit/len tree (513 entries) | |
40 | ; | 3 | 2 | 1 | 0 | | |
41 | ; | len | code | | |
42 | ; | |
43 | ; dist tree | |
44 | ; | 3 | 2 | 1 | 0 | | |
45 | ; |eblen:codlen| code | | |
46 | ||
47 | ; token format: | |
48 | ; DIST_OFFSET:0 : lit/len | |
49 | ; 31:(DIST_OFFSET + 5) : dist Extra Bits | |
50 | ; (DIST_OFFSET + 5):DIST_OFFSET : dist code | |
51 | ; lit/len: 0-256 (literal) | |
52 | ; 257-512 (dist + 254) | |
53 | ||
54 | ; returns final token pointer | |
55 | ; equal to token_end if successful | |
56 | ; uint32_t* encode_df(uint32_t *token_start, uint32_t *token_end, | |
57 | ; BitBuf *out_buf, uint32_t *trees); | |
58 | ||
59 | %ifidn __OUTPUT_FORMAT__, win64 | |
60 | %define arg1 rcx | |
61 | %define arg2 rdx | |
62 | %define arg3 r8 | |
63 | %define arg4 r9 | |
64 | %define sym rsi | |
65 | %define dsym rdi | |
66 | %define hufftables r9 | |
67 | %define ptr r11 | |
68 | %else | |
69 | ; Linux | |
70 | %define arg1 rdi | |
71 | %define arg2 rsi | |
72 | %define arg3 rdx | |
73 | %define arg4 rcx | |
74 | %define sym r9 | |
75 | %define dsym r8 | |
76 | %define hufftables r11 | |
77 | %define ptr rdi | |
78 | %endif | |
79 | ||
80 | %define in_buf_end arg2 | |
81 | %define bitbuf arg3 | |
82 | %define out_buf bitbuf | |
83 | ; bit_count is rcx | |
84 | %define bits rax | |
85 | %define data r12 | |
86 | %define tmp rbx | |
87 | %define len dsym | |
88 | %define tmp2 r10 | |
89 | %define end_ptr rbp | |
90 | ||
91 | %define LIT_MASK ((0x1 << LIT_LEN_BIT_COUNT) - 1) | |
92 | %define DIST_MASK ((0x1 << DIST_LIT_BIT_COUNT) - 1) | |
93 | ||
94 | %define codes1 ymm1 | |
95 | %define code_lens1 ymm2 | |
96 | %define codes2 ymm3 | |
97 | %define code_lens2 ymm4 | |
98 | %define codes3 ymm5 | |
99 | %define code_lens3 ymm6 | |
100 | %define codes4 ymm7 | |
101 | %define syms ymm7 | |
102 | ||
103 | %define code_lens4 ymm8 | |
104 | %define dsyms ymm8 | |
105 | ||
106 | %define ytmp ymm9 | |
107 | %define codes_lookup1 ymm10 | |
108 | %define codes_lookup2 ymm11 | |
109 | %define datas ymm12 | |
110 | %define ybits ymm13 | |
111 | %define ybits_count ymm14 | |
112 | %define yoffset_mask ymm15 | |
113 | ||
114 | %define VECTOR_SIZE 0x20 | |
115 | %define VECTOR_LOOP_PROCESSED (2 * VECTOR_SIZE) | |
116 | %define VECTOR_SLOP 0x20 - 8 | |
117 | ||
118 | gpr_save_mem_offset equ 0 | |
119 | gpr_save_mem_size equ 8 * 6 | |
120 | xmm_save_mem_offset equ gpr_save_mem_offset + gpr_save_mem_size | |
121 | xmm_save_mem_size equ 10 * 16 | |
122 | bitbuf_mem_offset equ xmm_save_mem_offset + xmm_save_mem_size | |
123 | bitbuf_mem_size equ 8 | |
124 | stack_size equ gpr_save_mem_size + xmm_save_mem_size + bitbuf_mem_size | |
125 | ||
126 | ||
127 | %macro FUNC_SAVE 0 | |
128 | sub rsp, stack_size | |
129 | mov [rsp + gpr_save_mem_offset + 0*8], rbx | |
130 | mov [rsp + gpr_save_mem_offset + 1*8], rbp | |
131 | mov [rsp + gpr_save_mem_offset + 2*8], r12 | |
132 | ||
133 | %ifidn __OUTPUT_FORMAT__, win64 | |
134 | mov [rsp + gpr_save_mem_offset + 3*8], rsi | |
135 | mov [rsp + gpr_save_mem_offset + 4*8], rdi | |
136 | ||
137 | MOVDQU [rsp + xmm_save_mem_offset + 0*8], xmm6 | |
138 | MOVDQU [rsp + xmm_save_mem_offset + 1*8], xmm7 | |
139 | MOVDQU [rsp + xmm_save_mem_offset + 2*8], xmm8 | |
140 | MOVDQU [rsp + xmm_save_mem_offset + 3*8], xmm9 | |
141 | MOVDQU [rsp + xmm_save_mem_offset + 4*8], xmm10 | |
142 | MOVDQU [rsp + xmm_save_mem_offset + 5*8], xmm11 | |
143 | MOVDQU [rsp + xmm_save_mem_offset + 6*8], xmm12 | |
144 | MOVDQU [rsp + xmm_save_mem_offset + 7*8], xmm13 | |
145 | MOVDQU [rsp + xmm_save_mem_offset + 8*8], xmm14 | |
146 | MOVDQU [rsp + xmm_save_mem_offset + 9*8], xmm15 | |
147 | %endif | |
148 | ||
149 | %endm | |
150 | ||
151 | %macro FUNC_RESTORE 0 | |
152 | mov rbx, [rsp + gpr_save_mem_offset + 0*8] | |
153 | mov rbp, [rsp + gpr_save_mem_offset + 1*8] | |
154 | mov r12, [rsp + gpr_save_mem_offset + 2*8] | |
155 | ||
156 | %ifidn __OUTPUT_FORMAT__, win64 | |
157 | mov rsi, [rsp + gpr_save_mem_offset + 3*8] | |
158 | mov rdi, [rsp + gpr_save_mem_offset + 4*8] | |
159 | ||
160 | MOVDQU xmm6, [rsp + xmm_save_mem_offset + 0*8] | |
161 | MOVDQU xmm7, [rsp + xmm_save_mem_offset + 1*8] | |
162 | MOVDQU xmm8, [rsp + xmm_save_mem_offset + 2*8] | |
163 | MOVDQU xmm9, [rsp + xmm_save_mem_offset + 3*8] | |
164 | MOVDQU xmm10, [rsp + xmm_save_mem_offset + 4*8] | |
165 | MOVDQU xmm11, [rsp + xmm_save_mem_offset + 5*8] | |
166 | MOVDQU xmm12, [rsp + xmm_save_mem_offset + 6*8] | |
167 | MOVDQU xmm13, [rsp + xmm_save_mem_offset + 7*8] | |
168 | MOVDQU xmm14, [rsp + xmm_save_mem_offset + 8*8] | |
169 | MOVDQU xmm15, [rsp + xmm_save_mem_offset + 9*8] | |
170 | %endif | |
171 | add rsp, stack_size | |
172 | ||
173 | %endmacro | |
174 | ||
175 | global encode_deflate_icf_ %+ ARCH | |
176 | encode_deflate_icf_ %+ ARCH: | |
177 | FUNC_SAVE | |
178 | ||
179 | %ifnidn ptr, arg1 | |
180 | mov ptr, arg1 | |
181 | %endif | |
182 | %ifnidn hufftables, arg4 | |
183 | mov hufftables, arg4 | |
184 | %endif | |
185 | ||
186 | mov [rsp + bitbuf_mem_offset], bitbuf | |
187 | mov bits, [bitbuf + _m_bits] | |
188 | mov ecx, [bitbuf + _m_bit_count] | |
189 | mov end_ptr, [bitbuf + _m_out_end] | |
190 | mov out_buf, [bitbuf + _m_out_buf] ; clobbers bitbuf | |
191 | ||
192 | sub end_ptr, VECTOR_SLOP | |
193 | sub in_buf_end, VECTOR_LOOP_PROCESSED | |
194 | cmp ptr, in_buf_end | |
195 | jge .finish | |
196 | ||
197 | vpcmpeqq ytmp, ytmp, ytmp | |
198 | vmovdqu datas, [ptr] | |
199 | vpand syms, datas, [lit_mask] | |
200 | vpgatherdd codes_lookup1, [hufftables + _lit_len_table + 4 * syms], ytmp | |
201 | ||
202 | vpcmpeqq ytmp, ytmp, ytmp | |
203 | vpsrld dsyms, datas, DIST_OFFSET | |
204 | vpand dsyms, dsyms, [dist_mask] | |
205 | vpgatherdd codes_lookup2, [hufftables + _dist_table + 4 * dsyms], ytmp | |
206 | ||
207 | vmovq ybits %+ x, bits | |
208 | vmovq ybits_count %+ x, rcx | |
209 | vmovdqa yoffset_mask, [offset_mask] | |
210 | ||
211 | .main_loop: | |
212 | ;; Sets codes1 to contain lit/len codes andcode_lens1 the corresponding lengths | |
213 | vpsrld code_lens1, codes_lookup1, 24 | |
214 | vpand codes1, codes_lookup1, [lit_icr_mask] | |
215 | ||
216 | ;; Sets codes2 to contain dist codes, code_lens2 the corresponding lengths, | |
217 | ;; and code_lens3 the extra bit counts | |
218 | vpblendw codes2, ybits, codes_lookup2, 0x55 ;Bits 8 and above of ybits are 0 | |
219 | vpsrld code_lens2, codes_lookup2, 24 | |
220 | vpsrld code_lens3, codes_lookup2, 16 | |
221 | vpand code_lens3, [eb_icr_mask] | |
222 | ||
223 | ;; Set codes3 to contain the extra bits | |
224 | vpsrld codes3, datas, EXTRA_BITS_OFFSET | |
225 | ||
226 | cmp out_buf, end_ptr | |
227 | ja .main_loop_exit | |
228 | ||
229 | ;; Start code lookups for next iteration | |
230 | add ptr, VECTOR_SIZE | |
231 | vpcmpeqq ytmp, ytmp, ytmp | |
232 | vmovdqu datas, [ptr] | |
233 | vpand syms, datas, [lit_mask] | |
234 | vpgatherdd codes_lookup1, [hufftables + _lit_len_table + 4 * syms], ytmp | |
235 | ||
236 | vpcmpeqq ytmp, ytmp, ytmp | |
237 | vpsrld dsyms, datas, DIST_OFFSET | |
238 | vpand dsyms, dsyms, [dist_mask] | |
239 | vpgatherdd codes_lookup2, [hufftables + _dist_table + 4 * dsyms], ytmp | |
240 | ||
241 | ;; Merge dist code with extra bits | |
242 | vpsllvd codes3, codes3, code_lens2 | |
243 | vpxor codes2, codes2, codes3 | |
244 | vpaddd code_lens2, code_lens2, code_lens3 | |
245 | ||
246 | ;; Check for long codes | |
247 | vpaddd code_lens3, code_lens1, code_lens2 | |
248 | vpcmpgtd ytmp, code_lens3, [max_write_d] | |
249 | vptest ytmp, ytmp | |
250 | jnz .long_codes | |
251 | ||
252 | ;; Merge dist and len codes | |
253 | vpsllvd codes2, codes2, code_lens1 | |
254 | vpxor codes1, codes1, codes2 | |
255 | ||
256 | ;; Split buffer data into qwords, ytmp is 0 after last branch | |
257 | vpblendd codes3, ytmp, codes1, 0x55 | |
258 | vpsrlq codes1, codes1, 32 | |
259 | vpsrlq code_lens1, code_lens3, 32 | |
260 | vpblendd code_lens3, ytmp, code_lens3, 0x55 | |
261 | ||
262 | ;; Merge bitbuf bits | |
263 | vpsllvq codes3, codes3, ybits_count | |
264 | vpxor codes3, codes3, ybits | |
265 | vpaddq code_lens3, code_lens3, ybits_count | |
266 | ||
267 | ;; Merge two symbols into qwords | |
268 | vpsllvq codes1, codes1, code_lens3 | |
269 | vpxor codes1, codes1, codes3 | |
270 | vpaddq code_lens1, code_lens1, code_lens3 | |
271 | ||
272 | ;; Split buffer data into dqwords, ytmp is 0 after last branch | |
273 | vpblendd codes2, ytmp, codes1, 0x33 | |
274 | vpblendd code_lens2, ytmp, code_lens1, 0x33 | |
275 | vpsrldq codes1, 8 | |
276 | vpsrldq code_lens1, 8 | |
277 | ||
278 | ;; Bit align dqwords | |
279 | vpaddq code_lens1, code_lens1, code_lens2 | |
280 | vpand ybits_count, code_lens1, yoffset_mask ;Extra bits | |
281 | vpermq ybits_count, ybits_count, 0xcf | |
282 | vpaddq code_lens2, ybits_count | |
283 | vpsllvq codes2, codes2, ybits_count | |
284 | ||
285 | ;; Merge two qwords into dqwords | |
286 | vmovdqa ytmp, [q_64] | |
287 | vpsubq code_lens3, ytmp, code_lens2 | |
288 | vpsrlvq codes3, codes1, code_lens3 | |
289 | vpslldq codes3, codes3, 8 | |
290 | ||
291 | vpsllvq codes1, codes1, code_lens2 | |
292 | ||
293 | vpxor codes1, codes1, codes3 | |
294 | vpxor codes1, codes1, codes2 | |
295 | ||
296 | vmovq tmp, code_lens1 %+ x ;Number of bytes | |
297 | shr tmp, 3 | |
298 | ||
299 | ;; Extract last bytes | |
300 | vpaddq code_lens2, code_lens1, ybits_count | |
301 | vpsrlq code_lens2, code_lens2, 3 | |
302 | vpshufb codes2, codes1, code_lens2 | |
303 | vpand codes2, codes2, [bytes_mask] | |
304 | vextracti128 ybits %+ x, codes2, 1 | |
305 | ||
306 | ;; Check for short codes | |
307 | vptest code_lens2, [min_write_mask] | |
308 | jz .short_codes | |
309 | .short_codes_next: | |
310 | ||
311 | vpermq codes2, codes2, 0x45 | |
312 | vpor codes1, codes1, codes2 | |
313 | ||
314 | ;; bit shift upper dqword combined bits to line up with lower dqword | |
315 | vextracti128 code_lens2 %+ x, code_lens1, 1 | |
316 | ||
317 | ; Write out lower dqword of combined bits | |
318 | vmovdqu [out_buf], codes1 | |
319 | vpaddq code_lens1, code_lens1, code_lens2 | |
320 | ||
321 | vmovq tmp2, code_lens1 %+ x ;Number of bytes | |
322 | shr tmp2, 3 | |
323 | vpand ybits_count, code_lens1, yoffset_mask ;Extra bits | |
324 | ||
325 | ; Write out upper dqword of combined bits | |
326 | vextracti128 [out_buf + tmp], codes1, 1 | |
327 | add out_buf, tmp2 | |
328 | ||
329 | cmp ptr, in_buf_end | |
330 | jbe .main_loop | |
331 | ||
332 | .main_loop_exit: | |
333 | vmovq rcx, ybits_count %+ x | |
334 | vmovq bits, ybits %+ x | |
335 | jmp .finish | |
336 | ||
337 | .short_codes: | |
338 | ;; Merge last bytes when the second dqword contains less than a byte | |
339 | vpor ybits %+ x, codes2 %+ x | |
340 | jmp .short_codes_next | |
341 | ||
342 | .long_codes: | |
343 | add end_ptr, VECTOR_SLOP | |
344 | sub ptr, VECTOR_SIZE | |
345 | ||
346 | vpxor ytmp, ytmp, ytmp | |
347 | vpblendd codes3, ytmp, codes1, 0x55 | |
348 | vpblendd code_lens3, ytmp, code_lens1, 0x55 | |
349 | vpblendd codes4, ytmp, codes2, 0x55 | |
350 | ||
351 | vpsllvq codes4, codes4, code_lens3 | |
352 | vpxor codes3, codes3, codes4 | |
353 | vpaddd code_lens3, code_lens1, code_lens2 | |
354 | ||
355 | vpsrlq codes1, codes1, 32 | |
356 | vpsrlq code_lens1, code_lens1, 32 | |
357 | vpsrlq codes2, codes2, 32 | |
358 | ||
359 | vpsllvq codes2, codes2, code_lens1 | |
360 | vpxor codes1, codes1, codes2 | |
361 | ||
362 | vpsrlq code_lens1, code_lens3, 32 | |
363 | vpblendd code_lens3, ytmp, code_lens3, 0x55 | |
364 | ||
365 | ;; Merge bitbuf bits | |
366 | vpsllvq codes3, codes3, ybits_count | |
367 | vpxor codes3, codes3, ybits | |
368 | vpaddq code_lens3, code_lens3, ybits_count | |
369 | vpaddq code_lens1, code_lens1, code_lens3 | |
370 | ||
371 | xor bits, bits | |
372 | xor rcx, rcx | |
373 | vpsubq code_lens1, code_lens1, code_lens3 | |
374 | %rep 2 | |
375 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
376 | cmp out_buf, end_ptr | |
377 | ja .overflow | |
378 | ;; insert LL code | |
379 | vmovq sym, codes3 %+ x | |
380 | vmovq tmp2, code_lens3 %+ x | |
381 | SHLX sym, sym, rcx | |
382 | or bits, sym | |
383 | add rcx, tmp2 | |
384 | ||
385 | ; empty bits | |
386 | mov [out_buf], bits | |
387 | mov tmp, rcx | |
388 | shr tmp, 3 ; byte count | |
389 | add out_buf, tmp | |
390 | mov tmp, rcx | |
391 | and rcx, ~7 | |
392 | SHRX bits, bits, rcx | |
393 | mov rcx, tmp | |
394 | and rcx, 7 | |
395 | add ptr, 4 | |
396 | ||
397 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
398 | cmp out_buf, end_ptr | |
399 | ja .overflow | |
400 | ;; insert LL code | |
401 | vmovq sym, codes1 %+ x | |
402 | vmovq tmp2, code_lens1 %+ x | |
403 | SHLX sym, sym, rcx | |
404 | or bits, sym | |
405 | add rcx, tmp2 | |
406 | ||
407 | ; empty bits | |
408 | mov [out_buf], bits | |
409 | mov tmp, rcx | |
410 | shr tmp, 3 ; byte count | |
411 | add out_buf, tmp | |
412 | mov tmp, rcx | |
413 | and rcx, ~7 | |
414 | SHRX bits, bits, rcx | |
415 | mov rcx, tmp | |
416 | and rcx, 7 | |
417 | add ptr, 4 | |
418 | ||
419 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
420 | cmp out_buf, end_ptr | |
421 | ja .overflow | |
422 | ;; insert LL code | |
423 | vpextrq sym, codes3 %+ x, 1 | |
424 | vpextrq tmp2, code_lens3 %+ x, 1 | |
425 | SHLX sym, sym, rcx | |
426 | or bits, sym | |
427 | add rcx, tmp2 | |
428 | ||
429 | ; empty bits | |
430 | mov [out_buf], bits | |
431 | mov tmp, rcx | |
432 | shr tmp, 3 ; byte count | |
433 | add out_buf, tmp | |
434 | mov tmp, rcx | |
435 | and rcx, ~7 | |
436 | SHRX bits, bits, rcx | |
437 | mov rcx, tmp | |
438 | and rcx, 7 | |
439 | add ptr, 4 | |
440 | ||
441 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
442 | cmp out_buf, end_ptr | |
443 | ja .overflow | |
444 | ;; insert LL code | |
445 | vpextrq sym, codes1 %+ x, 1 | |
446 | vpextrq tmp2, code_lens1 %+ x, 1 | |
447 | SHLX sym, sym, rcx | |
448 | or bits, sym | |
449 | add rcx, tmp2 | |
450 | ||
451 | ; empty bits | |
452 | mov [out_buf], bits | |
453 | mov tmp, rcx | |
454 | shr tmp, 3 ; byte count | |
455 | add out_buf, tmp | |
456 | mov tmp, rcx | |
457 | and rcx, ~7 | |
458 | SHRX bits, bits, rcx | |
459 | mov rcx, tmp | |
460 | and rcx, 7 | |
461 | add ptr, 4 | |
462 | ||
463 | vextracti128 codes3 %+ x, codes3, 1 | |
464 | vextracti128 code_lens3 %+ x, code_lens3, 1 | |
465 | vextracti128 codes1 %+ x, codes1, 1 | |
466 | vextracti128 code_lens1 %+ x, code_lens1, 1 | |
467 | %endrep | |
468 | sub end_ptr, VECTOR_SLOP | |
469 | ||
470 | vmovq ybits %+ x, bits | |
471 | vmovq ybits_count %+ x, rcx | |
472 | cmp ptr, in_buf_end | |
473 | jbe .main_loop | |
474 | ||
475 | .finish: | |
476 | add in_buf_end, VECTOR_LOOP_PROCESSED | |
477 | add end_ptr, VECTOR_SLOP | |
478 | ||
479 | cmp ptr, in_buf_end | |
480 | jge .overflow | |
481 | ||
482 | .finish_loop: | |
483 | mov DWORD(data), [ptr] | |
484 | ||
485 | cmp out_buf, end_ptr | |
486 | ja .overflow | |
487 | ||
488 | mov sym, data | |
489 | and sym, LIT_MASK ; sym has ll_code | |
490 | mov DWORD(sym), [hufftables + _lit_len_table + sym * 4] | |
491 | ||
492 | ; look up dist sym | |
493 | mov dsym, data | |
494 | shr dsym, DIST_OFFSET | |
495 | and dsym, DIST_MASK | |
496 | mov DWORD(dsym), [hufftables + _dist_table + dsym * 4] | |
497 | ||
498 | ; insert LL code | |
499 | ; sym: 31:24 length; 23:0 code | |
500 | mov tmp2, sym | |
501 | and sym, 0xFFFFFF | |
502 | SHLX sym, sym, rcx | |
503 | shr tmp2, 24 | |
504 | or bits, sym | |
505 | add rcx, tmp2 | |
506 | ||
507 | ; insert dist code | |
508 | movzx tmp, WORD(dsym) | |
509 | SHLX tmp, tmp, rcx | |
510 | or bits, tmp | |
511 | mov tmp, dsym | |
512 | shr tmp, 24 | |
513 | add rcx, tmp | |
514 | ||
515 | ; insert dist extra bits | |
516 | shr data, EXTRA_BITS_OFFSET | |
517 | add ptr, 4 | |
518 | SHLX data, data, rcx | |
519 | or bits, data | |
520 | shr dsym, 16 | |
521 | and dsym, 0xFF | |
522 | add rcx, dsym | |
523 | ||
524 | ; empty bits | |
525 | mov [out_buf], bits | |
526 | mov tmp, rcx | |
527 | shr tmp, 3 ; byte count | |
528 | add out_buf, tmp | |
529 | mov tmp, rcx | |
530 | and rcx, ~7 | |
531 | SHRX bits, bits, rcx | |
532 | mov rcx, tmp | |
533 | and rcx, 7 | |
534 | ||
535 | cmp ptr, in_buf_end | |
536 | jb .finish_loop | |
537 | ||
538 | .overflow: | |
539 | mov tmp, [rsp + bitbuf_mem_offset] | |
540 | mov [tmp + _m_bits], bits | |
541 | mov [tmp + _m_bit_count], ecx | |
542 | mov [tmp + _m_out_buf], out_buf | |
543 | ||
544 | mov rax, ptr | |
545 | ||
546 | FUNC_RESTORE | |
547 | ||
548 | ret | |
549 | ||
550 | section .data | |
551 | align 32 | |
552 | max_write_d: | |
553 | dd 0x1c, 0x1d, 0x1f, 0x20, 0x1c, 0x1d, 0x1f, 0x20 | |
554 | min_write_mask: | |
555 | dq 0x00, 0x00, 0xff, 0x00 | |
556 | offset_mask: | |
557 | dq 0x0000000000000007, 0x0000000000000000 | |
558 | dq 0x0000000000000000, 0x0000000000000000 | |
559 | q_64: | |
560 | dq 0x0000000000000040, 0x0000000000000000 | |
561 | dq 0x0000000000000040, 0x0000000000000000 | |
562 | lit_mask: | |
563 | dd LIT_MASK, LIT_MASK, LIT_MASK, LIT_MASK | |
564 | dd LIT_MASK, LIT_MASK, LIT_MASK, LIT_MASK | |
565 | dist_mask: | |
566 | dd DIST_MASK, DIST_MASK, DIST_MASK, DIST_MASK | |
567 | dd DIST_MASK, DIST_MASK, DIST_MASK, DIST_MASK | |
568 | lit_icr_mask: | |
569 | dd 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF | |
570 | dd 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF | |
571 | eb_icr_mask: | |
572 | dd 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF | |
573 | dd 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF | |
574 | bytes_mask: | |
575 | dq 0x00000000000000ff, 0x0000000000000000 | |
576 | dq 0x00000000000000ff, 0x0000000000000000 |