1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2018 Intel Corporation All rights reserved.
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
30 %include "reg_sizes.asm"
31 %include "lz0a_const.asm"
32 %include "data_struct2.asm"
38 ; tree entry is 4 bytes:
39 ; lit/len tree (513 entries)
45 ; |eblen:codlen| code |
48 ; DIST_OFFSET:0 : lit/len
49 ; 31:(DIST_OFFSET + 5) : dist Extra Bits
50 ; (DIST_OFFSET + 5):DIST_OFFSET : dist code
51 ; lit/len: 0-256 (literal)
52 ; 257-512 (dist + 254)
54 ; returns final token pointer
55 ; equal to token_end if successful
56 ; uint32_t* encode_df(uint32_t *token_start, uint32_t *token_end,
57 ; BitBuf *out_buf, uint32_t *trees);
59 %ifidn __OUTPUT_FORMAT__, win64
76 %define hufftables r11
80 %define in_buf_end arg2
82 %define out_buf bitbuf
; Masks to extract the lit/len and dist fields from a 32-bit ICF token.
91 %define LIT_MASK ((0x1 << LIT_LEN_BIT_COUNT) - 1)
92 %define DIST_MASK ((0x1 << DIST_LIT_BIT_COUNT) - 1)
; ymm register roles used by the vector loop below.
95 %define code_lens1 ymm2
97 %define code_lens2 ymm4
99 %define code_lens3 ymm6
103 %define code_lens4 ymm8
107 %define codes_lookup1 ymm10
108 %define codes_lookup2 ymm11
111 %define ybits_count ymm14
112 %define yoffset_mask ymm15
; One ymm vector = 32 bytes; each loop iteration consumes two vectors of tokens.
114 %define VECTOR_SIZE 0x20
115 %define VECTOR_LOOP_PROCESSED (2 * VECTOR_SIZE)
; Output-buffer slack reserved so a full-vector store near the end cannot
; overrun. Parenthesized: an expression %define must be self-contained so it
; expands safely in any operand context.
116 %define VECTOR_SLOP (0x20 - 8)
; Stack frame layout: 6 saved GPR slots, 10 x 16-byte xmm slots (win64
; callee-saved xmm6-xmm15), then the saved BitBuf pointer.
118 gpr_save_mem_offset equ 0
119 gpr_save_mem_size equ 8 * 6
120 xmm_save_mem_offset equ gpr_save_mem_offset + gpr_save_mem_size
121 xmm_save_mem_size equ 10 * 16
122 bitbuf_mem_offset equ xmm_save_mem_offset + xmm_save_mem_size
123 bitbuf_mem_size equ 8
124 stack_size equ gpr_save_mem_size + xmm_save_mem_size + bitbuf_mem_size
; Save callee-saved GPRs into the frame (rbx/rbp/r12 are callee-saved in
; both SysV and win64 ABIs).
129 mov [rsp + gpr_save_mem_offset + 0*8], rbx
130 mov [rsp + gpr_save_mem_offset + 1*8], rbp
131 mov [rsp + gpr_save_mem_offset + 2*8], r12
133 %ifidn __OUTPUT_FORMAT__, win64
; win64 additionally treats rsi/rdi and xmm6-xmm15 as callee-saved.
134 mov [rsp + gpr_save_mem_offset + 3*8], rsi
135 mov [rsp + gpr_save_mem_offset + 4*8], rdi
; FIX: xmm slots must be 16 bytes apart (MOVDQU stores 16 bytes). The old
; N*8 stride made each store overwrite the upper half of the previous
; register's slot, corrupting xmm6-xmm14 on restore; the frame already
; reserves 10*16 bytes (xmm_save_mem_size) for exactly this spacing.
137 MOVDQU [rsp + xmm_save_mem_offset + 0*16], xmm6
138 MOVDQU [rsp + xmm_save_mem_offset + 1*16], xmm7
139 MOVDQU [rsp + xmm_save_mem_offset + 2*16], xmm8
140 MOVDQU [rsp + xmm_save_mem_offset + 3*16], xmm9
141 MOVDQU [rsp + xmm_save_mem_offset + 4*16], xmm10
142 MOVDQU [rsp + xmm_save_mem_offset + 5*16], xmm11
143 MOVDQU [rsp + xmm_save_mem_offset + 6*16], xmm12
144 MOVDQU [rsp + xmm_save_mem_offset + 7*16], xmm13
145 MOVDQU [rsp + xmm_save_mem_offset + 8*16], xmm14
146 MOVDQU [rsp + xmm_save_mem_offset + 9*16], xmm15
; Restore everything FUNC_SAVE preserved; offsets must mirror the saves.
151 %macro FUNC_RESTORE 0
152 mov rbx, [rsp + gpr_save_mem_offset + 0*8]
153 mov rbp, [rsp + gpr_save_mem_offset + 1*8]
154 mov r12, [rsp + gpr_save_mem_offset + 2*8]
156 %ifidn __OUTPUT_FORMAT__, win64
157 mov rsi, [rsp + gpr_save_mem_offset + 3*8]
158 mov rdi, [rsp + gpr_save_mem_offset + 4*8]
; 16-byte stride to match the (fixed) save layout above.
160 MOVDQU xmm6, [rsp + xmm_save_mem_offset + 0*16]
161 MOVDQU xmm7, [rsp + xmm_save_mem_offset + 1*16]
162 MOVDQU xmm8, [rsp + xmm_save_mem_offset + 2*16]
163 MOVDQU xmm9, [rsp + xmm_save_mem_offset + 3*16]
164 MOVDQU xmm10, [rsp + xmm_save_mem_offset + 4*16]
165 MOVDQU xmm11, [rsp + xmm_save_mem_offset + 5*16]
166 MOVDQU xmm12, [rsp + xmm_save_mem_offset + 6*16]
167 MOVDQU xmm13, [rsp + xmm_save_mem_offset + 7*16]
168 MOVDQU xmm14, [rsp + xmm_save_mem_offset + 8*16]
169 MOVDQU xmm15, [rsp + xmm_save_mem_offset + 9*16]
;-----------------------------------------------------------------------
; uint32_t *encode_deflate_icf_<ARCH>(uint32_t *token_start,
;                                     uint32_t *token_end,
;                                     BitBuf *out_buf, uint32_t *trees)
; NOTE(review): signature presumed from the encode_df() prototype in the
; header comment of this file — confirm against the C declaration.
; AVX2 encoder: translates 32-bit ICF tokens into Huffman-coded bits via
; table lookups in hufftables, merges the variable-length codes into a
; contiguous bitstream, and writes it to the BitBuf's output buffer.
;-----------------------------------------------------------------------
175 global encode_deflate_icf_ %+ ARCH
176 encode_deflate_icf_ %+ ARCH:
182 %ifnidn hufftables, arg4
; Save the BitBuf pointer (it is clobbered below) and load its state:
; pending bits, pending bit count, output end, and output write cursor.
186 mov [rsp + bitbuf_mem_offset], bitbuf
187 mov bits, [bitbuf + _m_bits]
188 mov ecx, [bitbuf + _m_bit_count]
189 mov end_ptr, [bitbuf + _m_out_end]
190 mov out_buf, [bitbuf + _m_out_buf] ; clobbers bitbuf
; Reserve write slack and bias the input end so the vector loop can test
; a full two-vector iteration against a single pointer compare.
192 sub end_ptr, VECTOR_SLOP
193 sub in_buf_end, VECTOR_LOOP_PROCESSED
; vpcmpeqq reg,reg,reg = all-ones: the gather mask (vpgatherdd consumes
; and zeroes its mask, so it must be re-armed before every gather).
197 vpcmpeqq ytmp, ytmp, ytmp
199 vpand syms, datas, [lit_mask]
200 vpgatherdd codes_lookup1, [hufftables + _lit_len_table + 4 * syms], ytmp
202 vpcmpeqq ytmp, ytmp, ytmp
203 vpsrld dsyms, datas, DIST_OFFSET
204 vpand dsyms, dsyms, [dist_mask]
205 vpgatherdd codes_lookup2, [hufftables + _dist_table + 4 * dsyms], ytmp
; Seed the vector bit accumulator with the BitBuf's pending bits/count.
207 vmovq ybits %+ x, bits
208 vmovq ybits_count %+ x, rcx
209 vmovdqa yoffset_mask, [offset_mask]
212 ;; Sets codes1 to contain lit/len codes and code_lens1 the corresponding lengths
213 vpsrld code_lens1, codes_lookup1, 24
214 vpand codes1, codes_lookup1, [lit_icr_mask]
216 ;; Sets codes2 to contain dist codes, code_lens2 the corresponding lengths,
217 ;; and code_lens3 the extra bit counts
218 vpblendw codes2, ybits, codes_lookup2, 0x55 ;Bits 8 and above of ybits are 0
219 vpsrld code_lens2, codes_lookup2, 24
220 vpsrld code_lens3, codes_lookup2, 16
221 vpand code_lens3, [eb_icr_mask]
223 ;; Set codes3 to contain the extra bits
224 vpsrld codes3, datas, EXTRA_BITS_OFFSET
229 ;; Start code lookups for next iteration
231 vpcmpeqq ytmp, ytmp, ytmp
233 vpand syms, datas, [lit_mask]
234 vpgatherdd codes_lookup1, [hufftables + _lit_len_table + 4 * syms], ytmp
236 vpcmpeqq ytmp, ytmp, ytmp
237 vpsrld dsyms, datas, DIST_OFFSET
238 vpand dsyms, dsyms, [dist_mask]
239 vpgatherdd codes_lookup2, [hufftables + _dist_table + 4 * dsyms], ytmp
241 ;; Merge dist code with extra bits
242 vpsllvd codes3, codes3, code_lens2
243 vpxor codes2, codes2, codes3
244 vpaddd code_lens2, code_lens2, code_lens3
246 ;; Check for long codes
; Total lit/len + dist bits per lane; lanes exceeding max_write_d take
; the slow (long-code) path handled elsewhere.
247 vpaddd code_lens3, code_lens1, code_lens2
248 vpcmpgtd ytmp, code_lens3, [max_write_d]
252 ;; Merge dist and len codes
253 vpsllvd codes2, codes2, code_lens1
254 vpxor codes1, codes1, codes2
256 ;; Split buffer data into qwords, ytmp is 0 after last branch
257 vpblendd codes3, ytmp, codes1, 0x55
258 vpsrlq codes1, codes1, 32
259 vpsrlq code_lens1, code_lens3, 32
260 vpblendd code_lens3, ytmp, code_lens3, 0x55
; Fold the carried-in pending bits into the even-lane qwords first.
263 vpsllvq codes3, codes3, ybits_count
264 vpxor codes3, codes3, ybits
265 vpaddq code_lens3, code_lens3, ybits_count
267 ;; Merge two symbols into qwords
268 vpsllvq codes1, codes1, code_lens3
269 vpxor codes1, codes1, codes3
270 vpaddq code_lens1, code_lens1, code_lens3
272 ;; Split buffer data into dqwords, ytmp is 0 after last branch
273 vpblendd codes2, ytmp, codes1, 0x33
274 vpblendd code_lens2, ytmp, code_lens1, 0x33
276 vpsrldq code_lens1, 8
279 vpaddq code_lens1, code_lens1, code_lens2
280 vpand ybits_count, code_lens1, yoffset_mask ;Extra bits
; NOTE(review): 0xcf permute repositions the residual bit counts across
; lanes for the dqword merge — confirm lane mapping against offset_mask.
281 vpermq ybits_count, ybits_count, 0xcf
282 vpaddq code_lens2, ybits_count
283 vpsllvq codes2, codes2, ybits_count
285 ;; Merge two qwords into dqwords
; Shift the high qword right by (64 - count) to capture carry-over bits,
; then slide it up 8 bytes and OR everything together.
287 vpsubq code_lens3, ytmp, code_lens2
288 vpsrlvq codes3, codes1, code_lens3
289 vpslldq codes3, codes3, 8
291 vpsllvq codes1, codes1, code_lens2
293 vpxor codes1, codes1, codes3
294 vpxor codes1, codes1, codes2
296 vmovq tmp, code_lens1 %+ x ;Number of bytes
299 ;; Extract last bytes
300 vpaddq code_lens2, code_lens1, ybits_count
301 vpsrlq code_lens2, code_lens2, 3
302 vpshufb codes2, codes1, code_lens2
303 vpand codes2, codes2, [bytes_mask]
304 vextracti128 ybits %+ x, codes2, 1
306 ;; Check for short codes
307 vptest code_lens2, [min_write_mask]
311 vpermq codes2, codes2, 0x45
312 vpor codes1, codes1, codes2
314 ;; bit shift upper dqword combined bits to line up with lower dqword
315 vextracti128 code_lens2 %+ x, code_lens1, 1
317 ; Write out lower dqword of combined bits
318 vmovdqu [out_buf], codes1
319 vpaddq code_lens1, code_lens1, code_lens2
321 vmovq tmp2, code_lens1 %+ x ;Number of bytes
323 vpand ybits_count, code_lens1, yoffset_mask ;Extra bits
325 ; Write out upper dqword of combined bits
; tmp = whole bytes already emitted by the lower dqword store.
326 vextracti128 [out_buf + tmp], codes1, 1
; Move the leftover (sub-byte) bit state back into scalar registers.
333 vmovq rcx, ybits_count %+ x
334 vmovq bits, ybits %+ x
338 ;; Merge last bytes when the second dqword contains less than a byte
339 vpor ybits %+ x, codes2 %+ x
340 jmp .short_codes_next
; Undo the end-pointer bias applied at entry to the vector loop.
343 add end_ptr, VECTOR_SLOP
; Long-code fallback: re-merge the lanes with 64-bit precision.
346 vpxor ytmp, ytmp, ytmp
347 vpblendd codes3, ytmp, codes1, 0x55
348 vpblendd code_lens3, ytmp, code_lens1, 0x55
349 vpblendd codes4, ytmp, codes2, 0x55
351 vpsllvq codes4, codes4, code_lens3
352 vpxor codes3, codes3, codes4
353 vpaddd code_lens3, code_lens1, code_lens2
355 vpsrlq codes1, codes1, 32
356 vpsrlq code_lens1, code_lens1, 32
357 vpsrlq codes2, codes2, 32
359 vpsllvq codes2, codes2, code_lens1
360 vpxor codes1, codes1, codes2
362 vpsrlq code_lens1, code_lens3, 32
363 vpblendd code_lens3, ytmp, code_lens3, 0x55
366 vpsllvq codes3, codes3, ybits_count
367 vpxor codes3, codes3, ybits
368 vpaddq code_lens3, code_lens3, ybits_count
369 vpaddq code_lens1, code_lens1, code_lens3
373 vpsubq code_lens1, code_lens1, code_lens3
375 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Emit the four merged qword lanes one at a time (sym = code bits,
; tmp2 = bit count for that lane); shr tmp,3 converts bits to bytes.
379 vmovq sym, codes3 %+ x
380 vmovq tmp2, code_lens3 %+ x
388 shr tmp, 3 ; byte count
397 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
401 vmovq sym, codes1 %+ x
402 vmovq tmp2, code_lens1 %+ x
410 shr tmp, 3 ; byte count
419 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
423 vpextrq sym, codes3 %+ x, 1
424 vpextrq tmp2, code_lens3 %+ x, 1
432 shr tmp, 3 ; byte count
441 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
445 vpextrq sym, codes1 %+ x, 1
446 vpextrq tmp2, code_lens1 %+ x, 1
454 shr tmp, 3 ; byte count
; Advance to the upper 128-bit halves and repeat the per-lane emission.
463 vextracti128 codes3 %+ x, codes3, 1
464 vextracti128 code_lens3 %+ x, code_lens3, 1
465 vextracti128 codes1 %+ x, codes1, 1
466 vextracti128 code_lens1 %+ x, code_lens1, 1
468 sub end_ptr, VECTOR_SLOP
470 vmovq ybits %+ x, bits
471 vmovq ybits_count %+ x, rcx
; Restore the biased pointers before the scalar finish-up path.
476 add in_buf_end, VECTOR_LOOP_PROCESSED
477 add end_ptr, VECTOR_SLOP
; Scalar tail: encode one ICF token per iteration (presumably the
; remainder loop for fewer tokens than a full vector — labels not
; visible in this chunk; confirm).
483 mov DWORD(data), [ptr]
489 and sym, LIT_MASK ; sym has ll_code
490 mov DWORD(sym), [hufftables + _lit_len_table + sym * 4]
494 shr dsym, DIST_OFFSET
496 mov DWORD(dsym), [hufftables + _dist_table + dsym * 4]
499 ; sym: 31:24 length; 23:0 code
508 movzx tmp, WORD(dsym)
515 ; insert dist extra bits
516 shr data, EXTRA_BITS_OFFSET
527 shr tmp, 3 ; byte count
; Write the final bit-buffer state back into the caller's BitBuf.
539 mov tmp, [rsp + bitbuf_mem_offset]
540 mov [tmp + _m_bits], bits
541 mov [tmp + _m_bit_count], ecx
542 mov [tmp + _m_out_buf], out_buf
; NOTE(review): the labels for the constant tables below are not visible
; in this chunk; the pairings suggested here are inferred from the memory
; operands referenced by the code above (max_write_d, min_write_mask,
; offset_mask, lit_mask, dist_mask, lit_icr_mask, eb_icr_mask,
; bytes_mask) — confirm against the full file before relying on them.
; Presumably max_write_d: per-dword bit-length thresholds for the
; long-code check (vpcmpgtd against code_lens3).
553 dd 0x1c, 0x1d, 0x1f, 0x20, 0x1c, 0x1d, 0x1f, 0x20
; Presumably min_write_mask: used by vptest in the short-code check.
555 dq 0x00, 0x00, 0xff, 0x00
; Presumably offset_mask: keeps the low 3 bits (bit offset within a byte).
557 dq 0x0000000000000007, 0x0000000000000000
558 dq 0x0000000000000000, 0x0000000000000000
; Presumably max_write: 64 = qword bit capacity.
560 dq 0x0000000000000040, 0x0000000000000000
561 dq 0x0000000000000040, 0x0000000000000000
; lit_mask: broadcast LIT_MASK across all 8 dword lanes.
563 dd LIT_MASK, LIT_MASK, LIT_MASK, LIT_MASK
564 dd LIT_MASK, LIT_MASK, LIT_MASK, LIT_MASK
; dist_mask: broadcast DIST_MASK across all 8 dword lanes.
566 dd DIST_MASK, DIST_MASK, DIST_MASK, DIST_MASK
567 dd DIST_MASK, DIST_MASK, DIST_MASK, DIST_MASK
; Presumably lit_icr_mask: low 24 bits of a lit/len table entry (the code;
; bits 31:24 hold the length, per the shifts by 24 above).
569 dd 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF
570 dd 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF
; Presumably eb_icr_mask: isolates the extra-bit-count byte after shr 16.
572 dd 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF
573 dd 0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF
; Presumably bytes_mask: keeps the single last byte extracted by vpshufb.
575 dq 0x00000000000000ff, 0x0000000000000000
576 dq 0x00000000000000ff, 0x0000000000000000