1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29 %include "options.asm"
31 %include "lz0a_const.asm"
32 %include "data_struct2.asm"
33 %include "bitbuf2.asm"
34 %include "huffman.asm"
35 %include "igzip_compare_types.asm"
36 %include "reg_sizes.asm"
50 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
51 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
52 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Register aliases. Each %define gives a symbolic name to a physical register.
; Other aliases used by the code below (stream, tmp1, f_i, hash, ...) are
; defined on lines that are not part of this view.
53 %define file_start rdi ; loaded from [stream + _next_in] during setup
54 %define file_length r15 ; loaded (32-bit) from [stream + _avail_in] during setup
64 %define dist_code2 rbx ; output operand of get_dist_icf_code for dist2
87 ;; GPR r8 & r15 can be used
;; NOTE(review): r15 is assigned to file_length above -- confirm the remark
;; about r15 is still accurate.
89 %define xtmp0 xmm0 ; tmp: scratch XMM register passed to compare250_x
90 %define xtmp1 xmm1 ; tmp: scratch XMM register passed to compare250_x
93 %define ytmp0 ymm0 ; tmp: scratch YMM register passed to compare250_y
94 %define ytmp1 ymm1 ; tmp: scratch YMM register passed to compare250_y
97 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
98 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
99 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Stack frame layout: byte offsets from rsp of the function's local variables
; and register save areas, followed by the total frame size.
; m_out_start is also used as an rsp offset by the code below, but its
; definition is not part of this view (presumably offset 8 -- confirm).
101 m_out_end equ 0 ; local variable (8 bytes): output limit compared against m_out_buf
103 f_end_i_mem_offset equ 16 ; 8-byte slot between the locals above and the gpr save area
104 gpr_save_mem_offset equ 24 ; gpr save area (8*8 bytes): rbx, rsi, rdi, rbp, r12-r15
105 xmm_save_mem_offset equ 24 + 8*8 ; xmm save area (4*16 bytes) (16 byte aligned)
; NOTE(review): 24 + 8*8 = 88, which is not a multiple of 16; the "16 byte
; aligned" remark only holds if rsp is 8 modulo 16 while the frame is live --
; confirm before storing to this area with aligned moves.
106 stack_size equ 3*8 + 8*8 + 4*16
108 ;;; stack_size is 152 bytes, i.e. 8 modulo 16:
;;; 8 because stack address is odd multiple of 8 after a function call and
109 ;;; we want it aligned to 16 bytes
111 ; void isal_deflate_icf_body ( isal_zstream *stream )
112 ; arg 1: rcx: addr of stream
;
; Entry point. The exported symbol name is isal_deflate_icf_body_ with the
; value of ARCH appended by the %+ token-pasting operator.
; Visible steps: test avail_in, pick the stream's next state, save eight
; general purpose registers in the stack frame, then load the output buffer
; and input buffer descriptors into registers / stack slots.
; Several instructions of this sequence (branches, stack adjustment, the
; matching %endif) are not part of this view.
113 global isal_deflate_icf_body_ %+ ARCH
114 isal_deflate_icf_body_ %+ ARCH %+ :
; elf64-only section; the guarded lines and the %endif are not shown here.
; NOTE(review): presumably copies the System V first argument (rdi) into rcx
; so the rest of the code can address the stream through rcx -- confirm.
115 %ifidn __OUTPUT_FORMAT__, elf64
119 ;; do nothing if (avail_in == 0)
;; (the conditional branch that consumes this compare is not shown here)
120 cmp dword [rcx + _avail_in], 0
123 ;; Set stream's next state
;; rdx and rax hold the two candidate states; end_of_stream and flush are
;; tested, and eax is stored as the new state. The instructions that choose
;; between the candidates after each compare are not shown here.
124 mov rdx, ZSTATE_FLUSH_READ_BUFFER
125 mov rax, ZSTATE_CREATE_HDR
126 cmp dword [rcx + _end_of_stream], 0
128 cmp dword [rcx + _flush], _NO_FLUSH
130 mov dword [rcx + _internal_state_state], eax
; Save rbx, rsi, rdi, rbp and r12-r15 in the gpr save area of the frame.
; They are reloaded from the same slots on the exit path.
143 mov [rsp + gpr_save_mem_offset + 0*8], rbx
144 mov [rsp + gpr_save_mem_offset + 1*8], rsi
145 mov [rsp + gpr_save_mem_offset + 2*8], rdi
146 mov [rsp + gpr_save_mem_offset + 3*8], rbp
147 mov [rsp + gpr_save_mem_offset + 4*8], r12
148 mov [rsp + gpr_save_mem_offset + 5*8], r13
149 mov [rsp + gpr_save_mem_offset + 6*8], r14
150 mov [rsp + gpr_save_mem_offset + 7*8], r15
; Clear the has_eob field of the internal state.
153 mov dword [stream + _internal_state_has_eob], 0
155 ; state->bitbuf.set_buf(stream->next_out, stream->avail_out);
; NOTE(review): the comment above does not match the code below, which reads
; the output position and available size from the level buffer
; (_icf_buf_next / _icf_buf_avail_out), not from next_out / avail_out.
156 mov tmp1, [stream + _level_buf]
157 mov m_out_buf, [tmp1 + _icf_buf_next]
; Remember the starting output position; the exit path subtracts it from
; m_out_buf to get the number of bytes written.
159 mov [rsp + m_out_start], m_out_buf
160 mov tmp1, [tmp1 + _icf_buf_avail_out]
; Store the output limit that m_out_buf is later compared against.
; NOTE(review): presumably tmp1 is turned from a size into an end address on
; lines not shown here -- confirm.
164 mov [rsp + m_out_end], tmp1
; Load the input descriptors: pointer, running position (32-bit total_in)
; and length (32-bit avail_in).
166 mov file_start, [stream + _next_in]
168 mov f_i %+ d, dword [stream + _total_in]
171 mov file_length %+ d, [stream + _avail_in]
176 ; if (file_length <= 0) continue;
181 ; for (f_i = f_start_i; f_i < file_length; f_i++) {
; Loop head: load the input at position f_i, compute hashes for two
; consecutive candidates, and update the 16-bit head table with the current
; position while deriving the distances dist and dist2 to the previous
; occurrences. compute_hash is a macro defined outside this file view.
; Branches and a few data-movement instructions of this section are not
; part of this view.
182 MARK __body_compute_hash_ %+ ARCH
; Load input bytes at the current position into xdata and into curr_data.
183 MOVDQU xdata, [file_start + f_i]
184 mov curr_data, [file_start + f_i]
188 compute_hash hash, curr_data
191 compute_hash hash2, tmp3
; Test whether any history exists yet (has_hist == 0); the branch taken on
; this compare is not shown here.
196 cmp dword [stream + _internal_state_has_hist], 0
203 ; if (state->bitbuf.is_full()) {
; Compare the output position with the limit saved in the m_out_end slot.
204 cmp m_out_buf, [rsp + m_out_end]
; tmp1 = address of the current input byte.
211 lea tmp1, [file_start + f_i]
; dist = (low 16 bits of f_i) - head[hash], then head[hash] = low 16 bits of
; f_i. Head table entries are 16-bit words, hence the 2 * hash scaling.
213 mov dist %+ w, f_i %+ w
215 sub dist %+ w, word [stream + _internal_state_head + 2 * hash]
216 mov [stream + _internal_state_head + 2 * hash], f_i %+ w
; Compute the next hash and reduce it to a table index with HASH_MASK.
222 compute_hash hash, curr_data
223 and hash %+ d, HASH_MASK
; Same lookup/update for the second candidate, using hash2 and dist2.
225 mov dist2 %+ w, f_i %+ w
227 sub dist2 %+ w, word [stream + _internal_state_head + 2 * hash2]
228 mov [stream + _internal_state_head + 2 * hash2], f_i %+ w
230 ; if ((dist-1) < (D-1)) {
235 compute_hash hash2, tmp2
236 and hash2 %+ d, HASH_MASK
; Keep only the bits of dist2 selected by the mask D-1.
238 and dist2 %+ d, (D-1)
; Quick match test for both candidates. The 8 bytes at each candidate
; position are XORed into len / len2, and the low 32 bits of the result are
; then tested. Literal histogram counters are updated along the way.
; NOTE(review): len and len2 are presumably loaded with the current input
; bytes on lines not shown here, so that a zero XOR result means the compared
; bytes are equal -- confirm.
241 MARK __body_compare_ %+ ARCH
242 ;; Check for long len/dist match (>7) with first literal
246 xor len, [tmp1 + dist - 1]
249 ;; Check for len/dist match (>7) with second literal
251 xor len2, [tmp1 + dist2]
; lit_code = low byte of curr_data, zero-extended.
254 movzx lit_code, curr_data %+ b
257 ;; Check for len/dist match for first literal
;; Low 32 bits of len all zero -> go encode a len/dist pair.
258 test len %+ d, 0xFFFFFFFF
259 jz len_dist_huffman_pre
; Count the first literal: 16-bit increment of its lit/len histogram entry.
261 inc word [stream + _internal_state_hist_lit_len + HIST_ELEM_SIZE*lit_code]
; lit_code2 = low byte of curr_data, zero-extended.
; NOTE(review): as shown this reads the same byte as lit_code; presumably
; curr_data is shifted on a line not shown here -- confirm.
262 movzx lit_code2, curr_data %+ b
263 ;; Check for len/dist match for second literal
;; (the branch that consumes this test is not shown here)
264 test len2 %+ d, 0xFFFFFFFF
; Emit a literal followed by a len/dist pair for the second candidate:
; write the literal code, derive the distance code for dist2, refresh the
; head table for the skipped positions, update the histograms and write the
; combined len/dist code. get_dist_icf_code and compute_hash are macros
; defined outside this file view; several instructions of this section are
; not part of this view.
267 MARK __body_len_dist_lit_huffman_ %+ ARCH
268 len_dist_lit_huffman_pre:
272 len_dist_lit_huffman:
; Non-temporal 32-bit store of the literal code at the output position.
274 movnti dword [m_out_buf], lit_code %+ d
278 get_dist_icf_code dist2, dist_code2, tmp1
280 ;; Setup for updating hash
281 lea tmp3, [f_i + 1] ; tmp3 <= k
; Reload input data from [file_start + len2].
; NOTE(review): presumably len2 has been converted to an input offset on
; lines not shown here -- confirm.
284 MOVDQU xdata, [file_start + len2]
285 mov tmp1, [file_start + len2]
288 compute_hash hash3, curr_data
; Record the low 16 bits of tmp3 in the head table for hash, hash2 and hash3,
; computing the hashes for the following positions in between.
297 mov [stream + _internal_state_head + 2 * hash], tmp3 %+ w
299 compute_hash hash, curr_data
302 mov [stream + _internal_state_head + 2 * hash2], tmp3 %+ w
304 compute_hash hash2, tmp1
307 mov [stream + _internal_state_head + 2 * hash3], tmp3 %+ w
; Count the length symbol: histogram index is len2 + 254.
312 inc word [stream + _internal_state_hist_lit_len + HIST_ELEM_SIZE*(len2 + 254)]
; Non-temporal 32-bit store of the combined code in the next output dword.
314 movnti dword [m_out_buf + 4], dist_code2 %+ d
; Shift the combined code right by DIST_OFFSET and use the result as the
; index of the distance histogram entry to increment.
317 shr dist_code2, DIST_OFFSET
319 inc word [stream + _internal_state_hist_dist + HIST_ELEM_SIZE*dist_code2]
321 ; hash = compute_hash(state->file_start + f_i) & HASH_MASK;
322 and hash %+ d, HASH_MASK
323 and hash2 %+ d, HASH_MASK
329 ;; encode as dist/len
;; Emit a len/dist pair for the first candidate: derive the distance code
;; for dist, combine it with the length code (len + 254), refresh the head
;; table, update both histograms and write the combined code.
;; get_dist_icf_code and compute_hash are macros defined outside this file
;; view; several instructions of this section are not part of this view.
331 MARK __body_len_dist_huffman_ %+ ARCH
332 len_dist_huffman_pre:
338 ;; Setup for updating hash
339 lea tmp3, [f_i + 2] ; tmp3 <= k
343 ; get_dist_code(dist, &code2, &code_len2);
344 get_dist_icf_code dist, dist_code, tmp1
; Reload input data from [file_start + len] into xdata, curr_data2 and
; curr_data.
; NOTE(review): presumably len has been converted to an input offset on
; lines not shown here -- confirm.
347 MOVDQU xdata, [file_start + len]
348 mov curr_data2, [file_start + len]
349 mov curr_data, curr_data2
352 ; get_len_code(len, &code, &code_len);
; len_code = len + 254, then merged into dist_code with a bitwise OR.
353 lea len_code, [len + 254]
354 or dist_code, len_code
; Record the low 16 bits of tmp3 in the head table for hash and hash2.
356 mov [stream + _internal_state_head + 2 * hash], tmp3 %+ w
358 mov [stream + _internal_state_head + 2 * hash2], tmp3 %+ w
360 compute_hash hash, curr_data
363 compute_hash hash2, curr_data2
; Count the length symbol in the lit/len histogram.
365 inc word [stream + _internal_state_hist_lit_len + HIST_ELEM_SIZE*len_code]
; Non-temporal 32-bit store of the combined code at the output position.
367 movnti dword [m_out_buf], dist_code %+ d
; Shift the combined code right by DIST_OFFSET and use the result as the
; index of the distance histogram entry to increment.
370 shr dist_code, DIST_OFFSET
372 inc word [stream + _internal_state_hist_dist + HIST_ELEM_SIZE*dist_code]
374 ; hash = compute_hash(state->file_start + f_i) & HASH_MASK;
375 and hash %+ d, HASH_MASK
376 and hash2 %+ d, HASH_MASK
; Two-literal output, followed in this view by the function's exit path
; (next-state selection, write-back of the input and output descriptors,
; and restoration of the saved registers). The labels, branches and the
; final stack adjustment / ret of the exit path are not part of this view.
383 MARK __body_write_lit_bits_ %+ ARCH
; Load the input starting one byte past f_i and copy its low 64 bits into
; curr_data.
385 MOVDQU xdata, [file_start + f_i + 1]
387 MOVQ curr_data, xdata
; Count the second literal in the lit/len histogram.
389 inc word [stream + _internal_state_hist_lit_len + HIST_ELEM_SIZE*lit_code2]
; Pack both literals into one code: lit_code in the low bits, lit_code2
; shifted left by DIST_OFFSET, plus the constant 31 << DIST_OFFSET.
391 shl lit_code2, DIST_OFFSET
392 lea lit_code, [lit_code + lit_code2 + (31 << DIST_OFFSET)]
; Non-temporal 32-bit store of the packed code at the output position.
394 movnti dword [m_out_buf], lit_code %+ d
; Next-state selection: tmp1 and tmp2 hold the two candidate states,
; end_of_stream and flush are tested, and tmp2 is stored as the new state.
; The instructions that choose between the candidates are not shown here.
402 mov tmp1, ZSTATE_FLUSH_READ_BUFFER
403 mov tmp2, ZSTATE_BODY
404 cmp dword [stream + _end_of_stream], 0
406 cmp dword [stream + _flush], _NO_FLUSH
409 mov dword [stream + _internal_state_state], tmp2 %+ d
; Unconditional store of ZSTATE_CREATE_HDR as the state.
; NOTE(review): presumably reached through a separate label/branch that is
; not shown here, rather than by falling through from the store above --
; confirm.
413 mov dword [stream + _internal_state_state], ZSTATE_CREATE_HDR
416 ;; update input buffer
418 mov [stream + _total_in], f_i %+ d
420 mov [stream + _next_in], file_start
422 mov [stream + _avail_in], file_length %+ d
424 ;; update output buffer
;; Store the new output position, then subtract the number of bytes written
;; (m_out_buf minus the saved start position) from the available size.
425 mov tmp1, [stream + _level_buf]
426 mov [tmp1 + _icf_buf_next], m_out_buf
427 sub m_out_buf, [rsp + m_out_start]
428 sub [tmp1 + _icf_buf_avail_out], m_out_buf %+ d
; Restore the general purpose registers saved at function entry.
430 mov rbx, [rsp + gpr_save_mem_offset + 0*8]
431 mov rsi, [rsp + gpr_save_mem_offset + 1*8]
432 mov rdi, [rsp + gpr_save_mem_offset + 2*8]
433 mov rbp, [rsp + gpr_save_mem_offset + 3*8]
434 mov r12, [rsp + gpr_save_mem_offset + 4*8]
435 mov r13, [rsp + gpr_save_mem_offset + 5*8]
436 mov r14, [rsp + gpr_save_mem_offset + 6*8]
437 mov r15, [rsp + gpr_save_mem_offset + 7*8]
; Full-length match comparison for both candidates. tmp1 points at the
; current input byte and tmp2 at the candidate; one of the compare250 macro
; variants is selected at assembly time by COMPARE_TYPE and leaves its result
; in len / len2. The compare250* macros are defined outside this file view.
; The labels, branches and the %else / %endif lines of this section are not
; part of this view.
447 MARK __body_compare_loops_ %+ ARCH
; First candidate: tmp2 = tmp1 + dist - 1.
449 lea tmp2, [tmp1 + dist - 1]
; COMPARE_TYPE 1 -> compare250, 2 -> compare250_x (XMM scratch registers),
; 3 -> compare250_y (YMM scratch registers); any other value is an error.
450 %if (COMPARE_TYPE == 1)
451 compare250 tmp1, tmp2, len, tmp3
452 %elif (COMPARE_TYPE == 2)
453 compare250_x tmp1, tmp2, len, tmp3, xtmp0, xtmp1
454 %elif (COMPARE_TYPE == 3)
455 compare250_y tmp1, tmp2, len, tmp3, ytmp0, ytmp1
457 %error Unknown Compare type COMPARE_TYPE
; Second candidate: tmp2 = tmp1 + dist2.
463 lea tmp2, [tmp1 + dist2]
465 %if (COMPARE_TYPE == 1)
466 compare250 tmp1, tmp2, len2, tmp3
467 %elif (COMPARE_TYPE == 2)
468 compare250_x tmp1, tmp2, len2, tmp3, xtmp0, xtmp1
469 %elif (COMPARE_TYPE == 3)
470 compare250_y tmp1, tmp2, len2, tmp3, ytmp0, ytmp1
472 %error Unknown Compare type COMPARE_TYPE
; Take the low byte of curr_data as the literal, count it in the lit/len
; histogram and continue with the literal + len/dist output path.
475 movzx lit_code, curr_data %+ b
477 inc word [stream + _internal_state_hist_lit_len + HIST_ELEM_SIZE*lit_code]
478 jmp len_dist_lit_huffman
; First-byte path: check the output limit, mark that history now exists,
; record the current position in the head table, emit the first literal and
; reload the input for the following iteration.
; NOTE(review): presumably entered when has_hist is 0 (see the compare in the
; loop head) -- confirm. Labels, branches and some data-movement
; instructions of this section are not part of this view.
480 MARK __write_first_byte_ %+ ARCH
; Compare the output position with the limit saved in the m_out_end slot.
482 cmp m_out_buf, [rsp + m_out_end]
485 mov dword [stream + _internal_state_has_hist], 1
; head[hash] = low 16 bits of f_i.
487 mov [stream + _internal_state_head + 2 * hash], f_i %+ w
491 compute_hash hash2, tmp4
; Count the literal in the lit/len histogram, indexed by curr_data.
; NOTE(review): presumably curr_data has been reduced to a single byte on a
; line not shown here -- confirm.
494 inc word [stream + _internal_state_hist_lit_len + HIST_ELEM_SIZE*curr_data]
; Non-temporal 32-bit store of the literal at the output position.
497 movnti dword [m_out_buf], curr_data %+ d
; Reload the input data.
; NOTE(review): the two loads use f_i + 1 and f_i; presumably f_i is
; advanced between them on a line not shown here -- confirm.
500 MOVDQU xdata, [file_start + f_i + 1]
502 mov curr_data, [file_start + f_i]
; Reduce both hashes to head table indices.
503 and hash %+ d, HASH_MASK
504 and hash2 %+ d, HASH_MASK
; Four consecutive dwords (16 bytes), each holding HASH_MASK.
; NOTE(review): not referenced by any line in this view; presumably meant as
; a vector operand for masking several hashes at once -- confirm.
512 mask: dd HASH_MASK, HASH_MASK, HASH_MASK, HASH_MASK