1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
30 %include "options.asm"
32 %include "lz0a_const.asm"
33 %include "data_struct2.asm"
34 %include "bitbuf2.asm"
35 %include "huffman.asm"
36 %include "igzip_compare_types.asm"
37 %include "reg_sizes.asm"
51 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
52 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
53 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Symbolic register aliases used by the code below (only part of the
;; register map is visible in this view).
;;   file_start  - base address of the input; loaded from stream->next_in
;;   m_bit_count - loaded from / stored to the bitbuf m_bit_count field
;;   hufftables  - loaded from stream->hufftables, passed to the get_*_code macros
74 %define file_start rdi
76 %define m_bit_count rbp
98 %define hufftables r15
100 ;; GPR r8 & r15 can be used
;; NOTE(review): r15 is aliased to hufftables just above, so the statement
;; that r15 is free looks stale - confirm against the full register map.
;; Vector temporaries handed to the compare250_x (xmm) and compare250_y (ymm)
;; macros; xtmpN and ytmpN name the same physical registers.
102 %define xtmp0 xmm0 ; tmp
103 %define xtmp1 xmm1 ; tmp
108 %define ytmp0 ymm0 ; tmp
109 %define ytmp1 ymm1 ; tmp
112 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
113 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
114 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Stack frame layout: byte offsets from rsp for the local slots and the
;; register save areas, followed by the total frame size.
117 blen_mem_offset equ 0 ; local variable (8 bytes)
118 f_end_i_mem_offset equ 8 ; 8-byte slot where f_end_i is spilled and later reloaded
119 gpr_save_mem_offset equ 16 ; gpr save area (8*8 bytes)
120 xmm_save_mem_offset equ 16 + 8*8 ; xmm save area (4*16 bytes) (16 byte aligned)
121 stack_size equ 2*8 + 8*8 + 4*16 + 8 ; two locals + gpr area + xmm area + 8 bytes padding = 152
122 ;;; 8 because stack address is odd multiple of 8 after a function call and
123 ;;; we want it aligned to 16 bytes
;; Entry point. The exported symbol name is formed by appending ARCH to
;; "isal_deflate_body_" with the %+ token-pasting operator.
125 ; void isal_deflate_body ( isal_zstream *stream )
126 ; arg 1: rcx: addr of stream
127 global isal_deflate_body_ %+ ARCH
128 isal_deflate_body_ %+ ARCH %+ :
;; NOTE(review): the body of this elf64-only conditional is not visible here;
;; presumably it copies the System V first argument (rdi) into rcx so the
;; code below can address the stream through rcx on both ABIs - confirm.
129 %ifidn __OUTPUT_FORMAT__, elf64
133 ;; do nothing if (avail_in == 0)
134 cmp dword [rcx + _avail_in], 0
;; NOTE(review): the branch that consumes the comparison above is not
;; visible in this view.
137 ;; Set stream's next state
138 mov rdx, ZSTATE_FLUSH_READ_BUFFER ; candidate next state
140 cmp dword [rcx + _end_of_stream], 0 ; is end_of_stream set?
142 cmp dword [rcx + _flush], _NO_FLUSH ; was a flush mode other than _NO_FLUSH requested?
;; eax is written as the new internal state. The instructions that load eax
;; and choose between it and rdx after each compare are not visible here -
;; TODO confirm against the full source.
144 mov dword [rcx + _internal_state_state], eax
;; Save eight general-purpose registers into the frame's GPR save area
;; (restored from the same slots on exit). rsi and rdi are saved along with
;; rbx, rbp and r12-r15; they are callee-saved under the Microsoft x64 ABI.
;; NOTE(review): the rsp adjustment that reserves the frame is not visible here.
157 mov [rsp + gpr_save_mem_offset + 0*8], rbx
158 mov [rsp + gpr_save_mem_offset + 1*8], rsi
159 mov [rsp + gpr_save_mem_offset + 2*8], rdi
160 mov [rsp + gpr_save_mem_offset + 3*8], rbp
161 mov [rsp + gpr_save_mem_offset + 4*8], r12
162 mov [rsp + gpr_save_mem_offset + 5*8], r13
163 mov [rsp + gpr_save_mem_offset + 6*8], r14
164 mov [rsp + gpr_save_mem_offset + 7*8], r15
167 mov dword [stream + _internal_state_has_eob], 0 ; clear the has_eob flag
171 ; state->bitbuf.set_buf(stream->next_out, stream->avail_out);
172 mov m_out_buf, [stream + _next_out] ; output cursor = stream->next_out
173 mov [stream + _internal_state_bitbuf_m_out_start], m_out_buf ; remember where output started
174 mov tmp1 %+ d, [stream + _avail_out] ; 32-bit load of avail_out
;; NOTE(review): the step that turns avail_out into the value stored as
;; m_out_end is not visible here; presumably an end address derived from
;; next_out + avail_out - confirm.
178 mov [stream + _internal_state_bitbuf_m_out_end], tmp1
;; Bring the bit-buffer state, Huffman table pointer and input position
;; from the stream into registers.
180 mov m_bits, [stream + _internal_state_bitbuf_m_bits]
181 mov m_bit_count %+ d, [stream + _internal_state_bitbuf_m_bit_count]
182 mov hufftables, [stream + _hufftables]
184 mov file_start, [stream + _next_in]
186 mov f_i %+ d, dword [stream + _total_in] ; f_i starts from the 32-bit total_in value
189 mov f_end_i %+ d, [stream + _avail_in]
;; NOTE(review): any adjustment applied to file_start / f_end_i between the
;; loads above and the spill below is not visible in this view.
194 mov [rsp + f_end_i_mem_offset], f_end_i ; spill f_end_i; it is reloaded from this slot later
195 ; if (f_end_i <= 0) continue;
200 ; for (f_i = f_start_i; f_i < f_end_i; f_i++) {
;; Load the input at the current position, hash it, and use the hashes to
;; look up and update the head table. MARK, MOVDQU and compute_hash are
;; macros defined outside this view.
201 MARK __body_compute_hash_ %+ ARCH
202 MOVDQU xdata, [file_start + f_i] ; vector load of the input at position f_i
203 mov curr_data, [file_start + f_i] ; same address into a general-purpose register
207 compute_hash hash, curr_data
;; NOTE(review): the instructions that prepare tmp3 for the second hash are
;; not visible here; presumably the input shifted by one byte - confirm.
210 compute_hash hash2, tmp3
215 cmp dword [stream + _internal_state_has_hist], 0 ; has any history been recorded yet? (branch not visible)
222 ; if (state->bitbuf.is_full()) {
223 cmp m_out_buf, [stream + _internal_state_bitbuf_m_out_end] ; output cursor vs. recorded end (branch not visible)
230 lea tmp1, [file_start + f_i] ; tmp1 = address of the current input byte
;; Head-table entries are 16-bit (index scaled by 2, word-sized accesses).
;; dist = low 16 bits of f_i minus the previous entry for this hash; the
;; entry is then overwritten with the current position.
232 mov dist %+ w, f_i %+ w
234 sub dist %+ w, word [stream + _internal_state_head + 2 * hash]
235 mov [stream + _internal_state_head + 2 * hash], f_i %+ w
242 compute_hash tmp6, tmp5
;; Same lookup and update for the second hash.
244 mov dist2 %+ w, f_i %+ w
246 sub dist2 %+ w, word [stream + _internal_state_head + 2 * hash2]
247 mov [stream + _internal_state_head + 2 * hash2], f_i %+ w
249 ; if ((dist-1) < (D-1)) {
254 compute_hash tmp2, tmp8
256 and dist2 %+ d, (D-1) ; keep only the bits of dist2 selected by D-1
;; Quick match test for the two candidate distances, interleaved with
;; speculative literal-code lookups. Several loads and branches of this
;; section are not visible in this view.
259 MARK __body_compare_ %+ ARCH
260 ;; Check for long len/dist match (>7) with first literal
264 xor len, [tmp1 + dist - 1] ; zero bits in len mark bytes equal to the candidate data
;; Pack the two precomputed hashes into lanes 0 and 1 of xhash and mask them
;; with xmask (xmask is set up outside this view; presumably loaded from
;; the `mask` constant - confirm).
267 MOVD xhash, tmp6 %+ d
268 PINSRD xhash, tmp2 %+ d, 1
269 PAND xhash, xhash, xmask
271 ;; Check for len/dist match (>7) with second literal
273 xor len2, [tmp1 + dist2]
276 ;; Speculatively load the code for the first literal
277 movzx tmp1, curr_data %+ b ; tmp1 = first input byte, zero-extended
278 get_lit_code tmp1, code3, rcx, hufftables ; code3 = code; rcx receives the third macro output (presumably the code length)
280 ;; Check for len/dist match for first literal
281 test len %+ d, 0xFFFFFFFF ; ZF set when the low 32 bits of len are all zero
282 jz len_dist_huffman_pre
284 ;; Speculatively load the code for the second literal
287 get_lit_code curr_data, code2, code_len2, hufftables
;; Shift the second literal's code left by the count in rcx; the instruction
;; that merges in the first literal's code is not visible here.
289 SHLX code2, code2, rcx
293 ;; Check for len/dist match for second literal
294 test len2 %+ d, 0xFFFFFFFF ; ZF set when the low 32 bits of len2 are all zero (branch not visible)
;; Emit path that combines a literal code (code_len3 bits) with a length
;; code and a distance code for the second candidate (len2 / dist2), then
;; updates the head table. Instructions between the two labels and several
;; merge steps are not visible in this view.
297 MARK __body_len_dist_lit_huffman_ %+ ARCH
298 len_dist_lit_huffman_pre:
303 len_dist_lit_huffman:
;; The two get_dist_code invocations below are alternatives selected by
;; LONGER_HUFFTABLE; the %else / %endif lines are not visible here.
306 %ifndef LONGER_HUFFTABLE
308 get_dist_code tmp4, code4, code_len2, hufftables ;; clobbers dist, rcx
310 get_dist_code dist2, code4, code_len2, hufftables
312 get_len_code len2, code, rcx, hufftables ;; rcx is code_len
314 SHLX code4, code4, rcx ; shift the distance code left by the length-code size
323 compute_hash tmp4, tmp5
326 SHLX code4, code4, code_len3 ; shift again by the literal-code size
328 add code_len2, code_len3 ; accumulate the literal-code size into the total bit count
330 ;; Setup for updating hash
331 lea tmp3, [f_i + len2 + 1] ; tmp3 <= k
333 MOVDQU xdata, [file_start + f_i]
334 mov curr_data, [file_start + f_i]
335 mov curr_data2, curr_data
;; Recover the two masked hashes packed into xhash and record position k
;; (tmp3, low 16 bits) in the head table under hash, hash2 and tmp4.
337 MOVD hash %+ d, xhash
338 PEXTRD hash2 %+ d, xhash, 1
339 mov [stream + _internal_state_head + 2 * hash], tmp3 %+ w
341 compute_hash hash, curr_data
344 mov [stream + _internal_state_head + 2 * hash2], tmp3 %+ w
347 mov [stream + _internal_state_head + 2 * tmp4], tmp3 %+ w
349 write_bits m_bits, m_bit_count, code4, code_len2, m_out_buf, tmp4 ; emit code4 (code_len2 bits); tmp4 is the macro's last argument (presumably scratch)
350 mov f_end_i, [rsp + f_end_i_mem_offset] ; reload f_end_i from its stack slot
353 compute_hash hash2, curr_data2
;; Optional extra head-table update, assembled only when
;; NO_LIMIT_HASH_UPDATE is defined. The surrounding loop control and the
;; closing directive are not visible here.
355 %ifdef NO_LIMIT_HASH_UPDATE
360 mov tmp6, [file_start + tmp3]
361 compute_hash tmp4, tmp6
362 and tmp4 %+ d, HASH_MASK
363 ; state->head[hash] = k;
364 mov [stream + _internal_state_head + 2 * tmp4], tmp3 %+ w
368 ; hash = compute_hash(state->file_start + f_i) & HASH_MASK;
369 and hash %+ d, HASH_MASK
370 and hash2 %+ d, HASH_MASK
376 ;; encode as dist/len
;; Emit path for a match at the first candidate (len / dist): look up the
;; distance and length codes, write them, and update the head table.
;; Several instructions of this section are not visible in this view.
378 MARK __body_len_dist_huffman_ %+ ARCH
379 len_dist_huffman_pre:
387 ; get_dist_code(dist, &code2, &code_len2);
;; The two get_dist_code invocations below are alternatives selected by
;; LONGER_HUFFTABLE; the %else / %endif lines are not visible here.
388 %ifndef LONGER_HUFFTABLE
389 mov tmp3, dist ; since code2 and dist are rbx
390 get_dist_code tmp3, code2, code_len2, hufftables ;; clobbers dist, rcx
392 get_dist_code dist, code2, code_len2, hufftables
394 ; get_len_code(len, &code, &code_len);
395 get_len_code len, code, rcx, hufftables ;; rcx is code_len
399 ; code_len2 += code_len
400 SHLX code2, code2, rcx ; shift the distance code left by the length-code size
404 ;; Setup for updating hash
405 lea tmp3, [f_i + 2] ; tmp3 <= k
;; Recover the two masked hashes packed into xhash and record position k
;; (tmp3, low 16 bits) in the head table under both.
408 MOVD hash %+ d, xhash
409 PEXTRD hash2 %+ d, xhash, 1
410 mov [stream + _internal_state_head + 2 * hash], tmp3 %+ w
412 mov [stream + _internal_state_head + 2 * hash2], tmp3 %+ w
414 MOVDQU xdata, [file_start + f_i]
415 mov curr_data, [file_start + f_i]
416 mov curr_data2, curr_data
417 compute_hash hash, curr_data
419 write_bits m_bits, m_bit_count, code2, code_len2, m_out_buf, tmp7 ; emit code2 (code_len2 bits); tmp7 is the macro's last argument (presumably scratch)
420 mov f_end_i, [rsp + f_end_i_mem_offset] ; reload f_end_i from its stack slot
423 compute_hash hash2, curr_data2
;; Optional extra head-table update, assembled only when
;; NO_LIMIT_HASH_UPDATE is defined.
425 %ifdef NO_LIMIT_HASH_UPDATE
430 mov tmp6, [file_start + tmp3]
431 compute_hash tmp4, tmp6
;; NOTE(review): the len_dist_lit_huffman path masks tmp4 with HASH_MASK
;; before indexing head; the matching mask is not visible here - confirm it
;; is present in the full source.
433 mov [stream + _internal_state_head + 2 * tmp4], tmp3 %+ w
438 ; hash = compute_hash(state->file_start + f_i) & HASH_MASK;
439 and hash %+ d, HASH_MASK
440 and hash2 %+ d, HASH_MASK
;; Literal-only path: write the literal code(s) held in code2 and reload the
;; data and hashes for the next position. The label, position update and
;; branch of this section are not visible in this view.
447 MARK __body_write_lit_bits_ %+ ARCH
449 MOVDQU xdata, [file_start + f_i + 1] ; vector load starting one byte past f_i
450 mov f_end_i, [rsp + f_end_i_mem_offset] ; reload f_end_i from its stack slot
452 mov curr_data, [file_start + f_i]
454 MOVD hash %+ d, xhash ; hash = lane 0 of xhash
456 write_bits m_bits, m_bit_count, code2, code_len2, m_out_buf, tmp3 ; emit code2 (code_len2 bits); tmp3 is the macro's last argument (presumably scratch)
458 PEXTRD hash2 %+ d, xhash, 1 ; hash2 = lane 1 of xhash
;; Exit path: choose the stream's next state, write the input/output
;; bookkeeping and bit-buffer state back to the stream, and restore the
;; saved registers. The label, the frame release and the return are not
;; visible in this view.
465 mov tmp1, ZSTATE_FLUSH_READ_BUFFER ; alternative next state
466 mov tmp5, ZSTATE_BODY ; default next state
467 cmp dword [stream + _end_of_stream], 0
469 cmp dword [stream + _flush], _NO_FLUSH
;; NOTE(review): the instructions that replace tmp5 with tmp1 after each
;; compare (presumably conditional moves) are not visible here - confirm.
471 mov dword [stream + _internal_state_state], tmp5 %+ d
474 ;; update input buffer
476 mov [stream + _total_in], f_i %+ d
478 mov [stream + _next_in], file_start
480 mov [stream + _avail_in], f_end_i %+ d
;; NOTE(review): any adjustment of f_i / file_start / f_end_i made before
;; these stores is not visible in this view.
482 ;; update output buffer
483 mov [stream + _next_out], m_out_buf
484 sub m_out_buf, [stream + _internal_state_bitbuf_m_out_start] ; m_out_buf = output cursor - m_out_start
485 sub [stream + _avail_out], m_out_buf %+ d ; avail_out -= that difference
486 add [stream + _total_out], m_out_buf %+ d ; total_out += that difference
;; Persist the bit-buffer state.
488 mov [stream + _internal_state_bitbuf_m_bits], m_bits
489 mov [stream + _internal_state_bitbuf_m_bit_count], m_bit_count %+ d
;; Restore the eight general-purpose registers from the GPR save area.
491 mov rbx, [rsp + gpr_save_mem_offset + 0*8]
492 mov rsi, [rsp + gpr_save_mem_offset + 1*8]
493 mov rdi, [rsp + gpr_save_mem_offset + 2*8]
494 mov rbp, [rsp + gpr_save_mem_offset + 3*8]
495 mov r12, [rsp + gpr_save_mem_offset + 4*8]
496 mov r13, [rsp + gpr_save_mem_offset + 5*8]
497 mov r14, [rsp + gpr_save_mem_offset + 6*8]
498 mov r15, [rsp + gpr_save_mem_offset + 7*8]
;; Full-length compares for the two candidate distances. The compare macro
;; is chosen at assembly time by COMPARE_TYPE (1: compare250, 2: compare250_x
;; with xmm temporaries, 3: compare250_y with ymm temporaries); any other
;; value reaches the %error directive. Labels, %else / %endif lines and the
;; code between the two compares are not visible in this view.
508 MARK __body_compare_loops_ %+ ARCH
;; Pack the two precomputed hashes into lanes 0 and 1 of xhash and mask them.
510 MOVD xhash, tmp6 %+ d
511 PINSRD xhash, tmp2 %+ d, 1
512 PAND xhash, xhash, xmask
513 lea tmp2, [tmp1 + dist - 1] ; tmp2 = candidate address for the first distance
514 %if (COMPARE_TYPE == 1)
515 compare250 tmp1, tmp2, len, tmp3
516 %elif (COMPARE_TYPE == 2)
517 compare250_x tmp1, tmp2, len, tmp3, xtmp0, xtmp1
518 %elif (COMPARE_TYPE == 3)
519 compare250_y tmp1, tmp2, len, tmp3, ytmp0, ytmp1
521 %error Unknown Compare type COMPARE_TYPE
527 lea tmp2, [tmp1 + dist2] ; tmp2 = candidate address for the second distance
529 %if (COMPARE_TYPE == 1)
530 compare250 tmp1, tmp2, len2, tmp3
531 %elif (COMPARE_TYPE == 2)
532 compare250_x tmp1, tmp2, len2, tmp3, xtmp0, xtmp1
533 %elif (COMPARE_TYPE == 3)
534 compare250_y tmp1, tmp2, len2, tmp3, ytmp0, ytmp1
536 %error Unknown Compare type COMPARE_TYPE
;; Look up the literal code into code3 / code_len3, then continue at the
;; combined literal + length/distance emit path.
540 get_lit_code curr_data, code3, code_len3, hufftables
541 jmp len_dist_lit_huffman
;; Path taken while no history has been recorded: mark history as present,
;; seed the head table with the current position and prepare the literal
;; code. The label, branches and the bit-writing step are not visible in
;; this view.
543 MARK __write_first_byte_ %+ ARCH
545 cmp m_out_buf, [stream + _internal_state_bitbuf_m_out_end] ; output cursor vs. recorded end (branch not visible)
548 mov dword [stream + _internal_state_has_hist], 1 ; set the has_hist flag
550 mov [stream + _internal_state_head + 2 * hash], f_i %+ w ; head[hash] = low 16 bits of f_i
;; NOTE(review): the instructions that prepare tmp6 for this hash are not
;; visible here.
554 compute_hash hash2, tmp6
;; Pack hash and hash2 into lanes 0 and 1 of xhash and mask them with xmask.
556 MOVD xhash, hash %+ d
557 PINSRD xhash, hash2 %+ d, 1
558 PAND xhash, xhash, xmask
561 get_lit_code curr_data, code2, code_len2, hufftables
;; Four dwords (16 bytes) each holding HASH_MASK.
;; NOTE(review): presumably the memory source for xmask, which the code ANDs
;; with the packed hashes in xhash; the load itself and the section/alignment
;; directives are not visible here - confirm.
566 mask: dd HASH_MASK, HASH_MASK, HASH_MASK, HASH_MASK