]> git.proxmox.com Git - ceph.git/blob - ceph/src/spdk/isa-l/igzip/igzip_icf_body_h1_gr_bt.asm
update source to Ceph Pacific 16.2.2
[ceph.git] / ceph / src / spdk / isa-l / igzip / igzip_icf_body_h1_gr_bt.asm
1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
3 ;
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
6 ; are met:
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
12 ; distribution.
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
16 ;
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29 %include "options.asm"
30
31 %include "lz0a_const.asm"
32 %include "data_struct2.asm"
33 %include "bitbuf2.asm"
34 %include "huffman.asm"
35 %include "igzip_compare_types.asm"
36 %include "reg_sizes.asm"
37
38 %include "stdmac.asm"
;; MARK <label>
;;   Debug aid. When DEBUG is defined, expands to a global label named
;;   <label> at the point of use; in non-DEBUG builds it expands to nothing.
%ifndef DEBUG
%macro MARK 1
%endmacro
%else
%macro MARK 1
global %1
%1:
%endmacro
%endif
48
;; ---- Large-match / padding tunables ---------------------------------
%define LARGE_MATCH_HASH_REP 1		; Hash 4 * LARGE_MATCH_HASH_REP elements
%define LARGE_MATCH_MIN 264		; Minimum match size to enter large match emit loop
%define MIN_INBUF_PADDING 16		; Input slop used when end_of_stream or flush is set
;; Upper bound on the length handed to compare_large.  Parenthesised so the
;; expansion stays a single term if it is ever used inside a larger expression.
%define MAX_EMIT_SIZE (258 * 16)

;; ---- Literal-skip heuristic tunables --------------------------------
%define SKIP_SIZE_BASE (2 << 10)	; No match length before starting skipping
%define SKIP_BASE 32			; Initial skip size
%define SKIP_START 512			; Start increasing skip size once level is beyond SKIP_START
%define SKIP_RATE 2			; Rate skip size increases after SKIP_START
%define MAX_SKIP_SIZE 128		; Maximum skip size
;; With the values above this evaluates to ((128 - 32) / 2) + 512 = 560.
%define MAX_SKIP_LEVEL (((MAX_SKIP_SIZE - SKIP_BASE) / SKIP_RATE) + SKIP_START)	; Maximum skip level
60 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
61 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
62 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Register aliases.  Several names share one physical register; each group
;; below is only valid while the other names in the same group are dead.
;; Long-lived values (kept across the whole main loop):
63 %define file_start rdi ; base such that [file_start + f_i] is the current input byte
64 %define file_length r15 ; end index for processing (input slop already subtracted)
65 %define level_buf r14 ; pointer loaded from [stream + _level_buf]
66 %define f_i r10 ; current input index
67 %define m_out_buf r11 ; output write pointer
68
69 %define curr_data rax ; input bytes at/after the current position
70
;; rcx: scratch / skip counter (also the incoming argument register at entry)
71 %define tmp2 rcx
72 %define skip_count rcx
73
;; rbx group
74 %define dist rbx
75 %define dist_code2 rbx
76 %define lit_code2 rbx
77 %define hmask2 rbx
78
;; r12 group
79 %define dist2 r12
80 %define dist_code r12
81 %define hmask3 r12
82
;; rsi group
83 %define tmp1 rsi
84 %define lit_code rsi
85
;; r8 group
86 %define curr_data2 r8
87 %define len2 r8
88 %define tmp4 r8
89 %define hmask1 r8
90 %define len_code2 r8
91
;; rdx group
92 %define len rdx
93 %define len_code rdx
94 %define hash3 rdx
95
;; r13 holds the stream pointer only until tmp3 is first written; the
;; pointer is also kept at [rsp + stream_offset] and reloaded from there.
96 %define stream r13
97 %define tmp3 r13
98
;; Hash-table indices for the next two input positions
99 %define hash rbp
100 %define hash2 r9
101
102 ;; NOTE(review): the original remark here said r8 & r15 are free, but both are aliased above (r8: curr_data2/len2/..., r15: file_length) - looks stale
103
;; Vector registers
104 %define xtmp0 xmm0 ; tmp
105 %define xtmp1 xmm1 ; tmp
106 %define xlow_lit_shuf xmm2 ; loaded from low_lit_shuf
107 %define xup_lit_shuf xmm3 ; loaded from up_lit_shuf
108 %define xdata xmm4 ; input bytes at the current position
109 %define xlit xmm5 ; broadcast of min_lit_dist_syms
110
111 %define ytmp0 ymm0 ; tmp
112 %define ytmp1 ymm1 ; tmp
113
;; Address expressions relative to level_buf (used inside [...] operands)
114 %define hash_table level_buf + _hash8k_hash_table
115 %define lit_len_hist level_buf + _hist_lit_len
116 %define dist_hist level_buf + _hist_dist
117
118 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
119 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
120 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
121
;; Stack frame layout: byte offsets from rsp after the prologue.
122 m_out_end equ 0 ; local variable (8 bytes)
123 m_out_start equ 8
124 dist_mask_offset equ 16
125 hash_mask_offset equ 24
126 f_end_i_mem_offset equ 32
127 stream_offset equ 40
128 inbuf_slop_offset equ 48
129 skip_match_offset equ 56
130 skip_level_offset equ 64
;; offset 72 is unused padding so that the areas below start at 80 / 144
131 gpr_save_mem_offset equ 80 ; gpr save area (8*8 bytes)
132 xmm_save_mem_offset equ gpr_save_mem_offset + 8*8 ; xmm save area (4*16 bytes) (16 byte aligned)
;; 11*8 + 8*8 + 4*16 = 216 bytes: 208 bytes of layout plus the extra 8 explained below
133 stack_size equ 11*8 + 8*8 + 4*16
134
135 ;;; 8 because stack address is odd multiple of 8 after a function call and
136 ;;; we want it aligned to 16 bytes
137
138 ;; Defines to generate functions for different architecture
;; ARCH is the suffix of the function currently being emitted; ARCH1/ARCH2
;; queue the next values and are rotated into ARCH at the end of each %rep pass.
139 %xdefine ARCH 01
140 %xdefine ARCH1 02
141 %xdefine ARCH2 04
142
;; If the includer did not pin COMPARE_TYPE, rotate it 1 -> 2 -> 3 alongside
;; ARCH; COMPARE_TYPE_NOT_DEF records that the rotation is wanted.
143 %ifndef COMPARE_TYPE
144 %xdefine COMPARE_TYPE_NOT_DEF
145 %xdefine COMPARE_TYPE 1
146 %xdefine COMPARE_TYPE1 2
147 %xdefine COMPARE_TYPE2 3
148 %endif
149
150 ;; Defines to generate functions for different levels
;; METHOD becomes the middle part of the exported symbol name.
151 %xdefine METHOD hash_hist
152
;; One function body is emitted per %rep pass; the matching %endrep is at the
;; end of the function, after the ARCH/COMPARE_TYPE rotation.
153 %rep 3
154 %if ARCH == 04
155 %define USE_HSWNI
156 %endif
157
158 ; void isal_deflate_icf_body_<METHOD>_<ARCH> ( isal_zstream *stream )
159 ; this %rep emits 3 versions of this function (ARCH = 01, 02, 04)
160 ; arg 1: addr of stream, in rcx (elf64 builds copy it there from rdi below)
161 global isal_deflate_icf_body_ %+ METHOD %+ _ %+ ARCH
162 isal_deflate_icf_body_ %+ METHOD %+ _ %+ ARCH %+ :
163 %ifidn __OUTPUT_FORMAT__, elf64
164 mov rcx, rdi
165 %endif
166
167 ;; do nothing if (avail_in == 0)
168 cmp dword [rcx + _avail_in], 0
169 jne .skip1
170
171 ;; Set stream's next state
;; state stays as it was unless end_of_stream != 0 or flush != _NO_FLUSH,
;; in which case it becomes ZSTATE_CREATE_HDR.  No registers were saved
;; yet, so this path returns directly.
172 mov rdx, ZSTATE_CREATE_HDR
173 mov eax, [rcx + _internal_state_state]
174 cmp word [rcx + _end_of_stream], 0
175 cmovne rax, rdx
176 cmp word [rcx + _flush], _NO_FLUSH
177 cmovne rax, rdx
178 mov dword [rcx + _internal_state_state], eax
179 ret
;; .skip1: prologue and one-time setup for a non-empty input.
180 .skip1:
181
;; Allocate the frame.  With ALIGN_STACK, rbp anchors the caller's rsp so
;; the epilogue can undo the "and rsp, ~15" realignment.
182 %ifdef ALIGN_STACK
183 push rbp
184 mov rbp, rsp
185 sub rsp, stack_size
186 and rsp, ~15
187 %else
188 sub rsp, stack_size
189 %endif
190
;; Save every GPR this function overwrites that a caller may expect preserved
;; (rsi/rdi are included; they are callee-saved under the Microsoft x64 ABI).
191 mov [rsp + gpr_save_mem_offset + 0*8], rbx
192 mov [rsp + gpr_save_mem_offset + 1*8], rsi
193 mov [rsp + gpr_save_mem_offset + 2*8], rdi
194 mov [rsp + gpr_save_mem_offset + 3*8], rbp
195 mov [rsp + gpr_save_mem_offset + 4*8], r12
196 mov [rsp + gpr_save_mem_offset + 5*8], r13
197 mov [rsp + gpr_save_mem_offset + 6*8], r14
198 mov [rsp + gpr_save_mem_offset + 7*8], r15
199
;; Keep the stream pointer on the stack too: its register doubles as tmp3.
200 mov stream, rcx
201 mov [rsp + stream_offset], stream
202
203 mov byte [stream + _internal_state_has_eob], 0
204
;; Cache the 32-bit dist/hash masks (zero-extended) in the frame.
205 mov tmp1 %+ d, dword[stream + _internal_state_dist_mask]
206 mov [rsp + dist_mask_offset], tmp1
207 mov tmp1 %+ d, dword[stream + _internal_state_hash_mask]
208 mov [rsp + hash_mask_offset], tmp1
209
210 ; m_out_buf = level_buf->icf_buf_next; m_out_end = m_out_buf + icf_buf_avail_out - SLOP
211 mov level_buf, [stream + _level_buf]
212 mov m_out_buf, [level_buf + _icf_buf_next]
213
214 mov [rsp + m_out_start], m_out_buf
215 mov tmp1, [level_buf + _icf_buf_avail_out]
216 add tmp1, m_out_buf
217 sub tmp1, SLOP
218
219 mov [rsp + m_out_end], tmp1
220
;; file_start = next_in - total_in, so [file_start + f_i] addresses the
;; input by its running index f_i (which starts at total_in).
221 mov file_start, [stream + _next_in]
222
223 mov f_i %+ d, dword [stream + _total_in]
224 sub file_start, f_i
225
;; file_length = avail_in + f_i (end index of the available input)
226 mov file_length %+ d, [stream + _avail_in]
227 add file_length, f_i
228
;; Skip heuristic state: next skip threshold = f_i + SKIP_SIZE_BASE, level = 0
229 mov [rsp + skip_match_offset], f_i
230 add qword [rsp + skip_match_offset], SKIP_SIZE_BASE
231 mov qword [rsp + skip_level_offset], 0
232
;; Constants used by the vectorised literal path (.skip_forward_long)
233 PBROADCASTD xlit, dword [min_lit_dist_syms]
234 MOVDQU xlow_lit_shuf, [low_lit_shuf]
235 MOVDQU xup_lit_shuf, [up_lit_shuf]
236
;; Input slop: MIN_INBUF_PADDING if end_of_stream or flush is non-zero,
;; otherwise LA.
237 mov qword [rsp + inbuf_slop_offset], MIN_INBUF_PADDING
238 cmp byte [stream + _end_of_stream], 0
239 jnz .default_inbuf_padding
240 cmp byte [stream + _flush], 0
241 jnz .default_inbuf_padding
242 mov qword [rsp + inbuf_slop_offset], LA
243 .default_inbuf_padding:
244
245 ; file_length -= INBUF_PADDING;
246 sub file_length, [rsp + inbuf_slop_offset]
247 ; if (file_length <= f_i) goto input_end;  (checked below, after loading the hash mask)
248 mov hmask1 %+ d, [rsp + hash_mask_offset]
249
250 cmp file_length, f_i
251 jle .input_end
252
253 ; for (f_i = f_start_i; f_i < file_length; f_i++) {
;; Prime the loop: load 8 input bytes and compute the hash-table indices for
;; positions f_i (hash) and f_i + 1 (hash2).  curr_data is read again after
;; compute_hash, so the macro is relied on not to modify its source operand.
254 MOVDQU xdata, [file_start + f_i]
255 mov curr_data, [file_start + f_i]
256 mov tmp1, curr_data
257
258 compute_hash hash, curr_data
259
260 shr tmp1, 8
261 compute_hash hash2, tmp1
262
263 and hash, hmask1
264 and hash2, hmask1
265
;; With no history yet, the first byte is emitted as a plain literal.
266 cmp byte [stream + _internal_state_has_hist], IGZIP_NO_HIST
267 je .write_first_byte
268
269 jmp .loop2
270 align 16
271
;; .loop2: main loop.  Each pass looks at two consecutive input positions
;; (call them p = f_i on entry and p + 1) and dispatches to a match or
;; literal emitter.
;; On entry: hash / hash2 index the hash table for p and p + 1,
;; curr_data / xdata hold the input bytes starting at p.
272 .loop2:
;; tmp3 shares a register with stream, so the stream pointer is gone from
;; here on; exit paths reload it from the stack.
273 mov tmp3 %+ d, [rsp + dist_mask_offset]
274 mov hmask1 %+ d, [rsp + hash_mask_offset]
275 ; if (state->bitbuf.is_full()) {
276 cmp m_out_buf, [rsp + m_out_end]
277 ja .output_end
278
279 xor dist, dist
280 xor dist2, dist2
281
282 lea tmp1, [file_start + f_i]
283
;; dist = (f_i - 1 - hash_table[hash]) in 16 bits, then record p in the table
284 mov dist %+ w, f_i %+ w
285 dec dist
286 sub dist %+ w, word [hash_table + 2 * hash]
287 mov [hash_table + 2 * hash], f_i %+ w
288
289 inc f_i
290
;; hash <- index for p + 2 (input bytes shifted by 16 bits)
291 mov tmp2, curr_data
292 shr curr_data, 16
293 compute_hash hash, curr_data
294 and hash %+ d, hmask1 %+ d
295
;; Same distance computation for the second position (f_i is now p + 1)
296 mov dist2 %+ w, f_i %+ w
297 dec dist2
298 sub dist2 %+ w, word [hash_table + 2 * hash2]
299 mov [hash_table + 2 * hash2], f_i %+ w
300
301 ; if ((dist-1) < (D-1)) {
;; Mask the distance and negate it so it can be used directly as a
;; backwards displacement from tmp1.
302 and dist %+ d, tmp3 %+ d
303 neg dist
304
;; hash2 <- index for p + 3 (input bytes shifted by 24 bits)
305 shr tmp2, 24
306 compute_hash hash2, tmp2
307 and hash2 %+ d, hmask1 %+ d
308
309 and dist2 %+ d, tmp3 %+ d
310 neg dist2
311
312 ;; Check for long len/dist match (>7) with first literal
;; len = 8 input bytes at p XOR 8 bytes at the candidate; zero means all 8 match
313 MOVQ len, xdata
314 mov curr_data, len
315 PSRLDQ xdata, 1
316 xor len, [tmp1 + dist - 1]
317 jz .compare_loop
318
319 ;; Check for len/dist match (>7) with second literal
320 MOVQ len2, xdata
321 xor len2, [tmp1 + dist2]
322 jz .compare_loop2
323
;; lit_code shares a register with tmp1; the pointer in tmp1 is dead from here.
324 movzx lit_code, curr_data %+ b
325 shr curr_data, 8
326
327 ;; Check for len/dist match for first literal
;; low 32 bits of the XOR are zero <=> at least the first 4 bytes match
328 test len %+ d, 0xFFFFFFFF
329 jz .len_dist_huffman_pre
330
;; No match at p: count the byte at p as a literal and fetch the byte at p + 1.
;; lit_code2 shares a register with dist, which is no longer needed.
331 PSRLDQ xdata, 1
332 inc dword [lit_len_hist + HIST_ELEM_SIZE*lit_code]
333 movzx lit_code2, curr_data %+ b
334 ;; Check for len/dist match for second literal
335 test len2 %+ d, 0xFFFFFFFF
336 jnz .write_lit_bits
;; otherwise fall through into .len_dist_lit_huffman_pre
337
;; Emit one literal (the byte at p) followed by a match that starts at p + 1.
;; _pre entry: len2 still holds the XOR difference; the index of its lowest
;; set bit divided by 8 is the number of leading bytes that matched.
338 .len_dist_lit_huffman_pre:
339 bsf len2, len2
340 shr len2, 3
341
;; Entry with len2 = match length (also reached from .compare_loop2).
342 .len_dist_lit_huffman:
;; Store the literal now: lit_code shares a register with tmp1, which is
;; handed to get_dist_icf_code as its third argument just below.
343 or lit_code, LIT
344 mov dword [m_out_buf], lit_code %+ d
345
346 neg dist2
347
348 get_dist_icf_code dist2, dist_code2, tmp1
349
;; hmask3 shares a register with dist2, which has been consumed above.
350 mov hmask3 %+ d, dword [rsp + hash_mask_offset]
351
352 ;; Setup for updating hash
353 lea tmp3, [f_i + 1] ; tmp3 <= k
354
;; Temporarily bias file_start by the old f_i so [file_start + len2]
;; addresses the first byte after the match; undone via tmp2 below.
355 mov tmp2, f_i
356 add file_start, f_i
357 add f_i, len2
358 cmp f_i, file_length
359 jg .len_dist_lit_huffman_finish
360
;; A match was found: push the skip threshold out and lower the skip level.
361 lea tmp1, [f_i + SKIP_SIZE_BASE]
362 mov qword [rsp + skip_match_offset], tmp1
363 sub qword [rsp + skip_level_offset], len2
364
;; Load the input at the new f_i for the next loop pass.
365 MOVDQU xdata, [file_start + len2]
366 mov tmp1, [file_start + len2]
367 sub file_start, tmp2
368
;; hash3 <- index for one more position inside the match, from the old data
369 shr curr_data, 24
370 compute_hash hash3, curr_data
371 and hash3 %+ d, hmask3 %+ d
372
373 mov curr_data, tmp1
374 shr tmp1, 8
375
;; Record tmp3, tmp3 + 1, tmp3 + 2 under hash, hash2, hash3, interleaved with
;; computing the new hash / hash2 for the positions at the new f_i.
376 mov [hash_table + 2 * hash], tmp3 %+ w
377
378 compute_hash hash, curr_data
379
380 add tmp3,1
381 mov [hash_table + 2 * hash2], tmp3 %+ w
382
383 compute_hash hash2, tmp1
384
385 add tmp3, 1
386 mov [hash_table + 2 * hash3], tmp3 %+ w
387
;; Combine the length symbol (len2 + 254) into the distance code
388 add dist_code2, 254
389 add dist_code2, len2
390
391 inc dword [lit_len_hist + HIST_ELEM_SIZE*(len2 + 254)]
392
;; The literal occupies [m_out_buf]; the match goes in the next dword.
393 mov dword [m_out_buf + 4], dist_code2 %+ d
394 add m_out_buf, 8
395
;; Histogram index = 5-bit field at DIST_OFFSET of the emitted code
396 shr dist_code2, DIST_OFFSET
397 and dist_code2, 0x1F
398 inc dword [dist_hist + HIST_ELEM_SIZE*dist_code2]
399
400 ; hash = compute_hash(state->file_start + f_i) & hash_mask;
401 and hash %+ d, hmask3 %+ d
402 and hash2 %+ d, hmask3 %+ d
403
404 ; continue
405 jmp .loop2
406
;; Literal + match where the match runs past file_length: emit it, update
;; the table and histograms, but do not load further input; go to .input_end.
407 .len_dist_lit_huffman_finish:
;; undo the temporary file_start bias applied before the branch here
408 sub file_start, tmp2
409
410 mov [hash_table + 2 * hash], tmp3 %+ w
411 add tmp3,1
412 mov [hash_table + 2 * hash2], tmp3 %+ w
413
414 add dist_code2, 254
415 add dist_code2, len2
416
417 inc dword [lit_len_hist + HIST_ELEM_SIZE*(len2 + 254)]
418
;; [m_out_buf] already holds the literal written in .len_dist_lit_huffman
419 mov dword [m_out_buf + 4], dist_code2 %+ d
420 add m_out_buf, 8
421
422 shr dist_code2, DIST_OFFSET
423 and dist_code2, 0x1F
424 inc dword [dist_hist + HIST_ELEM_SIZE*dist_code2]
425
426 jmp .input_end
427
;; Emit a match that starts at p (the first of the two positions).
;; _pre entry: len holds the XOR difference; convert the index of its lowest
;; set bit into a count of matching bytes.
428 .len_dist_huffman_pre:
429 bsf len, len
430 shr len, 3
431
;; Entry with len = match length (also reached from .compare_loop).
432 .len_dist_huffman:
;; undo the "inc f_i" done in .loop2 so f_i is the match start again
433 dec f_i
434 ;; Setup for updating hash
435 lea tmp3, [f_i + 2] ; tmp3 <= k
436
437 neg dist
438
439 ; get_dist_code(dist, &code2, &code_len2);
440 get_dist_icf_code dist, dist_code, tmp1
441
;; NOTE(review): no jump to this label is visible in this file
442 .len_dist_huffman_skip:
443
;; hmask2 shares a register with dist, which has been consumed above.
444 mov hmask2 %+ d, [rsp + hash_mask_offset]
445
;; Temporarily bias file_start by the old f_i; undone via tmp1 below.
446 mov tmp1, f_i
447 add file_start, f_i
448
449 add f_i, len
450 cmp f_i, file_length
451 jg .len_dist_huffman_finish
452
;; A match was found: push the skip threshold out and lower the skip level.
453 lea tmp2, [f_i + SKIP_SIZE_BASE]
454 mov qword [rsp + skip_match_offset], tmp2
455 sub qword [rsp + skip_level_offset], len
456
;; Load the input at the new f_i.  curr_data2 shares a register with len2
;; and len_code shares one with len, so the order below matters.
457 MOVDQU xdata, [file_start + len]
458 mov curr_data2, [file_start + len]
459 mov curr_data, curr_data2
460 sub file_start, tmp1
461 ; get_len_code(len, &code, &code_len);
462 lea len_code, [len + 254]
463 or dist_code, len_code
464
;; Record the two positions whose hashes were computed in .loop2
465 mov [hash_table + 2 * hash], tmp3 %+ w
466 add tmp3,1
467 mov [hash_table + 2 * hash2], tmp3 %+ w
468
;; New hash / hash2 for the positions at the new f_i and f_i + 1
469 compute_hash hash, curr_data
470
471 shr curr_data2, 8
472 compute_hash hash2, curr_data2
473
474 inc dword [lit_len_hist + HIST_ELEM_SIZE*len_code]
475
476 mov dword [m_out_buf], dist_code %+ d
477 add m_out_buf, 4
478
;; Histogram index = 5-bit field at DIST_OFFSET of the emitted code
479 shr dist_code, DIST_OFFSET
480 and dist_code, 0x1F
481 inc dword [dist_hist + HIST_ELEM_SIZE*dist_code]
482
483 ; hash = compute_hash(state->file_start + f_i) & hash_mask;
484 and hash %+ d, hmask2 %+ d
485 and hash2 %+ d, hmask2 %+ d
486
487 ; continue
488 jmp .loop2
489
;; Match whose end passes file_length: emit it, update the table and
;; histograms, skip the reload of input, and go to .input_end.
490 .len_dist_huffman_finish:
;; undo the temporary file_start bias applied before the branch here
491 sub file_start, tmp1
492
493 ; get_len_code(len, &code, &code_len);
494 lea len_code, [len + 254]
495 or dist_code, len_code
496
497 mov [hash_table + 2 * hash], tmp3 %+ w
498 add tmp3,1
499 mov [hash_table + 2 * hash2], tmp3 %+ w
500
501 inc dword [lit_len_hist + HIST_ELEM_SIZE*len_code]
502
503 mov dword [m_out_buf], dist_code %+ d
504 add m_out_buf, 4
505
506 shr dist_code, DIST_OFFSET
507 and dist_code, 0x1F
508 inc dword [dist_hist + HIST_ELEM_SIZE*dist_code]
509
510 jmp .input_end
511
;; Neither position matched: emit the two bytes as one packed literal pair,
;; then decide whether to start skipping ahead without match searching.
512 .write_lit_bits:
;; xdata was shifted by two bytes in .loop2, so this is the input after the pair
513 MOVQ curr_data, xdata
514
515 add f_i, 1
516 cmp f_i, file_length
517 jg .write_lit_bits_finish
518
519 MOVDQU xdata, [file_start + f_i]
520
;; the first literal was already counted in .loop2
521 inc dword [lit_len_hist + HIST_ELEM_SIZE*lit_code2]
522
;; One dword carries both literals: first in the low bits, second shifted to
;; DIST_OFFSET, plus the constant (31 << DIST_OFFSET).
523 shl lit_code2, DIST_OFFSET
524 lea lit_code, [lit_code + lit_code2 + (31 << DIST_OFFSET)]
525
526 mov dword [m_out_buf], lit_code %+ d
527 add m_out_buf, 4
528
;; Keep searching for matches until f_i passes the skip threshold
529 cmp f_i, [rsp + skip_match_offset]
530 jle .loop2
531
;; skip_level = clamp(skip_level + 1, 0, MAX_SKIP_LEVEL); tmp3 is the zero
;; source for the conditional moves.
532 xor tmp3, tmp3
533 mov rcx, [rsp + skip_level_offset]
534 add rcx, 1
535 cmovl rcx, tmp3
536 mov tmp1, MAX_SKIP_LEVEL
537 cmp rcx, MAX_SKIP_LEVEL
538 cmovg rcx, tmp1
539
;; Next threshold distance = SKIP_SIZE_BASE >> skip_level.  A 64-bit shift
;; only uses the low 6 bits of cl, hence the explicit zero for levels > 63.
540 mov tmp1, SKIP_SIZE_BASE
541 shr tmp1, cl
542
543 %if MAX_SKIP_LEVEL > 63
544 cmp rcx, 63
545 cmovg tmp1, tmp3
546 %endif
;; Stored relative for now; .skip_forward_long adds f_i before resuming .loop2
547 mov [rsp + skip_match_offset], tmp1
548 mov [rsp + skip_level_offset], rcx
549
;; skip_count = (SKIP_RATE * max(skip_level - SKIP_START, 0) + SKIP_BASE),
;; with the low bits cleared by the mask -SKIP_BASE.
;; skip_count shares rcx with the level value being consumed here.
550 sub rcx, SKIP_START
551 cmovl rcx, tmp3
552
553 lea skip_count, [SKIP_RATE * rcx + SKIP_BASE]
554 and skip_count, -SKIP_BASE
555
;; Use the short path unless the output has room: every 2 input bytes
;; produce 4 output bytes, so (m_out_end + 4 - m_out_buf) / 2 is compared
;; against skip_count.
556 mov tmp1, [rsp + m_out_end]
557 lea tmp1, [tmp1 + 4]
558 sub tmp1, m_out_buf
559 shr tmp1, 1
560 cmp tmp1, skip_count
561 jl .skip_forward_short
562
;; ...and unless at least skip_count input bytes remain, slop included
563 mov tmp1, [rsp + inbuf_slop_offset]
564 add tmp1, file_length
565 sub tmp1, f_i
566 cmp tmp1, skip_count
567 jl .skip_forward_short
;; otherwise fall through into .skip_forward_long
568
;; Emit skip_count input bytes as literal pairs, 8 bytes (4 output dwords)
;; per pass, with no match search.
569 .skip_forward_long:
570 MOVQ xdata, [file_start + f_i]
571
;; Count each of the 8 bytes in the literal/length histogram
572 movzx lit_code, byte [file_start + f_i]
573 movzx lit_code2, byte [file_start + f_i + 1]
574
575 add dword [lit_len_hist + HIST_ELEM_SIZE*lit_code], 1
576 add dword [lit_len_hist + HIST_ELEM_SIZE*lit_code2], 1
577
578 movzx lit_code, byte [file_start + f_i + 2]
579 movzx lit_code2, byte [file_start + f_i + 3]
580
581 add dword [lit_len_hist + HIST_ELEM_SIZE*lit_code], 1
582 add dword [lit_len_hist + HIST_ELEM_SIZE*lit_code2], 1
583
584 movzx lit_code, byte [file_start + f_i + 4]
585 movzx lit_code2, byte [file_start + f_i + 5]
586
587 add dword [lit_len_hist + HIST_ELEM_SIZE*lit_code], 1
588 add dword [lit_len_hist + HIST_ELEM_SIZE*lit_code2], 1
589
590 movzx lit_code, byte [file_start + f_i + 6]
591 movzx lit_code2, byte [file_start + f_i + 7]
592
593 add dword [lit_len_hist + HIST_ELEM_SIZE*lit_code], 1
594 add dword [lit_len_hist + HIST_ELEM_SIZE*lit_code2], 1
595
;; Build 4 packed dwords: bytes selected by low_lit_shuf in the low bits,
;; bytes selected by up_lit_shuf shifted to DIST_OFFSET, plus xlit per lane.
596 PSHUFB xtmp0, xdata, xlow_lit_shuf
597 PSHUFB xtmp1, xdata, xup_lit_shuf
598 PSLLD xtmp1, xtmp1, DIST_OFFSET
599 POR xtmp0, xtmp0, xtmp1
600 PADDD xtmp0, xtmp0, xlit
601 MOVDQU [m_out_buf], xtmp0
602
603 add m_out_buf, 16
604 add f_i, 8
605
606 sub skip_count, 8
607 jg .skip_forward_long
608
609 cmp file_length, f_i
610 jle .input_end
611
;; Resume match searching: reload the input and turn the relative skip
;; threshold stored by .write_lit_bits into an absolute index.
;; NOTE(review): hash / hash2 are not recomputed on this path before .loop2
;; uses them - confirm this is intended.
612 mov curr_data, [file_start + f_i]
613 MOVDQU xdata, [file_start + f_i]
614 add [rsp + skip_match_offset], f_i
615
616 jmp .loop2
617
;; Near the end of the input or output: emit literal pairs two bytes at a
;; time until one of the buffers runs out.  This loop never returns to .loop2.
618 .skip_forward_short:
619 movzx lit_code, byte [file_start + f_i]
620 movzx lit_code2, byte [file_start + f_i + 1]
621
622 inc dword [lit_len_hist + HIST_ELEM_SIZE*lit_code]
623 inc dword [lit_len_hist + HIST_ELEM_SIZE*lit_code2]
624
;; same packed two-literal encoding as in .write_lit_bits
625 shl lit_code2, DIST_OFFSET
626 lea lit_code, [lit_code + lit_code2 + (31 << DIST_OFFSET)]
627
628 mov dword [m_out_buf], lit_code %+ d
629 add m_out_buf, 4
630 add f_i, 2
631
632 cmp m_out_buf, [rsp + m_out_end]
633 ja .output_end
634
635 cmp file_length, f_i
636 jle .input_end
637
638 jmp .skip_forward_short
639
;; Last literal pair when f_i has passed file_length: count the second
;; literal, emit the packed pair, then fall through into .input_end.
640 .write_lit_bits_finish:
641 inc dword [lit_len_hist + HIST_ELEM_SIZE*lit_code2]
642
643 shl lit_code2, DIST_OFFSET
644 lea lit_code, [lit_code + lit_code2 + (31 << DIST_OFFSET)]
645
646 mov dword [m_out_buf], lit_code %+ d
647 add m_out_buf, 4
648
;; Exit because the input is exhausted.  Next state is ZSTATE_BODY, or
;; ZSTATE_FLUSH_READ_BUFFER if end_of_stream != 0 or flush != _NO_FLUSH.
649 .input_end:
;; the stream register was reused as tmp3, so reload the pointer
650 mov stream, [rsp + stream_offset]
651 mov tmp1, ZSTATE_FLUSH_READ_BUFFER
652 mov tmp2, ZSTATE_BODY
653 cmp word [stream + _end_of_stream], 0
654 cmovne tmp2, tmp1
655 cmp word [stream + _flush], _NO_FLUSH
656
657 cmovne tmp2, tmp1
658 mov dword [stream + _internal_state_state], tmp2 %+ d
659 jmp .end
660
;; Exit because the output buffer is full: next state is ZSTATE_CREATE_HDR.
661 .output_end:
662 mov stream, [rsp + stream_offset]
663 mov dword [stream + _internal_state_state], ZSTATE_CREATE_HDR
664
;; Common epilogue: write the progress back, restore registers, return.
665 .end:
666 ;; update input buffer
;; add the slop back so file_length is the true end index again
667 add file_length, [rsp + inbuf_slop_offset]
668 mov [stream + _total_in], f_i %+ d
669 mov [stream + _internal_state_block_end], f_i %+ d
670 add file_start, f_i
671 mov [stream + _next_in], file_start
672 sub file_length, f_i
673 mov [stream + _avail_in], file_length %+ d
674
675 ;; update output buffer
;; avail_out -= bytes written in this call
;; NOTE(review): this field is read with a 64-bit load during setup but
;; updated with a 32-bit sub here - confirm the field width.
676 mov [level_buf + _icf_buf_next], m_out_buf
677 sub m_out_buf, [rsp + m_out_start]
678 sub [level_buf + _icf_buf_avail_out], m_out_buf %+ d
679
680 mov rbx, [rsp + gpr_save_mem_offset + 0*8]
681 mov rsi, [rsp + gpr_save_mem_offset + 1*8]
682 mov rdi, [rsp + gpr_save_mem_offset + 2*8]
683 mov rbp, [rsp + gpr_save_mem_offset + 3*8]
684 mov r12, [rsp + gpr_save_mem_offset + 4*8]
685 mov r13, [rsp + gpr_save_mem_offset + 5*8]
686 mov r14, [rsp + gpr_save_mem_offset + 6*8]
687 mov r15, [rsp + gpr_save_mem_offset + 7*8]
688
;; With ALIGN_STACK the rbp just restored is the frame anchor saved in the
;; prologue (rbp was set to rsp before being stored).
689 %ifndef ALIGN_STACK
690 add rsp, stack_size
691 %else
692 mov rsp, rbp
693 pop rbp
694 %endif
695 ret
696
697 align 16
;; The first 8 bytes at p already match the candidate at [tmp1 + dist - 1]:
;; extend the match with compare_large and dispatch on its final length.
698 .compare_loop:
699 lea tmp2, [tmp1 + dist - 1]
700
;; Compare limit = file_length - f_i + slop + 1 (f_i was already advanced
;; past p in .loop2), capped at MAX_EMIT_SIZE.
701 mov len2, file_length
702 sub len2, f_i
703 add len2, [rsp + inbuf_slop_offset]
704 add len2, 1
705 mov tmp3, MAX_EMIT_SIZE
706 cmp len2, tmp3
707 cmovg len2, tmp3
708
;; start counting at 8: those bytes are already known to be equal
709 mov len, 8
710 compare_large tmp1, tmp2, len, len2, tmp3, ytmp0, ytmp1
711
;; len <= 258: single match.  259..LARGE_MATCH_MIN-1: truncate to 258.
;; len >= LARGE_MATCH_MIN: emit several matches via .do_emit.
712 cmp len, 258
713 jle .len_dist_huffman
714 cmp len, LARGE_MATCH_MIN
715 jge .do_emit
716 mov len, 258
717 jmp .len_dist_huffman
718
719 align 16
;; The first 8 bytes at p + 1 already match the candidate at [tmp1 + dist2]:
;; extend the match, count the byte at p as a literal, then dispatch on the
;; final match length.
720 .compare_loop2:
721 lea tmp2, [tmp1 + dist2]
722 add tmp1, 1
723
;; Compare limit = file_length - f_i + slop, capped at MAX_EMIT_SIZE
724 mov len, file_length
725 sub len, f_i
726 add len, [rsp + inbuf_slop_offset]
727 mov tmp3, MAX_EMIT_SIZE
728 cmp len, tmp3
729 cmovg len, tmp3
730
;; start counting at 8: those bytes are already known to be equal
731 mov len2, 8
732 compare_large tmp1, tmp2, len2, len, tmp3, ytmp0, ytmp1
733
;; lit_code shares a register with tmp1, so the literal is only extracted
;; after compare_large is done with the pointer.
734 movzx lit_code, curr_data %+ b
735 shr curr_data, 8
736 inc dword [lit_len_hist + HIST_ELEM_SIZE*lit_code]
;; len2 <= 258: literal + single match.  259..LARGE_MATCH_MIN-1: truncate
;; to 258.  len2 >= LARGE_MATCH_MIN: literal + several matches via .do_emit2.
737 cmp len2, 258
738 jle .len_dist_lit_huffman
739 cmp len2, LARGE_MATCH_MIN
740 jge .do_emit2
741 mov len2, 258
742 jmp .len_dist_lit_huffman
743
;; Large-match emitter (match length >= LARGE_MATCH_MIN).
;; .do_emit2 entry: match starts at the second position; write the pending
;; literal first, then continue as if the match were the primary one.
744 .do_emit2:
745 or lit_code, LIT
746 mov dword [m_out_buf], lit_code %+ d
747 add m_out_buf, 4
748
749 inc f_i
750 mov dist, dist2
751 mov len, len2
752
;; .do_emit entry: dist (negated) and len describe the match.
753 .do_emit:
754 neg dist
755 get_dist_icf_code dist, dist_code, tmp1
756
;; len_code2 = code for a full 258-byte match at this distance;
;; tmp1 = 5-bit field at DIST_OFFSET, used as the dist histogram index.
757 mov len_code2, 258 + 254
758 or len_code2, dist_code
759 mov tmp1, dist_code
760 shr tmp1, DIST_OFFSET
761 and tmp1, 0x1F
762 lea tmp3, [f_i + 1]
763 dec f_i
764
;; Record the two positions whose hashes were computed in .loop2
765 mov [hash_table + 2 * hash], tmp3 %+ w
766 add tmp3,1
767 mov [hash_table + 2 * hash2], tmp3 %+ w
;; Emit 258-byte matches while at least LARGE_MATCH_MIN bytes remain
768 .emit:
769 sub len, 258
770 add f_i, 258
771
772 inc dword [lit_len_hist + HIST_ELEM_SIZE*(258 + 254)]
773 inc dword [dist_hist + HIST_ELEM_SIZE*tmp1]
774 mov dword [m_out_buf], len_code2 %+ d
775 add m_out_buf, 4
776
777 cmp m_out_buf, [rsp + m_out_end]
778 ja .output_end
779
780 cmp len, LARGE_MATCH_MIN
781 jge .emit
782
;; Final piece: min(len, 258) bytes.  len2 shares a register with len_code2,
;; which is no longer needed.
783 mov len2, 258
784 cmp len, len2
785 cmovg len, len2
786
787 ; get_len_code(len, &code, &code_len);
788 add f_i, len
789 lea len_code, [len + 254]
790 or dist_code, len_code
791
792 inc dword [lit_len_hist + HIST_ELEM_SIZE*len_code]
793 inc dword [dist_hist + HIST_ELEM_SIZE*tmp1]
794
795 mov dword [m_out_buf], dist_code %+ d
796 add m_out_buf, 4
797
798 cmp file_length, f_i
799 jle .input_end
800
;; Re-seed the hash table with the last 4 * LARGE_MATCH_HASH_REP positions
;; before the new f_i (positions inside the long match were not hashed).
801 lea tmp2, [f_i - 4 * LARGE_MATCH_HASH_REP]
802 mov hmask2 %+ d, [rsp + hash_mask_offset]
803
804 %rep LARGE_MATCH_HASH_REP
;; four overlapping dwords at tmp2 .. tmp2 + 3
805 mov curr_data %+ d, dword [file_start + tmp2]
806 mov curr_data2 %+ d, dword [file_start + tmp2 + 1]
807 mov tmp3 %+ d, dword [file_start + tmp2 + 2]
808 mov tmp1 %+ d, dword [file_start + tmp2 + 3]
809
;; hmask3's register serves as the fourth hash register here
810 compute_hash hash, curr_data
811 compute_hash hash2, curr_data2
812 compute_hash hash3, tmp3
813 compute_hash hmask3, tmp1
814
815 and hash %+ d, hmask2 %+ d
816 and hash2 %+ d, hmask2 %+ d
817 and hash3 %+ d, hmask2 %+ d
818 and hmask3 %+ d, hmask2 %+ d
819
820 mov [hash_table + 2 * hash], tmp2 %+ w
821 add tmp2, 1
822 mov [hash_table + 2 * hash2], tmp2 %+ w
823 add tmp2, 1
824 mov [hash_table + 2 * hash3], tmp2 %+ w
825 add tmp2, 1
826 mov [hash_table + 2 * hmask3], tmp2 %+ w
827 %if (LARGE_MATCH_HASH_REP > 1)
828 add tmp2, 1
829 %endif
830 %endrep
831 ; for (f_i = f_start_i; f_i < file_length; f_i++) {
;; Re-prime the main loop exactly as the setup code does: load the input at
;; f_i and compute hash / hash2 for f_i and f_i + 1.
832 MOVDQU xdata, [file_start + f_i]
833 mov curr_data, [file_start + f_i]
834 mov tmp1, curr_data
835
836 compute_hash hash, curr_data
837
838 shr tmp1, 8
839 compute_hash hash2, tmp1
840
841 and hash, hmask2
842 and hash2, hmask2
843
844 jmp .loop2
845
;; .write_first_byte
;;   Reached once from the setup code when has_hist == IGZIP_NO_HIST.  With no
;;   history there is nothing to match against, so the first input byte is
;;   emitted as a plain literal and its position is recorded in the hash
;;   table.  The loop state is then advanced by one byte:
;;     hash  <- old hash2              (index for the new f_i)
;;     hash2 <- hash(curr_data >> 16)  (index for the new f_i + 1)
;;   In:  hash/hash2 = masked indices for f_i and f_i + 1,
;;        curr_data  = 8 input bytes at f_i, stream = stream pointer
;;   Out: falls into .loop2 with f_i advanced by 1, or exits through
;;        .input_end / .output_end.
.write_first_byte:
	mov	hmask1 %+ d, [rsp + hash_mask_offset]
	cmp	m_out_buf, [rsp + m_out_end]
	ja	.output_end

	mov	byte [stream + _internal_state_has_hist], IGZIP_HIST

	mov	[hash_table + 2 * hash], f_i %+ w

	mov	hash, hash2
	;; tmp2 is never loaded on the way here (its register still holds the
	;; argument copy made at function entry), so take the bytes to hash from
	;; curr_data.  Shifting by 16 bits selects the input starting two bytes
	;; past the old f_i, i.e. one byte past the new f_i.
	mov	tmp2, curr_data
	shr	tmp2, 16
	compute_hash	hash2, tmp2

	;; Emit the first byte as a literal and count it
	and	curr_data, 0xff
	inc	dword [lit_len_hist + HIST_ELEM_SIZE*curr_data]
	or	curr_data, LIT

	mov	dword [m_out_buf], curr_data %+ d
	add	m_out_buf, 4

	;; Advance one byte and reload the input for the main loop
	MOVDQU	xdata, [file_start + f_i + 1]
	add	f_i, 1
	mov	curr_data, [file_start + f_i]
	and	hash %+ d, hmask1 %+ d
	and	hash2 %+ d, hmask1 %+ d

	cmp	f_i, file_length
	jl	.loop2
	jmp	.input_end
875
;; End of one generated function body: reset the per-version switches and
;; rotate the queued ARCH / COMPARE_TYPE values for the next %rep pass.
876 %ifdef USE_HSWNI
877 %undef USE_HSWNI
878 %endif
879
880 ;; Shift defines over in order to iterate over all versions
;; ARCH <- ARCH1, ARCH1 <- ARCH2
881 %undef ARCH
882 %xdefine ARCH ARCH1
883 %undef ARCH1
884 %xdefine ARCH1 ARCH2
885
;; Only rotate COMPARE_TYPE if this file chose it (the includer did not pin it)
886 %ifdef COMPARE_TYPE_NOT_DEF
887 %undef COMPARE_TYPE
888 %xdefine COMPARE_TYPE COMPARE_TYPE1
889 %undef COMPARE_TYPE1
890 %xdefine COMPARE_TYPE1 COMPARE_TYPE2
891 %endif
;; closes the %rep 3 opened before the function header
892 %endrep
;; Constants shared by all generated versions (emitted once, after %endrep).
;; Per-dword constant broadcast into xlit and added to every packed literal
;; pair in .skip_forward_long.
893 min_lit_dist_syms:
894 dd LIT + (1 << DIST_OFFSET)
895
;; Byte-shuffle masks for .skip_forward_long.  Each picks one source byte
;; into the low byte of every dword: low_lit_shuf takes bytes 0, 2, 4, 6 and
;; up_lit_shuf takes bytes 1, 3, 5, 7.  The 0xff entries have the high bit
;; set, which a pshufb-style shuffle turns into zero bytes (assuming the
;; PSHUFB macro wraps that instruction).
896 low_lit_shuf:
897 db 0x00, 0xff, 0xff, 0xff, 0x02, 0xff, 0xff, 0xff
898 db 0x04, 0xff, 0xff, 0xff, 0x06, 0xff, 0xff, 0xff
899 up_lit_shuf:
900 db 0x01, 0xff, 0xff, 0xff, 0x03, 0xff, 0xff, 0xff
901 db 0x05, 0xff, 0xff, 0xff, 0x07, 0xff, 0xff, 0xff