]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
2 | ; Copyright(c) 2011-2016 Intel Corporation All rights reserved. | |
3 | ; | |
4 | ; Redistribution and use in source and binary forms, with or without | |
5 | ; modification, are permitted provided that the following conditions | |
6 | ; are met: | |
7 | ; * Redistributions of source code must retain the above copyright | |
8 | ; notice, this list of conditions and the following disclaimer. | |
9 | ; * Redistributions in binary form must reproduce the above copyright | |
10 | ; notice, this list of conditions and the following disclaimer in | |
11 | ; the documentation and/or other materials provided with the | |
12 | ; distribution. | |
13 | ; * Neither the name of Intel Corporation nor the names of its | |
14 | ; contributors may be used to endorse or promote products derived | |
15 | ; from this software without specific prior written permission. | |
16 | ; | |
17 | ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
18 | ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
19 | ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
20 | ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
21 | ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
22 | ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
23 | ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
24 | ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
25 | ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
26 | ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
27 | ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
28 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
29 | ||
30 | %include "options.asm" | |
31 | %ifndef TEST | |
32 | ||
33 | extern fold_4 | |
34 | ||
35 | %include "lz0a_const.asm" | |
36 | %include "data_struct2.asm" | |
37 | %include "bitbuf2.asm" | |
38 | %include "huffman.asm" | |
39 | %include "igzip_compare_types.asm" | |
40 | ||
41 | %include "reg_sizes.asm" | |
42 | ||
43 | %include "stdmac.asm" | |
44 | ||
;; MOVDQA names the aligned 128-bit move used throughout this file: the
;; VEX-encoded vmovdqa when ARCH is 04, the legacy SSE movdqa otherwise.
45 | %if (ARCH == 04) | |
46 | %define MOVDQA vmovdqa | |
47 | %else | |
48 | %define MOVDQA movdqa | |
49 | %endif | |
50 | ||
;; MARK <name>: when DEBUG is defined, declares <name> global and defines it as
;; a label at the current position. Without DEBUG the macro expands to nothing,
;; so MARK lines emit no code and no symbols.
51 | %ifdef DEBUG | |
52 | %macro MARK 1 | |
53 | global %1 | |
54 | %1: | |
55 | %endm | |
56 | %else | |
57 | %macro MARK 1 | |
58 | %endm | |
59 | %endif | |
60 | ||
61 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
62 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
63 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
;; Symbolic register names used by the function body. Several names alias the
;; same physical register (for example tmp2 and hash2 are both rcx, and blen
;; and file_start are both rdi), so two aliases of one register can never hold
;; live values at the same time. Values that must survive such reuse (blen,
;; in_buf, f_end_i) are spilled to the stack slots defined further down.
64 | %define tmp2 rcx | |
65 | %define hash2 rcx | |
66 | ||
67 | %define b_bytes_valid rax | |
68 | %define curr_data rax | |
69 | %define code rax | |
70 | %define tmp5 rax | |
71 | ||
72 | %define tmp4 rbx | |
73 | %define dist rbx | |
74 | %define code2 rbx | |
75 | ||
76 | %define x rdx | |
77 | %define len rdx | |
78 | %define hash rdx | |
79 | %define code_len3 rdx | |
80 | ||
81 | %define tmp1 rsi | |
82 | %define code_len2 rsi | |
83 | ||
84 | %define blen rdi | |
85 | %define file_start rdi | |
86 | ||
87 | %define m_bit_count rbp | |
88 | ||
89 | %define in_buf r8 | |
90 | %define curr_data2 r8 | |
91 | %define len2 r8 | |
92 | %define tmp6 r8 | |
93 | ||
94 | %define m_bits r9 | |
95 | ||
96 | %define f_i r10 | |
97 | ||
98 | %define m_out_buf r11 | |
99 | ||
100 | %define f_end_i r12 | |
101 | %define dist2 r12 | |
102 | %define tmp7 r12 | |
103 | %define code4 r12 | |
104 | ||
105 | %define tmp3 r13 | |
106 | %define code3 r13 | |
107 | ||
108 | %define stream r14 | |
109 | ||
110 | %define hufftables r15 | |
111 | ||
;; Vector registers: xmm0-xmm3 carry the CRC state, xmm4 the fold constant,
;; and registers 5-9 are scratch.
112 | %define crc_0 xmm0 ; in/out: crc state | |
113 | %define crc_1 xmm1 ; in/out: crc state | |
114 | %define crc_2 xmm2 ; in/out: crc state | |
115 | %define crc_3 xmm3 ; in/out: crc state | |
116 | %define crc_fold xmm4 ; in: (loaded from fold_4) | |
117 | ||
118 | %define xtmp0 xmm5 ; tmp | |
119 | %define xtmp1 xmm6 ; tmp | |
120 | %define xtmp2 xmm7 ; tmp | |
121 | %define xtmp3 xmm8 ; tmp | |
122 | %define xtmp4 xmm9 ; tmp | |
123 | ||
124 | %define ytmp0 ymm5 ; tmp | |
125 | %define ytmp1 ymm6 ; tmp | |
126 | ||
;; vtmp0-vtmp4 use the same register numbers as xtmp0-xtmp4 (5-9): ymm width
;; when ARCH is 04, xmm width otherwise.
127 | %if (ARCH == 04) | |
128 | %define vtmp0 ymm5 ; tmp | |
129 | %define vtmp1 ymm6 ; tmp | |
130 | %define vtmp2 ymm7 ; tmp | |
131 | %define vtmp3 ymm8 ; tmp | |
132 | %define vtmp4 ymm9 ; tmp | |
133 | %else | |
134 | %define vtmp0 xmm5 ; tmp | |
135 | %define vtmp1 xmm6 ; tmp | |
136 | %define vtmp2 xmm7 ; tmp | |
137 | %define vtmp3 xmm8 ; tmp | |
138 | %define vtmp4 xmm9 ; tmp | |
139 | %endif | |
140 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
141 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
142 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
143 | ||
;; b_bytes_processed is kept in the same register as f_i (r10).
144 | %define b_bytes_processed f_i | |
145 | ||
;; Stack frame layout. The offsets are applied to rsp after the prologue has
;; lowered it: three 8-byte spill slots, the 4-byte empty_buffer_flag (in an
;; 8-byte slot), then the GPR and XMM save areas.
;; stack_size = 4*8 + 8*8 + 4*16 + 8 = 168 bytes.
146 | blen_mem_offset equ 0 ; local variable (8 bytes) | |
147 | in_buf_mem_offset equ 8 | |
148 | f_end_i_mem_offset equ 16 | |
149 | empty_buffer_flag equ 24 | |
150 | gpr_save_mem_offset equ 32 ; gpr save area (8*8 bytes) | |
151 | xmm_save_mem_offset equ 32 + 8*8 ; xmm save area (4*16 bytes) (16 byte aligned) | |
152 | stack_size equ 4*8 + 8*8 + 4*16 + 8 | |
153 | ;;; 8 because stack address is odd multiple of 8 after a function call and | |
154 | ;;; we want it aligned to 16 bytes | |
155 | ||
156 | ; void isal_deflate_body ( isal_zstream *stream ) | |
157 | ; arg 1: rcx: addr of stream | |
;   When assembling for elf64 the argument arrives in rdi and is copied into
;   rcx first, so the rest of the function always reads the stream from rcx.
;   The exported symbol name is isal_deflate_body_ followed by the value of ARCH.
158 | global isal_deflate_body_ %+ ARCH | |
159 | isal_deflate_body_ %+ ARCH %+ : | |
160 | %ifidn __OUTPUT_FORMAT__, elf64 | |
161 | mov rcx, rdi | |
162 | %endif | |
163 | ||
164 | ;; do nothing if (avail_in == 0) | |
165 | cmp dword [rcx + _avail_in], 0 | |
166 | jne skip1 | |
167 | ||
168 | ;; Set stream's next state | |
;; eax = ZSTATE_FLUSH_READ_BUFFER if end_of_stream != 0 or flush != _NO_FLUSH,
;; otherwise ZSTATE_BODY. This early exit only uses rax/rdx and returns before
;; any stack frame is set up or any register is saved.
169 | mov rdx, ZSTATE_FLUSH_READ_BUFFER | |
170 | mov rax, ZSTATE_BODY | |
171 | cmp dword [rcx + _end_of_stream], 0 | |
172 | cmovne rax, rdx | |
173 | cmp dword [rcx + _flush], _NO_FLUSH | |
174 | cmovne rax, rdx | |
175 | mov dword [rcx + _internal_state_state], eax | |
176 | ret | |
177 | skip1: | |
178 | ||
;; Allocate the local frame. With ALIGN_STACK, rbp is pushed and becomes a
;; frame pointer, and rsp is rounded down to a 16-byte boundary; otherwise rsp
;; is simply lowered by stack_size.
179 | %ifdef ALIGN_STACK | |
180 | push rbp | |
181 | mov rbp, rsp | |
182 | sub rsp, stack_size | |
183 | and rsp, ~15 | |
184 | %else | |
185 | sub rsp, stack_size | |
186 | %endif | |
187 | ||
;; Save rbx, rsi, rdi, rbp, r12-r15 and xmm6-xmm9; the epilogue restores all
;; of them. In the ALIGN_STACK build the rbp slot holds the frame-pointer value
;; set just above (rbp is reused as m_bit_count afterwards), and the epilogue
;; reloads it before executing `mov rsp, rbp`.
188 | mov [rsp + gpr_save_mem_offset + 0*8], rbx | |
189 | mov [rsp + gpr_save_mem_offset + 1*8], rsi | |
190 | mov [rsp + gpr_save_mem_offset + 2*8], rdi | |
191 | mov [rsp + gpr_save_mem_offset + 3*8], rbp | |
192 | mov [rsp + gpr_save_mem_offset + 4*8], r12 | |
193 | mov [rsp + gpr_save_mem_offset + 5*8], r13 | |
194 | mov [rsp + gpr_save_mem_offset + 6*8], r14 | |
195 | mov [rsp + gpr_save_mem_offset + 7*8], r15 | |
196 | MOVDQA [rsp + xmm_save_mem_offset + 0*16], xmm6 | |
197 | MOVDQA [rsp + xmm_save_mem_offset + 1*16], xmm7 | |
198 | MOVDQA [rsp + xmm_save_mem_offset + 2*16], xmm8 | |
199 | MOVDQA [rsp + xmm_save_mem_offset + 3*16], xmm9 | |
200 | ||
201 | mov stream, rcx | |
202 | ||
;; Load the four CRC state registers from the stream and the fold constant
;; from the external symbol fold_4, then clear has_eob.
203 | MOVDQA crc_0, [stream + _internal_state_crc + 0*16] | |
204 | MOVDQA crc_1, [stream + _internal_state_crc + 1*16] | |
205 | MOVDQA crc_2, [stream + _internal_state_crc + 2*16] | |
206 | MOVDQA crc_3, [stream + _internal_state_crc + 3*16] | |
207 | MOVDQA crc_fold, [fold_4] | |
208 | mov dword [stream + _internal_state_has_eob], 0 | |
209 | ||
210 | ; state->bitbuf.set_buf(stream->next_out, stream->avail_out); | |
;; m_out_start = next_out; m_out_end = next_out + avail_out - SLOP.
211 | mov m_out_buf, [stream + _next_out] | |
212 | mov [stream + _internal_state_bitbuf_m_out_start], m_out_buf | |
213 | mov tmp1 %+ d, [stream + _avail_out] | |
214 | add tmp1, m_out_buf | |
215 | sub tmp1, SLOP | |
;; NOTE(review): no branch to skip_SLOP is visible in this file.
216 | skip_SLOP: | |
217 | mov [stream + _internal_state_bitbuf_m_out_end], tmp1 | |
218 | ||
219 | mov m_bits, [stream + _internal_state_bitbuf_m_bits] | |
220 | mov m_bit_count %+ d, [stream + _internal_state_bitbuf_m_bit_count] | |
221 | ||
222 | mov hufftables, [stream + _hufftables] | |
223 | ; in_buf = stream->next_in | |
224 | mov in_buf, [stream + _next_in] | |
225 | mov blen %+ d, [stream + _avail_in] | |
226 | ||
;; empty_buffer_flag is a dword that is later tested as a whole: byte 0 is set
;; when flush == _FULL_FLUSH, byte 1 when b_bytes_processed == 0. Any nonzero
;; value sends the first processed position through write_first_byte.
227 | mov dword [rsp + empty_buffer_flag], 0 | |
228 | cmp dword [stream + _flush], _FULL_FLUSH | |
229 | sete byte [rsp + empty_buffer_flag] | |
230 | cmp dword [stream + _internal_state_b_bytes_processed], 0 | |
231 | sete byte [rsp + empty_buffer_flag + 1] | |
232 | ||
233 | ; while (blen != 0) | |
;; Outer loop. Each pass picks a chunk size x, makes room in state->buffer if
;; needed, copies x input bytes in through the COPY_IN_CRC macro, and then
;; sets up f_i / f_end_i for the inner match loop (loop2).
234 | MARK __Compute_X_ %+ ARCH | |
235 | loop1: | |
236 | ; x = D + LA - (state->b_bytes_valid - state->b_bytes_processed); | |
237 | mov b_bytes_valid %+ d, [stream + _internal_state_b_bytes_valid] | |
238 | mov b_bytes_processed %+ d, [stream + _internal_state_b_bytes_processed] | |
239 | lea x, [b_bytes_processed + D + LA] | |
240 | sub x, b_bytes_valid | |
241 | ||
242 | ; if (x > D) x = D; | |
;; cmov has no immediate form, so D is read from the const_D memory constant.
243 | cmp x, D | |
244 | cmova x, [const_D] | |
245 | ||
246 | ; if (blen < D) x = blen; | |
247 | cmp blen, D | |
248 | cmovb x, blen | |
249 | ||
250 | ;; process x bytes starting at in_buf | |
251 | ||
252 | ;; If there isn't enough room, shift buffer down | |
253 | ; if (x > BSIZE - state->b_bytes_valid) { | |
254 | mov tmp1, BSIZE | |
255 | sub tmp1, b_bytes_valid | |
256 | cmp x, tmp1 | |
257 | jbe skip_move | |
258 | ||
259 | ; if (state->b_bytes_processed < state->b_bytes_valid - LA) { | |
260 | mov tmp1, b_bytes_valid | |
261 | sub tmp1, LA | |
262 | cmp b_bytes_processed, tmp1 | |
263 | jae do_move | |
264 | ||
265 | ;; We need to move an odd amount, skip move for this copy of loop | |
;; x = 0: nothing is copied on this pass. blen is spilled unchanged and control
;; joins the common path after the copy.
266 | xor x,x | |
267 | mov [rsp + blen_mem_offset], blen | |
268 | jmp skip_move_zero | |
269 | ||
270 | MARK __shift_data_down_ %+ ARCH | |
271 | do_move: | |
272 | ; offset = state->b_bytes_valid - (D + LA); | |
273 | mov tmp4, b_bytes_valid | |
274 | sub tmp4, D + LA | |
275 | ; copy_D_LA(state->buffer, state->buffer + offset); | |
276 | lea tmp1, [stream + _internal_state_buffer] | |
277 | lea tmp2, [tmp1 + tmp4] | |
278 | copy_D_LA tmp1, tmp2, tmp3, vtmp0, vtmp1, vtmp2, vtmp3 | |
279 | ; tmp1 clobbered | |
280 | ||
281 | ; state->file_start -= offset; | |
282 | sub [stream + _internal_state_file_start], tmp4 | |
283 | ; state->b_bytes_processed -= offset; | |
284 | sub b_bytes_processed, tmp4 | |
;; After the shift exactly D + LA bytes remain valid in the buffer.
285 | mov b_bytes_valid, D + LA | |
286 | ||
287 | MARK __copy_in_ %+ ARCH | |
288 | skip_move: | |
289 | sub blen, x | |
290 | ||
;; Spill the remaining input length: rdi (blen) is reused as file_start below.
291 | mov [rsp + blen_mem_offset], blen | |
292 | ||
293 | ; copy_in(state->buffer + state->b_bytes_valid, in_buf, x); | |
294 | lea tmp1, [stream + _internal_state_buffer + b_bytes_valid] | |
295 | mov tmp2, in_buf | |
296 | mov tmp3, x | |
297 | ||
298 | ||
299 | COPY_IN_CRC tmp1, tmp2, tmp3, tmp4, crc_0, crc_1, crc_2, crc_3, crc_fold, \ | |
300 | xtmp0, xtmp1, xtmp2, xtmp3, xtmp4 | |
301 | ||
302 | ; in_buf += x; | |
303 | add in_buf, x | |
304 | MARK __prepare_loop_ %+ ARCH | |
305 | skip_move_zero: | |
;; Spill in_buf: r8 is reused as curr_data2 / len2 / tmp6 in the inner loop.
306 | mov [rsp + in_buf_mem_offset], in_buf | |
307 | ; state->b_bytes_valid += x; | |
308 | add b_bytes_valid, x | |
309 | mov [stream + _internal_state_b_bytes_valid], b_bytes_valid %+ d | |
310 | ||
311 | ; f_end_i = state->b_bytes_valid - LA; | |
;; The mov is assembled only when f_end_i and b_bytes_valid name different
;; registers (here r12 and rax, so it is emitted).
312 | %ifnidn f_end_i, b_bytes_valid | |
313 | mov f_end_i, b_bytes_valid | |
314 | %endif | |
315 | sub f_end_i, LA | |
316 | ; if (f_end_i <= 0) continue; | |
;; Signed comparison: f_end_i is negative when b_bytes_valid < LA.
317 | cmp f_end_i, 0 | |
318 | jle continue_while | |
319 | ||
320 | ; f_start_i = state->b_bytes_processed; | |
321 | ;; f_i and b_bytes_processed are same register, just store b_bytes_proc | |
322 | mov [stream + _internal_state_b_bytes_processed], b_bytes_processed %+ d | |
323 | ||
324 | ; f_start_i += (uint32_t)(state->buffer - state->file_start); | |
;; Rebase both f_i and f_end_i from buffer-relative indices to offsets from
;; file_start by adding (address of state->buffer - file_start). f_end_i is
;; also spilled because r12 is reused inside the inner loop.
325 | mov file_start, [stream + _internal_state_file_start] | |
326 | lea tmp1, [stream + _internal_state_buffer] | |
327 | sub tmp1, file_start | |
328 | add f_i, tmp1 | |
329 | add f_end_i, tmp1 | |
330 | mov [rsp + f_end_i_mem_offset], f_end_i | |
331 | ||
332 | ; for (f_i = f_start_i; f_i < f_end_i; f_i++) { | |
333 | cmp f_i, f_end_i | |
334 | jge end_loop_2 | |
335 | ||
336 | MARK __misc_compute_hash_lookup_ %+ ARCH | |
;; Load the 4 bytes at the current position. If any byte of empty_buffer_flag
;; is set, the first position is handled by write_first_byte instead.
337 | mov curr_data %+ d, [file_start + f_i] | |
338 | ||
339 | cmp dword [rsp + empty_buffer_flag], 0 | |
340 | jne write_first_byte | |
341 | ||
342 | mov curr_data2, curr_data | |
343 | ||
344 | compute_hash hash, curr_data | |
;; Jump over whatever padding `align 16` emits in front of loop2.
345 | jmp loop2 | |
346 | ||
347 | align 16 | |
348 | ||
;; Inner loop. Every entry to loop2 arrives with curr_data and curr_data2
;; holding the 4 bytes at file_start + f_i and hash holding the not-yet-masked
;; hash of that data. Each iteration examines two positions: f_i (hash, dist)
;; and f_i + 1 (hash2, dist2).
349 | loop2: | |
;; hash2: CRC32 instruction with a zero seed over curr_data2 >> 8, i.e. the
;; loaded bytes starting one position further on.
350 | shr curr_data2, 8 | |
351 | xor hash2 %+ d, hash2 %+ d | |
352 | crc32 hash2 %+ d, curr_data2 %+ d | |
353 | ||
354 | ; hash = compute_hash(state->file_start + f_i) & HASH_MASK; | |
355 | and hash %+ d, HASH_MASK | |
356 | and hash2 %+ d, HASH_MASK | |
357 | ||
358 | ; if (state->bitbuf.is_full()) { | |
359 | cmp m_out_buf, [stream + _internal_state_bitbuf_m_out_end] | |
360 | ja bitbuf_full | |
361 | ||
;; Zero the full registers first: dist and dist2 are written below through
;; their 16-bit halves only, and tmp3 = 0 is the cmov source further down.
362 | xor dist, dist | |
363 | xor dist2, dist2 | |
364 | xor tmp3, tmp3 | |
365 | ||
;; tmp1 = address of the current position, tmp6 = address one byte before it.
366 | lea tmp1, [file_start + f_i] | |
367 | lea tmp6, [tmp1 - 1] | |
368 | ||
;; dist = (uint16_t) f_i - head[hash], computed in 16 bits.
369 | mov dist %+ w, f_i %+ w | |
370 | sub dist %+ w, word [stream + _internal_state_head + 2 * hash] | |
371 | ||
372 | ; state->head[hash] = (uint16_t) f_i; | |
373 | mov [stream + _internal_state_head + 2 * hash], f_i %+ w | |
374 | ||
;; From here on f_i refers to the second position of the pair.
375 | inc f_i | |
376 | ||
;; dist2 = (uint16_t) f_i - head[hash2], then pre-decremented for the range
;; check below.
377 | mov dist2 %+ w, f_i %+ w | |
378 | sub dist2 %+ w, word [stream + _internal_state_head + 2 * hash2] | |
379 | dec dist2 | |
380 | ||
381 | ; state->head[hash2] = (uint16_t) f_i; | |
382 | mov [stream + _internal_state_head + 2 * hash2], f_i %+ w | |
383 | ||
;; tmp2 = address of the candidate match for the first position.
384 | mov tmp2, tmp1 | |
385 | sub tmp2, dist | |
386 | dec dist | |
387 | ||
388 | ; if ((dist-1) < (D-1)) { | |
;; Unsigned range check on dist - 1. When it fails, dist - 1 is replaced by 0
;; and tmp2 by tmp1 - 1, so after the inc the candidate is at distance 1.
389 | cmp dist %+ d, (D-1) | |
390 | cmovae tmp2, tmp6 | |
391 | cmovae dist, tmp3 | |
392 | inc dist | |
393 | ||
;; Same range check and fallback to distance 1 for the second position.
394 | cmp dist2 %+ d, (D-1) | |
395 | cmovae dist2, tmp3 | |
396 | inc dist2 | |
397 | ||
398 | MARK __compare_ %+ ARCH | |
399 | ; len = compare258(state->file_start + f_i, | |
400 | ; state->file_start + f_i - dist); | |
401 | ||
402 | ;; Speculatively load distance code (except for when large windows are used) | |
403 | get_packed_dist_code dist, code2, hufftables | |
404 | ||
405 | ;; Check for long len/dist match (>7) with first literal | |
;; len = 8 bytes at the first position XOR 8 bytes at its candidate. Zero means
;; all 8 bytes are equal, and compare_loop continues the comparison.
406 | mov len, [tmp1] | |
407 | xor len, [tmp2] | |
408 | jz compare_loop | |
409 | ||
;; blsmsk sets every bit up to and including the lowest set bit of len. After
;; OR-ing in 0xFFFFFF, tmp3 equals exactly 0xFFFFFF when the first mismatch is
;; inside the low 3 bytes, and is a wider mask otherwise.
410 | %ifdef USE_HSWNI | |
411 | blsmsk tmp3, len | |
412 | or tmp3, 0xFFFFFF | |
413 | %endif | |
414 | ||
;; f_i was already incremented, so tmp1 is now the second position and tmp2
;; its candidate at distance dist2.
415 | lea tmp1, [file_start + f_i] | |
416 | mov tmp2, tmp1 | |
417 | sub tmp2, dist2 | |
418 | ||
419 | ;; Speculatively load distance code (except for when large windows are used) | |
420 | get_packed_dist_code dist2, code4, hufftables | |
421 | ||
422 | ;; Check for len/dist match (>7) with second literal | |
423 | mov len2, [tmp1] | |
424 | xor len2, [tmp2] | |
425 | jz compare_loop2 | |
426 | ||
427 | %ifdef USE_HSWNI | |
428 | ;; Check for len/dist match for first literal | |
;; If no differing bit of the second-position compare lies under the tmp3
;; mask, emit the first byte as a literal followed by the second-position
;; match. Otherwise emit two literals when tmp3 == 0xFFFFFF (the first match is
;; shorter than 3 bytes), or the first-position match when it is not.
429 | test tmp3, len2 | |
430 | jz len_dist_lit_huffman_pre | |
431 | ||
432 | cmp tmp3, 0xFFFFFF | |
433 | je encode_2_literals | |
434 | jmp len_dist_huffman_pre | |
435 | ||
436 | ||
437 | MARK __len_dist_lit_huffman_ %+ ARCH | |
438 | len_dist_lit_huffman_pre: | |
439 | movzx tmp1, curr_data %+ b | |
440 | get_lit_code tmp1, code3, code_len3, hufftables | |
441 | %else | |
442 | ;; Speculatively load the code for the first literal | |
443 | movzx tmp1, curr_data %+ b | |
444 | get_lit_code tmp1, code3, rcx, hufftables | |
445 | ||
446 | ;; Check for len/dist match for first literal | |
;; Low 24 bits of len all zero: the first 3 bytes match at the first position.
447 | test len, 0xFFFFFF | |
448 | jz len_dist_huffman_pre | |
449 | ||
450 | ;; Speculatively load the code for the second literal | |
;; Note: code2 shares rbx with dist, so dist is overwritten from here on.
451 | shr curr_data, 8 | |
452 | and curr_data, 0xff | |
453 | get_lit_code curr_data, code2, code_len2, hufftables | |
454 | ||
;; Combine both literals: first literal (code3, length in rcx) in the low
;; bits, second literal shifted above it.
455 | shl code2, cl | |
456 | or code2, code3 | |
457 | add code_len2, rcx | |
458 | ||
459 | ;; Check for len/dist match for second literal | |
460 | test len2, 0xFFFFFF | |
461 | jnz write_lit_bits | |
462 | ||
463 | MARK __len_dist_lit_huffman_ %+ ARCH | |
464 | len_dist_lit_huffman_pre: | |
;; Keep the first literal's code length in code_len3; rcx is reused below.
465 | mov code_len3, rcx | |
466 | %endif | |
;; len2 = index of the lowest differing bit / 8 = number of matching bytes.
467 | bsf len2, len2 | |
468 | shr len2, 3 | |
469 | ||
;; Emit "literal + match at the second position" as one bit string. On entry
;; code3/code_len3 describe the first literal, len2 is the match length and
;; dist2/code4 describe the distance. The result is assembled in code4 as
;;   ((dist_code << len_code_len) | len_code) << code_len3 | code3
;; with the total bit count accumulated in code_len2.
470 | len_dist_lit_huffman: | |
471 | %ifndef LONGER_HUFFTABLE | |
472 | mov tmp4, dist2 | |
473 | get_dist_code tmp4, code4, code_len2, hufftables ;; clobbers dist, rcx | |
474 | %else | |
475 | unpack_dist_code code4, code_len2 | |
476 | %endif | |
477 | get_len_code len2, code, rcx, hufftables ;; rcx is code_len | |
478 | ||
479 | %ifdef USE_HSWNI | |
480 | shlx code4, code4, rcx | |
481 | %else | |
482 | shl code4, cl | |
483 | %endif | |
484 | or code4, code | |
485 | add code_len2, rcx | |
486 | ||
;; Shift again by the literal's code length and place the literal in the low
;; bits.
487 | mov rcx, code_len3 | |
488 | ||
489 | %ifdef USE_HSWNI | |
490 | shlx code4, code4, rcx | |
491 | %else | |
492 | shl code4, cl | |
493 | %endif | |
494 | or code4, code3 | |
495 | add code_len2, rcx | |
496 | ||
;; The shared write path below takes its bits from code2.
497 | mov code2, code4 | |
498 | ;; Setup for updating hash | |
499 | lea tmp3, [f_i + 1] ; tmp3 <= k | |
500 | add f_i, len2 | |
501 | ||
502 | ; hash = compute_hash(state->file_start + k) & HASH_MASK; | |
;; tmp7 = the same 4 bytes shifted down by one byte; update_hash_for_symbol
;; hashes it in the LIMIT_HASH_UPDATE build.
503 | mov tmp5 %+ d, [file_start + tmp3] | |
504 | mov tmp7, tmp5 | |
505 | shr tmp7, 8 | |
506 | ||
507 | compute_hash hash, tmp5 | |
508 | and hash %+ d, HASH_MASK | |
509 | ||
510 | ; state->head[hash] = k; | |
511 | mov [stream + _internal_state_head + 2 * hash], tmp3 %+ w | |
512 | ||
513 | add tmp3,1 | |
514 | ||
515 | jmp update_hash_for_symbol | |
516 | ;; encode as dist/len | |
517 | ||
518 | MARK __len_dist_huffman_ %+ ARCH | |
;; Emit a match found at the first position of the pair.
;; len = index of the lowest differing bit / 8 = number of matching bytes.
519 | len_dist_huffman_pre: | |
520 | bsf len, len | |
521 | shr len, 3 | |
522 | len_dist_huffman: | |
;; Undo the `inc f_i` done in loop2 so f_i is the first position again.
523 | dec f_i | |
524 | ||
525 | ; get_dist_code(dist, &code2, &code_len2); | |
526 | %ifndef LONGER_HUFFTABLE | |
527 | mov tmp3, dist ; since code2 and dist are rbx | |
528 | get_dist_code tmp3, code2, code_len2, hufftables ;; clobbers dist, rcx | |
529 | %else | |
530 | unpack_dist_code code2, code_len2 | |
531 | %endif | |
532 | ; get_len_code(len, &code, &code_len); | |
533 | get_len_code len, code, rcx, hufftables ;; rcx is code_len | |
534 | ||
535 | ; code2 <<= code_len | |
536 | ; code2 |= code | |
537 | ; code_len2 += code_len | |
538 | %ifdef USE_HSWNI | |
539 | shlx code2, code2, rcx | |
540 | %else | |
541 | shl code2, cl | |
542 | %endif | |
543 | or code2, code | |
544 | add code_len2, rcx | |
545 | ||
546 | ;; Setup for updating hash | |
;; tmp3 = k = old f_i + 2; f_i advances past the match; tmp7 = 4 bytes at k.
547 | lea tmp3, [f_i + 2] ; tmp3 <= k | |
548 | add f_i, len | |
549 | mov tmp7 %+ d, [file_start + tmp3] | |
550 | ||
551 | MARK __update_hash_for_symbol_ %+ ARCH | |
;; Common tail of both match paths. On entry f_i is the first position after
;; the match, tmp3 = k is the next position whose hash entry is to be written,
;; and code2/code_len2 hold the bits to emit.
552 | update_hash_for_symbol: | |
;; Reload the data and hash for the next loop2 iteration.
553 | mov curr_data %+ d, [file_start + f_i] | |
554 | mov curr_data2, curr_data | |
555 | compute_hash hash, curr_data | |
556 | %ifdef LIMIT_HASH_UPDATE | |
557 | ; only update hash twice, first hash was already calculated. | |
558 | ||
559 | ; hash = compute_hash(state->file_start + k) & HASH_MASK; | |
;; tmp7 was prepared by the path that jumped or fell through to here.
560 | compute_hash hash2, tmp7 | |
561 | and hash2 %+ d, HASH_MASK | |
562 | ; state->head[hash] = k; | |
563 | mov [stream + _internal_state_head + 2 * hash2], tmp3 %+ w | |
564 | ||
565 | %else | |
;; Write head[] for every position from tmp3 up to, but not including, f_i.
;; The body always runs at least once.
566 | loop3: | |
567 | ; hash = compute_hash(state->file_start + k) & HASH_MASK; | |
568 | mov tmp7 %+ d, [file_start + tmp3] | |
569 | compute_hash hash2, tmp7 | |
570 | and hash2 %+ d, HASH_MASK | |
571 | ; state->head[hash] = k; | |
572 | mov [stream + _internal_state_head + 2 * hash2], tmp3 %+ w | |
573 | add tmp3,1 | |
574 | cmp tmp3, f_i | |
575 | jl loop3 | |
576 | %endif | |
577 | ||
578 | ||
579 | MARK __write_len_dist_bits_ %+ ARCH | |
;; f_end_i shares r12 with dist2 / tmp7 / code4, so reload it from its spill
;; slot before the loop test.
580 | mov f_end_i, [rsp + f_end_i_mem_offset] | |
581 | write_bits m_bits, m_bit_count, code2, code_len2, m_out_buf, tmp3 | |
582 | ||
583 | ; continue | |
584 | cmp f_i, f_end_i | |
585 | jl loop2 | |
586 | jmp end_loop_2 | |
587 | ||
588 | ||
589 | MARK __write_lit_bits_ %+ ARCH | |
;; encode_2_literals exists only in the USE_HSWNI build; the other build
;; assembles the two-literal code inline before jumping to write_lit_bits.
590 | %ifdef USE_HSWNI | |
591 | encode_2_literals: | |
592 | movzx tmp1, curr_data %+ b | |
593 | get_lit_code tmp1, code3, rcx, hufftables | |
594 | ||
595 | shr curr_data, 8 | |
596 | and curr_data, 0xff | |
597 | get_lit_code curr_data, code2, code_len2, hufftables | |
598 | ||
599 | ;; Calculate code associated with both literals | |
600 | shlx code2, code2, rcx | |
601 | or code2, code3 | |
602 | add code_len2, rcx | |
603 | %endif | |
;; Emit the literal bits held in code2/code_len2. f_i advances by one here.
;; On the two-literal paths loop2 had already incremented it once, so the
;; total advance is two; from write_first_byte the advance is one.
604 | write_lit_bits: | |
;; f_end_i shares r12 with other temporaries, so reload it from the stack.
605 | mov f_end_i, [rsp + f_end_i_mem_offset] | |
606 | add f_i, 1 | |
;; Reload the data and hash for the next loop2 iteration.
607 | mov curr_data %+ d, [file_start + f_i] | |
608 | mov curr_data2, curr_data | |
609 | ||
610 | compute_hash hash, curr_data | |
611 | ||
612 | write_bits m_bits, m_bit_count, code2, code_len2, m_out_buf, tmp3 | |
613 | ||
614 | ; continue | |
;; When f_i >= f_end_i, execution falls through into end_loop_2.
615 | cmp f_i, f_end_i | |
616 | jl loop2 | |
617 | ||
618 | ||
619 | MARK __end_loops_ %+ ARCH | |
;; End of the inner loop: convert f_i from a file_start-relative offset back
;; to a buffer-relative index and store it as b_bytes_processed.
620 | end_loop_2: | |
621 | ||
622 | ; state->b_bytes_processed = f_i - (state->buffer - state->file_start); | |
623 | add f_i, [stream + _internal_state_file_start] | |
624 | sub f_i, stream | |
625 | sub f_i, _internal_state_buffer | |
626 | mov [stream + _internal_state_b_bytes_processed], f_i %+ d | |
627 | ||
628 | ; continue | |
;; Reload blen and in_buf from their spill slots (rdi and r8 were reused in the
;; inner loop) and run another outer pass while input remains. When blen is 0,
;; execution falls through into `end`.
629 | continue_while: | |
630 | mov blen, [rsp + blen_mem_offset] | |
631 | mov in_buf, [rsp + in_buf_mem_offset] | |
632 | cmp blen, 0 | |
633 | jnz loop1 | |
634 | ||
;; Common exit: write the stream bookkeeping back, restore the saved registers
;; and return. Reached by fall-through from continue_while and by jump from
;; bitbuf_full; blen must hold the remaining input length on entry.
635 | end: | |
636 | ;; update input buffer | |
637 | ; stream->total_in += (uint32_t)(in_buf - stream->next_in); // bytes copied | |
638 | mov tmp1 %+ d, [stream + _total_in] | |
639 | mov in_buf, [rsp + in_buf_mem_offset] | |
640 | add tmp1, in_buf | |
641 | sub tmp1, [stream + _next_in] | |
642 | mov [stream + _total_in], tmp1 %+ d | |
643 | ||
644 | mov [stream + _next_in], in_buf | |
645 | mov [stream + _avail_in], blen %+ d | |
646 | ||
;; The next state is only changed once all input has been consumed.
647 | cmp blen, 0 | |
648 | jne skip2 | |
649 | ||
650 | ;; Set stream's next state | |
;; ZSTATE_FLUSH_READ_BUFFER if end_of_stream != 0 or flush != _NO_FLUSH,
;; otherwise ZSTATE_BODY (same selection as the early exit at function entry).
651 | mov tmp1, ZSTATE_FLUSH_READ_BUFFER | |
652 | mov tmp5, ZSTATE_BODY | |
653 | cmp dword [stream + _end_of_stream], 0 | |
654 | cmovne tmp5, tmp1 | |
655 | cmp dword [stream + _flush], _NO_FLUSH | |
656 | cmovne tmp5, tmp1 | |
657 | mov dword [stream + _internal_state_state], tmp5 %+ d | |
658 | skip2: | |
659 | mov [stream + _next_out], m_out_buf | |
660 | ; offset = state->bitbuf.buffer_used(); | |
;; m_out_buf becomes the number of output bytes produced by this call.
661 | sub m_out_buf, [stream + _internal_state_bitbuf_m_out_start] | |
662 | sub [stream + _avail_out], m_out_buf %+ d | |
663 | add [stream + _total_out], m_out_buf %+ d | |
664 | ||
665 | mov [stream + _internal_state_bitbuf_m_bits], m_bits | |
666 | mov [stream + _internal_state_bitbuf_m_bit_count], m_bit_count %+ d | |
667 | ||
668 | ||
669 | MOVDQA [stream + _internal_state_crc + 0*16], crc_0 | |
670 | MOVDQA [stream + _internal_state_crc + 1*16], crc_1 | |
671 | MOVDQA [stream + _internal_state_crc + 2*16], crc_2 | |
672 | MOVDQA [stream + _internal_state_crc + 3*16], crc_3 | |
673 | ||
;; Restore the registers saved in the prologue. In the ALIGN_STACK build the
;; rbp reloaded here is the frame pointer, which the `mov rsp, rbp` / `pop rbp`
;; pair below then uses to recover the caller's rsp and rbp.
674 | mov rbx, [rsp + gpr_save_mem_offset + 0*8] | |
675 | mov rsi, [rsp + gpr_save_mem_offset + 1*8] | |
676 | mov rdi, [rsp + gpr_save_mem_offset + 2*8] | |
677 | mov rbp, [rsp + gpr_save_mem_offset + 3*8] | |
678 | mov r12, [rsp + gpr_save_mem_offset + 4*8] | |
679 | mov r13, [rsp + gpr_save_mem_offset + 5*8] | |
680 | mov r14, [rsp + gpr_save_mem_offset + 6*8] | |
681 | mov r15, [rsp + gpr_save_mem_offset + 7*8] | |
682 | MOVDQA xmm6, [rsp + xmm_save_mem_offset + 0*16] | |
683 | MOVDQA xmm7, [rsp + xmm_save_mem_offset + 1*16] | |
684 | MOVDQA xmm8, [rsp + xmm_save_mem_offset + 2*16] | |
685 | MOVDQA xmm9, [rsp + xmm_save_mem_offset + 3*16] | |
686 | ||
687 | %ifndef ALIGN_STACK | |
688 | add rsp, stack_size | |
689 | %else | |
690 | mov rsp, rbp | |
691 | pop rbp | |
692 | %endif | |
693 | ret | |
694 | ||
695 | MARK __bitbuf_full_ %+ ARCH | |
;; Reached when m_out_buf is above m_out_end. Reload the remaining input length
;; (rdi was in use as file_start), record how far processing got, and leave
;; through the common exit.
696 | bitbuf_full: | |
697 | mov blen, [rsp + blen_mem_offset] | |
698 | ; state->b_bytes_processed = f_i - (state->buffer - state->file_start); | |
699 | add f_i, [stream + _internal_state_file_start] | |
700 | sub f_i, stream | |
701 | sub f_i, _internal_state_buffer | |
702 | mov [stream + _internal_state_b_bytes_processed], f_i %+ d | |
703 | jmp end | |
704 | ||
705 | MARK __compare_loops_ %+ ARCH | |
;; Reached when the first 8 bytes at tmp1 and tmp2 are equal for the first
;; position. COMPARE_TYPE picks one of three compare250 macro variants to
;; produce len (presumably the full match length; the macros are defined in
;; another file). Control then continues at len_dist_huffman.
;; NOTE(review): the "% error" line looks like a deliberately malformed line
;; that forces assembly to fail even where %error is not fatal -- confirm.
706 | compare_loop: | |
707 | %if (COMPARE_TYPE == 1) | |
708 | compare250 tmp1, tmp2, len, tmp3 | |
709 | %elif (COMPARE_TYPE == 2) | |
710 | compare250_x tmp1, tmp2, len, tmp3, xtmp0, xtmp1 | |
711 | %elif (COMPARE_TYPE == 3) | |
712 | compare250_y tmp1, tmp2, len, tmp3, ytmp0, ytmp1 | |
713 | %else | |
714 | %error Unknown Compare type COMPARE_TYPE | |
715 | % error | |
716 | %endif | |
717 | jmp len_dist_huffman | |
718 | ||
;; Same as compare_loop but for the second position (result in len2). The
;; first byte of the pair is then encoded as a literal from the low byte of
;; curr_data before joining len_dist_lit_huffman.
719 | compare_loop2: | |
720 | %if (COMPARE_TYPE == 1) | |
721 | compare250 tmp1, tmp2, len2, tmp3 | |
722 | %elif (COMPARE_TYPE == 2) | |
723 | compare250_x tmp1, tmp2, len2, tmp3, xtmp0, xtmp1 | |
724 | %elif (COMPARE_TYPE == 3) | |
725 | compare250_y tmp1, tmp2, len2, tmp3, ytmp0, ytmp1 | |
726 | %else | |
727 | %error Unknown Compare type COMPARE_TYPE | |
728 | % error | |
729 | %endif | |
730 | and curr_data, 0xff | |
731 | get_lit_code curr_data, code3, code_len3, hufftables | |
732 | jmp len_dist_lit_huffman | |
733 | ||
734 | MARK __write_first_byte_ %+ ARCH | |
;; Taken instead of loop2 when empty_buffer_flag is nonzero. The byte at f_i is
;; emitted as a literal with no match search: its hash entry is recorded, the
;; flag is cleared so later positions go through loop2, and write_lit_bits
;; emits the code and advances f_i by one.
735 | write_first_byte: | |
736 | cmp m_out_buf, [stream + _internal_state_bitbuf_m_out_end] | |
737 | ja bitbuf_full | |
738 | ||
739 | mov dword [rsp + empty_buffer_flag], 0 | |
740 | compute_hash hash, curr_data | |
741 | and hash %+ d, HASH_MASK | |
742 | mov [stream + _internal_state_head + 2 * hash], f_i %+ w | |
743 | and curr_data, 0xff | |
744 | get_lit_code curr_data, code2, code_len2, hufftables | |
745 | jmp write_lit_bits | |
746 | ||
747 | section .data | |
;; const_D holds the value D as a 64-bit constant. It exists because cmov has
;; no immediate form: `cmova x, [const_D]` reads it as a qword memory operand.
;; It is aligned to its own 8-byte size (the previous alignment of 4 allowed
;; the qword to land on a non-natural boundary).
748 | align 8 | |
749 | const_D: dq D | |
750 | ||
751 | %endif ;; ifndef TEST |