]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
2 | ; Copyright(c) 2011-2016 Intel Corporation All rights reserved. | |
3 | ; | |
4 | ; Redistribution and use in source and binary forms, with or without | |
5 | ; modification, are permitted provided that the following conditions | |
6 | ; are met: | |
7 | ; * Redistributions of source code must retain the above copyright | |
8 | ; notice, this list of conditions and the following disclaimer. | |
9 | ; * Redistributions in binary form must reproduce the above copyright | |
10 | ; notice, this list of conditions and the following disclaimer in | |
11 | ; the documentation and/or other materials provided with the | |
12 | ; distribution. | |
13 | ; * Neither the name of Intel Corporation nor the names of its | |
14 | ; contributors may be used to endorse or promote products derived | |
15 | ; from this software without specific prior written permission. | |
16 | ; | |
17 | ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
18 | ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
19 | ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
20 | ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
21 | ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
22 | ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
23 | ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
24 | ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
25 | ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
26 | ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
27 | ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
28 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
29 | %include "options.asm" | |
30 | ||
31 | %include "lz0a_const.asm" | |
32 | %include "data_struct2.asm" | |
33 | %include "bitbuf2.asm" | |
34 | %include "huffman.asm" | |
35 | %include "igzip_compare_types.asm" | |
36 | %include "reg_sizes.asm" | |
37 | ||
38 | %include "stdmac.asm" | |
39 | ||
;; Tail-handling constants (their use is visible further down in this routine):
;;  - f_end_i is initialised to avail_in - LA_STATELESS, so the fast loop (loop2)
;;    only runs while f_i is below that bound.
;;  - the slower tail loop (loop2_finish) stops LAST_BYTES_COUNT bytes before the
;;    end of the input; the remaining bytes are emitted one at a time as literals.
40 | %define LAST_BYTES_COUNT 3 ; Bytes to prevent reading out of array bounds | |
41 | %define LA_STATELESS 264 ; Max number of bytes read in loop2 rounded up to 8 byte boundary | |
42 | ||
;; MARK <name>
;;   DEBUG builds:     declares <name> global and defines it as a label at the
;;                     point of use.
;;   non-DEBUG builds: expands to nothing, so no extra symbols are emitted.
43 | %ifdef DEBUG | |
44 | %macro MARK 1 | |
45 | global %1 | |
46 | %1: | |
47 | %endm | |
48 | %else | |
49 | %macro MARK 1 | |
50 | %endm | |
51 | %endif | |
52 | ||
53 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
54 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
55 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
56 | ||
;; Symbolic register names used by the routine below.
;; Several names map to the SAME physical register, so two aliases of one
;; register can never hold live values at the same time. Points worth knowing:
;;  - rcx (tmp2/hash2) is also used directly: its low byte cl is the count for
;;    the variable "shl ..., cl" shifts, and rcx receives code lengths.
;;  - rbx is both dist and code2; rdx is hash, len and code_len3.
;;  - r12 is f_end_i as well as dist2/tmp7/code4, which is why the loop bound is
;;    also kept on the stack (f_end_i_mem_offset) and reloaded before it is
;;    compared against f_i.
57 | %define tmp2 rcx | |
58 | %define hash2 rcx | |
59 | ||
60 | %define curr_data rax | |
61 | %define code rax | |
62 | %define tmp5 rax | |
63 | ||
64 | %define tmp4 rbx | |
65 | %define dist rbx | |
66 | %define code2 rbx | |
67 | ||
68 | %define hash rdx | |
69 | %define len rdx | |
70 | %define code_len3 rdx | |
71 | ||
72 | %define tmp1 rsi | |
73 | %define code_len2 rsi | |
74 | ||
75 | %define file_start rdi | |
76 | ||
77 | %define m_bit_count rbp | |
78 | ||
79 | %define curr_data2 r8 | |
80 | %define len2 r8 | |
81 | %define tmp6 r8 | |
82 | ||
83 | %define m_bits r9 | |
84 | ||
85 | %define f_i r10 | |
86 | ||
87 | %define m_out_buf r11 | |
88 | ||
89 | %define f_end_i r12 | |
90 | %define dist2 r12 | |
91 | %define tmp7 r12 | |
92 | %define code4 r12 | |
93 | ||
94 | %define tmp3 r13 | |
95 | %define code3 r13 | |
96 | ||
97 | %define stream r14 | |
98 | ||
99 | %define hufftables r15 | |
100 | ||
101 | ;; NOTE(review): this comment used to read "GPR r8 & r15 can be used", but both are aliased above (r8: curr_data2/len2/tmp6, r15: hufftables) -- looks stale, confirm | |
102 | ||
;; Vector temporaries handed to the compare250_x / compare250_y macros.
103 | %define xtmp0 xmm0 ; tmp | |
104 | %define xtmp1 xmm1 ; tmp | |
105 | ||
106 | %define ytmp0 ymm0 ; tmp | |
107 | %define ytmp1 ymm1 ; tmp | |
108 | ||
109 | ||
110 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
111 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
112 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
113 | ||
114 | ||
;; Stack frame layout, expressed as offsets from rsp after the prologue:
;;   [0,   8)   blen local
;;   [8,  16)   spilled copy of f_end_i (r12 is reused for other values)
;;   [16, 80)   saved rbx, rsi, rdi, rbp, r12, r13, r14, r15
;;   [80, 144)  xmm save area
;;   + 8 bytes of padding (see the alignment note below) = 152 bytes total.
;; NOTE(review): no instruction in the visible code references blen_mem_offset
;; or xmm_save_mem_offset -- confirm whether these slots are still needed.
115 | blen_mem_offset equ 0 ; local variable (8 bytes) | |
116 | f_end_i_mem_offset equ 8 | |
117 | gpr_save_mem_offset equ 16 ; gpr save area (8*8 bytes) | |
118 | xmm_save_mem_offset equ 16 + 8*8 ; xmm save area (4*16 bytes) (16 byte aligned) | |
119 | stack_size equ 2*8 + 8*8 + 4*16 + 8 | |
120 | ;;; 8 because stack address is odd multiple of 8 after a function call and | |
121 | ;;; we want it aligned to 16 bytes | |
122 | ||
;; Entry point, prologue and one-time setup.
;; Compresses stream->next_in[0 .. avail_in) into stream->next_out in a single
;; pass: a fast loop (loop2), a bounded tail loop (loop2_finish), the last few
;; bytes as literals (final_bytes) and finally an end-of-block symbol.
;; Saves and later restores rbx, rsi, rdi, rbp and r12-r15.
123 | ; void isal_deflate_body_stateless ( isal_zstream *stream ) | |
124 | ; arg 1: rcx: addr of stream (when assembling for elf64 it arrives in rdi and is copied to rcx below) | |
125 | global isal_deflate_body_stateless_ %+ ARCH | |
126 | isal_deflate_body_stateless_ %+ ARCH %+ : | |
127 | %ifidn __OUTPUT_FORMAT__, elf64 | |
128 | mov rcx, rdi | |
129 | %endif | |
130 | ||
131 | ;; do nothing if (avail_in == 0) | |
;; This early return happens before any register is saved or any state is written.
132 | cmp dword [rcx + _avail_in], 0 | |
133 | jne skip1 | |
134 | ret | |
135 | skip1: | |
136 | ||
;; With ALIGN_STACK, rbp keeps the caller's rsp so the epilogue can undo the
;; forced 16-byte alignment; otherwise a fixed-size frame is allocated.
137 | %ifdef ALIGN_STACK | |
138 | push rbp | |
139 | mov rbp, rsp | |
140 | sub rsp, stack_size | |
141 | and rsp, ~15 | |
142 | %else | |
143 | sub rsp, stack_size | |
144 | %endif | |
145 | ||
;; Save every preserved GPR this routine overwrites. In the ALIGN_STACK build
;; the rbp stored here is the frame pointer set just above, not the caller's rbp.
146 | mov [rsp + gpr_save_mem_offset + 0*8], rbx | |
147 | mov [rsp + gpr_save_mem_offset + 1*8], rsi | |
148 | mov [rsp + gpr_save_mem_offset + 2*8], rdi | |
149 | mov [rsp + gpr_save_mem_offset + 3*8], rbp | |
150 | mov [rsp + gpr_save_mem_offset + 4*8], r12 | |
151 | mov [rsp + gpr_save_mem_offset + 5*8], r13 | |
152 | mov [rsp + gpr_save_mem_offset + 6*8], r14 | |
153 | mov [rsp + gpr_save_mem_offset + 7*8], r15 | |
154 | ||
;; has_eob is cleared here and set to 1 only once write_eob has run.
155 | mov stream, rcx | |
156 | mov dword [stream + _internal_state_has_eob], 0 | |
157 | ||
158 | ; state->bitbuf.set_buf(stream->next_out, stream->avail_out); | |
;; m_out_end = next_out + avail_out - SLOP; the "cmp m_out_buf, m_out_end / ja end"
;; checks later treat the output as full once m_out_buf passes this address.
159 | mov m_out_buf, [stream + _next_out] | |
160 | mov [stream + _internal_state_bitbuf_m_out_start], m_out_buf | |
161 | mov tmp1 %+ d, [stream + _avail_out] | |
162 | add tmp1, m_out_buf | |
163 | sub tmp1, SLOP | |
164 | ||
;; NOTE(review): nothing in the visible code jumps to skip_SLOP.
165 | skip_SLOP: | |
166 | mov [stream + _internal_state_bitbuf_m_out_end], tmp1 | |
167 | ||
;; Pull the bit-buffer state into registers; it is written back at "end".
168 | mov m_bits, [stream + _internal_state_bitbuf_m_bits] | |
169 | mov m_bit_count %+ d, [stream + _internal_state_bitbuf_m_bit_count] | |
170 | mov hufftables, [stream + _hufftables] | |
171 | ; state->b_bytes_valid = stream->avail_in; | |
172 | mov f_end_i %+ d, [stream + _avail_in] | |
173 | mov [stream + _internal_state_b_bytes_valid], f_end_i %+ d | |
174 | ||
;; f_i is the current input offset relative to file_start (= next_in).
175 | mov f_i, 0 | |
176 | mov file_start, [stream + _next_in] | |
177 | mov [stream + _internal_state_file_start], file_start | |
178 | ||
179 | ; f_end_i -= LA; | |
;; The bound is spilled because r12 (f_end_i) is reused inside the loop.
180 | sub f_end_i, LA_STATELESS | |
181 | mov [rsp + f_end_i_mem_offset], f_end_i | |
182 | ; if (f_end_i <= 0) continue; | |
;; Signed compare: inputs of at most LA_STATELESS bytes skip the fast loop.
183 | cmp f_end_i, 0 | |
184 | jle end_loop_2 | |
185 | ||
;; First input byte: there is nothing earlier to match against, so record its
;; position in the hash table and emit it as a literal via write_lit_bits.
186 | ; for (f_i = f_start_i; f_i < f_end_i; f_i++) { | |
187 | MARK __stateless_compute_hash_ %+ ARCH | |
188 | mov curr_data %+ d, [file_start + f_i] | |
189 | ||
;; Bail out (with f_i still 0) if the output buffer is already past m_out_end.
190 | cmp m_out_buf, [stream + _internal_state_bitbuf_m_out_end] | |
191 | ja end | |
192 | ||
193 | ;; Encode first byte in the stream as a literal | |
;; compute_hash and get_lit_code are macros defined outside this file; head[] is
;; indexed by the masked hash and holds 16-bit positions (hence "2 * hash").
194 | compute_hash hash, curr_data | |
195 | and hash %+ d, HASH_MASK | |
196 | mov [stream + _internal_state_head + 2 * hash], f_i %+ w | |
197 | and curr_data, 0xff | |
198 | get_lit_code curr_data, code2, code_len2, hufftables | |
199 | jmp write_lit_bits | |
200 | ||
;; loop2: main loop. Each iteration considers TWO match candidates, one starting
;; at position f_i and one at f_i + 1, then emits one of:
;;   - a length/distance pair for the first position   (len_dist_huffman),
;;   - a literal plus a length/distance pair for the second (len_dist_lit_huffman),
;;   - two literals                                    (write_lit_bits).
;; On entry: curr_data = dword at file_start + f_i, curr_data2 = copy of it,
;; hash = unmasked compute_hash result for it (all set by the code jumping here).
201 | align 16 | |
202 | ||
203 | loop2: | |
;; Hash for position f_i + 1: drop the first byte and crc32 what is left.
204 | shr curr_data2, 8 | |
205 | xor hash2 %+ d, hash2 %+ d | |
206 | crc32 hash2 %+ d, curr_data2 %+ d | |
207 | ||
208 | ; hash = compute_hash(state->file_start + f_i) & HASH_MASK; | |
209 | and hash %+ d, HASH_MASK | |
210 | and hash2 %+ d, HASH_MASK | |
211 | ||
212 | ; if (state->bitbuf.is_full()) { | |
213 | cmp m_out_buf, [stream + _internal_state_bitbuf_m_out_end] | |
214 | ja end | |
215 | ||
;; Clear the full registers first: only their low 16 bits are written below.
;; tmp3 stays 0 and is the cmov source used to reset an invalid distance.
216 | xor dist, dist | |
217 | xor dist2, dist2 | |
218 | xor tmp3, tmp3 | |
219 | ||
;; tmp1 = address of the current byte, tmp6 = address of the byte before it.
220 | lea tmp1, [file_start + f_i] | |
221 | lea tmp6, [tmp1 - 1] | |
222 | ||
;; dist = 16-bit difference between f_i and the last position stored for hash.
223 | mov dist %+ w, f_i %+ w | |
224 | sub dist %+ w, word [stream + _internal_state_head + 2 * hash] | |
225 | ||
226 | ; state->head[hash] = (uint16_t) f_i; | |
227 | mov [stream + _internal_state_head + 2 * hash], f_i %+ w | |
228 | ||
;; From here on f_i points at the SECOND byte of the pair being examined.
229 | inc f_i | |
230 | ||
;; dist2 is left holding (distance - 1) so it can be range checked below.
231 | mov dist2 %+ w, f_i %+ w | |
232 | sub dist2 %+ w, word [stream + _internal_state_head + 2 * hash2] | |
233 | dec dist2 | |
234 | ||
235 | ; state->head[hash2] = (uint16_t) f_i; | |
236 | mov [stream + _internal_state_head + 2 * hash2], f_i %+ w | |
237 | ||
;; tmp2 = address of the first candidate (current address - dist).
238 | mov tmp2, tmp1 | |
239 | sub tmp2, dist | |
240 | dec dist | |
241 | ||
242 | ; if ((dist-1) < (D-1)) { | |
;; Branch-free range check: unsigned (dist-1) >= (D-1) also catches dist == 0,
;; which wraps on the dec. An out-of-range distance is replaced by distance 1
;; (candidate = previous byte) so the loads below use a nearby address.
243 | cmp dist %+ d, (D-1) | |
244 | cmovae tmp2, tmp6 | |
245 | cmovae dist, tmp3 | |
246 | inc dist | |
247 | ||
248 | cmp dist2 %+ d, (D-1) | |
249 | cmovae dist2, tmp3 | |
250 | inc dist2 | |
251 | ||
252 | MARK __stateless_compare_ %+ ARCH | |
253 | ; len = compare258(state->file_start + f_i, | |
254 | ; state->file_start + f_i - dist); | |
255 | ||
256 | ;; Speculatively load distance code (except for when large windows are used) | |
257 | get_packed_dist_code dist, code2, hufftables | |
258 | ||
259 | ;; Check for long len/dist match (>7) with first literal | |
;; len = XOR of 8 bytes at the current position with 8 bytes at the candidate.
;; Zero means all 8 bytes match; otherwise the lowest set bit marks the first
;; differing bit.
260 | mov len, [tmp1] | |
261 | xor len, [tmp2] | |
262 | jz compare_loop | |
263 | ||
;; tmp3 = every bit up to and including the first differing bit of candidate 1,
;; widened to cover at least the low 3 bytes. tmp3 == 0xFFFFFF therefore means
;; candidate 1 differs somewhere within its first 3 bytes.
264 | %ifdef USE_HSWNI | |
265 | blsmsk tmp3, len | |
266 | or tmp3, 0xFFFFFF | |
267 | %endif | |
268 | ||
;; Second candidate: same test one byte further on (f_i was incremented above).
269 | lea tmp1, [file_start + f_i] | |
270 | mov tmp2, tmp1 | |
271 | sub tmp2, dist2 | |
272 | ||
273 | ;; Speculatively load distance code (except for when large windows are used) | |
274 | get_packed_dist_code dist2, code4, hufftables | |
275 | ||
276 | ;; Check for len/dist match (>7) with second literal | |
277 | mov len2, [tmp1] | |
278 | xor len2, [tmp2] | |
279 | jz compare_loop2 | |
280 | ||
281 | %ifdef USE_HSWNI | |
282 | ;; Check for len/dist match for first literal | |
;; No differing bit of candidate 2 inside tmp3: candidate 2 matches at least 3
;; bytes and at least as far as candidate 1 did, so emit literal + candidate 2.
283 | test tmp3, len2 | |
284 | jz len_dist_lit_huffman_pre | |
285 | ||
;; Otherwise: candidate 1 shorter than 3 bytes -> two literals, else use it.
286 | cmp tmp3, 0xFFFFFF | |
287 | je encode_2_literals | |
288 | jmp len_dist_huffman_pre | |
289 | ||
290 | ||
291 | MARK __stateless_len_dist_lit_huffman_ %+ ARCH | |
292 | len_dist_lit_huffman_pre: | |
;; Look up the code for the first byte; it is emitted ahead of the match.
293 | movzx tmp1, curr_data %+ b | |
294 | get_lit_code tmp1, code3, code_len3, hufftables | |
295 | %else | |
296 | ;; Speculatively load the code for the first literal | |
297 | movzx tmp1, curr_data %+ b | |
298 | get_lit_code tmp1, code3, rcx, hufftables | |
299 | ||
300 | ;; Check for len/dist match for first literal | |
;; Low 24 bits of the XOR all zero <=> the first 3 bytes of candidate 1 match.
301 | test len, 0xFFFFFF | |
302 | jz len_dist_huffman_pre | |
303 | ||
304 | ;; Speculatively load the code for the second literal | |
305 | shr curr_data, 8 | |
306 | and curr_data, 0xff | |
307 | get_lit_code curr_data, code2, code_len2, hufftables | |
308 | ||
;; Pack both literals into one write: the first literal's code in the low bits
;; (rcx holds its length), the second literal's code above it.
309 | shl code2, cl | |
310 | or code2, code3 | |
311 | add code_len2, rcx | |
312 | ||
313 | ;; Check for len/dist match for second literal | |
314 | test len2, 0xFFFFFF | |
315 | jnz write_lit_bits | |
316 | ||
317 | MARK __stateless_len_dist_lit_huffman_ %+ ARCH | |
318 | len_dist_lit_huffman_pre: | |
;; Keep the first literal's code length; rcx is overwritten further down.
319 | mov code_len3, rcx | |
320 | %endif | |
;; Convert the XOR of candidate 2 into a byte count: index of first differing
;; bit, divided by 8.
321 | bsf len2, len2 | |
322 | shr len2, 3 | |
323 | ||
324 | ||
;; len_dist_lit_huffman: emit "literal, then length/distance" for the match that
;; starts at the second position. On entry: code3/code_len3 = literal code for
;; the first byte, len2 = match length, dist2 = match distance, f_i = position
;; of the second byte.
325 | len_dist_lit_huffman: | |
;; Without LONGER_HUFFTABLE the distance code is looked up from a copy of
;; dist2; with it, the packed code loaded earlier into code4 is unpacked.
326 | %ifndef LONGER_HUFFTABLE | |
327 | mov tmp4, dist2 | |
328 | get_dist_code tmp4, code4, code_len2, hufftables ;; clobbers dist, rcx | |
329 | %else | |
330 | unpack_dist_code code4, code_len2 | |
331 | %endif | |
332 | get_len_code len2, code, rcx, hufftables ;; rcx is code_len | |
333 | ||
;; code4 = (dist_code << len_code_bits) | len_code
334 | %ifdef USE_HSWNI | |
335 | shlx code4, code4, rcx | |
336 | %else | |
337 | shl code4, cl | |
338 | %endif | |
339 | or code4, code | |
340 | add code_len2, rcx | |
341 | ||
342 | mov rcx, code_len3 | |
343 | ||
;; Shift again to make room for the literal, which takes the lowest bits.
344 | %ifdef USE_HSWNI | |
345 | shlx code4, code4, rcx | |
346 | %else | |
347 | shl code4, cl | |
348 | %endif | |
349 | or code4, code3 | |
350 | add code_len2, rcx | |
351 | ||
;; write_bits is invoked with code2, so move the combined code there. This also
;; frees r12 (code4) for reuse as tmp7 below.
352 | mov code2, code4 | |
353 | ;; Setup for updating hash | |
354 | lea tmp3, [f_i + 1] ; tmp3 <= k | |
355 | add f_i, len2 | |
356 | ||
357 | ; hash = compute_hash(state->file_start + k) & HASH_MASK; | |
;; tmp7 = the loaded dword shifted right by one byte; update_hash_for_symbol
;; hashes it for the following position in the LIMIT_HASH_UPDATE build.
358 | mov tmp5 %+ d, [file_start + tmp3] | |
359 | mov tmp7, tmp5 | |
360 | shr tmp7, 8 | |
361 | ||
362 | compute_hash hash, tmp5 | |
363 | and hash %+ d, HASH_MASK | |
364 | ||
365 | ; state->head[hash] = k; | |
366 | mov [stream + _internal_state_head + 2 * hash], tmp3 %+ w | |
367 | ||
368 | add tmp3,1 | |
369 | ||
370 | jmp update_hash_for_symbol | |
;; len_dist_huffman: emit a length/distance pair for the match that starts at
;; the first position. The "_pre" entry converts the XOR value to a byte count;
;; compare_loop jumps straight to len_dist_huffman with len already computed.
371 | ;; encode as dist/len | |
372 | ||
373 | MARK __stateless_len_dist_huffman_ %+ ARCH | |
374 | len_dist_huffman_pre: | |
;; Index of the first differing bit divided by 8 = number of matching bytes.
375 | bsf len, len | |
376 | shr len, 3 | |
377 | ||
378 | len_dist_huffman: | |
;; Undo the "inc f_i" from loop2 so f_i is the start of the match again.
379 | dec f_i | |
380 | ||
381 | ; get_dist_code(dist, &code2, &code_len2); | |
382 | %ifndef LONGER_HUFFTABLE | |
383 | mov tmp3, dist ; since code2 and dist are rbx | |
384 | get_dist_code tmp3, code2, code_len2, hufftables ;; clobbers dist, rcx | |
385 | %else | |
386 | unpack_dist_code code2, code_len2 | |
387 | %endif | |
388 | ; get_len_code(len, &code, &code_len); | |
389 | get_len_code len, code, rcx, hufftables ;; rcx is code_len | |
390 | ||
391 | ; code2 <<= code_len | |
392 | ; code2 |= code | |
393 | ; code_len2 += code_len | |
394 | %ifdef USE_HSWNI | |
395 | shlx code2, code2, rcx | |
396 | %else | |
397 | shl code2, cl | |
398 | %endif | |
399 | or code2, code | |
400 | add code_len2, rcx | |
401 | ||
402 | ;; Setup for updating hash | |
;; k starts at f_i + 2: positions f_i and f_i + 1 were already stored in head[]
;; by loop2. Execution then falls through into update_hash_for_symbol.
403 | lea tmp3, [f_i + 2] ; tmp3 <= k | |
404 | add f_i, len | |
405 | mov tmp7 %+ d, [file_start + tmp3] | |
406 | ||
;; update_hash_for_symbol: shared tail of both match paths. Prepares
;; curr_data / curr_data2 / hash for the next loop2 iteration, records skipped
;; positions in the hash table, writes the pending code and loops.
;; On entry: code2/code_len2 = bits to write, f_i = first position after the
;; match, tmp3 = k (next position to record), tmp7 = data to hash for k
;; (only consumed in the LIMIT_HASH_UPDATE build).
407 | MARK __stateless_update_hash_for_symbol_ %+ ARCH | |
408 | update_hash_for_symbol: | |
409 | mov curr_data %+ d, [file_start + f_i] | |
410 | mov curr_data2, curr_data | |
411 | compute_hash hash, curr_data | |
412 | %ifdef LIMIT_HASH_UPDATE | |
413 | ; only update hash twice, first hash was already calculated. | |
414 | ||
415 | ; hash = compute_hash(state->file_start + k) & HASH_MASK; | |
416 | compute_hash hash2, tmp7 | |
417 | and hash2 %+ d, HASH_MASK | |
418 | ; state->head[hash] = k; | |
419 | mov [stream + _internal_state_head + 2 * hash2], tmp3 %+ w | |
420 | ||
421 | %else | |
;; Record every position k in [tmp3, f_i). The test is at the bottom, so the
;; body always runs at least once.
422 | loop3: | |
423 | ; hash = compute_hash(state->file_start + k) & HASH_MASK; | |
424 | mov tmp7 %+ d, [file_start + tmp3] | |
425 | compute_hash hash2, tmp7 | |
426 | and hash2 %+ d, HASH_MASK | |
427 | ; state->head[hash] = k; | |
428 | mov [stream + _internal_state_head + 2 * hash2], tmp3 %+ w | |
429 | add tmp3,1 | |
430 | cmp tmp3, f_i | |
431 | jl loop3 | |
432 | %endif | |
433 | ||
434 | ||
435 | MARK __stateless_write_len_dist_bits_ %+ ARCH | |
;; r12 was used as dist2/tmp7/code4, so reload the loop bound from the stack.
436 | mov f_end_i, [rsp + f_end_i_mem_offset] | |
437 | write_bits m_bits, m_bit_count, code2, code_len2, m_out_buf, tmp3 | |
438 | ||
439 | ; continue | |
440 | cmp f_i, f_end_i | |
441 | jl loop2 | |
442 | jmp end_loop_2 | |
443 | ||
444 | ||
;; write_lit_bits: write the literal code(s) held in code2/code_len2, advance
;; f_i by one and prepare curr_data / curr_data2 / hash for the next loop2
;; iteration. When reached from loop2, f_i was already incremented there, so two
;; input bytes are consumed in total; the first-byte path consumes one.
445 | MARK __stateless_write_lit_bits_ %+ ARCH | |
446 | %ifdef USE_HSWNI | |
;; USE_HSWNI only: the literal codes are not looked up speculatively in loop2,
;; so fetch both here and pack them (first literal in the low bits).
447 | encode_2_literals: | |
448 | movzx tmp1, curr_data %+ b | |
449 | get_lit_code tmp1, code3, rcx, hufftables | |
450 | ||
451 | shr curr_data, 8 | |
452 | and curr_data, 0xff | |
453 | get_lit_code curr_data, code2, code_len2, hufftables | |
454 | ||
455 | ;; Calculate code associated with both literals | |
456 | shlx code2, code2, rcx | |
457 | or code2, code3 | |
458 | add code_len2, rcx | |
459 | %endif | |
460 | write_lit_bits: | |
;; Reload the loop bound: r12 may have been overwritten as dist2/code4.
461 | mov f_end_i, [rsp + f_end_i_mem_offset] | |
462 | add f_i, 1 | |
463 | mov curr_data %+ d, [file_start + f_i] | |
464 | mov curr_data2, curr_data | |
465 | ||
466 | compute_hash hash, curr_data | |
467 | ||
468 | write_bits m_bits, m_bit_count, code2, code_len2, m_out_buf, tmp3 | |
469 | ||
470 | ; continue | |
;; Falls through into end_loop_2 once f_i reaches the fast-loop bound.
471 | cmp f_i, f_end_i | |
472 | jl loop2 | |
473 | ||
;; end_loop_2 / loop2_finish: tail loop for the input that the fast loop leaves
;; behind. It handles one position per iteration and passes the number of bytes
;; remaining to the compare macro. It stops LAST_BYTES_COUNT bytes before the
;; end of the input, so the 4-byte load at file_start + f_i stays inside it.
474 | MARK __stateless_end_loops_ %+ ARCH | |
475 | end_loop_2: | |
476 | ;; Handle the last bytes (at most LA_STATELESS bytes) | |
;; f_end_i was avail_in - LA_STATELESS; it becomes avail_in - LAST_BYTES_COUNT.
477 | add f_end_i, LA_STATELESS - LAST_BYTES_COUNT | |
478 | cmp f_i, f_end_i | |
479 | jge end_loop_2_finish | |
480 | ||
481 | loop2_finish: | |
482 | ;; Check for space in out buffer | |
483 | cmp m_out_buf, [stream + _internal_state_bitbuf_m_out_end] | |
484 | ja end | |
485 | ||
486 | mov curr_data %+ d, [file_start + f_i] | |
487 | compute_hash hash, curr_data | |
488 | and hash %+ d, HASH_MASK | |
489 | ||
490 | ;; Calculate possible distance for length/dist pair. | |
;; 16-bit difference between f_i and the previous position stored for this
;; hash; the table entry is then updated to f_i.
491 | xor dist, dist | |
492 | mov dist %+ w, f_i %+ w | |
493 | sub dist %+ w, word [stream + _internal_state_head + 2 * hash] | |
494 | mov [stream + _internal_state_head + 2 * hash], f_i %+ w | |
495 | ||
496 | ;; Check if look back distance is valid (the dec is to handle when dist = 0) | |
497 | dec dist | |
498 | cmp dist %+ d, (D-1) | |
499 | jae encode_literal_finish | |
500 | inc dist | |
501 | ||
502 | ;; Check if look back distance is a match | |
;; tmp6 = bytes left in the input from f_i (f_end_i + LAST_BYTES_COUNT - f_i).
;; compare is a macro defined elsewhere; presumably it returns in len the
;; number of matching bytes, limited to tmp6 -- TODO confirm.
503 | lea tmp6, [f_end_i + LAST_BYTES_COUNT] | |
504 | sub tmp6, f_i | |
505 | lea tmp1, [file_start + f_i] | |
506 | mov tmp2, tmp1 | |
507 | sub tmp2, dist | |
508 | compare tmp6, tmp1, tmp2, len, tmp3 | |
509 | ||
510 | ;; Limit len to maximum value of 258 | |
511 | mov tmp2, 258 | |
512 | cmp len, 258 | |
513 | cmova len, tmp2 | |
514 | cmp len, SHORTEST_MATCH | |
515 | jb encode_literal_finish | |
516 | ||
517 | ;; Encode len/dist pair | |
518 | %ifndef LONGER_HUFFTABLE | |
519 | mov tmp3, dist | |
520 | get_dist_code tmp3, code2, code_len2, hufftables ;; clobbers dist, rcx | |
521 | %else | |
522 | get_dist_code dist, code2, code_len2, hufftables ;; clobbers dist, rcx | |
523 | %endif | |
524 | get_len_code len, code, rcx, hufftables ;; rcx is code_len | |
525 | ||
526 | ;; Combine length and distance code for writing it out | |
527 | %ifdef USE_HSWNI | |
528 | shlx code2, code2, rcx | |
529 | %else | |
530 | shl code2, cl | |
531 | %endif | |
532 | or code2, code | |
533 | add code_len2, rcx | |
534 | write_bits m_bits, m_bit_count, code2, code_len2, m_out_buf, tmp3 | |
535 | ||
536 | ;; Setup for next loop | |
;; Unlike the fast loop, skipped positions are not entered into head[] here.
537 | add f_i, len | |
538 | cmp f_i, f_end_i | |
539 | jl loop2_finish | |
540 | jmp end_loop_2_finish | |
541 | ||
542 | encode_literal_finish: | |
543 | ;; Encode literal | |
;; curr_data still holds the dword loaded at f_i; keep only its first byte.
544 | and curr_data %+ d, 0xFF | |
545 | get_lit_code curr_data, code2, code_len2, hufftables | |
546 | write_bits m_bits, m_bit_count, code2, code_len2, m_out_buf, tmp3 | |
547 | ||
548 | ;; Setup for next loop | |
549 | add f_i, 1 | |
550 | cmp f_i, f_end_i | |
551 | jl loop2_finish | |
;; end_loop_2_finish: emit whatever input remains (at most LAST_BYTES_COUNT
;; bytes) as single literals, then the end-of-block symbol.
552 | end_loop_2_finish: | |
553 | cmp m_out_buf, [stream + _internal_state_bitbuf_m_out_end] | |
554 | ja end | |
555 | ||
556 | ;; Check if any bytes left (at most LAST_BYTES_COUNT bytes) | |
;; f_end_i becomes avail_in here, i.e. one past the last input byte.
557 | add f_end_i, LAST_BYTES_COUNT | |
558 | cmp f_i, f_end_i | |
559 | jz write_eob | |
560 | ||
561 | ;; Handle encoding last few bytes by encoding them as literals | |
562 | xor curr_data, curr_data | |
563 | final_bytes: | |
564 | movzx curr_data, byte [file_start + f_i] | |
565 | get_lit_code curr_data, code2, code_len2, hufftables | |
566 | write_bits m_bits, m_bit_count, code2, code_len2, m_out_buf, tmp3 | |
567 | ||
;; NOTE(review): this overflow check runs after the literal has been written
;; but before f_i is incremented, so on this exit the byte just written is not
;; counted as consumed -- confirm the caller treats this exit as a failure.
568 | cmp m_out_buf, [stream + _internal_state_bitbuf_m_out_end] | |
569 | ja end | |
570 | ||
571 | inc f_i | |
572 | cmp f_i, f_end_i | |
573 | jl final_bytes | |
574 | ||
575 | write_eob: | |
576 | ;; Write out end of block | |
;; Symbol 256 is the end-of-block code. has_eob tells the caller the block was
;; terminated; every "ja end" exit leaves it at 0.
577 | get_lit_code 256, code2, code_len2, hufftables | |
578 | write_bits m_bits, m_bit_count, code2, code_len2, m_out_buf, tmp3 | |
579 | mov dword [stream + _internal_state_has_eob], 1 | |
580 | ||
;; end: common exit. Publishes progress to the stream structure, stores the
;; bit-buffer state, restores the saved registers and returns. It is reached
;; both on normal completion and from every "output buffer full" check.
581 | end: | |
582 | ;; update input buffer | |
;; f_i = number of input bytes consumed.
583 | add [stream + _total_in], f_i %+ d | |
584 | add [stream + _next_in], f_i | |
585 | sub [stream + _avail_in], f_i %+ d | |
586 | ||
587 | ;; update output buffer | |
;; After the subtraction m_out_buf holds bytes written (m_out_buf - m_out_start).
588 | mov [stream + _next_out], m_out_buf | |
589 | sub m_out_buf, [stream + _internal_state_bitbuf_m_out_start] | |
590 | sub [stream + _avail_out], m_out_buf %+ d | |
591 | add [stream + _total_out], m_out_buf %+ d | |
592 | ||
;; Bits not yet flushed to memory stay in m_bits / m_bit_count for the caller.
593 | mov [stream + _internal_state_bitbuf_m_bits], m_bits | |
594 | mov [stream + _internal_state_bitbuf_m_bit_count], m_bit_count %+ d | |
595 | ||
;; Restore in the same slot order used by the prologue.
596 | mov rbx, [rsp + gpr_save_mem_offset + 0*8] | |
597 | mov rsi, [rsp + gpr_save_mem_offset + 1*8] | |
598 | mov rdi, [rsp + gpr_save_mem_offset + 2*8] | |
599 | mov rbp, [rsp + gpr_save_mem_offset + 3*8] | |
600 | mov r12, [rsp + gpr_save_mem_offset + 4*8] | |
601 | mov r13, [rsp + gpr_save_mem_offset + 5*8] | |
602 | mov r14, [rsp + gpr_save_mem_offset + 6*8] | |
603 | mov r15, [rsp + gpr_save_mem_offset + 7*8] | |
604 | ||
;; In the ALIGN_STACK build the rbp just reloaded is the frame pointer saved by
;; the prologue; it gives back the pre-alignment rsp, then the caller's rbp is
;; popped.
605 | %ifndef ALIGN_STACK | |
606 | add rsp, stack_size | |
607 | %else | |
608 | mov rsp, rbp | |
609 | pop rbp | |
610 | %endif | |
611 | ret | |
612 | ||
;; compare_loop: out-of-line path taken from loop2 when the first 8 bytes of
;; the first candidate all match. tmp1 = current address, tmp2 = candidate
;; address; the match length is produced in len, then control joins
;; len_dist_huffman (skipping the bsf/shr conversion in len_dist_huffman_pre).
;; The compare250* macros are defined elsewhere; COMPARE_TYPE picks the
;; general-purpose (1), xmm (2) or ymm (3) variant. Presumably they compare up
;; to 250 further bytes -- TODO confirm against the macro definitions.
613 | MARK __stateless_compare_loops_ %+ ARCH | |
614 | compare_loop: | |
615 | %if (COMPARE_TYPE == 1) | |
616 | compare250 tmp1, tmp2, len, tmp3 | |
617 | %elif (COMPARE_TYPE == 2) | |
618 | compare250_x tmp1, tmp2, len, tmp3, xtmp0, xtmp1 | |
619 | %elif (COMPARE_TYPE == 3) | |
620 | compare250_y tmp1, tmp2, len, tmp3, ytmp0, ytmp1 | |
621 | %else | |
622 | %error Unknown Compare type COMPARE_TYPE | |
623 | % error | |
624 | %endif | |
625 | jmp len_dist_huffman | |
626 | ||
;; compare_loop2: out-of-line path taken from loop2 when the first 8 bytes of
;; the SECOND candidate (the one starting at f_i + 1) all match. The match
;; length is produced in len2; the first byte is then looked up as a literal
;; and control joins len_dist_lit_huffman.
627 | compare_loop2: | |
628 | %if (COMPARE_TYPE == 1) | |
629 | compare250 tmp1, tmp2, len2, tmp3 | |
630 | %elif (COMPARE_TYPE == 2) | |
631 | compare250_x tmp1, tmp2, len2, tmp3, xtmp0, xtmp1 | |
632 | %elif (COMPARE_TYPE == 3) | |
633 | compare250_y tmp1, tmp2, len2, tmp3, ytmp0, ytmp1 | |
634 | %else | |
635 | %error Unknown Compare type COMPARE_TYPE | |
636 | % error | |
637 | %endif | |
;; curr_data still holds the dword loaded at the first position; its low byte
;; is the literal that precedes the match.
638 | and curr_data, 0xff | |
639 | get_lit_code curr_data, code3, code_len3, hufftables | |
640 | jmp len_dist_lit_huffman | |
641 | ||
;; Data section: the window size D stored as a 64-bit constant.
;; NOTE(review): const_D is not referenced by any instruction in the visible
;; code, and it is a qword placed after "align 4" rather than "align 8" --
;; confirm whether it is still needed and whether 8-byte alignment was intended.
642 | section .data | |
643 | align 4 | |
644 | const_D: dq D |