3 %include "reg_sizes.asm"
;; Negative status codes.  INVALID_SYMBOL and INVALID_LOOKBACK are loaded
;; into rax by the error paths near the end of this file.
8 %define INVALID_BLOCK -1
9 %define INVALID_SYMBOL -2
10 %define INVALID_LOOKBACK -3
;; Lookup sizes handed to decode_next / decode_next2: the LONG value is
;; used together with _lit_huff_code, the SHORT value with _dist_huff_code.
12 %define ISAL_DECODE_LONG_BITS 12
13 %define ISAL_DECODE_SHORT_BITS 10
;; Bounds derived from the lookup sizes above (not referenced in the
;; visible part of this file).
15 %define MAX_LONG_CODE_LARGE (288 + (1 << (15 - ISAL_DECODE_LONG_BITS)))
16 %define MAX_LONG_CODE_SMALL (32 + (1 << (15 - ISAL_DECODE_SHORT_BITS)))
;; NOTE(review): presumably the longest copy a single length/distance pair
;; can request -- confirm against the length table.
19 %define COPY_LEN_MAX 258
;; Safety margins: the decoder subtracts these from end_in / end_out before
;; its main loop and adds them back before the end-of-buffer handling.
21 %define IN_BUFFER_SLOP 8
;; Parenthesized so the expansion stays a single term in any expression
;; (%define is a textual substitution), matching MAX_LONG_CODE_* above.
22 %define OUT_BUFFER_SLOP (COPY_SIZE + COPY_LEN_MAX)
24 %include "inflate_data_structs.asm"
27 extern rfc1951_lookup_table
;; Symbolic names for the general purpose registers used by the decoder.
;; Only part of the alias list is visible here.
32 %define look_back_dist rax
;; copy_start is the address the match bytes are read from.
37 %define copy_start rdx
;; look_back_dist2 and next_bits2 are both rdi, so they can never hold
;; live values at the same time.
42 %define look_back_dist2 rdi
43 %define next_bits2 rdi
57 %define repeat_length r8
;; Number of valid bits currently held in read_in.
63 %define read_in_length r10
;; Loaded with the address of rfc1951_lookup_table at function entry.
78 %define rfc_lookup r15
;; Stack frame layout, as byte offsets from rsp.
;; Slot holding the lower bound that copy sources are checked against; it
;; is also subtracted from next_out when total_out is written back.
80 start_out_mem_offset equ 0
;; Slots where read_in / read_in_length are parked before a symbol is
;; decoded in the end-of-buffer loop, and reloaded from afterwards.
81 read_in_mem_offset equ 8
82 read_in_length_mem_offset equ 16
;; Base of the area the register save/restore code writes to.
83 gpr_save_mem_offset equ 24
;; Three 8-byte local slots plus eight 8-byte register save slots (the
;; win64 path uses all eight, the elf64 path the first six).
84 stack_size equ 3 * 8 + 8 * 8
;; Byte offsets of the sub-tables inside rfc1951_lookup_table (addressed
;; through rfc_lookup).  Each offset is the previous one plus the size of
;; the previous sub-table.  The sums are parenthesized so that every name
;; expands to a single term regardless of the surrounding expression.
;; Extra-bit counts for distance symbols, read as bytes.
86 %define _dist_extra_bit_count 264
;; Distance base values, read as dwords (follows 32 one-byte entries).
87 %define _dist_start (_dist_extra_bit_count + 1*32)
;; Extra-bit counts for length symbols, read as bytes (follows 32 dwords).
88 %define _len_extra_bit_count (_dist_start + 4*32)
;; Length base values, read as words (follows 32 one-byte entries).
89 %define _len_start (_len_extra_bit_count + 1*32)
;; elf64 build: register save/restore code.  rbx, rbp and r12-r15 are the
;; callee-saved general purpose registers of the System V AMD64 ABI.
91 %ifidn __OUTPUT_FORMAT__, elf64
;; Spill the callee-saved registers into slots 0-5 of the save area.
104 mov [rsp + gpr_save_mem_offset + 0*8], rbx
105 mov [rsp + gpr_save_mem_offset + 1*8], rbp
106 mov [rsp + gpr_save_mem_offset + 2*8], r12
107 mov [rsp + gpr_save_mem_offset + 3*8], r13
108 mov [rsp + gpr_save_mem_offset + 4*8], r14
109 mov [rsp + gpr_save_mem_offset + 5*8], r15
;; Reload the same registers from the same slots, in the same order.
112 %macro FUNC_RESTORE 0
113 mov rbx, [rsp + gpr_save_mem_offset + 0*8]
114 mov rbp, [rsp + gpr_save_mem_offset + 1*8]
115 mov r12, [rsp + gpr_save_mem_offset + 2*8]
116 mov r13, [rsp + gpr_save_mem_offset + 3*8]
117 mov r14, [rsp + gpr_save_mem_offset + 4*8]
118 mov r15, [rsp + gpr_save_mem_offset + 5*8]
;; win64 build: register save/restore code.  In addition to the registers
;; saved for elf64, rsi and rdi are callee-saved in the Microsoft x64
;; convention, so eight slots are used here.
129 %ifidn __OUTPUT_FORMAT__, win64
;; Spill the callee-saved registers into slots 0-7 of the save area.
141 mov [rsp + gpr_save_mem_offset + 0*8], rbx
142 mov [rsp + gpr_save_mem_offset + 1*8], rsi
143 mov [rsp + gpr_save_mem_offset + 2*8], rdi
144 mov [rsp + gpr_save_mem_offset + 3*8], rbp
145 mov [rsp + gpr_save_mem_offset + 4*8], r12
146 mov [rsp + gpr_save_mem_offset + 5*8], r13
147 mov [rsp + gpr_save_mem_offset + 6*8], r14
148 mov [rsp + gpr_save_mem_offset + 7*8], r15
;; Reload the same registers from the same slots, in the same order.
151 %macro FUNC_RESTORE 0
152 mov rbx, [rsp + gpr_save_mem_offset + 0*8]
153 mov rsi, [rsp + gpr_save_mem_offset + 1*8]
154 mov rdi, [rsp + gpr_save_mem_offset + 2*8]
155 mov rbp, [rsp + gpr_save_mem_offset + 3*8]
156 mov r12, [rsp + gpr_save_mem_offset + 4*8]
157 mov r13, [rsp + gpr_save_mem_offset + 5*8]
158 mov r14, [rsp + gpr_save_mem_offset + 6*8]
159 mov r15, [rsp + gpr_save_mem_offset + 7*8]
170 ;; Load read_in and update in_buffer accordingly
171 ;; when there are at least 8 bytes in the in buffer
172 ;; Clobbers rcx, unless rcx is %%read_in_length
;; Parameter 4 is the bit count of read_in and parameter 5 is a scratch
;; register; the remaining parameter defines are outside this view.
173 %macro inflate_in_load 6
177 %define %%read_in_length %4
178 %define %%tmp1 %5 ; Tmp registers
;; tmp1 = value at [next_in] shifted left past the bits already buffered
181 SHLX %%tmp1, [%%next_in], %%read_in_length
185 sub %%tmp1, %%read_in_length
;; tmp1 is a byte count at this point: next_in moves forward by tmp1
;; bytes and the bit count grows by 8 bits for each of them.
188 add %%next_in, %%tmp1
189 lea %%read_in_length, [%%read_in_length + 8 * %%tmp1]
193 ;; Load read_in and update in_buffer accordingly
194 ;; Clobbers rcx, unless rcx is %%read_in_length
;; Variant that reads the input one byte at a time (see the byte load
;; below) instead of with a single wide load.
195 %macro inflate_in_small_load 6
199 %define %%read_in_length %4
200 %define %%avail_in %5 ; Tmp registers
202 %define %%loop_count %6
;; avail_in = end_in - next_in, the number of input bytes left
204 mov %%avail_in, %%end_in
205 sub %%avail_in, %%next_in
;; The bit count is kept in rcx while loading because rcx is the shift
;; count of the SHLX below; skipped when the caller already passed rcx.
207 %ifnidn %%read_in_length, rcx
208 mov rcx, %%read_in_length
212 sub %%loop_count, %%read_in_length
;; loop_count = min(loop_count, avail_in), signed comparison
215 cmp %%loop_count, %%avail_in
216 cmovg %%loop_count, %%avail_in
;; Fetch one input byte and shift it up to bit position rcx
222 mov %%tmp1 %+ b, byte [%%next_in]
223 SHLX %%tmp1, %%tmp1, rcx
;; Copy the updated bit count back from rcx to the caller's register
229 %ifnidn %%read_in_length, rcx
230 mov %%read_in_length, rcx
235 ;; Decode next symbol
;; Performs the table lookup itself: first on the low %%lookup_size bits
;; of read_in, then (hint case) on a second index built from further bits.
;; rcx is used as the bit count for the BZHI/SHRX below; the lines that
;; load it are outside this view.
238 %define %%state %1 ; State structure associated with compressed stream
239 %define %%lookup_size %2 ; Number of bits used for small lookup
240 %define %%state_offset %3
241 %define %%read_in %4 ; Bits read in from compressed stream
242 %define %%read_in_length %5 ; Number of valid bits in read_in
243 %define %%next_sym %6 ; Returned symbol
244 %define %%next_bits %7
245 %define %%next_bits2 %8
247 ;; Lookup possible next symbol
;; Index the word table at state + state_offset with the low lookup_size
;; bits of read_in.
248 mov %%next_bits, %%read_in
249 and %%next_bits, (1 << %%lookup_size) - 1
250 movzx %%next_sym, word [%%state + %%state_offset + 2 * %%next_bits]
252 ;; Save length associated with symbol
257 ;; Check if symbol or hint was looked up
;; The mask keeps bit 15 and the low 9 bits of the table entry.
258 and %%next_sym, 0x81FF
259 cmp %%next_sym, 0x8000
262 ;; Decode next_sym using hint
263 mov %%next_bits2, %%read_in
265 ;; Extract the 15-DECODE_LOOKUP_SIZE bits beyond the first DECODE_LOOKUP_SIZE bits.
;; Keep the low rcx bits, then drop the lookup_size bits already used.
268 bzhi %%next_bits2, %%next_bits2, rcx
274 shr %%next_bits2, %%lookup_size
276 add %%next_bits2, %%next_sym
278 ;; Lookup actual next symbol
;; NOTE(review): the "- 0x8000" term presumably cancels the flag bit that
;; is still set in next_sym when it was added above -- confirm.
279 movzx %%next_sym, word [%%state + %%state_offset + 2 * %%next_bits2 + 2 *((1 << %%lookup_size) - 0x8000)]
281 ;; Save length associated with symbol
;; Keep only the low 9 bits of the entry as the returned symbol.
285 and %%next_sym, 0x1FF
287 ;; Update read_in to reflect the bits which were decoded
288 sub %%read_in_length, rcx
289 SHRX %%read_in, %%read_in, rcx
293 ;; Decode next symbol
;; Unlike decode_next, the visible lines contain no initial table load:
;; %%next_sym is masked and tested as it arrives, and the function below
;; preloads the table entry before each use of this macro.
295 %macro decode_next2 7
296 %define %%state %1 ; State structure associated with compressed stream
297 %define %%lookup_size %2 ; Number of bits used for small lookup
298 %define %%state_offset %3 ; Type of huff code, should be either LIT or DIST
299 %define %%read_in %4 ; Bits read in from compressed stream
300 %define %%read_in_length %5 ; Number of valid bits in read_in
301 %define %%next_sym %6 ; Returned symbol
302 %define %%next_bits2 %7
304 ;; Save length associated with symbol
;; next_bits2 = read_in with the first lookup_size bits shifted out
305 mov %%next_bits2, %%read_in
306 shr %%next_bits2, %%lookup_size
312 ;; Check if symbol or hint was looked up
;; The mask keeps bit 15 and the low 9 bits of the table entry.
313 and %%next_sym, 0x81FF
314 cmp %%next_sym, 0x8000
317 ;; Extract the 15-DECODE_LOOKUP_SIZE bits beyond the first %%lookup_size bits.
;; Fold the state base and 2 * next_sym into next_sym so it can serve as
;; the base register of the second lookup.
318 lea %%next_sym, [%%state + 2 * %%next_sym]
;; Adjust rcx before it is used as the BZHI bit count; the line that
;; loads rcx is outside this view.
319 sub rcx, 0x40 + %%lookup_size
322 bzhi %%next_bits2, %%next_bits2, rcx
324 ;; Decode next_sym using hint
330 ;; Lookup actual next symbol
331 movzx %%next_sym, word [%%next_sym + %%state_offset + 2 * %%next_bits2 + 2 * ((1 << %%lookup_size) - 0x8000)]
333 ;; Save length associated with symbol
;; Keep only the low 9 bits of the entry as the returned symbol.
337 and %%next_sym, 0x1FF
340 ;; Update read_in to reflect the bits which were decoded
341 SHRX %%read_in, %%read_in, rcx
342 sub %%read_in_length, rcx
;; Entry point.  The exported name is decode_huffman_code_block_stateless_
;; followed by the value of ARCH.  This first part caches the stream state
;; in registers and prepares the bounds used by the main loop.
345 global decode_huffman_code_block_stateless_ %+ ARCH
346 decode_huffman_code_block_stateless_ %+ ARCH %+ :
;; rfc_lookup = address of the shared length/distance tables
351 lea rfc_lookup, [rfc1951_lookup_table]
;; Pull the bit buffer and the buffer pointers out of the state struct
353 mov read_in,[state + _read_in]
354 mov read_in_length %+ d, dword [state + _read_in_length]
355 mov next_out, [state + _next_out]
;; end_out = next_out + avail_out
356 mov end_out %+ d, dword [state + _avail_out]
357 add end_out, next_out
358 mov next_in, [state + _next_in]
359 mov end_in %+ d, dword [state + _avail_in]
;; Clear the pending-copy fields; they are only written again on the
;; out_buffer_overflow_repeat path.
362 mov dword [state + _copy_overflow_len], 0
363 mov dword [state + _copy_overflow_dist], 0
;; Stash a value derived from total_out in the start_out slot; copy
;; sources are later rejected when they fall below it.
365 mov tmp3 %+ d, dword [state + _total_out]
369 mov [rsp + start_out_mem_offset], tmp3
;; Pull both end pointers in by their slop so the main loop can test
;; against them directly; the slop is added back before the tail handling.
371 sub end_out, OUT_BUFFER_SLOP
372 sub end_in, IN_BUFFER_SLOP
375 jg end_loop_block_pre
;; 64 means the bit buffer is already full
377 cmp read_in_length, 64
380 inflate_in_load next_in, end_in, read_in, read_in_length, tmp1, tmp2
;; Preload the first literal/length table entry for decode_next2
384 and tmp3, (1 << ISAL_DECODE_LONG_BITS) - 1
385 movzx next_sym, word [state + _lit_huff_code + 2 * tmp3]
387 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Main decode loop.  It runs only while both buffers are further than
;; their slop from the end, and interleaves the work for the current
;; symbol with speculative loads for the next one.
389 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
391 ;; Check if near end of in buffer or out buffer
393 jg end_loop_block_pre
394 cmp next_out, end_out
395 jg end_loop_block_pre
397 ;; Decode next symbol and reload the read_in buffer
398 decode_next2 state, ISAL_DECODE_LONG_BITS, _lit_huff_code, read_in, read_in_length, next_sym, tmp1
400 ;; Save next_sym in next_sym2 so next_sym can be preloaded
401 mov next_sym2, next_sym
403 ;; Find index to speculatively preload next_sym from
405 and tmp3, (1 << ISAL_DECODE_LONG_BITS) - 1
407 ;; Start reloading read_in
409 SHLX tmp1, tmp1, read_in_length
412 ;; Speculatively load data associated with length symbol
;; The -257 bias makes these reads meaningful only for symbols >= 257;
;; they are issued before the symbol type has been checked.
413 movzx rcx, byte [rfc_lookup + _len_extra_bit_count + next_sym2 - 257]
414 movzx repeat_length, word [rfc_lookup + _len_start + 2 * (next_sym2 - 257)]
416 ;; Test for end of block symbol
420 ;; Speculatively load next_sym for next loop if a literal was decoded
421 movzx next_sym, word [state + _lit_huff_code + 2 * tmp3]
423 ;; Finish updating read_in_length for read_in
425 sub tmp1, read_in_length
;; tmp1 is a byte count here; each byte adds 8 bits to the bit count
428 lea read_in_length, [read_in_length + 8 * tmp1]
430 ;; Speculatively load next dist code
;; read_in_2 = read_in with the rcx length extra bits skipped
431 SHRX read_in_2, read_in, rcx
432 mov next_bits2, read_in_2
433 and next_bits2, (1 << ISAL_DECODE_SHORT_BITS) - 1
434 movzx next_sym3, word [state + _dist_huff_code + 2 * next_bits2]
436 ;; Speculatively write next_sym2 if it is a literal
437 mov [next_out], next_sym2
440 ;; Check if next_sym2 is a literal, length, or end of block symbol
445 ;; Find length for length/dist pair
446 mov next_bits, read_in
;; next_bits = low rcx bits of read_in, i.e. the length extra bits
448 BZHI next_bits, next_bits, rcx, tmp4
449 add repeat_length, next_bits
451 ;; Update read_in for the length extra bits which were read in
452 sub read_in_length, rcx
454 ;; Decode distance code
455 decode_next2 state, ISAL_DECODE_SHORT_BITS, _dist_huff_code, read_in_2, read_in_length, next_sym3, tmp2
;; rcx = extra-bit count and look_back_dist2 = base value for the
;; decoded distance symbol
457 movzx rcx, byte [rfc_lookup + _dist_extra_bit_count + next_sym3]
458 mov look_back_dist2 %+ d, [rfc_lookup + _dist_start + 4 * next_sym3]
460 ;; Load distance code extra bits
461 mov next_bits, read_in_2
463 ;; Determine next_out after the copy is finished
464 add next_out, repeat_length
467 ;; Calculate the look back distance
468 BZHI next_bits, next_bits, rcx, tmp4
469 SHRX read_in_2, read_in_2, rcx
471 ;; Setup next_sym, read_in, and read_in_length for next loop
472 mov read_in, read_in_2
473 and read_in_2, (1 << ISAL_DECODE_LONG_BITS) - 1
474 movzx next_sym, word [state + _lit_huff_code + 2 * read_in_2]
475 sub read_in_length, rcx
477 ;; Copy distance in len/dist pair
478 add look_back_dist2, next_bits
480 ;; Find beginning of copy
;; copy_start = next_out - repeat_length - look_back_dist2.  next_out was
;; already advanced past the copy, so copy_start + look_back_dist2 is the
;; first destination byte.
481 mov copy_start, next_out
482 sub copy_start, repeat_length
483 sub copy_start, look_back_dist2
485 ;; Check if a valid look back distance was decoded
;; Signed comparison against the bound stored at function entry
486 cmp copy_start, [rsp + start_out_mem_offset]
487 jl invalid_look_back_distance
488 MOVDQU xmm1, [copy_start]
490 ;; Set tmp2 to be the minimum of COPY_SIZE and repeat_length
491 ;; This is to decrease use of small_byte_copy branch
493 cmp tmp2, repeat_length
494 cmovg tmp2, repeat_length
496 ;; Check for overlapping memory in the copy
497 cmp look_back_dist2, tmp2
498 jl small_byte_copy_pre
501 ;; Copy length distance pair when memory overlap is not an issue
;; One block is stored per step while the next source block is loaded
502 MOVDQU [copy_start + look_back_dist2], xmm1
504 sub repeat_length, COPY_SIZE
507 add copy_start, COPY_SIZE
508 MOVDQU xmm1, [copy_start]
512 ;; Copy length distance pair when source and destination overlap
513 add repeat_length, look_back_dist2
515 MOVDQU [copy_start + look_back_dist2], xmm1
;; Double the distance after each store, then reload from copy_start
517 shl look_back_dist2, 1
518 MOVDQU xmm1, [copy_start]
519 cmp look_back_dist2, COPY_SIZE
522 sub repeat_length, look_back_dist2
526 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; End-of-buffer loop.  It works against the real buffer ends, loads
;; input with the byte-wise macro, and checks the output bound before
;; each literal store and each copy.
528 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
530 ;; Fix up in buffer and out buffer to reflect the actual buffer end
531 add end_out, OUT_BUFFER_SLOP
532 add end_in, IN_BUFFER_SLOP
535 ;; Load read in buffer and decode next lit/len symbol
536 inflate_in_small_load next_in, end_in, read_in, read_in_length, tmp1, tmp2
;; Park the bit buffer on the stack before decoding; it is reloaded from
;; these slots further down and on the out_buffer_overflow_lit path.
537 mov [rsp + read_in_mem_offset], read_in
538 mov [rsp + read_in_length_mem_offset], read_in_length
540 decode_next state, ISAL_DECODE_LONG_BITS, _lit_huff_code, read_in, read_in_length, next_sym, tmp1, tmp2
542 ;; Check that enough input was available to decode symbol
543 cmp read_in_length, 0
551 ;; Load length extra bits
552 mov next_bits, read_in
;; Base length and extra-bit count for the symbol; the tables are
;; indexed from symbol 257.
554 movzx repeat_length, word [rfc_lookup + _len_start + 2 * (next_sym - 257)]
555 movzx rcx, byte [rfc_lookup + _len_extra_bit_count + next_sym - 257]
557 ;; Calculate repeat length
558 BZHI next_bits, next_bits, rcx, tmp1
559 add repeat_length, next_bits
561 ;; Update read_in for the length extra bits which were read in
562 SHRX read_in, read_in, rcx
563 sub read_in_length, rcx
565 ;; Decode distance code
566 decode_next state, ISAL_DECODE_SHORT_BITS, _dist_huff_code, read_in, read_in_length, next_sym, tmp1, tmp2
568 ;; Load distance code extra bits
569 mov next_bits, read_in
570 mov look_back_dist %+ d, [rfc_lookup + _dist_start + 4 * next_sym]
571 movzx rcx, byte [rfc_lookup + _dist_extra_bit_count + next_sym]
574 ;; Calculate the look back distance and check for enough input
575 BZHI next_bits, next_bits, rcx, tmp1
576 SHRX read_in, read_in, rcx
577 add look_back_dist, next_bits
578 sub read_in_length, rcx
581 ;; Setup code for byte copy using rep movsb
;; rcx = byte count; rsi is moved back by the distance to reach the source
584 mov rcx, repeat_length
585 sub rsi, look_back_dist
587 ;; Check if a valid look back distance was decoded
588 cmp rsi, [rsp + start_out_mem_offset]
589 jl invalid_look_back_distance
591 ;; Check for out buffer overflow
;; repeat_length becomes next_out + length, the end address of the copy
592 add repeat_length, next_out
593 cmp repeat_length, end_out
594 jg out_buffer_overflow_repeat
596 mov next_out, repeat_length
602 ;; Store literal decoded from the input stream
603 cmp next_out, end_out
604 jge out_buffer_overflow_lit
;; NOTE(review): the store targets next_out - 1, so next_out is presumably
;; advanced on a line outside this view -- confirm.
606 mov byte [next_out - 1], next_sym %+ b
;; Reload the bit buffer parked on the stack before the decode attempt
611 mov read_in, [rsp + read_in_mem_offset]
612 mov read_in_length, [rsp + read_in_length_mem_offset]
;; A copy would run past end_out.  repeat_length arrives holding the end
;; address of the copy; rcx is set on lines outside this view.
616 out_buffer_overflow_repeat:
619 sub repeat_length, rcx
620 sub repeat_length, next_out
;; Record the adjusted length and the distance in the state struct
;; NOTE(review): presumably so the caller can resume the copy -- confirm.
623 mov [state + _copy_overflow_len], repeat_length %+ d
624 mov [state + _copy_overflow_dist], look_back_dist %+ d
;; The output buffer is reported as completely filled
626 mov next_out, end_out
628 mov rax, OUT_OVERFLOW
;; No room for a literal: reload the bit buffer parked before the decode
;; and report the overflow.
631 out_buffer_overflow_lit:
632 mov read_in, [rsp + read_in_mem_offset]
633 mov read_in_length, [rsp + read_in_length_mem_offset]
634 mov rax, OUT_OVERFLOW
;; The copy source fell below the bound held in the start_out slot
637 invalid_look_back_distance:
638 mov rax, INVALID_LOOKBACK
;; Status for a separate error path whose label is outside this view
642 mov rax, INVALID_SYMBOL
646 ;; Fix up in buffer and out buffer to reflect the actual buffer
647 add end_out, OUT_BUFFER_SLOP
648 add end_in, IN_BUFFER_SLOP
650 ;; Set flag identifying a new block is required
651 mov byte [state + _block_state], ISAL_BLOCK_NEW_HDR
654 ;; Save current buffer states
655 mov [state + _read_in], read_in
656 mov [state + _read_in_length], read_in_length %+ d
657 mov [state + _next_out], next_out
;; avail_out = end_out - next_out
658 sub end_out, next_out
659 mov dword [state + _avail_out], end_out %+ d
;; total_out = next_out - value kept in the start_out slot
660 sub next_out, [rsp + start_out_mem_offset]
661 mov [state + _total_out], next_out %+ d
662 mov [state + _next_in], next_in
;; end_in is stored as avail_in; the line that turns it into a remaining
;; byte count is outside this view.
664 mov [state + _avail_in], end_in %+ d