4 %include "lz0a_const.asm"
5 %include "data_struct2.asm"
8 %include "igzip_compare_types.asm"
9 %include "reg_sizes.asm"
;; External lookup table; isal_update_histogram loads its address into rfc_lookup.
13 extern rfc1951_lookup_table
;; Offset added to the table base by len_to_len_code when it reads a length code.
14 _len_to_code_offset equ 0
16 %define LAST_BYTES_COUNT 3 ; Bytes to prevent reading out of array bounds
17 %define LA_STATELESS 280 ; Max number of bytes read in loop2 rounded up to 8 byte boundary
;; Scale applied to an index when addressing the qword counters of the
;; literal/length and distance tables in the histogram.
20 %define HIST_ELEM_SIZE 8
32 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
33 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
34 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Register aliases (single-line macros) for the routine below.
;; NOTE(review): only part of the alias list is visible in this excerpt; the
;; other names used by the code (f_i, hash, dist, len, tmp1, ...) are presumably
;; defined on lines not shown here.
35 %define file_start rdi
36 %define file_length rsi
46 %define dist_code2 rbx
83 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
84 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
85 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Stack frame layout, expressed as byte offsets from rsp.
;; Slot in which isal_update_histogram keeps the index-256 literal/length
;; counter it reads at entry, so that it can write it back before returning.
86 _eob_count_offset equ 0 ; local variable (8 bytes)
;; Second 8-byte slot; nothing in the visible code reads or writes it.
87 f_end_i_mem_offset equ 8
88 gpr_save_mem_offset equ 16 ; gpr save area (8*8 bytes)
89 xmm_save_mem_offset equ 16 + 8*8 ; xmm save area (4*16 bytes) (16 byte aligned)
;; Total = two 8-byte locals + gpr save area + xmm save area + 8 bytes of padding.
90 stack_size equ 2*8 + 8*8 + 4*16 + 8
91 ;;; 8 because stack address is odd multiple of 8 after a function call and
92 ;;; we want it aligned to 16 bytes
;; ELF64 build: rbx, rbp and r12-r15 (the callee-saved general-purpose
;; registers of the System V AMD64 ABI) are kept in the gpr save area.
;; NOTE(review): the %macro line that opens the save sequence, the stack
;; pointer adjustment and the closing %endmacro/%endif lines are outside this
;; excerpt.
94 %ifidn __OUTPUT_FORMAT__, elf64
;; Save: one 8-byte slot per register, starting at gpr_save_mem_offset.
109 mov [rsp + gpr_save_mem_offset + 0*8], rbx
110 mov [rsp + gpr_save_mem_offset + 1*8], rbp
111 mov [rsp + gpr_save_mem_offset + 2*8], r12
112 mov [rsp + gpr_save_mem_offset + 3*8], r13
113 mov [rsp + gpr_save_mem_offset + 4*8], r14
114 mov [rsp + gpr_save_mem_offset + 5*8], r15
;; FUNC_RESTORE: reload the same registers from the same slots.
117 %macro FUNC_RESTORE 0
118 mov rbx, [rsp + gpr_save_mem_offset + 0*8]
119 mov rbp, [rsp + gpr_save_mem_offset + 1*8]
120 mov r12, [rsp + gpr_save_mem_offset + 2*8]
121 mov r13, [rsp + gpr_save_mem_offset + 3*8]
122 mov r14, [rsp + gpr_save_mem_offset + 4*8]
123 mov r15, [rsp + gpr_save_mem_offset + 5*8]
;; Win64 build: in addition to rbx, rbp and r12-r15, rsi and rdi are saved,
;; since the Microsoft x64 convention treats them as callee-saved. All eight
;; slots of the gpr save area are used.
;; NOTE(review): the %macro line that opens the save sequence, any xmm
;; save/restore, the stack pointer adjustment and the closing
;; %endmacro/%endif lines are outside this excerpt.
134 %ifidn __OUTPUT_FORMAT__, win64
;; Save: one 8-byte slot per register, starting at gpr_save_mem_offset.
149 mov [rsp + gpr_save_mem_offset + 0*8], rbx
150 mov [rsp + gpr_save_mem_offset + 1*8], rsi
151 mov [rsp + gpr_save_mem_offset + 2*8], rdi
152 mov [rsp + gpr_save_mem_offset + 3*8], rbp
153 mov [rsp + gpr_save_mem_offset + 4*8], r12
154 mov [rsp + gpr_save_mem_offset + 5*8], r13
155 mov [rsp + gpr_save_mem_offset + 6*8], r14
156 mov [rsp + gpr_save_mem_offset + 7*8], r15
;; FUNC_RESTORE: reload the same registers from the same slots.
159 %macro FUNC_RESTORE 0
160 mov rbx, [rsp + gpr_save_mem_offset + 0*8]
161 mov rsi, [rsp + gpr_save_mem_offset + 1*8]
162 mov rdi, [rsp + gpr_save_mem_offset + 2*8]
163 mov rbp, [rsp + gpr_save_mem_offset + 3*8]
164 mov r12, [rsp + gpr_save_mem_offset + 4*8]
165 mov r13, [rsp + gpr_save_mem_offset + 5*8]
166 mov r14, [rsp + gpr_save_mem_offset + 6*8]
167 mov r15, [rsp + gpr_save_mem_offset + 7*8]
;; Byte offsets of the three tables inside the histogram structure.
;; The literal/length and distance counters are addressed elsewhere in this
;; file as HIST_ELEM_SIZE-byte elements, so each table starts at the previous
;; table's offset plus its element count times HIST_ELEM_SIZE. The hash table
;; follows the distance table.
179 _lit_len_offset equ 0
180 _dist_offset equ (HIST_ELEM_SIZE * LIT_LEN)
181 _hash_offset equ (_dist_offset + HIST_ELEM_SIZE * DIST_LEN)
;;; len_to_len_code: fetch the code for a match length from a byte table.
;;;   %1 (out) len_code   - receives the zero-extended table byte
;;;   %2 (in)  len        - used as the byte index into the table
;;;   %3 (in)  rfc_lookup - base address of the lookup table
;;; The entry read is at rfc_lookup + _len_to_code_offset + len.
184 %macro len_to_len_code 3
185 %define %%len_code %1 ; Output
186 %define %%len %2 ; Input
187 %define %%rfc_lookup %3
188 movzx %%len_code, byte [%%rfc_lookup + _len_to_code_offset + %%len]
;;; dist_to_dist_code: convert a match distance into its distance code.
;;;   %1 (out) register that receives the code (its 32-bit view is %1d)
;;;   %2 (in)  register holding the distance (only its 32-bit view %2d is used)
;;; NOTE(review): several lines of this macro (including whatever sets the
;;; flags consumed by cmovle, and the closing %endmacro) are outside this
;;; excerpt, so the comments below describe only the visible instructions.
192 ;;; Clobbers rcx and dist
193 %macro dist_to_dist_code 2
194 %define %%dist_code %1 ; Output code associated with dist
195 %define %%dist_coded %1d
196 %define %%dist %2d ; Input dist
;; Work on a copy of the input in the output register.
198 mov %%dist_coded, %%dist
;; ecx = bit index of the most significant set bit of the copy.
199 bsr ecx, %%dist_coded
;; Shift the working value right by the count held in rcx.
201 SHRX %%dist_code, %%dist_code, rcx
;; Add twice the shift count to the shifted value.
202 lea %%dist_coded, [%%dist_coded + 2*ecx]
;; When the (not visible) preceding comparison yields "less or equal", the
;; result is replaced by the value currently held in the input register.
205 cmovle %%dist_coded, %%dist
;;; dist_to_dist_code2: variant of dist_to_dist_code whose input is documented
;;; below as -(dist - 1) rather than dist.
;;;   %1 (out) register that receives the code (its 32-bit view is %1d)
;;;   %2 (in)  register holding the input (only its 32-bit view %2d is used)
;;; NOTE(review): the visible instructions are the same as those of
;;; dist_to_dist_code; the lines that differ (and the closing %endmacro) are
;;; outside this excerpt.
208 ;;; Clobbers rcx and dist
209 %macro dist_to_dist_code2 2
210 %define %%dist_code %1 ; Output code associated with dist
211 %define %%dist_coded %1d
212 %define %%dist %2d ; Input -(dist - 1)
;; Work on a copy of the input in the output register.
214 mov %%dist_coded, %%dist
;; ecx = bit index of the most significant set bit of the copy.
215 bsr ecx, %%dist_coded
;; Shift the working value right by the count held in rcx.
217 SHRX %%dist_code, %%dist_code, rcx
;; Add twice the shift count to the shifted value.
218 lea %%dist_coded, [%%dist_coded + 2*ecx]
;; When the (not visible) preceding comparison yields "less or equal", the
;; result is replaced by the value currently held in the input register.
221 cmovle %%dist_coded, %%dist
;; Entry point isal_update_histogram_<ARCH> (the ARCH suffix is pasted on with %+).
;; Arguments, per the %ifnidn checks below:
;;   arg0 -> file_start, arg1 -> file_length, arg2 -> histogram
;; Each argument is copied into its working register only when the two names
;; are not already the same register.
;; NOTE(review): the stack frame setup, the test that feeds `je exit_ret`, the
;; hash-clearing loop control and the %endif lines are outside this excerpt.
224 ; void isal_update_histogram
225 global isal_update_histogram_ %+ ARCH
226 isal_update_histogram_ %+ ARCH %+ :
229 %ifnidn file_start, arg0
232 %ifnidn file_length, arg1
233 mov file_length, arg1
235 %ifnidn histogram, arg2
240 je exit_ret ; If nothing to do then exit
;; Remember the literal/length counter at index 256; it is written back
;; unchanged to the same slot at the end of the routine.
242 mov tmp1, qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * 256]
244 mov [rsp + _eob_count_offset], tmp1
246 lea rfc_lookup, [rfc1951_lookup_table]
;; vtmp0 = 0 (a register XORed with itself), then stored to two adjacent
;; vector-sized pieces of the hash table, starting from the top of the table
;; (rcx = IGZIP_HASH_SIZE - V_LENGTH). Hash entries are 2 bytes wide, hence
;; the scale of 2.
249 PXOR vtmp0, vtmp0, vtmp0
250 mov rcx, (IGZIP_HASH_SIZE - V_LENGTH)
252 MOVDQU [histogram + _hash_offset + 2 * rcx], vtmp0
253 MOVDQU [histogram + _hash_offset + 2 * (rcx + V_LENGTH / 2)], vtmp0
;; Bias file_length down by the loop2 look-ahead; the tail code later adds
;; the bias back in two steps.
257 sub file_length, LA_STATELESS
262 ;; Load first literal into histogram
263 mov curr_data, [file_start + f_i]
;; Hash the loaded data, reduce it with HASH_MASK and record the low 16 bits
;; of f_i in the selected 2-byte hash table entry.
264 compute_hash hash, curr_data
265 and hash %+ d, HASH_MASK
266 mov [histogram + _hash_offset + 2 * hash], f_i %+ w
;; Count one occurrence in the literal/length table at index curr_data.
;; NOTE(review): any reduction of curr_data to a single byte before this
;; increment is on a line outside this excerpt.
268 inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * curr_data]
;; Main loop: handles two match candidates per pass (hash/dist/len and
;; hash2/dist2/len2).
;; NOTE(review): the loop label, the loads of len/len2, the shifts that derive
;; the second candidate's data and several branches are outside this excerpt.
271 ;; Setup to begin loop 2
;; Vector and scalar loads from the same address, file_start + f_i.
272 MOVDQU xdata, [file_start + f_i]
273 mov curr_data, [file_start + f_i]
274 mov curr_data2, curr_data
275 compute_hash hash, curr_data
277 compute_hash hash2, curr_data2
279 and hash2 %+ d, HASH_MASK
;; tmp1 = address of the current input position; the candidate data below is
;; addressed relative to it.
286 lea tmp1, [file_start + f_i]
288 MOVQ curr_data, xdata
291 ;; Load possible look back distances and update hash data
;; dist = f_i minus the position stored in the hash entry, in 16-bit
;; arithmetic; the entry is then overwritten with the low 16 bits of f_i.
292 mov dist %+ w, f_i %+ w
294 sub dist %+ w, word [histogram + _hash_offset + 2 * hash]
295 mov [histogram + _hash_offset + 2 * hash], f_i %+ w
;; Same computation for the second candidate, through hash2.
299 mov dist2 %+ w, f_i %+ w
301 sub dist2 %+ w, word [histogram + _hash_offset + 2 * hash2]
302 mov [histogram + _hash_offset + 2 * hash2], f_i %+ w
304 ;; Start computing hashes to be used in either the next loop or
305 ;; for updating the hash if a match is found
306 MOVQ curr_data2, xdata
309 compute_hash hash, curr_data2
311 ;; Check if look back distances are valid. Load a junk distance of 1
312 ;; if the look back distance is too long for speculative lookups.
;; Keep only the bits of dist2 selected by the mask D-1.
316 and dist2 %+ d, (D-1)
320 compute_hash hash2, tmp2
322 ;; Check for long len/dist matches (>7)
;; XOR against the candidate data: a zero result means every compared byte
;; is equal.
324 xor len, [tmp1 + dist - 1]
327 and hash %+ d, HASH_MASK
328 and hash2 %+ d, HASH_MASK
331 xor len2, [tmp1 + dist2]
334 ;; Speculatively load the code for the first literal
;; tmp1 = low byte of curr_data, zero-extended; used as the literal index below.
335 movzx tmp1, curr_data %+ b
340 ;; Check for len/dist match for first literal
;; Low 32 bits of the XOR result all zero -> take the len/dist path.
341 test len %+ d, 0xFFFFFFFF
342 jz len_dist_huffman_pre
344 ;; Store first literal
345 inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * tmp1]
347 ;; Check for len/dist match for second literal
;; The branch that consumes these flags is outside this excerpt.
348 test len2 %+ d, 0xFFFFFFFF
;; Match found for the second candidate: refresh hash table entries, reload
;; input for the next pass and count the match's length and distance codes.
;; NOTE(review): the repeat-length computation, the f_i advance and the
;; values placed in tmp3/tmp1 are on lines outside this excerpt.
350 len_dist_lit_huffman_pre:
351 ;; Calculate repeat length
355 len_dist_lit_huffman:
356 MOVQ curr_data, xdata
358 compute_hash hash3, curr_data
360 ;; Store updated hashes
;; The low 16 bits of tmp3 are written to the entries selected by hash,
;; hash2 and (further down) hash3.
361 mov [histogram + _hash_offset + 2 * hash], tmp3 %+ w
363 mov [histogram + _hash_offset + 2 * hash2], tmp3 %+ w
;; Reload vector and scalar data from file_start + f_i and start the hash
;; for the next pass.
368 MOVDQU xdata, [file_start + f_i]
369 mov curr_data, [file_start + f_i]
371 compute_hash hash, curr_data
374 mov [histogram + _hash_offset + 2 * hash3], tmp3 %+ w
;; Translate dist2/len2 into their codes.
376 dist_to_dist_code2 dist_code2, dist2
378 len_to_len_code len_code, len2, rfc_lookup
381 compute_hash hash2, tmp1
;; Count one occurrence of the length code and one of the distance code.
383 inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * len_code]
384 inc qword [histogram + _dist_offset + HIST_ELEM_SIZE * dist_code2]
386 and hash2 %+ d, HASH_MASK
;; Match found for the first candidate: same sequence as the second-candidate
;; path, but using dist/len and dist_code.
;; NOTE(review): the repeat-length computation, the f_i advance and the
;; values placed in tmp3/tmp1 are on lines outside this excerpt.
392 ;; encode as dist/len
394 len_dist_huffman_pre:
;; The low 16 bits of tmp3 are written to the entries selected by hash and hash2.
399 mov [histogram + _hash_offset + 2 * hash], tmp3 %+ w
401 mov [histogram + _hash_offset + 2 * hash2], tmp3 %+ w
;; Reload vector and scalar data from file_start + f_i and start the hash
;; for the next pass.
406 MOVDQU xdata, [file_start + f_i]
407 mov curr_data, [file_start + f_i]
409 compute_hash hash, curr_data
;; Translate dist/len into their codes.
411 dist_to_dist_code2 dist_code, dist
413 len_to_len_code len_code, len, rfc_lookup
416 compute_hash hash2, tmp1
;; Count one occurrence of the length code and one of the distance code.
418 inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * len_code]
419 inc qword [histogram + _dist_offset + HIST_ELEM_SIZE * dist_code]
421 and hash2 %+ d, HASH_MASK
;; NOTE(review): the two instructions below presumably belong to a separate
;; literal path whose label is outside this excerpt: the vector data is
;; reloaded from one byte further on and the literal at index curr_data is
;; counted.
429 MOVDQU xdata, [file_start + f_i + 1]
432 inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * curr_data]
;; Tail processing: one candidate per pass for the input that the main loop
;; leaves behind.
;; NOTE(review): the loop label, the distance validity comparison, the length
;; clamp and the loop-back branch are outside this excerpt.
;; Give back the look-ahead bias subtracted earlier, except for
;; LAST_BYTES_COUNT bytes, which stay reserved.
438 add file_length, LA_STATELESS - LAST_BYTES_COUNT
;; 32-bit load of the data at the current position, hashed and reduced with
;; HASH_MASK.
443 mov curr_data %+ d, dword [file_start + f_i]
444 compute_hash hash, curr_data
445 and hash %+ d, HASH_MASK
447 ;; Calculate possible distance for length/dist pair.
;; dist = f_i minus the position stored in the hash entry, in 16-bit
;; arithmetic; the entry is then overwritten with the low 16 bits of f_i.
449 mov dist %+ w, f_i %+ w
450 sub dist %+ w, word [histogram + _hash_offset + 2 * hash]
451 mov [histogram + _hash_offset + 2 * hash], f_i %+ w
453 ;; Check if look back distance is valid (the dec is to handle when dist = 0)
456 jae encode_literal_finish
459 ;; Check if look back distance is a match
;; tmp4 = file_length + LAST_BYTES_COUNT, tmp1 = address of the current
;; position; both are handed to the compare macro, which produces len.
;; NOTE(review): tmp2 is set on a line outside this excerpt, and the compare
;; macro is not defined in this file; presumably it comes from an include.
460 lea tmp4, [file_length + LAST_BYTES_COUNT]
462 lea tmp1, [file_start + f_i]
465 compare tmp4, tmp1, tmp2, len, tmp3
467 ;; Limit len to maximum value of 258
;; Matches shorter than SHORTEST_MATCH are counted as a literal instead.
471 cmp len, SHORTEST_MATCH
472 jb encode_literal_finish
;; Translate len/dist into their codes and count one occurrence of each.
476 len_to_len_code len_code, len, rfc_lookup
477 dist_to_dist_code dist_code, dist
479 inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * len_code]
480 inc qword [histogram + _dist_offset + HIST_ELEM_SIZE * dist_code]
;; Literal path of the tail loop, followed by the last bytes of the input
;; and the end-of-stream fix-up.
;; NOTE(review): position updates, loop branches and the labels between these
;; steps are outside this excerpt.
486 encode_literal_finish:
;; Keep only the low byte of curr_data and count it as a literal.
488 and curr_data %+ d, 0xFF
489 inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * curr_data]
491 ;; Setup for next loop
;; Give back the LAST_BYTES_COUNT bytes that were still reserved.
497 add file_length, LAST_BYTES_COUNT
;; Count a single input byte as a literal.
501 movzx curr_data, byte [file_start + f_i]
502 inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * curr_data]
507 ;; Handle eob at end of stream
;; Write the value saved in the stack slot at entry back to the
;; literal/length counter at index 256.
508 mov tmp1, [rsp + _eob_count_offset]
509 mov qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * 256], tmp1
;; Long-match handling for the two candidates of the main loop. For each
;; candidate, tmp2 is pointed at the candidate data relative to tmp1 and one
;; of the compare250 macro variants computes the match length. COMPARE_TYPE
;; selects the variant: 1 -> compare250, 2 -> compare250_x (takes xtmp0/xtmp1),
;; 3 -> compare250_y (takes ytmp0/ytmp1); any other value stops assembly with
;; an error.
;; NOTE(review): the labels of both sections, the %else/%endif lines and the
;; jump that ends the first section are outside this excerpt.
516 and hash %+ d, HASH_MASK
517 and hash2 %+ d, HASH_MASK
;; First candidate: match length goes to len.
518 lea tmp2, [tmp1 + dist - 1]
519 %if (COMPARE_TYPE == 1)
520 compare250 tmp1, tmp2, len, tmp3
521 %elif (COMPARE_TYPE == 2)
522 compare250_x tmp1, tmp2, len, tmp3, xtmp0, xtmp1
523 %elif (COMPARE_TYPE == 3)
524 compare250_y tmp1, tmp2, len, tmp3, ytmp0, ytmp1
526 %error Unknown Compare type COMPARE_TYPE
;; Second candidate: match length goes to len2.
534 lea tmp2, [tmp1 + dist2 - 1]
536 %if (COMPARE_TYPE == 1)
537 compare250 tmp1, tmp2, len2, tmp3
538 %elif (COMPARE_TYPE == 2)
539 compare250_x tmp1, tmp2, len2, tmp3, xtmp0, xtmp1
540 %elif (COMPARE_TYPE == 3)
541 compare250_y tmp1, tmp2, len2, tmp3, ytmp0, ytmp1
543 %error Unknown Compare type COMPARE_TYPE
;; Count the literal at index curr_data (scaled by the counter element size
;; like every other literal increment), then continue on the common
;; second-candidate match path.
547 inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * curr_data]
549 jmp len_dist_lit_huffman
;; Sixteen 16-bit words (32 bytes in total), each holding the low 16 bits of
;; -(D + 1).
;; NOTE(review): the section directive, alignment, the label naming this
;; table and the code that reads it are outside this excerpt.
554 dw -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF
555 dw -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF
556 dw -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF
557 dw -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF