]>
Commit | Line | Data |
---|---|---|
224ce89b WB |
1 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
2 | ; Copyright(c) 2011-2016 Intel Corporation All rights reserved. | |
3 | ; | |
4 | ; Redistribution and use in source and binary forms, with or without | |
5 | ; modification, are permitted provided that the following conditions | |
6 | ; are met: | |
7 | ; * Redistributions of source code must retain the above copyright | |
8 | ; notice, this list of conditions and the following disclaimer. | |
9 | ; * Redistributions in binary form must reproduce the above copyright | |
10 | ; notice, this list of conditions and the following disclaimer in | |
11 | ; the documentation and/or other materials provided with the | |
12 | ; distribution. | |
13 | ; * Neither the name of Intel Corporation nor the names of its | |
14 | ; contributors may be used to endorse or promote products derived | |
15 | ; from this software without specific prior written permission. | |
16 | ; | |
17 | ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
18 | ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
19 | ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
20 | ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
21 | ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
22 | ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
23 | ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
24 | ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
25 | ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
26 | ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
27 | ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
28 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
29 | %include "options.asm" | |
30 | ||
31 | %include "lz0a_const.asm" | |
32 | %include "data_struct2.asm" | |
33 | %include "bitbuf2.asm" | |
34 | %include "huffman.asm" | |
35 | %include "igzip_compare_types.asm" | |
36 | %include "reg_sizes.asm" | |
37 | ||
38 | %include "stdmac.asm" | |
39 | ||
; MARK <name> -- optional debug label.
; When DEBUG is defined, MARK declares <name> global and places it as a label
; at the point of use. Otherwise MARK expands to nothing, so no symbol is
; emitted and the surrounding code is unchanged.
40 | %ifdef DEBUG | |
41 | %macro MARK 1 | |
42 | global %1 | |
43 | %1: | |
44 | %endm | |
45 | %else | |
46 | %macro MARK 1 | |
47 | %endm | |
48 | %endif | |
49 | ||
50 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
51 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
52 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
; Symbolic register names used by the routine below.
; Several names alias the same physical register (for example dist,
; dist_code2 and lit_code2 are all rbx), so writing one name destroys the
; value held under every other name that maps to the same register.
;
; Long-lived values, as used by the code in this file:
;   file_start  = [stream + _next_in] minus the starting f_i, so that
;                 [file_start + f_i] addresses the current input byte
;   file_length = loop limit for f_i (avail_in + starting f_i - LA)
;   stream      = copy of the stream pointer argument
;   f_i         = current input index, loaded from [stream + _total_in]
;   m_out_buf   = output write pointer
53 | %define file_start rdi | |
54 | %define file_length r15 | |
55 | %define stream r14 | |
56 | %define f_i r10 | |
57 | %define m_out_buf r11 | |
58 | ||
; 8 input bytes at the current position
59 | %define curr_data rax | |
60 | ||
61 | %define tmp2 rcx | |
62 | ||
; rbx is shared by three names
63 | %define dist rbx | |
64 | %define dist_code2 rbx | |
65 | %define lit_code2 rbx | |
66 | ||
; r12 is shared by two names
67 | %define dist2 r12 | |
68 | %define dist_code r12 | |
69 | ||
; rsi is shared by tmp1 and lit_code
70 | %define tmp1 rsi | |
71 | ||
72 | %define lit_code rsi | |
73 | ||
; r8 is shared by three names
74 | %define curr_data2 r8 | |
75 | %define len2 r8 | |
76 | %define tmp4 r8 | |
77 | ||
; rdx is shared by three names
78 | %define len rdx | |
79 | %define len_code rdx | |
80 | %define hash3 rdx | |
81 | ||
82 | %define tmp3 r13 | |
83 | ||
; hash lives in rbp, so rbp is not available as a frame pointer inside the body
84 | %define hash rbp | |
85 | %define hash2 r9 | |
86 | ||
87 | ;; GPR r8 & r15 can be used -- NOTE(review): stale, both are assigned above (r8: curr_data2/len2/tmp4, r15: file_length) | |
88 | ||
; Vector registers: xdata holds 16 input bytes; xtmp*/ytmp* are only passed
; as scratch arguments to the compare250_x / compare250_y macros.
89 | %define xtmp0 xmm0 ; tmp | |
90 | %define xtmp1 xmm1 ; tmp | |
91 | %define xdata xmm4 | |
92 | ||
93 | %define ytmp0 ymm0 ; tmp | |
94 | %define ytmp1 ymm1 ; tmp | |
95 | ||
96 | ||
97 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
98 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
99 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
100 | ||
; Stack frame layout: byte offsets from rsp after the frame is allocated.
;   m_out_end   : highest output address at which another loop iteration may start
;   m_out_start : output pointer value at entry, used to compute bytes written
; stack_size = 3*8 + 8*8 + 4*16 = 152 bytes.
; NOTE(review): f_end_i_mem_offset and xmm_save_mem_offset are not referenced
; by the code visible in this file. xmm_save_mem_offset evaluates to 88, which
; is 8 modulo 16 -- confirm the "16 byte aligned" remark before using the area
; for aligned vector stores.
101 | m_out_end equ 0 ; local variable (8 bytes) | |
102 | m_out_start equ 8 | |
103 | f_end_i_mem_offset equ 16 | |
104 | gpr_save_mem_offset equ 24 ; gpr save area (8*8 bytes) | |
105 | xmm_save_mem_offset equ 24 + 8*8 ; xmm save area (4*16 bytes) (16 byte aligned) | |
106 | stack_size equ 3*8 + 8*8 + 4*16 | |
107 | ||
108 | ;;; 8 because stack address is odd multiple of 8 after a function call and | |
109 | ;;; we want it aligned to 16 bytes | |
110 | ||
; Entry point; the exported symbol name is isal_deflate_icf_body_ followed by
; the value of ARCH.
; The body reads input through [stream + _next_in] and appends 32-bit codes at
; the pointer taken from the level buffer ([stream + _level_buf] + _icf_buf_next),
; updating the hash head table and the literal/length and distance histograms
; inside the stream state as it goes.
; The code reads the stream pointer from rcx. When assembling for elf64 the
; pointer arrives in rdi and is copied to rcx first.
111 | ; void isal_deflate_icf_body ( isal_zstream *stream ) | |
112 | ; arg 1: rcx: addr of stream | |
113 | global isal_deflate_icf_body_ %+ ARCH | |
114 | isal_deflate_icf_body_ %+ ARCH %+ : | |
115 | %ifidn __OUTPUT_FORMAT__, elf64 | |
116 | mov rcx, rdi | |
117 | %endif | |
118 | ||
; Early exit: with no input available only the next state is chosen. No stack
; frame has been created yet, so a plain ret is sufficient; only rax and rdx
; (and rcx on elf64) are modified on this path.
119 | ;; do nothing if (avail_in == 0) | |
120 | cmp dword [rcx + _avail_in], 0 | |
121 | jne skip1 | |
122 | ||
; Next state is ZSTATE_CREATE_HDR unless end_of_stream is non-zero or flush
; differs from _NO_FLUSH, in which case it is ZSTATE_FLUSH_READ_BUFFER.
123 | ;; Set stream's next state | |
124 | mov rdx, ZSTATE_FLUSH_READ_BUFFER | |
125 | mov rax, ZSTATE_CREATE_HDR | |
126 | cmp dword [rcx + _end_of_stream], 0 | |
127 | cmovne rax, rdx | |
128 | cmp dword [rcx + _flush], _NO_FLUSH | |
129 | cmovne rax, rdx | |
130 | mov dword [rcx + _internal_state_state], eax | |
131 | ret | |
; Prologue and loop setup, reached when avail_in is non-zero.
; Allocates the stack frame, saves the registers the body overwrites, then
; initialises the output window, the input base/index and the loop limit.
132 | skip1: | |
133 | ||
; With ALIGN_STACK the old rsp is kept in rbp and rsp is rounded down to a
; 16-byte boundary; otherwise the frame is a fixed stack_size bytes.
134 | %ifdef ALIGN_STACK | |
135 | push rbp | |
136 | mov rbp, rsp | |
137 | sub rsp, stack_size | |
138 | and rsp, ~15 | |
139 | %else | |
140 | sub rsp, stack_size | |
141 | %endif | |
142 | ||
; Save area: rbx, rsi, rdi, rbp, r12-r15. rsi and rdi are callee-saved under
; the Microsoft x64 convention. With ALIGN_STACK the rbp value stored here is
; the frame pointer set just above (the caller's rbp was pushed), which is
; what the epilogue needs back because the body reuses rbp as "hash".
143 | mov [rsp + gpr_save_mem_offset + 0*8], rbx | |
144 | mov [rsp + gpr_save_mem_offset + 1*8], rsi | |
145 | mov [rsp + gpr_save_mem_offset + 2*8], rdi | |
146 | mov [rsp + gpr_save_mem_offset + 3*8], rbp | |
147 | mov [rsp + gpr_save_mem_offset + 4*8], r12 | |
148 | mov [rsp + gpr_save_mem_offset + 5*8], r13 | |
149 | mov [rsp + gpr_save_mem_offset + 6*8], r14 | |
150 | mov [rsp + gpr_save_mem_offset + 7*8], r15 | |
151 | ||
152 | mov stream, rcx | |
153 | mov dword [stream + _internal_state_has_eob], 0 | |
154 | ||
; Output window. The pseudo-code line below is historical: the code actually
; takes the buffer from the level buffer fields _icf_buf_next and
; _icf_buf_avail_out.
;   m_out_start = icf_buf_next
;   m_out_end   = icf_buf_next + icf_buf_avail_out - SLOP
155 | ; state->bitbuf.set_buf(stream->next_out, stream->avail_out); | |
156 | mov tmp1, [stream + _level_buf] | |
157 | mov m_out_buf, [tmp1 + _icf_buf_next] | |
158 | ||
159 | mov [rsp + m_out_start], m_out_buf | |
160 | mov tmp1, [tmp1 + _icf_buf_avail_out] | |
161 | add tmp1, m_out_buf | |
162 | sub tmp1, SLOP | |
163 | ||
164 | mov [rsp + m_out_end], tmp1 | |
165 | ||
; Input addressing: f_i starts at total_in (32-bit load, upper half zeroed)
; and file_start is biased by -f_i so [file_start + f_i] is the byte at next_in.
166 | mov file_start, [stream + _next_in] | |
167 | ||
168 | mov f_i %+ d, dword [stream + _total_in] | |
169 | sub file_start, f_i | |
170 | ||
; file_length = avail_in + f_i - LA, i.e. the loop stops LA bytes before the
; end of the available input.
171 | mov file_length %+ d, [stream + _avail_in] | |
172 | add file_length, f_i | |
173 | ||
174 | ; file_length -= LA; | |
175 | sub file_length, LA | |
176 | ; if (file_length <= 0) continue; | |
177 | ||
; Signed compare: if the limit is not beyond the current index there is
; nothing to process in this call.
178 | cmp file_length, f_i | |
179 | jle input_end | |
180 | ||
; Prime the loop state for position f_i:
;   xdata     = 16 input bytes at f_i
;   curr_data = 8 input bytes at f_i
;   hash      = masked hash of the bytes at f_i
;   hash2     = masked hash of the bytes at f_i + 1 (curr_data shifted right one byte)
; tmp4 keeps an unshifted copy of curr_data for write_first_byte.
; compute_hash, MOVDQU and HASH_MASK are defined outside the lines shown here.
181 | ; for (f_i = f_start_i; f_i < file_length; f_i++) { | |
182 | MARK __body_compute_hash_ %+ ARCH | |
183 | MOVDQU xdata, [file_start + f_i] | |
184 | mov curr_data, [file_start + f_i] | |
185 | mov tmp3, curr_data | |
186 | mov tmp4, curr_data | |
187 | ||
188 | compute_hash hash, curr_data | |
189 | ||
190 | shr tmp3, 8 | |
191 | compute_hash hash2, tmp3 | |
192 | ||
193 | and hash, HASH_MASK | |
194 | and hash2, HASH_MASK | |
195 | ||
; With has_hist == 0 the first byte is handled by write_first_byte;
; otherwise enter the main loop directly.
196 | cmp dword [stream + _internal_state_has_hist], 0 | |
197 | je write_first_byte | |
198 | ||
; The jmp skips whatever padding the align directive inserts before loop2.
199 | jmp loop2 | |
200 | align 16 | |
201 | ||
; Main loop. State expected on entry (established by every path that jumps here):
;   curr_data = 8 input bytes at f_i, xdata = 16 input bytes at f_i
;   hash = masked hash at f_i, hash2 = masked hash at f_i + 1
; Each iteration probes the head table for positions f_i and f_i + 1 and then
; dispatches to one of: compare_loop, compare_loop2, len_dist_huffman_pre,
; len_dist_lit_huffman_pre (fall-through) or write_lit_bits.
202 | loop2: | |
; Unsigned compare: stop once the write pointer has passed m_out_end.
203 | ; if (state->bitbuf.is_full()) { | |
204 | cmp m_out_buf, [rsp + m_out_end] | |
205 | ja output_end | |
206 | ||
207 | xor dist, dist | |
208 | xor dist2, dist2 | |
209 | xor tmp3, tmp3 | |
210 | ||
; tmp1 = address of the byte at f_i (taken before f_i is incremented).
211 | lea tmp1, [file_start + f_i] | |
212 | ||
; First probe: dist = (f_i - 1 - head[hash]) in 16-bit arithmetic, then
; head[hash] is replaced by the low 16 bits of f_i.
213 | mov dist %+ w, f_i %+ w | |
214 | dec dist | |
215 | sub dist %+ w, word [stream + _internal_state_head + 2 * hash] | |
216 | mov [stream + _internal_state_head + 2 * hash], f_i %+ w | |
217 | ||
; From here until a handler adjusts it, f_i is one past the loop-entry position.
218 | inc f_i | |
219 | ||
; hash is recomputed from the bytes two positions past the loop-entry position.
220 | mov tmp2, curr_data | |
221 | shr curr_data, 16 | |
222 | compute_hash hash, curr_data | |
223 | and hash %+ d, HASH_MASK | |
224 | ||
; Second probe, same arithmetic with the incremented f_i and hash2.
225 | mov dist2 %+ w, f_i %+ w | |
226 | dec dist2 | |
227 | sub dist2 %+ w, word [stream + _internal_state_head + 2 * hash2] | |
228 | mov [stream + _internal_state_head + 2 * hash2], f_i %+ w | |
229 | ||
; dist is masked to D-1 (the 32-bit and also clears the upper half) and then
; negated, so it can be added to tmp1 to step backwards to the candidate.
230 | ; if ((dist-1) < (D-1)) { | |
231 | and dist %+ d, (D-1) | |
232 | neg dist | |
233 | ||
; hash2 is recomputed from the bytes three positions past the loop-entry position.
234 | shr tmp2, 24 | |
235 | compute_hash hash2, tmp2 | |
236 | and hash2 %+ d, HASH_MASK | |
237 | ||
238 | and dist2 %+ d, (D-1) | |
239 | neg dist2 | |
240 | ||
241 | MARK __body_compare_ %+ ARCH | |
; len = (8 bytes at the loop-entry position) XOR (8 bytes at the first
; candidate). Zero means all 8 bytes are equal. curr_data keeps the original
; 8 bytes and xdata is shifted down one byte for the second probe.
242 | ;; Check for long len/dist match (>7) with first literal | |
243 | MOVQ len, xdata | |
244 | mov curr_data, len | |
245 | PSRLDQ xdata, 1 | |
246 | xor len, [tmp1 + dist - 1] | |
247 | jz compare_loop | |
248 | ||
; len2 = (8 bytes starting one position later) XOR (8 bytes at the second candidate).
249 | ;; Check for len/dist match (>7) with second literal | |
250 | MOVQ len2, xdata | |
251 | xor len2, [tmp1 + dist2] | |
252 | jz compare_loop2 | |
253 | ||
; lit_code shares rsi with tmp1, so the address in tmp1 is gone after this.
254 | movzx lit_code, curr_data %+ b | |
255 | shr curr_data, 8 | |
256 | ||
; Low 32 bits of the XOR are zero when the first 4 bytes are equal.
257 | ;; Check for len/dist match for first literal | |
258 | test len %+ d, 0xFFFFFFFF | |
259 | jz len_dist_huffman_pre | |
260 | ||
; No match at the first position: count the first byte as a literal.
; lit_code2 shares rbx with dist, which is no longer needed on this path.
261 | inc word [stream + _internal_state_hist_lit_len + HIST_ELEM_SIZE*lit_code] | |
262 | movzx lit_code2, curr_data %+ b | |
; If the second probe also differs within its first 4 bytes, emit two
; literals; otherwise fall through into len_dist_lit_huffman_pre.
263 | ;; Check for len/dist match for second literal | |
264 | test len2 %+ d, 0xFFFFFFFF | |
265 | jnz write_lit_bits | |
266 | ||
; Literal followed by a match that starts at the second position.
; Entered by fall-through from loop2 (short match, via the _pre label) or by
; jump from compare_loop2 (len2 already set by the compare macro).
; On entry f_i is one past the loop-entry position, i.e. the match start.
267 | MARK __body_len_dist_lit_huffman_ %+ ARCH | |
268 | len_dist_lit_huffman_pre: | |
; len2 holds the XOR of the two 8-byte windows; it is non-zero here with its
; low 32 bits zero, so (index of lowest set bit) / 8 is the count of equal
; leading bytes, a value from 4 to 7.
269 | bsf len2, len2 | |
270 | shr len2, 3 | |
271 | ||
272 | len_dist_lit_huffman: | |
; Emit the literal as one dword (non-temporal store).
273 | or lit_code, LIT | |
274 | movnti dword [m_out_buf], lit_code %+ d | |
275 | ||
; Undo the earlier negation, then let get_dist_icf_code (defined outside this
; view) produce dist_code2, with tmp1 passed as its third argument.
276 | neg dist2 | |
277 | ||
278 | get_dist_icf_code dist2, dist_code2, tmp1 | |
279 | ||
280 | ;; Setup for updating hash | |
281 | lea tmp3, [f_i + 1] ; tmp3 <= k | |
282 | ||
; Load the input that follows the match: 16 bytes into xdata and 8 bytes into
; tmp1, both at f_i + len2. file_start is temporarily advanced by f_i to form
; the address and restored a few lines below.
283 | add file_start, f_i | |
284 | MOVDQU xdata, [file_start + len2] | |
285 | mov tmp1, [file_start + len2] | |
286 | ||
; hash3 (shares rdx with len) is the masked hash of the old data shifted a
; further 3 bytes.
287 | shr curr_data, 24 | |
288 | compute_hash hash3, curr_data | |
289 | and hash3, HASH_MASK | |
290 | ||
291 | mov curr_data, tmp1 | |
292 | shr tmp1, 8 | |
293 | ||
294 | sub file_start, f_i | |
295 | add f_i, len2 | |
296 | ||
; Record three consecutive positions (tmp3, tmp3 + 1, tmp3 + 2) in the head
; table under hash, hash2 and hash3. Interleaved with that, hash and hash2 are
; recomputed from the new curr_data and from curr_data shifted one byte.
297 | mov [stream + _internal_state_head + 2 * hash], tmp3 %+ w | |
298 | ||
299 | compute_hash hash, curr_data | |
300 | ||
301 | add tmp3,1 | |
302 | mov [stream + _internal_state_head + 2 * hash2], tmp3 %+ w | |
303 | ||
304 | compute_hash hash2, tmp1 | |
305 | ||
306 | add tmp3, 1 | |
307 | mov [stream + _internal_state_head + 2 * hash3], tmp3 %+ w | |
308 | ||
; Combine the length symbol (len2 + 254) into the distance code and count it
; in the literal/length histogram.
309 | add dist_code2, 254 | |
310 | add dist_code2, len2 | |
311 | ||
312 | inc word [stream + _internal_state_hist_lit_len + HIST_ELEM_SIZE*(len2 + 254)] | |
313 | ||
; The match code goes in the dword after the literal; 8 output bytes in total.
314 | movnti dword [m_out_buf + 4], dist_code2 %+ d | |
315 | add m_out_buf, 8 | |
316 | ||
; Histogram index is the 5-bit field at bit DIST_OFFSET of the emitted code.
317 | shr dist_code2, DIST_OFFSET | |
318 | and dist_code2, 0x1F | |
319 | inc word [stream + _internal_state_hist_dist + HIST_ELEM_SIZE*dist_code2] | |
320 | ||
321 | ; hash = compute_hash(state->file_start + f_i) & HASH_MASK; | |
322 | and hash %+ d, HASH_MASK | |
323 | and hash2 %+ d, HASH_MASK | |
324 | ||
; Signed compare against the loop limit.
325 | ; continue | |
326 | cmp f_i, file_length | |
327 | jl loop2 | |
328 | jmp input_end | |
329 | ;; encode as dist/len | |
330 | ||
; Match that starts at the first (loop-entry) position.
; Entered by jump from loop2 (short match, via the _pre label) or from
; compare_loop (len already set by the compare macro).
331 | MARK __body_len_dist_huffman_ %+ ARCH | |
332 | len_dist_huffman_pre: | |
; len holds the XOR of the two 8-byte windows; it is non-zero here with its
; low 32 bits zero, so (index of lowest set bit) / 8 is the count of equal
; leading bytes, a value from 4 to 7.
333 | bsf len, len | |
334 | shr len, 3 | |
335 | ||
336 | len_dist_huffman: | |
; Undo the increment done in loop2 so f_i is the match start again.
337 | dec f_i | |
338 | ;; Setup for updating hash | |
339 | lea tmp3, [f_i + 2] ; tmp3 <= k | |
340 | ||
; Undo the earlier negation before converting the distance. dist_code shares
; r12 with dist2, which is not used on this path.
341 | neg dist | |
342 | ||
343 | ; get_dist_code(dist, &code2, &code_len2); | |
344 | get_dist_icf_code dist, dist_code, tmp1 | |
345 | ||
; Load the input that follows the match (at f_i + len) for the next
; iteration; file_start is temporarily advanced by f_i to form the address.
346 | add file_start, f_i | |
347 | MOVDQU xdata, [file_start + len] | |
348 | mov curr_data2, [file_start + len] | |
349 | mov curr_data, curr_data2 | |
350 | sub file_start, f_i | |
351 | add f_i, len | |
; len_code shares rdx with len; the length symbol is len + 254.
352 | ; get_len_code(len, &code, &code_len); | |
353 | lea len_code, [len + 254] | |
354 | or dist_code, len_code | |
355 | ||
; Record positions tmp3 and tmp3 + 1 in the head table under the hash and
; hash2 values computed in loop2.
356 | mov [stream + _internal_state_head + 2 * hash], tmp3 %+ w | |
357 | add tmp3,1 | |
358 | mov [stream + _internal_state_head + 2 * hash2], tmp3 %+ w | |
359 | ||
; New hash / hash2 for the data at the new f_i and at f_i + 1.
360 | compute_hash hash, curr_data | |
361 | ||
362 | shr curr_data2, 8 | |
363 | compute_hash hash2, curr_data2 | |
364 | ||
365 | inc word [stream + _internal_state_hist_lit_len + HIST_ELEM_SIZE*len_code] | |
366 | ||
; One dword of output (non-temporal store).
367 | movnti dword [m_out_buf], dist_code %+ d | |
368 | add m_out_buf, 4 | |
369 | ||
; Histogram index is the 5-bit field at bit DIST_OFFSET of the emitted code.
370 | shr dist_code, DIST_OFFSET | |
371 | and dist_code, 0x1F | |
372 | inc word [stream + _internal_state_hist_dist + HIST_ELEM_SIZE*dist_code] | |
373 | ||
374 | ; hash = compute_hash(state->file_start + f_i) & HASH_MASK; | |
375 | and hash %+ d, HASH_MASK | |
376 | and hash2 %+ d, HASH_MASK | |
377 | ||
; Signed compare against the loop limit.
378 | ; continue | |
379 | cmp f_i, file_length | |
380 | jl loop2 | |
381 | jmp input_end | |
382 | ||
; Neither probe matched in its first 4 bytes: emit the two bytes as literals
; packed into a single dword.
; On entry lit_code is the first byte (already counted in the histogram by
; loop2) and lit_code2 is the second byte. f_i was incremented once in loop2
; and is incremented again here, so two input bytes are consumed.
383 | MARK __body_write_lit_bits_ %+ ARCH | |
384 | write_lit_bits: | |
; Reload xdata / curr_data at the new f_i. hash and hash2 computed in loop2
; already correspond to the new f_i and f_i + 1.
385 | MOVDQU xdata, [file_start + f_i + 1] | |
386 | add f_i, 1 | |
387 | MOVQ curr_data, xdata | |
388 | ||
389 | inc word [stream + _internal_state_hist_lit_len + HIST_ELEM_SIZE*lit_code2] | |
390 | ||
; Output value = lit_code + (lit_code2 << DIST_OFFSET) + (31 << DIST_OFFSET).
391 | shl lit_code2, DIST_OFFSET | |
392 | lea lit_code, [lit_code + lit_code2 + (31 << DIST_OFFSET)] | |
393 | ||
394 | movnti dword [m_out_buf], lit_code %+ d | |
395 | add m_out_buf, 4 | |
396 | ||
; Signed compare; when the limit is reached execution falls through into input_end.
397 | ; continue | |
398 | cmp f_i, file_length | |
399 | jl loop2 | |
400 | ||
; Input limit reached. Choose the next state: ZSTATE_BODY by default, or
; ZSTATE_FLUSH_READ_BUFFER when end_of_stream is non-zero or flush differs
; from _NO_FLUSH. Then continue at the common exit code.
401 | input_end: | |
402 | mov tmp1, ZSTATE_FLUSH_READ_BUFFER | |
403 | mov tmp2, ZSTATE_BODY | |
404 | cmp dword [stream + _end_of_stream], 0 | |
405 | cmovne tmp2, tmp1 | |
406 | cmp dword [stream + _flush], _NO_FLUSH | |
407 | ||
408 | cmovne tmp2, tmp1 | |
409 | mov dword [stream + _internal_state_state], tmp2 %+ d | |
410 | jmp end | |
411 | ||
; Output window exhausted: set the next state to ZSTATE_CREATE_HDR and fall
; through into the common exit code.
412 | output_end: | |
413 | mov dword [stream + _internal_state_state], ZSTATE_CREATE_HDR | |
414 | ||
; Common exit: write the input and output progress back, restore the saved
; registers, release the frame and return.
415 | end: | |
; Adding LA back gives avail_in + starting f_i again, so after subtracting
; the current f_i the result is the number of unconsumed input bytes.
416 | ;; update input buffer | |
417 | add file_length, LA | |
418 | mov [stream + _total_in], f_i %+ d | |
419 | add file_start, f_i | |
420 | mov [stream + _next_in], file_start | |
421 | sub file_length, f_i | |
422 | mov [stream + _avail_in], file_length %+ d | |
423 | ||
; icf_buf_next = current write pointer; icf_buf_avail_out is reduced by the
; number of bytes written (write pointer minus m_out_start).
; NOTE(review): _icf_buf_avail_out is loaded as a qword during setup but
; updated with a dword subtract here -- confirm against the field's declared width.
424 | ;; update output buffer | |
425 | mov tmp1, [stream + _level_buf] | |
426 | mov [tmp1 + _icf_buf_next], m_out_buf | |
427 | sub m_out_buf, [rsp + m_out_start] | |
428 | sub [tmp1 + _icf_buf_avail_out], m_out_buf %+ d | |
429 | ||
; Restore the registers saved by the prologue, in the same slot order.
430 | mov rbx, [rsp + gpr_save_mem_offset + 0*8] | |
431 | mov rsi, [rsp + gpr_save_mem_offset + 1*8] | |
432 | mov rdi, [rsp + gpr_save_mem_offset + 2*8] | |
433 | mov rbp, [rsp + gpr_save_mem_offset + 3*8] | |
434 | mov r12, [rsp + gpr_save_mem_offset + 4*8] | |
435 | mov r13, [rsp + gpr_save_mem_offset + 5*8] | |
436 | mov r14, [rsp + gpr_save_mem_offset + 6*8] | |
437 | mov r15, [rsp + gpr_save_mem_offset + 7*8] | |
438 | ||
; Without ALIGN_STACK the frame is a fixed size. With it, rbp (just restored
; to the frame pointer value) gives back the pre-alignment rsp and the
; caller's rbp is popped.
439 | %ifndef ALIGN_STACK | |
440 | add rsp, stack_size | |
441 | %else | |
442 | mov rsp, rbp | |
443 | pop rbp | |
444 | %endif | |
445 | ret | |
446 | ||
; Long-match path for the first position: the first 8 bytes at the loop-entry
; position equal the 8 bytes at the first candidate.
; tmp1 = address of the loop-entry byte, tmp2 = address of the candidate.
; The compare250* macro selected by COMPARE_TYPE is defined outside this view;
; presumably it leaves the match length in len (tmp3 and the vector
; temporaries are passed as scratch) -- confirm against
; igzip_compare_types.asm. Any other COMPARE_TYPE value fails assembly via %error.
447 | MARK __body_compare_loops_ %+ ARCH | |
448 | compare_loop: | |
449 | lea tmp2, [tmp1 + dist - 1] | |
450 | %if (COMPARE_TYPE == 1) | |
451 | compare250 tmp1, tmp2, len, tmp3 | |
452 | %elif (COMPARE_TYPE == 2) | |
453 | compare250_x tmp1, tmp2, len, tmp3, xtmp0, xtmp1 | |
454 | %elif (COMPARE_TYPE == 3) | |
455 | compare250_y tmp1, tmp2, len, tmp3, ytmp0, ytmp1 | |
456 | %else | |
457 | %error Unknown Compare type COMPARE_TYPE | |
458 | % error | |
459 | %endif | |
; Skips the bsf-based length computation at len_dist_huffman_pre.
460 | jmp len_dist_huffman | |
461 | ||
; Long-match path for the second position: the 8 bytes starting one past the
; loop-entry position equal the 8 bytes at the second candidate.
; tmp2 is formed from tmp1 before tmp1 is advanced, so tmp2 is the candidate
; address and tmp1 becomes the address of the second input byte.
; The compare250* macro selected by COMPARE_TYPE is defined outside this view;
; presumably it leaves the match length in len2 -- confirm against
; igzip_compare_types.asm. Any other COMPARE_TYPE value fails assembly via %error.
462 | compare_loop2: | |
463 | lea tmp2, [tmp1 + dist2] | |
464 | add tmp1, 1 | |
465 | %if (COMPARE_TYPE == 1) | |
466 | compare250 tmp1, tmp2, len2, tmp3 | |
467 | %elif (COMPARE_TYPE == 2) | |
468 | compare250_x tmp1, tmp2, len2, tmp3, xtmp0, xtmp1 | |
469 | %elif (COMPARE_TYPE == 3) | |
470 | compare250_y tmp1, tmp2, len2, tmp3, ytmp0, ytmp1 | |
471 | %else | |
472 | %error Unknown Compare type COMPARE_TYPE | |
473 | % error | |
474 | %endif | |
; The first byte is still a literal: extract it, drop it from curr_data and
; count it, exactly as the main path in loop2 does, then join the
; literal + match emitter past its bsf-based length computation.
475 | movzx lit_code, curr_data %+ b | |
476 | shr curr_data, 8 | |
477 | inc word [stream + _internal_state_hist_lit_len + HIST_ELEM_SIZE*lit_code] | |
478 | jmp len_dist_lit_huffman | |
479 | ||
; First-byte path, taken when has_hist is zero: the byte at f_i is emitted as
; a literal without probing the head table, and has_hist is set to 1.
; On entry hash / hash2 are the masked hashes at f_i and f_i + 1, and tmp4 is
; an unshifted copy of the 8 bytes at f_i.
480 | MARK __write_first_byte_ %+ ARCH | |
481 | write_first_byte: | |
; Unsigned compare: no room left in the output window.
482 | cmp m_out_buf, [rsp + m_out_end] | |
483 | ja output_end | |
484 | ||
485 | mov dword [stream + _internal_state_has_hist], 1 | |
486 | ||
; Record position f_i under its hash.
487 | mov [stream + _internal_state_head + 2 * hash], f_i %+ w | |
488 | ||
; Slide the hash pair forward one position: hash takes the old hash2 and
; hash2 is computed from the bytes two positions past the old f_i.
489 | mov hash, hash2 | |
490 | shr tmp4, 16 | |
491 | compute_hash hash2, tmp4 | |
492 | ||
; Count and emit the first byte as a literal (one dword, non-temporal store).
493 | and curr_data, 0xff | |
494 | inc word [stream + _internal_state_hist_lit_len + HIST_ELEM_SIZE*curr_data] | |
495 | or curr_data, LIT | |
496 | ||
497 | movnti dword [m_out_buf], curr_data %+ d | |
498 | add m_out_buf, 4 | |
499 | ||
; Advance one byte and reload xdata / curr_data at the new f_i.
500 | MOVDQU xdata, [file_start + f_i + 1] | |
501 | add f_i, 1 | |
502 | mov curr_data, [file_start + f_i] | |
503 | and hash %+ d, HASH_MASK | |
504 | and hash2 %+ d, HASH_MASK | |
505 | ||
; Signed compare against the loop limit.
506 | cmp f_i, file_length | |
507 | jl loop2 | |
508 | jmp input_end | |
509 | ||
; Constant data, 16-byte aligned: mask is four dwords of HASH_MASK and
; const_D is one qword holding D.
; NOTE(review): neither label is referenced by the code visible in this file;
; they may be used by included macros -- confirm before removing.
510 | section .data | |
511 | align 16 | |
512 | mask: dd HASH_MASK, HASH_MASK, HASH_MASK, HASH_MASK | |
513 | const_D: dq D |