]> git.proxmox.com Git - ceph.git/blob - ceph/src/isa-l/igzip/igzip_icf_body.asm
update sources to v12.1.1
[ceph.git] / ceph / src / isa-l / igzip / igzip_icf_body.asm
1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
3 ;
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
6 ; are met:
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
12 ; distribution.
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
16 ;
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29 %include "options.asm"
30
31 %include "lz0a_const.asm"
32 %include "data_struct2.asm"
33 %include "bitbuf2.asm"
34 %include "huffman.asm"
35 %include "igzip_compare_types.asm"
36 %include "reg_sizes.asm"
37
38 %include "stdmac.asm"
39 ; MARK <name>: debugging aid that optionally exposes an internal code position as a global symbol
40 %ifdef DEBUG
41 %macro MARK 1
42 global %1 ; DEBUG build: export the name ...
43 %1: ; ... and define it as a label at the point of use
44 %endm
45 %else
46 %macro MARK 1
47 %endm ; non-DEBUG build: MARK expands to nothing
48 %endif
49
50 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
51 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
52 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
53 %define file_start rdi ; set to next_in - total_in, so [file_start + f_i] addresses the byte at index f_i
54 %define file_length r15 ; set to total_in + avail_in - LA; upper bound for f_i in the main loop
55 %define stream r14 ; pointer to the stream structure (copied from rcx in the prologue)
56 %define f_i r10 ; current input index, initialised from the stream's total_in field
57 %define m_out_buf r11 ; output write cursor, initialised from the level buffer's icf_buf_next field
58
59 %define curr_data rax ; 8 bytes of input loaded from [file_start + f_i], shifted as bytes are consumed
60
61 %define tmp2 rcx ; scratch (rcx also carries the incoming argument until it is copied to stream)
62
63 %define dist rbx ; dist, dist_code2 and lit_code2 all name rbx: only one is live at a time
64 %define dist_code2 rbx
65 %define lit_code2 rbx
66
67 %define dist2 r12 ; dist2 and dist_code both name r12
68 %define dist_code r12
69
70 %define tmp1 rsi ; tmp1 and lit_code both name rsi
71
72 %define lit_code rsi
73
74 %define curr_data2 r8 ; curr_data2, len2 and tmp4 all name r8
75 %define len2 r8
76 %define tmp4 r8
77
78 %define len rdx ; len, len_code and hash3 all name rdx
79 %define len_code rdx
80 %define hash3 rdx
81
82 %define tmp3 r13
83
84 %define hash rbp ; rbp is used as a data register here; it is saved and restored through the gpr save area
85 %define hash2 r9
86
87 ;; NOTE(review): an earlier comment here said "GPR r8 & r15 can be used", but both are assigned above (r8: curr_data2/len2/tmp4, r15: file_length)
88
89 %define xtmp0 xmm0 ; tmp
90 %define xtmp1 xmm1 ; tmp
91 %define xdata xmm4 ; 16 bytes of input loaded from [file_start + f_i]
92
93 %define ytmp0 ymm0 ; tmp
94 %define ytmp1 ymm1 ; tmp
95
96
97 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
98 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
99 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
100 ; Stack frame layout: byte offsets from rsp after the prologue has reserved stack_size bytes
101 m_out_end equ 0 ; local variable (8 bytes): output limit compared against m_out_buf in the loop
102 m_out_start equ 8 ; local variable (8 bytes): initial m_out_buf, used at exit to compute bytes written
103 f_end_i_mem_offset equ 16 ; third 8-byte local slot; not referenced in the visible code
104 gpr_save_mem_offset equ 24 ; gpr save area (8*8 bytes)
105 xmm_save_mem_offset equ 24 + 8*8 ; xmm save area (4*16 bytes); not referenced in the visible code. NOTE(review): the value is 88, i.e. 8 mod 16 from rsp, so the former "16 byte aligned" remark does not hold as written
106 stack_size equ 3*8 + 8*8 + 4*16 ; 24 + 64 + 64 = 152 bytes
107
108 ;;; stack_size is 8 mod 16 because the stack address is an odd multiple of 8
109 ;;; after a function call and we want it aligned to 16 bytes
110
111 ; void isal_deflate_icf_body ( isal_zstream *stream )
112 ; arg 1: rcx: addr of stream (for elf64 output the argument arrives in rdi and is copied to rcx below)
113 global isal_deflate_icf_body_ %+ ARCH
114 isal_deflate_icf_body_ %+ ARCH %+ :
115 %ifidn __OUTPUT_FORMAT__, elf64
116 mov rcx, rdi ; normalise: from here on the stream pointer is in rcx for every output format
117 %endif
118
119 ;; do nothing if (avail_in == 0): only the next state is chosen, then return
120 cmp dword [rcx + _avail_in], 0
121 jne skip1 ; input available: continue with the full prologue
122
123 ;; Set stream's next state: FLUSH_READ_BUFFER if end_of_stream != 0 or flush != _NO_FLUSH, else CREATE_HDR
124 mov rdx, ZSTATE_FLUSH_READ_BUFFER
125 mov rax, ZSTATE_CREATE_HDR
126 cmp dword [rcx + _end_of_stream], 0
127 cmovne rax, rdx
128 cmp dword [rcx + _flush], _NO_FLUSH
129 cmovne rax, rdx
130 mov dword [rcx + _internal_state_state], eax
131 ret ; early return: no stack frame was created and only rax/rdx (and rcx on elf64) were written
132 skip1:
133 ; Prologue: reserve the frame, save registers, then load output/input cursors from the stream
134 %ifdef ALIGN_STACK
135 push rbp
136 mov rbp, rsp ; remember the pre-alignment stack pointer
137 sub rsp, stack_size
138 and rsp, ~15 ; force 16-byte alignment of the frame
139 %else
140 sub rsp, stack_size
141 %endif
142
143 mov [rsp + gpr_save_mem_offset + 0*8], rbx
144 mov [rsp + gpr_save_mem_offset + 1*8], rsi ; rsi/rdi are saved unconditionally (they are callee-saved in the Microsoft x64 convention)
145 mov [rsp + gpr_save_mem_offset + 2*8], rdi
146 mov [rsp + gpr_save_mem_offset + 3*8], rbp ; with ALIGN_STACK this slot holds the frame value set by "mov rbp, rsp" above
147 mov [rsp + gpr_save_mem_offset + 4*8], r12
148 mov [rsp + gpr_save_mem_offset + 5*8], r13
149 mov [rsp + gpr_save_mem_offset + 6*8], r14
150 mov [rsp + gpr_save_mem_offset + 7*8], r15
151
152 mov stream, rcx
153 mov dword [stream + _internal_state_has_eob], 0
154
155 ; m_out_buf = level_buf icf_buf_next; m_out_start = m_out_buf; m_out_end = m_out_buf + icf_buf_avail_out - SLOP
156 mov tmp1, [stream + _level_buf]
157 mov m_out_buf, [tmp1 + _icf_buf_next]
158
159 mov [rsp + m_out_start], m_out_buf
160 mov tmp1, [tmp1 + _icf_buf_avail_out] ; NOTE(review): read as a qword here but updated with a dword subtract at exit -- confirm the field width
161 add tmp1, m_out_buf
162 sub tmp1, SLOP
163
164 mov [rsp + m_out_end], tmp1
165
166 mov file_start, [stream + _next_in]
167
168 mov f_i %+ d, dword [stream + _total_in] ; 32-bit load, zero-extended into f_i
169 sub file_start, f_i ; file_start = next_in - total_in, so [file_start + f_i] is the next input byte
170
171 mov file_length %+ d, [stream + _avail_in]
172 add file_length, f_i ; file_length = total_in + avail_in
173
174 ; file_length -= LA;
175 sub file_length, LA
176 ; if (file_length <= f_i) goto input_end;  (signed comparison)
177
178 cmp file_length, f_i
179 jle input_end
180
181 ; for (f_i = f_start_i; f_i < file_length; f_i++) {  -- loop preamble: load input and seed hash / hash2
182 MARK __body_compute_hash_ %+ ARCH
183 MOVDQU xdata, [file_start + f_i] ; MOVDQU is a macro defined outside this file -- presumably an unaligned 16-byte load
184 mov curr_data, [file_start + f_i] ; same 8 leading bytes, in a general-purpose register
185 mov tmp3, curr_data
186 mov tmp4, curr_data ; kept for write_first_byte, which hashes tmp4 >> 16
187
188 compute_hash hash, curr_data
189 ; compute_hash is defined outside this file; presumably dst = hash of the leading bytes of src -- confirm
190 shr tmp3, 8 ; drop one byte so hash2 describes the data starting at f_i + 1
191 compute_hash hash2, tmp3
192
193 and hash, HASH_MASK
194 and hash2, HASH_MASK
195
196 cmp dword [stream + _internal_state_has_hist], 0
197 je write_first_byte ; has_hist == 0: take the write_first_byte path before entering the loop
198
199 jmp loop2
200 align 16
201 ; Main loop head: looks up and updates head[] for positions f_i and f_i + 1, producing candidate offsets dist and dist2
202 loop2:
203 ; if the output cursor has passed m_out_end, stop via output_end
204 cmp m_out_buf, [rsp + m_out_end]
205 ja output_end
206
207 xor dist, dist
208 xor dist2, dist2
209 xor tmp3, tmp3
210
211 lea tmp1, [file_start + f_i] ; tmp1 = address of the byte at f_i (taken before f_i is incremented)
212
213 mov dist %+ w, f_i %+ w
214 dec dist
215 sub dist %+ w, word [stream + _internal_state_head + 2 * hash] ; low 16 bits: f_i - 1 - head[hash]
216 mov [stream + _internal_state_head + 2 * hash], f_i %+ w ; head[hash] = low 16 bits of f_i
217
218 inc f_i ; f_i now indexes the second byte of the pair handled this iteration
219
220 mov tmp2, curr_data
221 shr curr_data, 16
222 compute_hash hash, curr_data ; hash for the data two bytes past the pre-increment f_i
223 and hash %+ d, HASH_MASK
224
225 mov dist2 %+ w, f_i %+ w
226 dec dist2
227 sub dist2 %+ w, word [stream + _internal_state_head + 2 * hash2] ; low 16 bits: f_i - 1 - head[hash2]
228 mov [stream + _internal_state_head + 2 * hash2], f_i %+ w
229
230 ; reduce dist to the range 0..D-1, then negate so it can be added to tmp1 as a backward offset
231 and dist %+ d, (D-1) ; the 32-bit operation also clears the upper bits left by the 64-bit dec above
232 neg dist
233
234 shr tmp2, 24
235 compute_hash hash2, tmp2 ; hash2 for the data three bytes past the pre-increment f_i
236 and hash2 %+ d, HASH_MASK
237
238 and dist2 %+ d, (D-1)
239 neg dist2
240 ; Compare 8 input bytes against each candidate; a zero byte in the xor result means that byte matched
241 MARK __body_compare_ %+ ARCH
242 ;; Check for long len/dist match (>7) with first literal
243 MOVQ len, xdata ; MOVQ / PSRLDQ are macros defined outside this file
244 mov curr_data, len ; keep an unmodified copy of the 8 input bytes
245 PSRLDQ xdata, 1 ; shift so xdata starts at the second byte
246 xor len, [tmp1 + dist - 1] ; dist holds a negated offset (see loop2)
247 jz compare_loop ; all 8 bytes equal: extend the match in compare_loop
248
249 ;; Check for len/dist match (>7) with second literal
250 MOVQ len2, xdata
251 xor len2, [tmp1 + dist2]
252 jz compare_loop2
253
254 movzx lit_code, curr_data %+ b ; lit_code shares rsi with tmp1, so tmp1 is no longer valid after this
255 shr curr_data, 8
256
257 ;; Check for len/dist match for first literal: low dword zero means at least 4 bytes matched
258 test len %+ d, 0xFFFFFFFF
259 jz len_dist_huffman_pre
260
261 inc word [stream + _internal_state_hist_lit_len + HIST_ELEM_SIZE*lit_code] ; count the first byte as a literal
262 movzx lit_code2, curr_data %+ b ; lit_code2 shares rbx with dist, which is no longer needed on this path
263 ;; Check for len/dist match for second literal
264 test len2 %+ d, 0xFFFFFFFF
265 jnz write_lit_bits ; neither candidate matched 4 bytes: emit two literals; otherwise fall through
266 ; Emit one literal followed by a length/distance pair for a match that starts at the second byte
267 MARK __body_len_dist_lit_huffman_ %+ ARCH
268 len_dist_lit_huffman_pre:
269 bsf len2, len2 ; bit index of the first difference in the xor result ...
270 shr len2, 3 ; ... divided by 8 = number of leading bytes that matched (4..7 on this path)
271
272 len_dist_lit_huffman:
273 or lit_code, LIT
274 movnti dword [m_out_buf], lit_code %+ d ; first output dword: the literal
275
276 neg dist2 ; back to a positive value
277
278 get_dist_icf_code dist2, dist_code2, tmp1 ; macro defined outside this file; presumably dist_code2 = encoded distance, tmp1 scratch -- confirm
279
280 ;; Setup for updating hash
281 lea tmp3, [f_i + 1] ; tmp3 <= k
282
283 add file_start, f_i ; temporarily rebase file_start so [file_start + len2] is the data after the match
284 MOVDQU xdata, [file_start + len2]
285 mov tmp1, [file_start + len2]
286
287 shr curr_data, 24 ; curr_data was already shifted by 8 before reaching here
288 compute_hash hash3, curr_data ; hash3 shares rdx with len, which is not used on this path
289 and hash3, HASH_MASK
290
291 mov curr_data, tmp1 ; curr_data = 8 bytes following the match
292 shr tmp1, 8
293
294 sub file_start, f_i ; undo the temporary rebase
295 add f_i, len2 ; advance past the match
296
297 mov [stream + _internal_state_head + 2 * hash], tmp3 %+ w ; head[hash] = k
298
299 compute_hash hash, curr_data ; hash for the new f_i
300
301 add tmp3,1
302 mov [stream + _internal_state_head + 2 * hash2], tmp3 %+ w ; head[hash2] = k + 1
303
304 compute_hash hash2, tmp1 ; hash2 for the new f_i + 1
305
306 add tmp3, 1
307 mov [stream + _internal_state_head + 2 * hash3], tmp3 %+ w ; head[hash3] = k + 2
308
309 add dist_code2, 254
310 add dist_code2, len2 ; second output dword = dist_code2 + (len2 + 254)
311
312 inc word [stream + _internal_state_hist_lit_len + HIST_ELEM_SIZE*(len2 + 254)]
313
314 movnti dword [m_out_buf + 4], dist_code2 %+ d
315 add m_out_buf, 8 ; two dwords were written
316
317 shr dist_code2, DIST_OFFSET
318 and dist_code2, 0x1F ; 5-bit field above DIST_OFFSET selects the distance histogram bin
319 inc word [stream + _internal_state_hist_dist + HIST_ELEM_SIZE*dist_code2]
320
321 ; finish the hash / hash2 computed above by masking with HASH_MASK
322 and hash %+ d, HASH_MASK
323 and hash2 %+ d, HASH_MASK
324
325 ; continue while f_i < file_length (signed comparison)
326 cmp f_i, file_length
327 jl loop2
328 jmp input_end
329 ;; encode as dist/len: the match starts at the first byte, so no literal is emitted
330
331 MARK __body_len_dist_huffman_ %+ ARCH
332 len_dist_huffman_pre:
333 bsf len, len ; bit index of the first difference in the xor result ...
334 shr len, 3 ; ... divided by 8 = number of leading bytes that matched (4..7 on this path)
335
336 len_dist_huffman:
337 dec f_i ; undo the increment done in loop2: the match begins at the first byte
338 ;; Setup for updating hash
339 lea tmp3, [f_i + 2] ; tmp3 <= k
340
341 neg dist ; back to a positive value
342
343 ; get_dist_icf_code is defined outside this file; presumably dist_code = encoded distance, tmp1 scratch -- confirm
344 get_dist_icf_code dist, dist_code, tmp1
345
346 add file_start, f_i ; temporarily rebase file_start so [file_start + len] is the data after the match
347 MOVDQU xdata, [file_start + len]
348 mov curr_data2, [file_start + len]
349 mov curr_data, curr_data2
350 sub file_start, f_i ; undo the temporary rebase
351 add f_i, len ; advance past the match
352 ; len_code = len + 254, merged into the same output dword as the distance code
353 lea len_code, [len + 254]
354 or dist_code, len_code
355
356 mov [stream + _internal_state_head + 2 * hash], tmp3 %+ w ; head[hash] = k
357 add tmp3,1
358 mov [stream + _internal_state_head + 2 * hash2], tmp3 %+ w ; head[hash2] = k + 1
359
360 compute_hash hash, curr_data ; hash for the new f_i
361
362 shr curr_data2, 8
363 compute_hash hash2, curr_data2 ; hash2 for the new f_i + 1
364
365 inc word [stream + _internal_state_hist_lit_len + HIST_ELEM_SIZE*len_code]
366
367 movnti dword [m_out_buf], dist_code %+ d
368 add m_out_buf, 4 ; one dword was written
369
370 shr dist_code, DIST_OFFSET
371 and dist_code, 0x1F ; 5-bit field above DIST_OFFSET selects the distance histogram bin
372 inc word [stream + _internal_state_hist_dist + HIST_ELEM_SIZE*dist_code]
373
374 ; finish the hash / hash2 computed above by masking with HASH_MASK
375 and hash %+ d, HASH_MASK
376 and hash2 %+ d, HASH_MASK
377
378 ; continue while f_i < file_length (signed comparison)
379 cmp f_i, file_length
380 jl loop2
381 jmp input_end
382 ; No usable match for either byte: emit both bytes as literals packed into a single output dword
383 MARK __body_write_lit_bits_ %+ ARCH
384 write_lit_bits:
385 MOVDQU xdata, [file_start + f_i + 1] ; reload input for the next iteration
386 add f_i, 1 ; together with the inc in loop2, two bytes are consumed per iteration
387 MOVQ curr_data, xdata
388
389 inc word [stream + _internal_state_hist_lit_len + HIST_ELEM_SIZE*lit_code2] ; the first literal was already counted in the compare stage
390
391 shl lit_code2, DIST_OFFSET
392 lea lit_code, [lit_code + lit_code2 + (31 << DIST_OFFSET)] ; first literal in the low bits, second literal shifted by DIST_OFFSET, plus the constant 31 << DIST_OFFSET
393
394 movnti dword [m_out_buf], lit_code %+ d
395 add m_out_buf, 4
396
397 ; continue while f_i < file_length; otherwise fall through to input_end
398 cmp f_i, file_length
399 jl loop2
400 ; Exit paths: choose the next state, write the cursors back to the stream, restore registers and return
401 input_end:
402 mov tmp1, ZSTATE_FLUSH_READ_BUFFER
403 mov tmp2, ZSTATE_BODY
404 cmp dword [stream + _end_of_stream], 0
405 cmovne tmp2, tmp1
406 cmp dword [stream + _flush], _NO_FLUSH
407
408 cmovne tmp2, tmp1 ; FLUSH_READ_BUFFER if end_of_stream != 0 or flush != _NO_FLUSH, else BODY
409 mov dword [stream + _internal_state_state], tmp2 %+ d
410 jmp end
411
412 output_end:
413 mov dword [stream + _internal_state_state], ZSTATE_CREATE_HDR ; reached when m_out_buf has passed m_out_end
414
415 end:
416 ;; update input buffer
417 add file_length, LA ; undo the LA adjustment: file_length = total_in + avail_in as at entry
418 mov [stream + _total_in], f_i %+ d
419 add file_start, f_i ; file_start + f_i = address of the next unread byte
420 mov [stream + _next_in], file_start
421 sub file_length, f_i ; remaining input bytes
422 mov [stream + _avail_in], file_length %+ d
423
424 ;; update output buffer
425 mov tmp1, [stream + _level_buf]
426 mov [tmp1 + _icf_buf_next], m_out_buf
427 sub m_out_buf, [rsp + m_out_start] ; m_out_buf = number of bytes written during this call
428 sub [tmp1 + _icf_buf_avail_out], m_out_buf %+ d ; NOTE(review): dword update, while the prologue reads this field as a qword -- confirm the field width
429
430 mov rbx, [rsp + gpr_save_mem_offset + 0*8]
431 mov rsi, [rsp + gpr_save_mem_offset + 1*8]
432 mov rdi, [rsp + gpr_save_mem_offset + 2*8]
433 mov rbp, [rsp + gpr_save_mem_offset + 3*8] ; with ALIGN_STACK this reloads the frame value that is moved into rsp below
434 mov r12, [rsp + gpr_save_mem_offset + 4*8]
435 mov r13, [rsp + gpr_save_mem_offset + 5*8]
436 mov r14, [rsp + gpr_save_mem_offset + 6*8]
437 mov r15, [rsp + gpr_save_mem_offset + 7*8]
438
439 %ifndef ALIGN_STACK
440 add rsp, stack_size
441 %else
442 mov rsp, rbp
443 pop rbp ; restores the caller's rbp pushed in the prologue
444 %endif
445 ret
446 ; Long-match paths, entered when all 8 compared bytes were equal; the compare250* macros are defined outside this file
447 MARK __body_compare_loops_ %+ ARCH
448 compare_loop:
449 lea tmp2, [tmp1 + dist - 1] ; tmp2 = candidate address for a match starting at the first byte
450 %if (COMPARE_TYPE == 1)
451 compare250 tmp1, tmp2, len, tmp3
452 %elif (COMPARE_TYPE == 2)
453 compare250_x tmp1, tmp2, len, tmp3, xtmp0, xtmp1
454 %elif (COMPARE_TYPE == 3)
455 compare250_y tmp1, tmp2, len, tmp3, ytmp0, ytmp1
456 %else
457 %error Unknown Compare type COMPARE_TYPE
458 % error
459 %endif
460 jmp len_dist_huffman ; presumably len now holds the match length produced by the compare macro -- confirm
461 ; NOTE(review): the "% error" lines are reached only when COMPARE_TYPE is not 1, 2 or 3; presumably a second way to force the build to fail -- confirm
462 compare_loop2:
463 lea tmp2, [tmp1 + dist2] ; tmp2 = candidate address for a match starting at the second byte
464 add tmp1, 1 ; compare from the second byte onwards
465 %if (COMPARE_TYPE == 1)
466 compare250 tmp1, tmp2, len2, tmp3
467 %elif (COMPARE_TYPE == 2)
468 compare250_x tmp1, tmp2, len2, tmp3, xtmp0, xtmp1
469 %elif (COMPARE_TYPE == 3)
470 compare250_y tmp1, tmp2, len2, tmp3, ytmp0, ytmp1
471 %else
472 %error Unknown Compare type COMPARE_TYPE
473 % error
474 %endif
475 movzx lit_code, curr_data %+ b ; first byte becomes the literal that precedes the match
476 shr curr_data, 8
477 inc word [stream + _internal_state_hist_lit_len + HIST_ELEM_SIZE*lit_code]
478 jmp len_dist_lit_huffman
479 ; Taken when has_hist == 0: emit the byte at f_i as a literal without a match search, then enter the main loop
480 MARK __write_first_byte_ %+ ARCH
481 write_first_byte:
482 cmp m_out_buf, [rsp + m_out_end]
483 ja output_end ; output cursor already past m_out_end
484
485 mov dword [stream + _internal_state_has_hist], 1 ; later calls skip this path
486
487 mov [stream + _internal_state_head + 2 * hash], f_i %+ w ; head[hash] = low 16 bits of f_i
488
489 mov hash, hash2 ; the hash for f_i + 1 becomes the current hash
490 shr tmp4, 16 ; tmp4 still holds the 8 bytes loaded before the branch here
491 compute_hash hash2, tmp4 ; hash2 for the data two bytes past f_i
492
493 and curr_data, 0xff ; isolate the first byte
494 inc word [stream + _internal_state_hist_lit_len + HIST_ELEM_SIZE*curr_data]
495 or curr_data, LIT
496
497 movnti dword [m_out_buf], curr_data %+ d
498 add m_out_buf, 4
499
500 MOVDQU xdata, [file_start + f_i + 1] ; reload input starting at the next byte
501 add f_i, 1
502 mov curr_data, [file_start + f_i]
503 and hash %+ d, HASH_MASK
504 and hash2 %+ d, HASH_MASK
505
506 cmp f_i, file_length ; signed comparison, as in the main loop
507 jl loop2
508 jmp input_end
509 ; Constant data; neither label below is referenced in the visible code of this file
510 section .data
511 align 16
512 mask: dd HASH_MASK, HASH_MASK, HASH_MASK, HASH_MASK ; four dwords (16 bytes), each HASH_MASK
513 const_D: dq D ; one qword holding D