]> git.proxmox.com Git - ceph.git/blame - ceph/src/isa-l/igzip/igzip_icf_body.asm
bump version to 15.2.6-pve1
[ceph.git] / ceph / src / isa-l / igzip / igzip_icf_body.asm
CommitLineData
224ce89b
WB
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
; * Redistributions of source code must retain the above copyright
;   notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
;   notice, this list of conditions and the following disclaimer in
;   the documentation and/or other materials provided with the
;   distribution.
; * Neither the name of Intel Corporation nor the names of its
;   contributors may be used to endorse or promote products derived
;   from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29%include "options.asm"
30
31%include "lz0a_const.asm"
32%include "data_struct2.asm"
33%include "bitbuf2.asm"
34%include "huffman.asm"
35%include "igzip_compare_types.asm"
36%include "reg_sizes.asm"
37
38%include "stdmac.asm"
39
;; MARK <label>
;; With DEBUG defined, emits <label> as a global symbol at the current
;; position. Without DEBUG the macro expands to nothing.
%ifdef DEBUG
%macro MARK 1
global %1
%1:
%endm
%else
%macro MARK 1
%endm
%endif
49
50;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
51;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
52;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Register assignment for isal_deflate_icf_body.
;; Several names alias the same register (for example dist / dist_code2 /
;; lit_code2 are all rbx), so an aliased name is only valid until another
;; name sharing that register is written.
%define file_start      rdi
%define file_length     r15
%define stream          r14
%define f_i             r10
%define m_out_buf       r11

%define curr_data       rax

%define tmp2            rcx

%define dist            rbx
%define dist_code2      rbx
%define lit_code2       rbx

%define dist2           r12
%define dist_code       r12

%define tmp1            rsi

%define lit_code        rsi

%define curr_data2      r8
%define len2            r8
%define tmp4            r8

%define len             rdx
%define len_code        rdx
%define hash3           rdx

%define tmp3            r13

%define hash            rbp
%define hash2           r9

;; NOTE(review): an earlier comment here said GPRs r8 and r15 were free.
;; Both are assigned above (r8 = curr_data2/len2/tmp4, r15 = file_length),
;; and every general-purpose register other than rsp now has a role.

%define xtmp0           xmm0    ; tmp
%define xtmp1           xmm1    ; tmp
%define xdata           xmm4

%define ytmp0           ymm0    ; tmp
%define ytmp1           ymm1    ; tmp
95
96
97;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
98;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
99;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
100
;; Stack frame layout, as offsets from rsp after the prologue.
m_out_end               equ     0               ; local variable (8 bytes)
m_out_start             equ     8               ; local variable (8 bytes)
f_end_i_mem_offset      equ     16              ; 8-byte slot; not referenced by the code in this file
gpr_save_mem_offset     equ     24              ; gpr save area (8*8 bytes)
xmm_save_mem_offset     equ     24 + 8*8        ; xmm save area (4*16 bytes)
stack_size              equ     3*8 + 8*8 + 4*16

;;; The local area is 3*8 bytes (an odd multiple of 8) because the stack
;;; address is an odd multiple of 8 after a function call and we want rsp
;;; aligned to 16 bytes once stack_size (152) has been subtracted.
;;; NOTE(review): the xmm save area was described as 16 byte aligned, but
;;; offset 24 + 8*8 = 88 from a 16-byte-aligned rsp is only 8-byte aligned.
;;; The area is not used by the code in this file; re-check the offset
;;; before storing xmm registers there with an aligned move.
110
; void isal_deflate_icf_body ( isal_zstream *stream )
; arg 1: rcx: addr of stream (copied from rdi when assembling for elf64)
;
; Entry and setup. Returns immediately when avail_in is zero; otherwise
; allocates the stack frame, saves rbx/rsi/rdi/rbp/r12-r15, and loads the
; output cursor, the input base pointer and the loop bound before falling
; through to the hashing code that follows.
global isal_deflate_icf_body_ %+ ARCH
isal_deflate_icf_body_ %+ ARCH %+ :
%ifidn __OUTPUT_FORMAT__, elf64
        ; NOTE(review): only elf64 is remapped here; every other output
        ; format is assumed to deliver the argument in rcx - confirm for
        ; any additional target.
        mov     rcx, rdi
%endif

        ;; do nothing if (avail_in == 0)
        cmp     dword [rcx + _avail_in], 0
        jne     skip1

        ;; Set stream's next state: ZSTATE_CREATE_HDR, replaced by
        ;; ZSTATE_FLUSH_READ_BUFFER when end_of_stream is non-zero or
        ;; flush is anything other than _NO_FLUSH.
        mov     rdx, ZSTATE_FLUSH_READ_BUFFER
        mov     rax, ZSTATE_CREATE_HDR
        cmp     dword [rcx + _end_of_stream], 0
        cmovne  rax, rdx
        cmp     dword [rcx + _flush], _NO_FLUSH
        cmovne  rax, rdx
        mov     dword [rcx + _internal_state_state], eax
        ret                             ; no frame has been allocated yet
skip1:

%ifdef ALIGN_STACK
        push    rbp
        mov     rbp, rsp
        sub     rsp, stack_size
        and     rsp, ~15
%else
        sub     rsp, stack_size
%endif

        mov     [rsp + gpr_save_mem_offset + 0*8], rbx
        mov     [rsp + gpr_save_mem_offset + 1*8], rsi
        mov     [rsp + gpr_save_mem_offset + 2*8], rdi
        ; With ALIGN_STACK this slot holds the frame pointer set up above,
        ; so it can be recovered after rbp has been reused as `hash`.
        mov     [rsp + gpr_save_mem_offset + 3*8], rbp
        mov     [rsp + gpr_save_mem_offset + 4*8], r12
        mov     [rsp + gpr_save_mem_offset + 5*8], r13
        mov     [rsp + gpr_save_mem_offset + 6*8], r14
        mov     [rsp + gpr_save_mem_offset + 7*8], r15

        mov     stream, rcx
        mov     dword [stream + _internal_state_has_eob], 0

        ; m_out_buf = level_buf->icf_buf_next, also remembered as m_out_start
        mov     tmp1, [stream + _level_buf]
        mov     m_out_buf, [tmp1 + _icf_buf_next]

        mov     [rsp + m_out_start], m_out_buf
        ; m_out_end = m_out_buf + level_buf->icf_buf_avail_out - SLOP
        ; NOTE(review): icf_buf_avail_out is loaded as 64 bits here but is
        ; updated with a 32-bit subtract at `end:` - confirm the field width.
        mov     tmp1, [tmp1 + _icf_buf_avail_out]
        add     tmp1, m_out_buf
        sub     tmp1, SLOP

        mov     [rsp + m_out_end], tmp1

        ; file_start = next_in - total_in, so file_start + f_i addresses the
        ; next unread input byte
        mov     file_start, [stream + _next_in]

        mov     f_i %+ d, dword [stream + _total_in]
        sub     file_start, f_i

        ; file_length = total_in + avail_in
        mov     file_length %+ d, [stream + _avail_in]
        add     file_length, f_i

        ; file_length -= LA;
        sub     file_length, LA

        ; if (file_length <= f_i) skip the main loop (signed compare)
        cmp     file_length, f_i
        jle     input_end
180
; for (f_i = f_start_i; f_i < file_length; f_i++) {
;
; Prime the loop state: xdata holds the 16 input bytes at file_start + f_i,
; curr_data the first 8 of them, and hash / hash2 are the masked results of
; the compute_hash macro for the data at f_i and at f_i + 1.
MARK __body_compute_hash_ %+ ARCH
        MOVDQU  xdata, [file_start + f_i]
        mov     curr_data, [file_start + f_i]
        mov     tmp3, curr_data
        mov     tmp4, curr_data         ; kept for write_first_byte

        compute_hash    hash, curr_data

        shr     tmp3, 8                 ; input bytes starting at f_i + 1
        compute_hash    hash2, tmp3

        and     hash, HASH_MASK
        and     hash2, HASH_MASK

        ; With has_hist == 0, take the write_first_byte path instead of
        ; entering loop2 directly.
        cmp     dword [stream + _internal_state_has_hist], 0
        je      write_first_byte

        jmp     loop2
; Main loop head. On entry hash / hash2 are the masked hashes for the input
; at f_i and f_i + 1, and curr_data holds the 8 input bytes at f_i.
; Looks up and replaces the head[] entries for both positions, leaving
; dist / dist2 as the NEGATED candidate distances (masked to D-1) and
; hash / hash2 recomputed for the input at f_i + 2 and f_i + 3 (relative to
; f_i on entry). f_i is incremented once on the way through.
        align   16

loop2:
        ; if (state->bitbuf.is_full()) {
        cmp     m_out_buf, [rsp + m_out_end]
        ja      output_end

        xor     dist, dist
        xor     dist2, dist2
        xor     tmp3, tmp3

        lea     tmp1, [file_start + f_i]        ; tmp1 = address of current byte

        ; dist = (f_i - 1 - head[hash]) in 16 bits; head[hash] = f_i
        mov     dist %+ w, f_i %+ w
        dec     dist
        sub     dist %+ w, word [stream + _internal_state_head + 2 * hash]
        mov     [stream + _internal_state_head + 2 * hash], f_i %+ w

        inc     f_i

        ; hash for the data two bytes past the position just looked up
        mov     tmp2, curr_data
        shr     curr_data, 16
        compute_hash    hash, curr_data
        and     hash %+ d, HASH_MASK

        ; dist2 = (f_i - 1 - head[hash2]) in 16 bits, using the incremented
        ; f_i; head[hash2] = f_i
        mov     dist2 %+ w, f_i %+ w
        dec     dist2
        sub     dist2 %+ w, word [stream + _internal_state_head + 2 * hash2]
        mov     [stream + _internal_state_head + 2 * hash2], f_i %+ w

        ; if ((dist-1) < (D-1)) {
        ; dist is masked to D-1 and negated so it can be added to tmp1
        and     dist %+ d, (D-1)
        neg     dist

        ; hash2 for the data three bytes past the position first looked up
        shr     tmp2, 24
        compute_hash    hash2, tmp2
        and     hash2 %+ d, HASH_MASK

        and     dist2 %+ d, (D-1)
        neg     dist2
; Compare the 8 input bytes at each of the two positions against their
; candidates. len / len2 end up as the XOR of input and candidate, so a
; zero value means all 8 bytes match and the lowest set bit marks the
; first differing byte.
; Note: lit_code shares its register with tmp1, and lit_code2 with dist,
; so tmp1 and dist are dead once those are written below.
MARK __body_compare_ %+ ARCH
        ;; Check for long len/dist match (>7) with first literal
        MOVQ    len, xdata
        mov     curr_data, len
        PSRLDQ  xdata, 1                ; xdata now starts at the second byte
        xor     len, [tmp1 + dist - 1]
        jz      compare_loop

        ;; Check for len/dist match (>7) with second literal
        MOVQ    len2, xdata
        xor     len2, [tmp1 + dist2]
        jz      compare_loop2

        movzx   lit_code, curr_data %+ b        ; first literal byte
        shr     curr_data, 8

        ;; Check for len/dist match for first literal
        ;; (low 4 bytes of the XOR are zero => at least 4 bytes match)
        test    len %+ d, 0xFFFFFFFF
        jz      len_dist_huffman_pre

        ; The first byte is emitted as a literal: count it in the histogram.
        inc     word [stream + _internal_state_hist_lit_len + HIST_ELEM_SIZE*lit_code]
        movzx   lit_code2, curr_data %+ b       ; second literal byte
        ;; Check for len/dist match for second literal
        test    len2 %+ d, 0xFFFFFFFF
        jnz     write_lit_bits
        ; otherwise fall through: literal followed by a match at the
        ; second position
266
; Emit one literal followed by a length/distance pair for a match that
; starts at the second position (f_i already points at it).
; Entry at len_dist_lit_huffman_pre: len2 is the XOR of input and candidate
; and is converted to a byte count here.
; Entry at len_dist_lit_huffman: len2 is already a byte count.
; Writes two dwords to the output, updates three head[] entries and the
; histograms, then reloads xdata / curr_data / hash / hash2 for the
; position after the match.
MARK __body_len_dist_lit_huffman_ %+ ARCH
len_dist_lit_huffman_pre:
        bsf     len2, len2              ; index of first differing bit
        shr     len2, 3                 ; -> number of matching bytes

len_dist_lit_huffman:
        or      lit_code, LIT
        movnti  dword [m_out_buf], lit_code %+ d

        neg     dist2                   ; back to a positive distance

        get_dist_icf_code       dist2, dist_code2, tmp1

        ;; Setup for updating hash
        lea     tmp3, [f_i + 1]         ; tmp3 <= k

        ; Load the data that follows the match. file_start is biased by f_i
        ; for the duration because an address cannot name three registers.
        add     file_start, f_i
        MOVDQU  xdata, [file_start + len2]
        mov     tmp1, [file_start + len2]

        shr     curr_data, 24
        compute_hash    hash3, curr_data
        and     hash3, HASH_MASK

        mov     curr_data, tmp1
        shr     tmp1, 8

        sub     file_start, f_i
        add     f_i, len2

        ; head[] entries for positions k, k + 1 and k + 2, interleaved with
        ; the hash computations for the position after the match
        mov     [stream + _internal_state_head + 2 * hash], tmp3 %+ w

        compute_hash    hash, curr_data

        add     tmp3,1
        mov     [stream + _internal_state_head + 2 * hash2], tmp3 %+ w

        compute_hash    hash2, tmp1

        add     tmp3, 1
        mov     [stream + _internal_state_head + 2 * hash3], tmp3 %+ w

        ; combine the distance code with the length symbol (len2 + 254)
        add     dist_code2, 254
        add     dist_code2, len2

        inc     word [stream + _internal_state_hist_lit_len + HIST_ELEM_SIZE*(len2 + 254)]

        movnti  dword [m_out_buf + 4], dist_code2 %+ d
        add     m_out_buf, 8

        ; extract the 5-bit field at DIST_OFFSET and count it
        shr     dist_code2, DIST_OFFSET
        and     dist_code2, 0x1F
        inc     word [stream + _internal_state_hist_dist + HIST_ELEM_SIZE*dist_code2]

        ; hash = compute_hash(state->file_start + f_i) & HASH_MASK;
        and     hash %+ d, HASH_MASK
        and     hash2 %+ d, HASH_MASK

        ; continue
        cmp     f_i, file_length
        jl      loop2
        jmp     input_end
;; encode as dist/len
;
; Emit a length/distance pair for a match that starts at the first
; position. f_i was already incremented in loop2, so it is stepped back
; before the match length is added.
; Entry at len_dist_huffman_pre: len is the XOR of input and candidate and
; is converted to a byte count here.
; Entry at len_dist_huffman: len is already a byte count.
MARK __body_len_dist_huffman_ %+ ARCH
len_dist_huffman_pre:
        bsf     len, len                ; index of first differing bit
        shr     len, 3                  ; -> number of matching bytes

len_dist_huffman:
        dec     f_i
        ;; Setup for updating hash
        lea     tmp3, [f_i + 2]         ; tmp3 <= k

        neg     dist                    ; back to a positive distance

        ; distance -> ICF distance code in dist_code (tmp1 is scratch)
        get_dist_icf_code       dist, dist_code, tmp1

        ; Load the data that follows the match. file_start is biased by f_i
        ; for the duration because an address cannot name three registers.
        add     file_start, f_i
        MOVDQU  xdata, [file_start + len]
        mov     curr_data2, [file_start + len]
        mov     curr_data, curr_data2
        sub     file_start, f_i
        add     f_i, len
        ; length symbol = len + 254, merged into the distance code
        lea     len_code, [len + 254]
        or      dist_code, len_code

        ; head[] entries for positions k and k + 1
        mov     [stream + _internal_state_head + 2 * hash], tmp3 %+ w
        add     tmp3,1
        mov     [stream + _internal_state_head + 2 * hash2], tmp3 %+ w

        compute_hash    hash, curr_data

        shr     curr_data2, 8
        compute_hash    hash2, curr_data2

        inc     word [stream + _internal_state_hist_lit_len + HIST_ELEM_SIZE*len_code]

        movnti  dword [m_out_buf], dist_code %+ d
        add     m_out_buf, 4

        ; extract the 5-bit field at DIST_OFFSET and count it
        shr     dist_code, DIST_OFFSET
        and     dist_code, 0x1F
        inc     word [stream + _internal_state_hist_dist + HIST_ELEM_SIZE*dist_code]

        ; hash = compute_hash(state->file_start + f_i) & HASH_MASK;
        and     hash %+ d, HASH_MASK
        and     hash2 %+ d, HASH_MASK

        ; continue
        cmp     f_i, file_length
        jl      loop2
        jmp     input_end
382
; Neither position produced a match: emit both bytes as literals packed
; into a single output dword. lit_code (first byte) was already counted in
; the histogram before the jump here; lit_code2 is counted below.
MARK __body_write_lit_bits_ %+ ARCH
write_lit_bits:
        ; advance past the second literal and reload the data window
        MOVDQU  xdata, [file_start + f_i + 1]
        add     f_i, 1
        MOVQ    curr_data, xdata

        inc     word [stream + _internal_state_hist_lit_len + HIST_ELEM_SIZE*lit_code2]

        ; dword = lit_code + (lit_code2 << DIST_OFFSET) + (31 << DIST_OFFSET)
        shl     lit_code2, DIST_OFFSET
        lea     lit_code, [lit_code + lit_code2 + (31 << DIST_OFFSET)]

        movnti  dword [m_out_buf], lit_code %+ d
        add     m_out_buf, 4

        ; continue
        cmp     f_i, file_length
        jl      loop2
        ; otherwise fall through to input_end
400
; Exit paths.
;   input_end:  the loop ran out of input. The next state is ZSTATE_BODY,
;               replaced by ZSTATE_FLUSH_READ_BUFFER when end_of_stream is
;               non-zero or flush is anything other than _NO_FLUSH.
;   output_end: the output cursor passed m_out_end. The next state is
;               ZSTATE_CREATE_HDR.
;   end:        write the input/output cursors back to the stream, restore
;               the saved registers and return.
input_end:
        mov     tmp1, ZSTATE_FLUSH_READ_BUFFER
        mov     tmp2, ZSTATE_BODY
        cmp     dword [stream + _end_of_stream], 0
        cmovne  tmp2, tmp1
        cmp     dword [stream + _flush], _NO_FLUSH

        cmovne  tmp2, tmp1
        mov     dword [stream + _internal_state_state], tmp2 %+ d
        jmp     end

output_end:
        mov     dword [stream + _internal_state_state], ZSTATE_CREATE_HDR

end:
        ;; update input buffer
        add     file_length, LA                 ; undo the look-ahead bias
        mov     [stream + _total_in], f_i %+ d
        add     file_start, f_i
        mov     [stream + _next_in], file_start
        sub     file_length, f_i
        mov     [stream + _avail_in], file_length %+ d

        ;; update output buffer
        mov     tmp1, [stream + _level_buf]
        mov     [tmp1 + _icf_buf_next], m_out_buf
        sub     m_out_buf, [rsp + m_out_start]  ; bytes written in this call
        sub     [tmp1 + _icf_buf_avail_out], m_out_buf %+ d

        mov     rbx, [rsp + gpr_save_mem_offset + 0*8]
        mov     rsi, [rsp + gpr_save_mem_offset + 1*8]
        mov     rdi, [rsp + gpr_save_mem_offset + 2*8]
        mov     rbp, [rsp + gpr_save_mem_offset + 3*8]
        mov     r12, [rsp + gpr_save_mem_offset + 4*8]
        mov     r13, [rsp + gpr_save_mem_offset + 5*8]
        mov     r14, [rsp + gpr_save_mem_offset + 6*8]
        mov     r15, [rsp + gpr_save_mem_offset + 7*8]

%ifndef ALIGN_STACK
        add     rsp, stack_size
%else
        ; rbp was just reloaded with the frame pointer saved in the prologue
        mov     rsp, rbp
        pop     rbp
%endif
        ret
446
; Long-match paths, reached when all 8 bytes compared in the main loop
; matched. The compare250* macro selected by COMPARE_TYPE is given the two
; addresses and produces the match length in len / len2 (tmp3 and the
; xtmp / ytmp registers are scratch).
;   compare_loop:  match at the first position, continues at
;                  len_dist_huffman.
;   compare_loop2: match at the second position. The first byte is counted
;                  as a literal, then control continues at
;                  len_dist_lit_huffman.
MARK __body_compare_loops_ %+ ARCH
compare_loop:
        lea     tmp2, [tmp1 + dist - 1]         ; candidate for first position
%if (COMPARE_TYPE == 1)
        compare250      tmp1, tmp2, len, tmp3
%elif (COMPARE_TYPE == 2)
        compare250_x    tmp1, tmp2, len, tmp3, xtmp0, xtmp1
%elif (COMPARE_TYPE == 3)
        compare250_y    tmp1, tmp2, len, tmp3, ytmp0, ytmp1
%else
        %error Unknown Compare type COMPARE_TYPE
%endif
        jmp     len_dist_huffman

compare_loop2:
        lea     tmp2, [tmp1 + dist2]            ; candidate for second position
        add     tmp1, 1                         ; tmp1 = address of second byte
%if (COMPARE_TYPE == 1)
        compare250      tmp1, tmp2, len2, tmp3
%elif (COMPARE_TYPE == 2)
        compare250_x    tmp1, tmp2, len2, tmp3, xtmp0, xtmp1
%elif (COMPARE_TYPE == 3)
        compare250_y    tmp1, tmp2, len2, tmp3, ytmp0, ytmp1
%else
        %error Unknown Compare type COMPARE_TYPE
%endif
        ; lit_code shares its register with tmp1, which is no longer needed
        movzx   lit_code, curr_data %+ b
        shr     curr_data, 8
        inc     word [stream + _internal_state_hist_lit_len + HIST_ELEM_SIZE*lit_code]
        jmp     len_dist_lit_huffman
479
; Taken once, when has_hist is zero on entry: emit the byte at f_i as a
; literal without searching for a match, set has_hist, and enter the main
; loop at f_i + 1.
; Expects hash / hash2 for the data at f_i / f_i + 1, and curr_data and
; tmp4 both holding the 8 input bytes at f_i.
MARK __write_first_byte_ %+ ARCH
write_first_byte:
        cmp     m_out_buf, [rsp + m_out_end]
        ja      output_end

        mov     dword [stream + _internal_state_has_hist], 1

        mov     [stream + _internal_state_head + 2 * hash], f_i %+ w

        ; slide the hashes forward by one position
        mov     hash, hash2
        shr     tmp4, 16                ; input bytes starting at f_i + 2
        compute_hash    hash2, tmp4

        ; literal = low byte of curr_data, counted and tagged with LIT
        and     curr_data, 0xff
        inc     word [stream + _internal_state_hist_lit_len + HIST_ELEM_SIZE*curr_data]
        or      curr_data, LIT

        movnti  dword [m_out_buf], curr_data %+ d
        add     m_out_buf, 4

        ; advance one byte and reload the data window
        MOVDQU  xdata, [file_start + f_i + 1]
        add     f_i, 1
        mov     curr_data, [file_start + f_i]
        and     hash %+ d, HASH_MASK
        and     hash2 %+ d, HASH_MASK

        cmp     f_i, file_length
        jl      loop2
        jmp     input_end
509
;; Constant data: four dwords of HASH_MASK, then D as a qword.
;; NOTE(review): neither label is referenced by the code visible in this
;; file - confirm whether an included macro uses them before removing.
section .data
        align   16
mask:           dd      HASH_MASK, HASH_MASK, HASH_MASK, HASH_MASK
const_D:        dq      D