]> git.proxmox.com Git - ceph.git/blob - ceph/src/isa-l/igzip/igzip_body.asm
update sources to v12.1.1
[ceph.git] / ceph / src / isa-l / igzip / igzip_body.asm
1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
3 ;
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
6 ; are met:
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
12 ; distribution.
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
16 ;
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29
30 %include "options.asm"
31
32 %include "lz0a_const.asm"
33 %include "data_struct2.asm"
34 %include "bitbuf2.asm"
35 %include "huffman.asm"
36 %include "igzip_compare_types.asm"
37 %include "reg_sizes.asm"
38
39 %include "stdmac.asm"
40
; MARK <name>: optional symbol marker.
; When DEBUG is defined, MARK emits a global label called <name> at the point
; of use. When DEBUG is not defined, MARK expands to nothing, so the marker
; lines scattered through the routine add no code and no symbols.
41 %ifdef DEBUG
42 %macro MARK 1
43 global %1
44 %1:
45 %endm
46 %else
; Release variant: same name and arity, empty body.
47 %macro MARK 1
48 %endm
49 %endif
50
51 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
52 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
53 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
54
; Register aliases used by the routine below.
; Several names map onto the same physical register (for example tmp2 and
; hash2 are both rcx; hash, len, code_len3 and tmp8 are all rdx). Names that
; share a register cannot hold different live values at the same time, so
; writing one of them destroys the others.
55 %define tmp2 rcx
56 %define hash2 rcx
57
58 %define curr_data rax
59 %define code rax
60 %define tmp5 rax
61
62 %define tmp4 rbx
63 %define dist rbx
64 %define code2 rbx
65
66 %define hash rdx
67 %define len rdx
68 %define code_len3 rdx
69 %define tmp8 rdx
70
71 %define tmp1 rsi
72 %define code_len2 rsi
73
74 %define file_start rdi
75
76 %define m_bit_count rbp
77
78 %define curr_data2 r8
79 %define len2 r8
80 %define tmp6 r8
81
82 %define m_bits r9
83
84 %define f_i r10
85
86 %define m_out_buf r11
87
; f_end_i shares r12 with three short-lived names, which is why the routine
; also keeps a copy of f_end_i on the stack and reloads it.
88 %define f_end_i r12
89 %define dist2 r12
90 %define tmp7 r12
91 %define code4 r12
92
93 %define tmp3 r13
94 %define code3 r13
95
96 %define stream r14
97
98 %define hufftables r15
99
100 ;; GPR r8 & r15 can be used
; NOTE(review): r8 (curr_data2/len2/tmp6) and r15 (hufftables) are both
; assigned above, so the remark on the previous line may be stale - confirm.
101
; Vector register aliases.
102 %define xtmp0 xmm0 ; tmp
103 %define xtmp1 xmm1 ; tmp
104 %define xhash xmm2
105 %define xmask xmm3
106 %define xdata xmm4
107
108 %define ytmp0 ymm0 ; tmp
109 %define ytmp1 ymm1 ; tmp
110
111
112 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
113 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
114 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
115
116
; Stack frame layout: byte offsets from rsp once the prologue has run.
; NOTE(review): blen_mem_offset and xmm_save_mem_offset are not referenced by
; the code visible in this file; the space is still reserved by stack_size.
117 blen_mem_offset equ 0 ; local variable (8 bytes)
118 f_end_i_mem_offset equ 8 ; stack copy of f_end_i (8 bytes)
119 gpr_save_mem_offset equ 16 ; gpr save area (8*8 bytes)
120 xmm_save_mem_offset equ 16 + 8*8 ; xmm save area (4*16 bytes) (16 byte aligned)
121 stack_size equ 2*8 + 8*8 + 4*16 + 8 ; 152 bytes = two locals + gpr area + xmm area + 8 padding
122 ;;; The final 8 is padding: the stack address is an odd multiple of 8 after a
123 ;;; function call and we want it aligned to 16 bytes
124
125 ; void isal_deflate_body ( isal_zstream *stream )
126 ; arg 1: rcx: addr of stream
; When assembling for elf64 the argument arrives in rdi and is copied into rcx
; first, so the rest of the routine can always read the stream pointer from rcx.
; The exported symbol name is isal_deflate_body_ followed by the value of ARCH.
127 global isal_deflate_body_ %+ ARCH
128 isal_deflate_body_ %+ ARCH %+ :
129 %ifidn __OUTPUT_FORMAT__, elf64
130 mov rcx, rdi
131 %endif
132
133 ;; do nothing if (avail_in == 0)
; This early-out runs before any stack adjustment or register save; it only
; touches rax, rdx and the stream's state field.
134 cmp dword [rcx + _avail_in], 0
135 jne skip1
136
137 ;; Set stream's next state
;; state = ZSTATE_BODY, replaced by ZSTATE_FLUSH_READ_BUFFER when
;; end_of_stream != 0 or flush != _NO_FLUSH
138 mov rdx, ZSTATE_FLUSH_READ_BUFFER
139 mov rax, ZSTATE_BODY
140 cmp dword [rcx + _end_of_stream], 0
141 cmovne rax, rdx
142 cmp dword [rcx + _flush], _NO_FLUSH
143 cmovne rax, rdx
144 mov dword [rcx + _internal_state_state], eax
145 ret
146 skip1:
; Main path (avail_in != 0): build the stack frame, save registers, load the
; working state out of the stream structure, compute the hashes for the first
; two input positions and enter the main loop.
147
148 %ifdef ALIGN_STACK
149 push rbp
150 mov rbp, rsp ; remember rsp so the epilogue can undo the alignment below
151 sub rsp, stack_size
152 and rsp, ~15 ; round rsp down to a 16-byte boundary
153 %else
154 sub rsp, stack_size
155 %endif
156
; Save the registers that the epilogue restores. rsi and rdi are included;
; they are callee-saved under the Microsoft x64 convention.
157 mov [rsp + gpr_save_mem_offset + 0*8], rbx
158 mov [rsp + gpr_save_mem_offset + 1*8], rsi
159 mov [rsp + gpr_save_mem_offset + 2*8], rdi
160 mov [rsp + gpr_save_mem_offset + 3*8], rbp ; with ALIGN_STACK this is the frame value set above; the caller's rbp is the pushed one
161 mov [rsp + gpr_save_mem_offset + 4*8], r12
162 mov [rsp + gpr_save_mem_offset + 5*8], r13
163 mov [rsp + gpr_save_mem_offset + 6*8], r14
164 mov [rsp + gpr_save_mem_offset + 7*8], r15
165
166 mov stream, rcx
167 mov dword [stream + _internal_state_has_eob], 0
168
169 MOVDQU xmask, [mask] ; xmask = the HASH_MASK constant vector from the data section
170
171 ; state->bitbuf.set_buf(stream->next_out, stream->avail_out);
172 mov m_out_buf, [stream + _next_out]
173 mov [stream + _internal_state_bitbuf_m_out_start], m_out_buf
174 mov tmp1 %+ d, [stream + _avail_out]
175 add tmp1, m_out_buf
176 sub tmp1, SLOP ; m_out_end = next_out + avail_out - SLOP
177
178 mov [stream + _internal_state_bitbuf_m_out_end], tmp1
179
180 mov m_bits, [stream + _internal_state_bitbuf_m_bits]
181 mov m_bit_count %+ d, [stream + _internal_state_bitbuf_m_bit_count]
182 mov hufftables, [stream + _hufftables]
183
184 mov file_start, [stream + _next_in]
185
186 mov f_i %+ d, dword [stream + _total_in]
187 sub file_start, f_i ; file_start = next_in - total_in, so [file_start + f_i] is the current input byte
188
189 mov f_end_i %+ d, [stream + _avail_in]
190 add f_end_i, f_i ; f_end_i = total_in + avail_in
191
192 ; f_end_i -= LA;
193 sub f_end_i, LA
194 mov [rsp + f_end_i_mem_offset], f_end_i ; stack copy; r12 is reused under other names later and reloaded from here
195 ; if (f_end_i <= f_i) goto input_end;   (signed compare)
196
197 cmp f_end_i, f_i
198 jle input_end
199
200 ; for (f_i = f_start_i; f_i < f_end_i; f_i++) {
201 MARK __body_compute_hash_ %+ ARCH
; Load the input at f_i twice: into xdata (vector copy) and curr_data (8 bytes).
202 MOVDQU xdata, [file_start + f_i]
203 mov curr_data, [file_start + f_i]
204 mov tmp3, curr_data
205 mov tmp6, curr_data ; this copy is consumed by write_first_byte
206
207 compute_hash hash, curr_data ; hash for position f_i (compute_hash is a macro defined outside this file)
208
209 shr tmp3, 8 ; drop one byte: data starting at f_i + 1
210 compute_hash hash2, tmp3 ; hash for position f_i + 1
211
212 and hash, HASH_MASK
213 and hash2, HASH_MASK
214
215 cmp dword [stream + _internal_state_has_hist], 0
216 je write_first_byte ; has_hist == 0: take the write_first_byte path instead of the main loop
217
218 jmp loop2
219 align 16
220
221 loop2:
; Main loop. Each pass examines two input positions: the "first" one at f_i
; (hash / dist / len) and the "second" one at f_i + 1 (hash2 / dist2 / len2).
; It then branches to the code that emits a match or, by default, two literals.
; On entry: hash and hash2 are masked hash values for the two positions, and
; xdata / curr_data are expected to hold the input bytes starting at f_i.
; The uppercase MOVQ/MOVD/PINSRD/PAND/PSRLDQ/SHLX forms and compute_hash /
; get_lit_code are macros defined outside this file; the comments below assume
; they behave like the operations they are named after.
222 ; if (state->bitbuf.is_full()) {
223 cmp m_out_buf, [stream + _internal_state_bitbuf_m_out_end]
224 ja output_end
225
226 xor dist, dist
227 xor dist2, dist2
228 xor tmp3, tmp3
229
230 lea tmp1, [file_start + f_i] ; tmp1 = address of the first position
231
232 mov dist %+ w, f_i %+ w
233 dec dist
234 sub dist %+ w, word [stream + _internal_state_head + 2 * hash] ; low 16 bits of dist = (f_i - 1) - head[hash]
235 mov [stream + _internal_state_head + 2 * hash], f_i %+ w ; head[hash] = f_i
236
237 inc f_i ; f_i now indexes the second position
238
239 MOVQ tmp6, xdata
240 shr tmp5, 16 ; tmp5 is rax (curr_data): drop two bytes -> data starting at first position + 2
241 mov tmp8, tmp5
242 compute_hash tmp6, tmp5 ; tmp6 = hash for first position + 2 (masked later via xhash)
243
244 mov dist2 %+ w, f_i %+ w
245 dec dist2
246 sub dist2 %+ w, word [stream + _internal_state_head + 2 * hash2] ; low 16 bits of dist2 = (f_i - 1) - head[hash2]
247 mov [stream + _internal_state_head + 2 * hash2], f_i %+ w ; head[hash2] = f_i
248
249 ; if ((dist-1) < (D-1)) {
250 and dist %+ d, (D-1) ; keep only the bits selected by D-1
251 neg dist ; negative, so adding it to tmp1 steps backwards in the input
252
253 shr tmp8, 8 ; data starting at first position + 3
254 compute_hash tmp2, tmp8 ; tmp2 is rcx, shared with hash2, which has already been used above
255
256 and dist2 %+ d, (D-1)
257 neg dist2
258
259 MARK __body_compare_ %+ ARCH
260 ;; Check for long len/dist match (>7) with first literal
261 MOVQ len, xdata
262 mov curr_data, len
263 PSRLDQ xdata, 1 ; shift xdata down one byte so it starts at the second position
264 xor len, [tmp1 + dist - 1] ; a zero byte in len means that input byte equals the candidate's
265 jz compare_loop ; all 8 bytes equal: go measure the full match length
266
; Stash the two look-ahead hashes: dword 0 = tmp6, dword 1 = tmp2, both masked with xmask.
267 MOVD xhash, tmp6 %+ d
268 PINSRD xhash, tmp2 %+ d, 1
269 PAND xhash, xhash, xmask
270
271 ;; Check for len/dist match (>7) with second literal
272 MOVQ len2, xdata
273 xor len2, [tmp1 + dist2]
274 jz compare_loop2 ; all 8 bytes equal at the second position
275
276 ;; Speculatively load the code for the first literal
277 movzx tmp1, curr_data %+ b
278 get_lit_code tmp1, code3, rcx, hufftables ; code3 = code; rcx is used below as its bit length
279
280 ;; Check for len/dist match for first literal
281 test len %+ d, 0xFFFFFFFF
282 jz len_dist_huffman_pre ; low 4 bytes equal: encode a match at the first position
283
284 ;; Speculatively load the code for the second literal
285 shr curr_data, 8
286 and curr_data, 0xff
287 get_lit_code curr_data, code2, code_len2, hufftables
288
; Pack both literals into code2: first literal in the low bits, second above it.
289 SHLX code2, code2, rcx
290 or code2, code3
291 add code_len2, rcx
292
293 ;; Check for len/dist match for second literal
294 test len2 %+ d, 0xFFFFFFFF
295 jnz write_lit_bits ; low 4 bytes differ somewhere: emit the two literals
; otherwise execution falls through to len_dist_lit_huffman_pre
296
; Emit the first position as a literal, followed by a length/distance pair for
; the match found at the second position.
; The _pre entry is reached by fall-through from the main loop with
;   len2  = XOR of 8 input bytes against the candidate (low 4 bytes are zero),
;   rcx   = bit length of the first literal's code, code3 = that code.
; The plain entry is reached from compare_loop2 with len2, code3 and code_len3
; already set by that code.
; get_dist_code / get_len_code / write_bits / compute_hash are macros defined
; outside this file.
297 MARK __body_len_dist_lit_huffman_ %+ ARCH
298 len_dist_lit_huffman_pre:
299 mov code_len3, rcx
300 bsf len2, len2 ; bit index of the first differing bit
301 shr len2, 3 ; divided by 8 = number of leading equal bytes
302
303 len_dist_lit_huffman:
304 neg dist2 ; undo the negation applied in the main loop
305
306 %ifndef LONGER_HUFFTABLE
307 mov tmp4, dist2
308 get_dist_code tmp4, code4, code_len2, hufftables ;; clobbers dist, rcx
309 %else
310 get_dist_code dist2, code4, code_len2, hufftables
311 %endif
312 get_len_code len2, code, rcx, hufftables ;; rcx is code_len
313
; code4 = (dist code << len code length) | len code
314 SHLX code4, code4, rcx
315 or code4, code
316 add code_len2, rcx
317
318 add f_i, len2 ; advance past the match
319 neg len2 ; keep -len2 for the hash-update index computed below
320
321 MOVQ tmp5, xdata
322 shr tmp5, 24
323 compute_hash tmp4, tmp5 ; third look-ahead hash, stored into head[] below
324 and tmp4, HASH_MASK
325
; Prepend the literal: code4 = (code4 << literal length) | literal code
326 SHLX code4, code4, code_len3
327 or code4, code3
328 add code_len2, code_len3
329
330 ;; Setup for updating hash
331 lea tmp3, [f_i + len2 + 1] ; tmp3 <= k
332
; Reload the input at the new f_i for the next pass.
333 MOVDQU xdata, [file_start + f_i]
334 mov curr_data, [file_start + f_i]
335 mov curr_data2, curr_data
336
; Record three consecutive positions (k, k+1, k+2) in head[] using the two
; hashes stashed in xhash and the one just computed in tmp4.
337 MOVD hash %+ d, xhash
338 PEXTRD hash2 %+ d, xhash, 1
339 mov [stream + _internal_state_head + 2 * hash], tmp3 %+ w
340
341 compute_hash hash, curr_data ; hash for the new f_i
342
343 add tmp3,1
344 mov [stream + _internal_state_head + 2 * hash2], tmp3 %+ w
345
346 add tmp3, 1
347 mov [stream + _internal_state_head + 2 * tmp4], tmp3 %+ w
348
349 write_bits m_bits, m_bit_count, code4, code_len2, m_out_buf, tmp4
350 mov f_end_i, [rsp + f_end_i_mem_offset] ; r12 was used as dist2/code4; restore f_end_i
351
352 shr curr_data2, 8
353 compute_hash hash2, curr_data2 ; hash for the new f_i + 1
354
355 %ifdef NO_LIMIT_HASH_UPDATE
; Optional: also record every remaining position inside the match in head[].
356 loop3:
357 add tmp3,1
358 cmp tmp3, f_i
359 jae loop3_done
360 mov tmp6, [file_start + tmp3]
361 compute_hash tmp4, tmp6
362 and tmp4 %+ d, HASH_MASK
363 ; state->head[hash] = k;
364 mov [stream + _internal_state_head + 2 * tmp4], tmp3 %+ w
365 jmp loop3
366 loop3_done:
367 %endif
368 ; hash = compute_hash(state->file_start + f_i) & HASH_MASK;
369 and hash %+ d, HASH_MASK
370 and hash2 %+ d, HASH_MASK
371
372 ; continue
373 cmp f_i, f_end_i
374 jl loop2
375 jmp input_end
376 ;; encode as dist/len
;; Emit a length/distance pair for a match found at the first position.
;; The _pre entry is reached from the main loop with len = XOR of 8 input bytes
;; against the candidate (low 4 bytes are zero). The plain entry is reached
;; from compare_loop with len already set by that code.
;; get_dist_code / get_len_code / write_bits / compute_hash are macros defined
;; outside this file.
377
378 MARK __body_len_dist_huffman_ %+ ARCH
379 len_dist_huffman_pre:
380 bsf len, len ; bit index of the first differing bit
381 shr len, 3 ; divided by 8 = number of leading equal bytes
382
383 len_dist_huffman:
384 dec f_i ; the main loop already incremented f_i; step back to the first position
385 neg dist ; undo the negation applied in the main loop
386
387 ; get_dist_code(dist, &code2, &code_len2);
388 %ifndef LONGER_HUFFTABLE
389 mov tmp3, dist ; since code2 and dist are rbx
390 get_dist_code tmp3, code2, code_len2, hufftables ;; clobbers dist, rcx
391 %else
392 get_dist_code dist, code2, code_len2, hufftables
393 %endif
394 ; get_len_code(len, &code, &code_len);
395 get_len_code len, code, rcx, hufftables ;; rcx is code_len
396
397 ; code2 <<= code_len
398 ; code2 |= code
399 ; code_len2 += code_len
400 SHLX code2, code2, rcx
401 or code2, code
402 add code_len2, rcx
403
404 ;; Setup for updating hash
405 lea tmp3, [f_i + 2] ; tmp3 <= k
406 add f_i, len ; advance past the match
407
; Record positions k and k+1 in head[] using the two hashes stashed in xhash.
408 MOVD hash %+ d, xhash
409 PEXTRD hash2 %+ d, xhash, 1
410 mov [stream + _internal_state_head + 2 * hash], tmp3 %+ w
411 add tmp3,1
412 mov [stream + _internal_state_head + 2 * hash2], tmp3 %+ w
413
; Reload the input at the new f_i for the next pass.
414 MOVDQU xdata, [file_start + f_i]
415 mov curr_data, [file_start + f_i]
416 mov curr_data2, curr_data
417 compute_hash hash, curr_data ; hash for the new f_i
418
419 write_bits m_bits, m_bit_count, code2, code_len2, m_out_buf, tmp7
420 mov f_end_i, [rsp + f_end_i_mem_offset] ; tmp7 is r12, same register as f_end_i; restore it
421
422 shr curr_data2, 8
423 compute_hash hash2, curr_data2 ; hash for the new f_i + 1
424
425 %ifdef NO_LIMIT_HASH_UPDATE
; Optional: also record every remaining position inside the match in head[].
426 loop4:
427 add tmp3,1
428 cmp tmp3, f_i
429 jae loop4_done
430 mov tmp6, [file_start + tmp3]
431 compute_hash tmp4, tmp6
432 and tmp4, HASH_MASK
433 mov [stream + _internal_state_head + 2 * tmp4], tmp3 %+ w
434 jmp loop4
435 loop4_done:
436 %endif
437
438 ; hash = compute_hash(state->file_start + f_i) & HASH_MASK;
439 and hash %+ d, HASH_MASK
440 and hash2 %+ d, HASH_MASK
441
442 ; continue
443 cmp f_i, f_end_i
444 jl loop2
445 jmp input_end
446
; Emit the literal code(s) already packed in code2 / code_len2 and advance.
; Entered with f_i one past the first position, so the extra increment below
; moves f_i two positions on from the first one. The hashes for the next pass
; come from xhash (dword 0 -> hash, dword 1 -> hash2), already masked.
; When the loop condition fails, execution falls through to input_end.
447 MARK __body_write_lit_bits_ %+ ARCH
448 write_lit_bits:
449 MOVDQU xdata, [file_start + f_i + 1] ; input at the new f_i (f_i is incremented two lines below)
450 mov f_end_i, [rsp + f_end_i_mem_offset] ; r12 may have been used as dist2; restore f_end_i
451 add f_i, 1
452 mov curr_data, [file_start + f_i]
453
454 MOVD hash %+ d, xhash
455
456 write_bits m_bits, m_bit_count, code2, code_len2, m_out_buf, tmp3
457
458 PEXTRD hash2 %+ d, xhash, 1
459
460 ; continue
461 cmp f_i, f_end_i
462 jl loop2
463
464 input_end:
; Input is used up (f_i >= f_end_i): choose the stream's next state.
; state = ZSTATE_BODY, replaced by ZSTATE_FLUSH_READ_BUFFER when
; end_of_stream != 0 or flush != _NO_FLUSH.
465 mov tmp1, ZSTATE_FLUSH_READ_BUFFER
466 mov tmp5, ZSTATE_BODY
467 cmp dword [stream + _end_of_stream], 0
468 cmovne tmp5, tmp1
469 cmp dword [stream + _flush], _NO_FLUSH
470 cmovne tmp5, tmp1
471 mov dword [stream + _internal_state_state], tmp5 %+ d
472
473 output_end:
; Common exit: write the working state back to the stream, restore registers
; and return. Jumps that land here directly (output buffer full) skip the
; state update above, so the state field is left unchanged on that path.
; f_end_i (r12) must hold the real end index here; the code paths that reuse
; r12 under another name reload it from the stack before looping.
474 ;; update input buffer
475 add f_end_i, LA ; undo the LA adjustment: f_end_i = end of available input
476 mov [stream + _total_in], f_i %+ d
477 add file_start, f_i
478 mov [stream + _next_in], file_start ; next_in = address of the first unconsumed byte
479 sub f_end_i, f_i
480 mov [stream + _avail_in], f_end_i %+ d ; avail_in = bytes left unconsumed
481
482 ;; update output buffer
483 mov [stream + _next_out], m_out_buf
484 sub m_out_buf, [stream + _internal_state_bitbuf_m_out_start] ; m_out_buf = bytes written during this call
485 sub [stream + _avail_out], m_out_buf %+ d
486 add [stream + _total_out], m_out_buf %+ d
487
; Persist the pending bit buffer and its bit count.
488 mov [stream + _internal_state_bitbuf_m_bits], m_bits
489 mov [stream + _internal_state_bitbuf_m_bit_count], m_bit_count %+ d
490
; Restore the registers saved by the prologue.
491 mov rbx, [rsp + gpr_save_mem_offset + 0*8]
492 mov rsi, [rsp + gpr_save_mem_offset + 1*8]
493 mov rdi, [rsp + gpr_save_mem_offset + 2*8]
494 mov rbp, [rsp + gpr_save_mem_offset + 3*8]
495 mov r12, [rsp + gpr_save_mem_offset + 4*8]
496 mov r13, [rsp + gpr_save_mem_offset + 5*8]
497 mov r14, [rsp + gpr_save_mem_offset + 6*8]
498 mov r15, [rsp + gpr_save_mem_offset + 7*8]
499
500 %ifndef ALIGN_STACK
501 add rsp, stack_size
502 %else
; rbp was just reloaded with the frame value saved by the prologue; use it to
; discard the aligned frame, then pop the caller's rbp.
503 mov rsp, rbp
504 pop rbp
505 %endif
506 ret
507
; Reached from the main loop when the first 8 bytes at the first position equal
; the candidate. tmp1 = address of the first position, dist = negated distance
; value, tmp6 / tmp2 = the two look-ahead hashes.
508 MARK __body_compare_loops_ %+ ARCH
509 compare_loop:
; Stash the look-ahead hashes in xhash (dword 0 = tmp6, dword 1 = tmp2), masked
; with xmask. This must happen before tmp2 (rcx) is overwritten just below.
510 MOVD xhash, tmp6 %+ d
511 PINSRD xhash, tmp2 %+ d, 1
512 PAND xhash, xhash, xmask
513 lea tmp2, [tmp1 + dist - 1] ; tmp2 = address of the candidate match
; The compare250* macros are defined outside this file; presumably they compare
; the bytes at tmp1 and tmp2 and leave the match length in len - TODO confirm.
; COMPARE_TYPE selects which variant is used (general, xmm or ymm temporaries).
514 %if (COMPARE_TYPE == 1)
515 compare250 tmp1, tmp2, len, tmp3
516 %elif (COMPARE_TYPE == 2)
517 compare250_x tmp1, tmp2, len, tmp3, xtmp0, xtmp1
518 %elif (COMPARE_TYPE == 3)
519 compare250_y tmp1, tmp2, len, tmp3, ytmp0, ytmp1
520 %else
; NOTE(review): the "% error" line looks like a second, deliberately invalid
; line to make assembly fail for an unknown COMPARE_TYPE - confirm.
521 %error Unknown Compare type COMPARE_TYPE
522 % error
523 %endif
524 jmp len_dist_huffman
525
526 compare_loop2:
; Reached from the main loop when the first 8 bytes at the second position
; equal the candidate. xhash has already been filled on this path. tmp1 =
; address of the first position, dist2 = negated distance value, curr_data =
; the 8 input bytes at the first position.
527 lea tmp2, [tmp1 + dist2] ; tmp2 = address of the candidate match
528 add tmp1, 1 ; tmp1 = address of the second position
; The compare250* macros are defined outside this file; presumably they compare
; the bytes at tmp1 and tmp2 and leave the match length in len2 - TODO confirm.
529 %if (COMPARE_TYPE == 1)
530 compare250 tmp1, tmp2, len2, tmp3
531 %elif (COMPARE_TYPE == 2)
532 compare250_x tmp1, tmp2, len2, tmp3, xtmp0, xtmp1
533 %elif (COMPARE_TYPE == 3)
534 compare250_y tmp1, tmp2, len2, tmp3, ytmp0, ytmp1
535 %else
; NOTE(review): the "% error" line looks like a second, deliberately invalid
; line to make assembly fail for an unknown COMPARE_TYPE - confirm.
536 %error Unknown Compare type COMPARE_TYPE
537 % error
538 %endif
; The first position is still emitted as a literal: look up the code for its byte.
539 and curr_data, 0xff
540 get_lit_code curr_data, code3, code_len3, hufftables
541 jmp len_dist_lit_huffman
542
; Taken instead of the main loop when has_hist == 0: there is nothing earlier to
; match against, so the byte at f_i is emitted as a literal.
; On entry: hash / hash2 = masked hashes for f_i and f_i + 1, tmp6 and curr_data
; = the 8 input bytes at f_i.
543 MARK __write_first_byte_ %+ ARCH
544 write_first_byte:
545 cmp m_out_buf, [stream + _internal_state_bitbuf_m_out_end]
546 ja output_end ; output buffer full: exit without consuming input
547
548 mov dword [stream + _internal_state_has_hist], 1
549
550 mov [stream + _internal_state_head + 2 * hash], f_i %+ w ; head[hash] = f_i
551
; Prepare the hashes that write_lit_bits hands to the next pass: the hash for
; f_i + 1 (already in hash2) and a new hash for f_i + 2.
552 mov hash, hash2
553 shr tmp6, 16 ; data starting at f_i + 2
554 compute_hash hash2, tmp6
555
556 MOVD xhash, hash %+ d
557 PINSRD xhash, hash2 %+ d, 1
558 PAND xhash, xhash, xmask ; mask both dwords (the new hash2 was not masked yet)
559
; Single literal for the byte at f_i, in the code2 / code_len2 pair that
; write_lit_bits emits.
560 and curr_data, 0xff
561 get_lit_code curr_data, code2, code_len2, hufftables
562 jmp write_lit_bits
563
; Constant data used by the routine above.
564 section .data
565 align 16
; mask: HASH_MASK repeated in four dwords; loaded into xmask and used to mask
; the hash values packed in xhash.
566 mask: dd HASH_MASK, HASH_MASK, HASH_MASK, HASH_MASK
; const_D: the value D as a qword.
; NOTE(review): not referenced by the code visible in this file.
567 const_D: dq D