]> git.proxmox.com Git - ceph.git/blame - ceph/src/isa-l/igzip/igzip_stateless.asm
buildsys: fix parallel builds
[ceph.git] / ceph / src / isa-l / igzip / igzip_stateless.asm
CommitLineData
7c673cae
FG
1;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
3;
4; Redistribution and use in source and binary forms, with or without
5; modification, are permitted provided that the following conditions
6; are met:
7; * Redistributions of source code must retain the above copyright
8; notice, this list of conditions and the following disclaimer.
9; * Redistributions in binary form must reproduce the above copyright
10; notice, this list of conditions and the following disclaimer in
11; the documentation and/or other materials provided with the
12; distribution.
13; * Neither the name of Intel Corporation nor the names of its
14; contributors may be used to endorse or promote products derived
15; from this software without specific prior written permission.
16;
17; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29%include "options.asm"
30
31%include "lz0a_const.asm"
32%include "data_struct2.asm"
33%include "bitbuf2.asm"
34%include "huffman.asm"
35%include "igzip_compare_types.asm"
36%include "reg_sizes.asm"
37
38%include "stdmac.asm"
39
;; Look-ahead constants for the stateless deflate body below.
;; LAST_BYTES_COUNT: the final LAST_BYTES_COUNT input bytes are only ever
;;   emitted as plain literals (see final_bytes).
;; LA_STATELESS: subtracted from avail_in to obtain the bound of the main
;;   loop (loop2); the remaining tail is handled by loop2_finish.
40%define LAST_BYTES_COUNT 3 ; Bytes to prevent reading out of array bounds
41%define LA_STATELESS 264 ; Max number of bytes read in loop2 rounded up to 8 byte boundary
42
;; MARK <label>
;; When DEBUG is defined, emits <label> as a global symbol at the current
;; position. Otherwise the macro expands to nothing, so MARK lines cost no
;; code in non-DEBUG builds.
43%ifdef DEBUG
44%macro MARK 1
45global %1
46%1:
47%endm
48%else
49%macro MARK 1
50%endm
51%endif
52
53;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
54;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
55;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
56
;; Symbolic register names used by isal_deflate_body_stateless.
;; Several names map onto the same physical register, so two names that
;; share a register can never be live at the same time. Notable sharing:
;;   rcx: tmp2 / hash2        rax: curr_data / code / tmp5
;;   rbx: tmp4 / dist / code2 rdx: hash / len / code_len3
;;   rsi: tmp1 / code_len2    r8 : curr_data2 / len2 / tmp6
;;   r12: f_end_i / dist2 / tmp7 / code4 (f_end_i is reloaded from the
;;        stack slot f_end_i_mem_offset after r12 has been reused)
;;   r13: tmp3 / code3
57%define tmp2 rcx
58%define hash2 rcx
59
60%define curr_data rax
61%define code rax
62%define tmp5 rax
63
64%define tmp4 rbx
65%define dist rbx
66%define code2 rbx
67
68%define hash rdx
69%define len rdx
70%define code_len3 rdx
71
72%define tmp1 rsi
73%define code_len2 rsi
74
;; Base pointer of the input buffer (loaded from stream->next_in).
75%define file_start rdi
76
;; Bit-buffer state: number of valid bits currently held in m_bits.
77%define m_bit_count rbp
78
79%define curr_data2 r8
80%define len2 r8
81%define tmp6 r8
82
;; Bit-buffer state: pending output bits.
83%define m_bits r9
84
;; Current input index, relative to file_start.
85%define f_i r10
86
;; Current output write pointer.
87%define m_out_buf r11
88
89%define f_end_i r12
90%define dist2 r12
91%define tmp7 r12
92%define code4 r12
93
94%define tmp3 r13
95%define code3 r13
96
;; Pointer to the isal_zstream argument; held for the whole function.
97%define stream r14
98
;; Loaded from stream->hufftables; passed to the get_*_code macros.
99%define hufftables r15
100
101;; NOTE(review): the original comment here read "GPR r8 & r15 can be used",
;; but both registers are assigned above (r8: curr_data2/len2/tmp6,
;; r15: hufftables) -- the remark looks stale; confirm before relying on it.
102
103%define xtmp0 xmm0 ; tmp
104%define xtmp1 xmm1 ; tmp
105
106%define ytmp0 ymm0 ; tmp
107%define ytmp1 ymm1 ; tmp
108
109
110;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
111;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
112;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
113
114
;; Stack frame layout (offsets from rsp after the prologue):
;;   [  0 ..   7]  blen_mem_offset      local variable
;;   [  8 ..  15]  f_end_i_mem_offset   spilled main-loop bound (f_end_i)
;;   [ 16 ..  79]  gpr_save_mem_offset  rbx, rsi, rdi, rbp, r12-r15
;;   [ 80 .. 143]  xmm_save_mem_offset  reserved for xmm saves
;;   [144 .. 151]  padding
;; stack_size = 2*8 + 8*8 + 4*16 + 8 = 152 bytes.
;; NOTE(review): no instruction in the visible code references
;; blen_mem_offset or xmm_save_mem_offset; the space is reserved but unused
;; here -- confirm nothing outside this view depends on it.
115blen_mem_offset equ 0 ; local variable (8 bytes)
116f_end_i_mem_offset equ 8
117gpr_save_mem_offset equ 16 ; gpr save area (8*8 bytes)
118xmm_save_mem_offset equ 16 + 8*8 ; xmm save area (4*16 bytes) (16 byte aligned)
119stack_size equ 2*8 + 8*8 + 4*16 + 8
120;;; 8 because stack address is odd multiple of 8 after a function call and
121;;; we want it aligned to 16 bytes
122
;; ---------------------------------------------------------------------
;; isal_deflate_body_stateless_<ARCH>
;; Compresses stream->avail_in bytes starting at stream->next_in into the
;; output buffer at stream->next_out in a single pass, writing Huffman
;; coded literals and length/distance pairs followed by an end-of-block
;; symbol. On exit (label "end") the stream's next_in/avail_in/total_in,
;; next_out/avail_out/total_out and bit-buffer fields are updated.
;; Returns immediately, touching nothing, when avail_in == 0.
;; If the output pointer passes the computed end of the output buffer the
;; function stops early without writing the end-of-block symbol;
;; _internal_state_has_eob is 1 only when that symbol was written.
;; ---------------------------------------------------------------------
123; void isal_deflate_body_stateless ( isal_zstream *stream )
124; arg 1: rcx: addr of stream (for elf64 output it arrives in rdi and is copied to rcx below)
125global isal_deflate_body_stateless_ %+ ARCH
126isal_deflate_body_stateless_ %+ ARCH %+ :
127%ifidn __OUTPUT_FORMAT__, elf64
128 mov rcx, rdi
129%endif
130
131 ;; do nothing if (avail_in == 0)
 ;; (checked before the stack frame is created, so a bare ret is safe)
132 cmp dword [rcx + _avail_in], 0
133 jne skip1
134 ret
135skip1:
136
 ;; Prologue. With ALIGN_STACK, rbp keeps the incoming rsp so the frame can
 ;; be aligned down to 16 bytes; rbp is later reused as m_bit_count and is
 ;; recovered from the GPR save area in the epilogue.
137%ifdef ALIGN_STACK
138 push rbp
139 mov rbp, rsp
140 sub rsp, stack_size
141 and rsp, ~15
142%else
143 sub rsp, stack_size
144%endif
145
 ;; Save every general-purpose register this function overwrites and later
 ;; restores at "end".
146 mov [rsp + gpr_save_mem_offset + 0*8], rbx
147 mov [rsp + gpr_save_mem_offset + 1*8], rsi
148 mov [rsp + gpr_save_mem_offset + 2*8], rdi
149 mov [rsp + gpr_save_mem_offset + 3*8], rbp
150 mov [rsp + gpr_save_mem_offset + 4*8], r12
151 mov [rsp + gpr_save_mem_offset + 5*8], r13
152 mov [rsp + gpr_save_mem_offset + 6*8], r14
153 mov [rsp + gpr_save_mem_offset + 7*8], r15
154
155 mov stream, rcx
 ;; has_eob is cleared here and set to 1 only at write_eob.
156 mov dword [stream + _internal_state_has_eob], 0
157
158 ; state->bitbuf.set_buf(stream->next_out, stream->avail_out);
 ; m_out_start = next_out; m_out_end = next_out + avail_out - SLOP
 ; (SLOP is defined outside this file view.)
159 mov m_out_buf, [stream + _next_out]
160 mov [stream + _internal_state_bitbuf_m_out_start], m_out_buf
161 mov tmp1 %+ d, [stream + _avail_out]
162 add tmp1, m_out_buf
163 sub tmp1, SLOP
164
 ;; NOTE(review): skip_SLOP is not referenced by any jump in the visible code.
165skip_SLOP:
166 mov [stream + _internal_state_bitbuf_m_out_end], tmp1
167
 ;; Load the bit-buffer state and the Huffman table pointer from the stream.
168 mov m_bits, [stream + _internal_state_bitbuf_m_bits]
169 mov m_bit_count %+ d, [stream + _internal_state_bitbuf_m_bit_count]
170 mov hufftables, [stream + _hufftables]
171 ; state->b_bytes_valid = stream->avail_in;
172 mov f_end_i %+ d, [stream + _avail_in]
173 mov [stream + _internal_state_b_bytes_valid], f_end_i %+ d
174
175 mov f_i, 0
176 mov file_start, [stream + _next_in]
177 mov [stream + _internal_state_file_start], file_start
178
179 ; f_end_i -= LA;
 ; The 32-bit load above zero-extended avail_in, so this 64-bit subtract can
 ; go negative for short inputs; the bound is spilled because r12 is reused.
180 sub f_end_i, LA_STATELESS
181 mov [rsp + f_end_i_mem_offset], f_end_i
182 ; if (f_end_i <= 0) continue;
 ; (signed test: inputs of at most LA_STATELESS bytes skip the main loop)
183 cmp f_end_i, 0
184 jle end_loop_2
185
186 ; for (f_i = f_start_i; f_i < f_end_i; f_i++) {
187MARK __stateless_compute_hash_ %+ ARCH
 ;; Load 4 input bytes at f_i (f_i is 0 here).
188 mov curr_data %+ d, [file_start + f_i]
189
 ;; Stop early if the output pointer is already past m_out_end.
190 cmp m_out_buf, [stream + _internal_state_bitbuf_m_out_end]
191 ja end
192
193 ;; Encode first byte in the stream as a literal
 ;; and record its position in the hash head table (16-bit entries).
194 compute_hash hash, curr_data
195 and hash %+ d, HASH_MASK
196 mov [stream + _internal_state_head + 2 * hash], f_i %+ w
197 and curr_data, 0xff
198 get_lit_code curr_data, code2, code_len2, hufftables
 ;; write_lit_bits advances f_i, emits code2/code_len2 and enters loop2.
199 jmp write_lit_bits
200
;; ---------------------------------------------------------------------
;; loop2: main loop. Each iteration examines two adjacent positions,
;; f_i ("first") and f_i + 1 ("second"), and picks one of:
;;   - two literals                          (write_lit_bits / encode_2_literals)
;;   - a match at the first position         (len_dist_huffman[_pre], compare_loop)
;;   - a literal, then a match at the second (len_dist_lit_huffman[_pre], compare_loop2)
;; On entry, as set up by write_lit_bits / update_hash_for_symbol:
;;   curr_data, curr_data2 = dword loaded at file_start + f_i
;;   hash                  = compute_hash of that dword, not yet masked
;; ---------------------------------------------------------------------
201 align 16
202
203loop2:
 ;; hash2 = crc32 of the dword shifted right by one byte, i.e. the hash
 ;; input for the second position.
204 shr curr_data2, 8
205 xor hash2 %+ d, hash2 %+ d
206 crc32 hash2 %+ d, curr_data2 %+ d
207
208 ; hash = compute_hash(state->file_start + f_i) & HASH_MASK;
209 and hash %+ d, HASH_MASK
210 and hash2 %+ d, HASH_MASK
211
212 ; if (state->bitbuf.is_full()) {
213 cmp m_out_buf, [stream + _internal_state_bitbuf_m_out_end]
214 ja end
215
 ;; Clear the full registers: dist/dist2 are written 16 bits at a time
 ;; below, and tmp3 serves as the zero source for the cmovae instructions.
216 xor dist, dist
217 xor dist2, dist2
218 xor tmp3, tmp3
219
 ;; tmp1 = address of the first position, tmp6 = address one byte before it.
220 lea tmp1, [file_start + f_i]
221 lea tmp6, [tmp1 - 1]
222
 ;; dist = (uint16_t)(f_i - head[hash])
223 mov dist %+ w, f_i %+ w
224 sub dist %+ w, word [stream + _internal_state_head + 2 * hash]
225
226 ; state->head[hash] = (uint16_t) f_i;
227 mov [stream + _internal_state_head + 2 * hash], f_i %+ w
228
 ;; From here until a branch target adjusts it, f_i is the SECOND position.
229 inc f_i
230
 ;; dist2 = (uint16_t)(f_i - head[hash2]) - 1 (the -1 is undone by the
 ;; inc dist2 after the range check below)
231 mov dist2 %+ w, f_i %+ w
232 sub dist2 %+ w, word [stream + _internal_state_head + 2 * hash2]
233 dec dist2
234
235 ; state->head[hash2] = (uint16_t) f_i;
236 mov [stream + _internal_state_head + 2 * hash2], f_i %+ w
237
 ;; tmp2 = candidate match address for the first position. hash2 and tmp2
 ;; are both rcx; hash2 is no longer needed at this point.
238 mov tmp2, tmp1
239 sub tmp2, dist
240 dec dist
241
242 ; if ((dist-1) < (D-1)) {
 ; Unsigned range check: a dist of 0 wraps in the 32-bit compare and is
 ; rejected too. A rejected dist is replaced by 1 (0 then inc) and tmp2 by
 ; the address of the previous byte.
243 cmp dist %+ d, (D-1)
244 cmovae tmp2, tmp6
245 cmovae dist, tmp3
246 inc dist
247
 ;; Same range check for the second position; a rejected dist2 becomes 1.
248 cmp dist2 %+ d, (D-1)
249 cmovae dist2, tmp3
250 inc dist2
251
252MARK __stateless_compare_ %+ ARCH
253 ; len = compare258(state->file_start + f_i,
254 ; state->file_start + f_i - dist);
255
256 ;; Speculatively load distance code (except for when large windows are used)
 ;; Note: dist and code2 are the same register (rbx).
257 get_packed_dist_code dist, code2, hufftables
258
259 ;; Check for long len/dist match (>7) with first literal
 ;; len = XOR of 8 bytes at the first position with 8 bytes at the
 ;; candidate; zero means all 8 bytes are equal.
260 mov len, [tmp1]
261 xor len, [tmp2]
262 jz compare_loop
263
264%ifdef USE_HSWNI
 ;; tmp3 = mask of all bits up to and including the lowest differing bit
 ;; of the first comparison, widened to cover at least the low 3 bytes.
265 blsmsk tmp3, len
266 or tmp3, 0xFFFFFF
267%endif
268
 ;; Repeat the setup for the second position (f_i was incremented above).
269 lea tmp1, [file_start + f_i]
270 mov tmp2, tmp1
271 sub tmp2, dist2
272
273 ;; Speculatively load distance code (except for when large windows are used)
 ;; Note: dist2 and code4 are the same register (r12).
274 get_packed_dist_code dist2, code4, hufftables
275
276 ;; Check for len/dist match (>7) with second literal
277 mov len2, [tmp1]
278 xor len2, [tmp2]
279 jz compare_loop2
280
281%ifdef USE_HSWNI
282 ;; Check for len/dist match for first literal
 ;; If the second comparison has no differing bit inside tmp3, it matches
 ;; at least 3 bytes and at least as far as the first one: take
 ;; "literal + match at second position".
283 test tmp3, len2
284 jz len_dist_lit_huffman_pre
285
 ;; tmp3 == 0xFFFFFF means the first comparison differs within its low
 ;; 3 bytes, so neither position has a usable match: emit two literals.
286 cmp tmp3, 0xFFFFFF
287 je encode_2_literals
288 jmp len_dist_huffman_pre
289
290
291MARK __stateless_len_dist_lit_huffman_ %+ ARCH
292len_dist_lit_huffman_pre:
 ;; Literal code for the byte at the first position (low byte of curr_data).
293 movzx tmp1, curr_data %+ b
294 get_lit_code tmp1, code3, code_len3, hufftables
295%else
296 ;; Speculatively load the code for the first literal
 ;; (its code length is placed in rcx so cl can drive the shift below)
297 movzx tmp1, curr_data %+ b
298 get_lit_code tmp1, code3, rcx, hufftables
299
300 ;; Check for len/dist match for first literal
 ;; (low 3 bytes of the XOR are zero => at least 3 bytes match)
301 test len, 0xFFFFFF
302 jz len_dist_huffman_pre
303
304 ;; Speculatively load the code for the second literal
305 shr curr_data, 8
306 and curr_data, 0xff
307 get_lit_code curr_data, code2, code_len2, hufftables
308
 ;; code2 = (second literal code << first literal length) | first literal code
309 shl code2, cl
310 or code2, code3
311 add code_len2, rcx
312
313 ;; Check for len/dist match for second literal
 ;; No 3-byte match at the second position either: emit both literals.
314 test len2, 0xFFFFFF
315 jnz write_lit_bits
316
317MARK __stateless_len_dist_lit_huffman_ %+ ARCH
318len_dist_lit_huffman_pre:
 ;; Keep the first literal's code length; rcx is reused below.
319 mov code_len3, rcx
320%endif
 ;; len2 = index of the lowest differing bit / 8 = number of matching bytes
 ;; at the second position.
321 bsf len2, len2
322 shr len2, 3
323
324
;; ---------------------------------------------------------------------
;; len_dist_lit_huffman: emit the literal at the first position followed by
;; a length/distance pair for the match found at the second position.
;; Inputs: code3/code_len3 = literal code and its length, len2 = match
;;         length, dist2 (or the packed code already in code4) = distance,
;;         f_i = second position.
;; ---------------------------------------------------------------------
325len_dist_lit_huffman:
326%ifndef LONGER_HUFFTABLE
327 mov tmp4, dist2
328 get_dist_code tmp4, code4, code_len2, hufftables ;; clobbers dist, rcx
329%else
330 unpack_dist_code code4, code_len2
331%endif
332 get_len_code len2, code, rcx, hufftables ;; rcx is code_len
333
 ;; code4 = (dist code << len code length) | len code
334%ifdef USE_HSWNI
335 shlx code4, code4, rcx
336%else
337 shl code4, cl
338%endif
339 or code4, code
340 add code_len2, rcx
341
 ;; Shift the length/distance bits left by the literal's code length and
 ;; place the literal code in the low bits.
342 mov rcx, code_len3
343
344%ifdef USE_HSWNI
345 shlx code4, code4, rcx
346%else
347 shl code4, cl
348%endif
349 or code4, code3
350 add code_len2, rcx
351
 ;; Move the result into code2 now: code4 shares r12 with tmp7, which is
 ;; overwritten below. code2/code_len2 are what write_bits consumes.
352 mov code2, code4
353 ;; Setup for updating hash
354 lea tmp3, [f_i + 1] ; tmp3 <= k
355 add f_i, len2
356
357 ; hash = compute_hash(state->file_start + k) & HASH_MASK;
 ; tmp7 keeps the same dword shifted right by one byte.
358 mov tmp5 %+ d, [file_start + tmp3]
359 mov tmp7, tmp5
360 shr tmp7, 8
361
362 compute_hash hash, tmp5
363 and hash %+ d, HASH_MASK
364
365 ; state->head[hash] = k;
366 mov [stream + _internal_state_head + 2 * hash], tmp3 %+ w
367
 ;; Advance k; update_hash_for_symbol continues from tmp3.
368 add tmp3,1
369
370 jmp update_hash_for_symbol
371 ;; encode as dist/len
372
;; ---------------------------------------------------------------------
;; len_dist_huffman_pre / len_dist_huffman: emit a length/distance pair for
;; the match found at the first position.
;; _pre derives len from the XOR result (short match); compare_loop jumps
;; straight to len_dist_huffman with len already computed.
;; Falls through into update_hash_for_symbol.
;; ---------------------------------------------------------------------
373MARK __stateless_len_dist_huffman_ %+ ARCH
374len_dist_huffman_pre:
 ;; len = index of the lowest differing bit / 8 = number of matching bytes.
375 bsf len, len
376 shr len, 3
377
378len_dist_huffman:
 ;; Undo the inc f_i done in loop2 so f_i is the first position again.
379 dec f_i
380
381 ; get_dist_code(dist, &code2, &code_len2);
382%ifndef LONGER_HUFFTABLE
383 mov tmp3, dist ; since code2 and dist are rbx
384 get_dist_code tmp3, code2, code_len2, hufftables ;; clobbers dist, rcx
385%else
386 unpack_dist_code code2, code_len2
387%endif
388 ; get_len_code(len, &code, &code_len);
389 get_len_code len, code, rcx, hufftables ;; rcx is code_len
390
391 ; code2 <<= code_len
392 ; code2 |= code
393 ; code_len2 += code_len
394%ifdef USE_HSWNI
395 shlx code2, code2, rcx
396%else
397 shl code2, cl
398%endif
399 or code2, code
400 add code_len2, rcx
401
402 ;; Setup for updating hash
 ;; k starts at f_i + 2; f_i then moves past the match. tmp7 receives the
 ;; dword at k for the hash update that follows.
403 lea tmp3, [f_i + 2] ; tmp3 <= k
404 add f_i, len
405 mov tmp7 %+ d, [file_start + tmp3]
406
406
;; ---------------------------------------------------------------------
;; update_hash_for_symbol: after a match has been chosen, prepare the data
;; and hash for the next loop2 iteration, record further positions covered
;; by the match in the head table, then write the pending code bits.
;; Inputs: f_i = position after the match, tmp3 = next k to record,
;;         tmp7 = data for k (used only in the LIMIT_HASH_UPDATE variant),
;;         code2/code_len2 = bits to write.
;; ---------------------------------------------------------------------
407MARK __stateless_update_hash_for_symbol_ %+ ARCH
408update_hash_for_symbol:
 ;; State expected by loop2: curr_data/curr_data2 = dword at f_i,
 ;; hash = its (unmasked) hash.
409 mov curr_data %+ d, [file_start + f_i]
410 mov curr_data2, curr_data
411 compute_hash hash, curr_data
412%ifdef LIMIT_HASH_UPDATE
413 ; only update hash twice, first hash was already calculated.
414
415 ; hash = compute_hash(state->file_start + k) & HASH_MASK;
416 compute_hash hash2, tmp7
417 and hash2 %+ d, HASH_MASK
418 ; state->head[hash] = k;
419 mov [stream + _internal_state_head + 2 * hash2], tmp3 %+ w
420
421%else
 ;; Record every position k from tmp3 up to (not including) f_i. The test
 ;; is at the bottom, so the body runs at least once.
422loop3:
423 ; hash = compute_hash(state->file_start + k) & HASH_MASK;
424 mov tmp7 %+ d, [file_start + tmp3]
425 compute_hash hash2, tmp7
426 and hash2 %+ d, HASH_MASK
427 ; state->head[hash] = k;
428 mov [stream + _internal_state_head + 2 * hash2], tmp3 %+ w
429 add tmp3,1
430 cmp tmp3, f_i
431 jl loop3
432%endif
433
434
435MARK __stateless_write_len_dist_bits_ %+ ARCH
 ;; r12 was used as dist2/code4/tmp7; reload the loop bound from its slot.
436 mov f_end_i, [rsp + f_end_i_mem_offset]
437 write_bits m_bits, m_bit_count, code2, code_len2, m_out_buf, tmp3
438
439 ; continue
 ; (signed compare, matching the signed handling of f_end_i at entry)
440 cmp f_i, f_end_i
441 jl loop2
442 jmp end_loop_2
443
444
;; ---------------------------------------------------------------------
;; encode_2_literals (USE_HSWNI only) / write_lit_bits
;; encode_2_literals builds the combined code for the bytes at the first and
;; second positions. write_lit_bits expects code2/code_len2 to be ready,
;; advances f_i by one, prepares the state for the next loop2 iteration and
;; writes the bits.
;; ---------------------------------------------------------------------
445MARK __stateless_write_lit_bits_ %+ ARCH
446%ifdef USE_HSWNI
447encode_2_literals:
 ;; First literal: low byte of curr_data; its code length goes to rcx.
448 movzx tmp1, curr_data %+ b
449 get_lit_code tmp1, code3, rcx, hufftables
450
 ;; Second literal: next byte of curr_data.
451 shr curr_data, 8
452 and curr_data, 0xff
453 get_lit_code curr_data, code2, code_len2, hufftables
454
455 ;; Calculate code associated with both literals
 ;; code2 = (second code << first length) | first code
456 shlx code2, code2, rcx
457 or code2, code3
458 add code_len2, rcx
459%endif
460write_lit_bits:
 ;; r12 may have been used as dist2/code4; reload the loop bound.
461 mov f_end_i, [rsp + f_end_i_mem_offset]
 ;; When reached from loop2, f_i is already the second position, so this
 ;; moves past both literals; from the entry code it moves past the first byte.
462 add f_i, 1
 ;; State expected by loop2: curr_data/curr_data2 = dword at f_i,
 ;; hash = its (unmasked) hash.
463 mov curr_data %+ d, [file_start + f_i]
464 mov curr_data2, curr_data
465
466 compute_hash hash, curr_data
467
468 write_bits m_bits, m_bit_count, code2, code_len2, m_out_buf, tmp3
469
470 ; continue
 ; (falls through into end_loop_2 once f_i reaches f_end_i)
471 cmp f_i, f_end_i
472 jl loop2
473
473
;; ---------------------------------------------------------------------
;; end_loop_2 / loop2_finish / encode_literal_finish
;; Tail loop for the input the main loop does not cover. It handles one
;; position per iteration and stops LAST_BYTES_COUNT bytes before the end of
;; the input. Unlike loop2, only the position at the start of each symbol is
;; recorded in the head table.
;; ---------------------------------------------------------------------
474MARK __stateless_end_loops_ %+ ARCH
475end_loop_2:
476 ;; Handle the last bytes (at most LA_STATELESS bytes)
 ;; f_end_i held avail_in - LA_STATELESS; it becomes avail_in - LAST_BYTES_COUNT.
477 add f_end_i, LA_STATELESS - LAST_BYTES_COUNT
478 cmp f_i, f_end_i
479 jge end_loop_2_finish
480
481loop2_finish:
482 ;; Check for space in out buffer
483 cmp m_out_buf, [stream + _internal_state_bitbuf_m_out_end]
484 ja end
485
486 mov curr_data %+ d, [file_start + f_i]
487 compute_hash hash, curr_data
488 and hash %+ d, HASH_MASK
489
490 ;; Calculate possible distance for length/dist pair.
 ;; dist = (uint16_t)(f_i - head[hash]); then head[hash] = f_i
491 xor dist, dist
492 mov dist %+ w, f_i %+ w
493 sub dist %+ w, word [stream + _internal_state_head + 2 * hash]
494 mov [stream + _internal_state_head + 2 * hash], f_i %+ w
495
496 ;; Check if look back distance is valid (the dec is to handle when dist = 0)
497 dec dist
498 cmp dist %+ d, (D-1)
499 jae encode_literal_finish
500 inc dist
501
502 ;; Check if look back distance is a match
 ;; tmp6 = (f_end_i + LAST_BYTES_COUNT) - f_i, i.e. input bytes remaining;
 ;; tmp1 = current address, tmp2 = candidate address.
 ;; The compare macro is defined elsewhere -- presumably it leaves the number
 ;; of matching bytes (limited by tmp6) in len; confirm against its definition.
503 lea tmp6, [f_end_i + LAST_BYTES_COUNT]
504 sub tmp6, f_i
505 lea tmp1, [file_start + f_i]
506 mov tmp2, tmp1
507 sub tmp2, dist
508 compare tmp6, tmp1, tmp2, len, tmp3
509
510 ;; Limit len to maximum value of 258
511 mov tmp2, 258
512 cmp len, 258
513 cmova len, tmp2
 ;; Too short to be worth a length/distance pair: emit a literal instead.
514 cmp len, SHORTEST_MATCH
515 jb encode_literal_finish
516
517 ;; Encode len/dist pair
518%ifndef LONGER_HUFFTABLE
519 mov tmp3, dist
520 get_dist_code tmp3, code2, code_len2, hufftables ;; clobbers dist, rcx
521%else
522 get_dist_code dist, code2, code_len2, hufftables ;; clobbers dist, rcx
523%endif
524 get_len_code len, code, rcx, hufftables ;; rcx is code_len
525
526 ;; Combine length and distance code for writing it out
 ;; code2 = (dist code << len code length) | len code
527%ifdef USE_HSWNI
528 shlx code2, code2, rcx
529%else
530 shl code2, cl
531%endif
532 or code2, code
533 add code_len2, rcx
534 write_bits m_bits, m_bit_count, code2, code_len2, m_out_buf, tmp3
535
536 ;; Setup for next loop
537 add f_i, len
538 cmp f_i, f_end_i
539 jl loop2_finish
540 jmp end_loop_2_finish
541
542encode_literal_finish:
543 ;; Encode literal
 ;; (curr_data still holds the dword loaded at the top of loop2_finish,
 ;; assuming the compare macro does not touch rax -- it is not among its
 ;; operands; confirm against the macro definition)
544 and curr_data %+ d, 0xFF
545 get_lit_code curr_data, code2, code_len2, hufftables
546 write_bits m_bits, m_bit_count, code2, code_len2, m_out_buf, tmp3
547
548 ;; Setup for next loop
549 add f_i, 1
550 cmp f_i, f_end_i
551 jl loop2_finish
;; ---------------------------------------------------------------------
;; end_loop_2_finish / final_bytes / write_eob
;; Emit whatever input bytes remain as single literals, then the
;; end-of-block symbol. Any "output full" check that fails jumps to "end"
;; without writing the end-of-block symbol.
;; ---------------------------------------------------------------------
552end_loop_2_finish:
553 cmp m_out_buf, [stream + _internal_state_bitbuf_m_out_end]
554 ja end
555
556 ;; Check if any bytes left (at most LAST_BYTES_COUNT bytes)
 ;; f_end_i becomes avail_in, the true end of the input.
557 add f_end_i, LAST_BYTES_COUNT
558 cmp f_i, f_end_i
559 jz write_eob
560
561 ;; Handle encoding last few bytes by encoding them as literals
562 xor curr_data, curr_data
563final_bytes:
 ;; One byte at a time, so no load extends past the end of the input.
564 movzx curr_data, byte [file_start + f_i]
565 get_lit_code curr_data, code2, code_len2, hufftables
566 write_bits m_bits, m_bit_count, code2, code_len2, m_out_buf, tmp3
567
 ;; Note: on this early exit f_i has not yet been advanced past the literal
 ;; that was just written.
568 cmp m_out_buf, [stream + _internal_state_bitbuf_m_out_end]
569 ja end
570
571 inc f_i
572 cmp f_i, f_end_i
573 jl final_bytes
574
575write_eob:
576 ;; Write out end of block
 ;; (symbol 256 is looked up through the same literal-code table)
577 get_lit_code 256, code2, code_len2, hufftables
578 write_bits m_bits, m_bit_count, code2, code_len2, m_out_buf, tmp3
579 mov dword [stream + _internal_state_has_eob], 1
580
580
;; ---------------------------------------------------------------------
;; end: common exit. Publishes progress to the stream, stores the
;; bit-buffer state, restores the saved registers and returns.
;; f_i = number of input bytes consumed; m_out_buf = output write pointer.
;; ---------------------------------------------------------------------
581end:
582 ;; update input buffer
 ;; total_in and avail_in are adjusted with 32-bit operations, next_in with
 ;; a 64-bit one.
583 add [stream + _total_in], f_i %+ d
584 add [stream + _next_in], f_i
585 sub [stream + _avail_in], f_i %+ d
586
587 ;; update output buffer
 ;; After the subtraction m_out_buf holds the number of bytes written
 ;; (write pointer minus m_out_start).
588 mov [stream + _next_out], m_out_buf
589 sub m_out_buf, [stream + _internal_state_bitbuf_m_out_start]
590 sub [stream + _avail_out], m_out_buf %+ d
591 add [stream + _total_out], m_out_buf %+ d
592
 ;; Store the pending bits and bit count back to the stream.
593 mov [stream + _internal_state_bitbuf_m_bits], m_bits
594 mov [stream + _internal_state_bitbuf_m_bit_count], m_bit_count %+ d
595
 ;; Restore the registers saved in the prologue. With ALIGN_STACK, the rbp
 ;; value restored here is the frame pointer set up at entry.
596 mov rbx, [rsp + gpr_save_mem_offset + 0*8]
597 mov rsi, [rsp + gpr_save_mem_offset + 1*8]
598 mov rdi, [rsp + gpr_save_mem_offset + 2*8]
599 mov rbp, [rsp + gpr_save_mem_offset + 3*8]
600 mov r12, [rsp + gpr_save_mem_offset + 4*8]
601 mov r13, [rsp + gpr_save_mem_offset + 5*8]
602 mov r14, [rsp + gpr_save_mem_offset + 6*8]
603 mov r15, [rsp + gpr_save_mem_offset + 7*8]
604
605%ifndef ALIGN_STACK
606 add rsp, stack_size
607%else
 ;; rsp was aligned down in the prologue, so recover it from rbp and then
 ;; pop the caller's rbp.
608 mov rsp, rbp
609 pop rbp
610%endif
611 ret
612
;; ---------------------------------------------------------------------
;; compare_loop / compare_loop2: out-of-line long-match paths, reached from
;; loop2 when the first 8 bytes at a position equal the 8 bytes at its
;; candidate. tmp1 = current address, tmp2 = candidate address.
;; COMPARE_TYPE selects the compare250 variant (1: general-purpose
;; registers, 2: xmm temporaries, 3: ymm temporaries). The macros are
;; defined elsewhere -- presumably they leave the match length in
;; len/len2; confirm against igzip_compare_types.asm.
;; ---------------------------------------------------------------------
613MARK __stateless_compare_loops_ %+ ARCH
614compare_loop:
615%if (COMPARE_TYPE == 1)
616 compare250 tmp1, tmp2, len, tmp3
617%elif (COMPARE_TYPE == 2)
618 compare250_x tmp1, tmp2, len, tmp3, xtmp0, xtmp1
619%elif (COMPARE_TYPE == 3)
620 compare250_y tmp1, tmp2, len, tmp3, ytmp0, ytmp1
621%else
622 %error Unknown Compare type COMPARE_TYPE
623 % error
624%endif
 ;; Match at the first position: encode it as a length/distance pair.
625 jmp len_dist_huffman
626
627compare_loop2:
628%if (COMPARE_TYPE == 1)
629 compare250 tmp1, tmp2, len2, tmp3
630%elif (COMPARE_TYPE == 2)
631 compare250_x tmp1, tmp2, len2, tmp3, xtmp0, xtmp1
632%elif (COMPARE_TYPE == 3)
633 compare250_y tmp1, tmp2, len2, tmp3, ytmp0, ytmp1
634%else
635%error Unknown Compare type COMPARE_TYPE
636 % error
637%endif
 ;; Match at the second position: the byte at the first position (low byte
 ;; of curr_data) is emitted as a literal ahead of the length/distance pair.
638 and curr_data, 0xff
639 get_lit_code curr_data, code3, code_len3, hufftables
640 jmp len_dist_lit_huffman
641
;; Data section: a single quadword holding the constant D.
;; NOTE(review): const_D is not referenced by any instruction in the visible
;; code and is not declared global; it may be dead data -- confirm.
;; NOTE(review): the alignment requested is 4 bytes although the datum is a
;; quadword (dq); harmless while unused, but 8 would match its size.
642section .data
643 align 4
644const_D: dq D