]> git.proxmox.com Git - ceph.git/blame - ceph/src/isa-l/igzip/igzip_decode_block_stateless.asm
update sources to v12.1.1
[ceph.git] / ceph / src / isa-l / igzip / igzip_decode_block_stateless.asm
CommitLineData
224ce89b
WB
1default rel
2
3%include "reg_sizes.asm"
4
;; Status codes. The block decoder in this file leaves one of these in rax
;; when it returns (success is produced with "xor rax, rax", i.e. DECOMP_OK).
%define DECOMP_OK 0
%define END_INPUT 1
%define OUT_OVERFLOW 2
%define INVALID_BLOCK -1
%define INVALID_SYMBOL -2
%define INVALID_LOOKBACK -3

;; Number of low input bits used to index the first-level lookup tables:
;; LONG is used with _lit_huff_code, SHORT with _dist_huff_code.
%define ISAL_DECODE_LONG_BITS 12
%define ISAL_DECODE_SHORT_BITS 10

%define MAX_LONG_CODE_LARGE (288 + (1 << (15 - ISAL_DECODE_LONG_BITS)))
%define MAX_LONG_CODE_SMALL (32 + (1 << (15 - ISAL_DECODE_SHORT_BITS)))

;; COPY_SIZE is the width in bytes of one XMM (MOVDQU) copy step.
;; COPY_LEN_MAX is the longest DEFLATE match length.
%define COPY_SIZE 16
%define COPY_LEN_MAX 258

;; Safety margins subtracted from end_in / end_out while the fast loop runs,
;; so that it may read 8 input bytes and write a full match plus one XMM
;; store without per-access bounds checks.
;; OUT_BUFFER_SLOP is parenthesised so it expands as a single term in any
;; expression (e.g. 2 * OUT_BUFFER_SLOP).
%define IN_BUFFER_SLOP 8
%define OUT_BUFFER_SLOP (COPY_SIZE + COPY_LEN_MAX)
23
24%include "inflate_data_structs.asm"
25%include "stdmac.asm"
26
27extern rfc1951_lookup_table
28
;; Register allocation for decode_huffman_code_block_stateless.
;; Several aliases deliberately share one physical register; two aliases of
;; the same register are never live at the same time, so the order of
;; instructions that use them matters. Registers marked "Saved" are the
;; callee-saved ones spilled by FUNC_SAVE.
29;; rax: scratch, second copy of the bit buffer, slow-path match distance
30%define tmp3 rax
31%define read_in_2 rax
32%define look_back_dist rax
33
34;; rcx: no alias; used directly as shift / bit count and as the rep movsb count
35;; rdx arg3
36%define next_sym2 rdx
37%define copy_start rdx
38%define tmp4 rdx
39
40;; rdi arg1 (holds arg0 on elf64; it is copied into state before tmp1 is used)
41%define tmp1 rdi
42%define look_back_dist2 rdi
43%define next_bits2 rdi
44%define next_sym3 rdi
45
46;; rsi arg2
47%define tmp2 rsi
48%define next_bits rsi
49
50;; rbx ; Saved
51%define next_in rbx
52
53;; rbp ; Saved
54%define end_in rbp
55
56;; r8
57%define repeat_length r8
58
59;; r9: bit buffer holding not-yet-decoded input bits
60%define read_in r9
61
62;; r10: number of valid bits in read_in
63%define read_in_length r10
64
65;; r11: pointer to the state structure (copied from arg0)
66%define state r11
67
68;; r12 ; Saved
69%define next_out r12
70
71;; r13 ; Saved
72%define end_out r13
73
74;; r14 ; Saved
75%define next_sym r14
76
77;; r15 ; Saved (base address of rfc1951_lookup_table)
78%define rfc_lookup r15
79
;; Stack frame layout, as byte offsets from rsp after FUNC_SAVE:
;;   start_out       lowest output address a look-back may reference
;;   read_in         bit buffer snapshot taken before a slow-path symbol decode
;;   read_in_length  matching valid-bit count snapshot
;;   gpr_save        spill area for callee-saved registers (up to 8 slots)
start_out_mem_offset equ 0
read_in_mem_offset equ 8
read_in_length_mem_offset equ 16
gpr_save_mem_offset equ 24
stack_size equ 3 * 8 + 8 * 8

;; Byte offsets of the sub-tables inside rfc1951_lookup_table. Going by how
;; they are loaded in this file: 32 byte-sized distance extra-bit counts,
;; 32 dword distance bases, 32 byte-sized length extra-bit counts, then word
;; length bases. Each derived offset is parenthesised so it expands as one
;; term wherever it is used.
%define _dist_extra_bit_count 264
%define _dist_start (_dist_extra_bit_count + 1*32)
%define _len_extra_bit_count (_dist_start + 4*32)
%define _len_start (_len_extra_bit_count + 1*32)
90
%ifidn __OUTPUT_FORMAT__, elf64
;; elf64 build: the single argument (copied into state) arrives in rdi
%define arg0 rdi

;; Prologue: reserve stack_size bytes and spill rbx, rbp, r12-r15 into the
;; gpr_save area. When ALIGN_STACK is defined, rbp is pushed and used as a
;; frame pointer so rsp can be rounded down to a 16-byte boundary; the value
;; spilled into the rbp slot is then that frame pointer.
%macro FUNC_SAVE 0
%ifndef ALIGN_STACK
        sub     rsp, stack_size
%else
        push    rbp
        mov     rbp, rsp
        sub     rsp, stack_size
        and     rsp, ~15
%endif

        mov     [rsp + gpr_save_mem_offset + 5*8], r15
        mov     [rsp + gpr_save_mem_offset + 4*8], r14
        mov     [rsp + gpr_save_mem_offset + 3*8], r13
        mov     [rsp + gpr_save_mem_offset + 2*8], r12
        mov     [rsp + gpr_save_mem_offset + 1*8], rbp
        mov     [rsp + gpr_save_mem_offset + 0*8], rbx
%endm

;; Epilogue: reload the spilled registers, then release the frame. rbp must
;; be reloaded before rsp is restored from it in the ALIGN_STACK variant.
%macro FUNC_RESTORE 0
        mov     r15, [rsp + gpr_save_mem_offset + 5*8]
        mov     r14, [rsp + gpr_save_mem_offset + 4*8]
        mov     r13, [rsp + gpr_save_mem_offset + 3*8]
        mov     r12, [rsp + gpr_save_mem_offset + 2*8]
        mov     rbp, [rsp + gpr_save_mem_offset + 1*8]
        mov     rbx, [rsp + gpr_save_mem_offset + 0*8]

%ifdef ALIGN_STACK
        mov     rsp, rbp
        pop     rbp
%else
        add     rsp, stack_size
%endif
%endm
%endif
128
%ifidn __OUTPUT_FORMAT__, win64
;; win64 build: the single argument (copied into state) arrives in rcx
%define arg0 rcx

;; Prologue: reserve stack_size bytes and spill rbx, rsi, rdi, rbp, r12-r15
;; into the gpr_save area (rsi and rdi are callee-saved in the Microsoft x64
;; convention). When ALIGN_STACK is defined, rbp is pushed and used as a
;; frame pointer so rsp can be rounded down to a 16-byte boundary.
%macro FUNC_SAVE 0
%ifndef ALIGN_STACK
        sub     rsp, stack_size
%else
        push    rbp
        mov     rbp, rsp
        sub     rsp, stack_size
        and     rsp, ~15
%endif

        mov     [rsp + gpr_save_mem_offset + 7*8], r15
        mov     [rsp + gpr_save_mem_offset + 6*8], r14
        mov     [rsp + gpr_save_mem_offset + 5*8], r13
        mov     [rsp + gpr_save_mem_offset + 4*8], r12
        mov     [rsp + gpr_save_mem_offset + 3*8], rbp
        mov     [rsp + gpr_save_mem_offset + 2*8], rdi
        mov     [rsp + gpr_save_mem_offset + 1*8], rsi
        mov     [rsp + gpr_save_mem_offset + 0*8], rbx
%endm

;; Epilogue: reload the spilled registers, then release the frame. rbp must
;; be reloaded before rsp is restored from it in the ALIGN_STACK variant.
%macro FUNC_RESTORE 0
        mov     r15, [rsp + gpr_save_mem_offset + 7*8]
        mov     r14, [rsp + gpr_save_mem_offset + 6*8]
        mov     r13, [rsp + gpr_save_mem_offset + 5*8]
        mov     r12, [rsp + gpr_save_mem_offset + 4*8]
        mov     rbp, [rsp + gpr_save_mem_offset + 3*8]
        mov     rdi, [rsp + gpr_save_mem_offset + 2*8]
        mov     rsi, [rsp + gpr_save_mem_offset + 1*8]
        mov     rbx, [rsp + gpr_save_mem_offset + 0*8]

%ifdef ALIGN_STACK
        mov     rsp, rbp
        pop     rbp
%else
        add     rsp, stack_size
%endif
%endm
%endif
169
170;; Refill read_in from the input buffer and advance next_in accordingly.
171;; Use only when there are at least 8 bytes in the in buffer: a full qword
;; is read from [next_in] regardless of how many bytes are then consumed.
;; The caller in this file skips the macro when read_in_length is 64.
172;; Clobbers rcx, unless rcx is %%read_in_length
;; NOTE(review): the rcx clobber depends on how the SHLX macro (defined
;; outside this file) expands - confirm.
;; %%end_in and %%tmp2 are accepted but not used by the body.
173%macro inflate_in_load 6
174%define %%next_in %1
175%define %%end_in %2
176%define %%read_in %3
177%define %%read_in_length %4
178%define %%tmp1 %5 ; Tmp registers
179%define %%tmp2 %6
180
 ;; Merge the next input qword in above the bits that are already valid
181 SHLX %%tmp1, [%%next_in], %%read_in_length
182 or %%read_in, %%tmp1
183
 ;; tmp1 = (64 - read_in_length) / 8 = number of whole bytes that fit
184 mov %%tmp1, 64
185 sub %%tmp1, %%read_in_length
186 shr %%tmp1, 3
187
 ;; Consume those bytes: advance the pointer and count 8 bits for each
188 add %%next_in, %%tmp1
189 lea %%read_in_length, [%%read_in_length + 8 * %%tmp1]
190%%end:
191%endm
192
193;; Refill read_in one byte at a time and advance next_in accordingly.
;; Never reads at or beyond %%end_in, so it is usable near the end of input.
194;; Clobbers rcx, unless rcx is %%read_in_length
;; %%avail_in and %%tmp1 are the same register (%5): the available-byte
;; count is only needed until the loop count is fixed, after which the
;; register is reused as the per-byte temporary.
195%macro inflate_in_small_load 6
196%define %%next_in %1
197%define %%end_in %2
198%define %%read_in %3
199%define %%read_in_length %4
200%define %%avail_in %5 ; Tmp registers
201%define %%tmp1 %5
202%define %%loop_count %6
203
 ;; avail_in = bytes remaining in the input buffer
204 mov %%avail_in, %%end_in
205 sub %%avail_in, %%next_in
206
 ;; rcx tracks the running bit count and is the shift amount in the loop
207%ifnidn %%read_in_length, rcx
208 mov rcx, %%read_in_length
209%endif
210
 ;; loop_count = whole bytes that still fit into the 64-bit bit buffer
211 mov %%loop_count, 64
212 sub %%loop_count, %%read_in_length
213 shr %%loop_count, 3
214
 ;; loop_count = min(loop_count, avail_in); nothing to do when it is zero
215 cmp %%loop_count, %%avail_in
216 cmovg %%loop_count, %%avail_in
217 cmp %%loop_count, 0
218 je %%end
219
220%%load_byte:
 ;; Zero-extend one input byte, shift it above the valid bits and merge it
 ;; NOTE(review): "%+ b" presumably yields the 8-bit register name through
 ;; definitions in reg_sizes.asm - confirm.
221 xor %%tmp1, %%tmp1
222 mov %%tmp1 %+ b, byte [%%next_in]
223 SHLX %%tmp1, %%tmp1, rcx
224 or %%read_in, %%tmp1
225 add rcx, 8
226 add %%next_in, 1
227 sub %%loop_count, 1
228 jg %%load_byte
 ;; Publish the updated bit count
229%ifnidn %%read_in_length, rcx
230 mov %%read_in_length, rcx
231%endif
232%%end:
233%endm
234
235;; Decode next symbol: look the low bits of %%read_in up in the table at
;; %%state + %%state_offset, following a second-level lookup when needed.
;; Table entries are 16-bit words, as used below:
;;   entry >> 9     code length in bits (0 means invalid code); bit 15 of the
;;                  entry also lands here as 0x40 when the entry is a hint
;;   entry & 0x1FF  symbol, or offset of the second-level table for a hint
;;   entry & 0x8000 set when the entry is a hint rather than a symbol
;; Jumps to the label invalid_symbol, which the expanding code must provide.
;; On exit %%read_in is shifted past the code and %%read_in_length reduced
;; by its length; the result may be negative and is checked by the caller.
236;; Clobber rcx
237%macro decode_next 8
238%define %%state %1 ; State structure associated with compressed stream
239%define %%lookup_size %2 ; Number of bits used for small lookup
240%define %%state_offset %3 ; Offset of the lookup table within %%state
241%define %%read_in %4 ; Bits read in from compressed stream
242%define %%read_in_length %5 ; Number of valid bits in read_in
243%define %%next_sym %6 ; Returned symbol
244%define %%next_bits %7 ; Tmp: first-level table index
245%define %%next_bits2 %8 ; Tmp: second-level table index
246
247 ;; Lookup possible next symbol
248 mov %%next_bits, %%read_in
249 and %%next_bits, (1 << %%lookup_size) - 1
250 movzx %%next_sym, word [%%state + %%state_offset + 2 * %%next_bits]
251
252 ;; Save length associated with symbol
253 mov rcx, %%next_sym
254 shr rcx, 9
255 jz invalid_symbol
256
257 ;; Check if symbol or hint was looked up
 ;; (bit 15 clear => plain symbol and rcx already holds its length)
258 and %%next_sym, 0x81FF
259 cmp %%next_sym, 0x8000
260 jl %%end
261
262 ;; Decode next_sym using hint
263 mov %%next_bits2, %%read_in
264
265 ;; Extract the 15-DECODE_LOOKUP_SIZE bits beyond the first DECODE_LOOKUP_SIZE bits.
 ;; Both variants keep only the low <length> bits of read_in: BZHI after the
 ;; hint flag is masked out of rcx, or a shl/shr pair by -rcx (the CPU uses
 ;; the shift count modulo 64, which also discards the 0x40 flag bit).
266%ifdef USE_HSWNI
267 and rcx, 0x1F
268 bzhi %%next_bits2, %%next_bits2, rcx
269%else
270 neg rcx
271 shl %%next_bits2, cl
272 shr %%next_bits2, cl
273%endif
 ;; Drop the bits already consumed by the first-level index
274 shr %%next_bits2, %%lookup_size
275
 ;; next_sym still carries 0x8000; the displacement below subtracts it again
276 add %%next_bits2, %%next_sym
277
278 ;; Lookup actual next symbol
279 movzx %%next_sym, word [%%state + %%state_offset + 2 * %%next_bits2 + 2 *((1 << %%lookup_size) - 0x8000)]
280
281 ;; Save length associated with symbol
282 mov rcx, %%next_sym
283 shr rcx, 9
284 jz invalid_symbol
285 and %%next_sym, 0x1FF
286%%end:
287 ;; Update read_in to reflect the bits which were decoded
288 sub %%read_in_length, rcx
289 SHRX %%read_in, %%read_in, rcx
290%endm
291
292
293;; Decode next symbol, fast-loop variant.
;; Precondition: %%next_sym already holds the first-level table entry for the
;; low %%lookup_size bits of %%read_in; this macro does not perform that
;; lookup itself (the callers in this file preload it).
;; Entry format as used below: entry >> 9 is the code length (0 = invalid,
;; plus 0x40 when bit 15 marks a hint); entry & 0x1FF is the symbol or the
;; second-level table offset.
;; Jumps to the label invalid_symbol, which the expanding code must provide.
294;; Clobber rcx
295%macro decode_next2 7
296%define %%state %1 ; State structure associated with compressed stream
297%define %%lookup_size %2 ; Number of bits used for small lookup
298%define %%state_offset %3 ; Type of huff code, should be either LIT or DIST
299%define %%read_in %4 ; Bits read in from compressed stream
300%define %%read_in_length %5 ; Number of valid bits in read_in
301%define %%next_sym %6 ; In: preloaded table entry. Out: returned symbol
302%define %%next_bits2 %7 ; Tmp: bits following the first-level index
303
304 ;; Prepare the bits beyond the first-level index in case a hint was found
305 mov %%next_bits2, %%read_in
306 shr %%next_bits2, %%lookup_size
307
 ;; Save length associated with symbol
308 mov rcx, %%next_sym
309 shr rcx, 9
310 jz invalid_symbol
311
312 ;; Check if symbol or hint was looked up
 ;; (bit 15 clear => plain symbol and rcx already holds its length)
313 and %%next_sym, 0x81FF
314 cmp %%next_sym, 0x8000
315 jl %%end
316
317 ;; Extract the 15-DECODE_LOOKUP_SIZE bits beyond the first %%lookup_size bits.
 ;; next_sym becomes a pointer: state + 2 * (0x8000 + offset); the 0x8000 is
 ;; removed again by the displacement of the second lookup.
 ;; rcx = code length - lookup_size (the 0x40 hint flag is subtracted too).
318 lea %%next_sym, [%%state + 2 * %%next_sym]
319 sub rcx, 0x40 + %%lookup_size
320
321%ifdef USE_HSWNI
322 bzhi %%next_bits2, %%next_bits2, rcx
323%else
324 ;; Decode next_sym using hint
325 neg rcx
326 shl %%next_bits2, cl
327 shr %%next_bits2, cl
328%endif
329
330 ;; Lookup actual next symbol
331 movzx %%next_sym, word [%%next_sym + %%state_offset + 2 * %%next_bits2 + 2 * ((1 << %%lookup_size) - 0x8000)]
332
333 ;; Save length associated with symbol
334 mov rcx, %%next_sym
335 shr rcx, 9
336 jz invalid_symbol
337 and %%next_sym, 0x1FF
338
339%%end:
340 ;; Update read_in to reflect the bits which were decoded
341 SHRX %%read_in, %%read_in, rcx
342 sub %%read_in_length, rcx
343%endm
344
;; decode_huffman_code_block_stateless_<ARCH>
;; In:  arg0 = pointer to the state structure (field offsets such as
;;      _read_in and _next_out come from an included file).
;; Out: rax = 0 when the end-of-block symbol was decoded, otherwise END_INPUT,
;;      OUT_OVERFLOW, INVALID_SYMBOL or INVALID_LOOKBACK. The buffer fields
;;      of the state are written back on every exit.
;; ARCH is not defined in this file; it is presumably supplied by the file
;; that includes this one - confirm.
345global decode_huffman_code_block_stateless_ %+ ARCH
346decode_huffman_code_block_stateless_ %+ ARCH %+ :
347
348 FUNC_SAVE
349
 ;; state must be taken from arg0 first: on elf64 arg0 is rdi, which is also tmp1
350 mov state, arg0
351 lea rfc_lookup, [rfc1951_lookup_table]
352
 ;; Load the bit buffer and turn the avail counts into end pointers
353 mov read_in,[state + _read_in]
354 mov read_in_length %+ d, dword [state + _read_in_length]
355 mov next_out, [state + _next_out]
356 mov end_out %+ d, dword [state + _avail_out]
357 add end_out, next_out
358 mov next_in, [state + _next_in]
359 mov end_in %+ d, dword [state + _avail_in]
360 add end_in, next_in
361
 ;; No partially copied match is pending unless set on the overflow exit
362 mov dword [state + _copy_overflow_len], 0
363 mov dword [state + _copy_overflow_dist], 0
364
 ;; tmp3 = next_out - total_out: lowest address a look-back may reference
365 mov tmp3 %+ d, dword [state + _total_out]
366 sub tmp3, next_out
367 neg tmp3
368
369 mov [rsp + start_out_mem_offset], tmp3
370
 ;; Pull both end pointers in by the SLOP margins for the fast loop; every
 ;; path that leaves through end_loop_block_pre or end_symbol_pre adds them back
371 sub end_out, OUT_BUFFER_SLOP
372 sub end_in, IN_BUFFER_SLOP
373
 ;; Fewer than IN_BUFFER_SLOP input bytes left: use the slow loop only
374 cmp next_in, end_in
375 jg end_loop_block_pre
376
 ;; A completely full bit buffer needs no refill
377 cmp read_in_length, 64
378 je skip_load
379
380 inflate_in_load next_in, end_in, read_in, read_in_length, tmp1, tmp2
381
382skip_load:
 ;; Preload the first literal/length table entry, as decode_next2 expects
383 mov tmp3, read_in
384 and tmp3, (1 << ISAL_DECODE_LONG_BITS) - 1
385 movzx next_sym, word [state + _lit_huff_code + 2 * tmp3]
386
387;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
388; Main Loop
;
; Fast path. end_in / end_out have IN_BUFFER_SLOP / OUT_BUFFER_SLOP subtracted
; here, so the loop reads a full qword of input and stores past next_out
; without further bounds checks. Work for the next iteration and for the
; "symbol is a length" case is issued before the symbol type is known; the
; results are simply unused when the guess does not apply.
; On entry to loop_block, next_sym holds the preloaded literal/length table
; entry for the low bits of read_in.
;
; NOTE(review): the jumps from this loop to invalid_symbol (inside
; decode_next2) and invalid_look_back_distance reach "end" without adding the
; SLOP values back, and next_out may already have been advanced, so the
; buffer fields written back on those error exits are computed from the
; reduced bounds - confirm callers do not rely on them after an error.
389;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
390loop_block:
391 ;; Check if near end of in buffer or out buffer
392 cmp next_in, end_in
393 jg end_loop_block_pre
394 cmp next_out, end_out
395 jg end_loop_block_pre
396
397 ;; Decode next symbol and reload the read_in buffer
398 decode_next2 state, ISAL_DECODE_LONG_BITS, _lit_huff_code, read_in, read_in_length, next_sym, tmp1
399
400 ;; Save next_sym in next_sym2 so next_sym can be preloaded
401 mov next_sym2, next_sym
402
403 ;; Find index to speculatively preload next_sym from
404 mov tmp3, read_in
405 and tmp3, (1 << ISAL_DECODE_LONG_BITS) - 1
406
407 ;; Start reloading read_in
 ;; (next_in and read_in_length are only updated further down)
408 mov tmp1, [next_in]
409 SHLX tmp1, tmp1, read_in_length
410 or read_in, tmp1
411
412 ;; Speculatively load data associated with length symbol
 ;; NOTE(review): for a literal, next_sym2 - 257 is negative; with the
 ;; offsets defined in this file the word load below lands before the start
 ;; of rfc1951_lookup_table when next_sym2 < 29. The value is unused in
 ;; that case, but confirm the memory in front of the table is readable.
413 movzx rcx, byte [rfc_lookup + _len_extra_bit_count + next_sym2 - 257]
414 movzx repeat_length, word [rfc_lookup + _len_start + 2 * (next_sym2 - 257)]
415
416 ;; Test for end of block symbol
417 cmp next_sym2, 256
418 je end_symbol_pre
419
420 ;; Speculatively load next_sym for next loop if a literal was decoded
421 movzx next_sym, word [state + _lit_huff_code + 2 * tmp3]
422
423 ;; Finish updating read_in_length for read_in
 ;; tmp1 = whole bytes that fit into the bit buffer; consume them
424 mov tmp1, 64
425 sub tmp1, read_in_length
426 shr tmp1, 3
427 add next_in, tmp1
428 lea read_in_length, [read_in_length + 8 * tmp1]
429
430 ;; Speculatively load next dist code
 ;; read_in_2 = read_in with the length extra bits (rcx of them) skipped
431 SHRX read_in_2, read_in, rcx
432 mov next_bits2, read_in_2
433 and next_bits2, (1 << ISAL_DECODE_SHORT_BITS) - 1
434 movzx next_sym3, word [state + _dist_huff_code + 2 * next_bits2]
435
436 ;; Speculatively write next_sym2 if it is a literal
 ;; This stores the whole 64-bit register; only the low byte is the literal
 ;; and the following bytes are overwritten by later output.
437 mov [next_out], next_sym2
438 add next_out, 1
439
440 ;; Check if next_sym2 is a literal, length, or end of block symbol
441 cmp next_sym2, 256
442 jl loop_block
443
444decode_len_dist:
445 ;; Find length for length/dist pair
446 mov next_bits, read_in
447
 ;; next_bits = low rcx bits of read_in = value of the length extra bits
448 BZHI next_bits, next_bits, rcx, tmp4
449 add repeat_length, next_bits
450
451 ;; Update read_in for the length extra bits which were read in
 ;; (the matching shift of the bit buffer was already done into read_in_2)
452 sub read_in_length, rcx
453
454 ;; Decode distance code
 ;; next_sym3 was preloaded above with the distance table entry
455 decode_next2 state, ISAL_DECODE_SHORT_BITS, _dist_huff_code, read_in_2, read_in_length, next_sym3, tmp2
456
 ;; look_back_dist2 and next_sym3 are the same register, so the extra-bit
 ;; count must be fetched before the base distance overwrites the symbol
457 movzx rcx, byte [rfc_lookup + _dist_extra_bit_count + next_sym3]
458 mov look_back_dist2 %+ d, [rfc_lookup + _dist_start + 4 * next_sym3]
459
460 ;; Load distance code extra bits
461 mov next_bits, read_in_2
462
463 ;; Determine next_out after the copy is finished
 ;; (the -1 undoes the speculative literal increment above)
464 add next_out, repeat_length
465 sub next_out, 1
466
467 ;; Calculate the look back distance
468 BZHI next_bits, next_bits, rcx, tmp4
469 SHRX read_in_2, read_in_2, rcx
470
471 ;; Setup next_sym, read_in, and read_in_length for next loop
472 mov read_in, read_in_2
473 and read_in_2, (1 << ISAL_DECODE_LONG_BITS) - 1
474 movzx next_sym, word [state + _lit_huff_code + 2 * read_in_2]
475 sub read_in_length, rcx
476
477 ;; Copy distance in len/dist pair
478 add look_back_dist2, next_bits
479
480 ;; Find beginning of copy
 ;; copy_start = source address; copy_start + look_back_dist2 = destination
481 mov copy_start, next_out
482 sub copy_start, repeat_length
483 sub copy_start, look_back_dist2
484
485 ;; Check if a valid look back distance was decoded
486 cmp copy_start, [rsp + start_out_mem_offset]
487 jl invalid_look_back_distance
488 MOVDQU xmm1, [copy_start]
489
490 ;; Set tmp2 to be the minimum of COPY_SIZE and repeat_length
491 ;; This is to decrease use of small_byte_copy branch
492 mov tmp2, COPY_SIZE
493 cmp tmp2, repeat_length
494 cmovg tmp2, repeat_length
495
496 ;; Check for overlapping memory in the copy
497 cmp look_back_dist2, tmp2
498 jl small_byte_copy_pre
499
500large_byte_copy:
501 ;; Copy length distance pair when memory overlap is not an issue
 ;; Always stores COPY_SIZE bytes, so up to COPY_SIZE - 1 bytes beyond the
 ;; match may be written; they lie within the output slop margin.
502 MOVDQU [copy_start + look_back_dist2], xmm1
503
504 sub repeat_length, COPY_SIZE
505 jle loop_block
506
507 add copy_start, COPY_SIZE
508 MOVDQU xmm1, [copy_start]
509 jmp large_byte_copy
510
511small_byte_copy_pre:
512 ;; Copy length distance pair when source and destination overlap
 ;; The store/reload below doubles the distance each pass until it is at
 ;; least COPY_SIZE. repeat_length is biased by the original distance here
 ;; and reduced by the final distance after the loop.
513 add repeat_length, look_back_dist2
514small_byte_copy:
515 MOVDQU [copy_start + look_back_dist2], xmm1
516
517 shl look_back_dist2, 1
518 MOVDQU xmm1, [copy_start]
519 cmp look_back_dist2, COPY_SIZE
520 jl small_byte_copy
521
 ;; Continue with the 16-byte copy loop unless nothing is left to copy
522 sub repeat_length, look_back_dist2
523 jge large_byte_copy
524 jmp loop_block
525
526;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
527; Finish Main Loop
;
; Slow path used near the end of either buffer. One symbol is decoded per
; pass with exact bounds checks: input is refilled byte by byte, and the bit
; buffer is saved before each symbol so that the exits which need it
; (end_of_input, out_buffer_overflow_lit) can restore it.
528;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
529end_loop_block_pre:
530 ;; Fix up in buffer and out buffer to reflect the actual buffer end
531 add end_out, OUT_BUFFER_SLOP
532 add end_in, IN_BUFFER_SLOP
533
534end_loop_block:
535 ;; Load read in buffer and decode next lit/len symbol
536 inflate_in_small_load next_in, end_in, read_in, read_in_length, tmp1, tmp2
 ;; Save the bit buffer as it is before this symbol is decoded
537 mov [rsp + read_in_mem_offset], read_in
538 mov [rsp + read_in_length_mem_offset], read_in_length
539
540 decode_next state, ISAL_DECODE_LONG_BITS, _lit_huff_code, read_in, read_in_length, next_sym, tmp1, tmp2
541
542 ;; Check that enough input was available to decode symbol
 ;; (a negative count means the code used bits that were never loaded)
543 cmp read_in_length, 0
544 jl end_of_input
545
 ;; < 256: literal, == 256: end of block, > 256: length symbol.
 ;; The je reuses the flags of this cmp; the jl in between leaves them intact
546 cmp next_sym, 256
547 jl decode_literal
548 je end_symbol
549
550decode_len_dist_2:
551 ;; Load length extra bits
552 mov next_bits, read_in
553
554 movzx repeat_length, word [rfc_lookup + _len_start + 2 * (next_sym - 257)]
555 movzx rcx, byte [rfc_lookup + _len_extra_bit_count + next_sym - 257]
556
557 ;; Calculate repeat length
558 BZHI next_bits, next_bits, rcx, tmp1
559 add repeat_length, next_bits
560
561 ;; Update read_in for the length extra bits which were read in
562 SHRX read_in, read_in, rcx
563 sub read_in_length, rcx
564
565 ;; Decode distance code
566 decode_next state, ISAL_DECODE_SHORT_BITS, _dist_huff_code, read_in, read_in_length, next_sym, tmp1, tmp2
567
568 ;; Load distance code extra bits
569 mov next_bits, read_in
570 mov look_back_dist %+ d, [rfc_lookup + _dist_start + 4 * next_sym]
571 movzx rcx, byte [rfc_lookup + _dist_extra_bit_count + next_sym]
572
573
574 ;; Calculate the look back distance and check for enough input
 ;; The jl tests the flags of the final sub: a negative bit count means the
 ;; length/distance pair was not completely present in the input
575 BZHI next_bits, next_bits, rcx, tmp1
576 SHRX read_in, read_in, rcx
577 add look_back_dist, next_bits
578 sub read_in_length, rcx
579 jl end_of_input
580
581 ;; Setup code for byte copy using rep movsb
 ;; rdi = destination (next_out), rsi = source (next_out - distance),
 ;; rcx = length. rsi/rdi are also tmp2/tmp1, which are dead by now.
582 mov rsi, next_out
583 mov rdi, rsi
584 mov rcx, repeat_length
585 sub rsi, look_back_dist
586
587 ;; Check if a valid look back distance was decoded
588 cmp rsi, [rsp + start_out_mem_offset]
589 jl invalid_look_back_distance
590
591 ;; Check for out buffer overflow
 ;; repeat_length now holds the address one past the end of the copy
592 add repeat_length, next_out
593 cmp repeat_length, end_out
594 jg out_buffer_overflow_repeat
595
596 mov next_out, repeat_length
597
 ;; NOTE(review): assumes the direction flag is clear so the copy runs
 ;; forward, which is what makes an overlapping source repeat - confirm.
598 rep movsb
599 jmp end_loop_block
600
601decode_literal:
602 ;; Store literal decoded from the input stream
603 cmp next_out, end_out
604 jge out_buffer_overflow_lit
605 add next_out, 1
606 mov byte [next_out - 1], next_sym %+ b
607 jmp end_loop_block
608
609;; Set exit codes
;; Each label below sets the return value in rax and joins the common "end"
;; code, which writes the buffer state back and returns.
610end_of_input:
 ;; Input ran out in the middle of a symbol: restore the bit buffer saved
 ;; before the symbol so that decoding can be retried with more input
611 mov read_in, [rsp + read_in_mem_offset]
612 mov read_in_length, [rsp + read_in_length_mem_offset]
613 mov rax, END_INPUT
614 jmp end
615
616out_buffer_overflow_repeat:
 ;; The match does not fit into the output buffer. On entry rsi/rdi are set
 ;; up for the copy and repeat_length holds next_out + match length.
 ;; rcx = bytes that still fit; repeat_length becomes the bytes left over
617 mov rcx, end_out
618 sub rcx, next_out
619 sub repeat_length, rcx
620 sub repeat_length, next_out
621 rep movsb
622
 ;; Record the unfinished remainder of the match in the state
623 mov [state + _copy_overflow_len], repeat_length %+ d
624 mov [state + _copy_overflow_dist], look_back_dist %+ d
625
626 mov next_out, end_out
627
628 mov rax, OUT_OVERFLOW
629 jmp end
630
631out_buffer_overflow_lit:
 ;; No room for the literal: restore the bit buffer saved before it was
 ;; decoded, so the literal is still pending in the saved state
632 mov read_in, [rsp + read_in_mem_offset]
633 mov read_in_length, [rsp + read_in_length_mem_offset]
634 mov rax, OUT_OVERFLOW
635 jmp end
636
 ;; NOTE(review): the two error labels below are reached from both the slow
 ;; loop and the fast loop. When reached from the fast loop, end_in/end_out
 ;; still have the SLOP margins subtracted, so the avail_in/avail_out values
 ;; written at "end" are computed from the reduced bounds - confirm callers
 ;; ignore them after an error.
637invalid_look_back_distance:
638 mov rax, INVALID_LOOKBACK
639 jmp end
640
641invalid_symbol:
642 mov rax, INVALID_SYMBOL
643 jmp end
644
645end_symbol_pre:
646 ;; Fix up in buffer and out buffer to reflect the actual buffer
 ;; (entry used by the fast loop, which runs with the SLOP margins applied)
647 add end_out, OUT_BUFFER_SLOP
648 add end_in, IN_BUFFER_SLOP
649end_symbol:
650 ;; Set flag identifying a new block is required
651 mov byte [state + _block_state], ISAL_BLOCK_NEW_HDR
 ;; rax = 0 (same value as DECOMP_OK)
652 xor rax, rax
653end:
654 ;; Save current buffer states
 ;; avail_out = end_out - next_out, total_out = next_out - start_out,
 ;; avail_in = end_in - next_in. end_out, next_out and end_in are consumed
 ;; by these subtractions, so the order of the stores matters.
655 mov [state + _read_in], read_in
656 mov [state + _read_in_length], read_in_length %+ d
657 mov [state + _next_out], next_out
658 sub end_out, next_out
659 mov dword [state + _avail_out], end_out %+ d
660 sub next_out, [rsp + start_out_mem_offset]
661 mov [state + _total_out], next_out %+ d
662 mov [state + _next_in], next_in
663 sub end_in, next_in
664 mov [state + _avail_in], end_in %+ d
665
 ;; rax is not touched by FUNC_RESTORE, so the status code survives
666 FUNC_RESTORE
667
668 ret