]> git.proxmox.com Git - ceph.git/blob - ceph/src/isa-l/igzip/igzip_update_histogram.asm
update sources to v12.1.1
[ceph.git] / ceph / src / isa-l / igzip / igzip_update_histogram.asm
1
2 %include "options.asm"
3
4 %include "lz0a_const.asm"
5 %include "data_struct2.asm"
6 %include "bitbuf2.asm"
7 %include "huffman.asm"
8 %include "igzip_compare_types.asm"
9 %include "reg_sizes.asm"
10
11 %include "stdmac.asm"
12
;; External lookup table (defined outside this file) and the offset inside it
;; that len_to_len_code adds when indexing the table by match length.
13 extern rfc1951_lookup_table
14 _len_to_code_offset equ 0
15
16 %define LAST_BYTES_COUNT 3 ; Bytes to prevent reading out of array bounds
17 %define LA_STATELESS 280 ; Max number of bytes read in loop2 rounded up to 8 byte boundary
18 %define LIT_LEN 286 ; Number of 8-byte lit/len counters placed before the dist counters (see _dist_offset)
19 %define DIST_LEN 30 ; Number of 8-byte dist counters placed before the hash table (see _hash_offset)
20 %define HIST_ELEM_SIZE 8 ; Byte size of one histogram counter; used as index scale for the qword increments
21
;; MARK <name>: debugging aid taking one argument.
;; When DEBUG is defined it declares <name> global and defines it as a label
;; at the point of use; otherwise the macro expands to nothing.
22 %ifdef DEBUG
23 %macro MARK 1
24 global %1
25 %1:
26 %endm
27 %else
28 %macro MARK 1
29 %endm
30 %endif
31
32 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
33 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
34 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Symbolic register names used by isal_update_histogram.
;; Several names deliberately alias the same physical register
;; (rbx, r12, rbp and r8 each carry more than one name below), so only one
;; value per group can be live at a time.
35 %define file_start rdi
36 %define file_length rsi
37 %define histogram rdx
38 %define rfc_lookup r9
39 %define f_i r10
40
41 %define curr_data rax
42
43 %define tmp2 rcx ; note: rcx/ecx is also clobbered by the dist_to_dist_code* macros
44
;; rbx: look-back distance for the first position, later reused for a dist code
45 %define dist rbx
46 %define dist_code2 rbx
47
;; r12: look-back distance for the second position, later reused for a dist code
48 %define dist2 r12
49 %define dist_code r12
50
;; rbp: match length / length code / third hash value
51 %define len rbp
52 %define len_code rbp
53 %define hash3 rbp
54
;; r8: second data word / second match length / scratch
55 %define curr_data2 r8
56 %define len2 r8
57 %define tmp4 r8
58
59 %define tmp1 r11
60
61 %define tmp3 r13
62
63 %define hash r14
64
65 %define hash2 r15
66
67 %define xtmp0 xmm0
68 %define xtmp1 xmm1
69 %define xdata xmm2
70
71 %define ytmp0 ymm0
72 %define ytmp1 ymm1
73
;; Select the vector register width used to clear the hash table:
;; 16-byte xmm registers when ARCH == 01, 32-byte ymm registers otherwise.
74 %if(ARCH == 01)
75 %define vtmp0 xtmp0
76 %define vtmp1 xtmp1
77 %define V_LENGTH 16
78 %else
79 %define vtmp0 ytmp0
80 %define vtmp1 ytmp1
81 %define V_LENGTH 32
82 %endif
83 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
84 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
85 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Stack frame layout, as offsets from rsp after FUNC_SAVE:
;;   [0, 8)    saved end-of-block count (written on entry, read back at `end`)
;;   [8, 16)   f_end_i slot (not referenced in the code visible in this file)
;;   [16, 80)  general purpose register save area
;;   [80, 144) xmm save area (reserved; the FUNC_SAVE macros below store no xmm registers)
;;   +8        padding, see the note under stack_size
86 _eob_count_offset equ 0 ; local variable (8 bytes)
87 f_end_i_mem_offset equ 8
88 gpr_save_mem_offset equ 16 ; gpr save area (8*8 bytes)
89 xmm_save_mem_offset equ 16 + 8*8 ; xmm save area (4*16 bytes) (16 byte aligned)
90 stack_size equ 2*8 + 8*8 + 4*16 + 8
91 ;;; 8 because stack address is odd multiple of 8 after a function call and
92 ;;; we want it aligned to 16 bytes
93
;; elf64 build: argument registers and prologue/epilogue macros.
;; arg0..arg2 are the first three integer argument registers for this output format.
94 %ifidn __OUTPUT_FORMAT__, elf64
95 %define arg0 rdi
96 %define arg1 rsi
97 %define arg2 rdx
98
;; FUNC_SAVE: reserve stack_size bytes and store rbx, rbp, r12-r15 in the
;; gpr save area. With ALIGN_STACK defined, rbp is first pushed and set up as
;; a frame pointer, and rsp is rounded down to a 16-byte boundary.
99 %macro FUNC_SAVE 0
100 %ifdef ALIGN_STACK
101 push rbp
102 mov rbp, rsp
103 sub rsp, stack_size
104 and rsp, ~15
105 %else
106 sub rsp, stack_size
107 %endif
108
109 mov [rsp + gpr_save_mem_offset + 0*8], rbx
110 mov [rsp + gpr_save_mem_offset + 1*8], rbp ; with ALIGN_STACK this is the frame pointer value set above
111 mov [rsp + gpr_save_mem_offset + 2*8], r12
112 mov [rsp + gpr_save_mem_offset + 3*8], r13
113 mov [rsp + gpr_save_mem_offset + 4*8], r14
114 mov [rsp + gpr_save_mem_offset + 5*8], r15
115 %endm
116
;; FUNC_RESTORE: reload the registers stored by FUNC_SAVE and release the
;; frame (via rbp when ALIGN_STACK is defined, otherwise by adding stack_size).
117 %macro FUNC_RESTORE 0
118 mov rbx, [rsp + gpr_save_mem_offset + 0*8]
119 mov rbp, [rsp + gpr_save_mem_offset + 1*8]
120 mov r12, [rsp + gpr_save_mem_offset + 2*8]
121 mov r13, [rsp + gpr_save_mem_offset + 3*8]
122 mov r14, [rsp + gpr_save_mem_offset + 4*8]
123 mov r15, [rsp + gpr_save_mem_offset + 5*8]
124
125 %ifndef ALIGN_STACK
126 add rsp, stack_size
127 %else
128 mov rsp, rbp
129 pop rbp
130 %endif
131 %endm
132 %endif
133
;; win64 build: argument registers and prologue/epilogue macros.
;; Compared with the elf64 variant, rsi and rdi are saved as well, so all
;; eight slots of the gpr save area are used.
134 %ifidn __OUTPUT_FORMAT__, win64
135 %define arg0 rcx
136 %define arg1 rdx
137 %define arg2 r8
138
;; FUNC_SAVE: reserve stack_size bytes and store rbx, rsi, rdi, rbp, r12-r15.
;; With ALIGN_STACK defined, rbp is first pushed and set up as a frame
;; pointer, and rsp is rounded down to a 16-byte boundary.
139 %macro FUNC_SAVE 0
140 %ifdef ALIGN_STACK
141 push rbp
142 mov rbp, rsp
143 sub rsp, stack_size
144 and rsp, ~15
145 %else
146 sub rsp, stack_size
147 %endif
148
149 mov [rsp + gpr_save_mem_offset + 0*8], rbx
150 mov [rsp + gpr_save_mem_offset + 1*8], rsi
151 mov [rsp + gpr_save_mem_offset + 2*8], rdi
152 mov [rsp + gpr_save_mem_offset + 3*8], rbp ; with ALIGN_STACK this is the frame pointer value set above
153 mov [rsp + gpr_save_mem_offset + 4*8], r12
154 mov [rsp + gpr_save_mem_offset + 5*8], r13
155 mov [rsp + gpr_save_mem_offset + 6*8], r14
156 mov [rsp + gpr_save_mem_offset + 7*8], r15
157 %endm
158
;; FUNC_RESTORE: reload the registers stored by FUNC_SAVE and release the
;; frame (via rbp when ALIGN_STACK is defined, otherwise by adding stack_size).
159 %macro FUNC_RESTORE 0
160 mov rbx, [rsp + gpr_save_mem_offset + 0*8]
161 mov rsi, [rsp + gpr_save_mem_offset + 1*8]
162 mov rdi, [rsp + gpr_save_mem_offset + 2*8]
163 mov rbp, [rsp + gpr_save_mem_offset + 3*8]
164 mov r12, [rsp + gpr_save_mem_offset + 4*8]
165 mov r13, [rsp + gpr_save_mem_offset + 5*8]
166 mov r14, [rsp + gpr_save_mem_offset + 6*8]
167 mov r15, [rsp + gpr_save_mem_offset + 7*8]
168
169 %ifndef ALIGN_STACK
170 add rsp, stack_size
171 %else
172 mov rsp, rbp
173 pop rbp
174 %endif
175 %endm
176 %endif
177
178
;; Byte offsets of the three regions addressed through the histogram pointer:
;; LIT_LEN 8-byte lit/len counters, then DIST_LEN 8-byte dist counters,
;; then the hash table (accessed with 16-bit entries in the code below).
179 _lit_len_offset equ 0
180 _dist_offset equ (8 * LIT_LEN)
181 _hash_offset equ (_dist_offset + 8 * DIST_LEN)
182
183
;; len_to_len_code <len_code>, <len>, <rfc_lookup>
;; Loads one byte from the lookup table at index <len> (plus
;; _len_to_code_offset), zero-extends it and sets bit 8, producing an index
;; of 0x100 | table[len] into the lit/len counters.
;; <len_code> and <len> may name the same register (the callers in this file do so).
184 %macro len_to_len_code 3
185 %define %%len_code %1 ; Output
186 %define %%len %2 ; Input
187 %define %%rfc_lookup %3
188 movzx %%len_code, byte [%%rfc_lookup + _len_to_code_offset + %%len]
189 or %%len_code, 0x100
190 %endm
191
;; dist_to_dist_code <dist_code>, <dist>
;; Converts a distance into the index used for the dist counters.
;; With d = dist - 1 and n = bsr(d) - 1 the result is (d >> n) + 2*n;
;; when d <= 1 (signed compare) the result is d itself.
;; SHRX is upper-case here and presumably a macro from the included files - TODO confirm.
192 ;;; Clobbers rcx and dist
193 %macro dist_to_dist_code 2
194 %define %%dist_code %1 ; Output code associated with dist
195 %define %%dist_coded %1d
196 %define %%dist %2d ; Input dist
197 dec %%dist
198 mov %%dist_coded, %%dist
199 bsr ecx, %%dist_coded ; ecx = index of the highest set bit
200 dec ecx
201 SHRX %%dist_code, %%dist_code, rcx
202 lea %%dist_coded, [%%dist_coded + 2*ecx]
203
;; Small distances map to themselves; this also discards whatever the
;; bsr/shift sequence produced for them.
204 cmp %%dist, 1
205 cmovle %%dist_coded, %%dist
206 %endm
207
;; dist_to_dist_code2 <dist_code>, <dist>
;; Same computation as dist_to_dist_code, except that the input register
;; already holds -(dist - 1): it is negated instead of decremented before
;; the code is derived.
;; SHRX is upper-case here and presumably a macro from the included files - TODO confirm.
208 ;;; Clobbers rcx and dist
209 %macro dist_to_dist_code2 2
210 %define %%dist_code %1 ; Output code associated with dist
211 %define %%dist_coded %1d
212 %define %%dist %2d ; Input -(dist - 1)
213 neg %%dist
214 mov %%dist_coded, %%dist
215 bsr ecx, %%dist_coded ; ecx = index of the highest set bit
216 dec ecx
217 SHRX %%dist_code, %%dist_code, rcx
218 lea %%dist_coded, [%%dist_coded + 2*ecx]
219
;; Small values map to themselves; this also discards whatever the
;; bsr/shift sequence produced for them.
220 cmp %%dist, 1
221 cmovle %%dist_coded, %%dist
222 %endm
223
;; Entry point, exported as isal_update_histogram_<ARCH>.
;;   arg0 -> file_start : pointer to the input bytes
;;   arg1 -> file_length: number of input bytes
;;   arg2 -> histogram  : base of the lit/len counters, dist counters and hash table
;; Walks the input, incrementing lit/len and dist counters for the literals
;; and matches it finds, and increments the counter at lit/len index 256 once
;; (skipped when file_length is 0).
224 ; void isal_update_histogram
225 global isal_update_histogram_ %+ ARCH
226 isal_update_histogram_ %+ ARCH %+ :
227 FUNC_SAVE
228
;; Move the arguments into the working registers when the output format's
;; argument registers differ from them.
229 %ifnidn file_start, arg0
230 mov file_start, arg0
231 %endif
232 %ifnidn file_length, arg1
233 mov file_length, arg1
234 %endif
235 %ifnidn histogram, arg2
236 mov histogram, arg2
237 %endif
238 mov f_i, 0
239 cmp file_length, 0
240 je exit_ret ; If nothing to do then exit
241
;; Remember (counter at index 256) + 1 on the stack; it is written back at `end`.
242 mov tmp1, qword [histogram + _lit_len_offset + 8*256]
243 inc tmp1
244 mov [rsp + _eob_count_offset], tmp1
245
246 lea rfc_lookup, [rfc1951_lookup_table]
247
248 ;; Init hash_table
;; Zero the hash table from the top down. rcx counts 16-bit entries; each
;; iteration stores two vectors (2 * V_LENGTH bytes) and steps rcx by V_LENGTH.
;; PXOR / MOVDQU are upper-case and presumably macros from the included files - TODO confirm.
249 PXOR vtmp0, vtmp0, vtmp0
250 mov rcx, (IGZIP_HASH_SIZE - V_LENGTH)
251 init_hash_table:
252 MOVDQU [histogram + _hash_offset + 2 * rcx], vtmp0
253 MOVDQU [histogram + _hash_offset + 2 * (rcx + V_LENGTH / 2)], vtmp0
254 sub rcx, V_LENGTH
255 jge init_hash_table
256
;; loop2 runs only while f_i < file_length - LA_STATELESS; shorter inputs go
;; straight to end_loop_2, which undoes this subtraction.
257 sub file_length, LA_STATELESS
258 cmp file_length, 0
259 jle end_loop_2
260
261
262 ;; Load first literal into histogram
;; Position 0 is recorded in the hash table and always counted as a literal.
263 mov curr_data, [file_start + f_i]
264 compute_hash hash, curr_data
265 and hash %+ d, HASH_MASK
266 mov [histogram + _hash_offset + 2 * hash], f_i %+ w
267 and curr_data, 0xff
268 inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * curr_data]
269 inc f_i
270
271 ;; Setup to begin loop 2
;; On entry to loop2: xdata holds the bytes starting at f_i, hash is computed
;; from the data at f_i and hash2 from the data one byte further on.
272 MOVDQU xdata, [file_start + f_i]
273 mov curr_data, [file_start + f_i]
274 mov curr_data2, curr_data
275 compute_hash hash, curr_data
276 shr curr_data2, 8
277 compute_hash hash2, curr_data2
278
279 and hash2 %+ d, HASH_MASK
280 and hash, HASH_MASK
;; Main loop: examines two consecutive positions per iteration.
;; On entry f_i is the first position (call it p), hash/hash2 are the masked
;; hashes for p and p+1, and xdata holds the input bytes starting at p.
;; MOVQ / PSRLDQ are upper-case and presumably macros from the included files - TODO confirm.
281 loop2:
282 xor dist, dist
283 xor dist2, dist2
284 xor tmp3, tmp3
285
286 lea tmp1, [file_start + f_i] ; tmp1 = address of position p
287
288 MOVQ curr_data, xdata ; curr_data = 8 bytes starting at p
289 PSRLDQ xdata, 1 ; xdata now starts at p+1
290
291 ;; Load possible look back distances and update hash data
;; Positions are kept in the hash table as 16-bit values, so the subtraction
;; is done on the low words only.
292 mov dist %+ w, f_i %+ w
293 sub dist, 1
294 sub dist %+ w, word [histogram + _hash_offset + 2 * hash]
295 mov [histogram + _hash_offset + 2 * hash], f_i %+ w
296
297 add f_i, 1 ; f_i = p+1 for the rest of the iteration
298
299 mov dist2 %+ w, f_i %+ w
300 sub dist2, 1
301 sub dist2 %+ w, word [histogram + _hash_offset + 2 * hash2]
302 mov [histogram + _hash_offset + 2 * hash2], f_i %+ w
303
304 ;; Start computing hashes to be used in either the next loop or
305 ;; for updating the hash if a match is found
306 MOVQ curr_data2, xdata
307 MOVQ tmp2, xdata
308 shr curr_data2, 8 ; data starting at p+2
309 compute_hash hash, curr_data2
310
311 ;; Check if look back distances are valid. Load a junk distance of 1
312 ;; if the look back distance is too long for speculative lookups.
;; The AND keeps only the bits selected by (D-1); NEG then turns the value
;; into a negative byte offset used directly in the address expressions below.
313 and dist %+ d, (D-1)
314 neg dist
315
316 and dist2 %+ d, (D-1)
317 neg dist2
318
319 shr tmp2, 16 ; data starting at p+3
320 compute_hash hash2, tmp2
321
322 ;; Check for long len/dist matches (>7)
;; XOR of the 8 current bytes with the 8 candidate bytes: zero means all 8 match.
323 mov len, curr_data
324 xor len, [tmp1 + dist - 1]
325 jz compare_loop
326
327 and hash %+ d, HASH_MASK
328 and hash2 %+ d, HASH_MASK
329
;; Same 8-byte test for the second position (data starting at p+1).
330 MOVQ len2, xdata
331 xor len2, [tmp1 + dist2]
332 jz compare_loop2
333
334 ;; Speculatively load the code for the first literal
335 movzx tmp1, curr_data %+ b
336 shr curr_data, 8 ; low byte of curr_data is now the byte at p+1
337
338 lea tmp3, [f_i + 1] ; tmp3 = p+2, first position whose hash entry is still to be stored
339
340 ;; Check for len/dist match for first literal
;; Low 4 bytes of the XOR are zero => at least 4 bytes match at p.
341 test len %+ d, 0xFFFFFFFF
342 jz len_dist_huffman_pre
343
344 ;; Store first literal
345 inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * tmp1]
346
347 ;; Check for len/dist match for second literal
;; Falls through to len_dist_lit_huffman_pre when the low 4 bytes match at p+1.
348 test len2 %+ d, 0xFFFFFFFF
349 jnz lit_lit_huffman
;; Literal at p followed by a match starting at p+1.
;; Reached by fall-through from loop2 (short match, length derived from the
;; XOR word) or by a jump to len_dist_lit_huffman from compare_loop2 (len2
;; already set). The literal at p has been counted before this point.
350 len_dist_lit_huffman_pre:
351 ;; Calculate repeat length
;; Trailing zero bits of the XOR word divided by 8 = number of equal leading bytes.
352 tzcnt len2, len2
353 shr len2, 3
354
355 len_dist_lit_huffman:
356 MOVQ curr_data, xdata
357 shr curr_data, 24 ; data starting at p+4
358 compute_hash hash3, curr_data
359
360 ;; Store updated hashes
;; hash / hash2 were computed in loop2 for p+2 and p+3; tmp3 holds p+2 here.
361 mov [histogram + _hash_offset + 2 * hash], tmp3 %+ w
362 add tmp3,1
363 mov [histogram + _hash_offset + 2 * hash2], tmp3 %+ w
364 add tmp3, 1
365
366 add f_i, len2 ; f_i was p+1, so this skips past the match
367
;; Reload data and start the hashes for the next iteration at the new f_i.
368 MOVDQU xdata, [file_start + f_i]
369 mov curr_data, [file_start + f_i]
370 mov tmp1, curr_data
371 compute_hash hash, curr_data
372
373 and hash3, HASH_MASK
374 mov [histogram + _hash_offset + 2 * hash3], tmp3 %+ w ; tmp3 = p+4
375
;; hash3 shares rbp with len_code, so it must be consumed before the
;; len_to_len_code call below overwrites it.
376 dist_to_dist_code2 dist_code2, dist2
377
378 len_to_len_code len_code, len2, rfc_lookup
379
380 shr tmp1, 8
381 compute_hash hash2, tmp1
382
383 inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * len_code]
384 inc qword [histogram + _dist_offset + HIST_ELEM_SIZE * dist_code2]
385
386 and hash2 %+ d, HASH_MASK
387 and hash, HASH_MASK
388
389 cmp f_i, file_length
390 jl loop2
391 jmp end_loop_2
392 ;; encode as dist/len
392 ;; encode as dist/len
393
;; Match starting at p (the first position of the loop2 iteration).
;; Reached from loop2 via len_dist_huffman_pre (short match, length derived
;; from the XOR word) or from compare_loop via len_dist_huffman (len already set).
394 len_dist_huffman_pre:
;; Trailing zero bits of the XOR word divided by 8 = number of equal leading bytes.
395 tzcnt len, len
396 shr len, 3
397
398 len_dist_huffman:
;; Record positions p+2 and p+3 under the hashes computed for them in loop2.
399 mov [histogram + _hash_offset + 2 * hash], tmp3 %+ w
400 add tmp3,1
401 mov [histogram + _hash_offset + 2 * hash2], tmp3 %+ w
402
;; f_i was advanced to p+1 in loop2; step back to p and skip the match.
403 dec f_i
404 add f_i, len
405
;; Reload data and start the hashes for the next iteration at the new f_i.
406 MOVDQU xdata, [file_start + f_i]
407 mov curr_data, [file_start + f_i]
408 mov tmp1, curr_data
409 compute_hash hash, curr_data
410
411 dist_to_dist_code2 dist_code, dist
412
413 len_to_len_code len_code, len, rfc_lookup
414
415 shr tmp1, 8
416 compute_hash hash2, tmp1
417
418 inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * len_code]
419 inc qword [histogram + _dist_offset + HIST_ELEM_SIZE * dist_code]
420
421 and hash2 %+ d, HASH_MASK
422 and hash, HASH_MASK
423
424 cmp f_i, file_length
425 jl loop2
426 jmp end_loop_2
427
;; Neither position matched: the literal at p was already counted in loop2,
;; so count the literal at p+1 and continue at p+2.
;; hash / hash2 already hold the masked hashes for p+2 and p+3 from loop2.
428 lit_lit_huffman:
429 MOVDQU xdata, [file_start + f_i + 1] ; f_i is still p+1 here, so this loads from p+2
430 and curr_data, 0xff ; curr_data was shifted by 8 in loop2: low byte = byte at p+1
431 add f_i, 1
432 inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * curr_data]
433
434 cmp f_i, file_length
435 jl loop2
436
;; Tail loop for the bytes loop2 does not process. file_length is restored
;; to (original length - LAST_BYTES_COUNT); this loop handles one position
;; per iteration while f_i is below that limit.
437 end_loop_2:
438 add file_length, LA_STATELESS - LAST_BYTES_COUNT
439 cmp f_i, file_length
440 jge final_bytes
441
442 loop2_finish:
;; Hash the 4 bytes at f_i (the dword load zero-extends into curr_data).
443 mov curr_data %+ d, dword [file_start + f_i]
444 compute_hash hash, curr_data
445 and hash %+ d, HASH_MASK
446
447 ;; Calculate possible distance for length/dist pair.
448 xor dist, dist
449 mov dist %+ w, f_i %+ w
450 sub dist %+ w, word [histogram + _hash_offset + 2 * hash]
451 mov [histogram + _hash_offset + 2 * hash], f_i %+ w
452
453 ;; Check if look back distance is valid (the dec is to handle when dist = 0)
;; Unsigned compare: dist = 0 wraps to a huge value and is rejected as well.
454 dec dist
455 cmp dist %+ d, (D-1)
456 jae encode_literal_finish
457 inc dist
458
459 ;; Check if look back distance is a match
;; tmp4 = bytes remaining up to the original end of input,
;; tmp1 = current position, tmp2 = candidate position dist bytes back.
;; `compare` is defined outside this file; presumably it leaves the number of
;; matching bytes in len (the only operand read afterwards) - TODO confirm.
460 lea tmp4, [file_length + LAST_BYTES_COUNT]
461 sub tmp4, f_i
462 lea tmp1, [file_start + f_i]
463 mov tmp2, tmp1
464 sub tmp2, dist
465 compare tmp4, tmp1, tmp2, len, tmp3
466
467 ;; Limit len to maximum value of 258
468 mov tmp2, 258
469 cmp len, 258
470 cmova len, tmp2
471 cmp len, SHORTEST_MATCH
472 jb encode_literal_finish
473
474 add f_i, len
475
;; len and len_code share rbp: the length is converted in place.
476 len_to_len_code len_code, len, rfc_lookup
477 dist_to_dist_code dist_code, dist
478
479 inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * len_code]
480 inc qword [histogram + _dist_offset + HIST_ELEM_SIZE * dist_code]
481
482 cmp f_i, file_length
483 jl loop2_finish
484 jmp final_bytes
485
486 encode_literal_finish:
487 ;; Encode literal
;; Low byte of the dword loaded at loop2_finish is the byte at f_i.
488 and curr_data %+ d, 0xFF
489 inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * curr_data]
490
491 ;; Setup for next loop
492 add f_i, 1
493 cmp f_i, file_length
494 jl loop2_finish
495
;; Count every remaining byte as a literal. file_length is restored to the
;; original input length first, so the loop runs up to the end of the input.
496 final_bytes:
497 add file_length, LAST_BYTES_COUNT
498 final_bytes_loop:
499 cmp f_i, file_length
500 jge end
501 movzx curr_data, byte [file_start + f_i]
502 inc qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * curr_data]
503 inc f_i
504 jmp final_bytes_loop
505
506 end:
507 ;; Handle eob at end of stream
;; Writes back the value saved on entry (previous counter at index 256, plus one).
508 mov tmp1, [rsp + _eob_count_offset]
509 mov qword [histogram + _lit_len_offset + HIST_ELEM_SIZE * 256], tmp1
510
;; Common exit; also reached directly from the entry code when file_length is 0.
511 exit_ret:
512 FUNC_RESTORE
513 ret
514
;; Out-of-line path taken from loop2 when all 8 bytes at position p equal the
;; candidate bytes. tmp1 = address of p, tmp2 = address of the candidate.
;; The compare250* macros are defined outside this file; presumably they
;; extend the match and leave its length in len - TODO confirm.
515 compare_loop:
;; loop2 branched here before masking the freshly computed hashes.
516 and hash %+ d, HASH_MASK
517 and hash2 %+ d, HASH_MASK
518 lea tmp2, [tmp1 + dist - 1]
519 %if (COMPARE_TYPE == 1)
520 compare250 tmp1, tmp2, len, tmp3
521 %elif (COMPARE_TYPE == 2)
522 compare250_x tmp1, tmp2, len, tmp3, xtmp0, xtmp1
523 %elif (COMPARE_TYPE == 3)
524 compare250_y tmp1, tmp2, len, tmp3, ytmp0, ytmp1
525 %else
526 %error Unknown Compare type COMPARE_TYPE
527 % error
528 %endif
;; tmp3 was passed to the compare macro as an operand; set it (again) to
;; f_i + 1 = p+2, as len_dist_huffman expects.
529 lea tmp3, [f_i + 1]
530 jmp len_dist_huffman
531
;; Out-of-line path taken from loop2 when all 8 bytes at position p+1 equal
;; the candidate bytes. The hashes were already masked before the branch.
;; The compare250* macros are defined outside this file; presumably they
;; extend the match and leave its length in len2 - TODO confirm.
532 compare_loop2:
533 add tmp1, 1 ; tmp1 = address of p+1
534 lea tmp2, [tmp1 + dist2 - 1] ; same address loop2 tested via [tmp1 + dist2] before the add
535
536 %if (COMPARE_TYPE == 1)
537 compare250 tmp1, tmp2, len2, tmp3
538 %elif (COMPARE_TYPE == 2)
539 compare250_x tmp1, tmp2, len2, tmp3, xtmp0, xtmp1
540 %elif (COMPARE_TYPE == 3)
541 compare250_y tmp1, tmp2, len2, tmp3, ytmp0, ytmp1
542 %else
543 %error Unknown Compare type COMPARE_TYPE
544 % error
545 %endif
;; The byte at p precedes the match: count it as a literal. curr_data has not
;; been shifted on this path, so its low byte is still the byte at p.
546 and curr_data, 0xff
547 inc qword [histogram + _lit_len_offset + 8 * curr_data]
;; tmp3 was passed to the compare macro as an operand; set it to
;; f_i + 1 = p+2, as len_dist_lit_huffman expects.
548 lea tmp3, [f_i + 1]
549 jmp len_dist_lit_huffman
550
;; 32-byte aligned table of sixteen 16-bit words, each holding
;; -(D + 1) truncated to 16 bits.
;; NOTE(review): D_vector is not referenced by any code visible in this file.
551 section .data
552 align 32
553 D_vector:
554 dw -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF
555 dw -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF
556 dw -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF
557 dw -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF, -(D + 1) & 0xFFFF