]> git.proxmox.com Git - ceph.git/blob - ceph/src/spdk/isa-l/igzip/igzip_gen_icf_map_lh1_04.asm
update source to Ceph Pacific 16.2.2
[ceph.git] / ceph / src / spdk / isa-l / igzip / igzip_gen_icf_map_lh1_04.asm
1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2018 Intel Corporation All rights reserved.
3 ;
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
6 ; are met:
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
12 ; distribution.
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
16 ;
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29
30 %include "reg_sizes.asm"
31 %include "lz0a_const.asm"
32 %include "data_struct2.asm"
33 %include "huffman.asm"
34
35
;; Build-time switches read by the include files / macros used below.
;; NOTE(review): USE_HSWNI presumably selects the Haswell-and-later variant of
;; helper macros such as get_dist_icf_code (see the comment at its call site);
;; ARCH presumably tags the code-path generation (04) - confirm in the includes.
36 %define USE_HSWNI
37 %define ARCH 04

;; Every constant table in this file is referenced through a bare label
;; ([increment], [ones], [hash_prod], [datas_shuf], ...).  Without DEFAULT REL
;; those operands are assembled as absolute addresses, which need load-time
;; text relocations and do not link into position-independent objects.
;; Operands that contain a base/index register are unaffected by this directive.
;; NOTE(review): harmless if one of the include files already sets it.
default rel
38
;; ---------------------------------------------------------------------------
;; Register allocation.
;; arg1..arg3 follow the calling convention of the output format: rcx/rdx/r8
;; for win64, rdi/rsi/rdx otherwise.  'hash' and 'next_in' are scratch
;; registers; on win64 they live in rsi/rdi, which FUNC_SAVE spills and
;; FUNC_RESTORE reloads.
;; ---------------------------------------------------------------------------
39 %ifidn __OUTPUT_FORMAT__, win64
40 %define arg1 rcx
41 %define arg2 rdx
42 %define arg3 r8
43 %define hash rsi
44 %define next_in rdi
45 %else
46 %define arg1 rdi
47 %define arg2 rsi
48 %define arg3 rdx
49 %define hash r8
50 %define next_in rcx
51 %endif
52
;; 'stream' and 'level_buf' are the SAME register (arg1): the function reads
;; what it needs through 'stream', copies the pointer to 'tmp', and then
;; overwrites arg1 with [stream + _level_buf].  After that point only
;; 'level_buf' (and therefore 'hash_table') is meaningful.
53 %define stream arg1
54 %define level_buf arg1
55 %define matches_next arg2
56 %define f_i_end arg3
57
;; General purpose working registers.  rbp, r12 and r13 are preserved by
;; FUNC_SAVE / FUNC_RESTORE on both output formats.
58 %define f_i rax
59 %define file_start rbp
60 %define tmp r9
61 %define tmp2 r10
62 %define prev_len r11
63 %define prev_dist r12
64 %define f_i_orig r13
65
;; Address expression of the hash table inside the level buffer; only valid
;; once arg1 holds level_buf (see above).
66 %define hash_table level_buf + _hash_map_hash_table
67
;; Vector registers.  ymm5, ymm11 and ymm15 each carry several aliases below;
;; the aliases name different short-lived temporaries that share one physical
;; register, so their live ranges must never overlap.
68 %define datas ymm0
69 %define datas_lookup ymm1
70 %define yhashes ymm2
71 %define ydists ymm3
72 %define ydists_lookup ymm4
73
;; ymm5: shared temporary
74 %define ydownconvert_qd ymm5
75 %define ydists2 ymm5
76 %define yscatter ymm5
77 %define ytmp2 ymm5
78 %define ynull_syms ymm5
79
80 %define ylens1 ymm6
81 %define ylens2 ymm7
82 %define ylookup ymm8
83 %define ylookup2 ymm9
84 %define yindex ymm10
85
;; ymm11: shared temporary
86 %define yrot_left ymm11
87 %define yshift_finish ymm11
88 %define yqword_shuf ymm11
89 %define yhash_prod ymm11
90 %define ycode ymm11
91 %define ytmp3 ymm11
92
;; ymm12..ymm14: loaded once before the main loop and kept for its duration
93 %define yones ymm12
94 %define ydatas_perm2 ymm13
95 %define yincrement ymm14
96
;; ymm15: shared temporary
97 %define ytmp ymm15
98 %define ydist_extra ymm15
99 %define yhash_mask ymm15
100 %define ydist_mask ymm15
101
;; ---------------------------------------------------------------------------
;; Prologue / epilogue macros and stack frame layout.
;;
;; win64 frame (offsets from rsp after FUNC_SAVE):
;;   [  0*16 ..  9*16+15]  xmm6..xmm15
;;   [10*16 + 0*8 .. 4*8]  rsi, rdi, rbp, r12, r13
;;   [local_storage_offset, +16)  two qword locals (dist mask, hash mask)
;; stack_size = 232 bytes, i.e. 8 mod 16; with the usual "rsp = 8 mod 16 on
;; entry" this leaves rsp 16-byte aligned, which the vmovdqa spills rely on.
;;
;; other formats: rbp, r12, r13 are pushed and 16 bytes of locals are
;; reserved at [rsp .. rsp+15].
;; ---------------------------------------------------------------------------
102 %ifidn __OUTPUT_FORMAT__, win64
103 %define stack_size 10*16 + 6 * 8 + 3 * 8
104 %define local_storage_offset (stack_size - 16)
105 %define func(x) proc_frame x
106
;; FUNC_SAVE (win64): allocate the frame and spill the registers listed above.
;; alloc_stack / save_reg / end_prolog are not defined in this file
;; (presumably the assembler's win64 unwind directives - confirm).
107 %macro FUNC_SAVE 0
108 alloc_stack stack_size
109 vmovdqa [rsp + 0*16], xmm6
110 vmovdqa [rsp + 1*16], xmm7
111 vmovdqa [rsp + 2*16], xmm8
112 vmovdqa [rsp + 3*16], xmm9
113 vmovdqa [rsp + 4*16], xmm10
114 vmovdqa [rsp + 5*16], xmm11
115 vmovdqa [rsp + 6*16], xmm12
116 vmovdqa [rsp + 7*16], xmm13
;; Aligned store like every neighbouring slot and like the matching reload in
;; FUNC_RESTORE (this one slot previously used the unaligned form).
117 vmovdqa [rsp + 8*16], xmm14
118 vmovdqa [rsp + 9*16], xmm15
119 save_reg rsi, 10*16 + 0*8
120 save_reg rdi, 10*16 + 1*8
121 save_reg rbp, 10*16 + 2*8
122 save_reg r12, 10*16 + 3*8
123 save_reg r13, 10*16 + 4*8
124 end_prolog
125 %endm
126
;; FUNC_RESTORE (win64): exact mirror of FUNC_SAVE, then release the frame.
127 %macro FUNC_RESTORE 0
128 vmovdqa xmm6, [rsp + 0*16]
129 vmovdqa xmm7, [rsp + 1*16]
130 vmovdqa xmm8, [rsp + 2*16]
131 vmovdqa xmm9, [rsp + 3*16]
132 vmovdqa xmm10, [rsp + 4*16]
133 vmovdqa xmm11, [rsp + 5*16]
134 vmovdqa xmm12, [rsp + 6*16]
135 vmovdqa xmm13, [rsp + 7*16]
136 vmovdqa xmm14, [rsp + 8*16]
137 vmovdqa xmm15, [rsp + 9*16]
138
139 mov rsi, [rsp + 10*16 + 0*8]
140 mov rdi, [rsp + 10*16 + 1*8]
141 mov rbp, [rsp + 10*16 + 2*8]
142 mov r12, [rsp + 10*16 + 3*8]
143 mov r13, [rsp + 10*16 + 4*8]
144 add rsp, stack_size
145 %endm
146 %else
;; Non-win64: only the two qword locals live in the explicit frame.
147 %define stack_size 16
148 %define local_storage_offset 0
149
150 %define func(x) x:
;; FUNC_SAVE: preserve the three callee-saved GPRs this function uses
;; (file_start = rbp, prev_dist = r12, f_i_orig = r13), then reserve locals.
151 %macro FUNC_SAVE 0
152 push rbp
153 push r12
154 push r13
155 sub rsp, stack_size
156 %endm
157
;; FUNC_RESTORE: release locals and pop in the reverse order of FUNC_SAVE.
158 %macro FUNC_RESTORE 0
159 add rsp, stack_size
160 pop r13
161 pop r12
162 pop rbp
163 %endm
164 %endif
165
;; Offsets (from rsp) of the two qword stack locals.  The function stores the
;; distance mask and the hash mask there once and later re-broadcasts them
;; with vpbroadcastd, because the ymm register that holds a mask (ymm15) is
;; reused as a temporary in between.
166 %define dist_mask_offset local_storage_offset
167 %define hash_mask_offset local_storage_offset + 8
168
;; VECT_SIZE : input positions handled per iteration (8 dword lanes per ymm).
;; HASH_BYTES: scale applied to a hash value when indexing hash_table; the
;;             table is written with 16-bit (word) stores in the scalar paths.
169 %define VECT_SIZE 8
170 %define HASH_BYTES 2
171
;-----------------------------------------------------------------------
; gen_icf_map_lh1_04(arg1, arg2, arg3)
;
; AVX2 routine that walks the input VECT_SIZE (8) positions at a time, looks
; each position up in a 16-bit hash table, measures the match found there
; (0..8 bytes) and writes one 32-bit code per position to the output buffer:
; either a length/distance code or a literal code.
;
; In:    arg1 = stream; fields read through it: _next_in, _total_in,
;               _internal_state_dist_mask, _internal_state_hash_mask,
;               _internal_state_has_hist (also written), _level_buf
;        arg2 = matches_next, output pointer; 8 dwords are stored per vector
;        arg3 = added to _total_in to form the end index
;               (presumably the number of input bytes - confirm with caller)
; Out:   rax  = f_i - f_i_orig - 1, where f_i_orig is _total_in on entry and
;               f_i is the position reached
; Saves: rbp, r12, r13 (plus rsi, rdi, xmm6-xmm15 on win64) via FUNC_SAVE
; Notes: emitted codes lag the scanned position by one: the decision for a
;        position is taken after the following position has been measured,
;        with prev_len / prev_dist carrying the last lane across iterations.
;        Loads run past the position being processed (LA and VECT_SIZE are
;        subtracted from the end index up front).
;        NOTE(review): callers presumably guarantee that much readable slack.
;-----------------------------------------------------------------------
172 global gen_icf_map_lh1_04
173 func(gen_icf_map_lh1_04)
174 FUNC_SAVE
175
;; file_start = next_in - total_in, so [file_start + f_i] addresses the byte
;; at absolute index f_i.  The dword load zero-extends total_in into f_i.
176 mov file_start, [stream + _next_in]
177 mov f_i %+ d, dword [stream + _total_in]
178 mov f_i_orig, f_i
179
180 sub file_start, f_i
181 add f_i_end, f_i
;; Nothing to process: leave through end_main (rax becomes -1 there).
182 cmp f_i, f_i_end
183 jge end_main
184
185 ;; Prep for main loop
;; Park both masks in the stack locals; they are re-broadcast on demand.
186 mov tmp %+ d, dword [stream + _internal_state_dist_mask]
187 mov [rsp + dist_mask_offset], tmp
188 mov tmp %+ d, dword [stream + _internal_state_hash_mask]
189 mov [rsp + hash_mask_offset], tmp
;; Keep the stream pointer in tmp: arg1 is overwritten with level_buf next.
190 mov tmp, stream
191 mov level_buf, [stream + _level_buf]
192 sub f_i_end, LA
193 vmovdqu yincrement, [increment]
194 vpbroadcastd yones, [ones]
195 vmovdqu ydatas_perm2, [datas_perm2]
196
197 ;; Process first byte
;; Scalar hash of the 4 bytes at f_i: two vpmaddwd rounds with hash_prod,
;; then masked with the hash mask.
198 vpbroadcastd yhash_prod, [hash_prod]
199 vpbroadcastd yhash_mask, [rsp + hash_mask_offset]
200 vmovd yhashes %+ x, dword [f_i + file_start]
201 vpmaddwd yhashes, yhashes, yhash_prod
202 vpmaddwd yhashes, yhashes, yhash_prod
203 vpand yhashes, yhashes, yhash_mask
204 vmovd hash %+ d, yhashes %+ x
205 cmp byte [tmp + _internal_state_has_hist], IGZIP_NO_HIST
206 jne .has_hist
207 ;; No history, the byte is a literal
208 xor prev_len, prev_len
209 xor prev_dist, prev_dist
210 mov byte [tmp + _internal_state_has_hist], IGZIP_HIST
211 jmp .byte_processed
212
213 .has_hist:
214 ;; History exists, need to set prev_len and prev_dist accordingly
215 lea next_in, [f_i + file_start]
216
217 ;; Determine match lookback distance
;; tmp = ((f_i - 1 - table[hash]) mod 2^16) & dist_mask; the candidate match
;; starts at next_in - tmp - 1.
218 xor tmp, tmp
219 mov tmp %+ w, f_i %+ w
220 dec tmp
221 sub tmp %+ w, word [hash_table + HASH_BYTES * hash]
222
223 and tmp %+ d, [rsp + dist_mask_offset]
224 neg tmp
225
226 ;; Check first 8 bytes of match
;; prev_len temporarily holds the XOR of 8 current bytes with 8 candidate bytes.
227 mov prev_len, [next_in]
228 xor prev_len, [next_in + tmp - 1]
229 neg tmp
230
231 ;; Set prev_dist
;; When arg1 is rcx (win64) it is parked in tmp2 around the macro call.
232 %ifidn arg1, rcx
233 mov tmp2, rcx
234 %endif
235 ;; The third register is unused on Haswell and later,
236 ;; This line will not work on previous architectures
237 get_dist_icf_code tmp, prev_dist, tmp
238
239 %ifidn arg1, rcx
240 mov rcx, tmp2
241 %endif
242
243 ;; Set prev_len
;; Trailing zero bits of the XOR / 8 = number of leading equal bytes (0..8);
;; anything shorter than MIN_DEF_MATCH is treated as no match.
244 xor tmp2, tmp2
245 tzcnt prev_len, prev_len
246 shr prev_len, 3
247 cmp prev_len, MIN_DEF_MATCH
248 cmovl prev_len, tmp2
249
250 .byte_processed:
;; Record the first position in the hash table and step past it.
251 mov word [hash_table + HASH_BYTES * hash], f_i %+ w
252
253 add f_i, 1
254
255 ;;hash
;; vpshufb works per 128-bit lane, so the low 16 input bytes are first copied
;; to both lanes (vpermq 0x44); datas_shuf then yields the 4-byte window that
;; starts at each of the 8 positions f_i+0 .. f_i+7.
256 vmovdqu datas, [f_i + file_start]
257 vpermq yhashes, datas, 0x44
258 vpshufb yhashes, yhashes, [datas_shuf]
259 vpmaddwd yhashes, yhashes, yhash_prod
260 vpmaddwd yhashes, yhashes, yhash_prod
261 vpand yhashes, yhashes, yhash_mask
262
;; ylookup / ylookup2 = the 8-byte windows starting at positions 0..3 / 4..7;
;; they are XORed against the gathered candidate bytes one step later.
263 vpermq ylookup, datas, 0x44
264 vmovdqu yqword_shuf, [qword_shuf]
265 vpshufb ylookup, ylookup, yqword_shuf
266 vpermd ylookup2, ydatas_perm2, datas
267 vpshufb ylookup2, ylookup2, yqword_shuf
268
269 ;;gather/scatter hashes
;; Each gathered dword holds the wanted 16-bit entry in its low half and the
;; neighbouring table entry in its high half.
270 vpcmpeqq ytmp, ytmp, ytmp
271 vpgatherdd ydists_lookup, [hash_table + HASH_BYTES * yhashes], ytmp
272
;; yscatter = (gathered & 0xffff0000) | (position & 0xffff): the new position
;; goes into the entry while the neighbouring word is written back unchanged.
273 vpbroadcastd ytmp2, [upper_word]
274 vpbroadcastd ytmp, [low_word]
275 vmovd yindex %+ x, f_i %+ d
276 vpbroadcastd yindex, yindex %+ x
277 vpaddd yindex, yindex, yincrement
278 vpand yscatter, ydists_lookup, ytmp2
279 vpand ytmp, yindex, ytmp
280 vpor yscatter, yscatter, ytmp
281
;; Hand-rolled scatter, lanes 0..3 then 4..7, one dword store per lane.
282 vmovd tmp %+ d, yhashes %+ x
283 vmovd [hash_table + HASH_BYTES * tmp], yscatter %+ x
284 vpextrd tmp %+ d, yhashes %+ x, 1
285 vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 1
286 vpextrd tmp %+ d, yhashes %+ x, 2
287 vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 2
288 vpextrd tmp %+ d,yhashes %+ x, 3
289 vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 3
290
291 vextracti128 yscatter %+ x, yscatter, 1
292 vextracti128 yhashes %+ x, yhashes, 1
293
294 vmovd tmp %+ d, yhashes %+ x
295 vmovd [hash_table + HASH_BYTES * tmp], yscatter %+ x
296 vpextrd tmp %+ d, yhashes %+ x, 1
297 vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 1
298 vpextrd tmp %+ d, yhashes %+ x, 2
299 vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 2
300 vpextrd tmp %+ d,yhashes %+ x, 3
301 vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 3
302
303 ;; Compute hash for next loop
;; yhash_prod / yhash_mask share registers with temporaries, so reload them.
304 vpbroadcastd yhash_prod, [hash_prod]
305 vpbroadcastd yhash_mask, [rsp + hash_mask_offset]
306 vmovdqu datas, [f_i + file_start + VECT_SIZE]
307 vpermq yhashes, datas, 0x44
308 vpshufb yhashes, yhashes, [datas_shuf]
309 vpmaddwd yhashes, yhashes, yhash_prod
310 vpmaddwd yhashes, yhashes, yhash_prod
311 vpand yhashes, yhashes, yhash_mask
312
313 vmovdqu datas_lookup, [f_i + file_start + 2 * VECT_SIZE]
314
;; Signed compare on purpose: the end index has had LA and VECT_SIZE
;; subtracted and may be below f_i for short inputs.
315 sub f_i_end, VECT_SIZE
316 cmp f_i, f_i_end
317 jg .loop1_end
318
;; ---------------------------------------------------------------------------
;; Main loop.  On entry to every iteration:
;;   ydists_lookup = table entries gathered for positions f_i .. f_i+7
;;   yindex        = those positions
;;   ylookup/2     = 8-byte windows at those positions
;;   yhashes       = hashes of positions f_i+8 .. f_i+15
;;   prev_len/dist = length / shifted distance code of position f_i-1
;; ---------------------------------------------------------------------------
319 .loop1:
320 lea next_in, [f_i + file_start]
321
322 ;; Calculate look back dists
;; d = (index - (entry + 1)) & dist_mask; ydists = lane - (d + 1), i.e. the
;; negative byte offset of the candidate relative to next_in.
;; NOTE(review): assumes dist_mask fits in 16 bits so that the neighbouring
;; entry in the upper half of each gathered dword is masked away - confirm.
323 vpbroadcastd ydist_mask, [rsp + dist_mask_offset]
324 vpaddd ydists, ydists_lookup, yones
325 vpsubd ydists, yindex, ydists
326 vpand ydists, ydists, ydist_mask
327 vpaddd ydists, ydists, yones
328 vpsubd ydists, yincrement, ydists
329
330 ;;gather/scatter hashes
;; f_i now names the NEXT vector; next_in still points at the current one.
331 add f_i, VECT_SIZE
332
333 vpcmpeqq ytmp, ytmp, ytmp
334 vpgatherdd ydists_lookup, [hash_table + HASH_BYTES * yhashes], ytmp
335
336 vpbroadcastd ytmp2, [upper_word]
337 vpbroadcastd ytmp, [low_word]
338 vmovd yindex %+ x, f_i %+ d
339 vpbroadcastd yindex, yindex %+ x
340 vpaddd yindex, yindex, yincrement
341 vpand yscatter, ydists_lookup, ytmp2
342 vpand ytmp, yindex, ytmp
343 vpor yscatter, yscatter, ytmp
344
345 vmovd tmp %+ d, yhashes %+ x
346 vmovd [hash_table + HASH_BYTES * tmp], yscatter %+ x
347 vpextrd tmp %+ d, yhashes %+ x, 1
348 vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 1
349 vpextrd tmp %+ d, yhashes %+ x, 2
350 vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 2
351 vpextrd tmp %+ d,yhashes %+ x, 3
352 vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 3
353
354 vextracti128 yscatter %+ x, yscatter, 1
355 vextracti128 yhashes %+ x, yhashes, 1
356
357 vmovd tmp %+ d, yhashes %+ x
358 vmovd [hash_table + HASH_BYTES * tmp], yscatter %+ x
359 vpextrd tmp %+ d, yhashes %+ x, 1
360 vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 1
361 vpextrd tmp %+ d, yhashes %+ x, 2
362 vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 2
363 vpextrd tmp %+ d,yhashes %+ x, 3
364 vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 3
365
366 ;; Compute hash for next loop
367 vpbroadcastd yhash_prod, [hash_prod]
368 vpbroadcastd yhash_mask, [rsp + hash_mask_offset]
369 vpermq yhashes, datas_lookup, 0x44
370 vpshufb yhashes, yhashes, [datas_shuf]
371 vpmaddwd yhashes, yhashes, yhash_prod
372 vpmaddwd yhashes, yhashes, yhash_prod
373 vpand yhashes, yhashes, yhash_mask
374
375 ;;lookup old codes
;; Fetch 8 bytes at each candidate: lanes 0..3 into ylens1, 4..7 into ylens2.
;; The gather consumes its mask, hence the mask is rebuilt before each one.
376 vextracti128 ydists2 %+ x, ydists, 1
377
378 vpcmpeqq ytmp, ytmp, ytmp
379 vpgatherdq ylens1, [next_in + ydists %+ x], ytmp
380 vpcmpeqq ytmp, ytmp, ytmp
381 vpgatherdq ylens2, [next_in + ydists2 %+ x], ytmp
382
383 ;; Calculate dist_icf_code
;; Undo the offset form: ydists = lane - (offset + 1) = d.
384 vpaddd ydists, ydists, yones
385 vpsubd ydists, yincrement, ydists
386
;; The byte-shuffle sequence below derives a per-lane bit count
;; (ydist_extra) from the nibbles of d using the bit_index / nibble_order /
;; base_offset tables.
387 vpbroadcastd ytmp2, [low_nibble]
388 vbroadcasti128 ytmp3, [nibble_order]
389 vpslld ydist_extra, ydists, 12
390 vpor ydist_extra, ydists, ydist_extra
391 vpand ydist_extra, ydist_extra, ytmp2
392 vpshufb ydist_extra, ydist_extra, ytmp3
393 vbroadcasti128 ytmp2, [bit_index]
394 vpshufb ydist_extra, ytmp2, ydist_extra
395 vpxor ytmp2, ytmp2, ytmp2
396 vpcmpgtb ytmp2, ydist_extra, ytmp2
397 vpsrld ytmp3, ytmp2, 8
398 vpandn ytmp2, ytmp3, ytmp2
399 vpsrld ytmp3, ytmp2, 16
400 vpandn ytmp2, ytmp3, ytmp2
401 vpsrld ytmp3, ytmp2, 24
402 vpandn ytmp2, ytmp3, ytmp2
403 vpbroadcastd ytmp3, [base_offset]
404 vpaddb ydist_extra, ytmp3
405 vpand ydist_extra, ydist_extra, ytmp2
406 vpsrlq ytmp2, ydist_extra, 32
407 vpxor ytmp3, ytmp3, ytmp3
408 vpsadbw ydist_extra, ydist_extra, ytmp3
409 vpsadbw ytmp2, ytmp2, ytmp3
410 vpsubd ydist_extra, ydist_extra, ytmp2
411 vpsllq ytmp2, ytmp2, 32
412 vpor ydist_extra, ydist_extra, ytmp2
413 vpcmpgtb ytmp3, ydist_extra, ytmp3
414 vpand ydist_extra, ydist_extra, ytmp3
415
;; ycode  = low ydist_extra bits of d (forced to 0 unless d > 1)
;; ydists = (d >> extra) + 2 * extra + (ycode << (EXTRA_BITS_OFFSET - DIST_OFFSET))
416 vpsllvd ycode, yones, ydist_extra
417 vpsubd ycode, ycode, yones
418 vpcmpgtd ytmp2, ydists, yones
419 vpand ycode, ydists, ycode
420 vpand ycode, ycode, ytmp2
421 vpsrlvd ydists, ydists, ydist_extra
422 vpslld ydist_extra, ydist_extra, 1
423 vpaddd ydists, ydists, ydist_extra
424 vpslld ycode, ycode, EXTRA_BITS_OFFSET - DIST_OFFSET
425 vpaddd ydists, ydists, ycode
426
427 ;; Setup ydists for combining with ylens
428 vpslld ydists, ydists, DIST_OFFSET
429
430 ;; xor current data with lookback dist
431 vpxor ylens1, ylens1, ylookup
432 vpxor ylens2, ylens2, ylookup2
433
434 ;; Setup registers for next loop
435 vpermq ylookup, datas, 0x44
436 vmovdqu yqword_shuf, [qword_shuf]
437 vpshufb ylookup, ylookup, yqword_shuf
438 vpermd ylookup2, ydatas_perm2, datas
439 vpshufb ylookup2, ylookup2, yqword_shuf
440
441 ;; Compute match length
;; Equal bytes become 0xff, are reduced to one bit per byte position
;; (shift_finish) and summed per qword, giving an 8-bit "bytes equal" mask.
442 vpxor ytmp, ytmp, ytmp
443 vpcmpeqb ylens1, ylens1, ytmp
444 vpcmpeqb ylens2, ylens2, ytmp
445 vpbroadcastq yshift_finish, [shift_finish]
446 vpand ylens1, ylens1, yshift_finish
447 vpand ylens2, ylens2, yshift_finish
448 vpsadbw ylens1, ylens1, ytmp
449 vpsadbw ylens2, ylens2, ytmp
;; Pack the four qword masks of each half into dwords and join both halves
;; so that ylens1 holds one mask per lane 0..7.
450 vmovdqu ydownconvert_qd, [downconvert_qd]
451 vpshufb ylens1, ylens1, ydownconvert_qd
452 vextracti128 ytmp %+ x, ylens1, 1
453 vpor ylens1, ylens1, ytmp
454 vpshufb ylens2, ylens2, ydownconvert_qd
455 vextracti128 ytmp %+ x, ylens2, 1
456 vpor ylens2, ylens2, ytmp
457 vinserti128 ylens1, ylens1, ylens2 %+ x, 1
;; Length = run of set bits starting at bit 0, via a nibble table: the high
;; nibble only counts when the low nibble is fully set (== 4).
458 vpbroadcastd ytmp, [low_nibble]
459 vpsrld ylens2, ylens1, 4
460 vpand ylens1, ylens1, ytmp
461 vbroadcasti128 ytmp, [match_cnt_perm]
462 vpbroadcastd ytmp2, [match_cnt_low_max]
463 vpshufb ylens1, ytmp, ylens1
464 vpshufb ylens2, ytmp, ylens2
465 vpcmpeqb ytmp, ylens1, ytmp2
466 vpand ylens2, ylens2, ytmp
467 vpaddd ylens1, ylens1, ylens2
468
469 ;; Preload for next loops
470 vmovdqu datas, datas_lookup
471 vmovdqu datas_lookup, [f_i + file_start + 2 * VECT_SIZE]
472
473 ;; Zero out matches which should not be taken
;; Rotate by one lane so lane i holds the result of position i-1; lane 0 is
;; filled from prev_len / prev_dist and the old lane 7 becomes the new
;; prev_len / prev_dist for the following iteration.
474 vmovdqu yrot_left, [drot_left]
475 vpermd ylens2, yrot_left, ylens1
476 vpermd ydists, yrot_left, ydists
477
478 vpinsrd ytmp %+ x, ylens2 %+ x, prev_len %+ d, 0
479 vmovd prev_len %+ d, ylens2 %+ x
480 vinserti128 ylens2, ylens2, ytmp %+ x, 0
481
482 vpinsrd ytmp %+ x, ydists %+ x, prev_dist %+ d, 0
483 vmovd prev_dist %+ d, ydists %+ x
484 vinserti128 ydists, ydists, ytmp %+ x, 0
485
;; ytmp = lanes that must be emitted as literals: the previous position's
;; match is not longer than MIN_DEF_MATCH, or the current position offers a
;; longer match.
486 vpbroadcastd ytmp, [shortest_matches]
487 vpcmpgtd ytmp, ylens2, ytmp
488 vpcmpgtd ytmp2, ylens1, ylens2
489
490 vpcmpeqd ytmp3, ytmp3, ytmp3
491 vpxor ytmp, ytmp, ytmp3
492 vpor ytmp, ytmp, ytmp2
493
494 vpandn ylens1, ytmp, ylens2
495
496 ;; Update zdists to match ylens1
;; Match lanes: shifted distance code + length + 254.
497 vpbroadcastd ytmp2, [twofiftyfour]
498 vpaddd ydists, ydists, ylens1
499 vpaddd ydists, ydists, ytmp2
500
;; Literal lanes: the input byte itself plus LIT.  f_i was already advanced,
;; so the bytes of the lagged positions start at f_i - VECT_SIZE - 1.
501 vpbroadcastd ynull_syms, [null_dist_syms]
502 vpmovzxbd ytmp3, [f_i + file_start - VECT_SIZE - 1]
503 vpaddd ytmp3, ynull_syms
504 vpand ytmp3, ytmp3, ytmp
505 vpandn ydists, ytmp, ydists
506 vpor ydists, ydists, ytmp3
507
508 ;;Store ydists
509 vmovdqu [matches_next], ydists
510 add matches_next, ICF_CODE_BYTES * VECT_SIZE
511
512 cmp f_i, f_i_end
513 jle .loop1
514
;; ---------------------------------------------------------------------------
;; Tail: finish the vector whose table entries were already gathered.  Same
;; computation as the loop body, but no further hash table look-ups are
;; issued and f_i is only advanced after the final store.
;; ---------------------------------------------------------------------------
515 .loop1_end:
516 lea next_in, [f_i + file_start]
517
518 ;; Calculate look back dists
519 vpbroadcastd ydist_mask, [rsp + dist_mask_offset]
520 vpaddd ydists, ydists_lookup, yones
521 vpsubd ydists, yindex, ydists
522 vpand ydists, ydists, ydist_mask
523 vpaddd ydists, ydists, yones
524 vpsubd ydists, yincrement, ydists
525
526 ;;lookup old codes
527 vextracti128 ydists2 %+ x, ydists, 1
528 vpcmpeqq ytmp, ytmp, ytmp
529 vpgatherdq ylens1, [next_in + ydists %+ x], ytmp
530 vpcmpeqq ytmp, ytmp, ytmp
531 vpgatherdq ylens2, [next_in + ydists2 %+ x], ytmp
532
533 ;; Restore last update hash value
;; tmp = f_i + offset of lane 7's candidate, i.e. the position that was in
;; the table before lane 7 overwrote it; it is written back under the hash of
;; the bytes at position f_i + VECT_SIZE - 1.
534 vpextrd tmp %+ d, ydists2 %+ x, 3
535 add tmp %+ d, f_i %+ d
536
537 vpbroadcastd yhash_prod %+ x, [hash_prod]
538 vpbroadcastd yhash_mask %+ x, [rsp + hash_mask_offset]
539
540 vmovd yhashes %+ x, dword [f_i + file_start + VECT_SIZE - 1]
541 vpmaddwd yhashes %+ x, yhashes %+ x, yhash_prod %+ x
542 vpmaddwd yhashes %+ x, yhashes %+ x, yhash_prod %+ x
543 vpand yhashes %+ x, yhashes %+ x, yhash_mask %+ x
544 vmovd hash %+ d, yhashes %+ x
545
546 mov word [hash_table + HASH_BYTES * hash], tmp %+ w
547
548 ;; Calculate dist_icf_code
549 vpaddd ydists, ydists, yones
550 vpsubd ydists, yincrement, ydists
551
552 vpbroadcastd ytmp2, [low_nibble]
553 vbroadcasti128 ytmp3, [nibble_order]
554 vpslld ydist_extra, ydists, 12
555 vpor ydist_extra, ydists, ydist_extra
556 vpand ydist_extra, ydist_extra, ytmp2
557 vpshufb ydist_extra, ydist_extra, ytmp3
558 vbroadcasti128 ytmp2, [bit_index]
559 vpshufb ydist_extra, ytmp2, ydist_extra
560 vpxor ytmp2, ytmp2, ytmp2
561 vpcmpgtb ytmp2, ydist_extra, ytmp2
562 vpsrld ytmp3, ytmp2, 8
563 vpandn ytmp2, ytmp3, ytmp2
564 vpsrld ytmp3, ytmp2, 16
565 vpandn ytmp2, ytmp3, ytmp2
566 vpsrld ytmp3, ytmp2, 24
567 vpandn ytmp2, ytmp3, ytmp2
568 vpbroadcastd ytmp3, [base_offset]
569 vpaddb ydist_extra, ytmp3
570 vpand ydist_extra, ydist_extra, ytmp2
571 vpsrlq ytmp2, ydist_extra, 32
572 vpxor ytmp3, ytmp3, ytmp3
573 vpsadbw ydist_extra, ydist_extra, ytmp3
574 vpsadbw ytmp2, ytmp2, ytmp3
575 vpsubd ydist_extra, ydist_extra, ytmp2
576 vpsllq ytmp2, ytmp2, 32
577 vpor ydist_extra, ydist_extra, ytmp2
578 vpcmpgtb ytmp3, ydist_extra, ytmp3
579 vpand ydist_extra, ydist_extra, ytmp3
580
581 vpsllvd ycode, yones, ydist_extra
582 vpsubd ycode, ycode, yones
583 vpcmpgtd ytmp2, ydists, yones
584 vpand ycode, ydists, ycode
585 vpand ycode, ycode, ytmp2
586 vpsrlvd ydists, ydists, ydist_extra
587 vpslld ydist_extra, ydist_extra, 1
588 vpaddd ydists, ydists, ydist_extra
589 vpslld ycode, ycode, EXTRA_BITS_OFFSET - DIST_OFFSET
590 vpaddd ydists, ydists, ycode
591
592 ;; Setup ydists for combining with ylens
593 vpslld ydists, ydists, DIST_OFFSET
594
595 ;; xor current data with lookback dist
596 vpxor ylens1, ylens1, ylookup
597 vpxor ylens2, ylens2, ylookup2
598
599 ;; Compute match length
600 vpxor ytmp, ytmp, ytmp
601 vpcmpeqb ylens1, ylens1, ytmp
602 vpcmpeqb ylens2, ylens2, ytmp
603 vpbroadcastq yshift_finish, [shift_finish]
604 vpand ylens1, ylens1, yshift_finish
605 vpand ylens2, ylens2, yshift_finish
606 vpsadbw ylens1, ylens1, ytmp
607 vpsadbw ylens2, ylens2, ytmp
608 vmovdqu ydownconvert_qd, [downconvert_qd]
609 vpshufb ylens1, ylens1, ydownconvert_qd
610 vextracti128 ytmp %+ x, ylens1, 1
611 vpor ylens1, ylens1, ytmp
612 vpshufb ylens2, ylens2, ydownconvert_qd
613 vextracti128 ytmp %+ x, ylens2, 1
614 vpor ylens2, ylens2, ytmp
615 vinserti128 ylens1, ylens1, ylens2 %+ x, 1
616 vpbroadcastd ytmp, [low_nibble]
617 vpsrld ylens2, ylens1, 4
618 vpand ylens1, ylens1, ytmp
619 vbroadcasti128 ytmp, [match_cnt_perm]
620 vpbroadcastd ytmp2, [match_cnt_low_max]
621 vpshufb ylens1, ytmp, ylens1
622 vpshufb ylens2, ytmp, ylens2
623 vpcmpeqb ytmp, ylens1, ytmp2
624 vpand ylens2, ylens2, ytmp
625 vpaddd ylens1, ylens1, ylens2
626
627 ;; Zero out matches which should not be taken
;; Unlike the loop body, the rotated-out lane 7 is not saved: no further
;; iteration consumes prev_len / prev_dist.
628 vmovdqu yrot_left, [drot_left]
629 vpermd ylens2, yrot_left, ylens1
630 vpermd ydists, yrot_left, ydists
631
632 vpinsrd ytmp %+ x, ylens2 %+ x, prev_len %+ d, 0
633 vinserti128 ylens2, ylens2, ytmp %+ x, 0
634
635 vpinsrd ytmp %+ x, ydists %+ x, prev_dist %+ d, 0
636 vinserti128 ydists, ydists, ytmp %+ x, 0
637
638 vpbroadcastd ytmp, [shortest_matches]
639 vpcmpgtd ytmp, ylens2, ytmp
640 vpcmpgtd ytmp2, ylens1, ylens2
641
642 vpcmpeqd ytmp3, ytmp3, ytmp3
643 vpxor ytmp, ytmp, ytmp3
644 vpor ytmp, ytmp, ytmp2
645
646 vpandn ylens1, ytmp, ylens2
647
648 ;; Update zdists to match ylens1
649 vpbroadcastd ytmp2, [twofiftyfour]
650 vpaddd ydists, ydists, ylens1
651 vpaddd ydists, ydists, ytmp2
652
;; f_i has not been advanced in the tail, so the lagged bytes start at f_i - 1.
653 vpbroadcastd ynull_syms, [null_dist_syms]
654 vpmovzxbd ytmp3, [f_i + file_start - 1]
655 vpaddd ytmp3, ynull_syms
656 vpand ytmp3, ytmp3, ytmp
657 vpandn ydists, ytmp, ydists
658 vpor ydists, ydists, ytmp3
659
660 ;;Store ydists
661 vmovdqu [matches_next], ydists
662 add f_i, VECT_SIZE
663
664 end_main:
;; Return value: f_i - f_i_orig - 1 (one less than the distance scanned,
;; because emitted codes lag the scan by one position).
665 sub f_i, f_i_orig
666 sub f_i, 1
667
668 %ifnidn f_i, rax
669 mov rax, f_i
670 %endif
;; The upper halves of the ymm registers are dirty after the AVX2 code above.
;; Clear them before returning so that non-VEX SSE code in the caller does
;; not run into AVX/SSE state-transition stalls.  Upper ymm halves are
;; volatile for the caller, and on win64 FUNC_RESTORE reloads xmm6-xmm15
;; afterwards, so this is invisible to callers.
vzeroupper
671 FUNC_RESTORE
672 ret
673
674 endproc_frame
675
;; ---------------------------------------------------------------------------
;; Constant tables used by gen_icf_map_lh1_04.  Nothing in this file writes to
;; them.  They are grouped by size, largest first, after a 32-byte alignment.
;; ---------------------------------------------------------------------------
676 section .data
677 align 32
678 ;; 32 byte data
;; vpermd index: dwords 1..4 of the source into both 128-bit lanes (feeds the
;; 8-byte windows of positions 4..7).
679 datas_perm2:
680 dd 0x1, 0x2, 0x3, 0x4, 0x1, 0x2, 0x3, 0x4
;; vpermd index: rotate by one lane (lane i <- lane i-1, lane 0 <- lane 7).
681 drot_left:
682 dd 0x7, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6
;; vpshufb mask: the 4-byte window starting at each byte offset 0..7
;; (both lanes index the same 16 source bytes).
683 datas_shuf:
684 db 0x0, 0x1, 0x2, 0x3
685 db 0x1, 0x2, 0x3, 0x4
686 db 0x2, 0x3, 0x4, 0x5
687 db 0x3, 0x4, 0x5, 0x6
688 db 0x4, 0x5, 0x6, 0x7
689 db 0x5, 0x6, 0x7, 0x8
690 db 0x6, 0x7, 0x8, 0x9
691 db 0x7, 0x8, 0x9, 0xa
;; vpshufb mask: the 8-byte window starting at each byte offset 0..3.
692 qword_shuf:
693 db 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
694 db 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8
695 db 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9
696 db 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa
;; Lane numbers 0..7, added to a broadcast position to get per-lane positions.
697 increment:
698 dd 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
;; vpshufb mask: keep the low byte of each qword as a dword and zero the rest
;; (0xff selects zero); the low lane fills dwords 0-1, the high lane dwords
;; 2-3, so OR-ing both lanes packs four values into 128 bits.
699 downconvert_qd:
700 db 0x00, 0xff, 0xff, 0xff, 0x08, 0xff, 0xff, 0xff
701 db 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
702 db 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
703 db 0x00, 0xff, 0xff, 0xff, 0x08, 0xff, 0xff, 0xff
704
705 ;; 16 byte data
;; Indexed by a nibble: number of consecutive set bits starting at bit 0.
706 match_cnt_perm:
707 db 0x0, 0x1, 0x0, 0x2, 0x0, 0x1, 0x0, 0x3, 0x0, 0x1, 0x0, 0x2, 0x0, 0x1, 0x0, 0x4
;; Indexed by a nibble: position of its highest set bit plus one (0 for 0).
708 bit_index:
709 db 0x0, 0x1, 0x2, 0x2, 0x3, 0x3, 0x3, 0x3
710 db 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4
;; vpshufb mask: swap bytes 1 and 2 inside every dword.
711 nibble_order:
712 db 0x0, 0x2, 0x1, 0x3, 0x4, 0x6, 0x5, 0x7
713 db 0x8, 0xa, 0x9, 0xb, 0xc, 0xe, 0xd, 0xf
714
715 ;; 8 byte data
;; One distinct bit per byte position; summing the selected bytes yields a
;; bitmask of the byte positions that compared equal.
716 shift_finish:
717 db 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
718
719 ;; 4 byte data
720 ones:
721 dd 0x1
;; The two 16-bit multipliers applied by vpmaddwd in the hash computation.
722 %define PROD1 0xE84B
723 %define PROD2 0x97B1
724 hash_prod:
725 dw PROD1, PROD2
;; Added to a raw input byte to form the code emitted for a literal.
726 null_dist_syms:
727 dd LIT
;; Added to (distance code + length) for lanes emitted as matches.
728 twofiftyfour:
729 dd 0xfe
;; A match is only kept when its length is greater than this value.
730 shortest_matches:
731 dd MIN_DEF_MATCH
;; Dword masks 0xffff0000 / 0x0000ffff used to merge a new 16-bit table entry
;; with the neighbouring entry that shares the same dword.
732 upper_word:
733 dw 0x0000, 0xffff
734 low_word:
735 dw 0xffff, 0x0000
736 low_nibble:
737 db 0x0f, 0x0f, 0x0f, 0x0f
;; Low-nibble run length that means "all four bits set, also count the high nibble".
738 match_cnt_low_max:
739 dd 0x4
;; Per-byte bias added to the nibble bit positions in the distance-code
;; computation (bytes -2, 2, 6, 10).
740 base_offset:
741 db -0x2, 0x2, 0x6, 0xa