]>
Commit | Line | Data |
---|---|---|
f91f0fd5 TL |
1 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
2 | ; Copyright(c) 2011-2018 Intel Corporation All rights reserved. | |
3 | ; | |
4 | ; Redistribution and use in source and binary forms, with or without | |
5 | ; modification, are permitted provided that the following conditions | |
6 | ; are met: | |
7 | ; * Redistributions of source code must retain the above copyright | |
8 | ; notice, this list of conditions and the following disclaimer. | |
9 | ; * Redistributions in binary form must reproduce the above copyright | |
10 | ; notice, this list of conditions and the following disclaimer in | |
11 | ; the documentation and/or other materials provided with the | |
12 | ; distribution. | |
13 | ; * Neither the name of Intel Corporation nor the names of its | |
14 | ; contributors may be used to endorse or promote products derived | |
15 | ; from this software without specific prior written permission. | |
16 | ; | |
17 | ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
18 | ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
19 | ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
20 | ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
21 | ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
22 | ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
23 | ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
24 | ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
25 | ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
26 | ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
27 | ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
28 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
29 | ||
30 | %include "reg_sizes.asm" | |
31 | %include "lz0a_const.asm" | |
32 | %include "data_struct2.asm" | |
33 | %include "huffman.asm" | |
34 | ||
35 | ||
;; Build configuration symbols.
;; NOTE(review): presumably tested by macros from the included files (for
;; example get_dist_icf_code) when they are expanded - confirm.
36 | %define USE_HSWNI | |
37 | %define ARCH 04 | |
38 | ||
;; Argument registers per calling convention: rcx/rdx/r8 when assembling for
;; win64, rdi/rsi/rdx for every other output format. hash and next_in are
;; placed on registers that do not carry arg1-arg3 in the same branch.
39 | %ifidn __OUTPUT_FORMAT__, win64 | |
40 | %define arg1 rcx | |
41 | %define arg2 rdx | |
42 | %define arg3 r8 | |
43 | %define hash rsi | |
44 | %define next_in rdi | |
45 | %else | |
46 | %define arg1 rdi | |
47 | %define arg2 rsi | |
48 | %define arg3 rdx | |
49 | %define hash r8 | |
50 | %define next_in rcx | |
51 | %endif | |
52 | ||
;; Roles of the three arguments. stream and level_buf intentionally name the
;; same register: the function copies the stream pointer to tmp and then
;; reloads arg1 with the level buffer pointer.
53 | %define stream arg1 | |
54 | %define level_buf arg1 | |
55 | %define matches_next arg2 | |
56 | %define f_i_end arg3 | |
57 | ||
;; Scalar working registers. rbp, r12 and r13 are saved and restored by
;; FUNC_SAVE / FUNC_RESTORE.
58 | %define f_i rax | |
59 | %define file_start rbp | |
60 | %define tmp r9 | |
61 | %define tmp2 r10 | |
62 | %define prev_len r11 | |
63 | %define prev_dist r12 | |
64 | %define f_i_orig r13 | |
65 | ||
;; Address expression of the hash table inside the level buffer. Only
;; meaningful once arg1 holds the level buffer pointer.
66 | %define hash_table level_buf + _hash_map_hash_table | |
67 | ||
;; Vector register roles. Several names deliberately alias one register
;; (ymm5, ymm11 and ymm15 each carry multiple names below), so a value kept
;; under one alias is lost as soon as any sibling alias is written. The code
;; re-loads such values (for example yhash_prod and yhash_mask) before use.
68 | %define datas ymm0 | |
69 | %define datas_lookup ymm1 | |
70 | %define yhashes ymm2 | |
71 | %define ydists ymm3 | |
72 | %define ydists_lookup ymm4 | |
73 | ||
;; Aliases of ymm5.
74 | %define ydownconvert_qd ymm5 | |
75 | %define ydists2 ymm5 | |
76 | %define yscatter ymm5 | |
77 | %define ytmp2 ymm5 | |
78 | %define ynull_syms ymm5 | |
79 | ||
80 | %define ylens1 ymm6 | |
81 | %define ylens2 ymm7 | |
82 | %define ylookup ymm8 | |
83 | %define ylookup2 ymm9 | |
84 | %define yindex ymm10 | |
85 | ||
;; Aliases of ymm11.
86 | %define yrot_left ymm11 | |
87 | %define yshift_finish ymm11 | |
88 | %define yqword_shuf ymm11 | |
89 | %define yhash_prod ymm11 | |
90 | %define ycode ymm11 | |
91 | %define ytmp3 ymm11 | |
92 | ||
;; Loaded once in the function prologue and kept for the whole call.
93 | %define yones ymm12 | |
94 | %define ydatas_perm2 ymm13 | |
95 | %define yincrement ymm14 | |
96 | ||
;; Aliases of ymm15.
97 | %define ytmp ymm15 | |
98 | %define ydist_extra ymm15 | |
99 | %define yhash_mask ymm15 | |
100 | %define ydist_mask ymm15 | |
101 | ||
%ifidn __OUTPUT_FORMAT__, win64
;; win64 frame, as offsets from rsp once the frame is allocated:
;;   [0, 10*16)                     save area for xmm6-xmm15
;;   [10*16, 10*16 + 5*8)           save area for rsi, rdi, rbp, r12, r13
;;   [stack_size - 16, stack_size)  the function's two 8-byte local slots
%define stack_size  10*16 + 6 * 8 + 3 * 8
%define local_storage_offset (stack_size - 16)
%define func(x) proc_frame x

;; FUNC_SAVE (win64): allocate the frame and save every register used by this
;; file that the Microsoft x64 convention treats as callee-saved.
;; stack_size is 232 bytes; with rsp congruent to 8 mod 16 at function entry
;; the frame base is 16-byte aligned, which the aligned moves here and in
;; FUNC_RESTORE rely on.
;; NOTE(review): alloc_stack, save_reg and end_prolog are not defined in this
;; file; presumably they are the win64 unwind-information macros - confirm.
%macro FUNC_SAVE 0
        alloc_stack     stack_size
        vmovdqa [rsp + 0*16], xmm6
        vmovdqa [rsp + 1*16], xmm7
        vmovdqa [rsp + 2*16], xmm8
        vmovdqa [rsp + 3*16], xmm9
        vmovdqa [rsp + 4*16], xmm10
        vmovdqa [rsp + 5*16], xmm11
        vmovdqa [rsp + 6*16], xmm12
        vmovdqa [rsp + 7*16], xmm13
        ;; Aligned store like its neighbours and like the matching restore.
        vmovdqa [rsp + 8*16], xmm14
        vmovdqa [rsp + 9*16], xmm15
        save_reg        rsi, 10*16 + 0*8
        save_reg        rdi, 10*16 + 1*8
        save_reg        rbp, 10*16 + 2*8
        save_reg        r12, 10*16 + 3*8
        save_reg        r13, 10*16 + 4*8
        end_prolog
%endm

;; FUNC_RESTORE (win64): reload everything FUNC_SAVE stored, then release the
;; frame. Must be followed directly by ret.
%macro FUNC_RESTORE 0
        vmovdqa xmm6, [rsp + 0*16]
        vmovdqa xmm7, [rsp + 1*16]
        vmovdqa xmm8, [rsp + 2*16]
        vmovdqa xmm9, [rsp + 3*16]
        vmovdqa xmm10, [rsp + 4*16]
        vmovdqa xmm11, [rsp + 5*16]
        vmovdqa xmm12, [rsp + 6*16]
        vmovdqa xmm13, [rsp + 7*16]
        vmovdqa xmm14, [rsp + 8*16]
        vmovdqa xmm15, [rsp + 9*16]

        mov     rsi, [rsp + 10*16 + 0*8]
        mov     rdi, [rsp + 10*16 + 1*8]
        mov     rbp, [rsp + 10*16 + 2*8]
        mov     r12, [rsp + 10*16 + 3*8]
        mov     r13, [rsp + 10*16 + 4*8]
        add     rsp, stack_size
%endm
%else
;; Every other output format: the frame holds only the two 8-byte local
;; slots, starting at rsp.
%define stack_size  16
%define local_storage_offset 0

%define func(x) x:
;; FUNC_SAVE: push the callee-saved registers this file uses (rbp, r12, r13)
;; and reserve the local slots.
%macro FUNC_SAVE 0
        push    rbp
        push    r12
        push    r13
        sub     rsp, stack_size
%endm

;; FUNC_RESTORE: exact mirror of FUNC_SAVE, in reverse order.
%macro FUNC_RESTORE 0
        add     rsp, stack_size
        pop     r13
        pop     r12
        pop     rbp
%endm
%endif
165 | ||
;; rsp-relative offsets of the two 8-byte local slots that hold the distance
;; mask and the hash mask.
166 | %define dist_mask_offset local_storage_offset | |
167 | %define hash_mask_offset local_storage_offset + 8 | |
168 | ||
;; VECT_SIZE:  input positions handled per vector batch (one per dword lane).
;; HASH_BYTES: size in bytes of one hash table entry.
169 | %define VECT_SIZE 8 | |
170 | %define HASH_BYTES 2 | |
171 | ||
;; gen_icf_map_lh1_04
;;   arg1 = stream        pointer; _next_in, _total_in, _level_buf and the
;;                        _internal_state_* fields are read through it
;;   arg2 = matches_next  output buffer, receives one code per input position
;;   arg3 = length        number of input bytes offered; converted below into
;;                        an end index
;;   returns rax = (final f_i) - (f_i at entry) - 1
;; Uses AVX2 gathers and updates the 16-bit hash table in the level buffer.
172 | global gen_icf_map_lh1_04 | |
173 | func(gen_icf_map_lh1_04) | |
174 | FUNC_SAVE | |
175 | ||
;; f_i is the running input index (taken from _total_in as a dword).
;; file_start = next_in - f_i, so [f_i + file_start] addresses the input byte
;; with index f_i.
176 | mov file_start, [stream + _next_in] | |
177 | mov f_i %+ d, dword [stream + _total_in] | |
178 | mov f_i_orig, f_i | |
179 | ||
180 | sub file_start, f_i | |
;; Turn the length in arg3 into an end index; leave at once when the length
;; is not positive (signed compare).
181 | add f_i_end, f_i | |
182 | cmp f_i, f_i_end | |
183 | jge end_main | |
184 | ||
185 | ;; Prep for main loop | |
;; Both masks are read as dwords (the 32-bit write zero-extends tmp) and
;; parked in the stack slots, from where they are re-broadcast whenever the
;; shared ymm15 has been reused.
186 | mov tmp %+ d, dword [stream + _internal_state_dist_mask] | |
187 | mov [rsp + dist_mask_offset], tmp | |
188 | mov tmp %+ d, dword [stream + _internal_state_hash_mask] | |
189 | mov [rsp + hash_mask_offset], tmp | |
;; Keep the stream pointer in tmp: arg1 becomes level_buf on the next line.
190 | mov tmp, stream | |
191 | mov level_buf, [stream + _level_buf] | |
;; NOTE(review): LA comes from an include; presumably the look-ahead margin
;; that keeps the wide loads below inside the input - confirm.
192 | sub f_i_end, LA | |
;; Constants that stay resident for the whole call.
193 | vmovdqu yincrement, [increment] | |
194 | vpbroadcastd yones, [ones] | |
195 | vmovdqu ydatas_perm2, [datas_perm2] | |
196 | ||
197 | ;; Process first byte | |
;; Hash the 4 bytes at f_i: two vpmaddwd passes with hash_prod, then the hash
;; mask. The result is moved to the scalar register hash.
198 | vpbroadcastd yhash_prod, [hash_prod] | |
199 | vpbroadcastd yhash_mask, [rsp + hash_mask_offset] | |
200 | vmovd yhashes %+ x, dword [f_i + file_start] | |
201 | vpmaddwd yhashes, yhashes, yhash_prod | |
202 | vpmaddwd yhashes, yhashes, yhash_prod | |
203 | vpand yhashes, yhashes, yhash_mask | |
204 | vmovd hash %+ d, yhashes %+ x | |
;; tmp still holds the stream pointer here.
205 | cmp byte [tmp + _internal_state_has_hist], IGZIP_NO_HIST | |
206 | jne .has_hist | |
207 | ;; No history, the byte is a literal | |
;; Nothing is carried into the first batch, and the stream is marked as
;; having history from now on.
208 | xor prev_len, prev_len | |
209 | xor prev_dist, prev_dist | |
210 | mov byte [tmp + _internal_state_has_hist], IGZIP_HIST | |
211 | jmp .byte_processed | |
212 | ||
;; .has_hist: the first position when earlier input exists. Derives prev_len
;; and prev_dist for the position at f_i from the hash table entry selected by
;; hash, using scalar code.
213 | .has_hist: | |
214 | ;; History exists, need to set prev_len and prev_dist accordingly | |
215 | lea next_in, [f_i + file_start] | |
216 | ||
217 | ;; Determine match lookback distance | |
;; tmp = (low16(f_i) - 1 - table[hash]) & dist_mask, which is distance - 1.
;; The subtraction is done on 16 bits; the 32-bit and that follows also
;; clears the upper half of tmp.
218 | xor tmp, tmp | |
219 | mov tmp %+ w, f_i %+ w | |
220 | dec tmp | |
221 | sub tmp %+ w, word [hash_table + HASH_BYTES * hash] | |
222 | ||
223 | and tmp %+ d, [rsp + dist_mask_offset] | |
224 | neg tmp | |
225 | ||
226 | ;; Check first 8 bytes of match | |
;; With tmp negated, [next_in + tmp - 1] is the candidate (distance bytes
;; back). The xor leaves a zero byte wherever the two 8-byte strings agree.
;; tmp is then negated back to distance - 1.
227 | mov prev_len, [next_in] | |
228 | xor prev_len, [next_in + tmp - 1] | |
229 | neg tmp | |
230 | ||
231 | ;; Set prev_dist | |
;; When arg1 is rcx it is parked in tmp2 around the macro.
;; NOTE(review): presumably because get_dist_icf_code clobbers rcx - confirm
;; in the macro's definition.
232 | %ifidn arg1, rcx | |
233 | mov tmp2, rcx | |
234 | %endif | |
235 | ;; The third register is unused on Haswell and later, | |
236 | ;; This line will not work on previous architectures | |
237 | get_dist_icf_code tmp, prev_dist, tmp | |
238 | ||
239 | %ifidn arg1, rcx | |
240 | mov rcx, tmp2 | |
241 | %endif | |
242 | ||
243 | ;; Set prev_len | |
;; tzcnt / 8 is the number of leading equal bytes (8 when all bytes agree,
;; because tzcnt of zero is 64). A length below MIN_DEF_MATCH is replaced by
;; zero; tmp2 is cleared first to serve as the cmov source.
244 | xor tmp2, tmp2 | |
245 | tzcnt prev_len, prev_len | |
246 | shr prev_len, 3 | |
247 | cmp prev_len, MIN_DEF_MATCH | |
248 | cmovl prev_len, tmp2 | |
249 | ||
;; .byte_processed: record the first position in the hash table, then prime
;; the software pipeline used by .loop1:
;;   - ydists_lookup / yindex / ylookup / ylookup2 describe the batch at f_i,
;;   - yhashes holds the hashes of the batch at f_i + VECT_SIZE,
;;   - datas and datas_lookup hold the input at f_i + VECT_SIZE and
;;     f_i + 2 * VECT_SIZE.
250 | .byte_processed: | |
251 | mov word [hash_table + HASH_BYTES * hash], f_i %+ w | |
252 | ||
253 | add f_i, 1 | |
254 | ||
255 | ;;hash | |
;; vpermq 0x44 copies the low 16 bytes into both halves; datas_shuf then
;; gives lane i the 4 bytes starting at offset i, which are hashed like the
;; first byte (two vpmaddwd passes, then the mask).
256 | vmovdqu datas, [f_i + file_start] | |
257 | vpermq yhashes, datas, 0x44 | |
258 | vpshufb yhashes, yhashes, [datas_shuf] | |
259 | vpmaddwd yhashes, yhashes, yhash_prod | |
260 | vpmaddwd yhashes, yhashes, yhash_prod | |
261 | vpand yhashes, yhashes, yhash_mask | |
262 | ||
;; ylookup / ylookup2 := the 8-byte strings starting at offsets 0-3 / 4-7 of
;; datas, one per qword. ydatas_perm2 moves dwords 1-4 into both halves so
;; the same qword_shuf pattern yields offsets 4-7.
263 | vpermq ylookup, datas, 0x44 | |
264 | vmovdqu yqword_shuf, [qword_shuf] | |
265 | vpshufb ylookup, ylookup, yqword_shuf | |
266 | vpermd ylookup2, ydatas_perm2, datas | |
267 | vpshufb ylookup2, ylookup2, yqword_shuf | |
268 | ||
269 | ;;gather/scatter hashes | |
;; All-ones mask: every lane is gathered. Each lane reads a dword at its
;; hashed slot: the low word is the slot, the high word is the next entry.
270 | vpcmpeqq ytmp, ytmp, ytmp | |
271 | vpgatherdd ydists_lookup, [hash_table + HASH_BYTES * yhashes], ytmp | |
272 | ||
;; yindex = f_i + lane number. yscatter = gathered high word | low 16 bits of
;; yindex, so the dword stores below update the slot and write the
;; neighbouring word back as it was read.
273 | vpbroadcastd ytmp2, [upper_word] | |
274 | vpbroadcastd ytmp, [low_word] | |
275 | vmovd yindex %+ x, f_i %+ d | |
276 | vpbroadcastd yindex, yindex %+ x | |
277 | vpaddd yindex, yindex, yincrement | |
278 | vpand yscatter, ydists_lookup, ytmp2 | |
279 | vpand ytmp, yindex, ytmp | |
280 | vpor yscatter, yscatter, ytmp | |
281 | ||
;; Manual scatter, one lane at a time: lanes 0-3 here, lanes 4-7 after the
;; high halves are extracted (which overwrites yhashes and yscatter).
282 | vmovd tmp %+ d, yhashes %+ x | |
283 | vmovd [hash_table + HASH_BYTES * tmp], yscatter %+ x | |
284 | vpextrd tmp %+ d, yhashes %+ x, 1 | |
285 | vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 1 | |
286 | vpextrd tmp %+ d, yhashes %+ x, 2 | |
287 | vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 2 | |
288 | vpextrd tmp %+ d,yhashes %+ x, 3 | |
289 | vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 3 | |
290 | ||
291 | vextracti128 yscatter %+ x, yscatter, 1 | |
292 | vextracti128 yhashes %+ x, yhashes, 1 | |
293 | ||
294 | vmovd tmp %+ d, yhashes %+ x | |
295 | vmovd [hash_table + HASH_BYTES * tmp], yscatter %+ x | |
296 | vpextrd tmp %+ d, yhashes %+ x, 1 | |
297 | vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 1 | |
298 | vpextrd tmp %+ d, yhashes %+ x, 2 | |
299 | vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 2 | |
300 | vpextrd tmp %+ d,yhashes %+ x, 3 | |
301 | vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 3 | |
302 | ||
303 | ;; Compute hash for next loop | |
;; yhash_prod (ymm11) and yhash_mask (ymm15) were overwritten under other
;; aliases above, so both are broadcast again. datas now holds the batch at
;; f_i + VECT_SIZE.
304 | vpbroadcastd yhash_prod, [hash_prod] | |
305 | vpbroadcastd yhash_mask, [rsp + hash_mask_offset] | |
306 | vmovdqu datas, [f_i + file_start + VECT_SIZE] | |
307 | vpermq yhashes, datas, 0x44 | |
308 | vpshufb yhashes, yhashes, [datas_shuf] | |
309 | vpmaddwd yhashes, yhashes, yhash_prod | |
310 | vpmaddwd yhashes, yhashes, yhash_prod | |
311 | vpand yhashes, yhashes, yhash_mask | |
312 | ||
313 | vmovdqu datas_lookup, [f_i + file_start + 2 * VECT_SIZE] | |
314 | ||
;; Reserve one batch for the tail code and skip the loop when no full pass
;; fits (signed compare).
315 | sub f_i_end, VECT_SIZE | |
316 | cmp f_i, f_i_end | |
317 | jg .loop1_end | |
318 | ||
;; .loop1: main loop, one batch of VECT_SIZE positions per pass. A pass
;; finishes the batch whose table lookups (ydists_lookup, yindex) and data
;; strings (ylookup, ylookup2) were prepared earlier, issues the table
;; lookup/update for the next batch, and hashes the batch after that.
;; The codes stored at the end of a pass are shifted by one position: lane 0
;; carries the last position of the previous batch (prev_len / prev_dist).
319 | .loop1: | |
320 | lea next_in, [f_i + file_start] | |
321 | ||
322 | ;; Calculate look back dists | |
;; Per lane i: dist = ((index - 1 - table entry) & dist_mask) + 1 and
;; ydists = i - dist, the candidate's byte offset relative to next_in.
323 | vpbroadcastd ydist_mask, [rsp + dist_mask_offset] | |
324 | vpaddd ydists, ydists_lookup, yones | |
325 | vpsubd ydists, yindex, ydists | |
326 | vpand ydists, ydists, ydist_mask | |
327 | vpaddd ydists, ydists, yones | |
328 | vpsubd ydists, yincrement, ydists | |
329 | ||
330 | ;;gather/scatter hashes | |
;; From here on f_i is the index of the next batch; next_in keeps pointing at
;; the batch being finished.
331 | add f_i, VECT_SIZE | |
332 | ||
;; Look up the table for the next batch (all lanes enabled). Low word of each
;; gathered dword is the slot, high word is the following entry.
333 | vpcmpeqq ytmp, ytmp, ytmp | |
334 | vpgatherdd ydists_lookup, [hash_table + HASH_BYTES * yhashes], ytmp | |
335 | ||
;; yindex = f_i + lane number; yscatter = gathered high word | low 16 bits of
;; yindex.
336 | vpbroadcastd ytmp2, [upper_word] | |
337 | vpbroadcastd ytmp, [low_word] | |
338 | vmovd yindex %+ x, f_i %+ d | |
339 | vpbroadcastd yindex, yindex %+ x | |
340 | vpaddd yindex, yindex, yincrement | |
341 | vpand yscatter, ydists_lookup, ytmp2 | |
342 | vpand ytmp, yindex, ytmp | |
343 | vpor yscatter, yscatter, ytmp | |
344 | ||
;; Manual scatter into the hash table: lanes 0-3, then lanes 4-7 after the
;; high halves are extracted (yhashes and yscatter are consumed by this).
345 | vmovd tmp %+ d, yhashes %+ x | |
346 | vmovd [hash_table + HASH_BYTES * tmp], yscatter %+ x | |
347 | vpextrd tmp %+ d, yhashes %+ x, 1 | |
348 | vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 1 | |
349 | vpextrd tmp %+ d, yhashes %+ x, 2 | |
350 | vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 2 | |
351 | vpextrd tmp %+ d,yhashes %+ x, 3 | |
352 | vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 3 | |
353 | ||
354 | vextracti128 yscatter %+ x, yscatter, 1 | |
355 | vextracti128 yhashes %+ x, yhashes, 1 | |
356 | ||
357 | vmovd tmp %+ d, yhashes %+ x | |
358 | vmovd [hash_table + HASH_BYTES * tmp], yscatter %+ x | |
359 | vpextrd tmp %+ d, yhashes %+ x, 1 | |
360 | vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 1 | |
361 | vpextrd tmp %+ d, yhashes %+ x, 2 | |
362 | vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 2 | |
363 | vpextrd tmp %+ d,yhashes %+ x, 3 | |
364 | vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 3 | |
365 | ||
366 | ;; Compute hash for next loop | |
;; datas_lookup holds the input one batch beyond the new f_i; its hashes are
;; consumed by the gather of the following pass. The multiplier and mask are
;; broadcast again because ymm11 and ymm15 are reused under other aliases.
367 | vpbroadcastd yhash_prod, [hash_prod] | |
368 | vpbroadcastd yhash_mask, [rsp + hash_mask_offset] | |
369 | vpermq yhashes, datas_lookup, 0x44 | |
370 | vpshufb yhashes, yhashes, [datas_shuf] | |
371 | vpmaddwd yhashes, yhashes, yhash_prod | |
372 | vpmaddwd yhashes, yhashes, yhash_prod | |
373 | vpand yhashes, yhashes, yhash_mask | |
374 | ||
375 | ;;lookup old codes | |
;; Fetch the 8 bytes at every candidate position: lanes 0-3 through the low
;; half of ydists, lanes 4-7 through ydists2. A gather clears its mask, so
;; ytmp is set to all ones before each one.
376 | vextracti128 ydists2 %+ x, ydists, 1 | |
377 | ||
378 | vpcmpeqq ytmp, ytmp, ytmp | |
379 | vpgatherdq ylens1, [next_in + ydists %+ x], ytmp | |
380 | vpcmpeqq ytmp, ytmp, ytmp | |
381 | vpgatherdq ylens2, [next_in + ydists2 %+ x], ytmp | |
382 | ||
383 | ;; Calculate dist_icf_code | |
;; ydists = i - (i - dist) - 1 = dist - 1.
384 | vpaddd ydists, ydists, yones | |
385 | vpsubd ydists, yincrement, ydists | |
386 | ||
;; ydist_extra := max(0, bit_length(dist - 1) - 2), built nibble by nibble:
;;   - spread the low 16 bits over four bytes, one nibble each (nibble_order
;;     puts them in ascending order),
;;   - bit_index maps every nibble to its bit length,
;;   - the three shift/andn steps keep only the most significant non-zero
;;     nibble,
;;   - base_offset adds 4 * nibble position - 2,
;;   - vpsadbw sums the surviving byte per dword (high dwords are handled
;;     through a shifted copy), and negative results are cleared.
387 | vpbroadcastd ytmp2, [low_nibble] | |
388 | vbroadcasti128 ytmp3, [nibble_order] | |
389 | vpslld ydist_extra, ydists, 12 | |
390 | vpor ydist_extra, ydists, ydist_extra | |
391 | vpand ydist_extra, ydist_extra, ytmp2 | |
392 | vpshufb ydist_extra, ydist_extra, ytmp3 | |
393 | vbroadcasti128 ytmp2, [bit_index] | |
394 | vpshufb ydist_extra, ytmp2, ydist_extra | |
395 | vpxor ytmp2, ytmp2, ytmp2 | |
396 | vpcmpgtb ytmp2, ydist_extra, ytmp2 | |
397 | vpsrld ytmp3, ytmp2, 8 | |
398 | vpandn ytmp2, ytmp3, ytmp2 | |
399 | vpsrld ytmp3, ytmp2, 16 | |
400 | vpandn ytmp2, ytmp3, ytmp2 | |
401 | vpsrld ytmp3, ytmp2, 24 | |
402 | vpandn ytmp2, ytmp3, ytmp2 | |
403 | vpbroadcastd ytmp3, [base_offset] | |
404 | vpaddb ydist_extra, ytmp3 | |
405 | vpand ydist_extra, ydist_extra, ytmp2 | |
406 | vpsrlq ytmp2, ydist_extra, 32 | |
407 | vpxor ytmp3, ytmp3, ytmp3 | |
408 | vpsadbw ydist_extra, ydist_extra, ytmp3 | |
409 | vpsadbw ytmp2, ytmp2, ytmp3 | |
410 | vpsubd ydist_extra, ydist_extra, ytmp2 | |
411 | vpsllq ytmp2, ytmp2, 32 | |
412 | vpor ydist_extra, ydist_extra, ytmp2 | |
413 | vpcmpgtb ytmp3, ydist_extra, ytmp3 | |
414 | vpand ydist_extra, ydist_extra, ytmp3 | |
415 | ||
;; Split dist - 1 into a symbol and its extra bits:
;;   extra value = (dist - 1) & ((1 << ydist_extra) - 1), zero unless
;;                 dist - 1 > 1
;;   symbol      = ((dist - 1) >> ydist_extra) + 2 * ydist_extra
;; The extra value is placed EXTRA_BITS_OFFSET - DIST_OFFSET bits up and added
;; to the symbol.
416 | vpsllvd ycode, yones, ydist_extra | |
417 | vpsubd ycode, ycode, yones | |
418 | vpcmpgtd ytmp2, ydists, yones | |
419 | vpand ycode, ydists, ycode | |
420 | vpand ycode, ycode, ytmp2 | |
421 | vpsrlvd ydists, ydists, ydist_extra | |
422 | vpslld ydist_extra, ydist_extra, 1 | |
423 | vpaddd ydists, ydists, ydist_extra | |
424 | vpslld ycode, ycode, EXTRA_BITS_OFFSET - DIST_OFFSET | |
425 | vpaddd ydists, ydists, ycode | |
426 | ||
427 | ;; Setup ydists for combining with ylens | |
428 | vpslld ydists, ydists, DIST_OFFSET | |
429 | ||
430 | ;; xor current data with lookback dist | |
;; A byte becomes zero where the current string and the candidate agree.
431 | vpxor ylens1, ylens1, ylookup | |
432 | vpxor ylens2, ylens2, ylookup2 | |
433 | ||
434 | ;; Setup registers for next loop | |
;; datas holds the next batch: rebuild the 8-byte strings at offsets 0-3
;; (ylookup) and 4-7 (ylookup2) for the next pass.
435 | vpermq ylookup, datas, 0x44 | |
436 | vmovdqu yqword_shuf, [qword_shuf] | |
437 | vpshufb ylookup, ylookup, yqword_shuf | |
438 | vpermd ylookup2, ydatas_perm2, datas | |
439 | vpshufb ylookup2, ylookup2, yqword_shuf | |
440 | ||
441 | ;; Compute match length | |
;; Turn each 8-byte xor result into a bit mask (bit k set when byte k
;; matched): vpcmpeqb flags the zero bytes, shift_finish gives byte k the
;; weight 1 << k, vpsadbw adds the bytes of each qword.
442 | vpxor ytmp, ytmp, ytmp | |
443 | vpcmpeqb ylens1, ylens1, ytmp | |
444 | vpcmpeqb ylens2, ylens2, ytmp | |
445 | vpbroadcastq yshift_finish, [shift_finish] | |
446 | vpand ylens1, ylens1, yshift_finish | |
447 | vpand ylens2, ylens2, yshift_finish | |
448 | vpsadbw ylens1, ylens1, ytmp | |
449 | vpsadbw ylens2, ylens2, ytmp | |
;; Compact the four qword masks of each register into four dwords, then join
;; both registers: ylens1 ends up with one mask per lane.
450 | vmovdqu ydownconvert_qd, [downconvert_qd] | |
451 | vpshufb ylens1, ylens1, ydownconvert_qd | |
452 | vextracti128 ytmp %+ x, ylens1, 1 | |
453 | vpor ylens1, ylens1, ytmp | |
454 | vpshufb ylens2, ylens2, ydownconvert_qd | |
455 | vextracti128 ytmp %+ x, ylens2, 1 | |
456 | vpor ylens2, ylens2, ytmp | |
457 | vinserti128 ylens1, ylens1, ylens2 %+ x, 1 | |
;; Match length = number of consecutive set bits from bit 0. match_cnt_perm
;; gives that count for one nibble; the high nibble's count is added only
;; when the low nibble's count equals match_cnt_low_max (4).
458 | vpbroadcastd ytmp, [low_nibble] | |
459 | vpsrld ylens2, ylens1, 4 | |
460 | vpand ylens1, ylens1, ytmp | |
461 | vbroadcasti128 ytmp, [match_cnt_perm] | |
462 | vpbroadcastd ytmp2, [match_cnt_low_max] | |
463 | vpshufb ylens1, ytmp, ylens1 | |
464 | vpshufb ylens2, ytmp, ylens2 | |
465 | vpcmpeqb ytmp, ylens1, ytmp2 | |
466 | vpand ylens2, ylens2, ytmp | |
467 | vpaddd ylens1, ylens1, ylens2 | |
468 | ||
469 | ;; Preload for next loops | |
470 | vmovdqu datas, datas_lookup | |
471 | vmovdqu datas_lookup, [f_i + file_start + 2 * VECT_SIZE] | |
472 | ||
473 | ;; Zero out matches which should not be taken | |
;; Rotate lengths and codes up by one lane. Lane 0 then receives the value
;; carried in prev_len / prev_dist, and the value rotated out of lane 7 is
;; saved there for the next pass. After this, lane i of ylens2 / ydists
;; describes the position one byte before lane i of ylens1.
474 | vmovdqu yrot_left, [drot_left] | |
475 | vpermd ylens2, yrot_left, ylens1 | |
476 | vpermd ydists, yrot_left, ydists | |
477 | ||
478 | vpinsrd ytmp %+ x, ylens2 %+ x, prev_len %+ d, 0 | |
479 | vmovd prev_len %+ d, ylens2 %+ x | |
480 | vinserti128 ylens2, ylens2, ytmp %+ x, 0 | |
481 | ||
482 | vpinsrd ytmp %+ x, ydists %+ x, prev_dist %+ d, 0 | |
483 | vmovd prev_dist %+ d, ydists %+ x | |
484 | vinserti128 ydists, ydists, ytmp %+ x, 0 | |
485 | ||
;; ytmp := lanes emitted as literals: the length is not greater than the
;; shortest_matches constant, or the following position (ylens1) has a longer
;; match. ylens1 := the length where the match is kept, zero elsewhere.
486 | vpbroadcastd ytmp, [shortest_matches] | |
487 | vpcmpgtd ytmp, ylens2, ytmp | |
488 | vpcmpgtd ytmp2, ylens1, ylens2 | |
489 | ||
490 | vpcmpeqd ytmp3, ytmp3, ytmp3 | |
491 | vpxor ytmp, ytmp, ytmp3 | |
492 | vpor ytmp, ytmp, ytmp2 | |
493 | ||
494 | vpandn ylens1, ytmp, ylens2 | |
495 | ||
496 | ;; Update zdists to match ylens1 | |
;; The register updated here is ydists: kept matches get the length plus the
;; constant 254 added to the shifted distance code.
497 | vpbroadcastd ytmp2, [twofiftyfour] | |
498 | vpaddd ydists, ydists, ylens1 | |
499 | vpaddd ydists, ydists, ytmp2 | |
500 | ||
;; Literal lanes instead get the input byte of their position plus
;; null_dist_syms. f_i was already advanced, hence - VECT_SIZE - 1 for the
;; position of lane 0.
501 | vpbroadcastd ynull_syms, [null_dist_syms] | |
502 | vpmovzxbd ytmp3, [f_i + file_start - VECT_SIZE - 1] | |
503 | vpaddd ytmp3, ynull_syms | |
504 | vpand ytmp3, ytmp3, ytmp | |
505 | vpandn ydists, ytmp, ydists | |
506 | vpor ydists, ydists, ytmp3 | |
507 | ||
508 | ;;Store ydists | |
;; One dword code per lane is written.
;; NOTE(review): the store is 32 bytes while the pointer advances by
;; ICF_CODE_BYTES * VECT_SIZE; presumably ICF_CODE_BYTES is 4 - confirm.
509 | vmovdqu [matches_next], ydists | |
510 | add matches_next, ICF_CODE_BYTES * VECT_SIZE | |
511 | ||
512 | cmp f_i, f_i_end | |
513 | jle .loop1 | |
514 | ||
;; .loop1_end: finish the last prepared batch. Same computation as one pass of
;; .loop1, except that no further batch is looked up, hashed or loaded, and
;; prev_len / prev_dist are consumed but not updated. f_i is advanced only
;; after the store, so the literal bytes are read from f_i - 1.
515 | .loop1_end: | |
516 | lea next_in, [f_i + file_start] | |
517 | ||
518 | ;; Calculate look back dists | |
;; Per lane i: dist = ((index - 1 - table entry) & dist_mask) + 1 and
;; ydists = i - dist, the candidate's byte offset relative to next_in.
519 | vpbroadcastd ydist_mask, [rsp + dist_mask_offset] | |
520 | vpaddd ydists, ydists_lookup, yones | |
521 | vpsubd ydists, yindex, ydists | |
522 | vpand ydists, ydists, ydist_mask | |
523 | vpaddd ydists, ydists, yones | |
524 | vpsubd ydists, yincrement, ydists | |
525 | ||
526 | ;;lookup old codes | |
;; Fetch the 8 bytes at every candidate position (lanes 0-3, then 4-7); the
;; mask is reset before each gather because a gather clears it.
527 | vextracti128 ydists2 %+ x, ydists, 1 | |
528 | vpcmpeqq ytmp, ytmp, ytmp | |
529 | vpgatherdq ylens1, [next_in + ydists %+ x], ytmp | |
530 | vpcmpeqq ytmp, ytmp, ytmp | |
531 | vpgatherdq ylens2, [next_in + ydists2 %+ x], ytmp | |
532 | ||
533 | ;; Restore last update hash value | |
;; tmp = f_i + (7 - dist of lane 7): the index the table held for the last
;; position of this batch before the scatter replaced it. The 4 bytes at that
;; position (f_i + VECT_SIZE - 1) are hashed again with 128-bit operations and
;; the table entry is set back to the low 16 bits of tmp.
;; NOTE(review): presumably because that position is not counted in the
;; return value and is handled again by the next call - confirm with caller.
534 | vpextrd tmp %+ d, ydists2 %+ x, 3 | |
535 | add tmp %+ d, f_i %+ d | |
536 | ||
537 | vpbroadcastd yhash_prod %+ x, [hash_prod] | |
538 | vpbroadcastd yhash_mask %+ x, [rsp + hash_mask_offset] | |
539 | ||
540 | vmovd yhashes %+ x, dword [f_i + file_start + VECT_SIZE - 1] | |
541 | vpmaddwd yhashes %+ x, yhashes %+ x, yhash_prod %+ x | |
542 | vpmaddwd yhashes %+ x, yhashes %+ x, yhash_prod %+ x | |
543 | vpand yhashes %+ x, yhashes %+ x, yhash_mask %+ x | |
544 | vmovd hash %+ d, yhashes %+ x | |
545 | ||
546 | mov word [hash_table + HASH_BYTES * hash], tmp %+ w | |
547 | ||
548 | ;; Calculate dist_icf_code | |
;; ydists = i - (i - dist) - 1 = dist - 1.
549 | vpaddd ydists, ydists, yones | |
550 | vpsubd ydists, yincrement, ydists | |
551 | ||
;; ydist_extra := max(0, bit_length(dist - 1) - 2): nibbles are spread over
;; bytes, mapped to bit lengths through bit_index, reduced to the most
;; significant non-zero nibble, biased by base_offset (4 * position - 2),
;; summed per dword with vpsadbw, and negative results are cleared.
552 | vpbroadcastd ytmp2, [low_nibble] | |
553 | vbroadcasti128 ytmp3, [nibble_order] | |
554 | vpslld ydist_extra, ydists, 12 | |
555 | vpor ydist_extra, ydists, ydist_extra | |
556 | vpand ydist_extra, ydist_extra, ytmp2 | |
557 | vpshufb ydist_extra, ydist_extra, ytmp3 | |
558 | vbroadcasti128 ytmp2, [bit_index] | |
559 | vpshufb ydist_extra, ytmp2, ydist_extra | |
560 | vpxor ytmp2, ytmp2, ytmp2 | |
561 | vpcmpgtb ytmp2, ydist_extra, ytmp2 | |
562 | vpsrld ytmp3, ytmp2, 8 | |
563 | vpandn ytmp2, ytmp3, ytmp2 | |
564 | vpsrld ytmp3, ytmp2, 16 | |
565 | vpandn ytmp2, ytmp3, ytmp2 | |
566 | vpsrld ytmp3, ytmp2, 24 | |
567 | vpandn ytmp2, ytmp3, ytmp2 | |
568 | vpbroadcastd ytmp3, [base_offset] | |
569 | vpaddb ydist_extra, ytmp3 | |
570 | vpand ydist_extra, ydist_extra, ytmp2 | |
571 | vpsrlq ytmp2, ydist_extra, 32 | |
572 | vpxor ytmp3, ytmp3, ytmp3 | |
573 | vpsadbw ydist_extra, ydist_extra, ytmp3 | |
574 | vpsadbw ytmp2, ytmp2, ytmp3 | |
575 | vpsubd ydist_extra, ydist_extra, ytmp2 | |
576 | vpsllq ytmp2, ytmp2, 32 | |
577 | vpor ydist_extra, ydist_extra, ytmp2 | |
578 | vpcmpgtb ytmp3, ydist_extra, ytmp3 | |
579 | vpand ydist_extra, ydist_extra, ytmp3 | |
580 | ||
;; Symbol = ((dist - 1) >> ydist_extra) + 2 * ydist_extra; the extra value
;; (low ydist_extra bits of dist - 1, zero unless dist - 1 > 1) is placed
;; EXTRA_BITS_OFFSET - DIST_OFFSET bits up and added to the symbol.
581 | vpsllvd ycode, yones, ydist_extra | |
582 | vpsubd ycode, ycode, yones | |
583 | vpcmpgtd ytmp2, ydists, yones | |
584 | vpand ycode, ydists, ycode | |
585 | vpand ycode, ycode, ytmp2 | |
586 | vpsrlvd ydists, ydists, ydist_extra | |
587 | vpslld ydist_extra, ydist_extra, 1 | |
588 | vpaddd ydists, ydists, ydist_extra | |
589 | vpslld ycode, ycode, EXTRA_BITS_OFFSET - DIST_OFFSET | |
590 | vpaddd ydists, ydists, ycode | |
591 | ||
592 | ;; Setup ydists for combining with ylens | |
593 | vpslld ydists, ydists, DIST_OFFSET | |
594 | ||
595 | ;; xor current data with lookback dist | |
;; A byte becomes zero where the current string and the candidate agree.
596 | vpxor ylens1, ylens1, ylookup | |
597 | vpxor ylens2, ylens2, ylookup2 | |
598 | ||
599 | ;; Compute match length | |
;; Build a per-string bit mask of matching bytes (vpcmpeqb, shift_finish
;; weights, vpsadbw sum), compact the qword results into one dword per lane,
;; then count the consecutive set bits from bit 0 through match_cnt_perm (the
;; high nibble counts only when the low nibble's count is 4).
600 | vpxor ytmp, ytmp, ytmp | |
601 | vpcmpeqb ylens1, ylens1, ytmp | |
602 | vpcmpeqb ylens2, ylens2, ytmp | |
603 | vpbroadcastq yshift_finish, [shift_finish] | |
604 | vpand ylens1, ylens1, yshift_finish | |
605 | vpand ylens2, ylens2, yshift_finish | |
606 | vpsadbw ylens1, ylens1, ytmp | |
607 | vpsadbw ylens2, ylens2, ytmp | |
608 | vmovdqu ydownconvert_qd, [downconvert_qd] | |
609 | vpshufb ylens1, ylens1, ydownconvert_qd | |
610 | vextracti128 ytmp %+ x, ylens1, 1 | |
611 | vpor ylens1, ylens1, ytmp | |
612 | vpshufb ylens2, ylens2, ydownconvert_qd | |
613 | vextracti128 ytmp %+ x, ylens2, 1 | |
614 | vpor ylens2, ylens2, ytmp | |
615 | vinserti128 ylens1, ylens1, ylens2 %+ x, 1 | |
616 | vpbroadcastd ytmp, [low_nibble] | |
617 | vpsrld ylens2, ylens1, 4 | |
618 | vpand ylens1, ylens1, ytmp | |
619 | vbroadcasti128 ytmp, [match_cnt_perm] | |
620 | vpbroadcastd ytmp2, [match_cnt_low_max] | |
621 | vpshufb ylens1, ytmp, ylens1 | |
622 | vpshufb ylens2, ytmp, ylens2 | |
623 | vpcmpeqb ytmp, ylens1, ytmp2 | |
624 | vpand ylens2, ylens2, ytmp | |
625 | vpaddd ylens1, ylens1, ylens2 | |
626 | ||
627 | ;; Zero out matches which should not be taken | |
;; Rotate lengths and codes up by one lane and put the carried prev_len /
;; prev_dist into lane 0. Unlike .loop1, the value rotated out of lane 7 is
;; not saved.
628 | vmovdqu yrot_left, [drot_left] | |
629 | vpermd ylens2, yrot_left, ylens1 | |
630 | vpermd ydists, yrot_left, ydists | |
631 | ||
632 | vpinsrd ytmp %+ x, ylens2 %+ x, prev_len %+ d, 0 | |
633 | vinserti128 ylens2, ylens2, ytmp %+ x, 0 | |
634 | ||
635 | vpinsrd ytmp %+ x, ydists %+ x, prev_dist %+ d, 0 | |
636 | vinserti128 ydists, ydists, ytmp %+ x, 0 | |
637 | ||
;; ytmp := lanes emitted as literals: the length is not greater than the
;; shortest_matches constant, or the following position (ylens1) has a longer
;; match. ylens1 := the length where the match is kept, zero elsewhere.
638 | vpbroadcastd ytmp, [shortest_matches] | |
639 | vpcmpgtd ytmp, ylens2, ytmp | |
640 | vpcmpgtd ytmp2, ylens1, ylens2 | |
641 | ||
642 | vpcmpeqd ytmp3, ytmp3, ytmp3 | |
643 | vpxor ytmp, ytmp, ytmp3 | |
644 | vpor ytmp, ytmp, ytmp2 | |
645 | ||
646 | vpandn ylens1, ytmp, ylens2 | |
647 | ||
648 | ;; Update zdists to match ylens1 | |
;; The register updated here is ydists: kept matches get the length plus the
;; constant 254 added to the shifted distance code.
649 | vpbroadcastd ytmp2, [twofiftyfour] | |
650 | vpaddd ydists, ydists, ylens1 | |
651 | vpaddd ydists, ydists, ytmp2 | |
652 | ||
;; Literal lanes get the input byte of their position plus null_dist_syms.
;; f_i has not been advanced yet, so lane 0 is the byte at f_i - 1.
653 | vpbroadcastd ynull_syms, [null_dist_syms] | |
654 | vpmovzxbd ytmp3, [f_i + file_start - 1] | |
655 | vpaddd ytmp3, ynull_syms | |
656 | vpand ytmp3, ytmp3, ytmp | |
657 | vpandn ydists, ytmp, ydists | |
658 | vpor ydists, ydists, ytmp3 | |
659 | ||
660 | ;;Store ydists | |
;; Final batch of codes; matches_next is not advanced afterwards.
661 | vmovdqu [matches_next], ydists | |
662 | add f_i, VECT_SIZE | |
663 | ||
;; end_main: common exit. Returns (final f_i) - (f_i at entry) - 1 in rax.
;; NOTE(review): on the early exit taken before any processing, f_i still
;; equals f_i_orig, so the value returned is -1 - confirm callers never pass a
;; non-positive length or handle that value.
end_main:
        sub     f_i, f_i_orig
        sub     f_i, 1

%ifnidn f_i, rax
        mov     rax, f_i
%endif
        ;; Clear the upper halves of the ymm registers so that legacy SSE code
        ;; in the caller does not pay AVX/SSE transition or false-dependency
        ;; penalties. Issued before FUNC_RESTORE so that the frame release and
        ;; ret stay adjacent; the low 128 bits restored there are unaffected.
        vzeroupper
        FUNC_RESTORE
        ret

endproc_frame
675 | ||
;; Constant tables read by gen_icf_map_lh1_04. Only the start of the section
;; is aligned; the code reads the tables through unaligned loads, broadcasts
;; or VEX memory operands.
676 | section .data | |
677 | align 32 | |
678 | ;; 32 byte data | |
;; vpermd indices: copy dwords 1-4 of the source into both 128-bit halves.
679 | datas_perm2: | |
680 | dd 0x1, 0x2, 0x3, 0x4, 0x1, 0x2, 0x3, 0x4 | |
;; vpermd indices: lane i takes lane i - 1, lane 0 takes lane 7.
681 | drot_left: | |
682 | dd 0x7, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6 | |
;; vpshufb pattern: dword lane i receives the 4 bytes starting at offset i
;; (offsets 0-3 in the low half, 4-7 in the high half).
683 | datas_shuf: | |
684 | db 0x0, 0x1, 0x2, 0x3 | |
685 | db 0x1, 0x2, 0x3, 0x4 | |
686 | db 0x2, 0x3, 0x4, 0x5 | |
687 | db 0x3, 0x4, 0x5, 0x6 | |
688 | db 0x4, 0x5, 0x6, 0x7 | |
689 | db 0x5, 0x6, 0x7, 0x8 | |
690 | db 0x6, 0x7, 0x8, 0x9 | |
691 | db 0x7, 0x8, 0x9, 0xa | |
;; vpshufb pattern: each qword receives the 8 bytes starting at offset 0, 1
;; (low half) and 2, 3 (high half).
692 | qword_shuf: | |
693 | db 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 | |
694 | db 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8 | |
695 | db 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9 | |
696 | db 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa | |
;; Lane numbers 0-7.
697 | increment: | |
698 | dd 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 | |
;; vpshufb pattern: moves byte 0 of each qword into a dword and zeroes the
;; rest (0xff selects zero). The low half fills dwords 0-1 and the high half
;; fills dwords 2-3, so the two halves can be merged with an or.
699 | downconvert_qd: | |
700 | db 0x00, 0xff, 0xff, 0xff, 0x08, 0xff, 0xff, 0xff | |
701 | db 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff | |
702 | db 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff | |
703 | db 0x00, 0xff, 0xff, 0xff, 0x08, 0xff, 0xff, 0xff | |
704 | ||
705 | ;; 16 byte data | |
;; Lookup by nibble: number of consecutive set bits starting at bit 0.
706 | match_cnt_perm: | |
707 | db 0x0, 0x1, 0x0, 0x2, 0x0, 0x1, 0x0, 0x3, 0x0, 0x1, 0x0, 0x2, 0x0, 0x1, 0x0, 0x4 | |
;; Lookup by nibble: bit length of the nibble value (0 for 0, 4 for 8-15).
708 | bit_index: | |
709 | db 0x0, 0x1, 0x2, 0x2, 0x3, 0x3, 0x3, 0x3 | |
710 | db 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4 | |
;; vpshufb pattern: swaps bytes 1 and 2 of every dword.
711 | nibble_order: | |
712 | db 0x0, 0x2, 0x1, 0x3, 0x4, 0x6, 0x5, 0x7 | |
713 | db 0x8, 0xa, 0x9, 0xb, 0xc, 0xe, 0xd, 0xf | |
714 | ||
715 | ;; 8 byte data | |
;; Byte k holds 1 << k: weights that turn per-byte flags into a bit mask.
716 | shift_finish: | |
717 | db 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 | |
718 | ||
719 | ;; 4 byte data | |
720 | ones: | |
721 | dd 0x1 | |
;; The two 16-bit multipliers applied by vpmaddwd when hashing.
722 | %define PROD1 0xE84B | |
723 | %define PROD2 0x97B1 | |
724 | hash_prod: | |
725 | dw PROD1, PROD2 | |
;; Added to the input byte of every lane that is emitted as a literal.
726 | null_dist_syms: | |
727 | dd LIT | |
;; Added, together with the match length, to the code of every kept match.
728 | twofiftyfour: | |
729 | dd 0xfe | |
;; A match is kept only when its length is strictly greater than this value.
730 | shortest_matches: | |
731 | dd MIN_DEF_MATCH | |
;; Masks selecting the high / low 16 bits of a dword.
732 | upper_word: | |
733 | dw 0x0000, 0xffff | |
734 | low_word: | |
735 | dw 0xffff, 0x0000 | |
;; Mask keeping the low 4 bits of every byte.
736 | low_nibble: | |
737 | db 0x0f, 0x0f, 0x0f, 0x0f | |
;; Count at which the low nibble is fully matched, so the high nibble's count
;; is added as well.
738 | match_cnt_low_max: | |
739 | dd 0x4 | |
;; Per-nibble bias 4 * position - 2, added to the nibble's bit length.
740 | base_offset: | |
741 | db -0x2, 0x2, 0x6, 0xa |