1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2018 Intel Corporation All rights reserved.
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the distribution.
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
30 %include "reg_sizes.asm"
31 %include "lz0a_const.asm"
32 %include "data_struct2.asm"
33 %include "huffman.asm"
; ---------------------------------------------------------------------
; Symbolic names used by the routine below. Every %define is a plain
; text substitution, so aliases that expand to the same register cannot
; hold live values at the same time.
; ---------------------------------------------------------------------
; Opens a conditional that tests whether NASM's output format is win64.
39 %ifidn __OUTPUT_FORMAT__, win64
; level_buf / matches_next expand to the argument macros arg1 / arg2
; (arg1 and arg2 are not defined in this section -- presumably they come
; from an include or an elided define; confirm).
54 %define level_buf arg1
55 %define matches_next arg2
59 %define file_start rbp
; hash_table is an address expression (base register + constant offset),
; intended to be used inside a memory operand.
66 %define hash_table level_buf + _hash_map_hash_table
69 %define datas_lookup ymm1
72 %define ydists_lookup ymm4
; The next two aliases both expand to ymm5.
74 %define ydownconvert_qd ymm5
78 %define ynull_syms ymm5
; The next four aliases all expand to ymm11.
86 %define yrot_left ymm11
87 %define yshift_finish ymm11
88 %define yqword_shuf ymm11
89 %define yhash_prod ymm11
94 %define ydatas_perm2 ymm13
95 %define yincrement ymm14
; The next three aliases all expand to ymm15.
98 %define ydist_extra ymm15
99 %define yhash_mask ymm15
100 %define ydist_mask ymm15
; ---------------------------------------------------------------------
; win64 stack frame constants and prologue body.
;
; stack_size evaluates to 232 bytes. Layout, as offsets from rsp once
; the frame has been allocated:
;   0*16 .. 9*16       xmm6-xmm15, one 16-byte slot per register
;   10*16 + n*8        rsi, rdi, rbp, r12, r13 (n = 0..4)
;   stack_size - 16    16 bytes of local storage (local_storage_offset)
;
; Every xmm slot is written with vmovdqa, the same aligned form that the
; win64 FUNC_RESTORE uses to read the slots back, so rsp has to be
; 16-byte aligned after alloc_stack.
; ---------------------------------------------------------------------
102 %ifidn __OUTPUT_FORMAT__, win64
103 %define stack_size 10*16 + 6 * 8 + 3 * 8
104 %define local_storage_offset (stack_size - 16)
; func(x) expands to proc_frame, which is defined outside this section.
105 %define func(x) proc_frame x
; alloc_stack and save_reg are macros defined outside this section;
; presumably they adjust rsp / store the register and also emit the
; unwind information -- confirm against their definitions.
108 alloc_stack stack_size
; Spill xmm6-xmm15 into consecutive 16-byte slots.
109 vmovdqa [rsp + 0*16], xmm6
110 vmovdqa [rsp + 1*16], xmm7
111 vmovdqa [rsp + 2*16], xmm8
112 vmovdqa [rsp + 3*16], xmm9
113 vmovdqa [rsp + 4*16], xmm10
114 vmovdqa [rsp + 5*16], xmm11
115 vmovdqa [rsp + 6*16], xmm12
116 vmovdqa [rsp + 7*16], xmm13
117 vmovdqa [rsp + 8*16], xmm14
118 vmovdqa [rsp + 9*16], xmm15
; Spill the general-purpose registers into 8-byte slots that start right
; after the xmm area.
119 save_reg rsi, 10*16 + 0*8
120 save_reg rdi, 10*16 + 1*8
121 save_reg rbp, 10*16 + 2*8
122 save_reg r12, 10*16 + 3*8
123 save_reg r13, 10*16 + 4*8
; ---------------------------------------------------------------------
; FUNC_RESTORE (no parameters): reloads the registers that the win64
; prologue spilled, from the same rsp-relative offsets. The vmovdqa
; loads require every xmm slot address to be 16-byte aligned.
; ---------------------------------------------------------------------
127 %macro FUNC_RESTORE 0
; xmm6-xmm15 from the ten 16-byte slots at the bottom of the frame.
128 vmovdqa xmm6, [rsp + 0*16]
129 vmovdqa xmm7, [rsp + 1*16]
130 vmovdqa xmm8, [rsp + 2*16]
131 vmovdqa xmm9, [rsp + 3*16]
132 vmovdqa xmm10, [rsp + 4*16]
133 vmovdqa xmm11, [rsp + 5*16]
134 vmovdqa xmm12, [rsp + 6*16]
135 vmovdqa xmm13, [rsp + 7*16]
136 vmovdqa xmm14, [rsp + 8*16]
137 vmovdqa xmm15, [rsp + 9*16]
; General-purpose registers from the 8-byte slots above the xmm area.
139 mov rsi, [rsp + 10*16 + 0*8]
140 mov rdi, [rsp + 10*16 + 1*8]
141 mov rbp, [rsp + 10*16 + 2*8]
142 mov r12, [rsp + 10*16 + 3*8]
143 mov r13, [rsp + 10*16 + 4*8]
; Second definition of the frame constants: a 16-byte frame whose local
; storage starts at offset 0 (presumably the non-win64 branch of the
; conditional that defines the 232-byte win64 frame -- confirm).
147 %define stack_size 16
148 %define local_storage_offset 0
; FUNC_RESTORE variant that goes with the 16-byte frame.
158 %macro FUNC_RESTORE 0
; The 16 bytes of local storage are split into two 8-byte slots:
; the distance mask first, the hash mask right after it.
166 %define dist_mask_offset local_storage_offset
167 %define hash_mask_offset local_storage_offset + 8
; ---------------------------------------------------------------------
; gen_icf_map_lh1_04: entry point, setup, and handling of the first
; input byte. The names stream, f_i, tmp, hash, prev_len, prev_dist,
; next_in, yones and yhashes are aliases defined outside this section.
; ---------------------------------------------------------------------
172 global gen_icf_map_lh1_04
173 func(gen_icf_map_lh1_04)
; file_start <- qword stored at stream + _next_in.
176 mov file_start, [stream + _next_in]
; f_i <- dword stored at stream + _total_in (a 32-bit register write
; also clears the upper 32 bits of the full register).
177 mov f_i %+ d, dword [stream + _total_in]
185 ;; Prep for main loop
; Copy the distance mask and the hash mask into the two local stack
; slots; later code re-broadcasts them from there because the vector
; registers that hold them are shared with other temporaries.
186 mov tmp %+ d, dword [stream + _internal_state_dist_mask]
187 mov [rsp + dist_mask_offset], tmp
188 mov tmp %+ d, dword [stream + _internal_state_hash_mask]
189 mov [rsp + hash_mask_offset], tmp
; level_buf <- qword stored at stream + _level_buf.
191 mov level_buf, [stream + _level_buf]
; Loop-invariant vector constants.
193 vmovdqu yincrement, [increment]
194 vpbroadcastd yones, [ones]
195 vmovdqu ydatas_perm2, [datas_perm2]
197 ;; Process first byte
; Hash the dword at file_start + f_i: two multiply-add passes with
; hash_prod, then AND with the hash mask. Lane 0 ends up in `hash`.
198 vpbroadcastd yhash_prod, [hash_prod]
199 vpbroadcastd yhash_mask, [rsp + hash_mask_offset]
200 vmovd yhashes %+ x, dword [f_i + file_start]
201 vpmaddwd yhashes, yhashes, yhash_prod
202 vpmaddwd yhashes, yhashes, yhash_prod
203 vpand yhashes, yhashes, yhash_mask
204 vmovd hash %+ d, yhashes %+ x
; NOTE(review): in the instructions shown above tmp was last loaded with
; the hash mask, yet it is used as a base pointer here -- confirm that
; tmp is set to the state base before this compare.
205 cmp byte [tmp + _internal_state_has_hist], IGZIP_NO_HIST
207 ;; No history, the byte is a literal
; Clear the carried match length/distance and mark history as present.
208 xor prev_len, prev_len
209 xor prev_dist, prev_dist
210 mov byte [tmp + _internal_state_has_hist], IGZIP_HIST
214 ;; History exists, need to set prev_len and prev_dist accordingly
215 lea next_in, [f_i + file_start]
217 ;; Determine match lookback distance
; Low 16 bits of tmp = low 16 bits of f_i minus the 16-bit value stored
; in the hash-table slot selected by `hash`; the result is then ANDed
; (as a dword) with the distance mask from the stack slot.
219 mov tmp %+ w, f_i %+ w
221 sub tmp %+ w, word [hash_table + HASH_BYTES * hash]
223 and tmp %+ d, [rsp + dist_mask_offset]
226 ;; Check first 8 bytes of match
; XOR of the 8 bytes at next_in with the 8 bytes at next_in + tmp - 1;
; a zero byte in the result marks a position where the two agree.
227 mov prev_len, [next_in]
228 xor prev_len, [next_in + tmp - 1]
235 ;; The third register is unused on Haswell and later,
236 ;; This line will not work on previous architectures
; get_dist_icf_code is a macro defined outside this section.
237 get_dist_icf_code tmp, prev_dist, tmp
; tzcnt yields the index of the lowest set bit of the XOR, i.e. the
; number of low-order bits that were equal (64 when the XOR is zero).
245 tzcnt prev_len, prev_len
247 cmp prev_len, MIN_DEF_MATCH
; Record the low 16 bits of the current index in the hash-table slot.
251 mov word [hash_table + HASH_BYTES * hash], f_i %+ w
; ---------------------------------------------------------------------
; Priming before the loop: hash the first vector of positions, gather
; the current hash-table entries for it, write the new indices back,
; and pre-compute the hashes of the following vector.
; ---------------------------------------------------------------------
; Load 32 input bytes, copy the low 128 bits into both lanes
; (vpermq 0x44 selects qwords 0,1,0,1), then byte-shuffle with the
; datas_shuf table (defined outside this block; presumably it lays out
; one 4-byte window per position).
256 vmovdqu datas, [f_i + file_start]
257 vpermq yhashes, datas, 0x44
258 vpshufb yhashes, yhashes, [datas_shuf]
; Two multiply-add passes with hash_prod, then AND with the hash mask.
259 vpmaddwd yhashes, yhashes, yhash_prod
260 vpmaddwd yhashes, yhashes, yhash_prod
261 vpand yhashes, yhashes, yhash_mask
; ylookup / ylookup2: input bytes rearranged with the qword_shuf table
; for the later XOR against gathered match bytes. ylookup is built from
; qwords 0,1 of datas, ylookup2 from the dwords that ydatas_perm2 picks.
263 vpermq ylookup, datas, 0x44
264 vmovdqu yqword_shuf, [qword_shuf]
265 vpshufb ylookup, ylookup, yqword_shuf
266 vpermd ylookup2, ydatas_perm2, datas
267 vpshufb ylookup2, ylookup2, yqword_shuf
269 ;;gather/scatter hashes
; Comparing a register with itself sets every bit: an all-lanes gather
; mask (the gather instruction clears the mask register as it completes).
270 vpcmpeqq ytmp, ytmp, ytmp
271 vpgatherdd ydists_lookup, [hash_table + HASH_BYTES * yhashes], ytmp
; yindex = f_i broadcast to all lanes + yincrement.
; yscatter = (gathered dword AND upper_word) OR (yindex AND low_word).
; The two mask constants are defined outside this block; presumably
; they keep the upper 16 bits of the old dword and the low 16 bits of
; the new index -- confirm.
273 vpbroadcastd ytmp2, [upper_word]
274 vpbroadcastd ytmp, [low_word]
275 vmovd yindex %+ x, f_i %+ d
276 vpbroadcastd yindex, yindex %+ x
277 vpaddd yindex, yindex, yincrement
278 vpand yscatter, ydists_lookup, ytmp2
279 vpand ytmp, yindex, ytmp
280 vpor yscatter, yscatter, ytmp
; Scalar scatter: for each lane, move its hash into tmp and store the
; matching dword of yscatter into the table. Lanes are written in
; ascending order, so on equal hashes the higher lane's store lands last.
282 vmovd tmp %+ d, yhashes %+ x
283 vmovd [hash_table + HASH_BYTES * tmp], yscatter %+ x
284 vpextrd tmp %+ d, yhashes %+ x, 1
285 vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 1
286 vpextrd tmp %+ d, yhashes %+ x, 2
287 vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 2
288 vpextrd tmp %+ d,yhashes %+ x, 3
289 vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 3
; Bring the upper 128 bits (lanes 4-7) down and repeat.
291 vextracti128 yscatter %+ x, yscatter, 1
292 vextracti128 yhashes %+ x, yhashes, 1
294 vmovd tmp %+ d, yhashes %+ x
295 vmovd [hash_table + HASH_BYTES * tmp], yscatter %+ x
296 vpextrd tmp %+ d, yhashes %+ x, 1
297 vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 1
298 vpextrd tmp %+ d, yhashes %+ x, 2
299 vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 2
300 vpextrd tmp %+ d,yhashes %+ x, 3
301 vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 3
303 ;; Compute hash for next loop
; yhash_prod is re-broadcast because its register (ymm11) was reused
; for yqword_shuf above; yhash_mask is reloaded from its stack slot.
304 vpbroadcastd yhash_prod, [hash_prod]
305 vpbroadcastd yhash_mask, [rsp + hash_mask_offset]
; Same hash sequence as above, applied to the 32 bytes at +VECT_SIZE.
306 vmovdqu datas, [f_i + file_start + VECT_SIZE]
307 vpermq yhashes, datas, 0x44
308 vpshufb yhashes, yhashes, [datas_shuf]
309 vpmaddwd yhashes, yhashes, yhash_prod
310 vpmaddwd yhashes, yhashes, yhash_prod
311 vpand yhashes, yhashes, yhash_mask
; Preload the 32 bytes at +2*VECT_SIZE.
313 vmovdqu datas_lookup, [f_i + file_start + 2 * VECT_SIZE]
; Pull the end bound in by one vector.
315 sub f_i_end, VECT_SIZE
; ---------------------------------------------------------------------
; Loop body, part 1: convert the table entries gathered for the current
; vector into per-lane byte offsets, then gather and update the hash
; table for the next vector and hash the vector after that.
; ---------------------------------------------------------------------
320 lea next_in, [f_i + file_start]
322 ;; Calculate look back dists
; Per lane: d = ((yindex - ydists_lookup - 1) AND dist_mask) + 1, then
; ydists = yincrement - d. ydists is used further down as the signed
; index of the qword gathers relative to next_in.
323 vpbroadcastd ydist_mask, [rsp + dist_mask_offset]
324 vpaddd ydists, ydists_lookup, yones
325 vpsubd ydists, yindex, ydists
326 vpand ydists, ydists, ydist_mask
327 vpaddd ydists, ydists, yones
328 vpsubd ydists, yincrement, ydists
330 ;;gather/scatter hashes
; All-ones gather mask, then fetch one dword per lane from the table.
333 vpcmpeqq ytmp, ytmp, ytmp
334 vpgatherdd ydists_lookup, [hash_table + HASH_BYTES * yhashes], ytmp
; yindex = f_i broadcast + yincrement;
; yscatter = (gathered AND upper_word) OR (yindex AND low_word).
336 vpbroadcastd ytmp2, [upper_word]
337 vpbroadcastd ytmp, [low_word]
338 vmovd yindex %+ x, f_i %+ d
339 vpbroadcastd yindex, yindex %+ x
340 vpaddd yindex, yindex, yincrement
341 vpand yscatter, ydists_lookup, ytmp2
342 vpand ytmp, yindex, ytmp
343 vpor yscatter, yscatter, ytmp
; Scalar scatter of lanes 0-3, in ascending lane order.
345 vmovd tmp %+ d, yhashes %+ x
346 vmovd [hash_table + HASH_BYTES * tmp], yscatter %+ x
347 vpextrd tmp %+ d, yhashes %+ x, 1
348 vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 1
349 vpextrd tmp %+ d, yhashes %+ x, 2
350 vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 2
351 vpextrd tmp %+ d,yhashes %+ x, 3
352 vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 3
; Bring lanes 4-7 down into the low 128 bits and scatter them too.
354 vextracti128 yscatter %+ x, yscatter, 1
355 vextracti128 yhashes %+ x, yhashes, 1
357 vmovd tmp %+ d, yhashes %+ x
358 vmovd [hash_table + HASH_BYTES * tmp], yscatter %+ x
359 vpextrd tmp %+ d, yhashes %+ x, 1
360 vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 1
361 vpextrd tmp %+ d, yhashes %+ x, 2
362 vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 2
363 vpextrd tmp %+ d,yhashes %+ x, 3
364 vpextrd [hash_table + HASH_BYTES * tmp], yscatter %+ x, 3
366 ;; Compute hash for next loop
; The constants are reloaded because their registers are shared with
; other temporaries; the hash input is the preloaded datas_lookup.
367 vpbroadcastd yhash_prod, [hash_prod]
368 vpbroadcastd yhash_mask, [rsp + hash_mask_offset]
369 vpermq yhashes, datas_lookup, 0x44
370 vpshufb yhashes, yhashes, [datas_shuf]
371 vpmaddwd yhashes, yhashes, yhash_prod
372 vpmaddwd yhashes, yhashes, yhash_prod
373 vpand yhashes, yhashes, yhash_mask
; ---------------------------------------------------------------------
; Loop body, part 2: fetch 8 candidate bytes per position and turn each
; lookback distance into its encoded distance field.
; ---------------------------------------------------------------------
; ydists2 = offsets of lanes 4-7. Each gather reads four qwords at
; next_in + offset; the mask is rebuilt before each gather because the
; instruction clears it.
376 vextracti128 ydists2 %+ x, ydists, 1
378 vpcmpeqq ytmp, ytmp, ytmp
379 vpgatherdq ylens1, [next_in + ydists %+ x], ytmp
380 vpcmpeqq ytmp, ytmp, ytmp
381 vpgatherdq ylens2, [next_in + ydists2 %+ x], ytmp
383 ;; Calculate dist_icf_code
; Reverses the last two steps of the offset computation:
; ydists = yincrement - (ydists + 1).
384 vpaddd ydists, ydists, yones
385 vpsubd ydists, yincrement, ydists
; Spread nibbles over bytes: (ydists OR ydists << 12) AND low_nibble
; leaves one nibble per byte, and the nibble_order shuffle reorders
; those bytes.
387 vpbroadcastd ytmp2, [low_nibble]
388 vbroadcasti128 ytmp3, [nibble_order]
389 vpslld ydist_extra, ydists, 12
390 vpor ydist_extra, ydists, ydist_extra
391 vpand ydist_extra, ydist_extra, ytmp2
392 vpshufb ydist_extra, ydist_extra, ytmp3
; Table lookup: each nibble value indexes the 16-byte bit_index table.
393 vbroadcasti128 ytmp2, [bit_index]
394 vpshufb ydist_extra, ytmp2, ydist_extra
; ytmp2 = 0xff in every byte whose lookup result is > 0. The three
; shift/andn steps then clear a byte's flag whenever a higher byte of
; the same dword is flagged, leaving only the highest flagged byte.
395 vpxor ytmp2, ytmp2, ytmp2
396 vpcmpgtb ytmp2, ydist_extra, ytmp2
397 vpsrld ytmp3, ytmp2, 8
398 vpandn ytmp2, ytmp3, ytmp2
399 vpsrld ytmp3, ytmp2, 16
400 vpandn ytmp2, ytmp3, ytmp2
401 vpsrld ytmp3, ytmp2, 24
402 vpandn ytmp2, ytmp3, ytmp2
; Add the per-byte base_offset and keep only the surviving byte.
403 vpbroadcastd ytmp3, [base_offset]
404 vpaddb ydist_extra, ytmp3
405 vpand ydist_extra, ydist_extra, ytmp2
; Byte sum per dword: vpsadbw against zero sums each qword, so the high
; dword is summed separately (ytmp2), subtracted from the qword total
; to get the low dword's sum, and merged back in at bit 32.
406 vpsrlq ytmp2, ydist_extra, 32
407 vpxor ytmp3, ytmp3, ytmp3
408 vpsadbw ydist_extra, ydist_extra, ytmp3
409 vpsadbw ytmp2, ytmp2, ytmp3
410 vpsubd ydist_extra, ydist_extra, ytmp2
411 vpsllq ytmp2, ytmp2, 32
412 vpor ydist_extra, ydist_extra, ytmp2
; Zero every byte that is not greater than zero as a signed byte.
413 vpcmpgtb ytmp3, ydist_extra, ytmp3
414 vpand ydist_extra, ydist_extra, ytmp3
; ycode = ydists AND ((1 << ydist_extra) - 1), forced to zero in lanes
; where ydists <= 1. ydists = (ydists >> ydist_extra) + 2 * ydist_extra,
; then ycode is added in after being shifted left by
; EXTRA_BITS_OFFSET - DIST_OFFSET.
416 vpsllvd ycode, yones, ydist_extra
417 vpsubd ycode, ycode, yones
418 vpcmpgtd ytmp2, ydists, yones
419 vpand ycode, ydists, ycode
420 vpand ycode, ycode, ytmp2
421 vpsrlvd ydists, ydists, ydist_extra
422 vpslld ydist_extra, ydist_extra, 1
423 vpaddd ydists, ydists, ydist_extra
424 vpslld ycode, ycode, EXTRA_BITS_OFFSET - DIST_OFFSET
425 vpaddd ydists, ydists, ycode
427 ;; Setup ydists for combining with ylens
428 vpslld ydists, ydists, DIST_OFFSET
; ---------------------------------------------------------------------
; Loop body, part 3: compare the gathered bytes with the current input
; and reduce each 8-byte comparison to a match length.
; ---------------------------------------------------------------------
430 ;; xor current data with lookback dist
; After the XOR a zero byte marks a byte position that matches.
431 vpxor ylens1, ylens1, ylookup
432 vpxor ylens2, ylens2, ylookup2
434 ;; Setup registers for next loop
; Rebuild ylookup / ylookup2 from datas for the next pass
; (yqword_shuf shares ymm11 with other aliases, hence the reload).
435 vpermq ylookup, datas, 0x44
436 vmovdqu yqword_shuf, [qword_shuf]
437 vpshufb ylookup, ylookup, yqword_shuf
438 vpermd ylookup2, ydatas_perm2, datas
439 vpshufb ylookup2, ylookup2, yqword_shuf
441 ;; Compute match length
; Bytes equal to zero become 0xff; ANDing with shift_finish keeps one
; flag bit per byte, and vpsadbw against zero adds the eight bytes of
; each qword into a single value.
442 vpxor ytmp, ytmp, ytmp
443 vpcmpeqb ylens1, ylens1, ytmp
444 vpcmpeqb ylens2, ylens2, ytmp
445 vpbroadcastq yshift_finish, [shift_finish]
446 vpand ylens1, ylens1, yshift_finish
447 vpand ylens2, ylens2, yshift_finish
448 vpsadbw ylens1, ylens1, ytmp
449 vpsadbw ylens2, ylens2, ytmp
; Pack the four qword results of each register into four dwords:
; shuffle with downconvert_qd, then OR the upper 128 bits into the
; lower. ylens2's four dwords become the upper half of ylens1.
450 vmovdqu ydownconvert_qd, [downconvert_qd]
451 vpshufb ylens1, ylens1, ydownconvert_qd
452 vextracti128 ytmp %+ x, ylens1, 1
453 vpor ylens1, ylens1, ytmp
454 vpshufb ylens2, ylens2, ydownconvert_qd
455 vextracti128 ytmp %+ x, ylens2, 1
456 vpor ylens2, ylens2, ytmp
457 vinserti128 ylens1, ylens1, ylens2 %+ x, 1
; Split each value into low nibble (ylens1) and the bits above it
; (ylens2), then look both up in the 16-entry match_cnt_perm table.
458 vpbroadcastd ytmp, [low_nibble]
459 vpsrld ylens2, ylens1, 4
460 vpand ylens1, ylens1, ytmp
461 vbroadcasti128 ytmp, [match_cnt_perm]
462 vpbroadcastd ytmp2, [match_cnt_low_max]
463 vpshufb ylens1, ytmp, ylens1
464 vpshufb ylens2, ytmp, ylens2
; The upper-nibble count is only added where the low-nibble count
; equals match_cnt_low_max.
465 vpcmpeqb ytmp, ylens1, ytmp2
466 vpand ylens2, ylens2, ytmp
467 vpaddd ylens1, ylens1, ylens2
469 ;; Preload for next loops
; Rotate the input pipeline: datas takes the preloaded vector and
; datas_lookup is refilled from memory.
470 vmovdqu datas, datas_lookup
471 vmovdqu datas_lookup, [f_i + file_start + 2 * VECT_SIZE]
; ---------------------------------------------------------------------
; Loop body, part 4: decide per position whether the match found at the
; previous position is emitted or replaced by a literal, then store the
; eight resulting codes.
; ---------------------------------------------------------------------
473 ;; Zero out matches which should not be taken
; Permute lengths and distance fields with the drot_left control
; (defined outside this block; presumably a one-lane rotation so each
; lane sees the previous position's value -- confirm).
474 vmovdqu yrot_left, [drot_left]
475 vpermd ylens2, yrot_left, ylens1
476 vpermd ydists, yrot_left, ydists
; Swap lane 0 with the value carried in prev_len: the carried value
; goes into lane 0 of ylens2 and the old lane 0 is saved in prev_len.
478 vpinsrd ytmp %+ x, ylens2 %+ x, prev_len %+ d, 0
479 vmovd prev_len %+ d, ylens2 %+ x
480 vinserti128 ylens2, ylens2, ytmp %+ x, 0
; Same exchange for the distance field and prev_dist.
482 vpinsrd ytmp %+ x, ydists %+ x, prev_dist %+ d, 0
483 vmovd prev_dist %+ d, ydists %+ x
484 vinserti128 ydists, ydists, ytmp %+ x, 0
; ytmp = NOT(ylens2 > shortest_matches) OR (ylens1 > ylens2):
; set in lanes where the rotated match is not taken.
486 vpbroadcastd ytmp, [shortest_matches]
487 vpcmpgtd ytmp, ylens2, ytmp
488 vpcmpgtd ytmp2, ylens1, ylens2
490 vpcmpeqd ytmp3, ytmp3, ytmp3
491 vpxor ytmp, ytmp, ytmp3
492 vpor ytmp, ytmp, ytmp2
; ylens1 = ylens2 where the match is taken, zero elsewhere.
494 vpandn ylens1, ytmp, ylens2
496 ;; Update ydists to match ylens1
; Add the selected length and the twofiftyfour constant to the field.
497 vpbroadcastd ytmp2, [twofiftyfour]
498 vpaddd ydists, ydists, ylens1
499 vpaddd ydists, ydists, ytmp2
; Alternative code for lanes flagged in ytmp: the input byte at
; f_i + file_start - VECT_SIZE - 1 + lane, zero-extended to a dword,
; plus null_dist_syms. The AND / ANDN / OR trio blends the two by ytmp.
501 vpbroadcastd ynull_syms, [null_dist_syms]
502 vpmovzxbd ytmp3, [f_i + file_start - VECT_SIZE - 1]
503 vpaddd ytmp3, ynull_syms
504 vpand ytmp3, ytmp3, ytmp
505 vpandn ydists, ytmp, ydists
506 vpor ydists, ydists, ytmp3
; Store the eight dword codes and advance the output pointer.
509 vmovdqu [matches_next], ydists
510 add matches_next, ICF_CODE_BYTES * VECT_SIZE
; ---------------------------------------------------------------------
; Processing of the final vector. It follows the same steps as the loop
; body but performs no further table gather/scatter or preloading;
; instead it rewrites one hash-table slot (see "Restore last update
; hash value") and leaves prev_len / prev_dist unmodified.
; ---------------------------------------------------------------------
516 lea next_in, [f_i + file_start]
518 ;; Calculate look back dists
; Per lane: d = ((yindex - ydists_lookup - 1) AND dist_mask) + 1, then
; ydists = yincrement - d (signed gather index relative to next_in).
519 vpbroadcastd ydist_mask, [rsp + dist_mask_offset]
520 vpaddd ydists, ydists_lookup, yones
521 vpsubd ydists, yindex, ydists
522 vpand ydists, ydists, ydist_mask
523 vpaddd ydists, ydists, yones
524 vpsubd ydists, yincrement, ydists
; Gather four qwords per instruction at next_in + offset; the all-ones
; mask is rebuilt each time because the gather clears it.
527 vextracti128 ydists2 %+ x, ydists, 1
528 vpcmpeqq ytmp, ytmp, ytmp
529 vpgatherdq ylens1, [next_in + ydists %+ x], ytmp
530 vpcmpeqq ytmp, ytmp, ytmp
531 vpgatherdq ylens2, [next_in + ydists2 %+ x], ytmp
533 ;; Restore last update hash value
; tmp = offset of the last lane (dword 3 of ydists2) + f_i.
534 vpextrd tmp %+ d, ydists2 %+ x, 3
535 add tmp %+ d, f_i %+ d
; Hash the dword at f_i + file_start + VECT_SIZE - 1 (128-bit forms of
; the usual multiply-add / mask sequence) ...
537 vpbroadcastd yhash_prod %+ x, [hash_prod]
538 vpbroadcastd yhash_mask %+ x, [rsp + hash_mask_offset]
540 vmovd yhashes %+ x, dword [f_i + file_start + VECT_SIZE - 1]
541 vpmaddwd yhashes %+ x, yhashes %+ x, yhash_prod %+ x
542 vpmaddwd yhashes %+ x, yhashes %+ x, yhash_prod %+ x
543 vpand yhashes %+ x, yhashes %+ x, yhash_mask %+ x
544 vmovd hash %+ d, yhashes %+ x
; ... and write the low 16 bits of tmp into that table slot.
546 mov word [hash_table + HASH_BYTES * hash], tmp %+ w
548 ;; Calculate dist_icf_code
; ydists = yincrement - (ydists + 1).
549 vpaddd ydists, ydists, yones
550 vpsubd ydists, yincrement, ydists
; One nibble per byte, reordered by nibble_order, then looked up in the
; bit_index table.
552 vpbroadcastd ytmp2, [low_nibble]
553 vbroadcasti128 ytmp3, [nibble_order]
554 vpslld ydist_extra, ydists, 12
555 vpor ydist_extra, ydists, ydist_extra
556 vpand ydist_extra, ydist_extra, ytmp2
557 vpshufb ydist_extra, ydist_extra, ytmp3
558 vbroadcasti128 ytmp2, [bit_index]
559 vpshufb ydist_extra, ytmp2, ydist_extra
; Flag bytes with a positive lookup result and keep only the highest
; flagged byte of each dword.
560 vpxor ytmp2, ytmp2, ytmp2
561 vpcmpgtb ytmp2, ydist_extra, ytmp2
562 vpsrld ytmp3, ytmp2, 8
563 vpandn ytmp2, ytmp3, ytmp2
564 vpsrld ytmp3, ytmp2, 16
565 vpandn ytmp2, ytmp3, ytmp2
566 vpsrld ytmp3, ytmp2, 24
567 vpandn ytmp2, ytmp3, ytmp2
; Add base_offset per byte, keep the surviving byte, and sum the bytes
; of each dword (high dword summed separately and merged at bit 32).
568 vpbroadcastd ytmp3, [base_offset]
569 vpaddb ydist_extra, ytmp3
570 vpand ydist_extra, ydist_extra, ytmp2
571 vpsrlq ytmp2, ydist_extra, 32
572 vpxor ytmp3, ytmp3, ytmp3
573 vpsadbw ydist_extra, ydist_extra, ytmp3
574 vpsadbw ytmp2, ytmp2, ytmp3
575 vpsubd ydist_extra, ydist_extra, ytmp2
576 vpsllq ytmp2, ytmp2, 32
577 vpor ydist_extra, ydist_extra, ytmp2
; Zero every byte that is not greater than zero as a signed byte.
578 vpcmpgtb ytmp3, ydist_extra, ytmp3
579 vpand ydist_extra, ydist_extra, ytmp3
; ycode = ydists AND ((1 << ydist_extra) - 1), zeroed where ydists <= 1;
; ydists = (ydists >> ydist_extra) + 2 * ydist_extra, plus ycode shifted
; left by EXTRA_BITS_OFFSET - DIST_OFFSET.
581 vpsllvd ycode, yones, ydist_extra
582 vpsubd ycode, ycode, yones
583 vpcmpgtd ytmp2, ydists, yones
584 vpand ycode, ydists, ycode
585 vpand ycode, ycode, ytmp2
586 vpsrlvd ydists, ydists, ydist_extra
587 vpslld ydist_extra, ydist_extra, 1
588 vpaddd ydists, ydists, ydist_extra
589 vpslld ycode, ycode, EXTRA_BITS_OFFSET - DIST_OFFSET
590 vpaddd ydists, ydists, ycode
592 ;; Setup ydists for combining with ylens
593 vpslld ydists, ydists, DIST_OFFSET
595 ;; xor current data with lookback dist
; After the XOR a zero byte marks a byte position that matches.
596 vpxor ylens1, ylens1, ylookup
597 vpxor ylens2, ylens2, ylookup2
599 ;; Compute match length
; Zero bytes -> 0xff, AND with shift_finish, byte-sum per qword.
600 vpxor ytmp, ytmp, ytmp
601 vpcmpeqb ylens1, ylens1, ytmp
602 vpcmpeqb ylens2, ylens2, ytmp
603 vpbroadcastq yshift_finish, [shift_finish]
604 vpand ylens1, ylens1, yshift_finish
605 vpand ylens2, ylens2, yshift_finish
606 vpsadbw ylens1, ylens1, ytmp
607 vpsadbw ylens2, ylens2, ytmp
; Pack the qword results into eight dwords held in ylens1.
608 vmovdqu ydownconvert_qd, [downconvert_qd]
609 vpshufb ylens1, ylens1, ydownconvert_qd
610 vextracti128 ytmp %+ x, ylens1, 1
611 vpor ylens1, ylens1, ytmp
612 vpshufb ylens2, ylens2, ydownconvert_qd
613 vextracti128 ytmp %+ x, ylens2, 1
614 vpor ylens2, ylens2, ytmp
615 vinserti128 ylens1, ylens1, ylens2 %+ x, 1
; Nibble-wise lookup in match_cnt_perm; the upper-nibble count is added
; only where the low-nibble count equals match_cnt_low_max.
616 vpbroadcastd ytmp, [low_nibble]
617 vpsrld ylens2, ylens1, 4
618 vpand ylens1, ylens1, ytmp
619 vbroadcasti128 ytmp, [match_cnt_perm]
620 vpbroadcastd ytmp2, [match_cnt_low_max]
621 vpshufb ylens1, ytmp, ylens1
622 vpshufb ylens2, ytmp, ylens2
623 vpcmpeqb ytmp, ylens1, ytmp2
624 vpand ylens2, ylens2, ytmp
625 vpaddd ylens1, ylens1, ylens2
627 ;; Zero out matches which should not be taken
; Permute with drot_left, then place the carried prev_len / prev_dist
; into lane 0 (unlike the loop body, the carried values are not
; refreshed here).
628 vmovdqu yrot_left, [drot_left]
629 vpermd ylens2, yrot_left, ylens1
630 vpermd ydists, yrot_left, ydists
632 vpinsrd ytmp %+ x, ylens2 %+ x, prev_len %+ d, 0
633 vinserti128 ylens2, ylens2, ytmp %+ x, 0
635 vpinsrd ytmp %+ x, ydists %+ x, prev_dist %+ d, 0
636 vinserti128 ydists, ydists, ytmp %+ x, 0
; ytmp = NOT(ylens2 > shortest_matches) OR (ylens1 > ylens2).
638 vpbroadcastd ytmp, [shortest_matches]
639 vpcmpgtd ytmp, ylens2, ytmp
640 vpcmpgtd ytmp2, ylens1, ylens2
642 vpcmpeqd ytmp3, ytmp3, ytmp3
643 vpxor ytmp, ytmp, ytmp3
644 vpor ytmp, ytmp, ytmp2
; ylens1 = ylens2 where the match is taken, zero elsewhere.
646 vpandn ylens1, ytmp, ylens2
648 ;; Update ydists to match ylens1
649 vpbroadcastd ytmp2, [twofiftyfour]
650 vpaddd ydists, ydists, ylens1
651 vpaddd ydists, ydists, ytmp2
; Alternative code for lanes flagged in ytmp: the input byte at
; f_i + file_start - 1 + lane, zero-extended, plus null_dist_syms;
; blended into ydists by ytmp.
653 vpbroadcastd ynull_syms, [null_dist_syms]
654 vpmovzxbd ytmp3, [f_i + file_start - 1]
655 vpaddd ytmp3, ynull_syms
656 vpand ytmp3, ytmp3, ytmp
657 vpandn ydists, ytmp, ydists
658 vpor ydists, ydists, ytmp3
; Store the eight dword codes.
661 vmovdqu [matches_next], ydists
; ---------------------------------------------------------------------
; Constant tables. The labels naming these tables are not part of this
; section; the names given in the notes below are inferred from the
; shape of each table and the loads in the code -- confirm each one.
; ---------------------------------------------------------------------
; Eight dword indices 1,2,3,4,1,2,3,4 (presumably datas_perm2).
680 dd 0x1, 0x2, 0x3, 0x4, 0x1, 0x2, 0x3, 0x4
; Eight dword indices 7,0,1,...,6; as a vpermd control this puts lane 7
; into lane 0 and lane i-1 into lane i (presumably drot_left).
682 dd 0x7, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6
; 32 bytes: eight groups of four byte indices, group k = k..k+3
; (presumably datas_shuf).
684 db 0x0, 0x1, 0x2, 0x3
685 db 0x1, 0x2, 0x3, 0x4
686 db 0x2, 0x3, 0x4, 0x5
687 db 0x3, 0x4, 0x5, 0x6
688 db 0x4, 0x5, 0x6, 0x7
689 db 0x5, 0x6, 0x7, 0x8
690 db 0x6, 0x7, 0x8, 0x9
691 db 0x7, 0x8, 0x9, 0xa
; 32 bytes: four groups of eight byte indices, group k = k..k+7
; (presumably qword_shuf).
693 db 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
694 db 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8
695 db 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9
696 db 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa
; Eight dwords 0..7 (presumably increment).
698 dd 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
; 32-byte shuffle control; an index with the top bit set (0xff) yields a
; zero byte, so only source bytes 0 and 8 of each 128-bit half are kept
; (presumably downconvert_qd).
700 db 0x00, 0xff, 0xff, 0xff, 0x08, 0xff, 0xff, 0xff
701 db 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
702 db 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
703 db 0x00, 0xff, 0xff, 0xff, 0x08, 0xff, 0xff, 0xff
; 16-entry table: entry n is the number of consecutive one bits in n
; counted from bit 0 (presumably match_cnt_perm).
707 db 0x0, 0x1, 0x0, 0x2, 0x0, 0x1, 0x0, 0x3, 0x0, 0x1, 0x0, 0x2, 0x0, 0x1, 0x0, 0x4
; 16-entry table: entry n is 0 for n = 0, otherwise 1 + the index of the
; highest set bit of n (presumably bit_index).
709 db 0x0, 0x1, 0x2, 0x2, 0x3, 0x3, 0x3, 0x3
710 db 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4, 0x4
; 16-byte shuffle control that swaps the two middle bytes of every
; dword (presumably nibble_order).
712 db 0x0, 0x2, 0x1, 0x3, 0x4, 0x6, 0x5, 0x7
713 db 0x8, 0xa, 0x9, 0xb, 0xc, 0xe, 0xd, 0xf
; Eight bytes, byte k = 1 << k (presumably shift_finish).
717 db 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
; Four bytes of 0x0f (presumably low_nibble).
737 db 0x0f, 0x0f, 0x0f, 0x0f
; Four signed byte offsets, increasing by 4 per byte position
; (presumably base_offset).
741 db -0x2, 0x2, 0x6, 0xa