1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2018 Intel Corporation All rights reserved.
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the distribution.
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
30 %include "reg_sizes.asm"
31 %include "lz0a_const.asm"
32 %include "data_struct2.asm"
33 %include "huffman.asm"
;; The definitions below are assembled only when HAVE_AS_KNOWS_AVX512 is defined.
;; NOTE(review): the matching %endif lines are not visible in this view.
39 %ifdef HAVE_AS_KNOWS_AVX512
;; Output-format test for win64; the lines it guards are not visible in this view.
40 %ifidn __OUTPUT_FORMAT__, win64
;; Argument aliases (arg1/arg2 themselves are defined outside this view).
55 %define level_buf arg1
56 %define matches_next arg2
;; file_start is kept in rbp.
60 %define file_start rbp
;; hash_table is an address expression, not a register: level_buf plus the
;; _hash_map_hash_table offset. It is only meaningful inside [ ] operands.
67 %define hash_table level_buf + _hash_map_hash_table
;; Symbolic names for the ZMM registers used by the vector code.
70 %define datas_lookup zmm1
73 %define zdists_lookup zmm4
79 %define zlookup2 zmm10
;; NOTE(review): no use of match_lookups is visible in this view.
80 %define match_lookups zmm11
82 %define zdist_extra zmm13
83 %define zdists_tmp zmm14
;; The registers from here on are filled in the "Prep for main loop" section
;; (broadcasts / table loads).
84 %define znull_dist_syms zmm15
87 %define zdist_mask zmm18
88 %define zshortest_matches zmm19
89 %define zrot_left zmm20
90 %define zdatas_perm zmm21
91 %define zdatas_perm2 zmm22
92 %define zdatas_perm3 zmm23
93 %define zdatas_shuf zmm24
94 %define zhash_prod zmm25
95 %define zhash_mask zmm26
96 %define zincrement zmm27
97 %define zqword_shuf zmm28
99 %define ztwofiftyfour zmm30
;; Windows x64 build: xmm6-xmm15 and rsi, rdi, rbp, r12, r13 (callee-saved in
;; the Microsoft x64 ABI) are stored in a stack frame of stack_size bytes at
;; the same offsets FUNC_RESTORE reloads them from.
102 %ifidn __OUTPUT_FORMAT__, win64
;; 10 XMM slots of 16 bytes + 6 qword slots + 8 = 216 bytes (216 mod 16 == 8).
;; Presumably the trailing 8 re-aligns rsp to 16 bytes given rsp mod 16 == 8 at
;; function entry; the vmovdqa stores below require that alignment.
103 %define stack_size 10*16 + 6 * 8 + 8
;; On win64 func(x) expands to a proc_frame declaration.
104 %define func(x) proc_frame x
;; NOTE(review): the macro header that wraps the prologue lines below is not
;; visible in this view.
107 alloc_stack stack_size
;; Save xmm6-xmm15, one 16-byte slot each. All ten use the aligned form so the
;; saves agree with each other and with the aligned reloads in FUNC_RESTORE.
108 vmovdqa [rsp + 0*16], xmm6
109 vmovdqa [rsp + 1*16], xmm7
110 vmovdqa [rsp + 2*16], xmm8
111 vmovdqa [rsp + 3*16], xmm9
112 vmovdqa [rsp + 4*16], xmm10
113 vmovdqa [rsp + 5*16], xmm11
114 vmovdqa [rsp + 6*16], xmm12
115 vmovdqa [rsp + 7*16], xmm13
116 vmovdqa [rsp + 8*16], xmm14
117 vmovdqa [rsp + 9*16], xmm15
;; Save the general-purpose registers in the qword slots above the XMM area.
118 save_reg rsi, 10*16 + 0*8
119 save_reg rdi, 10*16 + 1*8
120 save_reg rbp, 10*16 + 2*8
121 save_reg r12, 10*16 + 3*8
122 save_reg r13, 10*16 + 4*8
;; FUNC_RESTORE (takes no parameters): reloads xmm6-xmm15 and
;; rsi/rdi/rbp/r12/r13 from the stack offsets the prologue stored them at.
;; NOTE(review): the lines that follow the r13 reload (stack release and the
;; end of the macro) are not visible in this view.
126 %macro FUNC_RESTORE 0
;; XMM registers: aligned 16-byte loads from slots 0..9.
127 vmovdqa xmm6, [rsp + 0*16]
128 vmovdqa xmm7, [rsp + 1*16]
129 vmovdqa xmm8, [rsp + 2*16]
130 vmovdqa xmm9, [rsp + 3*16]
131 vmovdqa xmm10, [rsp + 4*16]
132 vmovdqa xmm11, [rsp + 5*16]
133 vmovdqa xmm12, [rsp + 6*16]
134 vmovdqa xmm13, [rsp + 7*16]
135 vmovdqa xmm14, [rsp + 8*16]
136 vmovdqa xmm15, [rsp + 9*16]
;; General-purpose registers: qword slots located after the 10 XMM slots.
138 mov rsi, [rsp + 10*16 + 0*8]
139 mov rdi, [rsp + 10*16 + 1*8]
140 mov rbp, [rsp + 10*16 + 2*8]
141 mov r12, [rsp + 10*16 + 3*8]
142 mov r13, [rsp + 10*16 + 4*8]
;; Alternate func(x): emits the label x followed by endbranch.
;; NOTE(review): presumably the non-win64 branch; the %else is not visible here.
146 %define func(x) x: endbranch
;; Second definition of FUNC_RESTORE; its body is not visible in this view.
153 %macro FUNC_RESTORE 0
;; gen_icf_map_lh1_06: exported entry point.
;; stream, f_i and the other register aliases used here that are not defined in
;; the visible %define list come from lines outside this view.
167 global gen_icf_map_lh1_06
;; func() expands to proc_frame on win64 or to "label: endbranch" otherwise.
168 func(gen_icf_map_lh1_06)
;; file_start = stream's _next_in field; f_i = 32-bit _total_in field
;; (the dword write zero-extends into the full register).
172 mov file_start, [stream + _next_in]
173 mov f_i %+ d, dword [stream + _total_in]
181 ;; Prep for main loop
;; Broadcast the dword distance mask and hash mask from the stream's internal
;; state into every dword lane.
182 vpbroadcastd zdist_mask, dword [stream + _internal_state_dist_mask]
183 vpbroadcastd zhash_mask, dword [stream + _internal_state_hash_mask]
;; level_buf now holds the pointer read from stream's _level_buf field;
;; hash_table operands are formed relative to it from here on.
185 mov level_buf, [stream + _level_buf]
;; Load the constant tables. vmovdqu64 loads a full 64 bytes; vbroadcasti32x8
;; repeats a 32-byte table; vbroadcasti64x2 / vbroadcasti32x4 repeat a 16-byte
;; table; vpbroadcastd repeats a single dword.
187 vmovdqu64 zdatas_perm, [datas_perm]
188 vbroadcasti32x8 zdatas_shuf, [datas_shuf]
189 vpbroadcastd zhash_prod, [hash_prod]
190 vmovdqu64 zincrement, [increment]
191 vmovdqu64 zqword_shuf, [qword_shuf]
192 vbroadcasti64x2 zdatas_perm2, [datas_perm2]
193 vbroadcasti64x2 zdatas_perm3, [datas_perm3]
194 vpbroadcastd zones, [ones]
195 vbroadcasti32x4 zbswap, [bswap_shuf]
196 vpbroadcastd zthirty, [thirty]
197 vmovdqu64 zrot_left, [drot_left]
198 vpbroadcastd zshortest_matches, [shortest_matches]
199 vpbroadcastd ztwofiftyfour, [twofiftyfour]
200 vpbroadcastd znull_dist_syms, [null_dist_syms]
205 ;; Process first byte
;; Hash the 4 bytes at the current position: two vpmaddwd rounds with
;; zhash_prod, then mask with zhash_mask. The scalar result ends up in hash.
206 vmovd zhashes %+ x, dword [f_i + file_start]
207 vpmaddwd zhashes, zhashes, zhash_prod
208 vpmaddwd zhashes, zhashes, zhash_prod
209 vpandd zhashes, zhashes, zhash_mask
210 vmovd hash %+ d, zhashes %+ x
;; Test the has_hist flag against IGZIP_NO_HIST.
;; NOTE(review): the branch consuming this compare, and the point where tmp is
;; set up as the state pointer, are not visible in this view.
212 cmp byte [tmp + _internal_state_has_hist], IGZIP_NO_HIST
214 ;; No history, the byte is a literal
215 xor prev_len, prev_len
216 xor prev_dist, prev_dist
;; Record that history exists from now on.
217 mov byte [tmp + _internal_state_has_hist], IGZIP_HIST
221 ;; History exists, need to set prev_len and prev_dist accordingly
;; next_in = address of the current input byte.
222 lea next_in, [f_i + file_start]
224 ;; Determine match lookback distance
;; 16-bit arithmetic: low word of tmp = low word of f_i minus the 16-bit hash
;; table entry for this hash. Word-sized writes leave the upper bits of tmp
;; untouched; whatever initialises them is not visible in this view.
226 mov tmp %+ w, f_i %+ w
228 sub tmp %+ w, word [hash_table + HASH_BYTES * hash]
;; Mask the distance with the low dword of zdist_mask.
230 vmovd tmp2 %+ d, zdist_mask %+ x
231 and tmp %+ d, tmp2 %+ d
234 ;; Check first 8 bytes of match
;; XOR 8 bytes at the current position with 8 bytes at next_in + tmp - 1;
;; bytes that are equal become zero in prev_len.
235 mov prev_len, [next_in]
236 xor prev_len, [next_in + tmp - 1]
243 ;; The third register is unused on Haswell and later,
244 ;; This line will not work on previous architectures
;; get_dist_icf_code is a macro defined outside this file view; presumably it
;; writes the distance code for tmp into prev_dist - confirm against its
;; definition.
245 get_dist_icf_code tmp, prev_dist, tmp
;; Count the trailing zero bits of the XOR result (bits that matched, starting
;; from the lowest-addressed byte).
253 tzcnt prev_len, prev_len
;; Compare against MIN_DEF_MATCH; the instruction that uses the flags is not
;; visible in this view.
255 cmp prev_len, MIN_DEF_MATCH
;; Store the low 16 bits of f_i in the hash table slot for this hash.
259 mov word [hash_table + HASH_BYTES * hash], f_i %+ w
;; Load 32 bytes of input at the current position and hash them in parallel:
;; vpermq/vpshufb rearrange the bytes according to zdatas_perm/zdatas_shuf,
;; then two vpmaddwd rounds with zhash_prod and a mask with zhash_mask produce
;; one hash per dword lane.
266 vmovdqu64 datas %+ y, [f_i + file_start]
267 vpermq zhashes, zdatas_perm, datas
268 vpshufb zhashes, zhashes, zdatas_shuf
269 vpmaddwd zhashes, zhashes, zhash_prod
270 vpmaddwd zhashes, zhashes, zhash_prod
271 vpandd zhashes, zhashes, zhash_mask
;; Build the two qword comparison vectors (zlookup, zlookup2) from the same
;; input data using zdatas_perm2/zdatas_perm3 and zqword_shuf.
273 vpermq zlookup, zdatas_perm2, datas
274 vpshufb zlookup, zlookup, zqword_shuf
275 vpermq zlookup2, zdatas_perm3, datas
276 vpshufb zlookup2, zlookup2, zqword_shuf
278 ;;gather/scatter hashes
;; Gather one dword per lane from hash_table at index zhashes * HASH_BYTES.
;; NOTE(review): the instructions that set mask k6 are not visible in this view.
280 vpgatherdd zdists_lookup {k6}, [hash_table + HASH_BYTES * zhashes]
;; zindex = f_i broadcast to every lane, plus the per-lane values of zincrement.
282 vpbroadcastd zindex, f_i %+ d
283 vpaddd zindex, zindex, zincrement
;; Word-granular merge of zindex and zdists_lookup selected by k1 (k1 is set up
;; outside this view).
284 vpblendmw zscatter {k1}, zindex, zdists_lookup
;; Write the merged values back to the same hash table slots.
287 vpscatterdd [hash_table + HASH_BYTES * zhashes] {k6}, zscatter
289 ;; Compute hash for next loop
;; Same hash computation on the 32 bytes located VECT_SIZE bytes further on.
290 vmovdqu64 datas %+ y, [f_i + file_start + VECT_SIZE]
291 vpermq zhashes, zdatas_perm, datas
292 vpshufb zhashes, zhashes, zdatas_shuf
293 vpmaddwd zhashes, zhashes, zhash_prod
294 vpmaddwd zhashes, zhashes, zhash_prod
295 vpandd zhashes, zhashes, zhash_mask
;; Preload the input 2 * VECT_SIZE bytes ahead.
297 vmovdqu64 datas_lookup %+ y, [f_i + file_start + 2 * VECT_SIZE]
;; Pull the end bound back by one vector.
299 sub f_i_end, VECT_SIZE
;; One vector iteration: turns the hash table values gathered previously into
;; distances, compares the input against the data at those distances, builds
;; the packed codes and stores one 64-byte vector to matches_next.
;; NOTE(review): loop labels, branches, the f_i update and the k-mask setup
;; instructions are not visible in this view.
304 lea next_in, [f_i + file_start]
306 ;; Calculate look back dists
;; zdists = zincrement - (((zindex - (zdists_lookup + zones)) & zdist_mask) + zones)
307 vpaddd zdists, zdists_lookup, zones
308 vpsubd zdists, zindex, zdists
309 vpandd zdists, zdists, zdist_mask
310 vpaddd zdists, zdists, zones
311 vpsubd zdists, zincrement, zdists
313 ;;gather/scatter hashes
;; Gather the table entries for the hashes computed during the previous pass.
318 vpgatherdd zdists_lookup {k6}, [hash_table + HASH_BYTES * zhashes]
;; zindex = f_i in every lane plus zincrement.
320 vpbroadcastd zindex, f_i %+ d
321 vpaddd zindex, zindex, zincrement
;; Word-granular merge of zindex and zdists_lookup under k1, then scatter the
;; result back to the same hash slots.
322 vpblendmw zscatter {k1}, zindex, zdists_lookup
324 vpscatterdd [hash_table + HASH_BYTES * zhashes] {k7}, zscatter
326 ;; Compute hash for next loop
327 vpermq zhashes, zdatas_perm, datas_lookup
328 vpshufb zhashes, zhashes, zdatas_shuf
329 vpmaddwd zhashes, zhashes, zhash_prod
330 vpmaddwd zhashes, zhashes, zhash_prod
331 vpandd zhashes, zhashes, zhash_mask
;; Split the 16 dword offsets into two groups of 8 and gather one qword per
;; offset from next_in + offset (dword indices are sign-extended by the gather).
334 vextracti32x8 zdists2 %+ y, zdists, 1
337 vpgatherdq zlens1 {k6}, [next_in + zdists %+ y]
338 vpgatherdq zlens2 {k7}, [next_in + zdists2 %+ y]
340 ;; Calculate dist_icf_code
;; zdists = zincrement - (zdists + zones)
341 vpaddd zdists, zdists, zones
342 vpsubd zdists, zincrement, zdists
;; k5 = lanes where zdists > zones (signed compare).
343 vpcmpgtd k5, zdists, zones
;; zdist_extra = zthirty - lzcnt(zdists) on k5 lanes, 0 elsewhere.
344 vplzcntd zdist_extra, zdists
345 vpsubd zdist_extra {k5}{z}, zthirty, zdist_extra
;; zcode = zdists & ((zones << zdist_extra) - zones) on k5 lanes, 0 elsewhere.
346 vpsllvd zcode, zones, zdist_extra
347 vpsubd zcode, zcode, zones
348 vpandd zcode {k5}{z}, zdists, zcode
;; zdists = (zdists >> zdist_extra) + 2 * zdist_extra
349 vpsrlvd zdists, zdists, zdist_extra
350 vpslld zdist_extra, zdist_extra, 1
351 vpaddd zdists, zdists, zdist_extra
;; Add zcode shifted left by (EXTRA_BITS_OFFSET - DIST_OFFSET).
352 vpslld zcode, zcode, EXTRA_BITS_OFFSET - DIST_OFFSET
353 vpaddd zdists, zdists, zcode
355 ;; Setup zdists for combining with zlens
356 vpslld zdists, zdists, DIST_OFFSET
358 ;; xor current data with lookback dist
;; Equal bytes become zero bytes.
359 vpxorq zlens1, zlens1, zlookup
360 vpxorq zlens2, zlens2, zlookup2
362 ;; Setup registers for next loop
363 vpermq zlookup, zdatas_perm2, datas
364 vpshufb zlookup, zlookup, zqword_shuf
365 vpermq zlookup2, zdatas_perm3, datas
366 vpshufb zlookup2, zlookup2, zqword_shuf
368 ;; Compute match length
;; Reorder the bytes of each qword with zbswap (presumably a per-qword byte
;; reversal, so the lowest-addressed byte becomes most significant), then count
;; leading zero bits per qword.
369 vpshufb zlens1, zlens1, zbswap
370 vpshufb zlens2, zlens2, zbswap
371 vplzcntq zlens1, zlens1
372 vplzcntq zlens2, zlens2
;; Narrow both halves to dwords and join them into one 16-lane vector.
373 vpmovqd zlens1 %+ y, zlens1
374 vpmovqd zlens2 %+ y, zlens2
375 vinserti32x8 zlens1, zlens2 %+ y, 1
;; Bits -> whole bytes (divide by 8).
376 vpsrld zlens1, zlens1, 3
378 ;; Preload for next loops
379 vmovdqu64 datas, datas_lookup
380 vmovdqu64 datas_lookup %+ y, [f_i + file_start + 2 * VECT_SIZE]
382 ;; Zero out matches which should not be taken
;; Permute lengths and codes across lanes using the indices in zrot_left.
384 vpermd zlens2, zrot_left, zlens1
385 vpermd zdists, zrot_left, zdists
;; Exchange through lane 0: the scalar prev_len is merged into zlens2 under k3
;; while the old lane 0 of zlens2 becomes the new prev_len.
;; NOTE(review): the instruction that sets k3 for these merges is not visible.
387 vmovd zdists_tmp %+ x, prev_len %+ d
388 vmovd prev_len %+ d, zlens2 %+ x
389 vmovdqu32 zlens2 {k3}, zdists_tmp
;; Same exchange for prev_dist and zdists.
391 vmovd zdists_tmp %+ x, prev_dist %+ d
392 vmovd prev_dist %+ d, zdists %+ x
393 vmovdqu32 zdists {k3}, zdists_tmp
;; k3 = lanes where zlens2 > zshortest_matches; k4 = lanes where zlens1 > zlens2.
;; NOTE(review): any mask-combining instructions between these compares and the
;; masked move below are not visible in this view.
395 vpcmpgtd k3, zlens2, zshortest_matches
396 vpcmpgtd k4, zlens1, zlens2
;; zlens1 = zlens2 on k4 lanes, 0 on all other lanes (zeroing masking).
401 vmovdqu32 zlens1 {k4}{z}, zlens2
403 ;; Update zdists to match zlens1
404 vpaddd zdists, zdists, zlens1
405 vpaddd zdists, zdists, ztwofiftyfour
;; On k3 lanes: replace the code with the zero-extended input byte taken from
;; the 16 bytes at f_i + file_start - VECT_SIZE - 1, then add znull_dist_syms.
406 vpmovzxbd zdists {k3}, [f_i + file_start - VECT_SIZE - 1]
407 vpaddd zdists {k3}, zdists, znull_dist_syms
;; Store 64 bytes of output and advance the output pointer.
410 vmovdqu64 [matches_next], zdists
411 add matches_next, ICF_CODE_BYTES * VECT_SIZE
;; Final vector pass: same distance / length / code computation as the main
;; iteration, but without gathering or hashing for a following pass. It also
;; rewrites one hash table entry (see "Restore last update hash value").
;; NOTE(review): labels, branches and the k-mask setup instructions around this
;; section are not visible in this view.
417 lea next_in, [f_i + file_start]
419 ;; Calculate look back dists
;; zdists = zincrement - (((zindex - (zdists_lookup + zones)) & zdist_mask) + zones)
420 vpaddd zdists, zdists_lookup, zones
421 vpsubd zdists, zindex, zdists
422 vpandd zdists, zdists, zdist_mask
423 vpaddd zdists, zdists, zones
424 vpsubd zdists, zincrement, zdists
;; Gather one qword per lane from next_in + offset, 8 lanes per gather.
427 vextracti32x8 zdists2 %+ y, zdists, 1
430 vpgatherdq zlens1 {k6}, [next_in + zdists %+ y]
431 vpgatherdq zlens2 {k7}, [next_in + zdists2 %+ y]
433 ;; Restore last update hash value
;; tmp = top dword lane (lane 15) of zdists, plus f_i.
434 vextracti32x4 zdists2 %+ x, zdists, 3
435 vpextrd tmp %+ d, zdists2 %+ x, 3
436 add tmp %+ d, f_i %+ d
;; Hash the 4 bytes at f_i + file_start + VECT_SIZE - 1 (scalar, xmm-width ops).
438 vmovd zhashes %+ x, dword [f_i + file_start + VECT_SIZE - 1]
439 vpmaddwd zhashes %+ x, zhashes %+ x, zhash_prod %+ x
440 vpmaddwd zhashes %+ x, zhashes %+ x, zhash_prod %+ x
441 vpandd zhashes %+ x, zhashes %+ x, zhash_mask %+ x
442 vmovd hash %+ d, zhashes %+ x
;; Store the low 16 bits of tmp in that hash's table slot.
444 mov word [hash_table + HASH_BYTES * hash], tmp %+ w
446 ;; Calculate dist_icf_code
;; zdists = zincrement - (zdists + zones)
447 vpaddd zdists, zdists, zones
448 vpsubd zdists, zincrement, zdists
;; k5 = lanes where zdists > zones (signed compare).
449 vpcmpgtd k5, zdists, zones
;; zdist_extra = zthirty - lzcnt(zdists) on k5 lanes, 0 elsewhere.
450 vplzcntd zdist_extra, zdists
451 vpsubd zdist_extra {k5}{z}, zthirty, zdist_extra
;; zcode = zdists & ((zones << zdist_extra) - zones) on k5 lanes, 0 elsewhere.
452 vpsllvd zcode, zones, zdist_extra
453 vpsubd zcode, zcode, zones
454 vpandd zcode {k5}{z}, zdists, zcode
;; zdists = (zdists >> zdist_extra) + 2 * zdist_extra
455 vpsrlvd zdists, zdists, zdist_extra
456 vpslld zdist_extra, zdist_extra, 1
457 vpaddd zdists, zdists, zdist_extra
;; Add zcode shifted left by (EXTRA_BITS_OFFSET - DIST_OFFSET).
458 vpslld zcode, zcode, EXTRA_BITS_OFFSET - DIST_OFFSET
459 vpaddd zdists, zdists, zcode
461 ;; Setup zdists for combining with zlens
462 vpslld zdists, zdists, DIST_OFFSET
464 ;; xor current data with lookback dist
;; Equal bytes become zero bytes.
465 vpxorq zlens1, zlens1, zlookup
466 vpxorq zlens2, zlens2, zlookup2
468 ;; Compute match length
;; Reorder bytes within each qword via zbswap, count leading zero bits per
;; qword, narrow to dwords, join the two halves, then convert bits to bytes.
469 vpshufb zlens1, zlens1, zbswap
470 vpshufb zlens2, zlens2, zbswap
471 vplzcntq zlens1, zlens1
472 vplzcntq zlens2, zlens2
473 vpmovqd zlens1 %+ y, zlens1
474 vpmovqd zlens2 %+ y, zlens2
475 vinserti32x8 zlens1, zlens2 %+ y, 1
476 vpsrld zlens1, zlens1, 3
478 ;; Zero out matches which should not be taken
;; Permute lengths and codes across lanes using the indices in zrot_left.
480 vpermd zlens2, zrot_left, zlens1
481 vpermd zdists, zrot_left, zdists
;; Exchange through lane 0: scalar prev_len is merged into zlens2 under k3 and
;; the old lane 0 of zlens2 becomes prev_len.
;; NOTE(review): the instruction that sets k3 for these merges is not visible.
483 vmovd zdists_tmp %+ x, prev_len %+ d
484 vmovd prev_len %+ d, zlens2 %+ x
485 vmovdqu32 zlens2 {k3}, zdists_tmp
;; Same exchange for prev_dist and zdists.
487 vmovd zdists_tmp %+ x, prev_dist %+ d
488 vmovd prev_dist %+ d, zdists %+ x
489 vmovdqu32 zdists {k3}, zdists_tmp
;; k3 = lanes where zlens2 > zshortest_matches; k4 = lanes where zlens1 > zlens2.
;; NOTE(review): any mask-combining instructions between these compares and the
;; masked move below are not visible in this view.
491 vpcmpgtd k3, zlens2, zshortest_matches
492 vpcmpgtd k4, zlens1, zlens2
;; zlens1 = zlens2 on k4 lanes, 0 on all other lanes (zeroing masking).
497 vmovdqu32 zlens1 {k4}{z}, zlens2
499 ;; Update zdists to match zlens1
500 vpaddd zdists, zdists, zlens1
501 vpaddd zdists, zdists, ztwofiftyfour
;; On k3 lanes: replace the code with the zero-extended input byte taken from
;; the 16 bytes at f_i + file_start - 1, then add znull_dist_syms.
502 vpmovzxbd zdists {k3}, [f_i + file_start - 1]
503 vpaddd zdists {k3}, zdists, znull_dist_syms
;; Store the last 64 bytes of output.
506 vmovdqu64 [matches_next], zdists
;; Constant tables read by the "Prep for main loop" loads.
;; NOTE(review): the label, section and alignment lines are not visible in this
;; view, so the table names given below are inferred from size and shape only -
;; confirm against the full file.

;; 8 qword indices (64 bytes); presumably datas_perm.
524 dq 0x0, 0x1, 0x0, 0x1, 0x1, 0x2, 0x1, 0x2
;; 16 dword indices 0xf, 0x0 .. 0xe: used as a permute index, lane i selects
;; lane i-1 and lane 0 selects lane 15. Presumably drot_left.
526 dd 0xf, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6
527 dd 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe
;; Byte-shuffle rows of 8 consecutive indices, each row starting one byte later
;; than the previous one. Presumably qword_shuf.
;; NOTE(review): nine rows (72 bytes) are visible here; a 64-byte load starting
;; at the first row would cover only the first eight.
529 db 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
530 db 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8
531 db 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9
532 db 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa
533 db 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb
534 db 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc
535 db 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd
536 db 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe
537 db 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
;; Byte-shuffle rows of 4 consecutive indices, each row starting one byte later
;; (8 rows, 32 bytes). Presumably datas_shuf.
539 db 0x0, 0x1, 0x2, 0x3
540 db 0x1, 0x2, 0x3, 0x4
541 db 0x2, 0x3, 0x4, 0x5
542 db 0x3, 0x4, 0x5, 0x6
543 db 0x4, 0x5, 0x6, 0x7
544 db 0x5, 0x6, 0x7, 0x8
545 db 0x6, 0x7, 0x8, 0x9
546 db 0x7, 0x8, 0x9, 0xa
;; Dwords 0 .. 15, one per lane. Presumably increment.
548 dd 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
549 dd 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
;; Byte indices in reverse order within each 8-byte group (16 bytes).
;; Presumably bswap_shuf.
557 db 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
558 db 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
;; 64-bit pattern with every odd-numbered bit set; presumably a k-mask constant.
561 dq 0xaaaaaaaaaaaaaaaa