1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2018 Intel Corporation All rights reserved.
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
30 %include "reg_sizes.asm"
31 %include "lz0a_const.asm"
32 %include "data_struct2.asm"
33 %include "igzip_compare_types.asm"
; Build guard: everything below requires an assembler that knows AVX-512.
36 %ifdef HAVE_AS_KNOWS_AVX512
; ABI-dependent argument mapping (only the win64 test line is visible in
; this excerpt; the actual argN register assignments fall outside it).
37 %ifidn __OUTPUT_FORMAT__, win64
; Named aliases for arguments and scratch GP registers.
54 %define end_processed arg2          ; count of processed bytes; converted to a pointer at entry
56 %define match_lookup arg4           ; base of the ICF match-code lookup table
58 %define match_offset r10            ; lane index of first "long" match found (from tzcnt)
60 %define end_processed_orig r12      ; saved copy of the end_processed pointer
; zmm register roles for the vectorized match-extension loop.
64 %define zmatch_lookup zmm0          ; look-ahead vector of ICF match codes
65 %define zmatch_lookup2 zmm1         ; vector of ICF match codes currently being processed
67 %define zdist_codes zmm3            ; distance-code field extracted from match codes
68 %define zdist_extras zmm4           ; extra-bits field extracted from match codes
74 %define zlookup2 zmm10              ; shuffled input bytes for the upper comparison half
; Broadcast constants, loaded once before the main loop; names reflect
; the memory operands they are loaded from (e.g. ztwofiftysix = 256).
78 %define zvect_size zmm16
79 %define zmax_len zmm17
80 %define ztwofiftyfour zmm18
81 %define ztwofiftysix zmm19
82 %define ztwosixtytwo zmm20
83 %define znlen_mask zmm21
85 %define zqword_shuf zmm23           ; sliding 8-byte-window byte shuffle
86 %define zdatas_perm3 zmm24
87 %define zdatas_perm2 zmm25
88 %define zincrement zmm26            ; lane indices 0..15
89 %define zdists_mask zmm27
90 %define zdists_start zmm28          ; base distance per distance code
91 %define zlong_lens2 zmm29           ; second (higher) "long match" length threshold
92 %define zlong_lens zmm30            ; first "long match" length threshold
93 %define zlens_mask zmm31            ; mask isolating the length field of a match code
; Win64 prologue/epilogue: spill the callee-saved registers this routine
; actually touches (xmm6-xmm13 and rsi, rdi, r12, r13 — required by the
; Microsoft x64 ABI). The %macro FUNC_SAVE header, %endmacro and %endif
; delimiters fall outside this excerpt.
95 %ifidn __OUTPUT_FORMAT__, win64
96 %define stack_size 8*16 + 4 * 8 + 8 ; 8 xmm slots + 4 GP slots + pad to keep rsp 16-aligned
97 %define func(x) proc_frame x        ; proc_frame emits Win64 unwind (SEH) info
99 alloc_stack stack_size
; Save xmm6-xmm13 (volatile on SysV, callee-saved on Win64).
100 vmovdqa [rsp + 0*16], xmm6
101 vmovdqa [rsp + 1*16], xmm7
102 vmovdqa [rsp + 2*16], xmm8
103 vmovdqa [rsp + 3*16], xmm9
104 vmovdqa [rsp + 4*16], xmm10
105 vmovdqa [rsp + 5*16], xmm11
106 vmovdqa [rsp + 6*16], xmm12
107 vmovdqa [rsp + 7*16], xmm13
; save_reg also records the slot in the unwind info.
108 save_reg rsi, 8*16 + 0*8
109 save_reg rdi, 8*16 + 1*8
110 save_reg r12, 8*16 + 2*8
111 save_reg r13, 8*16 + 3*8
; Mirror-image restore of everything FUNC_SAVE spilled.
115 %macro FUNC_RESTORE 0
116 vmovdqa xmm6, [rsp + 0*16]
117 vmovdqa xmm7, [rsp + 1*16]
118 vmovdqa xmm8, [rsp + 2*16]
119 vmovdqa xmm9, [rsp + 3*16]
120 vmovdqa xmm10, [rsp + 4*16]
121 vmovdqa xmm11, [rsp + 5*16]
122 vmovdqa xmm12, [rsp + 6*16]
123 vmovdqa xmm13, [rsp + 7*16]
125 mov rsi, [rsp + 8*16 + 0*8]
126 mov rdi, [rsp + 8*16 + 1*8]
127 mov r12, [rsp + 8*16 + 2*8]
128 mov r13, [rsp + 8*16 + 3*8]
; ELF64 branch: plain label plus endbranch (CET/IBT landing pad); its
; FUNC_RESTORE is a no-op whose body lies outside this excerpt.
132 %define func(x) x: endbranch
138 %macro FUNC_RESTORE 0
;-----------------------------------------------------------------------
; set_long_icf_fg_06 — AVX-512 pass over the ICF match-lookup table.
; NOTE(review): this excerpt is incomplete (conditional branches after
; the cmp instructions, loop-exit labels and the epilogue are missing),
; so the descriptions below cover only the visible instructions.
; Appears to take: next_in (input buffer cursor), a processed-byte count
; (end_processed/arg2), an input size (arg3) and the match_lookup table
; (arg4) — confirm against the C caller.
; For each vector of 16 ICF codes it re-measures matches whose length
; field exceeds a threshold and rewrites the code with the longer length.
;-----------------------------------------------------------------------
149 global set_long_icf_fg_06
150 func(set_long_icf_fg_06)
154 lea end_in, [next_in + arg3]                 ; end_in = next_in + input size
155 add end_processed, next_in                   ; turn processed count into an absolute pointer
156 mov end_processed_orig, end_processed        ; keep the exact end for the tail fix-up below
157 lea tmp1, [end_processed + LA_STATELESS]     ; look-ahead limit used by compare_z — presumably
160 sub end_processed, 15                        ; back off so 16-lane vector reads stay in bounds
; Hoist all loop-invariant constants into zmm registers.
161 vpbroadcastd zlong_lens, [long_len]
162 vpbroadcastd zlong_lens2, [long_len2]
163 vpbroadcastd zlens_mask, [len_mask]
164 vmovdqu16 zdists_start, [dist_start]
165 vpbroadcastd zdists_mask, [dists_mask]
166 vmovdqu32 zincrement, [increment]
167 vbroadcasti64x2 zdatas_perm2, [datas_perm2]
168 vbroadcasti64x2 zdatas_perm3, [datas_perm3]
169 vmovdqu64 zqword_shuf, [qword_shuf]
170 vbroadcasti64x2 zbswap, [bswap_shuf]
171 vpbroadcastd znlen_mask, [nlen_mask]
172 vpbroadcastd zvect_size, [vect_size]
173 vpbroadcastd zmax_len, [max_len]
174 vpbroadcastd ztwofiftyfour, [twofiftyfour]
175 vpbroadcastd ztwofiftysix, [twofiftysix]
176 vpbroadcastd ztwosixtytwo, [twosixtytwo]
177 vmovdqu32 zmatch_lookup, [match_lookup]      ; prime the software pipeline with the first 16 codes
179 .fill_loop: ; Tahiti is a magical place
; Software pipelining: process the previously loaded vector while
; prefetching the next one.
180 vmovdqu32 zmatch_lookup2, zmatch_lookup
181 vmovdqu32 zmatch_lookup, [match_lookup + ICF_CODE_BYTES * VECT_SIZE]
183 cmp next_in, end_processed                   ; loop bound check (branch not in this excerpt)
; k3 = lanes whose length field already exceeds long_len.
187 vpandd zlens, zmatch_lookup2, zlens_mask
188 vpcmpgtd k3, zlens, zlong_lens
190 ;; Speculatively increment
191 add next_in, VECT_SIZE
192 add match_lookup, ICF_CODE_BYTES * VECT_SIZE
; Decode each lane's distance: look up the base distance for the
; distance code, then add the extra bits.
197 vpsrld zdist_codes, zmatch_lookup2, DIST_OFFSET
198 vpmovdw zdists %+ y, zdist_codes ; Relies on perm working mod 32
199 vpermw zdists, zdists, zdists_start
200 vpmovzxwd zdists, zdists %+ y
202 vpsrld zdist_extras, zmatch_lookup2, EXTRA_BITS_OFFSET
203 vpsubd zdist_extras, zincrement, zdist_extras
205 vpsubd zdists, zdist_extras, zdists          ; per-lane back-reference offset (relative to lane index)
206 vextracti32x8 zdists2 %+ y, zdists, 1
; Gather 8 bytes from each match source position (k6/k7 are set up
; outside this excerpt; gathers clear their mask registers).
209 vpgatherdq zlens1 {k6}, [next_in + zdists %+ y - 8]
210 vpgatherdq zlens2 {k7}, [next_in + zdists2 %+ y - 8]
; Build the corresponding 8-byte windows of current input for each lane.
212 vmovdqu8 datas %+ y, [next_in - 8]
213 vpermq zlookup, zdatas_perm2, datas
214 vpshufb zlookup, zlookup, zqword_shuf
215 vpermq zlookup2, zdatas_perm3, datas
216 vpshufb zlookup2, zlookup2, zqword_shuf
; XOR source vs match: equal prefixes become leading zero bytes after
; the per-qword byte swap, so lzcnt/8 = matching byte count.
218 vpxorq zlens1, zlens1, zlookup
219 vpxorq zlens2, zlens2, zlookup2
221 vpshufb zlens1, zlens1, zbswap
222 vpshufb zlens2, zlens2, zbswap
223 vplzcntq zlens1, zlens1
224 vplzcntq zlens2, zlens2
225 vpmovqd zlens1 %+ y, zlens1
226 vpmovqd zlens2 %+ y, zlens2
227 vinserti32x8 zlens1, zlens2 %+ y, 1
228 vpsrld zlens1 {k3}{z}, zlens1, 3             ; bits -> bytes, only for the "long" lanes in k3
; Rewrite the long lanes' codes: clear the old length field, then add
; the new length (262 appears to be a length-code bias — confirm
; against the ICF code layout in igzip headers).
230 vpandd zmatch_lookup2 {k3}{z}, zmatch_lookup2, znlen_mask
231 vpaddd zmatch_lookup2 {k3}{z}, zmatch_lookup2, ztwosixtytwo
232 vpaddd zmatch_lookup2 {k3}{z}, zmatch_lookup2, zlens1
234 vmovdqu32 [match_lookup - ICF_CODE_BYTES * VECT_SIZE] {k3}, zmatch_lookup2
; Any lane still longer than long_len2 needs the scalar extension path.
236 vpcmpgtd k3, zlens1, zlong_lens2
240 vpsubd zdists, zincrement, zdists
; Compress the qualifying lanes down so lane 0 holds the first one.
242 vpcompressd zdists2 {k3}, zdists
243 vpcompressd zmatch_lookup2 {k3}, zmatch_lookup2
244 kmovq match_offset, k3
245 tzcnt match_offset, match_offset             ; index of first long-match lane
247 vmovd dist %+ d, zdists2 %+ x
; Rewind the speculative increment, then step to the long-match lane.
248 lea next_in, [next_in + match_offset - VECT_SIZE]
249 lea match_lookup, [match_lookup + ICF_CODE_BYTES * (match_offset - VECT_SIZE)]
250 mov match_in, next_in
; compare_z (from igzip_compare_types.asm) measures the full match
; length between next_in and match_in into `len`.
257 compare_z next_in, match_in, len, tmp2, tmp1, k3, ztmp1, ztmp2
; zlens1 lane i = len + 254 - i: candidate length codes for the
; positions covered by this long match.
259 vpbroadcastd zlens1, len %+ d
260 vpsubd zlens1, zlens1, zincrement
261 vpaddd zlens1, zlens1, ztwofiftyfour
263 mov tmp2, end_processed
269 lea match_lookup, [match_lookup + ICF_CODE_BYTES * len]
270 vmovdqu32 zmatch_lookup, [match_lookup]      ; re-prime the pipeline after the pointer jump
272 vpbroadcastd zmatch_lookup2, zmatch_lookup2 %+ x
273 vpandd zmatch_lookup2, zmatch_lookup2, znlen_mask ; keep only dist/extra fields of the long match
; Walk backwards through the codes covered by the long match, replacing
; any stored code whose length is shorter than what this match gives.
277 .update_match_lookup:
278 vpandd zlens2, zlens_mask, [match_lookup + ICF_CODE_BYTES * len]
279 vpcmpgtd k3, zlens1, zlens2                  ; k3 = lanes improved by the long match
280 vpcmpgtd k4, zlens1, ztwofiftysix
; Clamp candidate lengths at max_len before writing.
283 vpcmpgtd k4, zlens1, zmax_len
284 vmovdqu32 zlens, zlens1
285 vmovdqu32 zlens {k4}, zmax_len
287 vpaddd zlens2 {k3}{z}, zlens, zmatch_lookup2 ; merge new length with dist/extra fields
289 vmovdqu32 [match_lookup + ICF_CODE_BYTES * len] {k3}, zlens2
296 vpsubd zlens1, zlens1, zvect_size            ; next 16 positions get lengths smaller by VECT_SIZE
298 jmp .update_match_lookup                     ; loop exit test lies outside this excerpt
; Tail: restore the true end pointer and mask off lanes past it.
301 mov end_processed, end_processed_orig
302 cmp next_in, end_processed                   ; branch not in this excerpt
305 mov tmp1, end_processed
307 vpbroadcastd ztmp1, tmp1 %+ d
308 vpcmpd k3, ztmp1, zincrement, 6              ; imm 6 = "greater-than" compare predicate
309 vmovdqu32 zmatch_lookup2 {k3}{z}, zmatch_lookup2 ; zero lanes beyond end_processed
; Constant tables for the loads above. NOTE(review): the labels sit on
; lines outside this excerpt; the identifications below are inferred
; from the values and from which load each layout fits — confirm.
; dist_start: base back-reference distance for each DEFLATE distance
; code (1,2,3,4,5,7,9,13,...,24577), padded with two zero entries.
323 dw 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0007, 0x0009, 0x000d
324 dw 0x0011, 0x0019, 0x0021, 0x0031, 0x0041, 0x0061, 0x0081, 0x00c1
325 dw 0x0101, 0x0181, 0x0201, 0x0301, 0x0401, 0x0601, 0x0801, 0x0c01
326 dw 0x1001, 0x1801, 0x2001, 0x3001, 0x4001, 0x6001, 0x0000, 0x0000
; qword_shuf: row i selects bytes i..i+7, i.e. vpshufb with this table
; produces overlapping 8-byte windows, one per lane.
328 db 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
329 db 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8
330 db 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9
331 db 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa
332 db 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb
333 db 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc
334 db 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd
335 db 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe
336 db 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
; increment: per-lane indices 0..15 (dword lanes of a zmm register).
340 dd 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
341 dd 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
; bswap_shuf: byte-reverse each qword, so the lowest-address differing
; byte becomes the most significant — making vplzcntq count the length
; of the matching prefix in bits.
348 db 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
349 db 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08