1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2018 Intel Corporation All rights reserved.
4 ; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
;       the documentation and/or other materials provided with the distribution.
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
30 %include "reg_sizes.asm"
31 %include "lz0a_const.asm"
32 %include "data_struct2.asm"
33 %include "igzip_compare_types.asm"
38 %ifidn __OUTPUT_FORMAT__, win64
57 %define end_processed arg2
59 %define match_lookup arg4
61 %define match_offset r10
63 %define end_processed_orig r12
67 %define ymatch_lookup ymm0
68 %define ymatch_lookup2 ymm1
76 %define yvect_size ymm9
77 %define ymax_len ymm10
78 %define ytwofiftysix ymm11
79 %define ynlen_mask ymm12
80 %define ydists_mask ymm13
81 %define ylong_lens ymm14
82 %define ylens_mask ymm15
;; ----------------------------------------------------------------------
;; win64 prologue: spill the Microsoft x64 callee-saved state this routine
;; touches.  On win64, xmm6-xmm15 and rsi, rdi, r12, r13 are callee-saved
;; and must be preserved across the call.
;; NOTE(review): the "%macro FUNC_SAVE 0" header, the end_prolog and the
;; closing %endmacro -- as well as the non-win64 branch of this %ifidn --
;; are not visible in this chunk.  Confirm against the full file.
;; ----------------------------------------------------------------------
%ifidn __OUTPUT_FORMAT__, win64
%define stack_size 10*16 + 4 * 8 + 8    ; 10 xmm spills + 4 GPR spills + alignment pad
%define func(x) proc_frame x            ; emit SEH unwind info on win64
	alloc_stack	stack_size
	vmovdqa	[rsp + 0*16], xmm6      ; spill callee-saved vector regs
	vmovdqa	[rsp + 1*16], xmm7
	vmovdqa	[rsp + 2*16], xmm8
	vmovdqa	[rsp + 3*16], xmm9
	vmovdqa	[rsp + 4*16], xmm10
	vmovdqa	[rsp + 5*16], xmm11
	vmovdqa	[rsp + 6*16], xmm12
	vmovdqa	[rsp + 7*16], xmm13
	vmovdqa	[rsp + 8*16], xmm14
	vmovdqa	[rsp + 9*16], xmm15
	save_reg	rsi, 10*16 + 0*8        ; spill callee-saved GPRs (with unwind info)
	save_reg	rdi, 10*16 + 1*8
	save_reg	r12, 10*16 + 2*8
	save_reg	r13, 10*16 + 3*8
;; ----------------------------------------------------------------------
;; FUNC_RESTORE (win64 variant): reload the callee-saved xmm and GPR
;; state spilled by the prologue above, in the same stack layout.
;; NOTE(review): the trailing stack release ("add rsp, stack_size") and
;; %endmacro of this macro, and the entire body of the second (non-win64)
;; FUNC_RESTORE whose header ends this block, are not visible in this
;; chunk.  Confirm against the full file.
;; ----------------------------------------------------------------------
%macro FUNC_RESTORE 0
	vmovdqa	xmm6, [rsp + 0*16]      ; restore callee-saved vector regs
	vmovdqa	xmm7, [rsp + 1*16]
	vmovdqa	xmm8, [rsp + 2*16]
	vmovdqa	xmm9, [rsp + 3*16]
	vmovdqa	xmm10, [rsp + 4*16]
	vmovdqa	xmm11, [rsp + 5*16]
	vmovdqa	xmm12, [rsp + 6*16]
	vmovdqa	xmm13, [rsp + 7*16]
	vmovdqa	xmm14, [rsp + 8*16]
	vmovdqa	xmm15, [rsp + 9*16]
	mov	rsi, [rsp + 10*16 + 0*8]    ; restore callee-saved GPRs
	mov	rdi, [rsp + 10*16 + 1*8]
	mov	r12, [rsp + 10*16 + 2*8]
	mov	r13, [rsp + 10*16 + 3*8]
;; non-win64 (elf64) variant -- body elided in this view
%macro FUNC_RESTORE 0
;; ----------------------------------------------------------------------
;; set_long_icf_fg_04 -- AVX2 pass over the icf (intermediate compressed
;; format) match-lookup table that extends matches whose length exceeds
;; the "long length" threshold, rewriting downstream lookup entries with
;; longer lengths where profitable.
;; NOTE(review): many original lines are not visible in this chunk --
;; labels, the conditional branches paired with the cmp/test instructions
;; below, several tmp-register computations, and the final ret /
;; endproc_frame.  The flow comments here describe only the visible code;
;; confirm against the full source.
;; ----------------------------------------------------------------------
global set_long_icf_fg_04
func(set_long_icf_fg_04)
	lea	end_in, [next_in + arg3]                ; end_in = next_in + input size
	add	end_processed, next_in                  ; processed count -> end pointer
	mov	end_processed_orig, end_processed       ; keep true end for the tail pass
	lea	tmp1, [end_processed + LA_STATELESS]    ; lookahead-limited bound (use elided)
	sub	end_processed, VECT_SIZE - 1            ; last pos where a full vector load fits
	vmovdqu	ylong_lens, [long_len]              ; preload broadcast constants / masks
	vmovdqu	ylens_mask, [len_mask]
	vmovdqu	ydists_mask, [dists_mask]
	vmovdqu	ynlen_mask, [nlen_mask]
	vmovdqu	yvect_size, [vect_size]
	vmovdqu	ymax_len, [max_len]
	vmovdqu	ytwofiftysix, [twofiftysix]
	vmovdqu	ymatch_lookup, [match_lookup]       ; prime first vector of icf codes
.fill_loop:	; Tahiti is a magical place
	vmovdqu	ymatch_lookup2, ymatch_lookup       ; vector under test this iteration
	vmovdqu	ymatch_lookup, [match_lookup + ICF_CODE_BYTES * VECT_SIZE] ; prefetch next
	cmp	next_in, end_processed                  ; stop past processed region (branch elided)
	vpand	ylens, ymatch_lookup2, ylens_mask   ; isolate length field per lane
	vpcmpgtd	ycmp, ylens, ylong_lens         ; lanes with length above the threshold
	;; Speculatively increment
	add	next_in, VECT_SIZE
	add	match_lookup, ICF_CODE_BYTES * VECT_SIZE
	tzcnt	match_offset, tmp1                  ; first long-match lane (tmp1 movemask set on elided line)
	lea	next_in, [next_in + match_offset - VECT_SIZE]   ; rewind pointers to that match
	lea	match_lookup, [match_lookup + ICF_CODE_BYTES * (match_offset - VECT_SIZE)]
	mov	dist %+ d, [match_lookup]               ; load the match's icf code
	vmovd	ymatch_lookup2 %+ x, dist %+ d      ; keep a copy of the code in a vector reg
	shr	dist, DIST_OFFSET                       ; extract the distance-code field
	and	dist, LIT_DIST_MASK
	shr	tmp1, EXTRA_BITS_OFFSET                 ; extra distance bits (use elided)
	lea	tmp2, [dist_start]                      ; base of dist-code -> base-distance table
	mov	dist %+ w, [tmp2 + 2 * dist]            ; translate code to base distance
	mov	match_in, next_in                       ; match source ptr (distance subtract elided)
	compare_y	next_in, match_in, len, tmp3, tmp1, ytmp1, ytmp2 ; recompute full match length
	vmovd	ylens1 %+ x, len %+ d
	vpbroadcastd	ylens1, ylens1 %+ x         ; broadcast the new length
	vpsubd	ylens1, ylens1, [increment]         ; per-lane length = len - lane index
	vpaddd	ylens1, ylens1, [twofiftyfour]      ; add 0xfe bias used by icf length encoding
	mov	tmp3, end_processed                     ; (subsequent use elided)
	lea	match_lookup, [match_lookup + ICF_CODE_BYTES * len] ; skip table past the match
	vmovdqu	ymatch_lookup, [match_lookup]       ; reload codes at the new position
	vpbroadcastd	ymatch_lookup2, ymatch_lookup2 %+ x
	vpand	ymatch_lookup2, ymatch_lookup2, ynlen_mask ; keep only the non-length bits of the code
.update_match_lookup:
	vpand	ylens2, ylens_mask, [match_lookup + ICF_CODE_BYTES * len] ; currently stored lengths
	vpcmpgtd	ycmp, ylens1, ylens2            ; lanes where the new length is larger...
	vpcmpgtd	ytmp1, ylens1, ytwofiftysix     ; ...and the new length exceeds 256
	vpand	ycmp, ycmp, ytmp1                   ; combined update mask
	vpcmpgtd	ycmp2, ylens1, ymax_len         ; lanes above the encodable maximum
	vpandn	ylens, ycmp2, ylens1                ; clamp: keep len where <= max...
	vpand	ycmp2, ymax_len, ycmp2              ; ...max_len elsewhere (merge op elided)
	vpaddd	ylens2, ylens, ymatch_lookup2       ; new icf code = clamped length + kept bits
	vpand	ylens2, ylens2, ycmp
	vpmaskmovd	[match_lookup + ICF_CODE_BYTES * len], ycmp, ylens2 ; masked store of updates
	test	tmp1 %+ d, tmp1 %+ d                ; loop-exit test (branch elided)
	vpsubd	ylens1, ylens1, yvect_size          ; shrink per-lane lengths for the next vector
	jmp	.update_match_lookup
	mov	end_processed, end_processed_orig       ; tail: restore the true end pointer
	cmp	next_in, end_processed                  ; (branch elided)
	mov	tmp1, end_processed                     ; remaining lane count (adjustment elided)
	vmovd	ytmp1 %+ x, tmp1 %+ d
	vpbroadcastd	ytmp1, ytmp1 %+ x
	vpcmpgtd	ytmp1, ytmp1, [increment]       ; lane mask for the partial final vector
	vpand	ymatch_lookup2, ymatch_lookup2, ytmp1
;; ----------------------------------------------------------------------
;; Constant tables loaded by set_long_icf_fg_04 above.
;; NOTE(review): the labels for these tables (and any section/align
;; directives) are not visible in this chunk.  Matching them to the
;; loads in the code, they appear to be, in order: dist_start, len_mask,
;; dists_mask, long_len, increment, vect_size, twofiftyfour,
;; twofiftysix, nlen_mask, max_len -- confirm against the full file.
;; ----------------------------------------------------------------------
;; dist_start: base distance for each DEFLATE distance code (30 used, 2 pad)
	dw 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0007, 0x0009, 0x000d
	dw 0x0011, 0x0019, 0x0021, 0x0031, 0x0041, 0x0061, 0x0081, 0x00c1
	dw 0x0101, 0x0181, 0x0201, 0x0301, 0x0401, 0x0601, 0x0801, 0x0c01
	dw 0x1001, 0x1801, 0x2001, 0x3001, 0x4001, 0x6001, 0x0000, 0x0000
;; len_mask: isolate the length bits of an icf code, per dword lane
	dd LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK
	dd LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK
;; dists_mask: isolate the distance bits of an icf code
	dd LIT_DIST_MASK, LIT_DIST_MASK, LIT_DIST_MASK, LIT_DIST_MASK
	dd LIT_DIST_MASK, LIT_DIST_MASK, LIT_DIST_MASK, LIT_DIST_MASK
;; long_len: threshold (0x105 = 261) above which a match counts as "long"
	dd 0x105, 0x105, 0x105, 0x105, 0x105, 0x105, 0x105, 0x105
;; increment: per-lane indices 0..7 (one dword lane per position)
	dd 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
;; vect_size: VECT_SIZE broadcast to all 8 lanes
	dd VECT_SIZE, VECT_SIZE, VECT_SIZE, VECT_SIZE
	dd VECT_SIZE, VECT_SIZE, VECT_SIZE, VECT_SIZE
;; twofiftyfour: 0xfe = 254, bias added to lengths before encoding
	dd 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe
;; twofiftysix: 0x100 = 256, minimum biased length worth rewriting
	dd 0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100
;; nlen_mask: clears the low 10 (length) bits of an icf code
	dd 0xfffffc00, 0xfffffc00, 0xfffffc00, 0xfffffc00
	dd 0xfffffc00, 0xfffffc00, 0xfffffc00, 0xfffffc00
;; max_len: 0xfe + 0x102 = 254 + 258, i.e. bias + DEFLATE max match length
	dd 0xfe + 0x102, 0xfe + 0x102, 0xfe + 0x102, 0xfe + 0x102
	dd 0xfe + 0x102, 0xfe + 0x102, 0xfe + 0x102, 0xfe + 0x102