;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2018 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
;   * Redistributions of source code must retain the above copyright
;     notice, this list of conditions and the following disclaimer.
;   * Redistributions in binary form must reproduce the above copyright
;     notice, this list of conditions and the following disclaimer in
;     the documentation and/or other materials provided with the
;     distribution.
;   * Neither the name of Intel Corporation nor the names of its
;     contributors may be used to endorse or promote products derived
;     from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%include "reg_sizes.asm"
%include "lz0a_const.asm"
%include "data_struct2.asm"
%include "igzip_compare_types.asm"
%define NEQ 4

default rel				; RIP-relative addressing for all static data

;; Argument-register mapping for the two supported ABIs.
;; NOTE(review): len and tmp2 deliberately alias the same register in both
;; ABIs (win64: rdi, SysV: r8) — they are never live at the same time in the
;; code below (tmp2 is only used for the dist_start base before len is set).
%ifidn __OUTPUT_FORMAT__, win64
%define arg1 rcx
%define arg2 rdx
%define arg3 r8
%define arg4 r9
%define len rdi
%define tmp2 rdi
%define dist rsi
%else
%define arg1 rdi
%define arg2 rsi
%define arg3 rdx
%define arg4 rcx
%define len r8
%define tmp2 r8
%define dist r9
%endif

;; Role names for the GP registers used by set_long_icf_fg_04.
%define next_in		arg1	; current position in the input buffer
%define end_processed	arg2	; on entry: processed byte count; becomes a pointer
%define end_in		arg3	; on entry: input length; becomes end-of-input pointer
%define match_lookup	arg4	; pointer to ICF match-lookup codes (ICF_CODE_BYTES each)
%define match_in	rax	; pointer to the earlier occurrence of the match
%define match_offset	r10	; lane offset of the first long match found
%define tmp1		r11
%define end_processed_orig r12	; callee-saved copy of the unadjusted end_processed
%define dist_code	r13	; NOTE(review): dist_code/tmp3 alias r13; not live together
%define tmp3		r13

;; ymm register roles (all 16 are used; win64 FUNC_SAVE preserves xmm6-15).
%define ymatch_lookup	ymm0	; next 8 lookup codes (prefetched)
%define ymatch_lookup2	ymm1	; current 8 lookup codes being examined
%define ylens		ymm2	; scratch: extracted/clamped length fields
%define ycmp2		ymm3	; scratch compare mask
%define ylens1		ymm4	; per-lane candidate lengths of the extended match
%define ylens2		ymm5	; per-lane stored lengths / new codes to write
%define ycmp		ymm6	; update mask
%define ytmp1		ymm7
%define ytmp2		ymm8
%define yvect_size	ymm9	; broadcast VECT_SIZE
%define ymax_len	ymm10	; broadcast maximum encodable length
%define ytwofiftysix	ymm11	; broadcast 256
%define ynlen_mask	ymm12	; mask clearing the low 10-bit length field
%define ydists_mask	ymm13	; broadcast LIT_DIST_MASK
%define ylong_lens	ymm14	; broadcast long-length threshold (0x105)
%define ylens_mask	ymm15	; broadcast LIT_LEN_MASK
;; Prologue/epilogue helpers.
;; win64: xmm6-15 and rsi/rdi/r12/r13 are callee-saved, so all are spilled;
;;        proc_frame/alloc_stack/save_reg emit SEH unwind data.
;; SysV:  only r12/r13 of the registers used here are callee-saved.
%ifidn __OUTPUT_FORMAT__, win64
;; 10 xmm saves (16 B each) + 4 GP saves (8 B each) + 8 B to keep rsp
;; 16-byte aligned at call sites.
%define stack_size  10*16 + 4 * 8 + 8
%define func(x) proc_frame x
%macro FUNC_SAVE 0
	alloc_stack	stack_size
	vmovdqa	[rsp + 0*16], xmm6
	vmovdqa	[rsp + 1*16], xmm7
	vmovdqa	[rsp + 2*16], xmm8
	vmovdqa	[rsp + 3*16], xmm9
	vmovdqa	[rsp + 4*16], xmm10
	vmovdqa	[rsp + 5*16], xmm11
	vmovdqa	[rsp + 6*16], xmm12
	vmovdqa	[rsp + 7*16], xmm13
	vmovdqa	[rsp + 8*16], xmm14
	vmovdqa	[rsp + 9*16], xmm15
	save_reg	rsi, 10*16 + 0*8
	save_reg	rdi, 10*16 + 1*8
	save_reg	r12, 10*16 + 2*8
	save_reg	r13, 10*16 + 3*8
	end_prolog
%endm

%macro FUNC_RESTORE 0
	vmovdqa	xmm6, [rsp + 0*16]
	vmovdqa	xmm7, [rsp + 1*16]
	vmovdqa	xmm8, [rsp + 2*16]
	vmovdqa	xmm9, [rsp + 3*16]
	vmovdqa	xmm10, [rsp + 4*16]
	vmovdqa	xmm11, [rsp + 5*16]
	vmovdqa	xmm12, [rsp + 6*16]
	vmovdqa	xmm13, [rsp + 7*16]
	vmovdqa	xmm14, [rsp + 8*16]
	vmovdqa	xmm15, [rsp + 9*16]

	mov	rsi, [rsp + 10*16 + 0*8]
	mov	rdi, [rsp + 10*16 + 1*8]
	mov	r12, [rsp + 10*16 + 2*8]
	mov	r13, [rsp + 10*16 + 3*8]
	add	rsp, stack_size
%endm
%else
%define func(x) x:
%macro FUNC_SAVE 0
	push	r12
	push	r13
%endm

%macro FUNC_RESTORE 0
	pop	r13
	pop	r12
%endm
%endif
;; Number of 32-bit ICF codes processed per vector iteration (8 dwords = 1 ymm).
%define VECT_SIZE 8
;-----------------------------------------------------------------------
; set_long_icf_fg_04(next_in, processed, input_size, match_lookup)
;
; Scans the ICF match-lookup table 8 entries at a time looking for codes
; whose length field exceeds the long-length threshold (0x105).  For each
; such entry it decodes the match distance, re-runs the byte comparison to
; find the full match length, and then rewrites the following lookup
; entries with the (decremented) long length wherever that beats the
; length already stored there.
;
; NOTE(review): C signature inferred from the register aliases above —
; arg2 is a processed-byte count and arg3 an input length (both are added
; to next_in below); confirm against the C prototype in the header.
; Clobbers per the FUNC_SAVE/FUNC_RESTORE macros; AVX2 required
; (vpbroadcastd/vpmaskmovd).
;-----------------------------------------------------------------------
global set_long_icf_fg_04
func(set_long_icf_fg_04)
	FUNC_SAVE

	;; Convert the count arguments into end pointers.
	lea	end_in, [next_in + arg3]
	add	end_processed, next_in
	mov	end_processed_orig, end_processed
	;; Never compare past LA_STATELESS bytes beyond the processed region.
	lea	tmp1, [end_processed + LA_STATELESS]
	cmp	end_in, tmp1
	cmovg	end_in, tmp1
	;; Back off so a full 8-lane iteration never starts past the end.
	sub	end_processed, VECT_SIZE - 1
	vmovdqu	ylong_lens, [long_len]
	vmovdqu	ylens_mask, [len_mask]
	vmovdqu	ydists_mask, [dists_mask]
	vmovdqu	ynlen_mask, [nlen_mask]
	vmovdqu	yvect_size, [vect_size]
	vmovdqu	ymax_len, [max_len]
	vmovdqu	ytwofiftysix, [twofiftysix]
	vmovdqu	ymatch_lookup, [match_lookup]

.fill_loop:				; Tahiti is a magical place
	;; Rotate the prefetched codes into the "current" register and
	;; prefetch the next 8.
	vmovdqu	ymatch_lookup2, ymatch_lookup
	vmovdqu	ymatch_lookup, [match_lookup + ICF_CODE_BYTES * VECT_SIZE]

	cmp	next_in, end_processed
	jae	.end_fill

.finish_entry:
	;; Extract the length field of each of the 8 codes and flag lanes
	;; whose length exceeds the long-length threshold.
	vpand	ylens, ymatch_lookup2, ylens_mask
	vpcmpgtd	ycmp, ylens, ylong_lens
	vpmovmskb	tmp1, ycmp

	;; Speculatively increment
	add	next_in, VECT_SIZE
	add	match_lookup, ICF_CODE_BYTES * VECT_SIZE

	test	tmp1, tmp1
	jz	.fill_loop

	;; A long match exists: index of its first lane (byte mask -> dword lane).
	tzcnt	match_offset, tmp1
	shr	match_offset, 2

	;; Undo the speculative increment and land exactly on that lane.
	lea	next_in, [next_in + match_offset - VECT_SIZE]
	lea	match_lookup, [match_lookup + ICF_CODE_BYTES * (match_offset - VECT_SIZE)]
	mov	dist %+ d, [match_lookup]
	vmovd	ymatch_lookup2 %+ x, dist %+ d

	;; Decode the real distance: dist_start[dist_code] + extra bits.
	mov	tmp1, dist
	shr	dist, DIST_OFFSET
	and	dist, LIT_DIST_MASK
	shr	tmp1, EXTRA_BITS_OFFSET
	lea	tmp2, [dist_start]		; tmp2 aliases len — len not yet live
	mov	dist %+ w, [tmp2 + 2 * dist]	; upper bits of dist already zero
	add	dist, tmp1

	;; Pointer to the earlier occurrence of this match.
	mov	match_in, next_in
	sub	match_in, dist

	mov	len, 8
	mov	tmp3, end_in
	sub	tmp3, next_in

	;; compare_y (igzip_compare_types.asm): len = number of equal bytes
	;; at next_in/match_in, bounded by tmp3; clobbers tmp1, ytmp1, ytmp2.
	compare_y	next_in, match_in, len, tmp3, tmp1, ytmp1, ytmp2

	;; Per-lane candidate lengths for the next 8 positions:
	;; ylens1[i] = len - i + 254 (254 = ICF length-code bias — TODO confirm
	;; against lz0a_const.asm).
	vmovd	ylens1 %+ x, len %+ d
	vpbroadcastd	ylens1, ylens1 %+ x
	vpsubd	ylens1, ylens1, [increment]
	vpaddd	ylens1, ylens1, [twofiftyfour]

	;; Advance by the match length, but never past end_processed.
	mov	tmp3, end_processed
	sub	tmp3, next_in
	cmp	len, tmp3
	cmovg	len, tmp3

	add	next_in, len
	lea	match_lookup, [match_lookup + ICF_CODE_BYTES * len]
	vmovdqu	ymatch_lookup, [match_lookup]

	;; Keep only the dist/extra-bit fields of the triggering code; the
	;; 10-bit length field is re-filled per lane below.
	vpbroadcastd	ymatch_lookup2, ymatch_lookup2 %+ x
	vpand	ymatch_lookup2, ymatch_lookup2, ynlen_mask

	;; Walk backwards over the skipped entries: len becomes the negative
	;; offset from the (already advanced) match_lookup pointer.
	neg	len

.update_match_lookup:
	;; Stored lengths of the 8 entries at the current offset.
	vpand	ylens2, ylens_mask, [match_lookup + ICF_CODE_BYTES * len]

	;; Update mask: candidate beats the stored length AND is still a
	;; genuine match length (> 256).
	vpcmpgtd	ycmp, ylens1, ylens2
	vpcmpgtd	ytmp1, ylens1, ytwofiftysix
	vpand	ycmp, ycmp, ytmp1
	vpmovmskb	tmp1, ycmp

	;; Clamp candidates: ylens = min(ylens1, max_len).
	vpcmpgtd	ycmp2, ylens1, ymax_len
	vpandn	ylens, ycmp2, ylens1
	vpand	ycmp2, ymax_len, ycmp2
	vpor	ylens, ycmp2

	;; New codes = clamped length + preserved dist/extra-bit fields;
	;; store only the lanes selected by ycmp.
	vpaddd	ylens2, ylens, ymatch_lookup2
	vpand	ylens2, ylens2, ycmp

	vpmaskmovd	[match_lookup + ICF_CODE_BYTES * len], ycmp, ylens2

	;; Stop as soon as no lane qualified (candidates only shrink).
	test	tmp1 %+ d, tmp1 %+ d
	jz	.fill_loop

	add	len, VECT_SIZE
	vpsubd	ylens1, ylens1, yvect_size	; each lane's candidate drops by 8

	jmp	.update_match_lookup

.end_fill:
	;; Tail: restore the true end pointer and, if entries remain, mask
	;; out the lanes that lie beyond it so they cannot trigger an update.
	mov	end_processed, end_processed_orig
	cmp	next_in, end_processed
	jge	.finish

	mov	tmp1, end_processed
	sub	tmp1, next_in
	vmovd	ytmp1 %+ x, tmp1 %+ d
	vpbroadcastd	ytmp1, ytmp1 %+ x
	vpcmpgtd	ytmp1, ytmp1, [increment]	; lane i valid iff i < remaining
	vpand	ymatch_lookup2, ymatch_lookup2, ytmp1
	jmp	.finish_entry

.finish:
	FUNC_RESTORE
	ret

endproc_frame
section .data
align 64
;; DEFLATE base distance for each distance code (RFC 1951, table of 30
;; codes); the last two slots are unused and zero.  Indexed by 16-bit
;; entries: dist = dist_start[code] + extra_bits.
dist_start:
	dw 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0007, 0x0009, 0x000d
	dw 0x0011, 0x0019, 0x0021, 0x0031, 0x0041, 0x0061, 0x0081, 0x00c1
	dw 0x0101, 0x0181, 0x0201, 0x0301, 0x0401, 0x0601, 0x0801, 0x0c01
	dw 0x1001, 0x1801, 0x2001, 0x3001, 0x4001, 0x6001, 0x0000, 0x0000
;; Broadcast mask isolating the length field of an ICF code.
len_mask:
	dd LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK
	dd LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK
;; Broadcast mask isolating the distance field of an ICF code.
dists_mask:
	dd LIT_DIST_MASK, LIT_DIST_MASK, LIT_DIST_MASK, LIT_DIST_MASK
	dd LIT_DIST_MASK, LIT_DIST_MASK, LIT_DIST_MASK, LIT_DIST_MASK
;; Threshold above which a stored length counts as "long" (0x105 = 261).
long_len:
	dd 0x105, 0x105, 0x105, 0x105, 0x105, 0x105, 0x105, 0x105
;; Lane indices 0..7, used both to decrement per-lane candidate lengths
;; and to build the tail validity mask.
increment:
	dd 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
;; Broadcast VECT_SIZE, subtracted from the candidates each inner iteration.
vect_size:
	dd VECT_SIZE, VECT_SIZE, VECT_SIZE, VECT_SIZE
	dd VECT_SIZE, VECT_SIZE, VECT_SIZE, VECT_SIZE
;; 0xfe = 254: length-code bias added to raw match lengths (TODO confirm
;; against the ICF length encoding in lz0a_const.asm).
twofiftyfour:
	dd 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe
;; 0x100 = 256: candidates must exceed this to be stored as a match.
twofiftysix:
	dd 0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100
;; Clears the low 10 bits (the length field), keeping dist/extra-bit fields.
nlen_mask:
	dd 0xfffffc00, 0xfffffc00, 0xfffffc00, 0xfffffc00
	dd 0xfffffc00, 0xfffffc00, 0xfffffc00, 0xfffffc00
;; 0xfe + 0x102 = 0x200: largest encodable length value (254 bias + 258,
;; the DEFLATE maximum match length).
max_len:
	dd 0xfe + 0x102, 0xfe + 0x102, 0xfe + 0x102, 0xfe + 0x102
	dd 0xfe + 0x102, 0xfe + 0x102, 0xfe + 0x102, 0xfe + 0x102