]> git.proxmox.com Git - ceph.git/blob - ceph/src/isa-l/igzip/igzip_set_long_icf_fg_04.asm
Import ceph 15.2.8
[ceph.git] / ceph / src / isa-l / igzip / igzip_set_long_icf_fg_04.asm
1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2018 Intel Corporation All rights reserved.
3 ;
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
6 ; are met:
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
12 ; distribution.
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
16 ;
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29
30 %include "reg_sizes.asm"
31 %include "lz0a_const.asm"
32 %include "data_struct2.asm"
33 %include "igzip_compare_types.asm"
; Build configuration and calling-convention dependent register aliases.
34 %define NEQ 4 ; not referenced in the visible code; presumably consumed by macros from the included files -- TODO confirm
35
36 default rel ; bare [symbol] memory operands below are assembled RIP-relative
37
; arg1..arg4 name the registers holding the first four integer arguments
; for the selected output format. len/tmp2 share one register and dist
; gets another; on win64 these are rdi/rsi, which that ABI treats as
; callee-saved (they are saved by the win64 FUNC_SAVE further down).
38 %ifidn __OUTPUT_FORMAT__, win64
39 %define arg1 rcx
40 %define arg2 rdx
41 %define arg3 r8
42 %define arg4 r9
43 %define len rdi ; match length / negative entry index in the update loop
44 %define tmp2 rdi ; same register as len; only used before len is initialised
45 %define dist rsi ; ICF entry, then distance code, then decoded distance
46 %else
47 %define arg1 rdi
48 %define arg2 rsi
49 %define arg3 rdx
50 %define arg4 rcx
51 %define len r8 ; match length / negative entry index in the update loop
52 %define tmp2 r8 ; same register as len; only used before len is initialised
53 %define dist r9 ; ICF entry, then distance code, then decoded distance
54 %endif
55
; Role names for the general-purpose registers used by set_long_icf_fg_04.
56 %define next_in arg1 ; pointer to the input byte matching the current ICF entry
57 %define end_processed arg2 ; arrives as a byte count, converted to a pointer on entry
58 %define end_in arg3 ; arrives as a byte count, converted to a pointer on entry
59 %define match_lookup arg4 ; pointer to the ICF entry for next_in
60 %define match_in rax ; next_in - distance: start of the earlier copy of the data
61 %define match_offset r10 ; lane index (0..7) of the first long match in a vector
62 %define tmp1 r11 ; scratch: compare masks, extra bits, remaining count
63 %define end_processed_orig r12 ; unmodified end_processed pointer, restored for the tail
64 %define dist_code r13 ; not referenced in the visible code
65 %define tmp3 r13 ; scratch byte limits (same register as dist_code)
66
; Role names for the ymm registers. Each holds eight 32-bit lanes.
67 %define ymatch_lookup ymm0 ; look-ahead: the next eight ICF entries
68 %define ymatch_lookup2 ymm1 ; the eight entries being examined; later the broadcast source entry
69 %define ylens ymm2 ; length fields / clamped lengths
70 %define ycmp2 ymm3 ; lanes whose candidate length exceeds ymax_len
71 %define ylens1 ymm4 ; candidate length value per lane for the entries being rewritten
72 %define ylens2 ymm5 ; existing length fields, then the replacement entries
73 %define ycmp ymm6 ; per-lane comparison result / store mask
74 %define ytmp1 ymm7 ; scratch (also handed to compare_y)
75 %define ytmp2 ymm8 ; scratch handed to compare_y
76 %define yvect_size ymm9 ; VECT_SIZE in every lane
77 %define ymax_len ymm10 ; upper clamp for ylens1 (max_len table)
78 %define ytwofiftysix ymm11 ; 0x100 in every lane
79 %define ynlen_mask ymm12 ; mask keeping everything except the low 10 bits of an entry
80 %define ydists_mask ymm13 ; loaded on entry but not referenced again in the visible code
81 %define ylong_lens ymm14 ; threshold above which an entry is treated as a long match
82 %define ylens_mask ymm15 ; mask selecting the length field of an entry
83
; Prologue/epilogue helpers. func(x) opens the function, FUNC_SAVE
; preserves the callee-saved registers this file touches and FUNC_RESTORE
; puts them back immediately before ret.
84 %ifidn __OUTPUT_FORMAT__, win64
; Frame layout: ten 16-byte xmm slots, four 8-byte GPR slots, plus 8 more
; bytes. The total is 200 bytes; assuming the usual win64 entry state
; (rsp = 16n + 8) this leaves rsp 16-byte aligned, which the vmovdqa
; saves below require.
85 %define stack_size 10*16 + 4 * 8 + 8
; proc_frame / alloc_stack / save_reg / end_prolog are not defined in this
; file; presumably the assembler's win64 unwind-information helpers --
; TODO confirm.
86 %define func(x) proc_frame x
87 %macro FUNC_SAVE 0
88 alloc_stack stack_size
; xmm6-xmm15 are callee-saved on win64 and ymm6-ymm15 are used as working
; registers by this file.
89 vmovdqa [rsp + 0*16], xmm6
90 vmovdqa [rsp + 1*16], xmm7
91 vmovdqa [rsp + 2*16], xmm8
92 vmovdqa [rsp + 3*16], xmm9
93 vmovdqa [rsp + 4*16], xmm10
94 vmovdqa [rsp + 5*16], xmm11
95 vmovdqa [rsp + 6*16], xmm12
96 vmovdqa [rsp + 7*16], xmm13
97 vmovdqa [rsp + 8*16], xmm14
98 vmovdqa [rsp + 9*16], xmm15
; rsi/rdi back the dist and len/tmp2 aliases on win64; r12/r13 back
; end_processed_orig and tmp3.
99 save_reg rsi, 10*16 + 0*8
100 save_reg rdi, 10*16 + 1*8
101 save_reg r12, 10*16 + 2*8
102 save_reg r13, 10*16 + 3*8
103 end_prolog
104 %endm
105
106 %macro FUNC_RESTORE 0
; Reload every register from the slot FUNC_SAVE stored it in, then
; release the frame.
107 vmovdqa xmm6, [rsp + 0*16]
108 vmovdqa xmm7, [rsp + 1*16]
109 vmovdqa xmm8, [rsp + 2*16]
110 vmovdqa xmm9, [rsp + 3*16]
111 vmovdqa xmm10, [rsp + 4*16]
112 vmovdqa xmm11, [rsp + 5*16]
113 vmovdqa xmm12, [rsp + 6*16]
114 vmovdqa xmm13, [rsp + 7*16]
115 vmovdqa xmm14, [rsp + 8*16]
116 vmovdqa xmm15, [rsp + 9*16]
117
118 mov rsi, [rsp + 10*16 + 0*8]
119 mov rdi, [rsp + 10*16 + 1*8]
120 mov r12, [rsp + 10*16 + 2*8]
121 mov r13, [rsp + 10*16 + 3*8]
122 add rsp, stack_size
123 %endm
124 %else
; Other output formats: func(x) is just the label, and only r12/r13 are
; pushed and popped.
125 %define func(x) x:
126 %macro FUNC_SAVE 0
127 push r12
128 push r13
129 %endm
130
131 %macro FUNC_RESTORE 0
; Pops mirror the pushes in reverse order.
132 pop r13
133 pop r12
134 %endm
135 %endif
136 %define VECT_SIZE 8 ; ICF entries examined per vector step (eight 32-bit lanes of a ymm register)
137
;-----------------------------------------------------------------------
; set_long_icf_fg_04
;
; Scans the ICF entries at match_lookup, VECT_SIZE at a time, for an
; entry whose length field is greater than the long_len threshold. For
; the first such entry it decodes the match distance, runs compare_y on
; next_in and next_in - distance, and then rewrites the entries that
; follow wherever the value derived from that result is larger than the
; length field already stored there.
;
; In (physical registers are given by the arg1..arg4 aliases):
;   arg1 = next_in       pointer to the current input byte
;   arg2                 byte count; next_in + arg2 is where scanning stops
;   arg3                 byte count; next_in + arg3 bounds the data compare_y
;                        may read (further capped at the stop point +
;                        LA_STATELESS)
;   arg4 = match_lookup  pointer to the ICF entries; it advances
;                        ICF_CODE_BYTES for every input byte
; Out:   entries in match_lookup are updated in place. rax is only used
;        as scratch (match_in); no result is deliberately returned.
; Uses:  256-bit integer instructions (vpbroadcastd, vpmaskmovd, ...) and
;        tzcnt.
; Note:  every iteration loads one vector beyond the entries being
;        examined, so the table must be readable past the stop point --
;        NOTE(review): confirm the caller provides that padding.
;-----------------------------------------------------------------------
138 global set_long_icf_fg_04
139 func(set_long_icf_fg_04)
140 FUNC_SAVE
141
142 lea end_in, [next_in + arg3] ; byte count -> pointer
143 add end_processed, next_in ; byte count -> pointer
144 mov end_processed_orig, end_processed ; kept for the tail pass in .end_fill
145 lea tmp1, [end_processed + LA_STATELESS]
146 cmp end_in, tmp1
147 cmovg end_in, tmp1 ; end_in = min(end_in, end_processed + LA_STATELESS), signed compare
148 sub end_processed, VECT_SIZE - 1 ; main loop only runs while a full vector of entries remains
; Load the constant vectors once; they stay live for the whole function.
149 vmovdqu ylong_lens, [long_len]
150 vmovdqu ylens_mask, [len_mask]
151 vmovdqu ydists_mask, [dists_mask]
152 vmovdqu ynlen_mask, [nlen_mask]
153 vmovdqu yvect_size, [vect_size]
154 vmovdqu ymax_len, [max_len]
155 vmovdqu ytwofiftysix, [twofiftysix]
156 vmovdqu ymatch_lookup, [match_lookup] ; prime the look-ahead with the first vector of entries
157
158 .fill_loop: ; main scan: one vector of entries per pass
159 vmovdqu ymatch_lookup2, ymatch_lookup ; entries to examine now (loaded on the previous pass)
160 vmovdqu ymatch_lookup, [match_lookup + ICF_CODE_BYTES * VECT_SIZE] ; look-ahead load of the following vector
161
162 cmp next_in, end_processed
163 jae .end_fill ; fewer than VECT_SIZE entries left (or tail already handled)
164
165 .finish_entry: ; also entered from .end_fill with the out-of-range lanes zeroed
166 vpand ylens, ymatch_lookup2, ylens_mask ; isolate the length field of each entry
167 vpcmpgtd ycmp, ylens, ylong_lens ; lane = all ones where length field > threshold
168 vpmovmskb tmp1, ycmp ; one mask bit per byte, so four bits per 32-bit lane
169
170 ;; Speculatively advance past this vector; undone below if a long match is found
171 add next_in, VECT_SIZE
172 add match_lookup, ICF_CODE_BYTES * VECT_SIZE
173
174 test tmp1, tmp1
175 jz .fill_loop ; no long match in these entries
176
177 tzcnt match_offset, tmp1 ; bit index of the first flagged byte
178 shr match_offset, 2 ; four mask bits per lane -> lane index
179
; Step back from the speculative advance to the flagged entry.
180 lea next_in, [next_in + match_offset - VECT_SIZE]
181 lea match_lookup, [match_lookup + ICF_CODE_BYTES * (match_offset - VECT_SIZE)]
182 mov dist %+ d, [match_lookup] ; 32-bit load of the flagged entry (upper half of dist is zeroed)
183 vmovd ymatch_lookup2 %+ x, dist %+ d ; keep a copy of the entry for the broadcast below
184
; Decode the distance: table base value for the distance code plus the
; bits stored above EXTRA_BITS_OFFSET.
185 mov tmp1, dist
186 shr dist, DIST_OFFSET
187 and dist, LIT_DIST_MASK ; dist = distance code
188 shr tmp1, EXTRA_BITS_OFFSET ; tmp1 = extra distance bits
189 lea tmp2, [dist_start]
190 mov dist %+ w, [tmp2 + 2 * dist] ; 16-bit write only; assumes LIT_DIST_MASK leaves bits 16 and up clear -- TODO confirm
191 add dist, tmp1
192
193 mov match_in, next_in
194 sub match_in, dist ; start of the earlier occurrence
195
196 mov len, 8 ; initial value handed to compare_y
197 mov tmp3, end_in
198 sub tmp3, next_in ; bytes available from next_in up to end_in
199
; compare_y is defined outside this file; presumably it leaves in len the
; number of bytes that agree between next_in and match_in, limited by
; tmp3, using tmp1/ytmp1/ytmp2 as scratch -- TODO confirm.
200 compare_y next_in, match_in, len, tmp3, tmp1, ytmp1, ytmp2
201
; Lane i of ylens1 = len - i + 0xfe: the value to offer the entry i
; positions after the flagged one (0xfe presumably being the bias used by
; the stored length field -- TODO confirm).
202 vmovd ylens1 %+ x, len %+ d
203 vpbroadcastd ylens1, ylens1 %+ x
204 vpsubd ylens1, ylens1, [increment]
205 vpaddd ylens1, ylens1, [twofiftyfour]
206
207 mov tmp3, end_processed
208 sub tmp3, next_in
209 cmp len, tmp3
210 cmovg len, tmp3 ; len = min(len, end_processed - next_in), signed: the bound can be negative here
211
; Move the scan position past the match (or to the clamp point) and
; reload the look-ahead vector from there for the return to .fill_loop.
212 add next_in, len
213 lea match_lookup, [match_lookup + ICF_CODE_BYTES * len]
214 vmovdqu ymatch_lookup, [match_lookup]
215
216 vpbroadcastd ymatch_lookup2, ymatch_lookup2 %+ x ; flagged entry in every lane
217 vpand ymatch_lookup2, ymatch_lookup2, ynlen_mask ; drop its length field, keep the remaining bits
218
219 neg len ; match_lookup + ICF_CODE_BYTES * len now addresses the flagged entry again
220
221 .update_match_lookup: ; rewrite one vector of entries per pass
222 vpand ylens2, ylens_mask, [match_lookup + ICF_CODE_BYTES * len] ; length fields currently stored
223
224 vpcmpgtd ycmp, ylens1, ylens2 ; candidate beats the stored value ...
225 vpcmpgtd ytmp1, ylens1, ytwofiftysix ; ... and is itself greater than 0x100
226 vpand ycmp, ycmp, ytmp1
227 vpmovmskb tmp1, ycmp ; non-zero if any lane will be rewritten
228
; ylens = min(ylens1, ymax_len), built from a compare and two masks.
229 vpcmpgtd ycmp2, ylens1, ymax_len
230 vpandn ylens, ycmp2, ylens1
231 vpand ycmp2, ymax_len, ycmp2
232 vpor ylens, ycmp2
233
234 vpaddd ylens2, ylens, ymatch_lookup2 ; new entry = clamped length + non-length bits of the flagged entry
235 vpand ylens2, ylens2, ycmp ; zero the lanes that are not selected
236
237 vpmaskmovd [match_lookup + ICF_CODE_BYTES * len], ycmp, ylens2 ; store only the selected lanes
238
239 test tmp1 %+ d, tmp1 %+ d
240 jz .fill_loop ; nothing rewritten in this vector: resume scanning
241
242 add len, VECT_SIZE ; next vector of entries
243 vpsubd ylens1, ylens1, yvect_size ; candidates shrink by VECT_SIZE per vector
244
245 jmp .update_match_lookup
246
247 .end_fill: ; tail: fewer than VECT_SIZE entries remain
248 mov end_processed, end_processed_orig ; undo the VECT_SIZE - 1 adjustment
249 cmp next_in, end_processed
250 jge .finish ; nothing left to examine
251
; Build a lane mask that is all ones for lanes 0 .. remaining-1 and use it
; to zero the entries past the stop point, so they cannot be flagged.
252 mov tmp1, end_processed
253 sub tmp1, next_in ; remaining entries (1 .. VECT_SIZE-1 on the first visit)
254 vmovd ytmp1 %+ x, tmp1 %+ d
255 vpbroadcastd ytmp1, ytmp1 %+ x
256 vpcmpgtd ytmp1, ytmp1, [increment]
257 vpand ymatch_lookup2, ymatch_lookup2, ytmp1
258 jmp .finish_entry
259
260 .finish:
261 FUNC_RESTORE
262 ret
263
; Closes the proc_frame opened by func() on win64; for other formats it
; is presumably defined away by an included file -- TODO confirm.
264 endproc_frame
265
; Constant tables. The visible code only reads them.
; NOTE(review): a read-only section may be sufficient -- confirm nothing
; else writes here.
266 section .data
267 align 64
; 16-bit base distance per distance code, indexed with the code taken
; from an ICF entry. The 30 non-zero values equal the DEFLATE base
; distances for codes 0-29; the last two slots are zero.
268 dist_start:
269 dw 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0007, 0x0009, 0x000d
270 dw 0x0011, 0x0019, 0x0021, 0x0031, 0x0041, 0x0061, 0x0081, 0x00c1
271 dw 0x0101, 0x0181, 0x0201, 0x0301, 0x0401, 0x0601, 0x0801, 0x0c01
272 dw 0x1001, 0x1801, 0x2001, 0x3001, 0x4001, 0x6001, 0x0000, 0x0000
; Every table below is eight dwords, i.e. one ymm register.
273 len_mask: ; selects the length field of an entry
274 dd LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK
275 dd LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK
276 dists_mask: ; loaded into ydists_mask, which the visible code never reads afterwards
277 dd LIT_DIST_MASK, LIT_DIST_MASK, LIT_DIST_MASK, LIT_DIST_MASK
278 dd LIT_DIST_MASK, LIT_DIST_MASK, LIT_DIST_MASK, LIT_DIST_MASK
279 long_len: ; length-field threshold: entries above it trigger the long-match path
280 dd 0x105, 0x105, 0x105, 0x105, 0x105, 0x105, 0x105, 0x105
281 increment: ; lane index 0..7
282 dd 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
283 vect_size: ; VECT_SIZE in every lane
284 dd VECT_SIZE, VECT_SIZE, VECT_SIZE, VECT_SIZE
285 dd VECT_SIZE, VECT_SIZE, VECT_SIZE, VECT_SIZE
286 twofiftyfour: ; bias added to the compare_y result before it is compared/stored
287 dd 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe
288 twofiftysix: ; candidates must exceed this value to replace an entry
289 dd 0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100
290 nlen_mask: ; clears the low 10 bits of an entry, keeping the rest
291 dd 0xfffffc00, 0xfffffc00, 0xfffffc00, 0xfffffc00
292 dd 0xfffffc00, 0xfffffc00, 0xfffffc00, 0xfffffc00
293 max_len: ; upper clamp for stored values: 0xfe bias + 0x102
294 dd 0xfe + 0x102, 0xfe + 0x102, 0xfe + 0x102, 0xfe + 0x102
295 dd 0xfe + 0x102, 0xfe + 0x102, 0xfe + 0x102, 0xfe + 0x102