]> git.proxmox.com Git - ceph.git/blame - ceph/src/isa-l/igzip/igzip_set_long_icf_fg_04.asm
import quincy beta 17.1.0
[ceph.git] / ceph / src / isa-l / igzip / igzip_set_long_icf_fg_04.asm
CommitLineData
f91f0fd5
TL
1;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2; Copyright(c) 2011-2018 Intel Corporation All rights reserved.
3;
4; Redistribution and use in source and binary forms, with or without
5; modification, are permitted provided that the following conditions
6; are met:
7; * Redistributions of source code must retain the above copyright
8; notice, this list of conditions and the following disclaimer.
9; * Redistributions in binary form must reproduce the above copyright
10; notice, this list of conditions and the following disclaimer in
11; the documentation and/or other materials provided with the
12; distribution.
13; * Neither the name of Intel Corporation nor the names of its
14; contributors may be used to endorse or promote products derived
15; from this software without specific prior written permission.
16;
17; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29
; Shared igzip helpers: register-size macros, LZ constants (LA_STATELESS,
; DIST_OFFSET, EXTRA_BITS_OFFSET, LIT_LEN_MASK, LIT_DIST_MASK, ICF_CODE_BYTES
; all come from these includes), the stream/state structs, and the
; compare_y string-compare macro used below.
30%include "reg_sizes.asm"
31%include "lz0a_const.asm"
32%include "data_struct2.asm"
33%include "igzip_compare_types.asm"
34%define NEQ 4
35
36default rel
37
; Calling-convention register map.  Win64 passes args in rcx/rdx/r8/r9 and
; keeps rsi/rdi callee-saved (saved by FUNC_SAVE below); SysV passes args in
; rdi/rsi/rdx/rcx.  Note len/tmp2 deliberately alias the same register, and
; dist uses a register not occupied by any argument in either ABI.
38%ifidn __OUTPUT_FORMAT__, win64
39%define arg1 rcx
40%define arg2 rdx
41%define arg3 r8
42%define arg4 r9
43%define len rdi
44%define tmp2 rdi
45%define dist rsi
46%else
47%define arg1 rdi
48%define arg2 rsi
49%define arg3 rdx
50%define arg4 rcx
51%define len r8
52%define tmp2 r8
53%define dist r9
54%endif
55
; Semantic names for the arguments and scratch registers:
;   next_in       - cursor into the input buffer
;   end_processed - end of the already-hashed region (arg2 arrives as a count
;                   and is converted to a pointer in the prologue)
;   end_in        - end of usable input (arg3 arrives as a size)
;   match_lookup  - array of 32-bit ICF match codes, one per input byte
;   match_in      - start of the earlier occurrence being compared against
; dist_code and tmp3 intentionally share r13 (never live at the same time).
56%define next_in arg1
57%define end_processed arg2
58%define end_in arg3
59%define match_lookup arg4
60%define match_in rax
61%define match_offset r10
62%define tmp1 r11
63%define end_processed_orig r12
64%define dist_code r13
65%define tmp3 r13
66
; AVX2 register roles.  ymm9-ymm15 hold loop-invariant broadcast constants
; loaded once in the prologue from the tables in section .data.
67%define ymatch_lookup ymm0
68%define ymatch_lookup2 ymm1
69%define ylens ymm2
70%define ycmp2 ymm3
71%define ylens1 ymm4
72%define ylens2 ymm5
73%define ycmp ymm6
74%define ytmp1 ymm7
75%define ytmp2 ymm8
76%define yvect_size ymm9
77%define ymax_len ymm10
78%define ytwofiftysix ymm11
79%define ynlen_mask ymm12
80%define ydists_mask ymm13
81%define ylong_lens ymm14
82%define ylens_mask ymm15
83
; Prologue/epilogue macros.
; Win64: xmm6-xmm15 plus rsi/rdi/r12/r13 are callee-saved, so FUNC_SAVE
; spills all ten vector registers and the four GPRs into a frame described
; with proc_frame/save_reg SEH directives (stack_size = 10 xmm slots +
; 4 GPR slots + 8 alignment bytes).  FUNC_RESTORE must mirror the layout
; exactly.  SysV: only r12/r13 of the registers used here are callee-saved.
84%ifidn __OUTPUT_FORMAT__, win64
85%define stack_size 10*16 + 4 * 8 + 8
86%define func(x) proc_frame x
87%macro FUNC_SAVE 0
88 alloc_stack stack_size
89 vmovdqa [rsp + 0*16], xmm6
90 vmovdqa [rsp + 1*16], xmm7
91 vmovdqa [rsp + 2*16], xmm8
92 vmovdqa [rsp + 3*16], xmm9
93 vmovdqa [rsp + 4*16], xmm10
94 vmovdqa [rsp + 5*16], xmm11
95 vmovdqa [rsp + 6*16], xmm12
96 vmovdqa [rsp + 7*16], xmm13
97 vmovdqa [rsp + 8*16], xmm14
98 vmovdqa [rsp + 9*16], xmm15
99 save_reg rsi, 10*16 + 0*8
100 save_reg rdi, 10*16 + 1*8
101 save_reg r12, 10*16 + 2*8
102 save_reg r13, 10*16 + 3*8
103 end_prolog
104%endm
105
106%macro FUNC_RESTORE 0
107 vmovdqa xmm6, [rsp + 0*16]
108 vmovdqa xmm7, [rsp + 1*16]
109 vmovdqa xmm8, [rsp + 2*16]
110 vmovdqa xmm9, [rsp + 3*16]
111 vmovdqa xmm10, [rsp + 4*16]
112 vmovdqa xmm11, [rsp + 5*16]
113 vmovdqa xmm12, [rsp + 6*16]
114 vmovdqa xmm13, [rsp + 7*16]
115 vmovdqa xmm14, [rsp + 8*16]
116 vmovdqa xmm15, [rsp + 9*16]
117
118 mov rsi, [rsp + 10*16 + 0*8]
119 mov rdi, [rsp + 10*16 + 1*8]
120 mov r12, [rsp + 10*16 + 2*8]
121 mov r13, [rsp + 10*16 + 3*8]
122 add rsp, stack_size
123%endm
124%else
; SysV/ELF: emit an endbranch (CET/IBT landing pad) at the entry label.
20effc67 125%define func(x) x: endbranch
f91f0fd5
TL
126%macro FUNC_SAVE 0
127 push r12
128 push r13
129%endm
130
; Pops must be in reverse push order.
131%macro FUNC_RESTORE 0
132 pop r13
133 pop r12
134%endm
135%endif
; Eight 32-bit ICF codes are processed per ymm vector.
136%define VECT_SIZE 8
137
20effc67
TL
138[bits 64]
139default rel
140section .text
141
f91f0fd5
TL
;-----------------------------------------------------------------------
; void set_long_icf_fg_04(uint8_t *next_in, uint64_t processed,
;                         uint64_t input_size, uint32_t *match_lookup)
; AVX2 (_04 suffix) pass that re-examines ICF match codes flagged as
; "long" and extends their stored lengths by actually comparing the
; input against the earlier match position, then rewrites the affected
; match_lookup entries with a masked store.
; In:   arg1 = next_in, arg2 = processed byte count (converted to an end
;       pointer), arg3 = input size (converted to an end pointer),
;       arg4 = match_lookup (one ICF_CODE_BYTES entry per input byte).
; NOTE(review): exact ICF bit layout (DIST_OFFSET, EXTRA_BITS_OFFSET,
; masks) comes from lz0a_const.asm — not visible here; comments on the
; decode sequence below are inferred from the shifts/masks used.
;-----------------------------------------------------------------------
142global set_long_icf_fg_04
143func(set_long_icf_fg_04)
20effc67 144 endbranch
f91f0fd5
TL
145 FUNC_SAVE
146
; Convert sizes to end pointers and clamp end_in so reads stay within
; LA_STATELESS look-ahead of the processed region (signed cmovg).
147 lea end_in, [next_in + arg3]
148 add end_processed, next_in
149 mov end_processed_orig, end_processed
150 lea tmp1, [end_processed + LA_STATELESS]
151 cmp end_in, tmp1
152 cmovg end_in, tmp1
; Back off so a full 8-wide vector load never crosses end_processed.
153 sub end_processed, VECT_SIZE - 1
; Hoist all loop-invariant broadcast constants out of the loop.
154 vmovdqu ylong_lens, [long_len]
155 vmovdqu ylens_mask, [len_mask]
156 vmovdqu ydists_mask, [dists_mask]
157 vmovdqu ynlen_mask, [nlen_mask]
158 vmovdqu yvect_size, [vect_size]
159 vmovdqu ymax_len, [max_len]
160 vmovdqu ytwofiftysix, [twofiftysix]
161 vmovdqu ymatch_lookup, [match_lookup]
162
; Main scan: examine 8 ICF codes per iteration, prefetching the next
; vector into ymatch_lookup while ymatch_lookup2 holds the current one.
163.fill_loop: ; Tahiti is a magical place
164 vmovdqu ymatch_lookup2, ymatch_lookup
165 vmovdqu ymatch_lookup, [match_lookup + ICF_CODE_BYTES * VECT_SIZE]
166
167 cmp next_in, end_processed
168 jae .end_fill
169
; Extract length fields and flag lanes whose length exceeds long_len
; (0x105); tmp1 gets a byte-mask (4 bits per 32-bit lane).
170.finish_entry:
171 vpand ylens, ymatch_lookup2, ylens_mask
172 vpcmpgtd ycmp, ylens, ylong_lens
173 vpmovmskb tmp1, ycmp
174
175;; Speculatively increment
176 add next_in, VECT_SIZE
177 add match_lookup, ICF_CODE_BYTES * VECT_SIZE
178
; No long match in this vector: keep scanning.
179 test tmp1, tmp1
180 jz .fill_loop
181
; Index of the first long-match lane: tzcnt gives the bit position in the
; byte-mask, /4 converts to a dword lane index.
182 tzcnt match_offset, tmp1
183 shr match_offset, 2
184
; Undo the speculative increment and reposition at the long match.
185 lea next_in, [next_in + match_offset - VECT_SIZE]
186 lea match_lookup, [match_lookup + ICF_CODE_BYTES * (match_offset - VECT_SIZE)]
187 mov dist %+ d, [match_lookup]
188 vmovd ymatch_lookup2 %+ x, dist %+ d
189
; Decode the match distance from the ICF code: dist_start[dist_code]
; gives the base distance, the extra-bits field is added on top.
190 mov tmp1, dist
191 shr dist, DIST_OFFSET
192 and dist, LIT_DIST_MASK
193 shr tmp1, EXTRA_BITS_OFFSET
194 lea tmp2, [dist_start]
195 mov dist %+ w, [tmp2 + 2 * dist]
196 add dist, tmp1
197
; match_in = next_in - dist: the earlier occurrence of this string.
198 mov match_in, next_in
199 sub match_in, dist
200
; Compare forward from next_in vs. match_in (up to end_in) to find the
; true match length; compare_y returns it in len (min starting value 8).
201 mov len, 8
202 mov tmp3, end_in
203 sub tmp3, next_in
204
205 compare_y next_in, match_in, len, tmp3, tmp1, ytmp1, ytmp2
206
; Per-lane candidate lengths for the positions inside the match:
; ylens1[i] = len - i + 254 (254 = length bias; see twofiftyfour table).
207 vmovd ylens1 %+ x, len %+ d
208 vpbroadcastd ylens1, ylens1 %+ x
209 vpsubd ylens1, ylens1, [increment]
210 vpaddd ylens1, ylens1, [twofiftyfour]
211
; Clamp len so the update never walks past end_processed (signed cmovg).
212 mov tmp3, end_processed
213 sub tmp3, next_in
214 cmp len, tmp3
215 cmovg len, tmp3
216
; Skip over the matched region and reload the next scan vector.
217 add next_in, len
218 lea match_lookup, [match_lookup + ICF_CODE_BYTES * len]
219 vmovdqu ymatch_lookup, [match_lookup]
220
; Broadcast the match's dist/extra-bits fields (length bits masked off by
; nlen_mask) for OR-ing with the recomputed per-lane lengths.
221 vpbroadcastd ymatch_lookup2, ymatch_lookup2 %+ x
222 vpand ymatch_lookup2, ymatch_lookup2, ynlen_mask
223
; len is negated so [match_lookup + ICF_CODE_BYTES * len] walks the
; entries just skipped, 8 at a time, as len counts up toward zero.
224 neg len
225
; Rewrite pass: for each lane inside the match whose candidate length
; (ylens1) beats both the stored length and 256, store the new code.
226.update_match_lookup:
227 vpand ylens2, ylens_mask, [match_lookup + ICF_CODE_BYTES * len]
228
229 vpcmpgtd ycmp, ylens1, ylens2
230 vpcmpgtd ytmp1, ylens1, ytwofiftysix
231 vpand ycmp, ycmp, ytmp1
232 vpmovmskb tmp1, ycmp
233
; Saturate candidate lengths at max_len: ylens = min(ylens1, max_len)
; built from the gt-mask via andn/and/or (pre-AVX512 select idiom).
234 vpcmpgtd ycmp2, ylens1, ymax_len
235 vpandn ylens, ycmp2, ylens1
236 vpand ycmp2, ymax_len, ycmp2
237 vpor ylens, ycmp2
238
; Combine new length with the broadcast dist fields; masked store writes
; only the lanes that actually improved.
239 vpaddd ylens2, ylens, ymatch_lookup2
240 vpand ylens2, ylens2, ycmp
241
242 vpmaskmovd [match_lookup + ICF_CODE_BYTES * len], ycmp, ylens2
243
; No lane improved in this vector: the rest can't improve either.
244 test tmp1 %+ d, tmp1 %+ d
245 jz .fill_loop
246
; Advance 8 entries; candidate lengths shrink by 8 accordingly.
247 add len, VECT_SIZE
248 vpsubd ylens1, ylens1, yvect_size
249
250 jmp .update_match_lookup
251
; Tail: fewer than VECT_SIZE entries remain before the true end.
; Zero the lanes of ymatch_lookup2 that lie past end_processed (mask from
; comparing remaining count against the 0..7 increment vector), then run
; one final .finish_entry pass on the partial vector.
252.end_fill:
253 mov end_processed, end_processed_orig
254 cmp next_in, end_processed
255 jge .finish
256
257 mov tmp1, end_processed
258 sub tmp1, next_in
259 vmovd ytmp1 %+ x, tmp1 %+ d
260 vpbroadcastd ytmp1, ytmp1 %+ x
261 vpcmpgtd ytmp1, ytmp1, [increment]
262 vpand ymatch_lookup2, ymatch_lookup2, ytmp1
263 jmp .finish_entry
264
265.finish:
266 FUNC_RESTORE
267 ret
268
269endproc_frame
270
; Constant tables.  All ymm tables are 8 x dword so one vmovdqu fills a
; full vector; values are broadcast copies of a single constant unless
; noted.  NOTE(review): placed in .data (writable) by upstream; they are
; only ever read here.
271section .data
272align 64
; DEFLATE base distance for each of the 30 distance codes (RFC 1951
; Sec. 3.2.5); indexed by 2*dist_code, extra bits are added afterwards.
273dist_start:
274 dw 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0007, 0x0009, 0x000d
275 dw 0x0011, 0x0019, 0x0021, 0x0031, 0x0041, 0x0061, 0x0081, 0x00c1
276 dw 0x0101, 0x0181, 0x0201, 0x0301, 0x0401, 0x0601, 0x0801, 0x0c01
277 dw 0x1001, 0x1801, 0x2001, 0x3001, 0x4001, 0x6001, 0x0000, 0x0000
; Mask isolating the length field of an ICF code (per-lane).
278len_mask:
279 dd LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK
280 dd LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK, LIT_LEN_MASK
; Mask isolating the distance field (loaded but unused in this routine).
281dists_mask:
282 dd LIT_DIST_MASK, LIT_DIST_MASK, LIT_DIST_MASK, LIT_DIST_MASK
283 dd LIT_DIST_MASK, LIT_DIST_MASK, LIT_DIST_MASK, LIT_DIST_MASK
; Threshold above which a stored length is treated as a "long" match
; (0x105 = 261 = 258 + 3-byte bias; TODO confirm bias against ICF docs).
284long_len:
285 dd 0x105, 0x105, 0x105, 0x105, 0x105, 0x105, 0x105, 0x105
; Lane offsets 0..7, used both to stagger per-lane lengths and to build
; the tail mask in .end_fill.
286increment:
287 dd 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
; Broadcast VECT_SIZE (8) for decrementing candidate lengths per vector.
288vect_size:
289 dd VECT_SIZE, VECT_SIZE, VECT_SIZE, VECT_SIZE
290 dd VECT_SIZE, VECT_SIZE, VECT_SIZE, VECT_SIZE
; Length bias 254 added to (len - lane) when forming candidate lengths.
291twofiftyfour:
292 dd 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe
; Minimum value a candidate length must exceed to overwrite an entry.
293twofiftysix:
294 dd 0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100, 0x100
; Mask clearing the 10-bit length field, keeping dist/extra-bits fields.
295nlen_mask:
296 dd 0xfffffc00, 0xfffffc00, 0xfffffc00, 0xfffffc00
297 dd 0xfffffc00, 0xfffffc00, 0xfffffc00, 0xfffffc00
; Saturation cap for stored lengths: 254 + 258 (max DEFLATE match 258).
298max_len:
299 dd 0xfe + 0x102, 0xfe + 0x102, 0xfe + 0x102, 0xfe + 0x102
300 dd 0xfe + 0x102, 0xfe + 0x102, 0xfe + 0x102, 0xfe + 0x102