;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2018 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%include "reg_sizes.asm"
%include "lz0a_const.asm"
%include "data_struct2.asm"
%include "igzip_compare_types.asm"
%define NEQ 4

%ifdef HAVE_AS_KNOWS_AVX512
%ifidn __OUTPUT_FORMAT__, win64
%define arg1 rcx
%define arg2 rdx
%define arg3 r8
%define arg4 r9
%define len rdi
%define dist rsi
%else
%define arg1 rdi
%define arg2 rsi
%define arg3 rdx
%define arg4 rcx
%define len r8
%define dist r9
%endif

%define next_in arg1
%define end_processed arg2
%define end_in arg3
%define match_lookup arg4
%define match_in rax
%define match_offset r10
%define tmp1 r11
%define end_processed_orig r12
%define dist_code r13
%define tmp2 r13

%define zmatch_lookup zmm0
%define zmatch_lookup2 zmm1
%define zlens zmm2
%define zdist_codes zmm3
%define zdist_extras zmm4
%define zdists zmm5
%define zdists2 zmm6
%define zlens1 zmm7
%define zlens2 zmm8
%define zlookup zmm9
%define zlookup2 zmm10
%define datas zmm11
%define ztmp1 zmm12
%define ztmp2 zmm13
%define zvect_size zmm16
%define zmax_len zmm17
%define ztwofiftyfour zmm18
%define ztwofiftysix zmm19
%define ztwosixtytwo zmm20
%define znlen_mask zmm21
%define zbswap zmm22
%define zqword_shuf zmm23
%define zdatas_perm3 zmm24
%define zdatas_perm2 zmm25
%define zincrement zmm26
%define zdists_mask zmm27
%define zdists_start zmm28
%define zlong_lens2 zmm29
%define zlong_lens zmm30
%define zlens_mask zmm31

%ifidn __OUTPUT_FORMAT__, win64
%define stack_size 8*16 + 4 * 8 + 8
%define func(x) proc_frame x
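;; On win64, xmm6-xmm13 and rsi, rdi, r12, r13 are callee-saved, so FUNC_SAVE
;; spills them into the frame allocated below and FUNC_RESTORE reloads them
;; before returning; the non-Windows build only needs to preserve r12 and r13.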
%macro FUNC_SAVE 0
        alloc_stack stack_size
        vmovdqa [rsp + 0*16], xmm6
        vmovdqa [rsp + 1*16], xmm7
        vmovdqa [rsp + 2*16], xmm8
        vmovdqa [rsp + 3*16], xmm9
        vmovdqa [rsp + 4*16], xmm10
        vmovdqa [rsp + 5*16], xmm11
        vmovdqa [rsp + 6*16], xmm12
        vmovdqa [rsp + 7*16], xmm13
        save_reg rsi, 8*16 + 0*8
        save_reg rdi, 8*16 + 1*8
        save_reg r12, 8*16 + 2*8
        save_reg r13, 8*16 + 3*8
        end_prolog
%endm

%macro FUNC_RESTORE 0
        vmovdqa xmm6, [rsp + 0*16]
        vmovdqa xmm7, [rsp + 1*16]
        vmovdqa xmm8, [rsp + 2*16]
        vmovdqa xmm9, [rsp + 3*16]
        vmovdqa xmm10, [rsp + 4*16]
        vmovdqa xmm11, [rsp + 5*16]
        vmovdqa xmm12, [rsp + 6*16]
        vmovdqa xmm13, [rsp + 7*16]

        mov     rsi, [rsp + 8*16 + 0*8]
        mov     rdi, [rsp + 8*16 + 1*8]
        mov     r12, [rsp + 8*16 + 2*8]
        mov     r13, [rsp + 8*16 + 3*8]
        add     rsp, stack_size
%endm
%else
%define func(x) x: endbranch
%macro FUNC_SAVE 0
        push    r12
        push    r13
%endm

%macro FUNC_RESTORE 0
        pop     r13
        pop     r12
%endm
%endif
%define VECT_SIZE 16

[bits 64]
default rel
section .text

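;; set_long_icf_fg_06: arg1 = next_in, arg2 = end_processed (byte count
;; relative to next_in), arg3 = end_in (byte count relative to next_in),
;; arg4 = match_lookup; see the %define block above and the lea/add at the
;; top of the function body.
;;
;; Walks the ICF (intermediate compression format) codes in match_lookup,
;; VECT_SIZE (16) at a time. For lanes whose length field exceeds long_len it
;; re-checks the input bytes against the match source, extends the recorded
;; match length while the data keeps matching, and writes the updated codes
;; back into match_lookup.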
global set_long_icf_fg_06
func(set_long_icf_fg_06)
        endbranch
        FUNC_SAVE

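        ;; Compute the end pointers and broadcast the loop constants. end_in
        ;; is clamped to end_processed + LA_STATELESS, and end_processed is
        ;; pulled back 15 bytes so a full 16-lane iteration that starts below
        ;; it cannot run past the original limit.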
        lea     end_in, [next_in + arg3]
        add     end_processed, next_in
        mov     end_processed_orig, end_processed
        lea     tmp1, [end_processed + LA_STATELESS]
        cmp     end_in, tmp1
        cmovg   end_in, tmp1
        sub     end_processed, 15
        vpbroadcastd zlong_lens, [long_len]
        vpbroadcastd zlong_lens2, [long_len2]
        vpbroadcastd zlens_mask, [len_mask]
        vmovdqu16 zdists_start, [dist_start]
        vpbroadcastd zdists_mask, [dists_mask]
        vmovdqu32 zincrement, [increment]
        vbroadcasti64x2 zdatas_perm2, [datas_perm2]
        vbroadcasti64x2 zdatas_perm3, [datas_perm3]
        vmovdqu64 zqword_shuf, [qword_shuf]
        vbroadcasti64x2 zbswap, [bswap_shuf]
        vpbroadcastd znlen_mask, [nlen_mask]
        vpbroadcastd zvect_size, [vect_size]
        vpbroadcastd zmax_len, [max_len]
        vpbroadcastd ztwofiftyfour, [twofiftyfour]
        vpbroadcastd ztwofiftysix, [twofiftysix]
        vpbroadcastd ztwosixtytwo, [twosixtytwo]
        vmovdqu32 zmatch_lookup, [match_lookup]

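;; Main loop: each iteration handles VECT_SIZE (16) ICF codes of
;; ICF_CODE_BYTES each. zmatch_lookup2 takes the codes for the current
;; position while zmatch_lookup is refilled with the next vector, so the
;; following iteration's codes are always loaded one pass ahead.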
.fill_loop: ; Tahiti is a magical place
        vmovdqu32 zmatch_lookup2, zmatch_lookup
        vmovdqu32 zmatch_lookup, [match_lookup + ICF_CODE_BYTES * VECT_SIZE]

        cmp     next_in, end_processed
        jae     .end_fill

.finish_entry:
        vpandd  zlens, zmatch_lookup2, zlens_mask
        vpcmpgtd k3, zlens, zlong_lens

        ;; Speculatively increment
        add     next_in, VECT_SIZE
        add     match_lookup, ICF_CODE_BYTES * VECT_SIZE

        ktestw  k3, k3
        jz      .fill_loop

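        ;; At least one lane holds a long match. Recover each lane's distance:
        ;; the distance code selects a base from dist_start and the extra-bits
        ;; field supplies the remainder. The resulting per-lane offsets feed
        ;; two vpgatherdq loads (low and high eight lanes) that fetch 8 bytes
        ;; from the source side of each candidate match.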
        vpsrld  zdist_codes, zmatch_lookup2, DIST_OFFSET
        vpmovdw zdists %+ y, zdist_codes ; Relies on perm working mod 32
        vpermw  zdists, zdists, zdists_start
        vpmovzxwd zdists, zdists %+ y

        vpsrld  zdist_extras, zmatch_lookup2, EXTRA_BITS_OFFSET
        vpsubd  zdist_extras, zincrement, zdist_extras

        vpsubd  zdists, zdist_extras, zdists
        vextracti32x8 zdists2 %+ y, zdists, 1
        kmovb   k6, k3
        kshiftrw k7, k3, 8
        vpgatherdq zlens1 {k6}, [next_in + zdists %+ y - 8]
        vpgatherdq zlens2 {k7}, [next_in + zdists2 %+ y - 8]

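        ;; Extend the recorded lengths: build a sliding 8-byte window of the
        ;; current input for every lane, XOR it with the gathered match bytes,
        ;; byte-swap each qword so the lowest address lands in the most
        ;; significant byte, and turn the leading-zero count into a byte count
        ;; (lzcnt >> 3). That count, plus the twosixtytwo length-code base, is
        ;; folded into the cleared length field and stored for the k3 lanes.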
        vmovdqu8 datas %+ y, [next_in - 8]
        vpermq  zlookup, zdatas_perm2, datas
        vpshufb zlookup, zlookup, zqword_shuf
        vpermq  zlookup2, zdatas_perm3, datas
        vpshufb zlookup2, zlookup2, zqword_shuf

        vpxorq  zlens1, zlens1, zlookup
        vpxorq  zlens2, zlens2, zlookup2

        vpshufb zlens1, zlens1, zbswap
        vpshufb zlens2, zlens2, zbswap
        vplzcntq zlens1, zlens1
        vplzcntq zlens2, zlens2
        vpmovqd zlens1 %+ y, zlens1
        vpmovqd zlens2 %+ y, zlens2
        vinserti32x8 zlens1, zlens2 %+ y, 1
        vpsrld  zlens1 {k3}{z}, zlens1, 3

        vpandd  zmatch_lookup2 {k3}{z}, zmatch_lookup2, znlen_mask
        vpaddd  zmatch_lookup2 {k3}{z}, zmatch_lookup2, ztwosixtytwo
        vpaddd  zmatch_lookup2 {k3}{z}, zmatch_lookup2, zlens1

        vmovdqu32 [match_lookup - ICF_CODE_BYTES * VECT_SIZE] {k3}, zmatch_lookup2

        vpcmpgtd k3, zlens1, zlong_lens2
        ktestw  k3, k3
        jz      .fill_loop

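        ;; All 8 extra bytes matched in at least one lane (count above
        ;; long_len2), so that match may run longer still. Turn zdists back
        ;; into plain distances, pick the first qualifying lane, rewind
        ;; next_in/match_lookup to it, and measure the full match length with
        ;; compare_z against match_in = next_in - dist.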
        vpsubd  zdists, zincrement, zdists

        vpcompressd zdists2 {k3}, zdists
        vpcompressd zmatch_lookup2 {k3}, zmatch_lookup2
        kmovq   match_offset, k3
        tzcnt   match_offset, match_offset

        vmovd   dist %+ d, zdists2 %+ x
        lea     next_in, [next_in + match_offset - VECT_SIZE]
        lea     match_lookup, [match_lookup + ICF_CODE_BYTES * (match_offset - VECT_SIZE)]
        mov     match_in, next_in
        sub     match_in, dist

        mov     len, 16
        mov     tmp2, end_in
        sub     tmp2, next_in

        compare_z next_in, match_in, len, tmp2, tmp1, k3, ztmp1, ztmp2

        vpbroadcastd zlens1, len %+ d
        vpsubd  zlens1, zlens1, zincrement
        vpaddd  zlens1, zlens1, ztwofiftyfour

        mov     tmp2, end_processed
        sub     tmp2, next_in
        cmp     len, tmp2
        cmovg   len, tmp2

        add     next_in, len
        lea     match_lookup, [match_lookup + ICF_CODE_BYTES * len]
        vmovdqu32 zmatch_lookup, [match_lookup]

        vpbroadcastd zmatch_lookup2, zmatch_lookup2 %+ x
        vpandd  zmatch_lookup2, zmatch_lookup2, znlen_mask

        neg     len

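;; Rewrite the match_lookup entries covered by the long match. len is
;; negative, so indexing starts at the head of the match and advances 16
;; entries per pass, while zlens1 carries the remaining match length for each
;; position. An entry is overwritten (capped at max_len and combined with the
;; distance bits kept in zmatch_lookup2) while the remaining length both
;; exceeds the stored length and stays above twofiftysix; as soon as any lane
;; fails that test, control returns to .fill_loop.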
.update_match_lookup:
        vpandd  zlens2, zlens_mask, [match_lookup + ICF_CODE_BYTES * len]
        vpcmpgtd k3, zlens1, zlens2
        vpcmpgtd k4, zlens1, ztwofiftysix
        kandw   k3, k3, k4

        vpcmpgtd k4, zlens1, zmax_len
        vmovdqu32 zlens, zlens1
        vmovdqu32 zlens {k4}, zmax_len

        vpaddd  zlens2 {k3}{z}, zlens, zmatch_lookup2

        vmovdqu32 [match_lookup + ICF_CODE_BYTES * len] {k3}, zlens2

        knotw   k3, k3
        ktestw  k3, k3
        jnz     .fill_loop

        add     len, VECT_SIZE
        vpsubd  zlens1, zlens1, zvect_size

        jmp     .update_match_lookup

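;; Tail handling: fewer than 16 positions remain before the original
;; end_processed. Restore end_processed, mask zmatch_lookup2 down to the
;; still-valid lanes, and take one last pass through .finish_entry.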
.end_fill:
        mov     end_processed, end_processed_orig
        cmp     next_in, end_processed
        jge     .finish

        mov     tmp1, end_processed
        sub     tmp1, next_in
        vpbroadcastd ztmp1, tmp1 %+ d
        vpcmpd  k3, ztmp1, zincrement, 6
        vmovdqu32 zmatch_lookup2 {k3}{z}, zmatch_lookup2
        jmp     .finish_entry

.finish:

        FUNC_RESTORE
        ret

endproc_frame

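;; Lookup tables and constants. dist_start holds the base distance for each
;; deflate distance code, qword_shuf builds the sliding 8-byte windows used in
;; the main loop, and the named scalar constants spell out their decimal
;; values (twofiftyfour = 0xfe = 254, twofiftysix = 0x100 = 256,
;; twosixtytwo = 0x106 = 262).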
section .data
align 64
;; 64 byte data
dist_start:
        dw      0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0007, 0x0009, 0x000d
        dw      0x0011, 0x0019, 0x0021, 0x0031, 0x0041, 0x0061, 0x0081, 0x00c1
        dw      0x0101, 0x0181, 0x0201, 0x0301, 0x0401, 0x0601, 0x0801, 0x0c01
        dw      0x1001, 0x1801, 0x2001, 0x3001, 0x4001, 0x6001, 0x0000, 0x0000
qword_shuf:
        db      0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
        db      0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8
        db      0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9
        db      0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa
        db      0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb
        db      0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc
        db      0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd
        db      0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe
        db      0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf

;; 16 byte data
increment:
        dd      0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
        dd      0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf

datas_perm2:
        dq      0x0, 0x1
datas_perm3:
        dq      0x1, 0x2
bswap_shuf:
        db      0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
        db      0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08

;; 4 byte data
len_mask:
        dd      LIT_LEN_MASK
dists_mask:
        dd      LIT_DIST_MASK
long_len:
        dd      0x105
long_len2:
        dd      0x7
max_len:
        dd      0xfe + 0x102
vect_size:
        dd      VECT_SIZE
twofiftyfour:
        dd      0xfe
twofiftysix:
        dd      0x100
twosixtytwo:
        dd      0x106
nlen_mask:
        dd      0xfffffc00
%endif