]> git.proxmox.com Git - ceph.git/blame - ceph/src/isa-l/igzip/igzip_compare_types.asm
update sources to v12.1.1
[ceph.git] / ceph / src / isa-l / igzip / igzip_compare_types.asm
CommitLineData
7c673cae
FG
1;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
3;
4; Redistribution and use in source and binary forms, with or without
5; modification, are permitted provided that the following conditions
6; are met:
7; * Redistributions of source code must retain the above copyright
8; notice, this list of conditions and the following disclaimer.
9; * Redistributions in binary form must reproduce the above copyright
10; notice, this list of conditions and the following disclaimer in
11; the documentation and/or other materials provided with the
12; distribution.
13; * Neither the name of Intel Corporation nor the names of its
14; contributors may be used to endorse or promote products derived
15; from this software without specific prior written permission.
16;
17; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29
30%include "options.asm"
224ce89b
WB
31%include "stdmac.asm"
32
7c673cae
FG
33%ifndef UTILS_ASM
34%define UTILS_ASM
35; compare macro
36
37;; sttni2 is faster, but it can't be debugged
38;; so following code is based on "mine5"
39
;; compare258 src1, src2, result, tmp
;;
;; Count the leading bytes that are equal in the two 258-byte regions at
;; src1 and src2 (258 = 32 * 8 + 2), comparing 8 bytes per step.
;;   src1, src2 : registers holding the two addresses; only read
;;   result     : register that receives the match length, 0..258
;;   tmp        : 8-byte scratch register, clobbered; its 16-bit alias is
;;                formed by appending "w" to the register name
;; Flags are clobbered.
%macro compare258 4
%define %%ptr_a		%1
%define %%ptr_b		%2
%define %%len		%3
%define %%diff		%4
%define %%diff16	%4w	; 16-bit view of diff

	xor	%%len, %%len
%%next_pair:
	;; two 8-byte compares per pass, so len advances by 16 up to 256
%rep 2
	mov	%%diff, [%%ptr_a + %%len]
	xor	%%diff, [%%ptr_b + %%len]	; set bits mark differing bits
	jnz	%%found_diff
	add	%%len, 8
%endrep

	cmp	%%len, 256
	jb	%%next_pair

	;; bytes 256 and 257
	mov	%%diff16, [%%ptr_a + %%len]
	xor	%%diff16, [%%ptr_b + %%len]
	jnz	%%found_diff16

	;; every byte matched: len = 256 + 2
	add	%%len, 2
	jmp	%%done

%%found_diff16:
	and	%%diff, 0xFFFF		; keep only the two bytes just compared
%%found_diff:
	bsf	%%diff, %%diff		; index of the lowest differing bit
	shr	%%diff, 3		; bit index -> byte index
	add	%%len, %%diff
%%done:
%endm
82
;; compare250 src1, src2, result, tmp
;;
;; Match length of the two 258-byte regions at src1 and src2, starting the
;; comparison at offset 8: bytes 0..7 are never read and are counted as
;; matching (presumably the caller has already verified them).
;; Compares 8 bytes per step, then the final 2 bytes (8 + 31 * 8 + 2 = 258).
;;   src1, src2 : registers holding the two addresses; only read
;;   result     : register that receives the match length, 8..258
;;   tmp        : 8-byte scratch register, clobbered; its 16-bit alias is
;;                formed by appending "w" to the register name
;; Flags are clobbered.
%macro compare250 4
%define %%ptr_a		%1
%define %%ptr_b		%2
%define %%len		%3
%define %%diff		%4
%define %%diff16	%4w	; 16-bit view of diff

	;; first step uses a fixed offset of 8
	mov	%%len, 8
	mov	%%diff, [%%ptr_a + 8]
	xor	%%diff, [%%ptr_b + 8]
	jnz	%%found_diff
	add	%%len, 8

%%next_pair:
	;; two 8-byte compares per pass: len goes 16, 32, ..., 256
%rep 2
	mov	%%diff, [%%ptr_a + %%len]
	xor	%%diff, [%%ptr_b + %%len]	; set bits mark differing bits
	jnz	%%found_diff
	add	%%len, 8
%endrep

	cmp	%%len, 256
	jb	%%next_pair

	;; bytes 256 and 257
	mov	%%diff16, [%%ptr_a + %%len]
	xor	%%diff16, [%%ptr_b + %%len]
	jnz	%%found_diff16

	;; every byte matched: len = 256 + 2
	add	%%len, 2
	jmp	%%done

%%found_diff16:
	and	%%diff, 0xFFFF		; keep only the two bytes just compared
%%found_diff:
	bsf	%%diff, %%diff		; index of the lowest differing bit
	shr	%%diff, 3		; bit index -> byte index
	add	%%len, %%diff
%%done:
%endm
130
;; compare258_x src1, src2, result, tmp, xtmp1, xtmp2
;;
;; Count the leading bytes that are equal in the two 258-byte regions at
;; src1 and src2 (258 = 16 * 16 + 2), comparing 16 bytes per step with a
;; byte-wise vector compare and a byte mask.
;;   src1, src2   : registers holding the two addresses; only read
;;   result       : register that receives the match length, 0..258
;;   tmp          : scratch register, clobbered; the aliases formed by
;;                  appending "d" and "w" to its name are used as well
;;   xtmp1, xtmp2 : 16-byte vector scratch registers, clobbered
;; Flags are clobbered.
;; MOVDQU / PCMPEQB / PMOVMSKB are not defined in this file; the
;; three-operand PCMPEQB implies macro wrappers -- presumably supplied by
;; the files included at the top.
%macro compare258_x 6
%define %%ptr_a		%1
%define %%ptr_b		%2
%define %%len		%3
%define %%mask		%4
%define %%mask32	%4d	; 32-bit view of mask
%define %%mask16	%4w	; 16-bit view of mask
%define %%va		%5
%define %%vb		%6

	xor	%%len, %%len
%%next_pair:
	;; two 16-byte compares per pass: len goes 32, 64, ..., 256
%rep 2
	MOVDQU	%%va, [%%ptr_a + %%len]
	MOVDQU	%%vb, [%%ptr_b + %%len]
	PCMPEQB	%%va, %%va, %%vb
	PMOVMSKB	%%mask32, %%va	; one bit per byte, set where equal
	xor	%%mask, 0xFFFF		; invert: set bits now mark mismatches
	jnz	%%found_diff
	add	%%len, 16
%endrep

	cmp	%%len, 256
	jb	%%next_pair

	;; bytes 256 and 257, compared as one 16-bit word
	mov	%%mask16, [%%ptr_a + %%len]
	xor	%%mask16, [%%ptr_b + %%len]
	jnz	%%found_diff16

	;; every byte matched: len = 256 + 2
	add	%%len, 2
	jmp	%%done

%%found_diff16:
	;; here the register holds a bit-wise xor, so the bit index has to be
	;; scaled down to a byte index
	and	%%mask, 0xFFFF
	bsf	%%mask, %%mask
	shr	%%mask, 3
	add	%%len, %%mask
	jmp	%%done
%%found_diff:
	;; here the register holds one bit per byte: bit index == byte index
	bsf	%%mask, %%mask
	add	%%len, %%mask
%%done:
%endm
185
;; compare250_x src1, src2, result, tmp, xtmp1, xtmp2
;;
;; Match length of the two 258-byte regions at src1 and src2, assuming the
;; first 8 bytes were already checked: the comparison starts at offset 8.
;; Compares 16 bytes per step with a byte-wise vector compare and a byte
;; mask.
;;   src1, src2   : registers holding the two addresses; only read
;;   result       : register that receives the match length, 8..258
;;   tmp          : scratch register, clobbered; the alias formed by
;;                  appending "d" to its name is used as well
;;   xtmp1, xtmp2 : 16-byte vector scratch registers, clobbered
;; Flags are clobbered.
;; The last 16-byte load starts at offset 248 and therefore reads offsets
;; up to 263; both buffers must be readable that far.  The result is still
;; limited to 258.
;; MOVDQU / PCMPEQB / PMOVMSKB are not defined in this file; the
;; three-operand PCMPEQB implies macro wrappers -- presumably supplied by
;; the files included at the top.
%macro compare250_x 6
%define %%ptr_a		%1
%define %%ptr_b		%2
%define %%len		%3
%define %%mask		%4
%define %%mask32	%4d	; 32-bit view of mask
%define %%va		%5
%define %%vb		%6

	;; first step uses a fixed offset of 8 and covers bytes 8..23
	mov	%%len, 8
	MOVDQU	%%va, [%%ptr_a + 8]
	MOVDQU	%%vb, [%%ptr_b + 8]
	PCMPEQB	%%va, %%va, %%vb
	PMOVMSKB	%%mask32, %%va	; one bit per byte, set where equal
	xor	%%mask, 0xFFFF		; invert: set bits now mark mismatches
	jnz	%%found_diff
	add	%%len, 16
%%next_pair:
	;; two 16-byte compares per pass: len goes 24, 56, ..., 248
%rep 2
	MOVDQU	%%va, [%%ptr_a + %%len]
	MOVDQU	%%vb, [%%ptr_b + %%len]
	PCMPEQB	%%va, %%va, %%vb
	PMOVMSKB	%%mask32, %%va
	xor	%%mask, 0xFFFF
	jnz	%%found_diff
	add	%%len, 16
%endrep

	cmp	%%len, 258 - 16
	jb	%%next_pair

	;; final 16-byte step; it extends past byte 257
	MOVDQU	%%va, [%%ptr_a + %%len]
	MOVDQU	%%vb, [%%ptr_b + %%len]
	PCMPEQB	%%va, %%va, %%vb
	PMOVMSKB	%%mask32, %%va
	xor	%%mask, 0xFFFF
	jnz	%%found_diff_tail
	;; every byte matched
	mov	%%len, 258
	jmp	%%done

%%found_diff_tail:
	bsf	%%mask, %%mask		; bit index == byte index
	add	%%len, %%mask

	;; a mismatch located beyond byte 257 still means a full match:
	;; len = min(len, 258)
	mov	%%mask, 258
	cmp	%%len, 258
	cmova	%%len, %%mask
	jmp	%%done
%%found_diff:
	bsf	%%mask, %%mask		; bit index == byte index
	add	%%len, %%mask
%%done:
%endm
251
;; compare258_y src1, src2, result, tmp, ytmp1, ytmp2
;;
;; Count the leading bytes that are equal in the two 258-byte regions at
;; src1 and src2 (258 = 8 * 32 + 2), comparing 32 bytes per step with
;; vpcmpeqb / vpmovmskb.
;;   src1, src2   : registers holding the two addresses; only read
;;   result       : register that receives the match length, 0..258
;;   tmp          : scratch register, clobbered; the aliases formed by
;;                  appending "d" and "w" to its name are used as well
;;   ytmp1, ytmp2 : 32-byte vector scratch registers, clobbered
;; Flags are clobbered.
%macro compare258_y 6
%define %%ptr_a		%1
%define %%ptr_b		%2
%define %%len		%3
%define %%mask		%4
%define %%mask16	%4w	; 16-bit view of mask
%define %%mask32	%4d	; 32-bit view of mask
%define %%va		%5
%define %%vb		%6

	xor	%%len, %%len
%%next_pair:
	;; two 32-byte compares per pass: len goes 64, 128, 192, 256
%rep 2
	vmovdqu	%%va, [%%ptr_a + %%len]
	vmovdqu	%%vb, [%%ptr_b + %%len]
	vpcmpeqb	%%va, %%va, %%vb
	vpmovmskb	%%mask, %%va		; one bit per byte, set where equal
	xor	%%mask32, 0xFFFFFFFF		; invert: set bits mark mismatches
	jnz	%%found_diff
	add	%%len, 32
%endrep

	cmp	%%len, 256
	jb	%%next_pair

	;; bytes 256 and 257, compared as one 16-bit word
	mov	%%mask16, [%%ptr_a + %%len]
	xor	%%mask16, [%%ptr_b + %%len]
	jnz	%%found_diff16

	;; every byte matched: len = 256 + 2
	add	%%len, 2
	jmp	%%done

%%found_diff16:
	;; here the register holds a bit-wise xor, so the bit index has to be
	;; scaled down to a byte index
	and	%%mask, 0xFFFF
	bsf	%%mask, %%mask
	shr	%%mask, 3
	add	%%len, %%mask
	jmp	%%done
%%found_diff:
	;; here the register holds one bit per byte: bit index == byte index
	bsf	%%mask, %%mask
	add	%%len, %%mask
%%done:
%endm
306
307
;; compare250_y src1, src2, result, tmp, ytmp1, ytmp2
;;
;; Match length of the two 258-byte regions at src1 and src2, assuming the
;; first 8 bytes were already checked: the comparison starts at offset 8.
;; Compares 32 bytes per step with vpcmpeqb / vpmovmskb.
;;   src1, src2   : registers holding the two addresses; only read
;;   result       : register that receives the match length, 8..258
;;   tmp          : scratch register, clobbered; the alias formed by
;;                  appending "d" to its name is used as well
;;   ytmp1, ytmp2 : 32-byte vector scratch registers, clobbered
;; Flags are clobbered.
;; The last 32-byte load starts at offset 232 and therefore reads offsets
;; up to 263; both buffers must be readable that far.  The result is still
;; limited to 258.
%macro compare250_y 6
%define %%src1		%1
%define %%src2		%2
%define %%result	%3
%define %%tmp		%4
%define %%tmp32		%4d	; tmp as a 32-bit register
%define %%ytmp		%5
%define %%ytmp2		%6

	;; first step uses a fixed offset of 8 and covers bytes 8..39
	mov	%%result, 8
	vmovdqu	%%ytmp, [%%src1 + 8]
	vmovdqu	%%ytmp2, [%%src2 + 8]
	vpcmpeqb	%%ytmp, %%ytmp, %%ytmp2
	vpmovmskb	%%tmp, %%ytmp		; one bit per byte, set where equal
	xor	%%tmp32, 0xFFFFFFFF		; invert: set bits mark mismatches
	jnz	%%miscompare
	add	%%result, 32
%%loop1:
	;; two 32-byte compares per pass: result goes 40, 104, 168, 232
%rep 2
	vmovdqu	%%ytmp, [%%src1 + %%result]
	vmovdqu	%%ytmp2, [%%src2 + %%result]
	vpcmpeqb	%%ytmp, %%ytmp, %%ytmp2
	vpmovmskb	%%tmp, %%ytmp
	xor	%%tmp32, 0xFFFFFFFF
	jnz	%%miscompare
	add	%%result, 32
%endrep

	cmp	%%result, 258 - 32
	jb	%%loop1

	;; final 32-byte step; it extends past byte 257
	vmovdqu	%%ytmp, [%%src1 + %%result]
	vmovdqu	%%ytmp2, [%%src2 + %%result]
	vpcmpeqb	%%ytmp, %%ytmp, %%ytmp2
	vpmovmskb	%%tmp, %%ytmp
	xor	%%tmp32, 0xFFFFFFFF
	jnz	%%miscompare_last
	;; no miscompares, return 258
	mov	%%result, 258
	jmp	%%end

%%miscompare_last:
	bsf	%%tmp, %%tmp		; bit index == byte index
	add	%%result, %%tmp

	;; Guarantee the result has length at most 258: a mismatch located
	;; beyond byte 257 still counts as a full match.
	mov	%%tmp, 258
	cmp	%%result, 258
	cmova	%%result, %%tmp
	jmp	%%end

%%miscompare:
	bsf	%%tmp, %%tmp		; bit index == byte index
	add	%%result, %%tmp
%%end:
%endm
374
375;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
376;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
377
;; compare size, src1, src2, result, tmp
;;
;; Count the leading bytes, at most size, that are equal at src1 and src2.
;; Works 8 bytes at a time while at least 8 bytes remain, then one byte at
;; a time.
;;   size       : register holding the byte count; compared as a signed
;;                value and destroyed
;;   src1, src2 : registers holding the two addresses; only read
;;   result     : register that receives the match length (0 when size <= 0)
;;   tmp        : 8-byte scratch register, clobbered; its 8-bit alias is
;;                formed by appending "b" to the register name
;; Flags are clobbered.
%macro compare 5
%define %%remain	%1
%define %%ptr_a		%2
%define %%ptr_b		%3
%define %%len		%4
%define %%diff		%5
%define %%diff8		%5b	; 8-bit view of diff

	xor	%%len, %%len
	;; bias the count by 7 so "remain > 0" means a full 8 bytes are left
	sub	%%remain, 7
	jle	%%bytewise
%%qword_loop:
	mov	%%diff, [%%ptr_a + %%len]
	xor	%%diff, [%%ptr_b + %%len]	; set bits mark differing bits
	jnz	%%found_diff
	add	%%len, 8
	sub	%%remain, 8
	jg	%%qword_loop
%%bytewise:
	;; reached without any mismatch: len bytes have matched and
	;; remain + 7 bytes are still unchecked; undo the bias
	add	%%remain, 7
	jle	%%done
%%byte_loop:
	mov	%%diff8, [%%ptr_a + %%len]
	cmp	%%diff8, [%%ptr_b + %%len]
	jne	%%done
	inc	%%len
	dec	%%remain
	jg	%%byte_loop
	jmp	%%done
%%found_diff:
	bsf	%%diff, %%diff		; index of the lowest differing bit
	shr	%%diff, 3		; bit index -> byte index
	add	%%len, %%diff
%%done:
%endm
417
418%endif ;UTILS_ASM