;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%include "options.asm"
%ifndef UTILS_ASM
%define UTILS_ASM
; compare macros

;; sttni2 is faster, but it can't be debugged,
;; so the following code is based on "mine5"

;; compare 258 bytes = 8 * 32 + 2
;; tmp16 is a 16-bit version of tmp
;; compare258 src1, src2, result, tmp
%macro compare258 4
%define %%src1		%1
%define %%src2		%2
%define %%result	%3
%define %%tmp		%4
%define %%tmp16		%4w	; tmp as a 16-bit register

	xor	%%result, %%result
%%loop1:
	mov	%%tmp, [%%src1 + %%result]
	xor	%%tmp, [%%src2 + %%result]
	jnz	%%miscompare
	add	%%result, 8

	mov	%%tmp, [%%src1 + %%result]
	xor	%%tmp, [%%src2 + %%result]
	jnz	%%miscompare
	add	%%result, 8

	cmp	%%result, 256
	jb	%%loop1

	; compare last two bytes
	mov	%%tmp16, [%%src1 + %%result]
	xor	%%tmp16, [%%src2 + %%result]
	jnz	%%miscompare16

	; no miscompares, return 258
	add	%%result, 2
	jmp	%%end

%%miscompare16:
	and	%%tmp, 0xFFFF
%%miscompare:
	; find the first mismatching bit and convert it to a byte offset
	bsf	%%tmp, %%tmp
	shr	%%tmp, 3
	add	%%result, %%tmp
%%end:
%endm
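
;; Illustrative usage sketch (not part of the original file): a hypothetical
;; stand-alone wrapper around compare258, assuming the System V AMD64 ABI
;; (src1 in rdi, src2 in rsi, match length 0..258 returned in rax) and that
;; both buffers expose at least 258 readable bytes. The scratch register must
;; be one whose name forms a valid 16-bit name with a 'w' suffix (r8-r15),
;; since the macro derives %%tmp16 by appending 'w' to its fourth argument.
;; Guarded by a hypothetical define so this include file is unchanged by default.
%ifdef COMPARE_TYPES_EXAMPLES
section .text
global	compare258_example
compare258_example:
	compare258	rdi, rsi, rax, r11	; r11/r11w clobbered as scratch
	ret
%endif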

;; compare 258 bytes = 8 * 32 + 2, assuming first 8 bytes
;; were already checked
;; tmp16 is a 16-bit version of tmp
;; compare250 src1, src2, result, tmp
%macro compare250 4
%define %%src1		%1
%define %%src2		%2
%define %%result	%3
%define %%tmp		%4
%define %%tmp16		%4w	; tmp as a 16-bit register

	mov	%%result, 8
	mov	%%tmp, [%%src1 + 8]
	xor	%%tmp, [%%src2 + 8]
	jnz	%%miscompare
	add	%%result, 8

%%loop1:
	mov	%%tmp, [%%src1 + %%result]
	xor	%%tmp, [%%src2 + %%result]
	jnz	%%miscompare
	add	%%result, 8

	mov	%%tmp, [%%src1 + %%result]
	xor	%%tmp, [%%src2 + %%result]
	jnz	%%miscompare
	add	%%result, 8

	cmp	%%result, 256
	jb	%%loop1

	; compare last two bytes
	mov	%%tmp16, [%%src1 + %%result]
	xor	%%tmp16, [%%src2 + %%result]
	jnz	%%miscompare16

	; no miscompares, return 258
	add	%%result, 2
	jmp	%%end

%%miscompare16:
	and	%%tmp, 0xFFFF
%%miscompare:
	bsf	%%tmp, %%tmp
	shr	%%tmp, 3
	add	%%result, %%tmp
%%end:
%endm
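
;; Illustrative note (not from the original source): compare250 assumes the
;; caller has already verified that the first 8 bytes of the two buffers
;; match, so it starts scanning at offset 8 while still reporting the total
;; match length, including those 8 bytes. A hypothetical invocation, under
;; the same register assumptions as the compare258 example above:
;;
;;	compare250	rdi, rsi, rax, r11	; returns 8..258 in rax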

;; compare 258 bytes = 8 * 32 + 2
;; compares 16 bytes at a time, using pcmpeqb/pmovmskb
;; compare258_x src1, src2, result, tmp, xtmp1, xtmp2
%macro compare258_x 6
%define %%src1		%1
%define %%src2		%2
%define %%result	%3
%define %%tmp		%4
%define %%tmp32		%4d
%define %%tmp16		%4w	; tmp as a 16-bit register
%define %%xtmp		%5
%define %%xtmp2		%6

	xor	%%result, %%result
%%loop1:
	movdqu	%%xtmp, [%%src1 + %%result]
	movdqu	%%xtmp2, [%%src2 + %%result]
	pcmpeqb	%%xtmp, %%xtmp2
	pmovmskb	%%tmp32, %%xtmp
	; equal bytes set mask bits, so xor with 0xFFFF leaves set bits at mismatches
	xor	%%tmp, 0xFFFF
	jnz	%%miscompare
	add	%%result, 16

	movdqu	%%xtmp, [%%src1 + %%result]
	movdqu	%%xtmp2, [%%src2 + %%result]
	pcmpeqb	%%xtmp, %%xtmp2
	pmovmskb	%%tmp32, %%xtmp
	xor	%%tmp, 0xFFFF
	jnz	%%miscompare
	add	%%result, 16

	cmp	%%result, 256
	jb	%%loop1

	; compare last two bytes
	mov	%%tmp16, [%%src1 + %%result]
	xor	%%tmp16, [%%src2 + %%result]
	jnz	%%miscompare16

	; no miscompares, return 258
	add	%%result, 2
	jmp	%%end

%%miscompare16:
	and	%%tmp, 0xFFFF
	bsf	%%tmp, %%tmp
	shr	%%tmp, 3
	add	%%result, %%tmp
	jmp	%%end
%%miscompare:
	; the mask bit index is already a byte offset
	bsf	%%tmp, %%tmp
	add	%%result, %%tmp
%%end:
%endm
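
;; Worked example of the SSE mask trick above (illustrative, not from the
;; original source): pcmpeqb sets a lane to 0xFF where the 16-byte blocks
;; agree and to 0x00 where they differ; pmovmskb packs those lane MSBs into
;; a 16-bit mask, so xor with 0xFFFF leaves a set bit at each mismatching
;; byte. If only byte 5 differs, the mask is 0xFFDF, the xor yields 0x0020,
;; and bsf returns 5, which is added directly to %%result (no shr by 3 is
;; needed, unlike the scalar path). A hypothetical invocation:
;;
;;	compare258_x	rdi, rsi, rax, r11, xmm0, xmm1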

;; compare 258 bytes = 8 * 32 + 2, assuming first 8 bytes
;; were already checked
;; compares 16 bytes at a time, using pcmpeqb/pmovmskb
;; compare250_x src1, src2, result, tmp, xtmp1, xtmp2
%macro compare250_x 6
%define %%src1		%1
%define %%src2		%2
%define %%result	%3
%define %%tmp		%4
%define %%tmp32		%4d	; tmp as a 32-bit register
%define %%xtmp		%5
%define %%xtmp2		%6

	mov	%%result, 8
	movdqu	%%xtmp, [%%src1 + 8]
	movdqu	%%xtmp2, [%%src2 + 8]
	pcmpeqb	%%xtmp, %%xtmp2
	pmovmskb	%%tmp32, %%xtmp
	xor	%%tmp, 0xFFFF
	jnz	%%miscompare
	add	%%result, 16
%%loop1:
	movdqu	%%xtmp, [%%src1 + %%result]
	movdqu	%%xtmp2, [%%src2 + %%result]
	pcmpeqb	%%xtmp, %%xtmp2
	pmovmskb	%%tmp32, %%xtmp
	xor	%%tmp, 0xFFFF
	jnz	%%miscompare
	add	%%result, 16

	movdqu	%%xtmp, [%%src1 + %%result]
	movdqu	%%xtmp2, [%%src2 + %%result]
	pcmpeqb	%%xtmp, %%xtmp2
	pmovmskb	%%tmp32, %%xtmp
	xor	%%tmp, 0xFFFF
	jnz	%%miscompare
	add	%%result, 16

	cmp	%%result, 258 - 16
	jb	%%loop1

	movdqu	%%xtmp, [%%src1 + %%result]
	movdqu	%%xtmp2, [%%src2 + %%result]
	pcmpeqb	%%xtmp, %%xtmp2
	pmovmskb	%%tmp32, %%xtmp
	xor	%%tmp, 0xFFFF
	jnz	%%miscompare_last
	; no miscompares, return 258
	mov	%%result, 258
	jmp	%%end

%%miscompare_last:
	bsf	%%tmp, %%tmp
	add	%%result, %%tmp

	;; Guarantee the result has length at most 258.
	mov	%%tmp, 258
	cmp	%%result, 258
	cmova	%%result, %%tmp
	jmp	%%end
%%miscompare:
	bsf	%%tmp, %%tmp
	add	%%result, %%tmp
%%end:
%endm
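
;; Illustrative note (not from the original source): compare250_x scans from
;; offset 8 in 16-byte blocks, so the final block covers bytes 248..263 and
;; can report a first difference past byte 257; the cmova against 258 above
;; clamps the result so callers never see a match length longer than the
;; 258-byte maximum. A hypothetical invocation mirrors compare258_x:
;;
;;	compare250_x	rdi, rsi, rax, r11, xmm0, xmm1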

;; compare 258 bytes = 8 * 32 + 2
;; compares 32 bytes at a time, using vpcmpeqb/vpmovmskb
;; compare258_y src1, src2, result, tmp, ytmp1, ytmp2
%macro compare258_y 6
%define %%src1		%1
%define %%src2		%2
%define %%result	%3
%define %%tmp		%4
%define %%tmp16		%4w	; tmp as a 16-bit register
%define %%tmp32		%4d	; tmp as a 32-bit register
%define %%ytmp		%5
%define %%ytmp2		%6

	xor	%%result, %%result
%%loop1:
	vmovdqu	%%ytmp, [%%src1 + %%result]
	vmovdqu	%%ytmp2, [%%src2 + %%result]
	vpcmpeqb	%%ytmp, %%ytmp, %%ytmp2
	vpmovmskb	%%tmp, %%ytmp
	xor	%%tmp32, 0xFFFFFFFF
	jnz	%%miscompare
	add	%%result, 32

	vmovdqu	%%ytmp, [%%src1 + %%result]
	vmovdqu	%%ytmp2, [%%src2 + %%result]
	vpcmpeqb	%%ytmp, %%ytmp, %%ytmp2
	vpmovmskb	%%tmp, %%ytmp
	xor	%%tmp32, 0xFFFFFFFF
	jnz	%%miscompare
	add	%%result, 32

	cmp	%%result, 256
	jb	%%loop1

	; compare last two bytes
	mov	%%tmp16, [%%src1 + %%result]
	xor	%%tmp16, [%%src2 + %%result]
	jnz	%%miscompare16

	; no miscompares, return 258
	add	%%result, 2
	jmp	%%end

%%miscompare16:
	and	%%tmp, 0xFFFF
	bsf	%%tmp, %%tmp
	shr	%%tmp, 3
	add	%%result, %%tmp
	jmp	%%end
%%miscompare:
	bsf	%%tmp, %%tmp
	add	%%result, %%tmp
%%end:
%endm
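
;; Illustrative note (not from the original source): compare258_y is the
;; AVX2 counterpart of compare258_x; vpmovmskb yields a 32-bit mask per
;; 32-byte block, so the complement is taken against 0xFFFFFFFF instead of
;; 0xFFFF. A hypothetical invocation with YMM temporaries:
;;
;;	compare258_y	rdi, rsi, rax, r11, ymm0, ymm1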


;; compare 258 bytes = 8 * 32 + 2, assuming first 8 bytes
;; were already checked
;; compares 32 bytes at a time, using vpcmpeqb/vpmovmskb
;; compare250_y src1, src2, result, tmp, ytmp1, ytmp2
%macro compare250_y 6
%define %%src1		%1
%define %%src2		%2
%define %%result	%3
%define %%tmp		%4
%define %%tmp16		%4w	; tmp as a 16-bit register
%define %%tmp32		%4d	; tmp as a 32-bit register
%define %%ytmp		%5
%define %%ytmp2		%6

	mov	%%result, 8
	vmovdqu	%%ytmp, [%%src1 + 8]
	vmovdqu	%%ytmp2, [%%src2 + 8]
	vpcmpeqb	%%ytmp, %%ytmp, %%ytmp2
	vpmovmskb	%%tmp, %%ytmp
	xor	%%tmp32, 0xFFFFFFFF
	jnz	%%miscompare
	add	%%result, 32
%%loop1:
	vmovdqu	%%ytmp, [%%src1 + %%result]
	vmovdqu	%%ytmp2, [%%src2 + %%result]
	vpcmpeqb	%%ytmp, %%ytmp, %%ytmp2
	vpmovmskb	%%tmp, %%ytmp
	xor	%%tmp32, 0xFFFFFFFF
	jnz	%%miscompare
	add	%%result, 32

	vmovdqu	%%ytmp, [%%src1 + %%result]
	vmovdqu	%%ytmp2, [%%src2 + %%result]
	vpcmpeqb	%%ytmp, %%ytmp, %%ytmp2
	vpmovmskb	%%tmp, %%ytmp
	xor	%%tmp32, 0xFFFFFFFF
	jnz	%%miscompare
	add	%%result, 32

	cmp	%%result, 258 - 32
	jb	%%loop1

	vmovdqu	%%ytmp, [%%src1 + %%result]
	vmovdqu	%%ytmp2, [%%src2 + %%result]
	vpcmpeqb	%%ytmp, %%ytmp, %%ytmp2
	vpmovmskb	%%tmp, %%ytmp
	xor	%%tmp32, 0xFFFFFFFF
	jnz	%%miscompare_last
	mov	%%result, 258
	jmp	%%end

%%miscompare_last:
	bsf	%%tmp, %%tmp
	add	%%result, %%tmp

	;; Guarantee the result has length at most 258.
	mov	%%tmp, 258
	cmp	%%result, 258
	cmova	%%result, %%tmp
	jmp	%%end

%%miscompare:
	bsf	%%tmp, %%tmp
	add	%%result, %%tmp
%%end:
%endm
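
;; Illustrative note (not from the original source): as in compare250_x, the
;; final 32-byte block here spans bytes 232..263, so a first difference
;; detected past byte 257 is clamped back to 258 with cmova. A hypothetical
;; invocation:
;;
;;	compare250_y	rdi, rsi, rax, r11, ymm0, ymm1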

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; compare size, src1, src2, result, tmp
%macro compare 5
%define %%size		%1
%define %%src1		%2
%define %%src2		%3
%define %%result	%4
%define %%tmp		%5
%define %%tmp8		%5b	; tmp as an 8-bit register

	xor	%%result, %%result
	sub	%%size, 7
	jle	%%lab2
%%loop1:
	mov	%%tmp, [%%src1 + %%result]
	xor	%%tmp, [%%src2 + %%result]
	jnz	%%miscompare
	add	%%result, 8
	sub	%%size, 8
	jg	%%loop1
%%lab2:
	;; if we fall through from above, we have found no mismatches,
	;; %%size+7 is the number of bytes left to look at, and %%result is the
	;; number of bytes that have matched
	add	%%size, 7
	jle	%%end
%%loop3:
	mov	%%tmp8, [%%src1 + %%result]
	cmp	%%tmp8, [%%src2 + %%result]
	jne	%%end
	inc	%%result
	dec	%%size
	jg	%%loop3
	jmp	%%end
%%miscompare:
	bsf	%%tmp, %%tmp
	shr	%%tmp, 3
	add	%%result, %%tmp
%%end:
%endm
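
;; Illustrative usage sketch (not part of the original file): the generic
;; compare macro takes an explicit byte count and destroys it. A hypothetical
;; call comparing rcx bytes (rcx is clobbered, r11/r11b used as scratch,
;; match length returned in rax):
;;
;;	compare	rcx, rdi, rsi, rax, r11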

%endif ;UTILS_ASM