1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
30 %include "options.asm"
35 ;; sttni2 is faster, but it can't be debugged,
36 ;; so the following code is based on "mine5"
38 ;; compare 258 bytes = 8 * 32 + 2
39 ;; tmp16 is a 16-bit version of tmp
40 ;; compare258 src1, src2, result, tmp
;;   src1, src2 - base addresses of the two buffers being compared
;;   result     - index added to both bases; cleared before the first load
;;   tmp        - scratch, overwritten with the XOR of the loaded data
46 %define %%tmp16 %4w ; tmp as a 16-bit register
48 xor %%result, %%result ; begin at offset 0
50 mov %%tmp, [%%src1 + %%result] ; load a tmp-sized chunk from src1
51 xor %%tmp, [%%src2 + %%result] ; tmp is non-zero (ZF clear) iff the chunks differ
55 mov %%tmp, [%%src1 + %%result] ; second copy of the same load/XOR test
56 xor %%tmp, [%%src2 + %%result]
63 ; compare last two bytes
64 mov %%tmp16, [%%src1 + %%result] ; 16-bit load, writes only the low word of tmp
65 xor %%tmp16, [%%src2 + %%result] ; ZF clear iff the two trailing bytes differ
68 ; no miscompares, return 258
81 ;; compare 258 bytes = 8 * 32 + 2
;; The first load below is at offset 8 rather than 0 - presumably the
;; caller has already checked the first 8 bytes, as stated for
;; compare250_x; TODO confirm.
82 ;; tmp16 is a 16-bit version of tmp
83 ;; compare258 src1, src2, result, tmp
;; NOTE(review): the usage line above names compare258; confirm that it
;; matches the name on this macro's %macro line.
89 %define %%tmp16 %4w ; tmp as a 16-bit register
92 mov %%tmp, [%%src1 + 8] ; first chunk tested is the one at fixed offset 8
93 xor %%tmp, [%%src2 + 8] ; tmp is non-zero (ZF clear) iff the chunks differ
98 mov %%tmp, [%%src1 + %%result] ; later chunks are addressed through result
99 xor %%tmp, [%%src2 + %%result]
103 mov %%tmp, [%%src1 + %%result] ; second copy of the same load/XOR test
104 xor %%tmp, [%%src2 + %%result]
111 ; compare last two bytes
112 mov %%tmp16, [%%src1 + %%result] ; 16-bit load, writes only the low word of tmp
113 xor %%tmp16, [%%src2 + %%result] ; ZF clear iff the two trailing bytes differ
116 ; no miscompares, return 258
129 ;; compare 258 bytes = 8 * 32 + 2
130 ;; compares 16 bytes at a time, using pcmpeqb/pmovmskb
131 ;; compare258_x src1, src2, result, tmp, xtmp1, xtmp2
;;   src1, src2   - base addresses of the two buffers being compared
;;   result       - index added to both bases; cleared before the first load
;;   tmp          - general-purpose scratch; its 32-bit form receives the byte mask
;;   xtmp1, xtmp2 - vector scratch registers (presumably the %%xtmp / %%xtmp2
;;                  used below - verify against the %define lines)
132 %macro compare258_x 6
138 %define %%tmp16 %4w ; tmp as a 16-bit register
142 xor %%result, %%result ; begin at offset 0
144 movdqu %%xtmp, [%%src1 + %%result] ; unaligned 16-byte loads from both buffers
145 movdqu %%xtmp2, [%%src2 + %%result]
146 pcmpeqb %%xtmp, %%xtmp2 ; each byte becomes 0xFF where equal, 0x00 where different
147 pmovmskb %%tmp32, %%xtmp ; bit i of tmp32 = 1 iff byte i matched (0xFFFF = all equal)
152 movdqu %%xtmp, [%%src1 + %%result] ; second copy of the same 16-byte test
153 movdqu %%xtmp2, [%%src2 + %%result]
154 pcmpeqb %%xtmp, %%xtmp2
155 pmovmskb %%tmp32, %%xtmp
163 ; compare last two bytes
164 mov %%tmp16, [%%src1 + %%result] ; 16-bit load, writes only the low word of tmp
165 xor %%tmp16, [%%src2 + %%result] ; ZF clear iff the two trailing bytes differ
168 ; no miscompares, return 258
184 ;; compare 258 bytes = 8 * 32 + 2, assuming first 8 bytes
185 ;; were already checked
186 ;; compares 16 bytes at a time, using pcmpeqb/pmovmskb
187 ;; compare250_x src1, src2, result, tmp, xtmp1, xtmp2
;;   src1, src2   - base addresses of the two buffers being compared
;;   result       - index added to both bases; clamped to 258 at the end
;;   tmp          - general-purpose scratch; its 32-bit form receives the byte mask
;;   xtmp1, xtmp2 - vector scratch registers (presumably the %%xtmp / %%xtmp2
;;                  used below - verify against the %define lines)
188 %macro compare250_x 6
193 %define %%tmp32 %4d ; tmp as a 32-bit register
198 movdqu %%xtmp, [%%src1 + 8] ; first 16-byte test starts at fixed offset 8
199 movdqu %%xtmp2, [%%src2 + 8]
200 pcmpeqb %%xtmp, %%xtmp2 ; each byte becomes 0xFF where equal, 0x00 where different
201 pmovmskb %%tmp32, %%xtmp ; bit i of tmp32 = 1 iff byte i matched (0xFFFF = all equal)
206 movdqu %%xtmp, [%%src1 + %%result] ; later chunks are addressed through result
207 movdqu %%xtmp2, [%%src2 + %%result]
208 pcmpeqb %%xtmp, %%xtmp2
209 pmovmskb %%tmp32, %%xtmp
214 movdqu %%xtmp, [%%src1 + %%result] ; second copy of the same 16-byte test
215 movdqu %%xtmp2, [%%src2 + %%result]
216 pcmpeqb %%xtmp, %%xtmp2
217 pmovmskb %%tmp32, %%xtmp
222 cmp %%result, 258 - 16 ; 242 is the last offset whose 16-byte load ends within 258 bytes
225 movdqu %%xtmp, [%%src1 + %%result] ; final 16-byte test
226 movdqu %%xtmp2, [%%src2 + %%result]
227 pcmpeqb %%xtmp, %%xtmp2
228 pmovmskb %%tmp32, %%xtmp
230 jnz %%miscompare_last ; taken when the preceding flag-setting instruction left ZF clear
231 ; no miscompares, return 258
239 ;; Guarantee the result has length at most 258.
242 cmova %%result, %%tmp ; replace result with tmp when the preceding compare was unsigned-above
250 ;; compare 258 bytes = 8 * 32 + 2
251 ;; compares 32 bytes at a time, using vpcmpeqb/vpmovmskb
252 ;; compare258_y src1, src2, result, tmp, ytmp1, ytmp2
;;   src1, src2   - base addresses of the two buffers being compared
;;   result       - index added to both bases; cleared before the first load
;;   tmp          - general-purpose scratch; receives the byte mask
;;   ytmp1, ytmp2 - vector scratch registers (presumably the %%ytmp / %%ytmp2
;;                  used below - verify against the %define lines)
253 %macro compare258_y 6
258 %define %%tmp16 %4w ; tmp as a 16-bit register
259 %define %%tmp32 %4d ; tmp as a 32-bit register
263 xor %%result, %%result ; begin at offset 0
265 vmovdqu %%ytmp, [%%src1 + %%result] ; unaligned vector loads from both buffers
266 vmovdqu %%ytmp2, [%%src2 + %%result]
267 vpcmpeqb %%ytmp, %%ytmp, %%ytmp2 ; each byte becomes 0xFF where equal, 0x00 where different
268 vpmovmskb %%tmp, %%ytmp ; bit i of tmp = 1 iff byte i matched
269 xor %%tmp32, 0xFFFFFFFF ; invert: set bits now mark mismatches, ZF set iff all 32 matched
273 vmovdqu %%ytmp, [%%src1 + %%result] ; second copy of the same 32-byte test
274 vmovdqu %%ytmp2, [%%src2 + %%result]
275 vpcmpeqb %%ytmp, %%ytmp, %%ytmp2
276 vpmovmskb %%tmp, %%ytmp
277 xor %%tmp32, 0xFFFFFFFF
284 ; compare last two bytes
285 mov %%tmp16, [%%src1 + %%result] ; 16-bit load, writes only the low word of tmp
286 xor %%tmp16, [%%src2 + %%result] ; ZF clear iff the two trailing bytes differ
289 ; no miscompares, return 258
306 ;; compare 258 bytes = 8 * 32 + 2, assuming first 8 bytes
307 ;; were already checked
308 ;; compares 32 bytes at a time, using vpcmpeqb/vpmovmskb
309 ;; compare250_y src1, src2, result, tmp, ytmp1, ytmp2
;;   src1, src2   - base addresses of the two buffers being compared
;;   result       - index added to both bases; clamped to 258 at the end
;;   tmp          - general-purpose scratch; receives the byte mask
;;   ytmp1, ytmp2 - vector scratch registers (presumably the %%ytmp / %%ytmp2
;;                  used below - verify against the %define lines)
310 %macro compare250_y 6
315 %define %%tmp16 %4w ; tmp as a 16-bit register
316 %define %%tmp32 %4d ; tmp as a 32-bit register
321 vmovdqu %%ytmp, [%%src1 + 8] ; first 32-byte test starts at fixed offset 8
322 vmovdqu %%ytmp2, [%%src2 + 8]
323 vpcmpeqb %%ytmp, %%ytmp, %%ytmp2 ; each byte becomes 0xFF where equal, 0x00 where different
324 vpmovmskb %%tmp, %%ytmp ; bit i of tmp = 1 iff byte i matched
325 xor %%tmp32, 0xFFFFFFFF ; invert: set bits now mark mismatches, ZF set iff all 32 matched
329 vmovdqu %%ytmp, [%%src1 + %%result] ; later chunks are addressed through result
330 vmovdqu %%ytmp2, [%%src2 + %%result]
331 vpcmpeqb %%ytmp, %%ytmp, %%ytmp2
332 vpmovmskb %%tmp, %%ytmp
333 xor %%tmp32, 0xFFFFFFFF
337 vmovdqu %%ytmp, [%%src1 + %%result] ; second copy of the same 32-byte test
338 vmovdqu %%ytmp2, [%%src2 + %%result]
339 vpcmpeqb %%ytmp, %%ytmp, %%ytmp2
340 vpmovmskb %%tmp, %%ytmp
341 xor %%tmp32, 0xFFFFFFFF
345 cmp %%result, 258 - 32 ; 226 is the last offset whose 32-byte load ends within 258 bytes
348 vmovdqu %%ytmp, [%%src1 + %%result] ; final 32-byte test
349 vmovdqu %%ytmp2, [%%src2 + %%result]
350 vpcmpeqb %%ytmp, %%ytmp, %%ytmp2
351 vpmovmskb %%tmp, %%ytmp
352 xor %%tmp32, 0xFFFFFFFF ; ZF clear iff at least one of the 32 bytes differed
353 jnz %%miscompare_last ; mismatch found in the final chunk
361 ;; Guarantee the result has length at most 258.
364 cmova %%result, %%tmp ; replace result with tmp when the preceding compare was unsigned-above
373 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
374 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
376 ;; compare size, src1, src2, result, tmp
;;   src1, src2 - base addresses of the two buffers being compared
;;   result     - index added to both bases; cleared before the first load and,
;;                per the note further down, the count of bytes matched so far
;;   tmp        - scratch used for the word-wise and byte-wise tests
383 %define %%tmp8 %5b ; tmp as an 8-bit register
385 xor %%result, %%result ; begin at offset 0
389 mov %%tmp, [%%src1 + %%result] ; load a tmp-sized chunk from src1
390 xor %%tmp, [%%src2 + %%result] ; tmp is non-zero (ZF clear) iff the chunks differ
396 ;; if we fall through from above, we have found no mismatches,
397 ;; %%size+7 is the number of bytes left to look at, and %%result is the
398 ;; number of bytes that have matched
402 mov %%tmp8, [%%src1 + %%result] ; single-byte test: load one byte of src1
403 cmp %%tmp8, [%%src2 + %%result] ; ZF set iff it equals the src2 byte at the same offset