1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
30 %include "options.asm"
37 ;; sttni2 is faster, but it can't be debugged
38 ;; so the following code is based on "mine5"
40 ;; compare 258 bytes = 8 * 32 + 2
41 ;; tmp16 is a 16-bit version of tmp
42 ;; compare258 src1, src2, result, tmp
;; NOTE(review): the %macro line, the remaining %defines, the labels and
;; branches, and the closing %endm are not present in this listing, so the
;; comments below cover only the instructions that are visible.
;; Each mov/xor pair leaves %%tmp zero exactly when src1 and src2 hold the
;; same data at offset %%result (xor also sets ZF accordingly).
;; Presumably %%tmp is a 64-bit register, which is what "8 * 32" in the
;; header suggests -- confirm against the callers.
48 %define %%tmp16 %4w ; tmp as a 16-bit register
50 xor %%result, %%result ; result = 0: the comparison starts at offset 0
52 mov %%tmp, [%%src1 + %%result] ; tmp = chunk of src1 at the current offset
53 xor %%tmp, [%%src2 + %%result] ; tmp != 0 iff src2 differs within that chunk
57 mov %%tmp, [%%src1 + %%result] ; same compare again (presumably a second unrolled step; confirm)
58 xor %%tmp, [%%src2 + %%result]
65 ; compare last two bytes
66 mov %%tmp16, [%%src1 + %%result] ; 16-bit load: only the 2-byte tail is read
67 xor %%tmp16, [%%src2 + %%result] ; nonzero iff one of the two tail bytes differs
70 ; no miscompares, return 258
83 ;; compare 258 bytes = 8 * 32 + 2
84 ;; tmp16 is a 16-bit version of tmp
85 ;; compare258 src1, src2, result, tmp
;; NOTE(review): the three header lines above are identical to the header of
;; the previous macro, but the first visible compare below reads offset 8
;; rather than offset 0. This looks like the "first 8 bytes were already
;; checked" variant (compare the compare250_x / compare250_y headers) --
;; confirm the name on the %macro line, which is not present in this listing.
;; Labels, branches, index updates and %endm are likewise not shown.
91 %define %%tmp16 %4w ; tmp as a 16-bit register
94 mov %%tmp, [%%src1 + 8] ; first compare starts at offset 8; bytes 0..7 are not read here
95 xor %%tmp, [%%src2 + 8] ; tmp != 0 iff the two inputs differ in this chunk
100 mov %%tmp, [%%src1 + %%result] ; from here on the offset comes from %%result
101 xor %%tmp, [%%src2 + %%result]
105 mov %%tmp, [%%src1 + %%result] ; same compare again (presumably a second unrolled step; confirm)
106 xor %%tmp, [%%src2 + %%result]
113 ; compare last two bytes
114 mov %%tmp16, [%%src1 + %%result] ; 16-bit load: only the 2-byte tail is read
115 xor %%tmp16, [%%src2 + %%result] ; nonzero iff one of the two tail bytes differs
118 ; no miscompares, return 258
131 ;; compare 258 bytes = 16 * 16 + 2
132 ;; compares 16 bytes at a time, using pcmpeqb/pmovmskb
133 ;; compare258_x src1, src2, result, tmp, xtmp1, xtmp2
;; NOTE(review): labels, branches, index updates, most %defines (including
;; the one for %%tmp32, which is used below) and %endm are not present in
;; this listing; the comments describe only the visible instructions.
;; MOVDQU/PCMPEQB/PMOVMSKB are written in upper case and PCMPEQB takes three
;; operands, so they are presumably wrapper macros from an included file --
;; confirm.
134 %macro compare258_x 6
140 %define %%tmp16 %4w ; tmp as a 16-bit register
144 xor %%result, %%result ; result = 0: the comparison starts at offset 0
146 MOVDQU %%xtmp, [%%src1 + %%result] ; unaligned 16-byte loads from both inputs
147 MOVDQU %%xtmp2, [%%src2 + %%result]
148 PCMPEQB %%xtmp, %%xtmp, %%xtmp2 ; each byte lane becomes 0xFF where the inputs match, 0x00 otherwise
149 PMOVMSKB %%tmp32, %%xtmp ; bit i of tmp32 = 1 iff byte i matched
154 MOVDQU %%xtmp, [%%src1 + %%result] ; same 16-byte compare again (presumably a second unrolled step; confirm)
155 MOVDQU %%xtmp2, [%%src2 + %%result]
156 PCMPEQB %%xtmp, %%xtmp, %%xtmp2
157 PMOVMSKB %%tmp32, %%xtmp
165 ; compare last two bytes
166 mov %%tmp16, [%%src1 + %%result] ; 16-bit load: only the 2-byte tail is read
167 xor %%tmp16, [%%src2 + %%result] ; nonzero iff one of the two tail bytes differs
170 ; no miscompares, return 258
186 ;; compare 258 bytes = 8 * 32 + 2, assuming first 8 bytes
187 ;; were already checked
188 ;; compares 16 bytes at a time, using pcmpeqb/pmovmskb
189 ;; compare250_x src1, src2, result, tmp, xtmp1, xtmp2
;; NOTE(review): labels, most branches, index updates, most %defines and
;; %endm are not present in this listing; the comments describe only the
;; visible instructions.
;; MOVDQU/PCMPEQB/PMOVMSKB are written in upper case and PCMPEQB takes three
;; operands, so they are presumably wrapper macros from an included file --
;; confirm.
190 %macro compare250_x 6
195 %define %%tmp32 %4d ; tmp as a 32-bit register
200 MOVDQU %%xtmp, [%%src1 + 8] ; first compare starts at offset 8; bytes 0..7 are not read here
201 MOVDQU %%xtmp2, [%%src2 + 8]
202 PCMPEQB %%xtmp, %%xtmp, %%xtmp2 ; each byte lane becomes 0xFF where the inputs match, 0x00 otherwise
203 PMOVMSKB %%tmp32, %%xtmp ; bit i of tmp32 = 1 iff byte i matched
208 MOVDQU %%xtmp, [%%src1 + %%result] ; from here on the offset comes from %%result
209 MOVDQU %%xtmp2, [%%src2 + %%result]
210 PCMPEQB %%xtmp, %%xtmp, %%xtmp2
211 PMOVMSKB %%tmp32, %%xtmp
216 MOVDQU %%xtmp, [%%src1 + %%result] ; same 16-byte compare again (presumably a second unrolled step; confirm)
217 MOVDQU %%xtmp2, [%%src2 + %%result]
218 PCMPEQB %%xtmp, %%xtmp, %%xtmp2
219 PMOVMSKB %%tmp32, %%xtmp
224 cmp %%result, 258 - 16 ; 242: a 16-byte compare starting at this offset ends exactly at byte 258
227 MOVDQU %%xtmp, [%%src1 + %%result] ; final 16-byte compare
228 MOVDQU %%xtmp2, [%%src2 + %%result]
229 PCMPEQB %%xtmp, %%xtmp, %%xtmp2
230 PMOVMSKB %%tmp32, %%xtmp
232 jnz %%miscompare_last ; the flags tested here are set by an instruction missing from this listing
233 ; no miscompares, return 258
241 ;; Guarantee the result has length at most 258.
244 cmova %%result, %%tmp ; unsigned "above" after a compare not shown here: result = tmp (presumably tmp holds 258; confirm)
252 ;; compare 258 bytes = 8 * 32 + 2
253 ;; compares 32 bytes at a time, using vpcmpeqb/vpmovmskb
254 ;; compare258_y src1, src2, result, tmp, xtmp1, xtmp2
;; NOTE(review): labels, branches, index updates, most %defines and %endm
;; are not present in this listing; the comments describe only the visible
;; instructions. The vector temporaries appear in the body as %%ytmp and
;; %%ytmp2 (presumably YMM registers, given the 32-byte compares -- confirm).
255 %macro compare258_y 6
260 %define %%tmp16 %4w ; tmp as a 16-bit register
261 %define %%tmp32 %4d ; tmp as a 32-bit register
265 xor %%result, %%result ; result = 0: the comparison starts at offset 0
267 vmovdqu %%ytmp, [%%src1 + %%result] ; unaligned vector loads from both inputs
268 vmovdqu %%ytmp2, [%%src2 + %%result]
269 vpcmpeqb %%ytmp, %%ytmp, %%ytmp2 ; each byte lane becomes 0xFF where the inputs match, 0x00 otherwise
270 vpmovmskb %%tmp, %%ytmp ; bit i of tmp = 1 iff byte i matched
271 xor %%tmp32, 0xFFFFFFFF ; invert the mask: set bits now mark mismatching bytes; ZF set iff all matched
275 vmovdqu %%ytmp, [%%src1 + %%result] ; same compare again (presumably a second unrolled step; confirm)
276 vmovdqu %%ytmp2, [%%src2 + %%result]
277 vpcmpeqb %%ytmp, %%ytmp, %%ytmp2
278 vpmovmskb %%tmp, %%ytmp
279 xor %%tmp32, 0xFFFFFFFF ; nonzero iff some byte differed
286 ; compare last two bytes
287 mov %%tmp16, [%%src1 + %%result] ; 16-bit load: only the 2-byte tail is read
288 xor %%tmp16, [%%src2 + %%result] ; nonzero iff one of the two tail bytes differs
291 ; no miscompares, return 258
308 ;; compare 258 bytes = 8 * 32 + 2, assuming first 8 bytes
309 ;; were already checked
310 ;; compares 32 bytes at a time, using vpcmpeqb/vpmovmskb
311 ;; compare250_y src1, src2, result, tmp, xtmp1, xtmp2
;; NOTE(review): labels, most branches, index updates, most %defines and
;; %endm are not present in this listing; the comments describe only the
;; visible instructions. The vector temporaries appear in the body as %%ytmp
;; and %%ytmp2 (presumably YMM registers, given the 32-byte compares --
;; confirm).
312 %macro compare250_y 6
317 %define %%tmp16 %4w ; tmp as a 16-bit register
318 %define %%tmp32 %4d ; tmp as a 32-bit register
323 vmovdqu %%ytmp, [%%src1 + 8] ; first compare starts at offset 8; bytes 0..7 are not read here
324 vmovdqu %%ytmp2, [%%src2 + 8]
325 vpcmpeqb %%ytmp, %%ytmp, %%ytmp2 ; each byte lane becomes 0xFF where the inputs match, 0x00 otherwise
326 vpmovmskb %%tmp, %%ytmp ; bit i of tmp = 1 iff byte i matched
327 xor %%tmp32, 0xFFFFFFFF ; invert the mask: set bits now mark mismatching bytes; ZF set iff all matched
331 vmovdqu %%ytmp, [%%src1 + %%result] ; from here on the offset comes from %%result
332 vmovdqu %%ytmp2, [%%src2 + %%result]
333 vpcmpeqb %%ytmp, %%ytmp, %%ytmp2
334 vpmovmskb %%tmp, %%ytmp
335 xor %%tmp32, 0xFFFFFFFF ; nonzero iff some byte differed
339 vmovdqu %%ytmp, [%%src1 + %%result] ; same compare again (presumably a second unrolled step; confirm)
340 vmovdqu %%ytmp2, [%%src2 + %%result]
341 vpcmpeqb %%ytmp, %%ytmp, %%ytmp2
342 vpmovmskb %%tmp, %%ytmp
343 xor %%tmp32, 0xFFFFFFFF
347 cmp %%result, 258 - 32 ; 226: a 32-byte compare starting at this offset ends exactly at byte 258
350 vmovdqu %%ytmp, [%%src1 + %%result] ; final 32-byte compare
351 vmovdqu %%ytmp2, [%%src2 + %%result]
352 vpcmpeqb %%ytmp, %%ytmp, %%ytmp2
353 vpmovmskb %%tmp, %%ytmp
354 xor %%tmp32, 0xFFFFFFFF ; inverted mask: nonzero iff some byte differed
355 jnz %%miscompare_last ; taken when the inverted mask above is nonzero
363 ;; Guarantee the result has length at most 258.
366 cmova %%result, %%tmp ; unsigned "above" after a compare not shown here: result = tmp (presumably tmp holds 258; confirm)
375 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
376 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
378 ;; compare size, src1, src2, result, tmp
;; NOTE(review): the %macro line, the remaining %defines, labels, branches
;; and index updates are not present in this listing, and the macro may
;; continue past the last line shown; the comments describe only the
;; visible instructions.
385 %define %%tmp8 %5b ; tmp as an 8-bit register
387 xor %%result, %%result ; result = 0: the comparison starts at offset 0
391 mov %%tmp, [%%src1 + %%result] ; tmp = chunk of src1 at the current offset
392 xor %%tmp, [%%src2 + %%result] ; tmp != 0 iff src2 differs within that chunk
398 ;; if we fall through from above, we have found no mismatches,
399 ;; %%size+7 is the number of bytes left to look at, and %%result is the
400 ;; number of bytes that have matched
404 mov %%tmp8, [%%src1 + %%result] ; single-byte load from src1 at the current offset
405 cmp %%tmp8, [%%src2 + %%result] ; ZF set iff src2 holds the same byte there