/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Author: Anton Blanchard <anton@au.ibm.com>
 * Copyright 2015 IBM Corporation.
 */
#include <asm/ppc_asm.h>
#include <asm/export.h>
#include <asm/ppc-opcode.h>

#define off8	r6
#define off16	r7
#define off24	r8

#define rA	r9
#define rB	r10
#define rC	r11
#define rD	r27
#define rE	r28
#define rF	r29
#define rG	r30
#define rH	r31

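/*
 * Endian-neutral load helpers: on little-endian the byte-reversed forms
 * (lhbrx/lwbrx/ldbrx) are used so a loaded word holds the bytes in memory
 * order, letting an unsigned doubleword compare (cmpld) produce the same
 * ordering a byte-by-byte memcmp() would.  LVS/VPERM are likewise swapped
 * so LD_VSR_CROSS16B below yields the bytes in memory order on both
 * endiannesses.
 */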
#ifdef __LITTLE_ENDIAN__
#define LH	lhbrx
#define LW	lwbrx
#define LD	ldbrx
#define LVS	lvsr
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRB,_VRA,_VRC
#else
#define LH	lhzx
#define LW	lwzx
#define LD	ldx
#define LVS	lvsl
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRA,_VRB,_VRC
#endif

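/*
 * VMX_THRESH is the minimum length for which the Altivec/VMX path is
 * attempted.  ENTER_VMX_OPS/EXIT_VMX_OPS spill the live argument registers
 * (r3-r5) and LR, open a stack frame and call enter_vmx_ops()/exit_vmx_ops();
 * ENTER_VMX_OPS leaves the return value of enter_vmx_ops() tested in cr1 so
 * callers can fall back to the integer path (beq cr1) when VMX cannot be
 * used.
 */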
#define VMX_THRESH 4096
#define ENTER_VMX_OPS	\
	mflr    r0;	\
	std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std     r0,16(r1); \
	stdu    r1,-STACKFRAMESIZE(r1); \
	bl      enter_vmx_ops; \
	cmpwi   cr1,r3,0; \
	ld      r0,STACKFRAMESIZE+16(r1); \
	ld      r3,STK_REG(R31)(r1); \
	ld      r4,STK_REG(R30)(r1); \
	ld      r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr    r0

#define EXIT_VMX_OPS \
	mflr    r0; \
	std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std     r0,16(r1); \
	stdu    r1,-STACKFRAMESIZE(r1); \
	bl      exit_vmx_ops; \
	ld      r0,STACKFRAMESIZE+16(r1); \
	ld      r3,STK_REG(R31)(r1); \
	ld      r4,STK_REG(R30)(r1); \
	ld      r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr    r0

/*
 * LD_VSR_CROSS16B loads the 2nd 16 bytes for _vaddr, which is not aligned
 * to a 16-byte boundary, and permutes the result with the 1st 16 bytes.
 *
 * |  y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
 *    ^                                 ^                               ^
 * 0xbbbb10                          0xbbbb20                        0xbbbb30
 *                                 ^
 *                               _vaddr
 *
 * _vmask is the mask generated by LVS
 * _v1st_qw is the 1st aligned QW of the current addr, which is already loaded.
 *   for example: 0xyyyyyyyyyyyyy012 for big endian
 * _v2nd_qw is the 2nd aligned QW of _vaddr, to be loaded.
 *   for example: 0x3456789abcdefzzz for big endian
 * The permute result is saved in _v_res.
 *   for example: 0x0123456789abcdef for big endian.
 */
#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
	lvx     _v2nd_qw,_vaddr,off16; \
	VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)

/*
 * There are 2 categories for memcmp:
 * 1) src/dst have the same offset relative to an 8-byte boundary. The
 *    handlers are named like .Lsameoffset_xxxx
 * 2) src/dst have different offsets relative to an 8-byte boundary. The
 *    handlers are named like .Ldiffoffset_xxxx
 */
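/*
 * int memcmp(const void *s1, const void *s2, size_t n)
 *   r3 = s1, r4 = s2, r5 = n (PPC64 ABI argument registers)
 * Returns in r3: 0 if equal, a positive value if the first differing byte
 * of s1 is greater (compared as unsigned), negative otherwise.
 */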
_GLOBAL_TOC(memcmp)
	cmpdi	cr1,r5,0

	/* Use the short loop if the src/dst addresses do not have the
	 * same offset relative to an 8-byte alignment boundary.
	 */
	xor	r6,r3,r4
	andi.	r6,r6,7

	/* Fall back to the short loop when comparing fewer than 8 bytes
	 * at aligned addresses.
	 */
	cmpdi	cr6,r5,7

	beq	cr1,.Lzero
	bgt	cr6,.Lno_short

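/*
 * Byte-by-byte compare, unrolled by 4; ctr holds the remaining byte count.
 * Also used as the safe fallback for short tails and when an 8-byte load
 * from s2 could cross a page boundary.
 */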
.Lshort:
	mtctr	r5
1:	lbz	rA,0(r3)
	lbz	rB,0(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,1(r3)
	lbz	rB,1(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,2(r3)
	lbz	rB,2(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,3(r3)
	lbz	rB,3(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero

	addi	r3,r3,4
	addi	r4,r4,4

	bdnz	1b

.Lzero:
	li	r3,0
	blr

.Lno_short:
	dcbt	0,r3
	dcbt	0,r4
	bne	.Ldiffoffset_8bytes_make_align_start


.Lsameoffset_8bytes_make_align_start:
	/* attempt to compare the bytes that are not 8-byte aligned, so that
	 * the rest of the comparison can run on an 8-byte alignment.
	 */
	andi.	r6,r3,7

	/* Try to compare the first double word which is not 8 bytes aligned:
	 * load the first double word at (src & ~7UL) and shift left the
	 * appropriate bits before the comparison.
	 */
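	/* r6 = (r3 & 7) * 8: the number of bits occupied by the unwanted
	 * leading bytes, used as the shift count for sld below.
	 */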
	rlwinm	r6,r3,3,26,28
	beq	.Lsameoffset_8bytes_aligned
	clrrdi	r3,r3,3
	clrrdi	r4,r4,3
	LD	rA,0,r3
	LD	rB,0,r4
	sld	rA,rA,r6
	sld	rB,rB,r6
	cmpld	cr0,rA,rB
	srwi	r6,r6,3
	bne	cr0,.LcmpAB_lightweight
	subfic	r6,r6,8
	subf.	r5,r6,r5
	addi	r3,r3,8
	addi	r4,r4,8
	beq	.Lzero

.Lsameoffset_8bytes_aligned:
	/* now we are aligned to 8 bytes.
	 * Use the .Llong loop if 32 or more bytes remain to be compared.
	 */
	cmpdi	cr6,r5,31
	bgt	cr6,.Llong

.Lcmp_lt32bytes:
	/* compare 1 ~ 31 bytes; at least the r3 addr is 8-byte aligned now */
	cmpdi	cr5,r5,7
	srdi	r0,r5,3
	ble	cr5,.Lcmp_rest_lt8bytes

	/* handle 8 ~ 31 bytes */
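	/* r0 = r5 / 8 whole doublewords to compare (computed above);
	 * clrldi below keeps r5 = r5 & 7, the leftover byte count.
	 */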
	clrldi	r5,r5,61
	mtctr	r0
2:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	bdnz	2b

	cmpwi	r5,0
	beq	.Lzero

.Lcmp_rest_lt8bytes:
	/*
	 * Here we have less than 8 bytes to compare. At least s1 is aligned to
	 * 8 bytes, but s2 may not be. We must make sure s2 + 7 doesn't cross a
	 * page boundary, otherwise we might read past the end of the buffer and
	 * trigger a page fault. We use 4K as the conservative minimum page
	 * size. If we detect that case we go to the byte-by-byte loop.
	 *
	 * Otherwise the next double word is loaded from s1 and s2, and shifted
	 * right to compare the appropriate bits.
	 */
	clrldi	r6,r4,(64-12)	// r6 = r4 & 0xfff
	cmpdi	r6,0xff8
	bgt	.Lshort

	subfic	r6,r5,8
	slwi	r6,r6,3
	LD	rA,0,r3
	LD	rB,0,r4
	srd	rA,rA,r6
	srd	rB,rB,r6
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero

.Lnon_zero:
	mr	r3,rC
	blr

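/*
 * Main integer loop: compare 32 bytes (4 doublewords) per iteration.
 * Loads for the next group are issued before the previous group's compares
 * are resolved, and the four compares use separate CR fields (cr0, cr1,
 * cr6, cr7) so a mismatch can branch straight to the corresponding .LcmpXY
 * handler.  rD-rH live in non-volatile r27-r31, which are saved below and
 * restored before returning.
 */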
.Llong:
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* Try to use the vmx loop if the length is 4K or greater */
	cmpldi	cr6,r5,VMX_THRESH
	bge	cr6,.Lsameoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Llong_novmx_cmp:
#endif
	/* At least the s1 addr is 8-byte aligned */
	li	off8,8
	li	off16,16
	li	off24,24

	std	r31,-8(r1)
	std	r30,-16(r1)
	std	r29,-24(r1)
	std	r28,-32(r1)
	std	r27,-40(r1)

	srdi	r0,r5,5
	mtctr	r0
	andi.	r5,r5,31

	LD	rA,0,r3
	LD	rB,0,r4

	LD	rC,off8,r3
	LD	rD,off8,r4

	LD	rE,off16,r3
	LD	rF,off16,r4

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lfirst32

	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lsecond32

	.balign	16

1:	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdnz	1b

.Lsecond32:
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

.Ltail:
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)

	cmpdi	r5,0
	beq	.Lzero
	b	.Lshort

.Lfirst32:
	cmpld	cr1,rC,rD
	cmpld	cr6,rE,rF
	cmpld	cr7,rG,rH

	bne	cr0,.LcmpAB
	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

	b	.Ltail

.LcmpAB:
	li	r3,1
	bgt	cr0,.Lout
	li	r3,-1
	b	.Lout

.LcmpCD:
	li	r3,1
	bgt	cr1,.Lout
	li	r3,-1
	b	.Lout

.LcmpEF:
	li	r3,1
	bgt	cr6,.Lout
	li	r3,-1
	b	.Lout

.LcmpGH:
	li	r3,1
	bgt	cr7,.Lout
	li	r3,-1

.Lout:
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)
	blr

.LcmpAB_lightweight:	/* skip NV GPRS restore */
	li	r3,1
	bgtlr
	li	r3,-1
	blr

#ifdef CONFIG_ALTIVEC
.Lsameoffset_vmx_cmp:
	/* Enter with the src/dst addrs having the same offset relative to
	 * an 8-byte alignment boundary.
	 *
	 * There is an optimization based on the following fact: memcmp()
	 * tends to fail early, within the first 32 bytes.
	 * Before applying VMX instructions, which incur a 32x128-bit VMX
	 * register load/restore penalty, we compare the first 32 bytes so
	 * that we can catch the ~80% of cases that fail there.
	 */

	li	r0,4
	mtctr	r0
.Lsameoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Lsameoffset_prechk_32B_loop

	ENTER_VMX_OPS
	beq	cr1,.Llong_novmx_cmp

3:
	/* need to check whether r4 has the same offset as r3 relative to
	 * a 16-byte boundary.
	 */
	xor	r0,r3,r4
	andi.	r0,r0,0xf
	bne	.Ldiffoffset_vmx_cmp_start

	/* len is no less than 4KB.  Need to further align to 16 bytes. */
	andi.	rA,r3,8
	LD	rA,0,r3
	beq	4f
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	addi	r5,r5,-8

	beq	cr0,4f
	/* save and restore cr0 */
	mfocrf	r5,128
	EXIT_VMX_OPS
	mtocrf	128,r5
	b	.LcmpAB_lightweight

4:
	/* compare 32 bytes for each loop */
	srdi	r0,r5,5
	mtctr	r0
	clrldi	r5,r5,59
	li	off16,16

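	/* VCMPEQUD_RC expands to vcmpequd. (the record form): cr6 "lt" is set
	 * only when all doubleword elements compare equal, so "bnl cr6"
	 * branches as soon as any difference is found in the 16-byte chunk.
	 */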
	.balign	16
5:
	lvx	v0,0,r3
	lvx	v1,0,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,7f
	lvx	v0,off16,r3
	lvx	v1,off16,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,6f
	addi	r3,r3,32
	addi	r4,r4,32
	bdnz	5b

	EXIT_VMX_OPS
	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

6:
	addi	r3,r3,16
	addi	r4,r4,16

7:
	/* diff the last 16 bytes */
	EXIT_VMX_OPS
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	li	off8,8
	bne	cr0,.LcmpAB_lightweight

	LD	rA,off8,r3
	LD	rB,off8,r4
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero
#endif

.Ldiffoffset_8bytes_make_align_start:
	/* now try to align s1 to 8 bytes */
	rlwinm	r6,r3,3,26,28
	beq	.Ldiffoffset_align_s1_8bytes

	clrrdi	r3,r3,3
	LD	rA,0,r3
	LD	rB,0,r4	/* unaligned load */
	sld	rA,rA,r6
	srd	rA,rA,r6
	srd	rB,rB,r6
	cmpld	cr0,rA,rB
	srwi	r6,r6,3
	bne	cr0,.LcmpAB_lightweight

	subfic	r6,r6,8
	subf.	r5,r6,r5
	addi	r3,r3,8
	add	r4,r4,r6

	beq	.Lzero

.Ldiffoffset_align_s1_8bytes:
	/* now s1 is aligned to 8 bytes. */
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* only do vmx ops when the size is equal to or greater than 4K bytes */
	cmpdi	cr5,r5,VMX_THRESH
	bge	cr5,.Ldiffoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Ldiffoffset_novmx_cmp:
#endif


	cmpdi	cr5,r5,31
	ble	cr5,.Lcmp_lt32bytes

#ifdef CONFIG_ALTIVEC
	b	.Llong_novmx_cmp
#else
	b	.Llong
#endif

#ifdef CONFIG_ALTIVEC
.Ldiffoffset_vmx_cmp:
	/* perform a 32-byte pre-check before enabling VMX operations */
	li	r0,4
	mtctr	r0
.Ldiffoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Ldiffoffset_prechk_32B_loop

	ENTER_VMX_OPS
	beq	cr1,.Ldiffoffset_novmx_cmp

.Ldiffoffset_vmx_cmp_start:
	/* First try to align r3 to 16 bytes */
	andi.	r6,r3,0xf
	li	off16,16
	beq	.Ldiffoffset_vmx_s1_16bytes_align

	LVS	v3,0,r3
	LVS	v4,0,r4

	lvx	v5,0,r3
	lvx	v6,0,r4
	LD_VSR_CROSS16B(r3,v3,v5,v7,v9)
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)

	VCMPEQUB_RC(v7,v9,v10)
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	subfic	r6,r6,16
	subf	r5,r6,r5
	add	r3,r3,r6
	add	r4,r4,r6

.Ldiffoffset_vmx_s1_16bytes_align:
	/* now s1 is aligned to 16 bytes */
	lvx	v6,0,r4
	LVS	v4,0,r4
	srdi	r6,r5,5	 /* loop for 32 bytes each */
	clrldi	r5,r5,59
	mtctr	r6

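	/*
	 * 32 bytes per iteration.  The leading quadword of r4 is kept live in
	 * v6 across iterations: LD_VSR_CROSS16B loads the next quadword into
	 * v8 and splices it with v6, and "vor v6,v8,v8" then copies v8 so it
	 * becomes the leading quadword for the next 16-byte step.
	 */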
	.balign	16
.Ldiffoffset_vmx_32bytesloop:
	/* the first qw of r4 was saved in v6 */
	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	bdnz	.Ldiffoffset_vmx_32bytesloop

	EXIT_VMX_OPS

	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

.Ldiffoffset_vmx_diff_found:
	EXIT_VMX_OPS
	/* in any case, the diff will appear within the next 16 bytes */
	li	r5,16
	b	.Lcmp_lt32bytes

#endif
EXPORT_SYMBOL(memcmp)