/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Author: Anton Blanchard <anton@au.ibm.com>
 * Copyright 2015 IBM Corporation.
 */
#include <asm/ppc_asm.h>
#include <asm/export.h>
#include <asm/ppc-opcode.h>

#define off8	r6
#define off16	r7
#define off24	r8

#define rA	r9
#define rB	r10
#define rC	r11
#define rD	r27
#define rE	r28
#define rF	r29
#define rG	r30
#define rH	r31

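/*
 * Endian-neutral load helpers: on little-endian the byte-reversed forms
 * (lhbrx/lwbrx/ldbrx) are used so a loaded word holds the bytes in memory
 * order, letting an unsigned doubleword compare (cmpld) produce the same
 * ordering a byte-by-byte memcmp() would.  LVS/VPERM are likewise swapped
 * so LD_VSR_CROSS16B below yields the bytes in memory order on both
 * endiannesses.
 */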
#ifdef __LITTLE_ENDIAN__
#define LH	lhbrx
#define LW	lwbrx
#define LD	ldbrx
#define LVS	lvsr
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRB,_VRA,_VRC
#else
#define LH	lhzx
#define LW	lwzx
#define LD	ldx
#define LVS	lvsl
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRA,_VRB,_VRC
#endif

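/*
 * VMX_THRESH is the minimum length for which the Altivec/VMX path is
 * attempted.  ENTER_VMX_OPS/EXIT_VMX_OPS spill the live argument registers
 * (r3-r5) and LR, open a stack frame and call enter_vmx_ops()/exit_vmx_ops();
 * ENTER_VMX_OPS leaves the return value of enter_vmx_ops() tested in cr1 so
 * callers can fall back to the integer path (beq cr1) when VMX cannot be
 * used.
 */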
#define VMX_THRESH 4096
#define ENTER_VMX_OPS	\
	mflr    r0;	\
	std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std     r0,16(r1); \
	stdu    r1,-STACKFRAMESIZE(r1); \
	bl      enter_vmx_ops; \
	cmpwi   cr1,r3,0; \
	ld      r0,STACKFRAMESIZE+16(r1); \
	ld      r3,STK_REG(R31)(r1); \
	ld      r4,STK_REG(R30)(r1); \
	ld      r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr    r0

#define EXIT_VMX_OPS \
	mflr    r0; \
	std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std     r0,16(r1); \
	stdu    r1,-STACKFRAMESIZE(r1); \
	bl      exit_vmx_ops; \
	ld      r0,STACKFRAMESIZE+16(r1); \
	ld      r3,STK_REG(R31)(r1); \
	ld      r4,STK_REG(R30)(r1); \
	ld      r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr    r0

/*
 * LD_VSR_CROSS16B loads the 2nd 16 bytes for _vaddr, which is not aligned
 * to a 16-byte boundary, and permutes the result with the 1st 16 bytes.
 *
 * |  y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
 *    ^                                 ^                               ^
 * 0xbbbb10                          0xbbbb20                        0xbbbb30
 *                                 ^
 *                               _vaddr
 *
 * _vmask is the mask generated by LVS
 * _v1st_qw is the 1st aligned QW of the current addr, which is already loaded.
 *   for example: 0xyyyyyyyyyyyyy012 for big endian
 * _v2nd_qw is the 2nd aligned QW of _vaddr, to be loaded.
 *   for example: 0x3456789abcdefzzz for big endian
 * The permute result is saved in _v_res.
 *   for example: 0x0123456789abcdef for big endian.
 */
#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
	lvx     _v2nd_qw,_vaddr,off16; \
	VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)

/*
 * There are 2 categories for memcmp:
 * 1) src/dst have the same offset relative to an 8-byte boundary. The
 *    handlers are named like .Lsameoffset_xxxx
 * 2) src/dst have different offsets relative to an 8-byte boundary. The
 *    handlers are named like .Ldiffoffset_xxxx
 */
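/*
 * int memcmp(const void *s1, const void *s2, size_t n)
 *   r3 = s1, r4 = s2, r5 = n (PPC64 ABI argument registers)
 * Returns in r3: 0 if equal, a positive value if the first differing byte
 * of s1 is greater (compared as unsigned), negative otherwise.
 */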
_GLOBAL_TOC(memcmp)
	cmpdi	cr1,r5,0

	/* Use the short loop if the src/dst addresses do not have the
	 * same offset relative to an 8-byte alignment boundary.
	 */
	xor	r6,r3,r4
	andi.	r6,r6,7

	/* Fall back to the short loop when comparing fewer than 8 bytes
	 * at aligned addresses.
	 */
	cmpdi	cr6,r5,7

	beq	cr1,.Lzero
	bgt	cr6,.Lno_short

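/*
 * Byte-by-byte compare, unrolled by 4; ctr holds the remaining byte count.
 * Also used as the safe fallback for short tails and when an 8-byte load
 * from s2 could cross a page boundary.
 */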
.Lshort:
	mtctr	r5
1:	lbz	rA,0(r3)
	lbz	rB,0(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,1(r3)
	lbz	rB,1(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,2(r3)
	lbz	rB,2(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,3(r3)
	lbz	rB,3(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero

	addi	r3,r3,4
	addi	r4,r4,4

	bdnz	1b

.Lzero:
	li	r3,0
	blr

.Lno_short:
	dcbt	0,r3
	dcbt	0,r4
	bne	.Ldiffoffset_8bytes_make_align_start


.Lsameoffset_8bytes_make_align_start:
	/* attempt to compare the bytes that are not 8-byte aligned, so that
	 * the rest of the comparison can run on an 8-byte alignment.
	 */
	andi.	r6,r3,7

	/* Try to compare the first double word which is not 8 bytes aligned:
	 * load the first double word at (src & ~7UL) and shift left the
	 * appropriate bits before the comparison.
	 */
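	/* r6 = (r3 & 7) * 8: the number of bits occupied by the unwanted
	 * leading bytes, used as the shift count for sld below.
	 */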
	rlwinm	r6,r3,3,26,28
	beq	.Lsameoffset_8bytes_aligned
	clrrdi	r3,r3,3
	clrrdi	r4,r4,3
	LD	rA,0,r3
	LD	rB,0,r4
	sld	rA,rA,r6
	sld	rB,rB,r6
	cmpld	cr0,rA,rB
	srwi	r6,r6,3
	bne	cr0,.LcmpAB_lightweight
	subfic	r6,r6,8
	subf.	r5,r6,r5
	addi	r3,r3,8
	addi	r4,r4,8
	beq	.Lzero

.Lsameoffset_8bytes_aligned:
	/* now we are aligned to 8 bytes.
	 * Use the .Llong loop if 32 or more bytes remain to be compared.
	 */
	cmpdi	cr6,r5,31
	bgt	cr6,.Llong

.Lcmp_lt32bytes:
	/* compare 1 ~ 31 bytes; at least the r3 addr is 8-byte aligned now */
	cmpdi	cr5,r5,7
	srdi	r0,r5,3
	ble	cr5,.Lcmp_rest_lt8bytes

	/* handle 8 ~ 31 bytes */
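	/* r0 = r5 / 8 whole doublewords to compare (computed above);
	 * clrldi below keeps r5 = r5 & 7, the leftover byte count.
	 */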
	clrldi	r5,r5,61
	mtctr	r0
2:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	bdnz	2b

	cmpwi	r5,0
	beq	.Lzero

.Lcmp_rest_lt8bytes:
	/*
	 * Here we have less than 8 bytes to compare. At least s1 is aligned to
	 * 8 bytes, but s2 may not be. We must make sure s2 + 7 doesn't cross a
	 * page boundary, otherwise we might read past the end of the buffer and
	 * trigger a page fault. We use 4K as the conservative minimum page
	 * size. If we detect that case we go to the byte-by-byte loop.
	 *
	 * Otherwise the next double word is loaded from s1 and s2, and shifted
	 * right to compare the appropriate bits.
	 */
	clrldi	r6,r4,(64-12)	// r6 = r4 & 0xfff
	cmpdi	r6,0xff8
	bgt	.Lshort

	subfic	r6,r5,8
	slwi	r6,r6,3
	LD	rA,0,r3
	LD	rB,0,r4
	srd	rA,rA,r6
	srd	rB,rB,r6
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero

.Lnon_zero:
	mr	r3,rC
	blr

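/*
 * Main integer loop: compare 32 bytes (4 doublewords) per iteration.
 * Loads for the next group are issued before the previous group's compares
 * are resolved, and the four compares use separate CR fields (cr0, cr1,
 * cr6, cr7) so a mismatch can branch straight to the corresponding .LcmpXY
 * handler.  rD-rH live in non-volatile r27-r31, which are saved below and
 * restored before returning.
 */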
.Llong:
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* Try to use the vmx loop if the length is 4K or greater */
	cmpldi	cr6,r5,VMX_THRESH
	bge	cr6,.Lsameoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Llong_novmx_cmp:
#endif
	/* At least the s1 addr is 8-byte aligned */
	li	off8,8
	li	off16,16
	li	off24,24

	std	r31,-8(r1)
	std	r30,-16(r1)
	std	r29,-24(r1)
	std	r28,-32(r1)
	std	r27,-40(r1)

	srdi	r0,r5,5
	mtctr	r0
	andi.	r5,r5,31

	LD	rA,0,r3
	LD	rB,0,r4

	LD	rC,off8,r3
	LD	rD,off8,r4

	LD	rE,off16,r3
	LD	rF,off16,r4

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lfirst32

	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lsecond32

	.balign	16

1:	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdnz	1b

.Lsecond32:
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

.Ltail:
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)

	cmpdi	r5,0
	beq	.Lzero
	b	.Lshort

.Lfirst32:
	cmpld	cr1,rC,rD
	cmpld	cr6,rE,rF
	cmpld	cr7,rG,rH

	bne	cr0,.LcmpAB
	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

	b	.Ltail

.LcmpAB:
	li	r3,1
	bgt	cr0,.Lout
	li	r3,-1
	b	.Lout

.LcmpCD:
	li	r3,1
	bgt	cr1,.Lout
	li	r3,-1
	b	.Lout

.LcmpEF:
	li	r3,1
	bgt	cr6,.Lout
	li	r3,-1
	b	.Lout

.LcmpGH:
	li	r3,1
	bgt	cr7,.Lout
	li	r3,-1

.Lout:
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)
	blr

.LcmpAB_lightweight:	/* skip NV GPRS restore */
	li	r3,1
	bgtlr
	li	r3,-1
	blr

#ifdef CONFIG_ALTIVEC
.Lsameoffset_vmx_cmp:
	/* Enter with the src/dst addrs having the same offset relative to
	 * an 8-byte alignment boundary.
	 *
	 * There is an optimization based on the following fact: memcmp()
	 * tends to fail early, within the first 32 bytes.
	 * Before applying VMX instructions, which incur a 32x128-bit VMX
	 * register load/restore penalty, we compare the first 32 bytes so
	 * that we can catch the ~80% of cases that fail there.
	 */

	li	r0,4
	mtctr	r0
.Lsameoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Lsameoffset_prechk_32B_loop

	ENTER_VMX_OPS
	beq	cr1,.Llong_novmx_cmp

3:
	/* need to check whether r4 has the same offset as r3 relative to
	 * a 16-byte boundary.
	 */
	xor	r0,r3,r4
	andi.	r0,r0,0xf
	bne	.Ldiffoffset_vmx_cmp_start

	/* len is no less than 4KB.  Need to further align to 16 bytes. */
	andi.	rA,r3,8
	LD	rA,0,r3
	beq	4f
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	addi	r5,r5,-8

	beq	cr0,4f
	/* save and restore cr0 */
	mfocrf	r5,128
	EXIT_VMX_OPS
	mtocrf	128,r5
	b	.LcmpAB_lightweight

4:
	/* compare 32 bytes for each loop */
	srdi	r0,r5,5
	mtctr	r0
	clrldi	r5,r5,59
	li	off16,16

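	/* VCMPEQUD_RC expands to vcmpequd. (the record form): cr6 "lt" is set
	 * only when all doubleword elements compare equal, so "bnl cr6"
	 * branches as soon as any difference is found in the 16-byte chunk.
	 */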
	.balign	16
5:
	lvx	v0,0,r3
	lvx	v1,0,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,7f
	lvx	v0,off16,r3
	lvx	v1,off16,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,6f
	addi	r3,r3,32
	addi	r4,r4,32
	bdnz	5b

	EXIT_VMX_OPS
	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

6:
	addi	r3,r3,16
	addi	r4,r4,16

7:
	/* diff the last 16 bytes */
	EXIT_VMX_OPS
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	li	off8,8
	bne	cr0,.LcmpAB_lightweight

	LD	rA,off8,r3
	LD	rB,off8,r4
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero
#endif

.Ldiffoffset_8bytes_make_align_start:
	/* now try to align s1 to 8 bytes */
	rlwinm	r6,r3,3,26,28
	beq	.Ldiffoffset_align_s1_8bytes

	clrrdi	r3,r3,3
	LD	rA,0,r3
	LD	rB,0,r4	/* unaligned load */
	sld	rA,rA,r6
	srd	rA,rA,r6
	srd	rB,rB,r6
	cmpld	cr0,rA,rB
	srwi	r6,r6,3
	bne	cr0,.LcmpAB_lightweight

	subfic	r6,r6,8
	subf.	r5,r6,r5
	addi	r3,r3,8
	add	r4,r4,r6

	beq	.Lzero

.Ldiffoffset_align_s1_8bytes:
	/* now s1 is aligned to 8 bytes. */
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* only do vmx ops when the size is equal to or greater than 4K bytes */
	cmpdi	cr5,r5,VMX_THRESH
	bge	cr5,.Ldiffoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Ldiffoffset_novmx_cmp:
#endif


	cmpdi	cr5,r5,31
	ble	cr5,.Lcmp_lt32bytes

#ifdef CONFIG_ALTIVEC
	b	.Llong_novmx_cmp
#else
	b	.Llong
#endif

#ifdef CONFIG_ALTIVEC
.Ldiffoffset_vmx_cmp:
	/* perform a 32-byte pre-check before enabling VMX operations */
	li	r0,4
	mtctr	r0
.Ldiffoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Ldiffoffset_prechk_32B_loop

	ENTER_VMX_OPS
	beq	cr1,.Ldiffoffset_novmx_cmp

.Ldiffoffset_vmx_cmp_start:
	/* First try to align r3 to 16 bytes */
	andi.	r6,r3,0xf
	li	off16,16
	beq	.Ldiffoffset_vmx_s1_16bytes_align

	LVS	v3,0,r3
	LVS	v4,0,r4

	lvx	v5,0,r3
	lvx	v6,0,r4
	LD_VSR_CROSS16B(r3,v3,v5,v7,v9)
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)

	VCMPEQUB_RC(v7,v9,v10)
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	subfic	r6,r6,16
	subf	r5,r6,r5
	add	r3,r3,r6
	add	r4,r4,r6

.Ldiffoffset_vmx_s1_16bytes_align:
	/* now s1 is aligned to 16 bytes */
	lvx	v6,0,r4
	LVS	v4,0,r4
	srdi	r6,r5,5	 /* loop for 32 bytes each */
	clrldi	r5,r5,59
	mtctr	r6

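	/*
	 * 32 bytes per iteration.  The leading quadword of r4 is kept live in
	 * v6 across iterations: LD_VSR_CROSS16B loads the next quadword into
	 * v8 and splices it with v6, and "vor v6,v8,v8" then copies v8 so it
	 * becomes the leading quadword for the next 16-byte step.
	 */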
	.balign	16
.Ldiffoffset_vmx_32bytesloop:
	/* the first qw of r4 was saved in v6 */
	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	bdnz	.Ldiffoffset_vmx_32bytesloop

	EXIT_VMX_OPS

	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

.Ldiffoffset_vmx_diff_found:
	EXIT_VMX_OPS
	/* in any case, the diff will appear within the next 16 bytes */
	li	r5,16
	b	.Lcmp_lt32bytes

#endif
EXPORT_SYMBOL(memcmp)