[mirror_ubuntu-artful-kernel.git] / arch / mips / lib / memcpy.S

/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Unified implementation of memcpy, memmove and the __copy_user backend.
 *
 * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
 * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
 * Copyright (C) 2002 Broadcom, Inc.
 *   memcpy/copy_user author: Mark Vandevoorde
 * Copyright (C) 2007  Maciej W. Rozycki
 *
 * Mnemonic names for arguments to memcpy/__copy_user
 */

/*
 * Hack to resolve longstanding prefetch issue
 *
 * Prefetching may be fatal on some systems if we prefetch beyond the end
 * of memory.  It's also a seriously bad idea on non dma-coherent systems.
 *
 * Note that the first condition below is true unless both
 * CONFIG_DMA_COHERENT and CONFIG_DMA_IP27 are defined, so
 * CONFIG_CPU_HAS_PREFETCH survives only when both are set and
 * CONFIG_MIPS_MALTA is not.
 */
#if !defined(CONFIG_DMA_COHERENT) || !defined(CONFIG_DMA_IP27)
#undef CONFIG_CPU_HAS_PREFETCH
#endif
#ifdef CONFIG_MIPS_MALTA
#undef CONFIG_CPU_HAS_PREFETCH
#endif

#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/regdef.h>

/* Mnemonic names for the argument registers of memcpy/__copy_user. */
#define dst a0
#define src a1
#define len a2

/*
 * Spec
 *
 * memcpy copies len bytes from src to dst and sets v0 to dst.
 * It assumes that
 *   - src and dst don't overlap
 *   - src is readable
 *   - dst is writable
 * memcpy uses the standard calling convention
 *
 * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
 * the number of uncopied bytes due to an exception caused by a read or write.
 * __copy_user assumes that src and dst don't overlap, and that the call is
 * implementing one of the following:
 *   copy_to_user
 *     - src is readable  (no exceptions when reading src)
 *   copy_from_user
 *     - dst is writable  (no exceptions when writing dst)
 * __copy_user uses a non-standard calling convention; see
 * include/asm-mips/uaccess.h
 *
 * When an exception happens on a load, the handler must
 * ensure that all of the destination buffer is overwritten to prevent
 * leaking information to user mode programs.
 */

/*
 * Implementation
 */

/*
 * The exception handler for loads requires that:
 *  1- AT contain the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry),
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores adjust len (if necessary) and return.
 * These handlers do not need to overwrite any data.
 *
 * For __rmemcpy and memmove an exception is always a kernel bug, therefore
 * they're not protected.
 */

/*
 * EXC(inst_reg, addr, handler): emit the instruction `inst_reg, addr' at
 * local label 9 and record the pair (9b, handler) in the __ex_table
 * section, then switch back to the previous section.
 */
#define EXC(inst_reg,addr,handler)		\
9:	inst_reg, addr;				\
	.section __ex_table,"a";		\
	PTR	9b, handler;			\
	.previous

/*
 * Only on the 64-bit kernel can we make use of 64-bit registers.
 */
#ifdef CONFIG_64BIT
#define USE_DOUBLE
#endif

/*
 * Width-dependent mnemonics.  With USE_DOUBLE the copy unit is a
 * doubleword (NBYTES == 8) and the d-prefixed/doubleword instructions are
 * used; otherwise the copy unit is a word (NBYTES == 4).  LOG_NBYTES is
 * log2(NBYTES) and is used as a shift count when converting len to units.
 */
#ifdef USE_DOUBLE

#define LOAD   ld
#define LOADL  ldl
#define LOADR  ldr
#define STOREL sdl
#define STORER sdr
#define STORE  sd
#define ADD    daddu
#define SUB    dsubu
#define SRL    dsrl
#define SRA    dsra
#define SLL    dsll
#define SLLV   dsllv
#define SRLV   dsrlv
#define NBYTES 8
#define LOG_NBYTES 3

/*
 * As we are sharing code base with the mips32 tree (which use the o32 ABI
 * register definitions). We need to redefine the register definitions from
 * the n64 ABI register naming to the o32 ABI register naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0	$8
#define t1	$9
#define t2	$10
#define t3	$11
#define t4	$12
#define t5	$13
#define t6	$14
#define t7	$15

#else

#define LOAD   lw
#define LOADL  lwl
#define LOADR  lwr
#define STOREL swl
#define STORER swr
#define STORE  sw
#define ADD    addu
#define SUB    subu
#define SRL    srl
#define SLL    sll
#define SRA    sra
#define SLLV   sllv
#define SRLV   srlv
#define NBYTES 4
#define LOG_NBYTES 2

#endif /* USE_DOUBLE */

/*
 * Endian-dependent mnemonics for unaligned accesses.  The *FIRST variants
 * are issued on the FIRST(n) address of a unit and the *REST variants on
 * its REST(n) address (see below); which of the left/right instructions
 * plays which role depends on the byte order.  SHIFT_DISCARD is the
 * variable shift applied before the final partial STREST in memcpy.
 */
#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST LOADR
#define LDREST  LOADL
#define STFIRST STORER
#define STREST  STOREL
#define SHIFT_DISCARD SLLV
#else
#define LDFIRST LOADL
#define LDREST  LOADR
#define STFIRST STOREL
#define STREST  STORER
#define SHIFT_DISCARD SRLV
#endif

/*
 * Byte offsets of copy unit number `unit':
 *   FIRST(unit) - offset of its first byte
 *   REST(unit)  - offset of its last byte
 *   UNIT(unit)  - same as FIRST(unit), used for aligned accesses
 */
#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)
#define UNIT(unit)  FIRST(unit)

/* Mask that extracts the misalignment of an address within a unit. */
#define ADDRMASK (NBYTES-1)

	.text
	/* Delay slots are filled by hand from here on. */
	.set	noreorder
	/*
	 * The load exception handler expects AT to stay untouched (it holds
	 * the end-of-source address), so normally the assembler is told not
	 * to use it.  With CONFIG_CPU_DADDI_WORKAROUNDS the assembler
	 * temporary is redirected to v1 instead.
	 */
#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
	.set	noat
#else
	.set	at=v1
#endif

/*
 * A combined memcpy/__copy_user
 * __copy_user sets len to 0 for success; else to an upper bound of
 * the number of uncopied bytes.
 * memcpy sets v0 to dst.
 *
 * Roadmap:
 *   len < NBYTES               -> copy_bytes_checklen (byte copies)
 *   dst unaligned              -> dst_unaligned: one partial store aligns
 *                                 dst, then both_aligned if src had the
 *                                 same misalignment, otherwise
 *                                 src_unaligned_dst_aligned
 *   src and dst aligned        -> both_aligned: 8 units per iteration, then
 *                                 4 units, single units and finally one
 *                                 partial store for the tail
 *   src unaligned, dst aligned -> src_unaligned_dst_aligned: 4 units per
 *                                 iteration using LDFIRST/LDREST pairs,
 *                                 then single units, then byte copies
 *
 * Every access that may fault is wrapped in EXC() naming its fixup handler.
 */
	.align	5
LEAF(memcpy)					/* a0=dst a1=src a2=len */
	move	v0, dst				/* return value */
/*
 * memmove branches to __memcpy for non-overlapping buffers; it sets v0
 * itself in the delay slot of that branch.
 */
__memcpy:
FEXPORT(__copy_user)
	/*
	 * Note: dst & src may be unaligned, len may be 0
	 * Temps
	 */
#define rem t8

	/*
	 * The "issue break"s below are very approximate.
	 * Issue delays for dcache fills will perturb the schedule, as will
	 * load queue full replay traps, etc.
	 *
	 * If len < NBYTES use byte operations.
	 */
	PREF(	0, 0(src) )
	PREF(	1, 0(dst) )
	sltu	t2, len, NBYTES
	and	t1, dst, ADDRMASK
	PREF(	0, 1*32(src) )
	PREF(	1, 1*32(dst) )
	bnez	t2, copy_bytes_checklen
	 and	t0, src, ADDRMASK
	PREF(	0, 2*32(src) )
	PREF(	1, 2*32(dst) )
	bnez	t1, dst_unaligned
	 nop
	bnez	t0, src_unaligned_dst_aligned
	/*
	 * use delay slot for fall-through
	 * src and dst are aligned; need to compute rem
	 */
both_aligned:
	 SRL	t0, len, LOG_NBYTES+3    # +3 for 8 units/iter
	beqz	t0, cleanup_both_aligned # len < 8*NBYTES
	 and	rem, len, (8*NBYTES-1)	 # rem = len % (8*NBYTES)
	PREF(	0, 3*32(src) )
	PREF(	1, 3*32(dst) )
	.align	4
	/*
	 * Main aligned loop: 8 units per iteration.  len is decremented
	 * before the first store, so a faulting store of unit k is fixed up
	 * by s_exc_p<8-k>u, which adds the units not yet stored back to len.
	 * The stores issued after src/dst have been advanced use negative
	 * unit offsets.
	 */
1:
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 8*NBYTES
EXC(	LOAD	t4, UNIT(4)(src),	l_exc_copy)
EXC(	LOAD	t7, UNIT(5)(src),	l_exc_copy)
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p8u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p7u)
EXC(	LOAD	t0, UNIT(6)(src),	l_exc_copy)
EXC(	LOAD	t1, UNIT(7)(src),	l_exc_copy)
	ADD	src, src, 8*NBYTES
	ADD	dst, dst, 8*NBYTES
EXC(	STORE	t2, UNIT(-6)(dst),	s_exc_p6u)
EXC(	STORE	t3, UNIT(-5)(dst),	s_exc_p5u)
EXC(	STORE	t4, UNIT(-4)(dst),	s_exc_p4u)
EXC(	STORE	t7, UNIT(-3)(dst),	s_exc_p3u)
EXC(	STORE	t0, UNIT(-2)(dst),	s_exc_p2u)
EXC(	STORE	t1, UNIT(-1)(dst),	s_exc_p1u)
	PREF(	0, 8*32(src) )
	PREF(	1, 8*32(dst) )
	bne	len, rem, 1b
	 nop

	/*
	 * len == rem == the number of bytes left to copy < 8*NBYTES
	 */
cleanup_both_aligned:
	beqz	len, done
	 sltu	t0, len, 4*NBYTES
	bnez	t0, less_than_4units
	 and	rem, len, (NBYTES-1)	# rem = len % NBYTES
	/*
	 * len >= 4*NBYTES
	 */
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 4*NBYTES
	ADD	src, src, 4*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p1u)
	/* The assembler fills the delay slot itself in reorder mode. */
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, 4*NBYTES
	beqz	len, done
	.set	noreorder
less_than_4units:
	/*
	 * rem = len % NBYTES
	 */
	beq	rem, len, copy_bytes
	 nop
	/* Copy whole units one at a time until only rem bytes are left. */
1:
EXC(	LOAD	t0, 0(src),		l_exc)
	ADD	src, src, NBYTES
	SUB	len, len, NBYTES
EXC(	STORE	t0, 0(dst),		s_exc_p1u)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, NBYTES
	bne	rem, len, 1b
	.set	noreorder

	/*
	 * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
	 * A loop would do only a byte at a time with possible branch
	 * mispredicts.  Can't do an explicit LOAD dst,mask,or,STORE
	 * because can't assume read-access to dst.  Instead, use
	 * STREST dst, which doesn't require read access to dst.
	 *
	 * This code should perform better than a simple loop on modern,
	 * wide-issue mips processors because the code has fewer branches and
	 * more instruction-level parallelism.
	 */
#define bits t2
	beqz	len, done
	 ADD	t1, dst, len	# t1 is just past last byte of dst
	li	bits, 8*NBYTES
	SLL	rem, len, 3	# rem = number of bits to keep
EXC(	LOAD	t0, 0(src),		l_exc)
	SUB	bits, bits, rem	# bits = number of bits to discard
	SHIFT_DISCARD t0, t0, bits
EXC(	STREST	t0, -1(t1),		s_exc)
	jr	ra
	 move	len, zero
dst_unaligned:
	/*
	 * dst is unaligned
	 * t0 = src & ADDRMASK
	 * t1 = dst & ADDRMASK; t1 > 0
	 * len >= NBYTES
	 *
	 * Copy enough bytes to align dst
	 * Set match = (src and dst have same alignment)
	 */
#define match rem
EXC(	LDFIRST	t3, FIRST(0)(src),	l_exc)
	ADD	t2, zero, NBYTES
EXC(	LDREST	t3, REST(0)(src),	l_exc_copy)
	SUB	t2, t2, t1	# t2 = number of bytes copied
	xor	match, t0, t1
EXC(	STFIRST t3, FIRST(0)(dst),	s_exc)
	/* The delay-slot SUB leaves len == 0 when the branch is taken. */
	beq	len, t2, done
	 SUB	len, len, t2
	ADD	dst, dst, t2
	beqz	match, both_aligned
	 ADD	src, src, t2

src_unaligned_dst_aligned:
	SRL	t0, len, LOG_NBYTES+2    # +2 for 4 units/iter
	PREF(	0, 3*32(src) )
	beqz	t0, cleanup_src_unaligned
	 and	rem, len, (4*NBYTES-1)   # rem = len % (4*NBYTES)
	PREF(	1, 3*32(dst) )
1:
/*
 * Avoid consecutive LD*'s to the same register since some mips
 * implementations can't issue them in the same cycle.
 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 * are to the same unit (unless src is aligned, but it's not).
 */
EXC(	LDFIRST	t0, FIRST(0)(src),	l_exc)
EXC(	LDFIRST	t1, FIRST(1)(src),	l_exc_copy)
	SUB     len, len, 4*NBYTES
EXC(	LDREST	t0, REST(0)(src),	l_exc_copy)
EXC(	LDREST	t1, REST(1)(src),	l_exc_copy)
EXC(	LDFIRST	t2, FIRST(2)(src),	l_exc_copy)
EXC(	LDFIRST	t3, FIRST(3)(src),	l_exc_copy)
EXC(	LDREST	t2, REST(2)(src),	l_exc_copy)
EXC(	LDREST	t3, REST(3)(src),	l_exc_copy)
	PREF(	0, 9*32(src) )		# 0 is PREF_LOAD  (not streamed)
	ADD	src, src, 4*NBYTES
#ifdef CONFIG_CPU_SB1
	nop				# improves slotting
#endif
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p1u)
	PREF(	1, 9*32(dst) )     	# 1 is PREF_STORE (not streamed)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, 4*NBYTES
	bne	len, rem, 1b
	.set	noreorder

cleanup_src_unaligned:
	beqz	len, done
	 and	rem, len, NBYTES-1  # rem = len % NBYTES
	beq	rem, len, copy_bytes
	 nop
	/* Copy whole units one at a time until only rem bytes are left. */
1:
EXC(	LDFIRST t0, FIRST(0)(src),	l_exc)
EXC(	LDREST	t0, REST(0)(src),	l_exc_copy)
	ADD	src, src, NBYTES
	SUB	len, len, NBYTES
EXC(	STORE	t0, 0(dst),		s_exc_p1u)
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, NBYTES
	bne	len, rem, 1b
	.set	noreorder

copy_bytes_checklen:
	beqz	len, done
	 nop
copy_bytes:
	/* 0 < len < NBYTES  */
/*
 * COPY_BYTE(N): load byte N of src, decrement len and branch to done once
 * len reaches zero.  The store of the byte sits in the delay slot of that
 * branch, so it is executed whether or not the branch is taken.
 */
#define COPY_BYTE(N)			\
EXC(	lb	t0, N(src), l_exc);	\
	SUB	len, len, 1;		\
	beqz	len, done;		\
EXC(	 sb	t0, N(dst), s_exc_p1)

	COPY_BYTE(0)
	COPY_BYTE(1)
#ifdef USE_DOUBLE
	COPY_BYTE(2)
	COPY_BYTE(3)
	COPY_BYTE(4)
	COPY_BYTE(5)
#endif
	/*
	 * Getting here means byte NBYTES-2 is the last one (len < NBYTES);
	 * it is stored in the delay slot of the return.
	 */
EXC(	lb	t0, NBYTES-2(src), l_exc)
	SUB	len, len, 1
	jr	ra
EXC(	 sb	t0, NBYTES-2(dst), s_exc_p1)
done:
	jr	ra
	 nop
	END(memcpy)

/*
 * Fixup handlers for faulting loads.
 *
 * l_exc_copy copies byte by byte from src up to the recorded bad address
 * and then falls through to l_exc.  l_exc sets len to the number of
 * uncopied bytes (AT - bad address) and zeroes that many bytes of the
 * destination, starting at the byte matching the bad address.
 */
l_exc_copy:
	/*
	 * Copy bytes from src until faulting load address (or until a
	 * lb faults)
	 *
	 * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
	 * may be more than a byte beyond the last address.
	 * Hence, the lb below may get an exception.
	 *
	 * Assumes src < THREAD_BUADDR($28)
	 */
	LOAD	t0, TI_TASK($28)
	 nop
	LOAD	t0, THREAD_BUADDR(t0)
1:
EXC(	lb	t1, 0(src),	l_exc)
	ADD	src, src, 1
	sb	t1, 0(dst)	# can't fault -- we're copy_from_user
	.set	reorder				/* DADDI_WAR */
	ADD	dst, dst, 1
	bne	src, t0, 1b
	.set	noreorder
l_exc:
	LOAD	t0, TI_TASK($28)
	 nop
	LOAD	t0, THREAD_BUADDR(t0)	# t0 is just past last good address
	 nop
	SUB	len, AT, t0		# len number of uncopied bytes
	/*
	 * Here's where we rely on src and dst being incremented in tandem,
	 *   See (3) above.
	 * dst += (fault addr - src) to put dst at first byte to clear
	 */
	ADD	dst, t0			# compute start address in a0 (dst)
	SUB	dst, src
	/*
	 * Clear len bytes starting at dst.  Can't call __bzero because it
	 * might modify len.  An inefficient loop for these rare times...
	 *
	 * src is no longer needed as a pointer and is reused as the loop
	 * counter, starting at len - 1.
	 */
	.set	reorder				/* DADDI_WAR */
	SUB	src, len, 1
	beqz	len, done
	.set	noreorder
1:	sb	zero, 0(dst)
	ADD	dst, dst, 1
#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
	bnez	src, 1b
	 SUB	src, src, 1
#else
	/*
	 * Workaround variant: the constant is loaded into v1 first so that
	 * the SUB in the delay slot is a plain register-register operation.
	 */
	.set	push
	.set	noat
	li	v1, 1
	bnez	src, 1b
	 SUB	src, src, v1
	.set	pop
#endif
	jr	ra
	 nop


/*
 * Fixup handlers for faulting stores.
 *
 * SEXC(n) defines s_exc_p<n>u, which adds n units (n*NBYTES bytes) back to
 * len and returns.  The copy loops decrement len before storing, so the
 * handler named on each store restores the amount that was not stored.
 */
#define SEXC(n)							\
	.set	reorder;			/* DADDI_WAR */	\
s_exc_p ## n ## u:						\
	ADD	len, len, n*NBYTES;				\
	jr	ra;						\
	.set	noreorder

SEXC(8)
SEXC(7)
SEXC(6)
SEXC(5)
SEXC(4)
SEXC(3)
SEXC(2)
SEXC(1)

/* Byte-copy variant: one byte is added back to len. */
s_exc_p1:
	.set	reorder				/* DADDI_WAR */
	ADD	len, len, 1
	jr	ra
	.set	noreorder
/* Store fault where len needs no adjustment: just return. */
s_exc:
	jr	ra
	 nop

/*
 * memmove: a0=dst a1=src a2=len, returns dst in v0.
 *
 * The buffers overlap only if src < dst + len and dst < src + len.  When
 * they do not overlap the copy is handed to __memcpy; otherwise execution
 * falls through to __rmemcpy, unless len is zero.
 */
	.align	5
LEAF(memmove)
	ADD	t0, a0, a2			/* t0 = dst + len */
	ADD	t1, a1, a2			/* t1 = src + len */
	sltu	t0, a1, t0			# dst + len <= src -> memcpy
	sltu	t1, a0, t1			# dst >= src + len -> memcpy
	and	t0, t1
	beqz	t0, __memcpy
	 move	v0, a0				/* return value */
	/*
	 * The delay slot of this branch is the first instruction of
	 * __rmemcpy below.
	 */
	beqz	a2, r_out
	END(memmove)

	/* fall through to __rmemcpy */
/*
 * __rmemcpy: byte-wise copy for overlapping buffers.  Copies backwards
 * (from the end) when src < dst and forwards otherwise, and returns with
 * a2 set to zero.
 *
 * NOTE(review): both loops decrement a2 before testing it, so they rely on
 * a2 being non-zero on entry; the zero-length check visible here is the one
 * in memmove.
 */
LEAF(__rmemcpy)					/* a0=dst a1=src a2=len */
	/* Also executes as the delay slot of memmove's final branch. */
	 sltu	t0, a1, a0
	beqz	t0, r_end_bytes_up		# src >= dst
	 nop
	ADD	a0, a2				# dst = dst + len
	ADD	a1, a2				# src = src + len

	/* Backward copy: a0/a1 point just past the next byte to move. */
r_end_bytes:
	lb	t0, -1(a1)
	SUB	a2, a2, 0x1
	sb	t0, -1(a0)
	SUB	a1, a1, 0x1
	.set	reorder				/* DADDI_WAR */
	SUB	a0, a0, 0x1
	bnez	a2, r_end_bytes
	.set	noreorder

r_out:
	jr	ra
	 move	a2, zero

	/* Forward copy: a0/a1 point at the next byte to move. */
r_end_bytes_up:
	lb	t0, (a1)
	SUB	a2, a2, 0x1
	sb	t0, (a0)
	ADD	a1, a1, 0x1
	.set	reorder				/* DADDI_WAR */
	ADD	a0, a0, 0x1
	bnez	a2, r_end_bytes_up
	.set	noreorder

	jr	ra
	 move	a2, zero
	END(__rmemcpy)
Commit	Line	Data
1da177e4 LT	1	/*
	2	* This file is subject to the terms and conditions of the GNU General Public
	3	* License. See the file "COPYING" in the main directory of this archive
	4	* for more details.
	5	*
	6	* Unified implementation of memcpy, memmove and the __copy_user backend.
	7	*
	8	* Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
	9	* Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
	10	* Copyright (C) 2002 Broadcom, Inc.
	11	* memcpy/copy_user author: Mark Vandevoorde
619b6e18	12	* Copyright (C) 2007 Maciej W. Rozycki
1da177e4 LT	13	*
	14	* Mnemonic names for arguments to memcpy/__copy_user
	15	*/
e5adb877 RB	16
	17	/*
	18	* Hack to resolve longstanding prefetch issue
	19	*
	20	* Prefetching may be fatal on some systems if we're prefetching beyond the
	21	* end of memory on some systems. It's also a seriously bad idea on non
	22	* dma-coherent systems.
	23	*/
	24	#if !defined(CONFIG_DMA_COHERENT) || !defined(CONFIG_DMA_IP27)
	25	#undef CONFIG_CPU_HAS_PREFETCH
	26	#endif
	27	#ifdef CONFIG_MIPS_MALTA
	28	#undef CONFIG_CPU_HAS_PREFETCH
	29	#endif
	30
1da177e4	31	#include <asm/asm.h>
048eb582	32	#include <asm/asm-offsets.h>
1da177e4 LT	33	#include <asm/regdef.h>
	34
	35	#define dst a0
	36	#define src a1
	37	#define len a2
	38
	39	/*
	40	* Spec
	41	*
	42	* memcpy copies len bytes from src to dst and sets v0 to dst.
	43	* It assumes that
	44	* - src and dst don't overlap
	45	* - src is readable
	46	* - dst is writable
	47	* memcpy uses the standard calling convention
	48	*
	49	* __copy_user copies up to len bytes from src to dst and sets a2 (len) to
	50	* the number of uncopied bytes due to an exception caused by a read or write.
	51	* __copy_user assumes that src and dst don't overlap, and that the call is
	52	* implementing one of the following:
	53	* copy_to_user
	54	* - src is readable (no exceptions when reading src)
	55	* copy_from_user
	56	* - dst is writable (no exceptions when writing dst)
	57	* __copy_user uses a non-standard calling convention; see
	58	* include/asm-mips/uaccess.h
	59	*
	60	* When an exception happens on a load, the handler must
	61	# ensure that all of the destination buffer is overwritten to prevent
	62	* leaking information to user mode programs.
	63	*/
	64
	65	/*
	66	* Implementation
	67	*/
	68
	69	/*
	70	* The exception handler for loads requires that:
	71	* 1- AT contain the address of the byte just past the end of the source
	72	* of the copy,
	73	* 2- src_entry <= src < AT, and
	74	* 3- (dst - src) == (dst_entry - src_entry),
	75	* The _entry suffix denotes values when __copy_user was called.
	76	*
	77	* (1) is set up up by uaccess.h and maintained by not writing AT in copy_user
	78	* (2) is met by incrementing src by the number of bytes copied
	79	* (3) is met by not doing loads between a pair of increments of dst and src
	80	*
	81	* The exception handlers for stores adjust len (if necessary) and return.
	82	* These handlers do not need to overwrite any data.
	83	*
	84	* For __rmemcpy and memmove an exception is always a kernel bug, therefore
	85	* they're not protected.
	86	*/
	87
	88	#define EXC(inst_reg,addr,handler) \
	89	9: inst_reg, addr; \
	90	.section __ex_table,"a"; \
	91	PTR 9b, handler; \
	92	.previous
	93
	94	/*
	95	* Only on the 64-bit kernel we can made use of 64-bit registers.
	96	*/
875d43e7	97	#ifdef CONFIG_64BIT
1da177e4 LT	98	#define USE_DOUBLE
	99	#endif
	100
	101	#ifdef USE_DOUBLE
	102
	103	#define LOAD ld
	104	#define LOADL ldl
	105	#define LOADR ldr
	106	#define STOREL sdl
	107	#define STORER sdr
	108	#define STORE sd
	109	#define ADD daddu
	110	#define SUB dsubu
	111	#define SRL dsrl
	112	#define SRA dsra
	113	#define SLL dsll
	114	#define SLLV dsllv
	115	#define SRLV dsrlv
	116	#define NBYTES 8
	117	#define LOG_NBYTES 3
	118
42a3b4f2	119	/*
1da177e4 LT	120	* As we are sharing code base with the mips32 tree (which use the o32 ABI
	121	* register definitions). We need to redefine the register definitions from
	122	* the n64 ABI register naming to the o32 ABI register naming.
	123	*/
	124	#undef t0
	125	#undef t1
	126	#undef t2
	127	#undef t3
	128	#define t0 $8
	129	#define t1 $9
	130	#define t2 $10
	131	#define t3 $11
	132	#define t4 $12
	133	#define t5 $13
	134	#define t6 $14
	135	#define t7 $15
42a3b4f2	136
1da177e4 LT	137	#else
	138
	139	#define LOAD lw
	140	#define LOADL lwl
	141	#define LOADR lwr
	142	#define STOREL swl
	143	#define STORER swr
	144	#define STORE sw
	145	#define ADD addu
	146	#define SUB subu
	147	#define SRL srl
	148	#define SLL sll
	149	#define SRA sra
	150	#define SLLV sllv
	151	#define SRLV srlv
	152	#define NBYTES 4
	153	#define LOG_NBYTES 2
	154
	155	#endif /* USE_DOUBLE */
	156
	157	#ifdef CONFIG_CPU_LITTLE_ENDIAN
	158	#define LDFIRST LOADR
	159	#define LDREST LOADL
	160	#define STFIRST STORER
	161	#define STREST STOREL
	162	#define SHIFT_DISCARD SLLV
	163	#else
	164	#define LDFIRST LOADL
	165	#define LDREST LOADR
	166	#define STFIRST STOREL
	167	#define STREST STORER
	168	#define SHIFT_DISCARD SRLV
	169	#endif
	170
	171	#define FIRST(unit) ((unit)*NBYTES)
	172	#define REST(unit) (FIRST(unit)+NBYTES-1)
	173	#define UNIT(unit) FIRST(unit)
	174
	175	#define ADDRMASK (NBYTES-1)
	176
	177	.text
	178	.set noreorder
619b6e18	179	#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
1da177e4	180	.set noat
619b6e18 MR	181	#else
	182	.set at=v1
	183	#endif
1da177e4 LT	184
	185	/*
	186	* A combined memcpy/__copy_user
	187	* __copy_user sets len to 0 for success; else to an upper bound of
	188	* the number of uncopied bytes.
	189	* memcpy sets v0 to dst.
	190	*/
	191	.align 5
	192	LEAF(memcpy) /* a0=dst a1=src a2=len */
	193	move v0, dst /* return value */
	194	__memcpy:
	195	FEXPORT(__copy_user)
	196	/*
	197	* Note: dst & src may be unaligned, len may be 0
	198	* Temps
	199	*/
	200	#define rem t8
	201
	202	/*
	203	* The "issue break"s below are very approximate.
	204	* Issue delays for dcache fills will perturb the schedule, as will
	205	* load queue full replay traps, etc.
	206	*
	207	* If len < NBYTES use byte operations.
	208	*/
	209	PREF( 0, 0(src) )
	210	PREF( 1, 0(dst) )
	211	sltu t2, len, NBYTES
	212	and t1, dst, ADDRMASK
	213	PREF( 0, 1*32(src) )
	214	PREF( 1, 1*32(dst) )
	215	bnez t2, copy_bytes_checklen
	216	and t0, src, ADDRMASK
	217	PREF( 0, 2*32(src) )
	218	PREF( 1, 2*32(dst) )
	219	bnez t1, dst_unaligned
	220	nop
	221	bnez t0, src_unaligned_dst_aligned
	222	/*
	223	* use delay slot for fall-through
	224	* src and dst are aligned; need to compute rem
	225	*/
	226	both_aligned:
	227	SRL t0, len, LOG_NBYTES+3 # +3 for 8 units/iter
	228	beqz t0, cleanup_both_aligned # len < 8*NBYTES
	229	and rem, len, (8*NBYTES-1) # rem = len % (8*NBYTES)
	230	PREF( 0, 3*32(src) )
	231	PREF( 1, 3*32(dst) )
	232	.align 4
	233	1:
	234	EXC( LOAD t0, UNIT(0)(src), l_exc)
	235	EXC( LOAD t1, UNIT(1)(src), l_exc_copy)
	236	EXC( LOAD t2, UNIT(2)(src), l_exc_copy)
	237	EXC( LOAD t3, UNIT(3)(src), l_exc_copy)
	238	SUB len, len, 8*NBYTES
	239	EXC( LOAD t4, UNIT(4)(src), l_exc_copy)
	240	EXC( LOAD t7, UNIT(5)(src), l_exc_copy)
	241	EXC( STORE t0, UNIT(0)(dst), s_exc_p8u)
	242	EXC( STORE t1, UNIT(1)(dst), s_exc_p7u)
	243	EXC( LOAD t0, UNIT(6)(src), l_exc_copy)
	244	EXC( LOAD t1, UNIT(7)(src), l_exc_copy)
	245	ADD src, src, 8*NBYTES
	246	ADD dst, dst, 8*NBYTES
	247	EXC( STORE t2, UNIT(-6)(dst), s_exc_p6u)
248	EXC( STORE t3, UNIT(-5)(dst), s_exc_p5u)
249	EXC( STORE t4, UNIT(-4)(dst), s_exc_p4u)
250	EXC( STORE t7, UNIT(-3)(dst), s_exc_p3u)
251	EXC( STORE t0, UNIT(-2)(dst), s_exc_p2u)
252	EXC( STORE t1, UNIT(-1)(dst), s_exc_p1u)
253	PREF( 0, 8*32(src) )
254	PREF( 1, 8*32(dst) )
255	bne len, rem, 1b
256	nop
257
258	/*
259	* len == rem == the number of bytes left to copy < 8*NBYTES
260	*/
261	cleanup_both_aligned:
262	beqz len, done
263	sltu t0, len, 4*NBYTES
264	bnez t0, less_than_4units
265	and rem, len, (NBYTES-1) # rem = len % NBYTES
266	/*
267	* len >= 4*NBYTES
268	*/
269	EXC( LOAD t0, UNIT(0)(src), l_exc)
270	EXC( LOAD t1, UNIT(1)(src), l_exc_copy)
271	EXC( LOAD t2, UNIT(2)(src), l_exc_copy)
272	EXC( LOAD t3, UNIT(3)(src), l_exc_copy)
273	SUB len, len, 4*NBYTES
274	ADD src, src, 4*NBYTES
275	EXC( STORE t0, UNIT(0)(dst), s_exc_p4u)
276	EXC( STORE t1, UNIT(1)(dst), s_exc_p3u)
277	EXC( STORE t2, UNIT(2)(dst), s_exc_p2u)
278	EXC( STORE t3, UNIT(3)(dst), s_exc_p1u)
619b6e18 MR	279	.set reorder /* DADDI_WAR */
619b6e18 MR	280	ADD dst, dst, 4*NBYTES
1da177e4	281	beqz len, done
619b6e18	282	.set noreorder
1da177e4 LT	283	less_than_4units:
	284	/*
	285	* rem = len % NBYTES
	286	*/
	287	beq rem, len, copy_bytes
	288	nop
	289	1:
	290	EXC( LOAD t0, 0(src), l_exc)
	291	ADD src, src, NBYTES
	292	SUB len, len, NBYTES
	293	EXC( STORE t0, 0(dst), s_exc_p1u)
619b6e18 MR	294	.set reorder /* DADDI_WAR */
619b6e18 MR	295	ADD dst, dst, NBYTES
1da177e4	296	bne rem, len, 1b
619b6e18	297	.set noreorder
1da177e4 LT	298
	299	/*
	300	* src and dst are aligned, need to copy rem bytes (rem < NBYTES)
	301	* A loop would do only a byte at a time with possible branch
	302	* mispredicts. Can't do an explicit LOAD dst,mask,or,STORE
	303	* because can't assume read-access to dst. Instead, use
	304	* STREST dst, which doesn't require read access to dst.
	305	*
	306	* This code should perform better than a simple loop on modern,
	307	* wide-issue mips processors because the code has fewer branches and
	308	* more instruction-level parallelism.
	309	*/
	310	#define bits t2
	311	beqz len, done
	312	ADD t1, dst, len # t1 is just past last byte of dst
	313	li bits, 8*NBYTES
	314	SLL rem, len, 3 # rem = number of bits to keep
	315	EXC( LOAD t0, 0(src), l_exc)
	316	SUB bits, bits, rem # bits = number of bits to discard
	317	SHIFT_DISCARD t0, t0, bits
	318	EXC( STREST t0, -1(t1), s_exc)
	319	jr ra
	320	move len, zero
	321	dst_unaligned:
	322	/*
	323	* dst is unaligned
	324	* t0 = src & ADDRMASK
	325	* t1 = dst & ADDRMASK; T1 > 0
	326	* len >= NBYTES
	327	*
	328	* Copy enough bytes to align dst
	329	* Set match = (src and dst have same alignment)
	330	*/
	331	#define match rem
	332	EXC( LDFIRST t3, FIRST(0)(src), l_exc)
	333	ADD t2, zero, NBYTES
	334	EXC( LDREST t3, REST(0)(src), l_exc_copy)
	335	SUB t2, t2, t1 # t2 = number of bytes copied
	336	xor match, t0, t1
	337	EXC( STFIRST t3, FIRST(0)(dst), s_exc)
	338	beq len, t2, done
	339	SUB len, len, t2
	340	ADD dst, dst, t2
	341	beqz match, both_aligned
	342	ADD src, src, t2
	343
	344	src_unaligned_dst_aligned:
	345	SRL t0, len, LOG_NBYTES+2 # +2 for 4 units/iter
	346	PREF( 0, 3*32(src) )
	347	beqz t0, cleanup_src_unaligned
	348	and rem, len, (4*NBYTES-1) # rem = len % 4*NBYTES
	349	PREF( 1, 3*32(dst) )
	350	1:
	351	/*
	352	* Avoid consecutive LD*'s to the same register since some mips
	353	* implementations can't issue them in the same cycle.
	354	* It's OK to load FIRST(N+1) before REST(N) because the two addresses
	355	* are to the same unit (unless src is aligned, but it's not).
	356	*/
	357	EXC( LDFIRST t0, FIRST(0)(src), l_exc)
	358	EXC( LDFIRST t1, FIRST(1)(src), l_exc_copy)
	359	SUB len, len, 4*NBYTES
	360	EXC( LDREST t0, REST(0)(src), l_exc_copy)
	361	EXC( LDREST t1, REST(1)(src), l_exc_copy)
362	EXC( LDFIRST t2, FIRST(2)(src), l_exc_copy)
363	EXC( LDFIRST t3, FIRST(3)(src), l_exc_copy)
364	EXC( LDREST t2, REST(2)(src), l_exc_copy)
365	EXC( LDREST t3, REST(3)(src), l_exc_copy)
366	PREF( 0, 9*32(src) ) # 0 is PREF_LOAD (not streamed)
367	ADD src, src, 4*NBYTES
368	#ifdef CONFIG_CPU_SB1
369	nop # improves slotting
370	#endif
371	EXC( STORE t0, UNIT(0)(dst), s_exc_p4u)
372	EXC( STORE t1, UNIT(1)(dst), s_exc_p3u)
373	EXC( STORE t2, UNIT(2)(dst), s_exc_p2u)
374	EXC( STORE t3, UNIT(3)(dst), s_exc_p1u)
375	PREF( 1, 9*32(dst) ) # 1 is PREF_STORE (not streamed)
619b6e18 MR	376	.set reorder /* DADDI_WAR */
619b6e18 MR	377	ADD dst, dst, 4*NBYTES
1da177e4	378	bne len, rem, 1b
619b6e18	379	.set noreorder
1da177e4 LT	380
	381	cleanup_src_unaligned:
	382	beqz len, done
	383	and rem, len, NBYTES-1 # rem = len % NBYTES
	384	beq rem, len, copy_bytes
	385	nop
	386	1:
	387	EXC( LDFIRST t0, FIRST(0)(src), l_exc)
	388	EXC( LDREST t0, REST(0)(src), l_exc_copy)
	389	ADD src, src, NBYTES
	390	SUB len, len, NBYTES
	391	EXC( STORE t0, 0(dst), s_exc_p1u)
619b6e18 MR	392	.set reorder /* DADDI_WAR */
619b6e18 MR	393	ADD dst, dst, NBYTES
1da177e4	394	bne len, rem, 1b
619b6e18	395	.set noreorder
1da177e4 LT	396
	397	copy_bytes_checklen:
	398	beqz len, done
	399	nop
	400	copy_bytes:
	401	/* 0 < len < NBYTES */
	402	#define COPY_BYTE(N) \
	403	EXC( lb t0, N(src), l_exc); \
	404	SUB len, len, 1; \
	405	beqz len, done; \
	406	EXC( sb t0, N(dst), s_exc_p1)
	407
	408	COPY_BYTE(0)
	409	COPY_BYTE(1)
	410	#ifdef USE_DOUBLE
	411	COPY_BYTE(2)
	412	COPY_BYTE(3)
	413	COPY_BYTE(4)
	414	COPY_BYTE(5)
	415	#endif
	416	EXC( lb t0, NBYTES-2(src), l_exc)
	417	SUB len, len, 1
	418	jr ra
	419	EXC( sb t0, NBYTES-2(dst), s_exc_p1)
	420	done:
	421	jr ra
	422	nop
	423	END(memcpy)
	424
	425	l_exc_copy:
	426	/*
	427	* Copy bytes from src until faulting load address (or until a
	428	* lb faults)
	429	*
	430	* When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
	431	* may be more than a byte beyond the last address.
	432	* Hence, the lb below may get an exception.
	433	*
	434	* Assumes src < THREAD_BUADDR($28)
	435	*/
	436	LOAD t0, TI_TASK($28)
	437	nop
	438	LOAD t0, THREAD_BUADDR(t0)
	439	1:
	440	EXC( lb t1, 0(src), l_exc)
	441	ADD src, src, 1
	442	sb t1, 0(dst) # can't fault -- we're copy_from_user
619b6e18 MR	443	.set reorder /* DADDI_WAR */
619b6e18 MR	444	ADD dst, dst, 1
1da177e4	445	bne src, t0, 1b
619b6e18	446	.set noreorder
1da177e4 LT	447	l_exc:
	448	LOAD t0, TI_TASK($28)
	449	nop
	450	LOAD t0, THREAD_BUADDR(t0) # t0 is just past last good address
	451	nop
	452	SUB len, AT, t0 # len number of uncopied bytes
	453	/*
	454	* Here's where we rely on src and dst being incremented in tandem,
	455	* See (3) above.
	456	* dst += (fault addr - src) to put dst at first byte to clear
	457	*/
	458	ADD dst, t0 # compute start address in a1
	459	SUB dst, src
	460	/*
	461	* Clear len bytes starting at dst. Can't call __bzero because it
	462	* might modify len. An inefficient loop for these rare times...
	463	*/
619b6e18 MR	464	.set reorder /* DADDI_WAR */
619b6e18 MR	465	SUB src, len, 1
1da177e4	466	beqz len, done
619b6e18	467	.set noreorder
1da177e4 LT	468	1: sb zero, 0(dst)
1da177e4 LT	469	ADD dst, dst, 1
619b6e18	470	#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
1da177e4 LT	471	bnez src, 1b
1da177e4 LT	472	SUB src, src, 1
619b6e18 MR	473	#else
	474	.set push
	475	.set noat
	476	li v1, 1
	477	bnez src, 1b
	478	SUB src, src, v1
	479	.set pop
	480	#endif
1da177e4 LT	481	jr ra
	482	nop
	483
	484
619b6e18 MR	485	#define SEXC(n) \
	486	.set reorder; /* DADDI_WAR */ \
	487	s_exc_p ## n ## u: \
	488	ADD len, len, n*NBYTES; \
	489	jr ra; \
	490	.set noreorder
1da177e4 LT	491
	492	SEXC(8)
	493	SEXC(7)
	494	SEXC(6)
	495	SEXC(5)
	496	SEXC(4)
	497	SEXC(3)
	498	SEXC(2)
	499	SEXC(1)
	500
	501	s_exc_p1:
619b6e18 MR	502	.set reorder /* DADDI_WAR */
619b6e18 MR	503	ADD len, len, 1
1da177e4	504	jr ra
619b6e18	505	.set noreorder
1da177e4 LT	506	s_exc:
	507	jr ra
	508	nop
	509
	510	.align 5
	511	LEAF(memmove)
	512	ADD t0, a0, a2
	513	ADD t1, a1, a2
	514	sltu t0, a1, t0 # dst + len <= src -> memcpy
	515	sltu t1, a0, t1 # dst >= src + len -> memcpy
	516	and t0, t1
	517	beqz t0, __memcpy
	518	move v0, a0 /* return value */
	519	beqz a2, r_out
	520	END(memmove)
	521
	522	/* fall through to __rmemcpy */
	523	LEAF(__rmemcpy) /* a0=dst a1=src a2=len */
	524	sltu t0, a1, a0
	525	beqz t0, r_end_bytes_up # src >= dst
	526	nop
	527	ADD a0, a2 # dst = dst + len
	528	ADD a1, a2 # src = src + len
	529
	530	r_end_bytes:
	531	lb t0, -1(a1)
	532	SUB a2, a2, 0x1
	533	sb t0, -1(a0)
	534	SUB a1, a1, 0x1
619b6e18 MR	535	.set reorder /* DADDI_WAR */
619b6e18 MR	536	SUB a0, a0, 0x1
1da177e4	537	bnez a2, r_end_bytes
619b6e18	538	.set noreorder
1da177e4 LT	539
	540	r_out:
	541	jr ra
	542	move a2, zero
	543
	544	r_end_bytes_up:
	545	lb t0, (a1)
	546	SUB a2, a2, 0x1
	547	sb t0, (a0)
	548	ADD a1, a1, 0x1
619b6e18 MR	549	.set reorder /* DADDI_WAR */
619b6e18 MR	550	ADD a0, a0, 0x1
1da177e4	551	bnez a2, r_end_bytes_up
619b6e18	552	.set noreorder
1da177e4 LT	553
	554	jr ra
	555	move a2, zero
	556	END(__rmemcpy)