/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */
14 | ||
15 | #include <linux/sys.h> | |
16 | #include <asm/processor.h> | |
17 | #include <asm/errno.h> | |
18 | #include <asm/ppc_asm.h> | |
19 | ||
/*
 * ip_fast_csum(r3=buf, r4=len) -- Optimized for IP header
 * len is in words and is always >= 5.
 *
 * In practice len == 5, but this is not guaranteed. So this code does not
 * attempt to use doubleword instructions.
 */
_GLOBAL(ip_fast_csum)
	lwz	r0,0(r3)
	lwzu	r5,4(r3)
	addic.	r4,r4,-2
	addc	r0,r0,r5
	mtctr	r4
	blelr-
1:	lwzu	r4,4(r3)
	adde	r0,r0,r4
	bdnz	1b
	addze	r0,r0		/* add in final carry */
	rldicl	r4,r0,32,0	/* fold two 32-bit halves together */
	add	r0,r0,r4
	srdi	r0,r0,32
	rlwinm	r3,r0,16,0,31	/* fold two halves together */
	add	r3,r0,r3
	not	r3,r3
	srwi	r3,r3,16
	blr
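
/*
 * For reference, a hedged C sketch of the fold-and-complement sequence
 * above (illustrative only, not kernel API; the helper name is made up):
 *
 *	static inline unsigned short fold_csum64(unsigned long sum)
 *	{
 *		sum = (sum & 0xffffffffUL) + (sum >> 32); // 64 -> 33 bits
 *		while (sum >> 16)                         // 33 -> 16 bits
 *			sum = (sum & 0xffff) + (sum >> 16);
 *		return ~sum & 0xffff;                     // 1's complement
 *	}
 *
 * The assembly gets the same effect branch-free, using a rotate, add
 * and shift for each fold and letting the adds propagate the carries.
 */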
46 | ||
47 | /* | |
48 | * Compute checksum of TCP or UDP pseudo-header: | |
49 | * csum_tcpudp_magic(r3=saddr, r4=daddr, r5=len, r6=proto, r7=sum) | |
50 | * No real gain trying to do this specially for 64 bit, but | |
51 | * the 32 bit addition may spill into the upper bits of | |
52 | * the doubleword so we still must fold it down from 64. | |
53 | */ | |
54 | _GLOBAL(csum_tcpudp_magic) | |
55 | rlwimi r5,r6,16,0,15 /* put proto in upper half of len */ | |
56 | addc r0,r3,r4 /* add 4 32-bit words together */ | |
57 | adde r0,r0,r5 | |
58 | adde r0,r0,r7 | |
59 | rldicl r4,r0,32,0 /* fold 64 bit value */ | |
60 | add r0,r4,r0 | |
61 | srdi r0,r0,32 | |
62 | rlwinm r3,r0,16,0,31 /* fold two halves together */ | |
63 | add r3,r0,r3 | |
64 | not r3,r3 | |
65 | srwi r3,r3,16 | |
66 | blr | |
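
/*
 * A hedged C equivalent of the pseudo-header sum formed above (a
 * sketch; the variable names mirror the register comments):
 *
 *	unsigned long s = saddr + daddr        // 32-bit addresses
 *		+ ((proto << 16) | len)        // rlwimi packs proto|len
 *		+ sum;                         // caller-supplied partial sum
 *	// then fold 64 -> 16 bits and complement, as in ip_fast_csum
 */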
67 | ||
68 | /* | |
69 | * Computes the checksum of a memory block at buff, length len, | |
70 | * and adds in "sum" (32-bit). | |
71 | * | |
14cf11af PM |
72 | * csum_partial(r3=buff, r4=len, r5=sum) |
73 | */ | |
74 | _GLOBAL(csum_partial) | |
9b83ecb0 AB |
75 | addic r0,r5,0 /* clear carry */ |
76 | ||
77 | srdi. r6,r4,3 /* less than 8 bytes? */ | |
78 | beq .Lcsum_tail_word | |
79 | ||
	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 */
	rldicl.	r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6
	mtctr	r6

1:
	lhz	r6,0(r3)		/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b
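
	/*
	 * Hedged aside: the count set above is, in C terms,
	 *
	 *	halfwords = 4 - ((buff >> 1) & 0x3);
	 *
	 * i.e. the number of 2-byte loads needed to reach the next
	 * doubleword boundary (odd byte alignment is ignored, as noted).
	 */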
99 | ||
100 | .Lcsum_aligned: | |
101 | /* | |
102 | * We unroll the loop such that each iteration is 64 bytes with an | |
103 | * entry and exit limb of 64 bytes, meaning a minimum size of | |
104 | * 128 bytes. | |
105 | */ | |
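	/*
	 * (Hence the ctr below is set to len/64 - 1, leaving one full
	 * 64-byte limb for the exit sequence after the loop falls through.)
	 */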
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords		/* len < 128 */

	srdi	r6,r4,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	ld	r6,0(r3)
	ld	r9,8(r3)

	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
	 * the XER dependency. This means the fastest this loop can go is
	 * 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
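	/*
	 * (Worked out: the body below has 8 addes, so 8 x 2 cycles =
	 * 16 cycles per 64-byte iteration, a ceiling of 4 bytes/cycle.)
	 */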
	.align	5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10

	adde	r0,r0,r11

	adde	r0,r0,r12

	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b
157 | ||
158 | ||
159 | adde r0,r0,r6 | |
160 | ld r12,32(r3) | |
161 | ld r14,40(r3) | |
162 | ||
163 | adde r0,r0,r9 | |
164 | ld r15,48(r3) | |
165 | ld r16,56(r3) | |
166 | addi r3,r3,64 | |
167 | ||
168 | adde r0,r0,r10 | |
169 | adde r0,r0,r11 | |
170 | adde r0,r0,r12 | |
171 | adde r0,r0,r14 | |
172 | adde r0,r0,r15 | |
173 | adde r0,r0,r16 | |
174 | ||
c75df6f9 MN |
175 | ld r14,STK_REG(R14)(r1) |
176 | ld r15,STK_REG(R15)(r1) | |
177 | ld r16,STK_REG(R16)(r1) | |
9b83ecb0 AB |
178 | addi r1,r1,STACKFRAMESIZE |
179 | ||
180 | andi. r4,r4,63 | |
181 | ||
.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9

.Lcsum_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr


	.macro source
100:
	.section __ex_table,"a"
	.align 3
	.llong 100b,.Lsrc_error
	.previous
	.endm

	.macro dest
200:
	.section __ex_table,"a"
	.align 3
	.llong 200b,.Ldest_error
	.previous
	.endm

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively. The caller must take any action
 * required in this case (zeroing memory, recalculating partial checksum etc).
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 */
_GLOBAL(csum_partial_copy_generic)
	addic	r0,r6,0			/* clear carry */

	srdi.	r6,r5,3			/* less than 8 bytes? */
	beq	.Lcopy_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 *
	 * If the source and destination are relatively unaligned we only
	 * align the source. This keeps things simple.
	 */
	rldicl.	r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcopy_aligned
271 | ||
272 | li r7,4 | |
273 | sub r6,r7,r6 | |
274 | mtctr r6 | |
275 | ||
276 | 1: | |
277 | source; lhz r6,0(r3) /* align to doubleword */ | |
14cf11af | 278 | subi r5,r5,2 |
14cf11af | 279 | addi r3,r3,2 |
fdd374b6 AB |
280 | adde r0,r0,r6 |
281 | dest; sth r6,0(r4) | |
14cf11af | 282 | addi r4,r4,2 |
fdd374b6 AB |
283 | bdnz 1b |
284 | ||
.Lcopy_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords		/* len < 128 */

	srdi	r6,r5,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

source;	ld	r10,16(r3)
source;	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
	 * the XER dependency. This means the fastest this loop can go is
	 * 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align	5
2:
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

	adde	r0,r0,r16
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)
	bdnz	2b

352 | ||
14cf11af | 353 | adde r0,r0,r6 |
fdd374b6 AB |
354 | source; ld r12,32(r3) |
355 | source; ld r14,40(r3) | |
356 | ||
357 | adde r0,r0,r9 | |
358 | source; ld r15,48(r3) | |
359 | source; ld r16,56(r3) | |
360 | addi r3,r3,64 | |
361 | ||
362 | adde r0,r0,r10 | |
363 | dest; std r6,0(r4) | |
364 | dest; std r9,8(r4) | |
365 | ||
366 | adde r0,r0,r11 | |
367 | dest; std r10,16(r4) | |
368 | dest; std r11,24(r4) | |
369 | ||
370 | adde r0,r0,r12 | |
371 | dest; std r12,32(r4) | |
372 | dest; std r14,40(r4) | |
373 | ||
374 | adde r0,r0,r14 | |
375 | dest; std r15,48(r4) | |
376 | dest; std r16,56(r4) | |
377 | addi r4,r4,64 | |
378 | ||
379 | adde r0,r0,r15 | |
380 | adde r0,r0,r16 | |
381 | ||
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r5,r5,63

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word

	mtctr	r6
3:
source;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dest;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b

	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword

source;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dest;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte

source;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dest;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2

.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish

source;	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
dest;	stb	r6,0(r4)

.Lcopy_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

.Lsrc_error:
	cmpdi	0,r7,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r7)
	blr

.Ldest_error:
	cmpdi	0,r8,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r8)
	blr
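
/*
 * Hedged caller-side sketch (names illustrative only): the caller
 * passes int pointers for the error flags and inspects them after the
 * call, e.g.
 *
 *	int src_err = 0, dst_err = 0;
 *	sum = csum_partial_copy_generic(src, dst, len, sum,
 *					&src_err, &dst_err);
 *	if (src_err || dst_err)
 *		handle_efault();	// zero dst, redo checksum, etc.
 */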