/*
 * Copyright 2008 Vitaly Mayatskikh <vmayatsk@redhat.com>
 * Copyright 2002 Andi Kleen, SuSE Labs.
 * Subject to the GNU Public License v2.
 *
 * Functions to copy from and to user space.
 */

#include <linux/linkage.h>
#include <asm/current.h>
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>
#include <asm/asm.h>
#include <asm/smap.h>
#include <asm/export.h>

/*
 * copy_user_generic_unrolled - memory copy with exception handling.
 * This version is for CPUs like P4 that don't have efficient
 * microcode for rep movsq.
 *
 * Input:
 * rdi destination
 * rsi source
 * rdx count
 *
 * Output:
 * eax uncopied bytes or 0 if successful.
 */
ENTRY(copy_user_generic_unrolled)
	ASM_STAC
	cmpl $8,%edx
	jb 20f		/* less than 8 bytes, go to byte copy loop */
	ALIGN_DESTINATION
	movl %edx,%ecx
	andl $63,%edx
	shrl $6,%ecx
	jz .L_copy_short_string
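	/* ecx = number of 64-byte chunks, edx = trailing bytes (0..63) */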
1:	movq (%rsi),%r8
2:	movq 1*8(%rsi),%r9
3:	movq 2*8(%rsi),%r10
4:	movq 3*8(%rsi),%r11
5:	movq %r8,(%rdi)
6:	movq %r9,1*8(%rdi)
7:	movq %r10,2*8(%rdi)
8:	movq %r11,3*8(%rdi)
9:	movq 4*8(%rsi),%r8
10:	movq 5*8(%rsi),%r9
11:	movq 6*8(%rsi),%r10
12:	movq 7*8(%rsi),%r11
13:	movq %r8,4*8(%rdi)
14:	movq %r9,5*8(%rdi)
15:	movq %r10,6*8(%rdi)
16:	movq %r11,7*8(%rdi)
	leaq 64(%rsi),%rsi
	leaq 64(%rdi),%rdi
	decl %ecx
	jnz 1b
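	/* Tail: copy the remaining 0..63 bytes, 8 bytes at a time first */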
.L_copy_short_string:
	movl %edx,%ecx
	andl $7,%edx
	shrl $3,%ecx
	jz 20f
18:	movq (%rsi),%r8
19:	movq %r8,(%rdi)
	leaq 8(%rsi),%rsi
	leaq 8(%rdi),%rdi
	decl %ecx
	jnz 18b
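	/* Byte-copy whatever is left (0..7 bytes) */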
20:	andl %edx,%edx
	jz 23f
	movl %edx,%ecx
21:	movb (%rsi),%al
22:	movb %al,(%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz 21b
23:	xor %eax,%eax
	ASM_CLAC
	ret

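	/*
	 * Exception fixups: work out how many bytes were NOT copied and
	 * hand off to copy_user_handle_tail, which byte-copies the rest
	 * and returns the final uncopied count in eax.
	 *  30: fault in the 64-byte loop -> remaining = ecx*64 + edx
	 *  40: fault in the 8-byte loop  -> remaining = edx + ecx*8
	 *  50: fault in the byte loop    -> remaining = ecx
	 */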
	.section .fixup,"ax"
30:	shll $6,%ecx
	addl %ecx,%edx
	jmp 60f
40:	leal (%rdx,%rcx,8),%edx
	jmp 60f
50:	movl %ecx,%edx
60:	jmp copy_user_handle_tail /* ecx is zerorest also */
	.previous

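	/* Map every faultable load/store above to its fixup entry point */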
	_ASM_EXTABLE(1b,30b)
	_ASM_EXTABLE(2b,30b)
	_ASM_EXTABLE(3b,30b)
	_ASM_EXTABLE(4b,30b)
	_ASM_EXTABLE(5b,30b)
	_ASM_EXTABLE(6b,30b)
	_ASM_EXTABLE(7b,30b)
	_ASM_EXTABLE(8b,30b)
	_ASM_EXTABLE(9b,30b)
	_ASM_EXTABLE(10b,30b)
	_ASM_EXTABLE(11b,30b)
	_ASM_EXTABLE(12b,30b)
	_ASM_EXTABLE(13b,30b)
	_ASM_EXTABLE(14b,30b)
	_ASM_EXTABLE(15b,30b)
	_ASM_EXTABLE(16b,30b)
	_ASM_EXTABLE(18b,40b)
	_ASM_EXTABLE(19b,40b)
	_ASM_EXTABLE(21b,50b)
	_ASM_EXTABLE(22b,50b)
ENDPROC(copy_user_generic_unrolled)
EXPORT_SYMBOL(copy_user_generic_unrolled)

/*
 * Some CPUs run faster using the string copy instructions.
 * This is also a lot simpler. Use them when possible.
 *
 * Only 4GB of copy is supported. This shouldn't be a problem
 * because the kernel normally only writes from/to page-sized chunks
 * even if user space passed a longer buffer.
 * And more would be dangerous because both Intel and AMD have
 * errata with rep movsq > 4GB. If someone feels the need to fix
 * this, please keep those errata in mind.
 *
 * Input:
 * rdi destination
 * rsi source
 * rdx count
 *
 * Output:
 * eax uncopied bytes or 0 if successful.
 */
ENTRY(copy_user_generic_string)
	ASM_STAC
	cmpl $8,%edx
	jb 2f		/* less than 8 bytes, go to byte copy loop */
	ALIGN_DESTINATION
	movl %edx,%ecx
	shrl $3,%ecx
	andl $7,%edx
1:	rep
	movsq
2:	movl %edx,%ecx
3:	rep
	movsb
	xorl %eax,%eax
	ASM_CLAC
	ret

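	/*
	 * Fixups: 11: fault in rep movsq -> remaining = edx + rcx*8
	 *         12: fault in rep movsb -> remaining = ecx
	 * Either way, execution continues at copy_user_handle_tail.
	 */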
	.section .fixup,"ax"
11:	leal (%rdx,%rcx,8),%ecx
12:	movl %ecx,%edx		/* ecx is zerorest also */
	jmp copy_user_handle_tail
	.previous

	_ASM_EXTABLE(1b,11b)
	_ASM_EXTABLE(3b,12b)
ENDPROC(copy_user_generic_string)
EXPORT_SYMBOL(copy_user_generic_string)

/*
 * Some CPUs are adding enhanced REP MOVSB/STOSB instructions.
 * It's recommended to use enhanced REP MOVSB/STOSB if it's enabled.
 *
 * Input:
 * rdi destination
 * rsi source
 * rdx count
 *
 * Output:
 * eax uncopied bytes or 0 if successful.
 */
ENTRY(copy_user_enhanced_fast_string)
	ASM_STAC
	cmpl $64,%edx
	jb .L_copy_short_string	/* less than 64 bytes, avoid the costly 'rep' */
	movl %edx,%ecx
1:	rep
	movsb
	xorl %eax,%eax
	ASM_CLAC
	ret

	.section .fixup,"ax"
12:	movl %ecx,%edx		/* ecx is zerorest also */
	jmp copy_user_handle_tail
	.previous

	_ASM_EXTABLE(1b,12b)
ENDPROC(copy_user_enhanced_fast_string)
EXPORT_SYMBOL(copy_user_enhanced_fast_string)

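/*
 * Illustrative sketch only, not part of the original file: all three
 * variants above use the same convention (rdi = destination, rsi =
 * source, rdx = count) and return the number of uncopied bytes in eax.
 * The prototype and caller below are assumptions for illustration:
 *
 *	unsigned long copy_user_enhanced_fast_string(void *to,
 *						     const void *from,
 *						     unsigned len);
 *
 *	if (copy_user_enhanced_fast_string(dst, src, len))
 *		return -EFAULT;		// non-zero: fault hit mid-copy
 *
 * Callers are expected to pick one variant (or go through a C wrapper
 * that does so based on CPU features such as X86_FEATURE_ERMS).
 */
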
/*
 * copy_user_nocache - Uncached memory copy with exception handling
 * This will force the destination out of the cache for better performance.
 *
 * Note: Cached memory copy is used when destination or size is not
 * naturally aligned. That is:
 *  - Require 8-byte alignment when size is 8 bytes or larger.
 *  - Require 4-byte alignment when size is 4 bytes.
 */
ENTRY(__copy_user_nocache)
	ASM_STAC

	/* If size is less than 8 bytes, go to 4-byte copy */
	cmpl $8,%edx
	jb .L_4b_nocache_copy_entry

	/* If destination is not 8-byte aligned, "cache" copy to align it */
	ALIGN_DESTINATION

	/* Set 4x8-byte copy count and remainder */
	movl %edx,%ecx
	andl $63,%edx
	shrl $6,%ecx
	jz .L_8b_nocache_copy_entry	/* jump if count is 0 */

	/* Perform 4x8-byte nocache loop-copy */
.L_4x8b_nocache_copy_loop:
1:	movq (%rsi),%r8
2:	movq 1*8(%rsi),%r9
3:	movq 2*8(%rsi),%r10
4:	movq 3*8(%rsi),%r11
5:	movnti %r8,(%rdi)
6:	movnti %r9,1*8(%rdi)
7:	movnti %r10,2*8(%rdi)
8:	movnti %r11,3*8(%rdi)
9:	movq 4*8(%rsi),%r8
10:	movq 5*8(%rsi),%r9
11:	movq 6*8(%rsi),%r10
12:	movq 7*8(%rsi),%r11
13:	movnti %r8,4*8(%rdi)
14:	movnti %r9,5*8(%rdi)
15:	movnti %r10,6*8(%rdi)
16:	movnti %r11,7*8(%rdi)
	leaq 64(%rsi),%rsi
	leaq 64(%rdi),%rdi
	decl %ecx
	jnz .L_4x8b_nocache_copy_loop

	/* Set 8-byte copy count and remainder */
.L_8b_nocache_copy_entry:
	movl %edx,%ecx
	andl $7,%edx
	shrl $3,%ecx
	jz .L_4b_nocache_copy_entry	/* jump if count is 0 */

	/* Perform 8-byte nocache loop-copy */
.L_8b_nocache_copy_loop:
20:	movq (%rsi),%r8
21:	movnti %r8,(%rdi)
	leaq 8(%rsi),%rsi
	leaq 8(%rdi),%rdi
	decl %ecx
	jnz .L_8b_nocache_copy_loop

	/* If no bytes are left, we're done */
.L_4b_nocache_copy_entry:
	andl %edx,%edx
	jz .L_finish_copy

	/* If destination is not 4-byte aligned, go to byte copy: */
	movl %edi,%ecx
	andl $3,%ecx
	jnz .L_1b_cache_copy_entry

	/* Set 4-byte copy count (1 or 0) and remainder */
	movl %edx,%ecx
	andl $3,%edx
	shrl $2,%ecx
	jz .L_1b_cache_copy_entry	/* jump if count is 0 */

	/* Perform 4-byte nocache copy: */
30:	movl (%rsi),%r8d
31:	movnti %r8d,(%rdi)
	leaq 4(%rsi),%rsi
	leaq 4(%rdi),%rdi

	/* If no bytes are left, we're done: */
	andl %edx,%edx
	jz .L_finish_copy

	/* Perform byte "cache" loop-copy for the remainder */
.L_1b_cache_copy_entry:
	movl %edx,%ecx
.L_1b_cache_copy_loop:
40:	movb (%rsi),%al
41:	movb %al,(%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz .L_1b_cache_copy_loop

	/* Finished copying; fence the prior stores */
.L_finish_copy:
	xorl %eax,%eax
	ASM_CLAC
	sfence
	ret

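	/*
	 * Exception fixups: rebuild the remaining byte count from the loop
	 * counters, fence any pending non-temporal (movnti) stores, then let
	 * copy_user_handle_tail finish up and return the uncopied count.
	 */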
	.section .fixup,"ax"
.L_fixup_4x8b_copy:
	shll $6,%ecx
	addl %ecx,%edx
	jmp .L_fixup_handle_tail
.L_fixup_8b_copy:
	lea (%rdx,%rcx,8),%rdx
	jmp .L_fixup_handle_tail
.L_fixup_4b_copy:
	lea (%rdx,%rcx,4),%rdx
	jmp .L_fixup_handle_tail
.L_fixup_1b_copy:
	movl %ecx,%edx
.L_fixup_handle_tail:
	sfence
	jmp copy_user_handle_tail
	.previous

	_ASM_EXTABLE(1b,.L_fixup_4x8b_copy)
	_ASM_EXTABLE(2b,.L_fixup_4x8b_copy)
	_ASM_EXTABLE(3b,.L_fixup_4x8b_copy)
	_ASM_EXTABLE(4b,.L_fixup_4x8b_copy)
	_ASM_EXTABLE(5b,.L_fixup_4x8b_copy)
	_ASM_EXTABLE(6b,.L_fixup_4x8b_copy)
	_ASM_EXTABLE(7b,.L_fixup_4x8b_copy)
	_ASM_EXTABLE(8b,.L_fixup_4x8b_copy)
	_ASM_EXTABLE(9b,.L_fixup_4x8b_copy)
	_ASM_EXTABLE(10b,.L_fixup_4x8b_copy)
	_ASM_EXTABLE(11b,.L_fixup_4x8b_copy)
	_ASM_EXTABLE(12b,.L_fixup_4x8b_copy)
	_ASM_EXTABLE(13b,.L_fixup_4x8b_copy)
	_ASM_EXTABLE(14b,.L_fixup_4x8b_copy)
	_ASM_EXTABLE(15b,.L_fixup_4x8b_copy)
	_ASM_EXTABLE(16b,.L_fixup_4x8b_copy)
	_ASM_EXTABLE(20b,.L_fixup_8b_copy)
	_ASM_EXTABLE(21b,.L_fixup_8b_copy)
	_ASM_EXTABLE(30b,.L_fixup_4b_copy)
	_ASM_EXTABLE(31b,.L_fixup_4b_copy)
	_ASM_EXTABLE(40b,.L_fixup_1b_copy)
	_ASM_EXTABLE(41b,.L_fixup_1b_copy)
ENDPROC(__copy_user_nocache)
EXPORT_SYMBOL(__copy_user_nocache)