/* Copyright 2002 Andi Kleen, SuSE Labs.
 * Subject to the GNU Public License v2.
 *
 * Functions to copy from and to user space.
 */

#define FIX_ALIGNMENT 1

#include <asm/current.h>
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
#include <asm/cpufeature.h>

/* Standard copy_to_user with segment limit checking */
	.globl copy_to_user
	.p2align 4
copy_to_user:
	GET_THREAD_INFO(%rax)
	movq %rdi,%rcx
	addq %rdx,%rcx
	jc  bad_to_user
	cmpq threadinfo_addr_limit(%rax),%rcx
	jae bad_to_user
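
	/*
	 * The 5-byte jump below is the alternatives hook: by default it
	 * jumps to the unrolled copy at .Lcug, but on CPUs with
	 * X86_FEATURE_REP_GOOD apply_alternatives() patches in the
	 * replacement jump to copy_user_generic_c at boot (the
	 * .altinstructions entry records both instructions as 5 bytes).
	 */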
2:
	.byte 0xe9	/* 32bit jump */
	.long .Lcug-1f
1:

	.section .altinstr_replacement,"ax"
3:	.byte 0xe9	/* replacement jmp with 32bit immediate */
	.long copy_user_generic_c-1b	/* offset */
	.previous
	.section .altinstructions,"a"
	.align 8
	.quad 2b
	.quad 3b
	.byte X86_FEATURE_REP_GOOD
	.byte 5
	.byte 5
	.previous

/* Standard copy_from_user with segment limit checking */
	.globl copy_from_user
	.p2align 4
copy_from_user:
	GET_THREAD_INFO(%rax)
	movq %rsi,%rcx
	addq %rdx,%rcx
	jc  bad_from_user
	cmpq threadinfo_addr_limit(%rax),%rcx
	jae bad_from_user
	/* FALL THROUGH to copy_user_generic */

	.section .fixup,"ax"
	/* must zero dest */
bad_from_user:
	movl %edx,%ecx
	xorl %eax,%eax
	rep
	stosb
bad_to_user:
	movl %edx,%eax
	ret
	.previous
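
	/*
	 * Note the asymmetry above: a failed copy_from_user must zero the
	 * whole destination buffer (rep stosb with the original count) so
	 * no uninitialized kernel data can leak to the caller, while a
	 * failed copy_to_user only needs to report the byte count in eax.
	 */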

/*
 * copy_user_generic - memory copy with exception handling.
 *
 * Input:
 *	rdi destination
 *	rsi source
 *	rdx count
 *
 * Output:
 *	eax uncopied bytes or 0 if successful.
 */
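/*
 * Seen from C this is roughly (prototype as in the x86_64 uaccess
 * headers of this era):
 *	unsigned long copy_user_generic(void *dst, const void *src,
 *					unsigned len);
 */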
	.globl copy_user_generic
	.p2align 4
copy_user_generic:
	.byte 0x66,0x66,0x90	/* 5 byte nop for replacement jump */
	.byte 0x66,0x90
1:
	.section .altinstr_replacement,"ax"
2:	.byte 0xe9	/* near jump with 32bit immediate */
	.long copy_user_generic_c-1b	/* offset */
	.previous
	.section .altinstructions,"a"
	.align 8
	.quad copy_user_generic
	.quad 2b
	.byte X86_FEATURE_REP_GOOD
	.byte 5
	.byte 5
	.previous
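	/*
	 * The two nops at the function entry (0x66 0x66 0x90 is a 3-byte
	 * nop, 0x66 0x90 a 2-byte one) reserve exactly 5 bytes so the
	 * replacement jmp to copy_user_generic_c can be patched over them
	 * on X86_FEATURE_REP_GOOD CPUs; everyone else falls through to
	 * the unrolled code at .Lcug.
	 */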
.Lcug:
	pushq %rbx
	xorl %eax,%eax	/* zero for the exception handler */

#ifdef FIX_ALIGNMENT
	/* check for bad alignment of destination */
	movl %edi,%ecx
	andl $7,%ecx
	jnz .Lbad_alignment
.Lafter_bad_alignment:
#endif

	movq %rdx,%rcx

	movl $64,%ebx
	shrq $6,%rdx
	decq %rdx
	js .Lhandle_tail
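
	/*
	 * Main loop: rdx now holds (count/64)-1 and each iteration moves
	 * 64 bytes as eight qword load/store pairs, grouped four loads
	 * ahead of four stores to keep the loads well clear of the
	 * stores.
	 */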

	.p2align 4
.Lloop:
.Ls1:	movq (%rsi),%r11
.Ls2:	movq 1*8(%rsi),%r8
.Ls3:	movq 2*8(%rsi),%r9
.Ls4:	movq 3*8(%rsi),%r10
.Ld1:	movq %r11,(%rdi)
.Ld2:	movq %r8,1*8(%rdi)
.Ld3:	movq %r9,2*8(%rdi)
.Ld4:	movq %r10,3*8(%rdi)

.Ls5:	movq 4*8(%rsi),%r11
.Ls6:	movq 5*8(%rsi),%r8
.Ls7:	movq 6*8(%rsi),%r9
.Ls8:	movq 7*8(%rsi),%r10
.Ld5:	movq %r11,4*8(%rdi)
.Ld6:	movq %r8,5*8(%rdi)
.Ld7:	movq %r9,6*8(%rdi)
.Ld8:	movq %r10,7*8(%rdi)

	decq %rdx

	leaq 64(%rsi),%rsi
	leaq 64(%rdi),%rdi

	jns .Lloop
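
	/*
	 * Tail: fewer than 64 bytes remain (ecx still holds the original
	 * count). Copy the remaining qwords one at a time in .Lloop_8,
	 * then the last 0-7 bytes in .Lloop_1.
	 */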

	.p2align 4
.Lhandle_tail:
	movl %ecx,%edx
	andl $63,%ecx
	shrl $3,%ecx
	jz .Lhandle_7
	movl $8,%ebx
	.p2align 4
.Lloop_8:
.Ls9:	movq (%rsi),%r8
.Ld9:	movq %r8,(%rdi)
	decl %ecx
	leaq 8(%rdi),%rdi
	leaq 8(%rsi),%rsi
	jnz .Lloop_8

.Lhandle_7:
	movl %edx,%ecx
	andl $7,%ecx
	jz .Lende
	.p2align 4
.Lloop_1:
.Ls10:	movb (%rsi),%bl
.Ld10:	movb %bl,(%rdi)
	incq %rdi
	incq %rsi
	decl %ecx
	jnz .Lloop_1

.Lende:
	popq %rbx
	ret

#ifdef FIX_ALIGNMENT
	/* align destination */
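	/* r9 = number of bytes needed to bring rdi to 8-byte alignment;
	   copy that many bytes one at a time, reduce the count in rdx,
	   then rejoin the main path. Counts no larger than that distance
	   are finished off byte by byte via .Lhandle_7. */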
	.p2align 4
.Lbad_alignment:
	movl $8,%r9d
	subl %ecx,%r9d
	movl %r9d,%ecx
	cmpq %r9,%rdx
	jz .Lhandle_7
	js .Lhandle_7
.Lalign_1:
.Ls11:	movb (%rsi),%bl
.Ld11:	movb %bl,(%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz .Lalign_1
	subq %r9,%rdx
	jmp .Lafter_bad_alignment
#endif

	/* table sorted by exception address */
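	/* Each entry pairs a faulting instruction address with its
	   recovery code. The .Ld* stores share the .Ls*e handlers of the
	   matching loads, since the uncopied-byte computation is the same
	   whichever half of the pair faulted. */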
	.section __ex_table,"a"
	.align 8
	.quad .Ls1,.Ls1e
	.quad .Ls2,.Ls2e
	.quad .Ls3,.Ls3e
	.quad .Ls4,.Ls4e
	.quad .Ld1,.Ls1e
	.quad .Ld2,.Ls2e
	.quad .Ld3,.Ls3e
	.quad .Ld4,.Ls4e
	.quad .Ls5,.Ls5e
	.quad .Ls6,.Ls6e
	.quad .Ls7,.Ls7e
	.quad .Ls8,.Ls8e
	.quad .Ld5,.Ls5e
	.quad .Ld6,.Ls6e
	.quad .Ld7,.Ls7e
	.quad .Ld8,.Ls8e
	.quad .Ls9,.Le_quad
	.quad .Ld9,.Le_quad
	.quad .Ls10,.Le_byte
	.quad .Ld10,.Le_byte
#ifdef FIX_ALIGNMENT
	.quad .Ls11,.Lzero_rest
	.quad .Ld11,.Lzero_rest
#endif
	.quad .Le5,.Le_zero
	.previous

	/* Recompute the uncopied byte count after a fault in the main
	   loop. Accurate only to 8 bytes, with the error on the
	   pessimistic side. This is gross; it would be better to fix the
	   interface. */
	/* eax: zero, ebx: 64 */
.Ls1e:	addl $8,%eax
.Ls2e:	addl $8,%eax
.Ls3e:	addl $8,%eax
.Ls4e:	addl $8,%eax
.Ls5e:	addl $8,%eax
.Ls6e:	addl $8,%eax
.Ls7e:	addl $8,%eax
.Ls8e:	addl $8,%eax
	addq %rbx,%rdi	/* +64 */
	subq %rax,%rdi	/* correct destination with computed offset */

	shlq $6,%rdx	/* loop counter * 64 (stride length) */
	addq %rax,%rdx	/* add offset to loopcnt */
	andl $63,%ecx	/* remaining bytes */
	addq %rcx,%rdx	/* add them */
	jmp .Lzero_rest
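
	/*
	 * Worked example: a fault at .Ls3 (or .Ld3) enters at .Ls3e and
	 * falls through .Ls4e-.Ls8e, so eax = 6*8 = 48. rdi is advanced
	 * by 64-48 = 16, past the two qwords already stored; rdx (full
	 * iterations still to go) is scaled by 64 and the 48 plus the
	 * tail bytes are added, giving the pessimistic uncopied-byte
	 * count that .Lzero_rest clears and returns.
	 */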

	/* exception on quad word loop in tail handling */
	/* ecx: loopcnt/8, %edx: length, rdi: correct */
.Le_quad:
	shll $3,%ecx
	andl $7,%edx
	addl %ecx,%edx
	/* edx: bytes to zero, rdi: dest, eax: zero */
.Lzero_rest:
	movq %rdx,%rcx
.Le_byte:
	xorl %eax,%eax
.Le5:	rep
	stosb
	/* when there is another exception while zeroing the rest, just return */
.Le_zero:
	movq %rdx,%rax
	jmp .Lende

	/* Some CPUs run faster using the string copy instructions.
	   This is also a lot simpler. Use them when possible.
	   Patch in jmps to this code instead of copying it fully
	   to avoid unwanted aliasing in the exception tables. */

/* rdi	destination
 * rsi	source
 * rdx	count
 *
 * Output:
 * eax	uncopied bytes or 0 if successful.
 *
 * Only 4GB of copy is supported. This shouldn't be a problem
 * because the kernel normally only writes from/to page sized chunks
 * even if user space passed a longer buffer.
 * Copying more would also be dangerous because both Intel and AMD
 * have errata with rep movsq > 4GB. Anyone who feels the need to fix
 * this should keep those errata in mind.
 */
copy_user_generic_c:
	movl %edx,%ecx
	shrl $3,%ecx
	andl $7,%edx
1:	rep
	movsq
	movl %edx,%ecx
2:	rep
	movsb
4:	movl %ecx,%eax
	ret
3:	lea (%rdx,%rcx,8),%rax
	ret
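	/* Exception targets, wired up in the __ex_table below: 3: is
	   reached on a fault during rep movsq and returns the remaining
	   bytes rdx + rcx*8; 4: is reached on a fault during rep movsb
	   and returns the remaining rcx bytes. 4: also doubles as the
	   normal exit, since rcx is 0 after a successful rep movsb. */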

	.section __ex_table,"a"
	.quad 1b,3b
	.quad 2b,4b
	.previous