[mirror_ubuntu-disco-kernel.git] / arch / x86 / lib / memmove_64.S

/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Normally compiler builtins are used, but sometimes the compiler calls out
 * of line code. Based on asm-i386/string.h.
 *
 * This assembly file is re-written from memmove_64.c file.
 *	- Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
 */
#include <linux/linkage.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>
#include <asm/export.h>

#undef memmove

/*
 * memmove() / __memmove(): move rdx bytes from rsi to rdi. The source and
 * destination regions are allowed to overlap.
 *
 * Input:
 *   rdi: dest
 *   rsi: src
 *   rdx: count
 *
 * Output:
 *   rax: dest
 *
 * Registers that may be written besides rax: rcx, rdx, rsi, rdi, r8-r11
 * and the flags. The backward movsq path sets DF and clears it again
 * before it finishes.
 */
#define MEMMOVE_CHUNK		0x20	/* bytes moved per unrolled loop pass */
#define MEMMOVE_MOVSQ_MIN	680	/* smaller copies never use movsq */

.weak memmove

ENTRY(memmove)
ENTRY(__memmove)

	/* rax holds the return value (dest) on every exit path. */
	movq	%rdi, %rax

	/* Less than one chunk: go straight to the sized tail copies. */
	cmpq	$MEMMOVE_CHUNK, %rdx
	jb	.Lmemmove_tail

	/*
	 * Choose the direction (signed compares). Copy forward when
	 * src >= dest or when src + count <= dest; otherwise the regions
	 * overlap with dest above src and the copy must run backward.
	 */
	cmpq	%rdi, %rsi
	jge	.Lmemmove_begin_forward
	movq	%rsi, %r8
	addq	%rdx, %r8
	cmpq	%rdi, %r8
	jg	.Lmemmove_backward

.Lmemmove_begin_forward:
	/* With X86_FEATURE_ERMS the forward copy is one rep movsb. */
	ALTERNATIVE "", "movq %rdx, %rcx; rep movsb; retq", X86_FEATURE_ERMS

	/*
	 * movsq has a high startup latency, so copies shorter than
	 * MEMMOVE_MOVSQ_MIN go through general registers instead.
	 */
	cmpq	$MEMMOVE_MOVSQ_MIN, %rdx
	jb	.Lmemmove_fwd_regs

	/*
	 * movsq is only worthwhile in the aligned case: it is taken only
	 * when the low address bytes of src and dest are equal.
	 */
	cmpb	%dil, %sil
	je	.Lmemmove_fwd_movsq

.Lmemmove_fwd_regs:
	/*
	 * Bias count by one chunk, so the subtraction inside the loop
	 * borrows once less than a full chunk remains after the current
	 * one.
	 */
	subq	$MEMMOVE_CHUNK, %rdx

	/* Forward loop: 32 bytes per pass, lowest address first. */
.Lmemmove_fwd_loop:
	subq	$MEMMOVE_CHUNK, %rdx
	movq	0*8(%rsi), %r11
	movq	1*8(%rsi), %r10
	movq	2*8(%rsi), %r9
	movq	3*8(%rsi), %r8
	leaq	4*8(%rsi), %rsi

	movq	%r11, 0*8(%rdi)
	movq	%r10, 1*8(%rdi)
	movq	%r9, 2*8(%rdi)
	movq	%r8, 3*8(%rdi)
	leaq	4*8(%rdi), %rdi
	/* movq/leaq leave the flags untouched: this tests the subq above. */
	jae	.Lmemmove_fwd_loop

	/* Remove the bias again: rdx = bytes still to be moved. */
	addq	$MEMMOVE_CHUNK, %rdx
	jmp	.Lmemmove_tail

	/*
	 * Forward copy with movsq. The last 8 source bytes are kept in r11
	 * and the address of the last 8 destination bytes in r10; after
	 * count / 8 quadwords have been moved, r11 is stored there to
	 * complete the end of the buffer.
	 */
	.p2align 4
.Lmemmove_fwd_movsq:
	movq	%rdx, %rcx
	movq	-8(%rsi, %rdx), %r11
	leaq	-8(%rdi, %rdx), %r10
	shrq	$3, %rcx
	rep movsq
	movq	%r11, (%r10)
	jmp	.Lmemmove_done
.Lmemmove_end_forward:

	/*
	 * Backward copy with movsq. The first 8 source bytes are kept in
	 * r11 and the destination start in r10. rsi/rdi are moved to the
	 * last quadword of each buffer and count / 8 quadwords are copied
	 * downwards with DF set; finally r11 is stored at the start.
	 */
	.p2align 4
.Lmemmove_bwd_movsq:
	movq	%rdx, %rcx
	movq	(%rsi), %r11
	movq	%rdi, %r10
	leaq	-8(%rsi, %rdx), %rsi
	leaq	-8(%rdi, %rdx), %rdi
	shrq	$3, %rcx
	std
	rep movsq
	cld
	movq	%r11, (%r10)
	jmp	.Lmemmove_done

	/*
	 * Backward copy entry: same size and low-address-byte test as on
	 * the forward side decides between movsq and the register loop.
	 */
	.p2align 4
.Lmemmove_backward:
	cmpq	$MEMMOVE_MOVSQ_MIN, %rdx
	jb	.Lmemmove_bwd_regs
	cmpb	%dil, %sil
	je	.Lmemmove_bwd_movsq

.Lmemmove_bwd_regs:
	/*
	 * Point rsi/rdi just past the end of their buffers and bias count
	 * by one chunk, as in the forward loop.
	 */
	addq	%rdx, %rsi
	addq	%rdx, %rdi
	subq	$MEMMOVE_CHUNK, %rdx

	/* Backward loop: 32 bytes per pass, highest address first. */
.Lmemmove_bwd_loop:
	subq	$MEMMOVE_CHUNK, %rdx
	movq	-1*8(%rsi), %r11
	movq	-2*8(%rsi), %r10
	movq	-3*8(%rsi), %r9
	movq	-4*8(%rsi), %r8
	leaq	-4*8(%rsi), %rsi

	movq	%r11, -1*8(%rdi)
	movq	%r10, -2*8(%rdi)
	movq	%r9, -3*8(%rdi)
	movq	%r8, -4*8(%rdi)
	leaq	-4*8(%rdi), %rdi
	/* Flags still come from the subq at the top of the loop. */
	jae	.Lmemmove_bwd_loop

	/*
	 * Remove the bias (rdx = bytes left at the head) and step rsi/rdi
	 * back by that amount so they point at the start of the buffers.
	 */
	addq	$MEMMOVE_CHUNK, %rdx
	subq	%rdx, %rsi
	subq	%rdx, %rdi

	/*
	 * Tail: rdx is below 32 here. Each size class loads both ends of
	 * the remaining range (the two ends may overlap each other) before
	 * anything is stored.
	 */
.Lmemmove_tail:
	cmpq	$16, %rdx
	jb	.Lmemmove_lt16

	/* 16 to 31 bytes: first 16 plus last 16. */
	movq	0*8(%rsi), %r11
	movq	1*8(%rsi), %r10
	movq	-2*8(%rsi, %rdx), %r9
	movq	-1*8(%rsi, %rdx), %r8
	movq	%r11, 0*8(%rdi)
	movq	%r10, 1*8(%rdi)
	movq	%r9, -2*8(%rdi, %rdx)
	movq	%r8, -1*8(%rdi, %rdx)
	jmp	.Lmemmove_done

	.p2align 4
.Lmemmove_lt16:
	cmpq	$8, %rdx
	jb	.Lmemmove_lt8

	/* 8 to 15 bytes: first 8 plus last 8. */
	movq	0*8(%rsi), %r11
	movq	-1*8(%rsi, %rdx), %r10
	movq	%r11, 0*8(%rdi)
	movq	%r10, -1*8(%rdi, %rdx)
	jmp	.Lmemmove_done

.Lmemmove_lt8:
	cmpq	$4, %rdx
	jb	.Lmemmove_lt4

	/* 4 to 7 bytes: first 4 plus last 4. */
	movl	(%rsi), %r11d
	movl	-4(%rsi, %rdx), %r10d
	movl	%r11d, (%rdi)
	movl	%r10d, -4(%rdi, %rdx)
	jmp	.Lmemmove_done

.Lmemmove_lt4:
	cmpq	$2, %rdx
	jb	.Lmemmove_lt2

	/* 2 to 3 bytes: first 2 plus last 2. */
	movw	(%rsi), %r11w
	movw	-2(%rsi, %rdx), %r10w
	movw	%r11w, (%rdi)
	movw	%r10w, -2(%rdi, %rdx)
	jmp	.Lmemmove_done

.Lmemmove_lt2:
	cmpq	$1, %rdx
	jb	.Lmemmove_done

	/* Exactly one byte. */
	movb	(%rsi), %r11b
	movb	%r11b, (%rdi)

.Lmemmove_done:
	retq
ENDPROC(__memmove)
ENDPROC(memmove)
EXPORT_SYMBOL(__memmove)
EXPORT_SYMBOL(memmove)
Commit	Line	Data
	1	/* SPDX-License-Identifier: GPL-2.0 */
	2	/*
	3	* Normally compiler builtins are used, but sometimes the compiler calls out
	4	* of line code. Based on asm-i386/string.h.
	5	*
	6	* This assembly file is re-written from memmove_64.c file.
	7	* - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
	8	*/
	9	#include <linux/linkage.h>
	10	#include <asm/cpufeatures.h>
	11	#include <asm/alternative-asm.h>
	12	#include <asm/export.h>
	13
	14	#undef memmove
	15
	16	/*
	17	* Implement memmove(). This can handle overlap between src and dst.
	18	*
	19	* Input:
	20	* rdi: dest
	21	* rsi: src
	22	* rdx: count
	23	*
	24	* Output:
	25	* rax: dest
	26	*/
	27	.weak memmove
	28
	29	ENTRY(memmove)
	30	ENTRY(__memmove)
	31
	32	/* Handle more 32 bytes in loop */
	33	mov %rdi, %rax
	34	cmp $0x20, %rdx
	35	jb 1f
	36
	37	/* Decide forward/backward copy mode */
	38	cmp %rdi, %rsi
	39	jge .Lmemmove_begin_forward
	40	mov %rsi, %r8
	41	add %rdx, %r8
	42	cmp %rdi, %r8
	43	jg 2f
	44
	45	.Lmemmove_begin_forward:
	46	ALTERNATIVE "", "movq %rdx, %rcx; rep movsb; retq", X86_FEATURE_ERMS
	47
	48	/*
	49	* movsq instruction have many startup latency
	50	* so we handle small size by general register.
	51	*/
	52	cmp $680, %rdx
	53	jb 3f
	54	/*
	55	* movsq instruction is only good for aligned case.
	56	*/
	57
	58	cmpb %dil, %sil
	59	je 4f
	60	3:
	61	sub $0x20, %rdx
	62	/*
	63	* We gobble 32 bytes forward in each loop.
	64	*/
	65	5:
	66	sub $0x20, %rdx
	67	movq 0*8(%rsi), %r11
	68	movq 1*8(%rsi), %r10
	69	movq 2*8(%rsi), %r9
	70	movq 3*8(%rsi), %r8
	71	leaq 4*8(%rsi), %rsi
	72
	73	movq %r11, 0*8(%rdi)
	74	movq %r10, 1*8(%rdi)
	75	movq %r9, 2*8(%rdi)
	76	movq %r8, 3*8(%rdi)
	77	leaq 4*8(%rdi), %rdi
	78	jae 5b
	79	addq $0x20, %rdx
	80	jmp 1f
	81	/*
	82	* Handle data forward by movsq.
	83	*/
	84	.p2align 4
	85	4:
	86	movq %rdx, %rcx
	87	movq -8(%rsi, %rdx), %r11
	88	lea -8(%rdi, %rdx), %r10
	89	shrq $3, %rcx
	90	rep movsq
	91	movq %r11, (%r10)
	92	jmp 13f
	93	.Lmemmove_end_forward:
	94
	95	/*
	96	* Handle data backward by movsq.
	97	*/
	98	.p2align 4
	99	7:
	100	movq %rdx, %rcx
	101	movq (%rsi), %r11
	102	movq %rdi, %r10
	103	leaq -8(%rsi, %rdx), %rsi
	104	leaq -8(%rdi, %rdx), %rdi
	105	shrq $3, %rcx
	106	std
	107	rep movsq
	108	cld
	109	movq %r11, (%r10)
	110	jmp 13f
	111
	112	/*
	113	* Start to prepare for backward copy.
	114	*/
	115	.p2align 4
	116	2:
	117	cmp $680, %rdx
	118	jb 6f
	119	cmp %dil, %sil
	120	je 7b
	121	6:
	122	/*
	123	* Calculate copy position to tail.
	124	*/
	125	addq %rdx, %rsi
	126	addq %rdx, %rdi
	127	subq $0x20, %rdx
	128	/*
	129	* We gobble 32 bytes backward in each loop.
	130	*/
	131	8:
	132	subq $0x20, %rdx
	133	movq -1*8(%rsi), %r11
	134	movq -2*8(%rsi), %r10
	135	movq -3*8(%rsi), %r9
	136	movq -4*8(%rsi), %r8
	137	leaq -4*8(%rsi), %rsi
	138
	139	movq %r11, -1*8(%rdi)
	140	movq %r10, -2*8(%rdi)
	141	movq %r9, -3*8(%rdi)
	142	movq %r8, -4*8(%rdi)
	143	leaq -4*8(%rdi), %rdi
	144	jae 8b
	145	/*
	146	* Calculate copy position to head.
	147	*/
	148	addq $0x20, %rdx
	149	subq %rdx, %rsi
	150	subq %rdx, %rdi
	151	1:
	152	cmpq $16, %rdx
	153	jb 9f
	154	/*
	155	* Move data from 16 bytes to 31 bytes.
	156	*/
	157	movq 0*8(%rsi), %r11
	158	movq 1*8(%rsi), %r10
	159	movq -2*8(%rsi, %rdx), %r9
	160	movq -1*8(%rsi, %rdx), %r8
	161	movq %r11, 0*8(%rdi)
	162	movq %r10, 1*8(%rdi)
	163	movq %r9, -2*8(%rdi, %rdx)
	164	movq %r8, -1*8(%rdi, %rdx)
	165	jmp 13f
	166	.p2align 4
	167	9:
	168	cmpq $8, %rdx
	169	jb 10f
	170	/*
	171	* Move data from 8 bytes to 15 bytes.
	172	*/
	173	movq 0*8(%rsi), %r11
	174	movq -1*8(%rsi, %rdx), %r10
	175	movq %r11, 0*8(%rdi)
	176	movq %r10, -1*8(%rdi, %rdx)
	177	jmp 13f
	178	10:
	179	cmpq $4, %rdx
	180	jb 11f
	181	/*
	182	* Move data from 4 bytes to 7 bytes.
	183	*/
	184	movl (%rsi), %r11d
	185	movl -4(%rsi, %rdx), %r10d
	186	movl %r11d, (%rdi)
	187	movl %r10d, -4(%rdi, %rdx)
	188	jmp 13f
	189	11:
	190	cmp $2, %rdx
	191	jb 12f
	192	/*
	193	* Move data from 2 bytes to 3 bytes.
	194	*/
	195	movw (%rsi), %r11w
	196	movw -2(%rsi, %rdx), %r10w
	197	movw %r11w, (%rdi)
	198	movw %r10w, -2(%rdi, %rdx)
	199	jmp 13f
	200	12:
	201	cmp $1, %rdx
	202	jb 13f
	203	/*
	204	* Move data for 1 byte.
	205	*/
	206	movb (%rsi), %r11b
	207	movb %r11b, (%rdi)
	208	13:
	209	retq
	210	ENDPROC(__memmove)
	211	ENDPROC(memmove)
	212	EXPORT_SYMBOL(__memmove)
	213	EXPORT_SYMBOL(memmove)