arch/x86/lib/memcpy_64.S
/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>

#include <asm/cpufeature.h>
#include <asm/dwarf2.h>

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *	rdi destination
 *	rsi source
 *	rdx count
 *
 * Output:
 *	rax original destination
 */

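/*
 * Illustrative only: under the standard x86-64 calling convention a
 * caller sets up the three argument registers and calls in like this
 * (dst, src and len are placeholder symbols, not part of this file):
 *
 *	leaq	dst(%rip), %rdi		# destination
 *	leaq	src(%rip), %rsi		# source
 *	movq	len(%rip), %rdx		# byte count
 *	call	memcpy			# returns the original dst in %rax
 */
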
/*
 * memcpy_c() - fast string ops (REP MOVSQ) based variant.
 *
 * Calls to this get patched into the kernel image via the
 * alternative instructions framework:
 */
	ALIGN
memcpy_c:
	CFI_STARTPROC
	movq %rdi, %rax

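	/*
	 * Copy count/8 quadwords with REP MOVSQ, then the remaining
	 * count%8 bytes with REP MOVSB; %rax already holds the original
	 * destination, which is the return value.
	 */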
	movl %edx, %ecx
	shrl $3, %ecx
	andl $7, %edx
	rep movsq
	movl %edx, %ecx
	rep movsb
	ret
	CFI_ENDPROC
ENDPROC(memcpy_c)

ENTRY(__memcpy)
ENTRY(memcpy)
	CFI_STARTPROC

	/*
	 * Put the number of full 64-byte blocks into %ecx.
	 * Tail portion is handled at the end:
	 */
	movq %rdi, %rax
	movl %edx, %ecx
	shrl $6, %ecx
	jz .Lhandle_tail

	.p2align 4
.Lloop_64:
	/*
	 * We decrement the loop index here - and the zero-flag is
	 * checked at the end of the loop (instructions in between do
	 * not change the zero flag):
	 */
	decl %ecx

	/*
	 * Move in blocks of 4x16 bytes:
	 */
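	/*
	 * Each 16-byte group is loaded into two scratch registers before
	 * being stored, alternating between the %r11/%r8 and %r9/%r10
	 * pairs so back-to-back groups do not reuse the same registers:
	 */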
	movq 0*8(%rsi), %r11
	movq 1*8(%rsi), %r8
	movq %r11, 0*8(%rdi)
	movq %r8, 1*8(%rdi)

	movq 2*8(%rsi), %r9
	movq 3*8(%rsi), %r10
	movq %r9, 2*8(%rdi)
	movq %r10, 3*8(%rdi)

	movq 4*8(%rsi), %r11
	movq 5*8(%rsi), %r8
	movq %r11, 4*8(%rdi)
	movq %r8, 5*8(%rdi)

	movq 6*8(%rsi), %r9
	movq 7*8(%rsi), %r10
	movq %r9, 6*8(%rdi)
	movq %r10, 7*8(%rdi)

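	/*
	 * Advance both pointers with LEA, which does not modify the
	 * flags, so the zero flag set by the DECL above is still valid
	 * for the JNZ below:
	 */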
	leaq 64(%rsi), %rsi
	leaq 64(%rdi), %rdi

	jnz .Lloop_64

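/*
 * At most 63 bytes are left.  Copy up to seven qwords first, then the
 * final count%8 bytes one byte at a time:
 */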
.Lhandle_tail:
	movl %edx, %ecx
	andl $63, %ecx
	shrl $3, %ecx
	jz .Lhandle_7

	.p2align 4
.Lloop_8:
	decl %ecx
	movq (%rsi), %r8
	movq %r8, (%rdi)
	leaq 8(%rdi), %rdi
	leaq 8(%rsi), %rsi
	jnz .Lloop_8

.Lhandle_7:
	movl %edx, %ecx
	andl $7, %ecx
	jz .Lend

	.p2align 4
.Lloop_1:
	movb (%rsi), %r8b
	movb %r8b, (%rdi)
	incq %rdi
	incq %rsi
	decl %ecx
	jnz .Lloop_1

.Lend:
	ret
	CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)

/*
 * Some CPUs run faster using the string copy instructions.
 * It is also a lot simpler. Use this when possible:
 */

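/*
 * The replacement is a two-byte short JMP to memcpy_c.  Its 8-bit
 * displacement is relative to the end of the JMP itself, which is why
 * the size of the replacement (2f - 1b) is subtracted from the
 * distance between memcpy and memcpy_c:
 */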
	.section .altinstr_replacement, "ax"
1:	.byte 0xeb				/* jmp <disp8> */
	.byte (memcpy_c - memcpy) - (2f - 1b)	/* offset */
2:
	.previous

	.section .altinstructions, "a"
	.align 8
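	/*
	 * One alternatives entry (struct alt_instr in
	 * <asm/alternative.h>): original address, replacement address,
	 * required CPU feature bit, length of the original code to
	 * patch, and length of the replacement:
	 */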
	.quad memcpy
	.quad 1b
	.byte X86_FEATURE_REP_GOOD

	/*
	 * Replace only the beginning; memcpy is used to apply the
	 * alternatives, so it would be silly to overwrite itself with
	 * nops - a reboot would be the only outcome...
	 */
	.byte 2b - 1b
	.byte 2b - 1b
	.previous