/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>
#include <asm/export.h>

/*
 * We build a jump to memcpy_orig by default, which gets NOPped out on
 * the majority of x86 CPUs, which set REP_GOOD. In addition, on CPUs
 * which have the enhanced REP MOVSB/STOSB feature (ERMS), those NOPs
 * are changed to a jmp to memcpy_erms, which does the REP; MOVSB mem copy.
 */

.weak memcpy

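/*
 * At boot, apply_alternatives() rewrites the ALTERNATIVE_2 site below
 * in place, so selecting a variant costs at most one direct jmp at
 * runtime. Roughly equivalent C dispatch, for illustration only
 * (cpu_has() and memcpy_rep_good() are illustrative names here, not
 * real symbols):
 *
 *	if (cpu_has(X86_FEATURE_ERMS))
 *		return memcpy_erms(dst, src, len);
 *	if (cpu_has(X86_FEATURE_REP_GOOD))
 *		return memcpy_rep_good(dst, src, len);	// the inline body below
 *	return memcpy_orig(dst, src, len);
 */
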
/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 *  rax original destination
 */
ENTRY(__memcpy)
ENTRY(memcpy)
	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memcpy_erms", X86_FEATURE_ERMS

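	/*
	 * REP_GOOD fast path (the "" alternative above): copy len / 8
	 * qwords with REP MOVSQ, then the remaining len % 8 bytes with
	 * REP MOVSB.
	 */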
	movq %rdi, %rax
	movq %rdx, %rcx
	shrq $3, %rcx
	andl $7, %edx
	rep movsq
	movl %edx, %ecx
	rep movsb
	ret
ENDPROC(memcpy)
ENDPROC(__memcpy)
EXPORT_SYMBOL(memcpy)
EXPORT_SYMBOL(__memcpy)

/*
 * memcpy_erms() - enhanced fast string memcpy. This is faster and
 * simpler than memcpy. Use memcpy_erms when possible.
 */
ENTRY(memcpy_erms)
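	/*
	 * With ERMS, the microcode handles alignment and short lengths
	 * internally, so a single REP MOVSB over the whole byte count
	 * is all that is needed.
	 */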
	movq %rdi, %rax
	movq %rdx, %rcx
	rep movsb
	ret
ENDPROC(memcpy_erms)

ENTRY(memcpy_orig)
	movq %rdi, %rax

	cmpq $0x20, %rdx
	jb   .Lhandle_tail

	/*
	 * Check whether a memory false dependence could occur: if the
	 * low byte of the source address is below that of the
	 * destination, copy backward so that this copy's loads are less
	 * likely to falsely alias its own earlier stores on their
	 * partial address bits.
	 */
	cmp  %dil, %sil
	jl   .Lcopy_backward
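	/*
	 * Bias the count down by 0x20 before entering the loop so that
	 * the subq inside it both decrements the counter and sets CF
	 * for the jae test in a single ALU op; the bias is added back
	 * after the loop exits.
	 */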
	subq $0x20, %rdx
.Lcopy_forward_loop:
	subq $0x20, %rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq 2*8(%rsi), %r10
	movq 3*8(%rsi), %r11
	leaq 4*8(%rsi), %rsi

	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	jae  .Lcopy_forward_loop
	addl $0x20, %edx
	jmp  .Lhandle_tail

.Lcopy_backward:
	/*
	 * Calculate the copy position at the tail.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
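	/*
	 * rsi/rdi now point one past the end of the buffers; the loop
	 * below walks them downward, 32 bytes per iteration, using the
	 * same biased-counter trick as the forward loop.
	 */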
	/*
	 * At most 3 ALU operations can issue in one cycle, so pad with
	 * NOPs to keep the loop head inside the same 16-byte chunk.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r8
	movq -2*8(%rsi), %r9
	movq -3*8(%rsi), %r10
	movq -4*8(%rsi), %r11
	leaq -4*8(%rsi), %rsi
	movq %r8, -1*8(%rdi)
	movq %r9, -2*8(%rdi)
	movq %r10, -3*8(%rdi)
	movq %r11, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae  .Lcopy_backward_loop

	/*
	 * Calculate the copy position back at the head.
	 */
	addl $0x20, %edx
	subq %rdx, %rsi
	subq %rdx, %rdi
.Lhandle_tail:
	cmpl $16, %edx
	jb   .Lless_16bytes

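	/*
	 * The tail cases below load the head and the tail of the
	 * remaining region first, then store both; for in-between sizes
	 * the two windows overlap and the middle bytes simply get
	 * written by both stores. This covers each size class without a
	 * loop.
	 */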
	/*
	 * Move 16 to 31 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq -2*8(%rsi, %rdx), %r10
	movq -1*8(%rsi, %rdx), %r11
	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, -2*8(%rdi, %rdx)
	movq %r11, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_16bytes:
	cmpl $8, %edx
	jb   .Lless_8bytes
	/*
	 * Move 8 to 15 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq -1*8(%rsi, %rdx), %r9
	movq %r8, 0*8(%rdi)
	movq %r9, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_8bytes:
	cmpl $4, %edx
	jb   .Lless_3bytes

	/*
	 * Move 4 to 7 bytes.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	retq
	.p2align 4
.Lless_3bytes:
	subl $1, %edx
	jb   .Lend
	/*
	 * Move 1 to 3 bytes; the subl above set ZF if the count was
	 * exactly one byte (and CF, handled by the jb, if it was zero).
	 */
	movzbl (%rsi), %ecx
	jz   .Lstore_1byte
	movzbq 1(%rsi), %r8
	movzbq (%rsi, %rdx), %r9
	movb %r8b, 1(%rdi)
	movb %r9b, (%rdi, %rdx)
.Lstore_1byte:
	movb %cl, (%rdi)

.Lend:
	retq
ENDPROC(memcpy_orig)

#ifndef CONFIG_UML
/*
 * memcpy_mcsafe_unrolled - memory copy with machine check exception
 * handling.
 * Note that we only catch machine checks when reading the source
 * addresses. Writes to the target are posted and don't generate
 * machine checks.
 */
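/*
 * Each load below carries a local label so that a machine check taken
 * while reading the source is mapped, via the _ASM_EXTABLE_FAULT()
 * entries at the bottom of this file, to the .L_memcpy_mcsafe_fail
 * fixup instead of being fatal. A caller therefore sees 0 on success
 * and -EFAULT if the source was poisoned; the destination may then be
 * only partially written.
 */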
ENTRY(memcpy_mcsafe_unrolled)
	cmpl $8, %edx
	/* Less than 8 bytes? Go to byte copy loop */
	jb .L_no_whole_words

	/* Check for bad alignment of source */
	testl $7, %esi
	/* Already aligned */
	jz .L_8byte_aligned

	/* Copy one byte at a time until source is 8-byte aligned */
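	/* %ecx = 8 - (src & 7): the byte count needed to align src */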
	movl %esi, %ecx
	andl $7, %ecx
	subl $8, %ecx
	negl %ecx
	subl %ecx, %edx
.L_copy_leading_bytes:
	movb (%rsi), %al
	movb %al, (%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz .L_copy_leading_bytes

.L_8byte_aligned:
	/* Figure out how many whole cache lines (64 bytes) to copy */
	movl %edx, %ecx
	andl $63, %edx
	shrl $6, %ecx
	jz .L_no_whole_cache_lines

	/* Loop copying whole cache lines */
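	/*
	 * The .L_cache_w* labels sit on the loads, not the stores: per
	 * the header comment, only reads of the source can take a
	 * recoverable machine check here, so only they need exception
	 * table entries.
	 */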
.L_cache_w0:	movq (%rsi), %r8
.L_cache_w1:	movq 1*8(%rsi), %r9
.L_cache_w2:	movq 2*8(%rsi), %r10
.L_cache_w3:	movq 3*8(%rsi), %r11
	movq %r8, (%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
.L_cache_w4:	movq 4*8(%rsi), %r8
.L_cache_w5:	movq 5*8(%rsi), %r9
.L_cache_w6:	movq 6*8(%rsi), %r10
.L_cache_w7:	movq 7*8(%rsi), %r11
	movq %r8, 4*8(%rdi)
	movq %r9, 5*8(%rdi)
	movq %r10, 6*8(%rdi)
	movq %r11, 7*8(%rdi)
	leaq 64(%rsi), %rsi
	leaq 64(%rdi), %rdi
	decl %ecx
	jnz .L_cache_w0

	/* Are there any trailing 8-byte words? */
.L_no_whole_cache_lines:
	movl %edx, %ecx
	andl $7, %edx
	shrl $3, %ecx
	jz .L_no_whole_words

	/* Copy trailing words */
.L_copy_trailing_words:
	movq (%rsi), %r8
	movq %r8, (%rdi)
	leaq 8(%rsi), %rsi
	leaq 8(%rdi), %rdi
	decl %ecx
	jnz .L_copy_trailing_words

	/* Any trailing bytes? */
.L_no_whole_words:
	andl %edx, %edx
	jz .L_done_memcpy_trap

	/* Copy trailing bytes */
	movl %edx, %ecx
.L_copy_trailing_bytes:
	movb (%rsi), %al
	movb %al, (%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz .L_copy_trailing_bytes

	/* Copy successful. Return zero */
.L_done_memcpy_trap:
	xorq %rax, %rax
	ret
ENDPROC(memcpy_mcsafe_unrolled)
EXPORT_SYMBOL_GPL(memcpy_mcsafe_unrolled)

	.section .fixup, "ax"
	/* Return -EFAULT for any failure */
.L_memcpy_mcsafe_fail:
	mov $-EFAULT, %rax
	ret

	.previous

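/*
 * Tie each labeled load above to the fixup: if a recoverable machine
 * check (reported through the exception table) hits one of those
 * instructions, execution resumes at .L_memcpy_mcsafe_fail and the
 * function returns -EFAULT.
 */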
	_ASM_EXTABLE_FAULT(.L_copy_leading_bytes, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w0, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w1, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w2, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w3, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w4, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w5, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w6, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_cache_w7, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_copy_trailing_words, .L_memcpy_mcsafe_fail)
	_ASM_EXTABLE_FAULT(.L_copy_trailing_bytes, .L_memcpy_mcsafe_fail)
#endif