// SPDX-License-Identifier: GPL-2.0
/*
 *	MMX 3DNow! library helper functions
 *
 *	To do:
 *	 We can use MMX just for prefetch in IRQ's. This may be a win.
 *		(reported so on K6-III)
 *	 We should use a better code neutral filler for the short jump
 *		leal ebx. [ebx] is apparently best for K6-2, but Cyrix ??
 *	 We also want to clobber the filler register so we don't get any
 *		register forwarding stalls on the filler.
 *
 *	Add *user handling. Checksums are not a win with MMX on any CPU
 *	tested so far for any MMX solution figured.
 *
 *	22/09/2000 - Arjan van de Ven
 *		Improved for non-engineering-sample Athlons
 *
 */
#include <linux/hardirq.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/types.h>

#include <asm/fpu/api.h>
#include <asm/asm.h>

/*
 * Use KFPU_387. MMX instructions are not affected by MXCSR,
 * but both AMD and Intel documentation states that even integer MMX
 * operations will result in #MF if an exception is pending in FCW.
 *
 * EMMS is not needed afterwards because, after calling kernel_fpu_end(),
 * any subsequent user of the 387 stack will reinitialize it using
 * KFPU_387.
 */

void *_mmx_memcpy(void *to, const void *from, size_t len)
{
	void *p;
	int i;

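	/*
	 * MMX needs the FPU, which we do not take over from interrupt
	 * context; fall back to the plain __memcpy() there.
	 */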
	if (unlikely(in_interrupt()))
		return __memcpy(to, from, len);

	p = to;
	i = len >> 6; /* len/64 */

	kernel_fpu_begin_mask(KFPU_387);

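	/*
	 * Prefetch the first 320 bytes of the source.  3DNow! prefetch is
	 * not supposed to fault, but on CPUs where it can, the exception
	 * fixup below patches the first prefetch into a short jump
	 * (0xEB 0x1A, written as the word 0x1AEB) that skips the remaining
	 * 26 bytes of prefetches.  The copy loops below use the same
	 * self-patching trick with a 5-byte jump over their single prefetch.
	 */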
	__asm__ __volatile__ (
		"1: prefetch (%0)\n"		/* This set is 28 bytes */
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2: \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"		/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b)
			: : "r" (from));

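	/*
	 * Main loop: move 64 bytes per iteration through the MMX registers,
	 * prefetching 320 bytes (five blocks) ahead of the reads.
	 */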
	for ( ; i > 5; i--) {
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movq 8(%0), %%mm1\n"
		"   movq 16(%0), %%mm2\n"
		"   movq 24(%0), %%mm3\n"
		"   movq %%mm0, (%1)\n"
		"   movq %%mm1, 8(%1)\n"
		"   movq %%mm2, 16(%1)\n"
		"   movq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm0\n"
		"   movq 40(%0), %%mm1\n"
		"   movq 48(%0), %%mm2\n"
		"   movq 56(%0), %%mm3\n"
		"   movq %%mm0, 32(%1)\n"
		"   movq %%mm1, 40(%1)\n"
		"   movq %%mm2, 48(%1)\n"
		"   movq %%mm3, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"		/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b)
			: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}

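	/*
	 * Copy the remaining 64-byte blocks without the prefetch, which by
	 * now would reach past the end of the source buffer.
	 */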
	for ( ; i > 0; i--) {
		__asm__ __volatile__ (
		"   movq (%0), %%mm0\n"
		"   movq 8(%0), %%mm1\n"
		"   movq 16(%0), %%mm2\n"
		"   movq 24(%0), %%mm3\n"
		"   movq %%mm0, (%1)\n"
		"   movq %%mm1, 8(%1)\n"
		"   movq %%mm2, 16(%1)\n"
		"   movq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm0\n"
		"   movq 40(%0), %%mm1\n"
		"   movq 48(%0), %%mm2\n"
		"   movq 56(%0), %%mm3\n"
		"   movq %%mm0, 32(%1)\n"
		"   movq %%mm1, 40(%1)\n"
		"   movq %%mm2, 48(%1)\n"
		"   movq %%mm3, 56(%1)\n"
			: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}
	/*
	 * Now do the tail of the block:
	 */
	__memcpy(to, from, len & 63);
	kernel_fpu_end();

	return p;
}
EXPORT_SYMBOL(_mmx_memcpy);

#ifdef CONFIG_MK7

/*
 * The K7 has streaming cache bypass load/store. The Cyrix III, K6 and
 * other MMX using processors do not.
 */

static void fast_clear_page(void *page)
{
	int i;

	kernel_fpu_begin_mask(KFPU_387);

	__asm__ __volatile__ (
		"   pxor %%mm0, %%mm0\n" : :
	);

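	/*
	 * movntq is a non-temporal store: it goes straight to memory without
	 * allocating cache lines, so clearing a page does not evict useful
	 * data from the cache.
	 */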
	for (i = 0; i < 4096/64; i++) {
		__asm__ __volatile__ (
		"   movntq %%mm0, (%0)\n"
		"   movntq %%mm0, 8(%0)\n"
		"   movntq %%mm0, 16(%0)\n"
		"   movntq %%mm0, 24(%0)\n"
		"   movntq %%mm0, 32(%0)\n"
		"   movntq %%mm0, 40(%0)\n"
		"   movntq %%mm0, 48(%0)\n"
		"   movntq %%mm0, 56(%0)\n"
			: : "r" (page) : "memory");
		page += 64;
	}

	/*
	 * Since movntq is weakly-ordered, a "sfence" is needed to become
	 * ordered again:
	 */
	__asm__ __volatile__("sfence\n"::);

	kernel_fpu_end();
}

static void fast_copy_page(void *to, void *from)
{
	int i;

	kernel_fpu_begin_mask(KFPU_387);

	/*
	 * maybe the prefetch stuff can go before the expensive fnsave...
	 * but that is for later. -AV
	 */
	__asm__ __volatile__ (
		"1: prefetch (%0)\n"
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2: \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"		/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b) : : "r" (from));

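	/*
	 * Copy everything except the last 320 bytes while prefetching 320
	 * bytes ahead.  The remaining five 64-byte blocks are handled by the
	 * second loop below without prefetch, so we never prefetch past the
	 * end of the source page.
	 */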
	for (i = 0; i < (4096-320)/64; i++) {
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movntq %%mm0, (%1)\n"
		"   movq 8(%0), %%mm1\n"
		"   movntq %%mm1, 8(%1)\n"
		"   movq 16(%0), %%mm2\n"
		"   movntq %%mm2, 16(%1)\n"
		"   movq 24(%0), %%mm3\n"
		"   movntq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm4\n"
		"   movntq %%mm4, 32(%1)\n"
		"   movq 40(%0), %%mm5\n"
		"   movntq %%mm5, 40(%1)\n"
		"   movq 48(%0), %%mm6\n"
		"   movntq %%mm6, 48(%1)\n"
		"   movq 56(%0), %%mm7\n"
		"   movntq %%mm7, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"		/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b) : : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}

	for (i = (4096-320)/64; i < 4096/64; i++) {
		__asm__ __volatile__ (
		"2: movq (%0), %%mm0\n"
		"   movntq %%mm0, (%1)\n"
		"   movq 8(%0), %%mm1\n"
		"   movntq %%mm1, 8(%1)\n"
		"   movq 16(%0), %%mm2\n"
		"   movntq %%mm2, 16(%1)\n"
		"   movq 24(%0), %%mm3\n"
		"   movntq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm4\n"
		"   movntq %%mm4, 32(%1)\n"
		"   movq 40(%0), %%mm5\n"
		"   movntq %%mm5, 40(%1)\n"
		"   movq 48(%0), %%mm6\n"
		"   movntq %%mm6, 48(%1)\n"
		"   movq 56(%0), %%mm7\n"
		"   movntq %%mm7, 56(%1)\n"
			: : "r" (from), "r" (to) : "memory");
		from += 64;
		to += 64;
	}
	/*
	 * Since movntq is weakly-ordered, a "sfence" is needed to become
	 * ordered again:
	 */
	__asm__ __volatile__("sfence \n"::);
	kernel_fpu_end();
}

#else /* CONFIG_MK7 */

/*
 * Generic MMX implementation without K7 specific streaming
 */
static void fast_clear_page(void *page)
{
	int i;

	kernel_fpu_begin_mask(KFPU_387);

	__asm__ __volatile__ (
		"   pxor %%mm0, %%mm0\n" : :
	);

	for (i = 0; i < 4096/128; i++) {
		__asm__ __volatile__ (
		"   movq %%mm0, (%0)\n"
		"   movq %%mm0, 8(%0)\n"
		"   movq %%mm0, 16(%0)\n"
		"   movq %%mm0, 24(%0)\n"
		"   movq %%mm0, 32(%0)\n"
		"   movq %%mm0, 40(%0)\n"
		"   movq %%mm0, 48(%0)\n"
		"   movq %%mm0, 56(%0)\n"
		"   movq %%mm0, 64(%0)\n"
		"   movq %%mm0, 72(%0)\n"
		"   movq %%mm0, 80(%0)\n"
		"   movq %%mm0, 88(%0)\n"
		"   movq %%mm0, 96(%0)\n"
		"   movq %%mm0, 104(%0)\n"
		"   movq %%mm0, 112(%0)\n"
		"   movq %%mm0, 120(%0)\n"
			: : "r" (page) : "memory");
		page += 128;
	}

	kernel_fpu_end();
}

static void fast_copy_page(void *to, void *from)
{
	int i;

	kernel_fpu_begin_mask(KFPU_387);

	__asm__ __volatile__ (
		"1: prefetch (%0)\n"
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2: \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"		/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b) : : "r" (from));

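	/*
	 * Copy the whole page 64 bytes at a time, prefetching 320 bytes
	 * ahead.  Near the end of the page the prefetch reaches past the
	 * source; if it faults there, the fixup patches it into a short
	 * jump just as in the initial prefetch block above.
	 */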
	for (i = 0; i < 4096/64; i++) {
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movq 8(%0), %%mm1\n"
		"   movq 16(%0), %%mm2\n"
		"   movq 24(%0), %%mm3\n"
		"   movq %%mm0, (%1)\n"
		"   movq %%mm1, 8(%1)\n"
		"   movq %%mm2, 16(%1)\n"
		"   movq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm0\n"
		"   movq 40(%0), %%mm1\n"
		"   movq 48(%0), %%mm2\n"
		"   movq 56(%0), %%mm3\n"
		"   movq %%mm0, 32(%1)\n"
		"   movq %%mm1, 40(%1)\n"
		"   movq %%mm2, 48(%1)\n"
		"   movq %%mm3, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"		/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b)
			: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}
	kernel_fpu_end();
}

#endif /* !CONFIG_MK7 */

/*
 * Favour MMX for page clear and copy:
 */
static void slow_zero_page(void *page)
{
	int d0, d1;

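	/* rep stosl with ECX = 1024 and EAX = 0 writes 4096 zero bytes. */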
	__asm__ __volatile__(
		"cld\n\t"
		"rep ; stosl"

		: "=&c" (d0), "=&D" (d1)
		: "a" (0), "1" (page), "0" (1024)
		: "memory");
}

void mmx_clear_page(void *page)
{
	if (unlikely(in_interrupt()))
		slow_zero_page(page);
	else
		fast_clear_page(page);
}
EXPORT_SYMBOL(mmx_clear_page);

static void slow_copy_page(void *to, void *from)
{
	int d0, d1, d2;

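	/* rep movsl copies ECX = 1024 dwords, i.e. one 4096-byte page. */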
	__asm__ __volatile__(
		"cld\n\t"
		"rep ; movsl"
		: "=&c" (d0), "=&D" (d1), "=&S" (d2)
		: "0" (1024), "1" ((long) to), "2" ((long) from)
		: "memory");
}

void mmx_copy_page(void *to, void *from)
{
	if (unlikely(in_interrupt()))
		slow_copy_page(to, from);
	else
		fast_copy_page(to, from);
}
EXPORT_SYMBOL(mmx_copy_page);