// SPDX-License-Identifier: GPL-2.0
/*
 * MMX 3DNow! library helper functions
 *
 * To do:
 * We can use MMX just for prefetch in IRQs. This may be a win.
 *     (reported so on K6-III)
 * We should use a better code-neutral filler for the short jump
 *     leal ebx. [ebx] is apparently best for K6-2, but Cyrix ??
 * We also want to clobber the filler register so we don't get any
 *     register forwarding stalls on the filler.
 *
 * Add *user handling. Checksums are not a win with MMX on any CPU
 * tested so far for any MMX solution figured.
 *
 * 22/09/2000 - Arjan van de Ven
 *     Improved for non-engineering-sample Athlons
 *
 */
#include <linux/hardirq.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/types.h>

#include <asm/fpu/api.h>
#include <asm/asm.h>

/*
 * Use KFPU_387. MMX instructions are not affected by MXCSR,
 * but both AMD and Intel documentation states that even integer MMX
 * operations will result in #MF if an exception is pending in FCW.
 *
 * EMMS is not needed afterwards because, after calling kernel_fpu_end(),
 * any subsequent user of the 387 stack will reinitialize it using
 * KFPU_387.
 */

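/*
 * _mmx_memcpy - copy @len bytes from @from to @to, 64 bytes at a time,
 * through the %mm0-%mm3 registers; the remaining (@len % 64) tail bytes
 * are handled by __memcpy().  In interrupt context the FPU is not touched
 * at all and the whole copy is done by __memcpy() instead.
 */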
void *_mmx_memcpy(void *to, const void *from, size_t len)
{
        void *p;
        int i;

        if (unlikely(in_interrupt()))
                return __memcpy(to, from, len);

        p = to;
        i = len >> 6;   /* len/64 */

        kernel_fpu_begin_mask(KFPU_387);

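        /*
         * Warm the cache with 3DNow! prefetches.  A prefetch can fault on
         * CPUs that lack the instruction; the .fixup handlers below patch
         * the faulting site with a two-byte short jump (0x1AEB = "jmp +26",
         * 0x05EB = "jmp +5") that skips the remaining prefetch bytes, so
         * the copy itself still runs.
         */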
        __asm__ __volatile__ (
                "1: prefetch (%0)\n"            /* This set is 28 bytes */
                "   prefetch 64(%0)\n"
                "   prefetch 128(%0)\n"
                "   prefetch 192(%0)\n"
                "   prefetch 256(%0)\n"
                "2:  \n"
                ".section .fixup, \"ax\"\n"
                "3: movw $0x1AEB, 1b\n"         /* jmp on 26 bytes */
                "   jmp 2b\n"
                ".previous\n"
                        _ASM_EXTABLE(1b, 3b)
                        : : "r" (from));

        for ( ; i > 5; i--) {
                __asm__ __volatile__ (
                "1: prefetch 320(%0)\n"
                "2: movq (%0), %%mm0\n"
                "   movq 8(%0), %%mm1\n"
                "   movq 16(%0), %%mm2\n"
                "   movq 24(%0), %%mm3\n"
                "   movq %%mm0, (%1)\n"
                "   movq %%mm1, 8(%1)\n"
                "   movq %%mm2, 16(%1)\n"
                "   movq %%mm3, 24(%1)\n"
                "   movq 32(%0), %%mm0\n"
                "   movq 40(%0), %%mm1\n"
                "   movq 48(%0), %%mm2\n"
                "   movq 56(%0), %%mm3\n"
                "   movq %%mm0, 32(%1)\n"
                "   movq %%mm1, 40(%1)\n"
                "   movq %%mm2, 48(%1)\n"
                "   movq %%mm3, 56(%1)\n"
                ".section .fixup, \"ax\"\n"
                "3: movw $0x05EB, 1b\n"         /* jmp on 5 bytes */
                "   jmp 2b\n"
                ".previous\n"
                        _ASM_EXTABLE(1b, 3b)
                        : : "r" (from), "r" (to) : "memory");

                from += 64;
                to += 64;
        }

        for ( ; i > 0; i--) {
                __asm__ __volatile__ (
                "   movq (%0), %%mm0\n"
                "   movq 8(%0), %%mm1\n"
                "   movq 16(%0), %%mm2\n"
                "   movq 24(%0), %%mm3\n"
                "   movq %%mm0, (%1)\n"
                "   movq %%mm1, 8(%1)\n"
                "   movq %%mm2, 16(%1)\n"
                "   movq %%mm3, 24(%1)\n"
                "   movq 32(%0), %%mm0\n"
                "   movq 40(%0), %%mm1\n"
                "   movq 48(%0), %%mm2\n"
                "   movq 56(%0), %%mm3\n"
                "   movq %%mm0, 32(%1)\n"
                "   movq %%mm1, 40(%1)\n"
                "   movq %%mm2, 48(%1)\n"
                "   movq %%mm3, 56(%1)\n"
                        : : "r" (from), "r" (to) : "memory");

                from += 64;
                to += 64;
        }
        /*
         * Now do the tail of the block:
         */
        __memcpy(to, from, len & 63);
        kernel_fpu_end();

        return p;
}
EXPORT_SYMBOL(_mmx_memcpy);

#ifdef CONFIG_MK7

/*
 * The K7 has streaming cache bypass load/store. The Cyrix III, K6 and
 * other MMX-using processors do not.
 */

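/*
 * Clear one 4K page with non-temporal movntq stores.  The stores bypass
 * the caches, so zeroing a page does not evict useful cache lines; the
 * closing sfence orders the weakly-ordered stores before returning.
 */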
static void fast_clear_page(void *page)
{
        int i;

        kernel_fpu_begin_mask(KFPU_387);

        __asm__ __volatile__ (
                "  pxor %%mm0, %%mm0\n" : :
        );

        for (i = 0; i < 4096/64; i++) {
                __asm__ __volatile__ (
                "  movntq %%mm0, (%0)\n"
                "  movntq %%mm0, 8(%0)\n"
                "  movntq %%mm0, 16(%0)\n"
                "  movntq %%mm0, 24(%0)\n"
                "  movntq %%mm0, 32(%0)\n"
                "  movntq %%mm0, 40(%0)\n"
                "  movntq %%mm0, 48(%0)\n"
                "  movntq %%mm0, 56(%0)\n"
                : : "r" (page) : "memory");
                page += 64;
        }

        /*
         * Since movntq is weakly-ordered, a "sfence" is needed to become
         * ordered again:
         */
        __asm__ __volatile__("sfence\n"::);

        kernel_fpu_end();
}

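/*
 * Copy one 4K page with MMX loads and non-temporal movntq stores.  The
 * first asm block warms the cache with the same patchable prefetches used
 * in _mmx_memcpy(), the main loop copies 64 bytes per iteration while
 * prefetching 320 bytes ahead, and the last five iterations (the final
 * 320 bytes) run without prefetch so the code never prefetches past the
 * end of the source page.
 */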
static void fast_copy_page(void *to, void *from)
{
        int i;

        kernel_fpu_begin_mask(KFPU_387);

        /*
         * maybe the prefetch stuff can go before the expensive fnsave...
         * but that is for later. -AV
         */
        __asm__ __volatile__ (
                "1: prefetch (%0)\n"
                "   prefetch 64(%0)\n"
                "   prefetch 128(%0)\n"
                "   prefetch 192(%0)\n"
                "   prefetch 256(%0)\n"
                "2:  \n"
                ".section .fixup, \"ax\"\n"
                "3: movw $0x1AEB, 1b\n"         /* jmp on 26 bytes */
                "   jmp 2b\n"
                ".previous\n"
                        _ASM_EXTABLE(1b, 3b) : : "r" (from));

        for (i = 0; i < (4096-320)/64; i++) {
                __asm__ __volatile__ (
                "1: prefetch 320(%0)\n"
                "2: movq (%0), %%mm0\n"
                "   movntq %%mm0, (%1)\n"
                "   movq 8(%0), %%mm1\n"
                "   movntq %%mm1, 8(%1)\n"
                "   movq 16(%0), %%mm2\n"
                "   movntq %%mm2, 16(%1)\n"
                "   movq 24(%0), %%mm3\n"
                "   movntq %%mm3, 24(%1)\n"
                "   movq 32(%0), %%mm4\n"
                "   movntq %%mm4, 32(%1)\n"
                "   movq 40(%0), %%mm5\n"
                "   movntq %%mm5, 40(%1)\n"
                "   movq 48(%0), %%mm6\n"
                "   movntq %%mm6, 48(%1)\n"
                "   movq 56(%0), %%mm7\n"
                "   movntq %%mm7, 56(%1)\n"
                ".section .fixup, \"ax\"\n"
                "3: movw $0x05EB, 1b\n"         /* jmp on 5 bytes */
                "   jmp 2b\n"
                ".previous\n"
                        _ASM_EXTABLE(1b, 3b) : : "r" (from), "r" (to) : "memory");

                from += 64;
                to += 64;
        }

        for (i = (4096-320)/64; i < 4096/64; i++) {
                __asm__ __volatile__ (
                "2: movq (%0), %%mm0\n"
                "   movntq %%mm0, (%1)\n"
                "   movq 8(%0), %%mm1\n"
                "   movntq %%mm1, 8(%1)\n"
                "   movq 16(%0), %%mm2\n"
                "   movntq %%mm2, 16(%1)\n"
                "   movq 24(%0), %%mm3\n"
                "   movntq %%mm3, 24(%1)\n"
                "   movq 32(%0), %%mm4\n"
                "   movntq %%mm4, 32(%1)\n"
                "   movq 40(%0), %%mm5\n"
                "   movntq %%mm5, 40(%1)\n"
                "   movq 48(%0), %%mm6\n"
                "   movntq %%mm6, 48(%1)\n"
                "   movq 56(%0), %%mm7\n"
                "   movntq %%mm7, 56(%1)\n"
                        : : "r" (from), "r" (to) : "memory");
                from += 64;
                to += 64;
        }
        /*
         * Since movntq is weakly-ordered, a "sfence" is needed to become
         * ordered again:
         */
        __asm__ __volatile__("sfence \n"::);
        kernel_fpu_end();
}

#else /* CONFIG_MK7 */

/*
 * Generic MMX implementation without K7-specific streaming:
 */
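/*
 * Clear one 4K page with ordinary cacheable movq stores, 128 bytes per
 * loop iteration.  Unlike the CONFIG_MK7 movntq variant, no sfence is
 * needed because these are normal, strongly-ordered stores.
 */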
static void fast_clear_page(void *page)
{
        int i;

        kernel_fpu_begin_mask(KFPU_387);

        __asm__ __volatile__ (
                "  pxor %%mm0, %%mm0\n" : :
        );

        for (i = 0; i < 4096/128; i++) {
                __asm__ __volatile__ (
                "  movq %%mm0, (%0)\n"
                "  movq %%mm0, 8(%0)\n"
                "  movq %%mm0, 16(%0)\n"
                "  movq %%mm0, 24(%0)\n"
                "  movq %%mm0, 32(%0)\n"
                "  movq %%mm0, 40(%0)\n"
                "  movq %%mm0, 48(%0)\n"
                "  movq %%mm0, 56(%0)\n"
                "  movq %%mm0, 64(%0)\n"
                "  movq %%mm0, 72(%0)\n"
                "  movq %%mm0, 80(%0)\n"
                "  movq %%mm0, 88(%0)\n"
                "  movq %%mm0, 96(%0)\n"
                "  movq %%mm0, 104(%0)\n"
                "  movq %%mm0, 112(%0)\n"
                "  movq %%mm0, 120(%0)\n"
                : : "r" (page) : "memory");
                page += 128;
        }

        kernel_fpu_end();
}

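/*
 * Copy one 4K page 64 bytes at a time with cacheable movq loads/stores,
 * prefetching 320 bytes ahead; the same patchable-prefetch fixup as in
 * _mmx_memcpy() covers CPUs where the prefetch instruction faults.
 */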
static void fast_copy_page(void *to, void *from)
{
        int i;

        kernel_fpu_begin_mask(KFPU_387);

        __asm__ __volatile__ (
                "1: prefetch (%0)\n"
                "   prefetch 64(%0)\n"
                "   prefetch 128(%0)\n"
                "   prefetch 192(%0)\n"
                "   prefetch 256(%0)\n"
                "2:  \n"
                ".section .fixup, \"ax\"\n"
                "3: movw $0x1AEB, 1b\n"         /* jmp on 26 bytes */
                "   jmp 2b\n"
                ".previous\n"
                        _ASM_EXTABLE(1b, 3b) : : "r" (from));

        for (i = 0; i < 4096/64; i++) {
                __asm__ __volatile__ (
                "1: prefetch 320(%0)\n"
                "2: movq (%0), %%mm0\n"
                "   movq 8(%0), %%mm1\n"
                "   movq 16(%0), %%mm2\n"
                "   movq 24(%0), %%mm3\n"
                "   movq %%mm0, (%1)\n"
                "   movq %%mm1, 8(%1)\n"
                "   movq %%mm2, 16(%1)\n"
                "   movq %%mm3, 24(%1)\n"
                "   movq 32(%0), %%mm0\n"
                "   movq 40(%0), %%mm1\n"
                "   movq 48(%0), %%mm2\n"
                "   movq 56(%0), %%mm3\n"
                "   movq %%mm0, 32(%1)\n"
                "   movq %%mm1, 40(%1)\n"
                "   movq %%mm2, 48(%1)\n"
                "   movq %%mm3, 56(%1)\n"
                ".section .fixup, \"ax\"\n"
                "3: movw $0x05EB, 1b\n"         /* jmp on 5 bytes */
                "   jmp 2b\n"
                ".previous\n"
                        _ASM_EXTABLE(1b, 3b)
                        : : "r" (from), "r" (to) : "memory");

                from += 64;
                to += 64;
        }
        kernel_fpu_end();
}

#endif /* !CONFIG_MK7 */
1da177e4
LT
343
344/*
ca5d3f14 345 * Favour MMX for page clear and copy:
1da177e4 346 */
static void slow_zero_page(void *page)
{
        int d0, d1;

        __asm__ __volatile__(
                "cld\n\t"
                "rep ; stosl"

                        : "=&c" (d0), "=&D" (d1)
                        : "a" (0), "1" (page), "0" (1024)
                        : "memory");
}

void mmx_clear_page(void *page)
{
        if (unlikely(in_interrupt()))
                slow_zero_page(page);
        else
                fast_clear_page(page);
}
EXPORT_SYMBOL(mmx_clear_page);

static void slow_copy_page(void *to, void *from)
{
        int d0, d1, d2;

        __asm__ __volatile__(
                "cld\n\t"
                "rep ; movsl"
                : "=&c" (d0), "=&D" (d1), "=&S" (d2)
                : "0" (1024), "1" ((long) to), "2" ((long) from)
                : "memory");
}

void mmx_copy_page(void *to, void *from)
{
        if (unlikely(in_interrupt()))
                slow_copy_page(to, from);
        else
                fast_copy_page(to, from);
}
EXPORT_SYMBOL(mmx_copy_page);
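
/*
 * Hypothetical usage sketch (assumes 4K pages and a build where these
 * helpers back the page clear/copy paths):
 *
 *	void copy_then_reset(void *dst, void *src)
 *	{
 *		_mmx_memcpy(dst, src, 4096);	// bulk copy; __memcpy() fallback in IRQ context
 *		mmx_clear_page(src);		// zero the source page afterwards
 *	}
 */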