/*
 * Memory copy functions for 32-bit PowerPC.
 *
 * Copyright (C) 1996-2005 Paul Mackerras.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/processor.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>

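/*
 * COPY_16_BYTES copies one 16-byte block from 4(r4)..16(r4) to
 * 4(r6)..16(r6); the final lwzu/stwu leave r4 and r6 advanced by 16,
 * so the macro can simply be repeated to cover a whole cache line.
 */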
#define COPY_16_BYTES		\
	lwz	r7,4(r4);	\
	lwz	r8,8(r4);	\
	lwz	r9,12(r4);	\
	lwzu	r10,16(r4);	\
	stw	r7,4(r6);	\
	stw	r8,8(r6);	\
	stw	r9,12(r6);	\
	stwu	r10,16(r6)

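/*
 * COPY_16_BYTES_WITHEX(n) is the same copy with every load labelled
 * 8n0..8n3 and every store labelled 8n4..8n7.  COPY_16_BYTES_EXCODE(n)
 * supplies the matching exception-table fixups: a fault on any of the
 * loads lands on 9n0, a fault on any of the stores lands on 9n1; both
 * subtract the 16*n bytes already copied in the current cache line from
 * r5 and branch to the common read/write fault handlers at 104f/105f in
 * __copy_tofrom_user below.
 */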
#define COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:			\
	lwz	r7,4(r4);	\
8 ## n ## 1:			\
	lwz	r8,8(r4);	\
8 ## n ## 2:			\
	lwz	r9,12(r4);	\
8 ## n ## 3:			\
	lwzu	r10,16(r4);	\
8 ## n ## 4:			\
	stw	r7,4(r6);	\
8 ## n ## 5:			\
	stw	r8,8(r6);	\
8 ## n ## 6:			\
	stw	r9,12(r6);	\
8 ## n ## 7:			\
	stwu	r10,16(r6)

#define COPY_16_BYTES_EXCODE(n)			\
9 ## n ## 0:					\
	addi	r5,r5,-(16 * n);		\
	b	104f;				\
9 ## n ## 1:					\
	addi	r5,r5,-(16 * n);		\
	b	105f;				\
	EX_TABLE(8 ## n ## 0b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 1b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 2b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 3b,9 ## n ## 0b);	\
	EX_TABLE(8 ## n ## 4b,9 ## n ## 1b);	\
	EX_TABLE(8 ## n ## 5b,9 ## n ## 1b);	\
	EX_TABLE(8 ## n ## 6b,9 ## n ## 1b);	\
	EX_TABLE(8 ## n ## 7b,9 ## n ## 1b)

	.text
	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
	.stabs	"copy_32.S",N_SO,0,0,0f
0:

CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)

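/*
 * void *memset16(u16 *s, u16 v, __kernel_size_t count)
 * (prototype as declared in <linux/string.h>)
 *
 * r3 = s, r4 = v, r5 = count of 16-bit elements.  The value is first
 * replicated into both halves of r4 and stored a word (two elements)
 * at a time; a trailing sth handles an odd count.  r3 is untouched and
 * is returned as the start of the filled area.
 */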
_GLOBAL(memset16)
	rlwinm.	r0,r5,31,1,31
	addi	r6,r3,-4
	beq-	2f
	rlwimi	r4,r4,16,0,15
	mtctr	r0
1:	stwu	r4,4(r6)
	bdnz	1b
2:	andi.	r0,r5,1
	beqlr
	sth	r4,4(r6)
	blr
EXPORT_SYMBOL(memset16)

/*
 * Use dcbz on the complete cache lines in the destination
 * to set them to zero.  This requires that the destination
 * area is cacheable.  -- paulus
 *
 * During early init, the cache might not be active yet, so dcbz cannot be
 * used.  We therefore skip the optimised block that uses dcbz.  Once the
 * cache is active, this jump is replaced by a 'bne' so that the optimised
 * block is used whenever the value to set is zero.  This is done in
 * machine_init().
 */
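/*
 * void *memset(void *s, int c, __kernel_size_t count)
 *
 * r3 = s, r4 = c, r5 = count.  r3 is preserved as the return value;
 * r6 is used as the running store pointer, kept 4 (or 1) bytes behind
 * the next store so that stwu/stbu pre-increment into place.
 */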
_GLOBAL(memset)
	cmplwi	0,r5,4
	blt	7f

	rlwimi	r4,r4,8,16,23
	rlwimi	r4,r4,16,0,15

	stw	r4,0(r3)
	beqlr
	andi.	r0,r3,3
	add	r5,r0,r5
	subf	r6,r0,r3
	cmplwi	0,r4,0
/*
 * Skip the optimised block until the cache is enabled.  This branch will be
 * replaced by a 'bne' during boot so that the normal procedure is used
 * whenever r4 is not zero.
 */
_GLOBAL(memset_nocache_branch)
	b	2f

	clrlwi	r7,r6,32-LG_CACHELINE_BYTES
	add	r8,r7,r5
	srwi	r9,r8,LG_CACHELINE_BYTES
	addic.	r9,r9,-1	/* total number of complete cachelines */
	ble	2f
	xori	r0,r7,CACHELINE_MASK & ~3
	srwi.	r0,r0,2
	beq	3f
	mtctr	r0
4:	stwu	r4,4(r6)
	bdnz	4b
3:	mtctr	r9
	li	r7,4
10:	dcbz	r7,r6
	addi	r6,r6,CACHELINE_BYTES
	bdnz	10b
	clrlwi	r5,r8,32-LG_CACHELINE_BYTES
	addi	r5,r5,4

2:	srwi	r0,r5,2
	mtctr	r0
	bdz	6f
1:	stwu	r4,4(r6)
	bdnz	1b
6:	andi.	r5,r5,3
	beqlr
	mtctr	r5
	addi	r6,r6,3
8:	stbu	r4,1(r6)
	bdnz	8b
	blr

7:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r6,r3,-1
9:	stbu	r4,1(r6)
	bdnz	9b
	blr
EXPORT_SYMBOL(memset)

/*
 * This version uses dcbz on the complete cache lines in the
 * destination area to reduce memory traffic.  This requires that
 * the destination area is cacheable.
 * We only use this version if the source and dest don't overlap.
 * -- paulus.
 *
 * During early init, the cache might not be active yet, so dcbz cannot be
 * used.  We therefore jump to generic_memcpy, which doesn't use dcbz.  This
 * jump is replaced by a nop once the cache is active.  This is done in
 * machine_init().
 */
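/*
 * For reference, the patching mentioned in the two comments above is done
 * from machine_init() in arch/powerpc/kernel/setup_32.c.  Roughly (the
 * details vary between kernel versions):
 *
 *	patch_instruction((unsigned int *)&memcpy, PPC_INST_NOP);
 *	patch_instruction((unsigned int *)&memset_nocache_branch,
 *			  create_cond_branch(...));	(replaces 'b' with 'bne')
 */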
_GLOBAL(memmove)
	cmplw	0,r3,r4
	bgt	backwards_memcpy
	/* fall through */

_GLOBAL(memcpy)
	b	generic_memcpy
	add	r7,r3,r5		/* test if the src & dst overlap */
	add	r8,r4,r5
	cmplw	0,r4,r7
	cmplw	1,r3,r8
	crand	0,0,4			/* cr0.lt &= cr1.lt */
	blt	generic_memcpy		/* if regions overlap */

	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	subf	r5,r0,r5
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
	addi	r4,r4,1
	addi	r6,r6,1
	stb	r9,3(r6)
	bdnz	70b
61:	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
	stwu	r9,4(r6)
	bdnz	72b

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	mtctr	r0
	beq	63f
53:
	dcbz	r11,r6
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
#endif
#endif
#endif
	bdnz	53b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
	addi	r4,r4,3
	addi	r6,r6,3
40:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	40b
65:	blr
EXPORT_SYMBOL(memcpy)
EXPORT_SYMBOL(memmove)

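/*
 * generic_memcpy: a plain forward copy with no cache tricks.  It is used
 * directly during early boot (until the 'b generic_memcpy' above is patched
 * out) and whenever the source and destination regions overlap.
 */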
generic_memcpy:
	srwi.	r7,r5,3
	addi	r6,r3,-4
	addi	r4,r4,-4
	beq	2f			/* if less than 8 bytes to do */
	andi.	r0,r6,3			/* get dest word aligned */
	mtctr	r7
	bne	5f
1:	lwz	r7,4(r4)
	lwzu	r8,8(r4)
	stw	r7,4(r6)
	stwu	r8,8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,4(r4)
	addi	r5,r5,-4
	stwu	r0,4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r4,r4,3
	addi	r6,r6,3
4:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	4b
	blr
5:	subfic	r0,r0,4
	mtctr	r0
6:	lbz	r7,4(r4)
	addi	r4,r4,1
	stb	r7,4(r6)
	addi	r6,r6,1
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b

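/*
 * backwards_memcpy copies from the end of the buffers towards the start.
 * memmove branches here when dest > src, so overlapping regions are
 * handled correctly in both directions.
 */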
_GLOBAL(backwards_memcpy)
	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
	add	r6,r3,r5
	add	r4,r4,r5
	beq	2f
	andi.	r0,r6,3
	mtctr	r7
	bne	5f
1:	lwz	r7,-4(r4)
	lwzu	r8,-8(r4)
	stw	r7,-4(r6)
	stwu	r8,-8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,-4(r4)
	subi	r5,r5,4
	stwu	r0,-4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
4:	lbzu	r0,-1(r4)
	stbu	r0,-1(r6)
	bdnz	4b
	blr
5:	mtctr	r0
6:	lbzu	r7,-1(r4)
	stbu	r7,-1(r6)
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b

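/*
 * unsigned long __copy_tofrom_user(void __user *to, const void __user *from,
 *				     unsigned long size)
 *
 * Same copy strategy as memcpy above, but both the loads and the stores are
 * covered by exception table entries, since the same routine is used for
 * both copy directions.  On success r3 is 0; after a fault the fixup code
 * below returns in r3 the number of bytes that were not copied.
 */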
_GLOBAL(__copy_tofrom_user)
	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
71:	stb	r9,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	70b
61:	subf	r5,r0,r5
	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
73:	stwu	r9,4(r6)
	bdnz	72b

	EX_TABLE(70b,100f)
	EX_TABLE(71b,101f)
	EX_TABLE(72b,102f)
	EX_TABLE(73b,103f)

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	beq	63f

	/* Here we decide how far ahead to prefetch the source */
	li	r3,4
	cmpwi	r0,1
	li	r7,0
	ble	114f
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */

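/*
 * At 114 below, r0 is the number of cache lines still to be copied, r7 is
 * how many of those are already being prefetched, and r3 is the prefetch
 * distance in bytes (dcbt r3,r4 touches the line r3 bytes ahead of the
 * current source pointer).  The loop copies r0 - r7 lines while prefetching
 * that far ahead, then comes back to 114 with the prefetch distance reset
 * and r7 = 0 so the remaining lines are copied without prefetching beyond
 * the end of the source buffer.
 */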
114:	subf	r8,r7,r0
	mr	r0,r7
	mtctr	r8

53:	dcbt	r3,r4
54:	dcbz	r11,r6
	EX_TABLE(54b,105f)
/* the main body of the cacheline loop */
	COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_WITHEX(2)
	COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_WITHEX(4)
	COPY_16_BYTES_WITHEX(5)
	COPY_16_BYTES_WITHEX(6)
	COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0
	li	r3,4
	li	r7,0
	bne	114b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
31:	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
40:	lbz	r0,4(r4)
41:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	40b
65:	li	r3,0
	blr

/* read fault, initial single-byte copy */
100:	li	r9,0
	b	90f
/* write fault, initial single-byte copy */
101:	li	r9,1
90:	subf	r5,r8,r5
	li	r3,0
	b	99f
/* read fault, initial word copy */
102:	li	r9,0
	b	91f
/* write fault, initial word copy */
103:	li	r9,1
91:	li	r3,2
	b	99f

/*
 * This code handles faults in the cacheline loop and branches to either
 * 104f (if in the read part) or 105f (if in the write part), after adjusting
 * r5 for the part of the current cache line that was already copied.
 */
	COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_EXCODE(2)
	COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_EXCODE(4)
	COPY_16_BYTES_EXCODE(5)
	COPY_16_BYTES_EXCODE(6)
	COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

/* read fault in cacheline loop */
104:	li	r9,0
	b	92f
/* fault on dcbz (effectively a write fault) */
/* or write fault in cacheline loop */
105:	li	r9,1
92:	li	r3,LG_CACHELINE_BYTES
	mfctr	r8
	add	r0,r0,r8
	b	106f
/* read fault in final word loop */
108:	li	r9,0
	b	93f
/* write fault in final word loop */
109:	li	r9,1
93:	andi.	r5,r5,3
	li	r3,2
	b	99f
/* read fault in final byte loop */
110:	li	r9,0
	b	94f
/* write fault in final byte loop */
111:	li	r9,1
94:	li	r5,0
	li	r3,0
/*
 * At this stage the number of bytes not copied is
 * r5 + (ctr << r3), and r9 is 0 for read or 1 for write.
 */
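/*
 * Worked example: a fault in the final word loop (labels 30/31) is routed
 * through 108/109 above, which leave r3 = 2 and reduce r5 to its sub-word
 * remainder, so the count computed below is (r5 & 3) + 4 * ctr: the byte
 * tail plus four bytes for every word that had not yet been copied.
 */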
99:	mfctr	r0
106:	slw	r3,r0,r3
	add.	r3,r3,r5
	beq	120f			/* shouldn't happen */
	cmpwi	0,r9,0
	bne	120f
/* for a read fault, first try to continue the copy one byte at a time */
	mtctr	r3
130:	lbz	r0,4(r4)
131:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	130b
/* 132 is also the fixup for a further read fault in the byte loop above;
   it returns the number of bytes still not copied (zero if the loop ran
   to completion) */
132:	mfctr	r3
120:	blr

	EX_TABLE(30b,108b)
	EX_TABLE(31b,109b)
	EX_TABLE(40b,110b)
	EX_TABLE(41b,111b)
	EX_TABLE(130b,132b)
	EX_TABLE(131b,120b)

EXPORT_SYMBOL(__copy_tofrom_user)