]>
Commit | Line | Data |
---|---|---|
b2441318 | 1 | /* SPDX-License-Identifier: GPL-2.0 */ |
1da177e4 LT |
2 | /* |
3 | * "memcpy" implementation of SuperH | |
4 | * | |
5 | * Copyright (C) 1999 Niibe Yutaka | |
6 | * Copyright (c) 2002 STMicroelectronics Ltd | |
7 | * Modified from memcpy.S and micro-optimised for SH4 | |
8 | * Stuart Menefy (stuart.menefy@st.com) | |
9 | * | |
10 | */ | |
11 | #include <linux/linkage.h> | |
1da177e4 LT |
12 | |
13 | /* | |
14 | * void *memcpy(void *dst, const void *src, size_t n); | |
15 | * | |
16 | * It is assumed that there is no overlap between src and dst. | |
17 | * If there is an overlap, then the results are undefined. | |
18 | */ | |
19 | ||
20 | ! .Lcase1: (src - dst) & 3 == 1; copies backwards from the end of the buffer | |
21 | ! GHIJ KLMN OPQR --> ...G HIJK LMNO PQR. | |
22 | ! On entry: r0 = current (long word aligned) end of dst, r4 = dst, r5 = src - dst | |
23 | ||
24 | ! Size is 16 or greater, and may have trailing bytes | |
25 | ||
26 | .balign 32 | |
27 | .Lcase1: | |
28 | ! Read a long word and write a long word at once | |
29 | ! At the start of each iteration, r7 contains last long load | |
30 | add #-1,r5 ! 79 EX | |
31 | mov r4,r2 ! 5 MT (0 cycles latency) | |
32 | ||
33 | mov.l @(r0,r5),r7 ! 21 LS (2 cycles latency) | |
34 | add #-4,r5 ! 50 EX | |
35 | ||
36 | add #7,r2 ! 79 EX | |
37 | ! r2 = dst + 7; the long word loop below repeats while r0 > r2 | |
38 | #ifdef CONFIG_CPU_LITTLE_ENDIAN | |
39 | ! 6 cycles, 4 bytes per iteration | |
40 | 3: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! NMLK | |
41 | mov r7, r3 ! 5 MT (latency=0) ! RQPO | |
42 | ||
43 | cmp/hi r2,r0 ! 57 MT | |
44 | shll16 r3 ! 103 EX | |
45 | ||
46 | mov r1,r6 ! 5 MT (latency=0) | |
47 | shll8 r3 ! 102 EX ! Oxxx | |
48 | ||
49 | shlr8 r6 ! 106 EX ! xNML | |
50 | mov r1, r7 ! 5 MT (latency=0) | |
51 | ||
52 | or r6,r3 ! 82 EX ! ONML | |
53 | bt/s 3b ! 109 BR | |
54 | ||
55 | mov.l r3,@-r0 ! 30 LS | |
56 | #else | |
57 | 3: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! KLMN | |
58 | mov r7,r3 ! 5 MT (latency=0) ! OPQR | |
59 | ||
60 | cmp/hi r2,r0 ! 57 MT | |
61 | shlr16 r3 ! 107 EX | |
62 | ||
63 | shlr8 r3 ! 106 EX ! xxxO | |
64 | mov r1,r6 ! 5 MT (latency=0) | |
65 | ||
66 | shll8 r6 ! 102 EX ! LMNx | |
67 | mov r1,r7 ! 5 MT (latency=0) | |
68 | ||
69 | or r6,r3 ! 82 EX ! LMNO | |
70 | bt/s 3b ! 109 BR | |
71 | ||
72 | mov.l r3,@-r0 ! 30 LS | |
73 | #endif | |
74 | ! Finally, copy a byte at once, if necessary | |
75 | ! r5 becomes (src - dst) - 1 below, so @(r0,r5) is the source of the byte stored by @-r0 | |
76 | add #4,r5 ! 50 EX | |
77 | cmp/eq r4,r0 ! 54 MT | |
78 | ||
79 | add #-6,r2 ! 50 EX | |
80 | bt 9f ! 109 BR | |
81 | ||
82 | 8: cmp/hi r2,r0 ! 57 MT | |
83 | mov.b @(r0,r5),r1 ! 20 LS (latency=2) | |
84 | ||
85 | bt/s 8b ! 109 BR | |
86 | ||
87 | mov.b r1,@-r0 ! 29 LS | |
88 | ||
89 | 9: rts | |
90 | nop | |
91 | ||
92 | ||
93 | ! .Lcase3: (src - dst) & 3 == 3; copies backwards from the end of the buffer | |
94 | ! GHIJ KLMN OPQR --> .GHI JKLM NOPQ R... | |
95 | ! On entry: r0 = current (long word aligned) end of dst, r4 = dst, r5 = src - dst | |
96 | ||
97 | ! Size is 16 or greater, and may have trailing bytes | |
98 | ||
99 | .balign 32 | |
100 | .Lcase3: | |
101 | ! Read a long word and write a long word at once | |
102 | ! At the start of each iteration, r7 contains last long load | |
103 | add #-3,r5 ! 79 EX | |
104 | mov r4,r2 ! 5 MT (0 cycles latency) | |
105 | ||
106 | mov.l @(r0,r5),r7 ! 21 LS (2 cycles latency) | |
107 | add #-4,r5 ! 50 EX | |
108 | ||
109 | add #7,r2 ! 79 EX | |
110 | ! r2 = dst + 7; the long word loop below repeats while r0 > r2 | |
111 | #ifdef CONFIG_CPU_LITTLE_ENDIAN | |
112 | ! 6 cycles, 4 bytes per iteration | |
113 | 3: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! NMLK | |
114 | mov r7, r3 ! 5 MT (latency=0) ! RQPO | |
115 | ||
116 | cmp/hi r2,r0 ! 57 MT | |
117 | shll8 r3 ! 102 EX ! QPOx | |
118 | ||
119 | mov r1,r6 ! 5 MT (latency=0) | |
120 | shlr16 r6 ! 107 EX | |
121 | ||
122 | shlr8 r6 ! 106 EX ! xxxN | |
123 | mov r1, r7 ! 5 MT (latency=0) | |
124 | ||
125 | or r6,r3 ! 82 EX ! QPON | |
126 | bt/s 3b ! 109 BR | |
127 | ||
128 | mov.l r3,@-r0 ! 30 LS | |
129 | #else | |
e08b954c | 130 | 3: mov r7,r3 ! OPQR |
1da177e4 | 131 | shlr8 r3 ! xOPQ |
e08b954c HS |
132 | mov.l @(r0,r5),r7 ! KLMN |
133 | mov r7,r6 | |
1da177e4 LT |
134 | shll16 r6 |
135 | shll8 r6 ! Nxxx | |
136 | or r6,r3 ! NOPQ | |
137 | cmp/hi r2,r0 | |
138 | bt/s 3b | |
139 | mov.l r3,@-r0 | |
140 | #endif | |
141 | ||
142 | ! Finally, copy a byte at once, if necessary | |
143 | ! r5 becomes (src - dst) - 1 below, so @(r0,r5) is the source of the byte stored by @-r0 | |
144 | add #6,r5 ! 50 EX | |
145 | cmp/eq r4,r0 ! 54 MT | |
146 | ||
147 | add #-6,r2 ! 50 EX | |
148 | bt 9f ! 109 BR | |
149 | ||
150 | 8: cmp/hi r2,r0 ! 57 MT | |
151 | mov.b @(r0,r5),r1 ! 20 LS (latency=2) | |
152 | ||
153 | bt/s 8b ! 109 BR | |
154 | ||
155 | mov.b r1,@-r0 ! 29 LS | |
156 | ||
157 | 9: rts | |
158 | nop | |
159 | ||
160 | ENTRY(memcpy) | |
161 | ||
162 | ! Calculate the invariants which will be used in the remainder | |
163 | ! of the code: | |
164 | ! | |
165 | ! r4 --> [ ... ] DST [ ... ] SRC | |
166 | ! [ ... ] [ ... ] | |
167 | ! : : | |
168 | ! r0 --> [ ... ] r0+r5 --> [ ... ] | |
169 | ! r0 = dst + n and counts down to r4 as data is stored with @-r0 | |
170 | ! r5 = src - dst (individual paths bias it by a small constant), so @(r0,r5) reads the source | |
171 | ||
172 | ! Short circuit the common case of src, dst and len being 32 bit aligned | |
173 | ! and test for zero length move | |
174 | ||
175 | mov r6, r0 ! 5 MT (0 cycle latency) | |
176 | or r4, r0 ! 82 EX | |
177 | ||
178 | or r5, r0 ! 82 EX | |
179 | tst r6, r6 ! 86 MT | |
180 | ||
181 | bt/s 99f ! 111 BR (zero len) | |
182 | tst #3, r0 ! 87 MT | |
183 | ||
184 | mov r4, r0 ! 5 MT (0 cycle latency) | |
185 | add r6, r0 ! 49 EX | |
186 | ||
187 | mov #16, r1 ! 6 EX | |
188 | bt/s .Lcase00 ! 111 BR (aligned) | |
189 | ||
190 | sub r4, r5 ! 75 EX | |
191 | ||
192 | ! Arguments are not nicely long word aligned or zero len. | |
193 | ! Check for small copies, and if so do a simple byte at a time copy. | |
194 | ! | |
195 | ! Deciding on an exact value of 'small' is not easy, as the point at which | |
196 | ! using the optimised routines become worthwhile varies (these are the | |
197 | ! cycle counts for different sizes using byte-at-a-time vs. optimised): | |
198 | ! size byte-at-time long word byte | |
199 | ! 16 42 39-40 46-50 50-55 | |
200 | ! 24 58 43-44 54-58 62-67 | |
201 | ! 36 82 49-50 66-70 80-85 | |
202 | ! However the penalty for getting it 'wrong' is much higher for long word | |
203 | ! aligned data (and this is more common), so use a value of 16. | |
204 | ||
205 | cmp/gt r6,r1 ! 56 MT | |
206 | ||
207 | add #-1,r5 ! 50 EX | |
208 | bf/s 6f ! 108 BR (not small) | |
209 | ||
210 | mov r5, r3 ! 5 MT (latency=0) | |
211 | shlr r6 ! 104 EX | |
212 | ||
213 | mov.b @(r0,r5),r1 ! 20 LS (latency=2) | |
214 | bf/s 4f ! 111 BR | |
215 | ||
216 | add #-1,r3 ! 50 EX | |
217 | tst r6, r6 ! 86 MT | |
218 | ||
219 | bt/s 98f ! 110 BR | |
220 | mov.b r1,@-r0 ! 29 LS | |
221 | ||
222 | ! 4 cycles, 2 bytes per iteration | |
223 | 3: mov.b @(r0,r5),r1 ! 20 LS (latency=2) | |
224 | ||
225 | 4: mov.b @(r0,r3),r2 ! 20 LS (latency=2) | |
226 | dt r6 ! 67 EX | |
227 | ||
228 | mov.b r1,@-r0 ! 29 LS | |
229 | bf/s 3b ! 111 BR | |
230 | ||
231 | mov.b r2,@-r0 ! 29 LS | |
232 | 98: | |
233 | rts | |
234 | nop | |
235 | ||
236 | 99: rts | |
237 | mov r4, r0 ! delay slot: return dst for the zero-length case | |
238 | ||
239 | ! Size is not small, so it's worthwhile looking for optimisations. | |
240 | ! First align destination to a long word boundary. | |
241 | ! | |
242 | ! r5 = normal value -1 | |
243 | ||
244 | 6: tst #3, r0 ! 87 MT | |
245 | mov #3, r3 ! 6 EX | |
246 | ||
247 | bt/s 2f ! 111 BR | |
248 | and r0,r3 ! 78 EX | |
249 | ||
250 | ! 3 cycles, 1 byte per iteration | |
251 | 1: dt r3 ! 67 EX | |
252 | mov.b @(r0,r5),r1 ! 19 LS (latency=2) | |
253 | ||
254 | add #-1, r6 ! 79 EX | |
255 | bf/s 1b ! 109 BR | |
256 | ||
257 | mov.b r1,@-r0 ! 28 LS | |
258 | ||
259 | 2: add #1, r5 ! 79 EX | |
260 | ||
261 | ! Now select the appropriate bulk transfer code based on relative | |
262 | ! alignment of src and dst. | |
263 | ||
264 | mov r0, r3 ! 5 MT (latency=0) | |
265 | ||
266 | mov r5, r0 ! 5 MT (latency=0) | |
267 | tst #1, r0 ! 87 MT | |
268 | ||
269 | bf/s 1f ! 111 BR | |
270 | mov #64, r7 ! 6 EX | |
271 | ||
272 | ! bit 0 clear | |
273 | ||
274 | cmp/ge r7, r6 ! 55 MT | |
275 | ||
276 | bt/s 2f ! 111 BR | |
277 | tst #2, r0 ! 87 MT | |
278 | ||
279 | ! small | |
280 | bt/s .Lcase0 | |
281 | mov r3, r0 | |
282 | ||
283 | bra .Lcase2 | |
284 | nop | |
285 | ||
286 | ! big | |
287 | 2: bt/s .Lcase0b | |
288 | mov r3, r0 | |
289 | ||
290 | bra .Lcase2b | |
291 | nop | |
292 | ||
293 | ! bit 0 set | |
294 | 1: tst #2, r0 ! 87 MT | |
295 | ||
296 | bt/s .Lcase1 | |
297 | mov r3, r0 | |
298 | ||
299 | bra .Lcase3 | |
300 | nop | |
301 | ||
302 | ||
303 | ! .Lcase00: reached from ENTRY(memcpy) when (dst | src | n) & 3 == 0 and n != 0 | |
304 | ! GHIJ KLMN OPQR --> GHIJ KLMN OPQR | |
305 | ! Copies of 64 bytes or more branch on to .Lcase00b | |
306 | ||
307 | ! src, dst and size are all long word aligned | |
308 | ! size is non-zero | |
309 | ||
310 | .balign 32 | |
311 | .Lcase00: | |
312 | mov #64, r1 ! 6 EX | |
313 | mov r5, r3 ! 5 MT (latency=0) | |
314 | ||
315 | cmp/gt r6, r1 ! 56 MT | |
316 | add #-4, r5 ! 50 EX | |
317 | ||
318 | bf .Lcase00b ! 108 BR (big loop) | |
319 | shlr2 r6 ! 105 EX | |
320 | ||
321 | shlr r6 ! 104 EX | |
322 | mov.l @(r0, r5), r1 ! 21 LS (latency=2) | |
323 | ||
324 | bf/s 4f ! 111 BR | |
325 | add #-8, r3 ! 50 EX | |
326 | ||
327 | tst r6, r6 ! 86 MT | |
328 | bt/s 5f ! 110 BR | |
329 | ||
330 | mov.l r1,@-r0 ! 30 LS | |
331 | ||
332 | ! 4 cycles, 2 long words per iteration | |
333 | 3: mov.l @(r0, r5), r1 ! 21 LS (latency=2) | |
334 | ||
335 | 4: mov.l @(r0, r3), r2 ! 21 LS (latency=2) | |
336 | dt r6 ! 67 EX | |
337 | ||
338 | mov.l r1, @-r0 ! 30 LS | |
339 | bf/s 3b ! 109 BR | |
340 | ||
341 | mov.l r2, @-r0 ! 30 LS | |
342 | ||
343 | 5: rts | |
344 | nop | |
345 | ||
346 | ||
347 | ! Size is 16 or greater and less than 64, but may have trailing bytes | |
348 | ! Reached from ENTRY(memcpy) when (src - dst) & 3 == 0, after dst has been long word aligned | |
349 | .balign 32 | |
350 | .Lcase0: | |
351 | add #-4, r5 ! 50 EX | |
352 | mov r4, r7 ! 5 MT (latency=0) | |
353 | ||
354 | mov.l @(r0, r5), r1 ! 21 LS (latency=2) | |
355 | mov #4, r2 ! 6 EX | |
356 | ||
357 | add #11, r7 ! 50 EX | |
358 | tst r2, r6 ! 86 MT | |
359 | ! r7 = dst + 11; the two-long-word loop below repeats while r0 > r7 | |
360 | mov r5, r3 ! 5 MT (latency=0) | |
361 | bt/s 4f ! 111 BR | |
362 | ||
363 | add #-4, r3 ! 50 EX | |
364 | mov.l r1,@-r0 ! 30 LS | |
365 | ||
366 | ! 4 cycles, 2 long words per iteration | |
367 | 3: mov.l @(r0, r5), r1 ! 21 LS (latency=2) | |
368 | ||
369 | 4: mov.l @(r0, r3), r2 ! 21 LS (latency=2) | |
370 | cmp/hi r7, r0 | |
371 | ||
372 | mov.l r1, @-r0 ! 30 LS | |
373 | bt/s 3b ! 109 BR | |
374 | ||
375 | mov.l r2, @-r0 ! 30 LS | |
376 | ||
377 | ! Copy the final 0-3 bytes | |
378 | ||
379 | add #3,r5 ! 50 EX | |
380 | ||
381 | cmp/eq r0, r4 ! 54 MT | |
382 | add #-10, r7 ! 50 EX | |
383 | ||
384 | bt 9f ! 110 BR | |
385 | ||
386 | ! 3 cycles, 1 byte per iteration | |
387 | 1: mov.b @(r0,r5),r1 ! 19 LS | |
388 | cmp/hi r7,r0 ! 57 MT | |
389 | ||
390 | bt/s 1b ! 111 BR | |
391 | mov.b r1,@-r0 ! 28 LS | |
392 | ||
393 | 9: rts | |
394 | nop | |
395 | ||
396 | ! Size is at least 64 bytes, so will be going round the big loop at least once. | |
397 | ! | |
398 | ! r2 = rounded up r4 | |
399 | ! r3 = rounded down r0 | |
400 | ||
401 | .balign 32 | |
402 | .Lcase0b: | |
403 | add #-4, r5 ! 50 EX | |
404 | ||
405 | .Lcase00b: | |
406 | mov r0, r3 ! 5 MT (latency=0) | |
407 | mov #(~0x1f), r1 ! 6 EX | |
408 | ||
409 | and r1, r3 ! 78 EX | |
410 | mov r4, r2 ! 5 MT (latency=0) | |
411 | ||
412 | cmp/eq r3, r0 ! 54 MT | |
413 | add #0x1f, r2 ! 50 EX | |
414 | ||
415 | bt/s 1f ! 110 BR | |
416 | and r1, r2 ! 78 EX | |
417 | ||
418 | ! copy initial words until cache line aligned | |
419 | ||
420 | mov.l @(r0, r5), r1 ! 21 LS (latency=2) | |
421 | tst #4, r0 ! 87 MT | |
422 | ||
423 | mov r5, r6 ! 5 MT (latency=0) | |
424 | add #-4, r6 ! 50 EX | |
425 | ||
426 | bt/s 4f ! 111 BR | |
427 | add #8, r3 ! 50 EX | |
428 | ||
429 | tst #0x18, r0 ! 87 MT | |
430 | ||
431 | bt/s 1f ! 109 BR | |
432 | mov.l r1,@-r0 ! 30 LS | |
433 | ||
434 | ! 4 cycles, 2 long words per iteration | |
435 | 3: mov.l @(r0, r5), r1 ! 21 LS (latency=2) | |
436 | ||
437 | 4: mov.l @(r0, r6), r7 ! 21 LS (latency=2) | |
438 | cmp/eq r3, r0 ! 54 MT | |
439 | ||
440 | mov.l r1, @-r0 ! 30 LS | |
441 | bf/s 3b ! 109 BR | |
442 | ||
443 | mov.l r7, @-r0 ! 30 LS | |
444 | ||
445 | ! Copy the cache line aligned blocks | |
446 | ! | |
447 | ! In use: r0, r2, r4, r5 | |
448 | ! Scratch: r1, r3, r6, r7 | |
449 | ! | |
450 | ! We could do this with the four scratch registers, but if src | |
451 | ! and dest hit the same cache line, this will thrash, so make | |
452 | ! use of additional registers. | |
453 | ! | |
454 | ! We also need r0 as a temporary (for movca), so 'undo' the invariant: | |
455 | ! r5: src (was r0+r5) | |
456 | ! r1: dest (was r0) | |
457 | ! this can be reversed at the end, so we don't need to save any extra | |
458 | ! state. | |
459 | ! | |
460 | 1: mov.l r8, @-r15 ! 30 LS | |
461 | add r0, r5 ! 49 EX | |
462 | ||
463 | mov.l r9, @-r15 ! 30 LS | |
464 | mov r0, r1 ! 5 MT (latency=0) | |
465 | ||
466 | mov.l r10, @-r15 ! 30 LS | |
467 | add #-0x1c, r5 ! 50 EX | |
468 | ||
469 | mov.l r11, @-r15 ! 30 LS | |
470 | ||
471 | ! 16 cycles, 32 bytes per iteration | |
472 | 2: mov.l @(0x00,r5),r0 ! 18 LS (latency=2) | |
473 | add #-0x20, r1 ! 50 EX | |
474 | mov.l @(0x04,r5),r3 ! 18 LS (latency=2) | |
475 | mov.l @(0x08,r5),r6 ! 18 LS (latency=2) | |
476 | mov.l @(0x0c,r5),r7 ! 18 LS (latency=2) | |
477 | mov.l @(0x10,r5),r8 ! 18 LS (latency=2) | |
478 | mov.l @(0x14,r5),r9 ! 18 LS (latency=2) | |
479 | mov.l @(0x18,r5),r10 ! 18 LS (latency=2) | |
480 | mov.l @(0x1c,r5),r11 ! 18 LS (latency=2) | |
481 | movca.l r0,@r1 ! 40 LS (latency=3-7) | |
482 | mov.l r3,@(0x04,r1) ! 33 LS | |
483 | mov.l r6,@(0x08,r1) ! 33 LS | |
484 | mov.l r7,@(0x0c,r1) ! 33 LS | |
485 | ||
486 | mov.l r8,@(0x10,r1) ! 33 LS | |
487 | add #-0x20, r5 ! 50 EX | |
488 | ||
489 | mov.l r9,@(0x14,r1) ! 33 LS | |
490 | cmp/eq r2,r1 ! 54 MT | |
491 | ||
492 | mov.l r10,@(0x18,r1) ! 33 LS | |
493 | bf/s 2b ! 109 BR | |
494 | ||
495 | mov.l r11,@(0x1c,r1) ! 33 LS | |
496 | ||
497 | mov r1, r0 ! 5 MT (latency=0) | |
498 | ||
499 | mov.l @r15+, r11 ! 15 LS | |
500 | sub r1, r5 ! 75 EX | |
501 | ||
502 | mov.l @r15+, r10 ! 15 LS | |
503 | cmp/eq r4, r0 ! 54 MT | |
504 | ||
505 | bf/s 1f ! 109 BR | |
506 | mov.l @r15+, r9 ! 15 LS | |
507 | ||
508 | rts | |
509 | 1: mov.l @r15+, r8 ! 15 LS | |
510 | sub r4, r1 ! 75 EX (len remaining) | |
511 | ||
512 | ! number of trailing bytes is non-zero | |
513 | ! | |
514 | ! invariants restored (r5 already decremented by 4) | |
515 | ! also r1=num bytes remaining | |
516 | ||
517 | mov #4, r2 ! 6 EX | |
518 | mov r4, r7 ! 5 MT (latency=0) | |
519 | ||
520 | add #0x1c, r5 ! 50 EX (back to -4) | |
521 | cmp/hs r2, r1 ! 58 MT | |
522 | ||
523 | bf/s 5f ! 108 BR | |
524 | add #11, r7 ! 50 EX | |
525 | ||
526 | mov.l @(r0, r5), r6 ! 21 LS (latency=2) | |
527 | tst r2, r1 ! 86 MT | |
528 | ||
529 | mov r5, r3 ! 5 MT (latency=0) | |
530 | bt/s 4f ! 111 BR | |
531 | ||
532 | add #-4, r3 ! 50 EX | |
533 | cmp/hs r2, r1 ! 58 MT | |
534 | ! NOTE(review): r2 = 4 and bit 2 of r1 is set here, so this cmp/hs looks always true and the rest is copied by the byte loop at 5: - confirm intent | |
535 | bt/s 5f ! 111 BR | |
536 | mov.l r6,@-r0 ! 30 LS | |
537 | ||
538 | ! 4 cycles, 2 long words per iteration | |
539 | 3: mov.l @(r0, r5), r6 ! 21 LS (latency=2) | |
540 | ||
541 | 4: mov.l @(r0, r3), r2 ! 21 LS (latency=2) | |
542 | cmp/hi r7, r0 | |
543 | ||
544 | mov.l r6, @-r0 ! 30 LS | |
545 | bt/s 3b ! 109 BR | |
546 | ||
547 | mov.l r2, @-r0 ! 30 LS | |
548 | ||
549 | ! Copy the final 0-3 bytes | |
550 | ||
551 | 5: cmp/eq r0, r4 ! 54 MT | |
552 | add #-10, r7 ! 50 EX | |
553 | ||
554 | bt 9f ! 110 BR | |
555 | add #3,r5 ! 50 EX | |
556 | ||
557 | ! 3 cycles, 1 byte per iteration | |
558 | 1: mov.b @(r0,r5),r1 ! 19 LS | |
559 | cmp/hi r7,r0 ! 57 MT | |
560 | ||
561 | bt/s 1b ! 111 BR | |
562 | mov.b r1,@-r0 ! 28 LS | |
563 | ||
564 | 9: rts | |
565 | nop | |
566 | ||
567 | ! .Lcase2: (src - dst) & 3 == 2 and fewer than 64 bytes remain | |
568 | ! GHIJ KLMN OPQR --> ..GH IJKL MNOP QR.. | |
569 | ! Data is moved a short word (16 bits) at a time, two per loop iteration | |
570 | ||
571 | .balign 32 | |
572 | .Lcase2: | |
573 | ! Size is 16 or greater and less than 64, but may have trailing bytes | |
574 | ||
575 | 2: mov r5, r6 ! 5 MT (latency=0) | |
576 | add #-2,r5 ! 50 EX | |
577 | ||
578 | mov r4,r2 ! 5 MT (latency=0) | |
579 | add #-4,r6 ! 50 EX | |
580 | ||
581 | add #7,r2 ! 50 EX | |
582 | 3: mov.w @(r0,r5),r1 ! 20 LS (latency=2) | |
583 | ||
584 | mov.w @(r0,r6),r3 ! 20 LS (latency=2) | |
585 | cmp/hi r2,r0 ! 57 MT | |
586 | ||
587 | mov.w r1,@-r0 ! 29 LS | |
588 | bt/s 3b ! 111 BR | |
589 | ||
590 | mov.w r3,@-r0 ! 29 LS | |
591 | ||
592 | bra 10f | |
593 | nop | |
594 | ||
595 | ||
596 | .balign 32 | |
597 | .Lcase2b: | |
598 | ! Size is at least 64 bytes, so will be going round the big loop at least once. | |
599 | ! | |
600 | ! r2 = rounded up r4 | |
601 | ! r3 = rounded down r0 | |
602 | ||
603 | mov r0, r3 ! 5 MT (latency=0) | |
604 | mov #(~0x1f), r1 ! 6 EX | |
605 | ||
606 | and r1, r3 ! 78 EX | |
607 | mov r4, r2 ! 5 MT (latency=0) | |
608 | ||
609 | cmp/eq r3, r0 ! 54 MT | |
610 | add #0x1f, r2 ! 50 EX | |
611 | ||
612 | add #-2, r5 ! 50 EX | |
613 | bt/s 1f ! 110 BR | |
614 | and r1, r2 ! 78 EX | |
615 | ||
616 | ! Copy a short word one at a time until we are cache line aligned | |
617 | ! Normal values: r0, r2, r3, r4 | |
618 | ! Unused: r1, r6, r7 | |
619 | ! Mod: r5 (=r5-2) | |
620 | ! | |
621 | add #2, r3 ! 50 EX | |
622 | ||
623 | 2: mov.w @(r0,r5),r1 ! 20 LS (latency=2) | |
624 | cmp/eq r3,r0 ! 54 MT | |
625 | ||
626 | bf/s 2b ! 111 BR | |
627 | ||
628 | mov.w r1,@-r0 ! 29 LS | |
629 | ||
630 | ! Copy the cache line aligned blocks | |
631 | ! | |
632 | ! In use: r0, r2, r4, r5 (=r5-2) | |
633 | ! Scratch: r1, r3, r6, r7 | |
634 | ! | |
635 | ! We could do this with the four scratch registers, but if src | |
636 | ! and dest hit the same cache line, this will thrash, so make | |
637 | ! use of additional registers. | |
638 | ! | |
639 | ! We also need r0 as a temporary (for movca), so 'undo' the invariant: | |
640 | ! r5: src (was r0+r5) | |
641 | ! r1: dest (was r0) | |
642 | ! this can be reversed at the end, so we don't need to save any extra | |
643 | ! state. | |
644 | ! | |
645 | 1: mov.l r8, @-r15 ! 30 LS | |
646 | add r0, r5 ! 49 EX | |
647 | ||
648 | mov.l r9, @-r15 ! 30 LS | |
649 | mov r0, r1 ! 5 MT (latency=0) | |
650 | ||
651 | mov.l r10, @-r15 ! 30 LS | |
652 | add #-0x1e, r5 ! 50 EX | |
653 | ||
654 | mov.l r11, @-r15 ! 30 LS | |
655 | ||
656 | mov.l r12, @-r15 ! 30 LS | |
657 | ||
658 | ! 17 cycles, 32 bytes per iteration | |
659 | #ifdef CONFIG_CPU_LITTLE_ENDIAN | |
660 | 2: mov.w @r5+, r0 ! 14 LS (latency=2) ..JI | |
661 | add #-0x20, r1 ! 50 EX | |
662 | ||
663 | mov.l @r5+, r3 ! 15 LS (latency=2) NMLK | |
664 | ||
665 | mov.l @r5+, r6 ! 15 LS (latency=2) RQPO | |
666 | shll16 r0 ! 103 EX JI.. | |
667 | ||
668 | mov.l @r5+, r7 ! 15 LS (latency=2) | |
669 | xtrct r3, r0 ! 48 EX LKJI | |
670 | ||
671 | mov.l @r5+, r8 ! 15 LS (latency=2) | |
672 | xtrct r6, r3 ! 48 EX PONM | |
673 | ||
674 | mov.l @r5+, r9 ! 15 LS (latency=2) | |
675 | xtrct r7, r6 ! 48 EX | |
676 | ||
677 | mov.l @r5+, r10 ! 15 LS (latency=2) | |
678 | xtrct r8, r7 ! 48 EX | |
679 | ||
680 | mov.l @r5+, r11 ! 15 LS (latency=2) | |
681 | xtrct r9, r8 ! 48 EX | |
682 | ||
683 | mov.w @r5+, r12 ! 15 LS (latency=2) | |
684 | xtrct r10, r9 ! 48 EX | |
685 | ||
686 | movca.l r0,@r1 ! 40 LS (latency=3-7) | |
687 | xtrct r11, r10 ! 48 EX | |
688 | ||
689 | mov.l r3, @(0x04,r1) ! 33 LS | |
690 | xtrct r12, r11 ! 48 EX | |
691 | ||
692 | mov.l r6, @(0x08,r1) ! 33 LS | |
693 | ||
694 | mov.l r7, @(0x0c,r1) ! 33 LS | |
695 | ||
696 | mov.l r8, @(0x10,r1) ! 33 LS | |
697 | add #-0x40, r5 ! 50 EX | |
698 | ||
699 | mov.l r9, @(0x14,r1) ! 33 LS | |
700 | cmp/eq r2,r1 ! 54 MT | |
701 | ||
702 | mov.l r10, @(0x18,r1) ! 33 LS | |
703 | bf/s 2b ! 109 BR | |
704 | ||
705 | mov.l r11, @(0x1c,r1) ! 33 LS | |
706 | #else | |
707 | 2: mov.w @(0x1e,r5), r0 ! 17 LS (latency=2) | |
708 | add #-2, r5 ! 50 EX | |
709 | ||
710 | mov.l @(0x1c,r5), r3 ! 18 LS (latency=2) | |
711 | add #-4, r1 ! 50 EX | |
712 | ||
713 | mov.l @(0x18,r5), r6 ! 18 LS (latency=2) | |
714 | shll16 r0 ! 103 EX | |
715 | ||
716 | mov.l @(0x14,r5), r7 ! 18 LS (latency=2) | |
717 | xtrct r3, r0 ! 48 EX | |
718 | ||
719 | mov.l @(0x10,r5), r8 ! 18 LS (latency=2) | |
720 | xtrct r6, r3 ! 48 EX | |
721 | ||
722 | mov.l @(0x0c,r5), r9 ! 18 LS (latency=2) | |
723 | xtrct r7, r6 ! 48 EX | |
724 | ||
725 | mov.l @(0x08,r5), r10 ! 18 LS (latency=2) | |
726 | xtrct r8, r7 ! 48 EX | |
727 | ||
728 | mov.l @(0x04,r5), r11 ! 18 LS (latency=2) | |
729 | xtrct r9, r8 ! 48 EX | |
730 | ||
c7afb7e5 NI |
731 | mov.l @(0x00,r5), r12 ! 18 LS (latency=2) |
732 | xtrct r10, r9 ! 48 EX | |
1da177e4 LT |
733 | |
734 | movca.l r0,@r1 ! 40 LS (latency=3-7) | |
735 | add #-0x1c, r1 ! 50 EX | |
736 | ||
e08b954c | 737 | mov.l r3, @(0x18,r1) ! 33 LS |
1da177e4 LT |
738 | xtrct r11, r10 ! 48 EX |
739 | ||
e08b954c | 740 | mov.l r6, @(0x14,r1) ! 33 LS |
1da177e4 LT |
741 | xtrct r12, r11 ! 48 EX |
742 | ||
e08b954c | 743 | mov.l r7, @(0x10,r1) ! 33 LS |
1da177e4 | 744 | |
e08b954c HS |
745 | mov.l r8, @(0x0c,r1) ! 33 LS |
746 | add #-0x1e, r5 ! 50 EX | |
1da177e4 | 747 | |
e08b954c | 748 | mov.l r9, @(0x08,r1) ! 33 LS |
1da177e4 LT |
749 | cmp/eq r2,r1 ! 54 MT |
750 | ||
e08b954c | 751 | mov.l r10, @(0x04,r1) ! 33 LS |
1da177e4 LT |
752 | bf/s 2b ! 109 BR |
753 | ||
e08b954c | 754 | mov.l r11, @(0x00,r1) ! 33 LS |
1da177e4 LT |
755 | #endif |
756 | ||
757 | mov.l @r15+, r12 | |
758 | mov r1, r0 ! 5 MT (latency=0) | |
759 | ||
760 | mov.l @r15+, r11 ! 15 LS | |
761 | sub r1, r5 ! 75 EX | |
762 | ||
763 | mov.l @r15+, r10 ! 15 LS | |
764 | cmp/eq r4, r0 ! 54 MT | |
765 | ||
766 | bf/s 1f ! 109 BR | |
767 | mov.l @r15+, r9 ! 15 LS | |
768 | ||
769 | rts | |
770 | 1: mov.l @r15+, r8 ! 15 LS | |
771 | ||
772 | add #0x1e, r5 ! 50 EX | |
773 | ||
774 | ! Finish off a short word at a time | |
775 | ! r5 must be invariant - 2 | |
776 | 10: mov r4,r2 ! 5 MT (latency=0) | |
777 | add #1,r2 ! 50 EX | |
778 | ||
779 | cmp/hi r2, r0 ! 57 MT | |
780 | bf/s 1f ! 109 BR | |
781 | ||
782 | add #2, r2 ! 50 EX | |
783 | ||
784 | 3: mov.w @(r0,r5),r1 ! 20 LS | |
785 | cmp/hi r2,r0 ! 57 MT | |
786 | ||
787 | bt/s 3b ! 109 BR | |
788 | ||
789 | mov.w r1,@-r0 ! 29 LS | |
790 | 1: | |
791 | ||
792 | ! r0 == r4 means dst is complete; otherwise a single byte is copied below | |
793 | ! Finally, copy the last byte if necessary | |
794 | cmp/eq r4,r0 ! 54 MT | |
795 | bt/s 9b ! done if r0 == r4 (9b is the nearest preceding 9: rts) | |
796 | add #1,r5 ! delay slot: r5 goes from invariant - 2 to invariant - 1 for the byte load | |
797 | mov.b @(r0,r5),r1 | |
798 | rts | |
799 | mov.b r1,@-r0 | |
800 |